aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/Makefile14
-rw-r--r--fs/ocfs2/alloc.c465
-rw-r--r--fs/ocfs2/aops.c6
-rw-r--r--fs/ocfs2/cluster/Makefile2
-rw-r--r--fs/ocfs2/cluster/netdebug.c441
-rw-r--r--fs/ocfs2/cluster/nodemanager.c5
-rw-r--r--fs/ocfs2/cluster/sys.c9
-rw-r--r--fs/ocfs2/cluster/tcp.c164
-rw-r--r--fs/ocfs2/cluster/tcp.h32
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h26
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h49
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c911
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h86
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c70
-rw-r--r--fs/ocfs2/dlm/dlmlock.c22
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c200
-rw-r--r--fs/ocfs2/dlmglue.c645
-rw-r--r--fs/ocfs2/dlmglue.h5
-rw-r--r--fs/ocfs2/file.c4
-rw-r--r--fs/ocfs2/heartbeat.c184
-rw-r--r--fs/ocfs2/heartbeat.h17
-rw-r--r--fs/ocfs2/ioctl.c24
-rw-r--r--fs/ocfs2/ioctl.h3
-rw-r--r--fs/ocfs2/journal.c211
-rw-r--r--fs/ocfs2/journal.h4
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/ocfs2.h77
-rw-r--r--fs/ocfs2/ocfs2_fs.h79
-rw-r--r--fs/ocfs2/ocfs2_lockid.h2
-rw-r--r--fs/ocfs2/slot_map.c454
-rw-r--r--fs/ocfs2/slot_map.h32
-rw-r--r--fs/ocfs2/stack_o2cb.c420
-rw-r--r--fs/ocfs2/stack_user.c883
-rw-r--r--fs/ocfs2/stackglue.c568
-rw-r--r--fs/ocfs2/stackglue.h261
-rw-r--r--fs/ocfs2/suballoc.c103
-rw-r--r--fs/ocfs2/suballoc.h1
-rw-r--r--fs/ocfs2/super.c208
40 files changed, 5651 insertions, 1046 deletions
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 4d4ce48bb42c..f6956de56fdb 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -2,7 +2,12 @@ EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES 3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
4 4
5obj-$(CONFIG_OCFS2_FS) += ocfs2.o 5obj-$(CONFIG_OCFS2_FS) += \
6 ocfs2.o \
7 ocfs2_stackglue.o
8
9obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o
10obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
6 11
7ocfs2-objs := \ 12ocfs2-objs := \
8 alloc.o \ 13 alloc.o \
@@ -31,5 +36,10 @@ ocfs2-objs := \
31 uptodate.o \ 36 uptodate.o \
32 ver.o 37 ver.o
33 38
39ocfs2_stackglue-objs := stackglue.o
40ocfs2_stack_o2cb-objs := stack_o2cb.o
41ocfs2_stack_user-objs := stack_user.o
42
43# cluster/ is always needed when OCFS2_FS for masklog support
34obj-$(CONFIG_OCFS2_FS) += cluster/ 44obj-$(CONFIG_OCFS2_FS) += cluster/
35obj-$(CONFIG_OCFS2_FS) += dlm/ 45obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 447206eb5c2e..41f84c92094f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1029,8 +1029,7 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1029 BUG_ON(!next_free); 1029 BUG_ON(!next_free);
1030 1030
1031 /* The tree code before us didn't allow enough room in the leaf. */ 1031 /* The tree code before us didn't allow enough room in the leaf. */
1032 if (el->l_next_free_rec == el->l_count && !has_empty) 1032 BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1033 BUG();
1034 1033
1035 /* 1034 /*
1036 * The easiest way to approach this is to just remove the 1035 * The easiest way to approach this is to just remove the
@@ -1450,6 +1449,8 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1450 * - When our insert into the right path leaf is at the leftmost edge 1449 * - When our insert into the right path leaf is at the leftmost edge
1451 * and requires an update of the path immediately to it's left. This 1450 * and requires an update of the path immediately to it's left. This
1452 * can occur at the end of some types of rotation and appending inserts. 1451 * can occur at the end of some types of rotation and appending inserts.
1452 * - When we've adjusted the last extent record in the left path leaf and the
1453 * 1st extent record in the right path leaf during cross extent block merge.
1453 */ 1454 */
1454static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, 1455static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1455 struct ocfs2_path *left_path, 1456 struct ocfs2_path *left_path,
@@ -2712,24 +2713,147 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2712 } 2713 }
2713} 2714}
2714 2715
2716static int ocfs2_get_right_path(struct inode *inode,
2717 struct ocfs2_path *left_path,
2718 struct ocfs2_path **ret_right_path)
2719{
2720 int ret;
2721 u32 right_cpos;
2722 struct ocfs2_path *right_path = NULL;
2723 struct ocfs2_extent_list *left_el;
2724
2725 *ret_right_path = NULL;
2726
2727 /* This function shouldn't be called for non-trees. */
2728 BUG_ON(left_path->p_tree_depth == 0);
2729
2730 left_el = path_leaf_el(left_path);
2731 BUG_ON(left_el->l_next_free_rec != left_el->l_count);
2732
2733 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2734 &right_cpos);
2735 if (ret) {
2736 mlog_errno(ret);
2737 goto out;
2738 }
2739
2740 /* This function shouldn't be called for the rightmost leaf. */
2741 BUG_ON(right_cpos == 0);
2742
2743 right_path = ocfs2_new_path(path_root_bh(left_path),
2744 path_root_el(left_path));
2745 if (!right_path) {
2746 ret = -ENOMEM;
2747 mlog_errno(ret);
2748 goto out;
2749 }
2750
2751 ret = ocfs2_find_path(inode, right_path, right_cpos);
2752 if (ret) {
2753 mlog_errno(ret);
2754 goto out;
2755 }
2756
2757 *ret_right_path = right_path;
2758out:
2759 if (ret)
2760 ocfs2_free_path(right_path);
2761 return ret;
2762}
2763
2715/* 2764/*
2716 * Remove split_rec clusters from the record at index and merge them 2765 * Remove split_rec clusters from the record at index and merge them
2717 * onto the beginning of the record at index + 1. 2766 * onto the beginning of the record "next" to it.
2767 * For index < l_count - 1, the next means the extent rec at index + 1.
2768 * For index == l_count - 1, the "next" means the 1st extent rec of the
2769 * next extent block.
2718 */ 2770 */
2719static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, 2771static int ocfs2_merge_rec_right(struct inode *inode,
2720 handle_t *handle, 2772 struct ocfs2_path *left_path,
2721 struct ocfs2_extent_rec *split_rec, 2773 handle_t *handle,
2722 struct ocfs2_extent_list *el, int index) 2774 struct ocfs2_extent_rec *split_rec,
2775 int index)
2723{ 2776{
2724 int ret; 2777 int ret, next_free, i;
2725 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 2778 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2726 struct ocfs2_extent_rec *left_rec; 2779 struct ocfs2_extent_rec *left_rec;
2727 struct ocfs2_extent_rec *right_rec; 2780 struct ocfs2_extent_rec *right_rec;
2781 struct ocfs2_extent_list *right_el;
2782 struct ocfs2_path *right_path = NULL;
2783 int subtree_index = 0;
2784 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2785 struct buffer_head *bh = path_leaf_bh(left_path);
2786 struct buffer_head *root_bh = NULL;
2728 2787
2729 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); 2788 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2730
2731 left_rec = &el->l_recs[index]; 2789 left_rec = &el->l_recs[index];
2732 right_rec = &el->l_recs[index + 1]; 2790
2791 if (index == le16_to_cpu(el->l_next_free_rec - 1) &&
2792 le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
2793 /* we meet with a cross extent block merge. */
2794 ret = ocfs2_get_right_path(inode, left_path, &right_path);
2795 if (ret) {
2796 mlog_errno(ret);
2797 goto out;
2798 }
2799
2800 right_el = path_leaf_el(right_path);
2801 next_free = le16_to_cpu(right_el->l_next_free_rec);
2802 BUG_ON(next_free <= 0);
2803 right_rec = &right_el->l_recs[0];
2804 if (ocfs2_is_empty_extent(right_rec)) {
2805 BUG_ON(le16_to_cpu(next_free) <= 1);
2806 right_rec = &right_el->l_recs[1];
2807 }
2808
2809 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
2810 le16_to_cpu(left_rec->e_leaf_clusters) !=
2811 le32_to_cpu(right_rec->e_cpos));
2812
2813 subtree_index = ocfs2_find_subtree_root(inode,
2814 left_path, right_path);
2815
2816 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
2817 handle->h_buffer_credits,
2818 right_path);
2819 if (ret) {
2820 mlog_errno(ret);
2821 goto out;
2822 }
2823
2824 root_bh = left_path->p_node[subtree_index].bh;
2825 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2826
2827 ret = ocfs2_journal_access(handle, inode, root_bh,
2828 OCFS2_JOURNAL_ACCESS_WRITE);
2829 if (ret) {
2830 mlog_errno(ret);
2831 goto out;
2832 }
2833
2834 for (i = subtree_index + 1;
2835 i < path_num_items(right_path); i++) {
2836 ret = ocfs2_journal_access(handle, inode,
2837 right_path->p_node[i].bh,
2838 OCFS2_JOURNAL_ACCESS_WRITE);
2839 if (ret) {
2840 mlog_errno(ret);
2841 goto out;
2842 }
2843
2844 ret = ocfs2_journal_access(handle, inode,
2845 left_path->p_node[i].bh,
2846 OCFS2_JOURNAL_ACCESS_WRITE);
2847 if (ret) {
2848 mlog_errno(ret);
2849 goto out;
2850 }
2851 }
2852
2853 } else {
2854 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
2855 right_rec = &el->l_recs[index + 1];
2856 }
2733 2857
2734 ret = ocfs2_journal_access(handle, inode, bh, 2858 ret = ocfs2_journal_access(handle, inode, bh,
2735 OCFS2_JOURNAL_ACCESS_WRITE); 2859 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2751,30 +2875,156 @@ static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
2751 if (ret) 2875 if (ret)
2752 mlog_errno(ret); 2876 mlog_errno(ret);
2753 2877
2878 if (right_path) {
2879 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2880 if (ret)
2881 mlog_errno(ret);
2882
2883 ocfs2_complete_edge_insert(inode, handle, left_path,
2884 right_path, subtree_index);
2885 }
2886out:
2887 if (right_path)
2888 ocfs2_free_path(right_path);
2889 return ret;
2890}
2891
2892static int ocfs2_get_left_path(struct inode *inode,
2893 struct ocfs2_path *right_path,
2894 struct ocfs2_path **ret_left_path)
2895{
2896 int ret;
2897 u32 left_cpos;
2898 struct ocfs2_path *left_path = NULL;
2899
2900 *ret_left_path = NULL;
2901
2902 /* This function shouldn't be called for non-trees. */
2903 BUG_ON(right_path->p_tree_depth == 0);
2904
2905 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
2906 right_path, &left_cpos);
2907 if (ret) {
2908 mlog_errno(ret);
2909 goto out;
2910 }
2911
2912 /* This function shouldn't be called for the leftmost leaf. */
2913 BUG_ON(left_cpos == 0);
2914
2915 left_path = ocfs2_new_path(path_root_bh(right_path),
2916 path_root_el(right_path));
2917 if (!left_path) {
2918 ret = -ENOMEM;
2919 mlog_errno(ret);
2920 goto out;
2921 }
2922
2923 ret = ocfs2_find_path(inode, left_path, left_cpos);
2924 if (ret) {
2925 mlog_errno(ret);
2926 goto out;
2927 }
2928
2929 *ret_left_path = left_path;
2754out: 2930out:
2931 if (ret)
2932 ocfs2_free_path(left_path);
2755 return ret; 2933 return ret;
2756} 2934}
2757 2935
2758/* 2936/*
2759 * Remove split_rec clusters from the record at index and merge them 2937 * Remove split_rec clusters from the record at index and merge them
2760 * onto the tail of the record at index - 1. 2938 * onto the tail of the record "before" it.
2939 * For index > 0, the "before" means the extent rec at index - 1.
2940 *
2941 * For index == 0, the "before" means the last record of the previous
2942 * extent block. And there is also a situation that we may need to
2943 * remove the rightmost leaf extent block in the right_path and change
2944 * the right path to indicate the new rightmost path.
2761 */ 2945 */
2762static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, 2946static int ocfs2_merge_rec_left(struct inode *inode,
2947 struct ocfs2_path *right_path,
2763 handle_t *handle, 2948 handle_t *handle,
2764 struct ocfs2_extent_rec *split_rec, 2949 struct ocfs2_extent_rec *split_rec,
2765 struct ocfs2_extent_list *el, int index) 2950 struct ocfs2_cached_dealloc_ctxt *dealloc,
2951 int index)
2766{ 2952{
2767 int ret, has_empty_extent = 0; 2953 int ret, i, subtree_index = 0, has_empty_extent = 0;
2768 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 2954 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2769 struct ocfs2_extent_rec *left_rec; 2955 struct ocfs2_extent_rec *left_rec;
2770 struct ocfs2_extent_rec *right_rec; 2956 struct ocfs2_extent_rec *right_rec;
2957 struct ocfs2_extent_list *el = path_leaf_el(right_path);
2958 struct buffer_head *bh = path_leaf_bh(right_path);
2959 struct buffer_head *root_bh = NULL;
2960 struct ocfs2_path *left_path = NULL;
2961 struct ocfs2_extent_list *left_el;
2771 2962
2772 BUG_ON(index <= 0); 2963 BUG_ON(index < 0);
2773 2964
2774 left_rec = &el->l_recs[index - 1];
2775 right_rec = &el->l_recs[index]; 2965 right_rec = &el->l_recs[index];
2776 if (ocfs2_is_empty_extent(&el->l_recs[0])) 2966 if (index == 0) {
2777 has_empty_extent = 1; 2967 /* we meet with a cross extent block merge. */
2968 ret = ocfs2_get_left_path(inode, right_path, &left_path);
2969 if (ret) {
2970 mlog_errno(ret);
2971 goto out;
2972 }
2973
2974 left_el = path_leaf_el(left_path);
2975 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
2976 le16_to_cpu(left_el->l_count));
2977
2978 left_rec = &left_el->l_recs[
2979 le16_to_cpu(left_el->l_next_free_rec) - 1];
2980 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
2981 le16_to_cpu(left_rec->e_leaf_clusters) !=
2982 le32_to_cpu(split_rec->e_cpos));
2983
2984 subtree_index = ocfs2_find_subtree_root(inode,
2985 left_path, right_path);
2986
2987 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
2988 handle->h_buffer_credits,
2989 left_path);
2990 if (ret) {
2991 mlog_errno(ret);
2992 goto out;
2993 }
2994
2995 root_bh = left_path->p_node[subtree_index].bh;
2996 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2997
2998 ret = ocfs2_journal_access(handle, inode, root_bh,
2999 OCFS2_JOURNAL_ACCESS_WRITE);
3000 if (ret) {
3001 mlog_errno(ret);
3002 goto out;
3003 }
3004
3005 for (i = subtree_index + 1;
3006 i < path_num_items(right_path); i++) {
3007 ret = ocfs2_journal_access(handle, inode,
3008 right_path->p_node[i].bh,
3009 OCFS2_JOURNAL_ACCESS_WRITE);
3010 if (ret) {
3011 mlog_errno(ret);
3012 goto out;
3013 }
3014
3015 ret = ocfs2_journal_access(handle, inode,
3016 left_path->p_node[i].bh,
3017 OCFS2_JOURNAL_ACCESS_WRITE);
3018 if (ret) {
3019 mlog_errno(ret);
3020 goto out;
3021 }
3022 }
3023 } else {
3024 left_rec = &el->l_recs[index - 1];
3025 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3026 has_empty_extent = 1;
3027 }
2778 3028
2779 ret = ocfs2_journal_access(handle, inode, bh, 3029 ret = ocfs2_journal_access(handle, inode, bh,
2780 OCFS2_JOURNAL_ACCESS_WRITE); 3030 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2790,9 +3040,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2790 *left_rec = *split_rec; 3040 *left_rec = *split_rec;
2791 3041
2792 has_empty_extent = 0; 3042 has_empty_extent = 0;
2793 } else { 3043 } else
2794 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); 3044 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
2795 }
2796 3045
2797 le32_add_cpu(&right_rec->e_cpos, split_clusters); 3046 le32_add_cpu(&right_rec->e_cpos, split_clusters);
2798 le64_add_cpu(&right_rec->e_blkno, 3047 le64_add_cpu(&right_rec->e_blkno,
@@ -2805,13 +3054,44 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2805 if (ret) 3054 if (ret)
2806 mlog_errno(ret); 3055 mlog_errno(ret);
2807 3056
3057 if (left_path) {
3058 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3059 if (ret)
3060 mlog_errno(ret);
3061
3062 /*
3063 * In the situation that the right_rec is empty and the extent
3064 * block is empty also, ocfs2_complete_edge_insert can't handle
3065 * it and we need to delete the right extent block.
3066 */
3067 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3068 le16_to_cpu(el->l_next_free_rec) == 1) {
3069
3070 ret = ocfs2_remove_rightmost_path(inode, handle,
3071 right_path, dealloc);
3072 if (ret) {
3073 mlog_errno(ret);
3074 goto out;
3075 }
3076
3077 /* Now the rightmost extent block has been deleted.
3078 * So we use the new rightmost path.
3079 */
3080 ocfs2_mv_path(right_path, left_path);
3081 left_path = NULL;
3082 } else
3083 ocfs2_complete_edge_insert(inode, handle, left_path,
3084 right_path, subtree_index);
3085 }
2808out: 3086out:
3087 if (left_path)
3088 ocfs2_free_path(left_path);
2809 return ret; 3089 return ret;
2810} 3090}
2811 3091
2812static int ocfs2_try_to_merge_extent(struct inode *inode, 3092static int ocfs2_try_to_merge_extent(struct inode *inode,
2813 handle_t *handle, 3093 handle_t *handle,
2814 struct ocfs2_path *left_path, 3094 struct ocfs2_path *path,
2815 int split_index, 3095 int split_index,
2816 struct ocfs2_extent_rec *split_rec, 3096 struct ocfs2_extent_rec *split_rec,
2817 struct ocfs2_cached_dealloc_ctxt *dealloc, 3097 struct ocfs2_cached_dealloc_ctxt *dealloc,
@@ -2819,7 +3099,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2819 3099
2820{ 3100{
2821 int ret = 0; 3101 int ret = 0;
2822 struct ocfs2_extent_list *el = path_leaf_el(left_path); 3102 struct ocfs2_extent_list *el = path_leaf_el(path);
2823 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 3103 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2824 3104
2825 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 3105 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
@@ -2832,7 +3112,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2832 * extents - having more than one in a leaf is 3112 * extents - having more than one in a leaf is
2833 * illegal. 3113 * illegal.
2834 */ 3114 */
2835 ret = ocfs2_rotate_tree_left(inode, handle, left_path, 3115 ret = ocfs2_rotate_tree_left(inode, handle, path,
2836 dealloc); 3116 dealloc);
2837 if (ret) { 3117 if (ret) {
2838 mlog_errno(ret); 3118 mlog_errno(ret);
@@ -2847,7 +3127,6 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2847 * Left-right contig implies this. 3127 * Left-right contig implies this.
2848 */ 3128 */
2849 BUG_ON(!ctxt->c_split_covers_rec); 3129 BUG_ON(!ctxt->c_split_covers_rec);
2850 BUG_ON(split_index == 0);
2851 3130
2852 /* 3131 /*
2853 * Since the leftright insert always covers the entire 3132 * Since the leftright insert always covers the entire
@@ -2858,9 +3137,14 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2858 * Since the adding of an empty extent shifts 3137 * Since the adding of an empty extent shifts
2859 * everything back to the right, there's no need to 3138 * everything back to the right, there's no need to
2860 * update split_index here. 3139 * update split_index here.
3140 *
3141 * When the split_index is zero, we need to merge it to the
3142 * prevoius extent block. It is more efficient and easier
3143 * if we do merge_right first and merge_left later.
2861 */ 3144 */
2862 ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path), 3145 ret = ocfs2_merge_rec_right(inode, path,
2863 handle, split_rec, el, split_index); 3146 handle, split_rec,
3147 split_index);
2864 if (ret) { 3148 if (ret) {
2865 mlog_errno(ret); 3149 mlog_errno(ret);
2866 goto out; 3150 goto out;
@@ -2871,32 +3155,30 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2871 */ 3155 */
2872 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3156 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2873 3157
2874 /* 3158 /* The merge left us with an empty extent, remove it. */
2875 * The left merge left us with an empty extent, remove 3159 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
2876 * it.
2877 */
2878 ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
2879 if (ret) { 3160 if (ret) {
2880 mlog_errno(ret); 3161 mlog_errno(ret);
2881 goto out; 3162 goto out;
2882 } 3163 }
2883 split_index--; 3164
2884 rec = &el->l_recs[split_index]; 3165 rec = &el->l_recs[split_index];
2885 3166
2886 /* 3167 /*
2887 * Note that we don't pass split_rec here on purpose - 3168 * Note that we don't pass split_rec here on purpose -
2888 * we've merged it into the left side. 3169 * we've merged it into the rec already.
2889 */ 3170 */
2890 ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path), 3171 ret = ocfs2_merge_rec_left(inode, path,
2891 handle, rec, el, split_index); 3172 handle, rec,
3173 dealloc,
3174 split_index);
3175
2892 if (ret) { 3176 if (ret) {
2893 mlog_errno(ret); 3177 mlog_errno(ret);
2894 goto out; 3178 goto out;
2895 } 3179 }
2896 3180
2897 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3181 ret = ocfs2_rotate_tree_left(inode, handle, path,
2898
2899 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2900 dealloc); 3182 dealloc);
2901 /* 3183 /*
2902 * Error from this last rotate is not critical, so 3184 * Error from this last rotate is not critical, so
@@ -2915,8 +3197,9 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2915 */ 3197 */
2916 if (ctxt->c_contig_type == CONTIG_RIGHT) { 3198 if (ctxt->c_contig_type == CONTIG_RIGHT) {
2917 ret = ocfs2_merge_rec_left(inode, 3199 ret = ocfs2_merge_rec_left(inode,
2918 path_leaf_bh(left_path), 3200 path,
2919 handle, split_rec, el, 3201 handle, split_rec,
3202 dealloc,
2920 split_index); 3203 split_index);
2921 if (ret) { 3204 if (ret) {
2922 mlog_errno(ret); 3205 mlog_errno(ret);
@@ -2924,8 +3207,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2924 } 3207 }
2925 } else { 3208 } else {
2926 ret = ocfs2_merge_rec_right(inode, 3209 ret = ocfs2_merge_rec_right(inode,
2927 path_leaf_bh(left_path), 3210 path,
2928 handle, split_rec, el, 3211 handle, split_rec,
2929 split_index); 3212 split_index);
2930 if (ret) { 3213 if (ret) {
2931 mlog_errno(ret); 3214 mlog_errno(ret);
@@ -2938,7 +3221,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2938 * The merge may have left an empty extent in 3221 * The merge may have left an empty extent in
2939 * our leaf. Try to rotate it away. 3222 * our leaf. Try to rotate it away.
2940 */ 3223 */
2941 ret = ocfs2_rotate_tree_left(inode, handle, left_path, 3224 ret = ocfs2_rotate_tree_left(inode, handle, path,
2942 dealloc); 3225 dealloc);
2943 if (ret) 3226 if (ret)
2944 mlog_errno(ret); 3227 mlog_errno(ret);
@@ -3498,20 +3781,57 @@ out:
3498} 3781}
3499 3782
3500static enum ocfs2_contig_type 3783static enum ocfs2_contig_type
3501ocfs2_figure_merge_contig_type(struct inode *inode, 3784ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
3502 struct ocfs2_extent_list *el, int index, 3785 struct ocfs2_extent_list *el, int index,
3503 struct ocfs2_extent_rec *split_rec) 3786 struct ocfs2_extent_rec *split_rec)
3504{ 3787{
3505 struct ocfs2_extent_rec *rec; 3788 int status;
3506 enum ocfs2_contig_type ret = CONTIG_NONE; 3789 enum ocfs2_contig_type ret = CONTIG_NONE;
3790 u32 left_cpos, right_cpos;
3791 struct ocfs2_extent_rec *rec = NULL;
3792 struct ocfs2_extent_list *new_el;
3793 struct ocfs2_path *left_path = NULL, *right_path = NULL;
3794 struct buffer_head *bh;
3795 struct ocfs2_extent_block *eb;
3796
3797 if (index > 0) {
3798 rec = &el->l_recs[index - 1];
3799 } else if (path->p_tree_depth > 0) {
3800 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3801 path, &left_cpos);
3802 if (status)
3803 goto out;
3804
3805 if (left_cpos != 0) {
3806 left_path = ocfs2_new_path(path_root_bh(path),
3807 path_root_el(path));
3808 if (!left_path)
3809 goto out;
3810
3811 status = ocfs2_find_path(inode, left_path, left_cpos);
3812 if (status)
3813 goto out;
3814
3815 new_el = path_leaf_el(left_path);
3816
3817 if (le16_to_cpu(new_el->l_next_free_rec) !=
3818 le16_to_cpu(new_el->l_count)) {
3819 bh = path_leaf_bh(left_path);
3820 eb = (struct ocfs2_extent_block *)bh->b_data;
3821 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
3822 eb);
3823 goto out;
3824 }
3825 rec = &new_el->l_recs[
3826 le16_to_cpu(new_el->l_next_free_rec) - 1];
3827 }
3828 }
3507 3829
3508 /* 3830 /*
3509 * We're careful to check for an empty extent record here - 3831 * We're careful to check for an empty extent record here -
3510 * the merge code will know what to do if it sees one. 3832 * the merge code will know what to do if it sees one.
3511 */ 3833 */
3512 3834 if (rec) {
3513 if (index > 0) {
3514 rec = &el->l_recs[index - 1];
3515 if (index == 1 && ocfs2_is_empty_extent(rec)) { 3835 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3516 if (split_rec->e_cpos == el->l_recs[index].e_cpos) 3836 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3517 ret = CONTIG_RIGHT; 3837 ret = CONTIG_RIGHT;
@@ -3520,10 +3840,45 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
3520 } 3840 }
3521 } 3841 }
3522 3842
3523 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) { 3843 rec = NULL;
3844 if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
3845 rec = &el->l_recs[index + 1];
3846 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
3847 path->p_tree_depth > 0) {
3848 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
3849 path, &right_cpos);
3850 if (status)
3851 goto out;
3852
3853 if (right_cpos == 0)
3854 goto out;
3855
3856 right_path = ocfs2_new_path(path_root_bh(path),
3857 path_root_el(path));
3858 if (!right_path)
3859 goto out;
3860
3861 status = ocfs2_find_path(inode, right_path, right_cpos);
3862 if (status)
3863 goto out;
3864
3865 new_el = path_leaf_el(right_path);
3866 rec = &new_el->l_recs[0];
3867 if (ocfs2_is_empty_extent(rec)) {
3868 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
3869 bh = path_leaf_bh(right_path);
3870 eb = (struct ocfs2_extent_block *)bh->b_data;
3871 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
3872 eb);
3873 goto out;
3874 }
3875 rec = &new_el->l_recs[1];
3876 }
3877 }
3878
3879 if (rec) {
3524 enum ocfs2_contig_type contig_type; 3880 enum ocfs2_contig_type contig_type;
3525 3881
3526 rec = &el->l_recs[index + 1];
3527 contig_type = ocfs2_extent_contig(inode, rec, split_rec); 3882 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3528 3883
3529 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) 3884 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
@@ -3532,6 +3887,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
3532 ret = contig_type; 3887 ret = contig_type;
3533 } 3888 }
3534 3889
3890out:
3891 if (left_path)
3892 ocfs2_free_path(left_path);
3893 if (right_path)
3894 ocfs2_free_path(right_path);
3895
3535 return ret; 3896 return ret;
3536} 3897}
3537 3898
@@ -3994,7 +4355,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
3994 goto out; 4355 goto out;
3995 } 4356 }
3996 4357
3997 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el, 4358 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
3998 split_index, 4359 split_index,
3999 split_rec); 4360 split_rec);
4000 4361
@@ -4788,6 +5149,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
4788 status = ocfs2_flush_truncate_log(osb); 5149 status = ocfs2_flush_truncate_log(osb);
4789 if (status < 0) 5150 if (status < 0)
4790 mlog_errno(status); 5151 mlog_errno(status);
5152 else
5153 ocfs2_init_inode_steal_slot(osb);
4791 5154
4792 mlog_exit(status); 5155 mlog_exit(status);
4793} 5156}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 90383ed61005..17964c0505a9 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -467,11 +467,11 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
467 unsigned to) 467 unsigned to)
468{ 468{
469 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 469 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
470 handle_t *handle = NULL; 470 handle_t *handle;
471 int ret = 0; 471 int ret = 0;
472 472
473 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 473 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
474 if (!handle) { 474 if (IS_ERR(handle)) {
475 ret = -ENOMEM; 475 ret = -ENOMEM;
476 mlog_errno(ret); 476 mlog_errno(ret);
477 goto out; 477 goto out;
@@ -487,7 +487,7 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
487 } 487 }
488out: 488out:
489 if (ret) { 489 if (ret) {
490 if (handle) 490 if (!IS_ERR(handle))
491 ocfs2_commit_trans(osb, handle); 491 ocfs2_commit_trans(osb, handle);
492 handle = ERR_PTR(ret); 492 handle = ERR_PTR(ret);
493 } 493 }
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index cdd162f13650..bc8c5e7d8608 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o 1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
2 2
3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ 3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
4 quorum.o tcp.o ver.o 4 quorum.o tcp.o netdebug.o ver.o
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
new file mode 100644
index 000000000000..7bf3c0ea7bd9
--- /dev/null
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -0,0 +1,441 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * netdebug.c
5 *
6 * debug functionality for o2net
7 *
8 * Copyright (C) 2005, 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 021110-1307, USA.
24 *
25 */
26
27#ifdef CONFIG_DEBUG_FS
28
29#include <linux/module.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/idr.h>
33#include <linux/kref.h>
34#include <linux/seq_file.h>
35#include <linux/debugfs.h>
36
37#include <linux/uaccess.h>
38
39#include "tcp.h"
40#include "nodemanager.h"
41#define MLOG_MASK_PREFIX ML_TCP
42#include "masklog.h"
43
44#include "tcp_internal.h"
45
46#define O2NET_DEBUG_DIR "o2net"
47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking"
49
50static struct dentry *o2net_dentry;
51static struct dentry *sc_dentry;
52static struct dentry *nst_dentry;
53
54static DEFINE_SPINLOCK(o2net_debug_lock);
55
56static LIST_HEAD(sock_containers);
57static LIST_HEAD(send_tracking);
58
59void o2net_debug_add_nst(struct o2net_send_tracking *nst)
60{
61 spin_lock(&o2net_debug_lock);
62 list_add(&nst->st_net_debug_item, &send_tracking);
63 spin_unlock(&o2net_debug_lock);
64}
65
66void o2net_debug_del_nst(struct o2net_send_tracking *nst)
67{
68 spin_lock(&o2net_debug_lock);
69 if (!list_empty(&nst->st_net_debug_item))
70 list_del_init(&nst->st_net_debug_item);
71 spin_unlock(&o2net_debug_lock);
72}
73
74static struct o2net_send_tracking
75 *next_nst(struct o2net_send_tracking *nst_start)
76{
77 struct o2net_send_tracking *nst, *ret = NULL;
78
79 assert_spin_locked(&o2net_debug_lock);
80
81 list_for_each_entry(nst, &nst_start->st_net_debug_item,
82 st_net_debug_item) {
83 /* discover the head of the list */
84 if (&nst->st_net_debug_item == &send_tracking)
85 break;
86
87 /* use st_task to detect real nsts in the list */
88 if (nst->st_task != NULL) {
89 ret = nst;
90 break;
91 }
92 }
93
94 return ret;
95}
96
97static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
98{
99 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
100
101 spin_lock(&o2net_debug_lock);
102 nst = next_nst(dummy_nst);
103 spin_unlock(&o2net_debug_lock);
104
105 return nst;
106}
107
108static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
109{
110 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
111
112 spin_lock(&o2net_debug_lock);
113 nst = next_nst(dummy_nst);
114 list_del_init(&dummy_nst->st_net_debug_item);
115 if (nst)
116 list_add(&dummy_nst->st_net_debug_item,
117 &nst->st_net_debug_item);
118 spin_unlock(&o2net_debug_lock);
119
120 return nst; /* unused, just needs to be null when done */
121}
122
123static int nst_seq_show(struct seq_file *seq, void *v)
124{
125 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
126
127 spin_lock(&o2net_debug_lock);
128 nst = next_nst(dummy_nst);
129
130 if (nst != NULL) {
131 /* get_task_comm isn't exported. oh well. */
132 seq_printf(seq, "%p:\n"
133 " pid: %lu\n"
134 " tgid: %lu\n"
135 " process name: %s\n"
136 " node: %u\n"
137 " sc: %p\n"
138 " message id: %d\n"
139 " message type: %u\n"
140 " message key: 0x%08x\n"
141 " sock acquiry: %lu.%lu\n"
142 " send start: %lu.%lu\n"
143 " wait start: %lu.%lu\n",
144 nst, (unsigned long)nst->st_task->pid,
145 (unsigned long)nst->st_task->tgid,
146 nst->st_task->comm, nst->st_node,
147 nst->st_sc, nst->st_id, nst->st_msg_type,
148 nst->st_msg_key,
149 nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec,
150 nst->st_send_time.tv_sec, nst->st_send_time.tv_usec,
151 nst->st_status_time.tv_sec,
152 nst->st_status_time.tv_usec);
153 }
154
155 spin_unlock(&o2net_debug_lock);
156
157 return 0;
158}
159
160static void nst_seq_stop(struct seq_file *seq, void *v)
161{
162}
163
164static struct seq_operations nst_seq_ops = {
165 .start = nst_seq_start,
166 .next = nst_seq_next,
167 .stop = nst_seq_stop,
168 .show = nst_seq_show,
169};
170
171static int nst_fop_open(struct inode *inode, struct file *file)
172{
173 struct o2net_send_tracking *dummy_nst;
174 struct seq_file *seq;
175 int ret;
176
177 dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
178 if (dummy_nst == NULL) {
179 ret = -ENOMEM;
180 goto out;
181 }
182 dummy_nst->st_task = NULL;
183
184 ret = seq_open(file, &nst_seq_ops);
185 if (ret)
186 goto out;
187
188 seq = file->private_data;
189 seq->private = dummy_nst;
190 o2net_debug_add_nst(dummy_nst);
191
192 dummy_nst = NULL;
193
194out:
195 kfree(dummy_nst);
196 return ret;
197}
198
199static int nst_fop_release(struct inode *inode, struct file *file)
200{
201 struct seq_file *seq = file->private_data;
202 struct o2net_send_tracking *dummy_nst = seq->private;
203
204 o2net_debug_del_nst(dummy_nst);
205 return seq_release_private(inode, file);
206}
207
208static struct file_operations nst_seq_fops = {
209 .open = nst_fop_open,
210 .read = seq_read,
211 .llseek = seq_lseek,
212 .release = nst_fop_release,
213};
214
215void o2net_debug_add_sc(struct o2net_sock_container *sc)
216{
217 spin_lock(&o2net_debug_lock);
218 list_add(&sc->sc_net_debug_item, &sock_containers);
219 spin_unlock(&o2net_debug_lock);
220}
221
222void o2net_debug_del_sc(struct o2net_sock_container *sc)
223{
224 spin_lock(&o2net_debug_lock);
225 list_del_init(&sc->sc_net_debug_item);
226 spin_unlock(&o2net_debug_lock);
227}
228
229static struct o2net_sock_container
230 *next_sc(struct o2net_sock_container *sc_start)
231{
232 struct o2net_sock_container *sc, *ret = NULL;
233
234 assert_spin_locked(&o2net_debug_lock);
235
236 list_for_each_entry(sc, &sc_start->sc_net_debug_item,
237 sc_net_debug_item) {
238 /* discover the head of the list miscast as a sc */
239 if (&sc->sc_net_debug_item == &sock_containers)
240 break;
241
242 /* use sc_page to detect real scs in the list */
243 if (sc->sc_page != NULL) {
244 ret = sc;
245 break;
246 }
247 }
248
249 return ret;
250}
251
252static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
253{
254 struct o2net_sock_container *sc, *dummy_sc = seq->private;
255
256 spin_lock(&o2net_debug_lock);
257 sc = next_sc(dummy_sc);
258 spin_unlock(&o2net_debug_lock);
259
260 return sc;
261}
262
263static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
264{
265 struct o2net_sock_container *sc, *dummy_sc = seq->private;
266
267 spin_lock(&o2net_debug_lock);
268 sc = next_sc(dummy_sc);
269 list_del_init(&dummy_sc->sc_net_debug_item);
270 if (sc)
271 list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item);
272 spin_unlock(&o2net_debug_lock);
273
274 return sc; /* unused, just needs to be null when done */
275}
276
277#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec
278
279static int sc_seq_show(struct seq_file *seq, void *v)
280{
281 struct o2net_sock_container *sc, *dummy_sc = seq->private;
282
283 spin_lock(&o2net_debug_lock);
284 sc = next_sc(dummy_sc);
285
286 if (sc != NULL) {
287 struct inet_sock *inet = NULL;
288
289 __be32 saddr = 0, daddr = 0;
290 __be16 sport = 0, dport = 0;
291
292 if (sc->sc_sock) {
293 inet = inet_sk(sc->sc_sock->sk);
294 /* the stack's structs aren't sparse endian clean */
295 saddr = (__force __be32)inet->saddr;
296 daddr = (__force __be32)inet->daddr;
297 sport = (__force __be16)inet->sport;
298 dport = (__force __be16)inet->dport;
299 }
300
301 /* XXX sigh, inet-> doesn't have sparse annotation so any
302 * use of it here generates a warning with -Wbitwise */
303 seq_printf(seq, "%p:\n"
304 " krefs: %d\n"
305 " sock: %u.%u.%u.%u:%u -> "
306 "%u.%u.%u.%u:%u\n"
307 " remote node: %s\n"
308 " page off: %zu\n"
309 " handshake ok: %u\n"
310 " timer: %lu.%lu\n"
311 " data ready: %lu.%lu\n"
312 " advance start: %lu.%lu\n"
313 " advance stop: %lu.%lu\n"
314 " func start: %lu.%lu\n"
315 " func stop: %lu.%lu\n"
316 " func key: %u\n"
317 " func type: %u\n",
318 sc,
319 atomic_read(&sc->sc_kref.refcount),
320 NIPQUAD(saddr), inet ? ntohs(sport) : 0,
321 NIPQUAD(daddr), inet ? ntohs(dport) : 0,
322 sc->sc_node->nd_name,
323 sc->sc_page_off,
324 sc->sc_handshake_ok,
325 TV_SEC_USEC(sc->sc_tv_timer),
326 TV_SEC_USEC(sc->sc_tv_data_ready),
327 TV_SEC_USEC(sc->sc_tv_advance_start),
328 TV_SEC_USEC(sc->sc_tv_advance_stop),
329 TV_SEC_USEC(sc->sc_tv_func_start),
330 TV_SEC_USEC(sc->sc_tv_func_stop),
331 sc->sc_msg_key,
332 sc->sc_msg_type);
333 }
334
335
336 spin_unlock(&o2net_debug_lock);
337
338 return 0;
339}
340
341static void sc_seq_stop(struct seq_file *seq, void *v)
342{
343}
344
345static struct seq_operations sc_seq_ops = {
346 .start = sc_seq_start,
347 .next = sc_seq_next,
348 .stop = sc_seq_stop,
349 .show = sc_seq_show,
350};
351
352static int sc_fop_open(struct inode *inode, struct file *file)
353{
354 struct o2net_sock_container *dummy_sc;
355 struct seq_file *seq;
356 int ret;
357
358 dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
359 if (dummy_sc == NULL) {
360 ret = -ENOMEM;
361 goto out;
362 }
363 dummy_sc->sc_page = NULL;
364
365 ret = seq_open(file, &sc_seq_ops);
366 if (ret)
367 goto out;
368
369 seq = file->private_data;
370 seq->private = dummy_sc;
371 o2net_debug_add_sc(dummy_sc);
372
373 dummy_sc = NULL;
374
375out:
376 kfree(dummy_sc);
377 return ret;
378}
379
380static int sc_fop_release(struct inode *inode, struct file *file)
381{
382 struct seq_file *seq = file->private_data;
383 struct o2net_sock_container *dummy_sc = seq->private;
384
385 o2net_debug_del_sc(dummy_sc);
386 return seq_release_private(inode, file);
387}
388
389static struct file_operations sc_seq_fops = {
390 .open = sc_fop_open,
391 .read = seq_read,
392 .llseek = seq_lseek,
393 .release = sc_fop_release,
394};
395
396int o2net_debugfs_init(void)
397{
398 o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
399 if (!o2net_dentry) {
400 mlog_errno(-ENOMEM);
401 goto bail;
402 }
403
404 nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
405 o2net_dentry, NULL,
406 &nst_seq_fops);
407 if (!nst_dentry) {
408 mlog_errno(-ENOMEM);
409 goto bail;
410 }
411
412 sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
413 o2net_dentry, NULL,
414 &sc_seq_fops);
415 if (!sc_dentry) {
416 mlog_errno(-ENOMEM);
417 goto bail;
418 }
419
420 return 0;
421bail:
422 if (sc_dentry)
423 debugfs_remove(sc_dentry);
424 if (nst_dentry)
425 debugfs_remove(nst_dentry);
426 if (o2net_dentry)
427 debugfs_remove(o2net_dentry);
428 return -ENOMEM;
429}
430
431void o2net_debugfs_exit(void)
432{
433 if (sc_dentry)
434 debugfs_remove(sc_dentry);
435 if (nst_dentry)
436 debugfs_remove(nst_dentry);
437 if (o2net_dentry)
438 debugfs_remove(o2net_dentry);
439}
440
441#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 709fba25bf7e..cf9401e8cd0b 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -959,7 +959,10 @@ static int __init init_o2nm(void)
959 cluster_print_version(); 959 cluster_print_version();
960 960
961 o2hb_init(); 961 o2hb_init();
962 o2net_init(); 962
963 ret = o2net_init();
964 if (ret)
965 goto out;
963 966
964 ocfs2_table_header = register_sysctl_table(ocfs2_root_table); 967 ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
965 if (!ocfs2_table_header) { 968 if (!ocfs2_table_header) {
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 0c095ce7723d..98429fd68499 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -57,6 +57,7 @@ static struct kset *o2cb_kset;
57void o2cb_sys_shutdown(void) 57void o2cb_sys_shutdown(void)
58{ 58{
59 mlog_sys_shutdown(); 59 mlog_sys_shutdown();
60 sysfs_remove_link(NULL, "o2cb");
60 kset_unregister(o2cb_kset); 61 kset_unregister(o2cb_kset);
61} 62}
62 63
@@ -68,6 +69,14 @@ int o2cb_sys_init(void)
68 if (!o2cb_kset) 69 if (!o2cb_kset)
69 return -ENOMEM; 70 return -ENOMEM;
70 71
72 /*
73 * Create this symlink for backwards compatibility with old
74 * versions of ocfs2-tools which look for things in /sys/o2cb.
75 */
76 ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
77 if (ret)
78 goto error;
79
71 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); 80 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
72 if (ret) 81 if (ret)
73 goto error; 82 goto error;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index b8057c51b205..1e44ad14881a 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -142,23 +142,65 @@ static void o2net_idle_timer(unsigned long data);
142static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); 142static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
143static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); 143static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
144 144
145/* 145static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
146 * FIXME: These should use to_o2nm_cluster_from_node(), but we end up 146 u32 msgkey, struct task_struct *task, u8 node)
147 * losing our parent link to the cluster during shutdown. This can be 147{
148 * solved by adding a pre-removal callback to configfs, or passing 148#ifdef CONFIG_DEBUG_FS
149 * around the cluster with the node. -jeffm 149 INIT_LIST_HEAD(&nst->st_net_debug_item);
150 */ 150 nst->st_task = task;
151static inline int o2net_reconnect_delay(struct o2nm_node *node) 151 nst->st_msg_type = msgtype;
152 nst->st_msg_key = msgkey;
153 nst->st_node = node;
154#endif
155}
156
157static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
158{
159#ifdef CONFIG_DEBUG_FS
160 do_gettimeofday(&nst->st_sock_time);
161#endif
162}
163
164static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
165{
166#ifdef CONFIG_DEBUG_FS
167 do_gettimeofday(&nst->st_send_time);
168#endif
169}
170
171static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
172{
173#ifdef CONFIG_DEBUG_FS
174 do_gettimeofday(&nst->st_status_time);
175#endif
176}
177
178static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
179 struct o2net_sock_container *sc)
180{
181#ifdef CONFIG_DEBUG_FS
182 nst->st_sc = sc;
183#endif
184}
185
186static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
187{
188#ifdef CONFIG_DEBUG_FS
189 nst->st_id = msg_id;
190#endif
191}
192
193static inline int o2net_reconnect_delay(void)
152{ 194{
153 return o2nm_single_cluster->cl_reconnect_delay_ms; 195 return o2nm_single_cluster->cl_reconnect_delay_ms;
154} 196}
155 197
156static inline int o2net_keepalive_delay(struct o2nm_node *node) 198static inline int o2net_keepalive_delay(void)
157{ 199{
158 return o2nm_single_cluster->cl_keepalive_delay_ms; 200 return o2nm_single_cluster->cl_keepalive_delay_ms;
159} 201}
160 202
161static inline int o2net_idle_timeout(struct o2nm_node *node) 203static inline int o2net_idle_timeout(void)
162{ 204{
163 return o2nm_single_cluster->cl_idle_timeout_ms; 205 return o2nm_single_cluster->cl_idle_timeout_ms;
164} 206}
@@ -296,6 +338,7 @@ static void sc_kref_release(struct kref *kref)
296 o2nm_node_put(sc->sc_node); 338 o2nm_node_put(sc->sc_node);
297 sc->sc_node = NULL; 339 sc->sc_node = NULL;
298 340
341 o2net_debug_del_sc(sc);
299 kfree(sc); 342 kfree(sc);
300} 343}
301 344
@@ -336,6 +379,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
336 379
337 ret = sc; 380 ret = sc;
338 sc->sc_page = page; 381 sc->sc_page = page;
382 o2net_debug_add_sc(sc);
339 sc = NULL; 383 sc = NULL;
340 page = NULL; 384 page = NULL;
341 385
@@ -399,8 +443,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
399 mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); 443 mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
400 mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); 444 mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
401 445
402 /* we won't reconnect after our valid conn goes away for
403 * this hb iteration.. here so it shows up in the logs */
404 if (was_valid && !valid && err == 0) 446 if (was_valid && !valid && err == 0)
405 err = -ENOTCONN; 447 err = -ENOTCONN;
406 448
@@ -430,11 +472,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
430 472
431 if (!was_valid && valid) { 473 if (!was_valid && valid) {
432 o2quo_conn_up(o2net_num_from_nn(nn)); 474 o2quo_conn_up(o2net_num_from_nn(nn));
433 /* this is a bit of a hack. we only try reconnecting
434 * when heartbeating starts until we get a connection.
435 * if that connection then dies we don't try reconnecting.
436 * the only way to start connecting again is to down
437 * heartbeat and bring it back up. */
438 cancel_delayed_work(&nn->nn_connect_expired); 475 cancel_delayed_work(&nn->nn_connect_expired);
439 printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", 476 printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
440 o2nm_this_node() > sc->sc_node->nd_num ? 477 o2nm_this_node() > sc->sc_node->nd_num ?
@@ -451,12 +488,24 @@ static void o2net_set_nn_state(struct o2net_node *nn,
451 /* delay if we're withing a RECONNECT_DELAY of the 488 /* delay if we're withing a RECONNECT_DELAY of the
452 * last attempt */ 489 * last attempt */
453 delay = (nn->nn_last_connect_attempt + 490 delay = (nn->nn_last_connect_attempt +
454 msecs_to_jiffies(o2net_reconnect_delay(NULL))) 491 msecs_to_jiffies(o2net_reconnect_delay()))
455 - jiffies; 492 - jiffies;
456 if (delay > msecs_to_jiffies(o2net_reconnect_delay(NULL))) 493 if (delay > msecs_to_jiffies(o2net_reconnect_delay()))
457 delay = 0; 494 delay = 0;
458 mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); 495 mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
459 queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); 496 queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
497
498 /*
499 * Delay the expired work after idle timeout.
500 *
501 * We might have lots of failed connection attempts that run
502 * through here but we only cancel the connect_expired work when
503 * a connection attempt succeeds. So only the first enqueue of
504 * the connect_expired work will do anything. The rest will see
505 * that it's already queued and do nothing.
506 */
507 delay += msecs_to_jiffies(o2net_idle_timeout());
508 queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
460 } 509 }
461 510
462 /* keep track of the nn's sc ref for the caller */ 511 /* keep track of the nn's sc ref for the caller */
@@ -914,6 +963,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
914 struct o2net_status_wait nsw = { 963 struct o2net_status_wait nsw = {
915 .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), 964 .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
916 }; 965 };
966 struct o2net_send_tracking nst;
967
968 o2net_init_nst(&nst, msg_type, key, current, target_node);
917 969
918 if (o2net_wq == NULL) { 970 if (o2net_wq == NULL) {
919 mlog(0, "attempt to tx without o2netd running\n"); 971 mlog(0, "attempt to tx without o2netd running\n");
@@ -939,6 +991,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
939 goto out; 991 goto out;
940 } 992 }
941 993
994 o2net_debug_add_nst(&nst);
995
996 o2net_set_nst_sock_time(&nst);
997
942 ret = wait_event_interruptible(nn->nn_sc_wq, 998 ret = wait_event_interruptible(nn->nn_sc_wq,
943 o2net_tx_can_proceed(nn, &sc, &error)); 999 o2net_tx_can_proceed(nn, &sc, &error));
944 if (!ret && error) 1000 if (!ret && error)
@@ -946,6 +1002,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
946 if (ret) 1002 if (ret)
947 goto out; 1003 goto out;
948 1004
1005 o2net_set_nst_sock_container(&nst, sc);
1006
949 veclen = caller_veclen + 1; 1007 veclen = caller_veclen + 1;
950 vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); 1008 vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
951 if (vec == NULL) { 1009 if (vec == NULL) {
@@ -972,6 +1030,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
972 goto out; 1030 goto out;
973 1031
974 msg->msg_num = cpu_to_be32(nsw.ns_id); 1032 msg->msg_num = cpu_to_be32(nsw.ns_id);
1033 o2net_set_nst_msg_id(&nst, nsw.ns_id);
1034
1035 o2net_set_nst_send_time(&nst);
975 1036
976 /* finally, convert the message header to network byte-order 1037 /* finally, convert the message header to network byte-order
977 * and send */ 1038 * and send */
@@ -986,6 +1047,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
986 } 1047 }
987 1048
988 /* wait on other node's handler */ 1049 /* wait on other node's handler */
1050 o2net_set_nst_status_time(&nst);
989 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); 1051 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
990 1052
991 /* Note that we avoid overwriting the callers status return 1053 /* Note that we avoid overwriting the callers status return
@@ -998,6 +1060,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
998 mlog(0, "woken, returning system status %d, user status %d\n", 1060 mlog(0, "woken, returning system status %d, user status %d\n",
999 ret, nsw.ns_status); 1061 ret, nsw.ns_status);
1000out: 1062out:
1063 o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
1001 if (sc) 1064 if (sc)
1002 sc_put(sc); 1065 sc_put(sc);
1003 if (vec) 1066 if (vec)
@@ -1154,23 +1217,23 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1154 * but isn't. This can ultimately cause corruption. 1217 * but isn't. This can ultimately cause corruption.
1155 */ 1218 */
1156 if (be32_to_cpu(hand->o2net_idle_timeout_ms) != 1219 if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
1157 o2net_idle_timeout(sc->sc_node)) { 1220 o2net_idle_timeout()) {
1158 mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " 1221 mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
1159 "%u ms, but we use %u ms locally. disconnecting\n", 1222 "%u ms, but we use %u ms locally. disconnecting\n",
1160 SC_NODEF_ARGS(sc), 1223 SC_NODEF_ARGS(sc),
1161 be32_to_cpu(hand->o2net_idle_timeout_ms), 1224 be32_to_cpu(hand->o2net_idle_timeout_ms),
1162 o2net_idle_timeout(sc->sc_node)); 1225 o2net_idle_timeout());
1163 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1226 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1164 return -1; 1227 return -1;
1165 } 1228 }
1166 1229
1167 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != 1230 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
1168 o2net_keepalive_delay(sc->sc_node)) { 1231 o2net_keepalive_delay()) {
1169 mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " 1232 mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
1170 "%u ms, but we use %u ms locally. disconnecting\n", 1233 "%u ms, but we use %u ms locally. disconnecting\n",
1171 SC_NODEF_ARGS(sc), 1234 SC_NODEF_ARGS(sc),
1172 be32_to_cpu(hand->o2net_keepalive_delay_ms), 1235 be32_to_cpu(hand->o2net_keepalive_delay_ms),
1173 o2net_keepalive_delay(sc->sc_node)); 1236 o2net_keepalive_delay());
1174 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1237 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1175 return -1; 1238 return -1;
1176 } 1239 }
@@ -1193,6 +1256,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1193 * shut down already */ 1256 * shut down already */
1194 if (nn->nn_sc == sc) { 1257 if (nn->nn_sc == sc) {
1195 o2net_sc_reset_idle_timer(sc); 1258 o2net_sc_reset_idle_timer(sc);
1259 atomic_set(&nn->nn_timeout, 0);
1196 o2net_set_nn_state(nn, sc, 1, 0); 1260 o2net_set_nn_state(nn, sc, 1, 0);
1197 } 1261 }
1198 spin_unlock(&nn->nn_lock); 1262 spin_unlock(&nn->nn_lock);
@@ -1347,12 +1411,11 @@ static void o2net_initialize_handshake(void)
1347{ 1411{
1348 o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( 1412 o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
1349 O2HB_MAX_WRITE_TIMEOUT_MS); 1413 O2HB_MAX_WRITE_TIMEOUT_MS);
1350 o2net_hand->o2net_idle_timeout_ms = cpu_to_be32( 1414 o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout());
1351 o2net_idle_timeout(NULL));
1352 o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( 1415 o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
1353 o2net_keepalive_delay(NULL)); 1416 o2net_keepalive_delay());
1354 o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( 1417 o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
1355 o2net_reconnect_delay(NULL)); 1418 o2net_reconnect_delay());
1356} 1419}
1357 1420
1358/* ------------------------------------------------------------ */ 1421/* ------------------------------------------------------------ */
@@ -1391,14 +1454,15 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
1391static void o2net_idle_timer(unsigned long data) 1454static void o2net_idle_timer(unsigned long data)
1392{ 1455{
1393 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1456 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1457 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1394 struct timeval now; 1458 struct timeval now;
1395 1459
1396 do_gettimeofday(&now); 1460 do_gettimeofday(&now);
1397 1461
1398 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1462 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1399 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1463 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1400 o2net_idle_timeout(sc->sc_node) / 1000, 1464 o2net_idle_timeout() / 1000,
1401 o2net_idle_timeout(sc->sc_node) % 1000); 1465 o2net_idle_timeout() % 1000);
1402 mlog(ML_NOTICE, "here are some times that might help debug the " 1466 mlog(ML_NOTICE, "here are some times that might help debug the "
1403 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1467 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
1404 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1468 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
@@ -1413,6 +1477,12 @@ static void o2net_idle_timer(unsigned long data)
1413 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, 1477 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
1414 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); 1478 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
1415 1479
1480 /*
1481 * Initialize the nn_timeout so that the next connection attempt
1482 * will continue in o2net_start_connect.
1483 */
1484 atomic_set(&nn->nn_timeout, 1);
1485
1416 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 1486 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
1417} 1487}
1418 1488
@@ -1420,10 +1490,10 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
1420{ 1490{
1421 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); 1491 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
1422 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, 1492 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
1423 msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node))); 1493 msecs_to_jiffies(o2net_keepalive_delay()));
1424 do_gettimeofday(&sc->sc_tv_timer); 1494 do_gettimeofday(&sc->sc_tv_timer);
1425 mod_timer(&sc->sc_idle_timeout, 1495 mod_timer(&sc->sc_idle_timeout,
1426 jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node))); 1496 jiffies + msecs_to_jiffies(o2net_idle_timeout()));
1427} 1497}
1428 1498
1429static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) 1499static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
@@ -1447,6 +1517,7 @@ static void o2net_start_connect(struct work_struct *work)
1447 struct socket *sock = NULL; 1517 struct socket *sock = NULL;
1448 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; 1518 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
1449 int ret = 0, stop; 1519 int ret = 0, stop;
1520 unsigned int timeout;
1450 1521
1451 /* if we're greater we initiate tx, otherwise we accept */ 1522 /* if we're greater we initiate tx, otherwise we accept */
1452 if (o2nm_this_node() <= o2net_num_from_nn(nn)) 1523 if (o2nm_this_node() <= o2net_num_from_nn(nn))
@@ -1466,8 +1537,17 @@ static void o2net_start_connect(struct work_struct *work)
1466 } 1537 }
1467 1538
1468 spin_lock(&nn->nn_lock); 1539 spin_lock(&nn->nn_lock);
1469 /* see if we already have one pending or have given up */ 1540 /*
1470 stop = (nn->nn_sc || nn->nn_persistent_error); 1541 * see if we already have one pending or have given up.
1542 * For nn_timeout, it is set when we close the connection
1543 * because of the idle time out. So it means that we have
1544 * at least connected to that node successfully once,
1545 * now try to connect to it again.
1546 */
1547 timeout = atomic_read(&nn->nn_timeout);
1548 stop = (nn->nn_sc ||
1549 (nn->nn_persistent_error &&
1550 (nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
1471 spin_unlock(&nn->nn_lock); 1551 spin_unlock(&nn->nn_lock);
1472 if (stop) 1552 if (stop)
1473 goto out; 1553 goto out;
@@ -1555,8 +1635,8 @@ static void o2net_connect_expired(struct work_struct *work)
1555 mlog(ML_ERROR, "no connection established with node %u after " 1635 mlog(ML_ERROR, "no connection established with node %u after "
1556 "%u.%u seconds, giving up and returning errors.\n", 1636 "%u.%u seconds, giving up and returning errors.\n",
1557 o2net_num_from_nn(nn), 1637 o2net_num_from_nn(nn),
1558 o2net_idle_timeout(NULL) / 1000, 1638 o2net_idle_timeout() / 1000,
1559 o2net_idle_timeout(NULL) % 1000); 1639 o2net_idle_timeout() % 1000);
1560 1640
1561 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); 1641 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1562 } 1642 }
@@ -1579,6 +1659,7 @@ void o2net_disconnect_node(struct o2nm_node *node)
1579 1659
1580 /* don't reconnect until it's heartbeating again */ 1660 /* don't reconnect until it's heartbeating again */
1581 spin_lock(&nn->nn_lock); 1661 spin_lock(&nn->nn_lock);
1662 atomic_set(&nn->nn_timeout, 0);
1582 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); 1663 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1583 spin_unlock(&nn->nn_lock); 1664 spin_unlock(&nn->nn_lock);
1584 1665
@@ -1610,20 +1691,15 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1610 1691
1611 /* ensure an immediate connect attempt */ 1692 /* ensure an immediate connect attempt */
1612 nn->nn_last_connect_attempt = jiffies - 1693 nn->nn_last_connect_attempt = jiffies -
1613 (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1); 1694 (msecs_to_jiffies(o2net_reconnect_delay()) + 1);
1614 1695
1615 if (node_num != o2nm_this_node()) { 1696 if (node_num != o2nm_this_node()) {
1616 /* heartbeat doesn't work unless a local node number is
1617 * configured and doing so brings up the o2net_wq, so we can
1618 * use it.. */
1619 queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
1620 msecs_to_jiffies(o2net_idle_timeout(node)));
1621
1622 /* believe it or not, accept and node hearbeating testing 1697 /* believe it or not, accept and node hearbeating testing
1623 * can succeed for this node before we got here.. so 1698 * can succeed for this node before we got here.. so
1624 * only use set_nn_state to clear the persistent error 1699 * only use set_nn_state to clear the persistent error
1625 * if that hasn't already happened */ 1700 * if that hasn't already happened */
1626 spin_lock(&nn->nn_lock); 1701 spin_lock(&nn->nn_lock);
1702 atomic_set(&nn->nn_timeout, 0);
1627 if (nn->nn_persistent_error) 1703 if (nn->nn_persistent_error)
1628 o2net_set_nn_state(nn, NULL, 0, 0); 1704 o2net_set_nn_state(nn, NULL, 0, 0);
1629 spin_unlock(&nn->nn_lock); 1705 spin_unlock(&nn->nn_lock);
@@ -1747,6 +1823,7 @@ static int o2net_accept_one(struct socket *sock)
1747 new_sock = NULL; 1823 new_sock = NULL;
1748 1824
1749 spin_lock(&nn->nn_lock); 1825 spin_lock(&nn->nn_lock);
1826 atomic_set(&nn->nn_timeout, 0);
1750 o2net_set_nn_state(nn, sc, 0, 0); 1827 o2net_set_nn_state(nn, sc, 0, 0);
1751 spin_unlock(&nn->nn_lock); 1828 spin_unlock(&nn->nn_lock);
1752 1829
@@ -1922,6 +1999,9 @@ int o2net_init(void)
1922 1999
1923 o2quo_init(); 2000 o2quo_init();
1924 2001
2002 if (o2net_debugfs_init())
2003 return -ENOMEM;
2004
1925 o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); 2005 o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
1926 o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); 2006 o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
1927 o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); 2007 o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
@@ -1941,6 +2021,7 @@ int o2net_init(void)
1941 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { 2021 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
1942 struct o2net_node *nn = o2net_nn_from_num(i); 2022 struct o2net_node *nn = o2net_nn_from_num(i);
1943 2023
2024 atomic_set(&nn->nn_timeout, 0);
1944 spin_lock_init(&nn->nn_lock); 2025 spin_lock_init(&nn->nn_lock);
1945 INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); 2026 INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
1946 INIT_DELAYED_WORK(&nn->nn_connect_expired, 2027 INIT_DELAYED_WORK(&nn->nn_connect_expired,
@@ -1962,4 +2043,5 @@ void o2net_exit(void)
1962 kfree(o2net_hand); 2043 kfree(o2net_hand);
1963 kfree(o2net_keep_req); 2044 kfree(o2net_keep_req);
1964 kfree(o2net_keep_resp); 2045 kfree(o2net_keep_resp);
2046 o2net_debugfs_exit();
1965} 2047}
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index f36f66aab3dd..a705d5d19036 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -117,4 +117,36 @@ int o2net_num_connected_peers(void);
117int o2net_init(void); 117int o2net_init(void);
118void o2net_exit(void); 118void o2net_exit(void);
119 119
120struct o2net_send_tracking;
121struct o2net_sock_container;
122
123#ifdef CONFIG_DEBUG_FS
124int o2net_debugfs_init(void);
125void o2net_debugfs_exit(void);
126void o2net_debug_add_nst(struct o2net_send_tracking *nst);
127void o2net_debug_del_nst(struct o2net_send_tracking *nst);
128void o2net_debug_add_sc(struct o2net_sock_container *sc);
129void o2net_debug_del_sc(struct o2net_sock_container *sc);
130#else
131static int o2net_debugfs_init(void)
132{
133 return 0;
134}
135static void o2net_debugfs_exit(void)
136{
137}
138static void o2net_debug_add_nst(struct o2net_send_tracking *nst)
139{
140}
141static void o2net_debug_del_nst(struct o2net_send_tracking *nst)
142{
143}
144static void o2net_debug_add_sc(struct o2net_sock_container *sc)
145{
146}
147static void o2net_debug_del_sc(struct o2net_sock_container *sc)
148{
149}
150#endif /* CONFIG_DEBUG_FS */
151
120#endif /* O2CLUSTER_TCP_H */ 152#endif /* O2CLUSTER_TCP_H */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index d25b9af28500..8d58cfe410b1 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -95,6 +95,8 @@ struct o2net_node {
95 unsigned nn_sc_valid:1; 95 unsigned nn_sc_valid:1;
96 /* if this is set tx just returns it */ 96 /* if this is set tx just returns it */
97 int nn_persistent_error; 97 int nn_persistent_error;
98 /* It is only set to 1 after the idle time out. */
99 atomic_t nn_timeout;
98 100
99 /* threads waiting for an sc to arrive wait on the wq for generation 101 /* threads waiting for an sc to arrive wait on the wq for generation
100 * to increase. it is increased when a connecting socket succeeds 102 * to increase. it is increased when a connecting socket succeeds
@@ -164,7 +166,9 @@ struct o2net_sock_container {
164 /* original handlers for the sockets */ 166 /* original handlers for the sockets */
165 void (*sc_state_change)(struct sock *sk); 167 void (*sc_state_change)(struct sock *sk);
166 void (*sc_data_ready)(struct sock *sk, int bytes); 168 void (*sc_data_ready)(struct sock *sk, int bytes);
167 169#ifdef CONFIG_DEBUG_FS
170 struct list_head sc_net_debug_item;
171#endif
168 struct timeval sc_tv_timer; 172 struct timeval sc_tv_timer;
169 struct timeval sc_tv_data_ready; 173 struct timeval sc_tv_data_ready;
170 struct timeval sc_tv_advance_start; 174 struct timeval sc_tv_advance_start;
@@ -206,4 +210,24 @@ struct o2net_status_wait {
206 struct list_head ns_node_item; 210 struct list_head ns_node_item;
207}; 211};
208 212
213#ifdef CONFIG_DEBUG_FS
214/* just for state dumps */
215struct o2net_send_tracking {
216 struct list_head st_net_debug_item;
217 struct task_struct *st_task;
218 struct o2net_sock_container *st_sc;
219 u32 st_id;
220 u32 st_msg_type;
221 u32 st_msg_key;
222 u8 st_node;
223 struct timeval st_sock_time;
224 struct timeval st_send_time;
225 struct timeval st_status_time;
226};
227#else
228struct o2net_send_tracking {
229 u32 dummy;
230};
231#endif /* CONFIG_DEBUG_FS */
232
209#endif /* O2CLUSTER_TCP_INTERNAL_H */ 233#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index ce3f7c29d270..190361375700 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,6 +1,6 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index dc8ea666efdb..d5a86fb81a49 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -49,6 +49,41 @@
49/* Intended to make it easier for us to switch out hash functions */ 49/* Intended to make it easier for us to switch out hash functions */
50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) 50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
51 51
52enum dlm_mle_type {
53 DLM_MLE_BLOCK,
54 DLM_MLE_MASTER,
55 DLM_MLE_MIGRATION
56};
57
58struct dlm_lock_name {
59 u8 len;
60 u8 name[DLM_LOCKID_NAME_MAX];
61};
62
63struct dlm_master_list_entry {
64 struct list_head list;
65 struct list_head hb_events;
66 struct dlm_ctxt *dlm;
67 spinlock_t spinlock;
68 wait_queue_head_t wq;
69 atomic_t woken;
70 struct kref mle_refs;
71 int inuse;
72 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
73 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
74 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
75 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
76 u8 master;
77 u8 new_master;
78 enum dlm_mle_type type;
79 struct o2hb_callback_func mle_hb_up;
80 struct o2hb_callback_func mle_hb_down;
81 union {
82 struct dlm_lock_resource *res;
83 struct dlm_lock_name name;
84 } u;
85};
86
52enum dlm_ast_type { 87enum dlm_ast_type {
53 DLM_AST = 0, 88 DLM_AST = 0,
54 DLM_BAST, 89 DLM_BAST,
@@ -101,6 +136,7 @@ struct dlm_ctxt
101 struct list_head purge_list; 136 struct list_head purge_list;
102 struct list_head pending_asts; 137 struct list_head pending_asts;
103 struct list_head pending_basts; 138 struct list_head pending_basts;
139 struct list_head tracking_list;
104 unsigned int purge_count; 140 unsigned int purge_count;
105 spinlock_t spinlock; 141 spinlock_t spinlock;
106 spinlock_t ast_lock; 142 spinlock_t ast_lock;
@@ -122,6 +158,9 @@ struct dlm_ctxt
122 atomic_t remote_resources; 158 atomic_t remote_resources;
123 atomic_t unknown_resources; 159 atomic_t unknown_resources;
124 160
161 struct dlm_debug_ctxt *dlm_debug_ctxt;
162 struct dentry *dlm_debugfs_subroot;
163
125 /* NOTE: Next three are protected by dlm_domain_lock */ 164 /* NOTE: Next three are protected by dlm_domain_lock */
126 struct kref dlm_refs; 165 struct kref dlm_refs;
127 enum dlm_ctxt_state dlm_state; 166 enum dlm_ctxt_state dlm_state;
@@ -270,6 +309,9 @@ struct dlm_lock_resource
270 struct list_head dirty; 309 struct list_head dirty;
271 struct list_head recovering; // dlm_recovery_ctxt.resources list 310 struct list_head recovering; // dlm_recovery_ctxt.resources list
272 311
312 /* Added during init and removed during release */
313 struct list_head tracking; /* dlm->tracking_list */
314
273 /* unused lock resources have their last_used stamped and are 315 /* unused lock resources have their last_used stamped and are
274 * put on a list for the dlm thread to run. */ 316 * put on a list for the dlm thread to run. */
275 unsigned long last_used; 317 unsigned long last_used;
@@ -963,9 +1005,16 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
963 DLM_LOCK_RES_MIGRATING)); 1005 DLM_LOCK_RES_MIGRATING));
964} 1006}
965 1007
1008/* create/destroy slab caches */
1009int dlm_init_master_caches(void);
1010void dlm_destroy_master_caches(void);
1011
1012int dlm_init_lock_cache(void);
1013void dlm_destroy_lock_cache(void);
966 1014
967int dlm_init_mle_cache(void); 1015int dlm_init_mle_cache(void);
968void dlm_destroy_mle_cache(void); 1016void dlm_destroy_mle_cache(void);
1017
969void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); 1018void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
970int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, 1019int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
971 struct dlm_lock_resource *res); 1020 struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 64239b37e5d4..5f6d858770a2 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * debug functionality for the dlm 6 * debug functionality for the dlm
7 * 7 *
8 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004, 2008 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
@@ -30,6 +30,7 @@
30#include <linux/utsname.h> 30#include <linux/utsname.h>
31#include <linux/sysctl.h> 31#include <linux/sysctl.h>
32#include <linux/spinlock.h> 32#include <linux/spinlock.h>
33#include <linux/debugfs.h>
33 34
34#include "cluster/heartbeat.h" 35#include "cluster/heartbeat.h"
35#include "cluster/nodemanager.h" 36#include "cluster/nodemanager.h"
@@ -37,17 +38,16 @@
37 38
38#include "dlmapi.h" 39#include "dlmapi.h"
39#include "dlmcommon.h" 40#include "dlmcommon.h"
40
41#include "dlmdomain.h" 41#include "dlmdomain.h"
42#include "dlmdebug.h"
42 43
43#define MLOG_MASK_PREFIX ML_DLM 44#define MLOG_MASK_PREFIX ML_DLM
44#include "cluster/masklog.h" 45#include "cluster/masklog.h"
45 46
47int stringify_lockname(const char *lockname, int locklen, char *buf, int len);
48
46void dlm_print_one_lock_resource(struct dlm_lock_resource *res) 49void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
47{ 50{
48 mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
49 res->lockname.len, res->lockname.name,
50 res->owner, res->state);
51 spin_lock(&res->spinlock); 51 spin_lock(&res->spinlock);
52 __dlm_print_one_lock_resource(res); 52 __dlm_print_one_lock_resource(res);
53 spin_unlock(&res->spinlock); 53 spin_unlock(&res->spinlock);
@@ -58,7 +58,7 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
58 int bit; 58 int bit;
59 assert_spin_locked(&res->spinlock); 59 assert_spin_locked(&res->spinlock);
60 60
61 mlog(ML_NOTICE, " refmap nodes: [ "); 61 printk(" refmap nodes: [ ");
62 bit = 0; 62 bit = 0;
63 while (1) { 63 while (1) {
64 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); 64 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
@@ -70,63 +70,66 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
70 printk("], inflight=%u\n", res->inflight_locks); 70 printk("], inflight=%u\n", res->inflight_locks);
71} 71}
72 72
73static void __dlm_print_lock(struct dlm_lock *lock)
74{
75 spin_lock(&lock->spinlock);
76
77 printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, "
78 "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
79 "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
80 lock->ml.type, lock->ml.convert_type, lock->ml.node,
81 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
82 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
83 atomic_read(&lock->lock_refs.refcount),
84 (list_empty(&lock->ast_list) ? 'y' : 'n'),
85 (lock->ast_pending ? 'y' : 'n'),
86 (list_empty(&lock->bast_list) ? 'y' : 'n'),
87 (lock->bast_pending ? 'y' : 'n'),
88 (lock->convert_pending ? 'y' : 'n'),
89 (lock->lock_pending ? 'y' : 'n'),
90 (lock->cancel_pending ? 'y' : 'n'),
91 (lock->unlock_pending ? 'y' : 'n'));
92
93 spin_unlock(&lock->spinlock);
94}
95
73void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) 96void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
74{ 97{
75 struct list_head *iter2; 98 struct list_head *iter2;
76 struct dlm_lock *lock; 99 struct dlm_lock *lock;
100 char buf[DLM_LOCKID_NAME_MAX];
77 101
78 assert_spin_locked(&res->spinlock); 102 assert_spin_locked(&res->spinlock);
79 103
80 mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", 104 stringify_lockname(res->lockname.name, res->lockname.len,
81 res->lockname.len, res->lockname.name, 105 buf, sizeof(buf) - 1);
82 res->owner, res->state); 106 printk("lockres: %s, owner=%u, state=%u\n",
83 mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", 107 buf, res->owner, res->state);
84 res->last_used, list_empty(&res->purge) ? "no" : "yes"); 108 printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
109 res->last_used, atomic_read(&res->refs.refcount),
110 list_empty(&res->purge) ? "no" : "yes");
111 printk(" on dirty list: %s, on reco list: %s, "
112 "migrating pending: %s\n",
113 list_empty(&res->dirty) ? "no" : "yes",
114 list_empty(&res->recovering) ? "no" : "yes",
115 res->migration_pending ? "yes" : "no");
116 printk(" inflight locks: %d, asts reserved: %d\n",
117 res->inflight_locks, atomic_read(&res->asts_reserved));
85 dlm_print_lockres_refmap(res); 118 dlm_print_lockres_refmap(res);
86 mlog(ML_NOTICE, " granted queue: \n"); 119 printk(" granted queue:\n");
87 list_for_each(iter2, &res->granted) { 120 list_for_each(iter2, &res->granted) {
88 lock = list_entry(iter2, struct dlm_lock, list); 121 lock = list_entry(iter2, struct dlm_lock, list);
89 spin_lock(&lock->spinlock); 122 __dlm_print_lock(lock);
90 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
91 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
92 lock->ml.type, lock->ml.convert_type, lock->ml.node,
93 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
94 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
95 list_empty(&lock->ast_list) ? 'y' : 'n',
96 lock->ast_pending ? 'y' : 'n',
97 list_empty(&lock->bast_list) ? 'y' : 'n',
98 lock->bast_pending ? 'y' : 'n');
99 spin_unlock(&lock->spinlock);
100 } 123 }
101 mlog(ML_NOTICE, " converting queue: \n"); 124 printk(" converting queue:\n");
102 list_for_each(iter2, &res->converting) { 125 list_for_each(iter2, &res->converting) {
103 lock = list_entry(iter2, struct dlm_lock, list); 126 lock = list_entry(iter2, struct dlm_lock, list);
104 spin_lock(&lock->spinlock); 127 __dlm_print_lock(lock);
105 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
106 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
107 lock->ml.type, lock->ml.convert_type, lock->ml.node,
108 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
109 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
110 list_empty(&lock->ast_list) ? 'y' : 'n',
111 lock->ast_pending ? 'y' : 'n',
112 list_empty(&lock->bast_list) ? 'y' : 'n',
113 lock->bast_pending ? 'y' : 'n');
114 spin_unlock(&lock->spinlock);
115 } 128 }
116 mlog(ML_NOTICE, " blocked queue: \n"); 129 printk(" blocked queue:\n");
117 list_for_each(iter2, &res->blocked) { 130 list_for_each(iter2, &res->blocked) {
118 lock = list_entry(iter2, struct dlm_lock, list); 131 lock = list_entry(iter2, struct dlm_lock, list);
119 spin_lock(&lock->spinlock); 132 __dlm_print_lock(lock);
120 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
121 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
122 lock->ml.type, lock->ml.convert_type, lock->ml.node,
123 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
124 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
125 list_empty(&lock->ast_list) ? 'y' : 'n',
126 lock->ast_pending ? 'y' : 'n',
127 list_empty(&lock->bast_list) ? 'y' : 'n',
128 lock->bast_pending ? 'y' : 'n');
129 spin_unlock(&lock->spinlock);
130 } 133 }
131} 134}
132 135
@@ -136,31 +139,6 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
136} 139}
137EXPORT_SYMBOL_GPL(dlm_print_one_lock); 140EXPORT_SYMBOL_GPL(dlm_print_one_lock);
138 141
139#if 0
140void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
141{
142 struct dlm_lock_resource *res;
143 struct hlist_node *iter;
144 struct hlist_head *bucket;
145 int i;
146
147 mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
148 dlm->name, dlm->node_num, dlm->key);
149 if (!dlm || !dlm->name) {
150 mlog(ML_ERROR, "dlm=%p\n", dlm);
151 return;
152 }
153
154 spin_lock(&dlm->spinlock);
155 for (i=0; i<DLM_HASH_BUCKETS; i++) {
156 bucket = dlm_lockres_hash(dlm, i);
157 hlist_for_each_entry(res, iter, bucket, hash_node)
158 dlm_print_one_lock_resource(res);
159 }
160 spin_unlock(&dlm->spinlock);
161}
162#endif /* 0 */
163
164static const char *dlm_errnames[] = { 142static const char *dlm_errnames[] = {
165 [DLM_NORMAL] = "DLM_NORMAL", 143 [DLM_NORMAL] = "DLM_NORMAL",
166 [DLM_GRANTED] = "DLM_GRANTED", 144 [DLM_GRANTED] = "DLM_GRANTED",
@@ -266,3 +244,792 @@ const char *dlm_errname(enum dlm_status err)
266 return dlm_errnames[err]; 244 return dlm_errnames[err];
267} 245}
268EXPORT_SYMBOL_GPL(dlm_errname); 246EXPORT_SYMBOL_GPL(dlm_errname);
247
248/* NOTE: This function converts a lockname into a string. It uses knowledge
249 * of the format of the lockname that should be outside the purview of the dlm.
250 * We are adding only to make dlm debugging slightly easier.
251 *
252 * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h.
253 */
254int stringify_lockname(const char *lockname, int locklen, char *buf, int len)
255{
256 int out = 0;
257 __be64 inode_blkno_be;
258
259#define OCFS2_DENTRY_LOCK_INO_START 18
260 if (*lockname == 'N') {
261 memcpy((__be64 *)&inode_blkno_be,
262 (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START],
263 sizeof(__be64));
264 out += snprintf(buf + out, len - out, "%.*s%08x",
265 OCFS2_DENTRY_LOCK_INO_START - 1, lockname,
266 (unsigned int)be64_to_cpu(inode_blkno_be));
267 } else
268 out += snprintf(buf + out, len - out, "%.*s",
269 locklen, lockname);
270 return out;
271}
272
273static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
274 char *buf, int len)
275{
276 int out = 0;
277 int i = -1;
278
279 while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
280 out += snprintf(buf + out, len - out, "%d ", i);
281
282 return out;
283}
284
285static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
286{
287 int out = 0;
288 unsigned int namelen;
289 const char *name;
290 char *mle_type;
291
292 if (mle->type != DLM_MLE_MASTER) {
293 namelen = mle->u.name.len;
294 name = mle->u.name.name;
295 } else {
296 namelen = mle->u.res->lockname.len;
297 name = mle->u.res->lockname.name;
298 }
299
300 if (mle->type == DLM_MLE_BLOCK)
301 mle_type = "BLK";
302 else if (mle->type == DLM_MLE_MASTER)
303 mle_type = "MAS";
304 else
305 mle_type = "MIG";
306
307 out += stringify_lockname(name, namelen, buf + out, len - out);
308 out += snprintf(buf + out, len - out,
309 "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
310 mle_type, mle->master, mle->new_master,
311 !list_empty(&mle->hb_events),
312 !!mle->inuse,
313 atomic_read(&mle->mle_refs.refcount));
314
315 out += snprintf(buf + out, len - out, "Maybe=");
316 out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
317 buf + out, len - out);
318 out += snprintf(buf + out, len - out, "\n");
319
320 out += snprintf(buf + out, len - out, "Vote=");
321 out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
322 buf + out, len - out);
323 out += snprintf(buf + out, len - out, "\n");
324
325 out += snprintf(buf + out, len - out, "Response=");
326 out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
327 buf + out, len - out);
328 out += snprintf(buf + out, len - out, "\n");
329
330 out += snprintf(buf + out, len - out, "Node=");
331 out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
332 buf + out, len - out);
333 out += snprintf(buf + out, len - out, "\n");
334
335 out += snprintf(buf + out, len - out, "\n");
336
337 return out;
338}
339
340void dlm_print_one_mle(struct dlm_master_list_entry *mle)
341{
342 char *buf;
343
344 buf = (char *) get_zeroed_page(GFP_NOFS);
345 if (buf) {
346 dump_mle(mle, buf, PAGE_SIZE - 1);
347 free_page((unsigned long)buf);
348 }
349}
350
351#ifdef CONFIG_DEBUG_FS
352
353static struct dentry *dlm_debugfs_root = NULL;
354
355#define DLM_DEBUGFS_DIR "o2dlm"
356#define DLM_DEBUGFS_DLM_STATE "dlm_state"
357#define DLM_DEBUGFS_LOCKING_STATE "locking_state"
358#define DLM_DEBUGFS_MLE_STATE "mle_state"
359#define DLM_DEBUGFS_PURGE_LIST "purge_list"
360
361/* begin - utils funcs */
362static void dlm_debug_free(struct kref *kref)
363{
364 struct dlm_debug_ctxt *dc;
365
366 dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt);
367
368 kfree(dc);
369}
370
371void dlm_debug_put(struct dlm_debug_ctxt *dc)
372{
373 if (dc)
374 kref_put(&dc->debug_refcnt, dlm_debug_free);
375}
376
377static void dlm_debug_get(struct dlm_debug_ctxt *dc)
378{
379 kref_get(&dc->debug_refcnt);
380}
381
382static struct debug_buffer *debug_buffer_allocate(void)
383{
384 struct debug_buffer *db = NULL;
385
386 db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
387 if (!db)
388 goto bail;
389
390 db->len = PAGE_SIZE;
391 db->buf = kmalloc(db->len, GFP_KERNEL);
392 if (!db->buf)
393 goto bail;
394
395 return db;
396bail:
397 kfree(db);
398 return NULL;
399}
400
401static ssize_t debug_buffer_read(struct file *file, char __user *buf,
402 size_t nbytes, loff_t *ppos)
403{
404 struct debug_buffer *db = file->private_data;
405
406 return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
407}
408
409static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
410{
411 struct debug_buffer *db = file->private_data;
412 loff_t new = -1;
413
414 switch (whence) {
415 case 0:
416 new = off;
417 break;
418 case 1:
419 new = file->f_pos + off;
420 break;
421 }
422
423 if (new < 0 || new > db->len)
424 return -EINVAL;
425
426 return (file->f_pos = new);
427}
428
429static int debug_buffer_release(struct inode *inode, struct file *file)
430{
431 struct debug_buffer *db = (struct debug_buffer *)file->private_data;
432
433 if (db)
434 kfree(db->buf);
435 kfree(db);
436
437 return 0;
438}
439/* end - util funcs */
440
441/* begin - purge list funcs */
442static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
443{
444 struct dlm_lock_resource *res;
445 int out = 0;
446 unsigned long total = 0;
447
448 out += snprintf(db->buf + out, db->len - out,
449 "Dumping Purgelist for Domain: %s\n", dlm->name);
450
451 spin_lock(&dlm->spinlock);
452 list_for_each_entry(res, &dlm->purge_list, purge) {
453 ++total;
454 if (db->len - out < 100)
455 continue;
456 spin_lock(&res->spinlock);
457 out += stringify_lockname(res->lockname.name,
458 res->lockname.len,
459 db->buf + out, db->len - out);
460 out += snprintf(db->buf + out, db->len - out, "\t%ld\n",
461 (jiffies - res->last_used)/HZ);
462 spin_unlock(&res->spinlock);
463 }
464 spin_unlock(&dlm->spinlock);
465
466 out += snprintf(db->buf + out, db->len - out,
467 "Total on list: %ld\n", total);
468
469 return out;
470}
471
472static int debug_purgelist_open(struct inode *inode, struct file *file)
473{
474 struct dlm_ctxt *dlm = inode->i_private;
475 struct debug_buffer *db;
476
477 db = debug_buffer_allocate();
478 if (!db)
479 goto bail;
480
481 db->len = debug_purgelist_print(dlm, db);
482
483 file->private_data = db;
484
485 return 0;
486bail:
487 return -ENOMEM;
488}
489
490static struct file_operations debug_purgelist_fops = {
491 .open = debug_purgelist_open,
492 .release = debug_buffer_release,
493 .read = debug_buffer_read,
494 .llseek = debug_buffer_llseek,
495};
496/* end - purge list funcs */
497
498/* begin - debug mle funcs */
499static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
500{
501 struct dlm_master_list_entry *mle;
502 int out = 0;
503 unsigned long total = 0;
504
505 out += snprintf(db->buf + out, db->len - out,
506 "Dumping MLEs for Domain: %s\n", dlm->name);
507
508 spin_lock(&dlm->master_lock);
509 list_for_each_entry(mle, &dlm->master_list, list) {
510 ++total;
511 if (db->len - out < 200)
512 continue;
513 out += dump_mle(mle, db->buf + out, db->len - out);
514 }
515 spin_unlock(&dlm->master_lock);
516
517 out += snprintf(db->buf + out, db->len - out,
518 "Total on list: %ld\n", total);
519 return out;
520}
521
522static int debug_mle_open(struct inode *inode, struct file *file)
523{
524 struct dlm_ctxt *dlm = inode->i_private;
525 struct debug_buffer *db;
526
527 db = debug_buffer_allocate();
528 if (!db)
529 goto bail;
530
531 db->len = debug_mle_print(dlm, db);
532
533 file->private_data = db;
534
535 return 0;
536bail:
537 return -ENOMEM;
538}
539
540static struct file_operations debug_mle_fops = {
541 .open = debug_mle_open,
542 .release = debug_buffer_release,
543 .read = debug_buffer_read,
544 .llseek = debug_buffer_llseek,
545};
546
547/* end - debug mle funcs */
548
549/* begin - debug lockres funcs */
550static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
551{
552 int out;
553
554#define DEBUG_LOCK_VERSION 1
555 spin_lock(&lock->spinlock);
556 out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
557 "%d,%d,%d,%d\n",
558 DEBUG_LOCK_VERSION,
559 list_type, lock->ml.type, lock->ml.convert_type,
560 lock->ml.node,
561 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
562 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
563 !list_empty(&lock->ast_list),
564 !list_empty(&lock->bast_list),
565 lock->ast_pending, lock->bast_pending,
566 lock->convert_pending, lock->lock_pending,
567 lock->cancel_pending, lock->unlock_pending,
568 atomic_read(&lock->lock_refs.refcount));
569 spin_unlock(&lock->spinlock);
570
571 return out;
572}
573
574static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
575{
576 struct dlm_lock *lock;
577 int i;
578 int out = 0;
579
580 out += snprintf(buf + out, len - out, "NAME:");
581 out += stringify_lockname(res->lockname.name, res->lockname.len,
582 buf + out, len - out);
583 out += snprintf(buf + out, len - out, "\n");
584
585#define DEBUG_LRES_VERSION 1
586 out += snprintf(buf + out, len - out,
587 "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n",
588 DEBUG_LRES_VERSION,
589 res->owner, res->state, res->last_used,
590 !list_empty(&res->purge),
591 !list_empty(&res->dirty),
592 !list_empty(&res->recovering),
593 res->inflight_locks, res->migration_pending,
594 atomic_read(&res->asts_reserved),
595 atomic_read(&res->refs.refcount));
596
597 /* refmap */
598 out += snprintf(buf + out, len - out, "RMAP:");
599 out += stringify_nodemap(res->refmap, O2NM_MAX_NODES,
600 buf + out, len - out);
601 out += snprintf(buf + out, len - out, "\n");
602
603 /* lvb */
604 out += snprintf(buf + out, len - out, "LVBX:");
605 for (i = 0; i < DLM_LVB_LEN; i++)
606 out += snprintf(buf + out, len - out,
607 "%02x", (unsigned char)res->lvb[i]);
608 out += snprintf(buf + out, len - out, "\n");
609
610 /* granted */
611 list_for_each_entry(lock, &res->granted, list)
612 out += dump_lock(lock, 0, buf + out, len - out);
613
614 /* converting */
615 list_for_each_entry(lock, &res->converting, list)
616 out += dump_lock(lock, 1, buf + out, len - out);
617
618 /* blocked */
619 list_for_each_entry(lock, &res->blocked, list)
620 out += dump_lock(lock, 2, buf + out, len - out);
621
622 out += snprintf(buf + out, len - out, "\n");
623
624 return out;
625}
626
627static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
628{
629 struct debug_lockres *dl = m->private;
630 struct dlm_ctxt *dlm = dl->dl_ctxt;
631 struct dlm_lock_resource *res = NULL;
632
633 spin_lock(&dlm->spinlock);
634
635 if (dl->dl_res) {
636 list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
637 if (dl->dl_res) {
638 dlm_lockres_put(dl->dl_res);
639 dl->dl_res = NULL;
640 }
641 if (&res->tracking == &dlm->tracking_list) {
642 mlog(0, "End of list found, %p\n", res);
643 dl = NULL;
644 break;
645 }
646 dlm_lockres_get(res);
647 dl->dl_res = res;
648 break;
649 }
650 } else {
651 if (!list_empty(&dlm->tracking_list)) {
652 list_for_each_entry(res, &dlm->tracking_list, tracking)
653 break;
654 dlm_lockres_get(res);
655 dl->dl_res = res;
656 } else
657 dl = NULL;
658 }
659
660 if (dl) {
661 spin_lock(&dl->dl_res->spinlock);
662 dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
663 spin_unlock(&dl->dl_res->spinlock);
664 }
665
666 spin_unlock(&dlm->spinlock);
667
668 return dl;
669}
670
671static void lockres_seq_stop(struct seq_file *m, void *v)
672{
673}
674
675static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos)
676{
677 return NULL;
678}
679
680static int lockres_seq_show(struct seq_file *s, void *v)
681{
682 struct debug_lockres *dl = (struct debug_lockres *)v;
683
684 seq_printf(s, "%s", dl->dl_buf);
685
686 return 0;
687}
688
689static struct seq_operations debug_lockres_ops = {
690 .start = lockres_seq_start,
691 .stop = lockres_seq_stop,
692 .next = lockres_seq_next,
693 .show = lockres_seq_show,
694};
695
696static int debug_lockres_open(struct inode *inode, struct file *file)
697{
698 struct dlm_ctxt *dlm = inode->i_private;
699 int ret = -ENOMEM;
700 struct seq_file *seq;
701 struct debug_lockres *dl = NULL;
702
703 dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL);
704 if (!dl) {
705 mlog_errno(ret);
706 goto bail;
707 }
708
709 dl->dl_len = PAGE_SIZE;
710 dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL);
711 if (!dl->dl_buf) {
712 mlog_errno(ret);
713 goto bail;
714 }
715
716 ret = seq_open(file, &debug_lockres_ops);
717 if (ret) {
718 mlog_errno(ret);
719 goto bail;
720 }
721
722 seq = (struct seq_file *) file->private_data;
723 seq->private = dl;
724
725 dlm_grab(dlm);
726 dl->dl_ctxt = dlm;
727
728 return 0;
729bail:
730 if (dl)
731 kfree(dl->dl_buf);
732 kfree(dl);
733 return ret;
734}
735
736static int debug_lockres_release(struct inode *inode, struct file *file)
737{
738 struct seq_file *seq = (struct seq_file *)file->private_data;
739 struct debug_lockres *dl = (struct debug_lockres *)seq->private;
740
741 if (dl->dl_res)
742 dlm_lockres_put(dl->dl_res);
743 dlm_put(dl->dl_ctxt);
744 kfree(dl->dl_buf);
745 return seq_release_private(inode, file);
746}
747
748static struct file_operations debug_lockres_fops = {
749 .open = debug_lockres_open,
750 .release = debug_lockres_release,
751 .read = seq_read,
752 .llseek = seq_lseek,
753};
754/* end - debug lockres funcs */
755
756/* begin - debug state funcs */
757static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
758{
759 int out = 0;
760 struct dlm_reco_node_data *node;
761 char *state;
762 int lres, rres, ures, tres;
763
764 lres = atomic_read(&dlm->local_resources);
765 rres = atomic_read(&dlm->remote_resources);
766 ures = atomic_read(&dlm->unknown_resources);
767 tres = lres + rres + ures;
768
769 spin_lock(&dlm->spinlock);
770
771 switch (dlm->dlm_state) {
772 case DLM_CTXT_NEW:
773 state = "NEW"; break;
774 case DLM_CTXT_JOINED:
775 state = "JOINED"; break;
776 case DLM_CTXT_IN_SHUTDOWN:
777 state = "SHUTDOWN"; break;
778 case DLM_CTXT_LEAVING:
779 state = "LEAVING"; break;
780 default:
781 state = "UNKNOWN"; break;
782 }
783
784 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
785 out += snprintf(db->buf + out, db->len - out,
786 "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key);
787
788 /* Thread Pid: xxx Node: xxx State: xxxxx */
789 out += snprintf(db->buf + out, db->len - out,
790 "Thread Pid: %d Node: %d State: %s\n",
791 dlm->dlm_thread_task->pid, dlm->node_num, state);
792
793 /* Number of Joins: xxx Joining Node: xxx */
794 out += snprintf(db->buf + out, db->len - out,
795 "Number of Joins: %d Joining Node: %d\n",
796 dlm->num_joins, dlm->joining_node);
797
798 /* Domain Map: xx xx xx */
799 out += snprintf(db->buf + out, db->len - out, "Domain Map: ");
800 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
801 db->buf + out, db->len - out);
802 out += snprintf(db->buf + out, db->len - out, "\n");
803
804 /* Live Map: xx xx xx */
805 out += snprintf(db->buf + out, db->len - out, "Live Map: ");
806 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
807 db->buf + out, db->len - out);
808 out += snprintf(db->buf + out, db->len - out, "\n");
809
810 /* Mastered Resources Total: xxx Locally: xxx Remotely: ... */
811 out += snprintf(db->buf + out, db->len - out,
812 "Mastered Resources Total: %d Locally: %d "
813 "Remotely: %d Unknown: %d\n",
814 tres, lres, rres, ures);
815
816 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
817 out += snprintf(db->buf + out, db->len - out,
818 "Lists: Dirty=%s Purge=%s PendingASTs=%s "
819 "PendingBASTs=%s Master=%s\n",
820 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
821 (list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
822 (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
823 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
824 (list_empty(&dlm->master_list) ? "Empty" : "InUse"));
825
826 /* Purge Count: xxx Refs: xxx */
827 out += snprintf(db->buf + out, db->len - out,
828 "Purge Count: %d Refs: %d\n", dlm->purge_count,
829 atomic_read(&dlm->dlm_refs.refcount));
830
831 /* Dead Node: xxx */
832 out += snprintf(db->buf + out, db->len - out,
833 "Dead Node: %d\n", dlm->reco.dead_node);
834
835 /* What about DLM_RECO_STATE_FINALIZE? */
836 if (dlm->reco.state == DLM_RECO_STATE_ACTIVE)
837 state = "ACTIVE";
838 else
839 state = "INACTIVE";
840
841 /* Recovery Pid: xxxx Master: xxx State: xxxx */
842 out += snprintf(db->buf + out, db->len - out,
843 "Recovery Pid: %d Master: %d State: %s\n",
844 dlm->dlm_reco_thread_task->pid,
845 dlm->reco.new_master, state);
846
847 /* Recovery Map: xx xx */
848 out += snprintf(db->buf + out, db->len - out, "Recovery Map: ");
849 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
850 db->buf + out, db->len - out);
851 out += snprintf(db->buf + out, db->len - out, "\n");
852
853 /* Recovery Node State: */
854 out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n");
855 list_for_each_entry(node, &dlm->reco.node_data, list) {
856 switch (node->state) {
857 case DLM_RECO_NODE_DATA_INIT:
858 state = "INIT";
859 break;
860 case DLM_RECO_NODE_DATA_REQUESTING:
861 state = "REQUESTING";
862 break;
863 case DLM_RECO_NODE_DATA_DEAD:
864 state = "DEAD";
865 break;
866 case DLM_RECO_NODE_DATA_RECEIVING:
867 state = "RECEIVING";
868 break;
869 case DLM_RECO_NODE_DATA_REQUESTED:
870 state = "REQUESTED";
871 break;
872 case DLM_RECO_NODE_DATA_DONE:
873 state = "DONE";
874 break;
875 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
876 state = "FINALIZE-SENT";
877 break;
878 default:
879 state = "BAD";
880 break;
881 }
882 out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n",
883 node->node_num, state);
884 }
885
886 spin_unlock(&dlm->spinlock);
887
888 return out;
889}
890
891static int debug_state_open(struct inode *inode, struct file *file)
892{
893 struct dlm_ctxt *dlm = inode->i_private;
894 struct debug_buffer *db = NULL;
895
896 db = debug_buffer_allocate();
897 if (!db)
898 goto bail;
899
900 db->len = debug_state_print(dlm, db);
901
902 file->private_data = db;
903
904 return 0;
905bail:
906 return -ENOMEM;
907}
908
909static struct file_operations debug_state_fops = {
910 .open = debug_state_open,
911 .release = debug_buffer_release,
912 .read = debug_buffer_read,
913 .llseek = debug_buffer_llseek,
914};
915/* end - debug state funcs */
916
917/* files in subroot */
918int dlm_debug_init(struct dlm_ctxt *dlm)
919{
920 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
921
922 /* for dumping dlm_ctxt */
923 dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
924 S_IFREG|S_IRUSR,
925 dlm->dlm_debugfs_subroot,
926 dlm, &debug_state_fops);
927 if (!dc->debug_state_dentry) {
928 mlog_errno(-ENOMEM);
929 goto bail;
930 }
931
932 /* for dumping lockres */
933 dc->debug_lockres_dentry =
934 debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
935 S_IFREG|S_IRUSR,
936 dlm->dlm_debugfs_subroot,
937 dlm, &debug_lockres_fops);
938 if (!dc->debug_lockres_dentry) {
939 mlog_errno(-ENOMEM);
940 goto bail;
941 }
942
943 /* for dumping mles */
944 dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
945 S_IFREG|S_IRUSR,
946 dlm->dlm_debugfs_subroot,
947 dlm, &debug_mle_fops);
948 if (!dc->debug_mle_dentry) {
949 mlog_errno(-ENOMEM);
950 goto bail;
951 }
952
953 /* for dumping lockres on the purge list */
954 dc->debug_purgelist_dentry =
955 debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
956 S_IFREG|S_IRUSR,
957 dlm->dlm_debugfs_subroot,
958 dlm, &debug_purgelist_fops);
959 if (!dc->debug_purgelist_dentry) {
960 mlog_errno(-ENOMEM);
961 goto bail;
962 }
963
964 dlm_debug_get(dc);
965 return 0;
966
967bail:
968 dlm_debug_shutdown(dlm);
969 return -ENOMEM;
970}
971
972void dlm_debug_shutdown(struct dlm_ctxt *dlm)
973{
974 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
975
976 if (dc) {
977 if (dc->debug_purgelist_dentry)
978 debugfs_remove(dc->debug_purgelist_dentry);
979 if (dc->debug_mle_dentry)
980 debugfs_remove(dc->debug_mle_dentry);
981 if (dc->debug_lockres_dentry)
982 debugfs_remove(dc->debug_lockres_dentry);
983 if (dc->debug_state_dentry)
984 debugfs_remove(dc->debug_state_dentry);
985 dlm_debug_put(dc);
986 }
987}
988
989/* subroot - domain dir */
990int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
991{
992 dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
993 dlm_debugfs_root);
994 if (!dlm->dlm_debugfs_subroot) {
995 mlog_errno(-ENOMEM);
996 goto bail;
997 }
998
999 dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
1000 GFP_KERNEL);
1001 if (!dlm->dlm_debug_ctxt) {
1002 mlog_errno(-ENOMEM);
1003 goto bail;
1004 }
1005 kref_init(&dlm->dlm_debug_ctxt->debug_refcnt);
1006
1007 return 0;
1008bail:
1009 dlm_destroy_debugfs_subroot(dlm);
1010 return -ENOMEM;
1011}
1012
1013void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
1014{
1015 if (dlm->dlm_debugfs_subroot)
1016 debugfs_remove(dlm->dlm_debugfs_subroot);
1017}
1018
1019/* debugfs root */
1020int dlm_create_debugfs_root(void)
1021{
1022 dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL);
1023 if (!dlm_debugfs_root) {
1024 mlog_errno(-ENOMEM);
1025 return -ENOMEM;
1026 }
1027 return 0;
1028}
1029
1030void dlm_destroy_debugfs_root(void)
1031{
1032 if (dlm_debugfs_root)
1033 debugfs_remove(dlm_debugfs_root);
1034}
1035#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644
index 000000000000..d34a62a3a625
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -0,0 +1,86 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdebug.h
5 *
6 * Copyright (C) 2008 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 021110-1307, USA.
22 *
23 */
24
25#ifndef DLMDEBUG_H
26#define DLMDEBUG_H
27
28void dlm_print_one_mle(struct dlm_master_list_entry *mle);
29
30#ifdef CONFIG_DEBUG_FS
31
32struct dlm_debug_ctxt {
33 struct kref debug_refcnt;
34 struct dentry *debug_state_dentry;
35 struct dentry *debug_lockres_dentry;
36 struct dentry *debug_mle_dentry;
37 struct dentry *debug_purgelist_dentry;
38};
39
40struct debug_buffer {
41 int len;
42 char *buf;
43};
44
45struct debug_lockres {
46 int dl_len;
47 char *dl_buf;
48 struct dlm_ctxt *dl_ctxt;
49 struct dlm_lock_resource *dl_res;
50};
51
52int dlm_debug_init(struct dlm_ctxt *dlm);
53void dlm_debug_shutdown(struct dlm_ctxt *dlm);
54
55int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
56void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
57
58int dlm_create_debugfs_root(void);
59void dlm_destroy_debugfs_root(void);
60
61#else
62
63static int dlm_debug_init(struct dlm_ctxt *dlm)
64{
65 return 0;
66}
67static void dlm_debug_shutdown(struct dlm_ctxt *dlm)
68{
69}
70static int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
71{
72 return 0;
73}
74static void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
75{
76}
77static int dlm_create_debugfs_root(void)
78{
79 return 0;
80}
81static void dlm_destroy_debugfs_root(void)
82{
83}
84
85#endif /* CONFIG_DEBUG_FS */
86#endif /* DLMDEBUG_H */
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 0879d86113e3..63f8125824e8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -33,6 +33,7 @@
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
34#include <linux/delay.h> 34#include <linux/delay.h>
35#include <linux/err.h> 35#include <linux/err.h>
36#include <linux/debugfs.h>
36 37
37#include "cluster/heartbeat.h" 38#include "cluster/heartbeat.h"
38#include "cluster/nodemanager.h" 39#include "cluster/nodemanager.h"
@@ -40,8 +41,8 @@
40 41
41#include "dlmapi.h" 42#include "dlmapi.h"
42#include "dlmcommon.h" 43#include "dlmcommon.h"
43
44#include "dlmdomain.h" 44#include "dlmdomain.h"
45#include "dlmdebug.h"
45 46
46#include "dlmver.h" 47#include "dlmver.h"
47 48
@@ -298,6 +299,8 @@ static int dlm_wait_on_domain_helper(const char *domain)
298 299
299static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) 300static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
300{ 301{
302 dlm_destroy_debugfs_subroot(dlm);
303
301 if (dlm->lockres_hash) 304 if (dlm->lockres_hash)
302 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); 305 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
303 306
@@ -395,6 +398,7 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
395static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) 398static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
396{ 399{
397 dlm_unregister_domain_handlers(dlm); 400 dlm_unregister_domain_handlers(dlm);
401 dlm_debug_shutdown(dlm);
398 dlm_complete_thread(dlm); 402 dlm_complete_thread(dlm);
399 dlm_complete_recovery_thread(dlm); 403 dlm_complete_recovery_thread(dlm);
400 dlm_destroy_dlm_worker(dlm); 404 dlm_destroy_dlm_worker(dlm);
@@ -644,6 +648,7 @@ int dlm_shutting_down(struct dlm_ctxt *dlm)
644void dlm_unregister_domain(struct dlm_ctxt *dlm) 648void dlm_unregister_domain(struct dlm_ctxt *dlm)
645{ 649{
646 int leave = 0; 650 int leave = 0;
651 struct dlm_lock_resource *res;
647 652
648 spin_lock(&dlm_domain_lock); 653 spin_lock(&dlm_domain_lock);
649 BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); 654 BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
@@ -673,6 +678,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
673 msleep(500); 678 msleep(500);
674 mlog(0, "%s: more migration to do\n", dlm->name); 679 mlog(0, "%s: more migration to do\n", dlm->name);
675 } 680 }
681
682 /* This list should be empty. If not, print remaining lockres */
683 if (!list_empty(&dlm->tracking_list)) {
684 mlog(ML_ERROR, "Following lockres' are still on the "
685 "tracking list:\n");
686 list_for_each_entry(res, &dlm->tracking_list, tracking)
687 dlm_print_one_lock_resource(res);
688 }
689
676 dlm_mark_domain_leaving(dlm); 690 dlm_mark_domain_leaving(dlm);
677 dlm_leave_domain(dlm); 691 dlm_leave_domain(dlm);
678 dlm_complete_dlm_shutdown(dlm); 692 dlm_complete_dlm_shutdown(dlm);
@@ -1405,6 +1419,12 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
1405 goto bail; 1419 goto bail;
1406 } 1420 }
1407 1421
1422 status = dlm_debug_init(dlm);
1423 if (status < 0) {
1424 mlog_errno(status);
1425 goto bail;
1426 }
1427
1408 status = dlm_launch_thread(dlm); 1428 status = dlm_launch_thread(dlm);
1409 if (status < 0) { 1429 if (status < 0) {
1410 mlog_errno(status); 1430 mlog_errno(status);
@@ -1472,6 +1492,7 @@ bail:
1472 1492
1473 if (status) { 1493 if (status) {
1474 dlm_unregister_domain_handlers(dlm); 1494 dlm_unregister_domain_handlers(dlm);
1495 dlm_debug_shutdown(dlm);
1475 dlm_complete_thread(dlm); 1496 dlm_complete_thread(dlm);
1476 dlm_complete_recovery_thread(dlm); 1497 dlm_complete_recovery_thread(dlm);
1477 dlm_destroy_dlm_worker(dlm); 1498 dlm_destroy_dlm_worker(dlm);
@@ -1484,6 +1505,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1484 u32 key) 1505 u32 key)
1485{ 1506{
1486 int i; 1507 int i;
1508 int ret;
1487 struct dlm_ctxt *dlm = NULL; 1509 struct dlm_ctxt *dlm = NULL;
1488 1510
1489 dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); 1511 dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
@@ -1516,6 +1538,15 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1516 dlm->key = key; 1538 dlm->key = key;
1517 dlm->node_num = o2nm_this_node(); 1539 dlm->node_num = o2nm_this_node();
1518 1540
1541 ret = dlm_create_debugfs_subroot(dlm);
1542 if (ret < 0) {
1543 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
1544 kfree(dlm->name);
1545 kfree(dlm);
1546 dlm = NULL;
1547 goto leave;
1548 }
1549
1519 spin_lock_init(&dlm->spinlock); 1550 spin_lock_init(&dlm->spinlock);
1520 spin_lock_init(&dlm->master_lock); 1551 spin_lock_init(&dlm->master_lock);
1521 spin_lock_init(&dlm->ast_lock); 1552 spin_lock_init(&dlm->ast_lock);
@@ -1526,6 +1557,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1526 INIT_LIST_HEAD(&dlm->reco.node_data); 1557 INIT_LIST_HEAD(&dlm->reco.node_data);
1527 INIT_LIST_HEAD(&dlm->purge_list); 1558 INIT_LIST_HEAD(&dlm->purge_list);
1528 INIT_LIST_HEAD(&dlm->dlm_domain_handlers); 1559 INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
1560 INIT_LIST_HEAD(&dlm->tracking_list);
1529 dlm->reco.state = 0; 1561 dlm->reco.state = 0;
1530 1562
1531 INIT_LIST_HEAD(&dlm->pending_asts); 1563 INIT_LIST_HEAD(&dlm->pending_asts);
@@ -1816,21 +1848,49 @@ static int __init dlm_init(void)
1816 dlm_print_version(); 1848 dlm_print_version();
1817 1849
1818 status = dlm_init_mle_cache(); 1850 status = dlm_init_mle_cache();
1819 if (status) 1851 if (status) {
1820 return -1; 1852 mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
1853 goto error;
1854 }
1855
1856 status = dlm_init_master_caches();
1857 if (status) {
1858 mlog(ML_ERROR, "Could not create o2dlm_lockres and "
1859 "o2dlm_lockname slabcaches\n");
1860 goto error;
1861 }
1862
1863 status = dlm_init_lock_cache();
1864 if (status) {
1865 mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n");
1866 goto error;
1867 }
1821 1868
1822 status = dlm_register_net_handlers(); 1869 status = dlm_register_net_handlers();
1823 if (status) { 1870 if (status) {
1824 dlm_destroy_mle_cache(); 1871 mlog(ML_ERROR, "Unable to register network handlers\n");
1825 return -1; 1872 goto error;
1826 } 1873 }
1827 1874
1875 status = dlm_create_debugfs_root();
1876 if (status)
1877 goto error;
1878
1828 return 0; 1879 return 0;
1880error:
1881 dlm_unregister_net_handlers();
1882 dlm_destroy_lock_cache();
1883 dlm_destroy_master_caches();
1884 dlm_destroy_mle_cache();
1885 return -1;
1829} 1886}
1830 1887
1831static void __exit dlm_exit (void) 1888static void __exit dlm_exit (void)
1832{ 1889{
1890 dlm_destroy_debugfs_root();
1833 dlm_unregister_net_handlers(); 1891 dlm_unregister_net_handlers();
1892 dlm_destroy_lock_cache();
1893 dlm_destroy_master_caches();
1834 dlm_destroy_mle_cache(); 1894 dlm_destroy_mle_cache();
1835} 1895}
1836 1896
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 52578d907d9a..83a9f2972ac8 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -53,6 +53,8 @@
53#define MLOG_MASK_PREFIX ML_DLM 53#define MLOG_MASK_PREFIX ML_DLM
54#include "cluster/masklog.h" 54#include "cluster/masklog.h"
55 55
56static struct kmem_cache *dlm_lock_cache = NULL;
57
56static DEFINE_SPINLOCK(dlm_cookie_lock); 58static DEFINE_SPINLOCK(dlm_cookie_lock);
57static u64 dlm_next_cookie = 1; 59static u64 dlm_next_cookie = 1;
58 60
@@ -64,6 +66,22 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type,
64static void dlm_lock_release(struct kref *kref); 66static void dlm_lock_release(struct kref *kref);
65static void dlm_lock_detach_lockres(struct dlm_lock *lock); 67static void dlm_lock_detach_lockres(struct dlm_lock *lock);
66 68
69int dlm_init_lock_cache(void)
70{
71 dlm_lock_cache = kmem_cache_create("o2dlm_lock",
72 sizeof(struct dlm_lock),
73 0, SLAB_HWCACHE_ALIGN, NULL);
74 if (dlm_lock_cache == NULL)
75 return -ENOMEM;
76 return 0;
77}
78
79void dlm_destroy_lock_cache(void)
80{
81 if (dlm_lock_cache)
82 kmem_cache_destroy(dlm_lock_cache);
83}
84
67/* Tell us whether we can grant a new lock request. 85/* Tell us whether we can grant a new lock request.
68 * locking: 86 * locking:
69 * caller needs: res->spinlock 87 * caller needs: res->spinlock
@@ -353,7 +371,7 @@ static void dlm_lock_release(struct kref *kref)
353 mlog(0, "freeing kernel-allocated lksb\n"); 371 mlog(0, "freeing kernel-allocated lksb\n");
354 kfree(lock->lksb); 372 kfree(lock->lksb);
355 } 373 }
356 kfree(lock); 374 kmem_cache_free(dlm_lock_cache, lock);
357} 375}
358 376
359/* associate a lock with it's lockres, getting a ref on the lockres */ 377/* associate a lock with it's lockres, getting a ref on the lockres */
@@ -412,7 +430,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
412 struct dlm_lock *lock; 430 struct dlm_lock *lock;
413 int kernel_allocated = 0; 431 int kernel_allocated = 0;
414 432
415 lock = kzalloc(sizeof(*lock), GFP_NOFS); 433 lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
416 if (!lock) 434 if (!lock)
417 return NULL; 435 return NULL;
418 436
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ea6b89577860..efc015c6128a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -48,47 +48,11 @@
48#include "dlmapi.h" 48#include "dlmapi.h"
49#include "dlmcommon.h" 49#include "dlmcommon.h"
50#include "dlmdomain.h" 50#include "dlmdomain.h"
51#include "dlmdebug.h"
51 52
52#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) 53#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
53#include "cluster/masklog.h" 54#include "cluster/masklog.h"
54 55
55enum dlm_mle_type {
56 DLM_MLE_BLOCK,
57 DLM_MLE_MASTER,
58 DLM_MLE_MIGRATION
59};
60
61struct dlm_lock_name
62{
63 u8 len;
64 u8 name[DLM_LOCKID_NAME_MAX];
65};
66
67struct dlm_master_list_entry
68{
69 struct list_head list;
70 struct list_head hb_events;
71 struct dlm_ctxt *dlm;
72 spinlock_t spinlock;
73 wait_queue_head_t wq;
74 atomic_t woken;
75 struct kref mle_refs;
76 int inuse;
77 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
78 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
79 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
80 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
81 u8 master;
82 u8 new_master;
83 enum dlm_mle_type type;
84 struct o2hb_callback_func mle_hb_up;
85 struct o2hb_callback_func mle_hb_down;
86 union {
87 struct dlm_lock_resource *res;
88 struct dlm_lock_name name;
89 } u;
90};
91
92static void dlm_mle_node_down(struct dlm_ctxt *dlm, 56static void dlm_mle_node_down(struct dlm_ctxt *dlm,
93 struct dlm_master_list_entry *mle, 57 struct dlm_master_list_entry *mle,
94 struct o2nm_node *node, 58 struct o2nm_node *node,
@@ -128,98 +92,10 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
128 return 1; 92 return 1;
129} 93}
130 94
131#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) 95static struct kmem_cache *dlm_lockres_cache = NULL;
132static void _dlm_print_nodemap(unsigned long *map, const char *mapname) 96static struct kmem_cache *dlm_lockname_cache = NULL;
133{
134 int i;
135 printk("%s=[ ", mapname);
136 for (i=0; i<O2NM_MAX_NODES; i++)
137 if (test_bit(i, map))
138 printk("%d ", i);
139 printk("]");
140}
141
142static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
143{
144 int refs;
145 char *type;
146 char attached;
147 u8 master;
148 unsigned int namelen;
149 const char *name;
150 struct kref *k;
151 unsigned long *maybe = mle->maybe_map,
152 *vote = mle->vote_map,
153 *resp = mle->response_map,
154 *node = mle->node_map;
155
156 k = &mle->mle_refs;
157 if (mle->type == DLM_MLE_BLOCK)
158 type = "BLK";
159 else if (mle->type == DLM_MLE_MASTER)
160 type = "MAS";
161 else
162 type = "MIG";
163 refs = atomic_read(&k->refcount);
164 master = mle->master;
165 attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
166
167 if (mle->type != DLM_MLE_MASTER) {
168 namelen = mle->u.name.len;
169 name = mle->u.name.name;
170 } else {
171 namelen = mle->u.res->lockname.len;
172 name = mle->u.res->lockname.name;
173 }
174
175 mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
176 namelen, name, type, refs, master, mle->new_master, attached,
177 mle->inuse);
178 dlm_print_nodemap(maybe);
179 printk(", ");
180 dlm_print_nodemap(vote);
181 printk(", ");
182 dlm_print_nodemap(resp);
183 printk(", ");
184 dlm_print_nodemap(node);
185 printk(", ");
186 printk("\n");
187}
188
189#if 0
190/* Code here is included but defined out as it aids debugging */
191
192static void dlm_dump_mles(struct dlm_ctxt *dlm)
193{
194 struct dlm_master_list_entry *mle;
195
196 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
197 spin_lock(&dlm->master_lock);
198 list_for_each_entry(mle, &dlm->master_list, list)
199 dlm_print_one_mle(mle);
200 spin_unlock(&dlm->master_lock);
201}
202
203int dlm_dump_all_mles(const char __user *data, unsigned int len)
204{
205 struct dlm_ctxt *dlm;
206
207 spin_lock(&dlm_domain_lock);
208 list_for_each_entry(dlm, &dlm_domains, list) {
209 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
210 dlm_dump_mles(dlm);
211 }
212 spin_unlock(&dlm_domain_lock);
213 return len;
214}
215EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
216
217#endif /* 0 */
218
219
220static struct kmem_cache *dlm_mle_cache = NULL; 97static struct kmem_cache *dlm_mle_cache = NULL;
221 98
222
223static void dlm_mle_release(struct kref *kref); 99static void dlm_mle_release(struct kref *kref);
224static void dlm_init_mle(struct dlm_master_list_entry *mle, 100static void dlm_init_mle(struct dlm_master_list_entry *mle,
225 enum dlm_mle_type type, 101 enum dlm_mle_type type,
@@ -507,7 +383,7 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm,
507 383
508int dlm_init_mle_cache(void) 384int dlm_init_mle_cache(void)
509{ 385{
510 dlm_mle_cache = kmem_cache_create("dlm_mle_cache", 386 dlm_mle_cache = kmem_cache_create("o2dlm_mle",
511 sizeof(struct dlm_master_list_entry), 387 sizeof(struct dlm_master_list_entry),
512 0, SLAB_HWCACHE_ALIGN, 388 0, SLAB_HWCACHE_ALIGN,
513 NULL); 389 NULL);
@@ -560,6 +436,35 @@ static void dlm_mle_release(struct kref *kref)
560 * LOCK RESOURCE FUNCTIONS 436 * LOCK RESOURCE FUNCTIONS
561 */ 437 */
562 438
439int dlm_init_master_caches(void)
440{
441 dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
442 sizeof(struct dlm_lock_resource),
443 0, SLAB_HWCACHE_ALIGN, NULL);
444 if (!dlm_lockres_cache)
445 goto bail;
446
447 dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
448 DLM_LOCKID_NAME_MAX, 0,
449 SLAB_HWCACHE_ALIGN, NULL);
450 if (!dlm_lockname_cache)
451 goto bail;
452
453 return 0;
454bail:
455 dlm_destroy_master_caches();
456 return -ENOMEM;
457}
458
459void dlm_destroy_master_caches(void)
460{
461 if (dlm_lockname_cache)
462 kmem_cache_destroy(dlm_lockname_cache);
463
464 if (dlm_lockres_cache)
465 kmem_cache_destroy(dlm_lockres_cache);
466}
467
563static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, 468static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
564 struct dlm_lock_resource *res, 469 struct dlm_lock_resource *res,
565 u8 owner) 470 u8 owner)
@@ -610,6 +515,14 @@ static void dlm_lockres_release(struct kref *kref)
610 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 515 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
611 res->lockname.name); 516 res->lockname.name);
612 517
518 if (!list_empty(&res->tracking))
519 list_del_init(&res->tracking);
520 else {
521 mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
522 res->lockname.len, res->lockname.name);
523 dlm_print_one_lock_resource(res);
524 }
525
613 if (!hlist_unhashed(&res->hash_node) || 526 if (!hlist_unhashed(&res->hash_node) ||
614 !list_empty(&res->granted) || 527 !list_empty(&res->granted) ||
615 !list_empty(&res->converting) || 528 !list_empty(&res->converting) ||
@@ -642,9 +555,9 @@ static void dlm_lockres_release(struct kref *kref)
642 BUG_ON(!list_empty(&res->recovering)); 555 BUG_ON(!list_empty(&res->recovering));
643 BUG_ON(!list_empty(&res->purge)); 556 BUG_ON(!list_empty(&res->purge));
644 557
645 kfree(res->lockname.name); 558 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
646 559
647 kfree(res); 560 kmem_cache_free(dlm_lockres_cache, res);
648} 561}
649 562
650void dlm_lockres_put(struct dlm_lock_resource *res) 563void dlm_lockres_put(struct dlm_lock_resource *res)
@@ -677,6 +590,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
677 INIT_LIST_HEAD(&res->dirty); 590 INIT_LIST_HEAD(&res->dirty);
678 INIT_LIST_HEAD(&res->recovering); 591 INIT_LIST_HEAD(&res->recovering);
679 INIT_LIST_HEAD(&res->purge); 592 INIT_LIST_HEAD(&res->purge);
593 INIT_LIST_HEAD(&res->tracking);
680 atomic_set(&res->asts_reserved, 0); 594 atomic_set(&res->asts_reserved, 0);
681 res->migration_pending = 0; 595 res->migration_pending = 0;
682 res->inflight_locks = 0; 596 res->inflight_locks = 0;
@@ -692,6 +606,8 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
692 606
693 res->last_used = 0; 607 res->last_used = 0;
694 608
609 list_add_tail(&res->tracking, &dlm->tracking_list);
610
695 memset(res->lvb, 0, DLM_LVB_LEN); 611 memset(res->lvb, 0, DLM_LVB_LEN);
696 memset(res->refmap, 0, sizeof(res->refmap)); 612 memset(res->refmap, 0, sizeof(res->refmap));
697} 613}
@@ -700,20 +616,28 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
700 const char *name, 616 const char *name,
701 unsigned int namelen) 617 unsigned int namelen)
702{ 618{
703 struct dlm_lock_resource *res; 619 struct dlm_lock_resource *res = NULL;
704 620
705 res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); 621 res = (struct dlm_lock_resource *)
622 kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
706 if (!res) 623 if (!res)
707 return NULL; 624 goto error;
708 625
709 res->lockname.name = kmalloc(namelen, GFP_NOFS); 626 res->lockname.name = (char *)
710 if (!res->lockname.name) { 627 kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
711 kfree(res); 628 if (!res->lockname.name)
712 return NULL; 629 goto error;
713 }
714 630
715 dlm_init_lockres(dlm, res, name, namelen); 631 dlm_init_lockres(dlm, res, name, namelen);
716 return res; 632 return res;
633
634error:
635 if (res && res->lockname.name)
636 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
637
638 if (res)
639 kmem_cache_free(dlm_lockres_cache, res);
640 return NULL;
717} 641}
718 642
719void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 643void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1f1873bf41fb..394d25a131a5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -27,18 +27,11 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/crc32.h>
31#include <linux/kthread.h> 30#include <linux/kthread.h>
32#include <linux/pagemap.h> 31#include <linux/pagemap.h>
33#include <linux/debugfs.h> 32#include <linux/debugfs.h>
34#include <linux/seq_file.h> 33#include <linux/seq_file.h>
35 34
36#include <cluster/heartbeat.h>
37#include <cluster/nodemanager.h>
38#include <cluster/tcp.h>
39
40#include <dlm/dlmapi.h>
41
42#define MLOG_MASK_PREFIX ML_DLM_GLUE 35#define MLOG_MASK_PREFIX ML_DLM_GLUE
43#include <cluster/masklog.h> 36#include <cluster/masklog.h>
44 37
@@ -53,6 +46,7 @@
53#include "heartbeat.h" 46#include "heartbeat.h"
54#include "inode.h" 47#include "inode.h"
55#include "journal.h" 48#include "journal.h"
49#include "stackglue.h"
56#include "slot_map.h" 50#include "slot_map.h"
57#include "super.h" 51#include "super.h"
58#include "uptodate.h" 52#include "uptodate.h"
@@ -113,7 +107,8 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
113 unsigned int line, 107 unsigned int line,
114 struct ocfs2_lock_res *lockres) 108 struct ocfs2_lock_res *lockres)
115{ 109{
116 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 110 struct ocfs2_meta_lvb *lvb =
111 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
117 112
118 mlog(level, "LVB information for %s (called from %s:%u):\n", 113 mlog(level, "LVB information for %s (called from %s:%u):\n",
119 lockres->l_name, function, line); 114 lockres->l_name, function, line);
@@ -259,31 +254,6 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
259 .flags = 0, 254 .flags = 0,
260}; 255};
261 256
262/*
263 * This is the filesystem locking protocol version.
264 *
265 * Whenever the filesystem does new things with locks (adds or removes a
266 * lock, orders them differently, does different things underneath a lock),
267 * the version must be changed. The protocol is negotiated when joining
268 * the dlm domain. A node may join the domain if its major version is
269 * identical to all other nodes and its minor version is greater than
270 * or equal to all other nodes. When its minor version is greater than
271 * the other nodes, it will run at the minor version specified by the
272 * other nodes.
273 *
274 * If a locking change is made that will not be compatible with older
275 * versions, the major number must be increased and the minor version set
276 * to zero. If a change merely adds a behavior that can be disabled when
277 * speaking to older versions, the minor version must be increased. If a
278 * change adds a fully backwards compatible change (eg, LVB changes that
279 * are just ignored by older versions), the version does not need to be
280 * updated.
281 */
282const struct dlm_protocol_version ocfs2_locking_protocol = {
283 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
284 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
285};
286
287static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 257static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
288{ 258{
289 return lockres->l_type == OCFS2_LOCK_TYPE_META || 259 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -316,7 +286,7 @@ static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *l
316static int ocfs2_lock_create(struct ocfs2_super *osb, 286static int ocfs2_lock_create(struct ocfs2_super *osb,
317 struct ocfs2_lock_res *lockres, 287 struct ocfs2_lock_res *lockres,
318 int level, 288 int level,
319 int dlm_flags); 289 u32 dlm_flags);
320static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, 290static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
321 int wanted); 291 int wanted);
322static void ocfs2_cluster_unlock(struct ocfs2_super *osb, 292static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
@@ -330,10 +300,9 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
330 struct ocfs2_lock_res *lockres); 300 struct ocfs2_lock_res *lockres);
331static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 301static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
332 int convert); 302 int convert);
333#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \ 303#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \
334 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ 304 mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
335 "resource %s: %s\n", dlm_errname(_stat), _func, \ 305 _err, _func, _lockres->l_name); \
336 _lockres->l_name, dlm_errmsg(_stat)); \
337} while (0) 306} while (0)
338static int ocfs2_downconvert_thread(void *arg); 307static int ocfs2_downconvert_thread(void *arg);
339static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, 308static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
@@ -342,12 +311,13 @@ static int ocfs2_inode_lock_update(struct inode *inode,
342 struct buffer_head **bh); 311 struct buffer_head **bh);
343static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); 312static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
344static inline int ocfs2_highest_compat_lock_level(int level); 313static inline int ocfs2_highest_compat_lock_level(int level);
345static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, 314static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
346 int new_level); 315 int new_level);
347static int ocfs2_downconvert_lock(struct ocfs2_super *osb, 316static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
348 struct ocfs2_lock_res *lockres, 317 struct ocfs2_lock_res *lockres,
349 int new_level, 318 int new_level,
350 int lvb); 319 int lvb,
320 unsigned int generation);
351static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, 321static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
352 struct ocfs2_lock_res *lockres); 322 struct ocfs2_lock_res *lockres);
353static int ocfs2_cancel_convert(struct ocfs2_super *osb, 323static int ocfs2_cancel_convert(struct ocfs2_super *osb,
@@ -406,9 +376,9 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
406 res->l_ops = ops; 376 res->l_ops = ops;
407 res->l_priv = priv; 377 res->l_priv = priv;
408 378
409 res->l_level = LKM_IVMODE; 379 res->l_level = DLM_LOCK_IV;
410 res->l_requested = LKM_IVMODE; 380 res->l_requested = DLM_LOCK_IV;
411 res->l_blocking = LKM_IVMODE; 381 res->l_blocking = DLM_LOCK_IV;
412 res->l_action = OCFS2_AST_INVALID; 382 res->l_action = OCFS2_AST_INVALID;
413 res->l_unlock_action = OCFS2_UNLOCK_INVALID; 383 res->l_unlock_action = OCFS2_UNLOCK_INVALID;
414 384
@@ -604,10 +574,10 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
604 BUG_ON(!lockres); 574 BUG_ON(!lockres);
605 575
606 switch(level) { 576 switch(level) {
607 case LKM_EXMODE: 577 case DLM_LOCK_EX:
608 lockres->l_ex_holders++; 578 lockres->l_ex_holders++;
609 break; 579 break;
610 case LKM_PRMODE: 580 case DLM_LOCK_PR:
611 lockres->l_ro_holders++; 581 lockres->l_ro_holders++;
612 break; 582 break;
613 default: 583 default:
@@ -625,11 +595,11 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
625 BUG_ON(!lockres); 595 BUG_ON(!lockres);
626 596
627 switch(level) { 597 switch(level) {
628 case LKM_EXMODE: 598 case DLM_LOCK_EX:
629 BUG_ON(!lockres->l_ex_holders); 599 BUG_ON(!lockres->l_ex_holders);
630 lockres->l_ex_holders--; 600 lockres->l_ex_holders--;
631 break; 601 break;
632 case LKM_PRMODE: 602 case DLM_LOCK_PR:
633 BUG_ON(!lockres->l_ro_holders); 603 BUG_ON(!lockres->l_ro_holders);
634 lockres->l_ro_holders--; 604 lockres->l_ro_holders--;
635 break; 605 break;
@@ -644,12 +614,12 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
644 * lock types are added. */ 614 * lock types are added. */
645static inline int ocfs2_highest_compat_lock_level(int level) 615static inline int ocfs2_highest_compat_lock_level(int level)
646{ 616{
647 int new_level = LKM_EXMODE; 617 int new_level = DLM_LOCK_EX;
648 618
649 if (level == LKM_EXMODE) 619 if (level == DLM_LOCK_EX)
650 new_level = LKM_NLMODE; 620 new_level = DLM_LOCK_NL;
651 else if (level == LKM_PRMODE) 621 else if (level == DLM_LOCK_PR)
652 new_level = LKM_PRMODE; 622 new_level = DLM_LOCK_PR;
653 return new_level; 623 return new_level;
654} 624}
655 625
@@ -688,12 +658,12 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
688 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); 658 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
689 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); 659 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
690 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); 660 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
691 BUG_ON(lockres->l_blocking <= LKM_NLMODE); 661 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
692 662
693 lockres->l_level = lockres->l_requested; 663 lockres->l_level = lockres->l_requested;
694 if (lockres->l_level <= 664 if (lockres->l_level <=
695 ocfs2_highest_compat_lock_level(lockres->l_blocking)) { 665 ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
696 lockres->l_blocking = LKM_NLMODE; 666 lockres->l_blocking = DLM_LOCK_NL;
697 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); 667 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
698 } 668 }
699 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 669 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
@@ -712,7 +682,7 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
712 * information is already up to data. Convert from NL to 682 * information is already up to data. Convert from NL to
713 * *anything* however should mark ourselves as needing an 683 * *anything* however should mark ourselves as needing an
714 * update */ 684 * update */
715 if (lockres->l_level == LKM_NLMODE && 685 if (lockres->l_level == DLM_LOCK_NL &&
716 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 686 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
717 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 687 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
718 688
@@ -729,7 +699,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
729 BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); 699 BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
730 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 700 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
731 701
732 if (lockres->l_requested > LKM_NLMODE && 702 if (lockres->l_requested > DLM_LOCK_NL &&
733 !(lockres->l_flags & OCFS2_LOCK_LOCAL) && 703 !(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
734 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 704 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
735 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 705 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
@@ -767,6 +737,113 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
767 return needs_downconvert; 737 return needs_downconvert;
768} 738}
769 739
740/*
741 * OCFS2_LOCK_PENDING and l_pending_gen.
742 *
743 * Why does OCFS2_LOCK_PENDING exist? To close a race between setting
744 * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock()
745 * for more details on the race.
746 *
747 * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces
748 * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock()
749 * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear
750 * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns,
751 * the caller is going to try to clear PENDING again. If nothing else is
752 * happening, __lockres_clear_pending() sees PENDING is unset and does
753 * nothing.
754 *
755 * But what if another path (eg downconvert thread) has just started a
756 * new locking action? The other path has re-set PENDING. Our path
757 * cannot clear PENDING, because that will re-open the original race
758 * window.
759 *
760 * [Example]
761 *
762 * ocfs2_meta_lock()
763 * ocfs2_cluster_lock()
764 * set BUSY
765 * set PENDING
766 * drop l_lock
767 * ocfs2_dlm_lock()
768 * ocfs2_locking_ast() ocfs2_downconvert_thread()
769 * clear PENDING ocfs2_unblock_lock()
770 * take_l_lock
771 * !BUSY
772 * ocfs2_prepare_downconvert()
773 * set BUSY
774 * set PENDING
775 * drop l_lock
776 * take l_lock
777 * clear PENDING
778 * drop l_lock
779 * <window>
780 * ocfs2_dlm_lock()
781 *
782 * So as you can see, we now have a window where l_lock is not held,
783 * PENDING is not set, and ocfs2_dlm_lock() has not been called.
784 *
785 * The core problem is that ocfs2_cluster_lock() has cleared the PENDING
786 * set by ocfs2_prepare_downconvert(). That wasn't nice.
787 *
788 * To solve this we introduce l_pending_gen. A call to
789 * lockres_clear_pending() will only do so when it is passed a generation
790 * number that matches the lockres. lockres_set_pending() will return the
791 * current generation number. When ocfs2_cluster_lock() goes to clear
792 * PENDING, it passes the generation it got from set_pending(). In our
793 * example above, the generation numbers will *not* match. Thus,
794 * ocfs2_cluster_lock() will not clear the PENDING set by
795 * ocfs2_prepare_downconvert().
796 */
797
798/* Unlocked version for ocfs2_locking_ast() */
799static void __lockres_clear_pending(struct ocfs2_lock_res *lockres,
800 unsigned int generation,
801 struct ocfs2_super *osb)
802{
803 assert_spin_locked(&lockres->l_lock);
804
805 /*
806 * The ast and locking functions can race us here. The winner
807 * will clear pending, the loser will not.
808 */
809 if (!(lockres->l_flags & OCFS2_LOCK_PENDING) ||
810 (lockres->l_pending_gen != generation))
811 return;
812
813 lockres_clear_flags(lockres, OCFS2_LOCK_PENDING);
814 lockres->l_pending_gen++;
815
816 /*
817 * The downconvert thread may have skipped us because we
818 * were PENDING. Wake it up.
819 */
820 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
821 ocfs2_wake_downconvert_thread(osb);
822}
823
824/* Locked version for callers of ocfs2_dlm_lock() */
825static void lockres_clear_pending(struct ocfs2_lock_res *lockres,
826 unsigned int generation,
827 struct ocfs2_super *osb)
828{
829 unsigned long flags;
830
831 spin_lock_irqsave(&lockres->l_lock, flags);
832 __lockres_clear_pending(lockres, generation, osb);
833 spin_unlock_irqrestore(&lockres->l_lock, flags);
834}
835
836static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
837{
838 assert_spin_locked(&lockres->l_lock);
839 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
840
841 lockres_or_flags(lockres, OCFS2_LOCK_PENDING);
842
843 return lockres->l_pending_gen;
844}
845
846
770static void ocfs2_blocking_ast(void *opaque, int level) 847static void ocfs2_blocking_ast(void *opaque, int level)
771{ 848{
772 struct ocfs2_lock_res *lockres = opaque; 849 struct ocfs2_lock_res *lockres = opaque;
@@ -774,7 +851,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
774 int needs_downconvert; 851 int needs_downconvert;
775 unsigned long flags; 852 unsigned long flags;
776 853
777 BUG_ON(level <= LKM_NLMODE); 854 BUG_ON(level <= DLM_LOCK_NL);
778 855
779 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", 856 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
780 lockres->l_name, level, lockres->l_level, 857 lockres->l_name, level, lockres->l_level,
@@ -801,14 +878,22 @@ static void ocfs2_blocking_ast(void *opaque, int level)
801static void ocfs2_locking_ast(void *opaque) 878static void ocfs2_locking_ast(void *opaque)
802{ 879{
803 struct ocfs2_lock_res *lockres = opaque; 880 struct ocfs2_lock_res *lockres = opaque;
804 struct dlm_lockstatus *lksb = &lockres->l_lksb; 881 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
805 unsigned long flags; 882 unsigned long flags;
883 int status;
806 884
807 spin_lock_irqsave(&lockres->l_lock, flags); 885 spin_lock_irqsave(&lockres->l_lock, flags);
808 886
809 if (lksb->status != DLM_NORMAL) { 887 status = ocfs2_dlm_lock_status(&lockres->l_lksb);
810 mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", 888
811 lockres->l_name, lksb->status); 889 if (status == -EAGAIN) {
890 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
891 goto out;
892 }
893
894 if (status) {
895 mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
896 lockres->l_name, status);
812 spin_unlock_irqrestore(&lockres->l_lock, flags); 897 spin_unlock_irqrestore(&lockres->l_lock, flags);
813 return; 898 return;
814 } 899 }
@@ -831,11 +916,23 @@ static void ocfs2_locking_ast(void *opaque)
831 lockres->l_unlock_action); 916 lockres->l_unlock_action);
832 BUG(); 917 BUG();
833 } 918 }
834 919out:
835 /* set it to something invalid so if we get called again we 920 /* set it to something invalid so if we get called again we
836 * can catch it. */ 921 * can catch it. */
837 lockres->l_action = OCFS2_AST_INVALID; 922 lockres->l_action = OCFS2_AST_INVALID;
838 923
924 /* Did we try to cancel this lock? Clear that state */
925 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT)
926 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
927
928 /*
929 * We may have beaten the locking functions here. We certainly
930 * know that dlm_lock() has been called :-)
931 * Because we can't have two lock calls in flight at once, we
932 * can use lockres->l_pending_gen.
933 */
934 __lockres_clear_pending(lockres, lockres->l_pending_gen, osb);
935
839 wake_up(&lockres->l_event); 936 wake_up(&lockres->l_event);
840 spin_unlock_irqrestore(&lockres->l_lock, flags); 937 spin_unlock_irqrestore(&lockres->l_lock, flags);
841} 938}
@@ -865,15 +962,15 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
865static int ocfs2_lock_create(struct ocfs2_super *osb, 962static int ocfs2_lock_create(struct ocfs2_super *osb,
866 struct ocfs2_lock_res *lockres, 963 struct ocfs2_lock_res *lockres,
867 int level, 964 int level,
868 int dlm_flags) 965 u32 dlm_flags)
869{ 966{
870 int ret = 0; 967 int ret = 0;
871 enum dlm_status status = DLM_NORMAL;
872 unsigned long flags; 968 unsigned long flags;
969 unsigned int gen;
873 970
874 mlog_entry_void(); 971 mlog_entry_void();
875 972
876 mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, 973 mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
877 dlm_flags); 974 dlm_flags);
878 975
879 spin_lock_irqsave(&lockres->l_lock, flags); 976 spin_lock_irqsave(&lockres->l_lock, flags);
@@ -886,24 +983,23 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
886 lockres->l_action = OCFS2_AST_ATTACH; 983 lockres->l_action = OCFS2_AST_ATTACH;
887 lockres->l_requested = level; 984 lockres->l_requested = level;
888 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 985 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
986 gen = lockres_set_pending(lockres);
889 spin_unlock_irqrestore(&lockres->l_lock, flags); 987 spin_unlock_irqrestore(&lockres->l_lock, flags);
890 988
891 status = dlmlock(osb->dlm, 989 ret = ocfs2_dlm_lock(osb->cconn,
892 level, 990 level,
893 &lockres->l_lksb, 991 &lockres->l_lksb,
894 dlm_flags, 992 dlm_flags,
895 lockres->l_name, 993 lockres->l_name,
896 OCFS2_LOCK_ID_MAX_LEN - 1, 994 OCFS2_LOCK_ID_MAX_LEN - 1,
897 ocfs2_locking_ast, 995 lockres);
898 lockres, 996 lockres_clear_pending(lockres, gen, osb);
899 ocfs2_blocking_ast); 997 if (ret) {
900 if (status != DLM_NORMAL) { 998 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
901 ocfs2_log_dlm_error("dlmlock", status, lockres);
902 ret = -EINVAL;
903 ocfs2_recover_from_dlm_error(lockres, 1); 999 ocfs2_recover_from_dlm_error(lockres, 1);
904 } 1000 }
905 1001
906 mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name); 1002 mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
907 1003
908bail: 1004bail:
909 mlog_exit(ret); 1005 mlog_exit(ret);
@@ -1016,21 +1112,22 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
1016static int ocfs2_cluster_lock(struct ocfs2_super *osb, 1112static int ocfs2_cluster_lock(struct ocfs2_super *osb,
1017 struct ocfs2_lock_res *lockres, 1113 struct ocfs2_lock_res *lockres,
1018 int level, 1114 int level,
1019 int lkm_flags, 1115 u32 lkm_flags,
1020 int arg_flags) 1116 int arg_flags)
1021{ 1117{
1022 struct ocfs2_mask_waiter mw; 1118 struct ocfs2_mask_waiter mw;
1023 enum dlm_status status;
1024 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); 1119 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
1025 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ 1120 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
1026 unsigned long flags; 1121 unsigned long flags;
1122 unsigned int gen;
1123 int noqueue_attempted = 0;
1027 1124
1028 mlog_entry_void(); 1125 mlog_entry_void();
1029 1126
1030 ocfs2_init_mask_waiter(&mw); 1127 ocfs2_init_mask_waiter(&mw);
1031 1128
1032 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 1129 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
1033 lkm_flags |= LKM_VALBLK; 1130 lkm_flags |= DLM_LKF_VALBLK;
1034 1131
1035again: 1132again:
1036 wait = 0; 1133 wait = 0;
@@ -1068,52 +1165,56 @@ again:
1068 } 1165 }
1069 1166
1070 if (level > lockres->l_level) { 1167 if (level > lockres->l_level) {
1168 if (noqueue_attempted > 0) {
1169 ret = -EAGAIN;
1170 goto unlock;
1171 }
1172 if (lkm_flags & DLM_LKF_NOQUEUE)
1173 noqueue_attempted = 1;
1174
1071 if (lockres->l_action != OCFS2_AST_INVALID) 1175 if (lockres->l_action != OCFS2_AST_INVALID)
1072 mlog(ML_ERROR, "lockres %s has action %u pending\n", 1176 mlog(ML_ERROR, "lockres %s has action %u pending\n",
1073 lockres->l_name, lockres->l_action); 1177 lockres->l_name, lockres->l_action);
1074 1178
1075 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { 1179 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
1076 lockres->l_action = OCFS2_AST_ATTACH; 1180 lockres->l_action = OCFS2_AST_ATTACH;
1077 lkm_flags &= ~LKM_CONVERT; 1181 lkm_flags &= ~DLM_LKF_CONVERT;
1078 } else { 1182 } else {
1079 lockres->l_action = OCFS2_AST_CONVERT; 1183 lockres->l_action = OCFS2_AST_CONVERT;
1080 lkm_flags |= LKM_CONVERT; 1184 lkm_flags |= DLM_LKF_CONVERT;
1081 } 1185 }
1082 1186
1083 lockres->l_requested = level; 1187 lockres->l_requested = level;
1084 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 1188 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1189 gen = lockres_set_pending(lockres);
1085 spin_unlock_irqrestore(&lockres->l_lock, flags); 1190 spin_unlock_irqrestore(&lockres->l_lock, flags);
1086 1191
1087 BUG_ON(level == LKM_IVMODE); 1192 BUG_ON(level == DLM_LOCK_IV);
1088 BUG_ON(level == LKM_NLMODE); 1193 BUG_ON(level == DLM_LOCK_NL);
1089 1194
1090 mlog(0, "lock %s, convert from %d to level = %d\n", 1195 mlog(0, "lock %s, convert from %d to level = %d\n",
1091 lockres->l_name, lockres->l_level, level); 1196 lockres->l_name, lockres->l_level, level);
1092 1197
1093 /* call dlm_lock to upgrade lock now */ 1198 /* call dlm_lock to upgrade lock now */
1094 status = dlmlock(osb->dlm, 1199 ret = ocfs2_dlm_lock(osb->cconn,
1095 level, 1200 level,
1096 &lockres->l_lksb, 1201 &lockres->l_lksb,
1097 lkm_flags, 1202 lkm_flags,
1098 lockres->l_name, 1203 lockres->l_name,
1099 OCFS2_LOCK_ID_MAX_LEN - 1, 1204 OCFS2_LOCK_ID_MAX_LEN - 1,
1100 ocfs2_locking_ast, 1205 lockres);
1101 lockres, 1206 lockres_clear_pending(lockres, gen, osb);
1102 ocfs2_blocking_ast); 1207 if (ret) {
1103 if (status != DLM_NORMAL) { 1208 if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
1104 if ((lkm_flags & LKM_NOQUEUE) && 1209 (ret != -EAGAIN)) {
1105 (status == DLM_NOTQUEUED)) 1210 ocfs2_log_dlm_error("ocfs2_dlm_lock",
1106 ret = -EAGAIN; 1211 ret, lockres);
1107 else {
1108 ocfs2_log_dlm_error("dlmlock", status,
1109 lockres);
1110 ret = -EINVAL;
1111 } 1212 }
1112 ocfs2_recover_from_dlm_error(lockres, 1); 1213 ocfs2_recover_from_dlm_error(lockres, 1);
1113 goto out; 1214 goto out;
1114 } 1215 }
1115 1216
1116 mlog(0, "lock %s, successfull return from dlmlock\n", 1217 mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
1117 lockres->l_name); 1218 lockres->l_name);
1118 1219
1119 /* At this point we've gone inside the dlm and need to 1220 /* At this point we've gone inside the dlm and need to
@@ -1177,9 +1278,9 @@ static int ocfs2_create_new_lock(struct ocfs2_super *osb,
1177 int ex, 1278 int ex,
1178 int local) 1279 int local)
1179{ 1280{
1180 int level = ex ? LKM_EXMODE : LKM_PRMODE; 1281 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
1181 unsigned long flags; 1282 unsigned long flags;
1182 int lkm_flags = local ? LKM_LOCAL : 0; 1283 u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;
1183 1284
1184 spin_lock_irqsave(&lockres->l_lock, flags); 1285 spin_lock_irqsave(&lockres->l_lock, flags);
1185 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 1286 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
@@ -1222,7 +1323,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1222 } 1323 }
1223 1324
1224 /* 1325 /*
1225 * We don't want to use LKM_LOCAL on a meta data lock as they 1326 * We don't want to use DLM_LKF_LOCAL on a meta data lock as they
1226 * don't use a generation in their lock names. 1327 * don't use a generation in their lock names.
1227 */ 1328 */
1228 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0); 1329 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
@@ -1261,7 +1362,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1261 1362
1262 lockres = &OCFS2_I(inode)->ip_rw_lockres; 1363 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1263 1364
1264 level = write ? LKM_EXMODE : LKM_PRMODE; 1365 level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
1265 1366
1266 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, 1367 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1267 0); 1368 0);
@@ -1274,7 +1375,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1274 1375
1275void ocfs2_rw_unlock(struct inode *inode, int write) 1376void ocfs2_rw_unlock(struct inode *inode, int write)
1276{ 1377{
1277 int level = write ? LKM_EXMODE : LKM_PRMODE; 1378 int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
1278 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; 1379 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1279 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1380 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1280 1381
@@ -1312,7 +1413,7 @@ int ocfs2_open_lock(struct inode *inode)
1312 lockres = &OCFS2_I(inode)->ip_open_lockres; 1413 lockres = &OCFS2_I(inode)->ip_open_lockres;
1313 1414
1314 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, 1415 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1315 LKM_PRMODE, 0, 0); 1416 DLM_LOCK_PR, 0, 0);
1316 if (status < 0) 1417 if (status < 0)
1317 mlog_errno(status); 1418 mlog_errno(status);
1318 1419
@@ -1340,16 +1441,16 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
1340 1441
1341 lockres = &OCFS2_I(inode)->ip_open_lockres; 1442 lockres = &OCFS2_I(inode)->ip_open_lockres;
1342 1443
1343 level = write ? LKM_EXMODE : LKM_PRMODE; 1444 level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
1344 1445
1345 /* 1446 /*
1346 * The file system may already holding a PRMODE/EXMODE open lock. 1447 * The file system may already holding a PRMODE/EXMODE open lock.
1347 * Since we pass LKM_NOQUEUE, the request won't block waiting on 1448 * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
1348 * other nodes and the -EAGAIN will indicate to the caller that 1449 * other nodes and the -EAGAIN will indicate to the caller that
1349 * this inode is still in use. 1450 * this inode is still in use.
1350 */ 1451 */
1351 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, 1452 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1352 level, LKM_NOQUEUE, 0); 1453 level, DLM_LKF_NOQUEUE, 0);
1353 1454
1354out: 1455out:
1355 mlog_exit(status); 1456 mlog_exit(status);
@@ -1374,10 +1475,10 @@ void ocfs2_open_unlock(struct inode *inode)
1374 1475
1375 if(lockres->l_ro_holders) 1476 if(lockres->l_ro_holders)
1376 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, 1477 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1377 LKM_PRMODE); 1478 DLM_LOCK_PR);
1378 if(lockres->l_ex_holders) 1479 if(lockres->l_ex_holders)
1379 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, 1480 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1380 LKM_EXMODE); 1481 DLM_LOCK_EX);
1381 1482
1382out: 1483out:
1383 mlog_exit_void(); 1484 mlog_exit_void();
@@ -1464,7 +1565,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1464 ocfs2_init_mask_waiter(&mw); 1565 ocfs2_init_mask_waiter(&mw);
1465 1566
1466 if ((lockres->l_flags & OCFS2_LOCK_BUSY) || 1567 if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
1467 (lockres->l_level > LKM_NLMODE)) { 1568 (lockres->l_level > DLM_LOCK_NL)) {
1468 mlog(ML_ERROR, 1569 mlog(ML_ERROR,
1469 "File lock \"%s\" has busy or locked state: flags: 0x%lx, " 1570 "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
1470 "level: %u\n", lockres->l_name, lockres->l_flags, 1571 "level: %u\n", lockres->l_name, lockres->l_flags,
@@ -1503,14 +1604,12 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1503 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); 1604 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1504 spin_unlock_irqrestore(&lockres->l_lock, flags); 1605 spin_unlock_irqrestore(&lockres->l_lock, flags);
1505 1606
1506 ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags, 1607 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
1507 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, 1608 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
1508 ocfs2_locking_ast, lockres, ocfs2_blocking_ast); 1609 lockres);
1509 if (ret != DLM_NORMAL) { 1610 if (ret) {
1510 if (trylock && ret == DLM_NOTQUEUED) 1611 if (!trylock || (ret != -EAGAIN)) {
1511 ret = -EAGAIN; 1612 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
1512 else {
1513 ocfs2_log_dlm_error("dlmlock", ret, lockres);
1514 ret = -EINVAL; 1613 ret = -EINVAL;
1515 } 1614 }
1516 1615
@@ -1537,6 +1636,10 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1537 * to just bubble sucess back up to the user. 1636 * to just bubble sucess back up to the user.
1538 */ 1637 */
1539 ret = ocfs2_flock_handle_signal(lockres, level); 1638 ret = ocfs2_flock_handle_signal(lockres, level);
1639 } else if (!ret && (level > lockres->l_level)) {
1640 /* Trylock failed asynchronously */
1641 BUG_ON(!trylock);
1642 ret = -EAGAIN;
1540 } 1643 }
1541 1644
1542out: 1645out:
@@ -1549,6 +1652,7 @@ out:
1549void ocfs2_file_unlock(struct file *file) 1652void ocfs2_file_unlock(struct file *file)
1550{ 1653{
1551 int ret; 1654 int ret;
1655 unsigned int gen;
1552 unsigned long flags; 1656 unsigned long flags;
1553 struct ocfs2_file_private *fp = file->private_data; 1657 struct ocfs2_file_private *fp = file->private_data;
1554 struct ocfs2_lock_res *lockres = &fp->fp_flock; 1658 struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1572,13 +1676,13 @@ void ocfs2_file_unlock(struct file *file)
1572 * Fake a blocking ast for the downconvert code. 1676 * Fake a blocking ast for the downconvert code.
1573 */ 1677 */
1574 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 1678 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
1575 lockres->l_blocking = LKM_EXMODE; 1679 lockres->l_blocking = DLM_LOCK_EX;
1576 1680
1577 ocfs2_prepare_downconvert(lockres, LKM_NLMODE); 1681 gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
1578 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); 1682 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1579 spin_unlock_irqrestore(&lockres->l_lock, flags); 1683 spin_unlock_irqrestore(&lockres->l_lock, flags);
1580 1684
1581 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0); 1685 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen);
1582 if (ret) { 1686 if (ret) {
1583 mlog_errno(ret); 1687 mlog_errno(ret);
1584 return; 1688 return;
@@ -1601,11 +1705,11 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
1601 * condition. */ 1705 * condition. */
1602 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { 1706 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1603 switch(lockres->l_blocking) { 1707 switch(lockres->l_blocking) {
1604 case LKM_EXMODE: 1708 case DLM_LOCK_EX:
1605 if (!lockres->l_ex_holders && !lockres->l_ro_holders) 1709 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
1606 kick = 1; 1710 kick = 1;
1607 break; 1711 break;
1608 case LKM_PRMODE: 1712 case DLM_LOCK_PR:
1609 if (!lockres->l_ex_holders) 1713 if (!lockres->l_ex_holders)
1610 kick = 1; 1714 kick = 1;
1611 break; 1715 break;
@@ -1648,7 +1752,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1648 1752
1649 mlog_entry_void(); 1753 mlog_entry_void();
1650 1754
1651 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 1755 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1652 1756
1653 /* 1757 /*
1654 * Invalidate the LVB of a deleted inode - this way other 1758 * Invalidate the LVB of a deleted inode - this way other
@@ -1700,7 +1804,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1700 1804
1701 mlog_meta_lvb(0, lockres); 1805 mlog_meta_lvb(0, lockres);
1702 1806
1703 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 1807 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1704 1808
1705 /* We're safe here without the lockres lock... */ 1809 /* We're safe here without the lockres lock... */
1706 spin_lock(&oi->ip_lock); 1810 spin_lock(&oi->ip_lock);
@@ -1735,7 +1839,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1735static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, 1839static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
1736 struct ocfs2_lock_res *lockres) 1840 struct ocfs2_lock_res *lockres)
1737{ 1841{
1738 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 1842 struct ocfs2_meta_lvb *lvb =
1843 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1739 1844
1740 if (lvb->lvb_version == OCFS2_LVB_VERSION 1845 if (lvb->lvb_version == OCFS2_LVB_VERSION
1741 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) 1846 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -1923,7 +2028,8 @@ int ocfs2_inode_lock_full(struct inode *inode,
1923 int ex, 2028 int ex,
1924 int arg_flags) 2029 int arg_flags)
1925{ 2030{
1926 int status, level, dlm_flags, acquired; 2031 int status, level, acquired;
2032 u32 dlm_flags;
1927 struct ocfs2_lock_res *lockres = NULL; 2033 struct ocfs2_lock_res *lockres = NULL;
1928 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2034 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1929 struct buffer_head *local_bh = NULL; 2035 struct buffer_head *local_bh = NULL;
@@ -1950,14 +2056,13 @@ int ocfs2_inode_lock_full(struct inode *inode,
1950 goto local; 2056 goto local;
1951 2057
1952 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) 2058 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1953 wait_event(osb->recovery_event, 2059 ocfs2_wait_for_recovery(osb);
1954 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1955 2060
1956 lockres = &OCFS2_I(inode)->ip_inode_lockres; 2061 lockres = &OCFS2_I(inode)->ip_inode_lockres;
1957 level = ex ? LKM_EXMODE : LKM_PRMODE; 2062 level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
1958 dlm_flags = 0; 2063 dlm_flags = 0;
1959 if (arg_flags & OCFS2_META_LOCK_NOQUEUE) 2064 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
1960 dlm_flags |= LKM_NOQUEUE; 2065 dlm_flags |= DLM_LKF_NOQUEUE;
1961 2066
1962 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); 2067 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
1963 if (status < 0) { 2068 if (status < 0) {
@@ -1974,8 +2079,7 @@ int ocfs2_inode_lock_full(struct inode *inode,
1974 * committed to owning this lock so we don't allow signals to 2079 * committed to owning this lock so we don't allow signals to
1975 * abort the operation. */ 2080 * abort the operation. */
1976 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) 2081 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1977 wait_event(osb->recovery_event, 2082 ocfs2_wait_for_recovery(osb);
1978 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1979 2083
1980local: 2084local:
1981 /* 2085 /*
@@ -2109,7 +2213,7 @@ int ocfs2_inode_lock_atime(struct inode *inode,
2109void ocfs2_inode_unlock(struct inode *inode, 2213void ocfs2_inode_unlock(struct inode *inode,
2110 int ex) 2214 int ex)
2111{ 2215{
2112 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2216 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2113 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; 2217 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
2114 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2218 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2115 2219
@@ -2130,10 +2234,8 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2130 int ex) 2234 int ex)
2131{ 2235{
2132 int status = 0; 2236 int status = 0;
2133 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2237 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2134 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; 2238 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
2135 struct buffer_head *bh;
2136 struct ocfs2_slot_info *si = osb->slot_info;
2137 2239
2138 mlog_entry_void(); 2240 mlog_entry_void();
2139 2241
@@ -2159,11 +2261,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2159 goto bail; 2261 goto bail;
2160 } 2262 }
2161 if (status) { 2263 if (status) {
2162 bh = si->si_bh; 2264 status = ocfs2_refresh_slot_info(osb);
2163 status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
2164 si->si_inode);
2165 if (status == 0)
2166 ocfs2_update_slot_info(si);
2167 2265
2168 ocfs2_complete_lock_res_refresh(lockres, status); 2266 ocfs2_complete_lock_res_refresh(lockres, status);
2169 2267
@@ -2178,7 +2276,7 @@ bail:
2178void ocfs2_super_unlock(struct ocfs2_super *osb, 2276void ocfs2_super_unlock(struct ocfs2_super *osb,
2179 int ex) 2277 int ex)
2180{ 2278{
2181 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2279 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2182 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; 2280 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
2183 2281
2184 if (!ocfs2_mount_local(osb)) 2282 if (!ocfs2_mount_local(osb))
@@ -2196,7 +2294,7 @@ int ocfs2_rename_lock(struct ocfs2_super *osb)
2196 if (ocfs2_mount_local(osb)) 2294 if (ocfs2_mount_local(osb))
2197 return 0; 2295 return 0;
2198 2296
2199 status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); 2297 status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
2200 if (status < 0) 2298 if (status < 0)
2201 mlog_errno(status); 2299 mlog_errno(status);
2202 2300
@@ -2208,13 +2306,13 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
2208 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; 2306 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
2209 2307
2210 if (!ocfs2_mount_local(osb)) 2308 if (!ocfs2_mount_local(osb))
2211 ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); 2309 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
2212} 2310}
2213 2311
2214int ocfs2_dentry_lock(struct dentry *dentry, int ex) 2312int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2215{ 2313{
2216 int ret; 2314 int ret;
2217 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2315 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2218 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 2316 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2219 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 2317 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2220 2318
@@ -2235,7 +2333,7 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2235 2333
2236void ocfs2_dentry_unlock(struct dentry *dentry, int ex) 2334void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
2237{ 2335{
2238 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2336 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2239 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 2337 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2240 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 2338 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2241 2339
@@ -2400,7 +2498,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2400 lockres->l_blocking); 2498 lockres->l_blocking);
2401 2499
2402 /* Dump the raw LVB */ 2500 /* Dump the raw LVB */
2403 lvb = lockres->l_lksb.lvb; 2501 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
2404 for(i = 0; i < DLM_LVB_LEN; i++) 2502 for(i = 0; i < DLM_LVB_LEN; i++)
2405 seq_printf(m, "0x%x\t", lvb[i]); 2503 seq_printf(m, "0x%x\t", lvb[i]);
2406 2504
@@ -2504,13 +2602,14 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
2504int ocfs2_dlm_init(struct ocfs2_super *osb) 2602int ocfs2_dlm_init(struct ocfs2_super *osb)
2505{ 2603{
2506 int status = 0; 2604 int status = 0;
2507 u32 dlm_key; 2605 struct ocfs2_cluster_connection *conn = NULL;
2508 struct dlm_ctxt *dlm = NULL;
2509 2606
2510 mlog_entry_void(); 2607 mlog_entry_void();
2511 2608
2512 if (ocfs2_mount_local(osb)) 2609 if (ocfs2_mount_local(osb)) {
2610 osb->node_num = 0;
2513 goto local; 2611 goto local;
2612 }
2514 2613
2515 status = ocfs2_dlm_init_debug(osb); 2614 status = ocfs2_dlm_init_debug(osb);
2516 if (status < 0) { 2615 if (status < 0) {
@@ -2527,26 +2626,31 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2527 goto bail; 2626 goto bail;
2528 } 2627 }
2529 2628
2530 /* used by the dlm code to make message headers unique, each
2531 * node in this domain must agree on this. */
2532 dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
2533
2534 /* for now, uuid == domain */ 2629 /* for now, uuid == domain */
2535 dlm = dlm_register_domain(osb->uuid_str, dlm_key, 2630 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2536 &osb->osb_locking_proto); 2631 osb->uuid_str,
2537 if (IS_ERR(dlm)) { 2632 strlen(osb->uuid_str),
2538 status = PTR_ERR(dlm); 2633 ocfs2_do_node_down, osb,
2634 &conn);
2635 if (status) {
2539 mlog_errno(status); 2636 mlog_errno(status);
2540 goto bail; 2637 goto bail;
2541 } 2638 }
2542 2639
2543 dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); 2640 status = ocfs2_cluster_this_node(&osb->node_num);
2641 if (status < 0) {
2642 mlog_errno(status);
2643 mlog(ML_ERROR,
2644 "could not find this host's node number\n");
2645 ocfs2_cluster_disconnect(conn, 0);
2646 goto bail;
2647 }
2544 2648
2545local: 2649local:
2546 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); 2650 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2547 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); 2651 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2548 2652
2549 osb->dlm = dlm; 2653 osb->cconn = conn;
2550 2654
2551 status = 0; 2655 status = 0;
2552bail: 2656bail:
@@ -2560,14 +2664,19 @@ bail:
2560 return status; 2664 return status;
2561} 2665}
2562 2666
2563void ocfs2_dlm_shutdown(struct ocfs2_super *osb) 2667void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
2668 int hangup_pending)
2564{ 2669{
2565 mlog_entry_void(); 2670 mlog_entry_void();
2566 2671
2567 dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
2568
2569 ocfs2_drop_osb_locks(osb); 2672 ocfs2_drop_osb_locks(osb);
2570 2673
2674 /*
2675 * Now that we have dropped all locks and ocfs2_dismount_volume()
2676 * has disabled recovery, the DLM won't be talking to us. It's
2677 * safe to tear things down before disconnecting the cluster.
2678 */
2679
2571 if (osb->dc_task) { 2680 if (osb->dc_task) {
2572 kthread_stop(osb->dc_task); 2681 kthread_stop(osb->dc_task);
2573 osb->dc_task = NULL; 2682 osb->dc_task = NULL;
@@ -2576,15 +2685,15 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2576 ocfs2_lock_res_free(&osb->osb_super_lockres); 2685 ocfs2_lock_res_free(&osb->osb_super_lockres);
2577 ocfs2_lock_res_free(&osb->osb_rename_lockres); 2686 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2578 2687
2579 dlm_unregister_domain(osb->dlm); 2688 ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
2580 osb->dlm = NULL; 2689 osb->cconn = NULL;
2581 2690
2582 ocfs2_dlm_shutdown_debug(osb); 2691 ocfs2_dlm_shutdown_debug(osb);
2583 2692
2584 mlog_exit_void(); 2693 mlog_exit_void();
2585} 2694}
2586 2695
2587static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) 2696static void ocfs2_unlock_ast(void *opaque, int error)
2588{ 2697{
2589 struct ocfs2_lock_res *lockres = opaque; 2698 struct ocfs2_lock_res *lockres = opaque;
2590 unsigned long flags; 2699 unsigned long flags;
@@ -2595,24 +2704,9 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
2595 lockres->l_unlock_action); 2704 lockres->l_unlock_action);
2596 2705
2597 spin_lock_irqsave(&lockres->l_lock, flags); 2706 spin_lock_irqsave(&lockres->l_lock, flags);
2598 /* We tried to cancel a convert request, but it was already 2707 if (error) {
2599 * granted. All we want to do here is clear our unlock 2708 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
2600 * state. The wake_up call done at the bottom is redundant 2709 "unlock_action %d\n", error, lockres->l_name,
2601 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
2602 * hurt anything anyway */
2603 if (status == DLM_CANCELGRANT &&
2604 lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2605 mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
2606
2607 /* We don't clear the busy flag in this case as it
2608 * should have been cleared by the ast which the dlm
2609 * has called. */
2610 goto complete_unlock;
2611 }
2612
2613 if (status != DLM_NORMAL) {
2614 mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
2615 "unlock_action %d\n", status, lockres->l_name,
2616 lockres->l_unlock_action); 2710 lockres->l_unlock_action);
2617 spin_unlock_irqrestore(&lockres->l_lock, flags); 2711 spin_unlock_irqrestore(&lockres->l_lock, flags);
2618 return; 2712 return;
@@ -2624,14 +2718,13 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
2624 lockres->l_action = OCFS2_AST_INVALID; 2718 lockres->l_action = OCFS2_AST_INVALID;
2625 break; 2719 break;
2626 case OCFS2_UNLOCK_DROP_LOCK: 2720 case OCFS2_UNLOCK_DROP_LOCK:
2627 lockres->l_level = LKM_IVMODE; 2721 lockres->l_level = DLM_LOCK_IV;
2628 break; 2722 break;
2629 default: 2723 default:
2630 BUG(); 2724 BUG();
2631 } 2725 }
2632 2726
2633 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 2727 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
2634complete_unlock:
2635 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; 2728 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
2636 spin_unlock_irqrestore(&lockres->l_lock, flags); 2729 spin_unlock_irqrestore(&lockres->l_lock, flags);
2637 2730
@@ -2643,16 +2736,16 @@ complete_unlock:
2643static int ocfs2_drop_lock(struct ocfs2_super *osb, 2736static int ocfs2_drop_lock(struct ocfs2_super *osb,
2644 struct ocfs2_lock_res *lockres) 2737 struct ocfs2_lock_res *lockres)
2645{ 2738{
2646 enum dlm_status status; 2739 int ret;
2647 unsigned long flags; 2740 unsigned long flags;
2648 int lkm_flags = 0; 2741 u32 lkm_flags = 0;
2649 2742
2650 /* We didn't get anywhere near actually using this lockres. */ 2743 /* We didn't get anywhere near actually using this lockres. */
2651 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) 2744 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
2652 goto out; 2745 goto out;
2653 2746
2654 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 2747 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
2655 lkm_flags |= LKM_VALBLK; 2748 lkm_flags |= DLM_LKF_VALBLK;
2656 2749
2657 spin_lock_irqsave(&lockres->l_lock, flags); 2750 spin_lock_irqsave(&lockres->l_lock, flags);
2658 2751
@@ -2678,7 +2771,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2678 2771
2679 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { 2772 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
2680 if (lockres->l_flags & OCFS2_LOCK_ATTACHED && 2773 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2681 lockres->l_level == LKM_EXMODE && 2774 lockres->l_level == DLM_LOCK_EX &&
2682 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) 2775 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2683 lockres->l_ops->set_lvb(lockres); 2776 lockres->l_ops->set_lvb(lockres);
2684 } 2777 }
@@ -2707,15 +2800,15 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2707 2800
2708 mlog(0, "lock %s\n", lockres->l_name); 2801 mlog(0, "lock %s\n", lockres->l_name);
2709 2802
2710 status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags, 2803 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags,
2711 ocfs2_unlock_ast, lockres); 2804 lockres);
2712 if (status != DLM_NORMAL) { 2805 if (ret) {
2713 ocfs2_log_dlm_error("dlmunlock", status, lockres); 2806 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
2714 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); 2807 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
2715 dlm_print_one_lock(lockres->l_lksb.lockid); 2808 ocfs2_dlm_dump_lksb(&lockres->l_lksb);
2716 BUG(); 2809 BUG();
2717 } 2810 }
2718 mlog(0, "lock %s, successfull return from dlmunlock\n", 2811 mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
2719 lockres->l_name); 2812 lockres->l_name);
2720 2813
2721 ocfs2_wait_on_busy_lock(lockres); 2814 ocfs2_wait_on_busy_lock(lockres);
@@ -2806,15 +2899,15 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2806 return status; 2899 return status;
2807} 2900}
2808 2901
2809static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, 2902static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2810 int new_level) 2903 int new_level)
2811{ 2904{
2812 assert_spin_locked(&lockres->l_lock); 2905 assert_spin_locked(&lockres->l_lock);
2813 2906
2814 BUG_ON(lockres->l_blocking <= LKM_NLMODE); 2907 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
2815 2908
2816 if (lockres->l_level <= new_level) { 2909 if (lockres->l_level <= new_level) {
2817 mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n", 2910 mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n",
2818 lockres->l_level, new_level); 2911 lockres->l_level, new_level);
2819 BUG(); 2912 BUG();
2820 } 2913 }
@@ -2825,33 +2918,33 @@ static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2825 lockres->l_action = OCFS2_AST_DOWNCONVERT; 2918 lockres->l_action = OCFS2_AST_DOWNCONVERT;
2826 lockres->l_requested = new_level; 2919 lockres->l_requested = new_level;
2827 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 2920 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2921 return lockres_set_pending(lockres);
2828} 2922}
2829 2923
2830static int ocfs2_downconvert_lock(struct ocfs2_super *osb, 2924static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2831 struct ocfs2_lock_res *lockres, 2925 struct ocfs2_lock_res *lockres,
2832 int new_level, 2926 int new_level,
2833 int lvb) 2927 int lvb,
2928 unsigned int generation)
2834{ 2929{
2835 int ret, dlm_flags = LKM_CONVERT; 2930 int ret;
2836 enum dlm_status status; 2931 u32 dlm_flags = DLM_LKF_CONVERT;
2837 2932
2838 mlog_entry_void(); 2933 mlog_entry_void();
2839 2934
2840 if (lvb) 2935 if (lvb)
2841 dlm_flags |= LKM_VALBLK; 2936 dlm_flags |= DLM_LKF_VALBLK;
2842 2937
2843 status = dlmlock(osb->dlm, 2938 ret = ocfs2_dlm_lock(osb->cconn,
2844 new_level, 2939 new_level,
2845 &lockres->l_lksb, 2940 &lockres->l_lksb,
2846 dlm_flags, 2941 dlm_flags,
2847 lockres->l_name, 2942 lockres->l_name,
2848 OCFS2_LOCK_ID_MAX_LEN - 1, 2943 OCFS2_LOCK_ID_MAX_LEN - 1,
2849 ocfs2_locking_ast, 2944 lockres);
2850 lockres, 2945 lockres_clear_pending(lockres, generation, osb);
2851 ocfs2_blocking_ast); 2946 if (ret) {
2852 if (status != DLM_NORMAL) { 2947 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
2853 ocfs2_log_dlm_error("dlmlock", status, lockres);
2854 ret = -EINVAL;
2855 ocfs2_recover_from_dlm_error(lockres, 1); 2948 ocfs2_recover_from_dlm_error(lockres, 1);
2856 goto bail; 2949 goto bail;
2857 } 2950 }
@@ -2862,7 +2955,7 @@ bail:
2862 return ret; 2955 return ret;
2863} 2956}
2864 2957
2865/* returns 1 when the caller should unlock and call dlmunlock */ 2958/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */
2866static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, 2959static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
2867 struct ocfs2_lock_res *lockres) 2960 struct ocfs2_lock_res *lockres)
2868{ 2961{
@@ -2898,24 +2991,18 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2898 struct ocfs2_lock_res *lockres) 2991 struct ocfs2_lock_res *lockres)
2899{ 2992{
2900 int ret; 2993 int ret;
2901 enum dlm_status status;
2902 2994
2903 mlog_entry_void(); 2995 mlog_entry_void();
2904 mlog(0, "lock %s\n", lockres->l_name); 2996 mlog(0, "lock %s\n", lockres->l_name);
2905 2997
2906 ret = 0; 2998 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
2907 status = dlmunlock(osb->dlm, 2999 DLM_LKF_CANCEL, lockres);
2908 &lockres->l_lksb, 3000 if (ret) {
2909 LKM_CANCEL, 3001 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
2910 ocfs2_unlock_ast,
2911 lockres);
2912 if (status != DLM_NORMAL) {
2913 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2914 ret = -EINVAL;
2915 ocfs2_recover_from_dlm_error(lockres, 0); 3002 ocfs2_recover_from_dlm_error(lockres, 0);
2916 } 3003 }
2917 3004
2918 mlog(0, "lock %s return from dlmunlock\n", lockres->l_name); 3005 mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name);
2919 3006
2920 mlog_exit(ret); 3007 mlog_exit(ret);
2921 return ret; 3008 return ret;
@@ -2930,6 +3017,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
2930 int new_level; 3017 int new_level;
2931 int ret = 0; 3018 int ret = 0;
2932 int set_lvb = 0; 3019 int set_lvb = 0;
3020 unsigned int gen;
2933 3021
2934 mlog_entry_void(); 3022 mlog_entry_void();
2935 3023
@@ -2939,6 +3027,32 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
2939 3027
2940recheck: 3028recheck:
2941 if (lockres->l_flags & OCFS2_LOCK_BUSY) { 3029 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
3030 /* XXX
3031 * This is a *big* race. The OCFS2_LOCK_PENDING flag
3032 * exists entirely for one reason - another thread has set
3033 * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock().
3034 *
3035 * If we do ocfs2_cancel_convert() before the other thread
3036 * calls dlm_lock(), our cancel will do nothing. We will
3037 * get no ast, and we will have no way of knowing the
3038 * cancel failed. Meanwhile, the other thread will call
3039 * into dlm_lock() and wait...forever.
3040 *
3041 * Why forever? Because another node has asked for the
3042 * lock first; that's why we're here in unblock_lock().
3043 *
3044 * The solution is OCFS2_LOCK_PENDING. When PENDING is
3045 * set, we just requeue the unblock. Only when the other
3046 * thread has called dlm_lock() and cleared PENDING will
3047 * we then cancel their request.
3048 *
3049 * All callers of dlm_lock() must set OCFS2_DLM_PENDING
3050 * at the same time they set OCFS2_DLM_BUSY. They must
3051 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
3052 */
3053 if (lockres->l_flags & OCFS2_LOCK_PENDING)
3054 goto leave_requeue;
3055
2942 ctl->requeue = 1; 3056 ctl->requeue = 1;
2943 ret = ocfs2_prepare_cancel_convert(osb, lockres); 3057 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2944 spin_unlock_irqrestore(&lockres->l_lock, flags); 3058 spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -2952,13 +3066,13 @@ recheck:
2952 3066
2953 /* if we're blocking an exclusive and we have *any* holders, 3067 /* if we're blocking an exclusive and we have *any* holders,
2954 * then requeue. */ 3068 * then requeue. */
2955 if ((lockres->l_blocking == LKM_EXMODE) 3069 if ((lockres->l_blocking == DLM_LOCK_EX)
2956 && (lockres->l_ex_holders || lockres->l_ro_holders)) 3070 && (lockres->l_ex_holders || lockres->l_ro_holders))
2957 goto leave_requeue; 3071 goto leave_requeue;
2958 3072
2959 /* If it's a PR we're blocking, then only 3073 /* If it's a PR we're blocking, then only
2960 * requeue if we've got any EX holders */ 3074 * requeue if we've got any EX holders */
2961 if (lockres->l_blocking == LKM_PRMODE && 3075 if (lockres->l_blocking == DLM_LOCK_PR &&
2962 lockres->l_ex_holders) 3076 lockres->l_ex_holders)
2963 goto leave_requeue; 3077 goto leave_requeue;
2964 3078
@@ -3005,7 +3119,7 @@ downconvert:
3005 ctl->requeue = 0; 3119 ctl->requeue = 0;
3006 3120
3007 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { 3121 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
3008 if (lockres->l_level == LKM_EXMODE) 3122 if (lockres->l_level == DLM_LOCK_EX)
3009 set_lvb = 1; 3123 set_lvb = 1;
3010 3124
3011 /* 3125 /*
@@ -3018,9 +3132,11 @@ downconvert:
3018 lockres->l_ops->set_lvb(lockres); 3132 lockres->l_ops->set_lvb(lockres);
3019 } 3133 }
3020 3134
3021 ocfs2_prepare_downconvert(lockres, new_level); 3135 gen = ocfs2_prepare_downconvert(lockres, new_level);
3022 spin_unlock_irqrestore(&lockres->l_lock, flags); 3136 spin_unlock_irqrestore(&lockres->l_lock, flags);
3023 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb); 3137 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
3138 gen);
3139
3024leave: 3140leave:
3025 mlog_exit(ret); 3141 mlog_exit(ret);
3026 return ret; 3142 return ret;
@@ -3059,7 +3175,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
3059 (unsigned long long)OCFS2_I(inode)->ip_blkno); 3175 (unsigned long long)OCFS2_I(inode)->ip_blkno);
3060 } 3176 }
3061 sync_mapping_buffers(mapping); 3177 sync_mapping_buffers(mapping);
3062 if (blocking == LKM_EXMODE) { 3178 if (blocking == DLM_LOCK_EX) {
3063 truncate_inode_pages(mapping, 0); 3179 truncate_inode_pages(mapping, 0);
3064 } else { 3180 } else {
3065 /* We only need to wait on the I/O if we're not also 3181 /* We only need to wait on the I/O if we're not also
@@ -3080,8 +3196,8 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
3080 struct inode *inode = ocfs2_lock_res_inode(lockres); 3196 struct inode *inode = ocfs2_lock_res_inode(lockres);
3081 int checkpointed = ocfs2_inode_fully_checkpointed(inode); 3197 int checkpointed = ocfs2_inode_fully_checkpointed(inode);
3082 3198
3083 BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE); 3199 BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
3084 BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed); 3200 BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
3085 3201
3086 if (checkpointed) 3202 if (checkpointed)
3087 return 1; 3203 return 1;
@@ -3145,7 +3261,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3145 * valid. The downconvert code will retain a PR for this node, 3261 * valid. The downconvert code will retain a PR for this node,
3146 * so there's no further work to do. 3262 * so there's no further work to do.
3147 */ 3263 */
3148 if (blocking == LKM_PRMODE) 3264 if (blocking == DLM_LOCK_PR)
3149 return UNBLOCK_CONTINUE; 3265 return UNBLOCK_CONTINUE;
3150 3266
3151 /* 3267 /*
@@ -3219,6 +3335,45 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3219 return UNBLOCK_CONTINUE_POST; 3335 return UNBLOCK_CONTINUE_POST;
3220} 3336}
3221 3337
3338/*
3339 * This is the filesystem locking protocol. It provides the lock handling
3340 * hooks for the underlying DLM. It has a maximum version number.
3341 * The version number allows interoperability with systems running at
3342 * the same major number and an equal or smaller minor number.
3343 *
3344 * Whenever the filesystem does new things with locks (adds or removes a
3345 * lock, orders them differently, does different things underneath a lock),
3346 * the version must be changed. The protocol is negotiated when joining
3347 * the dlm domain. A node may join the domain if its major version is
3348 * identical to all other nodes and its minor version is greater than
3349 * or equal to all other nodes. When its minor version is greater than
3350 * the other nodes, it will run at the minor version specified by the
3351 * other nodes.
3352 *
3353 * If a locking change is made that will not be compatible with older
3354 * versions, the major number must be increased and the minor version set
3355 * to zero. If a change merely adds a behavior that can be disabled when
3356 * speaking to older versions, the minor version must be increased. If a
3357 * change adds a fully backwards compatible change (eg, LVB changes that
3358 * are just ignored by older versions), the version does not need to be
3359 * updated.
3360 */
3361static struct ocfs2_locking_protocol lproto = {
3362 .lp_max_version = {
3363 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
3364 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
3365 },
3366 .lp_lock_ast = ocfs2_locking_ast,
3367 .lp_blocking_ast = ocfs2_blocking_ast,
3368 .lp_unlock_ast = ocfs2_unlock_ast,
3369};
3370
3371void ocfs2_set_locking_protocol(void)
3372{
3373 ocfs2_stack_glue_set_locking_protocol(&lproto);
3374}
3375
3376
3222static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 3377static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3223 struct ocfs2_lock_res *lockres) 3378 struct ocfs2_lock_res *lockres)
3224{ 3379{
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e3cf902404b4..2bb01f09c1b1 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -58,7 +58,7 @@ struct ocfs2_meta_lvb {
58#define OCFS2_LOCK_NONBLOCK (0x04) 58#define OCFS2_LOCK_NONBLOCK (0x04)
59 59
60int ocfs2_dlm_init(struct ocfs2_super *osb); 60int ocfs2_dlm_init(struct ocfs2_super *osb);
61void ocfs2_dlm_shutdown(struct ocfs2_super *osb); 61void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
62void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); 62void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
63void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, 63void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
64 enum ocfs2_lock_type type, 64 enum ocfs2_lock_type type,
@@ -114,5 +114,6 @@ void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
114struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); 114struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
115void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); 115void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
116 116
117extern const struct dlm_protocol_version ocfs2_locking_protocol; 117/* To set the locking protocol on module initialization */
118void ocfs2_set_locking_protocol(void);
118#endif /* DLMGLUE_H */ 119#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ed5d5232e85d..9154c82d3258 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2242,7 +2242,7 @@ const struct file_operations ocfs2_fops = {
2242 .open = ocfs2_file_open, 2242 .open = ocfs2_file_open,
2243 .aio_read = ocfs2_file_aio_read, 2243 .aio_read = ocfs2_file_aio_read,
2244 .aio_write = ocfs2_file_aio_write, 2244 .aio_write = ocfs2_file_aio_write,
2245 .ioctl = ocfs2_ioctl, 2245 .unlocked_ioctl = ocfs2_ioctl,
2246#ifdef CONFIG_COMPAT 2246#ifdef CONFIG_COMPAT
2247 .compat_ioctl = ocfs2_compat_ioctl, 2247 .compat_ioctl = ocfs2_compat_ioctl,
2248#endif 2248#endif
@@ -2258,7 +2258,7 @@ const struct file_operations ocfs2_dops = {
2258 .fsync = ocfs2_sync_file, 2258 .fsync = ocfs2_sync_file,
2259 .release = ocfs2_dir_release, 2259 .release = ocfs2_dir_release,
2260 .open = ocfs2_dir_open, 2260 .open = ocfs2_dir_open,
2261 .ioctl = ocfs2_ioctl, 2261 .unlocked_ioctl = ocfs2_ioctl,
2262#ifdef CONFIG_COMPAT 2262#ifdef CONFIG_COMPAT
2263 .compat_ioctl = ocfs2_compat_ioctl, 2263 .compat_ioctl = ocfs2_compat_ioctl,
2264#endif 2264#endif
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 0758daf64da0..c6e7213db868 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -28,9 +28,6 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/kmod.h>
32
33#include <dlm/dlmapi.h>
34 31
35#define MLOG_MASK_PREFIX ML_SUPER 32#define MLOG_MASK_PREFIX ML_SUPER
36#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -48,7 +45,6 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
48 int bit); 45 int bit);
49static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, 46static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
50 int bit); 47 int bit);
51static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
52 48
53/* special case -1 for now 49/* special case -1 for now
54 * TODO: should *really* make sure the calling func never passes -1!! */ 50 * TODO: should *really* make sure the calling func never passes -1!! */
@@ -62,23 +58,23 @@ static void ocfs2_node_map_init(struct ocfs2_node_map *map)
62void ocfs2_init_node_maps(struct ocfs2_super *osb) 58void ocfs2_init_node_maps(struct ocfs2_super *osb)
63{ 59{
64 spin_lock_init(&osb->node_map_lock); 60 spin_lock_init(&osb->node_map_lock);
65 ocfs2_node_map_init(&osb->recovery_map);
66 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); 61 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
67} 62}
68 63
69static void ocfs2_do_node_down(int node_num, 64void ocfs2_do_node_down(int node_num, void *data)
70 struct ocfs2_super *osb)
71{ 65{
66 struct ocfs2_super *osb = data;
67
72 BUG_ON(osb->node_num == node_num); 68 BUG_ON(osb->node_num == node_num);
73 69
74 mlog(0, "ocfs2: node down event for %d\n", node_num); 70 mlog(0, "ocfs2: node down event for %d\n", node_num);
75 71
76 if (!osb->dlm) { 72 if (!osb->cconn) {
77 /* 73 /*
78 * No DLM means we're not even ready to participate yet. 74 * No cluster connection means we're not even ready to
79 * We check the slots after the DLM comes up, so we will 75 * participate yet. We check the slots after the cluster
80 * notice the node death then. We can safely ignore it 76 * comes up, so we will notice the node death then. We
81 * here. 77 * can safely ignore it here.
82 */ 78 */
83 return; 79 return;
84 } 80 }
@@ -86,61 +82,6 @@ static void ocfs2_do_node_down(int node_num,
86 ocfs2_recovery_thread(osb, node_num); 82 ocfs2_recovery_thread(osb, node_num);
87} 83}
88 84
89/* Called from the dlm when it's about to evict a node. We may also
90 * get a heartbeat callback later. */
91static void ocfs2_dlm_eviction_cb(int node_num,
92 void *data)
93{
94 struct ocfs2_super *osb = (struct ocfs2_super *) data;
95 struct super_block *sb = osb->sb;
96
97 mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
98 MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
99
100 ocfs2_do_node_down(node_num, osb);
101}
102
103void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
104{
105 /* Not exactly a heartbeat callback, but leads to essentially
106 * the same path so we set it up here. */
107 dlm_setup_eviction_cb(&osb->osb_eviction_cb,
108 ocfs2_dlm_eviction_cb,
109 osb);
110}
111
112void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
113{
114 int ret;
115 char *argv[5], *envp[3];
116
117 if (ocfs2_mount_local(osb))
118 return;
119
120 if (!osb->uuid_str) {
121 /* This can happen if we don't get far enough in mount... */
122 mlog(0, "No UUID with which to stop heartbeat!\n\n");
123 return;
124 }
125
126 argv[0] = (char *)o2nm_get_hb_ctl_path();
127 argv[1] = "-K";
128 argv[2] = "-u";
129 argv[3] = osb->uuid_str;
130 argv[4] = NULL;
131
132 mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
133
134 /* minimal command environment taken from cpu_run_sbin_hotplug */
135 envp[0] = "HOME=/";
136 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
137 envp[2] = NULL;
138
139 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
140 if (ret < 0)
141 mlog_errno(ret);
142}
143
144static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, 85static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
145 int bit) 86 int bit)
146{ 87{
@@ -192,112 +133,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
192 return ret; 133 return ret;
193} 134}
194 135
195static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
196{
197 int bit;
198 bit = find_next_bit(map->map, map->num_nodes, 0);
199 if (bit < map->num_nodes)
200 return 0;
201 return 1;
202}
203
204int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
205 struct ocfs2_node_map *map)
206{
207 int ret;
208 BUG_ON(map->num_nodes == 0);
209 spin_lock(&osb->node_map_lock);
210 ret = __ocfs2_node_map_is_empty(map);
211 spin_unlock(&osb->node_map_lock);
212 return ret;
213}
214
215#if 0
216
217static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
218 struct ocfs2_node_map *from)
219{
220 BUG_ON(from->num_nodes == 0);
221 ocfs2_node_map_init(target);
222 __ocfs2_node_map_set(target, from);
223}
224
225/* returns 1 if bit is the only bit set in target, 0 otherwise */
226int ocfs2_node_map_is_only(struct ocfs2_super *osb,
227 struct ocfs2_node_map *target,
228 int bit)
229{
230 struct ocfs2_node_map temp;
231 int ret;
232
233 spin_lock(&osb->node_map_lock);
234 __ocfs2_node_map_dup(&temp, target);
235 __ocfs2_node_map_clear_bit(&temp, bit);
236 ret = __ocfs2_node_map_is_empty(&temp);
237 spin_unlock(&osb->node_map_lock);
238
239 return ret;
240}
241
242static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
243 struct ocfs2_node_map *from)
244{
245 int num_longs, i;
246
247 BUG_ON(target->num_nodes != from->num_nodes);
248 BUG_ON(target->num_nodes == 0);
249
250 num_longs = BITS_TO_LONGS(target->num_nodes);
251 for (i = 0; i < num_longs; i++)
252 target->map[i] = from->map[i];
253}
254
255#endif /* 0 */
256
257/* Returns whether the recovery bit was actually set - it may not be
258 * if a node is still marked as needing recovery */
259int ocfs2_recovery_map_set(struct ocfs2_super *osb,
260 int num)
261{
262 int set = 0;
263
264 spin_lock(&osb->node_map_lock);
265
266 if (!test_bit(num, osb->recovery_map.map)) {
267 __ocfs2_node_map_set_bit(&osb->recovery_map, num);
268 set = 1;
269 }
270
271 spin_unlock(&osb->node_map_lock);
272
273 return set;
274}
275
276void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
277 int num)
278{
279 ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
280}
281
282int ocfs2_node_map_iterate(struct ocfs2_super *osb,
283 struct ocfs2_node_map *map,
284 int idx)
285{
286 int i = idx;
287
288 idx = O2NM_INVALID_NODE_NUM;
289 spin_lock(&osb->node_map_lock);
290 if ((i != O2NM_INVALID_NODE_NUM) &&
291 (i >= 0) &&
292 (i < map->num_nodes)) {
293 while(i < map->num_nodes) {
294 if (test_bit(i, map->map)) {
295 idx = i;
296 break;
297 }
298 i++;
299 }
300 }
301 spin_unlock(&osb->node_map_lock);
302 return idx;
303}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index eac63aed7611..74b9c5dda28d 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -28,13 +28,10 @@
28 28
29void ocfs2_init_node_maps(struct ocfs2_super *osb); 29void ocfs2_init_node_maps(struct ocfs2_super *osb);
30 30
31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); 31void ocfs2_do_node_down(int node_num, void *data);
32void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
33 32
34/* node map functions - used to keep track of mounted and in-recovery 33/* node map functions - used to keep track of mounted and in-recovery
35 * nodes. */ 34 * nodes. */
36int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
37 struct ocfs2_node_map *map);
38void ocfs2_node_map_set_bit(struct ocfs2_super *osb, 35void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
39 struct ocfs2_node_map *map, 36 struct ocfs2_node_map *map,
40 int bit); 37 int bit);
@@ -44,17 +41,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
44int ocfs2_node_map_test_bit(struct ocfs2_super *osb, 41int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
45 struct ocfs2_node_map *map, 42 struct ocfs2_node_map *map,
46 int bit); 43 int bit);
47int ocfs2_node_map_iterate(struct ocfs2_super *osb,
48 struct ocfs2_node_map *map,
49 int idx);
50static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
51 struct ocfs2_node_map *map)
52{
53 return ocfs2_node_map_iterate(osb, map, 0);
54}
55int ocfs2_recovery_map_set(struct ocfs2_super *osb,
56 int num);
57void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
58 int num);
59 44
60#endif /* OCFS2_HEARTBEAT_H */ 45#endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 5177fba5162b..7b142f0ce995 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/smp_lock.h>
10 11
11#define MLOG_MASK_PREFIX ML_INODE 12#define MLOG_MASK_PREFIX ML_INODE
12#include <cluster/masklog.h> 13#include <cluster/masklog.h>
@@ -59,10 +60,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
59 goto bail; 60 goto bail;
60 } 61 }
61 62
62 status = -EROFS;
63 if (IS_RDONLY(inode))
64 goto bail_unlock;
65
66 status = -EACCES; 63 status = -EACCES;
67 if (!is_owner_or_cap(inode)) 64 if (!is_owner_or_cap(inode))
68 goto bail_unlock; 65 goto bail_unlock;
@@ -112,9 +109,9 @@ bail:
112 return status; 109 return status;
113} 110}
114 111
115int ocfs2_ioctl(struct inode * inode, struct file * filp, 112long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
116 unsigned int cmd, unsigned long arg)
117{ 113{
114 struct inode *inode = filp->f_path.dentry->d_inode;
118 unsigned int flags; 115 unsigned int flags;
119 int new_clusters; 116 int new_clusters;
120 int status; 117 int status;
@@ -133,8 +130,13 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
133 if (get_user(flags, (int __user *) arg)) 130 if (get_user(flags, (int __user *) arg))
134 return -EFAULT; 131 return -EFAULT;
135 132
136 return ocfs2_set_inode_attr(inode, flags, 133 status = mnt_want_write(filp->f_path.mnt);
134 if (status)
135 return status;
136 status = ocfs2_set_inode_attr(inode, flags,
137 OCFS2_FL_MODIFIABLE); 137 OCFS2_FL_MODIFIABLE);
138 mnt_drop_write(filp->f_path.mnt);
139 return status;
138 case OCFS2_IOC_RESVSP: 140 case OCFS2_IOC_RESVSP:
139 case OCFS2_IOC_RESVSP64: 141 case OCFS2_IOC_RESVSP64:
140 case OCFS2_IOC_UNRESVSP: 142 case OCFS2_IOC_UNRESVSP:
@@ -168,9 +170,6 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
168#ifdef CONFIG_COMPAT 170#ifdef CONFIG_COMPAT
169long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) 171long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
170{ 172{
171 struct inode *inode = file->f_path.dentry->d_inode;
172 int ret;
173
174 switch (cmd) { 173 switch (cmd) {
175 case OCFS2_IOC32_GETFLAGS: 174 case OCFS2_IOC32_GETFLAGS:
176 cmd = OCFS2_IOC_GETFLAGS; 175 cmd = OCFS2_IOC_GETFLAGS;
@@ -190,9 +189,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
190 return -ENOIOCTLCMD; 189 return -ENOIOCTLCMD;
191 } 190 }
192 191
193 lock_kernel(); 192 return ocfs2_ioctl(file, cmd, arg);
194 ret = ocfs2_ioctl(inode, file, cmd, arg);
195 unlock_kernel();
196 return ret;
197} 193}
198#endif 194#endif
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 4d6c4f430d0d..cf9a5ee30fef 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -10,8 +10,7 @@
10#ifndef OCFS2_IOCTL_H 10#ifndef OCFS2_IOCTL_H
11#define OCFS2_IOCTL_H 11#define OCFS2_IOCTL_H
12 12
13int ocfs2_ioctl(struct inode * inode, struct file * filp, 13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
14 unsigned int cmd, unsigned long arg);
15long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); 14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
16 15
17#endif /* OCFS2_IOCTL_H */ 16#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f31c7e8c19c3..9698338adc39 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -64,6 +64,137 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
64 int slot); 64 int slot);
65static int ocfs2_commit_thread(void *arg); 65static int ocfs2_commit_thread(void *arg);
66 66
67
68/*
69 * The recovery_list is a simple linked list of node numbers to recover.
70 * It is protected by the recovery_lock.
71 */
72
73struct ocfs2_recovery_map {
74 unsigned int rm_used;
75 unsigned int *rm_entries;
76};
77
78int ocfs2_recovery_init(struct ocfs2_super *osb)
79{
80 struct ocfs2_recovery_map *rm;
81
82 mutex_init(&osb->recovery_lock);
83 osb->disable_recovery = 0;
84 osb->recovery_thread_task = NULL;
85 init_waitqueue_head(&osb->recovery_event);
86
87 rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
88 osb->max_slots * sizeof(unsigned int),
89 GFP_KERNEL);
90 if (!rm) {
91 mlog_errno(-ENOMEM);
92 return -ENOMEM;
93 }
94
95 rm->rm_entries = (unsigned int *)((char *)rm +
96 sizeof(struct ocfs2_recovery_map));
97 osb->recovery_map = rm;
98
99 return 0;
100}
101
102/* we can't grab the goofy sem lock from inside wait_event, so we use
103 * memory barriers to make sure that we'll see the null task before
104 * being woken up */
105static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
106{
107 mb();
108 return osb->recovery_thread_task != NULL;
109}
110
111void ocfs2_recovery_exit(struct ocfs2_super *osb)
112{
113 struct ocfs2_recovery_map *rm;
114
115 /* disable any new recovery threads and wait for any currently
116 * running ones to exit. Do this before setting the vol_state. */
117 mutex_lock(&osb->recovery_lock);
118 osb->disable_recovery = 1;
119 mutex_unlock(&osb->recovery_lock);
120 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
121
122 /* At this point, we know that no more recovery threads can be
123 * launched, so wait for any recovery completion work to
124 * complete. */
125 flush_workqueue(ocfs2_wq);
126
127 /*
128 * Now that recovery is shut down, and the osb is about to be
129 * freed, the osb_lock is not taken here.
130 */
131 rm = osb->recovery_map;
132 /* XXX: Should we bug if there are dirty entries? */
133
134 kfree(rm);
135}
136
137static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
138 unsigned int node_num)
139{
140 int i;
141 struct ocfs2_recovery_map *rm = osb->recovery_map;
142
143 assert_spin_locked(&osb->osb_lock);
144
145 for (i = 0; i < rm->rm_used; i++) {
146 if (rm->rm_entries[i] == node_num)
147 return 1;
148 }
149
150 return 0;
151}
152
153/* Behaves like test-and-set. Returns the previous value */
154static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
155 unsigned int node_num)
156{
157 struct ocfs2_recovery_map *rm = osb->recovery_map;
158
159 spin_lock(&osb->osb_lock);
160 if (__ocfs2_recovery_map_test(osb, node_num)) {
161 spin_unlock(&osb->osb_lock);
162 return 1;
163 }
164
165 /* XXX: Can this be exploited? Not from o2dlm... */
166 BUG_ON(rm->rm_used >= osb->max_slots);
167
168 rm->rm_entries[rm->rm_used] = node_num;
169 rm->rm_used++;
170 spin_unlock(&osb->osb_lock);
171
172 return 0;
173}
174
175static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
176 unsigned int node_num)
177{
178 int i;
179 struct ocfs2_recovery_map *rm = osb->recovery_map;
180
181 spin_lock(&osb->osb_lock);
182
183 for (i = 0; i < rm->rm_used; i++) {
184 if (rm->rm_entries[i] == node_num)
185 break;
186 }
187
188 if (i < rm->rm_used) {
189 /* XXX: be careful with the pointer math */
190 memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
191 (rm->rm_used - i - 1) * sizeof(unsigned int));
192 rm->rm_used--;
193 }
194
195 spin_unlock(&osb->osb_lock);
196}
197
67static int ocfs2_commit_cache(struct ocfs2_super *osb) 198static int ocfs2_commit_cache(struct ocfs2_super *osb)
68{ 199{
69 int status = 0; 200 int status = 0;
@@ -586,8 +717,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
586 717
587 mlog_entry_void(); 718 mlog_entry_void();
588 719
589 if (!journal) 720 BUG_ON(!journal);
590 BUG();
591 721
592 osb = journal->j_osb; 722 osb = journal->j_osb;
593 723
@@ -650,6 +780,23 @@ bail:
650 return status; 780 return status;
651} 781}
652 782
783static int ocfs2_recovery_completed(struct ocfs2_super *osb)
784{
785 int empty;
786 struct ocfs2_recovery_map *rm = osb->recovery_map;
787
788 spin_lock(&osb->osb_lock);
789 empty = (rm->rm_used == 0);
790 spin_unlock(&osb->osb_lock);
791
792 return empty;
793}
794
795void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
796{
797 wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
798}
799
653/* 800/*
654 * JBD Might read a cached version of another nodes journal file. We 801 * JBD Might read a cached version of another nodes journal file. We
655 * don't want this as this file changes often and we get no 802 * don't want this as this file changes often and we get no
@@ -848,6 +995,7 @@ static int __ocfs2_recovery_thread(void *arg)
848{ 995{
849 int status, node_num; 996 int status, node_num;
850 struct ocfs2_super *osb = arg; 997 struct ocfs2_super *osb = arg;
998 struct ocfs2_recovery_map *rm = osb->recovery_map;
851 999
852 mlog_entry_void(); 1000 mlog_entry_void();
853 1001
@@ -863,26 +1011,29 @@ restart:
863 goto bail; 1011 goto bail;
864 } 1012 }
865 1013
866 while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { 1014 spin_lock(&osb->osb_lock);
867 node_num = ocfs2_node_map_first_set_bit(osb, 1015 while (rm->rm_used) {
868 &osb->recovery_map); 1016 /* It's always safe to remove entry zero, as we won't
869 if (node_num == O2NM_INVALID_NODE_NUM) { 1017 * clear it until ocfs2_recover_node() has succeeded. */
870 mlog(0, "Out of nodes to recover.\n"); 1018 node_num = rm->rm_entries[0];
871 break; 1019 spin_unlock(&osb->osb_lock);
872 }
873 1020
874 status = ocfs2_recover_node(osb, node_num); 1021 status = ocfs2_recover_node(osb, node_num);
875 if (status < 0) { 1022 if (!status) {
1023 ocfs2_recovery_map_clear(osb, node_num);
1024 } else {
876 mlog(ML_ERROR, 1025 mlog(ML_ERROR,
877 "Error %d recovering node %d on device (%u,%u)!\n", 1026 "Error %d recovering node %d on device (%u,%u)!\n",
878 status, node_num, 1027 status, node_num,
879 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1028 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
880 mlog(ML_ERROR, "Volume requires unmount.\n"); 1029 mlog(ML_ERROR, "Volume requires unmount.\n");
881 continue;
882 } 1030 }
883 1031
884 ocfs2_recovery_map_clear(osb, node_num); 1032 spin_lock(&osb->osb_lock);
885 } 1033 }
1034 spin_unlock(&osb->osb_lock);
1035 mlog(0, "All nodes recovered\n");
1036
886 ocfs2_super_unlock(osb, 1); 1037 ocfs2_super_unlock(osb, 1);
887 1038
888 /* We always run recovery on our own orphan dir - the dead 1039 /* We always run recovery on our own orphan dir - the dead
@@ -893,8 +1044,7 @@ restart:
893 1044
894bail: 1045bail:
895 mutex_lock(&osb->recovery_lock); 1046 mutex_lock(&osb->recovery_lock);
896 if (!status && 1047 if (!status && !ocfs2_recovery_completed(osb)) {
897 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
898 mutex_unlock(&osb->recovery_lock); 1048 mutex_unlock(&osb->recovery_lock);
899 goto restart; 1049 goto restart;
900 } 1050 }
@@ -924,8 +1074,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
924 1074
925 /* People waiting on recovery will wait on 1075 /* People waiting on recovery will wait on
926 * the recovery map to empty. */ 1076 * the recovery map to empty. */
927 if (!ocfs2_recovery_map_set(osb, node_num)) 1077 if (ocfs2_recovery_map_set(osb, node_num))
928 mlog(0, "node %d already be in recovery.\n", node_num); 1078 mlog(0, "node %d already in recovery map.\n", node_num);
929 1079
930 mlog(0, "starting recovery thread...\n"); 1080 mlog(0, "starting recovery thread...\n");
931 1081
@@ -1079,7 +1229,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1079{ 1229{
1080 int status = 0; 1230 int status = 0;
1081 int slot_num; 1231 int slot_num;
1082 struct ocfs2_slot_info *si = osb->slot_info;
1083 struct ocfs2_dinode *la_copy = NULL; 1232 struct ocfs2_dinode *la_copy = NULL;
1084 struct ocfs2_dinode *tl_copy = NULL; 1233 struct ocfs2_dinode *tl_copy = NULL;
1085 1234
@@ -1092,8 +1241,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1092 * case we should've called ocfs2_journal_load instead. */ 1241 * case we should've called ocfs2_journal_load instead. */
1093 BUG_ON(osb->node_num == node_num); 1242 BUG_ON(osb->node_num == node_num);
1094 1243
1095 slot_num = ocfs2_node_num_to_slot(si, node_num); 1244 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1096 if (slot_num == OCFS2_INVALID_SLOT) { 1245 if (slot_num == -ENOENT) {
1097 status = 0; 1246 status = 0;
1098 mlog(0, "no slot for this node, so no recovery required.\n"); 1247 mlog(0, "no slot for this node, so no recovery required.\n");
1099 goto done; 1248 goto done;
@@ -1123,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1123 1272
1124 /* Likewise, this would be a strange but ultimately not so 1273 /* Likewise, this would be a strange but ultimately not so
1125 * harmful place to get an error... */ 1274 * harmful place to get an error... */
1126 ocfs2_clear_slot(si, slot_num); 1275 status = ocfs2_clear_slot(osb, slot_num);
1127 status = ocfs2_update_disk_slots(osb, si);
1128 if (status < 0) 1276 if (status < 0)
1129 mlog_errno(status); 1277 mlog_errno(status);
1130 1278
@@ -1184,23 +1332,24 @@ bail:
1184 * slot info struct has been updated from disk. */ 1332 * slot info struct has been updated from disk. */
1185int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) 1333int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1186{ 1334{
1187 int status, i, node_num; 1335 unsigned int node_num;
1188 struct ocfs2_slot_info *si = osb->slot_info; 1336 int status, i;
1189 1337
1190 /* This is called with the super block cluster lock, so we 1338 /* This is called with the super block cluster lock, so we
1191 * know that the slot map can't change underneath us. */ 1339 * know that the slot map can't change underneath us. */
1192 1340
1193 spin_lock(&si->si_lock); 1341 spin_lock(&osb->osb_lock);
1194 for(i = 0; i < si->si_num_slots; i++) { 1342 for (i = 0; i < osb->max_slots; i++) {
1195 if (i == osb->slot_num) 1343 if (i == osb->slot_num)
1196 continue; 1344 continue;
1197 if (ocfs2_is_empty_slot(si, i)) 1345
1346 status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
1347 if (status == -ENOENT)
1198 continue; 1348 continue;
1199 1349
1200 node_num = si->si_global_node_nums[i]; 1350 if (__ocfs2_recovery_map_test(osb, node_num))
1201 if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
1202 continue; 1351 continue;
1203 spin_unlock(&si->si_lock); 1352 spin_unlock(&osb->osb_lock);
1204 1353
1205 /* Ok, we have a slot occupied by another node which 1354 /* Ok, we have a slot occupied by another node which
1206 * is not in the recovery map. We trylock his journal 1355 * is not in the recovery map. We trylock his journal
@@ -1216,9 +1365,9 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1216 goto bail; 1365 goto bail;
1217 } 1366 }
1218 1367
1219 spin_lock(&si->si_lock); 1368 spin_lock(&osb->osb_lock);
1220 } 1369 }
1221 spin_unlock(&si->si_lock); 1370 spin_unlock(&osb->osb_lock);
1222 1371
1223 status = 0; 1372 status = 0;
1224bail: 1373bail:
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 220f3e818e78..db82be2532ed 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
134 134
135/* Exported only for the journal struct init code in super.c. Do not call. */ 135/* Exported only for the journal struct init code in super.c. Do not call. */
136void ocfs2_complete_recovery(struct work_struct *work); 136void ocfs2_complete_recovery(struct work_struct *work);
137void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
138
139int ocfs2_recovery_init(struct ocfs2_super *osb);
140void ocfs2_recovery_exit(struct ocfs2_super *osb);
137 141
138/* 142/*
139 * Journal Control: 143 * Journal Control:
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ab83fd562429..ce0dc147602a 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -447,6 +447,8 @@ out_mutex:
447 iput(main_bm_inode); 447 iput(main_bm_inode);
448 448
449out: 449out:
450 if (!status)
451 ocfs2_init_inode_steal_slot(osb);
450 mlog_exit(status); 452 mlog_exit(status);
451 return status; 453 return status;
452} 454}
@@ -523,6 +525,8 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
523 } 525 }
524 526
525 ac->ac_inode = local_alloc_inode; 527 ac->ac_inode = local_alloc_inode;
528 /* We should never use localalloc from another slot */
529 ac->ac_alloc_slot = osb->slot_num;
526 ac->ac_which = OCFS2_AC_USE_LOCAL; 530 ac->ac_which = OCFS2_AC_USE_LOCAL;
527 get_bh(osb->local_alloc_bh); 531 get_bh(osb->local_alloc_bh);
528 ac->ac_bh = osb->local_alloc_bh; 532 ac->ac_bh = osb->local_alloc_bh;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index ae9ad9587516..d5d808fe0140 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -424,7 +424,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
424 fe->i_fs_generation = cpu_to_le32(osb->fs_generation); 424 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
425 fe->i_blkno = cpu_to_le64(fe_blkno); 425 fe->i_blkno = cpu_to_le64(fe_blkno);
426 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 426 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
427 fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); 427 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
428 fe->i_uid = cpu_to_le32(current->fsuid); 428 fe->i_uid = cpu_to_le32(current->fsuid);
429 if (dir->i_mode & S_ISGID) { 429 if (dir->i_mode & S_ISGID) {
430 fe->i_gid = cpu_to_le32(dir->i_gid); 430 fe->i_gid = cpu_to_le32(dir->i_gid);
@@ -997,7 +997,7 @@ static int ocfs2_rename(struct inode *old_dir,
997 * 997 *
998 * And that's why, just like the VFS, we need a file system 998 * And that's why, just like the VFS, we need a file system
999 * rename lock. */ 999 * rename lock. */
1000 if (old_dentry != new_dentry) { 1000 if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) {
1001 status = ocfs2_rename_lock(osb); 1001 status = ocfs2_rename_lock(osb);
1002 if (status < 0) { 1002 if (status < 0) {
1003 mlog_errno(status); 1003 mlog_errno(status);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6546cef212e3..31692379c170 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -36,11 +36,8 @@
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/jbd.h> 37#include <linux/jbd.h>
38 38
39#include "cluster/nodemanager.h" 39/* For union ocfs2_dlm_lksb */
40#include "cluster/heartbeat.h" 40#include "stackglue.h"
41#include "cluster/tcp.h"
42
43#include "dlm/dlmapi.h"
44 41
45#include "ocfs2_fs.h" 42#include "ocfs2_fs.h"
46#include "ocfs2_lockid.h" 43#include "ocfs2_lockid.h"
@@ -101,6 +98,9 @@ enum ocfs2_unlock_action {
101 * dropped. */ 98 * dropped. */
102#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ 99#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
103#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ 100#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
101#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a
102 call to dlm_lock. Only
103 exists with BUSY set. */
104 104
105struct ocfs2_lock_res_ops; 105struct ocfs2_lock_res_ops;
106 106
@@ -120,13 +120,14 @@ struct ocfs2_lock_res {
120 int l_level; 120 int l_level;
121 unsigned int l_ro_holders; 121 unsigned int l_ro_holders;
122 unsigned int l_ex_holders; 122 unsigned int l_ex_holders;
123 struct dlm_lockstatus l_lksb; 123 union ocfs2_dlm_lksb l_lksb;
124 124
125 /* used from AST/BAST funcs. */ 125 /* used from AST/BAST funcs. */
126 enum ocfs2_ast_action l_action; 126 enum ocfs2_ast_action l_action;
127 enum ocfs2_unlock_action l_unlock_action; 127 enum ocfs2_unlock_action l_unlock_action;
128 int l_requested; 128 int l_requested;
129 int l_blocking; 129 int l_blocking;
130 unsigned int l_pending_gen;
130 131
131 wait_queue_head_t l_event; 132 wait_queue_head_t l_event;
132 133
@@ -179,6 +180,8 @@ enum ocfs2_mount_options
179#define OCFS2_DEFAULT_ATIME_QUANTUM 60 180#define OCFS2_DEFAULT_ATIME_QUANTUM 60
180 181
181struct ocfs2_journal; 182struct ocfs2_journal;
183struct ocfs2_slot_info;
184struct ocfs2_recovery_map;
182struct ocfs2_super 185struct ocfs2_super
183{ 186{
184 struct task_struct *commit_task; 187 struct task_struct *commit_task;
@@ -190,7 +193,6 @@ struct ocfs2_super
190 struct ocfs2_slot_info *slot_info; 193 struct ocfs2_slot_info *slot_info;
191 194
192 spinlock_t node_map_lock; 195 spinlock_t node_map_lock;
193 struct ocfs2_node_map recovery_map;
194 196
195 u64 root_blkno; 197 u64 root_blkno;
196 u64 system_dir_blkno; 198 u64 system_dir_blkno;
@@ -206,25 +208,29 @@ struct ocfs2_super
206 u32 s_feature_incompat; 208 u32 s_feature_incompat;
207 u32 s_feature_ro_compat; 209 u32 s_feature_ro_compat;
208 210
209 /* Protects s_next_generaion, osb_flags. Could protect more on 211 /* Protects s_next_generation, osb_flags and s_inode_steal_slot.
210 * osb as it's very short lived. */ 212 * Could protect more on osb as it's very short lived.
213 */
211 spinlock_t osb_lock; 214 spinlock_t osb_lock;
212 u32 s_next_generation; 215 u32 s_next_generation;
213 unsigned long osb_flags; 216 unsigned long osb_flags;
217 s16 s_inode_steal_slot;
218 atomic_t s_num_inodes_stolen;
214 219
215 unsigned long s_mount_opt; 220 unsigned long s_mount_opt;
216 unsigned int s_atime_quantum; 221 unsigned int s_atime_quantum;
217 222
218 u16 max_slots; 223 unsigned int max_slots;
219 s16 node_num; 224 unsigned int node_num;
220 s16 slot_num; 225 int slot_num;
221 s16 preferred_slot; 226 int preferred_slot;
222 int s_sectsize_bits; 227 int s_sectsize_bits;
223 int s_clustersize; 228 int s_clustersize;
224 int s_clustersize_bits; 229 int s_clustersize_bits;
225 230
226 atomic_t vol_state; 231 atomic_t vol_state;
227 struct mutex recovery_lock; 232 struct mutex recovery_lock;
233 struct ocfs2_recovery_map *recovery_map;
228 struct task_struct *recovery_thread_task; 234 struct task_struct *recovery_thread_task;
229 int disable_recovery; 235 int disable_recovery;
230 wait_queue_head_t checkpoint_event; 236 wait_queue_head_t checkpoint_event;
@@ -245,12 +251,11 @@ struct ocfs2_super
245 struct ocfs2_alloc_stats alloc_stats; 251 struct ocfs2_alloc_stats alloc_stats;
246 char dev_str[20]; /* "major,minor" of the device */ 252 char dev_str[20]; /* "major,minor" of the device */
247 253
248 struct dlm_ctxt *dlm; 254 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
255 struct ocfs2_cluster_connection *cconn;
249 struct ocfs2_lock_res osb_super_lockres; 256 struct ocfs2_lock_res osb_super_lockres;
250 struct ocfs2_lock_res osb_rename_lockres; 257 struct ocfs2_lock_res osb_rename_lockres;
251 struct dlm_eviction_cb osb_eviction_cb;
252 struct ocfs2_dlm_debug *osb_dlm_debug; 258 struct ocfs2_dlm_debug *osb_dlm_debug;
253 struct dlm_protocol_version osb_locking_proto;
254 259
255 struct dentry *osb_debug_root; 260 struct dentry *osb_debug_root;
256 261
@@ -367,11 +372,24 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
367 return ret; 372 return ret;
368} 373}
369 374
375static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
376{
377 return (osb->s_feature_incompat &
378 OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
379}
380
370static inline int ocfs2_mount_local(struct ocfs2_super *osb) 381static inline int ocfs2_mount_local(struct ocfs2_super *osb)
371{ 382{
372 return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); 383 return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
373} 384}
374 385
386static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
387{
388 return (osb->s_feature_incompat &
389 OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP);
390}
391
392
375#define OCFS2_IS_VALID_DINODE(ptr) \ 393#define OCFS2_IS_VALID_DINODE(ptr) \
376 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) 394 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
377 395
@@ -522,6 +540,33 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
522 return pages_per_cluster; 540 return pages_per_cluster;
523} 541}
524 542
543static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
544{
545 spin_lock(&osb->osb_lock);
546 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
547 spin_unlock(&osb->osb_lock);
548 atomic_set(&osb->s_num_inodes_stolen, 0);
549}
550
551static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb,
552 s16 slot)
553{
554 spin_lock(&osb->osb_lock);
555 osb->s_inode_steal_slot = slot;
556 spin_unlock(&osb->osb_lock);
557}
558
559static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
560{
561 s16 slot;
562
563 spin_lock(&osb->osb_lock);
564 slot = osb->s_inode_steal_slot;
565 spin_unlock(&osb->osb_lock);
566
567 return slot;
568}
569
525#define ocfs2_set_bit ext2_set_bit 570#define ocfs2_set_bit ext2_set_bit
526#define ocfs2_clear_bit ext2_clear_bit 571#define ocfs2_clear_bit ext2_clear_bit
527#define ocfs2_test_bit ext2_test_bit 572#define ocfs2_test_bit ext2_test_bit
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3633edd3982f..52c426665154 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,9 @@
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ 90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
91 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA) 91 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
92 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
93 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
92#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 94#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
93 95
94/* 96/*
@@ -125,6 +127,21 @@
125/* Support for data packed into inode blocks */ 127/* Support for data packed into inode blocks */
126#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 128#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
127 129
130/* Support for the extended slot map */
131#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
132
133
134/*
135 * Support for alternate, userspace cluster stacks. If set, the superblock
136 * field s_cluster_info contains a tag for the alternate stack in use as
137 * well as the name of the cluster being joined.
138 * mount.ocfs2 must pass in a matching stack name.
139 *
140 * If not set, the classic stack will be used. This is compatbile with
141 * all older versions.
142 */
143#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
144
128/* 145/*
129 * backup superblock flag is used to indicate that this volume 146 * backup superblock flag is used to indicate that this volume
130 * has backup superblocks. 147 * has backup superblocks.
@@ -267,6 +284,10 @@ struct ocfs2_new_group_input {
267#define OCFS2_VOL_UUID_LEN 16 284#define OCFS2_VOL_UUID_LEN 16
268#define OCFS2_MAX_VOL_LABEL_LEN 64 285#define OCFS2_MAX_VOL_LABEL_LEN 64
269 286
287/* The alternate, userspace stack fields */
288#define OCFS2_STACK_LABEL_LEN 4
289#define OCFS2_CLUSTER_NAME_LEN 16
290
270/* Journal limits (in bytes) */ 291/* Journal limits (in bytes) */
271#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 292#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
272 293
@@ -475,6 +496,47 @@ struct ocfs2_extent_block
475}; 496};
476 497
477/* 498/*
499 * On disk slot map for OCFS2. This defines the contents of the "slot_map"
500 * system file. A slot is valid if it contains a node number >= 0. The
501 * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty.
502 */
503struct ocfs2_slot_map {
504/*00*/ __le16 sm_slots[0];
505/*
506 * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255,
507 * 255 * sizeof(__le16) == 512B, within the 512B block minimum blocksize.
508 */
509};
510
511struct ocfs2_extended_slot {
512/*00*/ __u8 es_valid;
513 __u8 es_reserved1[3];
514 __le32 es_node_num;
515/*10*/
516};
517
518/*
519 * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP
520 * is set. It separates out the valid marker from the node number, and
521 * has room to grow. Unlike the old slot map, this format is defined by
522 * i_size.
523 */
524struct ocfs2_slot_map_extended {
525/*00*/ struct ocfs2_extended_slot se_slots[0];
526/*
527 * Actual size is i_size of the slot_map system file. It should
528 * match s_max_slots * sizeof(struct ocfs2_extended_slot)
529 */
530};
531
532struct ocfs2_cluster_info {
533/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
534 __le32 ci_reserved;
535/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
536/*18*/
537};
538
539/*
478 * On disk superblock for OCFS2 540 * On disk superblock for OCFS2
479 * Note that it is contained inside an ocfs2_dinode, so all offsets 541 * Note that it is contained inside an ocfs2_dinode, so all offsets
480 * are relative to the start of ocfs2_dinode.id2. 542 * are relative to the start of ocfs2_dinode.id2.
@@ -506,7 +568,20 @@ struct ocfs2_super_block {
506 * group header */ 568 * group header */
507/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 569/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
508/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ 570/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
509/*A0*/ 571/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
572 stack. Only valid
573 with INCOMPAT flag. */
574/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */
575/*140*/
576
577 /*
578 * NOTE: As stated above, all offsets are relative to
579 * ocfs2_dinode.id2, which is at 0xC0 in the inode.
580 * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within
581 * our smallest blocksize, which is 512 bytes. To ensure this,
582 * we reserve the space in s_reserved2. Anything past s_reserved2
583 * will not be available on the smallest blocksize.
584 */
510}; 585};
511 586
512/* 587/*
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 86f3e3799c2b..82c200f7a8f1 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -100,7 +100,7 @@ static char *ocfs2_lock_type_strings[] = {
100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
101{ 101{
102#ifdef __KERNEL__ 102#ifdef __KERNEL__
103 mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); 103 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
104#endif 104#endif
105 return ocfs2_lock_type_strings[type]; 105 return ocfs2_lock_type_strings[type];
106} 106}
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 3a50ce555e64..bb5ff8939bf1 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -42,81 +42,244 @@
42 42
43#include "buffer_head_io.h" 43#include "buffer_head_io.h"
44 44
45static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, 45
46 s16 global); 46struct ocfs2_slot {
47static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, 47 int sl_valid;
48 s16 slot_num, 48 unsigned int sl_node_num;
49 s16 node_num); 49};
50 50
51/* post the slot information on disk into our slot_info struct. */ 51struct ocfs2_slot_info {
52void ocfs2_update_slot_info(struct ocfs2_slot_info *si) 52 int si_extended;
53 int si_slots_per_block;
54 struct inode *si_inode;
55 unsigned int si_blocks;
56 struct buffer_head **si_bh;
57 unsigned int si_num_slots;
58 struct ocfs2_slot *si_slots;
59};
60
61
62static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
63 unsigned int node_num);
64
65static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
66 int slot_num)
67{
68 BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
69 si->si_slots[slot_num].sl_valid = 0;
70}
71
72static void ocfs2_set_slot(struct ocfs2_slot_info *si,
73 int slot_num, unsigned int node_num)
74{
75 BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
76
77 si->si_slots[slot_num].sl_valid = 1;
78 si->si_slots[slot_num].sl_node_num = node_num;
79}
80
81/* This version is for the extended slot map */
82static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
83{
84 int b, i, slotno;
85 struct ocfs2_slot_map_extended *se;
86
87 slotno = 0;
88 for (b = 0; b < si->si_blocks; b++) {
89 se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data;
90 for (i = 0;
91 (i < si->si_slots_per_block) &&
92 (slotno < si->si_num_slots);
93 i++, slotno++) {
94 if (se->se_slots[i].es_valid)
95 ocfs2_set_slot(si, slotno,
96 le32_to_cpu(se->se_slots[i].es_node_num));
97 else
98 ocfs2_invalidate_slot(si, slotno);
99 }
100 }
101}
102
103/*
104 * Post the slot information on disk into our slot_info struct.
105 * Must be protected by osb_lock.
106 */
107static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
53{ 108{
54 int i; 109 int i;
55 __le16 *disk_info; 110 struct ocfs2_slot_map *sm;
56 111
57 /* we don't read the slot block here as ocfs2_super_lock 112 sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
58 * should've made sure we have the most recent copy. */
59 spin_lock(&si->si_lock);
60 disk_info = (__le16 *) si->si_bh->b_data;
61 113
62 for (i = 0; i < si->si_size; i++) 114 for (i = 0; i < si->si_num_slots; i++) {
63 si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); 115 if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
116 ocfs2_invalidate_slot(si, i);
117 else
118 ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
119 }
120}
64 121
65 spin_unlock(&si->si_lock); 122static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
123{
124 /*
125 * The slot data will have been refreshed when ocfs2_super_lock
126 * was taken.
127 */
128 if (si->si_extended)
129 ocfs2_update_slot_info_extended(si);
130 else
131 ocfs2_update_slot_info_old(si);
132}
133
134int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
135{
136 int ret;
137 struct ocfs2_slot_info *si = osb->slot_info;
138
139 if (si == NULL)
140 return 0;
141
142 BUG_ON(si->si_blocks == 0);
143 BUG_ON(si->si_bh == NULL);
144
145 mlog(0, "Refreshing slot map, reading %u block(s)\n",
146 si->si_blocks);
147
148 /*
149 * We pass -1 as blocknr because we expect all of si->si_bh to
150 * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
151 * this is not true, the read of -1 (UINT64_MAX) will fail.
152 */
153 ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
154 si->si_inode);
155 if (ret == 0) {
156 spin_lock(&osb->osb_lock);
157 ocfs2_update_slot_info(si);
158 spin_unlock(&osb->osb_lock);
159 }
160
161 return ret;
66} 162}
67 163
68/* post the our slot info stuff into it's destination bh and write it 164/* post the our slot info stuff into it's destination bh and write it
69 * out. */ 165 * out. */
70int ocfs2_update_disk_slots(struct ocfs2_super *osb, 166static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si,
71 struct ocfs2_slot_info *si) 167 int slot_num,
168 struct buffer_head **bh)
72{ 169{
73 int status, i; 170 int blkind = slot_num / si->si_slots_per_block;
74 __le16 *disk_info = (__le16 *) si->si_bh->b_data; 171 int slotno = slot_num % si->si_slots_per_block;
172 struct ocfs2_slot_map_extended *se;
173
174 BUG_ON(blkind >= si->si_blocks);
175
176 se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data;
177 se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid;
178 if (si->si_slots[slot_num].sl_valid)
179 se->se_slots[slotno].es_node_num =
180 cpu_to_le32(si->si_slots[slot_num].sl_node_num);
181 *bh = si->si_bh[blkind];
182}
75 183
76 spin_lock(&si->si_lock); 184static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si,
77 for (i = 0; i < si->si_size; i++) 185 int slot_num,
78 disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); 186 struct buffer_head **bh)
79 spin_unlock(&si->si_lock); 187{
188 int i;
189 struct ocfs2_slot_map *sm;
190
191 sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
192 for (i = 0; i < si->si_num_slots; i++) {
193 if (si->si_slots[i].sl_valid)
194 sm->sm_slots[i] =
195 cpu_to_le16(si->si_slots[i].sl_node_num);
196 else
197 sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
198 }
199 *bh = si->si_bh[0];
200}
201
202static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
203 struct ocfs2_slot_info *si,
204 int slot_num)
205{
206 int status;
207 struct buffer_head *bh;
208
209 spin_lock(&osb->osb_lock);
210 if (si->si_extended)
211 ocfs2_update_disk_slot_extended(si, slot_num, &bh);
212 else
213 ocfs2_update_disk_slot_old(si, slot_num, &bh);
214 spin_unlock(&osb->osb_lock);
80 215
81 status = ocfs2_write_block(osb, si->si_bh, si->si_inode); 216 status = ocfs2_write_block(osb, bh, si->si_inode);
82 if (status < 0) 217 if (status < 0)
83 mlog_errno(status); 218 mlog_errno(status);
84 219
85 return status; 220 return status;
86} 221}
87 222
88/* try to find global node in the slot info. Returns 223/*
89 * OCFS2_INVALID_SLOT if nothing is found. */ 224 * Calculate how many bytes are needed by the slot map. Returns
90static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, 225 * an error if the slot map file is too small.
91 s16 global) 226 */
227static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
228 struct inode *inode,
229 unsigned long long *bytes)
92{ 230{
93 int i; 231 unsigned long long bytes_needed;
94 s16 ret = OCFS2_INVALID_SLOT; 232
233 if (ocfs2_uses_extended_slot_map(osb)) {
234 bytes_needed = osb->max_slots *
235 sizeof(struct ocfs2_extended_slot);
236 } else {
237 bytes_needed = osb->max_slots * sizeof(__le16);
238 }
239 if (bytes_needed > i_size_read(inode)) {
240 mlog(ML_ERROR,
241 "Slot map file is too small! (size %llu, needed %llu)\n",
242 i_size_read(inode), bytes_needed);
243 return -ENOSPC;
244 }
245
246 *bytes = bytes_needed;
247 return 0;
248}
249
250/* try to find global node in the slot info. Returns -ENOENT
251 * if nothing is found. */
252static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
253 unsigned int node_num)
254{
255 int i, ret = -ENOENT;
95 256
96 for(i = 0; i < si->si_num_slots; i++) { 257 for(i = 0; i < si->si_num_slots; i++) {
97 if (global == si->si_global_node_nums[i]) { 258 if (si->si_slots[i].sl_valid &&
98 ret = (s16) i; 259 (node_num == si->si_slots[i].sl_node_num)) {
260 ret = i;
99 break; 261 break;
100 } 262 }
101 } 263 }
264
102 return ret; 265 return ret;
103} 266}
104 267
105static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred) 268static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
269 int preferred)
106{ 270{
107 int i; 271 int i, ret = -ENOSPC;
108 s16 ret = OCFS2_INVALID_SLOT;
109 272
110 if (preferred >= 0 && preferred < si->si_num_slots) { 273 if ((preferred >= 0) && (preferred < si->si_num_slots)) {
111 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) { 274 if (!si->si_slots[preferred].sl_valid) {
112 ret = preferred; 275 ret = preferred;
113 goto out; 276 goto out;
114 } 277 }
115 } 278 }
116 279
117 for(i = 0; i < si->si_num_slots; i++) { 280 for(i = 0; i < si->si_num_slots; i++) {
118 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { 281 if (!si->si_slots[i].sl_valid) {
119 ret = (s16) i; 282 ret = i;
120 break; 283 break;
121 } 284 }
122 } 285 }
@@ -124,58 +287,155 @@ out:
124 return ret; 287 return ret;
125} 288}
126 289
127s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, 290int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
128 s16 global)
129{ 291{
130 s16 ret; 292 int slot;
293 struct ocfs2_slot_info *si = osb->slot_info;
131 294
132 spin_lock(&si->si_lock); 295 spin_lock(&osb->osb_lock);
133 ret = __ocfs2_node_num_to_slot(si, global); 296 slot = __ocfs2_node_num_to_slot(si, node_num);
134 spin_unlock(&si->si_lock); 297 spin_unlock(&osb->osb_lock);
135 return ret; 298
299 return slot;
300}
301
302int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
303 unsigned int *node_num)
304{
305 struct ocfs2_slot_info *si = osb->slot_info;
306
307 assert_spin_locked(&osb->osb_lock);
308
309 BUG_ON(slot_num < 0);
310 BUG_ON(slot_num > osb->max_slots);
311
312 if (!si->si_slots[slot_num].sl_valid)
313 return -ENOENT;
314
315 *node_num = si->si_slots[slot_num].sl_node_num;
316 return 0;
136} 317}
137 318
138static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, 319static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
139 s16 slot_num,
140 s16 node_num)
141{ 320{
142 BUG_ON(slot_num == OCFS2_INVALID_SLOT); 321 unsigned int i;
143 BUG_ON(slot_num >= si->si_num_slots); 322
144 BUG_ON((node_num != O2NM_INVALID_NODE_NUM) && 323 if (si == NULL)
145 (node_num >= O2NM_MAX_NODES)); 324 return;
325
326 if (si->si_inode)
327 iput(si->si_inode);
328 if (si->si_bh) {
329 for (i = 0; i < si->si_blocks; i++) {
330 if (si->si_bh[i]) {
331 brelse(si->si_bh[i]);
332 si->si_bh[i] = NULL;
333 }
334 }
335 kfree(si->si_bh);
336 }
146 337
147 si->si_global_node_nums[slot_num] = node_num; 338 kfree(si);
148} 339}
149 340
150void ocfs2_clear_slot(struct ocfs2_slot_info *si, 341int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
151 s16 slot_num)
152{ 342{
153 spin_lock(&si->si_lock); 343 struct ocfs2_slot_info *si = osb->slot_info;
154 __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); 344
155 spin_unlock(&si->si_lock); 345 if (si == NULL)
346 return 0;
347
348 spin_lock(&osb->osb_lock);
349 ocfs2_invalidate_slot(si, slot_num);
350 spin_unlock(&osb->osb_lock);
351
352 return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
156} 353}
157 354
158int ocfs2_init_slot_info(struct ocfs2_super *osb) 355static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
356 struct ocfs2_slot_info *si)
159{ 357{
160 int status, i; 358 int status = 0;
161 u64 blkno; 359 u64 blkno;
360 unsigned long long blocks, bytes;
361 unsigned int i;
362 struct buffer_head *bh;
363
364 status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
365 if (status)
366 goto bail;
367
368 blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
369 BUG_ON(blocks > UINT_MAX);
370 si->si_blocks = blocks;
371 if (!si->si_blocks)
372 goto bail;
373
374 if (si->si_extended)
375 si->si_slots_per_block =
376 (osb->sb->s_blocksize /
377 sizeof(struct ocfs2_extended_slot));
378 else
379 si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);
380
381 /* The size checks above should ensure this */
382 BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
383
384 mlog(0, "Slot map needs %u buffers for %llu bytes\n",
385 si->si_blocks, bytes);
386
387 si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
388 GFP_KERNEL);
389 if (!si->si_bh) {
390 status = -ENOMEM;
391 mlog_errno(status);
392 goto bail;
393 }
394
395 for (i = 0; i < si->si_blocks; i++) {
396 status = ocfs2_extent_map_get_blocks(si->si_inode, i,
397 &blkno, NULL, NULL);
398 if (status < 0) {
399 mlog_errno(status);
400 goto bail;
401 }
402
403 mlog(0, "Reading slot map block %u at %llu\n", i,
404 (unsigned long long)blkno);
405
406 bh = NULL; /* Acquire a fresh bh */
407 status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
408 if (status < 0) {
409 mlog_errno(status);
410 goto bail;
411 }
412
413 si->si_bh[i] = bh;
414 }
415
416bail:
417 return status;
418}
419
420int ocfs2_init_slot_info(struct ocfs2_super *osb)
421{
422 int status;
162 struct inode *inode = NULL; 423 struct inode *inode = NULL;
163 struct buffer_head *bh = NULL;
164 struct ocfs2_slot_info *si; 424 struct ocfs2_slot_info *si;
165 425
166 si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL); 426 si = kzalloc(sizeof(struct ocfs2_slot_info) +
427 (sizeof(struct ocfs2_slot) * osb->max_slots),
428 GFP_KERNEL);
167 if (!si) { 429 if (!si) {
168 status = -ENOMEM; 430 status = -ENOMEM;
169 mlog_errno(status); 431 mlog_errno(status);
170 goto bail; 432 goto bail;
171 } 433 }
172 434
173 spin_lock_init(&si->si_lock); 435 si->si_extended = ocfs2_uses_extended_slot_map(osb);
174 si->si_num_slots = osb->max_slots; 436 si->si_num_slots = osb->max_slots;
175 si->si_size = OCFS2_MAX_SLOTS; 437 si->si_slots = (struct ocfs2_slot *)((char *)si +
176 438 sizeof(struct ocfs2_slot_info));
177 for(i = 0; i < si->si_num_slots; i++)
178 si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
179 439
180 inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, 440 inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
181 OCFS2_INVALID_SLOT); 441 OCFS2_INVALID_SLOT);
@@ -185,61 +445,53 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
185 goto bail; 445 goto bail;
186 } 446 }
187 447
188 status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL); 448 si->si_inode = inode;
189 if (status < 0) { 449 status = ocfs2_map_slot_buffers(osb, si);
190 mlog_errno(status);
191 goto bail;
192 }
193
194 status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
195 if (status < 0) { 450 if (status < 0) {
196 mlog_errno(status); 451 mlog_errno(status);
197 goto bail; 452 goto bail;
198 } 453 }
199 454
200 si->si_inode = inode; 455 osb->slot_info = (struct ocfs2_slot_info *)si;
201 si->si_bh = bh;
202 osb->slot_info = si;
203bail: 456bail:
204 if (status < 0 && si) 457 if (status < 0 && si)
205 ocfs2_free_slot_info(si); 458 __ocfs2_free_slot_info(si);
206 459
207 return status; 460 return status;
208} 461}
209 462
210void ocfs2_free_slot_info(struct ocfs2_slot_info *si) 463void ocfs2_free_slot_info(struct ocfs2_super *osb)
211{ 464{
212 if (si->si_inode) 465 struct ocfs2_slot_info *si = osb->slot_info;
213 iput(si->si_inode); 466
214 if (si->si_bh) 467 osb->slot_info = NULL;
215 brelse(si->si_bh); 468 __ocfs2_free_slot_info(si);
216 kfree(si);
217} 469}
218 470
219int ocfs2_find_slot(struct ocfs2_super *osb) 471int ocfs2_find_slot(struct ocfs2_super *osb)
220{ 472{
221 int status; 473 int status;
222 s16 slot; 474 int slot;
223 struct ocfs2_slot_info *si; 475 struct ocfs2_slot_info *si;
224 476
225 mlog_entry_void(); 477 mlog_entry_void();
226 478
227 si = osb->slot_info; 479 si = osb->slot_info;
228 480
481 spin_lock(&osb->osb_lock);
229 ocfs2_update_slot_info(si); 482 ocfs2_update_slot_info(si);
230 483
231 spin_lock(&si->si_lock);
232 /* search for ourselves first and take the slot if it already 484 /* search for ourselves first and take the slot if it already
233 * exists. Perhaps we need to mark this in a variable for our 485 * exists. Perhaps we need to mark this in a variable for our
234 * own journal recovery? Possibly not, though we certainly 486 * own journal recovery? Possibly not, though we certainly
235 * need to warn to the user */ 487 * need to warn to the user */
236 slot = __ocfs2_node_num_to_slot(si, osb->node_num); 488 slot = __ocfs2_node_num_to_slot(si, osb->node_num);
237 if (slot == OCFS2_INVALID_SLOT) { 489 if (slot < 0) {
238 /* if no slot yet, then just take 1st available 490 /* if no slot yet, then just take 1st available
239 * one. */ 491 * one. */
240 slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); 492 slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
241 if (slot == OCFS2_INVALID_SLOT) { 493 if (slot < 0) {
242 spin_unlock(&si->si_lock); 494 spin_unlock(&osb->osb_lock);
243 mlog(ML_ERROR, "no free slots available!\n"); 495 mlog(ML_ERROR, "no free slots available!\n");
244 status = -EINVAL; 496 status = -EINVAL;
245 goto bail; 497 goto bail;
@@ -248,13 +500,13 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
248 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", 500 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
249 slot); 501 slot);
250 502
251 __ocfs2_fill_slot(si, slot, osb->node_num); 503 ocfs2_set_slot(si, slot, osb->node_num);
252 osb->slot_num = slot; 504 osb->slot_num = slot;
253 spin_unlock(&si->si_lock); 505 spin_unlock(&osb->osb_lock);
254 506
255 mlog(0, "taking node slot %d\n", osb->slot_num); 507 mlog(0, "taking node slot %d\n", osb->slot_num);
256 508
257 status = ocfs2_update_disk_slots(osb, si); 509 status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
258 if (status < 0) 510 if (status < 0)
259 mlog_errno(status); 511 mlog_errno(status);
260 512
@@ -265,27 +517,27 @@ bail:
265 517
266void ocfs2_put_slot(struct ocfs2_super *osb) 518void ocfs2_put_slot(struct ocfs2_super *osb)
267{ 519{
268 int status; 520 int status, slot_num;
269 struct ocfs2_slot_info *si = osb->slot_info; 521 struct ocfs2_slot_info *si = osb->slot_info;
270 522
271 if (!si) 523 if (!si)
272 return; 524 return;
273 525
526 spin_lock(&osb->osb_lock);
274 ocfs2_update_slot_info(si); 527 ocfs2_update_slot_info(si);
275 528
276 spin_lock(&si->si_lock); 529 slot_num = osb->slot_num;
277 __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); 530 ocfs2_invalidate_slot(si, osb->slot_num);
278 osb->slot_num = OCFS2_INVALID_SLOT; 531 osb->slot_num = OCFS2_INVALID_SLOT;
279 spin_unlock(&si->si_lock); 532 spin_unlock(&osb->osb_lock);
280 533
281 status = ocfs2_update_disk_slots(osb, si); 534 status = ocfs2_update_disk_slot(osb, si, slot_num);
282 if (status < 0) { 535 if (status < 0) {
283 mlog_errno(status); 536 mlog_errno(status);
284 goto bail; 537 goto bail;
285 } 538 }
286 539
287bail: 540bail:
288 osb->slot_info = NULL; 541 ocfs2_free_slot_info(osb);
289 ocfs2_free_slot_info(si);
290} 542}
291 543
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 1025872aaade..601c95fd7003 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -27,38 +27,18 @@
27#ifndef SLOTMAP_H 27#ifndef SLOTMAP_H
28#define SLOTMAP_H 28#define SLOTMAP_H
29 29
30struct ocfs2_slot_info {
31 spinlock_t si_lock;
32
33 struct inode *si_inode;
34 struct buffer_head *si_bh;
35 unsigned int si_num_slots;
36 unsigned int si_size;
37 s16 si_global_node_nums[OCFS2_MAX_SLOTS];
38};
39
40int ocfs2_init_slot_info(struct ocfs2_super *osb); 30int ocfs2_init_slot_info(struct ocfs2_super *osb);
41void ocfs2_free_slot_info(struct ocfs2_slot_info *si); 31void ocfs2_free_slot_info(struct ocfs2_super *osb);
42 32
43int ocfs2_find_slot(struct ocfs2_super *osb); 33int ocfs2_find_slot(struct ocfs2_super *osb);
44void ocfs2_put_slot(struct ocfs2_super *osb); 34void ocfs2_put_slot(struct ocfs2_super *osb);
45 35
46void ocfs2_update_slot_info(struct ocfs2_slot_info *si); 36int ocfs2_refresh_slot_info(struct ocfs2_super *osb);
47int ocfs2_update_disk_slots(struct ocfs2_super *osb,
48 struct ocfs2_slot_info *si);
49
50s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
51 s16 global);
52void ocfs2_clear_slot(struct ocfs2_slot_info *si,
53 s16 slot_num);
54 37
55static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, 38int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
56 int slot_num) 39int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
57{ 40 unsigned int *node_num);
58 BUG_ON(slot_num == OCFS2_INVALID_SLOT);
59 assert_spin_locked(&si->si_lock);
60 41
61 return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT; 42int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num);
62}
63 43
64#endif 44#endif
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
new file mode 100644
index 000000000000..ac1d74c63bf5
--- /dev/null
+++ b/fs/ocfs2/stack_o2cb.c
@@ -0,0 +1,420 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * stack_o2cb.c
5 *
6 * Code which interfaces ocfs2 with the o2cb stack.
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#include <linux/crc32.h>
21#include <linux/module.h>
22
23/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
24#include <linux/fs.h>
25
26#include "cluster/masklog.h"
27#include "cluster/nodemanager.h"
28#include "cluster/heartbeat.h"
29
30#include "stackglue.h"
31
32struct o2dlm_private {
33 struct dlm_eviction_cb op_eviction_cb;
34};
35
36static struct ocfs2_stack_plugin o2cb_stack;
37
38/* These should be identical */
39#if (DLM_LOCK_IV != LKM_IVMODE)
40# error Lock modes do not match
41#endif
42#if (DLM_LOCK_NL != LKM_NLMODE)
43# error Lock modes do not match
44#endif
45#if (DLM_LOCK_CR != LKM_CRMODE)
46# error Lock modes do not match
47#endif
48#if (DLM_LOCK_CW != LKM_CWMODE)
49# error Lock modes do not match
50#endif
51#if (DLM_LOCK_PR != LKM_PRMODE)
52# error Lock modes do not match
53#endif
54#if (DLM_LOCK_PW != LKM_PWMODE)
55# error Lock modes do not match
56#endif
57#if (DLM_LOCK_EX != LKM_EXMODE)
58# error Lock modes do not match
59#endif
60static inline int mode_to_o2dlm(int mode)
61{
62 BUG_ON(mode > LKM_MAXMODE);
63
64 return mode;
65}
66
67#define map_flag(_generic, _o2dlm) \
68 if (flags & (_generic)) { \
69 flags &= ~(_generic); \
70 o2dlm_flags |= (_o2dlm); \
71 }
72static int flags_to_o2dlm(u32 flags)
73{
74 int o2dlm_flags = 0;
75
76 map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
77 map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
78 map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
79 map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
80 map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
81 map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
82 map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
83 map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
84 map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
85
86 /* map_flag() should have cleared every flag passed in */
87 BUG_ON(flags != 0);
88
89 return o2dlm_flags;
90}
91#undef map_flag
92
93/*
94 * Map an o2dlm status to standard errno values.
95 *
96 * o2dlm only uses a handful of these, and returns even fewer to the
97 * caller. Still, we try to assign sane values to each error.
98 *
99 * The following value pairs have special meanings to dlmglue, thus
100 * the right hand side needs to stay unique - never duplicate the
101 * mapping elsewhere in the table!
102 *
103 * DLM_NORMAL: 0
104 * DLM_NOTQUEUED: -EAGAIN
105 * DLM_CANCELGRANT: -EBUSY
106 * DLM_CANCEL: -DLM_ECANCEL
107 */
108/* Keep in sync with dlmapi.h */
109static int status_map[] = {
110 [DLM_NORMAL] = 0, /* Success */
111 [DLM_GRANTED] = -EINVAL,
112 [DLM_DENIED] = -EACCES,
113 [DLM_DENIED_NOLOCKS] = -EACCES,
114 [DLM_WORKING] = -EACCES,
115 [DLM_BLOCKED] = -EINVAL,
116 [DLM_BLOCKED_ORPHAN] = -EINVAL,
117 [DLM_DENIED_GRACE_PERIOD] = -EACCES,
118 [DLM_SYSERR] = -ENOMEM, /* It is what it is */
119 [DLM_NOSUPPORT] = -EPROTO,
120 [DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */
121 [DLM_IVLOCKID] = -EINVAL,
122 [DLM_SYNC] = -EINVAL,
123 [DLM_BADTYPE] = -EINVAL,
124 [DLM_BADRESOURCE] = -EINVAL,
125 [DLM_MAXHANDLES] = -ENOMEM,
126 [DLM_NOCLINFO] = -EINVAL,
127 [DLM_NOLOCKMGR] = -EINVAL,
128 [DLM_NOPURGED] = -EINVAL,
129 [DLM_BADARGS] = -EINVAL,
130 [DLM_VOID] = -EINVAL,
131 [DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */
132 [DLM_IVBUFLEN] = -EINVAL,
133 [DLM_CVTUNGRANT] = -EPERM,
134 [DLM_BADPARAM] = -EINVAL,
135 [DLM_VALNOTVALID] = -EINVAL,
136 [DLM_REJECTED] = -EPERM,
137 [DLM_ABORT] = -EINVAL,
138 [DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */
139 [DLM_IVRESHANDLE] = -EINVAL,
140 [DLM_DEADLOCK] = -EDEADLK,
141 [DLM_DENIED_NOASTS] = -EINVAL,
142 [DLM_FORWARD] = -EINVAL,
143 [DLM_TIMEOUT] = -ETIMEDOUT,
144 [DLM_IVGROUPID] = -EINVAL,
145 [DLM_VERS_CONFLICT] = -EOPNOTSUPP,
146 [DLM_BAD_DEVICE_PATH] = -ENOENT,
147 [DLM_NO_DEVICE_PERMISSION] = -EPERM,
148 [DLM_NO_CONTROL_DEVICE] = -ENOENT,
149 [DLM_RECOVERING] = -ENOTCONN,
150 [DLM_MIGRATING] = -ERESTART,
151 [DLM_MAXSTATS] = -EINVAL,
152};
153
154static int dlm_status_to_errno(enum dlm_status status)
155{
156 BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
157
158 return status_map[status];
159}
160
161static void o2dlm_lock_ast_wrapper(void *astarg)
162{
163 BUG_ON(o2cb_stack.sp_proto == NULL);
164
165 o2cb_stack.sp_proto->lp_lock_ast(astarg);
166}
167
168static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
169{
170 BUG_ON(o2cb_stack.sp_proto == NULL);
171
172 o2cb_stack.sp_proto->lp_blocking_ast(astarg, level);
173}
174
175static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
176{
177 int error = dlm_status_to_errno(status);
178
179 BUG_ON(o2cb_stack.sp_proto == NULL);
180
181 /*
182 * In o2dlm, you can get both the lock_ast() for the lock being
183 * granted and the unlock_ast() for the CANCEL failing. A
184 * successful cancel sends DLM_NORMAL here. If the
185 * lock grant happened before the cancel arrived, you get
186 * DLM_CANCELGRANT.
187 *
188 * There's no need for the double-ast. If we see DLM_CANCELGRANT,
189 * we just ignore it. We expect the lock_ast() to handle the
190 * granted lock.
191 */
192 if (status == DLM_CANCELGRANT)
193 return;
194
195 o2cb_stack.sp_proto->lp_unlock_ast(astarg, error);
196}
197
198static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
199 int mode,
200 union ocfs2_dlm_lksb *lksb,
201 u32 flags,
202 void *name,
203 unsigned int namelen,
204 void *astarg)
205{
206 enum dlm_status status;
207 int o2dlm_mode = mode_to_o2dlm(mode);
208 int o2dlm_flags = flags_to_o2dlm(flags);
209 int ret;
210
211 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
212 o2dlm_flags, name, namelen,
213 o2dlm_lock_ast_wrapper, astarg,
214 o2dlm_blocking_ast_wrapper);
215 ret = dlm_status_to_errno(status);
216 return ret;
217}
218
219static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
220 union ocfs2_dlm_lksb *lksb,
221 u32 flags,
222 void *astarg)
223{
224 enum dlm_status status;
225 int o2dlm_flags = flags_to_o2dlm(flags);
226 int ret;
227
228 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
229 o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
230 ret = dlm_status_to_errno(status);
231 return ret;
232}
233
234static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
235{
236 return dlm_status_to_errno(lksb->lksb_o2dlm.status);
237}
238
239static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
240{
241 return (void *)(lksb->lksb_o2dlm.lvb);
242}
243
244static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb)
245{
246 dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
247}
248
249/*
250 * Called from the dlm when it's about to evict a node. This is how the
251 * classic stack signals node death.
252 */
253static void o2dlm_eviction_cb(int node_num, void *data)
254{
255 struct ocfs2_cluster_connection *conn = data;
256
257 mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
258 node_num, conn->cc_namelen, conn->cc_name);
259
260 conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
261}
262
263static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
264{
265 int rc = 0;
266 u32 dlm_key;
267 struct dlm_ctxt *dlm;
268 struct o2dlm_private *priv;
269 struct dlm_protocol_version dlm_version;
270
271 BUG_ON(conn == NULL);
272 BUG_ON(o2cb_stack.sp_proto == NULL);
273
274 /* for now we only have one cluster/node, make sure we see it
275 * in the heartbeat universe */
276 if (!o2hb_check_local_node_heartbeating()) {
277 rc = -EINVAL;
278 goto out;
279 }
280
281 priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
282 if (!priv) {
283 rc = -ENOMEM;
284 goto out_free;
285 }
286
287 /* This just fills the structure in. It is safe to pass conn. */
288 dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
289 conn);
290
291 conn->cc_private = priv;
292
293 /* used by the dlm code to make message headers unique, each
294 * node in this domain must agree on this. */
295 dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
296 dlm_version.pv_major = conn->cc_version.pv_major;
297 dlm_version.pv_minor = conn->cc_version.pv_minor;
298
299 dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version);
300 if (IS_ERR(dlm)) {
301 rc = PTR_ERR(dlm);
302 mlog_errno(rc);
303 goto out_free;
304 }
305
306 conn->cc_version.pv_major = dlm_version.pv_major;
307 conn->cc_version.pv_minor = dlm_version.pv_minor;
308 conn->cc_lockspace = dlm;
309
310 dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
311
312out_free:
313 if (rc && conn->cc_private)
314 kfree(conn->cc_private);
315
316out:
317 return rc;
318}
319
320static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn,
321 int hangup_pending)
322{
323 struct dlm_ctxt *dlm = conn->cc_lockspace;
324 struct o2dlm_private *priv = conn->cc_private;
325
326 dlm_unregister_eviction_cb(&priv->op_eviction_cb);
327 conn->cc_private = NULL;
328 kfree(priv);
329
330 dlm_unregister_domain(dlm);
331 conn->cc_lockspace = NULL;
332
333 return 0;
334}
335
336static void o2hb_stop(const char *group)
337{
338 int ret;
339 char *argv[5], *envp[3];
340
341 argv[0] = (char *)o2nm_get_hb_ctl_path();
342 argv[1] = "-K";
343 argv[2] = "-u";
344 argv[3] = (char *)group;
345 argv[4] = NULL;
346
347 mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
348
349 /* minimal command environment taken from cpu_run_sbin_hotplug */
350 envp[0] = "HOME=/";
351 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
352 envp[2] = NULL;
353
354 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
355 if (ret < 0)
356 mlog_errno(ret);
357}
358
359/*
360 * Hangup is a hack for tools compatibility. Older ocfs2-tools software
361 * expects the filesystem to call "ocfs2_hb_ctl" during unmount. This
362 * happens regardless of whether the DLM got started, so we can't do it
363 * in ocfs2_cluster_disconnect(). We bring the o2hb_stop() function into
364 * the glue and provide a "hangup" API for super.c to call.
365 *
366 * Other stacks will eventually provide a NULL ->hangup() pointer.
367 */
368static void o2cb_cluster_hangup(const char *group, int grouplen)
369{
370 o2hb_stop(group);
371}
372
373static int o2cb_cluster_this_node(unsigned int *node)
374{
375 int node_num;
376
377 node_num = o2nm_this_node();
378 if (node_num == O2NM_INVALID_NODE_NUM)
379 return -ENOENT;
380
381 if (node_num >= O2NM_MAX_NODES)
382 return -EOVERFLOW;
383
384 *node = node_num;
385 return 0;
386}
387
388struct ocfs2_stack_operations o2cb_stack_ops = {
389 .connect = o2cb_cluster_connect,
390 .disconnect = o2cb_cluster_disconnect,
391 .hangup = o2cb_cluster_hangup,
392 .this_node = o2cb_cluster_this_node,
393 .dlm_lock = o2cb_dlm_lock,
394 .dlm_unlock = o2cb_dlm_unlock,
395 .lock_status = o2cb_dlm_lock_status,
396 .lock_lvb = o2cb_dlm_lvb,
397 .dump_lksb = o2cb_dump_lksb,
398};
399
400static struct ocfs2_stack_plugin o2cb_stack = {
401 .sp_name = "o2cb",
402 .sp_ops = &o2cb_stack_ops,
403 .sp_owner = THIS_MODULE,
404};
405
406static int __init o2cb_stack_init(void)
407{
408 return ocfs2_stack_glue_register(&o2cb_stack);
409}
410
411static void __exit o2cb_stack_exit(void)
412{
413 ocfs2_stack_glue_unregister(&o2cb_stack);
414}
415
416MODULE_AUTHOR("Oracle");
417MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack");
418MODULE_LICENSE("GPL");
419module_init(o2cb_stack_init);
420module_exit(o2cb_stack_exit);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
new file mode 100644
index 000000000000..7428663f9cbb
--- /dev/null
+++ b/fs/ocfs2/stack_user.c
@@ -0,0 +1,883 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * stack_user.c
5 *
6 * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#include <linux/module.h>
21#include <linux/fs.h>
22#include <linux/miscdevice.h>
23#include <linux/mutex.h>
24#include <linux/reboot.h>
25#include <asm/uaccess.h>
26
27#include "ocfs2.h" /* For struct ocfs2_lock_res */
28#include "stackglue.h"
29
30
31/*
32 * The control protocol starts with a handshake. Until the handshake
33 * is complete, the control device will fail all write(2)s.
34 *
35 * The handshake is simple. First, the client reads until EOF. Each line
36 * of output is a supported protocol tag. All protocol tags are a single
37 * character followed by a two hex digit version number. Currently the
38 * only things supported is T01, for "Text-base version 0x01". Next, the
39 * client writes the version they would like to use, including the newline.
40 * Thus, the protocol tag is 'T01\n'. If the version tag written is
41 * unknown, -EINVAL is returned. Once the negotiation is complete, the
42 * client can start sending messages.
43 *
44 * The T01 protocol has three messages. First is the "SETN" message.
45 * It has the following syntax:
46 *
47 * SETN<space><8-char-hex-nodenum><newline>
48 *
49 * This is 14 characters.
50 *
51 * The "SETN" message must be the first message following the protocol.
52 * It tells ocfs2_control the local node number.
53 *
54 * Next comes the "SETV" message. It has the following syntax:
55 *
56 * SETV<space><2-char-hex-major><space><2-char-hex-minor><newline>
57 *
58 * This is 11 characters.
59 *
60 * The "SETV" message sets the filesystem locking protocol version as
61 * negotiated by the client. The client negotiates based on the maximum
62 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
63 * number from the "SETV" message must match
64 * user_stack.sp_proto->lp_max_version.pv_major, and the minor number
65 * must be less than or equal to ...->lp_max_version.pv_minor.
66 *
67 * Once this information has been set, mounts will be allowed. From this
68 * point on, the "DOWN" message can be sent for node down notification.
69 * It has the following syntax:
70 *
71 * DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
72 *
73 * eg:
74 *
75 * DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n
76 *
77 * This is 47 characters.
78 */
79
80/*
81 * Whether or not the client has done the handshake.
82 * For now, we have just one protocol version.
83 */
84#define OCFS2_CONTROL_PROTO "T01\n"
85#define OCFS2_CONTROL_PROTO_LEN 4
86
87/* Handshake states */
88#define OCFS2_CONTROL_HANDSHAKE_INVALID (0)
89#define OCFS2_CONTROL_HANDSHAKE_READ (1)
90#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL (2)
91#define OCFS2_CONTROL_HANDSHAKE_VALID (3)
92
93/* Messages */
94#define OCFS2_CONTROL_MESSAGE_OP_LEN 4
95#define OCFS2_CONTROL_MESSAGE_SETNODE_OP "SETN"
96#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN 14
97#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP "SETV"
98#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN 11
99#define OCFS2_CONTROL_MESSAGE_DOWN_OP "DOWN"
100#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN 47
101#define OCFS2_TEXT_UUID_LEN 32
102#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
103#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
104
105/*
106 * ocfs2_live_connection is refcounted because the filesystem and
107 * miscdevice sides can detach in different order. Let's just be safe.
108 */
109struct ocfs2_live_connection {
110 struct list_head oc_list;
111 struct ocfs2_cluster_connection *oc_conn;
112};
113
114struct ocfs2_control_private {
115 struct list_head op_list;
116 int op_state;
117 int op_this_node;
118 struct ocfs2_protocol_version op_proto;
119};
120
121/* SETN<space><8-char-hex-nodenum><newline> */
122struct ocfs2_control_message_setn {
123 char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
124 char space;
125 char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
126 char newline;
127};
128
129/* SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> */
130struct ocfs2_control_message_setv {
131 char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
132 char space1;
133 char major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
134 char space2;
135 char minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
136 char newline;
137};
138
139/* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */
140struct ocfs2_control_message_down {
141 char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
142 char space1;
143 char uuid[OCFS2_TEXT_UUID_LEN];
144 char space2;
145 char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
146 char newline;
147};
148
149union ocfs2_control_message {
150 char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
151 struct ocfs2_control_message_setn u_setn;
152 struct ocfs2_control_message_setv u_setv;
153 struct ocfs2_control_message_down u_down;
154};
155
156static struct ocfs2_stack_plugin user_stack;
157
158static atomic_t ocfs2_control_opened;
159static int ocfs2_control_this_node = -1;
160static struct ocfs2_protocol_version running_proto;
161
162static LIST_HEAD(ocfs2_live_connection_list);
163static LIST_HEAD(ocfs2_control_private_list);
164static DEFINE_MUTEX(ocfs2_control_lock);
165
166static inline void ocfs2_control_set_handshake_state(struct file *file,
167 int state)
168{
169 struct ocfs2_control_private *p = file->private_data;
170 p->op_state = state;
171}
172
173static inline int ocfs2_control_get_handshake_state(struct file *file)
174{
175 struct ocfs2_control_private *p = file->private_data;
176 return p->op_state;
177}
178
179static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
180{
181 size_t len = strlen(name);
182 struct ocfs2_live_connection *c;
183
184 BUG_ON(!mutex_is_locked(&ocfs2_control_lock));
185
186 list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) {
187 if ((c->oc_conn->cc_namelen == len) &&
188 !strncmp(c->oc_conn->cc_name, name, len))
189 return c;
190 }
191
192 return c;
193}
194
195/*
196 * ocfs2_live_connection structures are created underneath the ocfs2
197 * mount path. Since the VFS prevents multiple calls to
198 * fill_super(), we can't get dupes here.
199 */
200static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
201 struct ocfs2_live_connection **c_ret)
202{
203 int rc = 0;
204 struct ocfs2_live_connection *c;
205
206 c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
207 if (!c)
208 return -ENOMEM;
209
210 mutex_lock(&ocfs2_control_lock);
211 c->oc_conn = conn;
212
213 if (atomic_read(&ocfs2_control_opened))
214 list_add(&c->oc_list, &ocfs2_live_connection_list);
215 else {
216 printk(KERN_ERR
217 "ocfs2: Userspace control daemon is not present\n");
218 rc = -ESRCH;
219 }
220
221 mutex_unlock(&ocfs2_control_lock);
222
223 if (!rc)
224 *c_ret = c;
225 else
226 kfree(c);
227
228 return rc;
229}
230
231/*
232 * This function disconnects the cluster connection from ocfs2_control.
233 * Afterwards, userspace can't affect the cluster connection.
234 */
235static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
236{
237 mutex_lock(&ocfs2_control_lock);
238 list_del_init(&c->oc_list);
239 c->oc_conn = NULL;
240 mutex_unlock(&ocfs2_control_lock);
241
242 kfree(c);
243}
244
245static int ocfs2_control_cfu(void *target, size_t target_len,
246 const char __user *buf, size_t count)
247{
248 /* The T01 expects write(2) calls to have exactly one command */
249 if ((count != target_len) ||
250 (count > sizeof(union ocfs2_control_message)))
251 return -EINVAL;
252
253 if (copy_from_user(target, buf, target_len))
254 return -EFAULT;
255
256 return 0;
257}
258
259static ssize_t ocfs2_control_validate_protocol(struct file *file,
260 const char __user *buf,
261 size_t count)
262{
263 ssize_t ret;
264 char kbuf[OCFS2_CONTROL_PROTO_LEN];
265
266 ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN,
267 buf, count);
268 if (ret)
269 return ret;
270
271 if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN))
272 return -EINVAL;
273
274 ocfs2_control_set_handshake_state(file,
275 OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
276
277 return count;
278}
279
280static void ocfs2_control_send_down(const char *uuid,
281 int nodenum)
282{
283 struct ocfs2_live_connection *c;
284
285 mutex_lock(&ocfs2_control_lock);
286
287 c = ocfs2_connection_find(uuid);
288 if (c) {
289 BUG_ON(c->oc_conn == NULL);
290 c->oc_conn->cc_recovery_handler(nodenum,
291 c->oc_conn->cc_recovery_data);
292 }
293
294 mutex_unlock(&ocfs2_control_lock);
295}
296
297/*
298 * Called whenever configuration elements are sent to /dev/ocfs2_control.
299 * If all configuration elements are present, try to set the global
300 * values. If there is a problem, return an error. Skip any missing
301 * elements, and only bump ocfs2_control_opened when we have all elements
302 * and are successful.
303 */
304static int ocfs2_control_install_private(struct file *file)
305{
306 int rc = 0;
307 int set_p = 1;
308 struct ocfs2_control_private *p = file->private_data;
309
310 BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
311
312 mutex_lock(&ocfs2_control_lock);
313
314 if (p->op_this_node < 0) {
315 set_p = 0;
316 } else if ((ocfs2_control_this_node >= 0) &&
317 (ocfs2_control_this_node != p->op_this_node)) {
318 rc = -EINVAL;
319 goto out_unlock;
320 }
321
322 if (!p->op_proto.pv_major) {
323 set_p = 0;
324 } else if (!list_empty(&ocfs2_live_connection_list) &&
325 ((running_proto.pv_major != p->op_proto.pv_major) ||
326 (running_proto.pv_minor != p->op_proto.pv_minor))) {
327 rc = -EINVAL;
328 goto out_unlock;
329 }
330
331 if (set_p) {
332 ocfs2_control_this_node = p->op_this_node;
333 running_proto.pv_major = p->op_proto.pv_major;
334 running_proto.pv_minor = p->op_proto.pv_minor;
335 }
336
337out_unlock:
338 mutex_unlock(&ocfs2_control_lock);
339
340 if (!rc && set_p) {
341 /* We set the global values successfully */
342 atomic_inc(&ocfs2_control_opened);
343 ocfs2_control_set_handshake_state(file,
344 OCFS2_CONTROL_HANDSHAKE_VALID);
345 }
346
347 return rc;
348}
349
350static int ocfs2_control_get_this_node(void)
351{
352 int rc;
353
354 mutex_lock(&ocfs2_control_lock);
355 if (ocfs2_control_this_node < 0)
356 rc = -EINVAL;
357 else
358 rc = ocfs2_control_this_node;
359 mutex_unlock(&ocfs2_control_lock);
360
361 return rc;
362}
363
364static int ocfs2_control_do_setnode_msg(struct file *file,
365 struct ocfs2_control_message_setn *msg)
366{
367 long nodenum;
368 char *ptr = NULL;
369 struct ocfs2_control_private *p = file->private_data;
370
371 if (ocfs2_control_get_handshake_state(file) !=
372 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
373 return -EINVAL;
374
375 if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
376 OCFS2_CONTROL_MESSAGE_OP_LEN))
377 return -EINVAL;
378
379 if ((msg->space != ' ') || (msg->newline != '\n'))
380 return -EINVAL;
381 msg->space = msg->newline = '\0';
382
383 nodenum = simple_strtol(msg->nodestr, &ptr, 16);
384 if (!ptr || *ptr)
385 return -EINVAL;
386
387 if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
388 (nodenum > INT_MAX) || (nodenum < 0))
389 return -ERANGE;
390 p->op_this_node = nodenum;
391
392 return ocfs2_control_install_private(file);
393}
394
395static int ocfs2_control_do_setversion_msg(struct file *file,
396 struct ocfs2_control_message_setv *msg)
397 {
398 long major, minor;
399 char *ptr = NULL;
400 struct ocfs2_control_private *p = file->private_data;
401 struct ocfs2_protocol_version *max =
402 &user_stack.sp_proto->lp_max_version;
403
404 if (ocfs2_control_get_handshake_state(file) !=
405 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
406 return -EINVAL;
407
408 if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
409 OCFS2_CONTROL_MESSAGE_OP_LEN))
410 return -EINVAL;
411
412 if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
413 (msg->newline != '\n'))
414 return -EINVAL;
415 msg->space1 = msg->space2 = msg->newline = '\0';
416
417 major = simple_strtol(msg->major, &ptr, 16);
418 if (!ptr || *ptr)
419 return -EINVAL;
420 minor = simple_strtol(msg->minor, &ptr, 16);
421 if (!ptr || *ptr)
422 return -EINVAL;
423
424 /*
425 * The major must be between 1 and 255, inclusive. The minor
426 * must be between 0 and 255, inclusive. The version passed in
427 * must be within the maximum version supported by the filesystem.
428 */
429 if ((major == LONG_MIN) || (major == LONG_MAX) ||
430 (major > (u8)-1) || (major < 1))
431 return -ERANGE;
432 if ((minor == LONG_MIN) || (minor == LONG_MAX) ||
433 (minor > (u8)-1) || (minor < 0))
434 return -ERANGE;
435 if ((major != max->pv_major) ||
436 (minor > max->pv_minor))
437 return -EINVAL;
438
439 p->op_proto.pv_major = major;
440 p->op_proto.pv_minor = minor;
441
442 return ocfs2_control_install_private(file);
443}
444
445static int ocfs2_control_do_down_msg(struct file *file,
446 struct ocfs2_control_message_down *msg)
447{
448 long nodenum;
449 char *p = NULL;
450
451 if (ocfs2_control_get_handshake_state(file) !=
452 OCFS2_CONTROL_HANDSHAKE_VALID)
453 return -EINVAL;
454
455 if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
456 OCFS2_CONTROL_MESSAGE_OP_LEN))
457 return -EINVAL;
458
459 if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
460 (msg->newline != '\n'))
461 return -EINVAL;
462 msg->space1 = msg->space2 = msg->newline = '\0';
463
464 nodenum = simple_strtol(msg->nodestr, &p, 16);
465 if (!p || *p)
466 return -EINVAL;
467
468 if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
469 (nodenum > INT_MAX) || (nodenum < 0))
470 return -ERANGE;
471
472 ocfs2_control_send_down(msg->uuid, nodenum);
473
474 return 0;
475}
476
477static ssize_t ocfs2_control_message(struct file *file,
478 const char __user *buf,
479 size_t count)
480{
481 ssize_t ret;
482 union ocfs2_control_message msg;
483
484 /* Try to catch padding issues */
485 WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) !=
486 (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1)));
487
488 memset(&msg, 0, sizeof(union ocfs2_control_message));
489 ret = ocfs2_control_cfu(&msg, count, buf, count);
490 if (ret)
491 goto out;
492
493 if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) &&
494 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
495 OCFS2_CONTROL_MESSAGE_OP_LEN))
496 ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn);
497 else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) &&
498 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
499 OCFS2_CONTROL_MESSAGE_OP_LEN))
500 ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv);
501 else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) &&
502 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
503 OCFS2_CONTROL_MESSAGE_OP_LEN))
504 ret = ocfs2_control_do_down_msg(file, &msg.u_down);
505 else
506 ret = -EINVAL;
507
508out:
509 return ret ? ret : count;
510}
511
512static ssize_t ocfs2_control_write(struct file *file,
513 const char __user *buf,
514 size_t count,
515 loff_t *ppos)
516{
517 ssize_t ret;
518
519 switch (ocfs2_control_get_handshake_state(file)) {
520 case OCFS2_CONTROL_HANDSHAKE_INVALID:
521 ret = -EINVAL;
522 break;
523
524 case OCFS2_CONTROL_HANDSHAKE_READ:
525 ret = ocfs2_control_validate_protocol(file, buf,
526 count);
527 break;
528
529 case OCFS2_CONTROL_HANDSHAKE_PROTOCOL:
530 case OCFS2_CONTROL_HANDSHAKE_VALID:
531 ret = ocfs2_control_message(file, buf, count);
532 break;
533
534 default:
535 BUG();
536 ret = -EIO;
537 break;
538 }
539
540 return ret;
541}
542
543/*
544 * This is a naive version. If we ever have a new protocol, we'll expand
545 * it. Probably using seq_file.
546 */
547static ssize_t ocfs2_control_read(struct file *file,
548 char __user *buf,
549 size_t count,
550 loff_t *ppos)
551{
552 char *proto_string = OCFS2_CONTROL_PROTO;
553 size_t to_write = 0;
554
555 if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
556 return 0;
557
558 to_write = OCFS2_CONTROL_PROTO_LEN - *ppos;
559 if (to_write > count)
560 to_write = count;
561 if (copy_to_user(buf, proto_string + *ppos, to_write))
562 return -EFAULT;
563
564 *ppos += to_write;
565
566 /* Have we read the whole protocol list? */
567 if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
568 ocfs2_control_set_handshake_state(file,
569 OCFS2_CONTROL_HANDSHAKE_READ);
570
571 return to_write;
572}
573
574static int ocfs2_control_release(struct inode *inode, struct file *file)
575{
576 struct ocfs2_control_private *p = file->private_data;
577
578 mutex_lock(&ocfs2_control_lock);
579
580 if (ocfs2_control_get_handshake_state(file) !=
581 OCFS2_CONTROL_HANDSHAKE_VALID)
582 goto out;
583
584 if (atomic_dec_and_test(&ocfs2_control_opened)) {
585 if (!list_empty(&ocfs2_live_connection_list)) {
586 /* XXX: Do bad things! */
587 printk(KERN_ERR
588 "ocfs2: Unexpected release of ocfs2_control!\n"
589 " Loss of cluster connection requires "
590 "an emergency restart!\n");
591 emergency_restart();
592 }
593 /*
594 * Last valid close clears the node number and resets
595 * the locking protocol version
596 */
597 ocfs2_control_this_node = -1;
598 running_proto.pv_major = 0;
599 running_proto.pv_major = 0;
600 }
601
602out:
603 list_del_init(&p->op_list);
604 file->private_data = NULL;
605
606 mutex_unlock(&ocfs2_control_lock);
607
608 kfree(p);
609
610 return 0;
611}
612
613static int ocfs2_control_open(struct inode *inode, struct file *file)
614{
615 struct ocfs2_control_private *p;
616
617 p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL);
618 if (!p)
619 return -ENOMEM;
620 p->op_this_node = -1;
621
622 mutex_lock(&ocfs2_control_lock);
623 file->private_data = p;
624 list_add(&p->op_list, &ocfs2_control_private_list);
625 mutex_unlock(&ocfs2_control_lock);
626
627 return 0;
628}
629
630static const struct file_operations ocfs2_control_fops = {
631 .open = ocfs2_control_open,
632 .release = ocfs2_control_release,
633 .read = ocfs2_control_read,
634 .write = ocfs2_control_write,
635 .owner = THIS_MODULE,
636};
637
638struct miscdevice ocfs2_control_device = {
639 .minor = MISC_DYNAMIC_MINOR,
640 .name = "ocfs2_control",
641 .fops = &ocfs2_control_fops,
642};
643
644static int ocfs2_control_init(void)
645{
646 int rc;
647
648 atomic_set(&ocfs2_control_opened, 0);
649
650 rc = misc_register(&ocfs2_control_device);
651 if (rc)
652 printk(KERN_ERR
653 "ocfs2: Unable to register ocfs2_control device "
654 "(errno %d)\n",
655 -rc);
656
657 return rc;
658}
659
660static void ocfs2_control_exit(void)
661{
662 int rc;
663
664 rc = misc_deregister(&ocfs2_control_device);
665 if (rc)
666 printk(KERN_ERR
667 "ocfs2: Unable to deregister ocfs2_control device "
668 "(errno %d)\n",
669 -rc);
670}
671
672static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
673{
674 struct ocfs2_lock_res *res = astarg;
675 return &res->l_lksb.lksb_fsdlm;
676}
677
678static void fsdlm_lock_ast_wrapper(void *astarg)
679{
680 struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg);
681 int status = lksb->sb_status;
682
683 BUG_ON(user_stack.sp_proto == NULL);
684
685 /*
686 * For now we're punting on the issue of other non-standard errors
687 * where we can't tell if the unlock_ast or lock_ast should be called.
688 * The main "other error" that's possible is EINVAL which means the
689 * function was called with invalid args, which shouldn't be possible
690 * since the caller here is under our control. Other non-standard
691 * errors probably fall into the same category, or otherwise are fatal
692 * which means we can't carry on anyway.
693 */
694
695 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
696 user_stack.sp_proto->lp_unlock_ast(astarg, 0);
697 else
698 user_stack.sp_proto->lp_lock_ast(astarg);
699}
700
701static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
702{
703 BUG_ON(user_stack.sp_proto == NULL);
704
705 user_stack.sp_proto->lp_blocking_ast(astarg, level);
706}
707
708static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
709 int mode,
710 union ocfs2_dlm_lksb *lksb,
711 u32 flags,
712 void *name,
713 unsigned int namelen,
714 void *astarg)
715{
716 int ret;
717
718 if (!lksb->lksb_fsdlm.sb_lvbptr)
719 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
720 sizeof(struct dlm_lksb);
721
722 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
723 flags|DLM_LKF_NODLCKWT, name, namelen, 0,
724 fsdlm_lock_ast_wrapper, astarg,
725 fsdlm_blocking_ast_wrapper);
726 return ret;
727}
728
729static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
730 union ocfs2_dlm_lksb *lksb,
731 u32 flags,
732 void *astarg)
733{
734 int ret;
735
736 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
737 flags, &lksb->lksb_fsdlm, astarg);
738 return ret;
739}
740
741static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
742{
743 return lksb->lksb_fsdlm.sb_status;
744}
745
746static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
747{
748 return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
749}
750
751static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
752{
753}
754
755/*
756 * Compare a requested locking protocol version against the current one.
757 *
758 * If the major numbers are different, they are incompatible.
759 * If the current minor is greater than the request, they are incompatible.
760 * If the current minor is less than or equal to the request, they are
761 * compatible, and the requester should run at the current minor version.
762 */
763static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
764 struct ocfs2_protocol_version *request)
765{
766 if (existing->pv_major != request->pv_major)
767 return 1;
768
769 if (existing->pv_minor > request->pv_minor)
770 return 1;
771
772 if (existing->pv_minor < request->pv_minor)
773 request->pv_minor = existing->pv_minor;
774
775 return 0;
776}
777
778static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
779{
780 dlm_lockspace_t *fsdlm;
781 struct ocfs2_live_connection *control;
782 int rc = 0;
783
784 BUG_ON(conn == NULL);
785
786 rc = ocfs2_live_connection_new(conn, &control);
787 if (rc)
788 goto out;
789
790 /*
791 * running_proto must have been set before we allowed any mounts
792 * to proceed.
793 */
794 if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
795 printk(KERN_ERR
796 "Unable to mount with fs locking protocol version "
797 "%u.%u because the userspace control daemon has "
798 "negotiated %u.%u\n",
799 conn->cc_version.pv_major, conn->cc_version.pv_minor,
800 running_proto.pv_major, running_proto.pv_minor);
801 rc = -EPROTO;
802 ocfs2_live_connection_drop(control);
803 goto out;
804 }
805
806 rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
807 &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN);
808 if (rc) {
809 ocfs2_live_connection_drop(control);
810 goto out;
811 }
812
813 conn->cc_private = control;
814 conn->cc_lockspace = fsdlm;
815out:
816 return rc;
817}
818
819static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn,
820 int hangup_pending)
821{
822 dlm_release_lockspace(conn->cc_lockspace, 2);
823 conn->cc_lockspace = NULL;
824 ocfs2_live_connection_drop(conn->cc_private);
825 conn->cc_private = NULL;
826 return 0;
827}
828
829static int user_cluster_this_node(unsigned int *this_node)
830{
831 int rc;
832
833 rc = ocfs2_control_get_this_node();
834 if (rc < 0)
835 return rc;
836
837 *this_node = rc;
838 return 0;
839}
840
841static struct ocfs2_stack_operations user_stack_ops = {
842 .connect = user_cluster_connect,
843 .disconnect = user_cluster_disconnect,
844 .this_node = user_cluster_this_node,
845 .dlm_lock = user_dlm_lock,
846 .dlm_unlock = user_dlm_unlock,
847 .lock_status = user_dlm_lock_status,
848 .lock_lvb = user_dlm_lvb,
849 .dump_lksb = user_dlm_dump_lksb,
850};
851
852static struct ocfs2_stack_plugin user_stack = {
853 .sp_name = "user",
854 .sp_ops = &user_stack_ops,
855 .sp_owner = THIS_MODULE,
856};
857
858
859static int __init user_stack_init(void)
860{
861 int rc;
862
863 rc = ocfs2_control_init();
864 if (!rc) {
865 rc = ocfs2_stack_glue_register(&user_stack);
866 if (rc)
867 ocfs2_control_exit();
868 }
869
870 return rc;
871}
872
873static void __exit user_stack_exit(void)
874{
875 ocfs2_stack_glue_unregister(&user_stack);
876 ocfs2_control_exit();
877}
878
879MODULE_AUTHOR("Oracle");
880MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
881MODULE_LICENSE("GPL");
882module_init(user_stack_init);
883module_exit(user_stack_exit);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
new file mode 100644
index 000000000000..119f60cea9cc
--- /dev/null
+++ b/fs/ocfs2/stackglue.c
@@ -0,0 +1,568 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * stackglue.c
5 *
6 * Code which implements an OCFS2 specific interface to underlying
7 * cluster stacks.
8 *
9 * Copyright (C) 2007 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation, version 2.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 */
20
21#include <linux/list.h>
22#include <linux/spinlock.h>
23#include <linux/module.h>
24#include <linux/slab.h>
25#include <linux/kmod.h>
26#include <linux/fs.h>
27#include <linux/kobject.h>
28#include <linux/sysfs.h>
29
30#include "ocfs2_fs.h"
31
32#include "stackglue.h"
33
34#define OCFS2_STACK_PLUGIN_O2CB "o2cb"
35#define OCFS2_STACK_PLUGIN_USER "user"
36
37static struct ocfs2_locking_protocol *lproto;
38static DEFINE_SPINLOCK(ocfs2_stack_lock);
39static LIST_HEAD(ocfs2_stack_list);
40static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
41
42/*
43 * The stack currently in use. If not null, active_stack->sp_count > 0,
44 * the module is pinned, and the locking protocol cannot be changed.
45 */
46static struct ocfs2_stack_plugin *active_stack;
47
48static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
49{
50 struct ocfs2_stack_plugin *p;
51
52 assert_spin_locked(&ocfs2_stack_lock);
53
54 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
55 if (!strcmp(p->sp_name, name))
56 return p;
57 }
58
59 return NULL;
60}
61
62static int ocfs2_stack_driver_request(const char *stack_name,
63 const char *plugin_name)
64{
65 int rc;
66 struct ocfs2_stack_plugin *p;
67
68 spin_lock(&ocfs2_stack_lock);
69
70 /*
71 * If the stack passed by the filesystem isn't the selected one,
72 * we can't continue.
73 */
74 if (strcmp(stack_name, cluster_stack_name)) {
75 rc = -EBUSY;
76 goto out;
77 }
78
79 if (active_stack) {
80 /*
81 * If the active stack isn't the one we want, it cannot
82 * be selected right now.
83 */
84 if (!strcmp(active_stack->sp_name, plugin_name))
85 rc = 0;
86 else
87 rc = -EBUSY;
88 goto out;
89 }
90
91 p = ocfs2_stack_lookup(plugin_name);
92 if (!p || !try_module_get(p->sp_owner)) {
93 rc = -ENOENT;
94 goto out;
95 }
96
97 /* Ok, the stack is pinned */
98 p->sp_count++;
99 active_stack = p;
100
101 rc = 0;
102
103out:
104 spin_unlock(&ocfs2_stack_lock);
105 return rc;
106}
107
108/*
109 * This function looks up the appropriate stack and makes it active. If
110 * there is no stack, it tries to load it. It will fail if the stack still
111 * cannot be found. It will also fail if a different stack is in use.
112 */
113static int ocfs2_stack_driver_get(const char *stack_name)
114{
115 int rc;
116 char *plugin_name = OCFS2_STACK_PLUGIN_O2CB;
117
118 /*
119 * Classic stack does not pass in a stack name. This is
120 * compatible with older tools as well.
121 */
122 if (!stack_name || !*stack_name)
123 stack_name = OCFS2_STACK_PLUGIN_O2CB;
124
125 if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) {
126 printk(KERN_ERR
127 "ocfs2 passed an invalid cluster stack label: \"%s\"\n",
128 stack_name);
129 return -EINVAL;
130 }
131
132 /* Anything that isn't the classic stack is a user stack */
133 if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB))
134 plugin_name = OCFS2_STACK_PLUGIN_USER;
135
136 rc = ocfs2_stack_driver_request(stack_name, plugin_name);
137 if (rc == -ENOENT) {
138 request_module("ocfs2_stack_%s", plugin_name);
139 rc = ocfs2_stack_driver_request(stack_name, plugin_name);
140 }
141
142 if (rc == -ENOENT) {
143 printk(KERN_ERR
144 "ocfs2: Cluster stack driver \"%s\" cannot be found\n",
145 plugin_name);
146 } else if (rc == -EBUSY) {
147 printk(KERN_ERR
148 "ocfs2: A different cluster stack is in use\n");
149 }
150
151 return rc;
152}
153
154static void ocfs2_stack_driver_put(void)
155{
156 spin_lock(&ocfs2_stack_lock);
157 BUG_ON(active_stack == NULL);
158 BUG_ON(active_stack->sp_count == 0);
159
160 active_stack->sp_count--;
161 if (!active_stack->sp_count) {
162 module_put(active_stack->sp_owner);
163 active_stack = NULL;
164 }
165 spin_unlock(&ocfs2_stack_lock);
166}
167
168int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
169{
170 int rc;
171
172 spin_lock(&ocfs2_stack_lock);
173 if (!ocfs2_stack_lookup(plugin->sp_name)) {
174 plugin->sp_count = 0;
175 plugin->sp_proto = lproto;
176 list_add(&plugin->sp_list, &ocfs2_stack_list);
177 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
178 plugin->sp_name);
179 rc = 0;
180 } else {
181 printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n",
182 plugin->sp_name);
183 rc = -EEXIST;
184 }
185 spin_unlock(&ocfs2_stack_lock);
186
187 return rc;
188}
189EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register);
190
191void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
192{
193 struct ocfs2_stack_plugin *p;
194
195 spin_lock(&ocfs2_stack_lock);
196 p = ocfs2_stack_lookup(plugin->sp_name);
197 if (p) {
198 BUG_ON(p != plugin);
199 BUG_ON(plugin == active_stack);
200 BUG_ON(plugin->sp_count != 0);
201 list_del_init(&plugin->sp_list);
202 printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n",
203 plugin->sp_name);
204 } else {
205 printk(KERN_ERR "Stack \"%s\" is not registered\n",
206 plugin->sp_name);
207 }
208 spin_unlock(&ocfs2_stack_lock);
209}
210EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
211
212void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
213{
214 struct ocfs2_stack_plugin *p;
215
216 BUG_ON(proto == NULL);
217
218 spin_lock(&ocfs2_stack_lock);
219 BUG_ON(active_stack != NULL);
220
221 lproto = proto;
222 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
223 p->sp_proto = lproto;
224 }
225
226 spin_unlock(&ocfs2_stack_lock);
227}
228EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
229
230
231/*
232 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take
233 * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the
234 * underlying stack plugins need to pilfer the lksb off of the lock_res.
235 * If some other structure needs to be passed as an astarg, the plugins
236 * will need to be given a different avenue to the lksb.
237 */
238int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
239 int mode,
240 union ocfs2_dlm_lksb *lksb,
241 u32 flags,
242 void *name,
243 unsigned int namelen,
244 struct ocfs2_lock_res *astarg)
245{
246 BUG_ON(lproto == NULL);
247
248 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
249 name, namelen, astarg);
250}
251EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
252
253int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
254 union ocfs2_dlm_lksb *lksb,
255 u32 flags,
256 struct ocfs2_lock_res *astarg)
257{
258 BUG_ON(lproto == NULL);
259
260 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg);
261}
262EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
263
264int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
265{
266 return active_stack->sp_ops->lock_status(lksb);
267}
268EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
269
270/*
271 * Why don't we cast to ocfs2_meta_lvb? The "clean" answer is that we
272 * don't cast at the glue level. The real answer is that the header
273 * ordering is nigh impossible.
274 */
275void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
276{
277 return active_stack->sp_ops->lock_lvb(lksb);
278}
279EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
280
281void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
282{
283 active_stack->sp_ops->dump_lksb(lksb);
284}
285EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
286
287int ocfs2_cluster_connect(const char *stack_name,
288 const char *group,
289 int grouplen,
290 void (*recovery_handler)(int node_num,
291 void *recovery_data),
292 void *recovery_data,
293 struct ocfs2_cluster_connection **conn)
294{
295 int rc = 0;
296 struct ocfs2_cluster_connection *new_conn;
297
298 BUG_ON(group == NULL);
299 BUG_ON(conn == NULL);
300 BUG_ON(recovery_handler == NULL);
301
302 if (grouplen > GROUP_NAME_MAX) {
303 rc = -EINVAL;
304 goto out;
305 }
306
307 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
308 GFP_KERNEL);
309 if (!new_conn) {
310 rc = -ENOMEM;
311 goto out;
312 }
313
314 memcpy(new_conn->cc_name, group, grouplen);
315 new_conn->cc_namelen = grouplen;
316 new_conn->cc_recovery_handler = recovery_handler;
317 new_conn->cc_recovery_data = recovery_data;
318
319 /* Start the new connection at our maximum compatibility level */
320 new_conn->cc_version = lproto->lp_max_version;
321
322 /* This will pin the stack driver if successful */
323 rc = ocfs2_stack_driver_get(stack_name);
324 if (rc)
325 goto out_free;
326
327 rc = active_stack->sp_ops->connect(new_conn);
328 if (rc) {
329 ocfs2_stack_driver_put();
330 goto out_free;
331 }
332
333 *conn = new_conn;
334
335out_free:
336 if (rc)
337 kfree(new_conn);
338
339out:
340 return rc;
341}
342EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
343
344/* If hangup_pending is 0, the stack driver will be dropped */
345int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
346 int hangup_pending)
347{
348 int ret;
349
350 BUG_ON(conn == NULL);
351
352 ret = active_stack->sp_ops->disconnect(conn, hangup_pending);
353
354 /* XXX Should we free it anyway? */
355 if (!ret) {
356 kfree(conn);
357 if (!hangup_pending)
358 ocfs2_stack_driver_put();
359 }
360
361 return ret;
362}
363EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect);
364
365void ocfs2_cluster_hangup(const char *group, int grouplen)
366{
367 BUG_ON(group == NULL);
368 BUG_ON(group[grouplen] != '\0');
369
370 if (active_stack->sp_ops->hangup)
371 active_stack->sp_ops->hangup(group, grouplen);
372
373 /* cluster_disconnect() was called with hangup_pending==1 */
374 ocfs2_stack_driver_put();
375}
376EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
377
378int ocfs2_cluster_this_node(unsigned int *node)
379{
380 return active_stack->sp_ops->this_node(node);
381}
382EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
383
384
385/*
386 * Sysfs bits
387 */
388
389static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
390 struct kobj_attribute *attr,
391 char *buf)
392{
393 ssize_t ret = 0;
394
395 spin_lock(&ocfs2_stack_lock);
396 if (lproto)
397 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
398 lproto->lp_max_version.pv_major,
399 lproto->lp_max_version.pv_minor);
400 spin_unlock(&ocfs2_stack_lock);
401
402 return ret;
403}
404
405static struct kobj_attribute ocfs2_attr_max_locking_protocol =
406 __ATTR(max_locking_protocol, S_IFREG | S_IRUGO,
407 ocfs2_max_locking_protocol_show, NULL);
408
409static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
410 struct kobj_attribute *attr,
411 char *buf)
412{
413 ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
414 struct ocfs2_stack_plugin *p;
415
416 spin_lock(&ocfs2_stack_lock);
417 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
418 ret = snprintf(buf, remain, "%s\n",
419 p->sp_name);
420 if (ret < 0) {
421 total = ret;
422 break;
423 }
424 if (ret == remain) {
425 /* snprintf() didn't fit */
426 total = -E2BIG;
427 break;
428 }
429 total += ret;
430 remain -= ret;
431 }
432 spin_unlock(&ocfs2_stack_lock);
433
434 return total;
435}
436
437static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
438 __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO,
439 ocfs2_loaded_cluster_plugins_show, NULL);
440
441static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
442 struct kobj_attribute *attr,
443 char *buf)
444{
445 ssize_t ret = 0;
446
447 spin_lock(&ocfs2_stack_lock);
448 if (active_stack) {
449 ret = snprintf(buf, PAGE_SIZE, "%s\n",
450 active_stack->sp_name);
451 if (ret == PAGE_SIZE)
452 ret = -E2BIG;
453 }
454 spin_unlock(&ocfs2_stack_lock);
455
456 return ret;
457}
458
459static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
460 __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
461 ocfs2_active_cluster_plugin_show, NULL);
462
463static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
464 struct kobj_attribute *attr,
465 char *buf)
466{
467 ssize_t ret;
468 spin_lock(&ocfs2_stack_lock);
469 ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name);
470 spin_unlock(&ocfs2_stack_lock);
471
472 return ret;
473}
474
475static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
476 struct kobj_attribute *attr,
477 const char *buf, size_t count)
478{
479 size_t len = count;
480 ssize_t ret;
481
482 if (len == 0)
483 return len;
484
485 if (buf[len - 1] == '\n')
486 len--;
487
488 if ((len != OCFS2_STACK_LABEL_LEN) ||
489 (strnlen(buf, len) != len))
490 return -EINVAL;
491
492 spin_lock(&ocfs2_stack_lock);
493 if (active_stack) {
494 if (!strncmp(buf, cluster_stack_name, len))
495 ret = count;
496 else
497 ret = -EBUSY;
498 } else {
499 memcpy(cluster_stack_name, buf, len);
500 ret = count;
501 }
502 spin_unlock(&ocfs2_stack_lock);
503
504 return ret;
505}
506
507
508static struct kobj_attribute ocfs2_attr_cluster_stack =
509 __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
510 ocfs2_cluster_stack_show,
511 ocfs2_cluster_stack_store);
512
513static struct attribute *ocfs2_attrs[] = {
514 &ocfs2_attr_max_locking_protocol.attr,
515 &ocfs2_attr_loaded_cluster_plugins.attr,
516 &ocfs2_attr_active_cluster_plugin.attr,
517 &ocfs2_attr_cluster_stack.attr,
518 NULL,
519};
520
521static struct attribute_group ocfs2_attr_group = {
522 .attrs = ocfs2_attrs,
523};
524
525static struct kset *ocfs2_kset;
526
527static void ocfs2_sysfs_exit(void)
528{
529 kset_unregister(ocfs2_kset);
530}
531
532static int ocfs2_sysfs_init(void)
533{
534 int ret;
535
536 ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj);
537 if (!ocfs2_kset)
538 return -ENOMEM;
539
540 ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group);
541 if (ret)
542 goto error;
543
544 return 0;
545
546error:
547 kset_unregister(ocfs2_kset);
548 return ret;
549}
550
551static int __init ocfs2_stack_glue_init(void)
552{
553 strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
554
555 return ocfs2_sysfs_init();
556}
557
558static void __exit ocfs2_stack_glue_exit(void)
559{
560 lproto = NULL;
561 ocfs2_sysfs_exit();
562}
563
564MODULE_AUTHOR("Oracle");
565MODULE_DESCRIPTION("ocfs2 cluter stack glue layer");
566MODULE_LICENSE("GPL");
567module_init(ocfs2_stack_glue_init);
568module_exit(ocfs2_stack_glue_exit);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
new file mode 100644
index 000000000000..005e4f170e0f
--- /dev/null
+++ b/fs/ocfs2/stackglue.h
@@ -0,0 +1,261 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * stackglue.h
5 *
6 * Glue to the underlying cluster stack.
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20
21#ifndef STACKGLUE_H
22#define STACKGLUE_H
23
24#include <linux/types.h>
25#include <linux/list.h>
26#include <linux/dlmconstants.h>
27
28#include "dlm/dlmapi.h"
29#include <linux/dlm.h>
30
31/*
32 * dlmconstants.h does not have a LOCAL flag. We hope to remove it
33 * some day, but right now we need it. Let's fake it. This value is larger
34 * than any flag in dlmconstants.h.
35 */
36#define DLM_LKF_LOCAL 0x00100000
37
38/*
39 * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably
40 * wants to be in a public header.
41 */
42#define GROUP_NAME_MAX 64
43
44
45/*
46 * ocfs2_protocol_version changes when ocfs2 does something different in
47 * its inter-node behavior. See dlmglue.c for more information.
48 */
49struct ocfs2_protocol_version {
50 u8 pv_major;
51 u8 pv_minor;
52};
53
54/*
55 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
56 */
57struct ocfs2_locking_protocol {
58 struct ocfs2_protocol_version lp_max_version;
59 void (*lp_lock_ast)(void *astarg);
60 void (*lp_blocking_ast)(void *astarg, int level);
61 void (*lp_unlock_ast)(void *astarg, int error);
62};
63
64
65/*
66 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
67 * has a pointer to separately allocated lvb space. This struct exists only to
68 * include in the lksb union to make space for a combined dlm_lksb and lvb.
69 */
70struct fsdlm_lksb_plus_lvb {
71 struct dlm_lksb lksb;
72 char lvb[DLM_LVB_LEN];
73};
74
75/*
76 * A union of all lock status structures. We define it here so that the
77 * size of the union is known. Lock status structures are embedded in
78 * ocfs2 inodes.
79 */
80union ocfs2_dlm_lksb {
81 struct dlm_lockstatus lksb_o2dlm;
82 struct dlm_lksb lksb_fsdlm;
83 struct fsdlm_lksb_plus_lvb padding;
84};
85
86/*
87 * A cluster connection. Mostly opaque to ocfs2, the connection holds
88 * state for the underlying stack. ocfs2 does use cc_version to determine
89 * locking compatibility.
90 */
91struct ocfs2_cluster_connection {
92 char cc_name[GROUP_NAME_MAX];
93 int cc_namelen;
94 struct ocfs2_protocol_version cc_version;
95 void (*cc_recovery_handler)(int node_num, void *recovery_data);
96 void *cc_recovery_data;
97 void *cc_lockspace;
98 void *cc_private;
99};
100
101/*
102 * Each cluster stack implements the stack operations structure. Not used
103 * in the ocfs2 code, the stackglue code translates generic cluster calls
104 * into stack operations.
105 */
106struct ocfs2_stack_operations {
107 /*
108 * The fs code calls ocfs2_cluster_connect() to attach a new
109 * filesystem to the cluster stack. The ->connect() op is passed
110 * an ocfs2_cluster_connection with the name and recovery field
111 * filled in.
112 *
113 * The stack must set up any notification mechanisms and create
114 * the filesystem lockspace in the DLM. The lockspace should be
115 * stored on cc_lockspace. Any other information can be stored on
116 * cc_private.
117 *
118 * ->connect() must not return until it is guaranteed that
119 *
120 * - Node down notifications for the filesystem will be recieved
121 * and passed to conn->cc_recovery_handler().
122 * - Locking requests for the filesystem will be processed.
123 */
124 int (*connect)(struct ocfs2_cluster_connection *conn);
125
126 /*
127 * The fs code calls ocfs2_cluster_disconnect() when a filesystem
128 * no longer needs cluster services. All DLM locks have been
129 * dropped, and recovery notification is being ignored by the
130 * fs code. The stack must disengage from the DLM and discontinue
131 * recovery notification.
132 *
133 * Once ->disconnect() has returned, the connection structure will
134 * be freed. Thus, a stack must not return from ->disconnect()
135 * until it will no longer reference the conn pointer.
136 *
137 * If hangup_pending is zero, ocfs2_cluster_disconnect() will also
138 * be dropping the reference on the module.
139 */
140 int (*disconnect)(struct ocfs2_cluster_connection *conn,
141 int hangup_pending);
142
143 /*
144 * ocfs2_cluster_hangup() exists for compatibility with older
145 * ocfs2 tools. Only the classic stack really needs it. As such
146 * ->hangup() is not required of all stacks. See the comment by
147 * ocfs2_cluster_hangup() for more details.
148 *
149 * Note that ocfs2_cluster_hangup() can only be called if
150 * hangup_pending was passed to ocfs2_cluster_disconnect().
151 */
152 void (*hangup)(const char *group, int grouplen);
153
154 /*
155 * ->this_node() returns the cluster's unique identifier for the
156 * local node.
157 */
158 int (*this_node)(unsigned int *node);
159
160 /*
161 * Call the underlying dlm lock function. The ->dlm_lock()
162 * callback should convert the flags and mode as appropriate.
163 *
164 * ast and bast functions are not part of the call because the
165 * stack will likely want to wrap ast and bast calls before passing
166 * them to stack->sp_proto.
167 */
168 int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
169 int mode,
170 union ocfs2_dlm_lksb *lksb,
171 u32 flags,
172 void *name,
173 unsigned int namelen,
174 void *astarg);
175
176 /*
177 * Call the underlying dlm unlock function. The ->dlm_unlock()
178 * function should convert the flags as appropriate.
179 *
180 * The unlock ast is not passed, as the stack will want to wrap
181 * it before calling stack->sp_proto->lp_unlock_ast().
182 */
183 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
184 union ocfs2_dlm_lksb *lksb,
185 u32 flags,
186 void *astarg);
187
188 /*
189 * Return the status of the current lock status block. The fs
190 * code should never dereference the union. The ->lock_status()
191 * callback pulls out the stack-specific lksb, converts the status
192 * to a proper errno, and returns it.
193 */
194 int (*lock_status)(union ocfs2_dlm_lksb *lksb);
195
196 /*
197 * Pull the lvb pointer off of the stack-specific lksb.
198 */
199 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
200
201 /*
202 * This is an optoinal debugging hook. If provided, the
203 * stack can dump debugging information about this lock.
204 */
205 void (*dump_lksb)(union ocfs2_dlm_lksb *lksb);
206};
207
208/*
209 * Each stack plugin must describe itself by registering a
210 * ocfs2_stack_plugin structure. This is only seen by stackglue and the
211 * stack driver.
212 */
213struct ocfs2_stack_plugin {
214 char *sp_name;
215 struct ocfs2_stack_operations *sp_ops;
216 struct module *sp_owner;
217
218 /* These are managed by the stackglue code. */
219 struct list_head sp_list;
220 unsigned int sp_count;
221 struct ocfs2_locking_protocol *sp_proto;
222};
223
224
225/* Used by the filesystem */
226int ocfs2_cluster_connect(const char *stack_name,
227 const char *group,
228 int grouplen,
229 void (*recovery_handler)(int node_num,
230 void *recovery_data),
231 void *recovery_data,
232 struct ocfs2_cluster_connection **conn);
233int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
234 int hangup_pending);
235void ocfs2_cluster_hangup(const char *group, int grouplen);
236int ocfs2_cluster_this_node(unsigned int *node);
237
238struct ocfs2_lock_res;
239int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
240 int mode,
241 union ocfs2_dlm_lksb *lksb,
242 u32 flags,
243 void *name,
244 unsigned int namelen,
245 struct ocfs2_lock_res *astarg);
246int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
247 union ocfs2_dlm_lksb *lksb,
248 u32 flags,
249 struct ocfs2_lock_res *astarg);
250
251int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
252void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
253void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
254
255void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
256
257
258/* Used by stack plugins */
259int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
260void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
261#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 72c198a004df..d2d278fb9819 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -46,6 +46,11 @@
46 46
47#include "buffer_head_io.h" 47#include "buffer_head_io.h"
48 48
49#define NOT_ALLOC_NEW_GROUP 0
50#define ALLOC_NEW_GROUP 1
51
52#define OCFS2_MAX_INODES_TO_STEAL 1024
53
49static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 54static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 55static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 56static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -106,7 +111,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
106 u64 *bg_blkno, 111 u64 *bg_blkno,
107 u16 *bg_bit_off); 112 u16 *bg_bit_off);
108 113
109void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 114static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
110{ 115{
111 struct inode *inode = ac->ac_inode; 116 struct inode *inode = ac->ac_inode;
112 117
@@ -117,9 +122,17 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
117 mutex_unlock(&inode->i_mutex); 122 mutex_unlock(&inode->i_mutex);
118 123
119 iput(inode); 124 iput(inode);
125 ac->ac_inode = NULL;
120 } 126 }
121 if (ac->ac_bh) 127 if (ac->ac_bh) {
122 brelse(ac->ac_bh); 128 brelse(ac->ac_bh);
129 ac->ac_bh = NULL;
130 }
131}
132
133void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
134{
135 ocfs2_free_ac_resource(ac);
123 kfree(ac); 136 kfree(ac);
124} 137}
125 138
@@ -391,7 +404,8 @@ bail:
391static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, 404static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
392 struct ocfs2_alloc_context *ac, 405 struct ocfs2_alloc_context *ac,
393 int type, 406 int type,
394 u32 slot) 407 u32 slot,
408 int alloc_new_group)
395{ 409{
396 int status; 410 int status;
397 u32 bits_wanted = ac->ac_bits_wanted; 411 u32 bits_wanted = ac->ac_bits_wanted;
@@ -420,6 +434,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
420 } 434 }
421 435
422 ac->ac_inode = alloc_inode; 436 ac->ac_inode = alloc_inode;
437 ac->ac_alloc_slot = slot;
423 438
424 fe = (struct ocfs2_dinode *) bh->b_data; 439 fe = (struct ocfs2_dinode *) bh->b_data;
425 if (!OCFS2_IS_VALID_DINODE(fe)) { 440 if (!OCFS2_IS_VALID_DINODE(fe)) {
@@ -446,6 +461,14 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
446 goto bail; 461 goto bail;
447 } 462 }
448 463
464 if (alloc_new_group != ALLOC_NEW_GROUP) {
465 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
466 "and we don't alloc a new group for it.\n",
467 slot, bits_wanted, free_bits);
468 status = -ENOSPC;
469 goto bail;
470 }
471
449 status = ocfs2_block_group_alloc(osb, alloc_inode, bh); 472 status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
450 if (status < 0) { 473 if (status < 0) {
451 if (status != -ENOSPC) 474 if (status != -ENOSPC)
@@ -490,7 +513,8 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
490 (*ac)->ac_group_search = ocfs2_block_group_search; 513 (*ac)->ac_group_search = ocfs2_block_group_search;
491 514
492 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 515 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
493 EXTENT_ALLOC_SYSTEM_INODE, slot); 516 EXTENT_ALLOC_SYSTEM_INODE,
517 slot, ALLOC_NEW_GROUP);
494 if (status < 0) { 518 if (status < 0) {
495 if (status != -ENOSPC) 519 if (status != -ENOSPC)
496 mlog_errno(status); 520 mlog_errno(status);
@@ -508,10 +532,42 @@ bail:
508 return status; 532 return status;
509} 533}
510 534
535static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
536 struct ocfs2_alloc_context *ac)
537{
538 int i, status = -ENOSPC;
539 s16 slot = ocfs2_get_inode_steal_slot(osb);
540
541 /* Start to steal inodes from the first slot after ours. */
542 if (slot == OCFS2_INVALID_SLOT)
543 slot = osb->slot_num + 1;
544
545 for (i = 0; i < osb->max_slots; i++, slot++) {
546 if (slot == osb->max_slots)
547 slot = 0;
548
549 if (slot == osb->slot_num)
550 continue;
551
552 status = ocfs2_reserve_suballoc_bits(osb, ac,
553 INODE_ALLOC_SYSTEM_INODE,
554 slot, NOT_ALLOC_NEW_GROUP);
555 if (status >= 0) {
556 ocfs2_set_inode_steal_slot(osb, slot);
557 break;
558 }
559
560 ocfs2_free_ac_resource(ac);
561 }
562
563 return status;
564}
565
511int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 566int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
512 struct ocfs2_alloc_context **ac) 567 struct ocfs2_alloc_context **ac)
513{ 568{
514 int status; 569 int status;
570 s16 slot = ocfs2_get_inode_steal_slot(osb);
515 571
516 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 572 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
517 if (!(*ac)) { 573 if (!(*ac)) {
@@ -525,9 +581,43 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
525 581
526 (*ac)->ac_group_search = ocfs2_block_group_search; 582 (*ac)->ac_group_search = ocfs2_block_group_search;
527 583
584 /*
585 * slot is set when we successfully steal inode from other nodes.
586 * It is reset in 3 places:
587 * 1. when we flush the truncate log
588 * 2. when we complete local alloc recovery.
589 * 3. when we successfully allocate from our own slot.
590 * After it is set, we will go on stealing inodes until we find the
591 * need to check our slots to see whether there is some space for us.
592 */
593 if (slot != OCFS2_INVALID_SLOT &&
594 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
595 goto inode_steal;
596
597 atomic_set(&osb->s_num_inodes_stolen, 0);
528 status = ocfs2_reserve_suballoc_bits(osb, *ac, 598 status = ocfs2_reserve_suballoc_bits(osb, *ac,
529 INODE_ALLOC_SYSTEM_INODE, 599 INODE_ALLOC_SYSTEM_INODE,
530 osb->slot_num); 600 osb->slot_num, ALLOC_NEW_GROUP);
601 if (status >= 0) {
602 status = 0;
603
604 /*
605 * Some inodes must be freed by us, so try to allocate
606 * from our own next time.
607 */
608 if (slot != OCFS2_INVALID_SLOT)
609 ocfs2_init_inode_steal_slot(osb);
610 goto bail;
611 } else if (status < 0 && status != -ENOSPC) {
612 mlog_errno(status);
613 goto bail;
614 }
615
616 ocfs2_free_ac_resource(*ac);
617
618inode_steal:
619 status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
620 atomic_inc(&osb->s_num_inodes_stolen);
531 if (status < 0) { 621 if (status < 0) {
532 if (status != -ENOSPC) 622 if (status != -ENOSPC)
533 mlog_errno(status); 623 mlog_errno(status);
@@ -557,7 +647,8 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
557 647
558 status = ocfs2_reserve_suballoc_bits(osb, ac, 648 status = ocfs2_reserve_suballoc_bits(osb, ac,
559 GLOBAL_BITMAP_SYSTEM_INODE, 649 GLOBAL_BITMAP_SYSTEM_INODE,
560 OCFS2_INVALID_SLOT); 650 OCFS2_INVALID_SLOT,
651 ALLOC_NEW_GROUP);
561 if (status < 0 && status != -ENOSPC) { 652 if (status < 0 && status != -ENOSPC) {
562 mlog_errno(status); 653 mlog_errno(status);
563 goto bail; 654 goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8799033bb459..544c600662bd 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -36,6 +36,7 @@ typedef int (group_search_t)(struct inode *,
36struct ocfs2_alloc_context { 36struct ocfs2_alloc_context {
37 struct inode *ac_inode; /* which bitmap are we allocating from? */ 37 struct inode *ac_inode; /* which bitmap are we allocating from? */
38 struct buffer_head *ac_bh; /* file entry bh */ 38 struct buffer_head *ac_bh; /* file entry bh */
39 u32 ac_alloc_slot; /* which slot are we allocating from? */
39 u32 ac_bits_wanted; 40 u32 ac_bits_wanted;
40 u32 ac_bits_given; 41 u32 ac_bits_given;
41#define OCFS2_AC_USE_LOCAL 1 42#define OCFS2_AC_USE_LOCAL 1
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index bec75aff3d9f..df63ba20ae90 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -40,8 +40,7 @@
40#include <linux/crc32.h> 40#include <linux/crc32.h>
41#include <linux/debugfs.h> 41#include <linux/debugfs.h>
42#include <linux/mount.h> 42#include <linux/mount.h>
43 43#include <linux/seq_file.h>
44#include <cluster/nodemanager.h>
45 44
46#define MLOG_MASK_PREFIX ML_SUPER 45#define MLOG_MASK_PREFIX ML_SUPER
47#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -88,6 +87,7 @@ struct mount_options
88 unsigned int atime_quantum; 87 unsigned int atime_quantum;
89 signed short slot; 88 signed short slot;
90 unsigned int localalloc_opt; 89 unsigned int localalloc_opt;
90 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
91}; 91};
92 92
93static int ocfs2_parse_options(struct super_block *sb, char *options, 93static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -109,7 +109,6 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait);
109static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); 109static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
110static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); 110static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
111static void ocfs2_release_system_inodes(struct ocfs2_super *osb); 111static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
112static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
113static int ocfs2_check_volume(struct ocfs2_super *osb); 112static int ocfs2_check_volume(struct ocfs2_super *osb);
114static int ocfs2_verify_volume(struct ocfs2_dinode *di, 113static int ocfs2_verify_volume(struct ocfs2_dinode *di,
115 struct buffer_head *bh, 114 struct buffer_head *bh,
@@ -154,6 +153,7 @@ enum {
154 Opt_commit, 153 Opt_commit,
155 Opt_localalloc, 154 Opt_localalloc,
156 Opt_localflocks, 155 Opt_localflocks,
156 Opt_stack,
157 Opt_err, 157 Opt_err,
158}; 158};
159 159
@@ -172,6 +172,7 @@ static match_table_t tokens = {
172 {Opt_commit, "commit=%u"}, 172 {Opt_commit, "commit=%u"},
173 {Opt_localalloc, "localalloc=%d"}, 173 {Opt_localalloc, "localalloc=%d"},
174 {Opt_localflocks, "localflocks"}, 174 {Opt_localflocks, "localflocks"},
175 {Opt_stack, "cluster_stack=%s"},
175 {Opt_err, NULL} 176 {Opt_err, NULL}
176}; 177};
177 178
@@ -551,8 +552,17 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
551 } 552 }
552 } 553 }
553 554
555 if (ocfs2_userspace_stack(osb)) {
556 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
557 mlog(ML_ERROR, "Userspace stack expected, but "
558 "o2cb heartbeat arguments passed to mount\n");
559 return -EINVAL;
560 }
561 }
562
554 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 563 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
555 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) { 564 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
565 !ocfs2_userspace_stack(osb)) {
556 mlog(ML_ERROR, "Heartbeat has to be started to mount " 566 mlog(ML_ERROR, "Heartbeat has to be started to mount "
557 "a read-write clustered device.\n"); 567 "a read-write clustered device.\n");
558 return -EINVAL; 568 return -EINVAL;
@@ -562,6 +572,35 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
562 return 0; 572 return 0;
563} 573}
564 574
575/*
576 * If we're using a userspace stack, mount should have passed
577 * a name that matches the disk. If not, mount should not
578 * have passed a stack.
579 */
580static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
581 struct mount_options *mopt)
582{
583 if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) {
584 mlog(ML_ERROR,
585 "cluster stack passed to mount, but this filesystem "
586 "does not support it\n");
587 return -EINVAL;
588 }
589
590 if (ocfs2_userspace_stack(osb) &&
591 strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
592 OCFS2_STACK_LABEL_LEN)) {
593 mlog(ML_ERROR,
594 "cluster stack passed to mount (\"%s\") does not "
595 "match the filesystem (\"%s\")\n",
596 mopt->cluster_stack,
597 osb->osb_cluster_stack);
598 return -EINVAL;
599 }
600
601 return 0;
602}
603
565static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 604static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
566{ 605{
567 struct dentry *root; 606 struct dentry *root;
@@ -579,15 +618,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
579 goto read_super_error; 618 goto read_super_error;
580 } 619 }
581 620
582 /* for now we only have one cluster/node, make sure we see it
583 * in the heartbeat universe */
584 if (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL) {
585 if (!o2hb_check_local_node_heartbeating()) {
586 status = -EINVAL;
587 goto read_super_error;
588 }
589 }
590
591 /* probe for superblock */ 621 /* probe for superblock */
592 status = ocfs2_sb_probe(sb, &bh, &sector_size); 622 status = ocfs2_sb_probe(sb, &bh, &sector_size);
593 if (status < 0) { 623 if (status < 0) {
@@ -609,6 +639,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
609 osb->osb_commit_interval = parsed_options.commit_interval; 639 osb->osb_commit_interval = parsed_options.commit_interval;
610 osb->local_alloc_size = parsed_options.localalloc_opt; 640 osb->local_alloc_size = parsed_options.localalloc_opt;
611 641
642 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
643 if (status)
644 goto read_super_error;
645
612 sb->s_magic = OCFS2_SUPER_MAGIC; 646 sb->s_magic = OCFS2_SUPER_MAGIC;
613 647
614 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, 648 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
@@ -694,7 +728,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
694 if (ocfs2_mount_local(osb)) 728 if (ocfs2_mount_local(osb))
695 snprintf(nodestr, sizeof(nodestr), "local"); 729 snprintf(nodestr, sizeof(nodestr), "local");
696 else 730 else
697 snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); 731 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
698 732
699 printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " 733 printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
700 "with %s data mode.\n", 734 "with %s data mode.\n",
@@ -763,6 +797,7 @@ static int ocfs2_parse_options(struct super_block *sb,
763 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 797 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
764 mopt->slot = OCFS2_INVALID_SLOT; 798 mopt->slot = OCFS2_INVALID_SLOT;
765 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; 799 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
800 mopt->cluster_stack[0] = '\0';
766 801
767 if (!options) { 802 if (!options) {
768 status = 1; 803 status = 1;
@@ -864,6 +899,25 @@ static int ocfs2_parse_options(struct super_block *sb,
864 if (!is_remount) 899 if (!is_remount)
865 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; 900 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
866 break; 901 break;
902 case Opt_stack:
903 /* Check both that the option we were passed
904 * is of the right length and that it is a proper
905 * string of the right length.
906 */
907 if (((args[0].to - args[0].from) !=
908 OCFS2_STACK_LABEL_LEN) ||
909 (strnlen(args[0].from,
910 OCFS2_STACK_LABEL_LEN) !=
911 OCFS2_STACK_LABEL_LEN)) {
912 mlog(ML_ERROR,
913 "Invalid cluster_stack option\n");
914 status = 0;
915 goto bail;
916 }
917 memcpy(mopt->cluster_stack, args[0].from,
918 OCFS2_STACK_LABEL_LEN);
919 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
920 break;
867 default: 921 default:
868 mlog(ML_ERROR, 922 mlog(ML_ERROR,
869 "Unrecognized mount option \"%s\" " 923 "Unrecognized mount option \"%s\" "
@@ -922,6 +976,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
922 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 976 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
923 seq_printf(s, ",localflocks,"); 977 seq_printf(s, ",localflocks,");
924 978
979 if (osb->osb_cluster_stack[0])
980 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
981 osb->osb_cluster_stack);
982
925 return 0; 983 return 0;
926} 984}
927 985
@@ -957,6 +1015,8 @@ static int __init ocfs2_init(void)
957 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1015 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
958 } 1016 }
959 1017
1018 ocfs2_set_locking_protocol();
1019
960leave: 1020leave:
961 if (status < 0) { 1021 if (status < 0) {
962 ocfs2_free_mem_caches(); 1022 ocfs2_free_mem_caches();
@@ -1132,31 +1192,6 @@ static int ocfs2_get_sector(struct super_block *sb,
1132 return 0; 1192 return 0;
1133} 1193}
1134 1194
1135/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
1136static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
1137{
1138 int status;
1139
1140 /* XXX hold a ref on the node while mounte? easy enough, if
1141 * desirable. */
1142 if (ocfs2_mount_local(osb))
1143 osb->node_num = 0;
1144 else
1145 osb->node_num = o2nm_this_node();
1146
1147 if (osb->node_num == O2NM_MAX_NODES) {
1148 mlog(ML_ERROR, "could not find this host's node number\n");
1149 status = -ENOENT;
1150 goto bail;
1151 }
1152
1153 mlog(0, "I am node %d\n", osb->node_num);
1154
1155 status = 0;
1156bail:
1157 return status;
1158}
1159
1160static int ocfs2_mount_volume(struct super_block *sb) 1195static int ocfs2_mount_volume(struct super_block *sb)
1161{ 1196{
1162 int status = 0; 1197 int status = 0;
@@ -1168,12 +1203,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1168 if (ocfs2_is_hard_readonly(osb)) 1203 if (ocfs2_is_hard_readonly(osb))
1169 goto leave; 1204 goto leave;
1170 1205
1171 status = ocfs2_fill_local_node_info(osb);
1172 if (status < 0) {
1173 mlog_errno(status);
1174 goto leave;
1175 }
1176
1177 status = ocfs2_dlm_init(osb); 1206 status = ocfs2_dlm_init(osb);
1178 if (status < 0) { 1207 if (status < 0) {
1179 mlog_errno(status); 1208 mlog_errno(status);
@@ -1224,18 +1253,9 @@ leave:
1224 return status; 1253 return status;
1225} 1254}
1226 1255
1227/* we can't grab the goofy sem lock from inside wait_event, so we use
1228 * memory barriers to make sure that we'll see the null task before
1229 * being woken up */
1230static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
1231{
1232 mb();
1233 return osb->recovery_thread_task != NULL;
1234}
1235
1236static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) 1256static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1237{ 1257{
1238 int tmp; 1258 int tmp, hangup_needed = 0;
1239 struct ocfs2_super *osb = NULL; 1259 struct ocfs2_super *osb = NULL;
1240 char nodestr[8]; 1260 char nodestr[8];
1241 1261
@@ -1249,25 +1269,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1249 1269
1250 ocfs2_truncate_log_shutdown(osb); 1270 ocfs2_truncate_log_shutdown(osb);
1251 1271
1252 /* disable any new recovery threads and wait for any currently 1272 /* This will disable recovery and flush any recovery work. */
1253 * running ones to exit. Do this before setting the vol_state. */ 1273 ocfs2_recovery_exit(osb);
1254 mutex_lock(&osb->recovery_lock);
1255 osb->disable_recovery = 1;
1256 mutex_unlock(&osb->recovery_lock);
1257 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
1258
1259 /* At this point, we know that no more recovery threads can be
1260 * launched, so wait for any recovery completion work to
1261 * complete. */
1262 flush_workqueue(ocfs2_wq);
1263 1274
1264 ocfs2_journal_shutdown(osb); 1275 ocfs2_journal_shutdown(osb);
1265 1276
1266 ocfs2_sync_blockdev(sb); 1277 ocfs2_sync_blockdev(sb);
1267 1278
1268 /* No dlm means we've failed during mount, so skip all the 1279 /* No cluster connection means we've failed during mount, so skip
1269 * steps which depended on that to complete. */ 1280 * all the steps which depended on that to complete. */
1270 if (osb->dlm) { 1281 if (osb->cconn) {
1271 tmp = ocfs2_super_lock(osb, 1); 1282 tmp = ocfs2_super_lock(osb, 1);
1272 if (tmp < 0) { 1283 if (tmp < 0) {
1273 mlog_errno(tmp); 1284 mlog_errno(tmp);
@@ -1278,25 +1289,34 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1278 if (osb->slot_num != OCFS2_INVALID_SLOT) 1289 if (osb->slot_num != OCFS2_INVALID_SLOT)
1279 ocfs2_put_slot(osb); 1290 ocfs2_put_slot(osb);
1280 1291
1281 if (osb->dlm) 1292 if (osb->cconn)
1282 ocfs2_super_unlock(osb, 1); 1293 ocfs2_super_unlock(osb, 1);
1283 1294
1284 ocfs2_release_system_inodes(osb); 1295 ocfs2_release_system_inodes(osb);
1285 1296
1286 if (osb->dlm) 1297 /*
1287 ocfs2_dlm_shutdown(osb); 1298 * If we're dismounting due to mount error, mount.ocfs2 will clean
1299 * up heartbeat. If we're a local mount, there is no heartbeat.
1300 * If we failed before we got a uuid_str yet, we can't stop
1301 * heartbeat. Otherwise, do it.
1302 */
1303 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
1304 hangup_needed = 1;
1305
1306 if (osb->cconn)
1307 ocfs2_dlm_shutdown(osb, hangup_needed);
1288 1308
1289 debugfs_remove(osb->osb_debug_root); 1309 debugfs_remove(osb->osb_debug_root);
1290 1310
1291 if (!mnt_err) 1311 if (hangup_needed)
1292 ocfs2_stop_heartbeat(osb); 1312 ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
1293 1313
1294 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); 1314 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
1295 1315
1296 if (ocfs2_mount_local(osb)) 1316 if (ocfs2_mount_local(osb))
1297 snprintf(nodestr, sizeof(nodestr), "local"); 1317 snprintf(nodestr, sizeof(nodestr), "local");
1298 else 1318 else
1299 snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); 1319 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
1300 1320
1301 printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", 1321 printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
1302 osb->dev_str, nodestr); 1322 osb->dev_str, nodestr);
@@ -1355,7 +1375,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1355 sb->s_fs_info = osb; 1375 sb->s_fs_info = osb;
1356 sb->s_op = &ocfs2_sops; 1376 sb->s_op = &ocfs2_sops;
1357 sb->s_export_op = &ocfs2_export_ops; 1377 sb->s_export_op = &ocfs2_export_ops;
1358 osb->osb_locking_proto = ocfs2_locking_protocol;
1359 sb->s_time_gran = 1; 1378 sb->s_time_gran = 1;
1360 sb->s_flags |= MS_NOATIME; 1379 sb->s_flags |= MS_NOATIME;
1361 /* this is needed to support O_LARGEFILE */ 1380 /* this is needed to support O_LARGEFILE */
@@ -1368,7 +1387,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1368 osb->s_sectsize_bits = blksize_bits(sector_size); 1387 osb->s_sectsize_bits = blksize_bits(sector_size);
1369 BUG_ON(!osb->s_sectsize_bits); 1388 BUG_ON(!osb->s_sectsize_bits);
1370 1389
1371 init_waitqueue_head(&osb->recovery_event);
1372 spin_lock_init(&osb->dc_task_lock); 1390 spin_lock_init(&osb->dc_task_lock);
1373 init_waitqueue_head(&osb->dc_event); 1391 init_waitqueue_head(&osb->dc_event);
1374 osb->dc_work_sequence = 0; 1392 osb->dc_work_sequence = 0;
@@ -1376,6 +1394,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1376 INIT_LIST_HEAD(&osb->blocked_lock_list); 1394 INIT_LIST_HEAD(&osb->blocked_lock_list);
1377 osb->blocked_lock_count = 0; 1395 osb->blocked_lock_count = 0;
1378 spin_lock_init(&osb->osb_lock); 1396 spin_lock_init(&osb->osb_lock);
1397 ocfs2_init_inode_steal_slot(osb);
1379 1398
1380 atomic_set(&osb->alloc_stats.moves, 0); 1399 atomic_set(&osb->alloc_stats.moves, 0);
1381 atomic_set(&osb->alloc_stats.local_data, 0); 1400 atomic_set(&osb->alloc_stats.local_data, 0);
@@ -1388,24 +1407,23 @@ static int ocfs2_initialize_super(struct super_block *sb,
1388 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 1407 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
1389 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1408 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1390 1409
1391 mutex_init(&osb->recovery_lock); 1410 status = ocfs2_recovery_init(osb);
1392 1411 if (status) {
1393 osb->disable_recovery = 0; 1412 mlog(ML_ERROR, "Unable to initialize recovery state\n");
1394 osb->recovery_thread_task = NULL; 1413 mlog_errno(status);
1414 goto bail;
1415 }
1395 1416
1396 init_waitqueue_head(&osb->checkpoint_event); 1417 init_waitqueue_head(&osb->checkpoint_event);
1397 atomic_set(&osb->needs_checkpoint, 0); 1418 atomic_set(&osb->needs_checkpoint, 0);
1398 1419
1399 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1420 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1400 1421
1401 osb->node_num = O2NM_INVALID_NODE_NUM;
1402 osb->slot_num = OCFS2_INVALID_SLOT; 1422 osb->slot_num = OCFS2_INVALID_SLOT;
1403 1423
1404 osb->local_alloc_state = OCFS2_LA_UNUSED; 1424 osb->local_alloc_state = OCFS2_LA_UNUSED;
1405 osb->local_alloc_bh = NULL; 1425 osb->local_alloc_bh = NULL;
1406 1426
1407 ocfs2_setup_hb_callbacks(osb);
1408
1409 init_waitqueue_head(&osb->osb_mount_event); 1427 init_waitqueue_head(&osb->osb_mount_event);
1410 1428
1411 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); 1429 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
@@ -1455,6 +1473,25 @@ static int ocfs2_initialize_super(struct super_block *sb,
1455 goto bail; 1473 goto bail;
1456 } 1474 }
1457 1475
1476 if (ocfs2_userspace_stack(osb)) {
1477 memcpy(osb->osb_cluster_stack,
1478 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
1479 OCFS2_STACK_LABEL_LEN);
1480 osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
1481 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
1482 mlog(ML_ERROR,
1483 "couldn't mount because of an invalid "
1484 "cluster stack label (%s) \n",
1485 osb->osb_cluster_stack);
1486 status = -EINVAL;
1487 goto bail;
1488 }
1489 } else {
1490 /* The empty string is identical with classic tools that
1491 * don't know about s_cluster_info. */
1492 osb->osb_cluster_stack[0] = '\0';
1493 }
1494
1458 get_random_bytes(&osb->s_next_generation, sizeof(u32)); 1495 get_random_bytes(&osb->s_next_generation, sizeof(u32));
1459 1496
1460 /* FIXME 1497 /* FIXME
@@ -1724,8 +1761,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
1724 1761
1725 /* This function assumes that the caller has the main osb resource */ 1762 /* This function assumes that the caller has the main osb resource */
1726 1763
1727 if (osb->slot_info) 1764 ocfs2_free_slot_info(osb);
1728 ocfs2_free_slot_info(osb->slot_info);
1729 1765
1730 kfree(osb->osb_orphan_wipes); 1766 kfree(osb->osb_orphan_wipes);
1731 /* FIXME 1767 /* FIXME