author		Linus Torvalds <torvalds@linux-foundation.org>	2008-04-18 13:15:22 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-04-18 13:15:22 -0400
commit		e675349e2bdbfb661fa0d8ff2441b4cf48fb7e48 (patch)
tree		7443e324c951f375945905dc436b012c98a00e05
parent		ef38ff9d372d4fe69e415370939a0f1fb5783af1 (diff)
parent		2309e9e040fe29469fb85a384636c455b62fe525 (diff)
Merge branch 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2
* 'upstream-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2: (64 commits)
ocfs2/net: Add debug interface to o2net
ocfs2: Only build ocfs2/dlm with the o2cb stack module
ocfs2/cluster: Get rid of arguments to the timeout routines
ocfs2: Put tree in MAINTAINERS
ocfs2: Use BUG_ON
ocfs2: Convert ocfs2 over to unlocked_ioctl
ocfs2: Improve rename locking
fs/ocfs2/aops.c: test for IS_ERR rather than 0
ocfs2: Add inode stealing for ocfs2_reserve_new_inode
ocfs2: Add ac_alloc_slot in ocfs2_alloc_context
ocfs2: Add a new parameter for ocfs2_reserve_suballoc_bits
ocfs2: Enable cross extent block merge.
ocfs2: Add support for cross extent block
ocfs2: Move /sys/o2cb to /sys/fs/o2cb
sysfs: Allow removal of symlinks in the sysfs root
ocfs2: Reconnect after idle time out.
ocfs2/dlm: Cleanup lockres print
ocfs2/dlm: Fix lockname in lockres print function
ocfs2/dlm: Move dlm_print_one_mle() from dlmmaster.c to dlmdebug.c
ocfs2/dlm: Dumps the purgelist into a debugfs file
...
47 files changed, 5800 insertions, 1042 deletions
diff --git a/Documentation/ABI/obsolete/o2cb b/Documentation/ABI/obsolete/o2cb
new file mode 100644
index 000000000000..9c49d8e6c0cc
--- /dev/null
+++ b/Documentation/ABI/obsolete/o2cb
@@ -0,0 +1,11 @@ | |||
1 | What: /sys/o2cb symlink | ||
2 | Date: Dec 2005 | ||
3 | KernelVersion: 2.6.16 | ||
4 | Contact: ocfs2-devel@oss.oracle.com | ||
5 | Description: This is a symlink: /sys/o2cb to /sys/fs/o2cb. The symlink will | ||
6 | be removed when new versions of ocfs2-tools which know to look | ||
7 | in /sys/fs/o2cb are sufficiently prevalent. Don't code new | ||
8 | software to look here; it should try /sys/fs/o2cb instead. | ||
9 | See Documentation/ABI/stable/o2cb for more information on usage. | ||
10 | Users: ocfs2-tools. It's sufficient to mail proposed changes to | ||
11 | ocfs2-devel@oss.oracle.com. | ||
diff --git a/Documentation/ABI/stable/o2cb b/Documentation/ABI/stable/o2cb
new file mode 100644
index 000000000000..5eb1545e0b8d
--- /dev/null
+++ b/Documentation/ABI/stable/o2cb
@@ -0,0 +1,10 @@ | |||
1 | What: /sys/fs/o2cb/ (was /sys/o2cb) | ||
2 | Date: Dec 2005 | ||
3 | KernelVersion: 2.6.16 | ||
4 | Contact: ocfs2-devel@oss.oracle.com | ||
5 | Description: Ocfs2-tools looks at 'interface-revision' for versioning | ||
6 | information. Each logmask/ file controls a set of debug prints | ||
7 | and can be written into with the strings "allow", "deny", or | ||
8 | "off". Reading the file returns the current state. | ||
9 | Users: ocfs2-tools. It's sufficient to mail proposed changes to | ||
10 | ocfs2-devel@oss.oracle.com. | ||
diff --git a/Documentation/ABI/testing/sysfs-ocfs2 b/Documentation/ABI/testing/sysfs-ocfs2
new file mode 100644
index 000000000000..b7cc516a8a8a
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-ocfs2
@@ -0,0 +1,89 @@ | |||
1 | What: /sys/fs/ocfs2/ | ||
2 | Date: April 2008 | ||
3 | Contact: ocfs2-devel@oss.oracle.com | ||
4 | Description: | ||
5 | The /sys/fs/ocfs2 directory contains knobs used by the | ||
6 | ocfs2-tools to interact with the filesystem. | ||
7 | |||
8 | What: /sys/fs/ocfs2/max_locking_protocol | ||
9 | Date: April 2008 | ||
10 | Contact: ocfs2-devel@oss.oracle.com | ||
11 | Description: | ||
12 | The /sys/fs/ocfs2/max_locking_protocol file displays version | ||
13 | of ocfs2 locking supported by the filesystem. This version | ||
14 | covers how ocfs2 uses distributed locking between cluster | ||
15 | nodes. | ||
16 | |||
17 | The protocol version has a major and minor number. Two | ||
18 | cluster nodes can interoperate if they have an identical | ||
19 | major number and an overlapping minor number - thus, | ||
20 | a node with version 1.10 can interoperate with a node | ||
21 | sporting version 1.8, as long as both use the 1.8 protocol. | ||
22 | |||
23 | Reading from this file returns a single line, the major | ||
24 | number and minor number joined by a period, eg "1.10". | ||
25 | |||
26 | This file is read-only. The value is compiled into the | ||
27 | driver. | ||
28 | |||
29 | What: /sys/fs/ocfs2/loaded_cluster_plugins | ||
30 | Date: April 2008 | ||
31 | Contact: ocfs2-devel@oss.oracle.com | ||
32 | Description: | ||
33 | The /sys/fs/ocfs2/loaded_cluster_plugins file describes | ||
34 | the available plugins to support ocfs2 cluster operation. | ||
35 | A cluster plugin is required to use ocfs2 in a cluster. | ||
36 | There are currently two available plugins: | ||
37 | |||
38 | * 'o2cb' - The classic o2cb cluster stack that ocfs2 has | ||
39 | used since its inception. | ||
40 | * 'user' - A plugin supporting userspace cluster software | ||
41 | in conjunction with fs/dlm. | ||
42 | |||
43 | Reading from this file returns the names of all loaded | ||
44 | plugins, one per line. | ||
45 | |||
46 | This file is read-only. Its contents may change as | ||
47 | plugins are loaded or removed. | ||
48 | |||
49 | What: /sys/fs/ocfs2/active_cluster_plugin | ||
50 | Date: April 2008 | ||
51 | Contact: ocfs2-devel@oss.oracle.com | ||
52 | Description: | ||
53 | The /sys/fs/ocfs2/active_cluster_plugin displays which | ||
54 | cluster plugin is currently in use by the filesystem. | ||
55 | The active plugin will appear in the loaded_cluster_plugins | ||
56 | file as well. Only one plugin can be used at a time. | ||
57 | |||
58 | Reading from this file returns the name of the active plugin | ||
59 | on a single line. | ||
60 | |||
61 | This file is read-only. Which plugin is active depends on | ||
62 | the cluster stack in use. The contents may change | ||
63 | when all filesystems are unmounted and the cluster stack | ||
64 | is changed. | ||
65 | |||
66 | What: /sys/fs/ocfs2/cluster_stack | ||
67 | Date: April 2008 | ||
68 | Contact: ocfs2-devel@oss.oracle.com | ||
69 | Description: | ||
70 | The /sys/fs/ocfs2/cluster_stack file contains the name | ||
71 | of current ocfs2 cluster stack. This value is set by | ||
72 | userspace tools when bringing the cluster stack online. | ||
73 | |||
74 | Cluster stack names are 4 characters in length. | ||
75 | |||
76 | When the 'o2cb' cluster stack is used, the 'o2cb' cluster | ||
77 | plugin is active. All other cluster stacks use the 'user' | ||
78 | cluster plugin. | ||
79 | |||
80 | Reading from this file returns the name of the current | ||
81 | cluster stack on a single line. | ||
82 | |||
83 | Writing a new stack name to this file changes the current | ||
84 | cluster stack unless there are mounted ocfs2 filesystems. | ||
85 | If there are mounted filesystems, attempts to change the | ||
86 | stack return an error. | ||
87 | |||
88 | Users: | ||
89 | ocfs2-tools <ocfs2-tools-devel@oss.oracle.com> | ||
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 164c89394cff..4b70622a8a91 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -318,3 +318,13 @@ Why: Not used in-tree. The current out-of-tree users used it to | |||
318 | code / infrastructure should be in the kernel and not in some | 318 | code / infrastructure should be in the kernel and not in some |
319 | out-of-tree driver. | 319 | out-of-tree driver. |
320 | Who: Thomas Gleixner <tglx@linutronix.de> | 320 | Who: Thomas Gleixner <tglx@linutronix.de> |
321 | |||
322 | --------------------------- | ||
323 | |||
324 | What: /sys/o2cb symlink | ||
325 | When: January 2010 | ||
326 | Why: /sys/fs/o2cb is the proper location for this information - /sys/o2cb | ||
327 | exists as a symlink for backwards compatibility for old versions of | ||
328 | ocfs2-tools. 2 years should be sufficient time to phase in new versions | ||
329 | which know to look in /sys/fs/o2cb. | ||
330 | Who: ocfs2-devel@oss.oracle.com | ||
diff --git a/MAINTAINERS b/MAINTAINERS
index 3eceebb48c92..974ee8ddb12c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2952,6 +2952,7 @@ P: Joel Becker | |||
2952 | M: joel.becker@oracle.com | 2952 | M: joel.becker@oracle.com |
2953 | L: ocfs2-devel@oss.oracle.com | 2953 | L: ocfs2-devel@oss.oracle.com |
2954 | W: http://oss.oracle.com/projects/ocfs2/ | 2954 | W: http://oss.oracle.com/projects/ocfs2/ |
2955 | T: git git://git.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2.git | ||
2955 | S: Supported | 2956 | S: Supported |
2956 | 2957 | ||
2957 | OMNIKEY CARDMAN 4000 DRIVER | 2958 | OMNIKEY CARDMAN 4000 DRIVER |
diff --git a/fs/Kconfig b/fs/Kconfig
index c509123bea49..028ae38ecc52 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -444,6 +444,32 @@ config OCFS2_FS | |||
444 | For more information on OCFS2, see the file | 444 | For more information on OCFS2, see the file |
445 | <file:Documentation/filesystems/ocfs2.txt>. | 445 | <file:Documentation/filesystems/ocfs2.txt>. |
446 | 446 | ||
447 | config OCFS2_FS_O2CB | ||
448 | tristate "O2CB Kernelspace Clustering" | ||
449 | depends on OCFS2_FS | ||
450 | default y | ||
451 | help | ||
452 | OCFS2 includes a simple kernelspace clustering package, the OCFS2 | ||
453 | Cluster Base. It only requires a very small userspace component | ||
454 | to configure it. This comes with the standard ocfs2-tools package. | ||
455 | O2CB is limited to maintaining a cluster for OCFS2 file systems. | ||
456 | It cannot manage any other cluster applications. | ||
457 | |||
458 | It is always safe to say Y here, as the clustering method is | ||
459 | run-time selectable. | ||
460 | |||
461 | config OCFS2_FS_USERSPACE_CLUSTER | ||
462 | tristate "OCFS2 Userspace Clustering" | ||
463 | depends on OCFS2_FS && DLM | ||
464 | default y | ||
465 | help | ||
466 | This option will allow OCFS2 to use userspace clustering services | ||
467 | in conjunction with the DLM in fs/dlm. If you are using a | ||
468 | userspace cluster manager, say Y here. | ||
469 | |||
470 | It is safe to say Y, as the clustering method is run-time | ||
471 | selectable. | ||
472 | |||
447 | config OCFS2_DEBUG_MASKLOG | 473 | config OCFS2_DEBUG_MASKLOG |
448 | bool "OCFS2 logging support" | 474 | bool "OCFS2 logging support" |
449 | depends on OCFS2_FS | 475 | depends on OCFS2_FS |
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 4d4ce48bb42c..f6956de56fdb 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -2,7 +2,12 @@ EXTRA_CFLAGS += -Ifs/ocfs2 | |||
2 | 2 | ||
3 | EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES | 3 | EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES |
4 | 4 | ||
5 | obj-$(CONFIG_OCFS2_FS) += ocfs2.o | 5 | obj-$(CONFIG_OCFS2_FS) += \ |
6 | ocfs2.o \ | ||
7 | ocfs2_stackglue.o | ||
8 | |||
9 | obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o | ||
10 | obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o | ||
6 | 11 | ||
7 | ocfs2-objs := \ | 12 | ocfs2-objs := \ |
8 | alloc.o \ | 13 | alloc.o \ |
@@ -31,5 +36,10 @@ ocfs2-objs := \ | |||
31 | uptodate.o \ | 36 | uptodate.o \ |
32 | ver.o | 37 | ver.o |
33 | 38 | ||
39 | ocfs2_stackglue-objs := stackglue.o | ||
40 | ocfs2_stack_o2cb-objs := stack_o2cb.o | ||
41 | ocfs2_stack_user-objs := stack_user.o | ||
42 | |||
43 | # cluster/ is always needed when OCFS2_FS for masklog support | ||
34 | obj-$(CONFIG_OCFS2_FS) += cluster/ | 44 | obj-$(CONFIG_OCFS2_FS) += cluster/ |
35 | obj-$(CONFIG_OCFS2_FS) += dlm/ | 45 | obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/ |
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 447206eb5c2e..41f84c92094f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1029,8 +1029,7 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, | |||
1029 | BUG_ON(!next_free); | 1029 | BUG_ON(!next_free); |
1030 | 1030 | ||
1031 | /* The tree code before us didn't allow enough room in the leaf. */ | 1031 | /* The tree code before us didn't allow enough room in the leaf. */ |
1032 | if (el->l_next_free_rec == el->l_count && !has_empty) | 1032 | BUG_ON(el->l_next_free_rec == el->l_count && !has_empty); |
1033 | BUG(); | ||
1034 | 1033 | ||
1035 | /* | 1034 | /* |
1036 | * The easiest way to approach this is to just remove the | 1035 | * The easiest way to approach this is to just remove the |
@@ -1450,6 +1449,8 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, | |||
1450 | * - When our insert into the right path leaf is at the leftmost edge | 1449 | * - When our insert into the right path leaf is at the leftmost edge |
1451 | * and requires an update of the path immediately to it's left. This | 1450 | * and requires an update of the path immediately to it's left. This |
1452 | * can occur at the end of some types of rotation and appending inserts. | 1451 | * can occur at the end of some types of rotation and appending inserts. |
1452 | * - When we've adjusted the last extent record in the left path leaf and the | ||
1453 | * 1st extent record in the right path leaf during cross extent block merge. | ||
1453 | */ | 1454 | */ |
1454 | static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, | 1455 | static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, |
1455 | struct ocfs2_path *left_path, | 1456 | struct ocfs2_path *left_path, |
@@ -2712,24 +2713,147 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, | |||
2712 | } | 2713 | } |
2713 | } | 2714 | } |
2714 | 2715 | ||
2716 | static int ocfs2_get_right_path(struct inode *inode, | ||
2717 | struct ocfs2_path *left_path, | ||
2718 | struct ocfs2_path **ret_right_path) | ||
2719 | { | ||
2720 | int ret; | ||
2721 | u32 right_cpos; | ||
2722 | struct ocfs2_path *right_path = NULL; | ||
2723 | struct ocfs2_extent_list *left_el; | ||
2724 | |||
2725 | *ret_right_path = NULL; | ||
2726 | |||
2727 | /* This function shouldn't be called for non-trees. */ | ||
2728 | BUG_ON(left_path->p_tree_depth == 0); | ||
2729 | |||
2730 | left_el = path_leaf_el(left_path); | ||
2731 | BUG_ON(left_el->l_next_free_rec != left_el->l_count); | ||
2732 | |||
2733 | ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, | ||
2734 | &right_cpos); | ||
2735 | if (ret) { | ||
2736 | mlog_errno(ret); | ||
2737 | goto out; | ||
2738 | } | ||
2739 | |||
2740 | /* This function shouldn't be called for the rightmost leaf. */ | ||
2741 | BUG_ON(right_cpos == 0); | ||
2742 | |||
2743 | right_path = ocfs2_new_path(path_root_bh(left_path), | ||
2744 | path_root_el(left_path)); | ||
2745 | if (!right_path) { | ||
2746 | ret = -ENOMEM; | ||
2747 | mlog_errno(ret); | ||
2748 | goto out; | ||
2749 | } | ||
2750 | |||
2751 | ret = ocfs2_find_path(inode, right_path, right_cpos); | ||
2752 | if (ret) { | ||
2753 | mlog_errno(ret); | ||
2754 | goto out; | ||
2755 | } | ||
2756 | |||
2757 | *ret_right_path = right_path; | ||
2758 | out: | ||
2759 | if (ret) | ||
2760 | ocfs2_free_path(right_path); | ||
2761 | return ret; | ||
2762 | } | ||
2763 | |||
2715 | /* | 2764 | /* |
2716 | * Remove split_rec clusters from the record at index and merge them | 2765 | * Remove split_rec clusters from the record at index and merge them |
2717 | * onto the beginning of the record at index + 1. | 2766 | * onto the beginning of the record "next" to it. |
2767 | * For index < l_count - 1, the next means the extent rec at index + 1. | ||
2768 | * For index == l_count - 1, the "next" means the 1st extent rec of the | ||
2769 | * next extent block. | ||
2718 | */ | 2770 | */ |
2719 | static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, | 2771 | static int ocfs2_merge_rec_right(struct inode *inode, |
2720 | handle_t *handle, | 2772 | struct ocfs2_path *left_path, |
2721 | struct ocfs2_extent_rec *split_rec, | 2773 | handle_t *handle, |
2722 | struct ocfs2_extent_list *el, int index) | 2774 | struct ocfs2_extent_rec *split_rec, |
2775 | int index) | ||
2723 | { | 2776 | { |
2724 | int ret; | 2777 | int ret, next_free, i; |
2725 | unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); | 2778 | unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); |
2726 | struct ocfs2_extent_rec *left_rec; | 2779 | struct ocfs2_extent_rec *left_rec; |
2727 | struct ocfs2_extent_rec *right_rec; | 2780 | struct ocfs2_extent_rec *right_rec; |
2781 | struct ocfs2_extent_list *right_el; | ||
2782 | struct ocfs2_path *right_path = NULL; | ||
2783 | int subtree_index = 0; | ||
2784 | struct ocfs2_extent_list *el = path_leaf_el(left_path); | ||
2785 | struct buffer_head *bh = path_leaf_bh(left_path); | ||
2786 | struct buffer_head *root_bh = NULL; | ||
2728 | 2787 | ||
2729 | BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); | 2788 | BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); |
2730 | |||
2731 | left_rec = &el->l_recs[index]; | 2789 | left_rec = &el->l_recs[index]; |
2732 | right_rec = &el->l_recs[index + 1]; | 2790 | |
2791 | if (index == le16_to_cpu(el->l_next_free_rec - 1) && | ||
2792 | le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { | ||
2793 | /* we meet with a cross extent block merge. */ | ||
2794 | ret = ocfs2_get_right_path(inode, left_path, &right_path); | ||
2795 | if (ret) { | ||
2796 | mlog_errno(ret); | ||
2797 | goto out; | ||
2798 | } | ||
2799 | |||
2800 | right_el = path_leaf_el(right_path); | ||
2801 | next_free = le16_to_cpu(right_el->l_next_free_rec); | ||
2802 | BUG_ON(next_free <= 0); | ||
2803 | right_rec = &right_el->l_recs[0]; | ||
2804 | if (ocfs2_is_empty_extent(right_rec)) { | ||
2805 | BUG_ON(le16_to_cpu(next_free) <= 1); | ||
2806 | right_rec = &right_el->l_recs[1]; | ||
2807 | } | ||
2808 | |||
2809 | BUG_ON(le32_to_cpu(left_rec->e_cpos) + | ||
2810 | le16_to_cpu(left_rec->e_leaf_clusters) != | ||
2811 | le32_to_cpu(right_rec->e_cpos)); | ||
2812 | |||
2813 | subtree_index = ocfs2_find_subtree_root(inode, | ||
2814 | left_path, right_path); | ||
2815 | |||
2816 | ret = ocfs2_extend_rotate_transaction(handle, subtree_index, | ||
2817 | handle->h_buffer_credits, | ||
2818 | right_path); | ||
2819 | if (ret) { | ||
2820 | mlog_errno(ret); | ||
2821 | goto out; | ||
2822 | } | ||
2823 | |||
2824 | root_bh = left_path->p_node[subtree_index].bh; | ||
2825 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); | ||
2826 | |||
2827 | ret = ocfs2_journal_access(handle, inode, root_bh, | ||
2828 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2829 | if (ret) { | ||
2830 | mlog_errno(ret); | ||
2831 | goto out; | ||
2832 | } | ||
2833 | |||
2834 | for (i = subtree_index + 1; | ||
2835 | i < path_num_items(right_path); i++) { | ||
2836 | ret = ocfs2_journal_access(handle, inode, | ||
2837 | right_path->p_node[i].bh, | ||
2838 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2839 | if (ret) { | ||
2840 | mlog_errno(ret); | ||
2841 | goto out; | ||
2842 | } | ||
2843 | |||
2844 | ret = ocfs2_journal_access(handle, inode, | ||
2845 | left_path->p_node[i].bh, | ||
2846 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
2847 | if (ret) { | ||
2848 | mlog_errno(ret); | ||
2849 | goto out; | ||
2850 | } | ||
2851 | } | ||
2852 | |||
2853 | } else { | ||
2854 | BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1); | ||
2855 | right_rec = &el->l_recs[index + 1]; | ||
2856 | } | ||
2733 | 2857 | ||
2734 | ret = ocfs2_journal_access(handle, inode, bh, | 2858 | ret = ocfs2_journal_access(handle, inode, bh, |
2735 | OCFS2_JOURNAL_ACCESS_WRITE); | 2859 | OCFS2_JOURNAL_ACCESS_WRITE); |
@@ -2751,30 +2875,156 @@ static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, | |||
2751 | if (ret) | 2875 | if (ret) |
2752 | mlog_errno(ret); | 2876 | mlog_errno(ret); |
2753 | 2877 | ||
2878 | if (right_path) { | ||
2879 | ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); | ||
2880 | if (ret) | ||
2881 | mlog_errno(ret); | ||
2882 | |||
2883 | ocfs2_complete_edge_insert(inode, handle, left_path, | ||
2884 | right_path, subtree_index); | ||
2885 | } | ||
2886 | out: | ||
2887 | if (right_path) | ||
2888 | ocfs2_free_path(right_path); | ||
2889 | return ret; | ||
2890 | } | ||
2891 | |||
2892 | static int ocfs2_get_left_path(struct inode *inode, | ||
2893 | struct ocfs2_path *right_path, | ||
2894 | struct ocfs2_path **ret_left_path) | ||
2895 | { | ||
2896 | int ret; | ||
2897 | u32 left_cpos; | ||
2898 | struct ocfs2_path *left_path = NULL; | ||
2899 | |||
2900 | *ret_left_path = NULL; | ||
2901 | |||
2902 | /* This function shouldn't be called for non-trees. */ | ||
2903 | BUG_ON(right_path->p_tree_depth == 0); | ||
2904 | |||
2905 | ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, | ||
2906 | right_path, &left_cpos); | ||
2907 | if (ret) { | ||
2908 | mlog_errno(ret); | ||
2909 | goto out; | ||
2910 | } | ||
2911 | |||
2912 | /* This function shouldn't be called for the leftmost leaf. */ | ||
2913 | BUG_ON(left_cpos == 0); | ||
2914 | |||
2915 | left_path = ocfs2_new_path(path_root_bh(right_path), | ||
2916 | path_root_el(right_path)); | ||
2917 | if (!left_path) { | ||
2918 | ret = -ENOMEM; | ||
2919 | mlog_errno(ret); | ||
2920 | goto out; | ||
2921 | } | ||
2922 | |||
2923 | ret = ocfs2_find_path(inode, left_path, left_cpos); | ||
2924 | if (ret) { | ||
2925 | mlog_errno(ret); | ||
2926 | goto out; | ||
2927 | } | ||
2928 | |||
2929 | *ret_left_path = left_path; | ||
2754 | out: | 2930 | out: |
2931 | if (ret) | ||
2932 | ocfs2_free_path(left_path); | ||
2755 | return ret; | 2933 | return ret; |
2756 | } | 2934 | } |
2757 | 2935 | ||
2758 | /* | 2936 | /* |
2759 | * Remove split_rec clusters from the record at index and merge them | 2937 | * Remove split_rec clusters from the record at index and merge them |
2760 | * onto the tail of the record at index - 1. | 2938 | * onto the tail of the record "before" it. |
2939 | * For index > 0, the "before" means the extent rec at index - 1. | ||
2940 | * | ||
2941 | * For index == 0, the "before" means the last record of the previous | ||
2942 | * extent block. And there is also a situation that we may need to | ||
2943 | * remove the rightmost leaf extent block in the right_path and change | ||
2944 | * the right path to indicate the new rightmost path. | ||
2761 | */ | 2945 | */ |
2762 | static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, | 2946 | static int ocfs2_merge_rec_left(struct inode *inode, |
2947 | struct ocfs2_path *right_path, | ||
2763 | handle_t *handle, | 2948 | handle_t *handle, |
2764 | struct ocfs2_extent_rec *split_rec, | 2949 | struct ocfs2_extent_rec *split_rec, |
2765 | struct ocfs2_extent_list *el, int index) | 2950 | struct ocfs2_cached_dealloc_ctxt *dealloc, |
2951 | int index) | ||
2766 | { | 2952 | { |
2767 | int ret, has_empty_extent = 0; | 2953 | int ret, i, subtree_index = 0, has_empty_extent = 0; |
2768 | unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); | 2954 | unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); |
2769 | struct ocfs2_extent_rec *left_rec; | 2955 | struct ocfs2_extent_rec *left_rec; |
2770 | struct ocfs2_extent_rec *right_rec; | 2956 | struct ocfs2_extent_rec *right_rec; |
2957 | struct ocfs2_extent_list *el = path_leaf_el(right_path); | ||
2958 | struct buffer_head *bh = path_leaf_bh(right_path); | ||
2959 | struct buffer_head *root_bh = NULL; | ||
2960 | struct ocfs2_path *left_path = NULL; | ||
2961 | struct ocfs2_extent_list *left_el; | ||
2771 | 2962 | ||
2772 | BUG_ON(index <= 0); | 2963 | BUG_ON(index < 0); |
2773 | 2964 | ||
2774 | left_rec = &el->l_recs[index - 1]; | ||
2775 | right_rec = &el->l_recs[index]; | 2965 | right_rec = &el->l_recs[index]; |
2776 | if (ocfs2_is_empty_extent(&el->l_recs[0])) | 2966 | if (index == 0) { |
2777 | has_empty_extent = 1; | 2967 | /* we meet with a cross extent block merge. */ |
2968 | ret = ocfs2_get_left_path(inode, right_path, &left_path); | ||
2969 | if (ret) { | ||
2970 | mlog_errno(ret); | ||
2971 | goto out; | ||
2972 | } | ||
2973 | |||
2974 | left_el = path_leaf_el(left_path); | ||
2975 | BUG_ON(le16_to_cpu(left_el->l_next_free_rec) != | ||
2976 | le16_to_cpu(left_el->l_count)); | ||
2977 | |||
2978 | left_rec = &left_el->l_recs[ | ||
2979 | le16_to_cpu(left_el->l_next_free_rec) - 1]; | ||
2980 | BUG_ON(le32_to_cpu(left_rec->e_cpos) + | ||
2981 | le16_to_cpu(left_rec->e_leaf_clusters) != | ||
2982 | le32_to_cpu(split_rec->e_cpos)); | ||
2983 | |||
2984 | subtree_index = ocfs2_find_subtree_root(inode, | ||
2985 | left_path, right_path); | ||
2986 | |||
2987 | ret = ocfs2_extend_rotate_transaction(handle, subtree_index, | ||
2988 | handle->h_buffer_credits, | ||
2989 | left_path); | ||
2990 | if (ret) { | ||
2991 | mlog_errno(ret); | ||
2992 | goto out; | ||
2993 | } | ||
2994 | |||
2995 | root_bh = left_path->p_node[subtree_index].bh; | ||
2996 | BUG_ON(root_bh != right_path->p_node[subtree_index].bh); | ||
2997 | |||
2998 | ret = ocfs2_journal_access(handle, inode, root_bh, | ||
2999 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3000 | if (ret) { | ||
3001 | mlog_errno(ret); | ||
3002 | goto out; | ||
3003 | } | ||
3004 | |||
3005 | for (i = subtree_index + 1; | ||
3006 | i < path_num_items(right_path); i++) { | ||
3007 | ret = ocfs2_journal_access(handle, inode, | ||
3008 | right_path->p_node[i].bh, | ||
3009 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3010 | if (ret) { | ||
3011 | mlog_errno(ret); | ||
3012 | goto out; | ||
3013 | } | ||
3014 | |||
3015 | ret = ocfs2_journal_access(handle, inode, | ||
3016 | left_path->p_node[i].bh, | ||
3017 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
3018 | if (ret) { | ||
3019 | mlog_errno(ret); | ||
3020 | goto out; | ||
3021 | } | ||
3022 | } | ||
3023 | } else { | ||
3024 | left_rec = &el->l_recs[index - 1]; | ||
3025 | if (ocfs2_is_empty_extent(&el->l_recs[0])) | ||
3026 | has_empty_extent = 1; | ||
3027 | } | ||
2778 | 3028 | ||
2779 | ret = ocfs2_journal_access(handle, inode, bh, | 3029 | ret = ocfs2_journal_access(handle, inode, bh, |
2780 | OCFS2_JOURNAL_ACCESS_WRITE); | 3030 | OCFS2_JOURNAL_ACCESS_WRITE); |
@@ -2790,9 +3040,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, | |||
2790 | *left_rec = *split_rec; | 3040 | *left_rec = *split_rec; |
2791 | 3041 | ||
2792 | has_empty_extent = 0; | 3042 | has_empty_extent = 0; |
2793 | } else { | 3043 | } else |
2794 | le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); | 3044 | le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); |
2795 | } | ||
2796 | 3045 | ||
2797 | le32_add_cpu(&right_rec->e_cpos, split_clusters); | 3046 | le32_add_cpu(&right_rec->e_cpos, split_clusters); |
2798 | le64_add_cpu(&right_rec->e_blkno, | 3047 | le64_add_cpu(&right_rec->e_blkno, |
@@ -2805,13 +3054,44 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, | |||
2805 | if (ret) | 3054 | if (ret) |
2806 | mlog_errno(ret); | 3055 | mlog_errno(ret); |
2807 | 3056 | ||
3057 | if (left_path) { | ||
3058 | ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); | ||
3059 | if (ret) | ||
3060 | mlog_errno(ret); | ||
3061 | |||
3062 | /* | ||
3063 | * In the situation that the right_rec is empty and the extent | ||
3064 | * block is empty also, ocfs2_complete_edge_insert can't handle | ||
3065 | * it and we need to delete the right extent block. | ||
3066 | */ | ||
3067 | if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && | ||
3068 | le16_to_cpu(el->l_next_free_rec) == 1) { | ||
3069 | |||
3070 | ret = ocfs2_remove_rightmost_path(inode, handle, | ||
3071 | right_path, dealloc); | ||
3072 | if (ret) { | ||
3073 | mlog_errno(ret); | ||
3074 | goto out; | ||
3075 | } | ||
3076 | |||
3077 | /* Now the rightmost extent block has been deleted. | ||
3078 | * So we use the new rightmost path. | ||
3079 | */ | ||
3080 | ocfs2_mv_path(right_path, left_path); | ||
3081 | left_path = NULL; | ||
3082 | } else | ||
3083 | ocfs2_complete_edge_insert(inode, handle, left_path, | ||
3084 | right_path, subtree_index); | ||
3085 | } | ||
2808 | out: | 3086 | out: |
3087 | if (left_path) | ||
3088 | ocfs2_free_path(left_path); | ||
2809 | return ret; | 3089 | return ret; |
2810 | } | 3090 | } |
2811 | 3091 | ||
2812 | static int ocfs2_try_to_merge_extent(struct inode *inode, | 3092 | static int ocfs2_try_to_merge_extent(struct inode *inode, |
2813 | handle_t *handle, | 3093 | handle_t *handle, |
2814 | struct ocfs2_path *left_path, | 3094 | struct ocfs2_path *path, |
2815 | int split_index, | 3095 | int split_index, |
2816 | struct ocfs2_extent_rec *split_rec, | 3096 | struct ocfs2_extent_rec *split_rec, |
2817 | struct ocfs2_cached_dealloc_ctxt *dealloc, | 3097 | struct ocfs2_cached_dealloc_ctxt *dealloc, |
@@ -2819,7 +3099,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
2819 | 3099 | ||
2820 | { | 3100 | { |
2821 | int ret = 0; | 3101 | int ret = 0; |
2822 | struct ocfs2_extent_list *el = path_leaf_el(left_path); | 3102 | struct ocfs2_extent_list *el = path_leaf_el(path); |
2823 | struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; | 3103 | struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; |
2824 | 3104 | ||
2825 | BUG_ON(ctxt->c_contig_type == CONTIG_NONE); | 3105 | BUG_ON(ctxt->c_contig_type == CONTIG_NONE); |
@@ -2832,7 +3112,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
2832 | * extents - having more than one in a leaf is | 3112 | * extents - having more than one in a leaf is |
2833 | * illegal. | 3113 | * illegal. |
2834 | */ | 3114 | */ |
2835 | ret = ocfs2_rotate_tree_left(inode, handle, left_path, | 3115 | ret = ocfs2_rotate_tree_left(inode, handle, path, |
2836 | dealloc); | 3116 | dealloc); |
2837 | if (ret) { | 3117 | if (ret) { |
2838 | mlog_errno(ret); | 3118 | mlog_errno(ret); |
@@ -2847,7 +3127,6 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
2847 | * Left-right contig implies this. | 3127 | * Left-right contig implies this. |
2848 | */ | 3128 | */ |
2849 | BUG_ON(!ctxt->c_split_covers_rec); | 3129 | BUG_ON(!ctxt->c_split_covers_rec); |
2850 | BUG_ON(split_index == 0); | ||
2851 | 3130 | ||
2852 | /* | 3131 | /* |
2853 | * Since the leftright insert always covers the entire | 3132 | * Since the leftright insert always covers the entire |
@@ -2858,9 +3137,14 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
2858 | * Since the adding of an empty extent shifts | 3137 | * Since the adding of an empty extent shifts |
2859 | * everything back to the right, there's no need to | 3138 | * everything back to the right, there's no need to |
2860 | * update split_index here. | 3139 | * update split_index here. |
3140 | * | ||
3141 | * When the split_index is zero, we need to merge it to the | ||
3142 | * previous extent block. It is more efficient and easier | ||
3143 | * if we do merge_right first and merge_left later. | ||
2861 | */ | 3144 | */ |
2862 | ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path), | 3145 | ret = ocfs2_merge_rec_right(inode, path, |
2863 | handle, split_rec, el, split_index); | 3146 | handle, split_rec, |
3147 | split_index); | ||
2864 | if (ret) { | 3148 | if (ret) { |
2865 | mlog_errno(ret); | 3149 | mlog_errno(ret); |
2866 | goto out; | 3150 | goto out; |
@@ -2871,32 +3155,30 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
2871 | */ | 3155 | */ |
2872 | BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); | 3156 | BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); |
2873 | 3157 | ||
2874 | /* | 3158 | /* The merge left us with an empty extent, remove it. */ |
2875 | * The left merge left us with an empty extent, remove | 3159 | ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); |
2876 | * it. | ||
2877 | */ | ||
2878 | ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc); | ||
2879 | if (ret) { | 3160 | if (ret) { |
2880 | mlog_errno(ret); | 3161 | mlog_errno(ret); |
2881 | goto out; | 3162 | goto out; |
2882 | } | 3163 | } |
2883 | split_index--; | 3164 | |
2884 | rec = &el->l_recs[split_index]; | 3165 | rec = &el->l_recs[split_index]; |
2885 | 3166 | ||
2886 | /* | 3167 | /* |
2887 | * Note that we don't pass split_rec here on purpose - | 3168 | * Note that we don't pass split_rec here on purpose - |
2888 | * we've merged it into the left side. | 3169 | * we've merged it into the rec already. |
2889 | */ | 3170 | */ |
2890 | ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path), | 3171 | ret = ocfs2_merge_rec_left(inode, path, |
2891 | handle, rec, el, split_index); | 3172 | handle, rec, |
3173 | dealloc, | ||
3174 | split_index); | ||
3175 | |||
2892 | if (ret) { | 3176 | if (ret) { |
2893 | mlog_errno(ret); | 3177 | mlog_errno(ret); |
2894 | goto out; | 3178 | goto out; |
2895 | } | 3179 | } |
2896 | 3180 | ||
2897 | BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); | 3181 | ret = ocfs2_rotate_tree_left(inode, handle, path, |
2898 | |||
2899 | ret = ocfs2_rotate_tree_left(inode, handle, left_path, | ||
2900 | dealloc); | 3182 | dealloc); |
2901 | /* | 3183 | /* |
2902 | * Error from this last rotate is not critical, so | 3184 | * Error from this last rotate is not critical, so |
@@ -2915,8 +3197,9 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
2915 | */ | 3197 | */ |
2916 | if (ctxt->c_contig_type == CONTIG_RIGHT) { | 3198 | if (ctxt->c_contig_type == CONTIG_RIGHT) { |
2917 | ret = ocfs2_merge_rec_left(inode, | 3199 | ret = ocfs2_merge_rec_left(inode, |
2918 | path_leaf_bh(left_path), | 3200 | path, |
2919 | handle, split_rec, el, | 3201 | handle, split_rec, |
3202 | dealloc, | ||
2920 | split_index); | 3203 | split_index); |
2921 | if (ret) { | 3204 | if (ret) { |
2922 | mlog_errno(ret); | 3205 | mlog_errno(ret); |
@@ -2924,8 +3207,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
2924 | } | 3207 | } |
2925 | } else { | 3208 | } else { |
2926 | ret = ocfs2_merge_rec_right(inode, | 3209 | ret = ocfs2_merge_rec_right(inode, |
2927 | path_leaf_bh(left_path), | 3210 | path, |
2928 | handle, split_rec, el, | 3211 | handle, split_rec, |
2929 | split_index); | 3212 | split_index); |
2930 | if (ret) { | 3213 | if (ret) { |
2931 | mlog_errno(ret); | 3214 | mlog_errno(ret); |
@@ -2938,7 +3221,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, | |||
2938 | * The merge may have left an empty extent in | 3221 | * The merge may have left an empty extent in |
2939 | * our leaf. Try to rotate it away. | 3222 | * our leaf. Try to rotate it away. |
2940 | */ | 3223 | */ |
2941 | ret = ocfs2_rotate_tree_left(inode, handle, left_path, | 3224 | ret = ocfs2_rotate_tree_left(inode, handle, path, |
2942 | dealloc); | 3225 | dealloc); |
2943 | if (ret) | 3226 | if (ret) |
2944 | mlog_errno(ret); | 3227 | mlog_errno(ret); |
@@ -3498,20 +3781,57 @@ out: | |||
3498 | } | 3781 | } |
3499 | 3782 | ||
3500 | static enum ocfs2_contig_type | 3783 | static enum ocfs2_contig_type |
3501 | ocfs2_figure_merge_contig_type(struct inode *inode, | 3784 | ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, |
3502 | struct ocfs2_extent_list *el, int index, | 3785 | struct ocfs2_extent_list *el, int index, |
3503 | struct ocfs2_extent_rec *split_rec) | 3786 | struct ocfs2_extent_rec *split_rec) |
3504 | { | 3787 | { |
3505 | struct ocfs2_extent_rec *rec; | 3788 | int status; |
3506 | enum ocfs2_contig_type ret = CONTIG_NONE; | 3789 | enum ocfs2_contig_type ret = CONTIG_NONE; |
3790 | u32 left_cpos, right_cpos; | ||
3791 | struct ocfs2_extent_rec *rec = NULL; | ||
3792 | struct ocfs2_extent_list *new_el; | ||
3793 | struct ocfs2_path *left_path = NULL, *right_path = NULL; | ||
3794 | struct buffer_head *bh; | ||
3795 | struct ocfs2_extent_block *eb; | ||
3796 | |||
3797 | if (index > 0) { | ||
3798 | rec = &el->l_recs[index - 1]; | ||
3799 | } else if (path->p_tree_depth > 0) { | ||
3800 | status = ocfs2_find_cpos_for_left_leaf(inode->i_sb, | ||
3801 | path, &left_cpos); | ||
3802 | if (status) | ||
3803 | goto out; | ||
3804 | |||
3805 | if (left_cpos != 0) { | ||
3806 | left_path = ocfs2_new_path(path_root_bh(path), | ||
3807 | path_root_el(path)); | ||
3808 | if (!left_path) | ||
3809 | goto out; | ||
3810 | |||
3811 | status = ocfs2_find_path(inode, left_path, left_cpos); | ||
3812 | if (status) | ||
3813 | goto out; | ||
3814 | |||
3815 | new_el = path_leaf_el(left_path); | ||
3816 | |||
3817 | if (le16_to_cpu(new_el->l_next_free_rec) != | ||
3818 | le16_to_cpu(new_el->l_count)) { | ||
3819 | bh = path_leaf_bh(left_path); | ||
3820 | eb = (struct ocfs2_extent_block *)bh->b_data; | ||
3821 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, | ||
3822 | eb); | ||
3823 | goto out; | ||
3824 | } | ||
3825 | rec = &new_el->l_recs[ | ||
3826 | le16_to_cpu(new_el->l_next_free_rec) - 1]; | ||
3827 | } | ||
3828 | } | ||
3507 | 3829 | ||
3508 | /* | 3830 | /* |
3509 | * We're careful to check for an empty extent record here - | 3831 | * We're careful to check for an empty extent record here - |
3510 | * the merge code will know what to do if it sees one. | 3832 | * the merge code will know what to do if it sees one. |
3511 | */ | 3833 | */ |
3512 | 3834 | if (rec) { | |
3513 | if (index > 0) { | ||
3514 | rec = &el->l_recs[index - 1]; | ||
3515 | if (index == 1 && ocfs2_is_empty_extent(rec)) { | 3835 | if (index == 1 && ocfs2_is_empty_extent(rec)) { |
3516 | if (split_rec->e_cpos == el->l_recs[index].e_cpos) | 3836 | if (split_rec->e_cpos == el->l_recs[index].e_cpos) |
3517 | ret = CONTIG_RIGHT; | 3837 | ret = CONTIG_RIGHT; |
@@ -3520,10 +3840,45 @@ ocfs2_figure_merge_contig_type(struct inode *inode, | |||
3520 | } | 3840 | } |
3521 | } | 3841 | } |
3522 | 3842 | ||
3523 | if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) { | 3843 | rec = NULL; |
3844 | if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) | ||
3845 | rec = &el->l_recs[index + 1]; | ||
3846 | else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && | ||
3847 | path->p_tree_depth > 0) { | ||
3848 | status = ocfs2_find_cpos_for_right_leaf(inode->i_sb, | ||
3849 | path, &right_cpos); | ||
3850 | if (status) | ||
3851 | goto out; | ||
3852 | |||
3853 | if (right_cpos == 0) | ||
3854 | goto out; | ||
3855 | |||
3856 | right_path = ocfs2_new_path(path_root_bh(path), | ||
3857 | path_root_el(path)); | ||
3858 | if (!right_path) | ||
3859 | goto out; | ||
3860 | |||
3861 | status = ocfs2_find_path(inode, right_path, right_cpos); | ||
3862 | if (status) | ||
3863 | goto out; | ||
3864 | |||
3865 | new_el = path_leaf_el(right_path); | ||
3866 | rec = &new_el->l_recs[0]; | ||
3867 | if (ocfs2_is_empty_extent(rec)) { | ||
3868 | if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { | ||
3869 | bh = path_leaf_bh(right_path); | ||
3870 | eb = (struct ocfs2_extent_block *)bh->b_data; | ||
3871 | OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, | ||
3872 | eb); | ||
3873 | goto out; | ||
3874 | } | ||
3875 | rec = &new_el->l_recs[1]; | ||
3876 | } | ||
3877 | } | ||
3878 | |||
3879 | if (rec) { | ||
3524 | enum ocfs2_contig_type contig_type; | 3880 | enum ocfs2_contig_type contig_type; |
3525 | 3881 | ||
3526 | rec = &el->l_recs[index + 1]; | ||
3527 | contig_type = ocfs2_extent_contig(inode, rec, split_rec); | 3882 | contig_type = ocfs2_extent_contig(inode, rec, split_rec); |
3528 | 3883 | ||
3529 | if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) | 3884 | if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) |
@@ -3532,6 +3887,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, | |||
3532 | ret = contig_type; | 3887 | ret = contig_type; |
3533 | } | 3888 | } |
3534 | 3889 | ||
3890 | out: | ||
3891 | if (left_path) | ||
3892 | ocfs2_free_path(left_path); | ||
3893 | if (right_path) | ||
3894 | ocfs2_free_path(right_path); | ||
3895 | |||
3535 | return ret; | 3896 | return ret; |
3536 | } | 3897 | } |
3537 | 3898 | ||
@@ -3994,7 +4355,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, | |||
3994 | goto out; | 4355 | goto out; |
3995 | } | 4356 | } |
3996 | 4357 | ||
3997 | ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el, | 4358 | ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el, |
3998 | split_index, | 4359 | split_index, |
3999 | split_rec); | 4360 | split_rec); |
4000 | 4361 | ||
@@ -4788,6 +5149,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work) | |||
4788 | status = ocfs2_flush_truncate_log(osb); | 5149 | status = ocfs2_flush_truncate_log(osb); |
4789 | if (status < 0) | 5150 | if (status < 0) |
4790 | mlog_errno(status); | 5151 | mlog_errno(status); |
5152 | else | ||
5153 | ocfs2_init_inode_steal_slot(osb); | ||
4791 | 5154 | ||
4792 | mlog_exit(status); | 5155 | mlog_exit(status); |
4793 | } | 5156 | } |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 90383ed61005..17964c0505a9 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -467,11 +467,11 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, | |||
467 | unsigned to) | 467 | unsigned to) |
468 | { | 468 | { |
469 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 469 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
470 | handle_t *handle = NULL; | 470 | handle_t *handle; |
471 | int ret = 0; | 471 | int ret = 0; |
472 | 472 | ||
473 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 473 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
474 | if (!handle) { | 474 | if (IS_ERR(handle)) { |
475 | ret = -ENOMEM; | 475 | ret = -ENOMEM; |
476 | mlog_errno(ret); | 476 | mlog_errno(ret); |
477 | goto out; | 477 | goto out; |
@@ -487,7 +487,7 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, | |||
487 | } | 487 | } |
488 | out: | 488 | out: |
489 | if (ret) { | 489 | if (ret) { |
490 | if (handle) | 490 | if (!IS_ERR(handle)) |
491 | ocfs2_commit_trans(osb, handle); | 491 | ocfs2_commit_trans(osb, handle); |
492 | handle = ERR_PTR(ret); | 492 | handle = ERR_PTR(ret); |
493 | } | 493 | } |
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index cdd162f13650..bc8c5e7d8608 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@ | |||
1 | obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o | 1 | obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o |
2 | 2 | ||
3 | ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ | 3 | ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ |
4 | quorum.o tcp.o ver.o | 4 | quorum.o tcp.o netdebug.o ver.o |
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
new file mode 100644
index 000000000000..7bf3c0ea7bd9
--- /dev/null
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -0,0 +1,441 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * netdebug.c | ||
5 | * | ||
6 | * debug functionality for o2net | ||
7 | * | ||
8 | * Copyright (C) 2005, 2008 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation; either | ||
13 | * version 2 of the License, or (at your option) any later version. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | * | ||
20 | * You should have received a copy of the GNU General Public | ||
21 | * License along with this program; if not, write to the | ||
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
23 | * Boston, MA 021110-1307, USA. | ||
24 | * | ||
25 | */ | ||
26 | |||
27 | #ifdef CONFIG_DEBUG_FS | ||
28 | |||
29 | #include <linux/module.h> | ||
30 | #include <linux/types.h> | ||
31 | #include <linux/slab.h> | ||
32 | #include <linux/idr.h> | ||
33 | #include <linux/kref.h> | ||
34 | #include <linux/seq_file.h> | ||
35 | #include <linux/debugfs.h> | ||
36 | |||
37 | #include <linux/uaccess.h> | ||
38 | |||
39 | #include "tcp.h" | ||
40 | #include "nodemanager.h" | ||
41 | #define MLOG_MASK_PREFIX ML_TCP | ||
42 | #include "masklog.h" | ||
43 | |||
44 | #include "tcp_internal.h" | ||
45 | |||
46 | #define O2NET_DEBUG_DIR "o2net" | ||
47 | #define SC_DEBUG_NAME "sock_containers" | ||
48 | #define NST_DEBUG_NAME "send_tracking" | ||
49 | |||
50 | static struct dentry *o2net_dentry; | ||
51 | static struct dentry *sc_dentry; | ||
52 | static struct dentry *nst_dentry; | ||
53 | |||
54 | static DEFINE_SPINLOCK(o2net_debug_lock); | ||
55 | |||
56 | static LIST_HEAD(sock_containers); | ||
57 | static LIST_HEAD(send_tracking); | ||
58 | |||
59 | void o2net_debug_add_nst(struct o2net_send_tracking *nst) | ||
60 | { | ||
61 | spin_lock(&o2net_debug_lock); | ||
62 | list_add(&nst->st_net_debug_item, &send_tracking); | ||
63 | spin_unlock(&o2net_debug_lock); | ||
64 | } | ||
65 | |||
66 | void o2net_debug_del_nst(struct o2net_send_tracking *nst) | ||
67 | { | ||
68 | spin_lock(&o2net_debug_lock); | ||
69 | if (!list_empty(&nst->st_net_debug_item)) | ||
70 | list_del_init(&nst->st_net_debug_item); | ||
71 | spin_unlock(&o2net_debug_lock); | ||
72 | } | ||
73 | |||
74 | static struct o2net_send_tracking | ||
75 | *next_nst(struct o2net_send_tracking *nst_start) | ||
76 | { | ||
77 | struct o2net_send_tracking *nst, *ret = NULL; | ||
78 | |||
79 | assert_spin_locked(&o2net_debug_lock); | ||
80 | |||
81 | list_for_each_entry(nst, &nst_start->st_net_debug_item, | ||
82 | st_net_debug_item) { | ||
83 | /* discover the head of the list */ | ||
84 | if (&nst->st_net_debug_item == &send_tracking) | ||
85 | break; | ||
86 | |||
87 | /* use st_task to detect real nsts in the list */ | ||
88 | if (nst->st_task != NULL) { | ||
89 | ret = nst; | ||
90 | break; | ||
91 | } | ||
92 | } | ||
93 | |||
94 | return ret; | ||
95 | } | ||
96 | |||
97 | static void *nst_seq_start(struct seq_file *seq, loff_t *pos) | ||
98 | { | ||
99 | struct o2net_send_tracking *nst, *dummy_nst = seq->private; | ||
100 | |||
101 | spin_lock(&o2net_debug_lock); | ||
102 | nst = next_nst(dummy_nst); | ||
103 | spin_unlock(&o2net_debug_lock); | ||
104 | |||
105 | return nst; | ||
106 | } | ||
107 | |||
108 | static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
109 | { | ||
110 | struct o2net_send_tracking *nst, *dummy_nst = seq->private; | ||
111 | |||
112 | spin_lock(&o2net_debug_lock); | ||
113 | nst = next_nst(dummy_nst); | ||
114 | list_del_init(&dummy_nst->st_net_debug_item); | ||
115 | if (nst) | ||
116 | list_add(&dummy_nst->st_net_debug_item, | ||
117 | &nst->st_net_debug_item); | ||
118 | spin_unlock(&o2net_debug_lock); | ||
119 | |||
120 | return nst; /* unused, just needs to be null when done */ | ||
121 | } | ||
122 | |||
123 | static int nst_seq_show(struct seq_file *seq, void *v) | ||
124 | { | ||
125 | struct o2net_send_tracking *nst, *dummy_nst = seq->private; | ||
126 | |||
127 | spin_lock(&o2net_debug_lock); | ||
128 | nst = next_nst(dummy_nst); | ||
129 | |||
130 | if (nst != NULL) { | ||
131 | /* get_task_comm isn't exported. oh well. */ | ||
132 | seq_printf(seq, "%p:\n" | ||
133 | " pid: %lu\n" | ||
134 | " tgid: %lu\n" | ||
135 | " process name: %s\n" | ||
136 | " node: %u\n" | ||
137 | " sc: %p\n" | ||
138 | " message id: %d\n" | ||
139 | " message type: %u\n" | ||
140 | " message key: 0x%08x\n" | ||
141 | " sock acquiry: %lu.%lu\n" | ||
142 | " send start: %lu.%lu\n" | ||
143 | " wait start: %lu.%lu\n", | ||
144 | nst, (unsigned long)nst->st_task->pid, | ||
145 | (unsigned long)nst->st_task->tgid, | ||
146 | nst->st_task->comm, nst->st_node, | ||
147 | nst->st_sc, nst->st_id, nst->st_msg_type, | ||
148 | nst->st_msg_key, | ||
149 | nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec, | ||
150 | nst->st_send_time.tv_sec, nst->st_send_time.tv_usec, | ||
151 | nst->st_status_time.tv_sec, | ||
152 | nst->st_status_time.tv_usec); | ||
153 | } | ||
154 | |||
155 | spin_unlock(&o2net_debug_lock); | ||
156 | |||
157 | return 0; | ||
158 | } | ||
159 | |||
160 | static void nst_seq_stop(struct seq_file *seq, void *v) | ||
161 | { | ||
162 | } | ||
163 | |||
164 | static struct seq_operations nst_seq_ops = { | ||
165 | .start = nst_seq_start, | ||
166 | .next = nst_seq_next, | ||
167 | .stop = nst_seq_stop, | ||
168 | .show = nst_seq_show, | ||
169 | }; | ||
170 | |||
171 | static int nst_fop_open(struct inode *inode, struct file *file) | ||
172 | { | ||
173 | struct o2net_send_tracking *dummy_nst; | ||
174 | struct seq_file *seq; | ||
175 | int ret; | ||
176 | |||
177 | dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL); | ||
178 | if (dummy_nst == NULL) { | ||
179 | ret = -ENOMEM; | ||
180 | goto out; | ||
181 | } | ||
182 | dummy_nst->st_task = NULL; | ||
183 | |||
184 | ret = seq_open(file, &nst_seq_ops); | ||
185 | if (ret) | ||
186 | goto out; | ||
187 | |||
188 | seq = file->private_data; | ||
189 | seq->private = dummy_nst; | ||
190 | o2net_debug_add_nst(dummy_nst); | ||
191 | |||
192 | dummy_nst = NULL; | ||
193 | |||
194 | out: | ||
195 | kfree(dummy_nst); | ||
196 | return ret; | ||
197 | } | ||
198 | |||
199 | static int nst_fop_release(struct inode *inode, struct file *file) | ||
200 | { | ||
201 | struct seq_file *seq = file->private_data; | ||
202 | struct o2net_send_tracking *dummy_nst = seq->private; | ||
203 | |||
204 | o2net_debug_del_nst(dummy_nst); | ||
205 | return seq_release_private(inode, file); | ||
206 | } | ||
207 | |||
208 | static struct file_operations nst_seq_fops = { | ||
209 | .open = nst_fop_open, | ||
210 | .read = seq_read, | ||
211 | .llseek = seq_lseek, | ||
212 | .release = nst_fop_release, | ||
213 | }; | ||
214 | |||
215 | void o2net_debug_add_sc(struct o2net_sock_container *sc) | ||
216 | { | ||
217 | spin_lock(&o2net_debug_lock); | ||
218 | list_add(&sc->sc_net_debug_item, &sock_containers); | ||
219 | spin_unlock(&o2net_debug_lock); | ||
220 | } | ||
221 | |||
222 | void o2net_debug_del_sc(struct o2net_sock_container *sc) | ||
223 | { | ||
224 | spin_lock(&o2net_debug_lock); | ||
225 | list_del_init(&sc->sc_net_debug_item); | ||
226 | spin_unlock(&o2net_debug_lock); | ||
227 | } | ||
228 | |||
229 | static struct o2net_sock_container | ||
230 | *next_sc(struct o2net_sock_container *sc_start) | ||
231 | { | ||
232 | struct o2net_sock_container *sc, *ret = NULL; | ||
233 | |||
234 | assert_spin_locked(&o2net_debug_lock); | ||
235 | |||
236 | list_for_each_entry(sc, &sc_start->sc_net_debug_item, | ||
237 | sc_net_debug_item) { | ||
238 | /* discover the head of the list miscast as a sc */ | ||
239 | if (&sc->sc_net_debug_item == &sock_containers) | ||
240 | break; | ||
241 | |||
242 | /* use sc_page to detect real scs in the list */ | ||
243 | if (sc->sc_page != NULL) { | ||
244 | ret = sc; | ||
245 | break; | ||
246 | } | ||
247 | } | ||
248 | |||
249 | return ret; | ||
250 | } | ||
251 | |||
252 | static void *sc_seq_start(struct seq_file *seq, loff_t *pos) | ||
253 | { | ||
254 | struct o2net_sock_container *sc, *dummy_sc = seq->private; | ||
255 | |||
256 | spin_lock(&o2net_debug_lock); | ||
257 | sc = next_sc(dummy_sc); | ||
258 | spin_unlock(&o2net_debug_lock); | ||
259 | |||
260 | return sc; | ||
261 | } | ||
262 | |||
263 | static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) | ||
264 | { | ||
265 | struct o2net_sock_container *sc, *dummy_sc = seq->private; | ||
266 | |||
267 | spin_lock(&o2net_debug_lock); | ||
268 | sc = next_sc(dummy_sc); | ||
269 | list_del_init(&dummy_sc->sc_net_debug_item); | ||
270 | if (sc) | ||
271 | list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item); | ||
272 | spin_unlock(&o2net_debug_lock); | ||
273 | |||
274 | return sc; /* unused, just needs to be null when done */ | ||
275 | } | ||
276 | |||
277 | #define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec | ||
278 | |||
279 | static int sc_seq_show(struct seq_file *seq, void *v) | ||
280 | { | ||
281 | struct o2net_sock_container *sc, *dummy_sc = seq->private; | ||
282 | |||
283 | spin_lock(&o2net_debug_lock); | ||
284 | sc = next_sc(dummy_sc); | ||
285 | |||
286 | if (sc != NULL) { | ||
287 | struct inet_sock *inet = NULL; | ||
288 | |||
289 | __be32 saddr = 0, daddr = 0; | ||
290 | __be16 sport = 0, dport = 0; | ||
291 | |||
292 | if (sc->sc_sock) { | ||
293 | inet = inet_sk(sc->sc_sock->sk); | ||
294 | /* the stack's structs aren't sparse endian clean */ | ||
295 | saddr = (__force __be32)inet->saddr; | ||
296 | daddr = (__force __be32)inet->daddr; | ||
297 | sport = (__force __be16)inet->sport; | ||
298 | dport = (__force __be16)inet->dport; | ||
299 | } | ||
300 | |||
301 | /* XXX sigh, inet-> doesn't have sparse annotation so any | ||
302 | * use of it here generates a warning with -Wbitwise */ | ||
303 | seq_printf(seq, "%p:\n" | ||
304 | " krefs: %d\n" | ||
305 | " sock: %u.%u.%u.%u:%u -> " | ||
306 | "%u.%u.%u.%u:%u\n" | ||
307 | " remote node: %s\n" | ||
308 | " page off: %zu\n" | ||
309 | " handshake ok: %u\n" | ||
310 | " timer: %lu.%lu\n" | ||
311 | " data ready: %lu.%lu\n" | ||
312 | " advance start: %lu.%lu\n" | ||
313 | " advance stop: %lu.%lu\n" | ||
314 | " func start: %lu.%lu\n" | ||
315 | " func stop: %lu.%lu\n" | ||
316 | " func key: %u\n" | ||
317 | " func type: %u\n", | ||
318 | sc, | ||
319 | atomic_read(&sc->sc_kref.refcount), | ||
320 | NIPQUAD(saddr), inet ? ntohs(sport) : 0, | ||
321 | NIPQUAD(daddr), inet ? ntohs(dport) : 0, | ||
322 | sc->sc_node->nd_name, | ||
323 | sc->sc_page_off, | ||
324 | sc->sc_handshake_ok, | ||
325 | TV_SEC_USEC(sc->sc_tv_timer), | ||
326 | TV_SEC_USEC(sc->sc_tv_data_ready), | ||
327 | TV_SEC_USEC(sc->sc_tv_advance_start), | ||
328 | TV_SEC_USEC(sc->sc_tv_advance_stop), | ||
329 | TV_SEC_USEC(sc->sc_tv_func_start), | ||
330 | TV_SEC_USEC(sc->sc_tv_func_stop), | ||
331 | sc->sc_msg_key, | ||
332 | sc->sc_msg_type); | ||
333 | } | ||
334 | |||
335 | |||
336 | spin_unlock(&o2net_debug_lock); | ||
337 | |||
338 | return 0; | ||
339 | } | ||
340 | |||
341 | static void sc_seq_stop(struct seq_file *seq, void *v) | ||
342 | { | ||
343 | } | ||
344 | |||
345 | static struct seq_operations sc_seq_ops = { | ||
346 | .start = sc_seq_start, | ||
347 | .next = sc_seq_next, | ||
348 | .stop = sc_seq_stop, | ||
349 | .show = sc_seq_show, | ||
350 | }; | ||
351 | |||
352 | static int sc_fop_open(struct inode *inode, struct file *file) | ||
353 | { | ||
354 | struct o2net_sock_container *dummy_sc; | ||
355 | struct seq_file *seq; | ||
356 | int ret; | ||
357 | |||
358 | dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL); | ||
359 | if (dummy_sc == NULL) { | ||
360 | ret = -ENOMEM; | ||
361 | goto out; | ||
362 | } | ||
363 | dummy_sc->sc_page = NULL; | ||
364 | |||
365 | ret = seq_open(file, &sc_seq_ops); | ||
366 | if (ret) | ||
367 | goto out; | ||
368 | |||
369 | seq = file->private_data; | ||
370 | seq->private = dummy_sc; | ||
371 | o2net_debug_add_sc(dummy_sc); | ||
372 | |||
373 | dummy_sc = NULL; | ||
374 | |||
375 | out: | ||
376 | kfree(dummy_sc); | ||
377 | return ret; | ||
378 | } | ||
379 | |||
380 | static int sc_fop_release(struct inode *inode, struct file *file) | ||
381 | { | ||
382 | struct seq_file *seq = file->private_data; | ||
383 | struct o2net_sock_container *dummy_sc = seq->private; | ||
384 | |||
385 | o2net_debug_del_sc(dummy_sc); | ||
386 | return seq_release_private(inode, file); | ||
387 | } | ||
388 | |||
389 | static struct file_operations sc_seq_fops = { | ||
390 | .open = sc_fop_open, | ||
391 | .read = seq_read, | ||
392 | .llseek = seq_lseek, | ||
393 | .release = sc_fop_release, | ||
394 | }; | ||
395 | |||
396 | int o2net_debugfs_init(void) | ||
397 | { | ||
398 | o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); | ||
399 | if (!o2net_dentry) { | ||
400 | mlog_errno(-ENOMEM); | ||
401 | goto bail; | ||
402 | } | ||
403 | |||
404 | nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, | ||
405 | o2net_dentry, NULL, | ||
406 | &nst_seq_fops); | ||
407 | if (!nst_dentry) { | ||
408 | mlog_errno(-ENOMEM); | ||
409 | goto bail; | ||
410 | } | ||
411 | |||
412 | sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, | ||
413 | o2net_dentry, NULL, | ||
414 | &sc_seq_fops); | ||
415 | if (!sc_dentry) { | ||
416 | mlog_errno(-ENOMEM); | ||
417 | goto bail; | ||
418 | } | ||
419 | |||
420 | return 0; | ||
421 | bail: | ||
422 | if (sc_dentry) | ||
423 | debugfs_remove(sc_dentry); | ||
424 | if (nst_dentry) | ||
425 | debugfs_remove(nst_dentry); | ||
426 | if (o2net_dentry) | ||
427 | debugfs_remove(o2net_dentry); | ||
428 | return -ENOMEM; | ||
429 | } | ||
430 | |||
431 | void o2net_debugfs_exit(void) | ||
432 | { | ||
433 | if (sc_dentry) | ||
434 | debugfs_remove(sc_dentry); | ||
435 | if (nst_dentry) | ||
436 | debugfs_remove(nst_dentry); | ||
437 | if (o2net_dentry) | ||
438 | debugfs_remove(o2net_dentry); | ||
439 | } | ||
440 | |||
441 | #endif /* CONFIG_DEBUG_FS */ | ||
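The sc_fop_open()/sc_fop_release() pair above uses a common seq_file idiom: a placeholder sock container is allocated, linked into the debug list under o2net_debug_lock, and the iterator walks forward from it, so real containers can come and go while the file stays open. A minimal sketch of the same idiom with illustrative names (the example_* identifiers are not part of this patch):

    static int example_fop_open(struct inode *inode, struct file *file)
    {
            struct example_item *dummy;     /* placeholder list element */
            struct seq_file *seq;
            int ret;

            dummy = kzalloc(sizeof(*dummy), GFP_KERNEL);
            if (!dummy)
                    return -ENOMEM;

            ret = seq_open(file, &example_seq_ops); /* assumed seq_operations */
            if (ret) {
                    kfree(dummy);
                    return ret;
            }

            seq = file->private_data;
            seq->private = dummy;
            example_debug_add(dummy);       /* link placeholder under a lock */
            return 0;
    }

    static int example_fop_release(struct inode *inode, struct file *file)
    {
            struct seq_file *seq = file->private_data;

            example_debug_del(seq->private);        /* unlink the placeholder */
            return seq_release_private(inode, file); /* frees seq->private */
    }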
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 709fba25bf7e..cf9401e8cd0b 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c | |||
@@ -959,7 +959,10 @@ static int __init init_o2nm(void) | |||
959 | cluster_print_version(); | 959 | cluster_print_version(); |
960 | 960 | ||
961 | o2hb_init(); | 961 | o2hb_init(); |
962 | o2net_init(); | 962 | |
963 | ret = o2net_init(); | ||
964 | if (ret) | ||
965 | goto out; | ||
963 | 966 | ||
964 | ocfs2_table_header = register_sysctl_table(ocfs2_root_table); | 967 | ocfs2_table_header = register_sysctl_table(ocfs2_root_table); |
965 | if (!ocfs2_table_header) { | 968 | if (!ocfs2_table_header) { |
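The hunk above makes init_o2nm() honour o2net_init()'s return value instead of ignoring it; with the debugfs change in this series, o2net_init() can now fail with -ENOMEM. A minimal sketch of the resulting init shape, assuming the usual goto-based unwind (the later step and the labels are illustrative, not the actual function body):

    static int __init example_init(void)
    {
            int ret;

            ret = o2net_init();             /* may fail, e.g. -ENOMEM from debugfs */
            if (ret)
                    goto out;

            ret = example_register_sysctl();        /* hypothetical later step */
            if (ret)
                    goto out_o2net;

            return 0;

    out_o2net:
            o2net_exit();                   /* undo earlier steps in reverse order */
    out:
            return ret;
    }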
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index 0c095ce7723d..98429fd68499 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c | |||
@@ -57,6 +57,7 @@ static struct kset *o2cb_kset; | |||
57 | void o2cb_sys_shutdown(void) | 57 | void o2cb_sys_shutdown(void) |
58 | { | 58 | { |
59 | mlog_sys_shutdown(); | 59 | mlog_sys_shutdown(); |
60 | sysfs_remove_link(NULL, "o2cb"); | ||
60 | kset_unregister(o2cb_kset); | 61 | kset_unregister(o2cb_kset); |
61 | } | 62 | } |
62 | 63 | ||
@@ -68,6 +69,14 @@ int o2cb_sys_init(void) | |||
68 | if (!o2cb_kset) | 69 | if (!o2cb_kset) |
69 | return -ENOMEM; | 70 | return -ENOMEM; |
70 | 71 | ||
72 | /* | ||
73 | * Create this symlink for backwards compatibility with old | ||
74 | * versions of ocfs2-tools which look for things in /sys/o2cb. | ||
75 | */ | ||
76 | ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb"); | ||
77 | if (ret) | ||
78 | goto error; | ||
79 | |||
71 | ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); | 80 | ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); |
72 | if (ret) | 81 | if (ret) |
73 | goto error; | 82 | goto error; |
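With the compatibility link in place, /sys/o2cb and the kset's real location resolve to the same attributes, so old and new ocfs2-tools keep working during the transition. A small userspace-side sketch of the probe order a tool might use (illustrative only; assumes the kset itself lives under /sys/fs/o2cb):

    #include <stdio.h>
    #include <unistd.h>

    /* Illustrative: prefer the new location, fall back to the obsolete symlink. */
    static const char *o2cb_sysfs_dir(void)
    {
            if (access("/sys/fs/o2cb", F_OK) == 0)
                    return "/sys/fs/o2cb";
            if (access("/sys/o2cb", F_OK) == 0)     /* backwards-compat symlink */
                    return "/sys/o2cb";
            return NULL;                            /* o2cb not loaded */
    }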
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index b8057c51b205..1e44ad14881a 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -142,23 +142,65 @@ static void o2net_idle_timer(unsigned long data); | |||
142 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); | 142 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); |
143 | static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); | 143 | static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); |
144 | 144 | ||
145 | /* | 145 | static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, |
146 | * FIXME: These should use to_o2nm_cluster_from_node(), but we end up | 146 | u32 msgkey, struct task_struct *task, u8 node) |
147 | * losing our parent link to the cluster during shutdown. This can be | 147 | { |
148 | * solved by adding a pre-removal callback to configfs, or passing | 148 | #ifdef CONFIG_DEBUG_FS |
149 | * around the cluster with the node. -jeffm | 149 | INIT_LIST_HEAD(&nst->st_net_debug_item); |
150 | */ | 150 | nst->st_task = task; |
151 | static inline int o2net_reconnect_delay(struct o2nm_node *node) | 151 | nst->st_msg_type = msgtype; |
152 | nst->st_msg_key = msgkey; | ||
153 | nst->st_node = node; | ||
154 | #endif | ||
155 | } | ||
156 | |||
157 | static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) | ||
158 | { | ||
159 | #ifdef CONFIG_DEBUG_FS | ||
160 | do_gettimeofday(&nst->st_sock_time); | ||
161 | #endif | ||
162 | } | ||
163 | |||
164 | static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) | ||
165 | { | ||
166 | #ifdef CONFIG_DEBUG_FS | ||
167 | do_gettimeofday(&nst->st_send_time); | ||
168 | #endif | ||
169 | } | ||
170 | |||
171 | static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) | ||
172 | { | ||
173 | #ifdef CONFIG_DEBUG_FS | ||
174 | do_gettimeofday(&nst->st_status_time); | ||
175 | #endif | ||
176 | } | ||
177 | |||
178 | static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, | ||
179 | struct o2net_sock_container *sc) | ||
180 | { | ||
181 | #ifdef CONFIG_DEBUG_FS | ||
182 | nst->st_sc = sc; | ||
183 | #endif | ||
184 | } | ||
185 | |||
186 | static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) | ||
187 | { | ||
188 | #ifdef CONFIG_DEBUG_FS | ||
189 | nst->st_id = msg_id; | ||
190 | #endif | ||
191 | } | ||
192 | |||
193 | static inline int o2net_reconnect_delay(void) | ||
152 | { | 194 | { |
153 | return o2nm_single_cluster->cl_reconnect_delay_ms; | 195 | return o2nm_single_cluster->cl_reconnect_delay_ms; |
154 | } | 196 | } |
155 | 197 | ||
156 | static inline int o2net_keepalive_delay(struct o2nm_node *node) | 198 | static inline int o2net_keepalive_delay(void) |
157 | { | 199 | { |
158 | return o2nm_single_cluster->cl_keepalive_delay_ms; | 200 | return o2nm_single_cluster->cl_keepalive_delay_ms; |
159 | } | 201 | } |
160 | 202 | ||
161 | static inline int o2net_idle_timeout(struct o2nm_node *node) | 203 | static inline int o2net_idle_timeout(void) |
162 | { | 204 | { |
163 | return o2nm_single_cluster->cl_idle_timeout_ms; | 205 | return o2nm_single_cluster->cl_idle_timeout_ms; |
164 | } | 206 | } |
@@ -296,6 +338,7 @@ static void sc_kref_release(struct kref *kref) | |||
296 | o2nm_node_put(sc->sc_node); | 338 | o2nm_node_put(sc->sc_node); |
297 | sc->sc_node = NULL; | 339 | sc->sc_node = NULL; |
298 | 340 | ||
341 | o2net_debug_del_sc(sc); | ||
299 | kfree(sc); | 342 | kfree(sc); |
300 | } | 343 | } |
301 | 344 | ||
@@ -336,6 +379,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) | |||
336 | 379 | ||
337 | ret = sc; | 380 | ret = sc; |
338 | sc->sc_page = page; | 381 | sc->sc_page = page; |
382 | o2net_debug_add_sc(sc); | ||
339 | sc = NULL; | 383 | sc = NULL; |
340 | page = NULL; | 384 | page = NULL; |
341 | 385 | ||
@@ -399,8 +443,6 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
399 | mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); | 443 | mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); |
400 | mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); | 444 | mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); |
401 | 445 | ||
402 | /* we won't reconnect after our valid conn goes away for | ||
403 | * this hb iteration.. here so it shows up in the logs */ | ||
404 | if (was_valid && !valid && err == 0) | 446 | if (was_valid && !valid && err == 0) |
405 | err = -ENOTCONN; | 447 | err = -ENOTCONN; |
406 | 448 | ||
@@ -430,11 +472,6 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
430 | 472 | ||
431 | if (!was_valid && valid) { | 473 | if (!was_valid && valid) { |
432 | o2quo_conn_up(o2net_num_from_nn(nn)); | 474 | o2quo_conn_up(o2net_num_from_nn(nn)); |
433 | /* this is a bit of a hack. we only try reconnecting | ||
434 | * when heartbeating starts until we get a connection. | ||
435 | * if that connection then dies we don't try reconnecting. | ||
436 | * the only way to start connecting again is to down | ||
437 | * heartbeat and bring it back up. */ | ||
438 | cancel_delayed_work(&nn->nn_connect_expired); | 475 | cancel_delayed_work(&nn->nn_connect_expired); |
439 | printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", | 476 | printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", |
440 | o2nm_this_node() > sc->sc_node->nd_num ? | 477 | o2nm_this_node() > sc->sc_node->nd_num ? |
@@ -451,12 +488,24 @@ static void o2net_set_nn_state(struct o2net_node *nn, | |||
451 | /* delay if we're within a RECONNECT_DELAY of the | 488 | /* delay if we're within a RECONNECT_DELAY of the |
452 | * last attempt */ | 489 | * last attempt */ |
453 | delay = (nn->nn_last_connect_attempt + | 490 | delay = (nn->nn_last_connect_attempt + |
454 | msecs_to_jiffies(o2net_reconnect_delay(NULL))) | 491 | msecs_to_jiffies(o2net_reconnect_delay())) |
455 | - jiffies; | 492 | - jiffies; |
456 | if (delay > msecs_to_jiffies(o2net_reconnect_delay(NULL))) | 493 | if (delay > msecs_to_jiffies(o2net_reconnect_delay())) |
457 | delay = 0; | 494 | delay = 0; |
458 | mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); | 495 | mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); |
459 | queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); | 496 | queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); |
497 | |||
498 | /* | ||
499 | * Delay the expired work after idle timeout. | ||
500 | * | ||
501 | * We might have lots of failed connection attempts that run | ||
502 | * through here but we only cancel the connect_expired work when | ||
503 | * a connection attempt succeeds. So only the first enqueue of | ||
504 | * the connect_expired work will do anything. The rest will see | ||
505 | * that it's already queued and do nothing. | ||
506 | */ | ||
507 | delay += msecs_to_jiffies(o2net_idle_timeout()); | ||
508 | queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay); | ||
460 | } | 509 | } |
461 | 510 | ||
462 | /* keep track of the nn's sc ref for the caller */ | 511 | /* keep track of the nn's sc ref for the caller */ |
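The new comment leans on a property of queue_delayed_work(): it does nothing and returns false when the work item is already pending, so every failed connect attempt can call it and only the first call actually arms the connect_expired deadline. A minimal sketch of that idempotent-arming pattern (names are illustrative):

    /*
     * Illustrative: calling this from every retry only arms the deadline once,
     * because queue_delayed_work() refuses work that is already pending.
     */
    static void example_arm_deadline(struct workqueue_struct *wq,
                                     struct delayed_work *expired,
                                     unsigned long delay)
    {
            if (!queue_delayed_work(wq, expired, delay))
                    mlog(0, "deadline already armed, leaving it alone\n");
    }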
@@ -914,6 +963,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
914 | struct o2net_status_wait nsw = { | 963 | struct o2net_status_wait nsw = { |
915 | .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), | 964 | .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), |
916 | }; | 965 | }; |
966 | struct o2net_send_tracking nst; | ||
967 | |||
968 | o2net_init_nst(&nst, msg_type, key, current, target_node); | ||
917 | 969 | ||
918 | if (o2net_wq == NULL) { | 970 | if (o2net_wq == NULL) { |
919 | mlog(0, "attempt to tx without o2netd running\n"); | 971 | mlog(0, "attempt to tx without o2netd running\n"); |
@@ -939,6 +991,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
939 | goto out; | 991 | goto out; |
940 | } | 992 | } |
941 | 993 | ||
994 | o2net_debug_add_nst(&nst); | ||
995 | |||
996 | o2net_set_nst_sock_time(&nst); | ||
997 | |||
942 | ret = wait_event_interruptible(nn->nn_sc_wq, | 998 | ret = wait_event_interruptible(nn->nn_sc_wq, |
943 | o2net_tx_can_proceed(nn, &sc, &error)); | 999 | o2net_tx_can_proceed(nn, &sc, &error)); |
944 | if (!ret && error) | 1000 | if (!ret && error) |
@@ -946,6 +1002,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
946 | if (ret) | 1002 | if (ret) |
947 | goto out; | 1003 | goto out; |
948 | 1004 | ||
1005 | o2net_set_nst_sock_container(&nst, sc); | ||
1006 | |||
949 | veclen = caller_veclen + 1; | 1007 | veclen = caller_veclen + 1; |
950 | vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); | 1008 | vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); |
951 | if (vec == NULL) { | 1009 | if (vec == NULL) { |
@@ -972,6 +1030,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
972 | goto out; | 1030 | goto out; |
973 | 1031 | ||
974 | msg->msg_num = cpu_to_be32(nsw.ns_id); | 1032 | msg->msg_num = cpu_to_be32(nsw.ns_id); |
1033 | o2net_set_nst_msg_id(&nst, nsw.ns_id); | ||
1034 | |||
1035 | o2net_set_nst_send_time(&nst); | ||
975 | 1036 | ||
976 | /* finally, convert the message header to network byte-order | 1037 | /* finally, convert the message header to network byte-order |
977 | * and send */ | 1038 | * and send */ |
@@ -986,6 +1047,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
986 | } | 1047 | } |
987 | 1048 | ||
988 | /* wait on other node's handler */ | 1049 | /* wait on other node's handler */ |
1050 | o2net_set_nst_status_time(&nst); | ||
989 | wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); | 1051 | wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); |
990 | 1052 | ||
991 | /* Note that we avoid overwriting the callers status return | 1053 | /* Note that we avoid overwriting the callers status return |
@@ -998,6 +1060,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, | |||
998 | mlog(0, "woken, returning system status %d, user status %d\n", | 1060 | mlog(0, "woken, returning system status %d, user status %d\n", |
999 | ret, nsw.ns_status); | 1061 | ret, nsw.ns_status); |
1000 | out: | 1062 | out: |
1063 | o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ | ||
1001 | if (sc) | 1064 | if (sc) |
1002 | sc_put(sc); | 1065 | sc_put(sc); |
1003 | if (vec) | 1066 | if (vec) |
@@ -1154,23 +1217,23 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) | |||
1154 | * but isn't. This can ultimately cause corruption. | 1217 | * but isn't. This can ultimately cause corruption. |
1155 | */ | 1218 | */ |
1156 | if (be32_to_cpu(hand->o2net_idle_timeout_ms) != | 1219 | if (be32_to_cpu(hand->o2net_idle_timeout_ms) != |
1157 | o2net_idle_timeout(sc->sc_node)) { | 1220 | o2net_idle_timeout()) { |
1158 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " | 1221 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " |
1159 | "%u ms, but we use %u ms locally. disconnecting\n", | 1222 | "%u ms, but we use %u ms locally. disconnecting\n", |
1160 | SC_NODEF_ARGS(sc), | 1223 | SC_NODEF_ARGS(sc), |
1161 | be32_to_cpu(hand->o2net_idle_timeout_ms), | 1224 | be32_to_cpu(hand->o2net_idle_timeout_ms), |
1162 | o2net_idle_timeout(sc->sc_node)); | 1225 | o2net_idle_timeout()); |
1163 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1226 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
1164 | return -1; | 1227 | return -1; |
1165 | } | 1228 | } |
1166 | 1229 | ||
1167 | if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != | 1230 | if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != |
1168 | o2net_keepalive_delay(sc->sc_node)) { | 1231 | o2net_keepalive_delay()) { |
1169 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " | 1232 | mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " |
1170 | "%u ms, but we use %u ms locally. disconnecting\n", | 1233 | "%u ms, but we use %u ms locally. disconnecting\n", |
1171 | SC_NODEF_ARGS(sc), | 1234 | SC_NODEF_ARGS(sc), |
1172 | be32_to_cpu(hand->o2net_keepalive_delay_ms), | 1235 | be32_to_cpu(hand->o2net_keepalive_delay_ms), |
1173 | o2net_keepalive_delay(sc->sc_node)); | 1236 | o2net_keepalive_delay()); |
1174 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); | 1237 | o2net_ensure_shutdown(nn, sc, -ENOTCONN); |
1175 | return -1; | 1238 | return -1; |
1176 | } | 1239 | } |
@@ -1193,6 +1256,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc) | |||
1193 | * shut down already */ | 1256 | * shut down already */ |
1194 | if (nn->nn_sc == sc) { | 1257 | if (nn->nn_sc == sc) { |
1195 | o2net_sc_reset_idle_timer(sc); | 1258 | o2net_sc_reset_idle_timer(sc); |
1259 | atomic_set(&nn->nn_timeout, 0); | ||
1196 | o2net_set_nn_state(nn, sc, 1, 0); | 1260 | o2net_set_nn_state(nn, sc, 1, 0); |
1197 | } | 1261 | } |
1198 | spin_unlock(&nn->nn_lock); | 1262 | spin_unlock(&nn->nn_lock); |
@@ -1347,12 +1411,11 @@ static void o2net_initialize_handshake(void) | |||
1347 | { | 1411 | { |
1348 | o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( | 1412 | o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( |
1349 | O2HB_MAX_WRITE_TIMEOUT_MS); | 1413 | O2HB_MAX_WRITE_TIMEOUT_MS); |
1350 | o2net_hand->o2net_idle_timeout_ms = cpu_to_be32( | 1414 | o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout()); |
1351 | o2net_idle_timeout(NULL)); | ||
1352 | o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( | 1415 | o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( |
1353 | o2net_keepalive_delay(NULL)); | 1416 | o2net_keepalive_delay()); |
1354 | o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( | 1417 | o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( |
1355 | o2net_reconnect_delay(NULL)); | 1418 | o2net_reconnect_delay()); |
1356 | } | 1419 | } |
1357 | 1420 | ||
1358 | /* ------------------------------------------------------------ */ | 1421 | /* ------------------------------------------------------------ */ |
@@ -1391,14 +1454,15 @@ static void o2net_sc_send_keep_req(struct work_struct *work) | |||
1391 | static void o2net_idle_timer(unsigned long data) | 1454 | static void o2net_idle_timer(unsigned long data) |
1392 | { | 1455 | { |
1393 | struct o2net_sock_container *sc = (struct o2net_sock_container *)data; | 1456 | struct o2net_sock_container *sc = (struct o2net_sock_container *)data; |
1457 | struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); | ||
1394 | struct timeval now; | 1458 | struct timeval now; |
1395 | 1459 | ||
1396 | do_gettimeofday(&now); | 1460 | do_gettimeofday(&now); |
1397 | 1461 | ||
1398 | printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " | 1462 | printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " |
1399 | "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), | 1463 | "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), |
1400 | o2net_idle_timeout(sc->sc_node) / 1000, | 1464 | o2net_idle_timeout() / 1000, |
1401 | o2net_idle_timeout(sc->sc_node) % 1000); | 1465 | o2net_idle_timeout() % 1000); |
1402 | mlog(ML_NOTICE, "here are some times that might help debug the " | 1466 | mlog(ML_NOTICE, "here are some times that might help debug the " |
1403 | "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " | 1467 | "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " |
1404 | "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", | 1468 | "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", |
@@ -1413,6 +1477,12 @@ static void o2net_idle_timer(unsigned long data) | |||
1413 | sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, | 1477 | sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, |
1414 | sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); | 1478 | sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); |
1415 | 1479 | ||
1480 | /* | ||
1481 | * Initialize the nn_timeout so that the next connection attempt | ||
1482 | * will continue in o2net_start_connect. | ||
1483 | */ | ||
1484 | atomic_set(&nn->nn_timeout, 1); | ||
1485 | |||
1416 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); | 1486 | o2net_sc_queue_work(sc, &sc->sc_shutdown_work); |
1417 | } | 1487 | } |
1418 | 1488 | ||
@@ -1420,10 +1490,10 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) | |||
1420 | { | 1490 | { |
1421 | o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); | 1491 | o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); |
1422 | o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, | 1492 | o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, |
1423 | msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node))); | 1493 | msecs_to_jiffies(o2net_keepalive_delay())); |
1424 | do_gettimeofday(&sc->sc_tv_timer); | 1494 | do_gettimeofday(&sc->sc_tv_timer); |
1425 | mod_timer(&sc->sc_idle_timeout, | 1495 | mod_timer(&sc->sc_idle_timeout, |
1426 | jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node))); | 1496 | jiffies + msecs_to_jiffies(o2net_idle_timeout())); |
1427 | } | 1497 | } |
1428 | 1498 | ||
1429 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) | 1499 | static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) |
@@ -1447,6 +1517,7 @@ static void o2net_start_connect(struct work_struct *work) | |||
1447 | struct socket *sock = NULL; | 1517 | struct socket *sock = NULL; |
1448 | struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; | 1518 | struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; |
1449 | int ret = 0, stop; | 1519 | int ret = 0, stop; |
1520 | unsigned int timeout; | ||
1450 | 1521 | ||
1451 | /* if we're greater we initiate tx, otherwise we accept */ | 1522 | /* if we're greater we initiate tx, otherwise we accept */ |
1452 | if (o2nm_this_node() <= o2net_num_from_nn(nn)) | 1523 | if (o2nm_this_node() <= o2net_num_from_nn(nn)) |
@@ -1466,8 +1537,17 @@ static void o2net_start_connect(struct work_struct *work) | |||
1466 | } | 1537 | } |
1467 | 1538 | ||
1468 | spin_lock(&nn->nn_lock); | 1539 | spin_lock(&nn->nn_lock); |
1469 | /* see if we already have one pending or have given up */ | 1540 | /* |
1470 | stop = (nn->nn_sc || nn->nn_persistent_error); | 1541 | * see if we already have one pending or have given up. |
1542 | * For nn_timeout, it is set when we close the connection | ||
1543 | * because of the idle timeout. So it means that we have | ||
1544 | * at least connected to that node successfully once; | ||
1545 | * now try to connect to it again. | ||
1546 | */ | ||
1547 | timeout = atomic_read(&nn->nn_timeout); | ||
1548 | stop = (nn->nn_sc || | ||
1549 | (nn->nn_persistent_error && | ||
1550 | (nn->nn_persistent_error != -ENOTCONN || timeout == 0))); | ||
1471 | spin_unlock(&nn->nn_lock); | 1551 | spin_unlock(&nn->nn_lock); |
1472 | if (stop) | 1552 | if (stop) |
1473 | goto out; | 1553 | goto out; |
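Spelled out, the reworked stop test in o2net_start_connect() only lets a connect proceed past a persistent error when that error is -ENOTCONN and nn_timeout is set, i.e. when a previously working connection was closed by the idle timer. The same predicate with its cases annotated (this restates the hunk above, nothing new):

    /*
     *   nn_sc already set                             -> stop (attempt pending)
     *   no persistent error                           -> connect (first attempt)
     *   persistent error == -ENOTCONN && nn_timeout   -> connect (reconnect
     *                                                     after idle timeout)
     *   any other persistent error                    -> stop
     */
    stop = (nn->nn_sc ||
            (nn->nn_persistent_error &&
             (nn->nn_persistent_error != -ENOTCONN || timeout == 0)));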
@@ -1555,8 +1635,8 @@ static void o2net_connect_expired(struct work_struct *work) | |||
1555 | mlog(ML_ERROR, "no connection established with node %u after " | 1635 | mlog(ML_ERROR, "no connection established with node %u after " |
1556 | "%u.%u seconds, giving up and returning errors.\n", | 1636 | "%u.%u seconds, giving up and returning errors.\n", |
1557 | o2net_num_from_nn(nn), | 1637 | o2net_num_from_nn(nn), |
1558 | o2net_idle_timeout(NULL) / 1000, | 1638 | o2net_idle_timeout() / 1000, |
1559 | o2net_idle_timeout(NULL) % 1000); | 1639 | o2net_idle_timeout() % 1000); |
1560 | 1640 | ||
1561 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); | 1641 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); |
1562 | } | 1642 | } |
@@ -1579,6 +1659,7 @@ void o2net_disconnect_node(struct o2nm_node *node) | |||
1579 | 1659 | ||
1580 | /* don't reconnect until it's heartbeating again */ | 1660 | /* don't reconnect until it's heartbeating again */ |
1581 | spin_lock(&nn->nn_lock); | 1661 | spin_lock(&nn->nn_lock); |
1662 | atomic_set(&nn->nn_timeout, 0); | ||
1582 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); | 1663 | o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); |
1583 | spin_unlock(&nn->nn_lock); | 1664 | spin_unlock(&nn->nn_lock); |
1584 | 1665 | ||
@@ -1610,20 +1691,15 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, | |||
1610 | 1691 | ||
1611 | /* ensure an immediate connect attempt */ | 1692 | /* ensure an immediate connect attempt */ |
1612 | nn->nn_last_connect_attempt = jiffies - | 1693 | nn->nn_last_connect_attempt = jiffies - |
1613 | (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1); | 1694 | (msecs_to_jiffies(o2net_reconnect_delay()) + 1); |
1614 | 1695 | ||
1615 | if (node_num != o2nm_this_node()) { | 1696 | if (node_num != o2nm_this_node()) { |
1616 | /* heartbeat doesn't work unless a local node number is | ||
1617 | * configured and doing so brings up the o2net_wq, so we can | ||
1618 | * use it.. */ | ||
1619 | queue_delayed_work(o2net_wq, &nn->nn_connect_expired, | ||
1620 | msecs_to_jiffies(o2net_idle_timeout(node))); | ||
1621 | |||
1622 | /* believe it or not, accept and node heartbeating testing | 1697 | /* believe it or not, accept and node heartbeating testing |
1623 | * can succeed for this node before we got here.. so | 1698 | * can succeed for this node before we got here.. so |
1624 | * only use set_nn_state to clear the persistent error | 1699 | * only use set_nn_state to clear the persistent error |
1625 | * if that hasn't already happened */ | 1700 | * if that hasn't already happened */ |
1626 | spin_lock(&nn->nn_lock); | 1701 | spin_lock(&nn->nn_lock); |
1702 | atomic_set(&nn->nn_timeout, 0); | ||
1627 | if (nn->nn_persistent_error) | 1703 | if (nn->nn_persistent_error) |
1628 | o2net_set_nn_state(nn, NULL, 0, 0); | 1704 | o2net_set_nn_state(nn, NULL, 0, 0); |
1629 | spin_unlock(&nn->nn_lock); | 1705 | spin_unlock(&nn->nn_lock); |
@@ -1747,6 +1823,7 @@ static int o2net_accept_one(struct socket *sock) | |||
1747 | new_sock = NULL; | 1823 | new_sock = NULL; |
1748 | 1824 | ||
1749 | spin_lock(&nn->nn_lock); | 1825 | spin_lock(&nn->nn_lock); |
1826 | atomic_set(&nn->nn_timeout, 0); | ||
1750 | o2net_set_nn_state(nn, sc, 0, 0); | 1827 | o2net_set_nn_state(nn, sc, 0, 0); |
1751 | spin_unlock(&nn->nn_lock); | 1828 | spin_unlock(&nn->nn_lock); |
1752 | 1829 | ||
@@ -1922,6 +1999,9 @@ int o2net_init(void) | |||
1922 | 1999 | ||
1923 | o2quo_init(); | 2000 | o2quo_init(); |
1924 | 2001 | ||
2002 | if (o2net_debugfs_init()) | ||
2003 | return -ENOMEM; | ||
2004 | |||
1925 | o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); | 2005 | o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); |
1926 | o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); | 2006 | o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); |
1927 | o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); | 2007 | o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); |
@@ -1941,6 +2021,7 @@ int o2net_init(void) | |||
1941 | for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { | 2021 | for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { |
1942 | struct o2net_node *nn = o2net_nn_from_num(i); | 2022 | struct o2net_node *nn = o2net_nn_from_num(i); |
1943 | 2023 | ||
2024 | atomic_set(&nn->nn_timeout, 0); | ||
1944 | spin_lock_init(&nn->nn_lock); | 2025 | spin_lock_init(&nn->nn_lock); |
1945 | INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); | 2026 | INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); |
1946 | INIT_DELAYED_WORK(&nn->nn_connect_expired, | 2027 | INIT_DELAYED_WORK(&nn->nn_connect_expired, |
@@ -1962,4 +2043,5 @@ void o2net_exit(void) | |||
1962 | kfree(o2net_hand); | 2043 | kfree(o2net_hand); |
1963 | kfree(o2net_keep_req); | 2044 | kfree(o2net_keep_req); |
1964 | kfree(o2net_keep_resp); | 2045 | kfree(o2net_keep_resp); |
2046 | o2net_debugfs_exit(); | ||
1965 | } | 2047 | } |
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index f36f66aab3dd..a705d5d19036 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h | |||
@@ -117,4 +117,36 @@ int o2net_num_connected_peers(void); | |||
117 | int o2net_init(void); | 117 | int o2net_init(void); |
118 | void o2net_exit(void); | 118 | void o2net_exit(void); |
119 | 119 | ||
120 | struct o2net_send_tracking; | ||
121 | struct o2net_sock_container; | ||
122 | |||
123 | #ifdef CONFIG_DEBUG_FS | ||
124 | int o2net_debugfs_init(void); | ||
125 | void o2net_debugfs_exit(void); | ||
126 | void o2net_debug_add_nst(struct o2net_send_tracking *nst); | ||
127 | void o2net_debug_del_nst(struct o2net_send_tracking *nst); | ||
128 | void o2net_debug_add_sc(struct o2net_sock_container *sc); | ||
129 | void o2net_debug_del_sc(struct o2net_sock_container *sc); | ||
130 | #else | ||
131 | static int o2net_debugfs_init(void) | ||
132 | { | ||
133 | return 0; | ||
134 | } | ||
135 | static void o2net_debugfs_exit(void) | ||
136 | { | ||
137 | } | ||
138 | static void o2net_debug_add_nst(struct o2net_send_tracking *nst) | ||
139 | { | ||
140 | } | ||
141 | static void o2net_debug_del_nst(struct o2net_send_tracking *nst) | ||
142 | { | ||
143 | } | ||
144 | static void o2net_debug_add_sc(struct o2net_sock_container *sc) | ||
145 | { | ||
146 | } | ||
147 | static void o2net_debug_del_sc(struct o2net_sock_container *sc) | ||
148 | { | ||
149 | } | ||
150 | #endif /* CONFIG_DEBUG_FS */ | ||
151 | |||
120 | #endif /* O2CLUSTER_TCP_H */ | 152 | #endif /* O2CLUSTER_TCP_H */ |
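The #else branch keeps every caller in tcp.c free of CONFIG_DEBUG_FS checks by supplying empty stubs straight from the header. Header-level stubs of this kind are conventionally written as static inline so that files which include the header but never call a given helper do not emit or warn about it; a generic sketch of that conventional form, with illustrative names and under the assumption that inline stubs are acceptable here:

    #ifndef CONFIG_DEBUG_FS
    static inline int example_debugfs_init(void)
    {
            return 0;               /* nothing to set up without debugfs */
    }
    static inline void example_debugfs_exit(void)
    {
    }
    static inline void example_debug_add(void *object)
    {
    }
    static inline void example_debug_del(void *object)
    {
    }
    #endif /* !CONFIG_DEBUG_FS */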
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index d25b9af28500..8d58cfe410b1 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h | |||
@@ -95,6 +95,8 @@ struct o2net_node { | |||
95 | unsigned nn_sc_valid:1; | 95 | unsigned nn_sc_valid:1; |
96 | /* if this is set tx just returns it */ | 96 | /* if this is set tx just returns it */ |
97 | int nn_persistent_error; | 97 | int nn_persistent_error; |
98 | /* It is only set to 1 after the idle timeout. */ | ||
99 | atomic_t nn_timeout; | ||
98 | 100 | ||
99 | /* threads waiting for an sc to arrive wait on the wq for generation | 101 | /* threads waiting for an sc to arrive wait on the wq for generation |
100 | * to increase. it is increased when a connecting socket succeeds | 102 | * to increase. it is increased when a connecting socket succeeds |
@@ -164,7 +166,9 @@ struct o2net_sock_container { | |||
164 | /* original handlers for the sockets */ | 166 | /* original handlers for the sockets */ |
165 | void (*sc_state_change)(struct sock *sk); | 167 | void (*sc_state_change)(struct sock *sk); |
166 | void (*sc_data_ready)(struct sock *sk, int bytes); | 168 | void (*sc_data_ready)(struct sock *sk, int bytes); |
167 | 169 | #ifdef CONFIG_DEBUG_FS | |
170 | struct list_head sc_net_debug_item; | ||
171 | #endif | ||
168 | struct timeval sc_tv_timer; | 172 | struct timeval sc_tv_timer; |
169 | struct timeval sc_tv_data_ready; | 173 | struct timeval sc_tv_data_ready; |
170 | struct timeval sc_tv_advance_start; | 174 | struct timeval sc_tv_advance_start; |
@@ -206,4 +210,24 @@ struct o2net_status_wait { | |||
206 | struct list_head ns_node_item; | 210 | struct list_head ns_node_item; |
207 | }; | 211 | }; |
208 | 212 | ||
213 | #ifdef CONFIG_DEBUG_FS | ||
214 | /* just for state dumps */ | ||
215 | struct o2net_send_tracking { | ||
216 | struct list_head st_net_debug_item; | ||
217 | struct task_struct *st_task; | ||
218 | struct o2net_sock_container *st_sc; | ||
219 | u32 st_id; | ||
220 | u32 st_msg_type; | ||
221 | u32 st_msg_key; | ||
222 | u8 st_node; | ||
223 | struct timeval st_sock_time; | ||
224 | struct timeval st_send_time; | ||
225 | struct timeval st_status_time; | ||
226 | }; | ||
227 | #else | ||
228 | struct o2net_send_tracking { | ||
229 | u32 dummy; | ||
230 | }; | ||
231 | #endif /* CONFIG_DEBUG_FS */ | ||
232 | |||
209 | #endif /* O2CLUSTER_TCP_INTERNAL_H */ | 233 | #endif /* O2CLUSTER_TCP_INTERNAL_H */ |
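Each o2net_send_tracking entry carries three timestamps that bracket a message send: st_sock_time is taken before waiting for a usable socket, st_send_time just before the payload goes out, and st_status_time once the sender starts waiting on the remote handler's status, matching the o2net_set_nst_*_time() calls added to o2net_send_message_vec(). A small sketch of how the deltas could be read back (illustrative helper, not part of the patch; assumes CONFIG_DEBUG_FS so the fields exist):

    /* Illustrative: microseconds spent waiting for a socket vs. on the peer. */
    static void example_report(struct o2net_send_tracking *nst,
                               struct timeval *now)
    {
            long sock_wait_us =
                    (nst->st_send_time.tv_sec - nst->st_sock_time.tv_sec) * 1000000L +
                    (nst->st_send_time.tv_usec - nst->st_sock_time.tv_usec);
            long peer_wait_us =
                    (now->tv_sec - nst->st_status_time.tv_sec) * 1000000L +
                    (now->tv_usec - nst->st_status_time.tv_usec);

            mlog(0, "waited %ld us for a socket, %ld us on the remote handler\n",
                 sock_wait_us, peer_wait_us);
    }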
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile index ce3f7c29d270..190361375700 100644 --- a/fs/ocfs2/dlm/Makefile +++ b/fs/ocfs2/dlm/Makefile | |||
@@ -1,6 +1,6 @@ | |||
1 | EXTRA_CFLAGS += -Ifs/ocfs2 | 1 | EXTRA_CFLAGS += -Ifs/ocfs2 |
2 | 2 | ||
3 | obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o | 3 | obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o |
4 | 4 | ||
5 | ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ | 5 | ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ |
6 | dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o | 6 | dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o |
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index dc8ea666efdb..d5a86fb81a49 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h | |||
@@ -49,6 +49,41 @@ | |||
49 | /* Intended to make it easier for us to switch out hash functions */ | 49 | /* Intended to make it easier for us to switch out hash functions */ |
50 | #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) | 50 | #define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) |
51 | 51 | ||
52 | enum dlm_mle_type { | ||
53 | DLM_MLE_BLOCK, | ||
54 | DLM_MLE_MASTER, | ||
55 | DLM_MLE_MIGRATION | ||
56 | }; | ||
57 | |||
58 | struct dlm_lock_name { | ||
59 | u8 len; | ||
60 | u8 name[DLM_LOCKID_NAME_MAX]; | ||
61 | }; | ||
62 | |||
63 | struct dlm_master_list_entry { | ||
64 | struct list_head list; | ||
65 | struct list_head hb_events; | ||
66 | struct dlm_ctxt *dlm; | ||
67 | spinlock_t spinlock; | ||
68 | wait_queue_head_t wq; | ||
69 | atomic_t woken; | ||
70 | struct kref mle_refs; | ||
71 | int inuse; | ||
72 | unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
73 | unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
74 | unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
75 | unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
76 | u8 master; | ||
77 | u8 new_master; | ||
78 | enum dlm_mle_type type; | ||
79 | struct o2hb_callback_func mle_hb_up; | ||
80 | struct o2hb_callback_func mle_hb_down; | ||
81 | union { | ||
82 | struct dlm_lock_resource *res; | ||
83 | struct dlm_lock_name name; | ||
84 | } u; | ||
85 | }; | ||
86 | |||
52 | enum dlm_ast_type { | 87 | enum dlm_ast_type { |
53 | DLM_AST = 0, | 88 | DLM_AST = 0, |
54 | DLM_BAST, | 89 | DLM_BAST, |
@@ -101,6 +136,7 @@ struct dlm_ctxt | |||
101 | struct list_head purge_list; | 136 | struct list_head purge_list; |
102 | struct list_head pending_asts; | 137 | struct list_head pending_asts; |
103 | struct list_head pending_basts; | 138 | struct list_head pending_basts; |
139 | struct list_head tracking_list; | ||
104 | unsigned int purge_count; | 140 | unsigned int purge_count; |
105 | spinlock_t spinlock; | 141 | spinlock_t spinlock; |
106 | spinlock_t ast_lock; | 142 | spinlock_t ast_lock; |
@@ -122,6 +158,9 @@ struct dlm_ctxt | |||
122 | atomic_t remote_resources; | 158 | atomic_t remote_resources; |
123 | atomic_t unknown_resources; | 159 | atomic_t unknown_resources; |
124 | 160 | ||
161 | struct dlm_debug_ctxt *dlm_debug_ctxt; | ||
162 | struct dentry *dlm_debugfs_subroot; | ||
163 | |||
125 | /* NOTE: Next three are protected by dlm_domain_lock */ | 164 | /* NOTE: Next three are protected by dlm_domain_lock */ |
126 | struct kref dlm_refs; | 165 | struct kref dlm_refs; |
127 | enum dlm_ctxt_state dlm_state; | 166 | enum dlm_ctxt_state dlm_state; |
@@ -270,6 +309,9 @@ struct dlm_lock_resource | |||
270 | struct list_head dirty; | 309 | struct list_head dirty; |
271 | struct list_head recovering; // dlm_recovery_ctxt.resources list | 310 | struct list_head recovering; // dlm_recovery_ctxt.resources list |
272 | 311 | ||
312 | /* Added during init and removed during release */ | ||
313 | struct list_head tracking; /* dlm->tracking_list */ | ||
314 | |||
273 | /* unused lock resources have their last_used stamped and are | 315 | /* unused lock resources have their last_used stamped and are |
274 | * put on a list for the dlm thread to run. */ | 316 | * put on a list for the dlm thread to run. */ |
275 | unsigned long last_used; | 317 | unsigned long last_used; |
@@ -963,9 +1005,16 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) | |||
963 | DLM_LOCK_RES_MIGRATING)); | 1005 | DLM_LOCK_RES_MIGRATING)); |
964 | } | 1006 | } |
965 | 1007 | ||
1008 | /* create/destroy slab caches */ | ||
1009 | int dlm_init_master_caches(void); | ||
1010 | void dlm_destroy_master_caches(void); | ||
1011 | |||
1012 | int dlm_init_lock_cache(void); | ||
1013 | void dlm_destroy_lock_cache(void); | ||
966 | 1014 | ||
967 | int dlm_init_mle_cache(void); | 1015 | int dlm_init_mle_cache(void); |
968 | void dlm_destroy_mle_cache(void); | 1016 | void dlm_destroy_mle_cache(void); |
1017 | |||
969 | void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); | 1018 | void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); |
970 | int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, | 1019 | int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, |
971 | struct dlm_lock_resource *res); | 1020 | struct dlm_lock_resource *res); |
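dlmcommon.h now declares paired create/destroy entry points for the master and lock slab caches alongside the existing mle cache, and every lock resource gains a tracking list hook for the new debugfs dumps. A minimal sketch of what one such cache init/destroy pair typically looks like (the cache name, flags, and error handling here are assumptions, not taken from the patch):

    static struct kmem_cache *example_lock_cache;

    static int example_init_lock_cache(void)
    {
            example_lock_cache = kmem_cache_create("example_dlm_lock",
                                                   sizeof(struct dlm_lock),
                                                   0, SLAB_HWCACHE_ALIGN, NULL);
            return example_lock_cache ? 0 : -ENOMEM;
    }

    static void example_destroy_lock_cache(void)
    {
            if (example_lock_cache)
                    kmem_cache_destroy(example_lock_cache);
    }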
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 64239b37e5d4..5f6d858770a2 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c | |||
@@ -5,7 +5,7 @@ | |||
5 | * | 5 | * |
6 | * debug functionality for the dlm | 6 | * debug functionality for the dlm |
7 | * | 7 | * |
8 | * Copyright (C) 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2004, 2008 Oracle. All rights reserved. |
9 | * | 9 | * |
10 | * This program is free software; you can redistribute it and/or | 10 | * This program is free software; you can redistribute it and/or |
11 | * modify it under the terms of the GNU General Public | 11 | * modify it under the terms of the GNU General Public |
@@ -30,6 +30,7 @@ | |||
30 | #include <linux/utsname.h> | 30 | #include <linux/utsname.h> |
31 | #include <linux/sysctl.h> | 31 | #include <linux/sysctl.h> |
32 | #include <linux/spinlock.h> | 32 | #include <linux/spinlock.h> |
33 | #include <linux/debugfs.h> | ||
33 | 34 | ||
34 | #include "cluster/heartbeat.h" | 35 | #include "cluster/heartbeat.h" |
35 | #include "cluster/nodemanager.h" | 36 | #include "cluster/nodemanager.h" |
@@ -37,17 +38,16 @@ | |||
37 | 38 | ||
38 | #include "dlmapi.h" | 39 | #include "dlmapi.h" |
39 | #include "dlmcommon.h" | 40 | #include "dlmcommon.h" |
40 | |||
41 | #include "dlmdomain.h" | 41 | #include "dlmdomain.h" |
42 | #include "dlmdebug.h" | ||
42 | 43 | ||
43 | #define MLOG_MASK_PREFIX ML_DLM | 44 | #define MLOG_MASK_PREFIX ML_DLM |
44 | #include "cluster/masklog.h" | 45 | #include "cluster/masklog.h" |
45 | 46 | ||
47 | int stringify_lockname(const char *lockname, int locklen, char *buf, int len); | ||
48 | |||
46 | void dlm_print_one_lock_resource(struct dlm_lock_resource *res) | 49 | void dlm_print_one_lock_resource(struct dlm_lock_resource *res) |
47 | { | 50 | { |
48 | mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", | ||
49 | res->lockname.len, res->lockname.name, | ||
50 | res->owner, res->state); | ||
51 | spin_lock(&res->spinlock); | 51 | spin_lock(&res->spinlock); |
52 | __dlm_print_one_lock_resource(res); | 52 | __dlm_print_one_lock_resource(res); |
53 | spin_unlock(&res->spinlock); | 53 | spin_unlock(&res->spinlock); |
@@ -58,7 +58,7 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) | |||
58 | int bit; | 58 | int bit; |
59 | assert_spin_locked(&res->spinlock); | 59 | assert_spin_locked(&res->spinlock); |
60 | 60 | ||
61 | mlog(ML_NOTICE, " refmap nodes: [ "); | 61 | printk(" refmap nodes: [ "); |
62 | bit = 0; | 62 | bit = 0; |
63 | while (1) { | 63 | while (1) { |
64 | bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); | 64 | bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); |
@@ -70,63 +70,66 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res) | |||
70 | printk("], inflight=%u\n", res->inflight_locks); | 70 | printk("], inflight=%u\n", res->inflight_locks); |
71 | } | 71 | } |
72 | 72 | ||
73 | static void __dlm_print_lock(struct dlm_lock *lock) | ||
74 | { | ||
75 | spin_lock(&lock->spinlock); | ||
76 | |||
77 | printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, " | ||
78 | "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), " | ||
79 | "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n", | ||
80 | lock->ml.type, lock->ml.convert_type, lock->ml.node, | ||
81 | dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), | ||
82 | dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), | ||
83 | atomic_read(&lock->lock_refs.refcount), | ||
84 | (list_empty(&lock->ast_list) ? 'y' : 'n'), | ||
85 | (lock->ast_pending ? 'y' : 'n'), | ||
86 | (list_empty(&lock->bast_list) ? 'y' : 'n'), | ||
87 | (lock->bast_pending ? 'y' : 'n'), | ||
88 | (lock->convert_pending ? 'y' : 'n'), | ||
89 | (lock->lock_pending ? 'y' : 'n'), | ||
90 | (lock->cancel_pending ? 'y' : 'n'), | ||
91 | (lock->unlock_pending ? 'y' : 'n')); | ||
92 | |||
93 | spin_unlock(&lock->spinlock); | ||
94 | } | ||
95 | |||
73 | void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) | 96 | void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) |
74 | { | 97 | { |
75 | struct list_head *iter2; | 98 | struct list_head *iter2; |
76 | struct dlm_lock *lock; | 99 | struct dlm_lock *lock; |
100 | char buf[DLM_LOCKID_NAME_MAX]; | ||
77 | 101 | ||
78 | assert_spin_locked(&res->spinlock); | 102 | assert_spin_locked(&res->spinlock); |
79 | 103 | ||
80 | mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", | 104 | stringify_lockname(res->lockname.name, res->lockname.len, |
81 | res->lockname.len, res->lockname.name, | 105 | buf, sizeof(buf) - 1); |
82 | res->owner, res->state); | 106 | printk("lockres: %s, owner=%u, state=%u\n", |
83 | mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", | 107 | buf, res->owner, res->state); |
84 | res->last_used, list_empty(&res->purge) ? "no" : "yes"); | 108 | printk(" last used: %lu, refcnt: %u, on purge list: %s\n", |
109 | res->last_used, atomic_read(&res->refs.refcount), | ||
110 | list_empty(&res->purge) ? "no" : "yes"); | ||
111 | printk(" on dirty list: %s, on reco list: %s, " | ||
112 | "migrating pending: %s\n", | ||
113 | list_empty(&res->dirty) ? "no" : "yes", | ||
114 | list_empty(&res->recovering) ? "no" : "yes", | ||
115 | res->migration_pending ? "yes" : "no"); | ||
116 | printk(" inflight locks: %d, asts reserved: %d\n", | ||
117 | res->inflight_locks, atomic_read(&res->asts_reserved)); | ||
85 | dlm_print_lockres_refmap(res); | 118 | dlm_print_lockres_refmap(res); |
86 | mlog(ML_NOTICE, " granted queue: \n"); | 119 | printk(" granted queue:\n"); |
87 | list_for_each(iter2, &res->granted) { | 120 | list_for_each(iter2, &res->granted) { |
88 | lock = list_entry(iter2, struct dlm_lock, list); | 121 | lock = list_entry(iter2, struct dlm_lock, list); |
89 | spin_lock(&lock->spinlock); | 122 | __dlm_print_lock(lock); |
90 | mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " | ||
91 | "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", | ||
92 | lock->ml.type, lock->ml.convert_type, lock->ml.node, | ||
93 | dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), | ||
94 | dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), | ||
95 | list_empty(&lock->ast_list) ? 'y' : 'n', | ||
96 | lock->ast_pending ? 'y' : 'n', | ||
97 | list_empty(&lock->bast_list) ? 'y' : 'n', | ||
98 | lock->bast_pending ? 'y' : 'n'); | ||
99 | spin_unlock(&lock->spinlock); | ||
100 | } | 123 | } |
101 | mlog(ML_NOTICE, " converting queue: \n"); | 124 | printk(" converting queue:\n"); |
102 | list_for_each(iter2, &res->converting) { | 125 | list_for_each(iter2, &res->converting) { |
103 | lock = list_entry(iter2, struct dlm_lock, list); | 126 | lock = list_entry(iter2, struct dlm_lock, list); |
104 | spin_lock(&lock->spinlock); | 127 | __dlm_print_lock(lock); |
105 | mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " | ||
106 | "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", | ||
107 | lock->ml.type, lock->ml.convert_type, lock->ml.node, | ||
108 | dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), | ||
109 | dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), | ||
110 | list_empty(&lock->ast_list) ? 'y' : 'n', | ||
111 | lock->ast_pending ? 'y' : 'n', | ||
112 | list_empty(&lock->bast_list) ? 'y' : 'n', | ||
113 | lock->bast_pending ? 'y' : 'n'); | ||
114 | spin_unlock(&lock->spinlock); | ||
115 | } | 128 | } |
116 | mlog(ML_NOTICE, " blocked queue: \n"); | 129 | printk(" blocked queue:\n"); |
117 | list_for_each(iter2, &res->blocked) { | 130 | list_for_each(iter2, &res->blocked) { |
118 | lock = list_entry(iter2, struct dlm_lock, list); | 131 | lock = list_entry(iter2, struct dlm_lock, list); |
119 | spin_lock(&lock->spinlock); | 132 | __dlm_print_lock(lock); |
120 | mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, " | ||
121 | "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n", | ||
122 | lock->ml.type, lock->ml.convert_type, lock->ml.node, | ||
123 | dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), | ||
124 | dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), | ||
125 | list_empty(&lock->ast_list) ? 'y' : 'n', | ||
126 | lock->ast_pending ? 'y' : 'n', | ||
127 | list_empty(&lock->bast_list) ? 'y' : 'n', | ||
128 | lock->bast_pending ? 'y' : 'n'); | ||
129 | spin_unlock(&lock->spinlock); | ||
130 | } | 133 | } |
131 | } | 134 | } |
132 | 135 | ||
@@ -136,31 +139,6 @@ void dlm_print_one_lock(struct dlm_lock *lockid) | |||
136 | } | 139 | } |
137 | EXPORT_SYMBOL_GPL(dlm_print_one_lock); | 140 | EXPORT_SYMBOL_GPL(dlm_print_one_lock); |
138 | 141 | ||
139 | #if 0 | ||
140 | void dlm_dump_lock_resources(struct dlm_ctxt *dlm) | ||
141 | { | ||
142 | struct dlm_lock_resource *res; | ||
143 | struct hlist_node *iter; | ||
144 | struct hlist_head *bucket; | ||
145 | int i; | ||
146 | |||
147 | mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n", | ||
148 | dlm->name, dlm->node_num, dlm->key); | ||
149 | if (!dlm || !dlm->name) { | ||
150 | mlog(ML_ERROR, "dlm=%p\n", dlm); | ||
151 | return; | ||
152 | } | ||
153 | |||
154 | spin_lock(&dlm->spinlock); | ||
155 | for (i=0; i<DLM_HASH_BUCKETS; i++) { | ||
156 | bucket = dlm_lockres_hash(dlm, i); | ||
157 | hlist_for_each_entry(res, iter, bucket, hash_node) | ||
158 | dlm_print_one_lock_resource(res); | ||
159 | } | ||
160 | spin_unlock(&dlm->spinlock); | ||
161 | } | ||
162 | #endif /* 0 */ | ||
163 | |||
164 | static const char *dlm_errnames[] = { | 142 | static const char *dlm_errnames[] = { |
165 | [DLM_NORMAL] = "DLM_NORMAL", | 143 | [DLM_NORMAL] = "DLM_NORMAL", |
166 | [DLM_GRANTED] = "DLM_GRANTED", | 144 | [DLM_GRANTED] = "DLM_GRANTED", |
@@ -266,3 +244,792 @@ const char *dlm_errname(enum dlm_status err) | |||
266 | return dlm_errnames[err]; | 244 | return dlm_errnames[err]; |
267 | } | 245 | } |
268 | EXPORT_SYMBOL_GPL(dlm_errname); | 246 | EXPORT_SYMBOL_GPL(dlm_errname); |
247 | |||
248 | /* NOTE: This function converts a lockname into a string. It uses knowledge | ||
249 | * of the format of the lockname that should be outside the purview of the dlm. | ||
251 | * We are adding this only to make dlm debugging slightly easier. | ||
251 | * | ||
252 | * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h. | ||
253 | */ | ||
254 | int stringify_lockname(const char *lockname, int locklen, char *buf, int len) | ||
255 | { | ||
256 | int out = 0; | ||
257 | __be64 inode_blkno_be; | ||
258 | |||
259 | #define OCFS2_DENTRY_LOCK_INO_START 18 | ||
260 | if (*lockname == 'N') { | ||
261 | memcpy((__be64 *)&inode_blkno_be, | ||
262 | (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START], | ||
263 | sizeof(__be64)); | ||
264 | out += snprintf(buf + out, len - out, "%.*s%08x", | ||
265 | OCFS2_DENTRY_LOCK_INO_START - 1, lockname, | ||
266 | (unsigned int)be64_to_cpu(inode_blkno_be)); | ||
267 | } else | ||
268 | out += snprintf(buf + out, len - out, "%.*s", | ||
269 | locklen, lockname); | ||
270 | return out; | ||
271 | } | ||
272 | |||
273 | static int stringify_nodemap(unsigned long *nodemap, int maxnodes, | ||
274 | char *buf, int len) | ||
275 | { | ||
276 | int out = 0; | ||
277 | int i = -1; | ||
278 | |||
279 | while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes) | ||
280 | out += snprintf(buf + out, len - out, "%d ", i); | ||
281 | |||
282 | return out; | ||
283 | } | ||
284 | |||
285 | static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len) | ||
286 | { | ||
287 | int out = 0; | ||
288 | unsigned int namelen; | ||
289 | const char *name; | ||
290 | char *mle_type; | ||
291 | |||
292 | if (mle->type != DLM_MLE_MASTER) { | ||
293 | namelen = mle->u.name.len; | ||
294 | name = mle->u.name.name; | ||
295 | } else { | ||
296 | namelen = mle->u.res->lockname.len; | ||
297 | name = mle->u.res->lockname.name; | ||
298 | } | ||
299 | |||
300 | if (mle->type == DLM_MLE_BLOCK) | ||
301 | mle_type = "BLK"; | ||
302 | else if (mle->type == DLM_MLE_MASTER) | ||
303 | mle_type = "MAS"; | ||
304 | else | ||
305 | mle_type = "MIG"; | ||
306 | |||
307 | out += stringify_lockname(name, namelen, buf + out, len - out); | ||
308 | out += snprintf(buf + out, len - out, | ||
309 | "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n", | ||
310 | mle_type, mle->master, mle->new_master, | ||
311 | !list_empty(&mle->hb_events), | ||
312 | !!mle->inuse, | ||
313 | atomic_read(&mle->mle_refs.refcount)); | ||
314 | |||
315 | out += snprintf(buf + out, len - out, "Maybe="); | ||
316 | out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES, | ||
317 | buf + out, len - out); | ||
318 | out += snprintf(buf + out, len - out, "\n"); | ||
319 | |||
320 | out += snprintf(buf + out, len - out, "Vote="); | ||
321 | out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES, | ||
322 | buf + out, len - out); | ||
323 | out += snprintf(buf + out, len - out, "\n"); | ||
324 | |||
325 | out += snprintf(buf + out, len - out, "Response="); | ||
326 | out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES, | ||
327 | buf + out, len - out); | ||
328 | out += snprintf(buf + out, len - out, "\n"); | ||
329 | |||
330 | out += snprintf(buf + out, len - out, "Node="); | ||
331 | out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES, | ||
332 | buf + out, len - out); | ||
333 | out += snprintf(buf + out, len - out, "\n"); | ||
334 | |||
335 | out += snprintf(buf + out, len - out, "\n"); | ||
336 | |||
337 | return out; | ||
338 | } | ||
339 | |||
340 | void dlm_print_one_mle(struct dlm_master_list_entry *mle) | ||
341 | { | ||
342 | char *buf; | ||
343 | |||
344 | buf = (char *) get_zeroed_page(GFP_NOFS); | ||
345 | if (buf) { | ||
346 | dump_mle(mle, buf, PAGE_SIZE - 1); | ||
347 | free_page((unsigned long)buf); | ||
348 | } | ||
349 | } | ||
350 | |||
351 | #ifdef CONFIG_DEBUG_FS | ||
352 | |||
353 | static struct dentry *dlm_debugfs_root = NULL; | ||
354 | |||
355 | #define DLM_DEBUGFS_DIR "o2dlm" | ||
356 | #define DLM_DEBUGFS_DLM_STATE "dlm_state" | ||
357 | #define DLM_DEBUGFS_LOCKING_STATE "locking_state" | ||
358 | #define DLM_DEBUGFS_MLE_STATE "mle_state" | ||
359 | #define DLM_DEBUGFS_PURGE_LIST "purge_list" | ||
360 | |||
361 | /* begin - utils funcs */ | ||
362 | static void dlm_debug_free(struct kref *kref) | ||
363 | { | ||
364 | struct dlm_debug_ctxt *dc; | ||
365 | |||
366 | dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt); | ||
367 | |||
368 | kfree(dc); | ||
369 | } | ||
370 | |||
371 | void dlm_debug_put(struct dlm_debug_ctxt *dc) | ||
372 | { | ||
373 | if (dc) | ||
374 | kref_put(&dc->debug_refcnt, dlm_debug_free); | ||
375 | } | ||
376 | |||
377 | static void dlm_debug_get(struct dlm_debug_ctxt *dc) | ||
378 | { | ||
379 | kref_get(&dc->debug_refcnt); | ||
380 | } | ||
381 | |||
382 | static struct debug_buffer *debug_buffer_allocate(void) | ||
383 | { | ||
384 | struct debug_buffer *db = NULL; | ||
385 | |||
386 | db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL); | ||
387 | if (!db) | ||
388 | goto bail; | ||
389 | |||
390 | db->len = PAGE_SIZE; | ||
391 | db->buf = kmalloc(db->len, GFP_KERNEL); | ||
392 | if (!db->buf) | ||
393 | goto bail; | ||
394 | |||
395 | return db; | ||
396 | bail: | ||
397 | kfree(db); | ||
398 | return NULL; | ||
399 | } | ||
400 | |||
401 | static ssize_t debug_buffer_read(struct file *file, char __user *buf, | ||
402 | size_t nbytes, loff_t *ppos) | ||
403 | { | ||
404 | struct debug_buffer *db = file->private_data; | ||
405 | |||
406 | return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len); | ||
407 | } | ||
408 | |||
409 | static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence) | ||
410 | { | ||
411 | struct debug_buffer *db = file->private_data; | ||
412 | loff_t new = -1; | ||
413 | |||
414 | switch (whence) { | ||
415 | case 0: | ||
416 | new = off; | ||
417 | break; | ||
418 | case 1: | ||
419 | new = file->f_pos + off; | ||
420 | break; | ||
421 | } | ||
422 | |||
423 | if (new < 0 || new > db->len) | ||
424 | return -EINVAL; | ||
425 | |||
426 | return (file->f_pos = new); | ||
427 | } | ||
428 | |||
429 | static int debug_buffer_release(struct inode *inode, struct file *file) | ||
430 | { | ||
431 | struct debug_buffer *db = (struct debug_buffer *)file->private_data; | ||
432 | |||
433 | if (db) | ||
434 | kfree(db->buf); | ||
435 | kfree(db); | ||
436 | |||
437 | return 0; | ||
438 | } | ||
439 | /* end - util funcs */ | ||
440 | |||
441 | /* begin - purge list funcs */ | ||
442 | static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db) | ||
443 | { | ||
444 | struct dlm_lock_resource *res; | ||
445 | int out = 0; | ||
446 | unsigned long total = 0; | ||
447 | |||
448 | out += snprintf(db->buf + out, db->len - out, | ||
449 | "Dumping Purgelist for Domain: %s\n", dlm->name); | ||
450 | |||
451 | spin_lock(&dlm->spinlock); | ||
452 | list_for_each_entry(res, &dlm->purge_list, purge) { | ||
453 | ++total; | ||
454 | if (db->len - out < 100) | ||
455 | continue; | ||
456 | spin_lock(&res->spinlock); | ||
457 | out += stringify_lockname(res->lockname.name, | ||
458 | res->lockname.len, | ||
459 | db->buf + out, db->len - out); | ||
460 | out += snprintf(db->buf + out, db->len - out, "\t%ld\n", | ||
461 | (jiffies - res->last_used)/HZ); | ||
462 | spin_unlock(&res->spinlock); | ||
463 | } | ||
464 | spin_unlock(&dlm->spinlock); | ||
465 | |||
466 | out += snprintf(db->buf + out, db->len - out, | ||
467 | "Total on list: %ld\n", total); | ||
468 | |||
469 | return out; | ||
470 | } | ||
471 | |||
472 | static int debug_purgelist_open(struct inode *inode, struct file *file) | ||
473 | { | ||
474 | struct dlm_ctxt *dlm = inode->i_private; | ||
475 | struct debug_buffer *db; | ||
476 | |||
477 | db = debug_buffer_allocate(); | ||
478 | if (!db) | ||
479 | goto bail; | ||
480 | |||
481 | db->len = debug_purgelist_print(dlm, db); | ||
482 | |||
483 | file->private_data = db; | ||
484 | |||
485 | return 0; | ||
486 | bail: | ||
487 | return -ENOMEM; | ||
488 | } | ||
489 | |||
490 | static struct file_operations debug_purgelist_fops = { | ||
491 | .open = debug_purgelist_open, | ||
492 | .release = debug_buffer_release, | ||
493 | .read = debug_buffer_read, | ||
494 | .llseek = debug_buffer_llseek, | ||
495 | }; | ||
496 | /* end - purge list funcs */ | ||
497 | |||
498 | /* begin - debug mle funcs */ | ||
499 | static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db) | ||
500 | { | ||
501 | struct dlm_master_list_entry *mle; | ||
502 | int out = 0; | ||
503 | unsigned long total = 0; | ||
504 | |||
505 | out += snprintf(db->buf + out, db->len - out, | ||
506 | "Dumping MLEs for Domain: %s\n", dlm->name); | ||
507 | |||
508 | spin_lock(&dlm->master_lock); | ||
509 | list_for_each_entry(mle, &dlm->master_list, list) { | ||
510 | ++total; | ||
511 | if (db->len - out < 200) | ||
512 | continue; | ||
513 | out += dump_mle(mle, db->buf + out, db->len - out); | ||
514 | } | ||
515 | spin_unlock(&dlm->master_lock); | ||
516 | |||
517 | out += snprintf(db->buf + out, db->len - out, | ||
518 | "Total on list: %ld\n", total); | ||
519 | return out; | ||
520 | } | ||
521 | |||
522 | static int debug_mle_open(struct inode *inode, struct file *file) | ||
523 | { | ||
524 | struct dlm_ctxt *dlm = inode->i_private; | ||
525 | struct debug_buffer *db; | ||
526 | |||
527 | db = debug_buffer_allocate(); | ||
528 | if (!db) | ||
529 | goto bail; | ||
530 | |||
531 | db->len = debug_mle_print(dlm, db); | ||
532 | |||
533 | file->private_data = db; | ||
534 | |||
535 | return 0; | ||
536 | bail: | ||
537 | return -ENOMEM; | ||
538 | } | ||
539 | |||
540 | static struct file_operations debug_mle_fops = { | ||
541 | .open = debug_mle_open, | ||
542 | .release = debug_buffer_release, | ||
543 | .read = debug_buffer_read, | ||
544 | .llseek = debug_buffer_llseek, | ||
545 | }; | ||
546 | |||
547 | /* end - debug mle funcs */ | ||
548 | |||
549 | /* begin - debug lockres funcs */ | ||
550 | static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len) | ||
551 | { | ||
552 | int out; | ||
553 | |||
554 | #define DEBUG_LOCK_VERSION 1 | ||
555 | spin_lock(&lock->spinlock); | ||
556 | out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d," | ||
557 | "%d,%d,%d,%d\n", | ||
558 | DEBUG_LOCK_VERSION, | ||
559 | list_type, lock->ml.type, lock->ml.convert_type, | ||
560 | lock->ml.node, | ||
561 | dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)), | ||
562 | dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)), | ||
563 | !list_empty(&lock->ast_list), | ||
564 | !list_empty(&lock->bast_list), | ||
565 | lock->ast_pending, lock->bast_pending, | ||
566 | lock->convert_pending, lock->lock_pending, | ||
567 | lock->cancel_pending, lock->unlock_pending, | ||
568 | atomic_read(&lock->lock_refs.refcount)); | ||
569 | spin_unlock(&lock->spinlock); | ||
570 | |||
571 | return out; | ||
572 | } | ||
573 | |||
574 | static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len) | ||
575 | { | ||
576 | struct dlm_lock *lock; | ||
577 | int i; | ||
578 | int out = 0; | ||
579 | |||
580 | out += snprintf(buf + out, len - out, "NAME:"); | ||
581 | out += stringify_lockname(res->lockname.name, res->lockname.len, | ||
582 | buf + out, len - out); | ||
583 | out += snprintf(buf + out, len - out, "\n"); | ||
584 | |||
585 | #define DEBUG_LRES_VERSION 1 | ||
586 | out += snprintf(buf + out, len - out, | ||
587 | "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n", | ||
588 | DEBUG_LRES_VERSION, | ||
589 | res->owner, res->state, res->last_used, | ||
590 | !list_empty(&res->purge), | ||
591 | !list_empty(&res->dirty), | ||
592 | !list_empty(&res->recovering), | ||
593 | res->inflight_locks, res->migration_pending, | ||
594 | atomic_read(&res->asts_reserved), | ||
595 | atomic_read(&res->refs.refcount)); | ||
596 | |||
597 | /* refmap */ | ||
598 | out += snprintf(buf + out, len - out, "RMAP:"); | ||
599 | out += stringify_nodemap(res->refmap, O2NM_MAX_NODES, | ||
600 | buf + out, len - out); | ||
601 | out += snprintf(buf + out, len - out, "\n"); | ||
602 | |||
603 | /* lvb */ | ||
604 | out += snprintf(buf + out, len - out, "LVBX:"); | ||
605 | for (i = 0; i < DLM_LVB_LEN; i++) | ||
606 | out += snprintf(buf + out, len - out, | ||
607 | "%02x", (unsigned char)res->lvb[i]); | ||
608 | out += snprintf(buf + out, len - out, "\n"); | ||
609 | |||
610 | /* granted */ | ||
611 | list_for_each_entry(lock, &res->granted, list) | ||
612 | out += dump_lock(lock, 0, buf + out, len - out); | ||
613 | |||
614 | /* converting */ | ||
615 | list_for_each_entry(lock, &res->converting, list) | ||
616 | out += dump_lock(lock, 1, buf + out, len - out); | ||
617 | |||
618 | /* blocked */ | ||
619 | list_for_each_entry(lock, &res->blocked, list) | ||
620 | out += dump_lock(lock, 2, buf + out, len - out); | ||
621 | |||
622 | out += snprintf(buf + out, len - out, "\n"); | ||
623 | |||
624 | return out; | ||
625 | } | ||
626 | |||
627 | static void *lockres_seq_start(struct seq_file *m, loff_t *pos) | ||
628 | { | ||
629 | struct debug_lockres *dl = m->private; | ||
630 | struct dlm_ctxt *dlm = dl->dl_ctxt; | ||
631 | struct dlm_lock_resource *res = NULL; | ||
632 | |||
633 | spin_lock(&dlm->spinlock); | ||
634 | |||
635 | if (dl->dl_res) { | ||
636 | list_for_each_entry(res, &dl->dl_res->tracking, tracking) { | ||
637 | if (dl->dl_res) { | ||
638 | dlm_lockres_put(dl->dl_res); | ||
639 | dl->dl_res = NULL; | ||
640 | } | ||
641 | if (&res->tracking == &dlm->tracking_list) { | ||
642 | mlog(0, "End of list found, %p\n", res); | ||
643 | dl = NULL; | ||
644 | break; | ||
645 | } | ||
646 | dlm_lockres_get(res); | ||
647 | dl->dl_res = res; | ||
648 | break; | ||
649 | } | ||
650 | } else { | ||
651 | if (!list_empty(&dlm->tracking_list)) { | ||
652 | list_for_each_entry(res, &dlm->tracking_list, tracking) | ||
653 | break; | ||
654 | dlm_lockres_get(res); | ||
655 | dl->dl_res = res; | ||
656 | } else | ||
657 | dl = NULL; | ||
658 | } | ||
659 | |||
660 | if (dl) { | ||
661 | spin_lock(&dl->dl_res->spinlock); | ||
662 | dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1); | ||
663 | spin_unlock(&dl->dl_res->spinlock); | ||
664 | } | ||
665 | |||
666 | spin_unlock(&dlm->spinlock); | ||
667 | |||
668 | return dl; | ||
669 | } | ||
670 | |||
671 | static void lockres_seq_stop(struct seq_file *m, void *v) | ||
672 | { | ||
673 | } | ||
674 | |||
675 | static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos) | ||
676 | { | ||
677 | return NULL; | ||
678 | } | ||
679 | |||
680 | static int lockres_seq_show(struct seq_file *s, void *v) | ||
681 | { | ||
682 | struct debug_lockres *dl = (struct debug_lockres *)v; | ||
683 | |||
684 | seq_printf(s, "%s", dl->dl_buf); | ||
685 | |||
686 | return 0; | ||
687 | } | ||
688 | |||
689 | static struct seq_operations debug_lockres_ops = { | ||
690 | .start = lockres_seq_start, | ||
691 | .stop = lockres_seq_stop, | ||
692 | .next = lockres_seq_next, | ||
693 | .show = lockres_seq_show, | ||
694 | }; | ||
695 | |||
696 | static int debug_lockres_open(struct inode *inode, struct file *file) | ||
697 | { | ||
698 | struct dlm_ctxt *dlm = inode->i_private; | ||
699 | int ret = -ENOMEM; | ||
700 | struct seq_file *seq; | ||
701 | struct debug_lockres *dl = NULL; | ||
702 | |||
703 | dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL); | ||
704 | if (!dl) { | ||
705 | mlog_errno(ret); | ||
706 | goto bail; | ||
707 | } | ||
708 | |||
709 | dl->dl_len = PAGE_SIZE; | ||
710 | dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL); | ||
711 | if (!dl->dl_buf) { | ||
712 | mlog_errno(ret); | ||
713 | goto bail; | ||
714 | } | ||
715 | |||
716 | ret = seq_open(file, &debug_lockres_ops); | ||
717 | if (ret) { | ||
718 | mlog_errno(ret); | ||
719 | goto bail; | ||
720 | } | ||
721 | |||
722 | seq = (struct seq_file *) file->private_data; | ||
723 | seq->private = dl; | ||
724 | |||
725 | dlm_grab(dlm); | ||
726 | dl->dl_ctxt = dlm; | ||
727 | |||
728 | return 0; | ||
729 | bail: | ||
730 | if (dl) | ||
731 | kfree(dl->dl_buf); | ||
732 | kfree(dl); | ||
733 | return ret; | ||
734 | } | ||
735 | |||
736 | static int debug_lockres_release(struct inode *inode, struct file *file) | ||
737 | { | ||
738 | struct seq_file *seq = (struct seq_file *)file->private_data; | ||
739 | struct debug_lockres *dl = (struct debug_lockres *)seq->private; | ||
740 | |||
741 | if (dl->dl_res) | ||
742 | dlm_lockres_put(dl->dl_res); | ||
743 | dlm_put(dl->dl_ctxt); | ||
744 | kfree(dl->dl_buf); | ||
745 | return seq_release_private(inode, file); | ||
746 | } | ||
747 | |||
748 | static struct file_operations debug_lockres_fops = { | ||
749 | .open = debug_lockres_open, | ||
750 | .release = debug_lockres_release, | ||
751 | .read = seq_read, | ||
752 | .llseek = seq_lseek, | ||
753 | }; | ||
754 | /* end - debug lockres funcs */ | ||
755 | |||
756 | /* begin - debug state funcs */ | ||
757 | static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db) | ||
758 | { | ||
759 | int out = 0; | ||
760 | struct dlm_reco_node_data *node; | ||
761 | char *state; | ||
762 | int lres, rres, ures, tres; | ||
763 | |||
764 | lres = atomic_read(&dlm->local_resources); | ||
765 | rres = atomic_read(&dlm->remote_resources); | ||
766 | ures = atomic_read(&dlm->unknown_resources); | ||
767 | tres = lres + rres + ures; | ||
768 | |||
769 | spin_lock(&dlm->spinlock); | ||
770 | |||
771 | switch (dlm->dlm_state) { | ||
772 | case DLM_CTXT_NEW: | ||
773 | state = "NEW"; break; | ||
774 | case DLM_CTXT_JOINED: | ||
775 | state = "JOINED"; break; | ||
776 | case DLM_CTXT_IN_SHUTDOWN: | ||
777 | state = "SHUTDOWN"; break; | ||
778 | case DLM_CTXT_LEAVING: | ||
779 | state = "LEAVING"; break; | ||
780 | default: | ||
781 | state = "UNKNOWN"; break; | ||
782 | } | ||
783 | |||
784 | /* Domain: xxxxxxxxxx Key: 0xdfbac769 */ | ||
785 | out += snprintf(db->buf + out, db->len - out, | ||
786 | "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key); | ||
787 | |||
788 | /* Thread Pid: xxx Node: xxx State: xxxxx */ | ||
789 | out += snprintf(db->buf + out, db->len - out, | ||
790 | "Thread Pid: %d Node: %d State: %s\n", | ||
791 | dlm->dlm_thread_task->pid, dlm->node_num, state); | ||
792 | |||
793 | /* Number of Joins: xxx Joining Node: xxx */ | ||
794 | out += snprintf(db->buf + out, db->len - out, | ||
795 | "Number of Joins: %d Joining Node: %d\n", | ||
796 | dlm->num_joins, dlm->joining_node); | ||
797 | |||
798 | /* Domain Map: xx xx xx */ | ||
799 | out += snprintf(db->buf + out, db->len - out, "Domain Map: "); | ||
800 | out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES, | ||
801 | db->buf + out, db->len - out); | ||
802 | out += snprintf(db->buf + out, db->len - out, "\n"); | ||
803 | |||
804 | /* Live Map: xx xx xx */ | ||
805 | out += snprintf(db->buf + out, db->len - out, "Live Map: "); | ||
806 | out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES, | ||
807 | db->buf + out, db->len - out); | ||
808 | out += snprintf(db->buf + out, db->len - out, "\n"); | ||
809 | |||
810 | /* Mastered Resources Total: xxx Locally: xxx Remotely: ... */ | ||
811 | out += snprintf(db->buf + out, db->len - out, | ||
812 | "Mastered Resources Total: %d Locally: %d " | ||
813 | "Remotely: %d Unknown: %d\n", | ||
814 | tres, lres, rres, ures); | ||
815 | |||
816 | /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */ | ||
817 | out += snprintf(db->buf + out, db->len - out, | ||
818 | "Lists: Dirty=%s Purge=%s PendingASTs=%s " | ||
819 | "PendingBASTs=%s Master=%s\n", | ||
820 | (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"), | ||
821 | (list_empty(&dlm->purge_list) ? "Empty" : "InUse"), | ||
822 | (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"), | ||
823 | (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"), | ||
824 | (list_empty(&dlm->master_list) ? "Empty" : "InUse")); | ||
825 | |||
826 | /* Purge Count: xxx Refs: xxx */ | ||
827 | out += snprintf(db->buf + out, db->len - out, | ||
828 | "Purge Count: %d Refs: %d\n", dlm->purge_count, | ||
829 | atomic_read(&dlm->dlm_refs.refcount)); | ||
830 | |||
831 | /* Dead Node: xxx */ | ||
832 | out += snprintf(db->buf + out, db->len - out, | ||
833 | "Dead Node: %d\n", dlm->reco.dead_node); | ||
834 | |||
835 | /* What about DLM_RECO_STATE_FINALIZE? */ | ||
836 | if (dlm->reco.state == DLM_RECO_STATE_ACTIVE) | ||
837 | state = "ACTIVE"; | ||
838 | else | ||
839 | state = "INACTIVE"; | ||
840 | |||
841 | /* Recovery Pid: xxxx Master: xxx State: xxxx */ | ||
842 | out += snprintf(db->buf + out, db->len - out, | ||
843 | "Recovery Pid: %d Master: %d State: %s\n", | ||
844 | dlm->dlm_reco_thread_task->pid, | ||
845 | dlm->reco.new_master, state); | ||
846 | |||
847 | /* Recovery Map: xx xx */ | ||
848 | out += snprintf(db->buf + out, db->len - out, "Recovery Map: "); | ||
849 | out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES, | ||
850 | db->buf + out, db->len - out); | ||
851 | out += snprintf(db->buf + out, db->len - out, "\n"); | ||
852 | |||
853 | /* Recovery Node State: */ | ||
854 | out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n"); | ||
855 | list_for_each_entry(node, &dlm->reco.node_data, list) { | ||
856 | switch (node->state) { | ||
857 | case DLM_RECO_NODE_DATA_INIT: | ||
858 | state = "INIT"; | ||
859 | break; | ||
860 | case DLM_RECO_NODE_DATA_REQUESTING: | ||
861 | state = "REQUESTING"; | ||
862 | break; | ||
863 | case DLM_RECO_NODE_DATA_DEAD: | ||
864 | state = "DEAD"; | ||
865 | break; | ||
866 | case DLM_RECO_NODE_DATA_RECEIVING: | ||
867 | state = "RECEIVING"; | ||
868 | break; | ||
869 | case DLM_RECO_NODE_DATA_REQUESTED: | ||
870 | state = "REQUESTED"; | ||
871 | break; | ||
872 | case DLM_RECO_NODE_DATA_DONE: | ||
873 | state = "DONE"; | ||
874 | break; | ||
875 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | ||
876 | state = "FINALIZE-SENT"; | ||
877 | break; | ||
878 | default: | ||
879 | state = "BAD"; | ||
880 | break; | ||
881 | } | ||
882 | out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n", | ||
883 | node->node_num, state); | ||
884 | } | ||
885 | |||
886 | spin_unlock(&dlm->spinlock); | ||
887 | |||
888 | return out; | ||
889 | } | ||
890 | |||
891 | static int debug_state_open(struct inode *inode, struct file *file) | ||
892 | { | ||
893 | struct dlm_ctxt *dlm = inode->i_private; | ||
894 | struct debug_buffer *db = NULL; | ||
895 | |||
896 | db = debug_buffer_allocate(); | ||
897 | if (!db) | ||
898 | goto bail; | ||
899 | |||
900 | db->len = debug_state_print(dlm, db); | ||
901 | |||
902 | file->private_data = db; | ||
903 | |||
904 | return 0; | ||
905 | bail: | ||
906 | return -ENOMEM; | ||
907 | } | ||
908 | |||
909 | static struct file_operations debug_state_fops = { | ||
910 | .open = debug_state_open, | ||
911 | .release = debug_buffer_release, | ||
912 | .read = debug_buffer_read, | ||
913 | .llseek = debug_buffer_llseek, | ||
914 | }; | ||
915 | /* end - debug state funcs */ | ||
916 | |||
917 | /* files in subroot */ | ||
918 | int dlm_debug_init(struct dlm_ctxt *dlm) | ||
919 | { | ||
920 | struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; | ||
921 | |||
922 | /* for dumping dlm_ctxt */ | ||
923 | dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, | ||
924 | S_IFREG|S_IRUSR, | ||
925 | dlm->dlm_debugfs_subroot, | ||
926 | dlm, &debug_state_fops); | ||
927 | if (!dc->debug_state_dentry) { | ||
928 | mlog_errno(-ENOMEM); | ||
929 | goto bail; | ||
930 | } | ||
931 | |||
932 | /* for dumping lockres */ | ||
933 | dc->debug_lockres_dentry = | ||
934 | debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, | ||
935 | S_IFREG|S_IRUSR, | ||
936 | dlm->dlm_debugfs_subroot, | ||
937 | dlm, &debug_lockres_fops); | ||
938 | if (!dc->debug_lockres_dentry) { | ||
939 | mlog_errno(-ENOMEM); | ||
940 | goto bail; | ||
941 | } | ||
942 | |||
943 | /* for dumping mles */ | ||
944 | dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, | ||
945 | S_IFREG|S_IRUSR, | ||
946 | dlm->dlm_debugfs_subroot, | ||
947 | dlm, &debug_mle_fops); | ||
948 | if (!dc->debug_mle_dentry) { | ||
949 | mlog_errno(-ENOMEM); | ||
950 | goto bail; | ||
951 | } | ||
952 | |||
953 | /* for dumping lockres on the purge list */ | ||
954 | dc->debug_purgelist_dentry = | ||
955 | debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, | ||
956 | S_IFREG|S_IRUSR, | ||
957 | dlm->dlm_debugfs_subroot, | ||
958 | dlm, &debug_purgelist_fops); | ||
959 | if (!dc->debug_purgelist_dentry) { | ||
960 | mlog_errno(-ENOMEM); | ||
961 | goto bail; | ||
962 | } | ||
963 | |||
964 | dlm_debug_get(dc); | ||
965 | return 0; | ||
966 | |||
967 | bail: | ||
968 | dlm_debug_shutdown(dlm); | ||
969 | return -ENOMEM; | ||
970 | } | ||
971 | |||
972 | void dlm_debug_shutdown(struct dlm_ctxt *dlm) | ||
973 | { | ||
974 | struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; | ||
975 | |||
976 | if (dc) { | ||
977 | if (dc->debug_purgelist_dentry) | ||
978 | debugfs_remove(dc->debug_purgelist_dentry); | ||
979 | if (dc->debug_mle_dentry) | ||
980 | debugfs_remove(dc->debug_mle_dentry); | ||
981 | if (dc->debug_lockres_dentry) | ||
982 | debugfs_remove(dc->debug_lockres_dentry); | ||
983 | if (dc->debug_state_dentry) | ||
984 | debugfs_remove(dc->debug_state_dentry); | ||
985 | dlm_debug_put(dc); | ||
986 | } | ||
987 | } | ||
988 | |||
989 | /* subroot - domain dir */ | ||
990 | int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) | ||
991 | { | ||
992 | dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, | ||
993 | dlm_debugfs_root); | ||
994 | if (!dlm->dlm_debugfs_subroot) { | ||
995 | mlog_errno(-ENOMEM); | ||
996 | goto bail; | ||
997 | } | ||
998 | |||
999 | dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), | ||
1000 | GFP_KERNEL); | ||
1001 | if (!dlm->dlm_debug_ctxt) { | ||
1002 | mlog_errno(-ENOMEM); | ||
1003 | goto bail; | ||
1004 | } | ||
1005 | kref_init(&dlm->dlm_debug_ctxt->debug_refcnt); | ||
1006 | |||
1007 | return 0; | ||
1008 | bail: | ||
1009 | dlm_destroy_debugfs_subroot(dlm); | ||
1010 | return -ENOMEM; | ||
1011 | } | ||
1012 | |||
1013 | void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) | ||
1014 | { | ||
1015 | if (dlm->dlm_debugfs_subroot) | ||
1016 | debugfs_remove(dlm->dlm_debugfs_subroot); | ||
1017 | } | ||
1018 | |||
1019 | /* debugfs root */ | ||
1020 | int dlm_create_debugfs_root(void) | ||
1021 | { | ||
1022 | dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL); | ||
1023 | if (!dlm_debugfs_root) { | ||
1024 | mlog_errno(-ENOMEM); | ||
1025 | return -ENOMEM; | ||
1026 | } | ||
1027 | return 0; | ||
1028 | } | ||
1029 | |||
1030 | void dlm_destroy_debugfs_root(void) | ||
1031 | { | ||
1032 | if (dlm_debugfs_root) | ||
1033 | debugfs_remove(dlm_debugfs_root); | ||
1034 | } | ||
1035 | #endif /* CONFIG_DEBUG_FS */ | ||
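The new dlmdebug.c entries all follow the stock debugfs pattern: each file is created with debugfs_create_file(), the open routine prepares the data, and reads are served either from a preformatted private buffer (the dlm state, mle and purge-list dumps) or through a seq_file iterator over dlm->tracking_list (the lockres dump). As a rough, self-contained sketch of that pattern outside of ocfs2 (the "mydrv" names are illustrative and not part of this patch; single_open() is just the simplest way to get the same read plumbing):

/*
 * Illustrative sketch of the debugfs pattern used above -- not part of
 * the ocfs2 patch.  "mydrv" and its state string are made-up names.
 */
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>

static struct dentry *mydrv_dir;
static struct dentry *mydrv_state;

static int mydrv_state_show(struct seq_file *m, void *unused)
{
	seq_printf(m, "state: %s\n", "example");
	return 0;
}

static int mydrv_state_open(struct inode *inode, struct file *file)
{
	/* single_open() hooks ->show up to the seq_file read plumbing */
	return single_open(file, mydrv_state_show, inode->i_private);
}

static const struct file_operations mydrv_state_fops = {
	.open		= mydrv_state_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init mydrv_init(void)
{
	mydrv_dir = debugfs_create_dir("mydrv", NULL);
	if (!mydrv_dir)
		return -ENOMEM;

	mydrv_state = debugfs_create_file("state", S_IFREG|S_IRUSR,
					  mydrv_dir, NULL, &mydrv_state_fops);
	if (!mydrv_state) {
		debugfs_remove(mydrv_dir);
		return -ENOMEM;
	}
	return 0;
}

static void __exit mydrv_exit(void)
{
	debugfs_remove(mydrv_state);
	debugfs_remove(mydrv_dir);
}

module_init(mydrv_init);
module_exit(mydrv_exit);
MODULE_LICENSE("GPL");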
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h new file mode 100644 index 000000000000..d34a62a3a625 --- /dev/null +++ b/fs/ocfs2/dlm/dlmdebug.h | |||
@@ -0,0 +1,86 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * dlmdebug.h | ||
5 | * | ||
6 | * Copyright (C) 2008 Oracle. All rights reserved. | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or | ||
9 | * modify it under the terms of the GNU General Public | ||
10 | * License as published by the Free Software Foundation; either | ||
11 | * version 2 of the License, or (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
16 | * General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public | ||
19 | * License along with this program; if not, write to the | ||
20 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
21 | * Boston, MA 021110-1307, USA. | ||
22 | * | ||
23 | */ | ||
24 | |||
25 | #ifndef DLMDEBUG_H | ||
26 | #define DLMDEBUG_H | ||
27 | |||
28 | void dlm_print_one_mle(struct dlm_master_list_entry *mle); | ||
29 | |||
30 | #ifdef CONFIG_DEBUG_FS | ||
31 | |||
32 | struct dlm_debug_ctxt { | ||
33 | struct kref debug_refcnt; | ||
34 | struct dentry *debug_state_dentry; | ||
35 | struct dentry *debug_lockres_dentry; | ||
36 | struct dentry *debug_mle_dentry; | ||
37 | struct dentry *debug_purgelist_dentry; | ||
38 | }; | ||
39 | |||
40 | struct debug_buffer { | ||
41 | int len; | ||
42 | char *buf; | ||
43 | }; | ||
44 | |||
45 | struct debug_lockres { | ||
46 | int dl_len; | ||
47 | char *dl_buf; | ||
48 | struct dlm_ctxt *dl_ctxt; | ||
49 | struct dlm_lock_resource *dl_res; | ||
50 | }; | ||
51 | |||
52 | int dlm_debug_init(struct dlm_ctxt *dlm); | ||
53 | void dlm_debug_shutdown(struct dlm_ctxt *dlm); | ||
54 | |||
55 | int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); | ||
56 | void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); | ||
57 | |||
58 | int dlm_create_debugfs_root(void); | ||
59 | void dlm_destroy_debugfs_root(void); | ||
60 | |||
61 | #else | ||
62 | |||
63 | static inline int dlm_debug_init(struct dlm_ctxt *dlm) | ||
64 | { | ||
65 | return 0; | ||
66 | } | ||
67 | static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) | ||
68 | { | ||
69 | } | ||
70 | static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) | ||
71 | { | ||
72 | return 0; | ||
73 | } | ||
74 | static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) | ||
75 | { | ||
76 | } | ||
77 | static inline int dlm_create_debugfs_root(void) | ||
78 | { | ||
79 | return 0; | ||
80 | } | ||
81 | static inline void dlm_destroy_debugfs_root(void) | ||
82 | { | ||
83 | } | ||
84 | |||
85 | #endif /* CONFIG_DEBUG_FS */ | ||
86 | #endif /* DLMDEBUG_H */ | ||
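The #else branch gives every entry point a no-op stub, so callers such as dlm_join_domain() and dlm_free_ctxt_mem() can invoke the debug hooks unconditionally; the #ifdef lives in exactly one place. The same idiom in miniature, with a hypothetical feature name that is not from this patch:

/* Illustrative only -- the usual "stub out when disabled" header idiom. */
#ifdef CONFIG_MYFEATURE_DEBUG
int myfeature_debug_init(void);
void myfeature_debug_exit(void);
#else
static inline int myfeature_debug_init(void)
{
	return 0;		/* report success so callers need no #ifdef */
}
static inline void myfeature_debug_exit(void)
{
}
#endif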
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 0879d86113e3..63f8125824e8 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/spinlock.h> | 33 | #include <linux/spinlock.h> |
34 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
35 | #include <linux/err.h> | 35 | #include <linux/err.h> |
36 | #include <linux/debugfs.h> | ||
36 | 37 | ||
37 | #include "cluster/heartbeat.h" | 38 | #include "cluster/heartbeat.h" |
38 | #include "cluster/nodemanager.h" | 39 | #include "cluster/nodemanager.h" |
@@ -40,8 +41,8 @@ | |||
40 | 41 | ||
41 | #include "dlmapi.h" | 42 | #include "dlmapi.h" |
42 | #include "dlmcommon.h" | 43 | #include "dlmcommon.h" |
43 | |||
44 | #include "dlmdomain.h" | 44 | #include "dlmdomain.h" |
45 | #include "dlmdebug.h" | ||
45 | 46 | ||
46 | #include "dlmver.h" | 47 | #include "dlmver.h" |
47 | 48 | ||
@@ -298,6 +299,8 @@ static int dlm_wait_on_domain_helper(const char *domain) | |||
298 | 299 | ||
299 | static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) | 300 | static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) |
300 | { | 301 | { |
302 | dlm_destroy_debugfs_subroot(dlm); | ||
303 | |||
301 | if (dlm->lockres_hash) | 304 | if (dlm->lockres_hash) |
302 | dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); | 305 | dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); |
303 | 306 | ||
@@ -395,6 +398,7 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) | |||
395 | static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) | 398 | static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) |
396 | { | 399 | { |
397 | dlm_unregister_domain_handlers(dlm); | 400 | dlm_unregister_domain_handlers(dlm); |
401 | dlm_debug_shutdown(dlm); | ||
398 | dlm_complete_thread(dlm); | 402 | dlm_complete_thread(dlm); |
399 | dlm_complete_recovery_thread(dlm); | 403 | dlm_complete_recovery_thread(dlm); |
400 | dlm_destroy_dlm_worker(dlm); | 404 | dlm_destroy_dlm_worker(dlm); |
@@ -644,6 +648,7 @@ int dlm_shutting_down(struct dlm_ctxt *dlm) | |||
644 | void dlm_unregister_domain(struct dlm_ctxt *dlm) | 648 | void dlm_unregister_domain(struct dlm_ctxt *dlm) |
645 | { | 649 | { |
646 | int leave = 0; | 650 | int leave = 0; |
651 | struct dlm_lock_resource *res; | ||
647 | 652 | ||
648 | spin_lock(&dlm_domain_lock); | 653 | spin_lock(&dlm_domain_lock); |
649 | BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); | 654 | BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); |
@@ -673,6 +678,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm) | |||
673 | msleep(500); | 678 | msleep(500); |
674 | mlog(0, "%s: more migration to do\n", dlm->name); | 679 | mlog(0, "%s: more migration to do\n", dlm->name); |
675 | } | 680 | } |
681 | |||
682 | /* This list should be empty. If not, print remaining lockres */ | ||
683 | if (!list_empty(&dlm->tracking_list)) { | ||
684 | mlog(ML_ERROR, "Following lockres' are still on the " | ||
685 | "tracking list:\n"); | ||
686 | list_for_each_entry(res, &dlm->tracking_list, tracking) | ||
687 | dlm_print_one_lock_resource(res); | ||
688 | } | ||
689 | |||
676 | dlm_mark_domain_leaving(dlm); | 690 | dlm_mark_domain_leaving(dlm); |
677 | dlm_leave_domain(dlm); | 691 | dlm_leave_domain(dlm); |
678 | dlm_complete_dlm_shutdown(dlm); | 692 | dlm_complete_dlm_shutdown(dlm); |
@@ -1405,6 +1419,12 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) | |||
1405 | goto bail; | 1419 | goto bail; |
1406 | } | 1420 | } |
1407 | 1421 | ||
1422 | status = dlm_debug_init(dlm); | ||
1423 | if (status < 0) { | ||
1424 | mlog_errno(status); | ||
1425 | goto bail; | ||
1426 | } | ||
1427 | |||
1408 | status = dlm_launch_thread(dlm); | 1428 | status = dlm_launch_thread(dlm); |
1409 | if (status < 0) { | 1429 | if (status < 0) { |
1410 | mlog_errno(status); | 1430 | mlog_errno(status); |
@@ -1472,6 +1492,7 @@ bail: | |||
1472 | 1492 | ||
1473 | if (status) { | 1493 | if (status) { |
1474 | dlm_unregister_domain_handlers(dlm); | 1494 | dlm_unregister_domain_handlers(dlm); |
1495 | dlm_debug_shutdown(dlm); | ||
1475 | dlm_complete_thread(dlm); | 1496 | dlm_complete_thread(dlm); |
1476 | dlm_complete_recovery_thread(dlm); | 1497 | dlm_complete_recovery_thread(dlm); |
1477 | dlm_destroy_dlm_worker(dlm); | 1498 | dlm_destroy_dlm_worker(dlm); |
@@ -1484,6 +1505,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
1484 | u32 key) | 1505 | u32 key) |
1485 | { | 1506 | { |
1486 | int i; | 1507 | int i; |
1508 | int ret; | ||
1487 | struct dlm_ctxt *dlm = NULL; | 1509 | struct dlm_ctxt *dlm = NULL; |
1488 | 1510 | ||
1489 | dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); | 1511 | dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); |
@@ -1516,6 +1538,15 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
1516 | dlm->key = key; | 1538 | dlm->key = key; |
1517 | dlm->node_num = o2nm_this_node(); | 1539 | dlm->node_num = o2nm_this_node(); |
1518 | 1540 | ||
1541 | ret = dlm_create_debugfs_subroot(dlm); | ||
1542 | if (ret < 0) { | ||
1543 | dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); | ||
1544 | kfree(dlm->name); | ||
1545 | kfree(dlm); | ||
1546 | dlm = NULL; | ||
1547 | goto leave; | ||
1548 | } | ||
1549 | |||
1519 | spin_lock_init(&dlm->spinlock); | 1550 | spin_lock_init(&dlm->spinlock); |
1520 | spin_lock_init(&dlm->master_lock); | 1551 | spin_lock_init(&dlm->master_lock); |
1521 | spin_lock_init(&dlm->ast_lock); | 1552 | spin_lock_init(&dlm->ast_lock); |
@@ -1526,6 +1557,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, | |||
1526 | INIT_LIST_HEAD(&dlm->reco.node_data); | 1557 | INIT_LIST_HEAD(&dlm->reco.node_data); |
1527 | INIT_LIST_HEAD(&dlm->purge_list); | 1558 | INIT_LIST_HEAD(&dlm->purge_list); |
1528 | INIT_LIST_HEAD(&dlm->dlm_domain_handlers); | 1559 | INIT_LIST_HEAD(&dlm->dlm_domain_handlers); |
1560 | INIT_LIST_HEAD(&dlm->tracking_list); | ||
1529 | dlm->reco.state = 0; | 1561 | dlm->reco.state = 0; |
1530 | 1562 | ||
1531 | INIT_LIST_HEAD(&dlm->pending_asts); | 1563 | INIT_LIST_HEAD(&dlm->pending_asts); |
@@ -1816,21 +1848,49 @@ static int __init dlm_init(void) | |||
1816 | dlm_print_version(); | 1848 | dlm_print_version(); |
1817 | 1849 | ||
1818 | status = dlm_init_mle_cache(); | 1850 | status = dlm_init_mle_cache(); |
1819 | if (status) | 1851 | if (status) { |
1820 | return -1; | 1852 | mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n"); |
1853 | goto error; | ||
1854 | } | ||
1855 | |||
1856 | status = dlm_init_master_caches(); | ||
1857 | if (status) { | ||
1858 | mlog(ML_ERROR, "Could not create o2dlm_lockres and " | ||
1859 | "o2dlm_lockname slabcaches\n"); | ||
1860 | goto error; | ||
1861 | } | ||
1862 | |||
1863 | status = dlm_init_lock_cache(); | ||
1864 | if (status) { | ||
1865 | mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n"); | ||
1866 | goto error; | ||
1867 | } | ||
1821 | 1868 | ||
1822 | status = dlm_register_net_handlers(); | 1869 | status = dlm_register_net_handlers(); |
1823 | if (status) { | 1870 | if (status) { |
1824 | dlm_destroy_mle_cache(); | 1871 | mlog(ML_ERROR, "Unable to register network handlers\n"); |
1825 | return -1; | 1872 | goto error; |
1826 | } | 1873 | } |
1827 | 1874 | ||
1875 | status = dlm_create_debugfs_root(); | ||
1876 | if (status) | ||
1877 | goto error; | ||
1878 | |||
1828 | return 0; | 1879 | return 0; |
1880 | error: | ||
1881 | dlm_unregister_net_handlers(); | ||
1882 | dlm_destroy_lock_cache(); | ||
1883 | dlm_destroy_master_caches(); | ||
1884 | dlm_destroy_mle_cache(); | ||
1885 | return -1; | ||
1829 | } | 1886 | } |
1830 | 1887 | ||
1831 | static void __exit dlm_exit (void) | 1888 | static void __exit dlm_exit (void) |
1832 | { | 1889 | { |
1890 | dlm_destroy_debugfs_root(); | ||
1833 | dlm_unregister_net_handlers(); | 1891 | dlm_unregister_net_handlers(); |
1892 | dlm_destroy_lock_cache(); | ||
1893 | dlm_destroy_master_caches(); | ||
1834 | dlm_destroy_mle_cache(); | 1894 | dlm_destroy_mle_cache(); |
1835 | } | 1895 | } |
1836 | 1896 | ||
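Note the error unwind in dlm_init(): every failure now funnels through a single error label that tears down everything, which is safe only because the destroy helpers are no-ops for objects that were never created (dlm_destroy_lock_cache() and dlm_destroy_master_caches() check their cache pointers before freeing). A condensed sketch of that idiom, using made-up cache names rather than anything from this patch:

/* Illustrative only -- mirrors the single-label unwind used by dlm_init(). */
#include <linux/module.h>
#include <linux/slab.h>

static struct kmem_cache *a_cache;
static struct kmem_cache *b_cache;

static void destroy_caches(void)
{
	/* Safe even if the matching create never ran or failed. */
	if (b_cache)
		kmem_cache_destroy(b_cache);
	if (a_cache)
		kmem_cache_destroy(a_cache);
}

static int __init my_init(void)
{
	a_cache = kmem_cache_create("a_cache", 64, 0, SLAB_HWCACHE_ALIGN, NULL);
	if (!a_cache)
		goto error;

	b_cache = kmem_cache_create("b_cache", 128, 0, SLAB_HWCACHE_ALIGN, NULL);
	if (!b_cache)
		goto error;

	return 0;
error:
	destroy_caches();
	return -ENOMEM;
}

static void __exit my_exit(void)
{
	destroy_caches();
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");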
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 52578d907d9a..83a9f2972ac8 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c | |||
@@ -53,6 +53,8 @@ | |||
53 | #define MLOG_MASK_PREFIX ML_DLM | 53 | #define MLOG_MASK_PREFIX ML_DLM |
54 | #include "cluster/masklog.h" | 54 | #include "cluster/masklog.h" |
55 | 55 | ||
56 | static struct kmem_cache *dlm_lock_cache = NULL; | ||
57 | |||
56 | static DEFINE_SPINLOCK(dlm_cookie_lock); | 58 | static DEFINE_SPINLOCK(dlm_cookie_lock); |
57 | static u64 dlm_next_cookie = 1; | 59 | static u64 dlm_next_cookie = 1; |
58 | 60 | ||
@@ -64,6 +66,22 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type, | |||
64 | static void dlm_lock_release(struct kref *kref); | 66 | static void dlm_lock_release(struct kref *kref); |
65 | static void dlm_lock_detach_lockres(struct dlm_lock *lock); | 67 | static void dlm_lock_detach_lockres(struct dlm_lock *lock); |
66 | 68 | ||
69 | int dlm_init_lock_cache(void) | ||
70 | { | ||
71 | dlm_lock_cache = kmem_cache_create("o2dlm_lock", | ||
72 | sizeof(struct dlm_lock), | ||
73 | 0, SLAB_HWCACHE_ALIGN, NULL); | ||
74 | if (dlm_lock_cache == NULL) | ||
75 | return -ENOMEM; | ||
76 | return 0; | ||
77 | } | ||
78 | |||
79 | void dlm_destroy_lock_cache(void) | ||
80 | { | ||
81 | if (dlm_lock_cache) | ||
82 | kmem_cache_destroy(dlm_lock_cache); | ||
83 | } | ||
84 | |||
67 | /* Tell us whether we can grant a new lock request. | 85 | /* Tell us whether we can grant a new lock request. |
68 | * locking: | 86 | * locking: |
69 | * caller needs: res->spinlock | 87 | * caller needs: res->spinlock |
@@ -353,7 +371,7 @@ static void dlm_lock_release(struct kref *kref) | |||
353 | mlog(0, "freeing kernel-allocated lksb\n"); | 371 | mlog(0, "freeing kernel-allocated lksb\n"); |
354 | kfree(lock->lksb); | 372 | kfree(lock->lksb); |
355 | } | 373 | } |
356 | kfree(lock); | 374 | kmem_cache_free(dlm_lock_cache, lock); |
357 | } | 375 | } |
358 | 376 | ||
359 | /* associate a lock with it's lockres, getting a ref on the lockres */ | 377 | /* associate a lock with it's lockres, getting a ref on the lockres */ |
@@ -412,7 +430,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, | |||
412 | struct dlm_lock *lock; | 430 | struct dlm_lock *lock; |
413 | int kernel_allocated = 0; | 431 | int kernel_allocated = 0; |
414 | 432 | ||
415 | lock = kzalloc(sizeof(*lock), GFP_NOFS); | 433 | lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS); |
416 | if (!lock) | 434 | if (!lock) |
417 | return NULL; | 435 | return NULL; |
418 | 436 | ||
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index ea6b89577860..efc015c6128a 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -48,47 +48,11 @@ | |||
48 | #include "dlmapi.h" | 48 | #include "dlmapi.h" |
49 | #include "dlmcommon.h" | 49 | #include "dlmcommon.h" |
50 | #include "dlmdomain.h" | 50 | #include "dlmdomain.h" |
51 | #include "dlmdebug.h" | ||
51 | 52 | ||
52 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) | 53 | #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) |
53 | #include "cluster/masklog.h" | 54 | #include "cluster/masklog.h" |
54 | 55 | ||
55 | enum dlm_mle_type { | ||
56 | DLM_MLE_BLOCK, | ||
57 | DLM_MLE_MASTER, | ||
58 | DLM_MLE_MIGRATION | ||
59 | }; | ||
60 | |||
61 | struct dlm_lock_name | ||
62 | { | ||
63 | u8 len; | ||
64 | u8 name[DLM_LOCKID_NAME_MAX]; | ||
65 | }; | ||
66 | |||
67 | struct dlm_master_list_entry | ||
68 | { | ||
69 | struct list_head list; | ||
70 | struct list_head hb_events; | ||
71 | struct dlm_ctxt *dlm; | ||
72 | spinlock_t spinlock; | ||
73 | wait_queue_head_t wq; | ||
74 | atomic_t woken; | ||
75 | struct kref mle_refs; | ||
76 | int inuse; | ||
77 | unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
78 | unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
79 | unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
80 | unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
81 | u8 master; | ||
82 | u8 new_master; | ||
83 | enum dlm_mle_type type; | ||
84 | struct o2hb_callback_func mle_hb_up; | ||
85 | struct o2hb_callback_func mle_hb_down; | ||
86 | union { | ||
87 | struct dlm_lock_resource *res; | ||
88 | struct dlm_lock_name name; | ||
89 | } u; | ||
90 | }; | ||
91 | |||
92 | static void dlm_mle_node_down(struct dlm_ctxt *dlm, | 56 | static void dlm_mle_node_down(struct dlm_ctxt *dlm, |
93 | struct dlm_master_list_entry *mle, | 57 | struct dlm_master_list_entry *mle, |
94 | struct o2nm_node *node, | 58 | struct o2nm_node *node, |
@@ -128,98 +92,10 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, | |||
128 | return 1; | 92 | return 1; |
129 | } | 93 | } |
130 | 94 | ||
131 | #define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) | 95 | static struct kmem_cache *dlm_lockres_cache = NULL; |
132 | static void _dlm_print_nodemap(unsigned long *map, const char *mapname) | 96 | static struct kmem_cache *dlm_lockname_cache = NULL; |
133 | { | ||
134 | int i; | ||
135 | printk("%s=[ ", mapname); | ||
136 | for (i=0; i<O2NM_MAX_NODES; i++) | ||
137 | if (test_bit(i, map)) | ||
138 | printk("%d ", i); | ||
139 | printk("]"); | ||
140 | } | ||
141 | |||
142 | static void dlm_print_one_mle(struct dlm_master_list_entry *mle) | ||
143 | { | ||
144 | int refs; | ||
145 | char *type; | ||
146 | char attached; | ||
147 | u8 master; | ||
148 | unsigned int namelen; | ||
149 | const char *name; | ||
150 | struct kref *k; | ||
151 | unsigned long *maybe = mle->maybe_map, | ||
152 | *vote = mle->vote_map, | ||
153 | *resp = mle->response_map, | ||
154 | *node = mle->node_map; | ||
155 | |||
156 | k = &mle->mle_refs; | ||
157 | if (mle->type == DLM_MLE_BLOCK) | ||
158 | type = "BLK"; | ||
159 | else if (mle->type == DLM_MLE_MASTER) | ||
160 | type = "MAS"; | ||
161 | else | ||
162 | type = "MIG"; | ||
163 | refs = atomic_read(&k->refcount); | ||
164 | master = mle->master; | ||
165 | attached = (list_empty(&mle->hb_events) ? 'N' : 'Y'); | ||
166 | |||
167 | if (mle->type != DLM_MLE_MASTER) { | ||
168 | namelen = mle->u.name.len; | ||
169 | name = mle->u.name.name; | ||
170 | } else { | ||
171 | namelen = mle->u.res->lockname.len; | ||
172 | name = mle->u.res->lockname.name; | ||
173 | } | ||
174 | |||
175 | mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ", | ||
176 | namelen, name, type, refs, master, mle->new_master, attached, | ||
177 | mle->inuse); | ||
178 | dlm_print_nodemap(maybe); | ||
179 | printk(", "); | ||
180 | dlm_print_nodemap(vote); | ||
181 | printk(", "); | ||
182 | dlm_print_nodemap(resp); | ||
183 | printk(", "); | ||
184 | dlm_print_nodemap(node); | ||
185 | printk(", "); | ||
186 | printk("\n"); | ||
187 | } | ||
188 | |||
189 | #if 0 | ||
190 | /* Code here is included but defined out as it aids debugging */ | ||
191 | |||
192 | static void dlm_dump_mles(struct dlm_ctxt *dlm) | ||
193 | { | ||
194 | struct dlm_master_list_entry *mle; | ||
195 | |||
196 | mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); | ||
197 | spin_lock(&dlm->master_lock); | ||
198 | list_for_each_entry(mle, &dlm->master_list, list) | ||
199 | dlm_print_one_mle(mle); | ||
200 | spin_unlock(&dlm->master_lock); | ||
201 | } | ||
202 | |||
203 | int dlm_dump_all_mles(const char __user *data, unsigned int len) | ||
204 | { | ||
205 | struct dlm_ctxt *dlm; | ||
206 | |||
207 | spin_lock(&dlm_domain_lock); | ||
208 | list_for_each_entry(dlm, &dlm_domains, list) { | ||
209 | mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); | ||
210 | dlm_dump_mles(dlm); | ||
211 | } | ||
212 | spin_unlock(&dlm_domain_lock); | ||
213 | return len; | ||
214 | } | ||
215 | EXPORT_SYMBOL_GPL(dlm_dump_all_mles); | ||
216 | |||
217 | #endif /* 0 */ | ||
218 | |||
219 | |||
220 | static struct kmem_cache *dlm_mle_cache = NULL; | 97 | static struct kmem_cache *dlm_mle_cache = NULL; |
221 | 98 | ||
222 | |||
223 | static void dlm_mle_release(struct kref *kref); | 99 | static void dlm_mle_release(struct kref *kref); |
224 | static void dlm_init_mle(struct dlm_master_list_entry *mle, | 100 | static void dlm_init_mle(struct dlm_master_list_entry *mle, |
225 | enum dlm_mle_type type, | 101 | enum dlm_mle_type type, |
@@ -507,7 +383,7 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm, | |||
507 | 383 | ||
508 | int dlm_init_mle_cache(void) | 384 | int dlm_init_mle_cache(void) |
509 | { | 385 | { |
510 | dlm_mle_cache = kmem_cache_create("dlm_mle_cache", | 386 | dlm_mle_cache = kmem_cache_create("o2dlm_mle", |
511 | sizeof(struct dlm_master_list_entry), | 387 | sizeof(struct dlm_master_list_entry), |
512 | 0, SLAB_HWCACHE_ALIGN, | 388 | 0, SLAB_HWCACHE_ALIGN, |
513 | NULL); | 389 | NULL); |
@@ -560,6 +436,35 @@ static void dlm_mle_release(struct kref *kref) | |||
560 | * LOCK RESOURCE FUNCTIONS | 436 | * LOCK RESOURCE FUNCTIONS |
561 | */ | 437 | */ |
562 | 438 | ||
439 | int dlm_init_master_caches(void) | ||
440 | { | ||
441 | dlm_lockres_cache = kmem_cache_create("o2dlm_lockres", | ||
442 | sizeof(struct dlm_lock_resource), | ||
443 | 0, SLAB_HWCACHE_ALIGN, NULL); | ||
444 | if (!dlm_lockres_cache) | ||
445 | goto bail; | ||
446 | |||
447 | dlm_lockname_cache = kmem_cache_create("o2dlm_lockname", | ||
448 | DLM_LOCKID_NAME_MAX, 0, | ||
449 | SLAB_HWCACHE_ALIGN, NULL); | ||
450 | if (!dlm_lockname_cache) | ||
451 | goto bail; | ||
452 | |||
453 | return 0; | ||
454 | bail: | ||
455 | dlm_destroy_master_caches(); | ||
456 | return -ENOMEM; | ||
457 | } | ||
458 | |||
459 | void dlm_destroy_master_caches(void) | ||
460 | { | ||
461 | if (dlm_lockname_cache) | ||
462 | kmem_cache_destroy(dlm_lockname_cache); | ||
463 | |||
464 | if (dlm_lockres_cache) | ||
465 | kmem_cache_destroy(dlm_lockres_cache); | ||
466 | } | ||
467 | |||
563 | static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, | 468 | static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, |
564 | struct dlm_lock_resource *res, | 469 | struct dlm_lock_resource *res, |
565 | u8 owner) | 470 | u8 owner) |
@@ -610,6 +515,14 @@ static void dlm_lockres_release(struct kref *kref) | |||
610 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, | 515 | mlog(0, "destroying lockres %.*s\n", res->lockname.len, |
611 | res->lockname.name); | 516 | res->lockname.name); |
612 | 517 | ||
518 | if (!list_empty(&res->tracking)) | ||
519 | list_del_init(&res->tracking); | ||
520 | else { | ||
521 | mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n", | ||
522 | res->lockname.len, res->lockname.name); | ||
523 | dlm_print_one_lock_resource(res); | ||
524 | } | ||
525 | |||
613 | if (!hlist_unhashed(&res->hash_node) || | 526 | if (!hlist_unhashed(&res->hash_node) || |
614 | !list_empty(&res->granted) || | 527 | !list_empty(&res->granted) || |
615 | !list_empty(&res->converting) || | 528 | !list_empty(&res->converting) || |
@@ -642,9 +555,9 @@ static void dlm_lockres_release(struct kref *kref) | |||
642 | BUG_ON(!list_empty(&res->recovering)); | 555 | BUG_ON(!list_empty(&res->recovering)); |
643 | BUG_ON(!list_empty(&res->purge)); | 556 | BUG_ON(!list_empty(&res->purge)); |
644 | 557 | ||
645 | kfree(res->lockname.name); | 558 | kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); |
646 | 559 | ||
647 | kfree(res); | 560 | kmem_cache_free(dlm_lockres_cache, res); |
648 | } | 561 | } |
649 | 562 | ||
650 | void dlm_lockres_put(struct dlm_lock_resource *res) | 563 | void dlm_lockres_put(struct dlm_lock_resource *res) |
@@ -677,6 +590,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, | |||
677 | INIT_LIST_HEAD(&res->dirty); | 590 | INIT_LIST_HEAD(&res->dirty); |
678 | INIT_LIST_HEAD(&res->recovering); | 591 | INIT_LIST_HEAD(&res->recovering); |
679 | INIT_LIST_HEAD(&res->purge); | 592 | INIT_LIST_HEAD(&res->purge); |
593 | INIT_LIST_HEAD(&res->tracking); | ||
680 | atomic_set(&res->asts_reserved, 0); | 594 | atomic_set(&res->asts_reserved, 0); |
681 | res->migration_pending = 0; | 595 | res->migration_pending = 0; |
682 | res->inflight_locks = 0; | 596 | res->inflight_locks = 0; |
@@ -692,6 +606,8 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, | |||
692 | 606 | ||
693 | res->last_used = 0; | 607 | res->last_used = 0; |
694 | 608 | ||
609 | list_add_tail(&res->tracking, &dlm->tracking_list); | ||
610 | |||
695 | memset(res->lvb, 0, DLM_LVB_LEN); | 611 | memset(res->lvb, 0, DLM_LVB_LEN); |
696 | memset(res->refmap, 0, sizeof(res->refmap)); | 612 | memset(res->refmap, 0, sizeof(res->refmap)); |
697 | } | 613 | } |
@@ -700,20 +616,28 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, | |||
700 | const char *name, | 616 | const char *name, |
701 | unsigned int namelen) | 617 | unsigned int namelen) |
702 | { | 618 | { |
703 | struct dlm_lock_resource *res; | 619 | struct dlm_lock_resource *res = NULL; |
704 | 620 | ||
705 | res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); | 621 | res = (struct dlm_lock_resource *) |
622 | kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS); | ||
706 | if (!res) | 623 | if (!res) |
707 | return NULL; | 624 | goto error; |
708 | 625 | ||
709 | res->lockname.name = kmalloc(namelen, GFP_NOFS); | 626 | res->lockname.name = (char *) |
710 | if (!res->lockname.name) { | 627 | kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS); |
711 | kfree(res); | 628 | if (!res->lockname.name) |
712 | return NULL; | 629 | goto error; |
713 | } | ||
714 | 630 | ||
715 | dlm_init_lockres(dlm, res, name, namelen); | 631 | dlm_init_lockres(dlm, res, name, namelen); |
716 | return res; | 632 | return res; |
633 | |||
634 | error: | ||
635 | if (res && res->lockname.name) | ||
636 | kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); | ||
637 | |||
638 | if (res) | ||
639 | kmem_cache_free(dlm_lockres_cache, res); | ||
640 | return NULL; | ||
717 | } | 641 | } |
718 | 642 | ||
719 | void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, | 643 | void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, |
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 1f1873bf41fb..394d25a131a5 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
@@ -27,18 +27,11 @@ | |||
27 | #include <linux/slab.h> | 27 | #include <linux/slab.h> |
28 | #include <linux/highmem.h> | 28 | #include <linux/highmem.h> |
29 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
30 | #include <linux/crc32.h> | ||
31 | #include <linux/kthread.h> | 30 | #include <linux/kthread.h> |
32 | #include <linux/pagemap.h> | 31 | #include <linux/pagemap.h> |
33 | #include <linux/debugfs.h> | 32 | #include <linux/debugfs.h> |
34 | #include <linux/seq_file.h> | 33 | #include <linux/seq_file.h> |
35 | 34 | ||
36 | #include <cluster/heartbeat.h> | ||
37 | #include <cluster/nodemanager.h> | ||
38 | #include <cluster/tcp.h> | ||
39 | |||
40 | #include <dlm/dlmapi.h> | ||
41 | |||
42 | #define MLOG_MASK_PREFIX ML_DLM_GLUE | 35 | #define MLOG_MASK_PREFIX ML_DLM_GLUE |
43 | #include <cluster/masklog.h> | 36 | #include <cluster/masklog.h> |
44 | 37 | ||
@@ -53,6 +46,7 @@ | |||
53 | #include "heartbeat.h" | 46 | #include "heartbeat.h" |
54 | #include "inode.h" | 47 | #include "inode.h" |
55 | #include "journal.h" | 48 | #include "journal.h" |
49 | #include "stackglue.h" | ||
56 | #include "slot_map.h" | 50 | #include "slot_map.h" |
57 | #include "super.h" | 51 | #include "super.h" |
58 | #include "uptodate.h" | 52 | #include "uptodate.h" |
@@ -113,7 +107,8 @@ static void ocfs2_dump_meta_lvb_info(u64 level, | |||
113 | unsigned int line, | 107 | unsigned int line, |
114 | struct ocfs2_lock_res *lockres) | 108 | struct ocfs2_lock_res *lockres) |
115 | { | 109 | { |
116 | struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | 110 | struct ocfs2_meta_lvb *lvb = |
111 | (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); | ||
117 | 112 | ||
118 | mlog(level, "LVB information for %s (called from %s:%u):\n", | 113 | mlog(level, "LVB information for %s (called from %s:%u):\n", |
119 | lockres->l_name, function, line); | 114 | lockres->l_name, function, line); |
@@ -259,31 +254,6 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = { | |||
259 | .flags = 0, | 254 | .flags = 0, |
260 | }; | 255 | }; |
261 | 256 | ||
262 | /* | ||
263 | * This is the filesystem locking protocol version. | ||
264 | * | ||
265 | * Whenever the filesystem does new things with locks (adds or removes a | ||
266 | * lock, orders them differently, does different things underneath a lock), | ||
267 | * the version must be changed. The protocol is negotiated when joining | ||
268 | * the dlm domain. A node may join the domain if its major version is | ||
269 | * identical to all other nodes and its minor version is greater than | ||
270 | * or equal to all other nodes. When its minor version is greater than | ||
271 | * the other nodes, it will run at the minor version specified by the | ||
272 | * other nodes. | ||
273 | * | ||
274 | * If a locking change is made that will not be compatible with older | ||
275 | * versions, the major number must be increased and the minor version set | ||
276 | * to zero. If a change merely adds a behavior that can be disabled when | ||
277 | * speaking to older versions, the minor version must be increased. If a | ||
278 | * change adds a fully backwards compatible change (eg, LVB changes that | ||
279 | * are just ignored by older versions), the version does not need to be | ||
280 | * updated. | ||
281 | */ | ||
282 | const struct dlm_protocol_version ocfs2_locking_protocol = { | ||
283 | .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, | ||
284 | .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, | ||
285 | }; | ||
286 | |||
287 | static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) | 257 | static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) |
288 | { | 258 | { |
289 | return lockres->l_type == OCFS2_LOCK_TYPE_META || | 259 | return lockres->l_type == OCFS2_LOCK_TYPE_META || |
@@ -316,7 +286,7 @@ static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *l | |||
316 | static int ocfs2_lock_create(struct ocfs2_super *osb, | 286 | static int ocfs2_lock_create(struct ocfs2_super *osb, |
317 | struct ocfs2_lock_res *lockres, | 287 | struct ocfs2_lock_res *lockres, |
318 | int level, | 288 | int level, |
319 | int dlm_flags); | 289 | u32 dlm_flags); |
320 | static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, | 290 | static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, |
321 | int wanted); | 291 | int wanted); |
322 | static void ocfs2_cluster_unlock(struct ocfs2_super *osb, | 292 | static void ocfs2_cluster_unlock(struct ocfs2_super *osb, |
@@ -330,10 +300,9 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, | |||
330 | struct ocfs2_lock_res *lockres); | 300 | struct ocfs2_lock_res *lockres); |
331 | static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, | 301 | static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, |
332 | int convert); | 302 | int convert); |
333 | #define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \ | 303 | #define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ |
334 | mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ | 304 | mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ |
335 | "resource %s: %s\n", dlm_errname(_stat), _func, \ | 305 | _err, _func, _lockres->l_name); \ |
336 | _lockres->l_name, dlm_errmsg(_stat)); \ | ||
337 | } while (0) | 306 | } while (0) |
338 | static int ocfs2_downconvert_thread(void *arg); | 307 | static int ocfs2_downconvert_thread(void *arg); |
339 | static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, | 308 | static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, |
@@ -342,12 +311,13 @@ static int ocfs2_inode_lock_update(struct inode *inode, | |||
342 | struct buffer_head **bh); | 311 | struct buffer_head **bh); |
343 | static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); | 312 | static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); |
344 | static inline int ocfs2_highest_compat_lock_level(int level); | 313 | static inline int ocfs2_highest_compat_lock_level(int level); |
345 | static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, | 314 | static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, |
346 | int new_level); | 315 | int new_level); |
347 | static int ocfs2_downconvert_lock(struct ocfs2_super *osb, | 316 | static int ocfs2_downconvert_lock(struct ocfs2_super *osb, |
348 | struct ocfs2_lock_res *lockres, | 317 | struct ocfs2_lock_res *lockres, |
349 | int new_level, | 318 | int new_level, |
350 | int lvb); | 319 | int lvb, |
320 | unsigned int generation); | ||
351 | static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, | 321 | static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, |
352 | struct ocfs2_lock_res *lockres); | 322 | struct ocfs2_lock_res *lockres); |
353 | static int ocfs2_cancel_convert(struct ocfs2_super *osb, | 323 | static int ocfs2_cancel_convert(struct ocfs2_super *osb, |
@@ -406,9 +376,9 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb, | |||
406 | res->l_ops = ops; | 376 | res->l_ops = ops; |
407 | res->l_priv = priv; | 377 | res->l_priv = priv; |
408 | 378 | ||
409 | res->l_level = LKM_IVMODE; | 379 | res->l_level = DLM_LOCK_IV; |
410 | res->l_requested = LKM_IVMODE; | 380 | res->l_requested = DLM_LOCK_IV; |
411 | res->l_blocking = LKM_IVMODE; | 381 | res->l_blocking = DLM_LOCK_IV; |
412 | res->l_action = OCFS2_AST_INVALID; | 382 | res->l_action = OCFS2_AST_INVALID; |
413 | res->l_unlock_action = OCFS2_UNLOCK_INVALID; | 383 | res->l_unlock_action = OCFS2_UNLOCK_INVALID; |
414 | 384 | ||
@@ -604,10 +574,10 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, | |||
604 | BUG_ON(!lockres); | 574 | BUG_ON(!lockres); |
605 | 575 | ||
606 | switch(level) { | 576 | switch(level) { |
607 | case LKM_EXMODE: | 577 | case DLM_LOCK_EX: |
608 | lockres->l_ex_holders++; | 578 | lockres->l_ex_holders++; |
609 | break; | 579 | break; |
610 | case LKM_PRMODE: | 580 | case DLM_LOCK_PR: |
611 | lockres->l_ro_holders++; | 581 | lockres->l_ro_holders++; |
612 | break; | 582 | break; |
613 | default: | 583 | default: |
@@ -625,11 +595,11 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, | |||
625 | BUG_ON(!lockres); | 595 | BUG_ON(!lockres); |
626 | 596 | ||
627 | switch(level) { | 597 | switch(level) { |
628 | case LKM_EXMODE: | 598 | case DLM_LOCK_EX: |
629 | BUG_ON(!lockres->l_ex_holders); | 599 | BUG_ON(!lockres->l_ex_holders); |
630 | lockres->l_ex_holders--; | 600 | lockres->l_ex_holders--; |
631 | break; | 601 | break; |
632 | case LKM_PRMODE: | 602 | case DLM_LOCK_PR: |
633 | BUG_ON(!lockres->l_ro_holders); | 603 | BUG_ON(!lockres->l_ro_holders); |
634 | lockres->l_ro_holders--; | 604 | lockres->l_ro_holders--; |
635 | break; | 605 | break; |
@@ -644,12 +614,12 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres, | |||
644 | * lock types are added. */ | 614 | * lock types are added. */ |
645 | static inline int ocfs2_highest_compat_lock_level(int level) | 615 | static inline int ocfs2_highest_compat_lock_level(int level) |
646 | { | 616 | { |
647 | int new_level = LKM_EXMODE; | 617 | int new_level = DLM_LOCK_EX; |
648 | 618 | ||
649 | if (level == LKM_EXMODE) | 619 | if (level == DLM_LOCK_EX) |
650 | new_level = LKM_NLMODE; | 620 | new_level = DLM_LOCK_NL; |
651 | else if (level == LKM_PRMODE) | 621 | else if (level == DLM_LOCK_PR) |
652 | new_level = LKM_PRMODE; | 622 | new_level = DLM_LOCK_PR; |
653 | return new_level; | 623 | return new_level; |
654 | } | 624 | } |
655 | 625 | ||
@@ -688,12 +658,12 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res | |||
688 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); | 658 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); |
689 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); | 659 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); |
690 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); | 660 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); |
691 | BUG_ON(lockres->l_blocking <= LKM_NLMODE); | 661 | BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); |
692 | 662 | ||
693 | lockres->l_level = lockres->l_requested; | 663 | lockres->l_level = lockres->l_requested; |
694 | if (lockres->l_level <= | 664 | if (lockres->l_level <= |
695 | ocfs2_highest_compat_lock_level(lockres->l_blocking)) { | 665 | ocfs2_highest_compat_lock_level(lockres->l_blocking)) { |
696 | lockres->l_blocking = LKM_NLMODE; | 666 | lockres->l_blocking = DLM_LOCK_NL; |
697 | lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); | 667 | lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); |
698 | } | 668 | } |
699 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | 669 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); |
@@ -712,7 +682,7 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo | |||
712 | * information is already up to date. Convert from NL to | 682 | * information is already up to date. Convert from NL to |
713 | * *anything* however should mark ourselves as needing an | 683 | * *anything* however should mark ourselves as needing an |
714 | * update */ | 684 | * update */ |
715 | if (lockres->l_level == LKM_NLMODE && | 685 | if (lockres->l_level == DLM_LOCK_NL && |
716 | lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) | 686 | lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) |
717 | lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | 687 | lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); |
718 | 688 | ||
@@ -729,7 +699,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc | |||
729 | BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); | 699 | BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); |
730 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); | 700 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); |
731 | 701 | ||
732 | if (lockres->l_requested > LKM_NLMODE && | 702 | if (lockres->l_requested > DLM_LOCK_NL && |
733 | !(lockres->l_flags & OCFS2_LOCK_LOCAL) && | 703 | !(lockres->l_flags & OCFS2_LOCK_LOCAL) && |
734 | lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) | 704 | lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) |
735 | lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); | 705 | lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); |
@@ -767,6 +737,113 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres, | |||
767 | return needs_downconvert; | 737 | return needs_downconvert; |
768 | } | 738 | } |
769 | 739 | ||
740 | /* | ||
741 | * OCFS2_LOCK_PENDING and l_pending_gen. | ||
742 | * | ||
743 | * Why does OCFS2_LOCK_PENDING exist? To close a race between setting | ||
744 | * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock() | ||
745 | * for more details on the race. | ||
746 | * | ||
747 | * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces | ||
748 | * a race on itself. In o2dlm, we can get the ast before ocfs2_dlm_lock() | ||
749 | * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear | ||
750 | * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns, | ||
751 | * the caller is going to try to clear PENDING again. If nothing else is | ||
752 | * happening, __lockres_clear_pending() sees PENDING is unset and does | ||
753 | * nothing. | ||
754 | * | ||
755 | * But what if another path (eg downconvert thread) has just started a | ||
756 | * new locking action? The other path has re-set PENDING. Our path | ||
757 | * cannot clear PENDING, because that will re-open the original race | ||
758 | * window. | ||
759 | * | ||
760 | * [Example] | ||
761 | * | ||
762 | * ocfs2_meta_lock() | ||
763 | * ocfs2_cluster_lock() | ||
764 | * set BUSY | ||
765 | * set PENDING | ||
766 | * drop l_lock | ||
767 | * ocfs2_dlm_lock() | ||
768 | * ocfs2_locking_ast() ocfs2_downconvert_thread() | ||
769 | * clear PENDING ocfs2_unblock_lock() | ||
770 | * take_l_lock | ||
771 | * !BUSY | ||
772 | * ocfs2_prepare_downconvert() | ||
773 | * set BUSY | ||
774 | * set PENDING | ||
775 | * drop l_lock | ||
776 | * take l_lock | ||
777 | * clear PENDING | ||
778 | * drop l_lock | ||
779 | * <window> | ||
780 | * ocfs2_dlm_lock() | ||
781 | * | ||
782 | * So as you can see, we now have a window where l_lock is not held, | ||
783 | * PENDING is not set, and ocfs2_dlm_lock() has not been called. | ||
784 | * | ||
785 | * The core problem is that ocfs2_cluster_lock() has cleared the PENDING | ||
786 | * set by ocfs2_prepare_downconvert(). That wasn't nice. | ||
787 | * | ||
788 | * To solve this we introduce l_pending_gen. A call to | ||
789 | * lockres_clear_pending() will only do so when it is passed a generation | ||
790 | * number that matches the lockres. lockres_set_pending() will return the | ||
791 | * current generation number. When ocfs2_cluster_lock() goes to clear | ||
792 | * PENDING, it passes the generation it got from set_pending(). In our | ||
793 | * example above, the generation numbers will *not* match. Thus, | ||
794 | * ocfs2_cluster_lock() will not clear the PENDING set by | ||
795 | * ocfs2_prepare_downconvert(). | ||
796 | */ | ||
797 | |||
798 | /* Unlocked version for ocfs2_locking_ast() */ | ||
799 | static void __lockres_clear_pending(struct ocfs2_lock_res *lockres, | ||
800 | unsigned int generation, | ||
801 | struct ocfs2_super *osb) | ||
802 | { | ||
803 | assert_spin_locked(&lockres->l_lock); | ||
804 | |||
805 | /* | ||
806 | * The ast and locking functions can race us here. The winner | ||
807 | * will clear pending, the loser will not. | ||
808 | */ | ||
809 | if (!(lockres->l_flags & OCFS2_LOCK_PENDING) || | ||
810 | (lockres->l_pending_gen != generation)) | ||
811 | return; | ||
812 | |||
813 | lockres_clear_flags(lockres, OCFS2_LOCK_PENDING); | ||
814 | lockres->l_pending_gen++; | ||
815 | |||
816 | /* | ||
817 | * The downconvert thread may have skipped us because we | ||
818 | * were PENDING. Wake it up. | ||
819 | */ | ||
820 | if (lockres->l_flags & OCFS2_LOCK_BLOCKED) | ||
821 | ocfs2_wake_downconvert_thread(osb); | ||
822 | } | ||
823 | |||
824 | /* Locked version for callers of ocfs2_dlm_lock() */ | ||
825 | static void lockres_clear_pending(struct ocfs2_lock_res *lockres, | ||
826 | unsigned int generation, | ||
827 | struct ocfs2_super *osb) | ||
828 | { | ||
829 | unsigned long flags; | ||
830 | |||
831 | spin_lock_irqsave(&lockres->l_lock, flags); | ||
832 | __lockres_clear_pending(lockres, generation, osb); | ||
833 | spin_unlock_irqrestore(&lockres->l_lock, flags); | ||
834 | } | ||
835 | |||
836 | static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres) | ||
837 | { | ||
838 | assert_spin_locked(&lockres->l_lock); | ||
839 | BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); | ||
840 | |||
841 | lockres_or_flags(lockres, OCFS2_LOCK_PENDING); | ||
842 | |||
843 | return lockres->l_pending_gen; | ||
844 | } | ||
845 | |||
846 | |||
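Read alongside the comment above, the generation rule reduces to a small compare-and-bump pattern. The following is a minimal userspace sketch, not ocfs2 code: the names echo lockres_set_pending() and __lockres_clear_pending(), but the struct is a simplified stand-in and the spinlock and irq handling are omitted.

	/* Minimal userspace model of the PENDING + generation handshake.
	 * Only the ordering rules are kept; all locking is omitted. */
	#include <stdio.h>

	struct fake_lockres {
		int pending;			/* stands in for OCFS2_LOCK_PENDING */
		unsigned int pending_gen;	/* stands in for l_pending_gen */
	};

	static unsigned int set_pending(struct fake_lockres *l)
	{
		l->pending = 1;
		return l->pending_gen;		/* caller remembers this generation */
	}

	static void clear_pending(struct fake_lockres *l, unsigned int gen)
	{
		/* Only a matching generation may clear PENDING. */
		if (!l->pending || l->pending_gen != gen)
			return;
		l->pending = 0;
		l->pending_gen++;
	}

	int main(void)
	{
		struct fake_lockres l = { 0, 0 };
		unsigned int gen_a, gen_b;

		gen_a = set_pending(&l);		/* path A, before its dlm_lock() */
		clear_pending(&l, l.pending_gen);	/* the ast clears with the current gen */
		gen_b = set_pending(&l);		/* path B (downconvert) starts a new request */
		clear_pending(&l, gen_a);		/* A's stale generation is rejected */
		printf("pending=%d, gen=%u (B's request survives)\n", l.pending, gen_b);
		return 0;
	}

Compiled and run, it prints pending=1, gen=1: the stale generation held by the first caller cannot clear the PENDING bit the second caller has just set, which is exactly the window the comment describes.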
770 | static void ocfs2_blocking_ast(void *opaque, int level) | 847 | static void ocfs2_blocking_ast(void *opaque, int level) |
771 | { | 848 | { |
772 | struct ocfs2_lock_res *lockres = opaque; | 849 | struct ocfs2_lock_res *lockres = opaque; |
@@ -774,7 +851,7 @@ static void ocfs2_blocking_ast(void *opaque, int level) | |||
774 | int needs_downconvert; | 851 | int needs_downconvert; |
775 | unsigned long flags; | 852 | unsigned long flags; |
776 | 853 | ||
777 | BUG_ON(level <= LKM_NLMODE); | 854 | BUG_ON(level <= DLM_LOCK_NL); |
778 | 855 | ||
779 | mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", | 856 | mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", |
780 | lockres->l_name, level, lockres->l_level, | 857 | lockres->l_name, level, lockres->l_level, |
@@ -801,14 +878,22 @@ static void ocfs2_blocking_ast(void *opaque, int level) | |||
801 | static void ocfs2_locking_ast(void *opaque) | 878 | static void ocfs2_locking_ast(void *opaque) |
802 | { | 879 | { |
803 | struct ocfs2_lock_res *lockres = opaque; | 880 | struct ocfs2_lock_res *lockres = opaque; |
804 | struct dlm_lockstatus *lksb = &lockres->l_lksb; | 881 | struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres); |
805 | unsigned long flags; | 882 | unsigned long flags; |
883 | int status; | ||
806 | 884 | ||
807 | spin_lock_irqsave(&lockres->l_lock, flags); | 885 | spin_lock_irqsave(&lockres->l_lock, flags); |
808 | 886 | ||
809 | if (lksb->status != DLM_NORMAL) { | 887 | status = ocfs2_dlm_lock_status(&lockres->l_lksb); |
810 | mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", | 888 | |
811 | lockres->l_name, lksb->status); | 889 | if (status == -EAGAIN) { |
890 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | ||
891 | goto out; | ||
892 | } | ||
893 | |||
894 | if (status) { | ||
895 | mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n", | ||
896 | lockres->l_name, status); | ||
812 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 897 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
813 | return; | 898 | return; |
814 | } | 899 | } |
@@ -831,11 +916,23 @@ static void ocfs2_locking_ast(void *opaque) | |||
831 | lockres->l_unlock_action); | 916 | lockres->l_unlock_action); |
832 | BUG(); | 917 | BUG(); |
833 | } | 918 | } |
834 | 919 | out: | |
835 | /* set it to something invalid so if we get called again we | 920 | /* set it to something invalid so if we get called again we |
836 | * can catch it. */ | 921 | * can catch it. */ |
837 | lockres->l_action = OCFS2_AST_INVALID; | 922 | lockres->l_action = OCFS2_AST_INVALID; |
838 | 923 | ||
924 | /* Did we try to cancel this lock? Clear that state */ | ||
925 | if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) | ||
926 | lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; | ||
927 | |||
928 | /* | ||
929 | * We may have beaten the locking functions here. We certainly | ||
930 | * know that dlm_lock() has been called :-) | ||
931 | * Because we can't have two lock calls in flight at once, we | ||
932 | * can use lockres->l_pending_gen. | ||
933 | */ | ||
934 | __lockres_clear_pending(lockres, lockres->l_pending_gen, osb); | ||
935 | |||
839 | wake_up(&lockres->l_event); | 936 | wake_up(&lockres->l_event); |
840 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 937 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
841 | } | 938 | } |
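With the stack glue in place, a refused DLM_LKF_NOQUEUE request can be reported asynchronously: the ast sees an -EAGAIN status, clears BUSY, and leaves the lock level untouched, and the waiter later notices the level never reached what it asked for (compare the ocfs2_file_lock() hunk further down). Below is a rough standalone model of that flow, with invented names and an arbitrary level encoding; it is not the real API.

	#include <errno.h>
	#include <stdio.h>

	struct model_lock {
		int level;	/* 0 = NL, 3 = PR, 5 = EX (illustrative encoding) */
		int busy;
	};

	static void locking_ast_model(struct model_lock *l, int status, int requested)
	{
		if (status == -EAGAIN) {	/* refused trylock: just drop BUSY */
			l->busy = 0;
			return;
		}
		l->level = requested;		/* granted: record the new level */
		l->busy = 0;
	}

	int main(void)
	{
		struct model_lock l = { .level = 0, .busy = 1 };
		int requested = 5;

		locking_ast_model(&l, -EAGAIN, requested);
		if (!l.busy && l.level < requested)
			printf("trylock failed asynchronously -> -EAGAIN\n");
		return 0;
	}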
@@ -865,15 +962,15 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, | |||
865 | static int ocfs2_lock_create(struct ocfs2_super *osb, | 962 | static int ocfs2_lock_create(struct ocfs2_super *osb, |
866 | struct ocfs2_lock_res *lockres, | 963 | struct ocfs2_lock_res *lockres, |
867 | int level, | 964 | int level, |
868 | int dlm_flags) | 965 | u32 dlm_flags) |
869 | { | 966 | { |
870 | int ret = 0; | 967 | int ret = 0; |
871 | enum dlm_status status = DLM_NORMAL; | ||
872 | unsigned long flags; | 968 | unsigned long flags; |
969 | unsigned int gen; | ||
873 | 970 | ||
874 | mlog_entry_void(); | 971 | mlog_entry_void(); |
875 | 972 | ||
876 | mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, | 973 | mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level, |
877 | dlm_flags); | 974 | dlm_flags); |
878 | 975 | ||
879 | spin_lock_irqsave(&lockres->l_lock, flags); | 976 | spin_lock_irqsave(&lockres->l_lock, flags); |
@@ -886,24 +983,23 @@ static int ocfs2_lock_create(struct ocfs2_super *osb, | |||
886 | lockres->l_action = OCFS2_AST_ATTACH; | 983 | lockres->l_action = OCFS2_AST_ATTACH; |
887 | lockres->l_requested = level; | 984 | lockres->l_requested = level; |
888 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | 985 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); |
986 | gen = lockres_set_pending(lockres); | ||
889 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 987 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
890 | 988 | ||
891 | status = dlmlock(osb->dlm, | 989 | ret = ocfs2_dlm_lock(osb->cconn, |
892 | level, | 990 | level, |
893 | &lockres->l_lksb, | 991 | &lockres->l_lksb, |
894 | dlm_flags, | 992 | dlm_flags, |
895 | lockres->l_name, | 993 | lockres->l_name, |
896 | OCFS2_LOCK_ID_MAX_LEN - 1, | 994 | OCFS2_LOCK_ID_MAX_LEN - 1, |
897 | ocfs2_locking_ast, | 995 | lockres); |
898 | lockres, | 996 | lockres_clear_pending(lockres, gen, osb); |
899 | ocfs2_blocking_ast); | 997 | if (ret) { |
900 | if (status != DLM_NORMAL) { | 998 | ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); |
901 | ocfs2_log_dlm_error("dlmlock", status, lockres); | ||
902 | ret = -EINVAL; | ||
903 | ocfs2_recover_from_dlm_error(lockres, 1); | 999 | ocfs2_recover_from_dlm_error(lockres, 1); |
904 | } | 1000 | } |
905 | 1001 | ||
906 | mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name); | 1002 | mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name); |
907 | 1003 | ||
908 | bail: | 1004 | bail: |
909 | mlog_exit(ret); | 1005 | mlog_exit(ret); |
@@ -1016,21 +1112,22 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw, | |||
1016 | static int ocfs2_cluster_lock(struct ocfs2_super *osb, | 1112 | static int ocfs2_cluster_lock(struct ocfs2_super *osb, |
1017 | struct ocfs2_lock_res *lockres, | 1113 | struct ocfs2_lock_res *lockres, |
1018 | int level, | 1114 | int level, |
1019 | int lkm_flags, | 1115 | u32 lkm_flags, |
1020 | int arg_flags) | 1116 | int arg_flags) |
1021 | { | 1117 | { |
1022 | struct ocfs2_mask_waiter mw; | 1118 | struct ocfs2_mask_waiter mw; |
1023 | enum dlm_status status; | ||
1024 | int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); | 1119 | int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); |
1025 | int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ | 1120 | int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ |
1026 | unsigned long flags; | 1121 | unsigned long flags; |
1122 | unsigned int gen; | ||
1123 | int noqueue_attempted = 0; | ||
1027 | 1124 | ||
1028 | mlog_entry_void(); | 1125 | mlog_entry_void(); |
1029 | 1126 | ||
1030 | ocfs2_init_mask_waiter(&mw); | 1127 | ocfs2_init_mask_waiter(&mw); |
1031 | 1128 | ||
1032 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) | 1129 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) |
1033 | lkm_flags |= LKM_VALBLK; | 1130 | lkm_flags |= DLM_LKF_VALBLK; |
1034 | 1131 | ||
1035 | again: | 1132 | again: |
1036 | wait = 0; | 1133 | wait = 0; |
@@ -1068,52 +1165,56 @@ again: | |||
1068 | } | 1165 | } |
1069 | 1166 | ||
1070 | if (level > lockres->l_level) { | 1167 | if (level > lockres->l_level) { |
1168 | if (noqueue_attempted > 0) { | ||
1169 | ret = -EAGAIN; | ||
1170 | goto unlock; | ||
1171 | } | ||
1172 | if (lkm_flags & DLM_LKF_NOQUEUE) | ||
1173 | noqueue_attempted = 1; | ||
1174 | |||
1071 | if (lockres->l_action != OCFS2_AST_INVALID) | 1175 | if (lockres->l_action != OCFS2_AST_INVALID) |
1072 | mlog(ML_ERROR, "lockres %s has action %u pending\n", | 1176 | mlog(ML_ERROR, "lockres %s has action %u pending\n", |
1073 | lockres->l_name, lockres->l_action); | 1177 | lockres->l_name, lockres->l_action); |
1074 | 1178 | ||
1075 | if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { | 1179 | if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { |
1076 | lockres->l_action = OCFS2_AST_ATTACH; | 1180 | lockres->l_action = OCFS2_AST_ATTACH; |
1077 | lkm_flags &= ~LKM_CONVERT; | 1181 | lkm_flags &= ~DLM_LKF_CONVERT; |
1078 | } else { | 1182 | } else { |
1079 | lockres->l_action = OCFS2_AST_CONVERT; | 1183 | lockres->l_action = OCFS2_AST_CONVERT; |
1080 | lkm_flags |= LKM_CONVERT; | 1184 | lkm_flags |= DLM_LKF_CONVERT; |
1081 | } | 1185 | } |
1082 | 1186 | ||
1083 | lockres->l_requested = level; | 1187 | lockres->l_requested = level; |
1084 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | 1188 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); |
1189 | gen = lockres_set_pending(lockres); | ||
1085 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 1190 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
1086 | 1191 | ||
1087 | BUG_ON(level == LKM_IVMODE); | 1192 | BUG_ON(level == DLM_LOCK_IV); |
1088 | BUG_ON(level == LKM_NLMODE); | 1193 | BUG_ON(level == DLM_LOCK_NL); |
1089 | 1194 | ||
1090 | mlog(0, "lock %s, convert from %d to level = %d\n", | 1195 | mlog(0, "lock %s, convert from %d to level = %d\n", |
1091 | lockres->l_name, lockres->l_level, level); | 1196 | lockres->l_name, lockres->l_level, level); |
1092 | 1197 | ||
1093 | /* call dlm_lock to upgrade lock now */ | 1198 | /* call dlm_lock to upgrade lock now */ |
1094 | status = dlmlock(osb->dlm, | 1199 | ret = ocfs2_dlm_lock(osb->cconn, |
1095 | level, | 1200 | level, |
1096 | &lockres->l_lksb, | 1201 | &lockres->l_lksb, |
1097 | lkm_flags, | 1202 | lkm_flags, |
1098 | lockres->l_name, | 1203 | lockres->l_name, |
1099 | OCFS2_LOCK_ID_MAX_LEN - 1, | 1204 | OCFS2_LOCK_ID_MAX_LEN - 1, |
1100 | ocfs2_locking_ast, | 1205 | lockres); |
1101 | lockres, | 1206 | lockres_clear_pending(lockres, gen, osb); |
1102 | ocfs2_blocking_ast); | 1207 | if (ret) { |
1103 | if (status != DLM_NORMAL) { | 1208 | if (!(lkm_flags & DLM_LKF_NOQUEUE) || |
1104 | if ((lkm_flags & LKM_NOQUEUE) && | 1209 | (ret != -EAGAIN)) { |
1105 | (status == DLM_NOTQUEUED)) | 1210 | ocfs2_log_dlm_error("ocfs2_dlm_lock", |
1106 | ret = -EAGAIN; | 1211 | ret, lockres); |
1107 | else { | ||
1108 | ocfs2_log_dlm_error("dlmlock", status, | ||
1109 | lockres); | ||
1110 | ret = -EINVAL; | ||
1111 | } | 1212 | } |
1112 | ocfs2_recover_from_dlm_error(lockres, 1); | 1213 | ocfs2_recover_from_dlm_error(lockres, 1); |
1113 | goto out; | 1214 | goto out; |
1114 | } | 1215 | } |
1115 | 1216 | ||
1116 | mlog(0, "lock %s, successfull return from dlmlock\n", | 1217 | mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", |
1117 | lockres->l_name); | 1218 | lockres->l_name); |
1118 | 1219 | ||
1119 | /* At this point we've gone inside the dlm and need to | 1220 | /* At this point we've gone inside the dlm and need to |
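One plausible reading of the noqueue_attempted guard added above: a DLM_LKF_NOQUEUE caller gets exactly one trip to the DLM per ocfs2_cluster_lock() call, so if the retry loop at again: would otherwise send a second request, the function bails out with -EAGAIN instead. A simplified standalone model of that rule, with invented names; it is not the real control flow.

	#include <errno.h>
	#include <stdio.h>

	#define LKF_NOQUEUE	0x1	/* stands in for DLM_LKF_NOQUEUE */

	/* attempts_needed models how many passes through "again:" a blocking
	 * caller would take before the lock is finally granted. */
	static int cluster_lock_model(unsigned int flags, int attempts_needed)
	{
		int noqueue_attempted = 0;
		int attempt = 0;

		for (;;) {
			attempt++;
			if (attempt >= attempts_needed)
				return 0;		/* granted */
			if (noqueue_attempted)
				return -EAGAIN;		/* a trylock gets one shot */
			if (flags & LKF_NOQUEUE)
				noqueue_attempted = 1;
			/* a blocking caller just goes around again */
		}
	}

	int main(void)
	{
		printf("blocking caller: %d\n", cluster_lock_model(0, 3));
		printf("noqueue caller:  %d\n", cluster_lock_model(LKF_NOQUEUE, 3));
		return 0;
	}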
@@ -1177,9 +1278,9 @@ static int ocfs2_create_new_lock(struct ocfs2_super *osb, | |||
1177 | int ex, | 1278 | int ex, |
1178 | int local) | 1279 | int local) |
1179 | { | 1280 | { |
1180 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 1281 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
1181 | unsigned long flags; | 1282 | unsigned long flags; |
1182 | int lkm_flags = local ? LKM_LOCAL : 0; | 1283 | u32 lkm_flags = local ? DLM_LKF_LOCAL : 0; |
1183 | 1284 | ||
1184 | spin_lock_irqsave(&lockres->l_lock, flags); | 1285 | spin_lock_irqsave(&lockres->l_lock, flags); |
1185 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); | 1286 | BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); |
@@ -1222,7 +1323,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode) | |||
1222 | } | 1323 | } |
1223 | 1324 | ||
1224 | /* | 1325 | /* |
1225 | * We don't want to use LKM_LOCAL on a meta data lock as they | 1326 | * We don't want to use DLM_LKF_LOCAL on a meta data lock as they |
1226 | * don't use a generation in their lock names. | 1327 | * don't use a generation in their lock names. |
1227 | */ | 1328 | */ |
1228 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0); | 1329 | ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0); |
@@ -1261,7 +1362,7 @@ int ocfs2_rw_lock(struct inode *inode, int write) | |||
1261 | 1362 | ||
1262 | lockres = &OCFS2_I(inode)->ip_rw_lockres; | 1363 | lockres = &OCFS2_I(inode)->ip_rw_lockres; |
1263 | 1364 | ||
1264 | level = write ? LKM_EXMODE : LKM_PRMODE; | 1365 | level = write ? DLM_LOCK_EX : DLM_LOCK_PR; |
1265 | 1366 | ||
1266 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, | 1367 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, |
1267 | 0); | 1368 | 0); |
@@ -1274,7 +1375,7 @@ int ocfs2_rw_lock(struct inode *inode, int write) | |||
1274 | 1375 | ||
1275 | void ocfs2_rw_unlock(struct inode *inode, int write) | 1376 | void ocfs2_rw_unlock(struct inode *inode, int write) |
1276 | { | 1377 | { |
1277 | int level = write ? LKM_EXMODE : LKM_PRMODE; | 1378 | int level = write ? DLM_LOCK_EX : DLM_LOCK_PR; |
1278 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; | 1379 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; |
1279 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1380 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1280 | 1381 | ||
@@ -1312,7 +1413,7 @@ int ocfs2_open_lock(struct inode *inode) | |||
1312 | lockres = &OCFS2_I(inode)->ip_open_lockres; | 1413 | lockres = &OCFS2_I(inode)->ip_open_lockres; |
1313 | 1414 | ||
1314 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, | 1415 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, |
1315 | LKM_PRMODE, 0, 0); | 1416 | DLM_LOCK_PR, 0, 0); |
1316 | if (status < 0) | 1417 | if (status < 0) |
1317 | mlog_errno(status); | 1418 | mlog_errno(status); |
1318 | 1419 | ||
@@ -1340,16 +1441,16 @@ int ocfs2_try_open_lock(struct inode *inode, int write) | |||
1340 | 1441 | ||
1341 | lockres = &OCFS2_I(inode)->ip_open_lockres; | 1442 | lockres = &OCFS2_I(inode)->ip_open_lockres; |
1342 | 1443 | ||
1343 | level = write ? LKM_EXMODE : LKM_PRMODE; | 1444 | level = write ? DLM_LOCK_EX : DLM_LOCK_PR; |
1344 | 1445 | ||
1345 | /* | 1446 | /* |
1346 | * The file system may already holding a PRMODE/EXMODE open lock. | 1447 | * The file system may already holding a PRMODE/EXMODE open lock. |
1347 | * Since we pass LKM_NOQUEUE, the request won't block waiting on | 1448 | * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on |
1348 | * other nodes and the -EAGAIN will indicate to the caller that | 1449 | * other nodes and the -EAGAIN will indicate to the caller that |
1349 | * this inode is still in use. | 1450 | * this inode is still in use. |
1350 | */ | 1451 | */ |
1351 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, | 1452 | status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, |
1352 | level, LKM_NOQUEUE, 0); | 1453 | level, DLM_LKF_NOQUEUE, 0); |
1353 | 1454 | ||
1354 | out: | 1455 | out: |
1355 | mlog_exit(status); | 1456 | mlog_exit(status); |
@@ -1374,10 +1475,10 @@ void ocfs2_open_unlock(struct inode *inode) | |||
1374 | 1475 | ||
1375 | if(lockres->l_ro_holders) | 1476 | if(lockres->l_ro_holders) |
1376 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, | 1477 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, |
1377 | LKM_PRMODE); | 1478 | DLM_LOCK_PR); |
1378 | if(lockres->l_ex_holders) | 1479 | if(lockres->l_ex_holders) |
1379 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, | 1480 | ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, |
1380 | LKM_EXMODE); | 1481 | DLM_LOCK_EX); |
1381 | 1482 | ||
1382 | out: | 1483 | out: |
1383 | mlog_exit_void(); | 1484 | mlog_exit_void(); |
@@ -1464,7 +1565,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) | |||
1464 | ocfs2_init_mask_waiter(&mw); | 1565 | ocfs2_init_mask_waiter(&mw); |
1465 | 1566 | ||
1466 | if ((lockres->l_flags & OCFS2_LOCK_BUSY) || | 1567 | if ((lockres->l_flags & OCFS2_LOCK_BUSY) || |
1467 | (lockres->l_level > LKM_NLMODE)) { | 1568 | (lockres->l_level > DLM_LOCK_NL)) { |
1468 | mlog(ML_ERROR, | 1569 | mlog(ML_ERROR, |
1469 | "File lock \"%s\" has busy or locked state: flags: 0x%lx, " | 1570 | "File lock \"%s\" has busy or locked state: flags: 0x%lx, " |
1470 | "level: %u\n", lockres->l_name, lockres->l_flags, | 1571 | "level: %u\n", lockres->l_name, lockres->l_flags, |
@@ -1503,14 +1604,12 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) | |||
1503 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); | 1604 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); |
1504 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 1605 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
1505 | 1606 | ||
1506 | ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags, | 1607 | ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags, |
1507 | lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, | 1608 | lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, |
1508 | ocfs2_locking_ast, lockres, ocfs2_blocking_ast); | 1609 | lockres); |
1509 | if (ret != DLM_NORMAL) { | 1610 | if (ret) { |
1510 | if (trylock && ret == DLM_NOTQUEUED) | 1611 | if (!trylock || (ret != -EAGAIN)) { |
1511 | ret = -EAGAIN; | 1612 | ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); |
1512 | else { | ||
1513 | ocfs2_log_dlm_error("dlmlock", ret, lockres); | ||
1514 | ret = -EINVAL; | 1613 | ret = -EINVAL; |
1515 | } | 1614 | } |
1516 | 1615 | ||
@@ -1537,6 +1636,10 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock) | |||
1537 | * to just bubble sucess back up to the user. | 1636 | * to just bubble sucess back up to the user. |
1538 | */ | 1637 | */ |
1539 | ret = ocfs2_flock_handle_signal(lockres, level); | 1638 | ret = ocfs2_flock_handle_signal(lockres, level); |
1639 | } else if (!ret && (level > lockres->l_level)) { | ||
1640 | /* Trylock failed asynchronously */ | ||
1641 | BUG_ON(!trylock); | ||
1642 | ret = -EAGAIN; | ||
1540 | } | 1643 | } |
1541 | 1644 | ||
1542 | out: | 1645 | out: |
@@ -1549,6 +1652,7 @@ out: | |||
1549 | void ocfs2_file_unlock(struct file *file) | 1652 | void ocfs2_file_unlock(struct file *file) |
1550 | { | 1653 | { |
1551 | int ret; | 1654 | int ret; |
1655 | unsigned int gen; | ||
1552 | unsigned long flags; | 1656 | unsigned long flags; |
1553 | struct ocfs2_file_private *fp = file->private_data; | 1657 | struct ocfs2_file_private *fp = file->private_data; |
1554 | struct ocfs2_lock_res *lockres = &fp->fp_flock; | 1658 | struct ocfs2_lock_res *lockres = &fp->fp_flock; |
@@ -1572,13 +1676,13 @@ void ocfs2_file_unlock(struct file *file) | |||
1572 | * Fake a blocking ast for the downconvert code. | 1676 | * Fake a blocking ast for the downconvert code. |
1573 | */ | 1677 | */ |
1574 | lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); | 1678 | lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); |
1575 | lockres->l_blocking = LKM_EXMODE; | 1679 | lockres->l_blocking = DLM_LOCK_EX; |
1576 | 1680 | ||
1577 | ocfs2_prepare_downconvert(lockres, LKM_NLMODE); | 1681 | gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE); |
1578 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); | 1682 | lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); |
1579 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 1683 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
1580 | 1684 | ||
1581 | ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0); | 1685 | ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen); |
1582 | if (ret) { | 1686 | if (ret) { |
1583 | mlog_errno(ret); | 1687 | mlog_errno(ret); |
1584 | return; | 1688 | return; |
@@ -1601,11 +1705,11 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, | |||
1601 | * condition. */ | 1705 | * condition. */ |
1602 | if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { | 1706 | if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { |
1603 | switch(lockres->l_blocking) { | 1707 | switch(lockres->l_blocking) { |
1604 | case LKM_EXMODE: | 1708 | case DLM_LOCK_EX: |
1605 | if (!lockres->l_ex_holders && !lockres->l_ro_holders) | 1709 | if (!lockres->l_ex_holders && !lockres->l_ro_holders) |
1606 | kick = 1; | 1710 | kick = 1; |
1607 | break; | 1711 | break; |
1608 | case LKM_PRMODE: | 1712 | case DLM_LOCK_PR: |
1609 | if (!lockres->l_ex_holders) | 1713 | if (!lockres->l_ex_holders) |
1610 | kick = 1; | 1714 | kick = 1; |
1611 | break; | 1715 | break; |
@@ -1648,7 +1752,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) | |||
1648 | 1752 | ||
1649 | mlog_entry_void(); | 1753 | mlog_entry_void(); |
1650 | 1754 | ||
1651 | lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | 1755 | lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); |
1652 | 1756 | ||
1653 | /* | 1757 | /* |
1654 | * Invalidate the LVB of a deleted inode - this way other | 1758 | * Invalidate the LVB of a deleted inode - this way other |
@@ -1700,7 +1804,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) | |||
1700 | 1804 | ||
1701 | mlog_meta_lvb(0, lockres); | 1805 | mlog_meta_lvb(0, lockres); |
1702 | 1806 | ||
1703 | lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | 1807 | lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); |
1704 | 1808 | ||
1705 | /* We're safe here without the lockres lock... */ | 1809 | /* We're safe here without the lockres lock... */ |
1706 | spin_lock(&oi->ip_lock); | 1810 | spin_lock(&oi->ip_lock); |
@@ -1735,7 +1839,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) | |||
1735 | static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, | 1839 | static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, |
1736 | struct ocfs2_lock_res *lockres) | 1840 | struct ocfs2_lock_res *lockres) |
1737 | { | 1841 | { |
1738 | struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; | 1842 | struct ocfs2_meta_lvb *lvb = |
1843 | (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); | ||
1739 | 1844 | ||
1740 | if (lvb->lvb_version == OCFS2_LVB_VERSION | 1845 | if (lvb->lvb_version == OCFS2_LVB_VERSION |
1741 | && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) | 1846 | && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) |
@@ -1923,7 +2028,8 @@ int ocfs2_inode_lock_full(struct inode *inode, | |||
1923 | int ex, | 2028 | int ex, |
1924 | int arg_flags) | 2029 | int arg_flags) |
1925 | { | 2030 | { |
1926 | int status, level, dlm_flags, acquired; | 2031 | int status, level, acquired; |
2032 | u32 dlm_flags; | ||
1927 | struct ocfs2_lock_res *lockres = NULL; | 2033 | struct ocfs2_lock_res *lockres = NULL; |
1928 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2034 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1929 | struct buffer_head *local_bh = NULL; | 2035 | struct buffer_head *local_bh = NULL; |
@@ -1950,14 +2056,13 @@ int ocfs2_inode_lock_full(struct inode *inode, | |||
1950 | goto local; | 2056 | goto local; |
1951 | 2057 | ||
1952 | if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) | 2058 | if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) |
1953 | wait_event(osb->recovery_event, | 2059 | ocfs2_wait_for_recovery(osb); |
1954 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); | ||
1955 | 2060 | ||
1956 | lockres = &OCFS2_I(inode)->ip_inode_lockres; | 2061 | lockres = &OCFS2_I(inode)->ip_inode_lockres; |
1957 | level = ex ? LKM_EXMODE : LKM_PRMODE; | 2062 | level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
1958 | dlm_flags = 0; | 2063 | dlm_flags = 0; |
1959 | if (arg_flags & OCFS2_META_LOCK_NOQUEUE) | 2064 | if (arg_flags & OCFS2_META_LOCK_NOQUEUE) |
1960 | dlm_flags |= LKM_NOQUEUE; | 2065 | dlm_flags |= DLM_LKF_NOQUEUE; |
1961 | 2066 | ||
1962 | status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); | 2067 | status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); |
1963 | if (status < 0) { | 2068 | if (status < 0) { |
@@ -1974,8 +2079,7 @@ int ocfs2_inode_lock_full(struct inode *inode, | |||
1974 | * committed to owning this lock so we don't allow signals to | 2079 | * committed to owning this lock so we don't allow signals to |
1975 | * abort the operation. */ | 2080 | * abort the operation. */ |
1976 | if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) | 2081 | if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) |
1977 | wait_event(osb->recovery_event, | 2082 | ocfs2_wait_for_recovery(osb); |
1978 | ocfs2_node_map_is_empty(osb, &osb->recovery_map)); | ||
1979 | 2083 | ||
1980 | local: | 2084 | local: |
1981 | /* | 2085 | /* |
@@ -2109,7 +2213,7 @@ int ocfs2_inode_lock_atime(struct inode *inode, | |||
2109 | void ocfs2_inode_unlock(struct inode *inode, | 2213 | void ocfs2_inode_unlock(struct inode *inode, |
2110 | int ex) | 2214 | int ex) |
2111 | { | 2215 | { |
2112 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 2216 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
2113 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; | 2217 | struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; |
2114 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 2218 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
2115 | 2219 | ||
@@ -2130,10 +2234,8 @@ int ocfs2_super_lock(struct ocfs2_super *osb, | |||
2130 | int ex) | 2234 | int ex) |
2131 | { | 2235 | { |
2132 | int status = 0; | 2236 | int status = 0; |
2133 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 2237 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
2134 | struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; | 2238 | struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; |
2135 | struct buffer_head *bh; | ||
2136 | struct ocfs2_slot_info *si = osb->slot_info; | ||
2137 | 2239 | ||
2138 | mlog_entry_void(); | 2240 | mlog_entry_void(); |
2139 | 2241 | ||
@@ -2159,11 +2261,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb, | |||
2159 | goto bail; | 2261 | goto bail; |
2160 | } | 2262 | } |
2161 | if (status) { | 2263 | if (status) { |
2162 | bh = si->si_bh; | 2264 | status = ocfs2_refresh_slot_info(osb); |
2163 | status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0, | ||
2164 | si->si_inode); | ||
2165 | if (status == 0) | ||
2166 | ocfs2_update_slot_info(si); | ||
2167 | 2265 | ||
2168 | ocfs2_complete_lock_res_refresh(lockres, status); | 2266 | ocfs2_complete_lock_res_refresh(lockres, status); |
2169 | 2267 | ||
@@ -2178,7 +2276,7 @@ bail: | |||
2178 | void ocfs2_super_unlock(struct ocfs2_super *osb, | 2276 | void ocfs2_super_unlock(struct ocfs2_super *osb, |
2179 | int ex) | 2277 | int ex) |
2180 | { | 2278 | { |
2181 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 2279 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
2182 | struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; | 2280 | struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; |
2183 | 2281 | ||
2184 | if (!ocfs2_mount_local(osb)) | 2282 | if (!ocfs2_mount_local(osb)) |
@@ -2196,7 +2294,7 @@ int ocfs2_rename_lock(struct ocfs2_super *osb) | |||
2196 | if (ocfs2_mount_local(osb)) | 2294 | if (ocfs2_mount_local(osb)) |
2197 | return 0; | 2295 | return 0; |
2198 | 2296 | ||
2199 | status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); | 2297 | status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0); |
2200 | if (status < 0) | 2298 | if (status < 0) |
2201 | mlog_errno(status); | 2299 | mlog_errno(status); |
2202 | 2300 | ||
@@ -2208,13 +2306,13 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb) | |||
2208 | struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; | 2306 | struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; |
2209 | 2307 | ||
2210 | if (!ocfs2_mount_local(osb)) | 2308 | if (!ocfs2_mount_local(osb)) |
2211 | ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); | 2309 | ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); |
2212 | } | 2310 | } |
2213 | 2311 | ||
2214 | int ocfs2_dentry_lock(struct dentry *dentry, int ex) | 2312 | int ocfs2_dentry_lock(struct dentry *dentry, int ex) |
2215 | { | 2313 | { |
2216 | int ret; | 2314 | int ret; |
2217 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 2315 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
2218 | struct ocfs2_dentry_lock *dl = dentry->d_fsdata; | 2316 | struct ocfs2_dentry_lock *dl = dentry->d_fsdata; |
2219 | struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); | 2317 | struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); |
2220 | 2318 | ||
@@ -2235,7 +2333,7 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex) | |||
2235 | 2333 | ||
2236 | void ocfs2_dentry_unlock(struct dentry *dentry, int ex) | 2334 | void ocfs2_dentry_unlock(struct dentry *dentry, int ex) |
2237 | { | 2335 | { |
2238 | int level = ex ? LKM_EXMODE : LKM_PRMODE; | 2336 | int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; |
2239 | struct ocfs2_dentry_lock *dl = dentry->d_fsdata; | 2337 | struct ocfs2_dentry_lock *dl = dentry->d_fsdata; |
2240 | struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); | 2338 | struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); |
2241 | 2339 | ||
@@ -2400,7 +2498,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v) | |||
2400 | lockres->l_blocking); | 2498 | lockres->l_blocking); |
2401 | 2499 | ||
2402 | /* Dump the raw LVB */ | 2500 | /* Dump the raw LVB */ |
2403 | lvb = lockres->l_lksb.lvb; | 2501 | lvb = ocfs2_dlm_lvb(&lockres->l_lksb); |
2404 | for(i = 0; i < DLM_LVB_LEN; i++) | 2502 | for(i = 0; i < DLM_LVB_LEN; i++) |
2405 | seq_printf(m, "0x%x\t", lvb[i]); | 2503 | seq_printf(m, "0x%x\t", lvb[i]); |
2406 | 2504 | ||
@@ -2504,13 +2602,14 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) | |||
2504 | int ocfs2_dlm_init(struct ocfs2_super *osb) | 2602 | int ocfs2_dlm_init(struct ocfs2_super *osb) |
2505 | { | 2603 | { |
2506 | int status = 0; | 2604 | int status = 0; |
2507 | u32 dlm_key; | 2605 | struct ocfs2_cluster_connection *conn = NULL; |
2508 | struct dlm_ctxt *dlm = NULL; | ||
2509 | 2606 | ||
2510 | mlog_entry_void(); | 2607 | mlog_entry_void(); |
2511 | 2608 | ||
2512 | if (ocfs2_mount_local(osb)) | 2609 | if (ocfs2_mount_local(osb)) { |
2610 | osb->node_num = 0; | ||
2513 | goto local; | 2611 | goto local; |
2612 | } | ||
2514 | 2613 | ||
2515 | status = ocfs2_dlm_init_debug(osb); | 2614 | status = ocfs2_dlm_init_debug(osb); |
2516 | if (status < 0) { | 2615 | if (status < 0) { |
@@ -2527,26 +2626,31 @@ int ocfs2_dlm_init(struct ocfs2_super *osb) | |||
2527 | goto bail; | 2626 | goto bail; |
2528 | } | 2627 | } |
2529 | 2628 | ||
2530 | /* used by the dlm code to make message headers unique, each | ||
2531 | * node in this domain must agree on this. */ | ||
2532 | dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str)); | ||
2533 | |||
2534 | /* for now, uuid == domain */ | 2629 | /* for now, uuid == domain */ |
2535 | dlm = dlm_register_domain(osb->uuid_str, dlm_key, | 2630 | status = ocfs2_cluster_connect(osb->osb_cluster_stack, |
2536 | &osb->osb_locking_proto); | 2631 | osb->uuid_str, |
2537 | if (IS_ERR(dlm)) { | 2632 | strlen(osb->uuid_str), |
2538 | status = PTR_ERR(dlm); | 2633 | ocfs2_do_node_down, osb, |
2634 | &conn); | ||
2635 | if (status) { | ||
2539 | mlog_errno(status); | 2636 | mlog_errno(status); |
2540 | goto bail; | 2637 | goto bail; |
2541 | } | 2638 | } |
2542 | 2639 | ||
2543 | dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); | 2640 | status = ocfs2_cluster_this_node(&osb->node_num); |
2641 | if (status < 0) { | ||
2642 | mlog_errno(status); | ||
2643 | mlog(ML_ERROR, | ||
2644 | "could not find this host's node number\n"); | ||
2645 | ocfs2_cluster_disconnect(conn, 0); | ||
2646 | goto bail; | ||
2647 | } | ||
2544 | 2648 | ||
2545 | local: | 2649 | local: |
2546 | ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); | 2650 | ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); |
2547 | ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); | 2651 | ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); |
2548 | 2652 | ||
2549 | osb->dlm = dlm; | 2653 | osb->cconn = conn; |
2550 | 2654 | ||
2551 | status = 0; | 2655 | status = 0; |
2552 | bail: | 2656 | bail: |
@@ -2560,14 +2664,19 @@ bail: | |||
2560 | return status; | 2664 | return status; |
2561 | } | 2665 | } |
2562 | 2666 | ||
2563 | void ocfs2_dlm_shutdown(struct ocfs2_super *osb) | 2667 | void ocfs2_dlm_shutdown(struct ocfs2_super *osb, |
2668 | int hangup_pending) | ||
2564 | { | 2669 | { |
2565 | mlog_entry_void(); | 2670 | mlog_entry_void(); |
2566 | 2671 | ||
2567 | dlm_unregister_eviction_cb(&osb->osb_eviction_cb); | ||
2568 | |||
2569 | ocfs2_drop_osb_locks(osb); | 2672 | ocfs2_drop_osb_locks(osb); |
2570 | 2673 | ||
2674 | /* | ||
2675 | * Now that we have dropped all locks and ocfs2_dismount_volume() | ||
2676 | * has disabled recovery, the DLM won't be talking to us. It's | ||
2677 | * safe to tear things down before disconnecting the cluster. | ||
2678 | */ | ||
2679 | |||
2571 | if (osb->dc_task) { | 2680 | if (osb->dc_task) { |
2572 | kthread_stop(osb->dc_task); | 2681 | kthread_stop(osb->dc_task); |
2573 | osb->dc_task = NULL; | 2682 | osb->dc_task = NULL; |
@@ -2576,15 +2685,15 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb) | |||
2576 | ocfs2_lock_res_free(&osb->osb_super_lockres); | 2685 | ocfs2_lock_res_free(&osb->osb_super_lockres); |
2577 | ocfs2_lock_res_free(&osb->osb_rename_lockres); | 2686 | ocfs2_lock_res_free(&osb->osb_rename_lockres); |
2578 | 2687 | ||
2579 | dlm_unregister_domain(osb->dlm); | 2688 | ocfs2_cluster_disconnect(osb->cconn, hangup_pending); |
2580 | osb->dlm = NULL; | 2689 | osb->cconn = NULL; |
2581 | 2690 | ||
2582 | ocfs2_dlm_shutdown_debug(osb); | 2691 | ocfs2_dlm_shutdown_debug(osb); |
2583 | 2692 | ||
2584 | mlog_exit_void(); | 2693 | mlog_exit_void(); |
2585 | } | 2694 | } |
2586 | 2695 | ||
2587 | static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) | 2696 | static void ocfs2_unlock_ast(void *opaque, int error) |
2588 | { | 2697 | { |
2589 | struct ocfs2_lock_res *lockres = opaque; | 2698 | struct ocfs2_lock_res *lockres = opaque; |
2590 | unsigned long flags; | 2699 | unsigned long flags; |
@@ -2595,24 +2704,9 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) | |||
2595 | lockres->l_unlock_action); | 2704 | lockres->l_unlock_action); |
2596 | 2705 | ||
2597 | spin_lock_irqsave(&lockres->l_lock, flags); | 2706 | spin_lock_irqsave(&lockres->l_lock, flags); |
2598 | /* We tried to cancel a convert request, but it was already | 2707 | if (error) { |
2599 | * granted. All we want to do here is clear our unlock | 2708 | mlog(ML_ERROR, "Dlm passes error %d for lock %s, " |
2600 | * state. The wake_up call done at the bottom is redundant | 2709 | "unlock_action %d\n", error, lockres->l_name, |
2601 | * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't | ||
2602 | * hurt anything anyway */ | ||
2603 | if (status == DLM_CANCELGRANT && | ||
2604 | lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) { | ||
2605 | mlog(0, "Got cancelgrant for %s\n", lockres->l_name); | ||
2606 | |||
2607 | /* We don't clear the busy flag in this case as it | ||
2608 | * should have been cleared by the ast which the dlm | ||
2609 | * has called. */ | ||
2610 | goto complete_unlock; | ||
2611 | } | ||
2612 | |||
2613 | if (status != DLM_NORMAL) { | ||
2614 | mlog(ML_ERROR, "Dlm passes status %d for lock %s, " | ||
2615 | "unlock_action %d\n", status, lockres->l_name, | ||
2616 | lockres->l_unlock_action); | 2710 | lockres->l_unlock_action); |
2617 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 2711 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
2618 | return; | 2712 | return; |
@@ -2624,14 +2718,13 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) | |||
2624 | lockres->l_action = OCFS2_AST_INVALID; | 2718 | lockres->l_action = OCFS2_AST_INVALID; |
2625 | break; | 2719 | break; |
2626 | case OCFS2_UNLOCK_DROP_LOCK: | 2720 | case OCFS2_UNLOCK_DROP_LOCK: |
2627 | lockres->l_level = LKM_IVMODE; | 2721 | lockres->l_level = DLM_LOCK_IV; |
2628 | break; | 2722 | break; |
2629 | default: | 2723 | default: |
2630 | BUG(); | 2724 | BUG(); |
2631 | } | 2725 | } |
2632 | 2726 | ||
2633 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); | 2727 | lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); |
2634 | complete_unlock: | ||
2635 | lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; | 2728 | lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; |
2636 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 2729 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
2637 | 2730 | ||
@@ -2643,16 +2736,16 @@ complete_unlock: | |||
2643 | static int ocfs2_drop_lock(struct ocfs2_super *osb, | 2736 | static int ocfs2_drop_lock(struct ocfs2_super *osb, |
2644 | struct ocfs2_lock_res *lockres) | 2737 | struct ocfs2_lock_res *lockres) |
2645 | { | 2738 | { |
2646 | enum dlm_status status; | 2739 | int ret; |
2647 | unsigned long flags; | 2740 | unsigned long flags; |
2648 | int lkm_flags = 0; | 2741 | u32 lkm_flags = 0; |
2649 | 2742 | ||
2650 | /* We didn't get anywhere near actually using this lockres. */ | 2743 | /* We didn't get anywhere near actually using this lockres. */ |
2651 | if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) | 2744 | if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) |
2652 | goto out; | 2745 | goto out; |
2653 | 2746 | ||
2654 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) | 2747 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) |
2655 | lkm_flags |= LKM_VALBLK; | 2748 | lkm_flags |= DLM_LKF_VALBLK; |
2656 | 2749 | ||
2657 | spin_lock_irqsave(&lockres->l_lock, flags); | 2750 | spin_lock_irqsave(&lockres->l_lock, flags); |
2658 | 2751 | ||
@@ -2678,7 +2771,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, | |||
2678 | 2771 | ||
2679 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { | 2772 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { |
2680 | if (lockres->l_flags & OCFS2_LOCK_ATTACHED && | 2773 | if (lockres->l_flags & OCFS2_LOCK_ATTACHED && |
2681 | lockres->l_level == LKM_EXMODE && | 2774 | lockres->l_level == DLM_LOCK_EX && |
2682 | !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) | 2775 | !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) |
2683 | lockres->l_ops->set_lvb(lockres); | 2776 | lockres->l_ops->set_lvb(lockres); |
2684 | } | 2777 | } |
@@ -2707,15 +2800,15 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, | |||
2707 | 2800 | ||
2708 | mlog(0, "lock %s\n", lockres->l_name); | 2801 | mlog(0, "lock %s\n", lockres->l_name); |
2709 | 2802 | ||
2710 | status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags, | 2803 | ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags, |
2711 | ocfs2_unlock_ast, lockres); | 2804 | lockres); |
2712 | if (status != DLM_NORMAL) { | 2805 | if (ret) { |
2713 | ocfs2_log_dlm_error("dlmunlock", status, lockres); | 2806 | ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); |
2714 | mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); | 2807 | mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); |
2715 | dlm_print_one_lock(lockres->l_lksb.lockid); | 2808 | ocfs2_dlm_dump_lksb(&lockres->l_lksb); |
2716 | BUG(); | 2809 | BUG(); |
2717 | } | 2810 | } |
2718 | mlog(0, "lock %s, successfull return from dlmunlock\n", | 2811 | mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n", |
2719 | lockres->l_name); | 2812 | lockres->l_name); |
2720 | 2813 | ||
2721 | ocfs2_wait_on_busy_lock(lockres); | 2814 | ocfs2_wait_on_busy_lock(lockres); |
@@ -2806,15 +2899,15 @@ int ocfs2_drop_inode_locks(struct inode *inode) | |||
2806 | return status; | 2899 | return status; |
2807 | } | 2900 | } |
2808 | 2901 | ||
2809 | static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, | 2902 | static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, |
2810 | int new_level) | 2903 | int new_level) |
2811 | { | 2904 | { |
2812 | assert_spin_locked(&lockres->l_lock); | 2905 | assert_spin_locked(&lockres->l_lock); |
2813 | 2906 | ||
2814 | BUG_ON(lockres->l_blocking <= LKM_NLMODE); | 2907 | BUG_ON(lockres->l_blocking <= DLM_LOCK_NL); |
2815 | 2908 | ||
2816 | if (lockres->l_level <= new_level) { | 2909 | if (lockres->l_level <= new_level) { |
2817 | mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n", | 2910 | mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n", |
2818 | lockres->l_level, new_level); | 2911 | lockres->l_level, new_level); |
2819 | BUG(); | 2912 | BUG(); |
2820 | } | 2913 | } |
@@ -2825,33 +2918,33 @@ static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, | |||
2825 | lockres->l_action = OCFS2_AST_DOWNCONVERT; | 2918 | lockres->l_action = OCFS2_AST_DOWNCONVERT; |
2826 | lockres->l_requested = new_level; | 2919 | lockres->l_requested = new_level; |
2827 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); | 2920 | lockres_or_flags(lockres, OCFS2_LOCK_BUSY); |
2921 | return lockres_set_pending(lockres); | ||
2828 | } | 2922 | } |
2829 | 2923 | ||
2830 | static int ocfs2_downconvert_lock(struct ocfs2_super *osb, | 2924 | static int ocfs2_downconvert_lock(struct ocfs2_super *osb, |
2831 | struct ocfs2_lock_res *lockres, | 2925 | struct ocfs2_lock_res *lockres, |
2832 | int new_level, | 2926 | int new_level, |
2833 | int lvb) | 2927 | int lvb, |
2928 | unsigned int generation) | ||
2834 | { | 2929 | { |
2835 | int ret, dlm_flags = LKM_CONVERT; | 2930 | int ret; |
2836 | enum dlm_status status; | 2931 | u32 dlm_flags = DLM_LKF_CONVERT; |
2837 | 2932 | ||
2838 | mlog_entry_void(); | 2933 | mlog_entry_void(); |
2839 | 2934 | ||
2840 | if (lvb) | 2935 | if (lvb) |
2841 | dlm_flags |= LKM_VALBLK; | 2936 | dlm_flags |= DLM_LKF_VALBLK; |
2842 | 2937 | ||
2843 | status = dlmlock(osb->dlm, | 2938 | ret = ocfs2_dlm_lock(osb->cconn, |
2844 | new_level, | 2939 | new_level, |
2845 | &lockres->l_lksb, | 2940 | &lockres->l_lksb, |
2846 | dlm_flags, | 2941 | dlm_flags, |
2847 | lockres->l_name, | 2942 | lockres->l_name, |
2848 | OCFS2_LOCK_ID_MAX_LEN - 1, | 2943 | OCFS2_LOCK_ID_MAX_LEN - 1, |
2849 | ocfs2_locking_ast, | 2944 | lockres); |
2850 | lockres, | 2945 | lockres_clear_pending(lockres, generation, osb); |
2851 | ocfs2_blocking_ast); | 2946 | if (ret) { |
2852 | if (status != DLM_NORMAL) { | 2947 | ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres); |
2853 | ocfs2_log_dlm_error("dlmlock", status, lockres); | ||
2854 | ret = -EINVAL; | ||
2855 | ocfs2_recover_from_dlm_error(lockres, 1); | 2948 | ocfs2_recover_from_dlm_error(lockres, 1); |
2856 | goto bail; | 2949 | goto bail; |
2857 | } | 2950 | } |
@@ -2862,7 +2955,7 @@ bail: | |||
2862 | return ret; | 2955 | return ret; |
2863 | } | 2956 | } |
2864 | 2957 | ||
2865 | /* returns 1 when the caller should unlock and call dlmunlock */ | 2958 | /* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */ |
2866 | static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, | 2959 | static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, |
2867 | struct ocfs2_lock_res *lockres) | 2960 | struct ocfs2_lock_res *lockres) |
2868 | { | 2961 | { |
@@ -2898,24 +2991,18 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb, | |||
2898 | struct ocfs2_lock_res *lockres) | 2991 | struct ocfs2_lock_res *lockres) |
2899 | { | 2992 | { |
2900 | int ret; | 2993 | int ret; |
2901 | enum dlm_status status; | ||
2902 | 2994 | ||
2903 | mlog_entry_void(); | 2995 | mlog_entry_void(); |
2904 | mlog(0, "lock %s\n", lockres->l_name); | 2996 | mlog(0, "lock %s\n", lockres->l_name); |
2905 | 2997 | ||
2906 | ret = 0; | 2998 | ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, |
2907 | status = dlmunlock(osb->dlm, | 2999 | DLM_LKF_CANCEL, lockres); |
2908 | &lockres->l_lksb, | 3000 | if (ret) { |
2909 | LKM_CANCEL, | 3001 | ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres); |
2910 | ocfs2_unlock_ast, | ||
2911 | lockres); | ||
2912 | if (status != DLM_NORMAL) { | ||
2913 | ocfs2_log_dlm_error("dlmunlock", status, lockres); | ||
2914 | ret = -EINVAL; | ||
2915 | ocfs2_recover_from_dlm_error(lockres, 0); | 3002 | ocfs2_recover_from_dlm_error(lockres, 0); |
2916 | } | 3003 | } |
2917 | 3004 | ||
2918 | mlog(0, "lock %s return from dlmunlock\n", lockres->l_name); | 3005 | mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name); |
2919 | 3006 | ||
2920 | mlog_exit(ret); | 3007 | mlog_exit(ret); |
2921 | return ret; | 3008 | return ret; |
@@ -2930,6 +3017,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb, | |||
2930 | int new_level; | 3017 | int new_level; |
2931 | int ret = 0; | 3018 | int ret = 0; |
2932 | int set_lvb = 0; | 3019 | int set_lvb = 0; |
3020 | unsigned int gen; | ||
2933 | 3021 | ||
2934 | mlog_entry_void(); | 3022 | mlog_entry_void(); |
2935 | 3023 | ||
@@ -2939,6 +3027,32 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb, | |||
2939 | 3027 | ||
2940 | recheck: | 3028 | recheck: |
2941 | if (lockres->l_flags & OCFS2_LOCK_BUSY) { | 3029 | if (lockres->l_flags & OCFS2_LOCK_BUSY) { |
3030 | /* XXX | ||
3031 | * This is a *big* race. The OCFS2_LOCK_PENDING flag | ||
3032 | * exists entirely for one reason - another thread has set | ||
3033 | * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock(). | ||
3034 | * | ||
3035 | * If we do ocfs2_cancel_convert() before the other thread | ||
3036 | * calls dlm_lock(), our cancel will do nothing. We will | ||
3037 | * get no ast, and we will have no way of knowing the | ||
3038 | * cancel failed. Meanwhile, the other thread will call | ||
3039 | * into dlm_lock() and wait...forever. | ||
3040 | * | ||
3041 | * Why forever? Because another node has asked for the | ||
3042 | * lock first; that's why we're here in unblock_lock(). | ||
3043 | * | ||
3044 | * The solution is OCFS2_LOCK_PENDING. When PENDING is | ||
3045 | * set, we just requeue the unblock. Only when the other | ||
3046 | * thread has called dlm_lock() and cleared PENDING will | ||
3047 | * we then cancel their request. | ||
3048 | * | ||
3049 | * All callers of dlm_lock() must set OCFS2_LOCK_PENDING | ||
3050 | * at the same time they set OCFS2_LOCK_BUSY. They must | ||
3051 | * clear OCFS2_LOCK_PENDING after dlm_lock() returns. | ||
3052 | */ | ||
3053 | if (lockres->l_flags & OCFS2_LOCK_PENDING) | ||
3054 | goto leave_requeue; | ||
3055 | |||
2942 | ctl->requeue = 1; | 3056 | ctl->requeue = 1; |
2943 | ret = ocfs2_prepare_cancel_convert(osb, lockres); | 3057 | ret = ocfs2_prepare_cancel_convert(osb, lockres); |
2944 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 3058 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
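Boiled down, the rule in the comment above is: while PENDING is still set the competing request has not reached dlm_lock(), so a cancel sent now would simply be lost and the unblock must requeue itself instead. A tiny illustrative model of that decision, with invented names:

	#include <stdio.h>

	enum unblock_action { UNBLOCK_REQUEUE, UNBLOCK_CANCEL_CONVERT };

	/* A request that is still PENDING has not been handed to dlm_lock()
	 * yet, so cancelling it would do nothing; requeue and try later. */
	static enum unblock_action unblock_busy_decision(int lock_pending)
	{
		return lock_pending ? UNBLOCK_REQUEUE : UNBLOCK_CANCEL_CONVERT;
	}

	int main(void)
	{
		printf("pending -> %s\n",
		       unblock_busy_decision(1) == UNBLOCK_REQUEUE ? "requeue" : "cancel");
		printf("issued  -> %s\n",
		       unblock_busy_decision(0) == UNBLOCK_REQUEUE ? "requeue" : "cancel");
		return 0;
	}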
@@ -2952,13 +3066,13 @@ recheck: | |||
2952 | 3066 | ||
2953 | /* if we're blocking an exclusive and we have *any* holders, | 3067 | /* if we're blocking an exclusive and we have *any* holders, |
2954 | * then requeue. */ | 3068 | * then requeue. */ |
2955 | if ((lockres->l_blocking == LKM_EXMODE) | 3069 | if ((lockres->l_blocking == DLM_LOCK_EX) |
2956 | && (lockres->l_ex_holders || lockres->l_ro_holders)) | 3070 | && (lockres->l_ex_holders || lockres->l_ro_holders)) |
2957 | goto leave_requeue; | 3071 | goto leave_requeue; |
2958 | 3072 | ||
2959 | /* If it's a PR we're blocking, then only | 3073 | /* If it's a PR we're blocking, then only |
2960 | * requeue if we've got any EX holders */ | 3074 | * requeue if we've got any EX holders */ |
2961 | if (lockres->l_blocking == LKM_PRMODE && | 3075 | if (lockres->l_blocking == DLM_LOCK_PR && |
2962 | lockres->l_ex_holders) | 3076 | lockres->l_ex_holders) |
2963 | goto leave_requeue; | 3077 | goto leave_requeue; |
2964 | 3078 | ||
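The two requeue checks above encode the usual DLM mode compatibility: an EX request conflicts with any local holder, while a PR request only conflicts with EX holders. A throwaway sketch of that rule, assuming the NL < PR < EX mode ordering used elsewhere in this file:

	#include <stdio.h>

	/* 0 = NL, 3 = PR, 5 = EX -- illustrative numbering only. */
	static int must_requeue(int blocking, int ex_holders, int ro_holders)
	{
		if (blocking == 5)			/* another node wants EX */
			return ex_holders || ro_holders;
		if (blocking == 3)			/* another node wants PR */
			return ex_holders;		/* local PR holders are compatible */
		return 0;
	}

	int main(void)
	{
		printf("EX wanted, 2 readers held here: %d\n", must_requeue(5, 0, 2));
		printf("PR wanted, 2 readers held here: %d\n", must_requeue(3, 0, 2));
		return 0;
	}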
@@ -3005,7 +3119,7 @@ downconvert: | |||
3005 | ctl->requeue = 0; | 3119 | ctl->requeue = 0; |
3006 | 3120 | ||
3007 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { | 3121 | if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { |
3008 | if (lockres->l_level == LKM_EXMODE) | 3122 | if (lockres->l_level == DLM_LOCK_EX) |
3009 | set_lvb = 1; | 3123 | set_lvb = 1; |
3010 | 3124 | ||
3011 | /* | 3125 | /* |
@@ -3018,9 +3132,11 @@ downconvert: | |||
3018 | lockres->l_ops->set_lvb(lockres); | 3132 | lockres->l_ops->set_lvb(lockres); |
3019 | } | 3133 | } |
3020 | 3134 | ||
3021 | ocfs2_prepare_downconvert(lockres, new_level); | 3135 | gen = ocfs2_prepare_downconvert(lockres, new_level); |
3022 | spin_unlock_irqrestore(&lockres->l_lock, flags); | 3136 | spin_unlock_irqrestore(&lockres->l_lock, flags); |
3023 | ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb); | 3137 | ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb, |
3138 | gen); | ||
3139 | |||
3024 | leave: | 3140 | leave: |
3025 | mlog_exit(ret); | 3141 | mlog_exit(ret); |
3026 | return ret; | 3142 | return ret; |
@@ -3059,7 +3175,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, | |||
3059 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 3175 | (unsigned long long)OCFS2_I(inode)->ip_blkno); |
3060 | } | 3176 | } |
3061 | sync_mapping_buffers(mapping); | 3177 | sync_mapping_buffers(mapping); |
3062 | if (blocking == LKM_EXMODE) { | 3178 | if (blocking == DLM_LOCK_EX) { |
3063 | truncate_inode_pages(mapping, 0); | 3179 | truncate_inode_pages(mapping, 0); |
3064 | } else { | 3180 | } else { |
3065 | /* We only need to wait on the I/O if we're not also | 3181 | /* We only need to wait on the I/O if we're not also |
@@ -3080,8 +3196,8 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres, | |||
3080 | struct inode *inode = ocfs2_lock_res_inode(lockres); | 3196 | struct inode *inode = ocfs2_lock_res_inode(lockres); |
3081 | int checkpointed = ocfs2_inode_fully_checkpointed(inode); | 3197 | int checkpointed = ocfs2_inode_fully_checkpointed(inode); |
3082 | 3198 | ||
3083 | BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE); | 3199 | BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR); |
3084 | BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed); | 3200 | BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed); |
3085 | 3201 | ||
3086 | if (checkpointed) | 3202 | if (checkpointed) |
3087 | return 1; | 3203 | return 1; |
@@ -3145,7 +3261,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, | |||
3145 | * valid. The downconvert code will retain a PR for this node, | 3261 | * valid. The downconvert code will retain a PR for this node, |
3146 | * so there's no further work to do. | 3262 | * so there's no further work to do. |
3147 | */ | 3263 | */ |
3148 | if (blocking == LKM_PRMODE) | 3264 | if (blocking == DLM_LOCK_PR) |
3149 | return UNBLOCK_CONTINUE; | 3265 | return UNBLOCK_CONTINUE; |
3150 | 3266 | ||
3151 | /* | 3267 | /* |
@@ -3219,6 +3335,45 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, | |||
3219 | return UNBLOCK_CONTINUE_POST; | 3335 | return UNBLOCK_CONTINUE_POST; |
3220 | } | 3336 | } |
3221 | 3337 | ||
3338 | /* | ||
3339 | * This is the filesystem locking protocol. It provides the lock handling | ||
3340 | * hooks for the underlying DLM. It has a maximum version number. | ||
3341 | * The version number allows interoperability with systems running at | ||
3342 | * the same major number and an equal or smaller minor number. | ||
3343 | * | ||
3344 | * Whenever the filesystem does new things with locks (adds or removes a | ||
3345 | * lock, orders them differently, does different things underneath a lock), | ||
3346 | * the version must be changed. The protocol is negotiated when joining | ||
3347 | * the dlm domain. A node may join the domain if its major version is | ||
3348 | * identical to all other nodes and its minor version is greater than | ||
3349 | * or equal to all other nodes. When its minor version is greater than | ||
3350 | * the other nodes, it will run at the minor version specified by the | ||
3351 | * other nodes. | ||
3352 | * | ||
3353 | * If a locking change is made that will not be compatible with older | ||
3354 | * versions, the major number must be increased and the minor version set | ||
3355 | * to zero. If a change merely adds a behavior that can be disabled when | ||
3356 | * speaking to older versions, the minor version must be increased. If a | ||
3357 | * change adds a fully backwards compatible change (eg, LVB changes that | ||
3358 | * are just ignored by older versions), the version does not need to be | ||
3359 | * updated. | ||
3360 | */ | ||
3361 | static struct ocfs2_locking_protocol lproto = { | ||
3362 | .lp_max_version = { | ||
3363 | .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, | ||
3364 | .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, | ||
3365 | }, | ||
3366 | .lp_lock_ast = ocfs2_locking_ast, | ||
3367 | .lp_blocking_ast = ocfs2_blocking_ast, | ||
3368 | .lp_unlock_ast = ocfs2_unlock_ast, | ||
3369 | }; | ||
3370 | |||
3371 | void ocfs2_set_locking_protocol(void) | ||
3372 | { | ||
3373 | ocfs2_stack_glue_set_locking_protocol(&lproto); | ||
3374 | } | ||
3375 | |||
3376 | |||
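The join rule spelled out above (majors must match, a newer minor is allowed but the node then runs at the domain's minor) can be restated in a few lines of C. This is only a model of the rule as the comment states it, not the negotiation code in the stack glue; the types are simplified stand-ins.

	#include <stdio.h>

	struct proto_version {
		unsigned int major;
		unsigned int minor;
	};

	/* Returns 0 if the joining node is compatible, lowering its minor to
	 * the domain's; returns -1 if the major numbers differ. */
	static int may_join_model(const struct proto_version *domain,
				  struct proto_version *joiner)
	{
		if (joiner->major != domain->major)
			return -1;			/* different major: refuse */
		if (joiner->minor < domain->minor)
			return -1;			/* joiner too old: refuse */
		joiner->minor = domain->minor;		/* run at the domain's minor */
		return 0;
	}

	int main(void)
	{
		struct proto_version domain = { 1, 0 };
		struct proto_version joiner = { 1, 2 };

		if (!may_join_model(&domain, &joiner))
			printf("joined, running at %u.%u\n", joiner.major, joiner.minor);
		return 0;
	}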
3222 | static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, | 3377 | static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, |
3223 | struct ocfs2_lock_res *lockres) | 3378 | struct ocfs2_lock_res *lockres) |
3224 | { | 3379 | { |
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index e3cf902404b4..2bb01f09c1b1 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h | |||
@@ -58,7 +58,7 @@ struct ocfs2_meta_lvb { | |||
58 | #define OCFS2_LOCK_NONBLOCK (0x04) | 58 | #define OCFS2_LOCK_NONBLOCK (0x04) |
59 | 59 | ||
60 | int ocfs2_dlm_init(struct ocfs2_super *osb); | 60 | int ocfs2_dlm_init(struct ocfs2_super *osb); |
61 | void ocfs2_dlm_shutdown(struct ocfs2_super *osb); | 61 | void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending); |
62 | void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); | 62 | void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); |
63 | void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, | 63 | void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, |
64 | enum ocfs2_lock_type type, | 64 | enum ocfs2_lock_type type, |
@@ -114,5 +114,6 @@ void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb); | |||
114 | struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); | 114 | struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); |
115 | void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); | 115 | void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); |
116 | 116 | ||
117 | extern const struct dlm_protocol_version ocfs2_locking_protocol; | 117 | /* To set the locking protocol on module initialization */ |
118 | void ocfs2_set_locking_protocol(void); | ||
118 | #endif /* DLMGLUE_H */ | 119 | #endif /* DLMGLUE_H */ |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ed5d5232e85d..9154c82d3258 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -2242,7 +2242,7 @@ const struct file_operations ocfs2_fops = { | |||
2242 | .open = ocfs2_file_open, | 2242 | .open = ocfs2_file_open, |
2243 | .aio_read = ocfs2_file_aio_read, | 2243 | .aio_read = ocfs2_file_aio_read, |
2244 | .aio_write = ocfs2_file_aio_write, | 2244 | .aio_write = ocfs2_file_aio_write, |
2245 | .ioctl = ocfs2_ioctl, | 2245 | .unlocked_ioctl = ocfs2_ioctl, |
2246 | #ifdef CONFIG_COMPAT | 2246 | #ifdef CONFIG_COMPAT |
2247 | .compat_ioctl = ocfs2_compat_ioctl, | 2247 | .compat_ioctl = ocfs2_compat_ioctl, |
2248 | #endif | 2248 | #endif |
@@ -2258,7 +2258,7 @@ const struct file_operations ocfs2_dops = { | |||
2258 | .fsync = ocfs2_sync_file, | 2258 | .fsync = ocfs2_sync_file, |
2259 | .release = ocfs2_dir_release, | 2259 | .release = ocfs2_dir_release, |
2260 | .open = ocfs2_dir_open, | 2260 | .open = ocfs2_dir_open, |
2261 | .ioctl = ocfs2_ioctl, | 2261 | .unlocked_ioctl = ocfs2_ioctl, |
2262 | #ifdef CONFIG_COMPAT | 2262 | #ifdef CONFIG_COMPAT |
2263 | .compat_ioctl = ocfs2_compat_ioctl, | 2263 | .compat_ioctl = ocfs2_compat_ioctl, |
2264 | #endif | 2264 | #endif |
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c index 0758daf64da0..c6e7213db868 100644 --- a/fs/ocfs2/heartbeat.c +++ b/fs/ocfs2/heartbeat.c | |||
@@ -28,9 +28,6 @@ | |||
28 | #include <linux/types.h> | 28 | #include <linux/types.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/highmem.h> | 30 | #include <linux/highmem.h> |
31 | #include <linux/kmod.h> | ||
32 | |||
33 | #include <dlm/dlmapi.h> | ||
34 | 31 | ||
35 | #define MLOG_MASK_PREFIX ML_SUPER | 32 | #define MLOG_MASK_PREFIX ML_SUPER |
36 | #include <cluster/masklog.h> | 33 | #include <cluster/masklog.h> |
@@ -48,7 +45,6 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, | |||
48 | int bit); | 45 | int bit); |
49 | static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, | 46 | static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, |
50 | int bit); | 47 | int bit); |
51 | static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map); | ||
52 | 48 | ||
53 | /* special case -1 for now | 49 | /* special case -1 for now |
54 | * TODO: should *really* make sure the calling func never passes -1!! */ | 50 | * TODO: should *really* make sure the calling func never passes -1!! */ |
@@ -62,23 +58,23 @@ static void ocfs2_node_map_init(struct ocfs2_node_map *map) | |||
62 | void ocfs2_init_node_maps(struct ocfs2_super *osb) | 58 | void ocfs2_init_node_maps(struct ocfs2_super *osb) |
63 | { | 59 | { |
64 | spin_lock_init(&osb->node_map_lock); | 60 | spin_lock_init(&osb->node_map_lock); |
65 | ocfs2_node_map_init(&osb->recovery_map); | ||
66 | ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); | 61 | ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); |
67 | } | 62 | } |
68 | 63 | ||
69 | static void ocfs2_do_node_down(int node_num, | 64 | void ocfs2_do_node_down(int node_num, void *data) |
70 | struct ocfs2_super *osb) | ||
71 | { | 65 | { |
66 | struct ocfs2_super *osb = data; | ||
67 | |||
72 | BUG_ON(osb->node_num == node_num); | 68 | BUG_ON(osb->node_num == node_num); |
73 | 69 | ||
74 | mlog(0, "ocfs2: node down event for %d\n", node_num); | 70 | mlog(0, "ocfs2: node down event for %d\n", node_num); |
75 | 71 | ||
76 | if (!osb->dlm) { | 72 | if (!osb->cconn) { |
77 | /* | 73 | /* |
78 | * No DLM means we're not even ready to participate yet. | 74 | * No cluster connection means we're not even ready to |
79 | * We check the slots after the DLM comes up, so we will | 75 | * participate yet. We check the slots after the cluster |
80 | * notice the node death then. We can safely ignore it | 76 | * comes up, so we will notice the node death then. We |
81 | * here. | 77 | * can safely ignore it here. |
82 | */ | 78 | */ |
83 | return; | 79 | return; |
84 | } | 80 | } |
@@ -86,61 +82,6 @@ static void ocfs2_do_node_down(int node_num, | |||
86 | ocfs2_recovery_thread(osb, node_num); | 82 | ocfs2_recovery_thread(osb, node_num); |
87 | } | 83 | } |
88 | 84 | ||
89 | /* Called from the dlm when it's about to evict a node. We may also | ||
90 | * get a heartbeat callback later. */ | ||
91 | static void ocfs2_dlm_eviction_cb(int node_num, | ||
92 | void *data) | ||
93 | { | ||
94 | struct ocfs2_super *osb = (struct ocfs2_super *) data; | ||
95 | struct super_block *sb = osb->sb; | ||
96 | |||
97 | mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n", | ||
98 | MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num); | ||
99 | |||
100 | ocfs2_do_node_down(node_num, osb); | ||
101 | } | ||
102 | |||
103 | void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb) | ||
104 | { | ||
105 | /* Not exactly a heartbeat callback, but leads to essentially | ||
106 | * the same path so we set it up here. */ | ||
107 | dlm_setup_eviction_cb(&osb->osb_eviction_cb, | ||
108 | ocfs2_dlm_eviction_cb, | ||
109 | osb); | ||
110 | } | ||
111 | |||
112 | void ocfs2_stop_heartbeat(struct ocfs2_super *osb) | ||
113 | { | ||
114 | int ret; | ||
115 | char *argv[5], *envp[3]; | ||
116 | |||
117 | if (ocfs2_mount_local(osb)) | ||
118 | return; | ||
119 | |||
120 | if (!osb->uuid_str) { | ||
121 | /* This can happen if we don't get far enough in mount... */ | ||
122 | mlog(0, "No UUID with which to stop heartbeat!\n\n"); | ||
123 | return; | ||
124 | } | ||
125 | |||
126 | argv[0] = (char *)o2nm_get_hb_ctl_path(); | ||
127 | argv[1] = "-K"; | ||
128 | argv[2] = "-u"; | ||
129 | argv[3] = osb->uuid_str; | ||
130 | argv[4] = NULL; | ||
131 | |||
132 | mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); | ||
133 | |||
134 | /* minimal command environment taken from cpu_run_sbin_hotplug */ | ||
135 | envp[0] = "HOME=/"; | ||
136 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
137 | envp[2] = NULL; | ||
138 | |||
139 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); | ||
140 | if (ret < 0) | ||
141 | mlog_errno(ret); | ||
142 | } | ||
143 | |||
144 | static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, | 85 | static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, |
145 | int bit) | 86 | int bit) |
146 | { | 87 | { |
@@ -192,112 +133,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb, | |||
192 | return ret; | 133 | return ret; |
193 | } | 134 | } |
194 | 135 | ||
195 | static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map) | ||
196 | { | ||
197 | int bit; | ||
198 | bit = find_next_bit(map->map, map->num_nodes, 0); | ||
199 | if (bit < map->num_nodes) | ||
200 | return 0; | ||
201 | return 1; | ||
202 | } | ||
203 | |||
204 | int ocfs2_node_map_is_empty(struct ocfs2_super *osb, | ||
205 | struct ocfs2_node_map *map) | ||
206 | { | ||
207 | int ret; | ||
208 | BUG_ON(map->num_nodes == 0); | ||
209 | spin_lock(&osb->node_map_lock); | ||
210 | ret = __ocfs2_node_map_is_empty(map); | ||
211 | spin_unlock(&osb->node_map_lock); | ||
212 | return ret; | ||
213 | } | ||
214 | |||
215 | #if 0 | ||
216 | |||
217 | static void __ocfs2_node_map_dup(struct ocfs2_node_map *target, | ||
218 | struct ocfs2_node_map *from) | ||
219 | { | ||
220 | BUG_ON(from->num_nodes == 0); | ||
221 | ocfs2_node_map_init(target); | ||
222 | __ocfs2_node_map_set(target, from); | ||
223 | } | ||
224 | |||
225 | /* returns 1 if bit is the only bit set in target, 0 otherwise */ | ||
226 | int ocfs2_node_map_is_only(struct ocfs2_super *osb, | ||
227 | struct ocfs2_node_map *target, | ||
228 | int bit) | ||
229 | { | ||
230 | struct ocfs2_node_map temp; | ||
231 | int ret; | ||
232 | |||
233 | spin_lock(&osb->node_map_lock); | ||
234 | __ocfs2_node_map_dup(&temp, target); | ||
235 | __ocfs2_node_map_clear_bit(&temp, bit); | ||
236 | ret = __ocfs2_node_map_is_empty(&temp); | ||
237 | spin_unlock(&osb->node_map_lock); | ||
238 | |||
239 | return ret; | ||
240 | } | ||
241 | |||
242 | static void __ocfs2_node_map_set(struct ocfs2_node_map *target, | ||
243 | struct ocfs2_node_map *from) | ||
244 | { | ||
245 | int num_longs, i; | ||
246 | |||
247 | BUG_ON(target->num_nodes != from->num_nodes); | ||
248 | BUG_ON(target->num_nodes == 0); | ||
249 | |||
250 | num_longs = BITS_TO_LONGS(target->num_nodes); | ||
251 | for (i = 0; i < num_longs; i++) | ||
252 | target->map[i] = from->map[i]; | ||
253 | } | ||
254 | |||
255 | #endif /* 0 */ | ||
256 | |||
257 | /* Returns whether the recovery bit was actually set - it may not be | ||
258 | * if a node is still marked as needing recovery */ | ||
259 | int ocfs2_recovery_map_set(struct ocfs2_super *osb, | ||
260 | int num) | ||
261 | { | ||
262 | int set = 0; | ||
263 | |||
264 | spin_lock(&osb->node_map_lock); | ||
265 | |||
266 | if (!test_bit(num, osb->recovery_map.map)) { | ||
267 | __ocfs2_node_map_set_bit(&osb->recovery_map, num); | ||
268 | set = 1; | ||
269 | } | ||
270 | |||
271 | spin_unlock(&osb->node_map_lock); | ||
272 | |||
273 | return set; | ||
274 | } | ||
275 | |||
276 | void ocfs2_recovery_map_clear(struct ocfs2_super *osb, | ||
277 | int num) | ||
278 | { | ||
279 | ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num); | ||
280 | } | ||
281 | |||
282 | int ocfs2_node_map_iterate(struct ocfs2_super *osb, | ||
283 | struct ocfs2_node_map *map, | ||
284 | int idx) | ||
285 | { | ||
286 | int i = idx; | ||
287 | |||
288 | idx = O2NM_INVALID_NODE_NUM; | ||
289 | spin_lock(&osb->node_map_lock); | ||
290 | if ((i != O2NM_INVALID_NODE_NUM) && | ||
291 | (i >= 0) && | ||
292 | (i < map->num_nodes)) { | ||
293 | while(i < map->num_nodes) { | ||
294 | if (test_bit(i, map->map)) { | ||
295 | idx = i; | ||
296 | break; | ||
297 | } | ||
298 | i++; | ||
299 | } | ||
300 | } | ||
301 | spin_unlock(&osb->node_map_lock); | ||
302 | return idx; | ||
303 | } | ||
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h index eac63aed7611..74b9c5dda28d 100644 --- a/fs/ocfs2/heartbeat.h +++ b/fs/ocfs2/heartbeat.h | |||
@@ -28,13 +28,10 @@ | |||
28 | 28 | ||
29 | void ocfs2_init_node_maps(struct ocfs2_super *osb); | 29 | void ocfs2_init_node_maps(struct ocfs2_super *osb); |
30 | 30 | ||
31 | void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); | 31 | void ocfs2_do_node_down(int node_num, void *data); |
32 | void ocfs2_stop_heartbeat(struct ocfs2_super *osb); | ||
33 | 32 | ||
34 | /* node map functions - used to keep track of mounted and in-recovery | 33 | /* node map functions - used to keep track of mounted and in-recovery |
35 | * nodes. */ | 34 | * nodes. */ |
36 | int ocfs2_node_map_is_empty(struct ocfs2_super *osb, | ||
37 | struct ocfs2_node_map *map); | ||
38 | void ocfs2_node_map_set_bit(struct ocfs2_super *osb, | 35 | void ocfs2_node_map_set_bit(struct ocfs2_super *osb, |
39 | struct ocfs2_node_map *map, | 36 | struct ocfs2_node_map *map, |
40 | int bit); | 37 | int bit); |
@@ -44,17 +41,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb, | |||
44 | int ocfs2_node_map_test_bit(struct ocfs2_super *osb, | 41 | int ocfs2_node_map_test_bit(struct ocfs2_super *osb, |
45 | struct ocfs2_node_map *map, | 42 | struct ocfs2_node_map *map, |
46 | int bit); | 43 | int bit); |
47 | int ocfs2_node_map_iterate(struct ocfs2_super *osb, | ||
48 | struct ocfs2_node_map *map, | ||
49 | int idx); | ||
50 | static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb, | ||
51 | struct ocfs2_node_map *map) | ||
52 | { | ||
53 | return ocfs2_node_map_iterate(osb, map, 0); | ||
54 | } | ||
55 | int ocfs2_recovery_map_set(struct ocfs2_super *osb, | ||
56 | int num); | ||
57 | void ocfs2_recovery_map_clear(struct ocfs2_super *osb, | ||
58 | int num); | ||
59 | 44 | ||
60 | #endif /* OCFS2_HEARTBEAT_H */ | 45 | #endif /* OCFS2_HEARTBEAT_H */ |
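The heartbeat change above exports the node-down handler with a generic (int node_num, void *data) signature, so a cluster stack that only stores an opaque pointer can invoke it without knowing anything about the filesystem. Below is a minimal stand-alone C sketch of that callback pattern; the example_* names and types are invented for illustration and are not the kernel's.

#include <stdio.h>

struct example_super { int our_node; };

/* Signature a generic cluster stack can call without knowing about the fs. */
typedef void (*node_down_fn)(int node_num, void *data);

struct example_connection {
        node_down_fn on_node_down;
        void *cb_data;
};

static void example_do_node_down(int node_num, void *data)
{
        struct example_super *osb = data;   /* recover the fs context */

        if (node_num == osb->our_node)
                return;                     /* the real code BUG()s on this */
        printf("node %d went down, start recovery\n", node_num);
}

int main(void)
{
        struct example_super osb = { .our_node = 0 };
        struct example_connection conn = {
                .on_node_down = example_do_node_down,
                .cb_data = &osb,
        };

        conn.on_node_down(3, conn.cb_data); /* what the stack would do */
        return 0;
}

The design point is that the filesystem registers "call this function with this cookie", which is the only contract a pluggable stack can honour.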
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 5177fba5162b..b413166dd163 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c | |||
@@ -7,6 +7,7 @@ | |||
7 | 7 | ||
8 | #include <linux/fs.h> | 8 | #include <linux/fs.h> |
9 | #include <linux/mount.h> | 9 | #include <linux/mount.h> |
10 | #include <linux/smp_lock.h> | ||
10 | 11 | ||
11 | #define MLOG_MASK_PREFIX ML_INODE | 12 | #define MLOG_MASK_PREFIX ML_INODE |
12 | #include <cluster/masklog.h> | 13 | #include <cluster/masklog.h> |
@@ -112,9 +113,9 @@ bail: | |||
112 | return status; | 113 | return status; |
113 | } | 114 | } |
114 | 115 | ||
115 | int ocfs2_ioctl(struct inode * inode, struct file * filp, | 116 | long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) |
116 | unsigned int cmd, unsigned long arg) | ||
117 | { | 117 | { |
118 | struct inode *inode = filp->f_path.dentry->d_inode; | ||
118 | unsigned int flags; | 119 | unsigned int flags; |
119 | int new_clusters; | 120 | int new_clusters; |
120 | int status; | 121 | int status; |
@@ -168,9 +169,6 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp, | |||
168 | #ifdef CONFIG_COMPAT | 169 | #ifdef CONFIG_COMPAT |
169 | long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) | 170 | long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) |
170 | { | 171 | { |
171 | struct inode *inode = file->f_path.dentry->d_inode; | ||
172 | int ret; | ||
173 | |||
174 | switch (cmd) { | 172 | switch (cmd) { |
175 | case OCFS2_IOC32_GETFLAGS: | 173 | case OCFS2_IOC32_GETFLAGS: |
176 | cmd = OCFS2_IOC_GETFLAGS; | 174 | cmd = OCFS2_IOC_GETFLAGS; |
@@ -190,9 +188,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) | |||
190 | return -ENOIOCTLCMD; | 188 | return -ENOIOCTLCMD; |
191 | } | 189 | } |
192 | 190 | ||
193 | lock_kernel(); | 191 | return ocfs2_ioctl(file, cmd, arg); |
194 | ret = ocfs2_ioctl(inode, file, cmd, arg); | ||
195 | unlock_kernel(); | ||
196 | return ret; | ||
197 | } | 192 | } |
198 | #endif | 193 | #endif |
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h index 4d6c4f430d0d..cf9a5ee30fef 100644 --- a/fs/ocfs2/ioctl.h +++ b/fs/ocfs2/ioctl.h | |||
@@ -10,8 +10,7 @@ | |||
10 | #ifndef OCFS2_IOCTL_H | 10 | #ifndef OCFS2_IOCTL_H |
11 | #define OCFS2_IOCTL_H | 11 | #define OCFS2_IOCTL_H |
12 | 12 | ||
13 | int ocfs2_ioctl(struct inode * inode, struct file * filp, | 13 | long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); |
14 | unsigned int cmd, unsigned long arg); | ||
15 | long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); | 14 | long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); |
16 | 15 | ||
17 | #endif /* OCFS2_IOCTL_H */ | 16 | #endif /* OCFS2_IOCTL_H */ |
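The ioctl conversion above is a calling-convention change: the unlocked entry point receives only the file, digs the inode out itself, and no longer depends on the BKL, which is why the 32-bit compat path can now forward straight to it. A minimal stand-alone sketch of that shape follows; the struct definitions are stand-ins, not the kernel's.

#include <stdio.h>

struct inode { long i_ino; };
struct file  { struct inode *f_inode; };

static long example_ioctl(struct file *filp, unsigned int cmd,
                          unsigned long arg)
{
        /* f_path.dentry->d_inode in the actual patch */
        struct inode *inode = filp->f_inode;

        printf("ioctl cmd %u on inode %ld, arg %lu\n", cmd, inode->i_ino, arg);
        return 0;
}

/* Compat path: translate the 32-bit command, then reuse the same handler,
 * with no lock_kernel()/unlock_kernel() pair around it. */
static long example_compat_ioctl(struct file *filp, unsigned int cmd,
                                 unsigned long arg)
{
        /* command translation elided */
        return example_ioctl(filp, cmd, arg);
}

int main(void)
{
        struct inode i = { .i_ino = 42 };
        struct file f = { .f_inode = &i };

        example_compat_ioctl(&f, 7, 0);
        return 0;
}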
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f31c7e8c19c3..9698338adc39 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -64,6 +64,137 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, | |||
64 | int slot); | 64 | int slot); |
65 | static int ocfs2_commit_thread(void *arg); | 65 | static int ocfs2_commit_thread(void *arg); |
66 | 66 | ||
67 | |||
68 | /* | ||
69 | * The recovery_list is a simple linked list of node numbers to recover. | ||
70 | * It is protected by the recovery_lock. | ||
71 | */ | ||
72 | |||
73 | struct ocfs2_recovery_map { | ||
74 | unsigned int rm_used; | ||
75 | unsigned int *rm_entries; | ||
76 | }; | ||
77 | |||
78 | int ocfs2_recovery_init(struct ocfs2_super *osb) | ||
79 | { | ||
80 | struct ocfs2_recovery_map *rm; | ||
81 | |||
82 | mutex_init(&osb->recovery_lock); | ||
83 | osb->disable_recovery = 0; | ||
84 | osb->recovery_thread_task = NULL; | ||
85 | init_waitqueue_head(&osb->recovery_event); | ||
86 | |||
87 | rm = kzalloc(sizeof(struct ocfs2_recovery_map) + | ||
88 | osb->max_slots * sizeof(unsigned int), | ||
89 | GFP_KERNEL); | ||
90 | if (!rm) { | ||
91 | mlog_errno(-ENOMEM); | ||
92 | return -ENOMEM; | ||
93 | } | ||
94 | |||
95 | rm->rm_entries = (unsigned int *)((char *)rm + | ||
96 | sizeof(struct ocfs2_recovery_map)); | ||
97 | osb->recovery_map = rm; | ||
98 | |||
99 | return 0; | ||
100 | } | ||
101 | |||
102 | /* we can't grab the goofy sem lock from inside wait_event, so we use | ||
103 | * memory barriers to make sure that we'll see the null task before | ||
104 | * being woken up */ | ||
105 | static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) | ||
106 | { | ||
107 | mb(); | ||
108 | return osb->recovery_thread_task != NULL; | ||
109 | } | ||
110 | |||
111 | void ocfs2_recovery_exit(struct ocfs2_super *osb) | ||
112 | { | ||
113 | struct ocfs2_recovery_map *rm; | ||
114 | |||
115 | /* disable any new recovery threads and wait for any currently | ||
116 | * running ones to exit. Do this before setting the vol_state. */ | ||
117 | mutex_lock(&osb->recovery_lock); | ||
118 | osb->disable_recovery = 1; | ||
119 | mutex_unlock(&osb->recovery_lock); | ||
120 | wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); | ||
121 | |||
122 | /* At this point, we know that no more recovery threads can be | ||
123 | * launched, so wait for any recovery completion work to | ||
124 | * complete. */ | ||
125 | flush_workqueue(ocfs2_wq); | ||
126 | |||
127 | /* | ||
128 | * Now that recovery is shut down, and the osb is about to be | ||
129 | * freed, the osb_lock is not taken here. | ||
130 | */ | ||
131 | rm = osb->recovery_map; | ||
132 | /* XXX: Should we bug if there are dirty entries? */ | ||
133 | |||
134 | kfree(rm); | ||
135 | } | ||
136 | |||
137 | static int __ocfs2_recovery_map_test(struct ocfs2_super *osb, | ||
138 | unsigned int node_num) | ||
139 | { | ||
140 | int i; | ||
141 | struct ocfs2_recovery_map *rm = osb->recovery_map; | ||
142 | |||
143 | assert_spin_locked(&osb->osb_lock); | ||
144 | |||
145 | for (i = 0; i < rm->rm_used; i++) { | ||
146 | if (rm->rm_entries[i] == node_num) | ||
147 | return 1; | ||
148 | } | ||
149 | |||
150 | return 0; | ||
151 | } | ||
152 | |||
153 | /* Behaves like test-and-set. Returns the previous value */ | ||
154 | static int ocfs2_recovery_map_set(struct ocfs2_super *osb, | ||
155 | unsigned int node_num) | ||
156 | { | ||
157 | struct ocfs2_recovery_map *rm = osb->recovery_map; | ||
158 | |||
159 | spin_lock(&osb->osb_lock); | ||
160 | if (__ocfs2_recovery_map_test(osb, node_num)) { | ||
161 | spin_unlock(&osb->osb_lock); | ||
162 | return 1; | ||
163 | } | ||
164 | |||
165 | /* XXX: Can this be exploited? Not from o2dlm... */ | ||
166 | BUG_ON(rm->rm_used >= osb->max_slots); | ||
167 | |||
168 | rm->rm_entries[rm->rm_used] = node_num; | ||
169 | rm->rm_used++; | ||
170 | spin_unlock(&osb->osb_lock); | ||
171 | |||
172 | return 0; | ||
173 | } | ||
174 | |||
175 | static void ocfs2_recovery_map_clear(struct ocfs2_super *osb, | ||
176 | unsigned int node_num) | ||
177 | { | ||
178 | int i; | ||
179 | struct ocfs2_recovery_map *rm = osb->recovery_map; | ||
180 | |||
181 | spin_lock(&osb->osb_lock); | ||
182 | |||
183 | for (i = 0; i < rm->rm_used; i++) { | ||
184 | if (rm->rm_entries[i] == node_num) | ||
185 | break; | ||
186 | } | ||
187 | |||
188 | if (i < rm->rm_used) { | ||
189 | /* XXX: be careful with the pointer math */ | ||
190 | memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]), | ||
191 | (rm->rm_used - i - 1) * sizeof(unsigned int)); | ||
192 | rm->rm_used--; | ||
193 | } | ||
194 | |||
195 | spin_unlock(&osb->osb_lock); | ||
196 | } | ||
197 | |||
67 | static int ocfs2_commit_cache(struct ocfs2_super *osb) | 198 | static int ocfs2_commit_cache(struct ocfs2_super *osb) |
68 | { | 199 | { |
69 | int status = 0; | 200 | int status = 0; |
@@ -586,8 +717,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local) | |||
586 | 717 | ||
587 | mlog_entry_void(); | 718 | mlog_entry_void(); |
588 | 719 | ||
589 | if (!journal) | 720 | BUG_ON(!journal); |
590 | BUG(); | ||
591 | 721 | ||
592 | osb = journal->j_osb; | 722 | osb = journal->j_osb; |
593 | 723 | ||
@@ -650,6 +780,23 @@ bail: | |||
650 | return status; | 780 | return status; |
651 | } | 781 | } |
652 | 782 | ||
783 | static int ocfs2_recovery_completed(struct ocfs2_super *osb) | ||
784 | { | ||
785 | int empty; | ||
786 | struct ocfs2_recovery_map *rm = osb->recovery_map; | ||
787 | |||
788 | spin_lock(&osb->osb_lock); | ||
789 | empty = (rm->rm_used == 0); | ||
790 | spin_unlock(&osb->osb_lock); | ||
791 | |||
792 | return empty; | ||
793 | } | ||
794 | |||
795 | void ocfs2_wait_for_recovery(struct ocfs2_super *osb) | ||
796 | { | ||
797 | wait_event(osb->recovery_event, ocfs2_recovery_completed(osb)); | ||
798 | } | ||
799 | |||
653 | /* | 800 | /* |
654 | * JBD Might read a cached version of another nodes journal file. We | 801 | * JBD Might read a cached version of another nodes journal file. We |
655 | * don't want this as this file changes often and we get no | 802 | * don't want this as this file changes often and we get no |
@@ -848,6 +995,7 @@ static int __ocfs2_recovery_thread(void *arg) | |||
848 | { | 995 | { |
849 | int status, node_num; | 996 | int status, node_num; |
850 | struct ocfs2_super *osb = arg; | 997 | struct ocfs2_super *osb = arg; |
998 | struct ocfs2_recovery_map *rm = osb->recovery_map; | ||
851 | 999 | ||
852 | mlog_entry_void(); | 1000 | mlog_entry_void(); |
853 | 1001 | ||
@@ -863,26 +1011,29 @@ restart: | |||
863 | goto bail; | 1011 | goto bail; |
864 | } | 1012 | } |
865 | 1013 | ||
866 | while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { | 1014 | spin_lock(&osb->osb_lock); |
867 | node_num = ocfs2_node_map_first_set_bit(osb, | 1015 | while (rm->rm_used) { |
868 | &osb->recovery_map); | 1016 | /* It's always safe to remove entry zero, as we won't |
869 | if (node_num == O2NM_INVALID_NODE_NUM) { | 1017 | * clear it until ocfs2_recover_node() has succeeded. */ |
870 | mlog(0, "Out of nodes to recover.\n"); | 1018 | node_num = rm->rm_entries[0]; |
871 | break; | 1019 | spin_unlock(&osb->osb_lock); |
872 | } | ||
873 | 1020 | ||
874 | status = ocfs2_recover_node(osb, node_num); | 1021 | status = ocfs2_recover_node(osb, node_num); |
875 | if (status < 0) { | 1022 | if (!status) { |
1023 | ocfs2_recovery_map_clear(osb, node_num); | ||
1024 | } else { | ||
876 | mlog(ML_ERROR, | 1025 | mlog(ML_ERROR, |
877 | "Error %d recovering node %d on device (%u,%u)!\n", | 1026 | "Error %d recovering node %d on device (%u,%u)!\n", |
878 | status, node_num, | 1027 | status, node_num, |
879 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | 1028 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); |
880 | mlog(ML_ERROR, "Volume requires unmount.\n"); | 1029 | mlog(ML_ERROR, "Volume requires unmount.\n"); |
881 | continue; | ||
882 | } | 1030 | } |
883 | 1031 | ||
884 | ocfs2_recovery_map_clear(osb, node_num); | 1032 | spin_lock(&osb->osb_lock); |
885 | } | 1033 | } |
1034 | spin_unlock(&osb->osb_lock); | ||
1035 | mlog(0, "All nodes recovered\n"); | ||
1036 | |||
886 | ocfs2_super_unlock(osb, 1); | 1037 | ocfs2_super_unlock(osb, 1); |
887 | 1038 | ||
888 | /* We always run recovery on our own orphan dir - the dead | 1039 | /* We always run recovery on our own orphan dir - the dead |
@@ -893,8 +1044,7 @@ restart: | |||
893 | 1044 | ||
894 | bail: | 1045 | bail: |
895 | mutex_lock(&osb->recovery_lock); | 1046 | mutex_lock(&osb->recovery_lock); |
896 | if (!status && | 1047 | if (!status && !ocfs2_recovery_completed(osb)) { |
897 | !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { | ||
898 | mutex_unlock(&osb->recovery_lock); | 1048 | mutex_unlock(&osb->recovery_lock); |
899 | goto restart; | 1049 | goto restart; |
900 | } | 1050 | } |
@@ -924,8 +1074,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) | |||
924 | 1074 | ||
925 | /* People waiting on recovery will wait on | 1075 | /* People waiting on recovery will wait on |
926 | * the recovery map to empty. */ | 1076 | * the recovery map to empty. */ |
927 | if (!ocfs2_recovery_map_set(osb, node_num)) | 1077 | if (ocfs2_recovery_map_set(osb, node_num)) |
928 | mlog(0, "node %d already be in recovery.\n", node_num); | 1078 | mlog(0, "node %d already in recovery map.\n", node_num); |
929 | 1079 | ||
930 | mlog(0, "starting recovery thread...\n"); | 1080 | mlog(0, "starting recovery thread...\n"); |
931 | 1081 | ||
@@ -1079,7 +1229,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, | |||
1079 | { | 1229 | { |
1080 | int status = 0; | 1230 | int status = 0; |
1081 | int slot_num; | 1231 | int slot_num; |
1082 | struct ocfs2_slot_info *si = osb->slot_info; | ||
1083 | struct ocfs2_dinode *la_copy = NULL; | 1232 | struct ocfs2_dinode *la_copy = NULL; |
1084 | struct ocfs2_dinode *tl_copy = NULL; | 1233 | struct ocfs2_dinode *tl_copy = NULL; |
1085 | 1234 | ||
@@ -1092,8 +1241,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, | |||
1092 | * case we should've called ocfs2_journal_load instead. */ | 1241 | * case we should've called ocfs2_journal_load instead. */ |
1093 | BUG_ON(osb->node_num == node_num); | 1242 | BUG_ON(osb->node_num == node_num); |
1094 | 1243 | ||
1095 | slot_num = ocfs2_node_num_to_slot(si, node_num); | 1244 | slot_num = ocfs2_node_num_to_slot(osb, node_num); |
1096 | if (slot_num == OCFS2_INVALID_SLOT) { | 1245 | if (slot_num == -ENOENT) { |
1097 | status = 0; | 1246 | status = 0; |
1098 | mlog(0, "no slot for this node, so no recovery required.\n"); | 1247 | mlog(0, "no slot for this node, so no recovery required.\n"); |
1099 | goto done; | 1248 | goto done; |
@@ -1123,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, | |||
1123 | 1272 | ||
1124 | /* Likewise, this would be a strange but ultimately not so | 1273 | /* Likewise, this would be a strange but ultimately not so |
1125 | * harmful place to get an error... */ | 1274 | * harmful place to get an error... */ |
1126 | ocfs2_clear_slot(si, slot_num); | 1275 | status = ocfs2_clear_slot(osb, slot_num); |
1127 | status = ocfs2_update_disk_slots(osb, si); | ||
1128 | if (status < 0) | 1276 | if (status < 0) |
1129 | mlog_errno(status); | 1277 | mlog_errno(status); |
1130 | 1278 | ||
@@ -1184,23 +1332,24 @@ bail: | |||
1184 | * slot info struct has been updated from disk. */ | 1332 | * slot info struct has been updated from disk. */ |
1185 | int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) | 1333 | int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) |
1186 | { | 1334 | { |
1187 | int status, i, node_num; | 1335 | unsigned int node_num; |
1188 | struct ocfs2_slot_info *si = osb->slot_info; | 1336 | int status, i; |
1189 | 1337 | ||
1190 | /* This is called with the super block cluster lock, so we | 1338 | /* This is called with the super block cluster lock, so we |
1191 | * know that the slot map can't change underneath us. */ | 1339 | * know that the slot map can't change underneath us. */ |
1192 | 1340 | ||
1193 | spin_lock(&si->si_lock); | 1341 | spin_lock(&osb->osb_lock); |
1194 | for(i = 0; i < si->si_num_slots; i++) { | 1342 | for (i = 0; i < osb->max_slots; i++) { |
1195 | if (i == osb->slot_num) | 1343 | if (i == osb->slot_num) |
1196 | continue; | 1344 | continue; |
1197 | if (ocfs2_is_empty_slot(si, i)) | 1345 | |
1346 | status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); | ||
1347 | if (status == -ENOENT) | ||
1198 | continue; | 1348 | continue; |
1199 | 1349 | ||
1200 | node_num = si->si_global_node_nums[i]; | 1350 | if (__ocfs2_recovery_map_test(osb, node_num)) |
1201 | if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num)) | ||
1202 | continue; | 1351 | continue; |
1203 | spin_unlock(&si->si_lock); | 1352 | spin_unlock(&osb->osb_lock); |
1204 | 1353 | ||
1205 | /* Ok, we have a slot occupied by another node which | 1354 | /* Ok, we have a slot occupied by another node which |
1206 | * is not in the recovery map. We trylock his journal | 1355 | * is not in the recovery map. We trylock his journal |
@@ -1216,9 +1365,9 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) | |||
1216 | goto bail; | 1365 | goto bail; |
1217 | } | 1366 | } |
1218 | 1367 | ||
1219 | spin_lock(&si->si_lock); | 1368 | spin_lock(&osb->osb_lock); |
1220 | } | 1369 | } |
1221 | spin_unlock(&si->si_lock); | 1370 | spin_unlock(&osb->osb_lock); |
1222 | 1371 | ||
1223 | status = 0; | 1372 | status = 0; |
1224 | bail: | 1373 | bail: |
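The recovery map introduced above replaces the old node bitmap with a flat array of node numbers sized to max_slots: insertion behaves like test-and-set, removal compacts the array with memmove, and the recovery thread always drains entry zero, clearing it only once ocfs2_recover_node() succeeds. The following stand-alone C model shows those semantics, with a pthread mutex standing in for osb_lock and invented names throughout.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define MAX_SLOTS 8

struct example_recovery_map {
        pthread_mutex_t lock;
        unsigned int used;
        unsigned int entries[MAX_SLOTS];
};

/* Behaves like test-and-set: returns 1 if the node was already queued. */
static int rm_set(struct example_recovery_map *rm, unsigned int node)
{
        int already = 0;
        unsigned int i;

        pthread_mutex_lock(&rm->lock);
        for (i = 0; i < rm->used; i++)
                if (rm->entries[i] == node)
                        already = 1;
        if (!already)
                rm->entries[rm->used++] = node;
        pthread_mutex_unlock(&rm->lock);

        return already;
}

static void rm_clear(struct example_recovery_map *rm, unsigned int node)
{
        unsigned int i;

        pthread_mutex_lock(&rm->lock);
        for (i = 0; i < rm->used; i++) {
                if (rm->entries[i] == node) {
                        memmove(&rm->entries[i], &rm->entries[i + 1],
                                (rm->used - i - 1) * sizeof(rm->entries[0]));
                        rm->used--;
                        break;
                }
        }
        pthread_mutex_unlock(&rm->lock);
}

int main(void)
{
        struct example_recovery_map rm = { PTHREAD_MUTEX_INITIALIZER, 0, { 0 } };

        rm_set(&rm, 3);
        rm_set(&rm, 5);
        printf("second add of node 3 already queued: %d\n", rm_set(&rm, 3));

        /* Drain like the recovery thread: entry 0 stays until it succeeds. */
        while (rm.used) {
                unsigned int node = rm.entries[0];

                printf("recovering node %u\n", node);
                rm_clear(&rm, node);
        }
        return 0;
}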
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 220f3e818e78..db82be2532ed 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h | |||
@@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, | |||
134 | 134 | ||
135 | /* Exported only for the journal struct init code in super.c. Do not call. */ | 135 | /* Exported only for the journal struct init code in super.c. Do not call. */ |
136 | void ocfs2_complete_recovery(struct work_struct *work); | 136 | void ocfs2_complete_recovery(struct work_struct *work); |
137 | void ocfs2_wait_for_recovery(struct ocfs2_super *osb); | ||
138 | |||
139 | int ocfs2_recovery_init(struct ocfs2_super *osb); | ||
140 | void ocfs2_recovery_exit(struct ocfs2_super *osb); | ||
137 | 141 | ||
138 | /* | 142 | /* |
139 | * Journal Control: | 143 | * Journal Control: |
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index ab83fd562429..ce0dc147602a 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c | |||
@@ -447,6 +447,8 @@ out_mutex: | |||
447 | iput(main_bm_inode); | 447 | iput(main_bm_inode); |
448 | 448 | ||
449 | out: | 449 | out: |
450 | if (!status) | ||
451 | ocfs2_init_inode_steal_slot(osb); | ||
450 | mlog_exit(status); | 452 | mlog_exit(status); |
451 | return status; | 453 | return status; |
452 | } | 454 | } |
@@ -523,6 +525,8 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, | |||
523 | } | 525 | } |
524 | 526 | ||
525 | ac->ac_inode = local_alloc_inode; | 527 | ac->ac_inode = local_alloc_inode; |
528 | /* We should never use localalloc from another slot */ | ||
529 | ac->ac_alloc_slot = osb->slot_num; | ||
526 | ac->ac_which = OCFS2_AC_USE_LOCAL; | 530 | ac->ac_which = OCFS2_AC_USE_LOCAL; |
527 | get_bh(osb->local_alloc_bh); | 531 | get_bh(osb->local_alloc_bh); |
528 | ac->ac_bh = osb->local_alloc_bh; | 532 | ac->ac_bh = osb->local_alloc_bh; |
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ae9ad9587516..d5d808fe0140 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -424,7 +424,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, | |||
424 | fe->i_fs_generation = cpu_to_le32(osb->fs_generation); | 424 | fe->i_fs_generation = cpu_to_le32(osb->fs_generation); |
425 | fe->i_blkno = cpu_to_le64(fe_blkno); | 425 | fe->i_blkno = cpu_to_le64(fe_blkno); |
426 | fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); | 426 | fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); |
427 | fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); | 427 | fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); |
428 | fe->i_uid = cpu_to_le32(current->fsuid); | 428 | fe->i_uid = cpu_to_le32(current->fsuid); |
429 | if (dir->i_mode & S_ISGID) { | 429 | if (dir->i_mode & S_ISGID) { |
430 | fe->i_gid = cpu_to_le32(dir->i_gid); | 430 | fe->i_gid = cpu_to_le32(dir->i_gid); |
@@ -997,7 +997,7 @@ static int ocfs2_rename(struct inode *old_dir, | |||
997 | * | 997 | * |
998 | * And that's why, just like the VFS, we need a file system | 998 | * And that's why, just like the VFS, we need a file system |
999 | * rename lock. */ | 999 | * rename lock. */ |
1000 | if (old_dentry != new_dentry) { | 1000 | if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) { |
1001 | status = ocfs2_rename_lock(osb); | 1001 | status = ocfs2_rename_lock(osb); |
1002 | if (status < 0) { | 1002 | if (status < 0) { |
1003 | mlog_errno(status); | 1003 | mlog_errno(status); |
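The rename hunk above narrows when the cluster-wide rename lock is taken: instead of locking whenever the dentries differ, it locks only when a directory is moving to a different parent, the one case that can create a directory loop another node might race against. A tiny stand-alone illustration of the narrowed predicate (names invented):

#include <stdbool.h>
#include <stdio.h>

/* Mirrors the new test: old_dir != new_dir && S_ISDIR(old_inode->i_mode). */
static bool needs_rename_lock(bool same_parent, bool is_dir)
{
        return !same_parent && is_dir;
}

int main(void)
{
        printf("file within one dir : %d\n", needs_rename_lock(true,  false));
        printf("file across dirs    : %d\n", needs_rename_lock(false, false));
        printf("dir within one dir  : %d\n", needs_rename_lock(true,  true));
        printf("dir across dirs     : %d\n", needs_rename_lock(false, true));
        return 0;
}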
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 6546cef212e3..31692379c170 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -36,11 +36,8 @@ | |||
36 | #include <linux/mutex.h> | 36 | #include <linux/mutex.h> |
37 | #include <linux/jbd.h> | 37 | #include <linux/jbd.h> |
38 | 38 | ||
39 | #include "cluster/nodemanager.h" | 39 | /* For union ocfs2_dlm_lksb */ |
40 | #include "cluster/heartbeat.h" | 40 | #include "stackglue.h" |
41 | #include "cluster/tcp.h" | ||
42 | |||
43 | #include "dlm/dlmapi.h" | ||
44 | 41 | ||
45 | #include "ocfs2_fs.h" | 42 | #include "ocfs2_fs.h" |
46 | #include "ocfs2_lockid.h" | 43 | #include "ocfs2_lockid.h" |
@@ -101,6 +98,9 @@ enum ocfs2_unlock_action { | |||
101 | * dropped. */ | 98 | * dropped. */ |
102 | #define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ | 99 | #define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ |
103 | #define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ | 100 | #define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ |
101 | #define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a | ||
102 | call to dlm_lock. Only | ||
103 | exists with BUSY set. */ | ||
104 | 104 | ||
105 | struct ocfs2_lock_res_ops; | 105 | struct ocfs2_lock_res_ops; |
106 | 106 | ||
@@ -120,13 +120,14 @@ struct ocfs2_lock_res { | |||
120 | int l_level; | 120 | int l_level; |
121 | unsigned int l_ro_holders; | 121 | unsigned int l_ro_holders; |
122 | unsigned int l_ex_holders; | 122 | unsigned int l_ex_holders; |
123 | struct dlm_lockstatus l_lksb; | 123 | union ocfs2_dlm_lksb l_lksb; |
124 | 124 | ||
125 | /* used from AST/BAST funcs. */ | 125 | /* used from AST/BAST funcs. */ |
126 | enum ocfs2_ast_action l_action; | 126 | enum ocfs2_ast_action l_action; |
127 | enum ocfs2_unlock_action l_unlock_action; | 127 | enum ocfs2_unlock_action l_unlock_action; |
128 | int l_requested; | 128 | int l_requested; |
129 | int l_blocking; | 129 | int l_blocking; |
130 | unsigned int l_pending_gen; | ||
130 | 131 | ||
131 | wait_queue_head_t l_event; | 132 | wait_queue_head_t l_event; |
132 | 133 | ||
@@ -179,6 +180,8 @@ enum ocfs2_mount_options | |||
179 | #define OCFS2_DEFAULT_ATIME_QUANTUM 60 | 180 | #define OCFS2_DEFAULT_ATIME_QUANTUM 60 |
180 | 181 | ||
181 | struct ocfs2_journal; | 182 | struct ocfs2_journal; |
183 | struct ocfs2_slot_info; | ||
184 | struct ocfs2_recovery_map; | ||
182 | struct ocfs2_super | 185 | struct ocfs2_super |
183 | { | 186 | { |
184 | struct task_struct *commit_task; | 187 | struct task_struct *commit_task; |
@@ -190,7 +193,6 @@ struct ocfs2_super | |||
190 | struct ocfs2_slot_info *slot_info; | 193 | struct ocfs2_slot_info *slot_info; |
191 | 194 | ||
192 | spinlock_t node_map_lock; | 195 | spinlock_t node_map_lock; |
193 | struct ocfs2_node_map recovery_map; | ||
194 | 196 | ||
195 | u64 root_blkno; | 197 | u64 root_blkno; |
196 | u64 system_dir_blkno; | 198 | u64 system_dir_blkno; |
@@ -206,25 +208,29 @@ struct ocfs2_super | |||
206 | u32 s_feature_incompat; | 208 | u32 s_feature_incompat; |
207 | u32 s_feature_ro_compat; | 209 | u32 s_feature_ro_compat; |
208 | 210 | ||
209 | /* Protects s_next_generaion, osb_flags. Could protect more on | 211 | /* Protects s_next_generation, osb_flags and s_inode_steal_slot. |
210 | * osb as it's very short lived. */ | 212 | * Could protect more on osb as it's very short lived. |
213 | */ | ||
211 | spinlock_t osb_lock; | 214 | spinlock_t osb_lock; |
212 | u32 s_next_generation; | 215 | u32 s_next_generation; |
213 | unsigned long osb_flags; | 216 | unsigned long osb_flags; |
217 | s16 s_inode_steal_slot; | ||
218 | atomic_t s_num_inodes_stolen; | ||
214 | 219 | ||
215 | unsigned long s_mount_opt; | 220 | unsigned long s_mount_opt; |
216 | unsigned int s_atime_quantum; | 221 | unsigned int s_atime_quantum; |
217 | 222 | ||
218 | u16 max_slots; | 223 | unsigned int max_slots; |
219 | s16 node_num; | 224 | unsigned int node_num; |
220 | s16 slot_num; | 225 | int slot_num; |
221 | s16 preferred_slot; | 226 | int preferred_slot; |
222 | int s_sectsize_bits; | 227 | int s_sectsize_bits; |
223 | int s_clustersize; | 228 | int s_clustersize; |
224 | int s_clustersize_bits; | 229 | int s_clustersize_bits; |
225 | 230 | ||
226 | atomic_t vol_state; | 231 | atomic_t vol_state; |
227 | struct mutex recovery_lock; | 232 | struct mutex recovery_lock; |
233 | struct ocfs2_recovery_map *recovery_map; | ||
228 | struct task_struct *recovery_thread_task; | 234 | struct task_struct *recovery_thread_task; |
229 | int disable_recovery; | 235 | int disable_recovery; |
230 | wait_queue_head_t checkpoint_event; | 236 | wait_queue_head_t checkpoint_event; |
@@ -245,12 +251,11 @@ struct ocfs2_super | |||
245 | struct ocfs2_alloc_stats alloc_stats; | 251 | struct ocfs2_alloc_stats alloc_stats; |
246 | char dev_str[20]; /* "major,minor" of the device */ | 252 | char dev_str[20]; /* "major,minor" of the device */ |
247 | 253 | ||
248 | struct dlm_ctxt *dlm; | 254 | char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; |
255 | struct ocfs2_cluster_connection *cconn; | ||
249 | struct ocfs2_lock_res osb_super_lockres; | 256 | struct ocfs2_lock_res osb_super_lockres; |
250 | struct ocfs2_lock_res osb_rename_lockres; | 257 | struct ocfs2_lock_res osb_rename_lockres; |
251 | struct dlm_eviction_cb osb_eviction_cb; | ||
252 | struct ocfs2_dlm_debug *osb_dlm_debug; | 258 | struct ocfs2_dlm_debug *osb_dlm_debug; |
253 | struct dlm_protocol_version osb_locking_proto; | ||
254 | 259 | ||
255 | struct dentry *osb_debug_root; | 260 | struct dentry *osb_debug_root; |
256 | 261 | ||
@@ -367,11 +372,24 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb) | |||
367 | return ret; | 372 | return ret; |
368 | } | 373 | } |
369 | 374 | ||
375 | static inline int ocfs2_userspace_stack(struct ocfs2_super *osb) | ||
376 | { | ||
377 | return (osb->s_feature_incompat & | ||
378 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK); | ||
379 | } | ||
380 | |||
370 | static inline int ocfs2_mount_local(struct ocfs2_super *osb) | 381 | static inline int ocfs2_mount_local(struct ocfs2_super *osb) |
371 | { | 382 | { |
372 | return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); | 383 | return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); |
373 | } | 384 | } |
374 | 385 | ||
386 | static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) | ||
387 | { | ||
388 | return (osb->s_feature_incompat & | ||
389 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP); | ||
390 | } | ||
391 | |||
392 | |||
375 | #define OCFS2_IS_VALID_DINODE(ptr) \ | 393 | #define OCFS2_IS_VALID_DINODE(ptr) \ |
376 | (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) | 394 | (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) |
377 | 395 | ||
@@ -522,6 +540,33 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) | |||
522 | return pages_per_cluster; | 540 | return pages_per_cluster; |
523 | } | 541 | } |
524 | 542 | ||
543 | static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) | ||
544 | { | ||
545 | spin_lock(&osb->osb_lock); | ||
546 | osb->s_inode_steal_slot = OCFS2_INVALID_SLOT; | ||
547 | spin_unlock(&osb->osb_lock); | ||
548 | atomic_set(&osb->s_num_inodes_stolen, 0); | ||
549 | } | ||
550 | |||
551 | static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb, | ||
552 | s16 slot) | ||
553 | { | ||
554 | spin_lock(&osb->osb_lock); | ||
555 | osb->s_inode_steal_slot = slot; | ||
556 | spin_unlock(&osb->osb_lock); | ||
557 | } | ||
558 | |||
559 | static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) | ||
560 | { | ||
561 | s16 slot; | ||
562 | |||
563 | spin_lock(&osb->osb_lock); | ||
564 | slot = osb->s_inode_steal_slot; | ||
565 | spin_unlock(&osb->osb_lock); | ||
566 | |||
567 | return slot; | ||
568 | } | ||
569 | |||
525 | #define ocfs2_set_bit ext2_set_bit | 570 | #define ocfs2_set_bit ext2_set_bit |
526 | #define ocfs2_clear_bit ext2_clear_bit | 571 | #define ocfs2_clear_bit ext2_clear_bit |
527 | #define ocfs2_test_bit ext2_test_bit | 572 | #define ocfs2_test_bit ext2_test_bit |
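The new s_inode_steal_slot field above caches which slot inodes were last stolen from; it shares osb_lock with s_next_generation, while s_num_inodes_stolen is a separate atomic counter. Below is a stand-alone sketch of how such helpers behave, using a mutex and a plain counter as stand-ins for the kernel primitives; the allocator step at the end is an invented usage example, not the patch's code.

#include <pthread.h>
#include <stdio.h>

#define EXAMPLE_INVALID_SLOT (-1)

struct example_super {
        pthread_mutex_t osb_lock;
        int inode_steal_slot;
        long num_inodes_stolen;
};

static void steal_slot_init(struct example_super *osb)
{
        pthread_mutex_lock(&osb->osb_lock);
        osb->inode_steal_slot = EXAMPLE_INVALID_SLOT;
        pthread_mutex_unlock(&osb->osb_lock);
        osb->num_inodes_stolen = 0;
}

static void steal_slot_set(struct example_super *osb, int slot)
{
        pthread_mutex_lock(&osb->osb_lock);
        osb->inode_steal_slot = slot;
        pthread_mutex_unlock(&osb->osb_lock);
}

static int steal_slot_get(struct example_super *osb)
{
        int slot;

        pthread_mutex_lock(&osb->osb_lock);
        slot = osb->inode_steal_slot;
        pthread_mutex_unlock(&osb->osb_lock);
        return slot;
}

int main(void)
{
        struct example_super osb = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

        steal_slot_init(&osb);
        /* Local slot ran out of inodes; remember which slot we stole from
         * so later allocations try it first. */
        if (steal_slot_get(&osb) == EXAMPLE_INVALID_SLOT)
                steal_slot_set(&osb, 2);
        osb.num_inodes_stolen++;
        printf("stealing from slot %d (%ld so far)\n",
               steal_slot_get(&osb), osb.num_inodes_stolen);
        return 0;
}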
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 3633edd3982f..52c426665154 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h | |||
@@ -88,7 +88,9 @@ | |||
88 | #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB | 88 | #define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB |
89 | #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ | 89 | #define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ |
90 | | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ | 90 | | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ |
91 | | OCFS2_FEATURE_INCOMPAT_INLINE_DATA) | 91 | | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ |
92 | | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ | ||
93 | | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK) | ||
92 | #define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN | 94 | #define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN |
93 | 95 | ||
94 | /* | 96 | /* |
@@ -125,6 +127,21 @@ | |||
125 | /* Support for data packed into inode blocks */ | 127 | /* Support for data packed into inode blocks */ |
126 | #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 | 128 | #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 |
127 | 129 | ||
130 | /* Support for the extended slot map */ | ||
131 | #define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 | ||
132 | |||
133 | |||
134 | /* | ||
135 | * Support for alternate, userspace cluster stacks. If set, the superblock | ||
136 | * field s_cluster_info contains a tag for the alternate stack in use as | ||
137 | * well as the name of the cluster being joined. | ||
138 | * mount.ocfs2 must pass in a matching stack name. | ||
139 | * | ||
140 | * If not set, the classic stack will be used. This is compatible with | ||
141 | * all older versions. | ||
142 | */ | ||
143 | #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 | ||
144 | |||
128 | /* | 145 | /* |
129 | * backup superblock flag is used to indicate that this volume | 146 | * backup superblock flag is used to indicate that this volume |
130 | * has backup superblocks. | 147 | * has backup superblocks. |
@@ -267,6 +284,10 @@ struct ocfs2_new_group_input { | |||
267 | #define OCFS2_VOL_UUID_LEN 16 | 284 | #define OCFS2_VOL_UUID_LEN 16 |
268 | #define OCFS2_MAX_VOL_LABEL_LEN 64 | 285 | #define OCFS2_MAX_VOL_LABEL_LEN 64 |
269 | 286 | ||
287 | /* The alternate, userspace stack fields */ | ||
288 | #define OCFS2_STACK_LABEL_LEN 4 | ||
289 | #define OCFS2_CLUSTER_NAME_LEN 16 | ||
290 | |||
270 | /* Journal limits (in bytes) */ | 291 | /* Journal limits (in bytes) */ |
271 | #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) | 292 | #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) |
272 | 293 | ||
@@ -475,6 +496,47 @@ struct ocfs2_extent_block | |||
475 | }; | 496 | }; |
476 | 497 | ||
477 | /* | 498 | /* |
499 | * On disk slot map for OCFS2. This defines the contents of the "slot_map" | ||
500 | * system file. A slot is valid if it contains a node number >= 0. The | ||
501 | * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty. | ||
502 | */ | ||
503 | struct ocfs2_slot_map { | ||
504 | /*00*/ __le16 sm_slots[0]; | ||
505 | /* | ||
506 | * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255, | ||
507 | * 255 * sizeof(__le16) == 510 bytes, which fits within the 512-byte minimum blocksize. | ||
508 | */ | ||
509 | }; | ||
510 | |||
511 | struct ocfs2_extended_slot { | ||
512 | /*00*/ __u8 es_valid; | ||
513 | __u8 es_reserved1[3]; | ||
514 | __le32 es_node_num; | ||
515 | /*10*/ | ||
516 | }; | ||
517 | |||
518 | /* | ||
519 | * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP | ||
520 | * is set. It separates out the valid marker from the node number, and | ||
521 | * has room to grow. Unlike the old slot map, this format is defined by | ||
522 | * i_size. | ||
523 | */ | ||
524 | struct ocfs2_slot_map_extended { | ||
525 | /*00*/ struct ocfs2_extended_slot se_slots[0]; | ||
526 | /* | ||
527 | * Actual size is i_size of the slot_map system file. It should | ||
528 | * match s_max_slots * sizeof(struct ocfs2_extended_slot) | ||
529 | */ | ||
530 | }; | ||
531 | |||
532 | struct ocfs2_cluster_info { | ||
533 | /*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN]; | ||
534 | __le32 ci_reserved; | ||
535 | /*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN]; | ||
536 | /*18*/ | ||
537 | }; | ||
538 | |||
539 | /* | ||
478 | * On disk superblock for OCFS2 | 540 | * On disk superblock for OCFS2 |
479 | * Note that it is contained inside an ocfs2_dinode, so all offsets | 541 | * Note that it is contained inside an ocfs2_dinode, so all offsets |
480 | * are relative to the start of ocfs2_dinode.id2. | 542 | * are relative to the start of ocfs2_dinode.id2. |
@@ -506,7 +568,20 @@ struct ocfs2_super_block { | |||
506 | * group header */ | 568 | * group header */ |
507 | /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ | 569 | /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ |
508 | /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ | 570 | /*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ |
509 | /*A0*/ | 571 | /*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace |
572 | stack. Only valid | ||
573 | with INCOMPAT flag. */ | ||
574 | /*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */ | ||
575 | /*140*/ | ||
576 | |||
577 | /* | ||
578 | * NOTE: As stated above, all offsets are relative to | ||
579 | * ocfs2_dinode.id2, which is at 0xC0 in the inode. | ||
580 | * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within | ||
581 | * our smallest blocksize, which is 512 bytes. To ensure this, | ||
582 | * we reserve the space in s_reserved2. Anything past s_reserved2 | ||
583 | * will not be available on the smallest blocksize. | ||
584 | */ | ||
510 | }; | 585 | }; |
511 | 586 | ||
512 | /* | 587 | /* |
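The slot map structures added above come in two on-disk sizes: the classic map is an array of little-endian 16-bit entries bounded by one block, while the extended map stores an 8-byte ocfs2_extended_slot per slot and is bounded by i_size rather than a single block. The stand-alone check below works through those sizes under the stated assumptions (OCFS2_MAX_SLOTS taken as 255 per the comment, and a 512-byte minimum blocksize); the example_* names are invented.

#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_MAX_SLOTS 255

/* Same layout as the on-disk ocfs2_extended_slot: 8 bytes per slot. */
struct example_extended_slot {
        uint8_t  es_valid;
        uint8_t  es_reserved1[3];
        uint32_t es_node_num;
};

int main(void)
{
        unsigned long old_map = EXAMPLE_MAX_SLOTS * sizeof(uint16_t);
        unsigned long ext_map = EXAMPLE_MAX_SLOTS *
                                sizeof(struct example_extended_slot);

        printf("old slot map : %lu bytes (fits one 512-byte block: %s)\n",
               old_map, old_map <= 512 ? "yes" : "no");
        printf("extended map : %lu bytes (spans %lu 512-byte blocks)\n",
               ext_map, (ext_map + 511) / 512);
        return 0;
}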
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 86f3e3799c2b..82c200f7a8f1 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h | |||
@@ -100,7 +100,7 @@ static char *ocfs2_lock_type_strings[] = { | |||
100 | static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) | 100 | static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) |
101 | { | 101 | { |
102 | #ifdef __KERNEL__ | 102 | #ifdef __KERNEL__ |
103 | mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); | 103 | BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); |
104 | #endif | 104 | #endif |
105 | return ocfs2_lock_type_strings[type]; | 105 | return ocfs2_lock_type_strings[type]; |
106 | } | 106 | } |
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 3a50ce555e64..bb5ff8939bf1 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c | |||
@@ -42,81 +42,244 @@ | |||
42 | 42 | ||
43 | #include "buffer_head_io.h" | 43 | #include "buffer_head_io.h" |
44 | 44 | ||
45 | static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | 45 | |
46 | s16 global); | 46 | struct ocfs2_slot { |
47 | static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, | 47 | int sl_valid; |
48 | s16 slot_num, | 48 | unsigned int sl_node_num; |
49 | s16 node_num); | 49 | }; |
50 | 50 | ||
51 | /* post the slot information on disk into our slot_info struct. */ | 51 | struct ocfs2_slot_info { |
52 | void ocfs2_update_slot_info(struct ocfs2_slot_info *si) | 52 | int si_extended; |
53 | int si_slots_per_block; | ||
54 | struct inode *si_inode; | ||
55 | unsigned int si_blocks; | ||
56 | struct buffer_head **si_bh; | ||
57 | unsigned int si_num_slots; | ||
58 | struct ocfs2_slot *si_slots; | ||
59 | }; | ||
60 | |||
61 | |||
62 | static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
63 | unsigned int node_num); | ||
64 | |||
65 | static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si, | ||
66 | int slot_num) | ||
67 | { | ||
68 | BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); | ||
69 | si->si_slots[slot_num].sl_valid = 0; | ||
70 | } | ||
71 | |||
72 | static void ocfs2_set_slot(struct ocfs2_slot_info *si, | ||
73 | int slot_num, unsigned int node_num) | ||
74 | { | ||
75 | BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots)); | ||
76 | |||
77 | si->si_slots[slot_num].sl_valid = 1; | ||
78 | si->si_slots[slot_num].sl_node_num = node_num; | ||
79 | } | ||
80 | |||
81 | /* This version is for the extended slot map */ | ||
82 | static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si) | ||
83 | { | ||
84 | int b, i, slotno; | ||
85 | struct ocfs2_slot_map_extended *se; | ||
86 | |||
87 | slotno = 0; | ||
88 | for (b = 0; b < si->si_blocks; b++) { | ||
89 | se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data; | ||
90 | for (i = 0; | ||
91 | (i < si->si_slots_per_block) && | ||
92 | (slotno < si->si_num_slots); | ||
93 | i++, slotno++) { | ||
94 | if (se->se_slots[i].es_valid) | ||
95 | ocfs2_set_slot(si, slotno, | ||
96 | le32_to_cpu(se->se_slots[i].es_node_num)); | ||
97 | else | ||
98 | ocfs2_invalidate_slot(si, slotno); | ||
99 | } | ||
100 | } | ||
101 | } | ||
102 | |||
103 | /* | ||
104 | * Post the slot information on disk into our slot_info struct. | ||
105 | * Must be protected by osb_lock. | ||
106 | */ | ||
107 | static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si) | ||
53 | { | 108 | { |
54 | int i; | 109 | int i; |
55 | __le16 *disk_info; | 110 | struct ocfs2_slot_map *sm; |
56 | 111 | ||
57 | /* we don't read the slot block here as ocfs2_super_lock | 112 | sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; |
58 | * should've made sure we have the most recent copy. */ | ||
59 | spin_lock(&si->si_lock); | ||
60 | disk_info = (__le16 *) si->si_bh->b_data; | ||
61 | 113 | ||
62 | for (i = 0; i < si->si_size; i++) | 114 | for (i = 0; i < si->si_num_slots; i++) { |
63 | si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); | 115 | if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT) |
116 | ocfs2_invalidate_slot(si, i); | ||
117 | else | ||
118 | ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i])); | ||
119 | } | ||
120 | } | ||
64 | 121 | ||
65 | spin_unlock(&si->si_lock); | 122 | static void ocfs2_update_slot_info(struct ocfs2_slot_info *si) |
123 | { | ||
124 | /* | ||
125 | * The slot data will have been refreshed when ocfs2_super_lock | ||
126 | * was taken. | ||
127 | */ | ||
128 | if (si->si_extended) | ||
129 | ocfs2_update_slot_info_extended(si); | ||
130 | else | ||
131 | ocfs2_update_slot_info_old(si); | ||
132 | } | ||
133 | |||
134 | int ocfs2_refresh_slot_info(struct ocfs2_super *osb) | ||
135 | { | ||
136 | int ret; | ||
137 | struct ocfs2_slot_info *si = osb->slot_info; | ||
138 | |||
139 | if (si == NULL) | ||
140 | return 0; | ||
141 | |||
142 | BUG_ON(si->si_blocks == 0); | ||
143 | BUG_ON(si->si_bh == NULL); | ||
144 | |||
145 | mlog(0, "Refreshing slot map, reading %u block(s)\n", | ||
146 | si->si_blocks); | ||
147 | |||
148 | /* | ||
149 | * We pass -1 as blocknr because we expect all of si->si_bh to | ||
150 | * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If | ||
151 | * this is not true, the read of -1 (UINT64_MAX) will fail. | ||
152 | */ | ||
153 | ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0, | ||
154 | si->si_inode); | ||
155 | if (ret == 0) { | ||
156 | spin_lock(&osb->osb_lock); | ||
157 | ocfs2_update_slot_info(si); | ||
158 | spin_unlock(&osb->osb_lock); | ||
159 | } | ||
160 | |||
161 | return ret; | ||
66 | } | 162 | } |
67 | 163 | ||
68 | /* post the our slot info stuff into it's destination bh and write it | 164 | /* post the our slot info stuff into it's destination bh and write it |
69 | * out. */ | 165 | * out. */ |
70 | int ocfs2_update_disk_slots(struct ocfs2_super *osb, | 166 | static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si, |
71 | struct ocfs2_slot_info *si) | 167 | int slot_num, |
168 | struct buffer_head **bh) | ||
72 | { | 169 | { |
73 | int status, i; | 170 | int blkind = slot_num / si->si_slots_per_block; |
74 | __le16 *disk_info = (__le16 *) si->si_bh->b_data; | 171 | int slotno = slot_num % si->si_slots_per_block; |
172 | struct ocfs2_slot_map_extended *se; | ||
173 | |||
174 | BUG_ON(blkind >= si->si_blocks); | ||
175 | |||
176 | se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data; | ||
177 | se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid; | ||
178 | if (si->si_slots[slot_num].sl_valid) | ||
179 | se->se_slots[slotno].es_node_num = | ||
180 | cpu_to_le32(si->si_slots[slot_num].sl_node_num); | ||
181 | *bh = si->si_bh[blkind]; | ||
182 | } | ||
75 | 183 | ||
76 | spin_lock(&si->si_lock); | 184 | static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si, |
77 | for (i = 0; i < si->si_size; i++) | 185 | int slot_num, |
78 | disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); | 186 | struct buffer_head **bh) |
79 | spin_unlock(&si->si_lock); | 187 | { |
188 | int i; | ||
189 | struct ocfs2_slot_map *sm; | ||
190 | |||
191 | sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data; | ||
192 | for (i = 0; i < si->si_num_slots; i++) { | ||
193 | if (si->si_slots[i].sl_valid) | ||
194 | sm->sm_slots[i] = | ||
195 | cpu_to_le16(si->si_slots[i].sl_node_num); | ||
196 | else | ||
197 | sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT); | ||
198 | } | ||
199 | *bh = si->si_bh[0]; | ||
200 | } | ||
201 | |||
202 | static int ocfs2_update_disk_slot(struct ocfs2_super *osb, | ||
203 | struct ocfs2_slot_info *si, | ||
204 | int slot_num) | ||
205 | { | ||
206 | int status; | ||
207 | struct buffer_head *bh; | ||
208 | |||
209 | spin_lock(&osb->osb_lock); | ||
210 | if (si->si_extended) | ||
211 | ocfs2_update_disk_slot_extended(si, slot_num, &bh); | ||
212 | else | ||
213 | ocfs2_update_disk_slot_old(si, slot_num, &bh); | ||
214 | spin_unlock(&osb->osb_lock); | ||
80 | 215 | ||
81 | status = ocfs2_write_block(osb, si->si_bh, si->si_inode); | 216 | status = ocfs2_write_block(osb, bh, si->si_inode); |
82 | if (status < 0) | 217 | if (status < 0) |
83 | mlog_errno(status); | 218 | mlog_errno(status); |
84 | 219 | ||
85 | return status; | 220 | return status; |
86 | } | 221 | } |
87 | 222 | ||
88 | /* try to find global node in the slot info. Returns | 223 | /* |
89 | * OCFS2_INVALID_SLOT if nothing is found. */ | 224 | * Calculate how many bytes are needed by the slot map. Returns |
90 | static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | 225 | * an error if the slot map file is too small. |
91 | s16 global) | 226 | */ |
227 | static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb, | ||
228 | struct inode *inode, | ||
229 | unsigned long long *bytes) | ||
92 | { | 230 | { |
93 | int i; | 231 | unsigned long long bytes_needed; |
94 | s16 ret = OCFS2_INVALID_SLOT; | 232 | |
233 | if (ocfs2_uses_extended_slot_map(osb)) { | ||
234 | bytes_needed = osb->max_slots * | ||
235 | sizeof(struct ocfs2_extended_slot); | ||
236 | } else { | ||
237 | bytes_needed = osb->max_slots * sizeof(__le16); | ||
238 | } | ||
239 | if (bytes_needed > i_size_read(inode)) { | ||
240 | mlog(ML_ERROR, | ||
241 | "Slot map file is too small! (size %llu, needed %llu)\n", | ||
242 | i_size_read(inode), bytes_needed); | ||
243 | return -ENOSPC; | ||
244 | } | ||
245 | |||
246 | *bytes = bytes_needed; | ||
247 | return 0; | ||
248 | } | ||
249 | |||
250 | /* try to find global node in the slot info. Returns -ENOENT | ||
251 | * if nothing is found. */ | ||
252 | static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
253 | unsigned int node_num) | ||
254 | { | ||
255 | int i, ret = -ENOENT; | ||
95 | 256 | ||
96 | for(i = 0; i < si->si_num_slots; i++) { | 257 | for(i = 0; i < si->si_num_slots; i++) { |
97 | if (global == si->si_global_node_nums[i]) { | 258 | if (si->si_slots[i].sl_valid && |
98 | ret = (s16) i; | 259 | (node_num == si->si_slots[i].sl_node_num)) { |
260 | ret = i; | ||
99 | break; | 261 | break; |
100 | } | 262 | } |
101 | } | 263 | } |
264 | |||
102 | return ret; | 265 | return ret; |
103 | } | 266 | } |
104 | 267 | ||
105 | static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred) | 268 | static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, |
269 | int preferred) | ||
106 | { | 270 | { |
107 | int i; | 271 | int i, ret = -ENOSPC; |
108 | s16 ret = OCFS2_INVALID_SLOT; | ||
109 | 272 | ||
110 | if (preferred >= 0 && preferred < si->si_num_slots) { | 273 | if ((preferred >= 0) && (preferred < si->si_num_slots)) { |
111 | if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) { | 274 | if (!si->si_slots[preferred].sl_valid) { |
112 | ret = preferred; | 275 | ret = preferred; |
113 | goto out; | 276 | goto out; |
114 | } | 277 | } |
115 | } | 278 | } |
116 | 279 | ||
117 | for(i = 0; i < si->si_num_slots; i++) { | 280 | for(i = 0; i < si->si_num_slots; i++) { |
118 | if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { | 281 | if (!si->si_slots[i].sl_valid) { |
119 | ret = (s16) i; | 282 | ret = i; |
120 | break; | 283 | break; |
121 | } | 284 | } |
122 | } | 285 | } |
@@ -124,58 +287,155 @@ out: | |||
124 | return ret; | 287 | return ret; |
125 | } | 288 | } |
126 | 289 | ||
127 | s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | 290 | int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num) |
128 | s16 global) | ||
129 | { | 291 | { |
130 | s16 ret; | 292 | int slot; |
293 | struct ocfs2_slot_info *si = osb->slot_info; | ||
131 | 294 | ||
132 | spin_lock(&si->si_lock); | 295 | spin_lock(&osb->osb_lock); |
133 | ret = __ocfs2_node_num_to_slot(si, global); | 296 | slot = __ocfs2_node_num_to_slot(si, node_num); |
134 | spin_unlock(&si->si_lock); | 297 | spin_unlock(&osb->osb_lock); |
135 | return ret; | 298 | |
299 | return slot; | ||
300 | } | ||
301 | |||
302 | int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, | ||
303 | unsigned int *node_num) | ||
304 | { | ||
305 | struct ocfs2_slot_info *si = osb->slot_info; | ||
306 | |||
307 | assert_spin_locked(&osb->osb_lock); | ||
308 | |||
309 | BUG_ON(slot_num < 0); | ||
310 | BUG_ON(slot_num > osb->max_slots); | ||
311 | |||
312 | if (!si->si_slots[slot_num].sl_valid) | ||
313 | return -ENOENT; | ||
314 | |||
315 | *node_num = si->si_slots[slot_num].sl_node_num; | ||
316 | return 0; | ||
136 | } | 317 | } |
137 | 318 | ||
138 | static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, | 319 | static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) |
139 | s16 slot_num, | ||
140 | s16 node_num) | ||
141 | { | 320 | { |
142 | BUG_ON(slot_num == OCFS2_INVALID_SLOT); | 321 | unsigned int i; |
143 | BUG_ON(slot_num >= si->si_num_slots); | 322 | |
144 | BUG_ON((node_num != O2NM_INVALID_NODE_NUM) && | 323 | if (si == NULL) |
145 | (node_num >= O2NM_MAX_NODES)); | 324 | return; |
325 | |||
326 | if (si->si_inode) | ||
327 | iput(si->si_inode); | ||
328 | if (si->si_bh) { | ||
329 | for (i = 0; i < si->si_blocks; i++) { | ||
330 | if (si->si_bh[i]) { | ||
331 | brelse(si->si_bh[i]); | ||
332 | si->si_bh[i] = NULL; | ||
333 | } | ||
334 | } | ||
335 | kfree(si->si_bh); | ||
336 | } | ||
146 | 337 | ||
147 | si->si_global_node_nums[slot_num] = node_num; | 338 | kfree(si); |
148 | } | 339 | } |
149 | 340 | ||
150 | void ocfs2_clear_slot(struct ocfs2_slot_info *si, | 341 | int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num) |
151 | s16 slot_num) | ||
152 | { | 342 | { |
153 | spin_lock(&si->si_lock); | 343 | struct ocfs2_slot_info *si = osb->slot_info; |
154 | __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); | 344 | |
155 | spin_unlock(&si->si_lock); | 345 | if (si == NULL) |
346 | return 0; | ||
347 | |||
348 | spin_lock(&osb->osb_lock); | ||
349 | ocfs2_invalidate_slot(si, slot_num); | ||
350 | spin_unlock(&osb->osb_lock); | ||
351 | |||
352 | return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num); | ||
156 | } | 353 | } |
157 | 354 | ||
158 | int ocfs2_init_slot_info(struct ocfs2_super *osb) | 355 | static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, |
356 | struct ocfs2_slot_info *si) | ||
159 | { | 357 | { |
160 | int status, i; | 358 | int status = 0; |
161 | u64 blkno; | 359 | u64 blkno; |
360 | unsigned long long blocks, bytes; | ||
361 | unsigned int i; | ||
362 | struct buffer_head *bh; | ||
363 | |||
364 | status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes); | ||
365 | if (status) | ||
366 | goto bail; | ||
367 | |||
368 | blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes); | ||
369 | BUG_ON(blocks > UINT_MAX); | ||
370 | si->si_blocks = blocks; | ||
371 | if (!si->si_blocks) | ||
372 | goto bail; | ||
373 | |||
374 | if (si->si_extended) | ||
375 | si->si_slots_per_block = | ||
376 | (osb->sb->s_blocksize / | ||
377 | sizeof(struct ocfs2_extended_slot)); | ||
378 | else | ||
379 | si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16); | ||
380 | |||
381 | /* The size checks above should ensure this */ | ||
382 | BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks); | ||
383 | |||
384 | mlog(0, "Slot map needs %u buffers for %llu bytes\n", | ||
385 | si->si_blocks, bytes); | ||
386 | |||
387 | si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, | ||
388 | GFP_KERNEL); | ||
389 | if (!si->si_bh) { | ||
390 | status = -ENOMEM; | ||
391 | mlog_errno(status); | ||
392 | goto bail; | ||
393 | } | ||
394 | |||
395 | for (i = 0; i < si->si_blocks; i++) { | ||
396 | status = ocfs2_extent_map_get_blocks(si->si_inode, i, | ||
397 | &blkno, NULL, NULL); | ||
398 | if (status < 0) { | ||
399 | mlog_errno(status); | ||
400 | goto bail; | ||
401 | } | ||
402 | |||
403 | mlog(0, "Reading slot map block %u at %llu\n", i, | ||
404 | (unsigned long long)blkno); | ||
405 | |||
406 | bh = NULL; /* Acquire a fresh bh */ | ||
407 | status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode); | ||
408 | if (status < 0) { | ||
409 | mlog_errno(status); | ||
410 | goto bail; | ||
411 | } | ||
412 | |||
413 | si->si_bh[i] = bh; | ||
414 | } | ||
415 | |||
416 | bail: | ||
417 | return status; | ||
418 | } | ||
419 | |||
420 | int ocfs2_init_slot_info(struct ocfs2_super *osb) | ||
421 | { | ||
422 | int status; | ||
162 | struct inode *inode = NULL; | 423 | struct inode *inode = NULL; |
163 | struct buffer_head *bh = NULL; | ||
164 | struct ocfs2_slot_info *si; | 424 | struct ocfs2_slot_info *si; |
165 | 425 | ||
166 | si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL); | 426 | si = kzalloc(sizeof(struct ocfs2_slot_info) + |
427 | (sizeof(struct ocfs2_slot) * osb->max_slots), | ||
428 | GFP_KERNEL); | ||
167 | if (!si) { | 429 | if (!si) { |
168 | status = -ENOMEM; | 430 | status = -ENOMEM; |
169 | mlog_errno(status); | 431 | mlog_errno(status); |
170 | goto bail; | 432 | goto bail; |
171 | } | 433 | } |
172 | 434 | ||
173 | spin_lock_init(&si->si_lock); | 435 | si->si_extended = ocfs2_uses_extended_slot_map(osb); |
174 | si->si_num_slots = osb->max_slots; | 436 | si->si_num_slots = osb->max_slots; |
175 | si->si_size = OCFS2_MAX_SLOTS; | 437 | si->si_slots = (struct ocfs2_slot *)((char *)si + |
176 | 438 | sizeof(struct ocfs2_slot_info)); | |
177 | for(i = 0; i < si->si_num_slots; i++) | ||
178 | si->si_global_node_nums[i] = OCFS2_INVALID_SLOT; | ||
179 | 439 | ||
180 | inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, | 440 | inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, |
181 | OCFS2_INVALID_SLOT); | 441 | OCFS2_INVALID_SLOT); |
@@ -185,61 +445,53 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) | |||
185 | goto bail; | 445 | goto bail; |
186 | } | 446 | } |
187 | 447 | ||
188 | status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL); | 448 | si->si_inode = inode; |
189 | if (status < 0) { | 449 | status = ocfs2_map_slot_buffers(osb, si); |
190 | mlog_errno(status); | ||
191 | goto bail; | ||
192 | } | ||
193 | |||
194 | status = ocfs2_read_block(osb, blkno, &bh, 0, inode); | ||
195 | if (status < 0) { | 450 | if (status < 0) { |
196 | mlog_errno(status); | 451 | mlog_errno(status); |
197 | goto bail; | 452 | goto bail; |
198 | } | 453 | } |
199 | 454 | ||
200 | si->si_inode = inode; | 455 | osb->slot_info = (struct ocfs2_slot_info *)si; |
201 | si->si_bh = bh; | ||
202 | osb->slot_info = si; | ||
203 | bail: | 456 | bail: |
204 | if (status < 0 && si) | 457 | if (status < 0 && si) |
205 | ocfs2_free_slot_info(si); | 458 | __ocfs2_free_slot_info(si); |
206 | 459 | ||
207 | return status; | 460 | return status; |
208 | } | 461 | } |
209 | 462 | ||
210 | void ocfs2_free_slot_info(struct ocfs2_slot_info *si) | 463 | void ocfs2_free_slot_info(struct ocfs2_super *osb) |
211 | { | 464 | { |
212 | if (si->si_inode) | 465 | struct ocfs2_slot_info *si = osb->slot_info; |
213 | iput(si->si_inode); | 466 | |
214 | if (si->si_bh) | 467 | osb->slot_info = NULL; |
215 | brelse(si->si_bh); | 468 | __ocfs2_free_slot_info(si); |
216 | kfree(si); | ||
217 | } | 469 | } |
218 | 470 | ||
219 | int ocfs2_find_slot(struct ocfs2_super *osb) | 471 | int ocfs2_find_slot(struct ocfs2_super *osb) |
220 | { | 472 | { |
221 | int status; | 473 | int status; |
222 | s16 slot; | 474 | int slot; |
223 | struct ocfs2_slot_info *si; | 475 | struct ocfs2_slot_info *si; |
224 | 476 | ||
225 | mlog_entry_void(); | 477 | mlog_entry_void(); |
226 | 478 | ||
227 | si = osb->slot_info; | 479 | si = osb->slot_info; |
228 | 480 | ||
481 | spin_lock(&osb->osb_lock); | ||
229 | ocfs2_update_slot_info(si); | 482 | ocfs2_update_slot_info(si); |
230 | 483 | ||
231 | spin_lock(&si->si_lock); | ||
232 | /* search for ourselves first and take the slot if it already | 484 | /* search for ourselves first and take the slot if it already |
233 | * exists. Perhaps we need to mark this in a variable for our | 485 | * exists. Perhaps we need to mark this in a variable for our |
234 | * own journal recovery? Possibly not, though we certainly | 486 | * own journal recovery? Possibly not, though we certainly |
235 | * need to warn the user */ | 487 | * need to warn the user */ |
236 | slot = __ocfs2_node_num_to_slot(si, osb->node_num); | 488 | slot = __ocfs2_node_num_to_slot(si, osb->node_num); |
237 | if (slot == OCFS2_INVALID_SLOT) { | 489 | if (slot < 0) { |
238 | /* if no slot yet, then just take 1st available | 490 | /* if no slot yet, then just take 1st available |
239 | * one. */ | 491 | * one. */ |
240 | slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); | 492 | slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); |
241 | if (slot == OCFS2_INVALID_SLOT) { | 493 | if (slot < 0) { |
242 | spin_unlock(&si->si_lock); | 494 | spin_unlock(&osb->osb_lock); |
243 | mlog(ML_ERROR, "no free slots available!\n"); | 495 | mlog(ML_ERROR, "no free slots available!\n"); |
244 | status = -EINVAL; | 496 | status = -EINVAL; |
245 | goto bail; | 497 | goto bail; |
@@ -248,13 +500,13 @@ int ocfs2_find_slot(struct ocfs2_super *osb) | |||
248 | mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", | 500 | mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", |
249 | slot); | 501 | slot); |
250 | 502 | ||
251 | __ocfs2_fill_slot(si, slot, osb->node_num); | 503 | ocfs2_set_slot(si, slot, osb->node_num); |
252 | osb->slot_num = slot; | 504 | osb->slot_num = slot; |
253 | spin_unlock(&si->si_lock); | 505 | spin_unlock(&osb->osb_lock); |
254 | 506 | ||
255 | mlog(0, "taking node slot %d\n", osb->slot_num); | 507 | mlog(0, "taking node slot %d\n", osb->slot_num); |
256 | 508 | ||
257 | status = ocfs2_update_disk_slots(osb, si); | 509 | status = ocfs2_update_disk_slot(osb, si, osb->slot_num); |
258 | if (status < 0) | 510 | if (status < 0) |
259 | mlog_errno(status); | 511 | mlog_errno(status); |
260 | 512 | ||
@@ -265,27 +517,27 @@ bail: | |||
265 | 517 | ||
266 | void ocfs2_put_slot(struct ocfs2_super *osb) | 518 | void ocfs2_put_slot(struct ocfs2_super *osb) |
267 | { | 519 | { |
268 | int status; | 520 | int status, slot_num; |
269 | struct ocfs2_slot_info *si = osb->slot_info; | 521 | struct ocfs2_slot_info *si = osb->slot_info; |
270 | 522 | ||
271 | if (!si) | 523 | if (!si) |
272 | return; | 524 | return; |
273 | 525 | ||
526 | spin_lock(&osb->osb_lock); | ||
274 | ocfs2_update_slot_info(si); | 527 | ocfs2_update_slot_info(si); |
275 | 528 | ||
276 | spin_lock(&si->si_lock); | 529 | slot_num = osb->slot_num; |
277 | __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); | 530 | ocfs2_invalidate_slot(si, osb->slot_num); |
278 | osb->slot_num = OCFS2_INVALID_SLOT; | 531 | osb->slot_num = OCFS2_INVALID_SLOT; |
279 | spin_unlock(&si->si_lock); | 532 | spin_unlock(&osb->osb_lock); |
280 | 533 | ||
281 | status = ocfs2_update_disk_slots(osb, si); | 534 | status = ocfs2_update_disk_slot(osb, si, slot_num); |
282 | if (status < 0) { | 535 | if (status < 0) { |
283 | mlog_errno(status); | 536 | mlog_errno(status); |
284 | goto bail; | 537 | goto bail; |
285 | } | 538 | } |
286 | 539 | ||
287 | bail: | 540 | bail: |
288 | osb->slot_info = NULL; | 541 | ocfs2_free_slot_info(osb); |
289 | ocfs2_free_slot_info(si); | ||
290 | } | 542 | } |
291 | 543 | ||
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h index 1025872aaade..601c95fd7003 100644 --- a/fs/ocfs2/slot_map.h +++ b/fs/ocfs2/slot_map.h | |||
@@ -27,38 +27,18 @@ | |||
27 | #ifndef SLOTMAP_H | 27 | #ifndef SLOTMAP_H |
28 | #define SLOTMAP_H | 28 | #define SLOTMAP_H |
29 | 29 | ||
30 | struct ocfs2_slot_info { | ||
31 | spinlock_t si_lock; | ||
32 | |||
33 | struct inode *si_inode; | ||
34 | struct buffer_head *si_bh; | ||
35 | unsigned int si_num_slots; | ||
36 | unsigned int si_size; | ||
37 | s16 si_global_node_nums[OCFS2_MAX_SLOTS]; | ||
38 | }; | ||
39 | |||
40 | int ocfs2_init_slot_info(struct ocfs2_super *osb); | 30 | int ocfs2_init_slot_info(struct ocfs2_super *osb); |
41 | void ocfs2_free_slot_info(struct ocfs2_slot_info *si); | 31 | void ocfs2_free_slot_info(struct ocfs2_super *osb); |
42 | 32 | ||
43 | int ocfs2_find_slot(struct ocfs2_super *osb); | 33 | int ocfs2_find_slot(struct ocfs2_super *osb); |
44 | void ocfs2_put_slot(struct ocfs2_super *osb); | 34 | void ocfs2_put_slot(struct ocfs2_super *osb); |
45 | 35 | ||
46 | void ocfs2_update_slot_info(struct ocfs2_slot_info *si); | 36 | int ocfs2_refresh_slot_info(struct ocfs2_super *osb); |
47 | int ocfs2_update_disk_slots(struct ocfs2_super *osb, | ||
48 | struct ocfs2_slot_info *si); | ||
49 | |||
50 | s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, | ||
51 | s16 global); | ||
52 | void ocfs2_clear_slot(struct ocfs2_slot_info *si, | ||
53 | s16 slot_num); | ||
54 | 37 | ||
55 | static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, | 38 | int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num); |
56 | int slot_num) | 39 | int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num, |
57 | { | 40 | unsigned int *node_num); |
58 | BUG_ON(slot_num == OCFS2_INVALID_SLOT); | ||
59 | assert_spin_locked(&si->si_lock); | ||
60 | 41 | ||
61 | return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT; | 42 | int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num); |
62 | } | ||
63 | 43 | ||
64 | #endif | 44 | #endif |
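With struct ocfs2_slot_info now private to slot_map.c, callers deal only in ocfs2_super pointers and errno-style return values. A sketch of how a recovery path might use the new interface (the helper below is hypothetical and uses only functions declared in this header; ocfs2_node_num_to_slot() returns a negative value rather than OCFS2_INVALID_SLOT when the node holds no slot):

        static int example_release_slot_of(struct ocfs2_super *osb,
                                           unsigned int dead_node)
        {
                int slot = ocfs2_node_num_to_slot(osb, dead_node);

                if (slot < 0)
                        return 0;       /* node was not using a slot */

                /* Invalidate the in-memory slot and write that block back. */
                return ocfs2_clear_slot(osb, slot);
        }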
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c new file mode 100644 index 000000000000..ac1d74c63bf5 --- /dev/null +++ b/fs/ocfs2/stack_o2cb.c | |||
@@ -0,0 +1,420 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * stack_o2cb.c | ||
5 | * | ||
6 | * Code which interfaces ocfs2 with the o2cb stack. | ||
7 | * | ||
8 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation, version 2. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
17 | * General Public License for more details. | ||
18 | */ | ||
19 | |||
20 | #include <linux/crc32.h> | ||
21 | #include <linux/module.h> | ||
22 | |||
23 | /* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */ | ||
24 | #include <linux/fs.h> | ||
25 | |||
26 | #include "cluster/masklog.h" | ||
27 | #include "cluster/nodemanager.h" | ||
28 | #include "cluster/heartbeat.h" | ||
29 | |||
30 | #include "stackglue.h" | ||
31 | |||
32 | struct o2dlm_private { | ||
33 | struct dlm_eviction_cb op_eviction_cb; | ||
34 | }; | ||
35 | |||
36 | static struct ocfs2_stack_plugin o2cb_stack; | ||
37 | |||
38 | /* These should be identical */ | ||
39 | #if (DLM_LOCK_IV != LKM_IVMODE) | ||
40 | # error Lock modes do not match | ||
41 | #endif | ||
42 | #if (DLM_LOCK_NL != LKM_NLMODE) | ||
43 | # error Lock modes do not match | ||
44 | #endif | ||
45 | #if (DLM_LOCK_CR != LKM_CRMODE) | ||
46 | # error Lock modes do not match | ||
47 | #endif | ||
48 | #if (DLM_LOCK_CW != LKM_CWMODE) | ||
49 | # error Lock modes do not match | ||
50 | #endif | ||
51 | #if (DLM_LOCK_PR != LKM_PRMODE) | ||
52 | # error Lock modes do not match | ||
53 | #endif | ||
54 | #if (DLM_LOCK_PW != LKM_PWMODE) | ||
55 | # error Lock modes do not match | ||
56 | #endif | ||
57 | #if (DLM_LOCK_EX != LKM_EXMODE) | ||
58 | # error Lock modes do not match | ||
59 | #endif | ||
60 | static inline int mode_to_o2dlm(int mode) | ||
61 | { | ||
62 | BUG_ON(mode > LKM_MAXMODE); | ||
63 | |||
64 | return mode; | ||
65 | } | ||
66 | |||
67 | #define map_flag(_generic, _o2dlm) \ | ||
68 | if (flags & (_generic)) { \ | ||
69 | flags &= ~(_generic); \ | ||
70 | o2dlm_flags |= (_o2dlm); \ | ||
71 | } | ||
72 | static int flags_to_o2dlm(u32 flags) | ||
73 | { | ||
74 | int o2dlm_flags = 0; | ||
75 | |||
76 | map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE); | ||
77 | map_flag(DLM_LKF_CANCEL, LKM_CANCEL); | ||
78 | map_flag(DLM_LKF_CONVERT, LKM_CONVERT); | ||
79 | map_flag(DLM_LKF_VALBLK, LKM_VALBLK); | ||
80 | map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK); | ||
81 | map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN); | ||
82 | map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE); | ||
83 | map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT); | ||
84 | map_flag(DLM_LKF_LOCAL, LKM_LOCAL); | ||
85 | |||
86 | /* map_flag() should have cleared every flag passed in */ | ||
87 | BUG_ON(flags != 0); | ||
88 | |||
89 | return o2dlm_flags; | ||
90 | } | ||
91 | #undef map_flag | ||
92 | |||
93 | /* | ||
94 | * Map an o2dlm status to standard errno values. | ||
95 | * | ||
96 | * o2dlm only uses a handful of these, and returns even fewer to the | ||
97 | * caller. Still, we try to assign sane values to each error. | ||
98 | * | ||
99 | * The following value pairs have special meanings to dlmglue, thus | ||
100 | * the right hand side needs to stay unique - never duplicate the | ||
101 | * mapping elsewhere in the table! | ||
102 | * | ||
103 | * DLM_NORMAL: 0 | ||
104 | * DLM_NOTQUEUED: -EAGAIN | ||
105 | * DLM_CANCELGRANT: -EBUSY | ||
106 | * DLM_CANCEL: -DLM_ECANCEL | ||
107 | */ | ||
108 | /* Keep in sync with dlmapi.h */ | ||
109 | static int status_map[] = { | ||
110 | [DLM_NORMAL] = 0, /* Success */ | ||
111 | [DLM_GRANTED] = -EINVAL, | ||
112 | [DLM_DENIED] = -EACCES, | ||
113 | [DLM_DENIED_NOLOCKS] = -EACCES, | ||
114 | [DLM_WORKING] = -EACCES, | ||
115 | [DLM_BLOCKED] = -EINVAL, | ||
116 | [DLM_BLOCKED_ORPHAN] = -EINVAL, | ||
117 | [DLM_DENIED_GRACE_PERIOD] = -EACCES, | ||
118 | [DLM_SYSERR] = -ENOMEM, /* It is what it is */ | ||
119 | [DLM_NOSUPPORT] = -EPROTO, | ||
120 | [DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */ | ||
121 | [DLM_IVLOCKID] = -EINVAL, | ||
122 | [DLM_SYNC] = -EINVAL, | ||
123 | [DLM_BADTYPE] = -EINVAL, | ||
124 | [DLM_BADRESOURCE] = -EINVAL, | ||
125 | [DLM_MAXHANDLES] = -ENOMEM, | ||
126 | [DLM_NOCLINFO] = -EINVAL, | ||
127 | [DLM_NOLOCKMGR] = -EINVAL, | ||
128 | [DLM_NOPURGED] = -EINVAL, | ||
129 | [DLM_BADARGS] = -EINVAL, | ||
130 | [DLM_VOID] = -EINVAL, | ||
131 | [DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */ | ||
132 | [DLM_IVBUFLEN] = -EINVAL, | ||
133 | [DLM_CVTUNGRANT] = -EPERM, | ||
134 | [DLM_BADPARAM] = -EINVAL, | ||
135 | [DLM_VALNOTVALID] = -EINVAL, | ||
136 | [DLM_REJECTED] = -EPERM, | ||
137 | [DLM_ABORT] = -EINVAL, | ||
138 | [DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */ | ||
139 | [DLM_IVRESHANDLE] = -EINVAL, | ||
140 | [DLM_DEADLOCK] = -EDEADLK, | ||
141 | [DLM_DENIED_NOASTS] = -EINVAL, | ||
142 | [DLM_FORWARD] = -EINVAL, | ||
143 | [DLM_TIMEOUT] = -ETIMEDOUT, | ||
144 | [DLM_IVGROUPID] = -EINVAL, | ||
145 | [DLM_VERS_CONFLICT] = -EOPNOTSUPP, | ||
146 | [DLM_BAD_DEVICE_PATH] = -ENOENT, | ||
147 | [DLM_NO_DEVICE_PERMISSION] = -EPERM, | ||
148 | [DLM_NO_CONTROL_DEVICE] = -ENOENT, | ||
149 | [DLM_RECOVERING] = -ENOTCONN, | ||
150 | [DLM_MIGRATING] = -ERESTART, | ||
151 | [DLM_MAXSTATS] = -EINVAL, | ||
152 | }; | ||
153 | |||
154 | static int dlm_status_to_errno(enum dlm_status status) | ||
155 | { | ||
156 | BUG_ON(status >= (sizeof(status_map) / sizeof(status_map[0]))); | ||
157 | |||
158 | return status_map[status]; | ||
159 | } | ||
160 | |||
161 | static void o2dlm_lock_ast_wrapper(void *astarg) | ||
162 | { | ||
163 | BUG_ON(o2cb_stack.sp_proto == NULL); | ||
164 | |||
165 | o2cb_stack.sp_proto->lp_lock_ast(astarg); | ||
166 | } | ||
167 | |||
168 | static void o2dlm_blocking_ast_wrapper(void *astarg, int level) | ||
169 | { | ||
170 | BUG_ON(o2cb_stack.sp_proto == NULL); | ||
171 | |||
172 | o2cb_stack.sp_proto->lp_blocking_ast(astarg, level); | ||
173 | } | ||
174 | |||
175 | static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status) | ||
176 | { | ||
177 | int error = dlm_status_to_errno(status); | ||
178 | |||
179 | BUG_ON(o2cb_stack.sp_proto == NULL); | ||
180 | |||
181 | /* | ||
182 | * In o2dlm, you can get both the lock_ast() for the lock being | ||
183 | * granted and the unlock_ast() for the CANCEL failing. A | ||
184 | * successful cancel sends DLM_NORMAL here. If the | ||
185 | * lock grant happened before the cancel arrived, you get | ||
186 | * DLM_CANCELGRANT. | ||
187 | * | ||
188 | * There's no need for the double-ast. If we see DLM_CANCELGRANT, | ||
189 | * we just ignore it. We expect the lock_ast() to handle the | ||
190 | * granted lock. | ||
191 | */ | ||
192 | if (status == DLM_CANCELGRANT) | ||
193 | return; | ||
194 | |||
195 | o2cb_stack.sp_proto->lp_unlock_ast(astarg, error); | ||
196 | } | ||
197 | |||
198 | static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn, | ||
199 | int mode, | ||
200 | union ocfs2_dlm_lksb *lksb, | ||
201 | u32 flags, | ||
202 | void *name, | ||
203 | unsigned int namelen, | ||
204 | void *astarg) | ||
205 | { | ||
206 | enum dlm_status status; | ||
207 | int o2dlm_mode = mode_to_o2dlm(mode); | ||
208 | int o2dlm_flags = flags_to_o2dlm(flags); | ||
209 | int ret; | ||
210 | |||
211 | status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm, | ||
212 | o2dlm_flags, name, namelen, | ||
213 | o2dlm_lock_ast_wrapper, astarg, | ||
214 | o2dlm_blocking_ast_wrapper); | ||
215 | ret = dlm_status_to_errno(status); | ||
216 | return ret; | ||
217 | } | ||
218 | |||
219 | static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn, | ||
220 | union ocfs2_dlm_lksb *lksb, | ||
221 | u32 flags, | ||
222 | void *astarg) | ||
223 | { | ||
224 | enum dlm_status status; | ||
225 | int o2dlm_flags = flags_to_o2dlm(flags); | ||
226 | int ret; | ||
227 | |||
228 | status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm, | ||
229 | o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg); | ||
230 | ret = dlm_status_to_errno(status); | ||
231 | return ret; | ||
232 | } | ||
233 | |||
234 | static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb) | ||
235 | { | ||
236 | return dlm_status_to_errno(lksb->lksb_o2dlm.status); | ||
237 | } | ||
238 | |||
239 | static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb) | ||
240 | { | ||
241 | return (void *)(lksb->lksb_o2dlm.lvb); | ||
242 | } | ||
243 | |||
244 | static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb) | ||
245 | { | ||
246 | dlm_print_one_lock(lksb->lksb_o2dlm.lockid); | ||
247 | } | ||
248 | |||
249 | /* | ||
250 | * Called from the dlm when it's about to evict a node. This is how the | ||
251 | * classic stack signals node death. | ||
252 | */ | ||
253 | static void o2dlm_eviction_cb(int node_num, void *data) | ||
254 | { | ||
255 | struct ocfs2_cluster_connection *conn = data; | ||
256 | |||
257 | mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", | ||
258 | node_num, conn->cc_namelen, conn->cc_name); | ||
259 | |||
260 | conn->cc_recovery_handler(node_num, conn->cc_recovery_data); | ||
261 | } | ||
262 | |||
263 | static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) | ||
264 | { | ||
265 | int rc = 0; | ||
266 | u32 dlm_key; | ||
267 | struct dlm_ctxt *dlm; | ||
268 | struct o2dlm_private *priv; | ||
269 | struct dlm_protocol_version dlm_version; | ||
270 | |||
271 | BUG_ON(conn == NULL); | ||
272 | BUG_ON(o2cb_stack.sp_proto == NULL); | ||
273 | |||
274 | /* for now we only have one cluster/node, make sure we see it | ||
275 | * in the heartbeat universe */ | ||
276 | if (!o2hb_check_local_node_heartbeating()) { | ||
277 | rc = -EINVAL; | ||
278 | goto out; | ||
279 | } | ||
280 | |||
281 | priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL); | ||
282 | if (!priv) { | ||
283 | rc = -ENOMEM; | ||
284 | goto out_free; | ||
285 | } | ||
286 | |||
287 | /* This just fills the structure in. It is safe to pass conn. */ | ||
288 | dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb, | ||
289 | conn); | ||
290 | |||
291 | conn->cc_private = priv; | ||
292 | |||
293 | /* used by the dlm code to make message headers unique, each | ||
294 | * node in this domain must agree on this. */ | ||
295 | dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen); | ||
296 | dlm_version.pv_major = conn->cc_version.pv_major; | ||
297 | dlm_version.pv_minor = conn->cc_version.pv_minor; | ||
298 | |||
299 | dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version); | ||
300 | if (IS_ERR(dlm)) { | ||
301 | rc = PTR_ERR(dlm); | ||
302 | mlog_errno(rc); | ||
303 | goto out_free; | ||
304 | } | ||
305 | |||
306 | conn->cc_version.pv_major = dlm_version.pv_major; | ||
307 | conn->cc_version.pv_minor = dlm_version.pv_minor; | ||
308 | conn->cc_lockspace = dlm; | ||
309 | |||
310 | dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); | ||
311 | |||
312 | out_free: | ||
313 | if (rc && conn->cc_private) | ||
314 | kfree(conn->cc_private); | ||
315 | |||
316 | out: | ||
317 | return rc; | ||
318 | } | ||
319 | |||
320 | static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn, | ||
321 | int hangup_pending) | ||
322 | { | ||
323 | struct dlm_ctxt *dlm = conn->cc_lockspace; | ||
324 | struct o2dlm_private *priv = conn->cc_private; | ||
325 | |||
326 | dlm_unregister_eviction_cb(&priv->op_eviction_cb); | ||
327 | conn->cc_private = NULL; | ||
328 | kfree(priv); | ||
329 | |||
330 | dlm_unregister_domain(dlm); | ||
331 | conn->cc_lockspace = NULL; | ||
332 | |||
333 | return 0; | ||
334 | } | ||
335 | |||
336 | static void o2hb_stop(const char *group) | ||
337 | { | ||
338 | int ret; | ||
339 | char *argv[5], *envp[3]; | ||
340 | |||
341 | argv[0] = (char *)o2nm_get_hb_ctl_path(); | ||
342 | argv[1] = "-K"; | ||
343 | argv[2] = "-u"; | ||
344 | argv[3] = (char *)group; | ||
345 | argv[4] = NULL; | ||
346 | |||
347 | mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]); | ||
348 | |||
349 | /* minimal command environment taken from cpu_run_sbin_hotplug */ | ||
350 | envp[0] = "HOME=/"; | ||
351 | envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; | ||
352 | envp[2] = NULL; | ||
353 | |||
354 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); | ||
355 | if (ret < 0) | ||
356 | mlog_errno(ret); | ||
357 | } | ||
358 | |||
359 | /* | ||
360 | * Hangup is a hack for tools compatibility. Older ocfs2-tools software | ||
361 | * expects the filesystem to call "ocfs2_hb_ctl" during unmount. This | ||
362 | * happens regardless of whether the DLM got started, so we can't do it | ||
363 | * in ocfs2_cluster_disconnect(). We bring the o2hb_stop() function into | ||
364 | * the glue and provide a "hangup" API for super.c to call. | ||
365 | * | ||
366 | * Other stacks will eventually provide a NULL ->hangup() pointer. | ||
367 | */ | ||
368 | static void o2cb_cluster_hangup(const char *group, int grouplen) | ||
369 | { | ||
370 | o2hb_stop(group); | ||
371 | } | ||
372 | |||
373 | static int o2cb_cluster_this_node(unsigned int *node) | ||
374 | { | ||
375 | int node_num; | ||
376 | |||
377 | node_num = o2nm_this_node(); | ||
378 | if (node_num == O2NM_INVALID_NODE_NUM) | ||
379 | return -ENOENT; | ||
380 | |||
381 | if (node_num >= O2NM_MAX_NODES) | ||
382 | return -EOVERFLOW; | ||
383 | |||
384 | *node = node_num; | ||
385 | return 0; | ||
386 | } | ||
387 | |||
388 | struct ocfs2_stack_operations o2cb_stack_ops = { | ||
389 | .connect = o2cb_cluster_connect, | ||
390 | .disconnect = o2cb_cluster_disconnect, | ||
391 | .hangup = o2cb_cluster_hangup, | ||
392 | .this_node = o2cb_cluster_this_node, | ||
393 | .dlm_lock = o2cb_dlm_lock, | ||
394 | .dlm_unlock = o2cb_dlm_unlock, | ||
395 | .lock_status = o2cb_dlm_lock_status, | ||
396 | .lock_lvb = o2cb_dlm_lvb, | ||
397 | .dump_lksb = o2cb_dump_lksb, | ||
398 | }; | ||
399 | |||
400 | static struct ocfs2_stack_plugin o2cb_stack = { | ||
401 | .sp_name = "o2cb", | ||
402 | .sp_ops = &o2cb_stack_ops, | ||
403 | .sp_owner = THIS_MODULE, | ||
404 | }; | ||
405 | |||
406 | static int __init o2cb_stack_init(void) | ||
407 | { | ||
408 | return ocfs2_stack_glue_register(&o2cb_stack); | ||
409 | } | ||
410 | |||
411 | static void __exit o2cb_stack_exit(void) | ||
412 | { | ||
413 | ocfs2_stack_glue_unregister(&o2cb_stack); | ||
414 | } | ||
415 | |||
416 | MODULE_AUTHOR("Oracle"); | ||
417 | MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack"); | ||
418 | MODULE_LICENSE("GPL"); | ||
419 | module_init(o2cb_stack_init); | ||
420 | module_exit(o2cb_stack_exit); | ||
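The translation helpers in stack_o2cb.c are the heart of the plugin: generic fs/dlm-style DLM_LKF_* flags become o2dlm LKM_* flags, and o2dlm status codes become negative errnos, with the four dlmglue-visible pairs kept unique. A small self-check sketch that could live in this file (illustrative only, using just the helpers and constants already referenced above):

        static int __maybe_unused o2cb_check_mappings(void)
        {
                /* A trylock with a value block maps to the matching LKM bits. */
                int o2dlm_flags = flags_to_o2dlm(DLM_LKF_NOQUEUE | DLM_LKF_VALBLK);

                BUG_ON(!(o2dlm_flags & LKM_NOQUEUE));
                BUG_ON(!(o2dlm_flags & LKM_VALBLK));

                /* A failed trylock surfaces to dlmglue as -EAGAIN. */
                return dlm_status_to_errno(DLM_NOTQUEUED);
        }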
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c new file mode 100644 index 000000000000..7428663f9cbb --- /dev/null +++ b/fs/ocfs2/stack_user.c | |||
@@ -0,0 +1,883 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * stack_user.c | ||
5 | * | ||
6 | * Code which interfaces ocfs2 with fs/dlm and a userspace stack. | ||
7 | * | ||
8 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation, version 2. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
17 | * General Public License for more details. | ||
18 | */ | ||
19 | |||
20 | #include <linux/module.h> | ||
21 | #include <linux/fs.h> | ||
22 | #include <linux/miscdevice.h> | ||
23 | #include <linux/mutex.h> | ||
24 | #include <linux/reboot.h> | ||
25 | #include <asm/uaccess.h> | ||
26 | |||
27 | #include "ocfs2.h" /* For struct ocfs2_lock_res */ | ||
28 | #include "stackglue.h" | ||
29 | |||
30 | |||
31 | /* | ||
32 | * The control protocol starts with a handshake. Until the handshake | ||
33 | * is complete, the control device will fail all write(2)s. | ||
34 | * | ||
35 | * The handshake is simple. First, the client reads until EOF. Each line | ||
36 | * of output is a supported protocol tag. All protocol tags are a single | ||
37 | * character followed by a two hex digit version number. Currently the | ||
38 | * only things supported is T01, for "Text-base version 0x01". Next, the | ||
39 | * client writes the version they would like to use, including the newline. | ||
40 | * Thus, the protocol tag is 'T01\n'. If the version tag written is | ||
41 | * unknown, -EINVAL is returned. Once the negotiation is complete, the | ||
42 | * client can start sending messages. | ||
43 | * | ||
44 | * The T01 protocol has three messages. First is the "SETN" message. | ||
45 | * It has the following syntax: | ||
46 | * | ||
47 | * SETN<space><8-char-hex-nodenum><newline> | ||
48 | * | ||
49 | * This is 14 characters. | ||
50 | * | ||
51 | * The "SETN" message must be the first message following the protocol. | ||
52 | * It tells ocfs2_control the local node number. | ||
53 | * | ||
54 | * Next comes the "SETV" message. It has the following syntax: | ||
55 | * | ||
56 | * SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> | ||
57 | * | ||
58 | * This is 11 characters. | ||
59 | * | ||
60 | * The "SETV" message sets the filesystem locking protocol version as | ||
61 | * negotiated by the client. The client negotiates based on the maximum | ||
62 | * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major | ||
63 | * number from the "SETV" message must match | ||
64 | * user_stack.sp_proto->lp_max_version.pv_major, and the minor number | ||
65 | * must be less than or equal to ...->lp_max_version.pv_minor. | ||
66 | * | ||
67 | * Once this information has been set, mounts will be allowed. From this | ||
68 | * point on, the "DOWN" message can be sent for node down notification. | ||
69 | * It has the following syntax: | ||
70 | * | ||
71 | * DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> | ||
72 | * | ||
73 | * eg: | ||
74 | * | ||
75 | * DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n | ||
76 | * | ||
77 | * This is 47 characters. | ||
78 | */ | ||
79 | |||
80 | /* | ||
81 | * Whether or not the client has done the handshake. | ||
82 | * For now, we have just one protocol version. | ||
83 | */ | ||
84 | #define OCFS2_CONTROL_PROTO "T01\n" | ||
85 | #define OCFS2_CONTROL_PROTO_LEN 4 | ||
86 | |||
87 | /* Handshake states */ | ||
88 | #define OCFS2_CONTROL_HANDSHAKE_INVALID (0) | ||
89 | #define OCFS2_CONTROL_HANDSHAKE_READ (1) | ||
90 | #define OCFS2_CONTROL_HANDSHAKE_PROTOCOL (2) | ||
91 | #define OCFS2_CONTROL_HANDSHAKE_VALID (3) | ||
92 | |||
93 | /* Messages */ | ||
94 | #define OCFS2_CONTROL_MESSAGE_OP_LEN 4 | ||
95 | #define OCFS2_CONTROL_MESSAGE_SETNODE_OP "SETN" | ||
96 | #define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN 14 | ||
97 | #define OCFS2_CONTROL_MESSAGE_SETVERSION_OP "SETV" | ||
98 | #define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN 11 | ||
99 | #define OCFS2_CONTROL_MESSAGE_DOWN_OP "DOWN" | ||
100 | #define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN 47 | ||
101 | #define OCFS2_TEXT_UUID_LEN 32 | ||
102 | #define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2 | ||
103 | #define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8 | ||
104 | |||
105 | /* | ||
106 | * ocfs2_live_connection is refcounted because the filesystem and | ||
107 | * miscdevice sides can detach in different order. Let's just be safe. | ||
108 | */ | ||
109 | struct ocfs2_live_connection { | ||
110 | struct list_head oc_list; | ||
111 | struct ocfs2_cluster_connection *oc_conn; | ||
112 | }; | ||
113 | |||
114 | struct ocfs2_control_private { | ||
115 | struct list_head op_list; | ||
116 | int op_state; | ||
117 | int op_this_node; | ||
118 | struct ocfs2_protocol_version op_proto; | ||
119 | }; | ||
120 | |||
121 | /* SETN<space><8-char-hex-nodenum><newline> */ | ||
122 | struct ocfs2_control_message_setn { | ||
123 | char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; | ||
124 | char space; | ||
125 | char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN]; | ||
126 | char newline; | ||
127 | }; | ||
128 | |||
129 | /* SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> */ | ||
130 | struct ocfs2_control_message_setv { | ||
131 | char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; | ||
132 | char space1; | ||
133 | char major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN]; | ||
134 | char space2; | ||
135 | char minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN]; | ||
136 | char newline; | ||
137 | }; | ||
138 | |||
139 | /* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */ | ||
140 | struct ocfs2_control_message_down { | ||
141 | char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; | ||
142 | char space1; | ||
143 | char uuid[OCFS2_TEXT_UUID_LEN]; | ||
144 | char space2; | ||
145 | char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN]; | ||
146 | char newline; | ||
147 | }; | ||
148 | |||
149 | union ocfs2_control_message { | ||
150 | char tag[OCFS2_CONTROL_MESSAGE_OP_LEN]; | ||
151 | struct ocfs2_control_message_setn u_setn; | ||
152 | struct ocfs2_control_message_setv u_setv; | ||
153 | struct ocfs2_control_message_down u_down; | ||
154 | }; | ||
155 | |||
156 | static struct ocfs2_stack_plugin user_stack; | ||
157 | |||
158 | static atomic_t ocfs2_control_opened; | ||
159 | static int ocfs2_control_this_node = -1; | ||
160 | static struct ocfs2_protocol_version running_proto; | ||
161 | |||
162 | static LIST_HEAD(ocfs2_live_connection_list); | ||
163 | static LIST_HEAD(ocfs2_control_private_list); | ||
164 | static DEFINE_MUTEX(ocfs2_control_lock); | ||
165 | |||
166 | static inline void ocfs2_control_set_handshake_state(struct file *file, | ||
167 | int state) | ||
168 | { | ||
169 | struct ocfs2_control_private *p = file->private_data; | ||
170 | p->op_state = state; | ||
171 | } | ||
172 | |||
173 | static inline int ocfs2_control_get_handshake_state(struct file *file) | ||
174 | { | ||
175 | struct ocfs2_control_private *p = file->private_data; | ||
176 | return p->op_state; | ||
177 | } | ||
178 | |||
179 | static struct ocfs2_live_connection *ocfs2_connection_find(const char *name) | ||
180 | { | ||
181 | size_t len = strlen(name); | ||
182 | struct ocfs2_live_connection *c; | ||
183 | |||
184 | BUG_ON(!mutex_is_locked(&ocfs2_control_lock)); | ||
185 | |||
186 | list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) { | ||
187 | if ((c->oc_conn->cc_namelen == len) && | ||
188 | !strncmp(c->oc_conn->cc_name, name, len)) | ||
189 | return c; | ||
190 | } | ||
191 | |||
192 | return NULL; | ||
193 | } | ||
194 | |||
195 | /* | ||
196 | * ocfs2_live_connection structures are created underneath the ocfs2 | ||
197 | * mount path. Since the VFS prevents multiple calls to | ||
198 | * fill_super(), we can't get dupes here. | ||
199 | */ | ||
200 | static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn, | ||
201 | struct ocfs2_live_connection **c_ret) | ||
202 | { | ||
203 | int rc = 0; | ||
204 | struct ocfs2_live_connection *c; | ||
205 | |||
206 | c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); | ||
207 | if (!c) | ||
208 | return -ENOMEM; | ||
209 | |||
210 | mutex_lock(&ocfs2_control_lock); | ||
211 | c->oc_conn = conn; | ||
212 | |||
213 | if (atomic_read(&ocfs2_control_opened)) | ||
214 | list_add(&c->oc_list, &ocfs2_live_connection_list); | ||
215 | else { | ||
216 | printk(KERN_ERR | ||
217 | "ocfs2: Userspace control daemon is not present\n"); | ||
218 | rc = -ESRCH; | ||
219 | } | ||
220 | |||
221 | mutex_unlock(&ocfs2_control_lock); | ||
222 | |||
223 | if (!rc) | ||
224 | *c_ret = c; | ||
225 | else | ||
226 | kfree(c); | ||
227 | |||
228 | return rc; | ||
229 | } | ||
230 | |||
231 | /* | ||
232 | * This function disconnects the cluster connection from ocfs2_control. | ||
233 | * Afterwards, userspace can't affect the cluster connection. | ||
234 | */ | ||
235 | static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c) | ||
236 | { | ||
237 | mutex_lock(&ocfs2_control_lock); | ||
238 | list_del_init(&c->oc_list); | ||
239 | c->oc_conn = NULL; | ||
240 | mutex_unlock(&ocfs2_control_lock); | ||
241 | |||
242 | kfree(c); | ||
243 | } | ||
244 | |||
245 | static int ocfs2_control_cfu(void *target, size_t target_len, | ||
246 | const char __user *buf, size_t count) | ||
247 | { | ||
248 | /* The T01 expects write(2) calls to have exactly one command */ | ||
249 | if ((count != target_len) || | ||
250 | (count > sizeof(union ocfs2_control_message))) | ||
251 | return -EINVAL; | ||
252 | |||
253 | if (copy_from_user(target, buf, target_len)) | ||
254 | return -EFAULT; | ||
255 | |||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | static ssize_t ocfs2_control_validate_protocol(struct file *file, | ||
260 | const char __user *buf, | ||
261 | size_t count) | ||
262 | { | ||
263 | ssize_t ret; | ||
264 | char kbuf[OCFS2_CONTROL_PROTO_LEN]; | ||
265 | |||
266 | ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN, | ||
267 | buf, count); | ||
268 | if (ret) | ||
269 | return ret; | ||
270 | |||
271 | if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN)) | ||
272 | return -EINVAL; | ||
273 | |||
274 | ocfs2_control_set_handshake_state(file, | ||
275 | OCFS2_CONTROL_HANDSHAKE_PROTOCOL); | ||
276 | |||
277 | return count; | ||
278 | } | ||
279 | |||
280 | static void ocfs2_control_send_down(const char *uuid, | ||
281 | int nodenum) | ||
282 | { | ||
283 | struct ocfs2_live_connection *c; | ||
284 | |||
285 | mutex_lock(&ocfs2_control_lock); | ||
286 | |||
287 | c = ocfs2_connection_find(uuid); | ||
288 | if (c) { | ||
289 | BUG_ON(c->oc_conn == NULL); | ||
290 | c->oc_conn->cc_recovery_handler(nodenum, | ||
291 | c->oc_conn->cc_recovery_data); | ||
292 | } | ||
293 | |||
294 | mutex_unlock(&ocfs2_control_lock); | ||
295 | } | ||
296 | |||
297 | /* | ||
298 | * Called whenever configuration elements are sent to /dev/ocfs2_control. | ||
299 | * If all configuration elements are present, try to set the global | ||
300 | * values. If there is a problem, return an error. Skip any missing | ||
301 | * elements, and only bump ocfs2_control_opened when we have all elements | ||
302 | * and are successful. | ||
303 | */ | ||
304 | static int ocfs2_control_install_private(struct file *file) | ||
305 | { | ||
306 | int rc = 0; | ||
307 | int set_p = 1; | ||
308 | struct ocfs2_control_private *p = file->private_data; | ||
309 | |||
310 | BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL); | ||
311 | |||
312 | mutex_lock(&ocfs2_control_lock); | ||
313 | |||
314 | if (p->op_this_node < 0) { | ||
315 | set_p = 0; | ||
316 | } else if ((ocfs2_control_this_node >= 0) && | ||
317 | (ocfs2_control_this_node != p->op_this_node)) { | ||
318 | rc = -EINVAL; | ||
319 | goto out_unlock; | ||
320 | } | ||
321 | |||
322 | if (!p->op_proto.pv_major) { | ||
323 | set_p = 0; | ||
324 | } else if (!list_empty(&ocfs2_live_connection_list) && | ||
325 | ((running_proto.pv_major != p->op_proto.pv_major) || | ||
326 | (running_proto.pv_minor != p->op_proto.pv_minor))) { | ||
327 | rc = -EINVAL; | ||
328 | goto out_unlock; | ||
329 | } | ||
330 | |||
331 | if (set_p) { | ||
332 | ocfs2_control_this_node = p->op_this_node; | ||
333 | running_proto.pv_major = p->op_proto.pv_major; | ||
334 | running_proto.pv_minor = p->op_proto.pv_minor; | ||
335 | } | ||
336 | |||
337 | out_unlock: | ||
338 | mutex_unlock(&ocfs2_control_lock); | ||
339 | |||
340 | if (!rc && set_p) { | ||
341 | /* We set the global values successfully */ | ||
342 | atomic_inc(&ocfs2_control_opened); | ||
343 | ocfs2_control_set_handshake_state(file, | ||
344 | OCFS2_CONTROL_HANDSHAKE_VALID); | ||
345 | } | ||
346 | |||
347 | return rc; | ||
348 | } | ||
349 | |||
350 | static int ocfs2_control_get_this_node(void) | ||
351 | { | ||
352 | int rc; | ||
353 | |||
354 | mutex_lock(&ocfs2_control_lock); | ||
355 | if (ocfs2_control_this_node < 0) | ||
356 | rc = -EINVAL; | ||
357 | else | ||
358 | rc = ocfs2_control_this_node; | ||
359 | mutex_unlock(&ocfs2_control_lock); | ||
360 | |||
361 | return rc; | ||
362 | } | ||
363 | |||
364 | static int ocfs2_control_do_setnode_msg(struct file *file, | ||
365 | struct ocfs2_control_message_setn *msg) | ||
366 | { | ||
367 | long nodenum; | ||
368 | char *ptr = NULL; | ||
369 | struct ocfs2_control_private *p = file->private_data; | ||
370 | |||
371 | if (ocfs2_control_get_handshake_state(file) != | ||
372 | OCFS2_CONTROL_HANDSHAKE_PROTOCOL) | ||
373 | return -EINVAL; | ||
374 | |||
375 | if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP, | ||
376 | OCFS2_CONTROL_MESSAGE_OP_LEN)) | ||
377 | return -EINVAL; | ||
378 | |||
379 | if ((msg->space != ' ') || (msg->newline != '\n')) | ||
380 | return -EINVAL; | ||
381 | msg->space = msg->newline = '\0'; | ||
382 | |||
383 | nodenum = simple_strtol(msg->nodestr, &ptr, 16); | ||
384 | if (!ptr || *ptr) | ||
385 | return -EINVAL; | ||
386 | |||
387 | if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || | ||
388 | (nodenum > INT_MAX) || (nodenum < 0)) | ||
389 | return -ERANGE; | ||
390 | p->op_this_node = nodenum; | ||
391 | |||
392 | return ocfs2_control_install_private(file); | ||
393 | } | ||
394 | |||
395 | static int ocfs2_control_do_setversion_msg(struct file *file, | ||
396 | struct ocfs2_control_message_setv *msg) | ||
397 | { | ||
398 | long major, minor; | ||
399 | char *ptr = NULL; | ||
400 | struct ocfs2_control_private *p = file->private_data; | ||
401 | struct ocfs2_protocol_version *max = | ||
402 | &user_stack.sp_proto->lp_max_version; | ||
403 | |||
404 | if (ocfs2_control_get_handshake_state(file) != | ||
405 | OCFS2_CONTROL_HANDSHAKE_PROTOCOL) | ||
406 | return -EINVAL; | ||
407 | |||
408 | if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP, | ||
409 | OCFS2_CONTROL_MESSAGE_OP_LEN)) | ||
410 | return -EINVAL; | ||
411 | |||
412 | if ((msg->space1 != ' ') || (msg->space2 != ' ') || | ||
413 | (msg->newline != '\n')) | ||
414 | return -EINVAL; | ||
415 | msg->space1 = msg->space2 = msg->newline = '\0'; | ||
416 | |||
417 | major = simple_strtol(msg->major, &ptr, 16); | ||
418 | if (!ptr || *ptr) | ||
419 | return -EINVAL; | ||
420 | minor = simple_strtol(msg->minor, &ptr, 16); | ||
421 | if (!ptr || *ptr) | ||
422 | return -EINVAL; | ||
423 | |||
424 | /* | ||
425 | * The major must be between 1 and 255, inclusive. The minor | ||
426 | * must be between 0 and 255, inclusive. The version passed in | ||
427 | * must be within the maximum version supported by the filesystem. | ||
428 | */ | ||
429 | if ((major == LONG_MIN) || (major == LONG_MAX) || | ||
430 | (major > (u8)-1) || (major < 1)) | ||
431 | return -ERANGE; | ||
432 | if ((minor == LONG_MIN) || (minor == LONG_MAX) || | ||
433 | (minor > (u8)-1) || (minor < 0)) | ||
434 | return -ERANGE; | ||
435 | if ((major != max->pv_major) || | ||
436 | (minor > max->pv_minor)) | ||
437 | return -EINVAL; | ||
438 | |||
439 | p->op_proto.pv_major = major; | ||
440 | p->op_proto.pv_minor = minor; | ||
441 | |||
442 | return ocfs2_control_install_private(file); | ||
443 | } | ||
444 | |||
445 | static int ocfs2_control_do_down_msg(struct file *file, | ||
446 | struct ocfs2_control_message_down *msg) | ||
447 | { | ||
448 | long nodenum; | ||
449 | char *p = NULL; | ||
450 | |||
451 | if (ocfs2_control_get_handshake_state(file) != | ||
452 | OCFS2_CONTROL_HANDSHAKE_VALID) | ||
453 | return -EINVAL; | ||
454 | |||
455 | if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP, | ||
456 | OCFS2_CONTROL_MESSAGE_OP_LEN)) | ||
457 | return -EINVAL; | ||
458 | |||
459 | if ((msg->space1 != ' ') || (msg->space2 != ' ') || | ||
460 | (msg->newline != '\n')) | ||
461 | return -EINVAL; | ||
462 | msg->space1 = msg->space2 = msg->newline = '\0'; | ||
463 | |||
464 | nodenum = simple_strtol(msg->nodestr, &p, 16); | ||
465 | if (!p || *p) | ||
466 | return -EINVAL; | ||
467 | |||
468 | if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) || | ||
469 | (nodenum > INT_MAX) || (nodenum < 0)) | ||
470 | return -ERANGE; | ||
471 | |||
472 | ocfs2_control_send_down(msg->uuid, nodenum); | ||
473 | |||
474 | return 0; | ||
475 | } | ||
476 | |||
477 | static ssize_t ocfs2_control_message(struct file *file, | ||
478 | const char __user *buf, | ||
479 | size_t count) | ||
480 | { | ||
481 | ssize_t ret; | ||
482 | union ocfs2_control_message msg; | ||
483 | |||
484 | /* Try to catch padding issues */ | ||
485 | WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) != | ||
486 | (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1))); | ||
487 | |||
488 | memset(&msg, 0, sizeof(union ocfs2_control_message)); | ||
489 | ret = ocfs2_control_cfu(&msg, count, buf, count); | ||
490 | if (ret) | ||
491 | goto out; | ||
492 | |||
493 | if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) && | ||
494 | !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP, | ||
495 | OCFS2_CONTROL_MESSAGE_OP_LEN)) | ||
496 | ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn); | ||
497 | else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) && | ||
498 | !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP, | ||
499 | OCFS2_CONTROL_MESSAGE_OP_LEN)) | ||
500 | ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv); | ||
501 | else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) && | ||
502 | !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP, | ||
503 | OCFS2_CONTROL_MESSAGE_OP_LEN)) | ||
504 | ret = ocfs2_control_do_down_msg(file, &msg.u_down); | ||
505 | else | ||
506 | ret = -EINVAL; | ||
507 | |||
508 | out: | ||
509 | return ret ? ret : count; | ||
510 | } | ||
511 | |||
512 | static ssize_t ocfs2_control_write(struct file *file, | ||
513 | const char __user *buf, | ||
514 | size_t count, | ||
515 | loff_t *ppos) | ||
516 | { | ||
517 | ssize_t ret; | ||
518 | |||
519 | switch (ocfs2_control_get_handshake_state(file)) { | ||
520 | case OCFS2_CONTROL_HANDSHAKE_INVALID: | ||
521 | ret = -EINVAL; | ||
522 | break; | ||
523 | |||
524 | case OCFS2_CONTROL_HANDSHAKE_READ: | ||
525 | ret = ocfs2_control_validate_protocol(file, buf, | ||
526 | count); | ||
527 | break; | ||
528 | |||
529 | case OCFS2_CONTROL_HANDSHAKE_PROTOCOL: | ||
530 | case OCFS2_CONTROL_HANDSHAKE_VALID: | ||
531 | ret = ocfs2_control_message(file, buf, count); | ||
532 | break; | ||
533 | |||
534 | default: | ||
535 | BUG(); | ||
536 | ret = -EIO; | ||
537 | break; | ||
538 | } | ||
539 | |||
540 | return ret; | ||
541 | } | ||
542 | |||
543 | /* | ||
544 | * This is a naive version. If we ever have a new protocol, we'll expand | ||
545 | * it. Probably using seq_file. | ||
546 | */ | ||
547 | static ssize_t ocfs2_control_read(struct file *file, | ||
548 | char __user *buf, | ||
549 | size_t count, | ||
550 | loff_t *ppos) | ||
551 | { | ||
552 | char *proto_string = OCFS2_CONTROL_PROTO; | ||
553 | size_t to_write = 0; | ||
554 | |||
555 | if (*ppos >= OCFS2_CONTROL_PROTO_LEN) | ||
556 | return 0; | ||
557 | |||
558 | to_write = OCFS2_CONTROL_PROTO_LEN - *ppos; | ||
559 | if (to_write > count) | ||
560 | to_write = count; | ||
561 | if (copy_to_user(buf, proto_string + *ppos, to_write)) | ||
562 | return -EFAULT; | ||
563 | |||
564 | *ppos += to_write; | ||
565 | |||
566 | /* Have we read the whole protocol list? */ | ||
567 | if (*ppos >= OCFS2_CONTROL_PROTO_LEN) | ||
568 | ocfs2_control_set_handshake_state(file, | ||
569 | OCFS2_CONTROL_HANDSHAKE_READ); | ||
570 | |||
571 | return to_write; | ||
572 | } | ||
573 | |||
574 | static int ocfs2_control_release(struct inode *inode, struct file *file) | ||
575 | { | ||
576 | struct ocfs2_control_private *p = file->private_data; | ||
577 | |||
578 | mutex_lock(&ocfs2_control_lock); | ||
579 | |||
580 | if (ocfs2_control_get_handshake_state(file) != | ||
581 | OCFS2_CONTROL_HANDSHAKE_VALID) | ||
582 | goto out; | ||
583 | |||
584 | if (atomic_dec_and_test(&ocfs2_control_opened)) { | ||
585 | if (!list_empty(&ocfs2_live_connection_list)) { | ||
586 | /* XXX: Do bad things! */ | ||
587 | printk(KERN_ERR | ||
588 | "ocfs2: Unexpected release of ocfs2_control!\n" | ||
589 | " Loss of cluster connection requires " | ||
590 | "an emergency restart!\n"); | ||
591 | emergency_restart(); | ||
592 | } | ||
593 | /* | ||
594 | * Last valid close clears the node number and resets | ||
595 | * the locking protocol version | ||
596 | */ | ||
597 | ocfs2_control_this_node = -1; | ||
598 | running_proto.pv_major = 0; | ||
599 | running_proto.pv_minor = 0; | ||
600 | } | ||
601 | |||
602 | out: | ||
603 | list_del_init(&p->op_list); | ||
604 | file->private_data = NULL; | ||
605 | |||
606 | mutex_unlock(&ocfs2_control_lock); | ||
607 | |||
608 | kfree(p); | ||
609 | |||
610 | return 0; | ||
611 | } | ||
612 | |||
613 | static int ocfs2_control_open(struct inode *inode, struct file *file) | ||
614 | { | ||
615 | struct ocfs2_control_private *p; | ||
616 | |||
617 | p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL); | ||
618 | if (!p) | ||
619 | return -ENOMEM; | ||
620 | p->op_this_node = -1; | ||
621 | |||
622 | mutex_lock(&ocfs2_control_lock); | ||
623 | file->private_data = p; | ||
624 | list_add(&p->op_list, &ocfs2_control_private_list); | ||
625 | mutex_unlock(&ocfs2_control_lock); | ||
626 | |||
627 | return 0; | ||
628 | } | ||
629 | |||
630 | static const struct file_operations ocfs2_control_fops = { | ||
631 | .open = ocfs2_control_open, | ||
632 | .release = ocfs2_control_release, | ||
633 | .read = ocfs2_control_read, | ||
634 | .write = ocfs2_control_write, | ||
635 | .owner = THIS_MODULE, | ||
636 | }; | ||
637 | |||
638 | struct miscdevice ocfs2_control_device = { | ||
639 | .minor = MISC_DYNAMIC_MINOR, | ||
640 | .name = "ocfs2_control", | ||
641 | .fops = &ocfs2_control_fops, | ||
642 | }; | ||
643 | |||
644 | static int ocfs2_control_init(void) | ||
645 | { | ||
646 | int rc; | ||
647 | |||
648 | atomic_set(&ocfs2_control_opened, 0); | ||
649 | |||
650 | rc = misc_register(&ocfs2_control_device); | ||
651 | if (rc) | ||
652 | printk(KERN_ERR | ||
653 | "ocfs2: Unable to register ocfs2_control device " | ||
654 | "(errno %d)\n", | ||
655 | -rc); | ||
656 | |||
657 | return rc; | ||
658 | } | ||
659 | |||
660 | static void ocfs2_control_exit(void) | ||
661 | { | ||
662 | int rc; | ||
663 | |||
664 | rc = misc_deregister(&ocfs2_control_device); | ||
665 | if (rc) | ||
666 | printk(KERN_ERR | ||
667 | "ocfs2: Unable to deregister ocfs2_control device " | ||
668 | "(errno %d)\n", | ||
669 | -rc); | ||
670 | } | ||
671 | |||
672 | static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg) | ||
673 | { | ||
674 | struct ocfs2_lock_res *res = astarg; | ||
675 | return &res->l_lksb.lksb_fsdlm; | ||
676 | } | ||
677 | |||
678 | static void fsdlm_lock_ast_wrapper(void *astarg) | ||
679 | { | ||
680 | struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg); | ||
681 | int status = lksb->sb_status; | ||
682 | |||
683 | BUG_ON(user_stack.sp_proto == NULL); | ||
684 | |||
685 | /* | ||
686 | * For now we're punting on the issue of other non-standard errors | ||
687 | * where we can't tell if the unlock_ast or lock_ast should be called. | ||
688 | * The main "other error" that's possible is EINVAL which means the | ||
689 | * function was called with invalid args, which shouldn't be possible | ||
690 | * since the caller here is under our control. Other non-standard | ||
691 | * errors probably fall into the same category, or otherwise are fatal | ||
692 | * which means we can't carry on anyway. | ||
693 | */ | ||
694 | |||
695 | if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL) | ||
696 | user_stack.sp_proto->lp_unlock_ast(astarg, 0); | ||
697 | else | ||
698 | user_stack.sp_proto->lp_lock_ast(astarg); | ||
699 | } | ||
700 | |||
701 | static void fsdlm_blocking_ast_wrapper(void *astarg, int level) | ||
702 | { | ||
703 | BUG_ON(user_stack.sp_proto == NULL); | ||
704 | |||
705 | user_stack.sp_proto->lp_blocking_ast(astarg, level); | ||
706 | } | ||
707 | |||
708 | static int user_dlm_lock(struct ocfs2_cluster_connection *conn, | ||
709 | int mode, | ||
710 | union ocfs2_dlm_lksb *lksb, | ||
711 | u32 flags, | ||
712 | void *name, | ||
713 | unsigned int namelen, | ||
714 | void *astarg) | ||
715 | { | ||
716 | int ret; | ||
717 | |||
718 | if (!lksb->lksb_fsdlm.sb_lvbptr) | ||
719 | lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb + | ||
720 | sizeof(struct dlm_lksb); | ||
721 | |||
722 | ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm, | ||
723 | flags|DLM_LKF_NODLCKWT, name, namelen, 0, | ||
724 | fsdlm_lock_ast_wrapper, astarg, | ||
725 | fsdlm_blocking_ast_wrapper); | ||
726 | return ret; | ||
727 | } | ||
728 | |||
729 | static int user_dlm_unlock(struct ocfs2_cluster_connection *conn, | ||
730 | union ocfs2_dlm_lksb *lksb, | ||
731 | u32 flags, | ||
732 | void *astarg) | ||
733 | { | ||
734 | int ret; | ||
735 | |||
736 | ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid, | ||
737 | flags, &lksb->lksb_fsdlm, astarg); | ||
738 | return ret; | ||
739 | } | ||
740 | |||
741 | static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb) | ||
742 | { | ||
743 | return lksb->lksb_fsdlm.sb_status; | ||
744 | } | ||
745 | |||
746 | static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb) | ||
747 | { | ||
748 | return (void *)(lksb->lksb_fsdlm.sb_lvbptr); | ||
749 | } | ||
750 | |||
751 | static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) | ||
752 | { | ||
753 | } | ||
754 | |||
755 | /* | ||
756 | * Compare a requested locking protocol version against the current one. | ||
757 | * | ||
758 | * If the major numbers are different, they are incompatible. | ||
759 | * If the current minor is greater than the request, they are incompatible. | ||
760 | * If the current minor is less than or equal to the request, they are | ||
761 | * compatible, and the requester should run at the current minor version. | ||
762 | */ | ||
763 | static int fs_protocol_compare(struct ocfs2_protocol_version *existing, | ||
764 | struct ocfs2_protocol_version *request) | ||
765 | { | ||
766 | if (existing->pv_major != request->pv_major) | ||
767 | return 1; | ||
768 | |||
769 | if (existing->pv_minor > request->pv_minor) | ||
770 | return 1; | ||
771 | |||
772 | if (existing->pv_minor < request->pv_minor) | ||
773 | request->pv_minor = existing->pv_minor; | ||
774 | |||
775 | return 0; | ||
776 | } | ||
777 | |||
778 | static int user_cluster_connect(struct ocfs2_cluster_connection *conn) | ||
779 | { | ||
780 | dlm_lockspace_t *fsdlm; | ||
781 | struct ocfs2_live_connection *control; | ||
782 | int rc = 0; | ||
783 | |||
784 | BUG_ON(conn == NULL); | ||
785 | |||
786 | rc = ocfs2_live_connection_new(conn, &control); | ||
787 | if (rc) | ||
788 | goto out; | ||
789 | |||
790 | /* | ||
791 | * running_proto must have been set before we allowed any mounts | ||
792 | * to proceed. | ||
793 | */ | ||
794 | if (fs_protocol_compare(&running_proto, &conn->cc_version)) { | ||
795 | printk(KERN_ERR | ||
796 | "Unable to mount with fs locking protocol version " | ||
797 | "%u.%u because the userspace control daemon has " | ||
798 | "negotiated %u.%u\n", | ||
799 | conn->cc_version.pv_major, conn->cc_version.pv_minor, | ||
800 | running_proto.pv_major, running_proto.pv_minor); | ||
801 | rc = -EPROTO; | ||
802 | ocfs2_live_connection_drop(control); | ||
803 | goto out; | ||
804 | } | ||
805 | |||
806 | rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name), | ||
807 | &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN); | ||
808 | if (rc) { | ||
809 | ocfs2_live_connection_drop(control); | ||
810 | goto out; | ||
811 | } | ||
812 | |||
813 | conn->cc_private = control; | ||
814 | conn->cc_lockspace = fsdlm; | ||
815 | out: | ||
816 | return rc; | ||
817 | } | ||
818 | |||
819 | static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn, | ||
820 | int hangup_pending) | ||
821 | { | ||
822 | dlm_release_lockspace(conn->cc_lockspace, 2); | ||
823 | conn->cc_lockspace = NULL; | ||
824 | ocfs2_live_connection_drop(conn->cc_private); | ||
825 | conn->cc_private = NULL; | ||
826 | return 0; | ||
827 | } | ||
828 | |||
829 | static int user_cluster_this_node(unsigned int *this_node) | ||
830 | { | ||
831 | int rc; | ||
832 | |||
833 | rc = ocfs2_control_get_this_node(); | ||
834 | if (rc < 0) | ||
835 | return rc; | ||
836 | |||
837 | *this_node = rc; | ||
838 | return 0; | ||
839 | } | ||
840 | |||
841 | static struct ocfs2_stack_operations user_stack_ops = { | ||
842 | .connect = user_cluster_connect, | ||
843 | .disconnect = user_cluster_disconnect, | ||
844 | .this_node = user_cluster_this_node, | ||
845 | .dlm_lock = user_dlm_lock, | ||
846 | .dlm_unlock = user_dlm_unlock, | ||
847 | .lock_status = user_dlm_lock_status, | ||
848 | .lock_lvb = user_dlm_lvb, | ||
849 | .dump_lksb = user_dlm_dump_lksb, | ||
850 | }; | ||
851 | |||
852 | static struct ocfs2_stack_plugin user_stack = { | ||
853 | .sp_name = "user", | ||
854 | .sp_ops = &user_stack_ops, | ||
855 | .sp_owner = THIS_MODULE, | ||
856 | }; | ||
857 | |||
858 | |||
859 | static int __init user_stack_init(void) | ||
860 | { | ||
861 | int rc; | ||
862 | |||
863 | rc = ocfs2_control_init(); | ||
864 | if (!rc) { | ||
865 | rc = ocfs2_stack_glue_register(&user_stack); | ||
866 | if (rc) | ||
867 | ocfs2_control_exit(); | ||
868 | } | ||
869 | |||
870 | return rc; | ||
871 | } | ||
872 | |||
873 | static void __exit user_stack_exit(void) | ||
874 | { | ||
875 | ocfs2_stack_glue_unregister(&user_stack); | ||
876 | ocfs2_control_exit(); | ||
877 | } | ||
878 | |||
879 | MODULE_AUTHOR("Oracle"); | ||
880 | MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks"); | ||
881 | MODULE_LICENSE("GPL"); | ||
882 | module_init(user_stack_init); | ||
883 | module_exit(user_stack_exit); | ||
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c new file mode 100644 index 000000000000..119f60cea9cc --- /dev/null +++ b/fs/ocfs2/stackglue.c | |||
@@ -0,0 +1,568 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * stackglue.c | ||
5 | * | ||
6 | * Code which implements an OCFS2 specific interface to underlying | ||
7 | * cluster stacks. | ||
8 | * | ||
9 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
10 | * | ||
11 | * This program is free software; you can redistribute it and/or | ||
12 | * modify it under the terms of the GNU General Public | ||
13 | * License as published by the Free Software Foundation, version 2. | ||
14 | * | ||
15 | * This program is distributed in the hope that it will be useful, | ||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
18 | * General Public License for more details. | ||
19 | */ | ||
20 | |||
21 | #include <linux/list.h> | ||
22 | #include <linux/spinlock.h> | ||
23 | #include <linux/module.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/kmod.h> | ||
26 | #include <linux/fs.h> | ||
27 | #include <linux/kobject.h> | ||
28 | #include <linux/sysfs.h> | ||
29 | |||
30 | #include "ocfs2_fs.h" | ||
31 | |||
32 | #include "stackglue.h" | ||
33 | |||
34 | #define OCFS2_STACK_PLUGIN_O2CB "o2cb" | ||
35 | #define OCFS2_STACK_PLUGIN_USER "user" | ||
36 | |||
37 | static struct ocfs2_locking_protocol *lproto; | ||
38 | static DEFINE_SPINLOCK(ocfs2_stack_lock); | ||
39 | static LIST_HEAD(ocfs2_stack_list); | ||
40 | static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1]; | ||
41 | |||
42 | /* | ||
43 | * The stack currently in use. If not null, active_stack->sp_count > 0, | ||
44 | * the module is pinned, and the locking protocol cannot be changed. | ||
45 | */ | ||
46 | static struct ocfs2_stack_plugin *active_stack; | ||
47 | |||
48 | static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) | ||
49 | { | ||
50 | struct ocfs2_stack_plugin *p; | ||
51 | |||
52 | assert_spin_locked(&ocfs2_stack_lock); | ||
53 | |||
54 | list_for_each_entry(p, &ocfs2_stack_list, sp_list) { | ||
55 | if (!strcmp(p->sp_name, name)) | ||
56 | return p; | ||
57 | } | ||
58 | |||
59 | return NULL; | ||
60 | } | ||
61 | |||
62 | static int ocfs2_stack_driver_request(const char *stack_name, | ||
63 | const char *plugin_name) | ||
64 | { | ||
65 | int rc; | ||
66 | struct ocfs2_stack_plugin *p; | ||
67 | |||
68 | spin_lock(&ocfs2_stack_lock); | ||
69 | |||
70 | /* | ||
71 | * If the stack passed by the filesystem isn't the selected one, | ||
72 | * we can't continue. | ||
73 | */ | ||
74 | if (strcmp(stack_name, cluster_stack_name)) { | ||
75 | rc = -EBUSY; | ||
76 | goto out; | ||
77 | } | ||
78 | |||
79 | if (active_stack) { | ||
80 | /* | ||
81 | * If the active stack isn't the one we want, it cannot | ||
82 | * be selected right now. | ||
83 | */ | ||
84 | if (!strcmp(active_stack->sp_name, plugin_name)) | ||
85 | rc = 0; | ||
86 | else | ||
87 | rc = -EBUSY; | ||
88 | goto out; | ||
89 | } | ||
90 | |||
91 | p = ocfs2_stack_lookup(plugin_name); | ||
92 | if (!p || !try_module_get(p->sp_owner)) { | ||
93 | rc = -ENOENT; | ||
94 | goto out; | ||
95 | } | ||
96 | |||
97 | /* Ok, the stack is pinned */ | ||
98 | p->sp_count++; | ||
99 | active_stack = p; | ||
100 | |||
101 | rc = 0; | ||
102 | |||
103 | out: | ||
104 | spin_unlock(&ocfs2_stack_lock); | ||
105 | return rc; | ||
106 | } | ||
107 | |||
108 | /* | ||
109 | * This function looks up the appropriate stack and makes it active. If | ||
110 | * there is no stack, it tries to load it. It will fail if the stack still | ||
111 | * cannot be found. It will also fail if a different stack is in use. | ||
112 | */ | ||
113 | static int ocfs2_stack_driver_get(const char *stack_name) | ||
114 | { | ||
115 | int rc; | ||
116 | char *plugin_name = OCFS2_STACK_PLUGIN_O2CB; | ||
117 | |||
118 | /* | ||
119 | * Classic stack does not pass in a stack name. This is | ||
120 | * compatible with older tools as well. | ||
121 | */ | ||
122 | if (!stack_name || !*stack_name) | ||
123 | stack_name = OCFS2_STACK_PLUGIN_O2CB; | ||
124 | |||
125 | if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) { | ||
126 | printk(KERN_ERR | ||
127 | "ocfs2 passed an invalid cluster stack label: \"%s\"\n", | ||
128 | stack_name); | ||
129 | return -EINVAL; | ||
130 | } | ||
131 | |||
132 | /* Anything that isn't the classic stack is a user stack */ | ||
133 | if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB)) | ||
134 | plugin_name = OCFS2_STACK_PLUGIN_USER; | ||
135 | |||
136 | rc = ocfs2_stack_driver_request(stack_name, plugin_name); | ||
137 | if (rc == -ENOENT) { | ||
138 | request_module("ocfs2_stack_%s", plugin_name); | ||
139 | rc = ocfs2_stack_driver_request(stack_name, plugin_name); | ||
140 | } | ||
141 | |||
142 | if (rc == -ENOENT) { | ||
143 | printk(KERN_ERR | ||
144 | "ocfs2: Cluster stack driver \"%s\" cannot be found\n", | ||
145 | plugin_name); | ||
146 | } else if (rc == -EBUSY) { | ||
147 | printk(KERN_ERR | ||
148 | "ocfs2: A different cluster stack is in use\n"); | ||
149 | } | ||
150 | |||
151 | return rc; | ||
152 | } | ||
153 | |||
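The try/request_module()/retry flow in ocfs2_stack_driver_get() can be summarized by the small user-space sketch below; load_plugin() is a made-up stand-in for request_module(), used only to show that the lookup is retried exactly once after asking for the module.

#include <stdio.h>
#include <errno.h>

static int loaded;

static int driver_request(const char *name)
{
	(void)name;
	return loaded ? 0 : -ENOENT;	/* plugin not registered yet */
}

static void load_plugin(const char *name)
{
	(void)name;
	loaded = 1;			/* pretend modprobe succeeded */
}

int main(void)
{
	int rc = driver_request("ocfs2_stack_user");

	if (rc == -ENOENT) {
		load_plugin("ocfs2_stack_user");
		rc = driver_request("ocfs2_stack_user");
	}
	printf("rc=%d\n", rc);		/* 0 once the plugin is available */
	return 0;
}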
154 | static void ocfs2_stack_driver_put(void) | ||
155 | { | ||
156 | spin_lock(&ocfs2_stack_lock); | ||
157 | BUG_ON(active_stack == NULL); | ||
158 | BUG_ON(active_stack->sp_count == 0); | ||
159 | |||
160 | active_stack->sp_count--; | ||
161 | if (!active_stack->sp_count) { | ||
162 | module_put(active_stack->sp_owner); | ||
163 | active_stack = NULL; | ||
164 | } | ||
165 | spin_unlock(&ocfs2_stack_lock); | ||
166 | } | ||
167 | |||
168 | int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin) | ||
169 | { | ||
170 | int rc; | ||
171 | |||
172 | spin_lock(&ocfs2_stack_lock); | ||
173 | if (!ocfs2_stack_lookup(plugin->sp_name)) { | ||
174 | plugin->sp_count = 0; | ||
175 | plugin->sp_proto = lproto; | ||
176 | list_add(&plugin->sp_list, &ocfs2_stack_list); | ||
177 | printk(KERN_INFO "ocfs2: Registered cluster interface %s\n", | ||
178 | plugin->sp_name); | ||
179 | rc = 0; | ||
180 | } else { | ||
181 | printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n", | ||
182 | plugin->sp_name); | ||
183 | rc = -EEXIST; | ||
184 | } | ||
185 | spin_unlock(&ocfs2_stack_lock); | ||
186 | |||
187 | return rc; | ||
188 | } | ||
189 | EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register); | ||
190 | |||
191 | void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin) | ||
192 | { | ||
193 | struct ocfs2_stack_plugin *p; | ||
194 | |||
195 | spin_lock(&ocfs2_stack_lock); | ||
196 | p = ocfs2_stack_lookup(plugin->sp_name); | ||
197 | if (p) { | ||
198 | BUG_ON(p != plugin); | ||
199 | BUG_ON(plugin == active_stack); | ||
200 | BUG_ON(plugin->sp_count != 0); | ||
201 | list_del_init(&plugin->sp_list); | ||
202 | printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n", | ||
203 | plugin->sp_name); | ||
204 | } else { | ||
205 | printk(KERN_ERR "Stack \"%s\" is not registered\n", | ||
206 | plugin->sp_name); | ||
207 | } | ||
208 | spin_unlock(&ocfs2_stack_lock); | ||
209 | } | ||
210 | EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister); | ||
211 | |||
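The registration path above follows a common name-keyed registry pattern. Here is a minimal user-space sketch with hypothetical names and the locking omitted (the kernel code holds ocfs2_stack_lock around every list operation):

#include <stdio.h>
#include <string.h>
#include <errno.h>

struct plugin {
	const char *name;
	struct plugin *next;
};

static struct plugin *plugin_list;

static struct plugin *plugin_lookup(const char *name)
{
	struct plugin *p;

	for (p = plugin_list; p; p = p->next)
		if (!strcmp(p->name, name))
			return p;
	return NULL;
}

static int plugin_register(struct plugin *p)
{
	if (plugin_lookup(p->name))
		return -EEXIST;		/* same rule as the kernel code */
	p->next = plugin_list;
	plugin_list = p;
	return 0;
}

int main(void)
{
	struct plugin a = { "o2cb", NULL }, b = { "o2cb", NULL };
	int ra = plugin_register(&a);
	int rb = plugin_register(&b);

	printf("%d %d\n", ra, rb);	/* 0 -17: duplicate name rejected */
	return 0;
}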
212 | void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto) | ||
213 | { | ||
214 | struct ocfs2_stack_plugin *p; | ||
215 | |||
216 | BUG_ON(proto == NULL); | ||
217 | |||
218 | spin_lock(&ocfs2_stack_lock); | ||
219 | BUG_ON(active_stack != NULL); | ||
220 | |||
221 | lproto = proto; | ||
222 | list_for_each_entry(p, &ocfs2_stack_list, sp_list) { | ||
223 | p->sp_proto = lproto; | ||
224 | } | ||
225 | |||
226 | spin_unlock(&ocfs2_stack_lock); | ||
227 | } | ||
228 | EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol); | ||
229 | |||
230 | |||
231 | /* | ||
232 | * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take | ||
233 | * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the | ||
234 | * underlying stack plugins need to pilfer the lksb off of the lock_res. | ||
235 | * If some other structure needs to be passed as an astarg, the plugins | ||
236 | * will need to be given a different avenue to the lksb. | ||
237 | */ | ||
238 | int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, | ||
239 | int mode, | ||
240 | union ocfs2_dlm_lksb *lksb, | ||
241 | u32 flags, | ||
242 | void *name, | ||
243 | unsigned int namelen, | ||
244 | struct ocfs2_lock_res *astarg) | ||
245 | { | ||
246 | BUG_ON(lproto == NULL); | ||
247 | |||
248 | return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags, | ||
249 | name, namelen, astarg); | ||
250 | } | ||
251 | EXPORT_SYMBOL_GPL(ocfs2_dlm_lock); | ||
252 | |||
253 | int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, | ||
254 | union ocfs2_dlm_lksb *lksb, | ||
255 | u32 flags, | ||
256 | struct ocfs2_lock_res *astarg) | ||
257 | { | ||
258 | BUG_ON(lproto == NULL); | ||
259 | |||
260 | return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg); | ||
261 | } | ||
262 | EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock); | ||
263 | |||
264 | int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb) | ||
265 | { | ||
266 | return active_stack->sp_ops->lock_status(lksb); | ||
267 | } | ||
268 | EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status); | ||
269 | |||
270 | /* | ||
271 | * Why don't we cast to ocfs2_meta_lvb? The "clean" answer is that we | ||
272 | * don't cast at the glue level. The real answer is that the header | ||
273 | * ordering is nigh impossible. | ||
274 | */ | ||
275 | void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb) | ||
276 | { | ||
277 | return active_stack->sp_ops->lock_lvb(lksb); | ||
278 | } | ||
279 | EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb); | ||
280 | |||
281 | void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) | ||
282 | { | ||
283 | active_stack->sp_ops->dump_lksb(lksb); | ||
284 | } | ||
285 | EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); | ||
286 | |||
287 | int ocfs2_cluster_connect(const char *stack_name, | ||
288 | const char *group, | ||
289 | int grouplen, | ||
290 | void (*recovery_handler)(int node_num, | ||
291 | void *recovery_data), | ||
292 | void *recovery_data, | ||
293 | struct ocfs2_cluster_connection **conn) | ||
294 | { | ||
295 | int rc = 0; | ||
296 | struct ocfs2_cluster_connection *new_conn; | ||
297 | |||
298 | BUG_ON(group == NULL); | ||
299 | BUG_ON(conn == NULL); | ||
300 | BUG_ON(recovery_handler == NULL); | ||
301 | |||
302 | if (grouplen > GROUP_NAME_MAX) { | ||
303 | rc = -EINVAL; | ||
304 | goto out; | ||
305 | } | ||
306 | |||
307 | new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection), | ||
308 | GFP_KERNEL); | ||
309 | if (!new_conn) { | ||
310 | rc = -ENOMEM; | ||
311 | goto out; | ||
312 | } | ||
313 | |||
314 | memcpy(new_conn->cc_name, group, grouplen); | ||
315 | new_conn->cc_namelen = grouplen; | ||
316 | new_conn->cc_recovery_handler = recovery_handler; | ||
317 | new_conn->cc_recovery_data = recovery_data; | ||
318 | |||
319 | /* Start the new connection at our maximum compatibility level */ | ||
320 | new_conn->cc_version = lproto->lp_max_version; | ||
321 | |||
322 | /* This will pin the stack driver if successful */ | ||
323 | rc = ocfs2_stack_driver_get(stack_name); | ||
324 | if (rc) | ||
325 | goto out_free; | ||
326 | |||
327 | rc = active_stack->sp_ops->connect(new_conn); | ||
328 | if (rc) { | ||
329 | ocfs2_stack_driver_put(); | ||
330 | goto out_free; | ||
331 | } | ||
332 | |||
333 | *conn = new_conn; | ||
334 | |||
335 | out_free: | ||
336 | if (rc) | ||
337 | kfree(new_conn); | ||
338 | |||
339 | out: | ||
340 | return rc; | ||
341 | } | ||
342 | EXPORT_SYMBOL_GPL(ocfs2_cluster_connect); | ||
343 | |||
344 | /* If hangup_pending is 0, the stack driver will be dropped */ | ||
345 | int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, | ||
346 | int hangup_pending) | ||
347 | { | ||
348 | int ret; | ||
349 | |||
350 | BUG_ON(conn == NULL); | ||
351 | |||
352 | ret = active_stack->sp_ops->disconnect(conn, hangup_pending); | ||
353 | |||
354 | /* XXX Should we free it anyway? */ | ||
355 | if (!ret) { | ||
356 | kfree(conn); | ||
357 | if (!hangup_pending) | ||
358 | ocfs2_stack_driver_put(); | ||
359 | } | ||
360 | |||
361 | return ret; | ||
362 | } | ||
363 | EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect); | ||
364 | |||
365 | void ocfs2_cluster_hangup(const char *group, int grouplen) | ||
366 | { | ||
367 | BUG_ON(group == NULL); | ||
368 | BUG_ON(group[grouplen] != '\0'); | ||
369 | |||
370 | if (active_stack->sp_ops->hangup) | ||
371 | active_stack->sp_ops->hangup(group, grouplen); | ||
372 | |||
373 | /* cluster_disconnect() was called with hangup_pending==1 */ | ||
374 | ocfs2_stack_driver_put(); | ||
375 | } | ||
376 | EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup); | ||
377 | |||
378 | int ocfs2_cluster_this_node(unsigned int *node) | ||
379 | { | ||
380 | return active_stack->sp_ops->this_node(node); | ||
381 | } | ||
382 | EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node); | ||
383 | |||
384 | |||
385 | /* | ||
386 | * Sysfs bits | ||
387 | */ | ||
388 | |||
389 | static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj, | ||
390 | struct kobj_attribute *attr, | ||
391 | char *buf) | ||
392 | { | ||
393 | ssize_t ret = 0; | ||
394 | |||
395 | spin_lock(&ocfs2_stack_lock); | ||
396 | if (lproto) | ||
397 | ret = snprintf(buf, PAGE_SIZE, "%u.%u\n", | ||
398 | lproto->lp_max_version.pv_major, | ||
399 | lproto->lp_max_version.pv_minor); | ||
400 | spin_unlock(&ocfs2_stack_lock); | ||
401 | |||
402 | return ret; | ||
403 | } | ||
404 | |||
405 | static struct kobj_attribute ocfs2_attr_max_locking_protocol = | ||
406 | __ATTR(max_locking_protocol, S_IFREG | S_IRUGO, | ||
407 | ocfs2_max_locking_protocol_show, NULL); | ||
408 | |||
409 | static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj, | ||
410 | struct kobj_attribute *attr, | ||
411 | char *buf) | ||
412 | { | ||
413 | ssize_t ret = 0, total = 0, remain = PAGE_SIZE; | ||
414 | struct ocfs2_stack_plugin *p; | ||
415 | |||
416 | spin_lock(&ocfs2_stack_lock); | ||
417 | list_for_each_entry(p, &ocfs2_stack_list, sp_list) { | ||
418 | ret = snprintf(buf, remain, "%s\n", | ||
419 | p->sp_name); | ||
420 | if (ret < 0) { | ||
421 | total = ret; | ||
422 | break; | ||
423 | } | ||
424 | if (ret == remain) { | ||
425 | /* snprintf() didn't fit */ | ||
426 | total = -E2BIG; | ||
427 | break; | ||
428 | } | ||
429 | total += ret; | ||
430 | remain -= ret; | ||
431 | } | ||
432 | spin_unlock(&ocfs2_stack_lock); | ||
433 | |||
434 | return total; | ||
435 | } | ||
436 | |||
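The show routine above keeps a running total and a remaining-byte count across its snprintf() calls and treats an exactly-full write as truncation. A user-space sketch of that accounting, assuming each write appends at the current offset, looks like this:

#include <stdio.h>

#define BUF_SIZE 32

int main(void)
{
	const char *names[] = { "o2cb", "user" };
	char buf[BUF_SIZE];
	size_t total = 0, remain = sizeof(buf);
	unsigned int i;
	int ret;

	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
		ret = snprintf(buf + total, remain, "%s\n", names[i]);
		if (ret < 0 || (size_t)ret >= remain) {
			total = 0;	/* the kernel returns -E2BIG here */
			break;
		}
		total += ret;	/* bytes consumed so far */
		remain -= ret;	/* space left in the page */
	}

	fwrite(buf, 1, total, stdout);
	return 0;
}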
437 | static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins = | ||
438 | __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO, | ||
439 | ocfs2_loaded_cluster_plugins_show, NULL); | ||
440 | |||
441 | static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj, | ||
442 | struct kobj_attribute *attr, | ||
443 | char *buf) | ||
444 | { | ||
445 | ssize_t ret = 0; | ||
446 | |||
447 | spin_lock(&ocfs2_stack_lock); | ||
448 | if (active_stack) { | ||
449 | ret = snprintf(buf, PAGE_SIZE, "%s\n", | ||
450 | active_stack->sp_name); | ||
451 | if (ret == PAGE_SIZE) | ||
452 | ret = -E2BIG; | ||
453 | } | ||
454 | spin_unlock(&ocfs2_stack_lock); | ||
455 | |||
456 | return ret; | ||
457 | } | ||
458 | |||
459 | static struct kobj_attribute ocfs2_attr_active_cluster_plugin = | ||
460 | __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO, | ||
461 | ocfs2_active_cluster_plugin_show, NULL); | ||
462 | |||
463 | static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj, | ||
464 | struct kobj_attribute *attr, | ||
465 | char *buf) | ||
466 | { | ||
467 | ssize_t ret; | ||
468 | spin_lock(&ocfs2_stack_lock); | ||
469 | ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name); | ||
470 | spin_unlock(&ocfs2_stack_lock); | ||
471 | |||
472 | return ret; | ||
473 | } | ||
474 | |||
475 | static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj, | ||
476 | struct kobj_attribute *attr, | ||
477 | const char *buf, size_t count) | ||
478 | { | ||
479 | size_t len = count; | ||
480 | ssize_t ret; | ||
481 | |||
482 | if (len == 0) | ||
483 | return len; | ||
484 | |||
485 | if (buf[len - 1] == '\n') | ||
486 | len--; | ||
487 | |||
488 | if ((len != OCFS2_STACK_LABEL_LEN) || | ||
489 | (strnlen(buf, len) != len)) | ||
490 | return -EINVAL; | ||
491 | |||
492 | spin_lock(&ocfs2_stack_lock); | ||
493 | if (active_stack) { | ||
494 | if (!strncmp(buf, cluster_stack_name, len)) | ||
495 | ret = count; | ||
496 | else | ||
497 | ret = -EBUSY; | ||
498 | } else { | ||
499 | memcpy(cluster_stack_name, buf, len); | ||
500 | ret = count; | ||
501 | } | ||
502 | spin_unlock(&ocfs2_stack_lock); | ||
503 | |||
504 | return ret; | ||
505 | } | ||
506 | |||
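ocfs2_cluster_stack_store() accepts only an exact-length label, tolerating the trailing newline that `echo` appends. Below is a stand-alone sketch of that validation, assuming OCFS2_STACK_LABEL_LEN is 4 (the length of "o2cb"); the helper name is hypothetical.

#include <stdio.h>
#include <string.h>

#define STACK_LABEL_LEN 4	/* assumed value of OCFS2_STACK_LABEL_LEN */

static int valid_stack_label(const char *buf, size_t count)
{
	size_t len = count;

	if (len && buf[len - 1] == '\n')
		len--;			/* echo appends a newline */

	/* exact length, and no embedded NUL within it */
	return len == STACK_LABEL_LEN && memchr(buf, '\0', len) == NULL;
}

int main(void)
{
	printf("%d %d %d\n",
	       valid_stack_label("o2cb\n", 5),	/* 1: accepted */
	       valid_stack_label("o2cb", 4),	/* 1: accepted */
	       valid_stack_label("cman2", 5));	/* 0: wrong length */
	return 0;
}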
507 | |||
508 | static struct kobj_attribute ocfs2_attr_cluster_stack = | ||
509 | __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR, | ||
510 | ocfs2_cluster_stack_show, | ||
511 | ocfs2_cluster_stack_store); | ||
512 | |||
513 | static struct attribute *ocfs2_attrs[] = { | ||
514 | &ocfs2_attr_max_locking_protocol.attr, | ||
515 | &ocfs2_attr_loaded_cluster_plugins.attr, | ||
516 | &ocfs2_attr_active_cluster_plugin.attr, | ||
517 | &ocfs2_attr_cluster_stack.attr, | ||
518 | NULL, | ||
519 | }; | ||
520 | |||
521 | static struct attribute_group ocfs2_attr_group = { | ||
522 | .attrs = ocfs2_attrs, | ||
523 | }; | ||
524 | |||
525 | static struct kset *ocfs2_kset; | ||
526 | |||
527 | static void ocfs2_sysfs_exit(void) | ||
528 | { | ||
529 | kset_unregister(ocfs2_kset); | ||
530 | } | ||
531 | |||
532 | static int ocfs2_sysfs_init(void) | ||
533 | { | ||
534 | int ret; | ||
535 | |||
536 | ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj); | ||
537 | if (!ocfs2_kset) | ||
538 | return -ENOMEM; | ||
539 | |||
540 | ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group); | ||
541 | if (ret) | ||
542 | goto error; | ||
543 | |||
544 | return 0; | ||
545 | |||
546 | error: | ||
547 | kset_unregister(ocfs2_kset); | ||
548 | return ret; | ||
549 | } | ||
550 | |||
551 | static int __init ocfs2_stack_glue_init(void) | ||
552 | { | ||
553 | strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB); | ||
554 | |||
555 | return ocfs2_sysfs_init(); | ||
556 | } | ||
557 | |||
558 | static void __exit ocfs2_stack_glue_exit(void) | ||
559 | { | ||
560 | lproto = NULL; | ||
561 | ocfs2_sysfs_exit(); | ||
562 | } | ||
563 | |||
564 | MODULE_AUTHOR("Oracle"); | ||
565 | MODULE_DESCRIPTION("ocfs2 cluster stack glue layer"); | ||

566 | MODULE_LICENSE("GPL"); | ||
567 | module_init(ocfs2_stack_glue_init); | ||
568 | module_exit(ocfs2_stack_glue_exit); | ||
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h new file mode 100644 index 000000000000..005e4f170e0f --- /dev/null +++ b/fs/ocfs2/stackglue.h | |||
@@ -0,0 +1,261 @@ | |||
1 | /* -*- mode: c; c-basic-offset: 8; -*- | ||
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | ||
3 | * | ||
4 | * stackglue.h | ||
5 | * | ||
6 | * Glue to the underlying cluster stack. | ||
7 | * | ||
8 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or | ||
11 | * modify it under the terms of the GNU General Public | ||
12 | * License as published by the Free Software Foundation, version 2. | ||
13 | * | ||
14 | * This program is distributed in the hope that it will be useful, | ||
15 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
16 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
17 | * General Public License for more details. | ||
18 | */ | ||
19 | |||
20 | |||
21 | #ifndef STACKGLUE_H | ||
22 | #define STACKGLUE_H | ||
23 | |||
24 | #include <linux/types.h> | ||
25 | #include <linux/list.h> | ||
26 | #include <linux/dlmconstants.h> | ||
27 | |||
28 | #include "dlm/dlmapi.h" | ||
29 | #include <linux/dlm.h> | ||
30 | |||
31 | /* | ||
32 | * dlmconstants.h does not have a LOCAL flag. We hope to remove it | ||
33 | * some day, but right now we need it. Let's fake it. This value is larger | ||
34 | * than any flag in dlmconstants.h. | ||
35 | */ | ||
36 | #define DLM_LKF_LOCAL 0x00100000 | ||
37 | |||
38 | /* | ||
39 | * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably | ||
40 | * wants to be in a public header. | ||
41 | */ | ||
42 | #define GROUP_NAME_MAX 64 | ||
43 | |||
44 | |||
45 | /* | ||
46 | * ocfs2_protocol_version changes when ocfs2 does something different in | ||
47 | * its inter-node behavior. See dlmglue.c for more information. | ||
48 | */ | ||
49 | struct ocfs2_protocol_version { | ||
50 | u8 pv_major; | ||
51 | u8 pv_minor; | ||
52 | }; | ||
53 | |||
54 | /* | ||
55 | * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf. | ||
56 | */ | ||
57 | struct ocfs2_locking_protocol { | ||
58 | struct ocfs2_protocol_version lp_max_version; | ||
59 | void (*lp_lock_ast)(void *astarg); | ||
60 | void (*lp_blocking_ast)(void *astarg, int level); | ||
61 | void (*lp_unlock_ast)(void *astarg, int error); | ||
62 | }; | ||
63 | |||
64 | |||
65 | /* | ||
66 | * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only | ||
67 | * has a pointer to separately allocated lvb space. This struct exists only to | ||
68 | * include in the lksb union to make space for a combined dlm_lksb and lvb. | ||
69 | */ | ||
70 | struct fsdlm_lksb_plus_lvb { | ||
71 | struct dlm_lksb lksb; | ||
72 | char lvb[DLM_LVB_LEN]; | ||
73 | }; | ||
74 | |||
75 | /* | ||
76 | * A union of all lock status structures. We define it here so that the | ||
77 | * size of the union is known. Lock status structures are embedded in | ||
78 | * ocfs2 inodes. | ||
79 | */ | ||
80 | union ocfs2_dlm_lksb { | ||
81 | struct dlm_lockstatus lksb_o2dlm; | ||
82 | struct dlm_lksb lksb_fsdlm; | ||
83 | struct fsdlm_lksb_plus_lvb padding; | ||
84 | }; | ||
85 | |||
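The union above works because a union is sized for its largest member, so embedding a struct that carries both the lksb and its lvb reserves the lvb space too. A small sketch with stand-in structures (not the real dlm headers):

#include <stdio.h>

#define LVB_LEN 64

struct fake_lksb {		/* stands in for struct dlm_lksb */
	int status;
	char *lvb_ptr;		/* fs/dlm points at external lvb space */
};

struct fake_lksb_plus_lvb {	/* lksb plus the lvb it will point at */
	struct fake_lksb lksb;
	char lvb[LVB_LEN];
};

union fake_dlm_lksb {
	struct fake_lksb lksb_fsdlm;
	struct fake_lksb_plus_lvb padding;	/* forces the union large enough */
};

int main(void)
{
	printf("lksb=%zu  lksb+lvb=%zu  union=%zu\n",
	       sizeof(struct fake_lksb),
	       sizeof(struct fake_lksb_plus_lvb),
	       sizeof(union fake_dlm_lksb));
	return 0;
}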
86 | /* | ||
87 | * A cluster connection. Mostly opaque to ocfs2, the connection holds | ||
88 | * state for the underlying stack. ocfs2 does use cc_version to determine | ||
89 | * locking compatibility. | ||
90 | */ | ||
91 | struct ocfs2_cluster_connection { | ||
92 | char cc_name[GROUP_NAME_MAX]; | ||
93 | int cc_namelen; | ||
94 | struct ocfs2_protocol_version cc_version; | ||
95 | void (*cc_recovery_handler)(int node_num, void *recovery_data); | ||
96 | void *cc_recovery_data; | ||
97 | void *cc_lockspace; | ||
98 | void *cc_private; | ||
99 | }; | ||
100 | |||
101 | /* | ||
102 | * Each cluster stack implements the stack operations structure. Not used | ||
103 | * in the ocfs2 code, the stackglue code translates generic cluster calls | ||
104 | * into stack operations. | ||
105 | */ | ||
106 | struct ocfs2_stack_operations { | ||
107 | /* | ||
108 | * The fs code calls ocfs2_cluster_connect() to attach a new | ||
109 | * filesystem to the cluster stack. The ->connect() op is passed | ||
110 | * an ocfs2_cluster_connection with the name and recovery field | ||
111 | * filled in. | ||
112 | * | ||
113 | * The stack must set up any notification mechanisms and create | ||
114 | * the filesystem lockspace in the DLM. The lockspace should be | ||
115 | * stored on cc_lockspace. Any other information can be stored on | ||
116 | * cc_private. | ||
117 | * | ||
118 | * ->connect() must not return until it is guaranteed that | ||
119 | * | ||
120 | * - Node down notifications for the filesystem will be received | ||
121 | * and passed to conn->cc_recovery_handler(). | ||
122 | * - Locking requests for the filesystem will be processed. | ||
123 | */ | ||
124 | int (*connect)(struct ocfs2_cluster_connection *conn); | ||
125 | |||
126 | /* | ||
127 | * The fs code calls ocfs2_cluster_disconnect() when a filesystem | ||
128 | * no longer needs cluster services. All DLM locks have been | ||
129 | * dropped, and recovery notification is being ignored by the | ||
130 | * fs code. The stack must disengage from the DLM and discontinue | ||
131 | * recovery notification. | ||
132 | * | ||
133 | * Once ->disconnect() has returned, the connection structure will | ||
134 | * be freed. Thus, a stack must not return from ->disconnect() | ||
135 | * until it will no longer reference the conn pointer. | ||
136 | * | ||
137 | * If hangup_pending is zero, ocfs2_cluster_disconnect() will also | ||
138 | * be dropping the reference on the module. | ||
139 | */ | ||
140 | int (*disconnect)(struct ocfs2_cluster_connection *conn, | ||
141 | int hangup_pending); | ||
142 | |||
143 | /* | ||
144 | * ocfs2_cluster_hangup() exists for compatibility with older | ||
145 | * ocfs2 tools. Only the classic stack really needs it. As such | ||
146 | * ->hangup() is not required of all stacks. See the comment by | ||
147 | * ocfs2_cluster_hangup() for more details. | ||
148 | * | ||
149 | * Note that ocfs2_cluster_hangup() can only be called if | ||
150 | * hangup_pending was passed to ocfs2_cluster_disconnect(). | ||
151 | */ | ||
152 | void (*hangup)(const char *group, int grouplen); | ||
153 | |||
154 | /* | ||
155 | * ->this_node() returns the cluster's unique identifier for the | ||
156 | * local node. | ||
157 | */ | ||
158 | int (*this_node)(unsigned int *node); | ||
159 | |||
160 | /* | ||
161 | * Call the underlying dlm lock function. The ->dlm_lock() | ||
162 | * callback should convert the flags and mode as appropriate. | ||
163 | * | ||
164 | * ast and bast functions are not part of the call because the | ||
165 | * stack will likely want to wrap ast and bast calls before passing | ||
166 | * them to stack->sp_proto. | ||
167 | */ | ||
168 | int (*dlm_lock)(struct ocfs2_cluster_connection *conn, | ||
169 | int mode, | ||
170 | union ocfs2_dlm_lksb *lksb, | ||
171 | u32 flags, | ||
172 | void *name, | ||
173 | unsigned int namelen, | ||
174 | void *astarg); | ||
175 | |||
176 | /* | ||
177 | * Call the underlying dlm unlock function. The ->dlm_unlock() | ||
178 | * function should convert the flags as appropriate. | ||
179 | * | ||
180 | * The unlock ast is not passed, as the stack will want to wrap | ||
181 | * it before calling stack->sp_proto->lp_unlock_ast(). | ||
182 | */ | ||
183 | int (*dlm_unlock)(struct ocfs2_cluster_connection *conn, | ||
184 | union ocfs2_dlm_lksb *lksb, | ||
185 | u32 flags, | ||
186 | void *astarg); | ||
187 | |||
188 | /* | ||
189 | * Return the status of the current lock status block. The fs | ||
190 | * code should never dereference the union. The ->lock_status() | ||
191 | * callback pulls out the stack-specific lksb, converts the status | ||
192 | * to a proper errno, and returns it. | ||
193 | */ | ||
194 | int (*lock_status)(union ocfs2_dlm_lksb *lksb); | ||
195 | |||
196 | /* | ||
197 | * Pull the lvb pointer off of the stack-specific lksb. | ||
198 | */ | ||
199 | void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); | ||
200 | |||
201 | /* | ||
202 | * This is an optional debugging hook. If provided, the | ||
203 | * stack can dump debugging information about this lock. | ||
204 | */ | ||
205 | void (*dump_lksb)(union ocfs2_dlm_lksb *lksb); | ||
206 | }; | ||
207 | |||
208 | /* | ||
209 | * Each stack plugin must describe itself by registering a | ||
210 | * ocfs2_stack_plugin structure. This is only seen by stackglue and the | ||
211 | * stack driver. | ||
212 | */ | ||
213 | struct ocfs2_stack_plugin { | ||
214 | char *sp_name; | ||
215 | struct ocfs2_stack_operations *sp_ops; | ||
216 | struct module *sp_owner; | ||
217 | |||
218 | /* These are managed by the stackglue code. */ | ||
219 | struct list_head sp_list; | ||
220 | unsigned int sp_count; | ||
221 | struct ocfs2_locking_protocol *sp_proto; | ||
222 | }; | ||
223 | |||
224 | |||
225 | /* Used by the filesystem */ | ||
226 | int ocfs2_cluster_connect(const char *stack_name, | ||
227 | const char *group, | ||
228 | int grouplen, | ||
229 | void (*recovery_handler)(int node_num, | ||
230 | void *recovery_data), | ||
231 | void *recovery_data, | ||
232 | struct ocfs2_cluster_connection **conn); | ||
233 | int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn, | ||
234 | int hangup_pending); | ||
235 | void ocfs2_cluster_hangup(const char *group, int grouplen); | ||
236 | int ocfs2_cluster_this_node(unsigned int *node); | ||
237 | |||
238 | struct ocfs2_lock_res; | ||
239 | int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn, | ||
240 | int mode, | ||
241 | union ocfs2_dlm_lksb *lksb, | ||
242 | u32 flags, | ||
243 | void *name, | ||
244 | unsigned int namelen, | ||
245 | struct ocfs2_lock_res *astarg); | ||
246 | int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn, | ||
247 | union ocfs2_dlm_lksb *lksb, | ||
248 | u32 flags, | ||
249 | struct ocfs2_lock_res *astarg); | ||
250 | |||
251 | int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); | ||
252 | void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); | ||
253 | void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); | ||
254 | |||
255 | void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); | ||
256 | |||
257 | |||
258 | /* Used by stack plugins */ | ||
259 | int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); | ||
260 | void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); | ||
261 | #endif /* STACKGLUE_H */ | ||
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 72c198a004df..d2d278fb9819 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c | |||
@@ -46,6 +46,11 @@ | |||
46 | 46 | ||
47 | #include "buffer_head_io.h" | 47 | #include "buffer_head_io.h" |
48 | 48 | ||
49 | #define NOT_ALLOC_NEW_GROUP 0 | ||
50 | #define ALLOC_NEW_GROUP 1 | ||
51 | |||
52 | #define OCFS2_MAX_INODES_TO_STEAL 1024 | ||
53 | |||
49 | static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); | 54 | static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); |
50 | static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); | 55 | static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); |
51 | static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); | 56 | static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); |
@@ -106,7 +111,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, | |||
106 | u64 *bg_blkno, | 111 | u64 *bg_blkno, |
107 | u16 *bg_bit_off); | 112 | u16 *bg_bit_off); |
108 | 113 | ||
109 | void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) | 114 | static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) |
110 | { | 115 | { |
111 | struct inode *inode = ac->ac_inode; | 116 | struct inode *inode = ac->ac_inode; |
112 | 117 | ||
@@ -117,9 +122,17 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) | |||
117 | mutex_unlock(&inode->i_mutex); | 122 | mutex_unlock(&inode->i_mutex); |
118 | 123 | ||
119 | iput(inode); | 124 | iput(inode); |
125 | ac->ac_inode = NULL; | ||
120 | } | 126 | } |
121 | if (ac->ac_bh) | 127 | if (ac->ac_bh) { |
122 | brelse(ac->ac_bh); | 128 | brelse(ac->ac_bh); |
129 | ac->ac_bh = NULL; | ||
130 | } | ||
131 | } | ||
132 | |||
133 | void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) | ||
134 | { | ||
135 | ocfs2_free_ac_resource(ac); | ||
123 | kfree(ac); | 136 | kfree(ac); |
124 | } | 137 | } |
125 | 138 | ||
@@ -391,7 +404,8 @@ bail: | |||
391 | static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, | 404 | static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, |
392 | struct ocfs2_alloc_context *ac, | 405 | struct ocfs2_alloc_context *ac, |
393 | int type, | 406 | int type, |
394 | u32 slot) | 407 | u32 slot, |
408 | int alloc_new_group) | ||
395 | { | 409 | { |
396 | int status; | 410 | int status; |
397 | u32 bits_wanted = ac->ac_bits_wanted; | 411 | u32 bits_wanted = ac->ac_bits_wanted; |
@@ -420,6 +434,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, | |||
420 | } | 434 | } |
421 | 435 | ||
422 | ac->ac_inode = alloc_inode; | 436 | ac->ac_inode = alloc_inode; |
437 | ac->ac_alloc_slot = slot; | ||
423 | 438 | ||
424 | fe = (struct ocfs2_dinode *) bh->b_data; | 439 | fe = (struct ocfs2_dinode *) bh->b_data; |
425 | if (!OCFS2_IS_VALID_DINODE(fe)) { | 440 | if (!OCFS2_IS_VALID_DINODE(fe)) { |
@@ -446,6 +461,14 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, | |||
446 | goto bail; | 461 | goto bail; |
447 | } | 462 | } |
448 | 463 | ||
464 | if (alloc_new_group != ALLOC_NEW_GROUP) { | ||
465 | mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, " | ||
466 | "and we don't alloc a new group for it.\n", | ||
467 | slot, bits_wanted, free_bits); | ||
468 | status = -ENOSPC; | ||
469 | goto bail; | ||
470 | } | ||
471 | |||
449 | status = ocfs2_block_group_alloc(osb, alloc_inode, bh); | 472 | status = ocfs2_block_group_alloc(osb, alloc_inode, bh); |
450 | if (status < 0) { | 473 | if (status < 0) { |
451 | if (status != -ENOSPC) | 474 | if (status != -ENOSPC) |
@@ -490,7 +513,8 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, | |||
490 | (*ac)->ac_group_search = ocfs2_block_group_search; | 513 | (*ac)->ac_group_search = ocfs2_block_group_search; |
491 | 514 | ||
492 | status = ocfs2_reserve_suballoc_bits(osb, (*ac), | 515 | status = ocfs2_reserve_suballoc_bits(osb, (*ac), |
493 | EXTENT_ALLOC_SYSTEM_INODE, slot); | 516 | EXTENT_ALLOC_SYSTEM_INODE, |
517 | slot, ALLOC_NEW_GROUP); | ||
494 | if (status < 0) { | 518 | if (status < 0) { |
495 | if (status != -ENOSPC) | 519 | if (status != -ENOSPC) |
496 | mlog_errno(status); | 520 | mlog_errno(status); |
@@ -508,10 +532,42 @@ bail: | |||
508 | return status; | 532 | return status; |
509 | } | 533 | } |
510 | 534 | ||
535 | static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, | ||
536 | struct ocfs2_alloc_context *ac) | ||
537 | { | ||
538 | int i, status = -ENOSPC; | ||
539 | s16 slot = ocfs2_get_inode_steal_slot(osb); | ||
540 | |||
541 | /* Start to steal inodes from the first slot after ours. */ | ||
542 | if (slot == OCFS2_INVALID_SLOT) | ||
543 | slot = osb->slot_num + 1; | ||
544 | |||
545 | for (i = 0; i < osb->max_slots; i++, slot++) { | ||
546 | if (slot == osb->max_slots) | ||
547 | slot = 0; | ||
548 | |||
549 | if (slot == osb->slot_num) | ||
550 | continue; | ||
551 | |||
552 | status = ocfs2_reserve_suballoc_bits(osb, ac, | ||
553 | INODE_ALLOC_SYSTEM_INODE, | ||
554 | slot, NOT_ALLOC_NEW_GROUP); | ||
555 | if (status >= 0) { | ||
556 | ocfs2_set_inode_steal_slot(osb, slot); | ||
557 | break; | ||
558 | } | ||
559 | |||
560 | ocfs2_free_ac_resource(ac); | ||
561 | } | ||
562 | |||
563 | return status; | ||
564 | } | ||
565 | |||
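The steal loop in ocfs2_steal_inode_from_other_nodes() above walks the other slots round-robin, starting just after our own slot (or at the remembered steal slot) and wrapping at max_slots. A user-space sketch of that iteration with purely illustrative values:

#include <stdio.h>

#define INVALID_SLOT (-1)

int main(void)
{
	int max_slots = 4, my_slot = 2, steal_slot = INVALID_SLOT;
	int i, slot;

	/* start after our own slot unless a steal slot is remembered */
	slot = (steal_slot == INVALID_SLOT) ? my_slot + 1 : steal_slot;

	for (i = 0; i < max_slots; i++, slot++) {
		if (slot == max_slots)
			slot = 0;		/* wrap around */
		if (slot == my_slot)
			continue;		/* never steal from ourselves */
		printf("try slot %d\n", slot);	/* prints 3, 0, 1 */
	}
	return 0;
}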
511 | int ocfs2_reserve_new_inode(struct ocfs2_super *osb, | 566 | int ocfs2_reserve_new_inode(struct ocfs2_super *osb, |
512 | struct ocfs2_alloc_context **ac) | 567 | struct ocfs2_alloc_context **ac) |
513 | { | 568 | { |
514 | int status; | 569 | int status; |
570 | s16 slot = ocfs2_get_inode_steal_slot(osb); | ||
515 | 571 | ||
516 | *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); | 572 | *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); |
517 | if (!(*ac)) { | 573 | if (!(*ac)) { |
@@ -525,9 +581,43 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, | |||
525 | 581 | ||
526 | (*ac)->ac_group_search = ocfs2_block_group_search; | 582 | (*ac)->ac_group_search = ocfs2_block_group_search; |
527 | 583 | ||
584 | /* | ||
585 | * slot is set when we successfully steal an inode from other nodes. | ||
586 | * It is reset in 3 places: | ||
587 | * 1. when we flush the truncate log | ||
588 | * 2. when we complete local alloc recovery. | ||
589 | * 3. when we successfully allocate from our own slot. | ||
590 | * After it is set, we will go on stealing inodes until we find the | ||
591 | * need to check our slots to see whether there is some space for us. | ||
592 | */ | ||
593 | if (slot != OCFS2_INVALID_SLOT && | ||
594 | atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL) | ||
595 | goto inode_steal; | ||
596 | |||
597 | atomic_set(&osb->s_num_inodes_stolen, 0); | ||
528 | status = ocfs2_reserve_suballoc_bits(osb, *ac, | 598 | status = ocfs2_reserve_suballoc_bits(osb, *ac, |
529 | INODE_ALLOC_SYSTEM_INODE, | 599 | INODE_ALLOC_SYSTEM_INODE, |
530 | osb->slot_num); | 600 | osb->slot_num, ALLOC_NEW_GROUP); |
601 | if (status >= 0) { | ||
602 | status = 0; | ||
603 | |||
604 | /* | ||
605 | * Some inodes must be freed by us, so try to allocate | ||
606 | * from our own next time. | ||
607 | */ | ||
608 | if (slot != OCFS2_INVALID_SLOT) | ||
609 | ocfs2_init_inode_steal_slot(osb); | ||
610 | goto bail; | ||
611 | } else if (status < 0 && status != -ENOSPC) { | ||
612 | mlog_errno(status); | ||
613 | goto bail; | ||
614 | } | ||
615 | |||
616 | ocfs2_free_ac_resource(*ac); | ||
617 | |||
618 | inode_steal: | ||
619 | status = ocfs2_steal_inode_from_other_nodes(osb, *ac); | ||
620 | atomic_inc(&osb->s_num_inodes_stolen); | ||
531 | if (status < 0) { | 621 | if (status < 0) { |
532 | if (status != -ENOSPC) | 622 | if (status != -ENOSPC) |
533 | mlog_errno(status); | 623 | mlog_errno(status); |
@@ -557,7 +647,8 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, | |||
557 | 647 | ||
558 | status = ocfs2_reserve_suballoc_bits(osb, ac, | 648 | status = ocfs2_reserve_suballoc_bits(osb, ac, |
559 | GLOBAL_BITMAP_SYSTEM_INODE, | 649 | GLOBAL_BITMAP_SYSTEM_INODE, |
560 | OCFS2_INVALID_SLOT); | 650 | OCFS2_INVALID_SLOT, |
651 | ALLOC_NEW_GROUP); | ||
561 | if (status < 0 && status != -ENOSPC) { | 652 | if (status < 0 && status != -ENOSPC) { |
562 | mlog_errno(status); | 653 | mlog_errno(status); |
563 | goto bail; | 654 | goto bail; |
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 8799033bb459..544c600662bd 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h | |||
@@ -36,6 +36,7 @@ typedef int (group_search_t)(struct inode *, | |||
36 | struct ocfs2_alloc_context { | 36 | struct ocfs2_alloc_context { |
37 | struct inode *ac_inode; /* which bitmap are we allocating from? */ | 37 | struct inode *ac_inode; /* which bitmap are we allocating from? */ |
38 | struct buffer_head *ac_bh; /* file entry bh */ | 38 | struct buffer_head *ac_bh; /* file entry bh */ |
39 | u32 ac_alloc_slot; /* which slot are we allocating from? */ | ||
39 | u32 ac_bits_wanted; | 40 | u32 ac_bits_wanted; |
40 | u32 ac_bits_given; | 41 | u32 ac_bits_given; |
41 | #define OCFS2_AC_USE_LOCAL 1 | 42 | #define OCFS2_AC_USE_LOCAL 1 |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index bec75aff3d9f..df63ba20ae90 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -40,8 +40,7 @@ | |||
40 | #include <linux/crc32.h> | 40 | #include <linux/crc32.h> |
41 | #include <linux/debugfs.h> | 41 | #include <linux/debugfs.h> |
42 | #include <linux/mount.h> | 42 | #include <linux/mount.h> |
43 | 43 | #include <linux/seq_file.h> | |
44 | #include <cluster/nodemanager.h> | ||
45 | 44 | ||
46 | #define MLOG_MASK_PREFIX ML_SUPER | 45 | #define MLOG_MASK_PREFIX ML_SUPER |
47 | #include <cluster/masklog.h> | 46 | #include <cluster/masklog.h> |
@@ -88,6 +87,7 @@ struct mount_options | |||
88 | unsigned int atime_quantum; | 87 | unsigned int atime_quantum; |
89 | signed short slot; | 88 | signed short slot; |
90 | unsigned int localalloc_opt; | 89 | unsigned int localalloc_opt; |
90 | char cluster_stack[OCFS2_STACK_LABEL_LEN + 1]; | ||
91 | }; | 91 | }; |
92 | 92 | ||
93 | static int ocfs2_parse_options(struct super_block *sb, char *options, | 93 | static int ocfs2_parse_options(struct super_block *sb, char *options, |
@@ -109,7 +109,6 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait); | |||
109 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); | 109 | static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); |
110 | static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); | 110 | static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); |
111 | static void ocfs2_release_system_inodes(struct ocfs2_super *osb); | 111 | static void ocfs2_release_system_inodes(struct ocfs2_super *osb); |
112 | static int ocfs2_fill_local_node_info(struct ocfs2_super *osb); | ||
113 | static int ocfs2_check_volume(struct ocfs2_super *osb); | 112 | static int ocfs2_check_volume(struct ocfs2_super *osb); |
114 | static int ocfs2_verify_volume(struct ocfs2_dinode *di, | 113 | static int ocfs2_verify_volume(struct ocfs2_dinode *di, |
115 | struct buffer_head *bh, | 114 | struct buffer_head *bh, |
@@ -154,6 +153,7 @@ enum { | |||
154 | Opt_commit, | 153 | Opt_commit, |
155 | Opt_localalloc, | 154 | Opt_localalloc, |
156 | Opt_localflocks, | 155 | Opt_localflocks, |
156 | Opt_stack, | ||
157 | Opt_err, | 157 | Opt_err, |
158 | }; | 158 | }; |
159 | 159 | ||
@@ -172,6 +172,7 @@ static match_table_t tokens = { | |||
172 | {Opt_commit, "commit=%u"}, | 172 | {Opt_commit, "commit=%u"}, |
173 | {Opt_localalloc, "localalloc=%d"}, | 173 | {Opt_localalloc, "localalloc=%d"}, |
174 | {Opt_localflocks, "localflocks"}, | 174 | {Opt_localflocks, "localflocks"}, |
175 | {Opt_stack, "cluster_stack=%s"}, | ||
175 | {Opt_err, NULL} | 176 | {Opt_err, NULL} |
176 | }; | 177 | }; |
177 | 178 | ||
@@ -551,8 +552,17 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) | |||
551 | } | 552 | } |
552 | } | 553 | } |
553 | 554 | ||
555 | if (ocfs2_userspace_stack(osb)) { | ||
556 | if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) { | ||
557 | mlog(ML_ERROR, "Userspace stack expected, but " | ||
558 | "o2cb heartbeat arguments passed to mount\n"); | ||
559 | return -EINVAL; | ||
560 | } | ||
561 | } | ||
562 | |||
554 | if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { | 563 | if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { |
555 | if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) { | 564 | if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) && |
565 | !ocfs2_userspace_stack(osb)) { | ||
556 | mlog(ML_ERROR, "Heartbeat has to be started to mount " | 566 | mlog(ML_ERROR, "Heartbeat has to be started to mount " |
557 | "a read-write clustered device.\n"); | 567 | "a read-write clustered device.\n"); |
558 | return -EINVAL; | 568 | return -EINVAL; |
@@ -562,6 +572,35 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb) | |||
562 | return 0; | 572 | return 0; |
563 | } | 573 | } |
564 | 574 | ||
575 | /* | ||
576 | * If we're using a userspace stack, mount should have passed | ||
577 | * a name that matches the disk. If not, mount should not | ||
578 | * have passed a stack. | ||
579 | */ | ||
580 | static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, | ||
581 | struct mount_options *mopt) | ||
582 | { | ||
583 | if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) { | ||
584 | mlog(ML_ERROR, | ||
585 | "cluster stack passed to mount, but this filesystem " | ||
586 | "does not support it\n"); | ||
587 | return -EINVAL; | ||
588 | } | ||
589 | |||
590 | if (ocfs2_userspace_stack(osb) && | ||
591 | strncmp(osb->osb_cluster_stack, mopt->cluster_stack, | ||
592 | OCFS2_STACK_LABEL_LEN)) { | ||
593 | mlog(ML_ERROR, | ||
594 | "cluster stack passed to mount (\"%s\") does not " | ||
595 | "match the filesystem (\"%s\")\n", | ||
596 | mopt->cluster_stack, | ||
597 | osb->osb_cluster_stack); | ||
598 | return -EINVAL; | ||
599 | } | ||
600 | |||
601 | return 0; | ||
602 | } | ||
603 | |||
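ocfs2_verify_userspace_stack() above enforces agreement between the on-disk cluster stack label and the cluster_stack= mount option. Here is a hedged user-space sketch of the two checks, with an assumed helper name and OCFS2_STACK_LABEL_LEN taken to be 4:

#include <stdio.h>
#include <string.h>
#include <errno.h>

#define STACK_LABEL_LEN 4	/* assumed value of OCFS2_STACK_LABEL_LEN */

static int verify_stack(const char *ondisk, const char *mount_opt)
{
	int fs_userspace = ondisk[0] != '\0';	/* empty label means o2cb */

	if (!fs_userspace && mount_opt[0])
		return -EINVAL;		/* option given, fs doesn't support it */
	if (fs_userspace && strncmp(ondisk, mount_opt, STACK_LABEL_LEN))
		return -EINVAL;		/* labels disagree */
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       verify_stack("", ""),		/* 0: classic o2cb mount */
	       verify_stack("pcmk", "pcmk"),	/* 0: labels match */
	       verify_stack("pcmk", "cman"));	/* -22: mismatch */
	return 0;
}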
565 | static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | 604 | static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) |
566 | { | 605 | { |
567 | struct dentry *root; | 606 | struct dentry *root; |
@@ -579,15 +618,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
579 | goto read_super_error; | 618 | goto read_super_error; |
580 | } | 619 | } |
581 | 620 | ||
582 | /* for now we only have one cluster/node, make sure we see it | ||
583 | * in the heartbeat universe */ | ||
584 | if (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL) { | ||
585 | if (!o2hb_check_local_node_heartbeating()) { | ||
586 | status = -EINVAL; | ||
587 | goto read_super_error; | ||
588 | } | ||
589 | } | ||
590 | |||
591 | /* probe for superblock */ | 621 | /* probe for superblock */ |
592 | status = ocfs2_sb_probe(sb, &bh, §or_size); | 622 | status = ocfs2_sb_probe(sb, &bh, §or_size); |
593 | if (status < 0) { | 623 | if (status < 0) { |
@@ -609,6 +639,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
609 | osb->osb_commit_interval = parsed_options.commit_interval; | 639 | osb->osb_commit_interval = parsed_options.commit_interval; |
610 | osb->local_alloc_size = parsed_options.localalloc_opt; | 640 | osb->local_alloc_size = parsed_options.localalloc_opt; |
611 | 641 | ||
642 | status = ocfs2_verify_userspace_stack(osb, &parsed_options); | ||
643 | if (status) | ||
644 | goto read_super_error; | ||
645 | |||
612 | sb->s_magic = OCFS2_SUPER_MAGIC; | 646 | sb->s_magic = OCFS2_SUPER_MAGIC; |
613 | 647 | ||
614 | /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, | 648 | /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, |
@@ -694,7 +728,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
694 | if (ocfs2_mount_local(osb)) | 728 | if (ocfs2_mount_local(osb)) |
695 | snprintf(nodestr, sizeof(nodestr), "local"); | 729 | snprintf(nodestr, sizeof(nodestr), "local"); |
696 | else | 730 | else |
697 | snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); | 731 | snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); |
698 | 732 | ||
699 | printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " | 733 | printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " |
700 | "with %s data mode.\n", | 734 | "with %s data mode.\n", |
@@ -763,6 +797,7 @@ static int ocfs2_parse_options(struct super_block *sb, | |||
763 | mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; | 797 | mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; |
764 | mopt->slot = OCFS2_INVALID_SLOT; | 798 | mopt->slot = OCFS2_INVALID_SLOT; |
765 | mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; | 799 | mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; |
800 | mopt->cluster_stack[0] = '\0'; | ||
766 | 801 | ||
767 | if (!options) { | 802 | if (!options) { |
768 | status = 1; | 803 | status = 1; |
@@ -864,6 +899,25 @@ static int ocfs2_parse_options(struct super_block *sb, | |||
864 | if (!is_remount) | 899 | if (!is_remount) |
865 | mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; | 900 | mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; |
866 | break; | 901 | break; |
902 | case Opt_stack: | ||
903 | /* Check both that the option we were passed | ||
904 | * is of the right length and that it is a proper | ||
905 | * string of the right length. | ||
906 | */ | ||
907 | if (((args[0].to - args[0].from) != | ||
908 | OCFS2_STACK_LABEL_LEN) || | ||
909 | (strnlen(args[0].from, | ||
910 | OCFS2_STACK_LABEL_LEN) != | ||
911 | OCFS2_STACK_LABEL_LEN)) { | ||
912 | mlog(ML_ERROR, | ||
913 | "Invalid cluster_stack option\n"); | ||
914 | status = 0; | ||
915 | goto bail; | ||
916 | } | ||
917 | memcpy(mopt->cluster_stack, args[0].from, | ||
918 | OCFS2_STACK_LABEL_LEN); | ||
919 | mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; | ||
920 | break; | ||
867 | default: | 921 | default: |
868 | mlog(ML_ERROR, | 922 | mlog(ML_ERROR, |
869 | "Unrecognized mount option \"%s\" " | 923 | "Unrecognized mount option \"%s\" " |
@@ -922,6 +976,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | |||
922 | if (opts & OCFS2_MOUNT_LOCALFLOCKS) | 976 | if (opts & OCFS2_MOUNT_LOCALFLOCKS) |
923 | seq_printf(s, ",localflocks,"); | 977 | seq_printf(s, ",localflocks,"); |
924 | 978 | ||
979 | if (osb->osb_cluster_stack[0]) | ||
980 | seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, | ||
981 | osb->osb_cluster_stack); | ||
982 | |||
925 | return 0; | 983 | return 0; |
926 | } | 984 | } |
927 | 985 | ||
@@ -957,6 +1015,8 @@ static int __init ocfs2_init(void) | |||
957 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); | 1015 | mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); |
958 | } | 1016 | } |
959 | 1017 | ||
1018 | ocfs2_set_locking_protocol(); | ||
1019 | |||
960 | leave: | 1020 | leave: |
961 | if (status < 0) { | 1021 | if (status < 0) { |
962 | ocfs2_free_mem_caches(); | 1022 | ocfs2_free_mem_caches(); |
@@ -1132,31 +1192,6 @@ static int ocfs2_get_sector(struct super_block *sb, | |||
1132 | return 0; | 1192 | return 0; |
1133 | } | 1193 | } |
1134 | 1194 | ||
1135 | /* ocfs2 1.0 only allows one cluster and node identity per kernel image. */ | ||
1136 | static int ocfs2_fill_local_node_info(struct ocfs2_super *osb) | ||
1137 | { | ||
1138 | int status; | ||
1139 | |||
1140 | /* XXX hold a ref on the node while mounte? easy enough, if | ||
1141 | * desirable. */ | ||
1142 | if (ocfs2_mount_local(osb)) | ||
1143 | osb->node_num = 0; | ||
1144 | else | ||
1145 | osb->node_num = o2nm_this_node(); | ||
1146 | |||
1147 | if (osb->node_num == O2NM_MAX_NODES) { | ||
1148 | mlog(ML_ERROR, "could not find this host's node number\n"); | ||
1149 | status = -ENOENT; | ||
1150 | goto bail; | ||
1151 | } | ||
1152 | |||
1153 | mlog(0, "I am node %d\n", osb->node_num); | ||
1154 | |||
1155 | status = 0; | ||
1156 | bail: | ||
1157 | return status; | ||
1158 | } | ||
1159 | |||
1160 | static int ocfs2_mount_volume(struct super_block *sb) | 1195 | static int ocfs2_mount_volume(struct super_block *sb) |
1161 | { | 1196 | { |
1162 | int status = 0; | 1197 | int status = 0; |
@@ -1168,12 +1203,6 @@ static int ocfs2_mount_volume(struct super_block *sb) | |||
1168 | if (ocfs2_is_hard_readonly(osb)) | 1203 | if (ocfs2_is_hard_readonly(osb)) |
1169 | goto leave; | 1204 | goto leave; |
1170 | 1205 | ||
1171 | status = ocfs2_fill_local_node_info(osb); | ||
1172 | if (status < 0) { | ||
1173 | mlog_errno(status); | ||
1174 | goto leave; | ||
1175 | } | ||
1176 | |||
1177 | status = ocfs2_dlm_init(osb); | 1206 | status = ocfs2_dlm_init(osb); |
1178 | if (status < 0) { | 1207 | if (status < 0) { |
1179 | mlog_errno(status); | 1208 | mlog_errno(status); |
@@ -1224,18 +1253,9 @@ leave: | |||
1224 | return status; | 1253 | return status; |
1225 | } | 1254 | } |
1226 | 1255 | ||
1227 | /* we can't grab the goofy sem lock from inside wait_event, so we use | ||
1228 | * memory barriers to make sure that we'll see the null task before | ||
1229 | * being woken up */ | ||
1230 | static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) | ||
1231 | { | ||
1232 | mb(); | ||
1233 | return osb->recovery_thread_task != NULL; | ||
1234 | } | ||
1235 | |||
1236 | static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | 1256 | static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) |
1237 | { | 1257 | { |
1238 | int tmp; | 1258 | int tmp, hangup_needed = 0; |
1239 | struct ocfs2_super *osb = NULL; | 1259 | struct ocfs2_super *osb = NULL; |
1240 | char nodestr[8]; | 1260 | char nodestr[8]; |
1241 | 1261 | ||
@@ -1249,25 +1269,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | |||
1249 | 1269 | ||
1250 | ocfs2_truncate_log_shutdown(osb); | 1270 | ocfs2_truncate_log_shutdown(osb); |
1251 | 1271 | ||
1252 | /* disable any new recovery threads and wait for any currently | 1272 | /* This will disable recovery and flush any recovery work. */ |
1253 | * running ones to exit. Do this before setting the vol_state. */ | 1273 | ocfs2_recovery_exit(osb); |
1254 | mutex_lock(&osb->recovery_lock); | ||
1255 | osb->disable_recovery = 1; | ||
1256 | mutex_unlock(&osb->recovery_lock); | ||
1257 | wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); | ||
1258 | |||
1259 | /* At this point, we know that no more recovery threads can be | ||
1260 | * launched, so wait for any recovery completion work to | ||
1261 | * complete. */ | ||
1262 | flush_workqueue(ocfs2_wq); | ||
1263 | 1274 | ||
1264 | ocfs2_journal_shutdown(osb); | 1275 | ocfs2_journal_shutdown(osb); |
1265 | 1276 | ||
1266 | ocfs2_sync_blockdev(sb); | 1277 | ocfs2_sync_blockdev(sb); |
1267 | 1278 | ||
1268 | /* No dlm means we've failed during mount, so skip all the | 1279 | /* No cluster connection means we've failed during mount, so skip |
1269 | * steps which depended on that to complete. */ | 1280 | * all the steps which depended on that to complete. */ |
1270 | if (osb->dlm) { | 1281 | if (osb->cconn) { |
1271 | tmp = ocfs2_super_lock(osb, 1); | 1282 | tmp = ocfs2_super_lock(osb, 1); |
1272 | if (tmp < 0) { | 1283 | if (tmp < 0) { |
1273 | mlog_errno(tmp); | 1284 | mlog_errno(tmp); |
@@ -1278,25 +1289,34 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | |||
1278 | if (osb->slot_num != OCFS2_INVALID_SLOT) | 1289 | if (osb->slot_num != OCFS2_INVALID_SLOT) |
1279 | ocfs2_put_slot(osb); | 1290 | ocfs2_put_slot(osb); |
1280 | 1291 | ||
1281 | if (osb->dlm) | 1292 | if (osb->cconn) |
1282 | ocfs2_super_unlock(osb, 1); | 1293 | ocfs2_super_unlock(osb, 1); |
1283 | 1294 | ||
1284 | ocfs2_release_system_inodes(osb); | 1295 | ocfs2_release_system_inodes(osb); |
1285 | 1296 | ||
1286 | if (osb->dlm) | 1297 | /* |
1287 | ocfs2_dlm_shutdown(osb); | 1298 | * If we're dismounting due to mount error, mount.ocfs2 will clean |
1299 | * up heartbeat. If we're a local mount, there is no heartbeat. | ||
1300 | * If we failed before we got a uuid_str yet, we can't stop | ||
1301 | * heartbeat. Otherwise, do it. | ||
1302 | */ | ||
1303 | if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str) | ||
1304 | hangup_needed = 1; | ||
1305 | |||
1306 | if (osb->cconn) | ||
1307 | ocfs2_dlm_shutdown(osb, hangup_needed); | ||
1288 | 1308 | ||
1289 | debugfs_remove(osb->osb_debug_root); | 1309 | debugfs_remove(osb->osb_debug_root); |
1290 | 1310 | ||
1291 | if (!mnt_err) | 1311 | if (hangup_needed) |
1292 | ocfs2_stop_heartbeat(osb); | 1312 | ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); |
1293 | 1313 | ||
1294 | atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); | 1314 | atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); |
1295 | 1315 | ||
1296 | if (ocfs2_mount_local(osb)) | 1316 | if (ocfs2_mount_local(osb)) |
1297 | snprintf(nodestr, sizeof(nodestr), "local"); | 1317 | snprintf(nodestr, sizeof(nodestr), "local"); |
1298 | else | 1318 | else |
1299 | snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); | 1319 | snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num); |
1300 | 1320 | ||
1301 | printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", | 1321 | printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", |
1302 | osb->dev_str, nodestr); | 1322 | osb->dev_str, nodestr); |
@@ -1355,7 +1375,6 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
1355 | sb->s_fs_info = osb; | 1375 | sb->s_fs_info = osb; |
1356 | sb->s_op = &ocfs2_sops; | 1376 | sb->s_op = &ocfs2_sops; |
1357 | sb->s_export_op = &ocfs2_export_ops; | 1377 | sb->s_export_op = &ocfs2_export_ops; |
1358 | osb->osb_locking_proto = ocfs2_locking_protocol; | ||
1359 | sb->s_time_gran = 1; | 1378 | sb->s_time_gran = 1; |
1360 | sb->s_flags |= MS_NOATIME; | 1379 | sb->s_flags |= MS_NOATIME; |
1361 | /* this is needed to support O_LARGEFILE */ | 1380 | /* this is needed to support O_LARGEFILE */ |
@@ -1368,7 +1387,6 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
1368 | osb->s_sectsize_bits = blksize_bits(sector_size); | 1387 | osb->s_sectsize_bits = blksize_bits(sector_size); |
1369 | BUG_ON(!osb->s_sectsize_bits); | 1388 | BUG_ON(!osb->s_sectsize_bits); |
1370 | 1389 | ||
1371 | init_waitqueue_head(&osb->recovery_event); | ||
1372 | spin_lock_init(&osb->dc_task_lock); | 1390 | spin_lock_init(&osb->dc_task_lock); |
1373 | init_waitqueue_head(&osb->dc_event); | 1391 | init_waitqueue_head(&osb->dc_event); |
1374 | osb->dc_work_sequence = 0; | 1392 | osb->dc_work_sequence = 0; |
@@ -1376,6 +1394,7 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
1376 | INIT_LIST_HEAD(&osb->blocked_lock_list); | 1394 | INIT_LIST_HEAD(&osb->blocked_lock_list); |
1377 | osb->blocked_lock_count = 0; | 1395 | osb->blocked_lock_count = 0; |
1378 | spin_lock_init(&osb->osb_lock); | 1396 | spin_lock_init(&osb->osb_lock); |
1397 | ocfs2_init_inode_steal_slot(osb); | ||
1379 | 1398 | ||
1380 | atomic_set(&osb->alloc_stats.moves, 0); | 1399 | atomic_set(&osb->alloc_stats.moves, 0); |
1381 | atomic_set(&osb->alloc_stats.local_data, 0); | 1400 | atomic_set(&osb->alloc_stats.local_data, 0); |
@@ -1388,24 +1407,23 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
1388 | snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", | 1407 | snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", |
1389 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | 1408 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); |
1390 | 1409 | ||
1391 | mutex_init(&osb->recovery_lock); | 1410 | status = ocfs2_recovery_init(osb); |
1392 | 1411 | if (status) { | |
1393 | osb->disable_recovery = 0; | 1412 | mlog(ML_ERROR, "Unable to initialize recovery state\n"); |
1394 | osb->recovery_thread_task = NULL; | 1413 | mlog_errno(status); |
1414 | goto bail; | ||
1415 | } | ||
1395 | 1416 | ||
1396 | init_waitqueue_head(&osb->checkpoint_event); | 1417 | init_waitqueue_head(&osb->checkpoint_event); |
1397 | atomic_set(&osb->needs_checkpoint, 0); | 1418 | atomic_set(&osb->needs_checkpoint, 0); |
1398 | 1419 | ||
1399 | osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; | 1420 | osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; |
1400 | 1421 | ||
1401 | osb->node_num = O2NM_INVALID_NODE_NUM; | ||
1402 | osb->slot_num = OCFS2_INVALID_SLOT; | 1422 | osb->slot_num = OCFS2_INVALID_SLOT; |
1403 | 1423 | ||
1404 | osb->local_alloc_state = OCFS2_LA_UNUSED; | 1424 | osb->local_alloc_state = OCFS2_LA_UNUSED; |
1405 | osb->local_alloc_bh = NULL; | 1425 | osb->local_alloc_bh = NULL; |
1406 | 1426 | ||
1407 | ocfs2_setup_hb_callbacks(osb); | ||
1408 | |||
1409 | init_waitqueue_head(&osb->osb_mount_event); | 1427 | init_waitqueue_head(&osb->osb_mount_event); |
1410 | 1428 | ||
1411 | osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); | 1429 | osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); |
@@ -1455,6 +1473,25 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
1455 | goto bail; | 1473 | goto bail; |
1456 | } | 1474 | } |
1457 | 1475 | ||
1476 | if (ocfs2_userspace_stack(osb)) { | ||
1477 | memcpy(osb->osb_cluster_stack, | ||
1478 | OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, | ||
1479 | OCFS2_STACK_LABEL_LEN); | ||
1480 | osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; | ||
1481 | if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { | ||
1482 | mlog(ML_ERROR, | ||
1483 | "couldn't mount because of an invalid " | ||
1484 | "cluster stack label (%s) \n", | ||
1485 | osb->osb_cluster_stack); | ||
1486 | status = -EINVAL; | ||
1487 | goto bail; | ||
1488 | } | ||
1489 | } else { | ||
1490 | /* The empty string is identical with classic tools that | ||
1491 | * don't know about s_cluster_info. */ | ||
1492 | osb->osb_cluster_stack[0] = '\0'; | ||
1493 | } | ||
1494 | |||
1458 | get_random_bytes(&osb->s_next_generation, sizeof(u32)); | 1495 | get_random_bytes(&osb->s_next_generation, sizeof(u32)); |
1459 | 1496 | ||
1460 | /* FIXME | 1497 | /* FIXME |
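The new block in this hunk copies the fixed-width cluster stack label out of s_cluster_info into a NUL-terminated buffer and refuses to mount if the label does not use all OCFS2_STACK_LABEL_LEN characters. A small standalone sketch of that check, assuming a label width of 4 bytes (the value is taken from the on-disk format, not shown here), could be:

#include <stdio.h>
#include <string.h>

#define OCFS2_STACK_LABEL_LEN 4  /* assumed on-disk label width */

/*
 * Copy the fixed-width, possibly unterminated on-disk label into a
 * NUL-terminated buffer and require that it fills the full width;
 * an embedded NUL (short label) is treated as invalid.
 */
static int copy_stack_label(const char raw[OCFS2_STACK_LABEL_LEN],
			    char out[OCFS2_STACK_LABEL_LEN + 1])
{
	memcpy(out, raw, OCFS2_STACK_LABEL_LEN);
	out[OCFS2_STACK_LABEL_LEN] = '\0';
	if (strlen(out) != OCFS2_STACK_LABEL_LEN) {
		fprintf(stderr, "invalid cluster stack label (%s)\n", out);
		return -1;
	}
	return 0;
}

On the classic o2cb path the label is left as the empty string, matching older tools that never wrote s_cluster_info.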
@@ -1724,8 +1761,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) | |||
1724 | 1761 | ||
1725 | /* This function assumes that the caller has the main osb resource */ | 1762 | /* This function assumes that the caller has the main osb resource */ |
1726 | 1763 | ||
1727 | if (osb->slot_info) | 1764 | ocfs2_free_slot_info(osb); |
1728 | ocfs2_free_slot_info(osb->slot_info); | ||
1729 | 1765 | ||
1730 | kfree(osb->osb_orphan_wipes); | 1766 | kfree(osb->osb_orphan_wipes); |
1731 | /* FIXME | 1767 | /* FIXME |
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 5f66c4466151..817f5966edca 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c | |||
@@ -87,7 +87,14 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char | |||
87 | 87 | ||
88 | void sysfs_remove_link(struct kobject * kobj, const char * name) | 88 | void sysfs_remove_link(struct kobject * kobj, const char * name) |
89 | { | 89 | { |
90 | sysfs_hash_and_remove(kobj->sd, name); | 90 | struct sysfs_dirent *parent_sd = NULL; |
91 | |||
92 | if (!kobj) | ||
93 | parent_sd = &sysfs_root; | ||
94 | else | ||
95 | parent_sd = kobj->sd; | ||
96 | |||
97 | sysfs_hash_and_remove(parent_sd, name); | ||
91 | } | 98 | } |
92 | 99 | ||
93 | static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, | 100 | static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, |
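The sysfs change above makes sysfs_remove_link() accept a NULL kobject and resolve the name against the sysfs root, matching what sysfs_create_link() already allowed; this is what permits removing a symlink created directly under /sys. A hedged caller-side illustration (the symlink name follows the /sys/o2cb compatibility link mentioned in this merge, but the helper itself is only a sketch):

#include <linux/kobject.h>
#include <linux/sysfs.h>

static void remove_root_symlink_example(void)
{
	/*
	 * With a NULL kobject the name is looked up in sysfs_root,
	 * so this would drop a root-level symlink such as /sys/o2cb.
	 */
	sysfs_remove_link(NULL, "o2cb");
}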