diff options
author | David Teigland <teigland@redhat.com> | 2011-03-28 15:17:26 -0400 |
---|---|---|
committer | David Teigland <teigland@redhat.com> | 2011-04-01 15:19:06 -0400 |
commit | c6ff669bac5c409f4cb74366248f51b73f7d6feb (patch) | |
tree | 14c4b7dc943a7dde8fd6d80bc9d149dadc0d59b8 /fs/dlm/lock.c | |
parent | 4bcad6c1ef53a9a0224f4654ceb3b9030d0769ec (diff) |
dlm: delayed reply message warning
Add an option (disabled by default) to print a warning message
when a lock has been waiting a configurable amount of time for
a reply message from another node. This is mainly for debugging.
Signed-off-by: David Teigland <teigland@redhat.com>
Diffstat (limited to 'fs/dlm/lock.c')
-rw-r--r-- | fs/dlm/lock.c | 100 |
1 files changed, 93 insertions, 7 deletions
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 04b8c449303f..e3c864120371 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c | |||
@@ -799,10 +799,84 @@ static int msg_reply_type(int mstype) | |||
799 | return -1; | 799 | return -1; |
800 | } | 800 | } |
801 | 801 | ||
/*
 * Look up nodeid in the warned[] table of num_nodes slots.
 *
 * A slot holding zero is treated as unused.  Slots are examined in order:
 * on reaching an unused slot, nodeid is stored there and 0 is returned;
 * on reaching a slot that already holds nodeid, 1 is returned.  If every
 * slot is occupied by some other value, the table is left untouched and
 * 0 is returned.
 */
static int nodeid_warned(int nodeid, int num_nodes, int *warned)
{
	int *slot = warned;
	int remaining;

	for (remaining = num_nodes; remaining > 0; remaining--, slot++) {
		/* already recorded by an earlier call */
		if (*slot && *slot == nodeid)
			return 1;

		/* first free slot: remember this nodeid */
		if (!*slot) {
			*slot = nodeid;
			return 0;
		}
	}

	return 0;
}
816 | |||
817 | void dlm_scan_waiters(struct dlm_ls *ls) | ||
818 | { | ||
819 | struct dlm_lkb *lkb; | ||
820 | ktime_t zero = ktime_set(0, 0); | ||
821 | s64 us; | ||
822 | s64 debug_maxus = 0; | ||
823 | u32 debug_scanned = 0; | ||
824 | u32 debug_expired = 0; | ||
825 | int num_nodes = 0; | ||
826 | int *warned = NULL; | ||
827 | |||
828 | if (!dlm_config.ci_waitwarn_us) | ||
829 | return; | ||
830 | |||
831 | mutex_lock(&ls->ls_waiters_mutex); | ||
832 | |||
833 | list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { | ||
834 | if (ktime_equal(lkb->lkb_wait_time, zero)) | ||
835 | continue; | ||
836 | |||
837 | debug_scanned++; | ||
838 | |||
839 | us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_wait_time)); | ||
840 | |||
841 | if (us < dlm_config.ci_waitwarn_us) | ||
842 | continue; | ||
843 | |||
844 | lkb->lkb_wait_time = zero; | ||
845 | |||
846 | debug_expired++; | ||
847 | if (us > debug_maxus) | ||
848 | debug_maxus = us; | ||
849 | |||
850 | if (!num_nodes) { | ||
851 | num_nodes = ls->ls_num_nodes; | ||
852 | warned = kmalloc(GFP_KERNEL, num_nodes * sizeof(int)); | ||
853 | if (warned) | ||
854 | memset(warned, 0, num_nodes * sizeof(int)); | ||
855 | } | ||
856 | if (!warned) | ||
857 | continue; | ||
858 | if (nodeid_warned(lkb->lkb_wait_nodeid, num_nodes, warned)) | ||
859 | continue; | ||
860 | |||
861 | log_error(ls, "waitwarn %x %lld %d us check connection to " | ||
862 | "node %d", lkb->lkb_id, (long long)us, | ||
863 | dlm_config.ci_waitwarn_us, lkb->lkb_wait_nodeid); | ||
864 | } | ||
865 | mutex_unlock(&ls->ls_waiters_mutex); | ||
866 | |||
867 | if (warned) | ||
868 | kfree(warned); | ||
869 | |||
870 | if (debug_expired) | ||
871 | log_debug(ls, "scan_waiters %u warn %u over %d us max %lld us", | ||
872 | debug_scanned, debug_expired, | ||
873 | dlm_config.ci_waitwarn_us, (long long)debug_maxus); | ||
874 | } | ||
875 | |||
802 | /* add/remove lkb from global waiters list of lkb's waiting for | 876 | /* add/remove lkb from global waiters list of lkb's waiting for |
803 | a reply from a remote node */ | 877 | a reply from a remote node */ |
804 | 878 | ||
805 | static int add_to_waiters(struct dlm_lkb *lkb, int mstype) | 879 | static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid) |
806 | { | 880 | { |
807 | struct dlm_ls *ls = lkb->lkb_resource->res_ls; | 881 | struct dlm_ls *ls = lkb->lkb_resource->res_ls; |
808 | int error = 0; | 882 | int error = 0; |
@@ -842,6 +916,8 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype) | |||
842 | 916 | ||
843 | lkb->lkb_wait_count++; | 917 | lkb->lkb_wait_count++; |
844 | lkb->lkb_wait_type = mstype; | 918 | lkb->lkb_wait_type = mstype; |
919 | lkb->lkb_wait_time = ktime_get(); | ||
920 | lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */ | ||
845 | hold_lkb(lkb); | 921 | hold_lkb(lkb); |
846 | list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); | 922 | list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); |
847 | out: | 923 | out: |
@@ -1157,6 +1233,16 @@ void dlm_adjust_timeouts(struct dlm_ls *ls) | |||
1157 | list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) | 1233 | list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) |
1158 | lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); | 1234 | lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); |
1159 | mutex_unlock(&ls->ls_timeout_mutex); | 1235 | mutex_unlock(&ls->ls_timeout_mutex); |
1236 | |||
1237 | if (!dlm_config.ci_waitwarn_us) | ||
1238 | return; | ||
1239 | |||
1240 | mutex_lock(&ls->ls_waiters_mutex); | ||
1241 | list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { | ||
1242 | if (ktime_to_us(lkb->lkb_wait_time)) | ||
1243 | lkb->lkb_wait_time = ktime_get(); | ||
1244 | } | ||
1245 | mutex_unlock(&ls->ls_waiters_mutex); | ||
1160 | } | 1246 | } |
1161 | 1247 | ||
1162 | /* lkb is master or local copy */ | 1248 | /* lkb is master or local copy */ |
@@ -2844,12 +2930,12 @@ static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype) | |||
2844 | struct dlm_mhandle *mh; | 2930 | struct dlm_mhandle *mh; |
2845 | int to_nodeid, error; | 2931 | int to_nodeid, error; |
2846 | 2932 | ||
2847 | error = add_to_waiters(lkb, mstype); | 2933 | to_nodeid = r->res_nodeid; |
2934 | |||
2935 | error = add_to_waiters(lkb, mstype, to_nodeid); | ||
2848 | if (error) | 2936 | if (error) |
2849 | return error; | 2937 | return error; |
2850 | 2938 | ||
2851 | to_nodeid = r->res_nodeid; | ||
2852 | |||
2853 | error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); | 2939 | error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh); |
2854 | if (error) | 2940 | if (error) |
2855 | goto fail; | 2941 | goto fail; |
@@ -2951,12 +3037,12 @@ static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb) | |||
2951 | struct dlm_mhandle *mh; | 3037 | struct dlm_mhandle *mh; |
2952 | int to_nodeid, error; | 3038 | int to_nodeid, error; |
2953 | 3039 | ||
2954 | error = add_to_waiters(lkb, DLM_MSG_LOOKUP); | 3040 | to_nodeid = dlm_dir_nodeid(r); |
3041 | |||
3042 | error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid); | ||
2955 | if (error) | 3043 | if (error) |
2956 | return error; | 3044 | return error; |
2957 | 3045 | ||
2958 | to_nodeid = dlm_dir_nodeid(r); | ||
2959 | |||
2960 | error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); | 3046 | error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh); |
2961 | if (error) | 3047 | if (error) |
2962 | goto fail; | 3048 | goto fail; |