diff options
-rw-r--r-- | fs/ceph/ceph_fs.h | 2 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 156 |
2 files changed, 57 insertions, 101 deletions
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h index db3fed33c4aa..d0f2557bb41b 100644 --- a/fs/ceph/ceph_fs.h +++ b/fs/ceph/ceph_fs.h | |||
@@ -39,7 +39,7 @@ | |||
39 | #define CEPH_MDS_PROTOCOL 9 /* cluster internal */ | 39 | #define CEPH_MDS_PROTOCOL 9 /* cluster internal */ |
40 | #define CEPH_MON_PROTOCOL 5 /* cluster internal */ | 40 | #define CEPH_MON_PROTOCOL 5 /* cluster internal */ |
41 | #define CEPH_OSDC_PROTOCOL 22 /* server/client */ | 41 | #define CEPH_OSDC_PROTOCOL 22 /* server/client */ |
42 | #define CEPH_MDSC_PROTOCOL 30 /* server/client */ | 42 | #define CEPH_MDSC_PROTOCOL 31 /* server/client */ |
43 | #define CEPH_MONC_PROTOCOL 15 /* server/client */ | 43 | #define CEPH_MONC_PROTOCOL 15 /* server/client */ |
44 | 44 | ||
45 | 45 | ||
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index ec884e2845db..6e08f488a30f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
@@ -9,6 +9,7 @@ | |||
9 | #include "messenger.h" | 9 | #include "messenger.h" |
10 | #include "decode.h" | 10 | #include "decode.h" |
11 | #include "auth.h" | 11 | #include "auth.h" |
12 | #include "pagelist.h" | ||
12 | 13 | ||
13 | /* | 14 | /* |
14 | * A cluster of MDS (metadata server) daemons is responsible for | 15 | * A cluster of MDS (metadata server) daemons is responsible for |
@@ -1971,20 +1972,12 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, | |||
1971 | /* | 1972 | /* |
1972 | * Encode information about a cap for a reconnect with the MDS. | 1973 | * Encode information about a cap for a reconnect with the MDS. |
1973 | */ | 1974 | */ |
1974 | struct encode_caps_data { | ||
1975 | void **pp; | ||
1976 | void *end; | ||
1977 | int *num_caps; | ||
1978 | }; | ||
1979 | |||
1980 | static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, | 1975 | static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, |
1981 | void *arg) | 1976 | void *arg) |
1982 | { | 1977 | { |
1983 | struct ceph_mds_cap_reconnect *rec; | 1978 | struct ceph_mds_cap_reconnect rec; |
1984 | struct ceph_inode_info *ci; | 1979 | struct ceph_inode_info *ci; |
1985 | struct encode_caps_data *data = (struct encode_caps_data *)arg; | 1980 | struct ceph_pagelist *pagelist = arg; |
1986 | void *p = *(data->pp); | ||
1987 | void *end = data->end; | ||
1988 | char *path; | 1981 | char *path; |
1989 | int pathlen, err; | 1982 | int pathlen, err; |
1990 | u64 pathbase; | 1983 | u64 pathbase; |
@@ -1995,8 +1988,9 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1995 | dout(" adding %p ino %llx.%llx cap %p %lld %s\n", | 1988 | dout(" adding %p ino %llx.%llx cap %p %lld %s\n", |
1996 | inode, ceph_vinop(inode), cap, cap->cap_id, | 1989 | inode, ceph_vinop(inode), cap, cap->cap_id, |
1997 | ceph_cap_string(cap->issued)); | 1990 | ceph_cap_string(cap->issued)); |
1998 | ceph_decode_need(&p, end, sizeof(u64), needmore); | 1991 | err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); |
1999 | ceph_encode_64(&p, ceph_ino(inode)); | 1992 | if (err) |
1993 | return err; | ||
2000 | 1994 | ||
2001 | dentry = d_find_alias(inode); | 1995 | dentry = d_find_alias(inode); |
2002 | if (dentry) { | 1996 | if (dentry) { |
@@ -2009,33 +2003,29 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
2009 | path = NULL; | 2003 | path = NULL; |
2010 | pathlen = 0; | 2004 | pathlen = 0; |
2011 | } | 2005 | } |
2012 | ceph_decode_need(&p, end, pathlen+4, needmore); | 2006 | err = ceph_pagelist_encode_string(pagelist, path, pathlen); |
2013 | ceph_encode_string(&p, end, path, pathlen); | 2007 | if (err) |
2008 | goto out; | ||
2014 | 2009 | ||
2015 | ceph_decode_need(&p, end, sizeof(*rec), needmore); | ||
2016 | rec = p; | ||
2017 | p += sizeof(*rec); | ||
2018 | BUG_ON(p > end); | ||
2019 | spin_lock(&inode->i_lock); | 2010 | spin_lock(&inode->i_lock); |
2020 | cap->seq = 0; /* reset cap seq */ | 2011 | cap->seq = 0; /* reset cap seq */ |
2021 | cap->issue_seq = 0; /* and issue_seq */ | 2012 | cap->issue_seq = 0; /* and issue_seq */ |
2022 | rec->cap_id = cpu_to_le64(cap->cap_id); | 2013 | rec.cap_id = cpu_to_le64(cap->cap_id); |
2023 | rec->pathbase = cpu_to_le64(pathbase); | 2014 | rec.pathbase = cpu_to_le64(pathbase); |
2024 | rec->wanted = cpu_to_le32(__ceph_caps_wanted(ci)); | 2015 | rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); |
2025 | rec->issued = cpu_to_le32(cap->issued); | 2016 | rec.issued = cpu_to_le32(cap->issued); |
2026 | rec->size = cpu_to_le64(inode->i_size); | 2017 | rec.size = cpu_to_le64(inode->i_size); |
2027 | ceph_encode_timespec(&rec->mtime, &inode->i_mtime); | 2018 | ceph_encode_timespec(&rec.mtime, &inode->i_mtime); |
2028 | ceph_encode_timespec(&rec->atime, &inode->i_atime); | 2019 | ceph_encode_timespec(&rec.atime, &inode->i_atime); |
2029 | rec->snaprealm = cpu_to_le64(ci->i_snap_realm->ino); | 2020 | rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); |
2030 | spin_unlock(&inode->i_lock); | 2021 | spin_unlock(&inode->i_lock); |
2031 | 2022 | ||
2023 | err = ceph_pagelist_append(pagelist, &rec, sizeof(rec)); | ||
2024 | |||
2025 | out: | ||
2032 | kfree(path); | 2026 | kfree(path); |
2033 | dput(dentry); | 2027 | dput(dentry); |
2034 | (*data->num_caps)++; | 2028 | return err; |
2035 | *(data->pp) = p; | ||
2036 | return 0; | ||
2037 | needmore: | ||
2038 | return -ENOSPC; | ||
2039 | } | 2029 | } |
2040 | 2030 | ||
2041 | 2031 | ||
@@ -2053,19 +2043,26 @@ needmore: | |||
2053 | */ | 2043 | */ |
2054 | static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) | 2044 | static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) |
2055 | { | 2045 | { |
2056 | struct ceph_mds_session *session; | 2046 | struct ceph_mds_session *session = NULL; |
2057 | struct ceph_msg *reply; | 2047 | struct ceph_msg *reply; |
2058 | int newlen, len = 4 + 1; | ||
2059 | void *p, *end; | ||
2060 | int err; | 2048 | int err; |
2061 | int num_caps, num_realms = 0; | ||
2062 | int got; | 2049 | int got; |
2063 | u64 next_snap_ino = 0; | 2050 | u64 next_snap_ino = 0; |
2064 | __le32 *pnum_caps, *pnum_realms; | 2051 | struct ceph_pagelist *pagelist; |
2065 | struct encode_caps_data iter_args; | ||
2066 | 2052 | ||
2067 | pr_info("reconnect to recovering mds%d\n", mds); | 2053 | pr_info("reconnect to recovering mds%d\n", mds); |
2068 | 2054 | ||
2055 | pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); | ||
2056 | if (!pagelist) | ||
2057 | goto fail_nopagelist; | ||
2058 | ceph_pagelist_init(pagelist); | ||
2059 | |||
2060 | reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL); | ||
2061 | if (IS_ERR(reply)) { | ||
2062 | err = PTR_ERR(reply); | ||
2063 | goto fail_nomsg; | ||
2064 | } | ||
2065 | |||
2069 | /* find session */ | 2066 | /* find session */ |
2070 | session = __ceph_lookup_mds_session(mdsc, mds); | 2067 | session = __ceph_lookup_mds_session(mdsc, mds); |
2071 | mutex_unlock(&mdsc->mutex); /* drop lock for duration */ | 2068 | mutex_unlock(&mdsc->mutex); /* drop lock for duration */ |
@@ -2081,12 +2078,6 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) | |||
2081 | 2078 | ||
2082 | /* replay unsafe requests */ | 2079 | /* replay unsafe requests */ |
2083 | replay_unsafe_requests(mdsc, session); | 2080 | replay_unsafe_requests(mdsc, session); |
2084 | |||
2085 | /* estimate needed space */ | ||
2086 | len += session->s_nr_caps * | ||
2087 | (100+sizeof(struct ceph_mds_cap_reconnect)); | ||
2088 | pr_info("estimating i need %d bytes for %d caps\n", | ||
2089 | len, session->s_nr_caps); | ||
2090 | } else { | 2081 | } else { |
2091 | dout("no session for mds%d, will send short reconnect\n", | 2082 | dout("no session for mds%d, will send short reconnect\n", |
2092 | mds); | 2083 | mds); |
@@ -2094,41 +2085,18 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds) | |||
2094 | 2085 | ||
2095 | down_read(&mdsc->snap_rwsem); | 2086 | down_read(&mdsc->snap_rwsem); |
2096 | 2087 | ||
2097 | retry: | 2088 | if (!session) |
2098 | /* build reply */ | ||
2099 | reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, len, 0, 0, NULL); | ||
2100 | if (IS_ERR(reply)) { | ||
2101 | err = PTR_ERR(reply); | ||
2102 | pr_err("send_mds_reconnect ENOMEM on %d for mds%d\n", | ||
2103 | len, mds); | ||
2104 | goto out; | ||
2105 | } | ||
2106 | p = reply->front.iov_base; | ||
2107 | end = p + len; | ||
2108 | |||
2109 | if (!session) { | ||
2110 | ceph_encode_8(&p, 1); /* session was closed */ | ||
2111 | ceph_encode_32(&p, 0); | ||
2112 | goto send; | 2089 | goto send; |
2113 | } | ||
2114 | dout("session %p state %s\n", session, | 2090 | dout("session %p state %s\n", session, |
2115 | session_state_name(session->s_state)); | 2091 | session_state_name(session->s_state)); |
2116 | 2092 | ||
2117 | /* traverse this session's caps */ | 2093 | /* traverse this session's caps */ |
2118 | ceph_encode_8(&p, 0); | 2094 | err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); |
2119 | pnum_caps = p; | 2095 | if (err) |
2120 | ceph_encode_32(&p, session->s_nr_caps); | 2096 | goto fail; |
2121 | num_caps = 0; | 2097 | err = iterate_session_caps(session, encode_caps_cb, pagelist); |
2122 | |||
2123 | iter_args.pp = &p; | ||
2124 | iter_args.end = end; | ||
2125 | iter_args.num_caps = &num_caps; | ||
2126 | err = iterate_session_caps(session, encode_caps_cb, &iter_args); | ||
2127 | if (err == -ENOSPC) | ||
2128 | goto needmore; | ||
2129 | if (err < 0) | 2098 | if (err < 0) |
2130 | goto out; | 2099 | goto out; |
2131 | *pnum_caps = cpu_to_le32(num_caps); | ||
2132 | 2100 | ||
2133 | /* | 2101 | /* |
2134 | * snaprealms. we provide mds with the ino, seq (version), and | 2102 | * snaprealms. we provide mds with the ino, seq (version), and |
@@ -2136,14 +2104,9 @@ retry: | |||
2136 | * it will tell us. | 2104 | * it will tell us. |
2137 | */ | 2105 | */ |
2138 | next_snap_ino = 0; | 2106 | next_snap_ino = 0; |
2139 | /* save some space for the snaprealm count */ | ||
2140 | pnum_realms = p; | ||
2141 | ceph_decode_need(&p, end, sizeof(*pnum_realms), needmore); | ||
2142 | p += sizeof(*pnum_realms); | ||
2143 | num_realms = 0; | ||
2144 | while (1) { | 2107 | while (1) { |
2145 | struct ceph_snap_realm *realm; | 2108 | struct ceph_snap_realm *realm; |
2146 | struct ceph_mds_snaprealm_reconnect *sr_rec; | 2109 | struct ceph_mds_snaprealm_reconnect sr_rec; |
2147 | got = radix_tree_gang_lookup(&mdsc->snap_realms, | 2110 | got = radix_tree_gang_lookup(&mdsc->snap_realms, |
2148 | (void **)&realm, next_snap_ino, 1); | 2111 | (void **)&realm, next_snap_ino, 1); |
2149 | if (!got) | 2112 | if (!got) |
@@ -2151,22 +2114,19 @@ retry: | |||
2151 | 2114 | ||
2152 | dout(" adding snap realm %llx seq %lld parent %llx\n", | 2115 | dout(" adding snap realm %llx seq %lld parent %llx\n", |
2153 | realm->ino, realm->seq, realm->parent_ino); | 2116 | realm->ino, realm->seq, realm->parent_ino); |
2154 | ceph_decode_need(&p, end, sizeof(*sr_rec), needmore); | 2117 | sr_rec.ino = cpu_to_le64(realm->ino); |
2155 | sr_rec = p; | 2118 | sr_rec.seq = cpu_to_le64(realm->seq); |
2156 | sr_rec->ino = cpu_to_le64(realm->ino); | 2119 | sr_rec.parent = cpu_to_le64(realm->parent_ino); |
2157 | sr_rec->seq = cpu_to_le64(realm->seq); | 2120 | err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec)); |
2158 | sr_rec->parent = cpu_to_le64(realm->parent_ino); | 2121 | if (err) |
2159 | p += sizeof(*sr_rec); | 2122 | goto fail; |
2160 | num_realms++; | ||
2161 | next_snap_ino = realm->ino + 1; | 2123 | next_snap_ino = realm->ino + 1; |
2162 | } | 2124 | } |
2163 | *pnum_realms = cpu_to_le32(num_realms); | ||
2164 | 2125 | ||
2165 | send: | 2126 | send: |
2166 | reply->front.iov_len = p - reply->front.iov_base; | 2127 | reply->pagelist = pagelist; |
2167 | reply->hdr.front_len = cpu_to_le32(reply->front.iov_len); | 2128 | reply->hdr.data_len = cpu_to_le32(pagelist->length); |
2168 | dout("final len was %u (guessed %d)\n", | 2129 | reply->nr_pages = calc_pages_for(0, pagelist->length); |
2169 | (unsigned)reply->front.iov_len, len); | ||
2170 | ceph_con_send(&session->s_con, reply); | 2130 | ceph_con_send(&session->s_con, reply); |
2171 | 2131 | ||
2172 | if (session) { | 2132 | if (session) { |
@@ -2183,18 +2143,14 @@ out: | |||
2183 | mutex_lock(&mdsc->mutex); | 2143 | mutex_lock(&mdsc->mutex); |
2184 | return; | 2144 | return; |
2185 | 2145 | ||
2186 | needmore: | 2146 | fail: |
2187 | /* | ||
2188 | * we need a larger buffer. this doesn't very accurately | ||
2189 | * factor in snap realms, but it's safe. | ||
2190 | */ | ||
2191 | num_caps += num_realms; | ||
2192 | newlen = len * ((100 * (session->s_nr_caps+3)) / (num_caps + 1)) / 100; | ||
2193 | pr_info("i guessed %d, and did %d of %d caps, retrying with %d\n", | ||
2194 | len, num_caps, session->s_nr_caps, newlen); | ||
2195 | len = newlen; | ||
2196 | ceph_msg_put(reply); | 2147 | ceph_msg_put(reply); |
2197 | goto retry; | 2148 | fail_nomsg: |
2149 | ceph_pagelist_release(pagelist); | ||
2150 | kfree(pagelist); | ||
2151 | fail_nopagelist: | ||
2152 | pr_err("ENOMEM preparing reconnect for mds%d\n", mds); | ||
2153 | goto out; | ||
2198 | } | 2154 | } |
2199 | 2155 | ||
2200 | 2156 | ||