aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph/mdsmap.c
diff options
context:
space:
mode:
author: Sage Weil <sage@newdream.net> 2009-10-06 14:31:09 -0400
committer: Sage Weil <sage@newdream.net> 2009-10-06 14:31:09 -0400
commit: 2f2dc053404febedc9c273452d9d518fb31fde72 (patch)
tree: 286ff35153d0b52349e035a69f3f795fdcb0afb6 /fs/ceph/mdsmap.c
parent: 1d3576fd10f0d7a104204267b81cf84a07028dad (diff)
ceph: MDS client
The MDS (metadata server) client is responsible for submitting requests to the MDS cluster and parsing the response. We decide which MDS to submit each request to based on cached information about the current partition of the directory hierarchy across the cluster. A stateful session is opened with each MDS before we submit requests to it, and a mutex is used to control the ordering of messages within each session. An MDS request may generate two responses. The first indicates the operation was a success and returns any result. A second reply is sent when the operation commits to disk. Note that locking on the MDS ensures that the results of updates are visible only to the updating client before the operation commits. Requests are linked to the containing directory so that an fsync will wait for them to commit. If an MDS fails and/or recovers, we resubmit requests as needed. We also reconnect existing capabilities to a recovering MDS to reestablish that shared session state. Old dentry leases are invalidated. Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'fs/ceph/mdsmap.c')
-rw-r--r--fs/ceph/mdsmap.c166
1 files changed, 166 insertions, 0 deletions
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..15913cbeb289
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,166 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
17 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
23 char r;
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
32 /* pick */
33 get_random_bytes(&r, 1);
34 n = r % n;
35 i = 0;
36 for (i = 0; n > 0; i++, n--)
37 while (m->m_info[i].state <= 0)
38 i++;
39
40 return i;
41}
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 int i, j, n;
53 int err = -EINVAL;
54 u16 version;
55
56 m = kzalloc(sizeof(*m), GFP_NOFS);
57 if (m == NULL)
58 return ERR_PTR(-ENOMEM);
59
60 ceph_decode_16_safe(p, end, version, bad);
61
62 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
63 ceph_decode_32(p, m->m_epoch);
64 ceph_decode_32(p, m->m_client_epoch);
65 ceph_decode_32(p, m->m_last_failure);
66 ceph_decode_32(p, m->m_root);
67 ceph_decode_32(p, m->m_session_timeout);
68 ceph_decode_32(p, m->m_session_autoclose);
69 ceph_decode_64(p, m->m_max_file_size);
70 ceph_decode_32(p, m->m_max_mds);
71
72 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
73 if (m->m_info == NULL)
74 goto badmem;
75
76 /* pick out active nodes from mds_info (state > 0) */
77 ceph_decode_32(p, n);
78 for (i = 0; i < n; i++) {
79 u32 namelen;
80 s32 mds, inc, state;
81 u64 state_seq;
82 u8 infoversion;
83 struct ceph_entity_addr addr;
84 u32 num_export_targets;
85 void *pexport_targets = NULL;
86
87 ceph_decode_need(p, end, sizeof(addr) + 1 + sizeof(u32), bad);
88 *p += sizeof(addr); /* skip addr key */
89 ceph_decode_8(p, infoversion);
90 ceph_decode_32(p, namelen); /* skip mds name */
91 *p += namelen;
92
93 ceph_decode_need(p, end,
94 5*sizeof(u32) + sizeof(u64) +
95 sizeof(addr) + sizeof(struct ceph_timespec),
96 bad);
97 ceph_decode_32(p, mds);
98 ceph_decode_32(p, inc);
99 ceph_decode_32(p, state);
100 ceph_decode_64(p, state_seq);
101 ceph_decode_copy(p, &addr, sizeof(addr));
102 *p += sizeof(struct ceph_timespec);
103 *p += sizeof(u32);
104 ceph_decode_32_safe(p, end, namelen, bad);
105 *p += sizeof(namelen);
106 if (infoversion >= 2) {
107 ceph_decode_32_safe(p, end, num_export_targets, bad);
108 pexport_targets = *p;
109 *p += sizeof(num_export_targets * sizeof(u32));
110 } else {
111 num_export_targets = 0;
112 }
113
114 dout("mdsmap_decode %d/%d mds%d.%d %s %s\n",
115 i+1, n, mds, inc, pr_addr(&addr.in_addr),
116 ceph_mds_state_name(state));
117 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
118 m->m_info[mds].state = state;
119 m->m_info[mds].addr = addr;
120 m->m_info[mds].num_export_targets = num_export_targets;
121 if (num_export_targets) {
122 m->m_info[mds].export_targets =
123 kcalloc(num_export_targets, sizeof(u32),
124 GFP_NOFS);
125 for (j = 0; j < num_export_targets; j++)
126 ceph_decode_32(&pexport_targets,
127 m->m_info[mds].export_targets[j]);
128 } else {
129 m->m_info[mds].export_targets = NULL;
130 }
131 }
132 }
133
134 /* pg_pools */
135 ceph_decode_32_safe(p, end, n, bad);
136 m->m_num_data_pg_pools = n;
137 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
138 if (!m->m_data_pg_pools)
139 goto badmem;
140 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
141 for (i = 0; i < n; i++)
142 ceph_decode_32(p, m->m_data_pg_pools[i]);
143 ceph_decode_32(p, m->m_cas_pg_pool);
144
145 /* ok, we don't care about the rest. */
146 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
147 return m;
148
149badmem:
150 err = -ENOMEM;
151bad:
152 pr_err("corrupt mdsmap\n");
153 ceph_mdsmap_destroy(m);
154 return ERR_PTR(-EINVAL);
155}
156
157void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
158{
159 int i;
160
161 for (i = 0; i < m->m_max_mds; i++)
162 kfree(m->m_info[i].export_targets);
163 kfree(m->m_info);
164 kfree(m->m_data_pg_pools);
165 kfree(m);
166}