aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph/mds_client.h
diff options
context:
space:
mode:
author: Sage Weil <sage@newdream.net> 2009-10-06 14:31:09 -0400
committer: Sage Weil <sage@newdream.net> 2009-10-06 14:31:09 -0400
commit: 2f2dc053404febedc9c273452d9d518fb31fde72 (patch)
tree: 286ff35153d0b52349e035a69f3f795fdcb0afb6 /fs/ceph/mds_client.h
parent: 1d3576fd10f0d7a104204267b81cf84a07028dad (diff)
ceph: MDS client
The MDS (metadata server) client is responsible for submitting requests to the MDS cluster and parsing the response. We decide which MDS to submit each request to based on cached information about the current partition of the directory hierarchy across the cluster. A stateful session is opened with each MDS before we submit requests to it, and a mutex is used to control the ordering of messages within each session.

An MDS request may generate two responses. The first indicates the operation was a success and returns any result. A second reply is sent when the operation commits to disk. Note that locking on the MDS ensures that the results of updates are visible only to the updating client before the operation commits. Requests are linked to the containing directory so that an fsync will wait for them to commit.

If an MDS fails and/or recovers, we resubmit requests as needed. We also reconnect existing capabilities to a recovering MDS to reestablish that shared session state. Old dentry leases are invalidated.

Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'fs/ceph/mds_client.h')
-rw-r--r--fs/ceph/mds_client.h321
1 files changed, 321 insertions, 0 deletions
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..f566e9c84295
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,321 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/list.h>
6#include <linux/mutex.h>
7#include <linux/radix-tree.h>
8#include <linux/spinlock.h>
9
10#include "types.h"
11#include "messenger.h"
12#include "mdsmap.h"
13
14/*
15 * Some lock dependencies:
16 *
17 * session->s_mutex
18 * mdsc->mutex
19 *
20 * mdsc->snap_rwsem
21 *
22 * inode->i_lock
23 * mdsc->snap_flush_lock
24 * mdsc->cap_delay_lock
25 *
26 */
27
28struct ceph_client;
29struct ceph_cap;
30
31/*
32 * parsed info about a single inode. pointers are into the encoded
33 * on-wire structures within the mds reply message payload.
34 */
35struct ceph_mds_reply_info_in {
36 struct ceph_mds_reply_inode *in;
37 u32 symlink_len;
38 char *symlink;
39 u32 xattr_len;
40 char *xattr_data;
41};
42
43/*
44 * parsed info about an mds reply, including information about the
45 * target inode and/or its parent directory and dentry, and directory
46 * contents (for readdir results).
47 */
48struct ceph_mds_reply_info_parsed {
49 struct ceph_mds_reply_head *head;
50
51 struct ceph_mds_reply_info_in diri, targeti;
52 struct ceph_mds_reply_dirfrag *dirfrag;
53 char *dname;
54 u32 dname_len;
55 struct ceph_mds_reply_lease *dlease;
56
57 struct ceph_mds_reply_dirfrag *dir_dir;
58 int dir_nr;
59 char **dir_dname;
60 u32 *dir_dname_len;
61 struct ceph_mds_reply_lease **dir_dlease;
62 struct ceph_mds_reply_info_in *dir_in;
63 u8 dir_complete, dir_end;
64
65 /* encoded blob describing snapshot contexts for certain
66 operations (e.g., open) */
67 void *snapblob;
68 int snapblob_len;
69};
70
71
/*
 * Cap releases are batched and sent to the MDS en masse.
 *
 * The batch size is the number of struct ceph_mds_cap_item entries that
 * fit in PAGE_CACHE_SIZE bytes once sizeof(struct ceph_mds_cap_release)
 * has been subtracted.
 */
#define CEPH_CAPS_PER_RELEASE						\
	((PAGE_CACHE_SIZE - sizeof(struct ceph_mds_cap_release)) /	\
	 sizeof(struct ceph_mds_cap_item))
78
79
/*
 * States of an MDS<->client session.  The numbering starts at 1, so 0
 * is not a valid state value.
 */
enum {
	CEPH_MDS_SESSION_NEW          = 1,
	CEPH_MDS_SESSION_OPENING      = 2,
	CEPH_MDS_SESSION_OPEN         = 3,
	CEPH_MDS_SESSION_HUNG         = 4,
	CEPH_MDS_SESSION_CLOSING      = 5,
	CEPH_MDS_SESSION_RESTARTING   = 6,
	CEPH_MDS_SESSION_RECONNECTING = 7,
};
92
93struct ceph_mds_session {
94 struct ceph_mds_client *s_mdsc;
95 int s_mds;
96 int s_state;
97 unsigned long s_ttl; /* time until mds kills us */
98 u64 s_seq; /* incoming msg seq # */
99 struct mutex s_mutex; /* serialize session messages */
100
101 struct ceph_connection s_con;
102
103 /* protected by s_cap_lock */
104 spinlock_t s_cap_lock;
105 u32 s_cap_gen; /* inc each time we get mds stale msg */
106 unsigned long s_cap_ttl; /* when session caps expire */
107 struct list_head s_caps; /* all caps issued by this session */
108 int s_nr_caps, s_trim_caps;
109 int s_num_cap_releases;
110 struct list_head s_cap_releases; /* waiting cap_release messages */
111 struct list_head s_cap_releases_done; /* ready to send */
112
113 /* protected by mutex */
114 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
115 struct list_head s_cap_snaps_flushing;
116 unsigned long s_renew_requested; /* last time we sent a renew req */
117 u64 s_renew_seq;
118
119 atomic_t s_ref;
120 struct list_head s_waiting; /* waiting requests */
121 struct list_head s_unsafe; /* unsafe requests */
122};
123
/*
 * Policies for choosing which MDS a request is sent to.
 */
enum {
	USE_ANY_MDS    = 0,
	USE_RANDOM_MDS = 1,
	USE_AUTH_MDS   = 2,	/* prefer authoritative mds for this metadata item */
};
132
struct ceph_mds_request;
struct ceph_mds_client;

/*
 * Request completion callback.  It receives the mds client and the
 * request, and returns nothing.
 */
typedef void (*ceph_mds_request_callback_t)(struct ceph_mds_client *mdsc,
					    struct ceph_mds_request *req);
141
142/*
143 * an in-flight mds request
144 */
145struct ceph_mds_request {
146 u64 r_tid; /* transaction id */
147
148 int r_op; /* mds op code */
149 int r_mds;
150
151 /* operation on what? */
152 struct inode *r_inode; /* arg1 */
153 struct dentry *r_dentry; /* arg1 */
154 struct dentry *r_old_dentry; /* arg2: rename from or link from */
155 char *r_path1, *r_path2;
156 struct ceph_vino r_ino1, r_ino2;
157
158 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
159 struct inode *r_target_inode; /* resulting inode */
160
161 union ceph_mds_request_args r_args;
162 int r_fmode; /* file mode, if expecting cap */
163
164 /* for choosing which mds to send this request to */
165 int r_direct_mode;
166 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
167 bool r_direct_is_hash; /* true if r_direct_hash is valid */
168
169 /* data payload is used for xattr ops */
170 struct page **r_pages;
171 int r_num_pages;
172 int r_data_len;
173
174 /* what caps shall we drop? */
175 int r_inode_drop, r_inode_unless;
176 int r_dentry_drop, r_dentry_unless;
177 int r_old_dentry_drop, r_old_dentry_unless;
178 struct inode *r_old_inode;
179 int r_old_inode_drop, r_old_inode_unless;
180
181 struct ceph_msg *r_request; /* original request */
182 struct ceph_msg *r_reply;
183 struct ceph_mds_reply_info_parsed r_reply_info;
184 int r_err;
185
186 unsigned long r_timeout; /* optional. jiffies */
187 unsigned long r_started; /* start time to measure timeout against */
188 unsigned long r_request_started; /* start time for mds request only,
189 used to measure lease durations */
190
191 /* link unsafe requests to parent directory, for fsync */
192 struct inode *r_unsafe_dir;
193 struct list_head r_unsafe_dir_item;
194
195 struct ceph_mds_session *r_session;
196
197 int r_attempts; /* resend attempts */
198 int r_num_fwd; /* number of forward attempts */
199 int r_num_stale;
200 int r_resend_mds; /* mds to resend to next, if any*/
201
202 atomic_t r_ref;
203 struct list_head r_wait;
204 struct completion r_completion;
205 struct completion r_safe_completion;
206 ceph_mds_request_callback_t r_callback;
207 struct list_head r_unsafe_item; /* per-session unsafe list item */
208 bool r_got_unsafe, r_got_safe;
209
210 bool r_did_prepopulate;
211 u32 r_readdir_offset;
212
213 struct ceph_cap_reservation r_caps_reservation;
214 int r_num_caps;
215};
216
217/*
218 * mds client state
219 */
220struct ceph_mds_client {
221 struct ceph_client *client;
222 struct mutex mutex; /* all nested structures */
223
224 struct ceph_mdsmap *mdsmap;
225 struct completion safe_umount_waiters, session_close_waiters;
226 struct list_head waiting_for_map;
227
228 struct ceph_mds_session **sessions; /* NULL for mds if no session */
229 int max_sessions; /* len of s_mds_sessions */
230 int stopping; /* true if shutting down */
231
232 /*
233 * snap_rwsem will cover cap linkage into snaprealms, and
234 * realm snap contexts. (later, we can do per-realm snap
235 * contexts locks..) the empty list contains realms with no
236 * references (implying they contain no inodes with caps) that
237 * should be destroyed.
238 */
239 struct rw_semaphore snap_rwsem;
240 struct radix_tree_root snap_realms;
241 struct list_head snap_empty;
242 spinlock_t snap_empty_lock; /* protect snap_empty */
243
244 u64 last_tid; /* most recent mds request */
245 struct radix_tree_root request_tree; /* pending mds requests */
246 struct delayed_work delayed_work; /* delayed work */
247 unsigned long last_renew_caps; /* last time we renewed our caps */
248 struct list_head cap_delay_list; /* caps with delayed release */
249 spinlock_t cap_delay_lock; /* protects cap_delay_list */
250 struct list_head snap_flush_list; /* cap_snaps ready to flush */
251 spinlock_t snap_flush_lock;
252
253 u64 cap_flush_seq;
254 struct list_head cap_dirty; /* inodes with dirty caps */
255 int num_cap_flushing; /* # caps we are flushing */
256 spinlock_t cap_dirty_lock; /* protects above items */
257 wait_queue_head_t cap_flushing_wq;
258
259 struct dentry *debugfs_file;
260
261 spinlock_t dentry_lru_lock;
262 struct list_head dentry_lru;
263 int num_dentry;
264};
265
/*
 * Returns a string for an mds op code.
 * NOTE(review): presumably a printable name for debugging output; the
 * implementation is not in this header -- confirm.
 */
extern const char *ceph_mds_op_name(int op);

/*
 * Look up the session for mds number @mds within @mdsc.
 * NOTE(review): the leading "__" suggests the caller must already hold
 * a lock (likely mdsc->mutex), and the result for a missing session is
 * not visible here -- confirm both against the implementation.
 */
extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, int mds);
270
271static inline struct ceph_mds_session *
272ceph_get_mds_session(struct ceph_mds_session *s)
273{
274 atomic_inc(&s->s_ref);
275 return s;
276}
277
/*
 * The functions below are only declared here; their behavior is defined
 * by the implementation, which is not part of this header.
 */

/* session reference release; presumably pairs with ceph_get_mds_session() */
extern void ceph_put_mds_session(struct ceph_mds_session *s);

/* send @msg to mds number @mds */
extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
			     struct ceph_msg *msg,
			     int mds);

/* mds client setup and teardown */
extern void ceph_mdsc_init(struct ceph_mds_client *mdsc,
			   struct ceph_client *client);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);

extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);

extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
				    struct inode *inode,
				    struct dentry *dn,
				    int mask);

/*
 * request creation and submission
 * NOTE(review): @mode is presumably one of the USE_*_MDS values -- confirm.
 */
extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
				     struct ceph_mds_request *req);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
				struct inode *dir,
				struct ceph_mds_request *req);
301static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
302{
303 atomic_inc(&req->r_ref);
304}
305extern void ceph_mdsc_put_request(struct ceph_mds_request *req);
306
307extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
308
309extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
310 int stop_on_nosnap);
311
312extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
313extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
314 struct inode *inode,
315 struct dentry *dentry, char action,
316 u32 seq);
317
318extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
319 struct ceph_msg *msg);
320
321#endif