Diffstat (limited to 'fs/ceph')
-rw-r--r--  fs/ceph/Kconfig                 27
-rw-r--r--  fs/ceph/Makefile                39
-rw-r--r--  fs/ceph/README                  20
-rw-r--r--  fs/ceph/addr.c                1193
-rw-r--r--  fs/ceph/armor.c                 99
-rw-r--r--  fs/ceph/auth.c                 258
-rw-r--r--  fs/ceph/auth.h                  84
-rw-r--r--  fs/ceph/auth_none.c            122
-rw-r--r--  fs/ceph/auth_none.h             28
-rw-r--r--  fs/ceph/auth_x.c               680
-rw-r--r--  fs/ceph/auth_x.h                49
-rw-r--r--  fs/ceph/auth_x_protocol.h       90
-rw-r--r--  fs/ceph/buffer.c                81
-rw-r--r--  fs/ceph/buffer.h                39
-rw-r--r--  fs/ceph/caps.c                2955
-rw-r--r--  fs/ceph/ceph_debug.h            37
-rw-r--r--  fs/ceph/ceph_frag.c             21
-rw-r--r--  fs/ceph/ceph_frag.h            109
-rw-r--r--  fs/ceph/ceph_fs.c               74
-rw-r--r--  fs/ceph/ceph_fs.h              650
-rw-r--r--  fs/ceph/ceph_hash.c            118
-rw-r--r--  fs/ceph/ceph_hash.h             13
-rw-r--r--  fs/ceph/ceph_strings.c         176
-rw-r--r--  fs/ceph/crush/crush.c          151
-rw-r--r--  fs/ceph/crush/crush.h          180
-rw-r--r--  fs/ceph/crush/hash.c           149
-rw-r--r--  fs/ceph/crush/hash.h            17
-rw-r--r--  fs/ceph/crush/mapper.c         596
-rw-r--r--  fs/ceph/crush/mapper.h          20
-rw-r--r--  fs/ceph/crypto.c               409
-rw-r--r--  fs/ceph/crypto.h                48
-rw-r--r--  fs/ceph/debugfs.c              484
-rw-r--r--  fs/ceph/decode.h               194
-rw-r--r--  fs/ceph/dir.c                 1224
-rw-r--r--  fs/ceph/export.c               224
-rw-r--r--  fs/ceph/file.c                 938
-rw-r--r--  fs/ceph/inode.c               1774
-rw-r--r--  fs/ceph/ioctl.c                160
-rw-r--r--  fs/ceph/ioctl.h                 40
-rw-r--r--  fs/ceph/mds_client.c          3043
-rw-r--r--  fs/ceph/mds_client.h           335
-rw-r--r--  fs/ceph/mdsmap.c               174
-rw-r--r--  fs/ceph/mdsmap.h                54
-rw-r--r--  fs/ceph/messenger.c           2249
-rw-r--r--  fs/ceph/messenger.h            255
-rw-r--r--  fs/ceph/mon_client.c           835
-rw-r--r--  fs/ceph/mon_client.h           119
-rw-r--r--  fs/ceph/msgpool.c              186
-rw-r--r--  fs/ceph/msgpool.h               27
-rw-r--r--  fs/ceph/msgr.h                 158
-rw-r--r--  fs/ceph/osd_client.c          1550
-rw-r--r--  fs/ceph/osd_client.h           166
-rw-r--r--  fs/ceph/osdmap.c              1062
-rw-r--r--  fs/ceph/osdmap.h               126
-rw-r--r--  fs/ceph/pagelist.c              55
-rw-r--r--  fs/ceph/pagelist.h              54
-rw-r--r--  fs/ceph/rados.h                376
-rw-r--r--  fs/ceph/snap.c                 907
-rw-r--r--  fs/ceph/super.c               1031
-rw-r--r--  fs/ceph/super.h                901
-rw-r--r--  fs/ceph/types.h                 29
-rw-r--r--  fs/ceph/xattr.c                845
62 files changed, 28107 insertions(+), 0 deletions(-)
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..04b8280582a9
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
5 select CRYPTO_AES
6 help
7 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely
9 scalable file system designed to provide high performance,
10 reliable access to petabytes of storage.
11
12 More information at http://ceph.newdream.net/.
13
14 If unsure, say N.
15
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
22 line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o
8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
22# Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..412593703d1e
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1193 @@
1#include "ceph_debug.h"
2
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/slab.h>
9#include <linux/pagevec.h>
10#include <linux/task_io_accounting_ops.h>
11
12#include "super.h"
13#include "osd_client.h"
14
15/*
16 * Ceph address space ops.
17 *
18 * There are a few funny things going on here.
19 *
20 * The page->private field is used to reference a struct
21 * ceph_snap_context for _every_ dirty page. This indicates which
22 * snapshot the page was logically dirtied in, and thus which snap
23 * context needs to be associated with the osd write during writeback.
24 *
25 * Similarly, struct ceph_inode_info maintains a set of counters to
26 * count dirty pages on the inode. In the absence of snapshots,
27 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
28 *
29 * When a snapshot is taken (that is, when the client receives
30 * notification that a snapshot was taken), each inode with caps and
31 * with dirty pages (dirty pages implies there is a cap) gets a new
32 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
33 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
34 * moved to capsnap->dirty. (Unless a sync write is currently in
35 * progress. In that case, the capsnap is said to be "pending", new
36 * writes cannot start, and the capsnap isn't "finalized" until the
37 * write completes (or fails) and a final size/mtime for the inode for
38 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
39 *
40 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
41 * we look for the first capsnap in i_cap_snaps and write out pages in
42 * that snap context _only_. Then we move on to the next capsnap,
43 * eventually reaching the "live" or "head" context (i.e., pages that
44 * are not yet snapped) and are writing the most recently dirtied
45 * pages.
46 *
47 * Invalidate and so forth must take care to ensure the dirty page
48 * accounting is preserved.
49 */
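/*
 * In other words (an inferred summary of the accounting above):
 *   i_wrbuffer_ref == i_wrbuffer_ref_head +
 *                     the sum of capsnap->dirty_pages over i_cap_snaps
 * so every dirty page is counted exactly once, either against the
 * live "head" snap context or against one older cap_snap.
 */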
50
51#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
52#define CONGESTION_OFF_THRESH(congestion_kb) \
53 (CONGESTION_ON_THRESH(congestion_kb) - \
54 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
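/*
 * Worked example: assuming 4 KB pages (PAGE_SHIFT == 12) and a
 * hypothetical congestion_kb of 8192, CONGESTION_ON_THRESH(8192)
 * = 8192 >> 2 = 2048 pages and CONGESTION_OFF_THRESH(8192)
 * = 2048 - (2048 >> 2) = 1536 pages: the bdi is flagged congested
 * once 2048 pages are under writeback for this client, and cleared
 * again only after the count drops below 1536.
 */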
55
56
57
58/*
59 * Dirty a page. Optimistically adjust accounting, on the assumption
60 * that we won't race with invalidate. If we do, readjust.
61 */
62static int ceph_set_page_dirty(struct page *page)
63{
64 struct address_space *mapping = page->mapping;
65 struct inode *inode;
66 struct ceph_inode_info *ci;
67 int undo = 0;
68 struct ceph_snap_context *snapc;
69
70 if (unlikely(!mapping))
71 return !TestSetPageDirty(page);
72
73 if (TestSetPageDirty(page)) {
74 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
75 mapping->host, page, page->index);
76 return 0;
77 }
78
79 inode = mapping->host;
80 ci = ceph_inode(inode);
81
82 /*
83 * Note that we're grabbing a snapc ref here without holding
84 * any locks!
85 */
86 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
87
88 /* dirty the head */
89 spin_lock(&inode->i_lock);
90 if (ci->i_wrbuffer_ref_head == 0)
91 ci->i_head_snapc = ceph_get_snap_context(snapc);
92 ++ci->i_wrbuffer_ref_head;
93 if (ci->i_wrbuffer_ref == 0)
94 igrab(inode);
95 ++ci->i_wrbuffer_ref;
96 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
97 "snapc %p seq %lld (%d snaps)\n",
98 mapping->host, page, page->index,
99 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
100 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
101 snapc, snapc->seq, snapc->num_snaps);
102 spin_unlock(&inode->i_lock);
103
104 /* now adjust page */
105 spin_lock_irq(&mapping->tree_lock);
106 if (page->mapping) { /* Race with truncate? */
107 WARN_ON_ONCE(!PageUptodate(page));
108
109 if (mapping_cap_account_dirty(mapping)) {
110 __inc_zone_page_state(page, NR_FILE_DIRTY);
111 __inc_bdi_stat(mapping->backing_dev_info,
112 BDI_RECLAIMABLE);
113 task_io_account_write(PAGE_CACHE_SIZE);
114 }
115 radix_tree_tag_set(&mapping->page_tree,
116 page_index(page), PAGECACHE_TAG_DIRTY);
117
118 /*
119 * Reference snap context in page->private. Also set
120 * PagePrivate so that we get invalidatepage callback.
121 */
122 page->private = (unsigned long)snapc;
123 SetPagePrivate(page);
124 } else {
125 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
126 undo = 1;
127 }
128
129 spin_unlock_irq(&mapping->tree_lock);
130
131 if (undo)
132 /* whoops, we failed to dirty the page */
133 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
134
135 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
136
137 BUG_ON(!PageDirty(page));
138 return 1;
139}
140
141/*
142 * If we are truncating the full page (i.e. offset == 0), adjust the
143 * dirty page counters appropriately. Only called if there is private
144 * data on the page.
145 */
146static void ceph_invalidatepage(struct page *page, unsigned long offset)
147{
148 struct inode *inode;
149 struct ceph_inode_info *ci;
150 struct ceph_snap_context *snapc = (void *)page->private;
151
152 BUG_ON(!PageLocked(page));
153 BUG_ON(!page->private);
154 BUG_ON(!PagePrivate(page));
155 BUG_ON(!page->mapping);
156
157 inode = page->mapping->host;
158
159 /*
160 * We can get non-dirty pages here due to races between
161 * set_page_dirty and truncate_complete_page; just spit out a
162 * warning, in case we end up with accounting problems later.
163 */
164 if (!PageDirty(page))
165 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
166
167 if (offset == 0)
168 ClearPageChecked(page);
169
170 ci = ceph_inode(inode);
171 if (offset == 0) {
172 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
173 inode, page, page->index, offset);
174 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
175 ceph_put_snap_context(snapc);
176 page->private = 0;
177 ClearPagePrivate(page);
178 } else {
179 dout("%p invalidatepage %p idx %lu partial dirty page\n",
180 inode, page, page->index);
181 }
182}
183
184/* just a sanity check */
185static int ceph_releasepage(struct page *page, gfp_t g)
186{
187 struct inode *inode = page->mapping ? page->mapping->host : NULL;
188 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
189 WARN_ON(PageDirty(page));
190 WARN_ON(page->private);
191 WARN_ON(PagePrivate(page));
192 return 0;
193}
194
195/*
196 * read a single page, without unlocking it.
197 */
198static int readpage_nounlock(struct file *filp, struct page *page)
199{
200 struct inode *inode = filp->f_dentry->d_inode;
201 struct ceph_inode_info *ci = ceph_inode(inode);
202 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
203 int err = 0;
204 u64 len = PAGE_CACHE_SIZE;
205
206 dout("readpage inode %p file %p page %p index %lu\n",
207 inode, filp, page, page->index);
208 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
209 page->index << PAGE_CACHE_SHIFT, &len,
210 ci->i_truncate_seq, ci->i_truncate_size,
211 &page, 1);
212 if (err == -ENOENT)
213 err = 0;
214 if (err < 0) {
215 SetPageError(page);
216 goto out;
217 } else if (err < PAGE_CACHE_SIZE) {
218 /* zero fill remainder of page */
219 zero_user_segment(page, err, PAGE_CACHE_SIZE);
220 }
221 SetPageUptodate(page);
222
223out:
224 return err < 0 ? err : 0;
225}
226
227static int ceph_readpage(struct file *filp, struct page *page)
228{
229 int r = readpage_nounlock(filp, page);
230 unlock_page(page);
231 return r;
232}
233
234/*
235 * Build a vector of contiguous pages from the provided page list.
236 */
237static struct page **page_vector_from_list(struct list_head *page_list,
238 unsigned *nr_pages)
239{
240 struct page **pages;
241 struct page *page;
242 int next_index, contig_pages = 0;
243
244 /* build page vector */
245 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
246 if (!pages)
247 return ERR_PTR(-ENOMEM);
248
249 BUG_ON(list_empty(page_list));
250 next_index = list_entry(page_list->prev, struct page, lru)->index;
251 list_for_each_entry_reverse(page, page_list, lru) {
252 if (page->index == next_index) {
253 dout("readpages page %d %p\n", contig_pages, page);
254 pages[contig_pages] = page;
255 contig_pages++;
256 next_index++;
257 } else {
258 break;
259 }
260 }
261 *nr_pages = contig_pages;
262 return pages;
263}
264
265/*
266 * Read multiple pages. Leave pages we don't read + unlock in page_list;
267 * the caller (VM) cleans them up.
268 */
269static int ceph_readpages(struct file *file, struct address_space *mapping,
270 struct list_head *page_list, unsigned nr_pages)
271{
272 struct inode *inode = file->f_dentry->d_inode;
273 struct ceph_inode_info *ci = ceph_inode(inode);
274 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
275 int rc = 0;
276 struct page **pages;
277 struct pagevec pvec;
278 loff_t offset;
279 u64 len;
280
281 dout("readpages %p file %p nr_pages %d\n",
282 inode, file, nr_pages);
283
284 pages = page_vector_from_list(page_list, &nr_pages);
285 if (IS_ERR(pages))
286 return PTR_ERR(pages);
287
288 /* guess read extent */
289 offset = pages[0]->index << PAGE_CACHE_SHIFT;
290 len = nr_pages << PAGE_CACHE_SHIFT;
291 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
292 offset, &len,
293 ci->i_truncate_seq, ci->i_truncate_size,
294 pages, nr_pages);
295 if (rc == -ENOENT)
296 rc = 0;
297 if (rc < 0)
298 goto out;
299
300 /* set uptodate and add to lru in pagevec-sized chunks */
301 pagevec_init(&pvec, 0);
302 for (; !list_empty(page_list) && len > 0;
303 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
304 struct page *page =
305 list_entry(page_list->prev, struct page, lru);
306
307 list_del(&page->lru);
308
309 if (rc < (int)PAGE_CACHE_SIZE) {
310 /* zero (remainder of) page */
311 int s = rc < 0 ? 0 : rc;
312 zero_user_segment(page, s, PAGE_CACHE_SIZE);
313 }
314
315 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
316 page_cache_release(page);
317 dout("readpages %p add_to_page_cache failed %p\n",
318 inode, page);
319 continue;
320 }
321 dout("readpages %p adding %p idx %lu\n", inode, page,
322 page->index);
323 flush_dcache_page(page);
324 SetPageUptodate(page);
325 unlock_page(page);
326 if (pagevec_add(&pvec, page) == 0)
327 pagevec_lru_add_file(&pvec); /* add to lru */
328 }
329 pagevec_lru_add_file(&pvec);
330 rc = 0;
331
332out:
333 kfree(pages);
334 return rc;
335}
336
337/*
338 * Get ref for the oldest snapc for an inode with dirty data... that is, the
339 * only snap context we are allowed to write back.
340 */
341static struct ceph_snap_context *get_oldest_context(struct inode *inode,
342 u64 *snap_size)
343{
344 struct ceph_inode_info *ci = ceph_inode(inode);
345 struct ceph_snap_context *snapc = NULL;
346 struct ceph_cap_snap *capsnap = NULL;
347
348 spin_lock(&inode->i_lock);
349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
351 capsnap->context, capsnap->dirty_pages);
352 if (capsnap->dirty_pages) {
353 snapc = ceph_get_snap_context(capsnap->context);
354 if (snap_size)
355 *snap_size = capsnap->size;
356 break;
357 }
358 }
359 if (!snapc && ci->i_head_snapc) {
360 snapc = ceph_get_snap_context(ci->i_head_snapc);
361 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head);
363 }
364 spin_unlock(&inode->i_lock);
365 return snapc;
366}
367
368/*
369 * Write a single page, but leave the page locked.
370 *
371 * If we get a write error, set the page error bit, but still adjust the
372 * dirty page accounting (i.e., page is no longer dirty).
373 */
374static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
375{
376 struct inode *inode;
377 struct ceph_inode_info *ci;
378 struct ceph_client *client;
379 struct ceph_osd_client *osdc;
380 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
381 int len = PAGE_CACHE_SIZE;
382 loff_t i_size;
383 int err = 0;
384 struct ceph_snap_context *snapc, *oldest;
385 u64 snap_size = 0;
386 long writeback_stat;
387
388 dout("writepage %p idx %lu\n", page, page->index);
389
390 if (!page->mapping || !page->mapping->host) {
391 dout("writepage %p - no mapping\n", page);
392 return -EFAULT;
393 }
394 inode = page->mapping->host;
395 ci = ceph_inode(inode);
396 client = ceph_inode_to_client(inode);
397 osdc = &client->osdc;
398
399 /* verify this is a writeable snap context */
400 snapc = (void *)page->private;
401 if (snapc == NULL) {
402 dout("writepage %p page %p not dirty?\n", inode, page);
403 goto out;
404 }
405 oldest = get_oldest_context(inode, &snap_size);
406 if (snapc->seq > oldest->seq) {
407 dout("writepage %p page %p snapc %p not writeable - noop\n",
408 inode, page, (void *)page->private);
409 /* we should only noop if called by kswapd */
410 WARN_ON((current->flags & PF_MEMALLOC) == 0);
411 ceph_put_snap_context(oldest);
412 goto out;
413 }
414 ceph_put_snap_context(oldest);
415
416 /* is this a partial page at end of file? */
417 if (snap_size)
418 i_size = snap_size;
419 else
420 i_size = i_size_read(inode);
421 if (i_size < page_off + len)
422 len = i_size - page_off;
423
424 dout("writepage %p page %p index %lu on %llu~%u\n",
425 inode, page, page->index, page_off, len);
426
427 writeback_stat = atomic_long_inc_return(&client->writeback_count);
428 if (writeback_stat >
429 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
430 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
431
432 set_page_writeback(page);
433 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
434 &ci->i_layout, snapc,
435 page_off, len,
436 ci->i_truncate_seq, ci->i_truncate_size,
437 &inode->i_mtime,
438 &page, 1, 0, 0, true);
439 if (err < 0) {
440 dout("writepage setting page/mapping error %d %p\n", err, page);
441 SetPageError(page);
442 mapping_set_error(&inode->i_data, err);
443 if (wbc)
444 wbc->pages_skipped++;
445 } else {
446 dout("writepage cleaned page %p\n", page);
447 err = 0; /* vfs expects us to return 0 */
448 }
449 page->private = 0;
450 ClearPagePrivate(page);
451 end_page_writeback(page);
452 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
453 ceph_put_snap_context(snapc); /* page's reference */
454out:
455 return err;
456}
457
458static int ceph_writepage(struct page *page, struct writeback_control *wbc)
459{
460 int err;
461 struct inode *inode = page->mapping->host;
462 BUG_ON(!inode);
463 igrab(inode);
464 err = writepage_nounlock(page, wbc);
465 unlock_page(page);
466 iput(inode);
467 return err;
468}
469
470
471/*
472 * lame release_pages helper. release_pages() isn't exported to
473 * modules.
474 */
475static void ceph_release_pages(struct page **pages, int num)
476{
477 struct pagevec pvec;
478 int i;
479
480 pagevec_init(&pvec, 0);
481 for (i = 0; i < num; i++) {
482 if (pagevec_add(&pvec, pages[i]) == 0)
483 pagevec_release(&pvec);
484 }
485 pagevec_release(&pvec);
486}
487
488
489/*
490 * async writeback completion handler.
491 *
492 * If we get an error, set the mapping error bit, but not the individual
493 * page error bits.
494 */
495static void writepages_finish(struct ceph_osd_request *req,
496 struct ceph_msg *msg)
497{
498 struct inode *inode = req->r_inode;
499 struct ceph_osd_reply_head *replyhead;
500 struct ceph_osd_op *op;
501 struct ceph_inode_info *ci = ceph_inode(inode);
502 unsigned wrote;
503 struct page *page;
504 int i;
505 struct ceph_snap_context *snapc = req->r_snapc;
506 struct address_space *mapping = inode->i_mapping;
507 struct writeback_control *wbc = req->r_wbc;
508 __s32 rc = -EIO;
509 u64 bytes = 0;
510 struct ceph_client *client = ceph_inode_to_client(inode);
511 long writeback_stat;
512 unsigned issued = __ceph_caps_issued(ci, NULL);
513
514 /* parse reply */
515 replyhead = msg->front.iov_base;
516 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
517 op = (void *)(replyhead + 1);
518 rc = le32_to_cpu(replyhead->result);
519 bytes = le64_to_cpu(op->extent.length);
520
521 if (rc >= 0) {
522 /*
523 * Assume we wrote the pages we originally sent. The
524 * osd might reply with fewer pages if our writeback
525 * raced with a truncation and was adjusted at the osd,
526 * so don't believe the reply.
527 */
528 wrote = req->r_num_pages;
529 } else {
530 wrote = 0;
531 mapping_set_error(mapping, rc);
532 }
533 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
534 inode, rc, bytes, wrote);
535
536 /* clean all pages */
537 for (i = 0; i < req->r_num_pages; i++) {
538 page = req->r_pages[i];
539 BUG_ON(!page);
540 WARN_ON(!PageUptodate(page));
541
542 writeback_stat =
543 atomic_long_dec_return(&client->writeback_count);
544 if (writeback_stat <
545 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
546 clear_bdi_congested(&client->backing_dev_info,
547 BLK_RW_ASYNC);
548
549 if (i >= wrote) {
550 dout("inode %p skipping page %p\n", inode, page);
551 wbc->pages_skipped++;
552 }
553 ceph_put_snap_context((void *)page->private);
554 page->private = 0;
555 ClearPagePrivate(page);
556 dout("unlocking %d %p\n", i, page);
557 end_page_writeback(page);
558
559 /*
560 * We lost the cache cap, need to truncate the page before
561 * it is unlocked, otherwise we'd truncate it later in the
562 * page truncation thread, possibly losing some data that
563 * raced its way in
564 */
565 if ((issued & CEPH_CAP_FILE_CACHE) == 0)
566 generic_error_remove_page(inode->i_mapping, page);
567
568 unlock_page(page);
569 }
570 dout("%p wrote+cleaned %d pages\n", inode, wrote);
571 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
572
573 ceph_release_pages(req->r_pages, req->r_num_pages);
574 if (req->r_pages_from_pool)
575 mempool_free(req->r_pages,
576 ceph_client(inode->i_sb)->wb_pagevec_pool);
577 else
578 kfree(req->r_pages);
579 ceph_osdc_put_request(req);
580}
581
582/*
583 * allocate a page vec, either directly, or if necessary, via the
584 * mempool. we avoid the mempool if we can because req->r_num_pages
585 * may be less than the maximum write size.
586 */
587static void alloc_page_vec(struct ceph_client *client,
588 struct ceph_osd_request *req)
589{
590 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
591 GFP_NOFS);
592 if (!req->r_pages) {
593 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
594 req->r_pages_from_pool = 1;
595 WARN_ON(!req->r_pages);
596 }
597}
598
599/*
600 * initiate async writeback
601 */
602static int ceph_writepages_start(struct address_space *mapping,
603 struct writeback_control *wbc)
604{
605 struct inode *inode = mapping->host;
606 struct backing_dev_info *bdi = mapping->backing_dev_info;
607 struct ceph_inode_info *ci = ceph_inode(inode);
608 struct ceph_client *client;
609 pgoff_t index, start, end;
610 int range_whole = 0;
611 int should_loop = 1;
612 pgoff_t max_pages = 0, max_pages_ever = 0;
613 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
614 struct pagevec pvec;
615 int done = 0;
616 int rc = 0;
617 unsigned wsize = 1 << inode->i_blkbits;
618 struct ceph_osd_request *req = NULL;
619 int do_sync;
620 u64 snap_size = 0;
621
622 /*
623 * Include a 'sync' in the OSD request if this is a data
624 * integrity write (e.g., O_SYNC write or fsync()), or if our
625 * cap is being revoked.
626 */
627 do_sync = wbc->sync_mode == WB_SYNC_ALL;
628 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
629 do_sync = 1;
630 dout("writepages_start %p dosync=%d (mode=%s)\n",
631 inode, do_sync,
632 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
633 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
634
635 client = ceph_inode_to_client(inode);
636 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
637 pr_warning("writepage_start %p on forced umount\n", inode);
638 return -EIO; /* we're in a forced umount, don't write! */
639 }
640 if (client->mount_args->wsize && client->mount_args->wsize < wsize)
641 wsize = client->mount_args->wsize;
642 if (wsize < PAGE_CACHE_SIZE)
643 wsize = PAGE_CACHE_SIZE;
644 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
645
646 pagevec_init(&pvec, 0);
647
648 /* ?? */
649 if (wbc->nonblocking && bdi_write_congested(bdi)) {
650 dout(" writepages congested\n");
651 wbc->encountered_congestion = 1;
652 goto out_final;
653 }
654
655 /* where to start/end? */
656 if (wbc->range_cyclic) {
657 start = mapping->writeback_index; /* Start from prev offset */
658 end = -1;
659 dout(" cyclic, start at %lu\n", start);
660 } else {
661 start = wbc->range_start >> PAGE_CACHE_SHIFT;
662 end = wbc->range_end >> PAGE_CACHE_SHIFT;
663 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
664 range_whole = 1;
665 should_loop = 0;
666 dout(" not cyclic, %lu to %lu\n", start, end);
667 }
668 index = start;
669
670retry:
671 /* find oldest snap context with dirty data */
672 ceph_put_snap_context(snapc);
673 snapc = get_oldest_context(inode, &snap_size);
674 if (!snapc) {
675 /* hmm, why does writepages get called when there
676 is no dirty data? */
677 dout(" no snap context with dirty data?\n");
678 goto out;
679 }
680 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
681 snapc, snapc->seq, snapc->num_snaps);
682 if (last_snapc && snapc != last_snapc) {
683 /* if we switched to a newer snapc, restart our scan at the
684 * start of the original file range. */
685 dout(" snapc differs from last pass, restarting at %lu\n",
686 index);
687 index = start;
688 }
689 last_snapc = snapc;
690
691 while (!done && index <= end) {
692 unsigned i;
693 int first;
694 pgoff_t next;
695 int pvec_pages, locked_pages;
696 struct page *page;
697 int want;
698 u64 offset, len;
699 struct ceph_osd_request_head *reqhead;
700 struct ceph_osd_op *op;
701 long writeback_stat;
702
703 next = 0;
704 locked_pages = 0;
705 max_pages = max_pages_ever;
706
707get_more_pages:
708 first = -1;
709 want = min(end - index,
710 min((pgoff_t)PAGEVEC_SIZE,
711 max_pages - (pgoff_t)locked_pages) - 1)
712 + 1;
713 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
714 PAGECACHE_TAG_DIRTY,
715 want);
716 dout("pagevec_lookup_tag got %d\n", pvec_pages);
717 if (!pvec_pages && !locked_pages)
718 break;
719 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
720 page = pvec.pages[i];
721 dout("? %p idx %lu\n", page, page->index);
722 if (locked_pages == 0)
723 lock_page(page); /* first page */
724 else if (!trylock_page(page))
725 break;
726
727 /* only dirty pages, or our accounting breaks */
728 if (unlikely(!PageDirty(page)) ||
729 unlikely(page->mapping != mapping)) {
730 dout("!dirty or !mapping %p\n", page);
731 unlock_page(page);
732 break;
733 }
734 if (!wbc->range_cyclic && page->index > end) {
735 dout("end of range %p\n", page);
736 done = 1;
737 unlock_page(page);
738 break;
739 }
740 if (next && (page->index != next)) {
741 dout("not consecutive %p\n", page);
742 unlock_page(page);
743 break;
744 }
745 if (wbc->sync_mode != WB_SYNC_NONE) {
746 dout("waiting on writeback %p\n", page);
747 wait_on_page_writeback(page);
748 }
749 if ((snap_size && page_offset(page) > snap_size) ||
750 (!snap_size &&
751 page_offset(page) > i_size_read(inode))) {
752 dout("%p page eof %llu\n", page, snap_size ?
753 snap_size : i_size_read(inode));
754 done = 1;
755 unlock_page(page);
756 break;
757 }
758 if (PageWriteback(page)) {
759 dout("%p under writeback\n", page);
760 unlock_page(page);
761 break;
762 }
763
764 /* only if matching snap context */
765 pgsnapc = (void *)page->private;
766 if (pgsnapc->seq > snapc->seq) {
767 dout("page snapc %p %lld > oldest %p %lld\n",
768 pgsnapc, pgsnapc->seq, snapc, snapc->seq);
769 unlock_page(page);
770 if (!locked_pages)
771 continue; /* keep looking for snap */
772 break;
773 }
774
775 if (!clear_page_dirty_for_io(page)) {
776 dout("%p !clear_page_dirty_for_io\n", page);
777 unlock_page(page);
778 break;
779 }
780
781 /* ok */
782 if (locked_pages == 0) {
783 /* prepare async write request */
784 offset = page->index << PAGE_CACHE_SHIFT;
785 len = wsize;
786 req = ceph_osdc_new_request(&client->osdc,
787 &ci->i_layout,
788 ceph_vino(inode),
789 offset, &len,
790 CEPH_OSD_OP_WRITE,
791 CEPH_OSD_FLAG_WRITE |
792 CEPH_OSD_FLAG_ONDISK,
793 snapc, do_sync,
794 ci->i_truncate_seq,
795 ci->i_truncate_size,
796 &inode->i_mtime, true, 1);
797 max_pages = req->r_num_pages;
798
799 alloc_page_vec(client, req);
800 req->r_callback = writepages_finish;
801 req->r_inode = inode;
802 req->r_wbc = wbc;
803 }
804
805 /* note position of first page in pvec */
806 if (first < 0)
807 first = i;
808 dout("%p will write page %p idx %lu\n",
809 inode, page, page->index);
810
811 writeback_stat = atomic_long_inc_return(&client->writeback_count);
812 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
813 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
814 }
815
816 set_page_writeback(page);
817 req->r_pages[locked_pages] = page;
818 locked_pages++;
819 next = page->index + 1;
820 }
821
822 /* did we get anything? */
823 if (!locked_pages)
824 goto release_pvec_pages;
825 if (i) {
826 int j;
827 BUG_ON(!locked_pages || first < 0);
828
829 if (pvec_pages && i == pvec_pages &&
830 locked_pages < max_pages) {
831 dout("reached end pvec, trying for more\n");
832 pagevec_reinit(&pvec);
833 goto get_more_pages;
834 }
835
836 /* shift unused pages over in the pvec... we
837 * will need to release them below. */
838 for (j = i; j < pvec_pages; j++) {
839 dout(" pvec leftover page %p\n",
840 pvec.pages[j]);
841 pvec.pages[j-i+first] = pvec.pages[j];
842 }
843 pvec.nr -= i-first;
844 }
845
846 /* submit the write */
847 offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
848 len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
849 (u64)locked_pages << PAGE_CACHE_SHIFT);
850 dout("writepages got %d pages at %llu~%llu\n",
851 locked_pages, offset, len);
852
853 /* revise final length, page count */
854 req->r_num_pages = locked_pages;
855 reqhead = req->r_request->front.iov_base;
856 op = (void *)(reqhead + 1);
857 op->extent.length = cpu_to_le64(len);
858 op->payload_len = cpu_to_le32(len);
859 req->r_request->hdr.data_len = cpu_to_le32(len);
860
861 ceph_osdc_start_request(&client->osdc, req, true);
862 req = NULL;
863
864 /* continue? */
865 index = next;
866 wbc->nr_to_write -= locked_pages;
867 if (wbc->nr_to_write <= 0)
868 done = 1;
869
870release_pvec_pages:
871 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
872 pvec.nr ? pvec.pages[0] : NULL);
873 pagevec_release(&pvec);
874
875 if (locked_pages && !done)
876 goto retry;
877 }
878
879 if (should_loop && !done) {
880 /* more to do; loop back to beginning of file */
881 dout("writepages looping back to beginning of file\n");
882 should_loop = 0;
883 index = 0;
884 goto retry;
885 }
886
887 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
888 mapping->writeback_index = index;
889
890out:
891 if (req)
892 ceph_osdc_put_request(req);
893 if (rc > 0)
894 rc = 0; /* vfs expects us to return 0 */
895 ceph_put_snap_context(snapc);
896 dout("writepages done, rc = %d\n", rc);
897out_final:
898 return rc;
899}
900
901
902
903/*
904 * See if a given @snapc is either writeable, or already written.
905 */
906static int context_is_writeable_or_written(struct inode *inode,
907 struct ceph_snap_context *snapc)
908{
909 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
910 int ret = !oldest || snapc->seq <= oldest->seq;
911
912 ceph_put_snap_context(oldest);
913 return ret;
914}
915
916/*
917 * We are only allowed to write into/dirty the page if the page is
918 * clean, or already dirty within the same snap context.
919 *
920 * called with page locked.
921 * return success with page locked,
922 * or any failure (incl -EAGAIN) with page unlocked.
923 */
924static int ceph_update_writeable_page(struct file *file,
925 loff_t pos, unsigned len,
926 struct page *page)
927{
928 struct inode *inode = file->f_dentry->d_inode;
929 struct ceph_inode_info *ci = ceph_inode(inode);
930 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
931 loff_t page_off = pos & PAGE_CACHE_MASK;
932 int pos_in_page = pos & ~PAGE_CACHE_MASK;
933 int end_in_page = pos_in_page + len;
934 loff_t i_size;
935 int r;
936 struct ceph_snap_context *snapc, *oldest;
937
938retry_locked:
939 /* writepages currently holds page lock, but if we change that later, */
940 wait_on_page_writeback(page);
941
942 /* check snap context */
943 BUG_ON(!ci->i_snap_realm);
944 down_read(&mdsc->snap_rwsem);
945 BUG_ON(!ci->i_snap_realm->cached_context);
946 snapc = (void *)page->private;
947 if (snapc && snapc != ci->i_head_snapc) {
948 /*
949 * this page is already dirty in another (older) snap
950 * context! is it writeable now?
951 */
952 oldest = get_oldest_context(inode, NULL);
953 up_read(&mdsc->snap_rwsem);
954
955 if (snapc->seq > oldest->seq) {
956 ceph_put_snap_context(oldest);
957 dout(" page %p snapc %p not current or oldest\n",
958 page, snapc);
959 /*
960 * queue for writeback, and wait for snapc to
961 * be writeable or written
962 */
963 snapc = ceph_get_snap_context(snapc);
964 unlock_page(page);
965 ceph_queue_writeback(inode);
966 r = wait_event_interruptible(ci->i_cap_wq,
967 context_is_writeable_or_written(inode, snapc));
968 ceph_put_snap_context(snapc);
969 if (r == -ERESTARTSYS)
970 return r;
971 return -EAGAIN;
972 }
973 ceph_put_snap_context(oldest);
974
975 /* yay, writeable, do it now (without dropping page lock) */
976 dout(" page %p snapc %p not current, but oldest\n",
977 page, snapc);
978 if (!clear_page_dirty_for_io(page))
979 goto retry_locked;
980 r = writepage_nounlock(page, NULL);
981 if (r < 0)
982 goto fail_nosnap;
983 goto retry_locked;
984 }
985
986 if (PageUptodate(page)) {
987 dout(" page %p already uptodate\n", page);
988 return 0;
989 }
990
991 /* full page? */
992 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
993 return 0;
994
995 /* past end of file? */
996 i_size = inode->i_size; /* caller holds i_mutex */
997
998 if (i_size + len > inode->i_sb->s_maxbytes) {
999 /* file is too big */
1000 r = -EINVAL;
1001 goto fail;
1002 }
1003
1004 if (page_off >= i_size ||
1005 (pos_in_page == 0 && (pos+len) >= i_size &&
1006 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1007 dout(" zeroing %p 0 - %d and %d - %d\n",
1008 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1009 zero_user_segments(page,
1010 0, pos_in_page,
1011 end_in_page, PAGE_CACHE_SIZE);
1012 return 0;
1013 }
1014
1015 /* we need to read it. */
1016 up_read(&mdsc->snap_rwsem);
1017 r = readpage_nounlock(file, page);
1018 if (r < 0)
1019 goto fail_nosnap;
1020 goto retry_locked;
1021
1022fail:
1023 up_read(&mdsc->snap_rwsem);
1024fail_nosnap:
1025 unlock_page(page);
1026 return r;
1027}
1028
1029/*
1030 * We are only allowed to write into/dirty the page if the page is
1031 * clean, or already dirty within the same snap context.
1032 */
1033static int ceph_write_begin(struct file *file, struct address_space *mapping,
1034 loff_t pos, unsigned len, unsigned flags,
1035 struct page **pagep, void **fsdata)
1036{
1037 struct inode *inode = file->f_dentry->d_inode;
1038 struct page *page;
1039 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1040 int r;
1041
1042 do {
1043 /* get a page */
1044 page = grab_cache_page_write_begin(mapping, index, 0);
1045 if (!page)
1046 return -ENOMEM;
1047 *pagep = page;
1048
1049 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1050 inode, page, (int)pos, (int)len);
1051
1052 r = ceph_update_writeable_page(file, pos, len, page);
1053 } while (r == -EAGAIN);
1054
1055 return r;
1056}
1057
1058/*
1059 * we don't do anything in here that simple_write_end doesn't do
1060 * except adjust dirty page accounting and drop read lock on
1061 * mdsc->snap_rwsem.
1062 */
1063static int ceph_write_end(struct file *file, struct address_space *mapping,
1064 loff_t pos, unsigned len, unsigned copied,
1065 struct page *page, void *fsdata)
1066{
1067 struct inode *inode = file->f_dentry->d_inode;
1068 struct ceph_client *client = ceph_inode_to_client(inode);
1069 struct ceph_mds_client *mdsc = &client->mdsc;
1070 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1071 int check_cap = 0;
1072
1073 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1074 inode, page, (int)pos, (int)copied, (int)len);
1075
1076 /* zero the stale part of the page if we did a short copy */
1077 if (copied < len)
1078 zero_user_segment(page, from+copied, len);
1079
1080 /* did file size increase? */
1081 /* (no need for i_size_read(); the caller holds i_mutex) */
1082 if (pos+copied > inode->i_size)
1083 check_cap = ceph_inode_set_size(inode, pos+copied);
1084
1085 if (!PageUptodate(page))
1086 SetPageUptodate(page);
1087
1088 set_page_dirty(page);
1089
1090 unlock_page(page);
1091 up_read(&mdsc->snap_rwsem);
1092 page_cache_release(page);
1093
1094 if (check_cap)
1095 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1096
1097 return copied;
1098}
1099
1100/*
1101 * we set .direct_IO to indicate direct io is supported, but since we
1102 * intercept O_DIRECT reads and writes early, this function should
1103 * never get called.
1104 */
1105static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1106 const struct iovec *iov,
1107 loff_t pos, unsigned long nr_segs)
1108{
1109 WARN_ON(1);
1110 return -EINVAL;
1111}
1112
1113const struct address_space_operations ceph_aops = {
1114 .readpage = ceph_readpage,
1115 .readpages = ceph_readpages,
1116 .writepage = ceph_writepage,
1117 .writepages = ceph_writepages_start,
1118 .write_begin = ceph_write_begin,
1119 .write_end = ceph_write_end,
1120 .set_page_dirty = ceph_set_page_dirty,
1121 .invalidatepage = ceph_invalidatepage,
1122 .releasepage = ceph_releasepage,
1123 .direct_IO = ceph_direct_io,
1124};
1125
1126
1127/*
1128 * vm ops
1129 */
1130
1131/*
1132 * Reuse write_begin here for simplicity.
1133 */
1134static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1135{
1136 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1137 struct page *page = vmf->page;
1138 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1139 loff_t off = page->index << PAGE_CACHE_SHIFT;
1140 loff_t size, len;
1141 int ret;
1142
1143 size = i_size_read(inode);
1144 if (off + PAGE_CACHE_SIZE <= size)
1145 len = PAGE_CACHE_SIZE;
1146 else
1147 len = size & ~PAGE_CACHE_MASK;
1148
1149 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
1150 off, len, page, page->index);
1151
1152 lock_page(page);
1153
1154 ret = VM_FAULT_NOPAGE;
1155 if ((off > size) ||
1156 (page->mapping != inode->i_mapping))
1157 goto out;
1158
1159 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1160 if (ret == 0) {
1161 /* success. we'll keep the page locked. */
1162 set_page_dirty(page);
1163 up_read(&mdsc->snap_rwsem);
1164 ret = VM_FAULT_LOCKED;
1165 } else {
1166 if (ret == -ENOMEM)
1167 ret = VM_FAULT_OOM;
1168 else
1169 ret = VM_FAULT_SIGBUS;
1170 }
1171out:
1172 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
1173 if (ret != VM_FAULT_LOCKED)
1174 unlock_page(page);
1175 return ret;
1176}
1177
1178static struct vm_operations_struct ceph_vmops = {
1179 .fault = filemap_fault,
1180 .page_mkwrite = ceph_page_mkwrite,
1181};
1182
1183int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1184{
1185 struct address_space *mapping = file->f_mapping;
1186
1187 if (!mapping->a_ops->readpage)
1188 return -ENOEXEC;
1189 file_accessed(file);
1190 vma->vm_ops = &ceph_vmops;
1191 vma->vm_flags |= VM_CAN_NONLINEAR;
1192 return 0;
1193}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
1
2#include <linux/errno.h>
3
4/*
5 * base64 encode/decode.
6 */
7
8const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9
10static int encode_bits(int c)
11{
12 return pem_key[c];
13}
14
15static int decode_bits(char c)
16{
17 if (c >= 'A' && c <= 'Z')
18 return c - 'A';
19 if (c >= 'a' && c <= 'z')
20 return c - 'a' + 26;
21 if (c >= '0' && c <= '9')
22 return c - '0' + 52;
23 if (c == '+')
24 return 62;
25 if (c == '/')
26 return 63;
27 if (c == '=')
28 return 0; /* just non-negative, please */
29 return -EINVAL;
30}
31
32int ceph_armor(char *dst, const char *src, const char *end)
33{
34 int olen = 0;
35 int line = 0;
36
37 while (src < end) {
38 unsigned char a, b, c;
39
40 a = *src++;
41 *dst++ = encode_bits(a >> 2);
42 if (src < end) {
43 b = *src++;
44 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
45 if (src < end) {
46 c = *src++;
47 *dst++ = encode_bits(((b & 15) << 2) |
48 (c >> 6));
49 *dst++ = encode_bits(c & 63);
50 } else {
51 *dst++ = encode_bits((b & 15) << 2);
52 *dst++ = '=';
53 }
54 } else {
55 *dst++ = encode_bits(((a & 3) << 4));
56 *dst++ = '=';
57 *dst++ = '=';
58 }
59 olen += 4;
60 line += 4;
61 if (line == 64) {
62 line = 0;
63 *(dst++) = '\n';
64 olen++;
65 }
66 }
67 return olen;
68}
69
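/*
 * Worked example: armoring the 3-byte input "abc" (0x61 0x62 0x63)
 * produces the 4 characters "YWJj"; the 4-byte input "abcd" produces
 * "YWJjZA==", with '=' padding the final partial group.  A '\n' is
 * emitted after every 64 output characters, and ceph_unarmor() below
 * skips it when decoding.
 */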
70int ceph_unarmor(char *dst, const char *src, const char *end)
71{
72 int olen = 0;
73
74 while (src < end) {
75 int a, b, c, d;
76
77 if (src < end && src[0] == '\n')
78 src++;
79 if (src + 4 > end)
80 return -EINVAL;
81 a = decode_bits(src[0]);
82 b = decode_bits(src[1]);
83 c = decode_bits(src[2]);
84 d = decode_bits(src[3]);
85 if (a < 0 || b < 0 || c < 0 || d < 0)
86 return -EINVAL;
87
88 *dst++ = (a << 2) | (b >> 4);
89 if (src[2] == '=')
90 return olen + 1;
91 *dst++ = ((b & 15) << 4) | (c >> 2);
92 if (src[3] == '=')
93 return olen + 2;
94 *dst++ = ((c & 3) << 6) | d;
95 olen += 3;
96 src += 4;
97 }
98 return olen;
99}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..f6394b94b866
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,258 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h>
6
7#include "types.h"
8#include "auth_none.h"
9#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12
13#include "messenger.h"
14
15/*
16 * get protocol handler
17 */
18static u32 supported_protocols[] = {
19 CEPH_AUTH_NONE,
20 CEPH_AUTH_CEPHX
21};
22
23int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{
25 switch (protocol) {
26 case CEPH_AUTH_NONE:
27 return ceph_auth_none_init(ac);
28 case CEPH_AUTH_CEPHX:
29 return ceph_x_init(ac);
30 default:
31 return -ENOENT;
32 }
33}
34
35/*
36 * setup, teardown.
37 */
38struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
39{
40 struct ceph_auth_client *ac;
41 int ret;
42
43 dout("auth_init name '%s' secret '%s'\n", name, secret);
44
45 ret = -ENOMEM;
46 ac = kzalloc(sizeof(*ac), GFP_NOFS);
47 if (!ac)
48 goto out;
49
50 ac->negotiating = true;
51 if (name)
52 ac->name = name;
53 else
54 ac->name = CEPH_AUTH_NAME_DEFAULT;
55 dout("auth_init name %s secret %s\n", ac->name, secret);
56 ac->secret = secret;
57 return ac;
58
59out:
60 return ERR_PTR(ret);
61}
62
63void ceph_auth_destroy(struct ceph_auth_client *ac)
64{
65 dout("auth_destroy %p\n", ac);
66 if (ac->ops)
67 ac->ops->destroy(ac);
68 kfree(ac);
69}
70
71/*
72 * Reset occurs when reconnecting to the monitor.
73 */
74void ceph_auth_reset(struct ceph_auth_client *ac)
75{
76 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac);
79 ac->negotiating = true;
80}
81
82int ceph_entity_name_encode(const char *name, void **p, void *end)
83{
84 int len = strlen(name);
85
86 if (*p + 2*sizeof(u32) + len > end)
87 return -ERANGE;
88 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
89 ceph_encode_32(p, len);
90 ceph_encode_copy(p, name, len);
91 return 0;
92}
93
94/*
95 * Initiate protocol negotiation with monitor. Include entity name
96 * and list supported protocols.
97 */
98int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
99{
100 struct ceph_mon_request_header *monhdr = buf;
101 void *p = monhdr + 1, *end = buf + len, *lenp;
102 int i, num;
103 int ret;
104
105 dout("auth_build_hello\n");
106 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1);
108 monhdr->session_mon_tid = 0;
109
110 ceph_encode_32(&p, 0); /* no protocol, yet */
111
112 lenp = p;
113 p += sizeof(u32);
114
115 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
116 ceph_encode_8(&p, 1);
117 num = ARRAY_SIZE(supported_protocols);
118 ceph_encode_32(&p, num);
119 ceph_decode_need(&p, end, num * sizeof(u32), bad);
120 for (i = 0; i < num; i++)
121 ceph_encode_32(&p, supported_protocols[i]);
122
123 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0)
125 return ret;
126 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id);
128
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf;
131
132bad:
133 return -ERANGE;
134}
135
136int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len)
138{
139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1;
141 void *end = msg_buf + msg_len;
142 int ret;
143
144 monhdr->have_version = 0;
145 monhdr->session_mon = cpu_to_le16(-1);
146 monhdr->session_mon_tid = 0;
147
148 ceph_encode_32(&p, ac->protocol);
149
150 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
151 if (ret < 0) {
152 pr_err("error %d building request\n", ret);
153 return ret;
154 }
155 dout(" built request %d bytes\n", ret);
156 ceph_encode_32(&p, ret);
157 return p + ret - msg_buf;
158}
159
160/*
161 * Handle auth message from monitor.
162 */
163int ceph_handle_auth_reply(struct ceph_auth_client *ac,
164 void *buf, size_t len,
165 void *reply_buf, size_t reply_len)
166{
167 void *p = buf;
168 void *end = buf + len;
169 int protocol;
170 s32 result;
171 u64 global_id;
172 void *payload, *payload_end;
173 int payload_len;
174 char *result_msg;
175 int result_msg_len;
176 int ret = -EINVAL;
177
178 dout("handle_auth_reply %p %p\n", p, end);
179 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
180 protocol = ceph_decode_32(&p);
181 result = ceph_decode_32(&p);
182 global_id = ceph_decode_64(&p);
183 payload_len = ceph_decode_32(&p);
184 payload = p;
185 p += payload_len;
186 ceph_decode_need(&p, end, sizeof(u32), bad);
187 result_msg_len = ceph_decode_32(&p);
188 result_msg = p;
189 p += result_msg_len;
190 if (p != end)
191 goto bad;
192
193 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
194 result_msg, global_id, payload_len);
195
196 payload_end = payload + payload_len;
197
198 if (global_id && ac->global_id != global_id) {
199 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
200 ac->global_id = global_id;
201 }
202
203 if (ac->negotiating) {
204 /* server does not support our protocols? */
205 if (!protocol && result < 0) {
206 ret = result;
207 goto out;
208 }
209 /* set up (new) protocol handler? */
210 if (ac->protocol && ac->protocol != protocol) {
211 ac->ops->destroy(ac);
212 ac->protocol = 0;
213 ac->ops = NULL;
214 }
215 if (ac->protocol != protocol) {
216 ret = ceph_auth_init_protocol(ac, protocol);
217 if (ret) {
218 pr_err("error %d on auth protocol %d init\n",
219 ret, protocol);
220 goto out;
221 }
222 }
223
224 ac->negotiating = false;
225 }
226
227 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
228 if (ret == -EAGAIN) {
229 return ceph_build_auth_request(ac, reply_buf, reply_len);
230 } else if (ret) {
231 pr_err("authentication error %d\n", ret);
232 return ret;
233 }
234 return 0;
235
236bad:
237 pr_err("failed to decode auth msg\n");
238out:
239 return ret;
240}
241
242int ceph_build_auth(struct ceph_auth_client *ac,
243 void *msg_buf, size_t msg_len)
244{
245 if (!ac->protocol)
246 return ceph_auth_build_hello(ac, msg_buf, msg_len);
247 BUG_ON(!ac->ops);
248 if (!ac->ops->is_authenticated(ac))
249 return ceph_build_auth_request(ac, msg_buf, msg_len);
250 return 0;
251}
252
253int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
254{
255 if (!ac->ops)
256 return 0;
257 return ac->ops->is_authenticated(ac);
258}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..ca4f57cfb267
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 /*
19 * true if we are authenticated and can connect to
20 * services.
21 */
22 int (*is_authenticated)(struct ceph_auth_client *ac);
23
24 /*
25 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request.
28 */
29 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
30 int (*handle_reply)(struct ceph_auth_client *ac, int result,
31 void *buf, void *end);
32
33 /*
34 * Create authorizer for connecting to a service, and verify
35 * the response to authenticate the service.
36 */
37 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
38 struct ceph_authorizer **a,
39 void **buf, size_t *len,
40 void **reply_buf, size_t *reply_len);
41 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
42 struct ceph_authorizer *a, size_t len);
43 void (*destroy_authorizer)(struct ceph_auth_client *ac,
44 struct ceph_authorizer *a);
45 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
46 int peer_type);
47
48 /* reset when we (re)connect to a monitor */
49 void (*reset)(struct ceph_auth_client *ac);
50
51 void (*destroy)(struct ceph_auth_client *ac);
52};
53
54struct ceph_auth_client {
55 u32 protocol; /* CEPH_AUTH_* */
56 void *private; /* for use by protocol implementation */
57 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
58
59 bool negotiating; /* true if negotiating protocol */
60 const char *name; /* entity name */
61 u64 global_id; /* our unique id in system */
62 const char *secret; /* our secret key */
63 unsigned want_keys; /* which services we want */
64};
65
66extern struct ceph_auth_client *ceph_auth_init(const char *name,
67 const char *secret);
68extern void ceph_auth_destroy(struct ceph_auth_client *ac);
69
70extern void ceph_auth_reset(struct ceph_auth_client *ac);
71
72extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
73 void *buf, size_t len);
74extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
75 void *buf, size_t len,
76 void *reply_buf, size_t reply_len);
77extern int ceph_entity_name_encode(const char *name, void **p, void *end);
78
79extern int ceph_build_auth(struct ceph_auth_client *ac,
80 void *msg_buf, size_t msg_len);
81
82extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
83
84#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..8cd9e3af07f7
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,122 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12
13static void reset(struct ceph_auth_client *ac)
14{
15 struct ceph_auth_none_info *xi = ac->private;
16
17 xi->starting = true;
18 xi->built_authorizer = false;
19}
20
21static void destroy(struct ceph_auth_client *ac)
22{
23 kfree(ac->private);
24 ac->private = NULL;
25}
26
27static int is_authenticated(struct ceph_auth_client *ac)
28{
29 struct ceph_auth_none_info *xi = ac->private;
30
31 return !xi->starting;
32}
33
34/*
35 * the generic auth code decodes the global_id, and we carry no actual
36 * authentication state, so nothing happens here.
37 */
38static int handle_reply(struct ceph_auth_client *ac, int result,
39 void *buf, void *end)
40{
41 struct ceph_auth_none_info *xi = ac->private;
42
43 xi->starting = false;
44 return result;
45}
46
47/*
48 * build an 'authorizer' with our entity_name and global_id. we can
49 * reuse a single static copy since it is identical for all services
50 * we connect to.
51 */
52static int ceph_auth_none_create_authorizer(
53 struct ceph_auth_client *ac, int peer_type,
54 struct ceph_authorizer **a,
55 void **buf, size_t *len,
56 void **reply_buf, size_t *reply_len)
57{
58 struct ceph_auth_none_info *ai = ac->private;
59 struct ceph_none_authorizer *au = &ai->au;
60 void *p, *end;
61 int ret;
62
63 if (!ai->built_authorizer) {
64 p = au->buf;
65 end = p + sizeof(au->buf);
66 ceph_encode_8(&p, 1);
67 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
68 if (ret < 0)
69 goto bad;
70 ceph_decode_need(&p, end, sizeof(u64), bad2);
71 ceph_encode_64(&p, ac->global_id);
72 au->buf_len = p - (void *)au->buf;
73 ai->built_authorizer = true;
74 dout("built authorizer len %d\n", au->buf_len);
75 }
76
77 *a = (struct ceph_authorizer *)au;
78 *buf = au->buf;
79 *len = au->buf_len;
80 *reply_buf = au->reply_buf;
81 *reply_len = sizeof(au->reply_buf);
82 return 0;
83
84bad2:
85 ret = -ERANGE;
86bad:
87 return ret;
88}
89
90static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
91 struct ceph_authorizer *a)
92{
93 /* nothing to do */
94}
95
96static const struct ceph_auth_client_ops ceph_auth_none_ops = {
97 .reset = reset,
98 .destroy = destroy,
99 .is_authenticated = is_authenticated,
100 .handle_reply = handle_reply,
101 .create_authorizer = ceph_auth_none_create_authorizer,
102 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
103};
104
105int ceph_auth_none_init(struct ceph_auth_client *ac)
106{
107 struct ceph_auth_none_info *xi;
108
109 dout("ceph_auth_none_init %p\n", ac);
110 xi = kzalloc(sizeof(*xi), GFP_NOFS);
111 if (!xi)
112 return -ENOMEM;
113
114 xi->starting = true;
115 xi->built_authorizer = false;
116
117 ac->protocol = CEPH_AUTH_NONE;
118 ac->private = xi;
119 ac->ops = &ceph_auth_none_ops;
120 return 0;
121}
122
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..56c05533a31c
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include "auth.h"
5
6/*
7 * null security mode.
8 *
9 * we use a single static authorizer that simply encodes our entity name
10 * and global id.
11 */
12
13struct ceph_none_authorizer {
14 char buf[128];
15 int buf_len;
16 char reply_buf[0];
17};
18
19struct ceph_auth_none_info {
20 bool starting;
21 bool built_authorizer;
22 struct ceph_none_authorizer au; /* we only need one; it's static */
23};
24
25extern int ceph_auth_none_init(struct ceph_auth_client *ac);
26
27#endif
28
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..d9001a4dc8cc
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,680 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7#include <linux/slab.h>
8
9#include "auth_x.h"
10#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14
15struct kmem_cache *ceph_x_ticketbuf_cachep;
16
17#define TEMP_TICKET_BUF_LEN 256
18
19static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
20
21static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
22{
23 struct ceph_x_info *xi = ac->private;
24 int need;
25
26 ceph_x_validate_tickets(ac, &need);
27 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
28 ac->want_keys, need, xi->have_keys);
29 return (ac->want_keys & xi->have_keys) == ac->want_keys;
30}
31
32static int ceph_x_encrypt_buflen(int ilen)
33{
34 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
35 sizeof(u32);
36}
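/*
 * The sizing above reserves sizeof(u32) for the length word that
 * ceph_x_encrypt() prepends to the ciphertext, plus 16 bytes of
 * headroom, presumably to cover block-cipher padding of the encrypt
 * header and payload (AES has a 16-byte block size).
 */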
37
38static int ceph_x_encrypt(struct ceph_crypto_key *secret,
39 void *ibuf, int ilen, void *obuf, size_t olen)
40{
41 struct ceph_x_encrypt_header head = {
42 .struct_v = 1,
43 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
44 };
45 size_t len = olen - sizeof(u32);
46 int ret;
47
48 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
49 &head, sizeof(head), ibuf, ilen);
50 if (ret)
51 return ret;
52 ceph_encode_32(&obuf, len);
53 return len + sizeof(u32);
54}
55
56static int ceph_x_decrypt(struct ceph_crypto_key *secret,
57 void **p, void *end, void *obuf, size_t olen)
58{
59 struct ceph_x_encrypt_header head;
60 size_t head_len = sizeof(head);
61 int len, ret;
62
63 len = ceph_decode_32(p);
64 if (*p + len > end)
65 return -EINVAL;
66
67 dout("ceph_x_decrypt len %d\n", len);
68 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
69 *p, len);
70 if (ret)
71 return ret;
72 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
73 return -EPERM;
74 *p += len;
75 return olen;
76}
77
78/*
79 * get existing (or insert new) ticket handler
80 */
81struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
82 int service)
83{
84 struct ceph_x_ticket_handler *th;
85 struct ceph_x_info *xi = ac->private;
86 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
87
88 while (*p) {
89 parent = *p;
90 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
91 if (service < th->service)
92 p = &(*p)->rb_left;
93 else if (service > th->service)
94 p = &(*p)->rb_right;
95 else
96 return th;
97 }
98
99 /* add it */
100 th = kzalloc(sizeof(*th), GFP_NOFS);
101 if (!th)
102 return ERR_PTR(-ENOMEM);
103 th->service = service;
104 rb_link_node(&th->node, parent, p);
105 rb_insert_color(&th->node, &xi->ticket_handlers);
106 return th;
107}
108
109static void remove_ticket_handler(struct ceph_auth_client *ac,
110 struct ceph_x_ticket_handler *th)
111{
112 struct ceph_x_info *xi = ac->private;
113
114 dout("remove_ticket_handler %p %d\n", th, th->service);
115 rb_erase(&th->node, &xi->ticket_handlers);
116 ceph_crypto_key_destroy(&th->session_key);
117 if (th->ticket_blob)
118 ceph_buffer_put(th->ticket_blob);
119 kfree(th);
120}
121
122static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
123 struct ceph_crypto_key *secret,
124 void *buf, void *end)
125{
126 struct ceph_x_info *xi = ac->private;
127 int num;
128 void *p = buf;
129 int ret;
130 char *dbuf;
131 char *ticket_buf;
132 u8 struct_v;
133
134 dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
135 if (!dbuf)
136 return -ENOMEM;
137
138 ret = -ENOMEM;
139 ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
140 GFP_NOFS | GFP_ATOMIC);
141 if (!ticket_buf)
142 goto out_dbuf;
143
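	/*
	 * Reply layout, as decoded below: u8 struct_v, u32 ticket count;
	 * then for each ticket: u32 service type, u8 struct_v, a blob
	 * encrypted with @secret (new session key + validity), a u8
	 * "encrypted" flag, and the ticket blob to present to the service
	 * (possibly encrypted with the previous session key).
	 */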
144 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
145 struct_v = ceph_decode_8(&p);
146 if (struct_v != 1)
147 goto bad;
148 num = ceph_decode_32(&p);
149 dout("%d tickets\n", num);
150 while (num--) {
151 int type;
152 u8 struct_v;
153 struct ceph_x_ticket_handler *th;
154 void *dp, *dend;
155 int dlen;
156 char is_enc;
157 struct timespec validity;
158 struct ceph_crypto_key old_key;
159 void *tp, *tpend;
160 struct ceph_timespec new_validity;
161 struct ceph_crypto_key new_session_key;
162 struct ceph_buffer *new_ticket_blob;
163 unsigned long new_expires, new_renew_after;
164 u64 new_secret_id;
165
166 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
167
168 type = ceph_decode_32(&p);
169 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
170
171 struct_v = ceph_decode_8(&p);
172 if (struct_v != 1)
173 goto bad;
174
175 th = get_ticket_handler(ac, type);
176 if (IS_ERR(th)) {
177 ret = PTR_ERR(th);
178 goto out;
179 }
180
181 /* blob for me */
182 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
183 TEMP_TICKET_BUF_LEN);
184 if (dlen <= 0) {
185 ret = dlen;
186 goto out;
187 }
188 dout(" decrypted %d bytes\n", dlen);
189 dend = dbuf + dlen;
190 dp = dbuf;
191
192 struct_v = ceph_decode_8(&dp);
193 if (struct_v != 1)
194 goto bad;
195
196 memcpy(&old_key, &th->session_key, sizeof(old_key));
197 ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
198 if (ret)
199 goto out;
200
201 ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
202 ceph_decode_timespec(&validity, &new_validity);
203 new_expires = get_seconds() + validity.tv_sec;
204 new_renew_after = new_expires - (validity.tv_sec / 4);
205 dout(" expires=%lu renew_after=%lu\n", new_expires,
206 new_renew_after);
207
208 /* ticket blob for service */
209 ceph_decode_8_safe(&p, end, is_enc, bad);
210 tp = ticket_buf;
211 if (is_enc) {
212 /* encrypted */
213 dout(" encrypted ticket\n");
214 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
215 TEMP_TICKET_BUF_LEN);
216 if (dlen < 0) {
217 ret = dlen;
218 goto out;
219 }
220 dlen = ceph_decode_32(&tp);
221 } else {
222 /* unencrypted */
223 ceph_decode_32_safe(&p, end, dlen, bad);
224 ceph_decode_need(&p, end, dlen, bad);
225 ceph_decode_copy(&p, ticket_buf, dlen);
226 }
227 tpend = tp + dlen;
228 dout(" ticket blob is %d bytes\n", dlen);
229 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
230 struct_v = ceph_decode_8(&tp);
231 new_secret_id = ceph_decode_64(&tp);
232 ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
233 if (ret)
234 goto out;
235
236 /* all is well, update our ticket */
237 ceph_crypto_key_destroy(&th->session_key);
238 if (th->ticket_blob)
239 ceph_buffer_put(th->ticket_blob);
240 th->session_key = new_session_key;
241 th->ticket_blob = new_ticket_blob;
242 th->validity = new_validity;
243 th->secret_id = new_secret_id;
244 th->expires = new_expires;
245 th->renew_after = new_renew_after;
246 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
247 type, ceph_entity_type_name(type), th->secret_id,
248 (int)th->ticket_blob->vec.iov_len);
249 xi->have_keys |= th->service;
250 }
251
252 ret = 0;
253out:
254 kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
255out_dbuf:
256 kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
257 return ret;
258
259bad:
260 ret = -EINVAL;
261 goto out;
262}
263
264static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
265 struct ceph_x_ticket_handler *th,
266 struct ceph_x_authorizer *au)
267{
268 int maxlen;
269 struct ceph_x_authorize_a *msg_a;
270 struct ceph_x_authorize_b msg_b;
271 void *p, *end;
272 int ret;
273 int ticket_blob_len =
274 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
275
276 dout("build_authorizer for %s %p\n",
277 ceph_entity_type_name(th->service), au);
278
279 maxlen = sizeof(*msg_a) + sizeof(msg_b) +
280 ceph_x_encrypt_buflen(ticket_blob_len);
281 dout(" need len %d\n", maxlen);
282 if (au->buf && au->buf->alloc_len < maxlen) {
283 ceph_buffer_put(au->buf);
284 au->buf = NULL;
285 }
286 if (!au->buf) {
287 au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
288 if (!au->buf)
289 return -ENOMEM;
290 }
291 au->service = th->service;
292
293 msg_a = au->buf->vec.iov_base;
294 msg_a->struct_v = 1;
295 msg_a->global_id = cpu_to_le64(ac->global_id);
296 msg_a->service_id = cpu_to_le32(th->service);
297 msg_a->ticket_blob.struct_v = 1;
298 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
299 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
300 if (ticket_blob_len) {
301 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
302 th->ticket_blob->vec.iov_len);
303 }
304 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
305 le64_to_cpu(msg_a->ticket_blob.secret_id));
306
307 p = msg_a + 1;
308 p += ticket_blob_len;
309 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
310
311 get_random_bytes(&au->nonce, sizeof(au->nonce));
312 msg_b.struct_v = 1;
313 msg_b.nonce = cpu_to_le64(au->nonce);
314 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
315 p, end - p);
316 if (ret < 0)
317 goto out_buf;
318 p += ret;
319 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
320 dout(" built authorizer nonce %llx len %d\n", au->nonce,
321 (int)au->buf->vec.iov_len);
322 BUG_ON(au->buf->vec.iov_len > maxlen);
323 return 0;
324
325out_buf:
326 ceph_buffer_put(au->buf);
327 au->buf = NULL;
328 return ret;
329}
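/*
 * Layout sketch (illustration only; the helper name is hypothetical): a
 * finished authorizer is struct ceph_x_authorize_a with the ticket blob
 * appended, followed by struct ceph_x_authorize_b encrypted with the
 * session key.
 */
static inline int example_x_authorizer_len(int ticket_blob_len)
{
	return sizeof(struct ceph_x_authorize_a) + ticket_blob_len +
		ceph_x_encrypt_buflen(sizeof(struct ceph_x_authorize_b));
}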
330
331static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
332 void **p, void *end)
333{
334 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
335 ceph_encode_8(p, 1);
336 ceph_encode_64(p, th->secret_id);
337 if (th->ticket_blob) {
338 const char *buf = th->ticket_blob->vec.iov_base;
339 u32 len = th->ticket_blob->vec.iov_len;
340
341 ceph_encode_32_safe(p, end, len, bad);
342 ceph_encode_copy_safe(p, end, buf, len, bad);
343 } else {
344 ceph_encode_32_safe(p, end, 0, bad);
345 }
346
347 return 0;
348bad:
349 return -ERANGE;
350}
351
352static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
353{
354 int want = ac->want_keys;
355 struct ceph_x_info *xi = ac->private;
356 int service;
357
358 *pneed = ac->want_keys & ~(xi->have_keys);
359
360 for (service = 1; service <= want; service <<= 1) {
361 struct ceph_x_ticket_handler *th;
362
363 if (!(ac->want_keys & service))
364 continue;
365
366 if (*pneed & service)
367 continue;
368
369 th = get_ticket_handler(ac, service);
370
371 if (!th) {
372 *pneed |= service;
373 continue;
374 }
375
376 if (get_seconds() >= th->renew_after)
377 *pneed |= service;
378 if (get_seconds() >= th->expires)
379 xi->have_keys &= ~service;
380 }
381}
382
383
384static int ceph_x_build_request(struct ceph_auth_client *ac,
385 void *buf, void *end)
386{
387 struct ceph_x_info *xi = ac->private;
388 int need;
389 struct ceph_x_request_header *head = buf;
390 int ret;
391 struct ceph_x_ticket_handler *th =
392 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
393
394 ceph_x_validate_tickets(ac, &need);
395
396 dout("build_request want %x have %x need %x\n",
397 ac->want_keys, xi->have_keys, need);
398
399 if (need & CEPH_ENTITY_TYPE_AUTH) {
400 struct ceph_x_authenticate *auth = (void *)(head + 1);
401 void *p = auth + 1;
402 struct ceph_x_challenge_blob tmp;
403 char tmp_enc[40];
404 u64 *u;
405
406 if (p > end)
407 return -ERANGE;
408
409 dout(" get_auth_session_key\n");
410 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
411
412 /* encrypt and hash */
413 get_random_bytes(&auth->client_challenge, sizeof(u64));
414 tmp.client_challenge = auth->client_challenge;
415 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
416 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
417 tmp_enc, sizeof(tmp_enc));
418 if (ret < 0)
419 return ret;
420
421 auth->struct_v = 1;
422 auth->key = 0;
423 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
424 auth->key ^= *u;
425 dout(" server_challenge %llx client_challenge %llx key %llx\n",
426 xi->server_challenge, le64_to_cpu(auth->client_challenge),
427 le64_to_cpu(auth->key));
428
429 /* now encode the old ticket, if it exists */
430 ret = ceph_x_encode_ticket(th, &p, end);
431 if (ret < 0)
432 return ret;
433
434 return p - buf;
435 }
436
437 if (need) {
438 void *p = head + 1;
439 struct ceph_x_service_ticket_request *req;
440
441 if (p > end)
442 return -ERANGE;
443 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
444
445 BUG_ON(!th);
446 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
447 if (ret)
448 return ret;
449 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
450 xi->auth_authorizer.buf->vec.iov_len);
451
452 req = p;
453 req->keys = cpu_to_le32(need);
454 p += sizeof(*req);
455 return p - buf;
456 }
457
458 return 0;
459}
460
461static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
462 void *buf, void *end)
463{
464 struct ceph_x_info *xi = ac->private;
465 struct ceph_x_reply_header *head = buf;
466 struct ceph_x_ticket_handler *th;
467 int len = end - buf;
468 int op;
469 int ret;
470
471 if (result)
472 return result; /* XXX hmm? */
473
474 if (xi->starting) {
475 /* it's a hello */
476 struct ceph_x_server_challenge *sc = buf;
477
478 if (len != sizeof(*sc))
479 return -EINVAL;
480 xi->server_challenge = le64_to_cpu(sc->server_challenge);
481 dout("handle_reply got server challenge %llx\n",
482 xi->server_challenge);
483 xi->starting = false;
484 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
485 return -EAGAIN;
486 }
487
488 op = le32_to_cpu(head->op);
489 result = le32_to_cpu(head->result);
490 dout("handle_reply op %d result %d\n", op, result);
491 switch (op) {
492 case CEPHX_GET_AUTH_SESSION_KEY:
493 /* verify auth key */
494 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
495 buf + sizeof(*head), end);
496 break;
497
498 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
499 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
500 BUG_ON(!th);
501 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
502 buf + sizeof(*head), end);
503 break;
504
505 default:
506 return -EINVAL;
507 }
508 if (ret)
509 return ret;
510 if (ac->want_keys == xi->have_keys)
511 return 0;
512 return -EAGAIN;
513}
514
515static int ceph_x_create_authorizer(
516 struct ceph_auth_client *ac, int peer_type,
517 struct ceph_authorizer **a,
518 void **buf, size_t *len,
519 void **reply_buf, size_t *reply_len)
520{
521 struct ceph_x_authorizer *au;
522 struct ceph_x_ticket_handler *th;
523 int ret;
524
525 th = get_ticket_handler(ac, peer_type);
526 if (IS_ERR(th))
527 return PTR_ERR(th);
528
529 au = kzalloc(sizeof(*au), GFP_NOFS);
530 if (!au)
531 return -ENOMEM;
532
533 ret = ceph_x_build_authorizer(ac, th, au);
534 if (ret) {
535 kfree(au);
536 return ret;
537 }
538
539 *a = (struct ceph_authorizer *)au;
540 *buf = au->buf->vec.iov_base;
541 *len = au->buf->vec.iov_len;
542 *reply_buf = au->reply_buf;
543 *reply_len = sizeof(au->reply_buf);
544 return 0;
545}
546
547static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
548 struct ceph_authorizer *a, size_t len)
549{
550 struct ceph_x_authorizer *au = (void *)a;
551 struct ceph_x_ticket_handler *th;
552 int ret = 0;
553 struct ceph_x_authorize_reply reply;
554 void *p = au->reply_buf;
555 void *end = p + sizeof(au->reply_buf);
556
557 th = get_ticket_handler(ac, au->service);
558 if (!th)
559 return -EIO; /* hrm! */
560 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
561 if (ret < 0)
562 return ret;
563 if (ret != sizeof(reply))
564 return -EPERM;
565
566 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
567 ret = -EPERM;
568 else
569 ret = 0;
570 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
571 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
572 return ret;
573}
574
575static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
576 struct ceph_authorizer *a)
577{
578 struct ceph_x_authorizer *au = (void *)a;
579
580 ceph_buffer_put(au->buf);
581 kfree(au);
582}
583
584
585static void ceph_x_reset(struct ceph_auth_client *ac)
586{
587 struct ceph_x_info *xi = ac->private;
588
589 dout("reset\n");
590 xi->starting = true;
591 xi->server_challenge = 0;
592}
593
594static void ceph_x_destroy(struct ceph_auth_client *ac)
595{
596 struct ceph_x_info *xi = ac->private;
597 struct rb_node *p;
598
599 dout("ceph_x_destroy %p\n", ac);
600 ceph_crypto_key_destroy(&xi->secret);
601
602 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
603 struct ceph_x_ticket_handler *th =
604 rb_entry(p, struct ceph_x_ticket_handler, node);
605 remove_ticket_handler(ac, th);
606 }
607
608 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
609
610 kfree(ac->private);
611 ac->private = NULL;
612}
613
614static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
615 int peer_type)
616{
617 struct ceph_x_ticket_handler *th;
618
619 th = get_ticket_handler(ac, peer_type);
620 if (th && !IS_ERR(th))
621 remove_ticket_handler(ac, th);
622}
623
624
625static const struct ceph_auth_client_ops ceph_x_ops = {
626 .is_authenticated = ceph_x_is_authenticated,
627 .build_request = ceph_x_build_request,
628 .handle_reply = ceph_x_handle_reply,
629 .create_authorizer = ceph_x_create_authorizer,
630 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
631 .destroy_authorizer = ceph_x_destroy_authorizer,
632 .invalidate_authorizer = ceph_x_invalidate_authorizer,
633 .reset = ceph_x_reset,
634 .destroy = ceph_x_destroy,
635};
636
637
638int ceph_x_init(struct ceph_auth_client *ac)
639{
640 struct ceph_x_info *xi;
641 int ret;
642
643 dout("ceph_x_init %p\n", ac);
644 xi = kzalloc(sizeof(*xi), GFP_NOFS);
645 if (!xi)
646 return -ENOMEM;
647
648 ret = -ENOMEM;
649 ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
650 TEMP_TICKET_BUF_LEN, 8,
651 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
652 NULL);
653 if (!ceph_x_ticketbuf_cachep)
654 goto done_nomem;
655 ret = -EINVAL;
656 if (!ac->secret) {
657 pr_err("no secret set (for auth_x protocol)\n");
658 goto done_nomem;
659 }
660
661 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
662 if (ret)
663 goto done_nomem;
664
665 xi->starting = true;
666 xi->ticket_handlers = RB_ROOT;
667
668 ac->protocol = CEPH_AUTH_CEPHX;
669 ac->private = xi;
670 ac->ops = &ceph_x_ops;
671 return 0;
672
673done_nomem:
674 kfree(xi);
675 if (ceph_x_ticketbuf_cachep)
676 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
677 return ret;
678}
679
680
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..c67535d70aa6
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,81 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5
6#include "buffer.h"
7#include "decode.h"
8
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{
11 struct ceph_buffer *b;
12
13 b = kmalloc(sizeof(*b), gfp);
14 if (!b)
15 return NULL;
16
17 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
18 if (b->vec.iov_base) {
19 b->is_vmalloc = false;
20 } else {
21 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
22 if (!b->vec.iov_base) {
23 kfree(b);
24 return NULL;
25 }
26 b->is_vmalloc = true;
27 }
28
29 kref_init(&b->kref);
30 b->alloc_len = len;
31 b->vec.iov_len = len;
32 dout("buffer_new %p\n", b);
33 return b;
34}
35
36void ceph_buffer_release(struct kref *kref)
37{
38 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
39
40 dout("buffer_release %p\n", b);
41 if (b->vec.iov_base) {
42 if (b->is_vmalloc)
43 vfree(b->vec.iov_base);
44 else
45 kfree(b->vec.iov_base);
46 }
47 kfree(b);
48}
49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{
68 size_t len;
69
70 ceph_decode_need(p, end, sizeof(u32), bad);
71 len = ceph_decode_32(p);
72 dout("decode_buffer len %d\n", (int)len);
73 ceph_decode_need(p, end, len, bad);
74 *b = ceph_buffer_new(len, GFP_NOFS);
75 if (!*b)
76 return -ENOMEM;
77 ceph_decode_copy(p, (*b)->vec.iov_base, len);
78 return 0;
79bad:
80 return -EINVAL;
81}
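/*
 * Usage sketch (illustration only, not part of the original file):
 * allocate a buffer, take and drop an extra reference; the final put
 * frees it via ceph_buffer_release().
 */
static void example_buffer_usage(void)
{
	struct ceph_buffer *b = ceph_buffer_new(PAGE_SIZE, GFP_NOFS);

	if (!b)
		return;
	ceph_buffer_get(b);	/* refcount now 2 */
	ceph_buffer_put(b);	/* back to 1 */
	ceph_buffer_put(b);	/* last reference; buffer is freed */
}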
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * kmalloc is tried first; if that fails (e.g. for larger sizes), we
14 * fall back to vmalloc.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..aa2239fa9a3b
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2955 @@
1#include "ceph_debug.h"
2
3#include <linux/fs.h>
4#include <linux/kernel.h>
5#include <linux/sched.h>
6#include <linux/slab.h>
7#include <linux/vmalloc.h>
8#include <linux/wait.h>
9#include <linux/writeback.h>
10
11#include "super.h"
12#include "decode.h"
13#include "messenger.h"
14
15/*
16 * Capability management
17 *
18 * The Ceph metadata servers control client access to inode metadata
19 * and file data by issuing capabilities, granting clients permission
20 * to read and/or write both inode fields and file data to OSDs
21 * (storage nodes). Each capability consists of a set of bits
22 * indicating which operations are allowed.
23 *
24 * If the client holds a *_SHARED cap, the client has a coherent value
25 * that can be safely read from the cached inode.
26 *
27 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
28 * client is allowed to change inode attributes (e.g., file size,
29 * mtime), note its dirty state in the ceph_cap, and asynchronously
30 * flush that metadata change to the MDS.
31 *
32 * In the event of a conflicting operation (perhaps by another
33 * client), the MDS will revoke the conflicting client capabilities.
34 *
35 * In order for a client to cache an inode, it must hold a capability
36 * with at least one MDS server. When inodes are released, release
37 * notifications are batched and periodically sent en masse to the MDS
38 * cluster to release server state.
39 */
40
41
42/*
43 * Generate readable cap strings for debugging output.
44 */
45#define MAX_CAP_STR 20
46static char cap_str[MAX_CAP_STR][40];
47static DEFINE_SPINLOCK(cap_str_lock);
48static int last_cap_str;
49
50static char *gcap_string(char *s, int c)
51{
52 if (c & CEPH_CAP_GSHARED)
53 *s++ = 's';
54 if (c & CEPH_CAP_GEXCL)
55 *s++ = 'x';
56 if (c & CEPH_CAP_GCACHE)
57 *s++ = 'c';
58 if (c & CEPH_CAP_GRD)
59 *s++ = 'r';
60 if (c & CEPH_CAP_GWR)
61 *s++ = 'w';
62 if (c & CEPH_CAP_GBUFFER)
63 *s++ = 'b';
64 if (c & CEPH_CAP_GLAZYIO)
65 *s++ = 'l';
66 return s;
67}
68
69const char *ceph_cap_string(int caps)
70{
71 int i;
72 char *s;
73 int c;
74
75 spin_lock(&cap_str_lock);
76 i = last_cap_str++;
77 if (last_cap_str == MAX_CAP_STR)
78 last_cap_str = 0;
79 spin_unlock(&cap_str_lock);
80
81 s = cap_str[i];
82
83 if (caps & CEPH_CAP_PIN)
84 *s++ = 'p';
85
86 c = (caps >> CEPH_CAP_SAUTH) & 3;
87 if (c) {
88 *s++ = 'A';
89 s = gcap_string(s, c);
90 }
91
92 c = (caps >> CEPH_CAP_SLINK) & 3;
93 if (c) {
94 *s++ = 'L';
95 s = gcap_string(s, c);
96 }
97
98 c = (caps >> CEPH_CAP_SXATTR) & 3;
99 if (c) {
100 *s++ = 'X';
101 s = gcap_string(s, c);
102 }
103
104 c = caps >> CEPH_CAP_SFILE;
105 if (c) {
106 *s++ = 'F';
107 s = gcap_string(s, c);
108 }
109
110 if (s == cap_str[i])
111 *s++ = '-';
112 *s = 0;
113 return cap_str[i];
114}
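/*
 * Usage sketch (illustration only): format a cap bitmask for debug
 * output.  CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED, for example, renders
 * as "pFs".
 */
static void example_cap_string(void)
{
	dout("issued %s\n",
	     ceph_cap_string(CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED));
}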
115
116/*
117 * Cap reservations
118 *
119 * Maintain a global pool of preallocated struct ceph_caps, referenced
120 * by ceph_cap_reservation contexts. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{
137 INIT_LIST_HEAD(&caps_list);
138 spin_lock_init(&caps_list_lock);
139}
140
141void ceph_caps_finalize(void)
142{
143 struct ceph_cap *cap;
144
145 spin_lock(&caps_list_lock);
146 while (!list_empty(&caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
148 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap);
150 }
151 caps_total_count = 0;
152 caps_avail_count = 0;
153 caps_use_count = 0;
154 caps_reserve_count = 0;
155 caps_min_count = 0;
156 spin_unlock(&caps_list_lock);
157}
158
159void ceph_adjust_min_caps(int delta)
160{
161 spin_lock(&caps_list_lock);
162 caps_min_count += delta;
163 BUG_ON(caps_min_count < 0);
164 spin_unlock(&caps_list_lock);
165}
166
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
168{
169 int i;
170 struct ceph_cap *cap;
171 int have;
172 int alloc = 0;
173 LIST_HEAD(newcaps);
174 int ret = 0;
175
176 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177
178 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock);
180 if (caps_avail_count >= need)
181 have = need;
182 else
183 have = caps_avail_count;
184 caps_avail_count -= have;
185 caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
187 caps_avail_count);
188 spin_unlock(&caps_list_lock);
189
190 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
192 if (!cap) {
193 ret = -ENOMEM;
194 goto out_alloc_count;
195 }
196 list_add(&cap->caps_item, &newcaps);
197 alloc++;
198 }
199 BUG_ON(have + alloc != need);
200
201 spin_lock(&caps_list_lock);
202 caps_total_count += alloc;
203 caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list);
205
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
207 caps_avail_count);
208 spin_unlock(&caps_list_lock);
209
210 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count,
213 caps_avail_count);
214 return 0;
215
216out_alloc_count:
217 /* we didn't manage to reserve as much as we needed */
218 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
219 ctx, need, have);
220 return ret;
221}
222
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
224{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) {
227 spin_lock(&caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count;
231 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count,
234 caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
236 caps_avail_count);
237 spin_unlock(&caps_list_lock);
238 }
239 return 0;
240}
241
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
243{
244 struct ceph_cap *cap = NULL;
245
246 /* temporary, until we do something about cap import/export */
247 if (!ctx)
248 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
249
250 spin_lock(&caps_list_lock);
251 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
252 ctx, ctx->count, caps_total_count, caps_use_count,
253 caps_reserve_count, caps_avail_count);
254 BUG_ON(!ctx->count);
255 BUG_ON(ctx->count > caps_reserve_count);
256 BUG_ON(list_empty(&caps_list));
257
258 ctx->count--;
259 caps_reserve_count--;
260 caps_use_count++;
261
262 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
263 list_del(&cap->caps_item);
264
265 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
266 caps_avail_count);
267 spin_unlock(&caps_list_lock);
268 return cap;
269}
270
271void ceph_put_cap(struct ceph_cap *cap)
272{
273 spin_lock(&caps_list_lock);
274 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
275 cap, caps_total_count, caps_use_count,
276 caps_reserve_count, caps_avail_count);
277 caps_use_count--;
278 /*
279 * Keep some preallocated caps around (caps_min_count), to
280 * avoid lots of free/alloc churn.
281 */
282 if (caps_avail_count >= caps_reserve_count + caps_min_count) {
283 caps_total_count--;
284 kmem_cache_free(ceph_cap_cachep, cap);
285 } else {
286 caps_avail_count++;
287 list_add(&cap->caps_item, &caps_list);
288 }
289
290 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
291 caps_avail_count);
292 spin_unlock(&caps_list_lock);
293}
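/*
 * Lifecycle sketch for the reservation machinery above (illustration
 * only, not part of the original file): reserve before processing an
 * MDS reply, consume with get_cap(), then return whatever is left.
 */
static int example_cap_reservation(void)
{
	struct ceph_cap_reservation ctx = { .count = 0 };
	struct ceph_cap *cap;
	int err;

	err = ceph_reserve_caps(&ctx, 2);	/* preallocate two caps */
	if (err)
		return err;
	cap = get_cap(&ctx);			/* cannot fail once reserved */
	ceph_put_cap(cap);			/* returns cap to the pool */
	return ceph_unreserve_caps(&ctx);	/* drop the unused reservation */
}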
294
295void ceph_reservation_status(struct ceph_client *client,
296 int *total, int *avail, int *used, int *reserved,
297 int *min)
298{
299 if (total)
300 *total = caps_total_count;
301 if (avail)
302 *avail = caps_avail_count;
303 if (used)
304 *used = caps_use_count;
305 if (reserved)
306 *reserved = caps_reserve_count;
307 if (min)
308 *min = caps_min_count;
309}
310
311/*
312 * Find ceph_cap for given mds, if any.
313 *
314 * Called with i_lock held.
315 */
316static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
317{
318 struct ceph_cap *cap;
319 struct rb_node *n = ci->i_caps.rb_node;
320
321 while (n) {
322 cap = rb_entry(n, struct ceph_cap, ci_node);
323 if (mds < cap->mds)
324 n = n->rb_left;
325 else if (mds > cap->mds)
326 n = n->rb_right;
327 else
328 return cap;
329 }
330 return NULL;
331}
332
333/*
334 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
335 * -1.
336 */
337static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
338{
339 struct ceph_cap *cap;
340 int mds = -1;
341 struct rb_node *p;
342
343 /* prefer mds with WR|WRBUFFER|EXCL caps */
344 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
345 cap = rb_entry(p, struct ceph_cap, ci_node);
346 mds = cap->mds;
347 if (mseq)
348 *mseq = cap->mseq;
349 if (cap->issued & (CEPH_CAP_FILE_WR |
350 CEPH_CAP_FILE_BUFFER |
351 CEPH_CAP_FILE_EXCL))
352 break;
353 }
354 return mds;
355}
356
357int ceph_get_cap_mds(struct inode *inode)
358{
359 int mds;
360 spin_lock(&inode->i_lock);
361 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
362 spin_unlock(&inode->i_lock);
363 return mds;
364}
365
366/*
367 * Called under i_lock.
368 */
369static void __insert_cap_node(struct ceph_inode_info *ci,
370 struct ceph_cap *new)
371{
372 struct rb_node **p = &ci->i_caps.rb_node;
373 struct rb_node *parent = NULL;
374 struct ceph_cap *cap = NULL;
375
376 while (*p) {
377 parent = *p;
378 cap = rb_entry(parent, struct ceph_cap, ci_node);
379 if (new->mds < cap->mds)
380 p = &(*p)->rb_left;
381 else if (new->mds > cap->mds)
382 p = &(*p)->rb_right;
383 else
384 BUG();
385 }
386
387 rb_link_node(&new->ci_node, parent, p);
388 rb_insert_color(&new->ci_node, &ci->i_caps);
389}
390
391/*
392 * (re)set cap hold timeouts, which control the delayed release
393 * of unused caps back to the MDS. Should be called on cap use.
394 */
395static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
396 struct ceph_inode_info *ci)
397{
398 struct ceph_mount_args *ma = mdsc->client->mount_args;
399
400 ci->i_hold_caps_min = round_jiffies(jiffies +
401 ma->caps_wanted_delay_min * HZ);
402 ci->i_hold_caps_max = round_jiffies(jiffies +
403 ma->caps_wanted_delay_max * HZ);
404 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
405 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
406}
407
408/*
409 * (Re)queue cap at the end of the delayed cap release list.
410 *
411 * If I_FLUSH is set, leave the inode at the front of the list.
412 *
413 * Caller holds i_lock
414 * -> we take mdsc->cap_delay_lock
415 */
416static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
417 struct ceph_inode_info *ci)
418{
419 __cap_set_timeouts(mdsc, ci);
420 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
421 ci->i_ceph_flags, ci->i_hold_caps_max);
422 if (!mdsc->stopping) {
423 spin_lock(&mdsc->cap_delay_lock);
424 if (!list_empty(&ci->i_cap_delay_list)) {
425 if (ci->i_ceph_flags & CEPH_I_FLUSH)
426 goto no_change;
427 list_del_init(&ci->i_cap_delay_list);
428 }
429 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
430no_change:
431 spin_unlock(&mdsc->cap_delay_lock);
432 }
433}
434
435/*
436 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
437 * indicating we should send a cap message to flush dirty metadata
438 * asap, and move to the front of the delayed cap list.
439 */
440static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
441 struct ceph_inode_info *ci)
442{
443 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
444 spin_lock(&mdsc->cap_delay_lock);
445 ci->i_ceph_flags |= CEPH_I_FLUSH;
446 if (!list_empty(&ci->i_cap_delay_list))
447 list_del_init(&ci->i_cap_delay_list);
448 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
449 spin_unlock(&mdsc->cap_delay_lock);
450}
451
452/*
453 * Cancel delayed work on cap.
454 *
455 * Caller must hold i_lock.
456 */
457static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
458 struct ceph_inode_info *ci)
459{
460 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
461 if (list_empty(&ci->i_cap_delay_list))
462 return;
463 spin_lock(&mdsc->cap_delay_lock);
464 list_del_init(&ci->i_cap_delay_list);
465 spin_unlock(&mdsc->cap_delay_lock);
466}
467
468/*
469 * Common issue checks for add_cap, handle_cap_grant.
470 */
471static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
472 unsigned issued)
473{
474 unsigned had = __ceph_caps_issued(ci, NULL);
475
476 /*
477 * Each time we receive FILE_CACHE anew, we increment
478 * i_rdcache_gen.
479 */
480 if ((issued & CEPH_CAP_FILE_CACHE) &&
481 (had & CEPH_CAP_FILE_CACHE) == 0)
482 ci->i_rdcache_gen++;
483
484 /*
485 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
486 * don't know what happened to this directory while we didn't
487 * have the cap.
488 */
489 if ((issued & CEPH_CAP_FILE_SHARED) &&
490 (had & CEPH_CAP_FILE_SHARED) == 0) {
491 ci->i_shared_gen++;
492 if (S_ISDIR(ci->vfs_inode.i_mode)) {
493 dout(" marking %p NOT complete\n", &ci->vfs_inode);
494 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
495 }
496 }
497}
498
499/*
500 * Add a capability under the given MDS session.
501 *
502 * Caller should hold session snap_rwsem (read) and s_mutex.
503 *
504 * @fmode is the open file mode, if we are opening a file, otherwise
505 * it is < 0. (This is so we can atomically add the cap and add an
506 * open file reference to it.)
507 */
508int ceph_add_cap(struct inode *inode,
509 struct ceph_mds_session *session, u64 cap_id,
510 int fmode, unsigned issued, unsigned wanted,
511 unsigned seq, unsigned mseq, u64 realmino, int flags,
512 struct ceph_cap_reservation *caps_reservation)
513{
514 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
515 struct ceph_inode_info *ci = ceph_inode(inode);
516 struct ceph_cap *new_cap = NULL;
517 struct ceph_cap *cap;
518 int mds = session->s_mds;
519 int actual_wanted;
520
521 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
522 session->s_mds, cap_id, ceph_cap_string(issued), seq);
523
524 /*
525 * If we are opening the file, include file mode wanted bits
526 * in wanted.
527 */
528 if (fmode >= 0)
529 wanted |= ceph_caps_for_mode(fmode);
530
531retry:
532 spin_lock(&inode->i_lock);
533 cap = __get_cap_for_mds(ci, mds);
534 if (!cap) {
535 if (new_cap) {
536 cap = new_cap;
537 new_cap = NULL;
538 } else {
539 spin_unlock(&inode->i_lock);
540 new_cap = get_cap(caps_reservation);
541 if (new_cap == NULL)
542 return -ENOMEM;
543 goto retry;
544 }
545
546 cap->issued = 0;
547 cap->implemented = 0;
548 cap->mds = mds;
549 cap->mds_wanted = 0;
550
551 cap->ci = ci;
552 __insert_cap_node(ci, cap);
553
554 /* clear out old exporting info? (i.e. on cap import) */
555 if (ci->i_cap_exporting_mds == mds) {
556 ci->i_cap_exporting_issued = 0;
557 ci->i_cap_exporting_mseq = 0;
558 ci->i_cap_exporting_mds = -1;
559 }
560
561 /* add to session cap list */
562 cap->session = session;
563 spin_lock(&session->s_cap_lock);
564 list_add_tail(&cap->session_caps, &session->s_caps);
565 session->s_nr_caps++;
566 spin_unlock(&session->s_cap_lock);
567 }
568
569 if (!ci->i_snap_realm) {
570 /*
571 * add this inode to the appropriate snap realm
572 */
573 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
574 realmino);
575 if (realm) {
576 ceph_get_snap_realm(mdsc, realm);
577 spin_lock(&realm->inodes_with_caps_lock);
578 ci->i_snap_realm = realm;
579 list_add(&ci->i_snap_realm_item,
580 &realm->inodes_with_caps);
581 spin_unlock(&realm->inodes_with_caps_lock);
582 } else {
583 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
584 realmino);
585 }
586 }
587
588 __check_cap_issue(ci, cap, issued);
589
590 /*
591 * If we are issued caps we don't want, or the mds' wanted
592 * value appears to be off, queue a check so we'll release
593 * later and/or update the mds wanted value.
594 */
595 actual_wanted = __ceph_caps_wanted(ci);
596 if ((wanted & ~actual_wanted) ||
597 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
598 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
599 ceph_cap_string(issued), ceph_cap_string(wanted),
600 ceph_cap_string(actual_wanted));
601 __cap_delay_requeue(mdsc, ci);
602 }
603
604 if (flags & CEPH_CAP_FLAG_AUTH)
605 ci->i_auth_cap = cap;
606 else if (ci->i_auth_cap == cap)
607 ci->i_auth_cap = NULL;
608
609 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
610 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
611 ceph_cap_string(issued|cap->issued), seq, mds);
612 cap->cap_id = cap_id;
613 cap->issued = issued;
614 cap->implemented |= issued;
615 cap->mds_wanted |= wanted;
616 cap->seq = seq;
617 cap->issue_seq = seq;
618 cap->mseq = mseq;
619 cap->cap_gen = session->s_cap_gen;
620
621 if (fmode >= 0)
622 __ceph_get_fmode(ci, fmode);
623 spin_unlock(&inode->i_lock);
624 wake_up(&ci->i_cap_wq);
625 return 0;
626}
627
628/*
629 * Return true if cap has not timed out and belongs to the current
630 * generation of the MDS session (i.e. has not gone 'stale' due to
631 * us losing touch with the mds).
632 */
633static int __cap_is_valid(struct ceph_cap *cap)
634{
635 unsigned long ttl;
636 u32 gen;
637
638 spin_lock(&cap->session->s_cap_lock);
639 gen = cap->session->s_cap_gen;
640 ttl = cap->session->s_cap_ttl;
641 spin_unlock(&cap->session->s_cap_lock);
642
643 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
644 dout("__cap_is_valid %p cap %p issued %s "
645 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
646 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
647 return 0;
648 }
649
650 return 1;
651}
652
653/*
654 * Return set of valid cap bits issued to us. Note that caps time
655 * out, and may be invalidated in bulk if the client session times out
656 * and session->s_cap_gen is bumped.
657 */
658int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
659{
660 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
661 struct ceph_cap *cap;
662 struct rb_node *p;
663
664 if (implemented)
665 *implemented = 0;
666 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
667 cap = rb_entry(p, struct ceph_cap, ci_node);
668 if (!__cap_is_valid(cap))
669 continue;
670 dout("__ceph_caps_issued %p cap %p issued %s\n",
671 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
672 have |= cap->issued;
673 if (implemented)
674 *implemented |= cap->implemented;
675 }
676 return have;
677}
678
679/*
680 * Get cap bits issued by caps other than @ocap
681 */
682int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
683{
684 int have = ci->i_snap_caps;
685 struct ceph_cap *cap;
686 struct rb_node *p;
687
688 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
689 cap = rb_entry(p, struct ceph_cap, ci_node);
690 if (cap == ocap)
691 continue;
692 if (!__cap_is_valid(cap))
693 continue;
694 have |= cap->issued;
695 }
696 return have;
697}
698
699/*
700 * Move a cap to the end of the LRU (oldest caps at list head, newest
701 * at list tail).
702 */
703static void __touch_cap(struct ceph_cap *cap)
704{
705 struct ceph_mds_session *s = cap->session;
706
707 spin_lock(&s->s_cap_lock);
708 if (s->s_cap_iterator == NULL) {
709 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
710 s->s_mds);
711 list_move_tail(&cap->session_caps, &s->s_caps);
712 } else {
713 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
714 &cap->ci->vfs_inode, cap, s->s_mds);
715 }
716 spin_unlock(&s->s_cap_lock);
717}
718
719/*
720 * Check if we hold the given mask. If so, move the cap(s) to the
721 * tail of their respective LRUs. (This is the preferred way for
722 * callers to check for caps they want.)
723 */
724int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
725{
726 struct ceph_cap *cap;
727 struct rb_node *p;
728 int have = ci->i_snap_caps;
729
730 if ((have & mask) == mask) {
731 dout("__ceph_caps_issued_mask %p snap issued %s"
732 " (mask %s)\n", &ci->vfs_inode,
733 ceph_cap_string(have),
734 ceph_cap_string(mask));
735 return 1;
736 }
737
738 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
739 cap = rb_entry(p, struct ceph_cap, ci_node);
740 if (!__cap_is_valid(cap))
741 continue;
742 if ((cap->issued & mask) == mask) {
743 dout("__ceph_caps_issued_mask %p cap %p issued %s"
744 " (mask %s)\n", &ci->vfs_inode, cap,
745 ceph_cap_string(cap->issued),
746 ceph_cap_string(mask));
747 if (touch)
748 __touch_cap(cap);
749 return 1;
750 }
751
752 /* does a combination of caps satisfy mask? */
753 have |= cap->issued;
754 if ((have & mask) == mask) {
755 dout("__ceph_caps_issued_mask %p combo issued %s"
756 " (mask %s)\n", &ci->vfs_inode,
757 ceph_cap_string(cap->issued),
758 ceph_cap_string(mask));
759 if (touch) {
760 struct rb_node *q;
761
762 /* touch this + preceding caps */
763 __touch_cap(cap);
764 for (q = rb_first(&ci->i_caps); q != p;
765 q = rb_next(q)) {
766 cap = rb_entry(q, struct ceph_cap,
767 ci_node);
768 if (!__cap_is_valid(cap))
769 continue;
770 __touch_cap(cap);
771 }
772 }
773 return 1;
774 }
775 }
776
777 return 0;
778}
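/*
 * Caller-side sketch (illustration only): check for readable caps under
 * i_lock, touching the caps' LRU position when the mask is satisfied.
 */
static int example_have_read_caps(struct ceph_inode_info *ci)
{
	int have;

	spin_lock(&ci->vfs_inode.i_lock);
	have = __ceph_caps_issued_mask(ci,
				       CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD, 1);
	spin_unlock(&ci->vfs_inode.i_lock);
	return have;
}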
779
780/*
781 * Return true if mask caps are currently being revoked by an MDS.
782 */
783int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
784{
785 struct inode *inode = &ci->vfs_inode;
786 struct ceph_cap *cap;
787 struct rb_node *p;
788 int ret = 0;
789
790 spin_lock(&inode->i_lock);
791 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
792 cap = rb_entry(p, struct ceph_cap, ci_node);
793 if (__cap_is_valid(cap) &&
794 (cap->implemented & ~cap->issued & mask)) {
795 ret = 1;
796 break;
797 }
798 }
799 spin_unlock(&inode->i_lock);
800 dout("ceph_caps_revoking %p %s = %d\n", inode,
801 ceph_cap_string(mask), ret);
802 return ret;
803}
804
805int __ceph_caps_used(struct ceph_inode_info *ci)
806{
807 int used = 0;
808 if (ci->i_pin_ref)
809 used |= CEPH_CAP_PIN;
810 if (ci->i_rd_ref)
811 used |= CEPH_CAP_FILE_RD;
812 if (ci->i_rdcache_ref || ci->i_rdcache_gen)
813 used |= CEPH_CAP_FILE_CACHE;
814 if (ci->i_wr_ref)
815 used |= CEPH_CAP_FILE_WR;
816 if (ci->i_wrbuffer_ref)
817 used |= CEPH_CAP_FILE_BUFFER;
818 return used;
819}
820
821/*
822 * wanted, by virtue of open file modes
823 */
824int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
825{
826 int want = 0;
827 int mode;
828 for (mode = 0; mode < 4; mode++)
829 if (ci->i_nr_by_mode[mode])
830 want |= ceph_caps_for_mode(mode);
831 return want;
832}
833
834/*
835 * Return caps we have registered with the MDS(s) as 'wanted'.
836 */
837int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
838{
839 struct ceph_cap *cap;
840 struct rb_node *p;
841 int mds_wanted = 0;
842
843 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
844 cap = rb_entry(p, struct ceph_cap, ci_node);
845 if (!__cap_is_valid(cap))
846 continue;
847 mds_wanted |= cap->mds_wanted;
848 }
849 return mds_wanted;
850}
851
852/*
853 * called under i_lock
854 */
855static int __ceph_is_any_caps(struct ceph_inode_info *ci)
856{
857 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
858}
859
860/*
861 * caller should hold i_lock.
862 * caller will not hold session s_mutex if called from destroy_inode.
863 */
864void __ceph_remove_cap(struct ceph_cap *cap)
865{
866 struct ceph_mds_session *session = cap->session;
867 struct ceph_inode_info *ci = cap->ci;
868 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
869
870 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
871
872 /* remove from inode list */
873 rb_erase(&cap->ci_node, &ci->i_caps);
874 cap->ci = NULL;
875 if (ci->i_auth_cap == cap)
876 ci->i_auth_cap = NULL;
877
878 /* remove from session list */
879 spin_lock(&session->s_cap_lock);
880 if (session->s_cap_iterator == cap) {
881 /* not yet, we are iterating over this very cap */
882 dout("__ceph_remove_cap delaying %p removal from session %p\n",
883 cap, cap->session);
884 } else {
885 list_del_init(&cap->session_caps);
886 session->s_nr_caps--;
887 cap->session = NULL;
888 }
889 spin_unlock(&session->s_cap_lock);
890
891 if (cap->session == NULL)
892 ceph_put_cap(cap);
893
894 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
895 struct ceph_snap_realm *realm = ci->i_snap_realm;
896 spin_lock(&realm->inodes_with_caps_lock);
897 list_del_init(&ci->i_snap_realm_item);
898 ci->i_snap_realm_counter++;
899 ci->i_snap_realm = NULL;
900 spin_unlock(&realm->inodes_with_caps_lock);
901 ceph_put_snap_realm(mdsc, realm);
902 }
903 if (!__ceph_is_any_real_caps(ci))
904 __cap_delay_cancel(mdsc, ci);
905}
906
907/*
908 * Build and send a cap message to the given MDS.
909 *
910 * Caller should be holding s_mutex.
911 */
912static int send_cap_msg(struct ceph_mds_session *session,
913 u64 ino, u64 cid, int op,
914 int caps, int wanted, int dirty,
915 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
916 u64 size, u64 max_size,
917 struct timespec *mtime, struct timespec *atime,
918 u64 time_warp_seq,
919 uid_t uid, gid_t gid, mode_t mode,
920 u64 xattr_version,
921 struct ceph_buffer *xattrs_buf,
922 u64 follows)
923{
924 struct ceph_mds_caps *fc;
925 struct ceph_msg *msg;
926
927 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
928 " seq %u/%u mseq %u follows %lld size %llu/%llu"
929 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
930 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
931 ceph_cap_string(dirty),
932 seq, issue_seq, mseq, follows, size, max_size,
933 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
934
935 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
936 if (IS_ERR(msg))
937 return PTR_ERR(msg);
938
939 msg->hdr.tid = cpu_to_le64(flush_tid);
940
941 fc = msg->front.iov_base;
942 memset(fc, 0, sizeof(*fc));
943
944 fc->cap_id = cpu_to_le64(cid);
945 fc->op = cpu_to_le32(op);
946 fc->seq = cpu_to_le32(seq);
947 fc->issue_seq = cpu_to_le32(issue_seq);
948 fc->migrate_seq = cpu_to_le32(mseq);
949 fc->caps = cpu_to_le32(caps);
950 fc->wanted = cpu_to_le32(wanted);
951 fc->dirty = cpu_to_le32(dirty);
952 fc->ino = cpu_to_le64(ino);
953 fc->snap_follows = cpu_to_le64(follows);
954
955 fc->size = cpu_to_le64(size);
956 fc->max_size = cpu_to_le64(max_size);
957 if (mtime)
958 ceph_encode_timespec(&fc->mtime, mtime);
959 if (atime)
960 ceph_encode_timespec(&fc->atime, atime);
961 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
962
963 fc->uid = cpu_to_le32(uid);
964 fc->gid = cpu_to_le32(gid);
965 fc->mode = cpu_to_le32(mode);
966
967 fc->xattr_version = cpu_to_le64(xattr_version);
968 if (xattrs_buf) {
969 msg->middle = ceph_buffer_get(xattrs_buf);
970 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
971 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
972 }
973
974 ceph_con_send(&session->s_con, msg);
975 return 0;
976}
977
978/*
979 * Queue cap releases when an inode is dropped from our cache. Since
980 * the inode is about to be destroyed, there is no need for i_lock.
981 */
982void ceph_queue_caps_release(struct inode *inode)
983{
984 struct ceph_inode_info *ci = ceph_inode(inode);
985 struct rb_node *p;
986
987 p = rb_first(&ci->i_caps);
988 while (p) {
989 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
990 struct ceph_mds_session *session = cap->session;
991 struct ceph_msg *msg;
992 struct ceph_mds_cap_release *head;
993 struct ceph_mds_cap_item *item;
994
995 spin_lock(&session->s_cap_lock);
996 BUG_ON(!session->s_num_cap_releases);
997 msg = list_first_entry(&session->s_cap_releases,
998 struct ceph_msg, list_head);
999
1000 dout(" adding %p release to mds%d msg %p (%d left)\n",
1001 inode, session->s_mds, msg, session->s_num_cap_releases);
1002
1003 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1004 head = msg->front.iov_base;
1005 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1006 item = msg->front.iov_base + msg->front.iov_len;
1007 item->ino = cpu_to_le64(ceph_ino(inode));
1008 item->cap_id = cpu_to_le64(cap->cap_id);
1009 item->migrate_seq = cpu_to_le32(cap->mseq);
1010 item->seq = cpu_to_le32(cap->issue_seq);
1011
1012 session->s_num_cap_releases--;
1013
1014 msg->front.iov_len += sizeof(*item);
1015 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1016 dout(" release msg %p full\n", msg);
1017 list_move_tail(&msg->list_head,
1018 &session->s_cap_releases_done);
1019 } else {
1020 dout(" release msg %p at %d/%d (%d)\n", msg,
1021 (int)le32_to_cpu(head->num),
1022 (int)CEPH_CAPS_PER_RELEASE,
1023 (int)msg->front.iov_len);
1024 }
1025 spin_unlock(&session->s_cap_lock);
1026 p = rb_next(p);
1027 __ceph_remove_cap(cap);
1028 }
1029}
1030
1031/*
1032 * Send a cap msg on the given inode. Update our caps state, then
1033 * drop i_lock and send the message.
1034 *
1035 * Make note of max_size reported/requested from mds, revoked caps
1036 * that have now been implemented.
1037 *
1038 * Make a half-hearted attempt to invalidate the page cache if we are
1039 * dropping RDCACHE. Note that this will leave behind locked pages
1040 * that we'll then need to deal with elsewhere.
1041 *
1042 * Return non-zero if the release was delayed or we hit an error, in
1043 * which case the caller should requeue and retry later.
1044 *
1045 * called with i_lock, then drops it.
1046 * caller should hold snap_rwsem (read), s_mutex.
1047 */
1048static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1049 int op, int used, int want, int retain, int flushing,
1050 unsigned *pflush_tid)
1051 __releases(cap->ci->vfs_inode->i_lock)
1052{
1053 struct ceph_inode_info *ci = cap->ci;
1054 struct inode *inode = &ci->vfs_inode;
1055 u64 cap_id = cap->cap_id;
1056 int held, revoking, dropping, keep;
1057 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1058 u64 size, max_size;
1059 struct timespec mtime, atime;
1060 int wake = 0;
1061 mode_t mode;
1062 uid_t uid;
1063 gid_t gid;
1064 struct ceph_mds_session *session;
1065 u64 xattr_version = 0;
1066 int delayed = 0;
1067 u64 flush_tid = 0;
1068 int i;
1069 int ret;
1070
1071 held = cap->issued | cap->implemented;
1072 revoking = cap->implemented & ~cap->issued;
1073 retain &= ~revoking;
1074 dropping = cap->issued & ~retain;
1075
1076 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1077 inode, cap, cap->session,
1078 ceph_cap_string(held), ceph_cap_string(held & retain),
1079 ceph_cap_string(revoking));
1080 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1081
1082 session = cap->session;
1083
1084 /* don't release wanted unless we've waited a bit. */
1085 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1086 time_before(jiffies, ci->i_hold_caps_min)) {
1087 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1088 ceph_cap_string(cap->issued),
1089 ceph_cap_string(cap->issued & retain),
1090 ceph_cap_string(cap->mds_wanted),
1091 ceph_cap_string(want));
1092 want |= cap->mds_wanted;
1093 retain |= cap->issued;
1094 delayed = 1;
1095 }
1096 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1097
1098 cap->issued &= retain; /* drop bits we don't want */
1099 if (cap->implemented & ~cap->issued) {
1100 /*
1101 * Wake up any waiters on wanted -> needed transition.
1102 * This is due to the weird transition from buffered
1103 * to sync IO... we need to flush dirty pages _before_
1104 * allowing sync writes to avoid reordering.
1105 */
1106 wake = 1;
1107 }
1108 cap->implemented &= cap->issued | used;
1109 cap->mds_wanted = want;
1110
1111 if (flushing) {
1112 /*
1113 * assign a tid for flush operations so we can avoid
1114 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
1115 * clean type races. track latest tid for every bit
1116 * so we can handle flush AxFw, flush Fw, and have the
1117 * first ack clean Ax.
1118 */
1119 flush_tid = ++ci->i_cap_flush_last_tid;
1120 if (pflush_tid)
1121 *pflush_tid = flush_tid;
1122 dout(" cap_flush_tid %d\n", (int)flush_tid);
1123 for (i = 0; i < CEPH_CAP_BITS; i++)
1124 if (flushing & (1 << i))
1125 ci->i_cap_flush_tid[i] = flush_tid;
1126 }
1127
1128 keep = cap->implemented;
1129 seq = cap->seq;
1130 issue_seq = cap->issue_seq;
1131 mseq = cap->mseq;
1132 size = inode->i_size;
1133 ci->i_reported_size = size;
1134 max_size = ci->i_wanted_max_size;
1135 ci->i_requested_max_size = max_size;
1136 mtime = inode->i_mtime;
1137 atime = inode->i_atime;
1138 time_warp_seq = ci->i_time_warp_seq;
1139 follows = ci->i_snap_realm->cached_context->seq;
1140 uid = inode->i_uid;
1141 gid = inode->i_gid;
1142 mode = inode->i_mode;
1143
1144 if (dropping & CEPH_CAP_XATTR_EXCL) {
1145 __ceph_build_xattrs_blob(ci);
1146 xattr_version = ci->i_xattrs.version + 1;
1147 }
1148
1149 spin_unlock(&inode->i_lock);
1150
1151 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1152 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1153 size, max_size, &mtime, &atime, time_warp_seq,
1154 uid, gid, mode,
1155 xattr_version,
1156 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1157 follows);
1158 if (ret < 0) {
1159 dout("error sending cap msg, must requeue %p\n", inode);
1160 delayed = 1;
1161 }
1162
1163 if (wake)
1164 wake_up(&ci->i_cap_wq);
1165
1166 return delayed;
1167}
1168
1169/*
1170 * When a snapshot is taken, clients accumulate dirty metadata on
1171 * inodes with capabilities in ceph_cap_snaps to describe the file
1172 * state at the time the snapshot was taken. This must be flushed
1173 * asynchronously back to the MDS once sync writes complete and dirty
1174 * data is written out.
1175 *
1176 * Called under i_lock. Takes s_mutex as needed.
1177 */
1178void __ceph_flush_snaps(struct ceph_inode_info *ci,
1179 struct ceph_mds_session **psession)
1180{
1181 struct inode *inode = &ci->vfs_inode;
1182 int mds;
1183 struct ceph_cap_snap *capsnap;
1184 u32 mseq;
1185 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1186 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1187 session->s_mutex */
1188 u64 next_follows = 0; /* keep track of how far we've gotten through the
1189 i_cap_snaps list, and skip these entries next time
1190 around to avoid an infinite loop */
1191
1192 if (psession)
1193 session = *psession;
1194
1195 dout("__flush_snaps %p\n", inode);
1196retry:
1197 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1198 /* avoid an infinite loop after retry */
1199 if (capsnap->follows < next_follows)
1200 continue;
1201 /*
1202 * we need to wait for sync writes to complete and for dirty
1203 * pages to be written out.
1204 */
1205 if (capsnap->dirty_pages || capsnap->writing)
1206 continue;
1207
1208 /*
1209 * if cap writeback already occurred, we should have dropped
1210 * the capsnap in ceph_put_wrbuffer_cap_refs.
1211 */
1212 BUG_ON(capsnap->dirty == 0);
1213
1214 /* pick mds, take s_mutex */
1215 mds = __ceph_get_cap_mds(ci, &mseq);
1216 if (session && session->s_mds != mds) {
1217 dout("oops, wrong session %p mutex\n", session);
1218 mutex_unlock(&session->s_mutex);
1219 ceph_put_mds_session(session);
1220 session = NULL;
1221 }
1222 if (!session) {
1223 spin_unlock(&inode->i_lock);
1224 mutex_lock(&mdsc->mutex);
1225 session = __ceph_lookup_mds_session(mdsc, mds);
1226 mutex_unlock(&mdsc->mutex);
1227 if (session) {
1228 dout("inverting session/ino locks on %p\n",
1229 session);
1230 mutex_lock(&session->s_mutex);
1231 }
1232 /*
1233 * if session == NULL, we raced against a cap
1234 * deletion. retry, and we'll get a better
1235 * @mds value next time.
1236 */
1237 spin_lock(&inode->i_lock);
1238 goto retry;
1239 }
1240
1241 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1242 atomic_inc(&capsnap->nref);
1243 if (!list_empty(&capsnap->flushing_item))
1244 list_del_init(&capsnap->flushing_item);
1245 list_add_tail(&capsnap->flushing_item,
1246 &session->s_cap_snaps_flushing);
1247 spin_unlock(&inode->i_lock);
1248
1249 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
1250 inode, capsnap, next_follows, capsnap->size);
1251 send_cap_msg(session, ceph_vino(inode).ino, 0,
1252 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1253 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1254 capsnap->size, 0,
1255 &capsnap->mtime, &capsnap->atime,
1256 capsnap->time_warp_seq,
1257 capsnap->uid, capsnap->gid, capsnap->mode,
1258 0, NULL,
1259 capsnap->follows);
1260
1261 next_follows = capsnap->follows + 1;
1262 ceph_put_cap_snap(capsnap);
1263
1264 spin_lock(&inode->i_lock);
1265 goto retry;
1266 }
1267
1268 /* we flushed them all; remove this inode from the queue */
1269 spin_lock(&mdsc->snap_flush_lock);
1270 list_del_init(&ci->i_snap_flush_item);
1271 spin_unlock(&mdsc->snap_flush_lock);
1272
1273 if (psession)
1274 *psession = session;
1275 else if (session) {
1276 mutex_unlock(&session->s_mutex);
1277 ceph_put_mds_session(session);
1278 }
1279}
1280
1281static void ceph_flush_snaps(struct ceph_inode_info *ci)
1282{
1283 struct inode *inode = &ci->vfs_inode;
1284
1285 spin_lock(&inode->i_lock);
1286 __ceph_flush_snaps(ci, NULL);
1287 spin_unlock(&inode->i_lock);
1288}
1289
1290/*
1291 * Mark caps dirty. If inode is newly dirty, add to the global dirty
1292 * list.
1293 */
1294void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1295{
1296 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
1297 struct inode *inode = &ci->vfs_inode;
1298 int was = ci->i_dirty_caps;
1299 int dirty = 0;
1300
1301 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1302 ceph_cap_string(mask), ceph_cap_string(was),
1303 ceph_cap_string(was | mask));
1304 ci->i_dirty_caps |= mask;
1305 if (was == 0) {
1306 dout(" inode %p now dirty\n", &ci->vfs_inode);
1307 BUG_ON(!list_empty(&ci->i_dirty_item));
1308 spin_lock(&mdsc->cap_dirty_lock);
1309 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1310 spin_unlock(&mdsc->cap_dirty_lock);
1311 if (ci->i_flushing_caps == 0) {
1312 igrab(inode);
1313 dirty |= I_DIRTY_SYNC;
1314 }
1315 }
1316 BUG_ON(list_empty(&ci->i_dirty_item));
1317 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1318 (mask & CEPH_CAP_FILE_BUFFER))
1319 dirty |= I_DIRTY_DATASYNC;
1320 if (dirty)
1321 __mark_inode_dirty(inode, dirty);
1322 __cap_delay_requeue(mdsc, ci);
1323}
1324
1325/*
1326 * Add dirty inode to the flushing list. Assign a seq number so we
1327 * can wait for caps to flush without starving.
1328 *
1329 * Called under i_lock.
1330 */
1331static int __mark_caps_flushing(struct inode *inode,
1332 struct ceph_mds_session *session)
1333{
1334 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1335 struct ceph_inode_info *ci = ceph_inode(inode);
1336 int flushing;
1337
1338 BUG_ON(ci->i_dirty_caps == 0);
1339 BUG_ON(list_empty(&ci->i_dirty_item));
1340
1341 flushing = ci->i_dirty_caps;
1342 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1343 ceph_cap_string(flushing),
1344 ceph_cap_string(ci->i_flushing_caps),
1345 ceph_cap_string(ci->i_flushing_caps | flushing));
1346 ci->i_flushing_caps |= flushing;
1347 ci->i_dirty_caps = 0;
1348 dout(" inode %p now !dirty\n", inode);
1349
1350 spin_lock(&mdsc->cap_dirty_lock);
1351 list_del_init(&ci->i_dirty_item);
1352
1353 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1354 if (list_empty(&ci->i_flushing_item)) {
1355 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1356 mdsc->num_cap_flushing++;
1357 dout(" inode %p now flushing seq %lld\n", inode,
1358 ci->i_cap_flush_seq);
1359 } else {
1360 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1361 dout(" inode %p now flushing (more) seq %lld\n", inode,
1362 ci->i_cap_flush_seq);
1363 }
1364 spin_unlock(&mdsc->cap_dirty_lock);
1365
1366 return flushing;
1367}
1368
1369/*
1370 * try to invalidate mapping pages without blocking.
1371 */
1372static int mapping_is_empty(struct address_space *mapping)
1373{
1374 struct page *page = find_get_page(mapping, 0);
1375
1376 if (!page)
1377 return 1;
1378
1379 put_page(page);
1380 return 0;
1381}
1382
1383static int try_nonblocking_invalidate(struct inode *inode)
1384{
1385 struct ceph_inode_info *ci = ceph_inode(inode);
1386 u32 invalidating_gen = ci->i_rdcache_gen;
1387
1388 spin_unlock(&inode->i_lock);
1389 invalidate_mapping_pages(&inode->i_data, 0, -1);
1390 spin_lock(&inode->i_lock);
1391
1392 if (mapping_is_empty(&inode->i_data) &&
1393 invalidating_gen == ci->i_rdcache_gen) {
1394 /* success. */
1395 dout("try_nonblocking_invalidate %p success\n", inode);
1396 ci->i_rdcache_gen = 0;
1397 ci->i_rdcache_revoking = 0;
1398 return 0;
1399 }
1400 dout("try_nonblocking_invalidate %p failed\n", inode);
1401 return -1;
1402}
1403
1404/*
1405 * Swiss army knife function to examine currently used and wanted
1406 * versus held caps. Release, flush, ack revoked caps to mds as
1407 * appropriate.
1408 *
1409 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1410 * cap release further.
1411 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1412 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1413 * further delay.
1414 */
1415void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1416 struct ceph_mds_session *session)
1417 __releases(session->s_mutex)
1418{
1419 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1420 struct ceph_mds_client *mdsc = &client->mdsc;
1421 struct inode *inode = &ci->vfs_inode;
1422 struct ceph_cap *cap;
1423 int file_wanted, used;
1424 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1425 int issued, implemented, want, retain, revoking, flushing = 0;
1426 int mds = -1; /* keep track of how far we've gone through i_caps list
1427 to avoid an infinite loop on retry */
1428 struct rb_node *p;
1429 int tried_invalidate = 0;
1430 int delayed = 0, sent = 0, force_requeue = 0, num;
1431 int queue_invalidate = 0;
1432 int is_delayed = flags & CHECK_CAPS_NODELAY;
1433
1434 /* if we are unmounting, flush any unused caps immediately. */
1435 if (mdsc->stopping)
1436 is_delayed = 1;
1437
1438 spin_lock(&inode->i_lock);
1439
1440 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1441 flags |= CHECK_CAPS_FLUSH;
1442
1443 /* flush snaps first time around only */
1444 if (!list_empty(&ci->i_cap_snaps))
1445 __ceph_flush_snaps(ci, &session);
1446 goto retry_locked;
1447retry:
1448 spin_lock(&inode->i_lock);
1449retry_locked:
1450 file_wanted = __ceph_caps_file_wanted(ci);
1451 used = __ceph_caps_used(ci);
1452 want = file_wanted | used;
1453 issued = __ceph_caps_issued(ci, &implemented);
1454 revoking = implemented & ~issued;
1455
1456 retain = want | CEPH_CAP_PIN;
1457 if (!mdsc->stopping && inode->i_nlink > 0) {
1458 if (want) {
1459 retain |= CEPH_CAP_ANY; /* be greedy */
1460 } else {
1461 retain |= CEPH_CAP_ANY_SHARED;
1462 /*
1463 * keep RD only if we didn't have the file open RW,
1464 * because then the mds would revoke it anyway to
1465 * journal max_size=0.
1466 */
1467 if (ci->i_max_size == 0)
1468 retain |= CEPH_CAP_ANY_RD;
1469 }
1470 }
1471
1472 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1473 " issued %s revoking %s retain %s %s%s%s\n", inode,
1474 ceph_cap_string(file_wanted),
1475 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1476 ceph_cap_string(ci->i_flushing_caps),
1477 ceph_cap_string(issued), ceph_cap_string(revoking),
1478 ceph_cap_string(retain),
1479 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1480 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1481 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1482
1483 /*
1484 * If we no longer need to hold onto our old caps, and we may
1485 * have cached pages, but don't want them, then try to invalidate.
1486 * If we fail, it's because pages are locked... try again later.
1487 */
1488 if ((!is_delayed || mdsc->stopping) &&
1489 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1490 ci->i_rdcache_gen && /* may have cached pages */
1491 (file_wanted == 0 || /* no open files */
1492 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
1493 !tried_invalidate) {
1494 dout("check_caps trying to invalidate on %p\n", inode);
1495 if (try_nonblocking_invalidate(inode) < 0) {
1496 if (revoking & CEPH_CAP_FILE_CACHE) {
1497 dout("check_caps queuing invalidate\n");
1498 queue_invalidate = 1;
1499 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1500 } else {
1501 dout("check_caps failed to invalidate pages\n");
1502 /* we failed to invalidate pages. check these
1503 caps again later. */
1504 force_requeue = 1;
1505 __cap_set_timeouts(mdsc, ci);
1506 }
1507 }
1508 tried_invalidate = 1;
1509 goto retry_locked;
1510 }
1511
1512 num = 0;
1513 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1514 cap = rb_entry(p, struct ceph_cap, ci_node);
1515 num++;
1516
1517 /* avoid looping forever */
1518 if (mds >= cap->mds ||
1519 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1520 continue;
1521
1522 /* NOTE: no side-effects allowed, until we take s_mutex */
1523
1524 revoking = cap->implemented & ~cap->issued;
1525 if (revoking)
1526 dout(" mds%d revoking %s\n", cap->mds,
1527 ceph_cap_string(revoking));
1528
1529 if (cap == ci->i_auth_cap &&
1530 (cap->issued & CEPH_CAP_FILE_WR)) {
1531 /* request larger max_size from MDS? */
1532 if (ci->i_wanted_max_size > ci->i_max_size &&
1533 ci->i_wanted_max_size > ci->i_requested_max_size) {
1534 dout("requesting new max_size\n");
1535 goto ack;
1536 }
1537
1538 /* approaching file_max? */
1539 if ((inode->i_size << 1) >= ci->i_max_size &&
1540 (ci->i_reported_size << 1) < ci->i_max_size) {
1541 dout("i_size approaching max_size\n");
1542 goto ack;
1543 }
1544 }
1545 /* flush anything dirty? */
1546 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1547 ci->i_dirty_caps) {
1548 dout("flushing dirty caps\n");
1549 goto ack;
1550 }
1551
1552 /* completed revocation? going down and there are no caps? */
1553 if (revoking && (revoking & used) == 0) {
1554 dout("completed revocation of %s\n",
1555 ceph_cap_string(cap->implemented & ~cap->issued));
1556 goto ack;
1557 }
1558
1559 /* want more caps from mds? */
1560 if (want & ~(cap->mds_wanted | cap->issued))
1561 goto ack;
1562
1563 /* things we might delay */
1564 if ((cap->issued & ~retain) == 0 &&
1565 cap->mds_wanted == want)
1566 continue; /* nope, all good */
1567
1568 if (is_delayed)
1569 goto ack;
1570
1571 /* delay? */
1572 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1573 time_before(jiffies, ci->i_hold_caps_max)) {
1574 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1575 ceph_cap_string(cap->issued),
1576 ceph_cap_string(cap->issued & retain),
1577 ceph_cap_string(cap->mds_wanted),
1578 ceph_cap_string(want));
1579 delayed++;
1580 continue;
1581 }
1582
1583ack:
1584 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1585 dout(" skipping %p I_NOFLUSH set\n", inode);
1586 continue;
1587 }
1588
1589 if (session && session != cap->session) {
1590 dout("oops, wrong session %p mutex\n", session);
1591 mutex_unlock(&session->s_mutex);
1592 session = NULL;
1593 }
1594 if (!session) {
1595 session = cap->session;
1596 if (mutex_trylock(&session->s_mutex) == 0) {
1597 dout("inverting session/ino locks on %p\n",
1598 session);
1599 spin_unlock(&inode->i_lock);
1600 if (took_snap_rwsem) {
1601 up_read(&mdsc->snap_rwsem);
1602 took_snap_rwsem = 0;
1603 }
1604 mutex_lock(&session->s_mutex);
1605 goto retry;
1606 }
1607 }
1608 /* take snap_rwsem after session mutex */
1609 if (!took_snap_rwsem) {
1610 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1611 dout("inverting snap/in locks on %p\n",
1612 inode);
1613 spin_unlock(&inode->i_lock);
1614 down_read(&mdsc->snap_rwsem);
1615 took_snap_rwsem = 1;
1616 goto retry;
1617 }
1618 took_snap_rwsem = 1;
1619 }
1620
1621 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1622 flushing = __mark_caps_flushing(inode, session);
1623
1624 mds = cap->mds; /* remember mds, so we don't repeat */
1625 sent++;
1626
1627 /* __send_cap drops i_lock */
1628 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1629 retain, flushing, NULL);
1630 goto retry; /* retake i_lock and restart our cap scan. */
1631 }
1632
1633 /*
1634 * Reschedule delayed caps release if we delayed anything,
1635 * otherwise cancel.
1636 */
1637 if (delayed && is_delayed)
1638 force_requeue = 1; /* __send_cap delayed release; requeue */
1639 if (!delayed && !is_delayed)
1640 __cap_delay_cancel(mdsc, ci);
1641 else if (!is_delayed || force_requeue)
1642 __cap_delay_requeue(mdsc, ci);
1643
1644 spin_unlock(&inode->i_lock);
1645
1646 if (queue_invalidate)
1647 ceph_queue_invalidate(inode);
1648
1649 if (session)
1650 mutex_unlock(&session->s_mutex);
1651 if (took_snap_rwsem)
1652 up_read(&mdsc->snap_rwsem);
1653}
1654
1655/*
1656 * Try to flush dirty caps back to the auth mds.
1657 */
1658static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1659 unsigned *flush_tid)
1660{
1661 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1662 struct ceph_inode_info *ci = ceph_inode(inode);
1663 int unlock_session = session ? 0 : 1;
1664 int flushing = 0;
1665
1666retry:
1667 spin_lock(&inode->i_lock);
1668 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1669 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1670 goto out;
1671 }
1672 if (ci->i_dirty_caps && ci->i_auth_cap) {
1673 struct ceph_cap *cap = ci->i_auth_cap;
1674 int used = __ceph_caps_used(ci);
1675 int want = __ceph_caps_wanted(ci);
1676 int delayed;
1677
1678 if (!session) {
1679 spin_unlock(&inode->i_lock);
1680 session = cap->session;
1681 mutex_lock(&session->s_mutex);
1682 goto retry;
1683 }
1684 BUG_ON(session != cap->session);
1685 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1686 goto out;
1687
1688 flushing = __mark_caps_flushing(inode, session);
1689
1690 /* __send_cap drops i_lock */
1691 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1692 cap->issued | cap->implemented, flushing,
1693 flush_tid);
1694 if (!delayed)
1695 goto out_unlocked;
1696
1697 spin_lock(&inode->i_lock);
1698 __cap_delay_requeue(mdsc, ci);
1699 }
1700out:
1701 spin_unlock(&inode->i_lock);
1702out_unlocked:
1703 if (session && unlock_session)
1704 mutex_unlock(&session->s_mutex);
1705 return flushing;
1706}
1707
1708/*
1709 * Return true if we've flushed caps through the given flush_tid.
1710 */
1711static int caps_are_flushed(struct inode *inode, unsigned tid)
1712{
1713 struct ceph_inode_info *ci = ceph_inode(inode);
1714 int dirty, i, ret = 1;
1715
1716 spin_lock(&inode->i_lock);
1717 dirty = __ceph_caps_dirty(ci);
1718 for (i = 0; i < CEPH_CAP_BITS; i++)
1719 if ((ci->i_flushing_caps & (1 << i)) &&
1720 ci->i_cap_flush_tid[i] <= tid) {
1721 /* still flushing this bit */
1722 ret = 0;
1723 break;
1724 }
1725 spin_unlock(&inode->i_lock);
1726 return ret;
1727}
1728
1729/*
1730 * Wait on any unsafe replies for the given inode. First wait on the
1731 * newest request, and make that the upper bound. Then, if there are
1732 * more requests, keep waiting on the oldest as long as it is still older
1733 * than the original request.
1734 */
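/*
 * For example: with unsafe writes pending for tids 3, 5 and 9, we wait
 * on tid 9 first (the newest) and use it as the upper bound, then walk
 * from the head of the list, waiting on 3 and then 5; a new unsafe
 * write with tid 12 that appears meanwhile is ignored since 12 >= 9.
 */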
1735static void sync_write_wait(struct inode *inode)
1736{
1737 struct ceph_inode_info *ci = ceph_inode(inode);
1738 struct list_head *head = &ci->i_unsafe_writes;
1739 struct ceph_osd_request *req;
1740 u64 last_tid;
1741
1742 spin_lock(&ci->i_unsafe_lock);
1743 if (list_empty(head))
1744 goto out;
1745
1746 /* set upper bound as _last_ entry in chain */
1747 req = list_entry(head->prev, struct ceph_osd_request,
1748 r_unsafe_item);
1749 last_tid = req->r_tid;
1750
1751 do {
1752 ceph_osdc_get_request(req);
1753 spin_unlock(&ci->i_unsafe_lock);
1754 dout("sync_write_wait on tid %llu (until %llu)\n",
1755 req->r_tid, last_tid);
1756 wait_for_completion(&req->r_safe_completion);
1757 spin_lock(&ci->i_unsafe_lock);
1758 ceph_osdc_put_request(req);
1759
1760 /*
1761 * from here on look at first entry in chain, since we
1762 * only want to wait for anything older than last_tid
1763 */
1764 if (list_empty(head))
1765 break;
1766 req = list_entry(head->next, struct ceph_osd_request,
1767 r_unsafe_item);
1768 } while (req->r_tid < last_tid);
1769out:
1770 spin_unlock(&ci->i_unsafe_lock);
1771}
1772
1773int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
1774{
1775 struct inode *inode = dentry->d_inode;
1776 struct ceph_inode_info *ci = ceph_inode(inode);
1777 unsigned flush_tid;
1778 int ret;
1779 int dirty;
1780
1781 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1782 sync_write_wait(inode);
1783
1784 ret = filemap_write_and_wait(inode->i_mapping);
1785 if (ret < 0)
1786 return ret;
1787
1788 dirty = try_flush_caps(inode, NULL, &flush_tid);
1789 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1790
1791 /*
1792 * only wait on non-file metadata writeback (the mds
1793 * can recover size and mtime, so we don't need to
1794 * wait for that)
1795 */
1796 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1797 dout("fsync waiting for flush_tid %u\n", flush_tid);
1798 ret = wait_event_interruptible(ci->i_cap_wq,
1799 caps_are_flushed(inode, flush_tid));
1800 }
1801
1802 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1803 return ret;
1804}
1805
1806/*
1807 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1808 * queue inode for flush but don't do so immediately, because we can
1809 * get by with fewer MDS messages if we wait for data writeback to
1810 * complete first.
1811 */
1812int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1813{
1814 struct ceph_inode_info *ci = ceph_inode(inode);
1815 unsigned flush_tid;
1816 int err = 0;
1817 int dirty;
1818 int wait = wbc->sync_mode == WB_SYNC_ALL;
1819
1820 dout("write_inode %p wait=%d\n", inode, wait);
1821 if (wait) {
1822 dirty = try_flush_caps(inode, NULL, &flush_tid);
1823 if (dirty)
1824 err = wait_event_interruptible(ci->i_cap_wq,
1825 caps_are_flushed(inode, flush_tid));
1826 } else {
1827 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1828
1829 spin_lock(&inode->i_lock);
1830 if (__ceph_caps_dirty(ci))
1831 __cap_delay_requeue_front(mdsc, ci);
1832 spin_unlock(&inode->i_lock);
1833 }
1834 return err;
1835}
1836
1837/*
1838 * After a recovering MDS goes active, we need to resend any caps
1839 * we were flushing.
1840 *
1841 * Caller holds session->s_mutex.
1842 */
1843static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1844 struct ceph_mds_session *session)
1845{
1846 struct ceph_cap_snap *capsnap;
1847
1848 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1849 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1850 flushing_item) {
1851 struct ceph_inode_info *ci = capsnap->ci;
1852 struct inode *inode = &ci->vfs_inode;
1853 struct ceph_cap *cap;
1854
1855 spin_lock(&inode->i_lock);
1856 cap = ci->i_auth_cap;
1857 if (cap && cap->session == session) {
1858 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1859 cap, capsnap);
1860 __ceph_flush_snaps(ci, &session);
1861 } else {
1862 pr_err("%p auth cap %p not mds%d ???\n", inode,
1863 cap, session->s_mds);
1864 spin_unlock(&inode->i_lock);
1865 }
1866 }
1867}
1868
1869void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1870 struct ceph_mds_session *session)
1871{
1872 struct ceph_inode_info *ci;
1873
1874 kick_flushing_capsnaps(mdsc, session);
1875
1876 dout("kick_flushing_caps mds%d\n", session->s_mds);
1877 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1878 struct inode *inode = &ci->vfs_inode;
1879 struct ceph_cap *cap;
1880 int delayed = 0;
1881
1882 spin_lock(&inode->i_lock);
1883 cap = ci->i_auth_cap;
1884 if (cap && cap->session == session) {
1885 dout("kick_flushing_caps %p cap %p %s\n", inode,
1886 cap, ceph_cap_string(ci->i_flushing_caps));
1887 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1888 __ceph_caps_used(ci),
1889 __ceph_caps_wanted(ci),
1890 cap->issued | cap->implemented,
1891 ci->i_flushing_caps, NULL);
1892 if (delayed) {
1893 spin_lock(&inode->i_lock);
1894 __cap_delay_requeue(mdsc, ci);
1895 spin_unlock(&inode->i_lock);
1896 }
1897 } else {
1898 pr_err("%p auth cap %p not mds%d ???\n", inode,
1899 cap, session->s_mds);
1900 spin_unlock(&inode->i_lock);
1901 }
1902 }
1903}
1904
1905
1906/*
1907 * Take references to capabilities we hold, so that we don't release
1908 * them to the MDS prematurely.
1909 *
1910 * Protected by i_lock.
1911 */
1912static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1913{
1914 if (got & CEPH_CAP_PIN)
1915 ci->i_pin_ref++;
1916 if (got & CEPH_CAP_FILE_RD)
1917 ci->i_rd_ref++;
1918 if (got & CEPH_CAP_FILE_CACHE)
1919 ci->i_rdcache_ref++;
1920 if (got & CEPH_CAP_FILE_WR)
1921 ci->i_wr_ref++;
1922 if (got & CEPH_CAP_FILE_BUFFER) {
1923 if (ci->i_wrbuffer_ref == 0)
1924 igrab(&ci->vfs_inode);
1925 ci->i_wrbuffer_ref++;
1926 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1927 &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1928 }
1929}
1930
1931/*
1932 * Try to grab cap references. Specify those refs we @want, and the
1933 * minimal set we @need. Also include the larger offset we are writing
1934 * to (when applicable), and check against max_size here as well.
1935 * Note that caller is responsible for ensuring max_size increases are
1936 * requested from the MDS.
1937 */
1938static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
1939 int *got, loff_t endoff, int *check_max, int *err)
1940{
1941 struct inode *inode = &ci->vfs_inode;
1942 int ret = 0;
1943 int have, implemented;
1944 int file_wanted;
1945
1946 dout("get_cap_refs %p need %s want %s\n", inode,
1947 ceph_cap_string(need), ceph_cap_string(want));
1948 spin_lock(&inode->i_lock);
1949
1950 /* make sure file is actually open */
1951 file_wanted = __ceph_caps_file_wanted(ci);
1952 if ((file_wanted & need) == 0) {
1953 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
1954 ceph_cap_string(need), ceph_cap_string(file_wanted));
1955 *err = -EBADF;
1956 ret = 1;
1957 goto out;
1958 }
1959
1960 if (need & CEPH_CAP_FILE_WR) {
1961 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
1962 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
1963 inode, endoff, ci->i_max_size);
1964 if (endoff > ci->i_wanted_max_size) {
1965 *check_max = 1;
1966 ret = 1;
1967 }
1968 goto out;
1969 }
1970 /*
1971 * If a sync write is in progress, we must wait, so that we
1972 * can get a final snapshot value for size+mtime.
1973 */
1974 if (__ceph_have_pending_cap_snap(ci)) {
1975 dout("get_cap_refs %p cap_snap_pending\n", inode);
1976 goto out;
1977 }
1978 }
1979 have = __ceph_caps_issued(ci, &implemented);
1980
1981 /*
1982 * disallow writes while a truncate is pending
1983 */
1984 if (ci->i_truncate_pending)
1985 have &= ~CEPH_CAP_FILE_WR;
1986
1987 if ((have & need) == need) {
1988 /*
1989 * Look at (implemented & ~have & not) so that we keep waiting
1990 * on transition from wanted -> needed caps. This is needed
1991 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
1992 * going before a prior buffered writeback happens.
1993 */
1994 int not = want & ~(have & need);
1995 int revoking = implemented & ~have;
1996 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
1997 inode, ceph_cap_string(have), ceph_cap_string(not),
1998 ceph_cap_string(revoking));
1999 if ((revoking & not) == 0) {
2000 *got = need | (have & want);
2001 __take_cap_refs(ci, *got);
2002 ret = 1;
2003 }
2004 } else {
2005 dout("get_cap_refs %p have %s needed %s\n", inode,
2006 ceph_cap_string(have), ceph_cap_string(need));
2007 }
2008out:
2009 spin_unlock(&inode->i_lock);
2010 dout("get_cap_refs %p ret %d got %s\n", inode,
2011 ret, ceph_cap_string(*got));
2012 return ret;
2013}
2014
2015/*
2016 * Check the offset we are writing up to against our current
2017 * max_size. If necessary, tell the MDS we want to write to
2018 * a larger offset.
2019 */
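/*
 * For example: with i_max_size at 4 MB, a write extending the file to
 * offset 6 MB records i_wanted_max_size = 6 MB and pokes the auth MDS
 * (CHECK_CAPS_AUTHONLY), so that ceph_get_caps() below can eventually
 * proceed once a larger max_size is granted.
 */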
2020static void check_max_size(struct inode *inode, loff_t endoff)
2021{
2022 struct ceph_inode_info *ci = ceph_inode(inode);
2023 int check = 0;
2024
2025 /* do we need to explicitly request a larger max_size? */
2026 spin_lock(&inode->i_lock);
2027 if ((endoff >= ci->i_max_size ||
2028 endoff > (inode->i_size << 1)) &&
2029 endoff > ci->i_wanted_max_size) {
2030 dout("write %p at large endoff %llu, req max_size\n",
2031 inode, endoff);
2032 ci->i_wanted_max_size = endoff;
2033 check = 1;
2034 }
2035 spin_unlock(&inode->i_lock);
2036 if (check)
2037 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2038}
2039
2040/*
2041 * Wait for caps, and take cap references. If we can't get a WR cap
2042 * due to a small max_size, make sure we check_max_size (and possibly
2043 * ask the mds) so we don't get hung up indefinitely.
2044 */
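/*
 * A writer might, for example, call
 *	ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
 *		      &got, pos + count);
 * and fall back to a synchronous write if only Fw (and not Fb) shows
 * up in @got.  (Illustrative only; see the actual callers in file.c.)
 */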
2045int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2046 loff_t endoff)
2047{
2048 int check_max, ret, err;
2049
2050retry:
2051 if (endoff > 0)
2052 check_max_size(&ci->vfs_inode, endoff);
2053 check_max = 0;
2054 err = 0;
2055 ret = wait_event_interruptible(ci->i_cap_wq,
2056 try_get_cap_refs(ci, need, want,
2057 got, endoff,
2058 &check_max, &err));
2059 if (err)
2060 ret = err;
2061 if (check_max)
2062 goto retry;
2063 return ret;
2064}
2065
2066/*
2067 * Take cap refs. Caller must already know we hold at least one ref
2068 * on the caps in question or we don't know this is safe.
2069 */
2070void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2071{
2072 spin_lock(&ci->vfs_inode.i_lock);
2073 __take_cap_refs(ci, caps);
2074 spin_unlock(&ci->vfs_inode.i_lock);
2075}
2076
2077/*
2078 * Release cap refs.
2079 *
2080 * If we released the last ref on any given cap, call ceph_check_caps
2081 * to release (or schedule a release).
2082 *
2083 * If we are releasing a WR cap (from a sync write), finalize any affected
2084 * cap_snap, and wake up any waiters.
2085 */
2086void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2087{
2088 struct inode *inode = &ci->vfs_inode;
2089 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2090 struct ceph_cap_snap *capsnap;
2091
2092 spin_lock(&inode->i_lock);
2093 if (had & CEPH_CAP_PIN)
2094 --ci->i_pin_ref;
2095 if (had & CEPH_CAP_FILE_RD)
2096 if (--ci->i_rd_ref == 0)
2097 last++;
2098 if (had & CEPH_CAP_FILE_CACHE)
2099 if (--ci->i_rdcache_ref == 0)
2100 last++;
2101 if (had & CEPH_CAP_FILE_BUFFER) {
2102 if (--ci->i_wrbuffer_ref == 0) {
2103 last++;
2104 put++;
2105 }
2106 dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
2107 inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
2108 }
2109 if (had & CEPH_CAP_FILE_WR)
2110 if (--ci->i_wr_ref == 0) {
2111 last++;
2112 if (!list_empty(&ci->i_cap_snaps)) {
2113 capsnap = list_first_entry(&ci->i_cap_snaps,
2114 struct ceph_cap_snap,
2115 ci_item);
2116 if (capsnap->writing) {
2117 capsnap->writing = 0;
2118 flushsnaps =
2119 __ceph_finish_cap_snap(ci,
2120 capsnap);
2121 wake = 1;
2122 }
2123 }
2124 }
2125 spin_unlock(&inode->i_lock);
2126
2127 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
2128 last ? " last" : "", put ? " put" : "");
2129
2130 if (last && !flushsnaps)
2131 ceph_check_caps(ci, 0, NULL);
2132 else if (flushsnaps)
2133 ceph_flush_snaps(ci);
2134 if (wake)
2135 wake_up(&ci->i_cap_wq);
2136 if (put)
2137 iput(inode);
2138}
2139
2140/*
2141 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2142 * context. Adjust per-snap dirty page accounting as appropriate.
2143 * Once all dirty data for a cap_snap is flushed, flush snapped file
2144 * metadata back to the MDS. If we dropped the last ref, call
2145 * ceph_check_caps.
2146 */
2147void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2148 struct ceph_snap_context *snapc)
2149{
2150 struct inode *inode = &ci->vfs_inode;
2151 int last = 0;
2152 int complete_capsnap = 0;
2153 int drop_capsnap = 0;
2154 int found = 0;
2155 struct ceph_cap_snap *capsnap = NULL;
2156
2157 spin_lock(&inode->i_lock);
2158 ci->i_wrbuffer_ref -= nr;
2159 last = !ci->i_wrbuffer_ref;
2160
2161 if (ci->i_head_snapc == snapc) {
2162 ci->i_wrbuffer_ref_head -= nr;
2163 if (!ci->i_wrbuffer_ref_head) {
2164 ceph_put_snap_context(ci->i_head_snapc);
2165 ci->i_head_snapc = NULL;
2166 }
2167 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2168 inode,
2169 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2170 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2171 last ? " LAST" : "");
2172 } else {
2173 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2174 if (capsnap->context == snapc) {
2175 found = 1;
2176 break;
2177 }
2178 }
2179 BUG_ON(!found);
2180 capsnap->dirty_pages -= nr;
2181 if (capsnap->dirty_pages == 0) {
2182 complete_capsnap = 1;
2183 if (capsnap->dirty == 0)
2184 /* cap writeback completed before we created
2185 * the cap_snap; no FLUSHSNAP is needed */
2186 drop_capsnap = 1;
2187 }
2188 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2189 " snap %lld %d/%d -> %d/%d %s%s%s\n",
2190 inode, capsnap, capsnap->context->seq,
2191 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2192 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2193 last ? " (wrbuffer last)" : "",
2194 complete_capsnap ? " (complete capsnap)" : "",
2195 drop_capsnap ? " (drop capsnap)" : "");
2196 if (drop_capsnap) {
2197 ceph_put_snap_context(capsnap->context);
2198 list_del(&capsnap->ci_item);
2199 list_del(&capsnap->flushing_item);
2200 ceph_put_cap_snap(capsnap);
2201 }
2202 }
2203
2204 spin_unlock(&inode->i_lock);
2205
2206 if (last) {
2207 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2208 iput(inode);
2209 } else if (complete_capsnap) {
2210 ceph_flush_snaps(ci);
2211 wake_up(&ci->i_cap_wq);
2212 }
2213 if (drop_capsnap)
2214 iput(inode);
2215}
2216
2217/*
2218 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2219 * actually be a revocation if it specifies a smaller cap set.)
2220 *
2221 * caller holds s_mutex and i_lock, we drop both.
2222 *
2223 * Follow-up work is recorded in the local check_caps value rather
2224 * than returned (the function is void):
2225 * 1 - ceph_check_caps on auth cap only (initiate writeback)
2226 * 2 - ceph_check_caps (ack the revoke)
2227 */
2228static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2229 struct ceph_mds_session *session,
2230 struct ceph_cap *cap,
2231 struct ceph_buffer *xattr_buf)
2232 __releases(inode->i_lock)
2233 __releases(session->s_mutex)
2234{
2235 struct ceph_inode_info *ci = ceph_inode(inode);
2236 int mds = session->s_mds;
2237 int seq = le32_to_cpu(grant->seq);
2238 int newcaps = le32_to_cpu(grant->caps);
2239 int issued, implemented, used, wanted, dirty;
2240 u64 size = le64_to_cpu(grant->size);
2241 u64 max_size = le64_to_cpu(grant->max_size);
2242 struct timespec mtime, atime, ctime;
2243 int check_caps = 0;
2244 int wake = 0;
2245 int writeback = 0;
2246 int revoked_rdcache = 0;
2247 int queue_invalidate = 0;
2248
2249 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2250 inode, cap, mds, seq, ceph_cap_string(newcaps));
2251 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2252 inode->i_size);
2253
2254 /*
2255 * If CACHE is being revoked, and we have no dirty buffers,
2256 * try to invalidate (once). (If there are dirty buffers, we
2257 * will invalidate _after_ writeback.)
2258 */
2259 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2260 !ci->i_wrbuffer_ref) {
2261 if (try_nonblocking_invalidate(inode) == 0) {
2262 revoked_rdcache = 1;
2263 } else {
2264 /* there were locked pages.. invalidate later
2265 in a separate thread. */
2266 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2267 queue_invalidate = 1;
2268 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2269 }
2270 }
2271 }
2272
2273 /* side effects now are allowed */
2274
2275 issued = __ceph_caps_issued(ci, &implemented);
2276 issued |= implemented | __ceph_caps_dirty(ci);
2277
2278 cap->cap_gen = session->s_cap_gen;
2279
2280 __check_cap_issue(ci, cap, newcaps);
2281
2282 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2283 inode->i_mode = le32_to_cpu(grant->mode);
2284 inode->i_uid = le32_to_cpu(grant->uid);
2285 inode->i_gid = le32_to_cpu(grant->gid);
2286 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2287 inode->i_uid, inode->i_gid);
2288 }
2289
2290 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2291 inode->i_nlink = le32_to_cpu(grant->nlink);
2292
2293 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2294 int len = le32_to_cpu(grant->xattr_len);
2295 u64 version = le64_to_cpu(grant->xattr_version);
2296
2297 if (version > ci->i_xattrs.version) {
2298 dout(" got new xattrs v%llu on %p len %d\n",
2299 version, inode, len);
2300 if (ci->i_xattrs.blob)
2301 ceph_buffer_put(ci->i_xattrs.blob);
2302 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2303 ci->i_xattrs.version = version;
2304 }
2305 }
2306
2307 /* size/ctime/mtime/atime? */
2308 ceph_fill_file_size(inode, issued,
2309 le32_to_cpu(grant->truncate_seq),
2310 le64_to_cpu(grant->truncate_size), size);
2311 ceph_decode_timespec(&mtime, &grant->mtime);
2312 ceph_decode_timespec(&atime, &grant->atime);
2313 ceph_decode_timespec(&ctime, &grant->ctime);
2314 ceph_fill_file_time(inode, issued,
2315 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2316 &atime);
2317
2318 /* max size increase? */
2319 if (max_size != ci->i_max_size) {
2320 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2321 ci->i_max_size = max_size;
2322 if (max_size >= ci->i_wanted_max_size) {
2323 ci->i_wanted_max_size = 0; /* reset */
2324 ci->i_requested_max_size = 0;
2325 }
2326 wake = 1;
2327 }
2328
2329 /* check cap bits */
2330 wanted = __ceph_caps_wanted(ci);
2331 used = __ceph_caps_used(ci);
2332 dirty = __ceph_caps_dirty(ci);
2333 dout(" my wanted = %s, used = %s, dirty %s\n",
2334 ceph_cap_string(wanted),
2335 ceph_cap_string(used),
2336 ceph_cap_string(dirty));
2337 if (wanted != le32_to_cpu(grant->wanted)) {
2338 dout("mds wanted %s -> %s\n",
2339 ceph_cap_string(le32_to_cpu(grant->wanted)),
2340 ceph_cap_string(wanted));
2341 grant->wanted = cpu_to_le32(wanted);
2342 }
2343
2344 cap->seq = seq;
2345
2346 /* file layout may have changed */
2347 ci->i_layout = grant->layout;
2348
2349 /* revocation, grant, or no-op? */
2350 if (cap->issued & ~newcaps) {
2351 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
2352 ceph_cap_string(newcaps));
2353 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
2354 writeback = 1; /* will delay ack */
2355 else if (dirty & ~newcaps)
2356 check_caps = 1; /* initiate writeback in check_caps */
2357 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
2358 revoked_rdcache)
2359 check_caps = 2; /* send revoke ack in check_caps */
2360 cap->issued = newcaps;
2361 cap->implemented |= newcaps;
2362 } else if (cap->issued == newcaps) {
2363 dout("caps unchanged: %s -> %s\n",
2364 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2365 } else {
2366 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2367 ceph_cap_string(newcaps));
2368 cap->issued = newcaps;
2369 cap->implemented |= newcaps; /* add bits only, to
2370 * avoid stepping on a
2371 * pending revocation */
2372 wake = 1;
2373 }
2374 BUG_ON(cap->issued & ~cap->implemented);
2375
2376 spin_unlock(&inode->i_lock);
2377 if (writeback)
2378 /*
2379 * queue inode for writeback: we can't actually call
2380 * filemap_write_and_wait, etc. from message handler
2381 * context.
2382 */
2383 ceph_queue_writeback(inode);
2384 if (queue_invalidate)
2385 ceph_queue_invalidate(inode);
2386 if (wake)
2387 wake_up(&ci->i_cap_wq);
2388
2389 if (check_caps == 1)
2390 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2391 session);
2392 else if (check_caps == 2)
2393 ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
2394 else
2395 mutex_unlock(&session->s_mutex);
2396}
2397
2398/*
2399 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2400 * MDS has been safely committed.
2401 */
2402static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2403 struct ceph_mds_caps *m,
2404 struct ceph_mds_session *session,
2405 struct ceph_cap *cap)
2406 __releases(inode->i_lock)
2407{
2408 struct ceph_inode_info *ci = ceph_inode(inode);
2409 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
2410 unsigned seq = le32_to_cpu(m->seq);
2411 int dirty = le32_to_cpu(m->dirty);
2412 int cleaned = 0;
2413 int drop = 0;
2414 int i;
2415
2416 for (i = 0; i < CEPH_CAP_BITS; i++)
2417 if ((dirty & (1 << i)) &&
2418 flush_tid == ci->i_cap_flush_tid[i])
2419 cleaned |= 1 << i;
2420
2421 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2422 " flushing %s -> %s\n",
2423 inode, session->s_mds, seq, ceph_cap_string(dirty),
2424 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2425 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2426
2427 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2428 goto out;
2429
2430 ci->i_flushing_caps &= ~cleaned;
2431
2432 spin_lock(&mdsc->cap_dirty_lock);
2433 if (ci->i_flushing_caps == 0) {
2434 list_del_init(&ci->i_flushing_item);
2435 if (!list_empty(&session->s_cap_flushing))
2436 dout(" mds%d still flushing cap on %p\n",
2437 session->s_mds,
2438 &list_entry(session->s_cap_flushing.next,
2439 struct ceph_inode_info,
2440 i_flushing_item)->vfs_inode);
2441 mdsc->num_cap_flushing--;
2442 wake_up(&mdsc->cap_flushing_wq);
2443 dout(" inode %p now !flushing\n", inode);
2444
2445 if (ci->i_dirty_caps == 0) {
2446 dout(" inode %p now clean\n", inode);
2447 BUG_ON(!list_empty(&ci->i_dirty_item));
2448 drop = 1;
2449 } else {
2450 BUG_ON(list_empty(&ci->i_dirty_item));
2451 }
2452 }
2453 spin_unlock(&mdsc->cap_dirty_lock);
2454 wake_up(&ci->i_cap_wq);
2455
2456out:
2457 spin_unlock(&inode->i_lock);
2458 if (drop)
2459 iput(inode);
2460}
2461
2462/*
2463 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2464 * throw away our cap_snap.
2465 *
2466 * Caller holds s_mutex.
2467 */
2468static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2469 struct ceph_mds_caps *m,
2470 struct ceph_mds_session *session)
2471{
2472 struct ceph_inode_info *ci = ceph_inode(inode);
2473 u64 follows = le64_to_cpu(m->snap_follows);
2474 struct ceph_cap_snap *capsnap;
2475 int drop = 0;
2476
2477 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2478 inode, ci, session->s_mds, follows);
2479
2480 spin_lock(&inode->i_lock);
2481 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2482 if (capsnap->follows == follows) {
2483 if (capsnap->flush_tid != flush_tid) {
2484 dout(" cap_snap %p follows %lld tid %lld !="
2485 " %lld\n", capsnap, follows,
2486 flush_tid, capsnap->flush_tid);
2487 break;
2488 }
2489 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2490 dout(" removing %p cap_snap %p follows %lld\n",
2491 inode, capsnap, follows);
2492 ceph_put_snap_context(capsnap->context);
2493 list_del(&capsnap->ci_item);
2494 list_del(&capsnap->flushing_item);
2495 ceph_put_cap_snap(capsnap);
2496 drop = 1;
2497 break;
2498 } else {
2499 dout(" skipping cap_snap %p follows %lld\n",
2500 capsnap, capsnap->follows);
2501 }
2502 }
2503 spin_unlock(&inode->i_lock);
2504 if (drop)
2505 iput(inode);
2506}
2507
2508/*
2509 * Handle TRUNC from MDS, indicating file truncation.
2510 *
2511 * caller holds s_mutex.
2512 */
2513static void handle_cap_trunc(struct inode *inode,
2514 struct ceph_mds_caps *trunc,
2515 struct ceph_mds_session *session)
2516 __releases(inode->i_lock)
2517{
2518 struct ceph_inode_info *ci = ceph_inode(inode);
2519 int mds = session->s_mds;
2520 int seq = le32_to_cpu(trunc->seq);
2521 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2522 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2523 u64 size = le64_to_cpu(trunc->size);
2524 int implemented = 0;
2525 int dirty = __ceph_caps_dirty(ci);
2526 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2527 int queue_trunc = 0;
2528
2529 issued |= implemented | dirty;
2530
2531 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2532 inode, mds, seq, truncate_size, truncate_seq);
2533 queue_trunc = ceph_fill_file_size(inode, issued,
2534 truncate_seq, truncate_size, size);
2535 spin_unlock(&inode->i_lock);
2536
2537 if (queue_trunc)
2538 ceph_queue_vmtruncate(inode);
2539}
2540
2541/*
2542 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2543 * different one. If we are the most recent migration we've seen (as
2544 * indicated by mseq), make note of the migrating cap bits for the
2545 * duration (until we see the corresponding IMPORT).
2546 *
2547 * caller holds s_mutex
2548 */
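/*
 * For example: when mds0 hands a cap off to mds1, we first see EXPORT
 * from mds0 and stash the issued bits in i_cap_exporting_{mds,mseq,
 * issued}; the later IMPORT from mds1 (carrying a higher mseq) re-adds
 * the cap via ceph_add_cap() and clears that note (see
 * handle_cap_import below).
 */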
2549static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2550 struct ceph_mds_session *session)
2551{
2552 struct ceph_inode_info *ci = ceph_inode(inode);
2553 int mds = session->s_mds;
2554 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2555 struct ceph_cap *cap = NULL, *t;
2556 struct rb_node *p;
2557 int remember = 1;
2558
2559 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2560 inode, ci, mds, mseq);
2561
2562 spin_lock(&inode->i_lock);
2563
2564 /* make sure we haven't seen a higher mseq */
2565 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2566 t = rb_entry(p, struct ceph_cap, ci_node);
2567 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2568 dout(" higher mseq on cap from mds%d\n",
2569 t->session->s_mds);
2570 remember = 0;
2571 }
2572 if (t->session->s_mds == mds)
2573 cap = t;
2574 }
2575
2576 if (cap) {
2577 if (remember) {
2578 /* make note */
2579 ci->i_cap_exporting_mds = mds;
2580 ci->i_cap_exporting_mseq = mseq;
2581 ci->i_cap_exporting_issued = cap->issued;
2582 }
2583 __ceph_remove_cap(cap);
2584 }
2585 /* else, we already released it */
2586
2587 spin_unlock(&inode->i_lock);
2588}
2589
2590/*
2591 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2592 * clean them up.
2593 *
2594 * caller holds s_mutex.
2595 */
2596static void handle_cap_import(struct ceph_mds_client *mdsc,
2597 struct inode *inode, struct ceph_mds_caps *im,
2598 struct ceph_mds_session *session,
2599 void *snaptrace, int snaptrace_len)
2600{
2601 struct ceph_inode_info *ci = ceph_inode(inode);
2602 int mds = session->s_mds;
2603 unsigned issued = le32_to_cpu(im->caps);
2604 unsigned wanted = le32_to_cpu(im->wanted);
2605 unsigned seq = le32_to_cpu(im->seq);
2606 unsigned mseq = le32_to_cpu(im->migrate_seq);
2607 u64 realmino = le64_to_cpu(im->realm);
2608 u64 cap_id = le64_to_cpu(im->cap_id);
2609
2610 if (ci->i_cap_exporting_mds >= 0 &&
2611 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2612 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2613 " - cleared exporting from mds%d\n",
2614 inode, ci, mds, mseq,
2615 ci->i_cap_exporting_mds);
2616 ci->i_cap_exporting_issued = 0;
2617 ci->i_cap_exporting_mseq = 0;
2618 ci->i_cap_exporting_mds = -1;
2619 } else {
2620 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2621 inode, ci, mds, mseq);
2622 }
2623
2624 down_write(&mdsc->snap_rwsem);
2625 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2626 false);
2627 downgrade_write(&mdsc->snap_rwsem);
2628 ceph_add_cap(inode, session, cap_id, -1,
2629 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2630 NULL /* no caps context */);
2631 try_flush_caps(inode, session, NULL);
2632 up_read(&mdsc->snap_rwsem);
2633}
2634
2635/*
2636 * Handle a caps message from the MDS.
2637 *
2638 * Identify the appropriate session, inode, and call the right handler
2639 * based on the cap op.
2640 */
2641void ceph_handle_caps(struct ceph_mds_session *session,
2642 struct ceph_msg *msg)
2643{
2644 struct ceph_mds_client *mdsc = session->s_mdsc;
2645 struct super_block *sb = mdsc->client->sb;
2646 struct inode *inode;
2647 struct ceph_cap *cap;
2648 struct ceph_mds_caps *h;
2649 int mds = session->s_mds;
2650 int op;
2651 u32 seq;
2652 struct ceph_vino vino;
2653 u64 cap_id;
2654 u64 size, max_size;
2655 u64 tid;
2656 void *snaptrace;
2657
2658 dout("handle_caps from mds%d\n", mds);
2659
2660 /* decode */
2661 tid = le64_to_cpu(msg->hdr.tid);
2662 if (msg->front.iov_len < sizeof(*h))
2663 goto bad;
2664 h = msg->front.iov_base;
2665 snaptrace = h + 1;
2666 op = le32_to_cpu(h->op);
2667 vino.ino = le64_to_cpu(h->ino);
2668 vino.snap = CEPH_NOSNAP;
2669 cap_id = le64_to_cpu(h->cap_id);
2670 seq = le32_to_cpu(h->seq);
2671 size = le64_to_cpu(h->size);
2672 max_size = le64_to_cpu(h->max_size);
2673
2674 mutex_lock(&session->s_mutex);
2675 session->s_seq++;
2676 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2677 (unsigned)seq);
2678
2679 /* lookup ino */
2680 inode = ceph_find_inode(sb, vino);
2681 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2682 vino.snap, inode);
2683 if (!inode) {
2684 dout(" i don't have ino %llx\n", vino.ino);
2685 goto done;
2686 }
2687
2688 /* these will work even if we don't have a cap yet */
2689 switch (op) {
2690 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2691 handle_cap_flushsnap_ack(inode, tid, h, session);
2692 goto done;
2693
2694 case CEPH_CAP_OP_EXPORT:
2695 handle_cap_export(inode, h, session);
2696 goto done;
2697
2698 case CEPH_CAP_OP_IMPORT:
2699 handle_cap_import(mdsc, inode, h, session,
2700 snaptrace, le32_to_cpu(h->snap_trace_len));
2701 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2702 session);
2703 goto done_unlocked;
2704 }
2705
2706 /* the rest require a cap */
2707 spin_lock(&inode->i_lock);
2708 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2709 if (!cap) {
2710 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
2711 inode, ceph_ino(inode), ceph_snap(inode), mds);
2712 spin_unlock(&inode->i_lock);
2713 goto done;
2714 }
2715
2716 /* note that each of these drops i_lock for us */
2717 switch (op) {
2718 case CEPH_CAP_OP_REVOKE:
2719 case CEPH_CAP_OP_GRANT:
2720 handle_cap_grant(inode, h, session, cap, msg->middle);
2721 goto done_unlocked;
2722
2723 case CEPH_CAP_OP_FLUSH_ACK:
2724 handle_cap_flush_ack(inode, tid, h, session, cap);
2725 break;
2726
2727 case CEPH_CAP_OP_TRUNC:
2728 handle_cap_trunc(inode, h, session);
2729 break;
2730
2731 default:
2732 spin_unlock(&inode->i_lock);
2733 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2734 ceph_cap_op_name(op));
2735 }
2736
2737done:
2738 mutex_unlock(&session->s_mutex);
2739done_unlocked:
2740 if (inode)
2741 iput(inode);
2742 return;
2743
2744bad:
2745 pr_err("ceph_handle_caps: corrupt message\n");
2746 ceph_msg_dump(msg);
2747 return;
2748}
2749
2750/*
2751 * Delayed work handler to process end of delayed cap release LRU list.
2752 */
2753void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2754{
2755 struct ceph_inode_info *ci;
2756 int flags = CHECK_CAPS_NODELAY;
2757
2758 dout("check_delayed_caps\n");
2759 while (1) {
2760 spin_lock(&mdsc->cap_delay_lock);
2761 if (list_empty(&mdsc->cap_delay_list))
2762 break;
2763 ci = list_first_entry(&mdsc->cap_delay_list,
2764 struct ceph_inode_info,
2765 i_cap_delay_list);
2766 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2767 time_before(jiffies, ci->i_hold_caps_max))
2768 break;
2769 list_del_init(&ci->i_cap_delay_list);
2770 spin_unlock(&mdsc->cap_delay_lock);
2771 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2772 ceph_check_caps(ci, flags, NULL);
2773 }
2774 spin_unlock(&mdsc->cap_delay_lock);
2775}
2776
2777/*
2778 * Flush all dirty caps to the mds
2779 */
2780void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2781{
2782 struct ceph_inode_info *ci, *nci = NULL;
2783 struct inode *inode, *ninode = NULL;
2784 struct list_head *p, *n;
2785
2786 dout("flush_dirty_caps\n");
2787 spin_lock(&mdsc->cap_dirty_lock);
2788 list_for_each_safe(p, n, &mdsc->cap_dirty) {
2789 if (nci) {
2790 ci = nci;
2791 inode = ninode;
2792 ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
2793 dout("flush_dirty_caps inode %p (was next inode)\n",
2794 inode);
2795 } else {
2796 ci = list_entry(p, struct ceph_inode_info,
2797 i_dirty_item);
2798 inode = igrab(&ci->vfs_inode);
2799 BUG_ON(!inode);
2800 dout("flush_dirty_caps inode %p\n", inode);
2801 }
2802 if (n != &mdsc->cap_dirty) {
2803 nci = list_entry(n, struct ceph_inode_info,
2804 i_dirty_item);
2805 ninode = igrab(&nci->vfs_inode);
2806 BUG_ON(!ninode);
2807 nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2808 dout("flush_dirty_caps next inode %p, noflush\n",
2809 ninode);
2810 } else {
2811 nci = NULL;
2812 ninode = NULL;
2813 }
2814 spin_unlock(&mdsc->cap_dirty_lock);
2815 if (inode) {
2816 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
2817 NULL);
2818 iput(inode);
2819 }
2820 spin_lock(&mdsc->cap_dirty_lock);
2821 }
2822 spin_unlock(&mdsc->cap_dirty_lock);
2823}
2824
2825/*
2826 * Drop open file reference. If we were the last open file,
2827 * we may need to release capabilities to the MDS (or schedule
2828 * their delayed release).
2829 */
2830void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2831{
2832 struct inode *inode = &ci->vfs_inode;
2833 int last = 0;
2834
2835 spin_lock(&inode->i_lock);
2836 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2837 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2838 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2839 if (--ci->i_nr_by_mode[fmode] == 0)
2840 last++;
2841 spin_unlock(&inode->i_lock);
2842
2843 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2844 ceph_check_caps(ci, 0, NULL);
2845}
2846
2847/*
2848 * Helpers for embedding cap and dentry lease releases into mds
2849 * requests.
2850 *
2851 * @force is used by dentry_release (below) to force inclusion of a
2852 * record for the directory inode, even when there aren't any caps to
2853 * drop.
2854 */
2855int ceph_encode_inode_release(void **p, struct inode *inode,
2856 int mds, int drop, int unless, int force)
2857{
2858 struct ceph_inode_info *ci = ceph_inode(inode);
2859 struct ceph_cap *cap;
2860 struct ceph_mds_request_release *rel = *p;
2861 int ret = 0;
2862 int used = 0;
2863
2864 spin_lock(&inode->i_lock);
2865 used = __ceph_caps_used(ci);
2866
2867 dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
2868 mds, ceph_cap_string(used), ceph_cap_string(drop),
2869 ceph_cap_string(unless));
2870
2871 /* only drop unused caps */
2872 drop &= ~used;
2873
2874 cap = __get_cap_for_mds(ci, mds);
2875 if (cap && __cap_is_valid(cap)) {
2876 if (force ||
2877 ((cap->issued & drop) &&
2878 (cap->issued & unless) == 0)) {
2879 if ((cap->issued & drop) &&
2880 (cap->issued & unless) == 0) {
2881 dout("encode_inode_release %p cap %p %s -> "
2882 "%s\n", inode, cap,
2883 ceph_cap_string(cap->issued),
2884 ceph_cap_string(cap->issued & ~drop));
2885 cap->issued &= ~drop;
2886 cap->implemented &= ~drop;
2887 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2888 int wanted = __ceph_caps_wanted(ci);
2889 dout(" wanted %s -> %s (act %s)\n",
2890 ceph_cap_string(cap->mds_wanted),
2891 ceph_cap_string(cap->mds_wanted &
2892 ~wanted),
2893 ceph_cap_string(wanted));
2894 cap->mds_wanted &= wanted;
2895 }
2896 } else {
2897 dout("encode_inode_release %p cap %p %s"
2898 " (force)\n", inode, cap,
2899 ceph_cap_string(cap->issued));
2900 }
2901
2902 rel->ino = cpu_to_le64(ceph_ino(inode));
2903 rel->cap_id = cpu_to_le64(cap->cap_id);
2904 rel->seq = cpu_to_le32(cap->seq);
2905 rel->issue_seq = cpu_to_le32(cap->issue_seq);
2906 rel->mseq = cpu_to_le32(cap->mseq);
2907 rel->caps = cpu_to_le32(cap->issued);
2908 rel->wanted = cpu_to_le32(cap->mds_wanted);
2909 rel->dname_len = 0;
2910 rel->dname_seq = 0;
2911 *p += sizeof(*rel);
2912 ret = 1;
2913 } else {
2914 dout("encode_inode_release %p cap %p %s\n",
2915 inode, cap, ceph_cap_string(cap->issued));
2916 }
2917 }
2918 spin_unlock(&inode->i_lock);
2919 return ret;
2920}
2921
2922int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2923 int mds, int drop, int unless)
2924{
2925 struct inode *dir = dentry->d_parent->d_inode;
2926 struct ceph_mds_request_release *rel = *p;
2927 struct ceph_dentry_info *di = ceph_dentry(dentry);
2928 int force = 0;
2929 int ret;
2930
2931 /*
2932 * force a record for the directory caps if we have a dentry lease.
2933 * this is racy (can't take i_lock and d_lock together), but it
2934 * doesn't have to be perfect; the mds will revoke anything we don't
2935 * release.
2936 */
2937 spin_lock(&dentry->d_lock);
2938 if (di->lease_session && di->lease_session->s_mds == mds)
2939 force = 1;
2940 spin_unlock(&dentry->d_lock);
2941
2942 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
2943
2944 spin_lock(&dentry->d_lock);
2945 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
2946 dout("encode_dentry_release %p mds%d seq %d\n",
2947 dentry, mds, (int)di->lease_seq);
2948 rel->dname_len = cpu_to_le32(dentry->d_name.len);
2949 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2950 *p += dentry->d_name.len;
2951 rel->dname_seq = cpu_to_le32(di->lease_seq);
2952 }
2953 spin_unlock(&dentry->d_lock);
2954 return ret;
2955}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
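/*
 * e.g. dout("mds%d\n", mds) called from caps.c line 1234 would come
 * out roughly as "ceph:        caps.c:1234 : mds0" (spacing per the
 * format string below; illustrative only).
 */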
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
1/*
2 * Ceph 'frag' type
3 */
4#include "types.h"
5
6int ceph_frag_compare(__u32 a, __u32 b)
7{
8 unsigned va = ceph_frag_value(a);
9 unsigned vb = ceph_frag_value(b);
10 if (va < vb)
11 return -1;
12 if (va > vb)
13 return 1;
14 va = ceph_frag_bits(a);
15 vb = ceph_frag_bits(b);
16 if (va < vb)
17 return -1;
18 if (va > vb)
19 return 1;
20 return 0;
21}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef _FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
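A self-contained user-space sketch of the encoding described at the top of this header: one split of the whole 24-bit space yields a left frag covering values with the top bit clear and a right frag covering values with the top bit set (uint32_t stands in for __u32 here):

#include <stdio.h>
#include <stdint.h>

static uint32_t frag_make(uint32_t b, uint32_t v)
{
	return (b << 24) | (v & (0xffffffu << (24 - b)) & 0xffffffu);
}
static uint32_t frag_bits(uint32_t f)  { return f >> 24; }
static uint32_t frag_value(uint32_t f) { return f & 0xffffffu; }
static uint32_t frag_mask(uint32_t f)
{
	return (0xffffffu << (24 - frag_bits(f))) & 0xffffffu;
}
static int frag_contains_value(uint32_t f, uint32_t v)
{
	return (v & frag_mask(f)) == frag_value(f);
}

int main(void)
{
	uint32_t left  = frag_make(1, 0x000000);   /* low half of the space  */
	uint32_t right = frag_make(1, 0x800000);   /* high half of the space */

	printf("left=%08x right=%08x\n", left, right);   /* 01000000 01800000 */
	printf("%d %d\n",
	       frag_contains_value(left, 0x123456),      /* 1 */
	       frag_contains_value(right, 0x123456));    /* 0 */
	return 0;
}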
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
28
29
30int ceph_flags_to_mode(int flags)
31{
32#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN;
35#endif
36#ifdef O_LAZY
37 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY;
39#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42
43 flags &= O_ACCMODE;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49}
50
51int ceph_caps_for_mode(int mode)
52{
53 switch (mode) {
54 case CEPH_FILE_MODE_PIN:
55 return CEPH_CAP_PIN;
56 case CEPH_FILE_MODE_RD:
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR:
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 }
73 return 0;
74}
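A self-contained sketch of the flag-to-mode mapping implemented by ceph_flags_to_mode() above (omitting the O_DIRECTORY/O_LAZY special cases); the mode values mirror the CEPH_FILE_MODE_* constants declared in ceph_fs.h:

#include <fcntl.h>
#include <stdio.h>

enum { MODE_PIN = 0, MODE_RD = 1, MODE_WR = 2, MODE_RDWR = 3 };

static int flags_to_mode(int flags)
{
	if ((flags & O_APPEND) == O_APPEND)
		flags |= O_WRONLY;               /* append implies write intent */
	flags &= O_ACCMODE;
	if ((flags & O_RDWR) == O_RDWR)
		return MODE_RDWR;
	if ((flags & O_WRONLY) == O_WRONLY)
		return MODE_WR;
	return MODE_RD;
}

int main(void)
{
	printf("%d %d %d\n",
	       flags_to_mode(O_RDONLY),              /* 1 (RD)   */
	       flags_to_mode(O_WRONLY | O_APPEND),   /* 2 (WR)   */
	       flags_to_mode(O_RDWR));               /* 3 (RDWR) */
	return 0;
}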
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..0c2241ef3653
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific message types or high-level
34 * protocols change, bump the affected components. we rev
35 * internal cluster protocols separately from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_SUPPORTED 0
57#define CEPH_FEATURE_REQUIRED 0
58
59
60/*
61 * ceph_file_layout - describe data layout for a file/inode
62 */
63struct ceph_file_layout {
64 /* file -> object mapping */
65 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
66 of page size. */
67 __le32 fl_stripe_count; /* over this many objects */
68 __le32 fl_object_size; /* until objects are this big, then move to
69 new objects */
70 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
71
72 /* pg -> disk layout */
73 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
74
75 /* object -> pg layout */
76 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
77 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
78} __attribute__ ((packed));
79
80#define CEPH_MIN_STRIPE_UNIT 65536
81
82int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
83
84
85/* crypto algorithms */
86#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1
88
89/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2
93
94
95/*********************************************
96 * message layer
97 */
98
99/*
100 * message types
101 */
102
103/* misc */
104#define CEPH_MSG_SHUTDOWN 1
105#define CEPH_MSG_PING 2
106
107/* client <-> monitor */
108#define CEPH_MSG_MON_MAP 4
109#define CEPH_MSG_MON_GET_MAP 5
110#define CEPH_MSG_STATFS 13
111#define CEPH_MSG_STATFS_REPLY 14
112#define CEPH_MSG_MON_SUBSCRIBE 15
113#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
114#define CEPH_MSG_AUTH 17
115#define CEPH_MSG_AUTH_REPLY 18
116
117/* client <-> mds */
118#define CEPH_MSG_MDS_MAP 21
119
120#define CEPH_MSG_CLIENT_SESSION 22
121#define CEPH_MSG_CLIENT_RECONNECT 23
122
123#define CEPH_MSG_CLIENT_REQUEST 24
124#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
125#define CEPH_MSG_CLIENT_REPLY 26
126#define CEPH_MSG_CLIENT_CAPS 0x310
127#define CEPH_MSG_CLIENT_LEASE 0x311
128#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130
131/* osd */
132#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43
135
136struct ceph_mon_request_header {
137 __le64 have_version;
138 __le16 session_mon;
139 __le64 session_mon_tid;
140} __attribute__ ((packed));
141
142struct ceph_mon_statfs {
143 struct ceph_mon_request_header monhdr;
144 struct ceph_fsid fsid;
145} __attribute__ ((packed));
146
147struct ceph_statfs {
148 __le64 kb, kb_used, kb_avail;
149 __le64 num_objects;
150} __attribute__ ((packed));
151
152struct ceph_mon_statfs_reply {
153 struct ceph_fsid fsid;
154 __le64 version;
155 struct ceph_statfs st;
156} __attribute__ ((packed));
157
158struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid;
161 __le32 start;
162} __attribute__ ((packed));
163
164struct ceph_mds_getmap {
165 struct ceph_mon_request_header monhdr;
166 struct ceph_fsid fsid;
167} __attribute__ ((packed));
168
169struct ceph_client_mount {
170 struct ceph_mon_request_header monhdr;
171} __attribute__ ((packed));
172
173struct ceph_mon_subscribe_item {
174 __le64 have_version; __le64 have;
175 __u8 onetime;
176} __attribute__ ((packed));
177
178struct ceph_mon_subscribe_ack {
179 __le32 duration; /* seconds */
180 struct ceph_fsid fsid;
181} __attribute__ ((packed));
182
183/*
184 * mds states
185 * > 0 -> in
186 * <= 0 -> out
187 */
188#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
189#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
190 empty log. */
191#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
192#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
193#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
194#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
195#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
196
197#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
198#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
199 operations (import, rename, etc.) */
200#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
201#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
202#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
203#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
204#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
205
206extern const char *ceph_mds_state_name(int s);
207
208
209/*
210 * metadata lock types.
211 * - these are bitmasks.. we can compose them
212 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds
214 */
215#define CEPH_LOCK_DN 1
216#define CEPH_LOCK_ISNAP 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */
218#define CEPH_LOCK_IFILE 8 /* mds internal */
219#define CEPH_LOCK_IAUTH 32
220#define CEPH_LOCK_ILINK 64
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */
222#define CEPH_LOCK_INEST 256 /* mds internal */
223#define CEPH_LOCK_IXATTR 512
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
225
226/* client_session ops */
227enum {
228 CEPH_SESSION_REQUEST_OPEN,
229 CEPH_SESSION_OPEN,
230 CEPH_SESSION_REQUEST_CLOSE,
231 CEPH_SESSION_CLOSE,
232 CEPH_SESSION_REQUEST_RENEWCAPS,
233 CEPH_SESSION_RENEWCAPS,
234 CEPH_SESSION_STALE,
235 CEPH_SESSION_RECALL_STATE,
236};
237
238extern const char *ceph_session_op_name(int op);
239
240struct ceph_mds_session_head {
241 __le32 op;
242 __le64 seq;
243 struct ceph_timespec stamp;
244 __le32 max_caps, max_leases;
245} __attribute__ ((packed));
246
247/* client_request */
248/*
249 * metadata ops.
250 * & 0x001000 -> write op
251 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
252 * & 0x100000 -> use weird ino/path trace
253 */
254#define CEPH_MDS_OP_WRITE 0x001000
255enum {
256 CEPH_MDS_OP_LOOKUP = 0x00100,
257 CEPH_MDS_OP_GETATTR = 0x00101,
258 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
259 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
260
261 CEPH_MDS_OP_SETXATTR = 0x01105,
262 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108,
265
266 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202,
268 CEPH_MDS_OP_UNLINK = 0x01203,
269 CEPH_MDS_OP_RENAME = 0x01204,
270 CEPH_MDS_OP_MKDIR = 0x01220,
271 CEPH_MDS_OP_RMDIR = 0x01221,
272 CEPH_MDS_OP_SYMLINK = 0x01222,
273
274 CEPH_MDS_OP_CREATE = 0x01301,
275 CEPH_MDS_OP_OPEN = 0x00302,
276 CEPH_MDS_OP_READDIR = 0x00305,
277
278 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
279 CEPH_MDS_OP_MKSNAP = 0x01400,
280 CEPH_MDS_OP_RMSNAP = 0x01401,
281 CEPH_MDS_OP_LSSNAP = 0x00402,
282};
283
284extern const char *ceph_mds_op_name(int op);
285
286
287#define CEPH_SETATTR_MODE 1
288#define CEPH_SETATTR_UID 2
289#define CEPH_SETATTR_GID 4
290#define CEPH_SETATTR_MTIME 8
291#define CEPH_SETATTR_ATIME 16
292#define CEPH_SETATTR_SIZE 32
293#define CEPH_SETATTR_CTIME 64
294
295union ceph_mds_request_args {
296 struct {
297 __le32 mask; /* CEPH_CAP_* */
298 } __attribute__ ((packed)) getattr;
299 struct {
300 __le32 mode;
301 __le32 uid;
302 __le32 gid;
303 struct ceph_timespec mtime;
304 struct ceph_timespec atime;
305 __le64 size, old_size; /* old_size needed by truncate */
306 __le32 mask; /* CEPH_SETATTR_* */
307 } __attribute__ ((packed)) setattr;
308 struct {
309 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */
311 } __attribute__ ((packed)) readdir;
312 struct {
313 __le32 mode;
314 __le32 rdev;
315 } __attribute__ ((packed)) mknod;
316 struct {
317 __le32 mode;
318 } __attribute__ ((packed)) mkdir;
319 struct {
320 __le32 flags;
321 __le32 mode;
322 __le32 stripe_unit; /* layout for newly created file */
323 __le32 stripe_count; /* ... */
324 __le32 object_size;
325 __le32 file_replication;
326 __le32 preferred;
327 } __attribute__ ((packed)) open;
328 struct {
329 __le32 flags;
330 } __attribute__ ((packed)) setxattr;
331 struct {
332 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout;
334} __attribute__ ((packed));
335
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
337#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
338
339struct ceph_mds_request_head {
340 __le64 oldest_client_tid;
341 __le32 mdsmap_epoch; /* on client */
342 __le32 flags; /* CEPH_MDS_FLAG_* */
343 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
344 __le16 num_releases; /* # include cap/lease release records */
345 __le32 op; /* mds op code */
346 __le32 caller_uid, caller_gid;
347 __le64 ino; /* use this ino for openc, mkdir, mknod,
348 etc. (if replaying) */
349 union ceph_mds_request_args args;
350} __attribute__ ((packed));
351
352/* cap/lease release record */
353struct ceph_mds_request_release {
354 __le64 ino, cap_id; /* ino and unique cap id */
355 __le32 caps, wanted; /* new issued, wanted */
356 __le32 seq, issue_seq, mseq;
357 __le32 dname_seq; /* if releasing a dentry lease, a */
358 __le32 dname_len; /* string follows. */
359} __attribute__ ((packed));
360
361/* client reply */
362struct ceph_mds_reply_head {
363 __le32 op;
364 __le32 result;
365 __le32 mdsmap_epoch;
366 __u8 safe; /* true if committed to disk */
367 __u8 is_dentry, is_target; /* true if dentry, target inode records
368 are included with reply */
369} __attribute__ ((packed));
370
371/* one for each node split */
372struct ceph_frag_tree_split {
373 __le32 frag; /* this frag splits... */
374 __le32 by; /* ...by this many bits */
375} __attribute__ ((packed));
376
377struct ceph_frag_tree_head {
378 __le32 nsplits; /* num ceph_frag_tree_split records */
379 struct ceph_frag_tree_split splits[];
380} __attribute__ ((packed));
381
382/* capability issue, for bundling with mds reply */
383struct ceph_mds_reply_cap {
384 __le32 caps, wanted; /* caps issued, wanted */
385 __le64 cap_id;
386 __le32 seq, mseq;
387 __le64 realm; /* snap realm */
388 __u8 flags; /* CEPH_CAP_FLAG_* */
389} __attribute__ ((packed));
390
391#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
392
393/* inode record, for bundling with mds reply */
394struct ceph_mds_reply_inode {
395 __le64 ino;
396 __le64 snapid;
397 __le32 rdev;
398 __le64 version; /* inode version */
399 __le64 xattr_version; /* version for xattr blob */
400 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
401 struct ceph_file_layout layout;
402 struct ceph_timespec ctime, mtime, atime;
403 __le32 time_warp_seq;
404 __le64 size, max_size, truncate_size;
405 __le32 truncate_seq;
406 __le32 mode, uid, gid;
407 __le32 nlink;
408 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
409 struct ceph_timespec rctime;
410 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
411} __attribute__ ((packed));
412/* followed by frag array, then symlink string, then xattr blob */
413
414/* reply_lease follows dname, and reply_inode */
415struct ceph_mds_reply_lease {
416 __le16 mask; /* lease type(s) */
417 __le32 duration_ms; /* lease duration */
418 __le32 seq;
419} __attribute__ ((packed));
420
421struct ceph_mds_reply_dirfrag {
422 __le32 frag; /* fragment */
423 __le32 auth; /* auth mds, if this is a delegation point */
424 __le32 ndist; /* number of mds' this is replicated on */
425 __le32 dist[];
426} __attribute__ ((packed));
427
428/* file access modes */
429#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1
431#define CEPH_FILE_MODE_WR 2
432#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
433#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
434#define CEPH_FILE_MODE_NUM 8 /* because these are bit fields.. mostly */
435
436int ceph_flags_to_mode(int flags);
437
438
439/* capability bits */
440#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
441
442/* generic cap bits */
443#define CEPH_CAP_GSHARED 1 /* client can read */
444#define CEPH_CAP_GEXCL 2 /* client can read and update */
445#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
446#define CEPH_CAP_GRD 8 /* (file) client can read */
447#define CEPH_CAP_GWR 16 /* (file) client can write */
448#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
449#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
450#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
451
452/* per-lock shift */
453#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
457
458#define CEPH_CAP_BITS 16
459
460/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
462#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
463#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
464#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
465#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
466#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
467#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
468#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
469#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
470#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
471#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
472#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
476
477/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
479#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
480#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
481#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
482#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
483#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
484#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
485#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
486#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
487#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
488#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
489#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
490#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
491 CEPH_CAP_AUTH_SHARED | \
492 CEPH_CAP_LINK_SHARED | \
493 CEPH_CAP_FILE_SHARED | \
494 CEPH_CAP_XATTR_SHARED)
495
496#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
497 CEPH_CAP_LINK_SHARED | \
498 CEPH_CAP_XATTR_SHARED | \
499 CEPH_CAP_FILE_SHARED)
500#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
501 CEPH_CAP_FILE_CACHE)
502
503#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
504 CEPH_CAP_LINK_EXCL | \
505 CEPH_CAP_XATTR_EXCL | \
506 CEPH_CAP_FILE_EXCL)
507#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
508 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
512
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR)
515
516int ceph_caps_for_mode(int mode);
517
518enum {
519 CEPH_CAP_OP_GRANT, /* mds->client grant */
520 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
521 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
522 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
523 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
524 CEPH_CAP_OP_UPDATE, /* client->mds update */
525 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
526 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
527 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
528 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
529 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
530 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
531 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
532};
533
534extern const char *ceph_cap_op_name(int op);
535
536/*
537 * caps message, used for capability callbacks, acks, requests, etc.
538 */
539struct ceph_mds_caps {
540 __le32 op; /* CEPH_CAP_OP_* */
541 __le64 ino, realm;
542 __le64 cap_id;
543 __le32 seq, issue_seq;
544 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
545 __le32 migrate_seq;
546 __le64 snap_follows;
547 __le32 snap_trace_len;
548
549 /* authlock */
550 __le32 uid, gid, mode;
551
552 /* linklock */
553 __le32 nlink;
554
555 /* xattrlock */
556 __le32 xattr_len;
557 __le64 xattr_version;
558
559 /* filelock */
560 __le64 size, max_size, truncate_size;
561 __le32 truncate_seq;
562 struct ceph_timespec mtime, atime, ctime;
563 struct ceph_file_layout layout;
564 __le32 time_warp_seq;
565} __attribute__ ((packed));
566
567/* cap release msg head */
568struct ceph_mds_cap_release {
569 __le32 num; /* number of cap_items that follow */
570} __attribute__ ((packed));
571
572struct ceph_mds_cap_item {
573 __le64 ino;
574 __le64 cap_id;
575 __le32 migrate_seq, seq;
576} __attribute__ ((packed));
577
578#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
579#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
580#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
581#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
582
583extern const char *ceph_lease_op_name(int o);
584
585/* lease msg header */
586struct ceph_mds_lease {
587 __u8 action; /* CEPH_MDS_LEASE_* */
588 __le16 mask; /* which lease */
589 __le64 ino;
590 __le64 first, last; /* snap range */
591 __le32 seq;
592 __le32 duration_ms; /* duration of renewal */
593} __attribute__ ((packed));
594/* followed by a __le32+string for dname */
595
596/* client reconnect */
597struct ceph_mds_cap_reconnect {
598 __le64 cap_id;
599 __le32 wanted;
600 __le32 issued;
601 __le64 size;
602 struct ceph_timespec mtime, atime;
603 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed));
606/* followed by encoded string */
607
608struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */
610 __le64 seq; /* snap seq for this snap realm */
611 __le64 parent; /* parent realm */
612} __attribute__ ((packed));
613
614/*
615 * snaps
616 */
617enum {
618 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
619 CEPH_SNAP_OP_CREATE,
620 CEPH_SNAP_OP_DESTROY,
621 CEPH_SNAP_OP_SPLIT,
622};
623
624extern const char *ceph_snap_op_name(int o);
625
626/* snap msg header */
627struct ceph_mds_snap_head {
628 __le32 op; /* CEPH_SNAP_OP_* */
629 __le64 split; /* ino to split off, if any */
630 __le32 num_split_inos; /* # inos belonging to new child realm */
631	__le32 num_split_realms; /* # child realms under new child realm */
632 __le32 trace_len; /* size of snap trace blob */
633} __attribute__ ((packed));
634/* followed by split ino list, then split realms, then the trace blob */
635
636/*
637 * encode info about a snaprealm, as viewed by a client
638 */
639struct ceph_mds_snap_realm {
640 __le64 ino; /* ino */
641 __le64 created; /* snap: when created */
642 __le64 parent; /* ino: parent realm */
643 __le64 parent_since; /* snap: same parent since */
644 __le64 seq; /* snap: version */
645 __le32 num_snaps;
646 __le32 num_prior_parent_snaps;
647} __attribute__ ((packed));
648/* followed by my snap list, then prior parent snap list */
649
650#endif
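The composed CEPH_CAP_* values above are just the generic bits shifted into each lock's slot; a self-contained sketch showing how a few of them expand numerically (constants copied from the definitions above):

#include <stdio.h>

#define GSHARED 1
#define GEXCL   2
#define GRD     8
#define GWR     16
#define SAUTH   2
#define SFILE   8

int main(void)
{
	printf("AUTH_SHARED = 0x%x\n", GSHARED << SAUTH);  /* 0x4    */
	printf("FILE_SHARED = 0x%x\n", GSHARED << SFILE);  /* 0x100  */
	printf("FILE_EXCL   = 0x%x\n", GEXCL << SFILE);    /* 0x200  */
	printf("FILE_RD     = 0x%x\n", GRD << SFILE);      /* 0x800  */
	printf("FILE_WR     = 0x%x\n", GWR << SFILE);      /* 0x1000 */
	return 0;
}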
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
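Both hashes are deterministic, which is what matters for placement: the same string always maps to the same value. A self-contained user-space copy of the dcache-style hash above, for experimentation:

#include <stdio.h>

static unsigned str_hash_linux(const char *str, unsigned length)
{
	unsigned long hash = 0;
	unsigned char c;

	while (length--) {
		c = *str++;
		hash = (hash + (c << 4) + (c >> 4)) * 11;
	}
	return hash;
}

int main(void)
{
	/* identical input, identical result, every run */
	printf("%u\n", str_hash_linux("readme.txt", 10));
	printf("%u\n", str_hash_linux("readme.txt", 10));
	return 0;
}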
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68
69const char *ceph_mds_state_name(int s)
70{
71 switch (s) {
72 /* down and out */
73 case CEPH_MDS_STATE_DNE: return "down:dne";
74 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
75 /* up and out */
76 case CEPH_MDS_STATE_BOOT: return "up:boot";
77 case CEPH_MDS_STATE_STANDBY: return "up:standby";
78 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
79 case CEPH_MDS_STATE_CREATING: return "up:creating";
80 case CEPH_MDS_STATE_STARTING: return "up:starting";
81 /* up and in */
82 case CEPH_MDS_STATE_REPLAY: return "up:replay";
83 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
84 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
85 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
86 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
87 case CEPH_MDS_STATE_ACTIVE: return "up:active";
88 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
89 }
90 return "???";
91}
92
93const char *ceph_session_op_name(int op)
94{
95 switch (op) {
96 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
97 case CEPH_SESSION_OPEN: return "open";
98 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
99 case CEPH_SESSION_CLOSE: return "close";
100 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
101 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
102 case CEPH_SESSION_STALE: return "stale";
103 case CEPH_SESSION_RECALL_STATE: return "recall_state";
104 }
105 return "???";
106}
107
108const char *ceph_mds_op_name(int op)
109{
110 switch (op) {
111 case CEPH_MDS_OP_LOOKUP: return "lookup";
112 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
113 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
114 case CEPH_MDS_OP_GETATTR: return "getattr";
115 case CEPH_MDS_OP_SETXATTR: return "setxattr";
116 case CEPH_MDS_OP_SETATTR: return "setattr";
117 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
118 case CEPH_MDS_OP_READDIR: return "readdir";
119 case CEPH_MDS_OP_MKNOD: return "mknod";
120 case CEPH_MDS_OP_LINK: return "link";
121 case CEPH_MDS_OP_UNLINK: return "unlink";
122 case CEPH_MDS_OP_RENAME: return "rename";
123 case CEPH_MDS_OP_MKDIR: return "mkdir";
124 case CEPH_MDS_OP_RMDIR: return "rmdir";
125 case CEPH_MDS_OP_SYMLINK: return "symlink";
126 case CEPH_MDS_OP_CREATE: return "create";
127 case CEPH_MDS_OP_OPEN: return "open";
128 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
129 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
132 }
133 return "???";
134}
135
136const char *ceph_cap_op_name(int op)
137{
138 switch (op) {
139 case CEPH_CAP_OP_GRANT: return "grant";
140 case CEPH_CAP_OP_REVOKE: return "revoke";
141 case CEPH_CAP_OP_TRUNC: return "trunc";
142 case CEPH_CAP_OP_EXPORT: return "export";
143 case CEPH_CAP_OP_IMPORT: return "import";
144 case CEPH_CAP_OP_UPDATE: return "update";
145 case CEPH_CAP_OP_DROP: return "drop";
146 case CEPH_CAP_OP_FLUSH: return "flush";
147 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
148 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
149 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
150 case CEPH_CAP_OP_RELEASE: return "release";
151 case CEPH_CAP_OP_RENEW: return "renew";
152 }
153 return "???";
154}
155
156const char *ceph_lease_op_name(int o)
157{
158 switch (o) {
159 case CEPH_MDS_LEASE_REVOKE: return "revoke";
160 case CEPH_MDS_LEASE_RELEASE: return "release";
161 case CEPH_MDS_LEASE_RENEW: return "renew";
162 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
163 }
164 return "???";
165}
166
167const char *ceph_snap_op_name(int o)
168{
169 switch (o) {
170 case CEPH_SNAP_OP_UPDATE: return "update";
171 case CEPH_SNAP_OP_CREATE: return "create";
172 case CEPH_SNAP_OP_DESTROY: return "destroy";
173 case CEPH_SNAP_OP_SPLIT: return "split";
174 }
175 return "???";
176}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
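Note the index arithmetic in crush_calc_parents() above: device ids are non-negative and index device_parents directly, while bucket ids are negative and index bucket_parents via -1-id. A tiny self-contained sketch of that convention:

#include <stdio.h>

static int bucket_index(int id)
{
	return -1 - id;      /* bucket -1 -> slot 0, -2 -> slot 1, ... */
}

int main(void)
{
	printf("%d %d %d\n",
	       bucket_index(-1),    /* 0 */
	       bucket_index(-2),    /* 1 */
	       bucket_index(-10));  /* 9 */
	return 0;
}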
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers to identify the parent bucket of a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
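Because struct crush_rule ends in a variable-length step array, a rule is allocated in one chunk using crush_rule_size(). A self-contained user-space analogue of that pattern (malloc stands in for the kernel allocator; the struct layout mirrors the definitions above):

#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct rule_step { uint32_t op; int32_t arg1, arg2; };
struct rule_mask { uint8_t ruleset, type, min_size, max_size; };
struct rule {
	uint32_t len;
	struct rule_mask mask;
	struct rule_step steps[];             /* flexible array member */
};
#define rule_size(len) (sizeof(struct rule) + (len) * sizeof(struct rule_step))

static struct rule *alloc_rule(unsigned len)
{
	struct rule *r = malloc(rule_size(len));

	if (!r)
		return NULL;
	memset(r, 0, rule_size(len));
	r->len = len;                         /* steps[0..len-1] now addressable */
	return r;
}

int main(void)
{
	struct rule *r = alloc_rule(3);

	if (r) {
		r->steps[0].op = 1;           /* e.g. CRUSH_RULE_TAKE */
		free(r);
	}
	return 0;
}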
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..9ba54efb6543
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141		__u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if the device is marked "out" of the cluster
263 * (failed or fully offloaded)
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x1000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
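For the probabilistic branch above, a rough worked example (values are illustrative): a partially-reweighted item with weight[item] = 0x0800 is kept only when the masked 16-bit draw lands below 0x0800, i.e. with probability 0x0800 / 0x10000 = 1/32, so is_out() returns 1 for it roughly 97% of the time; items at or above the 0x1000 threshold and items with zero weight short-circuit on the first two checks.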
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
309
310 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */
312 ftotal = 0;
313 skip_rep = 0;
314 do {
315 retry_descent = 0;
316 in = bucket; /* initial bucket */
317
318 /* choose through intervening buckets */
319 flocal = 0;
320 do {
321 collide = 0;
322 retry_bucket = 0;
323 r = rep;
324 if (in->alg == CRUSH_BUCKET_UNIFORM) {
325 /* be careful */
326 if (firstn || numrep >= in->size)
327 /* r' = r + f_total */
328 r += ftotal;
329 else if (in->size % numrep == 0)
330 /* r'=r+(n+1)*f_local */
331 r += (numrep+1) *
332 (flocal+ftotal);
333 else
334 /* r' = r + n*f_local */
335 r += numrep * (flocal+ftotal);
336 } else {
337 if (firstn)
338 /* r' = r + f_total */
339 r += ftotal;
340 else
341 /* r' = r + n*f_local */
342 r += numrep * (flocal+ftotal);
343 }
344
345 /* bucket choose */
346 if (in->size == 0) {
347 reject = 1;
348 goto reject;
349 }
350 if (flocal >= (in->size>>1) &&
351 flocal > orig_tries)
352 item = bucket_perm_choose(in, x, r);
353 else
354 item = crush_bucket_choose(in, x, r);
355 BUG_ON(item >= map->max_devices);
356
357 /* desired type? */
358 if (item < 0)
359 itemtype = map->buckets[-1-item]->type;
360 else
361 itemtype = 0;
362 dprintk(" item %d type %d\n", item, itemtype);
363
364 /* keep going? */
365 if (itemtype != type) {
366 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item];
369 continue;
370 }
371
372 /* collision? */
373 for (i = 0; i < outpos; i++) {
374 if (out[i] == item) {
375 collide = 1;
376 break;
377 }
378 }
379
380 if (recurse_to_leaf &&
381 item < 0 &&
382 crush_choose(map, map->buckets[-1-item],
383 weight,
384 x, outpos+1, 0,
385 out2, outpos,
386 firstn, 0, NULL) <= outpos) {
387 reject = 1;
388 } else {
389 /* out? */
390 if (itemtype == 0)
391 reject = is_out(map, weight,
392 item, x);
393 else
394 reject = 0;
395 }
396
397reject:
398 if (reject || collide) {
399 ftotal++;
400 flocal++;
401
402 if (collide && flocal < 3)
403 /* retry locally a few times */
404 retry_bucket = 1;
405 else if (flocal < in->size + orig_tries)
406 /* exhaustive bucket search */
407 retry_bucket = 1;
408 else if (ftotal < 20)
409 /* then retry descent */
410 retry_descent = 1;
411 else
412 /* else give up */
413 skip_rep = 1;
414 dprintk(" reject %d collide %d "
415 "ftotal %d flocal %d\n",
416 reject, collide, ftotal,
417 flocal);
418 }
419 } while (retry_bucket);
420 } while (retry_descent);
421
422 if (skip_rep) {
423 dprintk("skip rep\n");
424 continue;
425 }
426
427 dprintk("choose got %d\n", item);
428 out[outpos] = item;
429 outpos++;
430 }
431
432 dprintk("choose returns %d\n", outpos);
433 return outpos;
434}
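As a small illustration of the r' adjustment (hypothetical values, not from the patch): with numrep = 3 and firstn set, the outer loop draws replicas with r = 0, 1, 2. If the r = 1 draw collides with an item already placed in out[], ftotal and flocal are bumped and the bucket is retried; for a non-uniform bucket in firstn mode the retry uses r' = 1 + ftotal = 2, so the replacement comes from a fresh pseudo-random draw while the already-chosen replicas keep their positions.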
435
436
437/**
438 * crush_do_rule - calculate a mapping with the given input and rule
439 * @map: the crush_map
440 * @ruleno: the rule id
441 * @x: hash input
442 * @result: pointer to result vector
443 * @result_max: maximum result size
444 * @force: force initial replica choice; -1 for none
445 */
446int crush_do_rule(struct crush_map *map,
447 int ruleno, int x, int *result, int result_max,
448 int force, __u32 *weight)
449{
450 int result_len;
451 int force_context[CRUSH_MAX_DEPTH];
452 int force_pos = -1;
453 int a[CRUSH_MAX_SET];
454 int b[CRUSH_MAX_SET];
455 int c[CRUSH_MAX_SET];
456 int recurse_to_leaf;
457 int *w;
458 int wsize = 0;
459 int *o;
460 int osize;
461 int *tmp;
462 struct crush_rule *rule;
463 int step;
464 int i, j;
465 int numrep;
466 int firstn;
467 int rc = -1;
468
469 BUG_ON(ruleno >= map->max_rules);
470
471 rule = map->rules[ruleno];
472 result_len = 0;
473 w = a;
474 o = b;
475
476 /*
477 * determine hierarchical context of force, if any. note
478 * that this may or may not correspond to the specific types
479 * referenced by the crush rule.
480 */
481 if (force >= 0) {
482 if (force >= map->max_devices ||
483 map->device_parents[force] == 0) {
484 /*dprintk("CRUSH: forcefed device dne\n");*/
485 rc = -1; /* force fed device dne */
486 goto out;
487 }
488 if (!is_out(map, weight, force, x)) {
489 while (1) {
490 force_context[++force_pos] = force;
491 if (force >= 0)
492 force = map->device_parents[force];
493 else
494 force = map->bucket_parents[-1-force];
495 if (force == 0)
496 break;
497 }
498 }
499 }
500
501 for (step = 0; step < rule->len; step++) {
502 firstn = 0;
503 switch (rule->steps[step].op) {
504 case CRUSH_RULE_TAKE:
505 w[0] = rule->steps[step].arg1;
506 if (force_pos >= 0) {
507 BUG_ON(force_context[force_pos] != w[0]);
508 force_pos--;
509 }
510 wsize = 1;
511 break;
512
513 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
514 case CRUSH_RULE_CHOOSE_FIRSTN:
515 firstn = 1;
516 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
517 case CRUSH_RULE_CHOOSE_INDEP:
518 BUG_ON(wsize == 0);
519
520 recurse_to_leaf =
521 rule->steps[step].op ==
522 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
523 rule->steps[step].op ==
524 CRUSH_RULE_CHOOSE_LEAF_INDEP;
525
526 /* reset output */
527 osize = 0;
528
529 for (i = 0; i < wsize; i++) {
530 /*
531 * see CRUSH_N, CRUSH_N_MINUS macros.
532 * basically, numrep <= 0 means relative to
533 * the provided result_max
534 */
535 numrep = rule->steps[step].arg1;
536 if (numrep <= 0) {
537 numrep += result_max;
538 if (numrep <= 0)
539 continue;
540 }
541 j = 0;
542 if (osize == 0 && force_pos >= 0) {
543 /* skip any intermediate types */
544 while (force_pos &&
545 force_context[force_pos] < 0 &&
546 rule->steps[step].arg2 !=
547 map->buckets[-1 -
548 force_context[force_pos]]->type)
549 force_pos--;
550 o[osize] = force_context[force_pos];
551 if (recurse_to_leaf)
552 c[osize] = force_context[0];
553 j++;
554 force_pos--;
555 }
556 osize += crush_choose(map,
557 map->buckets[-1-w[i]],
558 weight,
559 x, numrep,
560 rule->steps[step].arg2,
561 o+osize, j,
562 firstn,
563 recurse_to_leaf, c+osize);
564 }
565
566 if (recurse_to_leaf)
567 /* copy final _leaf_ values to output set */
568 memcpy(o, c, osize*sizeof(*o));
569
570 /* swap t and w arrays */
571 tmp = o;
572 o = w;
573 w = tmp;
574 wsize = osize;
575 break;
576
577
578 case CRUSH_RULE_EMIT:
579 for (i = 0; i < wsize && result_len < result_max; i++) {
580 result[result_len] = w[i];
581 result_len++;
582 }
583 wsize = 0;
584 break;
585
586 default:
587 BUG_ON(1);
588 }
589 }
590 rc = result_len;
591
592out:
593 return rc;
594}
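To make the step machinery concrete, here is a hypothetical three-step rule (a sketch only: the op/arg1/arg2 field names match the accesses above and the step struct from crush.h, while root_id and host_type are made-up placeholders). TAKE seeds the working vector w[], the CHOOSE step runs crush_choose() once per entry of w[] and swaps the scratch vectors, and EMIT copies up to result_max items into result[] and clears the working set.

	struct crush_rule_step steps[] = {
		{ .op = CRUSH_RULE_TAKE,          .arg1 = root_id },
		{ .op = CRUSH_RULE_CHOOSE_FIRSTN, .arg1 = 0, .arg2 = host_type },
		{ .op = CRUSH_RULE_EMIT },
	};

Here arg1 = 0 in the CHOOSE step means "relative to result_max", per the numrep handling above.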
595
596
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
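A hedged caller fragment for the two entry points above (not from the patch; map, pool, type, x and weights are assumed to come from an osdmap decoded elsewhere, and 3 is an arbitrary replica count):

	int osds[3];
	int ruleno = crush_find_rule(map, pool, type, 3);

	if (ruleno >= 0) {
		int n = crush_do_rule(map, ruleno, x, osds, 3,
				      -1 /* no forced device */, weights);
		/* osds[0..n-1] now hold the chosen device ids */
	}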
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..f704b3b62424
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,409 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <linux/slab.h>
7#include <crypto/hash.h>
8
9#include "crypto.h"
10#include "decode.h"
11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{
14 if (*p + sizeof(u16) + sizeof(key->created) +
15 sizeof(u16) + key->len > end)
16 return -ERANGE;
17 ceph_encode_16(p, key->type);
18 ceph_encode_copy(p, &key->created, sizeof(key->created));
19 ceph_encode_16(p, key->len);
20 ceph_encode_copy(p, key->key, key->len);
21 return 0;
22}
23
24int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
25{
26 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
27 key->type = ceph_decode_16(p);
28 ceph_decode_copy(p, &key->created, sizeof(key->created));
29 key->len = ceph_decode_16(p);
30 ceph_decode_need(p, end, key->len, bad);
31 key->key = kmalloc(key->len, GFP_NOFS);
32 if (!key->key)
33 return -ENOMEM;
34 ceph_decode_copy(p, key->key, key->len);
35 return 0;
36
37bad:
38 dout("failed to decode crypto key\n");
39 return -EINVAL;
40}
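Read together, the encode/decode pair above implies the following on-wire layout (a sketch inferred from this file, not a separate spec; it assumes struct ceph_timespec is the pair of little-endian 32-bit fields handled in decode.h):

	offset  0: __le16 type
	offset  2: struct ceph_timespec created  (8 bytes)
	offset 10: __le16 len
	offset 12: len bytes of raw key material

so a 16-byte AES key would encode to 28 bytes in total.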
41
42int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
43{
44 int inlen = strlen(inkey);
45 int blen = inlen * 3 / 4;
46 void *buf, *p;
47 int ret;
48
49 dout("crypto_key_unarmor %s\n", inkey);
50 buf = kmalloc(blen, GFP_NOFS);
51 if (!buf)
52 return -ENOMEM;
53 blen = ceph_unarmor(buf, inkey, inkey+inlen);
54 if (blen < 0) {
55 kfree(buf);
56 return blen;
57 }
58
59 p = buf;
60 ret = ceph_crypto_key_decode(key, &p, p + blen);
61 kfree(buf);
62 if (ret)
63 return ret;
64 dout("crypto_key_unarmor key %p type %d len %d\n", key,
65 key->type, key->len);
66 return 0;
67}
68
69
70
71#define AES_KEY_SIZE 16
72
73static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
74{
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76}
77
78const u8 *aes_iv = "cephsageyudagreg";
79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
81 const void *src, size_t src_len)
82{
83 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
85 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
86 int ret;
87 void *iv;
88 int ivsize;
89 size_t zero_padding = (0x10 - (src_len & 0x0f));
90 char pad[16];
91
92 if (IS_ERR(tfm))
93 return PTR_ERR(tfm);
94
95 memset(pad, zero_padding, zero_padding);
96
97 *dst_len = src_len + zero_padding;
98
99 crypto_blkcipher_setkey((void *)tfm, key, key_len);
100 sg_init_table(sg_in, 2);
101 sg_set_buf(&sg_in[0], src, src_len);
102 sg_set_buf(&sg_in[1], pad, zero_padding);
103 sg_init_table(sg_out, 1);
104 sg_set_buf(sg_out, dst, *dst_len);
105 iv = crypto_blkcipher_crt(tfm)->iv;
106 ivsize = crypto_blkcipher_ivsize(tfm);
107
108 memcpy(iv, aes_iv, ivsize);
109 /*
110 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
111 key, key_len, 1);
112 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
113 src, src_len, 1);
114 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
115 pad, zero_padding, 1);
116 */
117 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
118 src_len + zero_padding);
119 crypto_free_blkcipher(tfm);
120 if (ret < 0)
121 pr_err("ceph_aes_crypt failed %d\n", ret);
122 /*
123 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
124 dst, *dst_len, 1);
125 */
126 return 0;
127}
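The padding above is PKCS#7-style: the input is always padded out to the 16-byte AES block size, and every pad byte carries the pad length. A tiny standalone sketch of the arithmetic (illustrative only, not part of the patch):

#include <stdio.h>

int main(void)
{
	size_t src_len = 10;
	size_t zero_padding = 0x10 - (src_len & 0x0f);	/* 6 pad bytes of value 6 */

	printf("src %zu -> pad %zu -> dst %zu\n",
	       src_len, zero_padding, src_len + zero_padding);
	/* a block-aligned input (src_len % 16 == 0) still gets a full
	 * 16-byte pad block, which is what lets the decrypt side
	 * recover the original length from the last byte */
	return 0;
}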
128
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
130 const void *src1, size_t src1_len,
131 const void *src2, size_t src2_len)
132{
133 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
135 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
136 int ret;
137 void *iv;
138 int ivsize;
139 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
140 char pad[16];
141
142 if (IS_ERR(tfm))
143 return PTR_ERR(tfm);
144
145 memset(pad, zero_padding, zero_padding);
146
147 *dst_len = src1_len + src2_len + zero_padding;
148
149 crypto_blkcipher_setkey((void *)tfm, key, key_len);
150 sg_init_table(sg_in, 3);
151 sg_set_buf(&sg_in[0], src1, src1_len);
152 sg_set_buf(&sg_in[1], src2, src2_len);
153 sg_set_buf(&sg_in[2], pad, zero_padding);
154 sg_init_table(sg_out, 1);
155 sg_set_buf(sg_out, dst, *dst_len);
156 iv = crypto_blkcipher_crt(tfm)->iv;
157 ivsize = crypto_blkcipher_ivsize(tfm);
158
159 memcpy(iv, aes_iv, ivsize);
160 /*
161 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
162 key, key_len, 1);
163 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
164 src1, src1_len, 1);
165 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
166 src2, src2_len, 1);
167 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
168 pad, zero_padding, 1);
169 */
170 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
171 src1_len + src2_len + zero_padding);
172 crypto_free_blkcipher(tfm);
173 if (ret < 0)
174 pr_err("ceph_aes_crypt2 failed %d\n", ret);
175 /*
176 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
177 dst, *dst_len, 1);
178 */
179 return 0;
180}
181
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
183 const void *src, size_t src_len)
184{
185 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
187 struct blkcipher_desc desc = { .tfm = tfm };
188 char pad[16];
189 void *iv;
190 int ivsize;
191 int ret;
192 int last_byte;
193
194 if (IS_ERR(tfm))
195 return PTR_ERR(tfm);
196
197 crypto_blkcipher_setkey((void *)tfm, key, key_len);
198 sg_init_table(sg_in, 1);
199 sg_init_table(sg_out, 2);
200 sg_set_buf(sg_in, src, src_len);
201 sg_set_buf(&sg_out[0], dst, *dst_len);
202 sg_set_buf(&sg_out[1], pad, sizeof(pad));
203
204 iv = crypto_blkcipher_crt(tfm)->iv;
205 ivsize = crypto_blkcipher_ivsize(tfm);
206
207 memcpy(iv, aes_iv, ivsize);
208
209 /*
210 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
211 key, key_len, 1);
212 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
213 src, src_len, 1);
214 */
215
216 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
217 crypto_free_blkcipher(tfm);
218 if (ret < 0) {
219 pr_err("ceph_aes_decrypt failed %d\n", ret);
220 return ret;
221 }
222
223 if (src_len <= *dst_len)
224 last_byte = ((char *)dst)[src_len - 1];
225 else
226 last_byte = pad[src_len - *dst_len - 1];
227 if (last_byte <= 16 && src_len >= last_byte) {
228 *dst_len = src_len - last_byte;
229 } else {
230 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
231 last_byte, (int)src_len);
232 return -EPERM; /* bad padding */
233 }
234 /*
235 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
236 dst, *dst_len, 1);
237 */
238 return 0;
239}
240
241int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len)
245{
246 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
248 struct blkcipher_desc desc = { .tfm = tfm };
249 char pad[16];
250 void *iv;
251 int ivsize;
252 int ret;
253 int last_byte;
254
255 if (IS_ERR(tfm))
256 return PTR_ERR(tfm);
257
258 sg_init_table(sg_in, 1);
259 sg_set_buf(sg_in, src, src_len);
260 sg_init_table(sg_out, 3);
261 sg_set_buf(&sg_out[0], dst1, *dst1_len);
262 sg_set_buf(&sg_out[1], dst2, *dst2_len);
263 sg_set_buf(&sg_out[2], pad, sizeof(pad));
264
265 crypto_blkcipher_setkey((void *)tfm, key, key_len);
266 iv = crypto_blkcipher_crt(tfm)->iv;
267 ivsize = crypto_blkcipher_ivsize(tfm);
268
269 memcpy(iv, aes_iv, ivsize);
270
271 /*
272 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
273 key, key_len, 1);
274 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
275 src, src_len, 1);
276 */
277
278 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
279 crypto_free_blkcipher(tfm);
280 if (ret < 0) {
281 pr_err("ceph_aes_decrypt failed %d\n", ret);
282 return ret;
283 }
284
285 if (src_len <= *dst1_len)
286 last_byte = ((char *)dst1)[src_len - 1];
287 else if (src_len <= *dst1_len + *dst2_len)
288 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
289 else
290 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
291 if (last_byte <= 16 && src_len >= last_byte) {
292 src_len -= last_byte;
293 } else {
294 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
295 last_byte, (int)src_len);
296 return -EPERM; /* bad padding */
297 }
298
299 if (src_len < *dst1_len) {
300 *dst1_len = src_len;
301 *dst2_len = 0;
302 } else {
303 *dst2_len = src_len - *dst1_len;
304 }
305 /*
306 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
307 dst1, *dst1_len, 1);
308 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
309 dst2, *dst2_len, 1);
310 */
311
312 return 0;
313}
314
315
316int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
317 const void *src, size_t src_len)
318{
319 switch (secret->type) {
320 case CEPH_CRYPTO_NONE:
321 if (*dst_len < src_len)
322 return -ERANGE;
323 memcpy(dst, src, src_len);
324 *dst_len = src_len;
325 return 0;
326
327 case CEPH_CRYPTO_AES:
328 return ceph_aes_decrypt(secret->key, secret->len, dst,
329 dst_len, src, src_len);
330
331 default:
332 return -EINVAL;
333 }
334}
335
336int ceph_decrypt2(struct ceph_crypto_key *secret,
337 void *dst1, size_t *dst1_len,
338 void *dst2, size_t *dst2_len,
339 const void *src, size_t src_len)
340{
341 size_t t;
342
343 switch (secret->type) {
344 case CEPH_CRYPTO_NONE:
345 if (*dst1_len + *dst2_len < src_len)
346 return -ERANGE;
347 t = min(*dst1_len, src_len);
348 memcpy(dst1, src, t);
349 *dst1_len = t;
350 src += t;
351 src_len -= t;
352 if (src_len) {
353 t = min(*dst2_len, src_len);
354 memcpy(dst2, src, t);
355 *dst2_len = t;
356 }
357 return 0;
358
359 case CEPH_CRYPTO_AES:
360 return ceph_aes_decrypt2(secret->key, secret->len,
361 dst1, dst1_len, dst2, dst2_len,
362 src, src_len);
363
364 default:
365 return -EINVAL;
366 }
367}
368
369int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
370 const void *src, size_t src_len)
371{
372 switch (secret->type) {
373 case CEPH_CRYPTO_NONE:
374 if (*dst_len < src_len)
375 return -ERANGE;
376 memcpy(dst, src, src_len);
377 *dst_len = src_len;
378 return 0;
379
380 case CEPH_CRYPTO_AES:
381 return ceph_aes_encrypt(secret->key, secret->len, dst,
382 dst_len, src, src_len);
383
384 default:
385 return -EINVAL;
386 }
387}
388
389int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
390 const void *src1, size_t src1_len,
391 const void *src2, size_t src2_len)
392{
393 switch (secret->type) {
394 case CEPH_CRYPTO_NONE:
395 if (*dst_len < src1_len + src2_len)
396 return -ERANGE;
397 memcpy(dst, src1, src1_len);
398 memcpy(dst + src1_len, src2, src2_len);
399 *dst_len = src1_len + src2_len;
400 return 0;
401
402 case CEPH_CRYPTO_AES:
403 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
404 src1, src1_len, src2, src2_len);
405
406 default:
407 return -EINVAL;
408 }
409}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..f7048da92acc
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,484 @@
1#include "ceph_debug.h"
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53
54static int mdsmap_show(struct seq_file *s, void *p)
55{
56 int i;
57 struct ceph_client *client = s->private;
58
59 if (client->mdsc.mdsmap == NULL)
60 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state;
71
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state));
74 }
75 return 0;
76}
77
78static int osdmap_show(struct seq_file *s, void *p)
79{
80 int i;
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_statfs_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
130 req = rb_entry(rp, struct ceph_mon_statfs_request, node);
131 seq_printf(s, "%lld statfs\n", req->tid);
132 }
133
134 mutex_unlock(&monc->mutex);
135 return 0;
136}
137
138static int mdsc_show(struct seq_file *s, void *p)
139{
140 struct ceph_client *client = s->private;
141 struct ceph_mds_client *mdsc = &client->mdsc;
142 struct ceph_mds_request *req;
143 struct rb_node *rp;
144 int pathlen;
145 u64 pathbase;
146 char *path;
147
148 mutex_lock(&mdsc->mutex);
149 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
150 req = rb_entry(rp, struct ceph_mds_request, r_node);
151
152 if (req->r_request)
153 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
154 else
155 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
156
157 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
158
159 if (req->r_got_unsafe)
160 seq_printf(s, "\t(unsafe)");
161 else
162 seq_printf(s, "\t");
163
164 if (req->r_inode) {
165 seq_printf(s, " #%llx", ceph_ino(req->r_inode));
166 } else if (req->r_dentry) {
167 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
168 &pathbase, 0);
169 spin_lock(&req->r_dentry->d_lock);
170 seq_printf(s, " #%llx/%.*s (%s)",
171 ceph_ino(req->r_dentry->d_parent->d_inode),
172 req->r_dentry->d_name.len,
173 req->r_dentry->d_name.name,
174 path ? path : "");
175 spin_unlock(&req->r_dentry->d_lock);
176 kfree(path);
177 } else if (req->r_path1) {
178 seq_printf(s, " #%llx/%s", req->r_ino1.ino,
179 req->r_path1);
180 }
181
182 if (req->r_old_dentry) {
183 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
184 &pathbase, 0);
185 spin_lock(&req->r_old_dentry->d_lock);
186 seq_printf(s, " #%llx/%.*s (%s)",
187 ceph_ino(req->r_old_dentry->d_parent->d_inode),
188 req->r_old_dentry->d_name.len,
189 req->r_old_dentry->d_name.name,
190 path ? path : "");
191 spin_unlock(&req->r_old_dentry->d_lock);
192 kfree(path);
193 } else if (req->r_path2) {
194 if (req->r_ino2.ino)
195 seq_printf(s, " #%llx/%s", req->r_ino2.ino,
196 req->r_path2);
197 else
198 seq_printf(s, " %s", req->r_path2);
199 }
200
201 seq_printf(s, "\n");
202 }
203 mutex_unlock(&mdsc->mutex);
204
205 return 0;
206}
207
208static int osdc_show(struct seq_file *s, void *pp)
209{
210 struct ceph_client *client = s->private;
211 struct ceph_osd_client *osdc = &client->osdc;
212 struct rb_node *p;
213
214 mutex_lock(&osdc->request_mutex);
215 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
216 struct ceph_osd_request *req;
217 struct ceph_osd_request_head *head;
218 struct ceph_osd_op *op;
219 int num_ops;
220 int opcode, olen;
221 int i;
222
223 req = rb_entry(p, struct ceph_osd_request, r_node);
224
225 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
226 req->r_osd ? req->r_osd->o_osd : -1,
227 le32_to_cpu(req->r_pgid.pool),
228 le16_to_cpu(req->r_pgid.ps));
229
230 head = req->r_request->front.iov_base;
231 op = (void *)(head + 1);
232
233 num_ops = le16_to_cpu(head->num_ops);
234 olen = le32_to_cpu(head->object_len);
235 seq_printf(s, "%.*s", olen,
236 (const char *)(head->ops + num_ops));
237
238 if (req->r_reassert_version.epoch)
239 seq_printf(s, "\t%u'%llu",
240 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
241 le64_to_cpu(req->r_reassert_version.version));
242 else
243 seq_printf(s, "\t");
244
245 for (i = 0; i < num_ops; i++) {
246 opcode = le16_to_cpu(op->op);
247 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
248 op++;
249 }
250
251 seq_printf(s, "\n");
252 }
253 mutex_unlock(&osdc->request_mutex);
254 return 0;
255}
256
257static int caps_show(struct seq_file *s, void *p)
258{
259 struct ceph_client *client = p;
260 int total, avail, used, reserved, min;
261
262 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
263 seq_printf(s, "total\t\t%d\n"
264 "avail\t\t%d\n"
265 "used\t\t%d\n"
266 "reserved\t%d\n"
267 "min\t%d\n",
268 total, avail, used, reserved, min);
269 return 0;
270}
271
272static int dentry_lru_show(struct seq_file *s, void *ptr)
273{
274 struct ceph_client *client = s->private;
275 struct ceph_mds_client *mdsc = &client->mdsc;
276 struct ceph_dentry_info *di;
277
278 spin_lock(&mdsc->dentry_lru_lock);
279 list_for_each_entry(di, &mdsc->dentry_lru, lru) {
280 struct dentry *dentry = di->dentry;
281 seq_printf(s, "%p %p\t%.*s\n",
282 di, dentry, dentry->d_name.len, dentry->d_name.name);
283 }
284 spin_unlock(&mdsc->dentry_lru_lock);
285
286 return 0;
287}
288
289#define DEFINE_SHOW_FUNC(name) \
290static int name##_open(struct inode *inode, struct file *file) \
291{ \
292 struct seq_file *sf; \
293 int ret; \
294 \
295 ret = single_open(file, name, NULL); \
296 sf = file->private_data; \
297 sf->private = inode->i_private; \
298 return ret; \
299} \
300 \
301static const struct file_operations name##_fops = { \
302 .open = name##_open, \
303 .read = seq_read, \
304 .llseek = seq_lseek, \
305 .release = single_release, \
306};
307
308DEFINE_SHOW_FUNC(monmap_show)
309DEFINE_SHOW_FUNC(mdsmap_show)
310DEFINE_SHOW_FUNC(osdmap_show)
311DEFINE_SHOW_FUNC(monc_show)
312DEFINE_SHOW_FUNC(mdsc_show)
313DEFINE_SHOW_FUNC(osdc_show)
314DEFINE_SHOW_FUNC(dentry_lru_show)
315DEFINE_SHOW_FUNC(caps_show)
316
317static int congestion_kb_set(void *data, u64 val)
318{
319 struct ceph_client *client = (struct ceph_client *)data;
320
321 if (client)
322 client->mount_args->congestion_kb = (int)val;
323
324 return 0;
325}
326
327static int congestion_kb_get(void *data, u64 *val)
328{
329 struct ceph_client *client = (struct ceph_client *)data;
330
331 if (client)
332 *val = (u64)client->mount_args->congestion_kb;
333
334 return 0;
335}
336
337
338DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
339 congestion_kb_set, "%llu\n");
340
341int __init ceph_debugfs_init(void)
342{
343 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
344 if (!ceph_debugfs_dir)
345 return -ENOMEM;
346 return 0;
347}
348
349void ceph_debugfs_cleanup(void)
350{
351 debugfs_remove(ceph_debugfs_dir);
352}
353
354int ceph_debugfs_client_init(struct ceph_client *client)
355{
356 int ret = 0;
357 char name[80];
358
359 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
360 PR_FSID(&client->fsid), client->monc.auth->global_id);
361
362 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
363 if (!client->debugfs_dir)
364 goto out;
365
366 client->monc.debugfs_file = debugfs_create_file("monc",
367 0600,
368 client->debugfs_dir,
369 client,
370 &monc_show_fops);
371 if (!client->monc.debugfs_file)
372 goto out;
373
374 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
375 0600,
376 client->debugfs_dir,
377 client,
378 &mdsc_show_fops);
379 if (!client->mdsc.debugfs_file)
380 goto out;
381
382 client->osdc.debugfs_file = debugfs_create_file("osdc",
383 0600,
384 client->debugfs_dir,
385 client,
386 &osdc_show_fops);
387 if (!client->osdc.debugfs_file)
388 goto out;
389
390 client->debugfs_monmap = debugfs_create_file("monmap",
391 0600,
392 client->debugfs_dir,
393 client,
394 &monmap_show_fops);
395 if (!client->debugfs_monmap)
396 goto out;
397
398 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
399 0600,
400 client->debugfs_dir,
401 client,
402 &mdsmap_show_fops);
403 if (!client->debugfs_mdsmap)
404 goto out;
405
406 client->debugfs_osdmap = debugfs_create_file("osdmap",
407 0600,
408 client->debugfs_dir,
409 client,
410 &osdmap_show_fops);
411 if (!client->debugfs_osdmap)
412 goto out;
413
414 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
415 0600,
416 client->debugfs_dir,
417 client,
418 &dentry_lru_show_fops);
419 if (!client->debugfs_dentry_lru)
420 goto out;
421
422 client->debugfs_caps = debugfs_create_file("caps",
423 0400,
424 client->debugfs_dir,
425 client,
426 &caps_show_fops);
427 if (!client->debugfs_caps)
428 goto out;
429
430 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
431 0600,
432 client->debugfs_dir,
433 client,
434 &congestion_kb_fops);
435 if (!client->debugfs_congestion_kb)
436 goto out;
437
438 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
439 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
440 name);
441
442 return 0;
443
444out:
445 ceph_debugfs_client_cleanup(client);
446 return ret;
447}
448
449void ceph_debugfs_client_cleanup(struct ceph_client *client)
450{
451 debugfs_remove(client->debugfs_bdi);
452 debugfs_remove(client->debugfs_caps);
453 debugfs_remove(client->debugfs_dentry_lru);
454 debugfs_remove(client->debugfs_osdmap);
455 debugfs_remove(client->debugfs_mdsmap);
456 debugfs_remove(client->debugfs_monmap);
457 debugfs_remove(client->osdc.debugfs_file);
458 debugfs_remove(client->mdsc.debugfs_file);
459 debugfs_remove(client->monc.debugfs_file);
460 debugfs_remove(client->debugfs_congestion_kb);
461 debugfs_remove(client->debugfs_dir);
462}
463
464#else /* CONFIG_DEBUG_FS */
465
466int __init ceph_debugfs_init(void)
467{
468 return 0;
469}
470
471void ceph_debugfs_cleanup(void)
472{
473}
474
475int ceph_debugfs_client_init(struct ceph_client *client)
476{
477 return 0;
478}
479
480void ceph_debugfs_client_cleanup(struct ceph_client *client)
481{
482}
483
484#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
80
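A hypothetical decoder using the helpers above (a sketch, not from the patch): the caller passes the position pointer and the buffer end, and each *_safe macro jumps to the bad label if fewer than the needed bytes remain.

static int decode_example(void **p, void *end, u32 *a, u64 *b)
{
	ceph_decode_32_safe(p, end, *a, bad);
	ceph_decode_64_safe(p, end, *b, bad);
	return 0;

bad:
	return -ERANGE;	/* ran past the end of the buffer */
}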
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family);
103}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
107 WARN_ON(a->in_addr.ss_family == 512);
108}
109
110/*
111 * encoders
112 */
113static inline void ceph_encode_64(void **p, u64 v)
114{
115 put_unaligned_le64(v, (__le64 *)*p);
116 *p += sizeof(u64);
117}
118static inline void ceph_encode_32(void **p, u32 v)
119{
120 put_unaligned_le32(v, (__le32 *)*p);
121 *p += sizeof(u32);
122}
123static inline void ceph_encode_16(void **p, u16 v)
124{
125 put_unaligned_le16(v, (__le16 *)*p);
126 *p += sizeof(u16);
127}
128static inline void ceph_encode_8(void **p, u8 v)
129{
130 *(u8 *)*p = v;
131 (*p)++;
132}
133static inline void ceph_encode_copy(void **p, const void *s, int len)
134{
135 memcpy(*p, s, len);
136 *p += len;
137}
138
139/*
140 * filepath, string encoders
141 */
142static inline void ceph_encode_filepath(void **p, void *end,
143 u64 ino, const char *path)
144{
145 u32 len = path ? strlen(path) : 0;
146 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
147 ceph_encode_8(p, 1);
148 ceph_encode_64(p, ino);
149 ceph_encode_32(p, len);
150 if (len)
151 memcpy(*p, path, len);
152 *p += len;
153}
154
155static inline void ceph_encode_string(void **p, void *end,
156 const char *s, u32 len)
157{
158 BUG_ON(*p + sizeof(len) + len > end);
159 ceph_encode_32(p, len);
160 if (len)
161 memcpy(*p, s, len);
162 *p += len;
163}
164
165#define ceph_encode_need(p, end, n, bad) \
166 do { \
167 if (unlikely(*(p) + (n) > (end))) \
168 goto bad; \
169 } while (0)
170
171#define ceph_encode_64_safe(p, end, v, bad) \
172 do { \
173 ceph_encode_need(p, end, sizeof(u64), bad); \
174 ceph_encode_64(p, v); \
175 } while (0)
176#define ceph_encode_32_safe(p, end, v, bad) \
177 do { \
178 ceph_encode_need(p, end, sizeof(u32), bad); \
179 ceph_encode_32(p, v); \
180 } while (0)
181#define ceph_encode_16_safe(p, end, v, bad) \
182 do { \
183 ceph_encode_need(p, end, sizeof(u16), bad); \
184 ceph_encode_16(p, v); \
185 } while (0)
186
187#define ceph_encode_copy_safe(p, end, pv, n, bad) \
188 do { \
189 ceph_encode_need(p, end, n, bad); \
190 ceph_encode_copy(p, pv, n); \
191 } while (0)
192
193
194#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..ea8ee2e526aa
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1224 @@
1#include "ceph_debug.h"
2
3#include <linux/spinlock.h>
4#include <linux/fs_struct.h>
5#include <linux/namei.h>
6#include <linux/slab.h>
7#include <linux/sched.h>
8
9#include "super.h"
10
11/*
12 * Directory operations: readdir, lookup, create, link, unlink,
13 * rename, etc.
14 */
15
16/*
17 * Ceph MDS operations are specified in terms of a base ino and
18 * relative path. Thus, the client can specify an operation on a
19 * specific inode (e.g., a getattr due to fstat(2)), or as a path
20 * relative to, say, the root directory.
21 *
22 * Normally, we limit ourselves to strict inode ops (no path component)
23 * or dentry operations (a single path component relative to an ino). The
24 * exception to this is open_root_dentry(), which will open the mount
25 * point by name.
26 */
27
28const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops;
31
32/*
33 * Initialize ceph dentry state.
34 */
35int ceph_init_dentry(struct dentry *dentry)
36{
37 struct ceph_dentry_info *di;
38
39 if (dentry->d_fsdata)
40 return 0;
41
42 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
43 dentry->d_op = &ceph_dentry_ops;
44 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
45 dentry->d_op = &ceph_snapdir_dentry_ops;
46 else
47 dentry->d_op = &ceph_snap_dentry_ops;
48
49 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
50 if (!di)
51 return -ENOMEM; /* oh well */
52
53 spin_lock(&dentry->d_lock);
54 if (dentry->d_fsdata) /* lost a race */
55 goto out_unlock;
56 di->dentry = dentry;
57 di->lease_session = NULL;
58 dentry->d_fsdata = di;
59 dentry->d_time = jiffies;
60 ceph_dentry_lru_add(dentry);
61out_unlock:
62 spin_unlock(&dentry->d_lock);
63 return 0;
64}
65
66
67
68/*
69 * for readdir, we encode the directory frag and offset within that
70 * frag into f_pos.
71 */
72static unsigned fpos_frag(loff_t p)
73{
74 return p >> 32;
75}
76static unsigned fpos_off(loff_t p)
77{
78 return p & 0xffffffff;
79}
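A worked packing example (illustrative values; ceph_make_fpos(), used below, is presumably the inverse of these two helpers): frag 0x2 with offset 5 packs to f_pos 0x200000005, and the accessors recover both halves.

#include <stdio.h>

int main(void)
{
	unsigned frag = 0x2, off = 5;
	unsigned long long fpos = ((unsigned long long)frag << 32) | off;

	printf("f_pos %#llx -> frag %#x off %u\n",
	       fpos, (unsigned)(fpos >> 32), (unsigned)(fpos & 0xffffffff));
	return 0;
}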
80
81/*
82 * When possible, we try to satisfy a readdir by peeking at the
83 * dcache. We make this work by carefully ordering dentries on
84 * d_u.d_child when we initially get results back from the MDS, and
85 * falling back to a "normal" sync readdir if any dentries in the dir
86 * are dropped.
87 *
88 * I_COMPLETE indicates we have all dentries in the dir. It is
89 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
90 * the MDS if/when the directory is modified).
91 */
92static int __dcache_readdir(struct file *filp,
93 void *dirent, filldir_t filldir)
94{
95 struct inode *inode = filp->f_dentry->d_inode;
96 struct ceph_file_info *fi = filp->private_data;
97 struct dentry *parent = filp->f_dentry;
98 struct inode *dir = parent->d_inode;
99 struct list_head *p;
100 struct dentry *dentry, *last;
101 struct ceph_dentry_info *di;
102 int err = 0;
103
104 /* claim ref on last dentry we returned */
105 last = fi->dentry;
106 fi->dentry = NULL;
107
108 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
109 last);
110
111 spin_lock(&dcache_lock);
112
113 /* start at beginning? */
114 if (filp->f_pos == 2 || (last &&
115 filp->f_pos < ceph_dentry(last)->offset)) {
116 if (list_empty(&parent->d_subdirs))
117 goto out_unlock;
118 p = parent->d_subdirs.prev;
119 dout(" initial p %p/%p\n", p->prev, p->next);
120 } else {
121 p = last->d_u.d_child.prev;
122 }
123
124more:
125 dentry = list_entry(p, struct dentry, d_u.d_child);
126 di = ceph_dentry(dentry);
127 while (1) {
128 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
129 parent->d_subdirs.prev, parent->d_subdirs.next);
130 if (p == &parent->d_subdirs) {
131 fi->at_end = 1;
132 goto out_unlock;
133 }
134 if (!d_unhashed(dentry) && dentry->d_inode &&
135 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
136 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
137 filp->f_pos <= di->offset)
138 break;
139 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
140 dentry->d_name.len, dentry->d_name.name, di->offset,
141 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
142 !dentry->d_inode ? " null" : "");
143 p = p->prev;
144 dentry = list_entry(p, struct dentry, d_u.d_child);
145 di = ceph_dentry(dentry);
146 }
147
148 atomic_inc(&dentry->d_count);
149 spin_unlock(&dcache_lock);
150 spin_unlock(&inode->i_lock);
151
152 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
153 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
154 filp->f_pos = di->offset;
155 err = filldir(dirent, dentry->d_name.name,
156 dentry->d_name.len, di->offset,
157 dentry->d_inode->i_ino,
158 dentry->d_inode->i_mode >> 12);
159
160 if (last) {
161 if (err < 0) {
162 /* remember our position */
163 fi->dentry = last;
164 fi->next_offset = di->offset;
165 } else {
166 dput(last);
167 }
168 last = NULL;
169 }
170
171 spin_lock(&inode->i_lock);
172 spin_lock(&dcache_lock);
173
174 last = dentry;
175
176 if (err < 0)
177 goto out_unlock;
178
179 p = p->prev;
180 filp->f_pos++;
181
182 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
183 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
184 goto more;
185 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
186 err = -EAGAIN;
187
188out_unlock:
189 spin_unlock(&dcache_lock);
190
191 if (last) {
192 spin_unlock(&inode->i_lock);
193 dput(last);
194 spin_lock(&inode->i_lock);
195 }
196
197 return err;
198}
199
200/*
201 * make note of the last dentry we read, so we can
202 * continue at the same lexicographical point,
203 * regardless of what dir changes take place on the
204 * server.
205 */
206static int note_last_dentry(struct ceph_file_info *fi, const char *name,
207 int len)
208{
209 kfree(fi->last_name);
210 fi->last_name = kmalloc(len+1, GFP_NOFS);
211 if (!fi->last_name)
212 return -ENOMEM;
213 memcpy(fi->last_name, name, len);
214 fi->last_name[len] = 0;
215 dout("note_last_dentry '%s'\n", fi->last_name);
216 return 0;
217}
218
219static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
220{
221 struct ceph_file_info *fi = filp->private_data;
222 struct inode *inode = filp->f_dentry->d_inode;
223 struct ceph_inode_info *ci = ceph_inode(inode);
224 struct ceph_client *client = ceph_inode_to_client(inode);
225 struct ceph_mds_client *mdsc = &client->mdsc;
226 unsigned frag = fpos_frag(filp->f_pos);
227 int off = fpos_off(filp->f_pos);
228 int err;
229 u32 ftype;
230 struct ceph_mds_reply_info_parsed *rinfo;
231 const int max_entries = client->mount_args->max_readdir;
232
233 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
234 if (fi->at_end)
235 return 0;
236
237 /* always start with . and .. */
238 if (filp->f_pos == 0) {
239 /* note dir version at start of readdir so we can tell
240 * if any dentries get dropped */
241 fi->dir_release_count = ci->i_release_count;
242
243 dout("readdir off 0 -> '.'\n");
244 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
245 inode->i_ino, inode->i_mode >> 12) < 0)
246 return 0;
247 filp->f_pos = 1;
248 off = 1;
249 }
250 if (filp->f_pos == 1) {
251 dout("readdir off 1 -> '..'\n");
252 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
253 filp->f_dentry->d_parent->d_inode->i_ino,
254 inode->i_mode >> 12) < 0)
255 return 0;
256 filp->f_pos = 2;
257 off = 2;
258 }
259
260 /* can we use the dcache? */
261 spin_lock(&inode->i_lock);
262 if ((filp->f_pos == 2 || fi->dentry) &&
263 !ceph_test_opt(client, NOASYNCREADDIR) &&
264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 err = __dcache_readdir(filp, dirent, filldir);
267 if (err != -EAGAIN) {
268 spin_unlock(&inode->i_lock);
269 return err;
270 }
271 }
272 spin_unlock(&inode->i_lock);
273 if (fi->dentry) {
274 err = note_last_dentry(fi, fi->dentry->d_name.name,
275 fi->dentry->d_name.len);
276 if (err)
277 return err;
278 dput(fi->dentry);
279 fi->dentry = NULL;
280 }
281
282 /* proceed with a normal readdir */
283
284more:
285 /* do we have the correct frag content buffered? */
286 if (fi->frag != frag || fi->last_readdir == NULL) {
287 struct ceph_mds_request *req;
288 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
289 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
290
291 /* discard old result, if any */
292 if (fi->last_readdir) {
293 ceph_mdsc_put_request(fi->last_readdir);
294 fi->last_readdir = NULL;
295 }
296
297 /* requery frag tree, as the frag topology may have changed */
298 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
299
300 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
301 ceph_vinop(inode), frag, fi->last_name);
302 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
303 if (IS_ERR(req))
304 return PTR_ERR(req);
305 req->r_inode = igrab(inode);
306 req->r_dentry = dget(filp->f_dentry);
307 /* hints to request -> mds selection code */
308 req->r_direct_mode = USE_AUTH_MDS;
309 req->r_direct_hash = ceph_frag_value(frag);
310 req->r_direct_is_hash = true;
311 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
312 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
315 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) {
318 ceph_mdsc_put_request(req);
319 return err;
320 }
321 dout("readdir got and parsed readdir result=%d"
322 " on frag %x, end=%d, complete=%d\n", err, frag,
323 (int)req->r_reply_info.dir_end,
324 (int)req->r_reply_info.dir_complete);
325
326 if (!req->r_did_prepopulate) {
327 dout("readdir !did_prepopulate");
328 fi->dir_release_count--; /* preclude I_COMPLETE */
329 }
330
331 /* note next offset and last dentry name */
332 fi->offset = fi->next_offset;
333 fi->last_readdir = req;
334
335 if (req->r_reply_info.dir_end) {
336 kfree(fi->last_name);
337 fi->last_name = NULL;
338 fi->next_offset = 0;
339 } else {
340 rinfo = &req->r_reply_info;
341 err = note_last_dentry(fi,
342 rinfo->dir_dname[rinfo->dir_nr-1],
343 rinfo->dir_dname_len[rinfo->dir_nr-1]);
344 if (err)
345 return err;
346 fi->next_offset += rinfo->dir_nr;
347 }
348 }
349
350 rinfo = &fi->last_readdir->r_reply_info;
351 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
352 rinfo->dir_nr, off, fi->offset);
353 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
354 u64 pos = ceph_make_fpos(frag, off);
355 struct ceph_mds_reply_inode *in =
356 rinfo->dir_in[off - fi->offset].in;
357 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
358 off, off - fi->offset, rinfo->dir_nr, pos,
359 rinfo->dir_dname_len[off - fi->offset],
360 rinfo->dir_dname[off - fi->offset], in);
361 BUG_ON(!in);
362 ftype = le32_to_cpu(in->mode) >> 12;
363 if (filldir(dirent,
364 rinfo->dir_dname[off - fi->offset],
365 rinfo->dir_dname_len[off - fi->offset],
366 pos,
367 le64_to_cpu(in->ino),
368 ftype) < 0) {
369 dout("filldir stopping us...\n");
370 return 0;
371 }
372 off++;
373 filp->f_pos = pos + 1;
374 }
375
376 if (fi->last_name) {
377 ceph_mdsc_put_request(fi->last_readdir);
378 fi->last_readdir = NULL;
379 goto more;
380 }
381
382 /* more frags? */
383 if (!ceph_frag_is_rightmost(frag)) {
384 frag = ceph_frag_next(frag);
385 off = 0;
386 filp->f_pos = ceph_make_fpos(frag, off);
387 dout("readdir next frag is %x\n", frag);
388 goto more;
389 }
390 fi->at_end = 1;
391
392 /*
393 * if dir_release_count still matches the dir, no dentries
394 * were released during the whole readdir, and we should have
395 * the complete dir contents in our cache.
396 */
397 spin_lock(&inode->i_lock);
398 if (ci->i_release_count == fi->dir_release_count) {
399 dout(" marking %p complete\n", inode);
400 ci->i_ceph_flags |= CEPH_I_COMPLETE;
401 ci->i_max_offset = filp->f_pos;
402 }
403 spin_unlock(&inode->i_lock);
404
405 dout("readdir %p filp %p done.\n", inode, filp);
406 return 0;
407}
408
409static void reset_readdir(struct ceph_file_info *fi)
410{
411 if (fi->last_readdir) {
412 ceph_mdsc_put_request(fi->last_readdir);
413 fi->last_readdir = NULL;
414 }
415 kfree(fi->last_name);
416 fi->next_offset = 2; /* compensate for . and .. */
417 if (fi->dentry) {
418 dput(fi->dentry);
419 fi->dentry = NULL;
420 }
421 fi->at_end = 0;
422}
423
424static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
425{
426 struct ceph_file_info *fi = file->private_data;
427 struct inode *inode = file->f_mapping->host;
428 loff_t old_offset = offset;
429 loff_t retval;
430
431 mutex_lock(&inode->i_mutex);
432 switch (origin) {
433 case SEEK_END:
434 offset += inode->i_size + 2; /* FIXME */
435 break;
436 case SEEK_CUR:
437 offset += file->f_pos;
438 }
439 retval = -EINVAL;
440 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
441 if (offset != file->f_pos) {
442 file->f_pos = offset;
443 file->f_version = 0;
444 fi->at_end = 0;
445 }
446 retval = offset;
447
448 /*
449 * discard buffered readdir content on seekdir(0), or
450 * seek to new frag, or seek prior to current chunk.
451 */
452 if (offset == 0 ||
453 fpos_frag(offset) != fpos_frag(old_offset) ||
454 fpos_off(offset) < fi->offset) {
455 dout("dir_llseek dropping %p content\n", file);
456 reset_readdir(fi);
457 }
458
459 /* bump dir_release_count if we did a forward seek */
460 if (offset > old_offset)
461 fi->dir_release_count--;
462 }
463 mutex_unlock(&inode->i_mutex);
464 return retval;
465}
466
467/*
468 * Process result of a lookup/open request.
469 *
470 * Mainly, make sure we return the final req->r_dentry (if it already
471 * existed) in place of the original VFS-provided dentry when they
472 * differ.
473 *
474 * Gracefully handle the case where the MDS replies with -ENOENT and
475 * no trace (which it may do, at its discretion, e.g., if it doesn't
476 * care to issue a lease on the negative dentry).
477 */
478struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
479 struct dentry *dentry, int err)
480{
481 struct ceph_client *client = ceph_client(dentry->d_sb);
482 struct inode *parent = dentry->d_parent->d_inode;
483
484 /* .snap dir? */
485 if (err == -ENOENT &&
486 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
487 strcmp(dentry->d_name.name,
488 client->mount_args->snapdir_name) == 0) {
489 struct inode *inode = ceph_get_snapdir(parent);
490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
492 BUG_ON(!d_unhashed(dentry));
493 d_add(dentry, inode);
494 err = 0;
495 }
496
497 if (err == -ENOENT) {
498 /* no trace? */
499 err = 0;
500 if (!req->r_reply_info.head->is_dentry) {
501 dout("ENOENT and no trace, dentry %p inode %p\n",
502 dentry, dentry->d_inode);
503 if (dentry->d_inode) {
504 d_drop(dentry);
505 err = -ENOENT;
506 } else {
507 d_add(dentry, NULL);
508 }
509 }
510 }
511 if (err)
512 dentry = ERR_PTR(err);
513 else if (dentry != req->r_dentry)
514 dentry = dget(req->r_dentry); /* we got spliced */
515 else
516 dentry = NULL;
517 return dentry;
518}
519
520static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
521{
522 return ceph_ino(inode) == CEPH_INO_ROOT &&
523 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
524}
525
526/*
527 * Look up a single dir entry. If there is a lookup intent, inform
528 * the MDS so that it gets our 'caps wanted' value in a single op.
529 */
530static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
531 struct nameidata *nd)
532{
533 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
534 struct ceph_mds_client *mdsc = &client->mdsc;
535 struct ceph_mds_request *req;
536 int op;
537 int err;
538
539 dout("lookup %p dentry %p '%.*s'\n",
540 dir, dentry, dentry->d_name.len, dentry->d_name.name);
541
542 if (dentry->d_name.len > NAME_MAX)
543 return ERR_PTR(-ENAMETOOLONG);
544
545 err = ceph_init_dentry(dentry);
546 if (err < 0)
547 return ERR_PTR(err);
548
549 /* open (but not create!) intent? */
550 if (nd &&
551 (nd->flags & LOOKUP_OPEN) &&
552 (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
553 !(nd->intent.open.flags & O_CREAT)) {
554 int mode = nd->intent.open.create_mode & ~current->fs->umask;
555 return ceph_lookup_open(dir, dentry, nd, mode, 1);
556 }
557
558 /* can we conclude ENOENT locally? */
559 if (dentry->d_inode == NULL) {
560 struct ceph_inode_info *ci = ceph_inode(dir);
561 struct ceph_dentry_info *di = ceph_dentry(dentry);
562
563 spin_lock(&dir->i_lock);
564 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
565 if (strncmp(dentry->d_name.name,
566 client->mount_args->snapdir_name,
567 dentry->d_name.len) &&
568 !is_root_ceph_dentry(dir, dentry) &&
569 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
570 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
571 di->offset = ci->i_max_offset++;
572 spin_unlock(&dir->i_lock);
573 dout(" dir %p complete, -ENOENT\n", dir);
574 d_add(dentry, NULL);
575 di->lease_shared_gen = ci->i_shared_gen;
576 return NULL;
577 }
578 spin_unlock(&dir->i_lock);
579 }
580
581 op = ceph_snap(dir) == CEPH_SNAPDIR ?
582 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
583 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
584 if (IS_ERR(req))
585 return ERR_PTR(PTR_ERR(req));
586 req->r_dentry = dget(dentry);
587 req->r_num_caps = 2;
588 /* we only need inode linkage */
589 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
590 req->r_locked_dir = dir;
591 err = ceph_mdsc_do_request(mdsc, NULL, req);
592 dentry = ceph_finish_lookup(req, dentry, err);
593 ceph_mdsc_put_request(req); /* will dput(dentry) */
594 dout("lookup result=%p\n", dentry);
595 return dentry;
596}
597
598/*
599 * If we do a create but get no trace back from the MDS, follow up with
600 * a lookup (the VFS expects us to link up the provided dentry).
601 */
602int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
603{
604 struct dentry *result = ceph_lookup(dir, dentry, NULL);
605
606 if (result && !IS_ERR(result)) {
607 /*
608 * We created the item, then did a lookup, and found
609 * it was already linked to another inode we already
610 * had in our cache (and thus got spliced). Link our
611 * dentry to that inode, but don't hash it, just in
612 * case the VFS wants to dereference it.
613 */
614 BUG_ON(!result->d_inode);
615 d_instantiate(dentry, result->d_inode);
616 return 0;
617 }
618 return PTR_ERR(result);
619}
620
621static int ceph_mknod(struct inode *dir, struct dentry *dentry,
622 int mode, dev_t rdev)
623{
624 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
625 struct ceph_mds_client *mdsc = &client->mdsc;
626 struct ceph_mds_request *req;
627 int err;
628
629 if (ceph_snap(dir) != CEPH_NOSNAP)
630 return -EROFS;
631
632 dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
633 dir, dentry, mode, rdev);
634 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
635 if (IS_ERR(req)) {
636 d_drop(dentry);
637 return PTR_ERR(req);
638 }
639 req->r_dentry = dget(dentry);
640 req->r_num_caps = 2;
641 req->r_locked_dir = dir;
642 req->r_args.mknod.mode = cpu_to_le32(mode);
643 req->r_args.mknod.rdev = cpu_to_le32(rdev);
644 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
645 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
646 err = ceph_mdsc_do_request(mdsc, dir, req);
647 if (!err && !req->r_reply_info.head->is_dentry)
648 err = ceph_handle_notrace_create(dir, dentry);
649 ceph_mdsc_put_request(req);
650 if (err)
651 d_drop(dentry);
652 return err;
653}
654
655static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
656 struct nameidata *nd)
657{
658 dout("create in dir %p dentry %p name '%.*s'\n",
659 dir, dentry, dentry->d_name.len, dentry->d_name.name);
660
661 if (ceph_snap(dir) != CEPH_NOSNAP)
662 return -EROFS;
663
664 if (nd) {
665 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
666 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
667 /* hrm, what should i do here if we get aliased? */
668 if (IS_ERR(dentry))
669 return PTR_ERR(dentry);
670 return 0;
671 }
672
673 /* fall back to mknod */
674 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
675}
676
677static int ceph_symlink(struct inode *dir, struct dentry *dentry,
678 const char *dest)
679{
680 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
681 struct ceph_mds_client *mdsc = &client->mdsc;
682 struct ceph_mds_request *req;
683 int err;
684
685 if (ceph_snap(dir) != CEPH_NOSNAP)
686 return -EROFS;
687
688 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
689 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
690 if (IS_ERR(req)) {
691 d_drop(dentry);
692 return PTR_ERR(req);
693 }
694 req->r_dentry = dget(dentry);
695 req->r_num_caps = 2;
696 req->r_path2 = kstrdup(dest, GFP_NOFS);
697 req->r_locked_dir = dir;
698 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
699 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
700 err = ceph_mdsc_do_request(mdsc, dir, req);
701 if (!err && !req->r_reply_info.head->is_dentry)
702 err = ceph_handle_notrace_create(dir, dentry);
703 ceph_mdsc_put_request(req);
704 if (err)
705 d_drop(dentry);
706 return err;
707}
708
709static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
710{
711 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
712 struct ceph_mds_client *mdsc = &client->mdsc;
713 struct ceph_mds_request *req;
714 int err = -EROFS;
715 int op;
716
717 if (ceph_snap(dir) == CEPH_SNAPDIR) {
718 /* mkdir .snap/foo is a MKSNAP */
719 op = CEPH_MDS_OP_MKSNAP;
720 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
721 dentry->d_name.len, dentry->d_name.name, dentry);
722 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
723 dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
724 op = CEPH_MDS_OP_MKDIR;
725 } else {
726 goto out;
727 }
728 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
729 if (IS_ERR(req)) {
730 err = PTR_ERR(req);
731 goto out;
732 }
733
734 req->r_dentry = dget(dentry);
735 req->r_num_caps = 2;
736 req->r_locked_dir = dir;
737 req->r_args.mkdir.mode = cpu_to_le32(mode);
738 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
739 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
740 err = ceph_mdsc_do_request(mdsc, dir, req);
741 if (!err && !req->r_reply_info.head->is_dentry)
742 err = ceph_handle_notrace_create(dir, dentry);
743 ceph_mdsc_put_request(req);
744out:
745 if (err < 0)
746 d_drop(dentry);
747 return err;
748}
749
750static int ceph_link(struct dentry *old_dentry, struct inode *dir,
751 struct dentry *dentry)
752{
753 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
754 struct ceph_mds_client *mdsc = &client->mdsc;
755 struct ceph_mds_request *req;
756 int err;
757
758 if (ceph_snap(dir) != CEPH_NOSNAP)
759 return -EROFS;
760
761 dout("link in dir %p old_dentry %p dentry %p\n", dir,
762 old_dentry, dentry);
763 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
764 if (IS_ERR(req)) {
765 d_drop(dentry);
766 return PTR_ERR(req);
767 }
768 req->r_dentry = dget(dentry);
769 req->r_num_caps = 2;
770 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
771 req->r_locked_dir = dir;
772 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
773 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
774 err = ceph_mdsc_do_request(mdsc, dir, req);
775 if (err)
776 d_drop(dentry);
777 else if (!req->r_reply_info.head->is_dentry)
778 d_instantiate(dentry, igrab(old_dentry->d_inode));
779 ceph_mdsc_put_request(req);
780 return err;
781}
782
783/*
784 * For a soon-to-be unlinked file, drop the LINK caps. If it
785 * looks like the link count will hit 0, drop any other caps (other
786 * than PIN) we don't specifically want (due to the file still being
787 * open).
788 */
789static int drop_caps_for_unlink(struct inode *inode)
790{
791 struct ceph_inode_info *ci = ceph_inode(inode);
792 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
793
794 spin_lock(&inode->i_lock);
795 if (inode->i_nlink == 1) {
796 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
797 ci->i_ceph_flags |= CEPH_I_NODELAY;
798 }
799 spin_unlock(&inode->i_lock);
800 return drop;
801}
802
803/*
804 * rmdir and unlink differ only by the metadata op code
805 */
806static int ceph_unlink(struct inode *dir, struct dentry *dentry)
807{
808 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
809 struct ceph_mds_client *mdsc = &client->mdsc;
810 struct inode *inode = dentry->d_inode;
811 struct ceph_mds_request *req;
812 int err = -EROFS;
813 int op;
814
815 if (ceph_snap(dir) == CEPH_SNAPDIR) {
816 /* rmdir .snap/foo is RMSNAP */
817 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
818 dentry->d_name.name, dentry);
819 op = CEPH_MDS_OP_RMSNAP;
820 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
821 dout("unlink/rmdir dir %p dn %p inode %p\n",
822 dir, dentry, inode);
823 op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
824 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
825 } else
826 goto out;
827 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
828 if (IS_ERR(req)) {
829 err = PTR_ERR(req);
830 goto out;
831 }
832 req->r_dentry = dget(dentry);
833 req->r_num_caps = 2;
834 req->r_locked_dir = dir;
835 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
836 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
837 req->r_inode_drop = drop_caps_for_unlink(inode);
838 err = ceph_mdsc_do_request(mdsc, dir, req);
839 if (!err && !req->r_reply_info.head->is_dentry)
840 d_delete(dentry);
841 ceph_mdsc_put_request(req);
842out:
843 return err;
844}
845
846static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
847 struct inode *new_dir, struct dentry *new_dentry)
848{
849 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
850 struct ceph_mds_client *mdsc = &client->mdsc;
851 struct ceph_mds_request *req;
852 int err;
853
854 if (ceph_snap(old_dir) != ceph_snap(new_dir))
855 return -EXDEV;
856 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
857 ceph_snap(new_dir) != CEPH_NOSNAP)
858 return -EROFS;
859 dout("rename dir %p dentry %p to dir %p dentry %p\n",
860 old_dir, old_dentry, new_dir, new_dentry);
861 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
862 if (IS_ERR(req))
863 return PTR_ERR(req);
864 req->r_dentry = dget(new_dentry);
865 req->r_num_caps = 2;
866 req->r_old_dentry = dget(old_dentry);
867 req->r_locked_dir = new_dir;
868 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
869 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
870 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
871 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
872 /* release LINK_RDCACHE on source inode (mds will lock it) */
873 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
874 if (new_dentry->d_inode)
875 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
876 err = ceph_mdsc_do_request(mdsc, old_dir, req);
877 if (!err && !req->r_reply_info.head->is_dentry) {
878 /*
879 * Normally d_move() is done by fill_trace (called by
880 * do_request, above). If there is no trace, we need
881 * to do it here.
882 */
883 d_move(old_dentry, new_dentry);
884 }
885 ceph_mdsc_put_request(req);
886 return err;
887}
888
889
890/*
891 * Check if dentry lease is valid. If not, delete the lease. Try to
892 * renew if the lease is more than half up.
893 */
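/*
 * Illustrative timing only (hypothetical numbers; the real durations
 * come from the MDS): for a 30 second lease issued at time t,
 * lease_renew_after would sit near t + 15s.  A d_revalidate between
 * that point and expiry still reports the dentry as valid but also
 * sends CEPH_MDS_LEASE_RENEW on the lease session; once the lease or
 * session cap TTL has passed, this check simply reports the lease
 * invalid.
 */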
894static int dentry_lease_is_valid(struct dentry *dentry)
895{
896 struct ceph_dentry_info *di;
897 struct ceph_mds_session *s;
898 int valid = 0;
899 u32 gen;
900 unsigned long ttl;
901 struct ceph_mds_session *session = NULL;
902 struct inode *dir = NULL;
903 u32 seq = 0;
904
905 spin_lock(&dentry->d_lock);
906 di = ceph_dentry(dentry);
907 if (di && di->lease_session) {
908 s = di->lease_session;
909 spin_lock(&s->s_cap_lock);
910 gen = s->s_cap_gen;
911 ttl = s->s_cap_ttl;
912 spin_unlock(&s->s_cap_lock);
913
914 if (di->lease_gen == gen &&
915 time_before(jiffies, dentry->d_time) &&
916 time_before(jiffies, ttl)) {
917 valid = 1;
918 if (di->lease_renew_after &&
919 time_after(jiffies, di->lease_renew_after)) {
920 /* we should renew */
921 dir = dentry->d_parent->d_inode;
922 session = ceph_get_mds_session(s);
923 seq = di->lease_seq;
924 di->lease_renew_after = 0;
925 di->lease_renew_from = jiffies;
926 }
927 }
928 }
929 spin_unlock(&dentry->d_lock);
930
931 if (session) {
932 ceph_mdsc_lease_send_msg(session, dir, dentry,
933 CEPH_MDS_LEASE_RENEW, seq);
934 ceph_put_mds_session(session);
935 }
936 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
937 return valid;
938}
939
940/*
941 * Check if directory-wide content lease/cap is valid.
942 */
943static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
944{
945 struct ceph_inode_info *ci = ceph_inode(dir);
946 struct ceph_dentry_info *di = ceph_dentry(dentry);
947 int valid = 0;
948
949 spin_lock(&dir->i_lock);
950 if (ci->i_shared_gen == di->lease_shared_gen)
951 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
952 spin_unlock(&dir->i_lock);
953 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
954 dir, (unsigned)ci->i_shared_gen, dentry,
955 (unsigned)di->lease_shared_gen, valid);
956 return valid;
957}
958
959/*
960 * Check if cached dentry can be trusted.
961 */
962static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
963{
964 struct inode *dir = dentry->d_parent->d_inode;
965
966 dout("d_revalidate %p '%.*s' inode %p\n", dentry,
967 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
968
969 /* always trust cached snapped dentries, snapdir dentry */
970 if (ceph_snap(dir) != CEPH_NOSNAP) {
971 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
972 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
973 goto out_touch;
974 }
975 if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
976 goto out_touch;
977
978 if (dentry_lease_is_valid(dentry) ||
979 dir_lease_is_valid(dir, dentry))
980 goto out_touch;
981
982 dout("d_revalidate %p invalid\n", dentry);
983 d_drop(dentry);
984 return 0;
985out_touch:
986 ceph_dentry_lru_touch(dentry);
987 return 1;
988}
989
990/*
991 * When a dentry is released, clear the dir I_COMPLETE if it was part
992 * of the current dir gen.
993 */
994static void ceph_dentry_release(struct dentry *dentry)
995{
996 struct ceph_dentry_info *di = ceph_dentry(dentry);
997 struct inode *parent_inode = dentry->d_parent->d_inode;
998
999 if (parent_inode) {
1000 struct ceph_inode_info *ci = ceph_inode(parent_inode);
1001
1002 spin_lock(&parent_inode->i_lock);
1003 if (ci->i_shared_gen == di->lease_shared_gen) {
1004 dout(" clearing %p complete (d_release)\n",
1005 parent_inode);
1006 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1007 ci->i_release_count++;
1008 }
1009 spin_unlock(&parent_inode->i_lock);
1010 }
1011 if (di) {
1012 ceph_dentry_lru_del(dentry);
1013 if (di->lease_session)
1014 ceph_put_mds_session(di->lease_session);
1015 kmem_cache_free(ceph_dentry_cachep, di);
1016 dentry->d_fsdata = NULL;
1017 }
1018}
1019
1020static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1021 struct nameidata *nd)
1022{
1023 /*
1024 * Eventually, we'll want to revalidate snapped metadata
1025 * too... probably...
1026 */
1027 return 1;
1028}
1029
1030
1031
1032/*
1033 * read() on a dir. This weird interface hack only works if mounted
1034 * with '-o dirstat'.
1035 */
1036static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1037 loff_t *ppos)
1038{
1039 struct ceph_file_info *cf = file->private_data;
1040 struct inode *inode = file->f_dentry->d_inode;
1041 struct ceph_inode_info *ci = ceph_inode(inode);
1042 int left;
1043
1044 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
1045 return -EISDIR;
1046
1047 if (!cf->dir_info) {
1048 cf->dir_info = kmalloc(1024, GFP_NOFS);
1049 if (!cf->dir_info)
1050 return -ENOMEM;
1051 cf->dir_info_len =
1052 sprintf(cf->dir_info,
1053 "entries: %20lld\n"
1054 " files: %20lld\n"
1055 " subdirs: %20lld\n"
1056 "rentries: %20lld\n"
1057 " rfiles: %20lld\n"
1058 " rsubdirs: %20lld\n"
1059 "rbytes: %20lld\n"
1060 "rctime: %10ld.%09ld\n",
1061 ci->i_files + ci->i_subdirs,
1062 ci->i_files,
1063 ci->i_subdirs,
1064 ci->i_rfiles + ci->i_rsubdirs,
1065 ci->i_rfiles,
1066 ci->i_rsubdirs,
1067 ci->i_rbytes,
1068 (long)ci->i_rctime.tv_sec,
1069 (long)ci->i_rctime.tv_nsec);
1070 }
1071
1072 if (*ppos >= cf->dir_info_len)
1073 return 0;
1074 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1075 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1076 if (left == size)
1077 return -EFAULT;
1078 *ppos += (size - left);
1079 return size - left;
1080}
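/*
 * Usage sketch (assumes the filesystem was mounted with -o dirstat):
 * a plain read(2) on a directory fd, e.g. running `cat` on the
 * directory, returns the formatted rstat text built above.  Without
 * the DIRSTAT option the early -EISDIR return applies, matching the
 * usual behavior of read() on a directory.
 */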
1081
1082/*
1083 * an fsync() on a dir will wait for any uncommitted directory
1084 * operations to commit.
1085 */
1086static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
1087 int datasync)
1088{
1089 struct inode *inode = dentry->d_inode;
1090 struct ceph_inode_info *ci = ceph_inode(inode);
1091 struct list_head *head = &ci->i_unsafe_dirops;
1092 struct ceph_mds_request *req;
1093 u64 last_tid;
1094 int ret = 0;
1095
1096 dout("dir_fsync %p\n", inode);
1097 spin_lock(&ci->i_unsafe_lock);
1098 if (list_empty(head))
1099 goto out;
1100
1101 req = list_entry(head->prev,
1102 struct ceph_mds_request, r_unsafe_dir_item);
1103 last_tid = req->r_tid;
1104
1105 do {
1106 ceph_mdsc_get_request(req);
1107 spin_unlock(&ci->i_unsafe_lock);
1108 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1109 inode, req->r_tid, last_tid);
1110 if (req->r_timeout) {
1111 ret = wait_for_completion_timeout(
1112 &req->r_safe_completion, req->r_timeout);
1113 if (ret > 0)
1114 ret = 0;
1115 else if (ret == 0)
1116 ret = -EIO; /* timed out */
1117 } else {
1118 wait_for_completion(&req->r_safe_completion);
1119 }
1120 spin_lock(&ci->i_unsafe_lock);
1121 ceph_mdsc_put_request(req);
1122
1123 if (ret || list_empty(head))
1124 break;
1125 req = list_entry(head->next,
1126 struct ceph_mds_request, r_unsafe_dir_item);
1127 } while (req->r_tid < last_tid);
1128out:
1129 spin_unlock(&ci->i_unsafe_lock);
1130 return ret;
1131}
1132
1133/*
1134 * We maintain a private dentry LRU.
1135 *
1136 * FIXME: this needs to be changed to a per-mds lru to be useful.
1137 */
1138void ceph_dentry_lru_add(struct dentry *dn)
1139{
1140 struct ceph_dentry_info *di = ceph_dentry(dn);
1141 struct ceph_mds_client *mdsc;
1142
1143 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1144 dn->d_name.len, dn->d_name.name);
1145 if (di) {
1146 mdsc = &ceph_client(dn->d_sb)->mdsc;
1147 spin_lock(&mdsc->dentry_lru_lock);
1148 list_add_tail(&di->lru, &mdsc->dentry_lru);
1149 mdsc->num_dentry++;
1150 spin_unlock(&mdsc->dentry_lru_lock);
1151 }
1152}
1153
1154void ceph_dentry_lru_touch(struct dentry *dn)
1155{
1156 struct ceph_dentry_info *di = ceph_dentry(dn);
1157 struct ceph_mds_client *mdsc;
1158
1159 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
1160 dn->d_name.len, dn->d_name.name);
1161 if (di) {
1162 mdsc = &ceph_client(dn->d_sb)->mdsc;
1163 spin_lock(&mdsc->dentry_lru_lock);
1164 list_move_tail(&di->lru, &mdsc->dentry_lru);
1165 spin_unlock(&mdsc->dentry_lru_lock);
1166 }
1167}
1168
1169void ceph_dentry_lru_del(struct dentry *dn)
1170{
1171 struct ceph_dentry_info *di = ceph_dentry(dn);
1172 struct ceph_mds_client *mdsc;
1173
1174 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1175 dn->d_name.len, dn->d_name.name);
1176 if (di) {
1177 mdsc = &ceph_client(dn->d_sb)->mdsc;
1178 spin_lock(&mdsc->dentry_lru_lock);
1179 list_del_init(&di->lru);
1180 mdsc->num_dentry--;
1181 spin_unlock(&mdsc->dentry_lru_lock);
1182 }
1183}
1184
1185const struct file_operations ceph_dir_fops = {
1186 .read = ceph_read_dir,
1187 .readdir = ceph_readdir,
1188 .llseek = ceph_dir_llseek,
1189 .open = ceph_open,
1190 .release = ceph_release,
1191 .unlocked_ioctl = ceph_ioctl,
1192 .fsync = ceph_dir_fsync,
1193};
1194
1195const struct inode_operations ceph_dir_iops = {
1196 .lookup = ceph_lookup,
1197 .permission = ceph_permission,
1198 .getattr = ceph_getattr,
1199 .setattr = ceph_setattr,
1200 .setxattr = ceph_setxattr,
1201 .getxattr = ceph_getxattr,
1202 .listxattr = ceph_listxattr,
1203 .removexattr = ceph_removexattr,
1204 .mknod = ceph_mknod,
1205 .symlink = ceph_symlink,
1206 .mkdir = ceph_mkdir,
1207 .link = ceph_link,
1208 .unlink = ceph_unlink,
1209 .rmdir = ceph_unlink,
1210 .rename = ceph_rename,
1211 .create = ceph_create,
1212};
1213
1214struct dentry_operations ceph_dentry_ops = {
1215 .d_revalidate = ceph_d_revalidate,
1216 .d_release = ceph_dentry_release,
1217};
1218
1219struct dentry_operations ceph_snapdir_dentry_ops = {
1220 .d_revalidate = ceph_snapdir_d_revalidate,
1221};
1222
1223struct dentry_operations ceph_snap_dentry_ops = {
1224};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..9d67572fb328
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,224 @@
1#include "ceph_debug.h"
2
3#include <linux/exportfs.h>
4#include <linux/slab.h>
5#include <asm/unaligned.h>
6
7#include "super.h"
8
9/*
10 * NFS export support
11 *
12 * NFS re-export of a ceph mount is, at present, only semireliable.
13 * The basic issue is that the Ceph architecture doesn't lend itself
14 * well to generating filehandles that will remain valid forever.
15 *
16 * So, we do our best. If you're lucky, your inode will be in the
17 * client's cache. If it's not, and you have a connectable fh, then
18 * the MDS server may be able to find it for you. Otherwise, you get
19 * ESTALE.
20 *
21 * There are ways to make this more reliable, but in the non-connectable fh
22 * case, we won't ever work perfectly, and in the connectable case,
23 * some changes are needed on the MDS side to work better.
24 */
25
26/*
27 * Basic fh
28 */
29struct ceph_nfs_fh {
30 u64 ino;
31} __attribute__ ((packed));
32
33/*
34 * Larger 'connectable' fh that includes parent ino and name hash.
35 * Use this whenever possible, as it works more reliably.
36 */
37struct ceph_nfs_confh {
38 u64 ino, parent_ino;
39 u32 parent_name_hash;
40} __attribute__ ((packed));
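/*
 * Size note: with the packed attribute, struct ceph_nfs_fh is 8 bytes
 * and struct ceph_nfs_confh is 20 bytes.  ceph_encode_fh() below hands
 * back the connectable form (type 2) only when the caller's buffer is
 * large enough for it, and otherwise falls back to the ino-only form
 * (type 1) unless a connectable fh was explicitly required.
 */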
41
42static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
43 int connectable)
44{
45 struct ceph_nfs_fh *fh = (void *)rawfh;
46 struct ceph_nfs_confh *cfh = (void *)rawfh;
47 struct dentry *parent = dentry->d_parent;
48 struct inode *inode = dentry->d_inode;
49 int type;
50
51 /* don't re-export snaps */
52 if (ceph_snap(inode) != CEPH_NOSNAP)
53 return -EINVAL;
54
55 if (*max_len >= sizeof(*cfh)) {
56 dout("encode_fh %p connectable\n", dentry);
57 cfh->ino = ceph_ino(dentry->d_inode);
58 cfh->parent_ino = ceph_ino(parent->d_inode);
59 cfh->parent_name_hash = parent->d_name.hash;
60 *max_len = sizeof(*cfh);
61 type = 2;
62 } else if (*max_len > sizeof(*fh)) {
63 if (connectable)
64 return -ENOSPC;
65 dout("encode_fh %p\n", dentry);
66 fh->ino = ceph_ino(dentry->d_inode);
67 *max_len = sizeof(*fh);
68 type = 1;
69 } else {
70 return -ENOSPC;
71 }
72 return type;
73}
74
75/*
76 * convert regular fh to dentry
77 *
78 * FIXME: we should try harder by querying the mds for the ino.
79 */
80static struct dentry *__fh_to_dentry(struct super_block *sb,
81 struct ceph_nfs_fh *fh)
82{
83 struct inode *inode;
84 struct dentry *dentry;
85 struct ceph_vino vino;
86 int err;
87
88 dout("__fh_to_dentry %llx\n", fh->ino);
89 vino.ino = fh->ino;
90 vino.snap = CEPH_NOSNAP;
91 inode = ceph_find_inode(sb, vino);
92 if (!inode)
93 return ERR_PTR(-ESTALE);
94
95 dentry = d_obtain_alias(inode);
96 if (!dentry) {
97 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
98 fh->ino, inode);
99 iput(inode);
100 return ERR_PTR(-ENOMEM);
101 }
102 err = ceph_init_dentry(dentry);
103
104 if (err < 0) {
105 iput(inode);
106 return ERR_PTR(err);
107 }
108 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
109 return dentry;
110}
111
112/*
113 * convert connectable fh to dentry
114 */
115static struct dentry *__cfh_to_dentry(struct super_block *sb,
116 struct ceph_nfs_confh *cfh)
117{
118 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
119 struct inode *inode;
120 struct dentry *dentry;
121 struct ceph_vino vino;
122 int err;
123
124 dout("__cfh_to_dentry %llx (%llx/%x)\n",
125 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
126
127 vino.ino = cfh->ino;
128 vino.snap = CEPH_NOSNAP;
129 inode = ceph_find_inode(sb, vino);
130 if (!inode) {
131 struct ceph_mds_request *req;
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS);
135 if (IS_ERR(req))
136 return ERR_PTR(PTR_ERR(req));
137
138 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino;
140 req->r_ino2.snap = CEPH_NOSNAP;
141 req->r_path2 = kmalloc(16, GFP_NOFS);
142 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
143 req->r_num_caps = 1;
144 err = ceph_mdsc_do_request(mdsc, NULL, req);
145 ceph_mdsc_put_request(req);
146 inode = ceph_find_inode(sb, vino);
147 if (!inode)
148 return ERR_PTR(err ? err : -ESTALE);
149 }
150
151 dentry = d_obtain_alias(inode);
152 if (!dentry) {
153 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
154 cfh->ino, inode);
155 iput(inode);
156 return ERR_PTR(-ENOMEM);
157 }
158 err = ceph_init_dentry(dentry);
159 if (err < 0) {
160 iput(inode);
161 return ERR_PTR(err);
162 }
163 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
164 return dentry;
165}
166
167static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
168 int fh_len, int fh_type)
169{
170 if (fh_type == 1)
171 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
172 else
173 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
174}
175
176/*
177 * get parent, if possible.
178 *
179 * FIXME: we could do better by querying the mds to discover the
180 * parent.
181 */
182static struct dentry *ceph_fh_to_parent(struct super_block *sb,
183 struct fid *fid,
184 int fh_len, int fh_type)
185{
186 struct ceph_nfs_confh *cfh = (void *)fid->raw;
187 struct ceph_vino vino;
188 struct inode *inode;
189 struct dentry *dentry;
190 int err;
191
192 if (fh_type == 1)
193 return ERR_PTR(-ESTALE);
194
195 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
196 cfh->parent_name_hash);
197
198 vino.ino = cfh->ino;
199 vino.snap = CEPH_NOSNAP;
200 inode = ceph_find_inode(sb, vino);
201 if (!inode)
202 return ERR_PTR(-ESTALE);
203
204 dentry = d_obtain_alias(inode);
205 if (!dentry) {
206 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
207 cfh->ino, inode);
208 iput(inode);
209 return ERR_PTR(-ENOMEM);
210 }
211 err = ceph_init_dentry(dentry);
212 if (err < 0) {
213 iput(inode);
214 return ERR_PTR(err);
215 }
216 dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
217 return dentry;
218}
219
220const struct export_operations ceph_export_ops = {
221 .encode_fh = ceph_encode_fh,
222 .fh_to_dentry = ceph_fh_to_dentry,
223 .fh_to_parent = ceph_fh_to_parent,
224};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..4add3d5da2c1
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,938 @@
1#include "ceph_debug.h"
2
3#include <linux/sched.h>
4#include <linux/slab.h>
5#include <linux/file.h>
6#include <linux/namei.h>
7#include <linux/writeback.h>
8
9#include "super.h"
10#include "mds_client.h"
11
12/*
13 * Ceph file operations
14 *
15 * Implement basic open/close functionality, and implement
16 * read/write.
17 *
18 * We implement three modes of file I/O:
19 * - buffered uses the generic_file_aio_{read,write} helpers
20 *
21 * - synchronous is used when there is multi-client read/write
22 * sharing, avoids the page cache, and synchronously waits for an
23 * ack from the OSD.
24 *
25 * - direct io takes the variant of the sync path that references
26 * user pages directly.
27 *
28 * fsync() flushes and waits on dirty pages, but just queues metadata
29 * for writeback: since the MDS can recover size and mtime there is no
30 * need to wait for MDS acknowledgement.
31 */
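/*
 * A rough, simplified sketch of how the I/O mode is chosen (see
 * ceph_aio_read() and ceph_aio_write() below for the real checks):
 *
 *	if (O_DIRECT, a sync mount, or the FILE_CACHE / FILE_BUFFER
 *	    caps are not held)
 *		-> ceph_sync_read() / ceph_sync_write()
 *	else
 *		-> generic_file_aio_read() / generic_file_aio_write()
 *
 * The O_DIRECT variant of the sync path additionally pins the caller's
 * user pages (get_direct_page_vector) instead of bouncing through a
 * freshly allocated page vector.
 */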
32
33
34/*
35 * Prepare an open request. Preallocate ceph_cap to avoid an
36 * inopportune ENOMEM later.
37 */
38static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{
41 struct ceph_client *client = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc;
43 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
46
47 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
48 want_auth = USE_AUTH_MDS;
49
50 req = ceph_mdsc_create_request(mdsc, op, want_auth);
51 if (IS_ERR(req))
52 goto out;
53 req->r_fmode = ceph_flags_to_mode(flags);
54 req->r_args.open.flags = cpu_to_le32(flags);
55 req->r_args.open.mode = cpu_to_le32(create_mode);
56 req->r_args.open.preferred = cpu_to_le32(-1);
57out:
58 return req;
59}
60
61/*
62 * initialize private struct file data.
63 * if we fail, clean up by dropping fmode reference on the ceph_inode
64 */
65static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
66{
67 struct ceph_file_info *cf;
68 int ret = 0;
69
70 switch (inode->i_mode & S_IFMT) {
71 case S_IFREG:
72 case S_IFDIR:
73 dout("init_file %p %p 0%o (regular)\n", inode, file,
74 inode->i_mode);
75 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
76 if (cf == NULL) {
77 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
78 return -ENOMEM;
79 }
80 cf->fmode = fmode;
81 cf->next_offset = 2;
82 file->private_data = cf;
83 BUG_ON(inode->i_fop->release != ceph_release);
84 break;
85
86 case S_IFLNK:
87 dout("init_file %p %p 0%o (symlink)\n", inode, file,
88 inode->i_mode);
89 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
90 break;
91
92 default:
93 dout("init_file %p %p 0%o (special)\n", inode, file,
94 inode->i_mode);
95 /*
96 * we need to drop the open ref now, since we don't
97 * have .release set to ceph_release.
98 */
99 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
100 BUG_ON(inode->i_fop->release == ceph_release);
101
102 /* call the proper open fop */
103 ret = inode->i_fop->open(inode, file);
104 }
105 return ret;
106}
107
108/*
109 * If the filp already has private_data, that means the file was
110 * already opened by intent during lookup, and we do nothing.
111 *
112 * If we already have the requisite capabilities, we can satisfy
113 * the open request locally (no need to request new caps from the
114 * MDS). We do, however, need to inform the MDS (asynchronously)
115 * if our wanted caps set expands.
116 */
117int ceph_open(struct inode *inode, struct file *file)
118{
119 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc;
122 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
125 int err;
126 int flags, fmode, wanted;
127
128 if (cf) {
129 dout("open file %p is already opened\n", file);
130 return 0;
131 }
132
133 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
134 flags = file->f_flags & ~(O_CREAT|O_EXCL);
135 if (S_ISDIR(inode->i_mode))
136 flags = O_DIRECTORY; /* mds likes to know */
137
138 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
139 ceph_vinop(inode), file, flags, file->f_flags);
140 fmode = ceph_flags_to_mode(flags);
141 wanted = ceph_caps_for_mode(fmode);
142
143 /* snapped files are read-only */
144 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
145 return -EROFS;
146
147 /* trivially open snapdir */
148 if (ceph_snap(inode) == CEPH_SNAPDIR) {
149 spin_lock(&inode->i_lock);
150 __ceph_get_fmode(ci, fmode);
151 spin_unlock(&inode->i_lock);
152 return ceph_init_file(inode, file, fmode);
153 }
154
155 /*
156 * No need to block if we have any caps. Update wanted set
157 * asynchronously.
158 */
159 spin_lock(&inode->i_lock);
160 if (__ceph_is_any_real_caps(ci)) {
161 int mds_wanted = __ceph_caps_mds_wanted(ci);
162 int issued = __ceph_caps_issued(ci, NULL);
163
164 dout("open %p fmode %d want %s issued %s using existing\n",
165 inode, fmode, ceph_cap_string(wanted),
166 ceph_cap_string(issued));
167 __ceph_get_fmode(ci, fmode);
168 spin_unlock(&inode->i_lock);
169
170 /* adjust wanted? */
171 if ((issued & wanted) != wanted &&
172 (mds_wanted & wanted) != wanted &&
173 ceph_snap(inode) != CEPH_SNAPDIR)
174 ceph_check_caps(ci, 0, NULL);
175
176 return ceph_init_file(inode, file, fmode);
177 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
178 (ci->i_snap_caps & wanted) == wanted) {
179 __ceph_get_fmode(ci, fmode);
180 spin_unlock(&inode->i_lock);
181 return ceph_init_file(inode, file, fmode);
182 }
183 spin_unlock(&inode->i_lock);
184
185 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
186 req = prepare_open_request(inode->i_sb, flags, 0);
187 if (IS_ERR(req)) {
188 err = PTR_ERR(req);
189 goto out;
190 }
191 req->r_inode = igrab(inode);
192 req->r_num_caps = 1;
193 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
194 if (!err)
195 err = ceph_init_file(inode, file, req->r_fmode);
196 ceph_mdsc_put_request(req);
197 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
198out:
199 return err;
200}
201
202
203/*
204 * Do a lookup + open with a single request.
205 *
206 * If this succeeds, but some subsequent check in the vfs
207 * may_open() fails, the struct *file gets cleaned up (i.e.
208 * ceph_release gets called). So fear not!
209 */
210/*
211 * flags
212 * path_lookup_open -> LOOKUP_OPEN
213 * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
214 */
215struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode,
217 int locked_dir)
218{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc;
221 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req;
224 int err;
225 int flags = nd->intent.open.flags - 1; /* silly vfs! */
226
227 dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
228 dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
229
230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req))
233 return ERR_PTR(PTR_ERR(req));
234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2;
236 if (flags & O_CREAT) {
237 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
238 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
239 }
240 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
241 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
242 dentry = ceph_finish_lookup(req, dentry, err);
243 if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
244 err = ceph_handle_notrace_create(dir, dentry);
245 if (!err)
246 err = ceph_init_file(req->r_dentry->d_inode, file,
247 req->r_fmode);
248 ceph_mdsc_put_request(req);
249 dout("ceph_lookup_open result=%p\n", dentry);
250 return dentry;
251}
252
253int ceph_release(struct inode *inode, struct file *file)
254{
255 struct ceph_inode_info *ci = ceph_inode(inode);
256 struct ceph_file_info *cf = file->private_data;
257
258 dout("release inode %p file %p\n", inode, file);
259 ceph_put_fmode(ci, cf->fmode);
260 if (cf->last_readdir)
261 ceph_mdsc_put_request(cf->last_readdir);
262 kfree(cf->last_name);
263 kfree(cf->dir_info);
264 dput(cf->dentry);
265 kmem_cache_free(ceph_file_cachep, cf);
266
267 /* wake up anyone waiting for caps on this inode */
268 wake_up(&ci->i_cap_wq);
269 return 0;
270}
271
272/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
318 * allocate a vector of new pages
319 */
320static struct page **alloc_page_vector(int num_pages)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = alloc_page(GFP_NOFS);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
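/*
 * Worked example (assuming 4 KB pages): for off = 5000, po starts at
 * 5000 % 4096 = 904, so the first iteration copies at most
 * 4096 - 904 = 3192 bytes into pages[0]; po then wraps to 0 and the
 * copy continues at the start of pages[1].  The same offset arithmetic
 * is used by copy_page_vector_to_user() below.
 */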
365
366/*
367 * copy data from a page vector into a user pointer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.)
432 *
433 * If we get a short result from the OSD, check against i_size; we need to
434 * only return a short read to the caller if we hit EOF.
435 */
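/*
 * Worked example (assuming a layout of 4 MB objects, a common default;
 * the real values come from ci->i_layout): a 6 MB read starting at
 * file offset 2 MB is clamped to the 2 MB left in the first object,
 * notices it hit a stripe boundary, and loops via the 'more' label to
 * read the remaining 4 MB from the next object.  If an OSD returns a
 * short read but the requested range lies entirely within i_size, the
 * missing tail is zero-filled; otherwise *checkeof is set so the
 * caller can re-check the file size against EOF.
 */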
436static int striped_read(struct inode *inode,
437 u64 off, u64 len,
438 struct page **pages, int num_pages,
439 int *checkeof)
440{
441 struct ceph_client *client = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
445 int left, pages_left;
446 int read;
447 struct page **page_pos;
448 int ret;
449 bool hit_stripe, was_short;
450
451 /*
452 * we may need to do multiple reads. not atomic, unfortunately.
453 */
454 pos = off;
455 left = len;
456 page_pos = pages;
457 pages_left = num_pages;
458 read = 0;
459
460more:
461 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq,
465 ci->i_truncate_size,
466 page_pos, pages_left);
467 hit_stripe = this_len < left;
468 was_short = ret >= 0 && ret < this_len;
469 if (ret == -ENOENT)
470 ret = 0;
471 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
472 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
473
474 if (ret > 0) {
475 int didpages =
476 ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
477
478 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read,
481 pos - off - read, pages);
482 }
483 pos += ret;
484 read = pos - off;
485 left -= ret;
486 page_pos += didpages;
487 pages_left -= didpages;
488
489 /* hit stripe? */
490 if (left && hit_stripe)
491 goto more;
492 }
493
494 if (was_short) {
495 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) {
497 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read,
499 pages);
500 read = len;
501 goto out;
502 }
503
504 /* check i_size */
505 *checkeof = 1;
506 }
507
508out:
509 if (ret >= 0)
510 ret = read;
511 dout("striped_read returns %d\n", ret);
512 return ret;
513}
514
515/*
516 * Completely synchronous read and write methods. Direct from __user
517 * buffer to osd, or directly to user pages (if O_DIRECT).
518 *
519 * If the read spans object boundary, just do multiple reads.
520 */
521static ssize_t ceph_sync_read(struct file *file, char __user *data,
522 unsigned len, loff_t *poff, int *checkeof)
523{
524 struct inode *inode = file->f_dentry->d_inode;
525 struct page **pages;
526 u64 off = *poff;
527 int num_pages = calc_pages_for(off, len);
528 int ret;
529
530 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532
533 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len);
535
536 /*
537 * flush any page cache pages in this range. this
538 * will make concurrent normal and O_DIRECT io slow,
539 * but it will at least behave sensibly when they are
540 * in sequence.
541 */
542 } else {
543 pages = alloc_page_vector(num_pages);
544 }
545 if (IS_ERR(pages))
546 return PTR_ERR(pages);
547
548 ret = filemap_write_and_wait(inode->i_mapping);
549 if (ret < 0)
550 goto done;
551
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0)
557 *poff = off + ret;
558
559done:
560 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages);
562 else
563 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret);
565 return ret;
566}
567
568/*
569 * Write commit callback, called if we requested both an ACK and
570 * ONDISK commit reply from the OSD.
571 */
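/*
 * Rough sketch of the two-reply scheme (see ceph_sync_write() below):
 * for a buffered (non-O_SYNC, non-O_DIRECT) sync write the OSD request
 * asks for both an ACK and an ONDISK commit.  The initial ACK lets the
 * write return to the caller; this callback then runs on the later
 * ONDISK reply, removing the request from the inode's unsafe-write
 * list and dropping the FILE_WR cap reference taken when the request
 * was queued.
 */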
572static void sync_write_commit(struct ceph_osd_request *req,
573 struct ceph_msg *msg)
574{
575 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
576
577 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
578 spin_lock(&ci->i_unsafe_lock);
579 list_del_init(&req->r_unsafe_item);
580 spin_unlock(&ci->i_unsafe_lock);
581 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
582}
583
584/*
585 * Synchronous write, straight from __user pointer or user pages (if
586 * O_DIRECT).
587 *
588 * If write spans object boundary, just do multiple writes. (For a
589 * correct atomic write, we should e.g. take write locks on all
590 * objects, rollback on failure, etc.)
591 */
592static ssize_t ceph_sync_write(struct file *file, const char __user *data,
593 size_t left, loff_t *offset)
594{
595 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req;
599 struct page **pages;
600 int num_pages;
601 long long unsigned pos;
602 u64 len;
603 int written = 0;
604 int flags;
605 int do_sync = 0;
606 int check_caps = 0;
607 int ret;
608 struct timespec mtime = CURRENT_TIME;
609
610 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
611 return -EROFS;
612
613 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
614 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
615
616 if (file->f_flags & O_APPEND)
617 pos = i_size_read(inode);
618 else
619 pos = *offset;
620
621 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
622 if (ret < 0)
623 return ret;
624
625 ret = invalidate_inode_pages2_range(inode->i_mapping,
626 pos >> PAGE_CACHE_SHIFT,
627 (pos + left) >> PAGE_CACHE_SHIFT);
628 if (ret < 0)
629 dout("invalidate_inode_pages2_range returned %d\n", ret);
630
631 flags = CEPH_OSD_FLAG_ORDERSNAP |
632 CEPH_OSD_FLAG_ONDISK |
633 CEPH_OSD_FLAG_WRITE;
634 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
635 flags |= CEPH_OSD_FLAG_ACK;
636 else
637 do_sync = 1;
638
639 /*
640 * we may need to do multiple writes here if we span an object
641 * boundary. this isn't atomic, unfortunately. :(
642 */
643more:
644 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context,
649 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2);
652 if (IS_ERR(req))
653 return PTR_ERR(req);
654
655 num_pages = calc_pages_for(pos, len);
656
657 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages);
661 goto out;
662 }
663
664 /*
665 * throw out any page cache pages in this range. this
666 * may block.
667 */
668 truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
669 } else {
670 pages = alloc_page_vector(num_pages);
671 if (IS_ERR(pages)) {
672 ret = PTR_ERR(pages);
673 goto out;
674 }
675 ret = copy_user_to_page_vector(pages, data, pos, len);
676 if (ret < 0) {
677 ceph_release_page_vector(pages, num_pages);
678 goto out;
679 }
680
681 if ((file->f_flags & O_SYNC) == 0) {
682 /* get a second commit callback */
683 req->r_safe_callback = sync_write_commit;
684 req->r_own_pages = 1;
685 }
686 }
687 req->r_pages = pages;
688 req->r_num_pages = num_pages;
689 req->r_inode = inode;
690
691 ret = ceph_osdc_start_request(&client->osdc, req, false);
692 if (!ret) {
693 if (req->r_safe_callback) {
694 /*
695 * Add to inode unsafe list only after we
696 * start_request so that a tid has been assigned.
697 */
698 spin_lock(&ci->i_unsafe_lock);
699			list_add(&req->r_unsafe_item, &ci->i_unsafe_writes);
700 spin_unlock(&ci->i_unsafe_lock);
701 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
702 }
703 ret = ceph_osdc_wait_request(&client->osdc, req);
704 }
705
706 if (file->f_flags & O_DIRECT)
707 put_page_vector(pages, num_pages);
708 else if (file->f_flags & O_SYNC)
709 ceph_release_page_vector(pages, num_pages);
710
711out:
712 ceph_osdc_put_request(req);
713 if (ret == 0) {
714 pos += len;
715 written += len;
716 left -= len;
717 if (left)
718 goto more;
719
720 ret = written;
721 *offset = pos;
722 if (pos > i_size_read(inode))
723 check_caps = ceph_inode_set_size(inode, pos);
724 if (check_caps)
725 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
726 NULL);
727 }
728 return ret;
729}
730
731/*
732 * Wrap generic_file_aio_read with checks for cap bits on the inode.
733 * Atomically grab references, so that those bits are not released
734 * back to the MDS mid-read.
735 *
736 * Hmm, the sync read case isn't actually async... should it be?
737 */
738static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
739 unsigned long nr_segs, loff_t pos)
740{
741 struct file *filp = iocb->ki_filp;
742 loff_t *ppos = &iocb->ki_pos;
743 size_t len = iov->iov_len;
744 struct inode *inode = filp->f_dentry->d_inode;
745 struct ceph_inode_info *ci = ceph_inode(inode);
746 void *base = iov->iov_base;
747 ssize_t ret;
748 int got = 0;
749 int checkeof = 0, read = 0;
750
751 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
752 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
753again:
754 __ceph_do_pending_vmtruncate(inode);
755 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
756 &got, -1);
757 if (ret < 0)
758 goto out;
759 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
760 inode, ceph_vinop(inode), pos, (unsigned)len,
761 ceph_cap_string(got));
762
763 if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
764 (iocb->ki_filp->f_flags & O_DIRECT) ||
765 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
766 /* hmm, this isn't really async... */
767 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
768 else
769 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
770
771out:
772 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
773 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
774 ceph_put_cap_refs(ci, got);
775
776 if (checkeof && ret >= 0) {
777 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
778
779 /* hit EOF or hole? */
780 if (statret == 0 && *ppos < inode->i_size) {
781 dout("aio_read sync_read hit hole, reading more\n");
782 read += ret;
783 base += ret;
784 len -= ret;
785 checkeof = 0;
786 goto again;
787 }
788 }
789 if (ret >= 0)
790 ret += read;
791
792 return ret;
793}
794
795/*
796 * Take cap references to avoid releasing caps to MDS mid-write.
797 *
798 * If we are synchronous, and write with an old snap context, the OSD
799 * may return EOLDSNAPC. In that case, retry the write.. _after_
800 * dropping our cap refs and allowing the pending snap to logically
801 * complete _before_ this write occurs.
802 *
803 * If we are near ENOSPC, write synchronously.
804 */
805static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
806 unsigned long nr_segs, loff_t pos)
807{
808 struct file *file = iocb->ki_filp;
809 struct inode *inode = file->f_dentry->d_inode;
810 struct ceph_inode_info *ci = ceph_inode(inode);
811 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
812 loff_t endoff = pos + iov->iov_len;
813 int got = 0;
814 int ret, err;
815
816 if (ceph_snap(inode) != CEPH_NOSNAP)
817 return -EROFS;
818
819retry_snap:
820 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
821 return -ENOSPC;
822 __ceph_do_pending_vmtruncate(inode);
823 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
824 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
825 inode->i_size);
826 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
827 &got, endoff);
828 if (ret < 0)
829 goto out;
830
831 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
832 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
833 ceph_cap_string(got));
834
835 if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
836 (iocb->ki_filp->f_flags & O_DIRECT) ||
837 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
838 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
839 &iocb->ki_pos);
840 } else {
841 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
842
843 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
844 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
845 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
846 err = vfs_fsync_range(file, file->f_path.dentry,
847 pos, pos + ret - 1, 1);
848 if (err < 0)
849 ret = err;
850 }
851 }
852 if (ret >= 0) {
853 spin_lock(&inode->i_lock);
854 __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
855 spin_unlock(&inode->i_lock);
856 }
857
858out:
859 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
860 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
861 ceph_cap_string(got));
862 ceph_put_cap_refs(ci, got);
863
864 if (ret == -EOLDSNAPC) {
865 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
866 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
867 goto retry_snap;
868 }
869
870 return ret;
871}
872
873/*
874 * llseek. be sure to verify file size on SEEK_END.
875 */
876static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
877{
878 struct inode *inode = file->f_mapping->host;
879 int ret;
880
881 mutex_lock(&inode->i_mutex);
882 __ceph_do_pending_vmtruncate(inode);
883 switch (origin) {
884 case SEEK_END:
885 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
886 if (ret < 0) {
887 offset = ret;
888 goto out;
889 }
890 offset += inode->i_size;
891 break;
892 case SEEK_CUR:
893 /*
894 * Here we special-case the lseek(fd, 0, SEEK_CUR)
895 * position-querying operation. Avoid rewriting the "same"
896 * f_pos value back to the file because a concurrent read(),
897 * write() or lseek() might have altered it
898 */
899 if (offset == 0) {
900 offset = file->f_pos;
901 goto out;
902 }
903 offset += file->f_pos;
904 break;
905 }
906
907 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
908 offset = -EINVAL;
909 goto out;
910 }
911
912 /* Special lock needed here? */
913 if (offset != file->f_pos) {
914 file->f_pos = offset;
915 file->f_version = 0;
916 }
917
918out:
919 mutex_unlock(&inode->i_mutex);
920 return offset;
921}
922
923const struct file_operations ceph_file_fops = {
924 .open = ceph_open,
925 .release = ceph_release,
926 .llseek = ceph_llseek,
927 .read = do_sync_read,
928 .write = do_sync_write,
929 .aio_read = ceph_aio_read,
930 .aio_write = ceph_aio_write,
931 .mmap = ceph_mmap,
932 .fsync = ceph_fsync,
933 .splice_read = generic_file_splice_read,
934 .splice_write = generic_file_splice_write,
935 .unlocked_ioctl = ceph_ioctl,
936 .compat_ioctl = ceph_ioctl,
937};
938
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..26f883c275e8
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1774 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
10#include <linux/namei.h>
11#include <linux/writeback.h>
12#include <linux/vmalloc.h>
13#include <linux/pagevec.h>
14
15#include "super.h"
16#include "decode.h"
17
18/*
19 * Ceph inode operations
20 *
21 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
22 * setattr, etc.), xattr helpers, and helpers for assimilating
23 * metadata returned by the MDS into our cache.
24 *
25 * Also define helpers for doing asynchronous writeback, invalidation,
26 * and truncation for the benefit of those who can't afford to block
27 * (typically because they are in the message handler path).
28 */
29
30static const struct inode_operations ceph_symlink_iops;
31
32static void ceph_invalidate_work(struct work_struct *work);
33static void ceph_writeback_work(struct work_struct *work);
34static void ceph_vmtruncate_work(struct work_struct *work);
35
36/*
37 * find or create an inode, given the ceph ino number
38 */
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{
41 struct inode *inode;
42 ino_t t = ceph_vino_to_ino(vino);
43
44 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
45 if (inode == NULL)
46 return ERR_PTR(-ENOMEM);
47 if (inode->i_state & I_NEW) {
48 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
49 inode, ceph_vinop(inode), (u64)inode->i_ino);
50 unlock_new_inode(inode);
51 }
52
53 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
54 vino.snap, inode);
55 return inode;
56}
57
58/*
59 * get/construct snapdir inode for a given directory
60 */
61struct inode *ceph_get_snapdir(struct inode *parent)
62{
63 struct ceph_vino vino = {
64 .ino = ceph_ino(parent),
65 .snap = CEPH_SNAPDIR,
66 };
67 struct inode *inode = ceph_get_inode(parent->i_sb, vino);
68 struct ceph_inode_info *ci = ceph_inode(inode);
69
70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode));
73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid;
76 inode->i_op = &ceph_dir_iops;
77 inode->i_fop = &ceph_dir_fops;
78 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
79 ci->i_rbytes = 0;
80 return inode;
81}
82
83const struct inode_operations ceph_file_iops = {
84 .permission = ceph_permission,
85 .setattr = ceph_setattr,
86 .getattr = ceph_getattr,
87 .setxattr = ceph_setxattr,
88 .getxattr = ceph_getxattr,
89 .listxattr = ceph_listxattr,
90 .removexattr = ceph_removexattr,
91};
92
93
94/*
95 * We use a 'frag tree' to keep track of the MDS's directory fragments
96 * for a given inode (usually there is just a single fragment). We
97 * need to know when a child frag is delegated to a new MDS, or when
98 * it is flagged as replicated, so we can direct our requests
99 * accordingly.
100 */
101
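/*
 * Small example: a directory that has never been fragmented has an
 * empty i_fragtree, so ceph_choose_frag() below returns the root
 * fragment for every hash value.  If the MDS splits the root with
 * split_by == 1, lookups descend into one of 1 << 1 == 2 children
 * (built with ceph_frag_make_child()); per ceph_fill_dirfrag(), a leaf
 * fragment is only stored in the tree when it carries delegation info
 * (ndist > 0).
 */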
102/*
103 * find/create a frag in the tree
104 */
105static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
106 u32 f)
107{
108 struct rb_node **p;
109 struct rb_node *parent = NULL;
110 struct ceph_inode_frag *frag;
111 int c;
112
113 p = &ci->i_fragtree.rb_node;
114 while (*p) {
115 parent = *p;
116 frag = rb_entry(parent, struct ceph_inode_frag, node);
117 c = ceph_frag_compare(f, frag->frag);
118 if (c < 0)
119 p = &(*p)->rb_left;
120 else if (c > 0)
121 p = &(*p)->rb_right;
122 else
123 return frag;
124 }
125
126 frag = kmalloc(sizeof(*frag), GFP_NOFS);
127 if (!frag) {
128 pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
129 "frag %x\n", &ci->vfs_inode,
130 ceph_vinop(&ci->vfs_inode), f);
131 return ERR_PTR(-ENOMEM);
132 }
133 frag->frag = f;
134 frag->split_by = 0;
135 frag->mds = -1;
136 frag->ndist = 0;
137
138 rb_link_node(&frag->node, parent, p);
139 rb_insert_color(&frag->node, &ci->i_fragtree);
140
141 dout("get_or_create_frag added %llx.%llx frag %x\n",
142 ceph_vinop(&ci->vfs_inode), f);
143 return frag;
144}
145
146/*
147 * find a specific frag @f
148 */
149struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
150{
151 struct rb_node *n = ci->i_fragtree.rb_node;
152
153 while (n) {
154 struct ceph_inode_frag *frag =
155 rb_entry(n, struct ceph_inode_frag, node);
156 int c = ceph_frag_compare(f, frag->frag);
157 if (c < 0)
158 n = n->rb_left;
159 else if (c > 0)
160 n = n->rb_right;
161 else
162 return frag;
163 }
164 return NULL;
165}
166
167/*
168 * Choose frag containing the given value @v. If @pfrag is
169 * specified, copy the frag delegation info to the caller if
170 * it is present.
171 */
172u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
173 struct ceph_inode_frag *pfrag,
174 int *found)
175{
176 u32 t = ceph_frag_make(0, 0);
177 struct ceph_inode_frag *frag;
178 unsigned nway, i;
179 u32 n;
180
181 if (found)
182 *found = 0;
183
184 mutex_lock(&ci->i_fragtree_mutex);
185 while (1) {
186 WARN_ON(!ceph_frag_contains_value(t, v));
187 frag = __ceph_find_frag(ci, t);
188 if (!frag)
189 break; /* t is a leaf */
190 if (frag->split_by == 0) {
191 if (pfrag)
192 memcpy(pfrag, frag, sizeof(*pfrag));
193 if (found)
194 *found = 1;
195 break;
196 }
197
198 /* choose child */
199 nway = 1 << frag->split_by;
200 dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
201 frag->split_by, nway);
202 for (i = 0; i < nway; i++) {
203 n = ceph_frag_make_child(t, frag->split_by, i);
204 if (ceph_frag_contains_value(n, v)) {
205 t = n;
206 break;
207 }
208 }
209 BUG_ON(i == nway);
210 }
211 dout("choose_frag(%x) = %x\n", v, t);
212
213 mutex_unlock(&ci->i_fragtree_mutex);
214 return t;
215}
216
217/*
218 * Process dirfrag (delegation) info from the mds. Include leaf
219 * fragment in tree ONLY if ndist > 0. Otherwise, only
220 * branches/splits are included in i_fragtree.
221 */
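/*
 * Editorial example (not in the original source, values hypothetical):
 * a reply dirfrag with auth=3, ndist=2, dist={1,4} records that frag
 * as delegated to mds3 and replicated on mds1 and mds4; a later reply
 * for the same leaf with ndist=0 drops the record again (or merely
 * clears it on a branch), exactly as the function below does.
 */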
222static int ceph_fill_dirfrag(struct inode *inode,
223 struct ceph_mds_reply_dirfrag *dirinfo)
224{
225 struct ceph_inode_info *ci = ceph_inode(inode);
226 struct ceph_inode_frag *frag;
227 u32 id = le32_to_cpu(dirinfo->frag);
228 int mds = le32_to_cpu(dirinfo->auth);
229 int ndist = le32_to_cpu(dirinfo->ndist);
230 int i;
231 int err = 0;
232
233 mutex_lock(&ci->i_fragtree_mutex);
234 if (ndist == 0) {
235 /* no delegation info needed. */
236 frag = __ceph_find_frag(ci, id);
237 if (!frag)
238 goto out;
239 if (frag->split_by == 0) {
240 /* tree leaf, remove */
241 dout("fill_dirfrag removed %llx.%llx frag %x"
242 " (no ref)\n", ceph_vinop(inode), id);
243 rb_erase(&frag->node, &ci->i_fragtree);
244 kfree(frag);
245 } else {
246 /* tree branch, keep and clear */
247 dout("fill_dirfrag cleared %llx.%llx frag %x"
248 " referral\n", ceph_vinop(inode), id);
249 frag->mds = -1;
250 frag->ndist = 0;
251 }
252 goto out;
253 }
254
255
256 /* find/add this frag to store mds delegation info */
257 frag = __get_or_create_frag(ci, id);
258 if (IS_ERR(frag)) {
259 /* this is not the end of the world; we can continue
260 with bad/inaccurate delegation info */
261 pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
262 ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
263 err = -ENOMEM;
264 goto out;
265 }
266
267 frag->mds = mds;
268 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
269 for (i = 0; i < frag->ndist; i++)
270 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
271 dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
272 ceph_vinop(inode), frag->frag, frag->ndist);
273
274out:
275 mutex_unlock(&ci->i_fragtree_mutex);
276 return err;
277}
278
279
280/*
281 * initialize a newly allocated inode.
282 */
283struct inode *ceph_alloc_inode(struct super_block *sb)
284{
285 struct ceph_inode_info *ci;
286 int i;
287
288 ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
289 if (!ci)
290 return NULL;
291
292 dout("alloc_inode %p\n", &ci->vfs_inode);
293
294 ci->i_version = 0;
295 ci->i_time_warp_seq = 0;
296 ci->i_ceph_flags = 0;
297 ci->i_release_count = 0;
298 ci->i_symlink = NULL;
299
300 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex);
302
303 ci->i_xattrs.blob = NULL;
304 ci->i_xattrs.prealloc_blob = NULL;
305 ci->i_xattrs.dirty = false;
306 ci->i_xattrs.index = RB_ROOT;
307 ci->i_xattrs.count = 0;
308 ci->i_xattrs.names_size = 0;
309 ci->i_xattrs.vals_size = 0;
310 ci->i_xattrs.version = 0;
311 ci->i_xattrs.index_version = 0;
312
313 ci->i_caps = RB_ROOT;
314 ci->i_auth_cap = NULL;
315 ci->i_dirty_caps = 0;
316 ci->i_flushing_caps = 0;
317 INIT_LIST_HEAD(&ci->i_dirty_item);
318 INIT_LIST_HEAD(&ci->i_flushing_item);
319 ci->i_cap_flush_seq = 0;
320 ci->i_cap_flush_last_tid = 0;
321 memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
322 init_waitqueue_head(&ci->i_cap_wq);
323 ci->i_hold_caps_min = 0;
324 ci->i_hold_caps_max = 0;
325 INIT_LIST_HEAD(&ci->i_cap_delay_list);
326 ci->i_cap_exporting_mds = 0;
327 ci->i_cap_exporting_mseq = 0;
328 ci->i_cap_exporting_issued = 0;
329 INIT_LIST_HEAD(&ci->i_cap_snaps);
330 ci->i_head_snapc = NULL;
331 ci->i_snap_caps = 0;
332
333 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
334 ci->i_nr_by_mode[i] = 0;
335
336 ci->i_truncate_seq = 0;
337 ci->i_truncate_size = 0;
338 ci->i_truncate_pending = 0;
339
340 ci->i_max_size = 0;
341 ci->i_reported_size = 0;
342 ci->i_wanted_max_size = 0;
343 ci->i_requested_max_size = 0;
344
345 ci->i_pin_ref = 0;
346 ci->i_rd_ref = 0;
347 ci->i_rdcache_ref = 0;
348 ci->i_wr_ref = 0;
349 ci->i_wrbuffer_ref = 0;
350 ci->i_wrbuffer_ref_head = 0;
351 ci->i_shared_gen = 0;
352 ci->i_rdcache_gen = 0;
353 ci->i_rdcache_revoking = 0;
354
355 INIT_LIST_HEAD(&ci->i_unsafe_writes);
356 INIT_LIST_HEAD(&ci->i_unsafe_dirops);
357 spin_lock_init(&ci->i_unsafe_lock);
358
359 ci->i_snap_realm = NULL;
360 INIT_LIST_HEAD(&ci->i_snap_realm_item);
361 INIT_LIST_HEAD(&ci->i_snap_flush_item);
362
363 INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
364 INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
365
366 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
367
368 return &ci->vfs_inode;
369}
370
371void ceph_destroy_inode(struct inode *inode)
372{
373 struct ceph_inode_info *ci = ceph_inode(inode);
374 struct ceph_inode_frag *frag;
375 struct rb_node *n;
376
377 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
378
379 ceph_queue_caps_release(inode);
380
381 /*
382 * we may still have a snap_realm reference if there are stray
383 * caps in i_cap_exporting_issued or i_snap_caps.
384 */
385 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc =
387 &ceph_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm;
389
390 dout(" dropping residual ref to snap realm %p\n", realm);
391 spin_lock(&realm->inodes_with_caps_lock);
392 list_del_init(&ci->i_snap_realm_item);
393 spin_unlock(&realm->inodes_with_caps_lock);
394 ceph_put_snap_realm(mdsc, realm);
395 }
396
397 kfree(ci->i_symlink);
398 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
399 frag = rb_entry(n, struct ceph_inode_frag, node);
400 rb_erase(n, &ci->i_fragtree);
401 kfree(frag);
402 }
403
404 __ceph_destroy_xattrs(ci);
405 if (ci->i_xattrs.blob)
406 ceph_buffer_put(ci->i_xattrs.blob);
407 if (ci->i_xattrs.prealloc_blob)
408 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
409
410 kmem_cache_free(ceph_inode_cachep, ci);
411}
412
413
414/*
415 * Helpers to fill in size, ctime, mtime, and atime. We have to be
416 * careful because either the client or MDS may have more up to date
417 * info, depending on which capabilities are held, and whether
418 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
419 * and size are monotonically increasing, except when utimes() or
420 * truncate() increments the corresponding _seq values.)
421 */
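/*
 * Editorial illustration (not in the original source): the _seq
 * ordering keeps stale replies from shrinking state.  E.g. if the
 * client last saw truncate_seq 5 with size 100 and a delayed reply
 * arrives carrying truncate_seq 4 and size 4096, ceph_seq_cmp(4, 5)
 * is negative and the size is ignored; only a reply with
 * truncate_seq >= 5 may change i_size.  time_warp_seq plays the same
 * role for mtime/atime after a utimes().
 */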
422int ceph_fill_file_size(struct inode *inode, int issued,
423 u32 truncate_seq, u64 truncate_size, u64 size)
424{
425 struct ceph_inode_info *ci = ceph_inode(inode);
426 int queue_trunc = 0;
427
428 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
429 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
430 dout("size %lld -> %llu\n", inode->i_size, size);
431 inode->i_size = size;
432 inode->i_blocks = (size + (1<<9) - 1) >> 9;
433 ci->i_reported_size = size;
434 if (truncate_seq != ci->i_truncate_seq) {
435 dout("truncate_seq %u -> %u\n",
436 ci->i_truncate_seq, truncate_seq);
437 ci->i_truncate_seq = truncate_seq;
438 /*
439 * If we hold relevant caps, or in the case where we're
440 * not the only client referencing this file and we
441 * don't hold those caps, then we need to check whether
442 * the file is either opened or mmaped
443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) ||
447 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++;
450 queue_trunc = 1;
451 }
452 }
453 }
454 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
455 ci->i_truncate_size != truncate_size) {
456 dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
457 truncate_size);
458 ci->i_truncate_size = truncate_size;
459 }
460 return queue_trunc;
461}
462
463void ceph_fill_file_time(struct inode *inode, int issued,
464 u64 time_warp_seq, struct timespec *ctime,
465 struct timespec *mtime, struct timespec *atime)
466{
467 struct ceph_inode_info *ci = ceph_inode(inode);
468 int warn = 0;
469
470 if (issued & (CEPH_CAP_FILE_EXCL|
471 CEPH_CAP_FILE_WR|
472 CEPH_CAP_FILE_BUFFER)) {
473 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
474 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
475 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
476 ctime->tv_sec, ctime->tv_nsec);
477 inode->i_ctime = *ctime;
478 }
479 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
480 /* the MDS did a utimes() */
481 dout("mtime %ld.%09ld -> %ld.%09ld "
482 "tw %d -> %d\n",
483 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
484 mtime->tv_sec, mtime->tv_nsec,
485 ci->i_time_warp_seq, (int)time_warp_seq);
486
487 inode->i_mtime = *mtime;
488 inode->i_atime = *atime;
489 ci->i_time_warp_seq = time_warp_seq;
490 } else if (time_warp_seq == ci->i_time_warp_seq) {
491 /* nobody did utimes(); take the max */
492 if (timespec_compare(mtime, &inode->i_mtime) > 0) {
493 dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
494 inode->i_mtime.tv_sec,
495 inode->i_mtime.tv_nsec,
496 mtime->tv_sec, mtime->tv_nsec);
497 inode->i_mtime = *mtime;
498 }
499 if (timespec_compare(atime, &inode->i_atime) > 0) {
500 dout("atime %ld.%09ld -> %ld.%09ld inc\n",
501 inode->i_atime.tv_sec,
502 inode->i_atime.tv_nsec,
503 atime->tv_sec, atime->tv_nsec);
504 inode->i_atime = *atime;
505 }
506 } else if (issued & CEPH_CAP_FILE_EXCL) {
507 /* we did a utimes(); ignore mds values */
508 } else {
509 warn = 1;
510 }
511 } else {
512 /* we have no write caps; whatever the MDS says is true */
513 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
514 inode->i_ctime = *ctime;
515 inode->i_mtime = *mtime;
516 inode->i_atime = *atime;
517 ci->i_time_warp_seq = time_warp_seq;
518 } else {
519 warn = 1;
520 }
521 }
522 if (warn) /* time_warp_seq shouldn't go backwards */
523 dout("%p mds time_warp_seq %llu < %u\n",
524 inode, time_warp_seq, ci->i_time_warp_seq);
525}
526
527/*
528 * Populate an inode based on info from mds. May be called on new or
529 * existing inodes.
530 */
531static int fill_inode(struct inode *inode,
532 struct ceph_mds_reply_info_in *iinfo,
533 struct ceph_mds_reply_dirfrag *dirinfo,
534 struct ceph_mds_session *session,
535 unsigned long ttl_from, int cap_fmode,
536 struct ceph_cap_reservation *caps_reservation)
537{
538 struct ceph_mds_reply_inode *info = iinfo->in;
539 struct ceph_inode_info *ci = ceph_inode(inode);
540 int i;
541 int issued, implemented;
542 struct timespec mtime, atime, ctime;
543 u32 nsplits;
544 struct ceph_buffer *xattr_blob = NULL;
545 int err = 0;
546 int queue_trunc = 0;
547
548 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
549 inode, ceph_vinop(inode), le64_to_cpu(info->version),
550 ci->i_version);
551
552 /*
553 * prealloc xattr data, if it looks like we'll need it. only
554 * if len > 4 (meaning there are actually xattrs; the first 4
555 * bytes are the xattr count).
556 */
557 if (iinfo->xattr_len > 4) {
558 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
559 if (!xattr_blob)
560 pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
561 iinfo->xattr_len);
562 }
563
564 spin_lock(&inode->i_lock);
565
566 /*
567 * provided version will be odd if inode value is projected,
568 * even if stable. skip the update if we already have newer info
569 * (e.g., due to inode info racing from multiple MDSs), or if
570 * we are getting projected (unstable) inode info.
571 */
572 if (le64_to_cpu(info->version) > 0 &&
573 (ci->i_version & ~1) > le64_to_cpu(info->version))
574 goto no_change;
575
576 issued = __ceph_caps_issued(ci, &implemented);
577 issued |= implemented | __ceph_caps_dirty(ci);
578
579 /* update inode */
580 ci->i_version = le64_to_cpu(info->version);
581 inode->i_version++;
582 inode->i_rdev = le32_to_cpu(info->rdev);
583
584 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
585 inode->i_mode = le32_to_cpu(info->mode);
586 inode->i_uid = le32_to_cpu(info->uid);
587 inode->i_gid = le32_to_cpu(info->gid);
588 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
589 inode->i_uid, inode->i_gid);
590 }
591
592 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
593 inode->i_nlink = le32_to_cpu(info->nlink);
594
595 /* be careful with mtime, atime, size */
596 ceph_decode_timespec(&atime, &info->atime);
597 ceph_decode_timespec(&mtime, &info->mtime);
598 ceph_decode_timespec(&ctime, &info->ctime);
599 queue_trunc = ceph_fill_file_size(inode, issued,
600 le32_to_cpu(info->truncate_seq),
601 le64_to_cpu(info->truncate_size),
602 le64_to_cpu(info->size));
603 ceph_fill_file_time(inode, issued,
604 le32_to_cpu(info->time_warp_seq),
605 &ctime, &mtime, &atime);
606
607 ci->i_max_size = le64_to_cpu(info->max_size);
608 ci->i_layout = info->layout;
609 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
610
611 /* xattrs */
612 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
613 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
614 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
615 if (ci->i_xattrs.blob)
616 ceph_buffer_put(ci->i_xattrs.blob);
617 ci->i_xattrs.blob = xattr_blob;
618 if (xattr_blob)
619 memcpy(ci->i_xattrs.blob->vec.iov_base,
620 iinfo->xattr_data, iinfo->xattr_len);
621 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
622 }
623
624 inode->i_mapping->a_ops = &ceph_aops;
625 inode->i_mapping->backing_dev_info =
626 &ceph_client(inode->i_sb)->backing_dev_info;
627
628 switch (inode->i_mode & S_IFMT) {
629 case S_IFIFO:
630 case S_IFBLK:
631 case S_IFCHR:
632 case S_IFSOCK:
633 init_special_inode(inode, inode->i_mode, inode->i_rdev);
634 inode->i_op = &ceph_file_iops;
635 break;
636 case S_IFREG:
637 inode->i_op = &ceph_file_iops;
638 inode->i_fop = &ceph_file_fops;
639 break;
640 case S_IFLNK:
641 inode->i_op = &ceph_symlink_iops;
642 if (!ci->i_symlink) {
643 int symlen = iinfo->symlink_len;
644 char *sym;
645
646 BUG_ON(symlen != inode->i_size);
647 spin_unlock(&inode->i_lock);
648
649 err = -ENOMEM;
650 sym = kmalloc(symlen+1, GFP_NOFS);
651 if (!sym)
652 goto out;
653 memcpy(sym, iinfo->symlink, symlen);
654 sym[symlen] = 0;
655
656 spin_lock(&inode->i_lock);
657 if (!ci->i_symlink)
658 ci->i_symlink = sym;
659 else
660 kfree(sym); /* lost a race */
661 }
662 break;
663 case S_IFDIR:
664 inode->i_op = &ceph_dir_iops;
665 inode->i_fop = &ceph_dir_fops;
666
667 ci->i_files = le64_to_cpu(info->files);
668 ci->i_subdirs = le64_to_cpu(info->subdirs);
669 ci->i_rbytes = le64_to_cpu(info->rbytes);
670 ci->i_rfiles = le64_to_cpu(info->rfiles);
671 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
672 ceph_decode_timespec(&ci->i_rctime, &info->rctime);
673
674 /* set dir completion flag? */
675 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
676 ceph_snap(inode) == CEPH_NOSNAP &&
677 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
678 dout(" marking %p complete (empty)\n", inode);
679 ci->i_ceph_flags |= CEPH_I_COMPLETE;
680 ci->i_max_offset = 2;
681 }
682
683 /* it may be better to set st_size in getattr instead? */
684 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
685 inode->i_size = ci->i_rbytes;
686 break;
687 default:
688 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
689 ceph_vinop(inode), inode->i_mode);
690 }
691
692no_change:
693 spin_unlock(&inode->i_lock);
694
695 /* queue truncate if we saw i_size decrease */
696 if (queue_trunc)
697 ceph_queue_vmtruncate(inode);
698
699 /* populate frag tree */
700 /* FIXME: move me up, if/when version reflects fragtree changes */
701 nsplits = le32_to_cpu(info->fragtree.nsplits);
702 mutex_lock(&ci->i_fragtree_mutex);
703 for (i = 0; i < nsplits; i++) {
704 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
705 struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
706
707 if (IS_ERR(frag))
708 continue;
709 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
710 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
711 }
712 mutex_unlock(&ci->i_fragtree_mutex);
713
714 /* were we issued a capability? */
715 if (info->cap.caps) {
716 if (ceph_snap(inode) == CEPH_NOSNAP) {
717 ceph_add_cap(inode, session,
718 le64_to_cpu(info->cap.cap_id),
719 cap_fmode,
720 le32_to_cpu(info->cap.caps),
721 le32_to_cpu(info->cap.wanted),
722 le32_to_cpu(info->cap.seq),
723 le32_to_cpu(info->cap.mseq),
724 le64_to_cpu(info->cap.realm),
725 info->cap.flags,
726 caps_reservation);
727 } else {
728 spin_lock(&inode->i_lock);
729 dout(" %p got snap_caps %s\n", inode,
730 ceph_cap_string(le32_to_cpu(info->cap.caps)));
731 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
732 if (cap_fmode >= 0)
733 __ceph_get_fmode(ci, cap_fmode);
734 spin_unlock(&inode->i_lock);
735 }
736 }
737
738 /* update delegation info? */
739 if (dirinfo)
740 ceph_fill_dirfrag(inode, dirinfo);
741
742 err = 0;
743
744out:
745 if (xattr_blob)
746 ceph_buffer_put(xattr_blob);
747 return err;
748}
749
750/*
751 * caller should hold session s_mutex.
752 */
753static void update_dentry_lease(struct dentry *dentry,
754 struct ceph_mds_reply_lease *lease,
755 struct ceph_mds_session *session,
756 unsigned long from_time)
757{
758 struct ceph_dentry_info *di = ceph_dentry(dentry);
759 long unsigned duration = le32_to_cpu(lease->duration_ms);
760 long unsigned ttl = from_time + (duration * HZ) / 1000;
761 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
762 struct inode *dir;
763
764 /* only track leases on regular dentries */
765 if (dentry->d_op != &ceph_dentry_ops)
766 return;
767
768 spin_lock(&dentry->d_lock);
769 dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
770 dentry, le16_to_cpu(lease->mask), duration, ttl);
771
772 /* make lease_rdcache_gen match directory */
773 dir = dentry->d_parent->d_inode;
774 di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
775
776 if (lease->mask == 0)
777 goto out_unlock;
778
779 if (di->lease_gen == session->s_cap_gen &&
780 time_before(ttl, dentry->d_time))
781 goto out_unlock; /* we already have a newer lease. */
782
783 if (di->lease_session && di->lease_session != session)
784 goto out_unlock;
785
786 ceph_dentry_lru_touch(dentry);
787
788 if (!di->lease_session)
789 di->lease_session = ceph_get_mds_session(session);
790 di->lease_gen = session->s_cap_gen;
791 di->lease_seq = le32_to_cpu(lease->seq);
792 di->lease_renew_after = half_ttl;
793 di->lease_renew_from = 0;
794 dentry->d_time = ttl;
795out_unlock:
796 spin_unlock(&dentry->d_lock);
797 return;
798}
799
800/*
801 * splice a dentry to an inode.
802 * caller must hold directory i_mutex for this to be safe.
803 *
804 * we will only rehash the resulting dentry if @prehash is
805 * true; @prehash will be set to false (for the benefit of
806 * the caller) if we fail.
807 */
808static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
809 bool *prehash)
810{
811 struct dentry *realdn;
812
813 /* dn must be unhashed */
814 if (!d_unhashed(dn))
815 d_drop(dn);
816 realdn = d_materialise_unique(dn, in);
817 if (IS_ERR(realdn)) {
818 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
819 dn, in, ceph_vinop(in));
820 if (prehash)
821 *prehash = false; /* don't rehash on error */
822 dn = realdn; /* note realdn contains the error */
823 goto out;
824 } else if (realdn) {
825 dout("dn %p (%d) spliced with %p (%d) "
826 "inode %p ino %llx.%llx\n",
827 dn, atomic_read(&dn->d_count),
828 realdn, atomic_read(&realdn->d_count),
829 realdn->d_inode, ceph_vinop(realdn->d_inode));
830 dput(dn);
831 dn = realdn;
832 } else {
833 BUG_ON(!ceph_dentry(dn));
834
835 dout("dn %p attached to %p ino %llx.%llx\n",
836 dn, dn->d_inode, ceph_vinop(dn->d_inode));
837 }
838 if ((!prehash || *prehash) && d_unhashed(dn))
839 d_rehash(dn);
840out:
841 return dn;
842}
843
844/*
845 * Set dentry's directory position based on the current dir's max, and
846 * order it in d_subdirs, so that dcache_readdir behaves.
847 */
848static void ceph_set_dentry_offset(struct dentry *dn)
849{
850 struct dentry *dir = dn->d_parent;
851 struct inode *inode = dn->d_parent->d_inode;
852 struct ceph_dentry_info *di;
853
854 BUG_ON(!inode);
855
856 di = ceph_dentry(dn);
857
858 spin_lock(&inode->i_lock);
859 di->offset = ceph_inode(inode)->i_max_offset++;
860 spin_unlock(&inode->i_lock);
861
862 spin_lock(&dcache_lock);
863 spin_lock(&dn->d_lock);
864 list_move_tail(&dn->d_u.d_child, &dir->d_subdirs);
865 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
866 dn->d_u.d_child.prev, dn->d_u.d_child.next);
867 spin_unlock(&dn->d_lock);
868 spin_unlock(&dcache_lock);
869}
870
871/*
872 * Incorporate results into the local cache. This is either just
873 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
874 * after a lookup).
875 *
876 * A reply may contain
877 * a directory inode along with a dentry.
878 * and/or a target inode
879 *
880 * Called with snap_rwsem (read).
881 */
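/*
 * Editorial examples (not in the original source) of the reply shapes
 * handled below: a lookup that hits carries the parent dir inode, a
 * dentry lease, and the target inode; an unlink carries the dir inode
 * plus a null dentry and no target; a plain getattr may carry only the
 * target inode with no dentry at all.
 */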
882int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
883 struct ceph_mds_session *session)
884{
885 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
886 struct inode *in = NULL;
887 struct ceph_mds_reply_inode *ininfo;
888 struct ceph_vino vino;
889 struct ceph_client *client = ceph_sb_to_client(sb);
890 int i = 0;
891 int err = 0;
892
893 dout("fill_trace %p is_dentry %d is_target %d\n", req,
894 rinfo->head->is_dentry, rinfo->head->is_target);
895
896#if 0
897 /*
898 * Debugging hook:
899 *
900 * If we resend completed ops to a recovering mds, we get no
901 * trace. Since that is very rare, pretend this is the case
902 * to ensure the 'no trace' handlers in the callers behave.
903 *
904 * Fill in inodes unconditionally to avoid breaking cap
905 * invariants.
906 */
907 if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
908 pr_info("fill_trace faking empty trace on %lld %s\n",
909 req->r_tid, ceph_mds_op_name(rinfo->head->op));
910 if (rinfo->head->is_dentry) {
911 rinfo->head->is_dentry = 0;
912 err = fill_inode(req->r_locked_dir,
913 &rinfo->diri, rinfo->dirfrag,
914 session, req->r_request_started, -1, &req->r_caps_reservation);
915 }
916 if (rinfo->head->is_target) {
917 rinfo->head->is_target = 0;
918 ininfo = rinfo->targeti.in;
919 vino.ino = le64_to_cpu(ininfo->ino);
920 vino.snap = le64_to_cpu(ininfo->snapid);
921 in = ceph_get_inode(sb, vino);
922 err = fill_inode(in, &rinfo->targeti, NULL,
923 session, req->r_request_started,
924 req->r_fmode, &req->r_caps_reservation);
925 iput(in);
926 }
927 }
928#endif
929
930 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
931 dout("fill_trace reply is empty!\n");
932 if (rinfo->head->result == 0 && req->r_locked_dir) {
933 struct ceph_inode_info *ci =
934 ceph_inode(req->r_locked_dir);
935 dout(" clearing %p complete (empty trace)\n",
936 req->r_locked_dir);
937 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
938 ci->i_release_count++;
939 }
940 return 0;
941 }
942
943 if (rinfo->head->is_dentry) {
944 struct inode *dir = req->r_locked_dir;
945
946 err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
947 session, req->r_request_started, -1,
948 &req->r_caps_reservation);
949 if (err < 0)
950 return err;
951 }
952
953 /*
954 * ignore null lease/binding on snapdir ENOENT, or else we
955 * will have trouble splicing in the virtual snapdir later
956 */
957 if (rinfo->head->is_dentry && !req->r_aborted &&
958 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
959 client->mount_args->snapdir_name,
960 req->r_dentry->d_name.len))) {
961 /*
962 * lookup link rename : null -> possibly existing inode
963 * mknod symlink mkdir : null -> new inode
964 * unlink : linked -> null
965 */
966 struct inode *dir = req->r_locked_dir;
967 struct dentry *dn = req->r_dentry;
968 bool have_dir_cap, have_lease;
969
970 BUG_ON(!dn);
971 BUG_ON(!dir);
972 BUG_ON(dn->d_parent->d_inode != dir);
973 BUG_ON(ceph_ino(dir) !=
974 le64_to_cpu(rinfo->diri.in->ino));
975 BUG_ON(ceph_snap(dir) !=
976 le64_to_cpu(rinfo->diri.in->snapid));
977
978 /* do we have a lease on the whole dir? */
979 have_dir_cap =
980 (le32_to_cpu(rinfo->diri.in->cap.caps) &
981 CEPH_CAP_FILE_SHARED);
982
983 /* do we have a dn lease? */
984 have_lease = have_dir_cap ||
985 (le16_to_cpu(rinfo->dlease->mask) &
986 CEPH_LOCK_DN);
987
988 if (!have_lease)
989 dout("fill_trace no dentry lease or dir cap\n");
990
991 /* rename? */
992 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
993 dout(" src %p '%.*s' dst %p '%.*s'\n",
994 req->r_old_dentry,
995 req->r_old_dentry->d_name.len,
996 req->r_old_dentry->d_name.name,
997 dn, dn->d_name.len, dn->d_name.name);
998 dout("fill_trace doing d_move %p -> %p\n",
999 req->r_old_dentry, dn);
1000 d_move(req->r_old_dentry, dn);
1001 dout(" src %p '%.*s' dst %p '%.*s'\n",
1002 req->r_old_dentry,
1003 req->r_old_dentry->d_name.len,
1004 req->r_old_dentry->d_name.name,
1005 dn, dn->d_name.len, dn->d_name.name);
1006 /* ensure target dentry is invalidated, despite
1007 rehashing bug in vfs_rename_dir */
1008 dn->d_time = jiffies;
1009 ceph_dentry(dn)->lease_shared_gen = 0;
1010 /* take overwritten dentry's readdir offset */
1011 ceph_dentry(req->r_old_dentry)->offset =
1012 ceph_dentry(dn)->offset;
1013 dn = req->r_old_dentry; /* use old_dentry */
1014 in = dn->d_inode;
1015 }
1016
1017 /* null dentry? */
1018 if (!rinfo->head->is_target) {
1019 dout("fill_trace null dentry\n");
1020 if (dn->d_inode) {
1021 dout("d_delete %p\n", dn);
1022 d_delete(dn);
1023 } else {
1024 dout("d_instantiate %p NULL\n", dn);
1025 d_instantiate(dn, NULL);
1026 if (have_lease && d_unhashed(dn))
1027 d_rehash(dn);
1028 update_dentry_lease(dn, rinfo->dlease,
1029 session,
1030 req->r_request_started);
1031 }
1032 goto done;
1033 }
1034
1035 /* attach proper inode */
1036 ininfo = rinfo->targeti.in;
1037 vino.ino = le64_to_cpu(ininfo->ino);
1038 vino.snap = le64_to_cpu(ininfo->snapid);
1039 if (!dn->d_inode) {
1040 in = ceph_get_inode(sb, vino);
1041 if (IS_ERR(in)) {
1042 pr_err("fill_trace bad get_inode "
1043 "%llx.%llx\n", vino.ino, vino.snap);
1044 err = PTR_ERR(in);
1045 d_delete(dn);
1046 goto done;
1047 }
1048 dn = splice_dentry(dn, in, &have_lease);
1049 if (IS_ERR(dn)) {
1050 err = PTR_ERR(dn);
1051 goto done;
1052 }
1053 req->r_dentry = dn; /* may have spliced */
1054 ceph_set_dentry_offset(dn);
1055 igrab(in);
1056 } else if (ceph_ino(in) == vino.ino &&
1057 ceph_snap(in) == vino.snap) {
1058 igrab(in);
1059 } else {
1060 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1061 dn, in, ceph_ino(in), ceph_snap(in),
1062 vino.ino, vino.snap);
1063 have_lease = false;
1064 in = NULL;
1065 }
1066
1067 if (have_lease)
1068 update_dentry_lease(dn, rinfo->dlease, session,
1069 req->r_request_started);
1070 dout(" final dn %p\n", dn);
1071 i++;
1072 } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1073 req->r_op == CEPH_MDS_OP_MKSNAP) {
1074 struct dentry *dn = req->r_dentry;
1075
1076 /* fill out a snapdir LOOKUPSNAP dentry */
1077 BUG_ON(!dn);
1078 BUG_ON(!req->r_locked_dir);
1079 BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
1080 ininfo = rinfo->targeti.in;
1081 vino.ino = le64_to_cpu(ininfo->ino);
1082 vino.snap = le64_to_cpu(ininfo->snapid);
1083 in = ceph_get_inode(sb, vino);
1084 if (IS_ERR(in)) {
1085 pr_err("fill_inode get_inode badness %llx.%llx\n",
1086 vino.ino, vino.snap);
1087 err = PTR_ERR(in);
1088 d_delete(dn);
1089 goto done;
1090 }
1091 dout(" linking snapped dir %p to dn %p\n", in, dn);
1092 dn = splice_dentry(dn, in, NULL);
1093 if (IS_ERR(dn)) {
1094 err = PTR_ERR(dn);
1095 goto done;
1096 }
1097 ceph_set_dentry_offset(dn);
1098 req->r_dentry = dn; /* may have spliced */
1099 igrab(in);
1100 rinfo->head->is_dentry = 1; /* fool notrace handlers */
1101 }
1102
1103 if (rinfo->head->is_target) {
1104 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1105 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1106
1107 if (in == NULL || ceph_ino(in) != vino.ino ||
1108 ceph_snap(in) != vino.snap) {
1109 in = ceph_get_inode(sb, vino);
1110 if (IS_ERR(in)) {
1111 err = PTR_ERR(in);
1112 goto done;
1113 }
1114 }
1115 req->r_target_inode = in;
1116
1117 err = fill_inode(in,
1118 &rinfo->targeti, NULL,
1119 session, req->r_request_started,
1120 (le32_to_cpu(rinfo->head->result) == 0) ?
1121 req->r_fmode : -1,
1122 &req->r_caps_reservation);
1123 if (err < 0) {
1124 pr_err("fill_inode badness %p %llx.%llx\n",
1125 in, ceph_vinop(in));
1126 goto done;
1127 }
1128 }
1129
1130done:
1131 dout("fill_trace done err=%d\n", err);
1132 return err;
1133}
1134
1135/*
1136 * Prepopulate our cache with readdir results, leases, etc.
1137 */
1138int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1139 struct ceph_mds_session *session)
1140{
1141 struct dentry *parent = req->r_dentry;
1142 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1143 struct qstr dname;
1144 struct dentry *dn;
1145 struct inode *in;
1146 int err = 0, i;
1147 struct inode *snapdir = NULL;
1148 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1149 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1150 struct ceph_dentry_info *di;
1151
1152 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1153 snapdir = ceph_get_snapdir(parent->d_inode);
1154 parent = d_find_alias(snapdir);
1155 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1156 rinfo->dir_nr, parent);
1157 } else {
1158 dout("readdir_prepopulate %d items under dn %p\n",
1159 rinfo->dir_nr, parent);
1160 if (rinfo->dir_dir)
1161 ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1162 }
1163
1164 for (i = 0; i < rinfo->dir_nr; i++) {
1165 struct ceph_vino vino;
1166
1167 dname.name = rinfo->dir_dname[i];
1168 dname.len = rinfo->dir_dname_len[i];
1169 dname.hash = full_name_hash(dname.name, dname.len);
1170
1171 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1172 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1173
1174retry_lookup:
1175 dn = d_lookup(parent, &dname);
1176 dout("d_lookup on parent=%p name=%.*s got %p\n",
1177 parent, dname.len, dname.name, dn);
1178
1179 if (!dn) {
1180 dn = d_alloc(parent, &dname);
1181 dout("d_alloc %p '%.*s' = %p\n", parent,
1182 dname.len, dname.name, dn);
1183 if (dn == NULL) {
1184 dout("d_alloc badness\n");
1185 err = -ENOMEM;
1186 goto out;
1187 }
1188 err = ceph_init_dentry(dn);
1189 if (err < 0)
1190 goto out;
1191 } else if (dn->d_inode &&
1192 (ceph_ino(dn->d_inode) != vino.ino ||
1193 ceph_snap(dn->d_inode) != vino.snap)) {
1194 dout(" dn %p points to wrong inode %p\n",
1195 dn, dn->d_inode);
1196 d_delete(dn);
1197 dput(dn);
1198 goto retry_lookup;
1199 } else {
1200 /* reorder parent's d_subdirs */
1201 spin_lock(&dcache_lock);
1202 spin_lock(&dn->d_lock);
1203 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1204 spin_unlock(&dn->d_lock);
1205 spin_unlock(&dcache_lock);
1206 }
1207
1208 di = dn->d_fsdata;
1209 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1210
1211 /* inode */
1212 if (dn->d_inode) {
1213 in = dn->d_inode;
1214 } else {
1215 in = ceph_get_inode(parent->d_sb, vino);
1216 if (IS_ERR(in)) {
1217 dout("new_inode badness\n");
1218 d_delete(dn);
1219 dput(dn);
1220 err = PTR_ERR(in);
1221 goto out;
1222 }
1223 dn = splice_dentry(dn, in, NULL);
1224 }
1225
1226 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1227 req->r_request_started, -1,
1228 &req->r_caps_reservation) < 0) {
1229 pr_err("fill_inode badness on %p\n", in);
1230 dput(dn);
1231 continue;
1232 }
1233 update_dentry_lease(dn, rinfo->dir_dlease[i],
1234 req->r_session, req->r_request_started);
1235 dput(dn);
1236 }
1237 req->r_did_prepopulate = true;
1238
1239out:
1240 if (snapdir) {
1241 iput(snapdir);
1242 dput(parent);
1243 }
1244 dout("readdir_prepopulate done\n");
1245 return err;
1246}
1247
1248int ceph_inode_set_size(struct inode *inode, loff_t size)
1249{
1250 struct ceph_inode_info *ci = ceph_inode(inode);
1251 int ret = 0;
1252
1253 spin_lock(&inode->i_lock);
1254 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1255 inode->i_size = size;
1256 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1257
1258 /* tell the MDS if we are approaching max_size */
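 /*
 * Editorial note: "approaching" here means the new size has crossed
 * half of max_size while the size we last reported had not, so we
 * only ask the MDS for more room once per halfway crossing.
 */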
1259 if ((size << 1) >= ci->i_max_size &&
1260 (ci->i_reported_size << 1) < ci->i_max_size)
1261 ret = 1;
1262
1263 spin_unlock(&inode->i_lock);
1264 return ret;
1265}
1266
1267/*
1268 * Write back inode data in a worker thread. (This can't be done
1269 * in the message handler context.)
1270 */
1271void ceph_queue_writeback(struct inode *inode)
1272{
1273 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1274 &ceph_inode(inode)->i_wb_work)) {
1275 dout("ceph_queue_writeback %p\n", inode);
1276 igrab(inode);
1277 } else {
1278 dout("ceph_queue_writeback %p failed\n", inode);
1279 }
1280}
1281
1282static void ceph_writeback_work(struct work_struct *work)
1283{
1284 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1285 i_wb_work);
1286 struct inode *inode = &ci->vfs_inode;
1287
1288 dout("writeback %p\n", inode);
1289 filemap_fdatawrite(&inode->i_data);
1290 iput(inode);
1291}
1292
1293/*
1294 * queue an async invalidation
1295 */
1296void ceph_queue_invalidate(struct inode *inode)
1297{
1298 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1299 &ceph_inode(inode)->i_pg_inv_work)) {
1300 dout("ceph_queue_invalidate %p\n", inode);
1301 igrab(inode);
1302 } else {
1303 dout("ceph_queue_invalidate %p failed\n", inode);
1304 }
1305}
1306
1307/*
1308 * invalidate any pages that are not dirty or under writeback. this
1309 * includes pages that are clean and mapped.
1310 */
1311static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1312{
1313 struct pagevec pvec;
1314 pgoff_t next = 0;
1315 int i;
1316
1317 pagevec_init(&pvec, 0);
1318 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1319 for (i = 0; i < pagevec_count(&pvec); i++) {
1320 struct page *page = pvec.pages[i];
1321 pgoff_t index;
1322 int skip_page =
1323 (PageDirty(page) || PageWriteback(page));
1324
1325 if (!skip_page)
1326 skip_page = !trylock_page(page);
1327
1328 /*
1329 * We really shouldn't be looking at the ->index of an
1330 * unlocked page. But we're not allowed to lock these
1331 * pages. So we rely upon nobody altering the ->index
1332 * of this (pinned-by-us) page.
1333 */
1334 index = page->index;
1335 if (index > next)
1336 next = index;
1337 next++;
1338
1339 if (skip_page)
1340 continue;
1341
1342 generic_error_remove_page(mapping, page);
1343 unlock_page(page);
1344 }
1345 pagevec_release(&pvec);
1346 cond_resched();
1347 }
1348}
1349
1350/*
1351 * Invalidate inode pages in a worker thread. (This can't be done
1352 * in the message handler context.)
1353 */
1354static void ceph_invalidate_work(struct work_struct *work)
1355{
1356 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1357 i_pg_inv_work);
1358 struct inode *inode = &ci->vfs_inode;
1359 u32 orig_gen;
1360 int check = 0;
1361
1362 spin_lock(&inode->i_lock);
1363 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1364 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1365 if (ci->i_rdcache_gen == 0 ||
1366 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1367 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1368 /* nevermind! */
1369 ci->i_rdcache_revoking = 0;
1370 spin_unlock(&inode->i_lock);
1371 goto out;
1372 }
1373 orig_gen = ci->i_rdcache_gen;
1374 spin_unlock(&inode->i_lock);
1375
1376 ceph_invalidate_nondirty_pages(inode->i_mapping);
1377
1378 spin_lock(&inode->i_lock);
1379 if (orig_gen == ci->i_rdcache_gen) {
1380 dout("invalidate_pages %p gen %d successful\n", inode,
1381 ci->i_rdcache_gen);
1382 ci->i_rdcache_gen = 0;
1383 ci->i_rdcache_revoking = 0;
1384 check = 1;
1385 } else {
1386 dout("invalidate_pages %p gen %d raced, gen now %d\n",
1387 inode, orig_gen, ci->i_rdcache_gen);
1388 }
1389 spin_unlock(&inode->i_lock);
1390
1391 if (check)
1392 ceph_check_caps(ci, 0, NULL);
1393out:
1394 iput(inode);
1395}
1396
1397
1398/*
1399 * called by trunc_wq; take i_mutex ourselves
1400 *
1401 * We also truncate in a separate thread.
1402 */
1403static void ceph_vmtruncate_work(struct work_struct *work)
1404{
1405 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1406 i_vmtruncate_work);
1407 struct inode *inode = &ci->vfs_inode;
1408
1409 dout("vmtruncate_work %p\n", inode);
1410 mutex_lock(&inode->i_mutex);
1411 __ceph_do_pending_vmtruncate(inode);
1412 mutex_unlock(&inode->i_mutex);
1413 iput(inode);
1414}
1415
1416/*
1417 * Queue an async vmtruncate. If we fail to queue work, we will handle
1418 * the truncation the next time we call __ceph_do_pending_vmtruncate.
1419 */
1420void ceph_queue_vmtruncate(struct inode *inode)
1421{
1422 struct ceph_inode_info *ci = ceph_inode(inode);
1423
1424 if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
1425 &ci->i_vmtruncate_work)) {
1426 dout("ceph_queue_vmtruncate %p\n", inode);
1427 igrab(inode);
1428 } else {
1429 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1430 inode, ci->i_truncate_pending);
1431 }
1432}
1433
1434/*
1435 * called with i_mutex held.
1436 *
1437 * Make sure any pending truncation is applied before doing anything
1438 * that may depend on it.
1439 */
1440void __ceph_do_pending_vmtruncate(struct inode *inode)
1441{
1442 struct ceph_inode_info *ci = ceph_inode(inode);
1443 u64 to;
1444 int wrbuffer_refs, wake = 0;
1445
1446retry:
1447 spin_lock(&inode->i_lock);
1448 if (ci->i_truncate_pending == 0) {
1449 dout("__do_pending_vmtruncate %p none pending\n", inode);
1450 spin_unlock(&inode->i_lock);
1451 return;
1452 }
1453
1454 /*
1455 * make sure any dirty snapped pages are flushed before we
1456 * possibly truncate them.. so write AND block!
1457 */
1458 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1459 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1460 inode);
1461 spin_unlock(&inode->i_lock);
1462 filemap_write_and_wait_range(&inode->i_data, 0,
1463 inode->i_sb->s_maxbytes);
1464 goto retry;
1465 }
1466
1467 to = ci->i_truncate_size;
1468 wrbuffer_refs = ci->i_wrbuffer_ref;
1469 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1470 ci->i_truncate_pending, to);
1471 spin_unlock(&inode->i_lock);
1472
1473 truncate_inode_pages(inode->i_mapping, to);
1474
1475 spin_lock(&inode->i_lock);
1476 ci->i_truncate_pending--;
1477 if (ci->i_truncate_pending == 0)
1478 wake = 1;
1479 spin_unlock(&inode->i_lock);
1480
1481 if (wrbuffer_refs == 0)
1482 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1483 if (wake)
1484 wake_up(&ci->i_cap_wq);
1485}
1486
1487
1488/*
1489 * symlinks
1490 */
1491static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1492{
1493 struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1494 nd_set_link(nd, ci->i_symlink);
1495 return NULL;
1496}
1497
1498static const struct inode_operations ceph_symlink_iops = {
1499 .readlink = generic_readlink,
1500 .follow_link = ceph_sym_follow_link,
1501};
1502
1503/*
1504 * setattr
1505 */
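/*
 * Editorial example (not in the original source): a chmod() made while
 * we hold CEPH_CAP_AUTH_EXCL is applied to the inode locally and only
 * marked dirty; without that cap the new mode goes into a
 * CEPH_MDS_OP_SETATTR request (CEPH_SETATTR_MODE) and the change is
 * performed by the MDS.
 */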
1506int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1507{
1508 struct inode *inode = dentry->d_inode;
1509 struct ceph_inode_info *ci = ceph_inode(inode);
1510 struct inode *parent_inode = dentry->d_parent->d_inode;
1511 const unsigned int ia_valid = attr->ia_valid;
1512 struct ceph_mds_request *req;
1513 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
1514 int issued;
1515 int release = 0, dirtied = 0;
1516 int mask = 0;
1517 int err = 0;
1518
1519 if (ceph_snap(inode) != CEPH_NOSNAP)
1520 return -EROFS;
1521
1522 __ceph_do_pending_vmtruncate(inode);
1523
1524 err = inode_change_ok(inode, attr);
1525 if (err != 0)
1526 return err;
1527
1528 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
1529 USE_AUTH_MDS);
1530 if (IS_ERR(req))
1531 return PTR_ERR(req);
1532
1533 spin_lock(&inode->i_lock);
1534 issued = __ceph_caps_issued(ci, NULL);
1535 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
1536
1537 if (ia_valid & ATTR_UID) {
1538 dout("setattr %p uid %d -> %d\n", inode,
1539 inode->i_uid, attr->ia_uid);
1540 if (issued & CEPH_CAP_AUTH_EXCL) {
1541 inode->i_uid = attr->ia_uid;
1542 dirtied |= CEPH_CAP_AUTH_EXCL;
1543 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1544 attr->ia_uid != inode->i_uid) {
1545 req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
1546 mask |= CEPH_SETATTR_UID;
1547 release |= CEPH_CAP_AUTH_SHARED;
1548 }
1549 }
1550 if (ia_valid & ATTR_GID) {
1551 dout("setattr %p gid %d -> %d\n", inode,
1552 inode->i_gid, attr->ia_gid);
1553 if (issued & CEPH_CAP_AUTH_EXCL) {
1554 inode->i_gid = attr->ia_gid;
1555 dirtied |= CEPH_CAP_AUTH_EXCL;
1556 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1557 attr->ia_gid != inode->i_gid) {
1558 req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
1559 mask |= CEPH_SETATTR_GID;
1560 release |= CEPH_CAP_AUTH_SHARED;
1561 }
1562 }
1563 if (ia_valid & ATTR_MODE) {
1564 dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
1565 attr->ia_mode);
1566 if (issued & CEPH_CAP_AUTH_EXCL) {
1567 inode->i_mode = attr->ia_mode;
1568 dirtied |= CEPH_CAP_AUTH_EXCL;
1569 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1570 attr->ia_mode != inode->i_mode) {
1571 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1572 mask |= CEPH_SETATTR_MODE;
1573 release |= CEPH_CAP_AUTH_SHARED;
1574 }
1575 }
1576
1577 if (ia_valid & ATTR_ATIME) {
1578 dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
1579 inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
1580 attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
1581 if (issued & CEPH_CAP_FILE_EXCL) {
1582 ci->i_time_warp_seq++;
1583 inode->i_atime = attr->ia_atime;
1584 dirtied |= CEPH_CAP_FILE_EXCL;
1585 } else if ((issued & CEPH_CAP_FILE_WR) &&
1586 timespec_compare(&inode->i_atime,
1587 &attr->ia_atime) < 0) {
1588 inode->i_atime = attr->ia_atime;
1589 dirtied |= CEPH_CAP_FILE_WR;
1590 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1591 !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
1592 ceph_encode_timespec(&req->r_args.setattr.atime,
1593 &attr->ia_atime);
1594 mask |= CEPH_SETATTR_ATIME;
1595 release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
1596 CEPH_CAP_FILE_WR;
1597 }
1598 }
1599 if (ia_valid & ATTR_MTIME) {
1600 dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
1601 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
1602 attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
1603 if (issued & CEPH_CAP_FILE_EXCL) {
1604 ci->i_time_warp_seq++;
1605 inode->i_mtime = attr->ia_mtime;
1606 dirtied |= CEPH_CAP_FILE_EXCL;
1607 } else if ((issued & CEPH_CAP_FILE_WR) &&
1608 timespec_compare(&inode->i_mtime,
1609 &attr->ia_mtime) < 0) {
1610 inode->i_mtime = attr->ia_mtime;
1611 dirtied |= CEPH_CAP_FILE_WR;
1612 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1613 !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
1614 ceph_encode_timespec(&req->r_args.setattr.mtime,
1615 &attr->ia_mtime);
1616 mask |= CEPH_SETATTR_MTIME;
1617 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1618 CEPH_CAP_FILE_WR;
1619 }
1620 }
1621 if (ia_valid & ATTR_SIZE) {
1622 dout("setattr %p size %lld -> %lld\n", inode,
1623 inode->i_size, attr->ia_size);
1624 if (attr->ia_size > inode->i_sb->s_maxbytes) {
1625 err = -EINVAL;
1626 goto out;
1627 }
1628 if ((issued & CEPH_CAP_FILE_EXCL) &&
1629 attr->ia_size > inode->i_size) {
1630 inode->i_size = attr->ia_size;
1631 inode->i_blocks =
1632 (attr->ia_size + (1 << 9) - 1) >> 9;
1633 inode->i_ctime = attr->ia_ctime;
1634 ci->i_reported_size = attr->ia_size;
1635 dirtied |= CEPH_CAP_FILE_EXCL;
1636 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1637 attr->ia_size != inode->i_size) {
1638 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
1639 req->r_args.setattr.old_size =
1640 cpu_to_le64(inode->i_size);
1641 mask |= CEPH_SETATTR_SIZE;
1642 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1643 CEPH_CAP_FILE_WR;
1644 }
1645 }
1646
1647 /* these do nothing */
1648 if (ia_valid & ATTR_CTIME) {
1649 bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
1650 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
1651 dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
1652 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
1653 attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
1654 only ? "ctime only" : "ignored");
1655 inode->i_ctime = attr->ia_ctime;
1656 if (only) {
1657 /*
1658 * if kernel wants to dirty ctime but nothing else,
1659 * we need to choose a cap to dirty under, or do
1660 * an almost-no-op setattr
1661 */
1662 if (issued & CEPH_CAP_AUTH_EXCL)
1663 dirtied |= CEPH_CAP_AUTH_EXCL;
1664 else if (issued & CEPH_CAP_FILE_EXCL)
1665 dirtied |= CEPH_CAP_FILE_EXCL;
1666 else if (issued & CEPH_CAP_XATTR_EXCL)
1667 dirtied |= CEPH_CAP_XATTR_EXCL;
1668 else
1669 mask |= CEPH_SETATTR_CTIME;
1670 }
1671 }
1672 if (ia_valid & ATTR_FILE)
1673 dout("setattr %p ATTR_FILE ... hrm!\n", inode);
1674
1675 if (dirtied) {
1676 __ceph_mark_dirty_caps(ci, dirtied);
1677 inode->i_ctime = CURRENT_TIME;
1678 }
1679
1680 release &= issued;
1681 spin_unlock(&inode->i_lock);
1682
1683 if (mask) {
1684 req->r_inode = igrab(inode);
1685 req->r_inode_drop = release;
1686 req->r_args.setattr.mask = cpu_to_le32(mask);
1687 req->r_num_caps = 1;
1688 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1689 }
1690 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1691 ceph_cap_string(dirtied), mask);
1692
1693 ceph_mdsc_put_request(req);
1694 __ceph_do_pending_vmtruncate(inode);
1695 return err;
1696out:
1697 spin_unlock(&inode->i_lock);
1698 ceph_mdsc_put_request(req);
1699 return err;
1700}
1701
1702/*
1703 * Verify that we have a lease on the given mask. If not,
1704 * do a getattr against an mds.
1705 */
1706int ceph_do_getattr(struct inode *inode, int mask)
1707{
1708 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
1709 struct ceph_mds_client *mdsc = &client->mdsc;
1710 struct ceph_mds_request *req;
1711 int err;
1712
1713 if (ceph_snap(inode) == CEPH_SNAPDIR) {
1714 dout("do_getattr inode %p SNAPDIR\n", inode);
1715 return 0;
1716 }
1717
1718 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
1719 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1720 return 0;
1721
1722 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1723 if (IS_ERR(req))
1724 return PTR_ERR(req);
1725 req->r_inode = igrab(inode);
1726 req->r_num_caps = 1;
1727 req->r_args.getattr.mask = cpu_to_le32(mask);
1728 err = ceph_mdsc_do_request(mdsc, NULL, req);
1729 ceph_mdsc_put_request(req);
1730 dout("do_getattr result=%d\n", err);
1731 return err;
1732}
1733
1734
1735/*
1736 * Check inode permissions. We verify we have a valid value for
1737 * the AUTH cap, then call the generic handler.
1738 */
1739int ceph_permission(struct inode *inode, int mask)
1740{
1741 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1742
1743 if (!err)
1744 err = generic_permission(inode, mask, NULL);
1745 return err;
1746}
1747
1748/*
1749 * Get all attributes. Hopefully someday we'll have a statlite()
1750 * and can limit the fields we require to be accurate.
1751 */
1752int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1753 struct kstat *stat)
1754{
1755 struct inode *inode = dentry->d_inode;
1756 struct ceph_inode_info *ci = ceph_inode(inode);
1757 int err;
1758
1759 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1760 if (!err) {
1761 generic_fillattr(inode, stat);
1762 stat->ino = inode->i_ino;
1763 if (ceph_snap(inode) != CEPH_NOSNAP)
1764 stat->dev = ceph_snap(inode);
1765 else
1766 stat->dev = 0;
1767 if (S_ISDIR(inode->i_mode)) {
1768 stat->size = ci->i_rbytes;
1769 stat->blocks = 0;
1770 stat->blksize = 65536;
1771 }
1772 }
1773 return err;
1774}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..8a5bcae62846
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
1#include <linux/in.h>
2
3#include "ioctl.h"
4#include "super.h"
5#include "ceph_debug.h"
6
7
8/*
9 * ioctls
10 */
11
12/*
13 * get and set the file layout
14 */
15static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
16{
17 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
18 struct ceph_ioctl_layout l;
19 int err;
20
21 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
22 if (!err) {
23 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
24 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
25 l.object_size = ceph_file_layout_object_size(ci->i_layout);
26 l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
27 l.preferred_osd =
28 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
29 if (copy_to_user(arg, &l, sizeof(l)))
30 return -EFAULT;
31 }
32
33 return err;
34}
35
36static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{
38 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l;
43 int err, i;
44
45 /* copy and validate */
46 if (copy_from_user(&l, arg, sizeof(l)))
47 return -EFAULT;
48
49 if ((l.object_size & ~PAGE_MASK) ||
50 (l.stripe_unit & ~PAGE_MASK) ||
51 !l.stripe_unit ||
52 (l.object_size &&
53 (unsigned)l.object_size % (unsigned)l.stripe_unit))
54 return -EINVAL;
55
56 /* make sure it's a valid data pool */
57 if (l.data_pool > 0) {
58 mutex_lock(&mdsc->mutex);
59 err = -EINVAL;
60 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
61 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
62 err = 0;
63 break;
64 }
65 mutex_unlock(&mdsc->mutex);
66 if (err)
67 return err;
68 }
69
70 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
71 USE_AUTH_MDS);
72 if (IS_ERR(req))
73 return PTR_ERR(req);
74 req->r_inode = igrab(inode);
75 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
76
77 req->r_args.setlayout.layout.fl_stripe_unit =
78 cpu_to_le32(l.stripe_unit);
79 req->r_args.setlayout.layout.fl_stripe_count =
80 cpu_to_le32(l.stripe_count);
81 req->r_args.setlayout.layout.fl_object_size =
82 cpu_to_le32(l.object_size);
83 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
84 req->r_args.setlayout.layout.fl_pg_preferred =
85 cpu_to_le32(l.preferred_osd);
86
87 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
88 ceph_mdsc_put_request(req);
89 return err;
90}
91
92/*
93 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset.
95 */
96static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
97{
98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
102 u64 len = 1, olen;
103 u64 tmp;
104 struct ceph_object_layout ol;
105 struct ceph_pg pgid;
106
107 /* copy and validate */
108 if (copy_from_user(&dl, arg, sizeof(dl)))
109 return -EFAULT;
110
111 down_read(&osdc->map_sem);
112 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
113 &dl.object_no, &dl.object_offset, &olen);
114 dl.file_offset -= dl.object_offset;
115 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
116 dl.block_size = ceph_file_layout_su(ci->i_layout);
117
118 /* block_offset = object_offset % block_size */
119 tmp = dl.object_offset;
120 dl.block_offset = do_div(tmp, dl.block_size);
121
122 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
123 ceph_ino(inode), dl.object_no);
124 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
125 osdc->osdmap);
126
127 pgid = ol.ol_pgid;
128 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
129 if (dl.osd >= 0) {
130 struct ceph_entity_addr *a =
131 ceph_osd_addr(osdc->osdmap, dl.osd);
132 if (a)
133 memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
134 } else {
135 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
136 }
137 up_read(&osdc->map_sem);
138
139 /* send result back to user */
140 if (copy_to_user(arg, &dl, sizeof(dl)))
141 return -EFAULT;
142
143 return 0;
144}
145
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
149 switch (cmd) {
150 case CEPH_IOC_GET_LAYOUT:
151 return ceph_ioctl_get_layout(file, (void __user *)arg);
152
153 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg);
155
156 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
158 }
159 return -ENOTTY;
160}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
1#ifndef FS_CEPH_IOCTL_H
2#define FS_CEPH_IOCTL_H
3
4#include <linux/ioctl.h>
5#include <linux/types.h>
6
7#define CEPH_IOCTL_MAGIC 0x97
8
9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout {
11 __u64 stripe_unit, stripe_count, object_size;
12 __u64 data_pool;
13 __s64 preferred_osd;
14};
15
16#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout)
20
21/*
22 * Extract identity, address of the OSD and object storing a given
23 * file offset.
24 */
25struct ceph_ioctl_dataloc {
26 __u64 file_offset; /* in+out: file offset */
27 __u64 object_offset; /* out: offset in object */
28 __u64 object_no; /* out: object # */
29 __u64 object_size; /* out: object size */
30 char object_name[64]; /* out: object name */
31 __u64 block_offset; /* out: offset in block */
32 __u64 block_size; /* out: block length */
33 __s64 osd; /* out: osd # */
34 struct sockaddr_storage osd_addr; /* out: osd address */
35};
36
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc)
39
40#endif
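/*
 * Editorial sketch (not part of this patch): minimal userspace use of
 * the CEPH_IOC_GET_LAYOUT ioctl defined above, on a file inside a
 * mounted ceph file system.  CEPH_IOC_GET_DATALOC is driven the same
 * way with a struct ceph_ioctl_dataloc whose file_offset is filled in
 * before the call.  Error handling is abbreviated.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* struct ceph_ioctl_layout, CEPH_IOC_GET_LAYOUT */

int main(int argc, char **argv)
{
	struct ceph_ioctl_layout l;
	int fd;

	if (argc < 2)
		return 2;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) < 0) {
		perror("CEPH_IOC_GET_LAYOUT");
		return 1;
	}
	printf("stripe_unit=%llu stripe_count=%llu object_size=%llu pool=%llu\n",
	       (unsigned long long)l.stripe_unit,
	       (unsigned long long)l.stripe_count,
	       (unsigned long long)l.object_size,
	       (unsigned long long)l.data_pool);
	close(fd);
	return 0;
}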
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..60a9a4ae47be
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3043 @@
1#include "ceph_debug.h"
2
3#include <linux/wait.h>
4#include <linux/slab.h>
5#include <linux/sched.h>
6
7#include "mds_client.h"
8#include "mon_client.h"
9#include "super.h"
10#include "messenger.h"
11#include "decode.h"
12#include "auth.h"
13#include "pagelist.h"
14
15/*
16 * A cluster of MDS (metadata server) daemons is responsible for
17 * managing the file system namespace (the directory hierarchy and
18 * inodes) and for coordinating shared access to storage. Metadata is
19 * partitioning hierarchically across a number of servers, and that
20 * partitioned hierarchically across a number of servers, and that
21 * in order to balance load.
22 *
23 * The MDS client is primarily responsible for managing synchronous
24 * metadata requests for operations like open, unlink, and so forth.
25 * If there is an MDS failure, we find out about it when we (possibly
26 * request and) receive a new MDS map, and can resubmit affected
27 * requests.
28 *
29 * For the most part, though, we take advantage of a lossless
30 * communications channel to the MDS, and do not need to worry about
31 * timing out or resubmitting requests.
32 *
33 * We maintain a stateful "session" with each MDS we interact with.
34 * Within each session, we send periodic heartbeat messages to ensure
35 * any capabilities or leases we have been issued remain valid. If
36 * the session times out and goes stale, our leases and capabilities
37 * are no longer valid.
38 */
39
40static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head);
42
43static const struct ceph_connection_operations mds_con_ops;
44
45
46/*
47 * mds reply parsing
48 */
49
50/*
51 * parse individual inode info
52 */
53static int parse_reply_info_in(void **p, void *end,
54 struct ceph_mds_reply_info_in *info)
55{
56 int err = -EIO;
57
58 info->in = *p;
59 *p += sizeof(struct ceph_mds_reply_inode) +
60 sizeof(*info->in->fragtree.splits) *
61 le32_to_cpu(info->in->fragtree.nsplits);
62
63 ceph_decode_32_safe(p, end, info->symlink_len, bad);
64 ceph_decode_need(p, end, info->symlink_len, bad);
65 info->symlink = *p;
66 *p += info->symlink_len;
67
68 ceph_decode_32_safe(p, end, info->xattr_len, bad);
69 ceph_decode_need(p, end, info->xattr_len, bad);
70 info->xattr_data = *p;
71 *p += info->xattr_len;
72 return 0;
73bad:
74 return err;
75}
76
77/*
78 * parse a normal reply, which may contain a (dir+)dentry and/or a
79 * target inode.
80 */
81static int parse_reply_info_trace(void **p, void *end,
82 struct ceph_mds_reply_info_parsed *info)
83{
84 int err;
85
86 if (info->head->is_dentry) {
87 err = parse_reply_info_in(p, end, &info->diri);
88 if (err < 0)
89 goto out_bad;
90
91 if (unlikely(*p + sizeof(*info->dirfrag) > end))
92 goto bad;
93 info->dirfrag = *p;
94 *p += sizeof(*info->dirfrag) +
95 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
96 if (unlikely(*p > end))
97 goto bad;
98
99 ceph_decode_32_safe(p, end, info->dname_len, bad);
100 ceph_decode_need(p, end, info->dname_len, bad);
101 info->dname = *p;
102 *p += info->dname_len;
103 info->dlease = *p;
104 *p += sizeof(*info->dlease);
105 }
106
107 if (info->head->is_target) {
108 err = parse_reply_info_in(p, end, &info->targeti);
109 if (err < 0)
110 goto out_bad;
111 }
112
113 if (unlikely(*p != end))
114 goto bad;
115 return 0;
116
117bad:
118 err = -EIO;
119out_bad:
120 pr_err("problem parsing mds trace %d\n", err);
121 return err;
122}
123
124/*
125 * parse readdir results
126 */
127static int parse_reply_info_dir(void **p, void *end,
128 struct ceph_mds_reply_info_parsed *info)
129{
130 u32 num, i = 0;
131 int err;
132
133 info->dir_dir = *p;
134 if (*p + sizeof(*info->dir_dir) > end)
135 goto bad;
136 *p += sizeof(*info->dir_dir) +
137 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
138 if (*p > end)
139 goto bad;
140
141 ceph_decode_need(p, end, sizeof(num) + 2, bad);
142 num = ceph_decode_32(p);
143 info->dir_end = ceph_decode_8(p);
144 info->dir_complete = ceph_decode_8(p);
145 if (num == 0)
146 goto done;
147
148 /* alloc large array */
149 info->dir_nr = num;
150 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
151 sizeof(*info->dir_dname) +
152 sizeof(*info->dir_dname_len) +
153 sizeof(*info->dir_dlease),
154 GFP_NOFS);
155 if (info->dir_in == NULL) {
156 err = -ENOMEM;
157 goto out_bad;
158 }
159 info->dir_dname = (void *)(info->dir_in + num);
160 info->dir_dname_len = (void *)(info->dir_dname + num);
161 info->dir_dlease = (void *)(info->dir_dname_len + num);
162
163 while (num) {
164 /* dentry */
165 ceph_decode_need(p, end, sizeof(u32)*2, bad);
166 info->dir_dname_len[i] = ceph_decode_32(p);
167 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
168 info->dir_dname[i] = *p;
169 *p += info->dir_dname_len[i];
170 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
171 info->dir_dname[i]);
172 info->dir_dlease[i] = *p;
173 *p += sizeof(struct ceph_mds_reply_lease);
174
175 /* inode */
176 err = parse_reply_info_in(p, end, &info->dir_in[i]);
177 if (err < 0)
178 goto out_bad;
179 i++;
180 num--;
181 }
182
183done:
184 if (*p != end)
185 goto bad;
186 return 0;
187
188bad:
189 err = -EIO;
190out_bad:
191 pr_err("problem parsing dir contents %d\n", err);
192 return err;
193}
194
195/*
196 * parse entire mds reply
197 */
198static int parse_reply_info(struct ceph_msg *msg,
199 struct ceph_mds_reply_info_parsed *info)
200{
201 void *p, *end;
202 u32 len;
203 int err;
204
205 info->head = msg->front.iov_base;
206 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
207 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
208
209 /* trace */
210 ceph_decode_32_safe(&p, end, len, bad);
211 if (len > 0) {
212 err = parse_reply_info_trace(&p, p+len, info);
213 if (err < 0)
214 goto out_bad;
215 }
216
217 /* dir content */
218 ceph_decode_32_safe(&p, end, len, bad);
219 if (len > 0) {
220 err = parse_reply_info_dir(&p, p+len, info);
221 if (err < 0)
222 goto out_bad;
223 }
224
225 /* snap blob */
226 ceph_decode_32_safe(&p, end, len, bad);
227 info->snapblob_len = len;
228 info->snapblob = p;
229 p += len;
230
231 if (p != end)
232 goto bad;
233 return 0;
234
235bad:
236 err = -EIO;
237out_bad:
238 pr_err("mds parse_reply err %d\n", err);
239 return err;
240}
241
242static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
243{
244 kfree(info->dir_in);
245}
246
247
248/*
249 * sessions
250 */
251static const char *session_state_name(int s)
252{
253 switch (s) {
254 case CEPH_MDS_SESSION_NEW: return "new";
255 case CEPH_MDS_SESSION_OPENING: return "opening";
256 case CEPH_MDS_SESSION_OPEN: return "open";
257 case CEPH_MDS_SESSION_HUNG: return "hung";
258 case CEPH_MDS_SESSION_CLOSING: return "closing";
259 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
260 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
261 default: return "???";
262 }
263}
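/*
 * Rough lifecycle of the states above, as used in this file: a session
 * starts NEW (register_session), becomes OPENING when we send the open
 * request (__open_session), and OPEN once the MDS acknowledges it
 * (handle_session).  __close_session marks it CLOSING; HUNG covers a
 * session whose MDS has stopped responding, and RESTARTING and
 * RECONNECTING are used while re-establishing state with a failed or
 * recovering MDS (send_mds_reconnect).
 */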
264
265static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
266{
267 if (atomic_inc_not_zero(&s->s_ref)) {
268 dout("mdsc get_session %p %d -> %d\n", s,
269 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
270 return s;
271 } else {
272 dout("mdsc get_session %p 0 -- FAIL", s);
273 return NULL;
274 }
275}
276
277void ceph_put_mds_session(struct ceph_mds_session *s)
278{
279 dout("mdsc put_session %p %d -> %d\n", s,
280 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
281 if (atomic_dec_and_test(&s->s_ref)) {
282 if (s->s_authorizer)
283 s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
284 s->s_mdsc->client->monc.auth, s->s_authorizer);
285 kfree(s);
286 }
287}
288
289/*
290 * called under mdsc->mutex
291 */
292struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
293 int mds)
294{
295 struct ceph_mds_session *session;
296
297 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
298 return NULL;
299 session = mdsc->sessions[mds];
300 dout("lookup_mds_session %p %d\n", session,
301 atomic_read(&session->s_ref));
302 get_session(session);
303 return session;
304}
305
306static bool __have_session(struct ceph_mds_client *mdsc, int mds)
307{
308 if (mds >= mdsc->max_sessions)
309 return false;
310 return mdsc->sessions[mds];
311}
312
313static int __verify_registered_session(struct ceph_mds_client *mdsc,
314 struct ceph_mds_session *s)
315{
316 if (s->s_mds >= mdsc->max_sessions ||
317 mdsc->sessions[s->s_mds] != s)
318 return -ENOENT;
319 return 0;
320}
321
322/*
323 * create+register a new session for given mds.
324 * called under mdsc->mutex.
325 */
326static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
327 int mds)
328{
329 struct ceph_mds_session *s;
330
331 s = kzalloc(sizeof(*s), GFP_NOFS);
332 if (!s)
333 return ERR_PTR(-ENOMEM);
334 s->s_mdsc = mdsc;
335 s->s_mds = mds;
336 s->s_state = CEPH_MDS_SESSION_NEW;
337 s->s_ttl = 0;
338 s->s_seq = 0;
339 mutex_init(&s->s_mutex);
340
341 ceph_con_init(mdsc->client->msgr, &s->s_con);
342 s->s_con.private = s;
343 s->s_con.ops = &mds_con_ops;
344 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
345 s->s_con.peer_name.num = cpu_to_le64(mds);
346
347 spin_lock_init(&s->s_cap_lock);
348 s->s_cap_gen = 0;
349 s->s_cap_ttl = 0;
350 s->s_renew_requested = 0;
351 s->s_renew_seq = 0;
352 INIT_LIST_HEAD(&s->s_caps);
353 s->s_nr_caps = 0;
354 s->s_trim_caps = 0;
355 atomic_set(&s->s_ref, 1);
356 INIT_LIST_HEAD(&s->s_waiting);
357 INIT_LIST_HEAD(&s->s_unsafe);
358 s->s_num_cap_releases = 0;
359 s->s_cap_iterator = NULL;
360 INIT_LIST_HEAD(&s->s_cap_releases);
361 INIT_LIST_HEAD(&s->s_cap_releases_done);
362 INIT_LIST_HEAD(&s->s_cap_flushing);
363 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
364
365 dout("register_session mds%d\n", mds);
366 if (mds >= mdsc->max_sessions) {
367 int newmax = 1 << get_count_order(mds+1);
368 struct ceph_mds_session **sa;
369
370 dout("register_session realloc to %d\n", newmax);
371 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
372 if (sa == NULL)
373 goto fail_realloc;
374 if (mdsc->sessions) {
375 memcpy(sa, mdsc->sessions,
376 mdsc->max_sessions * sizeof(void *));
377 kfree(mdsc->sessions);
378 }
379 mdsc->sessions = sa;
380 mdsc->max_sessions = newmax;
381 }
382 mdsc->sessions[mds] = s;
383 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
384
385 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
386
387 return s;
388
389fail_realloc:
390 kfree(s);
391 return ERR_PTR(-ENOMEM);
392}
393
394/*
395 * called under mdsc->mutex
396 */
397static void __unregister_session(struct ceph_mds_client *mdsc,
398 struct ceph_mds_session *s)
399{
400 dout("__unregister_session mds%d %p\n", s->s_mds, s);
401 BUG_ON(mdsc->sessions[s->s_mds] != s);
402 mdsc->sessions[s->s_mds] = NULL;
403 ceph_con_close(&s->s_con);
404 ceph_put_mds_session(s);
405}
406
407/*
408 * drop session refs in request.
409 *
410 * should be last request ref, or hold mdsc->mutex
411 */
412static void put_request_session(struct ceph_mds_request *req)
413{
414 if (req->r_session) {
415 ceph_put_mds_session(req->r_session);
416 req->r_session = NULL;
417 }
418}
419
420void ceph_mdsc_release_request(struct kref *kref)
421{
422 struct ceph_mds_request *req = container_of(kref,
423 struct ceph_mds_request,
424 r_kref);
425 if (req->r_request)
426 ceph_msg_put(req->r_request);
427 if (req->r_reply) {
428 ceph_msg_put(req->r_reply);
429 destroy_reply_info(&req->r_reply_info);
430 }
431 if (req->r_inode) {
432 ceph_put_cap_refs(ceph_inode(req->r_inode),
433 CEPH_CAP_PIN);
434 iput(req->r_inode);
435 }
436 if (req->r_locked_dir)
437 ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
438 CEPH_CAP_PIN);
439 if (req->r_target_inode)
440 iput(req->r_target_inode);
441 if (req->r_dentry)
442 dput(req->r_dentry);
443 if (req->r_old_dentry) {
444 ceph_put_cap_refs(
445 ceph_inode(req->r_old_dentry->d_parent->d_inode),
446 CEPH_CAP_PIN);
447 dput(req->r_old_dentry);
448 }
449 kfree(req->r_path1);
450 kfree(req->r_path2);
451 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation);
453 kfree(req);
454}
455
456/*
457 * lookup request, bump ref if found.
458 *
459 * called under mdsc->mutex.
460 */
461static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
462 u64 tid)
463{
464 struct ceph_mds_request *req;
465 struct rb_node *n = mdsc->request_tree.rb_node;
466
467 while (n) {
468 req = rb_entry(n, struct ceph_mds_request, r_node);
469 if (tid < req->r_tid)
470 n = n->rb_left;
471 else if (tid > req->r_tid)
472 n = n->rb_right;
473 else {
474 ceph_mdsc_get_request(req);
475 return req;
476 }
477 }
478 return NULL;
479}
480
481static void __insert_request(struct ceph_mds_client *mdsc,
482 struct ceph_mds_request *new)
483{
484 struct rb_node **p = &mdsc->request_tree.rb_node;
485 struct rb_node *parent = NULL;
486 struct ceph_mds_request *req = NULL;
487
488 while (*p) {
489 parent = *p;
490 req = rb_entry(parent, struct ceph_mds_request, r_node);
491 if (new->r_tid < req->r_tid)
492 p = &(*p)->rb_left;
493 else if (new->r_tid > req->r_tid)
494 p = &(*p)->rb_right;
495 else
496 BUG();
497 }
498
499 rb_link_node(&new->r_node, parent, p);
500 rb_insert_color(&new->r_node, &mdsc->request_tree);
501}
502
503/*
504 * Register an in-flight request, and assign a tid. Link to the directory
505 * we are modifying (if any).
506 *
507 * Called under mdsc->mutex.
508 */
509static void __register_request(struct ceph_mds_client *mdsc,
510 struct ceph_mds_request *req,
511 struct inode *dir)
512{
513 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req);
519
520 if (dir) {
521 struct ceph_inode_info *ci = ceph_inode(dir);
522
523 spin_lock(&ci->i_unsafe_lock);
524 req->r_unsafe_dir = dir;
525 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
526 spin_unlock(&ci->i_unsafe_lock);
527 }
528}
529
530static void __unregister_request(struct ceph_mds_client *mdsc,
531 struct ceph_mds_request *req)
532{
533 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
534 rb_erase(&req->r_node, &mdsc->request_tree);
535 RB_CLEAR_NODE(&req->r_node);
536
537 if (req->r_unsafe_dir) {
538 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
539
540 spin_lock(&ci->i_unsafe_lock);
541 list_del_init(&req->r_unsafe_dir_item);
542 spin_unlock(&ci->i_unsafe_lock);
543 }
544
545 ceph_mdsc_put_request(req);
546}
547
548/*
549 * Choose mds to send request to next. If there is a hint set in the
550 * request (e.g., due to a prior forward hint from the mds), use that.
551 * Otherwise, consult frag tree and/or caps to identify the
552 * appropriate mds. If all else fails, choose randomly.
553 *
554 * Called under mdsc->mutex.
555 */
556static int __choose_mds(struct ceph_mds_client *mdsc,
557 struct ceph_mds_request *req)
558{
559 struct inode *inode;
560 struct ceph_inode_info *ci;
561 struct ceph_cap *cap;
562 int mode = req->r_direct_mode;
563 int mds = -1;
564 u32 hash = req->r_direct_hash;
565 bool is_hash = req->r_direct_is_hash;
566
567 /*
568 * is there a specific mds we should try? ignore hint if we have
569 * no session and the mds is not up (active or recovering).
570 */
571 if (req->r_resend_mds >= 0 &&
572 (__have_session(mdsc, req->r_resend_mds) ||
573 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
574 dout("choose_mds using resend_mds mds%d\n",
575 req->r_resend_mds);
576 return req->r_resend_mds;
577 }
578
579 if (mode == USE_RANDOM_MDS)
580 goto random;
581
582 inode = NULL;
583 if (req->r_inode) {
584 inode = req->r_inode;
585 } else if (req->r_dentry) {
586 if (req->r_dentry->d_inode) {
587 inode = req->r_dentry->d_inode;
588 } else {
589 inode = req->r_dentry->d_parent->d_inode;
590 hash = req->r_dentry->d_name.hash;
591 is_hash = true;
592 }
593 }
594 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
595 (int)hash, mode);
596 if (!inode)
597 goto random;
598 ci = ceph_inode(inode);
599
600 if (is_hash && S_ISDIR(inode->i_mode)) {
601 struct ceph_inode_frag frag;
602 int found;
603
604 ceph_choose_frag(ci, hash, &frag, &found);
605 if (found) {
606 if (mode == USE_ANY_MDS && frag.ndist > 0) {
607 u8 r;
608
609 /* choose a random replica */
610 get_random_bytes(&r, 1);
611 r %= frag.ndist;
612 mds = frag.dist[r];
613 dout("choose_mds %p %llx.%llx "
614 "frag %u mds%d (%d/%d)\n",
615 inode, ceph_vinop(inode),
616 frag.frag, frag.mds,
617 (int)r, frag.ndist);
618 return mds;
619 }
620
621 /* since this file/dir wasn't known to be
622 * replicated, we want to look for the
623 * authoritative mds. */
624 mode = USE_AUTH_MDS;
625 if (frag.mds >= 0) {
626 /* choose auth mds */
627 mds = frag.mds;
628 dout("choose_mds %p %llx.%llx "
629 "frag %u mds%d (auth)\n",
630 inode, ceph_vinop(inode), frag.frag, mds);
631 return mds;
632 }
633 }
634 }
635
636 spin_lock(&inode->i_lock);
637 cap = NULL;
638 if (mode == USE_AUTH_MDS)
639 cap = ci->i_auth_cap;
640 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
641 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
642 if (!cap) {
643 spin_unlock(&inode->i_lock);
644 goto random;
645 }
646 mds = cap->session->s_mds;
647 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
648 inode, ceph_vinop(inode), mds,
649 cap == ci->i_auth_cap ? "auth " : "", cap);
650 spin_unlock(&inode->i_lock);
651 return mds;
652
653random:
654 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
655 dout("choose_mds chose random mds%d\n", mds);
656 return mds;
657}
658
659
660/*
661 * session messages
662 */
663static struct ceph_msg *create_session_msg(u32 op, u64 seq)
664{
665 struct ceph_msg *msg;
666 struct ceph_mds_session_head *h;
667
668 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
669 if (IS_ERR(msg)) {
670 pr_err("create_session_msg ENOMEM creating msg\n");
671 return ERR_PTR(PTR_ERR(msg));
672 }
673 h = msg->front.iov_base;
674 h->op = cpu_to_le32(op);
675 h->seq = cpu_to_le64(seq);
676 return msg;
677}
678
679/*
680 * send session open request.
681 *
682 * called under mdsc->mutex
683 */
684static int __open_session(struct ceph_mds_client *mdsc,
685 struct ceph_mds_session *session)
686{
687 struct ceph_msg *msg;
688 int mstate;
689 int mds = session->s_mds;
690 int err = 0;
691
692 /* wait for mds to go active? */
693 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
694 dout("open_session to mds%d (%s)\n", mds,
695 ceph_mds_state_name(mstate));
696 session->s_state = CEPH_MDS_SESSION_OPENING;
697 session->s_renew_requested = jiffies;
698
699 /* send connect message */
700 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
701 if (IS_ERR(msg)) {
702 err = PTR_ERR(msg);
703 goto out;
704 }
705 ceph_con_send(&session->s_con, msg);
706
707out:
708 return err;
709}
710
711/*
712 * session caps
713 */
714
715/*
716 * Free preallocated cap messages assigned to this session
717 */
718static void cleanup_cap_releases(struct ceph_mds_session *session)
719{
720 struct ceph_msg *msg;
721
722 spin_lock(&session->s_cap_lock);
723 while (!list_empty(&session->s_cap_releases)) {
724 msg = list_first_entry(&session->s_cap_releases,
725 struct ceph_msg, list_head);
726 list_del_init(&msg->list_head);
727 ceph_msg_put(msg);
728 }
729 while (!list_empty(&session->s_cap_releases_done)) {
730 msg = list_first_entry(&session->s_cap_releases_done,
731 struct ceph_msg, list_head);
732 list_del_init(&msg->list_head);
733 ceph_msg_put(msg);
734 }
735 spin_unlock(&session->s_cap_lock);
736}
737
738/*
739 * Helper to safely iterate over all caps associated with a session.
740 *
741 * caller must hold session s_mutex
742 */
743static int iterate_session_caps(struct ceph_mds_session *session,
744 int (*cb)(struct inode *, struct ceph_cap *,
745 void *), void *arg)
746{
747 struct list_head *p;
748 struct ceph_cap *cap;
749 struct inode *inode, *last_inode = NULL;
750 struct ceph_cap *old_cap = NULL;
751 int ret;
752
753 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
754 spin_lock(&session->s_cap_lock);
755 p = session->s_caps.next;
756 while (p != &session->s_caps) {
757 cap = list_entry(p, struct ceph_cap, session_caps);
758 inode = igrab(&cap->ci->vfs_inode);
759 if (!inode) {
760 p = p->next;
761 continue;
762 }
763 session->s_cap_iterator = cap;
764 spin_unlock(&session->s_cap_lock);
765
766 if (last_inode) {
767 iput(last_inode);
768 last_inode = NULL;
769 }
770 if (old_cap) {
771 ceph_put_cap(old_cap);
772 old_cap = NULL;
773 }
774
775 ret = cb(inode, cap, arg);
776 last_inode = inode;
777
778 spin_lock(&session->s_cap_lock);
779 p = p->next;
780 if (cap->ci == NULL) {
781 dout("iterate_session_caps finishing cap %p removal\n",
782 cap);
783 BUG_ON(cap->session != session);
784 list_del_init(&cap->session_caps);
785 session->s_nr_caps--;
786 cap->session = NULL;
787 old_cap = cap; /* put_cap it w/o locks held */
788 }
789 if (ret < 0)
790 goto out;
791 }
792 ret = 0;
793out:
794 session->s_cap_iterator = NULL;
795 spin_unlock(&session->s_cap_lock);
796
797 if (last_inode)
798 iput(last_inode);
799 if (old_cap)
800 ceph_put_cap(old_cap);
801
802 return ret;
803}
804
805static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
806 void *arg)
807{
808 struct ceph_inode_info *ci = ceph_inode(inode);
809 dout("removing cap %p, ci is %p, inode is %p\n",
810 cap, ci, &ci->vfs_inode);
811 ceph_remove_cap(cap);
812 return 0;
813}
814
815/*
816 * caller must hold session s_mutex
817 */
818static void remove_session_caps(struct ceph_mds_session *session)
819{
820 dout("remove_session_caps on %p\n", session);
821 iterate_session_caps(session, remove_session_caps_cb, NULL);
822 BUG_ON(session->s_nr_caps > 0);
823 cleanup_cap_releases(session);
824}
825
826/*
827 * wake up any threads waiting on this session's caps. if the cap is
828 * old (didn't get renewed on the client reconnect), remove it now.
829 *
830 * caller must hold s_mutex.
831 */
832static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
833 void *arg)
834{
835 struct ceph_inode_info *ci = ceph_inode(inode);
836
837 wake_up(&ci->i_cap_wq);
838 if (arg) {
839 spin_lock(&inode->i_lock);
840 ci->i_wanted_max_size = 0;
841 ci->i_requested_max_size = 0;
842 spin_unlock(&inode->i_lock);
843 }
844 return 0;
845}
846
847static void wake_up_session_caps(struct ceph_mds_session *session,
848 int reconnect)
849{
850 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
851 iterate_session_caps(session, wake_up_session_cb,
852 (void *)(unsigned long)reconnect);
853}
854
855/*
856 * Send periodic message to MDS renewing all currently held caps. The
857 * ack will reset the expiration for all caps from this session.
858 *
859 * caller holds s_mutex
860 */
861static int send_renew_caps(struct ceph_mds_client *mdsc,
862 struct ceph_mds_session *session)
863{
864 struct ceph_msg *msg;
865 int state;
866
867 if (time_after_eq(jiffies, session->s_cap_ttl) &&
868 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
869 pr_info("mds%d caps stale\n", session->s_mds);
870 session->s_renew_requested = jiffies;
871
872 /* do not try to renew caps until a recovering mds has reconnected
873 * with its clients. */
874 state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
875 if (state < CEPH_MDS_STATE_RECONNECT) {
876 dout("send_renew_caps ignoring mds%d (%s)\n",
877 session->s_mds, ceph_mds_state_name(state));
878 return 0;
879 }
880
881 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
882 ceph_mds_state_name(state));
883 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
884 ++session->s_renew_seq);
885 if (IS_ERR(msg))
886 return PTR_ERR(msg);
887 ceph_con_send(&session->s_con, msg);
888 return 0;
889}
890
891/*
892 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
893 *
894 * Called under session->s_mutex
895 */
896static void renewed_caps(struct ceph_mds_client *mdsc,
897 struct ceph_mds_session *session, int is_renew)
898{
899 int was_stale;
900 int wake = 0;
901
902 spin_lock(&session->s_cap_lock);
903 was_stale = is_renew && (session->s_cap_ttl == 0 ||
904 time_after_eq(jiffies, session->s_cap_ttl));
905
906 session->s_cap_ttl = session->s_renew_requested +
907 mdsc->mdsmap->m_session_timeout*HZ;
908
909 if (was_stale) {
910 if (time_before(jiffies, session->s_cap_ttl)) {
911 pr_info("mds%d caps renewed\n", session->s_mds);
912 wake = 1;
913 } else {
914 pr_info("mds%d caps still stale\n", session->s_mds);
915 }
916 }
917 dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
918 session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
919 time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
920 spin_unlock(&session->s_cap_lock);
921
922 if (wake)
923 wake_up_session_caps(session, 0);
924}
925
926/*
927 * send a session close request
928 */
929static int request_close_session(struct ceph_mds_client *mdsc,
930 struct ceph_mds_session *session)
931{
932 struct ceph_msg *msg;
933 int err = 0;
934
935 dout("request_close_session mds%d state %s seq %lld\n",
936 session->s_mds, session_state_name(session->s_state),
937 session->s_seq);
938 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
939 if (IS_ERR(msg))
940 err = PTR_ERR(msg);
941 else
942 ceph_con_send(&session->s_con, msg);
943 return err;
944}
945
946/*
947 * Called with s_mutex held.
948 */
949static int __close_session(struct ceph_mds_client *mdsc,
950 struct ceph_mds_session *session)
951{
952 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
953 return 0;
954 session->s_state = CEPH_MDS_SESSION_CLOSING;
955 return request_close_session(mdsc, session);
956}
957
958/*
959 * Trim old(er) caps.
960 *
961 * Because we can't cache an inode without one or more caps, we do
962 * this indirectly: if a cap is unused, we prune its aliases, at which
963 * point the inode will hopefully get dropped too.
964 *
965 * Yes, this is a bit sloppy. Our only real goal here is to respond to
966 * memory pressure from the MDS, though, so it needn't be perfect.
967 */
968static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
969{
970 struct ceph_mds_session *session = arg;
971 struct ceph_inode_info *ci = ceph_inode(inode);
972 int used, oissued, mine;
973
974 if (session->s_trim_caps <= 0)
975 return -1;
976
977 spin_lock(&inode->i_lock);
978 mine = cap->issued | cap->implemented;
979 used = __ceph_caps_used(ci);
980 oissued = __ceph_caps_issued_other(ci, cap);
981
982 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
983 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
984 ceph_cap_string(used));
985 if (ci->i_dirty_caps)
986 goto out; /* dirty caps */
987 if ((used & ~oissued) & mine)
988 goto out; /* we need these caps */
989
990 session->s_trim_caps--;
991 if (oissued) {
992 /* we aren't the only cap.. just remove us */
993 __ceph_remove_cap(cap);
994 } else {
995 /* try to drop referring dentries */
996 spin_unlock(&inode->i_lock);
997 d_prune_aliases(inode);
998 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
999 inode, cap, atomic_read(&inode->i_count));
1000 return 0;
1001 }
1002
1003out:
1004 spin_unlock(&inode->i_lock);
1005 return 0;
1006}
1007
1008/*
1009 * Trim session cap count down to some max number.
1010 */
1011static int trim_caps(struct ceph_mds_client *mdsc,
1012 struct ceph_mds_session *session,
1013 int max_caps)
1014{
1015 int trim_caps = session->s_nr_caps - max_caps;
1016
1017 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1018 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1019 if (trim_caps > 0) {
1020 session->s_trim_caps = trim_caps;
1021 iterate_session_caps(session, trim_caps_cb, session);
1022 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1023 session->s_mds, session->s_nr_caps, max_caps,
1024 trim_caps - session->s_trim_caps);
1025 session->s_trim_caps = 0;
1026 }
1027 return 0;
1028}
1029
1030/*
1031 * Allocate cap_release messages. If there is a partially full message
1032 * in the queue, try to allocate enough to cover its remainder, so that
1033 * we can send it immediately.
1034 *
1035 * Called under s_mutex.
1036 */
1037static int add_cap_releases(struct ceph_mds_client *mdsc,
1038 struct ceph_mds_session *session,
1039 int extra)
1040{
1041 struct ceph_msg *msg;
1042 struct ceph_mds_cap_release *head;
1043 int err = -ENOMEM;
1044
1045 if (extra < 0)
1046 extra = mdsc->client->mount_args->cap_release_safety;
1047
1048 spin_lock(&session->s_cap_lock);
1049
1050 if (!list_empty(&session->s_cap_releases)) {
1051 msg = list_first_entry(&session->s_cap_releases,
1052 struct ceph_msg,
1053 list_head);
1054 head = msg->front.iov_base;
1055 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1056 }
1057
1058 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1059 spin_unlock(&session->s_cap_lock);
1060 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1061 0, 0, NULL);
1062 if (!msg)
1063 goto out_unlocked;
1064 dout("add_cap_releases %p msg %p now %d\n", session, msg,
1065 (int)msg->front.iov_len);
1066 head = msg->front.iov_base;
1067 head->num = cpu_to_le32(0);
1068 msg->front.iov_len = sizeof(*head);
1069 spin_lock(&session->s_cap_lock);
1070 list_add(&msg->list_head, &session->s_cap_releases);
1071 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1072 }
1073
1074 if (!list_empty(&session->s_cap_releases)) {
1075 msg = list_first_entry(&session->s_cap_releases,
1076 struct ceph_msg,
1077 list_head);
1078 head = msg->front.iov_base;
1079 if (head->num) {
1080 dout(" queueing non-full %p (%d)\n", msg,
1081 le32_to_cpu(head->num));
1082 list_move_tail(&msg->list_head,
1083 &session->s_cap_releases_done);
1084 session->s_num_cap_releases -=
1085 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1086 }
1087 }
1088 err = 0;
1089 spin_unlock(&session->s_cap_lock);
1090out_unlocked:
1091 return err;
1092}
1093
1094/*
1095 * flush all dirty inode data to disk.
1096 *
1097 * returns true if we've flushed through want_flush_seq
1098 */
1099static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1100{
1101 int mds, ret = 1;
1102
1103 dout("check_cap_flush want %lld\n", want_flush_seq);
1104 mutex_lock(&mdsc->mutex);
1105 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
1106 struct ceph_mds_session *session = mdsc->sessions[mds];
1107
1108 if (!session)
1109 continue;
1110 get_session(session);
1111 mutex_unlock(&mdsc->mutex);
1112
1113 mutex_lock(&session->s_mutex);
1114 if (!list_empty(&session->s_cap_flushing)) {
1115 struct ceph_inode_info *ci =
1116 list_entry(session->s_cap_flushing.next,
1117 struct ceph_inode_info,
1118 i_flushing_item);
1119 struct inode *inode = &ci->vfs_inode;
1120
1121 spin_lock(&inode->i_lock);
1122 if (ci->i_cap_flush_seq <= want_flush_seq) {
1123 dout("check_cap_flush still flushing %p "
1124 "seq %lld <= %lld to mds%d\n", inode,
1125 ci->i_cap_flush_seq, want_flush_seq,
1126 session->s_mds);
1127 ret = 0;
1128 }
1129 spin_unlock(&inode->i_lock);
1130 }
1131 mutex_unlock(&session->s_mutex);
1132 ceph_put_mds_session(session);
1133
1134 if (!ret)
1135 return ret;
1136 mutex_lock(&mdsc->mutex);
1137 }
1138
1139 mutex_unlock(&mdsc->mutex);
1140 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1141 return ret;
1142}
1143
1144/*
1145 * called under s_mutex
1146 */
1147static void send_cap_releases(struct ceph_mds_client *mdsc,
1148 struct ceph_mds_session *session)
1149{
1150 struct ceph_msg *msg;
1151
1152 dout("send_cap_releases mds%d\n", session->s_mds);
1153 while (1) {
1154 spin_lock(&session->s_cap_lock);
1155 if (list_empty(&session->s_cap_releases_done))
1156 break;
1157 msg = list_first_entry(&session->s_cap_releases_done,
1158 struct ceph_msg, list_head);
1159 list_del_init(&msg->list_head);
1160 spin_unlock(&session->s_cap_lock);
1161 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1162 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1163 ceph_con_send(&session->s_con, msg);
1164 }
1165 spin_unlock(&session->s_cap_lock);
1166}
1167
1168/*
1169 * requests
1170 */
1171
1172/*
1173 * Create an mds request.
1174 */
1175struct ceph_mds_request *
1176ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1177{
1178 struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1179
1180 if (!req)
1181 return ERR_PTR(-ENOMEM);
1182
1183 req->r_started = jiffies;
1184 req->r_resend_mds = -1;
1185 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1186 req->r_fmode = -1;
1187 kref_init(&req->r_kref);
1188 INIT_LIST_HEAD(&req->r_wait);
1189 init_completion(&req->r_completion);
1190 init_completion(&req->r_safe_completion);
1191 INIT_LIST_HEAD(&req->r_unsafe_item);
1192
1193 req->r_op = op;
1194 req->r_direct_mode = mode;
1195 return req;
1196}
1197
1198/*
1199 * return the oldest (lowest) request / tid in the request tree, or NULL / 0 if none.
1200 *
1201 * called under mdsc->mutex.
1202 */
1203static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1204{
1205 if (RB_EMPTY_ROOT(&mdsc->request_tree))
1206 return NULL;
1207 return rb_entry(rb_first(&mdsc->request_tree),
1208 struct ceph_mds_request, r_node);
1209}
1210
1211static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1212{
1213 struct ceph_mds_request *req = __get_oldest_req(mdsc);
1214
1215 if (req)
1216 return req->r_tid;
1217 return 0;
1218}
1219
1220/*
1221 * Build a dentry's path. Allocate on heap; caller must kfree. Based
1222 * on build_path_from_dentry in fs/cifs/dir.c.
1223 *
1224 * If @stop_on_nosnap, generate path relative to the first non-snapped
1225 * inode.
1226 *
1227 * Encode hidden .snap dirs as a double /, i.e.
1228 * foo/.snap/bar -> foo//bar
1229 */
1230char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1231 int stop_on_nosnap)
1232{
1233 struct dentry *temp;
1234 char *path;
1235 int len, pos;
1236
1237 if (dentry == NULL)
1238 return ERR_PTR(-EINVAL);
1239
1240retry:
1241 len = 0;
1242 for (temp = dentry; !IS_ROOT(temp);) {
1243 struct inode *inode = temp->d_inode;
1244 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1245 len++; /* slash only */
1246 else if (stop_on_nosnap && inode &&
1247 ceph_snap(inode) == CEPH_NOSNAP)
1248 break;
1249 else
1250 len += 1 + temp->d_name.len;
1251 temp = temp->d_parent;
1252 if (temp == NULL) {
1253 pr_err("build_path_dentry corrupt dentry %p\n", dentry);
1254 return ERR_PTR(-EINVAL);
1255 }
1256 }
1257 if (len)
1258 len--; /* no leading '/' */
1259
1260 path = kmalloc(len+1, GFP_NOFS);
1261 if (path == NULL)
1262 return ERR_PTR(-ENOMEM);
1263 pos = len;
1264 path[pos] = 0; /* trailing null */
1265 for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1266 struct inode *inode = temp->d_inode;
1267
1268 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1269 dout("build_path_dentry path+%d: %p SNAPDIR\n",
1270 pos, temp);
1271 } else if (stop_on_nosnap && inode &&
1272 ceph_snap(inode) == CEPH_NOSNAP) {
1273 break;
1274 } else {
1275 pos -= temp->d_name.len;
1276 if (pos < 0)
1277 break;
1278 strncpy(path + pos, temp->d_name.name,
1279 temp->d_name.len);
1280 dout("build_path_dentry path+%d: %p '%.*s'\n",
1281 pos, temp, temp->d_name.len, path + pos);
1282 }
1283 if (pos)
1284 path[--pos] = '/';
1285 temp = temp->d_parent;
1286 if (temp == NULL) {
1287 pr_err("build_path_dentry corrupt dentry\n");
1288 kfree(path);
1289 return ERR_PTR(-EINVAL);
1290 }
1291 }
1292 if (pos != 0) {
1293 pr_err("build_path_dentry did not end path lookup where "
1294 "expected, namelen is %d, pos is %d\n", len, pos);
1295 /* presumably this is only possible if racing with a
1296 rename of one of the parent directories (we cannot
1297 lock the dentries above us to prevent this, but
1298 retrying should be harmless) */
1299 kfree(path);
1300 goto retry;
1301 }
1302
1303 *base = ceph_ino(temp->d_inode);
1304 *plen = len;
1305 dout("build_path_dentry on %p %d built %llx '%.*s'\n",
1306 dentry, atomic_read(&dentry->d_count), *base, len, path);
1307 return path;
1308}
1309
1310static int build_dentry_path(struct dentry *dentry,
1311 const char **ppath, int *ppathlen, u64 *pino,
1312 int *pfreepath)
1313{
1314 char *path;
1315
1316 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
1317 *pino = ceph_ino(dentry->d_parent->d_inode);
1318 *ppath = dentry->d_name.name;
1319 *ppathlen = dentry->d_name.len;
1320 return 0;
1321 }
1322 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1323 if (IS_ERR(path))
1324 return PTR_ERR(path);
1325 *ppath = path;
1326 *pfreepath = 1;
1327 return 0;
1328}
1329
1330static int build_inode_path(struct inode *inode,
1331 const char **ppath, int *ppathlen, u64 *pino,
1332 int *pfreepath)
1333{
1334 struct dentry *dentry;
1335 char *path;
1336
1337 if (ceph_snap(inode) == CEPH_NOSNAP) {
1338 *pino = ceph_ino(inode);
1339 *ppathlen = 0;
1340 return 0;
1341 }
1342 dentry = d_find_alias(inode);
1343 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1344 dput(dentry);
1345 if (IS_ERR(path))
1346 return PTR_ERR(path);
1347 *ppath = path;
1348 *pfreepath = 1;
1349 return 0;
1350}
1351
1352/*
1353 * request arguments may be specified via an inode *, a dentry *, or
1354 * an explicit ino+path.
1355 */
1356static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1357 const char *rpath, u64 rino,
1358 const char **ppath, int *pathlen,
1359 u64 *ino, int *freepath)
1360{
1361 int r = 0;
1362
1363 if (rinode) {
1364 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1365 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1366 ceph_snap(rinode));
1367 } else if (rdentry) {
1368 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1369 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1370 *ppath);
1371 } else if (rpath) {
1372 *ino = rino;
1373 *ppath = rpath;
1374 *pathlen = strlen(rpath);
1375 dout(" path %.*s\n", *pathlen, rpath);
1376 }
1377
1378 return r;
1379}
1380
1381/*
1382 * called under mdsc->mutex
1383 */
1384static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1385 struct ceph_mds_request *req,
1386 int mds)
1387{
1388 struct ceph_msg *msg;
1389 struct ceph_mds_request_head *head;
1390 const char *path1 = NULL;
1391 const char *path2 = NULL;
1392 u64 ino1 = 0, ino2 = 0;
1393 int pathlen1 = 0, pathlen2 = 0;
1394 int freepath1 = 0, freepath2 = 0;
1395 int len;
1396 u16 releases;
1397 void *p, *end;
1398 int ret;
1399
1400 ret = set_request_path_attr(req->r_inode, req->r_dentry,
1401 req->r_path1, req->r_ino1.ino,
1402 &path1, &pathlen1, &ino1, &freepath1);
1403 if (ret < 0) {
1404 msg = ERR_PTR(ret);
1405 goto out;
1406 }
1407
1408 ret = set_request_path_attr(NULL, req->r_old_dentry,
1409 req->r_path2, req->r_ino2.ino,
1410 &path2, &pathlen2, &ino2, &freepath2);
1411 if (ret < 0) {
1412 msg = ERR_PTR(ret);
1413 goto out_free1;
1414 }
1415
1416 len = sizeof(*head) +
1417 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
1418
1419 /* calculate (max) length for cap releases */
1420 len += sizeof(struct ceph_mds_request_release) *
1421 (!!req->r_inode_drop + !!req->r_dentry_drop +
1422 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1423 if (req->r_dentry_drop)
1424 len += req->r_dentry->d_name.len;
1425 if (req->r_old_dentry_drop)
1426 len += req->r_old_dentry->d_name.len;
1427
1428 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
1429 if (IS_ERR(msg))
1430 goto out_free2;
1431
1432 msg->hdr.tid = cpu_to_le64(req->r_tid);
1433
1434 head = msg->front.iov_base;
1435 p = msg->front.iov_base + sizeof(*head);
1436 end = msg->front.iov_base + msg->front.iov_len;
1437
1438 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1439 head->op = cpu_to_le32(req->r_op);
1440 head->caller_uid = cpu_to_le32(current_fsuid());
1441 head->caller_gid = cpu_to_le32(current_fsgid());
1442 head->args = req->r_args;
1443
1444 ceph_encode_filepath(&p, end, ino1, path1);
1445 ceph_encode_filepath(&p, end, ino2, path2);
1446
1447 /* cap releases */
1448 releases = 0;
1449 if (req->r_inode_drop)
1450 releases += ceph_encode_inode_release(&p,
1451 req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1452 mds, req->r_inode_drop, req->r_inode_unless, 0);
1453 if (req->r_dentry_drop)
1454 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1455 mds, req->r_dentry_drop, req->r_dentry_unless);
1456 if (req->r_old_dentry_drop)
1457 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1458 mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1459 if (req->r_old_inode_drop)
1460 releases += ceph_encode_inode_release(&p,
1461 req->r_old_dentry->d_inode,
1462 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1463 head->num_releases = cpu_to_le16(releases);
1464
1465 BUG_ON(p > end);
1466 msg->front.iov_len = p - msg->front.iov_base;
1467 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1468
1469 msg->pages = req->r_pages;
1470 msg->nr_pages = req->r_num_pages;
1471 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1472 msg->hdr.data_off = cpu_to_le16(0);
1473
1474out_free2:
1475 if (freepath2)
1476 kfree((char *)path2);
1477out_free1:
1478 if (freepath1)
1479 kfree((char *)path1);
1480out:
1481 return msg;
1482}
1483
1484/*
1485 * called under mdsc->mutex if error, under no mutex if
1486 * success.
1487 */
1488static void complete_request(struct ceph_mds_client *mdsc,
1489 struct ceph_mds_request *req)
1490{
1491 if (req->r_callback)
1492 req->r_callback(mdsc, req);
1493 else
1494 complete(&req->r_completion);
1495}
1496
1497/*
1498 * called under mdsc->mutex
1499 */
1500static int __prepare_send_request(struct ceph_mds_client *mdsc,
1501 struct ceph_mds_request *req,
1502 int mds)
1503{
1504 struct ceph_mds_request_head *rhead;
1505 struct ceph_msg *msg;
1506 int flags = 0;
1507
1508 req->r_mds = mds;
1509 req->r_attempts++;
1510 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1511 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1512
1513 if (req->r_request) {
1514 ceph_msg_put(req->r_request);
1515 req->r_request = NULL;
1516 }
1517 msg = create_request_message(mdsc, req, mds);
1518 if (IS_ERR(msg)) {
1519 req->r_reply = ERR_PTR(PTR_ERR(msg));
1520 complete_request(mdsc, req);
1521 return -PTR_ERR(msg);
1522 }
1523 req->r_request = msg;
1524
1525 rhead = msg->front.iov_base;
1526 rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1527 if (req->r_got_unsafe)
1528 flags |= CEPH_MDS_FLAG_REPLAY;
1529 if (req->r_locked_dir)
1530 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1531 rhead->flags = cpu_to_le32(flags);
1532 rhead->num_fwd = req->r_num_fwd;
1533 rhead->num_retry = req->r_attempts - 1;
1534
1535 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1536
1537 if (req->r_target_inode && req->r_got_unsafe)
1538 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1539 else
1540 rhead->ino = 0;
1541 return 0;
1542}
1543
1544/*
1545 * send request, or put it on the appropriate wait list.
1546 */
1547static int __do_request(struct ceph_mds_client *mdsc,
1548 struct ceph_mds_request *req)
1549{
1550 struct ceph_mds_session *session = NULL;
1551 int mds = -1;
1552 int err = -EAGAIN;
1553
1554 if (req->r_reply)
1555 goto out;
1556
1557 if (req->r_timeout &&
1558 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1559 dout("do_request timed out\n");
1560 err = -EIO;
1561 goto finish;
1562 }
1563
1564 mds = __choose_mds(mdsc, req);
1565 if (mds < 0 ||
1566 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1567 dout("do_request no mds or not active, waiting for map\n");
1568 list_add(&req->r_wait, &mdsc->waiting_for_map);
1569 goto out;
1570 }
1571
1572 /* get, open session */
1573 session = __ceph_lookup_mds_session(mdsc, mds);
1574 if (!session) {
1575 session = register_session(mdsc, mds);
1576 if (IS_ERR(session)) {
1577 err = PTR_ERR(session);
1578 goto finish;
1579 }
1580 }
1581 dout("do_request mds%d session %p state %s\n", mds, session,
1582 session_state_name(session->s_state));
1583 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1584 session->s_state != CEPH_MDS_SESSION_HUNG) {
1585 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1586 session->s_state == CEPH_MDS_SESSION_CLOSING)
1587 __open_session(mdsc, session);
1588 list_add(&req->r_wait, &session->s_waiting);
1589 goto out_session;
1590 }
1591
1592 /* send request */
1593 req->r_session = get_session(session);
1594 req->r_resend_mds = -1; /* forget any previous mds hint */
1595
1596 if (req->r_request_started == 0) /* note request start time */
1597 req->r_request_started = jiffies;
1598
1599 err = __prepare_send_request(mdsc, req, mds);
1600 if (!err) {
1601 ceph_msg_get(req->r_request);
1602 ceph_con_send(&session->s_con, req->r_request);
1603 }
1604
1605out_session:
1606 ceph_put_mds_session(session);
1607out:
1608 return err;
1609
1610finish:
1611 req->r_reply = ERR_PTR(err);
1612 complete_request(mdsc, req);
1613 goto out;
1614}
1615
1616/*
1617 * called under mdsc->mutex
1618 */
1619static void __wake_requests(struct ceph_mds_client *mdsc,
1620 struct list_head *head)
1621{
1622 struct ceph_mds_request *req, *nreq;
1623
1624 list_for_each_entry_safe(req, nreq, head, r_wait) {
1625 list_del_init(&req->r_wait);
1626 __do_request(mdsc, req);
1627 }
1628}
1629
1630/*
1631 * Wake up threads with requests pending for @mds, so that they can
1632 * resubmit their requests to a possibly different mds. If @all is set,
1633 * wake them up if their requests have been forwarded to @mds, too.
1634 */
1635static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
1636{
1637 struct ceph_mds_request *req;
1638 struct rb_node *p;
1639
1640 dout("kick_requests mds%d\n", mds);
1641 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
1642 req = rb_entry(p, struct ceph_mds_request, r_node);
1643 if (req->r_got_unsafe)
1644 continue;
1645 if (req->r_session &&
1646 req->r_session->s_mds == mds) {
1647 dout(" kicking tid %llu\n", req->r_tid);
1648 put_request_session(req);
1649 __do_request(mdsc, req);
1650 }
1651 }
1652}
1653
1654void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
1655 struct ceph_mds_request *req)
1656{
1657 dout("submit_request on %p\n", req);
1658 mutex_lock(&mdsc->mutex);
1659 __register_request(mdsc, req, NULL);
1660 __do_request(mdsc, req);
1661 mutex_unlock(&mdsc->mutex);
1662}
1663
1664/*
1665 * Synchronously perform an mds request, taking care of all of the
1666 * session setup, forwarding, and retry details.
1667 */
1668int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1669 struct inode *dir,
1670 struct ceph_mds_request *req)
1671{
1672 int err;
1673
1674 dout("do_request on %p\n", req);
1675
1676 /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
1677 if (req->r_inode)
1678 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
1679 if (req->r_locked_dir)
1680 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1681 if (req->r_old_dentry)
1682 ceph_get_cap_refs(
1683 ceph_inode(req->r_old_dentry->d_parent->d_inode),
1684 CEPH_CAP_PIN);
1685
1686 /* issue */
1687 mutex_lock(&mdsc->mutex);
1688 __register_request(mdsc, req, dir);
1689 __do_request(mdsc, req);
1690
1691 /* wait */
1692 if (!req->r_reply) {
1693 mutex_unlock(&mdsc->mutex);
1694 if (req->r_timeout) {
1695 err = (long)wait_for_completion_interruptible_timeout(
1696 &req->r_completion, req->r_timeout);
1697 if (err == 0)
1698 req->r_reply = ERR_PTR(-EIO);
1699 else if (err < 0)
1700 req->r_reply = ERR_PTR(err);
1701 } else {
1702 err = wait_for_completion_interruptible(
1703 &req->r_completion);
1704 if (err)
1705 req->r_reply = ERR_PTR(err);
1706 }
1707 mutex_lock(&mdsc->mutex);
1708 }
1709
1710 if (IS_ERR(req->r_reply)) {
1711 err = PTR_ERR(req->r_reply);
1712 req->r_reply = NULL;
1713
1714 if (err == -ERESTARTSYS) {
1715 /* aborted */
1716 req->r_aborted = true;
1717
1718 if (req->r_locked_dir &&
1719 (req->r_op & CEPH_MDS_OP_WRITE)) {
1720 struct ceph_inode_info *ci =
1721 ceph_inode(req->r_locked_dir);
1722
1723 dout("aborted, clearing I_COMPLETE on %p\n",
1724 req->r_locked_dir);
1725 spin_lock(&req->r_locked_dir->i_lock);
1726 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1727 ci->i_release_count++;
1728 spin_unlock(&req->r_locked_dir->i_lock);
1729 }
1730 } else {
1731 /* clean up this request */
1732 __unregister_request(mdsc, req);
1733 if (!list_empty(&req->r_unsafe_item))
1734 list_del_init(&req->r_unsafe_item);
1735 complete(&req->r_safe_completion);
1736 }
1737 } else if (req->r_err) {
1738 err = req->r_err;
1739 } else {
1740 err = le32_to_cpu(req->r_reply_info.head->result);
1741 }
1742 mutex_unlock(&mdsc->mutex);
1743
1744 dout("do_request %p done, result %d\n", req, err);
1745 return err;
1746}
1747
1748/*
1749 * Handle mds reply.
1750 *
1751 * We take the session mutex and parse and process the reply immediately.
1752 * This preserves the logical ordering of replies, capabilities, etc., sent
1753 * by the MDS as they are applied to our local cache.
1754 */
1755static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1756{
1757 struct ceph_mds_client *mdsc = session->s_mdsc;
1758 struct ceph_mds_request *req;
1759 struct ceph_mds_reply_head *head = msg->front.iov_base;
1760 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
1761 u64 tid;
1762 int err, result;
1763 int mds = session->s_mds;
1764
1765 if (msg->front.iov_len < sizeof(*head)) {
1766 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
1767 ceph_msg_dump(msg);
1768 return;
1769 }
1770
1771 /* get request, session */
1772 tid = le64_to_cpu(msg->hdr.tid);
1773 mutex_lock(&mdsc->mutex);
1774 req = __lookup_request(mdsc, tid);
1775 if (!req) {
1776 dout("handle_reply on unknown tid %llu\n", tid);
1777 mutex_unlock(&mdsc->mutex);
1778 return;
1779 }
1780 dout("handle_reply %p\n", req);
1781
1782 /* correct session? */
1783 if (req->r_session != session) {
1784 pr_err("mdsc_handle_reply got %llu on session mds%d"
1785 " not mds%d\n", tid, session->s_mds,
1786 req->r_session ? req->r_session->s_mds : -1);
1787 mutex_unlock(&mdsc->mutex);
1788 goto out;
1789 }
1790
1791 /* dup? */
1792 if ((req->r_got_unsafe && !head->safe) ||
1793 (req->r_got_safe && head->safe)) {
1794 pr_warning("got a dup %s reply on %llu from mds%d\n",
1795 head->safe ? "safe" : "unsafe", tid, mds);
1796 mutex_unlock(&mdsc->mutex);
1797 goto out;
1798 }
1799
1800 result = le32_to_cpu(head->result);
1801
1802 /*
1803 * Tolerate 2 consecutive ESTALEs from the same mds.
1804 * FIXME: we should be looking at the cap migrate_seq.
1805 */
1806 if (result == -ESTALE) {
1807 req->r_direct_mode = USE_AUTH_MDS;
1808 req->r_num_stale++;
1809 if (req->r_num_stale <= 2) {
1810 __do_request(mdsc, req);
1811 mutex_unlock(&mdsc->mutex);
1812 goto out;
1813 }
1814 } else {
1815 req->r_num_stale = 0;
1816 }
1817
1818 if (head->safe) {
1819 req->r_got_safe = true;
1820 __unregister_request(mdsc, req);
1821 complete(&req->r_safe_completion);
1822
1823 if (req->r_got_unsafe) {
1824 /*
1825 * We already handled the unsafe response, now do the
1826 * cleanup. No need to examine the response; the MDS
1827 * doesn't include any result info in the safe
1828 * response. And even if it did, there is nothing
1829 * useful we could do with a revised return value.
1830 */
1831 dout("got safe reply %llu, mds%d\n", tid, mds);
1832 list_del_init(&req->r_unsafe_item);
1833
1834 /* last unsafe request during umount? */
1835 if (mdsc->stopping && !__get_oldest_req(mdsc))
1836 complete(&mdsc->safe_umount_waiters);
1837 mutex_unlock(&mdsc->mutex);
1838 goto out;
1839 }
1840 }
1841
1842 BUG_ON(req->r_reply);
1843
1844 if (!head->safe) {
1845 req->r_got_unsafe = true;
1846 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1847 }
1848
1849 dout("handle_reply tid %lld result %d\n", tid, result);
1850 rinfo = &req->r_reply_info;
1851 err = parse_reply_info(msg, rinfo);
1852 mutex_unlock(&mdsc->mutex);
1853
1854 mutex_lock(&session->s_mutex);
1855 if (err < 0) {
1856 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
1857 ceph_msg_dump(msg);
1858 goto out_err;
1859 }
1860
1861 /* snap trace */
1862 if (rinfo->snapblob_len) {
1863 down_write(&mdsc->snap_rwsem);
1864 ceph_update_snap_trace(mdsc, rinfo->snapblob,
1865 rinfo->snapblob + rinfo->snapblob_len,
1866 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
1867 downgrade_write(&mdsc->snap_rwsem);
1868 } else {
1869 down_read(&mdsc->snap_rwsem);
1870 }
1871
1872 /* insert trace into our cache */
1873 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1874 if (err == 0) {
1875 if (result == 0 && rinfo->dir_nr)
1876 ceph_readdir_prepopulate(req, req->r_session);
1877 ceph_unreserve_caps(&req->r_caps_reservation);
1878 }
1879
1880 up_read(&mdsc->snap_rwsem);
1881out_err:
1882 if (err) {
1883 req->r_err = err;
1884 } else {
1885 req->r_reply = msg;
1886 ceph_msg_get(msg);
1887 }
1888
1889 add_cap_releases(mdsc, req->r_session, -1);
1890 mutex_unlock(&session->s_mutex);
1891
1892 /* kick calling process */
1893 complete_request(mdsc, req);
1894out:
1895 ceph_mdsc_put_request(req);
1896 return;
1897}
1898
1899
1900
1901/*
1902 * handle mds notification that our request has been forwarded.
1903 */
1904static void handle_forward(struct ceph_mds_client *mdsc,
1905 struct ceph_mds_session *session,
1906 struct ceph_msg *msg)
1907{
1908 struct ceph_mds_request *req;
1909 u64 tid = le64_to_cpu(msg->hdr.tid);
1910 u32 next_mds;
1911 u32 fwd_seq;
1912 int err = -EINVAL;
1913 void *p = msg->front.iov_base;
1914 void *end = p + msg->front.iov_len;
1915
1916 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1917 next_mds = ceph_decode_32(&p);
1918 fwd_seq = ceph_decode_32(&p);
1919
1920 mutex_lock(&mdsc->mutex);
1921 req = __lookup_request(mdsc, tid);
1922 if (!req) {
1923 dout("forward %llu to mds%d - req dne\n", tid, next_mds);
1924 goto out; /* dup reply? */
1925 }
1926
1927 if (fwd_seq <= req->r_num_fwd) {
1928 dout("forward %llu to mds%d - old seq %d <= %d\n",
1929 tid, next_mds, req->r_num_fwd, fwd_seq);
1930 } else {
1931 /* resend. forward race not possible; mds would drop */
1932 dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
1933 req->r_num_fwd = fwd_seq;
1934 req->r_resend_mds = next_mds;
1935 put_request_session(req);
1936 __do_request(mdsc, req);
1937 }
1938 ceph_mdsc_put_request(req);
1939out:
1940 mutex_unlock(&mdsc->mutex);
1941 return;
1942
1943bad:
1944 pr_err("mdsc_handle_forward decode error err=%d\n", err);
1945}
1946
1947/*
1948 * handle a mds session control message
1949 */
1950static void handle_session(struct ceph_mds_session *session,
1951 struct ceph_msg *msg)
1952{
1953 struct ceph_mds_client *mdsc = session->s_mdsc;
1954 u32 op;
1955 u64 seq;
1956 int mds = session->s_mds;
1957 struct ceph_mds_session_head *h = msg->front.iov_base;
1958 int wake = 0;
1959
1960 /* decode */
1961 if (msg->front.iov_len != sizeof(*h))
1962 goto bad;
1963 op = le32_to_cpu(h->op);
1964 seq = le64_to_cpu(h->seq);
1965
1966 mutex_lock(&mdsc->mutex);
1967 if (op == CEPH_SESSION_CLOSE)
1968 __unregister_session(mdsc, session);
1969 /* FIXME: this ttl calculation is generous */
1970 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
1971 mutex_unlock(&mdsc->mutex);
1972
1973 mutex_lock(&session->s_mutex);
1974
1975 dout("handle_session mds%d %s %p state %s seq %llu\n",
1976 mds, ceph_session_op_name(op), session,
1977 session_state_name(session->s_state), seq);
1978
1979 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
1980 session->s_state = CEPH_MDS_SESSION_OPEN;
1981 pr_info("mds%d came back\n", session->s_mds);
1982 }
1983
1984 switch (op) {
1985 case CEPH_SESSION_OPEN:
1986 session->s_state = CEPH_MDS_SESSION_OPEN;
1987 renewed_caps(mdsc, session, 0);
1988 wake = 1;
1989 if (mdsc->stopping)
1990 __close_session(mdsc, session);
1991 break;
1992
1993 case CEPH_SESSION_RENEWCAPS:
1994 if (session->s_renew_seq == seq)
1995 renewed_caps(mdsc, session, 1);
1996 break;
1997
1998 case CEPH_SESSION_CLOSE:
1999 remove_session_caps(session);
2000 wake = 1; /* for good measure */
2001 complete(&mdsc->session_close_waiters);
2002 kick_requests(mdsc, mds, 0); /* cur only */
2003 break;
2004
2005 case CEPH_SESSION_STALE:
2006 pr_info("mds%d caps went stale, renewing\n",
2007 session->s_mds);
2008 spin_lock(&session->s_cap_lock);
2009 session->s_cap_gen++;
2010 session->s_cap_ttl = 0;
2011 spin_unlock(&session->s_cap_lock);
2012 send_renew_caps(mdsc, session);
2013 break;
2014
2015 case CEPH_SESSION_RECALL_STATE:
2016 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2017 break;
2018
2019 default:
2020 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2021 WARN_ON(1);
2022 }
2023
2024 mutex_unlock(&session->s_mutex);
2025 if (wake) {
2026 mutex_lock(&mdsc->mutex);
2027 __wake_requests(mdsc, &session->s_waiting);
2028 mutex_unlock(&mdsc->mutex);
2029 }
2030 return;
2031
2032bad:
2033 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2034 (int)msg->front.iov_len);
2035 ceph_msg_dump(msg);
2036 return;
2037}
2038
2039
2040/*
2041 * called under session->s_mutex.
2042 */
2043static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2044 struct ceph_mds_session *session)
2045{
2046 struct ceph_mds_request *req, *nreq;
2047 int err;
2048
2049 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2050
2051 mutex_lock(&mdsc->mutex);
2052 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2053 err = __prepare_send_request(mdsc, req, session->s_mds);
2054 if (!err) {
2055 ceph_msg_get(req->r_request);
2056 ceph_con_send(&session->s_con, req->r_request);
2057 }
2058 }
2059 mutex_unlock(&mdsc->mutex);
2060}
2061
2062/*
2063 * Encode information about a cap for a reconnect with the MDS.
2064 */
2065static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2066 void *arg)
2067{
2068 struct ceph_mds_cap_reconnect rec;
2069 struct ceph_inode_info *ci;
2070 struct ceph_pagelist *pagelist = arg;
2071 char *path;
2072 int pathlen, err;
2073 u64 pathbase;
2074 struct dentry *dentry;
2075
2076 ci = cap->ci;
2077
2078 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2079 inode, ceph_vinop(inode), cap, cap->cap_id,
2080 ceph_cap_string(cap->issued));
2081 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2082 if (err)
2083 return err;
2084
2085 dentry = d_find_alias(inode);
2086 if (dentry) {
2087 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2088 if (IS_ERR(path)) {
2089 err = PTR_ERR(path);
2090 BUG_ON(err);
2091 }
2092 } else {
2093 path = NULL;
2094 pathlen = 0;
2095 }
2096 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2097 if (err)
2098 goto out;
2099
2100 spin_lock(&inode->i_lock);
2101 cap->seq = 0; /* reset cap seq */
2102 cap->issue_seq = 0; /* and issue_seq */
2103 rec.cap_id = cpu_to_le64(cap->cap_id);
2104 rec.pathbase = cpu_to_le64(pathbase);
2105 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2106 rec.issued = cpu_to_le32(cap->issued);
2107 rec.size = cpu_to_le64(inode->i_size);
2108 ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
2109 ceph_encode_timespec(&rec.atime, &inode->i_atime);
2110 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2111 spin_unlock(&inode->i_lock);
2112
2113 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
2114
2115out:
2116 kfree(path);
2117 dput(dentry);
2118 return err;
2119}
2120
2121
2122/*
2123 * If an MDS fails and recovers, clients need to reconnect in order to
2124 * reestablish shared state. This includes all caps issued through
2125 * this session _and_ the snap_realm hierarchy. Because it's not
2126 * clear which snap realms the mds cares about, we send everything we
2127 * know about; that ensures we'll then get any new info the
2128 * recovering MDS might have.
2129 *
2130 * This is a relatively heavyweight operation, but it's rare.
2131 *
2132 * called with mdsc->mutex held.
2133 */
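/*
 * Payload layout, as built below (sketch): a 32-bit count of caps,
 * then for each cap the inode number, a path string, and a struct
 * ceph_mds_cap_reconnect record (encode_caps_cb), followed by one
 * struct ceph_mds_snaprealm_reconnect record per snap realm we know
 * about.
 */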
2134static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2135{
2136 struct ceph_mds_session *session = NULL;
2137 struct ceph_msg *reply;
2138 struct rb_node *p;
2139 int err;
2140 struct ceph_pagelist *pagelist;
2141
2142 pr_info("reconnect to recovering mds%d\n", mds);
2143
2144 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2145 if (!pagelist)
2146 goto fail_nopagelist;
2147 ceph_pagelist_init(pagelist);
2148
2149 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
2150 if (IS_ERR(reply)) {
2151 err = PTR_ERR(reply);
2152 goto fail_nomsg;
2153 }
2154
2155 /* find session */
2156 session = __ceph_lookup_mds_session(mdsc, mds);
2157 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2158
2159 if (session) {
2160 mutex_lock(&session->s_mutex);
2161
2162 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2163 session->s_seq = 0;
2164
2165 ceph_con_open(&session->s_con,
2166 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2167
2168 /* replay unsafe requests */
2169 replay_unsafe_requests(mdsc, session);
2170 } else {
2171 dout("no session for mds%d, will send short reconnect\n",
2172 mds);
2173 }
2174
2175 down_read(&mdsc->snap_rwsem);
2176
2177 if (!session)
2178 goto send;
2179 dout("session %p state %s\n", session,
2180 session_state_name(session->s_state));
2181
2182 /* traverse this session's caps */
2183 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2184 if (err)
2185 goto fail;
2186 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2187 if (err < 0)
2188 goto out;
2189
2190 /*
2191 * snaprealms. we provide mds with the ino, seq (version), and
2192 * parent for all of our realms. If the mds has any newer info,
2193 * it will tell us.
2194 */
2195 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2196 struct ceph_snap_realm *realm =
2197 rb_entry(p, struct ceph_snap_realm, node);
2198 struct ceph_mds_snaprealm_reconnect sr_rec;
2199
2200 dout(" adding snap realm %llx seq %lld parent %llx\n",
2201 realm->ino, realm->seq, realm->parent_ino);
2202 sr_rec.ino = cpu_to_le64(realm->ino);
2203 sr_rec.seq = cpu_to_le64(realm->seq);
2204 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2205 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2206 if (err)
2207 goto fail;
2208 }
2209
2210send:
2211 reply->pagelist = pagelist;
2212 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2213 reply->nr_pages = calc_pages_for(0, pagelist->length);
2214 ceph_con_send(&session->s_con, reply);
2215
2216 if (session) {
2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2218 __wake_requests(mdsc, &session->s_waiting);
2219 }
2220
2221out:
2222 up_read(&mdsc->snap_rwsem);
2223 if (session) {
2224 mutex_unlock(&session->s_mutex);
2225 ceph_put_mds_session(session);
2226 }
2227 mutex_lock(&mdsc->mutex);
2228 return;
2229
2230fail:
2231 ceph_msg_put(reply);
2232fail_nomsg:
2233 ceph_pagelist_release(pagelist);
2234 kfree(pagelist);
2235fail_nopagelist:
2236 pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
2237 goto out;
2238}
2239
2240
2241/*
2242 * compare old and new mdsmaps, kicking requests
2243 * and closing out old connections as necessary
2244 *
2245 * called under mdsc->mutex.
2246 */
2247static void check_new_map(struct ceph_mds_client *mdsc,
2248 struct ceph_mdsmap *newmap,
2249 struct ceph_mdsmap *oldmap)
2250{
2251 int i;
2252 int oldstate, newstate;
2253 struct ceph_mds_session *s;
2254
2255 dout("check_new_map new %u old %u\n",
2256 newmap->m_epoch, oldmap->m_epoch);
2257
2258 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2259 if (mdsc->sessions[i] == NULL)
2260 continue;
2261 s = mdsc->sessions[i];
2262 oldstate = ceph_mdsmap_get_state(oldmap, i);
2263 newstate = ceph_mdsmap_get_state(newmap, i);
2264
2265 dout("check_new_map mds%d state %s -> %s (session %s)\n",
2266 i, ceph_mds_state_name(oldstate),
2267 ceph_mds_state_name(newstate),
2268 session_state_name(s->s_state));
2269
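	/* did this mds come back at a different address? if so, the old connection is useless */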
2270 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2271 ceph_mdsmap_get_addr(newmap, i),
2272 sizeof(struct ceph_entity_addr))) {
2273 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2274 /* the session never opened, just close it
2275 * out now */
2276 __wake_requests(mdsc, &s->s_waiting);
2277 __unregister_session(mdsc, s);
2278 } else {
2279 /* just close it */
2280 mutex_unlock(&mdsc->mutex);
2281 mutex_lock(&s->s_mutex);
2282 mutex_lock(&mdsc->mutex);
2283 ceph_con_close(&s->s_con);
2284 mutex_unlock(&s->s_mutex);
2285 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2286 }
2287
2288 /* kick any requests waiting on the recovering mds */
2289 kick_requests(mdsc, i, 1);
2290 } else if (oldstate == newstate) {
2291 continue; /* nothing new with this mds */
2292 }
2293
2294 /*
2295 * send reconnect?
2296 */
2297 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2298 newstate >= CEPH_MDS_STATE_RECONNECT)
2299 send_mds_reconnect(mdsc, i);
2300
2301 /*
2302 * kick requests on any mds that has gone active.
2303 *
2304 * kick requests on cur or forwarder: we may have sent
2305 * the request to mds1, mds1 told us it forwarded it
2306 * to mds2, but then we learn mds1 failed and can't be
2307 * sure it successfully forwarded our request before
2308 * it died.
2309 */
2310 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2311 newstate >= CEPH_MDS_STATE_ACTIVE) {
2312 pr_info("mds%d reconnect completed\n", s->s_mds);
2313 kick_requests(mdsc, i, 1);
2314 ceph_kick_flushing_caps(mdsc, s);
2315 wake_up_session_caps(s, 1);
2316 }
2317 }
2318}
2319
2320
2321
2322/*
2323 * leases
2324 */
2325
2326/*
2327 * caller must hold session s_mutex, dentry->d_lock
2328 */
2329void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2330{
2331 struct ceph_dentry_info *di = ceph_dentry(dentry);
2332
2333 ceph_put_mds_session(di->lease_session);
2334 di->lease_session = NULL;
2335}
2336
2337static void handle_lease(struct ceph_mds_client *mdsc,
2338 struct ceph_mds_session *session,
2339 struct ceph_msg *msg)
2340{
2341 struct super_block *sb = mdsc->client->sb;
2342 struct inode *inode;
2343 struct ceph_inode_info *ci;
2344 struct dentry *parent, *dentry;
2345 struct ceph_dentry_info *di;
2346 int mds = session->s_mds;
2347 struct ceph_mds_lease *h = msg->front.iov_base;
2348 struct ceph_vino vino;
2349 int mask;
2350 struct qstr dname;
2351 int release = 0;
2352
2353 dout("handle_lease from mds%d\n", mds);
2354
2355 /* decode */
2356 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2357 goto bad;
2358 vino.ino = le64_to_cpu(h->ino);
2359 vino.snap = CEPH_NOSNAP;
2360 mask = le16_to_cpu(h->mask);
2361 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2362 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2363 if (dname.len != get_unaligned_le32(h+1))
2364 goto bad;
2365
2366 mutex_lock(&session->s_mutex);
2367 session->s_seq++;
2368
2369 /* lookup inode */
2370 inode = ceph_find_inode(sb, vino);
2371 dout("handle_lease '%s', mask %d, ino %llx %p\n",
2372 ceph_lease_op_name(h->action), mask, vino.ino, inode);
2373 if (inode == NULL) {
2374 dout("handle_lease no inode %llx\n", vino.ino);
2375 goto release;
2376 }
2377 ci = ceph_inode(inode);
2378
2379 /* dentry */
2380 parent = d_find_alias(inode);
2381 if (!parent) {
2382 dout("no parent dentry on inode %p\n", inode);
2383 WARN_ON(1);
2384 goto release; /* hrm... */
2385 }
2386 dname.hash = full_name_hash(dname.name, dname.len);
2387 dentry = d_lookup(parent, &dname);
2388 dput(parent);
2389 if (!dentry)
2390 goto release;
2391
2392 spin_lock(&dentry->d_lock);
2393 di = ceph_dentry(dentry);
2394 switch (h->action) {
2395 case CEPH_MDS_LEASE_REVOKE:
2396 if (di && di->lease_session == session) {
2397 h->seq = cpu_to_le32(di->lease_seq);
2398 __ceph_mdsc_drop_dentry_lease(dentry);
2399 }
2400 release = 1;
2401 break;
2402
2403 case CEPH_MDS_LEASE_RENEW:
2404 if (di && di->lease_session == session &&
2405 di->lease_gen == session->s_cap_gen &&
2406 di->lease_renew_from &&
2407 di->lease_renew_after == 0) {
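	/* renewal granted: extend d_time by the granted duration and arm the next renewal attempt at the halfway point */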
2408 unsigned long duration =
2409 le32_to_cpu(h->duration_ms) * HZ / 1000;
2410
2411 di->lease_seq = le32_to_cpu(h->seq);
2412 dentry->d_time = di->lease_renew_from + duration;
2413 di->lease_renew_after = di->lease_renew_from +
2414 (duration >> 1);
2415 di->lease_renew_from = 0;
2416 }
2417 break;
2418 }
2419 spin_unlock(&dentry->d_lock);
2420 dput(dentry);
2421
2422 if (!release)
2423 goto out;
2424
2425release:
2426 /* let's just reuse the same message */
2427 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2428 ceph_msg_get(msg);
2429 ceph_con_send(&session->s_con, msg);
2430
2431out:
2432 iput(inode);
2433 mutex_unlock(&session->s_mutex);
2434 return;
2435
2436bad:
2437 pr_err("corrupt lease message\n");
2438 ceph_msg_dump(msg);
2439}
2440
2441void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2442 struct inode *inode,
2443 struct dentry *dentry, char action,
2444 u32 seq)
2445{
2446 struct ceph_msg *msg;
2447 struct ceph_mds_lease *lease;
2448 int len = sizeof(*lease) + sizeof(u32);
2449 int dnamelen = 0;
2450
2451 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2452 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2453 dnamelen = dentry->d_name.len;
2454 len += dnamelen;
2455
2456 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
2457 if (IS_ERR(msg))
2458 return;
2459 lease = msg->front.iov_base;
2460 lease->action = action;
2461 lease->mask = cpu_to_le16(CEPH_LOCK_DN);
2462 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2463 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2464 lease->seq = cpu_to_le32(seq);
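	/* the dentry name follows the lease struct: a 32-bit length, then the name bytes */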
2465 put_unaligned_le32(dnamelen, lease + 1);
2466 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2467
2468 /*
2469 * if this is a preemptive lease RELEASE, no need to
2470 * flush request stream, since the actual request will
2471 * soon follow.
2472 */
2473 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2474
2475 ceph_con_send(&session->s_con, msg);
2476}
2477
2478/*
2479 * Preemptively release a lease we expect to invalidate anyway.
2480 * Both @inode and @dentry are required.
2481 */
2482void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2483 struct dentry *dentry, int mask)
2484{
2485 struct ceph_dentry_info *di;
2486 struct ceph_mds_session *session;
2487 u32 seq;
2488
2489 BUG_ON(inode == NULL);
2490 BUG_ON(dentry == NULL);
2491 BUG_ON(mask != CEPH_LOCK_DN);
2492
2493 /* is dentry lease valid? */
2494 spin_lock(&dentry->d_lock);
2495 di = ceph_dentry(dentry);
2496 if (!di || !di->lease_session ||
2497 di->lease_session->s_mds < 0 ||
2498 di->lease_gen != di->lease_session->s_cap_gen ||
2499 !time_before(jiffies, dentry->d_time)) {
2500 dout("lease_release inode %p dentry %p -- "
2501 "no lease on %d\n",
2502 inode, dentry, mask);
2503 spin_unlock(&dentry->d_lock);
2504 return;
2505 }
2506
2507 /* we do have a lease on this dentry; note mds and seq */
2508 session = ceph_get_mds_session(di->lease_session);
2509 seq = di->lease_seq;
2510 __ceph_mdsc_drop_dentry_lease(dentry);
2511 spin_unlock(&dentry->d_lock);
2512
2513 dout("lease_release inode %p dentry %p mask %d to mds%d\n",
2514 inode, dentry, mask, session->s_mds);
2515 ceph_mdsc_lease_send_msg(session, inode, dentry,
2516 CEPH_MDS_LEASE_RELEASE, seq);
2517 ceph_put_mds_session(session);
2518}
2519
2520/*
2521 * drop all leases (and dentry refs) in preparation for umount
2522 */
2523static void drop_leases(struct ceph_mds_client *mdsc)
2524{
2525 int i;
2526
2527 dout("drop_leases\n");
2528 mutex_lock(&mdsc->mutex);
2529 for (i = 0; i < mdsc->max_sessions; i++) {
2530 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2531 if (!s)
2532 continue;
2533 mutex_unlock(&mdsc->mutex);
2534 mutex_lock(&s->s_mutex);
2535 mutex_unlock(&s->s_mutex);
2536 ceph_put_mds_session(s);
2537 mutex_lock(&mdsc->mutex);
2538 }
2539 mutex_unlock(&mdsc->mutex);
2540}
2541
2542
2543
2544/*
2545 * delayed work -- periodically trim expired leases, renew caps with mds
2546 */
2547static void schedule_delayed(struct ceph_mds_client *mdsc)
2548{
2549 int delay = 5;
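	/* delay is in seconds; round_jiffies_relative converts it to jiffies and aligns the wakeup with other timers */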
2550 unsigned hz = round_jiffies_relative(HZ * delay);
2551 schedule_delayed_work(&mdsc->delayed_work, hz);
2552}
2553
2554static void delayed_work(struct work_struct *work)
2555{
2556 int i;
2557 struct ceph_mds_client *mdsc =
2558 container_of(work, struct ceph_mds_client, delayed_work.work);
2559 int renew_interval;
2560 int renew_caps;
2561
2562 dout("mdsc delayed_work\n");
2563 ceph_check_delayed_caps(mdsc);
2564
2565 mutex_lock(&mdsc->mutex);
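	/* renew caps once a quarter of the session timeout has elapsed since the last renewal */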
2566 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2567 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2568 mdsc->last_renew_caps);
2569 if (renew_caps)
2570 mdsc->last_renew_caps = jiffies;
2571
2572 for (i = 0; i < mdsc->max_sessions; i++) {
2573 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2574 if (s == NULL)
2575 continue;
2576 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2577 dout("resending session close request for mds%d\n",
2578 s->s_mds);
2579 request_close_session(mdsc, s);
2580 ceph_put_mds_session(s);
2581 continue;
2582 }
2583 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2584 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2585 s->s_state = CEPH_MDS_SESSION_HUNG;
2586 pr_info("mds%d hung\n", s->s_mds);
2587 }
2588 }
2589 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2590 /* this mds is failed or recovering, just wait */
2591 ceph_put_mds_session(s);
2592 continue;
2593 }
2594 mutex_unlock(&mdsc->mutex);
2595
2596 mutex_lock(&s->s_mutex);
2597 if (renew_caps)
2598 send_renew_caps(mdsc, s);
2599 else
2600 ceph_con_keepalive(&s->s_con);
2601 add_cap_releases(mdsc, s, -1);
2602 send_cap_releases(mdsc, s);
2603 mutex_unlock(&s->s_mutex);
2604 ceph_put_mds_session(s);
2605
2606 mutex_lock(&mdsc->mutex);
2607 }
2608 mutex_unlock(&mdsc->mutex);
2609
2610 schedule_delayed(mdsc);
2611}
2612
2613
2614int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2615{
2616 mdsc->client = client;
2617 mutex_init(&mdsc->mutex);
2618 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2619 init_completion(&mdsc->safe_umount_waiters);
2620 init_completion(&mdsc->session_close_waiters);
2621 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2622 mdsc->sessions = NULL;
2623 mdsc->max_sessions = 0;
2624 mdsc->stopping = 0;
2625 init_rwsem(&mdsc->snap_rwsem);
2626 mdsc->snap_realms = RB_ROOT;
2627 INIT_LIST_HEAD(&mdsc->snap_empty);
2628 spin_lock_init(&mdsc->snap_empty_lock);
2629 mdsc->last_tid = 0;
2630 mdsc->request_tree = RB_ROOT;
2631 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
2632 mdsc->last_renew_caps = jiffies;
2633 INIT_LIST_HEAD(&mdsc->cap_delay_list);
2634 spin_lock_init(&mdsc->cap_delay_lock);
2635 INIT_LIST_HEAD(&mdsc->snap_flush_list);
2636 spin_lock_init(&mdsc->snap_flush_lock);
2637 mdsc->cap_flush_seq = 0;
2638 INIT_LIST_HEAD(&mdsc->cap_dirty);
2639 mdsc->num_cap_flushing = 0;
2640 spin_lock_init(&mdsc->cap_dirty_lock);
2641 init_waitqueue_head(&mdsc->cap_flushing_wq);
2642 spin_lock_init(&mdsc->dentry_lru_lock);
2643 INIT_LIST_HEAD(&mdsc->dentry_lru);
2644 return 0;
2645}
2646
2647/*
2648 * Wait for safe replies on open mds requests. If we time out, drop
2649 * all requests from the tree to avoid dangling dentry refs.
2650 */
2651static void wait_requests(struct ceph_mds_client *mdsc)
2652{
2653 struct ceph_mds_request *req;
2654 struct ceph_client *client = mdsc->client;
2655
2656 mutex_lock(&mdsc->mutex);
2657 if (__get_oldest_req(mdsc)) {
2658 mutex_unlock(&mdsc->mutex);
2659
2660 dout("wait_requests waiting for requests\n");
2661 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2662 client->mount_args->mount_timeout * HZ);
2663
2664 /* tear down remaining requests */
2665 mutex_lock(&mdsc->mutex);
2666 while ((req = __get_oldest_req(mdsc))) {
2667 dout("wait_requests timed out on tid %llu\n",
2668 req->r_tid);
2669 __unregister_request(mdsc, req);
2670 }
2671 }
2672 mutex_unlock(&mdsc->mutex);
2673 dout("wait_requests done\n");
2674}
2675
2676/*
2677 * called before mount is ro, and before dentries are torn down.
2678 * (hmm, does this still race with new lookups?)
2679 */
2680void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2681{
2682 dout("pre_umount\n");
2683 mdsc->stopping = 1;
2684
2685 drop_leases(mdsc);
2686 ceph_flush_dirty_caps(mdsc);
2687 wait_requests(mdsc);
2688}
2689
2690/*
2691 * wait for all write mds requests to flush.
2692 */
2693static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2694{
2695 struct ceph_mds_request *req = NULL, *nextreq;
2696 struct rb_node *n;
2697
2698 mutex_lock(&mdsc->mutex);
2699 dout("wait_unsafe_requests want %lld\n", want_tid);
2700restart:
2701 req = __get_oldest_req(mdsc);
2702 while (req && req->r_tid <= want_tid) {
2703 /* find next request */
2704 n = rb_next(&req->r_node);
2705 if (n)
2706 nextreq = rb_entry(n, struct ceph_mds_request, r_node);
2707 else
2708 nextreq = NULL;
2709 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2710 /* write op */
2711 ceph_mdsc_get_request(req);
2712 if (nextreq)
2713 ceph_mdsc_get_request(nextreq);
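	/* hold refs so neither request can be freed while we drop mdsc->mutex and block on the safe reply */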
2714 mutex_unlock(&mdsc->mutex);
2715 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2716 req->r_tid, want_tid);
2717 wait_for_completion(&req->r_safe_completion);
2718 mutex_lock(&mdsc->mutex);
2719 ceph_mdsc_put_request(req);
2720 if (!nextreq)
2721 break; /* there was no next request, so we're done */
2722 if (RB_EMPTY_NODE(&nextreq->r_node)) {
2723 /* next request was removed from tree */
2724 ceph_mdsc_put_request(nextreq);
2725 goto restart;
2726 }
2727 ceph_mdsc_put_request(nextreq); /* won't go away */
2728 }
2729 req = nextreq;
2730 }
2731 mutex_unlock(&mdsc->mutex);
2732 dout("wait_unsafe_requests done\n");
2733}
2734
2735void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2736{
2737 u64 want_tid, want_flush;
2738
2739 dout("sync\n");
2740 mutex_lock(&mdsc->mutex);
2741 want_tid = mdsc->last_tid;
2742 want_flush = mdsc->cap_flush_seq;
2743 mutex_unlock(&mdsc->mutex);
2744 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
2745
2746 ceph_flush_dirty_caps(mdsc);
2747
2748 wait_unsafe_requests(mdsc, want_tid);
2749 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2750}
2751
2752
2753/*
2754 * called after sb is ro.
2755 */
2756void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2757{
2758 struct ceph_mds_session *session;
2759 int i;
2760 int n;
2761 struct ceph_client *client = mdsc->client;
2762 unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
2763
2764 dout("close_sessions\n");
2765
2766 mutex_lock(&mdsc->mutex);
2767
2768 /* close sessions */
2769 started = jiffies;
2770 while (time_before(jiffies, started + timeout)) {
2771 dout("closing sessions\n");
2772 n = 0;
2773 for (i = 0; i < mdsc->max_sessions; i++) {
2774 session = __ceph_lookup_mds_session(mdsc, i);
2775 if (!session)
2776 continue;
2777 mutex_unlock(&mdsc->mutex);
2778 mutex_lock(&session->s_mutex);
2779 __close_session(mdsc, session);
2780 mutex_unlock(&session->s_mutex);
2781 ceph_put_mds_session(session);
2782 mutex_lock(&mdsc->mutex);
2783 n++;
2784 }
2785 if (n == 0)
2786 break;
2787
2788 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2789 break;
2790
2791 dout("waiting for sessions to close\n");
2792 mutex_unlock(&mdsc->mutex);
2793 wait_for_completion_timeout(&mdsc->session_close_waiters,
2794 timeout);
2795 mutex_lock(&mdsc->mutex);
2796 }
2797
2798 /* tear down remaining sessions */
2799 for (i = 0; i < mdsc->max_sessions; i++) {
2800 if (mdsc->sessions[i]) {
2801 session = get_session(mdsc->sessions[i]);
2802 __unregister_session(mdsc, session);
2803 mutex_unlock(&mdsc->mutex);
2804 mutex_lock(&session->s_mutex);
2805 remove_session_caps(session);
2806 mutex_unlock(&session->s_mutex);
2807 ceph_put_mds_session(session);
2808 mutex_lock(&mdsc->mutex);
2809 }
2810 }
2811
2812 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2813
2814 mutex_unlock(&mdsc->mutex);
2815
2816 ceph_cleanup_empty_realms(mdsc);
2817
2818 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2819
2820 dout("stopped\n");
2821}
2822
2823void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2824{
2825 dout("stop\n");
2826 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2827 if (mdsc->mdsmap)
2828 ceph_mdsmap_destroy(mdsc->mdsmap);
2829 kfree(mdsc->sessions);
2830}
2831
2832
2833/*
2834 * handle mds map update.
2835 */
2836void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2837{
2838 u32 epoch;
2839 u32 maplen;
2840 void *p = msg->front.iov_base;
2841 void *end = p + msg->front.iov_len;
2842 struct ceph_mdsmap *newmap, *oldmap;
2843 struct ceph_fsid fsid;
2844 int err = -EINVAL;
2845
2846 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
2847 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2848 if (ceph_check_fsid(mdsc->client, &fsid) < 0)
2849 return;
2850 epoch = ceph_decode_32(&p);
2851 maplen = ceph_decode_32(&p);
2852 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
2853
2854 /* do we need it? */
2855 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
2856 mutex_lock(&mdsc->mutex);
2857 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
2858 dout("handle_map epoch %u <= our %u\n",
2859 epoch, mdsc->mdsmap->m_epoch);
2860 mutex_unlock(&mdsc->mutex);
2861 return;
2862 }
2863
2864 newmap = ceph_mdsmap_decode(&p, end);
2865 if (IS_ERR(newmap)) {
2866 err = PTR_ERR(newmap);
2867 goto bad_unlock;
2868 }
2869
2870 /* swap into place */
2871 if (mdsc->mdsmap) {
2872 oldmap = mdsc->mdsmap;
2873 mdsc->mdsmap = newmap;
2874 check_new_map(mdsc, newmap, oldmap);
2875 ceph_mdsmap_destroy(oldmap);
2876 } else {
2877 mdsc->mdsmap = newmap; /* first mds map */
2878 }
2879 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
2880
2881 __wake_requests(mdsc, &mdsc->waiting_for_map);
2882
2883 mutex_unlock(&mdsc->mutex);
2884 schedule_delayed(mdsc);
2885 return;
2886
2887bad_unlock:
2888 mutex_unlock(&mdsc->mutex);
2889bad:
2890 pr_err("error decoding mdsmap %d\n", err);
2891 return;
2892}
2893
2894static struct ceph_connection *con_get(struct ceph_connection *con)
2895{
2896 struct ceph_mds_session *s = con->private;
2897
2898 if (get_session(s)) {
2899 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
2900 return con;
2901 }
2902 dout("mdsc con_get %p FAIL\n", s);
2903 return NULL;
2904}
2905
2906static void con_put(struct ceph_connection *con)
2907{
2908 struct ceph_mds_session *s = con->private;
2909
2910 ceph_put_mds_session(s);
2911 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
2912}
2913
2914/*
2915 * if the client is unresponsive for long enough, the mds will kill
2916 * the session entirely.
2917 */
2918static void peer_reset(struct ceph_connection *con)
2919{
2920 struct ceph_mds_session *s = con->private;
2921
2922 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
2923 s->s_mds);
2924}
2925
2926static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2927{
2928 struct ceph_mds_session *s = con->private;
2929 struct ceph_mds_client *mdsc = s->s_mdsc;
2930 int type = le16_to_cpu(msg->hdr.type);
2931
2932 mutex_lock(&mdsc->mutex);
2933 if (__verify_registered_session(mdsc, s) < 0) {
2934 mutex_unlock(&mdsc->mutex);
2935 goto out;
2936 }
2937 mutex_unlock(&mdsc->mutex);
2938
2939 switch (type) {
2940 case CEPH_MSG_MDS_MAP:
2941 ceph_mdsc_handle_map(mdsc, msg);
2942 break;
2943 case CEPH_MSG_CLIENT_SESSION:
2944 handle_session(s, msg);
2945 break;
2946 case CEPH_MSG_CLIENT_REPLY:
2947 handle_reply(s, msg);
2948 break;
2949 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2950 handle_forward(mdsc, s, msg);
2951 break;
2952 case CEPH_MSG_CLIENT_CAPS:
2953 ceph_handle_caps(s, msg);
2954 break;
2955 case CEPH_MSG_CLIENT_SNAP:
2956 ceph_handle_snap(mdsc, s, msg);
2957 break;
2958 case CEPH_MSG_CLIENT_LEASE:
2959 handle_lease(mdsc, s, msg);
2960 break;
2961
2962 default:
2963 pr_err("received unknown message type %d %s\n", type,
2964 ceph_msg_type_name(type));
2965 }
2966out:
2967 ceph_msg_put(msg);
2968}
2969
2970/*
2971 * authentication
2972 */
2973static int get_authorizer(struct ceph_connection *con,
2974 void **buf, int *len, int *proto,
2975 void **reply_buf, int *reply_len, int force_new)
2976{
2977 struct ceph_mds_session *s = con->private;
2978 struct ceph_mds_client *mdsc = s->s_mdsc;
2979 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2980 int ret = 0;
2981
2982 if (force_new && s->s_authorizer) {
2983 ac->ops->destroy_authorizer(ac, s->s_authorizer);
2984 s->s_authorizer = NULL;
2985 }
2986 if (s->s_authorizer == NULL) {
2987 if (ac->ops->create_authorizer) {
2988 ret = ac->ops->create_authorizer(
2989 ac, CEPH_ENTITY_TYPE_MDS,
2990 &s->s_authorizer,
2991 &s->s_authorizer_buf,
2992 &s->s_authorizer_buf_len,
2993 &s->s_authorizer_reply_buf,
2994 &s->s_authorizer_reply_buf_len);
2995 if (ret)
2996 return ret;
2997 }
2998 }
2999
3000 *proto = ac->protocol;
3001 *buf = s->s_authorizer_buf;
3002 *len = s->s_authorizer_buf_len;
3003 *reply_buf = s->s_authorizer_reply_buf;
3004 *reply_len = s->s_authorizer_reply_buf_len;
3005 return 0;
3006}
3007
3008
3009static int verify_authorizer_reply(struct ceph_connection *con, int len)
3010{
3011 struct ceph_mds_session *s = con->private;
3012 struct ceph_mds_client *mdsc = s->s_mdsc;
3013 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3014
3015 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3016}
3017
3018static int invalidate_authorizer(struct ceph_connection *con)
3019{
3020 struct ceph_mds_session *s = con->private;
3021 struct ceph_mds_client *mdsc = s->s_mdsc;
3022 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3023
3024 if (ac->ops->invalidate_authorizer)
3025 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3026
3027 return ceph_monc_validate_auth(&mdsc->client->monc);
3028}
3029
3030static const struct ceph_connection_operations mds_con_ops = {
3031 .get = con_get,
3032 .put = con_put,
3033 .dispatch = dispatch,
3034 .get_authorizer = get_authorizer,
3035 .verify_authorizer_reply = verify_authorizer_reply,
3036 .invalidate_authorizer = invalidate_authorizer,
3037 .peer_reset = peer_reset,
3038};
3039
3040
3041
3042
3043/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..961cc6f65878
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/rbtree.h>
9#include <linux/spinlock.h>
10
11#include "types.h"
12#include "messenger.h"
13#include "mdsmap.h"
14
15/*
16 * Some lock dependencies:
17 *
18 * session->s_mutex
19 * mdsc->mutex
20 *
21 * mdsc->snap_rwsem
22 *
23 * inode->i_lock
24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock
26 *
27 */
28
29struct ceph_client;
30struct ceph_cap;
31
32/*
33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload.
35 */
36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in;
38 u32 symlink_len;
39 char *symlink;
40 u32 xattr_len;
41 char *xattr_data;
42};
43
44/*
45 * parsed info about an mds reply, including information about the
46 * target inode and/or its parent directory and dentry, and directory
47 * contents (for readdir results).
48 */
49struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head;
51
52 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname;
55 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease;
57
58 struct ceph_mds_reply_dirfrag *dir_dir;
59 int dir_nr;
60 char **dir_dname;
61 u32 *dir_dname_len;
62 struct ceph_mds_reply_lease **dir_dlease;
63 struct ceph_mds_reply_info_in *dir_in;
64 u8 dir_complete, dir_end;
65
66 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */
68 void *snapblob;
69 int snapblob_len;
70};
71
72
73/*
74 * cap releases are batched and sent to the MDS en masse.
75 */
76#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
77 sizeof(struct ceph_mds_cap_release)) / \
78 sizeof(struct ceph_mds_cap_item))
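/* i.e. as many cap items as fit in one page after the release message header */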
79
80
81/*
82 * state associated with each MDS<->client session
83 */
84enum {
85 CEPH_MDS_SESSION_NEW = 1,
86 CEPH_MDS_SESSION_OPENING = 2,
87 CEPH_MDS_SESSION_OPEN = 3,
88 CEPH_MDS_SESSION_HUNG = 4,
89 CEPH_MDS_SESSION_CLOSING = 5,
90 CEPH_MDS_SESSION_RESTARTING = 6,
91 CEPH_MDS_SESSION_RECONNECTING = 7,
92};
93
94struct ceph_mds_session {
95 struct ceph_mds_client *s_mdsc;
96 int s_mds;
97 int s_state;
98 unsigned long s_ttl; /* time until mds kills us */
99 u64 s_seq; /* incoming msg seq # */
100 struct mutex s_mutex; /* serialize session messages */
101
102 struct ceph_connection s_con;
103
104 struct ceph_authorizer *s_authorizer;
105 void *s_authorizer_buf, *s_authorizer_reply_buf;
106 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
107
108 /* protected by s_cap_lock */
109 spinlock_t s_cap_lock;
110 u32 s_cap_gen; /* inc each time we get mds stale msg */
111 unsigned long s_cap_ttl; /* when session caps expire */
112 struct list_head s_caps; /* all caps issued by this session */
113 int s_nr_caps, s_trim_caps;
114 int s_num_cap_releases;
115 struct list_head s_cap_releases; /* waiting cap_release messages */
116 struct list_head s_cap_releases_done; /* ready to send */
117 struct ceph_cap *s_cap_iterator;
118
119 /* protected by mutex */
120 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
121 struct list_head s_cap_snaps_flushing;
122 unsigned long s_renew_requested; /* last time we sent a renew req */
123 u64 s_renew_seq;
124
125 atomic_t s_ref;
126 struct list_head s_waiting; /* waiting requests */
127 struct list_head s_unsafe; /* unsafe requests */
128};
129
130/*
131 * modes of choosing which MDS to send a request to
132 */
133enum {
134 USE_ANY_MDS,
135 USE_RANDOM_MDS,
136 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
137};
138
139struct ceph_mds_request;
140struct ceph_mds_client;
141
142/*
143 * request completion callback
144 */
145typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
146 struct ceph_mds_request *req);
147
148/*
149 * an in-flight mds request
150 */
151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */
153 struct rb_node r_node;
154
155 int r_op; /* mds op code */
156 int r_mds;
157
158 /* operation on what? */
159 struct inode *r_inode; /* arg1 */
160 struct dentry *r_dentry; /* arg1 */
161 struct dentry *r_old_dentry; /* arg2: rename from or link from */
162 char *r_path1, *r_path2;
163 struct ceph_vino r_ino1, r_ino2;
164
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */
167
168 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */
170
171 /* for choosing which mds to send this request to */
172 int r_direct_mode;
173 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
174 bool r_direct_is_hash; /* true if r_direct_hash is valid */
175
176 /* data payload is used for xattr ops */
177 struct page **r_pages;
178 int r_num_pages;
179 int r_data_len;
180
181 /* what caps shall we drop? */
182 int r_inode_drop, r_inode_unless;
183 int r_dentry_drop, r_dentry_unless;
184 int r_old_dentry_drop, r_old_dentry_unless;
185 struct inode *r_old_inode;
186 int r_old_inode_drop, r_old_inode_unless;
187
188 struct ceph_msg *r_request; /* original request */
189 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err;
192 bool r_aborted;
193
194 unsigned long r_timeout; /* optional. jiffies */
195 unsigned long r_started; /* start time to measure timeout against */
196 unsigned long r_request_started; /* start time for mds request only,
197 used to measure lease durations */
198
199 /* link unsafe requests to parent directory, for fsync */
200 struct inode *r_unsafe_dir;
201 struct list_head r_unsafe_dir_item;
202
203 struct ceph_mds_session *r_session;
204
205 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208 int r_resend_mds; /* mds to resend to next, if any*/
209
210 struct kref r_kref;
211 struct list_head r_wait;
212 struct completion r_completion;
213 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe;
217
218 bool r_did_prepopulate;
219 u32 r_readdir_offset;
220
221 struct ceph_cap_reservation r_caps_reservation;
222 int r_num_caps;
223};
224
225/*
226 * mds client state
227 */
228struct ceph_mds_client {
229 struct ceph_client *client;
230 struct mutex mutex; /* all nested structures */
231
232 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters;
234 struct list_head waiting_for_map;
235
236 struct ceph_mds_session **sessions; /* NULL for mds if no session */
237 int max_sessions; /* len of sessions array */
238 int stopping; /* true if shutting down */
239
240 /*
241 * snap_rwsem will cover cap linkage into snaprealms, and
242 * realm snap contexts. (later, we can do per-realm snap
243 * contexts locks..) the empty list contains realms with no
244 * references (implying they contain no inodes with caps) that
245 * should be destroyed.
246 */
247 struct rw_semaphore snap_rwsem;
248 struct rb_root snap_realms;
249 struct list_head snap_empty;
250 spinlock_t snap_empty_lock; /* protect snap_empty */
251
252 u64 last_tid; /* most recent mds request */
253 struct rb_root request_tree; /* pending mds requests */
254 struct delayed_work delayed_work; /* delayed work */
255 unsigned long last_renew_caps; /* last time we renewed our caps */
256 struct list_head cap_delay_list; /* caps with delayed release */
257 spinlock_t cap_delay_lock; /* protects cap_delay_list */
258 struct list_head snap_flush_list; /* cap_snaps ready to flush */
259 spinlock_t snap_flush_lock;
260
261 u64 cap_flush_seq;
262 struct list_head cap_dirty; /* inodes with dirty caps */
263 int num_cap_flushing; /* # caps we are flushing */
264 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq;
266
267#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file;
269#endif
270
271 spinlock_t dentry_lru_lock;
272 struct list_head dentry_lru;
273 int num_dentry;
274};
275
276extern const char *ceph_mds_op_name(int op);
277
278extern struct ceph_mds_session *
279__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
280
281static inline struct ceph_mds_session *
282ceph_get_mds_session(struct ceph_mds_session *s)
283{
284 atomic_inc(&s->s_ref);
285 return s;
286}
287
288extern void ceph_put_mds_session(struct ceph_mds_session *s);
289
290extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
291 struct ceph_msg *msg, int mds);
292
293extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
294 struct ceph_client *client);
295extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
296extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
297
298extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
299
300extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode,
302 struct dentry *dn, int mask);
303
304extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
307 struct ceph_mds_request *req);
308extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
309 struct inode *dir,
310 struct ceph_mds_request *req);
311static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
312{
313 kref_get(&req->r_kref);
314}
315extern void ceph_mdsc_release_request(struct kref *kref);
316static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
317{
318 kref_put(&req->r_kref, ceph_mdsc_release_request);
319}
320
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
324 int stop_on_nosnap);
325
326extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
327extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
328 struct inode *inode,
329 struct dentry *dentry, char action,
330 u32 seq);
331
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg);
334
335#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
17 * choose a random mds that is "up" (i.e. has a state > 0), or -1 if none are up.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
23 u8 r; /* unsigned, so the modulo below stays non-negative */
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
32 /* pick */
33 get_random_bytes(&r, 1);
34 n = r % n;
35 i = 0;
36 for (i = 0; n > 0; i++, n--)
37 while (m->m_info[i].state <= 0)
38 i++;
39
40 return i;
41}
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 const void *start = *p;
53 int i, j, n;
54 int err = -EINVAL;
55 u16 version;
56
57 m = kzalloc(sizeof(*m), GFP_NOFS);
58 if (m == NULL)
59 return ERR_PTR(-ENOMEM);
60
61 ceph_decode_16_safe(p, end, version, bad);
62
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p);
65 m->m_client_epoch = ceph_decode_32(p);
66 m->m_last_failure = ceph_decode_32(p);
67 m->m_root = ceph_decode_32(p);
68 m->m_session_timeout = ceph_decode_32(p);
69 m->m_session_autoclose = ceph_decode_32(p);
70 m->m_max_file_size = ceph_decode_64(p);
71 m->m_max_mds = ceph_decode_32(p);
72
73 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
74 if (m->m_info == NULL)
75 goto badmem;
76
77 /* pick out active nodes from mds_info (state > 0) */
78 n = ceph_decode_32(p);
79 for (i = 0; i < n; i++) {
80 u64 global_id;
81 u32 namelen;
82 s32 mds, inc, state;
83 u64 state_seq;
84 u8 infoversion;
85 struct ceph_entity_addr addr;
86 u32 num_export_targets;
87 void *pexport_targets = NULL;
88
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p);
91 infoversion = ceph_decode_8(p);
92 *p += sizeof(u64);
93 namelen = ceph_decode_32(p); /* skip mds name */
94 *p += namelen;
95
96 ceph_decode_need(p, end,
97 4*sizeof(u32) + sizeof(u64) +
98 sizeof(addr) + sizeof(struct ceph_timespec),
99 bad);
100 mds = ceph_decode_32(p);
101 inc = ceph_decode_32(p);
102 state = ceph_decode_32(p);
103 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec);
107 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen;
110 if (infoversion >= 2) {
111 ceph_decode_32_safe(p, end, num_export_targets, bad);
112 pexport_targets = *p;
113 *p += num_export_targets * sizeof(u32);
114 } else {
115 num_export_targets = 0;
116 }
117
118 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
119 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
120 ceph_mds_state_name(state));
121 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
122 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr;
125 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) {
127 m->m_info[mds].export_targets =
128 kcalloc(num_export_targets, sizeof(u32),
129 GFP_NOFS);
130 for (j = 0; j < num_export_targets; j++)
131 m->m_info[mds].export_targets[j] =
132 ceph_decode_32(&pexport_targets);
133 } else {
134 m->m_info[mds].export_targets = NULL;
135 }
136 }
137 }
138
139 /* pg_pools */
140 ceph_decode_32_safe(p, end, n, bad);
141 m->m_num_data_pg_pools = n;
142 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
143 if (!m->m_data_pg_pools)
144 goto badmem;
145 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
146 for (i = 0; i < n; i++)
147 m->m_data_pg_pools[i] = ceph_decode_32(p);
148 m->m_cas_pg_pool = ceph_decode_32(p);
149
150 /* ok, we don't care about the rest. */
151 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
152 return m;
153
154badmem:
155 err = -ENOMEM;
156bad:
157 pr_err("corrupt mdsmap\n");
158 print_hex_dump(KERN_DEBUG, "mdsmap: ",
159 DUMP_PREFIX_OFFSET, 16, 1,
160 start, end - start, true);
161 ceph_mdsmap_destroy(m);
162 return ERR_PTR(-EINVAL);
163}
164
165void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
166{
167 int i;
168
169 for (i = 0; i < m->m_max_mds; i++)
170 kfree(m->m_info[i].export_targets);
171 kfree(m->m_info);
172 kfree(m->m_data_pg_pools);
173 kfree(m);
174}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 u32 *export_targets;
17};
18
19struct ceph_mdsmap {
20 u32 m_epoch, m_client_epoch, m_last_failure;
21 u32 m_root;
22 u32 m_session_timeout; /* seconds */
23 u32 m_session_autoclose; /* seconds */
24 u64 m_max_file_size;
25 u32 m_max_mds; /* size of m_info array */
26 struct ceph_mds_info *m_info;
27
28 /* which object pools file data can be stored in */
29 int m_num_data_pg_pools;
30 u32 *m_data_pg_pools;
31 u32 m_cas_pg_pool;
32};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
42static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
43{
44 BUG_ON(w < 0);
45 if (w >= m->m_max_mds)
46 return CEPH_MDS_STATE_DNE;
47 return m->m_info[w].state;
48}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..cdaaa131add3
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2249 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/slab.h>
10#include <linux/socket.h>
11#include <linux/string.h>
12#include <net/tcp.h>
13
14#include "super.h"
15#include "messenger.h"
16#include "decode.h"
17#include "pagelist.h"
18
19/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other
21 * hosts in the system. The messenger provides ordered and reliable
22 * delivery. We tolerate TCP disconnects by reconnecting (with
23 * exponential backoff) in the case of a fault (disconnection, bad
24 * crc, protocol error). Acks allow sent messages to be discarded by
25 * the sender.
26 */
27
28/* static tag bytes (protocol control messages) */
29static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
37
38static void queue_con(struct ceph_connection *con);
39static void con_work(struct work_struct *);
40static void ceph_fault(struct ceph_connection *con);
41
42const char *ceph_name_type_str(int t)
43{
44 switch (t) {
45 case CEPH_ENTITY_TYPE_MON: return "mon";
46 case CEPH_ENTITY_TYPE_MDS: return "mds";
47 case CEPH_ENTITY_TYPE_OSD: return "osd";
48 case CEPH_ENTITY_TYPE_CLIENT: return "client";
49 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
50 default: return "???";
51 }
52}
53
54/*
55 * nicely render a sockaddr as a string.
56 */
57#define MAX_ADDR_STR 20
58static char addr_str[MAX_ADDR_STR][64]; /* big enough for an IPv6 address plus port */
59static DEFINE_SPINLOCK(addr_str_lock);
60static int last_addr_str;
61
62const char *pr_addr(const struct sockaddr_storage *ss)
63{
64 int i;
65 char *s;
66 struct sockaddr_in *in4 = (void *)ss;
67 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
68 struct sockaddr_in6 *in6 = (void *)ss;
69
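	/* claim the next slot in a small ring of static buffers so several addresses can appear in one log line */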
70 spin_lock(&addr_str_lock);
71 i = last_addr_str++;
72 if (last_addr_str == MAX_ADDR_STR)
73 last_addr_str = 0;
74 spin_unlock(&addr_str_lock);
75 s = addr_str[i];
76
77 switch (ss->ss_family) {
78 case AF_INET:
79 sprintf(s, "%u.%u.%u.%u:%u",
80 (unsigned int)quad[0],
81 (unsigned int)quad[1],
82 (unsigned int)quad[2],
83 (unsigned int)quad[3],
84 (unsigned int)ntohs(in4->sin_port));
85 break;
86
87 case AF_INET6:
88 sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
89 ntohs(in6->sin6_addr.s6_addr16[0]),
90 ntohs(in6->sin6_addr.s6_addr16[1]),
91 ntohs(in6->sin6_addr.s6_addr16[2]),
92 ntohs(in6->sin6_addr.s6_addr16[3]),
93 ntohs(in6->sin6_addr.s6_addr16[4]),
94 ntohs(in6->sin6_addr.s6_addr16[5]),
95 ntohs(in6->sin6_addr.s6_addr16[6]),
96 ntohs(in6->sin6_addr.s6_addr16[7]),
97 (unsigned int)ntohs(in6->sin6_port));
98 break;
99
100 default:
101 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
102 }
103
104 return s;
105}
106
107static void encode_my_addr(struct ceph_messenger *msgr)
108{
109 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
110 ceph_encode_addr(&msgr->my_enc_addr);
111}
112
113/*
114 * work queue for all reading and writing to/from the socket.
115 */
116struct workqueue_struct *ceph_msgr_wq;
117
118int __init ceph_msgr_init(void)
119{
120 ceph_msgr_wq = create_workqueue("ceph-msgr");
121 if (IS_ERR(ceph_msgr_wq)) {
122 int ret = PTR_ERR(ceph_msgr_wq);
123 pr_err("msgr_init failed to create workqueue: %d\n", ret);
124 ceph_msgr_wq = NULL;
125 return ret;
126 }
127 return 0;
128}
129
130void ceph_msgr_exit(void)
131{
132 destroy_workqueue(ceph_msgr_wq);
133}
134
135/*
136 * socket callback functions
137 */
138
139/* data available on socket, or listen socket received a connect */
140static void ceph_data_ready(struct sock *sk, int count_unused)
141{
142 struct ceph_connection *con =
143 (struct ceph_connection *)sk->sk_user_data;
144 if (sk->sk_state != TCP_CLOSE_WAIT) {
145 dout("ceph_data_ready on %p state = %lu, queueing work\n",
146 con, con->state);
147 queue_con(con);
148 }
149}
150
151/* socket has buffer space for writing */
152static void ceph_write_space(struct sock *sk)
153{
154 struct ceph_connection *con =
155 (struct ceph_connection *)sk->sk_user_data;
156
157 /* only queue to workqueue if there is data we want to write. */
158 if (test_bit(WRITE_PENDING, &con->state)) {
159 dout("ceph_write_space %p queueing write work\n", con);
160 queue_con(con);
161 } else {
162 dout("ceph_write_space %p nothing to write\n", con);
163 }
164
165 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
166 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
167}
168
169/* socket's state has changed */
170static void ceph_state_change(struct sock *sk)
171{
172 struct ceph_connection *con =
173 (struct ceph_connection *)sk->sk_user_data;
174
175 dout("ceph_state_change %p state = %lu sk_state = %u\n",
176 con, con->state, sk->sk_state);
177
178 if (test_bit(CLOSED, &con->state))
179 return;
180
181 switch (sk->sk_state) {
182 case TCP_CLOSE:
183 dout("ceph_state_change TCP_CLOSE\n");
184 case TCP_CLOSE_WAIT:
185 dout("ceph_state_change TCP_CLOSE_WAIT\n");
186 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
187 if (test_bit(CONNECTING, &con->state))
188 con->error_msg = "connection failed";
189 else
190 con->error_msg = "socket closed";
191 queue_con(con);
192 }
193 break;
194 case TCP_ESTABLISHED:
195 dout("ceph_state_change TCP_ESTABLISHED\n");
196 queue_con(con);
197 break;
198 }
199}
200
201/*
202 * set up socket callbacks
203 */
204static void set_sock_callbacks(struct socket *sock,
205 struct ceph_connection *con)
206{
207 struct sock *sk = sock->sk;
208 sk->sk_user_data = (void *)con;
209 sk->sk_data_ready = ceph_data_ready;
210 sk->sk_write_space = ceph_write_space;
211 sk->sk_state_change = ceph_state_change;
212}
213
214
215/*
216 * socket helpers
217 */
218
219/*
220 * initiate connection to a remote socket.
221 */
222static struct socket *ceph_tcp_connect(struct ceph_connection *con)
223{
224 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
225 struct socket *sock;
226 int ret;
227
228 BUG_ON(con->sock);
229 ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
230 if (ret)
231 return ERR_PTR(ret);
232 con->sock = sock;
233 sock->sk->sk_allocation = GFP_NOFS;
234
235#ifdef CONFIG_LOCKDEP
236 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
237#endif
238
239 set_sock_callbacks(sock, con);
240
241 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
242
243 ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
244 if (ret == -EINPROGRESS) {
245 dout("connect %s EINPROGRESS sk_state = %u\n",
246 pr_addr(&con->peer_addr.in_addr),
247 sock->sk->sk_state);
248 ret = 0;
249 }
250 if (ret < 0) {
251 pr_err("connect %s error %d\n",
252 pr_addr(&con->peer_addr.in_addr), ret);
253 sock_release(sock);
254 con->sock = NULL;
255 con->error_msg = "connect error";
256 }
257
258 if (ret < 0)
259 return ERR_PTR(ret);
260 return sock;
261}
262
263static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
264{
265 struct kvec iov = {buf, len};
266 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
267
268 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
269}
270
271/*
272 * write something. @more is true if caller will be sending more data
273 * shortly.
274 */
275static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
276 size_t kvlen, size_t len, int more)
277{
278 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
279
280 if (more)
281 msg.msg_flags |= MSG_MORE;
282 else
283 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
284
285 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
286}
287
288
289/*
290 * Shutdown/close the socket for the given connection.
291 */
292static int con_close_socket(struct ceph_connection *con)
293{
294 int rc;
295
296 dout("con_close_socket on %p sock %p\n", con, con->sock);
297 if (!con->sock)
298 return 0;
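	/* mark the socket closed first so the state-change callback ignores the shutdown we trigger below */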
299 set_bit(SOCK_CLOSED, &con->state);
300 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
301 sock_release(con->sock);
302 con->sock = NULL;
303 clear_bit(SOCK_CLOSED, &con->state);
304 return rc;
305}
306
307/*
308 * Reset a connection. Discard all incoming and outgoing messages
309 * and clear *_seq state.
310 */
311static void ceph_msg_remove(struct ceph_msg *msg)
312{
313 list_del_init(&msg->list_head);
314 ceph_msg_put(msg);
315}
316static void ceph_msg_remove_list(struct list_head *head)
317{
318 while (!list_empty(head)) {
319 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
320 list_head);
321 ceph_msg_remove(msg);
322 }
323}
324
325static void reset_connection(struct ceph_connection *con)
326{
327 /* reset connection, out_queue, msg_ and connect_seq */
328 /* discard existing out_queue and msg_seq */
329 ceph_msg_remove_list(&con->out_queue);
330 ceph_msg_remove_list(&con->out_sent);
331
332 if (con->in_msg) {
333 ceph_msg_put(con->in_msg);
334 con->in_msg = NULL;
335 }
336
337 con->connect_seq = 0;
338 con->out_seq = 0;
339 if (con->out_msg) {
340 ceph_msg_put(con->out_msg);
341 con->out_msg = NULL;
342 }
343 con->in_seq = 0;
344 con->in_seq_acked = 0;
345}
346
347/*
348 * mark a peer down. drop any open connections.
349 */
350void ceph_con_close(struct ceph_connection *con)
351{
352 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
353 set_bit(CLOSED, &con->state); /* in case there's queued work */
354 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
355 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
356 clear_bit(KEEPALIVE_PENDING, &con->state);
357 clear_bit(WRITE_PENDING, &con->state);
358 mutex_lock(&con->mutex);
359 reset_connection(con);
360 cancel_delayed_work(&con->work);
361 mutex_unlock(&con->mutex);
362 queue_con(con);
363}
364
365/*
366 * Reopen a closed connection, with a new peer address.
367 */
368void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
369{
370 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
371 set_bit(OPENING, &con->state);
372 clear_bit(CLOSED, &con->state);
373 memcpy(&con->peer_addr, addr, sizeof(*addr));
374 con->delay = 0; /* reset backoff memory */
375 queue_con(con);
376}
377
378/*
379 * return true if this connection ever successfully opened
380 */
381bool ceph_con_opened(struct ceph_connection *con)
382{
383 return con->connect_seq > 0;
384}
385
386/*
387 * generic get/put
388 */
389struct ceph_connection *ceph_con_get(struct ceph_connection *con)
390{
391 dout("con_get %p nref = %d -> %d\n", con,
392 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
393 if (atomic_inc_not_zero(&con->nref))
394 return con;
395 return NULL;
396}
397
398void ceph_con_put(struct ceph_connection *con)
399{
400 dout("con_put %p nref = %d -> %d\n", con,
401 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
402 BUG_ON(atomic_read(&con->nref) == 0);
403 if (atomic_dec_and_test(&con->nref)) {
404 BUG_ON(con->sock);
405 kfree(con);
406 }
407}
408
409/*
410 * initialize a new connection.
411 */
412void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
413{
414 dout("con_init %p\n", con);
415 memset(con, 0, sizeof(*con));
416 atomic_set(&con->nref, 1);
417 con->msgr = msgr;
418 mutex_init(&con->mutex);
419 INIT_LIST_HEAD(&con->out_queue);
420 INIT_LIST_HEAD(&con->out_sent);
421 INIT_DELAYED_WORK(&con->work, con_work);
422}
423
424
425/*
426 * We maintain a global counter to order connection attempts. Get
427 * a unique seq greater than @gt.
428 */
429static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
430{
431 u32 ret;
432
433 spin_lock(&msgr->global_seq_lock);
434 if (msgr->global_seq < gt)
435 msgr->global_seq = gt;
436 ret = ++msgr->global_seq;
437 spin_unlock(&msgr->global_seq_lock);
438 return ret;
439}
440
441
442/*
443 * Prepare footer for currently outgoing message, and finish things
444 * off. Assumes out_kvec* are already valid.. we just add on to the end.
445 */
446static void prepare_write_message_footer(struct ceph_connection *con, int v)
447{
448 struct ceph_msg *m = con->out_msg;
449
450 dout("prepare_write_message_footer %p\n", con);
451 con->out_kvec_is_msg = true;
452 con->out_kvec[v].iov_base = &m->footer;
453 con->out_kvec[v].iov_len = sizeof(m->footer);
454 con->out_kvec_bytes += sizeof(m->footer);
455 con->out_kvec_left++;
456 con->out_more = m->more_to_follow;
457 con->out_msg_done = true;
458}
459
460/*
461 * Prepare headers for the next outgoing message.
462 */
463static void prepare_write_message(struct ceph_connection *con)
464{
465 struct ceph_msg *m;
466 int v = 0;
467
468 con->out_kvec_bytes = 0;
469 con->out_kvec_is_msg = true;
470 con->out_msg_done = false;
471
472 /* Sneak an ack in there first? If we can get it into the same
473 * TCP packet that's a good thing. */
474 if (con->in_seq > con->in_seq_acked) {
475 con->in_seq_acked = con->in_seq;
476 con->out_kvec[v].iov_base = &tag_ack;
477 con->out_kvec[v++].iov_len = 1;
478 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
479 con->out_kvec[v].iov_base = &con->out_temp_ack;
480 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
481 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
482 }
483
484 m = list_first_entry(&con->out_queue,
485 struct ceph_msg, list_head);
486 con->out_msg = m;
487 if (test_bit(LOSSYTX, &con->state)) {
488 list_del_init(&m->list_head);
489 } else {
490 /* put message on sent list */
491 ceph_msg_get(m);
492 list_move_tail(&m->list_head, &con->out_sent);
493 }
494
495 m->hdr.seq = cpu_to_le64(++con->out_seq);
496
497 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
498 m, con->out_seq, le16_to_cpu(m->hdr.type),
499 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
500 le32_to_cpu(m->hdr.data_len),
501 m->nr_pages);
502 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
503
504 /* tag + hdr + front + middle */
505 con->out_kvec[v].iov_base = &tag_msg;
506 con->out_kvec[v++].iov_len = 1;
507 con->out_kvec[v].iov_base = &m->hdr;
508 con->out_kvec[v++].iov_len = sizeof(m->hdr);
509 con->out_kvec[v++] = m->front;
510 if (m->middle)
511 con->out_kvec[v++] = m->middle->vec;
512 con->out_kvec_left = v;
513 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
514 (m->middle ? m->middle->vec.iov_len : 0);
515 con->out_kvec_cur = con->out_kvec;
516
517 /* fill in crc (except data pages), footer */
518 con->out_msg->hdr.crc =
519 cpu_to_le32(crc32c(0, (void *)&m->hdr,
520 sizeof(m->hdr) - sizeof(m->hdr.crc)));
521 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
522 con->out_msg->footer.front_crc =
523 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
524 if (m->middle)
525 con->out_msg->footer.middle_crc =
526 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
527 m->middle->vec.iov_len));
528 else
529 con->out_msg->footer.middle_crc = 0;
530 con->out_msg->footer.data_crc = 0;
531 dout("prepare_write_message front_crc %u data_crc %u\n",
532 le32_to_cpu(con->out_msg->footer.front_crc),
533 le32_to_cpu(con->out_msg->footer.middle_crc));
534
535 /* is there a data payload? */
536 if (le32_to_cpu(m->hdr.data_len) > 0) {
537 /* initialize page iterator */
538 con->out_msg_pos.page = 0;
539 con->out_msg_pos.page_pos =
540 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
541 con->out_msg_pos.data_pos = 0;
542 con->out_msg_pos.did_page_crc = 0;
543 con->out_more = 1; /* data + footer will follow */
544 } else {
545 /* no, queue up footer too and be done */
546 prepare_write_message_footer(con, v);
547 }
548
549 set_bit(WRITE_PENDING, &con->state);
550}
551
552/*
553 * Prepare an ack.
554 */
555static void prepare_write_ack(struct ceph_connection *con)
556{
557 dout("prepare_write_ack %p %llu -> %llu\n", con,
558 con->in_seq_acked, con->in_seq);
559 con->in_seq_acked = con->in_seq;
560
561 con->out_kvec[0].iov_base = &tag_ack;
562 con->out_kvec[0].iov_len = 1;
563 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
564 con->out_kvec[1].iov_base = &con->out_temp_ack;
565 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
566 con->out_kvec_left = 2;
567 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
568 con->out_kvec_cur = con->out_kvec;
569 con->out_more = 1; /* more will follow.. eventually.. */
570 set_bit(WRITE_PENDING, &con->state);
571}
572
573/*
574 * Prepare to write keepalive byte.
575 */
576static void prepare_write_keepalive(struct ceph_connection *con)
577{
578 dout("prepare_write_keepalive %p\n", con);
579 con->out_kvec[0].iov_base = &tag_keepalive;
580 con->out_kvec[0].iov_len = 1;
581 con->out_kvec_left = 1;
582 con->out_kvec_bytes = 1;
583 con->out_kvec_cur = con->out_kvec;
584 set_bit(WRITE_PENDING, &con->state);
585}
586
587/*
588 * Connection negotiation.
589 */
590
591static void prepare_connect_authorizer(struct ceph_connection *con)
592{
593 void *auth_buf = NULL;
594 int auth_len = 0;
595 int auth_protocol = 0;
596
597 mutex_unlock(&con->mutex);
598 if (con->ops->get_authorizer)
599 con->ops->get_authorizer(con, &auth_buf, &auth_len,
600 &auth_protocol, &con->auth_reply_buf,
601 &con->auth_reply_buf_len,
602 con->auth_retry);
603 mutex_lock(&con->mutex);
604
605 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
606 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
607
608 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
609 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
610 con->out_kvec_left++;
611 con->out_kvec_bytes += auth_len;
612}
613
614/*
615 * We connected to a peer and are saying hello.
616 */
617static void prepare_write_banner(struct ceph_messenger *msgr,
618 struct ceph_connection *con)
619{
620 int len = strlen(CEPH_BANNER);
621
622 con->out_kvec[0].iov_base = CEPH_BANNER;
623 con->out_kvec[0].iov_len = len;
624 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
625 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
626 con->out_kvec_left = 2;
627 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
628 con->out_kvec_cur = con->out_kvec;
629 con->out_more = 0;
630 set_bit(WRITE_PENDING, &con->state);
631}
632
633static void prepare_write_connect(struct ceph_messenger *msgr,
634 struct ceph_connection *con,
635 int after_banner)
636{
637 unsigned global_seq = get_global_seq(con->msgr, 0);
638 int proto;
639
640 switch (con->peer_name.type) {
641 case CEPH_ENTITY_TYPE_MON:
642 proto = CEPH_MONC_PROTOCOL;
643 break;
644 case CEPH_ENTITY_TYPE_OSD:
645 proto = CEPH_OSDC_PROTOCOL;
646 break;
647 case CEPH_ENTITY_TYPE_MDS:
648 proto = CEPH_MDSC_PROTOCOL;
649 break;
650 default:
651 BUG();
652 }
653
654 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
655 con->connect_seq, global_seq, proto);
656
657 con->out_connect.features = CEPH_FEATURE_SUPPORTED;
658 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
659 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
660 con->out_connect.global_seq = cpu_to_le32(global_seq);
661 con->out_connect.protocol_version = cpu_to_le32(proto);
662 con->out_connect.flags = 0;
663
664 if (!after_banner) {
665 con->out_kvec_left = 0;
666 con->out_kvec_bytes = 0;
667 }
668 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
669 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
670 con->out_kvec_left++;
671 con->out_kvec_bytes += sizeof(con->out_connect);
672 con->out_kvec_cur = con->out_kvec;
673 con->out_more = 0;
674 set_bit(WRITE_PENDING, &con->state);
675
676 prepare_connect_authorizer(con);
677}
678
679
680/*
681 * Write as much of the pending kvecs to the socket as we can.
682 * 1 -> done
683 * 0 -> socket full, but more to do
684 * <0 -> error
685 */
686static int write_partial_kvec(struct ceph_connection *con)
687{
688 int ret;
689
690 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
691 while (con->out_kvec_bytes > 0) {
692 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
693 con->out_kvec_left, con->out_kvec_bytes,
694 con->out_more);
695 if (ret <= 0)
696 goto out;
697 con->out_kvec_bytes -= ret;
698 if (con->out_kvec_bytes == 0)
699 break; /* done */
700 while (ret > 0) {
701 if (ret >= con->out_kvec_cur->iov_len) {
702 ret -= con->out_kvec_cur->iov_len;
703 con->out_kvec_cur++;
704 con->out_kvec_left--;
705 } else {
706 con->out_kvec_cur->iov_len -= ret;
707 con->out_kvec_cur->iov_base += ret;
708 ret = 0;
709 break;
710 }
711 }
712 }
713 con->out_kvec_left = 0;
714 con->out_kvec_is_msg = false;
715 ret = 1;
716out:
717 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
718 con->out_kvec_bytes, con->out_kvec_left, ret);
719 return ret; /* done! */
720}
721
722/*
723 * Write as much message data payload as we can. If we finish, queue
724 * up the footer.
725 * 1 -> done, footer is now queued in out_kvec[].
726 * 0 -> socket full, but more to do
727 * <0 -> error
728 */
729static int write_partial_msg_pages(struct ceph_connection *con)
730{
731 struct ceph_msg *msg = con->out_msg;
732 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
733 size_t len;
734 int crc = con->msgr->nocrc;
735 int ret;
736
737 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
738 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
739 con->out_msg_pos.page_pos);
740
741 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
742 struct page *page = NULL;
743 void *kaddr = NULL;
744
745 /*
746 * if we are calculating the data crc (the default), we need
747 * to map the page. if our pages[] has been revoked, use the
748 * zero page.
749 */
750 if (msg->pages) {
751 page = msg->pages[con->out_msg_pos.page];
752 if (crc)
753 kaddr = kmap(page);
754 } else if (msg->pagelist) {
755 page = list_first_entry(&msg->pagelist->head,
756 struct page, lru);
757 if (crc)
758 kaddr = kmap(page);
759 } else {
760 page = con->msgr->zero_page;
761 if (crc)
762 kaddr = page_address(con->msgr->zero_page);
763 }
764 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
765 (int)(data_len - con->out_msg_pos.data_pos));
766 if (crc && !con->out_msg_pos.did_page_crc) {
767 void *base = kaddr + con->out_msg_pos.page_pos;
768 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
769
770 BUG_ON(kaddr == NULL);
771 con->out_msg->footer.data_crc =
772 cpu_to_le32(crc32c(tmpcrc, base, len));
773 con->out_msg_pos.did_page_crc = 1;
774 }
775
776 ret = kernel_sendpage(con->sock, page,
777 con->out_msg_pos.page_pos, len,
778 MSG_DONTWAIT | MSG_NOSIGNAL |
779 MSG_MORE);
780
781 if (crc && (msg->pages || msg->pagelist))
782 kunmap(page);
783
784 if (ret <= 0)
785 goto out;
786
787 con->out_msg_pos.data_pos += ret;
788 con->out_msg_pos.page_pos += ret;
789 if (ret == len) {
790 con->out_msg_pos.page_pos = 0;
791 con->out_msg_pos.page++;
792 con->out_msg_pos.did_page_crc = 0;
793 if (msg->pagelist)
794 list_move_tail(&page->lru,
795 &msg->pagelist->head);
796 }
797 }
798
799 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
800
801 /* prepare and queue up footer, too */
802 if (!crc)
803 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
804 con->out_kvec_bytes = 0;
805 con->out_kvec_left = 0;
806 con->out_kvec_cur = con->out_kvec;
807 prepare_write_message_footer(con, 0);
808 ret = 1;
809out:
810 return ret;
811}
812
813/*
814 * write some zeros
815 */
816static int write_partial_skip(struct ceph_connection *con)
817{
818 int ret;
819
820 while (con->out_skip > 0) {
821 struct kvec iov = {
822 .iov_base = page_address(con->msgr->zero_page),
823 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
824 };
825
826 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
827 if (ret <= 0)
828 goto out;
829 con->out_skip -= ret;
830 }
831 ret = 1;
832out:
833 return ret;
834}
835
836/*
837 * Prepare to read connection handshake, or an ack.
838 */
839static void prepare_read_banner(struct ceph_connection *con)
840{
841 dout("prepare_read_banner %p\n", con);
842 con->in_base_pos = 0;
843}
844
845static void prepare_read_connect(struct ceph_connection *con)
846{
847 dout("prepare_read_connect %p\n", con);
848 con->in_base_pos = 0;
849}
850
851static void prepare_read_ack(struct ceph_connection *con)
852{
853 dout("prepare_read_ack %p\n", con);
854 con->in_base_pos = 0;
855}
856
857static void prepare_read_tag(struct ceph_connection *con)
858{
859 dout("prepare_read_tag %p\n", con);
860 con->in_base_pos = 0;
861 con->in_tag = CEPH_MSGR_TAG_READY;
862}
863
864/*
865 * Prepare to read a message.
866 */
867static int prepare_read_message(struct ceph_connection *con)
868{
869 dout("prepare_read_message %p\n", con);
870 BUG_ON(con->in_msg != NULL);
871 con->in_base_pos = 0;
872 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
873 return 0;
874}
875
876
877static int read_partial(struct ceph_connection *con,
878 int *to, int size, void *object)
879{
880 *to += size;
881 while (con->in_base_pos < *to) {
882 int left = *to - con->in_base_pos;
883 int have = size - left;
884 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
885 if (ret <= 0)
886 return ret;
887 con->in_base_pos += ret;
888 }
889 return 1;
890}
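/*
 * [Editor's note -- not part of the original file.]  read_partial() keeps a
 * running offset: *to is advanced by @size up front, while con->in_base_pos
 * records how much of the overall byte sequence has actually arrived.  A
 * caller such as read_partial_banner() can therefore call it several times
 * in a row with the same @to counter and transparently resume a
 * partially-read object after a short read; it returns <= 0 until the
 * current object is complete.
 */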
891
892
893/*
894 * Read all or part of the connect-side handshake on a new connection
895 */
896static int read_partial_banner(struct ceph_connection *con)
897{
898 int ret, to = 0;
899
900 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
901
902 /* peer's banner */
903 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
904 if (ret <= 0)
905 goto out;
906 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
907 &con->actual_peer_addr);
908 if (ret <= 0)
909 goto out;
910 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
911 &con->peer_addr_for_me);
912 if (ret <= 0)
913 goto out;
914out:
915 return ret;
916}
917
918static int read_partial_connect(struct ceph_connection *con)
919{
920 int ret, to = 0;
921
922 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
923
924 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
925 if (ret <= 0)
926 goto out;
927 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
928 con->auth_reply_buf);
929 if (ret <= 0)
930 goto out;
931
932 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
933 con, (int)con->in_reply.tag,
934 le32_to_cpu(con->in_reply.connect_seq),
935 le32_to_cpu(con->in_reply.global_seq));
936out:
937 return ret;
938
939}
940
941/*
942 * Verify the hello banner looks okay.
943 */
944static int verify_hello(struct ceph_connection *con)
945{
946 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
947 pr_err("connect to %s got bad banner\n",
948 pr_addr(&con->peer_addr.in_addr));
949 con->error_msg = "protocol error, bad banner";
950 return -1;
951 }
952 return 0;
953}
954
955static bool addr_is_blank(struct sockaddr_storage *ss)
956{
957 switch (ss->ss_family) {
958 case AF_INET:
959 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
960 case AF_INET6:
961 return
962 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
963 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
964 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
965 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
966 }
967 return false;
968}
969
970static int addr_port(struct sockaddr_storage *ss)
971{
972 switch (ss->ss_family) {
973 case AF_INET:
974 return ntohs(((struct sockaddr_in *)ss)->sin_port);
975 case AF_INET6:
976 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
977 }
978 return 0;
979}
980
981static void addr_set_port(struct sockaddr_storage *ss, int p)
982{
983 switch (ss->ss_family) {
984 case AF_INET:
985 ((struct sockaddr_in *)ss)->sin_port = htons(p);
986 case AF_INET6:
987 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
988 }
989}
990
991/*
992 * Parse an ip[:port] list into an addr array. Use the default
993 * monitor port if a port isn't specified.
994 */
995int ceph_parse_ips(const char *c, const char *end,
996 struct ceph_entity_addr *addr,
997 int max_count, int *count)
998{
999 int i;
1000 const char *p = c;
1001
1002 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
1003 for (i = 0; i < max_count; i++) {
1004 const char *ipend;
1005 struct sockaddr_storage *ss = &addr[i].in_addr;
1006 struct sockaddr_in *in4 = (void *)ss;
1007 struct sockaddr_in6 *in6 = (void *)ss;
1008 int port;
1009
1010 memset(ss, 0, sizeof(*ss));
1011 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1012 ',', &ipend)) {
1013 ss->ss_family = AF_INET;
1014 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1015 ',', &ipend)) {
1016 ss->ss_family = AF_INET6;
1017 } else {
1018 goto bad;
1019 }
1020 p = ipend;
1021
1022 /* port? */
1023 if (p < end && *p == ':') {
1024 port = 0;
1025 p++;
1026 while (p < end && *p >= '0' && *p <= '9') {
1027 port = (port * 10) + (*p - '0');
1028 p++;
1029 }
1030 if (port > 65535 || port == 0)
1031 goto bad;
1032 } else {
1033 port = CEPH_MON_PORT;
1034 }
1035
1036 addr_set_port(ss, port);
1037
1038 dout("parse_ips got %s\n", pr_addr(ss));
1039
1040 if (p == end)
1041 break;
1042 if (*p != ',')
1043 goto bad;
1044 p++;
1045 }
1046
1047 if (p != end)
1048 goto bad;
1049
1050 if (count)
1051 *count = i + 1;
1052 return 0;
1053
1054bad:
1055 pr_err("parse_ips bad ip '%s'\n", c);
1056 return -EINVAL;
1057}
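Editor's addition, not part of the original file: a minimal usage sketch for
ceph_parse_ips(), parsing a comma-separated monitor list such as the one taken
from the mount device string.  The addresses are made up and the helper name
is hypothetical.

	static int parse_mon_addrs_sketch(void)
	{
		const char *opt = "192.168.0.1:6789,192.168.0.2";  /* made-up input */
		struct ceph_entity_addr mon_addr[2];
		int num_mon = 0;
		int err;

		err = ceph_parse_ips(opt, opt + strlen(opt),
				     mon_addr, ARRAY_SIZE(mon_addr), &num_mon);
		if (err < 0)
			return err;	/* -EINVAL on a malformed address */

		/*
		 * num_mon is now 2; the first entry keeps the explicit port,
		 * the second falls back to CEPH_MON_PORT.
		 */
		return num_mon;
	}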
1058
1059static int process_banner(struct ceph_connection *con)
1060{
1061 dout("process_banner on %p\n", con);
1062
1063 if (verify_hello(con) < 0)
1064 return -1;
1065
1066 ceph_decode_addr(&con->actual_peer_addr);
1067 ceph_decode_addr(&con->peer_addr_for_me);
1068
1069 /*
1070 * Make sure the other end is who we wanted. note that the other
1071 * end may not yet know their ip address, so if it's 0.0.0.0, give
1072 * them the benefit of the doubt.
1073 */
1074 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1075 sizeof(con->peer_addr)) != 0 &&
1076 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1077 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1078 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
1079 pr_addr(&con->peer_addr.in_addr),
1080 le64_to_cpu(con->peer_addr.nonce),
1081 pr_addr(&con->actual_peer_addr.in_addr),
1082 le64_to_cpu(con->actual_peer_addr.nonce));
1083 con->error_msg = "wrong peer at address";
1084 return -1;
1085 }
1086
1087 /*
1088 * did we learn our address?
1089 */
1090 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1091 int port = addr_port(&con->msgr->inst.addr.in_addr);
1092
1093 memcpy(&con->msgr->inst.addr.in_addr,
1094 &con->peer_addr_for_me.in_addr,
1095 sizeof(con->peer_addr_for_me.in_addr));
1096 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1097 encode_my_addr(con->msgr);
1098 dout("process_banner learned my addr is %s\n",
1099 pr_addr(&con->msgr->inst.addr.in_addr));
1100 }
1101
1102 set_bit(NEGOTIATING, &con->state);
1103 prepare_read_connect(con);
1104 return 0;
1105}
1106
1107static void fail_protocol(struct ceph_connection *con)
1108{
1109 reset_connection(con);
1110 set_bit(CLOSED, &con->state); /* in case there's queued work */
1111
1112 mutex_unlock(&con->mutex);
1113 if (con->ops->bad_proto)
1114 con->ops->bad_proto(con);
1115 mutex_lock(&con->mutex);
1116}
1117
1118static int process_connect(struct ceph_connection *con)
1119{
1120 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1121 u64 req_feat = CEPH_FEATURE_REQUIRED;
1122 u64 server_feat = le64_to_cpu(con->in_reply.features);
1123
1124 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1125
1126 switch (con->in_reply.tag) {
1127 case CEPH_MSGR_TAG_FEATURES:
1128 pr_err("%s%lld %s feature set mismatch,"
1129 " my %llx < server's %llx, missing %llx\n",
1130 ENTITY_NAME(con->peer_name),
1131 pr_addr(&con->peer_addr.in_addr),
1132 sup_feat, server_feat, server_feat & ~sup_feat);
1133 con->error_msg = "missing required protocol features";
1134 fail_protocol(con);
1135 return -1;
1136
1137 case CEPH_MSGR_TAG_BADPROTOVER:
1138 pr_err("%s%lld %s protocol version mismatch,"
1139 " my %d != server's %d\n",
1140 ENTITY_NAME(con->peer_name),
1141 pr_addr(&con->peer_addr.in_addr),
1142 le32_to_cpu(con->out_connect.protocol_version),
1143 le32_to_cpu(con->in_reply.protocol_version));
1144 con->error_msg = "protocol version mismatch";
1145 fail_protocol(con);
1146 return -1;
1147
1148 case CEPH_MSGR_TAG_BADAUTHORIZER:
1149 con->auth_retry++;
1150 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1151 con->auth_retry);
1152 if (con->auth_retry == 2) {
1153 con->error_msg = "connect authorization failure";
1154 reset_connection(con);
1155 set_bit(CLOSED, &con->state);
1156 return -1;
1157 }
1158 con->auth_retry = 1;
1159 prepare_write_connect(con->msgr, con, 0);
1160 prepare_read_connect(con);
1161 break;
1162
1163 case CEPH_MSGR_TAG_RESETSESSION:
1164 /*
1165 * If we connected with a large connect_seq but the peer
1166 * has no record of a session with us (no connection, or
1167 * connect_seq == 0), they will send RESETSESSION to indicate
1168 * that they must have reset their session, and may have
1169 * dropped messages.
1170 */
1171 dout("process_connect got RESET peer seq %u\n",
1172 le32_to_cpu(con->in_connect.connect_seq));
1173 pr_err("%s%lld %s connection reset\n",
1174 ENTITY_NAME(con->peer_name),
1175 pr_addr(&con->peer_addr.in_addr));
1176 reset_connection(con);
1177 prepare_write_connect(con->msgr, con, 0);
1178 prepare_read_connect(con);
1179
1180 /* Tell ceph about it. */
1181 mutex_unlock(&con->mutex);
1182 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1183 if (con->ops->peer_reset)
1184 con->ops->peer_reset(con);
1185 mutex_lock(&con->mutex);
1186 break;
1187
1188 case CEPH_MSGR_TAG_RETRY_SESSION:
1189 /*
1190 * If we sent a smaller connect_seq than the peer has, try
1191 * again with a larger value.
1192 */
1193 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1194 le32_to_cpu(con->out_connect.connect_seq),
1195 le32_to_cpu(con->in_connect.connect_seq));
1196 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1197 prepare_write_connect(con->msgr, con, 0);
1198 prepare_read_connect(con);
1199 break;
1200
1201 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1202 /*
1203 * If we sent a smaller global_seq than the peer has, try
1204 * again with a larger value.
1205 */
1206 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1207 con->peer_global_seq,
1208 le32_to_cpu(con->in_connect.global_seq));
1209 get_global_seq(con->msgr,
1210 le32_to_cpu(con->in_connect.global_seq));
1211 prepare_write_connect(con->msgr, con, 0);
1212 prepare_read_connect(con);
1213 break;
1214
1215 case CEPH_MSGR_TAG_READY:
1216 if (req_feat & ~server_feat) {
1217 pr_err("%s%lld %s protocol feature mismatch,"
1218 " my required %llx > server's %llx, need %llx\n",
1219 ENTITY_NAME(con->peer_name),
1220 pr_addr(&con->peer_addr.in_addr),
1221 req_feat, server_feat, req_feat & ~server_feat);
1222 con->error_msg = "missing required protocol features";
1223 fail_protocol(con);
1224 return -1;
1225 }
1226 clear_bit(CONNECTING, &con->state);
1227 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1228 con->connect_seq++;
1229 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1230 con->peer_global_seq,
1231 le32_to_cpu(con->in_reply.connect_seq),
1232 con->connect_seq);
1233 WARN_ON(con->connect_seq !=
1234 le32_to_cpu(con->in_reply.connect_seq));
1235
1236 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1237 set_bit(LOSSYTX, &con->state);
1238
1239 prepare_read_tag(con);
1240 break;
1241
1242 case CEPH_MSGR_TAG_WAIT:
1243 /*
1244 * If there is a connection race (we are opening
1245 * connections to each other), one of us may just have
1246 * to WAIT. This shouldn't happen if we are the
1247 * client.
1248 */
1249 pr_err("process_connect peer connecting WAIT\n");
1250
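	/*
	 * [Editor's note] There is no break above, so the WAIT case falls
	 * through into the default case below and ends up being treated as
	 * an error; per the comment, the kernel client should never be the
	 * side that has to wait.
	 */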
1251 default:
1252 pr_err("connect protocol error, will retry\n");
1253 con->error_msg = "protocol error, garbage tag during connect";
1254 return -1;
1255 }
1256 return 0;
1257}
1258
1259
1260/*
1261 * read (part of) an ack
1262 */
1263static int read_partial_ack(struct ceph_connection *con)
1264{
1265 int to = 0;
1266
1267 return read_partial(con, &to, sizeof(con->in_temp_ack),
1268 &con->in_temp_ack);
1269}
1270
1271
1272/*
1273 * We can finally discard anything that's been acked.
1274 */
1275static void process_ack(struct ceph_connection *con)
1276{
1277 struct ceph_msg *m;
1278 u64 ack = le64_to_cpu(con->in_temp_ack);
1279 u64 seq;
1280
1281 while (!list_empty(&con->out_sent)) {
1282 m = list_first_entry(&con->out_sent, struct ceph_msg,
1283 list_head);
1284 seq = le64_to_cpu(m->hdr.seq);
1285 if (seq > ack)
1286 break;
1287 dout("got ack for seq %llu type %d at %p\n", seq,
1288 le16_to_cpu(m->hdr.type), m);
1289 ceph_msg_remove(m);
1290 }
1291 prepare_read_tag(con);
1292}
1293
1294
1295
1296
1297static int read_partial_message_section(struct ceph_connection *con,
1298 struct kvec *section, unsigned int sec_len,
1299 u32 *crc)
1300{
1301 int left;
1302 int ret;
1303
1304 BUG_ON(!section);
1305
1306 while (section->iov_len < sec_len) {
1307 BUG_ON(section->iov_base == NULL);
1308 left = sec_len - section->iov_len;
1309 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1310 section->iov_len, left);
1311 if (ret <= 0)
1312 return ret;
1313 section->iov_len += ret;
1314 if (section->iov_len == sec_len)
1315 *crc = crc32c(0, section->iov_base,
1316 section->iov_len);
1317 }
1318
1319 return 1;
1320}
1321
1322static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1323 struct ceph_msg_header *hdr,
1324 int *skip);
1325/*
1326 * read (part of) a message.
1327 */
1328static int read_partial_message(struct ceph_connection *con)
1329{
1330 struct ceph_msg *m = con->in_msg;
1331 void *p;
1332 int ret;
1333 int to, left;
1334 unsigned front_len, middle_len, data_len, data_off;
1335 int datacrc = con->msgr->nocrc;
1336 int skip;
1337
1338 dout("read_partial_message con %p msg %p\n", con, m);
1339
1340 /* header */
1341 while (con->in_base_pos < sizeof(con->in_hdr)) {
1342 left = sizeof(con->in_hdr) - con->in_base_pos;
1343 ret = ceph_tcp_recvmsg(con->sock,
1344 (char *)&con->in_hdr + con->in_base_pos,
1345 left);
1346 if (ret <= 0)
1347 return ret;
1348 con->in_base_pos += ret;
1349 if (con->in_base_pos == sizeof(con->in_hdr)) {
1350 u32 crc = crc32c(0, (void *)&con->in_hdr,
1351 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1352 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1353 pr_err("read_partial_message bad hdr "
1354 "crc %u != expected %u\n",
1355 crc, le32_to_cpu(con->in_hdr.crc));
1356 return -EBADMSG;
1357 }
1358 }
1359 }
1360 front_len = le32_to_cpu(con->in_hdr.front_len);
1361 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1362 return -EIO;
1363 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1364 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1365 return -EIO;
1366 data_len = le32_to_cpu(con->in_hdr.data_len);
1367 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1368 return -EIO;
1369 data_off = le16_to_cpu(con->in_hdr.data_off);
1370
1371 /* allocate message? */
1372 if (!con->in_msg) {
1373 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1374 con->in_hdr.front_len, con->in_hdr.data_len);
1375 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1376 if (skip) {
1377 /* skip this message */
1378 dout("alloc_msg returned NULL, skipping message\n");
1379 con->in_base_pos = -front_len - middle_len - data_len -
1380 sizeof(m->footer);
1381 con->in_tag = CEPH_MSGR_TAG_READY;
1382 return 0;
1383 }
1384 if (IS_ERR(con->in_msg)) {
1385 ret = PTR_ERR(con->in_msg);
1386 con->in_msg = NULL;
1387 con->error_msg =
1388 "error allocating memory for incoming message";
1389 return ret;
1390 }
1391 m = con->in_msg;
1392 m->front.iov_len = 0; /* haven't read it yet */
1393 if (m->middle)
1394 m->middle->vec.iov_len = 0;
1395
1396 con->in_msg_pos.page = 0;
1397 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1398 con->in_msg_pos.data_pos = 0;
1399 }
1400
1401 /* front */
1402 ret = read_partial_message_section(con, &m->front, front_len,
1403 &con->in_front_crc);
1404 if (ret <= 0)
1405 return ret;
1406
1407 /* middle */
1408 if (m->middle) {
1409 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1410 &con->in_middle_crc);
1411 if (ret <= 0)
1412 return ret;
1413 }
1414
1415 /* (page) data */
1416 while (con->in_msg_pos.data_pos < data_len) {
1417 left = min((int)(data_len - con->in_msg_pos.data_pos),
1418 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1419 BUG_ON(m->pages == NULL);
1420 p = kmap(m->pages[con->in_msg_pos.page]);
1421 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1422 left);
1423 if (ret > 0 && datacrc)
1424 con->in_data_crc =
1425 crc32c(con->in_data_crc,
1426 p + con->in_msg_pos.page_pos, ret);
1427 kunmap(m->pages[con->in_msg_pos.page]);
1428 if (ret <= 0)
1429 return ret;
1430 con->in_msg_pos.data_pos += ret;
1431 con->in_msg_pos.page_pos += ret;
1432 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1433 con->in_msg_pos.page_pos = 0;
1434 con->in_msg_pos.page++;
1435 }
1436 }
1437
1438 /* footer */
1439 to = sizeof(m->hdr) + sizeof(m->footer);
1440 while (con->in_base_pos < to) {
1441 left = to - con->in_base_pos;
1442 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1443 (con->in_base_pos - sizeof(m->hdr)),
1444 left);
1445 if (ret <= 0)
1446 return ret;
1447 con->in_base_pos += ret;
1448 }
1449 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1450 m, front_len, m->footer.front_crc, middle_len,
1451 m->footer.middle_crc, data_len, m->footer.data_crc);
1452
1453 /* crc ok? */
1454 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1455 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1456 m, con->in_front_crc, le32_to_cpu(m->footer.front_crc));
1457 return -EBADMSG;
1458 }
1459 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1460 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1461 m, con->in_middle_crc, le32_to_cpu(m->footer.middle_crc));
1462 return -EBADMSG;
1463 }
1464 if (datacrc &&
1465 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1466 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1467 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1468 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1469 return -EBADMSG;
1470 }
1471
1472 return 1; /* done! */
1473}
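/*
 * [Editor's note -- not part of the original file.]  When alloc_msg asks us
 * to skip an incoming message (the "skip this message" branch above),
 * in_base_pos is set to minus the number of bytes still on the wire
 * (front + middle + data + footer).  try_read() recognizes the negative
 * value and simply reads and discards that many bytes before looking for
 * the next tag; see its "skipping + discarding content" block.
 */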
1474
1475/*
1476 * Process message. This happens in the worker thread. The callback should
1477 * be careful not to do anything that waits on other incoming messages or it
1478 * may deadlock.
1479 */
1480static void process_message(struct ceph_connection *con)
1481{
1482 struct ceph_msg *msg;
1483
1484 msg = con->in_msg;
1485 con->in_msg = NULL;
1486
1487 /* if first message, set peer_name */
1488 if (con->peer_name.type == 0)
1489 con->peer_name = msg->hdr.src.name;
1490
1491 con->in_seq++;
1492 mutex_unlock(&con->mutex);
1493
1494 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1495 msg, le64_to_cpu(msg->hdr.seq),
1496 ENTITY_NAME(msg->hdr.src.name),
1497 le16_to_cpu(msg->hdr.type),
1498 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1499 le32_to_cpu(msg->hdr.front_len),
1500 le32_to_cpu(msg->hdr.data_len),
1501 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1502 con->ops->dispatch(con, msg);
1503
1504 mutex_lock(&con->mutex);
1505 prepare_read_tag(con);
1506}
1507
1508
1509/*
1510 * Write something to the socket. Called in a worker thread when the
1511 * socket appears to be writeable and we have something ready to send.
1512 */
1513static int try_write(struct ceph_connection *con)
1514{
1515 struct ceph_messenger *msgr = con->msgr;
1516 int ret = 1;
1517
1518 dout("try_write start %p state %lu nref %d\n", con, con->state,
1519 atomic_read(&con->nref));
1520
1521 mutex_lock(&con->mutex);
1522more:
1523 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1524
1525 /* open the socket first? */
1526 if (con->sock == NULL) {
1527 /*
1528 * if we were STANDBY and are reconnecting _this_
1529 * connection, bump connect_seq now. Always bump
1530 * global_seq.
1531 */
1532 if (test_and_clear_bit(STANDBY, &con->state))
1533 con->connect_seq++;
1534
1535 prepare_write_banner(msgr, con);
1536 prepare_write_connect(msgr, con, 1);
1537 prepare_read_banner(con);
1538 set_bit(CONNECTING, &con->state);
1539 clear_bit(NEGOTIATING, &con->state);
1540
1541 BUG_ON(con->in_msg);
1542 con->in_tag = CEPH_MSGR_TAG_READY;
1543 dout("try_write initiating connect on %p new state %lu\n",
1544 con, con->state);
1545 con->sock = ceph_tcp_connect(con);
1546 if (IS_ERR(con->sock)) {
1547 con->sock = NULL;
1548 con->error_msg = "connect error";
1549 ret = -1;
1550 goto out;
1551 }
1552 }
1553
1554more_kvec:
1555 /* kvec data queued? */
1556 if (con->out_skip) {
1557 ret = write_partial_skip(con);
1558 if (ret < 0) {
1559 dout("try_write write_partial_skip err %d\n", ret);
1560 goto done;
1561 }
1562 if (ret == 0)
1563 goto done;
1564 }
1565 if (con->out_kvec_left) {
1566 ret = write_partial_kvec(con);
1567 if (ret <= 0)
1568 goto done;
1569 }
1570
1571 /* msg pages? */
1572 if (con->out_msg) {
1573 if (con->out_msg_done) {
1574 ceph_msg_put(con->out_msg);
1575 con->out_msg = NULL; /* we're done with this one */
1576 goto do_next;
1577 }
1578
1579 ret = write_partial_msg_pages(con);
1580 if (ret == 1)
1581 goto more_kvec; /* we need to send the footer, too! */
1582 if (ret == 0)
1583 goto done;
1584 if (ret < 0) {
1585 dout("try_write write_partial_msg_pages err %d\n",
1586 ret);
1587 goto done;
1588 }
1589 }
1590
1591do_next:
1592 if (!test_bit(CONNECTING, &con->state)) {
1593 /* is anything else pending? */
1594 if (!list_empty(&con->out_queue)) {
1595 prepare_write_message(con);
1596 goto more;
1597 }
1598 if (con->in_seq > con->in_seq_acked) {
1599 prepare_write_ack(con);
1600 goto more;
1601 }
1602 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1603 prepare_write_keepalive(con);
1604 goto more;
1605 }
1606 }
1607
1608 /* Nothing to do! */
1609 clear_bit(WRITE_PENDING, &con->state);
1610 dout("try_write nothing else to write.\n");
1611done:
1612 ret = 0;
1613out:
1614 mutex_unlock(&con->mutex);
1615 dout("try_write done on %p\n", con);
1616 return ret;
1617}
1618
1619
1620
1621/*
1622 * Read what we can from the socket.
1623 */
1624static int try_read(struct ceph_connection *con)
1625{
1626 struct ceph_messenger *msgr;
1627 int ret = -1;
1628
1629 if (!con->sock)
1630 return 0;
1631
1632 if (test_bit(STANDBY, &con->state))
1633 return 0;
1634
1635 dout("try_read start on %p\n", con);
1636 msgr = con->msgr;
1637
1638 mutex_lock(&con->mutex);
1639
1640more:
1641 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1642 con->in_base_pos);
1643 if (test_bit(CONNECTING, &con->state)) {
1644 if (!test_bit(NEGOTIATING, &con->state)) {
1645 dout("try_read connecting\n");
1646 ret = read_partial_banner(con);
1647 if (ret <= 0)
1648 goto done;
1649 if (process_banner(con) < 0) {
1650 ret = -1;
1651 goto out;
1652 }
1653 }
1654 ret = read_partial_connect(con);
1655 if (ret <= 0)
1656 goto done;
1657 if (process_connect(con) < 0) {
1658 ret = -1;
1659 goto out;
1660 }
1661 goto more;
1662 }
1663
1664 if (con->in_base_pos < 0) {
1665 /*
1666 * skipping + discarding content.
1667 *
1668 * FIXME: there must be a better way to do this!
1669 */
1670 static char buf[1024];
1671 int skip = min(1024, -con->in_base_pos);
1672 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1673 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1674 if (ret <= 0)
1675 goto done;
1676 con->in_base_pos += ret;
1677 if (con->in_base_pos)
1678 goto more;
1679 }
1680 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1681 /*
1682 * what's next?
1683 */
1684 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1685 if (ret <= 0)
1686 goto done;
1687 dout("try_read got tag %d\n", (int)con->in_tag);
1688 switch (con->in_tag) {
1689 case CEPH_MSGR_TAG_MSG:
1690 prepare_read_message(con);
1691 break;
1692 case CEPH_MSGR_TAG_ACK:
1693 prepare_read_ack(con);
1694 break;
1695 case CEPH_MSGR_TAG_CLOSE:
1696 set_bit(CLOSED, &con->state); /* fixme */
1697 goto done;
1698 default:
1699 goto bad_tag;
1700 }
1701 }
1702 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1703 ret = read_partial_message(con);
1704 if (ret <= 0) {
1705 switch (ret) {
1706 case -EBADMSG:
1707 con->error_msg = "bad crc";
1708 ret = -EIO;
1709 goto out;
1710 case -EIO:
1711 con->error_msg = "io error";
1712 goto out;
1713 default:
1714 goto done;
1715 }
1716 }
1717 if (con->in_tag == CEPH_MSGR_TAG_READY)
1718 goto more;
1719 process_message(con);
1720 goto more;
1721 }
1722 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1723 ret = read_partial_ack(con);
1724 if (ret <= 0)
1725 goto done;
1726 process_ack(con);
1727 goto more;
1728 }
1729
1730done:
1731 ret = 0;
1732out:
1733 mutex_unlock(&con->mutex);
1734 dout("try_read done on %p\n", con);
1735 return ret;
1736
1737bad_tag:
1738 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1739 con->error_msg = "protocol error, garbage tag";
1740 ret = -1;
1741 goto out;
1742}
1743
1744
1745/*
1746 * Atomically queue work on a connection. Bump @con reference to
1747 * avoid races with connection teardown.
1748 *
1749 * There is some trickery going on with QUEUED and BUSY because we
1750 * only want a _single_ thread operating on each connection at any
1751 * point in time, but we want to use all available CPUs.
1752 *
1753 * The worker thread only proceeds if it can atomically set BUSY. It
1754 * clears QUEUED and does its thing. When it thinks it's done, it
1755 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
1756 * (tries again to set BUSY).
1757 *
1758 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1759 * try to queue work. If that fails (work is already queued, or BUSY)
1760 * we give up (the work is already being done or queued) but leave QUEUED
1761 * set so that the worker thread will loop if necessary.
1762 */
1763static void queue_con(struct ceph_connection *con)
1764{
1765 if (test_bit(DEAD, &con->state)) {
1766 dout("queue_con %p ignoring: DEAD\n",
1767 con);
1768 return;
1769 }
1770
1771 if (!con->ops->get(con)) {
1772 dout("queue_con %p ref count 0\n", con);
1773 return;
1774 }
1775
1776 set_bit(QUEUED, &con->state);
1777 if (test_bit(BUSY, &con->state)) {
1778 dout("queue_con %p - already BUSY\n", con);
1779 con->ops->put(con);
1780 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1781 dout("queue_con %p - already queued\n", con);
1782 con->ops->put(con);
1783 } else {
1784 dout("queue_con %p\n", con);
1785 }
1786}
1787
1788/*
1789 * Do some work on a connection. Drop a connection ref when we're done.
1790 */
1791static void con_work(struct work_struct *work)
1792{
1793 struct ceph_connection *con = container_of(work, struct ceph_connection,
1794 work.work);
1795 int backoff = 0;
1796
1797more:
1798 if (test_and_set_bit(BUSY, &con->state) != 0) {
1799 dout("con_work %p BUSY already set\n", con);
1800 goto out;
1801 }
1802 dout("con_work %p start, clearing QUEUED\n", con);
1803 clear_bit(QUEUED, &con->state);
1804
1805 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1806 dout("con_work CLOSED\n");
1807 con_close_socket(con);
1808 goto done;
1809 }
1810 if (test_and_clear_bit(OPENING, &con->state)) {
1811 /* reopen w/ new peer */
1812 dout("con_work OPENING\n");
1813 con_close_socket(con);
1814 }
1815
1816 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1817 try_read(con) < 0 ||
1818 try_write(con) < 0) {
1819 backoff = 1;
1820 ceph_fault(con); /* error/fault path */
1821 }
1822
1823done:
1824 clear_bit(BUSY, &con->state);
1825 dout("con->state=%lu\n", con->state);
1826 if (test_bit(QUEUED, &con->state)) {
1827 if (!backoff || test_bit(OPENING, &con->state)) {
1828 dout("con_work %p QUEUED reset, looping\n", con);
1829 goto more;
1830 }
1831 dout("con_work %p QUEUED reset, but just faulted\n", con);
1832 clear_bit(QUEUED, &con->state);
1833 }
1834 dout("con_work %p done\n", con);
1835
1836out:
1837 con->ops->put(con);
1838}
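Editor's sketch, not part of the original file: the QUEUED/BUSY handshake
shared by queue_con() and con_work() above, stripped of the connection-state
and reference-counting details, to make the bit protocol easier to follow.

	static void worker_pass_sketch(struct ceph_connection *con)
	{
	more:
		if (test_and_set_bit(BUSY, &con->state))
			return;		/* another worker already owns this con */
		clear_bit(QUEUED, &con->state);

		/* ... one round of try_read()/try_write() ... */

		clear_bit(BUSY, &con->state);
		if (test_bit(QUEUED, &con->state))
			goto more;	/* queue_con() ran while we were BUSY */
	}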
1839
1840
1841/*
1842 * Generic error/fault handler. A retry mechanism is used with
1843 * exponential backoff
1844 */
1845static void ceph_fault(struct ceph_connection *con)
1846{
1847 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1848 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1849 dout("fault %p state %lu to peer %s\n",
1850 con, con->state, pr_addr(&con->peer_addr.in_addr));
1851
1852 if (test_bit(LOSSYTX, &con->state)) {
1853 dout("fault on LOSSYTX channel\n");
1854 goto out;
1855 }
1856
1857 mutex_lock(&con->mutex);
1858 if (test_bit(CLOSED, &con->state))
1859 goto out_unlock;
1860
1861 con_close_socket(con);
1862
1863 if (con->in_msg) {
1864 ceph_msg_put(con->in_msg);
1865 con->in_msg = NULL;
1866 }
1867
1868 /* Requeue anything that hasn't been acked */
1869 list_splice_init(&con->out_sent, &con->out_queue);
1870
1871 /* If there are no messages in the queue, place the connection
1872 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1873 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1874 dout("fault setting STANDBY\n");
1875 set_bit(STANDBY, &con->state);
1876 } else {
1877 /* retry after a delay. */
1878 if (con->delay == 0)
1879 con->delay = BASE_DELAY_INTERVAL;
1880 else if (con->delay < MAX_DELAY_INTERVAL)
1881 con->delay *= 2;
1882 dout("fault queueing %p delay %lu\n", con, con->delay);
1883 con->ops->get(con);
1884 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1885 round_jiffies_relative(con->delay)) == 0)
1886 con->ops->put(con);
1887 }
1888
1889out_unlock:
1890 mutex_unlock(&con->mutex);
1891out:
1892 /*
1893 * in case we faulted due to authentication, invalidate our
1894 * current tickets so that we can get new ones.
1895 */
1896 if (con->auth_retry && con->ops->invalidate_authorizer) {
1897 dout("calling invalidate_authorizer()\n");
1898 con->ops->invalidate_authorizer(con);
1899 }
1900
1901 if (con->ops->fault)
1902 con->ops->fault(con);
1903}
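/*
 * [Editor's note -- not part of the original file.]  With the defaults from
 * messenger.h (BASE_DELAY_INTERVAL = HZ/2, MAX_DELAY_INTERVAL = 5*60*HZ),
 * the retry delays above run 0.5s, 1s, 2s, 4s, ... 256s, 512s and then stop
 * growing: con->delay is only doubled while it is still below the 300s
 * maximum, so the final doubling overshoots the cap once.
 */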
1904
1905
1906
1907/*
1908 * create a new messenger instance
1909 */
1910struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1911{
1912 struct ceph_messenger *msgr;
1913
1914 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1915 if (msgr == NULL)
1916 return ERR_PTR(-ENOMEM);
1917
1918 spin_lock_init(&msgr->global_seq_lock);
1919
1920 /* the zero page is needed if a request is "canceled" while the message
1921 * is being written over the socket */
1922 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1923 if (!msgr->zero_page) {
1924 kfree(msgr);
1925 return ERR_PTR(-ENOMEM);
1926 }
1927 kmap(msgr->zero_page);
1928
1929 if (myaddr)
1930 msgr->inst.addr = *myaddr;
1931
1932 /* select a random nonce */
1933 msgr->inst.addr.type = 0;
1934 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1935 encode_my_addr(msgr);
1936
1937 dout("messenger_create %p\n", msgr);
1938 return msgr;
1939}
1940
1941void ceph_messenger_destroy(struct ceph_messenger *msgr)
1942{
1943 dout("destroy %p\n", msgr);
1944 kunmap(msgr->zero_page);
1945 __free_page(msgr->zero_page);
1946 kfree(msgr);
1947 dout("destroyed messenger %p\n", msgr);
1948}
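Editor's addition, not part of the original file: a minimal setup sketch
showing how a messenger and a connection fit together.  Error handling is
trimmed, the helper name is hypothetical, and the ops table and peer address
are assumed to be supplied by the caller.

	static struct ceph_messenger *messenger_setup_sketch(
			struct ceph_connection *con,
			const struct ceph_connection_operations *ops,
			struct ceph_entity_addr *peer)
	{
		struct ceph_messenger *msgr;

		msgr = ceph_messenger_create(NULL);	/* NULL: learn our addr from the peer */
		if (IS_ERR(msgr))
			return msgr;

		ceph_con_init(msgr, con);	/* zeroes *con and takes the initial ref */
		con->ops = ops;			/* set after init, which memsets *con */
		ceph_con_open(con, peer);	/* start connecting; work runs off ceph_msgr_wq */
		return msgr;
	}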
1949
1950/*
1951 * Queue up an outgoing message on the given connection.
1952 */
1953void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1954{
1955 if (test_bit(CLOSED, &con->state)) {
1956 dout("con_send %p closed, dropping %p\n", con, msg);
1957 ceph_msg_put(msg);
1958 return;
1959 }
1960
1961 /* set src+dst */
1962 msg->hdr.src.name = con->msgr->inst.name;
1963 msg->hdr.src.addr = con->msgr->my_enc_addr;
1964 msg->hdr.orig_src = msg->hdr.src;
1965
1966 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1967
1968 /* queue */
1969 mutex_lock(&con->mutex);
1970 BUG_ON(!list_empty(&msg->list_head));
1971 list_add_tail(&msg->list_head, &con->out_queue);
1972 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
1973 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
1974 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1975 le32_to_cpu(msg->hdr.front_len),
1976 le32_to_cpu(msg->hdr.middle_len),
1977 le32_to_cpu(msg->hdr.data_len));
1978 mutex_unlock(&con->mutex);
1979
1980 /* if there wasn't anything waiting to send before, queue
1981 * new work */
1982 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
1983 queue_con(con);
1984}
1985
1986/*
1987 * Revoke a message that was previously queued for send
1988 */
1989void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
1990{
1991 mutex_lock(&con->mutex);
1992 if (!list_empty(&msg->list_head)) {
1993 dout("con_revoke %p msg %p\n", con, msg);
1994 list_del_init(&msg->list_head);
1995 ceph_msg_put(msg);
1996 msg->hdr.seq = 0;
1997 if (con->out_msg == msg) {
1998 ceph_msg_put(con->out_msg);
1999 con->out_msg = NULL;
2000 }
2001 if (con->out_kvec_is_msg) {
2002 con->out_skip = con->out_kvec_bytes;
2003 con->out_kvec_is_msg = false;
2004 }
2005 } else {
2006 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
2007 }
2008 mutex_unlock(&con->mutex);
2009}
2010
2011/*
2012 * Revoke a message that we may be reading data into
2013 */
2014void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2015{
2016 mutex_lock(&con->mutex);
2017 if (con->in_msg && con->in_msg == msg) {
2018 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2019 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2020 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2021
2022 /* skip rest of message */
2023 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2024 con->in_base_pos = con->in_base_pos -
2025 sizeof(struct ceph_msg_header) -
2026 front_len -
2027 middle_len -
2028 data_len -
2029 sizeof(struct ceph_msg_footer);
2030 ceph_msg_put(con->in_msg);
2031 con->in_msg = NULL;
2032 con->in_tag = CEPH_MSGR_TAG_READY;
2033 } else {
2034 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2035 con, con->in_msg, msg);
2036 }
2037 mutex_unlock(&con->mutex);
2038}
2039
2040/*
2041 * Queue a keepalive byte to ensure the tcp connection is alive.
2042 */
2043void ceph_con_keepalive(struct ceph_connection *con)
2044{
2045 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2046 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2047 queue_con(con);
2048}
2049
2050
2051/*
2052 * construct a new message with given type, size
2053 * the new msg has a ref count of 1.
2054 */
2055struct ceph_msg *ceph_msg_new(int type, int front_len,
2056 int page_len, int page_off, struct page **pages)
2057{
2058 struct ceph_msg *m;
2059
2060 m = kmalloc(sizeof(*m), GFP_NOFS);
2061 if (m == NULL)
2062 goto out;
2063 kref_init(&m->kref);
2064 INIT_LIST_HEAD(&m->list_head);
2065
2066 m->hdr.type = cpu_to_le16(type);
2067 m->hdr.front_len = cpu_to_le32(front_len);
2068 m->hdr.middle_len = 0;
2069 m->hdr.data_len = cpu_to_le32(page_len);
2070 m->hdr.data_off = cpu_to_le16(page_off);
2071 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2072 m->footer.front_crc = 0;
2073 m->footer.middle_crc = 0;
2074 m->footer.data_crc = 0;
2075 m->front_max = front_len;
2076 m->front_is_vmalloc = false;
2077 m->more_to_follow = false;
2078 m->pool = NULL;
2079
2080 /* front */
2081 if (front_len) {
2082 if (front_len > PAGE_CACHE_SIZE) {
2083 m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
2084 PAGE_KERNEL);
2085 m->front_is_vmalloc = true;
2086 } else {
2087 m->front.iov_base = kmalloc(front_len, GFP_NOFS);
2088 }
2089 if (m->front.iov_base == NULL) {
2090 pr_err("msg_new can't allocate %d bytes\n",
2091 front_len);
2092 goto out2;
2093 }
2094 } else {
2095 m->front.iov_base = NULL;
2096 }
2097 m->front.iov_len = front_len;
2098
2099 /* middle */
2100 m->middle = NULL;
2101
2102 /* data */
2103 m->nr_pages = calc_pages_for(page_off, page_len);
2104 m->pages = pages;
2105 m->pagelist = NULL;
2106
2107 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
2108 m->nr_pages);
2109 return m;
2110
2111out2:
2112 ceph_msg_put(m);
2113out:
2114 pr_err("msg_new can't create type %d len %d\n", type, front_len);
2115 return ERR_PTR(-ENOMEM);
2116}
2117
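Editor's addition, not part of the original file: a small sketch of the send
path, combining ceph_msg_new() above with ceph_con_send().  The helper name
and the message type passed in are hypothetical placeholders.

	static int send_small_msg_sketch(struct ceph_connection *con, int type)
	{
		struct ceph_msg *msg;

		/* 128-byte front, no page data */
		msg = ceph_msg_new(type, 128, 0, 0, NULL);
		if (IS_ERR(msg))
			return PTR_ERR(msg);

		/* fill in the front payload before queueing */
		memset(msg->front.iov_base, 0, msg->front.iov_len);

		/* the messenger takes over our reference to msg */
		ceph_con_send(con, msg);
		return 0;
	}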
2118/*
2119 * Allocate "middle" portion of a message, if it is needed and wasn't
2120 * allocated by alloc_msg. This allows us to read a small fixed-size
2121 * per-type header in the front and then gracefully fail (i.e.,
2122 * propagate the error to the caller based on info in the front) when
2123 * the middle is too large.
2124 */
2125static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2126{
2127 int type = le16_to_cpu(msg->hdr.type);
2128 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2129
2130 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2131 ceph_msg_type_name(type), middle_len);
2132 BUG_ON(!middle_len);
2133 BUG_ON(msg->middle);
2134
2135 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2136 if (!msg->middle)
2137 return -ENOMEM;
2138 return 0;
2139}
2140
2141/*
2142 * Generic message allocator, for incoming messages.
2143 */
2144static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2145 struct ceph_msg_header *hdr,
2146 int *skip)
2147{
2148 int type = le16_to_cpu(hdr->type);
2149 int front_len = le32_to_cpu(hdr->front_len);
2150 int middle_len = le32_to_cpu(hdr->middle_len);
2151 struct ceph_msg *msg = NULL;
2152 int ret;
2153
2154 if (con->ops->alloc_msg) {
2155 mutex_unlock(&con->mutex);
2156 msg = con->ops->alloc_msg(con, hdr, skip);
2157 mutex_lock(&con->mutex);
2158 if (IS_ERR(msg))
2159 return msg;
2160
2161 if (*skip)
2162 return NULL;
2163 }
2164 if (!msg) {
2165 *skip = 0;
2166 msg = ceph_msg_new(type, front_len, 0, 0, NULL);
2167 if (!msg) {
2168 pr_err("unable to allocate msg type %d len %d\n",
2169 type, front_len);
2170 return ERR_PTR(-ENOMEM);
2171 }
2172 }
2173 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2174
2175 if (middle_len) {
2176 ret = ceph_alloc_middle(con, msg);
2177
2178 if (ret < 0) {
2179 ceph_msg_put(msg);
2180 return ERR_PTR(ret);
2181 }
2182 }
2183
2184 return msg;
2185}
2186
2187
2188/*
2189 * Free a generically kmalloc'd message.
2190 */
2191void ceph_msg_kfree(struct ceph_msg *m)
2192{
2193 dout("msg_kfree %p\n", m);
2194 if (m->front_is_vmalloc)
2195 vfree(m->front.iov_base);
2196 else
2197 kfree(m->front.iov_base);
2198 kfree(m);
2199}
2200
2201/*
2202 * Drop a msg ref. Destroy as needed.
2203 */
2204void ceph_msg_last_put(struct kref *kref)
2205{
2206 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2207
2208 dout("ceph_msg_put last one on %p\n", m);
2209 WARN_ON(!list_empty(&m->list_head));
2210
2211 /* drop middle, data, if any */
2212 if (m->middle) {
2213 ceph_buffer_put(m->middle);
2214 m->middle = NULL;
2215 }
2216 m->nr_pages = 0;
2217 m->pages = NULL;
2218
2219 if (m->pagelist) {
2220 ceph_pagelist_release(m->pagelist);
2221 kfree(m->pagelist);
2222 m->pagelist = NULL;
2223 }
2224
2225 if (m->pool)
2226 ceph_msgpool_put(m->pool, m);
2227 else
2228 ceph_msg_kfree(m);
2229}
2230
2231void ceph_msg_dump(struct ceph_msg *msg)
2232{
2233 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2234 msg->front_max, msg->nr_pages);
2235 print_hex_dump(KERN_DEBUG, "header: ",
2236 DUMP_PREFIX_OFFSET, 16, 1,
2237 &msg->hdr, sizeof(msg->hdr), true);
2238 print_hex_dump(KERN_DEBUG, " front: ",
2239 DUMP_PREFIX_OFFSET, 16, 1,
2240 msg->front.iov_base, msg->front.iov_len, true);
2241 if (msg->middle)
2242 print_hex_dump(KERN_DEBUG, "middle: ",
2243 DUMP_PREFIX_OFFSET, 16, 1,
2244 msg->middle->vec.iov_base,
2245 msg->middle->vec.iov_len, true);
2246 print_hex_dump(KERN_DEBUG, "footer: ",
2247 DUMP_PREFIX_OFFSET, 16, 1,
2248 &msg->footer, sizeof(msg->footer), true);
2249}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..a343dae73cdc
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,255 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%lld */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
56
57struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */
59 struct ceph_entity_addr my_enc_addr;
60 struct page *zero_page; /* used in certain error cases */
61
62 bool nocrc;
63
64 /*
65 * the global_seq counts connections I (attempt to) initiate
66 * in order to disambiguate certain connect race conditions.
67 */
68 u32 global_seq;
69 spinlock_t global_seq_lock;
70};
71
72/*
73 * a single message. it contains a header (src, dest, message type, etc.),
74 * footer (crc values, mainly), a "front" message body, and possibly a
75 * data payload (stored in some number of pages).
76 */
77struct ceph_msg {
78 struct ceph_msg_header hdr; /* header */
79 struct ceph_msg_footer footer; /* footer */
80 struct kvec front; /* unaligned blobs of message */
81 struct ceph_buffer *middle;
82 struct page **pages; /* data payload. NOT OWNER. */
83 unsigned nr_pages; /* size of page array */
84 struct ceph_pagelist *pagelist; /* instead of pages */
85 struct list_head list_head;
86 struct kref kref;
87 bool front_is_vmalloc;
88 bool more_to_follow;
89 int front_max;
90
91 struct ceph_msgpool *pool;
92};
93
94struct ceph_msg_pos {
95 int page, page_pos; /* which page; offset in page */
96 int data_pos; /* offset in data payload */
97 int did_page_crc; /* true if we've calculated crc for current page */
98};
99
100/* ceph connection fault delay defaults, for exponential backoff */
101#define BASE_DELAY_INTERVAL (HZ/2)
102#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
103
104/*
105 * ceph_connection state bit flags
106 *
107 * QUEUED and BUSY are used together to ensure that only a single
108 * thread is currently opening, reading or writing data to the socket.
109 */
110#define LOSSYTX 0 /* we can close channel or drop messages on errors */
111#define CONNECTING 1
112#define NEGOTIATING 2
113#define KEEPALIVE_PENDING 3
114#define WRITE_PENDING 4 /* we have data ready to send */
115#define QUEUED 5 /* there is work queued on this connection */
116#define BUSY 6 /* work is being done */
117#define STANDBY 8 /* no outgoing messages, socket closed. we keep
118 * the ceph_connection around to maintain shared
119 * state with the peer. */
120#define CLOSED 10 /* we've closed the connection */
121#define SOCK_CLOSED 11 /* socket state changed to closed */
122#define OPENING 13 /* open connection w/ (possibly new) peer */
123#define DEAD 14 /* dead, about to kfree */
124
125/*
126 * A single connection with another host.
127 *
128 * We maintain a queue of outgoing messages, and some session state to
129 * ensure that we can preserve the lossless, ordered delivery of
130 * messages in the case of a TCP disconnect.
131 */
132struct ceph_connection {
133 void *private;
134 atomic_t nref;
135
136 const struct ceph_connection_operations *ops;
137
138 struct ceph_messenger *msgr;
139 struct socket *sock;
140 unsigned long state; /* connection state (see flags above) */
141 const char *error_msg; /* error message, if any */
142
143 struct ceph_entity_addr peer_addr; /* peer address */
144 struct ceph_entity_name peer_name; /* peer name */
145 struct ceph_entity_addr peer_addr_for_me;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection, client */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 u64 out_seq_sent; /* last message sent */
161 bool out_keepalive_pending;
162
163 u64 in_seq, in_seq_acked; /* last message received, acked */
164
165 /* connection negotiation temps */
166 char in_banner[CEPH_BANNER_MAX_LEN];
167 union {
168 struct { /* outgoing connection */
169 struct ceph_msg_connect out_connect;
170 struct ceph_msg_connect_reply in_reply;
171 };
172 struct { /* incoming */
173 struct ceph_msg_connect in_connect;
174 struct ceph_msg_connect_reply out_reply;
175 };
176 };
177 struct ceph_entity_addr actual_peer_addr;
178
179 /* message out temps */
180 struct ceph_msg *out_msg; /* sending message (== tail of
181 out_sent) */
182 bool out_msg_done;
183 struct ceph_msg_pos out_msg_pos;
184
185 struct kvec out_kvec[8], /* sending header/footer data */
186 *out_kvec_cur;
187 int out_kvec_left; /* kvec's left in out_kvec */
188 int out_skip; /* skip this many bytes */
189 int out_kvec_bytes; /* total bytes left */
190 bool out_kvec_is_msg; /* kvec refers to out_msg */
191 int out_more; /* there is more data after the kvecs */
192 __le64 out_temp_ack; /* for writing an ack */
193
194 /* message in temps */
195 struct ceph_msg_header in_hdr;
196 struct ceph_msg *in_msg;
197 struct ceph_msg_pos in_msg_pos;
198 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
199
200 char in_tag; /* protocol control byte */
201 int in_base_pos; /* bytes read */
202 __le64 in_temp_ack; /* for reading an ack */
203
204 struct delayed_work work; /* send|recv work */
205 unsigned long delay; /* current delay interval */
206};
207
208
209extern const char *pr_addr(const struct sockaddr_storage *ss);
210extern int ceph_parse_ips(const char *c, const char *end,
211 struct ceph_entity_addr *addr,
212 int max_count, int *count);
213
214
215extern int ceph_msgr_init(void);
216extern void ceph_msgr_exit(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern bool ceph_con_opened(struct ceph_connection *con);
227extern void ceph_con_close(struct ceph_connection *con);
228extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
230extern void ceph_con_revoke_message(struct ceph_connection *con,
231 struct ceph_msg *msg);
232extern void ceph_con_keepalive(struct ceph_connection *con);
233extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
234extern void ceph_con_put(struct ceph_connection *con);
235
236extern struct ceph_msg *ceph_msg_new(int type, int front_len,
237 int page_len, int page_off,
238 struct page **pages);
239extern void ceph_msg_kfree(struct ceph_msg *m);
240
241
242static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
243{
244 kref_get(&msg->kref);
245 return msg;
246}
247extern void ceph_msg_last_put(struct kref *kref);
248static inline void ceph_msg_put(struct ceph_msg *msg)
249{
250 kref_put(&msg->kref, ceph_msg_last_put);
251}
252
253extern void ceph_msg_dump(struct ceph_msg *msg);
254
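/*
 * Usage sketch (illustrative only, mirroring how mon_client.c and
 * osd_client.c drive this interface; my_state, my_con_ops and
 * peer_addr are hypothetical placeholders).  Note that ceph_con_send()
 * consumes a message reference, so callers that want to keep the
 * message take an extra ref with ceph_msg_get() first.
 *
 *	struct ceph_connection con;
 *
 *	ceph_con_init(msgr, &con);
 *	con.private = my_state;
 *	con.ops = &my_con_ops;
 *	ceph_con_open(&con, &peer_addr);
 *	ceph_con_send(&con, msg);
 *	...
 *	ceph_con_close(&con);
 */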
255#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..8fdc011ca956
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,835 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/slab.h>
5#include <linux/random.h>
6#include <linux/sched.h>
7
8#include "mon_client.h"
9#include "super.h"
10#include "auth.h"
11#include "decode.h"
12
13/*
14 * Interact with Ceph monitor cluster. Handle requests for new map
15 * versions, and periodically resend as needed. Also implement
16 * statfs() and umount().
17 *
18 * A small cluster of Ceph "monitors" is responsible for managing critical
19 * cluster configuration and state information. An odd number (e.g., 3, 5)
20 * of cmon daemons use a modified version of the Paxos part-time parliament
21 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
22 * list of clients who have mounted the file system.
23 *
24 * We maintain an open, active session with a monitor at all times in order to
25 * receive timely MDSMap updates. We periodically send a keepalive byte on the
26 * TCP socket to ensure we detect a failure. If the connection does break, we
27 * randomly hunt for a new monitor. Once the connection is reestablished, we
28 * resend any outstanding requests.
29 */
30
31static const struct ceph_connection_operations mon_con_ops;
32
33static int __validate_auth(struct ceph_mon_client *monc);
34
35/*
36 * Decode a monmap blob (e.g., during mount).
37 */
38struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
39{
40 struct ceph_monmap *m = NULL;
41 int i, err = -EINVAL;
42 struct ceph_fsid fsid;
43 u32 epoch, num_mon;
44 u16 version;
45 u32 len;
46
47 ceph_decode_32_safe(&p, end, len, bad);
48 ceph_decode_need(&p, end, len, bad);
49
50 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
51
52 ceph_decode_16_safe(&p, end, version, bad);
53
54 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
55 ceph_decode_copy(&p, &fsid, sizeof(fsid));
56 epoch = ceph_decode_32(&p);
57
58 num_mon = ceph_decode_32(&p);
59 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
60
61 if (num_mon >= CEPH_MAX_MON)
62 goto bad;
63 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
64 if (m == NULL)
65 return ERR_PTR(-ENOMEM);
66 m->fsid = fsid;
67 m->epoch = epoch;
68 m->num_mon = num_mon;
69 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
70 for (i = 0; i < num_mon; i++)
71 ceph_decode_addr(&m->mon_inst[i].addr);
72
73 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
74 m->num_mon);
75 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m;
79
80bad:
81 dout("monmap_decode failed with %d\n", err);
82 kfree(m);
83 return ERR_PTR(err);
84}
85
86/*
87 * return true if *addr is included in the monmap.
88 */
89int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
90{
91 int i;
92
93 for (i = 0; i < m->num_mon; i++)
94 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
95 return 1;
96 return 0;
97}
98
99/*
100 * Send an auth request.
101 */
102static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
103{
104 monc->pending_auth = 1;
105 monc->m_auth->front.iov_len = len;
106 monc->m_auth->hdr.front_len = cpu_to_le32(len);
107 ceph_msg_get(monc->m_auth); /* keep our ref */
108 ceph_con_send(monc->con, monc->m_auth);
109}
110
111/*
112 * Close monitor session, if any.
113 */
114static void __close_session(struct ceph_mon_client *monc)
115{
116 if (monc->con) {
117 dout("__close_session closing mon%d\n", monc->cur_mon);
118 ceph_con_revoke(monc->con, monc->m_auth);
119 ceph_con_close(monc->con);
120 monc->cur_mon = -1;
121 monc->pending_auth = 0;
122 ceph_auth_reset(monc->auth);
123 }
124}
125
126/*
127 * Open a session with a (new) monitor.
128 */
129static int __open_session(struct ceph_mon_client *monc)
130{
131	u8 r; /* unsigned, so the modulo below cannot go negative */
132 int ret;
133
134 if (monc->cur_mon < 0) {
135 get_random_bytes(&r, 1);
136 monc->cur_mon = r % monc->monmap->num_mon;
137 dout("open_session num=%d r=%d -> mon%d\n",
138 monc->monmap->num_mon, r, monc->cur_mon);
139 monc->sub_sent = 0;
140 monc->sub_renew_after = jiffies; /* i.e., expired */
141 monc->want_next_osdmap = !!monc->want_next_osdmap;
142
143 dout("open_session mon%d opening\n", monc->cur_mon);
144 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
145 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
146 ceph_con_open(monc->con,
147 &monc->monmap->mon_inst[monc->cur_mon].addr);
148
149		/* initiate authentication handshake */
150 ret = ceph_auth_build_hello(monc->auth,
151 monc->m_auth->front.iov_base,
152 monc->m_auth->front_max);
153 __send_prepared_auth_request(monc, ret);
154 } else {
155 dout("open_session mon%d already open\n", monc->cur_mon);
156 }
157 return 0;
158}
159
160static bool __sub_expired(struct ceph_mon_client *monc)
161{
162 return time_after_eq(jiffies, monc->sub_renew_after);
163}
164
165/*
166 * Reschedule delayed work timer.
167 */
168static void __schedule_delayed(struct ceph_mon_client *monc)
169{
170 unsigned delay;
171
172 if (monc->cur_mon < 0 || __sub_expired(monc))
173 delay = 10 * HZ;
174 else
175 delay = 20 * HZ;
176 dout("__schedule_delayed after %u\n", delay);
177 schedule_delayed_work(&monc->delayed_work, delay);
178}
179
180/*
181 * Send subscribe request for mdsmap and/or osdmap.
182 */
183static void __send_subscribe(struct ceph_mon_client *monc)
184{
185 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
186 (unsigned)monc->sub_sent, __sub_expired(monc),
187 monc->want_next_osdmap);
188 if ((__sub_expired(monc) && !monc->sub_sent) ||
189 monc->want_next_osdmap == 1) {
190 struct ceph_msg *msg;
191 struct ceph_mon_subscribe_item *i;
192 void *p, *end;
193
194 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
195		if (IS_ERR(msg))
196 return;
197
198 p = msg->front.iov_base;
199 end = p + msg->front.iov_len;
200
201 dout("__send_subscribe to 'mdsmap' %u+\n",
202 (unsigned)monc->have_mdsmap);
203 if (monc->want_next_osdmap) {
204 dout("__send_subscribe to 'osdmap' %u\n",
205 (unsigned)monc->have_osdmap);
206 ceph_encode_32(&p, 3);
207 ceph_encode_string(&p, end, "osdmap", 6);
208 i = p;
209 i->have = cpu_to_le64(monc->have_osdmap);
210 i->onetime = 1;
211 p += sizeof(*i);
212 monc->want_next_osdmap = 2; /* requested */
213 } else {
214 ceph_encode_32(&p, 2);
215 }
216 ceph_encode_string(&p, end, "mdsmap", 6);
217 i = p;
218 i->have = cpu_to_le64(monc->have_mdsmap);
219 i->onetime = 0;
220 p += sizeof(*i);
221 ceph_encode_string(&p, end, "monmap", 6);
222 i = p;
223 i->have = 0;
224 i->onetime = 0;
225 p += sizeof(*i);
226
227 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_send(monc->con, msg);
230
231 monc->sub_sent = jiffies | 1; /* never 0 */
232 }
233}
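/*
 * Wire layout sketch of the payload built above (informal, derived
 * from the encoding calls in __send_subscribe(), not a normative
 * spec):
 *
 *	u32  count                      2, or 3 when an osdmap is wanted
 *	then, per entry:
 *	  u32  len + chars              "osdmap" / "mdsmap" / "monmap"
 *	  struct ceph_mon_subscribe_item
 *	    le64 have                   epoch we already hold
 *	    u8   onetime                1 = one-shot ("osdmap"), else 0
 */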
234
235static void handle_subscribe_ack(struct ceph_mon_client *monc,
236 struct ceph_msg *msg)
237{
238 unsigned seconds;
239 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
240
241 if (msg->front.iov_len < sizeof(*h))
242 goto bad;
243 seconds = le32_to_cpu(h->duration);
244
245 mutex_lock(&monc->mutex);
246 if (monc->hunting) {
247 pr_info("mon%d %s session established\n",
248 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
249 monc->hunting = false;
250 }
251 dout("handle_subscribe_ack after %d seconds\n", seconds);
252 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
253 monc->sub_sent = 0;
254 mutex_unlock(&monc->mutex);
255 return;
256bad:
257 pr_err("got corrupt subscribe-ack msg\n");
258 ceph_msg_dump(msg);
259}
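/*
 * Renewal timing example: with the arithmetic above, a granted
 * duration of 60 seconds schedules the next subscribe roughly 30
 * seconds after the request was sent (sub_sent + (60 >> 1)*HZ - 1),
 * i.e. at half the lease, so renewal comfortably precedes expiry.
 */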
260
261/*
262 * Keep track of which maps we have
263 */
264int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
265{
266 mutex_lock(&monc->mutex);
267 monc->have_mdsmap = got;
268 mutex_unlock(&monc->mutex);
269 return 0;
270}
271
272int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
273{
274 mutex_lock(&monc->mutex);
275 monc->have_osdmap = got;
276 monc->want_next_osdmap = 0;
277 mutex_unlock(&monc->mutex);
278 return 0;
279}
280
281/*
282 * Register interest in the next osdmap
283 */
284void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
285{
286 dout("request_next_osdmap have %u\n", monc->have_osdmap);
287 mutex_lock(&monc->mutex);
288 if (!monc->want_next_osdmap)
289 monc->want_next_osdmap = 1;
290 if (monc->want_next_osdmap < 2)
291 __send_subscribe(monc);
292 mutex_unlock(&monc->mutex);
293}
294
295/*
296 * Open a session with a monitor, allocating the connection if needed.
297 */
298int ceph_monc_open_session(struct ceph_mon_client *monc)
299{
300 if (!monc->con) {
301 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
302 if (!monc->con)
303 return -ENOMEM;
304 ceph_con_init(monc->client->msgr, monc->con);
305 monc->con->private = monc;
306 monc->con->ops = &mon_con_ops;
307 }
308
309 mutex_lock(&monc->mutex);
310 __open_session(monc);
311 __schedule_delayed(monc);
312 mutex_unlock(&monc->mutex);
313 return 0;
314}
315
316/*
317 * The monitor responds with a mount ack to indicate mount success.  The
318 * included client ticket allows the client to talk to MDSs and OSDs.
319 */
320static void ceph_monc_handle_map(struct ceph_mon_client *monc,
321 struct ceph_msg *msg)
322{
323 struct ceph_client *client = monc->client;
324 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
325 void *p, *end;
326
327 mutex_lock(&monc->mutex);
328
329 dout("handle_monmap\n");
330 p = msg->front.iov_base;
331 end = p + msg->front.iov_len;
332
333 monmap = ceph_monmap_decode(p, end);
334 if (IS_ERR(monmap)) {
335 pr_err("problem decoding monmap, %d\n",
336 (int)PTR_ERR(monmap));
337 goto out;
338 }
339
340 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
341 kfree(monmap);
342 goto out;
343 }
344
345 client->monc.monmap = monmap;
346 kfree(old);
347
348out:
349 mutex_unlock(&monc->mutex);
350 wake_up(&client->auth_wq);
351}
352
353/*
354 * statfs
355 */
356static struct ceph_mon_statfs_request *__lookup_statfs(
357 struct ceph_mon_client *monc, u64 tid)
358{
359 struct ceph_mon_statfs_request *req;
360 struct rb_node *n = monc->statfs_request_tree.rb_node;
361
362 while (n) {
363 req = rb_entry(n, struct ceph_mon_statfs_request, node);
364 if (tid < req->tid)
365 n = n->rb_left;
366 else if (tid > req->tid)
367 n = n->rb_right;
368 else
369 return req;
370 }
371 return NULL;
372}
373
374static void __insert_statfs(struct ceph_mon_client *monc,
375 struct ceph_mon_statfs_request *new)
376{
377 struct rb_node **p = &monc->statfs_request_tree.rb_node;
378 struct rb_node *parent = NULL;
379 struct ceph_mon_statfs_request *req = NULL;
380
381 while (*p) {
382 parent = *p;
383 req = rb_entry(parent, struct ceph_mon_statfs_request, node);
384 if (new->tid < req->tid)
385 p = &(*p)->rb_left;
386 else if (new->tid > req->tid)
387 p = &(*p)->rb_right;
388 else
389 BUG();
390 }
391
392 rb_link_node(&new->node, parent, p);
393 rb_insert_color(&new->node, &monc->statfs_request_tree);
394}
395
396static void handle_statfs_reply(struct ceph_mon_client *monc,
397 struct ceph_msg *msg)
398{
399 struct ceph_mon_statfs_request *req;
400 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
401 u64 tid;
402
403 if (msg->front.iov_len != sizeof(*reply))
404 goto bad;
405 tid = le64_to_cpu(msg->hdr.tid);
406 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
407
408 mutex_lock(&monc->mutex);
409 req = __lookup_statfs(monc, tid);
410 if (req) {
411 *req->buf = reply->st;
412 req->result = 0;
413 }
414 mutex_unlock(&monc->mutex);
415 if (req)
416 complete(&req->completion);
417 return;
418
419bad:
420 pr_err("corrupt statfs reply, no tid\n");
421 ceph_msg_dump(msg);
422}
423
424/*
425 * (re)send a statfs request
426 */
427static int send_statfs(struct ceph_mon_client *monc,
428 struct ceph_mon_statfs_request *req)
429{
430 struct ceph_msg *msg;
431 struct ceph_mon_statfs *h;
432
433 dout("send_statfs tid %llu\n", req->tid);
434 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
435 if (IS_ERR(msg))
436 return PTR_ERR(msg);
437 req->request = msg;
438 msg->hdr.tid = cpu_to_le64(req->tid);
439 h = msg->front.iov_base;
440 h->monhdr.have_version = 0;
441 h->monhdr.session_mon = cpu_to_le16(-1);
442 h->monhdr.session_mon_tid = 0;
443 h->fsid = monc->monmap->fsid;
444 ceph_con_send(monc->con, msg);
445 return 0;
446}
447
448/*
449 * Do a synchronous statfs().
450 */
451int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
452{
453 struct ceph_mon_statfs_request req;
454 int err;
455
456 req.buf = buf;
457 init_completion(&req.completion);
458
459 /* allocate memory for reply */
460 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
461 if (err)
462 return err;
463
464 /* register request */
465 mutex_lock(&monc->mutex);
466 req.tid = ++monc->last_tid;
467 req.last_attempt = jiffies;
468 req.delay = BASE_DELAY_INTERVAL;
469 __insert_statfs(monc, &req);
470 monc->num_statfs_requests++;
471 mutex_unlock(&monc->mutex);
472
473 /* send request and wait */
474 err = send_statfs(monc, &req);
475 if (!err)
476 err = wait_for_completion_interruptible(&req.completion);
477
478 mutex_lock(&monc->mutex);
479 rb_erase(&req.node, &monc->statfs_request_tree);
480 monc->num_statfs_requests--;
481 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
482 mutex_unlock(&monc->mutex);
483
484 if (!err)
485 err = req.result;
486 return err;
487}
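/*
 * Caller sketch (illustrative; the real caller is the fs statfs path,
 * and the ceph_statfs fields come from ceph_fs.h):
 *
 *	struct ceph_statfs st;
 *	int err;
 *
 *	err = ceph_monc_do_statfs(&client->monc, &st);
 *	if (!err)
 *		use le64_to_cpu(st.kb), le64_to_cpu(st.kb_used), ...
 *
 * The call blocks until a reply arrives or the wait is interrupted.
 */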
488
489/*
490 * Resend pending statfs requests.
491 */
492static void __resend_statfs(struct ceph_mon_client *monc)
493{
494 struct ceph_mon_statfs_request *req;
495 struct rb_node *p;
496
497 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
498 req = rb_entry(p, struct ceph_mon_statfs_request, node);
499 send_statfs(monc, req);
500 }
501}
502
503/*
504 * Delayed work. If we haven't mounted yet, retry. Otherwise,
505 * renew/retry subscription as needed (in case it is timing out, or we
506 * got an ENOMEM). And keep the monitor connection alive.
507 */
508static void delayed_work(struct work_struct *work)
509{
510 struct ceph_mon_client *monc =
511 container_of(work, struct ceph_mon_client, delayed_work.work);
512
513 dout("monc delayed_work\n");
514 mutex_lock(&monc->mutex);
515 if (monc->hunting) {
516 __close_session(monc);
517 __open_session(monc); /* continue hunting */
518 } else {
519 ceph_con_keepalive(monc->con);
520
521 __validate_auth(monc);
522
523 if (monc->auth->ops->is_authenticated(monc->auth))
524 __send_subscribe(monc);
525 }
526 __schedule_delayed(monc);
527 mutex_unlock(&monc->mutex);
528}
529
530/*
531 * On startup, we build a temporary monmap populated with the IPs
532 * provided by mount(2).
533 */
534static int build_initial_monmap(struct ceph_mon_client *monc)
535{
536 struct ceph_mount_args *args = monc->client->mount_args;
537 struct ceph_entity_addr *mon_addr = args->mon_addr;
538 int num_mon = args->num_mon;
539 int i;
540
541 /* build initial monmap */
542 monc->monmap = kzalloc(sizeof(*monc->monmap) +
543 num_mon*sizeof(monc->monmap->mon_inst[0]),
544 GFP_KERNEL);
545 if (!monc->monmap)
546 return -ENOMEM;
547 for (i = 0; i < num_mon; i++) {
548 monc->monmap->mon_inst[i].addr = mon_addr[i];
549 monc->monmap->mon_inst[i].addr.nonce = 0;
550 monc->monmap->mon_inst[i].name.type =
551 CEPH_ENTITY_TYPE_MON;
552 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
553 }
554 monc->monmap->num_mon = num_mon;
555 monc->have_fsid = false;
556
557 /* release addr memory */
558 kfree(args->mon_addr);
559 args->mon_addr = NULL;
560 args->num_mon = 0;
561 return 0;
562}
563
564int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
565{
566 int err = 0;
567
568 dout("init\n");
569 memset(monc, 0, sizeof(*monc));
570 monc->client = cl;
571 monc->monmap = NULL;
572 mutex_init(&monc->mutex);
573
574 err = build_initial_monmap(monc);
575 if (err)
576 goto out;
577
578 monc->con = NULL;
579
580 /* authentication */
581 monc->auth = ceph_auth_init(cl->mount_args->name,
582 cl->mount_args->secret);
583 if (IS_ERR(monc->auth))
584 return PTR_ERR(monc->auth);
585 monc->auth->want_keys =
586 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
587 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
588
589 /* msg pools */
590 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
591 sizeof(struct ceph_mon_subscribe_ack), 1, false);
592 if (err < 0)
593 goto out_monmap;
594 err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
595 sizeof(struct ceph_mon_statfs_reply), 0, false);
596 if (err < 0)
597 goto out_pool1;
598 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
599 if (err < 0)
600 goto out_pool2;
601
602 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
603 monc->pending_auth = 0;
604 if (IS_ERR(monc->m_auth)) {
605 err = PTR_ERR(monc->m_auth);
606 monc->m_auth = NULL;
607 goto out_pool3;
608 }
609
610 monc->cur_mon = -1;
611 monc->hunting = true;
612 monc->sub_renew_after = jiffies;
613 monc->sub_sent = 0;
614
615 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
616 monc->statfs_request_tree = RB_ROOT;
617 monc->num_statfs_requests = 0;
618 monc->last_tid = 0;
619
620 monc->have_mdsmap = 0;
621 monc->have_osdmap = 0;
622 monc->want_next_osdmap = 1;
623 return 0;
624
625out_pool3:
626 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
627out_pool2:
628 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
629out_pool1:
630 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
631out_monmap:
632 kfree(monc->monmap);
633out:
634 return err;
635}
636
637void ceph_monc_stop(struct ceph_mon_client *monc)
638{
639 dout("stop\n");
640 cancel_delayed_work_sync(&monc->delayed_work);
641
642 mutex_lock(&monc->mutex);
643 __close_session(monc);
644 if (monc->con) {
645 monc->con->private = NULL;
646 monc->con->ops->put(monc->con);
647 monc->con = NULL;
648 }
649 mutex_unlock(&monc->mutex);
650
651 ceph_auth_destroy(monc->auth);
652
653 ceph_msg_put(monc->m_auth);
654 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
655 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
656 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
657
658 kfree(monc->monmap);
659}
660
661static void handle_auth_reply(struct ceph_mon_client *monc,
662 struct ceph_msg *msg)
663{
664 int ret;
665
666 mutex_lock(&monc->mutex);
667 monc->pending_auth = 0;
668 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
669 msg->front.iov_len,
670 monc->m_auth->front.iov_base,
671 monc->m_auth->front_max);
672 if (ret < 0) {
673 monc->client->auth_err = ret;
674 wake_up(&monc->client->auth_wq);
675 } else if (ret > 0) {
676 __send_prepared_auth_request(monc, ret);
677 } else if (monc->auth->ops->is_authenticated(monc->auth)) {
678 dout("authenticated, starting session\n");
679
680 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
681		monc->client->msgr->inst.name.num =
682			cpu_to_le64(monc->auth->global_id);
682
683 __send_subscribe(monc);
684 __resend_statfs(monc);
685 }
686 mutex_unlock(&monc->mutex);
687}
688
689static int __validate_auth(struct ceph_mon_client *monc)
690{
691 int ret;
692
693 if (monc->pending_auth)
694 return 0;
695
696 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
697 monc->m_auth->front_max);
698 if (ret <= 0)
699 return ret; /* either an error, or no need to authenticate */
700 __send_prepared_auth_request(monc, ret);
701 return 0;
702}
703
704int ceph_monc_validate_auth(struct ceph_mon_client *monc)
705{
706 int ret;
707
708 mutex_lock(&monc->mutex);
709 ret = __validate_auth(monc);
710 mutex_unlock(&monc->mutex);
711 return ret;
712}
713
714/*
715 * handle incoming message
716 */
717static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
718{
719 struct ceph_mon_client *monc = con->private;
720 int type = le16_to_cpu(msg->hdr.type);
721
722 if (!monc)
723 return;
724
725 switch (type) {
726 case CEPH_MSG_AUTH_REPLY:
727 handle_auth_reply(monc, msg);
728 break;
729
730 case CEPH_MSG_MON_SUBSCRIBE_ACK:
731 handle_subscribe_ack(monc, msg);
732 break;
733
734 case CEPH_MSG_STATFS_REPLY:
735 handle_statfs_reply(monc, msg);
736 break;
737
738 case CEPH_MSG_MON_MAP:
739 ceph_monc_handle_map(monc, msg);
740 break;
741
742 case CEPH_MSG_MDS_MAP:
743 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
744 break;
745
746 case CEPH_MSG_OSD_MAP:
747 ceph_osdc_handle_map(&monc->client->osdc, msg);
748 break;
749
750 default:
751 pr_err("received unknown message type %d %s\n", type,
752 ceph_msg_type_name(type));
753 }
754 ceph_msg_put(msg);
755}
756
757/*
758 * Allocate memory for incoming message
759 */
760static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
761 struct ceph_msg_header *hdr,
762 int *skip)
763{
764 struct ceph_mon_client *monc = con->private;
765 int type = le16_to_cpu(hdr->type);
766 int front_len = le32_to_cpu(hdr->front_len);
767 struct ceph_msg *m = NULL;
768
769 *skip = 0;
770
771 switch (type) {
772 case CEPH_MSG_MON_SUBSCRIBE_ACK:
773 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
774 break;
775 case CEPH_MSG_STATFS_REPLY:
776 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
777 break;
778 case CEPH_MSG_AUTH_REPLY:
779 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
780 break;
781 case CEPH_MSG_MON_MAP:
782 case CEPH_MSG_MDS_MAP:
783 case CEPH_MSG_OSD_MAP:
784 m = ceph_msg_new(type, front_len, 0, 0, NULL);
785 break;
786 }
787
788 if (!m) {
789 pr_info("alloc_msg unknown type %d\n", type);
790 *skip = 1;
791 }
792 return m;
793}
794
795/*
796 * If the monitor connection resets, pick a new monitor and resubmit
797 * any pending requests.
798 */
799static void mon_fault(struct ceph_connection *con)
800{
801 struct ceph_mon_client *monc = con->private;
802
803 if (!monc)
804 return;
805
806 dout("mon_fault\n");
807 mutex_lock(&monc->mutex);
808 if (!con->private)
809 goto out;
810
811 if (monc->con && !monc->hunting)
812 pr_info("mon%d %s session lost, "
813 "hunting for new mon\n", monc->cur_mon,
814 pr_addr(&monc->con->peer_addr.in_addr));
815
816 __close_session(monc);
817 if (!monc->hunting) {
818 /* start hunting */
819 monc->hunting = true;
820 __open_session(monc);
821 } else {
822 /* already hunting, let's wait a bit */
823 __schedule_delayed(monc);
824 }
825out:
826 mutex_unlock(&monc->mutex);
827}
828
829static const struct ceph_connection_operations mon_con_ops = {
830 .get = ceph_con_get,
831 .put = ceph_con_put,
832 .dispatch = dispatch,
833 .fault = mon_fault,
834 .alloc_msg = mon_alloc_msg,
835};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..b958ad5afa06
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/rbtree.h>
6
7#include "messenger.h"
8#include "msgpool.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_statfs_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * statfs() is done a bit differently because we need to get data back
44 * to the caller
45 */
46struct ceph_mon_statfs_request {
47 u64 tid;
48 struct rb_node node;
49 int result;
50 struct ceph_statfs *buf;
51 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */
54};
55
56struct ceph_mon_client {
57 struct ceph_client *client;
58 struct ceph_monmap *monmap;
59
60 struct mutex mutex;
61 struct delayed_work delayed_work;
62
63 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth;
65 int pending_auth;
66
67 bool hunting;
68	int cur_mon;                    /* last monitor I contacted */
69 unsigned long sub_sent, sub_renew_after;
70 struct ceph_connection *con;
71 bool have_fsid;
72
73 /* msg pools */
74 struct ceph_msgpool msgpool_subscribe_ack;
75 struct ceph_msgpool msgpool_statfs_reply;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid;
82
83 /* mds/osd map */
84 int want_next_osdmap; /* 1 = want, 2 = want+asked */
85 u32 have_osdmap, have_mdsmap;
86
87#ifdef CONFIG_DEBUG_FS
88 struct dentry *debugfs_file;
89#endif
90};
91
92extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
93extern int ceph_monmap_contains(struct ceph_monmap *m,
94 struct ceph_entity_addr *addr);
95
96extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
97extern void ceph_monc_stop(struct ceph_mon_client *monc);
98
99/*
100 * The model here is to indicate that we need a new map of at least
101 * epoch @want, and also call in when we receive a map. We will
102 * periodically rerequest the map from the monitor cluster until we
103 * get what we want.
104 */
105extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
106extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
107
108extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
109
110extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
111 struct ceph_statfs *buf);
112
113extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116
117
118
119#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..ca3b44a89f2d
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10/*
11 * We use msg pools to preallocate memory for messages we expect to
12 * receive over the wire, to avoid getting ourselves into OOM
13 * conditions at unexpected times.  We use a few different
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31
32
33/*
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{
38 struct ceph_msg *msg;
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61}
62
63int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking)
65{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs);
72 pool->num = 0;
73 pool->min = min;
74 pool->blocking = blocking;
75 init_waitqueue_head(&pool->wait);
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81}
82
83void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{
85 dout("msgpool_destroy %p\n", pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90}
91
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
93{
94 int ret;
95
96 spin_lock(&pool->lock);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1);
113
114 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 }
129
130 while (1) {
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163}
164
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{
167 spin_lock(&pool->lock);
168 if (pool->num < pool->min) {
169 /* reset msg front_len; user may have changed it */
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172
173 kref_set(&msg->kref, 1); /* retake a single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include "messenger.h"
5
6/*
7 * we use memory pools for preallocating messages we may receive, to
8 * avoid unexpected OOM conditions.
9 */
10struct ceph_msgpool {
11 spinlock_t lock;
12 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17};
18
19extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
26
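/*
 * Usage sketch (illustrative, mirroring how mon_client.c uses its
 * reply pools):
 *
 *	struct ceph_msgpool pool;
 *
 *	ceph_msgpool_init(&pool, front_len, min, blocking);
 *	...
 *	msg = ceph_msgpool_get(&pool, front_len);   may block or fail,
 *	                                            depending on 'blocking'
 *	...
 *	ceph_msgpool_put(&pool, msg);               refill pool or free
 *	...
 *	ceph_msgpool_destroy(&pool);
 */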
27#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..8aaab414f3f8
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * TCP connection banner.  It includes a protocol version and should be
21 * adjusted whenever the wire protocol changes.  Try to keep this string
22 * length constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
30 * Comparator returns a negative, zero, or positive value.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
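/*
 * Rollover example: ceph_seq_cmp(2, 0xfffffffe) evaluates to
 * (__s32)2 - (__s32)0xfffffffe = 2 - (-2) = 4 > 0, so a sequence
 * number that has just wrapped still compares as newer than one
 * issued shortly before the wrap.
 */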
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20
55
56#define CEPH_ENTITY_TYPE_ANY 0xFF
57
58extern const char *ceph_entity_type_name(int type);
59
60/*
61 * entity_addr -- network address
62 */
63struct ceph_entity_addr {
64 __le32 type;
65 __le32 nonce; /* unique id for process (e.g. pid) */
66 struct sockaddr_storage in_addr;
67} __attribute__ ((packed));
68
69struct ceph_entity_inst {
70 struct ceph_entity_name name;
71 struct ceph_entity_addr addr;
72} __attribute__ ((packed));
73
74
75/* used by message exchange protocol */
76#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
77#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
78#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
79 incoming connection */
80#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
81 with higher cseq */
82#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
83 with higher gseq */
84#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
85#define CEPH_MSGR_TAG_MSG 7 /* message */
86#define CEPH_MSGR_TAG_ACK 8 /* message ack */
87#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
88#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
89#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
90#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
91
92
93/*
94 * connection negotiation
95 */
96struct ceph_msg_connect {
97 __le64 features; /* supported feature bits */
98 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
99 __le32 global_seq; /* count connections initiated by this host */
100 __le32 connect_seq; /* count connections initiated in this session */
101 __le32 protocol_version;
102 __le32 authorizer_protocol;
103 __le32 authorizer_len;
104 __u8 flags; /* CEPH_MSG_CONNECT_* */
105} __attribute__ ((packed));
106
107struct ceph_msg_connect_reply {
108 __u8 tag;
109 __le64 features; /* feature bits for this session */
110 __le32 global_seq;
111 __le32 connect_seq;
112 __le32 protocol_version;
113 __le32 authorizer_len;
114 __u8 flags;
115} __attribute__ ((packed));
116
117#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
118
119
120/*
121 * message header
122 */
123struct ceph_msg_header {
124 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */
126 __le16 type; /* message type */
127 __le16 priority; /* priority. higher value == higher priority */
128 __le16 version; /* version of message encoding */
129
130 __le32 front_len; /* bytes in main payload */
131 __le32 middle_len;/* bytes in middle payload */
132 __le32 data_len; /* bytes of data payload */
133 __le16 data_off; /* sender: include full offset;
134 receiver: mask against ~PAGE_MASK */
135
136 struct ceph_entity_inst src, orig_src;
137 __le32 reserved;
138 __le32 crc; /* header crc32c */
139} __attribute__ ((packed));
140
141#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196
144#define CEPH_MSG_PRIO_HIGHEST 255
145
146/*
147 * follows data payload
148 */
149struct ceph_msg_footer {
150 __le32 front_crc, middle_crc, data_crc;
151 __u8 flags;
152} __attribute__ ((packed));
153
154#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
155#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
156
157
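/*
 * Rough framing sketch (informal, inferred from the tags and structs
 * above; not a normative wire spec): after the banner and the
 * ceph_msg_connect / ceph_msg_connect_reply exchange, the TCP stream
 * carries tagged items such as:
 *
 *	CEPH_MSGR_TAG_MSG        ceph_msg_header, front, [middle], [data],
 *	                         ceph_msg_footer
 *	CEPH_MSGR_TAG_ACK        le64 seq of last message received
 *	CEPH_MSGR_TAG_KEEPALIVE  (single byte)
 */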
158#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..c7b4dedaace6
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1550 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * changes, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
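/*
 * Object naming example: the sprintf() format above produces ids of
 * the form "<ino hex>.<block number as 8 hex digits>"; e.g.
 * (hypothetical values) ino 0x10000000000 and bno 1 give
 * "10000000000.00000001".
 */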
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
168 if (IS_ERR(msg)) {
169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg));
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
182 if (IS_ERR(msg)) {
183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg));
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref))
365 kfree(osd);
366}
367
368/*
369 * remove an osd from our map
370 */
371static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
372{
373 dout("__remove_osd %p\n", osd);
374 BUG_ON(!list_empty(&osd->o_requests));
375 rb_erase(&osd->o_node, &osdc->osds);
376 list_del_init(&osd->o_osd_lru);
377 ceph_con_close(&osd->o_con);
378 put_osd(osd);
379}
380
381static void __move_osd_to_lru(struct ceph_osd_client *osdc,
382 struct ceph_osd *osd)
383{
384 dout("__move_osd_to_lru %p\n", osd);
385 BUG_ON(!list_empty(&osd->o_osd_lru));
386 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
387 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
388}
389
390static void __remove_osd_from_lru(struct ceph_osd *osd)
391{
392 dout("__remove_osd_from_lru %p\n", osd);
393 if (!list_empty(&osd->o_osd_lru))
394 list_del_init(&osd->o_osd_lru);
395}
396
397static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
398{
399 struct ceph_osd *osd, *nosd;
400
401 dout("__remove_old_osds %p\n", osdc);
402 mutex_lock(&osdc->request_mutex);
403 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
404 if (!remove_all && time_before(jiffies, osd->lru_ttl))
405 break;
406 __remove_osd(osdc, osd);
407 }
408 mutex_unlock(&osdc->request_mutex);
409}
410
411/*
412 * reset osd connect
413 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{
416 struct ceph_osd_request *req;
417 int ret = 0;
418
419 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
420 if (list_empty(&osd->o_requests)) {
421 __remove_osd(osdc, osd);
422 } else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
423 &osd->o_con.peer_addr,
424 sizeof(osd->o_con.peer_addr)) == 0 &&
425 !ceph_con_opened(&osd->o_con)) {
426 dout(" osd addr hasn't changed and connection never opened,"
427 " letting msgr retry");
428		/* touch each r_stamp for handle_timeout()'s benefit */
429 list_for_each_entry(req, &osd->o_requests, r_osd_item)
430 req->r_stamp = jiffies;
431 ret = -EAGAIN;
432 } else {
433 ceph_con_close(&osd->o_con);
434 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
435 osd->o_incarnation++;
436 }
437 return ret;
438}
439
440static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
441{
442 struct rb_node **p = &osdc->osds.rb_node;
443 struct rb_node *parent = NULL;
444 struct ceph_osd *osd = NULL;
445
446 while (*p) {
447 parent = *p;
448 osd = rb_entry(parent, struct ceph_osd, o_node);
449 if (new->o_osd < osd->o_osd)
450 p = &(*p)->rb_left;
451 else if (new->o_osd > osd->o_osd)
452 p = &(*p)->rb_right;
453 else
454 BUG();
455 }
456
457 rb_link_node(&new->o_node, parent, p);
458 rb_insert_color(&new->o_node, &osdc->osds);
459}
460
461static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
462{
463 struct ceph_osd *osd;
464 struct rb_node *n = osdc->osds.rb_node;
465
466 while (n) {
467 osd = rb_entry(n, struct ceph_osd, o_node);
468 if (o < osd->o_osd)
469 n = n->rb_left;
470 else if (o > osd->o_osd)
471 n = n->rb_right;
472 else
473 return osd;
474 }
475 return NULL;
476}
477
478static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
479{
480 schedule_delayed_work(&osdc->timeout_work,
481 osdc->client->mount_args->osd_keepalive_timeout * HZ);
482}
483
484static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
485{
486 cancel_delayed_work(&osdc->timeout_work);
487}
488
489/*
490 * Register request, assign tid. If this is the first request, set up
491 * the timeout event.
492 */
493static void register_request(struct ceph_osd_client *osdc,
494 struct ceph_osd_request *req)
495{
496 mutex_lock(&osdc->request_mutex);
497 req->r_tid = ++osdc->last_tid;
498 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
499 INIT_LIST_HEAD(&req->r_req_lru_item);
500
501 dout("register_request %p tid %lld\n", req, req->r_tid);
502 __insert_request(osdc, req);
503 ceph_osdc_get_request(req);
504 osdc->num_requests++;
505
506 if (osdc->num_requests == 1) {
507 dout(" first request, scheduling timeout\n");
508 __schedule_osd_timeout(osdc);
509 }
510 mutex_unlock(&osdc->request_mutex);
511}
512
513/*
514 * called under osdc->request_mutex
515 */
516static void __unregister_request(struct ceph_osd_client *osdc,
517 struct ceph_osd_request *req)
518{
519 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
520 rb_erase(&req->r_node, &osdc->requests);
521 osdc->num_requests--;
522
523 if (req->r_osd) {
524 /* make sure the original request isn't in flight. */
525 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
526
527 list_del_init(&req->r_osd_item);
528 if (list_empty(&req->r_osd->o_requests))
529 __move_osd_to_lru(osdc, req->r_osd);
530 req->r_osd = NULL;
531 }
532
533 ceph_osdc_put_request(req);
534
535 list_del_init(&req->r_req_lru_item);
536 if (osdc->num_requests == 0) {
537 dout(" no requests, canceling timeout\n");
538 __cancel_osd_timeout(osdc);
539 }
540}
541
542/*
543 * Cancel a previously queued request message
544 */
545static void __cancel_request(struct ceph_osd_request *req)
546{
547 if (req->r_sent) {
548 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
549 req->r_sent = 0;
550 }
551 list_del_init(&req->r_req_lru_item);
552}
553
554/*
555 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
556 * (as needed), and set the request r_osd appropriately. If there is
557 * no up osd, set r_osd to NULL.
558 *
559 * Return 0 if unchanged, 1 if changed, or negative on error.
560 *
561 * Caller should hold map_sem for read and request_mutex.
562 */
563static int __map_osds(struct ceph_osd_client *osdc,
564 struct ceph_osd_request *req)
565{
566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
567 struct ceph_pg pgid;
568 int o = -1;
569 int err;
570
571 dout("map_osds %p tid %lld\n", req, req->r_tid);
572 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
573 &req->r_file_layout, osdc->osdmap);
574 if (err)
575 return err;
576 pgid = reqhead->layout.ol_pgid;
577 req->r_pgid = pgid;
578
579 o = ceph_calc_pg_primary(osdc->osdmap, pgid);
580
581 if ((req->r_osd && req->r_osd->o_osd == o &&
582 req->r_sent >= req->r_osd->o_incarnation) ||
583 (req->r_osd == NULL && o == -1))
584 return 0; /* no change */
585
586 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
587 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
588 req->r_osd ? req->r_osd->o_osd : -1);
589
590 if (req->r_osd) {
591 __cancel_request(req);
592 list_del_init(&req->r_osd_item);
593 req->r_osd = NULL;
594 }
595
596 req->r_osd = __lookup_osd(osdc, o);
597 if (!req->r_osd && o >= 0) {
598 err = -ENOMEM;
599 req->r_osd = create_osd(osdc);
600 if (!req->r_osd)
601 goto out;
602
603 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
604 req->r_osd->o_osd = o;
605 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
606 __insert_osd(osdc, req->r_osd);
607
608 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
609 }
610
611 if (req->r_osd) {
612 __remove_osd_from_lru(req->r_osd);
613 list_add(&req->r_osd_item, &req->r_osd->o_requests);
614 }
615 err = 1; /* osd changed */
616
617out:
618 return err;
619}
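/*
 * Mapping pipeline summary (informal): __map_osds() above goes from
 * object name to placement group (ceph_calc_object_layout), then to
 * the pg's primary osd (ceph_calc_pg_primary), and finally attaches
 * the request to that osd's session, creating and connecting a
 * ceph_osd if we have not talked to it before.
 */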
620
621/*
622 * caller should hold map_sem (for read) and request_mutex
623 */
624static int __send_request(struct ceph_osd_client *osdc,
625 struct ceph_osd_request *req)
626{
627 struct ceph_osd_request_head *reqhead;
628 int err;
629
630 err = __map_osds(osdc, req);
631 if (err < 0)
632 return err;
633 if (req->r_osd == NULL) {
634 dout("send_request %p no up osds in pg\n", req);
635 ceph_monc_request_next_osdmap(&osdc->client->monc);
636 return 0;
637 }
638
639 dout("send_request %p tid %llu to osd%d flags %d\n",
640 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
641
642 reqhead = req->r_request->front.iov_base;
643 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
644 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
645 reqhead->reassert_version = req->r_reassert_version;
646
647 req->r_stamp = jiffies;
648	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
649
650 ceph_msg_get(req->r_request); /* send consumes a ref */
651 ceph_con_send(&req->r_osd->o_con, req->r_request);
652 req->r_sent = req->r_osd->o_incarnation;
653 return 0;
654}
655
656/*
657 * Timeout callback, called every N seconds when 1 or more osd
658 * requests have been active for more than N seconds. When this
659 * happens, we ping all OSDs whose requests have timed out to ensure
660 * that any break in the communications channel is detected. Reset
661 * the request timeouts another N seconds into the future as we go,
662 * and reschedule the timeout event another N seconds out (unless
663 * there are no open requests).
664 */
665static void handle_timeout(struct work_struct *work)
666{
667 struct ceph_osd_client *osdc =
668 container_of(work, struct ceph_osd_client, timeout_work.work);
669 struct ceph_osd_request *req, *last_req = NULL;
670 struct ceph_osd *osd;
671 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
672 unsigned long keepalive =
673 osdc->client->mount_args->osd_keepalive_timeout * HZ;
674 unsigned long last_stamp = 0;
675 struct rb_node *p;
676 struct list_head slow_osds;
677
678 dout("timeout\n");
679 down_read(&osdc->map_sem);
680
681 ceph_monc_request_next_osdmap(&osdc->client->monc);
682
683 mutex_lock(&osdc->request_mutex);
684 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
685 req = rb_entry(p, struct ceph_osd_request, r_node);
686
687 if (req->r_resend) {
688 int err;
689
690 dout("osdc resending prev failed %lld\n", req->r_tid);
691 err = __send_request(osdc, req);
692 if (err)
693 dout("osdc failed again on %lld\n", req->r_tid);
694 else
695 req->r_resend = false;
696 continue;
697 }
698 }
699
700 /*
701 * reset osds that appear to be _really_ unresponsive. this
702 * is a failsafe measure.. we really shouldn't be getting to
703 * this point if the system is working properly. the monitors
704 * should mark the osd as failed and we should find out about
705 * it from an updated osd map.
706 */
707 while (!list_empty(&osdc->req_lru)) {
708 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
709 r_req_lru_item);
710
711 if (time_before(jiffies, req->r_stamp + timeout))
712 break;
713
714 BUG_ON(req == last_req && req->r_stamp == last_stamp);
715 last_req = req;
716 last_stamp = req->r_stamp;
717
718 osd = req->r_osd;
719 BUG_ON(!osd);
720 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
721 req->r_tid, osd->o_osd);
722 __kick_requests(osdc, osd);
723 }
724
725 /*
726 * ping osds that are a bit slow. this ensures that if there
727 * is a break in the TCP connection we will notice, and reopen
728 * a connection with that osd (from the fault callback).
729 */
730 INIT_LIST_HEAD(&slow_osds);
731 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
732 if (time_before(jiffies, req->r_stamp + keepalive))
733 break;
734
735 osd = req->r_osd;
736 BUG_ON(!osd);
737 dout(" tid %llu is slow, will send keepalive on osd%d\n",
738 req->r_tid, osd->o_osd);
739 list_move_tail(&osd->o_keepalive_item, &slow_osds);
740 }
741 while (!list_empty(&slow_osds)) {
742 osd = list_entry(slow_osds.next, struct ceph_osd,
743 o_keepalive_item);
744 list_del_init(&osd->o_keepalive_item);
745 ceph_con_keepalive(&osd->o_con);
746 }
747
748 __schedule_osd_timeout(osdc);
749 mutex_unlock(&osdc->request_mutex);
750
751 up_read(&osdc->map_sem);
752}
753
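/*
 * Illustrative timeline for the handler above, assuming the usual mount
 * defaults of roughly a 60 second osd_timeout and a 5 second
 * osd_keepalive_timeout (both come from mount_args and are tunable, so
 * treat the numbers as an example only):
 *
 *   t+5s   request still pending -> osd put on slow_osds, keepalive sent
 *   t+60s  request still pending -> osd connection reset via __kick_requests()
 *
 * In both cases a fresh osdmap has already been requested from the
 * monitors, since an unresponsive osd usually means our map is stale.
 */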
754static void handle_osds_timeout(struct work_struct *work)
755{
756 struct ceph_osd_client *osdc =
757 container_of(work, struct ceph_osd_client,
758 osds_timeout_work.work);
759 unsigned long delay =
760 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
761
762 dout("osds timeout\n");
763 down_read(&osdc->map_sem);
764 remove_old_osds(osdc, 0);
765 up_read(&osdc->map_sem);
766
767 schedule_delayed_work(&osdc->osds_timeout_work,
768 round_jiffies_relative(delay));
769}
770
771/*
772 * handle osd op reply. either call the callback if it is specified,
773 * or do the completion to wake up the waiting thread.
774 */
775static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
776 struct ceph_connection *con)
777{
778 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
779 struct ceph_osd_request *req;
780 u64 tid;
781 int numops, object_len, flags;
782
783 tid = le64_to_cpu(msg->hdr.tid);
784 if (msg->front.iov_len < sizeof(*rhead))
785 goto bad;
786 numops = le32_to_cpu(rhead->num_ops);
787 object_len = le32_to_cpu(rhead->object_len);
788 if (msg->front.iov_len != sizeof(*rhead) + object_len +
789 numops * sizeof(struct ceph_osd_op))
790 goto bad;
791 dout("handle_reply %p tid %llu\n", msg, tid);
792
793 /* lookup */
794 mutex_lock(&osdc->request_mutex);
795 req = __lookup_request(osdc, tid);
796 if (req == NULL) {
797 dout("handle_reply tid %llu dne\n", tid);
798 mutex_unlock(&osdc->request_mutex);
799 return;
800 }
801 ceph_osdc_get_request(req);
802 flags = le32_to_cpu(rhead->flags);
803
804 /*
805 * if this connection filled our message, drop our reference now, to
806 * avoid a (safe but slower) revoke later.
807 */
808 if (req->r_con_filling_msg == con && req->r_reply == msg) {
809 dout(" dropping con_filling_msg ref %p\n", con);
810 req->r_con_filling_msg = NULL;
811 ceph_con_put(con);
812 }
813
814 if (!req->r_got_reply) {
815 unsigned bytes;
816
817 req->r_result = le32_to_cpu(rhead->result);
818 bytes = le32_to_cpu(msg->hdr.data_len);
819 dout("handle_reply result %d bytes %d\n", req->r_result,
820 bytes);
821 if (req->r_result == 0)
822 req->r_result = bytes;
823
824		/* in case this is a write and we need to replay it, save the version */
825 req->r_reassert_version = rhead->reassert_version;
826
827 req->r_got_reply = 1;
828 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
829 dout("handle_reply tid %llu dup ack\n", tid);
830 mutex_unlock(&osdc->request_mutex);
831 goto done;
832 }
833
834 dout("handle_reply tid %llu flags %d\n", tid, flags);
835
836 /* either this is a read, or we got the safe response */
837 if ((flags & CEPH_OSD_FLAG_ONDISK) ||
838 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
839 __unregister_request(osdc, req);
840
841 mutex_unlock(&osdc->request_mutex);
842
843 if (req->r_callback)
844 req->r_callback(req, msg);
845 else
846 complete(&req->r_completion);
847
848 if (flags & CEPH_OSD_FLAG_ONDISK) {
849 if (req->r_safe_callback)
850 req->r_safe_callback(req, msg);
851 complete(&req->r_safe_completion); /* fsync waiter */
852 }
853
854done:
855 ceph_osdc_put_request(req);
856 return;
857
858bad:
859 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
860 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
861 (int)sizeof(*rhead));
862 ceph_msg_dump(msg);
863}
864
865
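/*
 * Illustrative usage sketch (guarded out; not part of the original code):
 * the two completion paths served by handle_reply() above, as seen by a
 * caller.  Request and page setup are omitted; my_read_done() and
 * submit_example() are hypothetical names.
 */
#if 0
static void my_read_done(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	/* called from handle_reply(); req->r_result is bytes read or -errno */
}

static int submit_example(struct ceph_osd_client *osdc,
			  struct ceph_osd_request *req, bool async)
{
	int rc;

	if (async)
		req->r_callback = my_read_done;	/* complete via callback */

	rc = ceph_osdc_start_request(osdc, req, false);
	if (rc)
		return rc;

	if (!async)
		rc = ceph_osdc_wait_request(osdc, req);	/* wait on r_completion */
	return rc;
}
#endif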
866static int __kick_requests(struct ceph_osd_client *osdc,
867 struct ceph_osd *kickosd)
868{
869 struct ceph_osd_request *req;
870 struct rb_node *p, *n;
871 int needmap = 0;
872 int err;
873
874 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
875 if (kickosd) {
876 err = __reset_osd(osdc, kickosd);
877 if (err == -EAGAIN)
878 return 1;
879 } else {
880 for (p = rb_first(&osdc->osds); p; p = n) {
881 struct ceph_osd *osd =
882 rb_entry(p, struct ceph_osd, o_node);
883
884 n = rb_next(p);
885 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
886 memcmp(&osd->o_con.peer_addr,
887 ceph_osd_addr(osdc->osdmap,
888 osd->o_osd),
889 sizeof(struct ceph_entity_addr)) != 0)
890 __reset_osd(osdc, osd);
891 }
892 }
893
894 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
895 req = rb_entry(p, struct ceph_osd_request, r_node);
896
897 if (req->r_resend) {
898 dout(" r_resend set on tid %llu\n", req->r_tid);
899 __cancel_request(req);
900 goto kick;
901 }
902 if (req->r_osd && kickosd == req->r_osd) {
903 __cancel_request(req);
904 goto kick;
905 }
906
907 err = __map_osds(osdc, req);
908 if (err == 0)
909 continue; /* no change */
910 if (err < 0) {
911 /*
912 * FIXME: really, we should set the request
913 * error and fail if this isn't a 'nofail'
914 * request, but that's a fair bit more
915 * complicated to do. So retry!
916 */
917 dout(" setting r_resend on %llu\n", req->r_tid);
918 req->r_resend = true;
919 continue;
920 }
921 if (req->r_osd == NULL) {
922 dout("tid %llu maps to no valid osd\n", req->r_tid);
923 needmap++; /* request a newer map */
924 continue;
925 }
926
927kick:
928 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
929 req->r_osd ? req->r_osd->o_osd : -1);
930 req->r_flags |= CEPH_OSD_FLAG_RETRY;
931 err = __send_request(osdc, req);
932 if (err) {
933 dout(" setting r_resend on %llu\n", req->r_tid);
934 req->r_resend = true;
935 }
936 }
937
938 return needmap;
939}
940
941/*
942 * Resubmit osd requests whose osd or osd address has changed. Request
943 * a new osd map if osds are down, or we are otherwise unable to determine
944 * how to direct a request.
945 *
946 * Close connections to down osds.
947 *
948 * If @kickosd is specified, resubmit only the requests for that osd.
949 *
950 * Caller should hold map_sem for read; request_mutex is taken here.
951 */
952static void kick_requests(struct ceph_osd_client *osdc,
953 struct ceph_osd *kickosd)
954{
955 int needmap;
956
957 mutex_lock(&osdc->request_mutex);
958 needmap = __kick_requests(osdc, kickosd);
959 mutex_unlock(&osdc->request_mutex);
960
961 if (needmap) {
962 dout("%d requests for down osds, need new map\n", needmap);
963 ceph_monc_request_next_osdmap(&osdc->client->monc);
964 }
965}
966
967/*
968 * Process updated osd map.
969 *
970 * The message contains any number of incremental and full maps, normally
971 * indicating some sort of topology change in the cluster. Kick requests
972 * off to different OSDs as needed.
973 */
974void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
975{
976 void *p, *end, *next;
977 u32 nr_maps, maplen;
978 u32 epoch;
979 struct ceph_osdmap *newmap = NULL, *oldmap;
980 int err;
981 struct ceph_fsid fsid;
982
983 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
984 p = msg->front.iov_base;
985 end = p + msg->front.iov_len;
986
987 /* verify fsid */
988 ceph_decode_need(&p, end, sizeof(fsid), bad);
989 ceph_decode_copy(&p, &fsid, sizeof(fsid));
990 if (ceph_check_fsid(osdc->client, &fsid) < 0)
991 return;
992
993 down_write(&osdc->map_sem);
994
995 /* incremental maps */
996 ceph_decode_32_safe(&p, end, nr_maps, bad);
997 dout(" %d inc maps\n", nr_maps);
998 while (nr_maps > 0) {
999 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1000 epoch = ceph_decode_32(&p);
1001 maplen = ceph_decode_32(&p);
1002 ceph_decode_need(&p, end, maplen, bad);
1003 next = p + maplen;
1004 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
1005 dout("applying incremental map %u len %d\n",
1006 epoch, maplen);
1007 newmap = osdmap_apply_incremental(&p, next,
1008 osdc->osdmap,
1009 osdc->client->msgr);
1010 if (IS_ERR(newmap)) {
1011 err = PTR_ERR(newmap);
1012 goto bad;
1013 }
1014 BUG_ON(!newmap);
1015 if (newmap != osdc->osdmap) {
1016 ceph_osdmap_destroy(osdc->osdmap);
1017 osdc->osdmap = newmap;
1018 }
1019 } else {
1020 dout("ignoring incremental map %u len %d\n",
1021 epoch, maplen);
1022 }
1023 p = next;
1024 nr_maps--;
1025 }
1026 if (newmap)
1027 goto done;
1028
1029 /* full maps */
1030 ceph_decode_32_safe(&p, end, nr_maps, bad);
1031 dout(" %d full maps\n", nr_maps);
1032 while (nr_maps) {
1033 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1034 epoch = ceph_decode_32(&p);
1035 maplen = ceph_decode_32(&p);
1036 ceph_decode_need(&p, end, maplen, bad);
1037 if (nr_maps > 1) {
1038 dout("skipping non-latest full map %u len %d\n",
1039 epoch, maplen);
1040 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1041 dout("skipping full map %u len %d, "
1042 "older than our %u\n", epoch, maplen,
1043 osdc->osdmap->epoch);
1044 } else {
1045 dout("taking full map %u len %d\n", epoch, maplen);
1046 newmap = osdmap_decode(&p, p+maplen);
1047 if (IS_ERR(newmap)) {
1048 err = PTR_ERR(newmap);
1049 goto bad;
1050 }
1051 BUG_ON(!newmap);
1052 oldmap = osdc->osdmap;
1053 osdc->osdmap = newmap;
1054 if (oldmap)
1055 ceph_osdmap_destroy(oldmap);
1056 }
1057 p += maplen;
1058 nr_maps--;
1059 }
1060
1061done:
1062 downgrade_write(&osdc->map_sem);
1063 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1064 if (newmap)
1065 kick_requests(osdc, NULL);
1066 up_read(&osdc->map_sem);
1067 return;
1068
1069bad:
1070 pr_err("osdc handle_map corrupt msg\n");
1071 ceph_msg_dump(msg);
1072 up_write(&osdc->map_sem);
1073 return;
1074}
1075
1076
1077/*
1078 * A read request prepares specific pages that data is to be read into.
1079 * When a message is being read off the wire, we call __prepare_pages to
1080 * find those pages.
1081 * Returns 0 on success, -1 on failure.
1082 */
1083static int __prepare_pages(struct ceph_connection *con,
1084 struct ceph_msg_header *hdr,
1085 struct ceph_osd_request *req,
1086 u64 tid,
1087 struct ceph_msg *m)
1088{
1089 struct ceph_osd *osd = con->private;
1090 struct ceph_osd_client *osdc;
1091 int ret = -1;
1092 int data_len = le32_to_cpu(hdr->data_len);
1093 unsigned data_off = le16_to_cpu(hdr->data_off);
1094
1095 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1096
1097 if (!osd)
1098 return -1;
1099
1100 osdc = osd->o_osdc;
1101
1102 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1103 tid, req->r_num_pages, want);
1104 if (unlikely(req->r_num_pages < want))
1105 goto out;
1106 m->pages = req->r_pages;
1107 m->nr_pages = req->r_num_pages;
1108 ret = 0; /* success */
1109out:
1110 BUG_ON(ret < 0 || m->nr_pages < want);
1111
1112 return ret;
1113}
1114
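/*
 * For reference: calc_pages_for(), a helper defined in super.h elsewhere
 * in this patch set, counts how many pages a byte range touches once its
 * offset within the first page is taken into account, roughly
 *
 *	pages = ((off + len + PAGE_SIZE - 1) >> PAGE_SHIFT)
 *		- (off >> PAGE_SHIFT);
 *
 * e.g. with 4 KB pages, off = 1024 and len = 8192 touch pages 0, 1 and 2,
 * so three pages are wanted even though len is only two pages of data.
 */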
1115/*
1116 * Register request, send initial attempt.
1117 */
1118int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1119 struct ceph_osd_request *req,
1120 bool nofail)
1121{
1122 int rc = 0;
1123
1124 req->r_request->pages = req->r_pages;
1125 req->r_request->nr_pages = req->r_num_pages;
1126
1127 register_request(osdc, req);
1128
1129 down_read(&osdc->map_sem);
1130 mutex_lock(&osdc->request_mutex);
1131 /*
1132 * a racing kick_requests() may have sent the message for us
1133 * while we dropped request_mutex above, so only send now if
1134 * the request still hasn't been touched yet.
1135 */
1136 if (req->r_sent == 0) {
1137 rc = __send_request(osdc, req);
1138 if (rc) {
1139 if (nofail) {
1140 dout("osdc_start_request failed send, "
1141 " marking %lld\n", req->r_tid);
1142 req->r_resend = true;
1143 rc = 0;
1144 } else {
1145 __unregister_request(osdc, req);
1146 }
1147 }
1148 }
1149 mutex_unlock(&osdc->request_mutex);
1150 up_read(&osdc->map_sem);
1151 return rc;
1152}
1153
1154/*
1155 * wait for a request to complete
1156 */
1157int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1158 struct ceph_osd_request *req)
1159{
1160 int rc;
1161
1162 rc = wait_for_completion_interruptible(&req->r_completion);
1163 if (rc < 0) {
1164 mutex_lock(&osdc->request_mutex);
1165 __cancel_request(req);
1166 __unregister_request(osdc, req);
1167 mutex_unlock(&osdc->request_mutex);
1168 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1169 return rc;
1170 }
1171
1172 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1173 return req->r_result;
1174}
1175
1176/*
1177 * sync - wait for all in-flight requests to flush. avoid starvation.
1178 */
1179void ceph_osdc_sync(struct ceph_osd_client *osdc)
1180{
1181 struct ceph_osd_request *req;
1182 u64 last_tid, next_tid = 0;
1183
1184 mutex_lock(&osdc->request_mutex);
1185 last_tid = osdc->last_tid;
1186 while (1) {
1187 req = __lookup_request_ge(osdc, next_tid);
1188 if (!req)
1189 break;
1190 if (req->r_tid > last_tid)
1191 break;
1192
1193 next_tid = req->r_tid + 1;
1194 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1195 continue;
1196
1197 ceph_osdc_get_request(req);
1198 mutex_unlock(&osdc->request_mutex);
1199 dout("sync waiting on tid %llu (last is %llu)\n",
1200 req->r_tid, last_tid);
1201 wait_for_completion(&req->r_safe_completion);
1202 mutex_lock(&osdc->request_mutex);
1203 ceph_osdc_put_request(req);
1204 }
1205 mutex_unlock(&osdc->request_mutex);
1206 dout("sync done (thru tid %llu)\n", last_tid);
1207}
1208
1209/*
1210 * init, shutdown
1211 */
1212int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1213{
1214 int err;
1215
1216 dout("init\n");
1217 osdc->client = client;
1218 osdc->osdmap = NULL;
1219 init_rwsem(&osdc->map_sem);
1220 init_completion(&osdc->map_waiters);
1221 osdc->last_requested_map = 0;
1222 mutex_init(&osdc->request_mutex);
1223 osdc->last_tid = 0;
1224 osdc->osds = RB_ROOT;
1225 INIT_LIST_HEAD(&osdc->osd_lru);
1226 osdc->requests = RB_ROOT;
1227 INIT_LIST_HEAD(&osdc->req_lru);
1228 osdc->num_requests = 0;
1229 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1230 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1231
1232 schedule_delayed_work(&osdc->osds_timeout_work,
1233 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1234
1235 err = -ENOMEM;
1236 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1237 sizeof(struct ceph_osd_request));
1238 if (!osdc->req_mempool)
1239 goto out;
1240
1241 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
1242 if (err < 0)
1243 goto out_mempool;
1244 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1245 OSD_OPREPLY_FRONT_LEN, 10, true);
1246 if (err < 0)
1247 goto out_msgpool;
1248 return 0;
1249
1250out_msgpool:
1251 ceph_msgpool_destroy(&osdc->msgpool_op);
1252out_mempool:
1253 mempool_destroy(osdc->req_mempool);
1254out:
1255 return err;
1256}
1257
1258void ceph_osdc_stop(struct ceph_osd_client *osdc)
1259{
1260 cancel_delayed_work_sync(&osdc->timeout_work);
1261 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1262 if (osdc->osdmap) {
1263 ceph_osdmap_destroy(osdc->osdmap);
1264 osdc->osdmap = NULL;
1265 }
1266 remove_old_osds(osdc, 1);
1267 mempool_destroy(osdc->req_mempool);
1268 ceph_msgpool_destroy(&osdc->msgpool_op);
1269 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1270}
1271
1272/*
1273 * Read some contiguous pages. If we cross a stripe boundary, shorten
1274 * *plen. Return number of bytes read, or error.
1275 */
1276int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1277 struct ceph_vino vino, struct ceph_file_layout *layout,
1278 u64 off, u64 *plen,
1279 u32 truncate_seq, u64 truncate_size,
1280 struct page **pages, int num_pages)
1281{
1282 struct ceph_osd_request *req;
1283 int rc = 0;
1284
1285 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1286 vino.snap, off, *plen);
1287 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1288 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1289 NULL, 0, truncate_seq, truncate_size, NULL,
1290 false, 1);
1291 if (IS_ERR(req))
1292 return PTR_ERR(req);
1293
1294 /* it may be a short read due to an object boundary */
1295 req->r_pages = pages;
1296 num_pages = calc_pages_for(off, *plen);
1297 req->r_num_pages = num_pages;
1298
1299 dout("readpages final extent is %llu~%llu (%d pages)\n",
1300 off, *plen, req->r_num_pages);
1301
1302 rc = ceph_osdc_start_request(osdc, req, false);
1303 if (!rc)
1304 rc = ceph_osdc_wait_request(osdc, req);
1305
1306 ceph_osdc_put_request(req);
1307 dout("readpages result %d\n", rc);
1308 return rc;
1309}
1310
1311/*
1312 * do a synchronous write on N pages
1313 */
1314int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1315 struct ceph_file_layout *layout,
1316 struct ceph_snap_context *snapc,
1317 u64 off, u64 len,
1318 u32 truncate_seq, u64 truncate_size,
1319 struct timespec *mtime,
1320 struct page **pages, int num_pages,
1321 int flags, int do_sync, bool nofail)
1322{
1323 struct ceph_osd_request *req;
1324 int rc = 0;
1325
1326 BUG_ON(vino.snap != CEPH_NOSNAP);
1327 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1328 CEPH_OSD_OP_WRITE,
1329 flags | CEPH_OSD_FLAG_ONDISK |
1330 CEPH_OSD_FLAG_WRITE,
1331 snapc, do_sync,
1332 truncate_seq, truncate_size, mtime,
1333 nofail, 1);
1334 if (IS_ERR(req))
1335 return PTR_ERR(req);
1336
1337 /* it may be a short write due to an object boundary */
1338 req->r_pages = pages;
1339 req->r_num_pages = calc_pages_for(off, len);
1340 dout("writepages %llu~%llu (%d pages)\n", off, len,
1341 req->r_num_pages);
1342
1343 rc = ceph_osdc_start_request(osdc, req, nofail);
1344 if (!rc)
1345 rc = ceph_osdc_wait_request(osdc, req);
1346
1347 ceph_osdc_put_request(req);
1348 if (rc == 0)
1349 rc = len;
1350 dout("writepages result %d\n", rc);
1351 return rc;
1352}
1353
1354/*
1355 * handle incoming message
1356 */
1357static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1358{
1359 struct ceph_osd *osd = con->private;
1360 struct ceph_osd_client *osdc;
1361 int type = le16_to_cpu(msg->hdr.type);
1362
1363 if (!osd)
1364 return;
1365 osdc = osd->o_osdc;
1366
1367 switch (type) {
1368 case CEPH_MSG_OSD_MAP:
1369 ceph_osdc_handle_map(osdc, msg);
1370 break;
1371 case CEPH_MSG_OSD_OPREPLY:
1372 handle_reply(osdc, msg, con);
1373 break;
1374
1375 default:
1376 pr_err("received unknown message type %d %s\n", type,
1377 ceph_msg_type_name(type));
1378 }
1379 ceph_msg_put(msg);
1380}
1381
1382/*
1383 * lookup and return message for incoming reply
1384 */
1385static struct ceph_msg *get_reply(struct ceph_connection *con,
1386 struct ceph_msg_header *hdr,
1387 int *skip)
1388{
1389 struct ceph_osd *osd = con->private;
1390 struct ceph_osd_client *osdc = osd->o_osdc;
1391 struct ceph_msg *m;
1392 struct ceph_osd_request *req;
1393 int front = le32_to_cpu(hdr->front_len);
1394 int data_len = le32_to_cpu(hdr->data_len);
1395 u64 tid;
1396 int err;
1397
1398 tid = le64_to_cpu(hdr->tid);
1399 mutex_lock(&osdc->request_mutex);
1400 req = __lookup_request(osdc, tid);
1401 if (!req) {
1402 *skip = 1;
1403 m = NULL;
1404 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1405 osd->o_osd);
1406 goto out;
1407 }
1408
1409 if (req->r_con_filling_msg) {
1410 dout("get_reply revoking msg %p from old con %p\n",
1411 req->r_reply, req->r_con_filling_msg);
1412 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1413 ceph_con_put(req->r_con_filling_msg);
1414 }
1415
1416 if (front > req->r_reply->front.iov_len) {
1417 pr_warning("get_reply front %d > preallocated %d\n",
1418 front, (int)req->r_reply->front.iov_len);
1419 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
1420 if (IS_ERR(m))
1421 goto out;
1422 ceph_msg_put(req->r_reply);
1423 req->r_reply = m;
1424 }
1425 m = ceph_msg_get(req->r_reply);
1426
1427 if (data_len > 0) {
1428 err = __prepare_pages(con, hdr, req, tid, m);
1429 if (err < 0) {
1430 *skip = 1;
1431 ceph_msg_put(m);
1432 m = ERR_PTR(err);
1433 }
1434 }
1435 *skip = 0;
1436 req->r_con_filling_msg = ceph_con_get(con);
1437 dout("get_reply tid %lld %p\n", tid, m);
1438
1439out:
1440 mutex_unlock(&osdc->request_mutex);
1441 return m;
1443}
1444
1445static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1446 struct ceph_msg_header *hdr,
1447 int *skip)
1448{
1449 struct ceph_osd *osd = con->private;
1450 int type = le16_to_cpu(hdr->type);
1451 int front = le32_to_cpu(hdr->front_len);
1452
1453 switch (type) {
1454 case CEPH_MSG_OSD_MAP:
1455 return ceph_msg_new(type, front, 0, 0, NULL);
1456 case CEPH_MSG_OSD_OPREPLY:
1457 return get_reply(con, hdr, skip);
1458 default:
1459 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1460 osd->o_osd);
1461 *skip = 1;
1462 return NULL;
1463 }
1464}
1465
1466/*
1467 * Wrappers to refcount containing ceph_osd struct
1468 */
1469static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1470{
1471 struct ceph_osd *osd = con->private;
1472 if (get_osd(osd))
1473 return con;
1474 return NULL;
1475}
1476
1477static void put_osd_con(struct ceph_connection *con)
1478{
1479 struct ceph_osd *osd = con->private;
1480 put_osd(osd);
1481}
1482
1483/*
1484 * authentication
1485 */
1486static int get_authorizer(struct ceph_connection *con,
1487 void **buf, int *len, int *proto,
1488 void **reply_buf, int *reply_len, int force_new)
1489{
1490 struct ceph_osd *o = con->private;
1491 struct ceph_osd_client *osdc = o->o_osdc;
1492 struct ceph_auth_client *ac = osdc->client->monc.auth;
1493 int ret = 0;
1494
1495 if (force_new && o->o_authorizer) {
1496 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1497 o->o_authorizer = NULL;
1498 }
1499 if (o->o_authorizer == NULL) {
1500 ret = ac->ops->create_authorizer(
1501 ac, CEPH_ENTITY_TYPE_OSD,
1502 &o->o_authorizer,
1503 &o->o_authorizer_buf,
1504 &o->o_authorizer_buf_len,
1505 &o->o_authorizer_reply_buf,
1506 &o->o_authorizer_reply_buf_len);
1507 if (ret)
1508 return ret;
1509 }
1510
1511 *proto = ac->protocol;
1512 *buf = o->o_authorizer_buf;
1513 *len = o->o_authorizer_buf_len;
1514 *reply_buf = o->o_authorizer_reply_buf;
1515 *reply_len = o->o_authorizer_reply_buf_len;
1516 return 0;
1517}
1518
1519
1520static int verify_authorizer_reply(struct ceph_connection *con, int len)
1521{
1522 struct ceph_osd *o = con->private;
1523 struct ceph_osd_client *osdc = o->o_osdc;
1524 struct ceph_auth_client *ac = osdc->client->monc.auth;
1525
1526 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1527}
1528
1529static int invalidate_authorizer(struct ceph_connection *con)
1530{
1531 struct ceph_osd *o = con->private;
1532 struct ceph_osd_client *osdc = o->o_osdc;
1533 struct ceph_auth_client *ac = osdc->client->monc.auth;
1534
1535 if (ac->ops->invalidate_authorizer)
1536 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1537
1538 return ceph_monc_validate_auth(&osdc->client->monc);
1539}
1540
1541static const struct ceph_connection_operations osd_con_ops = {
1542 .get = get_osd_con,
1543 .put = put_osd_con,
1544 .dispatch = dispatch,
1545 .get_authorizer = get_authorizer,
1546 .verify_authorizer_reply = verify_authorizer_reply,
1547 .invalidate_authorizer = invalidate_authorizer,
1548 .alloc_msg = alloc_msg,
1549 .fault = osd_reset,
1550};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..b0759911e7c3
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,166 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51
52 struct ceph_connection *r_con_filling_msg;
53
54 struct ceph_msg *r_request, *r_reply;
55 int r_result;
56 int r_flags; /* any additional flags for the osd */
57 u32 r_sent; /* >0 if r_request is sending/sent */
58 int r_got_reply;
59
60 struct ceph_osd_client *r_osdc;
61 struct kref r_kref;
62 bool r_mempool;
63 struct completion r_completion, r_safe_completion;
64 ceph_osdc_callback_t r_callback, r_safe_callback;
65 struct ceph_eversion r_reassert_version;
66 struct list_head r_unsafe_item;
67
68 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70
71 char r_oid[40]; /* object name */
72 int r_oid_len;
73 unsigned long r_stamp; /* send OR check time */
74 bool r_resend; /* msg send failed, needs retry */
75
76 struct ceph_file_layout r_file_layout;
77 struct ceph_snap_context *r_snapc; /* snap context for writes */
78 unsigned r_num_pages; /* size of page array (follows) */
79 struct page **r_pages; /* pages for data payload */
80 int r_pages_from_pool;
81 int r_own_pages; /* if true, i own page list */
82};
83
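/*
 * Request lifecycle (as implemented in osd_client.c): requests are
 * refcounted via r_kref.  ceph_osdc_start_request() registers the request
 * with the client, __send_request() targets it at its pg's primary osd
 * and puts it on the timeout lru, and handle_reply() completes
 * r_completion (or calls r_callback), plus r_safe_completion once a write
 * is known to be on disk.  The request is unregistered when its final
 * reply arrives and freed when the last reference is put.
 */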
84struct ceph_osd_client {
85 struct ceph_client *client;
86
87 struct ceph_osdmap *osdmap; /* current map */
88 struct rw_semaphore map_sem;
89 struct completion map_waiters;
90 u64 last_requested_map;
91
92 struct mutex request_mutex;
93 struct rb_root osds; /* osds */
94 struct list_head osd_lru; /* idle osds */
95 u64 timeout_tid; /* tid of timeout triggering rq */
96 u64 last_tid; /* tid of last request */
97 struct rb_root requests; /* pending requests */
98 struct list_head req_lru; /* pending requests lru */
99 int num_requests;
100 struct delayed_work timeout_work;
101 struct delayed_work osds_timeout_work;
102#ifdef CONFIG_DEBUG_FS
103 struct dentry *debugfs_file;
104#endif
105
106 mempool_t *req_mempool;
107
108 struct ceph_msgpool msgpool_op;
109 struct ceph_msgpool msgpool_op_reply;
110};
111
112extern int ceph_osdc_init(struct ceph_osd_client *osdc,
113 struct ceph_client *client);
114extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
115
116extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
117 struct ceph_msg *msg);
118extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
119 struct ceph_msg *msg);
120
121extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
122 struct ceph_file_layout *layout,
123 struct ceph_vino vino,
124 u64 offset, u64 *len, int op, int flags,
125 struct ceph_snap_context *snapc,
126 int do_sync, u32 truncate_seq,
127 u64 truncate_size,
128 struct timespec *mtime,
129 bool use_mempool, int num_reply);
130
131static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
132{
133 kref_get(&req->r_kref);
134}
135extern void ceph_osdc_release_request(struct kref *kref);
136static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
137{
138 kref_put(&req->r_kref, ceph_osdc_release_request);
139}
140
141extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
142 struct ceph_osd_request *req,
143 bool nofail);
144extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
145 struct ceph_osd_request *req);
146extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
147
148extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
149 struct ceph_vino vino,
150 struct ceph_file_layout *layout,
151 u64 off, u64 *plen,
152 u32 truncate_seq, u64 truncate_size,
153 struct page **pages, int nr_pages);
154
155extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
156 struct ceph_vino vino,
157 struct ceph_file_layout *layout,
158 struct ceph_snap_context *sc,
159 u64 off, u64 len,
160 u32 truncate_seq, u64 truncate_size,
161 struct timespec *mtime,
162 struct page **pages, int nr_pages,
163 int flags, int do_sync, bool nofail);
164
165#endif
166
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..2e2c15eed82a
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1062 @@
1
2#include "ceph_debug.h"
3
4#include <linux/slab.h>
5#include <asm/div64.h>
6
7#include "super.h"
8#include "osdmap.h"
9#include "crush/hash.h"
10#include "crush/mapper.h"
11#include "decode.h"
12
13char *ceph_osdmap_state_str(char *str, int len, int state)
14{
15 int flag = 0;
16
17 if (!len)
18 goto done;
19
20 *str = '\0';
21 if (state) {
22 if (state & CEPH_OSD_EXISTS) {
23 snprintf(str, len, "exists");
24 flag = 1;
25 }
26 if (state & CEPH_OSD_UP) {
27			snprintf(str + strlen(str), len - strlen(str), "%s%s",
28				 (flag ? ", " : ""), "up");
29 flag = 1;
30 }
31 } else {
32 snprintf(str, len, "doesn't exist");
33 }
34done:
35 return str;
36}
37
38/* maps */
39
40static int calc_bits_of(unsigned t)
41{
42 int b = 0;
43 while (t) {
44 t = t >> 1;
45 b++;
46 }
47 return b;
48}
49
50/*
51 * the foo_mask is the smallest value 2^n-1 that is >= foo-1.
52 */
53static void calc_pg_masks(struct ceph_pg_pool_info *pi)
54{
55 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
56 pi->pgp_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
58 pi->lpg_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
60 pi->lpgp_num_mask =
61 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
62}
63
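/*
 * Example of the mask derivation above: for pg_num = 12, calc_bits_of(11)
 * is 4, so pg_num_mask = (1 << 4) - 1 = 15; for a power of two such as
 * pg_num = 8, calc_bits_of(7) is 3 and the mask is 7.  The masks are later
 * handed to ceph_stable_mod() to fold a placement seed into the pool's
 * pg range.
 */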
64/*
65 * decode crush map
66 */
67static int crush_decode_uniform_bucket(void **p, void *end,
68 struct crush_bucket_uniform *b)
69{
70 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
71 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
72 b->item_weight = ceph_decode_32(p);
73 return 0;
74bad:
75 return -EINVAL;
76}
77
78static int crush_decode_list_bucket(void **p, void *end,
79 struct crush_bucket_list *b)
80{
81 int j;
82 dout("crush_decode_list_bucket %p to %p\n", *p, end);
83 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
84 if (b->item_weights == NULL)
85 return -ENOMEM;
86 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
87 if (b->sum_weights == NULL)
88 return -ENOMEM;
89 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
90 for (j = 0; j < b->h.size; j++) {
91 b->item_weights[j] = ceph_decode_32(p);
92 b->sum_weights[j] = ceph_decode_32(p);
93 }
94 return 0;
95bad:
96 return -EINVAL;
97}
98
99static int crush_decode_tree_bucket(void **p, void *end,
100 struct crush_bucket_tree *b)
101{
102 int j;
103 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
104 ceph_decode_32_safe(p, end, b->num_nodes, bad);
105 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
106 if (b->node_weights == NULL)
107 return -ENOMEM;
108 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
109 for (j = 0; j < b->num_nodes; j++)
110 b->node_weights[j] = ceph_decode_32(p);
111 return 0;
112bad:
113 return -EINVAL;
114}
115
116static int crush_decode_straw_bucket(void **p, void *end,
117 struct crush_bucket_straw *b)
118{
119 int j;
120 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
121 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
122 if (b->item_weights == NULL)
123 return -ENOMEM;
124 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
125 if (b->straws == NULL)
126 return -ENOMEM;
127 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
128 for (j = 0; j < b->h.size; j++) {
129 b->item_weights[j] = ceph_decode_32(p);
130 b->straws[j] = ceph_decode_32(p);
131 }
132 return 0;
133bad:
134 return -EINVAL;
135}
136
137static struct crush_map *crush_decode(void *pbyval, void *end)
138{
139 struct crush_map *c;
140 int err = -EINVAL;
141 int i, j;
142 void **p = &pbyval;
143 void *start = pbyval;
144 u32 magic;
145
146 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
147
148 c = kzalloc(sizeof(*c), GFP_NOFS);
149 if (c == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 ceph_decode_need(p, end, 4*sizeof(u32), bad);
153 magic = ceph_decode_32(p);
154 if (magic != CRUSH_MAGIC) {
155 pr_err("crush_decode magic %x != current %x\n",
156 (unsigned)magic, (unsigned)CRUSH_MAGIC);
157 goto bad;
158 }
159 c->max_buckets = ceph_decode_32(p);
160 c->max_rules = ceph_decode_32(p);
161 c->max_devices = ceph_decode_32(p);
162
163 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
164 if (c->device_parents == NULL)
165 goto badmem;
166 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
167 if (c->bucket_parents == NULL)
168 goto badmem;
169
170 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
171 if (c->buckets == NULL)
172 goto badmem;
173 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
174 if (c->rules == NULL)
175 goto badmem;
176
177 /* buckets */
178 for (i = 0; i < c->max_buckets; i++) {
179 int size = 0;
180 u32 alg;
181 struct crush_bucket *b;
182
183 ceph_decode_32_safe(p, end, alg, bad);
184 if (alg == 0) {
185 c->buckets[i] = NULL;
186 continue;
187 }
188 dout("crush_decode bucket %d off %x %p to %p\n",
189 i, (int)(*p-start), *p, end);
190
191 switch (alg) {
192 case CRUSH_BUCKET_UNIFORM:
193 size = sizeof(struct crush_bucket_uniform);
194 break;
195 case CRUSH_BUCKET_LIST:
196 size = sizeof(struct crush_bucket_list);
197 break;
198 case CRUSH_BUCKET_TREE:
199 size = sizeof(struct crush_bucket_tree);
200 break;
201 case CRUSH_BUCKET_STRAW:
202 size = sizeof(struct crush_bucket_straw);
203 break;
204 default:
205 err = -EINVAL;
206 goto bad;
207 }
208 BUG_ON(size == 0);
209 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
210 if (b == NULL)
211 goto badmem;
212
213 ceph_decode_need(p, end, 4*sizeof(u32), bad);
214 b->id = ceph_decode_32(p);
215 b->type = ceph_decode_16(p);
216 b->alg = ceph_decode_8(p);
217 b->hash = ceph_decode_8(p);
218 b->weight = ceph_decode_32(p);
219 b->size = ceph_decode_32(p);
220
221 dout("crush_decode bucket size %d off %x %p to %p\n",
222 b->size, (int)(*p-start), *p, end);
223
224 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
225 if (b->items == NULL)
226 goto badmem;
227 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
228 if (b->perm == NULL)
229 goto badmem;
230 b->perm_n = 0;
231
232 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
233 for (j = 0; j < b->size; j++)
234 b->items[j] = ceph_decode_32(p);
235
236 switch (b->alg) {
237 case CRUSH_BUCKET_UNIFORM:
238 err = crush_decode_uniform_bucket(p, end,
239 (struct crush_bucket_uniform *)b);
240 if (err < 0)
241 goto bad;
242 break;
243 case CRUSH_BUCKET_LIST:
244 err = crush_decode_list_bucket(p, end,
245 (struct crush_bucket_list *)b);
246 if (err < 0)
247 goto bad;
248 break;
249 case CRUSH_BUCKET_TREE:
250 err = crush_decode_tree_bucket(p, end,
251 (struct crush_bucket_tree *)b);
252 if (err < 0)
253 goto bad;
254 break;
255 case CRUSH_BUCKET_STRAW:
256 err = crush_decode_straw_bucket(p, end,
257 (struct crush_bucket_straw *)b);
258 if (err < 0)
259 goto bad;
260 break;
261 }
262 }
263
264 /* rules */
265 dout("rule vec is %p\n", c->rules);
266 for (i = 0; i < c->max_rules; i++) {
267 u32 yes;
268 struct crush_rule *r;
269
270 ceph_decode_32_safe(p, end, yes, bad);
271 if (!yes) {
272 dout("crush_decode NO rule %d off %x %p to %p\n",
273 i, (int)(*p-start), *p, end);
274 c->rules[i] = NULL;
275 continue;
276 }
277
278 dout("crush_decode rule %d off %x %p to %p\n",
279 i, (int)(*p-start), *p, end);
280
281 /* len */
282 ceph_decode_32_safe(p, end, yes, bad);
283#if BITS_PER_LONG == 32
284 err = -EINVAL;
285 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
286 goto bad;
287#endif
288 r = c->rules[i] = kmalloc(sizeof(*r) +
289 yes*sizeof(struct crush_rule_step),
290 GFP_NOFS);
291 if (r == NULL)
292 goto badmem;
293 dout(" rule %d is at %p\n", i, r);
294 r->len = yes;
295 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
296 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
297 for (j = 0; j < r->len; j++) {
298 r->steps[j].op = ceph_decode_32(p);
299 r->steps[j].arg1 = ceph_decode_32(p);
300 r->steps[j].arg2 = ceph_decode_32(p);
301 }
302 }
303
304 /* ignore trailing name maps. */
305
306 dout("crush_decode success\n");
307 return c;
308
309badmem:
310 err = -ENOMEM;
311bad:
312 dout("crush_decode fail %d\n", err);
313 crush_destroy(c);
314 return ERR_PTR(err);
315}
316
317/*
318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
319 * to a set of osds)
320 */
321static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
322{
323 u64 a = *(u64 *)&l;
324 u64 b = *(u64 *)&r;
325
326 if (a < b)
327 return -1;
328 if (a > b)
329 return 1;
330 return 0;
331}
332
333static int __insert_pg_mapping(struct ceph_pg_mapping *new,
334 struct rb_root *root)
335{
336 struct rb_node **p = &root->rb_node;
337 struct rb_node *parent = NULL;
338 struct ceph_pg_mapping *pg = NULL;
339 int c;
340
341 while (*p) {
342 parent = *p;
343 pg = rb_entry(parent, struct ceph_pg_mapping, node);
344 c = pgid_cmp(new->pgid, pg->pgid);
345 if (c < 0)
346 p = &(*p)->rb_left;
347 else if (c > 0)
348 p = &(*p)->rb_right;
349 else
350 return -EEXIST;
351 }
352
353 rb_link_node(&new->node, parent, p);
354 rb_insert_color(&new->node, root);
355 return 0;
356}
357
358static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
359 struct ceph_pg pgid)
360{
361 struct rb_node *n = root->rb_node;
362 struct ceph_pg_mapping *pg;
363 int c;
364
365 while (n) {
366 pg = rb_entry(n, struct ceph_pg_mapping, node);
367 c = pgid_cmp(pgid, pg->pgid);
368 if (c < 0)
369 n = n->rb_left;
370 else if (c > 0)
371 n = n->rb_right;
372 else
373 return pg;
374 }
375 return NULL;
376}
377
378/*
379 * rbtree of pg pool info
380 */
381static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
382{
383 struct rb_node **p = &root->rb_node;
384 struct rb_node *parent = NULL;
385 struct ceph_pg_pool_info *pi = NULL;
386
387 while (*p) {
388 parent = *p;
389 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
390 if (new->id < pi->id)
391 p = &(*p)->rb_left;
392 else if (new->id > pi->id)
393 p = &(*p)->rb_right;
394 else
395 return -EEXIST;
396 }
397
398 rb_link_node(&new->node, parent, p);
399 rb_insert_color(&new->node, root);
400 return 0;
401}
402
403static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
404{
405 struct ceph_pg_pool_info *pi;
406 struct rb_node *n = root->rb_node;
407
408 while (n) {
409 pi = rb_entry(n, struct ceph_pg_pool_info, node);
410 if (id < pi->id)
411 n = n->rb_left;
412 else if (id > pi->id)
413 n = n->rb_right;
414 else
415 return pi;
416 }
417 return NULL;
418}
419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
427static void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
428{
429 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
430 calc_pg_masks(pi);
431 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
433}
434
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
436{
437 struct ceph_pg_pool_info *pi;
438 u32 num, len, pool;
439
440 ceph_decode_32_safe(p, end, num, bad);
441 dout(" %d pool names\n", num);
442 while (num--) {
443 ceph_decode_32_safe(p, end, pool, bad);
444 ceph_decode_32_safe(p, end, len, bad);
445 dout(" pool %d len %d\n", pool, len);
446 pi = __lookup_pg_pool(&map->pg_pools, pool);
447 if (pi) {
448 kfree(pi->name);
449 pi->name = kmalloc(len + 1, GFP_NOFS);
450 if (pi->name) {
451 memcpy(pi->name, *p, len);
452 pi->name[len] = '\0';
453 dout(" name is %s\n", pi->name);
454 }
455 }
456 *p += len;
457 }
458 return 0;
459
460bad:
461 return -EINVAL;
462}
463
464/*
465 * osd map
466 */
467void ceph_osdmap_destroy(struct ceph_osdmap *map)
468{
469 dout("osdmap_destroy %p\n", map);
470 if (map->crush)
471 crush_destroy(map->crush);
472 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
473 struct ceph_pg_mapping *pg =
474 rb_entry(rb_first(&map->pg_temp),
475 struct ceph_pg_mapping, node);
476 rb_erase(&pg->node, &map->pg_temp);
477 kfree(pg);
478 }
479 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
480 struct ceph_pg_pool_info *pi =
481 rb_entry(rb_first(&map->pg_pools),
482 struct ceph_pg_pool_info, node);
483 __remove_pg_pool(&map->pg_pools, pi);
484 }
485 kfree(map->osd_state);
486 kfree(map->osd_weight);
487 kfree(map->osd_addr);
488 kfree(map);
489}
490
491/*
492 * adjust max osd value. reallocate arrays.
493 */
494static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
495{
496 u8 *state;
497 struct ceph_entity_addr *addr;
498 u32 *weight;
499
500 state = kcalloc(max, sizeof(*state), GFP_NOFS);
501 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
502 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
503 if (state == NULL || addr == NULL || weight == NULL) {
504 kfree(state);
505 kfree(addr);
506 kfree(weight);
507 return -ENOMEM;
508 }
509
510 /* copy old? */
511 if (map->osd_state) {
512 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
513 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
514 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
515 kfree(map->osd_state);
516 kfree(map->osd_addr);
517 kfree(map->osd_weight);
518 }
519
520 map->osd_state = state;
521 map->osd_weight = weight;
522 map->osd_addr = addr;
523 map->max_osd = max;
524 return 0;
525}
526
527/*
528 * decode a full map.
529 */
530struct ceph_osdmap *osdmap_decode(void **p, void *end)
531{
532 struct ceph_osdmap *map;
533 u16 version;
534 u32 len, max, i;
535 u8 ev;
536 int err = -EINVAL;
537 void *start = *p;
538 struct ceph_pg_pool_info *pi;
539
540 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
541
542 map = kzalloc(sizeof(*map), GFP_NOFS);
543 if (map == NULL)
544 return ERR_PTR(-ENOMEM);
545 map->pg_temp = RB_ROOT;
546
547 ceph_decode_16_safe(p, end, version, bad);
548 if (version > CEPH_OSDMAP_VERSION) {
549 pr_warning("got unknown v %d > %d of osdmap\n", version,
550 CEPH_OSDMAP_VERSION);
551 goto bad;
552 }
553
554 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
555 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
556 map->epoch = ceph_decode_32(p);
557 ceph_decode_copy(p, &map->created, sizeof(map->created));
558 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
559
560 ceph_decode_32_safe(p, end, max, bad);
561 while (max--) {
562 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
563 pi = kzalloc(sizeof(*pi), GFP_NOFS);
564 if (!pi)
565 goto bad;
566 pi->id = ceph_decode_32(p);
567 ev = ceph_decode_8(p); /* encoding version */
568 if (ev > CEPH_PG_POOL_VERSION) {
569 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
570 ev, CEPH_PG_POOL_VERSION);
571 goto bad;
572 }
573 __decode_pool(p, pi);
574 __insert_pg_pool(&map->pg_pools, pi);
575 }
576
577 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
578 goto bad;
579
580 ceph_decode_32_safe(p, end, map->pool_max, bad);
581
582 ceph_decode_32_safe(p, end, map->flags, bad);
583
584 max = ceph_decode_32(p);
585
586 /* (re)alloc osd arrays */
587 err = osdmap_set_max_osd(map, max);
588 if (err < 0)
589 goto bad;
590 dout("osdmap_decode max_osd = %d\n", map->max_osd);
591
592 /* osds */
593 err = -EINVAL;
594 ceph_decode_need(p, end, 3*sizeof(u32) +
595 map->max_osd*(1 + sizeof(*map->osd_weight) +
596 sizeof(*map->osd_addr)), bad);
597 *p += 4; /* skip length field (should match max) */
598 ceph_decode_copy(p, map->osd_state, map->max_osd);
599
600 *p += 4; /* skip length field (should match max) */
601 for (i = 0; i < map->max_osd; i++)
602 map->osd_weight[i] = ceph_decode_32(p);
603
604 *p += 4; /* skip length field (should match max) */
605 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
606 for (i = 0; i < map->max_osd; i++)
607 ceph_decode_addr(&map->osd_addr[i]);
608
609 /* pg_temp */
610 ceph_decode_32_safe(p, end, len, bad);
611 for (i = 0; i < len; i++) {
612 int n, j;
613 struct ceph_pg pgid;
614 struct ceph_pg_mapping *pg;
615
616 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
617 ceph_decode_copy(p, &pgid, sizeof(pgid));
618 n = ceph_decode_32(p);
619 ceph_decode_need(p, end, n * sizeof(u32), bad);
620 err = -ENOMEM;
621 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
622 if (!pg)
623 goto bad;
624 pg->pgid = pgid;
625 pg->len = n;
626 for (j = 0; j < n; j++)
627 pg->osds[j] = ceph_decode_32(p);
628
629 err = __insert_pg_mapping(pg, &map->pg_temp);
630 if (err)
631 goto bad;
632		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, n);
633 }
634
635 /* crush */
636 ceph_decode_32_safe(p, end, len, bad);
637 dout("osdmap_decode crush len %d from off 0x%x\n", len,
638 (int)(*p - start));
639 ceph_decode_need(p, end, len, bad);
640 map->crush = crush_decode(*p, end);
641 *p += len;
642 if (IS_ERR(map->crush)) {
643 err = PTR_ERR(map->crush);
644 map->crush = NULL;
645 goto bad;
646 }
647
648 /* ignore the rest of the map */
649 *p = end;
650
651 dout("osdmap_decode done %p %p\n", *p, end);
652 return map;
653
654bad:
655 dout("osdmap_decode fail\n");
656 ceph_osdmap_destroy(map);
657 return ERR_PTR(err);
658}
659
660/*
661 * decode and apply an incremental map update.
662 */
663struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
664 struct ceph_osdmap *map,
665 struct ceph_messenger *msgr)
666{
667 struct crush_map *newcrush = NULL;
668 struct ceph_fsid fsid;
669 u32 epoch = 0;
670 struct ceph_timespec modified;
671 u32 len, pool;
672 __s32 new_pool_max, new_flags, max;
673 void *start = *p;
674 int err = -EINVAL;
675 u16 version;
676 struct rb_node *rbp;
677
678 ceph_decode_16_safe(p, end, version, bad);
679 if (version > CEPH_OSDMAP_INC_VERSION) {
680 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
681 CEPH_OSDMAP_INC_VERSION);
682 goto bad;
683 }
684
685 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
686 bad);
687 ceph_decode_copy(p, &fsid, sizeof(fsid));
688 epoch = ceph_decode_32(p);
689 BUG_ON(epoch != map->epoch+1);
690 ceph_decode_copy(p, &modified, sizeof(modified));
691 new_pool_max = ceph_decode_32(p);
692 new_flags = ceph_decode_32(p);
693
694 /* full map? */
695 ceph_decode_32_safe(p, end, len, bad);
696 if (len > 0) {
697 dout("apply_incremental full map len %d, %p to %p\n",
698 len, *p, end);
699 return osdmap_decode(p, min(*p+len, end));
700 }
701
702 /* new crush? */
703 ceph_decode_32_safe(p, end, len, bad);
704 if (len > 0) {
705 dout("apply_incremental new crush map len %d, %p to %p\n",
706 len, *p, end);
707 newcrush = crush_decode(*p, min(*p+len, end));
708 if (IS_ERR(newcrush))
709			return ERR_CAST(newcrush);
710 }
711
712 /* new flags? */
713 if (new_flags >= 0)
714 map->flags = new_flags;
715 if (new_pool_max >= 0)
716 map->pool_max = new_pool_max;
717
718 ceph_decode_need(p, end, 5*sizeof(u32), bad);
719
720 /* new max? */
721 max = ceph_decode_32(p);
722 if (max >= 0) {
723 err = osdmap_set_max_osd(map, max);
724 if (err < 0)
725 goto bad;
726 }
727
728 map->epoch++;
729	map->modified = modified;
730 if (newcrush) {
731 if (map->crush)
732 crush_destroy(map->crush);
733 map->crush = newcrush;
734 newcrush = NULL;
735 }
736
737 /* new_pool */
738 ceph_decode_32_safe(p, end, len, bad);
739 while (len--) {
740 __u8 ev;
741 struct ceph_pg_pool_info *pi;
742
743 ceph_decode_32_safe(p, end, pool, bad);
744 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
745 ev = ceph_decode_8(p); /* encoding version */
746 if (ev > CEPH_PG_POOL_VERSION) {
747 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
748 ev, CEPH_PG_POOL_VERSION);
749 goto bad;
750 }
751 pi = __lookup_pg_pool(&map->pg_pools, pool);
752 if (!pi) {
753 pi = kzalloc(sizeof(*pi), GFP_NOFS);
754 if (!pi) {
755 err = -ENOMEM;
756 goto bad;
757 }
758 pi->id = pool;
759 __insert_pg_pool(&map->pg_pools, pi);
760 }
761 __decode_pool(p, pi);
762 }
763 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
764 goto bad;
765
766 /* old_pool */
767 ceph_decode_32_safe(p, end, len, bad);
768 while (len--) {
769 struct ceph_pg_pool_info *pi;
770
771 ceph_decode_32_safe(p, end, pool, bad);
772 pi = __lookup_pg_pool(&map->pg_pools, pool);
773 if (pi)
774 __remove_pg_pool(&map->pg_pools, pi);
775 }
776
777 /* new_up */
778 err = -EINVAL;
779 ceph_decode_32_safe(p, end, len, bad);
780 while (len--) {
781 u32 osd;
782 struct ceph_entity_addr addr;
783 ceph_decode_32_safe(p, end, osd, bad);
784 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
785 ceph_decode_addr(&addr);
786 pr_info("osd%d up\n", osd);
787 BUG_ON(osd >= map->max_osd);
788 map->osd_state[osd] |= CEPH_OSD_UP;
789 map->osd_addr[osd] = addr;
790 }
791
792 /* new_down */
793 ceph_decode_32_safe(p, end, len, bad);
794 while (len--) {
795 u32 osd;
796 ceph_decode_32_safe(p, end, osd, bad);
797 (*p)++; /* clean flag */
798 pr_info("osd%d down\n", osd);
799 if (osd < map->max_osd)
800 map->osd_state[osd] &= ~CEPH_OSD_UP;
801 }
802
803 /* new_weight */
804 ceph_decode_32_safe(p, end, len, bad);
805 while (len--) {
806 u32 osd, off;
807 ceph_decode_need(p, end, sizeof(u32)*2, bad);
808 osd = ceph_decode_32(p);
809 off = ceph_decode_32(p);
810 pr_info("osd%d weight 0x%x %s\n", osd, off,
811 off == CEPH_OSD_IN ? "(in)" :
812 (off == CEPH_OSD_OUT ? "(out)" : ""));
813 if (osd < map->max_osd)
814 map->osd_weight[osd] = off;
815 }
816
817 /* new_pg_temp */
818 rbp = rb_first(&map->pg_temp);
819 ceph_decode_32_safe(p, end, len, bad);
820 while (len--) {
821 struct ceph_pg_mapping *pg;
822 int j;
823 struct ceph_pg pgid;
824 u32 pglen;
825 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
826 ceph_decode_copy(p, &pgid, sizeof(pgid));
827 pglen = ceph_decode_32(p);
828
829 /* remove any? */
830 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
831 node)->pgid, pgid) <= 0) {
832 struct rb_node *cur = rbp;
833 rbp = rb_next(rbp);
834 dout(" removed pg_temp %llx\n",
835 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
836 node)->pgid);
837 rb_erase(cur, &map->pg_temp);
838 }
839
840 if (pglen) {
841 /* insert */
842 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
843 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
844 if (!pg) {
845 err = -ENOMEM;
846 goto bad;
847 }
848 pg->pgid = pgid;
849 pg->len = pglen;
850 for (j = 0; j < pglen; j++)
851 pg->osds[j] = ceph_decode_32(p);
852 err = __insert_pg_mapping(pg, &map->pg_temp);
853 if (err)
854 goto bad;
855 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
856 pglen);
857 }
858 }
859 while (rbp) {
860 struct rb_node *cur = rbp;
861 rbp = rb_next(rbp);
862 dout(" removed pg_temp %llx\n",
863 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
864 node)->pgid);
865 rb_erase(cur, &map->pg_temp);
866 }
867
868 /* ignore the rest */
869 *p = end;
870 return map;
871
872bad:
873 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
874 epoch, (int)(*p - start), *p, start, end);
875 print_hex_dump(KERN_DEBUG, "osdmap: ",
876 DUMP_PREFIX_OFFSET, 16, 1,
877 start, end - start, true);
878 if (newcrush)
879 crush_destroy(newcrush);
880 return ERR_PTR(err);
881}
882
883
884
885
886/*
887 * calculate file layout from given offset, length.
888 * fill in correct oid, logical length, and object extent
889 * offset, length.
890 *
891 * for now, we write only a single su, until we can
892 * pass a stride back to the caller.
893 */
894void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
895 u64 off, u64 *plen,
896 u64 *ono,
897 u64 *oxoff, u64 *oxlen)
898{
899 u32 osize = le32_to_cpu(layout->fl_object_size);
900 u32 su = le32_to_cpu(layout->fl_stripe_unit);
901 u32 sc = le32_to_cpu(layout->fl_stripe_count);
902 u32 bl, stripeno, stripepos, objsetno;
903 u32 su_per_object;
904 u64 t, su_offset;
905
906 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
907 osize, su);
908 su_per_object = osize / su;
909 dout("osize %u / su %u = su_per_object %u\n", osize, su,
910 su_per_object);
911
912 BUG_ON((su & ~PAGE_MASK) != 0);
913 /* bl = *off / su; */
914 t = off;
915 do_div(t, su);
916 bl = t;
917 dout("off %llu / su %u = bl %u\n", off, su, bl);
918
919 stripeno = bl / sc;
920 stripepos = bl % sc;
921 objsetno = stripeno / su_per_object;
922
923 *ono = objsetno * sc + stripepos;
924 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
925
926 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
927 t = off;
928 su_offset = do_div(t, su);
929 *oxoff = su_offset + (stripeno % su_per_object) * su;
930
931 /*
932 * Calculate the length of the extent being written to the selected
933 * object. This is the minimum of the full length requested (plen) or
934 * the remainder of the current stripe being written to.
935 */
936 *oxlen = min_t(u64, *plen, su - su_offset);
937 *plen = *oxlen;
938
939 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
940}
941
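/*
 * Worked example of the striping arithmetic above, using hypothetical
 * layout values (object_size = 4 MB, stripe_unit = 1 MB, stripe_count = 3)
 * and a file offset of 7 MB:
 *
 *   su_per_object = 4 MB / 1 MB   = 4
 *   bl            = 7 MB / 1 MB   = 7    (stripe unit number)
 *   stripeno      = 7 / 3         = 2
 *   stripepos     = 7 % 3         = 1
 *   objsetno      = 2 / 4         = 0
 *   *ono          = 0 * 3 + 1     = 1    (second object of the set)
 *   *oxoff        = 0 + (2 % 4) * 1 MB = 2 MB  (offset within object 1)
 *   *oxlen        = min(*plen, 1 MB - 0)       (clipped to the stripe unit)
 */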
942/*
943 * calculate an object layout (i.e. pgid) from an oid,
944 * file_layout, and osdmap
945 */
946int ceph_calc_object_layout(struct ceph_object_layout *ol,
947 const char *oid,
948 struct ceph_file_layout *fl,
949 struct ceph_osdmap *osdmap)
950{
951 unsigned num, num_mask;
952 struct ceph_pg pgid;
953 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
954 int poolid = le32_to_cpu(fl->fl_pg_pool);
955 struct ceph_pg_pool_info *pool;
956 unsigned ps;
957
958 BUG_ON(!osdmap);
959
960 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
961 if (!pool)
962 return -EIO;
963 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
964 if (preferred >= 0) {
965 ps += preferred;
966 num = le32_to_cpu(pool->v.lpg_num);
967 num_mask = pool->lpg_num_mask;
968 } else {
969 num = le32_to_cpu(pool->v.pg_num);
970 num_mask = pool->pg_num_mask;
971 }
972
973 pgid.ps = cpu_to_le16(ps);
974 pgid.preferred = cpu_to_le16(preferred);
975 pgid.pool = fl->fl_pg_pool;
976 if (preferred >= 0)
977 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
978 (int)preferred);
979 else
980 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
981
982 ol->ol_pgid = pgid;
983 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
984 return 0;
985}
986
987/*
988 * Calculate raw osd vector for the given pgid. Return pointer to osd
989 * array, or NULL on failure.
990 */
991static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
992 int *osds, int *num)
993{
994 struct ceph_pg_mapping *pg;
995 struct ceph_pg_pool_info *pool;
996 int ruleno;
997 unsigned poolid, ps, pps;
998 int preferred;
999
1000 /* pg_temp? */
1001 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1002 if (pg) {
1003 *num = pg->len;
1004 return pg->osds;
1005 }
1006
1007 /* crush */
1008 poolid = le32_to_cpu(pgid.pool);
1009 ps = le16_to_cpu(pgid.ps);
1010 preferred = (s16)le16_to_cpu(pgid.preferred);
1011
1012 /* don't forcefeed bad device ids to crush */
1013 if (preferred >= osdmap->max_osd ||
1014 preferred >= osdmap->crush->max_devices)
1015 preferred = -1;
1016
1017 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
1018 if (!pool)
1019 return NULL;
1020 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1021 pool->v.type, pool->v.size);
1022 if (ruleno < 0) {
1023 pr_err("no crush rule pool %d type %d size %d\n",
1024 poolid, pool->v.type, pool->v.size);
1025 return NULL;
1026 }
1027
1028 if (preferred >= 0)
1029 pps = ceph_stable_mod(ps,
1030 le32_to_cpu(pool->v.lpgp_num),
1031 pool->lpgp_num_mask);
1032 else
1033 pps = ceph_stable_mod(ps,
1034 le32_to_cpu(pool->v.pgp_num),
1035 pool->pgp_num_mask);
1036 pps += poolid;
1037 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
1038 min_t(int, pool->v.size, *num),
1039 preferred, osdmap->osd_weight);
1040 return osds;
1041}
1042
1043/*
1044 * Return primary osd for given pgid, or -1 if none.
1045 */
1046int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1047{
1048 int rawosds[10], *osds;
1049 int i, num = ARRAY_SIZE(rawosds);
1050
1051 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1052 if (!osds)
1053 return -1;
1054
1055 /* primary is first up osd */
1056 for (i = 0; i < num; i++)
1057 if (ceph_osd_is_up(osdmap, osds[i]))
1058 return osds[i];
1061 return -1;
1062}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..8bc9f1e4f562
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,126 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
27};
28
29struct ceph_pg_mapping {
30 struct rb_node node;
31 struct ceph_pg pgid;
32 int len;
33 int osds[];
34};
35
36struct ceph_osdmap {
37 struct ceph_fsid fsid;
38 u32 epoch;
39 u32 mkfs_epoch;
40 struct ceph_timespec created, modified;
41
42 u32 flags; /* CEPH_OSDMAP_* */
43
44 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
45 u8 *osd_state; /* CEPH_OSD_* */
46 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
47 struct ceph_entity_addr *osd_addr;
48
49 struct rb_root pg_temp;
50 struct rb_root pg_pools;
51 u32 pool_max;
52
53 /* the CRUSH map specifies the mapping of placement groups to
54 * the list of osds that store+replicate them. */
55 struct crush_map *crush;
56};
57
58/*
59 * file layout helpers
60 */
61#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
62#define ceph_file_layout_stripe_count(l) \
63 ((__s32)le32_to_cpu((l).fl_stripe_count))
64#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
65#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
66#define ceph_file_layout_object_su(l) \
67 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
68#define ceph_file_layout_pg_preferred(l) \
69 ((__s32)le32_to_cpu((l).fl_pg_preferred))
70#define ceph_file_layout_pg_pool(l) \
71 ((__s32)le32_to_cpu((l).fl_pg_pool))
72
73static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
74{
75 return le32_to_cpu(l->fl_stripe_unit) *
76 le32_to_cpu(l->fl_stripe_count);
77}
78
79/* "period" == bytes before i start on a new set of objects */
80static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
81{
82 return le32_to_cpu(l->fl_object_size) *
83 le32_to_cpu(l->fl_stripe_count);
84}
85
86
87static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
88{
89 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
90}
91
92static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
93{
94 return map && (map->flags & flag);
95}
96
97extern char *ceph_osdmap_state_str(char *str, int len, int state);
98
99static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
100 int osd)
101{
102 if (osd >= map->max_osd)
103 return NULL;
104 return &map->osd_addr[osd];
105}
106
107extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
108extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
109 struct ceph_osdmap *map,
110 struct ceph_messenger *msgr);
111extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
112
113/* calculate mapping of a file extent to an object */
114extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
115 u64 off, u64 *plen,
116 u64 *bno, u64 *oxoff, u64 *oxlen);
117
118/* calculate mapping of object to a placement group */
119extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
120 const char *oid,
121 struct ceph_file_layout *fl,
122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
124 struct ceph_pg pgid);
125
126#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..5f8dbf7c745a
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,55 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8int ceph_pagelist_release(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail)
11 kunmap(pl->mapped_tail);
12 while (!list_empty(&pl->head)) {
13 struct page *page = list_first_entry(&pl->head, struct page,
14 lru);
15 list_del(&page->lru);
16 __free_page(page);
17 }
18 return 0;
19}
20
21static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
22{
23 struct page *page = alloc_page(GFP_NOFS);
24 if (!page)
25 return -ENOMEM;
26 pl->room += PAGE_SIZE;
27 list_add_tail(&page->lru, &pl->head);
28 if (pl->mapped_tail)
29 kunmap(pl->mapped_tail);
30 pl->mapped_tail = kmap(page);
31 return 0;
32}
33
34int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
35{
36 while (pl->room < len) {
37 size_t bit = pl->room;
38 int ret;
39
40 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
41 buf, bit);
42 pl->length += bit;
43 pl->room -= bit;
44 buf += bit;
45 len -= bit;
46 ret = ceph_pagelist_addpage(pl);
47 if (ret)
48 return ret;
49 }
50
51 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
52 pl->length += len;
53 pl->room -= len;
54 return 0;
55}
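A minimal usage sketch for the pagelist above (illustrative only; example_build_payload is hypothetical and not part of this patch, and it assumes the pagelist.h declarations):

static int example_build_payload(struct ceph_pagelist *pl, void *data, size_t len)
{
	int err;

	ceph_pagelist_init(pl);            /* empty list; no mapped tail yet */
	err = ceph_pagelist_append(pl, data, len);
	if (err) {
		ceph_pagelist_release(pl); /* frees any pages already added */
		return err;                /* -ENOMEM from alloc_page */
	}
	/* pl->head now holds the copied data; the caller releases it
	 * with ceph_pagelist_release() once the message has been sent. */
	return 0;
}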
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
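As an illustration of how the encode helpers above compose (example_encode_tag is hypothetical and not part of this patch; strlen comes from <linux/string.h>):

/* encode a u32 version followed by a length-prefixed string */
static int example_encode_tag(struct ceph_pagelist *pl, u32 version, char *name)
{
	int ret = ceph_pagelist_encode_32(pl, version);  /* __le32 on the wire */
	if (ret)
		return ret;
	return ceph_pagelist_encode_string(pl, name, strlen(name));
}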
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..a1fc1d017b58
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,376 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
18
19/*
20 * fs id
21 */
22struct ceph_fsid {
23 unsigned char fsid[16];
24};
25
26static inline int ceph_fsid_compare(const struct ceph_fsid *a,
27 const struct ceph_fsid *b)
28{
29 return memcmp(a, b, sizeof(*a));
30}
31
32/*
33 * ino, object, etc.
34 */
35typedef __le64 ceph_snapid_t;
36#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
37#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
38#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
39
40struct ceph_timespec {
41 __le32 tv_sec;
42 __le32 tv_nsec;
43} __attribute__ ((packed));
44
45
46/*
47 * object layout - how objects are mapped into PGs
48 */
49#define CEPH_OBJECT_LAYOUT_HASH 1
50#define CEPH_OBJECT_LAYOUT_LINEAR 2
51#define CEPH_OBJECT_LAYOUT_HASHINO 3
52
53/*
54 * pg layout -- how PGs are mapped onto (sets of) OSDs
55 */
56#define CEPH_PG_LAYOUT_CRUSH 0
57#define CEPH_PG_LAYOUT_HASH 1
58#define CEPH_PG_LAYOUT_LINEAR 2
59#define CEPH_PG_LAYOUT_HYBRID 3
60
61
62/*
63 * placement group.
64 * we encode this into one __le64.
65 */
66struct ceph_pg {
67 __le16 preferred; /* preferred primary osd */
68 __le16 ps; /* placement seed */
69 __le32 pool; /* object pool */
70} __attribute__ ((packed));
71
72/*
73 * pg_pool is a set of pgs storing a pool of objects
74 *
75 * pg_num -- base number of pseudorandomly placed pgs
76 *
77 * pgp_num -- effective number when calculating pg placement. this
78 * is used for pg_num increases. new pgs result in data being "split"
79 * into new pgs. for this to proceed smoothly, new pgs are initially
80 * colocated with their parents; that is, pgp_num doesn't increase
81 * until the new pgs have successfully split. only _then_ are the new
82 * pgs placed independently.
83 *
84 * lpg_num -- localized pg count (per device). replicas are randomly
85 * selected.
86 *
87 * lpgp_num -- as above.
88 */
89#define CEPH_PG_TYPE_REP 1
90#define CEPH_PG_TYPE_RAID4 2
91#define CEPH_PG_POOL_VERSION 2
92struct ceph_pg_pool {
93 __u8 type; /* CEPH_PG_TYPE_* */
94 __u8 size; /* number of osds in each pg */
95 __u8 crush_ruleset; /* crush placement rule */
96 __u8 object_hash; /* hash mapping object name to ps */
97 __le32 pg_num, pgp_num; /* number of pg's */
98 __le32 lpg_num, lpgp_num; /* number of localized pg's */
99 __le32 last_change; /* most recent epoch changed */
100 __le64 snap_seq; /* seq for per-pool snapshot */
101 __le32 snap_epoch; /* epoch of last snap */
102 __le32 num_snaps;
103 __le32 num_removed_snap_intervals;
104 __le64 uid;
105} __attribute__ ((packed));
106
107/*
108 * stable_mod func is used to control number of placement groups.
109 * similar to straight-up modulo, but produces a stable mapping as b
110 * increases over time. b is the number of bins, and bmask is the
111 * containing power of 2 minus 1.
112 *
113 * b <= bmask and bmask=(2**n)-1
114 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
115 */
116static inline int ceph_stable_mod(int x, int b, int bmask)
117{
118 if ((x & bmask) < b)
119 return x & bmask;
120 else
121 return x & (bmask >> 1);
122}
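/*
 * Worked example (illustration, not part of the original comment):
 * with b = 12 bins and bmask = 15, inputs 0..11 map to themselves and
 * 12..15 fold into the lower half (12->4, 13->5, 14->6, 15->7).
 * If b later grows to 14 (bmask still 15), only inputs 12 and 13 move
 * (they now map to themselves); every other input keeps its old bin,
 * which is what makes the mapping stable as pg counts increase.
 */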
123
124/*
125 * object layout - how a given object should be stored.
126 */
127struct ceph_object_layout {
128 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
129 __le32 ol_stripe_unit; /* for per-object parity, if any */
130} __attribute__ ((packed));
131
132/*
133 * compound epoch+version, used by storage layer to serialize mutations
134 */
135struct ceph_eversion {
136 __le32 epoch;
137 __le64 version;
138} __attribute__ ((packed));
139
140/*
141 * osd map bits
142 */
143
144/* status bits */
145#define CEPH_OSD_EXISTS 1
146#define CEPH_OSD_UP 2
147
148/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
149#define CEPH_OSD_IN 0x10000
150#define CEPH_OSD_OUT 0
151
152
153/*
154 * osd map flag bits
155 */
156#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
157#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
158#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
159#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
160#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
161
162/*
163 * osd ops
164 */
165#define CEPH_OSD_OP_MODE 0xf000
166#define CEPH_OSD_OP_MODE_RD 0x1000
167#define CEPH_OSD_OP_MODE_WR 0x2000
168#define CEPH_OSD_OP_MODE_RMW 0x3000
169#define CEPH_OSD_OP_MODE_SUB 0x4000
170
171#define CEPH_OSD_OP_TYPE 0x0f00
172#define CEPH_OSD_OP_TYPE_LOCK 0x0100
173#define CEPH_OSD_OP_TYPE_DATA 0x0200
174#define CEPH_OSD_OP_TYPE_ATTR 0x0300
175#define CEPH_OSD_OP_TYPE_EXEC 0x0400
176#define CEPH_OSD_OP_TYPE_PG 0x0500
177
178enum {
179 /** data **/
180 /* read */
181 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
182 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
183
184 /* fancy read */
185 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
186
187 /* write */
188 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
189 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
190 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
191 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
192 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
193
194 /* fancy write */
195 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
196 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
197 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
198 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
199
200 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
201 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
202 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
203
204 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
205
206 /** attrs **/
207 /* read */
208 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
209 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
210
211 /* write */
212 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
213 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
214 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
215 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
216
217 /** subop **/
218 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
219 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
220 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
221 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
222 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
223
224 /** lock **/
225 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
226 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
227 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
228 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
229 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
230 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
231
232 /** exec **/
233 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
234
235 /** pg **/
236 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
237};
238
239static inline int ceph_osd_op_type_lock(int op)
240{
241 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
242}
243static inline int ceph_osd_op_type_data(int op)
244{
245 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
246}
247static inline int ceph_osd_op_type_attr(int op)
248{
249 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
250}
251static inline int ceph_osd_op_type_exec(int op)
252{
253 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
254}
255static inline int ceph_osd_op_type_pg(int op)
256{
257 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
258}
259
260static inline int ceph_osd_op_mode_subop(int op)
261{
262 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
263}
264static inline int ceph_osd_op_mode_read(int op)
265{
266 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
267}
268static inline int ceph_osd_op_mode_modify(int op)
269{
270 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
271}
272
273#define CEPH_OSD_TMAP_HDR 'h'
274#define CEPH_OSD_TMAP_SET 's'
275#define CEPH_OSD_TMAP_RM 'r'
276
277extern const char *ceph_osd_op_name(int op);
278
279
280/*
281 * osd op flags
282 *
283 * An op may be READ, WRITE, or READ|WRITE.
284 */
285enum {
286 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
287 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
288 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
289 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
290 CEPH_OSD_FLAG_READ = 16, /* op may read */
291 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
292 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
293 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
294 CEPH_OSD_FLAG_BALANCE_READS = 256,
295 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
296 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
297 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
298};
299
300enum {
301 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
302};
303
304#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
305#define EBLACKLISTED ESHUTDOWN /* blacklisted */
306
307/*
308 * an individual object operation. each may be accompanied by some data
309 * payload
310 */
311struct ceph_osd_op {
312 __le16 op; /* CEPH_OSD_OP_* */
313 __le32 flags; /* CEPH_OSD_FLAG_* */
314 union {
315 struct {
316 __le64 offset, length;
317 __le64 truncate_size;
318 __le32 truncate_seq;
319 } __attribute__ ((packed)) extent;
320 struct {
321 __le32 name_len;
322 __le32 value_len;
323 } __attribute__ ((packed)) xattr;
324 struct {
325 __u8 class_len;
326 __u8 method_len;
327 __u8 argc;
328 __le32 indata_len;
329 } __attribute__ ((packed)) cls;
330 struct {
331 __le64 cookie, count;
332 } __attribute__ ((packed)) pgls;
333 };
334 __le32 payload_len;
335} __attribute__ ((packed));
336
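/*
 * Illustration (a sketch, not part of this patch): filling in a single
 * extent-read op using the definitions above. cpu_to_le16/64 come from
 * the kernel byteorder helpers, memset from <linux/string.h>.
 */
static inline void example_fill_read_op(struct ceph_osd_op *op, __u64 off, __u64 len)
{
	memset(op, 0, sizeof(*op));
	op->op = cpu_to_le16(CEPH_OSD_OP_READ);
	op->extent.offset = cpu_to_le64(off);
	op->extent.length = cpu_to_le64(len);
	/* truncate_size/truncate_seq stay 0 unless racing a truncate */
	op->payload_len = 0;            /* reads carry no outbound payload */
}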
337/*
338 * osd request message header. each request may include multiple
339 * ceph_osd_op object operations.
340 */
341struct ceph_osd_request_head {
342 __le32 client_inc; /* client incarnation */
343 struct ceph_object_layout layout; /* pgid */
344 __le32 osdmap_epoch; /* client's osdmap epoch */
345
346 __le32 flags;
347
348 struct ceph_timespec mtime; /* for mutations only */
349 struct ceph_eversion reassert_version; /* if we are replaying op */
350
351 __le32 object_len; /* length of object name */
352
353 __le64 snapid; /* snapid to read */
354 __le64 snap_seq; /* writer's snap context */
355 __le32 num_snaps;
356
357 __le16 num_ops;
358 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
359} __attribute__ ((packed));
360
361struct ceph_osd_reply_head {
362 __le32 client_inc; /* client incarnation */
363 __le32 flags;
364 struct ceph_object_layout layout;
365 __le32 osdmap_epoch;
366 struct ceph_eversion reassert_version; /* for replaying uncommitted */
367
368 __le32 result; /* result code */
369
370 __le32 object_len; /* length of object name */
371 __le32 num_ops;
372 struct ceph_osd_op ops[0]; /* ops[], object */
373} __attribute__ ((packed));
374
375
376#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..2b881262ef67
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,907 @@
1#include "ceph_debug.h"
2
3#include <linux/sort.h>
4#include <linux/slab.h>
5
6#include "super.h"
7#include "decode.h"
8
9/*
10 * Snapshots in ceph are driven in large part by cooperation from the
11 * client. In contrast to local file systems or file servers that
12 * implement snapshots at a single point in the system, ceph's
13 * distributed access to storage requires clients to help decide
14 * whether a write logically occurs before or after a recently created
15 * snapshot.
16 *
17 * This provides a perfect instantaneous client-wide snapshot. Between
18 * clients, however, snapshots may appear to be applied at slightly
19 * different points in time, depending on delays in delivering the
20 * snapshot notification.
21 *
22 * Snapshots are _not_ file system-wide. Instead, each snapshot
23 * applies to the subdirectory nested beneath some directory. This
24 * effectively divides the hierarchy into multiple "realms," where all
25 * of the files contained by each realm share the same set of
26 * snapshots. An individual realm's snap set contains snapshots
27 * explicitly created on that realm, as well as any snaps in its
28 * parent's snap set _after_ the point at which the parent became its
29 * parent (due to, say, a rename). Similarly, snaps from prior parents are
30 * included for the intervals during which they were the parent.
31 *
32 * The client is spared most of this detail, fortunately... it need only
33 * maintain a hierarchy of realms reflecting the current parent/child
34 * realm relationship, and for each realm has an explicit list of snaps
35 * inherited from prior parents.
36 *
37 * A snap_realm struct is maintained for realms containing every inode
38 * with an open cap in the system. (The needed snap realm information is
39 * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
40 * version number is used to ensure that as realm parameters change (new
41 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
42 *
43 * The realm hierarchy drives the generation of a 'snap context' for each
44 * realm, which simply lists the resulting set of snaps for the realm. This
45 * is attached to any writes sent to OSDs.
46 */
47/*
48 * Unfortunately error handling is a bit mixed here. If we get a snap
49 * update, but don't have enough memory to update our realm hierarchy,
50 * it's not clear what we can do about it (besides complaining to the
51 * console).
52 */
53
54
55/*
56 * increase ref count for the realm
57 *
58 * caller must hold snap_rwsem for write.
59 */
60void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
61 struct ceph_snap_realm *realm)
62{
63 dout("get_realm %p %d -> %d\n", realm,
64 atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
65 /*
66 * since we _only_ increment realm refs or empty the empty
67 * list with snap_rwsem held, adjusting the empty list here is
68 * safe. we do need to protect against concurrent empty list
69 * additions, however.
70 */
71 if (atomic_read(&realm->nref) == 0) {
72 spin_lock(&mdsc->snap_empty_lock);
73 list_del_init(&realm->empty_item);
74 spin_unlock(&mdsc->snap_empty_lock);
75 }
76
77 atomic_inc(&realm->nref);
78}
79
80static void __insert_snap_realm(struct rb_root *root,
81 struct ceph_snap_realm *new)
82{
83 struct rb_node **p = &root->rb_node;
84 struct rb_node *parent = NULL;
85 struct ceph_snap_realm *r = NULL;
86
87 while (*p) {
88 parent = *p;
89 r = rb_entry(parent, struct ceph_snap_realm, node);
90 if (new->ino < r->ino)
91 p = &(*p)->rb_left;
92 else if (new->ino > r->ino)
93 p = &(*p)->rb_right;
94 else
95 BUG();
96 }
97
98 rb_link_node(&new->node, parent, p);
99 rb_insert_color(&new->node, root);
100}
101
102/*
103 * create and get the realm rooted at @ino and bump its ref count.
104 *
105 * caller must hold snap_rwsem for write.
106 */
107static struct ceph_snap_realm *ceph_create_snap_realm(
108 struct ceph_mds_client *mdsc,
109 u64 ino)
110{
111 struct ceph_snap_realm *realm;
112
113 realm = kzalloc(sizeof(*realm), GFP_NOFS);
114 if (!realm)
115 return ERR_PTR(-ENOMEM);
116
117 atomic_set(&realm->nref, 0); /* tree does not take a ref */
118 realm->ino = ino;
119 INIT_LIST_HEAD(&realm->children);
120 INIT_LIST_HEAD(&realm->child_item);
121 INIT_LIST_HEAD(&realm->empty_item);
122 INIT_LIST_HEAD(&realm->inodes_with_caps);
123 spin_lock_init(&realm->inodes_with_caps_lock);
124 __insert_snap_realm(&mdsc->snap_realms, realm);
125 dout("create_snap_realm %llx %p\n", realm->ino, realm);
126 return realm;
127}
128
129/*
130 * lookup the realm rooted at @ino.
131 *
132 * caller must hold snap_rwsem for write.
133 */
134struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
135 u64 ino)
136{
137 struct rb_node *n = mdsc->snap_realms.rb_node;
138 struct ceph_snap_realm *r;
139
140 while (n) {
141 r = rb_entry(n, struct ceph_snap_realm, node);
142 if (ino < r->ino)
143 n = n->rb_left;
144 else if (ino > r->ino)
145 n = n->rb_right;
146 else {
147 dout("lookup_snap_realm %llx %p\n", r->ino, r);
148 return r;
149 }
150 }
151 return NULL;
152}
153
154static void __put_snap_realm(struct ceph_mds_client *mdsc,
155 struct ceph_snap_realm *realm);
156
157/*
158 * called with snap_rwsem (write)
159 */
160static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
161 struct ceph_snap_realm *realm)
162{
163 dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
164
165 rb_erase(&realm->node, &mdsc->snap_realms);
166
167 if (realm->parent) {
168 list_del_init(&realm->child_item);
169 __put_snap_realm(mdsc, realm->parent);
170 }
171
172 kfree(realm->prior_parent_snaps);
173 kfree(realm->snaps);
174 ceph_put_snap_context(realm->cached_context);
175 kfree(realm);
176}
177
178/*
179 * caller holds snap_rwsem (write)
180 */
181static void __put_snap_realm(struct ceph_mds_client *mdsc,
182 struct ceph_snap_realm *realm)
183{
184 dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
185 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
186 if (atomic_dec_and_test(&realm->nref))
187 __destroy_snap_realm(mdsc, realm);
188}
189
190/*
191 * caller needn't hold any locks
192 */
193void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
194 struct ceph_snap_realm *realm)
195{
196 dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
197 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
198 if (!atomic_dec_and_test(&realm->nref))
199 return;
200
201 if (down_write_trylock(&mdsc->snap_rwsem)) {
202 __destroy_snap_realm(mdsc, realm);
203 up_write(&mdsc->snap_rwsem);
204 } else {
205 spin_lock(&mdsc->snap_empty_lock);
206 list_add(&mdsc->snap_empty, &realm->empty_item);
207 spin_unlock(&mdsc->snap_empty_lock);
208 }
209}
210
211/*
212 * Clean up any realms whose ref counts have dropped to zero. Note
213 * that this does not include realms that were created but not yet
214 * used.
215 *
216 * Called under snap_rwsem (write)
217 */
218static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
219{
220 struct ceph_snap_realm *realm;
221
222 spin_lock(&mdsc->snap_empty_lock);
223 while (!list_empty(&mdsc->snap_empty)) {
224 realm = list_first_entry(&mdsc->snap_empty,
225 struct ceph_snap_realm, empty_item);
226 list_del(&realm->empty_item);
227 spin_unlock(&mdsc->snap_empty_lock);
228 __destroy_snap_realm(mdsc, realm);
229 spin_lock(&mdsc->snap_empty_lock);
230 }
231 spin_unlock(&mdsc->snap_empty_lock);
232}
233
234void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
235{
236 down_write(&mdsc->snap_rwsem);
237 __cleanup_empty_realms(mdsc);
238 up_write(&mdsc->snap_rwsem);
239}
240
241/*
242 * adjust the parent realm of a given @realm. adjust the child list, parent
243 * pointers, and ref counts appropriately.
244 *
245 * return 1 if the parent was changed, 0 if unchanged, <0 on error.
246 *
247 * caller must hold snap_rwsem for write.
248 */
249static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
250 struct ceph_snap_realm *realm,
251 u64 parentino)
252{
253 struct ceph_snap_realm *parent;
254
255 if (realm->parent_ino == parentino)
256 return 0;
257
258 parent = ceph_lookup_snap_realm(mdsc, parentino);
259 if (!parent) {
260 parent = ceph_create_snap_realm(mdsc, parentino);
261 if (IS_ERR(parent))
262 return PTR_ERR(parent);
263 }
264 dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
265 realm->ino, realm, realm->parent_ino, realm->parent,
266 parentino, parent);
267 if (realm->parent) {
268 list_del_init(&realm->child_item);
269 ceph_put_snap_realm(mdsc, realm->parent);
270 }
271 realm->parent_ino = parentino;
272 realm->parent = parent;
273 ceph_get_snap_realm(mdsc, parent);
274 list_add(&realm->child_item, &parent->children);
275 return 1;
276}
277
278
279static int cmpu64_rev(const void *a, const void *b)
280{
281 if (*(u64 *)a < *(u64 *)b)
282 return 1;
283 if (*(u64 *)a > *(u64 *)b)
284 return -1;
285 return 0;
286}
287
288/*
289 * build the snap context for a given realm.
290 */
291static int build_snap_context(struct ceph_snap_realm *realm)
292{
293 struct ceph_snap_realm *parent = realm->parent;
294 struct ceph_snap_context *snapc;
295 int err = 0;
296 int i;
297 int num = realm->num_prior_parent_snaps + realm->num_snaps;
298
299 /*
300 * build parent context, if it hasn't been built.
301 * conservatively estimate that all parent snaps might be
302 * included by us.
303 */
304 if (parent) {
305 if (!parent->cached_context) {
306 err = build_snap_context(parent);
307 if (err)
308 goto fail;
309 }
310 num += parent->cached_context->num_snaps;
311 }
312
313 /* do i actually need to update? not if my context seq
314 matches the realm seq, and my parent's does too. (this works
315 because rebuild_snap_realms() works _downward_ in the
316 hierarchy after each update.) */
317 if (realm->cached_context &&
318 realm->cached_context->seq == realm->seq &&
319 (!parent ||
320 realm->cached_context->seq >= parent->cached_context->seq)) {
321 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
322 " (unchanged)\n",
323 realm->ino, realm, realm->cached_context,
324 realm->cached_context->seq,
325 realm->cached_context->num_snaps);
326 return 0;
327 }
328
329 /* alloc new snap context */
330 err = -ENOMEM;
331 if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
332 goto fail;
333 snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
334 if (!snapc)
335 goto fail;
336 atomic_set(&snapc->nref, 1);
337
338 /* build (reverse sorted) snap vector */
339 num = 0;
340 snapc->seq = realm->seq;
341 if (parent) {
342 /* include any of parent's snaps occurring _after_ my
343 parent became my parent */
344 for (i = 0; i < parent->cached_context->num_snaps; i++)
345 if (parent->cached_context->snaps[i] >=
346 realm->parent_since)
347 snapc->snaps[num++] =
348 parent->cached_context->snaps[i];
349 if (parent->cached_context->seq > snapc->seq)
350 snapc->seq = parent->cached_context->seq;
351 }
352 memcpy(snapc->snaps + num, realm->snaps,
353 sizeof(u64)*realm->num_snaps);
354 num += realm->num_snaps;
355 memcpy(snapc->snaps + num, realm->prior_parent_snaps,
356 sizeof(u64)*realm->num_prior_parent_snaps);
357 num += realm->num_prior_parent_snaps;
358
359 sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
360 snapc->num_snaps = num;
361 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
362 realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
363
364 if (realm->cached_context)
365 ceph_put_snap_context(realm->cached_context);
366 realm->cached_context = snapc;
367 return 0;
368
369fail:
370 /*
371 * if we fail, clear old (incorrect) cached_context... hopefully
372 * we'll have better luck building it later
373 */
374 if (realm->cached_context) {
375 ceph_put_snap_context(realm->cached_context);
376 realm->cached_context = NULL;
377 }
378 pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
379 realm, err);
380 return err;
381}
382
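To see what the merge above produces, here is a stand-alone user-space sketch using the same reverse comparator (the snap ids are made up for illustration; this is not part of the patch):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static int cmpu64_rev(const void *a, const void *b)
{
	uint64_t x = *(const uint64_t *)a, y = *(const uint64_t *)b;
	return x < y ? 1 : (x > y ? -1 : 0);
}

int main(void)
{
	/* parent snaps since parent_since, own snaps, prior-parent snaps */
	uint64_t snaps[] = { 40, 45, 12, 31, 7, 3 };
	int i, num = (int)(sizeof(snaps) / sizeof(snaps[0]));

	qsort(snaps, num, sizeof(uint64_t), cmpu64_rev);
	for (i = 0; i < num; i++)
		printf("%llu ", (unsigned long long)snaps[i]);
	printf("\n");           /* prints: 45 40 31 12 7 3 (newest first) */
	return 0;
}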
383/*
384 * rebuild snap context for the given realm and all of its children.
385 */
386static void rebuild_snap_realms(struct ceph_snap_realm *realm)
387{
388 struct ceph_snap_realm *child;
389
390 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
391 build_snap_context(realm);
392
393 list_for_each_entry(child, &realm->children, child_item)
394 rebuild_snap_realms(child);
395}
396
397
398/*
399 * helper to allocate and decode an array of snapids. free prior
400 * instance, if any.
401 */
402static int dup_array(u64 **dst, __le64 *src, int num)
403{
404 int i;
405
406 kfree(*dst);
407 if (num) {
408 *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
409 if (!*dst)
410 return -ENOMEM;
411 for (i = 0; i < num; i++)
412 (*dst)[i] = get_unaligned_le64(src + i);
413 } else {
414 *dst = NULL;
415 }
416 return 0;
417}
418
419
420/*
421 * When a snapshot is applied, the size/mtime inode metadata is queued
422 * in a ceph_cap_snap (one for each snapshot) until writeback
423 * completes and the metadata can be flushed back to the MDS.
424 *
425 * However, if a (sync) write is currently in-progress when we apply
426 * the snapshot, we have to wait until the write succeeds or fails
427 * (and a final size/mtime is known). In this case cap_snap->writing
428 * is set to 1, and the cap_snap is said to be "pending." When the write
429 * finishes, we __ceph_finish_cap_snap().
430 *
431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
432 * change).
433 */
434void ceph_queue_cap_snap(struct ceph_inode_info *ci)
435{
436 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap;
438 int used;
439
440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
441 if (!capsnap) {
442 pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
443 return;
444 }
445
446 spin_lock(&inode->i_lock);
447 used = __ceph_caps_used(ci);
448 if (__ceph_have_pending_cap_snap(ci)) {
449 /* there is no point in queuing multiple "pending" cap_snaps,
450 as no new writes are allowed to start when pending, so any
451 writes in progress now were started before the previous
452 cap_snap. lucky us. */
453 dout("queue_cap_snap %p already pending\n", inode);
454 kfree(capsnap);
455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc;
457
458 igrab(inode);
459
460 atomic_set(&capsnap->nref, 1);
461 capsnap->ci = ci;
462 INIT_LIST_HEAD(&capsnap->ci_item);
463 INIT_LIST_HEAD(&capsnap->flushing_item);
464
465 capsnap->follows = snapc->seq - 1;
466 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci);
468
469 capsnap->mode = inode->i_mode;
470 capsnap->uid = inode->i_uid;
471 capsnap->gid = inode->i_gid;
472
473 /* fixme? */
474 capsnap->xattr_blob = NULL;
475 capsnap->xattr_len = 0;
476
477 /* dirty page count moved from _head to this cap_snap;
478 all page dirties from subsequent writes occur _after_ this
479 snapshot. */
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0;
482 capsnap->context = snapc;
483 ci->i_head_snapc = NULL;
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485
486 if (used & CEPH_CAP_FILE_WR) {
487 dout("queue_cap_snap %p cap_snap %p snapc %p"
488 " seq %llu used WR, now pending\n", inode,
489 capsnap, snapc, snapc->seq);
490 capsnap->writing = 1;
491 } else {
492 /* note mtime, size NOW. */
493 __ceph_finish_cap_snap(ci, capsnap);
494 }
495 } else {
496 dout("queue_cap_snap %p nothing dirty|writing\n", inode);
497 kfree(capsnap);
498 }
499
500 spin_unlock(&inode->i_lock);
501}
502
503/*
504 * Finalize the size and mtime for a cap_snap; that is, settle on final values
505 * to be used for the snapshot, to be flushed back to the mds.
506 *
507 * If capsnap can now be flushed, add to snap_flush list, and return 1.
508 *
509 * Caller must hold i_lock.
510 */
511int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap)
513{
514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
516
517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size;
519 capsnap->mtime = inode->i_mtime;
520 capsnap->atime = inode->i_atime;
521 capsnap->ctime = inode->i_ctime;
522 capsnap->time_warp_seq = ci->i_time_warp_seq;
523 if (capsnap->dirty_pages) {
524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
525 "still has %d dirty pages\n", inode, capsnap,
526 capsnap->context, capsnap->context->seq,
527 ceph_cap_string(capsnap->dirty), capsnap->size,
528 capsnap->dirty_pages);
529 return 0;
530 }
531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
532 inode, capsnap, capsnap->context,
533 capsnap->context->seq, ceph_cap_string(capsnap->dirty),
534 capsnap->size);
535
536 spin_lock(&mdsc->snap_flush_lock);
537 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
538 spin_unlock(&mdsc->snap_flush_lock);
539 return 1; /* caller may want to ceph_flush_snaps */
540}
541
542
543/*
544 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
545 * the snap realm parameters from a given realm and all of its ancestors,
546 * up to the root.
547 *
548 * Caller must hold snap_rwsem for write.
549 */
550int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
551 void *p, void *e, bool deletion)
552{
553 struct ceph_mds_snap_realm *ri; /* encoded */
554 __le64 *snaps; /* encoded */
555 __le64 *prior_parent_snaps; /* encoded */
556 struct ceph_snap_realm *realm;
557 int invalidate = 0;
558 int err = -ENOMEM;
559
560 dout("update_snap_trace deletion=%d\n", deletion);
561more:
562 ceph_decode_need(&p, e, sizeof(*ri), bad);
563 ri = p;
564 p += sizeof(*ri);
565 ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
566 le32_to_cpu(ri->num_prior_parent_snaps)), bad);
567 snaps = p;
568 p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
569 prior_parent_snaps = p;
570 p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
571
572 realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
573 if (!realm) {
574 realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
575 if (IS_ERR(realm)) {
576 err = PTR_ERR(realm);
577 goto fail;
578 }
579 }
580
581 if (le64_to_cpu(ri->seq) > realm->seq) {
582 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
583 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
584 /*
585 * if the realm seq has changed, queue a cap_snap for every
586 * inode with open caps. we do this _before_ we update
587 * the realm info so that we prepare for writeback under the
588 * _previous_ snap context.
589 *
590 * ...unless it's a snap deletion!
591 */
592 if (!deletion) {
593 struct ceph_inode_info *ci;
594 struct inode *lastinode = NULL;
595
596 spin_lock(&realm->inodes_with_caps_lock);
597 list_for_each_entry(ci, &realm->inodes_with_caps,
598 i_snap_realm_item) {
599 struct inode *inode = igrab(&ci->vfs_inode);
600 if (!inode)
601 continue;
602 spin_unlock(&realm->inodes_with_caps_lock);
603 if (lastinode)
604 iput(lastinode);
605 lastinode = inode;
606 ceph_queue_cap_snap(ci);
607 spin_lock(&realm->inodes_with_caps_lock);
608 }
609 spin_unlock(&realm->inodes_with_caps_lock);
610 if (lastinode)
611 iput(lastinode);
612 dout("update_snap_trace cap_snaps queued\n");
613 }
614
615 } else {
616 dout("update_snap_trace %llx %p seq %lld unchanged\n",
617 realm->ino, realm, realm->seq);
618 }
619
620 /* ensure the parent is correct */
621 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
622 if (err < 0)
623 goto fail;
624 invalidate += err;
625
626 if (le64_to_cpu(ri->seq) > realm->seq) {
627 /* update realm parameters, snap lists */
628 realm->seq = le64_to_cpu(ri->seq);
629 realm->created = le64_to_cpu(ri->created);
630 realm->parent_since = le64_to_cpu(ri->parent_since);
631
632 realm->num_snaps = le32_to_cpu(ri->num_snaps);
633 err = dup_array(&realm->snaps, snaps, realm->num_snaps);
634 if (err < 0)
635 goto fail;
636
637 realm->num_prior_parent_snaps =
638 le32_to_cpu(ri->num_prior_parent_snaps);
639 err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
640 realm->num_prior_parent_snaps);
641 if (err < 0)
642 goto fail;
643
644 invalidate = 1;
645 } else if (!realm->cached_context) {
646 invalidate = 1;
647 }
648
649 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
650 realm, invalidate, p, e);
651
652 if (p < e)
653 goto more;
654
655 /* invalidate when we reach the _end_ (root) of the trace */
656 if (invalidate)
657 rebuild_snap_realms(realm);
658
659 __cleanup_empty_realms(mdsc);
660 return 0;
661
662bad:
663 err = -EINVAL;
664fail:
665 pr_err("update_snap_trace error %d\n", err);
666 return err;
667}
668
669
670/*
671 * Send any cap_snaps that are queued for flush. Try to carry
672 * s_mutex across multiple snap flushes to avoid locking overhead.
673 *
674 * Caller holds no locks.
675 */
676static void flush_snaps(struct ceph_mds_client *mdsc)
677{
678 struct ceph_inode_info *ci;
679 struct inode *inode;
680 struct ceph_mds_session *session = NULL;
681
682 dout("flush_snaps\n");
683 spin_lock(&mdsc->snap_flush_lock);
684 while (!list_empty(&mdsc->snap_flush_list)) {
685 ci = list_first_entry(&mdsc->snap_flush_list,
686 struct ceph_inode_info, i_snap_flush_item);
687 inode = &ci->vfs_inode;
688 igrab(inode);
689 spin_unlock(&mdsc->snap_flush_lock);
690 spin_lock(&inode->i_lock);
691 __ceph_flush_snaps(ci, &session);
692 spin_unlock(&inode->i_lock);
693 iput(inode);
694 spin_lock(&mdsc->snap_flush_lock);
695 }
696 spin_unlock(&mdsc->snap_flush_lock);
697
698 if (session) {
699 mutex_unlock(&session->s_mutex);
700 ceph_put_mds_session(session);
701 }
702 dout("flush_snaps done\n");
703}
704
705
706/*
707 * Handle a snap notification from the MDS.
708 *
709 * This can take two basic forms: the simplest is just a snap creation
710 * or deletion notification on an existing realm. This should update the
711 * realm and its children.
712 *
713 * The more difficult case is realm creation, due to snap creation at a
714 * new point in the file hierarchy, or due to a rename that moves a file or
715 * directory into another realm.
716 */
717void ceph_handle_snap(struct ceph_mds_client *mdsc,
718 struct ceph_mds_session *session,
719 struct ceph_msg *msg)
720{
721 struct super_block *sb = mdsc->client->sb;
722 int mds = session->s_mds;
723 u64 split;
724 int op;
725 int trace_len;
726 struct ceph_snap_realm *realm = NULL;
727 void *p = msg->front.iov_base;
728 void *e = p + msg->front.iov_len;
729 struct ceph_mds_snap_head *h;
730 int num_split_inos, num_split_realms;
731 __le64 *split_inos = NULL, *split_realms = NULL;
732 int i;
733 int locked_rwsem = 0;
734
735 /* decode */
736 if (msg->front.iov_len < sizeof(*h))
737 goto bad;
738 h = p;
739 op = le32_to_cpu(h->op);
740 split = le64_to_cpu(h->split); /* non-zero if we are splitting an
741 * existing realm */
742 num_split_inos = le32_to_cpu(h->num_split_inos);
743 num_split_realms = le32_to_cpu(h->num_split_realms);
744 trace_len = le32_to_cpu(h->trace_len);
745 p += sizeof(*h);
746
747 dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
748 ceph_snap_op_name(op), split, trace_len);
749
750 mutex_lock(&session->s_mutex);
751 session->s_seq++;
752 mutex_unlock(&session->s_mutex);
753
754 down_write(&mdsc->snap_rwsem);
755 locked_rwsem = 1;
756
757 if (op == CEPH_SNAP_OP_SPLIT) {
758 struct ceph_mds_snap_realm *ri;
759
760 /*
761 * A "split" breaks part of an existing realm off into
762 * a new realm. The MDS provides a list of inodes
763 * (with caps) and child realms that belong to the new
764 * child.
765 */
766 split_inos = p;
767 p += sizeof(u64) * num_split_inos;
768 split_realms = p;
769 p += sizeof(u64) * num_split_realms;
770 ceph_decode_need(&p, e, sizeof(*ri), bad);
771 /* we will peek at realm info here, but will _not_
772 * advance p, as the realm update will occur below in
773 * ceph_update_snap_trace. */
774 ri = p;
775
776 realm = ceph_lookup_snap_realm(mdsc, split);
777 if (!realm) {
778 realm = ceph_create_snap_realm(mdsc, split);
779 if (IS_ERR(realm))
780 goto out;
781 }
782 ceph_get_snap_realm(mdsc, realm);
783
784 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
785 for (i = 0; i < num_split_inos; i++) {
786 struct ceph_vino vino = {
787 .ino = le64_to_cpu(split_inos[i]),
788 .snap = CEPH_NOSNAP,
789 };
790 struct inode *inode = ceph_find_inode(sb, vino);
791 struct ceph_inode_info *ci;
792
793 if (!inode)
794 continue;
795 ci = ceph_inode(inode);
796
797 spin_lock(&inode->i_lock);
798 if (!ci->i_snap_realm)
799 goto skip_inode;
800 /*
801 * If this inode belongs to a realm that was
802 * created after our new realm, we experienced
803 * a race (due to another split notification
804 * arriving from a different MDS). So skip
805 * this inode.
806 */
807 if (ci->i_snap_realm->created >
808 le64_to_cpu(ri->created)) {
809 dout(" leaving %p in newer realm %llx %p\n",
810 inode, ci->i_snap_realm->ino,
811 ci->i_snap_realm);
812 goto skip_inode;
813 }
814 dout(" will move %p to split realm %llx %p\n",
815 inode, realm->ino, realm);
816 /*
817 * Remove the inode from the realm's inode
818 * list, but don't add it to the new realm
819 * yet. We don't want the cap_snap to be
820 * queued (again) by ceph_update_snap_trace()
821 * below. Queue it _now_, under the old context.
822 */
823 spin_lock(&realm->inodes_with_caps_lock);
824 list_del_init(&ci->i_snap_realm_item);
825 spin_unlock(&realm->inodes_with_caps_lock);
826 spin_unlock(&inode->i_lock);
827
828 ceph_queue_cap_snap(ci);
829
830 iput(inode);
831 continue;
832
833skip_inode:
834 spin_unlock(&inode->i_lock);
835 iput(inode);
836 }
837
838 /* we may have taken some of the old realm's children. */
839 for (i = 0; i < num_split_realms; i++) {
840 struct ceph_snap_realm *child =
841 ceph_lookup_snap_realm(mdsc,
842 le64_to_cpu(split_realms[i]));
843 if (!child)
844 continue;
845 adjust_snap_realm_parent(mdsc, child, realm->ino);
846 }
847 }
848
849 /*
850 * update using the provided snap trace. if we are deleting a
851 * snap, we can avoid queueing cap_snaps.
852 */
853 ceph_update_snap_trace(mdsc, p, e,
854 op == CEPH_SNAP_OP_DESTROY);
855
856 if (op == CEPH_SNAP_OP_SPLIT) {
857 /*
858 * ok, _now_ add the inodes into the new realm.
859 */
860 for (i = 0; i < num_split_inos; i++) {
861 struct ceph_vino vino = {
862 .ino = le64_to_cpu(split_inos[i]),
863 .snap = CEPH_NOSNAP,
864 };
865 struct inode *inode = ceph_find_inode(sb, vino);
866 struct ceph_inode_info *ci;
867
868 if (!inode)
869 continue;
870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock);
872 if (!ci->i_snap_realm)
873 goto split_skip_inode;
874 ceph_put_snap_realm(mdsc, ci->i_snap_realm);
875 spin_lock(&realm->inodes_with_caps_lock);
876 list_add(&ci->i_snap_realm_item,
877 &realm->inodes_with_caps);
878 ci->i_snap_realm = realm;
879 spin_unlock(&realm->inodes_with_caps_lock);
880 ceph_get_snap_realm(mdsc, realm);
881split_skip_inode:
882 spin_unlock(&inode->i_lock);
883 iput(inode);
884 }
885
886 /* we took a reference when we created the realm, above */
887 ceph_put_snap_realm(mdsc, realm);
888 }
889
890 __cleanup_empty_realms(mdsc);
891
892 up_write(&mdsc->snap_rwsem);
893
894 flush_snaps(mdsc);
895 return;
896
897bad:
898 pr_err("corrupt snap message from mds%d\n", mds);
899 ceph_msg_dump(msg);
900out:
901 if (locked_rwsem)
902 up_write(&mdsc->snap_rwsem);
903 return;
904}
905
906
907
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..75d02eaa1279
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1031 @@
1
2#include "ceph_debug.h"
3
4#include <linux/backing-dev.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/seq_file.h>
14#include <linux/slab.h>
15#include <linux/statfs.h>
16#include <linux/string.h>
17#include <linux/version.h>
18#include <linux/vmalloc.h>
19
20#include "decode.h"
21#include "super.h"
22#include "mon_client.h"
23#include "auth.h"
24
25/*
26 * Ceph superblock operations
27 *
28 * Handle the basics of mounting and unmounting.
29 */
30
31
32/*
33 * find filename portion of a path (/foo/bar/baz -> baz)
34 */
35const char *ceph_file_part(const char *s, int len)
36{
37 const char *e = s + len;
38
39 while (e != s && *(e-1) != '/')
40 e--;
41 return e;
42}
43
44
45/*
46 * super ops
47 */
48static void ceph_put_super(struct super_block *s)
49{
50 struct ceph_client *cl = ceph_client(s);
51
52 dout("put_super\n");
53 ceph_mdsc_close_sessions(&cl->mdsc);
54 return;
55}
56
57static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
58{
59 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
60 struct ceph_monmap *monmap = client->monc.monmap;
61 struct ceph_statfs st;
62 u64 fsid;
63 int err;
64
65 dout("statfs\n");
66 err = ceph_monc_do_statfs(&client->monc, &st);
67 if (err < 0)
68 return err;
69
70 /* fill in kstatfs */
71 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
72
73 /*
74 * express utilization in terms of large blocks to avoid
75 * overflow on 32-bit machines.
76 */
77 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
78 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
79 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
80 (CEPH_BLOCK_SHIFT-10);
81 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
82
83 buf->f_files = le64_to_cpu(st.num_objects);
84 buf->f_ffree = -1;
85 buf->f_namelen = PATH_MAX;
86 buf->f_frsize = PAGE_CACHE_SIZE;
87
88 /* leave fsid little-endian, regardless of host endianness */
89 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
90 buf->f_fsid.val[0] = fsid & 0xffffffff;
91 buf->f_fsid.val[1] = fsid >> 32;
92
93 return 0;
94}
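/*
 * Worked example of the scaling above (assuming CEPH_BLOCK_SHIFT is 20,
 * i.e. 1 MB blocks): a 10 TB cluster has st.kb = 10 * 2^30, which would
 * overflow a 32-bit block count if reported in KB, but shifting by
 * (20 - 10) reports 10 * 2^20 = 10485760 one-megabyte blocks instead,
 * which fits comfortably in 32 bits.
 */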
95
96
97static int ceph_syncfs(struct super_block *sb, int wait)
98{
99 dout("sync_fs %d\n", wait);
100 ceph_osdc_sync(&ceph_client(sb)->osdc);
101 ceph_mdsc_sync(&ceph_client(sb)->mdsc);
102 dout("sync_fs %d done\n", wait);
103 return 0;
104}
105
106
107/**
108 * ceph_show_options - Show mount options in /proc/mounts
109 * @m: seq_file to write to
110 * @mnt: mount descriptor
111 */
112static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
113{
114 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
115 struct ceph_mount_args *args = client->mount_args;
116
117 if (args->flags & CEPH_OPT_FSID)
118 seq_printf(m, ",fsidmajor=%llu,fsidminor=%llu",
119 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
120 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
121 if (args->flags & CEPH_OPT_NOSHARE)
122 seq_puts(m, ",noshare");
123 if (args->flags & CEPH_OPT_DIRSTAT)
124 seq_puts(m, ",dirstat");
125 if ((args->flags & CEPH_OPT_RBYTES) == 0)
126 seq_puts(m, ",norbytes");
127 if (args->flags & CEPH_OPT_NOCRC)
128 seq_puts(m, ",nocrc");
129 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
130 seq_puts(m, ",noasyncreaddir");
131 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
132 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
133 if (args->name)
134 seq_printf(m, ",name=%s", args->name);
135 if (args->secret)
136 seq_puts(m, ",secret=<hidden>");
137 return 0;
138}
139
140/*
141 * caches
142 */
143struct kmem_cache *ceph_inode_cachep;
144struct kmem_cache *ceph_cap_cachep;
145struct kmem_cache *ceph_dentry_cachep;
146struct kmem_cache *ceph_file_cachep;
147
148static void ceph_inode_init_once(void *foo)
149{
150 struct ceph_inode_info *ci = foo;
151 inode_init_once(&ci->vfs_inode);
152}
153
154static int default_congestion_kb(void)
155{
156 int congestion_kb;
157
158 /*
159 * Copied from NFS
160 *
161 * congestion size, scale with available memory.
162 *
163 * 64MB: 8192k
164 * 128MB: 11585k
165 * 256MB: 16384k
166 * 512MB: 23170k
167 * 1GB: 32768k
168 * 2GB: 46340k
169 * 4GB: 65536k
170 * 8GB: 92681k
171 * 16GB: 131072k
172 *
173 * This allows larger machines to have larger/more transfers.
174 * Limit the default to 256M
175 */
176 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
177 if (congestion_kb > 256*1024)
178 congestion_kb = 256*1024;
179
180 return congestion_kb;
181}
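/*
 * Example of the formula above (assuming 4 KB pages, PAGE_SHIFT = 12):
 * a 1 GB machine has totalram_pages = 262144, int_sqrt(262144) = 512,
 * and 16 * 512 << (12 - 10) = 32768k, matching the 1GB row in the
 * table.
 */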
182
183static int __init init_caches(void)
184{
185 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
186 sizeof(struct ceph_inode_info),
187 __alignof__(struct ceph_inode_info),
188 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
189 ceph_inode_init_once);
190 if (ceph_inode_cachep == NULL)
191 return -ENOMEM;
192
193 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
194 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
195 if (ceph_cap_cachep == NULL)
196 goto bad_cap;
197
198 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
199 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
200 if (ceph_dentry_cachep == NULL)
201 goto bad_dentry;
202
203 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
204 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
205 if (ceph_file_cachep == NULL)
206 goto bad_file;
207
208 return 0;
209
210bad_file:
211 kmem_cache_destroy(ceph_dentry_cachep);
212bad_dentry:
213 kmem_cache_destroy(ceph_cap_cachep);
214bad_cap:
215 kmem_cache_destroy(ceph_inode_cachep);
216 return -ENOMEM;
217}
218
219static void destroy_caches(void)
220{
221 kmem_cache_destroy(ceph_inode_cachep);
222 kmem_cache_destroy(ceph_cap_cachep);
223 kmem_cache_destroy(ceph_dentry_cachep);
224 kmem_cache_destroy(ceph_file_cachep);
225}
226
227
228/*
229 * ceph_umount_begin - initiate forced umount. Tear down the
230 * mount, skipping steps that may hang while waiting for server(s).
231 */
232static void ceph_umount_begin(struct super_block *sb)
233{
234 struct ceph_client *client = ceph_sb_to_client(sb);
235
236 dout("ceph_umount_begin - starting forced umount\n");
237 if (!client)
238 return;
239 client->mount_state = CEPH_MOUNT_SHUTDOWN;
240 return;
241}
242
243static const struct super_operations ceph_super_ops = {
244 .alloc_inode = ceph_alloc_inode,
245 .destroy_inode = ceph_destroy_inode,
246 .write_inode = ceph_write_inode,
247 .sync_fs = ceph_syncfs,
248 .put_super = ceph_put_super,
249 .show_options = ceph_show_options,
250 .statfs = ceph_statfs,
251 .umount_begin = ceph_umount_begin,
252};
253
254
255const char *ceph_msg_type_name(int type)
256{
257 switch (type) {
258 case CEPH_MSG_SHUTDOWN: return "shutdown";
259 case CEPH_MSG_PING: return "ping";
260 case CEPH_MSG_AUTH: return "auth";
261 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
262 case CEPH_MSG_MON_MAP: return "mon_map";
263 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
264 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
265 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
266 case CEPH_MSG_STATFS: return "statfs";
267 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
268 case CEPH_MSG_MDS_MAP: return "mds_map";
269 case CEPH_MSG_CLIENT_SESSION: return "client_session";
270 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
271 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
272 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
273 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
274 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
275 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
276 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
277 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
278 case CEPH_MSG_OSD_MAP: return "osd_map";
279 case CEPH_MSG_OSD_OP: return "osd_op";
280 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
281 default: return "unknown";
282 }
283}
284
285
286/*
287 * mount options
288 */
289enum {
290 Opt_fsidmajor,
291 Opt_fsidminor,
292 Opt_monport,
293 Opt_wsize,
294 Opt_rsize,
295 Opt_osdtimeout,
296 Opt_osdkeepalivetimeout,
297 Opt_mount_timeout,
298 Opt_osd_idle_ttl,
299 Opt_caps_wanted_delay_min,
300 Opt_caps_wanted_delay_max,
301 Opt_readdir_max_entries,
302 Opt_congestion_kb,
303 Opt_last_int,
304 /* int args above */
305 Opt_snapdirname,
306 Opt_name,
307 Opt_secret,
308 Opt_last_string,
309 /* string args above */
310 Opt_ip,
311 Opt_noshare,
312 Opt_dirstat,
313 Opt_nodirstat,
314 Opt_rbytes,
315 Opt_norbytes,
316 Opt_nocrc,
317 Opt_noasyncreaddir,
318};
319
320static match_table_t arg_tokens = {
321 {Opt_fsidmajor, "fsidmajor=%ld"},
322 {Opt_fsidminor, "fsidminor=%ld"},
323 {Opt_monport, "monport=%d"},
324 {Opt_wsize, "wsize=%d"},
325 {Opt_rsize, "rsize=%d"},
326 {Opt_osdtimeout, "osdtimeout=%d"},
327 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
328 {Opt_mount_timeout, "mount_timeout=%d"},
329 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
330 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
331 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
332 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
333 {Opt_congestion_kb, "write_congestion_kb=%d"},
334 /* int args above */
335 {Opt_snapdirname, "snapdirname=%s"},
336 {Opt_name, "name=%s"},
337 {Opt_secret, "secret=%s"},
338 /* string args above */
339 {Opt_ip, "ip=%s"},
340 {Opt_noshare, "noshare"},
341 {Opt_dirstat, "dirstat"},
342 {Opt_nodirstat, "nodirstat"},
343 {Opt_rbytes, "rbytes"},
344 {Opt_norbytes, "norbytes"},
345 {Opt_nocrc, "nocrc"},
346 {Opt_noasyncreaddir, "noasyncreaddir"},
347 {-1, NULL}
348};
349
350
351static struct ceph_mount_args *parse_mount_args(int flags, char *options,
352 const char *dev_name,
353 const char **path)
354{
355 struct ceph_mount_args *args;
356 const char *c;
357 int err = -ENOMEM;
358 substring_t argstr[MAX_OPT_ARGS];
359
360 args = kzalloc(sizeof(*args), GFP_KERNEL);
361 if (!args)
362 return ERR_PTR(-ENOMEM);
363 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
364 GFP_KERNEL);
365 if (!args->mon_addr)
366 goto out;
367
368 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
369
370 /* start with defaults */
371 args->sb_flags = flags;
372 args->flags = CEPH_OPT_DEFAULT;
373 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
374 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
375 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
376 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
377 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
378 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
379 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
380 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
381 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
382 args->max_readdir = 1024;
383 args->congestion_kb = default_congestion_kb();
384
385 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
386 err = -EINVAL;
387 if (!dev_name)
388 goto out;
389 *path = strstr(dev_name, ":/");
390 if (*path == NULL) {
391 pr_err("device name is missing path (no :/ in %s)\n",
392 dev_name);
393 goto out;
394 }
395
396 /* get mon ip(s) */
397 err = ceph_parse_ips(dev_name, *path, args->mon_addr,
398 CEPH_MAX_MON, &args->num_mon);
399 if (err < 0)
400 goto out;
401
402 /* path on server */
403 *path += 2;
404 dout("server path '%s'\n", *path);
405
406 /* parse mount options */
407 while ((c = strsep(&options, ",")) != NULL) {
408 int token, intval, ret;
409 if (!*c)
410 continue;
411 err = -EINVAL;
412 token = match_token((char *)c, arg_tokens, argstr);
413 if (token < 0) {
414 pr_err("bad mount option at '%s'\n", c);
415 goto out;
416 }
417 if (token < Opt_last_int) {
418 ret = match_int(&argstr[0], &intval);
419 if (ret < 0) {
420 pr_err("bad mount option arg (not int) "
421 "at '%s'\n", c);
422 continue;
423 }
424 dout("got int token %d val %d\n", token, intval);
425 } else if (token > Opt_last_int && token < Opt_last_string) {
426 dout("got string token %d val %s\n", token,
427 argstr[0].from);
428 } else {
429 dout("got token %d\n", token);
430 }
431 switch (token) {
432 case Opt_fsidmajor:
433 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
434 break;
435 case Opt_fsidminor:
436 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
437 break;
438 case Opt_ip:
439 err = ceph_parse_ips(argstr[0].from,
440 argstr[0].to,
441 &args->my_addr,
442 1, NULL);
443 if (err < 0)
444 goto out;
445 args->flags |= CEPH_OPT_MYIP;
446 break;
447
448 case Opt_snapdirname:
449 kfree(args->snapdir_name);
450 args->snapdir_name = kstrndup(argstr[0].from,
451 argstr[0].to-argstr[0].from,
452 GFP_KERNEL);
453 break;
454 case Opt_name:
455 args->name = kstrndup(argstr[0].from,
456 argstr[0].to-argstr[0].from,
457 GFP_KERNEL);
458 break;
459 case Opt_secret:
460 args->secret = kstrndup(argstr[0].from,
461 argstr[0].to-argstr[0].from,
462 GFP_KERNEL);
463 break;
464
465 /* misc */
466 case Opt_wsize:
467 args->wsize = intval;
468 break;
469 case Opt_rsize:
470 args->rsize = intval;
471 break;
472 case Opt_osdtimeout:
473 args->osd_timeout = intval;
474 break;
475 case Opt_osdkeepalivetimeout:
476 args->osd_keepalive_timeout = intval;
477 break;
478 case Opt_mount_timeout:
479 args->mount_timeout = intval;
480 break;
481 case Opt_caps_wanted_delay_min:
482 args->caps_wanted_delay_min = intval;
483 break;
484 case Opt_caps_wanted_delay_max:
485 args->caps_wanted_delay_max = intval;
486 break;
487 case Opt_readdir_max_entries:
488 args->max_readdir = intval;
489 break;
490 case Opt_congestion_kb:
491 args->congestion_kb = intval;
492 break;
493
494 case Opt_noshare:
495 args->flags |= CEPH_OPT_NOSHARE;
496 break;
497
498 case Opt_dirstat:
499 args->flags |= CEPH_OPT_DIRSTAT;
500 break;
501 case Opt_nodirstat:
502 args->flags &= ~CEPH_OPT_DIRSTAT;
503 break;
504 case Opt_rbytes:
505 args->flags |= CEPH_OPT_RBYTES;
506 break;
507 case Opt_norbytes:
508 args->flags &= ~CEPH_OPT_RBYTES;
509 break;
510 case Opt_nocrc:
511 args->flags |= CEPH_OPT_NOCRC;
512 break;
513 case Opt_noasyncreaddir:
514 args->flags |= CEPH_OPT_NOASYNCREADDIR;
515 break;
516
517 default:
518 BUG_ON(token);
519 }
520 }
521 return args;
522
523out:
524 kfree(args->mon_addr);
525 kfree(args);
526 return ERR_PTR(err);
527}
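To make the parsing above concrete, here is a sketch of the inputs it expects; the addresses, option values, and path are purely illustrative:

/*
 * Hypothetical example:
 *
 *   dev_name = "192.168.0.1:6789,192.168.0.2:6789:/some/dir"
 *   options  = "rsize=524288,snapdirname=.snapshots,noshare"
 *
 * ceph_parse_ips() fills args->mon_addr with the two monitor
 * addresses, *path ends up pointing at "some/dir" (the text after
 * ":/"), and the option loop sets args->rsize = 524288, replaces
 * args->snapdir_name with ".snapshots", and ORs CEPH_OPT_NOSHARE
 * into args->flags.
 */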
528
529static void destroy_mount_args(struct ceph_mount_args *args)
530{
531 dout("destroy_mount_args %p\n", args);
532 kfree(args->snapdir_name);
533 args->snapdir_name = NULL;
534 kfree(args->name);
535 args->name = NULL;
536 kfree(args->secret);
537 args->secret = NULL;
538 kfree(args);
539}
540
541/*
542 * create a fresh client instance
543 */
544static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
545{
546 struct ceph_client *client;
547 int err = -ENOMEM;
548
549 client = kzalloc(sizeof(*client), GFP_KERNEL);
550 if (client == NULL)
551 return ERR_PTR(-ENOMEM);
552
553 mutex_init(&client->mount_mutex);
554
555 init_waitqueue_head(&client->auth_wq);
556
557 client->sb = NULL;
558 client->mount_state = CEPH_MOUNT_MOUNTING;
559 client->mount_args = args;
560
561 client->msgr = NULL;
562
563 client->auth_err = 0;
564 atomic_long_set(&client->writeback_count, 0);
565
566 err = bdi_init(&client->backing_dev_info);
567 if (err < 0)
568 goto fail;
569
570 err = -ENOMEM;
571 client->wb_wq = create_workqueue("ceph-writeback");
572 if (client->wb_wq == NULL)
573 goto fail_bdi;
574 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
575 if (client->pg_inv_wq == NULL)
576 goto fail_wb_wq;
577 client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
578 if (client->trunc_wq == NULL)
579 goto fail_pg_inv_wq;
580
581 /* set up mempools */
582 err = -ENOMEM;
583 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
584 client->mount_args->wsize >> PAGE_CACHE_SHIFT);
585 if (!client->wb_pagevec_pool)
586 goto fail_trunc_wq;
587
588 /* caps */
589 client->min_caps = args->max_readdir;
590 ceph_adjust_min_caps(client->min_caps);
591
592 /* subsystems */
593 err = ceph_monc_init(&client->monc, client);
594 if (err < 0)
595 goto fail_mempool;
596 err = ceph_osdc_init(&client->osdc, client);
597 if (err < 0)
598 goto fail_monc;
599 err = ceph_mdsc_init(&client->mdsc, client);
600 if (err < 0)
601 goto fail_osdc;
602 return client;
603
604fail_osdc:
605 ceph_osdc_stop(&client->osdc);
606fail_monc:
607 ceph_monc_stop(&client->monc);
608fail_mempool:
609 mempool_destroy(client->wb_pagevec_pool);
610fail_trunc_wq:
611 destroy_workqueue(client->trunc_wq);
612fail_pg_inv_wq:
613 destroy_workqueue(client->pg_inv_wq);
614fail_wb_wq:
615 destroy_workqueue(client->wb_wq);
616fail_bdi:
617 bdi_destroy(&client->backing_dev_info);
618fail:
619 kfree(client);
620 return ERR_PTR(err);
621}
622
623static void ceph_destroy_client(struct ceph_client *client)
624{
625 dout("destroy_client %p\n", client);
626
627 /* unmount */
628 ceph_mdsc_stop(&client->mdsc);
629 ceph_monc_stop(&client->monc);
630 ceph_osdc_stop(&client->osdc);
631
632 ceph_adjust_min_caps(-client->min_caps);
633
634 ceph_debugfs_client_cleanup(client);
635 destroy_workqueue(client->wb_wq);
636 destroy_workqueue(client->pg_inv_wq);
637 destroy_workqueue(client->trunc_wq);
638
639 if (client->msgr)
640 ceph_messenger_destroy(client->msgr);
641 mempool_destroy(client->wb_pagevec_pool);
642
643 destroy_mount_args(client->mount_args);
644
645 kfree(client);
646 dout("destroy_client %p done\n", client);
647}
648
649/*
650 * Initially learn our fsid, or verify an fsid matches.
651 */
652int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
653{
654 if (client->have_fsid) {
655 if (ceph_fsid_compare(&client->fsid, fsid)) {
656 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
657 PR_FSID(&client->fsid), PR_FSID(fsid));
658 return -1;
659 }
660 } else {
661 pr_info("client%lld fsid " FSID_FORMAT "\n",
662 client->monc.auth->global_id, PR_FSID(fsid));
663 memcpy(&client->fsid, fsid, sizeof(*fsid));
664 ceph_debugfs_client_init(client);
665 client->have_fsid = true;
666 }
667 return 0;
668}
669
670/*
671 * true if we have the mon map (and have thus joined the cluster)
672 */
673static int have_mon_map(struct ceph_client *client)
674{
675 return client->monc.monmap && client->monc.monmap->epoch;
676}
677
678/*
679 * Bootstrap mount by opening the root directory. Note the mount
680 * @started time from caller, and time out if this takes too long.
681 */
682static struct dentry *open_root_dentry(struct ceph_client *client,
683 const char *path,
684 unsigned long started)
685{
686 struct ceph_mds_client *mdsc = &client->mdsc;
687 struct ceph_mds_request *req = NULL;
688 int err;
689 struct dentry *root;
690
691 /* open dir */
692 dout("open_root_inode opening '%s'\n", path);
693 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
694 if (IS_ERR(req))
695 return ERR_PTR(PTR_ERR(req));
696 req->r_path1 = kstrdup(path, GFP_NOFS);
697 req->r_ino1.ino = CEPH_INO_ROOT;
698 req->r_ino1.snap = CEPH_NOSNAP;
699 req->r_started = started;
700 req->r_timeout = client->mount_args->mount_timeout * HZ;
701 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
702 req->r_num_caps = 2;
703 err = ceph_mdsc_do_request(mdsc, NULL, req);
704 if (err == 0) {
705 dout("open_root_inode success\n");
706 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
707 client->sb->s_root == NULL)
708 root = d_alloc_root(req->r_target_inode);
709 else
710 root = d_obtain_alias(req->r_target_inode);
711 req->r_target_inode = NULL;
712 dout("open_root_inode success, root dentry is %p\n", root);
713 } else {
714 root = ERR_PTR(err);
715 }
716 ceph_mdsc_put_request(req);
717 return root;
718}
719
720/*
721 * mount: join the ceph cluster, and open root directory.
722 */
723static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
724 const char *path)
725{
726 struct ceph_entity_addr *myaddr = NULL;
727 int err;
728 unsigned long timeout = client->mount_args->mount_timeout * HZ;
729 unsigned long started = jiffies; /* note the start time */
730 struct dentry *root;
731
732 dout("mount start\n");
733 mutex_lock(&client->mount_mutex);
734
735 /* initialize the messenger */
736 if (client->msgr == NULL) {
737 if (ceph_test_opt(client, MYIP))
738 myaddr = &client->mount_args->my_addr;
739 client->msgr = ceph_messenger_create(myaddr);
740 if (IS_ERR(client->msgr)) {
741 err = PTR_ERR(client->msgr);
742 client->msgr = NULL;
743 goto out;
744 }
745 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
746 }
747
748 /* open session, and wait for mon, mds, and osd maps */
749 err = ceph_monc_open_session(&client->monc);
750 if (err < 0)
751 goto out;
752
753 while (!have_mon_map(client)) {
754 err = -EIO;
755 if (timeout && time_after_eq(jiffies, started + timeout))
756 goto out;
757
758 /* wait */
759 dout("mount waiting for mon_map\n");
760 err = wait_event_interruptible_timeout(client->auth_wq,
761 have_mon_map(client) || (client->auth_err < 0),
762 timeout);
763 if (err == -EINTR || err == -ERESTARTSYS)
764 goto out;
765 if (client->auth_err < 0) {
766 err = client->auth_err;
767 goto out;
768 }
769 }
770
771 dout("mount opening root\n");
772 root = open_root_dentry(client, "", started);
773 if (IS_ERR(root)) {
774 err = PTR_ERR(root);
775 goto out;
776 }
777 if (client->sb->s_root)
778 dput(root);
779 else
780 client->sb->s_root = root;
781
782 if (path[0] == 0) {
783 dget(root);
784 } else {
785 dout("mount opening base mountpoint\n");
786 root = open_root_dentry(client, path, started);
787 if (IS_ERR(root)) {
788 err = PTR_ERR(root);
789 dput(client->sb->s_root);
790 client->sb->s_root = NULL;
791 goto out;
792 }
793 }
794
795 mnt->mnt_root = root;
796 mnt->mnt_sb = client->sb;
797
798 client->mount_state = CEPH_MOUNT_MOUNTED;
799 dout("mount success\n");
800 err = 0;
801
802out:
803 mutex_unlock(&client->mount_mutex);
804 return err;
805}
806
807static int ceph_set_super(struct super_block *s, void *data)
808{
809 struct ceph_client *client = data;
810 int ret;
811
812 dout("set_super %p data %p\n", s, data);
813
814 s->s_flags = client->mount_args->sb_flags;
815 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
816
817 s->s_fs_info = client;
818 client->sb = s;
819
820 s->s_op = &ceph_super_ops;
821 s->s_export_op = &ceph_export_ops;
822
823 s->s_time_gran = 1000; /* 1000 ns == 1 us */
824
825 ret = set_anon_super(s, NULL); /* what is that second arg for? */
826 if (ret != 0)
827 goto fail;
828
829 return ret;
830
831fail:
832 s->s_fs_info = NULL;
833 client->sb = NULL;
834 return ret;
835}
836
837/*
838 * share superblock if same fs AND options
839 */
840static int ceph_compare_super(struct super_block *sb, void *data)
841{
842 struct ceph_client *new = data;
843 struct ceph_mount_args *args = new->mount_args;
844 struct ceph_client *other = ceph_sb_to_client(sb);
845 int i;
846
847 dout("ceph_compare_super %p\n", sb);
848 if (args->flags & CEPH_OPT_FSID) {
849 if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
850 dout("fsid doesn't match\n");
851 return 0;
852 }
853 } else {
854 /* do we share (a) monitor? */
855 for (i = 0; i < new->monc.monmap->num_mon; i++)
856 if (ceph_monmap_contains(other->monc.monmap,
857 &new->monc.monmap->mon_inst[i].addr))
858 break;
859 if (i == new->monc.monmap->num_mon) {
860 dout("mon ip not part of monmap\n");
861 return 0;
862 }
863 dout("mon ip matches existing sb %p\n", sb);
864 }
865 if (args->sb_flags != other->mount_args->sb_flags) {
866 dout("flags differ\n");
867 return 0;
868 }
869 return 1;
870}
871
872/*
873 * construct our own bdi so we can control readahead, etc.
874 */
875static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
876{
877 int err;
878
879 sb->s_bdi = &client->backing_dev_info;
880
881 /* set ra_pages based on rsize mount option? */
882 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
883 client->backing_dev_info.ra_pages =
884 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
885 >> PAGE_SHIFT;
886 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
887 return err;
888}
889
890static int ceph_get_sb(struct file_system_type *fs_type,
891 int flags, const char *dev_name, void *data,
892 struct vfsmount *mnt)
893{
894 struct super_block *sb;
895 struct ceph_client *client;
896 int err;
897 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
898 const char *path = NULL;
899 struct ceph_mount_args *args;
900
901 dout("ceph_get_sb\n");
902 args = parse_mount_args(flags, data, dev_name, &path);
903 if (IS_ERR(args)) {
904 err = PTR_ERR(args);
905 goto out_final;
906 }
907
908 /* create client (which we may/may not use) */
909 client = ceph_create_client(args);
910 if (IS_ERR(client)) {
911 err = PTR_ERR(client);
912 goto out_final;
913 }
914
915 if (client->mount_args->flags & CEPH_OPT_NOSHARE)
916 compare_super = NULL;
917 sb = sget(fs_type, compare_super, ceph_set_super, client);
918 if (IS_ERR(sb)) {
919 err = PTR_ERR(sb);
920 goto out;
921 }
922
923 if (ceph_client(sb) != client) {
924 ceph_destroy_client(client);
925 client = ceph_client(sb);
926 dout("get_sb got existing client %p\n", client);
927 } else {
928 dout("get_sb using new client %p\n", client);
929 err = ceph_register_bdi(sb, client);
930 if (err < 0)
931 goto out_splat;
932 }
933
934 err = ceph_mount(client, mnt, path);
935 if (err < 0)
936 goto out_splat;
937 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
938 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
939 return 0;
940
941out_splat:
942 ceph_mdsc_close_sessions(&client->mdsc);
943 up_write(&sb->s_umount);
944 deactivate_super(sb);
945 goto out_final;
946
947out:
948 ceph_destroy_client(client);
949out_final:
950 dout("ceph_get_sb fail %d\n", err);
951 return err;
952}
953
954static void ceph_kill_sb(struct super_block *s)
955{
956 struct ceph_client *client = ceph_sb_to_client(s);
957 dout("kill_sb %p\n", s);
958 ceph_mdsc_pre_umount(&client->mdsc);
959 kill_anon_super(s); /* will call put_super after sb is r/o */
960 if (s->s_bdi == &client->backing_dev_info)
961 bdi_unregister(&client->backing_dev_info);
962 bdi_destroy(&client->backing_dev_info);
963 ceph_destroy_client(client);
964}
965
966static struct file_system_type ceph_fs_type = {
967 .owner = THIS_MODULE,
968 .name = "ceph",
969 .get_sb = ceph_get_sb,
970 .kill_sb = ceph_kill_sb,
971 .fs_flags = FS_RENAME_DOES_D_MOVE,
972};
973
974#define _STRINGIFY(x) #x
975#define STRINGIFY(x) _STRINGIFY(x)
976
977static int __init init_ceph(void)
978{
979 int ret = 0;
980
981 ret = ceph_debugfs_init();
982 if (ret < 0)
983 goto out;
984
985 ret = ceph_msgr_init();
986 if (ret < 0)
987 goto out_debugfs;
988
989 ret = init_caches();
990 if (ret)
991 goto out_msgr;
992
993 ceph_caps_init();
994
995 ret = register_filesystem(&ceph_fs_type);
996 if (ret)
997 goto out_icache;
998
999 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
1000 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
1001 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
1002 return 0;
1003
1004out_icache:
1005 destroy_caches();
1006out_msgr:
1007 ceph_msgr_exit();
1008out_debugfs:
1009 ceph_debugfs_cleanup();
1010out:
1011 return ret;
1012}
1013
1014static void __exit exit_ceph(void)
1015{
1016 dout("exit_ceph\n");
1017 unregister_filesystem(&ceph_fs_type);
1018 ceph_caps_finalize();
1019 destroy_caches();
1020 ceph_msgr_exit();
1021 ceph_debugfs_cleanup();
1022}
1023
1024module_init(init_ceph);
1025module_exit(exit_ceph);
1026
1027MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1028MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1029MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1030MODULE_DESCRIPTION("Ceph filesystem for Linux");
1031MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..e30dfbb056c3
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,901 @@
1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15#include <linux/slab.h>
16
17#include "types.h"
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24
25/* f_type in struct statfs */
26#define CEPH_SUPER_MAGIC 0x00c36400
27
28/* large granularity for statfs utilization stats to facilitate
29 * large volume sizes on 32-bit machines. */
30#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32
33/*
34 * mount options
35 */
36#define CEPH_OPT_FSID (1<<0)
37#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
38#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
39#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
40#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
41#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
42#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
43
44#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
45
46#define ceph_set_opt(client, opt) \
47 (client)->mount_args->flags |= CEPH_OPT_##opt;
48#define ceph_test_opt(client, opt) \
49 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
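For clarity, the token pasting in these helpers means that, for example:

/*
 * ceph_test_opt(client, NOCRC) expands to
 *   (!!((client)->mount_args->flags & CEPH_OPT_NOCRC))
 */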
50
51
52struct ceph_mount_args {
53 int sb_flags;
54 int num_mon;
55 struct ceph_entity_addr *mon_addr;
56 int flags;
57 int mount_timeout;
58 int osd_idle_ttl;
59 int caps_wanted_delay_min, caps_wanted_delay_max;
60 struct ceph_fsid fsid;
61 struct ceph_entity_addr my_addr;
62 int wsize;
63 int rsize; /* max readahead */
64 int max_readdir; /* max readdir size */
65 int congestion_kb; /* max dirty data (kb) before we signal write congestion */
66 int osd_timeout;
67 int osd_keepalive_timeout;
68 char *snapdir_name; /* default ".snap" */
69 char *name;
70 char *secret;
71 int cap_release_safety;
72};
73
74/*
75 * defaults
76 */
77#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
78#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
79#define CEPH_OSD_KEEPALIVE_DEFAULT 5
80#define CEPH_OSD_IDLE_TTL_DEFAULT 60
81#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
82
83#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
84#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
85
86#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
87#define CEPH_AUTH_NAME_DEFAULT "guest"
88
89/*
90 * Delay telling the MDS we no longer want caps, in case we reopen
91 * the file. Delay a minimum amount of time, even if we send a cap
92 * message for some other reason. Otherwise, take the opportunity to
93 * update the mds to avoid sending another message later.
94 */
95#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
96#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
97
98
99/* mount state */
100enum {
101 CEPH_MOUNT_MOUNTING,
102 CEPH_MOUNT_MOUNTED,
103 CEPH_MOUNT_UNMOUNTING,
104 CEPH_MOUNT_UNMOUNTED,
105 CEPH_MOUNT_SHUTDOWN,
106};
107
108/*
109 * subtract jiffies
110 */
111static inline unsigned long time_sub(unsigned long a, unsigned long b)
112{
113 BUG_ON(time_after(b, a));
114 return (long)a - (long)b;
115}
116
117/*
118 * per-filesystem client state
119 *
120 * possibly shared by multiple mount points, if they are
121 * mounting the same ceph filesystem/cluster.
122 */
123struct ceph_client {
124 struct ceph_fsid fsid;
125 bool have_fsid;
126
127 struct mutex mount_mutex; /* serialize mount attempts */
128 struct ceph_mount_args *mount_args;
129
130 struct super_block *sb;
131
132 unsigned long mount_state;
133 wait_queue_head_t auth_wq;
134
135 int auth_err;
136
137 int min_caps; /* min caps i added */
138
139 struct ceph_messenger *msgr; /* messenger instance */
140 struct ceph_mon_client monc;
141 struct ceph_mds_client mdsc;
142 struct ceph_osd_client osdc;
143
144 /* writeback */
145 mempool_t *wb_pagevec_pool;
146 struct workqueue_struct *wb_wq;
147 struct workqueue_struct *pg_inv_wq;
148 struct workqueue_struct *trunc_wq;
149 atomic_long_t writeback_count;
150
151 struct backing_dev_info backing_dev_info;
152
153#ifdef CONFIG_DEBUG_FS
154 struct dentry *debugfs_monmap;
155 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
156 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
157 struct dentry *debugfs_congestion_kb;
158 struct dentry *debugfs_bdi;
159#endif
160};
161
162static inline struct ceph_client *ceph_client(struct super_block *sb)
163{
164 return sb->s_fs_info;
165}
166
167
168/*
169 * File i/o capability. This tracks shared state with the metadata
170 * server that allows us to cache or writeback attributes or to read
171 * and write data. For any given inode, we should have one or more
172 * capabilities, one issued by each metadata server, and our
173 * cumulative access is the OR of all issued capabilities.
174 *
175 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
176 * session capability lists.
177 */
178struct ceph_cap {
179 struct ceph_inode_info *ci;
180 struct rb_node ci_node; /* per-ci cap tree */
181 struct ceph_mds_session *session;
182 struct list_head session_caps; /* per-session caplist */
183 int mds;
184 u64 cap_id; /* unique cap id (mds provided) */
185 int issued; /* latest, from the mds */
186 int implemented; /* implemented superset of issued (for revocation) */
187 int mds_wanted;
188 u32 seq, issue_seq, mseq;
189 u32 cap_gen; /* active/stale cycle */
190 unsigned long last_used;
191 struct list_head caps_item;
192};
193
194#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
195#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
196#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
197
198/*
199 * Snapped cap state that is pending flush to mds. When a snapshot occurs,
200 * we first complete any in-process sync writes and writeback any dirty
201 * data before flushing the snapped state (tracked here) back to the MDS.
202 */
203struct ceph_cap_snap {
204 atomic_t nref;
205 struct ceph_inode_info *ci;
206 struct list_head ci_item, flushing_item;
207
208 u64 follows, flush_tid;
209 int issued, dirty;
210 struct ceph_snap_context *context;
211
212 mode_t mode;
213 uid_t uid;
214 gid_t gid;
215
216 void *xattr_blob;
217 int xattr_len;
218 u64 xattr_version;
219
220 u64 size;
221 struct timespec mtime, atime, ctime;
222 u64 time_warp_seq;
223 int writing; /* a sync write is still in progress */
224 int dirty_pages; /* dirty pages awaiting writeback */
225};
226
227static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
228{
229 if (atomic_dec_and_test(&capsnap->nref))
230 kfree(capsnap);
231}
232
233/*
234 * The frag tree describes how a directory is fragmented, potentially across
235 * multiple metadata servers. It is also used to indicate points where
236 * metadata authority is delegated, and whether/where metadata is replicated.
237 *
238 * A _leaf_ frag will be present in the i_fragtree IFF there is
239 * delegation info. That is, if mds >= 0 || ndist > 0.
240 */
241#define CEPH_MAX_DIRFRAG_REP 4
242
243struct ceph_inode_frag {
244 struct rb_node node;
245
246 /* fragtree state */
247 u32 frag;
248 int split_by; /* i.e. 2^(split_by) children */
249
250 /* delegation and replication info */
251 int mds; /* -1 if same authority as parent */
252 int ndist; /* >0 if replicated */
253 int dist[CEPH_MAX_DIRFRAG_REP];
254};
255
256/*
257 * We cache inode xattrs as an encoded blob until they are first used,
258 * at which point we parse them into an rbtree.
259 */
260struct ceph_inode_xattr {
261 struct rb_node node;
262
263 const char *name;
264 int name_len;
265 const char *val;
266 int val_len;
267 int dirty;
268
269 int should_free_name;
270 int should_free_val;
271};
272
273struct ceph_inode_xattrs_info {
274 /*
275 * (still encoded) xattr blob. we avoid the overhead of parsing
276 * this until someone actually calls getxattr, etc.
277 *
278 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
279 * NULL means we don't know.
280 */
281 struct ceph_buffer *blob, *prealloc_blob;
282
283 struct rb_root index;
284 bool dirty;
285 int count;
286 int names_size;
287 int vals_size;
288 u64 version, index_version;
289};
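As a reference for the "still encoded" blob mentioned above, this is roughly the layout that __build_xattrs() in xattr.c decodes; the shape is inferred from its decode calls and is only a sketch:

/*
 * Approximate encoded xattr blob layout:
 *
 *   __le32 numattr;                    number of xattrs that follow
 *   repeated numattr times:
 *     __le32 name_len;  char name[name_len];
 *     __le32 val_len;   char val[val_len];
 *
 * A blob containing only a zero count is 4 bytes long, which is why
 * blob->vec.iov_len == 4 means "no xattrs".
 */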
290
291/*
292 * Ceph inode.
293 */
294#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
295#define CEPH_I_NODELAY 4 /* do not delay cap release */
296#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
297#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
298
299struct ceph_inode_info {
300 struct ceph_vino i_vino; /* ceph ino + snap */
301
302 u64 i_version;
303 u32 i_time_warp_seq;
304
305 unsigned i_ceph_flags;
306 unsigned long i_release_count;
307
308 struct ceph_file_layout i_layout;
309 char *i_symlink;
310
311 /* for dirs */
312 struct timespec i_rctime;
313 u64 i_rbytes, i_rfiles, i_rsubdirs;
314 u64 i_files, i_subdirs;
315 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
316
317 struct rb_root i_fragtree;
318 struct mutex i_fragtree_mutex;
319
320 struct ceph_inode_xattrs_info i_xattrs;
321
322 /* capabilities. protected _both_ by i_lock and cap->session's
323 * s_mutex. */
324 struct rb_root i_caps; /* cap list */
325 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
326 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
327 struct list_head i_dirty_item, i_flushing_item;
328 u64 i_cap_flush_seq;
329 /* we need to track cap writeback on a per-cap-bit basis, to allow
330 * overlapping, pipelined cap flushes to the mds. we can probably
331 * reduce the tid to 8 bits if we're concerned about inode size. */
332 u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
333 wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
334 unsigned long i_hold_caps_min; /* jiffies */
335 unsigned long i_hold_caps_max; /* jiffies */
336 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
337 int i_cap_exporting_mds; /* to handle cap migration between */
338 unsigned i_cap_exporting_mseq; /* mds's. */
339 unsigned i_cap_exporting_issued;
340 struct ceph_cap_reservation i_cap_migration_resv;
341 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
342 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
343 unsigned i_snap_caps; /* cap bits for snapped files */
344
345 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
346
347 u32 i_truncate_seq; /* last truncate to smaller size */
348 u64 i_truncate_size; /* and the size we last truncated down to */
349 int i_truncate_pending; /* still need to call vmtruncate */
350
351 u64 i_max_size; /* max file size authorized by mds */
352 u64 i_reported_size; /* (max_)size reported to or requested of mds */
353 u64 i_wanted_max_size; /* offset we'd like to write to */
354 u64 i_requested_max_size; /* max_size we've requested */
355
356 /* held references to caps */
357 int i_pin_ref;
358 int i_rd_ref, i_rdcache_ref, i_wr_ref;
359 int i_wrbuffer_ref, i_wrbuffer_ref_head;
360 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
361 u32 i_rdcache_gen; /* we increment this each time we get
362 FILE_CACHE. If it's non-zero, we
363 _may_ have cached pages. */
364 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
365
366 struct list_head i_unsafe_writes; /* uncommitted sync writes */
367 struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
368 spinlock_t i_unsafe_lock;
369
370 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
371 int i_snap_realm_counter; /* snap realm (if caps) */
372 struct list_head i_snap_realm_item;
373 struct list_head i_snap_flush_item;
374
375 struct work_struct i_wb_work; /* writeback work */
376 struct work_struct i_pg_inv_work; /* page invalidation work */
377
378 struct work_struct i_vmtruncate_work;
379
380 struct inode vfs_inode; /* at end */
381};
382
383static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
384{
385 return container_of(inode, struct ceph_inode_info, vfs_inode);
386}
387
388static inline void ceph_i_clear(struct inode *inode, unsigned mask)
389{
390 struct ceph_inode_info *ci = ceph_inode(inode);
391
392 spin_lock(&inode->i_lock);
393 ci->i_ceph_flags &= ~mask;
394 spin_unlock(&inode->i_lock);
395}
396
397static inline void ceph_i_set(struct inode *inode, unsigned mask)
398{
399 struct ceph_inode_info *ci = ceph_inode(inode);
400
401 spin_lock(&inode->i_lock);
402 ci->i_ceph_flags |= mask;
403 spin_unlock(&inode->i_lock);
404}
405
406static inline bool ceph_i_test(struct inode *inode, unsigned mask)
407{
408 struct ceph_inode_info *ci = ceph_inode(inode);
409 bool r;
410
411 smp_mb();
412 r = (ci->i_ceph_flags & mask) == mask;
413 return r;
414}
415
416
417/* find a specific frag @f */
418extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
419 u32 f);
420
421/*
422 * choose fragment for value @v. copy frag content to pfrag, if leaf
423 * exists
424 */
425extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
426 struct ceph_inode_frag *pfrag,
427 int *found);
428
429/*
430 * Ceph dentry state
431 */
432struct ceph_dentry_info {
433 struct ceph_mds_session *lease_session;
434 u32 lease_gen, lease_shared_gen;
435 u32 lease_seq;
436 unsigned long lease_renew_after, lease_renew_from;
437 struct list_head lru;
438 struct dentry *dentry;
439 u64 time;
440 u64 offset;
441};
442
443static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
444{
445 return (struct ceph_dentry_info *)dentry->d_fsdata;
446}
447
448static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
449{
450 return ((loff_t)frag << 32) | (loff_t)off;
451}
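A quick illustration of the readdir position encoding, with made-up values:

/*
 * Example: ceph_make_fpos(0x2, 5) == 0x0000000200000005
 * The frag occupies the high 32 bits of the file position and the
 * offset within that frag occupies the low 32 bits.
 */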
452
453/*
454 * ino_t is <64 bits on many architectures, blech.
455 *
456 * don't include snap in ino hash, at least for now.
457 */
458static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
459{
460 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
461#if BITS_PER_LONG == 32
462 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
463 if (!ino)
464 ino = 1;
465#endif
466 return ino;
467}
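A worked example of the 32-bit folding above, using a hypothetical vino:

/*
 * On a 32-bit build, with vino.ino = 0x10000002a:
 *   (ino_t)vino.ino          ->  0x0000002a   (low 32 bits)
 *   ino ^= vino.ino >> 32    ->  0x0000002a ^ 0x1 == 0x0000002b
 * so the high bits still perturb the value used for inode lookup.
 */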
468
469static inline int ceph_set_ino_cb(struct inode *inode, void *data)
470{
471 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
472 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
473 return 0;
474}
475
476static inline struct ceph_vino ceph_vino(struct inode *inode)
477{
478 return ceph_inode(inode)->i_vino;
479}
480
481/* for printf-style formatting */
482#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
483
484static inline u64 ceph_ino(struct inode *inode)
485{
486 return ceph_inode(inode)->i_vino.ino;
487}
488static inline u64 ceph_snap(struct inode *inode)
489{
490 return ceph_inode(inode)->i_vino.snap;
491}
492
493static inline int ceph_ino_compare(struct inode *inode, void *data)
494{
495 struct ceph_vino *pvino = (struct ceph_vino *)data;
496 struct ceph_inode_info *ci = ceph_inode(inode);
497 return ci->i_vino.ino == pvino->ino &&
498 ci->i_vino.snap == pvino->snap;
499}
500
501static inline struct inode *ceph_find_inode(struct super_block *sb,
502 struct ceph_vino vino)
503{
504 ino_t t = ceph_vino_to_ino(vino);
505 return ilookup5(sb, t, ceph_ino_compare, &vino);
506}
507
508
509/*
510 * caps helpers
511 */
512static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
513{
514 return !RB_EMPTY_ROOT(&ci->i_caps);
515}
516
517extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
518extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
519extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
520 struct ceph_cap *cap);
521
522static inline int ceph_caps_issued(struct ceph_inode_info *ci)
523{
524 int issued;
525 spin_lock(&ci->vfs_inode.i_lock);
526 issued = __ceph_caps_issued(ci, NULL);
527 spin_unlock(&ci->vfs_inode.i_lock);
528 return issued;
529}
530
531static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
532 int touch)
533{
534 int r;
535 spin_lock(&ci->vfs_inode.i_lock);
536 r = __ceph_caps_issued_mask(ci, mask, touch);
537 spin_unlock(&ci->vfs_inode.i_lock);
538 return r;
539}
540
541static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
542{
543 return ci->i_dirty_caps | ci->i_flushing_caps;
544}
545extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
546
547extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
548extern int __ceph_caps_used(struct ceph_inode_info *ci);
549
550extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
551
552/*
553 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
554 */
555static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
556{
557 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
558 if (w & CEPH_CAP_FILE_BUFFER)
559 w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
560 return w;
561}
562
563/* what the mds thinks we want */
564extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
565
566extern void ceph_caps_init(void);
567extern void ceph_caps_finalize(void);
568extern void ceph_adjust_min_caps(int delta);
569extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
570extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
571extern void ceph_reservation_status(struct ceph_client *client,
572 int *total, int *avail, int *used,
573 int *reserved, int *min);
574
575static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
576{
577 return (struct ceph_client *)inode->i_sb->s_fs_info;
578}
579
580static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
581{
582 return (struct ceph_client *)sb->s_fs_info;
583}
584
585
586/*
587 * we keep buffered readdir results attached to file->private_data
588 */
589struct ceph_file_info {
590 int fmode; /* initialized on open */
591
592 /* readdir: position within the dir */
593 u32 frag;
594 struct ceph_mds_request *last_readdir;
595 int at_end;
596
597 /* readdir: position within a frag */
598 unsigned offset; /* offset of last chunk, adjusted for . and .. */
599 u64 next_offset; /* offset of next chunk (last_name's + 1) */
600 char *last_name; /* last entry in previous chunk */
601 struct dentry *dentry; /* next dentry (for dcache readdir) */
602 unsigned long dir_release_count;
603
604 /* used for -o dirstat read() on directories */
605 char *dir_info;
606 int dir_info_len;
607};
608
609
610
611/*
612 * snapshots
613 */
614
615/*
616 * A "snap context" is the set of existing snapshots when we
617 * write data. It is used by the OSD to guide its COW behavior.
618 *
619 * The ceph_snap_context is refcounted, and attached to each dirty
620 * page, indicating which context the dirty data belonged to when it was
621 * dirtied.
622 */
623struct ceph_snap_context {
624 atomic_t nref;
625 u64 seq;
626 int num_snaps;
627 u64 snaps[];
628};
629
630static inline struct ceph_snap_context *
631ceph_get_snap_context(struct ceph_snap_context *sc)
632{
633 /*
634 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
635 atomic_read(&sc->nref)+1);
636 */
637 if (sc)
638 atomic_inc(&sc->nref);
639 return sc;
640}
641
642static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
643{
644 if (!sc)
645 return;
646 /*
647 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
648 atomic_read(&sc->nref)-1);
649 */
650 if (atomic_dec_and_test(&sc->nref)) {
651 /*printk(" deleting snap_context %p\n", sc);*/
652 kfree(sc);
653 }
654}
655
656/*
657 * A "snap realm" describes a subset of the file hierarchy sharing
658 * the same set of snapshots that apply to it. The realms themselves
659 * are organized into a hierarchy, such that children inherit (some of)
660 * the snapshots of their parents.
661 *
662 * All inodes within the realm that have capabilities are linked into a
663 * per-realm list.
664 */
665struct ceph_snap_realm {
666 u64 ino;
667 atomic_t nref;
668 struct rb_node node;
669
670 u64 created, seq;
671 u64 parent_ino;
672 u64 parent_since; /* snapid when our current parent became so */
673
674 u64 *prior_parent_snaps; /* snaps inherited from any parents we */
675 int num_prior_parent_snaps; /* had prior to parent_since */
676 u64 *snaps; /* snaps specific to this realm */
677 int num_snaps;
678
679 struct ceph_snap_realm *parent;
680 struct list_head children; /* list of child realms */
681 struct list_head child_item;
682
683 struct list_head empty_item; /* if i have ref==0 */
684
685 /* the current set of snaps for this realm */
686 struct ceph_snap_context *cached_context;
687
688 struct list_head inodes_with_caps;
689 spinlock_t inodes_with_caps_lock;
690};
691
692
693
694/*
695 * calculate the number of pages a given length and offset map onto,
696 * if we align the data.
697 */
698static inline int calc_pages_for(u64 off, u64 len)
699{
700 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
701 (off >> PAGE_CACHE_SHIFT);
702}
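A worked example, assuming 4 KB pages:

/*
 * With PAGE_CACHE_SIZE == 4096 (PAGE_CACHE_SHIFT == 12):
 *   calc_pages_for(100, 5000)
 *     = ((100 + 5000 + 4095) >> 12) - (100 >> 12)
 *     = (9195 >> 12) - 0
 *     = 2    (bytes 100..5099 touch pages 0 and 1)
 */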
703
704
705
706/* snap.c */
707struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
708 u64 ino);
709extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
710 struct ceph_snap_realm *realm);
711extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
712 struct ceph_snap_realm *realm);
713extern int ceph_update_snap_trace(struct ceph_mds_client *m,
714 void *p, void *e, bool deletion);
715extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
716 struct ceph_mds_session *session,
717 struct ceph_msg *msg);
718extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
719extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
720 struct ceph_cap_snap *capsnap);
721extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
722
723/*
724 * a cap_snap is "pending" if it is still awaiting an in-progress
725 * sync write (that may/may not still update size, mtime, etc.).
726 */
727static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
728{
729 return !list_empty(&ci->i_cap_snaps) &&
730 list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
731 ci_item)->writing;
732}
733
734
735/* super.c */
736extern struct kmem_cache *ceph_inode_cachep;
737extern struct kmem_cache *ceph_cap_cachep;
738extern struct kmem_cache *ceph_dentry_cachep;
739extern struct kmem_cache *ceph_file_cachep;
740
741extern const char *ceph_msg_type_name(int type);
742extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
743
744#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
745 "%02x%02x%02x%02x%02x%02x"
746#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
747 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
748 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
749 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
750
751/* inode.c */
752extern const struct inode_operations ceph_file_iops;
753
754extern struct inode *ceph_alloc_inode(struct super_block *sb);
755extern void ceph_destroy_inode(struct inode *inode);
756
757extern struct inode *ceph_get_inode(struct super_block *sb,
758 struct ceph_vino vino);
759extern struct inode *ceph_get_snapdir(struct inode *parent);
760extern int ceph_fill_file_size(struct inode *inode, int issued,
761 u32 truncate_seq, u64 truncate_size, u64 size);
762extern void ceph_fill_file_time(struct inode *inode, int issued,
763 u64 time_warp_seq, struct timespec *ctime,
764 struct timespec *mtime, struct timespec *atime);
765extern int ceph_fill_trace(struct super_block *sb,
766 struct ceph_mds_request *req,
767 struct ceph_mds_session *session);
768extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
769 struct ceph_mds_session *session);
770
771extern int ceph_inode_holds_cap(struct inode *inode, int mask);
772
773extern int ceph_inode_set_size(struct inode *inode, loff_t size);
774extern void __ceph_do_pending_vmtruncate(struct inode *inode);
775extern void ceph_queue_vmtruncate(struct inode *inode);
776
777extern void ceph_queue_invalidate(struct inode *inode);
778extern void ceph_queue_writeback(struct inode *inode);
779
780extern int ceph_do_getattr(struct inode *inode, int mask);
781extern int ceph_permission(struct inode *inode, int mask);
782extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
783extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
784 struct kstat *stat);
785
786/* xattr.c */
787extern int ceph_setxattr(struct dentry *, const char *, const void *,
788 size_t, int);
789extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
790extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
791extern int ceph_removexattr(struct dentry *, const char *);
792extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
793extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
794
795/* caps.c */
796extern const char *ceph_cap_string(int c);
797extern void ceph_handle_caps(struct ceph_mds_session *session,
798 struct ceph_msg *msg);
799extern int ceph_add_cap(struct inode *inode,
800 struct ceph_mds_session *session, u64 cap_id,
801 int fmode, unsigned issued, unsigned wanted,
802 unsigned cap, unsigned seq, u64 realmino, int flags,
803 struct ceph_cap_reservation *caps_reservation);
804extern void __ceph_remove_cap(struct ceph_cap *cap);
805static inline void ceph_remove_cap(struct ceph_cap *cap)
806{
807 struct inode *inode = &cap->ci->vfs_inode;
808 spin_lock(&inode->i_lock);
809 __ceph_remove_cap(cap);
810 spin_unlock(&inode->i_lock);
811}
812extern void ceph_put_cap(struct ceph_cap *cap);
813
814extern void ceph_queue_caps_release(struct inode *inode);
815extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
816extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
817extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
818 struct ceph_mds_session *session);
819extern int ceph_get_cap_mds(struct inode *inode);
820extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
821extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
822extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
823 struct ceph_snap_context *snapc);
824extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
825 struct ceph_mds_session **psession);
826extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
827 struct ceph_mds_session *session);
828extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
829extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
830
831extern int ceph_encode_inode_release(void **p, struct inode *inode,
832 int mds, int drop, int unless, int force);
833extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
834 int mds, int drop, int unless);
835
836extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
837 int *got, loff_t endoff);
838
839/* for counting open files by mode */
840static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
841{
842 ci->i_nr_by_mode[mode]++;
843}
844extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
845
846/* addr.c */
847extern const struct address_space_operations ceph_aops;
848extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
849
850/* file.c */
851extern const struct file_operations ceph_file_fops;
852extern const struct address_space_operations ceph_aops;
853extern int ceph_open(struct inode *inode, struct file *file);
854extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
855 struct nameidata *nd, int mode,
856 int locked_dir);
857extern int ceph_release(struct inode *inode, struct file *filp);
858extern void ceph_release_page_vector(struct page **pages, int num_pages);
859
860/* dir.c */
861extern const struct file_operations ceph_dir_fops;
862extern const struct inode_operations ceph_dir_iops;
863extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
864 ceph_snapdir_dentry_ops;
865
866extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
867extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
868 struct dentry *dentry, int err);
869
870extern void ceph_dentry_lru_add(struct dentry *dn);
871extern void ceph_dentry_lru_touch(struct dentry *dn);
872extern void ceph_dentry_lru_del(struct dentry *dn);
873
874/*
875 * our d_ops vary depending on whether the inode is live,
876 * snapshotted (read-only), or a virtual ".snap" directory.
877 */
878int ceph_init_dentry(struct dentry *dentry);
879
880
881/* ioctl.c */
882extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
883
884/* export.c */
885extern const struct export_operations ceph_export_ops;
886
887/* debugfs.c */
888extern int ceph_debugfs_init(void);
889extern void ceph_debugfs_cleanup(void);
890extern int ceph_debugfs_client_init(struct ceph_client *client);
891extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
892
893static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
894{
895 if (dentry && dentry->d_parent)
896 return dentry->d_parent->d_inode;
897
898 return NULL;
899}
900
901#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..2845422907fc
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,845 @@
1#include "ceph_debug.h"
2#include "super.h"
3#include "decode.h"
4
5#include <linux/xattr.h>
6#include <linux/slab.h>
7
8static bool ceph_is_valid_xattr(const char *name)
9{
10 return !strncmp(name, XATTR_SECURITY_PREFIX,
11 XATTR_SECURITY_PREFIX_LEN) ||
12 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
13 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
14}
15
16/*
17 * These define virtual xattrs exposing the recursive directory
18 * statistics and layout metadata.
19 */
20struct ceph_vxattr_cb {
21 bool readonly;
22 char *name;
23 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
24 size_t size);
25};
26
27/* directories */
28
29static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
30 size_t size)
31{
32 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
33}
34
35static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
36 size_t size)
37{
38 return snprintf(val, size, "%lld", ci->i_files);
39}
40
41static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
42 size_t size)
43{
44 return snprintf(val, size, "%lld", ci->i_subdirs);
45}
46
47static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
48 size_t size)
49{
50 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
51}
52
53static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
54 size_t size)
55{
56 return snprintf(val, size, "%lld", ci->i_rfiles);
57}
58
59static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
60 size_t size)
61{
62 return snprintf(val, size, "%lld", ci->i_rsubdirs);
63}
64
65static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
66 size_t size)
67{
68 return snprintf(val, size, "%lld", ci->i_rbytes);
69}
70
71static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
72 size_t size)
73{
74 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
75 (long)ci->i_rctime.tv_nsec);
76}
77
78static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
79 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
80 { true, "user.ceph.dir.files", ceph_vxattrcb_files},
81 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
82 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
83 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
84 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
85 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
86 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
87 { true, NULL, NULL }
88};
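From userspace these virtual xattrs are read through the ordinary xattr interface; a minimal sketch, with a hypothetical mount point and directory:

/* userspace-only sketch, not part of this module */
#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	char buf[64];
	ssize_t n = getxattr("/mnt/ceph/somedir", "user.ceph.dir.rbytes",
			     buf, sizeof(buf) - 1);

	if (n > 0) {
		buf[n] = '\0';
		printf("recursive bytes: %s\n", buf);
	}
	return 0;
}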
89
90/* files */
91
92static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
93 size_t size)
94{
95 int ret;
96
97 ret = snprintf(val, size,
98 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
99 (unsigned long long)ceph_file_layout_su(ci->i_layout),
100 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
101 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
102 if (ceph_file_layout_pg_preferred(ci->i_layout))
103 ret += snprintf(val + ret, ret < size ? size - ret : 0,
104 "preferred_osd=%lld\n",
105 (unsigned long long)ceph_file_layout_pg_preferred(ci->i_layout));
106 return ret;
107}
108
109static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
110 { true, "user.ceph.layout", ceph_vxattrcb_layout},
111 { true, NULL, NULL }
112};
113
114static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
115{
116 if (S_ISDIR(inode->i_mode))
117 return ceph_dir_vxattrs;
118 else if (S_ISREG(inode->i_mode))
119 return ceph_file_vxattrs;
120 return NULL;
121}
122
123static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
124 const char *name)
125{
126 do {
127 if (strcmp(vxattr->name, name) == 0)
128 return vxattr;
129 vxattr++;
130 } while (vxattr->name);
131 return NULL;
132}
133
134static int __set_xattr(struct ceph_inode_info *ci,
135 const char *name, int name_len,
136 const char *val, int val_len,
137 int dirty,
138 int should_free_name, int should_free_val,
139 struct ceph_inode_xattr **newxattr)
140{
141 struct rb_node **p;
142 struct rb_node *parent = NULL;
143 struct ceph_inode_xattr *xattr = NULL;
144 int c;
145 int new = 0;
146
147 p = &ci->i_xattrs.index.rb_node;
148 while (*p) {
149 parent = *p;
150 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
151 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
152 if (c < 0)
153 p = &(*p)->rb_left;
154 else if (c > 0)
155 p = &(*p)->rb_right;
156 else {
157 if (name_len == xattr->name_len)
158 break;
159 else if (name_len < xattr->name_len)
160 p = &(*p)->rb_left;
161 else
162 p = &(*p)->rb_right;
163 }
164 xattr = NULL;
165 }
166
167 if (!xattr) {
168 new = 1;
169 xattr = *newxattr;
170 xattr->name = name;
171 xattr->name_len = name_len;
172 xattr->should_free_name = should_free_name;
173
174 ci->i_xattrs.count++;
175 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
176 } else {
177 kfree(*newxattr);
178 *newxattr = NULL;
179 if (xattr->should_free_val)
180 kfree((void *)xattr->val);
181
182 if (should_free_name) {
183 kfree((void *)name);
184 name = xattr->name;
185 }
186 ci->i_xattrs.names_size -= xattr->name_len;
187 ci->i_xattrs.vals_size -= xattr->val_len;
188 }
189 if (!xattr) {
190 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
191 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
192 val);
193 return -ENOMEM;
194 }
195 ci->i_xattrs.names_size += name_len;
196 ci->i_xattrs.vals_size += val_len;
197 if (val)
198 xattr->val = val;
199 else
200 xattr->val = "";
201
202 xattr->val_len = val_len;
203 xattr->dirty = dirty;
204 xattr->should_free_val = (val && should_free_val);
205
206 if (new) {
207 rb_link_node(&xattr->node, parent, p);
208 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
209 dout("__set_xattr_val p=%p\n", p);
210 }
211
212 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
213 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
214
215 return 0;
216}
217
218static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
219 const char *name)
220{
221 struct rb_node **p;
222 struct rb_node *parent = NULL;
223 struct ceph_inode_xattr *xattr = NULL;
224 int c;
225
226 p = &ci->i_xattrs.index.rb_node;
227 while (*p) {
228 parent = *p;
229 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
230 c = strncmp(name, xattr->name, xattr->name_len);
231 if (c < 0)
232 p = &(*p)->rb_left;
233 else if (c > 0)
234 p = &(*p)->rb_right;
235 else {
236 dout("__get_xattr %s: found %.*s\n", name,
237 xattr->val_len, xattr->val);
238 return xattr;
239 }
240 }
241
242 dout("__get_xattr %s: not found\n", name);
243
244 return NULL;
245}
246
247static void __free_xattr(struct ceph_inode_xattr *xattr)
248{
249 BUG_ON(!xattr);
250
251 if (xattr->should_free_name)
252 kfree((void *)xattr->name);
253 if (xattr->should_free_val)
254 kfree((void *)xattr->val);
255
256 kfree(xattr);
257}
258
259static int __remove_xattr(struct ceph_inode_info *ci,
260 struct ceph_inode_xattr *xattr)
261{
262 if (!xattr)
263 return -ENODATA;
264
265 rb_erase(&xattr->node, &ci->i_xattrs.index);
266
267 if (xattr->should_free_name)
268 kfree((void *)xattr->name);
269 if (xattr->should_free_val)
270 kfree((void *)xattr->val);
271
272 ci->i_xattrs.names_size -= xattr->name_len;
273 ci->i_xattrs.vals_size -= xattr->val_len;
274 ci->i_xattrs.count--;
275 kfree(xattr);
276
277 return 0;
278}
279
280static int __remove_xattr_by_name(struct ceph_inode_info *ci,
281 const char *name)
282{
283 struct rb_node **p;
284 struct ceph_inode_xattr *xattr;
285 int err;
286
287 p = &ci->i_xattrs.index.rb_node;
288 xattr = __get_xattr(ci, name);
289 err = __remove_xattr(ci, xattr);
290 return err;
291}
292
293static char *__copy_xattr_names(struct ceph_inode_info *ci,
294 char *dest)
295{
296 struct rb_node *p;
297 struct ceph_inode_xattr *xattr = NULL;
298
299 p = rb_first(&ci->i_xattrs.index);
300 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
301
302 while (p) {
303 xattr = rb_entry(p, struct ceph_inode_xattr, node);
304 memcpy(dest, xattr->name, xattr->name_len);
305 dest[xattr->name_len] = '\0';
306
307 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
308 xattr->name_len, ci->i_xattrs.names_size);
309
310 dest += xattr->name_len + 1;
311 p = rb_next(p);
312 }
313
314 return dest;
315}
316
317void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
318{
319 struct rb_node *p, *tmp;
320 struct ceph_inode_xattr *xattr = NULL;
321
322 p = rb_first(&ci->i_xattrs.index);
323
324 dout("__ceph_destroy_xattrs p=%p\n", p);
325
326 while (p) {
327 xattr = rb_entry(p, struct ceph_inode_xattr, node);
328 tmp = p;
329 p = rb_next(tmp);
330 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
331 xattr->name_len, xattr->name);
332 rb_erase(tmp, &ci->i_xattrs.index);
333
334 __free_xattr(xattr);
335 }
336
337 ci->i_xattrs.names_size = 0;
338 ci->i_xattrs.vals_size = 0;
339 ci->i_xattrs.index_version = 0;
340 ci->i_xattrs.count = 0;
341 ci->i_xattrs.index = RB_ROOT;
342}
343
344static int __build_xattrs(struct inode *inode)
345{
346 u32 namelen;
347 u32 numattr = 0;
348 void *p, *end;
349 u32 len;
350 const char *name, *val;
351 struct ceph_inode_info *ci = ceph_inode(inode);
352 int xattr_version;
353 struct ceph_inode_xattr **xattrs = NULL;
354 int err = 0;
355 int i;
356
357 dout("__build_xattrs() len=%d\n",
358 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
359
360 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
361 return 0; /* already built */
362
363 __ceph_destroy_xattrs(ci);
364
365start:
366 /* update the internal xattr rb tree */
367 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
368 p = ci->i_xattrs.blob->vec.iov_base;
369 end = p + ci->i_xattrs.blob->vec.iov_len;
370 ceph_decode_32_safe(&p, end, numattr, bad);
371 xattr_version = ci->i_xattrs.version;
372 spin_unlock(&inode->i_lock);
373
374 xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
375 GFP_NOFS);
376 err = -ENOMEM;
377 if (!xattrs)
378 goto bad_lock;
379 memset(xattrs, 0, numattr*sizeof(struct ceph_inode_xattr *));
380 for (i = 0; i < numattr; i++) {
381 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
382 GFP_NOFS);
383 if (!xattrs[i])
384 goto bad_lock;
385 }
386
387 spin_lock(&inode->i_lock);
388 if (ci->i_xattrs.version != xattr_version) {
389 /* lost a race, retry */
390 for (i = 0; i < numattr; i++)
391 kfree(xattrs[i]);
392 kfree(xattrs);
393 goto start;
394 }
395 err = -EIO;
396 while (numattr--) {
397 ceph_decode_32_safe(&p, end, len, bad);
398 namelen = len;
399 name = p;
400 p += len;
401 ceph_decode_32_safe(&p, end, len, bad);
402 val = p;
403 p += len;
404
405 err = __set_xattr(ci, name, namelen, val, len,
406 0, 0, 0, &xattrs[numattr]);
407
408 if (err < 0)
409 goto bad;
410 }
411 kfree(xattrs);
412 }
413 ci->i_xattrs.index_version = ci->i_xattrs.version;
414 ci->i_xattrs.dirty = false;
415
416 return err;
417bad_lock:
418 spin_lock(&inode->i_lock);
419bad:
420 if (xattrs) {
421 for (i = 0; i < numattr; i++)
422 kfree(xattrs[i]);
423 kfree(xattrs);
424 }
425 ci->i_xattrs.names_size = 0;
426 return err;
427}
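
__build_xattrs() cannot allocate while holding inode->i_lock, so it snapshots i_xattrs.version, drops the lock, allocates, re-takes the lock and starts over if the version changed underneath it. A small userspace sketch of that drop-lock/recheck-version retry pattern, using a pthread mutex and a dummy allocation (an assumption for illustration, not the kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct cache {
	pthread_mutex_t lock;
	unsigned long version;       /* bumped whenever the source data changes */
	unsigned long built_version; /* version the index was last built from */
	void *index;
};

static void rebuild(struct cache *c)
{
	pthread_mutex_lock(&c->lock);
	while (c->built_version < c->version) {
		unsigned long snap = c->version;
		void *mem;

		/* Can't sleep in the allocator while holding the lock. */
		pthread_mutex_unlock(&c->lock);
		mem = malloc(64);
		pthread_mutex_lock(&c->lock);

		if (c->version != snap) {
			/* Lost a race with a writer: discard and retry. */
			free(mem);
			continue;
		}
		free(c->index);
		c->index = mem;
		c->built_version = snap;
	}
	pthread_mutex_unlock(&c->lock);
}

int main(void)
{
	struct cache c = { PTHREAD_MUTEX_INITIALIZER, 1, 0, NULL };

	rebuild(&c);
	printf("built version %lu\n", c.built_version);
	free(c.index);
	return 0;
}
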
428
429static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
430 int val_size)
431{
432 /*
433 * 4 bytes for the entry count, plus 4 bytes for each xattr name length
434 * and 4 bytes for each value length
435 */
436 int size = 4 + ci->i_xattrs.count*(4 + 4) +
437 ci->i_xattrs.names_size +
438 ci->i_xattrs.vals_size;
439 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
440 ci->i_xattrs.count, ci->i_xattrs.names_size,
441 ci->i_xattrs.vals_size);
442
443 if (name_size)
444 size += 4 + 4 + name_size + val_size;
445
446 return size;
447}
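
As a worked example: with two xattrs already indexed, "user.a" (6-byte name, 3-byte value) and "user.bb" (7-byte name, 5-byte value), names_size is 13 and vals_size is 8, so the blob needs 4 + 2*(4 + 4) + 13 + 8 = 41 bytes; a pending pair adds a further 4 + 4 + name_size + val_size. A standalone sketch of the same arithmetic:

#include <stdio.h>
#include <string.h>

/* Mirror of the blob-size formula: u32 count, then per xattr a u32 name
 * length, a u32 value length, and the raw bytes of name and value. */
static int required_blob_size(int count, int names_size, int vals_size,
			      int new_name, int new_val)
{
	int size = 4 + count * (4 + 4) + names_size + vals_size;

	if (new_name)
		size += 4 + 4 + new_name + new_val;
	return size;
}

int main(void)
{
	int names = (int)(strlen("user.a") + strlen("user.bb")); /* 13 */
	int vals = 3 + 5;                                         /* 8  */

	printf("%d\n", required_blob_size(2, names, vals, 0, 0));            /* 41 */
	printf("%d\n", required_blob_size(2, names, vals,
					  (int)strlen("user.c"), 2));         /* 57 */
	return 0;
}
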
448
449/*
450 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
451 * and swap into place.
452 */
453void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
454{
455 struct rb_node *p;
456 struct ceph_inode_xattr *xattr = NULL;
457 void *dest;
458
459 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
460 if (ci->i_xattrs.dirty) {
461 int need = __get_required_blob_size(ci, 0, 0);
462
463 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
464
465 p = rb_first(&ci->i_xattrs.index);
466 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
467
468 ceph_encode_32(&dest, ci->i_xattrs.count);
469 while (p) {
470 xattr = rb_entry(p, struct ceph_inode_xattr, node);
471
472 ceph_encode_32(&dest, xattr->name_len);
473 memcpy(dest, xattr->name, xattr->name_len);
474 dest += xattr->name_len;
475 ceph_encode_32(&dest, xattr->val_len);
476 memcpy(dest, xattr->val, xattr->val_len);
477 dest += xattr->val_len;
478
479 p = rb_next(p);
480 }
481
482 /* adjust buffer len; it may be larger than we need */
483 ci->i_xattrs.prealloc_blob->vec.iov_len =
484 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
485
486 if (ci->i_xattrs.blob)
487 ceph_buffer_put(ci->i_xattrs.blob);
488 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
489 ci->i_xattrs.prealloc_blob = NULL;
490 ci->i_xattrs.dirty = false;
491 }
492}
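
The blob written above is therefore a 32-bit entry count followed, per xattr, by a 32-bit name length, the name bytes, a 32-bit value length and the value bytes. A minimal userspace decoder sketch for that layout, assuming the lengths are little-endian as produced by ceph_encode_32():

#include <stdint.h>
#include <stdio.h>

/* Read a little-endian u32 and advance the cursor. */
static uint32_t get_le32(const unsigned char **p)
{
	uint32_t v = (*p)[0] | (*p)[1] << 8 | (*p)[2] << 16 |
		     ((uint32_t)(*p)[3] << 24);
	*p += 4;
	return v;
}

static void dump_xattr_blob(const unsigned char *p, size_t len)
{
	const unsigned char *end = p + len;
	uint32_t count = get_le32(&p);

	while (count-- && p < end) {
		uint32_t name_len = get_le32(&p);
		const unsigned char *name = p;
		p += name_len;
		uint32_t val_len = get_le32(&p);
		const unsigned char *val = p;
		p += val_len;

		printf("%.*s = %.*s\n", (int)name_len, (const char *)name,
		       (int)val_len, (const char *)val);
	}
}

int main(void)
{
	/* One entry: "user.a" -> "xyz" */
	unsigned char blob[] = {
		1, 0, 0, 0,
		6, 0, 0, 0, 'u', 's', 'e', 'r', '.', 'a',
		3, 0, 0, 0, 'x', 'y', 'z',
	};

	dump_xattr_blob(blob, sizeof(blob));
	return 0;
}
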
493
494ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
495 size_t size)
496{
497 struct inode *inode = dentry->d_inode;
498 struct ceph_inode_info *ci = ceph_inode(inode);
499 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
500 int err;
501 struct ceph_inode_xattr *xattr;
502 struct ceph_vxattr_cb *vxattr = NULL;
503
504 if (!ceph_is_valid_xattr(name))
505 return -ENODATA;
506
507 /* let's see if a virtual xattr was requested */
508 if (vxattrs)
509 vxattr = ceph_match_vxattr(vxattrs, name);
510
511 spin_lock(&inode->i_lock);
512 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
513 ci->i_xattrs.version, ci->i_xattrs.index_version);
514
515 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
516 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
517 goto get_xattr;
518 } else {
519 spin_unlock(&inode->i_lock);
520 /* get xattrs from mds (if we don't already have them) */
521 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
522 if (err)
523 return err;
524 }
525
526 spin_lock(&inode->i_lock);
527
528 if (vxattr && vxattr->readonly) {
529 err = vxattr->getxattr_cb(ci, value, size);
530 goto out;
531 }
532
533 err = __build_xattrs(inode);
534 if (err < 0)
535 goto out;
536
537get_xattr:
538 err = -ENODATA; /* == ENOATTR */
539 xattr = __get_xattr(ci, name);
540 if (!xattr) {
541 if (vxattr)
542 err = vxattr->getxattr_cb(ci, value, size);
543 goto out;
544 }
545
546 err = -ERANGE;
547 if (size && size < xattr->val_len)
548 goto out;
549
550 err = xattr->val_len;
551 if (size == 0)
552 goto out;
553
554 memcpy(value, xattr->val, xattr->val_len);
555
556out:
557 spin_unlock(&inode->i_lock);
558 return err;
559}
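
From userspace this maps onto the usual getxattr(2) contract: size 0 returns only the value length, a buffer that is too small fails with ERANGE, and a missing attribute fails with ENODATA. A sketch of the two-call probe-then-fetch pattern (the path and attribute name are illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/ceph/file";   /* illustrative mount point */
	const char *name = "user.comment";
	ssize_t len = getxattr(path, name, NULL, 0);   /* size probe */
	char *buf;

	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	buf = malloc(len + 1);
	if (!buf)
		return 1;
	len = getxattr(path, name, buf, len);          /* real fetch */
	if (len < 0) {
		perror("getxattr");
		free(buf);
		return 1;
	}
	buf[len] = '\0';
	printf("%s = %s\n", name, buf);
	free(buf);
	return 0;
}
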
560
561ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
562{
563 struct inode *inode = dentry->d_inode;
564 struct ceph_inode_info *ci = ceph_inode(inode);
565 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
566 u32 vir_namelen = 0;
567 u32 namelen;
568 int err;
569 u32 len;
570 int i;
571
572 spin_lock(&inode->i_lock);
573 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
574 ci->i_xattrs.version, ci->i_xattrs.index_version);
575
576 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
577 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
578 goto list_xattr;
579 } else {
580 spin_unlock(&inode->i_lock);
581 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
582 if (err)
583 return err;
584 }
585
586 spin_lock(&inode->i_lock);
587
588 err = __build_xattrs(inode);
589 if (err < 0)
590 goto out;
591
592list_xattr:
593 vir_namelen = 0;
594 /* include virtual xattrs */
595 if (vxattrs)
596 for (i = 0; vxattrs[i].name; i++)
597 vir_namelen += strlen(vxattrs[i].name) + 1;
598 /* add 1 byte per name for the trailing null */
599 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
600 err = -ERANGE;
601 if (size && namelen > size)
602 goto out;
603
604 err = namelen;
605 if (size == 0)
606 goto out;
607
608 names = __copy_xattr_names(ci, names);
609
610 /* virtual xattr names, too */
611 if (vxattrs)
612 for (i = 0; vxattrs[i].name; i++) {
613 len = sprintf(names, "%s", vxattrs[i].name);
614 names += len + 1;
615 }
616
617out:
618 spin_unlock(&inode->i_lock);
619 return err;
620}
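
The names buffer filled here is a series of NUL-terminated strings packed back to back, with the virtual xattr names appended after the ones from the rb-tree. A userspace sketch that walks that layout via listxattr(2) (the path is illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/ceph/dir";          /* illustrative mount point */
	ssize_t len = listxattr(path, NULL, 0);      /* size probe */
	char *names, *p;

	if (len <= 0)
		return len < 0;
	names = malloc(len);
	if (!names)
		return 1;
	len = listxattr(path, names, len);
	if (len < 0) {
		free(names);
		return 1;
	}
	/* Names are packed back to back, each terminated by '\0'. */
	for (p = names; p < names + len; p += strlen(p) + 1)
		printf("%s\n", p);
	free(names);
	return 0;
}
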
621
622static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
623 const char *value, size_t size, int flags)
624{
625 struct ceph_client *client = ceph_client(dentry->d_sb);
626 struct inode *inode = dentry->d_inode;
627 struct ceph_inode_info *ci = ceph_inode(inode);
628 struct inode *parent_inode = dentry->d_parent->d_inode;
629 struct ceph_mds_request *req;
630 struct ceph_mds_client *mdsc = &client->mdsc;
631 int err;
632 int i, nr_pages;
633 struct page **pages = NULL;
634 void *kaddr;
635
636 /* copy value into some pages */
637 nr_pages = calc_pages_for(0, size);
638 if (nr_pages) {
639 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
640 if (!pages)
641 return -ENOMEM;
642 err = -ENOMEM;
643 for (i = 0; i < nr_pages; i++) {
644 pages[i] = alloc_page(GFP_NOFS);
645 if (!pages[i]) {
646 nr_pages = i;
647 goto out;
648 }
649 kaddr = kmap(pages[i]);
650 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
651 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
652 }
653 }
654
655 dout("setxattr value=%.*s\n", (int)size, value);
656
657 /* do request */
658 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
659 USE_AUTH_MDS);
660 if (IS_ERR(req)) {
661 err = PTR_ERR(req);
662 goto out;
663 }
664 req->r_inode = igrab(inode);
665 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
666 req->r_num_caps = 1;
667 req->r_args.setxattr.flags = cpu_to_le32(flags);
668 req->r_path2 = kstrdup(name, GFP_NOFS);
669
670 req->r_pages = pages;
671 req->r_num_pages = nr_pages;
672 req->r_data_len = size;
673
674 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
675 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
676 ceph_mdsc_put_request(req);
677 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
678
679out:
680 if (pages) {
681 for (i = 0; i < nr_pages; i++)
682 __free_page(pages[i]);
683 kfree(pages);
684 }
685 return err;
686}
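
The value is staged in whole pages before being attached to the request; with a zero offset, calc_pages_for(0, size) is just size rounded up to the page size, and the copy loop moves PAGE_CACHE_SIZE bytes per page with a shorter tail. A small standalone sketch of that split, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_BYTES 4096UL   /* assumed page size for the example */

/* calc_pages_for(0, len) reduces to a round-up for a zero offset. */
static unsigned long pages_for(unsigned long len)
{
	return (len + PAGE_BYTES - 1) / PAGE_BYTES;
}

int main(void)
{
	unsigned long size = 10000;   /* e.g. a 10000-byte xattr value */
	unsigned long nr = pages_for(size), i;

	for (i = 0; i < nr; i++) {
		unsigned long off = i * PAGE_BYTES;
		unsigned long chunk = size - off < PAGE_BYTES ?
				      size - off : PAGE_BYTES;
		printf("page %lu: bytes %lu..%lu (%lu bytes)\n",
		       i, off, off + chunk - 1, chunk);
	}
	return 0;
}
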
687
688int ceph_setxattr(struct dentry *dentry, const char *name,
689 const void *value, size_t size, int flags)
690{
691 struct inode *inode = dentry->d_inode;
692 struct ceph_inode_info *ci = ceph_inode(inode);
693 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
694 int err;
695 int name_len = strlen(name);
696 int val_len = size;
697 char *newname = NULL;
698 char *newval = NULL;
699 struct ceph_inode_xattr *xattr = NULL;
700 int issued;
701 int required_blob_size;
702
703 if (ceph_snap(inode) != CEPH_NOSNAP)
704 return -EROFS;
705
706 if (!ceph_is_valid_xattr(name))
707 return -EOPNOTSUPP;
708
709 if (vxattrs) {
710 struct ceph_vxattr_cb *vxattr =
711 ceph_match_vxattr(vxattrs, name);
712 if (vxattr && vxattr->readonly)
713 return -EOPNOTSUPP;
714 }
715
716 /* preallocate memory for xattr name, value, index node */
717 err = -ENOMEM;
718 newname = kmalloc(name_len + 1, GFP_NOFS);
719 if (!newname)
720 goto out;
721 memcpy(newname, name, name_len + 1);
722
723 if (val_len) {
724 newval = kmalloc(val_len + 1, GFP_NOFS);
725 if (!newval)
726 goto out;
727 memcpy(newval, value, val_len);
728 newval[val_len] = '\0';
729 }
730
731 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
732 if (!xattr)
733 goto out;
734
735 spin_lock(&inode->i_lock);
736retry:
737 issued = __ceph_caps_issued(ci, NULL);
738 if (!(issued & CEPH_CAP_XATTR_EXCL))
739 goto do_sync;
740 __build_xattrs(inode);
741
742 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
743
744 if (!ci->i_xattrs.prealloc_blob ||
745 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
746 struct ceph_buffer *blob = NULL;
747
748 spin_unlock(&inode->i_lock);
749 dout(" preaallocating new blob size=%d\n", required_blob_size);
750 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
751 if (!blob)
752 goto out;
753 spin_lock(&inode->i_lock);
754 if (ci->i_xattrs.prealloc_blob)
755 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
756 ci->i_xattrs.prealloc_blob = blob;
757 goto retry;
758 }
759
760 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
761 err = __set_xattr(ci, newname, name_len, newval,
762 val_len, 1, 1, 1, &xattr);
763 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
764 ci->i_xattrs.dirty = true;
765 inode->i_ctime = CURRENT_TIME;
766 spin_unlock(&inode->i_lock);
767
768 return err;
769
770do_sync:
771 spin_unlock(&inode->i_lock);
772 err = ceph_sync_setxattr(dentry, name, value, size, flags);
773out:
774 kfree(newname);
775 kfree(newval);
776 kfree(xattr);
777 return err;
778}
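
The in-place path above is taken only while the client holds CEPH_CAP_XATTR_EXCL: the new value then lives in the local rb-tree and is marked dirty for later writeback. Otherwise ceph_sync_setxattr() forwards the name, value and flags to the MDS. From userspace the flags are the standard setxattr(2) ones, as in this sketch (path and attribute name are illustrative):

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/mnt/ceph/file";    /* illustrative mount point */
	const char *val = "hello";

	/* XATTR_CREATE fails with EEXIST if the attribute already exists;
	 * XATTR_REPLACE fails with ENODATA if it does not. 0 allows both. */
	if (setxattr(path, "user.comment", val, strlen(val), XATTR_CREATE) < 0)
		perror("setxattr");
	return 0;
}
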
779
780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
781{
782 struct ceph_client *client = ceph_client(dentry->d_sb);
783 struct ceph_mds_client *mdsc = &client->mdsc;
784 struct inode *inode = dentry->d_inode;
785 struct inode *parent_inode = dentry->d_parent->d_inode;
786 struct ceph_mds_request *req;
787 int err;
788
789 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
790 USE_AUTH_MDS);
791 if (IS_ERR(req))
792 return PTR_ERR(req);
793 req->r_inode = igrab(inode);
794 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
795 req->r_num_caps = 1;
796 req->r_path2 = kstrdup(name, GFP_NOFS);
797
798 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
799 ceph_mdsc_put_request(req);
800 return err;
801}
802
803int ceph_removexattr(struct dentry *dentry, const char *name)
804{
805 struct inode *inode = dentry->d_inode;
806 struct ceph_inode_info *ci = ceph_inode(inode);
807 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
808 int issued;
809 int err;
810
811 if (ceph_snap(inode) != CEPH_NOSNAP)
812 return -EROFS;
813
814 if (!ceph_is_valid_xattr(name))
815 return -EOPNOTSUPP;
816
817 if (vxattrs) {
818 struct ceph_vxattr_cb *vxattr =
819 ceph_match_vxattr(vxattrs, name);
820 if (vxattr && vxattr->readonly)
821 return -EOPNOTSUPP;
822 }
823
824 spin_lock(&inode->i_lock);
825 __build_xattrs(inode);
826 issued = __ceph_caps_issued(ci, NULL);
827 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
828
829 if (!(issued & CEPH_CAP_XATTR_EXCL))
830 goto do_sync;
831
832 err = __remove_xattr_by_name(ceph_inode(inode), name);
833 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
834 ci->i_xattrs.dirty = true;
835 inode->i_ctime = CURRENT_TIME;
836
837 spin_unlock(&inode->i_lock);
838
839 return err;
840do_sync:
841 spin_unlock(&inode->i_lock);
842 err = ceph_send_removexattr(dentry, name);
843 return err;
844}
845