aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-03-19 12:43:06 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2010-03-19 12:43:06 -0400
commitfc7f99cf36ebae853639dabb43bc2f0098c59aef (patch)
tree3ca7050397f515f91ef98f8b6293f9f7fd84ef02
parent0a492fdef8aa241f6139e6455e852cc710ae8ed1 (diff)
parentf1a3d57213fe264b4cf584e78bac36aaf9998729 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (205 commits) ceph: update for write_inode API change ceph: reset osd after relevant messages timed out ceph: fix flush_dirty_caps race with caps migration ceph: include migrating caps in issued set ceph: fix osdmap decoding when pools include (removed) snaps ceph: return EBADF if waiting for caps on closed file ceph: set osd request message front length correctly ceph: reset front len on return to msgpool; BUG on mismatched front iov ceph: fix snaptrace decoding on cap migration between mds ceph: use single osd op reply msg ceph: reset bits on connection close ceph: remove bogus mds forward warning ceph: remove fragile __map_osds optimization ceph: fix connection fault STANDBY check ceph: invalidate_authorizer without con->mutex held ceph: don't clobber write return value when using O_SYNC ceph: fix client_request_forward decoding ceph: drop messages on unregistered mds sessions; cleanup ceph: fix comments, locking in destroy_inode ceph: move dereference after NULL test ... Fix trivial conflicts in Documentation/ioctl/ioctl-number.txt
-rw-r--r--Documentation/filesystems/ceph.txt139
-rw-r--r--Documentation/ioctl/ioctl-number.txt1
-rw-r--r--MAINTAINERS9
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/ceph/Kconfig27
-rw-r--r--fs/ceph/Makefile39
-rw-r--r--fs/ceph/README20
-rw-r--r--fs/ceph/addr.c1188
-rw-r--r--fs/ceph/armor.c99
-rw-r--r--fs/ceph/auth.c257
-rw-r--r--fs/ceph/auth.h84
-rw-r--r--fs/ceph/auth_none.c121
-rw-r--r--fs/ceph/auth_none.h28
-rw-r--r--fs/ceph/auth_x.c656
-rw-r--r--fs/ceph/auth_x.h49
-rw-r--r--fs/ceph/auth_x_protocol.h90
-rw-r--r--fs/ceph/buffer.c78
-rw-r--r--fs/ceph/buffer.h39
-rw-r--r--fs/ceph/caps.c2927
-rw-r--r--fs/ceph/ceph_debug.h37
-rw-r--r--fs/ceph/ceph_frag.c21
-rw-r--r--fs/ceph/ceph_frag.h109
-rw-r--r--fs/ceph/ceph_fs.c74
-rw-r--r--fs/ceph/ceph_fs.h650
-rw-r--r--fs/ceph/ceph_hash.c118
-rw-r--r--fs/ceph/ceph_hash.h13
-rw-r--r--fs/ceph/ceph_strings.c176
-rw-r--r--fs/ceph/crush/crush.c151
-rw-r--r--fs/ceph/crush/crush.h180
-rw-r--r--fs/ceph/crush/hash.c149
-rw-r--r--fs/ceph/crush/hash.h17
-rw-r--r--fs/ceph/crush/mapper.c596
-rw-r--r--fs/ceph/crush/mapper.h20
-rw-r--r--fs/ceph/crypto.c408
-rw-r--r--fs/ceph/crypto.h48
-rw-r--r--fs/ceph/debugfs.c483
-rw-r--r--fs/ceph/decode.h194
-rw-r--r--fs/ceph/dir.c1220
-rw-r--r--fs/ceph/export.c223
-rw-r--r--fs/ceph/file.c937
-rw-r--r--fs/ceph/inode.c1750
-rw-r--r--fs/ceph/ioctl.c160
-rw-r--r--fs/ceph/ioctl.h40
-rw-r--r--fs/ceph/mds_client.c3021
-rw-r--r--fs/ceph/mds_client.h335
-rw-r--r--fs/ceph/mdsmap.c174
-rw-r--r--fs/ceph/mdsmap.h54
-rw-r--r--fs/ceph/messenger.c2240
-rw-r--r--fs/ceph/messenger.h254
-rw-r--r--fs/ceph/mon_client.c834
-rw-r--r--fs/ceph/mon_client.h119
-rw-r--r--fs/ceph/msgpool.c186
-rw-r--r--fs/ceph/msgpool.h27
-rw-r--r--fs/ceph/msgr.h158
-rw-r--r--fs/ceph/osd_client.c1537
-rw-r--r--fs/ceph/osd_client.h166
-rw-r--r--fs/ceph/osdmap.c1019
-rw-r--r--fs/ceph/osdmap.h125
-rw-r--r--fs/ceph/pagelist.c54
-rw-r--r--fs/ceph/pagelist.h54
-rw-r--r--fs/ceph/rados.h374
-rw-r--r--fs/ceph/snap.c904
-rw-r--r--fs/ceph/super.c1030
-rw-r--r--fs/ceph/super.h901
-rw-r--r--fs/ceph/types.h29
-rw-r--r--fs/ceph/xattr.c844
67 files changed, 28066 insertions, 0 deletions
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
new file mode 100644
index 000000000000..6e03917316bd
--- /dev/null
+++ b/Documentation/filesystems/ceph.txt
@@ -0,0 +1,139 @@
1Ceph Distributed File System
2============================
3
4Ceph is a distributed network file system designed to provide good
5performance, reliability, and scalability.
6
7Basic features include:
8
9 * POSIX semantics
10 * Seamless scaling from 1 to many thousands of nodes
11 * High availability and reliability. No single points of failure.
12 * N-way replication of data across storage nodes
13 * Fast recovery from node failures
14 * Automatic rebalancing of data on node addition/removal
15 * Easy deployment: most FS components are userspace daemons
16
17Also,
18 * Flexible snapshots (on any directory)
19 * Recursive accounting (nested files, directories, bytes)
20
21In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
22on symmetric access by all clients to shared block devices, Ceph
23separates data and metadata management into independent server
24clusters, similar to Lustre. Unlike Lustre, however, metadata and
25storage nodes run entirely as user space daemons. Storage nodes
26utilize btrfs to store data objects, leveraging its advanced features
27(checksumming, metadata replication, etc.). File data is striped
28across storage nodes in large chunks to distribute workload and
29facilitate high throughputs. When storage nodes fail, data is
30re-replicated in a distributed fashion by the storage nodes themselves
31(with some minimal coordination from a cluster monitor), making the
32system extremely efficient and scalable.
33
34Metadata servers effectively form a large, consistent, distributed
35in-memory cache above the file namespace that is extremely scalable,
36dynamically redistributes metadata in response to workload changes,
37and can tolerate arbitrary (well, non-Byzantine) node failures. The
38metadata server takes a somewhat unconventional approach to metadata
39storage to significantly improve performance for common workloads. In
40particular, inodes with only a single link are embedded in
41directories, allowing entire directories of dentries and inodes to be
42loaded into its cache with a single I/O operation. The contents of
43extremely large directories can be fragmented and managed by
44independent metadata servers, allowing scalable concurrent access.
45
46The system offers automatic data rebalancing/migration when scaling
47from a small cluster of just a few nodes to many hundreds, without
48requiring an administrator to carve the data set into static volumes or
49go through the tedious process of migrating data between servers.
50When the file system approaches being full, new nodes can be easily added
51and things will "just work."
52
53Ceph includes a flexible snapshot mechanism that allows a user to create
54a snapshot on any subdirectory (and its nested contents) in the
55system. Snapshot creation and deletion are as simple as 'mkdir
56.snap/foo' and 'rmdir .snap/foo'.
57
58Ceph also provides some recursive accounting on directories for nested
59files and bytes. That is, a 'getfattr -d foo' on any directory in the
60system will reveal the total number of nested regular files and
61subdirectories, and a summation of all nested file sizes. This makes
62the identification of large disk space consumers relatively quick, as
63no 'du' or similar recursive scan of the file system is required.
64
65
66Mount Syntax
67============
68
69The basic mount syntax is:
70
71 # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
72
73You only need to specify a single monitor, as the client will get the
74full list when it connects. (However, if the monitor you specify
75happens to be down, the mount won't succeed.) The port can be left
76off if the monitor is using the default. So if the monitor is at
771.2.3.4,
78
79 # mount -t ceph 1.2.3.4:/ /mnt/ceph
80
81is sufficient. If /sbin/mount.ceph is installed, a hostname can be
82used instead of an IP address.
83
84
85
86Mount Options
87=============
88
89 ip=A.B.C.D[:N]
90 Specify the IP and/or port the client should bind to locally.
91 There is normally not much reason to do this. If the IP is not
92 specified, the client's IP address is determined by looking at the
93 address its connection to the monitor originates from.
94
95 wsize=X
96 Specify the maximum write size in bytes. By default there is no
97 maximum. Ceph will normally size writes based on the file stripe
98 size.
99
100 rsize=X
101 Specify the maximum readahead.
102
103 mount_timeout=X
104 Specify the timeout value for mount (in seconds), in the case
105 of a non-responsive Ceph file system. The default is 30
106 seconds.
107
108 rbytes
109 When stat() is called on a directory, set st_size to 'rbytes',
110 the summation of file sizes over all files nested beneath that
111 directory. This is the default.
112
113 norbytes
114 When stat() is called on a directory, set st_size to the
115 number of entries in that directory.
116
117 nocrc
118 Disable CRC32C calculation for data writes. If set, the OSD
119 must rely on TCP's error correction to detect data corruption
120 in the data payload.
121
122 noasyncreaddir
123 Disable the client's use of its local cache to satisfy readdir
124 requests. (This does not change correctness; the client uses
125 cached metadata only when a lease or capability ensures it is
126 valid.)
127
128
129More Information
130================
131
132For more information on Ceph, see the home page at
133 http://ceph.newdream.net/
134
135The Linux kernel client source tree is available at
136 git://ceph.newdream.net/linux-ceph-client.git
137
138and the source for the full system is at
139 git://ceph.newdream.net/ceph.git
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 35c9b51d20ea..dd5806f4fcc4 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -291,6 +291,7 @@ Code Seq#(hex) Include File Comments
2910x92 00-0F drivers/usb/mon/mon_bin.c 2910x92 00-0F drivers/usb/mon/mon_bin.c
2920x93 60-7F linux/auto_fs.h 2920x93 60-7F linux/auto_fs.h
2930x94 all fs/btrfs/ioctl.h 2930x94 all fs/btrfs/ioctl.h
2940x97 00-7F fs/ceph/ioctl.h Ceph file system
2940x99 00-0F 537-Addinboard driver 2950x99 00-0F 537-Addinboard driver
295 <mailto:buk@buks.ipn.de> 296 <mailto:buk@buks.ipn.de>
2960xA0 all linux/sdp/sdp.h Industrial Device Project 2970xA0 all linux/sdp/sdp.h Industrial Device Project
diff --git a/MAINTAINERS b/MAINTAINERS
index 382eaa4d0068..449d44402083 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1441,6 +1441,15 @@ F: arch/powerpc/include/asm/spu*.h
1441F: arch/powerpc/oprofile/*cell* 1441F: arch/powerpc/oprofile/*cell*
1442F: arch/powerpc/platforms/cell/ 1442F: arch/powerpc/platforms/cell/
1443 1443
1444CEPH DISTRIBUTED FILE SYSTEM CLIENT
1445M: Sage Weil <sage@newdream.net>
1446L: ceph-devel@lists.sourceforge.net
1447W: http://ceph.newdream.net/
1448T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
1449S: Supported
1450F: Documentation/filesystems/ceph.txt
1451F: fs/ceph
1452
1444CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: 1453CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
1445M: David Vrabel <david.vrabel@csr.com> 1454M: David Vrabel <david.vrabel@csr.com>
1446L: linux-usb@vger.kernel.org 1455L: linux-usb@vger.kernel.org
diff --git a/fs/Kconfig b/fs/Kconfig
index 7405f071be67..5f85b5947613 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config NFS_COMMON
235 235
236source "net/sunrpc/Kconfig" 236source "net/sunrpc/Kconfig"
237source "fs/smbfs/Kconfig" 237source "fs/smbfs/Kconfig"
238source "fs/ceph/Kconfig"
238source "fs/cifs/Kconfig" 239source "fs/cifs/Kconfig"
239source "fs/ncpfs/Kconfig" 240source "fs/ncpfs/Kconfig"
240source "fs/coda/Kconfig" 241source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index c3633aa46911..97f340f14ba2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
125obj-$(CONFIG_BTRFS_FS) += btrfs/ 125obj-$(CONFIG_BTRFS_FS) += btrfs/
126obj-$(CONFIG_GFS2_FS) += gfs2/ 126obj-$(CONFIG_GFS2_FS) += gfs2/
127obj-$(CONFIG_EXOFS_FS) += exofs/ 127obj-$(CONFIG_EXOFS_FS) += exofs/
128obj-$(CONFIG_CEPH_FS) += ceph/
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..04b8280582a9
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
config CEPH_FS
	tristate "Ceph distributed file system (EXPERIMENTAL)"
	depends on INET && EXPERIMENTAL
	select LIBCRC32C
	# 'select' takes the bare symbol name: "CONFIG_CRYPTO_AES" is not a
	# defined symbol, so the original line silently selected nothing and
	# the AES cipher needed by the ceph auth code could be left disabled.
	select CRYPTO_AES
	help
	  Choose Y or M here to include support for mounting the
	  experimental Ceph distributed file system.  Ceph is an extremely
	  scalable file system designed to provide high performance,
	  reliable access to petabytes of storage.

	  More information at http://ceph.newdream.net/.

	  If unsure, say N.

config CEPH_FS_PRETTYDEBUG
	bool "Include file:line in ceph debug output"
	depends on CEPH_FS
	default n
	help
	  If you say Y here, debug output will include a filename and
	  line to aid debugging.  This increases kernel size and slows
	  execution slightly when debug call sites are enabled (e.g.,
	  via CONFIG_DYNAMIC_DEBUG).

	  If unsure, say N.

diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o
8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
22#Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..23bb0ceabe31
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1188 @@
1#include "ceph_debug.h"
2
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/pagevec.h>
9#include <linux/task_io_accounting_ops.h>
10
11#include "super.h"
12#include "osd_client.h"
13
14/*
15 * Ceph address space ops.
16 *
17 * There are a few funny things going on here.
18 *
19 * The page->private field is used to reference a struct
20 * ceph_snap_context for _every_ dirty page. This indicates which
21 * snapshot the page was logically dirtied in, and thus which snap
22 * context needs to be associated with the osd write during writeback.
23 *
24 * Similarly, struct ceph_inode_info maintains a set of counters to
25 * count dirty pages on the inode. In the absence of snapshots,
26 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
27 *
28 * When a snapshot is taken (that is, when the client receives
29 * notification that a snapshot was taken), each inode with caps and
30 * with dirty pages (dirty pages implies there is a cap) gets a new
31 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
32 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
33 * moved to capsnap->dirty. (Unless a sync write is currently in
34 * progress. In that case, the capsnap is said to be "pending", new
35 * writes cannot start, and the capsnap isn't "finalized" until the
36 * write completes (or fails) and a final size/mtime for the inode for
37 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
38 *
39 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
40 * we look for the first capsnap in i_cap_snaps and write out pages in
41 * that snap context _only_. Then we move on to the next capsnap,
42 * eventually reaching the "live" or "head" context (i.e., pages that
43 * are not yet snapped) and are writing the most recently dirtied
44 * pages.
45 *
46 * Invalidate and so forth must take care to ensure the dirty page
47 * accounting is preserved.
48 */
49
50#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
51#define CONGESTION_OFF_THRESH(congestion_kb) \
52 (CONGESTION_ON_THRESH(congestion_kb) - \
53 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
54
55
56
57/*
58 * Dirty a page. Optimistically adjust accounting, on the assumption
59 * that we won't race with invalidate. If we do, readjust.
60 */
61static int ceph_set_page_dirty(struct page *page)
62{
63 struct address_space *mapping = page->mapping;
64 struct inode *inode;
65 struct ceph_inode_info *ci;
66 int undo = 0;
67 struct ceph_snap_context *snapc;
68
69 if (unlikely(!mapping))
70 return !TestSetPageDirty(page);
71
72 if (TestSetPageDirty(page)) {
73 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
74 mapping->host, page, page->index);
75 return 0;
76 }
77
78 inode = mapping->host;
79 ci = ceph_inode(inode);
80
81 /*
82 * Note that we're grabbing a snapc ref here without holding
83 * any locks!
84 */
85 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
86
87 /* dirty the head */
88 spin_lock(&inode->i_lock);
89 if (ci->i_wrbuffer_ref_head == 0)
90 ci->i_head_snapc = ceph_get_snap_context(snapc);
91 ++ci->i_wrbuffer_ref_head;
92 if (ci->i_wrbuffer_ref == 0)
93 igrab(inode);
94 ++ci->i_wrbuffer_ref;
95 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
96 "snapc %p seq %lld (%d snaps)\n",
97 mapping->host, page, page->index,
98 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
99 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
100 snapc, snapc->seq, snapc->num_snaps);
101 spin_unlock(&inode->i_lock);
102
103 /* now adjust page */
104 spin_lock_irq(&mapping->tree_lock);
105 if (page->mapping) { /* Race with truncate? */
106 WARN_ON_ONCE(!PageUptodate(page));
107
108 if (mapping_cap_account_dirty(mapping)) {
109 __inc_zone_page_state(page, NR_FILE_DIRTY);
110 __inc_bdi_stat(mapping->backing_dev_info,
111 BDI_RECLAIMABLE);
112 task_io_account_write(PAGE_CACHE_SIZE);
113 }
114 radix_tree_tag_set(&mapping->page_tree,
115 page_index(page), PAGECACHE_TAG_DIRTY);
116
117 /*
118 * Reference snap context in page->private. Also set
119 * PagePrivate so that we get invalidatepage callback.
120 */
121 page->private = (unsigned long)snapc;
122 SetPagePrivate(page);
123 } else {
124 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
125 undo = 1;
126 }
127
128 spin_unlock_irq(&mapping->tree_lock);
129
130 if (undo)
131 /* whoops, we failed to dirty the page */
132 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
133
134 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
135
136 BUG_ON(!PageDirty(page));
137 return 1;
138}
139
140/*
141 * If we are truncating the full page (i.e. offset == 0), adjust the
142 * dirty page counters appropriately. Only called if there is private
143 * data on the page.
144 */
145static void ceph_invalidatepage(struct page *page, unsigned long offset)
146{
147 struct inode *inode;
148 struct ceph_inode_info *ci;
149 struct ceph_snap_context *snapc = (void *)page->private;
150
151 BUG_ON(!PageLocked(page));
152 BUG_ON(!page->private);
153 BUG_ON(!PagePrivate(page));
154 BUG_ON(!page->mapping);
155
156 inode = page->mapping->host;
157
158 /*
159 * We can get non-dirty pages here due to races between
160 * set_page_dirty and truncate_complete_page; just spit out a
161 * warning, in case we end up with accounting problems later.
162 */
163 if (!PageDirty(page))
164 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
165
166 if (offset == 0)
167 ClearPageChecked(page);
168
169 ci = ceph_inode(inode);
170 if (offset == 0) {
171 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
172 inode, page, page->index, offset);
173 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
174 ceph_put_snap_context(snapc);
175 page->private = 0;
176 ClearPagePrivate(page);
177 } else {
178 dout("%p invalidatepage %p idx %lu partial dirty page\n",
179 inode, page, page->index);
180 }
181}
182
183/* just a sanity check */
184static int ceph_releasepage(struct page *page, gfp_t g)
185{
186 struct inode *inode = page->mapping ? page->mapping->host : NULL;
187 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
188 WARN_ON(PageDirty(page));
189 WARN_ON(page->private);
190 WARN_ON(PagePrivate(page));
191 return 0;
192}
193
194/*
195 * read a single page, without unlocking it.
196 */
197static int readpage_nounlock(struct file *filp, struct page *page)
198{
199 struct inode *inode = filp->f_dentry->d_inode;
200 struct ceph_inode_info *ci = ceph_inode(inode);
201 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
202 int err = 0;
203 u64 len = PAGE_CACHE_SIZE;
204
205 dout("readpage inode %p file %p page %p index %lu\n",
206 inode, filp, page, page->index);
207 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
208 page->index << PAGE_CACHE_SHIFT, &len,
209 ci->i_truncate_seq, ci->i_truncate_size,
210 &page, 1);
211 if (err == -ENOENT)
212 err = 0;
213 if (err < 0) {
214 SetPageError(page);
215 goto out;
216 } else if (err < PAGE_CACHE_SIZE) {
217 /* zero fill remainder of page */
218 zero_user_segment(page, err, PAGE_CACHE_SIZE);
219 }
220 SetPageUptodate(page);
221
222out:
223 return err < 0 ? err : 0;
224}
225
/*
 * address_space ->readpage: read one page, unlocking it regardless of
 * the outcome.
 */
static int ceph_readpage(struct file *filp, struct page *page)
{
	int rc;

	rc = readpage_nounlock(filp, page);
	unlock_page(page);
	return rc;
}
232
233/*
234 * Build a vector of contiguous pages from the provided page list.
235 */
236static struct page **page_vector_from_list(struct list_head *page_list,
237 unsigned *nr_pages)
238{
239 struct page **pages;
240 struct page *page;
241 int next_index, contig_pages = 0;
242
243 /* build page vector */
244 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
245 if (!pages)
246 return ERR_PTR(-ENOMEM);
247
248 BUG_ON(list_empty(page_list));
249 next_index = list_entry(page_list->prev, struct page, lru)->index;
250 list_for_each_entry_reverse(page, page_list, lru) {
251 if (page->index == next_index) {
252 dout("readpages page %d %p\n", contig_pages, page);
253 pages[contig_pages] = page;
254 contig_pages++;
255 next_index++;
256 } else {
257 break;
258 }
259 }
260 *nr_pages = contig_pages;
261 return pages;
262}
263
264/*
265 * Read multiple pages. Leave pages we don't read + unlock in page_list;
266 * the caller (VM) cleans them up.
267 */
268static int ceph_readpages(struct file *file, struct address_space *mapping,
269 struct list_head *page_list, unsigned nr_pages)
270{
271 struct inode *inode = file->f_dentry->d_inode;
272 struct ceph_inode_info *ci = ceph_inode(inode);
273 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
274 int rc = 0;
275 struct page **pages;
276 struct pagevec pvec;
277 loff_t offset;
278 u64 len;
279
280 dout("readpages %p file %p nr_pages %d\n",
281 inode, file, nr_pages);
282
283 pages = page_vector_from_list(page_list, &nr_pages);
284 if (IS_ERR(pages))
285 return PTR_ERR(pages);
286
287 /* guess read extent */
288 offset = pages[0]->index << PAGE_CACHE_SHIFT;
289 len = nr_pages << PAGE_CACHE_SHIFT;
290 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
291 offset, &len,
292 ci->i_truncate_seq, ci->i_truncate_size,
293 pages, nr_pages);
294 if (rc == -ENOENT)
295 rc = 0;
296 if (rc < 0)
297 goto out;
298
299 /* set uptodate and add to lru in pagevec-sized chunks */
300 pagevec_init(&pvec, 0);
301 for (; !list_empty(page_list) && len > 0;
302 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
303 struct page *page =
304 list_entry(page_list->prev, struct page, lru);
305
306 list_del(&page->lru);
307
308 if (rc < (int)PAGE_CACHE_SIZE) {
309 /* zero (remainder of) page */
310 int s = rc < 0 ? 0 : rc;
311 zero_user_segment(page, s, PAGE_CACHE_SIZE);
312 }
313
314 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
315 page_cache_release(page);
316 dout("readpages %p add_to_page_cache failed %p\n",
317 inode, page);
318 continue;
319 }
320 dout("readpages %p adding %p idx %lu\n", inode, page,
321 page->index);
322 flush_dcache_page(page);
323 SetPageUptodate(page);
324 unlock_page(page);
325 if (pagevec_add(&pvec, page) == 0)
326 pagevec_lru_add_file(&pvec); /* add to lru */
327 }
328 pagevec_lru_add_file(&pvec);
329 rc = 0;
330
331out:
332 kfree(pages);
333 return rc;
334}
335
336/*
337 * Get ref for the oldest snapc for an inode with dirty data... that is, the
338 * only snap context we are allowed to write back.
339 *
340 * Caller holds i_lock.
341 */
342static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
343 u64 *snap_size)
344{
345 struct ceph_inode_info *ci = ceph_inode(inode);
346 struct ceph_snap_context *snapc = NULL;
347 struct ceph_cap_snap *capsnap = NULL;
348
349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
351 capsnap->context, capsnap->dirty_pages);
352 if (capsnap->dirty_pages) {
353 snapc = ceph_get_snap_context(capsnap->context);
354 if (snap_size)
355 *snap_size = capsnap->size;
356 break;
357 }
358 }
359 if (!snapc && ci->i_snap_realm) {
360 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
361 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head);
363 }
364 return snapc;
365}
366
367static struct ceph_snap_context *get_oldest_context(struct inode *inode,
368 u64 *snap_size)
369{
370 struct ceph_snap_context *snapc = NULL;
371
372 spin_lock(&inode->i_lock);
373 snapc = __get_oldest_context(inode, snap_size);
374 spin_unlock(&inode->i_lock);
375 return snapc;
376}
377
378/*
379 * Write a single page, but leave the page locked.
380 *
381 * If we get a write error, set the page error bit, but still adjust the
382 * dirty page accounting (i.e., page is no longer dirty).
383 */
384static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
385{
386 struct inode *inode;
387 struct ceph_inode_info *ci;
388 struct ceph_client *client;
389 struct ceph_osd_client *osdc;
390 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
391 int len = PAGE_CACHE_SIZE;
392 loff_t i_size;
393 int err = 0;
394 struct ceph_snap_context *snapc;
395 u64 snap_size = 0;
396 long writeback_stat;
397
398 dout("writepage %p idx %lu\n", page, page->index);
399
400 if (!page->mapping || !page->mapping->host) {
401 dout("writepage %p - no mapping\n", page);
402 return -EFAULT;
403 }
404 inode = page->mapping->host;
405 ci = ceph_inode(inode);
406 client = ceph_inode_to_client(inode);
407 osdc = &client->osdc;
408
409 /* verify this is a writeable snap context */
410 snapc = (void *)page->private;
411 if (snapc == NULL) {
412 dout("writepage %p page %p not dirty?\n", inode, page);
413 goto out;
414 }
415 if (snapc != get_oldest_context(inode, &snap_size)) {
416 dout("writepage %p page %p snapc %p not writeable - noop\n",
417 inode, page, (void *)page->private);
418 /* we should only noop if called by kswapd */
419 WARN_ON((current->flags & PF_MEMALLOC) == 0);
420 goto out;
421 }
422
423 /* is this a partial page at end of file? */
424 if (snap_size)
425 i_size = snap_size;
426 else
427 i_size = i_size_read(inode);
428 if (i_size < page_off + len)
429 len = i_size - page_off;
430
431 dout("writepage %p page %p index %lu on %llu~%u\n",
432 inode, page, page->index, page_off, len);
433
434 writeback_stat = atomic_long_inc_return(&client->writeback_count);
435 if (writeback_stat >
436 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
437 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
438
439 set_page_writeback(page);
440 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
441 &ci->i_layout, snapc,
442 page_off, len,
443 ci->i_truncate_seq, ci->i_truncate_size,
444 &inode->i_mtime,
445 &page, 1, 0, 0, true);
446 if (err < 0) {
447 dout("writepage setting page/mapping error %d %p\n", err, page);
448 SetPageError(page);
449 mapping_set_error(&inode->i_data, err);
450 if (wbc)
451 wbc->pages_skipped++;
452 } else {
453 dout("writepage cleaned page %p\n", page);
454 err = 0; /* vfs expects us to return 0 */
455 }
456 page->private = 0;
457 ClearPagePrivate(page);
458 end_page_writeback(page);
459 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
460 ceph_put_snap_context(snapc);
461out:
462 return err;
463}
464
465static int ceph_writepage(struct page *page, struct writeback_control *wbc)
466{
467 int err;
468 struct inode *inode = page->mapping->host;
469 BUG_ON(!inode);
470 igrab(inode);
471 err = writepage_nounlock(page, wbc);
472 unlock_page(page);
473 iput(inode);
474 return err;
475}
476
477
478/*
479 * lame release_pages helper. release_pages() isn't exported to
480 * modules.
481 */
482static void ceph_release_pages(struct page **pages, int num)
483{
484 struct pagevec pvec;
485 int i;
486
487 pagevec_init(&pvec, 0);
488 for (i = 0; i < num; i++) {
489 if (pagevec_add(&pvec, pages[i]) == 0)
490 pagevec_release(&pvec);
491 }
492 pagevec_release(&pvec);
493}
494
495
/*
 * async writeback completion handler.
 *
 * If we get an error, set the mapping error bit, but not the individual
 * page error bits.
 */
static void writepages_finish(struct ceph_osd_request *req,
			      struct ceph_msg *msg)
{
	struct inode *inode = req->r_inode;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	struct ceph_inode_info *ci = ceph_inode(inode);
	unsigned wrote;
	struct page *page;
	int i;
	struct ceph_snap_context *snapc = req->r_snapc;
	struct address_space *mapping = inode->i_mapping;
	struct writeback_control *wbc = req->r_wbc;
	__s32 rc = -EIO;
	u64 bytes = 0;
	struct ceph_client *client = ceph_inode_to_client(inode);
	long writeback_stat;
	unsigned issued = __ceph_caps_issued(ci, NULL);

	/* parse reply: result code and length of the (single) write op */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);

	if (rc >= 0) {
		/*
		 * Assume we wrote the pages we originally sent. The
		 * osd might reply with fewer pages if our writeback
		 * raced with a truncation and was adjusted at the osd,
		 * so don't believe the reply.
		 */
		wrote = req->r_num_pages;
	} else {
		/* error: flag the whole mapping, not individual pages */
		wrote = 0;
		mapping_set_error(mapping, rc);
	}
	dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
	     inode, rc, bytes, wrote);

	/* clean all pages */
	for (i = 0; i < req->r_num_pages; i++) {
		page = req->r_pages[i];
		BUG_ON(!page);
		WARN_ON(!PageUptodate(page));

		/* writeback accounting; uncongest bdi if below threshold */
		writeback_stat =
			atomic_long_dec_return(&client->writeback_count);
		if (writeback_stat <
		    CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
			clear_bdi_congested(&client->backing_dev_info,
					    BLK_RW_ASYNC);

		if (i >= wrote) {
			dout("inode %p skipping page %p\n", inode, page);
			wbc->pages_skipped++;
		}
		/* drop the per-page snap context ref held via page->private */
		page->private = 0;
		ClearPagePrivate(page);
		ceph_put_snap_context(snapc);
		dout("unlocking %d %p\n", i, page);
		end_page_writeback(page);

		/*
		 * We lost the cache cap, need to truncate the page before
		 * it is unlocked, otherwise we'd truncate it later in the
		 * page truncation thread, possibly losing some data that
		 * raced its way in
		 */
		if ((issued & CEPH_CAP_FILE_CACHE) == 0)
			generic_error_remove_page(inode->i_mapping, page);

		unlock_page(page);
	}
	dout("%p wrote+cleaned %d pages\n", inode, wrote);
	ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);

	/* drop page refs, then free the page vector itself */
	ceph_release_pages(req->r_pages, req->r_num_pages);
	if (req->r_pages_from_pool)
		mempool_free(req->r_pages,
			     ceph_client(inode->i_sb)->wb_pagevec_pool);
	else
		kfree(req->r_pages);
	ceph_osdc_put_request(req);
}
588
589/*
590 * allocate a page vec, either directly, or if necessary, via a the
591 * mempool. we avoid the mempool if we can because req->r_num_pages
592 * may be less than the maximum write size.
593 */
594static void alloc_page_vec(struct ceph_client *client,
595 struct ceph_osd_request *req)
596{
597 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
598 GFP_NOFS);
599 if (!req->r_pages) {
600 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
601 req->r_pages_from_pool = 1;
602 WARN_ON(!req->r_pages);
603 }
604}
605
/*
 * initiate async writeback
 *
 * Walk the dirty pages belonging to the oldest snap context with dirty
 * data, batch up runs of consecutive pages, and submit them as a single
 * OSD write request per run.  Completion is handled asynchronously by
 * writepages_finish().
 */
static int ceph_writepages_start(struct address_space *mapping,
				 struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	struct backing_dev_info *bdi = mapping->backing_dev_info;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *client;
	pgoff_t index, start, end;
	int range_whole = 0;
	int should_loop = 1;
	pgoff_t max_pages = 0, max_pages_ever = 0;
	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
	struct pagevec pvec;
	int done = 0;
	int rc = 0;
	unsigned wsize = 1 << inode->i_blkbits;
	struct ceph_osd_request *req = NULL;
	int do_sync;
	u64 snap_size = 0;

	/*
	 * Include a 'sync' in the OSD request if this is a data
	 * integrity write (e.g., O_SYNC write or fsync()), or if our
	 * cap is being revoked.
	 */
	do_sync = wbc->sync_mode == WB_SYNC_ALL;
	if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
		do_sync = 1;
	dout("writepages_start %p dosync=%d (mode=%s)\n",
	     inode, do_sync,
	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));

	client = ceph_inode_to_client(inode);
	if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
		pr_warning("writepage_start %p on forced umount\n", inode);
		return -EIO; /* we're in a forced umount, don't write! */
	}
	/* clamp write size: mount option, but at least one page */
	if (client->mount_args->wsize && client->mount_args->wsize < wsize)
		wsize = client->mount_args->wsize;
	if (wsize < PAGE_CACHE_SIZE)
		wsize = PAGE_CACHE_SIZE;
	max_pages_ever = wsize >> PAGE_CACHE_SHIFT;

	pagevec_init(&pvec, 0);

	/* bail without blocking if the bdi is already congested */
	if (wbc->nonblocking && bdi_write_congested(bdi)) {
		dout(" writepages congested\n");
		wbc->encountered_congestion = 1;
		goto out_final;
	}

	/* where to start/end? */
	if (wbc->range_cyclic) {
		start = mapping->writeback_index; /* Start from prev offset */
		end = -1;
		dout(" cyclic, start at %lu\n", start);
	} else {
		start = wbc->range_start >> PAGE_CACHE_SHIFT;
		end = wbc->range_end >> PAGE_CACHE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
		should_loop = 0;
		dout(" not cyclic, %lu to %lu\n", start, end);
	}
	index = start;

retry:
	/* find oldest snap context with dirty data */
	ceph_put_snap_context(snapc);
	snapc = get_oldest_context(inode, &snap_size);
	if (!snapc) {
		/* hmm, why does writepages get called when there
		   is no dirty data? */
		dout(" no snap context with dirty data?\n");
		goto out;
	}
	dout(" oldest snapc is %p seq %lld (%d snaps)\n",
	     snapc, snapc->seq, snapc->num_snaps);
	if (last_snapc && snapc != last_snapc) {
		/* if we switched to a newer snapc, restart our scan at the
		 * start of the original file range. */
		dout(" snapc differs from last pass, restarting at %lu\n",
		     index);
		index = start;
	}
	last_snapc = snapc;

	while (!done && index <= end) {
		unsigned i;
		int first;
		pgoff_t next;
		int pvec_pages, locked_pages;
		struct page *page;
		int want;
		u64 offset, len;
		struct ceph_osd_request_head *reqhead;
		struct ceph_osd_op *op;
		long writeback_stat;

		next = 0;
		locked_pages = 0;
		max_pages = max_pages_ever;

get_more_pages:
		first = -1;
		/* how many more dirty pages we can take this round */
		want = min(end - index,
			   min((pgoff_t)PAGEVEC_SIZE,
			       max_pages - (pgoff_t)locked_pages) - 1)
			+ 1;
		pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
						PAGECACHE_TAG_DIRTY,
						want);
		dout("pagevec_lookup_tag got %d\n", pvec_pages);
		if (!pvec_pages && !locked_pages)
			break;
		/*
		 * Lock a run of consecutive dirty pages that all belong
		 * to the oldest snap context; any mismatch ends the run.
		 */
		for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
			page = pvec.pages[i];
			dout("? %p idx %lu\n", page, page->index);
			if (locked_pages == 0)
				lock_page(page);  /* first page */
			else if (!trylock_page(page))
				break;

			/* only dirty pages, or our accounting breaks */
			if (unlikely(!PageDirty(page)) ||
			    unlikely(page->mapping != mapping)) {
				dout("!dirty or !mapping %p\n", page);
				unlock_page(page);
				break;
			}
			if (!wbc->range_cyclic && page->index > end) {
				dout("end of range %p\n", page);
				done = 1;
				unlock_page(page);
				break;
			}
			if (next && (page->index != next)) {
				dout("not consecutive %p\n", page);
				unlock_page(page);
				break;
			}
			if (wbc->sync_mode != WB_SYNC_NONE) {
				dout("waiting on writeback %p\n", page);
				wait_on_page_writeback(page);
			}
			/* past eof (of the snap, or of the live file)? */
			if ((snap_size && page_offset(page) > snap_size) ||
			    (!snap_size &&
			     page_offset(page) > i_size_read(inode))) {
				dout("%p page eof %llu\n", page, snap_size ?
				     snap_size : i_size_read(inode));
				done = 1;
				unlock_page(page);
				break;
			}
			if (PageWriteback(page)) {
				dout("%p under writeback\n", page);
				unlock_page(page);
				break;
			}

			/* only if matching snap context */
			if (snapc != (void *)page->private) {
				dout("page snapc %p != oldest %p\n",
				     (void *)page->private, snapc);
				unlock_page(page);
				if (!locked_pages)
					continue; /* keep looking for snap */
				break;
			}

			if (!clear_page_dirty_for_io(page)) {
				dout("%p !clear_page_dirty_for_io\n", page);
				unlock_page(page);
				break;
			}

			/* ok */
			if (locked_pages == 0) {
				/* prepare async write request */
				offset = page->index << PAGE_CACHE_SHIFT;
				len = wsize;
				req = ceph_osdc_new_request(&client->osdc,
					    &ci->i_layout,
					    ceph_vino(inode),
					    offset, &len,
					    CEPH_OSD_OP_WRITE,
					    CEPH_OSD_FLAG_WRITE |
						    CEPH_OSD_FLAG_ONDISK,
					    snapc, do_sync,
					    ci->i_truncate_seq,
					    ci->i_truncate_size,
					    &inode->i_mtime, true, 1);
				/* request may span fewer pages than wsize */
				max_pages = req->r_num_pages;

				alloc_page_vec(client, req);
				req->r_callback = writepages_finish;
				req->r_inode = inode;
				req->r_wbc = wbc;
			}

			/* note position of first page in pvec */
			if (first < 0)
				first = i;
			dout("%p will write page %p idx %lu\n",
			     inode, page, page->index);

			/* writeback accounting; congest bdi above threshold */
			writeback_stat = atomic_long_inc_return(&client->writeback_count);
			if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
				set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
			}

			set_page_writeback(page);
			req->r_pages[locked_pages] = page;
			locked_pages++;
			next = page->index + 1;
		}

		/* did we get anything? */
		if (!locked_pages)
			goto release_pvec_pages;
		if (i) {
			int j;
			BUG_ON(!locked_pages || first < 0);

			if (pvec_pages && i == pvec_pages &&
			    locked_pages < max_pages) {
				dout("reached end pvec, trying for more\n");
				pagevec_reinit(&pvec);
				goto get_more_pages;
			}

			/* shift unused pages over in the pvec...  we
			 * will need to release them below. */
			for (j = i; j < pvec_pages; j++) {
				dout(" pvec leftover page %p\n",
				     pvec.pages[j]);
				pvec.pages[j-i+first] = pvec.pages[j];
			}
			pvec.nr -= i-first;
		}

		/* submit the write */
		offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
		/* don't write past eof (snap or live) */
		len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
			  (u64)locked_pages << PAGE_CACHE_SHIFT);
		dout("writepages got %d pages at %llu~%llu\n",
		     locked_pages, offset, len);

		/* revise final length, page count */
		req->r_num_pages = locked_pages;
		reqhead = req->r_request->front.iov_base;
		op = (void *)(reqhead + 1);
		op->extent.length = cpu_to_le64(len);
		op->payload_len = cpu_to_le32(len);
		req->r_request->hdr.data_len = cpu_to_le32(len);

		ceph_osdc_start_request(&client->osdc, req, true);
		req = NULL;	/* ownership passed to the osd client */

		/* continue? */
		index = next;
		wbc->nr_to_write -= locked_pages;
		if (wbc->nr_to_write <= 0)
			done = 1;

release_pvec_pages:
		dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
		     pvec.nr ? pvec.pages[0] : NULL);
		pagevec_release(&pvec);

		if (locked_pages && !done)
			goto retry;
	}

	if (should_loop && !done) {
		/* more to do; loop back to beginning of file */
		dout("writepages looping back to beginning of file\n");
		should_loop = 0;
		index = 0;
		goto retry;
	}

	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = index;

out:
	if (req)
		ceph_osdc_put_request(req);
	if (rc > 0)
		rc = 0;  /* vfs expects us to return 0 */
	ceph_put_snap_context(snapc);
	dout("writepages done, rc = %d\n", rc);
out_final:
	return rc;
}
906
907
908
909/*
910 * See if a given @snapc is either writeable, or already written.
911 */
912static int context_is_writeable_or_written(struct inode *inode,
913 struct ceph_snap_context *snapc)
914{
915 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
916 return !oldest || snapc->seq <= oldest->seq;
917}
918
/*
 * We are only allowed to write into/dirty the page if the page is
 * clean, or already dirty within the same snap context.
 *
 * On success (return 0) the page is locked, uptodate (or fully covered
 * by the pending write), and mdsc->snap_rwsem is held for read; the
 * caller (ceph_write_end / ceph_page_mkwrite) drops that lock.  On
 * failure the page is unlocked and snap_rwsem is released.  -EAGAIN
 * means the page was dirtied under an older snap context and had to be
 * written out first; the caller should grab a fresh page and retry.
 */
static int ceph_update_writeable_page(struct file *file,
			    loff_t pos, unsigned len,
			    struct page *page)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
	loff_t page_off = pos & PAGE_CACHE_MASK;
	int pos_in_page = pos & ~PAGE_CACHE_MASK;
	int end_in_page = pos_in_page + len;
	loff_t i_size;
	struct ceph_snap_context *snapc;
	int r;

retry_locked:
	/* writepages currently holds page lock, but if we change that later, */
	wait_on_page_writeback(page);

	/* check snap context */
	BUG_ON(!ci->i_snap_realm);
	down_read(&mdsc->snap_rwsem);
	BUG_ON(!ci->i_snap_realm->cached_context);
	if (page->private &&
	    (void *)page->private != ci->i_snap_realm->cached_context) {
		/*
		 * this page is already dirty in another (older) snap
		 * context!  is it writeable now?
		 */
		snapc = get_oldest_context(inode, NULL);
		up_read(&mdsc->snap_rwsem);

		if (snapc != (void *)page->private) {
			dout(" page %p snapc %p not current or oldest\n",
			     page, (void *)page->private);
			/*
			 * queue for writeback, and wait for snapc to
			 * be writeable or written
			 */
			snapc = ceph_get_snap_context((void *)page->private);
			unlock_page(page);
			ceph_queue_writeback(inode);
			wait_event_interruptible(ci->i_cap_wq,
			       context_is_writeable_or_written(inode, snapc));
			ceph_put_snap_context(snapc);
			return -EAGAIN;
		}

		/* yay, writeable, do it now (without dropping page lock) */
		dout(" page %p snapc %p not current, but oldest\n",
		     page, snapc);
		if (!clear_page_dirty_for_io(page))
			goto retry_locked;
		r = writepage_nounlock(page, NULL);
		if (r < 0)
			goto fail_nosnap;
		goto retry_locked;
	}

	if (PageUptodate(page)) {
		dout(" page %p already uptodate\n", page);
		return 0;
	}

	/* full page? the write will overwrite everything; no read needed */
	if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
		return 0;

	/* past end of file? */
	i_size = inode->i_size;   /* caller holds i_mutex */

	if (i_size + len > inode->i_sb->s_maxbytes) {
		/* file is too big */
		r = -EINVAL;
		goto fail;
	}

	/*
	 * page is beyond i_size, or the write extends to eof and covers
	 * the tail of the page: zero the parts not being written instead
	 * of reading from the osd.
	 */
	if (page_off >= i_size ||
	    (pos_in_page == 0 && (pos+len) >= i_size &&
	     end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
		dout(" zeroing %p 0 - %d and %d - %d\n",
		     page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
		zero_user_segments(page,
				   0, pos_in_page,
				   end_in_page, PAGE_CACHE_SIZE);
		return 0;
	}

	/* we need to read it. */
	up_read(&mdsc->snap_rwsem);
	r = readpage_nounlock(file, page);
	if (r < 0)
		goto fail_nosnap;
	goto retry_locked;

fail:
	up_read(&mdsc->snap_rwsem);
fail_nosnap:
	unlock_page(page);
	return r;
}
1023
1024/*
1025 * We are only allowed to write into/dirty the page if the page is
1026 * clean, or already dirty within the same snap context.
1027 */
1028static int ceph_write_begin(struct file *file, struct address_space *mapping,
1029 loff_t pos, unsigned len, unsigned flags,
1030 struct page **pagep, void **fsdata)
1031{
1032 struct inode *inode = file->f_dentry->d_inode;
1033 struct page *page;
1034 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1035 int r;
1036
1037 do {
1038 /* get a page*/
1039 page = grab_cache_page_write_begin(mapping, index, 0);
1040 if (!page)
1041 return -ENOMEM;
1042 *pagep = page;
1043
1044 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1045 inode, page, (int)pos, (int)len);
1046
1047 r = ceph_update_writeable_page(file, pos, len, page);
1048 } while (r == -EAGAIN);
1049
1050 return r;
1051}
1052
/*
 * we don't do anything in here that simple_write_end doesn't do
 * except adjust dirty page accounting and drop read lock on
 * mdsc->snap_rwsem.
 *
 * The page arrives locked with snap_rwsem held for read, both taken by
 * ceph_update_writeable_page() via ceph_write_begin().
 */
static int ceph_write_end(struct file *file, struct address_space *mapping,
			  loff_t pos, unsigned len, unsigned copied,
			  struct page *page, void *fsdata)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct ceph_client *client = ceph_inode_to_client(inode);
	struct ceph_mds_client *mdsc = &client->mdsc;
	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
	int check_cap = 0;

	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
	     inode, page, (int)pos, (int)copied, (int)len);

	/* zero the stale part of the page if we did a short copy */
	/*
	 * NOTE(review): zero_user_segment takes (start, end); passing
	 * bare 'len' as the end looks wrong whenever from > 0 (end would
	 * be less than start) — verify against zero_user_segment's
	 * contract; from+len appears intended.
	 */
	if (copied < len)
		zero_user_segment(page, from+copied, len);

	/* did file size increase? */
	/* (no need for i_size_read(); we caller holds i_mutex */
	if (pos+copied > inode->i_size)
		check_cap = ceph_inode_set_size(inode, pos+copied);

	if (!PageUptodate(page))
		SetPageUptodate(page);

	set_page_dirty(page);

	/* release the locks taken in ceph_write_begin */
	unlock_page(page);
	up_read(&mdsc->snap_rwsem);
	page_cache_release(page);

	if (check_cap)
		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);

	return copied;
}
1094
/*
 * we set .direct_IO to indicate direct io is supported, but since we
 * intercept O_DIRECT reads and writes early, this function should
 * never get called.
 */
static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
			      const struct iovec *iov,
			      loff_t pos, unsigned long nr_segs)
{
	/* unreachable by design; warn loudly if it ever fires */
	WARN_ON(1);
	return -EINVAL;
}
1107
/* address_space operations for ceph regular-file data */
const struct address_space_operations ceph_aops = {
	.readpage = ceph_readpage,
	.readpages = ceph_readpages,
	.writepage = ceph_writepage,
	.writepages = ceph_writepages_start,
	.write_begin = ceph_write_begin,
	.write_end = ceph_write_end,
	.set_page_dirty = ceph_set_page_dirty,
	.invalidatepage = ceph_invalidatepage,
	.releasepage = ceph_releasepage,
	.direct_IO = ceph_direct_io,	/* stub; O_DIRECT intercepted earlier */
};
1120
1121
/*
 * vm ops
 */

/*
 * Reuse write_begin here for simplicity.
 *
 * Called when a mapped page is about to become writable; returns with
 * the page locked (VM_FAULT_LOCKED) on success.
 */
static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = vma->vm_file->f_dentry->d_inode;
	struct page *page = vmf->page;
	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
	loff_t off = page->index << PAGE_CACHE_SHIFT;
	loff_t size, len;
	int ret;

	/* how much of this page lies within i_size? */
	size = i_size_read(inode);
	if (off + PAGE_CACHE_SIZE <= size)
		len = PAGE_CACHE_SIZE;
	else
		len = size & ~PAGE_CACHE_MASK;

	dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
	     off, len, page, page->index);

	lock_page(page);

	ret = VM_FAULT_NOPAGE;
	if ((off > size) ||
	    (page->mapping != inode->i_mapping))
		goto out;

	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
	if (ret == 0) {
		/* success.  we'll keep the page locked. */
		set_page_dirty(page);
		/* drop snap_rwsem taken by ceph_update_writeable_page */
		up_read(&mdsc->snap_rwsem);
		ret = VM_FAULT_LOCKED;
	} else {
		if (ret == -ENOMEM)
			ret = VM_FAULT_OOM;
		else
			ret = VM_FAULT_SIGBUS;
	}
out:
	dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
	if (ret != VM_FAULT_LOCKED)
		unlock_page(page);
	return ret;
}
1172
1173static struct vm_operations_struct ceph_vmops = {
1174 .fault = filemap_fault,
1175 .page_mkwrite = ceph_page_mkwrite,
1176};
1177
1178int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1179{
1180 struct address_space *mapping = file->f_mapping;
1181
1182 if (!mapping->a_ops->readpage)
1183 return -ENOEXEC;
1184 file_accessed(file);
1185 vma->vm_ops = &ceph_vmops;
1186 vma->vm_flags |= VM_CAN_NONLINEAR;
1187 return 0;
1188}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
1
2#include <linux/errno.h>
3
/*
 * base64 encode/decode.
 */

/*
 * Encoding alphabet.  static const: it is only used by this file's
 * encode helper and must never be written (the original was a writable
 * global, polluting the kernel namespace).
 */
static const char *pem_key =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

/* map a 6-bit value (0..63) to its base64 character */
static int encode_bits(int c)
{
	return pem_key[c];
}

/*
 * Map a base64 character back to its 6-bit value; '=' (padding) maps
 * to 0, anything else returns -EINVAL.
 */
static int decode_bits(char c)
{
	if (c >= 'A' && c <= 'Z')
		return c - 'A';
	if (c >= 'a' && c <= 'z')
		return c - 'a' + 26;
	if (c >= '0' && c <= '9')
		return c - '0' + 52;
	if (c == '+')
		return 62;
	if (c == '/')
		return 63;
	if (c == '=')
		return 0; /* just non-negative, please */
	return -EINVAL;
}

/*
 * Base64-encode [src, end) into dst, inserting a '\n' every 64 output
 * characters.  Returns the number of bytes written to dst.  dst must
 * have room for 4 output bytes (plus occasional newline) per 3 input
 * bytes.
 */
int ceph_armor(char *dst, const char *src, const char *end)
{
	int olen = 0;
	int line = 0;

	while (src < end) {
		unsigned char a, b, c;

		a = *src++;
		*dst++ = encode_bits(a >> 2);
		if (src < end) {
			b = *src++;
			*dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
			if (src < end) {
				c = *src++;
				*dst++ = encode_bits(((b & 15) << 2) |
						     (c >> 6));
				*dst++ = encode_bits(c & 63);
			} else {
				/* 2 input bytes left: one '=' pad */
				*dst++ = encode_bits((b & 15) << 2);
				*dst++ = '=';
			}
		} else {
			/* 1 input byte left: two '=' pads */
			*dst++ = encode_bits(((a & 3) << 4));
			*dst++ = '=';
			*dst++ = '=';
		}
		olen += 4;
		line += 4;
		if (line == 64) {
			line = 0;
			*(dst++) = '\n';
			olen++;
		}
	}
	return olen;
}

/*
 * Base64-decode [src, end) into dst.  Accepts the newline-wrapped
 * output of ceph_armor above.  Returns the number of decoded bytes, or
 * -EINVAL on a truncated quartet or invalid character.
 */
int ceph_unarmor(char *dst, const char *src, const char *end)
{
	int olen = 0;

	while (src < end) {
		int a, b, c, d;

		/* skip the line break ceph_armor inserts every 64 chars */
		if (src < end && src[0] == '\n')
			src++;
		if (src + 4 > end)
			return -EINVAL;
		a = decode_bits(src[0]);
		b = decode_bits(src[1]);
		c = decode_bits(src[2]);
		d = decode_bits(src[3]);
		if (a < 0 || b < 0 || c < 0 || d < 0)
			return -EINVAL;

		*dst++ = (a << 2) | (b >> 4);
		if (src[2] == '=')
			return olen + 1;	/* "xx==": 1 byte */
		*dst++ = ((b & 15) << 4) | (c >> 2);
		if (src[3] == '=')
			return olen + 2;	/* "xxx=": 2 bytes */
		*dst++ = ((c & 3) << 6) | d;
		olen += 3;
		src += 4;
	}
	return olen;
}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..abb204fea6c7
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,257 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/err.h>
5
6#include "types.h"
7#include "auth_none.h"
8#include "auth_x.h"
9#include "decode.h"
10#include "super.h"
11
12#include "messenger.h"
13
14/*
15 * get protocol handler
16 */
17static u32 supported_protocols[] = {
18 CEPH_AUTH_NONE,
19 CEPH_AUTH_CEPHX
20};
21
22int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
23{
24 switch (protocol) {
25 case CEPH_AUTH_NONE:
26 return ceph_auth_none_init(ac);
27 case CEPH_AUTH_CEPHX:
28 return ceph_x_init(ac);
29 default:
30 return -ENOENT;
31 }
32}
33
34/*
35 * setup, teardown.
36 */
37struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
38{
39 struct ceph_auth_client *ac;
40 int ret;
41
42 dout("auth_init name '%s' secret '%s'\n", name, secret);
43
44 ret = -ENOMEM;
45 ac = kzalloc(sizeof(*ac), GFP_NOFS);
46 if (!ac)
47 goto out;
48
49 ac->negotiating = true;
50 if (name)
51 ac->name = name;
52 else
53 ac->name = CEPH_AUTH_NAME_DEFAULT;
54 dout("auth_init name %s secret %s\n", ac->name, secret);
55 ac->secret = secret;
56 return ac;
57
58out:
59 return ERR_PTR(ret);
60}
61
/* tear down protocol-private state (if any) and free the client */
void ceph_auth_destroy(struct ceph_auth_client *ac)
{
	dout("auth_destroy %p\n", ac);
	if (ac->ops)
		ac->ops->destroy(ac);
	kfree(ac);
}
69
/*
 * Reset occurs when reconnecting to the monitor.
 */
void ceph_auth_reset(struct ceph_auth_client *ac)
{
	dout("auth_reset %p\n", ac);
	/* only reset an established protocol; mid-negotiation has no state */
	if (ac->ops && !ac->negotiating)
		ac->ops->reset(ac);
	ac->negotiating = true;
}
80
/*
 * Encode our entity name at *p as (u32 type, u32 length, bytes),
 * advancing *p.  Returns -ERANGE if it would run past @end.
 */
int ceph_entity_name_encode(const char *name, void **p, void *end)
{
	int len = strlen(name);

	/* two u32s (type + length) plus the name itself must fit */
	if (*p + 2*sizeof(u32) + len > end)
		return -ERANGE;
	ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
	ceph_encode_32(p, len);
	ceph_encode_copy(p, name, len);
	return 0;
}
92
/*
 * Initiate protocol negotiation with monitor.  Include entity name
 * and list supported protocols.
 *
 * Fills @buf (length @len) with a mon request; returns the number of
 * bytes used, or -ERANGE if the buffer is too small.
 */
int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
{
	struct ceph_mon_request_header *monhdr = buf;
	void *p = monhdr + 1, *end = buf + len, *lenp;
	int i, num;
	int ret;

	dout("auth_build_hello\n");
	monhdr->have_version = 0;
	monhdr->session_mon = cpu_to_le16(-1);
	monhdr->session_mon_tid = 0;

	ceph_encode_32(&p, 0);  /* no protocol, yet */

	/* remember where the payload length goes; filled in at the end */
	lenp = p;
	p += sizeof(u32);

	/* ceph_decode_need is (ab)used here purely as a bounds check */
	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
	ceph_encode_8(&p, 1);	/* struct version */
	num = ARRAY_SIZE(supported_protocols);
	ceph_encode_32(&p, num);
	ceph_decode_need(&p, end, num * sizeof(u32), bad);
	for (i = 0; i < num; i++)
		ceph_encode_32(&p, supported_protocols[i]);

	ret = ceph_entity_name_encode(ac->name, &p, end);
	if (ret < 0)
		return ret;
	ceph_decode_need(&p, end, sizeof(u64), bad);
	ceph_encode_64(&p, ac->global_id);

	/* backfill the payload length (everything after lenp's u32) */
	ceph_encode_32(&lenp, p - lenp - sizeof(u32));
	return p - buf;

bad:
	return -ERANGE;
}
134
/*
 * Build a protocol-specific auth request into @msg_buf via the active
 * handler's build_request op.  Returns the total message length or a
 * negative error.
 */
int ceph_build_auth_request(struct ceph_auth_client *ac,
			    void *msg_buf, size_t msg_len)
{
	struct ceph_mon_request_header *monhdr = msg_buf;
	void *p = monhdr + 1;
	void *end = msg_buf + msg_len;
	int ret;

	monhdr->have_version = 0;
	monhdr->session_mon = cpu_to_le16(-1);
	monhdr->session_mon_tid = 0;

	ceph_encode_32(&p, ac->protocol);

	/* payload is written after a u32 length slot at p */
	ret = ac->ops->build_request(ac, p + sizeof(u32), end);
	if (ret < 0) {
		pr_err("error %d building request\n", ret);
		return ret;
	}
	dout(" built request %d bytes\n", ret);
	/* backfill the payload length (advances p past the slot) */
	ceph_encode_32(&p, ret);
	return p + ret - msg_buf;
}
158
/*
 * Handle auth message from monitor.
 *
 * Decodes the reply in [buf, buf+len), switches protocol handlers if
 * the monitor selected a different one, and feeds the payload to the
 * handler.  If the handler wants another round, the next request is
 * built into @reply_buf and its length returned; 0 means done; negative
 * is an error.
 */
int ceph_handle_auth_reply(struct ceph_auth_client *ac,
			   void *buf, size_t len,
			   void *reply_buf, size_t reply_len)
{
	void *p = buf;
	void *end = buf + len;
	int protocol;
	s32 result;
	u64 global_id;
	void *payload, *payload_end;
	int payload_len;
	char *result_msg;
	int result_msg_len;
	int ret = -EINVAL;

	dout("handle_auth_reply %p %p\n", p, end);
	ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
	protocol = ceph_decode_32(&p);
	result = ceph_decode_32(&p);
	global_id = ceph_decode_64(&p);
	payload_len = ceph_decode_32(&p);
	payload = p;
	/*
	 * NOTE(review): p += payload_len is not bounds-checked before the
	 * next decode_need; an oversized payload_len from the wire would
	 * push p past end.  Verify ceph_decode_need catches p > end.
	 */
	p += payload_len;
	ceph_decode_need(&p, end, sizeof(u32), bad);
	result_msg_len = ceph_decode_32(&p);
	result_msg = p;
	p += result_msg_len;
	if (p != end)
		goto bad;	/* trailing or missing bytes */

	dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
	     result_msg, global_id, payload_len);

	payload_end = payload + payload_len;

	/* monitor may assign (or reassign) our global id */
	if (global_id && ac->global_id != global_id) {
		dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
		ac->global_id = global_id;
	}

	if (ac->negotiating) {
		/* server does not support our protocols? */
		if (!protocol && result < 0) {
			ret = result;
			goto out;
		}
		/* set up (new) protocol handler? */
		if (ac->protocol && ac->protocol != protocol) {
			ac->ops->destroy(ac);
			ac->protocol = 0;
			ac->ops = NULL;
		}
		if (ac->protocol != protocol) {
			ret = ceph_auth_init_protocol(ac, protocol);
			if (ret) {
				pr_err("error %d on auth protocol %d init\n",
				       ret, protocol);
				goto out;
			}
		}

		ac->negotiating = false;
	}

	ret = ac->ops->handle_reply(ac, result, payload, payload_end);
	if (ret == -EAGAIN) {
		/* handler wants another round-trip */
		return ceph_build_auth_request(ac, reply_buf, reply_len);
	} else if (ret) {
		pr_err("authentication error %d\n", ret);
		return ret;
	}
	return 0;

bad:
	pr_err("failed to decode auth msg\n");
out:
	return ret;
}
240
/*
 * Build the next outgoing auth message: the initial hello while no
 * protocol is negotiated, a protocol request while not yet
 * authenticated, or nothing (0) once done.
 */
int ceph_build_auth(struct ceph_auth_client *ac,
		    void *msg_buf, size_t msg_len)
{
	if (!ac->protocol)
		return ceph_auth_build_hello(ac, msg_buf, msg_len);
	BUG_ON(!ac->ops);	/* ops is set whenever protocol != 0 */
	if (!ac->ops->is_authenticated(ac))
		return ceph_build_auth_request(ac, msg_buf, msg_len);
	return 0;
}
251
252int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
253{
254 if (!ac->ops)
255 return 0;
256 return ac->ops->is_authenticated(ac);
257}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..ca4f57cfb267
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authenticate module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
struct ceph_auth_client_ops {
	/*
	 * true if we are authenticated and can connect to
	 * services.
	 */
	int (*is_authenticated)(struct ceph_auth_client *ac);

	/*
	 * build requests and process replies during monitor
	 * handshake.  if handle_reply returns -EAGAIN, we build
	 * another request.
	 */
	int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
	int (*handle_reply)(struct ceph_auth_client *ac, int result,
			    void *buf, void *end);

	/*
	 * Create authorizer for connecting to a service, and verify
	 * the response to authenticate the service.
	 */
	int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
				 struct ceph_authorizer **a,
				 void **buf, size_t *len,
				 void **reply_buf, size_t *reply_len);
	int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
				       struct ceph_authorizer *a, size_t len);
	void (*destroy_authorizer)(struct ceph_auth_client *ac,
				   struct ceph_authorizer *a);
	/* forget cached authorizers for a peer type (e.g. after rejection) */
	void (*invalidate_authorizer)(struct ceph_auth_client *ac,
				      int peer_type);

	/* reset when we (re)connect to a monitor */
	void (*reset)(struct ceph_auth_client *ac);

	/* tear down protocol-private state (ac->private) */
	void (*destroy)(struct ceph_auth_client *ac);
};
53
/* authentication handshake state, shared by all auth protocols */
struct ceph_auth_client {
	u32 protocol;           /* CEPH_AUTH_* */
	void *private;          /* for use by protocol implementation */
	const struct ceph_auth_client_ops *ops;  /* null iff protocol==0 */

	bool negotiating;       /* true if negotiating protocol */
	const char *name;       /* entity name (borrowed, not copied) */
	u64 global_id;          /* our unique id in system */
	const char *secret;     /* our secret key (borrowed, not copied) */
	unsigned want_keys;     /* which services we want */
};
65
66extern struct ceph_auth_client *ceph_auth_init(const char *name,
67 const char *secret);
68extern void ceph_auth_destroy(struct ceph_auth_client *ac);
69
70extern void ceph_auth_reset(struct ceph_auth_client *ac);
71
72extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
73 void *buf, size_t len);
74extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
75 void *buf, size_t len,
76 void *reply_buf, size_t reply_len);
77extern int ceph_entity_name_encode(const char *name, void **p, void *end);
78
79extern int ceph_build_auth(struct ceph_auth_client *ac,
80 void *msg_buf, size_t msg_len);
81
82extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
83
84#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..b4ef6f0a6c85
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,121 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7
8#include "auth_none.h"
9#include "auth.h"
10#include "decode.h"
11
/* forget handshake progress and the cached authorizer */
static void reset(struct ceph_auth_client *ac)
{
	struct ceph_auth_none_info *xi = ac->private;

	xi->starting = true;
	xi->built_authorizer = false;
}
19
/* free the kzalloc'd ceph_auth_none_info set up by ceph_auth_none_init */
static void destroy(struct ceph_auth_client *ac)
{
	kfree(ac->private);
	ac->private = NULL;
}
25
/* with the null protocol we are authenticated once any reply arrives */
static int is_authenticated(struct ceph_auth_client *ac)
{
	struct ceph_auth_none_info *xi = ac->private;

	return !xi->starting;
}
32
/*
 * the generic auth code decode the global_id, and we carry no actual
 * authenticate state, so nothing happens here.
 */
static int handle_reply(struct ceph_auth_client *ac, int result,
			void *buf, void *end)
{
	struct ceph_auth_none_info *xi = ac->private;

	/* any reply completes the (trivial) handshake */
	xi->starting = false;
	return result;
}
45
/*
 * build an 'authorizer' with our entity_name and global_id.  we can
 * reuse a single static copy since it is identical for all services
 * we connect to.
 */
static int ceph_auth_none_create_authorizer(
	struct ceph_auth_client *ac, int peer_type,
	struct ceph_authorizer **a,
	void **buf, size_t *len,
	void **reply_buf, size_t *reply_len)
{
	struct ceph_auth_none_info *ai = ac->private;
	struct ceph_none_authorizer *au = &ai->au;
	void *p, *end;
	int ret;

	if (!ai->built_authorizer) {
		p = au->buf;
		end = p + sizeof(au->buf);
		ceph_encode_8(&p, 1);	/* struct version */
		/* end - 8 reserves room for the u64 global_id below */
		ret = ceph_entity_name_encode(ac->name, &p, end - 8);
		if (ret < 0)
			goto bad;
		/* decode_need used here purely as a bounds check */
		ceph_decode_need(&p, end, sizeof(u64), bad2);
		ceph_encode_64(&p, ac->global_id);
		au->buf_len = p - (void *)au->buf;
		ai->built_authorizer = true;
		dout("built authorizer len %d\n", au->buf_len);
	}

	*a = (struct ceph_authorizer *)au;
	*buf = au->buf;
	*len = au->buf_len;
	*reply_buf = au->reply_buf;
	*reply_len = sizeof(au->reply_buf);
	return 0;

bad2:
	ret = -ERANGE;
bad:
	return ret;
}
88
/* the single authorizer lives inside ceph_auth_none_info: no-op */
static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
					      struct ceph_authorizer *a)
{
	/* nothing to do */
}
94
/* null-protocol handler ops (no verify/invalidate: no crypto involved) */
static const struct ceph_auth_client_ops ceph_auth_none_ops = {
	.reset = reset,
	.destroy = destroy,
	.is_authenticated = is_authenticated,
	.handle_reply = handle_reply,
	.create_authorizer = ceph_auth_none_create_authorizer,
	.destroy_authorizer = ceph_auth_none_destroy_authorizer,
};
103
/*
 * Install the null auth protocol on @ac.  Allocates the private info
 * struct (freed by destroy()).  Returns 0 or -ENOMEM.
 */
int ceph_auth_none_init(struct ceph_auth_client *ac)
{
	struct ceph_auth_none_info *xi;

	dout("ceph_auth_none_init %p\n", ac);
	xi = kzalloc(sizeof(*xi), GFP_NOFS);
	if (!xi)
		return -ENOMEM;

	xi->starting = true;
	xi->built_authorizer = false;

	ac->protocol = CEPH_AUTH_NONE;
	ac->private = xi;
	ac->ops = &ceph_auth_none_ops;
	return 0;
}
121
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..56c05533a31c
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include "auth.h"
5
/*
 * null security mode.
 *
 * we use a single static authorizer that simply encodes our entity name
 * and global id.
 */

/* pre-encoded static authorizer (built once by
 * ceph_auth_none_create_authorizer) */
struct ceph_none_authorizer {
	char buf[128];		/* encoded: u8 1, entity name, u64 global_id */
	int buf_len;		/* bytes of buf actually used */
	char reply_buf[0];	/* zero-length: no reply payload expected */
};

struct ceph_auth_none_info {
	bool starting;		/* set true by ceph_auth_none_init() */
	bool built_authorizer;	/* au has been encoded */
	struct ceph_none_authorizer au;   /* we only need one; it's static */
};

extern int ceph_auth_none_init(struct ceph_auth_client *ac);
26
27#endif
28
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..f0318427b6da
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,656 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7
8#include "auth_x.h"
9#include "auth_x_protocol.h"
10#include "crypto.h"
11#include "auth.h"
12#include "decode.h"
13
14struct kmem_cache *ceph_x_ticketbuf_cachep;
15
16#define TEMP_TICKET_BUF_LEN 256
17
18static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
19
20static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
21{
22 struct ceph_x_info *xi = ac->private;
23 int need;
24
25 ceph_x_validate_tickets(ac, &need);
26 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
27 ac->want_keys, need, xi->have_keys);
28 return (ac->want_keys & xi->have_keys) == ac->want_keys;
29}
30
/*
 * Encrypt {header, ibuf} with @secret into @obuf, prefixing the result
 * with a u32 ciphertext length.  Returns total bytes written to obuf
 * (length word + ciphertext) or a negative errno from ceph_encrypt2().
 */
static int ceph_x_encrypt(struct ceph_crypto_key *secret,
			  void *ibuf, int ilen, void *obuf, size_t olen)
{
	struct ceph_x_encrypt_header head = {
		.struct_v = 1,
		.magic = cpu_to_le64(CEPHX_ENC_MAGIC)
	};
	size_t len = olen - sizeof(u32);	/* room left after the length word */
	int ret;

	ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
			    &head, sizeof(head), ibuf, ilen);
	if (ret)
		return ret;
	ceph_encode_32(&obuf, len);	/* fill in the actual ciphertext length */
	return len + sizeof(u32);
}
48
49static int ceph_x_decrypt(struct ceph_crypto_key *secret,
50 void **p, void *end, void *obuf, size_t olen)
51{
52 struct ceph_x_encrypt_header head;
53 size_t head_len = sizeof(head);
54 int len, ret;
55
56 len = ceph_decode_32(p);
57 if (*p + len > end)
58 return -EINVAL;
59
60 dout("ceph_x_decrypt len %d\n", len);
61 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
62 *p, len);
63 if (ret)
64 return ret;
65 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
66 return -EPERM;
67 *p += len;
68 return olen;
69}
70
71/*
72 * get existing (or insert new) ticket handler
73 */
/*
 * get existing (or insert new) ticket handler
 *
 * Look up the handler for @service in xi->ticket_handlers; if absent,
 * allocate a zeroed one and insert it.  Returns the handler or
 * ERR_PTR(-ENOMEM) -- never NULL, so callers must test with IS_ERR().
 */
struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
						 int service)
{
	struct ceph_x_ticket_handler *th;
	struct ceph_x_info *xi = ac->private;
	struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;

	/* standard rbtree descent, keyed by service id */
	while (*p) {
		parent = *p;
		th = rb_entry(parent, struct ceph_x_ticket_handler, node);
		if (service < th->service)
			p = &(*p)->rb_left;
		else if (service > th->service)
			p = &(*p)->rb_right;
		else
			return th;
	}

	/* add it */
	th = kzalloc(sizeof(*th), GFP_NOFS);
	if (!th)
		return ERR_PTR(-ENOMEM);
	th->service = service;
	rb_link_node(&th->node, parent, p);
	rb_insert_color(&th->node, &xi->ticket_handlers);
	return th;
}
101
/* Unlink @th from the handler tree and free it along with its session
 * key and ticket blob reference. */
static void remove_ticket_handler(struct ceph_auth_client *ac,
				  struct ceph_x_ticket_handler *th)
{
	struct ceph_x_info *xi = ac->private;

	dout("remove_ticket_handler %p %d\n", th, th->service);
	rb_erase(&th->node, &xi->ticket_handlers);
	ceph_crypto_key_destroy(&th->session_key);
	if (th->ticket_blob)
		ceph_buffer_put(th->ticket_blob);
	kfree(th);
}
114
/*
 * Parse a cephx ticket reply: a u8 version, a u32 count, then that many
 * tickets.  Each ticket carries a section encrypted with @secret (new
 * session key + validity window) and a service ticket blob, possibly
 * encrypted with the *previous* session key.  Updates/creates the
 * per-service ticket handlers and records which keys we now hold in
 * xi->have_keys.  Returns 0 on success or a negative errno.
 */
static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
				    struct ceph_crypto_key *secret,
				    void *buf, void *end)
{
	struct ceph_x_info *xi = ac->private;
	int num;
	void *p = buf;
	int ret;
	char *dbuf;
	char *ticket_buf;
	u8 struct_v;

	/* scratch buffers for the decrypted section and the ticket blob */
	dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
	if (!dbuf)
		return -ENOMEM;

	ret = -ENOMEM;
	ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
				      GFP_NOFS | GFP_ATOMIC);
	if (!ticket_buf)
		goto out_dbuf;

	ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
	struct_v = ceph_decode_8(&p);
	if (struct_v != 1)
		goto bad;
	num = ceph_decode_32(&p);
	dout("%d tickets\n", num);
	while (num--) {
		int type;
		u8 struct_v;
		struct ceph_x_ticket_handler *th;
		void *dp, *dend;
		int dlen;
		char is_enc;
		struct timespec validity;
		struct ceph_crypto_key old_key;
		void *tp, *tpend;

		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);

		type = ceph_decode_32(&p);
		dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));

		struct_v = ceph_decode_8(&p);
		if (struct_v != 1)
			goto bad;

		th = get_ticket_handler(ac, type);
		if (IS_ERR(th)) {
			ret = PTR_ERR(th);
			goto out;
		}

		/* blob for me */
		dlen = ceph_x_decrypt(secret, &p, end, dbuf,
				      TEMP_TICKET_BUF_LEN);
		if (dlen <= 0) {
			/* NOTE(review): dlen == 0 makes ret = 0 (success)
			 * here -- confirm an empty decrypt is impossible */
			ret = dlen;
			goto out;
		}
		dout(" decrypted %d bytes\n", dlen);
		dend = dbuf + dlen;
		dp = dbuf;

		struct_v = ceph_decode_8(&dp);
		if (struct_v != 1)
			goto bad;

		/* keep the old session key: the service blob below may
		 * still be encrypted with it */
		memcpy(&old_key, &th->session_key, sizeof(old_key));
		ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
		if (ret)
			goto out;

		ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
		ceph_decode_timespec(&validity, &th->validity);
		th->expires = get_seconds() + validity.tv_sec;
		/* renew once 3/4 of the validity window has elapsed */
		th->renew_after = th->expires - (validity.tv_sec / 4);
		dout(" expires=%lu renew_after=%lu\n", th->expires,
		     th->renew_after);

		/* ticket blob for service */
		ceph_decode_8_safe(&p, end, is_enc, bad);
		tp = ticket_buf;
		if (is_enc) {
			/* encrypted */
			dout(" encrypted ticket\n");
			dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
					      TEMP_TICKET_BUF_LEN);
			if (dlen < 0) {
				ret = dlen;
				goto out;
			}
			dlen = ceph_decode_32(&tp);
		} else {
			/* unencrypted */
			ceph_decode_32_safe(&p, end, dlen, bad);
			ceph_decode_need(&p, end, dlen, bad);
			ceph_decode_copy(&p, ticket_buf, dlen);
		}
		tpend = tp + dlen;
		dout(" ticket blob is %d bytes\n", dlen);
		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
		struct_v = ceph_decode_8(&tp);	/* blob version (not checked) */
		th->secret_id = ceph_decode_64(&tp);
		ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
		if (ret)
			goto out;
		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
		     type, ceph_entity_type_name(type), th->secret_id,
		     (int)th->ticket_blob->vec.iov_len);
		xi->have_keys |= th->service;
	}

	ret = 0;
out:
	kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
out_dbuf:
	kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
	return ret;

bad:
	ret = -EINVAL;
	goto out;
}
240
/*
 * Build an authorizer for @th's service into @au: msg_a (global id,
 * service id, ticket blob) followed by msg_b (a fresh random nonce)
 * encrypted with the session key.  au->buf is reused when large
 * enough, otherwise reallocated.  Returns 0 or a negative errno.
 */
static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
				   struct ceph_x_ticket_handler *th,
				   struct ceph_x_authorizer *au)
{
	int len;
	struct ceph_x_authorize_a *msg_a;
	struct ceph_x_authorize_b msg_b;
	void *p, *end;
	int ret;
	int ticket_blob_len =
		(th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);

	dout("build_authorizer for %s %p\n",
	     ceph_entity_type_name(th->service), au);

	/* +16 bytes of slack for the encryption header/padding */
	len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
		ticket_blob_len + 16;
	dout(" need len %d\n", len);
	if (au->buf && au->buf->alloc_len < len) {
		/* existing buffer too small; drop it and reallocate */
		ceph_buffer_put(au->buf);
		au->buf = NULL;
	}
	if (!au->buf) {
		au->buf = ceph_buffer_new(len, GFP_NOFS);
		if (!au->buf)
			return -ENOMEM;
	}
	au->service = th->service;

	msg_a = au->buf->vec.iov_base;
	msg_a->struct_v = 1;
	msg_a->global_id = cpu_to_le64(ac->global_id);
	msg_a->service_id = cpu_to_le32(th->service);
	msg_a->ticket_blob.struct_v = 1;
	msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
	msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
	if (ticket_blob_len) {
		memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
		       th->ticket_blob->vec.iov_len);
	}
	dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
	     le64_to_cpu(msg_a->ticket_blob.secret_id));

	/* msg_b goes right after msg_a's flexible ticket blob */
	p = msg_a + 1;
	p += ticket_blob_len;
	end = au->buf->vec.iov_base + au->buf->vec.iov_len;

	get_random_bytes(&au->nonce, sizeof(au->nonce));
	msg_b.struct_v = 1;
	msg_b.nonce = cpu_to_le64(au->nonce);
	ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
			     p, end - p);
	if (ret < 0)
		goto out_buf;
	p += ret;
	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
	dout(" built authorizer nonce %llx len %d\n", au->nonce,
	     (int)au->buf->vec.iov_len);
	return 0;

out_buf:
	ceph_buffer_put(au->buf);
	au->buf = NULL;
	return ret;
}
306
/*
 * Encode @th's existing ticket at *p (bounded by @end): u8 version,
 * u64 secret_id, then a length-prefixed blob (length 0 if we have
 * none).  Returns 0 or -ERANGE if the buffer is too small.
 */
static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
				void **p, void *end)
{
	/* decode_need is used here purely as an encode-side bounds check */
	ceph_decode_need(p, end, 1 + sizeof(u64), bad);
	ceph_encode_8(p, 1);
	ceph_encode_64(p, th->secret_id);
	if (th->ticket_blob) {
		const char *buf = th->ticket_blob->vec.iov_base;
		u32 len = th->ticket_blob->vec.iov_len;

		ceph_encode_32_safe(p, end, len, bad);
		ceph_encode_copy_safe(p, end, buf, len, bad);
	} else {
		ceph_encode_32_safe(p, end, 0, bad);
	}

	return 0;
bad:
	return -ERANGE;
}
327
328static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
329{
330 int want = ac->want_keys;
331 struct ceph_x_info *xi = ac->private;
332 int service;
333
334 *pneed = ac->want_keys & ~(xi->have_keys);
335
336 for (service = 1; service <= want; service <<= 1) {
337 struct ceph_x_ticket_handler *th;
338
339 if (!(ac->want_keys & service))
340 continue;
341
342 if (*pneed & service)
343 continue;
344
345 th = get_ticket_handler(ac, service);
346
347 if (!th) {
348 *pneed |= service;
349 continue;
350 }
351
352 if (get_seconds() >= th->renew_after)
353 *pneed |= service;
354 if (get_seconds() >= th->expires)
355 xi->have_keys &= ~service;
356 }
357}
358
359
360static int ceph_x_build_request(struct ceph_auth_client *ac,
361 void *buf, void *end)
362{
363 struct ceph_x_info *xi = ac->private;
364 int need;
365 struct ceph_x_request_header *head = buf;
366 int ret;
367 struct ceph_x_ticket_handler *th =
368 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
369
370 ceph_x_validate_tickets(ac, &need);
371
372 dout("build_request want %x have %x need %x\n",
373 ac->want_keys, xi->have_keys, need);
374
375 if (need & CEPH_ENTITY_TYPE_AUTH) {
376 struct ceph_x_authenticate *auth = (void *)(head + 1);
377 void *p = auth + 1;
378 struct ceph_x_challenge_blob tmp;
379 char tmp_enc[40];
380 u64 *u;
381
382 if (p > end)
383 return -ERANGE;
384
385 dout(" get_auth_session_key\n");
386 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
387
388 /* encrypt and hash */
389 get_random_bytes(&auth->client_challenge, sizeof(u64));
390 tmp.client_challenge = auth->client_challenge;
391 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
392 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
393 tmp_enc, sizeof(tmp_enc));
394 if (ret < 0)
395 return ret;
396
397 auth->struct_v = 1;
398 auth->key = 0;
399 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
400 auth->key ^= *u;
401 dout(" server_challenge %llx client_challenge %llx key %llx\n",
402 xi->server_challenge, le64_to_cpu(auth->client_challenge),
403 le64_to_cpu(auth->key));
404
405 /* now encode the old ticket if exists */
406 ret = ceph_x_encode_ticket(th, &p, end);
407 if (ret < 0)
408 return ret;
409
410 return p - buf;
411 }
412
413 if (need) {
414 void *p = head + 1;
415 struct ceph_x_service_ticket_request *req;
416
417 if (p > end)
418 return -ERANGE;
419 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
420
421 BUG_ON(!th);
422 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
423 if (ret)
424 return ret;
425 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
426 xi->auth_authorizer.buf->vec.iov_len);
427
428 req = p;
429 req->keys = cpu_to_le32(need);
430 p += sizeof(*req);
431 return p - buf;
432 }
433
434 return 0;
435}
436
437static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
438 void *buf, void *end)
439{
440 struct ceph_x_info *xi = ac->private;
441 struct ceph_x_reply_header *head = buf;
442 struct ceph_x_ticket_handler *th;
443 int len = end - buf;
444 int op;
445 int ret;
446
447 if (result)
448 return result; /* XXX hmm? */
449
450 if (xi->starting) {
451 /* it's a hello */
452 struct ceph_x_server_challenge *sc = buf;
453
454 if (len != sizeof(*sc))
455 return -EINVAL;
456 xi->server_challenge = le64_to_cpu(sc->server_challenge);
457 dout("handle_reply got server challenge %llx\n",
458 xi->server_challenge);
459 xi->starting = false;
460 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
461 return -EAGAIN;
462 }
463
464 op = le32_to_cpu(head->op);
465 result = le32_to_cpu(head->result);
466 dout("handle_reply op %d result %d\n", op, result);
467 switch (op) {
468 case CEPHX_GET_AUTH_SESSION_KEY:
469 /* verify auth key */
470 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
471 buf + sizeof(*head), end);
472 break;
473
474 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
475 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
476 BUG_ON(!th);
477 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
478 buf + sizeof(*head), end);
479 break;
480
481 default:
482 return -EINVAL;
483 }
484 if (ret)
485 return ret;
486 if (ac->want_keys == xi->have_keys)
487 return 0;
488 return -EAGAIN;
489}
490
491static int ceph_x_create_authorizer(
492 struct ceph_auth_client *ac, int peer_type,
493 struct ceph_authorizer **a,
494 void **buf, size_t *len,
495 void **reply_buf, size_t *reply_len)
496{
497 struct ceph_x_authorizer *au;
498 struct ceph_x_ticket_handler *th;
499 int ret;
500
501 th = get_ticket_handler(ac, peer_type);
502 if (IS_ERR(th))
503 return PTR_ERR(th);
504
505 au = kzalloc(sizeof(*au), GFP_NOFS);
506 if (!au)
507 return -ENOMEM;
508
509 ret = ceph_x_build_authorizer(ac, th, au);
510 if (ret) {
511 kfree(au);
512 return ret;
513 }
514
515 *a = (struct ceph_authorizer *)au;
516 *buf = au->buf->vec.iov_base;
517 *len = au->buf->vec.iov_len;
518 *reply_buf = au->reply_buf;
519 *reply_len = sizeof(au->reply_buf);
520 return 0;
521}
522
523static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
524 struct ceph_authorizer *a, size_t len)
525{
526 struct ceph_x_authorizer *au = (void *)a;
527 struct ceph_x_ticket_handler *th;
528 int ret = 0;
529 struct ceph_x_authorize_reply reply;
530 void *p = au->reply_buf;
531 void *end = p + sizeof(au->reply_buf);
532
533 th = get_ticket_handler(ac, au->service);
534 if (!th)
535 return -EIO; /* hrm! */
536 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
537 if (ret < 0)
538 return ret;
539 if (ret != sizeof(reply))
540 return -EPERM;
541
542 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
543 ret = -EPERM;
544 else
545 ret = 0;
546 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
547 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
548 return ret;
549}
550
/* Free an authorizer created by ceph_x_create_authorizer(). */
static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
				      struct ceph_authorizer *a)
{
	struct ceph_x_authorizer *au = (void *)a;

	ceph_buffer_put(au->buf);	/* drop our ref on the encoded buffer */
	kfree(au);
}
559
560
/* Forget any in-progress handshake so authentication restarts from the
 * server hello (existing ticket handlers are kept). */
static void ceph_x_reset(struct ceph_auth_client *ac)
{
	struct ceph_x_info *xi = ac->private;

	dout("reset\n");
	xi->starting = true;
	xi->server_challenge = 0;
}
569
/*
 * Tear down all cephx state: the shared secret, every cached ticket
 * handler, the ticket scratch-buffer cache, and the private info
 * struct itself.
 */
static void ceph_x_destroy(struct ceph_auth_client *ac)
{
	struct ceph_x_info *xi = ac->private;
	struct rb_node *p;

	dout("ceph_x_destroy %p\n", ac);
	ceph_crypto_key_destroy(&xi->secret);

	while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
		struct ceph_x_ticket_handler *th =
			rb_entry(p, struct ceph_x_ticket_handler, node);
		remove_ticket_handler(ac, th);
	}

	/* NOTE(review): this destroys the file-global cache created in
	 * ceph_x_init(); assumes a single active cephx client -- confirm */
	kmem_cache_destroy(ceph_x_ticketbuf_cachep);

	kfree(ac->private);
	ac->private = NULL;
}
589
590static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
591 int peer_type)
592{
593 struct ceph_x_ticket_handler *th;
594
595 th = get_ticket_handler(ac, peer_type);
596 if (th && !IS_ERR(th))
597 remove_ticket_handler(ac, th);
598}
599
600
/* Ops vector for the cephx auth protocol; see struct
 * ceph_auth_client_ops (auth.h) for the contract of each hook. */
static const struct ceph_auth_client_ops ceph_x_ops = {
	.is_authenticated = ceph_x_is_authenticated,
	.build_request = ceph_x_build_request,
	.handle_reply = ceph_x_handle_reply,
	.create_authorizer = ceph_x_create_authorizer,
	.verify_authorizer_reply = ceph_x_verify_authorizer_reply,
	.destroy_authorizer = ceph_x_destroy_authorizer,
	.invalidate_authorizer = ceph_x_invalidate_authorizer,
	.reset = ceph_x_reset,
	.destroy = ceph_x_destroy,
};
612
613
/*
 * Initialize cephx state for @ac: allocate the private info, create
 * the ticket scratch-buffer cache, and unarmor the shared secret from
 * ac->secret.  Returns 0, -ENOMEM, or -EINVAL if no secret is set.
 */
int ceph_x_init(struct ceph_auth_client *ac)
{
	struct ceph_x_info *xi;
	int ret;

	dout("ceph_x_init %p\n", ac);
	xi = kzalloc(sizeof(*xi), GFP_NOFS);
	if (!xi)
		return -ENOMEM;

	ret = -ENOMEM;
	/* NOTE(review): the cache is a file-scope global; a second
	 * ceph_x_init() would recreate it -- confirm single-client use */
	ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
				      TEMP_TICKET_BUF_LEN, 8,
				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
				      NULL);
	if (!ceph_x_ticketbuf_cachep)
		goto done_nomem;
	ret = -EINVAL;
	if (!ac->secret) {
		pr_err("no secret set (for auth_x protocol)\n");
		goto done_nomem;
	}

	ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
	if (ret)
		goto done_nomem;

	xi->starting = true;
	xi->ticket_handlers = RB_ROOT;

	ac->protocol = CEPH_AUTH_CEPHX;
	ac->private = xi;
	ac->ops = &ceph_x_ops;
	return 0;

done_nomem:
	kfree(xi);
	if (ceph_x_ticketbuf_cachep)
		kmem_cache_destroy(ceph_x_ticketbuf_cachep);
	return ret;
}
655
656
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
/*
 * Handle ticket for a single service.
 */
struct ceph_x_ticket_handler {
	struct rb_node node;		/* keyed by service in xi->ticket_handlers */
	unsigned service;

	struct ceph_crypto_key session_key;	/* from the last ticket reply */
	struct ceph_timespec validity;		/* raw validity as received */

	u64 secret_id;
	struct ceph_buffer *ticket_blob;	/* opaque blob presented to services */

	unsigned long renew_after, expires;	/* local times, in seconds */
};


/* one built authorizer (msg_a + encrypted msg_b) for a service */
struct ceph_x_authorizer {
	struct ceph_buffer *buf;	/* encoded authorizer */
	unsigned service;
	u64 nonce;			/* random; reply must echo nonce + 1 */
	char reply_buf[128];  /* big enough for encrypted blob */
};

/* per-client cephx protocol state, stored in ac->private */
struct ceph_x_info {
	struct ceph_crypto_key secret;	/* our shared secret, unarmored */

	bool starting;			/* still expecting the server hello */
	u64 server_challenge;

	unsigned have_keys;		/* bitmask of service keys we hold */
	struct rb_root ticket_handlers;

	struct ceph_x_authorizer auth_authorizer;	/* for AUTH requests */
};

extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
/* cephx protocol op codes (request/reply 'op' field) */
#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
#define CEPHX_GET_ROTATING_KEY 0x0400

/* common bits */
struct ceph_x_ticket_blob {
	__u8 struct_v;
	__le64 secret_id;
	__le32 blob_len;
	char blob[];		/* blob_len bytes of opaque ticket data */
} __attribute__ ((packed));


/* common request/reply headers */
struct ceph_x_request_header {
	__le16 op;		/* one of CEPHX_GET_* */
} __attribute__ ((packed));

struct ceph_x_reply_header {
	__le16 op;
	__le32 result;
} __attribute__ ((packed));


/* authenticate handshake */

/* initial hello (no reply header) */
struct ceph_x_server_challenge {
	__u8 struct_v;
	__le64 server_challenge;
} __attribute__ ((packed));

struct ceph_x_authenticate {
	__u8 struct_v;
	__le64 client_challenge;
	__le64 key;		/* proof derived from the encrypted challenges */
	/* ticket blob */
} __attribute__ ((packed));

struct ceph_x_service_ticket_request {
	__u8 struct_v;
	__le32 keys;		/* bitmask of service keys requested */
} __attribute__ ((packed));

struct ceph_x_challenge_blob {
	__le64 server_challenge;
	__le64 client_challenge;
} __attribute__ ((packed));



/* authorize handshake */

/*
 * The authorizer consists of two pieces:
 *  a - service id, ticket blob
 *  b - encrypted with session key
 */
struct ceph_x_authorize_a {
	__u8 struct_v;
	__le64 global_id;
	__le32 service_id;
	struct ceph_x_ticket_blob ticket_blob;
} __attribute__ ((packed));

struct ceph_x_authorize_b {
	__u8 struct_v;
	__le64 nonce;
} __attribute__ ((packed));

struct ceph_x_authorize_reply {
	__u8 struct_v;
	__le64 nonce_plus_one;	/* client's nonce + 1, proving key knowledge */
} __attribute__ ((packed));


/*
 * encryption bundle
 */
#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull

struct ceph_x_encrypt_header {
	__u8 struct_v;
	__le64 magic;
} __attribute__ ((packed));
89
90#endif
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..b98086c7aeba
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,78 @@
1
2#include "ceph_debug.h"
3#include "buffer.h"
4#include "decode.h"
5
6struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
7{
8 struct ceph_buffer *b;
9
10 b = kmalloc(sizeof(*b), gfp);
11 if (!b)
12 return NULL;
13
14 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
15 if (b->vec.iov_base) {
16 b->is_vmalloc = false;
17 } else {
18 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
19 if (!b->vec.iov_base) {
20 kfree(b);
21 return NULL;
22 }
23 b->is_vmalloc = true;
24 }
25
26 kref_init(&b->kref);
27 b->alloc_len = len;
28 b->vec.iov_len = len;
29 dout("buffer_new %p\n", b);
30 return b;
31}
32
/*
 * kref release callback: free the data (vfree or kfree depending on
 * how it was allocated) and the buffer struct itself.
 */
void ceph_buffer_release(struct kref *kref)
{
	struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);

	dout("buffer_release %p\n", b);
	if (b->vec.iov_base) {
		if (b->is_vmalloc)
			vfree(b->vec.iov_base);
		else
			kfree(b->vec.iov_base);
	}
	kfree(b);
}
46
47int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
48{
49 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
50 if (b->vec.iov_base) {
51 b->is_vmalloc = false;
52 } else {
53 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
54 b->is_vmalloc = true;
55 }
56 if (!b->vec.iov_base)
57 return -ENOMEM;
58 b->alloc_len = len;
59 b->vec.iov_len = len;
60 return 0;
61}
62
/*
 * Decode a u32-length-prefixed byte string at *p (bounded by @end)
 * into a freshly allocated ceph_buffer at *b, advancing *p.  Returns
 * 0, -EINVAL on short input, or -ENOMEM.
 */
int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
{
	size_t len;

	ceph_decode_need(p, end, sizeof(u32), bad);
	len = ceph_decode_32(p);
	dout("decode_buffer len %d\n", (int)len);
	ceph_decode_need(p, end, len, bad);
	*b = ceph_buffer_new(len, GFP_NOFS);
	if (!*b)
		return -ENOMEM;
	ceph_decode_copy(p, (*b)->vec.iov_base, len);
	return 0;
bad:
	return -EINVAL;
}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
/*
 * a simple reference counted buffer.
 *
 * use kmalloc for small sizes (<= one page), vmalloc for larger
 * sizes.
 */
struct ceph_buffer {
	struct kref kref;	/* refcount; released via ceph_buffer_release */
	struct kvec vec;	/* data pointer + current length */
	size_t alloc_len;	/* allocated size (>= vec.iov_len) */
	bool is_vmalloc;	/* true if the data came from __vmalloc */
};

extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
extern void ceph_buffer_release(struct kref *kref);

/* take an additional reference */
static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
{
	kref_get(&b->kref);
	return b;
}

/* drop a reference; frees the buffer when the last one goes */
static inline void ceph_buffer_put(struct ceph_buffer *b)
{
	kref_put(&b->kref, ceph_buffer_release);
}

extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..db122bb357b8
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2927 @@
1#include "ceph_debug.h"
2
3#include <linux/fs.h>
4#include <linux/kernel.h>
5#include <linux/sched.h>
6#include <linux/vmalloc.h>
7#include <linux/wait.h>
8#include <linux/writeback.h>
9
10#include "super.h"
11#include "decode.h"
12#include "messenger.h"
13
14/*
15 * Capability management
16 *
17 * The Ceph metadata servers control client access to inode metadata
18 * and file data by issuing capabilities, granting clients permission
19 * to read and/or write both inode field and file data to OSDs
20 * (storage nodes). Each capability consists of a set of bits
21 * indicating which operations are allowed.
22 *
23 * If the client holds a *_SHARED cap, the client has a coherent value
24 * that can be safely read from the cached inode.
25 *
26 * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
27 * client is allowed to change inode attributes (e.g., file size,
28 * mtime), note its dirty state in the ceph_cap, and asynchronously
29 * flush that metadata change to the MDS.
30 *
31 * In the event of a conflicting operation (perhaps by another
32 * client), the MDS will revoke the conflicting client capabilities.
33 *
34 * In order for a client to cache an inode, it must hold a capability
35 * with at least one MDS server. When inodes are released, release
36 * notifications are batched and periodically sent en masse to the MDS
37 * cluster to release server state.
38 */
39
40
41/*
42 * Generate readable cap strings for debugging output.
43 */
44#define MAX_CAP_STR 20
45static char cap_str[MAX_CAP_STR][40];
46static DEFINE_SPINLOCK(cap_str_lock);
47static int last_cap_str;
48
49static char *gcap_string(char *s, int c)
50{
51 if (c & CEPH_CAP_GSHARED)
52 *s++ = 's';
53 if (c & CEPH_CAP_GEXCL)
54 *s++ = 'x';
55 if (c & CEPH_CAP_GCACHE)
56 *s++ = 'c';
57 if (c & CEPH_CAP_GRD)
58 *s++ = 'r';
59 if (c & CEPH_CAP_GWR)
60 *s++ = 'w';
61 if (c & CEPH_CAP_GBUFFER)
62 *s++ = 'b';
63 if (c & CEPH_CAP_GLAZYIO)
64 *s++ = 'l';
65 return s;
66}
67
/*
 * Format a cap bitmask as a human-readable string, e.g. "pAsxFrw".
 * Uses a small rotating pool of MAX_CAP_STR static buffers (the slot
 * index is handed out under cap_str_lock), so the result is only
 * valid until MAX_CAP_STR further calls -- debugging output only.
 */
const char *ceph_cap_string(int caps)
{
	int i;
	char *s;
	int c;

	/* grab the next buffer slot from the rotating pool */
	spin_lock(&cap_str_lock);
	i = last_cap_str++;
	if (last_cap_str == MAX_CAP_STR)
		last_cap_str = 0;
	spin_unlock(&cap_str_lock);

	s = cap_str[i];

	if (caps & CEPH_CAP_PIN)
		*s++ = 'p';

	c = (caps >> CEPH_CAP_SAUTH) & 3;
	if (c) {
		*s++ = 'A';
		s = gcap_string(s, c);
	}

	c = (caps >> CEPH_CAP_SLINK) & 3;
	if (c) {
		*s++ = 'L';
		s = gcap_string(s, c);
	}

	c = (caps >> CEPH_CAP_SXATTR) & 3;
	if (c) {
		*s++ = 'X';
		s = gcap_string(s, c);
	}

	c = caps >> CEPH_CAP_SFILE;
	if (c) {
		*s++ = 'F';
		s = gcap_string(s, c);
	}

	if (s == cap_str[i])
		*s++ = '-';	/* no recognized bits set */
	*s = 0;
	return cap_str[i];
}
114
115/*
116 * Cap reservations
117 *
118 * Maintain a global pool of preallocated struct ceph_caps, referenced
119 * by struct ceph_caps_reservations. This ensures that we preallocate
120 * memory needed to successfully process an MDS response. (If an MDS
121 * sends us cap information and we fail to process it, we will have
122 * problems due to the client and MDS being out of sync.)
123 *
124 * Reservations are 'owned' by a ceph_cap_reservation context.
125 */
126static spinlock_t caps_list_lock;
127static struct list_head caps_list; /* unused (reserved or unreserved) */
128static int caps_total_count; /* total caps allocated */
129static int caps_use_count; /* in use */
130static int caps_reserve_count; /* unused, reserved */
131static int caps_avail_count; /* unused, unreserved */
132static int caps_min_count; /* keep at least this many (unreserved) */
133
/* Initialize the global cap pool (free list + its lock). */
void __init ceph_caps_init(void)
{
	INIT_LIST_HEAD(&caps_list);
	spin_lock_init(&caps_list_lock);
}
139
/* Free every cap left in the global pool and zero all the counters. */
void ceph_caps_finalize(void)
{
	struct ceph_cap *cap;

	spin_lock(&caps_list_lock);
	while (!list_empty(&caps_list)) {
		cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
		list_del(&cap->caps_item);
		kmem_cache_free(ceph_cap_cachep, cap);
	}
	caps_total_count = 0;
	caps_avail_count = 0;
	caps_use_count = 0;
	caps_reserve_count = 0;
	caps_min_count = 0;
	spin_unlock(&caps_list_lock);
}
157
/* Adjust the floor of preallocated caps kept in the pool; @delta may
 * be negative, but the floor must never drop below zero. */
void ceph_adjust_min_caps(int delta)
{
	spin_lock(&caps_list_lock);
	caps_min_count += delta;
	BUG_ON(caps_min_count < 0);
	spin_unlock(&caps_list_lock);
}
165
166int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
167{
168 int i;
169 struct ceph_cap *cap;
170 int have;
171 int alloc = 0;
172 LIST_HEAD(newcaps);
173 int ret = 0;
174
175 dout("reserve caps ctx=%p need=%d\n", ctx, need);
176
177 /* first reserve any caps that are already allocated */
178 spin_lock(&caps_list_lock);
179 if (caps_avail_count >= need)
180 have = need;
181 else
182 have = caps_avail_count;
183 caps_avail_count -= have;
184 caps_reserve_count += have;
185 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
186 caps_avail_count);
187 spin_unlock(&caps_list_lock);
188
189 for (i = have; i < need; i++) {
190 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
191 if (!cap) {
192 ret = -ENOMEM;
193 goto out_alloc_count;
194 }
195 list_add(&cap->caps_item, &newcaps);
196 alloc++;
197 }
198 BUG_ON(have + alloc != need);
199
200 spin_lock(&caps_list_lock);
201 caps_total_count += alloc;
202 caps_reserve_count += alloc;
203 list_splice(&newcaps, &caps_list);
204
205 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
206 caps_avail_count);
207 spin_unlock(&caps_list_lock);
208
209 ctx->count = need;
210 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
211 ctx, caps_total_count, caps_use_count, caps_reserve_count,
212 caps_avail_count);
213 return 0;
214
215out_alloc_count:
216 /* we didn't manage to reserve as much as we needed */
217 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
218 ctx, need, have);
219 return ret;
220}
221
/* Return all caps still reserved by @ctx to the available pool.
 * Always returns 0. */
int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
{
	dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
	if (ctx->count) {
		spin_lock(&caps_list_lock);
		BUG_ON(caps_reserve_count < ctx->count);
		caps_reserve_count -= ctx->count;
		caps_avail_count += ctx->count;
		ctx->count = 0;
		dout("unreserve caps %d = %d used + %d resv + %d avail\n",
		     caps_total_count, caps_use_count, caps_reserve_count,
		     caps_avail_count);
		BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
		       caps_avail_count);
		spin_unlock(&caps_list_lock);
	}
	return 0;
}
240
/*
 * Take one cap from @ctx's reservation, or allocate one directly when
 * @ctx is NULL.  BUGs if the reservation is exhausted or inconsistent
 * with the pool counters.
 */
static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
{
	struct ceph_cap *cap = NULL;

	/* temporary, until we do something about cap import/export */
	if (!ctx)
		return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);

	spin_lock(&caps_list_lock);
	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
	     ctx, ctx->count, caps_total_count, caps_use_count,
	     caps_reserve_count, caps_avail_count);
	BUG_ON(!ctx->count);
	BUG_ON(ctx->count > caps_reserve_count);
	BUG_ON(list_empty(&caps_list));

	ctx->count--;
	caps_reserve_count--;
	caps_use_count++;

	cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
	list_del(&cap->caps_item);

	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
	       caps_avail_count);
	spin_unlock(&caps_list_lock);
	return cap;
}
269
/*
 * Return a cap to the pool, or free it outright if the pool already
 * holds enough unreserved caps (reserve count plus the min floor).
 */
void ceph_put_cap(struct ceph_cap *cap)
{
	spin_lock(&caps_list_lock);
	dout("put_cap %p %d = %d used + %d resv + %d avail\n",
	     cap, caps_total_count, caps_use_count,
	     caps_reserve_count, caps_avail_count);
	caps_use_count--;
	/*
	 * Keep some preallocated caps around (ceph_min_count), to
	 * avoid lots of free/alloc churn.
	 */
	if (caps_avail_count >= caps_reserve_count + caps_min_count) {
		caps_total_count--;
		kmem_cache_free(ceph_cap_cachep, cap);
	} else {
		caps_avail_count++;
		list_add(&cap->caps_item, &caps_list);
	}

	BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
	       caps_avail_count);
	spin_unlock(&caps_list_lock);
}
293
294void ceph_reservation_status(struct ceph_client *client,
295 int *total, int *avail, int *used, int *reserved,
296 int *min)
297{
298 if (total)
299 *total = caps_total_count;
300 if (avail)
301 *avail = caps_avail_count;
302 if (used)
303 *used = caps_use_count;
304 if (reserved)
305 *reserved = caps_reserve_count;
306 if (min)
307 *min = caps_min_count;
308}
309
310/*
311 * Find ceph_cap for given mds, if any.
312 *
313 * Called with i_lock held.
314 */
315static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
316{
317 struct ceph_cap *cap;
318 struct rb_node *n = ci->i_caps.rb_node;
319
320 while (n) {
321 cap = rb_entry(n, struct ceph_cap, ci_node);
322 if (mds < cap->mds)
323 n = n->rb_left;
324 else if (mds > cap->mds)
325 n = n->rb_right;
326 else
327 return cap;
328 }
329 return NULL;
330}
331
332/*
333 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
334 * -1.
335 */
336static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
337{
338 struct ceph_cap *cap;
339 int mds = -1;
340 struct rb_node *p;
341
342 /* prefer mds with WR|WRBUFFER|EXCL caps */
343 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
344 cap = rb_entry(p, struct ceph_cap, ci_node);
345 mds = cap->mds;
346 if (mseq)
347 *mseq = cap->mseq;
348 if (cap->issued & (CEPH_CAP_FILE_WR |
349 CEPH_CAP_FILE_BUFFER |
350 CEPH_CAP_FILE_EXCL))
351 break;
352 }
353 return mds;
354}
355
356int ceph_get_cap_mds(struct inode *inode)
357{
358 int mds;
359 spin_lock(&inode->i_lock);
360 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
361 spin_unlock(&inode->i_lock);
362 return mds;
363}
364
365/*
366 * Called under i_lock.
367 */
368static void __insert_cap_node(struct ceph_inode_info *ci,
369 struct ceph_cap *new)
370{
371 struct rb_node **p = &ci->i_caps.rb_node;
372 struct rb_node *parent = NULL;
373 struct ceph_cap *cap = NULL;
374
375 while (*p) {
376 parent = *p;
377 cap = rb_entry(parent, struct ceph_cap, ci_node);
378 if (new->mds < cap->mds)
379 p = &(*p)->rb_left;
380 else if (new->mds > cap->mds)
381 p = &(*p)->rb_right;
382 else
383 BUG();
384 }
385
386 rb_link_node(&new->ci_node, parent, p);
387 rb_insert_color(&new->ci_node, &ci->i_caps);
388}
389
390/*
391 * (re)set cap hold timeouts, which control the delayed release
392 * of unused caps back to the MDS. Should be called on cap use.
393 */
394static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
395 struct ceph_inode_info *ci)
396{
397 struct ceph_mount_args *ma = mdsc->client->mount_args;
398
399 ci->i_hold_caps_min = round_jiffies(jiffies +
400 ma->caps_wanted_delay_min * HZ);
401 ci->i_hold_caps_max = round_jiffies(jiffies +
402 ma->caps_wanted_delay_max * HZ);
403 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
404 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
405}
406
407/*
408 * (Re)queue cap at the end of the delayed cap release list.
409 *
410 * If I_FLUSH is set, leave the inode at the front of the list.
411 *
412 * Caller holds i_lock
413 * -> we take mdsc->cap_delay_lock
414 */
415static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
416 struct ceph_inode_info *ci)
417{
418 __cap_set_timeouts(mdsc, ci);
419 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
420 ci->i_ceph_flags, ci->i_hold_caps_max);
421 if (!mdsc->stopping) {
422 spin_lock(&mdsc->cap_delay_lock);
423 if (!list_empty(&ci->i_cap_delay_list)) {
424 if (ci->i_ceph_flags & CEPH_I_FLUSH)
425 goto no_change;
426 list_del_init(&ci->i_cap_delay_list);
427 }
428 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
429no_change:
430 spin_unlock(&mdsc->cap_delay_lock);
431 }
432}
433
/*
 * Queue an inode for immediate writeback.  Mark inode with I_FLUSH,
 * indicating we should send a cap message to flush dirty metadata
 * asap, and move to the front of the delayed cap list.
 */
static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
				      struct ceph_inode_info *ci)
{
	dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
	spin_lock(&mdsc->cap_delay_lock);
	ci->i_ceph_flags |= CEPH_I_FLUSH;
	/* unlink first if already queued, then (re)insert at the head */
	if (!list_empty(&ci->i_cap_delay_list))
		list_del_init(&ci->i_cap_delay_list);
	list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
	spin_unlock(&mdsc->cap_delay_lock);
}
450
451/*
452 * Cancel delayed work on cap.
453 *
454 * Caller must hold i_lock.
455 */
456static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
457 struct ceph_inode_info *ci)
458{
459 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
460 if (list_empty(&ci->i_cap_delay_list))
461 return;
462 spin_lock(&mdsc->cap_delay_lock);
463 list_del_init(&ci->i_cap_delay_list);
464 spin_unlock(&mdsc->cap_delay_lock);
465}
466
467/*
468 * Common issue checks for add_cap, handle_cap_grant.
469 */
470static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
471 unsigned issued)
472{
473 unsigned had = __ceph_caps_issued(ci, NULL);
474
475 /*
476 * Each time we receive FILE_CACHE anew, we increment
477 * i_rdcache_gen.
478 */
479 if ((issued & CEPH_CAP_FILE_CACHE) &&
480 (had & CEPH_CAP_FILE_CACHE) == 0)
481 ci->i_rdcache_gen++;
482
483 /*
484 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
485 * don't know what happened to this directory while we didn't
486 * have the cap.
487 */
488 if ((issued & CEPH_CAP_FILE_SHARED) &&
489 (had & CEPH_CAP_FILE_SHARED) == 0) {
490 ci->i_shared_gen++;
491 if (S_ISDIR(ci->vfs_inode.i_mode)) {
492 dout(" marking %p NOT complete\n", &ci->vfs_inode);
493 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
494 }
495 }
496}
497
/*
 * Add a capability under the given MDS session.
 *
 * Caller should hold session snap_rwsem (read) and s_mutex.
 *
 * @fmode is the open file mode, if we are opening a file, otherwise
 * it is < 0.  (This is so we can atomically add the cap and add an
 * open file reference to it.)
 *
 * Returns 0 on success, -ENOMEM if a cap could not be allocated.
 */
int ceph_add_cap(struct inode *inode,
		 struct ceph_mds_session *session, u64 cap_id,
		 int fmode, unsigned issued, unsigned wanted,
		 unsigned seq, unsigned mseq, u64 realmino, int flags,
		 struct ceph_cap_reservation *caps_reservation)
{
	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *new_cap = NULL;
	struct ceph_cap *cap;
	int mds = session->s_mds;
	int actual_wanted;

	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
	     session->s_mds, cap_id, ceph_cap_string(issued), seq);

	/*
	 * If we are opening the file, include file mode wanted bits
	 * in wanted.
	 */
	if (fmode >= 0)
		wanted |= ceph_caps_for_mode(fmode);

retry:
	spin_lock(&inode->i_lock);
	cap = __get_cap_for_mds(ci, mds);
	if (!cap) {
		if (new_cap) {
			cap = new_cap;
			new_cap = NULL;
		} else {
			/*
			 * Drop i_lock to allocate, then retry the lookup
			 * in case a cap for this mds appeared meanwhile.
			 *
			 * NOTE(review): if the retry DOES find a cap, the
			 * freshly allocated new_cap appears to be leaked
			 * (never returned via ceph_put_cap) — verify
			 * whether this race is possible for callers.
			 */
			spin_unlock(&inode->i_lock);
			new_cap = get_cap(caps_reservation);
			if (new_cap == NULL)
				return -ENOMEM;
			goto retry;
		}

		/* brand-new cap: start with nothing issued/wanted */
		cap->issued = 0;
		cap->implemented = 0;
		cap->mds = mds;
		cap->mds_wanted = 0;

		cap->ci = ci;
		__insert_cap_node(ci, cap);

		/* clear out old exporting info? (i.e. on cap import) */
		if (ci->i_cap_exporting_mds == mds) {
			ci->i_cap_exporting_issued = 0;
			ci->i_cap_exporting_mseq = 0;
			ci->i_cap_exporting_mds = -1;
		}

		/* add to session cap list */
		cap->session = session;
		spin_lock(&session->s_cap_lock);
		list_add_tail(&cap->session_caps, &session->s_caps);
		session->s_nr_caps++;
		spin_unlock(&session->s_cap_lock);
	}

	if (!ci->i_snap_realm) {
		/*
		 * add this inode to the appropriate snap realm
		 */
		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
							       realmino);
		if (realm) {
			ceph_get_snap_realm(mdsc, realm);
			spin_lock(&realm->inodes_with_caps_lock);
			ci->i_snap_realm = realm;
			list_add(&ci->i_snap_realm_item,
				 &realm->inodes_with_caps);
			spin_unlock(&realm->inodes_with_caps_lock);
		} else {
			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
			       realmino);
		}
	}

	__check_cap_issue(ci, cap, issued);

	/*
	 * If we are issued caps we don't want, or the mds' wanted
	 * value appears to be off, queue a check so we'll release
	 * later and/or update the mds wanted value.
	 */
	actual_wanted = __ceph_caps_wanted(ci);
	if ((wanted & ~actual_wanted) ||
	    (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
		     ceph_cap_string(issued), ceph_cap_string(wanted),
		     ceph_cap_string(actual_wanted));
		__cap_delay_requeue(mdsc, ci);
	}

	/* track (or clear) which cap is the auth cap for this inode */
	if (flags & CEPH_CAP_FLAG_AUTH)
		ci->i_auth_cap = cap;
	else if (ci->i_auth_cap == cap)
		ci->i_auth_cap = NULL;

	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
	     ceph_cap_string(issued|cap->issued), seq, mds);
	/* finally, record the granted state on the cap */
	cap->cap_id = cap_id;
	cap->issued = issued;
	cap->implemented |= issued;
	cap->mds_wanted |= wanted;
	cap->seq = seq;
	cap->issue_seq = seq;
	cap->mseq = mseq;
	cap->cap_gen = session->s_cap_gen;

	if (fmode >= 0)
		__ceph_get_fmode(ci, fmode);
	spin_unlock(&inode->i_lock);
	/* wake anyone waiting for caps on this inode */
	wake_up(&ci->i_cap_wq);
	return 0;
}
626
627/*
628 * Return true if cap has not timed out and belongs to the current
629 * generation of the MDS session (i.e. has not gone 'stale' due to
630 * us losing touch with the mds).
631 */
632static int __cap_is_valid(struct ceph_cap *cap)
633{
634 unsigned long ttl;
635 u32 gen;
636
637 spin_lock(&cap->session->s_cap_lock);
638 gen = cap->session->s_cap_gen;
639 ttl = cap->session->s_cap_ttl;
640 spin_unlock(&cap->session->s_cap_lock);
641
642 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
643 dout("__cap_is_valid %p cap %p issued %s "
644 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
645 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
646 return 0;
647 }
648
649 return 1;
650}
651
652/*
653 * Return set of valid cap bits issued to us. Note that caps time
654 * out, and may be invalidated in bulk if the client session times out
655 * and session->s_cap_gen is bumped.
656 */
657int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
658{
659 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
660 struct ceph_cap *cap;
661 struct rb_node *p;
662
663 if (implemented)
664 *implemented = 0;
665 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
666 cap = rb_entry(p, struct ceph_cap, ci_node);
667 if (!__cap_is_valid(cap))
668 continue;
669 dout("__ceph_caps_issued %p cap %p issued %s\n",
670 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
671 have |= cap->issued;
672 if (implemented)
673 *implemented |= cap->implemented;
674 }
675 return have;
676}
677
678/*
679 * Get cap bits issued by caps other than @ocap
680 */
681int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
682{
683 int have = ci->i_snap_caps;
684 struct ceph_cap *cap;
685 struct rb_node *p;
686
687 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
688 cap = rb_entry(p, struct ceph_cap, ci_node);
689 if (cap == ocap)
690 continue;
691 if (!__cap_is_valid(cap))
692 continue;
693 have |= cap->issued;
694 }
695 return have;
696}
697
698/*
699 * Move a cap to the end of the LRU (oldest caps at list head, newest
700 * at list tail).
701 */
702static void __touch_cap(struct ceph_cap *cap)
703{
704 struct ceph_mds_session *s = cap->session;
705
706 spin_lock(&s->s_cap_lock);
707 if (s->s_cap_iterator == NULL) {
708 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
709 s->s_mds);
710 list_move_tail(&cap->session_caps, &s->s_caps);
711 } else {
712 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
713 &cap->ci->vfs_inode, cap, s->s_mds);
714 }
715 spin_unlock(&s->s_cap_lock);
716}
717
/*
 * Check if we hold the given mask.  If so, move the cap(s) to the
 * front of their respective LRUs.  (This is the preferred way for
 * callers to check for caps they want.)
 *
 * Returns 1 if @mask is fully covered by snap caps, a single cap, or
 * a combination of caps; 0 otherwise.
 */
int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
{
	struct ceph_cap *cap;
	struct rb_node *p;
	int have = ci->i_snap_caps;

	/* snap caps alone may already satisfy the mask */
	if ((have & mask) == mask) {
		dout("__ceph_caps_issued_mask %p snap issued %s"
		     " (mask %s)\n", &ci->vfs_inode,
		     ceph_cap_string(have),
		     ceph_cap_string(mask));
		return 1;
	}

	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		if (!__cap_is_valid(cap))
			continue;
		/* does this single cap satisfy the mask by itself? */
		if ((cap->issued & mask) == mask) {
			dout("__ceph_caps_issued_mask %p cap %p issued %s"
			     " (mask %s)\n", &ci->vfs_inode, cap,
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch)
				__touch_cap(cap);
			return 1;
		}

		/* does a combination of caps satisfy mask? */
		have |= cap->issued;
		if ((have & mask) == mask) {
			dout("__ceph_caps_issued_mask %p combo issued %s"
			     " (mask %s)\n", &ci->vfs_inode,
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch) {
				struct rb_node *q;

				/* touch this + preceding caps, since they
				 * all contributed bits to the combo */
				__touch_cap(cap);
				for (q = rb_first(&ci->i_caps); q != p;
				     q = rb_next(q)) {
					/* note: reuses 'cap' as cursor */
					cap = rb_entry(q, struct ceph_cap,
						       ci_node);
					if (!__cap_is_valid(cap))
						continue;
					__touch_cap(cap);
				}
			}
			return 1;
		}
	}

	return 0;
}
778
779/*
780 * Return true if mask caps are currently being revoked by an MDS.
781 */
782int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
783{
784 struct inode *inode = &ci->vfs_inode;
785 struct ceph_cap *cap;
786 struct rb_node *p;
787 int ret = 0;
788
789 spin_lock(&inode->i_lock);
790 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
791 cap = rb_entry(p, struct ceph_cap, ci_node);
792 if (__cap_is_valid(cap) &&
793 (cap->implemented & ~cap->issued & mask)) {
794 ret = 1;
795 break;
796 }
797 }
798 spin_unlock(&inode->i_lock);
799 dout("ceph_caps_revoking %p %s = %d\n", inode,
800 ceph_cap_string(mask), ret);
801 return ret;
802}
803
804int __ceph_caps_used(struct ceph_inode_info *ci)
805{
806 int used = 0;
807 if (ci->i_pin_ref)
808 used |= CEPH_CAP_PIN;
809 if (ci->i_rd_ref)
810 used |= CEPH_CAP_FILE_RD;
811 if (ci->i_rdcache_ref || ci->i_rdcache_gen)
812 used |= CEPH_CAP_FILE_CACHE;
813 if (ci->i_wr_ref)
814 used |= CEPH_CAP_FILE_WR;
815 if (ci->i_wrbuffer_ref)
816 used |= CEPH_CAP_FILE_BUFFER;
817 return used;
818}
819
820/*
821 * wanted, by virtue of open file modes
822 */
823int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
824{
825 int want = 0;
826 int mode;
827 for (mode = 0; mode < 4; mode++)
828 if (ci->i_nr_by_mode[mode])
829 want |= ceph_caps_for_mode(mode);
830 return want;
831}
832
833/*
834 * Return caps we have registered with the MDS(s) as 'wanted'.
835 */
836int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
837{
838 struct ceph_cap *cap;
839 struct rb_node *p;
840 int mds_wanted = 0;
841
842 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
843 cap = rb_entry(p, struct ceph_cap, ci_node);
844 if (!__cap_is_valid(cap))
845 continue;
846 mds_wanted |= cap->mds_wanted;
847 }
848 return mds_wanted;
849}
850
851/*
852 * called under i_lock
853 */
854static int __ceph_is_any_caps(struct ceph_inode_info *ci)
855{
856 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
857}
858
/*
 * Remove @cap from its inode and session, dropping the inode's snap
 * realm reference if this was the last cap.
 *
 * caller should hold i_lock.
 * caller will not hold session s_mutex if called from destroy_inode.
 */
void __ceph_remove_cap(struct ceph_cap *cap)
{
	struct ceph_mds_session *session = cap->session;
	struct ceph_inode_info *ci = cap->ci;
	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;

	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);

	/* remove from inode list */
	rb_erase(&cap->ci_node, &ci->i_caps);
	cap->ci = NULL;
	if (ci->i_auth_cap == cap)
		ci->i_auth_cap = NULL;

	/* remove from session list */
	spin_lock(&session->s_cap_lock);
	if (session->s_cap_iterator == cap) {
		/* not yet, we are iterating over this very cap */
		dout("__ceph_remove_cap delaying %p removal from session %p\n",
		     cap, cap->session);
	} else {
		list_del_init(&cap->session_caps);
		session->s_nr_caps--;
		cap->session = NULL;
	}
	spin_unlock(&session->s_cap_lock);

	/*
	 * only free the cap if it was actually unlinked above; if the
	 * session iterator still holds it, the iterator is responsible
	 * for releasing it later
	 */
	if (cap->session == NULL)
		ceph_put_cap(cap);

	/* last cap gone: drop out of the snap realm */
	if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
		struct ceph_snap_realm *realm = ci->i_snap_realm;
		spin_lock(&realm->inodes_with_caps_lock);
		list_del_init(&ci->i_snap_realm_item);
		ci->i_snap_realm_counter++;
		ci->i_snap_realm = NULL;
		spin_unlock(&realm->inodes_with_caps_lock);
		ceph_put_snap_realm(mdsc, realm);
	}
	if (!__ceph_is_any_real_caps(ci))
		__cap_delay_cancel(mdsc, ci);
}
905
/*
 * Build and send a cap message to the given MDS.
 *
 * All inode state being reported (caps, size, times, ownership,
 * xattrs) is passed in by value/pointer so this can run without
 * holding i_lock.
 *
 * Caller should be holding s_mutex.
 *
 * Returns 0 on success or a negative error from message allocation.
 */
static int send_cap_msg(struct ceph_mds_session *session,
			u64 ino, u64 cid, int op,
			int caps, int wanted, int dirty,
			u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
			u64 size, u64 max_size,
			struct timespec *mtime, struct timespec *atime,
			u64 time_warp_seq,
			uid_t uid, gid_t gid, mode_t mode,
			u64 xattr_version,
			struct ceph_buffer *xattrs_buf,
			u64 follows)
{
	struct ceph_mds_caps *fc;
	struct ceph_msg *msg;

	dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
	     " seq %u/%u mseq %u follows %lld size %llu/%llu"
	     " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
	     cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
	     ceph_cap_string(dirty),
	     seq, issue_seq, mseq, follows, size, max_size,
	     xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);

	msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
	if (IS_ERR(msg))
		return PTR_ERR(msg);

	msg->hdr.tid = cpu_to_le64(flush_tid);

	fc = msg->front.iov_base;
	memset(fc, 0, sizeof(*fc));

	/* marshal everything little-endian for the wire */
	fc->cap_id = cpu_to_le64(cid);
	fc->op = cpu_to_le32(op);
	fc->seq = cpu_to_le32(seq);
	fc->issue_seq = cpu_to_le32(issue_seq);
	fc->migrate_seq = cpu_to_le32(mseq);
	fc->caps = cpu_to_le32(caps);
	fc->wanted = cpu_to_le32(wanted);
	fc->dirty = cpu_to_le32(dirty);
	fc->ino = cpu_to_le64(ino);
	fc->snap_follows = cpu_to_le64(follows);

	fc->size = cpu_to_le64(size);
	fc->max_size = cpu_to_le64(max_size);
	if (mtime)
		ceph_encode_timespec(&fc->mtime, mtime);
	if (atime)
		ceph_encode_timespec(&fc->atime, atime);
	fc->time_warp_seq = cpu_to_le32(time_warp_seq);

	fc->uid = cpu_to_le32(uid);
	fc->gid = cpu_to_le32(gid);
	fc->mode = cpu_to_le32(mode);

	/* xattr blob, if any, rides in the message middle section */
	fc->xattr_version = cpu_to_le64(xattr_version);
	if (xattrs_buf) {
		msg->middle = ceph_buffer_get(xattrs_buf);
		fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
		msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
	}

	ceph_con_send(&session->s_con, msg);
	return 0;
}
976
/*
 * Queue cap releases when an inode is dropped from our cache.  Since
 * inode is about to be destroyed, there is no need for i_lock.
 *
 * Each cap release is appended as an item to a preallocated release
 * message on the cap's session; full messages are moved to
 * s_cap_releases_done for sending.
 */
void ceph_queue_caps_release(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct rb_node *p;

	p = rb_first(&ci->i_caps);
	while (p) {
		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
		struct ceph_mds_session *session = cap->session;
		struct ceph_msg *msg;
		struct ceph_mds_cap_release *head;
		struct ceph_mds_cap_item *item;

		spin_lock(&session->s_cap_lock);
		/* a release slot was reserved when the cap was added */
		BUG_ON(!session->s_num_cap_releases);
		msg = list_first_entry(&session->s_cap_releases,
				       struct ceph_msg, list_head);

		dout(" adding %p release to mds%d msg %p (%d left)\n",
		     inode, session->s_mds, msg, session->s_num_cap_releases);

		BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
		head = msg->front.iov_base;
		/* bump the on-wire item count and append our item */
		head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
		item = msg->front.iov_base + msg->front.iov_len;
		item->ino = cpu_to_le64(ceph_ino(inode));
		item->cap_id = cpu_to_le64(cap->cap_id);
		item->migrate_seq = cpu_to_le32(cap->mseq);
		item->seq = cpu_to_le32(cap->issue_seq);

		session->s_num_cap_releases--;

		msg->front.iov_len += sizeof(*item);
		if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
			dout(" release msg %p full\n", msg);
			list_move_tail(&msg->list_head,
				       &session->s_cap_releases_done);
		} else {
			dout(" release msg %p at %d/%d (%d)\n", msg,
			     (int)le32_to_cpu(head->num),
			     (int)CEPH_CAPS_PER_RELEASE,
			     (int)msg->front.iov_len);
		}
		spin_unlock(&session->s_cap_lock);
		/* advance before removal, since removal unlinks cap */
		p = rb_next(p);
		__ceph_remove_cap(cap);
	}
}
1029
/*
 * Send a cap msg on the given inode.  Update our caps state, then
 * drop i_lock and send the message.
 *
 * Make note of max_size reported/requested from mds, revoked caps
 * that have now been implemented.
 *
 * Make half-hearted attempt to invalidate page cache if we are
 * dropping RDCACHE.  Note that this will leave behind locked pages
 * that we'll then need to deal with elsewhere.
 *
 * Return non-zero if delayed release, or we experienced an error
 * such that the caller should requeue + retry later.
 *
 * called with i_lock, then drops it.
 * caller should hold snap_rwsem (read), s_mutex.
 */
static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
		      int op, int used, int want, int retain, int flushing,
		      unsigned *pflush_tid)
	__releases(cap->ci->vfs_inode->i_lock)
{
	struct ceph_inode_info *ci = cap->ci;
	struct inode *inode = &ci->vfs_inode;
	u64 cap_id = cap->cap_id;
	int held, revoking, dropping, keep;
	u64 seq, issue_seq, mseq, time_warp_seq, follows;
	u64 size, max_size;
	struct timespec mtime, atime;
	int wake = 0;
	mode_t mode;
	uid_t uid;
	gid_t gid;
	struct ceph_mds_session *session;
	u64 xattr_version = 0;
	int delayed = 0;
	u64 flush_tid = 0;
	int i;
	int ret;

	held = cap->issued | cap->implemented;
	revoking = cap->implemented & ~cap->issued;
	/* never retain bits the mds is revoking */
	retain &= ~revoking;
	dropping = cap->issued & ~retain;

	dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
	     inode, cap, cap->session,
	     ceph_cap_string(held), ceph_cap_string(held & retain),
	     ceph_cap_string(revoking));
	BUG_ON((retain & CEPH_CAP_PIN) == 0);

	session = cap->session;

	/* don't release wanted unless we've waited a bit. */
	if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
	    time_before(jiffies, ci->i_hold_caps_min)) {
		dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
		     ceph_cap_string(cap->issued),
		     ceph_cap_string(cap->issued & retain),
		     ceph_cap_string(cap->mds_wanted),
		     ceph_cap_string(want));
		/* keep everything for now; tell caller to requeue */
		want |= cap->mds_wanted;
		retain |= cap->issued;
		delayed = 1;
	}
	ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);

	cap->issued &= retain;  /* drop bits we don't want */
	if (cap->implemented & ~cap->issued) {
		/*
		 * Wake up any waiters on wanted -> needed transition.
		 * This is due to the weird transition from buffered
		 * to sync IO... we need to flush dirty pages _before_
		 * allowing sync writes to avoid reordering.
		 */
		wake = 1;
	}
	cap->implemented &= cap->issued | used;
	cap->mds_wanted = want;

	if (flushing) {
		/*
		 * assign a tid for flush operations so we can avoid
		 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
		 * clean type races.  track latest tid for every bit
		 * so we can handle flush AxFw, flush Fw, and have the
		 * first ack clean Ax.
		 */
		flush_tid = ++ci->i_cap_flush_last_tid;
		if (pflush_tid)
			*pflush_tid = flush_tid;
		dout(" cap_flush_tid %d\n", (int)flush_tid);
		for (i = 0; i < CEPH_CAP_BITS; i++)
			if (flushing & (1 << i))
				ci->i_cap_flush_tid[i] = flush_tid;
	}

	/* snapshot everything we'll report while we still hold i_lock */
	keep = cap->implemented;
	seq = cap->seq;
	issue_seq = cap->issue_seq;
	mseq = cap->mseq;
	size = inode->i_size;
	ci->i_reported_size = size;
	max_size = ci->i_wanted_max_size;
	ci->i_requested_max_size = max_size;
	mtime = inode->i_mtime;
	atime = inode->i_atime;
	time_warp_seq = ci->i_time_warp_seq;
	follows = ci->i_snap_realm->cached_context->seq;
	uid = inode->i_uid;
	gid = inode->i_gid;
	mode = inode->i_mode;

	if (dropping & CEPH_CAP_XATTR_EXCL) {
		__ceph_build_xattrs_blob(ci);
		xattr_version = ci->i_xattrs.version + 1;
	}

	spin_unlock(&inode->i_lock);

	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
		size, max_size, &mtime, &atime, time_warp_seq,
		uid, gid, mode,
		xattr_version,
		(flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
		follows);
	if (ret < 0) {
		dout("error sending cap msg, must requeue %p\n", inode);
		delayed = 1;
	}

	if (wake)
		wake_up(&ci->i_cap_wq);

	return delayed;
}
1167
/*
 * When a snapshot is taken, clients accumulate dirty metadata on
 * inodes with capabilities in ceph_cap_snaps to describe the file
 * state at the time the snapshot was taken.  This must be flushed
 * asynchronously back to the MDS once sync writes complete and dirty
 * data is written out.
 *
 * If @psession is non-NULL, *psession holds the session mutex on
 * entry (or is NULL), and on return holds whatever session we ended
 * up using; otherwise any session we took is released before return.
 *
 * Called under i_lock.  Takes s_mutex as needed.
 */
void __ceph_flush_snaps(struct ceph_inode_info *ci,
			struct ceph_mds_session **psession)
{
	struct inode *inode = &ci->vfs_inode;
	int mds;
	struct ceph_cap_snap *capsnap;
	u32 mseq;
	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
	struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
						    session->s_mutex */
	u64 next_follows = 0;  /* keep track of how far we've gotten through the
				  i_cap_snaps list, and skip these entries next time
				  around to avoid an infinite loop */

	if (psession)
		session = *psession;

	dout("__flush_snaps %p\n", inode);
retry:
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		/* avoid an infinite loop after retry */
		if (capsnap->follows < next_follows)
			continue;
		/*
		 * we need to wait for sync writes to complete and for dirty
		 * pages to be written out.
		 */
		if (capsnap->dirty_pages || capsnap->writing)
			continue;

		/* pick mds, take s_mutex */
		mds = __ceph_get_cap_mds(ci, &mseq);
		if (session && session->s_mds != mds) {
			/* wrong session: drop it and look up the right one */
			dout("oops, wrong session %p mutex\n", session);
			mutex_unlock(&session->s_mutex);
			ceph_put_mds_session(session);
			session = NULL;
		}
		if (!session) {
			/*
			 * drop i_lock to take mdsc->mutex and the session
			 * mutex (lock ordering), then restart the scan
			 */
			spin_unlock(&inode->i_lock);
			mutex_lock(&mdsc->mutex);
			session = __ceph_lookup_mds_session(mdsc, mds);
			mutex_unlock(&mdsc->mutex);
			if (session) {
				dout("inverting session/ino locks on %p\n",
				     session);
				mutex_lock(&session->s_mutex);
			}
			/*
			 * if session == NULL, we raced against a cap
			 * deletion.  retry, and we'll get a better
			 * @mds value next time.
			 */
			spin_lock(&inode->i_lock);
			goto retry;
		}

		/* tag this capsnap with a flush tid and queue it */
		capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
		atomic_inc(&capsnap->nref);
		if (!list_empty(&capsnap->flushing_item))
			list_del_init(&capsnap->flushing_item);
		list_add_tail(&capsnap->flushing_item,
			      &session->s_cap_snaps_flushing);
		spin_unlock(&inode->i_lock);

		dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
		     inode, capsnap, next_follows, capsnap->size);
		send_cap_msg(session, ceph_vino(inode).ino, 0,
			     CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
			     capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
			     capsnap->size, 0,
			     &capsnap->mtime, &capsnap->atime,
			     capsnap->time_warp_seq,
			     capsnap->uid, capsnap->gid, capsnap->mode,
			     0, NULL,
			     capsnap->follows);

		/* we dropped i_lock, so rescan the list from the top */
		next_follows = capsnap->follows + 1;
		ceph_put_cap_snap(capsnap);

		spin_lock(&inode->i_lock);
		goto retry;
	}

	/* we flushed them all; remove this inode from the queue */
	spin_lock(&mdsc->snap_flush_lock);
	list_del_init(&ci->i_snap_flush_item);
	spin_unlock(&mdsc->snap_flush_lock);

	if (psession)
		*psession = session;
	else if (session) {
		mutex_unlock(&session->s_mutex);
		ceph_put_mds_session(session);
	}
}
1273
1274static void ceph_flush_snaps(struct ceph_inode_info *ci)
1275{
1276 struct inode *inode = &ci->vfs_inode;
1277
1278 spin_lock(&inode->i_lock);
1279 __ceph_flush_snaps(ci, NULL);
1280 spin_unlock(&inode->i_lock);
1281}
1282
/*
 * Mark caps dirty.  If inode is newly dirty, add to the global dirty
 * list and mark the inode dirty in the VFS (taking an inode reference
 * via igrab so it cannot be reclaimed while dirty).
 */
void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
{
	struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
	struct inode *inode = &ci->vfs_inode;
	int was = ci->i_dirty_caps;
	int dirty = 0;

	dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
	     ceph_cap_string(mask), ceph_cap_string(was),
	     ceph_cap_string(was | mask));
	ci->i_dirty_caps |= mask;
	if (was == 0) {
		/* transition clean -> dirty: join the global dirty list */
		dout(" inode %p now dirty\n", &ci->vfs_inode);
		BUG_ON(!list_empty(&ci->i_dirty_item));
		spin_lock(&mdsc->cap_dirty_lock);
		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
		spin_unlock(&mdsc->cap_dirty_lock);
		if (ci->i_flushing_caps == 0) {
			/* pin the inode while it carries dirty caps */
			igrab(inode);
			dirty |= I_DIRTY_SYNC;
		}
	}
	BUG_ON(list_empty(&ci->i_dirty_item));
	if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
	    (mask & CEPH_CAP_FILE_BUFFER))
		dirty |= I_DIRTY_DATASYNC;
	if (dirty)
		__mark_inode_dirty(inode, dirty);
	__cap_delay_requeue(mdsc, ci);
}
1317
/*
 * Add dirty inode to the flushing list.  Assigned a seq number so we
 * can wait for caps to flush without starving.
 *
 * Moves the currently dirty caps into i_flushing_caps and returns
 * them; the inode moves from the dirty list to the session's
 * flushing list.
 *
 * Called under i_lock.
 */
static int __mark_caps_flushing(struct inode *inode,
				struct ceph_mds_session *session)
{
	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int flushing;

	BUG_ON(ci->i_dirty_caps == 0);
	BUG_ON(list_empty(&ci->i_dirty_item));

	flushing = ci->i_dirty_caps;
	dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
	     ceph_cap_string(flushing),
	     ceph_cap_string(ci->i_flushing_caps),
	     ceph_cap_string(ci->i_flushing_caps | flushing));
	ci->i_flushing_caps |= flushing;
	ci->i_dirty_caps = 0;
	dout(" inode %p now !dirty\n", inode);

	spin_lock(&mdsc->cap_dirty_lock);
	list_del_init(&ci->i_dirty_item);

	/* new flush seq; order this inode in the session flush queue */
	ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
	if (list_empty(&ci->i_flushing_item)) {
		list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
		mdsc->num_cap_flushing++;
		dout(" inode %p now flushing seq %lld\n", inode,
		     ci->i_cap_flush_seq);
	} else {
		list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
		dout(" inode %p now flushing (more) seq %lld\n", inode,
		     ci->i_cap_flush_seq);
	}
	spin_unlock(&mdsc->cap_dirty_lock);

	return flushing;
}
1361
1362/*
1363 * try to invalidate mapping pages without blocking.
1364 */
1365static int mapping_is_empty(struct address_space *mapping)
1366{
1367 struct page *page = find_get_page(mapping, 0);
1368
1369 if (!page)
1370 return 1;
1371
1372 put_page(page);
1373 return 0;
1374}
1375
/*
 * Try to invalidate all cached pages for this inode without blocking.
 * Returns 0 on success, -1 if pages remain or new reads raced in.
 *
 * Called with i_lock held; drops and re-takes it around the
 * invalidate (which may block on individual page locks).
 */
static int try_nonblocking_invalidate(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	u32 invalidating_gen = ci->i_rdcache_gen;

	/* invalidate_mapping_pages can block; don't hold i_lock across it */
	spin_unlock(&inode->i_lock);
	invalidate_mapping_pages(&inode->i_data, 0, -1);
	spin_lock(&inode->i_lock);

	/*
	 * Only declare success if the mapping is now empty AND no new
	 * pages were cached while i_lock was dropped (rdcache_gen
	 * unchanged).
	 */
	if (mapping_is_empty(&inode->i_data) &&
	    invalidating_gen == ci->i_rdcache_gen) {
		/* success. */
		dout("try_nonblocking_invalidate %p success\n", inode);
		ci->i_rdcache_gen = 0;
		ci->i_rdcache_revoking = 0;
		return 0;
	}
	dout("try_nonblocking_invalidate %p failed\n", inode);
	return -1;
}
1396
/*
 * Swiss army knife function to examine currently used and wanted
 * versus held caps.  Release, flush, ack revoked caps to mds as
 * appropriate.
 *
 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
 *    cap release further.
 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
 *    further delay.
 *
 * @session may be NULL; if so, we lock/unlock whatever session mutex we
 * end up needing and drop it before returning (drop_session_lock).
 */
void ceph_check_caps(struct ceph_inode_info *ci, int flags,
		     struct ceph_mds_session *session)
{
	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct inode *inode = &ci->vfs_inode;
	struct ceph_cap *cap;
	int file_wanted, used;
	int took_snap_rwsem = 0;             /* true if mdsc->snap_rwsem held */
	int drop_session_lock = session ? 0 : 1;
	int issued, implemented, want, retain, revoking, flushing = 0;
	int mds = -1;   /* keep track of how far we've gone through i_caps list
			   to avoid an infinite loop on retry */
	struct rb_node *p;
	int tried_invalidate = 0;
	int delayed = 0, sent = 0, force_requeue = 0, num;
	int queue_invalidate = 0;
	int is_delayed = flags & CHECK_CAPS_NODELAY;

	/* if we are unmounting, flush any unused caps immediately. */
	if (mdsc->stopping)
		is_delayed = 1;

	spin_lock(&inode->i_lock);

	/* a prior caller asked for an immediate flush of this inode */
	if (ci->i_ceph_flags & CEPH_I_FLUSH)
		flags |= CHECK_CAPS_FLUSH;

	/* flush snaps first time around only */
	if (!list_empty(&ci->i_cap_snaps))
		__ceph_flush_snaps(ci, &session);
	/* NOTE(review): we fall through to retry_locked here, so
	 * __ceph_flush_snaps is expected to return with i_lock held --
	 * confirm against its definition. */
	goto retry_locked;
retry:
	spin_lock(&inode->i_lock);
retry_locked:
	file_wanted = __ceph_caps_file_wanted(ci);
	used = __ceph_caps_used(ci);
	want = file_wanted | used;
	issued = __ceph_caps_issued(ci, &implemented);
	revoking = implemented & ~issued;

	/* decide which cap bits we'd like to hold on to */
	retain = want | CEPH_CAP_PIN;
	if (!mdsc->stopping && inode->i_nlink > 0) {
		if (want) {
			retain |= CEPH_CAP_ANY;       /* be greedy */
		} else {
			retain |= CEPH_CAP_ANY_SHARED;
			/*
			 * keep RD only if we didn't have the file open RW,
			 * because then the mds would revoke it anyway to
			 * journal max_size=0.
			 */
			if (ci->i_max_size == 0)
				retain |= CEPH_CAP_ANY_RD;
		}
	}

	dout("check_caps %p file_want %s used %s dirty %s flushing %s"
	     " issued %s revoking %s retain %s %s%s%s\n", inode,
	     ceph_cap_string(file_wanted),
	     ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
	     ceph_cap_string(ci->i_flushing_caps),
	     ceph_cap_string(issued), ceph_cap_string(revoking),
	     ceph_cap_string(retain),
	     (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
	     (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
	     (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");

	/*
	 * If we no longer need to hold onto old our caps, and we may
	 * have cached pages, but don't want them, then try to invalidate.
	 * If we fail, it's because pages are locked.... try again later.
	 */
	if ((!is_delayed || mdsc->stopping) &&
	    ci->i_wrbuffer_ref == 0 &&               /* no dirty pages... */
	    ci->i_rdcache_gen &&                     /* may have cached pages */
	    (file_wanted == 0 ||                     /* no open files */
	     (revoking & CEPH_CAP_FILE_CACHE)) &&    /* or revoking cache */
	    !tried_invalidate) {
		dout("check_caps trying to invalidate on %p\n", inode);
		if (try_nonblocking_invalidate(inode) < 0) {
			if (revoking & CEPH_CAP_FILE_CACHE) {
				/* must finish the invalidate before acking
				 * the revoke: punt to the workqueue */
				dout("check_caps queuing invalidate\n");
				queue_invalidate = 1;
				ci->i_rdcache_revoking = ci->i_rdcache_gen;
			} else {
				dout("check_caps failed to invalidate pages\n");
				/* we failed to invalidate pages.  check these
				   caps again later. */
				force_requeue = 1;
				__cap_set_timeouts(mdsc, ci);
			}
		}
		tried_invalidate = 1;
		goto retry_locked;
	}

	num = 0;
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		cap = rb_entry(p, struct ceph_cap, ci_node);
		num++;

		/* avoid looping forever */
		if (mds >= cap->mds ||
		    ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
			continue;

		/* NOTE: no side-effects allowed, until we take s_mutex */

		revoking = cap->implemented & ~cap->issued;
		if (revoking)
			dout(" mds%d revoking %s\n", cap->mds,
			     ceph_cap_string(revoking));

		if (cap == ci->i_auth_cap &&
		    (cap->issued & CEPH_CAP_FILE_WR)) {
			/* request larger max_size from MDS? */
			if (ci->i_wanted_max_size > ci->i_max_size &&
			    ci->i_wanted_max_size > ci->i_requested_max_size) {
				dout("requesting new max_size\n");
				goto ack;
			}

			/* approaching file_max? */
			if ((inode->i_size << 1) >= ci->i_max_size &&
			    (ci->i_reported_size << 1) < ci->i_max_size) {
				dout("i_size approaching max_size\n");
				goto ack;
			}
		}
		/* flush anything dirty? */
		if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
		    ci->i_dirty_caps) {
			dout("flushing dirty caps\n");
			goto ack;
		}

		/* completed revocation? going down and there are no caps? */
		if (revoking && (revoking & used) == 0) {
			dout("completed revocation of %s\n",
			     ceph_cap_string(cap->implemented & ~cap->issued));
			goto ack;
		}

		/* want more caps from mds? */
		if (want & ~(cap->mds_wanted | cap->issued))
			goto ack;

		/* things we might delay */
		if ((cap->issued & ~retain) == 0 &&
		    cap->mds_wanted == want)
			continue;     /* nope, all good */

		if (is_delayed)
			goto ack;

		/* delay? */
		if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
		    time_before(jiffies, ci->i_hold_caps_max)) {
			dout(" delaying issued %s -> %s, wanted %s -> %s\n",
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(cap->issued & retain),
			     ceph_cap_string(cap->mds_wanted),
			     ceph_cap_string(want));
			delayed++;
			continue;
		}

ack:
		if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
			dout(" skipping %p I_NOFLUSH set\n", inode);
			continue;
		}

		/* we may already hold a different session's mutex; swap */
		if (session && session != cap->session) {
			dout("oops, wrong session %p mutex\n", session);
			mutex_unlock(&session->s_mutex);
			session = NULL;
		}
		if (!session) {
			session = cap->session;
			if (mutex_trylock(&session->s_mutex) == 0) {
				/* lock ordering is s_mutex before i_lock;
				 * back out and retake in the right order */
				dout("inverting session/ino locks on %p\n",
				     session);
				spin_unlock(&inode->i_lock);
				if (took_snap_rwsem) {
					up_read(&mdsc->snap_rwsem);
					took_snap_rwsem = 0;
				}
				mutex_lock(&session->s_mutex);
				goto retry;
			}
		}
		/* take snap_rwsem after session mutex */
		if (!took_snap_rwsem) {
			if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
				dout("inverting snap/in locks on %p\n",
				     inode);
				spin_unlock(&inode->i_lock);
				down_read(&mdsc->snap_rwsem);
				took_snap_rwsem = 1;
				goto retry;
			}
			took_snap_rwsem = 1;
		}

		if (cap == ci->i_auth_cap && ci->i_dirty_caps)
			flushing = __mark_caps_flushing(inode, session);

		mds = cap->mds;  /* remember mds, so we don't repeat */
		sent++;

		/* __send_cap drops i_lock */
		delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
				      retain, flushing, NULL);
		goto retry; /* retake i_lock and restart our cap scan. */
	}

	/*
	 * Reschedule delayed caps release if we delayed anything,
	 * otherwise cancel.
	 */
	if (delayed && is_delayed)
		force_requeue = 1;   /* __send_cap delayed release; requeue */
	if (!delayed && !is_delayed)
		__cap_delay_cancel(mdsc, ci);
	else if (!is_delayed || force_requeue)
		__cap_delay_requeue(mdsc, ci);

	spin_unlock(&inode->i_lock);

	/* deferred page invalidation (can't do it in-line; may block) */
	if (queue_invalidate)
		ceph_queue_invalidate(inode);

	/* only drop s_mutex if we took it ourselves (caller passed NULL) */
	if (session && drop_session_lock)
		mutex_unlock(&session->s_mutex);
	if (took_snap_rwsem)
		up_read(&mdsc->snap_rwsem);
}
1647
/*
 * Try to flush dirty caps back to the auth mds.
 *
 * @session may be NULL; in that case we take (and later drop) the auth
 * cap's session mutex ourselves.  *flush_tid receives the tid assigned
 * by __send_cap so the caller can wait via caps_are_flushed().
 *
 * Returns the cap bits submitted for flushing (0 if nothing was sent).
 */
static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
			  unsigned *flush_tid)
{
	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int unlock_session = session ? 0 : 1;
	int flushing = 0;

retry:
	spin_lock(&inode->i_lock);
	if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
		dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
		goto out;
	}
	if (ci->i_dirty_caps && ci->i_auth_cap) {
		struct ceph_cap *cap = ci->i_auth_cap;
		int used = __ceph_caps_used(ci);
		int want = __ceph_caps_wanted(ci);
		int delayed;

		if (!session) {
			/* s_mutex must be taken before i_lock; drop and
			 * retry so the lock order is respected */
			spin_unlock(&inode->i_lock);
			session = cap->session;
			mutex_lock(&session->s_mutex);
			goto retry;
		}
		BUG_ON(session != cap->session);
		if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
			goto out;

		flushing = __mark_caps_flushing(inode, session);

		/* __send_cap drops i_lock */
		delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
				     cap->issued | cap->implemented, flushing,
				     flush_tid);
		if (!delayed)
			goto out_unlocked;

		/* send was delayed; retake i_lock and requeue for later */
		spin_lock(&inode->i_lock);
		__cap_delay_requeue(mdsc, ci);
	}
out:
	spin_unlock(&inode->i_lock);
out_unlocked:
	if (session && unlock_session)
		mutex_unlock(&session->s_mutex);
	return flushing;
}
1700
1701/*
1702 * Return true if we've flushed caps through the given flush_tid.
1703 */
1704static int caps_are_flushed(struct inode *inode, unsigned tid)
1705{
1706 struct ceph_inode_info *ci = ceph_inode(inode);
1707 int dirty, i, ret = 1;
1708
1709 spin_lock(&inode->i_lock);
1710 dirty = __ceph_caps_dirty(ci);
1711 for (i = 0; i < CEPH_CAP_BITS; i++)
1712 if ((ci->i_flushing_caps & (1 << i)) &&
1713 ci->i_cap_flush_tid[i] <= tid) {
1714 /* still flushing this bit */
1715 ret = 0;
1716 break;
1717 }
1718 spin_unlock(&inode->i_lock);
1719 return ret;
1720}
1721
/*
 * Wait on any unsafe replies for the given inode.  First wait on the
 * newest request, and make that the upper bound.  Then, if there are
 * more requests, keep waiting on the oldest as long as it is still older
 * than the original request.
 *
 * Takes/drops i_unsafe_lock around the list walk; holds a request ref
 * across each wait so the entry can't vanish while we sleep.
 */
static void sync_write_wait(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct list_head *head = &ci->i_unsafe_writes;
	struct ceph_osd_request *req;
	u64 last_tid;

	spin_lock(&ci->i_unsafe_lock);
	if (list_empty(head))
		goto out;

	/* set upper bound as _last_ entry in chain */
	req = list_entry(head->prev, struct ceph_osd_request,
		       r_unsafe_item);
	last_tid = req->r_tid;

	do {
		/* pin req; the list may change while we sleep unlocked */
		ceph_osdc_get_request(req);
		spin_unlock(&ci->i_unsafe_lock);
		dout("sync_write_wait on tid %llu (until %llu)\n",
		     req->r_tid, last_tid);
		wait_for_completion(&req->r_safe_completion);
		spin_lock(&ci->i_unsafe_lock);
		ceph_osdc_put_request(req);

		/*
		 * from here on look at first entry in chain, since we
		 * only want to wait for anything older than last_tid
		 */
		if (list_empty(head))
			break;
		req = list_entry(head->next, struct ceph_osd_request,
				 r_unsafe_item);
	} while (req->r_tid < last_tid);
out:
	spin_unlock(&ci->i_unsafe_lock);
}
1765
/*
 * fsync: wait for unsafe OSD writes, flush dirty page data, then flush
 * dirty caps to the MDS, waiting on the metadata flush unless this is
 * a datasync-only request.
 */
int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	unsigned flush_tid;
	int ret;
	int dirty;

	dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
	sync_write_wait(inode);

	ret = filemap_write_and_wait(inode->i_mapping);
	if (ret < 0)
		return ret;

	dirty = try_flush_caps(inode, NULL, &flush_tid);
	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));

	/*
	 * only wait on non-file metadata writeback (the mds
	 * can recover size and mtime, so we don't need to
	 * wait for that)
	 */
	if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
		dout("fsync waiting for flush_tid %u\n", flush_tid);
		ret = wait_event_interruptible(ci->i_cap_wq,
				       caps_are_flushed(inode, flush_tid));
	}

	dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
	return ret;
}
1798
/*
 * Flush any dirty caps back to the mds.  If we aren't asked to wait,
 * queue inode for flush but don't do so immediately, because we can
 * get by with fewer MDS messages if we wait for data writeback to
 * complete first.
 *
 * Returns 0, or -ERESTARTSYS if the wait was interrupted.
 */
int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	unsigned flush_tid;
	int err = 0;
	int dirty;
	int wait = wbc->sync_mode == WB_SYNC_ALL;

	dout("write_inode %p wait=%d\n", inode, wait);
	if (wait) {
		/* flush now and wait for the ack from the MDS */
		dirty = try_flush_caps(inode, NULL, &flush_tid);
		if (dirty)
			err = wait_event_interruptible(ci->i_cap_wq,
				       caps_are_flushed(inode, flush_tid));
	} else {
		struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;

		/* just nudge the inode to the front of the delayed queue */
		spin_lock(&inode->i_lock);
		if (__ceph_caps_dirty(ci))
			__cap_delay_requeue_front(mdsc, ci);
		spin_unlock(&inode->i_lock);
	}
	return err;
}
1829
/*
 * After a recovering MDS goes active, we need to resend any caps
 * we were flushing.
 *
 * Caller holds session->s_mutex.
 */
static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
				   struct ceph_mds_session *session)
{
	struct ceph_cap_snap *capsnap;

	dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
	list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
			    flushing_item) {
		struct ceph_inode_info *ci = capsnap->ci;
		struct inode *inode = &ci->vfs_inode;
		struct ceph_cap *cap;

		spin_lock(&inode->i_lock);
		cap = ci->i_auth_cap;
		if (cap && cap->session == session) {
			dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
			     cap, capsnap);
			/*
			 * NOTE(review): ceph_check_caps proceeds under
			 * i_lock after calling __ceph_flush_snaps, which
			 * suggests it returns with i_lock held -- yet this
			 * branch never unlocks before the next iteration
			 * takes another inode's i_lock.  Confirm
			 * __ceph_flush_snaps's locking contract here.
			 */
			__ceph_flush_snaps(ci, &session);
		} else {
			pr_err("%p auth cap %p not mds%d ???\n", inode,
			       cap, session->s_mds);
			spin_unlock(&inode->i_lock);
		}
	}
}
1861
/*
 * Resend in-flight cap flushes (both capsnaps and regular caps) to a
 * recovered MDS session.  Caller holds session->s_mutex.
 */
void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
			     struct ceph_mds_session *session)
{
	struct ceph_inode_info *ci;

	kick_flushing_capsnaps(mdsc, session);

	dout("kick_flushing_caps mds%d\n", session->s_mds);
	list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
		struct inode *inode = &ci->vfs_inode;
		struct ceph_cap *cap;
		int delayed = 0;

		spin_lock(&inode->i_lock);
		cap = ci->i_auth_cap;
		if (cap && cap->session == session) {
			dout("kick_flushing_caps %p cap %p %s\n", inode,
			     cap, ceph_cap_string(ci->i_flushing_caps));
			/* __send_cap drops i_lock, hence no unlock here */
			delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
					     __ceph_caps_used(ci),
					     __ceph_caps_wanted(ci),
					     cap->issued | cap->implemented,
					     ci->i_flushing_caps, NULL);
			if (delayed) {
				/* send was deferred; requeue for later */
				spin_lock(&inode->i_lock);
				__cap_delay_requeue(mdsc, ci);
				spin_unlock(&inode->i_lock);
			}
		} else {
			pr_err("%p auth cap %p not mds%d ???\n", inode,
			       cap, session->s_mds);
			spin_unlock(&inode->i_lock);
		}
	}
}
1897
1898
1899/*
1900 * Take references to capabilities we hold, so that we don't release
1901 * them to the MDS prematurely.
1902 *
1903 * Protected by i_lock.
1904 */
1905static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1906{
1907 if (got & CEPH_CAP_PIN)
1908 ci->i_pin_ref++;
1909 if (got & CEPH_CAP_FILE_RD)
1910 ci->i_rd_ref++;
1911 if (got & CEPH_CAP_FILE_CACHE)
1912 ci->i_rdcache_ref++;
1913 if (got & CEPH_CAP_FILE_WR)
1914 ci->i_wr_ref++;
1915 if (got & CEPH_CAP_FILE_BUFFER) {
1916 if (ci->i_wrbuffer_ref == 0)
1917 igrab(&ci->vfs_inode);
1918 ci->i_wrbuffer_ref++;
1919 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1920 &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1921 }
1922}
1923
/*
 * Try to grab cap references.  Specify those refs we @want, and the
 * minimal set we @need.  Also include the larger offset we are writing
 * to (when applicable), and check against max_size here as well.
 * Note that caller is responsible for ensuring max_size increases are
 * requested from the MDS.
 *
 * Returns nonzero when the wait condition is resolved: either refs
 * were taken (*got set), an error occurred (*err set), or the caller
 * must request a larger max_size (*check_max set).  Returns 0 when
 * the caller should keep waiting.
 */
static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
			    int *got, loff_t endoff, int *check_max, int *err)
{
	struct inode *inode = &ci->vfs_inode;
	int ret = 0;
	int have, implemented;
	int file_wanted;

	dout("get_cap_refs %p need %s want %s\n", inode,
	     ceph_cap_string(need), ceph_cap_string(want));
	spin_lock(&inode->i_lock);

	/* make sure file is actually open */
	file_wanted = __ceph_caps_file_wanted(ci);
	if ((file_wanted & need) == 0) {
		dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
		     ceph_cap_string(need), ceph_cap_string(file_wanted));
		*err = -EBADF;
		ret = 1;
		goto out;
	}

	if (need & CEPH_CAP_FILE_WR) {
		if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
			/* write would pass max_size; caller must ask the
			 * MDS for more before we can proceed */
			dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
			     inode, endoff, ci->i_max_size);
			if (endoff > ci->i_wanted_max_size) {
				*check_max = 1;
				ret = 1;
			}
			goto out;
		}
		/*
		 * If a sync write is in progress, we must wait, so that we
		 * can get a final snapshot value for size+mtime.
		 */
		if (__ceph_have_pending_cap_snap(ci)) {
			dout("get_cap_refs %p cap_snap_pending\n", inode);
			goto out;
		}
	}
	have = __ceph_caps_issued(ci, &implemented);

	/*
	 * disallow writes while a truncate is pending
	 */
	if (ci->i_truncate_pending)
		have &= ~CEPH_CAP_FILE_WR;

	if ((have & need) == need) {
		/*
		 * Look at (implemented & ~have & not) so that we keep waiting
		 * on transition from wanted -> needed caps.  This is needed
		 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
		 * going before a prior buffered writeback happens.
		 */
		int not = want & ~(have & need);
		int revoking = implemented & ~have;
		dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
		     inode, ceph_cap_string(have), ceph_cap_string(not),
		     ceph_cap_string(revoking));
		if ((revoking & not) == 0) {
			*got = need | (have & want);
			__take_cap_refs(ci, *got);
			ret = 1;
		}
	} else {
		dout("get_cap_refs %p have %s needed %s\n", inode,
		     ceph_cap_string(have), ceph_cap_string(need));
	}
out:
	spin_unlock(&inode->i_lock);
	dout("get_cap_refs %p ret %d got %s\n", inode,
	     ret, ceph_cap_string(*got));
	return ret;
}
2007
2008/*
2009 * Check the offset we are writing up to against our current
2010 * max_size. If necessary, tell the MDS we want to write to
2011 * a larger offset.
2012 */
2013static void check_max_size(struct inode *inode, loff_t endoff)
2014{
2015 struct ceph_inode_info *ci = ceph_inode(inode);
2016 int check = 0;
2017
2018 /* do we need to explicitly request a larger max_size? */
2019 spin_lock(&inode->i_lock);
2020 if ((endoff >= ci->i_max_size ||
2021 endoff > (inode->i_size << 1)) &&
2022 endoff > ci->i_wanted_max_size) {
2023 dout("write %p at large endoff %llu, req max_size\n",
2024 inode, endoff);
2025 ci->i_wanted_max_size = endoff;
2026 check = 1;
2027 }
2028 spin_unlock(&inode->i_lock);
2029 if (check)
2030 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2031}
2032
2033/*
2034 * Wait for caps, and take cap references. If we can't get a WR cap
2035 * due to a small max_size, make sure we check_max_size (and possibly
2036 * ask the mds) so we don't get hung up indefinitely.
2037 */
2038int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2039 loff_t endoff)
2040{
2041 int check_max, ret, err;
2042
2043retry:
2044 if (endoff > 0)
2045 check_max_size(&ci->vfs_inode, endoff);
2046 check_max = 0;
2047 err = 0;
2048 ret = wait_event_interruptible(ci->i_cap_wq,
2049 try_get_cap_refs(ci, need, want,
2050 got, endoff,
2051 &check_max, &err));
2052 if (err)
2053 ret = err;
2054 if (check_max)
2055 goto retry;
2056 return ret;
2057}
2058
2059/*
2060 * Take cap refs. Caller must already know we hold at least one ref
2061 * on the caps in question or we don't know this is safe.
2062 */
2063void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2064{
2065 spin_lock(&ci->vfs_inode.i_lock);
2066 __take_cap_refs(ci, caps);
2067 spin_unlock(&ci->vfs_inode.i_lock);
2068}
2069
/*
 * Release cap refs.
 *
 * If we released the last ref on any given cap, call ceph_check_caps
 * to release (or schedule a release).
 *
 * If we are releasing a WR cap (from a sync write), finalize any affected
 * cap_snap, and wake up any waiters.
 */
void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
{
	struct inode *inode = &ci->vfs_inode;
	int last = 0, put = 0, flushsnaps = 0, wake = 0;
	struct ceph_cap_snap *capsnap;

	spin_lock(&inode->i_lock);
	if (had & CEPH_CAP_PIN)
		--ci->i_pin_ref;
	if (had & CEPH_CAP_FILE_RD)
		if (--ci->i_rd_ref == 0)
			last++;
	if (had & CEPH_CAP_FILE_CACHE)
		if (--ci->i_rdcache_ref == 0)
			last++;
	if (had & CEPH_CAP_FILE_BUFFER) {
		if (--ci->i_wrbuffer_ref == 0) {
			last++;
			put++;	/* drop the igrab taken by __take_cap_refs */
		}
		dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
		     inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
	}
	if (had & CEPH_CAP_FILE_WR)
		if (--ci->i_wr_ref == 0) {
			last++;
			/* last WR ref gone: a pending cap_snap waiting on
			 * this sync write can now be finalized */
			if (!list_empty(&ci->i_cap_snaps)) {
				capsnap = list_first_entry(&ci->i_cap_snaps,
						     struct ceph_cap_snap,
						     ci_item);
				if (capsnap->writing) {
					capsnap->writing = 0;
					flushsnaps =
						__ceph_finish_cap_snap(ci,
								       capsnap);
					wake = 1;
				}
			}
		}
	spin_unlock(&inode->i_lock);

	dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
	     last ? "last" : "");

	/* side effects happen outside i_lock */
	if (last && !flushsnaps)
		ceph_check_caps(ci, 0, NULL);
	else if (flushsnaps)
		ceph_flush_snaps(ci);
	if (wake)
		wake_up(&ci->i_cap_wq);
	if (put)
		iput(inode);
}
2132
/*
 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
 * context.  Adjust per-snap dirty page accounting as appropriate.
 * Once all dirty data for a cap_snap is flushed, flush snapped file
 * metadata back to the MDS.  If we dropped the last ref, call
 * ceph_check_caps.
 */
void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
				struct ceph_snap_context *snapc)
{
	struct inode *inode = &ci->vfs_inode;
	int last = 0;
	int last_snap = 0;
	int found = 0;
	struct ceph_cap_snap *capsnap = NULL;

	spin_lock(&inode->i_lock);
	ci->i_wrbuffer_ref -= nr;
	last = !ci->i_wrbuffer_ref;

	if (ci->i_head_snapc == snapc) {
		/* pages belonged to the live (head) snap context */
		ci->i_wrbuffer_ref_head -= nr;
		if (!ci->i_wrbuffer_ref_head) {
			ceph_put_snap_context(ci->i_head_snapc);
			ci->i_head_snapc = NULL;
		}
		dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
		     inode,
		     ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
		     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
		     last ? " LAST" : "");
	} else {
		/* pages belonged to an older, snapped context; find its
		 * cap_snap and adjust its dirty-page count */
		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
			if (capsnap->context == snapc) {
				found = 1;
				capsnap->dirty_pages -= nr;
				last_snap = !capsnap->dirty_pages;
				break;
			}
		}
		BUG_ON(!found);
		dout("put_wrbuffer_cap_refs on %p cap_snap %p "
		     " snap %lld %d/%d -> %d/%d %s%s\n",
		     inode, capsnap, capsnap->context->seq,
		     ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
		     ci->i_wrbuffer_ref, capsnap->dirty_pages,
		     last ? " (wrbuffer last)" : "",
		     last_snap ? " (capsnap last)" : "");
	}

	spin_unlock(&inode->i_lock);

	if (last) {
		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
		iput(inode);	/* drops the igrab from __take_cap_refs */
	} else if (last_snap) {
		ceph_flush_snaps(ci);
		wake_up(&ci->i_cap_wq);
	}
}
2193
/*
 * Handle a cap GRANT message from the MDS.  (Note that a GRANT may
 * actually be a revocation if it specifies a smaller cap set.)
 *
 * caller holds s_mutex.  Called with i_lock held; drops it.
 * return value:
 *  0 - ok
 *  1 - check_caps on auth cap only (writeback)
 *  2 - check_caps (ack revoke)
 */
static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
			    struct ceph_mds_session *session,
			    struct ceph_cap *cap,
			    struct ceph_buffer *xattr_buf)
	__releases(inode->i_lock)

{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int mds = session->s_mds;
	int seq = le32_to_cpu(grant->seq);
	int newcaps = le32_to_cpu(grant->caps);
	int issued, implemented, used, wanted, dirty;
	u64 size = le64_to_cpu(grant->size);
	u64 max_size = le64_to_cpu(grant->max_size);
	struct timespec mtime, atime, ctime;
	int reply = 0;
	int wake = 0;
	int writeback = 0;
	int revoked_rdcache = 0;
	int queue_invalidate = 0;

	dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
	     inode, cap, mds, seq, ceph_cap_string(newcaps));
	dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
	     inode->i_size);

	/*
	 * If CACHE is being revoked, and we have no dirty buffers,
	 * try to invalidate (once).  (If there are dirty buffers, we
	 * will invalidate _after_ writeback.)
	 */
	if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
	    !ci->i_wrbuffer_ref) {
		if (try_nonblocking_invalidate(inode) == 0) {
			revoked_rdcache = 1;
		} else {
			/* there were locked pages.. invalidate later
			   in a separate thread. */
			if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
				queue_invalidate = 1;
				ci->i_rdcache_revoking = ci->i_rdcache_gen;
			}
		}
	}

	/* side effects now are allowed */

	issued = __ceph_caps_issued(ci, &implemented);
	issued |= implemented | __ceph_caps_dirty(ci);

	cap->cap_gen = session->s_cap_gen;

	__check_cap_issue(ci, cap, newcaps);

	/* only accept the MDS's copy of fields we don't hold EXCL on */
	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
		inode->i_mode = le32_to_cpu(grant->mode);
		inode->i_uid = le32_to_cpu(grant->uid);
		inode->i_gid = le32_to_cpu(grant->gid);
		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
		     inode->i_uid, inode->i_gid);
	}

	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
		inode->i_nlink = le32_to_cpu(grant->nlink);

	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
		int len = le32_to_cpu(grant->xattr_len);
		u64 version = le64_to_cpu(grant->xattr_version);

		/* only replace our xattr blob with a strictly newer one */
		if (version > ci->i_xattrs.version) {
			dout(" got new xattrs v%llu on %p len %d\n",
			     version, inode, len);
			if (ci->i_xattrs.blob)
				ceph_buffer_put(ci->i_xattrs.blob);
			ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
			ci->i_xattrs.version = version;
		}
	}

	/* size/ctime/mtime/atime? */
	ceph_fill_file_size(inode, issued,
			    le32_to_cpu(grant->truncate_seq),
			    le64_to_cpu(grant->truncate_size), size);
	ceph_decode_timespec(&mtime, &grant->mtime);
	ceph_decode_timespec(&atime, &grant->atime);
	ceph_decode_timespec(&ctime, &grant->ctime);
	ceph_fill_file_time(inode, issued,
			    le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
			    &atime);

	/* max size increase? */
	if (max_size != ci->i_max_size) {
		dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
		ci->i_max_size = max_size;
		if (max_size >= ci->i_wanted_max_size) {
			ci->i_wanted_max_size = 0;  /* reset */
			ci->i_requested_max_size = 0;
		}
		wake = 1;	/* unblock writers waiting on max_size */
	}

	/* check cap bits */
	wanted = __ceph_caps_wanted(ci);
	used = __ceph_caps_used(ci);
	dirty = __ceph_caps_dirty(ci);
	dout(" my wanted = %s, used = %s, dirty %s\n",
	     ceph_cap_string(wanted),
	     ceph_cap_string(used),
	     ceph_cap_string(dirty));
	if (wanted != le32_to_cpu(grant->wanted)) {
		/* correct the MDS's stale notion of what we want; the
		 * (possibly updated) grant is echoed back via reply */
		dout("mds wanted %s -> %s\n",
		     ceph_cap_string(le32_to_cpu(grant->wanted)),
		     ceph_cap_string(wanted));
		grant->wanted = cpu_to_le32(wanted);
	}

	cap->seq = seq;

	/* file layout may have changed */
	ci->i_layout = grant->layout;

	/* revocation, grant, or no-op? */
	if (cap->issued & ~newcaps) {
		dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
		     ceph_cap_string(newcaps));
		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
			writeback = 1; /* will delay ack */
		else if (dirty & ~newcaps)
			reply = 1;     /* initiate writeback in check_caps */
		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
			   revoked_rdcache)
			reply = 2;     /* send revoke ack in check_caps */
		cap->issued = newcaps;
	} else if (cap->issued == newcaps) {
		dout("caps unchanged: %s -> %s\n",
		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
	} else {
		dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
		     ceph_cap_string(newcaps));
		cap->issued = newcaps;
		cap->implemented |= newcaps; /* add bits only, to
					      * avoid stepping on a
					      * pending revocation */
		wake = 1;
	}

	spin_unlock(&inode->i_lock);
	if (writeback)
		/*
		 * queue inode for writeback: we can't actually call
		 * filemap_write_and_wait, etc. from message handler
		 * context.
		 */
		ceph_queue_writeback(inode);
	if (queue_invalidate)
		ceph_queue_invalidate(inode);
	if (wake)
		wake_up(&ci->i_cap_wq);
	return reply;
}
2364
/*
 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
 * MDS has been safely committed.
 *
 * Called with i_lock held; drops it.  Clears the acked bits from
 * i_flushing_caps, and when nothing remains in flight, removes the
 * inode from the session's flushing list and drops the flush's inode
 * reference if the inode is now fully clean.
 */
static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
				 struct ceph_mds_caps *m,
				 struct ceph_mds_session *session,
				 struct ceph_cap *cap)
	__releases(inode->i_lock)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
	unsigned seq = le32_to_cpu(m->seq);
	int dirty = le32_to_cpu(m->dirty);
	int cleaned = 0;
	int drop = 0;
	int i;

	/* only count bits whose recorded flush tid matches this ack;
	 * a stale ack for a re-flushed bit is ignored */
	for (i = 0; i < CEPH_CAP_BITS; i++)
		if ((dirty & (1 << i)) &&
		    flush_tid == ci->i_cap_flush_tid[i])
			cleaned |= 1 << i;

	dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
	     " flushing %s -> %s\n",
	     inode, session->s_mds, seq, ceph_cap_string(dirty),
	     ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
	     ceph_cap_string(ci->i_flushing_caps & ~cleaned));

	/* nothing actually cleaned?  then nothing to update */
	if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
		goto out;

	ci->i_flushing_caps &= ~cleaned;

	spin_lock(&mdsc->cap_dirty_lock);
	if (ci->i_flushing_caps == 0) {
		list_del_init(&ci->i_flushing_item);
		if (!list_empty(&session->s_cap_flushing))
			dout(" mds%d still flushing cap on %p\n",
			     session->s_mds,
			     &list_entry(session->s_cap_flushing.next,
					 struct ceph_inode_info,
					 i_flushing_item)->vfs_inode);
		mdsc->num_cap_flushing--;
		wake_up(&mdsc->cap_flushing_wq);
		dout(" inode %p now !flushing\n", inode);

		if (ci->i_dirty_caps == 0) {
			dout(" inode %p now clean\n", inode);
			BUG_ON(!list_empty(&ci->i_dirty_item));
			drop = 1;	/* drop the dirtying igrab ref */
		} else {
			BUG_ON(list_empty(&ci->i_dirty_item));
		}
	}
	spin_unlock(&mdsc->cap_dirty_lock);
	wake_up(&ci->i_cap_wq);

out:
	spin_unlock(&inode->i_lock);
	if (drop)
		iput(inode);
}
2428
/*
 * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
 * throw away our cap_snap.
 *
 * Caller holds s_mutex.
 */
static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
				     struct ceph_mds_caps *m,
				     struct ceph_mds_session *session)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 follows = le64_to_cpu(m->snap_follows);  /* identifies cap_snap */
	struct ceph_cap_snap *capsnap;
	int drop = 0;	/* drop the inode ref the cap_snap held, after unlock */

	dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
	     inode, ci, session->s_mds, follows);

	spin_lock(&inode->i_lock);
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		if (capsnap->follows == follows) {
			/* tid mismatch means this ack is stale; keep snap */
			if (capsnap->flush_tid != flush_tid) {
				dout(" cap_snap %p follows %lld tid %lld !="
				     " %lld\n", capsnap, follows,
				     flush_tid, capsnap->flush_tid);
				break;
			}
			/* writeback should be done before MDS acks the snap */
			WARN_ON(capsnap->dirty_pages || capsnap->writing);
			dout(" removing cap_snap %p follows %lld\n",
			     capsnap, follows);
			ceph_put_snap_context(capsnap->context);
			list_del(&capsnap->ci_item);
			list_del(&capsnap->flushing_item);
			ceph_put_cap_snap(capsnap);
			drop = 1;
			break;
		} else {
			dout(" skipping cap_snap %p follows %lld\n",
			     capsnap, capsnap->follows);
		}
	}
	spin_unlock(&inode->i_lock);
	if (drop)
		iput(inode);
}
2474
2475/*
2476 * Handle TRUNC from MDS, indicating file truncation.
2477 *
2478 * caller hold s_mutex.
2479 */
2480static void handle_cap_trunc(struct inode *inode,
2481 struct ceph_mds_caps *trunc,
2482 struct ceph_mds_session *session)
2483 __releases(inode->i_lock)
2484{
2485 struct ceph_inode_info *ci = ceph_inode(inode);
2486 int mds = session->s_mds;
2487 int seq = le32_to_cpu(trunc->seq);
2488 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2489 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2490 u64 size = le64_to_cpu(trunc->size);
2491 int implemented = 0;
2492 int dirty = __ceph_caps_dirty(ci);
2493 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2494 int queue_trunc = 0;
2495
2496 issued |= implemented | dirty;
2497
2498 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2499 inode, mds, seq, truncate_size, truncate_seq);
2500 queue_trunc = ceph_fill_file_size(inode, issued,
2501 truncate_seq, truncate_size, size);
2502 spin_unlock(&inode->i_lock);
2503
2504 if (queue_trunc)
2505 ceph_queue_vmtruncate(inode);
2506}
2507
/*
 * Handle EXPORT from MDS.  Cap is being migrated _from_ this mds to a
 * different one.  If we are the most recent migration we've seen (as
 * indicated by mseq), make note of the migrating cap bits for the
 * duration (until we see the corresponding IMPORT).
 *
 * Caller holds s_mutex.
 */
static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
			      struct ceph_mds_session *session)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int mds = session->s_mds;
	unsigned mseq = le32_to_cpu(ex->migrate_seq);
	struct ceph_cap *cap = NULL, *t;
	struct rb_node *p;
	int remember = 1;  /* record issued bits in i_cap_exporting_*? */

	dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
	     inode, ci, mds, mseq);

	spin_lock(&inode->i_lock);

	/* make sure we haven't seen a higher mseq */
	for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
		t = rb_entry(p, struct ceph_cap, ci_node);
		if (ceph_seq_cmp(t->mseq, mseq) > 0) {
			dout(" higher mseq on cap from mds%d\n",
			     t->session->s_mds);
			remember = 0;
		}
		if (t->session->s_mds == mds)
			cap = t;	/* the cap being exported */
	}

	if (cap) {
		if (remember) {
			/* make note */
			ci->i_cap_exporting_mds = mds;
			ci->i_cap_exporting_mseq = mseq;
			ci->i_cap_exporting_issued = cap->issued;
		}
		__ceph_remove_cap(cap);
	} else {
		/* EXPORT for a cap we don't hold; should not happen */
		WARN_ON(!cap);
	}

	spin_unlock(&inode->i_lock);
}
2557
/*
 * Handle cap IMPORT.  If there are temp bits from an older EXPORT,
 * clean them up.
 *
 * Caller holds s_mutex.
 */
static void handle_cap_import(struct ceph_mds_client *mdsc,
			      struct inode *inode, struct ceph_mds_caps *im,
			      struct ceph_mds_session *session,
			      void *snaptrace, int snaptrace_len)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int mds = session->s_mds;
	unsigned issued = le32_to_cpu(im->caps);
	unsigned wanted = le32_to_cpu(im->wanted);
	unsigned seq = le32_to_cpu(im->seq);
	unsigned mseq = le32_to_cpu(im->migrate_seq);
	u64 realmino = le64_to_cpu(im->realm);
	u64 cap_id = le64_to_cpu(im->cap_id);

	/* clear exporting state left by an older, matching EXPORT */
	if (ci->i_cap_exporting_mds >= 0 &&
	    ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
		dout("handle_cap_import inode %p ci %p mds%d mseq %d"
		     " - cleared exporting from mds%d\n",
		     inode, ci, mds, mseq,
		     ci->i_cap_exporting_mds);
		ci->i_cap_exporting_issued = 0;
		ci->i_cap_exporting_mseq = 0;
		ci->i_cap_exporting_mds = -1;
	} else {
		dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
		     inode, ci, mds, mseq);
	}

	/*
	 * Update snap realms first (needs snap_rwsem for write), then
	 * downgrade to read so ceph_add_cap/try_flush_caps run with the
	 * realms stable.
	 */
	down_write(&mdsc->snap_rwsem);
	ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
			       false);
	downgrade_write(&mdsc->snap_rwsem);
	ceph_add_cap(inode, session, cap_id, -1,
		     issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
		     NULL /* no caps context */);
	try_flush_caps(inode, session, NULL);
	up_read(&mdsc->snap_rwsem);
}
2602
/*
 * Handle a caps message from the MDS.
 *
 * Identify the appropriate session, inode, and call the right handler
 * based on the cap op.
 */
void ceph_handle_caps(struct ceph_mds_session *session,
		      struct ceph_msg *msg)
{
	struct ceph_mds_client *mdsc = session->s_mdsc;
	struct super_block *sb = mdsc->client->sb;
	struct inode *inode;
	struct ceph_cap *cap;
	struct ceph_mds_caps *h;
	int mds = session->s_mds;
	int op;
	u32 seq;
	struct ceph_vino vino;
	u64 cap_id;
	u64 size, max_size;
	u64 tid;
	int check_caps = 0;	/* re-check caps after s_mutex is dropped? */
	void *snaptrace;
	int r;

	dout("handle_caps from mds%d\n", mds);

	/* decode */
	tid = le64_to_cpu(msg->hdr.tid);
	if (msg->front.iov_len < sizeof(*h))
		goto bad;
	h = msg->front.iov_base;
	snaptrace = h + 1;	/* snap trace immediately follows the header */
	op = le32_to_cpu(h->op);
	vino.ino = le64_to_cpu(h->ino);
	vino.snap = CEPH_NOSNAP;	/* caps are only held on head inodes */
	cap_id = le64_to_cpu(h->cap_id);
	seq = le32_to_cpu(h->seq);
	size = le64_to_cpu(h->size);
	max_size = le64_to_cpu(h->max_size);

	mutex_lock(&session->s_mutex);
	session->s_seq++;
	dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
	     (unsigned)seq);

	/* lookup ino */
	inode = ceph_find_inode(sb, vino);
	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
	     vino.snap, inode);
	if (!inode) {
		dout(" i don't have ino %llx\n", vino.ino);
		goto done;
	}

	/* these will work even if we don't have a cap yet */
	switch (op) {
	case CEPH_CAP_OP_FLUSHSNAP_ACK:
		handle_cap_flushsnap_ack(inode, tid, h, session);
		goto done;

	case CEPH_CAP_OP_EXPORT:
		handle_cap_export(inode, h, session);
		goto done;

	case CEPH_CAP_OP_IMPORT:
		handle_cap_import(mdsc, inode, h, session,
				  snaptrace, le32_to_cpu(h->snap_trace_len));
		check_caps = 1; /* we may have sent a RELEASE to the old auth */
		goto done;
	}

	/* the rest require a cap */
	spin_lock(&inode->i_lock);
	cap = __get_cap_for_mds(ceph_inode(inode), mds);
	if (!cap) {
		dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
		     inode, ceph_ino(inode), ceph_snap(inode), mds);
		spin_unlock(&inode->i_lock);
		goto done;
	}

	/* note that each of these drops i_lock for us */
	switch (op) {
	case CEPH_CAP_OP_REVOKE:
	case CEPH_CAP_OP_GRANT:
		r = handle_cap_grant(inode, h, session, cap, msg->middle);
		if (r == 1)
			/* grant handler asked us to start writeback */
			ceph_check_caps(ceph_inode(inode),
					CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
					session);
		else if (r == 2)
			/* grant handler asked us to ack a revocation */
			ceph_check_caps(ceph_inode(inode),
					CHECK_CAPS_NODELAY,
					session);
		break;

	case CEPH_CAP_OP_FLUSH_ACK:
		handle_cap_flush_ack(inode, tid, h, session, cap);
		break;

	case CEPH_CAP_OP_TRUNC:
		handle_cap_trunc(inode, h, session);
		break;

	default:
		spin_unlock(&inode->i_lock);
		pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
		       ceph_cap_op_name(op));
	}

done:
	mutex_unlock(&session->s_mutex);

	if (check_caps)
		/* only set on IMPORT, so inode is non-NULL here */
		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
	if (inode)
		iput(inode);	/* ref from ceph_find_inode */
	return;

bad:
	pr_err("ceph_handle_caps: corrupt message\n");
	ceph_msg_dump(msg);
	return;
}
2728
/*
 * Delayed work handler to process end of delayed cap release LRU list.
 *
 * Pops entries off the head of mdsc->cap_delay_list and calls
 * ceph_check_caps on each, stopping at the first entry that is neither
 * flagged CEPH_I_FLUSH nor past its hold period.
 */
void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{
	struct ceph_inode_info *ci;
	int flags = CHECK_CAPS_NODELAY;

	dout("check_delayed_caps\n");
	while (1) {
		spin_lock(&mdsc->cap_delay_lock);
		if (list_empty(&mdsc->cap_delay_list))
			break;	/* exits loop with cap_delay_lock held */
		ci = list_first_entry(&mdsc->cap_delay_list,
				      struct ceph_inode_info,
				      i_cap_delay_list);
		if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
		    time_before(jiffies, ci->i_hold_caps_max))
			break;	/* head not yet due; also exits locked */
		list_del_init(&ci->i_cap_delay_list);
		/* drop the spinlock: check_caps may block */
		spin_unlock(&mdsc->cap_delay_lock);
		dout("check_delayed_caps on %p\n", &ci->vfs_inode);
		ceph_check_caps(ci, flags, NULL);
	}
	/* both break paths leave cap_delay_lock held */
	spin_unlock(&mdsc->cap_delay_lock);
}
2755
/*
 * Flush all dirty caps to the mds.
 *
 * Because ceph_check_caps can block, cap_dirty_lock is dropped around
 * each call.  To keep the list walk position stable meanwhile, the
 * *next* entry is pinned ahead of time with igrab and flagged
 * CEPH_I_NOFLUSH (presumably so concurrent flushes leave it on the
 * list for us — confirm against CEPH_I_NOFLUSH users).
 */
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
{
	struct ceph_inode_info *ci, *nci = NULL;	/* current, pinned next */
	struct inode *inode, *ninode = NULL;
	struct list_head *p, *n;

	dout("flush_dirty_caps\n");
	spin_lock(&mdsc->cap_dirty_lock);
	list_for_each_safe(p, n, &mdsc->cap_dirty) {
		if (nci) {
			/* use the entry pinned on the previous iteration */
			ci = nci;
			inode = ninode;
			ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
			dout("flush_dirty_caps inode %p (was next inode)\n",
			     inode);
		} else {
			ci = list_entry(p, struct ceph_inode_info,
					i_dirty_item);
			inode = igrab(&ci->vfs_inode);
			BUG_ON(!inode);
			dout("flush_dirty_caps inode %p\n", inode);
		}
		if (n != &mdsc->cap_dirty) {
			/* pin the following entry before dropping the lock */
			nci = list_entry(n, struct ceph_inode_info,
					 i_dirty_item);
			ninode = igrab(&nci->vfs_inode);
			BUG_ON(!ninode);
			nci->i_ceph_flags |= CEPH_I_NOFLUSH;
			dout("flush_dirty_caps next inode %p, noflush\n",
			     ninode);
		} else {
			nci = NULL;
			ninode = NULL;
		}
		spin_unlock(&mdsc->cap_dirty_lock);
		if (inode) {
			ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
					NULL);
			iput(inode);
		}
		spin_lock(&mdsc->cap_dirty_lock);
	}
	spin_unlock(&mdsc->cap_dirty_lock);
}
2803
2804/*
2805 * Drop open file reference. If we were the last open file,
2806 * we may need to release capabilities to the MDS (or schedule
2807 * their delayed release).
2808 */
2809void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2810{
2811 struct inode *inode = &ci->vfs_inode;
2812 int last = 0;
2813
2814 spin_lock(&inode->i_lock);
2815 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2816 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2817 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2818 if (--ci->i_nr_by_mode[fmode] == 0)
2819 last++;
2820 spin_unlock(&inode->i_lock);
2821
2822 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2823 ceph_check_caps(ci, 0, NULL);
2824}
2825
2826/*
2827 * Helpers for embedding cap and dentry lease releases into mds
2828 * requests.
2829 *
2830 * @force is used by dentry_release (below) to force inclusion of a
2831 * record for the directory inode, even when there aren't any caps to
2832 * drop.
2833 */
2834int ceph_encode_inode_release(void **p, struct inode *inode,
2835 int mds, int drop, int unless, int force)
2836{
2837 struct ceph_inode_info *ci = ceph_inode(inode);
2838 struct ceph_cap *cap;
2839 struct ceph_mds_request_release *rel = *p;
2840 int ret = 0;
2841
2842 dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
2843 mds, ceph_cap_string(drop), ceph_cap_string(unless));
2844
2845 spin_lock(&inode->i_lock);
2846 cap = __get_cap_for_mds(ci, mds);
2847 if (cap && __cap_is_valid(cap)) {
2848 if (force ||
2849 ((cap->issued & drop) &&
2850 (cap->issued & unless) == 0)) {
2851 if ((cap->issued & drop) &&
2852 (cap->issued & unless) == 0) {
2853 dout("encode_inode_release %p cap %p %s -> "
2854 "%s\n", inode, cap,
2855 ceph_cap_string(cap->issued),
2856 ceph_cap_string(cap->issued & ~drop));
2857 cap->issued &= ~drop;
2858 cap->implemented &= ~drop;
2859 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2860 int wanted = __ceph_caps_wanted(ci);
2861 dout(" wanted %s -> %s (act %s)\n",
2862 ceph_cap_string(cap->mds_wanted),
2863 ceph_cap_string(cap->mds_wanted &
2864 ~wanted),
2865 ceph_cap_string(wanted));
2866 cap->mds_wanted &= wanted;
2867 }
2868 } else {
2869 dout("encode_inode_release %p cap %p %s"
2870 " (force)\n", inode, cap,
2871 ceph_cap_string(cap->issued));
2872 }
2873
2874 rel->ino = cpu_to_le64(ceph_ino(inode));
2875 rel->cap_id = cpu_to_le64(cap->cap_id);
2876 rel->seq = cpu_to_le32(cap->seq);
2877 rel->issue_seq = cpu_to_le32(cap->issue_seq),
2878 rel->mseq = cpu_to_le32(cap->mseq);
2879 rel->caps = cpu_to_le32(cap->issued);
2880 rel->wanted = cpu_to_le32(cap->mds_wanted);
2881 rel->dname_len = 0;
2882 rel->dname_seq = 0;
2883 *p += sizeof(*rel);
2884 ret = 1;
2885 } else {
2886 dout("encode_inode_release %p cap %p %s\n",
2887 inode, cap, ceph_cap_string(cap->issued));
2888 }
2889 }
2890 spin_unlock(&inode->i_lock);
2891 return ret;
2892}
2893
/*
 * Encode a dentry lease release (plus the directory inode's cap
 * release) into an mds request at *p.  Returns the result of
 * ceph_encode_inode_release on the parent directory.
 */
int ceph_encode_dentry_release(void **p, struct dentry *dentry,
			       int mds, int drop, int unless)
{
	struct inode *dir = dentry->d_parent->d_inode;
	struct ceph_mds_request_release *rel = *p;
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int force = 0;	/* force a record for the dir inode? */
	int ret;

	/*
	 * force a record for the directory caps if we have a dentry lease.
	 * this is racy (can't take i_lock and d_lock together), but it
	 * doesn't have to be perfect; the mds will revoke anything we don't
	 * release.
	 */
	spin_lock(&dentry->d_lock);
	if (di->lease_session && di->lease_session->s_mds == mds)
		force = 1;
	spin_unlock(&dentry->d_lock);

	ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);

	spin_lock(&dentry->d_lock);
	/* re-check the lease; it may have changed while d_lock was dropped */
	if (ret && di->lease_session && di->lease_session->s_mds == mds) {
		dout("encode_dentry_release %p mds%d seq %d\n",
		     dentry, mds, (int)di->lease_seq);
		rel->dname_len = cpu_to_le32(dentry->d_name.len);
		memcpy(*p, dentry->d_name.name, dentry->d_name.len);
		*p += dentry->d_name.len;
		rel->dname_seq = cpu_to_le32(di->lease_seq);
	}
	spin_unlock(&dentry->d_lock);
	return ret;
}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
#ifndef _FS_CEPH_DEBUG_H
#define _FS_CEPH_DEBUG_H

/* prefix all pr_* output from this module with the module name */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#ifdef CONFIG_CEPH_FS_PRETTYDEBUG

/*
 * wrap pr_debug to include a filename:lineno prefix on each line.
 * this incurs some overhead (kernel size and execution time) due to
 * the extra function call at each call site.
 */

# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
/* ceph_file_part trims __FILE__ down to its trailing component */
extern const char *ceph_file_part(const char *s, int len);
# define dout(fmt, ...)						\
	pr_debug(" %12.12s:%-4d : " fmt,			\
		 ceph_file_part(__FILE__, sizeof(__FILE__)),	\
		 __LINE__, ##__VA_ARGS__)
# else
/* faux printk call just to see any compiler warnings. */
# define dout(fmt, ...)	do {					\
		if (0)						\
			printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
	} while (0)
# endif

#else

/*
 * or, just wrap pr_debug
 */
# define dout(fmt, ...)	pr_debug(" " fmt, ##__VA_ARGS__)

#endif

#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
1/*
2 * Ceph 'frag' type
3 */
4#include "types.h"
5
6int ceph_frag_compare(__u32 a, __u32 b)
7{
8 unsigned va = ceph_frag_value(a);
9 unsigned vb = ceph_frag_value(b);
10 if (va < vb)
11 return -1;
12 if (va > vb)
13 return 1;
14 va = ceph_frag_bits(a);
15 vb = ceph_frag_bits(b);
16 if (va < vb)
17 return -1;
18 if (va > vb)
19 return 1;
20 return 0;
21}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef _FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
28
29
30int ceph_flags_to_mode(int flags)
31{
32#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN;
35#endif
36#ifdef O_LAZY
37 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY;
39#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42
43 flags &= O_ACCMODE;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49}
50
51int ceph_caps_for_mode(int mode)
52{
53 switch (mode) {
54 case CEPH_FILE_MODE_PIN:
55 return CEPH_CAP_PIN;
56 case CEPH_FILE_MODE_RD:
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR:
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 }
73 return 0;
74}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..0c2241ef3653
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific messages types or high-level
34 * protocols change, bump the affected components. we keep rev
35 * internal cluster protocols separately from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_SUPPORTED 0
57#define CEPH_FEATURE_REQUIRED 0
58
59
60/*
61 * ceph_file_layout - describe data layout for a file/inode
62 */
63struct ceph_file_layout {
64 /* file -> object mapping */
65 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
66 of page size. */
67 __le32 fl_stripe_count; /* over this many objects */
68 __le32 fl_object_size; /* until objects are this big, then move to
69 new objects */
70 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
71
72 /* pg -> disk layout */
73 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
74
75 /* object -> pg layout */
76 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
77 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
78} __attribute__ ((packed));
79
80#define CEPH_MIN_STRIPE_UNIT 65536
81
82int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
83
84
85/* crypto algorithms */
86#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1
88
89/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2
93
94
95/*********************************************
96 * message layer
97 */
98
99/*
100 * message types
101 */
102
103/* misc */
104#define CEPH_MSG_SHUTDOWN 1
105#define CEPH_MSG_PING 2
106
107/* client <-> monitor */
108#define CEPH_MSG_MON_MAP 4
109#define CEPH_MSG_MON_GET_MAP 5
110#define CEPH_MSG_STATFS 13
111#define CEPH_MSG_STATFS_REPLY 14
112#define CEPH_MSG_MON_SUBSCRIBE 15
113#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
114#define CEPH_MSG_AUTH 17
115#define CEPH_MSG_AUTH_REPLY 18
116
117/* client <-> mds */
118#define CEPH_MSG_MDS_MAP 21
119
120#define CEPH_MSG_CLIENT_SESSION 22
121#define CEPH_MSG_CLIENT_RECONNECT 23
122
123#define CEPH_MSG_CLIENT_REQUEST 24
124#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
125#define CEPH_MSG_CLIENT_REPLY 26
126#define CEPH_MSG_CLIENT_CAPS 0x310
127#define CEPH_MSG_CLIENT_LEASE 0x311
128#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130
131/* osd */
132#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43
135
136struct ceph_mon_request_header {
137 __le64 have_version;
138 __le16 session_mon;
139 __le64 session_mon_tid;
140} __attribute__ ((packed));
141
142struct ceph_mon_statfs {
143 struct ceph_mon_request_header monhdr;
144 struct ceph_fsid fsid;
145} __attribute__ ((packed));
146
147struct ceph_statfs {
148 __le64 kb, kb_used, kb_avail;
149 __le64 num_objects;
150} __attribute__ ((packed));
151
152struct ceph_mon_statfs_reply {
153 struct ceph_fsid fsid;
154 __le64 version;
155 struct ceph_statfs st;
156} __attribute__ ((packed));
157
158struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid;
161 __le32 start;
162} __attribute__ ((packed));
163
164struct ceph_mds_getmap {
165 struct ceph_mon_request_header monhdr;
166 struct ceph_fsid fsid;
167} __attribute__ ((packed));
168
169struct ceph_client_mount {
170 struct ceph_mon_request_header monhdr;
171} __attribute__ ((packed));
172
173struct ceph_mon_subscribe_item {
174 __le64 have_version; __le64 have;
175 __u8 onetime;
176} __attribute__ ((packed));
177
178struct ceph_mon_subscribe_ack {
179 __le32 duration; /* seconds */
180 struct ceph_fsid fsid;
181} __attribute__ ((packed));
182
183/*
184 * mds states
185 * > 0 -> in
186 * <= 0 -> out
187 */
188#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
189#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
190 empty log. */
191#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
192#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
193#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
194#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
195#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
196
197#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
198#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
199 operations (import, rename, etc.) */
200#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
201#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
202#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
203#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
204#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
205
206extern const char *ceph_mds_state_name(int s);
207
208
209/*
210 * metadata lock types.
211 * - these are bitmasks.. we can compose them
212 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds
214 */
215#define CEPH_LOCK_DN 1
216#define CEPH_LOCK_ISNAP 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */
218#define CEPH_LOCK_IFILE 8 /* mds internal */
219#define CEPH_LOCK_IAUTH 32
220#define CEPH_LOCK_ILINK 64
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */
222#define CEPH_LOCK_INEST 256 /* mds internal */
223#define CEPH_LOCK_IXATTR 512
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
225
226/* client_session ops */
227enum {
228 CEPH_SESSION_REQUEST_OPEN,
229 CEPH_SESSION_OPEN,
230 CEPH_SESSION_REQUEST_CLOSE,
231 CEPH_SESSION_CLOSE,
232 CEPH_SESSION_REQUEST_RENEWCAPS,
233 CEPH_SESSION_RENEWCAPS,
234 CEPH_SESSION_STALE,
235 CEPH_SESSION_RECALL_STATE,
236};
237
238extern const char *ceph_session_op_name(int op);
239
240struct ceph_mds_session_head {
241 __le32 op;
242 __le64 seq;
243 struct ceph_timespec stamp;
244 __le32 max_caps, max_leases;
245} __attribute__ ((packed));
246
247/* client_request */
248/*
249 * metadata ops.
250 * & 0x001000 -> write op
251 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
 *  & 0x100000 -> use weird ino/path trace
253 */
254#define CEPH_MDS_OP_WRITE 0x001000
255enum {
256 CEPH_MDS_OP_LOOKUP = 0x00100,
257 CEPH_MDS_OP_GETATTR = 0x00101,
258 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
259 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
260
261 CEPH_MDS_OP_SETXATTR = 0x01105,
262 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108,
265
266 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202,
268 CEPH_MDS_OP_UNLINK = 0x01203,
269 CEPH_MDS_OP_RENAME = 0x01204,
270 CEPH_MDS_OP_MKDIR = 0x01220,
271 CEPH_MDS_OP_RMDIR = 0x01221,
272 CEPH_MDS_OP_SYMLINK = 0x01222,
273
274 CEPH_MDS_OP_CREATE = 0x01301,
275 CEPH_MDS_OP_OPEN = 0x00302,
276 CEPH_MDS_OP_READDIR = 0x00305,
277
278 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
279 CEPH_MDS_OP_MKSNAP = 0x01400,
280 CEPH_MDS_OP_RMSNAP = 0x01401,
281 CEPH_MDS_OP_LSSNAP = 0x00402,
282};
283
284extern const char *ceph_mds_op_name(int op);
285
286
287#define CEPH_SETATTR_MODE 1
288#define CEPH_SETATTR_UID 2
289#define CEPH_SETATTR_GID 4
290#define CEPH_SETATTR_MTIME 8
291#define CEPH_SETATTR_ATIME 16
292#define CEPH_SETATTR_SIZE 32
293#define CEPH_SETATTR_CTIME 64
294
295union ceph_mds_request_args {
296 struct {
297 __le32 mask; /* CEPH_CAP_* */
298 } __attribute__ ((packed)) getattr;
299 struct {
300 __le32 mode;
301 __le32 uid;
302 __le32 gid;
303 struct ceph_timespec mtime;
304 struct ceph_timespec atime;
305 __le64 size, old_size; /* old_size needed by truncate */
306 __le32 mask; /* CEPH_SETATTR_* */
307 } __attribute__ ((packed)) setattr;
308 struct {
309 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */
311 } __attribute__ ((packed)) readdir;
312 struct {
313 __le32 mode;
314 __le32 rdev;
315 } __attribute__ ((packed)) mknod;
316 struct {
317 __le32 mode;
318 } __attribute__ ((packed)) mkdir;
319 struct {
320 __le32 flags;
321 __le32 mode;
322 __le32 stripe_unit; /* layout for newly created file */
323 __le32 stripe_count; /* ... */
324 __le32 object_size;
325 __le32 file_replication;
326 __le32 preferred;
327 } __attribute__ ((packed)) open;
328 struct {
329 __le32 flags;
330 } __attribute__ ((packed)) setxattr;
331 struct {
332 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout;
334} __attribute__ ((packed));
335
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
337#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
338
339struct ceph_mds_request_head {
340 __le64 oldest_client_tid;
341 __le32 mdsmap_epoch; /* on client */
342 __le32 flags; /* CEPH_MDS_FLAG_* */
343 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
344 __le16 num_releases; /* # include cap/lease release records */
345 __le32 op; /* mds op code */
346 __le32 caller_uid, caller_gid;
347 __le64 ino; /* use this ino for openc, mkdir, mknod,
348 etc. (if replaying) */
349 union ceph_mds_request_args args;
350} __attribute__ ((packed));
351
352/* cap/lease release record */
353struct ceph_mds_request_release {
354 __le64 ino, cap_id; /* ino and unique cap id */
355 __le32 caps, wanted; /* new issued, wanted */
356 __le32 seq, issue_seq, mseq;
357 __le32 dname_seq; /* if releasing a dentry lease, a */
358 __le32 dname_len; /* string follows. */
359} __attribute__ ((packed));
360
361/* client reply */
362struct ceph_mds_reply_head {
363 __le32 op;
364 __le32 result;
365 __le32 mdsmap_epoch;
366 __u8 safe; /* true if committed to disk */
367 __u8 is_dentry, is_target; /* true if dentry, target inode records
368 are included with reply */
369} __attribute__ ((packed));
370
371/* one for each node split */
372struct ceph_frag_tree_split {
373 __le32 frag; /* this frag splits... */
374 __le32 by; /* ...by this many bits */
375} __attribute__ ((packed));
376
377struct ceph_frag_tree_head {
378 __le32 nsplits; /* num ceph_frag_tree_split records */
379 struct ceph_frag_tree_split splits[];
380} __attribute__ ((packed));
381
382/* capability issue, for bundling with mds reply */
383struct ceph_mds_reply_cap {
384 __le32 caps, wanted; /* caps issued, wanted */
385 __le64 cap_id;
386 __le32 seq, mseq;
387 __le64 realm; /* snap realm */
388 __u8 flags; /* CEPH_CAP_FLAG_* */
389} __attribute__ ((packed));
390
391#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
392
393/* inode record, for bundling with mds reply */
394struct ceph_mds_reply_inode {
395 __le64 ino;
396 __le64 snapid;
397 __le32 rdev;
398 __le64 version; /* inode version */
399 __le64 xattr_version; /* version for xattr blob */
400 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
401 struct ceph_file_layout layout;
402 struct ceph_timespec ctime, mtime, atime;
403 __le32 time_warp_seq;
404 __le64 size, max_size, truncate_size;
405 __le32 truncate_seq;
406 __le32 mode, uid, gid;
407 __le32 nlink;
408 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
409 struct ceph_timespec rctime;
410 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
411} __attribute__ ((packed));
412/* followed by frag array, then symlink string, then xattr blob */
413
414/* reply_lease follows dname, and reply_inode */
415struct ceph_mds_reply_lease {
416 __le16 mask; /* lease type(s) */
417 __le32 duration_ms; /* lease duration */
418 __le32 seq;
419} __attribute__ ((packed));
420
421struct ceph_mds_reply_dirfrag {
422 __le32 frag; /* fragment */
423 __le32 auth; /* auth mds, if this is a delegation point */
424 __le32 ndist; /* number of mds' this is replicated on */
425 __le32 dist[];
426} __attribute__ ((packed));
427
428/* file access modes */
429#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1
431#define CEPH_FILE_MODE_WR 2
432#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
433#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
434#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
435
436int ceph_flags_to_mode(int flags);
437
438
439/* capability bits */
440#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
441
442/* generic cap bits */
443#define CEPH_CAP_GSHARED 1 /* client can read */
444#define CEPH_CAP_GEXCL 2 /* client can read and update */
445#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
446#define CEPH_CAP_GRD 8 /* (file) client can read */
447#define CEPH_CAP_GWR 16 /* (file) client can write */
448#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
449#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
450#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
451
452/* per-lock shift */
453#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
457
458#define CEPH_CAP_BITS 16
459
460/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
462#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
463#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
464#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
465#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
466#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
467#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
468#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
469#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
470#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
471#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
472#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
476
477/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
479#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
480#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
481#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
482#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
483#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
484#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
485#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
486#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
487#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
488#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
489#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
490#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
491 CEPH_CAP_AUTH_SHARED | \
492 CEPH_CAP_LINK_SHARED | \
493 CEPH_CAP_FILE_SHARED | \
494 CEPH_CAP_XATTR_SHARED)
495
496#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
497 CEPH_CAP_LINK_SHARED | \
498 CEPH_CAP_XATTR_SHARED | \
499 CEPH_CAP_FILE_SHARED)
500#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
501 CEPH_CAP_FILE_CACHE)
502
503#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
504 CEPH_CAP_LINK_EXCL | \
505 CEPH_CAP_XATTR_EXCL | \
506 CEPH_CAP_FILE_EXCL)
507#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
508 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
512
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR)
515
516int ceph_caps_for_mode(int mode);
517
518enum {
519 CEPH_CAP_OP_GRANT, /* mds->client grant */
520 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
521 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
522 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
523 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
524 CEPH_CAP_OP_UPDATE, /* client->mds update */
525 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
526 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
527 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
528 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
529 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
530 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
531 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
532};
533
534extern const char *ceph_cap_op_name(int op);
535
536/*
537 * caps message, used for capability callbacks, acks, requests, etc.
538 */
539struct ceph_mds_caps {
540 __le32 op; /* CEPH_CAP_OP_* */
541 __le64 ino, realm;
542 __le64 cap_id;
543 __le32 seq, issue_seq;
544 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
545 __le32 migrate_seq;
546 __le64 snap_follows;
547 __le32 snap_trace_len;
548
549 /* authlock */
550 __le32 uid, gid, mode;
551
552 /* linklock */
553 __le32 nlink;
554
555 /* xattrlock */
556 __le32 xattr_len;
557 __le64 xattr_version;
558
559 /* filelock */
560 __le64 size, max_size, truncate_size;
561 __le32 truncate_seq;
562 struct ceph_timespec mtime, atime, ctime;
563 struct ceph_file_layout layout;
564 __le32 time_warp_seq;
565} __attribute__ ((packed));
566
567/* cap release msg head */
568struct ceph_mds_cap_release {
569 __le32 num; /* number of cap_items that follow */
570} __attribute__ ((packed));
571
572struct ceph_mds_cap_item {
573 __le64 ino;
574 __le64 cap_id;
575 __le32 migrate_seq, seq;
576} __attribute__ ((packed));
577
578#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
579#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
580#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
581#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
582
583extern const char *ceph_lease_op_name(int o);
584
585/* lease msg header */
586struct ceph_mds_lease {
587 __u8 action; /* CEPH_MDS_LEASE_* */
588 __le16 mask; /* which lease */
589 __le64 ino;
590 __le64 first, last; /* snap range */
591 __le32 seq;
592 __le32 duration_ms; /* duration of renewal */
593} __attribute__ ((packed));
594/* followed by a __le32+string for dname */
595
596/* client reconnect */
597struct ceph_mds_cap_reconnect {
598 __le64 cap_id;
599 __le32 wanted;
600 __le32 issued;
601 __le64 size;
602 struct ceph_timespec mtime, atime;
603 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed));
606/* followed by encoded string */
607
608struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */
610 __le64 seq; /* snap seq for this snap realm */
611 __le64 parent; /* parent realm */
612} __attribute__ ((packed));
613
614/*
615 * snaps
616 */
617enum {
618 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
619 CEPH_SNAP_OP_CREATE,
620 CEPH_SNAP_OP_DESTROY,
621 CEPH_SNAP_OP_SPLIT,
622};
623
624extern const char *ceph_snap_op_name(int o);
625
626/* snap msg header */
627struct ceph_mds_snap_head {
628 __le32 op; /* CEPH_SNAP_OP_* */
629 __le64 split; /* ino to split off, if any */
630 __le32 num_split_inos; /* # inos belonging to new child realm */
631 __le32 num_split_realms; /* # child realms under new child realm */
632 __le32 trace_len; /* size of snap trace blob */
633} __attribute__ ((packed));
634/* followed by split ino list, then split realms, then the trace blob */
635
636/*
637 * encode info about a snaprealm, as viewed by a client
638 */
639struct ceph_mds_snap_realm {
640 __le64 ino; /* ino */
641 __le64 created; /* snap: when created */
642 __le64 parent; /* ino: parent realm */
643 __le64 parent_since; /* snap: same parent since */
644 __le64 seq; /* snap: version */
645 __le32 num_snaps;
646 __le32 num_prior_parent_snaps;
647} __attribute__ ((packed));
648/* followed by my snap list, then prior parent snap list */
649
650#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkin's hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
/*
 * Classic linux dcache string hash: fold each byte's nibbles into a
 * running accumulator and scale by 11.  Result is truncated to
 * unsigned on return.
 */
unsigned ceph_str_hash_linux(const char *str, unsigned length)
{
	unsigned long acc = 0;
	unsigned i;

	for (i = 0; i < length; i++) {
		unsigned char ch = (unsigned char)str[i];

		acc = (acc + (ch << 4) + (ch >> 4)) * 11;
	}
	return acc;
}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68
69const char *ceph_mds_state_name(int s)
70{
71 switch (s) {
72 /* down and out */
73 case CEPH_MDS_STATE_DNE: return "down:dne";
74 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
75 /* up and out */
76 case CEPH_MDS_STATE_BOOT: return "up:boot";
77 case CEPH_MDS_STATE_STANDBY: return "up:standby";
78 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
79 case CEPH_MDS_STATE_CREATING: return "up:creating";
80 case CEPH_MDS_STATE_STARTING: return "up:starting";
81 /* up and in */
82 case CEPH_MDS_STATE_REPLAY: return "up:replay";
83 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
84 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
85 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
86 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
87 case CEPH_MDS_STATE_ACTIVE: return "up:active";
88 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
89 }
90 return "???";
91}
92
93const char *ceph_session_op_name(int op)
94{
95 switch (op) {
96 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
97 case CEPH_SESSION_OPEN: return "open";
98 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
99 case CEPH_SESSION_CLOSE: return "close";
100 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
101 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
102 case CEPH_SESSION_STALE: return "stale";
103 case CEPH_SESSION_RECALL_STATE: return "recall_state";
104 }
105 return "???";
106}
107
108const char *ceph_mds_op_name(int op)
109{
110 switch (op) {
111 case CEPH_MDS_OP_LOOKUP: return "lookup";
112 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
113 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
114 case CEPH_MDS_OP_GETATTR: return "getattr";
115 case CEPH_MDS_OP_SETXATTR: return "setxattr";
116 case CEPH_MDS_OP_SETATTR: return "setattr";
117 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
118 case CEPH_MDS_OP_READDIR: return "readdir";
119 case CEPH_MDS_OP_MKNOD: return "mknod";
120 case CEPH_MDS_OP_LINK: return "link";
121 case CEPH_MDS_OP_UNLINK: return "unlink";
122 case CEPH_MDS_OP_RENAME: return "rename";
123 case CEPH_MDS_OP_MKDIR: return "mkdir";
124 case CEPH_MDS_OP_RMDIR: return "rmdir";
125 case CEPH_MDS_OP_SYMLINK: return "symlink";
126 case CEPH_MDS_OP_CREATE: return "create";
127 case CEPH_MDS_OP_OPEN: return "open";
128 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
129 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
132 }
133 return "???";
134}
135
136const char *ceph_cap_op_name(int op)
137{
138 switch (op) {
139 case CEPH_CAP_OP_GRANT: return "grant";
140 case CEPH_CAP_OP_REVOKE: return "revoke";
141 case CEPH_CAP_OP_TRUNC: return "trunc";
142 case CEPH_CAP_OP_EXPORT: return "export";
143 case CEPH_CAP_OP_IMPORT: return "import";
144 case CEPH_CAP_OP_UPDATE: return "update";
145 case CEPH_CAP_OP_DROP: return "drop";
146 case CEPH_CAP_OP_FLUSH: return "flush";
147 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
148 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
149 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
150 case CEPH_CAP_OP_RELEASE: return "release";
151 case CEPH_CAP_OP_RENEW: return "renew";
152 }
153 return "???";
154}
155
156const char *ceph_lease_op_name(int o)
157{
158 switch (o) {
159 case CEPH_MDS_LEASE_REVOKE: return "revoke";
160 case CEPH_MDS_LEASE_RELEASE: return "release";
161 case CEPH_MDS_LEASE_RENEW: return "renew";
162 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
163 }
164 return "???";
165}
166
167const char *ceph_snap_op_name(int o)
168{
169 switch (o) {
170 case CEPH_SNAP_OP_UPDATE: return "update";
171 case CEPH_SNAP_OP_CREATE: return "create";
172 case CEPH_SNAP_OP_DESTROY: return "destroy";
173 case CEPH_SNAP_OP_SPLIT: return "split";
174 }
175 return "???";
176}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers to identify the parent bucket of a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
/*
 * Robert Jenkins' function for mixing 32-bit values
 * http://burtleburtle.net/bob/hash/evahash.html
 * a, b = random bits, c = input and output
 *
 * do/while(0) makes the multi-statement macro safe inside un-braced
 * if/else bodies.  All three arguments are evaluated (and modified)
 * repeatedly — pass only plain lvalues, never expressions with side
 * effects.
 */
#define crush_hashmix(a, b, c) do { \
	a = a-b; a = a-c; a = a^(c>>13); \
	b = b-c; b = b-a; b = b^(a<<8); \
	c = c-a; c = c-b; c = c^(b>>13); \
	a = a-b; a = a-c; a = a^(c>>12); \
	b = b-c; b = b-a; b = b^(a<<16); \
	c = c-a; c = c-b; c = c^(b>>5); \
	a = a-b; a = a-c; a = a^(c>>3); \
	b = b-c; b = b-a; b = b^(a<<10); \
	c = c-a; c = c-b; c = c^(b>>15); \
	} while (0)

/* common seed for all crush hash functions (arbitrary constant) */
#define crush_hash_seed 1315423911
23
/*
 * One hash helper per input arity (1..5 32-bit values).  Each seeds the
 * accumulator with crush_hash_seed xor'd against all inputs, then stirs
 * the inputs back in via crush_hashmix.  The constants 231232 and 1232
 * are arbitrary filler values used to pad out the mixing rounds.
 * The exact sequence of mix calls is part of the on-wire/on-disk CRUSH
 * contract — do not reorder.
 */
static __u32 crush_hash32_rjenkins1(__u32 a)
{
	__u32 hash = crush_hash_seed ^ a;
	__u32 b = a;
	__u32 x = 231232;
	__u32 y = 1232;
	crush_hashmix(b, x, hash);
	crush_hashmix(y, a, hash);
	return hash;
}

static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
{
	__u32 hash = crush_hash_seed ^ a ^ b;
	__u32 x = 231232;
	__u32 y = 1232;
	crush_hashmix(a, b, hash);
	crush_hashmix(x, a, hash);
	crush_hashmix(b, y, hash);
	return hash;
}

static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
{
	__u32 hash = crush_hash_seed ^ a ^ b ^ c;
	__u32 x = 231232;
	__u32 y = 1232;
	crush_hashmix(a, b, hash);
	crush_hashmix(c, x, hash);
	crush_hashmix(y, a, hash);
	crush_hashmix(b, x, hash);
	crush_hashmix(y, c, hash);
	return hash;
}

static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
{
	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
	__u32 x = 231232;
	__u32 y = 1232;
	crush_hashmix(a, b, hash);
	crush_hashmix(c, d, hash);
	crush_hashmix(a, x, hash);
	crush_hashmix(y, b, hash);
	crush_hashmix(c, x, hash);
	crush_hashmix(y, d, hash);
	return hash;
}

static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
				      __u32 e)
{
	__u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
	__u32 x = 231232;
	__u32 y = 1232;
	crush_hashmix(a, b, hash);
	crush_hashmix(c, d, hash);
	crush_hashmix(e, x, hash);
	crush_hashmix(y, a, hash);
	crush_hashmix(b, x, hash);
	crush_hashmix(y, c, hash);
	crush_hashmix(d, x, hash);
	crush_hashmix(y, e, hash);
	return hash;
}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..9ba54efb6543
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
/* Pick the item at permutation position @r for input @x, lazily
 * materializing a pseudorandom permutation of the bucket in
 * bucket->perm[].  NOTE(review): assumes bucket->size > 0 (the modulo
 * below would otherwise divide by zero) — confirm callers only reach
 * this with non-empty buckets. */
static int bucket_perm_choose(struct crush_bucket *bucket,
			      int x, int r)
{
	unsigned pr = r % bucket->size;	/* position within the permutation */
	unsigned i, s;

	/* start a new permutation if @x has changed */
	if (bucket->perm_x != x || bucket->perm_n == 0) {
		dprintk("bucket %d new x=%d\n", bucket->id, x);
		bucket->perm_x = x;

		/* optimize common r=0 case */
		if (pr == 0) {
			s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
				bucket->size;
			bucket->perm[0] = s;
			bucket->perm_n = 0xffff; /* magic value, see below */
			goto out;
		}

		/* identity permutation; shuffled incrementally below */
		for (i = 0; i < bucket->size; i++)
			bucket->perm[i] = i;
		bucket->perm_n = 0;
	} else if (bucket->perm_n == 0xffff) {
		/* clean up after the r=0 case above: rebuild the identity
		 * permutation, then swap slot 0's choice into place */
		for (i = 1; i < bucket->size; i++)
			bucket->perm[i] = i;
		bucket->perm[bucket->perm[0]] = 0;
		bucket->perm_n = 1;
	}

	/* calculate permutation up to pr (Fisher-Yates style: hash picks
	 * which remaining element is swapped into position p) */
	for (i = 0; i < bucket->perm_n; i++)
		dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
	while (bucket->perm_n <= pr) {
		unsigned p = bucket->perm_n;
		/* no point in swapping the final entry */
		if (p < bucket->size - 1) {
			i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
				(bucket->size - p);
			if (i) {
				unsigned t = bucket->perm[p + i];
				bucket->perm[p + i] = bucket->perm[p];
				bucket->perm[p] = t;
			}
			dprintk(" perm_choose swap %d with %d\n", p, p+i);
		}
		bucket->perm_n++;
	}
	for (i = 0; i < bucket->size; i++)
		dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);

	s = bucket->perm[pr];
out:
	dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
		bucket->size, x, r, pr, s);
	return bucket->items[s];
}
126
/* uniform */
static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
				 int x, int r)
{
	/* all items weigh the same, so a plain permutation suffices */
	return bucket_perm_choose(&bucket->h, x, r);
}

/* list: walk from the tail; at each item draw a 16-bit fixed-point
 * random value scaled by the running weight sum, and stop when the
 * draw falls inside this item's own weight */
static int bucket_list_choose(struct crush_bucket_list *bucket,
			      int x, int r)
{
	int i;

	for (i = bucket->h.size-1; i >= 0; i--) {
		__u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
					 r, bucket->h.id);
		w &= 0xffff;	/* 16-bit fixed point */
		dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
			"sw %x rand %llx",
			i, x, r, bucket->h.items[i], bucket->item_weights[i],
			bucket->sum_weights[i], w);
		w *= bucket->sum_weights[i];
		w = w >> 16;	/* scale draw into [0, sum_weights[i]) */
		/*dprintk(" scaled %llx\n", w);*/
		if (w < bucket->item_weights[i])
			return bucket->h.items[i];
	}

	/* unreachable if sum_weights is consistent with item_weights:
	 * at i==0 sum == weight, so the draw always lands inside */
	BUG_ON(1);
	return 0;
}
158
159
/* (binary) tree */

/*
 * Tree nodes are numbered so that a node's level equals the count of
 * trailing zero bits in its id: odd ids are leaves, and a node x at
 * level h has children at x -/+ 2^(h-1).
 */
static int height(int n)
{
	int h;

	for (h = 0; (n & 1) == 0; n >>= 1)
		h++;
	return h;
}

static int left(int x)
{
	return x - (1 << (height(x) - 1));
}

static int right(int x)
{
	return x + (1 << (height(x) - 1));
}

static int terminal(int x)
{
	return x & 1;	/* odd node ids are leaves */
}
187
/* Descend the weighted binary tree from the root, at each interior
 * node hashing to a point in [0, node weight) and branching toward
 * whichever child's weight range contains it; the leaf reached maps
 * to an item. */
static int bucket_tree_choose(struct crush_bucket_tree *bucket,
			      int x, int r)
{
	int n, l;
	__u32 w;
	__u64 t;

	/* start at root */
	n = bucket->num_nodes >> 1;

	while (!terminal(n)) {
		/* pick point in [0, w) */
		w = bucket->node_weights[n];
		t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
					  bucket->h.id) * (__u64)w;
		t = t >> 32;	/* scale the 32-bit hash into [0, w) */

		/* descend to the left or right? */
		l = left(n);
		if (t < bucket->node_weights[l])
			n = l;
		else
			n = right(n);
	}

	/* leaf id n (odd) corresponds to item n>>1 */
	return bucket->h.items[n >> 1];
}
215
216
/* straw */

/* Every item independently draws a 16-bit pseudorandom value scaled by
 * its precomputed straw length; the item with the longest scaled draw
 * wins.  This yields weight-proportional selection with O(size) work. */
static int bucket_straw_choose(struct crush_bucket_straw *bucket,
			       int x, int r)
{
	int i;
	int high = 0;		/* index of current winner */
	__u64 high_draw = 0;
	__u64 draw;

	for (i = 0; i < bucket->h.size; i++) {
		draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
		draw &= 0xffff;	/* 16-bit fixed point */
		draw *= bucket->straws[i];
		if (i == 0 || draw > high_draw) {
			high = i;
			high_draw = draw;
		}
	}
	return bucket->h.items[high];
}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" (failed, fully offloaded)
263 * of the cluster
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x1000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
/**
 * crush_choose - choose numrep distinct items of given type
 * @map: the crush_map
 * @bucket: the bucket we are choosing an item from
 * @x: crush input value
 * @numrep: the number of items to choose
 * @type: the type of item to choose
 * @out: pointer to output vector
 * @outpos: our position in that vector
 * @firstn: true if choosing "first n" items, false if choosing "indep"
 * @recurse_to_leaf: true if we want one device under each item of given type
 * @out2: second output vector for leaf items (if @recurse_to_leaf)
 */
static int crush_choose(struct crush_map *map,
			struct crush_bucket *bucket,
			__u32 *weight,
			int x, int numrep, int type,
			int *out, int outpos,
			int firstn, int recurse_to_leaf,
			int *out2)
{
	int rep;
	int ftotal, flocal;	/* total / per-bucket failure counts */
	int retry_descent, retry_bucket, skip_rep;
	struct crush_bucket *in = bucket;
	int r;
	int i;
	int item = 0;
	int itemtype;
	int collide, reject;
	const int orig_tries = 5; /* attempts before we fall back to search */
	dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);

	for (rep = outpos; rep < numrep; rep++) {
		/* keep trying until we get a non-out, non-colliding item */
		ftotal = 0;
		skip_rep = 0;
		do {
			retry_descent = 0;
			in = bucket; /* initial bucket */

			/* choose through intervening buckets */
			flocal = 0;
			do {
				collide = 0;
				retry_bucket = 0;
				r = rep;
				/* perturb r on retries so a fresh item is
				 * drawn; the adjustment differs per
				 * algorithm and firstn/indep mode */
				if (in->alg == CRUSH_BUCKET_UNIFORM) {
					/* be careful */
					if (firstn || numrep >= in->size)
						/* r' = r + f_total */
						r += ftotal;
					else if (in->size % numrep == 0)
						/* r'=r+(n+1)*f_local */
						r += (numrep+1) *
							(flocal+ftotal);
					else
						/* r' = r + n*f_local */
						r += numrep * (flocal+ftotal);
				} else {
					if (firstn)
						/* r' = r + f_total */
						r += ftotal;
					else
						/* r' = r + n*f_local */
						r += numrep * (flocal+ftotal);
				}

				/* bucket choose */
				if (in->size == 0) {
					reject = 1;
					goto reject;
				}
				/* after many local failures, fall back to
				 * an exhaustive permutation search */
				if (flocal >= (in->size>>1) &&
				    flocal > orig_tries)
					item = bucket_perm_choose(in, x, r);
				else
					item = crush_bucket_choose(in, x, r);
				BUG_ON(item >= map->max_devices);

				/* desired type? (negative ids are buckets,
				 * non-negative are devices of type 0) */
				if (item < 0)
					itemtype = map->buckets[-1-item]->type;
				else
					itemtype = 0;
				dprintk(" item %d type %d\n", item, itemtype);

				/* keep going? */
				if (itemtype != type) {
					BUG_ON(item >= 0 ||
					       (-1-item) >= map->max_buckets);
					in = map->buckets[-1-item];
					continue;
				}

				/* collision? */
				for (i = 0; i < outpos; i++) {
					if (out[i] == item) {
						collide = 1;
						break;
					}
				}

				/* when recursing to a leaf, reject this
				 * item if no device can be found under it */
				if (recurse_to_leaf &&
				    item < 0 &&
				    crush_choose(map, map->buckets[-1-item],
						 weight,
						 x, outpos+1, 0,
						 out2, outpos,
						 firstn, 0, NULL) <= outpos) {
					reject = 1;
				} else {
					/* out? */
					if (itemtype == 0)
						reject = is_out(map, weight,
								item, x);
					else
						reject = 0;
				}

reject:
				if (reject || collide) {
					ftotal++;
					flocal++;

					if (collide && flocal < 3)
						/* retry locally a few times */
						retry_bucket = 1;
					else if (flocal < in->size + orig_tries)
						/* exhaustive bucket search */
						retry_bucket = 1;
					else if (ftotal < 20)
						/* then retry descent */
						retry_descent = 1;
					else
						/* else give up */
						skip_rep = 1;
					dprintk(" reject %d collide %d "
						"ftotal %d flocal %d\n",
						reject, collide, ftotal,
						flocal);
				}
			} while (retry_bucket);
		} while (retry_descent);

		if (skip_rep) {
			dprintk("skip rep\n");
			continue;
		}

		dprintk("choose got %d\n", item);
		out[outpos] = item;
		outpos++;
	}

	dprintk("choose returns %d\n", outpos);
	return outpos;
}
435
436
437/**
438 * crush_do_rule - calculate a mapping with the given input and rule
439 * @map: the crush_map
440 * @ruleno: the rule id
441 * @x: hash input
442 * @result: pointer to result vector
443 * @result_max: maximum result size
444 * @force: force initial replica choice; -1 for none
445 */
int crush_do_rule(struct crush_map *map,
		  int ruleno, int x, int *result, int result_max,
		  int force, __u32 *weight)
{
	int result_len;
	/* chain of ancestors of the forced device, leaf first */
	int force_context[CRUSH_MAX_DEPTH];
	int force_pos = -1;
	/* a/b are the double-buffered working vectors; c collects leaves */
	int a[CRUSH_MAX_SET];
	int b[CRUSH_MAX_SET];
	int c[CRUSH_MAX_SET];
	int recurse_to_leaf;
	int *w;			/* current working vector */
	int wsize = 0;
	int *o;			/* output vector for the current step */
	int osize;
	int *tmp;
	struct crush_rule *rule;
	int step;
	int i, j;
	int numrep;
	int firstn;
	int rc = -1;

	BUG_ON(ruleno >= map->max_rules);

	rule = map->rules[ruleno];
	result_len = 0;
	w = a;
	o = b;

	/*
	 * determine hierarchical context of force, if any. note
	 * that this may or may not correspond to the specific types
	 * referenced by the crush rule.
	 */
	if (force >= 0) {
		if (force >= map->max_devices ||
		    map->device_parents[force] == 0) {
			/*dprintk("CRUSH: forcefed device dne\n");*/
			rc = -1; /* force fed device dne */
			goto out;
		}
		if (!is_out(map, weight, force, x)) {
			/* walk up the parent chain to the root, recording
			 * each ancestor so choose steps can honor it */
			while (1) {
				force_context[++force_pos] = force;
				if (force >= 0)
					force = map->device_parents[force];
				else
					force = map->bucket_parents[-1-force];
				if (force == 0)
					break;
			}
		}
	}

	for (step = 0; step < rule->len; step++) {
		firstn = 0;
		switch (rule->steps[step].op) {
		case CRUSH_RULE_TAKE:
			w[0] = rule->steps[step].arg1;
			if (force_pos >= 0) {
				BUG_ON(force_context[force_pos] != w[0]);
				force_pos--;
			}
			wsize = 1;
			break;

		case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
		case CRUSH_RULE_CHOOSE_FIRSTN:
			firstn = 1;
			/* fall through: all CHOOSE variants share the
			 * selection logic below */
		case CRUSH_RULE_CHOOSE_LEAF_INDEP:
		case CRUSH_RULE_CHOOSE_INDEP:
			BUG_ON(wsize == 0);

			recurse_to_leaf =
				rule->steps[step].op ==
				 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
				rule->steps[step].op ==
				CRUSH_RULE_CHOOSE_LEAF_INDEP;

			/* reset output */
			osize = 0;

			for (i = 0; i < wsize; i++) {
				/*
				 * see CRUSH_N, CRUSH_N_MINUS macros.
				 * basically, numrep <= 0 means relative to
				 * the provided result_max
				 */
				numrep = rule->steps[step].arg1;
				if (numrep <= 0) {
					numrep += result_max;
					if (numrep <= 0)
						continue;
				}
				j = 0;
				if (osize == 0 && force_pos >= 0) {
					/* skip any intermediate types */
					while (force_pos &&
					       force_context[force_pos] < 0 &&
					       rule->steps[step].arg2 !=
					       map->buckets[-1 -
					       force_context[force_pos]]->type)
						force_pos--;
					/* pre-seed slot 0 with the forced
					 * ancestor of the desired type */
					o[osize] = force_context[force_pos];
					if (recurse_to_leaf)
						c[osize] = force_context[0];
					j++;
					force_pos--;
				}
				osize += crush_choose(map,
						      map->buckets[-1-w[i]],
						      weight,
						      x, numrep,
						      rule->steps[step].arg2,
						      o+osize, j,
						      firstn,
						      recurse_to_leaf, c+osize);
			}

			if (recurse_to_leaf)
				/* copy final _leaf_ values to output set */
				memcpy(o, c, osize*sizeof(*o));

			/* swap t and w arrays */
			tmp = o;
			o = w;
			w = tmp;
			wsize = osize;
			break;


		case CRUSH_RULE_EMIT:
			/* append the working vector to the caller's result */
			for (i = 0; i < wsize && result_len < result_max; i++) {
				result[result_len] = w[i];
				result_len++;
			}
			wsize = 0;
			break;

		default:
			BUG_ON(1);
		}
	}
	rc = result_len;

out:
	return rc;
}
595
596
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
/*
 * CRUSH functions for finding rules and then mapping an input to an
 * output set.
 *
 * LGPL2
 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..291ac288e791
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,408 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <crypto/hash.h>
7
8#include "crypto.h"
9#include "decode.h"
10
/*
 * Serialize @key into the buffer at *p (bounded by @end), advancing *p.
 * Wire layout: u16 type, ceph_timespec created, u16 len, len key bytes.
 * Returns 0, or -ERANGE if the buffer is too small.
 */
int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
{
	if (*p + sizeof(u16) + sizeof(key->created) +
	    sizeof(u16) + key->len > end)
		return -ERANGE;
	ceph_encode_16(p, key->type);
	ceph_encode_copy(p, &key->created, sizeof(key->created));
	ceph_encode_16(p, key->len);
	ceph_encode_copy(p, key->key, key->len);
	return 0;
}

/*
 * Inverse of ceph_crypto_key_encode.  Key material is copied into a
 * freshly kmalloc'd buffer owned by @key (freed by
 * ceph_crypto_key_destroy).  Returns 0, -EINVAL on truncated input,
 * or -ENOMEM.
 */
int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
{
	ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
	key->type = ceph_decode_16(p);
	ceph_decode_copy(p, &key->created, sizeof(key->created));
	key->len = ceph_decode_16(p);
	ceph_decode_need(p, end, key->len, bad);
	key->key = kmalloc(key->len, GFP_NOFS);
	if (!key->key)
		return -ENOMEM;
	ceph_decode_copy(p, key->key, key->len);
	return 0;

bad:
	dout("failed to decode crypto key\n");
	return -EINVAL;
}

/*
 * Decode a base64-"armored" key string (as found in keyring files)
 * into @key.  Returns 0 or a negative errno.
 */
int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
{
	int inlen = strlen(inkey);
	int blen = inlen * 3 / 4;	/* upper bound on decoded size */
	void *buf, *p;
	int ret;

	dout("crypto_key_unarmor %s\n", inkey);
	buf = kmalloc(blen, GFP_NOFS);
	if (!buf)
		return -ENOMEM;
	blen = ceph_unarmor(buf, inkey, inkey+inlen);
	if (blen < 0) {
		kfree(buf);
		return blen;
	}

	p = buf;
	ret = ceph_crypto_key_decode(key, &p, p + blen);
	kfree(buf);
	if (ret)
		return ret;
	dout("crypto_key_unarmor key %p type %d len %d\n", key,
	     key->type, key->len);
	return 0;
}
67
68
69
#define AES_KEY_SIZE 16		/* AES-128 */

/* Allocate a synchronous AES-CBC transform; the caller must release it
 * with crypto_free_blkcipher(). */
static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
{
	return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
}

/* Fixed IV shared by all ceph AES operations (16 bytes == one AES
 * block).  NOTE(review): a constant, well-known IV is part of the ceph
 * wire protocol here — confirm against the protocol spec before
 * changing. */
const u8 *aes_iv = "cephsageyudagreg";
78
79int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
80 const void *src, size_t src_len)
81{
82 struct scatterlist sg_in[2], sg_out[1];
83 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
84 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
85 int ret;
86 void *iv;
87 int ivsize;
88 size_t zero_padding = (0x10 - (src_len & 0x0f));
89 char pad[16];
90
91 if (IS_ERR(tfm))
92 return PTR_ERR(tfm);
93
94 memset(pad, zero_padding, zero_padding);
95
96 *dst_len = src_len + zero_padding;
97
98 crypto_blkcipher_setkey((void *)tfm, key, key_len);
99 sg_init_table(sg_in, 2);
100 sg_set_buf(&sg_in[0], src, src_len);
101 sg_set_buf(&sg_in[1], pad, zero_padding);
102 sg_init_table(sg_out, 1);
103 sg_set_buf(sg_out, dst, *dst_len);
104 iv = crypto_blkcipher_crt(tfm)->iv;
105 ivsize = crypto_blkcipher_ivsize(tfm);
106
107 memcpy(iv, aes_iv, ivsize);
108 /*
109 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
110 key, key_len, 1);
111 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
112 src, src_len, 1);
113 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
114 pad, zero_padding, 1);
115 */
116 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
117 src_len + zero_padding);
118 crypto_free_blkcipher(tfm);
119 if (ret < 0)
120 pr_err("ceph_aes_crypt failed %d\n", ret);
121 /*
122 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
123 dst, *dst_len, 1);
124 */
125 return 0;
126}
127
128int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
129 const void *src1, size_t src1_len,
130 const void *src2, size_t src2_len)
131{
132 struct scatterlist sg_in[3], sg_out[1];
133 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
134 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
135 int ret;
136 void *iv;
137 int ivsize;
138 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
139 char pad[16];
140
141 if (IS_ERR(tfm))
142 return PTR_ERR(tfm);
143
144 memset(pad, zero_padding, zero_padding);
145
146 *dst_len = src1_len + src2_len + zero_padding;
147
148 crypto_blkcipher_setkey((void *)tfm, key, key_len);
149 sg_init_table(sg_in, 3);
150 sg_set_buf(&sg_in[0], src1, src1_len);
151 sg_set_buf(&sg_in[1], src2, src2_len);
152 sg_set_buf(&sg_in[2], pad, zero_padding);
153 sg_init_table(sg_out, 1);
154 sg_set_buf(sg_out, dst, *dst_len);
155 iv = crypto_blkcipher_crt(tfm)->iv;
156 ivsize = crypto_blkcipher_ivsize(tfm);
157
158 memcpy(iv, aes_iv, ivsize);
159 /*
160 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
161 key, key_len, 1);
162 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
163 src1, src1_len, 1);
164 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
165 src2, src2_len, 1);
166 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
167 pad, zero_padding, 1);
168 */
169 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
170 src1_len + src2_len + zero_padding);
171 crypto_free_blkcipher(tfm);
172 if (ret < 0)
173 pr_err("ceph_aes_crypt2 failed %d\n", ret);
174 /*
175 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
176 dst, *dst_len, 1);
177 */
178 return 0;
179}
180
/*
 * Decrypt @src into @dst with AES-128-CBC and strip the trailing
 * padding.  *dst_len is in/out: the capacity of @dst on entry, the
 * plaintext length on return.  Up to one block of overflow (the
 * padding) may land in the local pad[] buffer instead of @dst.
 * Returns 0, a crypto errno, or -EPERM on malformed padding.
 */
int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
		     const void *src, size_t src_len)
{
	struct scatterlist sg_in[1], sg_out[2];
	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
	struct blkcipher_desc desc = { .tfm = tfm };
	char pad[16];		/* overflow area for the padding block */
	void *iv;
	int ivsize;
	int ret;
	int last_byte;

	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	crypto_blkcipher_setkey((void *)tfm, key, key_len);
	sg_init_table(sg_in, 1);
	sg_init_table(sg_out, 2);
	sg_set_buf(sg_in, src, src_len);
	sg_set_buf(&sg_out[0], dst, *dst_len);
	sg_set_buf(&sg_out[1], pad, sizeof(pad));

	iv = crypto_blkcipher_crt(tfm)->iv;
	ivsize = crypto_blkcipher_ivsize(tfm);

	memcpy(iv, aes_iv, ivsize);

	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
	crypto_free_blkcipher(tfm);
	if (ret < 0) {
		pr_err("ceph_aes_decrypt failed %d\n", ret);
		return ret;
	}

	/* the final plaintext byte is the pad length; it may have landed
	 * in dst or in the pad[] overflow area */
	if (src_len <= *dst_len)
		last_byte = ((char *)dst)[src_len - 1];
	else
		last_byte = pad[src_len - *dst_len - 1];
	if (last_byte <= 16 && src_len >= last_byte) {
		*dst_len = src_len - last_byte;	/* strip the padding */
	} else {
		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
		       last_byte, (int)src_len);
		return -EPERM; /* bad padding */
	}
	return 0;
}
239
/*
 * Like ceph_aes_decrypt, but splits the plaintext across two output
 * buffers: @dst1 is filled first (up to *dst1_len), the remainder goes
 * to @dst2.  Both lengths are in/out (capacity in, bytes written out).
 * Returns 0, a crypto errno, or -EPERM on malformed padding.
 */
int ceph_aes_decrypt2(const void *key, int key_len,
		      void *dst1, size_t *dst1_len,
		      void *dst2, size_t *dst2_len,
		      const void *src, size_t src_len)
{
	struct scatterlist sg_in[1], sg_out[3];
	struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
	struct blkcipher_desc desc = { .tfm = tfm };
	char pad[16];		/* overflow area for the padding block */
	void *iv;
	int ivsize;
	int ret;
	int last_byte;

	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	sg_init_table(sg_in, 1);
	sg_set_buf(sg_in, src, src_len);
	sg_init_table(sg_out, 3);
	sg_set_buf(&sg_out[0], dst1, *dst1_len);
	sg_set_buf(&sg_out[1], dst2, *dst2_len);
	sg_set_buf(&sg_out[2], pad, sizeof(pad));

	crypto_blkcipher_setkey((void *)tfm, key, key_len);
	iv = crypto_blkcipher_crt(tfm)->iv;
	ivsize = crypto_blkcipher_ivsize(tfm);

	memcpy(iv, aes_iv, ivsize);

	ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
	crypto_free_blkcipher(tfm);
	if (ret < 0) {
		pr_err("ceph_aes_decrypt failed %d\n", ret);
		return ret;
	}

	/* locate the pad-length byte, which may have landed in dst1,
	 * dst2, or the pad[] overflow area */
	if (src_len <= *dst1_len)
		last_byte = ((char *)dst1)[src_len - 1];
	else if (src_len <= *dst1_len + *dst2_len)
		last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
	else
		last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
	if (last_byte <= 16 && src_len >= last_byte) {
		src_len -= last_byte;	/* strip the padding */
	} else {
		pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
		       last_byte, (int)src_len);
		return -EPERM; /* bad padding */
	}

	/* apportion the unpadded plaintext between the two buffers */
	if (src_len < *dst1_len) {
		*dst1_len = src_len;
		*dst2_len = 0;
	} else {
		*dst2_len = src_len - *dst1_len;
	}

	return 0;
}
313
314
/*
 * Decrypt @src into @dst according to the secret's algorithm
 * (CEPH_CRYPTO_NONE is a plain bounded copy).  *dst_len is in/out:
 * capacity on entry, plaintext length on return.
 */
int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
		 const void *src, size_t src_len)
{
	switch (secret->type) {
	case CEPH_CRYPTO_NONE:
		if (*dst_len < src_len)
			return -ERANGE;
		memcpy(dst, src, src_len);
		*dst_len = src_len;
		return 0;

	case CEPH_CRYPTO_AES:
		return ceph_aes_decrypt(secret->key, secret->len, dst,
					dst_len, src, src_len);

	default:
		return -EINVAL;
	}
}

/*
 * As ceph_decrypt, but the plaintext is split across @dst1 and @dst2:
 * dst1 is filled to capacity first, then the remainder goes to dst2.
 * Both lengths are in/out.
 */
int ceph_decrypt2(struct ceph_crypto_key *secret,
		  void *dst1, size_t *dst1_len,
		  void *dst2, size_t *dst2_len,
		  const void *src, size_t src_len)
{
	size_t t;

	switch (secret->type) {
	case CEPH_CRYPTO_NONE:
		if (*dst1_len + *dst2_len < src_len)
			return -ERANGE;
		t = min(*dst1_len, src_len);
		memcpy(dst1, src, t);
		*dst1_len = t;
		src += t;
		src_len -= t;
		if (src_len) {
			t = min(*dst2_len, src_len);
			memcpy(dst2, src, t);
			*dst2_len = t;
		}
		return 0;

	case CEPH_CRYPTO_AES:
		return ceph_aes_decrypt2(secret->key, secret->len,
					 dst1, dst1_len, dst2, dst2_len,
					 src, src_len);

	default:
		return -EINVAL;
	}
}
367
368int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
369 const void *src, size_t src_len)
370{
371 switch (secret->type) {
372 case CEPH_CRYPTO_NONE:
373 if (*dst_len < src_len)
374 return -ERANGE;
375 memcpy(dst, src, src_len);
376 *dst_len = src_len;
377 return 0;
378
379 case CEPH_CRYPTO_AES:
380 return ceph_aes_encrypt(secret->key, secret->len, dst,
381 dst_len, src, src_len);
382
383 default:
384 return -EINVAL;
385 }
386}
387
388int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
389 const void *src1, size_t src1_len,
390 const void *src2, size_t src2_len)
391{
392 switch (secret->type) {
393 case CEPH_CRYPTO_NONE:
394 if (*dst_len < src1_len + src2_len)
395 return -ERANGE;
396 memcpy(dst, src1, src1_len);
397 memcpy(dst + src1_len, src2, src2_len);
398 *dst_len = src1_len + src2_len;
399 return 0;
400
401 case CEPH_CRYPTO_AES:
402 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
403 src1, src1_len, src2, src2_len);
404
405 default:
406 return -EINVAL;
407 }
408}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
/*
 * cryptographic secret
 */
struct ceph_crypto_key {
	int type;			/* CEPH_CRYPTO_* algorithm id */
	struct ceph_timespec created;	/* creation time, as decoded */
	int len;			/* length of *key in bytes */
	void *key;			/* raw key material (kmalloc'd) */
};

/* Release the key material; does not free the struct itself. */
static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
{
	kfree(key->key);
}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..e159f1415110
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,483 @@
1#include "ceph_debug.h"
2
3#include <linux/device.h>
4#include <linux/module.h>
5#include <linux/ctype.h>
6#include <linux/debugfs.h>
7#include <linux/seq_file.h>
8
9#include "super.h"
10#include "mds_client.h"
11#include "mon_client.h"
12#include "auth.h"
13
14#ifdef CONFIG_DEBUG_FS
15
16/*
17 * Implement /sys/kernel/debug/ceph fun
18 *
19 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
20 * .../osdmap - current osdmap
21 * .../mdsmap - current mdsmap
22 * .../monmap - current monmap
23 * .../osdc - active osd requests
24 * .../mdsc - active mds requests
25 * .../monc - mon client state
26 * .../dentry_lru - dump contents of dentry lru
27 * .../caps - expose cap (reservation) stats
28 * .../bdi - symlink to ../../bdi/something
29 */
30
31static struct dentry *ceph_debugfs_dir;
32
33static int monmap_show(struct seq_file *s, void *p)
34{
35 int i;
36 struct ceph_client *client = s->private;
37
38 if (client->monc.monmap == NULL)
39 return 0;
40
41 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
42 for (i = 0; i < client->monc.monmap->num_mon; i++) {
43 struct ceph_entity_inst *inst =
44 &client->monc.monmap->mon_inst[i];
45
46 seq_printf(s, "\t%s%lld\t%s\n",
47 ENTITY_NAME(inst->name),
48 pr_addr(&inst->addr.in_addr));
49 }
50 return 0;
51}
52
53static int mdsmap_show(struct seq_file *s, void *p)
54{
55 int i;
56 struct ceph_client *client = s->private;
57
58 if (client->mdsc.mdsmap == NULL)
59 return 0;
60 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
61 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
62 seq_printf(s, "session_timeout %d\n",
63 client->mdsc.mdsmap->m_session_timeout);
64 seq_printf(s, "session_autoclose %d\n",
65 client->mdsc.mdsmap->m_session_autoclose);
66 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
67 struct ceph_entity_addr *addr =
68 &client->mdsc.mdsmap->m_info[i].addr;
69 int state = client->mdsc.mdsmap->m_info[i].state;
70
71 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
72 ceph_mds_state_name(state));
73 }
74 return 0;
75}
76
/*
 * Dump the current osdmap: epoch, cluster flags, per-pool pg counts,
 * and one line per osd with address, weight and state.
 */
static int osdmap_show(struct seq_file *s, void *p)
{
	int i;
	struct ceph_client *client = s->private;
	struct rb_node *n;

	if (client->osdc.osdmap == NULL)
		return 0;
	seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
	seq_printf(s, "flags%s%s\n",
		   (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
		   " NEARFULL" : "",
		   (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
		   " FULL" : "");
	for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
		struct ceph_pg_pool_info *pool =
			rb_entry(n, struct ceph_pg_pool_info, node);
		seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
			   pool->id, pool->v.pg_num, pool->pg_num_mask,
			   pool->v.lpg_num, pool->lpg_num_mask);
	}
	for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
		struct ceph_entity_addr *addr =
			&client->osdc.osdmap->osd_addr[i];
		int state = client->osdc.osdmap->osd_state[i];
		char sb[64];

		/* osd_weight is scaled by 2^16 (the >>16 below), so
		 * weight*100>>16 yields an integer percentage */
		seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
			   i, pr_addr(&addr->in_addr),
			   ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
			   ceph_osdmap_state_str(sb, sizeof(sb), state));
	}
	return 0;
}
111
112static int monc_show(struct seq_file *s, void *p)
113{
114 struct ceph_client *client = s->private;
115 struct ceph_mon_statfs_request *req;
116 struct ceph_mon_client *monc = &client->monc;
117 struct rb_node *rp;
118
119 mutex_lock(&monc->mutex);
120
121 if (monc->have_mdsmap)
122 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
123 if (monc->have_osdmap)
124 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
125 if (monc->want_next_osdmap)
126 seq_printf(s, "want next osdmap\n");
127
128 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
129 req = rb_entry(rp, struct ceph_mon_statfs_request, node);
130 seq_printf(s, "%lld statfs\n", req->tid);
131 }
132
133 mutex_unlock(&monc->mutex);
134 return 0;
135}
136
/*
 * Dump in-flight MDS requests: one line per request showing tid,
 * target mds, opcode, and the inode and/or path(s) the op applies to.
 */
static int mdsc_show(struct seq_file *s, void *p)
{
	struct ceph_client *client = s->private;
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	struct rb_node *rp;
	int pathlen;
	u64 pathbase;
	char *path;

	mutex_lock(&mdsc->mutex);
	for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
		req = rb_entry(rp, struct ceph_mds_request, r_node);

		/* only print the target mds if a request message exists */
		if (req->r_request)
			seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
		else
			seq_printf(s, "%lld\t(no request)\t", req->r_tid);

		seq_printf(s, "%s", ceph_mds_op_name(req->r_op));

		if (req->r_got_unsafe)
			seq_printf(s, "\t(unsafe)");
		else
			seq_printf(s, "\t");

		if (req->r_inode) {
			seq_printf(s, " #%llx", ceph_ino(req->r_inode));
		} else if (req->r_dentry) {
			/* build_path allocates; kfree'd below.  d_lock is
			 * held while d_name is dereferenced for printing. */
			path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
						    &pathbase, 0);
			spin_lock(&req->r_dentry->d_lock);
			seq_printf(s, " #%llx/%.*s (%s)",
				   ceph_ino(req->r_dentry->d_parent->d_inode),
				   req->r_dentry->d_name.len,
				   req->r_dentry->d_name.name,
				   path ? path : "");
			spin_unlock(&req->r_dentry->d_lock);
			kfree(path);
		} else if (req->r_path1) {
			seq_printf(s, " #%llx/%s", req->r_ino1.ino,
				   req->r_path1);
		}

		/* second (old) dentry/path, e.g. for rename-style ops */
		if (req->r_old_dentry) {
			path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
						    &pathbase, 0);
			spin_lock(&req->r_old_dentry->d_lock);
			seq_printf(s, " #%llx/%.*s (%s)",
				   ceph_ino(req->r_old_dentry->d_parent->d_inode),
				   req->r_old_dentry->d_name.len,
				   req->r_old_dentry->d_name.name,
				   path ? path : "");
			spin_unlock(&req->r_old_dentry->d_lock);
			kfree(path);
		} else if (req->r_path2) {
			if (req->r_ino2.ino)
				seq_printf(s, " #%llx/%s", req->r_ino2.ino,
					   req->r_path2);
			else
				seq_printf(s, " %s", req->r_path2);
		}

		seq_printf(s, "\n");
	}
	mutex_unlock(&mdsc->mutex);

	return 0;
}
206
/*
 * Dump in-flight OSD requests: tid, target osd, placement group,
 * object name, reassert version, and the list of ops in the request.
 */
static int osdc_show(struct seq_file *s, void *pp)
{
	struct ceph_client *client = s->private;
	struct ceph_osd_client *osdc = &client->osdc;
	struct rb_node *p;

	mutex_lock(&osdc->request_mutex);
	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
		struct ceph_osd_request *req;
		struct ceph_osd_request_head *head;
		struct ceph_osd_op *op;
		int num_ops;
		int opcode, olen;
		int i;

		req = rb_entry(p, struct ceph_osd_request, r_node);

		/* osd -1 means the request is not mapped to an osd yet */
		seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
			   req->r_osd ? req->r_osd->o_osd : -1,
			   le32_to_cpu(req->r_pgid.pool),
			   le16_to_cpu(req->r_pgid.ps));

		/* message front layout: header, then the ops array,
		 * then the (unterminated) object name */
		head = req->r_request->front.iov_base;
		op = (void *)(head + 1);

		num_ops = le16_to_cpu(head->num_ops);
		olen = le32_to_cpu(head->object_len);
		seq_printf(s, "%.*s", olen,
			   (const char *)(head->ops + num_ops));

		if (req->r_reassert_version.epoch)
			seq_printf(s, "\t%u'%llu",
			   (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
			   le64_to_cpu(req->r_reassert_version.version));
		else
			seq_printf(s, "\t");

		for (i = 0; i < num_ops; i++) {
			opcode = le16_to_cpu(op->op);
			seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
			op++;
		}

		seq_printf(s, "\n");
	}
	mutex_unlock(&osdc->request_mutex);
	return 0;
}
255
256static int caps_show(struct seq_file *s, void *p)
257{
258 struct ceph_client *client = p;
259 int total, avail, used, reserved, min;
260
261 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
262 seq_printf(s, "total\t\t%d\n"
263 "avail\t\t%d\n"
264 "used\t\t%d\n"
265 "reserved\t%d\n"
266 "min\t%d\n",
267 total, avail, used, reserved, min);
268 return 0;
269}
270
271static int dentry_lru_show(struct seq_file *s, void *ptr)
272{
273 struct ceph_client *client = s->private;
274 struct ceph_mds_client *mdsc = &client->mdsc;
275 struct ceph_dentry_info *di;
276
277 spin_lock(&mdsc->dentry_lru_lock);
278 list_for_each_entry(di, &mdsc->dentry_lru, lru) {
279 struct dentry *dentry = di->dentry;
280 seq_printf(s, "%p %p\t%.*s\n",
281 di, dentry, dentry->d_name.len, dentry->d_name.name);
282 }
283 spin_unlock(&mdsc->dentry_lru_lock);
284
285 return 0;
286}
287
/*
 * Boilerplate generator: for a show function "name", define name_open()
 * -- which stashes the per-client pointer from inode->i_private into
 * seq_file->private (where the show functions read it) -- and a
 * matching name_fops file_operations table.
 */
#define DEFINE_SHOW_FUNC(name)						\
static int name##_open(struct inode *inode, struct file *file)		\
{									\
	struct seq_file *sf;						\
	int ret;							\
									\
	ret = single_open(file, name, NULL);				\
	sf = file->private_data;					\
	sf->private = inode->i_private;					\
	return ret;							\
}									\
									\
static const struct file_operations name##_fops = {			\
	.open		= name##_open,					\
	.read		= seq_read,					\
	.llseek		= seq_lseek,					\
	.release	= single_release,				\
};

DEFINE_SHOW_FUNC(monmap_show)
DEFINE_SHOW_FUNC(mdsmap_show)
DEFINE_SHOW_FUNC(osdmap_show)
DEFINE_SHOW_FUNC(monc_show)
DEFINE_SHOW_FUNC(mdsc_show)
DEFINE_SHOW_FUNC(osdc_show)
DEFINE_SHOW_FUNC(dentry_lru_show)
DEFINE_SHOW_FUNC(caps_show)
315
316static int congestion_kb_set(void *data, u64 val)
317{
318 struct ceph_client *client = (struct ceph_client *)data;
319
320 if (client)
321 client->mount_args->congestion_kb = (int)val;
322
323 return 0;
324}
325
/* debugfs read hook: report the writeback congestion threshold (in KB) */
static int congestion_kb_get(void *data, u64 *val)
{
	struct ceph_client *client = (struct ceph_client *)data;

	if (client)
		*val = (u64)client->mount_args->congestion_kb;

	return 0;
}


/* tie the get/set hooks into a simple read/write debugfs attribute */
DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
			congestion_kb_set, "%llu\n");
339
340int __init ceph_debugfs_init(void)
341{
342 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
343 if (!ceph_debugfs_dir)
344 return -ENOMEM;
345 return 0;
346}
347
/* Remove the top-level /sys/kernel/debug/ceph directory. */
void ceph_debugfs_cleanup(void)
{
	debugfs_remove(ceph_debugfs_dir);
}
352
353int ceph_debugfs_client_init(struct ceph_client *client)
354{
355 int ret = 0;
356 char name[80];
357
358 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
359 PR_FSID(&client->fsid), client->monc.auth->global_id);
360
361 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
362 if (!client->debugfs_dir)
363 goto out;
364
365 client->monc.debugfs_file = debugfs_create_file("monc",
366 0600,
367 client->debugfs_dir,
368 client,
369 &monc_show_fops);
370 if (!client->monc.debugfs_file)
371 goto out;
372
373 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
374 0600,
375 client->debugfs_dir,
376 client,
377 &mdsc_show_fops);
378 if (!client->mdsc.debugfs_file)
379 goto out;
380
381 client->osdc.debugfs_file = debugfs_create_file("osdc",
382 0600,
383 client->debugfs_dir,
384 client,
385 &osdc_show_fops);
386 if (!client->osdc.debugfs_file)
387 goto out;
388
389 client->debugfs_monmap = debugfs_create_file("monmap",
390 0600,
391 client->debugfs_dir,
392 client,
393 &monmap_show_fops);
394 if (!client->debugfs_monmap)
395 goto out;
396
397 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
398 0600,
399 client->debugfs_dir,
400 client,
401 &mdsmap_show_fops);
402 if (!client->debugfs_mdsmap)
403 goto out;
404
405 client->debugfs_osdmap = debugfs_create_file("osdmap",
406 0600,
407 client->debugfs_dir,
408 client,
409 &osdmap_show_fops);
410 if (!client->debugfs_osdmap)
411 goto out;
412
413 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
414 0600,
415 client->debugfs_dir,
416 client,
417 &dentry_lru_show_fops);
418 if (!client->debugfs_dentry_lru)
419 goto out;
420
421 client->debugfs_caps = debugfs_create_file("caps",
422 0400,
423 client->debugfs_dir,
424 client,
425 &caps_show_fops);
426 if (!client->debugfs_caps)
427 goto out;
428
429 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
430 0600,
431 client->debugfs_dir,
432 client,
433 &congestion_kb_fops);
434 if (!client->debugfs_congestion_kb)
435 goto out;
436
437 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
438 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
439 name);
440
441 return 0;
442
443out:
444 ceph_debugfs_client_cleanup(client);
445 return ret;
446}
447
/*
 * Tear down all per-client debugfs entries, files before their parent
 * directory.  Also called from ceph_debugfs_client_init()'s error path,
 * where some entries may be NULL (never created); debugfs_remove()
 * tolerates that per the debugfs API.
 */
void ceph_debugfs_client_cleanup(struct ceph_client *client)
{
	debugfs_remove(client->debugfs_bdi);
	debugfs_remove(client->debugfs_caps);
	debugfs_remove(client->debugfs_dentry_lru);
	debugfs_remove(client->debugfs_osdmap);
	debugfs_remove(client->debugfs_mdsmap);
	debugfs_remove(client->debugfs_monmap);
	debugfs_remove(client->osdc.debugfs_file);
	debugfs_remove(client->mdsc.debugfs_file);
	debugfs_remove(client->monc.debugfs_file);
	debugfs_remove(client->debugfs_congestion_kb);
	debugfs_remove(client->debugfs_dir);
}
462
463#else // CONFIG_DEBUG_FS
464
/* Stubs used when CONFIG_DEBUG_FS is disabled: all are no-ops that
 * report success, so callers need no conditional compilation. */

int __init ceph_debugfs_init(void)
{
	return 0;
}

void ceph_debugfs_cleanup(void)
{
}

int ceph_debugfs_client_init(struct ceph_client *client)
{
	return 0;
}

void ceph_debugfs_client_cleanup(struct ceph_client *client)
{
}
482
483#endif // CONFIG_DEBUG_FS
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
/*
 * Little-endian wire decoders: each reads one value at *p and advances
 * *p past it.  No bounds checking here -- callers use the *_safe
 * macros (or ceph_decode_need) for that.
 */
static inline u64 ceph_decode_64(void **p)
{
	u64 v = get_unaligned_le64(*p);
	*p += sizeof(u64);
	return v;
}
static inline u32 ceph_decode_32(void **p)
{
	u32 v = get_unaligned_le32(*p);
	*p += sizeof(u32);
	return v;
}
static inline u16 ceph_decode_16(void **p)
{
	u16 v = get_unaligned_le16(*p);
	*p += sizeof(u16);
	return v;
}
static inline u8 ceph_decode_8(void **p)
{
	u8 v = *(u8 *)*p;
	(*p)++;
	return v;
}
/* copy n raw bytes out of the buffer and advance *p */
static inline void ceph_decode_copy(void **p, void *pv, size_t n)
{
	memcpy(pv, *p, n);
	*p += n;
}
44
/*
 * bounds check input: branch to the "bad" label if fewer than n bytes
 * remain between *p and end.
 */
#define ceph_decode_need(p, end, n, bad)			\
	do {							\
		if (unlikely(*(p) + (n) > (end)))		\
			goto bad;				\
	} while (0)

/* bounds-checked decode: goto bad on underrun, else decode into v */
#define ceph_decode_64_safe(p, end, v, bad)			\
	do {							\
		ceph_decode_need(p, end, sizeof(u64), bad);	\
		v = ceph_decode_64(p);				\
	} while (0)
#define ceph_decode_32_safe(p, end, v, bad)			\
	do {							\
		ceph_decode_need(p, end, sizeof(u32), bad);	\
		v = ceph_decode_32(p);				\
	} while (0)
#define ceph_decode_16_safe(p, end, v, bad)			\
	do {							\
		ceph_decode_need(p, end, sizeof(u16), bad);	\
		v = ceph_decode_16(p);				\
	} while (0)
#define ceph_decode_8_safe(p, end, v, bad)			\
	do {							\
		ceph_decode_need(p, end, sizeof(u8), bad);	\
		v = ceph_decode_8(p);				\
	} while (0)

/* bounds-checked raw copy of n bytes into pv */
#define ceph_decode_copy_safe(p, end, pv, n, bad)		\
	do {							\
		ceph_decode_need(p, end, n, bad);		\
		ceph_decode_copy(p, pv, n);			\
	} while (0)
80
/*
 * struct ceph_timespec (little-endian, 32-bit fields) <-> struct timespec
 */
static inline void ceph_decode_timespec(struct timespec *ts,
					const struct ceph_timespec *tv)
{
	ts->tv_sec = le32_to_cpu(tv->tv_sec);
	ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
}
static inline void ceph_encode_timespec(struct ceph_timespec *tv,
					const struct timespec *ts)
{
	tv->tv_sec = cpu_to_le32(ts->tv_sec);
	tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
}
96
/*
 * sockaddr_storage <-> ceph_sockaddr: only the address family field is
 * byte-swapped (the wire format keeps it big-endian).
 */
static inline void ceph_encode_addr(struct ceph_entity_addr *a)
{
	a->in_addr.ss_family = htons(a->in_addr.ss_family);
}
static inline void ceph_decode_addr(struct ceph_entity_addr *a)
{
	a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
	/* 512 == htons(2): on a little-endian host this is what AF_INET
	 * becomes if the family was swapped one time too many, so it
	 * flags a double-swap/never-swapped address */
	WARN_ON(a->in_addr.ss_family == 512);
}
109
/*
 * encoders: write v at *p in little-endian form and advance *p.
 * No bounds checking here; see the *_safe variants below.
 */
static inline void ceph_encode_64(void **p, u64 v)
{
	put_unaligned_le64(v, (__le64 *)*p);
	*p += sizeof(u64);
}
static inline void ceph_encode_32(void **p, u32 v)
{
	put_unaligned_le32(v, (__le32 *)*p);
	*p += sizeof(u32);
}
static inline void ceph_encode_16(void **p, u16 v)
{
	put_unaligned_le16(v, (__le16 *)*p);
	*p += sizeof(u16);
}
static inline void ceph_encode_8(void **p, u8 v)
{
	*(u8 *)*p = v;
	(*p)++;
}
/* append len raw bytes and advance *p */
static inline void ceph_encode_copy(void **p, const void *s, int len)
{
	memcpy(*p, s, len);
	*p += len;
}
138
/*
 * filepath, string encoders.  Strings go out as a u32 length followed
 * by the (unterminated) bytes; a NULL path encodes as length 0.
 */
static inline void ceph_encode_filepath(void **p, void *end,
					u64 ino, const char *path)
{
	u32 len = path ? strlen(path) : 0;
	/* caller must have reserved enough room; overrun is a hard bug */
	BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
	ceph_encode_8(p, 1);	/* presumably a struct version byte -- confirm */
	ceph_encode_64(p, ino);
	ceph_encode_32(p, len);
	if (len)
		memcpy(*p, path, len);
	*p += len;
}

static inline void ceph_encode_string(void **p, void *end,
				      const char *s, u32 len)
{
	BUG_ON(*p + sizeof(len) + len > end);
	ceph_encode_32(p, len);
	if (len)
		memcpy(*p, s, len);
	*p += len;
}
164
/* bounds check output: branch to "bad" if fewer than n bytes remain */
#define ceph_encode_need(p, end, n, bad)			\
	do {							\
		if (unlikely(*(p) + (n) > (end)))		\
			goto bad;				\
	} while (0)

/* bounds-checked encode: goto bad on overrun, else encode v */
#define ceph_encode_64_safe(p, end, v, bad)			\
	do {							\
		ceph_encode_need(p, end, sizeof(u64), bad);	\
		ceph_encode_64(p, v);				\
	} while (0)
#define ceph_encode_32_safe(p, end, v, bad)			\
	do {							\
		ceph_encode_need(p, end, sizeof(u32), bad);	\
		ceph_encode_32(p, v);				\
	} while (0)
#define ceph_encode_16_safe(p, end, v, bad)			\
	do {							\
		ceph_encode_need(p, end, sizeof(u16), bad);	\
		ceph_encode_16(p, v);				\
	} while (0)

/* bounds-checked raw copy of n bytes from pv */
#define ceph_encode_copy_safe(p, end, pv, n, bad)		\
	do {							\
		ceph_encode_need(p, end, n, bad);		\
		ceph_encode_copy(p, pv, n);			\
	} while (0)
192
193
194#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..5107384ee029
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1220 @@
1#include "ceph_debug.h"
2
3#include <linux/spinlock.h>
4#include <linux/fs_struct.h>
5#include <linux/namei.h>
6#include <linux/sched.h>
7
8#include "super.h"
9
10/*
11 * Directory operations: readdir, lookup, create, link, unlink,
12 * rename, etc.
13 */
14
15/*
16 * Ceph MDS operations are specified in terms of a base ino and
17 * relative path. Thus, the client can specify an operation on a
18 * specific inode (e.g., a getattr due to fstat(2)), or as a path
19 * relative to, say, the root directory.
20 *
21 * Normally, we limit ourselves to strict inode ops (no path component)
22 * or dentry operations (a single path component relative to an ino). The
23 * exception to this is open_root_dentry(), which will open the mount
24 * point by name.
25 */
26
/* operation tables for directory inodes/files/dentries; tentative
 * definitions here -- the initializers live elsewhere in this file
 * (beyond this excerpt) */
const struct inode_operations ceph_dir_iops;
const struct file_operations ceph_dir_fops;
struct dentry_operations ceph_dentry_ops;
30
31/*
32 * Initialize ceph dentry state.
33 */
34int ceph_init_dentry(struct dentry *dentry)
35{
36 struct ceph_dentry_info *di;
37
38 if (dentry->d_fsdata)
39 return 0;
40
41 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
42 dentry->d_op = &ceph_dentry_ops;
43 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
44 dentry->d_op = &ceph_snapdir_dentry_ops;
45 else
46 dentry->d_op = &ceph_snap_dentry_ops;
47
48 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
49 if (!di)
50 return -ENOMEM; /* oh well */
51
52 spin_lock(&dentry->d_lock);
53 if (dentry->d_fsdata) /* lost a race */
54 goto out_unlock;
55 di->dentry = dentry;
56 di->lease_session = NULL;
57 dentry->d_fsdata = di;
58 dentry->d_time = jiffies;
59 ceph_dentry_lru_add(dentry);
60out_unlock:
61 spin_unlock(&dentry->d_lock);
62 return 0;
63}
64
65
66
/*
 * for readdir, we encode the directory frag and offset within that
 * frag into f_pos: the high 32 bits hold the frag, the low 32 bits
 * the offset within it.
 */
static unsigned fpos_frag(loff_t p)
{
	return p >> 32;
}
static unsigned fpos_off(loff_t p)
{
	return p & 0xffffffff;
}
79
/*
 * When possible, we try to satisfy a readdir by peeking at the
 * dcache.  We make this work by carefully ordering dentries on
 * d_u.d_child when we initially get results back from the MDS, and
 * falling back to a "normal" sync readdir if any dentries in the dir
 * are dropped.
 *
 * I_COMPLETE tells indicates we have all dentries in the dir.  It is
 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
 * the MDS if/when the directory is modified).
 *
 * Called with inode->i_lock held (by ceph_readdir); that lock is
 * dropped and retaken around each filldir callback.  Returns 0/filldir
 * status, or -EAGAIN to signal the caller to fall back to a sync
 * readdir.
 */
static int __dcache_readdir(struct file *filp,
			    void *dirent, filldir_t filldir)
{
	struct inode *inode = filp->f_dentry->d_inode;
	struct ceph_file_info *fi = filp->private_data;
	struct dentry *parent = filp->f_dentry;
	struct inode *dir = parent->d_inode;
	struct list_head *p;
	struct dentry *dentry, *last;
	struct ceph_dentry_info *di;
	int err = 0;

	/* claim ref on last dentry we returned */
	last = fi->dentry;
	fi->dentry = NULL;

	dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
	     last);

	spin_lock(&dcache_lock);

	/* start at beginning? */
	if (filp->f_pos == 2 || (last &&
				 filp->f_pos < ceph_dentry(last)->offset)) {
		if (list_empty(&parent->d_subdirs))
			goto out_unlock;
		p = parent->d_subdirs.prev;
		dout(" initial p %p/%p\n", p->prev, p->next);
	} else {
		p = last->d_u.d_child.prev;
	}

more:
	dentry = list_entry(p, struct dentry, d_u.d_child);
	di = ceph_dentry(dentry);
	/* walk backwards through d_subdirs until we find a live,
	 * non-snapdir dentry at or past f_pos */
	while (1) {
		dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
		     parent->d_subdirs.prev, parent->d_subdirs.next);
		if (p == &parent->d_subdirs) {
			fi->at_end = 1;
			goto out_unlock;
		}
		if (!d_unhashed(dentry) && dentry->d_inode &&
		    ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
		    ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
		    filp->f_pos <= di->offset)
			break;
		dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
		     dentry->d_name.len, dentry->d_name.name, di->offset,
		     filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
		     !dentry->d_inode ? " null" : "");
		p = p->prev;
		dentry = list_entry(p, struct dentry, d_u.d_child);
		di = ceph_dentry(dentry);
	}

	/* pin the dentry, then drop both locks across filldir (which
	 * may sleep / copy to userspace) */
	atomic_inc(&dentry->d_count);
	spin_unlock(&dcache_lock);
	spin_unlock(&inode->i_lock);

	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
	filp->f_pos = di->offset;
	err = filldir(dirent, dentry->d_name.name,
		      dentry->d_name.len, di->offset,
		      dentry->d_inode->i_ino,
		      dentry->d_inode->i_mode >> 12);

	if (last) {
		if (err < 0) {
			/* remember our position */
			fi->dentry = last;
			fi->next_offset = di->offset;
		} else {
			dput(last);
		}
		last = NULL;
	}

	spin_lock(&inode->i_lock);
	spin_lock(&dcache_lock);

	if (err < 0)
		goto out_unlock;

	last = dentry;

	p = p->prev;
	filp->f_pos++;

	/* make sure a dentry wasn't dropped while we didn't have dcache_lock */
	if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
		goto more;
	dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
	err = -EAGAIN;

out_unlock:
	spin_unlock(&dcache_lock);

	/* drop our ref outside of dcache_lock (dput can sleep);
	 * i_lock is released around it and retaken for the caller */
	if (last) {
		spin_unlock(&inode->i_lock);
		dput(last);
		spin_lock(&inode->i_lock);
	}

	return err;
}
198
/*
 * make note of the last dentry we read, so we can
 * continue at the same lexicographical point,
 * regardless of what dir changes take place on the
 * server.
 *
 * Returns 0 or -ENOMEM.  On allocation failure fi->last_name is left
 * NULL (the old name has already been freed at that point).
 */
static int note_last_dentry(struct ceph_file_info *fi, const char *name,
			    int len)
{
	kfree(fi->last_name);
	fi->last_name = kmalloc(len+1, GFP_NOFS);
	if (!fi->last_name)
		return -ENOMEM;
	memcpy(fi->last_name, name, len);
	fi->last_name[len] = 0;	/* NUL-terminate; input is not terminated */
	dout("note_last_dentry '%s'\n", fi->last_name);
	return 0;
}
217
/*
 * Main readdir entry point.  Emits "." and ".." first, then tries a
 * dcache-only readdir (if the dir is known complete), and otherwise
 * fetches directory chunks from the MDS one frag at a time, resuming
 * after fi->last_name within each frag.
 */
static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	struct ceph_file_info *fi = filp->private_data;
	struct inode *inode = filp->f_dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *client = ceph_inode_to_client(inode);
	struct ceph_mds_client *mdsc = &client->mdsc;
	unsigned frag = fpos_frag(filp->f_pos);
	int off = fpos_off(filp->f_pos);
	int err;
	u32 ftype;
	struct ceph_mds_reply_info_parsed *rinfo;
	const int max_entries = client->mount_args->max_readdir;

	dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
	if (fi->at_end)
		return 0;

	/* always start with . and .. */
	if (filp->f_pos == 0) {
		/* note dir version at start of readdir so we can tell
		 * if any dentries get dropped */
		fi->dir_release_count = ci->i_release_count;

		dout("readdir off 0 -> '.'\n");
		if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
			    inode->i_ino, inode->i_mode >> 12) < 0)
			return 0;
		filp->f_pos = 1;
		off = 1;
	}
	if (filp->f_pos == 1) {
		dout("readdir off 1 -> '..'\n");
		if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
			    filp->f_dentry->d_parent->d_inode->i_ino,
			    inode->i_mode >> 12) < 0)
			return 0;
		filp->f_pos = 2;
		off = 2;
	}

	/* can we use the dcache?  only if the dir is complete and we
	 * hold the FILE_SHARED cap (and async readdir isn't disabled) */
	spin_lock(&inode->i_lock);
	if ((filp->f_pos == 2 || fi->dentry) &&
	    !ceph_test_opt(client, NOASYNCREADDIR) &&
	    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
	    __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
		err = __dcache_readdir(filp, dirent, filldir);
		if (err != -EAGAIN) {
			spin_unlock(&inode->i_lock);
			return err;
		}
	}
	spin_unlock(&inode->i_lock);
	/* dcache attempt failed: remember where it stopped so the MDS
	 * readdir resumes from the same name */
	if (fi->dentry) {
		err = note_last_dentry(fi, fi->dentry->d_name.name,
				       fi->dentry->d_name.len);
		if (err)
			return err;
		dput(fi->dentry);
		fi->dentry = NULL;
	}

	/* proceed with a normal readdir */

more:
	/* do we have the correct frag content buffered? */
	if (fi->frag != frag || fi->last_readdir == NULL) {
		struct ceph_mds_request *req;
		int op = ceph_snap(inode) == CEPH_SNAPDIR ?
			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;

		/* discard old result, if any */
		if (fi->last_readdir)
			ceph_mdsc_put_request(fi->last_readdir);

		/* requery frag tree, as the frag topology may have changed */
		frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);

		dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
		     ceph_vinop(inode), frag, fi->last_name);
		req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
		if (IS_ERR(req))
			return PTR_ERR(req);
		req->r_inode = igrab(inode);
		req->r_dentry = dget(filp->f_dentry);
		/* hints to request -> mds selection code */
		req->r_direct_mode = USE_AUTH_MDS;
		req->r_direct_hash = ceph_frag_value(frag);
		req->r_direct_is_hash = true;
		req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
		req->r_readdir_offset = fi->next_offset;
		req->r_args.readdir.frag = cpu_to_le32(frag);
		req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
		req->r_num_caps = max_entries;
		err = ceph_mdsc_do_request(mdsc, NULL, req);
		if (err < 0) {
			ceph_mdsc_put_request(req);
			return err;
		}
		dout("readdir got and parsed readdir result=%d"
		     " on frag %x, end=%d, complete=%d\n", err, frag,
		     (int)req->r_reply_info.dir_end,
		     (int)req->r_reply_info.dir_complete);

		if (!req->r_did_prepopulate) {
			dout("readdir !did_prepopulate");
			fi->dir_release_count--; /* preclude I_COMPLETE */
		}

		/* note next offset and last dentry name */
		fi->offset = fi->next_offset;
		fi->last_readdir = req;

		if (req->r_reply_info.dir_end) {
			/* last chunk of this frag: no resume name needed */
			kfree(fi->last_name);
			fi->last_name = NULL;
			fi->next_offset = 0;
		} else {
			/* remember last name so the next chunk can resume */
			rinfo = &req->r_reply_info;
			err = note_last_dentry(fi,
				       rinfo->dir_dname[rinfo->dir_nr-1],
				       rinfo->dir_dname_len[rinfo->dir_nr-1]);
			if (err)
				return err;
			fi->next_offset += rinfo->dir_nr;
		}
	}

	/* feed the buffered chunk to filldir */
	rinfo = &fi->last_readdir->r_reply_info;
	dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
	     rinfo->dir_nr, off, fi->offset);
	while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
		u64 pos = ceph_make_fpos(frag, off);
		struct ceph_mds_reply_inode *in =
			rinfo->dir_in[off - fi->offset].in;
		dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
		     off, off - fi->offset, rinfo->dir_nr, pos,
		     rinfo->dir_dname_len[off - fi->offset],
		     rinfo->dir_dname[off - fi->offset], in);
		BUG_ON(!in);
		ftype = le32_to_cpu(in->mode) >> 12;
		if (filldir(dirent,
			    rinfo->dir_dname[off - fi->offset],
			    rinfo->dir_dname_len[off - fi->offset],
			    pos,
			    le64_to_cpu(in->ino),
			    ftype) < 0) {
			dout("filldir stopping us...\n");
			return 0;
		}
		off++;
		filp->f_pos = pos + 1;
	}

	/* last_name set means this frag has more chunks to fetch */
	if (fi->last_name) {
		ceph_mdsc_put_request(fi->last_readdir);
		fi->last_readdir = NULL;
		goto more;
	}

	/* more frags? */
	if (!ceph_frag_is_rightmost(frag)) {
		frag = ceph_frag_next(frag);
		off = 0;
		filp->f_pos = ceph_make_fpos(frag, off);
		dout("readdir next frag is %x\n", frag);
		goto more;
	}
	fi->at_end = 1;

	/*
	 * if dir_release_count still matches the dir, no dentries
	 * were released during the whole readdir, and we should have
	 * the complete dir contents in our cache.
	 */
	spin_lock(&inode->i_lock);
	if (ci->i_release_count == fi->dir_release_count) {
		dout(" marking %p complete\n", inode);
		ci->i_ceph_flags |= CEPH_I_COMPLETE;
		ci->i_max_offset = filp->f_pos;
	}
	spin_unlock(&inode->i_lock);

	dout("readdir %p filp %p done.\n", inode, filp);
	return 0;
}
405
406static void reset_readdir(struct ceph_file_info *fi)
407{
408 if (fi->last_readdir) {
409 ceph_mdsc_put_request(fi->last_readdir);
410 fi->last_readdir = NULL;
411 }
412 kfree(fi->last_name);
413 fi->next_offset = 2; /* compensate for . and .. */
414 if (fi->dentry) {
415 dput(fi->dentry);
416 fi->dentry = NULL;
417 }
418 fi->at_end = 0;
419}
420
/*
 * Seek within a directory.  f_pos encodes (frag, offset); seeking may
 * invalidate the buffered readdir chunk (see reset_readdir).
 */
static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
{
	struct ceph_file_info *fi = file->private_data;
	struct inode *inode = file->f_mapping->host;
	loff_t old_offset = offset;
	loff_t retval;

	mutex_lock(&inode->i_mutex);
	switch (origin) {
	case SEEK_END:
		offset += inode->i_size + 2;   /* FIXME */
		break;
	case SEEK_CUR:
		offset += file->f_pos;
	}
	retval = -EINVAL;
	if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
		if (offset != file->f_pos) {
			file->f_pos = offset;
			file->f_version = 0;
			fi->at_end = 0;
		}
		retval = offset;

		/*
		 * discard buffered readdir content on seekdir(0), or
		 * seek to new frag, or seek prior to current chunk.
		 */
		if (offset == 0 ||
		    fpos_frag(offset) != fpos_frag(old_offset) ||
		    fpos_off(offset) < fi->offset) {
			dout("dir_llseek dropping %p content\n", file);
			reset_readdir(fi);
		}

		/* bump dir_release_count if we did a forward seek
		 * (prevents readdir from marking the dir I_COMPLETE,
		 * since entries were skipped) */
		if (offset > old_offset)
			fi->dir_release_count--;
	}
	mutex_unlock(&inode->i_mutex);
	return retval;
}
463
/*
 * Process result of a lookup/open request.
 *
 * Mainly, make sure we return the final req->r_dentry (if it already
 * existed) in place of the original VFS-provided dentry when they
 * differ.
 *
 * Gracefully handle the case where the MDS replies with -ENOENT and
 * no trace (which it may do, at its discretion, e.g., if it doesn't
 * care to issue a lease on the negative dentry).
 *
 * Returns: NULL when the passed-in dentry is the final one, the
 * spliced dentry (referenced) when the MDS substituted another, or
 * ERR_PTR(err) on failure.
 */
struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
				  struct dentry *dentry, int err)
{
	struct ceph_client *client = ceph_client(dentry->d_sb);
	struct inode *parent = dentry->d_parent->d_inode;

	/* .snap dir?  synthesize the snapdir inode locally */
	if (err == -ENOENT &&
	    ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
	    strcmp(dentry->d_name.name,
		   client->mount_args->snapdir_name) == 0) {
		struct inode *inode = ceph_get_snapdir(parent);
		dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
		     dentry, dentry->d_name.len, dentry->d_name.name, inode);
		d_add(dentry, inode);
		err = 0;
	}

	if (err == -ENOENT) {
		/* no trace? */
		err = 0;
		if (!req->r_reply_info.head->is_dentry) {
			dout("ENOENT and no trace, dentry %p inode %p\n",
			     dentry, dentry->d_inode);
			if (dentry->d_inode) {
				/* stale positive dentry: drop it */
				d_drop(dentry);
				err = -ENOENT;
			} else {
				/* instantiate as a negative dentry */
				d_add(dentry, NULL);
			}
		}
	}
	if (err)
		dentry = ERR_PTR(err);
	else if (dentry != req->r_dentry)
		dentry = dget(req->r_dentry); /* we got spliced */
	else
		dentry = NULL;
	return dentry;
}
515
516static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
517{
518 return ceph_ino(inode) == CEPH_INO_ROOT &&
519 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
520}
521
/*
 * Look up a single dir entry.  If there is a lookup intent, inform
 * the MDS so that it gets our 'caps wanted' value in a single op.
 *
 * Returns NULL if the provided dentry was used (possibly hashed
 * negative), a spliced dentry reference, or ERR_PTR on failure.
 */
static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
				  struct nameidata *nd)
{
	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	int op;
	int err;

	dout("lookup %p dentry %p '%.*s'\n",
	     dir, dentry, dentry->d_name.len, dentry->d_name.name);

	if (dentry->d_name.len > NAME_MAX)
		return ERR_PTR(-ENAMETOOLONG);

	err = ceph_init_dentry(dentry);
	if (err < 0)
		return ERR_PTR(err);

	/* open (but not create!) intent? */
	if (nd &&
	    (nd->flags & LOOKUP_OPEN) &&
	    (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
	    !(nd->intent.open.flags & O_CREAT)) {
		int mode = nd->intent.open.create_mode & ~current->fs->umask;
		return ceph_lookup_open(dir, dentry, nd, mode, 1);
	}

	/*
	 * can we conclude ENOENT locally?  Only if we hold the complete
	 * directory contents (CEPH_I_COMPLETE) under a valid FILE_SHARED
	 * cap, and the name is neither the snapdir nor the magic root
	 * ".ceph" entry.
	 */
	if (dentry->d_inode == NULL) {
		struct ceph_inode_info *ci = ceph_inode(dir);
		struct ceph_dentry_info *di = ceph_dentry(dentry);

		spin_lock(&dir->i_lock);
		dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
		if (strncmp(dentry->d_name.name,
			    client->mount_args->snapdir_name,
			    dentry->d_name.len) &&
		    !is_root_ceph_dentry(dir, dentry) &&
		    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
		    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
			di->offset = ci->i_max_offset++;
			spin_unlock(&dir->i_lock);
			dout(" dir %p complete, -ENOENT\n", dir);
			d_add(dentry, NULL);	/* hashed negative dentry */
			di->lease_shared_gen = ci->i_shared_gen;
			return NULL;
		}
		spin_unlock(&dir->i_lock);
	}

	op = ceph_snap(dir) == CEPH_SNAPDIR ?
		CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
	req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
	if (IS_ERR(req))
		return ERR_PTR(PTR_ERR(req));
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	/* we only need inode linkage */
	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
	req->r_locked_dir = dir;
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	dentry = ceph_finish_lookup(req, dentry, err);
	ceph_mdsc_put_request(req);  /* will dput(dentry) */
	dout("lookup result=%p\n", dentry);
	return dentry;
}
593
/*
 * If we do a create but get no trace back from the MDS, follow up with
 * a lookup (the VFS expects us to link up the provided dentry).
 *
 * Returns 0 on success.  A NULL lookup result means our dentry was
 * used in place (already hooked up); since PTR_ERR(NULL) == 0, the
 * final return handles that case as success too.
 */
int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
{
	struct dentry *result = ceph_lookup(dir, dentry, NULL);

	if (result && !IS_ERR(result)) {
		/*
		 * We created the item, then did a lookup, and found
		 * it was already linked to another inode we already
		 * had in our cache (and thus got spliced).  Link our
		 * dentry to that inode, but don't hash it, just in
		 * case the VFS wants to dereference it.
		 */
		BUG_ON(!result->d_inode);
		d_instantiate(dentry, result->d_inode);
		return 0;
	}
	return PTR_ERR(result);
}
616
/*
 * Create a special file (device node, fifo, socket) via an MDS MKNOD
 * op.  Also the fallback used by ceph_create() for regular files when
 * there is no open intent.
 */
static int ceph_mknod(struct inode *dir, struct dentry *dentry,
		      int mode, dev_t rdev)
{
	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;	/* snapshots are read-only */

	dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
	     dir, dentry, mode, rdev);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		d_drop(dentry);
		return PTR_ERR(req);
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_locked_dir = dir;
	req->r_args.mknod.mode = cpu_to_le32(mode);
	req->r_args.mknod.rdev = cpu_to_le32(rdev);
	/* invalidate others' dentry leases unless they hold FILE_EXCL */
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
	if (err)
		d_drop(dentry);
	return err;
}
650
651static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
652 struct nameidata *nd)
653{
654 dout("create in dir %p dentry %p name '%.*s'\n",
655 dir, dentry, dentry->d_name.len, dentry->d_name.name);
656
657 if (ceph_snap(dir) != CEPH_NOSNAP)
658 return -EROFS;
659
660 if (nd) {
661 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
662 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
663 /* hrm, what should i do here if we get aliased? */
664 if (IS_ERR(dentry))
665 return PTR_ERR(dentry);
666 return 0;
667 }
668
669 /* fall back to mknod */
670 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
671}
672
673static int ceph_symlink(struct inode *dir, struct dentry *dentry,
674 const char *dest)
675{
676 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
677 struct ceph_mds_client *mdsc = &client->mdsc;
678 struct ceph_mds_request *req;
679 int err;
680
681 if (ceph_snap(dir) != CEPH_NOSNAP)
682 return -EROFS;
683
684 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
685 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
686 if (IS_ERR(req)) {
687 d_drop(dentry);
688 return PTR_ERR(req);
689 }
690 req->r_dentry = dget(dentry);
691 req->r_num_caps = 2;
692 req->r_path2 = kstrdup(dest, GFP_NOFS);
693 req->r_locked_dir = dir;
694 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
695 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
696 err = ceph_mdsc_do_request(mdsc, dir, req);
697 if (!err && !req->r_reply_info.head->is_dentry)
698 err = ceph_handle_notrace_create(dir, dentry);
699 ceph_mdsc_put_request(req);
700 if (err)
701 d_drop(dentry);
702 return err;
703}
704
/*
 * mkdir: either a regular MKDIR, or -- inside the virtual .snap
 * directory -- a MKSNAP (snapshot creation).
 */
static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	int err = -EROFS;
	int op;

	if (ceph_snap(dir) == CEPH_SNAPDIR) {
		/* mkdir .snap/foo is a MKSNAP */
		op = CEPH_MDS_OP_MKSNAP;
		dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
		     dentry->d_name.len, dentry->d_name.name, dentry);
	} else if (ceph_snap(dir) == CEPH_NOSNAP) {
		dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
		op = CEPH_MDS_OP_MKDIR;
	} else {
		/* anywhere else under a snapshot: read-only */
		goto out;
	}
	req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}

	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_locked_dir = dir;
	req->r_args.mkdir.mode = cpu_to_le32(mode);
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (!err && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	ceph_mdsc_put_request(req);
out:
	if (err < 0)
		d_drop(dentry);
	return err;
}
745
/*
 * Create a hard link via an MDS LINK op.  If the reply carries no
 * trace, instantiate the new dentry from the source inode ourselves.
 */
static int ceph_link(struct dentry *old_dentry, struct inode *dir,
		     struct dentry *dentry)
{
	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(dir) != CEPH_NOSNAP)
		return -EROFS;	/* snapshots are read-only */

	dout("link in dir %p old_dentry %p dentry %p\n", dir,
	     old_dentry, dentry);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
	if (IS_ERR(req)) {
		d_drop(dentry);
		return PTR_ERR(req);
	}
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
	req->r_locked_dir = dir;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	err = ceph_mdsc_do_request(mdsc, dir, req);
	if (err)
		d_drop(dentry);
	else if (!req->r_reply_info.head->is_dentry)
		/* no trace: hook the dentry up to the source inode */
		d_instantiate(dentry, igrab(old_dentry->d_inode));
	ceph_mdsc_put_request(req);
	return err;
}
778
/*
 * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps.  If it
 * looks like the link count will hit 0, drop any other caps (other
 * than PIN) we don't specifically want (due to the file still being
 * open).
 *
 * NOTE(review): the code actually drops LINK_SHARED|LINK_EXCL; the
 * "AUTH_RDCACHE" wording above looks stale -- confirm against the cap
 * names in ceph_fs.h.
 *
 * Returns the cap mask to release along with the unlink request.
 */
static int drop_caps_for_unlink(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

	spin_lock(&inode->i_lock);
	if (inode->i_nlink == 1) {
		/* last link: give back everything we don't want or pin */
		drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
		/* don't delay the cap release; the inode is going away */
		ci->i_ceph_flags |= CEPH_I_NODELAY;
	}
	spin_unlock(&inode->i_lock);
	return drop;
}
798
799/*
800 * rmdir and unlink are differ only by the metadata op code
801 */
802static int ceph_unlink(struct inode *dir, struct dentry *dentry)
803{
804 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
805 struct ceph_mds_client *mdsc = &client->mdsc;
806 struct inode *inode = dentry->d_inode;
807 struct ceph_mds_request *req;
808 int err = -EROFS;
809 int op;
810
811 if (ceph_snap(dir) == CEPH_SNAPDIR) {
812 /* rmdir .snap/foo is RMSNAP */
813 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
814 dentry->d_name.name, dentry);
815 op = CEPH_MDS_OP_RMSNAP;
816 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
817 dout("unlink/rmdir dir %p dn %p inode %p\n",
818 dir, dentry, inode);
819 op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
820 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
821 } else
822 goto out;
823 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
824 if (IS_ERR(req)) {
825 err = PTR_ERR(req);
826 goto out;
827 }
828 req->r_dentry = dget(dentry);
829 req->r_num_caps = 2;
830 req->r_locked_dir = dir;
831 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
832 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
833 req->r_inode_drop = drop_caps_for_unlink(inode);
834 err = ceph_mdsc_do_request(mdsc, dir, req);
835 if (!err && !req->r_reply_info.head->is_dentry)
836 d_delete(dentry);
837 ceph_mdsc_put_request(req);
838out:
839 return err;
840}
841
/*
 * Rename via an MDS RENAME op.  Both directories must share the same
 * (non-snapshot) snap context.
 */
static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
		       struct inode *new_dir, struct dentry *new_dentry)
{
	struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	int err;

	if (ceph_snap(old_dir) != ceph_snap(new_dir))
		return -EXDEV;		/* can't cross snap contexts */
	if (ceph_snap(old_dir) != CEPH_NOSNAP ||
	    ceph_snap(new_dir) != CEPH_NOSNAP)
		return -EROFS;		/* snapshots are read-only */
	dout("rename dir %p dentry %p to dir %p dentry %p\n",
	     old_dir, old_dentry, new_dir, new_dentry);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_dentry = dget(new_dentry);
	req->r_num_caps = 2;
	req->r_old_dentry = dget(old_dentry);
	req->r_locked_dir = new_dir;
	req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	/* release LINK_RDCACHE on source inode (mds will lock it) */
	req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
	if (new_dentry->d_inode)
		/* the target is being clobbered, as with unlink */
		req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
	err = ceph_mdsc_do_request(mdsc, old_dir, req);
	if (!err && !req->r_reply_info.head->is_dentry) {
		/*
		 * Normally d_move() is done by fill_trace (called by
		 * do_request, above).  If there is no trace, we need
		 * to do it here.
		 */
		d_move(old_dentry, new_dentry);
	}
	ceph_mdsc_put_request(req);
	return err;
}
884
885
/*
 * Check if dentry lease is valid.  If not, delete the lease.  Try to
 * renew if the lease is more than half up.
 *
 * Returns 1 if the cached dentry can be trusted, else 0.
 */
static int dentry_lease_is_valid(struct dentry *dentry)
{
	struct ceph_dentry_info *di;
	struct ceph_mds_session *s;
	int valid = 0;
	u32 gen;
	unsigned long ttl;
	struct ceph_mds_session *session = NULL;
	struct inode *dir = NULL;
	u32 seq = 0;

	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	if (di && di->lease_session) {
		s = di->lease_session;
		/* snapshot the session's cap generation and ttl */
		spin_lock(&s->s_cap_lock);
		gen = s->s_cap_gen;
		ttl = s->s_cap_ttl;
		spin_unlock(&s->s_cap_lock);

		if (di->lease_gen == gen &&
		    time_before(jiffies, dentry->d_time) &&
		    time_before(jiffies, ttl)) {
			valid = 1;
			if (di->lease_renew_after &&
			    time_after(jiffies, di->lease_renew_after)) {
				/* we should renew */
				dir = dentry->d_parent->d_inode;
				session = ceph_get_mds_session(s);
				seq = di->lease_seq;
				di->lease_renew_after = 0;
				di->lease_renew_from = jiffies;
			}
		}
	}
	spin_unlock(&dentry->d_lock);

	/* send the renew request outside of d_lock */
	if (session) {
		ceph_mdsc_lease_send_msg(session, dir, dentry,
					 CEPH_MDS_LEASE_RENEW, seq);
		ceph_put_mds_session(session);
	}
	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
	return valid;
}
935
/*
 * Check if directory-wide content lease/cap is valid: the dentry's
 * lease generation must match the dir's shared gen and we must still
 * hold a FILE_SHARED cap on the dir.
 */
static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
{
	struct ceph_inode_info *ci = ceph_inode(dir);
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	int valid = 0;

	spin_lock(&dir->i_lock);
	if (ci->i_shared_gen == di->lease_shared_gen)
		valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
	spin_unlock(&dir->i_lock);
	dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
	     dir, (unsigned)ci->i_shared_gen, dentry,
	     (unsigned)di->lease_shared_gen, valid);
	return valid;
}
954
/*
 * Check if cached dentry can be trusted.
 *
 * Returns 1 (and touches the dentry LRU) if the dentry is still
 * valid; otherwise unhashes it and returns 0 so the VFS performs a
 * fresh lookup.
 */
static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
{
	struct inode *dir = dentry->d_parent->d_inode;

	dout("d_revalidate %p '%.*s' inode %p\n", dentry,
	     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);

	/* always trust cached snapped dentries, snapdir dentry */
	if (ceph_snap(dir) != CEPH_NOSNAP) {
		dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
		     dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
		goto out_touch;
	}
	if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
		goto out_touch;

	/* either a per-dentry lease or a dir-wide cap suffices */
	if (dentry_lease_is_valid(dentry) ||
	    dir_lease_is_valid(dir, dentry))
		goto out_touch;

	dout("d_revalidate %p invalid\n", dentry);
	d_drop(dentry);
	return 0;
out_touch:
	ceph_dentry_lru_touch(dentry);
	return 1;
}
985
/*
 * When a dentry is released, clear the dir I_COMPLETE if it was part
 * of the current dir gen, then tear down the ceph_dentry_info.
 */
static void ceph_dentry_release(struct dentry *dentry)
{
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	struct inode *parent_inode = dentry->d_parent->d_inode;

	if (parent_inode) {
		struct ceph_inode_info *ci = ceph_inode(parent_inode);

		spin_lock(&parent_inode->i_lock);
		/*
		 * NOTE(review): di is dereferenced here but only
		 * NULL-checked below -- confirm d_fsdata is always set
		 * by the time d_release runs.
		 */
		if (ci->i_shared_gen == di->lease_shared_gen) {
			/* losing a dentry invalidates our complete view */
			dout(" clearing %p complete (d_release)\n",
			     parent_inode);
			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
			ci->i_release_count++;
		}
		spin_unlock(&parent_inode->i_lock);
	}
	if (di) {
		ceph_dentry_lru_del(dentry);
		if (di->lease_session)
			ceph_put_mds_session(di->lease_session);
		kmem_cache_free(ceph_dentry_cachep, di);
		dentry->d_fsdata = NULL;
	}
}
1015
static int ceph_snapdir_d_revalidate(struct dentry *dentry,
				     struct nameidata *nd)
{
	/*
	 * Snapshotted dentries are always reported valid for now.
	 * Eventually, we'll want to revalidate snapped metadata
	 * too... probably...
	 */
	return 1;
}
1025
1026
1027
/*
 * read() on a dir.  This weird interface hack only works if mounted
 * with '-o dirstat'.  Lazily formats the cached recursive dir stats
 * into a small text buffer on first read, then serves reads from it.
 */
static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
			     loff_t *ppos)
{
	struct ceph_file_info *cf = file->private_data;
	struct inode *inode = file->f_dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int left;

	if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
		return -EISDIR;

	/* format the stat text once, on first read */
	if (!cf->dir_info) {
		cf->dir_info = kmalloc(1024, GFP_NOFS);
		if (!cf->dir_info)
			return -ENOMEM;
		cf->dir_info_len =
			sprintf(cf->dir_info,
				"entries: %20lld\n"
				" files: %20lld\n"
				" subdirs: %20lld\n"
				"rentries: %20lld\n"
				" rfiles: %20lld\n"
				" rsubdirs: %20lld\n"
				"rbytes: %20lld\n"
				"rctime: %10ld.%09ld\n",
				ci->i_files + ci->i_subdirs,
				ci->i_files,
				ci->i_subdirs,
				ci->i_rfiles + ci->i_rsubdirs,
				ci->i_rfiles,
				ci->i_rsubdirs,
				ci->i_rbytes,
				(long)ci->i_rctime.tv_sec,
				(long)ci->i_rctime.tv_nsec);
	}

	if (*ppos >= cf->dir_info_len)
		return 0;	/* EOF */
	size = min_t(unsigned, size, cf->dir_info_len-*ppos);
	left = copy_to_user(buf, cf->dir_info + *ppos, size);
	if (left == size)
		return -EFAULT;	/* nothing was copied */
	*ppos += (size - left);
	return size - left;	/* may be a short read */
}
1077
/*
 * an fsync() on a dir will wait for any uncommitted directory
 * operations to commit.
 *
 * Walks the inode's unsafe-dirop list, waiting on each request's
 * "safe" completion up to the tid that was newest when we started;
 * operations queued after that are not waited for.
 */
static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
			  int datasync)
{
	struct inode *inode = dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct list_head *head = &ci->i_unsafe_dirops;
	struct ceph_mds_request *req;
	u64 last_tid;
	int ret = 0;

	dout("dir_fsync %p\n", inode);
	spin_lock(&ci->i_unsafe_lock);
	if (list_empty(head))
		goto out;

	/* remember the newest pending tid; that's where we stop */
	req = list_entry(head->prev,
			 struct ceph_mds_request, r_unsafe_dir_item);
	last_tid = req->r_tid;

	do {
		/* hold a ref on the request across the unlocked wait */
		ceph_mdsc_get_request(req);
		spin_unlock(&ci->i_unsafe_lock);
		dout("dir_fsync %p wait on tid %llu (until %llu)\n",
		     inode, req->r_tid, last_tid);
		if (req->r_timeout) {
			ret = wait_for_completion_timeout(
				&req->r_safe_completion, req->r_timeout);
			if (ret > 0)
				ret = 0;
			else if (ret == 0)
				ret = -EIO;  /* timed out */
		} else {
			wait_for_completion(&req->r_safe_completion);
		}
		spin_lock(&ci->i_unsafe_lock);
		ceph_mdsc_put_request(req);

		if (ret || list_empty(head))
			break;
		req = list_entry(head->next,
				 struct ceph_mds_request, r_unsafe_dir_item);
	} while (req->r_tid < last_tid);
out:
	spin_unlock(&ci->i_unsafe_lock);
	return ret;
}
1128
1129/*
1130 * We maintain a private dentry LRU.
1131 *
1132 * FIXME: this needs to be changed to a per-mds lru to be useful.
1133 */
1134void ceph_dentry_lru_add(struct dentry *dn)
1135{
1136 struct ceph_dentry_info *di = ceph_dentry(dn);
1137 struct ceph_mds_client *mdsc;
1138
1139 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1140 dn->d_name.len, dn->d_name.name);
1141 if (di) {
1142 mdsc = &ceph_client(dn->d_sb)->mdsc;
1143 spin_lock(&mdsc->dentry_lru_lock);
1144 list_add_tail(&di->lru, &mdsc->dentry_lru);
1145 mdsc->num_dentry++;
1146 spin_unlock(&mdsc->dentry_lru_lock);
1147 }
1148}
1149
1150void ceph_dentry_lru_touch(struct dentry *dn)
1151{
1152 struct ceph_dentry_info *di = ceph_dentry(dn);
1153 struct ceph_mds_client *mdsc;
1154
1155 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
1156 dn->d_name.len, dn->d_name.name);
1157 if (di) {
1158 mdsc = &ceph_client(dn->d_sb)->mdsc;
1159 spin_lock(&mdsc->dentry_lru_lock);
1160 list_move_tail(&di->lru, &mdsc->dentry_lru);
1161 spin_unlock(&mdsc->dentry_lru_lock);
1162 }
1163}
1164
1165void ceph_dentry_lru_del(struct dentry *dn)
1166{
1167 struct ceph_dentry_info *di = ceph_dentry(dn);
1168 struct ceph_mds_client *mdsc;
1169
1170 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1171 dn->d_name.len, dn->d_name.name);
1172 if (di) {
1173 mdsc = &ceph_client(dn->d_sb)->mdsc;
1174 spin_lock(&mdsc->dentry_lru_lock);
1175 list_del_init(&di->lru);
1176 mdsc->num_dentry--;
1177 spin_unlock(&mdsc->dentry_lru_lock);
1178 }
1179}
1180
/* file operations for directories */
const struct file_operations ceph_dir_fops = {
	.read = ceph_read_dir,		/* only useful with -o dirstat */
	.readdir = ceph_readdir,
	.llseek = ceph_dir_llseek,
	.open = ceph_open,
	.release = ceph_release,
	.unlocked_ioctl = ceph_ioctl,
	.fsync = ceph_dir_fsync,
};
1190
/* inode operations for directories */
const struct inode_operations ceph_dir_iops = {
	.lookup = ceph_lookup,
	.permission = ceph_permission,
	.getattr = ceph_getattr,
	.setattr = ceph_setattr,
	.setxattr = ceph_setxattr,
	.getxattr = ceph_getxattr,
	.listxattr = ceph_listxattr,
	.removexattr = ceph_removexattr,
	.mknod = ceph_mknod,
	.symlink = ceph_symlink,
	.mkdir = ceph_mkdir,
	.link = ceph_link,
	.unlink = ceph_unlink,
	.rmdir = ceph_unlink,	/* rmdir shares the unlink entry point */
	.rename = ceph_rename,
	.create = ceph_create,
};
1209
/* dentry methods for normal (live, non-snap) dentries */
struct dentry_operations ceph_dentry_ops = {
	.d_revalidate = ceph_d_revalidate,
	.d_release = ceph_dentry_release,
};

/* dentry methods for the virtual .snap directory itself */
struct dentry_operations ceph_snapdir_dentry_ops = {
	.d_revalidate = ceph_snapdir_d_revalidate,
};

/* dentry methods for snapped dentries: no methods needed */
struct dentry_operations ceph_snap_dentry_ops = {
};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..fc68e39cbad6
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,223 @@
1#include "ceph_debug.h"
2
3#include <linux/exportfs.h>
4#include <asm/unaligned.h>
5
6#include "super.h"
7
8/*
9 * NFS export support
10 *
 * NFS re-export of a ceph mount is, at present, only semi-reliable.
 * The basic issue is that the Ceph architecture doesn't lend itself
 * well to generating filehandles that will remain valid forever.
14 *
15 * So, we do our best. If you're lucky, your inode will be in the
16 * client's cache. If it's not, and you have a connectable fh, then
17 * the MDS server may be able to find it for you. Otherwise, you get
18 * ESTALE.
19 *
 * There are ways to make this more reliable, but in the non-connectable
 * fh case we won't ever work perfectly, and in the connectable case,
 * some changes are needed on the MDS side to work better.
23 */
24
/*
 * Basic fh: just the inode number.  Only resolvable while the inode
 * is in this client's cache.
 */
struct ceph_nfs_fh {
	u64 ino;	/* ceph inode number */
} __attribute__ ((packed));
31
/*
 * Larger 'connectable' fh that includes parent ino and name hash.
 * Use this whenever possible, as it works more reliably.
 */
struct ceph_nfs_confh {
	u64 ino, parent_ino;	/* object and parent directory inos */
	u32 parent_name_hash;	/* hash of our name within the parent */
} __attribute__ ((packed));
40
41static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
42 int connectable)
43{
44 struct ceph_nfs_fh *fh = (void *)rawfh;
45 struct ceph_nfs_confh *cfh = (void *)rawfh;
46 struct dentry *parent = dentry->d_parent;
47 struct inode *inode = dentry->d_inode;
48 int type;
49
50 /* don't re-export snaps */
51 if (ceph_snap(inode) != CEPH_NOSNAP)
52 return -EINVAL;
53
54 if (*max_len >= sizeof(*cfh)) {
55 dout("encode_fh %p connectable\n", dentry);
56 cfh->ino = ceph_ino(dentry->d_inode);
57 cfh->parent_ino = ceph_ino(parent->d_inode);
58 cfh->parent_name_hash = parent->d_name.hash;
59 *max_len = sizeof(*cfh);
60 type = 2;
61 } else if (*max_len > sizeof(*fh)) {
62 if (connectable)
63 return -ENOSPC;
64 dout("encode_fh %p\n", dentry);
65 fh->ino = ceph_ino(dentry->d_inode);
66 *max_len = sizeof(*fh);
67 type = 1;
68 } else {
69 return -ENOSPC;
70 }
71 return type;
72}
73
74/*
75 * convert regular fh to dentry
76 *
77 * FIXME: we should try harder by querying the mds for the ino.
78 */
79static struct dentry *__fh_to_dentry(struct super_block *sb,
80 struct ceph_nfs_fh *fh)
81{
82 struct inode *inode;
83 struct dentry *dentry;
84 struct ceph_vino vino;
85 int err;
86
87 dout("__fh_to_dentry %llx\n", fh->ino);
88 vino.ino = fh->ino;
89 vino.snap = CEPH_NOSNAP;
90 inode = ceph_find_inode(sb, vino);
91 if (!inode)
92 return ERR_PTR(-ESTALE);
93
94 dentry = d_obtain_alias(inode);
95 if (!dentry) {
96 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
97 fh->ino, inode);
98 iput(inode);
99 return ERR_PTR(-ENOMEM);
100 }
101 err = ceph_init_dentry(dentry);
102
103 if (err < 0) {
104 iput(inode);
105 return ERR_PTR(err);
106 }
107 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
108 return dentry;
109}
110
111/*
112 * convert connectable fh to dentry
113 */
114static struct dentry *__cfh_to_dentry(struct super_block *sb,
115 struct ceph_nfs_confh *cfh)
116{
117 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
118 struct inode *inode;
119 struct dentry *dentry;
120 struct ceph_vino vino;
121 int err;
122
123 dout("__cfh_to_dentry %llx (%llx/%x)\n",
124 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
125
126 vino.ino = cfh->ino;
127 vino.snap = CEPH_NOSNAP;
128 inode = ceph_find_inode(sb, vino);
129 if (!inode) {
130 struct ceph_mds_request *req;
131
132 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
133 USE_ANY_MDS);
134 if (IS_ERR(req))
135 return ERR_PTR(PTR_ERR(req));
136
137 req->r_ino1 = vino;
138 req->r_ino2.ino = cfh->parent_ino;
139 req->r_ino2.snap = CEPH_NOSNAP;
140 req->r_path2 = kmalloc(16, GFP_NOFS);
141 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
142 req->r_num_caps = 1;
143 err = ceph_mdsc_do_request(mdsc, NULL, req);
144 ceph_mdsc_put_request(req);
145 inode = ceph_find_inode(sb, vino);
146 if (!inode)
147 return ERR_PTR(err ? err : -ESTALE);
148 }
149
150 dentry = d_obtain_alias(inode);
151 if (!dentry) {
152 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
153 cfh->ino, inode);
154 iput(inode);
155 return ERR_PTR(-ENOMEM);
156 }
157 err = ceph_init_dentry(dentry);
158 if (err < 0) {
159 iput(inode);
160 return ERR_PTR(err);
161 }
162 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
163 return dentry;
164}
165
166static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
167 int fh_len, int fh_type)
168{
169 if (fh_type == 1)
170 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
171 else
172 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
173}
174
175/*
176 * get parent, if possible.
177 *
178 * FIXME: we could do better by querying the mds to discover the
179 * parent.
180 */
181static struct dentry *ceph_fh_to_parent(struct super_block *sb,
182 struct fid *fid,
183 int fh_len, int fh_type)
184{
185 struct ceph_nfs_confh *cfh = (void *)fid->raw;
186 struct ceph_vino vino;
187 struct inode *inode;
188 struct dentry *dentry;
189 int err;
190
191 if (fh_type == 1)
192 return ERR_PTR(-ESTALE);
193
194 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
195 cfh->parent_name_hash);
196
197 vino.ino = cfh->ino;
198 vino.snap = CEPH_NOSNAP;
199 inode = ceph_find_inode(sb, vino);
200 if (!inode)
201 return ERR_PTR(-ESTALE);
202
203 dentry = d_obtain_alias(inode);
204 if (!dentry) {
205 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
206 cfh->ino, inode);
207 iput(inode);
208 return ERR_PTR(-ENOMEM);
209 }
210 err = ceph_init_dentry(dentry);
211 if (err < 0) {
212 iput(inode);
213 return ERR_PTR(err);
214 }
215 dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
216 return dentry;
217}
218
/* exportfs hooks implementing NFS re-export for ceph */
const struct export_operations ceph_export_ops = {
	.encode_fh = ceph_encode_fh,
	.fh_to_dentry = ceph_fh_to_dentry,
	.fh_to_parent = ceph_fh_to_parent,
};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..5d2af8464f6a
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,937 @@
1#include "ceph_debug.h"
2
3#include <linux/sched.h>
4#include <linux/file.h>
5#include <linux/namei.h>
6#include <linux/writeback.h>
7
8#include "super.h"
9#include "mds_client.h"
10
11/*
12 * Ceph file operations
13 *
14 * Implement basic open/close functionality, and implement
15 * read/write.
16 *
17 * We implement three modes of file I/O:
18 * - buffered uses the generic_file_aio_{read,write} helpers
19 *
20 * - synchronous is used when there is multi-client read/write
21 * sharing, avoids the page cache, and synchronously waits for an
22 * ack from the OSD.
23 *
24 * - direct io takes the variant of the sync path that references
25 * user pages directly.
26 *
27 * fsync() flushes and waits on dirty pages, but just queues metadata
28 * for writeback: since the MDS can recover size and mtime there is no
29 * need to wait for MDS acknowledgement.
30 */
31
32
33/*
34 * Prepare an open request. Preallocate ceph_cap to avoid an
35 * inopportune ENOMEM later.
36 */
37static struct ceph_mds_request *
38prepare_open_request(struct super_block *sb, int flags, int create_mode)
39{
40 struct ceph_client *client = ceph_sb_to_client(sb);
41 struct ceph_mds_client *mdsc = &client->mdsc;
42 struct ceph_mds_request *req;
43 int want_auth = USE_ANY_MDS;
44 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
45
46 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
47 want_auth = USE_AUTH_MDS;
48
49 req = ceph_mdsc_create_request(mdsc, op, want_auth);
50 if (IS_ERR(req))
51 goto out;
52 req->r_fmode = ceph_flags_to_mode(flags);
53 req->r_args.open.flags = cpu_to_le32(flags);
54 req->r_args.open.mode = cpu_to_le32(create_mode);
55 req->r_args.open.preferred = cpu_to_le32(-1);
56out:
57 return req;
58}
59
/*
 * Initialize private struct file data (the per-open ceph_file_info),
 * taking ownership of the fmode reference the caller acquired on the
 * ceph_inode.
 *
 * On any path where ceph_release() will NOT be this file's release op
 * (symlinks, special files, allocation failure), the fmode reference
 * is dropped here, since nothing else ever would.
 */
static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
{
	struct ceph_file_info *cf;
	int ret = 0;

	switch (inode->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
		dout("init_file %p %p 0%o (regular)\n", inode, file,
		     inode->i_mode);
		cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
		if (cf == NULL) {
			ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
			return -ENOMEM;
		}
		/* fmode ref is now owned by cf; dropped in ceph_release() */
		cf->fmode = fmode;
		/* initial readdir offset; 0/1 are apparently . and .. */
		cf->next_offset = 2;
		file->private_data = cf;
		/* release op must be ours, or the fmode ref would leak */
		BUG_ON(inode->i_fop->release != ceph_release);
		break;

	case S_IFLNK:
		dout("init_file %p %p 0%o (symlink)\n", inode, file,
		     inode->i_mode);
		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
		break;

	default:
		dout("init_file %p %p 0%o (special)\n", inode, file,
		     inode->i_mode);
		/*
		 * we need to drop the open ref now, since we don't
		 * have .release set to ceph_release.
		 */
		ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
		BUG_ON(inode->i_fop->release == ceph_release);

		/* call the proper open fop */
		ret = inode->i_fop->open(inode, file);
	}
	return ret;
}
106
/*
 * If the filp already has private_data, that means the file was
 * already opened by intent during lookup, and we do nothing.
 *
 * If we already have the requisite capabilities, we can satisfy
 * the open request locally (no need to request new caps from the
 * MDS).  We do, however, need to inform the MDS (asynchronously)
 * if our wanted caps set expands.
 *
 * Returns 0 on success or a negative errno.
 */
int ceph_open(struct inode *inode, struct file *file)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct ceph_mds_request *req;
	struct ceph_file_info *cf = file->private_data;
	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
	int err;
	int flags, fmode, wanted;

	/* already set up by an intent open during lookup? */
	if (cf) {
		dout("open file %p is already opened\n", file);
		return 0;
	}

	/* filter out O_CREAT|O_EXCL; vfs did that already.  yuck. */
	flags = file->f_flags & ~(O_CREAT|O_EXCL);
	if (S_ISDIR(inode->i_mode))
		flags = O_DIRECTORY;  /* mds likes to know */

	dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
	     ceph_vinop(inode), file, flags, file->f_flags);
	fmode = ceph_flags_to_mode(flags);
	wanted = ceph_caps_for_mode(fmode);

	/* snapped files are read-only */
	if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
		return -EROFS;

	/* trivially open snapdir */
	if (ceph_snap(inode) == CEPH_SNAPDIR) {
		spin_lock(&inode->i_lock);
		__ceph_get_fmode(ci, fmode);
		spin_unlock(&inode->i_lock);
		return ceph_init_file(inode, file, fmode);
	}

	/*
	 * No need to block if we have any caps.  Update wanted set
	 * asynchronously.
	 */
	spin_lock(&inode->i_lock);
	if (__ceph_is_any_real_caps(ci)) {
		int mds_wanted = __ceph_caps_mds_wanted(ci);
		int issued = __ceph_caps_issued(ci, NULL);

		dout("open %p fmode %d want %s issued %s using existing\n",
		     inode, fmode, ceph_cap_string(wanted),
		     ceph_cap_string(issued));
		__ceph_get_fmode(ci, fmode);
		spin_unlock(&inode->i_lock);

		/* adjust wanted? */
		if ((issued & wanted) != wanted &&
		    (mds_wanted & wanted) != wanted &&
		    ceph_snap(inode) != CEPH_SNAPDIR)
			ceph_check_caps(ci, 0, NULL);

		return ceph_init_file(inode, file, fmode);
	} else if (ceph_snap(inode) != CEPH_NOSNAP &&
		   (ci->i_snap_caps & wanted) == wanted) {
		/* snapped inode whose snap caps cover what we want */
		__ceph_get_fmode(ci, fmode);
		spin_unlock(&inode->i_lock);
		return ceph_init_file(inode, file, fmode);
	}
	spin_unlock(&inode->i_lock);

	/* otherwise, ask the MDS; this may block */
	dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
	req = prepare_open_request(inode->i_sb, flags, 0);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_inode = igrab(inode);
	req->r_num_caps = 1;
	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
	if (!err)
		err = ceph_init_file(inode, file, req->r_fmode);
	ceph_mdsc_put_request(req);
	dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
out:
	return err;
}
200
201
/*
 * Do a lookup + open with a single request.
 *
 * If this succeeds, but some subsequent check in the vfs
 * may_open() fails, the struct *file gets cleaned up (i.e.
 * ceph_release gets called).  So fear not!
 *
 * nd->intent.open.flags, per the callers:
 *   path_lookup_open   -> LOOKUP_OPEN
 *   path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
 */
struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
				struct nameidata *nd, int mode,
				int locked_dir)
{
	struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
	struct ceph_mds_client *mdsc = &client->mdsc;
	struct file *file = nd->intent.open.file;
	struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
	struct ceph_mds_request *req;
	int err;
	int flags = nd->intent.open.flags - 1;  /* silly vfs! */

	dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
	     dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);

	/* do the open */
	req = prepare_open_request(dir->i_sb, flags, mode);
	if (IS_ERR(req))
		return ERR_PTR(PTR_ERR(req));
	req->r_dentry = dget(dentry);
	req->r_num_caps = 2;
	if (flags & O_CREAT) {
		req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
	}
	req->r_locked_dir = dir;           /* caller holds dir->i_mutex */
	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
	dentry = ceph_finish_lookup(req, dentry, err);
	/* created, but the MDS reply traced no dentry for us? */
	if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
		err = ceph_handle_notrace_create(dir, dentry);
	if (!err)
		err = ceph_init_file(req->r_dentry->d_inode, file,
				     req->r_fmode);
	ceph_mdsc_put_request(req);
	dout("ceph_lookup_open result=%p\n", dentry);
	return dentry;
}
251
/*
 * Release (close) a file: drop the fmode reference taken at open time
 * and free the per-open ceph_file_info, including any cached readdir
 * state it carries.
 */
int ceph_release(struct inode *inode, struct file *file)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_file_info *cf = file->private_data;

	dout("release inode %p file %p\n", inode, file);
	ceph_put_fmode(ci, cf->fmode);
	if (cf->last_readdir)
		ceph_mdsc_put_request(cf->last_readdir);
	kfree(cf->last_name);
	kfree(cf->dir_info);
	dput(cf->dentry);
	kmem_cache_free(ceph_file_cachep, cf);

	/* wake up anyone waiting for caps on this inode */
	wake_up(&ci->i_cap_wq);
	return 0;
}
270
271/*
272 * build a vector of user pages
273 */
274static struct page **get_direct_page_vector(const char __user *data,
275 int num_pages,
276 loff_t off, size_t len)
277{
278 struct page **pages;
279 int rc;
280
281 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
282 if (!pages)
283 return ERR_PTR(-ENOMEM);
284
285 down_read(&current->mm->mmap_sem);
286 rc = get_user_pages(current, current->mm, (unsigned long)data,
287 num_pages, 0, 0, pages, NULL);
288 up_read(&current->mm->mmap_sem);
289 if (rc < 0)
290 goto fail;
291 return pages;
292
293fail:
294 kfree(pages);
295 return ERR_PTR(rc);
296}
297
/*
 * Drop the per-page references taken by get_direct_page_vector() and
 * free the vector itself.
 */
static void put_page_vector(struct page **pages, int num_pages)
{
	struct page **p = pages;
	struct page **end = pages + num_pages;

	for (; p < end; p++)
		put_page(*p);
	kfree(pages);
}
306
/*
 * Free pages allocated by alloc_page_vector(), then the vector itself.
 */
void ceph_release_page_vector(struct page **pages, int num_pages)
{
	int idx;

	for (idx = 0; idx < num_pages; idx++)
		__free_pages(pages[idx], 0);
	kfree(pages);
}
315
316/*
317 * allocate a vector new pages
318 */
319static struct page **alloc_page_vector(int num_pages)
320{
321 struct page **pages;
322 int i;
323
324 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
325 if (!pages)
326 return ERR_PTR(-ENOMEM);
327 for (i = 0; i < num_pages; i++) {
328 pages[i] = alloc_page(GFP_NOFS);
329 if (pages[i] == NULL) {
330 ceph_release_page_vector(pages, i);
331 return ERR_PTR(-ENOMEM);
332 }
333 }
334 return pages;
335}
336
/*
 * copy user data into a page vector.
 *
 * Only the in-page component of @off is used (byte offset of the first
 * byte within the first page).  Returns @len on success, or -EFAULT if
 * an entire chunk faults.
 *
 * On a *partial* fault (copy_from_user copies some but not all of a
 * chunk) we advance past the copied bytes and retry; the retry then
 * faults completely at the same address and we return -EFAULT.
 */
static int copy_user_to_page_vector(struct page **pages,
				    const char __user *data,
				    loff_t off, size_t len)
{
	int i = 0;
	int po = off & ~PAGE_CACHE_MASK;	/* offset within current page */
	int left = len;
	int l, bad;

	while (left > 0) {
		l = min_t(int, PAGE_CACHE_SIZE-po, left);
		bad = copy_from_user(page_address(pages[i]) + po, data, l);
		if (bad == l)
			return -EFAULT;
		data += l - bad;
		left -= l - bad;
		po += l - bad;
		if (po == PAGE_CACHE_SIZE) {
			po = 0;
			i++;
		}
	}
	return len;
}
364
/*
 * copy data from a page vector into a user pointer.
 *
 * Returns @len on success, or -EFAULT if an entire chunk fails to copy.
 *
 * NOTE(review): i advances every iteration, but po is only tracked
 * while nonzero.  Once po wraps to 0 every subsequent iteration copies
 * a full page, so i and po stay consistent for full copies.  A partial
 * copy_to_user() would advance i past an unfinished page, but the next
 * iteration then faults entirely at the same user address and we
 * return -EFAULT, so no incorrect success is reported — confirm if
 * partial-progress semantics are ever needed here.
 */
static int copy_page_vector_to_user(struct page **pages, char __user *data,
				    loff_t off, size_t len)
{
	int i = 0;
	int po = off & ~PAGE_CACHE_MASK;
	int left = len;
	int l, bad;

	while (left > 0) {
		l = min_t(int, left, PAGE_CACHE_SIZE-po);
		bad = copy_to_user(data, page_address(pages[i]) + po, l);
		if (bad == l)
			return -EFAULT;
		data += l - bad;
		left -= l - bad;
		if (po) {
			po += l - bad;
			if (po == PAGE_CACHE_SIZE)
				po = 0;
		}
		i++;
	}
	return len;
}
392
393/*
394 * Zero an extent within a page vector. Offset is relative to the
395 * start of the first page.
396 */
397static void zero_page_vector_range(int off, int len, struct page **pages)
398{
399 int i = off >> PAGE_CACHE_SHIFT;
400
401 off &= ~PAGE_CACHE_MASK;
402
403 dout("zero_page_vector_page %u~%u\n", off, len);
404
405 /* leading partial page? */
406 if (off) {
407 int end = min((int)PAGE_CACHE_SIZE, off + len);
408 dout("zeroing %d %p head from %d\n", i, pages[i],
409 (int)off);
410 zero_user_segment(pages[i], off, end);
411 len -= (end - off);
412 i++;
413 }
414 while (len >= PAGE_CACHE_SIZE) {
415 dout("zeroing %d %p len=%d\n", i, pages[i], len);
416 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
417 len -= PAGE_CACHE_SIZE;
418 i++;
419 }
420 /* trailing partial page? */
421 if (len) {
422 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
423 zero_user_segment(pages[i], 0, len);
424 }
425}
426
427
/*
 * Read a range of bytes striped over one or more objects.  Iterate over
 * objects we stripe over.  (That's not atomic, but good enough for now.)
 *
 * If we get a short result from the OSD, check against i_size; we need to
 * only return a short read to the caller if we hit EOF.
 *
 * Returns the number of bytes placed in @pages (including zeroed gaps
 * and tail) or a negative error.  *@checkeof is set when a short read
 * may mean our i_size is stale and the caller should re-check it.
 */
static int striped_read(struct inode *inode,
			u64 off, u64 len,
			struct page **pages, int num_pages,
			int *checkeof)
{
	struct ceph_client *client = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 pos, this_len;
	int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
	int left, pages_left;
	int read;			/* bytes accounted for so far */
	struct page **page_pos;
	int ret;
	bool hit_stripe, was_short;

	/*
	 * we may need to do multiple reads.  not atomic, unfortunately.
	 */
	pos = off;
	left = len;
	page_pos = pages;
	pages_left = num_pages;
	read = 0;

more:
	this_len = left;
	/* readpages apparently clips this_len at the stripe/object bound */
	ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
				  &ci->i_layout, pos, &this_len,
				  ci->i_truncate_seq,
				  ci->i_truncate_size,
				  page_pos, pages_left);
	hit_stripe = this_len < left;
	was_short = ret >= 0 && ret < this_len;
	if (ret == -ENOENT)
		ret = 0;	/* missing object reads as zeros/hole */
	dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");

	if (ret > 0) {
		int didpages =
			((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;

		if (read < pos - off) {
			/* a prior pass came up short: zero the gap */
			dout(" zero gap %llu to %llu\n", off + read, pos);
			zero_page_vector_range(page_off + read,
					       pos - off - read, pages);
		}
		pos += ret;
		read = pos - off;
		left -= ret;
		page_pos += didpages;
		pages_left -= didpages;

		/* hit stripe? */
		if (left && hit_stripe)
			goto more;
	}

	if (was_short) {
		/* was original extent fully inside i_size? */
		if (pos + left <= inode->i_size) {
			/* short read within i_size is a hole: zero it out */
			dout("zero tail\n");
			zero_page_vector_range(page_off + read, len - read,
					       pages);
			read = len;
			goto out;
		}

		/* check i_size */
		*checkeof = 1;
	}

out:
	if (ret >= 0)
		ret = read;
	dout("striped_read returns %d\n", ret);
	return ret;
}
513
/*
 * Completely synchronous read and write methods.  Direct from __user
 * buffer to osd, or directly to user pages (if O_DIRECT).
 *
 * If the read spans object boundary, just do multiple reads.
 *
 * Returns bytes read (advancing *poff), or a negative error.
 */
static ssize_t ceph_sync_read(struct file *file, char __user *data,
			      unsigned len, loff_t *poff, int *checkeof)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct page **pages;
	u64 off = *poff;
	int num_pages = calc_pages_for(off, len);
	int ret;

	dout("sync_read on file %p %llu~%u %s\n", file, off, len,
	     (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");

	if (file->f_flags & O_DIRECT) {
		/* read straight into the pinned user pages */
		pages = get_direct_page_vector(data, num_pages, off, len);

		/*
		 * flush any page cache pages in this range.  this
		 * will make concurrent normal and O_DIRECT io slow,
		 * but it will at least behave sensibly when they are
		 * in sequence.
		 */
	} else {
		/* read into kernel pages, copy out to the user below */
		pages = alloc_page_vector(num_pages);
	}
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	/* push any dirty pagecache data to the OSDs before reading */
	ret = filemap_write_and_wait(inode->i_mapping);
	if (ret < 0)
		goto done;

	ret = striped_read(inode, off, len, pages, num_pages, checkeof);

	if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
		ret = copy_page_vector_to_user(pages, data, off, ret);
	if (ret >= 0)
		*poff = off + ret;

done:
	if (file->f_flags & O_DIRECT)
		put_page_vector(pages, num_pages);
	else
		ceph_release_page_vector(pages, num_pages);
	dout("sync_read result %d\n", ret);
	return ret;
}
566
/*
 * Write commit callback, called if we requested both an ACK and
 * ONDISK commit reply from the OSD.
 *
 * The request was put on the inode's unsafe-writes list when it was
 * started; now that it is safely on disk, unhook it and drop the
 * CEPH_CAP_FILE_WR reference taken at that time.
 */
static void sync_write_commit(struct ceph_osd_request *req,
			      struct ceph_msg *msg)
{
	struct ceph_inode_info *ci = ceph_inode(req->r_inode);

	dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
	spin_lock(&ci->i_unsafe_lock);
	list_del_init(&req->r_unsafe_item);
	spin_unlock(&ci->i_unsafe_lock);
	ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
}
582
583/*
584 * Synchronous write, straight from __user pointer or user pages (if
585 * O_DIRECT).
586 *
587 * If write spans object boundary, just do multiple writes. (For a
588 * correct atomic write, we should e.g. take write locks on all
589 * objects, rollback on failure, etc.)
590 */
591static ssize_t ceph_sync_write(struct file *file, const char __user *data,
592 size_t left, loff_t *offset)
593{
594 struct inode *inode = file->f_dentry->d_inode;
595 struct ceph_inode_info *ci = ceph_inode(inode);
596 struct ceph_client *client = ceph_inode_to_client(inode);
597 struct ceph_osd_request *req;
598 struct page **pages;
599 int num_pages;
600 long long unsigned pos;
601 u64 len;
602 int written = 0;
603 int flags;
604 int do_sync = 0;
605 int check_caps = 0;
606 int ret;
607 struct timespec mtime = CURRENT_TIME;
608
609 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
610 return -EROFS;
611
612 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
613 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
614
615 if (file->f_flags & O_APPEND)
616 pos = i_size_read(inode);
617 else
618 pos = *offset;
619
620 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
621 if (ret < 0)
622 return ret;
623
624 ret = invalidate_inode_pages2_range(inode->i_mapping,
625 pos >> PAGE_CACHE_SHIFT,
626 (pos + left) >> PAGE_CACHE_SHIFT);
627 if (ret < 0)
628 dout("invalidate_inode_pages2_range returned %d\n", ret);
629
630 flags = CEPH_OSD_FLAG_ORDERSNAP |
631 CEPH_OSD_FLAG_ONDISK |
632 CEPH_OSD_FLAG_WRITE;
633 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
634 flags |= CEPH_OSD_FLAG_ACK;
635 else
636 do_sync = 1;
637
638 /*
639 * we may need to do multiple writes here if we span an object
640 * boundary. this isn't atomic, unfortunately. :(
641 */
642more:
643 len = left;
644 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
645 ceph_vino(inode), pos, &len,
646 CEPH_OSD_OP_WRITE, flags,
647 ci->i_snap_realm->cached_context,
648 do_sync,
649 ci->i_truncate_seq, ci->i_truncate_size,
650 &mtime, false, 2);
651 if (IS_ERR(req))
652 return PTR_ERR(req);
653
654 num_pages = calc_pages_for(pos, len);
655
656 if (file->f_flags & O_DIRECT) {
657 pages = get_direct_page_vector(data, num_pages, pos, len);
658 if (IS_ERR(pages)) {
659 ret = PTR_ERR(pages);
660 goto out;
661 }
662
663 /*
664 * throw out any page cache pages in this range. this
665 * may block.
666 */
667 truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
668 } else {
669 pages = alloc_page_vector(num_pages);
670 if (IS_ERR(pages)) {
671 ret = PTR_ERR(pages);
672 goto out;
673 }
674 ret = copy_user_to_page_vector(pages, data, pos, len);
675 if (ret < 0) {
676 ceph_release_page_vector(pages, num_pages);
677 goto out;
678 }
679
680 if ((file->f_flags & O_SYNC) == 0) {
681 /* get a second commit callback */
682 req->r_safe_callback = sync_write_commit;
683 req->r_own_pages = 1;
684 }
685 }
686 req->r_pages = pages;
687 req->r_num_pages = num_pages;
688 req->r_inode = inode;
689
690 ret = ceph_osdc_start_request(&client->osdc, req, false);
691 if (!ret) {
692 if (req->r_safe_callback) {
693 /*
694 * Add to inode unsafe list only after we
695 * start_request so that a tid has been assigned.
696 */
697 spin_lock(&ci->i_unsafe_lock);
698 list_add(&ci->i_unsafe_writes, &req->r_unsafe_item);
699 spin_unlock(&ci->i_unsafe_lock);
700 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
701 }
702 ret = ceph_osdc_wait_request(&client->osdc, req);
703 }
704
705 if (file->f_flags & O_DIRECT)
706 put_page_vector(pages, num_pages);
707 else if (file->f_flags & O_SYNC)
708 ceph_release_page_vector(pages, num_pages);
709
710out:
711 ceph_osdc_put_request(req);
712 if (ret == 0) {
713 pos += len;
714 written += len;
715 left -= len;
716 if (left)
717 goto more;
718
719 ret = written;
720 *offset = pos;
721 if (pos > i_size_read(inode))
722 check_caps = ceph_inode_set_size(inode, pos);
723 if (check_caps)
724 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
725 NULL);
726 }
727 return ret;
728}
729
/*
 * Wrap generic_file_aio_read with checks for cap bits on the inode.
 * Atomically grab references, so that those bits are not released
 * back to the MDS mid-read.
 *
 * Hmm, the sync read case isn't actually async... should it be?
 *
 * NOTE(review): only the first iovec segment (iov[0]) is honored on
 * the sync path — confirm callers never pass more on that path.
 */
static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
			     unsigned long nr_segs, loff_t pos)
{
	struct file *filp = iocb->ki_filp;
	loff_t *ppos = &iocb->ki_pos;
	size_t len = iov->iov_len;
	struct inode *inode = filp->f_dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	void *base = iov->iov_base;
	ssize_t ret;
	int got = 0;
	int checkeof = 0, read = 0;

	dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
	     inode, ceph_vinop(inode), pos, (unsigned)len, inode);
again:
	__ceph_do_pending_vmtruncate(inode);
	/* take RD cap refs; CACHE too if available (allows buffered path) */
	ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
			    &got, -1);
	if (ret < 0)
		goto out;
	dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
	     inode, ceph_vinop(inode), pos, (unsigned)len,
	     ceph_cap_string(got));

	if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
	    (iocb->ki_filp->f_flags & O_DIRECT) ||
	    (inode->i_sb->s_flags & MS_SYNCHRONOUS))
		/* hmm, this isn't really async... */
		ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
	else
		ret = generic_file_aio_read(iocb, iov, nr_segs, pos);

out:
	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
	ceph_put_cap_refs(ci, got);

	if (checkeof && ret >= 0) {
		int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);

		/* hit EOF or hole? */
		if (statret == 0 && *ppos < inode->i_size) {
			/* short read, but not at EOF: retry the remainder */
			dout("aio_read sync_read hit hole, reading more\n");
			read += ret;
			base += ret;
			len -= ret;
			checkeof = 0;
			goto again;
		}
	}
	if (ret >= 0)
		ret += read;	/* include bytes from earlier passes */

	return ret;
}
793
/*
 * Take cap references to avoid releasing caps to MDS mid-write.
 *
 * If we are synchronous, and write with an old snap context, the OSD
 * may return EOLDSNAPC.  In that case, retry the write.. _after_
 * dropping our cap refs and allowing the pending snap to logically
 * complete _before_ this write occurs.
 *
 * If we are near ENOSPC, write synchronously.
 */
static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
			      unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
	loff_t endoff = pos + iov->iov_len;
	int got = 0;
	int ret, err;

	/* snapped files are read-only */
	if (ceph_snap(inode) != CEPH_NOSNAP)
		return -EROFS;

retry_snap:
	if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
		return -ENOSPC;
	__ceph_do_pending_vmtruncate(inode);
	dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
	     inode->i_size);
	/* need WR; BUFFER too if we are to use the page cache */
	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
			    &got, endoff);
	if (ret < 0)
		goto out;

	dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
	     ceph_cap_string(got));

	if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
	    (iocb->ki_filp->f_flags & O_DIRECT) ||
	    (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
		/* NOTE(review): only iov[0] is written on this path */
		ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
			&iocb->ki_pos);
	} else {
		ret = generic_file_aio_write(iocb, iov, nr_segs, pos);

		/* O_SYNC, sync mounts, or near-full osdmap: flush now */
		if ((ret >= 0 || ret == -EIOCBQUEUED) &&
		    ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
		     || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
			err = vfs_fsync_range(file, file->f_path.dentry,
					      pos, pos + ret - 1, 1);
			if (err < 0)
				ret = err;
		}
	}
	if (ret >= 0) {
		/* mark our write cap dirty */
		spin_lock(&inode->i_lock);
		__ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
		spin_unlock(&inode->i_lock);
	}

out:
	dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
	     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
	     ceph_cap_string(got));
	ceph_put_cap_refs(ci, got);

	if (ret == -EOLDSNAPC) {
		dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
		     inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
		goto retry_snap;
	}

	return ret;
}
871
872/*
873 * llseek. be sure to verify file size on SEEK_END.
874 */
875static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
876{
877 struct inode *inode = file->f_mapping->host;
878 int ret;
879
880 mutex_lock(&inode->i_mutex);
881 __ceph_do_pending_vmtruncate(inode);
882 switch (origin) {
883 case SEEK_END:
884 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
885 if (ret < 0) {
886 offset = ret;
887 goto out;
888 }
889 offset += inode->i_size;
890 break;
891 case SEEK_CUR:
892 /*
893 * Here we special-case the lseek(fd, 0, SEEK_CUR)
894 * position-querying operation. Avoid rewriting the "same"
895 * f_pos value back to the file because a concurrent read(),
896 * write() or lseek() might have altered it
897 */
898 if (offset == 0) {
899 offset = file->f_pos;
900 goto out;
901 }
902 offset += file->f_pos;
903 break;
904 }
905
906 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
907 offset = -EINVAL;
908 goto out;
909 }
910
911 /* Special lock needed here? */
912 if (offset != file->f_pos) {
913 file->f_pos = offset;
914 file->f_version = 0;
915 }
916
917out:
918 mutex_unlock(&inode->i_mutex);
919 return offset;
920}
921
/*
 * File operations for regular ceph files.  read/write funnel through
 * the generic sync wrappers into our aio handlers, which choose
 * between the page cache and the synchronous OSD paths based on held
 * caps and open flags.
 */
const struct file_operations ceph_file_fops = {
	.open = ceph_open,
	.release = ceph_release,
	.llseek = ceph_llseek,
	.read = do_sync_read,
	.write = do_sync_write,
	.aio_read = ceph_aio_read,
	.aio_write = ceph_aio_write,
	.mmap = ceph_mmap,
	.fsync = ceph_fsync,
	.splice_read = generic_file_splice_read,
	.splice_write = generic_file_splice_write,
	.unlocked_ioctl = ceph_ioctl,
	.compat_ioctl	= ceph_ioctl,
};
937
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..7abe1aed819b
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1750 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
10#include <linux/namei.h>
11#include <linux/writeback.h>
12#include <linux/vmalloc.h>
13#include <linux/pagevec.h>
14
15#include "super.h"
16#include "decode.h"
17
18/*
19 * Ceph inode operations
20 *
21 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
22 * setattr, etc.), xattr helpers, and helpers for assimilating
23 * metadata returned by the MDS into our cache.
24 *
25 * Also define helpers for doing asynchronous writeback, invalidation,
26 * and truncation for the benefit of those who can't afford to block
27 * (typically because they are in the message handler path).
28 */
29
30static const struct inode_operations ceph_symlink_iops;
31
32static void ceph_invalidate_work(struct work_struct *work);
33static void ceph_writeback_work(struct work_struct *work);
34static void ceph_vmtruncate_work(struct work_struct *work);
35
36/*
37 * find or create an inode, given the ceph ino number
38 */
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{
41 struct inode *inode;
42 ino_t t = ceph_vino_to_ino(vino);
43
44 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
45 if (inode == NULL)
46 return ERR_PTR(-ENOMEM);
47 if (inode->i_state & I_NEW) {
48 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
49 inode, ceph_vinop(inode), (u64)inode->i_ino);
50 unlock_new_inode(inode);
51 }
52
53 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
54 vino.snap, inode);
55 return inode;
56}
57
58/*
 * get/construct snapdir inode for a given directory
60 */
61struct inode *ceph_get_snapdir(struct inode *parent)
62{
63 struct ceph_vino vino = {
64 .ino = ceph_ino(parent),
65 .snap = CEPH_SNAPDIR,
66 };
67 struct inode *inode = ceph_get_inode(parent->i_sb, vino);
68 struct ceph_inode_info *ci = ceph_inode(inode);
69
70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode));
73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid;
76 inode->i_op = &ceph_dir_iops;
77 inode->i_fop = &ceph_dir_fops;
78 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
79 ci->i_rbytes = 0;
80 return inode;
81}
82
/*
 * Inode operations installed by fill_inode on regular files and on
 * special inodes (fifo/blk/chr/sock); see the switch on i_mode there.
 */
const struct inode_operations ceph_file_iops = {
	.permission = ceph_permission,
	.setattr = ceph_setattr,
	.getattr = ceph_getattr,
	.setxattr = ceph_setxattr,
	.getxattr = ceph_getxattr,
	.listxattr = ceph_listxattr,
	.removexattr = ceph_removexattr,
};
92
93
94/*
95 * We use a 'frag tree' to keep track of the MDS's directory fragments
96 * for a given inode (usually there is just a single fragment). We
97 * need to know when a child frag is delegated to a new MDS, or when
98 * it is flagged as replicated, so we can direct our requests
99 * accordingly.
100 */
101
102/*
103 * find/create a frag in the tree
104 */
105static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
106 u32 f)
107{
108 struct rb_node **p;
109 struct rb_node *parent = NULL;
110 struct ceph_inode_frag *frag;
111 int c;
112
113 p = &ci->i_fragtree.rb_node;
114 while (*p) {
115 parent = *p;
116 frag = rb_entry(parent, struct ceph_inode_frag, node);
117 c = ceph_frag_compare(f, frag->frag);
118 if (c < 0)
119 p = &(*p)->rb_left;
120 else if (c > 0)
121 p = &(*p)->rb_right;
122 else
123 return frag;
124 }
125
126 frag = kmalloc(sizeof(*frag), GFP_NOFS);
127 if (!frag) {
128 pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
129 "frag %x\n", &ci->vfs_inode,
130 ceph_vinop(&ci->vfs_inode), f);
131 return ERR_PTR(-ENOMEM);
132 }
133 frag->frag = f;
134 frag->split_by = 0;
135 frag->mds = -1;
136 frag->ndist = 0;
137
138 rb_link_node(&frag->node, parent, p);
139 rb_insert_color(&frag->node, &ci->i_fragtree);
140
141 dout("get_or_create_frag added %llx.%llx frag %x\n",
142 ceph_vinop(&ci->vfs_inode), f);
143 return frag;
144}
145
146/*
147 * find a specific frag @f
148 */
149struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
150{
151 struct rb_node *n = ci->i_fragtree.rb_node;
152
153 while (n) {
154 struct ceph_inode_frag *frag =
155 rb_entry(n, struct ceph_inode_frag, node);
156 int c = ceph_frag_compare(f, frag->frag);
157 if (c < 0)
158 n = n->rb_left;
159 else if (c > 0)
160 n = n->rb_right;
161 else
162 return frag;
163 }
164 return NULL;
165}
166
167/*
168 * Choose frag containing the given value @v. If @pfrag is
169 * specified, copy the frag delegation info to the caller if
170 * it is present.
171 */
u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
		     struct ceph_inode_frag *pfrag,
		     int *found)
{
	u32 t = ceph_frag_make(0, 0);	/* start the walk at the root frag */
	struct ceph_inode_frag *frag;
	unsigned nway, i;
	u32 n;

	if (found)
		*found = 0;

	mutex_lock(&ci->i_fragtree_mutex);
	while (1) {
		/* invariant: the current frag t must contain v */
		WARN_ON(!ceph_frag_contains_value(t, v));
		frag = __ceph_find_frag(ci, t);
		if (!frag)
			break; /* t is a leaf */
		if (frag->split_by == 0) {
			/* unsplit frag in the tree: this is our answer;
			 * hand its delegation info back if requested */
			if (pfrag)
				memcpy(pfrag, frag, sizeof(*pfrag));
			if (found)
				*found = 1;
			break;
		}

		/* choose child */
		nway = 1 << frag->split_by;
		dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
		     frag->split_by, nway);
		for (i = 0; i < nway; i++) {
			n = ceph_frag_make_child(t, frag->split_by, i);
			if (ceph_frag_contains_value(n, v)) {
				t = n;
				break;
			}
		}
		/* exactly one child must contain v */
		BUG_ON(i == nway);
	}
	dout("choose_frag(%x) = %x\n", v, t);

	mutex_unlock(&ci->i_fragtree_mutex);
	return t;
}
216
217/*
218 * Process dirfrag (delegation) info from the mds. Include leaf
219 * fragment in tree ONLY if ndist > 0. Otherwise, only
220 * branches/splits are included in i_fragtree)
221 */
static int ceph_fill_dirfrag(struct inode *inode,
			     struct ceph_mds_reply_dirfrag *dirinfo)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_inode_frag *frag;
	u32 id = le32_to_cpu(dirinfo->frag);
	int mds = le32_to_cpu(dirinfo->auth);
	int ndist = le32_to_cpu(dirinfo->ndist);
	int i;
	int err = 0;

	mutex_lock(&ci->i_fragtree_mutex);
	if (ndist == 0) {
		/* no delegation info needed. */
		frag = __ceph_find_frag(ci, id);
		if (!frag)
			goto out;
		if (frag->split_by == 0) {
			/* tree leaf, remove */
			dout("fill_dirfrag removed %llx.%llx frag %x"
			     " (no ref)\n", ceph_vinop(inode), id);
			rb_erase(&frag->node, &ci->i_fragtree);
			kfree(frag);
		} else {
			/* tree branch, keep and clear */
			dout("fill_dirfrag cleared %llx.%llx frag %x"
			     " referral\n", ceph_vinop(inode), id);
			frag->mds = -1;
			frag->ndist = 0;
		}
		goto out;
	}


	/* find/add this frag to store mds delegation info */
	frag = __get_or_create_frag(ci, id);
	if (IS_ERR(frag)) {
		/* this is not the end of the world; we can continue
		   with bad/inaccurate delegation info */
		pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
		       ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
		err = -ENOMEM;
		goto out;
	}

	/* record authoritative mds and the (bounded) replica list */
	frag->mds = mds;
	frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
	for (i = 0; i < frag->ndist; i++)
		frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
	dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
	     ceph_vinop(inode), frag->frag, frag->ndist);

out:
	mutex_unlock(&ci->i_fragtree_mutex);
	return err;
}
278
279
280/*
281 * initialize a newly allocated inode.
282 */
/*
 * initialize a newly allocated inode.
 * Returns the embedded vfs_inode, or NULL on allocation failure.
 * Every ceph-private field is explicitly initialized here since the
 * slab allocation is not zeroed.
 */
struct inode *ceph_alloc_inode(struct super_block *sb)
{
	struct ceph_inode_info *ci;
	int i;

	ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
	if (!ci)
		return NULL;

	dout("alloc_inode %p\n", &ci->vfs_inode);

	/* versioning / flags */
	ci->i_version = 0;
	ci->i_time_warp_seq = 0;
	ci->i_ceph_flags = 0;
	ci->i_release_count = 0;
	ci->i_symlink = NULL;

	/* directory fragment tree */
	ci->i_fragtree = RB_ROOT;
	mutex_init(&ci->i_fragtree_mutex);

	/* xattr cache state */
	ci->i_xattrs.blob = NULL;
	ci->i_xattrs.prealloc_blob = NULL;
	ci->i_xattrs.dirty = false;
	ci->i_xattrs.index = RB_ROOT;
	ci->i_xattrs.count = 0;
	ci->i_xattrs.names_size = 0;
	ci->i_xattrs.vals_size = 0;
	ci->i_xattrs.version = 0;
	ci->i_xattrs.index_version = 0;

	/* capability bookkeeping */
	ci->i_caps = RB_ROOT;
	ci->i_auth_cap = NULL;
	ci->i_dirty_caps = 0;
	ci->i_flushing_caps = 0;
	INIT_LIST_HEAD(&ci->i_dirty_item);
	INIT_LIST_HEAD(&ci->i_flushing_item);
	ci->i_cap_flush_seq = 0;
	ci->i_cap_flush_last_tid = 0;
	memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
	init_waitqueue_head(&ci->i_cap_wq);
	ci->i_hold_caps_min = 0;
	ci->i_hold_caps_max = 0;
	INIT_LIST_HEAD(&ci->i_cap_delay_list);
	ci->i_cap_exporting_mds = 0;
	ci->i_cap_exporting_mseq = 0;
	ci->i_cap_exporting_issued = 0;
	INIT_LIST_HEAD(&ci->i_cap_snaps);
	ci->i_head_snapc = NULL;
	ci->i_snap_caps = 0;

	/* per-open-mode reference counts */
	for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
		ci->i_nr_by_mode[i] = 0;

	/* truncation state */
	ci->i_truncate_seq = 0;
	ci->i_truncate_size = 0;
	ci->i_truncate_pending = 0;

	/* size/max-size negotiation with the MDS */
	ci->i_max_size = 0;
	ci->i_reported_size = 0;
	ci->i_wanted_max_size = 0;
	ci->i_requested_max_size = 0;

	/* reference counts for pinned/read/write/buffered state */
	ci->i_pin_ref = 0;
	ci->i_rd_ref = 0;
	ci->i_rdcache_ref = 0;
	ci->i_wr_ref = 0;
	ci->i_wrbuffer_ref = 0;
	ci->i_wrbuffer_ref_head = 0;
	ci->i_shared_gen = 0;
	ci->i_rdcache_gen = 0;
	ci->i_rdcache_revoking = 0;

	/* unsafe (unacked) write/dirop tracking */
	INIT_LIST_HEAD(&ci->i_unsafe_writes);
	INIT_LIST_HEAD(&ci->i_unsafe_dirops);
	spin_lock_init(&ci->i_unsafe_lock);

	/* snapshot realm linkage */
	ci->i_snap_realm = NULL;
	INIT_LIST_HEAD(&ci->i_snap_realm_item);
	INIT_LIST_HEAD(&ci->i_snap_flush_item);

	/* async work items (writeback, invalidation, truncation) */
	INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
	INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);

	INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);

	return &ci->vfs_inode;
}
370
371void ceph_destroy_inode(struct inode *inode)
372{
373 struct ceph_inode_info *ci = ceph_inode(inode);
374 struct ceph_inode_frag *frag;
375 struct rb_node *n;
376
377 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
378
379 ceph_queue_caps_release(inode);
380
381 kfree(ci->i_symlink);
382 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
383 frag = rb_entry(n, struct ceph_inode_frag, node);
384 rb_erase(n, &ci->i_fragtree);
385 kfree(frag);
386 }
387
388 __ceph_destroy_xattrs(ci);
389 if (ci->i_xattrs.blob)
390 ceph_buffer_put(ci->i_xattrs.blob);
391 if (ci->i_xattrs.prealloc_blob)
392 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
393
394 kmem_cache_free(ceph_inode_cachep, ci);
395}
396
397
398/*
399 * Helpers to fill in size, ctime, mtime, and atime. We have to be
400 * careful because either the client or MDS may have more up to date
401 * info, depending on which capabilities are held, and whether
402 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
403 * and size are monotonically increasing, except when utimes() or
404 * truncate() increments the corresponding _seq values.)
405 */
int ceph_fill_file_size(struct inode *inode, int issued,
			u32 truncate_seq, u64 truncate_size, u64 size)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int queue_trunc = 0;

	/* take the new size if the MDS's truncate_seq is newer, or if the
	 * seq matches and the size only grew (sizes grow monotonically
	 * within one truncate_seq) */
	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
	    (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
		dout("size %lld -> %llu\n", inode->i_size, size);
		inode->i_size = size;
		inode->i_blocks = (size + (1<<9) - 1) >> 9;
		ci->i_reported_size = size;
		if (truncate_seq != ci->i_truncate_seq) {
			dout("truncate_seq %u -> %u\n",
			     ci->i_truncate_seq, truncate_seq);
			ci->i_truncate_seq = truncate_seq;
			/*
			 * If we hold relevant caps, or in the case where we're
			 * not the only client referencing this file and we
			 * don't hold those caps, then we need to check whether
			 * the file is either opened or mmaped
			 */
			if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
				       CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
				       CEPH_CAP_FILE_EXCL)) ||
			    mapping_mapped(inode->i_mapping) ||
			    __ceph_caps_file_wanted(ci)) {
				ci->i_truncate_pending++;
				queue_trunc = 1;	/* caller queues vmtruncate */
			}
		}
	}
	/* record the truncate_size for the current (or newer) seq */
	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
	    ci->i_truncate_size != truncate_size) {
		dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
		     truncate_size);
		ci->i_truncate_size = truncate_size;
	}
	return queue_trunc;
}
446
void ceph_fill_file_time(struct inode *inode, int issued,
			 u64 time_warp_seq, struct timespec *ctime,
			 struct timespec *mtime, struct timespec *atime)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	int warn = 0;

	if (issued & (CEPH_CAP_FILE_EXCL|
		      CEPH_CAP_FILE_WR|
		      CEPH_CAP_FILE_BUFFER)) {
		/* we hold write caps: our times may be ahead of the MDS's,
		 * so only move them forward */
		if (timespec_compare(ctime, &inode->i_ctime) > 0) {
			dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
			     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
			     ctime->tv_sec, ctime->tv_nsec);
			inode->i_ctime = *ctime;
		}
		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
			/* the MDS did a utimes() */
			dout("mtime %ld.%09ld -> %ld.%09ld "
			     "tw %d -> %d\n",
			     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
			     mtime->tv_sec, mtime->tv_nsec,
			     ci->i_time_warp_seq, (int)time_warp_seq);

			inode->i_mtime = *mtime;
			inode->i_atime = *atime;
			ci->i_time_warp_seq = time_warp_seq;
		} else if (time_warp_seq == ci->i_time_warp_seq) {
			/* nobody did utimes(); take the max */
			if (timespec_compare(mtime, &inode->i_mtime) > 0) {
				dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
				     inode->i_mtime.tv_sec,
				     inode->i_mtime.tv_nsec,
				     mtime->tv_sec, mtime->tv_nsec);
				inode->i_mtime = *mtime;
			}
			if (timespec_compare(atime, &inode->i_atime) > 0) {
				dout("atime %ld.%09ld -> %ld.%09ld inc\n",
				     inode->i_atime.tv_sec,
				     inode->i_atime.tv_nsec,
				     atime->tv_sec, atime->tv_nsec);
				inode->i_atime = *atime;
			}
		} else if (issued & CEPH_CAP_FILE_EXCL) {
			/* we did a utimes(); ignore mds values */
		} else {
			warn = 1;
		}
	} else {
		/* we have no write caps; whatever the MDS says is true */
		if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
			inode->i_ctime = *ctime;
			inode->i_mtime = *mtime;
			inode->i_atime = *atime;
			ci->i_time_warp_seq = time_warp_seq;
		} else {
			warn = 1;
		}
	}
	if (warn) /* time_warp_seq shouldn't go backwards */
		dout("%p mds time_warp_seq %llu < %u\n",
		     inode, time_warp_seq, ci->i_time_warp_seq);
}
510
511/*
512 * Populate an inode based on info from mds. May be called on new or
513 * existing inodes.
514 */
static int fill_inode(struct inode *inode,
		      struct ceph_mds_reply_info_in *iinfo,
		      struct ceph_mds_reply_dirfrag *dirinfo,
		      struct ceph_mds_session *session,
		      unsigned long ttl_from, int cap_fmode,
		      struct ceph_cap_reservation *caps_reservation)
{
	/* NOTE(review): ttl_from is never referenced in this body -- confirm
	 * whether it is vestigial or intended for future lease handling */
	struct ceph_mds_reply_inode *info = iinfo->in;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int i;
	int issued, implemented;
	struct timespec mtime, atime, ctime;
	u32 nsplits;
	struct ceph_buffer *xattr_blob = NULL;
	int err = 0;
	int queue_trunc = 0;

	dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
	     inode, ceph_vinop(inode), le64_to_cpu(info->version),
	     ci->i_version);

	/*
	 * prealloc xattr data, if it looks like we'll need it.  only
	 * if len > 4 (meaning there are actually xattrs; the first 4
	 * bytes are the xattr count).  done before taking i_lock since
	 * the allocation can block.
	 */
	if (iinfo->xattr_len > 4) {
		xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
		if (!xattr_blob)
			pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
			       iinfo->xattr_len);
	}

	spin_lock(&inode->i_lock);

	/*
	 * provided version will be odd if inode value is projected,
	 * even if stable.  skip the update if we have a newer info
	 * (e.g., due to inode info racing from multiple MDSs), or if
	 * we are getting projected (unstable) inode info.
	 */
	if (le64_to_cpu(info->version) > 0 &&
	    (ci->i_version & ~1) > le64_to_cpu(info->version))
		goto no_change;

	issued = __ceph_caps_issued(ci, &implemented);
	issued |= implemented | __ceph_caps_dirty(ci);

	/* update inode */
	ci->i_version = le64_to_cpu(info->version);
	inode->i_version++;
	inode->i_rdev = le32_to_cpu(info->rdev);

	/* only take MDS values for fields we don't hold EXCL caps on */
	if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
		inode->i_mode = le32_to_cpu(info->mode);
		inode->i_uid = le32_to_cpu(info->uid);
		inode->i_gid = le32_to_cpu(info->gid);
		dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
		     inode->i_uid, inode->i_gid);
	}

	if ((issued & CEPH_CAP_LINK_EXCL) == 0)
		inode->i_nlink = le32_to_cpu(info->nlink);

	/* be careful with mtime, atime, size */
	ceph_decode_timespec(&atime, &info->atime);
	ceph_decode_timespec(&mtime, &info->mtime);
	ceph_decode_timespec(&ctime, &info->ctime);
	queue_trunc = ceph_fill_file_size(inode, issued,
					  le32_to_cpu(info->truncate_seq),
					  le64_to_cpu(info->truncate_size),
					  le64_to_cpu(info->size));
	ceph_fill_file_time(inode, issued,
			    le32_to_cpu(info->time_warp_seq),
			    &ctime, &mtime, &atime);

	ci->i_max_size = le64_to_cpu(info->max_size);
	ci->i_layout = info->layout;
	inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;

	/* xattrs */
	/* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
	if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
	    le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
		if (ci->i_xattrs.blob)
			ceph_buffer_put(ci->i_xattrs.blob);
		ci->i_xattrs.blob = xattr_blob;	/* ownership transferred */
		if (xattr_blob)
			memcpy(ci->i_xattrs.blob->vec.iov_base,
			       iinfo->xattr_data, iinfo->xattr_len);
		ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
	}
	/* NOTE(review): if the branch above was taken, xattr_blob is now
	 * owned by ci but still non-NULL locally; the ceph_buffer_put at
	 * 'out' presumably drops only the local reference -- confirm the
	 * refcounting convention of ceph_buffer_new/_put */

	inode->i_mapping->a_ops = &ceph_aops;
	inode->i_mapping->backing_dev_info =
		&ceph_client(inode->i_sb)->backing_dev_info;

	/* install type-specific ops and state */
	switch (inode->i_mode & S_IFMT) {
	case S_IFIFO:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFSOCK:
		init_special_inode(inode, inode->i_mode, inode->i_rdev);
		inode->i_op = &ceph_file_iops;
		break;
	case S_IFREG:
		inode->i_op = &ceph_file_iops;
		inode->i_fop = &ceph_file_fops;
		break;
	case S_IFLNK:
		inode->i_op = &ceph_symlink_iops;
		if (!ci->i_symlink) {
			int symlen = iinfo->symlink_len;
			char *sym;

			BUG_ON(symlen != inode->i_size);
			/* drop i_lock around the (sleeping) allocation */
			spin_unlock(&inode->i_lock);

			err = -ENOMEM;
			sym = kmalloc(symlen+1, GFP_NOFS);
			if (!sym)
				goto out;
			memcpy(sym, iinfo->symlink, symlen);
			sym[symlen] = 0;

			spin_lock(&inode->i_lock);
			if (!ci->i_symlink)
				ci->i_symlink = sym;
			else
				kfree(sym); /* lost a race */
		}
		break;
	case S_IFDIR:
		inode->i_op = &ceph_dir_iops;
		inode->i_fop = &ceph_dir_fops;

		ci->i_files = le64_to_cpu(info->files);
		ci->i_subdirs = le64_to_cpu(info->subdirs);
		ci->i_rbytes = le64_to_cpu(info->rbytes);
		ci->i_rfiles = le64_to_cpu(info->rfiles);
		ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
		ceph_decode_timespec(&ci->i_rctime, &info->rctime);

		/* set dir completion flag? */
		if (ci->i_files == 0 && ci->i_subdirs == 0 &&
		    ceph_snap(inode) == CEPH_NOSNAP &&
		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
			dout(" marking %p complete (empty)\n", inode);
			ci->i_ceph_flags |= CEPH_I_COMPLETE;
			ci->i_max_offset = 2;
		}

		/* it may be better to set st_size in getattr instead? */
		if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
			inode->i_size = ci->i_rbytes;
		break;
	default:
		pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
		       ceph_vinop(inode), inode->i_mode);
	}

no_change:
	spin_unlock(&inode->i_lock);

	/* queue truncate if we saw i_size decrease */
	if (queue_trunc)
		ceph_queue_vmtruncate(inode);

	/* populate frag tree */
	/* FIXME: move me up, if/when version reflects fragtree changes */
	nsplits = le32_to_cpu(info->fragtree.nsplits);
	mutex_lock(&ci->i_fragtree_mutex);
	for (i = 0; i < nsplits; i++) {
		u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
		struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);

		if (IS_ERR(frag))
			continue;	/* ENOMEM: skip; delegation info is advisory */
		frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
		dout(" frag %x split by %d\n", frag->frag, frag->split_by);
	}
	mutex_unlock(&ci->i_fragtree_mutex);

	/* were we issued a capability? */
	if (info->cap.caps) {
		if (ceph_snap(inode) == CEPH_NOSNAP) {
			ceph_add_cap(inode, session,
				     le64_to_cpu(info->cap.cap_id),
				     cap_fmode,
				     le32_to_cpu(info->cap.caps),
				     le32_to_cpu(info->cap.wanted),
				     le32_to_cpu(info->cap.seq),
				     le32_to_cpu(info->cap.mseq),
				     le64_to_cpu(info->cap.realm),
				     info->cap.flags,
				     caps_reservation);
		} else {
			/* snapped inodes track caps in i_snap_caps instead */
			spin_lock(&inode->i_lock);
			dout(" %p got snap_caps %s\n", inode,
			     ceph_cap_string(le32_to_cpu(info->cap.caps)));
			ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
			if (cap_fmode >= 0)
				__ceph_get_fmode(ci, cap_fmode);
			spin_unlock(&inode->i_lock);
		}
	}

	/* update delegation info? */
	if (dirinfo)
		ceph_fill_dirfrag(inode, dirinfo);

	err = 0;

out:
	if (xattr_blob)
		ceph_buffer_put(xattr_blob);
	return err;
}
733
734/*
735 * caller should hold session s_mutex.
736 */
static void update_dentry_lease(struct dentry *dentry,
				struct ceph_mds_reply_lease *lease,
				struct ceph_mds_session *session,
				unsigned long from_time)
{
	struct ceph_dentry_info *di = ceph_dentry(dentry);
	/* lease duration arrives in milliseconds; convert to jiffies */
	long unsigned duration = le32_to_cpu(lease->duration_ms);
	long unsigned ttl = from_time + (duration * HZ) / 1000;
	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
	struct inode *dir;

	/* only track leases on regular dentries */
	if (dentry->d_op != &ceph_dentry_ops)
		return;

	spin_lock(&dentry->d_lock);
	dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
	     dentry, le16_to_cpu(lease->mask), duration, ttl);

	/* make lease_rdcache_gen match directory */
	dir = dentry->d_parent->d_inode;
	di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;

	/* zero mask means no lease granted */
	if (lease->mask == 0)
		goto out_unlock;

	if (di->lease_gen == session->s_cap_gen &&
	    time_before(ttl, dentry->d_time))
		goto out_unlock;  /* we already have a newer lease. */

	/* a different session already holds the lease; don't steal it */
	if (di->lease_session && di->lease_session != session)
		goto out_unlock;

	ceph_dentry_lru_touch(dentry);

	if (!di->lease_session)
		di->lease_session = ceph_get_mds_session(session);
	di->lease_gen = session->s_cap_gen;
	di->lease_seq = le32_to_cpu(lease->seq);
	di->lease_renew_after = half_ttl;
	di->lease_renew_from = 0;
	dentry->d_time = ttl;
out_unlock:
	spin_unlock(&dentry->d_lock);
	return;
}
783
784/*
785 * splice a dentry to an inode.
786 * caller must hold directory i_mutex for this to be safe.
787 *
788 * we will only rehash the resulting dentry if @prehash is
789 * true; @prehash will be set to false (for the benefit of
790 * the caller) if we fail.
791 */
static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
				    bool *prehash)
{
	struct dentry *realdn;

	/* dn must be unhashed */
	if (!d_unhashed(dn))
		d_drop(dn);
	realdn = d_materialise_unique(dn, in);
	if (IS_ERR(realdn)) {
		pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
		       dn, in, ceph_vinop(in));
		if (prehash)
			*prehash = false; /* don't rehash on error */
		dn = realdn; /* note realdn contains the error */
		goto out;
	} else if (realdn) {
		/* an existing alias was found; drop ours and use it */
		dout("dn %p (%d) spliced with %p (%d) "
		     "inode %p ino %llx.%llx\n",
		     dn, atomic_read(&dn->d_count),
		     realdn, atomic_read(&realdn->d_count),
		     realdn->d_inode, ceph_vinop(realdn->d_inode));
		dput(dn);
		dn = realdn;
	} else {
		/* our dentry was attached directly */
		BUG_ON(!ceph_dentry(dn));

		dout("dn %p attached to %p ino %llx.%llx\n",
		     dn, dn->d_inode, ceph_vinop(dn->d_inode));
	}
	/* rehash unless caller asked us not to (or we errored above) */
	if ((!prehash || *prehash) && d_unhashed(dn))
		d_rehash(dn);
out:
	return dn;
}
827
828/*
829 * Set dentry's directory position based on the current dir's max, and
830 * order it in d_subdirs, so that dcache_readdir behaves.
831 */
static void ceph_set_dentry_offset(struct dentry *dn)
{
	struct dentry *dir = dn->d_parent;
	struct inode *inode = dn->d_parent->d_inode;
	struct ceph_dentry_info *di;

	BUG_ON(!inode);

	di = ceph_dentry(dn);

	/* hand out the directory's next readdir offset */
	spin_lock(&inode->i_lock);
	di->offset = ceph_inode(inode)->i_max_offset++;
	spin_unlock(&inode->i_lock);

	spin_lock(&dcache_lock);
	spin_lock(&dn->d_lock);
	/*
	 * NOTE(review): list_move_tail(entry, head) conventionally takes
	 * the moving entry first and the destination list head second;
	 * here the d_subdirs head is passed as the entry, which rotates
	 * the sibling list relative to dn rather than appending dn to
	 * d_subdirs.  Confirm this ordering is what dcache_readdir needs.
	 */
	list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
	dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
	     dn->d_u.d_child.prev, dn->d_u.d_child.next);
	spin_unlock(&dn->d_lock);
	spin_unlock(&dcache_lock);
}
854
855/*
856 * Incorporate results into the local cache. This is either just
857 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
858 * after a lookup).
859 *
860 * A reply may contain
861 * a directory inode along with a dentry.
862 * and/or a target inode
863 *
864 * Called with snap_rwsem (read).
865 */
int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
		    struct ceph_mds_session *session)
{
	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
	struct inode *in = NULL;
	struct ceph_mds_reply_inode *ininfo;
	struct ceph_vino vino;
	/* NOTE(review): 'i' is only ever incremented (once, below) and
	 * never read -- looks vestigial; confirm before removing */
	int i = 0;
	int err = 0;

	dout("fill_trace %p is_dentry %d is_target %d\n", req,
	     rinfo->head->is_dentry, rinfo->head->is_target);

#if 0
	/*
	 * Debugging hook:
	 *
	 * If we resend completed ops to a recovering mds, we get no
	 * trace.  Since that is very rare, pretend this is the case
	 * to ensure the 'no trace' handlers in the callers behave.
	 *
	 * Fill in inodes unconditionally to avoid breaking cap
	 * invariants.
	 */
	if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
		pr_info("fill_trace faking empty trace on %lld %s\n",
			req->r_tid, ceph_mds_op_name(rinfo->head->op));
		if (rinfo->head->is_dentry) {
			rinfo->head->is_dentry = 0;
			err = fill_inode(req->r_locked_dir,
					 &rinfo->diri, rinfo->dirfrag,
					 session, req->r_request_started, -1);
		}
		if (rinfo->head->is_target) {
			rinfo->head->is_target = 0;
			ininfo = rinfo->targeti.in;
			vino.ino = le64_to_cpu(ininfo->ino);
			vino.snap = le64_to_cpu(ininfo->snapid);
			in = ceph_get_inode(sb, vino);
			err = fill_inode(in, &rinfo->targeti, NULL,
					 session, req->r_request_started,
					 req->r_fmode);
			iput(in);
		}
	}
#endif

	/* empty trace: nothing to assimilate */
	if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
		dout("fill_trace reply is empty!\n");
		if (rinfo->head->result == 0 && req->r_locked_dir) {
			/* a successful op with no trace may have changed the
			 * dir; drop its completeness flag */
			struct ceph_inode_info *ci =
				ceph_inode(req->r_locked_dir);
			dout(" clearing %p complete (empty trace)\n",
			     req->r_locked_dir);
			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
			ci->i_release_count++;
		}
		return 0;
	}

	/* update the parent directory inode first */
	if (rinfo->head->is_dentry) {
		struct inode *dir = req->r_locked_dir;

		err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
				 session, req->r_request_started, -1,
				 &req->r_caps_reservation);
		if (err < 0)
			return err;
	}

	if (rinfo->head->is_dentry && !req->r_aborted) {
		/*
		 * lookup link rename   : null -> possibly existing inode
		 * mknod symlink mkdir  : null -> new inode
		 * unlink               : linked -> null
		 */
		struct inode *dir = req->r_locked_dir;
		struct dentry *dn = req->r_dentry;
		bool have_dir_cap, have_lease;

		BUG_ON(!dn);
		BUG_ON(!dir);
		BUG_ON(dn->d_parent->d_inode != dir);
		BUG_ON(ceph_ino(dir) !=
		       le64_to_cpu(rinfo->diri.in->ino));
		BUG_ON(ceph_snap(dir) !=
		       le64_to_cpu(rinfo->diri.in->snapid));

		/* do we have a lease on the whole dir? */
		have_dir_cap =
			(le32_to_cpu(rinfo->diri.in->cap.caps) &
			 CEPH_CAP_FILE_SHARED);

		/* do we have a dn lease? */
		have_lease = have_dir_cap ||
			(le16_to_cpu(rinfo->dlease->mask) &
			 CEPH_LOCK_DN);

		if (!have_lease)
			dout("fill_trace  no dentry lease or dir cap\n");

		/* rename? */
		if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
			dout(" src %p '%.*s' dst %p '%.*s'\n",
			     req->r_old_dentry,
			     req->r_old_dentry->d_name.len,
			     req->r_old_dentry->d_name.name,
			     dn, dn->d_name.len, dn->d_name.name);
			dout("fill_trace doing d_move %p -> %p\n",
			     req->r_old_dentry, dn);
			d_move(req->r_old_dentry, dn);
			dout(" src %p '%.*s' dst %p '%.*s'\n",
			     req->r_old_dentry,
			     req->r_old_dentry->d_name.len,
			     req->r_old_dentry->d_name.name,
			     dn, dn->d_name.len, dn->d_name.name);
			/* ensure target dentry is invalidated, despite
			   rehashing bug in vfs_rename_dir */
			dn->d_time = jiffies;
			ceph_dentry(dn)->lease_shared_gen = 0;
			/* take overwritten dentry's readdir offset */
			ceph_dentry(req->r_old_dentry)->offset =
				ceph_dentry(dn)->offset;
			dn = req->r_old_dentry;  /* use old_dentry */
			in = dn->d_inode;
		}

		/* null dentry? */
		if (!rinfo->head->is_target) {
			dout("fill_trace null dentry\n");
			if (dn->d_inode) {
				/* e.g. unlink: detach the inode */
				dout("d_delete %p\n", dn);
				d_delete(dn);
			} else {
				/* cache a negative dentry */
				dout("d_instantiate %p NULL\n", dn);
				d_instantiate(dn, NULL);
				if (have_lease && d_unhashed(dn))
					d_rehash(dn);
				update_dentry_lease(dn, rinfo->dlease,
						    session,
						    req->r_request_started);
			}
			goto done;
		}

		/* attach proper inode */
		ininfo = rinfo->targeti.in;
		vino.ino = le64_to_cpu(ininfo->ino);
		vino.snap = le64_to_cpu(ininfo->snapid);
		if (!dn->d_inode) {
			in = ceph_get_inode(sb, vino);
			if (IS_ERR(in)) {
				pr_err("fill_trace bad get_inode "
				       "%llx.%llx\n", vino.ino, vino.snap);
				err = PTR_ERR(in);
				d_delete(dn);
				goto done;
			}
			dn = splice_dentry(dn, in, &have_lease);
			if (IS_ERR(dn)) {
				err = PTR_ERR(dn);
				goto done;
			}
			req->r_dentry = dn;  /* may have spliced */
			ceph_set_dentry_offset(dn);
			igrab(in);
		} else if (ceph_ino(in) == vino.ino &&
			   ceph_snap(in) == vino.snap) {
			/* dentry already points at the right inode */
			igrab(in);
		} else {
			/* dentry points somewhere else; don't trust it */
			dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
			     dn, in, ceph_ino(in), ceph_snap(in),
			     vino.ino, vino.snap);
			have_lease = false;
			in = NULL;
		}

		if (have_lease)
			update_dentry_lease(dn, rinfo->dlease, session,
					    req->r_request_started);
		dout(" final dn %p\n", dn);
		i++;
	} else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
		   req->r_op == CEPH_MDS_OP_MKSNAP) {
		struct dentry *dn = req->r_dentry;

		/* fill out a snapdir LOOKUPSNAP dentry */
		BUG_ON(!dn);
		BUG_ON(!req->r_locked_dir);
		BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
		ininfo = rinfo->targeti.in;
		vino.ino = le64_to_cpu(ininfo->ino);
		vino.snap = le64_to_cpu(ininfo->snapid);
		in = ceph_get_inode(sb, vino);
		if (IS_ERR(in)) {
			pr_err("fill_inode get_inode badness %llx.%llx\n",
			       vino.ino, vino.snap);
			err = PTR_ERR(in);
			d_delete(dn);
			goto done;
		}
		dout(" linking snapped dir %p to dn %p\n", in, dn);
		dn = splice_dentry(dn, in, NULL);
		if (IS_ERR(dn)) {
			err = PTR_ERR(dn);
			goto done;
		}
		ceph_set_dentry_offset(dn);
		req->r_dentry = dn;  /* may have spliced */
		igrab(in);
		rinfo->head->is_dentry = 1;  /* fool notrace handlers */
	}

	/* finally, populate the target inode itself */
	if (rinfo->head->is_target) {
		vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);

		if (in == NULL || ceph_ino(in) != vino.ino ||
		    ceph_snap(in) != vino.snap) {
			in = ceph_get_inode(sb, vino);
			if (IS_ERR(in)) {
				err = PTR_ERR(in);
				goto done;
			}
		}
		req->r_target_inode = in;

		err = fill_inode(in,
				 &rinfo->targeti, NULL,
				 session, req->r_request_started,
				 (le32_to_cpu(rinfo->head->result) == 0) ?
				 req->r_fmode : -1,
				 &req->r_caps_reservation);
		if (err < 0) {
			pr_err("fill_inode badness %p %llx.%llx\n",
			       in, ceph_vinop(in));
			goto done;
		}
	}

done:
	dout("fill_trace done err=%d\n", err);
	return err;
}
1110
1111/*
1112 * Prepopulate our cache with readdir results, leases, etc.
1113 */
1114int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1115 struct ceph_mds_session *session)
1116{
1117 struct dentry *parent = req->r_dentry;
1118 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1119 struct qstr dname;
1120 struct dentry *dn;
1121 struct inode *in;
1122 int err = 0, i;
1123 struct inode *snapdir = NULL;
1124 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1125 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1126 struct ceph_dentry_info *di;
1127
1128 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1129 snapdir = ceph_get_snapdir(parent->d_inode);
1130 parent = d_find_alias(snapdir);
1131 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1132 rinfo->dir_nr, parent);
1133 } else {
1134 dout("readdir_prepopulate %d items under dn %p\n",
1135 rinfo->dir_nr, parent);
1136 if (rinfo->dir_dir)
1137 ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1138 }
1139
1140 for (i = 0; i < rinfo->dir_nr; i++) {
1141 struct ceph_vino vino;
1142
1143 dname.name = rinfo->dir_dname[i];
1144 dname.len = rinfo->dir_dname_len[i];
1145 dname.hash = full_name_hash(dname.name, dname.len);
1146
1147 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1148 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1149
1150retry_lookup:
1151 dn = d_lookup(parent, &dname);
1152 dout("d_lookup on parent=%p name=%.*s got %p\n",
1153 parent, dname.len, dname.name, dn);
1154
1155 if (!dn) {
1156 dn = d_alloc(parent, &dname);
1157 dout("d_alloc %p '%.*s' = %p\n", parent,
1158 dname.len, dname.name, dn);
1159 if (dn == NULL) {
1160 dout("d_alloc badness\n");
1161 err = -ENOMEM;
1162 goto out;
1163 }
1164 err = ceph_init_dentry(dn);
1165 if (err < 0)
1166 goto out;
1167 } else if (dn->d_inode &&
1168 (ceph_ino(dn->d_inode) != vino.ino ||
1169 ceph_snap(dn->d_inode) != vino.snap)) {
1170 dout(" dn %p points to wrong inode %p\n",
1171 dn, dn->d_inode);
1172 d_delete(dn);
1173 dput(dn);
1174 goto retry_lookup;
1175 } else {
1176 /* reorder parent's d_subdirs */
1177 spin_lock(&dcache_lock);
1178 spin_lock(&dn->d_lock);
1179 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1180 spin_unlock(&dn->d_lock);
1181 spin_unlock(&dcache_lock);
1182 }
1183
1184 di = dn->d_fsdata;
1185 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1186
1187 /* inode */
1188 if (dn->d_inode) {
1189 in = dn->d_inode;
1190 } else {
1191 in = ceph_get_inode(parent->d_sb, vino);
1192 if (in == NULL) {
1193 dout("new_inode badness\n");
1194 d_delete(dn);
1195 dput(dn);
1196 err = -ENOMEM;
1197 goto out;
1198 }
1199 dn = splice_dentry(dn, in, NULL);
1200 }
1201
1202 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1203 req->r_request_started, -1,
1204 &req->r_caps_reservation) < 0) {
1205 pr_err("fill_inode badness on %p\n", in);
1206 dput(dn);
1207 continue;
1208 }
1209 update_dentry_lease(dn, rinfo->dir_dlease[i],
1210 req->r_session, req->r_request_started);
1211 dput(dn);
1212 }
1213 req->r_did_prepopulate = true;
1214
1215out:
1216 if (snapdir) {
1217 iput(snapdir);
1218 dput(parent);
1219 }
1220 dout("readdir_prepopulate done\n");
1221 return err;
1222}
1223
1224int ceph_inode_set_size(struct inode *inode, loff_t size)
1225{
1226 struct ceph_inode_info *ci = ceph_inode(inode);
1227 int ret = 0;
1228
1229 spin_lock(&inode->i_lock);
1230 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1231 inode->i_size = size;
1232 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1233
1234 /* tell the MDS if we are approaching max_size */
1235 if ((size << 1) >= ci->i_max_size &&
1236 (ci->i_reported_size << 1) < ci->i_max_size)
1237 ret = 1;
1238
1239 spin_unlock(&inode->i_lock);
1240 return ret;
1241}
1242
1243/*
1244 * Write back inode data in a worker thread. (This can't be done
1245 * in the message handler context.)
1246 */
1247void ceph_queue_writeback(struct inode *inode)
1248{
1249 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1250 &ceph_inode(inode)->i_wb_work)) {
1251 dout("ceph_queue_writeback %p\n", inode);
1252 igrab(inode);
1253 } else {
1254 dout("ceph_queue_writeback %p failed\n", inode);
1255 }
1256}
1257
1258static void ceph_writeback_work(struct work_struct *work)
1259{
1260 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1261 i_wb_work);
1262 struct inode *inode = &ci->vfs_inode;
1263
1264 dout("writeback %p\n", inode);
1265 filemap_fdatawrite(&inode->i_data);
1266 iput(inode);
1267}
1268
1269/*
1270 * queue an async invalidation
1271 */
1272void ceph_queue_invalidate(struct inode *inode)
1273{
1274 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1275 &ceph_inode(inode)->i_pg_inv_work)) {
1276 dout("ceph_queue_invalidate %p\n", inode);
1277 igrab(inode);
1278 } else {
1279 dout("ceph_queue_invalidate %p failed\n", inode);
1280 }
1281}
1282
/*
 * invalidate any pages that are not dirty or under writeback.  this
 * includes pages that are clean and mapped.
 */
static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
{
	struct pagevec pvec;
	pgoff_t next = 0;
	int i;

	/* walk the whole mapping in batches of PAGEVEC_SIZE pages */
	pagevec_init(&pvec, 0);
	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			pgoff_t index;
			int skip_page =
				(PageDirty(page) || PageWriteback(page));

			/* also skip pages we can't lock without blocking */
			if (!skip_page)
				skip_page = !trylock_page(page);

			/*
			 * We really shouldn't be looking at the ->index of an
			 * unlocked page.  But we're not allowed to lock these
			 * pages.  So we rely upon nobody altering the ->index
			 * of this (pinned-by-us) page.
			 */
			index = page->index;
			if (index > next)
				next = index;
			next++;

			if (skip_page)
				continue;

			/* page is clean, not under writeback, and locked */
			generic_error_remove_page(mapping, page);
			unlock_page(page);
		}
		pagevec_release(&pvec);
		cond_resched();
	}
}
1325
/*
 * Invalidate inode pages in a worker thread.  (This can't be done
 * in the message handler context.)
 *
 * Uses i_rdcache_gen/i_rdcache_revoking to detect a racing read that
 * repopulated the page cache while we were invalidating.
 */
static void ceph_invalidate_work(struct work_struct *work)
{
	struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
						  i_pg_inv_work);
	struct inode *inode = &ci->vfs_inode;
	u32 orig_gen;
	int check = 0;

	spin_lock(&inode->i_lock);
	dout("invalidate_pages %p gen %d revoking %d\n", inode,
	     ci->i_rdcache_gen, ci->i_rdcache_revoking);
	if (ci->i_rdcache_gen == 0 ||
	    ci->i_rdcache_revoking != ci->i_rdcache_gen) {
		BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
		/* nevermind! */
		ci->i_rdcache_revoking = 0;
		spin_unlock(&inode->i_lock);
		goto out;
	}
	/* snapshot the generation we are invalidating for ... */
	orig_gen = ci->i_rdcache_gen;
	spin_unlock(&inode->i_lock);

	/* must not hold i_lock while touching the page cache */
	ceph_invalidate_nondirty_pages(inode->i_mapping);

	spin_lock(&inode->i_lock);
	/* ... so we can tell whether a read raced with us meanwhile */
	if (orig_gen == ci->i_rdcache_gen) {
		dout("invalidate_pages %p gen %d successful\n", inode,
		     ci->i_rdcache_gen);
		ci->i_rdcache_gen = 0;
		ci->i_rdcache_revoking = 0;
		check = 1;
	} else {
		dout("invalidate_pages %p gen %d raced, gen now %d\n",
		     inode, orig_gen, ci->i_rdcache_gen);
	}
	spin_unlock(&inode->i_lock);

	/* re-evaluate caps outside the lock if invalidation completed */
	if (check)
		ceph_check_caps(ci, 0, NULL);
out:
	iput(inode);	/* reference taken when the work was queued */
}
1372
1373
1374/*
1375 * called by trunc_wq; take i_mutex ourselves
1376 *
1377 * We also truncate in a separate thread as well.
1378 */
1379static void ceph_vmtruncate_work(struct work_struct *work)
1380{
1381 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1382 i_vmtruncate_work);
1383 struct inode *inode = &ci->vfs_inode;
1384
1385 dout("vmtruncate_work %p\n", inode);
1386 mutex_lock(&inode->i_mutex);
1387 __ceph_do_pending_vmtruncate(inode);
1388 mutex_unlock(&inode->i_mutex);
1389 iput(inode);
1390}
1391
1392/*
1393 * Queue an async vmtruncate. If we fail to queue work, we will handle
1394 * the truncation the next time we call __ceph_do_pending_vmtruncate.
1395 */
1396void ceph_queue_vmtruncate(struct inode *inode)
1397{
1398 struct ceph_inode_info *ci = ceph_inode(inode);
1399
1400 if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
1401 &ci->i_vmtruncate_work)) {
1402 dout("ceph_queue_vmtruncate %p\n", inode);
1403 igrab(inode);
1404 } else {
1405 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1406 inode, ci->i_truncate_pending);
1407 }
1408}
1409
/*
 * called with i_mutex held.
 *
 * Make sure any pending truncation is applied before doing anything
 * that may depend on it.
 */
void __ceph_do_pending_vmtruncate(struct inode *inode)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 to;
	int wrbuffer_refs, wake = 0;

retry:
	spin_lock(&inode->i_lock);
	if (ci->i_truncate_pending == 0) {
		dout("__do_pending_vmtruncate %p none pending\n", inode);
		spin_unlock(&inode->i_lock);
		return;
	}

	/*
	 * make sure any dirty snapped pages are flushed before we
	 * possibly truncate them.. so write AND block!
	 */
	if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
		dout("__do_pending_vmtruncate %p flushing snaps first\n",
		     inode);
		spin_unlock(&inode->i_lock);
		filemap_write_and_wait_range(&inode->i_data, 0,
					     inode->i_sb->s_maxbytes);
		/* state may have changed while we slept; re-check it all */
		goto retry;
	}

	/* capture target size and buffer refs under the lock */
	to = ci->i_truncate_size;
	wrbuffer_refs = ci->i_wrbuffer_ref;
	dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
	     ci->i_truncate_pending, to);
	spin_unlock(&inode->i_lock);

	/* drop page cache beyond the new size; i_lock must not be held */
	truncate_inode_pages(inode->i_mapping, to);

	spin_lock(&inode->i_lock);
	ci->i_truncate_pending--;
	if (ci->i_truncate_pending == 0)
		wake = 1;
	spin_unlock(&inode->i_lock);

	/* no dirty write buffers outstanding: re-examine our caps */
	if (wrbuffer_refs == 0)
		ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
	/* wake anyone waiting for the truncation to complete */
	if (wake)
		wake_up(&ci->i_cap_wq);
}
1462
1463
1464/*
1465 * symlinks
1466 */
1467static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1468{
1469 struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1470 nd_set_link(nd, ci->i_symlink);
1471 return NULL;
1472}
1473
/* inode operations for symlinks: readlink is generic, follow_link
 * returns the cached target string */
static const struct inode_operations ceph_symlink_iops = {
	.readlink = generic_readlink,
	.follow_link = ceph_sym_follow_link,
};
1478
/*
 * setattr
 *
 * For each attribute: if we hold the relevant EXCL cap, apply the
 * change locally and mark that cap dirty; otherwise encode the field
 * into an MDS SETATTR request (mask) and note which caps the request
 * should drop (release).  A single MDS round trip handles all fields
 * that could not be applied locally.
 */
int ceph_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct inode *parent_inode = dentry->d_parent->d_inode;
	const unsigned int ia_valid = attr->ia_valid;
	struct ceph_mds_request *req;
	struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
	int issued;
	int release = 0, dirtied = 0;	/* caps to drop / caps made dirty */
	int mask = 0;			/* CEPH_SETATTR_* fields for the MDS */
	int err = 0;

	/* snapshots are read-only */
	if (ceph_snap(inode) != CEPH_NOSNAP)
		return -EROFS;

	/* apply any queued truncation before changing attributes */
	__ceph_do_pending_vmtruncate(inode);

	err = inode_change_ok(inode, attr);
	if (err != 0)
		return err;

	/* allocate up front; only sent if mask ends up nonzero */
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
				       USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);

	spin_lock(&inode->i_lock);
	issued = __ceph_caps_issued(ci, NULL);
	dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));

	if (ia_valid & ATTR_UID) {
		dout("setattr %p uid %d -> %d\n", inode,
		     inode->i_uid, attr->ia_uid);
		if (issued & CEPH_CAP_AUTH_EXCL) {
			/* we own AUTH state: change it locally */
			inode->i_uid = attr->ia_uid;
			dirtied |= CEPH_CAP_AUTH_EXCL;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   attr->ia_uid != inode->i_uid) {
			/* ask the MDS (skip if value is already current) */
			req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
			mask |= CEPH_SETATTR_UID;
			release |= CEPH_CAP_AUTH_SHARED;
		}
	}
	if (ia_valid & ATTR_GID) {
		dout("setattr %p gid %d -> %d\n", inode,
		     inode->i_gid, attr->ia_gid);
		if (issued & CEPH_CAP_AUTH_EXCL) {
			inode->i_gid = attr->ia_gid;
			dirtied |= CEPH_CAP_AUTH_EXCL;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   attr->ia_gid != inode->i_gid) {
			req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
			mask |= CEPH_SETATTR_GID;
			release |= CEPH_CAP_AUTH_SHARED;
		}
	}
	if (ia_valid & ATTR_MODE) {
		dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
		     attr->ia_mode);
		if (issued & CEPH_CAP_AUTH_EXCL) {
			inode->i_mode = attr->ia_mode;
			dirtied |= CEPH_CAP_AUTH_EXCL;
		} else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
			   attr->ia_mode != inode->i_mode) {
			req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
			mask |= CEPH_SETATTR_MODE;
			release |= CEPH_CAP_AUTH_SHARED;
		}
	}

	if (ia_valid & ATTR_ATIME) {
		dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
		     inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
		     attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
		if (issued & CEPH_CAP_FILE_EXCL) {
			/* time_warp_seq orders our local time changes
			 * against MDS-supplied times */
			ci->i_time_warp_seq++;
			inode->i_atime = attr->ia_atime;
			dirtied |= CEPH_CAP_FILE_EXCL;
		} else if ((issued & CEPH_CAP_FILE_WR) &&
			   timespec_compare(&inode->i_atime,
					    &attr->ia_atime) < 0) {
			/* with WR we may only move the time forward */
			inode->i_atime = attr->ia_atime;
			dirtied |= CEPH_CAP_FILE_WR;
		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
			   !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
			ceph_encode_timespec(&req->r_args.setattr.atime,
					     &attr->ia_atime);
			mask |= CEPH_SETATTR_ATIME;
			/* NOTE(review): drops FILE_CACHE here while the
			 * mtime/size branches drop FILE_SHARED — confirm
			 * this asymmetry is intentional */
			release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
				CEPH_CAP_FILE_WR;
		}
	}
	if (ia_valid & ATTR_MTIME) {
		dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
		     inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
		     attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
		if (issued & CEPH_CAP_FILE_EXCL) {
			ci->i_time_warp_seq++;
			inode->i_mtime = attr->ia_mtime;
			dirtied |= CEPH_CAP_FILE_EXCL;
		} else if ((issued & CEPH_CAP_FILE_WR) &&
			   timespec_compare(&inode->i_mtime,
					    &attr->ia_mtime) < 0) {
			inode->i_mtime = attr->ia_mtime;
			dirtied |= CEPH_CAP_FILE_WR;
		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
			   !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
			ceph_encode_timespec(&req->r_args.setattr.mtime,
					     &attr->ia_mtime);
			mask |= CEPH_SETATTR_MTIME;
			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
				CEPH_CAP_FILE_WR;
		}
	}
	if (ia_valid & ATTR_SIZE) {
		dout("setattr %p size %lld -> %lld\n", inode,
		     inode->i_size, attr->ia_size);
		if (attr->ia_size > inode->i_sb->s_maxbytes) {
			err = -EINVAL;
			goto out;
		}
		if ((issued & CEPH_CAP_FILE_EXCL) &&
		    attr->ia_size > inode->i_size) {
			/* local extend only; shrinking goes to the MDS */
			inode->i_size = attr->ia_size;
			inode->i_blocks =
				(attr->ia_size + (1 << 9) - 1) >> 9;
			inode->i_ctime = attr->ia_ctime;
			ci->i_reported_size = attr->ia_size;
			dirtied |= CEPH_CAP_FILE_EXCL;
		} else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
			   attr->ia_size != inode->i_size) {
			req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
			req->r_args.setattr.old_size =
				cpu_to_le64(inode->i_size);
			mask |= CEPH_SETATTR_SIZE;
			release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
				CEPH_CAP_FILE_WR;
		}
	}

	/* these do nothing */
	if (ia_valid & ATTR_CTIME) {
		bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
					 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
		dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
		     inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
		     attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
		     only ? "ctime only" : "ignored");
		inode->i_ctime = attr->ia_ctime;
		if (only) {
			/*
			 * if kernel wants to dirty ctime but nothing else,
			 * we need to choose a cap to dirty under, or do
			 * a almost-no-op setattr
			 */
			if (issued & CEPH_CAP_AUTH_EXCL)
				dirtied |= CEPH_CAP_AUTH_EXCL;
			else if (issued & CEPH_CAP_FILE_EXCL)
				dirtied |= CEPH_CAP_FILE_EXCL;
			else if (issued & CEPH_CAP_XATTR_EXCL)
				dirtied |= CEPH_CAP_XATTR_EXCL;
			else
				mask |= CEPH_SETATTR_CTIME;
		}
	}
	if (ia_valid & ATTR_FILE)
		dout("setattr %p ATTR_FILE ... hrm!\n", inode);

	if (dirtied) {
		__ceph_mark_dirty_caps(ci, dirtied);
		inode->i_ctime = CURRENT_TIME;
	}

	/* only drop caps we actually hold */
	release &= issued;
	spin_unlock(&inode->i_lock);

	/* anything we could not apply locally goes to the auth MDS */
	if (mask) {
		req->r_inode = igrab(inode);
		req->r_inode_drop = release;
		req->r_args.setattr.mask = cpu_to_le32(mask);
		req->r_num_caps = 1;
		err = ceph_mdsc_do_request(mdsc, parent_inode, req);
	}
	dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
	     ceph_cap_string(dirtied), mask);

	ceph_mdsc_put_request(req);
	__ceph_do_pending_vmtruncate(inode);
	return err;
out:
	/* error exit with i_lock still held (ATTR_SIZE > s_maxbytes) */
	spin_unlock(&inode->i_lock);
	ceph_mdsc_put_request(req);
	return err;
}
1677
1678/*
1679 * Verify that we have a lease on the given mask. If not,
1680 * do a getattr against an mds.
1681 */
1682int ceph_do_getattr(struct inode *inode, int mask)
1683{
1684 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
1685 struct ceph_mds_client *mdsc = &client->mdsc;
1686 struct ceph_mds_request *req;
1687 int err;
1688
1689 if (ceph_snap(inode) == CEPH_SNAPDIR) {
1690 dout("do_getattr inode %p SNAPDIR\n", inode);
1691 return 0;
1692 }
1693
1694 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
1695 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1696 return 0;
1697
1698 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1699 if (IS_ERR(req))
1700 return PTR_ERR(req);
1701 req->r_inode = igrab(inode);
1702 req->r_num_caps = 1;
1703 req->r_args.getattr.mask = cpu_to_le32(mask);
1704 err = ceph_mdsc_do_request(mdsc, NULL, req);
1705 ceph_mdsc_put_request(req);
1706 dout("do_getattr result=%d\n", err);
1707 return err;
1708}
1709
1710
1711/*
1712 * Check inode permissions. We verify we have a valid value for
1713 * the AUTH cap, then call the generic handler.
1714 */
1715int ceph_permission(struct inode *inode, int mask)
1716{
1717 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1718
1719 if (!err)
1720 err = generic_permission(inode, mask, NULL);
1721 return err;
1722}
1723
1724/*
1725 * Get all attributes. Hopefully somedata we'll have a statlite()
1726 * and can limit the fields we require to be accurate.
1727 */
1728int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1729 struct kstat *stat)
1730{
1731 struct inode *inode = dentry->d_inode;
1732 struct ceph_inode_info *ci = ceph_inode(inode);
1733 int err;
1734
1735 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1736 if (!err) {
1737 generic_fillattr(inode, stat);
1738 stat->ino = inode->i_ino;
1739 if (ceph_snap(inode) != CEPH_NOSNAP)
1740 stat->dev = ceph_snap(inode);
1741 else
1742 stat->dev = 0;
1743 if (S_ISDIR(inode->i_mode)) {
1744 stat->size = ci->i_rbytes;
1745 stat->blocks = 0;
1746 stat->blksize = 65536;
1747 }
1748 }
1749 return err;
1750}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..8a5bcae62846
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
1#include <linux/in.h>
2
3#include "ioctl.h"
4#include "super.h"
5#include "ceph_debug.h"
6
7
8/*
9 * ioctls
10 */
11
12/*
13 * get and set the file layout
14 */
15static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
16{
17 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
18 struct ceph_ioctl_layout l;
19 int err;
20
21 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
22 if (!err) {
23 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
24 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
25 l.object_size = ceph_file_layout_object_size(ci->i_layout);
26 l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
27 l.preferred_osd =
28 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
29 if (copy_to_user(arg, &l, sizeof(l)))
30 return -EFAULT;
31 }
32
33 return err;
34}
35
/* change the file's striping layout via an MDS SETLAYOUT request */
static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
{
	struct inode *inode = file->f_dentry->d_inode;
	struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
	struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
	struct ceph_mds_request *req;
	struct ceph_ioctl_layout l;
	int err, i;

	/* copy and validate */
	if (copy_from_user(&l, arg, sizeof(l)))
		return -EFAULT;

	/* object size and stripe unit must be page-aligned; stripe unit
	 * must be nonzero; object size (if set) must be a multiple of
	 * the stripe unit */
	if ((l.object_size & ~PAGE_MASK) ||
	    (l.stripe_unit & ~PAGE_MASK) ||
	    !l.stripe_unit ||
	    (l.object_size &&
	     (unsigned)l.object_size % (unsigned)l.stripe_unit))
		return -EINVAL;

	/* make sure it's a valid data pool */
	if (l.data_pool > 0) {
		/* mdsc->mutex guards the mdsmap while we scan its pools */
		mutex_lock(&mdsc->mutex);
		err = -EINVAL;
		for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
			if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
				err = 0;
				break;
			}
		mutex_unlock(&mdsc->mutex);
		if (err)
			return err;
	}

	/* the layout change itself is performed by the auth MDS */
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
				       USE_AUTH_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = igrab(inode);
	/* NOTE(review): request drops FILE_SHARED|FILE_EXCL caps with it
	 * — confirm against r_inode_drop semantics in mds_client */
	req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;

	/* encode the validated layout little-endian for the wire */
	req->r_args.setlayout.layout.fl_stripe_unit =
		cpu_to_le32(l.stripe_unit);
	req->r_args.setlayout.layout.fl_stripe_count =
		cpu_to_le32(l.stripe_count);
	req->r_args.setlayout.layout.fl_object_size =
		cpu_to_le32(l.object_size);
	req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
	req->r_args.setlayout.layout.fl_pg_preferred =
		cpu_to_le32(l.preferred_osd);

	err = ceph_mdsc_do_request(mdsc, parent_inode, req);
	ceph_mdsc_put_request(req);
	return err;
}
91
/*
 * Return object name, size/offset information, and location (OSD
 * number, network address) for a given file offset.
 */
static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
{
	struct ceph_ioctl_dataloc dl;
	struct inode *inode = file->f_dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
	u64 len = 1, olen;
	u64 tmp;
	struct ceph_object_layout ol;
	struct ceph_pg pgid;

	/* copy and validate */
	if (copy_from_user(&dl, arg, sizeof(dl)))
		return -EFAULT;

	/* hold the osdmap steady while we compute the mapping */
	down_read(&osdc->map_sem);
	/* map file offset -> (object number, offset within object) */
	ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
				      &dl.object_no, &dl.object_offset, &olen);
	/* round file_offset down to the start of the containing object */
	dl.file_offset -= dl.object_offset;
	dl.object_size = ceph_file_layout_object_size(ci->i_layout);
	dl.block_size = ceph_file_layout_su(ci->i_layout);

	/* block_offset = object_offset % block_size */
	tmp = dl.object_offset;
	dl.block_offset = do_div(tmp, dl.block_size);

	/* object names are "<ino hex>.<object number, 8 hex digits>" */
	snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
		 ceph_ino(inode), dl.object_no);
	ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
				osdc->osdmap);

	/* resolve placement group -> primary OSD -> network address */
	pgid = ol.ol_pgid;
	dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
	if (dl.osd >= 0) {
		struct ceph_entity_addr *a =
			ceph_osd_addr(osdc->osdmap, dl.osd);
		if (a)
			memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
	} else {
		/* no primary OSD for this pg: report a zeroed address */
		memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
	}
	up_read(&osdc->map_sem);

	/* send result back to user */
	if (copy_to_user(arg, &dl, sizeof(dl)))
		return -EFAULT;

	return 0;
}
145
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
149 switch (cmd) {
150 case CEPH_IOC_GET_LAYOUT:
151 return ceph_ioctl_get_layout(file, (void __user *)arg);
152
153 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg);
155
156 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
158 }
159 return -ENOTTY;
160}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
1#ifndef FS_CEPH_IOCTL_H
2#define FS_CEPH_IOCTL_H
3
4#include <linux/ioctl.h>
5#include <linux/types.h>
6
7#define CEPH_IOCTL_MAGIC 0x97
8
/* just use u64 to align sanely on all archs */
struct ceph_ioctl_layout {
	/* stripe_unit/object_size are in bytes; stripe_count is a count */
	__u64 stripe_unit, stripe_count, object_size;
	/* RADOS pool id holding the file data */
	__u64 data_pool;
	/* preferred osd; carried as a 32-bit value on the wire
	 * (see the s32 cast in ceph_ioctl_get_layout) */
	__s64 preferred_osd;
};
15
16#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout)
20
/*
 * Extract identity, address of the OSD and object storing a given
 * file offset.
 *
 * file_offset is rounded down in place to the start of the containing
 * object; all other fields are pure outputs.
 */
struct ceph_ioctl_dataloc {
	__u64 file_offset;           /* in+out: file offset */
	__u64 object_offset;         /* out: offset in object */
	__u64 object_no;             /* out: object # */
	__u64 object_size;           /* out: object size */
	char object_name[64];        /* out: object name */
	__u64 block_offset;          /* out: offset in block */
	__u64 block_size;            /* out: block length */
	__s64 osd;                   /* out: osd # */
	struct sockaddr_storage osd_addr; /* out: osd address */
};
36
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc)
39
40#endif
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..a2600101ec22
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3021 @@
1#include "ceph_debug.h"
2
3#include <linux/wait.h>
4#include <linux/sched.h>
5
6#include "mds_client.h"
7#include "mon_client.h"
8#include "super.h"
9#include "messenger.h"
10#include "decode.h"
11#include "auth.h"
12#include "pagelist.h"
13
14/*
15 * A cluster of MDS (metadata server) daemons is responsible for
16 * managing the file system namespace (the directory hierarchy and
17 * inodes) and for coordinating shared access to storage. Metadata is
 * partitioned hierarchically across a number of servers, and that
19 * partition varies over time as the cluster adjusts the distribution
20 * in order to balance load.
21 *
 * The MDS client is primarily responsible for managing synchronous
23 * metadata requests for operations like open, unlink, and so forth.
24 * If there is a MDS failure, we find out about it when we (possibly
25 * request and) receive a new MDS map, and can resubmit affected
26 * requests.
27 *
28 * For the most part, though, we take advantage of a lossless
29 * communications channel to the MDS, and do not need to worry about
30 * timing out or resubmitting requests.
31 *
32 * We maintain a stateful "session" with each MDS we interact with.
 * Within each session, we send periodic heartbeat messages to ensure
 * any capabilities or leases we have been issued remain valid.  If
35 * the session times out and goes stale, our leases and capabilities
36 * are no longer valid.
37 */
38
39static void __wake_requests(struct ceph_mds_client *mdsc,
40 struct list_head *head);
41
42const static struct ceph_connection_operations mds_con_ops;
43
44
45/*
46 * mds reply parsing
47 */
48
/*
 * parse individual inode info
 *
 * Advance *p past one ceph_mds_reply_inode (including its variable
 * length fragtree split array), the symlink target, and the xattr
 * blob, recording pointers into the message buffer in @info (no data
 * is copied).  Returns 0 on success, -EIO on a truncated buffer.
 */
static int parse_reply_info_in(void **p, void *end,
			       struct ceph_mds_reply_info_in *info)
{
	int err = -EIO;

	info->in = *p;
	/*
	 * NOTE(review): nsplits is read and *p advanced before any bounds
	 * check against end; the _safe decodes below catch the overrun
	 * afterward, but confirm info->in itself cannot be read past the
	 * message front here.
	 */
	*p += sizeof(struct ceph_mds_reply_inode) +
		sizeof(*info->in->fragtree.splits) *
		le32_to_cpu(info->in->fragtree.nsplits);

	/* length-prefixed symlink target (not NUL-terminated) */
	ceph_decode_32_safe(p, end, info->symlink_len, bad);
	ceph_decode_need(p, end, info->symlink_len, bad);
	info->symlink = *p;
	*p += info->symlink_len;

	/* xattr blob, left in place for later parsing */
	ceph_decode_32_safe(p, end, info->xattr_len, bad);
	ceph_decode_need(p, end, info->xattr_len, bad);
	info->xattr_data = *p;
	*p += info->xattr_len;
	return 0;
bad:
	return err;
}
75
/*
 * parse a normal reply, which may contain a (dir+)dentry and/or a
 * target inode.
 *
 * On success the info_parsed pointers (diri, dirfrag, dname, dlease,
 * targeti) all reference the message buffer in place.  The trace
 * section must be consumed exactly; anything left over is -EIO.
 */
static int parse_reply_info_trace(void **p, void *end,
				  struct ceph_mds_reply_info_parsed *info)
{
	int err;

	if (info->head->is_dentry) {
		/* parent directory inode */
		err = parse_reply_info_in(p, end, &info->diri);
		if (err < 0)
			goto out_bad;

		/* dirfrag record plus its ndist-entry replication list */
		if (unlikely(*p + sizeof(*info->dirfrag) > end))
			goto bad;
		info->dirfrag = *p;
		*p += sizeof(*info->dirfrag) +
			sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
		if (unlikely(*p > end))
			goto bad;

		/* dentry name, then its lease */
		ceph_decode_32_safe(p, end, info->dname_len, bad);
		ceph_decode_need(p, end, info->dname_len, bad);
		info->dname = *p;
		*p += info->dname_len;
		info->dlease = *p;
		/* no explicit bounds check here; the *p != end test below
		 * rejects a truncated lease */
		*p += sizeof(*info->dlease);
	}

	if (info->head->is_target) {
		err = parse_reply_info_in(p, end, &info->targeti);
		if (err < 0)
			goto out_bad;
	}

	if (unlikely(*p != end))
		goto bad;
	return 0;

bad:
	err = -EIO;
out_bad:
	pr_err("problem parsing mds trace %d\n", err);
	return err;
}
122
/*
 * parse readdir results
 *
 * Decode the dirfrag header, the end/complete flags, and then @num
 * (dentry name, lease, inode) triples.  All per-entry metadata arrays
 * are carved out of one kcalloc'd buffer anchored at info->dir_in,
 * which is the single allocation freed by destroy_reply_info().
 */
static int parse_reply_info_dir(void **p, void *end,
				struct ceph_mds_reply_info_parsed *info)
{
	u32 num, i = 0;
	int err;

	/* dirfrag record plus its ndist-entry replication list */
	info->dir_dir = *p;
	if (*p + sizeof(*info->dir_dir) > end)
		goto bad;
	*p += sizeof(*info->dir_dir) +
		sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
	if (*p > end)
		goto bad;

	/* entry count, then two u8 flags */
	ceph_decode_need(p, end, sizeof(num) + 2, bad);
	num = ceph_decode_32(p);
	info->dir_end = ceph_decode_8(p);
	info->dir_complete = ceph_decode_8(p);
	if (num == 0)
		goto done;

	/* alloc large array */
	/* one buffer holds all four parallel arrays; see the pointer
	 * carving just below */
	info->dir_nr = num;
	info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
			       sizeof(*info->dir_dname) +
			       sizeof(*info->dir_dname_len) +
			       sizeof(*info->dir_dlease),
			       GFP_NOFS);
	if (info->dir_in == NULL) {
		err = -ENOMEM;
		goto out_bad;
	}
	info->dir_dname = (void *)(info->dir_in + num);
	info->dir_dname_len = (void *)(info->dir_dname + num);
	info->dir_dlease = (void *)(info->dir_dname_len + num);

	while (num) {
		/* dentry */
		ceph_decode_need(p, end, sizeof(u32)*2, bad);
		info->dir_dname_len[i] = ceph_decode_32(p);
		ceph_decode_need(p, end, info->dir_dname_len[i], bad);
		info->dir_dname[i] = *p;
		*p += info->dir_dname_len[i];
		dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
		     info->dir_dname[i]);
		/* NOTE(review): the lease is consumed without an explicit
		 * bounds check; the trailing *p != end test is what rejects
		 * a truncated buffer -- confirm */
		info->dir_dlease[i] = *p;
		*p += sizeof(struct ceph_mds_reply_lease);

		/* inode */
		err = parse_reply_info_in(p, end, &info->dir_in[i]);
		if (err < 0)
			goto out_bad;
		i++;
		num--;
	}

done:
	if (*p != end)
		goto bad;
	return 0;

bad:
	err = -EIO;
out_bad:
	pr_err("problem parsing dir contents %d\n", err);
	return err;
}
193
/*
 * parse entire mds reply
 *
 * Layout after the ceph_mds_reply_head: a length-prefixed trace
 * section, a length-prefixed readdir section, and a length-prefixed
 * snap blob (kept in place, not decoded here).  The message must be
 * consumed exactly.
 */
static int parse_reply_info(struct ceph_msg *msg,
			    struct ceph_mds_reply_info_parsed *info)
{
	void *p, *end;
	u32 len;
	int err;

	info->head = msg->front.iov_base;
	p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
	end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);

	/* trace */
	ceph_decode_32_safe(&p, end, len, bad);
	if (len > 0) {
		err = parse_reply_info_trace(&p, p+len, info);
		if (err < 0)
			goto out_bad;
	}

	/* dir content */
	ceph_decode_32_safe(&p, end, len, bad);
	if (len > 0) {
		err = parse_reply_info_dir(&p, p+len, info);
		if (err < 0)
			goto out_bad;
	}

	/* snap blob */
	ceph_decode_32_safe(&p, end, len, bad);
	info->snapblob_len = len;
	info->snapblob = p;
	p += len;

	if (p != end)
		goto bad;
	return 0;

bad:
	err = -EIO;
out_bad:
	pr_err("mds parse_reply err %d\n", err);
	return err;
}
240
/*
 * Free the single allocation made while parsing a reply: dir_in
 * anchors the combined buffer that also holds dir_dname,
 * dir_dname_len, and dir_dlease.  kfree(NULL) is a no-op, so this is
 * safe on a reply without readdir data.
 */
static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
{
	kfree(info->dir_in);
}
245
246
247/*
248 * sessions
249 */
250static const char *session_state_name(int s)
251{
252 switch (s) {
253 case CEPH_MDS_SESSION_NEW: return "new";
254 case CEPH_MDS_SESSION_OPENING: return "opening";
255 case CEPH_MDS_SESSION_OPEN: return "open";
256 case CEPH_MDS_SESSION_HUNG: return "hung";
257 case CEPH_MDS_SESSION_CLOSING: return "closing";
258 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
259 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
260 default: return "???";
261 }
262}
263
264static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
265{
266 if (atomic_inc_not_zero(&s->s_ref)) {
267 dout("mdsc get_session %p %d -> %d\n", s,
268 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
269 return s;
270 } else {
271 dout("mdsc get_session %p 0 -- FAIL", s);
272 return NULL;
273 }
274}
275
/*
 * Drop a session reference.  On the final put, destroy the session's
 * auth authorizer (if one was created) via the monitor client's auth
 * ops, then free the session itself.
 */
void ceph_put_mds_session(struct ceph_mds_session *s)
{
	dout("mdsc put_session %p %d -> %d\n", s,
	     atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
	if (atomic_dec_and_test(&s->s_ref)) {
		if (s->s_authorizer)
			s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
				s->s_mdsc->client->monc.auth, s->s_authorizer);
		kfree(s);
	}
}
287
288/*
289 * called under mdsc->mutex
290 */
291struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
292 int mds)
293{
294 struct ceph_mds_session *session;
295
296 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
297 return NULL;
298 session = mdsc->sessions[mds];
299 dout("lookup_mds_session %p %d\n", session,
300 atomic_read(&session->s_ref));
301 get_session(session);
302 return session;
303}
304
305static bool __have_session(struct ceph_mds_client *mdsc, int mds)
306{
307 if (mds >= mdsc->max_sessions)
308 return false;
309 return mdsc->sessions[mds];
310}
311
312static int __verify_registered_session(struct ceph_mds_client *mdsc,
313 struct ceph_mds_session *s)
314{
315 if (s->s_mds >= mdsc->max_sessions ||
316 mdsc->sessions[s->s_mds] != s)
317 return -ENOENT;
318 return 0;
319}
320
321/*
322 * create+register a new session for given mds.
323 * called under mdsc->mutex.
324 */
325static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
326 int mds)
327{
328 struct ceph_mds_session *s;
329
330 s = kzalloc(sizeof(*s), GFP_NOFS);
331 s->s_mdsc = mdsc;
332 s->s_mds = mds;
333 s->s_state = CEPH_MDS_SESSION_NEW;
334 s->s_ttl = 0;
335 s->s_seq = 0;
336 mutex_init(&s->s_mutex);
337
338 ceph_con_init(mdsc->client->msgr, &s->s_con);
339 s->s_con.private = s;
340 s->s_con.ops = &mds_con_ops;
341 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
342 s->s_con.peer_name.num = cpu_to_le64(mds);
343
344 spin_lock_init(&s->s_cap_lock);
345 s->s_cap_gen = 0;
346 s->s_cap_ttl = 0;
347 s->s_renew_requested = 0;
348 s->s_renew_seq = 0;
349 INIT_LIST_HEAD(&s->s_caps);
350 s->s_nr_caps = 0;
351 s->s_trim_caps = 0;
352 atomic_set(&s->s_ref, 1);
353 INIT_LIST_HEAD(&s->s_waiting);
354 INIT_LIST_HEAD(&s->s_unsafe);
355 s->s_num_cap_releases = 0;
356 s->s_cap_iterator = NULL;
357 INIT_LIST_HEAD(&s->s_cap_releases);
358 INIT_LIST_HEAD(&s->s_cap_releases_done);
359 INIT_LIST_HEAD(&s->s_cap_flushing);
360 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
361
362 dout("register_session mds%d\n", mds);
363 if (mds >= mdsc->max_sessions) {
364 int newmax = 1 << get_count_order(mds+1);
365 struct ceph_mds_session **sa;
366
367 dout("register_session realloc to %d\n", newmax);
368 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
369 if (sa == NULL)
370 goto fail_realloc;
371 if (mdsc->sessions) {
372 memcpy(sa, mdsc->sessions,
373 mdsc->max_sessions * sizeof(void *));
374 kfree(mdsc->sessions);
375 }
376 mdsc->sessions = sa;
377 mdsc->max_sessions = newmax;
378 }
379 mdsc->sessions[mds] = s;
380 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
381
382 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
383
384 return s;
385
386fail_realloc:
387 kfree(s);
388 return ERR_PTR(-ENOMEM);
389}
390
/*
 * Remove the session from the sessions[] array, close its connection,
 * and drop the array's reference.  Callers holding their own reference
 * may continue to use the session until they put it.
 *
 * called under mdsc->mutex
 */
static void __unregister_session(struct ceph_mds_client *mdsc,
			       struct ceph_mds_session *s)
{
	dout("__unregister_session mds%d %p\n", s->s_mds, s);
	BUG_ON(mdsc->sessions[s->s_mds] != s);
	mdsc->sessions[s->s_mds] = NULL;
	ceph_con_close(&s->s_con);
	ceph_put_mds_session(s);
}
403
404/*
405 * drop session refs in request.
406 *
407 * should be last request ref, or hold mdsc->mutex
408 */
409static void put_request_session(struct ceph_mds_request *req)
410{
411 if (req->r_session) {
412 ceph_put_mds_session(req->r_session);
413 req->r_session = NULL;
414 }
415}
416
/*
 * kref release: tear down an mds request once the last reference is
 * dropped.  Puts the request/reply messages, releases the CAP_PIN
 * references taken on the inode, locked dir, and old dentry's parent,
 * drops inode/dentry references, frees the path strings, puts the
 * session ref, and returns any unused reserved caps.
 */
void ceph_mdsc_release_request(struct kref *kref)
{
	struct ceph_mds_request *req = container_of(kref,
						    struct ceph_mds_request,
						    r_kref);
	if (req->r_request)
		ceph_msg_put(req->r_request);
	if (req->r_reply) {
		ceph_msg_put(req->r_reply);
		/* frees the parsed readdir buffer, if any */
		destroy_reply_info(&req->r_reply_info);
	}
	if (req->r_inode) {
		/* drop the pin taken when the request was set up */
		ceph_put_cap_refs(ceph_inode(req->r_inode),
				  CEPH_CAP_PIN);
		iput(req->r_inode);
	}
	if (req->r_locked_dir)
		ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
				  CEPH_CAP_PIN);
	if (req->r_target_inode)
		iput(req->r_target_inode);
	if (req->r_dentry)
		dput(req->r_dentry);
	if (req->r_old_dentry) {
		/* rename source: unpin its parent dir, then drop dentry */
		ceph_put_cap_refs(
			ceph_inode(req->r_old_dentry->d_parent->d_inode),
			CEPH_CAP_PIN);
		dput(req->r_old_dentry);
	}
	kfree(req->r_path1);
	kfree(req->r_path2);
	put_request_session(req);
	ceph_unreserve_caps(&req->r_caps_reservation);
	kfree(req);
}
452
453/*
 * lookup request, bump ref if found.
455 *
456 * called under mdsc->mutex.
457 */
458static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
459 u64 tid)
460{
461 struct ceph_mds_request *req;
462 struct rb_node *n = mdsc->request_tree.rb_node;
463
464 while (n) {
465 req = rb_entry(n, struct ceph_mds_request, r_node);
466 if (tid < req->r_tid)
467 n = n->rb_left;
468 else if (tid > req->r_tid)
469 n = n->rb_right;
470 else {
471 ceph_mdsc_get_request(req);
472 return req;
473 }
474 }
475 return NULL;
476}
477
478static void __insert_request(struct ceph_mds_client *mdsc,
479 struct ceph_mds_request *new)
480{
481 struct rb_node **p = &mdsc->request_tree.rb_node;
482 struct rb_node *parent = NULL;
483 struct ceph_mds_request *req = NULL;
484
485 while (*p) {
486 parent = *p;
487 req = rb_entry(parent, struct ceph_mds_request, r_node);
488 if (new->r_tid < req->r_tid)
489 p = &(*p)->rb_left;
490 else if (new->r_tid > req->r_tid)
491 p = &(*p)->rb_right;
492 else
493 BUG();
494 }
495
496 rb_link_node(&new->r_node, parent, p);
497 rb_insert_color(&new->r_node, &mdsc->request_tree);
498}
499
/*
 * Register an in-flight request, and assign a tid.  Link to the
 * directory being modified (if any) via its i_unsafe_dirops list.
 *
 * Called under mdsc->mutex.
 */
static void __register_request(struct ceph_mds_client *mdsc,
			       struct ceph_mds_request *req,
			       struct inode *dir)
{
	req->r_tid = ++mdsc->last_tid;
	if (req->r_num_caps)
		ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
	dout("__register_request %p tid %lld\n", req, req->r_tid);
	/* tree's reference; dropped in __unregister_request */
	ceph_mdsc_get_request(req);
	__insert_request(mdsc, req);

	if (dir) {
		struct ceph_inode_info *ci = ceph_inode(dir);

		spin_lock(&ci->i_unsafe_lock);
		req->r_unsafe_dir = dir;
		list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
		spin_unlock(&ci->i_unsafe_lock);
	}
}
526
/*
 * Remove the request from the request rbtree, drop the tree's
 * reference, and unlink it from the directory's unsafe-ops list.
 *
 * Called under mdsc->mutex.
 */
static void __unregister_request(struct ceph_mds_client *mdsc,
				 struct ceph_mds_request *req)
{
	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
	rb_erase(&req->r_node, &mdsc->request_tree);
	ceph_mdsc_put_request(req);

	if (req->r_unsafe_dir) {
		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);

		spin_lock(&ci->i_unsafe_lock);
		list_del_init(&req->r_unsafe_dir_item);
		spin_unlock(&ci->i_unsafe_lock);
	}
}
542
/*
 * Choose mds to send request to next.  If there is a hint set in the
 * request (e.g., due to a prior forward hint from the mds), use that.
 * Otherwise, consult frag tree and/or caps to identify the
 * appropriate mds.  If all else fails, choose randomly.
 *
 * Called under mdsc->mutex.
 */
static int __choose_mds(struct ceph_mds_client *mdsc,
			struct ceph_mds_request *req)
{
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_cap *cap;
	int mode = req->r_direct_mode;
	int mds = -1;
	u32 hash = req->r_direct_hash;
	bool is_hash = req->r_direct_is_hash;

	/*
	 * is there a specific mds we should try?  ignore hint if we have
	 * no session and the mds is not up (active or recovering).
	 */
	if (req->r_resend_mds >= 0 &&
	    (__have_session(mdsc, req->r_resend_mds) ||
	     ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
		dout("choose_mds using resend_mds mds%d\n",
		     req->r_resend_mds);
		return req->r_resend_mds;
	}

	if (mode == USE_RANDOM_MDS)
		goto random;

	/* pick the inode whose frag tree / caps we will consult */
	inode = NULL;
	if (req->r_inode) {
		inode = req->r_inode;
	} else if (req->r_dentry) {
		if (req->r_dentry->d_inode) {
			inode = req->r_dentry->d_inode;
		} else {
			/* negative dentry: direct by parent dir + name hash */
			inode = req->r_dentry->d_parent->d_inode;
			hash = req->r_dentry->d_name.hash;
			is_hash = true;
		}
	}
	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
	     (int)hash, mode);
	if (!inode)
		goto random;
	ci = ceph_inode(inode);

	if (is_hash && S_ISDIR(inode->i_mode)) {
		struct ceph_inode_frag frag;
		int found;

		ceph_choose_frag(ci, hash, &frag, &found);
		if (found) {
			if (mode == USE_ANY_MDS && frag.ndist > 0) {
				u8 r;

				/* choose a random replica */
				get_random_bytes(&r, 1);
				r %= frag.ndist;
				mds = frag.dist[r];
				dout("choose_mds %p %llx.%llx "
				     "frag %u mds%d (%d/%d)\n",
				     inode, ceph_vinop(inode),
				     frag.frag, frag.mds,
				     (int)r, frag.ndist);
				return mds;
			}

			/* since this file/dir wasn't known to be
			 * replicated, then we want to look for the
			 * authoritative mds. */
			mode = USE_AUTH_MDS;
			if (frag.mds >= 0) {
				/* choose auth mds */
				mds = frag.mds;
				dout("choose_mds %p %llx.%llx "
				     "frag %u mds%d (auth)\n",
				     inode, ceph_vinop(inode), frag.frag, mds);
				return mds;
			}
		}
	}

	/* fall back to the mds that issued our (auth, or any) cap */
	spin_lock(&inode->i_lock);
	cap = NULL;
	if (mode == USE_AUTH_MDS)
		cap = ci->i_auth_cap;
	if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
		cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
	if (!cap) {
		spin_unlock(&inode->i_lock);
		goto random;
	}
	mds = cap->session->s_mds;
	dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
	     inode, ceph_vinop(inode), mds,
	     cap == ci->i_auth_cap ? "auth " : "", cap);
	spin_unlock(&inode->i_lock);
	return mds;

random:
	mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
	dout("choose_mds chose random mds%d\n", mds);
	return mds;
}
653
654
655/*
656 * session messages
657 */
658static struct ceph_msg *create_session_msg(u32 op, u64 seq)
659{
660 struct ceph_msg *msg;
661 struct ceph_mds_session_head *h;
662
663 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
664 if (IS_ERR(msg)) {
665 pr_err("create_session_msg ENOMEM creating msg\n");
666 return ERR_PTR(PTR_ERR(msg));
667 }
668 h = msg->front.iov_base;
669 h->op = cpu_to_le32(op);
670 h->seq = cpu_to_le64(seq);
671 return msg;
672}
673
674/*
675 * send session open request.
676 *
677 * called under mdsc->mutex
678 */
679static int __open_session(struct ceph_mds_client *mdsc,
680 struct ceph_mds_session *session)
681{
682 struct ceph_msg *msg;
683 int mstate;
684 int mds = session->s_mds;
685 int err = 0;
686
687 /* wait for mds to go active? */
688 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
689 dout("open_session to mds%d (%s)\n", mds,
690 ceph_mds_state_name(mstate));
691 session->s_state = CEPH_MDS_SESSION_OPENING;
692 session->s_renew_requested = jiffies;
693
694 /* send connect message */
695 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
696 if (IS_ERR(msg)) {
697 err = PTR_ERR(msg);
698 goto out;
699 }
700 ceph_con_send(&session->s_con, msg);
701
702out:
703 return 0;
704}
705
706/*
707 * session caps
708 */
709
710/*
711 * Free preallocated cap messages assigned to this session
712 */
713static void cleanup_cap_releases(struct ceph_mds_session *session)
714{
715 struct ceph_msg *msg;
716
717 spin_lock(&session->s_cap_lock);
718 while (!list_empty(&session->s_cap_releases)) {
719 msg = list_first_entry(&session->s_cap_releases,
720 struct ceph_msg, list_head);
721 list_del_init(&msg->list_head);
722 ceph_msg_put(msg);
723 }
724 while (!list_empty(&session->s_cap_releases_done)) {
725 msg = list_first_entry(&session->s_cap_releases_done,
726 struct ceph_msg, list_head);
727 list_del_init(&msg->list_head);
728 ceph_msg_put(msg);
729 }
730 spin_unlock(&session->s_cap_lock);
731}
732
/*
 * Helper to safely iterate over all caps associated with a session,
 * invoking @cb on each (inode, cap) pair.  Iteration stops early if
 * the callback returns a negative value, which is then returned.
 *
 * s_cap_lock is dropped around each callback, so s_cap_iterator marks
 * the current cap to let concurrent removal defer its list unlink to
 * us.  iput()/ceph_put_cap() are deferred to the next loop pass (or
 * after the loop) so they never run while s_cap_lock is held.
 *
 * caller must hold session s_mutex
 */
static int iterate_session_caps(struct ceph_mds_session *session,
				 int (*cb)(struct inode *, struct ceph_cap *,
					    void *), void *arg)
{
	struct list_head *p;
	struct ceph_cap *cap;
	struct inode *inode, *last_inode = NULL;
	struct ceph_cap *old_cap = NULL;
	int ret;

	dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
	spin_lock(&session->s_cap_lock);
	p = session->s_caps.next;
	while (p != &session->s_caps) {
		cap = list_entry(p, struct ceph_cap, session_caps);
		/* pin the inode so it survives the unlocked callback */
		inode = igrab(&cap->ci->vfs_inode);
		if (!inode) {
			p = p->next;
			continue;
		}
		session->s_cap_iterator = cap;
		spin_unlock(&session->s_cap_lock);

		/* deferred puts from the previous iteration, lock-free */
		if (last_inode) {
			iput(last_inode);
			last_inode = NULL;
		}
		if (old_cap) {
			ceph_put_cap(old_cap);
			old_cap = NULL;
		}

		ret = cb(inode, cap, arg);
		last_inode = inode;

		spin_lock(&session->s_cap_lock);
		p = p->next;
		if (cap->ci == NULL) {
			/* cap was removed while we held it as iterator;
			 * finish the unlink it deferred to us */
			dout("iterate_session_caps  finishing cap %p removal\n",
			     cap);
			BUG_ON(cap->session != session);
			list_del_init(&cap->session_caps);
			session->s_nr_caps--;
			cap->session = NULL;
			old_cap = cap; /* put_cap it w/o locks held */
		}
		if (ret < 0)
			goto out;
	}
	ret = 0;
out:
	session->s_cap_iterator = NULL;
	spin_unlock(&session->s_cap_lock);

	if (last_inode)
		iput(last_inode);
	if (old_cap)
		ceph_put_cap(old_cap);

	return ret;
}
799
/*
 * iterate_session_caps() callback: unconditionally drop the cap for
 * this inode.  @arg is unused.
 */
static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
				  void *arg)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	dout("removing cap %p, ci is %p, inode is %p\n",
	     cap, ci, &ci->vfs_inode);
	ceph_remove_cap(cap);
	return 0;
}
809
/*
 * Drop every cap held on this session and free its preallocated cap
 * release messages.  Afterward the session must hold no caps.
 *
 * caller must hold session s_mutex
 */
static void remove_session_caps(struct ceph_mds_session *session)
{
	dout("remove_session_caps on %p\n", session);
	iterate_session_caps(session, remove_session_caps_cb, NULL);
	BUG_ON(session->s_nr_caps > 0);
	cleanup_cap_releases(session);
}
820
821/*
822 * wake up any threads waiting on this session's caps. if the cap is
823 * old (didn't get renewed on the client reconnect), remove it now.
824 *
825 * caller must hold s_mutex.
826 */
827static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
828 void *arg)
829{
830 struct ceph_inode_info *ci = ceph_inode(inode);
831
832 wake_up(&ci->i_cap_wq);
833 if (arg) {
834 spin_lock(&inode->i_lock);
835 ci->i_wanted_max_size = 0;
836 ci->i_requested_max_size = 0;
837 spin_unlock(&inode->i_lock);
838 }
839 return 0;
840}
841
/*
 * Wake all threads waiting on caps for inodes with caps on this
 * session.  @reconnect is passed through to wake_up_session_cb: when
 * nonzero, each inode's max_size request state is also reset.
 */
static void wake_up_session_caps(struct ceph_mds_session *session,
				 int reconnect)
{
	dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
	iterate_session_caps(session, wake_up_session_cb,
			     (void *)(unsigned long)reconnect);
}
849
/*
 * Send periodic message to MDS renewing all currently held caps.  The
 * ack will reset the expiration for all caps from this session.
 *
 * caller holds s_mutex
 */
static int send_renew_caps(struct ceph_mds_client *mdsc,
			   struct ceph_mds_session *session)
{
	struct ceph_msg *msg;
	int state;

	/* warn once per renew attempt if our caps have already expired */
	if (time_after_eq(jiffies, session->s_cap_ttl) &&
	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
		pr_info("mds%d caps stale\n", session->s_mds);

	/* do not try to renew caps until a recovering mds has reconnected
	 * with its clients. */
	state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
	if (state < CEPH_MDS_STATE_RECONNECT) {
		dout("send_renew_caps ignoring mds%d (%s)\n",
		     session->s_mds, ceph_mds_state_name(state));
		return 0;
	}

	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
		ceph_mds_state_name(state));
	/* record the request time; renewed_caps() derives the new ttl
	 * from it when the ack arrives */
	session->s_renew_requested = jiffies;
	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
				 ++session->s_renew_seq);
	if (IS_ERR(msg))
		return PTR_ERR(msg);
	ceph_con_send(&session->s_con, msg);
	return 0;
}
885
/*
 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
 *
 * The new ttl is computed from the time the renew was *requested*, not
 * the ack time, so it is conservative.  If caps went stale and are now
 * fresh again, wake any waiters.
 *
 * Called under session->s_mutex
 */
static void renewed_caps(struct ceph_mds_client *mdsc,
			 struct ceph_mds_session *session, int is_renew)
{
	int was_stale;
	int wake = 0;

	spin_lock(&session->s_cap_lock);
	/* stale = ttl never set, or already expired (only meaningful
	 * for a renew ack, not an open ack) */
	was_stale = is_renew && (session->s_cap_ttl == 0 ||
				 time_after_eq(jiffies, session->s_cap_ttl));

	session->s_cap_ttl = session->s_renew_requested +
		mdsc->mdsmap->m_session_timeout*HZ;

	if (was_stale) {
		if (time_before(jiffies, session->s_cap_ttl)) {
			pr_info("mds%d caps renewed\n", session->s_mds);
			wake = 1;
		} else {
			pr_info("mds%d caps still stale\n", session->s_mds);
		}
	}
	dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
	     session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
	     time_before(jiffies, session->s_cap_ttl) ? "stale" : "fresh");
	spin_unlock(&session->s_cap_lock);

	if (wake)
		wake_up_session_caps(session, 0);
}
920
921/*
922 * send a session close request
923 */
924static int request_close_session(struct ceph_mds_client *mdsc,
925 struct ceph_mds_session *session)
926{
927 struct ceph_msg *msg;
928 int err = 0;
929
930 dout("request_close_session mds%d state %s seq %lld\n",
931 session->s_mds, session_state_name(session->s_state),
932 session->s_seq);
933 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
934 if (IS_ERR(msg))
935 err = PTR_ERR(msg);
936 else
937 ceph_con_send(&session->s_con, msg);
938 return err;
939}
940
941/*
942 * Called with s_mutex held.
943 */
944static int __close_session(struct ceph_mds_client *mdsc,
945 struct ceph_mds_session *session)
946{
947 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
948 return 0;
949 session->s_state = CEPH_MDS_SESSION_CLOSING;
950 return request_close_session(mdsc, session);
951}
952
/*
 * Trim old(er) caps.
 *
 * Because we can't cache an inode without one or more caps, we do
 * this indirectly: if a cap is unused, we prune its aliases, at which
 * point the inode will hopefully get dropped too.
 *
 * Yes, this is a bit sloppy.  Our only real goal here is to respond to
 * memory pressure from the MDS, though, so it needn't be perfect.
 */
static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
{
	struct ceph_mds_session *session = arg;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int used, oissued, mine;

	/* quota reached; returning -1 stops the iteration */
	if (session->s_trim_caps <= 0)
		return -1;

	spin_lock(&inode->i_lock);
	mine = cap->issued | cap->implemented;
	used = __ceph_caps_used(ci);
	oissued = __ceph_caps_issued_other(ci, cap);

	dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
	     inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
	     ceph_cap_string(used));
	if (ci->i_dirty_caps)
		goto out;	/* dirty caps */
	if ((used & ~oissued) & mine)
		goto out;	/* we need these caps */

	session->s_trim_caps--;
	if (oissued) {
		/* we aren't the only cap.. just remove us */
		__ceph_remove_cap(cap);
	} else {
		/* try to drop referring dentries */
		spin_unlock(&inode->i_lock);
		d_prune_aliases(inode);
		dout("trim_caps_cb %p cap %p  pruned, count now %d\n",
		     inode, cap, atomic_read(&inode->i_count));
		return 0;
	}

out:
	spin_unlock(&inode->i_lock);
	return 0;
}
1002
1003/*
1004 * Trim session cap count down to some max number.
1005 */
1006static int trim_caps(struct ceph_mds_client *mdsc,
1007 struct ceph_mds_session *session,
1008 int max_caps)
1009{
1010 int trim_caps = session->s_nr_caps - max_caps;
1011
1012 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1013 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1014 if (trim_caps > 0) {
1015 session->s_trim_caps = trim_caps;
1016 iterate_session_caps(session, trim_caps_cb, session);
1017 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1018 session->s_mds, session->s_nr_caps, max_caps,
1019 trim_caps - session->s_trim_caps);
1020 session->s_trim_caps = 0;
1021 }
1022 return 0;
1023}
1024
/*
 * Allocate cap_release messages.  If there is a partially full message
 * in the queue, try to allocate enough to cover its remainder, so that
 * we can send it immediately.
 *
 * Keeps allocating until s_num_cap_releases covers s_nr_caps + extra.
 * Any partially-filled head message is moved to the done queue for
 * immediate sending.  Returns 0 on success, -ENOMEM if allocation
 * fails partway (already-allocated messages are kept).
 *
 * Called under s_mutex.
 */
static int add_cap_releases(struct ceph_mds_client *mdsc,
			    struct ceph_mds_session *session,
			    int extra)
{
	struct ceph_msg *msg;
	struct ceph_mds_cap_release *head;
	int err = -ENOMEM;

	if (extra < 0)
		extra = mdsc->client->mount_args->cap_release_safety;

	spin_lock(&session->s_cap_lock);

	if (!list_empty(&session->s_cap_releases)) {
		/* account for the unused slots in the current head msg */
		msg = list_first_entry(&session->s_cap_releases,
				       struct ceph_msg,
				 list_head);
		head = msg->front.iov_base;
		extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
	}

	while (session->s_num_cap_releases < session->s_nr_caps + extra) {
		/* drop the lock to allocate; recheck the condition after
		 * reacquiring since it may have changed */
		spin_unlock(&session->s_cap_lock);
		msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
				   0, 0, NULL);
		if (!msg)
			goto out_unlocked;
		dout("add_cap_releases %p msg %p now %d\n", session, msg,
		     (int)msg->front.iov_len);
		head = msg->front.iov_base;
		head->num = cpu_to_le32(0);
		msg->front.iov_len = sizeof(*head);
		spin_lock(&session->s_cap_lock);
		list_add(&msg->list_head, &session->s_cap_releases);
		session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
	}

	if (!list_empty(&session->s_cap_releases)) {
		msg = list_first_entry(&session->s_cap_releases,
				       struct ceph_msg,
				       list_head);
		head = msg->front.iov_base;
		if (head->num) {
			/* partially full: queue it for sending now */
			dout(" queueing non-full %p (%d)\n", msg,
			     le32_to_cpu(head->num));
			list_move_tail(&msg->list_head,
				       &session->s_cap_releases_done);
			session->s_num_cap_releases -=
				CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
		}
	}
	err = 0;
	spin_unlock(&session->s_cap_lock);
out_unlocked:
	return err;
}
1088
/*
 * flush all dirty inode data to disk.
 *
 * Scans every session's s_cap_flushing list; returns 1 if no inode is
 * still flushing with a cap_flush_seq at or below @want_flush_seq,
 * 0 as soon as one such inode is found.
 *
 * mdsc->mutex is dropped around each session inspection (a session
 * reference is held instead) because s_mutex must be taken.
 */
static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
{
	int mds, ret = 1;

	dout("check_cap_flush want %lld\n", want_flush_seq);
	mutex_lock(&mdsc->mutex);
	for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
		struct ceph_mds_session *session = mdsc->sessions[mds];

		if (!session)
			continue;
		get_session(session);
		mutex_unlock(&mdsc->mutex);

		mutex_lock(&session->s_mutex);
		if (!list_empty(&session->s_cap_flushing)) {
			/* only the head of the list needs checking: it is
			 * the oldest in-flight flush for this session */
			struct ceph_inode_info *ci =
				list_entry(session->s_cap_flushing.next,
					   struct ceph_inode_info,
					   i_flushing_item);
			struct inode *inode = &ci->vfs_inode;

			spin_lock(&inode->i_lock);
			if (ci->i_cap_flush_seq <= want_flush_seq) {
				dout("check_cap_flush still flushing %p "
				     "seq %lld <= %lld to mds%d\n", inode,
				     ci->i_cap_flush_seq, want_flush_seq,
				     session->s_mds);
				ret = 0;
			}
			spin_unlock(&inode->i_lock);
		}
		mutex_unlock(&session->s_mutex);
		ceph_put_mds_session(session);

		if (!ret)
			return ret;
		mutex_lock(&mdsc->mutex);
	}

	mutex_unlock(&mdsc->mutex);
	dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
	return ret;
}
1138
/*
 * Send every queued (ready) cap release message for this session.
 *
 * The loop takes s_cap_lock each pass and drops it around the actual
 * send; note the break exits with the lock still held, paired with
 * the unlock after the loop.
 *
 * called under s_mutex
 */
static void send_cap_releases(struct ceph_mds_client *mdsc,
		       struct ceph_mds_session *session)
{
	struct ceph_msg *msg;

	dout("send_cap_releases mds%d\n", session->s_mds);
	while (1) {
		spin_lock(&session->s_cap_lock);
		if (list_empty(&session->s_cap_releases_done))
			break;
		msg = list_first_entry(&session->s_cap_releases_done,
				 struct ceph_msg, list_head);
		list_del_init(&msg->list_head);
		spin_unlock(&session->s_cap_lock);
		/* front length reflects however many releases were packed */
		msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
		dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
		ceph_con_send(&session->s_con, msg);
	}
	spin_unlock(&session->s_cap_lock);
}
1162
1163/*
1164 * requests
1165 */
1166
1167/*
1168 * Create an mds request.
1169 */
1170struct ceph_mds_request *
1171ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1172{
1173 struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1174
1175 if (!req)
1176 return ERR_PTR(-ENOMEM);
1177
1178 req->r_started = jiffies;
1179 req->r_resend_mds = -1;
1180 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1181 req->r_fmode = -1;
1182 kref_init(&req->r_kref);
1183 INIT_LIST_HEAD(&req->r_wait);
1184 init_completion(&req->r_completion);
1185 init_completion(&req->r_safe_completion);
1186 INIT_LIST_HEAD(&req->r_unsafe_item);
1187
1188 req->r_op = op;
1189 req->r_direct_mode = mode;
1190 return req;
1191}
1192
1193/*
1194 * return oldest (lowest) request, tid in request tree, 0 if none.
1195 *
1196 * called under mdsc->mutex.
1197 */
1198static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1199{
1200 if (RB_EMPTY_ROOT(&mdsc->request_tree))
1201 return NULL;
1202 return rb_entry(rb_first(&mdsc->request_tree),
1203 struct ceph_mds_request, r_node);
1204}
1205
1206static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1207{
1208 struct ceph_mds_request *req = __get_oldest_req(mdsc);
1209
1210 if (req)
1211 return req->r_tid;
1212 return 0;
1213}
1214
1215/*
1216 * Build a dentry's path. Allocate on heap; caller must kfree. Based
1217 * on build_path_from_dentry in fs/cifs/dir.c.
1218 *
1219 * If @stop_on_nosnap, generate path relative to the first non-snapped
1220 * inode.
1221 *
1222 * Encode hidden .snap dirs as a double /, i.e.
1223 * foo/.snap/bar -> foo//bar
1224 */
1225char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1226 int stop_on_nosnap)
1227{
1228 struct dentry *temp;
1229 char *path;
1230 int len, pos;
1231
1232 if (dentry == NULL)
1233 return ERR_PTR(-EINVAL);
1234
1235retry:
1236 len = 0;
1237 for (temp = dentry; !IS_ROOT(temp);) {
1238 struct inode *inode = temp->d_inode;
1239 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1240 len++; /* slash only */
1241 else if (stop_on_nosnap && inode &&
1242 ceph_snap(inode) == CEPH_NOSNAP)
1243 break;
1244 else
1245 len += 1 + temp->d_name.len;
1246 temp = temp->d_parent;
1247 if (temp == NULL) {
1248 pr_err("build_path_dentry corrupt dentry %p\n", dentry);
1249 return ERR_PTR(-EINVAL);
1250 }
1251 }
1252 if (len)
1253 len--; /* no leading '/' */
1254
1255 path = kmalloc(len+1, GFP_NOFS);
1256 if (path == NULL)
1257 return ERR_PTR(-ENOMEM);
1258 pos = len;
1259 path[pos] = 0; /* trailing null */
1260 for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1261 struct inode *inode = temp->d_inode;
1262
1263 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1264 dout("build_path_dentry path+%d: %p SNAPDIR\n",
1265 pos, temp);
1266 } else if (stop_on_nosnap && inode &&
1267 ceph_snap(inode) == CEPH_NOSNAP) {
1268 break;
1269 } else {
1270 pos -= temp->d_name.len;
1271 if (pos < 0)
1272 break;
1273 strncpy(path + pos, temp->d_name.name,
1274 temp->d_name.len);
1275 dout("build_path_dentry path+%d: %p '%.*s'\n",
1276 pos, temp, temp->d_name.len, path + pos);
1277 }
1278 if (pos)
1279 path[--pos] = '/';
1280 temp = temp->d_parent;
1281 if (temp == NULL) {
1282 pr_err("build_path_dentry corrupt dentry\n");
1283 kfree(path);
1284 return ERR_PTR(-EINVAL);
1285 }
1286 }
1287 if (pos != 0) {
1288 pr_err("build_path_dentry did not end path lookup where "
1289 "expected, namelen is %d, pos is %d\n", len, pos);
1290 /* presumably this is only possible if racing with a
1291 rename of one of the parent directories (we can not
1292 lock the dentries above us to prevent this, but
1293 retrying should be harmless) */
1294 kfree(path);
1295 goto retry;
1296 }
1297
1298 *base = ceph_ino(temp->d_inode);
1299 *plen = len;
1300 dout("build_path_dentry on %p %d built %llx '%.*s'\n",
1301 dentry, atomic_read(&dentry->d_count), *base, len, path);
1302 return path;
1303}
1304
1305static int build_dentry_path(struct dentry *dentry,
1306 const char **ppath, int *ppathlen, u64 *pino,
1307 int *pfreepath)
1308{
1309 char *path;
1310
1311 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
1312 *pino = ceph_ino(dentry->d_parent->d_inode);
1313 *ppath = dentry->d_name.name;
1314 *ppathlen = dentry->d_name.len;
1315 return 0;
1316 }
1317 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1318 if (IS_ERR(path))
1319 return PTR_ERR(path);
1320 *ppath = path;
1321 *pfreepath = 1;
1322 return 0;
1323}
1324
1325static int build_inode_path(struct inode *inode,
1326 const char **ppath, int *ppathlen, u64 *pino,
1327 int *pfreepath)
1328{
1329 struct dentry *dentry;
1330 char *path;
1331
1332 if (ceph_snap(inode) == CEPH_NOSNAP) {
1333 *pino = ceph_ino(inode);
1334 *ppathlen = 0;
1335 return 0;
1336 }
1337 dentry = d_find_alias(inode);
1338 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1339 dput(dentry);
1340 if (IS_ERR(path))
1341 return PTR_ERR(path);
1342 *ppath = path;
1343 *pfreepath = 1;
1344 return 0;
1345}
1346
1347/*
1348 * request arguments may be specified via an inode *, a dentry *, or
1349 * an explicit ino+path.
1350 */
1351static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1352 const char *rpath, u64 rino,
1353 const char **ppath, int *pathlen,
1354 u64 *ino, int *freepath)
1355{
1356 int r = 0;
1357
1358 if (rinode) {
1359 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1360 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1361 ceph_snap(rinode));
1362 } else if (rdentry) {
1363 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1364 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1365 *ppath);
1366 } else if (rpath) {
1367 *ino = rino;
1368 *ppath = rpath;
1369 *pathlen = strlen(rpath);
1370 dout(" path %.*s\n", *pathlen, rpath);
1371 }
1372
1373 return r;
1374}
1375
1376/*
1377 * called under mdsc->mutex
1378 */
1379static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1380 struct ceph_mds_request *req,
1381 int mds)
1382{
1383 struct ceph_msg *msg;
1384 struct ceph_mds_request_head *head;
1385 const char *path1 = NULL;
1386 const char *path2 = NULL;
1387 u64 ino1 = 0, ino2 = 0;
1388 int pathlen1 = 0, pathlen2 = 0;
1389 int freepath1 = 0, freepath2 = 0;
1390 int len;
1391 u16 releases;
1392 void *p, *end;
1393 int ret;
1394
1395 ret = set_request_path_attr(req->r_inode, req->r_dentry,
1396 req->r_path1, req->r_ino1.ino,
1397 &path1, &pathlen1, &ino1, &freepath1);
1398 if (ret < 0) {
1399 msg = ERR_PTR(ret);
1400 goto out;
1401 }
1402
1403 ret = set_request_path_attr(NULL, req->r_old_dentry,
1404 req->r_path2, req->r_ino2.ino,
1405 &path2, &pathlen2, &ino2, &freepath2);
1406 if (ret < 0) {
1407 msg = ERR_PTR(ret);
1408 goto out_free1;
1409 }
1410
1411 len = sizeof(*head) +
1412 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
1413
1414 /* calculate (max) length for cap releases */
1415 len += sizeof(struct ceph_mds_request_release) *
1416 (!!req->r_inode_drop + !!req->r_dentry_drop +
1417 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1418 if (req->r_dentry_drop)
1419 len += req->r_dentry->d_name.len;
1420 if (req->r_old_dentry_drop)
1421 len += req->r_old_dentry->d_name.len;
1422
1423 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
1424 if (IS_ERR(msg))
1425 goto out_free2;
1426
1427 msg->hdr.tid = cpu_to_le64(req->r_tid);
1428
1429 head = msg->front.iov_base;
1430 p = msg->front.iov_base + sizeof(*head);
1431 end = msg->front.iov_base + msg->front.iov_len;
1432
1433 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1434 head->op = cpu_to_le32(req->r_op);
1435 head->caller_uid = cpu_to_le32(current_fsuid());
1436 head->caller_gid = cpu_to_le32(current_fsgid());
1437 head->args = req->r_args;
1438
1439 ceph_encode_filepath(&p, end, ino1, path1);
1440 ceph_encode_filepath(&p, end, ino2, path2);
1441
1442 /* cap releases */
1443 releases = 0;
1444 if (req->r_inode_drop)
1445 releases += ceph_encode_inode_release(&p,
1446 req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1447 mds, req->r_inode_drop, req->r_inode_unless, 0);
1448 if (req->r_dentry_drop)
1449 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1450 mds, req->r_dentry_drop, req->r_dentry_unless);
1451 if (req->r_old_dentry_drop)
1452 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1453 mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1454 if (req->r_old_inode_drop)
1455 releases += ceph_encode_inode_release(&p,
1456 req->r_old_dentry->d_inode,
1457 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1458 head->num_releases = cpu_to_le16(releases);
1459
1460 BUG_ON(p > end);
1461 msg->front.iov_len = p - msg->front.iov_base;
1462 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1463
1464 msg->pages = req->r_pages;
1465 msg->nr_pages = req->r_num_pages;
1466 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1467 msg->hdr.data_off = cpu_to_le16(0);
1468
1469out_free2:
1470 if (freepath2)
1471 kfree((char *)path2);
1472out_free1:
1473 if (freepath1)
1474 kfree((char *)path1);
1475out:
1476 return msg;
1477}
1478
1479/*
1480 * called under mdsc->mutex if error, under no mutex if
1481 * success.
1482 */
1483static void complete_request(struct ceph_mds_client *mdsc,
1484 struct ceph_mds_request *req)
1485{
1486 if (req->r_callback)
1487 req->r_callback(mdsc, req);
1488 else
1489 complete(&req->r_completion);
1490}
1491
1492/*
1493 * called under mdsc->mutex
1494 */
1495static int __prepare_send_request(struct ceph_mds_client *mdsc,
1496 struct ceph_mds_request *req,
1497 int mds)
1498{
1499 struct ceph_mds_request_head *rhead;
1500 struct ceph_msg *msg;
1501 int flags = 0;
1502
1503 req->r_mds = mds;
1504 req->r_attempts++;
1505 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1506 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1507
1508 if (req->r_request) {
1509 ceph_msg_put(req->r_request);
1510 req->r_request = NULL;
1511 }
1512 msg = create_request_message(mdsc, req, mds);
1513 if (IS_ERR(msg)) {
1514 req->r_reply = ERR_PTR(PTR_ERR(msg));
1515 complete_request(mdsc, req);
1516 return -PTR_ERR(msg);
1517 }
1518 req->r_request = msg;
1519
1520 rhead = msg->front.iov_base;
1521 rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1522 if (req->r_got_unsafe)
1523 flags |= CEPH_MDS_FLAG_REPLAY;
1524 if (req->r_locked_dir)
1525 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1526 rhead->flags = cpu_to_le32(flags);
1527 rhead->num_fwd = req->r_num_fwd;
1528 rhead->num_retry = req->r_attempts - 1;
1529
1530 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1531
1532 if (req->r_target_inode && req->r_got_unsafe)
1533 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1534 else
1535 rhead->ino = 0;
1536 return 0;
1537}
1538
1539/*
1540 * send request, or put it on the appropriate wait list.
1541 */
1542static int __do_request(struct ceph_mds_client *mdsc,
1543 struct ceph_mds_request *req)
1544{
1545 struct ceph_mds_session *session = NULL;
1546 int mds = -1;
1547 int err = -EAGAIN;
1548
1549 if (req->r_reply)
1550 goto out;
1551
1552 if (req->r_timeout &&
1553 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1554 dout("do_request timed out\n");
1555 err = -EIO;
1556 goto finish;
1557 }
1558
1559 mds = __choose_mds(mdsc, req);
1560 if (mds < 0 ||
1561 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1562 dout("do_request no mds or not active, waiting for map\n");
1563 list_add(&req->r_wait, &mdsc->waiting_for_map);
1564 goto out;
1565 }
1566
1567 /* get, open session */
1568 session = __ceph_lookup_mds_session(mdsc, mds);
1569 if (!session)
1570 session = register_session(mdsc, mds);
1571 dout("do_request mds%d session %p state %s\n", mds, session,
1572 session_state_name(session->s_state));
1573 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1574 session->s_state != CEPH_MDS_SESSION_HUNG) {
1575 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1576 session->s_state == CEPH_MDS_SESSION_CLOSING)
1577 __open_session(mdsc, session);
1578 list_add(&req->r_wait, &session->s_waiting);
1579 goto out_session;
1580 }
1581
1582 /* send request */
1583 req->r_session = get_session(session);
1584 req->r_resend_mds = -1; /* forget any previous mds hint */
1585
1586 if (req->r_request_started == 0) /* note request start time */
1587 req->r_request_started = jiffies;
1588
1589 err = __prepare_send_request(mdsc, req, mds);
1590 if (!err) {
1591 ceph_msg_get(req->r_request);
1592 ceph_con_send(&session->s_con, req->r_request);
1593 }
1594
1595out_session:
1596 ceph_put_mds_session(session);
1597out:
1598 return err;
1599
1600finish:
1601 req->r_reply = ERR_PTR(err);
1602 complete_request(mdsc, req);
1603 goto out;
1604}
1605
1606/*
1607 * called under mdsc->mutex
1608 */
1609static void __wake_requests(struct ceph_mds_client *mdsc,
1610 struct list_head *head)
1611{
1612 struct ceph_mds_request *req, *nreq;
1613
1614 list_for_each_entry_safe(req, nreq, head, r_wait) {
1615 list_del_init(&req->r_wait);
1616 __do_request(mdsc, req);
1617 }
1618}
1619
1620/*
1621 * Wake up threads with requests pending for @mds, so that they can
1622 * resubmit their requests to a possibly different mds. If @all is set,
1623 * wake up if their requests has been forwarded to @mds, too.
1624 */
1625static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
1626{
1627 struct ceph_mds_request *req;
1628 struct rb_node *p;
1629
1630 dout("kick_requests mds%d\n", mds);
1631 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
1632 req = rb_entry(p, struct ceph_mds_request, r_node);
1633 if (req->r_got_unsafe)
1634 continue;
1635 if (req->r_session &&
1636 req->r_session->s_mds == mds) {
1637 dout(" kicking tid %llu\n", req->r_tid);
1638 put_request_session(req);
1639 __do_request(mdsc, req);
1640 }
1641 }
1642}
1643
/*
 * Register @req (without a parent dir) and start it, asynchronously.
 * The caller learns the outcome via req->r_callback/r_completion.
 */
void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
			      struct ceph_mds_request *req)
{
	dout("submit_request on %p\n", req);
	mutex_lock(&mdsc->mutex);
	__register_request(mdsc, req, NULL);
	__do_request(mdsc, req);
	mutex_unlock(&mdsc->mutex);
}
1653
1654/*
1655 * Synchrously perform an mds request. Take care of all of the
1656 * session setup, forwarding, retry details.
1657 */
1658int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1659 struct inode *dir,
1660 struct ceph_mds_request *req)
1661{
1662 int err;
1663
1664 dout("do_request on %p\n", req);
1665
1666 /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
1667 if (req->r_inode)
1668 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
1669 if (req->r_locked_dir)
1670 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1671 if (req->r_old_dentry)
1672 ceph_get_cap_refs(
1673 ceph_inode(req->r_old_dentry->d_parent->d_inode),
1674 CEPH_CAP_PIN);
1675
1676 /* issue */
1677 mutex_lock(&mdsc->mutex);
1678 __register_request(mdsc, req, dir);
1679 __do_request(mdsc, req);
1680
1681 /* wait */
1682 if (!req->r_reply) {
1683 mutex_unlock(&mdsc->mutex);
1684 if (req->r_timeout) {
1685 err = (long)wait_for_completion_interruptible_timeout(
1686 &req->r_completion, req->r_timeout);
1687 if (err == 0)
1688 req->r_reply = ERR_PTR(-EIO);
1689 else if (err < 0)
1690 req->r_reply = ERR_PTR(err);
1691 } else {
1692 err = wait_for_completion_interruptible(
1693 &req->r_completion);
1694 if (err)
1695 req->r_reply = ERR_PTR(err);
1696 }
1697 mutex_lock(&mdsc->mutex);
1698 }
1699
1700 if (IS_ERR(req->r_reply)) {
1701 err = PTR_ERR(req->r_reply);
1702 req->r_reply = NULL;
1703
1704 if (err == -ERESTARTSYS) {
1705 /* aborted */
1706 req->r_aborted = true;
1707
1708 if (req->r_locked_dir &&
1709 (req->r_op & CEPH_MDS_OP_WRITE)) {
1710 struct ceph_inode_info *ci =
1711 ceph_inode(req->r_locked_dir);
1712
1713 dout("aborted, clearing I_COMPLETE on %p\n",
1714 req->r_locked_dir);
1715 spin_lock(&req->r_locked_dir->i_lock);
1716 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1717 ci->i_release_count++;
1718 spin_unlock(&req->r_locked_dir->i_lock);
1719 }
1720 } else {
1721 /* clean up this request */
1722 __unregister_request(mdsc, req);
1723 if (!list_empty(&req->r_unsafe_item))
1724 list_del_init(&req->r_unsafe_item);
1725 complete(&req->r_safe_completion);
1726 }
1727 } else if (req->r_err) {
1728 err = req->r_err;
1729 } else {
1730 err = le32_to_cpu(req->r_reply_info.head->result);
1731 }
1732 mutex_unlock(&mdsc->mutex);
1733
1734 dout("do_request %p done, result %d\n", req, err);
1735 return err;
1736}
1737
1738/*
1739 * Handle mds reply.
1740 *
1741 * We take the session mutex and parse and process the reply immediately.
1742 * This preserves the logical ordering of replies, capabilities, etc., sent
1743 * by the MDS as they are applied to our local cache.
1744 */
1745static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1746{
1747 struct ceph_mds_client *mdsc = session->s_mdsc;
1748 struct ceph_mds_request *req;
1749 struct ceph_mds_reply_head *head = msg->front.iov_base;
1750 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
1751 u64 tid;
1752 int err, result;
1753 int mds = session->s_mds;
1754
1755 if (msg->front.iov_len < sizeof(*head)) {
1756 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
1757 ceph_msg_dump(msg);
1758 return;
1759 }
1760
1761 /* get request, session */
1762 tid = le64_to_cpu(msg->hdr.tid);
1763 mutex_lock(&mdsc->mutex);
1764 req = __lookup_request(mdsc, tid);
1765 if (!req) {
1766 dout("handle_reply on unknown tid %llu\n", tid);
1767 mutex_unlock(&mdsc->mutex);
1768 return;
1769 }
1770 dout("handle_reply %p\n", req);
1771
1772 /* correct session? */
1773 if (!req->r_session && req->r_session != session) {
1774 pr_err("mdsc_handle_reply got %llu on session mds%d"
1775 " not mds%d\n", tid, session->s_mds,
1776 req->r_session ? req->r_session->s_mds : -1);
1777 mutex_unlock(&mdsc->mutex);
1778 goto out;
1779 }
1780
1781 /* dup? */
1782 if ((req->r_got_unsafe && !head->safe) ||
1783 (req->r_got_safe && head->safe)) {
1784 pr_warning("got a dup %s reply on %llu from mds%d\n",
1785 head->safe ? "safe" : "unsafe", tid, mds);
1786 mutex_unlock(&mdsc->mutex);
1787 goto out;
1788 }
1789
1790 result = le32_to_cpu(head->result);
1791
1792 /*
1793 * Tolerate 2 consecutive ESTALEs from the same mds.
1794 * FIXME: we should be looking at the cap migrate_seq.
1795 */
1796 if (result == -ESTALE) {
1797 req->r_direct_mode = USE_AUTH_MDS;
1798 req->r_num_stale++;
1799 if (req->r_num_stale <= 2) {
1800 __do_request(mdsc, req);
1801 mutex_unlock(&mdsc->mutex);
1802 goto out;
1803 }
1804 } else {
1805 req->r_num_stale = 0;
1806 }
1807
1808 if (head->safe) {
1809 req->r_got_safe = true;
1810 __unregister_request(mdsc, req);
1811 complete(&req->r_safe_completion);
1812
1813 if (req->r_got_unsafe) {
1814 /*
1815 * We already handled the unsafe response, now do the
1816 * cleanup. No need to examine the response; the MDS
1817 * doesn't include any result info in the safe
1818 * response. And even if it did, there is nothing
1819 * useful we could do with a revised return value.
1820 */
1821 dout("got safe reply %llu, mds%d\n", tid, mds);
1822 list_del_init(&req->r_unsafe_item);
1823
1824 /* last unsafe request during umount? */
1825 if (mdsc->stopping && !__get_oldest_req(mdsc))
1826 complete(&mdsc->safe_umount_waiters);
1827 mutex_unlock(&mdsc->mutex);
1828 goto out;
1829 }
1830 }
1831
1832 BUG_ON(req->r_reply);
1833
1834 if (!head->safe) {
1835 req->r_got_unsafe = true;
1836 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1837 }
1838
1839 dout("handle_reply tid %lld result %d\n", tid, result);
1840 rinfo = &req->r_reply_info;
1841 err = parse_reply_info(msg, rinfo);
1842 mutex_unlock(&mdsc->mutex);
1843
1844 mutex_lock(&session->s_mutex);
1845 if (err < 0) {
1846 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
1847 ceph_msg_dump(msg);
1848 goto out_err;
1849 }
1850
1851 /* snap trace */
1852 if (rinfo->snapblob_len) {
1853 down_write(&mdsc->snap_rwsem);
1854 ceph_update_snap_trace(mdsc, rinfo->snapblob,
1855 rinfo->snapblob + rinfo->snapblob_len,
1856 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
1857 downgrade_write(&mdsc->snap_rwsem);
1858 } else {
1859 down_read(&mdsc->snap_rwsem);
1860 }
1861
1862 /* insert trace into our cache */
1863 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1864 if (err == 0) {
1865 if (result == 0 && rinfo->dir_nr)
1866 ceph_readdir_prepopulate(req, req->r_session);
1867 ceph_unreserve_caps(&req->r_caps_reservation);
1868 }
1869
1870 up_read(&mdsc->snap_rwsem);
1871out_err:
1872 if (err) {
1873 req->r_err = err;
1874 } else {
1875 req->r_reply = msg;
1876 ceph_msg_get(msg);
1877 }
1878
1879 add_cap_releases(mdsc, req->r_session, -1);
1880 mutex_unlock(&session->s_mutex);
1881
1882 /* kick calling process */
1883 complete_request(mdsc, req);
1884out:
1885 ceph_mdsc_put_request(req);
1886 return;
1887}
1888
1889
1890
1891/*
1892 * handle mds notification that our request has been forwarded.
1893 */
1894static void handle_forward(struct ceph_mds_client *mdsc,
1895 struct ceph_mds_session *session,
1896 struct ceph_msg *msg)
1897{
1898 struct ceph_mds_request *req;
1899 u64 tid = le64_to_cpu(msg->hdr.tid);
1900 u32 next_mds;
1901 u32 fwd_seq;
1902 int err = -EINVAL;
1903 void *p = msg->front.iov_base;
1904 void *end = p + msg->front.iov_len;
1905
1906 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1907 next_mds = ceph_decode_32(&p);
1908 fwd_seq = ceph_decode_32(&p);
1909
1910 mutex_lock(&mdsc->mutex);
1911 req = __lookup_request(mdsc, tid);
1912 if (!req) {
1913 dout("forward %llu to mds%d - req dne\n", tid, next_mds);
1914 goto out; /* dup reply? */
1915 }
1916
1917 if (fwd_seq <= req->r_num_fwd) {
1918 dout("forward %llu to mds%d - old seq %d <= %d\n",
1919 tid, next_mds, req->r_num_fwd, fwd_seq);
1920 } else {
1921 /* resend. forward race not possible; mds would drop */
1922 dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
1923 req->r_num_fwd = fwd_seq;
1924 req->r_resend_mds = next_mds;
1925 put_request_session(req);
1926 __do_request(mdsc, req);
1927 }
1928 ceph_mdsc_put_request(req);
1929out:
1930 mutex_unlock(&mdsc->mutex);
1931 return;
1932
1933bad:
1934 pr_err("mdsc_handle_forward decode error err=%d\n", err);
1935}
1936
1937/*
1938 * handle a mds session control message
1939 */
1940static void handle_session(struct ceph_mds_session *session,
1941 struct ceph_msg *msg)
1942{
1943 struct ceph_mds_client *mdsc = session->s_mdsc;
1944 u32 op;
1945 u64 seq;
1946 int mds = session->s_mds;
1947 struct ceph_mds_session_head *h = msg->front.iov_base;
1948 int wake = 0;
1949
1950 /* decode */
1951 if (msg->front.iov_len != sizeof(*h))
1952 goto bad;
1953 op = le32_to_cpu(h->op);
1954 seq = le64_to_cpu(h->seq);
1955
1956 mutex_lock(&mdsc->mutex);
1957 if (op == CEPH_SESSION_CLOSE)
1958 __unregister_session(mdsc, session);
1959 /* FIXME: this ttl calculation is generous */
1960 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
1961 mutex_unlock(&mdsc->mutex);
1962
1963 mutex_lock(&session->s_mutex);
1964
1965 dout("handle_session mds%d %s %p state %s seq %llu\n",
1966 mds, ceph_session_op_name(op), session,
1967 session_state_name(session->s_state), seq);
1968
1969 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
1970 session->s_state = CEPH_MDS_SESSION_OPEN;
1971 pr_info("mds%d came back\n", session->s_mds);
1972 }
1973
1974 switch (op) {
1975 case CEPH_SESSION_OPEN:
1976 session->s_state = CEPH_MDS_SESSION_OPEN;
1977 renewed_caps(mdsc, session, 0);
1978 wake = 1;
1979 if (mdsc->stopping)
1980 __close_session(mdsc, session);
1981 break;
1982
1983 case CEPH_SESSION_RENEWCAPS:
1984 if (session->s_renew_seq == seq)
1985 renewed_caps(mdsc, session, 1);
1986 break;
1987
1988 case CEPH_SESSION_CLOSE:
1989 remove_session_caps(session);
1990 wake = 1; /* for good measure */
1991 complete(&mdsc->session_close_waiters);
1992 kick_requests(mdsc, mds, 0); /* cur only */
1993 break;
1994
1995 case CEPH_SESSION_STALE:
1996 pr_info("mds%d caps went stale, renewing\n",
1997 session->s_mds);
1998 spin_lock(&session->s_cap_lock);
1999 session->s_cap_gen++;
2000 session->s_cap_ttl = 0;
2001 spin_unlock(&session->s_cap_lock);
2002 send_renew_caps(mdsc, session);
2003 break;
2004
2005 case CEPH_SESSION_RECALL_STATE:
2006 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2007 break;
2008
2009 default:
2010 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2011 WARN_ON(1);
2012 }
2013
2014 mutex_unlock(&session->s_mutex);
2015 if (wake) {
2016 mutex_lock(&mdsc->mutex);
2017 __wake_requests(mdsc, &session->s_waiting);
2018 mutex_unlock(&mdsc->mutex);
2019 }
2020 return;
2021
2022bad:
2023 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2024 (int)msg->front.iov_len);
2025 ceph_msg_dump(msg);
2026 return;
2027}
2028
2029
2030/*
2031 * called under session->mutex.
2032 */
2033static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2034 struct ceph_mds_session *session)
2035{
2036 struct ceph_mds_request *req, *nreq;
2037 int err;
2038
2039 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2040
2041 mutex_lock(&mdsc->mutex);
2042 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2043 err = __prepare_send_request(mdsc, req, session->s_mds);
2044 if (!err) {
2045 ceph_msg_get(req->r_request);
2046 ceph_con_send(&session->s_con, req->r_request);
2047 }
2048 }
2049 mutex_unlock(&mdsc->mutex);
2050}
2051
2052/*
2053 * Encode information about a cap for a reconnect with the MDS.
2054 */
2055static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2056 void *arg)
2057{
2058 struct ceph_mds_cap_reconnect rec;
2059 struct ceph_inode_info *ci;
2060 struct ceph_pagelist *pagelist = arg;
2061 char *path;
2062 int pathlen, err;
2063 u64 pathbase;
2064 struct dentry *dentry;
2065
2066 ci = cap->ci;
2067
2068 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2069 inode, ceph_vinop(inode), cap, cap->cap_id,
2070 ceph_cap_string(cap->issued));
2071 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2072 if (err)
2073 return err;
2074
2075 dentry = d_find_alias(inode);
2076 if (dentry) {
2077 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2078 if (IS_ERR(path)) {
2079 err = PTR_ERR(path);
2080 BUG_ON(err);
2081 }
2082 } else {
2083 path = NULL;
2084 pathlen = 0;
2085 }
2086 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2087 if (err)
2088 goto out;
2089
2090 spin_lock(&inode->i_lock);
2091 cap->seq = 0; /* reset cap seq */
2092 cap->issue_seq = 0; /* and issue_seq */
2093 rec.cap_id = cpu_to_le64(cap->cap_id);
2094 rec.pathbase = cpu_to_le64(pathbase);
2095 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2096 rec.issued = cpu_to_le32(cap->issued);
2097 rec.size = cpu_to_le64(inode->i_size);
2098 ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
2099 ceph_encode_timespec(&rec.atime, &inode->i_atime);
2100 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2101 spin_unlock(&inode->i_lock);
2102
2103 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
2104
2105out:
2106 kfree(path);
2107 dput(dentry);
2108 return err;
2109}
2110
2111
2112/*
2113 * If an MDS fails and recovers, clients need to reconnect in order to
2114 * reestablish shared state. This includes all caps issued through
2115 * this session _and_ the snap_realm hierarchy. Because it's not
2116 * clear which snap realms the mds cares about, we send everything we
2117 * know about.. that ensures we'll then get any new info the
2118 * recovering MDS might have.
2119 *
2120 * This is a relatively heavyweight operation, but it's rare.
2121 *
2122 * called with mdsc->mutex held.
2123 */
2124static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2125{
2126 struct ceph_mds_session *session = NULL;
2127 struct ceph_msg *reply;
2128 struct rb_node *p;
2129 int err;
2130 struct ceph_pagelist *pagelist;
2131
2132 pr_info("reconnect to recovering mds%d\n", mds);
2133
2134 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2135 if (!pagelist)
2136 goto fail_nopagelist;
2137 ceph_pagelist_init(pagelist);
2138
2139 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
2140 if (IS_ERR(reply)) {
2141 err = PTR_ERR(reply);
2142 goto fail_nomsg;
2143 }
2144
2145 /* find session */
2146 session = __ceph_lookup_mds_session(mdsc, mds);
2147 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2148
2149 if (session) {
2150 mutex_lock(&session->s_mutex);
2151
2152 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2153 session->s_seq = 0;
2154
2155 ceph_con_open(&session->s_con,
2156 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2157
2158 /* replay unsafe requests */
2159 replay_unsafe_requests(mdsc, session);
2160 } else {
2161 dout("no session for mds%d, will send short reconnect\n",
2162 mds);
2163 }
2164
2165 down_read(&mdsc->snap_rwsem);
2166
2167 if (!session)
2168 goto send;
2169 dout("session %p state %s\n", session,
2170 session_state_name(session->s_state));
2171
2172 /* traverse this session's caps */
2173 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2174 if (err)
2175 goto fail;
2176 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2177 if (err < 0)
2178 goto out;
2179
2180 /*
2181 * snaprealms. we provide mds with the ino, seq (version), and
2182 * parent for all of our realms. If the mds has any newer info,
2183 * it will tell us.
2184 */
2185 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2186 struct ceph_snap_realm *realm =
2187 rb_entry(p, struct ceph_snap_realm, node);
2188 struct ceph_mds_snaprealm_reconnect sr_rec;
2189
2190 dout(" adding snap realm %llx seq %lld parent %llx\n",
2191 realm->ino, realm->seq, realm->parent_ino);
2192 sr_rec.ino = cpu_to_le64(realm->ino);
2193 sr_rec.seq = cpu_to_le64(realm->seq);
2194 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2195 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2196 if (err)
2197 goto fail;
2198 }
2199
2200send:
2201 reply->pagelist = pagelist;
2202 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2203 reply->nr_pages = calc_pages_for(0, pagelist->length);
2204 ceph_con_send(&session->s_con, reply);
2205
2206 if (session) {
2207 session->s_state = CEPH_MDS_SESSION_OPEN;
2208 __wake_requests(mdsc, &session->s_waiting);
2209 }
2210
2211out:
2212 up_read(&mdsc->snap_rwsem);
2213 if (session) {
2214 mutex_unlock(&session->s_mutex);
2215 ceph_put_mds_session(session);
2216 }
2217 mutex_lock(&mdsc->mutex);
2218 return;
2219
2220fail:
2221 ceph_msg_put(reply);
2222fail_nomsg:
2223 ceph_pagelist_release(pagelist);
2224 kfree(pagelist);
2225fail_nopagelist:
2226 pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
2227 goto out;
2228}
2229
2230
2231/*
2232 * compare old and new mdsmaps, kicking requests
2233 * and closing out old connections as necessary
2234 *
2235 * called under mdsc->mutex.
2236 */
2237static void check_new_map(struct ceph_mds_client *mdsc,
2238 struct ceph_mdsmap *newmap,
2239 struct ceph_mdsmap *oldmap)
2240{
2241 int i;
2242 int oldstate, newstate;
2243 struct ceph_mds_session *s;
2244
2245 dout("check_new_map new %u old %u\n",
2246 newmap->m_epoch, oldmap->m_epoch);
2247
2248 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2249 if (mdsc->sessions[i] == NULL)
2250 continue;
2251 s = mdsc->sessions[i];
2252 oldstate = ceph_mdsmap_get_state(oldmap, i);
2253 newstate = ceph_mdsmap_get_state(newmap, i);
2254
2255 dout("check_new_map mds%d state %s -> %s (session %s)\n",
2256 i, ceph_mds_state_name(oldstate),
2257 ceph_mds_state_name(newstate),
2258 session_state_name(s->s_state));
2259
2260 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2261 ceph_mdsmap_get_addr(newmap, i),
2262 sizeof(struct ceph_entity_addr))) {
2263 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2264 /* the session never opened, just close it
2265 * out now */
2266 __wake_requests(mdsc, &s->s_waiting);
2267 __unregister_session(mdsc, s);
2268 } else {
2269 /* just close it */
2270 mutex_unlock(&mdsc->mutex);
2271 mutex_lock(&s->s_mutex);
2272 mutex_lock(&mdsc->mutex);
2273 ceph_con_close(&s->s_con);
2274 mutex_unlock(&s->s_mutex);
2275 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2276 }
2277
2278 /* kick any requests waiting on the recovering mds */
2279 kick_requests(mdsc, i, 1);
2280 } else if (oldstate == newstate) {
2281 continue; /* nothing new with this mds */
2282 }
2283
2284 /*
2285 * send reconnect?
2286 */
2287 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2288 newstate >= CEPH_MDS_STATE_RECONNECT)
2289 send_mds_reconnect(mdsc, i);
2290
2291 /*
2292 * kick requests on any mds that has gone active.
2293 *
2294 * kick requests on cur or forwarder: we may have sent
2295 * the request to mds1, mds1 told us it forwarded it
2296 * to mds2, but then we learn mds1 failed and can't be
2297 * sure it successfully forwarded our request before
2298 * it died.
2299 */
2300 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2301 newstate >= CEPH_MDS_STATE_ACTIVE) {
2302 pr_info("mds%d reconnect completed\n", s->s_mds);
2303 kick_requests(mdsc, i, 1);
2304 ceph_kick_flushing_caps(mdsc, s);
2305 wake_up_session_caps(s, 1);
2306 }
2307 }
2308}
2309
2310
2311
2312/*
2313 * leases
2314 */
2315
2316/*
2317 * caller must hold session s_mutex, dentry->d_lock
2318 */
2319void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2320{
2321 struct ceph_dentry_info *di = ceph_dentry(dentry);
2322
2323 ceph_put_mds_session(di->lease_session);
2324 di->lease_session = NULL;
2325}
2326
/*
 * Handle a CEPH_MSG_CLIENT_LEASE message from an mds: revocation or
 * renewal of a dentry lease.  On REVOKE (or whenever we cannot find
 * the inode/dentry the lease refers to) we ack by reusing the
 * incoming message as a REVOKE_ACK; on RENEW we extend
 * dentry->d_time by the duration the mds granted.
 */
static void handle_lease(struct ceph_mds_client *mdsc,
			 struct ceph_mds_session *session,
			 struct ceph_msg *msg)
{
	struct super_block *sb = mdsc->client->sb;
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct dentry *parent, *dentry;
	struct ceph_dentry_info *di;
	int mds = session->s_mds;
	struct ceph_mds_lease *h = msg->front.iov_base;
	struct ceph_vino vino;
	int mask;
	struct qstr dname;
	int release = 0;	/* reply with a REVOKE_ACK? */

	dout("handle_lease from mds%d\n", mds);

	/* decode */
	/* payload: struct ceph_mds_lease, __le32 name length, name bytes */
	if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
		goto bad;
	vino.ino = le64_to_cpu(h->ino);
	vino.snap = CEPH_NOSNAP;
	mask = le16_to_cpu(h->mask);
	dname.name = (void *)h + sizeof(*h) + sizeof(u32);
	dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
	if (dname.len != get_unaligned_le32(h+1))
		goto bad;

	mutex_lock(&session->s_mutex);
	session->s_seq++;

	/* lookup inode */
	inode = ceph_find_inode(sb, vino);
	dout("handle_lease '%s', mask %d, ino %llx %p\n",
	     ceph_lease_op_name(h->action), mask, vino.ino, inode);
	if (inode == NULL) {
		dout("handle_lease no inode %llx\n", vino.ino);
		goto release;
	}
	ci = ceph_inode(inode);

	/* dentry */
	/* find the dentry via any alias of the (directory) inode */
	parent = d_find_alias(inode);
	if (!parent) {
		dout("no parent dentry on inode %p\n", inode);
		WARN_ON(1);
		goto release; /* hrm... */
	}
	dname.hash = full_name_hash(dname.name, dname.len);
	dentry = d_lookup(parent, &dname);
	dput(parent);
	if (!dentry)
		goto release;

	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	switch (h->action) {
	case CEPH_MDS_LEASE_REVOKE:
		if (di && di->lease_session == session) {
			/* report the seq we are dropping back to the mds */
			h->seq = cpu_to_le32(di->lease_seq);
			__ceph_mdsc_drop_dentry_lease(dentry);
		}
		release = 1;
		break;

	case CEPH_MDS_LEASE_RENEW:
		/* only act on a renewal we actually requested, for the
		 * current cap generation, with no later renew pending */
		if (di && di->lease_session == session &&
		    di->lease_gen == session->s_cap_gen &&
		    di->lease_renew_from &&
		    di->lease_renew_after == 0) {
			unsigned long duration =
				le32_to_cpu(h->duration_ms) * HZ / 1000;

			di->lease_seq = le32_to_cpu(h->seq);
			dentry->d_time = di->lease_renew_from + duration;
			/* schedule the next renewal at half the lease term */
			di->lease_renew_after = di->lease_renew_from +
				(duration >> 1);
			di->lease_renew_from = 0;
		}
		break;
	}
	spin_unlock(&dentry->d_lock);
	dput(dentry);

	if (!release)
		goto out;

release:
	/* let's just reuse the same message */
	h->action = CEPH_MDS_LEASE_REVOKE_ACK;
	ceph_msg_get(msg);	/* extra ref; caller's dispatch() drops one */
	ceph_con_send(&session->s_con, msg);

out:
	iput(inode);
	mutex_unlock(&session->s_mutex);
	return;

bad:
	pr_err("corrupt lease message\n");
	ceph_msg_dump(msg);
}
2430
2431void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2432 struct inode *inode,
2433 struct dentry *dentry, char action,
2434 u32 seq)
2435{
2436 struct ceph_msg *msg;
2437 struct ceph_mds_lease *lease;
2438 int len = sizeof(*lease) + sizeof(u32);
2439 int dnamelen = 0;
2440
2441 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2442 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2443 dnamelen = dentry->d_name.len;
2444 len += dnamelen;
2445
2446 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
2447 if (IS_ERR(msg))
2448 return;
2449 lease = msg->front.iov_base;
2450 lease->action = action;
2451 lease->mask = cpu_to_le16(CEPH_LOCK_DN);
2452 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2453 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2454 lease->seq = cpu_to_le32(seq);
2455 put_unaligned_le32(dnamelen, lease + 1);
2456 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2457
2458 /*
2459 * if this is a preemptive lease RELEASE, no need to
2460 * flush request stream, since the actual request will
2461 * soon follow.
2462 */
2463 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2464
2465 ceph_con_send(&session->s_con, msg);
2466}
2467
/*
 * Preemptively release a lease we expect to invalidate anyway.
 *
 * Despite the historical "dentry is optional" note, both @inode and
 * @dentry are required (see the BUG_ONs below), and @mask must be
 * CEPH_LOCK_DN -- only dentry leases exist.  This is a no-op unless
 * the dentry holds a currently-valid lease.
 */
void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
			     struct dentry *dentry, int mask)
{
	struct ceph_dentry_info *di;
	struct ceph_mds_session *session;
	u32 seq;

	BUG_ON(inode == NULL);
	BUG_ON(dentry == NULL);
	BUG_ON(mask != CEPH_LOCK_DN);

	/* is dentry lease valid? */
	spin_lock(&dentry->d_lock);
	di = ceph_dentry(dentry);
	if (!di || !di->lease_session ||
	    di->lease_session->s_mds < 0 ||
	    di->lease_gen != di->lease_session->s_cap_gen ||
	    !time_before(jiffies, dentry->d_time)) {
		dout("lease_release inode %p dentry %p -- "
		     "no lease on %d\n",
		     inode, dentry, mask);
		spin_unlock(&dentry->d_lock);
		return;
	}

	/* we do have a lease on this dentry; note mds and seq */
	/* take a session ref under d_lock; the lease itself is dropped
	 * before we release the lock */
	session = ceph_get_mds_session(di->lease_session);
	seq = di->lease_seq;
	__ceph_mdsc_drop_dentry_lease(dentry);
	spin_unlock(&dentry->d_lock);

	dout("lease_release inode %p dentry %p mask %d to mds%d\n",
	     inode, dentry, mask, session->s_mds);
	ceph_mdsc_lease_send_msg(session, inode, dentry,
				 CEPH_MDS_LEASE_RELEASE, seq);
	ceph_put_mds_session(session);
}
2509
2510/*
2511 * drop all leases (and dentry refs) in preparation for umount
2512 */
2513static void drop_leases(struct ceph_mds_client *mdsc)
2514{
2515 int i;
2516
2517 dout("drop_leases\n");
2518 mutex_lock(&mdsc->mutex);
2519 for (i = 0; i < mdsc->max_sessions; i++) {
2520 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2521 if (!s)
2522 continue;
2523 mutex_unlock(&mdsc->mutex);
2524 mutex_lock(&s->s_mutex);
2525 mutex_unlock(&s->s_mutex);
2526 ceph_put_mds_session(s);
2527 mutex_lock(&mdsc->mutex);
2528 }
2529 mutex_unlock(&mdsc->mutex);
2530}
2531
2532
2533
2534/*
2535 * delayed work -- periodically trim expired leases, renew caps with mds
2536 */
2537static void schedule_delayed(struct ceph_mds_client *mdsc)
2538{
2539 int delay = 5;
2540 unsigned hz = round_jiffies_relative(HZ * delay);
2541 schedule_delayed_work(&mdsc->delayed_work, hz);
2542}
2543
/*
 * Periodic housekeeping: flush delayed caps, renew caps with each mds
 * (every quarter of the session timeout), flag sessions whose ttl
 * expired as HUNG, resend pending session-close requests, and batch
 * up cap releases.  Reschedules itself when done.
 */
static void delayed_work(struct work_struct *work)
{
	int i;
	struct ceph_mds_client *mdsc =
		container_of(work, struct ceph_mds_client, delayed_work.work);
	int renew_interval;
	int renew_caps;

	dout("mdsc delayed_work\n");
	ceph_check_delayed_caps(mdsc);

	mutex_lock(&mdsc->mutex);
	/* renew caps every session_timeout/4 seconds */
	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
				   mdsc->last_renew_caps);
	if (renew_caps)
		mdsc->last_renew_caps = jiffies;

	for (i = 0; i < mdsc->max_sessions; i++) {
		struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
		if (s == NULL)
			continue;
		if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
			dout("resending session close request for mds%d\n",
			     s->s_mds);
			request_close_session(mdsc, s);
			ceph_put_mds_session(s);
			continue;
		}
		/* no renewal ack before the session ttl expired? */
		if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
			if (s->s_state == CEPH_MDS_SESSION_OPEN) {
				s->s_state = CEPH_MDS_SESSION_HUNG;
				pr_info("mds%d hung\n", s->s_mds);
			}
		}
		if (s->s_state < CEPH_MDS_SESSION_OPEN) {
			/* this mds is failed or recovering, just wait */
			ceph_put_mds_session(s);
			continue;
		}
		/* drop mdsc->mutex before taking s_mutex (lock ordering);
		 * the session ref we hold keeps s valid meanwhile */
		mutex_unlock(&mdsc->mutex);

		mutex_lock(&s->s_mutex);
		if (renew_caps)
			send_renew_caps(mdsc, s);
		else
			ceph_con_keepalive(&s->s_con);
		add_cap_releases(mdsc, s, -1);
		send_cap_releases(mdsc, s);
		mutex_unlock(&s->s_mutex);
		ceph_put_mds_session(s);

		mutex_lock(&mdsc->mutex);
	}
	mutex_unlock(&mdsc->mutex);

	schedule_delayed(mdsc);
}
2602
2603
2604int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2605{
2606 mdsc->client = client;
2607 mutex_init(&mdsc->mutex);
2608 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2609 init_completion(&mdsc->safe_umount_waiters);
2610 init_completion(&mdsc->session_close_waiters);
2611 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2612 mdsc->sessions = NULL;
2613 mdsc->max_sessions = 0;
2614 mdsc->stopping = 0;
2615 init_rwsem(&mdsc->snap_rwsem);
2616 mdsc->snap_realms = RB_ROOT;
2617 INIT_LIST_HEAD(&mdsc->snap_empty);
2618 spin_lock_init(&mdsc->snap_empty_lock);
2619 mdsc->last_tid = 0;
2620 mdsc->request_tree = RB_ROOT;
2621 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
2622 mdsc->last_renew_caps = jiffies;
2623 INIT_LIST_HEAD(&mdsc->cap_delay_list);
2624 spin_lock_init(&mdsc->cap_delay_lock);
2625 INIT_LIST_HEAD(&mdsc->snap_flush_list);
2626 spin_lock_init(&mdsc->snap_flush_lock);
2627 mdsc->cap_flush_seq = 0;
2628 INIT_LIST_HEAD(&mdsc->cap_dirty);
2629 mdsc->num_cap_flushing = 0;
2630 spin_lock_init(&mdsc->cap_dirty_lock);
2631 init_waitqueue_head(&mdsc->cap_flushing_wq);
2632 spin_lock_init(&mdsc->dentry_lru_lock);
2633 INIT_LIST_HEAD(&mdsc->dentry_lru);
2634 return 0;
2635}
2636
2637/*
2638 * Wait for safe replies on open mds requests. If we time out, drop
2639 * all requests from the tree to avoid dangling dentry refs.
2640 */
2641static void wait_requests(struct ceph_mds_client *mdsc)
2642{
2643 struct ceph_mds_request *req;
2644 struct ceph_client *client = mdsc->client;
2645
2646 mutex_lock(&mdsc->mutex);
2647 if (__get_oldest_req(mdsc)) {
2648 mutex_unlock(&mdsc->mutex);
2649
2650 dout("wait_requests waiting for requests\n");
2651 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2652 client->mount_args->mount_timeout * HZ);
2653
2654 /* tear down remaining requests */
2655 mutex_lock(&mdsc->mutex);
2656 while ((req = __get_oldest_req(mdsc))) {
2657 dout("wait_requests timed out on tid %llu\n",
2658 req->r_tid);
2659 __unregister_request(mdsc, req);
2660 }
2661 }
2662 mutex_unlock(&mdsc->mutex);
2663 dout("wait_requests done\n");
2664}
2665
/*
 * called before mount is ro, and before dentries are torn down.
 * (hmm, does this still race with new lookups?)
 */
void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
{
	dout("pre_umount\n");
	mdsc->stopping = 1;	/* flag that shutdown is in progress */

	drop_leases(mdsc);		/* drop dentry leases (and refs) */
	ceph_flush_dirty_caps(mdsc);	/* start writeback of dirty caps */
	wait_requests(mdsc);		/* wait for in-flight mds requests */
}
2679
/*
 * wait for all write mds requests to flush.
 *
 * Walks the request tree in tid order and waits on the safe
 * completion of every write op with tid <= want_tid.  A request ref
 * is taken before dropping mdsc->mutex so the rbnode remains valid
 * for rb_next() after the wait.
 */
static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
{
	struct ceph_mds_request *req = NULL;
	struct rb_node *n;

	mutex_lock(&mdsc->mutex);
	dout("wait_unsafe_requests want %lld\n", want_tid);
	req = __get_oldest_req(mdsc);
	while (req && req->r_tid <= want_tid) {
		if ((req->r_op & CEPH_MDS_OP_WRITE)) {
			/* write op */
			ceph_mdsc_get_request(req);
			mutex_unlock(&mdsc->mutex);
			dout("wait_unsafe_requests wait on %llu (want %llu)\n",
			     req->r_tid, want_tid);
			wait_for_completion(&req->r_safe_completion);
			mutex_lock(&mdsc->mutex);
			n = rb_next(&req->r_node);
			ceph_mdsc_put_request(req);
		} else {
			n = rb_next(&req->r_node);
		}
		if (!n)
			break;
		req = rb_entry(n, struct ceph_mds_request, r_node);
	}
	mutex_unlock(&mdsc->mutex);
	dout("wait_unsafe_requests done\n");
}
2712
2713void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2714{
2715 u64 want_tid, want_flush;
2716
2717 dout("sync\n");
2718 mutex_lock(&mdsc->mutex);
2719 want_tid = mdsc->last_tid;
2720 want_flush = mdsc->cap_flush_seq;
2721 mutex_unlock(&mdsc->mutex);
2722 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
2723
2724 ceph_flush_dirty_caps(mdsc);
2725
2726 wait_unsafe_requests(mdsc, want_tid);
2727 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2728}
2729
2730
/*
 * called after sb is ro.
 *
 * First politely ask every open session to close, retrying (up to
 * the mount timeout) until the mds's acknowledge; then forcibly tear
 * down whatever sessions remain.
 */
void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
{
	struct ceph_mds_session *session;
	int i;
	int n;
	struct ceph_client *client = mdsc->client;
	unsigned long started, timeout = client->mount_args->mount_timeout * HZ;

	dout("close_sessions\n");

	mutex_lock(&mdsc->mutex);

	/* close sessions */
	started = jiffies;
	while (time_before(jiffies, started + timeout)) {
		dout("closing sessions\n");
		n = 0;	/* count of sessions still around this pass */
		for (i = 0; i < mdsc->max_sessions; i++) {
			session = __ceph_lookup_mds_session(mdsc, i);
			if (!session)
				continue;
			/* drop mdsc->mutex to take s_mutex (lock order) */
			mutex_unlock(&mdsc->mutex);
			mutex_lock(&session->s_mutex);
			__close_session(mdsc, session);
			mutex_unlock(&session->s_mutex);
			ceph_put_mds_session(session);
			mutex_lock(&mdsc->mutex);
			n++;
		}
		if (n == 0)
			break;

		if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
			break;

		dout("waiting for sessions to close\n");
		mutex_unlock(&mdsc->mutex);
		wait_for_completion_timeout(&mdsc->session_close_waiters,
					    timeout);
		mutex_lock(&mdsc->mutex);
	}

	/* tear down remaining sessions */
	for (i = 0; i < mdsc->max_sessions; i++) {
		if (mdsc->sessions[i]) {
			session = get_session(mdsc->sessions[i]);
			__unregister_session(mdsc, session);
			mutex_unlock(&mdsc->mutex);
			mutex_lock(&session->s_mutex);
			remove_session_caps(session);
			mutex_unlock(&session->s_mutex);
			ceph_put_mds_session(session);
			mutex_lock(&mdsc->mutex);
		}
	}

	WARN_ON(!list_empty(&mdsc->cap_delay_list));

	mutex_unlock(&mdsc->mutex);

	ceph_cleanup_empty_realms(mdsc);

	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */

	dout("stopped\n");
}
2800
2801void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2802{
2803 dout("stop\n");
2804 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2805 if (mdsc->mdsmap)
2806 ceph_mdsmap_destroy(mdsc->mdsmap);
2807 kfree(mdsc->sessions);
2808}
2809
2810
/*
 * handle mds map update.
 *
 * Decode the incoming map message, ignore stale epochs, swap the new
 * map into place, and let check_new_map() reconcile existing
 * sessions against it.
 */
void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
	u32 epoch;
	u32 maplen;
	void *p = msg->front.iov_base;
	void *end = p + msg->front.iov_len;
	struct ceph_mdsmap *newmap, *oldmap;
	struct ceph_fsid fsid;
	int err = -EINVAL;

	/* wire format: fsid, map epoch, map length, encoded map */
	ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
	ceph_decode_copy(&p, &fsid, sizeof(fsid));
	if (ceph_check_fsid(mdsc->client, &fsid) < 0)
		return;		/* map belongs to a different cluster */
	epoch = ceph_decode_32(&p);
	maplen = ceph_decode_32(&p);
	dout("handle_map epoch %u len %d\n", epoch, (int)maplen);

	/* do we need it? */
	ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
	mutex_lock(&mdsc->mutex);
	if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
		dout("handle_map epoch %u <= our %u\n",
		     epoch, mdsc->mdsmap->m_epoch);
		mutex_unlock(&mdsc->mutex);
		return;
	}

	newmap = ceph_mdsmap_decode(&p, end);
	if (IS_ERR(newmap)) {
		err = PTR_ERR(newmap);
		goto bad_unlock;
	}

	/* swap into place */
	if (mdsc->mdsmap) {
		oldmap = mdsc->mdsmap;
		mdsc->mdsmap = newmap;
		check_new_map(mdsc, newmap, oldmap);
		ceph_mdsmap_destroy(oldmap);
	} else {
		mdsc->mdsmap = newmap;  /* first mds map */
	}
	mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;

	/* requests parked waiting for a usable map can proceed now */
	__wake_requests(mdsc, &mdsc->waiting_for_map);

	mutex_unlock(&mdsc->mutex);
	schedule_delayed(mdsc);
	return;

bad_unlock:
	mutex_unlock(&mdsc->mutex);
bad:
	pr_err("error decoding mdsmap %d\n", err);
	return;
}
2871
2872static struct ceph_connection *con_get(struct ceph_connection *con)
2873{
2874 struct ceph_mds_session *s = con->private;
2875
2876 if (get_session(s)) {
2877 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
2878 return con;
2879 }
2880 dout("mdsc con_get %p FAIL\n", s);
2881 return NULL;
2882}
2883
2884static void con_put(struct ceph_connection *con)
2885{
2886 struct ceph_mds_session *s = con->private;
2887
2888 ceph_put_mds_session(s);
2889 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
2890}
2891
/*
 * if the client is unresponsive for long enough, the mds will kill
 * the session entirely.
 */
static void peer_reset(struct ceph_connection *con)
{
	struct ceph_mds_session *s = con->private;

	/* stub: recovery after an mds-side session reset is not yet
	 * implemented, so all we can do is complain */
	pr_err("mds%d gave us the boot.  IMPLEMENT RECONNECT.\n",
	       s->s_mds);
}
2903
/*
 * Dispatch an incoming message on an mds session connection: drop it
 * if the session is no longer registered, otherwise route it by
 * message type.  Consumes the caller's msg reference in all cases.
 */
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
	struct ceph_mds_session *s = con->private;
	struct ceph_mds_client *mdsc = s->s_mdsc;
	int type = le16_to_cpu(msg->hdr.type);

	/* ignore messages for sessions we have already torn down */
	mutex_lock(&mdsc->mutex);
	if (__verify_registered_session(mdsc, s) < 0) {
		mutex_unlock(&mdsc->mutex);
		goto out;
	}
	mutex_unlock(&mdsc->mutex);

	switch (type) {
	case CEPH_MSG_MDS_MAP:
		ceph_mdsc_handle_map(mdsc, msg);
		break;
	case CEPH_MSG_CLIENT_SESSION:
		handle_session(s, msg);
		break;
	case CEPH_MSG_CLIENT_REPLY:
		handle_reply(s, msg);
		break;
	case CEPH_MSG_CLIENT_REQUEST_FORWARD:
		handle_forward(mdsc, s, msg);
		break;
	case CEPH_MSG_CLIENT_CAPS:
		ceph_handle_caps(s, msg);
		break;
	case CEPH_MSG_CLIENT_SNAP:
		ceph_handle_snap(mdsc, s, msg);
		break;
	case CEPH_MSG_CLIENT_LEASE:
		handle_lease(mdsc, s, msg);
		break;

	default:
		pr_err("received unknown message type %d %s\n", type,
		       ceph_msg_type_name(type));
	}
out:
	ceph_msg_put(msg);
}
2947
2948/*
2949 * authentication
2950 */
2951static int get_authorizer(struct ceph_connection *con,
2952 void **buf, int *len, int *proto,
2953 void **reply_buf, int *reply_len, int force_new)
2954{
2955 struct ceph_mds_session *s = con->private;
2956 struct ceph_mds_client *mdsc = s->s_mdsc;
2957 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2958 int ret = 0;
2959
2960 if (force_new && s->s_authorizer) {
2961 ac->ops->destroy_authorizer(ac, s->s_authorizer);
2962 s->s_authorizer = NULL;
2963 }
2964 if (s->s_authorizer == NULL) {
2965 if (ac->ops->create_authorizer) {
2966 ret = ac->ops->create_authorizer(
2967 ac, CEPH_ENTITY_TYPE_MDS,
2968 &s->s_authorizer,
2969 &s->s_authorizer_buf,
2970 &s->s_authorizer_buf_len,
2971 &s->s_authorizer_reply_buf,
2972 &s->s_authorizer_reply_buf_len);
2973 if (ret)
2974 return ret;
2975 }
2976 }
2977
2978 *proto = ac->protocol;
2979 *buf = s->s_authorizer_buf;
2980 *len = s->s_authorizer_buf_len;
2981 *reply_buf = s->s_authorizer_reply_buf;
2982 *reply_len = s->s_authorizer_reply_buf_len;
2983 return 0;
2984}
2985
2986
2987static int verify_authorizer_reply(struct ceph_connection *con, int len)
2988{
2989 struct ceph_mds_session *s = con->private;
2990 struct ceph_mds_client *mdsc = s->s_mdsc;
2991 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2992
2993 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
2994}
2995
2996static int invalidate_authorizer(struct ceph_connection *con)
2997{
2998 struct ceph_mds_session *s = con->private;
2999 struct ceph_mds_client *mdsc = s->s_mdsc;
3000 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3001
3002 if (ac->ops->invalidate_authorizer)
3003 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3004
3005 return ceph_monc_validate_auth(&mdsc->client->monc);
3006}
3007
3008const static struct ceph_connection_operations mds_con_ops = {
3009 .get = con_get,
3010 .put = con_put,
3011 .dispatch = dispatch,
3012 .get_authorizer = get_authorizer,
3013 .verify_authorizer_reply = verify_authorizer_reply,
3014 .invalidate_authorizer = invalidate_authorizer,
3015 .peer_reset = peer_reset,
3016};
3017
3018
3019
3020
3021/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..961cc6f65878
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/rbtree.h>
9#include <linux/spinlock.h>
10
11#include "types.h"
12#include "messenger.h"
13#include "mdsmap.h"
14
15/*
16 * Some lock dependencies:
17 *
18 * session->s_mutex
19 * mdsc->mutex
20 *
21 * mdsc->snap_rwsem
22 *
23 * inode->i_lock
24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock
26 *
27 */
28
29struct ceph_client;
30struct ceph_cap;
31
/*
 * parsed info about a single inode. pointers are into the encoded
 * on-wire structures within the mds reply message payload.
 */
struct ceph_mds_reply_info_in {
	struct ceph_mds_reply_inode *in;
	u32 symlink_len;
	char *symlink;		/* symlink target (not NUL-terminated) */
	u32 xattr_len;
	char *xattr_data;	/* encoded xattr blob */
};
43
/*
 * parsed info about an mds reply, including information about the
 * target inode and/or its parent directory and dentry, and directory
 * contents (for readdir results).
 */
struct ceph_mds_reply_info_parsed {
	struct ceph_mds_reply_head *head;

	/* trace: parent dir, dentry (+lease), and target inode */
	struct ceph_mds_reply_info_in diri, targeti;
	struct ceph_mds_reply_dirfrag *dirfrag;
	char *dname;
	u32 dname_len;
	struct ceph_mds_reply_lease *dlease;

	/* readdir results: parallel arrays of dir_nr entries */
	struct ceph_mds_reply_dirfrag *dir_dir;
	int dir_nr;
	char **dir_dname;
	u32 *dir_dname_len;
	struct ceph_mds_reply_lease **dir_dlease;
	struct ceph_mds_reply_info_in *dir_in;
	u8 dir_complete, dir_end;

	/* encoded blob describing snapshot contexts for certain
	   operations (e.g., open) */
	void *snapblob;
	int snapblob_len;
};
71
72
/*
 * cap releases are batched and sent to the MDS en masse.
 *
 * number of cap release records that fit in a page-sized message
 * payload after the ceph_mds_cap_release header.
 */
#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE -			\
				sizeof(struct ceph_mds_cap_release)) /	\
				sizeof(struct ceph_mds_cap_item))
79
80
/*
 * state associated with each MDS<->client session
 *
 * NEW -> OPENING -> OPEN in the normal case; CLOSING during umount;
 * HUNG when renewals go unanswered past the session ttl (see
 * delayed_work()); RESTARTING after the mds address changes in a new
 * mdsmap (see check_new_map()).
 */
enum {
	CEPH_MDS_SESSION_NEW = 1,
	CEPH_MDS_SESSION_OPENING = 2,
	CEPH_MDS_SESSION_OPEN = 3,
	CEPH_MDS_SESSION_HUNG = 4,
	CEPH_MDS_SESSION_CLOSING = 5,
	CEPH_MDS_SESSION_RESTARTING = 6,
	CEPH_MDS_SESSION_RECONNECTING = 7,
};
93
struct ceph_mds_session {
	struct ceph_mds_client *s_mdsc;	/* back-pointer to owning client */
	int s_mds;			/* mds rank this session is with */
	int s_state;			/* CEPH_MDS_SESSION_* */
	unsigned long s_ttl;      /* time until mds kills us */
	u64 s_seq;         /* incoming msg seq # */
	struct mutex s_mutex;    /* serialize session messages */

	struct ceph_connection s_con;

	/* authorizer state for this session's connection */
	struct ceph_authorizer *s_authorizer;
	void *s_authorizer_buf, *s_authorizer_reply_buf;
	size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;

	/* protected by s_cap_lock */
	spinlock_t s_cap_lock;
	u32 s_cap_gen;  /* inc each time we get mds stale msg */
	unsigned long s_cap_ttl; /* when session caps expire */
	struct list_head s_caps;     /* all caps issued by this session */
	int s_nr_caps, s_trim_caps;
	int s_num_cap_releases;
	struct list_head s_cap_releases; /* waiting cap_release messages */
	struct list_head s_cap_releases_done; /* ready to send */
	struct ceph_cap *s_cap_iterator;

	/* protected by mutex */
	struct list_head s_cap_flushing;     /* inodes w/ flushing caps */
	struct list_head s_cap_snaps_flushing;
	unsigned long s_renew_requested; /* last time we sent a renew req */
	u64 s_renew_seq;

	atomic_t s_ref;		/* refcount; see ceph_get/put_mds_session() */
	struct list_head s_waiting;  /* waiting requests */
	struct list_head s_unsafe;   /* unsafe requests */
};
129
/*
 * modes of choosing which MDS to send a request to
 */
enum {
	USE_ANY_MDS,	/* any up mds will do */
	USE_RANDOM_MDS,	/* pick an up mds at random */
	USE_AUTH_MDS,   /* prefer authoritative mds for this metadata item */
};
138
139struct ceph_mds_request;
140struct ceph_mds_client;
141
142/*
143 * request completion callback
144 */
145typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
146 struct ceph_mds_request *req);
147
/*
 * an in-flight mds request
 *
 * NOTE(review): fields appear to be protected by mdsc->mutex unless
 * otherwise noted -- confirm against mds_client.c before relying on
 * this.
 */
struct ceph_mds_request {
	u64 r_tid;                   /* transaction id */
	struct rb_node r_node;	     /* node in mdsc->request_tree */

	int r_op;                    /* mds op code */
	int r_mds;

	/* operation on what? */
	struct inode *r_inode;              /* arg1 */
	struct dentry *r_dentry;            /* arg1 */
	struct dentry *r_old_dentry;        /* arg2: rename from or link from */
	char *r_path1, *r_path2;
	struct ceph_vino r_ino1, r_ino2;

	struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
	struct inode *r_target_inode;       /* resulting inode */

	union ceph_mds_request_args r_args;
	int r_fmode;        /* file mode, if expecting cap */

	/* for choosing which mds to send this request to */
	int r_direct_mode;	/* USE_*_MDS */
	u32 r_direct_hash;      /* choose dir frag based on this dentry hash */
	bool r_direct_is_hash;  /* true if r_direct_hash is valid */

	/* data payload is used for xattr ops */
	struct page **r_pages;
	int r_num_pages;
	int r_data_len;

	/* what caps shall we drop? */
	int r_inode_drop, r_inode_unless;
	int r_dentry_drop, r_dentry_unless;
	int r_old_dentry_drop, r_old_dentry_unless;
	struct inode *r_old_inode;
	int r_old_inode_drop, r_old_inode_unless;

	struct ceph_msg  *r_request;  /* original request */
	struct ceph_msg  *r_reply;
	struct ceph_mds_reply_info_parsed r_reply_info;
	int r_err;
	bool r_aborted;

	unsigned long r_timeout;  /* optional.  jiffies */
	unsigned long r_started;  /* start time to measure timeout against */
	unsigned long r_request_started; /* start time for mds request only,
					    used to measure lease durations */

	/* link unsafe requests to parent directory, for fsync */
	struct inode	*r_unsafe_dir;
	struct list_head r_unsafe_dir_item;

	struct ceph_mds_session *r_session;

	int               r_attempts;   /* resend attempts */
	int               r_num_fwd;    /* number of forward attempts */
	int               r_num_stale;
	int               r_resend_mds; /* mds to resend to next, if any*/

	struct kref       r_kref;
	struct list_head  r_wait;
	struct completion r_completion;
	struct completion r_safe_completion;	/* safe reply committed */
	ceph_mds_request_callback_t r_callback;
	struct list_head  r_unsafe_item;  /* per-session unsafe list item */
	bool		  r_got_unsafe, r_got_safe;

	bool              r_did_prepopulate;
	u32               r_readdir_offset;

	struct ceph_cap_reservation r_caps_reservation;
	int r_num_caps;
};
224
/*
 * mds client state
 *
 * One instance per mounted ceph client; tracks sessions with each
 * mds rank, pending requests, snap realms, and cap flushing state.
 */
struct ceph_mds_client {
	struct ceph_client      *client;
	struct mutex            mutex;         /* all nested structures */

	struct ceph_mdsmap      *mdsmap;
	struct completion       safe_umount_waiters, session_close_waiters;
	struct list_head        waiting_for_map;	/* reqs parked for a map */

	struct ceph_mds_session **sessions;    /* NULL for mds if no session */
	int                     max_sessions;  /* len of s_mds_sessions */
	int                     stopping;      /* true if shutting down */

	/*
	 * snap_rwsem will cover cap linkage into snaprealms, and
	 * realm snap contexts.  (later, we can do per-realm snap
	 * contexts locks..)  the empty list contains realms with no
	 * references (implying they contain no inodes with caps) that
	 * should be destroyed.
	 */
	struct rw_semaphore     snap_rwsem;
	struct rb_root          snap_realms;
	struct list_head        snap_empty;
	spinlock_t              snap_empty_lock;  /* protect snap_empty */

	u64                    last_tid;      /* most recent mds request */
	struct rb_root         request_tree;  /* pending mds requests */
	struct delayed_work    delayed_work;  /* delayed work */
	unsigned long    last_renew_caps;  /* last time we renewed our caps */
	struct list_head cap_delay_list;   /* caps with delayed release */
	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
	spinlock_t       snap_flush_lock;

	u64               cap_flush_seq;
	struct list_head  cap_dirty;        /* inodes with dirty caps */
	int               num_cap_flushing; /* # caps we are flushing */
	spinlock_t        cap_dirty_lock;   /* protects above items */
	wait_queue_head_t cap_flushing_wq;

#ifdef CONFIG_DEBUG_FS
	struct dentry 	  *debugfs_file;
#endif

	spinlock_t	  dentry_lru_lock;
	struct list_head  dentry_lru;	/* LRU of dentries with leases */
	int		  num_dentry;
};
275
276extern const char *ceph_mds_op_name(int op);
277
278extern struct ceph_mds_session *
279__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
280
/* take a session reference; paired with ceph_put_mds_session() */
static inline struct ceph_mds_session *
ceph_get_mds_session(struct ceph_mds_session *s)
{
	atomic_inc(&s->s_ref);
	return s;
}
287
288extern void ceph_put_mds_session(struct ceph_mds_session *s);
289
290extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
291 struct ceph_msg *msg, int mds);
292
293extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
294 struct ceph_client *client);
295extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
296extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
297
298extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
299
300extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode,
302 struct dentry *dn, int mask);
303
304extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
307 struct ceph_mds_request *req);
308extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
309 struct inode *dir,
310 struct ceph_mds_request *req);
/* take a request reference; paired with ceph_mdsc_put_request() */
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
	kref_get(&req->r_kref);
}
315extern void ceph_mdsc_release_request(struct kref *kref);
/* drop a request reference; frees it via ceph_mdsc_release_request()
 * when the count hits zero */
static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
{
	kref_put(&req->r_kref, ceph_mdsc_release_request);
}
320
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
324 int stop_on_nosnap);
325
326extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
327extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
328 struct inode *inode,
329 struct dentry *dentry, char action,
330 u32 seq);
331
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg);
334
335#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
17 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
23 char r;
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
32 /* pick */
33 get_random_bytes(&r, 1);
34 n = r % n;
35 i = 0;
36 for (i = 0; n > 0; i++, n--)
37 while (m->m_info[i].state <= 0)
38 i++;
39
40 return i;
41}
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 const void *start = *p;
53 int i, j, n;
54 int err = -EINVAL;
55 u16 version;
56
57 m = kzalloc(sizeof(*m), GFP_NOFS);
58 if (m == NULL)
59 return ERR_PTR(-ENOMEM);
60
61 ceph_decode_16_safe(p, end, version, bad);
62
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p);
65 m->m_client_epoch = ceph_decode_32(p);
66 m->m_last_failure = ceph_decode_32(p);
67 m->m_root = ceph_decode_32(p);
68 m->m_session_timeout = ceph_decode_32(p);
69 m->m_session_autoclose = ceph_decode_32(p);
70 m->m_max_file_size = ceph_decode_64(p);
71 m->m_max_mds = ceph_decode_32(p);
72
73 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
74 if (m->m_info == NULL)
75 goto badmem;
76
77 /* pick out active nodes from mds_info (state > 0) */
78 n = ceph_decode_32(p);
79 for (i = 0; i < n; i++) {
80 u64 global_id;
81 u32 namelen;
82 s32 mds, inc, state;
83 u64 state_seq;
84 u8 infoversion;
85 struct ceph_entity_addr addr;
86 u32 num_export_targets;
87 void *pexport_targets = NULL;
88
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p);
91 infoversion = ceph_decode_8(p);
92 *p += sizeof(u64);
93 namelen = ceph_decode_32(p); /* skip mds name */
94 *p += namelen;
95
96 ceph_decode_need(p, end,
97 4*sizeof(u32) + sizeof(u64) +
98 sizeof(addr) + sizeof(struct ceph_timespec),
99 bad);
100 mds = ceph_decode_32(p);
101 inc = ceph_decode_32(p);
102 state = ceph_decode_32(p);
103 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec);
107 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen;
110 if (infoversion >= 2) {
111 ceph_decode_32_safe(p, end, num_export_targets, bad);
112 pexport_targets = *p;
113 *p += num_export_targets * sizeof(u32);
114 } else {
115 num_export_targets = 0;
116 }
117
118 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
119 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
120 ceph_mds_state_name(state));
121 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
122 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr;
125 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) {
127 m->m_info[mds].export_targets =
128 kcalloc(num_export_targets, sizeof(u32),
129 GFP_NOFS);
130 for (j = 0; j < num_export_targets; j++)
131 m->m_info[mds].export_targets[j] =
132 ceph_decode_32(&pexport_targets);
133 } else {
134 m->m_info[mds].export_targets = NULL;
135 }
136 }
137 }
138
139 /* pg_pools */
140 ceph_decode_32_safe(p, end, n, bad);
141 m->m_num_data_pg_pools = n;
142 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
143 if (!m->m_data_pg_pools)
144 goto badmem;
145 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
146 for (i = 0; i < n; i++)
147 m->m_data_pg_pools[i] = ceph_decode_32(p);
148 m->m_cas_pg_pool = ceph_decode_32(p);
149
150 /* ok, we don't care about the rest. */
151 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
152 return m;
153
154badmem:
155 err = -ENOMEM;
156bad:
157 pr_err("corrupt mdsmap\n");
158 print_hex_dump(KERN_DEBUG, "mdsmap: ",
159 DUMP_PREFIX_OFFSET, 16, 1,
160 start, end - start, true);
161 ceph_mdsmap_destroy(m);
162 return ERR_PTR(-EINVAL);
163}
164
165void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
166{
167 int i;
168
169 for (i = 0; i < m->m_max_mds; i++)
170 kfree(m->m_info[i].export_targets);
171 kfree(m->m_info);
172 kfree(m->m_data_pg_pools);
173 kfree(m);
174}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
 * we limit fields to those the client actually cares about
10 */
struct ceph_mds_info {
	u64 global_id;			/* cluster-wide unique mds id */
	struct ceph_entity_addr addr;	/* where to reach this mds */
	s32 state;			/* CEPH_MDS_STATE_*; > 0 means "up" */
	int num_export_targets;
	u32 *export_targets;		/* mds ranks; NULL when none */
};
18
struct ceph_mdsmap {
	u32 m_epoch, m_client_epoch, m_last_failure;
	u32 m_root;
	u32 m_session_timeout;          /* seconds */
	u32 m_session_autoclose;        /* seconds */
	u64 m_max_file_size;
	u32 m_max_mds;                  /* number of entries in m_info */
	struct ceph_mds_info *m_info;   /* per-rank info, indexed by mds rank */

	/* which object pools file data can be stored in */
	int m_num_data_pg_pools;
	u32 *m_data_pg_pools;           /* data pg pool ids */
	u32 m_cas_pg_pool;              /* cas pg pool id */
};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
/* state of mds rank @w; ranks beyond the map are DNE ("does not exist") */
static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
{
	BUG_ON(w < 0);
	if (w >= m->m_max_mds)
		return CEPH_MDS_STATE_DNE;
	return m->m_info[w].state;
}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..781656a49bf8
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2240 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/socket.h>
10#include <linux/string.h>
11#include <net/tcp.h>
12
13#include "super.h"
14#include "messenger.h"
15#include "decode.h"
16#include "pagelist.h"
17
18/*
19 * Ceph uses the messenger to exchange ceph_msg messages with other
20 * hosts in the system. The messenger provides ordered and reliable
21 * delivery. We tolerate TCP disconnects by reconnecting (with
22 * exponential backoff) in the case of a fault (disconnection, bad
23 * crc, protocol error). Acks allow sent messages to be discarded by
24 * the sender.
25 */
26
/* static tag bytes (protocol control messages) */
static char tag_msg = CEPH_MSGR_TAG_MSG;
static char tag_ack = CEPH_MSGR_TAG_ACK;
static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;


/* forward declarations for the connection work/fault path */
static void queue_con(struct ceph_connection *con);
static void con_work(struct work_struct *);
static void ceph_fault(struct ceph_connection *con);
36
37const char *ceph_name_type_str(int t)
38{
39 switch (t) {
40 case CEPH_ENTITY_TYPE_MON: return "mon";
41 case CEPH_ENTITY_TYPE_MDS: return "mds";
42 case CEPH_ENTITY_TYPE_OSD: return "osd";
43 case CEPH_ENTITY_TYPE_CLIENT: return "client";
44 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
45 default: return "???";
46 }
47}
48
49/*
50 * nicely render a sockaddr as a string.
51 */
52#define MAX_ADDR_STR 20
53static char addr_str[MAX_ADDR_STR][40];
54static DEFINE_SPINLOCK(addr_str_lock);
55static int last_addr_str;
56
57const char *pr_addr(const struct sockaddr_storage *ss)
58{
59 int i;
60 char *s;
61 struct sockaddr_in *in4 = (void *)ss;
62 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
63 struct sockaddr_in6 *in6 = (void *)ss;
64
65 spin_lock(&addr_str_lock);
66 i = last_addr_str++;
67 if (last_addr_str == MAX_ADDR_STR)
68 last_addr_str = 0;
69 spin_unlock(&addr_str_lock);
70 s = addr_str[i];
71
72 switch (ss->ss_family) {
73 case AF_INET:
74 sprintf(s, "%u.%u.%u.%u:%u",
75 (unsigned int)quad[0],
76 (unsigned int)quad[1],
77 (unsigned int)quad[2],
78 (unsigned int)quad[3],
79 (unsigned int)ntohs(in4->sin_port));
80 break;
81
82 case AF_INET6:
83 sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
84 in6->sin6_addr.s6_addr16[0],
85 in6->sin6_addr.s6_addr16[1],
86 in6->sin6_addr.s6_addr16[2],
87 in6->sin6_addr.s6_addr16[3],
88 in6->sin6_addr.s6_addr16[4],
89 in6->sin6_addr.s6_addr16[5],
90 in6->sin6_addr.s6_addr16[6],
91 in6->sin6_addr.s6_addr16[7],
92 (unsigned int)ntohs(in6->sin6_port));
93 break;
94
95 default:
96 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
97 }
98
99 return s;
100}
101
/*
 * Cache a wire-format copy (via ceph_encode_addr) of our own address;
 * prepare_write_banner() sends it during the handshake.
 */
static void encode_my_addr(struct ceph_messenger *msgr)
{
	memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
	ceph_encode_addr(&msgr->my_enc_addr);
}
107
108/*
109 * work queue for all reading and writing to/from the socket.
110 */
111struct workqueue_struct *ceph_msgr_wq;
112
113int __init ceph_msgr_init(void)
114{
115 ceph_msgr_wq = create_workqueue("ceph-msgr");
116 if (IS_ERR(ceph_msgr_wq)) {
117 int ret = PTR_ERR(ceph_msgr_wq);
118 pr_err("msgr_init failed to create workqueue: %d\n", ret);
119 ceph_msgr_wq = NULL;
120 return ret;
121 }
122 return 0;
123}
124
/* tear down the messenger workqueue created in ceph_msgr_init() */
void ceph_msgr_exit(void)
{
	destroy_workqueue(ceph_msgr_wq);
}
129
/*
 * socket callback functions
 */

/* data available on socket, or listen socket received a connect */
static void ceph_data_ready(struct sock *sk, int count_unused)
{
	struct ceph_connection *con =
		(struct ceph_connection *)sk->sk_user_data;
	/* skip queueing work once the peer has already sent its FIN */
	if (sk->sk_state != TCP_CLOSE_WAIT) {
		dout("ceph_data_ready on %p state = %lu, queueing work\n",
		     con, con->state);
		queue_con(con);
	}
}
145
/* socket has buffer space for writing */
static void ceph_write_space(struct sock *sk)
{
	struct ceph_connection *con =
		(struct ceph_connection *)sk->sk_user_data;

	/* only queue to workqueue if there is data we want to write. */
	if (test_bit(WRITE_PENDING, &con->state)) {
		dout("ceph_write_space %p queueing write work\n", con);
		queue_con(con);
	} else {
		dout("ceph_write_space %p nothing to write\n", con);
	}

	/* since we have our own write_space, clear the SOCK_NOSPACE flag */
	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
}
163
/* socket's state has changed */
static void ceph_state_change(struct sock *sk)
{
	struct ceph_connection *con =
		(struct ceph_connection *)sk->sk_user_data;

	dout("ceph_state_change %p state = %lu sk_state = %u\n",
	     con, con->state, sk->sk_state);

	/* nothing to do once the connection is marked closed */
	if (test_bit(CLOSED, &con->state))
		return;

	switch (sk->sk_state) {
	case TCP_CLOSE:
		dout("ceph_state_change TCP_CLOSE\n");
		/* fallthrough: a full close is handled like a half close */
	case TCP_CLOSE_WAIT:
		dout("ceph_state_change TCP_CLOSE_WAIT\n");
		/* record the fault only once; the worker handles it */
		if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
			if (test_bit(CONNECTING, &con->state))
				con->error_msg = "connection failed";
			else
				con->error_msg = "socket closed";
			queue_con(con);
		}
		break;
	case TCP_ESTABLISHED:
		dout("ceph_state_change TCP_ESTABLISHED\n");
		queue_con(con);
		break;
	}
}
195
/*
 * set up socket callbacks
 */
static void set_sock_callbacks(struct socket *sock,
			       struct ceph_connection *con)
{
	struct sock *sk = sock->sk;
	/* stash the connection where the callbacks above can find it */
	sk->sk_user_data = (void *)con;
	sk->sk_data_ready = ceph_data_ready;
	sk->sk_write_space = ceph_write_space;
	sk->sk_state_change = ceph_state_change;
}
208
209
/*
 * socket helpers
 */

/*
 * initiate connection to a remote socket.
 */
static struct socket *ceph_tcp_connect(struct ceph_connection *con)
{
	struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
	struct socket *sock;
	int ret;

	BUG_ON(con->sock);
	/* NOTE(review): AF_INET only, and sizeof(*paddr) below would be
	 * too small for a sockaddr_in6 -- revisit for IPv6 support */
	ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
	if (ret)
		return ERR_PTR(ret);
	con->sock = sock;
	/* GFP_NOFS: avoid recursing into the fs from socket allocations */
	sock->sk->sk_allocation = GFP_NOFS;

	set_sock_callbacks(sock, con);

	dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));

	/* non-blocking connect; completion arrives via ceph_state_change */
	ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
	if (ret == -EINPROGRESS) {
		dout("connect %s EINPROGRESS sk_state = %u\n",
		     pr_addr(&con->peer_addr.in_addr),
		     sock->sk->sk_state);
		ret = 0;
	}
	if (ret < 0) {
		pr_err("connect %s error %d\n",
		       pr_addr(&con->peer_addr.in_addr), ret);
		sock_release(sock);
		con->sock = NULL;
		con->error_msg = "connect error";
	}

	if (ret < 0)
		return ERR_PTR(ret);
	return sock;
}
253
254static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
255{
256 struct kvec iov = {buf, len};
257 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
258
259 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
260}
261
262/*
263 * write something. @more is true if caller will be sending more data
264 * shortly.
265 */
266static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
267 size_t kvlen, size_t len, int more)
268{
269 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
270
271 if (more)
272 msg.msg_flags |= MSG_MORE;
273 else
274 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
275
276 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
277}
278
279
/*
 * Shutdown/close the socket for the given connection.
 */
static int con_close_socket(struct ceph_connection *con)
{
	int rc;

	dout("con_close_socket on %p sock %p\n", con, con->sock);
	if (!con->sock)
		return 0;
	/*
	 * Hold SOCK_CLOSED across the shutdown so ceph_state_change()'s
	 * test_and_set_bit sees it already set and does not report our
	 * own teardown as a fault.
	 */
	set_bit(SOCK_CLOSED, &con->state);
	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
	sock_release(con->sock);
	con->sock = NULL;
	clear_bit(SOCK_CLOSED, &con->state);
	return rc;
}
297
/*
 * Discard a queued message: unlink it and drop the list's reference.
 * (The connection-reset work itself is in reset_connection(), below.)
 */
static void ceph_msg_remove(struct ceph_msg *msg)
{
	list_del_init(&msg->list_head);
	ceph_msg_put(msg);
}
/* drain and drop every message on @head */
static void ceph_msg_remove_list(struct list_head *head)
{
	while (!list_empty(head)) {
		struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
							list_head);
		ceph_msg_remove(msg);
	}
}
315
static void reset_connection(struct ceph_connection *con)
{
	/* reset connection, out_queue, msg_ and connect_seq */
	/* discard existing out_queue and msg_seq */
	ceph_msg_remove_list(&con->out_queue);
	ceph_msg_remove_list(&con->out_sent);

	/* drop any partially received incoming message */
	if (con->in_msg) {
		ceph_msg_put(con->in_msg);
		con->in_msg = NULL;
	}

	con->connect_seq = 0;
	con->out_seq = 0;
	/* drop the message being written out, if any */
	if (con->out_msg) {
		ceph_msg_put(con->out_msg);
		con->out_msg = NULL;
	}
	con->in_seq = 0;
}
336
/*
 * mark a peer down.  drop any open connections.
 */
void ceph_con_close(struct ceph_connection *con)
{
	dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
	set_bit(CLOSED, &con->state);  /* in case there's queued work */
	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
	clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
	clear_bit(KEEPALIVE_PENDING, &con->state);
	clear_bit(WRITE_PENDING, &con->state);
	/* NOTE(review): the bit flips above run outside con->mutex;
	 * presumably racing workers re-check CLOSED -- verify */
	mutex_lock(&con->mutex);
	reset_connection(con);
	cancel_delayed_work(&con->work);
	mutex_unlock(&con->mutex);
	queue_con(con);  /* so the worker can observe CLOSED */
}
354
/*
 * Reopen a closed connection, with a new peer address.
 */
void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
{
	dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
	set_bit(OPENING, &con->state);
	clear_bit(CLOSED, &con->state);
	memcpy(&con->peer_addr, addr, sizeof(*addr));
	con->delay = 0;      /* reset backoff memory */
	queue_con(con);
}
367
/*
 * generic get/put
 */
/* take a reference, unless the count has already hit zero (teardown
 * in progress), in which case return NULL */
struct ceph_connection *ceph_con_get(struct ceph_connection *con)
{
	/* the douts read nref without synchronization -- debug output only */
	dout("con_get %p nref = %d -> %d\n", con,
	     atomic_read(&con->nref), atomic_read(&con->nref) + 1);
	if (atomic_inc_not_zero(&con->nref))
		return con;
	return NULL;
}

/* drop a reference; the last put frees the connection */
void ceph_con_put(struct ceph_connection *con)
{
	dout("con_put %p nref = %d -> %d\n", con,
	     atomic_read(&con->nref), atomic_read(&con->nref) - 1);
	BUG_ON(atomic_read(&con->nref) == 0);
	if (atomic_dec_and_test(&con->nref)) {
		BUG_ON(con->sock);	/* socket must be closed by now */
		kfree(con);
	}
}
390
/*
 * initialize a new connection.
 */
void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
{
	dout("con_init %p\n", con);
	memset(con, 0, sizeof(*con));
	/* start with a single reference for the caller */
	atomic_set(&con->nref, 1);
	con->msgr = msgr;
	mutex_init(&con->mutex);
	INIT_LIST_HEAD(&con->out_queue);
	INIT_LIST_HEAD(&con->out_sent);
	INIT_DELAYED_WORK(&con->work, con_work);
}
405
406
407/*
408 * We maintain a global counter to order connection attempts. Get
409 * a unique seq greater than @gt.
410 */
411static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
412{
413 u32 ret;
414
415 spin_lock(&msgr->global_seq_lock);
416 if (msgr->global_seq < gt)
417 msgr->global_seq = gt;
418 ret = ++msgr->global_seq;
419 spin_unlock(&msgr->global_seq_lock);
420 return ret;
421}
422
423
/*
 * Prepare footer for currently outgoing message, and finish things
 * off.  Assumes out_kvec* are already valid.. we just add on to the
 * end.  @v is the index of the next free out_kvec slot.
 */
static void prepare_write_message_footer(struct ceph_connection *con, int v)
{
	struct ceph_msg *m = con->out_msg;

	dout("prepare_write_message_footer %p\n", con);
	con->out_kvec_is_msg = true;
	con->out_kvec[v].iov_base = &m->footer;
	con->out_kvec[v].iov_len = sizeof(m->footer);
	con->out_kvec_bytes += sizeof(m->footer);
	con->out_kvec_left++;
	con->out_more = m->more_to_follow;
	con->out_msg_done = true;	/* footer queued == message fully queued */
}
441
/*
 * Prepare headers for the next outgoing message.
 */
static void prepare_write_message(struct ceph_connection *con)
{
	struct ceph_msg *m;
	int v = 0;	/* next free out_kvec slot */

	con->out_kvec_bytes = 0;
	con->out_kvec_is_msg = true;
	con->out_msg_done = false;

	/* Sneak an ack in there first?  If we can get it into the same
	 * TCP packet that's a good thing. */
	if (con->in_seq > con->in_seq_acked) {
		con->in_seq_acked = con->in_seq;
		con->out_kvec[v].iov_base = &tag_ack;
		con->out_kvec[v++].iov_len = 1;
		con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
		con->out_kvec[v].iov_base = &con->out_temp_ack;
		con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
		con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
	}

	m = list_first_entry(&con->out_queue,
		       struct ceph_msg, list_head);
	con->out_msg = m;
	if (test_bit(LOSSYTX, &con->state)) {
		/* lossy connection: never resend, so don't keep it around */
		list_del_init(&m->list_head);
	} else {
		/* put message on sent list */
		ceph_msg_get(m);
		list_move_tail(&m->list_head, &con->out_sent);
	}

	m->hdr.seq = cpu_to_le64(++con->out_seq);

	dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
	     m, con->out_seq, le16_to_cpu(m->hdr.type),
	     le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
	     le32_to_cpu(m->hdr.data_len),
	     m->nr_pages);
	BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);

	/* tag + hdr + front + middle */
	con->out_kvec[v].iov_base = &tag_msg;
	con->out_kvec[v++].iov_len = 1;
	con->out_kvec[v].iov_base = &m->hdr;
	con->out_kvec[v++].iov_len = sizeof(m->hdr);
	con->out_kvec[v++] = m->front;
	if (m->middle)
		con->out_kvec[v++] = m->middle->vec;
	con->out_kvec_left = v;
	con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
		(m->middle ? m->middle->vec.iov_len : 0);
	con->out_kvec_cur = con->out_kvec;

	/* fill in crc (except data pages), footer */
	con->out_msg->hdr.crc =
		cpu_to_le32(crc32c(0, (void *)&m->hdr,
				      sizeof(m->hdr) - sizeof(m->hdr.crc)));
	con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
	con->out_msg->footer.front_crc =
		cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
	if (m->middle)
		con->out_msg->footer.middle_crc =
			cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
					   m->middle->vec.iov_len));
	else
		con->out_msg->footer.middle_crc = 0;
	con->out_msg->footer.data_crc = 0;
	dout("prepare_write_message front_crc %u data_crc %u\n",
	     le32_to_cpu(con->out_msg->footer.front_crc),
	     le32_to_cpu(con->out_msg->footer.middle_crc));

	/* is there a data payload? */
	if (le32_to_cpu(m->hdr.data_len) > 0) {
		/* initialize page iterator; data crc is computed as the
		 * pages are written (write_partial_msg_pages) */
		con->out_msg_pos.page = 0;
		con->out_msg_pos.page_pos =
			le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
		con->out_msg_pos.data_pos = 0;
		con->out_msg_pos.did_page_crc = 0;
		con->out_more = 1;  /* data + footer will follow */
	} else {
		/* no, queue up footer too and be done */
		prepare_write_message_footer(con, v);
	}

	set_bit(WRITE_PENDING, &con->state);
}
533
/*
 * Prepare an ack.
 */
static void prepare_write_ack(struct ceph_connection *con)
{
	dout("prepare_write_ack %p %llu -> %llu\n", con,
	     con->in_seq_acked, con->in_seq);
	con->in_seq_acked = con->in_seq;

	/* ack tag byte followed by the acked seq (little-endian) */
	con->out_kvec[0].iov_base = &tag_ack;
	con->out_kvec[0].iov_len = 1;
	con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
	con->out_kvec[1].iov_base = &con->out_temp_ack;
	con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
	con->out_kvec_left = 2;
	con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
	con->out_kvec_cur = con->out_kvec;
	con->out_more = 1;  /* more will follow.. eventually.. */
	set_bit(WRITE_PENDING, &con->state);
}
554
/*
 * Prepare to write keepalive byte.
 */
static void prepare_write_keepalive(struct ceph_connection *con)
{
	dout("prepare_write_keepalive %p\n", con);
	/* a keepalive is just the single tag byte */
	con->out_kvec[0].iov_base = &tag_keepalive;
	con->out_kvec[0].iov_len = 1;
	con->out_kvec_left = 1;
	con->out_kvec_bytes = 1;
	con->out_kvec_cur = con->out_kvec;
	set_bit(WRITE_PENDING, &con->state);
}
568
569/*
570 * Connection negotiation.
571 */
572
573static void prepare_connect_authorizer(struct ceph_connection *con)
574{
575 void *auth_buf;
576 int auth_len = 0;
577 int auth_protocol = 0;
578
579 mutex_unlock(&con->mutex);
580 if (con->ops->get_authorizer)
581 con->ops->get_authorizer(con, &auth_buf, &auth_len,
582 &auth_protocol, &con->auth_reply_buf,
583 &con->auth_reply_buf_len,
584 con->auth_retry);
585 mutex_lock(&con->mutex);
586
587 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
588 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
589
590 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
591 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
592 con->out_kvec_left++;
593 con->out_kvec_bytes += auth_len;
594}
595
/*
 * We connected to a peer and are saying hello.
 */
static void prepare_write_banner(struct ceph_messenger *msgr,
				 struct ceph_connection *con)
{
	int len = strlen(CEPH_BANNER);

	/* banner string, then our wire-encoded address (see encode_my_addr) */
	con->out_kvec[0].iov_base = CEPH_BANNER;
	con->out_kvec[0].iov_len = len;
	con->out_kvec[1].iov_base = &msgr->my_enc_addr;
	con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
	con->out_kvec_left = 2;
	con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
	con->out_kvec_cur = con->out_kvec;
	con->out_more = 0;
	set_bit(WRITE_PENDING, &con->state);
}
614
615static void prepare_write_connect(struct ceph_messenger *msgr,
616 struct ceph_connection *con,
617 int after_banner)
618{
619 unsigned global_seq = get_global_seq(con->msgr, 0);
620 int proto;
621
622 switch (con->peer_name.type) {
623 case CEPH_ENTITY_TYPE_MON:
624 proto = CEPH_MONC_PROTOCOL;
625 break;
626 case CEPH_ENTITY_TYPE_OSD:
627 proto = CEPH_OSDC_PROTOCOL;
628 break;
629 case CEPH_ENTITY_TYPE_MDS:
630 proto = CEPH_MDSC_PROTOCOL;
631 break;
632 default:
633 BUG();
634 }
635
636 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
637 con->connect_seq, global_seq, proto);
638
639 con->out_connect.features = CEPH_FEATURE_SUPPORTED;
640 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
641 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
642 con->out_connect.global_seq = cpu_to_le32(global_seq);
643 con->out_connect.protocol_version = cpu_to_le32(proto);
644 con->out_connect.flags = 0;
645
646 if (!after_banner) {
647 con->out_kvec_left = 0;
648 con->out_kvec_bytes = 0;
649 }
650 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
651 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
652 con->out_kvec_left++;
653 con->out_kvec_bytes += sizeof(con->out_connect);
654 con->out_kvec_cur = con->out_kvec;
655 con->out_more = 0;
656 set_bit(WRITE_PENDING, &con->state);
657
658 prepare_connect_authorizer(con);
659}
660
661
/*
 * write as much of pending kvecs to the socket as we can.
 *  1 -> done
 *  0 -> socket full, but more to do
 * <0 -> error
 */
static int write_partial_kvec(struct ceph_connection *con)
{
	int ret;

	dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
	while (con->out_kvec_bytes > 0) {
		ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
				       con->out_kvec_left, con->out_kvec_bytes,
				       con->out_more);
		if (ret <= 0)
			goto out;
		con->out_kvec_bytes -= ret;
		if (con->out_kvec_bytes == 0)
			break;            /* done */
		/* account for a short write: step past fully-sent kvecs
		 * and trim the one we stopped inside of */
		while (ret > 0) {
			if (ret >= con->out_kvec_cur->iov_len) {
				ret -= con->out_kvec_cur->iov_len;
				con->out_kvec_cur++;
				con->out_kvec_left--;
			} else {
				con->out_kvec_cur->iov_len -= ret;
				con->out_kvec_cur->iov_base += ret;
				ret = 0;
				break;
			}
		}
	}
	con->out_kvec_left = 0;
	con->out_kvec_is_msg = false;
	ret = 1;
out:
	dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
	     con->out_kvec_bytes, con->out_kvec_left, ret);
	return ret;  /* done! */
}
703
/*
 * Write as much message data payload as we can.  If we finish, queue
 * up the footer.
 *  1 -> done, footer is now queued in out_kvec[].
 *  0 -> socket full, but more to do
 * <0 -> error
 */
static int write_partial_msg_pages(struct ceph_connection *con)
{
	struct ceph_msg *msg = con->out_msg;
	unsigned data_len = le32_to_cpu(msg->hdr.data_len);
	size_t len;
	/*
	 * NOTE(review): crc is taken from msgr->nocrc WITHOUT negation,
	 * yet it is used below as "compute the data crc".  The NOCRC
	 * footer flag keeps sender and receiver consistent either way,
	 * but the option's meaning looks inverted -- confirm intent.
	 */
	int crc = con->msgr->nocrc;
	int ret;

	dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
	     con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
	     con->out_msg_pos.page_pos);

	while (con->out_msg_pos.page < con->out_msg->nr_pages) {
		struct page *page = NULL;
		void *kaddr = NULL;

		/*
		 * if we are calculating the data crc (the default), we need
		 * to map the page.  if our pages[] has been revoked, use the
		 * zero page.
		 */
		if (msg->pages) {
			page = msg->pages[con->out_msg_pos.page];
			if (crc)
				kaddr = kmap(page);
		} else if (msg->pagelist) {
			page = list_first_entry(&msg->pagelist->head,
						struct page, lru);
			if (crc)
				kaddr = kmap(page);
		} else {
			page = con->msgr->zero_page;
			if (crc)
				kaddr = page_address(con->msgr->zero_page);
		}
		/* bytes remaining in this page, capped by remaining data */
		len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
			  (int)(data_len - con->out_msg_pos.data_pos));
		/* fold this page chunk into the running data crc, once */
		if (crc && !con->out_msg_pos.did_page_crc) {
			void *base = kaddr + con->out_msg_pos.page_pos;
			u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);

			BUG_ON(kaddr == NULL);
			con->out_msg->footer.data_crc =
				cpu_to_le32(crc32c(tmpcrc, base, len));
			con->out_msg_pos.did_page_crc = 1;
		}

		ret = kernel_sendpage(con->sock, page,
				      con->out_msg_pos.page_pos, len,
				      MSG_DONTWAIT | MSG_NOSIGNAL |
				      MSG_MORE);

		if (crc && (msg->pages || msg->pagelist))
			kunmap(page);

		if (ret <= 0)
			goto out;

		con->out_msg_pos.data_pos += ret;
		con->out_msg_pos.page_pos += ret;
		if (ret == len) {
			/* page fully sent; advance to the next one */
			con->out_msg_pos.page_pos = 0;
			con->out_msg_pos.page++;
			con->out_msg_pos.did_page_crc = 0;
			if (msg->pagelist)
				list_move_tail(&page->lru,
					       &msg->pagelist->head);
		}
	}

	dout("write_partial_msg_pages %p msg %p done\n", con, msg);

	/* prepare and queue up footer, too */
	if (!crc)
		con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
	con->out_kvec_bytes = 0;
	con->out_kvec_left = 0;
	con->out_kvec_cur = con->out_kvec;
	prepare_write_message_footer(con, 0);
	ret = 1;
out:
	return ret;
}
794
/*
 * write some zeros
 */
static int write_partial_skip(struct ceph_connection *con)
{
	int ret;

	/* send the zero page repeatedly until out_skip is consumed */
	while (con->out_skip > 0) {
		struct kvec iov = {
			.iov_base = page_address(con->msgr->zero_page),
			.iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
		};

		ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
		if (ret <= 0)
			goto out;
		con->out_skip -= ret;
	}
	ret = 1;
out:
	return ret;
}
817
/*
 * Prepare to read connection handshake, or an ack.  These just reset
 * the input cursor (in_base_pos) for the next read_partial_* pass.
 */
static void prepare_read_banner(struct ceph_connection *con)
{
	dout("prepare_read_banner %p\n", con);
	con->in_base_pos = 0;
}

static void prepare_read_connect(struct ceph_connection *con)
{
	dout("prepare_read_connect %p\n", con);
	con->in_base_pos = 0;
}

static void prepare_read_connect_retry(struct ceph_connection *con)
{
	dout("prepare_read_connect_retry %p\n", con);
	/* start past the banner + addresses already read (see
	 * read_partial_banner) */
	con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr)
		+ sizeof(con->peer_addr_for_me);
}

static void prepare_read_ack(struct ceph_connection *con)
{
	dout("prepare_read_ack %p\n", con);
	con->in_base_pos = 0;
}

static void prepare_read_tag(struct ceph_connection *con)
{
	dout("prepare_read_tag %p\n", con);
	con->in_base_pos = 0;
	con->in_tag = CEPH_MSGR_TAG_READY;
}
852
/*
 * Prepare to read a message.  Always returns 0.
 */
static int prepare_read_message(struct ceph_connection *con)
{
	dout("prepare_read_message %p\n", con);
	BUG_ON(con->in_msg != NULL);	/* previous message must be consumed */
	con->in_base_pos = 0;
	/* reset the per-section crcs for the incoming message */
	con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
	return 0;
}
864
865
/*
 * Read up to @size bytes of @object from the socket, tracking overall
 * progress in con->in_base_pos.  @*to is a running offset: on entry
 * it marks where @object begins in the input stream; it is advanced
 * past @object before reading.  Returns 1 when @object is complete,
 * or the (<= 0) recvmsg result on EAGAIN/error.
 */
static int read_partial(struct ceph_connection *con,
			int *to, int size, void *object)
{
	*to += size;
	while (con->in_base_pos < *to) {
		int left = *to - con->in_base_pos;
		int have = size - left;
		int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
		if (ret <= 0)
			return ret;
		con->in_base_pos += ret;
	}
	return 1;
}
880
881
/*
 * Read all or part of the connect-side handshake on a new connection
 */
static int read_partial_banner(struct ceph_connection *con)
{
	int ret, to = 0;

	dout("read_partial_banner %p at %d\n", con, con->in_base_pos);

	/* peer's banner */
	ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
	if (ret <= 0)
		goto out;
	/* the address the peer claims for itself ... */
	ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
			   &con->actual_peer_addr);
	if (ret <= 0)
		goto out;
	/* ... and the address it sees us connecting from */
	ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
			   &con->peer_addr_for_me);
	if (ret <= 0)
		goto out;
out:
	return ret;
}
906
/* read the handshake reply: fixed header, then authorizer payload */
static int read_partial_connect(struct ceph_connection *con)
{
	int ret, to = 0;

	dout("read_partial_connect %p at %d\n", con, con->in_base_pos);

	ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
	if (ret <= 0)
		goto out;
	/* NOTE(review): authorizer_len is peer-supplied; assumes it never
	 * exceeds the auth_reply_buf allocation -- verify against the
	 * get_authorizer implementations */
	ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
			   con->auth_reply_buf);
	if (ret <= 0)
		goto out;

	dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
	     con, (int)con->in_reply.tag,
	     le32_to_cpu(con->in_reply.connect_seq),
	     le32_to_cpu(con->in_reply.global_seq));
out:
	return ret;

}
929
930/*
931 * Verify the hello banner looks okay.
932 */
933static int verify_hello(struct ceph_connection *con)
934{
935 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
936 pr_err("connect to %s got bad banner\n",
937 pr_addr(&con->peer_addr.in_addr));
938 con->error_msg = "protocol error, bad banner";
939 return -1;
940 }
941 return 0;
942}
943
/*
 * Return true if @ss holds the all-zeros ("unspecified") address for
 * its family.  Unknown address families are never considered blank.
 */
static bool addr_is_blank(struct sockaddr_storage *ss)
{
	switch (ss->ss_family) {
	case AF_INET:
		return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
	case AF_INET6: {
		const unsigned char *b =
			((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr;
		int i;

		/* all 16 bytes must be zero (same test as the four
		 * 32-bit words, expressed byte-wise) */
		for (i = 0; i < 16; i++)
			if (b[i])
				return false;
		return true;
	}
	}
	return false;
}
958
/*
 * Return the port of @ss in host byte order, or 0 for address
 * families we don't understand.
 */
static int addr_port(struct sockaddr_storage *ss)
{
	switch (ss->ss_family) {
	case AF_INET:
		return ntohs(((struct sockaddr_in *)ss)->sin_port);
	case AF_INET6:
		return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
	default:
		return 0;
	}
}
969
/*
 * Store port @p (host byte order) into @ss for the known address
 * families.
 *
 * Fix: the switch was missing break statements, so the AF_INET case
 * fell through and also wrote sin6_port.  That only "worked" because
 * sin_port and sin6_port happen to share an offset; make each case
 * explicit.
 */
static void addr_set_port(struct sockaddr_storage *ss, int p)
{
	switch (ss->ss_family) {
	case AF_INET:
		((struct sockaddr_in *)ss)->sin_port = htons(p);
		break;
	case AF_INET6:
		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
		break;
	}
}
979
980/*
981 * Parse an ip[:port] list into an addr array. Use the default
982 * monitor port if a port isn't specified.
983 */
984int ceph_parse_ips(const char *c, const char *end,
985 struct ceph_entity_addr *addr,
986 int max_count, int *count)
987{
988 int i;
989 const char *p = c;
990
991 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
992 for (i = 0; i < max_count; i++) {
993 const char *ipend;
994 struct sockaddr_storage *ss = &addr[i].in_addr;
995 struct sockaddr_in *in4 = (void *)ss;
996 struct sockaddr_in6 *in6 = (void *)ss;
997 int port;
998
999 memset(ss, 0, sizeof(*ss));
1000 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1001 ',', &ipend)) {
1002 ss->ss_family = AF_INET;
1003 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1004 ',', &ipend)) {
1005 ss->ss_family = AF_INET6;
1006 } else {
1007 goto bad;
1008 }
1009 p = ipend;
1010
1011 /* port? */
1012 if (p < end && *p == ':') {
1013 port = 0;
1014 p++;
1015 while (p < end && *p >= '0' && *p <= '9') {
1016 port = (port * 10) + (*p - '0');
1017 p++;
1018 }
1019 if (port > 65535 || port == 0)
1020 goto bad;
1021 } else {
1022 port = CEPH_MON_PORT;
1023 }
1024
1025 addr_set_port(ss, port);
1026
1027 dout("parse_ips got %s\n", pr_addr(ss));
1028
1029 if (p == end)
1030 break;
1031 if (*p != ',')
1032 goto bad;
1033 p++;
1034 }
1035
1036 if (p != end)
1037 goto bad;
1038
1039 if (count)
1040 *count = i + 1;
1041 return 0;
1042
1043bad:
1044 pr_err("parse_ips bad ip '%s'\n", c);
1045 return -EINVAL;
1046}
1047
/*
 * Process a fully read banner: verify the peer's banner string,
 * decode both wire-format addresses, check that the peer is who we
 * expected, and learn our own externally visible address if we did
 * not know it yet.  Returns 0 and advances to NEGOTIATING, or -1 on
 * a protocol error (error_msg set here or by verify_hello()).
 */
static int process_banner(struct ceph_connection *con)
{
	dout("process_banner on %p\n", con);

	if (verify_hello(con) < 0)
		return -1;

	/* both addresses arrived in wire format; fix them up in place */
	ceph_decode_addr(&con->actual_peer_addr);
	ceph_decode_addr(&con->peer_addr_for_me);

	/*
	 * Make sure the other end is who we wanted.  note that the other
	 * end may not yet know their ip address, so if it's 0.0.0.0, give
	 * them the benefit of the doubt.
	 */
	if (memcmp(&con->peer_addr, &con->actual_peer_addr,
		   sizeof(con->peer_addr)) != 0 &&
	    !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
	      con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
		pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
			   pr_addr(&con->peer_addr.in_addr),
			   le64_to_cpu(con->peer_addr.nonce),
			   pr_addr(&con->actual_peer_addr.in_addr),
			   le64_to_cpu(con->actual_peer_addr.nonce));
		con->error_msg = "wrong peer at address";
		return -1;
	}

	/*
	 * did we learn our address?
	 */
	if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
		/* keep our port; take the ip the peer saw us at */
		int port = addr_port(&con->msgr->inst.addr.in_addr);

		memcpy(&con->msgr->inst.addr.in_addr,
		       &con->peer_addr_for_me.in_addr,
		       sizeof(con->peer_addr_for_me.in_addr));
		addr_set_port(&con->msgr->inst.addr.in_addr, port);
		encode_my_addr(con->msgr);
		dout("process_banner learned my addr is %s\n",
		     pr_addr(&con->msgr->inst.addr.in_addr));
	}

	set_bit(NEGOTIATING, &con->state);
	prepare_read_connect(con);
	return 0;
}
1095
/*
 * Tear down the connection after an unrecoverable protocol mismatch.
 * Called with con->mutex held; the mutex is dropped around the
 * bad_proto callback so that it may take its own locks.
 */
static void fail_protocol(struct ceph_connection *con)
{
	reset_connection(con);
	set_bit(CLOSED, &con->state);  /* in case there's queued work */

	mutex_unlock(&con->mutex);
	if (con->ops->bad_proto)
		con->ops->bad_proto(con);
	mutex_lock(&con->mutex);
}
1106
/*
 * Handle the server's reply to our connect attempt.  Depending on the
 * reply tag we either fail hard (feature/protocol/authorizer
 * problems), retry the handshake with updated sequence numbers, or
 * transition to the connected state on READY.
 *
 * Called with con->mutex held.  Returns 0 to continue the state
 * machine, -1 on fatal error (con->error_msg set).
 */
static int process_connect(struct ceph_connection *con)
{
	u64 sup_feat = CEPH_FEATURE_SUPPORTED;
	u64 req_feat = CEPH_FEATURE_REQUIRED;
	u64 server_feat = le64_to_cpu(con->in_reply.features);

	dout("process_connect on %p tag %d\n", con, (int)con->in_tag);

	switch (con->in_reply.tag) {
	case CEPH_MSGR_TAG_FEATURES:
		/* the server requires features we don't support */
		pr_err("%s%lld %s feature set mismatch,"
		       " my %llx < server's %llx, missing %llx\n",
		       ENTITY_NAME(con->peer_name),
		       pr_addr(&con->peer_addr.in_addr),
		       sup_feat, server_feat, server_feat & ~sup_feat);
		con->error_msg = "missing required protocol features";
		fail_protocol(con);
		return -1;

	case CEPH_MSGR_TAG_BADPROTOVER:
		pr_err("%s%lld %s protocol version mismatch,"
		       " my %d != server's %d\n",
		       ENTITY_NAME(con->peer_name),
		       pr_addr(&con->peer_addr.in_addr),
		       le32_to_cpu(con->out_connect.protocol_version),
		       le32_to_cpu(con->in_reply.protocol_version));
		con->error_msg = "protocol version mismatch";
		fail_protocol(con);
		return -1;

	case CEPH_MSGR_TAG_BADAUTHORIZER:
		/* retry the handshake once with (presumably fresh)
		 * authorization; give up on the second failure */
		con->auth_retry++;
		dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
		     con->auth_retry);
		if (con->auth_retry == 2) {
			con->error_msg = "connect authorization failure";
			reset_connection(con);
			set_bit(CLOSED, &con->state);
			return -1;
		}
		/* NOTE(review): auth_retry was just incremented, so it
		 * is already 1 here; this assignment is redundant but
		 * harmless */
		con->auth_retry = 1;
		prepare_write_connect(con->msgr, con, 0);
		prepare_read_connect_retry(con);
		break;

	case CEPH_MSGR_TAG_RESETSESSION:
		/*
		 * If we connected with a large connect_seq but the peer
		 * has no record of a session with us (no connection, or
		 * connect_seq == 0), they will send RESETSESION to indicate
		 * that they must have reset their session, and may have
		 * dropped messages.
		 */
		dout("process_connect got RESET peer seq %u\n",
		     le32_to_cpu(con->in_connect.connect_seq));
		pr_err("%s%lld %s connection reset\n",
		       ENTITY_NAME(con->peer_name),
		       pr_addr(&con->peer_addr.in_addr));
		reset_connection(con);
		prepare_write_connect(con->msgr, con, 0);
		prepare_read_connect(con);

		/* Tell ceph about it. */
		mutex_unlock(&con->mutex);
		pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
		if (con->ops->peer_reset)
			con->ops->peer_reset(con);
		mutex_lock(&con->mutex);
		break;

	case CEPH_MSGR_TAG_RETRY_SESSION:
		/*
		 * If we sent a smaller connect_seq than the peer has, try
		 * again with a larger value.
		 */
		dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
		     le32_to_cpu(con->out_connect.connect_seq),
		     le32_to_cpu(con->in_connect.connect_seq));
		con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
		prepare_write_connect(con->msgr, con, 0);
		prepare_read_connect(con);
		break;

	case CEPH_MSGR_TAG_RETRY_GLOBAL:
		/*
		 * If we sent a smaller global_seq than the peer has, try
		 * again with a larger value.
		 */
		dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
		     con->peer_global_seq,
		     le32_to_cpu(con->in_connect.global_seq));
		get_global_seq(con->msgr,
			       le32_to_cpu(con->in_connect.global_seq));
		prepare_write_connect(con->msgr, con, 0);
		prepare_read_connect(con);
		break;

	case CEPH_MSGR_TAG_READY:
		if (req_feat & ~server_feat) {
			pr_err("%s%lld %s protocol feature mismatch,"
			       " my required %llx > server's %llx, need %llx\n",
			       ENTITY_NAME(con->peer_name),
			       pr_addr(&con->peer_addr.in_addr),
			       req_feat, server_feat, req_feat & ~server_feat);
			con->error_msg = "missing required protocol features";
			fail_protocol(con);
			return -1;
		}
		clear_bit(CONNECTING, &con->state);
		con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
		con->connect_seq++;
		dout("process_connect got READY gseq %d cseq %d (%d)\n",
		     con->peer_global_seq,
		     le32_to_cpu(con->in_reply.connect_seq),
		     con->connect_seq);
		WARN_ON(con->connect_seq !=
			le32_to_cpu(con->in_reply.connect_seq));

		if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
			set_bit(LOSSYTX, &con->state);

		prepare_read_tag(con);
		break;

	case CEPH_MSGR_TAG_WAIT:
		/*
		 * If there is a connection race (we are opening
		 * connections to each other), one of us may just have
		 * to WAIT.  This shouldn't happen if we are the
		 * client.
		 */
		pr_err("process_connect peer connecting WAIT\n");
		/* NOTE(review): no break -- WAIT deliberately(?) falls
		 * through to the default error path and is treated as
		 * fatal; confirm this is intended */

	default:
		pr_err("connect protocol error, will retry\n");
		con->error_msg = "protocol error, garbage tag during connect";
		return -1;
	}
	return 0;
}
1247
1248
1249/*
1250 * read (part of) an ack
1251 */
1252static int read_partial_ack(struct ceph_connection *con)
1253{
1254 int to = 0;
1255
1256 return read_partial(con, &to, sizeof(con->in_temp_ack),
1257 &con->in_temp_ack);
1258}
1259
1260
1261/*
1262 * We can finally discard anything that's been acked.
1263 */
1264static void process_ack(struct ceph_connection *con)
1265{
1266 struct ceph_msg *m;
1267 u64 ack = le64_to_cpu(con->in_temp_ack);
1268 u64 seq;
1269
1270 while (!list_empty(&con->out_sent)) {
1271 m = list_first_entry(&con->out_sent, struct ceph_msg,
1272 list_head);
1273 seq = le64_to_cpu(m->hdr.seq);
1274 if (seq > ack)
1275 break;
1276 dout("got ack for seq %llu type %d at %p\n", seq,
1277 le16_to_cpu(m->hdr.type), m);
1278 ceph_msg_remove(m);
1279 }
1280 prepare_read_tag(con);
1281}
1282
1283
1284
1285
1286static int read_partial_message_section(struct ceph_connection *con,
1287 struct kvec *section, unsigned int sec_len,
1288 u32 *crc)
1289{
1290 int left;
1291 int ret;
1292
1293 BUG_ON(!section);
1294
1295 while (section->iov_len < sec_len) {
1296 BUG_ON(section->iov_base == NULL);
1297 left = sec_len - section->iov_len;
1298 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1299 section->iov_len, left);
1300 if (ret <= 0)
1301 return ret;
1302 section->iov_len += ret;
1303 if (section->iov_len == sec_len)
1304 *crc = crc32c(0, section->iov_base,
1305 section->iov_len);
1306 }
1307
1308 return 1;
1309}
1310
1311static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1312 struct ceph_msg_header *hdr,
1313 int *skip);
1314/*
1315 * read (part of) a message.
1316 */
1317static int read_partial_message(struct ceph_connection *con)
1318{
1319 struct ceph_msg *m = con->in_msg;
1320 void *p;
1321 int ret;
1322 int to, left;
1323 unsigned front_len, middle_len, data_len, data_off;
1324 int datacrc = con->msgr->nocrc;
1325 int skip;
1326
1327 dout("read_partial_message con %p msg %p\n", con, m);
1328
1329 /* header */
1330 while (con->in_base_pos < sizeof(con->in_hdr)) {
1331 left = sizeof(con->in_hdr) - con->in_base_pos;
1332 ret = ceph_tcp_recvmsg(con->sock,
1333 (char *)&con->in_hdr + con->in_base_pos,
1334 left);
1335 if (ret <= 0)
1336 return ret;
1337 con->in_base_pos += ret;
1338 if (con->in_base_pos == sizeof(con->in_hdr)) {
1339 u32 crc = crc32c(0, (void *)&con->in_hdr,
1340 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1341 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1342 pr_err("read_partial_message bad hdr "
1343 " crc %u != expected %u\n",
1344 crc, con->in_hdr.crc);
1345 return -EBADMSG;
1346 }
1347 }
1348 }
1349 front_len = le32_to_cpu(con->in_hdr.front_len);
1350 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1351 return -EIO;
1352 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1353 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1354 return -EIO;
1355 data_len = le32_to_cpu(con->in_hdr.data_len);
1356 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1357 return -EIO;
1358 data_off = le16_to_cpu(con->in_hdr.data_off);
1359
1360 /* allocate message? */
1361 if (!con->in_msg) {
1362 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1363 con->in_hdr.front_len, con->in_hdr.data_len);
1364 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1365 if (skip) {
1366 /* skip this message */
1367 dout("alloc_msg returned NULL, skipping message\n");
1368 con->in_base_pos = -front_len - middle_len - data_len -
1369 sizeof(m->footer);
1370 con->in_tag = CEPH_MSGR_TAG_READY;
1371 return 0;
1372 }
1373 if (IS_ERR(con->in_msg)) {
1374 ret = PTR_ERR(con->in_msg);
1375 con->in_msg = NULL;
1376 con->error_msg =
1377 "error allocating memory for incoming message";
1378 return ret;
1379 }
1380 m = con->in_msg;
1381 m->front.iov_len = 0; /* haven't read it yet */
1382 if (m->middle)
1383 m->middle->vec.iov_len = 0;
1384
1385 con->in_msg_pos.page = 0;
1386 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1387 con->in_msg_pos.data_pos = 0;
1388 }
1389
1390 /* front */
1391 ret = read_partial_message_section(con, &m->front, front_len,
1392 &con->in_front_crc);
1393 if (ret <= 0)
1394 return ret;
1395
1396 /* middle */
1397 if (m->middle) {
1398 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1399 &con->in_middle_crc);
1400 if (ret <= 0)
1401 return ret;
1402 }
1403
1404 /* (page) data */
1405 while (con->in_msg_pos.data_pos < data_len) {
1406 left = min((int)(data_len - con->in_msg_pos.data_pos),
1407 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1408 BUG_ON(m->pages == NULL);
1409 p = kmap(m->pages[con->in_msg_pos.page]);
1410 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1411 left);
1412 if (ret > 0 && datacrc)
1413 con->in_data_crc =
1414 crc32c(con->in_data_crc,
1415 p + con->in_msg_pos.page_pos, ret);
1416 kunmap(m->pages[con->in_msg_pos.page]);
1417 if (ret <= 0)
1418 return ret;
1419 con->in_msg_pos.data_pos += ret;
1420 con->in_msg_pos.page_pos += ret;
1421 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1422 con->in_msg_pos.page_pos = 0;
1423 con->in_msg_pos.page++;
1424 }
1425 }
1426
1427 /* footer */
1428 to = sizeof(m->hdr) + sizeof(m->footer);
1429 while (con->in_base_pos < to) {
1430 left = to - con->in_base_pos;
1431 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1432 (con->in_base_pos - sizeof(m->hdr)),
1433 left);
1434 if (ret <= 0)
1435 return ret;
1436 con->in_base_pos += ret;
1437 }
1438 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1439 m, front_len, m->footer.front_crc, middle_len,
1440 m->footer.middle_crc, data_len, m->footer.data_crc);
1441
1442 /* crc ok? */
1443 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1444 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1445 m, con->in_front_crc, m->footer.front_crc);
1446 return -EBADMSG;
1447 }
1448 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1449 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1450 m, con->in_middle_crc, m->footer.middle_crc);
1451 return -EBADMSG;
1452 }
1453 if (datacrc &&
1454 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1455 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1456 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1457 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1458 return -EBADMSG;
1459 }
1460
1461 return 1; /* done! */
1462}
1463
/*
 * Process message.  This happens in the worker thread.  The callback should
 * be careful not to do anything that waits on other incoming messages or it
 * may deadlock.
 *
 * Consumes con->in_msg: ownership passes to the dispatch callback.
 * con->mutex is dropped around the dispatch call and retaken after.
 */
static void process_message(struct ceph_connection *con)
{
	struct ceph_msg *msg;

	msg = con->in_msg;
	con->in_msg = NULL;

	/* if first message, set peer_name */
	if (con->peer_name.type == 0)
		con->peer_name = msg->hdr.src.name;

	con->in_seq++;
	mutex_unlock(&con->mutex);

	dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
	     msg, le64_to_cpu(msg->hdr.seq),
	     ENTITY_NAME(msg->hdr.src.name),
	     le16_to_cpu(msg->hdr.type),
	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
	     le32_to_cpu(msg->hdr.front_len),
	     le32_to_cpu(msg->hdr.data_len),
	     con->in_front_crc, con->in_middle_crc, con->in_data_crc);
	con->ops->dispatch(con, msg);

	/* retake the lock and rearm for the next tag byte */
	mutex_lock(&con->mutex);
	prepare_read_tag(con);
}
1496
1497
1498/*
1499 * Write something to the socket. Called in a worker thread when the
1500 * socket appears to be writeable and we have something ready to send.
1501 */
1502static int try_write(struct ceph_connection *con)
1503{
1504 struct ceph_messenger *msgr = con->msgr;
1505 int ret = 1;
1506
1507 dout("try_write start %p state %lu nref %d\n", con, con->state,
1508 atomic_read(&con->nref));
1509
1510 mutex_lock(&con->mutex);
1511more:
1512 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1513
1514 /* open the socket first? */
1515 if (con->sock == NULL) {
1516 /*
1517 * if we were STANDBY and are reconnecting _this_
1518 * connection, bump connect_seq now. Always bump
1519 * global_seq.
1520 */
1521 if (test_and_clear_bit(STANDBY, &con->state))
1522 con->connect_seq++;
1523
1524 prepare_write_banner(msgr, con);
1525 prepare_write_connect(msgr, con, 1);
1526 prepare_read_banner(con);
1527 set_bit(CONNECTING, &con->state);
1528 clear_bit(NEGOTIATING, &con->state);
1529
1530 BUG_ON(con->in_msg);
1531 con->in_tag = CEPH_MSGR_TAG_READY;
1532 dout("try_write initiating connect on %p new state %lu\n",
1533 con, con->state);
1534 con->sock = ceph_tcp_connect(con);
1535 if (IS_ERR(con->sock)) {
1536 con->sock = NULL;
1537 con->error_msg = "connect error";
1538 ret = -1;
1539 goto out;
1540 }
1541 }
1542
1543more_kvec:
1544 /* kvec data queued? */
1545 if (con->out_skip) {
1546 ret = write_partial_skip(con);
1547 if (ret <= 0)
1548 goto done;
1549 if (ret < 0) {
1550 dout("try_write write_partial_skip err %d\n", ret);
1551 goto done;
1552 }
1553 }
1554 if (con->out_kvec_left) {
1555 ret = write_partial_kvec(con);
1556 if (ret <= 0)
1557 goto done;
1558 }
1559
1560 /* msg pages? */
1561 if (con->out_msg) {
1562 if (con->out_msg_done) {
1563 ceph_msg_put(con->out_msg);
1564 con->out_msg = NULL; /* we're done with this one */
1565 goto do_next;
1566 }
1567
1568 ret = write_partial_msg_pages(con);
1569 if (ret == 1)
1570 goto more_kvec; /* we need to send the footer, too! */
1571 if (ret == 0)
1572 goto done;
1573 if (ret < 0) {
1574 dout("try_write write_partial_msg_pages err %d\n",
1575 ret);
1576 goto done;
1577 }
1578 }
1579
1580do_next:
1581 if (!test_bit(CONNECTING, &con->state)) {
1582 /* is anything else pending? */
1583 if (!list_empty(&con->out_queue)) {
1584 prepare_write_message(con);
1585 goto more;
1586 }
1587 if (con->in_seq > con->in_seq_acked) {
1588 prepare_write_ack(con);
1589 goto more;
1590 }
1591 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1592 prepare_write_keepalive(con);
1593 goto more;
1594 }
1595 }
1596
1597 /* Nothing to do! */
1598 clear_bit(WRITE_PENDING, &con->state);
1599 dout("try_write nothing else to write.\n");
1600done:
1601 ret = 0;
1602out:
1603 mutex_unlock(&con->mutex);
1604 dout("try_write done on %p\n", con);
1605 return ret;
1606}
1607
1608
1609
/*
 * Read what we can from the socket.
 *
 * Drives the incoming state machine: banner + connect negotiation
 * while CONNECTING, then tag-dispatched message/ack handling.
 * Returns 0 when there is nothing more to read right now, negative on
 * error (con->error_msg set).  Takes con->mutex.
 */
static int try_read(struct ceph_connection *con)
{
	struct ceph_messenger *msgr;
	int ret = -1;

	if (!con->sock)
		return 0;

	if (test_bit(STANDBY, &con->state))
		return 0;

	dout("try_read start on %p\n", con);
	msgr = con->msgr;

	mutex_lock(&con->mutex);

more:
	dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
	     con->in_base_pos);
	if (test_bit(CONNECTING, &con->state)) {
		/* NEGOTIATING set means the banner phase is done */
		if (!test_bit(NEGOTIATING, &con->state)) {
			dout("try_read connecting\n");
			ret = read_partial_banner(con);
			if (ret <= 0)
				goto done;
			if (process_banner(con) < 0) {
				ret = -1;
				goto out;
			}
		}
		ret = read_partial_connect(con);
		if (ret <= 0)
			goto done;
		if (process_connect(con) < 0) {
			ret = -1;
			goto out;
		}
		goto more;
	}

	if (con->in_base_pos < 0) {
		/*
		 * skipping + discarding content.
		 *
		 * FIXME: there must be a better way to do this!
		 */
		/* buf is shared scratch across connections; contents
		 * are discarded, so that sharing is harmless */
		static char buf[1024];
		int skip = min(1024, -con->in_base_pos);
		dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
		ret = ceph_tcp_recvmsg(con->sock, buf, skip);
		if (ret <= 0)
			goto done;
		con->in_base_pos += ret;
		if (con->in_base_pos)
			goto more;
	}
	if (con->in_tag == CEPH_MSGR_TAG_READY) {
		/*
		 * what's next?
		 */
		ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
		if (ret <= 0)
			goto done;
		dout("try_read got tag %d\n", (int)con->in_tag);
		switch (con->in_tag) {
		case CEPH_MSGR_TAG_MSG:
			prepare_read_message(con);
			break;
		case CEPH_MSGR_TAG_ACK:
			prepare_read_ack(con);
			break;
		case CEPH_MSGR_TAG_CLOSE:
			set_bit(CLOSED, &con->state);   /* fixme */
			goto done;
		default:
			goto bad_tag;
		}
	}
	if (con->in_tag == CEPH_MSGR_TAG_MSG) {
		ret = read_partial_message(con);
		if (ret <= 0) {
			switch (ret) {
			case -EBADMSG:
				con->error_msg = "bad crc";
				ret = -EIO;
				goto out;
			case -EIO:
				con->error_msg = "io error";
				goto out;
			default:
				goto done;
			}
		}
		/* the message was skipped (tag reset); look for the
		 * next tag instead of dispatching */
		if (con->in_tag == CEPH_MSGR_TAG_READY)
			goto more;
		process_message(con);
		goto more;
	}
	if (con->in_tag == CEPH_MSGR_TAG_ACK) {
		ret = read_partial_ack(con);
		if (ret <= 0)
			goto done;
		process_ack(con);
		goto more;
	}

done:
	ret = 0;
out:
	mutex_unlock(&con->mutex);
	dout("try_read done on %p\n", con);
	return ret;

bad_tag:
	pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
	con->error_msg = "protocol error, garbage tag";
	ret = -1;
	goto out;
}
1732
1733
/*
 * Atomically queue work on a connection.  Bump @con reference to
 * avoid races with connection teardown.
 *
 * There is some trickery going on with QUEUED and BUSY because we
 * only want a _single_ thread operating on each connection at any
 * point in time, but we want to use all available CPUs.
 *
 * The worker thread only proceeds if it can atomically set BUSY.  It
 * clears QUEUED and does it's thing.  When it thinks it's done, it
 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
 * (tries again to set BUSY).
 *
 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
 * try to queue work.  If that fails (work is already queued, or BUSY)
 * we give up (work also already being done or is queued) but leave QUEUED
 * set so that the worker thread will loop if necessary.
 */
static void queue_con(struct ceph_connection *con)
{
	if (test_bit(DEAD, &con->state)) {
		dout("queue_con %p ignoring: DEAD\n",
		     con);
		return;
	}

	/* the queued work carries this reference; dropped by con_work */
	if (!con->ops->get(con)) {
		dout("queue_con %p ref count 0\n", con);
		return;
	}

	set_bit(QUEUED, &con->state);
	if (test_bit(BUSY, &con->state)) {
		/* the active worker will notice QUEUED and loop */
		dout("queue_con %p - already BUSY\n", con);
		con->ops->put(con);
	} else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
		/* work was already queued; that instance will see QUEUED */
		dout("queue_con %p - already queued\n", con);
		con->ops->put(con);
	} else {
		dout("queue_con %p\n", con);
	}
}
1776
/*
 * Do some work on a connection.  Drop a connection ref when we're done.
 *
 * Only one instance runs per connection at a time (the BUSY bit); see
 * the comment above queue_con() for the QUEUED/BUSY protocol.
 */
static void con_work(struct work_struct *work)
{
	struct ceph_connection *con = container_of(work, struct ceph_connection,
						   work.work);
	int backoff = 0;

more:
	if (test_and_set_bit(BUSY, &con->state) != 0) {
		/* another worker already owns this connection */
		dout("con_work %p BUSY already set\n", con);
		goto out;
	}
	dout("con_work %p start, clearing QUEUED\n", con);
	clear_bit(QUEUED, &con->state);

	if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
		dout("con_work CLOSED\n");
		con_close_socket(con);
		goto done;
	}
	if (test_and_clear_bit(OPENING, &con->state)) {
		/* reopen w/ new peer */
		dout("con_work OPENING\n");
		con_close_socket(con);
	}

	/* short-circuits: fault on the first failing step */
	if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
	    try_read(con) < 0 ||
	    try_write(con) < 0) {
		backoff = 1;
		ceph_fault(con);     /* error/fault path */
	}

done:
	clear_bit(BUSY, &con->state);
	dout("con->state=%lu\n", con->state);
	if (test_bit(QUEUED, &con->state)) {
		if (!backoff || test_bit(OPENING, &con->state)) {
			dout("con_work %p QUEUED reset, looping\n", con);
			goto more;
		}
		/* just faulted: let the delayed requeue from
		 * ceph_fault() drive the retry instead of looping */
		dout("con_work %p QUEUED reset, but just faulted\n", con);
		clear_bit(QUEUED, &con->state);
	}
	dout("con_work %p done\n", con);

out:
	con->ops->put(con);
}
1828
1829
/*
 * Generic error/fault handler.  A retry mechanism is used with
 * exponential backoff
 *
 * Called without con->mutex held (takes it itself).  Closes the
 * socket, drops any partially read message, requeues unacked
 * messages, and either parks the connection in STANDBY (nothing to
 * send) or schedules a delayed reconnect.
 */
static void ceph_fault(struct ceph_connection *con)
{
	pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
	       pr_addr(&con->peer_addr.in_addr), con->error_msg);
	dout("fault %p state %lu to peer %s\n",
	     con, con->state, pr_addr(&con->peer_addr.in_addr));

	if (test_bit(LOSSYTX, &con->state)) {
		/* lossy channel: never reconnect, just report */
		dout("fault on LOSSYTX channel\n");
		goto out;
	}

	clear_bit(BUSY, &con->state);  /* to avoid an improbable race */

	mutex_lock(&con->mutex);
	if (test_bit(CLOSED, &con->state))
		goto out_unlock;

	con_close_socket(con);

	/* drop any partially read incoming message */
	if (con->in_msg) {
		ceph_msg_put(con->in_msg);
		con->in_msg = NULL;
	}

	/* Requeue anything that hasn't been acked */
	list_splice_init(&con->out_sent, &con->out_queue);

	/* If there are no messages in the queue, place the connection
	 * in a STANDBY state (i.e., don't try to reconnect just yet). */
	if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
		dout("fault setting STANDBY\n");
		set_bit(STANDBY, &con->state);
	} else {
		/* retry after a delay. */
		if (con->delay == 0)
			con->delay = BASE_DELAY_INTERVAL;
		else if (con->delay < MAX_DELAY_INTERVAL)
			con->delay *= 2;	/* exponential backoff */
		dout("fault queueing %p delay %lu\n", con, con->delay);
		con->ops->get(con);
		if (queue_delayed_work(ceph_msgr_wq, &con->work,
				       round_jiffies_relative(con->delay)) == 0)
			con->ops->put(con);	/* already queued; drop our ref */
	}

out_unlock:
	mutex_unlock(&con->mutex);
out:
	/*
	 * in case we faulted due to authentication, invalidate our
	 * current tickets so that we can get new ones.
	 */
	if (con->auth_retry && con->ops->invalidate_authorizer) {
		dout("calling invalidate_authorizer()\n");
		con->ops->invalidate_authorizer(con);
	}

	if (con->ops->fault)
		con->ops->fault(con);
}
1895
1896
1897
1898/*
1899 * create a new messenger instance
1900 */
1901struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1902{
1903 struct ceph_messenger *msgr;
1904
1905 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1906 if (msgr == NULL)
1907 return ERR_PTR(-ENOMEM);
1908
1909 spin_lock_init(&msgr->global_seq_lock);
1910
1911 /* the zero page is needed if a request is "canceled" while the message
1912 * is being written over the socket */
1913 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1914 if (!msgr->zero_page) {
1915 kfree(msgr);
1916 return ERR_PTR(-ENOMEM);
1917 }
1918 kmap(msgr->zero_page);
1919
1920 if (myaddr)
1921 msgr->inst.addr = *myaddr;
1922
1923 /* select a random nonce */
1924 msgr->inst.addr.type = 0;
1925 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1926 encode_my_addr(msgr);
1927
1928 dout("messenger_create %p\n", msgr);
1929 return msgr;
1930}
1931
1932void ceph_messenger_destroy(struct ceph_messenger *msgr)
1933{
1934 dout("destroy %p\n", msgr);
1935 kunmap(msgr->zero_page);
1936 __free_page(msgr->zero_page);
1937 kfree(msgr);
1938 dout("destroyed messenger %p\n", msgr);
1939}
1940
/*
 * Queue up an outgoing message on the given connection.
 *
 * Takes ownership of the caller's reference to @msg; on a CLOSED
 * connection the message is dropped immediately.  The source
 * entity/address is stamped from the messenger before queueing.
 */
void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
{
	if (test_bit(CLOSED, &con->state)) {
		dout("con_send %p closed, dropping %p\n", con, msg);
		ceph_msg_put(msg);
		return;
	}

	/* set src+dst */
	msg->hdr.src.name = con->msgr->inst.name;
	msg->hdr.src.addr = con->msgr->my_enc_addr;
	msg->hdr.orig_src = msg->hdr.src;

	BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));

	/* queue */
	mutex_lock(&con->mutex);
	BUG_ON(!list_empty(&msg->list_head));
	list_add_tail(&msg->list_head, &con->out_queue);
	dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
	     ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
	     ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
	     le32_to_cpu(msg->hdr.front_len),
	     le32_to_cpu(msg->hdr.middle_len),
	     le32_to_cpu(msg->hdr.data_len));
	mutex_unlock(&con->mutex);

	/* if there wasn't anything waiting to send before, queue
	 * new work */
	if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
		queue_con(con);
}
1976
1977/*
1978 * Revoke a message that was previously queued for send
1979 */
1980void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
1981{
1982 mutex_lock(&con->mutex);
1983 if (!list_empty(&msg->list_head)) {
1984 dout("con_revoke %p msg %p\n", con, msg);
1985 list_del_init(&msg->list_head);
1986 ceph_msg_put(msg);
1987 msg->hdr.seq = 0;
1988 if (con->out_msg == msg) {
1989 ceph_msg_put(con->out_msg);
1990 con->out_msg = NULL;
1991 }
1992 if (con->out_kvec_is_msg) {
1993 con->out_skip = con->out_kvec_bytes;
1994 con->out_kvec_is_msg = false;
1995 }
1996 } else {
1997 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
1998 }
1999 mutex_unlock(&con->mutex);
2000}
2001
/*
 * Revoke a message that we may be reading data into
 *
 * If @msg is the message currently being received, arrange for the
 * remainder of its wire bytes to be read and discarded (a negative
 * in_base_pos is the "skip" signal to try_read()), drop the incoming
 * reference, and rearm the tag state machine.
 */
void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
{
	mutex_lock(&con->mutex);
	if (con->in_msg && con->in_msg == msg) {
		unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
		unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
		unsigned data_len = le32_to_cpu(con->in_hdr.data_len);

		/* skip rest of message */
		dout("con_revoke_pages %p msg %p revoked\n", con, msg);
		con->in_base_pos = con->in_base_pos -
				sizeof(struct ceph_msg_header) -
				front_len -
				middle_len -
				data_len -
				sizeof(struct ceph_msg_footer);
		ceph_msg_put(con->in_msg);
		con->in_msg = NULL;
		con->in_tag = CEPH_MSGR_TAG_READY;
	} else {
		dout("con_revoke_pages %p msg %p pages %p no-op\n",
		     con, con->in_msg, msg);
	}
	mutex_unlock(&con->mutex);
}
2030
2031/*
2032 * Queue a keepalive byte to ensure the tcp connection is alive.
2033 */
2034void ceph_con_keepalive(struct ceph_connection *con)
2035{
2036 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2037 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2038 queue_con(con);
2039}
2040
2041
2042/*
2043 * construct a new message with given type, size
2044 * the new msg has a ref count of 1.
2045 */
2046struct ceph_msg *ceph_msg_new(int type, int front_len,
2047 int page_len, int page_off, struct page **pages)
2048{
2049 struct ceph_msg *m;
2050
2051 m = kmalloc(sizeof(*m), GFP_NOFS);
2052 if (m == NULL)
2053 goto out;
2054 kref_init(&m->kref);
2055 INIT_LIST_HEAD(&m->list_head);
2056
2057 m->hdr.type = cpu_to_le16(type);
2058 m->hdr.front_len = cpu_to_le32(front_len);
2059 m->hdr.middle_len = 0;
2060 m->hdr.data_len = cpu_to_le32(page_len);
2061 m->hdr.data_off = cpu_to_le16(page_off);
2062 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2063 m->footer.front_crc = 0;
2064 m->footer.middle_crc = 0;
2065 m->footer.data_crc = 0;
2066 m->front_max = front_len;
2067 m->front_is_vmalloc = false;
2068 m->more_to_follow = false;
2069 m->pool = NULL;
2070
2071 /* front */
2072 if (front_len) {
2073 if (front_len > PAGE_CACHE_SIZE) {
2074 m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
2075 PAGE_KERNEL);
2076 m->front_is_vmalloc = true;
2077 } else {
2078 m->front.iov_base = kmalloc(front_len, GFP_NOFS);
2079 }
2080 if (m->front.iov_base == NULL) {
2081 pr_err("msg_new can't allocate %d bytes\n",
2082 front_len);
2083 goto out2;
2084 }
2085 } else {
2086 m->front.iov_base = NULL;
2087 }
2088 m->front.iov_len = front_len;
2089
2090 /* middle */
2091 m->middle = NULL;
2092
2093 /* data */
2094 m->nr_pages = calc_pages_for(page_off, page_len);
2095 m->pages = pages;
2096 m->pagelist = NULL;
2097
2098 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
2099 m->nr_pages);
2100 return m;
2101
2102out2:
2103 ceph_msg_put(m);
2104out:
2105 pr_err("msg_new can't create type %d len %d\n", type, front_len);
2106 return ERR_PTR(-ENOMEM);
2107}
2108
2109/*
2110 * Allocate "middle" portion of a message, if it is needed and wasn't
2111 * allocated by alloc_msg. This allows us to read a small fixed-size
2112 * per-type header in the front and then gracefully fail (i.e.,
2113 * propagate the error to the caller based on info in the front) when
2114 * the middle is too large.
2115 */
2116static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2117{
2118 int type = le16_to_cpu(msg->hdr.type);
2119 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2120
2121 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2122 ceph_msg_type_name(type), middle_len);
2123 BUG_ON(!middle_len);
2124 BUG_ON(msg->middle);
2125
2126 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2127 if (!msg->middle)
2128 return -ENOMEM;
2129 return 0;
2130}
2131
2132/*
2133 * Generic message allocator, for incoming messages.
2134 */
2135static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2136 struct ceph_msg_header *hdr,
2137 int *skip)
2138{
2139 int type = le16_to_cpu(hdr->type);
2140 int front_len = le32_to_cpu(hdr->front_len);
2141 int middle_len = le32_to_cpu(hdr->middle_len);
2142 struct ceph_msg *msg = NULL;
2143 int ret;
2144
2145 if (con->ops->alloc_msg) {
2146 mutex_unlock(&con->mutex);
2147 msg = con->ops->alloc_msg(con, hdr, skip);
2148 mutex_lock(&con->mutex);
2149 if (IS_ERR(msg))
2150 return msg;
2151
2152 if (*skip)
2153 return NULL;
2154 }
2155 if (!msg) {
2156 *skip = 0;
2157 msg = ceph_msg_new(type, front_len, 0, 0, NULL);
2158 if (!msg) {
2159 pr_err("unable to allocate msg type %d len %d\n",
2160 type, front_len);
2161 return ERR_PTR(-ENOMEM);
2162 }
2163 }
2164 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2165
2166 if (middle_len) {
2167 ret = ceph_alloc_middle(con, msg);
2168
2169 if (ret < 0) {
2170 ceph_msg_put(msg);
2171 return msg;
2172 }
2173 }
2174
2175 return msg;
2176}
2177
2178
2179/*
2180 * Free a generically kmalloc'd message.
2181 */
2182void ceph_msg_kfree(struct ceph_msg *m)
2183{
2184 dout("msg_kfree %p\n", m);
2185 if (m->front_is_vmalloc)
2186 vfree(m->front.iov_base);
2187 else
2188 kfree(m->front.iov_base);
2189 kfree(m);
2190}
2191
/*
 * Drop a msg ref. Destroy as needed.
 *
 * kref release callback; invoked when the last reference is dropped
 * (see ceph_msg_put).
 */
void ceph_msg_last_put(struct kref *kref)
{
	struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);

	dout("ceph_msg_put last one on %p\n", m);
	/* a message still on a queue should never lose its last ref */
	WARN_ON(!list_empty(&m->list_head));

	/* drop middle, data, if any */
	if (m->middle) {
		ceph_buffer_put(m->middle);
		m->middle = NULL;
	}
	/* we do not own m->pages (see struct ceph_msg); just forget them */
	m->nr_pages = 0;
	m->pages = NULL;

	if (m->pagelist) {
		ceph_pagelist_release(m->pagelist);
		kfree(m->pagelist);
		m->pagelist = NULL;
	}

	/* pool-backed messages are recycled; the rest are freed outright */
	if (m->pool)
		ceph_msgpool_put(m->pool, m);
	else
		ceph_msg_kfree(m);
}
2221
/*
 * Dump a message's header, front, middle, and footer to the kernel log
 * as hex, for debugging corrupt or unexpected messages.
 */
void ceph_msg_dump(struct ceph_msg *msg)
{
	pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
		 msg->front_max, msg->nr_pages);
	print_hex_dump(KERN_DEBUG, "header: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       &msg->hdr, sizeof(msg->hdr), true);
	print_hex_dump(KERN_DEBUG, " front: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       msg->front.iov_base, msg->front.iov_len, true);
	/* middle is optional */
	if (msg->middle)
		print_hex_dump(KERN_DEBUG, "middle: ",
			       DUMP_PREFIX_OFFSET, 16, 1,
			       msg->middle->vec.iov_base,
			       msg->middle->vec.iov_len, true);
	print_hex_dump(KERN_DEBUG, "footer: ",
		       DUMP_PREFIX_OFFSET, 16, 1,
		       &msg->footer, sizeof(msg->footer), true);
}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..4caaa5911110
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,254 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
/*
 * Ceph defines these callbacks for handling connection events.
 */
struct ceph_connection_operations {
	/* reference counting for the object embedding the connection */
	struct ceph_connection *(*get)(struct ceph_connection *);
	void (*put)(struct ceph_connection *);

	/* handle an incoming message. */
	void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);

	/* authorize an outgoing connection */
	int (*get_authorizer) (struct ceph_connection *con,
			       void **buf, int *len, int *proto,
			       void **reply_buf, int *reply_len, int force_new);
	int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
	int (*invalidate_authorizer)(struct ceph_connection *con);

	/* protocol version mismatch */
	void (*bad_proto) (struct ceph_connection *con);

	/* there was some error on the socket (disconnect, whatever) */
	void (*fault) (struct ceph_connection *con);

	/* a remote host has terminated a message exchange session, and messages
	 * we sent (or they tried to send us) may be lost. */
	void (*peer_reset) (struct ceph_connection *con);

	/* allocate a message for an incoming frame; may set *skip to make
	 * the messenger drop the message (see ceph_alloc_msg) */
	struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
					struct ceph_msg_header *hdr,
					int *skip);
};
51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
56
/* per-client messenger instance: our identity plus shared state */
struct ceph_messenger {
	struct ceph_entity_inst inst;    /* my name+address */
	struct ceph_entity_addr my_enc_addr;
	struct page *zero_page;          /* used in certain error cases */

	bool nocrc;	/* NOTE(review): presumably disables crc
			 * calculation -- confirm against data path */

	/*
	 * the global_seq counts connections we (attempt to) initiate
	 * in order to disambiguate certain connect race conditions.
	 */
	u32 global_seq;
	spinlock_t global_seq_lock;
};
71
/*
 * a single message.  it contains a header (src, dest, message type, etc.),
 * footer (crc values, mainly), a "front" message body, and possibly a
 * data payload (stored in some number of pages).
 */
struct ceph_msg {
	struct ceph_msg_header hdr;	/* header */
	struct ceph_msg_footer footer;	/* footer */
	struct kvec front;              /* unaligned blobs of message */
	struct ceph_buffer *middle;	/* optional middle section */
	struct page **pages;            /* data payload.  NOT OWNER. */
	unsigned nr_pages;              /* size of page array */
	struct ceph_pagelist *pagelist; /* instead of pages */
	struct list_head list_head;	/* link on a connection queue */
	struct kref kref;		/* see ceph_msg_get/put */
	bool front_is_vmalloc;		/* how front was allocated */
	bool more_to_follow;
	int front_max;			/* allocated size of front */

	struct ceph_msgpool *pool;	/* non-NULL if pool-backed */
};
93
/* cursor tracking progress through a message's data payload */
struct ceph_msg_pos {
	int page, page_pos;  /* which page; offset in page */
	int data_pos;        /* offset in data payload */
	int did_page_crc;    /* true if we've calculated crc for current page */
};
99
100/* ceph connection fault delay defaults, for exponential backoff */
101#define BASE_DELAY_INTERVAL (HZ/2)
102#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
103
104/*
105 * ceph_connection state bit flags
106 *
107 * QUEUED and BUSY are used together to ensure that only a single
108 * thread is currently opening, reading or writing data to the socket.
109 */
110#define LOSSYTX 0 /* we can close channel or drop messages on errors */
111#define CONNECTING 1
112#define NEGOTIATING 2
113#define KEEPALIVE_PENDING 3
114#define WRITE_PENDING 4 /* we have data ready to send */
115#define QUEUED 5 /* there is work queued on this connection */
116#define BUSY 6 /* work is being done */
117#define STANDBY 8 /* no outgoing messages, socket closed. we keep
118 * the ceph_connection around to maintain shared
119 * state with the peer. */
120#define CLOSED 10 /* we've closed the connection */
121#define SOCK_CLOSED 11 /* socket state changed to closed */
122#define OPENING 13 /* open connection w/ (possibly new) peer */
123#define DEAD 14 /* dead, about to kfree */
124
/*
 * A single connection with another host.
 *
 * We maintain a queue of outgoing messages, and some session state to
 * ensure that we can preserve the lossless, ordered delivery of
 * messages in the case of a TCP disconnect.
 */
struct ceph_connection {
	void *private;		/* owner's cookie (e.g. a mon/osd client) */
	atomic_t nref;

	const struct ceph_connection_operations *ops;

	struct ceph_messenger *msgr;
	struct socket *sock;
	unsigned long state;	/* connection state (see flags above) */
	const char *error_msg;  /* error message, if any */

	struct ceph_entity_addr peer_addr; /* peer address */
	struct ceph_entity_name peer_name; /* peer name */
	struct ceph_entity_addr peer_addr_for_me;
	u32 connect_seq;      /* identify the most recent connection
				 attempt for this connection, client */
	u32 peer_global_seq;  /* peer's global seq for this connection */

	int auth_retry;       /* true if we need a newer authorizer */
	void *auth_reply_buf;   /* where to put the authorizer reply */
	int auth_reply_buf_len;

	/* NOTE(review): appears to serialize connection state below --
	 * confirm exact coverage against messenger.c lock usage */
	struct mutex mutex;

	/* out queue */
	struct list_head out_queue;
	struct list_head out_sent;   /* sending or sent but unacked */
	u64 out_seq;		     /* last message queued for send */
	u64 out_seq_sent;            /* last message sent */
	bool out_keepalive_pending;

	u64 in_seq, in_seq_acked;  /* last message received, acked */

	/* connection negotiation temps */
	char in_banner[CEPH_BANNER_MAX_LEN];
	/* outgoing and incoming handshakes never overlap, so the
	 * buffers may share storage */
	union {
		struct {  /* outgoing connection */
			struct ceph_msg_connect out_connect;
			struct ceph_msg_connect_reply in_reply;
		};
		struct {  /* incoming */
			struct ceph_msg_connect in_connect;
			struct ceph_msg_connect_reply out_reply;
		};
	};
	struct ceph_entity_addr actual_peer_addr;

	/* message out temps */
	struct ceph_msg *out_msg;        /* sending message (== tail of
					    out_sent) */
	bool out_msg_done;
	struct ceph_msg_pos out_msg_pos;

	struct kvec out_kvec[8],         /* sending header/footer data */
		*out_kvec_cur;
	int out_kvec_left;   /* kvec's left in out_kvec */
	int out_skip;        /* skip this many bytes */
	int out_kvec_bytes;  /* total bytes left */
	bool out_kvec_is_msg; /* kvec refers to out_msg */
	int out_more;        /* there is more data after the kvecs */
	__le64 out_temp_ack; /* for writing an ack */

	/* message in temps */
	struct ceph_msg_header in_hdr;
	struct ceph_msg *in_msg;
	struct ceph_msg_pos in_msg_pos;
	u32 in_front_crc, in_middle_crc, in_data_crc;  /* calculated crc */

	char in_tag;         /* protocol control byte */
	int in_base_pos;     /* bytes read */
	__le64 in_temp_ack;  /* for reading an ack */

	struct delayed_work work;	    /* send|recv work */
	unsigned long       delay;          /* current delay interval */
};
207
208
209extern const char *pr_addr(const struct sockaddr_storage *ss);
210extern int ceph_parse_ips(const char *c, const char *end,
211 struct ceph_entity_addr *addr,
212 int max_count, int *count);
213
214
215extern int ceph_msgr_init(void);
216extern void ceph_msgr_exit(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern void ceph_con_close(struct ceph_connection *con);
227extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
228extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke_message(struct ceph_connection *con,
230 struct ceph_msg *msg);
231extern void ceph_con_keepalive(struct ceph_connection *con);
232extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
233extern void ceph_con_put(struct ceph_connection *con);
234
235extern struct ceph_msg *ceph_msg_new(int type, int front_len,
236 int page_len, int page_off,
237 struct page **pages);
238extern void ceph_msg_kfree(struct ceph_msg *m);
239
240
/* take an additional reference on a message; returns the message */
static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
{
	kref_get(&msg->kref);
	return msg;
}
246extern void ceph_msg_last_put(struct kref *kref);
/* drop a reference; the last put destroys or recycles the message */
static inline void ceph_msg_put(struct ceph_msg *msg)
{
	kref_put(&msg->kref, ceph_msg_last_put);
}
251
252extern void ceph_msg_dump(struct ceph_msg *msg);
253
254#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..890597c09d43
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,834 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/random.h>
5#include <linux/sched.h>
6
7#include "mon_client.h"
8#include "super.h"
9#include "auth.h"
10#include "decode.h"
11
12/*
13 * Interact with Ceph monitor cluster. Handle requests for new map
14 * versions, and periodically resend as needed. Also implement
15 * statfs() and umount().
16 *
17 * A small cluster of Ceph "monitors" are responsible for managing critical
18 * cluster configuration and state information. An odd number (e.g., 3, 5)
19 * of cmon daemons use a modified version of the Paxos part-time parliament
20 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
21 * list of clients who have mounted the file system.
22 *
23 * We maintain an open, active session with a monitor at all times in order to
24 * receive timely MDSMap updates. We periodically send a keepalive byte on the
25 * TCP socket to ensure we detect a failure. If the connection does break, we
26 * randomly hunt for a new monitor. Once the connection is reestablished, we
27 * resend any outstanding requests.
28 */
29
30const static struct ceph_connection_operations mon_con_ops;
31
32static int __validate_auth(struct ceph_mon_client *monc);
33
34/*
35 * Decode a monmap blob (e.g., during mount).
36 */
37struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
38{
39 struct ceph_monmap *m = NULL;
40 int i, err = -EINVAL;
41 struct ceph_fsid fsid;
42 u32 epoch, num_mon;
43 u16 version;
44 u32 len;
45
46 ceph_decode_32_safe(&p, end, len, bad);
47 ceph_decode_need(&p, end, len, bad);
48
49 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
50
51 ceph_decode_16_safe(&p, end, version, bad);
52
53 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
54 ceph_decode_copy(&p, &fsid, sizeof(fsid));
55 epoch = ceph_decode_32(&p);
56
57 num_mon = ceph_decode_32(&p);
58 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
59
60 if (num_mon >= CEPH_MAX_MON)
61 goto bad;
62 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
63 if (m == NULL)
64 return ERR_PTR(-ENOMEM);
65 m->fsid = fsid;
66 m->epoch = epoch;
67 m->num_mon = num_mon;
68 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
69 for (i = 0; i < num_mon; i++)
70 ceph_decode_addr(&m->mon_inst[i].addr);
71
72 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
73 m->num_mon);
74 for (i = 0; i < m->num_mon; i++)
75 dout("monmap_decode mon%d is %s\n", i,
76 pr_addr(&m->mon_inst[i].addr.in_addr));
77 return m;
78
79bad:
80 dout("monmap_decode failed with %d\n", err);
81 kfree(m);
82 return ERR_PTR(err);
83}
84
85/*
86 * return true if *addr is included in the monmap.
87 */
88int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
89{
90 int i;
91
92 for (i = 0; i < m->num_mon; i++)
93 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
94 return 1;
95 return 0;
96}
97
/*
 * Send an auth request.
 *
 * The payload (len bytes) has already been written into m_auth's front
 * by the caller.  Called with monc->mutex held.
 */
static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
{
	monc->pending_auth = 1;
	monc->m_auth->front.iov_len = len;
	monc->m_auth->hdr.front_len = cpu_to_le32(len);
	/* m_auth is reused for the whole session; keep our own ref so
	 * the send path's put doesn't destroy it */
	ceph_msg_get(monc->m_auth); /* keep our ref */
	ceph_con_send(monc->con, monc->m_auth);
}
109
/*
 * Close monitor session, if any.
 *
 * Called with monc->mutex held.  Resets auth state so a new session
 * starts the handshake from scratch.
 */
static void __close_session(struct ceph_mon_client *monc)
{
	if (monc->con) {
		dout("__close_session closing mon%d\n", monc->cur_mon);
		/* m_auth may still be queued on the old connection */
		ceph_con_revoke(monc->con, monc->m_auth);
		ceph_con_close(monc->con);
		monc->cur_mon = -1;	/* no current monitor */
		monc->pending_auth = 0;
		ceph_auth_reset(monc->auth);
	}
}
124
125/*
126 * Open a session with a (new) monitor.
127 */
128static int __open_session(struct ceph_mon_client *monc)
129{
130 char r;
131 int ret;
132
133 if (monc->cur_mon < 0) {
134 get_random_bytes(&r, 1);
135 monc->cur_mon = r % monc->monmap->num_mon;
136 dout("open_session num=%d r=%d -> mon%d\n",
137 monc->monmap->num_mon, r, monc->cur_mon);
138 monc->sub_sent = 0;
139 monc->sub_renew_after = jiffies; /* i.e., expired */
140 monc->want_next_osdmap = !!monc->want_next_osdmap;
141
142 dout("open_session mon%d opening\n", monc->cur_mon);
143 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
144 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
145 ceph_con_open(monc->con,
146 &monc->monmap->mon_inst[monc->cur_mon].addr);
147
148 /* initiatiate authentication handshake */
149 ret = ceph_auth_build_hello(monc->auth,
150 monc->m_auth->front.iov_base,
151 monc->m_auth->front_max);
152 __send_prepared_auth_request(monc, ret);
153 } else {
154 dout("open_session mon%d already open\n", monc->cur_mon);
155 }
156 return 0;
157}
158
159static bool __sub_expired(struct ceph_mon_client *monc)
160{
161 return time_after_eq(jiffies, monc->sub_renew_after);
162}
163
164/*
165 * Reschedule delayed work timer.
166 */
167static void __schedule_delayed(struct ceph_mon_client *monc)
168{
169 unsigned delay;
170
171 if (monc->cur_mon < 0 || __sub_expired(monc))
172 delay = 10 * HZ;
173 else
174 delay = 20 * HZ;
175 dout("__schedule_delayed after %u\n", delay);
176 schedule_delayed_work(&monc->delayed_work, delay);
177}
178
179/*
180 * Send subscribe request for mdsmap and/or osdmap.
181 */
182static void __send_subscribe(struct ceph_mon_client *monc)
183{
184 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
185 (unsigned)monc->sub_sent, __sub_expired(monc),
186 monc->want_next_osdmap);
187 if ((__sub_expired(monc) && !monc->sub_sent) ||
188 monc->want_next_osdmap == 1) {
189 struct ceph_msg *msg;
190 struct ceph_mon_subscribe_item *i;
191 void *p, *end;
192
193 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
194 if (!msg)
195 return;
196
197 p = msg->front.iov_base;
198 end = p + msg->front.iov_len;
199
200 dout("__send_subscribe to 'mdsmap' %u+\n",
201 (unsigned)monc->have_mdsmap);
202 if (monc->want_next_osdmap) {
203 dout("__send_subscribe to 'osdmap' %u\n",
204 (unsigned)monc->have_osdmap);
205 ceph_encode_32(&p, 3);
206 ceph_encode_string(&p, end, "osdmap", 6);
207 i = p;
208 i->have = cpu_to_le64(monc->have_osdmap);
209 i->onetime = 1;
210 p += sizeof(*i);
211 monc->want_next_osdmap = 2; /* requested */
212 } else {
213 ceph_encode_32(&p, 2);
214 }
215 ceph_encode_string(&p, end, "mdsmap", 6);
216 i = p;
217 i->have = cpu_to_le64(monc->have_mdsmap);
218 i->onetime = 0;
219 p += sizeof(*i);
220 ceph_encode_string(&p, end, "monmap", 6);
221 i = p;
222 i->have = 0;
223 i->onetime = 0;
224 p += sizeof(*i);
225
226 msg->front.iov_len = p - msg->front.iov_base;
227 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
228 ceph_con_send(monc->con, msg);
229
230 monc->sub_sent = jiffies | 1; /* never 0 */
231 }
232}
233
/*
 * Handle a subscribe ack from the monitor: note when the subscription
 * must be renewed, and treat the ack as proof of a live session.
 */
static void handle_subscribe_ack(struct ceph_mon_client *monc,
				 struct ceph_msg *msg)
{
	unsigned seconds;
	struct ceph_mon_subscribe_ack *h = msg->front.iov_base;

	if (msg->front.iov_len < sizeof(*h))
		goto bad;
	seconds = le32_to_cpu(h->duration);

	mutex_lock(&monc->mutex);
	if (monc->hunting) {
		/* first reply from a monitor we were hunting for */
		pr_info("mon%d %s session established\n",
			monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
		monc->hunting = false;
	}
	dout("handle_subscribe_ack after %d seconds\n", seconds);
	/* renew at half the granted duration to stay well ahead */
	monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
	monc->sub_sent = 0;
	mutex_unlock(&monc->mutex);
	return;
bad:
	pr_err("got corrupt subscribe-ack msg\n");
	ceph_msg_dump(msg);
}
259
/*
 * Keep track of which maps we have
 */
int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
{
	/* record the mdsmap epoch we now hold; always succeeds */
	mutex_lock(&monc->mutex);
	monc->have_mdsmap = got;
	mutex_unlock(&monc->mutex);
	return 0;
}
270
/* record the osdmap epoch we now hold and clear any pending request */
int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
{
	mutex_lock(&monc->mutex);
	monc->have_osdmap = got;
	monc->want_next_osdmap = 0;	/* satisfied */
	mutex_unlock(&monc->mutex);
	return 0;
}
279
/*
 * Register interest in the next osdmap
 */
void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
{
	/* NOTE(review): have_osdmap read outside the mutex here; only a
	 * debug print, so at worst the logged value is stale */
	dout("request_next_osdmap have %u\n", monc->have_osdmap);
	mutex_lock(&monc->mutex);
	/* 0 -> 1: want it; leave 2 ("already requested") alone */
	if (!monc->want_next_osdmap)
		monc->want_next_osdmap = 1;
	if (monc->want_next_osdmap < 2)
		__send_subscribe(monc);
	mutex_unlock(&monc->mutex);
}
293
/*
 * Open a monitor session, lazily creating the connection on first use.
 */
int ceph_monc_open_session(struct ceph_mon_client *monc)
{
	if (!monc->con) {
		monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
		if (!monc->con)
			return -ENOMEM;
		/* NOTE(review): con is not zeroed here; assumes
		 * ceph_con_init() fully initializes it -- confirm */
		ceph_con_init(monc->client->msgr, monc->con);
		monc->con->private = monc;
		monc->con->ops = &mon_con_ops;
	}

	mutex_lock(&monc->mutex);
	__open_session(monc);
	__schedule_delayed(monc);
	mutex_unlock(&monc->mutex);
	return 0;
}
314
/*
 * The monitor responds with mount ack indicate mount success.  The
 * included client ticket allows the client to talk to MDSs and OSDs.
 */
static void ceph_monc_handle_map(struct ceph_mon_client *monc,
				 struct ceph_msg *msg)
{
	struct ceph_client *client = monc->client;
	struct ceph_monmap *monmap = NULL, *old = monc->monmap;
	void *p, *end;

	mutex_lock(&monc->mutex);

	dout("handle_monmap\n");
	p = msg->front.iov_base;
	end = p + msg->front.iov_len;

	monmap = ceph_monmap_decode(p, end);
	if (IS_ERR(monmap)) {
		pr_err("problem decoding monmap, %d\n",
		       (int)PTR_ERR(monmap));
		goto out;
	}

	/* reject a map for some other cluster */
	if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
		kfree(monmap);
		goto out;
	}

	/* install the new map and free the one it replaces */
	client->monc.monmap = monmap;
	kfree(old);

out:
	mutex_unlock(&monc->mutex);
	/* wake mount/auth waiters regardless of the outcome */
	wake_up(&client->auth_wq);
}
351
352/*
353 * statfs
354 */
355static struct ceph_mon_statfs_request *__lookup_statfs(
356 struct ceph_mon_client *monc, u64 tid)
357{
358 struct ceph_mon_statfs_request *req;
359 struct rb_node *n = monc->statfs_request_tree.rb_node;
360
361 while (n) {
362 req = rb_entry(n, struct ceph_mon_statfs_request, node);
363 if (tid < req->tid)
364 n = n->rb_left;
365 else if (tid > req->tid)
366 n = n->rb_right;
367 else
368 return req;
369 }
370 return NULL;
371}
372
373static void __insert_statfs(struct ceph_mon_client *monc,
374 struct ceph_mon_statfs_request *new)
375{
376 struct rb_node **p = &monc->statfs_request_tree.rb_node;
377 struct rb_node *parent = NULL;
378 struct ceph_mon_statfs_request *req = NULL;
379
380 while (*p) {
381 parent = *p;
382 req = rb_entry(parent, struct ceph_mon_statfs_request, node);
383 if (new->tid < req->tid)
384 p = &(*p)->rb_left;
385 else if (new->tid > req->tid)
386 p = &(*p)->rb_right;
387 else
388 BUG();
389 }
390
391 rb_link_node(&new->node, parent, p);
392 rb_insert_color(&new->node, &monc->statfs_request_tree);
393}
394
/*
 * Handle a statfs reply: match it to the pending request by tid, copy
 * out the stats, and wake the waiter.
 */
static void handle_statfs_reply(struct ceph_mon_client *monc,
				struct ceph_msg *msg)
{
	struct ceph_mon_statfs_request *req;
	struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
	u64 tid;

	if (msg->front.iov_len != sizeof(*reply))
		goto bad;
	tid = le64_to_cpu(msg->hdr.tid);
	dout("handle_statfs_reply %p tid %llu\n", msg, tid);

	mutex_lock(&monc->mutex);
	req = __lookup_statfs(monc, tid);
	if (req) {
		/* req lives on the waiter's stack (ceph_monc_do_statfs);
		 * it stays registered until that waiter returns */
		*req->buf = reply->st;
		req->result = 0;
	}
	mutex_unlock(&monc->mutex);
	if (req)
		complete(&req->completion);
	return;

bad:
	pr_err("corrupt statfs reply, no tid\n");
	ceph_msg_dump(msg);
}
422
/*
 * (re)send a statfs request
 *
 * Returns 0 or a negative errno if the message could not be allocated.
 */
static int send_statfs(struct ceph_mon_client *monc,
		       struct ceph_mon_statfs_request *req)
{
	struct ceph_msg *msg;
	struct ceph_mon_statfs *h;

	dout("send_statfs tid %llu\n", req->tid);
	msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
	if (IS_ERR(msg))
		return PTR_ERR(msg);
	req->request = msg;
	/* tid lets handle_statfs_reply() match the reply to us */
	msg->hdr.tid = cpu_to_le64(req->tid);
	h = msg->front.iov_base;
	h->monhdr.have_version = 0;
	/* NOTE(review): -1 presumably means "no specific mon session";
	 * confirm against the mon protocol definition */
	h->monhdr.session_mon = cpu_to_le16(-1);
	h->monhdr.session_mon_tid = 0;
	h->fsid = monc->monmap->fsid;
	ceph_con_send(monc->con, msg);
	return 0;
}
446
/*
 * Do a synchronous statfs().
 *
 * The request struct lives on our stack; it is registered in the
 * request tree so handle_statfs_reply() can find it, and is always
 * unregistered (under the mutex) before we return — even on error or
 * interruption — so the reply path can never touch a dead stack frame.
 */
int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
{
	struct ceph_mon_statfs_request req;
	int err;

	req.buf = buf;
	init_completion(&req.completion);

	/* allocate memory for reply */
	err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
	if (err)
		return err;

	/* register request */
	mutex_lock(&monc->mutex);
	req.tid = ++monc->last_tid;
	req.last_attempt = jiffies;
	req.delay = BASE_DELAY_INTERVAL;
	__insert_statfs(monc, &req);
	monc->num_statfs_requests++;
	mutex_unlock(&monc->mutex);

	/* send request and wait */
	err = send_statfs(monc, &req);
	if (!err)
		err = wait_for_completion_interruptible(&req.completion);

	mutex_lock(&monc->mutex);
	rb_erase(&req.node, &monc->statfs_request_tree);
	monc->num_statfs_requests--;
	/* release the reply reservation taken above */
	ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
	mutex_unlock(&monc->mutex);

	if (!err)
		err = req.result;
	return err;
}
487
488/*
489 * Resend pending statfs requests.
490 */
491static void __resend_statfs(struct ceph_mon_client *monc)
492{
493 struct ceph_mon_statfs_request *req;
494 struct rb_node *p;
495
496 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
497 req = rb_entry(p, struct ceph_mon_statfs_request, node);
498 send_statfs(monc, req);
499 }
500}
501
/*
 * Delayed work.  If we haven't mounted yet, retry.  Otherwise,
 * renew/retry subscription as needed (in case it is timing out, or we
 * got an ENOMEM).  And keep the monitor connection alive.
 */
static void delayed_work(struct work_struct *work)
{
	struct ceph_mon_client *monc =
		container_of(work, struct ceph_mon_client, delayed_work.work);

	dout("monc delayed_work\n");
	mutex_lock(&monc->mutex);
	if (monc->hunting) {
		/* no established session: try another (random) monitor */
		__close_session(monc);
		__open_session(monc);  /* continue hunting */
	} else {
		ceph_con_keepalive(monc->con);

		/* resend auth if credentials need refreshing */
		__validate_auth(monc);

		if (monc->auth->ops->is_authenticated(monc->auth))
			__send_subscribe(monc);
	}
	/* always rearm the timer */
	__schedule_delayed(monc);
	mutex_unlock(&monc->mutex);
}
528
/*
 * On startup, we build a temporary monmap populated with the IPs
 * provided by mount(2).
 *
 * Consumes (frees) args->mon_addr.  Returns 0 or -ENOMEM.
 */
static int build_initial_monmap(struct ceph_mon_client *monc)
{
	struct ceph_mount_args *args = monc->client->mount_args;
	struct ceph_entity_addr *mon_addr = args->mon_addr;
	int num_mon = args->num_mon;
	int i;

	/* build initial monmap */
	monc->monmap = kzalloc(sizeof(*monc->monmap) +
			       num_mon*sizeof(monc->monmap->mon_inst[0]),
			       GFP_KERNEL);
	if (!monc->monmap)
		return -ENOMEM;
	for (i = 0; i < num_mon; i++) {
		monc->monmap->mon_inst[i].addr = mon_addr[i];
		/* nonce distinguishes connections from the same addr;
		 * zero it for these synthesized entries */
		monc->monmap->mon_inst[i].addr.nonce = 0;
		monc->monmap->mon_inst[i].name.type =
			CEPH_ENTITY_TYPE_MON;
		monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
	}
	monc->monmap->num_mon = num_mon;
	/* fsid is unknown until the real monmap arrives */
	monc->have_fsid = false;

	/* release addr memory */
	kfree(args->mon_addr);
	args->mon_addr = NULL;
	args->num_mon = 0;
	return 0;
}
562
563int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
564{
565 int err = 0;
566
567 dout("init\n");
568 memset(monc, 0, sizeof(*monc));
569 monc->client = cl;
570 monc->monmap = NULL;
571 mutex_init(&monc->mutex);
572
573 err = build_initial_monmap(monc);
574 if (err)
575 goto out;
576
577 monc->con = NULL;
578
579 /* authentication */
580 monc->auth = ceph_auth_init(cl->mount_args->name,
581 cl->mount_args->secret);
582 if (IS_ERR(monc->auth))
583 return PTR_ERR(monc->auth);
584 monc->auth->want_keys =
585 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
586 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
587
588 /* msg pools */
589 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
590 sizeof(struct ceph_mon_subscribe_ack), 1, false);
591 if (err < 0)
592 goto out_monmap;
593 err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
594 sizeof(struct ceph_mon_statfs_reply), 0, false);
595 if (err < 0)
596 goto out_pool1;
597 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
598 if (err < 0)
599 goto out_pool2;
600
601 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
602 monc->pending_auth = 0;
603 if (IS_ERR(monc->m_auth)) {
604 err = PTR_ERR(monc->m_auth);
605 monc->m_auth = NULL;
606 goto out_pool3;
607 }
608
609 monc->cur_mon = -1;
610 monc->hunting = true;
611 monc->sub_renew_after = jiffies;
612 monc->sub_sent = 0;
613
614 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
615 monc->statfs_request_tree = RB_ROOT;
616 monc->num_statfs_requests = 0;
617 monc->last_tid = 0;
618
619 monc->have_mdsmap = 0;
620 monc->have_osdmap = 0;
621 monc->want_next_osdmap = 1;
622 return 0;
623
624out_pool3:
625 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
626out_pool2:
627 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
628out_pool1:
629 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
630out_monmap:
631 kfree(monc->monmap);
632out:
633 return err;
634}
635
/*
 * Tear down the mon client: stop the timer, close the session, and
 * release everything allocated by ceph_monc_init().
 */
void ceph_monc_stop(struct ceph_mon_client *monc)
{
	dout("stop\n");
	/* stop the timer before touching state it uses */
	cancel_delayed_work_sync(&monc->delayed_work);

	mutex_lock(&monc->mutex);
	__close_session(monc);
	if (monc->con) {
		/* detach so late callbacks find no client, then drop
		 * the connection's ref via its owner ops */
		monc->con->private = NULL;
		monc->con->ops->put(monc->con);
		monc->con = NULL;
	}
	mutex_unlock(&monc->mutex);

	ceph_auth_destroy(monc->auth);

	ceph_msg_put(monc->m_auth);
	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
	ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
	ceph_msgpool_destroy(&monc->msgpool_auth_reply);

	kfree(monc->monmap);
}
659
/*
 * Process an auth reply from the monitor.  Depending on the result we
 * either record the error (waking anyone blocked on auth), send the
 * next prepared auth request, or -- once fully authenticated -- record
 * our global id and kick off subscriptions and pending statfs requests.
 */
static void handle_auth_reply(struct ceph_mon_client *monc,
			      struct ceph_msg *msg)
{
	int ret;

	mutex_lock(&monc->mutex);
	monc->pending_auth = 0;
	ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
				     msg->front.iov_len,
				     monc->m_auth->front.iov_base,
				     monc->m_auth->front_max);
	if (ret < 0) {
		/* hard failure: report to whoever waits on auth_wq */
		monc->client->auth_err = ret;
		wake_up(&monc->client->auth_wq);
	} else if (ret > 0) {
		/* handshake continues; ret is the prepared request length */
		__send_prepared_auth_request(monc, ret);
	} else if (monc->auth->ops->is_authenticated(monc->auth)) {
		dout("authenticated, starting session\n");

		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
		monc->client->msgr->inst.name.num = monc->auth->global_id;

		__send_subscribe(monc);
		__resend_statfs(monc);
	}
	mutex_unlock(&monc->mutex);
}
687
688static int __validate_auth(struct ceph_mon_client *monc)
689{
690 int ret;
691
692 if (monc->pending_auth)
693 return 0;
694
695 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
696 monc->m_auth->front_max);
697 if (ret <= 0)
698 return ret; /* either an error, or no need to authenticate */
699 __send_prepared_auth_request(monc, ret);
700 return 0;
701}
702
/*
 * Locked wrapper around __validate_auth() for external callers.
 */
int ceph_monc_validate_auth(struct ceph_mon_client *monc)
{
	int ret;

	mutex_lock(&monc->mutex);
	ret = __validate_auth(monc);
	mutex_unlock(&monc->mutex);
	return ret;
}
712
/*
 * handle incoming message: route by message type to the appropriate
 * handler, then drop our reference to the message.
 */
static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
{
	struct ceph_mon_client *monc = con->private;
	int type = le16_to_cpu(msg->hdr.type);

	/* NOTE(review): msg is not put on this early return -- confirm
	 * the messenger drops its ref when dispatch bails out */
	if (!monc)
		return;

	switch (type) {
	case CEPH_MSG_AUTH_REPLY:
		handle_auth_reply(monc, msg);
		break;

	case CEPH_MSG_MON_SUBSCRIBE_ACK:
		handle_subscribe_ack(monc, msg);
		break;

	case CEPH_MSG_STATFS_REPLY:
		handle_statfs_reply(monc, msg);
		break;

	case CEPH_MSG_MON_MAP:
		ceph_monc_handle_map(monc, msg);
		break;

	case CEPH_MSG_MDS_MAP:
		ceph_mdsc_handle_map(&monc->client->mdsc, msg);
		break;

	case CEPH_MSG_OSD_MAP:
		ceph_osdc_handle_map(&monc->client->osdc, msg);
		break;

	default:
		pr_err("received unknown message type %d %s\n", type,
		       ceph_msg_type_name(type));
	}
	ceph_msg_put(msg);
}
755
756/*
757 * Allocate memory for incoming message
758 */
759static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
760 struct ceph_msg_header *hdr,
761 int *skip)
762{
763 struct ceph_mon_client *monc = con->private;
764 int type = le16_to_cpu(hdr->type);
765 int front_len = le32_to_cpu(hdr->front_len);
766 struct ceph_msg *m = NULL;
767
768 *skip = 0;
769
770 switch (type) {
771 case CEPH_MSG_MON_SUBSCRIBE_ACK:
772 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
773 break;
774 case CEPH_MSG_STATFS_REPLY:
775 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
776 break;
777 case CEPH_MSG_AUTH_REPLY:
778 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
779 break;
780 case CEPH_MSG_MON_MAP:
781 case CEPH_MSG_MDS_MAP:
782 case CEPH_MSG_OSD_MAP:
783 m = ceph_msg_new(type, front_len, 0, 0, NULL);
784 break;
785 }
786
787 if (!m) {
788 pr_info("alloc_msg unknown type %d\n", type);
789 *skip = 1;
790 }
791 return m;
792}
793
/*
 * If the monitor connection resets, pick a new monitor and resubmit
 * any pending requests.
 */
static void mon_fault(struct ceph_connection *con)
{
	struct ceph_mon_client *monc = con->private;

	if (!monc)
		return;

	dout("mon_fault\n");
	mutex_lock(&monc->mutex);
	if (!con->private)
		goto out;	/* raced with teardown clearing con->private */

	if (monc->con && !monc->hunting)
		pr_info("mon%d %s session lost, "
			"hunting for new mon\n", monc->cur_mon,
			pr_addr(&monc->con->peer_addr.in_addr));

	__close_session(monc);
	if (!monc->hunting) {
		/* start hunting */
		monc->hunting = true;
		__open_session(monc);
	} else {
		/* already hunting, let's wait a bit */
		__schedule_delayed(monc);
	}
out:
	mutex_unlock(&monc->mutex);
}
827
828const static struct ceph_connection_operations mon_con_ops = {
829 .get = ceph_con_get,
830 .put = ceph_con_put,
831 .dispatch = dispatch,
832 .fault = mon_fault,
833 .alloc_msg = mon_alloc_msg,
834};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..b958ad5afa06
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/rbtree.h>
6
7#include "messenger.h"
8#include "msgpool.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_statfs_request;
26
27
/*
 * Generic mechanism for resending monitor requests.
 */
typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
					 int newmon);

/* a pending monitor request */
struct ceph_mon_request {
	struct ceph_mon_client *monc;		/* owning client */
	struct delayed_work delayed_work;	/* resend work item */
	unsigned long delay;			/* resend delay (jiffies) */
	ceph_monc_request_func_t do_request;	/* (re)send callback */
};
41
/*
 * statfs() is done a bit differently because we need to get data back
 * to the caller
 */
struct ceph_mon_statfs_request {
	u64 tid;			/* transaction id */
	struct rb_node node;		/* entry in statfs_request_tree */
	int result;			/* result code from the reply */
	struct ceph_statfs *buf;	/* caller's result buffer */
	struct completion completion;	/* completed when the reply arrives */
	unsigned long last_attempt, delay; /* jiffies */
	struct ceph_msg *request; /* original request */
};
55
/* state for our session with the monitor cluster */
struct ceph_mon_client {
	struct ceph_client *client;
	struct ceph_monmap *monmap;

	struct mutex mutex;		/* protects the state below */
	struct delayed_work delayed_work;

	struct ceph_auth_client *auth;
	struct ceph_msg *m_auth;	/* preallocated auth request msg */
	int pending_auth;		/* nonzero while an auth req is in flight */

	bool hunting;			/* searching for a responsive monitor */
	int cur_mon;                    /* last monitor i contacted */
	unsigned long sub_sent, sub_renew_after;
	struct ceph_connection *con;
	bool have_fsid;

	/* msg pools */
	struct ceph_msgpool msgpool_subscribe_ack;
	struct ceph_msgpool msgpool_statfs_reply;
	struct ceph_msgpool msgpool_auth_reply;

	/* pending statfs requests */
	struct rb_root statfs_request_tree;
	int num_statfs_requests;
	u64 last_tid;

	/* mds/osd map */
	int want_next_osdmap; /* 1 = want, 2 = want+asked */
	u32 have_osdmap, have_mdsmap;	/* map epochs we already have */

#ifdef CONFIG_DEBUG_FS
	struct dentry *debugfs_file;
#endif
};
91
92extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
93extern int ceph_monmap_contains(struct ceph_monmap *m,
94 struct ceph_entity_addr *addr);
95
96extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
97extern void ceph_monc_stop(struct ceph_mon_client *monc);
98
99/*
100 * The model here is to indicate that we need a new map of at least
101 * epoch @want, and also call in when we receive a map. We will
102 * periodically rerequest the map from the monitor cluster until we
103 * get what we want.
104 */
105extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
106extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
107
108extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
109
110extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
111 struct ceph_statfs *buf);
112
113extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116
117
118
119#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..ca3b44a89f2d
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10/*
11 * We use msg pools to preallocate memory for messages we expect to
12 * receive over the wire, to avoid getting ourselves into OOM
13 * conditions at unexpected times. We take use a few different
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31
32
/*
 * Allocate or release as necessary to meet our target pool size.
 *
 * Called with pool->lock held.  The lock is dropped around each
 * allocation (ceph_msg_new may sleep), so pool state is re-checked on
 * every iteration via the while conditions.
 */
static int __fill_msgpool(struct ceph_msgpool *pool)
{
	struct ceph_msg *msg;

	while (pool->num < pool->min) {
		dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
		     pool->min);
		spin_unlock(&pool->lock);
		msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
		spin_lock(&pool->lock);
		if (IS_ERR(msg))
			return PTR_ERR(msg);
		msg->pool = pool;
		list_add(&msg->list_head, &pool->msgs);
		pool->num++;
	}
	while (pool->num > pool->min) {
		msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
		dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
		     pool->min, msg);
		list_del_init(&msg->list_head);
		pool->num--;
		ceph_msg_kfree(msg);
	}
	return 0;
}
62
63int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking)
65{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs);
72 pool->num = 0;
73 pool->min = min;
74 pool->blocking = blocking;
75 init_waitqueue_head(&pool->wait);
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81}
82
/*
 * Free all preallocated messages by dropping the target size to zero
 * and letting __fill_msgpool drain the list.
 */
void ceph_msgpool_destroy(struct ceph_msgpool *pool)
{
	dout("msgpool_destroy %p\n", pool);
	spin_lock(&pool->lock);
	pool->min = 0;
	__fill_msgpool(pool);
	spin_unlock(&pool->lock);
}
91
/*
 * Adjust the reserved (minimum) number of messages by @delta and
 * allocate/free to match.  Returns 0 or a negative errno.
 */
int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
{
	int ret;

	spin_lock(&pool->lock);
	dout("msgpool_resv %p delta %d\n", pool, delta);
	pool->min += delta;
	ret = __fill_msgpool(pool);
	spin_unlock(&pool->lock);
	return ret;
}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1);
113
114 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 }
129
130 while (1) {
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163}
164
/*
 * Return a message to the pool, or free it if the pool is already at
 * its target size.  Reclaimed messages get their front length reset
 * and a single fresh reference; any waiter in msgpool_get is woken.
 */
void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
{
	spin_lock(&pool->lock);
	if (pool->num < pool->min) {
		/* reset msg front_len; user may have changed it */
		msg->front.iov_len = pool->front_len;
		msg->hdr.front_len = cpu_to_le32(pool->front_len);

		kref_set(&msg->kref, 1); /* retake a single ref */
		list_add(&msg->list_head, &pool->msgs);
		pool->num++;
		dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
		     pool->num, pool->min);
		spin_unlock(&pool->lock);
		wake_up(&pool->wait);
	} else {
		dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
		     pool->num, pool->min);
		spin_unlock(&pool->lock);
		ceph_msg_kfree(msg);
	}
}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include "messenger.h"
5
/*
 * we use memory pools for preallocating messages we may receive, to
 * avoid unexpected OOM conditions.
 */
struct ceph_msgpool {
	spinlock_t lock;	/* protects msgs, num, min */
	int front_len;          /* preallocated payload size */
	struct list_head msgs;  /* msgs in the pool; each has 1 ref */
	int num, min;           /* cur, min # msgs in the pool */
	bool blocking;		/* sleep when empty vs. may fail */
	wait_queue_head_t wait;	/* woken when a msg is returned */
};
18
19extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int size, bool blocking);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
26
27#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..8aaab414f3f8
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * tcp connection banner. include a protocol version. and adjust
21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
/*
 * Rollover-safe type and comparator for 32-bit sequence numbers.
 * Comparator returns <0, 0, or >0 as a is before, equal to, or after b
 * in sequence order.
 */
typedef __u32 ceph_seq_t;

static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
{
	/*
	 * Subtract in unsigned arithmetic (well-defined wraparound) and
	 * only then convert to signed; subtracting the values after
	 * converting each to signed could overflow, which is undefined
	 * behavior.
	 */
	return (__s32)(a - b);
}
38
39
/*
 * entity_name -- logical name for a process participating in the
 * network, e.g. 'mds0' or 'osd3'.
 */
struct ceph_entity_name {
	__u8 type;      /* CEPH_ENTITY_TYPE_* */
	__le64 num;	/* instance number within the type (the 0 in mds0) */
} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20
55
56#define CEPH_ENTITY_TYPE_ANY 0xFF
57
58extern const char *ceph_entity_type_name(int type);
59
/*
 * entity_addr -- network address
 */
struct ceph_entity_addr {
	__le32 type;
	__le32 nonce;  /* unique id for process (e.g. pid) */
	struct sockaddr_storage in_addr;	/* socket address */
} __attribute__ ((packed));

/* a named entity together with its network address */
struct ceph_entity_inst {
	struct ceph_entity_name name;
	struct ceph_entity_addr addr;
} __attribute__ ((packed));
73
74
75/* used by message exchange protocol */
76#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
77#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
78#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
79 incoming connection */
80#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
81 with higher cseq */
82#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
83 with higher gseq */
84#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
85#define CEPH_MSGR_TAG_MSG 7 /* message */
86#define CEPH_MSGR_TAG_ACK 8 /* message ack */
87#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
88#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
89#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
90#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
91
92
/*
 * connection negotiation
 */
struct ceph_msg_connect {
	__le64 features;     /* supported feature bits */
	__le32 host_type;    /* CEPH_ENTITY_TYPE_* */
	__le32 global_seq;   /* count connections initiated by this host */
	__le32 connect_seq;  /* count connections initiated in this session */
	__le32 protocol_version;
	__le32 authorizer_protocol;
	__le32 authorizer_len;	/* length of authorizer data */
	__u8  flags;         /* CEPH_MSG_CONNECT_* */
} __attribute__ ((packed));

/* server's response to a ceph_msg_connect */
struct ceph_msg_connect_reply {
	__u8 tag;		/* CEPH_MSGR_TAG_* result code */
	__le64 features;     /* feature bits for this session */
	__le32 global_seq;
	__le32 connect_seq;
	__le32 protocol_version;
	__le32 authorizer_len;
	__u8 flags;
} __attribute__ ((packed));
116
117#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
118
119
/*
 * message header
 *
 * Fixed-size, packed, little-endian wire format; protected by the
 * trailing crc field.
 */
struct ceph_msg_header {
	__le64 seq;       /* message seq# for this session */
	__le64 tid;       /* transaction id */
	__le16 type;      /* message type */
	__le16 priority;  /* priority.  higher value == higher priority */
	__le16 version;   /* version of message encoding */

	__le32 front_len; /* bytes in main payload */
	__le32 middle_len;/* bytes in middle payload */
	__le32 data_len;  /* bytes of data payload */
	__le16 data_off;  /* sender: include full offset;
			     receiver: mask against ~PAGE_MASK */

	struct ceph_entity_inst src, orig_src;	/* sender identities */
	__le32 reserved;
	__le32 crc;       /* header crc32c */
} __attribute__ ((packed));
140
141#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196
144#define CEPH_MSG_PRIO_HIGHEST 255
145
/*
 * follows data payload
 */
struct ceph_msg_footer {
	__le32 front_crc, middle_crc, data_crc;	/* crc32c per section */
	__u8 flags;				/* CEPH_MSG_FOOTER_* */
} __attribute__ ((packed));
153
154#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
155#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
156
157
158#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..dbe63db9762f
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1537 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19const static struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
/*
 * calculate the mapping of a file extent onto an object, and fill out the
 * request accordingly. shorten extent as necessary if it crosses an
 * object boundary.
 *
 * fill osd op in request message.
 */
static void calc_layout(struct ceph_osd_client *osdc,
			struct ceph_vino vino, struct ceph_file_layout *layout,
			u64 off, u64 *plen,
			struct ceph_osd_request *req)
{
	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
	struct ceph_osd_op *op = (void *)(reqhead + 1);
	u64 orig_len = *plen;		/* requested length, pre-truncation */
	u64 objoff, objlen; /* extent in object */
	u64 bno;			/* object (block) number within file */

	reqhead->snapid = cpu_to_le64(vino.snap);

	/* object extent?  may shorten *plen to stay within one object */
	ceph_calc_file_object_mapping(layout, off, plen, &bno,
				      &objoff, &objlen);
	if (*plen < orig_len)
		dout(" skipping last %llu, final file extent %llu~%llu\n",
		     orig_len - *plen, off, *plen);

	/* object name "<ino hex>.<bno, >=8 hex digits>": at most 33
	 * chars + NUL from this format -- assumes r_oid is large
	 * enough (TODO confirm against struct ceph_osd_request) */
	sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
	req->r_oid_len = strlen(req->r_oid);

	op->extent.offset = cpu_to_le64(objoff);
	op->extent.length = cpu_to_le64(objlen);
	req->r_num_pages = calc_pages_for(off, *plen);

	dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
	     req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
}
80
/*
 * requests
 */
/*
 * Final cleanup when the last reference to an osd request is dropped:
 * release request/reply messages, revoke any in-flight page fill,
 * free owned pages, and return the request to its mempool or the heap.
 */
void ceph_osdc_release_request(struct kref *kref)
{
	struct ceph_osd_request *req = container_of(kref,
						    struct ceph_osd_request,
						    r_kref);

	if (req->r_request)
		ceph_msg_put(req->r_request);
	if (req->r_reply)
		ceph_msg_put(req->r_reply);
	if (req->r_con_filling_msg) {
		/* a connection may still be filling r_reply's pages;
		 * revoke before the pages go away */
		dout("release_request revoking pages %p from con %p\n",
		     req->r_pages, req->r_con_filling_msg);
		ceph_con_revoke_message(req->r_con_filling_msg,
					req->r_reply);
		ceph_con_put(req->r_con_filling_msg);
	}
	if (req->r_own_pages)
		ceph_release_page_vector(req->r_pages,
					 req->r_num_pages);
	ceph_put_snap_context(req->r_snapc);
	if (req->r_mempool)
		mempool_free(req, req->r_osdc->req_mempool);
	else
		kfree(req);
}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
168 if (IS_ERR(msg)) {
169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg));
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
182 if (IS_ERR(msg)) {
183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg));
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
238
/*
 * We keep osd requests in an rbtree, sorted by ->r_tid.
 *
 * Insert @new; tids are unique, so a duplicate is a BUG.
 */
static void __insert_request(struct ceph_osd_client *osdc,
			     struct ceph_osd_request *new)
{
	struct rb_node **p = &osdc->requests.rb_node;
	struct rb_node *parent = NULL;
	struct ceph_osd_request *req = NULL;

	while (*p) {
		parent = *p;
		req = rb_entry(parent, struct ceph_osd_request, r_node);
		if (new->r_tid < req->r_tid)
			p = &(*p)->rb_left;
		else if (new->r_tid > req->r_tid)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&new->r_node, parent, p);
	rb_insert_color(&new->r_node, &osdc->requests);
}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
/*
 * Find the request with the smallest tid >= @tid, or NULL if none.
 * When descending left, the current node is the best candidate so far;
 * if no left child exists it is the answer.
 */
static struct ceph_osd_request *
__lookup_request_ge(struct ceph_osd_client *osdc,
		    u64 tid)
{
	struct ceph_osd_request *req;
	struct rb_node *n = osdc->requests.rb_node;

	while (n) {
		req = rb_entry(n, struct ceph_osd_request, r_node);
		if (tid < req->r_tid) {
			if (!n->rb_left)
				return req;
			n = n->rb_left;
		} else if (tid > req->r_tid) {
			n = n->rb_right;
		} else {
			return req;
		}
	}
	return NULL;
}
303
304
/*
 * If the osd connection drops, we need to resubmit all requests.
 */
static void osd_reset(struct ceph_connection *con)
{
	struct ceph_osd *osd = con->private;
	struct ceph_osd_client *osdc;

	if (!osd)
		return;
	dout("osd_reset osd%d\n", osd->o_osd);
	osdc = osd->o_osdc;
	/* hold map_sem for read while requests are re-kicked */
	down_read(&osdc->map_sem);
	kick_requests(osdc, osd);
	up_read(&osdc->map_sem);
}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
/*
 * Drop a reference; free the osd once the last reference is gone.
 */
static void put_osd(struct ceph_osd *osd)
{
	dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
	     atomic_read(&osd->o_ref) - 1);
	if (atomic_dec_and_test(&osd->o_ref))
		kfree(osd);
}
367
/*
 * remove an osd from our map
 *
 * Caller must ensure no requests remain on this osd; we close the
 * connection and drop the map's reference.
 */
static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
{
	dout("__remove_osd %p\n", osd);
	BUG_ON(!list_empty(&osd->o_requests));
	rb_erase(&osd->o_node, &osdc->osds);
	list_del_init(&osd->o_osd_lru);
	ceph_con_close(&osd->o_con);
	put_osd(osd);
}
380
/*
 * Put an idle osd on the lru list and stamp its expiration time.
 */
static void __move_osd_to_lru(struct ceph_osd_client *osdc,
			      struct ceph_osd *osd)
{
	dout("__move_osd_to_lru %p\n", osd);
	BUG_ON(!list_empty(&osd->o_osd_lru));
	list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
	osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
}
389
/*
 * Take an osd off the lru list; no-op if it is not on it.
 */
static void __remove_osd_from_lru(struct ceph_osd *osd)
{
	dout("__remove_osd_from_lru %p\n", osd);
	if (!list_empty(&osd->o_osd_lru))
		list_del_init(&osd->o_osd_lru);
}
396
397static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
398{
399 struct ceph_osd *osd, *nosd;
400
401 dout("__remove_old_osds %p\n", osdc);
402 mutex_lock(&osdc->request_mutex);
403 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
404 if (!remove_all && time_before(jiffies, osd->lru_ttl))
405 break;
406 __remove_osd(osdc, osd);
407 }
408 mutex_unlock(&osdc->request_mutex);
409}
410
/*
 * reset osd connect
 *
 * If the osd has no requests in flight, just remove it entirely;
 * otherwise close and reopen the connection and bump the incarnation
 * so stale traffic can be recognized.
 */
static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
{
	int ret = 0;

	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
	if (list_empty(&osd->o_requests)) {
		__remove_osd(osdc, osd);
	} else {
		ceph_con_close(&osd->o_con);
		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
		osd->o_incarnation++;
	}
	return ret;
}
428
/*
 * Insert @new into the osd rbtree, sorted by osd number; a duplicate
 * entry is a BUG.
 */
static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
{
	struct rb_node **p = &osdc->osds.rb_node;
	struct rb_node *parent = NULL;
	struct ceph_osd *osd = NULL;

	while (*p) {
		parent = *p;
		osd = rb_entry(parent, struct ceph_osd, o_node);
		if (new->o_osd < osd->o_osd)
			p = &(*p)->rb_left;
		else if (new->o_osd > osd->o_osd)
			p = &(*p)->rb_right;
		else
			BUG();
	}

	rb_link_node(&new->o_node, parent, p);
	rb_insert_color(&new->o_node, &osdc->osds);
}
449
450static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
451{
452 struct ceph_osd *osd;
453 struct rb_node *n = osdc->osds.rb_node;
454
455 while (n) {
456 osd = rb_entry(n, struct ceph_osd, o_node);
457 if (o < osd->o_osd)
458 n = n->rb_left;
459 else if (o > osd->o_osd)
460 n = n->rb_right;
461 else
462 return osd;
463 }
464 return NULL;
465}
466
/* arm the request-timeout work to fire after osd_keepalive_timeout */
static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
{
	schedule_delayed_work(&osdc->timeout_work,
			osdc->client->mount_args->osd_keepalive_timeout * HZ);
}
472
/* cancel a pending timeout; does not wait if it is already running */
static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
{
	cancel_delayed_work(&osdc->timeout_work);
}
477
/*
 * Register request, assign tid.  If this is the first request, set up
 * the timeout event.
 *
 * Takes a reference on the request on behalf of the tid tree; the
 * matching put is in __unregister_request().
 */
static void register_request(struct ceph_osd_client *osdc,
			     struct ceph_osd_request *req)
{
	mutex_lock(&osdc->request_mutex);
	req->r_tid = ++osdc->last_tid;
	req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
	INIT_LIST_HEAD(&req->r_req_lru_item);

	dout("register_request %p tid %lld\n", req, req->r_tid);
	__insert_request(osdc, req);
	ceph_osdc_get_request(req);	/* tree's reference */
	osdc->num_requests++;

	if (osdc->num_requests == 1) {
		dout(" first request, scheduling timeout\n");
		__schedule_osd_timeout(osdc);
	}
	mutex_unlock(&osdc->request_mutex);
}
501
/*
 * called under osdc->request_mutex
 *
 * Undo register_request(): drop the request from the tid tree and the
 * req_lru, detach it from its osd (revoking any in-flight send), and
 * drop the tree's reference.  Cancels the timeout when this was the
 * last request.
 */
static void __unregister_request(struct ceph_osd_client *osdc,
				 struct ceph_osd_request *req)
{
	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
	rb_erase(&req->r_node, &osdc->requests);
	osdc->num_requests--;

	if (req->r_osd) {
		/* make sure the original request isn't in flight. */
		ceph_con_revoke(&req->r_osd->o_con, req->r_request);

		list_del_init(&req->r_osd_item);
		/* osd now idle?  park it on the reclaim LRU */
		if (list_empty(&req->r_osd->o_requests))
			__move_osd_to_lru(osdc, req->r_osd);
		req->r_osd = NULL;
	}

	ceph_osdc_put_request(req);	/* tree's reference */

	list_del_init(&req->r_req_lru_item);
	if (osdc->num_requests == 0) {
		dout(" no requests, canceling timeout\n");
		__cancel_osd_timeout(osdc);
	}
}
530
/*
 * Cancel a previously queued request message
 *
 * Revokes the message from the connection if it was handed to the
 * messenger, clears r_sent so it can be resent, and removes the
 * request from the send-time LRU.  Caller holds request_mutex.
 */
static void __cancel_request(struct ceph_osd_request *req)
{
	if (req->r_sent) {
		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
		req->r_sent = 0;
	}
	list_del_init(&req->r_req_lru_item);
}
542
/*
 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
 * (as needed), and set the request r_osd appropriately.  If there is
 * no up osd, set r_osd to NULL.
 *
 * Return 0 if unchanged, 1 if changed, or negative on error.
 *
 * Caller should hold map_sem for read and request_mutex.
 */
static int __map_osds(struct ceph_osd_client *osdc,
		      struct ceph_osd_request *req)
{
	struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
	struct ceph_pg pgid;
	int o = -1;
	int err;

	dout("map_osds %p tid %lld\n", req, req->r_tid);
	/* compute the pg this object maps to under the current osdmap */
	err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
				      &req->r_file_layout, osdc->osdmap);
	if (err)
		return err;
	pgid = reqhead->layout.ol_pgid;
	req->r_pgid = pgid;

	o = ceph_calc_pg_primary(osdc->osdmap, pgid);

	/*
	 * Nothing to do if we are already attached to the right osd
	 * AND the request was sent to the current incarnation of that
	 * osd's connection (i.e. not before a connection reset).
	 */
	if ((req->r_osd && req->r_osd->o_osd == o &&
	     req->r_sent >= req->r_osd->o_incarnation) ||
	    (req->r_osd == NULL && o == -1))
		return 0;  /* no change */

	dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
	     req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
	     req->r_osd ? req->r_osd->o_osd : -1);

	/* detach from the old osd, revoking any queued/in-flight send */
	if (req->r_osd) {
		__cancel_request(req);
		list_del_init(&req->r_osd_item);
		req->r_osd = NULL;
	}

	req->r_osd = __lookup_osd(osdc, o);
	if (!req->r_osd && o >= 0) {
		/* first request for this osd: create struct and connect */
		err = -ENOMEM;
		req->r_osd = create_osd(osdc);
		if (!req->r_osd)
			goto out;

		dout("map_osds osd %p is osd%d\n", req->r_osd, o);
		req->r_osd->o_osd = o;
		req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
		__insert_osd(osdc, req->r_osd);

		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
	}

	if (req->r_osd) {
		__remove_osd_from_lru(req->r_osd);	/* it's busy again */
		list_add(&req->r_osd_item, &req->r_osd->o_requests);
	}
	err = 1;   /* osd changed */

out:
	return err;
}
609
610/*
611 * caller should hold map_sem (for read) and request_mutex
612 */
613static int __send_request(struct ceph_osd_client *osdc,
614 struct ceph_osd_request *req)
615{
616 struct ceph_osd_request_head *reqhead;
617 int err;
618
619 err = __map_osds(osdc, req);
620 if (err < 0)
621 return err;
622 if (req->r_osd == NULL) {
623 dout("send_request %p no up osds in pg\n", req);
624 ceph_monc_request_next_osdmap(&osdc->client->monc);
625 return 0;
626 }
627
628 dout("send_request %p tid %llu to osd%d flags %d\n",
629 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
630
631 reqhead = req->r_request->front.iov_base;
632 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
633 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
634 reqhead->reassert_version = req->r_reassert_version;
635
636 req->r_sent_stamp = jiffies;
637 list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
638
639 ceph_msg_get(req->r_request); /* send consumes a ref */
640 ceph_con_send(&req->r_osd->o_con, req->r_request);
641 req->r_sent = req->r_osd->o_incarnation;
642 return 0;
643}
644
/*
 * Timeout callback, called every N seconds when 1 or more osd
 * requests has been active for more than N seconds.  When this
 * happens, we ping all OSDs with requests who have timed out to
 * ensure any communications channel reset is detected.  Reset the
 * request timeouts another N seconds in the future as we go.
 * Reschedule the timeout event another N seconds in future (unless
 * there are no open requests).
 */
static void handle_timeout(struct work_struct *work)
{
	struct ceph_osd_client *osdc =
		container_of(work, struct ceph_osd_client, timeout_work.work);
	struct ceph_osd_request *req, *last_req = NULL;
	struct ceph_osd *osd;
	unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
	unsigned long keepalive =
		osdc->client->mount_args->osd_keepalive_timeout * HZ;
	unsigned long last_sent = 0;
	struct rb_node *p;
	struct list_head slow_osds;

	dout("timeout\n");
	down_read(&osdc->map_sem);

	ceph_monc_request_next_osdmap(&osdc->client->monc);

	mutex_lock(&osdc->request_mutex);
	/* first, retry any requests whose previous send attempt failed */
	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
		req = rb_entry(p, struct ceph_osd_request, r_node);

		if (req->r_resend) {
			int err;

			dout("osdc resending prev failed %lld\n", req->r_tid);
			err = __send_request(osdc, req);
			if (err)
				dout("osdc failed again on %lld\n", req->r_tid);
			else
				req->r_resend = false;
			continue;
		}
	}

	/*
	 * reset osds that appear to be _really_ unresponsive.  this
	 * is a failsafe measure.. we really shouldn't be getting to
	 * this point if the system is working properly.  the monitors
	 * should mark the osd as failed and we should find out about
	 * it from an updated osd map.
	 */
	while (!list_empty(&osdc->req_lru)) {
		/* req_lru is ordered by send time; stop at the first
		 * request still inside the timeout window */
		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
				 r_req_lru_item);

		if (time_before(jiffies, req->r_sent_stamp + timeout))
			break;

		/* livelock guard: __kick_requests() below must either
		 * have resent this request (fresh r_sent_stamp) or
		 * moved it off the front of the list */
		BUG_ON(req == last_req && req->r_sent_stamp == last_sent);
		last_req = req;
		last_sent = req->r_sent_stamp;

		osd = req->r_osd;
		BUG_ON(!osd);
		pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
			   req->r_tid, osd->o_osd);
		__kick_requests(osdc, osd);
	}

	/*
	 * ping osds that are a bit slow.  this ensures that if there
	 * is a break in the TCP connection we will notice, and reopen
	 * a connection with that osd (from the fault callback).
	 */
	INIT_LIST_HEAD(&slow_osds);
	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
		if (time_before(jiffies, req->r_sent_stamp + keepalive))
			break;

		osd = req->r_osd;
		BUG_ON(!osd);
		dout(" tid %llu is slow, will send keepalive on osd%d\n",
		     req->r_tid, osd->o_osd);
		/* collect first, then ping; each osd is pinged once */
		list_move_tail(&osd->o_keepalive_item, &slow_osds);
	}
	while (!list_empty(&slow_osds)) {
		osd = list_entry(slow_osds.next, struct ceph_osd,
				 o_keepalive_item);
		list_del_init(&osd->o_keepalive_item);
		ceph_con_keepalive(&osd->o_con);
	}

	__schedule_osd_timeout(osdc);
	mutex_unlock(&osdc->request_mutex);

	up_read(&osdc->map_sem);
}
742
743static void handle_osds_timeout(struct work_struct *work)
744{
745 struct ceph_osd_client *osdc =
746 container_of(work, struct ceph_osd_client,
747 osds_timeout_work.work);
748 unsigned long delay =
749 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
750
751 dout("osds timeout\n");
752 down_read(&osdc->map_sem);
753 remove_old_osds(osdc, 0);
754 up_read(&osdc->map_sem);
755
756 schedule_delayed_work(&osdc->osds_timeout_work,
757 round_jiffies_relative(delay));
758}
759
/*
 * handle osd op reply.  either call the callback if it is specified,
 * or do the completion to wake up the waiting thread.
 *
 * A request may see two replies: an initial ack and, for writes, a
 * later ONDISK (safe) commit.  We unregister the request once we have
 * a read reply or the safe write reply.
 */
static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
			 struct ceph_connection *con)
{
	struct ceph_osd_reply_head *rhead = msg->front.iov_base;
	struct ceph_osd_request *req;
	u64 tid;
	int numops, object_len, flags;

	tid = le64_to_cpu(msg->hdr.tid);
	/* sanity check the front payload against the declared op count */
	if (msg->front.iov_len < sizeof(*rhead))
		goto bad;
	numops = le32_to_cpu(rhead->num_ops);
	object_len = le32_to_cpu(rhead->object_len);
	if (msg->front.iov_len != sizeof(*rhead) + object_len +
	    numops * sizeof(struct ceph_osd_op))
		goto bad;
	dout("handle_reply %p tid %llu\n", msg, tid);

	/* lookup */
	mutex_lock(&osdc->request_mutex);
	req = __lookup_request(osdc, tid);
	if (req == NULL) {
		dout("handle_reply tid %llu dne\n", tid);
		mutex_unlock(&osdc->request_mutex);
		return;
	}
	ceph_osdc_get_request(req);	/* hold req across callbacks below */
	flags = le32_to_cpu(rhead->flags);

	/*
	 * if this connection filled our message, drop our reference now, to
	 * avoid a (safe but slower) revoke later.
	 */
	if (req->r_con_filling_msg == con && req->r_reply == msg) {
		dout(" dropping con_filling_msg ref %p\n", con);
		req->r_con_filling_msg = NULL;
		ceph_con_put(con);
	}

	if (!req->r_got_reply) {
		/* first reply for this request: record the result */
		unsigned bytes;

		req->r_result = le32_to_cpu(rhead->result);
		bytes = le32_to_cpu(msg->hdr.data_len);
		dout("handle_reply result %d bytes %d\n", req->r_result,
		     bytes);
		/* a successful read's result is the byte count */
		if (req->r_result == 0)
			req->r_result = bytes;

		/* in case this is a write and we need to replay, */
		req->r_reassert_version = rhead->reassert_version;

		req->r_got_reply = 1;
	} else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
		/* second non-ONDISK reply: nothing new to do */
		dout("handle_reply tid %llu dup ack\n", tid);
		mutex_unlock(&osdc->request_mutex);
		goto done;
	}

	dout("handle_reply tid %llu flags %d\n", tid, flags);

	/* either this is a read, or we got the safe response */
	if ((flags & CEPH_OSD_FLAG_ONDISK) ||
	    ((flags & CEPH_OSD_FLAG_WRITE) == 0))
		__unregister_request(osdc, req);

	mutex_unlock(&osdc->request_mutex);

	if (req->r_callback)
		req->r_callback(req, msg);
	else
		complete(&req->r_completion);

	if (flags & CEPH_OSD_FLAG_ONDISK) {
		if (req->r_safe_callback)
			req->r_safe_callback(req, msg);
		complete(&req->r_safe_completion);  /* fsync waiter */
	}

done:
	ceph_osdc_put_request(req);
	return;

bad:
	pr_err("corrupt osd_op_reply got %d %d expected %d\n",
	       (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
	       (int)sizeof(*rhead));
	ceph_msg_dump(msg);
}
853
854
/*
 * Remap and resend requests after an osdmap change or an osd reset.
 * If @kickosd is set, only that osd is reset and only its requests
 * (plus any marked r_resend) are resent; otherwise every osd whose
 * up state or address changed is reset and all requests remapped.
 *
 * Returns the number of requests that currently map to no up osd
 * (caller should request a newer map if nonzero).
 * Caller holds request_mutex; see kick_requests().
 */
static int __kick_requests(struct ceph_osd_client *osdc,
			  struct ceph_osd *kickosd)
{
	struct ceph_osd_request *req;
	struct rb_node *p, *n;
	int needmap = 0;
	int err;

	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
	if (kickosd) {
		__reset_osd(osdc, kickosd);
	} else {
		/* reset every osd that is down or has moved address;
		 * iterate with a saved next since __reset_osd may
		 * erase the current node */
		for (p = rb_first(&osdc->osds); p; p = n) {
			struct ceph_osd *osd =
				rb_entry(p, struct ceph_osd, o_node);

			n = rb_next(p);
			if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
			    memcmp(&osd->o_con.peer_addr,
				   ceph_osd_addr(osdc->osdmap,
						 osd->o_osd),
				   sizeof(struct ceph_entity_addr)) != 0)
				__reset_osd(osdc, osd);
		}
	}

	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
		req = rb_entry(p, struct ceph_osd_request, r_node);

		if (req->r_resend) {
			dout(" r_resend set on tid %llu\n", req->r_tid);
			__cancel_request(req);
			goto kick;
		}
		if (req->r_osd && kickosd == req->r_osd) {
			__cancel_request(req);
			goto kick;
		}

		err = __map_osds(osdc, req);
		if (err == 0)
			continue;  /* no change */
		if (err < 0) {
			/*
			 * FIXME: really, we should set the request
			 * error and fail if this isn't a 'nofail'
			 * request, but that's a fair bit more
			 * complicated to do.  So retry!
			 */
			dout(" setting r_resend on %llu\n", req->r_tid);
			req->r_resend = true;
			continue;
		}
		if (req->r_osd == NULL) {
			dout("tid %llu maps to no valid osd\n", req->r_tid);
			needmap++;  /* request a newer map */
			continue;
		}

kick:
		/* NOTE(review): this path dereferences req->r_osd (in the
		 * debug print) — looks like the r_resend jump assumes the
		 * request is still mapped to an osd; confirm */
		dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
		     req->r_osd->o_osd);
		req->r_flags |= CEPH_OSD_FLAG_RETRY;
		err = __send_request(osdc, req);
		if (err) {
			dout(" setting r_resend on %llu\n", req->r_tid);
			req->r_resend = true;
		}
	}

	return needmap;
}
927
928/*
929 * Resubmit osd requests whose osd or osd address has changed. Request
930 * a new osd map if osds are down, or we are otherwise unable to determine
931 * how to direct a request.
932 *
933 * Close connections to down osds.
934 *
935 * If @who is specified, resubmit requests for that specific osd.
936 *
937 * Caller should hold map_sem for read and request_mutex.
938 */
939static void kick_requests(struct ceph_osd_client *osdc,
940 struct ceph_osd *kickosd)
941{
942 int needmap;
943
944 mutex_lock(&osdc->request_mutex);
945 needmap = __kick_requests(osdc, kickosd);
946 mutex_unlock(&osdc->request_mutex);
947
948 if (needmap) {
949 dout("%d requests for down osds, need new map\n", needmap);
950 ceph_monc_request_next_osdmap(&osdc->client->monc);
951 }
952
953}
/*
 * Process updated osd map.
 *
 * The message contains any number of incremental and full maps, normally
 * indicating some sort of topology change in the cluster.  Kick requests
 * off to different OSDs as needed.
 *
 * We prefer incrementals (applied in epoch order); if any incremental
 * applied, full maps are skipped.  Otherwise we decode the last (newest)
 * full map if it is newer than what we have.
 */
void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
	void *p, *end, *next;
	u32 nr_maps, maplen;
	u32 epoch;
	struct ceph_osdmap *newmap = NULL, *oldmap;
	int err;
	struct ceph_fsid fsid;

	dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
	p = msg->front.iov_base;
	end = p + msg->front.iov_len;

	/* verify fsid */
	ceph_decode_need(&p, end, sizeof(fsid), bad);
	ceph_decode_copy(&p, &fsid, sizeof(fsid));
	if (ceph_check_fsid(osdc->client, &fsid) < 0)
		return;

	down_write(&osdc->map_sem);

	/* incremental maps */
	ceph_decode_32_safe(&p, end, nr_maps, bad);
	dout(" %d inc maps\n", nr_maps);
	while (nr_maps > 0) {
		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
		epoch = ceph_decode_32(&p);
		maplen = ceph_decode_32(&p);
		ceph_decode_need(&p, end, maplen, bad);
		next = p + maplen;
		/* only applicable if it is exactly our epoch + 1 */
		if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
			dout("applying incremental map %u len %d\n",
			     epoch, maplen);
			newmap = osdmap_apply_incremental(&p, next,
							  osdc->osdmap,
							  osdc->client->msgr);
			if (IS_ERR(newmap)) {
				err = PTR_ERR(newmap);
				goto bad;
			}
			BUG_ON(!newmap);
			/* the incremental may update in place or
			 * return a replacement map */
			if (newmap != osdc->osdmap) {
				ceph_osdmap_destroy(osdc->osdmap);
				osdc->osdmap = newmap;
			}
		} else {
			dout("ignoring incremental map %u len %d\n",
			     epoch, maplen);
		}
		p = next;
		nr_maps--;
	}
	if (newmap)
		goto done;

	/* full maps */
	ceph_decode_32_safe(&p, end, nr_maps, bad);
	dout(" %d full maps\n", nr_maps);
	while (nr_maps) {
		ceph_decode_need(&p, end, 2*sizeof(u32), bad);
		epoch = ceph_decode_32(&p);
		maplen = ceph_decode_32(&p);
		ceph_decode_need(&p, end, maplen, bad);
		if (nr_maps > 1) {
			dout("skipping non-latest full map %u len %d\n",
			     epoch, maplen);
		} else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
			dout("skipping full map %u len %d, "
			     "older than our %u\n", epoch, maplen,
			     osdc->osdmap->epoch);
		} else {
			dout("taking full map %u len %d\n", epoch, maplen);
			newmap = osdmap_decode(&p, p+maplen);
			if (IS_ERR(newmap)) {
				err = PTR_ERR(newmap);
				goto bad;
			}
			BUG_ON(!newmap);
			oldmap = osdc->osdmap;
			osdc->osdmap = newmap;
			if (oldmap)
				ceph_osdmap_destroy(oldmap);
		}
		p += maplen;
		nr_maps--;
	}

done:
	/* trade the write lock for a read lock so we can kick requests
	 * (which takes request_mutex) while still pinning the map */
	downgrade_write(&osdc->map_sem);
	ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
	if (newmap)
		kick_requests(osdc, NULL);
	up_read(&osdc->map_sem);
	return;

bad:
	pr_err("osdc handle_map corrupt msg\n");
	ceph_msg_dump(msg);
	up_write(&osdc->map_sem);
	return;
}
1062
1063
/*
 * A read request prepares specific pages that data is to be read into.
 * When a message is being read off the wire, we call prepare_pages to
 * find those pages.
 *  0 = success, -1 failure.
 *
 * Points the incoming message's page vector at the pages the request
 * preallocated.  Caller holds request_mutex (see get_reply()).
 */
static int __prepare_pages(struct ceph_connection *con,
			   struct ceph_msg_header *hdr,
			   struct ceph_osd_request *req,
			   u64 tid,
			   struct ceph_msg *m)
{
	struct ceph_osd *osd = con->private;
	struct ceph_osd_client *osdc;
	int ret = -1;
	int data_len = le32_to_cpu(hdr->data_len);
	unsigned data_off = le16_to_cpu(hdr->data_off);

	/* pages needed for the incoming data at this offset */
	int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);

	if (!osd)
		return -1;

	osdc = osd->o_osdc;

	dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
	     tid, req->r_num_pages, want);
	if (unlikely(req->r_num_pages < want))
		goto out;
	m->pages = req->r_pages;
	m->nr_pages = req->r_num_pages;
	ret = 0; /* success */
out:
	/* the request not having enough pages is considered fatal here */
	BUG_ON(ret < 0 || m->nr_pages < want);

	return ret;
}
1101
/*
 * Register request, send initial attempt.
 *
 * If @nofail, a failed send marks the request for retry instead of
 * unregistering it.  Returns 0 or a negative send error.
 */
int ceph_osdc_start_request(struct ceph_osd_client *osdc,
			    struct ceph_osd_request *req,
			    bool nofail)
{
	int rc = 0;

	req->r_request->pages = req->r_pages;
	req->r_request->nr_pages = req->r_num_pages;

	register_request(osdc, req);

	down_read(&osdc->map_sem);
	mutex_lock(&osdc->request_mutex);
	/*
	 * a racing kick_requests() may have sent the message for us
	 * while we dropped request_mutex above, so only send now if
	 * the request still hasn't been touched yet.
	 */
	if (req->r_sent == 0) {
		rc = __send_request(osdc, req);
		if (rc) {
			if (nofail) {
				dout("osdc_start_request failed send, "
				     " marking %lld\n", req->r_tid);
				req->r_resend = true;
				rc = 0;
			} else {
				__unregister_request(osdc, req);
			}
		}
	}
	mutex_unlock(&osdc->request_mutex);
	up_read(&osdc->map_sem);
	return rc;
}
1140
1141/*
1142 * wait for a request to complete
1143 */
1144int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1145 struct ceph_osd_request *req)
1146{
1147 int rc;
1148
1149 rc = wait_for_completion_interruptible(&req->r_completion);
1150 if (rc < 0) {
1151 mutex_lock(&osdc->request_mutex);
1152 __cancel_request(req);
1153 __unregister_request(osdc, req);
1154 mutex_unlock(&osdc->request_mutex);
1155 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1156 return rc;
1157 }
1158
1159 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1160 return req->r_result;
1161}
1162
/*
 * sync - wait for all in-flight requests to flush.  avoid starvation.
 *
 * Only waits for writes that existed when we started (tid <= last_tid);
 * requests registered later are ignored.  We drop request_mutex while
 * sleeping, so the tree can change under us; restarting the lookup at
 * next_tid keeps the walk valid.
 */
void ceph_osdc_sync(struct ceph_osd_client *osdc)
{
	struct ceph_osd_request *req;
	u64 last_tid, next_tid = 0;

	mutex_lock(&osdc->request_mutex);
	last_tid = osdc->last_tid;
	while (1) {
		req = __lookup_request_ge(osdc, next_tid);
		if (!req)
			break;
		if (req->r_tid > last_tid)
			break;

		next_tid = req->r_tid + 1;
		/* reads complete immediately; only writes need a flush */
		if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
			continue;

		/* hold a ref so the request survives while we sleep */
		ceph_osdc_get_request(req);
		mutex_unlock(&osdc->request_mutex);
		dout("sync waiting on tid %llu (last is %llu)\n",
		     req->r_tid, last_tid);
		wait_for_completion(&req->r_safe_completion);
		mutex_lock(&osdc->request_mutex);
		ceph_osdc_put_request(req);
	}
	mutex_unlock(&osdc->request_mutex);
	dout("sync done (thru tid %llu)\n", last_tid);
}
1195
/*
 * init, shutdown
 */

/*
 * Initialize an osd client: locks, trees, work items, and the
 * request/message pools.  Also arms the periodic idle-osd reaper.
 * Returns 0 or -ENOMEM.
 */
int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
{
	int err;

	dout("init\n");
	osdc->client = client;
	osdc->osdmap = NULL;
	init_rwsem(&osdc->map_sem);
	init_completion(&osdc->map_waiters);
	osdc->last_requested_map = 0;
	mutex_init(&osdc->request_mutex);
	osdc->last_tid = 0;
	osdc->osds = RB_ROOT;
	INIT_LIST_HEAD(&osdc->osd_lru);
	osdc->requests = RB_ROOT;
	INIT_LIST_HEAD(&osdc->req_lru);
	osdc->num_requests = 0;
	INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
	INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);

	schedule_delayed_work(&osdc->osds_timeout_work,
	   round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));

	err = -ENOMEM;
	osdc->req_mempool = mempool_create_kmalloc_pool(10,
					sizeof(struct ceph_osd_request));
	if (!osdc->req_mempool)
		goto out;

	/* preallocated message pools so sends/replies can't fail on OOM */
	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
	if (err < 0)
		goto out_mempool;
	err = ceph_msgpool_init(&osdc->msgpool_op_reply,
				OSD_OPREPLY_FRONT_LEN, 10, true);
	if (err < 0)
		goto out_msgpool;
	return 0;

out_msgpool:
	ceph_msgpool_destroy(&osdc->msgpool_op);
out_mempool:
	mempool_destroy(osdc->req_mempool);
out:
	return err;
}
1244
/*
 * Tear down an osd client: stop the periodic work, free the osdmap,
 * close out all osds, and destroy the pools allocated in init.
 */
void ceph_osdc_stop(struct ceph_osd_client *osdc)
{
	cancel_delayed_work_sync(&osdc->timeout_work);
	cancel_delayed_work_sync(&osdc->osds_timeout_work);
	if (osdc->osdmap) {
		ceph_osdmap_destroy(osdc->osdmap);
		osdc->osdmap = NULL;
	}
	remove_old_osds(osdc, 1);	/* remove_all */
	mempool_destroy(osdc->req_mempool);
	ceph_msgpool_destroy(&osdc->msgpool_op);
	ceph_msgpool_destroy(&osdc->msgpool_op_reply);
}
1258
1259/*
1260 * Read some contiguous pages. If we cross a stripe boundary, shorten
1261 * *plen. Return number of bytes read, or error.
1262 */
1263int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1264 struct ceph_vino vino, struct ceph_file_layout *layout,
1265 u64 off, u64 *plen,
1266 u32 truncate_seq, u64 truncate_size,
1267 struct page **pages, int num_pages)
1268{
1269 struct ceph_osd_request *req;
1270 int rc = 0;
1271
1272 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1273 vino.snap, off, *plen);
1274 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1275 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1276 NULL, 0, truncate_seq, truncate_size, NULL,
1277 false, 1);
1278 if (IS_ERR(req))
1279 return PTR_ERR(req);
1280
1281 /* it may be a short read due to an object boundary */
1282 req->r_pages = pages;
1283 num_pages = calc_pages_for(off, *plen);
1284 req->r_num_pages = num_pages;
1285
1286 dout("readpages final extent is %llu~%llu (%d pages)\n",
1287 off, *plen, req->r_num_pages);
1288
1289 rc = ceph_osdc_start_request(osdc, req, false);
1290 if (!rc)
1291 rc = ceph_osdc_wait_request(osdc, req);
1292
1293 ceph_osdc_put_request(req);
1294 dout("readpages result %d\n", rc);
1295 return rc;
1296}
1297
1298/*
1299 * do a synchronous write on N pages
1300 */
1301int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1302 struct ceph_file_layout *layout,
1303 struct ceph_snap_context *snapc,
1304 u64 off, u64 len,
1305 u32 truncate_seq, u64 truncate_size,
1306 struct timespec *mtime,
1307 struct page **pages, int num_pages,
1308 int flags, int do_sync, bool nofail)
1309{
1310 struct ceph_osd_request *req;
1311 int rc = 0;
1312
1313 BUG_ON(vino.snap != CEPH_NOSNAP);
1314 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1315 CEPH_OSD_OP_WRITE,
1316 flags | CEPH_OSD_FLAG_ONDISK |
1317 CEPH_OSD_FLAG_WRITE,
1318 snapc, do_sync,
1319 truncate_seq, truncate_size, mtime,
1320 nofail, 1);
1321 if (IS_ERR(req))
1322 return PTR_ERR(req);
1323
1324 /* it may be a short write due to an object boundary */
1325 req->r_pages = pages;
1326 req->r_num_pages = calc_pages_for(off, len);
1327 dout("writepages %llu~%llu (%d pages)\n", off, len,
1328 req->r_num_pages);
1329
1330 rc = ceph_osdc_start_request(osdc, req, nofail);
1331 if (!rc)
1332 rc = ceph_osdc_wait_request(osdc, req);
1333
1334 ceph_osdc_put_request(req);
1335 if (rc == 0)
1336 rc = len;
1337 dout("writepages result %d\n", rc);
1338 return rc;
1339}
1340
1341/*
1342 * handle incoming message
1343 */
1344static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1345{
1346 struct ceph_osd *osd = con->private;
1347 struct ceph_osd_client *osdc;
1348 int type = le16_to_cpu(msg->hdr.type);
1349
1350 if (!osd)
1351 return;
1352 osdc = osd->o_osdc;
1353
1354 switch (type) {
1355 case CEPH_MSG_OSD_MAP:
1356 ceph_osdc_handle_map(osdc, msg);
1357 break;
1358 case CEPH_MSG_OSD_OPREPLY:
1359 handle_reply(osdc, msg, con);
1360 break;
1361
1362 default:
1363 pr_err("received unknown message type %d %s\n", type,
1364 ceph_msg_type_name(type));
1365 }
1366 ceph_msg_put(msg);
1367}
1368
1369/*
1370 * lookup and return message for incoming reply
1371 */
1372static struct ceph_msg *get_reply(struct ceph_connection *con,
1373 struct ceph_msg_header *hdr,
1374 int *skip)
1375{
1376 struct ceph_osd *osd = con->private;
1377 struct ceph_osd_client *osdc = osd->o_osdc;
1378 struct ceph_msg *m;
1379 struct ceph_osd_request *req;
1380 int front = le32_to_cpu(hdr->front_len);
1381 int data_len = le32_to_cpu(hdr->data_len);
1382 u64 tid;
1383 int err;
1384
1385 tid = le64_to_cpu(hdr->tid);
1386 mutex_lock(&osdc->request_mutex);
1387 req = __lookup_request(osdc, tid);
1388 if (!req) {
1389 *skip = 1;
1390 m = NULL;
1391 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1392 osd->o_osd);
1393 goto out;
1394 }
1395
1396 if (req->r_con_filling_msg) {
1397 dout("get_reply revoking msg %p from old con %p\n",
1398 req->r_reply, req->r_con_filling_msg);
1399 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1400 ceph_con_put(req->r_con_filling_msg);
1401 }
1402
1403 if (front > req->r_reply->front.iov_len) {
1404 pr_warning("get_reply front %d > preallocated %d\n",
1405 front, (int)req->r_reply->front.iov_len);
1406 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
1407 if (IS_ERR(m))
1408 goto out;
1409 ceph_msg_put(req->r_reply);
1410 req->r_reply = m;
1411 }
1412 m = ceph_msg_get(req->r_reply);
1413
1414 if (data_len > 0) {
1415 err = __prepare_pages(con, hdr, req, tid, m);
1416 if (err < 0) {
1417 *skip = 1;
1418 ceph_msg_put(m);
1419 m = ERR_PTR(err);
1420 }
1421 }
1422 *skip = 0;
1423 req->r_con_filling_msg = ceph_con_get(con);
1424 dout("get_reply tid %lld %p\n", tid, m);
1425
1426out:
1427 mutex_unlock(&osdc->request_mutex);
1428 return m;
1429
1430}
1431
1432static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1433 struct ceph_msg_header *hdr,
1434 int *skip)
1435{
1436 struct ceph_osd *osd = con->private;
1437 int type = le16_to_cpu(hdr->type);
1438 int front = le32_to_cpu(hdr->front_len);
1439
1440 switch (type) {
1441 case CEPH_MSG_OSD_MAP:
1442 return ceph_msg_new(type, front, 0, 0, NULL);
1443 case CEPH_MSG_OSD_OPREPLY:
1444 return get_reply(con, hdr, skip);
1445 default:
1446 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1447 osd->o_osd);
1448 *skip = 1;
1449 return NULL;
1450 }
1451}
1452
1453/*
1454 * Wrappers to refcount containing ceph_osd struct
1455 */
1456static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1457{
1458 struct ceph_osd *osd = con->private;
1459 if (get_osd(osd))
1460 return con;
1461 return NULL;
1462}
1463
/* drop the ref taken by get_osd_con() */
static void put_osd_con(struct ceph_connection *con)
{
	struct ceph_osd *osd = con->private;
	put_osd(osd);
}
1469
/*
 * authentication
 */

/*
 * Hand the messenger an authorizer for this osd connection, creating
 * one via the auth client if needed.  @force_new discards any cached
 * authorizer first (e.g. after a rejected handshake).
 */
static int get_authorizer(struct ceph_connection *con,
			  void **buf, int *len, int *proto,
			  void **reply_buf, int *reply_len, int force_new)
{
	struct ceph_osd *o = con->private;
	struct ceph_osd_client *osdc = o->o_osdc;
	struct ceph_auth_client *ac = osdc->client->monc.auth;
	int ret = 0;

	if (force_new && o->o_authorizer) {
		ac->ops->destroy_authorizer(ac, o->o_authorizer);
		o->o_authorizer = NULL;
	}
	if (o->o_authorizer == NULL) {
		ret = ac->ops->create_authorizer(
			ac, CEPH_ENTITY_TYPE_OSD,
			&o->o_authorizer,
			&o->o_authorizer_buf,
			&o->o_authorizer_buf_len,
			&o->o_authorizer_reply_buf,
			&o->o_authorizer_reply_buf_len);
		if (ret)
			return ret;
	}

	/* point the messenger at our (possibly cached) buffers */
	*proto = ac->protocol;
	*buf = o->o_authorizer_buf;
	*len = o->o_authorizer_buf_len;
	*reply_buf = o->o_authorizer_reply_buf;
	*reply_len = o->o_authorizer_reply_buf_len;
	return 0;
}
1505
1506
1507static int verify_authorizer_reply(struct ceph_connection *con, int len)
1508{
1509 struct ceph_osd *o = con->private;
1510 struct ceph_osd_client *osdc = o->o_osdc;
1511 struct ceph_auth_client *ac = osdc->client->monc.auth;
1512
1513 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1514}
1515
/*
 * Our osd authorizer was rejected: invalidate cached OSD credentials
 * and revalidate our authentication with the monitors.
 */
static int invalidate_authorizer(struct ceph_connection *con)
{
	struct ceph_osd *o = con->private;
	struct ceph_osd_client *osdc = o->o_osdc;
	struct ceph_auth_client *ac = osdc->client->monc.auth;

	if (ac->ops->invalidate_authorizer)
		ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);

	return ceph_monc_validate_auth(&osdc->client->monc);
}
1527
1528const static struct ceph_connection_operations osd_con_ops = {
1529 .get = get_osd_con,
1530 .put = put_osd_con,
1531 .dispatch = dispatch,
1532 .get_authorizer = get_authorizer,
1533 .verify_authorizer_reply = verify_authorizer_reply,
1534 .invalidate_authorizer = invalidate_authorizer,
1535 .alloc_msg = alloc_msg,
1536 .fault = osd_reset,
1537};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..1b1a3ca43afc
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,166 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
/* a given osd we're communicating with */
struct ceph_osd {
	atomic_t o_ref;			/* reference count */
	struct ceph_osd_client *o_osdc;	/* owning client */
	int o_osd;			/* osd id */
	int o_incarnation;
	struct rb_node o_node;		/* node in osdc's osd tree */
	struct ceph_connection o_con;	/* messenger session to this osd */
	struct list_head o_requests;	/* requests targeting this osd */
	struct list_head o_osd_lru;	/* item on osdc->osd_lru when idle */
	struct ceph_authorizer *o_authorizer;
	void *o_authorizer_buf, *o_authorizer_reply_buf;
	size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
	unsigned long lru_ttl;		/* when this idle osd may be closed */
	int o_marked_for_keepalive;
	struct list_head o_keepalive_item;
};
42
/* an in-flight request */
struct ceph_osd_request {
	u64 r_tid; /* unique for this client */
	struct rb_node r_node;		/* node in osdc->requests tree */
	struct list_head r_req_lru_item;
	struct list_head r_osd_item;	/* item on r_osd->o_requests */
	struct ceph_osd *r_osd;		/* osd currently mapped to, or NULL */
	struct ceph_pg r_pgid;		/* placement group for this object */

	struct ceph_connection *r_con_filling_msg;

	struct ceph_msg *r_request, *r_reply;
	int r_result;
	int r_flags; /* any additional flags for the osd */
	u32 r_sent; /* >0 if r_request is sending/sent */
	int r_got_reply;

	struct ceph_osd_client *r_osdc;
	struct kref r_kref;		/* see ceph_osdc_get/put_request */
	bool r_mempool;			/* allocated from req_mempool */
	struct completion r_completion, r_safe_completion;
	ceph_osdc_callback_t r_callback, r_safe_callback;
	struct ceph_eversion r_reassert_version;
	struct list_head r_unsafe_item;

	struct inode *r_inode; /* for use by callbacks */
	struct writeback_control *r_wbc; /* ditto */

	char r_oid[40]; /* object name */
	int r_oid_len;
	unsigned long r_sent_stamp;
	bool r_resend; /* msg send failed, needs retry */

	struct ceph_file_layout r_file_layout;
	struct ceph_snap_context *r_snapc; /* snap context for writes */
	unsigned r_num_pages; /* size of page array (follows) */
	struct page **r_pages; /* pages for data payload */
	int r_pages_from_pool;
	int r_own_pages; /* if true, i own page list */
};
83
/* per-client state for talking to the osd cluster */
struct ceph_osd_client {
	struct ceph_client *client;

	struct ceph_osdmap *osdmap; /* current map */
	struct rw_semaphore map_sem;	/* protects osdmap */
	struct completion map_waiters;
	u64 last_requested_map;

	struct mutex request_mutex;	/* protects the fields below */
	struct rb_root osds; /* osds */
	struct list_head osd_lru; /* idle osds */
	u64 timeout_tid; /* tid of timeout triggering rq */
	u64 last_tid; /* tid of last request */
	struct rb_root requests; /* pending requests */
	struct list_head req_lru; /* pending requests lru */
	int num_requests;
	struct delayed_work timeout_work;
	struct delayed_work osds_timeout_work;
#ifdef CONFIG_DEBUG_FS
	struct dentry *debugfs_file;
#endif

	mempool_t *req_mempool;		/* for nofail request allocation */

	struct ceph_msgpool msgpool_op;
	struct ceph_msgpool msgpool_op_reply;
};
111
112extern int ceph_osdc_init(struct ceph_osd_client *osdc,
113 struct ceph_client *client);
114extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
115
116extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
117 struct ceph_msg *msg);
118extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
119 struct ceph_msg *msg);
120
121extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
122 struct ceph_file_layout *layout,
123 struct ceph_vino vino,
124 u64 offset, u64 *len, int op, int flags,
125 struct ceph_snap_context *snapc,
126 int do_sync, u32 truncate_seq,
127 u64 truncate_size,
128 struct timespec *mtime,
129 bool use_mempool, int num_reply);
130
/* take a reference on an in-flight request */
static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
{
	kref_get(&req->r_kref);
}
135extern void ceph_osdc_release_request(struct kref *kref);
/* drop a reference; frees the request when the last ref is put */
static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
{
	kref_put(&req->r_kref, ceph_osdc_release_request);
}
140
141extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
142 struct ceph_osd_request *req,
143 bool nofail);
144extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
145 struct ceph_osd_request *req);
146extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
147
148extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
149 struct ceph_vino vino,
150 struct ceph_file_layout *layout,
151 u64 off, u64 *plen,
152 u32 truncate_seq, u64 truncate_size,
153 struct page **pages, int nr_pages);
154
155extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
156 struct ceph_vino vino,
157 struct ceph_file_layout *layout,
158 struct ceph_snap_context *sc,
159 u64 off, u64 len,
160 u32 truncate_seq, u64 truncate_size,
161 struct timespec *mtime,
162 struct page **pages, int nr_pages,
163 int flags, int do_sync, bool nofail);
164
165#endif
166
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..b83f2692b835
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1019 @@
1
2#include <asm/div64.h>
3
4#include "super.h"
5#include "osdmap.h"
6#include "crush/hash.h"
7#include "crush/mapper.h"
8#include "decode.h"
9#include "ceph_debug.h"
10
11char *ceph_osdmap_state_str(char *str, int len, int state)
12{
13 int flag = 0;
14
15 if (!len)
16 goto done;
17
18 *str = '\0';
19 if (state) {
20 if (state & CEPH_OSD_EXISTS) {
21 snprintf(str, len, "exists");
22 flag = 1;
23 }
24 if (state & CEPH_OSD_UP) {
25 snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
26 "up");
27 flag = 1;
28 }
29 } else {
30 snprintf(str, len, "doesn't exist");
31 }
32done:
33 return str;
34}
35
36/* maps */
37
/*
 * Number of significant bits in t, i.e. the (1-based) position of the
 * highest set bit; 0 for t == 0.
 */
static int calc_bits_of(unsigned t)
{
	int bits;

	for (bits = 0; t; bits++)
		t >>= 1;
	return bits;
}
47
48/*
49 * the foo_mask is the smallest value 2^n-1 that is >= foo.
50 */
/*
 * the foo_mask is the smallest value 2^n-1 that is >= foo.
 *
 * Precompute the bit masks used for stable_mod placement math from the
 * pool's (little-endian, on-wire) pg counts.
 */
static void calc_pg_masks(struct ceph_pg_pool_info *pi)
{
	pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
	pi->pgp_num_mask =
		(1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
	pi->lpg_num_mask =
		(1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
	pi->lpgp_num_mask =
		(1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
}
61
62/*
63 * decode crush map
64 */
/*
 * decode crush map
 */
/*
 * Decode the type-specific payload of a uniform bucket: a single
 * per-item weight.  Advances *p; returns 0 or -EINVAL on short input.
 * NOTE(review): the need check reserves (1+size)*4 bytes but only one
 * u32 is consumed here — presumably the caller accounts for the rest;
 * confirm against the on-wire format.
 */
static int crush_decode_uniform_bucket(void **p, void *end,
				       struct crush_bucket_uniform *b)
{
	dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
	ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
	b->item_weight = ceph_decode_32(p);
	return 0;
bad:
	return -EINVAL;
}
75
/*
 * Decode a list bucket: per-item weights and running sums.
 * Partially-filled allocations are freed later by crush_destroy()
 * when the caller bails out on error.
 */
static int crush_decode_list_bucket(void **p, void *end,
				    struct crush_bucket_list *b)
{
	int j;
	dout("crush_decode_list_bucket %p to %p\n", *p, end);
	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->item_weights == NULL)
		return -ENOMEM;
	b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->sum_weights == NULL)
		return -ENOMEM;
	/* two u32s (weight, sum) per item */
	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
	for (j = 0; j < b->h.size; j++) {
		b->item_weights[j] = ceph_decode_32(p);
		b->sum_weights[j] = ceph_decode_32(p);
	}
	return 0;
bad:
	return -EINVAL;
}
96
/*
 * Decode a tree bucket: a node count followed by that many node
 * weights.  kcalloc handles the num_nodes * sizeof(u32) overflow check.
 */
static int crush_decode_tree_bucket(void **p, void *end,
				    struct crush_bucket_tree *b)
{
	int j;
	dout("crush_decode_tree_bucket %p to %p\n", *p, end);
	ceph_decode_32_safe(p, end, b->num_nodes, bad);
	b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
	if (b->node_weights == NULL)
		return -ENOMEM;
	ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
	for (j = 0; j < b->num_nodes; j++)
		b->node_weights[j] = ceph_decode_32(p);
	return 0;
bad:
	return -EINVAL;
}
113
/*
 * Decode a straw bucket: per-item weights and straw values.
 * Partially-filled allocations are freed by crush_destroy() on the
 * caller's error path.
 */
static int crush_decode_straw_bucket(void **p, void *end,
				     struct crush_bucket_straw *b)
{
	int j;
	dout("crush_decode_straw_bucket %p to %p\n", *p, end);
	b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->item_weights == NULL)
		return -ENOMEM;
	b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
	if (b->straws == NULL)
		return -ENOMEM;
	/* two u32s (weight, straw) per item */
	ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
	for (j = 0; j < b->h.size; j++) {
		b->item_weights[j] = ceph_decode_32(p);
		b->straws[j] = ceph_decode_32(p);
	}
	return 0;
bad:
	return -EINVAL;
}
134
/*
 * Decode a full crush map from the buffer [pbyval, end).
 *
 * Returns a newly allocated crush_map, or ERR_PTR(-EINVAL/-ENOMEM).
 * On any failure the partially built map is torn down with
 * crush_destroy() before returning.
 */
static struct crush_map *crush_decode(void *pbyval, void *end)
{
	struct crush_map *c;
	int err = -EINVAL;
	int i, j;
	void **p = &pbyval;	/* decode cursor, advanced by ceph_decode_* */
	void *start = pbyval;
	u32 magic;

	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));

	c = kzalloc(sizeof(*c), GFP_NOFS);
	if (c == NULL)
		return ERR_PTR(-ENOMEM);

	/* header: magic plus the three table sizes */
	ceph_decode_need(p, end, 4*sizeof(u32), bad);
	magic = ceph_decode_32(p);
	if (magic != CRUSH_MAGIC) {
		pr_err("crush_decode magic %x != current %x\n",
		       (unsigned)magic, (unsigned)CRUSH_MAGIC);
		goto bad;
	}
	c->max_buckets = ceph_decode_32(p);
	c->max_rules = ceph_decode_32(p);
	c->max_devices = ceph_decode_32(p);

	c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
	if (c->device_parents == NULL)
		goto badmem;
	c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
	if (c->bucket_parents == NULL)
		goto badmem;

	c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
	if (c->buckets == NULL)
		goto badmem;
	c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
	if (c->rules == NULL)
		goto badmem;

	/* buckets */
	for (i = 0; i < c->max_buckets; i++) {
		int size = 0;
		u32 alg;
		struct crush_bucket *b;

		/* alg == 0 marks an absent bucket slot */
		ceph_decode_32_safe(p, end, alg, bad);
		if (alg == 0) {
			c->buckets[i] = NULL;
			continue;
		}
		dout("crush_decode bucket %d off %x %p to %p\n",
		     i, (int)(*p-start), *p, end);

		/* pick the concrete bucket struct size for this alg */
		switch (alg) {
		case CRUSH_BUCKET_UNIFORM:
			size = sizeof(struct crush_bucket_uniform);
			break;
		case CRUSH_BUCKET_LIST:
			size = sizeof(struct crush_bucket_list);
			break;
		case CRUSH_BUCKET_TREE:
			size = sizeof(struct crush_bucket_tree);
			break;
		case CRUSH_BUCKET_STRAW:
			size = sizeof(struct crush_bucket_straw);
			break;
		default:
			err = -EINVAL;
			goto bad;
		}
		BUG_ON(size == 0);
		b = c->buckets[i] = kzalloc(size, GFP_NOFS);
		if (b == NULL)
			goto badmem;

		/* common bucket header */
		ceph_decode_need(p, end, 4*sizeof(u32), bad);
		b->id = ceph_decode_32(p);
		b->type = ceph_decode_16(p);
		b->alg = ceph_decode_8(p);
		b->hash = ceph_decode_8(p);
		b->weight = ceph_decode_32(p);
		b->size = ceph_decode_32(p);

		dout("crush_decode bucket size %d off %x %p to %p\n",
		     b->size, (int)(*p-start), *p, end);

		b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
		if (b->items == NULL)
			goto badmem;
		b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
		if (b->perm == NULL)
			goto badmem;
		b->perm_n = 0;

		ceph_decode_need(p, end, b->size*sizeof(u32), bad);
		for (j = 0; j < b->size; j++)
			b->items[j] = ceph_decode_32(p);

		/* alg-specific payload */
		switch (b->alg) {
		case CRUSH_BUCKET_UNIFORM:
			err = crush_decode_uniform_bucket(p, end,
				  (struct crush_bucket_uniform *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_LIST:
			err = crush_decode_list_bucket(p, end,
			       (struct crush_bucket_list *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_TREE:
			err = crush_decode_tree_bucket(p, end,
				(struct crush_bucket_tree *)b);
			if (err < 0)
				goto bad;
			break;
		case CRUSH_BUCKET_STRAW:
			err = crush_decode_straw_bucket(p, end,
				 (struct crush_bucket_straw *)b);
			if (err < 0)
				goto bad;
			break;
		}
	}

	/* rules */
	dout("rule vec is %p\n", c->rules);
	for (i = 0; i < c->max_rules; i++) {
		u32 yes;
		struct crush_rule *r;

		/* presence flag: 0 means no rule in this slot */
		ceph_decode_32_safe(p, end, yes, bad);
		if (!yes) {
			dout("crush_decode NO rule %d off %x %p to %p\n",
			     i, (int)(*p-start), *p, end);
			c->rules[i] = NULL;
			continue;
		}

		dout("crush_decode rule %d off %x %p to %p\n",
		     i, (int)(*p-start), *p, end);

		/* len */
		ceph_decode_32_safe(p, end, yes, bad);
#if BITS_PER_LONG == 32
		/* guard the size computation below against overflow */
		err = -EINVAL;
		if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
			goto bad;
#endif
		r = c->rules[i] = kmalloc(sizeof(*r) +
					  yes*sizeof(struct crush_rule_step),
					  GFP_NOFS);
		if (r == NULL)
			goto badmem;
		dout(" rule %d is at %p\n", i, r);
		r->len = yes;
		ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
		ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
		for (j = 0; j < r->len; j++) {
			r->steps[j].op = ceph_decode_32(p);
			r->steps[j].arg1 = ceph_decode_32(p);
			r->steps[j].arg2 = ceph_decode_32(p);
		}
	}

	/* ignore trailing name maps. */

	dout("crush_decode success\n");
	return c;

badmem:
	err = -ENOMEM;
bad:
	dout("crush_decode fail %d\n", err);
	crush_destroy(c);
	return ERR_PTR(err);
}
314
315
316/*
317 * osd map
318 */
/*
 * osd map
 */
/*
 * Free an osdmap and everything hanging off it: the crush map, the
 * pg_temp and pg_pools rbtrees, and the per-osd arrays.
 */
void ceph_osdmap_destroy(struct ceph_osdmap *map)
{
	dout("osdmap_destroy %p\n", map);
	if (map->crush)
		crush_destroy(map->crush);
	/* drain the pg_temp tree, freeing each mapping */
	while (!RB_EMPTY_ROOT(&map->pg_temp)) {
		struct ceph_pg_mapping *pg =
			rb_entry(rb_first(&map->pg_temp),
				 struct ceph_pg_mapping, node);
		rb_erase(&pg->node, &map->pg_temp);
		kfree(pg);
	}
	/* drain the pool-info tree */
	while (!RB_EMPTY_ROOT(&map->pg_pools)) {
		struct ceph_pg_pool_info *pi =
			rb_entry(rb_first(&map->pg_pools),
				 struct ceph_pg_pool_info, node);
		rb_erase(&pi->node, &map->pg_pools);
		kfree(pi);
	}
	kfree(map->osd_state);
	kfree(map->osd_weight);
	kfree(map->osd_addr);
	kfree(map);
}
343
344/*
345 * adjust max osd value. reallocate arrays.
346 */
/*
 * adjust max osd value. reallocate arrays.
 *
 * Allocates the three per-osd arrays at the new size, copies the old
 * contents over (if any), and swaps them in.  All-or-nothing: on
 * allocation failure the map is left untouched and -ENOMEM returned.
 * NOTE(review): assumes max >= map->max_osd — a shrink would overrun
 * the new arrays in the memcpys below; confirm callers never shrink.
 */
static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
{
	u8 *state;
	struct ceph_entity_addr *addr;
	u32 *weight;

	state = kcalloc(max, sizeof(*state), GFP_NOFS);
	addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
	weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
	if (state == NULL || addr == NULL || weight == NULL) {
		kfree(state);
		kfree(addr);
		kfree(weight);
		return -ENOMEM;
	}

	/* copy old? */
	if (map->osd_state) {
		memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
		memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
		memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
		kfree(map->osd_state);
		kfree(map->osd_addr);
		kfree(map->osd_weight);
	}

	map->osd_state = state;
	map->osd_weight = weight;
	map->osd_addr = addr;
	map->max_osd = max;
	return 0;
}
379
380/*
381 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
382 * to a set of osds)
383 */
/*
 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
 * to a set of osds)
 */
/*
 * Total order on pgids: compare the raw 64-bit representations.
 * NOTE(review): this type-puns struct ceph_pg through a u64 cast —
 * assumes the struct is exactly 8 bytes with no padding; confirm it
 * stays packed if fields are ever added.
 */
static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
{
	u64 a = *(u64 *)&l;
	u64 b = *(u64 *)&r;

	if (a < b)
		return -1;
	if (a > b)
		return 1;
	return 0;
}
395
396static int __insert_pg_mapping(struct ceph_pg_mapping *new,
397 struct rb_root *root)
398{
399 struct rb_node **p = &root->rb_node;
400 struct rb_node *parent = NULL;
401 struct ceph_pg_mapping *pg = NULL;
402 int c;
403
404 while (*p) {
405 parent = *p;
406 pg = rb_entry(parent, struct ceph_pg_mapping, node);
407 c = pgid_cmp(new->pgid, pg->pgid);
408 if (c < 0)
409 p = &(*p)->rb_left;
410 else if (c > 0)
411 p = &(*p)->rb_right;
412 else
413 return -EEXIST;
414 }
415
416 rb_link_node(&new->node, parent, p);
417 rb_insert_color(&new->node, root);
418 return 0;
419}
420
421static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
422 struct ceph_pg pgid)
423{
424 struct rb_node *n = root->rb_node;
425 struct ceph_pg_mapping *pg;
426 int c;
427
428 while (n) {
429 pg = rb_entry(n, struct ceph_pg_mapping, node);
430 c = pgid_cmp(pgid, pg->pgid);
431 if (c < 0)
432 n = n->rb_left;
433 else if (c > 0)
434 n = n->rb_right;
435 else
436 return pg;
437 }
438 return NULL;
439}
440
441/*
442 * rbtree of pg pool info
443 */
444static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
445{
446 struct rb_node **p = &root->rb_node;
447 struct rb_node *parent = NULL;
448 struct ceph_pg_pool_info *pi = NULL;
449
450 while (*p) {
451 parent = *p;
452 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
453 if (new->id < pi->id)
454 p = &(*p)->rb_left;
455 else if (new->id > pi->id)
456 p = &(*p)->rb_right;
457 else
458 return -EEXIST;
459 }
460
461 rb_link_node(&new->node, parent, p);
462 rb_insert_color(&new->node, root);
463 return 0;
464}
465
466static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
467{
468 struct ceph_pg_pool_info *pi;
469 struct rb_node *n = root->rb_node;
470
471 while (n) {
472 pi = rb_entry(n, struct ceph_pg_pool_info, node);
473 if (id < pi->id)
474 n = n->rb_left;
475 else if (id > pi->id)
476 n = n->rb_right;
477 else
478 return pi;
479 }
480 return NULL;
481}
482
483/*
484 * decode a full map.
485 */
486struct ceph_osdmap *osdmap_decode(void **p, void *end)
487{
488 struct ceph_osdmap *map;
489 u16 version;
490 u32 len, max, i;
491 u8 ev;
492 int err = -EINVAL;
493 void *start = *p;
494 struct ceph_pg_pool_info *pi;
495
496 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
497
498 map = kzalloc(sizeof(*map), GFP_NOFS);
499 if (map == NULL)
500 return ERR_PTR(-ENOMEM);
501 map->pg_temp = RB_ROOT;
502
503 ceph_decode_16_safe(p, end, version, bad);
504 if (version > CEPH_OSDMAP_VERSION) {
505 pr_warning("got unknown v %d > %d of osdmap\n", version,
506 CEPH_OSDMAP_VERSION);
507 goto bad;
508 }
509
510 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
511 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
512 map->epoch = ceph_decode_32(p);
513 ceph_decode_copy(p, &map->created, sizeof(map->created));
514 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
515
516 ceph_decode_32_safe(p, end, max, bad);
517 while (max--) {
518 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
519 pi = kmalloc(sizeof(*pi), GFP_NOFS);
520 if (!pi)
521 goto bad;
522 pi->id = ceph_decode_32(p);
523 ev = ceph_decode_8(p); /* encoding version */
524 if (ev > CEPH_PG_POOL_VERSION) {
525 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
526 ev, CEPH_PG_POOL_VERSION);
527 goto bad;
528 }
529 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
530 __insert_pg_pool(&map->pg_pools, pi);
531 calc_pg_masks(pi);
532 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
533 *p += le32_to_cpu(pi->v.num_removed_snap_intervals)
534 * sizeof(u64) * 2;
535 }
536 ceph_decode_32_safe(p, end, map->pool_max, bad);
537
538 ceph_decode_32_safe(p, end, map->flags, bad);
539
540 max = ceph_decode_32(p);
541
542 /* (re)alloc osd arrays */
543 err = osdmap_set_max_osd(map, max);
544 if (err < 0)
545 goto bad;
546 dout("osdmap_decode max_osd = %d\n", map->max_osd);
547
548 /* osds */
549 err = -EINVAL;
550 ceph_decode_need(p, end, 3*sizeof(u32) +
551 map->max_osd*(1 + sizeof(*map->osd_weight) +
552 sizeof(*map->osd_addr)), bad);
553 *p += 4; /* skip length field (should match max) */
554 ceph_decode_copy(p, map->osd_state, map->max_osd);
555
556 *p += 4; /* skip length field (should match max) */
557 for (i = 0; i < map->max_osd; i++)
558 map->osd_weight[i] = ceph_decode_32(p);
559
560 *p += 4; /* skip length field (should match max) */
561 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
562 for (i = 0; i < map->max_osd; i++)
563 ceph_decode_addr(&map->osd_addr[i]);
564
565 /* pg_temp */
566 ceph_decode_32_safe(p, end, len, bad);
567 for (i = 0; i < len; i++) {
568 int n, j;
569 struct ceph_pg pgid;
570 struct ceph_pg_mapping *pg;
571
572 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
573 ceph_decode_copy(p, &pgid, sizeof(pgid));
574 n = ceph_decode_32(p);
575 ceph_decode_need(p, end, n * sizeof(u32), bad);
576 err = -ENOMEM;
577 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
578 if (!pg)
579 goto bad;
580 pg->pgid = pgid;
581 pg->len = n;
582 for (j = 0; j < n; j++)
583 pg->osds[j] = ceph_decode_32(p);
584
585 err = __insert_pg_mapping(pg, &map->pg_temp);
586 if (err)
587 goto bad;
588 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
589 }
590
591 /* crush */
592 ceph_decode_32_safe(p, end, len, bad);
593 dout("osdmap_decode crush len %d from off 0x%x\n", len,
594 (int)(*p - start));
595 ceph_decode_need(p, end, len, bad);
596 map->crush = crush_decode(*p, end);
597 *p += len;
598 if (IS_ERR(map->crush)) {
599 err = PTR_ERR(map->crush);
600 map->crush = NULL;
601 goto bad;
602 }
603
604 /* ignore the rest of the map */
605 *p = end;
606
607 dout("osdmap_decode done %p %p\n", *p, end);
608 return map;
609
610bad:
611 dout("osdmap_decode fail\n");
612 ceph_osdmap_destroy(map);
613 return ERR_PTR(err);
614}
615
616/*
617 * decode and apply an incremental map update.
618 */
619struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
620 struct ceph_osdmap *map,
621 struct ceph_messenger *msgr)
622{
623 struct crush_map *newcrush = NULL;
624 struct ceph_fsid fsid;
625 u32 epoch = 0;
626 struct ceph_timespec modified;
627 u32 len, pool;
628 __s32 new_pool_max, new_flags, max;
629 void *start = *p;
630 int err = -EINVAL;
631 u16 version;
632 struct rb_node *rbp;
633
634 ceph_decode_16_safe(p, end, version, bad);
635 if (version > CEPH_OSDMAP_INC_VERSION) {
636 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
637 CEPH_OSDMAP_INC_VERSION);
638 goto bad;
639 }
640
641 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
642 bad);
643 ceph_decode_copy(p, &fsid, sizeof(fsid));
644 epoch = ceph_decode_32(p);
645 BUG_ON(epoch != map->epoch+1);
646 ceph_decode_copy(p, &modified, sizeof(modified));
647 new_pool_max = ceph_decode_32(p);
648 new_flags = ceph_decode_32(p);
649
650 /* full map? */
651 ceph_decode_32_safe(p, end, len, bad);
652 if (len > 0) {
653 dout("apply_incremental full map len %d, %p to %p\n",
654 len, *p, end);
655 return osdmap_decode(p, min(*p+len, end));
656 }
657
658 /* new crush? */
659 ceph_decode_32_safe(p, end, len, bad);
660 if (len > 0) {
661 dout("apply_incremental new crush map len %d, %p to %p\n",
662 len, *p, end);
663 newcrush = crush_decode(*p, min(*p+len, end));
664 if (IS_ERR(newcrush))
665 return ERR_PTR(PTR_ERR(newcrush));
666 }
667
668 /* new flags? */
669 if (new_flags >= 0)
670 map->flags = new_flags;
671 if (new_pool_max >= 0)
672 map->pool_max = new_pool_max;
673
674 ceph_decode_need(p, end, 5*sizeof(u32), bad);
675
676 /* new max? */
677 max = ceph_decode_32(p);
678 if (max >= 0) {
679 err = osdmap_set_max_osd(map, max);
680 if (err < 0)
681 goto bad;
682 }
683
684 map->epoch++;
685 map->modified = map->modified;
686 if (newcrush) {
687 if (map->crush)
688 crush_destroy(map->crush);
689 map->crush = newcrush;
690 newcrush = NULL;
691 }
692
693 /* new_pool */
694 ceph_decode_32_safe(p, end, len, bad);
695 while (len--) {
696 __u8 ev;
697 struct ceph_pg_pool_info *pi;
698
699 ceph_decode_32_safe(p, end, pool, bad);
700 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
701 ev = ceph_decode_8(p); /* encoding version */
702 if (ev > CEPH_PG_POOL_VERSION) {
703 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
704 ev, CEPH_PG_POOL_VERSION);
705 goto bad;
706 }
707 pi = __lookup_pg_pool(&map->pg_pools, pool);
708 if (!pi) {
709 pi = kmalloc(sizeof(*pi), GFP_NOFS);
710 if (!pi) {
711 err = -ENOMEM;
712 goto bad;
713 }
714 pi->id = pool;
715 __insert_pg_pool(&map->pg_pools, pi);
716 }
717 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
718 calc_pg_masks(pi);
719 }
720
721 /* old_pool */
722 ceph_decode_32_safe(p, end, len, bad);
723 while (len--) {
724 struct ceph_pg_pool_info *pi;
725
726 ceph_decode_32_safe(p, end, pool, bad);
727 pi = __lookup_pg_pool(&map->pg_pools, pool);
728 if (pi) {
729 rb_erase(&pi->node, &map->pg_pools);
730 kfree(pi);
731 }
732 }
733
734 /* new_up */
735 err = -EINVAL;
736 ceph_decode_32_safe(p, end, len, bad);
737 while (len--) {
738 u32 osd;
739 struct ceph_entity_addr addr;
740 ceph_decode_32_safe(p, end, osd, bad);
741 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
742 ceph_decode_addr(&addr);
743 pr_info("osd%d up\n", osd);
744 BUG_ON(osd >= map->max_osd);
745 map->osd_state[osd] |= CEPH_OSD_UP;
746 map->osd_addr[osd] = addr;
747 }
748
749 /* new_down */
750 ceph_decode_32_safe(p, end, len, bad);
751 while (len--) {
752 u32 osd;
753 ceph_decode_32_safe(p, end, osd, bad);
754 (*p)++; /* clean flag */
755 pr_info("osd%d down\n", osd);
756 if (osd < map->max_osd)
757 map->osd_state[osd] &= ~CEPH_OSD_UP;
758 }
759
760 /* new_weight */
761 ceph_decode_32_safe(p, end, len, bad);
762 while (len--) {
763 u32 osd, off;
764 ceph_decode_need(p, end, sizeof(u32)*2, bad);
765 osd = ceph_decode_32(p);
766 off = ceph_decode_32(p);
767 pr_info("osd%d weight 0x%x %s\n", osd, off,
768 off == CEPH_OSD_IN ? "(in)" :
769 (off == CEPH_OSD_OUT ? "(out)" : ""));
770 if (osd < map->max_osd)
771 map->osd_weight[osd] = off;
772 }
773
774 /* new_pg_temp */
775 rbp = rb_first(&map->pg_temp);
776 ceph_decode_32_safe(p, end, len, bad);
777 while (len--) {
778 struct ceph_pg_mapping *pg;
779 int j;
780 struct ceph_pg pgid;
781 u32 pglen;
782 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
783 ceph_decode_copy(p, &pgid, sizeof(pgid));
784 pglen = ceph_decode_32(p);
785
786 /* remove any? */
787 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
788 node)->pgid, pgid) <= 0) {
789 struct rb_node *cur = rbp;
790 rbp = rb_next(rbp);
791 dout(" removed pg_temp %llx\n",
792 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
793 node)->pgid);
794 rb_erase(cur, &map->pg_temp);
795 }
796
797 if (pglen) {
798 /* insert */
799 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
800 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
801 if (!pg) {
802 err = -ENOMEM;
803 goto bad;
804 }
805 pg->pgid = pgid;
806 pg->len = pglen;
807 for (j = 0; j < pglen; j++)
808 pg->osds[j] = ceph_decode_32(p);
809 err = __insert_pg_mapping(pg, &map->pg_temp);
810 if (err)
811 goto bad;
812 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
813 pglen);
814 }
815 }
816 while (rbp) {
817 struct rb_node *cur = rbp;
818 rbp = rb_next(rbp);
819 dout(" removed pg_temp %llx\n",
820 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
821 node)->pgid);
822 rb_erase(cur, &map->pg_temp);
823 }
824
825 /* ignore the rest */
826 *p = end;
827 return map;
828
829bad:
830 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
831 epoch, (int)(*p - start), *p, start, end);
832 print_hex_dump(KERN_DEBUG, "osdmap: ",
833 DUMP_PREFIX_OFFSET, 16, 1,
834 start, end - start, true);
835 if (newcrush)
836 crush_destroy(newcrush);
837 return ERR_PTR(err);
838}
839
840
841
842
843/*
844 * calculate file layout from given offset, length.
845 * fill in correct oid, logical length, and object extent
846 * offset, length.
847 *
848 * for now, we write only a single su, until we can
849 * pass a stride back to the caller.
850 */
/*
 * calculate file layout from given offset, length.
 * fill in correct oid, logical length, and object extent
 * offset, length.
 *
 * for now, we write only a single su, until we can
 * pass a stride back to the caller.
 */
void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
				   u64 off, u64 *plen,
				   u64 *ono,
				   u64 *oxoff, u64 *oxlen)
{
	u32 osize = le32_to_cpu(layout->fl_object_size);
	u32 su = le32_to_cpu(layout->fl_stripe_unit);
	u32 sc = le32_to_cpu(layout->fl_stripe_count);
	u32 bl, stripeno, stripepos, objsetno;
	u32 su_per_object;
	u64 t, su_offset;

	dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
	     osize, su);
	su_per_object = osize / su;
	dout("osize %u / su %u = su_per_object %u\n", osize, su,
	     su_per_object);

	BUG_ON((su & ~PAGE_MASK) != 0);
	/* bl = *off / su; */
	/* do_div is 64/32 division: quotient left in t, so compute on a
	 * copy of off */
	t = off;
	do_div(t, su);
	bl = t;
	dout("off %llu / su %u = bl %u\n", off, su, bl);

	/* which stripe, and position within it, does this block fall in? */
	stripeno = bl / sc;
	stripepos = bl % sc;
	objsetno = stripeno / su_per_object;

	*ono = objsetno * sc + stripepos;
	dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);

	/* *oxoff = *off % layout->fl_stripe_unit;  # offset in su */
	t = off;
	su_offset = do_div(t, su);	/* do_div returns the remainder */
	*oxoff = su_offset + (stripeno % su_per_object) * su;

	/*
	 * Calculate the length of the extent being written to the selected
	 * object. This is the minimum of the full length requested (plen) or
	 * the remainder of the current stripe being written to.
	 */
	*oxlen = min_t(u64, *plen, su - su_offset);
	*plen = *oxlen;

	dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
}
898
899/*
900 * calculate an object layout (i.e. pgid) from an oid,
901 * file_layout, and osdmap
902 */
/*
 * calculate an object layout (i.e. pgid) from an oid,
 * file_layout, and osdmap
 *
 * Returns 0 on success, -EIO if the layout references an unknown pool.
 */
int ceph_calc_object_layout(struct ceph_object_layout *ol,
			    const char *oid,
			    struct ceph_file_layout *fl,
			    struct ceph_osdmap *osdmap)
{
	unsigned num, num_mask;
	struct ceph_pg pgid;
	s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
	int poolid = le32_to_cpu(fl->fl_pg_pool);
	struct ceph_pg_pool_info *pool;
	unsigned ps;

	BUG_ON(!osdmap);

	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
	if (!pool)
		return -EIO;
	/* hash the object name to a placement seed */
	ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
	if (preferred >= 0) {
		/* localized pg: bias seed and use the lpg counts */
		ps += preferred;
		num = le32_to_cpu(pool->v.lpg_num);
		num_mask = pool->lpg_num_mask;
	} else {
		num = le32_to_cpu(pool->v.pg_num);
		num_mask = pool->pg_num_mask;
	}

	pgid.ps = cpu_to_le16(ps);
	pgid.preferred = cpu_to_le16(preferred);
	pgid.pool = fl->fl_pg_pool;
	if (preferred >= 0)
		dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
		     (int)preferred);
	else
		dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);

	ol->ol_pgid = pgid;
	ol->ol_stripe_unit = fl->fl_object_stripe_unit;
	return 0;
}
943
944/*
945 * Calculate raw osd vector for the given pgid. Return pointer to osd
946 * array, or NULL on failure.
947 */
/*
 * Calculate raw osd vector for the given pgid.  Return pointer to osd
 * array, or NULL on failure.
 */
static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
			int *osds, int *num)
{
	struct ceph_pg_mapping *pg;
	struct ceph_pg_pool_info *pool;
	int ruleno;
	unsigned poolid, ps, pps;
	int preferred;

	/* pg_temp? an explicit mapping overrides crush */
	pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
	if (pg) {
		*num = pg->len;
		return pg->osds;
	}

	/* crush */
	poolid = le32_to_cpu(pgid.pool);
	ps = le16_to_cpu(pgid.ps);
	preferred = (s16)le16_to_cpu(pgid.preferred);

	/* don't forcefeed bad device ids to crush */
	/* NOTE(review): dereferences osdmap->crush before any NULL check —
	 * presumably a decoded map always has a crush map; confirm. */
	if (preferred >= osdmap->max_osd ||
	    preferred >= osdmap->crush->max_devices)
		preferred = -1;

	pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
	if (!pool)
		return NULL;
	ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
				 pool->v.type, pool->v.size);
	if (ruleno < 0) {
		pr_err("no crush rule pool %d type %d size %d\n",
		       poolid, pool->v.type, pool->v.size);
		return NULL;
	}

	/* fold the seed with the appropriate (local) pgp count */
	if (preferred >= 0)
		pps = ceph_stable_mod(ps,
				      le32_to_cpu(pool->v.lpgp_num),
				      pool->lpgp_num_mask);
	else
		pps = ceph_stable_mod(ps,
				      le32_to_cpu(pool->v.pgp_num),
				      pool->pgp_num_mask);
	pps += poolid;
	*num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
			     min_t(int, pool->v.size, *num),
			     preferred, osdmap->osd_weight);
	return osds;
}
999
1000/*
1001 * Return primary osd for given pgid, or -1 if none.
1002 */
1003int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1004{
1005 int rawosds[10], *osds;
1006 int i, num = ARRAY_SIZE(rawosds);
1007
1008 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1009 if (!osds)
1010 return -1;
1011
1012 /* primary is first up osd */
1013 for (i = 0; i < num; i++)
1014 if (ceph_osd_is_up(osdmap, osds[i])) {
1015 return osds[i];
1016 break;
1017 }
1018 return -1;
1019}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..1fb55afb2642
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,125 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
/*
 * In-memory wrapper for one pool: keyed by @id in the osdmap's
 * pg_pools rbtree, holding the on-wire pool description plus
 * precomputed ceph_stable_mod() masks for each pg count.
 */
struct ceph_pg_pool_info {
	struct rb_node node;	/* node in osdmap->pg_pools, keyed by id */
	int id;
	struct ceph_pg_pool v;	/* on-wire pool description */
	/* (2^n)-1 masks matching v.pg_num / pgp_num / lpg_num / lpgp_num */
	int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
};
27
/*
 * Explicit pg -> osd-list remapping (a pg_temp entry), keyed by pgid
 * in the osdmap's pg_temp rbtree.
 */
struct ceph_pg_mapping {
	struct rb_node node;
	struct ceph_pg pgid;
	int len;	/* number of valid entries in osds[] */
	int osds[];	/* flexible array of osd ids */
};
34
struct ceph_osdmap {
	struct ceph_fsid fsid;
	u32 epoch;		/* map version */
	u32 mkfs_epoch;
	struct ceph_timespec created, modified;

	u32 flags;         /* CEPH_OSDMAP_* */

	u32 max_osd;       /* size of osd_state, _offload, _addr arrays */
	u8 *osd_state;     /* CEPH_OSD_* */
	u32 *osd_weight;   /* 0 = failed, 0x10000 = 100% normal */
	struct ceph_entity_addr *osd_addr;

	struct rb_root pg_temp;		/* explicit remappings (ceph_pg_mapping) */
	struct rb_root pg_pools;	/* pools by id (ceph_pg_pool_info) */
	u32 pool_max;			/* largest pool id — TODO confirm semantics */

	/* the CRUSH map specifies the mapping of placement groups to
	 * the list of osds that store+replicate them. */
	struct crush_map *crush;
};
56
/*
 * file layout helpers: decode the little-endian fields of a
 * struct ceph_file_layout (taken by value) into host-order __s32.
 */
#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
#define ceph_file_layout_stripe_count(l) \
	((__s32)le32_to_cpu((l).fl_stripe_count))
#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
#define ceph_file_layout_object_su(l) \
	((__s32)le32_to_cpu((l).fl_object_stripe_unit))
#define ceph_file_layout_pg_preferred(l) \
	((__s32)le32_to_cpu((l).fl_pg_preferred))
#define ceph_file_layout_pg_pool(l) \
	((__s32)le32_to_cpu((l).fl_pg_pool))
71
72static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
73{
74 return le32_to_cpu(l->fl_stripe_unit) *
75 le32_to_cpu(l->fl_stripe_count);
76}
77
78/* "period" == bytes before i start on a new set of objects */
79static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
80{
81 return le32_to_cpu(l->fl_object_size) *
82 le32_to_cpu(l->fl_stripe_count);
83}
84
85
86static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
87{
88 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
89}
90
91static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
92{
93 return map && (map->flags & flag);
94}
95
96extern char *ceph_osdmap_state_str(char *str, int len, int state);
97
98static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
99 int osd)
100{
101 if (osd >= map->max_osd)
102 return NULL;
103 return &map->osd_addr[osd];
104}
105
106extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
107extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
108 struct ceph_osdmap *map,
109 struct ceph_messenger *msgr);
110extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
111
112/* calculate mapping of a file extent to an object */
113extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
114 u64 off, u64 *plen,
115 u64 *bno, u64 *oxoff, u64 *oxlen);
116
117/* calculate mapping of object to a placement group */
118extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
119 const char *oid,
120 struct ceph_file_layout *fl,
121 struct ceph_osdmap *osdmap);
122extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
123 struct ceph_pg pgid);
124
125#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..370e93695474
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,54 @@
1
2#include <linux/pagemap.h>
3#include <linux/highmem.h>
4
5#include "pagelist.h"
6
7int ceph_pagelist_release(struct ceph_pagelist *pl)
8{
9 if (pl->mapped_tail)
10 kunmap(pl->mapped_tail);
11 while (!list_empty(&pl->head)) {
12 struct page *page = list_first_entry(&pl->head, struct page,
13 lru);
14 list_del(&page->lru);
15 __free_page(page);
16 }
17 return 0;
18}
19
20static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
21{
22 struct page *page = alloc_page(GFP_NOFS);
23 if (!page)
24 return -ENOMEM;
25 pl->room += PAGE_SIZE;
26 list_add_tail(&page->lru, &pl->head);
27 if (pl->mapped_tail)
28 kunmap(pl->mapped_tail);
29 pl->mapped_tail = kmap(page);
30 return 0;
31}
32
/*
 * Append @len bytes from @buf to the tail of the pagelist, growing it
 * one page at a time.
 *
 * Invariant: pl->room is exactly the free space left in the current
 * (kmapped) tail page, and (pl->length & ~PAGE_CACHE_MASK) is the
 * write offset within that page — so each loop iteration fills the
 * tail page to its end before addpage() maps a fresh one.
 *
 * Returns 0 on success or -ENOMEM if a page allocation fails; on
 * failure the bytes already copied remain appended.
 */
int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
{
	while (pl->room < len) {
		size_t bit = pl->room;	/* exactly fills the tail page */
		int ret;

		memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
		       buf, bit);
		pl->length += bit;
		pl->room -= bit;
		buf += bit;
		len -= bit;
		ret = ceph_pagelist_addpage(pl);
		if (ret)
			return ret;
	}

	/* remainder fits in the current tail page */
	memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
	pl->length += len;
	pl->room -= len;
	return 0;
}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
/*
 * A pagelist accumulates a byte stream across a chain of pages; the
 * last (tail) page stays kmapped so appends can write into it.
 */
struct ceph_pagelist {
	struct list_head head;	/* allocated pages, linked via page->lru */
	void *mapped_tail;	/* kmap of the tail page, or NULL if empty */
	size_t length;		/* total bytes appended so far */
	size_t room;		/* free bytes remaining in the tail page */
};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
/* append a 64-bit value in little-endian form; 0 or -ENOMEM */
static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
{
	__le64 ev = cpu_to_le64(v);
	return ceph_pagelist_append(pl, &ev, sizeof(ev));
}
/* append a 32-bit value in little-endian form; 0 or -ENOMEM */
static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
{
	__le32 ev = cpu_to_le32(v);
	return ceph_pagelist_append(pl, &ev, sizeof(ev));
}
/* append a 16-bit value in little-endian form; 0 or -ENOMEM */
static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
{
	__le16 ev = cpu_to_le16(v);
	return ceph_pagelist_append(pl, &ev, sizeof(ev));
}
/* append a single byte; 0 or -ENOMEM */
static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
{
	return ceph_pagelist_append(pl, &v, 1);
}
/* append a 32-bit-length-prefixed string; a zero @len encodes only
 * the length word (so @s may be NULL in that case) */
static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
					      char *s, size_t len)
{
	int ret = ceph_pagelist_encode_32(pl, len);
	if (ret)
		return ret;
	if (len)
		return ceph_pagelist_append(pl, s, len);
	return 0;
}
53
54#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..26ac8b89a676
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,374 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 4
15#define CEPH_OSDMAP_VERSION 4
16
/*
 * fs id: opaque 16-byte cluster identifier
 */
struct ceph_fsid {
	unsigned char fsid[16];
};

/* byte-wise comparison; returns 0 iff the two fsids are identical */
static inline int ceph_fsid_compare(const struct ceph_fsid *a,
				    const struct ceph_fsid *b)
{
	return memcmp(a->fsid, b->fsid, sizeof(a->fsid));
}
29
30/*
31 * ino, object, etc.
32 */
33typedef __le64 ceph_snapid_t;
34#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
35#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
36#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
37
/* on-wire timestamp: seconds + nanoseconds, both 32-bit little-endian */
struct ceph_timespec {
	__le32 tv_sec;
	__le32 tv_nsec;
} __attribute__ ((packed));
42
43
44/*
45 * object layout - how objects are mapped into PGs
46 */
47#define CEPH_OBJECT_LAYOUT_HASH 1
48#define CEPH_OBJECT_LAYOUT_LINEAR 2
49#define CEPH_OBJECT_LAYOUT_HASHINO 3
50
51/*
52 * pg layout -- how PGs are mapped onto (sets of) OSDs
53 */
54#define CEPH_PG_LAYOUT_CRUSH 0
55#define CEPH_PG_LAYOUT_HASH 1
56#define CEPH_PG_LAYOUT_LINEAR 2
57#define CEPH_PG_LAYOUT_HYBRID 3
58
59
/*
 * placement group id.
 * we encode this into one __le64 (the three fields total 8 bytes).
 */
struct ceph_pg {
	__le16 preferred; /* preferred primary osd (-1 = none) */
	__le16 ps;        /* placement seed */
	__le32 pool;      /* object pool */
} __attribute__ ((packed));
69
70/*
71 * pg_pool is a set of pgs storing a pool of objects
72 *
73 * pg_num -- base number of pseudorandomly placed pgs
74 *
75 * pgp_num -- effective number when calculating pg placement. this
76 * is used for pg_num increases. new pgs result in data being "split"
77 * into new pgs. for this to proceed smoothly, new pgs are intiially
78 * colocated with their parents; that is, pgp_num doesn't increase
79 * until the new pgs have successfully split. only _then_ are the new
80 * pgs placed independently.
81 *
82 * lpg_num -- localized pg count (per device). replicas are randomly
83 * selected.
84 *
85 * lpgp_num -- as above.
86 */
87#define CEPH_PG_TYPE_REP 1
88#define CEPH_PG_TYPE_RAID4 2
89#define CEPH_PG_POOL_VERSION 2
/* on-wire description of one pool; see the block comment above for
 * the pg_num/pgp_num/lpg_num/lpgp_num semantics */
struct ceph_pg_pool {
	__u8 type;                /* CEPH_PG_TYPE_* (rep or raid4) */
	__u8 size;                /* number of osds in each pg */
	__u8 crush_ruleset;       /* crush placement rule */
	__u8 object_hash;         /* hash mapping object name to ps */
	__le32 pg_num, pgp_num;   /* number of pg's */
	__le32 lpg_num, lpgp_num; /* number of localized pg's */
	__le32 last_change;       /* most recent epoch changed */
	__le64 snap_seq;          /* seq for per-pool snapshot */
	__le32 snap_epoch;        /* epoch of last snap */
	__le32 num_snaps;
	__le32 num_removed_snap_intervals;
	__le64 uid;
} __attribute__ ((packed));
104
/*
 * stable_mod: map x into [0, b) where bmask is the containing power
 * of 2 minus 1 (b <= bmask+1; e.g. b=12 -> bmask=15).  Unlike plain
 * x % b, the mapping stays stable for most inputs as b grows toward
 * bmask+1, which lets the pg count increase without reshuffling
 * everything.
 */
static inline int ceph_stable_mod(int x, int b, int bmask)
{
	int masked = x & bmask;

	return (masked < b) ? masked : (x & (bmask >> 1));
}
121
/*
 * object layout - how a given object should be stored: its raw pg
 * (see struct ceph_pg) plus an optional per-object stripe unit.
 */
struct ceph_object_layout {
	struct ceph_pg ol_pgid;   /* raw pg, with _full_ ps precision. */
	__le32 ol_stripe_unit;    /* for per-object parity, if any */
} __attribute__ ((packed));
129
/*
 * compound epoch+version, used by storage layer to serialize mutations
 * (carried in request/reply heads as reassert_version for replays).
 */
struct ceph_eversion {
	__le32 epoch;
	__le64 version;
} __attribute__ ((packed));
137
138/*
139 * osd map bits
140 */
141
142/* status bits */
143#define CEPH_OSD_EXISTS 1
144#define CEPH_OSD_UP 2
145
146/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
147#define CEPH_OSD_IN 0x10000
148#define CEPH_OSD_OUT 0
149
150
151/*
152 * osd map flag bits
153 */
154#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
155#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
156#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
157#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
158#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
159
160/*
161 * osd ops
162 */
163#define CEPH_OSD_OP_MODE 0xf000
164#define CEPH_OSD_OP_MODE_RD 0x1000
165#define CEPH_OSD_OP_MODE_WR 0x2000
166#define CEPH_OSD_OP_MODE_RMW 0x3000
167#define CEPH_OSD_OP_MODE_SUB 0x4000
168
169#define CEPH_OSD_OP_TYPE 0x0f00
170#define CEPH_OSD_OP_TYPE_LOCK 0x0100
171#define CEPH_OSD_OP_TYPE_DATA 0x0200
172#define CEPH_OSD_OP_TYPE_ATTR 0x0300
173#define CEPH_OSD_OP_TYPE_EXEC 0x0400
174#define CEPH_OSD_OP_TYPE_PG 0x0500
175
176enum {
177 /** data **/
178 /* read */
179 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
180 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
181
182 /* fancy read */
183 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
184
185 /* write */
186 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
187 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
188 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
189 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
190 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
191
192 /* fancy write */
193 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
194 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
195 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
196 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
197
198 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
199 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
200 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
201
202 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
203
204 /** attrs **/
205 /* read */
206 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
207 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
208
209 /* write */
210 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
211 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
213 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
214
215 /** subop **/
216 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
217 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
218 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
219 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
220 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
221
222 /** lock **/
223 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
224 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
225 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
226 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
227 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
228 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
229
230 /** exec **/
231 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
232
233 /** pg **/
234 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
235};
236
/* classify an op code by its CEPH_OSD_OP_TYPE_* nibble */
static inline int ceph_osd_op_type_lock(int op)
{
	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
}
static inline int ceph_osd_op_type_data(int op)
{
	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
}
static inline int ceph_osd_op_type_attr(int op)
{
	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
}
static inline int ceph_osd_op_type_exec(int op)
{
	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
}
static inline int ceph_osd_op_type_pg(int op)
{
	return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
}

/* classify an op code by its CEPH_OSD_OP_MODE_* nibble */
static inline int ceph_osd_op_mode_subop(int op)
{
	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
}
static inline int ceph_osd_op_mode_read(int op)
{
	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
}
static inline int ceph_osd_op_mode_modify(int op)
{
	return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
}
270
271#define CEPH_OSD_TMAP_HDR 'h'
272#define CEPH_OSD_TMAP_SET 's'
273#define CEPH_OSD_TMAP_RM 'r'
274
275extern const char *ceph_osd_op_name(int op);
276
277
278/*
279 * osd op flags
280 *
281 * An op may be READ, WRITE, or READ|WRITE.
282 */
283enum {
284 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
285 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
286 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
287 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
288 CEPH_OSD_FLAG_READ = 16, /* op may read */
289 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
290 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
291 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
292 CEPH_OSD_FLAG_BALANCE_READS = 256,
293 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
294 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
295 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
296};
297
298enum {
299 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
300};
301
302#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
303#define EBLACKLISTED ESHUTDOWN /* blacklisted */
304
/*
 * an individual object operation.  each may be accompanied by some data
 * payload.  which union arm applies depends on the op code (see
 * CEPH_OSD_OP_*).
 */
struct ceph_osd_op {
	__le16 op;           /* CEPH_OSD_OP_* */
	__le32 flags;        /* CEPH_OSD_FLAG_* */
	union {
		struct {     /* extent ops (read/write/truncate/zero) */
			__le64 offset, length;
			__le64 truncate_size;
			__le32 truncate_seq;
		} __attribute__ ((packed)) extent;
		struct {     /* xattr ops */
			__le32 name_len;
			__le32 value_len;
		} __attribute__ ((packed)) xattr;
		struct {     /* class method call (CEPH_OSD_OP_CALL) */
			__u8 class_len;
			__u8 method_len;
			__u8 argc;
			__le32 indata_len;
		} __attribute__ ((packed)) cls;
		struct {     /* pg listing (CEPH_OSD_OP_PGLS) */
			__le64 cookie, count;
		} __attribute__ ((packed)) pgls;
	};
	__le32 payload_len;  /* bytes of data payload for this op */
} __attribute__ ((packed));
334
/*
 * osd request message header.  each request may include multiple
 * ceph_osd_op object operations (num_ops entries in the trailing
 * flexible array).
 */
struct ceph_osd_request_head {
	__le32 client_inc;                 /* client incarnation */
	struct ceph_object_layout layout;  /* pgid */
	__le32 osdmap_epoch;               /* client's osdmap epoch */

	__le32 flags;                      /* CEPH_OSD_FLAG_* */

	struct ceph_timespec mtime;        /* for mutations only */
	struct ceph_eversion reassert_version; /* if we are replaying op */

	__le32 object_len;                 /* length of object name */

	__le64 snapid;                     /* snapid to read */
	__le64 snap_seq;                   /* writer's snap context */
	__le32 num_snaps;

	__le16 num_ops;                    /* entries in ops[] */
	struct ceph_osd_op ops[];  /* followed by ops[], obj, ticket, snaps */
} __attribute__ ((packed));
358
359struct ceph_osd_reply_head {
360 __le32 client_inc; /* client incarnation */
361 __le32 flags;
362 struct ceph_object_layout layout;
363 __le32 osdmap_epoch;
364 struct ceph_eversion reassert_version; /* for replaying uncommitted */
365
366 __le32 result; /* result code */
367
368 __le32 object_len; /* length of object name */
369 __le32 num_ops;
370 struct ceph_osd_op ops[0]; /* ops[], object */
371} __attribute__ ((packed));
372
373
374#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..bf2a5f3846a4
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,904 @@
1#include "ceph_debug.h"
2
3#include <linux/sort.h>
4
5#include "super.h"
6#include "decode.h"
7
8/*
9 * Snapshots in ceph are driven in large part by cooperation from the
10 * client. In contrast to local file systems or file servers that
11 * implement snapshots at a single point in the system, ceph's
12 * distributed access to storage requires clients to help decide
13 * whether a write logically occurs before or after a recently created
14 * snapshot.
15 *
16 * This provides a perfect instantanous client-wide snapshot. Between
17 * clients, however, snapshots may appear to be applied at slightly
18 * different points in time, depending on delays in delivering the
19 * snapshot notification.
20 *
21 * Snapshots are _not_ file system-wide. Instead, each snapshot
22 * applies to the subdirectory nested beneath some directory. This
23 * effectively divides the hierarchy into multiple "realms," where all
24 * of the files contained by each realm share the same set of
25 * snapshots. An individual realm's snap set contains snapshots
26 * explicitly created on that realm, as well as any snaps in its
27 * parent's snap set _after_ the point at which the parent became it's
28 * parent (due to, say, a rename). Similarly, snaps from prior parents
29 * during the time intervals during which they were the parent are included.
30 *
31 * The client is spared most of this detail, fortunately... it must only
32 * maintains a hierarchy of realms reflecting the current parent/child
33 * realm relationship, and for each realm has an explicit list of snaps
34 * inherited from prior parents.
35 *
36 * A snap_realm struct is maintained for realms containing every inode
37 * with an open cap in the system. (The needed snap realm information is
38 * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
39 * version number is used to ensure that as realm parameters change (new
40 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
41 *
42 * The realm hierarchy drives the generation of a 'snap context' for each
43 * realm, which simply lists the resulting set of snaps for the realm. This
44 * is attached to any writes sent to OSDs.
45 */
46/*
47 * Unfortunately error handling is a bit mixed here. If we get a snap
48 * update, but don't have enough memory to update our realm hierarchy,
49 * it's not clear what we can do about it (besides complaining to the
50 * console).
51 */
52
53
/*
 * increase ref count for the realm
 *
 * caller must hold snap_rwsem for write.
 */
void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
			 struct ceph_snap_realm *realm)
{
	dout("get_realm %p %d -> %d\n", realm,
	     atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
	/*
	 * since we _only_ increment realm refs or empty the empty
	 * list with snap_rwsem held, adjusting the empty list here is
	 * safe.  we do need to protect against concurrent empty list
	 * additions, however.
	 */
	if (atomic_read(&realm->nref) == 0) {
		/* nref==0 means either freshly created or queued for
		 * deferred destruction; list_del_init is safe in both
		 * cases and resurrects a queued realm */
		spin_lock(&mdsc->snap_empty_lock);
		list_del_init(&realm->empty_item);
		spin_unlock(&mdsc->snap_empty_lock);
	}

	atomic_inc(&realm->nref);
}
78
79static void __insert_snap_realm(struct rb_root *root,
80 struct ceph_snap_realm *new)
81{
82 struct rb_node **p = &root->rb_node;
83 struct rb_node *parent = NULL;
84 struct ceph_snap_realm *r = NULL;
85
86 while (*p) {
87 parent = *p;
88 r = rb_entry(parent, struct ceph_snap_realm, node);
89 if (new->ino < r->ino)
90 p = &(*p)->rb_left;
91 else if (new->ino > r->ino)
92 p = &(*p)->rb_right;
93 else
94 BUG();
95 }
96
97 rb_link_node(&new->node, parent, p);
98 rb_insert_color(&new->node, root);
99}
100
101/*
102 * create and get the realm rooted at @ino and bump its ref count.
103 *
104 * caller must hold snap_rwsem for write.
105 */
106static struct ceph_snap_realm *ceph_create_snap_realm(
107 struct ceph_mds_client *mdsc,
108 u64 ino)
109{
110 struct ceph_snap_realm *realm;
111
112 realm = kzalloc(sizeof(*realm), GFP_NOFS);
113 if (!realm)
114 return ERR_PTR(-ENOMEM);
115
116 atomic_set(&realm->nref, 0); /* tree does not take a ref */
117 realm->ino = ino;
118 INIT_LIST_HEAD(&realm->children);
119 INIT_LIST_HEAD(&realm->child_item);
120 INIT_LIST_HEAD(&realm->empty_item);
121 INIT_LIST_HEAD(&realm->inodes_with_caps);
122 spin_lock_init(&realm->inodes_with_caps_lock);
123 __insert_snap_realm(&mdsc->snap_realms, realm);
124 dout("create_snap_realm %llx %p\n", realm->ino, realm);
125 return realm;
126}
127
128/*
129 * lookup the realm rooted at @ino.
130 *
131 * caller must hold snap_rwsem for write.
132 */
133struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
134 u64 ino)
135{
136 struct rb_node *n = mdsc->snap_realms.rb_node;
137 struct ceph_snap_realm *r;
138
139 while (n) {
140 r = rb_entry(n, struct ceph_snap_realm, node);
141 if (ino < r->ino)
142 n = n->rb_left;
143 else if (ino > r->ino)
144 n = n->rb_right;
145 else {
146 dout("lookup_snap_realm %llx %p\n", r->ino, r);
147 return r;
148 }
149 }
150 return NULL;
151}
152
153static void __put_snap_realm(struct ceph_mds_client *mdsc,
154 struct ceph_snap_realm *realm);
155
/*
 * tear down a realm: unlink it from the tree and from its parent,
 * then free all of its memory.
 *
 * called with snap_rwsem (write)
 */
static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
				 struct ceph_snap_realm *realm)
{
	dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);

	rb_erase(&realm->node, &mdsc->snap_realms);

	if (realm->parent) {
		list_del_init(&realm->child_item);
		/* may recurse and destroy the parent too */
		__put_snap_realm(mdsc, realm->parent);
	}

	kfree(realm->prior_parent_snaps);
	kfree(realm->snaps);
	ceph_put_snap_context(realm->cached_context);
	kfree(realm);
}
176
/*
 * drop one ref; destroy the realm when the count hits zero.
 *
 * caller holds snap_rwsem (write), as __destroy_snap_realm requires.
 */
static void __put_snap_realm(struct ceph_mds_client *mdsc,
			     struct ceph_snap_realm *realm)
{
	dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
	     atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
	if (atomic_dec_and_test(&realm->nref))
		__destroy_snap_realm(mdsc, realm);
}
188
189/*
190 * caller needn't hold any locks
191 */
192void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
193 struct ceph_snap_realm *realm)
194{
195 dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
196 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
197 if (!atomic_dec_and_test(&realm->nref))
198 return;
199
200 if (down_write_trylock(&mdsc->snap_rwsem)) {
201 __destroy_snap_realm(mdsc, realm);
202 up_write(&mdsc->snap_rwsem);
203 } else {
204 spin_lock(&mdsc->snap_empty_lock);
205 list_add(&mdsc->snap_empty, &realm->empty_item);
206 spin_unlock(&mdsc->snap_empty_lock);
207 }
208}
209
/*
 * Clean up any realms whose ref counts have dropped to zero.  Note
 * that this does not include realms who were created but not yet
 * used.
 *
 * Called under snap_rwsem (write)
 */
static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
{
	struct ceph_snap_realm *realm;

	spin_lock(&mdsc->snap_empty_lock);
	while (!list_empty(&mdsc->snap_empty)) {
		realm = list_first_entry(&mdsc->snap_empty,
				   struct ceph_snap_realm, empty_item);
		list_del(&realm->empty_item);
		/* drop the spinlock across the destroy (it frees memory
		 * and may recurse into parent puts), then retake it to
		 * rescan the list */
		spin_unlock(&mdsc->snap_empty_lock);
		__destroy_snap_realm(mdsc, realm);
		spin_lock(&mdsc->snap_empty_lock);
	}
	spin_unlock(&mdsc->snap_empty_lock);
}
232
/* flush the deferred-destroy list, taking snap_rwsem for write */
void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
{
	down_write(&mdsc->snap_rwsem);
	__cleanup_empty_realms(mdsc);
	up_write(&mdsc->snap_rwsem);
}
239
/*
 * adjust the parent realm of a given @realm.  adjust child list, and parent
 * pointers, and ref counts appropriately.
 *
 * return 1 if parent was changed, 0 if unchanged, <0 on error.
 *
 * caller must hold snap_rwsem for write.
 */
static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
				    struct ceph_snap_realm *realm,
				    u64 parentino)
{
	struct ceph_snap_realm *parent;

	if (realm->parent_ino == parentino)
		return 0;

	/* find the new parent, creating it lazily if unseen so far */
	parent = ceph_lookup_snap_realm(mdsc, parentino);
	if (!parent) {
		parent = ceph_create_snap_realm(mdsc, parentino);
		if (IS_ERR(parent))
			return PTR_ERR(parent);
	}
	dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
	     realm->ino, realm, realm->parent_ino, realm->parent,
	     parentino, parent);
	/* drop the old parent's child link and ref, take the new one's */
	if (realm->parent) {
		list_del_init(&realm->child_item);
		ceph_put_snap_realm(mdsc, realm->parent);
	}
	realm->parent_ino = parentino;
	realm->parent = parent;
	ceph_get_snap_realm(mdsc, parent);
	list_add(&realm->child_item, &parent->children);
	return 1;
}
276
277
278static int cmpu64_rev(const void *a, const void *b)
279{
280 if (*(u64 *)a < *(u64 *)b)
281 return 1;
282 if (*(u64 *)a > *(u64 *)b)
283 return -1;
284 return 0;
285}
286
/*
 * build the snap context for a given realm: the realm's own snaps,
 * snaps inherited from prior parents, and the current parent's snaps
 * since it became the parent, sorted descending.  returns 0 or a
 * negative errno; on failure the stale cached context is dropped.
 */
static int build_snap_context(struct ceph_snap_realm *realm)
{
	struct ceph_snap_realm *parent = realm->parent;
	struct ceph_snap_context *snapc;
	int err = 0;
	int i;
	int num = realm->num_prior_parent_snaps + realm->num_snaps;

	/*
	 * build parent context, if it hasn't been built.
	 * conservatively estimate that all parent snaps might be
	 * included by us.
	 */
	if (parent) {
		if (!parent->cached_context) {
			err = build_snap_context(parent);
			if (err)
				goto fail;
		}
		num += parent->cached_context->num_snaps;
	}

	/* do i actually need to update?  not if my context seq
	   matches realm seq, and my parents' does to.  (this works
	   because we rebuild_snap_realms() works _downward_ in
	   hierarchy after each update.) */
	if (realm->cached_context &&
	    realm->cached_context->seq <= realm->seq &&
	    (!parent ||
	     realm->cached_context->seq <= parent->cached_context->seq)) {
		dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
		     " (unchanged)\n",
		     realm->ino, realm, realm->cached_context,
		     realm->cached_context->seq,
		     realm->cached_context->num_snaps);
		return 0;
	}

	/* alloc new snap context */
	err = -ENOMEM;
	/* guard the size computation below against overflow */
	if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
		goto fail;
	snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
	if (!snapc)
		goto fail;
	atomic_set(&snapc->nref, 1);

	/* build (reverse sorted) snap vector */
	num = 0;
	snapc->seq = realm->seq;
	if (parent) {
		/* include any of parent's snaps occurring _after_ my
		   parent became my parent */
		for (i = 0; i < parent->cached_context->num_snaps; i++)
			if (parent->cached_context->snaps[i] >=
			    realm->parent_since)
				snapc->snaps[num++] =
					parent->cached_context->snaps[i];
		if (parent->cached_context->seq > snapc->seq)
			snapc->seq = parent->cached_context->seq;
	}
	/* then my own snaps and those inherited from prior parents */
	memcpy(snapc->snaps + num, realm->snaps,
	       sizeof(u64)*realm->num_snaps);
	num += realm->num_snaps;
	memcpy(snapc->snaps + num, realm->prior_parent_snaps,
	       sizeof(u64)*realm->num_prior_parent_snaps);
	num += realm->num_prior_parent_snaps;

	sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
	snapc->num_snaps = num;
	dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
	     realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);

	/* swap in the fresh context, releasing the old one */
	if (realm->cached_context)
		ceph_put_snap_context(realm->cached_context);
	realm->cached_context = snapc;
	return 0;

fail:
	/*
	 * if we fail, clear old (incorrect) cached_context... hopefully
	 * we'll have better luck building it later
	 */
	if (realm->cached_context) {
		ceph_put_snap_context(realm->cached_context);
		realm->cached_context = NULL;
	}
	pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
	       realm, err);
	return err;
}
381
/*
 * rebuild snap context for the given realm and all of its children
 * (depth-first).  errors from build_snap_context are ignored here;
 * on failure it logs and clears that realm's cached context itself.
 */
static void rebuild_snap_realms(struct ceph_snap_realm *realm)
{
	struct ceph_snap_realm *child;

	dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
	build_snap_context(realm);

	list_for_each_entry(child, &realm->children, child_item)
		rebuild_snap_realms(child);
}
395
396
397/*
398 * helper to allocate and decode an array of snapids. free prior
399 * instance, if any.
400 */
401static int dup_array(u64 **dst, __le64 *src, int num)
402{
403 int i;
404
405 kfree(*dst);
406 if (num) {
407 *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
408 if (!*dst)
409 return -ENOMEM;
410 for (i = 0; i < num; i++)
411 (*dst)[i] = get_unaligned_le64(src + i);
412 } else {
413 *dst = NULL;
414 }
415 return 0;
416}
417
418
419/*
420 * When a snapshot is applied, the size/mtime inode metadata is queued
421 * in a ceph_cap_snap (one for each snapshot) until writeback
422 * completes and the metadata can be flushed back to the MDS.
423 *
424 * However, if a (sync) write is currently in-progress when we apply
425 * the snapshot, we have to wait until the write succeeds or fails
426 * (and a final size/mtime is known). In this case the
427 * cap_snap->writing = 1, and is said to be "pending." When the write
428 * finishes, we __ceph_finish_cap_snap().
429 *
430 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
431 * change).
432 */
void ceph_queue_cap_snap(struct ceph_inode_info *ci,
			 struct ceph_snap_context *snapc)
{
	struct inode *inode = &ci->vfs_inode;
	struct ceph_cap_snap *capsnap;
	int used;

	/* allocate before taking i_lock so we never sleep under it */
	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
	if (!capsnap) {
		pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
		return;
	}

	spin_lock(&inode->i_lock);
	used = __ceph_caps_used(ci);
	if (__ceph_have_pending_cap_snap(ci)) {
		/* there is no point in queuing multiple "pending" cap_snaps,
		   as no new writes are allowed to start when pending, so any
		   writes in progress now were started before the previous
		   cap_snap.  lucky us. */
		dout("queue_cap_snap %p snapc %p seq %llu used %d"
		     " already pending\n", inode, snapc, snapc->seq, used);
		kfree(capsnap);
	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
		/* dirty pages and/or a write in flight: capture head state.
		 * NOTE(review): igrab() is called while holding i_lock --
		 * presumably safe in this kernel; confirm locking rules. */
		igrab(inode);

		atomic_set(&capsnap->nref, 1);
		capsnap->ci = ci;
		INIT_LIST_HEAD(&capsnap->ci_item);
		INIT_LIST_HEAD(&capsnap->flushing_item);

		/* this snap covers everything up to (not including) seq */
		capsnap->follows = snapc->seq - 1;
		capsnap->context = ceph_get_snap_context(snapc);
		capsnap->issued = __ceph_caps_issued(ci, NULL);
		capsnap->dirty = __ceph_caps_dirty(ci);

		capsnap->mode = inode->i_mode;
		capsnap->uid = inode->i_uid;
		capsnap->gid = inode->i_gid;

		/* fixme? xattrs are not captured for the snapshot yet */
		capsnap->xattr_blob = NULL;
		capsnap->xattr_len = 0;

		/* dirty page count moved from _head to this cap_snap;
		   all subsequent writes page dirties occur _after_ this
		   snapshot. */
		capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
		ci->i_wrbuffer_ref_head = 0;
		ceph_put_snap_context(ci->i_head_snapc);
		ci->i_head_snapc = NULL;
		list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);

		if (used & CEPH_CAP_FILE_WR) {
			/* a sync write is in flight; final size/mtime not
			 * yet known -- mark "pending" and finish later */
			dout("queue_cap_snap %p cap_snap %p snapc %p"
			     " seq %llu used WR, now pending\n", inode,
			     capsnap, snapc, snapc->seq);
			capsnap->writing = 1;
		} else {
			/* note mtime, size NOW. */
			__ceph_finish_cap_snap(ci, capsnap);
		}
	} else {
		dout("queue_cap_snap %p nothing dirty|writing\n", inode);
		kfree(capsnap);
	}

	spin_unlock(&inode->i_lock);
}
502
503/*
504 * Finalize the size, mtime for a cap_snap.. that is, settle on final values
505 * to be used for the snapshot, to be flushed back to the mds.
506 *
507 * If capsnap can now be flushed, add to snap_flush list, and return 1.
508 *
509 * Caller must hold i_lock.
510 */
int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
			   struct ceph_cap_snap *capsnap)
{
	struct inode *inode = &ci->vfs_inode;
	struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;

	/* must not still be waiting on an in-flight write */
	BUG_ON(capsnap->writing);
	/* settle on the current values as final for this snapshot */
	capsnap->size = inode->i_size;
	capsnap->mtime = inode->i_mtime;
	capsnap->atime = inode->i_atime;
	capsnap->ctime = inode->i_ctime;
	capsnap->time_warp_seq = ci->i_time_warp_seq;
	if (capsnap->dirty_pages) {
		/* can't flush yet; writeback of the snapped pages must
		 * complete first (caller will retry via page writeback) */
		dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
		     "still has %d dirty pages\n", inode, capsnap,
		     capsnap->context, capsnap->context->seq,
		     capsnap->size, capsnap->dirty_pages);
		return 0;
	}
	dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
	     inode, capsnap, capsnap->context,
	     capsnap->context->seq, capsnap->size);

	/* queue the inode for flush_snaps() */
	spin_lock(&mdsc->snap_flush_lock);
	list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
	spin_unlock(&mdsc->snap_flush_lock);
	return 1;  /* caller may want to ceph_flush_snaps */
}
539
540
541/*
542 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
543 * the snap realm parameters from a given realm and all of its ancestors,
544 * up to the root.
545 *
546 * Caller must hold snap_rwsem for write.
547 */
int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
			   void *p, void *e, bool deletion)
{
	struct ceph_mds_snap_realm *ri;    /* encoded */
	__le64 *snaps;                     /* encoded */
	__le64 *prior_parent_snaps;        /* encoded */
	struct ceph_snap_realm *realm;
	int invalidate = 0;
	int err = -ENOMEM;

	dout("update_snap_trace deletion=%d\n", deletion);
more:
	/* decode one realm record: header, then two u64 snap arrays */
	ceph_decode_need(&p, e, sizeof(*ri), bad);
	ri = p;
	p += sizeof(*ri);
	ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
			    le32_to_cpu(ri->num_prior_parent_snaps)), bad);
	snaps = p;
	p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
	prior_parent_snaps = p;
	p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);

	/* find or create the realm this record describes */
	realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
	if (!realm) {
		realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
		if (IS_ERR(realm)) {
			err = PTR_ERR(realm);
			goto fail;
		}
	}

	if (le64_to_cpu(ri->seq) > realm->seq) {
		dout("update_snap_trace updating %llx %p %lld -> %lld\n",
		     realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
		/*
		 * if the realm seq has changed, queue a cap_snap for every
		 * inode with open caps.  we do this _before_ we update
		 * the realm info so that we prepare for writeback under the
		 * _previous_ snap context.
		 *
		 * ...unless it's a snap deletion!
		 */
		if (!deletion) {
			struct ceph_inode_info *ci;
			struct inode *lastinode = NULL;

			/* drop the list lock around each queue_cap_snap
			 * call; lastinode keeps the previous inode pinned
			 * so list iteration stays safe */
			spin_lock(&realm->inodes_with_caps_lock);
			list_for_each_entry(ci, &realm->inodes_with_caps,
					    i_snap_realm_item) {
				struct inode *inode = igrab(&ci->vfs_inode);
				if (!inode)
					continue;
				spin_unlock(&realm->inodes_with_caps_lock);
				if (lastinode)
					iput(lastinode);
				lastinode = inode;
				ceph_queue_cap_snap(ci, realm->cached_context);
				spin_lock(&realm->inodes_with_caps_lock);
			}
			spin_unlock(&realm->inodes_with_caps_lock);
			if (lastinode)
				iput(lastinode);
			dout("update_snap_trace cap_snaps queued\n");
		}

	} else {
		dout("update_snap_trace %llx %p seq %lld unchanged\n",
		     realm->ino, realm, realm->seq);
	}

	/* ensure the parent is correct */
	err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
	if (err < 0)
		goto fail;
	invalidate += err;

	if (le64_to_cpu(ri->seq) > realm->seq) {
		/* update realm parameters, snap lists */
		realm->seq = le64_to_cpu(ri->seq);
		realm->created = le64_to_cpu(ri->created);
		realm->parent_since = le64_to_cpu(ri->parent_since);

		realm->num_snaps = le32_to_cpu(ri->num_snaps);
		err = dup_array(&realm->snaps, snaps, realm->num_snaps);
		if (err < 0)
			goto fail;

		realm->num_prior_parent_snaps =
			le32_to_cpu(ri->num_prior_parent_snaps);
		err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
				realm->num_prior_parent_snaps);
		if (err < 0)
			goto fail;

		invalidate = 1;
	} else if (!realm->cached_context) {
		/* no context yet; must (re)build one below */
		invalidate = 1;
	}

	dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
	     realm, invalidate, p, e);

	/* the trace encodes ancestors up to the root; keep decoding */
	if (p < e)
		goto more;

	/* invalidate when we reach the _end_ (root) of the trace */
	if (invalidate)
		rebuild_snap_realms(realm);

	__cleanup_empty_realms(mdsc);
	return 0;

bad:
	err = -EINVAL;
fail:
	pr_err("update_snap_trace error %d\n", err);
	return err;
}
666
667
668/*
669 * Send any cap_snaps that are queued for flush. Try to carry
670 * s_mutex across multiple snap flushes to avoid locking overhead.
671 *
672 * Caller holds no locks.
673 */
static void flush_snaps(struct ceph_mds_client *mdsc)
{
	struct ceph_inode_info *ci;
	struct inode *inode;
	struct ceph_mds_session *session = NULL;

	dout("flush_snaps\n");
	spin_lock(&mdsc->snap_flush_lock);
	while (!list_empty(&mdsc->snap_flush_list)) {
		ci = list_first_entry(&mdsc->snap_flush_list,
				struct ceph_inode_info, i_snap_flush_item);
		inode = &ci->vfs_inode;
		/* pin the inode across the unlocked flush.
		 * NOTE(review): igrab() return value is ignored here --
		 * presumably the list entry guarantees the inode is live;
		 * confirm against inode teardown path. */
		igrab(inode);
		spin_unlock(&mdsc->snap_flush_lock);
		spin_lock(&inode->i_lock);
		/* __ceph_flush_snaps reuses *session across iterations to
		 * avoid re-taking s_mutex for every inode */
		__ceph_flush_snaps(ci, &session);
		spin_unlock(&inode->i_lock);
		iput(inode);
		spin_lock(&mdsc->snap_flush_lock);
	}
	spin_unlock(&mdsc->snap_flush_lock);

	/* release the session carried across the loop, if any */
	if (session) {
		mutex_unlock(&session->s_mutex);
		ceph_put_mds_session(session);
	}
	dout("flush_snaps done\n");
}
702
703
704/*
705 * Handle a snap notification from the MDS.
706 *
707 * This can take two basic forms: the simplest is just a snap creation
708 * or deletion notification on an existing realm. This should update the
709 * realm and its children.
710 *
711 * The more difficult case is realm creation, due to snap creation at a
712 * new point in the file hierarchy, or due to a rename that moves a file or
713 * directory into another realm.
714 */
715void ceph_handle_snap(struct ceph_mds_client *mdsc,
716 struct ceph_mds_session *session,
717 struct ceph_msg *msg)
718{
719 struct super_block *sb = mdsc->client->sb;
720 int mds = session->s_mds;
721 u64 split;
722 int op;
723 int trace_len;
724 struct ceph_snap_realm *realm = NULL;
725 void *p = msg->front.iov_base;
726 void *e = p + msg->front.iov_len;
727 struct ceph_mds_snap_head *h;
728 int num_split_inos, num_split_realms;
729 __le64 *split_inos = NULL, *split_realms = NULL;
730 int i;
731 int locked_rwsem = 0;
732
733 /* decode */
734 if (msg->front.iov_len < sizeof(*h))
735 goto bad;
736 h = p;
737 op = le32_to_cpu(h->op);
738 split = le64_to_cpu(h->split); /* non-zero if we are splitting an
739 * existing realm */
740 num_split_inos = le32_to_cpu(h->num_split_inos);
741 num_split_realms = le32_to_cpu(h->num_split_realms);
742 trace_len = le32_to_cpu(h->trace_len);
743 p += sizeof(*h);
744
745 dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
746 ceph_snap_op_name(op), split, trace_len);
747
748 mutex_lock(&session->s_mutex);
749 session->s_seq++;
750 mutex_unlock(&session->s_mutex);
751
752 down_write(&mdsc->snap_rwsem);
753 locked_rwsem = 1;
754
755 if (op == CEPH_SNAP_OP_SPLIT) {
756 struct ceph_mds_snap_realm *ri;
757
758 /*
759 * A "split" breaks part of an existing realm off into
760 * a new realm. The MDS provides a list of inodes
761 * (with caps) and child realms that belong to the new
762 * child.
763 */
764 split_inos = p;
765 p += sizeof(u64) * num_split_inos;
766 split_realms = p;
767 p += sizeof(u64) * num_split_realms;
768 ceph_decode_need(&p, e, sizeof(*ri), bad);
769 /* we will peek at realm info here, but will _not_
770 * advance p, as the realm update will occur below in
771 * ceph_update_snap_trace. */
772 ri = p;
773
774 realm = ceph_lookup_snap_realm(mdsc, split);
775 if (!realm) {
776 realm = ceph_create_snap_realm(mdsc, split);
777 if (IS_ERR(realm))
778 goto out;
779 }
780 ceph_get_snap_realm(mdsc, realm);
781
782 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
783 for (i = 0; i < num_split_inos; i++) {
784 struct ceph_vino vino = {
785 .ino = le64_to_cpu(split_inos[i]),
786 .snap = CEPH_NOSNAP,
787 };
788 struct inode *inode = ceph_find_inode(sb, vino);
789 struct ceph_inode_info *ci;
790
791 if (!inode)
792 continue;
793 ci = ceph_inode(inode);
794
795 spin_lock(&inode->i_lock);
796 if (!ci->i_snap_realm)
797 goto skip_inode;
798 /*
799 * If this inode belongs to a realm that was
800 * created after our new realm, we experienced
801 * a race (due to another split notifications
802 * arriving from a different MDS). So skip
803 * this inode.
804 */
805 if (ci->i_snap_realm->created >
806 le64_to_cpu(ri->created)) {
807 dout(" leaving %p in newer realm %llx %p\n",
808 inode, ci->i_snap_realm->ino,
809 ci->i_snap_realm);
810 goto skip_inode;
811 }
812 dout(" will move %p to split realm %llx %p\n",
813 inode, realm->ino, realm);
814 /*
815 * Remove the inode from the realm's inode
816 * list, but don't add it to the new realm
817 * yet. We don't want the cap_snap to be
818 * queued (again) by ceph_update_snap_trace()
819 * below. Queue it _now_, under the old context.
820 */
821 list_del_init(&ci->i_snap_realm_item);
822 spin_unlock(&inode->i_lock);
823
824 ceph_queue_cap_snap(ci,
825 ci->i_snap_realm->cached_context);
826
827 iput(inode);
828 continue;
829
830skip_inode:
831 spin_unlock(&inode->i_lock);
832 iput(inode);
833 }
834
835 /* we may have taken some of the old realm's children. */
836 for (i = 0; i < num_split_realms; i++) {
837 struct ceph_snap_realm *child =
838 ceph_lookup_snap_realm(mdsc,
839 le64_to_cpu(split_realms[i]));
840 if (!child)
841 continue;
842 adjust_snap_realm_parent(mdsc, child, realm->ino);
843 }
844 }
845
846 /*
847 * update using the provided snap trace. if we are deleting a
848 * snap, we can avoid queueing cap_snaps.
849 */
850 ceph_update_snap_trace(mdsc, p, e,
851 op == CEPH_SNAP_OP_DESTROY);
852
853 if (op == CEPH_SNAP_OP_SPLIT) {
854 /*
855 * ok, _now_ add the inodes into the new realm.
856 */
857 for (i = 0; i < num_split_inos; i++) {
858 struct ceph_vino vino = {
859 .ino = le64_to_cpu(split_inos[i]),
860 .snap = CEPH_NOSNAP,
861 };
862 struct inode *inode = ceph_find_inode(sb, vino);
863 struct ceph_inode_info *ci;
864
865 if (!inode)
866 continue;
867 ci = ceph_inode(inode);
868 spin_lock(&inode->i_lock);
869 if (!ci->i_snap_realm)
870 goto split_skip_inode;
871 ceph_put_snap_realm(mdsc, ci->i_snap_realm);
872 spin_lock(&realm->inodes_with_caps_lock);
873 list_add(&ci->i_snap_realm_item,
874 &realm->inodes_with_caps);
875 ci->i_snap_realm = realm;
876 spin_unlock(&realm->inodes_with_caps_lock);
877 ceph_get_snap_realm(mdsc, realm);
878split_skip_inode:
879 spin_unlock(&inode->i_lock);
880 iput(inode);
881 }
882
883 /* we took a reference when we created the realm, above */
884 ceph_put_snap_realm(mdsc, realm);
885 }
886
887 __cleanup_empty_realms(mdsc);
888
889 up_write(&mdsc->snap_rwsem);
890
891 flush_snaps(mdsc);
892 return;
893
894bad:
895 pr_err("corrupt snap message from mds%d\n", mds);
896 ceph_msg_dump(msg);
897out:
898 if (locked_rwsem)
899 up_write(&mdsc->snap_rwsem);
900 return;
901}
902
903
904
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..4290a6e860b0
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1030 @@
1
2#include "ceph_debug.h"
3
4#include <linux/backing-dev.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/seq_file.h>
14#include <linux/statfs.h>
15#include <linux/string.h>
16#include <linux/version.h>
17#include <linux/vmalloc.h>
18
19#include "decode.h"
20#include "super.h"
21#include "mon_client.h"
22#include "auth.h"
23
24/*
25 * Ceph superblock operations
26 *
27 * Handle the basics of mounting, unmounting.
28 */
29
30
31/*
32 * find filename portion of a path (/foo/bar/baz -> baz)
33 */
34const char *ceph_file_part(const char *s, int len)
35{
36 const char *e = s + len;
37
38 while (e != s && *(e-1) != '/')
39 e--;
40 return e;
41}
42
43
44/*
45 * super ops
46 */
47static void ceph_put_super(struct super_block *s)
48{
49 struct ceph_client *cl = ceph_client(s);
50
51 dout("put_super\n");
52 ceph_mdsc_close_sessions(&cl->mdsc);
53 return;
54}
55
static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
{
	struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
	struct ceph_monmap *monmap = client->monc.monmap;
	struct ceph_statfs st;
	u64 fsid;
	int err;

	dout("statfs\n");
	/* synchronous round trip to the monitors for cluster-wide stats */
	err = ceph_monc_do_statfs(&client->monc, &st);
	if (err < 0)
		return err;

	/* fill in kstatfs */
	buf->f_type = CEPH_SUPER_MAGIC; /* ?? */

	/*
	 * express utilization in terms of large blocks to avoid
	 * overflow on 32-bit machines.
	 */
	buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
	/* st.kb is in kilobytes; >> (CEPH_BLOCK_SHIFT-10) converts KB
	 * into CEPH_BLOCK-sized units */
	buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
	buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
		(CEPH_BLOCK_SHIFT-10);
	buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);

	buf->f_files = le64_to_cpu(st.num_objects);
	buf->f_ffree = -1;		/* no meaningful inode-free count */
	buf->f_namelen = PATH_MAX;
	buf->f_frsize = PAGE_CACHE_SIZE;

	/* leave fsid little-endian, regardless of host endianness */
	/* XOR-fold the 16-byte cluster fsid into the 64-bit f_fsid.
	 * NOTE(review): u64 access through the fsid struct relies on the
	 * kernel's -fno-strict-aliasing and fsid alignment -- confirm. */
	fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
	buf->f_fsid.val[0] = fsid & 0xffffffff;
	buf->f_fsid.val[1] = fsid >> 32;

	return 0;
}
94
95
96static int ceph_syncfs(struct super_block *sb, int wait)
97{
98 dout("sync_fs %d\n", wait);
99 ceph_osdc_sync(&ceph_client(sb)->osdc);
100 ceph_mdsc_sync(&ceph_client(sb)->mdsc);
101 dout("sync_fs %d done\n", wait);
102 return 0;
103}
104
105
106/**
107 * ceph_show_options - Show mount options in /proc/mounts
108 * @m: seq_file to write to
109 * @mnt: mount descriptor
110 */
111static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
112{
113 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
114 struct ceph_mount_args *args = client->mount_args;
115
116 if (args->flags & CEPH_OPT_FSID)
117 seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
118 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
119 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
120 if (args->flags & CEPH_OPT_NOSHARE)
121 seq_puts(m, ",noshare");
122 if (args->flags & CEPH_OPT_DIRSTAT)
123 seq_puts(m, ",dirstat");
124 if ((args->flags & CEPH_OPT_RBYTES) == 0)
125 seq_puts(m, ",norbytes");
126 if (args->flags & CEPH_OPT_NOCRC)
127 seq_puts(m, ",nocrc");
128 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
129 seq_puts(m, ",noasyncreaddir");
130 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
131 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
132 if (args->name)
133 seq_printf(m, ",name=%s", args->name);
134 if (args->secret)
135 seq_puts(m, ",secret=<hidden>");
136 return 0;
137}
138
139/*
140 * caches
141 */
142struct kmem_cache *ceph_inode_cachep;
143struct kmem_cache *ceph_cap_cachep;
144struct kmem_cache *ceph_dentry_cachep;
145struct kmem_cache *ceph_file_cachep;
146
147static void ceph_inode_init_once(void *foo)
148{
149 struct ceph_inode_info *ci = foo;
150 inode_init_once(&ci->vfs_inode);
151}
152
153static int default_congestion_kb(void)
154{
155 int congestion_kb;
156
157 /*
158 * Copied from NFS
159 *
160 * congestion size, scale with available memory.
161 *
162 * 64MB: 8192k
163 * 128MB: 11585k
164 * 256MB: 16384k
165 * 512MB: 23170k
166 * 1GB: 32768k
167 * 2GB: 46340k
168 * 4GB: 65536k
169 * 8GB: 92681k
170 * 16GB: 131072k
171 *
172 * This allows larger machines to have larger/more transfers.
173 * Limit the default to 256M
174 */
175 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
176 if (congestion_kb > 256*1024)
177 congestion_kb = 256*1024;
178
179 return congestion_kb;
180}
181
/*
 * Create all client slab caches.  On any failure, tear down the caches
 * already created (in reverse order via the goto chain) and return
 * -ENOMEM.
 */
static int __init init_caches(void)
{
	/* inode cache needs a constructor so the embedded VFS inode is
	 * initialized once per slab object */
	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
				      sizeof(struct ceph_inode_info),
				      __alignof__(struct ceph_inode_info),
				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
				      ceph_inode_init_once);
	if (ceph_inode_cachep == NULL)
		return -ENOMEM;

	ceph_cap_cachep = KMEM_CACHE(ceph_cap,
				     SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
	if (ceph_cap_cachep == NULL)
		goto bad_cap;

	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
	if (ceph_dentry_cachep == NULL)
		goto bad_dentry;

	ceph_file_cachep = KMEM_CACHE(ceph_file_info,
				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
	if (ceph_file_cachep == NULL)
		goto bad_file;

	return 0;

	/* unwind in reverse creation order */
bad_file:
	kmem_cache_destroy(ceph_dentry_cachep);
bad_dentry:
	kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
	kmem_cache_destroy(ceph_inode_cachep);
	return -ENOMEM;
}
217
/*
 * Destroy all slab caches created by init_caches().  Presumably called
 * only at module unload, when no cached objects remain allocated.
 */
static void destroy_caches(void)
{
	kmem_cache_destroy(ceph_inode_cachep);
	kmem_cache_destroy(ceph_cap_cachep);
	kmem_cache_destroy(ceph_dentry_cachep);
	kmem_cache_destroy(ceph_file_cachep);
}
225
226
227/*
228 * ceph_umount_begin - initiate forced umount. Tear down down the
229 * mount, skipping steps that may hang while waiting for server(s).
230 */
231static void ceph_umount_begin(struct super_block *sb)
232{
233 struct ceph_client *client = ceph_sb_to_client(sb);
234
235 dout("ceph_umount_begin - starting forced umount\n");
236 if (!client)
237 return;
238 client->mount_state = CEPH_MOUNT_SHUTDOWN;
239 return;
240}
241
/* VFS superblock operations for ceph mounts */
static const struct super_operations ceph_super_ops = {
	.alloc_inode	= ceph_alloc_inode,
	.destroy_inode	= ceph_destroy_inode,
	.write_inode    = ceph_write_inode,
	.sync_fs        = ceph_syncfs,
	.put_super	= ceph_put_super,
	.show_options   = ceph_show_options,
	.statfs		= ceph_statfs,
	.umount_begin   = ceph_umount_begin,
};
252
253
/* Human-readable name for a ceph message type (debug/log output only) */
const char *ceph_msg_type_name(int type)
{
	switch (type) {
	case CEPH_MSG_SHUTDOWN: return "shutdown";
	case CEPH_MSG_PING: return "ping";
	case CEPH_MSG_AUTH: return "auth";
	case CEPH_MSG_AUTH_REPLY: return "auth_reply";
	case CEPH_MSG_MON_MAP: return "mon_map";
	case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
	case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
	case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
	case CEPH_MSG_STATFS: return "statfs";
	case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
	case CEPH_MSG_MDS_MAP: return "mds_map";
	case CEPH_MSG_CLIENT_SESSION: return "client_session";
	case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
	case CEPH_MSG_CLIENT_REQUEST: return "client_request";
	case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
	case CEPH_MSG_CLIENT_REPLY: return "client_reply";
	case CEPH_MSG_CLIENT_CAPS: return "client_caps";
	case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
	case CEPH_MSG_CLIENT_SNAP: return "client_snap";
	case CEPH_MSG_CLIENT_LEASE: return "client_lease";
	case CEPH_MSG_OSD_MAP: return "osd_map";
	case CEPH_MSG_OSD_OP: return "osd_op";
	case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
	default: return "unknown";
	}
}
283
284
285/*
286 * mount options
287 */
288enum {
289 Opt_fsidmajor,
290 Opt_fsidminor,
291 Opt_monport,
292 Opt_wsize,
293 Opt_rsize,
294 Opt_osdtimeout,
295 Opt_osdkeepalivetimeout,
296 Opt_mount_timeout,
297 Opt_osd_idle_ttl,
298 Opt_caps_wanted_delay_min,
299 Opt_caps_wanted_delay_max,
300 Opt_readdir_max_entries,
301 Opt_congestion_kb,
302 Opt_last_int,
303 /* int args above */
304 Opt_snapdirname,
305 Opt_name,
306 Opt_secret,
307 Opt_last_string,
308 /* string args above */
309 Opt_ip,
310 Opt_noshare,
311 Opt_dirstat,
312 Opt_nodirstat,
313 Opt_rbytes,
314 Opt_norbytes,
315 Opt_nocrc,
316 Opt_noasyncreaddir,
317};
318
/* pattern table for match_token(); must stay in sync with the enum above */
static match_table_t arg_tokens = {
	{Opt_fsidmajor, "fsidmajor=%ld"},
	{Opt_fsidminor, "fsidminor=%ld"},
	{Opt_monport, "monport=%d"},
	{Opt_wsize, "wsize=%d"},
	{Opt_rsize, "rsize=%d"},
	{Opt_osdtimeout, "osdtimeout=%d"},
	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
	{Opt_mount_timeout, "mount_timeout=%d"},
	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
	{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
	{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
	{Opt_readdir_max_entries, "readdir_max_entries=%d"},
	{Opt_congestion_kb, "write_congestion_kb=%d"},
	/* int args above */
	{Opt_snapdirname, "snapdirname=%s"},
	{Opt_name, "name=%s"},
	{Opt_secret, "secret=%s"},
	/* string args above */
	{Opt_ip, "ip=%s"},
	{Opt_noshare, "noshare"},
	{Opt_dirstat, "dirstat"},
	{Opt_nodirstat, "nodirstat"},
	{Opt_rbytes, "rbytes"},
	{Opt_norbytes, "norbytes"},
	{Opt_nocrc, "nocrc"},
	{Opt_noasyncreaddir, "noasyncreaddir"},
	{-1, NULL}
};
348
349
350static struct ceph_mount_args *parse_mount_args(int flags, char *options,
351 const char *dev_name,
352 const char **path)
353{
354 struct ceph_mount_args *args;
355 const char *c;
356 int err = -ENOMEM;
357 substring_t argstr[MAX_OPT_ARGS];
358
359 args = kzalloc(sizeof(*args), GFP_KERNEL);
360 if (!args)
361 return ERR_PTR(-ENOMEM);
362 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
363 GFP_KERNEL);
364 if (!args->mon_addr)
365 goto out;
366
367 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
368
369 /* start with defaults */
370 args->sb_flags = flags;
371 args->flags = CEPH_OPT_DEFAULT;
372 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
373 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
374 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
375 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
376 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
377 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
378 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
379 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
380 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
381 args->max_readdir = 1024;
382 args->congestion_kb = default_congestion_kb();
383
384 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
385 err = -EINVAL;
386 if (!dev_name)
387 goto out;
388 *path = strstr(dev_name, ":/");
389 if (*path == NULL) {
390 pr_err("device name is missing path (no :/ in %s)\n",
391 dev_name);
392 goto out;
393 }
394
395 /* get mon ip(s) */
396 err = ceph_parse_ips(dev_name, *path, args->mon_addr,
397 CEPH_MAX_MON, &args->num_mon);
398 if (err < 0)
399 goto out;
400
401 /* path on server */
402 *path += 2;
403 dout("server path '%s'\n", *path);
404
405 /* parse mount options */
406 while ((c = strsep(&options, ",")) != NULL) {
407 int token, intval, ret;
408 if (!*c)
409 continue;
410 err = -EINVAL;
411 token = match_token((char *)c, arg_tokens, argstr);
412 if (token < 0) {
413 pr_err("bad mount option at '%s'\n", c);
414 goto out;
415 }
416 if (token < Opt_last_int) {
417 ret = match_int(&argstr[0], &intval);
418 if (ret < 0) {
419 pr_err("bad mount option arg (not int) "
420 "at '%s'\n", c);
421 continue;
422 }
423 dout("got int token %d val %d\n", token, intval);
424 } else if (token > Opt_last_int && token < Opt_last_string) {
425 dout("got string token %d val %s\n", token,
426 argstr[0].from);
427 } else {
428 dout("got token %d\n", token);
429 }
430 switch (token) {
431 case Opt_fsidmajor:
432 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
433 break;
434 case Opt_fsidminor:
435 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
436 break;
437 case Opt_ip:
438 err = ceph_parse_ips(argstr[0].from,
439 argstr[0].to,
440 &args->my_addr,
441 1, NULL);
442 if (err < 0)
443 goto out;
444 args->flags |= CEPH_OPT_MYIP;
445 break;
446
447 case Opt_snapdirname:
448 kfree(args->snapdir_name);
449 args->snapdir_name = kstrndup(argstr[0].from,
450 argstr[0].to-argstr[0].from,
451 GFP_KERNEL);
452 break;
453 case Opt_name:
454 args->name = kstrndup(argstr[0].from,
455 argstr[0].to-argstr[0].from,
456 GFP_KERNEL);
457 break;
458 case Opt_secret:
459 args->secret = kstrndup(argstr[0].from,
460 argstr[0].to-argstr[0].from,
461 GFP_KERNEL);
462 break;
463
464 /* misc */
465 case Opt_wsize:
466 args->wsize = intval;
467 break;
468 case Opt_rsize:
469 args->rsize = intval;
470 break;
471 case Opt_osdtimeout:
472 args->osd_timeout = intval;
473 break;
474 case Opt_osdkeepalivetimeout:
475 args->osd_keepalive_timeout = intval;
476 break;
477 case Opt_mount_timeout:
478 args->mount_timeout = intval;
479 break;
480 case Opt_caps_wanted_delay_min:
481 args->caps_wanted_delay_min = intval;
482 break;
483 case Opt_caps_wanted_delay_max:
484 args->caps_wanted_delay_max = intval;
485 break;
486 case Opt_readdir_max_entries:
487 args->max_readdir = intval;
488 break;
489 case Opt_congestion_kb:
490 args->congestion_kb = intval;
491 break;
492
493 case Opt_noshare:
494 args->flags |= CEPH_OPT_NOSHARE;
495 break;
496
497 case Opt_dirstat:
498 args->flags |= CEPH_OPT_DIRSTAT;
499 break;
500 case Opt_nodirstat:
501 args->flags &= ~CEPH_OPT_DIRSTAT;
502 break;
503 case Opt_rbytes:
504 args->flags |= CEPH_OPT_RBYTES;
505 break;
506 case Opt_norbytes:
507 args->flags &= ~CEPH_OPT_RBYTES;
508 break;
509 case Opt_nocrc:
510 args->flags |= CEPH_OPT_NOCRC;
511 break;
512 case Opt_noasyncreaddir:
513 args->flags |= CEPH_OPT_NOASYNCREADDIR;
514 break;
515
516 default:
517 BUG_ON(token);
518 }
519 }
520 return args;
521
522out:
523 kfree(args->mon_addr);
524 kfree(args);
525 return ERR_PTR(err);
526}
527
528static void destroy_mount_args(struct ceph_mount_args *args)
529{
530 dout("destroy_mount_args %p\n", args);
531 kfree(args->snapdir_name);
532 args->snapdir_name = NULL;
533 kfree(args->name);
534 args->name = NULL;
535 kfree(args->secret);
536 args->secret = NULL;
537 kfree(args);
538}
539
540/*
541 * create a fresh client instance
542 */
543static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
544{
545 struct ceph_client *client;
546 int err = -ENOMEM;
547
548 client = kzalloc(sizeof(*client), GFP_KERNEL);
549 if (client == NULL)
550 return ERR_PTR(-ENOMEM);
551
552 mutex_init(&client->mount_mutex);
553
554 init_waitqueue_head(&client->auth_wq);
555
556 client->sb = NULL;
557 client->mount_state = CEPH_MOUNT_MOUNTING;
558 client->mount_args = args;
559
560 client->msgr = NULL;
561
562 client->auth_err = 0;
563 atomic_long_set(&client->writeback_count, 0);
564
565 err = bdi_init(&client->backing_dev_info);
566 if (err < 0)
567 goto fail;
568
569 err = -ENOMEM;
570 client->wb_wq = create_workqueue("ceph-writeback");
571 if (client->wb_wq == NULL)
572 goto fail_bdi;
573 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
574 if (client->pg_inv_wq == NULL)
575 goto fail_wb_wq;
576 client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
577 if (client->trunc_wq == NULL)
578 goto fail_pg_inv_wq;
579
580 /* set up mempools */
581 err = -ENOMEM;
582 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
583 client->mount_args->wsize >> PAGE_CACHE_SHIFT);
584 if (!client->wb_pagevec_pool)
585 goto fail_trunc_wq;
586
587 /* caps */
588 client->min_caps = args->max_readdir;
589 ceph_adjust_min_caps(client->min_caps);
590
591 /* subsystems */
592 err = ceph_monc_init(&client->monc, client);
593 if (err < 0)
594 goto fail_mempool;
595 err = ceph_osdc_init(&client->osdc, client);
596 if (err < 0)
597 goto fail_monc;
598 err = ceph_mdsc_init(&client->mdsc, client);
599 if (err < 0)
600 goto fail_osdc;
601 return client;
602
603fail_osdc:
604 ceph_osdc_stop(&client->osdc);
605fail_monc:
606 ceph_monc_stop(&client->monc);
607fail_mempool:
608 mempool_destroy(client->wb_pagevec_pool);
609fail_trunc_wq:
610 destroy_workqueue(client->trunc_wq);
611fail_pg_inv_wq:
612 destroy_workqueue(client->pg_inv_wq);
613fail_wb_wq:
614 destroy_workqueue(client->wb_wq);
615fail_bdi:
616 bdi_destroy(&client->backing_dev_info);
617fail:
618 kfree(client);
619 return ERR_PTR(err);
620}
621
/*
 * Tear down a client created by ceph_create_client(): stop the mds,
 * mon, and osd subsystems, then release workqueues, messenger, mempool,
 * and finally the mount args and the client itself.
 */
static void ceph_destroy_client(struct ceph_client *client)
{
	dout("destroy_client %p\n", client);

	/* unmount */
	ceph_mdsc_stop(&client->mdsc);
	ceph_monc_stop(&client->monc);
	ceph_osdc_stop(&client->osdc);

	/* give back the caps reserved in ceph_create_client() */
	ceph_adjust_min_caps(-client->min_caps);

	ceph_debugfs_client_cleanup(client);
	destroy_workqueue(client->wb_wq);
	destroy_workqueue(client->pg_inv_wq);
	destroy_workqueue(client->trunc_wq);

	/* msgr is only created later in mount, so may still be NULL */
	if (client->msgr)
		ceph_messenger_destroy(client->msgr);
	mempool_destroy(client->wb_pagevec_pool);

	destroy_mount_args(client->mount_args);

	kfree(client);
	dout("destroy_client %p done\n", client);
}
647
648/*
649 * Initially learn our fsid, or verify an fsid matches.
650 */
651int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
652{
653 if (client->have_fsid) {
654 if (ceph_fsid_compare(&client->fsid, fsid)) {
655 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
656 PR_FSID(&client->fsid), PR_FSID(fsid));
657 return -1;
658 }
659 } else {
660 pr_info("client%lld fsid " FSID_FORMAT "\n",
661 client->monc.auth->global_id, PR_FSID(fsid));
662 memcpy(&client->fsid, fsid, sizeof(*fsid));
663 ceph_debugfs_client_init(client);
664 client->have_fsid = true;
665 }
666 return 0;
667}
668
669/*
670 * true if we have the mon map (and have thus joined the cluster)
671 */
672static int have_mon_map(struct ceph_client *client)
673{
674 return client->monc.monmap && client->monc.monmap->epoch;
675}
676
677/*
678 * Bootstrap mount by opening the root directory. Note the mount
679 * @started time from caller, and time out if this takes too long.
680 */
681static struct dentry *open_root_dentry(struct ceph_client *client,
682 const char *path,
683 unsigned long started)
684{
685 struct ceph_mds_client *mdsc = &client->mdsc;
686 struct ceph_mds_request *req = NULL;
687 int err;
688 struct dentry *root;
689
690 /* open dir */
691 dout("open_root_inode opening '%s'\n", path);
692 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
693 if (IS_ERR(req))
694 return ERR_PTR(PTR_ERR(req));
695 req->r_path1 = kstrdup(path, GFP_NOFS);
696 req->r_ino1.ino = CEPH_INO_ROOT;
697 req->r_ino1.snap = CEPH_NOSNAP;
698 req->r_started = started;
699 req->r_timeout = client->mount_args->mount_timeout * HZ;
700 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
701 req->r_num_caps = 2;
702 err = ceph_mdsc_do_request(mdsc, NULL, req);
703 if (err == 0) {
704 dout("open_root_inode success\n");
705 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
706 client->sb->s_root == NULL)
707 root = d_alloc_root(req->r_target_inode);
708 else
709 root = d_obtain_alias(req->r_target_inode);
710 req->r_target_inode = NULL;
711 dout("open_root_inode success, root dentry is %p\n", root);
712 } else {
713 root = ERR_PTR(err);
714 }
715 ceph_mdsc_put_request(req);
716 return root;
717}
718
/*
 * mount: join the ceph cluster, and open root directory.
 *
 * Serialized by client->mount_mutex.  Lazily creates the messenger,
 * opens a monitor session, waits (bounded by the mount_timeout option)
 * for the mon map, then opens the root dentry and, if a sub-path was
 * given, the base mountpoint dentry.
 */
static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
		      const char *path)
{
	struct ceph_entity_addr *myaddr = NULL;
	int err;
	unsigned long timeout = client->mount_args->mount_timeout * HZ;
	unsigned long started = jiffies;  /* note the start time */
	struct dentry *root;

	dout("mount start\n");
	mutex_lock(&client->mount_mutex);

	/* initialize the messenger (once; kept across mount retries) */
	if (client->msgr == NULL) {
		/* bind to an explicit local address only with -o ip=... */
		if (ceph_test_opt(client, MYIP))
			myaddr = &client->mount_args->my_addr;
		client->msgr = ceph_messenger_create(myaddr);
		if (IS_ERR(client->msgr)) {
			err = PTR_ERR(client->msgr);
			client->msgr = NULL;
			goto out;
		}
		client->msgr->nocrc = ceph_test_opt(client, NOCRC);
	}

	/* open session, and wait for mon, mds, and osd maps */
	err = ceph_monc_open_session(&client->monc);
	if (err < 0)
		goto out;

	while (!have_mon_map(client)) {
		/* give up once the overall mount timeout has elapsed */
		err = -EIO;
		if (timeout && time_after_eq(jiffies, started + timeout))
			goto out;

		/* wait; 0 (timed out) or >0 (woken) just re-loops */
		dout("mount waiting for mon_map\n");
		err = wait_event_interruptible_timeout(client->auth_wq,
		       have_mon_map(client) || (client->auth_err < 0),
		       timeout);
		if (err == -EINTR || err == -ERESTARTSYS)
			goto out;
		if (client->auth_err < 0) {
			err = client->auth_err;
			goto out;
		}
	}

	dout("mount opening root\n");
	root = open_root_dentry(client, "", started);
	if (IS_ERR(root)) {
		err = PTR_ERR(root);
		goto out;
	}
	/* if the sb is shared, s_root already holds the reference */
	if (client->sb->s_root)
		dput(root);
	else
		client->sb->s_root = root;

	if (path[0] == 0) {
		/* mounting "/": mnt_root and s_root are the same dentry,
		 * so take an extra ref for mnt_root */
		dget(root);
	} else {
		dout("mount opening base mountpoint\n");
		root = open_root_dentry(client, path, started);
		if (IS_ERR(root)) {
			err = PTR_ERR(root);
			dput(client->sb->s_root);
			client->sb->s_root = NULL;
			goto out;
		}
	}

	mnt->mnt_root = root;
	mnt->mnt_sb = client->sb;

	client->mount_state = CEPH_MOUNT_MOUNTED;
	dout("mount success\n");
	err = 0;

out:
	mutex_unlock(&client->mount_mutex);
	return err;
}
805
806static int ceph_set_super(struct super_block *s, void *data)
807{
808 struct ceph_client *client = data;
809 int ret;
810
811 dout("set_super %p data %p\n", s, data);
812
813 s->s_flags = client->mount_args->sb_flags;
814 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
815
816 s->s_fs_info = client;
817 client->sb = s;
818
819 s->s_op = &ceph_super_ops;
820 s->s_export_op = &ceph_export_ops;
821
822 s->s_time_gran = 1000; /* 1000 ns == 1 us */
823
824 ret = set_anon_super(s, NULL); /* what is that second arg for? */
825 if (ret != 0)
826 goto fail;
827
828 return ret;
829
830fail:
831 s->s_fs_info = NULL;
832 client->sb = NULL;
833 return ret;
834}
835
/*
 * share superblock if same fs AND options
 *
 * sget() callback: return 1 if the existing sb's client talks to the
 * same cluster (matching fsid, or at least one shared monitor when no
 * fsid was given on the command line) with identical sb flags.
 */
static int ceph_compare_super(struct super_block *sb, void *data)
{
	struct ceph_client *new = data;
	struct ceph_mount_args *args = new->mount_args;
	struct ceph_client *other = ceph_sb_to_client(sb);
	int i;

	dout("ceph_compare_super %p\n", sb);
	if (args->flags & CEPH_OPT_FSID) {
		/* explicit fsid given: it must match exactly */
		if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
			dout("fsid doesn't match\n");
			return 0;
		}
	} else {
		/* do we share (a) monitor? */
		for (i = 0; i < new->monc.monmap->num_mon; i++)
			if (ceph_monmap_contains(other->monc.monmap,
					 &new->monc.monmap->mon_inst[i].addr))
				break;
		if (i == new->monc.monmap->num_mon) {
			dout("mon ip not part of monmap\n");
			return 0;
		}
		dout("mon ip matches existing sb %p\n", sb);
	}
	/* same cluster, but different mount flags still forces a new sb */
	if (args->sb_flags != other->mount_args->sb_flags) {
		dout("flags differ\n");
		return 0;
	}
	return 1;
}
870
871/*
872 * construct our own bdi so we can control readahead, etc.
873 */
874static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
875{
876 int err;
877
878 sb->s_bdi = &client->backing_dev_info;
879
880 /* set ra_pages based on rsize mount option? */
881 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
882 client->backing_dev_info.ra_pages =
883 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
884 >> PAGE_SHIFT;
885 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
886 return err;
887}
888
/*
 * Mount entry point: parse options, create (or reuse) a client,
 * get/share a superblock, and perform the actual mount.
 *
 * Error paths:
 *   out_final - nothing to undo beyond what was already cleaned up
 *   out       - sget() failed; destroy the client we created
 *   out_splat - sb exists; close mds sessions and tear the sb down
 *               (which also destroys the client via kill_sb)
 */
static int ceph_get_sb(struct file_system_type *fs_type,
		       int flags, const char *dev_name, void *data,
		       struct vfsmount *mnt)
{
	struct super_block *sb;
	struct ceph_client *client;
	int err;
	int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
	const char *path = NULL;
	struct ceph_mount_args *args;

	dout("ceph_get_sb\n");
	args = parse_mount_args(flags, data, dev_name, &path);
	if (IS_ERR(args)) {
		err = PTR_ERR(args);
		goto out_final;
	}

	/* create client (which we may/may not use) */
	client = ceph_create_client(args);
	if (IS_ERR(client)) {
		err = PTR_ERR(client);
		goto out_final;
	}

	/* -o noshare disables superblock sharing entirely */
	if (client->mount_args->flags & CEPH_OPT_NOSHARE)
		compare_super = NULL;
	sb = sget(fs_type, compare_super, ceph_set_super, client);
	if (IS_ERR(sb)) {
		err = PTR_ERR(sb);
		goto out;
	}

	if (ceph_client(sb) != client) {
		/* sget() matched an existing sb; drop our new client */
		ceph_destroy_client(client);
		client = ceph_client(sb);
		dout("get_sb got existing client %p\n", client);
	} else {
		dout("get_sb using new client %p\n", client);
		err = ceph_register_bdi(sb, client);
		if (err < 0)
			goto out_splat;
	}

	err = ceph_mount(client, mnt, path);
	if (err < 0)
		goto out_splat;
	dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
	     mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
	return 0;

out_splat:
	ceph_mdsc_close_sessions(&client->mdsc);
	/* sget() returned with s_umount held for write */
	up_write(&sb->s_umount);
	deactivate_super(sb);
	goto out_final;

out:
	ceph_destroy_client(client);
out_final:
	dout("ceph_get_sb fail %d\n", err);
	return err;
}
952
/*
 * Unmount: flush mds state before the sb goes read-only, then tear
 * down the superblock, bdi, and finally the client itself.
 */
static void ceph_kill_sb(struct super_block *s)
{
	struct ceph_client *client = ceph_sb_to_client(s);
	dout("kill_sb %p\n", s);
	/* must run while the fs is still writable */
	ceph_mdsc_pre_umount(&client->mdsc);
	kill_anon_super(s);    /* will call put_super after sb is r/o */
	/* only unregister if our private bdi was actually installed */
	if (s->s_bdi == &client->backing_dev_info)
		bdi_unregister(&client->backing_dev_info);
	bdi_destroy(&client->backing_dev_info);
	ceph_destroy_client(client);
}
964
/*
 * Filesystem type registration.  FS_RENAME_DOES_D_MOVE tells the VFS
 * that our rename implementation moves the dentry itself.
 */
static struct file_system_type ceph_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "ceph",
	.get_sb		= ceph_get_sb,
	.kill_sb	= ceph_kill_sb,
	.fs_flags	= FS_RENAME_DOES_D_MOVE,
};
972
/* two-level stringification so macro arguments are expanded first */
#define _STRINGIFY(x) #x
#define STRINGIFY(x) _STRINGIFY(x)
975
976static int __init init_ceph(void)
977{
978 int ret = 0;
979
980 ret = ceph_debugfs_init();
981 if (ret < 0)
982 goto out;
983
984 ret = ceph_msgr_init();
985 if (ret < 0)
986 goto out_debugfs;
987
988 ret = init_caches();
989 if (ret)
990 goto out_msgr;
991
992 ceph_caps_init();
993
994 ret = register_filesystem(&ceph_fs_type);
995 if (ret)
996 goto out_icache;
997
998 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
999 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
1000 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
1001 return 0;
1002
1003out_icache:
1004 destroy_caches();
1005out_msgr:
1006 ceph_msgr_exit();
1007out_debugfs:
1008 ceph_debugfs_cleanup();
1009out:
1010 return ret;
1011}
1012
/*
 * Module exit: tear everything down in reverse order of init_ceph().
 */
static void __exit exit_ceph(void)
{
	dout("exit_ceph\n");
	unregister_filesystem(&ceph_fs_type);
	ceph_caps_finalize();
	destroy_caches();
	ceph_msgr_exit();
	ceph_debugfs_cleanup();
}
1022
/* module entry/exit points and metadata */
module_init(init_ceph);
module_exit(exit_ceph);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
MODULE_DESCRIPTION("Ceph filesystem for Linux");
MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..65d12036b670
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,901 @@
1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15
16#include "types.h"
17#include "messenger.h"
18#include "msgpool.h"
19#include "mon_client.h"
20#include "mds_client.h"
21#include "osd_client.h"
22#include "ceph_fs.h"
23
/* f_type in struct statfs */
#define CEPH_SUPER_MAGIC 0x00c36400

/* large granularity for statfs utilization stats to facilitate
 * large volume sizes on 32-bit machines. */
#define CEPH_BLOCK_SHIFT 20  /* 1 MB */
#define CEPH_BLOCK       (1 << CEPH_BLOCK_SHIFT)

/*
 * mount options (bits in ceph_mount_args.flags)
 * NOTE(review): bit (1<<3) is intentionally skipped — presumably a
 * retired option; confirm before reusing it.
 */
#define CEPH_OPT_FSID             (1<<0)
#define CEPH_OPT_NOSHARE          (1<<1) /* don't share client with other sbs */
#define CEPH_OPT_MYIP             (1<<2) /* specified my ip */
#define CEPH_OPT_DIRSTAT          (1<<4) /* funky `cat dirname` for stats */
#define CEPH_OPT_RBYTES           (1<<5) /* dir st_bytes = rbytes */
#define CEPH_OPT_NOCRC            (1<<6) /* no data crc on writes */
#define CEPH_OPT_NOASYNCREADDIR   (1<<7) /* no dcache readdir */

#define CEPH_OPT_DEFAULT   (CEPH_OPT_RBYTES)

/* fix: the set macro embedded a trailing semicolon, which breaks
 * `if (x) ceph_set_opt(...); else ...` — make it a parenthesized
 * expression, matching ceph_test_opt */
#define ceph_set_opt(client, opt) \
	((client)->mount_args->flags |= CEPH_OPT_##opt)
#define ceph_test_opt(client, opt) \
	(!!((client)->mount_args->flags & CEPH_OPT_##opt))
50
/*
 * Parsed mount options; allocated by parse_mount_args() and freed via
 * destroy_mount_args().  Shared by all superblocks using the client.
 */
struct ceph_mount_args {
	int sb_flags;			/* copied into sb->s_flags */
	int num_mon;			/* entries in mon_addr[] */
	struct ceph_entity_addr *mon_addr;
	int flags;			/* CEPH_OPT_* bits */
	int mount_timeout;		/* seconds (multiplied by HZ at use) */
	int osd_idle_ttl;
	int caps_wanted_delay_min, caps_wanted_delay_max;
	struct ceph_fsid fsid;		/* valid iff CEPH_OPT_FSID is set */
	struct ceph_entity_addr my_addr; /* valid iff CEPH_OPT_MYIP is set */
	int wsize;
	int rsize;			/* max readahead */
	int max_readdir;		/* max readdir size */
	int congestion_kb;		/* NOTE(review): original comment said
					 * "max readdir size", apparently
					 * copy-pasted from the field above;
					 * looks like a writeback congestion
					 * threshold in KB — confirm */
	int osd_timeout;
	int osd_keepalive_timeout;
	char *snapdir_name;		/* default ".snap" */
	char *name;
	char *secret;
	int cap_release_safety;
};
72
/*
 * defaults for the tunables in ceph_mount_args
 */
#define CEPH_MOUNT_TIMEOUT_DEFAULT  60
#define CEPH_OSD_TIMEOUT_DEFAULT    60  /* seconds */
#define CEPH_OSD_KEEPALIVE_DEFAULT  5
#define CEPH_OSD_IDLE_TTL_DEFAULT    60
#define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */

/* hard caps on message sizes we will accept */
#define CEPH_MSG_MAX_FRONT_LEN	(16*1024*1024)
#define CEPH_MSG_MAX_DATA_LEN	(16*1024*1024)

#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
#define CEPH_AUTH_NAME_DEFAULT   "guest"

/*
 * Delay telling the MDS we no longer want caps, in case we reopen
 * the file.  Delay a minimum amount of time, even if we send a cap
 * message for some other reason.  Otherwise, take the opportunity to
 * update the mds to avoid sending another message later.
 */
#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */


/* mount state (ceph_client.mount_state) */
enum {
	CEPH_MOUNT_MOUNTING,
	CEPH_MOUNT_MOUNTED,
	CEPH_MOUNT_UNMOUNTING,
	CEPH_MOUNT_UNMOUNTED,
	CEPH_MOUNT_SHUTDOWN,
};

/*
 * subtract jiffies
 *
 * BUGs if b is after a (i.e. the difference would be negative).
 */
static inline unsigned long time_sub(unsigned long a, unsigned long b)
{
	BUG_ON(time_after(b, a));
	return (long)a - (long)b;
}
115
/*
 * per-filesystem client state
 *
 * possibly shared by multiple mount points, if they are
 * mounting the same ceph filesystem/cluster.
 */
struct ceph_client {
	struct ceph_fsid fsid;		/* learned on first mon contact */
	bool have_fsid;			/* fsid above is valid */

	struct mutex mount_mutex;       /* serialize mount attempts */
	struct ceph_mount_args *mount_args;

	struct super_block *sb;

	unsigned long mount_state;	/* CEPH_MOUNT_* */
	wait_queue_head_t auth_wq;	/* woken on auth success/failure */

	int auth_err;			/* <0 if authentication failed */

	int min_caps;                   /* min caps i added */

	struct ceph_messenger *msgr;   /* messenger instance; created lazily */
	struct ceph_mon_client monc;
	struct ceph_mds_client mdsc;
	struct ceph_osd_client osdc;

	/* writeback */
	mempool_t *wb_pagevec_pool;
	struct workqueue_struct *wb_wq;		/* writeback */
	struct workqueue_struct *pg_inv_wq;	/* page invalidation */
	struct workqueue_struct *trunc_wq;	/* deferred truncation */
	atomic_long_t writeback_count;

	struct backing_dev_info backing_dev_info;

#ifdef CONFIG_DEBUG_FS
	struct dentry *debugfs_monmap;
	struct dentry *debugfs_mdsmap, *debugfs_osdmap;
	struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
	struct dentry *debugfs_congestion_kb;
	struct dentry *debugfs_bdi;
#endif
};

/*
 * sb -> client accessor.
 * NOTE(review): duplicates ceph_sb_to_client() defined later in this
 * header; consider consolidating on one name.
 */
static inline struct ceph_client *ceph_client(struct super_block *sb)
{
	return sb->s_fs_info;
}
165
166
/*
 * File i/o capability.  This tracks shared state with the metadata
 * server that allows us to cache or writeback attributes or to read
 * and write data.  For any given inode, we should have one or more
 * capabilities, one issued by each metadata server, and our
 * cumulative access is the OR of all issued capabilities.
 *
 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
 * session capability lists.
 */
struct ceph_cap {
	struct ceph_inode_info *ci;
	struct rb_node ci_node;          /* per-ci cap tree */
	struct ceph_mds_session *session;
	struct list_head session_caps;   /* per-session caplist */
	int mds;			 /* mds that issued this cap */
	u64 cap_id;       /* unique cap id (mds provided) */
	int issued;       /* latest, from the mds */
	int implemented;  /* implemented superset of issued (for revocation) */
	int mds_wanted;	  /* what the mds thinks we want */
	u32 seq, issue_seq, mseq;
	u32 cap_gen;      /* active/stale cycle */
	unsigned long last_used;
	struct list_head caps_item;
};

/* flags for cap-check requests */
#define CHECK_CAPS_NODELAY    1  /* do not delay any further */
#define CHECK_CAPS_AUTHONLY   2  /* only check auth cap */
#define CHECK_CAPS_FLUSH      4  /* flush any dirty caps */
196
/*
 * Snapped cap state that is pending flush to mds.  When a snapshot occurs,
 * we first complete any in-process sync writes and writeback any dirty
 * data before flushing the snapped state (tracked here) back to the MDS.
 */
struct ceph_cap_snap {
	atomic_t nref;			/* released via ceph_put_cap_snap() */
	struct ceph_inode_info *ci;
	struct list_head ci_item, flushing_item;

	u64 follows, flush_tid;
	int issued, dirty;
	struct ceph_snap_context *context;

	/* attributes captured at snapshot time */
	mode_t mode;
	uid_t uid;
	gid_t gid;

	void *xattr_blob;
	int xattr_len;
	u64 xattr_version;

	u64 size;
	struct timespec mtime, atime, ctime;
	u64 time_warp_seq;
	int writing;   /* a sync write is still in progress */
	int dirty_pages;     /* dirty pages awaiting writeback */
};

/*
 * Drop a cap_snap reference, freeing it when the last ref goes away.
 * NOTE(review): kfree() here does not free xattr_blob — if the blob is
 * owned by the cap_snap this leaks; confirm ownership at the call sites.
 */
static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
{
	if (atomic_dec_and_test(&capsnap->nref))
		kfree(capsnap);
}
231
/*
 * The frag tree describes how a directory is fragmented, potentially across
 * multiple metadata servers.  It is also used to indicate points where
 * metadata authority is delegated, and whether/where metadata is replicated.
 *
 * A _leaf_ frag will be present in the i_fragtree IFF there is
 * delegation info.  That is, if mds >= 0 || ndist > 0.
 */
#define CEPH_MAX_DIRFRAG_REP 4

struct ceph_inode_frag {
	struct rb_node node;		/* in ceph_inode_info.i_fragtree */

	/* fragtree state */
	u32 frag;
	int split_by;          /* i.e. 2^(split_by) children */

	/* delegation and replication info */
	int mds;               /* -1 if same authority as parent */
	int ndist;             /* >0 if replicated */
	int dist[CEPH_MAX_DIRFRAG_REP];
};

/*
 * We cache inode xattrs as an encoded blob until they are first used,
 * at which point we parse them into an rbtree.
 */
struct ceph_inode_xattr {
	struct rb_node node;		/* in ceph_inode_xattrs_info.index */

	const char *name;
	int name_len;
	const char *val;
	int val_len;
	int dirty;			/* needs flush to mds */

	/* whether name/val point into the blob or were allocated */
	int should_free_name;
	int should_free_val;
};

struct ceph_inode_xattrs_info {
	/*
	 * (still encoded) xattr blob. we avoid the overhead of parsing
	 * this until someone actually calls getxattr, etc.
	 *
	 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
	 * NULL means we don't know.
	 */
	struct ceph_buffer *blob, *prealloc_blob;

	struct rb_root index;		/* parsed entries */
	bool dirty;			/* any entry needs flush */
	int count;			/* entries in index */
	int names_size;			/* total bytes of names */
	int vals_size;			/* total bytes of values */
	u64 version, index_version;
};
289
/*
 * Ceph inode.
 */
/* i_ceph_flags bits */
#define CEPH_I_COMPLETE  1  /* we have complete directory cached */
#define CEPH_I_NODELAY   4  /* do not delay cap release */
#define CEPH_I_FLUSH     8  /* do not delay flush of dirty metadata */
#define CEPH_I_NOFLUSH  16  /* do not flush dirty caps */

struct ceph_inode_info {
	struct ceph_vino i_vino;   /* ceph ino + snap */

	u64 i_version;
	u32 i_time_warp_seq;

	unsigned i_ceph_flags;		/* CEPH_I_* above; see helpers below */
	unsigned long i_release_count;

	struct ceph_file_layout i_layout;
	char *i_symlink;

	/* for dirs */
	struct timespec i_rctime;
	u64 i_rbytes, i_rfiles, i_rsubdirs;
	u64 i_files, i_subdirs;
	u64 i_max_offset;  /* largest readdir offset, set with I_COMPLETE */

	struct rb_root i_fragtree;	/* of ceph_inode_frag */
	struct mutex i_fragtree_mutex;

	struct ceph_inode_xattrs_info i_xattrs;

	/* capabilities.  protected _both_ by i_lock and cap->session's
	 * s_mutex. */
	struct rb_root i_caps;           /* cap list */
	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
	unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
	struct list_head i_dirty_item, i_flushing_item;
	u64 i_cap_flush_seq;
	/* we need to track cap writeback on a per-cap-bit basis, to allow
	 * overlapping, pipelined cap flushes to the mds.  we can probably
	 * reduce the tid to 8 bits if we're concerned about inode size. */
	u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
	unsigned long i_hold_caps_min; /* jiffies */
	unsigned long i_hold_caps_max; /* jiffies */
	struct list_head i_cap_delay_list;  /* for delayed cap release to mds */
	int i_cap_exporting_mds;         /* to handle cap migration between */
	unsigned i_cap_exporting_mseq;   /*  mds's. */
	unsigned i_cap_exporting_issued;
	struct ceph_cap_reservation i_cap_migration_resv;
	struct list_head i_cap_snaps;   /* snapped state pending flush to mds */
	struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 */
	unsigned i_snap_caps;           /* cap bits for snapped files */

	int i_nr_by_mode[CEPH_FILE_MODE_NUM];  /* open file counts */

	u32 i_truncate_seq;        /* last truncate to smaller size */
	u64 i_truncate_size;       /*  and the size we last truncated down to */
	int i_truncate_pending;    /*  still need to call vmtruncate */

	u64 i_max_size;            /* max file size authorized by mds */
	u64 i_reported_size; /* (max_)size reported to or requested of mds */
	u64 i_wanted_max_size;     /* offset we'd like to write too */
	u64 i_requested_max_size;  /* max_size we've requested */

	/* held references to caps */
	int i_pin_ref;
	int i_rd_ref, i_rdcache_ref, i_wr_ref;
	int i_wrbuffer_ref, i_wrbuffer_ref_head;
	u32 i_shared_gen;       /* increment each time we get FILE_SHARED */
	u32 i_rdcache_gen;      /* we increment this each time we get
				   FILE_CACHE.  If it's non-zero, we
				   _may_ have cached pages. */
	u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */

	struct list_head i_unsafe_writes; /* uncommitted sync writes */
	struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
	spinlock_t i_unsafe_lock;

	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
	int i_snap_realm_counter; /* snap realm (if caps) */
	struct list_head i_snap_realm_item;
	struct list_head i_snap_flush_item;

	struct work_struct i_wb_work;  /* writeback work */
	struct work_struct i_pg_inv_work;  /* page invalidation work */

	struct work_struct i_vmtruncate_work;

	struct inode vfs_inode; /* at end */
};

/* VFS inode -> ceph inode */
static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
{
	return container_of(inode, struct ceph_inode_info, vfs_inode);
}

/* clear i_ceph_flags bits under i_lock */
static inline void ceph_i_clear(struct inode *inode, unsigned mask)
{
	struct ceph_inode_info *ci = ceph_inode(inode);

	spin_lock(&inode->i_lock);
	ci->i_ceph_flags &= ~mask;
	spin_unlock(&inode->i_lock);
}

/* set i_ceph_flags bits under i_lock */
static inline void ceph_i_set(struct inode *inode, unsigned mask)
{
	struct ceph_inode_info *ci = ceph_inode(inode);

	spin_lock(&inode->i_lock);
	ci->i_ceph_flags |= mask;
	spin_unlock(&inode->i_lock);
}

/*
 * Test that ALL bits in @mask are set.  Deliberately lockless: only a
 * memory barrier, so the result may be momentarily stale relative to
 * concurrent ceph_i_set/ceph_i_clear callers.
 */
static inline bool ceph_i_test(struct inode *inode, unsigned mask)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	bool r;

	smp_mb();
	r = (ci->i_ceph_flags & mask) == mask;
	return r;
}
414
415
/* find a specific frag @f */
extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
						u32 f);

/*
 * choose fragment for value @v.  copy frag content to pfrag, if leaf
 * exists
 */
extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
			    struct ceph_inode_frag *pfrag,
			    int *found);

/*
 * Ceph dentry state, hung off dentry->d_fsdata.
 */
struct ceph_dentry_info {
	struct ceph_mds_session *lease_session;
	u32 lease_gen, lease_shared_gen;
	u32 lease_seq;
	unsigned long lease_renew_after, lease_renew_from;
	struct list_head lru;		/* on the mds client's dentry lru */
	struct dentry *dentry;
	u64 time;
	u64 offset;
};

static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
{
	return (struct ceph_dentry_info *)dentry->d_fsdata;
}

/* pack a (frag, offset) readdir position into a single file offset */
static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
{
	return ((loff_t)frag << 32) | (loff_t)off;
}
451
/*
 * ino_t is <64 bits on many architectures, blech.
 *
 * don't include snap in ino hash, at least for now.
 */
static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
{
	ino_t ino = (ino_t)vino.ino;  /* ^ (vino.snap << 20); */
#if BITS_PER_LONG == 32
	/* fold the high bits in, and avoid returning 0 */
	ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
	if (!ino)
		ino = 1;
#endif
	return ino;
}

/* iget5 set callback: stash the full vino and derive i_ino from it */
static inline int ceph_set_ino_cb(struct inode *inode, void *data)
{
	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
	return 0;
}

static inline struct ceph_vino ceph_vino(struct inode *inode)
{
	return ceph_inode(inode)->i_vino;
}

/* for printf-style formatting (expands to two args: ino, snap) */
#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap

static inline u64 ceph_ino(struct inode *inode)
{
	return ceph_inode(inode)->i_vino.ino;
}
static inline u64 ceph_snap(struct inode *inode)
{
	return ceph_inode(inode)->i_vino.snap;
}

/* ilookup5 test callback: match on the full (ino, snap) pair */
static inline int ceph_ino_compare(struct inode *inode, void *data)
{
	struct ceph_vino *pvino = (struct ceph_vino *)data;
	struct ceph_inode_info *ci = ceph_inode(inode);
	return ci->i_vino.ino == pvino->ino &&
		ci->i_vino.snap == pvino->snap;
}

static inline struct inode *ceph_find_inode(struct super_block *sb,
					    struct ceph_vino vino)
{
	ino_t t = ceph_vino_to_ino(vino);
	return ilookup5(sb, t, ceph_ino_compare, &vino);
}
506
507
/*
 * caps helpers
 */
static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
{
	return !RB_EMPTY_ROOT(&ci->i_caps);
}

extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
				    struct ceph_cap *cap);

/* locked wrapper for __ceph_caps_issued() */
static inline int ceph_caps_issued(struct ceph_inode_info *ci)
{
	int issued;
	spin_lock(&ci->vfs_inode.i_lock);
	issued = __ceph_caps_issued(ci, NULL);
	spin_unlock(&ci->vfs_inode.i_lock);
	return issued;
}

/* locked wrapper for __ceph_caps_issued_mask() */
static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
					int touch)
{
	int r;
	spin_lock(&ci->vfs_inode.i_lock);
	r = __ceph_caps_issued_mask(ci, mask, touch);
	spin_unlock(&ci->vfs_inode.i_lock);
	return r;
}

/* dirty or in-flight-to-mds cap bits; caller holds i_lock */
static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
{
	return ci->i_dirty_caps | ci->i_flushing_caps;
}
extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);

extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
extern int __ceph_caps_used(struct ceph_inode_info *ci);

extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);

/*
 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
 */
static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
{
	int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
	if (w & CEPH_CAP_FILE_BUFFER)
		w |= CEPH_CAP_FILE_EXCL;  /* we want EXCL if dirty data */
	return w;
}

/* what the mds thinks we want */
extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);

extern void ceph_caps_init(void);
extern void ceph_caps_finalize(void);
extern void ceph_adjust_min_caps(int delta);
extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
extern void ceph_reservation_status(struct ceph_client *client,
				    int *total, int *avail, int *used,
				    int *reserved, int *min);

static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
{
	return (struct ceph_client *)inode->i_sb->s_fs_info;
}

static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
{
	return (struct ceph_client *)sb->s_fs_info;
}
583
584
/*
 * we keep buffered readdir results attached to file->private_data
 */
struct ceph_file_info {
	int fmode;     /* initialized on open */

	/* readdir: position within the dir */
	u32 frag;
	struct ceph_mds_request *last_readdir;	/* cached readdir reply */
	int at_end;				/* reached end of dir */

	/* readdir: position within a frag */
	unsigned offset;       /* offset of last chunk, adjusted for . and .. */
	u64 next_offset;       /* offset of next chunk (last_name's + 1) */
	char *last_name;       /* last entry in previous chunk */
	struct dentry *dentry; /* next dentry (for dcache readdir) */
	unsigned long dir_release_count;

	/* used for -o dirstat read() on directory thing */
	char *dir_info;
	int dir_info_len;
};
607
608
609
610/*
611 * snapshots
612 */
613
/*
 * A "snap context" is the set of existing snapshots when we
 * write data.  It is used by the OSD to guide its COW behavior.
 *
 * The ceph_snap_context is refcounted, and attached to each dirty
 * page, indicating which context the dirty data belonged when it was
 * dirtied.
 */
struct ceph_snap_context {
	atomic_t nref;		/* get/put via the helpers below */
	u64 seq;
	int num_snaps;		/* entries in snaps[] */
	u64 snaps[];		/* flexible array of snap ids */
};
628
629static inline struct ceph_snap_context *
630ceph_get_snap_context(struct ceph_snap_context *sc)
631{
632 /*
633 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
634 atomic_read(&sc->nref)+1);
635 */
636 if (sc)
637 atomic_inc(&sc->nref);
638 return sc;
639}
640
641static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
642{
643 if (!sc)
644 return;
645 /*
646 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
647 atomic_read(&sc->nref)-1);
648 */
649 if (atomic_dec_and_test(&sc->nref)) {
650 /*printk(" deleting snap_context %p\n", sc);*/
651 kfree(sc);
652 }
653}
654
/*
 * A "snap realm" describes a subset of the file hierarchy sharing
 * the same set of snapshots that apply to it.  The realms themselves
 * are organized into a hierarchy, such that children inherit (some of)
 * the snapshots of their parents.
 *
 * All inodes within the realm that have capabilities are linked into a
 * per-realm list.
 */
struct ceph_snap_realm {
	u64 ino;
	atomic_t nref;			/* get/put via the snap.c helpers */
	struct rb_node node;		/* in the mdsc realm tree */

	u64 created, seq;
	u64 parent_ino;
	u64 parent_since;   /* snapid when our current parent became so */

	u64 *prior_parent_snaps;      /* snaps inherited from any parents we */
	int num_prior_parent_snaps;   /*  had prior to parent_since */
	u64 *snaps;                   /* snaps specific to this realm */
	int num_snaps;

	struct ceph_snap_realm *parent;
	struct list_head children;       /* list of child realms */
	struct list_head child_item;

	struct list_head empty_item;     /* if i have ref==0 */

	/* the current set of snaps for this realm */
	struct ceph_snap_context *cached_context;

	struct list_head inodes_with_caps;
	spinlock_t inodes_with_caps_lock;	/* protects the list above */
};
690
691
692
693/*
694 * calculate the number of pages a given length and offset map onto,
695 * if we align the data.
696 */
697static inline int calc_pages_for(u64 off, u64 len)
698{
699 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
700 (off >> PAGE_CACHE_SHIFT);
701}
702
703
704
/* snap.c */
struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
					       u64 ino);
extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
				struct ceph_snap_realm *realm);
extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
				struct ceph_snap_realm *realm);
extern int ceph_update_snap_trace(struct ceph_mds_client *m,
				  void *p, void *e, bool deletion);
extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
			     struct ceph_mds_session *session,
			     struct ceph_msg *msg);
extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
				struct ceph_snap_context *snapc);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
				  struct ceph_cap_snap *capsnap);
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);

/*
 * a cap_snap is "pending" if it is still awaiting an in-progress
 * sync write (that may/may not still update size, mtime, etc.).
 */
static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
{
	return !list_empty(&ci->i_cap_snaps) &&
		list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
			   ci_item)->writing;
}


/* super.c */
extern struct kmem_cache *ceph_inode_cachep;
extern struct kmem_cache *ceph_cap_cachep;
extern struct kmem_cache *ceph_dentry_cachep;
extern struct kmem_cache *ceph_file_cachep;

extern const char *ceph_msg_type_name(int type);
extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);

/* printf helpers for the 16-byte fsid (expand to 16 %02x args) */
#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
	"%02x%02x%02x%02x%02x%02x"
#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
		(f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7],    \
		(f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
		(f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
751/* inode.c */
752extern const struct inode_operations ceph_file_iops;
753
754extern struct inode *ceph_alloc_inode(struct super_block *sb);
755extern void ceph_destroy_inode(struct inode *inode);
756
757extern struct inode *ceph_get_inode(struct super_block *sb,
758 struct ceph_vino vino);
759extern struct inode *ceph_get_snapdir(struct inode *parent);
760extern int ceph_fill_file_size(struct inode *inode, int issued,
761 u32 truncate_seq, u64 truncate_size, u64 size);
762extern void ceph_fill_file_time(struct inode *inode, int issued,
763 u64 time_warp_seq, struct timespec *ctime,
764 struct timespec *mtime, struct timespec *atime);
765extern int ceph_fill_trace(struct super_block *sb,
766 struct ceph_mds_request *req,
767 struct ceph_mds_session *session);
768extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
769 struct ceph_mds_session *session);
770
771extern int ceph_inode_holds_cap(struct inode *inode, int mask);
772
773extern int ceph_inode_set_size(struct inode *inode, loff_t size);
774extern void __ceph_do_pending_vmtruncate(struct inode *inode);
775extern void ceph_queue_vmtruncate(struct inode *inode);
776
777extern void ceph_queue_invalidate(struct inode *inode);
778extern void ceph_queue_writeback(struct inode *inode);
779
780extern int ceph_do_getattr(struct inode *inode, int mask);
781extern int ceph_permission(struct inode *inode, int mask);
782extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
783extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
784 struct kstat *stat);
785
786/* xattr.c */
787extern int ceph_setxattr(struct dentry *, const char *, const void *,
788 size_t, int);
789extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
790extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
791extern int ceph_removexattr(struct dentry *, const char *);
792extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
793extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
794
795/* caps.c */
796extern const char *ceph_cap_string(int c);
797extern void ceph_handle_caps(struct ceph_mds_session *session,
798 struct ceph_msg *msg);
799extern int ceph_add_cap(struct inode *inode,
800 struct ceph_mds_session *session, u64 cap_id,
801 int fmode, unsigned issued, unsigned wanted,
802 unsigned cap, unsigned seq, u64 realmino, int flags,
803 struct ceph_cap_reservation *caps_reservation);
804extern void __ceph_remove_cap(struct ceph_cap *cap);
/* remove a cap, wrapping the unlocked helper in the owning inode's i_lock */
static inline void ceph_remove_cap(struct ceph_cap *cap)
{
	struct inode *inode = &cap->ci->vfs_inode;
	spin_lock(&inode->i_lock);
	__ceph_remove_cap(cap);
	spin_unlock(&inode->i_lock);
}
812extern void ceph_put_cap(struct ceph_cap *cap);
813
814extern void ceph_queue_caps_release(struct inode *inode);
815extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
816extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
817extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
818 struct ceph_mds_session *session);
819extern int ceph_get_cap_mds(struct inode *inode);
820extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
821extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
822extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
823 struct ceph_snap_context *snapc);
824extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
825 struct ceph_mds_session **psession);
826extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
827 struct ceph_mds_session *session);
828extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
829extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
830
831extern int ceph_encode_inode_release(void **p, struct inode *inode,
832 int mds, int drop, int unless, int force);
833extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
834 int mds, int drop, int unless);
835
836extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
837 int *got, loff_t endoff);
838
839/* for counting open files by mode */
/* count one more open file of the given mode; NOTE(review): the double
 * underscore suggests the caller holds i_lock — confirm at call sites */
static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
{
	ci->i_nr_by_mode[mode]++;
}
844extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
845
846/* addr.c */
847extern const struct address_space_operations ceph_aops;
848extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
849
850/* file.c */
851extern const struct file_operations ceph_file_fops;
852extern const struct address_space_operations ceph_aops;
853extern int ceph_open(struct inode *inode, struct file *file);
854extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
855 struct nameidata *nd, int mode,
856 int locked_dir);
857extern int ceph_release(struct inode *inode, struct file *filp);
858extern void ceph_release_page_vector(struct page **pages, int num_pages);
859
860/* dir.c */
861extern const struct file_operations ceph_dir_fops;
862extern const struct inode_operations ceph_dir_iops;
863extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
864 ceph_snapdir_dentry_ops;
865
866extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
867extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
868 struct dentry *dentry, int err);
869
870extern void ceph_dentry_lru_add(struct dentry *dn);
871extern void ceph_dentry_lru_touch(struct dentry *dn);
872extern void ceph_dentry_lru_del(struct dentry *dn);
873
874/*
875 * our d_ops vary depending on whether the inode is live,
876 * snapshotted (read-only), or a virtual ".snap" directory.
877 */
878int ceph_init_dentry(struct dentry *dentry);
879
880
881/* ioctl.c */
882extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
883
884/* export.c */
885extern const struct export_operations ceph_export_ops;
886
887/* debugfs.c */
888extern int ceph_debugfs_init(void);
889extern void ceph_debugfs_cleanup(void);
890extern int ceph_debugfs_client_init(struct ceph_client *client);
891extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
892
893static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
894{
895 if (dentry && dentry->d_parent)
896 return dentry->d_parent->d_inode;
897
898 return NULL;
899}
900
901#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
struct ceph_vino {
	u64 ino;	/* inode number */
	u64 snap;	/* snapshot id; NOTE(review): presumably CEPH_NOSNAP
			 * for the head (live) inode — confirm */
};
21
22
23/* context for the caps reservation mechanism */
struct ceph_cap_reservation {
	int count;	/* number of caps reserved under this context */
};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..37d6ce645691
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,844 @@
1#include "ceph_debug.h"
2#include "super.h"
3#include "decode.h"
4
5#include <linux/xattr.h>
6
7static bool ceph_is_valid_xattr(const char *name)
8{
9 return !strncmp(name, XATTR_SECURITY_PREFIX,
10 XATTR_SECURITY_PREFIX_LEN) ||
11 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
12 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
13}
14
15/*
16 * These define virtual xattrs exposing the recursive directory
17 * statistics and layout metadata.
18 */
struct ceph_vxattr_cb {
	bool readonly;		/* true: setxattr/removexattr return -EOPNOTSUPP */
	char *name;		/* full xattr name; NULL terminates a table */
	size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
			      size_t size);	/* format value into val,
						 * return the length */
};
25
26/* directories */
27
/* "entries": direct children of this dir (files + subdirs) */
static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
					size_t size)
{
	return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
}
33
/* "files": direct file children of this dir */
static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
				      size_t size)
{
	return snprintf(val, size, "%lld", ci->i_files);
}
39
/* "subdirs": direct subdirectory children of this dir */
static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
					size_t size)
{
	return snprintf(val, size, "%lld", ci->i_subdirs);
}
45
/* "rentries": i_rfiles + i_rsubdirs (the r* fields look like recursive
 * subtree totals supplied by the MDS — see rctime below) */
static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
					 size_t size)
{
	return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
}
51
/* "rfiles": recursive file count (i_rfiles) */
static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
				       size_t size)
{
	return snprintf(val, size, "%lld", ci->i_rfiles);
}
57
/* "rsubdirs": recursive subdirectory count (i_rsubdirs) */
static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
					 size_t size)
{
	return snprintf(val, size, "%lld", ci->i_rsubdirs);
}
63
/* "rbytes": recursive byte total (i_rbytes) */
static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
				       size_t size)
{
	return snprintf(val, size, "%lld", ci->i_rbytes);
}
69
/* "rctime": i_rctime formatted as "sec.nsec" */
static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
				       size_t size)
{
	return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
			(long)ci->i_rctime.tv_nsec);
}
76
/* virtual xattrs exposed on directories; terminated by a NULL name */
static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
	{ true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
	{ true, "user.ceph.dir.files", ceph_vxattrcb_files},
	{ true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
	{ true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
	{ true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
	{ true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
	{ true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
	{ true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
	{ true, NULL, NULL }
};
88
89/* files */
90
91static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
92 size_t size)
93{
94 int ret;
95
96 ret = snprintf(val, size,
97 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
98 (unsigned long long)ceph_file_layout_su(ci->i_layout),
99 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
100 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
101 if (ceph_file_layout_pg_preferred(ci->i_layout))
102 ret += snprintf(val + ret, size, "preferred_osd=%lld\n",
103 (unsigned long long)ceph_file_layout_pg_preferred(
104 ci->i_layout));
105 return ret;
106}
107
108static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
109 { true, "user.ceph.layout", ceph_vxattrcb_layout},
110 { NULL, NULL }
111};
112
113static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
114{
115 if (S_ISDIR(inode->i_mode))
116 return ceph_dir_vxattrs;
117 else if (S_ISREG(inode->i_mode))
118 return ceph_file_vxattrs;
119 return NULL;
120}
121
122static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
123 const char *name)
124{
125 do {
126 if (strcmp(vxattr->name, name) == 0)
127 return vxattr;
128 vxattr++;
129 } while (vxattr->name);
130 return NULL;
131}
132
133static int __set_xattr(struct ceph_inode_info *ci,
134 const char *name, int name_len,
135 const char *val, int val_len,
136 int dirty,
137 int should_free_name, int should_free_val,
138 struct ceph_inode_xattr **newxattr)
139{
140 struct rb_node **p;
141 struct rb_node *parent = NULL;
142 struct ceph_inode_xattr *xattr = NULL;
143 int c;
144 int new = 0;
145
146 p = &ci->i_xattrs.index.rb_node;
147 while (*p) {
148 parent = *p;
149 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
150 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
151 if (c < 0)
152 p = &(*p)->rb_left;
153 else if (c > 0)
154 p = &(*p)->rb_right;
155 else {
156 if (name_len == xattr->name_len)
157 break;
158 else if (name_len < xattr->name_len)
159 p = &(*p)->rb_left;
160 else
161 p = &(*p)->rb_right;
162 }
163 xattr = NULL;
164 }
165
166 if (!xattr) {
167 new = 1;
168 xattr = *newxattr;
169 xattr->name = name;
170 xattr->name_len = name_len;
171 xattr->should_free_name = should_free_name;
172
173 ci->i_xattrs.count++;
174 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
175 } else {
176 kfree(*newxattr);
177 *newxattr = NULL;
178 if (xattr->should_free_val)
179 kfree((void *)xattr->val);
180
181 if (should_free_name) {
182 kfree((void *)name);
183 name = xattr->name;
184 }
185 ci->i_xattrs.names_size -= xattr->name_len;
186 ci->i_xattrs.vals_size -= xattr->val_len;
187 }
188 if (!xattr) {
189 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
190 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
191 xattr->val);
192 return -ENOMEM;
193 }
194 ci->i_xattrs.names_size += name_len;
195 ci->i_xattrs.vals_size += val_len;
196 if (val)
197 xattr->val = val;
198 else
199 xattr->val = "";
200
201 xattr->val_len = val_len;
202 xattr->dirty = dirty;
203 xattr->should_free_val = (val && should_free_val);
204
205 if (new) {
206 rb_link_node(&xattr->node, parent, p);
207 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
208 dout("__set_xattr_val p=%p\n", p);
209 }
210
211 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
212 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
213
214 return 0;
215}
216
217static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
218 const char *name)
219{
220 struct rb_node **p;
221 struct rb_node *parent = NULL;
222 struct ceph_inode_xattr *xattr = NULL;
223 int c;
224
225 p = &ci->i_xattrs.index.rb_node;
226 while (*p) {
227 parent = *p;
228 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
229 c = strncmp(name, xattr->name, xattr->name_len);
230 if (c < 0)
231 p = &(*p)->rb_left;
232 else if (c > 0)
233 p = &(*p)->rb_right;
234 else {
235 dout("__get_xattr %s: found %.*s\n", name,
236 xattr->val_len, xattr->val);
237 return xattr;
238 }
239 }
240
241 dout("__get_xattr %s: not found\n", name);
242
243 return NULL;
244}
245
/* free an xattr node, plus any name/value buffers it owns */
static void __free_xattr(struct ceph_inode_xattr *xattr)
{
	BUG_ON(!xattr);

	if (xattr->should_free_name)
		kfree((void *)xattr->name);
	if (xattr->should_free_val)
		kfree((void *)xattr->val);

	kfree(xattr);
}
257
/*
 * Unlink @xattr from the index, free it (and any owned name/value
 * buffers), and adjust the accounted sizes and count.
 *
 * NOTE(review): a NULL @xattr (name not found) yields -EOPNOTSUPP,
 * which ceph_removexattr() then returns to userspace; -ENODATA would
 * be the more conventional errno for a missing attribute — confirm
 * before changing, since userspace may observe it.
 */
static int __remove_xattr(struct ceph_inode_info *ci,
			  struct ceph_inode_xattr *xattr)
{
	if (!xattr)
		return -EOPNOTSUPP;

	rb_erase(&xattr->node, &ci->i_xattrs.index);

	if (xattr->should_free_name)
		kfree((void *)xattr->name);
	if (xattr->should_free_val)
		kfree((void *)xattr->val);

	ci->i_xattrs.names_size -= xattr->name_len;
	ci->i_xattrs.vals_size -= xattr->val_len;
	ci->i_xattrs.count--;
	kfree(xattr);

	return 0;
}
278
/*
 * Look up @name and remove it from the index.  Returns 0 on success or
 * __remove_xattr()'s error if the name is absent.
 *
 * Fixes: dropped the unused local "p" (it was assigned the rb-tree
 * root and never read) and the redundant "err" temporary.
 */
static int __remove_xattr_by_name(struct ceph_inode_info *ci,
			   const char *name)
{
	struct ceph_inode_xattr *xattr;

	xattr = __get_xattr(ci, name);
	return __remove_xattr(ci, xattr);
}
291
292static char *__copy_xattr_names(struct ceph_inode_info *ci,
293 char *dest)
294{
295 struct rb_node *p;
296 struct ceph_inode_xattr *xattr = NULL;
297
298 p = rb_first(&ci->i_xattrs.index);
299 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
300
301 while (p) {
302 xattr = rb_entry(p, struct ceph_inode_xattr, node);
303 memcpy(dest, xattr->name, xattr->name_len);
304 dest[xattr->name_len] = '\0';
305
306 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
307 xattr->name_len, ci->i_xattrs.names_size);
308
309 dest += xattr->name_len + 1;
310 p = rb_next(p);
311 }
312
313 return dest;
314}
315
316void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
317{
318 struct rb_node *p, *tmp;
319 struct ceph_inode_xattr *xattr = NULL;
320
321 p = rb_first(&ci->i_xattrs.index);
322
323 dout("__ceph_destroy_xattrs p=%p\n", p);
324
325 while (p) {
326 xattr = rb_entry(p, struct ceph_inode_xattr, node);
327 tmp = p;
328 p = rb_next(tmp);
329 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
330 xattr->name_len, xattr->name);
331 rb_erase(tmp, &ci->i_xattrs.index);
332
333 __free_xattr(xattr);
334 }
335
336 ci->i_xattrs.names_size = 0;
337 ci->i_xattrs.vals_size = 0;
338 ci->i_xattrs.index_version = 0;
339 ci->i_xattrs.count = 0;
340 ci->i_xattrs.index = RB_ROOT;
341}
342
/*
 * Rebuild the in-memory xattr rb-tree from the encoded blob received
 * from the MDS, if the blob is newer than the tree (index_version <
 * version).
 *
 * Caller holds inode->i_lock; it is dropped and reacquired around the
 * node allocations, and the whole decode restarts (goto start) if
 * i_xattrs.version moved while the lock was dropped.
 */
static int __build_xattrs(struct inode *inode)
{
	u32 namelen;
	u32 numattr = 0;
	void *p, *end;
	u32 len;
	const char *name, *val;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int xattr_version;
	struct ceph_inode_xattr **xattrs = NULL;
	int err = 0;
	int i;

	dout("__build_xattrs() len=%d\n",
	     ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);

	if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
		return 0; /* already built */

	__ceph_destroy_xattrs(ci);

start:
	/* updated internal xattr rb tree */
	if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
		p = ci->i_xattrs.blob->vec.iov_base;
		end = p + ci->i_xattrs.blob->vec.iov_len;
		ceph_decode_32_safe(&p, end, numattr, bad);
		xattr_version = ci->i_xattrs.version;
		spin_unlock(&inode->i_lock);

		/*
		 * NOTE(review): kcalloc() already zeroes, so the memset()
		 * below is redundant; the sizeof also names the tag
		 * "struct ceph_xattr *" where "struct ceph_inode_xattr *"
		 * looks intended (harmless — all object pointers are one
		 * size — but worth cleaning up).
		 */
		xattrs = kcalloc(numattr, sizeof(struct ceph_xattr *),
				 GFP_NOFS);
		err = -ENOMEM;
		if (!xattrs)
			goto bad_lock;
		memset(xattrs, 0, numattr*sizeof(struct ceph_xattr *));
		for (i = 0; i < numattr; i++) {
			xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
					    GFP_NOFS);
			if (!xattrs[i])
				goto bad_lock;
		}

		spin_lock(&inode->i_lock);
		if (ci->i_xattrs.version != xattr_version) {
			/* lost a race, retry */
			for (i = 0; i < numattr; i++)
				kfree(xattrs[i]);
			kfree(xattrs);
			goto start;
		}
		err = -EIO;
		while (numattr--) {
			/* blob layout: name len, name, value len, value */
			ceph_decode_32_safe(&p, end, len, bad);
			namelen = len;
			name = p;
			p += len;
			ceph_decode_32_safe(&p, end, len, bad);
			val = p;
			p += len;

			/* __set_xattr consumes xattrs[numattr] */
			err = __set_xattr(ci, name, namelen, val, len,
					  0, 0, 0, &xattrs[numattr]);

			if (err < 0)
				goto bad;
		}
		kfree(xattrs);
	}
	ci->i_xattrs.index_version = ci->i_xattrs.version;
	ci->i_xattrs.dirty = false;

	return err;
bad_lock:
	spin_lock(&inode->i_lock);
bad:
	/* free the not-yet-consumed preallocated nodes */
	if (xattrs) {
		for (i = 0; i < numattr; i++)
			kfree(xattrs[i]);
		kfree(xattrs);
	}
	ci->i_xattrs.names_size = 0;
	return err;
}
427
428static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
429 int val_size)
430{
431 /*
432 * 4 bytes for the length, and additional 4 bytes per each xattr name,
433 * 4 bytes per each value
434 */
435 int size = 4 + ci->i_xattrs.count*(4 + 4) +
436 ci->i_xattrs.names_size +
437 ci->i_xattrs.vals_size;
438 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
439 ci->i_xattrs.count, ci->i_xattrs.names_size,
440 ci->i_xattrs.vals_size);
441
442 if (name_size)
443 size += 4 + 4 + name_size + val_size;
444
445 return size;
446}
447
448/*
449 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
450 * and swap into place.
451 */
452void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
453{
454 struct rb_node *p;
455 struct ceph_inode_xattr *xattr = NULL;
456 void *dest;
457
458 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
459 if (ci->i_xattrs.dirty) {
460 int need = __get_required_blob_size(ci, 0, 0);
461
462 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
463
464 p = rb_first(&ci->i_xattrs.index);
465 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
466
467 ceph_encode_32(&dest, ci->i_xattrs.count);
468 while (p) {
469 xattr = rb_entry(p, struct ceph_inode_xattr, node);
470
471 ceph_encode_32(&dest, xattr->name_len);
472 memcpy(dest, xattr->name, xattr->name_len);
473 dest += xattr->name_len;
474 ceph_encode_32(&dest, xattr->val_len);
475 memcpy(dest, xattr->val, xattr->val_len);
476 dest += xattr->val_len;
477
478 p = rb_next(p);
479 }
480
481 /* adjust buffer len; it may be larger than we need */
482 ci->i_xattrs.prealloc_blob->vec.iov_len =
483 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
484
485 if (ci->i_xattrs.blob)
486 ceph_buffer_put(ci->i_xattrs.blob);
487 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
488 ci->i_xattrs.prealloc_blob = NULL;
489 ci->i_xattrs.dirty = false;
490 }
491}
492
/*
 * Fetch one xattr (or virtual xattr) value for getxattr(2).
 *
 * If we hold CEPH_CAP_XATTR_SHARED and our index is current we answer
 * from the local rb-tree; otherwise we first fetch xattrs from the MDS
 * via ceph_do_getattr().  size == 0 means "just report the length".
 */
ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
		      size_t size)
{
	struct inode *inode = dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
	int err;
	struct ceph_inode_xattr *xattr;
	struct ceph_vxattr_cb *vxattr = NULL;

	if (!ceph_is_valid_xattr(name))
		return -ENODATA;

	/* let's see if a virtual xattr was requested */
	if (vxattrs)
		vxattr = ceph_match_vxattr(vxattrs, name);

	spin_lock(&inode->i_lock);
	dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
	     ci->i_xattrs.version, ci->i_xattrs.index_version);

	if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
	    (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
		goto get_xattr;
	} else {
		spin_unlock(&inode->i_lock);
		/* get xattrs from mds (if we don't already have them) */
		err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
		if (err)
			return err;
	}

	spin_lock(&inode->i_lock);

	/*
	 * NOTE(review): this readonly-vxattr fast path runs only on the
	 * fetched-from-MDS branch; the "goto get_xattr" branch instead
	 * falls back to the vxattr callback only after the rb-tree
	 * lookup misses below — confirm that asymmetry is intentional.
	 */
	if (vxattr && vxattr->readonly) {
		err = vxattr->getxattr_cb(ci, value, size);
		goto out;
	}

	err = __build_xattrs(inode);
	if (err < 0)
		goto out;

get_xattr:
	err = -ENODATA;  /* == ENOATTR */
	xattr = __get_xattr(ci, name);
	if (!xattr) {
		/* no real xattr; try the matched virtual one, if any */
		if (vxattr)
			err = vxattr->getxattr_cb(ci, value, size);
		goto out;
	}

	err = -ERANGE;
	if (size && size < xattr->val_len)
		goto out;

	/* size == 0: report the length without copying */
	err = xattr->val_len;
	if (size == 0)
		goto out;

	memcpy(value, xattr->val, xattr->val_len);

out:
	spin_unlock(&inode->i_lock);
	return err;
}
559
560ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
561{
562 struct inode *inode = dentry->d_inode;
563 struct ceph_inode_info *ci = ceph_inode(inode);
564 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
565 u32 vir_namelen = 0;
566 u32 namelen;
567 int err;
568 u32 len;
569 int i;
570
571 spin_lock(&inode->i_lock);
572 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
573 ci->i_xattrs.version, ci->i_xattrs.index_version);
574
575 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
576 (ci->i_xattrs.index_version > ci->i_xattrs.version)) {
577 goto list_xattr;
578 } else {
579 spin_unlock(&inode->i_lock);
580 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
581 if (err)
582 return err;
583 }
584
585 spin_lock(&inode->i_lock);
586
587 err = __build_xattrs(inode);
588 if (err < 0)
589 goto out;
590
591list_xattr:
592 vir_namelen = 0;
593 /* include virtual dir xattrs */
594 if (vxattrs)
595 for (i = 0; vxattrs[i].name; i++)
596 vir_namelen += strlen(vxattrs[i].name) + 1;
597 /* adding 1 byte per each variable due to the null termination */
598 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
599 err = -ERANGE;
600 if (size && namelen > size)
601 goto out;
602
603 err = namelen;
604 if (size == 0)
605 goto out;
606
607 names = __copy_xattr_names(ci, names);
608
609 /* virtual xattr names, too */
610 if (vxattrs)
611 for (i = 0; vxattrs[i].name; i++) {
612 len = sprintf(names, "%s", vxattrs[i].name);
613 names += len + 1;
614 }
615
616out:
617 spin_unlock(&inode->i_lock);
618 return err;
619}
620
621static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
622 const char *value, size_t size, int flags)
623{
624 struct ceph_client *client = ceph_client(dentry->d_sb);
625 struct inode *inode = dentry->d_inode;
626 struct ceph_inode_info *ci = ceph_inode(inode);
627 struct inode *parent_inode = dentry->d_parent->d_inode;
628 struct ceph_mds_request *req;
629 struct ceph_mds_client *mdsc = &client->mdsc;
630 int err;
631 int i, nr_pages;
632 struct page **pages = NULL;
633 void *kaddr;
634
635 /* copy value into some pages */
636 nr_pages = calc_pages_for(0, size);
637 if (nr_pages) {
638 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
639 if (!pages)
640 return -ENOMEM;
641 err = -ENOMEM;
642 for (i = 0; i < nr_pages; i++) {
643 pages[i] = alloc_page(GFP_NOFS);
644 if (!pages[i]) {
645 nr_pages = i;
646 goto out;
647 }
648 kaddr = kmap(pages[i]);
649 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
650 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
651 }
652 }
653
654 dout("setxattr value=%.*s\n", (int)size, value);
655
656 /* do request */
657 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
658 USE_AUTH_MDS);
659 if (IS_ERR(req)) {
660 err = PTR_ERR(req);
661 goto out;
662 }
663 req->r_inode = igrab(inode);
664 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
665 req->r_num_caps = 1;
666 req->r_args.setxattr.flags = cpu_to_le32(flags);
667 req->r_path2 = kstrdup(name, GFP_NOFS);
668
669 req->r_pages = pages;
670 req->r_num_pages = nr_pages;
671 req->r_data_len = size;
672
673 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
674 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
675 ceph_mdsc_put_request(req);
676 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
677
678out:
679 if (pages) {
680 for (i = 0; i < nr_pages; i++)
681 __free_page(pages[i]);
682 kfree(pages);
683 }
684 return err;
685}
686
687int ceph_setxattr(struct dentry *dentry, const char *name,
688 const void *value, size_t size, int flags)
689{
690 struct inode *inode = dentry->d_inode;
691 struct ceph_inode_info *ci = ceph_inode(inode);
692 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
693 int err;
694 int name_len = strlen(name);
695 int val_len = size;
696 char *newname = NULL;
697 char *newval = NULL;
698 struct ceph_inode_xattr *xattr = NULL;
699 int issued;
700 int required_blob_size;
701
702 if (ceph_snap(inode) != CEPH_NOSNAP)
703 return -EROFS;
704
705 if (!ceph_is_valid_xattr(name))
706 return -EOPNOTSUPP;
707
708 if (vxattrs) {
709 struct ceph_vxattr_cb *vxattr =
710 ceph_match_vxattr(vxattrs, name);
711 if (vxattr && vxattr->readonly)
712 return -EOPNOTSUPP;
713 }
714
715 /* preallocate memory for xattr name, value, index node */
716 err = -ENOMEM;
717 newname = kmalloc(name_len + 1, GFP_NOFS);
718 if (!newname)
719 goto out;
720 memcpy(newname, name, name_len + 1);
721
722 if (val_len) {
723 newval = kmalloc(val_len + 1, GFP_NOFS);
724 if (!newval)
725 goto out;
726 memcpy(newval, value, val_len);
727 newval[val_len] = '\0';
728 }
729
730 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
731 if (!xattr)
732 goto out;
733
734 spin_lock(&inode->i_lock);
735retry:
736 issued = __ceph_caps_issued(ci, NULL);
737 if (!(issued & CEPH_CAP_XATTR_EXCL))
738 goto do_sync;
739 __build_xattrs(inode);
740
741 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
742
743 if (!ci->i_xattrs.prealloc_blob ||
744 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
745 struct ceph_buffer *blob = NULL;
746
747 spin_unlock(&inode->i_lock);
748 dout(" preaallocating new blob size=%d\n", required_blob_size);
749 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
750 if (!blob)
751 goto out;
752 spin_lock(&inode->i_lock);
753 if (ci->i_xattrs.prealloc_blob)
754 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
755 ci->i_xattrs.prealloc_blob = blob;
756 goto retry;
757 }
758
759 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
760 err = __set_xattr(ci, newname, name_len, newval,
761 val_len, 1, 1, 1, &xattr);
762 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
763 ci->i_xattrs.dirty = true;
764 inode->i_ctime = CURRENT_TIME;
765 spin_unlock(&inode->i_lock);
766
767 return err;
768
769do_sync:
770 spin_unlock(&inode->i_lock);
771 err = ceph_sync_setxattr(dentry, name, value, size, flags);
772out:
773 kfree(newname);
774 kfree(newval);
775 kfree(xattr);
776 return err;
777}
778
779static int ceph_send_removexattr(struct dentry *dentry, const char *name)
780{
781 struct ceph_client *client = ceph_client(dentry->d_sb);
782 struct ceph_mds_client *mdsc = &client->mdsc;
783 struct inode *inode = dentry->d_inode;
784 struct inode *parent_inode = dentry->d_parent->d_inode;
785 struct ceph_mds_request *req;
786 int err;
787
788 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
789 USE_AUTH_MDS);
790 if (IS_ERR(req))
791 return PTR_ERR(req);
792 req->r_inode = igrab(inode);
793 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
794 req->r_num_caps = 1;
795 req->r_path2 = kstrdup(name, GFP_NOFS);
796
797 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
798 ceph_mdsc_put_request(req);
799 return err;
800}
801
/*
 * Remove an xattr for removexattr(2).
 *
 * With CEPH_CAP_XATTR_EXCL we can drop the entry from the local tree
 * and mark the XATTR cap dirty; otherwise we must issue a synchronous
 * RMXATTR request to the MDS.
 */
int ceph_removexattr(struct dentry *dentry, const char *name)
{
	struct inode *inode = dentry->d_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
	int issued;
	int err;

	/* snapshots are read-only */
	if (ceph_snap(inode) != CEPH_NOSNAP)
		return -EROFS;

	if (!ceph_is_valid_xattr(name))
		return -EOPNOTSUPP;

	/* virtual xattrs are synthesized; read-only ones cannot be removed */
	if (vxattrs) {
		struct ceph_vxattr_cb *vxattr =
			ceph_match_vxattr(vxattrs, name);
		if (vxattr && vxattr->readonly)
			return -EOPNOTSUPP;
	}

	spin_lock(&inode->i_lock);
	__build_xattrs(inode);
	issued = __ceph_caps_issued(ci, NULL);
	dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));

	if (!(issued & CEPH_CAP_XATTR_EXCL))
		goto do_sync;

	/* returns -EOPNOTSUPP (via __remove_xattr) if the name is absent */
	err = __remove_xattr_by_name(ceph_inode(inode), name);
	__ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
	ci->i_xattrs.dirty = true;
	inode->i_ctime = CURRENT_TIME;

	spin_unlock(&inode->i_lock);

	return err;
do_sync:
	spin_unlock(&inode->i_lock);
	err = ceph_send_removexattr(dentry, name);
	return err;
}
844