about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2010-08-11 12:18:32 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-08-11 12:18:32 -0400
commit 682c30ed2165d5694a414d31eac7c63ac5700fb0 (patch)
tree 0dd5c95637222f05f3f89511453387b03c85f6f4
parent 84479f3c17e2c452d22be10a967e5282b3742d9f (diff)
parent e56fa10e92e077d456cbc33b7025032887772b33 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (39 commits)
  ceph: generalize mon requests, add pool op support
  ceph: only queue async writeback on cap revocation if there is dirty data
  ceph: do not ignore osd_idle_ttl mount option
  ceph: constify dentry_operations
  ceph: whitespace cleanup
  ceph: add flock/fcntl lock support
  ceph: define on-wire types, constants for file locking support
  ceph: add CEPH_FEATURE_FLOCK to the supported feature bits
  ceph: support v2 reconnect encoding
  ceph: support v2 client_caps encoding
  ceph: move AES iv definition to shared header
  ceph: fix decoding of pool snap info
  ceph: make ->sync_fs not wait if wait==0
  ceph: warn on missing snap realm
  ceph: print useful error message when crush rule not found
  ceph: use %pU to print uuid (fsid)
  ceph: sync header defs with server code
  ceph: clean up header guards
  ceph: strip misleading/obsolete version, feature info
  ceph: specify supported features in super.h
  ...
-rw-r--r-- fs/ceph/Makefile 2
-rw-r--r-- fs/ceph/addr.c 16
-rw-r--r-- fs/ceph/armor.c 6
-rw-r--r-- fs/ceph/auth.c 6
-rw-r--r-- fs/ceph/auth_x.c 6
-rw-r--r-- fs/ceph/buffer.c 16
-rw-r--r-- fs/ceph/caps.c 303
-rw-r--r-- fs/ceph/ceph_frag.h 4
-rw-r--r-- fs/ceph/ceph_fs.c 50
-rw-r--r-- fs/ceph/ceph_fs.h 87
-rw-r--r-- fs/ceph/ceph_hash.h 4
-rw-r--r-- fs/ceph/ceph_strings.c 3
-rw-r--r-- fs/ceph/crush/crush.h 4
-rw-r--r-- fs/ceph/crush/hash.h 4
-rw-r--r-- fs/ceph/crush/mapper.h 4
-rw-r--r-- fs/ceph/crypto.c 27
-rw-r--r-- fs/ceph/crypto.h 4
-rw-r--r-- fs/ceph/debugfs.c 21
-rw-r--r-- fs/ceph/decode.h 6
-rw-r--r-- fs/ceph/dir.c 10
-rw-r--r-- fs/ceph/file.c 32
-rw-r--r-- fs/ceph/inode.c 5
-rw-r--r-- fs/ceph/ioctl.c 24
-rw-r--r-- fs/ceph/ioctl.h 2
-rw-r--r-- fs/ceph/locks.c 256
-rw-r--r-- fs/ceph/mds_client.c 235
-rw-r--r-- fs/ceph/mds_client.h 30
-rw-r--r-- fs/ceph/mdsmap.c 6
-rw-r--r-- fs/ceph/mdsmap.h 8
-rw-r--r-- fs/ceph/messenger.c 23
-rw-r--r-- fs/ceph/mon_client.c 170
-rw-r--r-- fs/ceph/mon_client.h 5
-rw-r--r-- fs/ceph/msgr.h 4
-rw-r--r-- fs/ceph/osd_client.c 9
-rw-r--r-- fs/ceph/osdmap.c 37
-rw-r--r-- fs/ceph/rados.h 13
-rw-r--r-- fs/ceph/super.c 88
-rw-r--r-- fs/ceph/super.h 40
-rw-r--r-- fs/ceph/xattr.c 2
39 files changed, 1162 insertions, 410 deletions
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 6a660e610be8..278e1172600d 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@ ifneq ($(KERNELRELEASE),)
6 6
7obj-$(CONFIG_CEPH_FS) += ceph.o 7obj-$(CONFIG_CEPH_FS) += ceph.o
8 8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \ 9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \ 11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \ 12 mds_client.o mdsmap.o \
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d9c60b84949a..5598a0d02295 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -309,7 +309,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
309 zero_user_segment(page, s, PAGE_CACHE_SIZE); 309 zero_user_segment(page, s, PAGE_CACHE_SIZE);
310 } 310 }
311 311
312 if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) { 312 if (add_to_page_cache_lru(page, mapping, page->index,
313 GFP_NOFS)) {
313 page_cache_release(page); 314 page_cache_release(page);
314 dout("readpages %p add_to_page_cache failed %p\n", 315 dout("readpages %p add_to_page_cache failed %p\n",
315 inode, page); 316 inode, page);
@@ -552,7 +553,7 @@ static void writepages_finish(struct ceph_osd_request *req,
552 * page truncation thread, possibly losing some data that 553 * page truncation thread, possibly losing some data that
553 * raced its way in 554 * raced its way in
554 */ 555 */
555 if ((issued & CEPH_CAP_FILE_CACHE) == 0) 556 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
556 generic_error_remove_page(inode->i_mapping, page); 557 generic_error_remove_page(inode->i_mapping, page);
557 558
558 unlock_page(page); 559 unlock_page(page);
@@ -797,9 +798,12 @@ get_more_pages:
797 dout("%p will write page %p idx %lu\n", 798 dout("%p will write page %p idx %lu\n",
798 inode, page, page->index); 799 inode, page, page->index);
799 800
800 writeback_stat = atomic_long_inc_return(&client->writeback_count); 801 writeback_stat =
801 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) { 802 atomic_long_inc_return(&client->writeback_count);
802 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); 803 if (writeback_stat > CONGESTION_ON_THRESH(
804 client->mount_args->congestion_kb)) {
805 set_bdi_congested(&client->backing_dev_info,
806 BLK_RW_ASYNC);
803 } 807 }
804 808
805 set_page_writeback(page); 809 set_page_writeback(page);
@@ -1036,7 +1040,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
1036 *pagep = page; 1040 *pagep = page;
1037 1041
1038 dout("write_begin file %p inode %p page %p %d~%d\n", file, 1042 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1039 inode, page, (int)pos, (int)len); 1043 inode, page, (int)pos, (int)len);
1040 1044
1041 r = ceph_update_writeable_page(file, pos, len, page); 1045 r = ceph_update_writeable_page(file, pos, len, page);
1042 } while (r == -EAGAIN); 1046 } while (r == -EAGAIN);
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
index 67b2c030924b..eb2a666b0be7 100644
--- a/fs/ceph/armor.c
+++ b/fs/ceph/armor.c
@@ -1,11 +1,15 @@
1 1
2#include <linux/errno.h> 2#include <linux/errno.h>
3 3
4int ceph_armor(char *dst, const char *src, const char *end);
5int ceph_unarmor(char *dst, const char *src, const char *end);
6
4/* 7/*
5 * base64 encode/decode. 8 * base64 encode/decode.
6 */ 9 */
7 10
8const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; 11static const char *pem_key =
12 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9 13
10static int encode_bits(int c) 14static int encode_bits(int c)
11{ 15{
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 89490beaf537..6d2e30600627 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -20,7 +20,7 @@ static u32 supported_protocols[] = {
20 CEPH_AUTH_CEPHX 20 CEPH_AUTH_CEPHX
21}; 21};
22 22
23int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol) 23static int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
24{ 24{
25 switch (protocol) { 25 switch (protocol) {
26 case CEPH_AUTH_NONE: 26 case CEPH_AUTH_NONE:
@@ -133,8 +133,8 @@ bad:
133 return -ERANGE; 133 return -ERANGE;
134} 134}
135 135
136int ceph_build_auth_request(struct ceph_auth_client *ac, 136static int ceph_build_auth_request(struct ceph_auth_client *ac,
137 void *msg_buf, size_t msg_len) 137 void *msg_buf, size_t msg_len)
138{ 138{
139 struct ceph_mon_request_header *monhdr = msg_buf; 139 struct ceph_mon_request_header *monhdr = msg_buf;
140 void *p = monhdr + 1; 140 void *p = monhdr + 1;
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 6d44053ecff1..582e0b2caf8a 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -87,8 +87,8 @@ static int ceph_x_decrypt(struct ceph_crypto_key *secret,
87/* 87/*
88 * get existing (or insert new) ticket handler 88 * get existing (or insert new) ticket handler
89 */ 89 */
90struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac, 90static struct ceph_x_ticket_handler *
91 int service) 91get_ticket_handler(struct ceph_auth_client *ac, int service)
92{ 92{
93 struct ceph_x_ticket_handler *th; 93 struct ceph_x_ticket_handler *th;
94 struct ceph_x_info *xi = ac->private; 94 struct ceph_x_info *xi = ac->private;
@@ -429,7 +429,7 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
429 auth->struct_v = 1; 429 auth->struct_v = 1;
430 auth->key = 0; 430 auth->key = 0;
431 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++) 431 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
432 auth->key ^= *u; 432 auth->key ^= *(__le64 *)u;
433 dout(" server_challenge %llx client_challenge %llx key %llx\n", 433 dout(" server_challenge %llx client_challenge %llx key %llx\n",
434 xi->server_challenge, le64_to_cpu(auth->client_challenge), 434 xi->server_challenge, le64_to_cpu(auth->client_challenge),
435 le64_to_cpu(auth->key)); 435 le64_to_cpu(auth->key));
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
index c67535d70aa6..cd39f17021de 100644
--- a/fs/ceph/buffer.c
+++ b/fs/ceph/buffer.c
@@ -47,22 +47,6 @@ void ceph_buffer_release(struct kref *kref)
47 kfree(b); 47 kfree(b);
48} 48}
49 49
50int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
51{
52 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
53 if (b->vec.iov_base) {
54 b->is_vmalloc = false;
55 } else {
56 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
57 b->is_vmalloc = true;
58 }
59 if (!b->vec.iov_base)
60 return -ENOMEM;
61 b->alloc_len = len;
62 b->vec.iov_len = len;
63 return 0;
64}
65
66int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) 50int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
67{ 51{
68 size_t len; 52 size_t len;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b81be9a56487..7bf182b03973 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -113,58 +113,41 @@ const char *ceph_cap_string(int caps)
113 return cap_str[i]; 113 return cap_str[i];
114} 114}
115 115
116/* 116void ceph_caps_init(struct ceph_mds_client *mdsc)
117 * Cap reservations
118 *
119 * Maintain a global pool of preallocated struct ceph_caps, referenced
120 * by struct ceph_caps_reservations. This ensures that we preallocate
121 * memory needed to successfully process an MDS response. (If an MDS
122 * sends us cap information and we fail to process it, we will have
123 * problems due to the client and MDS being out of sync.)
124 *
125 * Reservations are 'owned' by a ceph_cap_reservation context.
126 */
127static spinlock_t caps_list_lock;
128static struct list_head caps_list; /* unused (reserved or unreserved) */
129static int caps_total_count; /* total caps allocated */
130static int caps_use_count; /* in use */
131static int caps_reserve_count; /* unused, reserved */
132static int caps_avail_count; /* unused, unreserved */
133static int caps_min_count; /* keep at least this many (unreserved) */
134
135void __init ceph_caps_init(void)
136{ 117{
137 INIT_LIST_HEAD(&caps_list); 118 INIT_LIST_HEAD(&mdsc->caps_list);
138 spin_lock_init(&caps_list_lock); 119 spin_lock_init(&mdsc->caps_list_lock);
139} 120}
140 121
141void ceph_caps_finalize(void) 122void ceph_caps_finalize(struct ceph_mds_client *mdsc)
142{ 123{
143 struct ceph_cap *cap; 124 struct ceph_cap *cap;
144 125
145 spin_lock(&caps_list_lock); 126 spin_lock(&mdsc->caps_list_lock);
146 while (!list_empty(&caps_list)) { 127 while (!list_empty(&mdsc->caps_list)) {
147 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); 128 cap = list_first_entry(&mdsc->caps_list,
129 struct ceph_cap, caps_item);
148 list_del(&cap->caps_item); 130 list_del(&cap->caps_item);
149 kmem_cache_free(ceph_cap_cachep, cap); 131 kmem_cache_free(ceph_cap_cachep, cap);
150 } 132 }
151 caps_total_count = 0; 133 mdsc->caps_total_count = 0;
152 caps_avail_count = 0; 134 mdsc->caps_avail_count = 0;
153 caps_use_count = 0; 135 mdsc->caps_use_count = 0;
154 caps_reserve_count = 0; 136 mdsc->caps_reserve_count = 0;
155 caps_min_count = 0; 137 mdsc->caps_min_count = 0;
156 spin_unlock(&caps_list_lock); 138 spin_unlock(&mdsc->caps_list_lock);
157} 139}
158 140
159void ceph_adjust_min_caps(int delta) 141void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta)
160{ 142{
161 spin_lock(&caps_list_lock); 143 spin_lock(&mdsc->caps_list_lock);
162 caps_min_count += delta; 144 mdsc->caps_min_count += delta;
163 BUG_ON(caps_min_count < 0); 145 BUG_ON(mdsc->caps_min_count < 0);
164 spin_unlock(&caps_list_lock); 146 spin_unlock(&mdsc->caps_list_lock);
165} 147}
166 148
167int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need) 149int ceph_reserve_caps(struct ceph_mds_client *mdsc,
150 struct ceph_cap_reservation *ctx, int need)
168{ 151{
169 int i; 152 int i;
170 struct ceph_cap *cap; 153 struct ceph_cap *cap;
@@ -176,16 +159,17 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
176 dout("reserve caps ctx=%p need=%d\n", ctx, need); 159 dout("reserve caps ctx=%p need=%d\n", ctx, need);
177 160
178 /* first reserve any caps that are already allocated */ 161 /* first reserve any caps that are already allocated */
179 spin_lock(&caps_list_lock); 162 spin_lock(&mdsc->caps_list_lock);
180 if (caps_avail_count >= need) 163 if (mdsc->caps_avail_count >= need)
181 have = need; 164 have = need;
182 else 165 else
183 have = caps_avail_count; 166 have = mdsc->caps_avail_count;
184 caps_avail_count -= have; 167 mdsc->caps_avail_count -= have;
185 caps_reserve_count += have; 168 mdsc->caps_reserve_count += have;
186 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 169 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
187 caps_avail_count); 170 mdsc->caps_reserve_count +
188 spin_unlock(&caps_list_lock); 171 mdsc->caps_avail_count);
172 spin_unlock(&mdsc->caps_list_lock);
189 173
190 for (i = have; i < need; i++) { 174 for (i = have; i < need; i++) {
191 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 175 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
@@ -198,19 +182,20 @@ int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
198 } 182 }
199 BUG_ON(have + alloc != need); 183 BUG_ON(have + alloc != need);
200 184
201 spin_lock(&caps_list_lock); 185 spin_lock(&mdsc->caps_list_lock);
202 caps_total_count += alloc; 186 mdsc->caps_total_count += alloc;
203 caps_reserve_count += alloc; 187 mdsc->caps_reserve_count += alloc;
204 list_splice(&newcaps, &caps_list); 188 list_splice(&newcaps, &mdsc->caps_list);
205 189
206 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 190 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
207 caps_avail_count); 191 mdsc->caps_reserve_count +
208 spin_unlock(&caps_list_lock); 192 mdsc->caps_avail_count);
193 spin_unlock(&mdsc->caps_list_lock);
209 194
210 ctx->count = need; 195 ctx->count = need;
211 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", 196 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
212 ctx, caps_total_count, caps_use_count, caps_reserve_count, 197 ctx, mdsc->caps_total_count, mdsc->caps_use_count,
213 caps_avail_count); 198 mdsc->caps_reserve_count, mdsc->caps_avail_count);
214 return 0; 199 return 0;
215 200
216out_alloc_count: 201out_alloc_count:
@@ -220,26 +205,29 @@ out_alloc_count:
220 return ret; 205 return ret;
221} 206}
222 207
223int ceph_unreserve_caps(struct ceph_cap_reservation *ctx) 208int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
209 struct ceph_cap_reservation *ctx)
224{ 210{
225 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); 211 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
226 if (ctx->count) { 212 if (ctx->count) {
227 spin_lock(&caps_list_lock); 213 spin_lock(&mdsc->caps_list_lock);
228 BUG_ON(caps_reserve_count < ctx->count); 214 BUG_ON(mdsc->caps_reserve_count < ctx->count);
229 caps_reserve_count -= ctx->count; 215 mdsc->caps_reserve_count -= ctx->count;
230 caps_avail_count += ctx->count; 216 mdsc->caps_avail_count += ctx->count;
231 ctx->count = 0; 217 ctx->count = 0;
232 dout("unreserve caps %d = %d used + %d resv + %d avail\n", 218 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
233 caps_total_count, caps_use_count, caps_reserve_count, 219 mdsc->caps_total_count, mdsc->caps_use_count,
234 caps_avail_count); 220 mdsc->caps_reserve_count, mdsc->caps_avail_count);
235 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 221 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
236 caps_avail_count); 222 mdsc->caps_reserve_count +
237 spin_unlock(&caps_list_lock); 223 mdsc->caps_avail_count);
224 spin_unlock(&mdsc->caps_list_lock);
238 } 225 }
239 return 0; 226 return 0;
240} 227}
241 228
242static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx) 229static struct ceph_cap *get_cap(struct ceph_mds_client *mdsc,
230 struct ceph_cap_reservation *ctx)
243{ 231{
244 struct ceph_cap *cap = NULL; 232 struct ceph_cap *cap = NULL;
245 233
@@ -247,71 +235,74 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
247 if (!ctx) { 235 if (!ctx) {
248 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); 236 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
249 if (cap) { 237 if (cap) {
250 caps_use_count++; 238 mdsc->caps_use_count++;
251 caps_total_count++; 239 mdsc->caps_total_count++;
252 } 240 }
253 return cap; 241 return cap;
254 } 242 }
255 243
256 spin_lock(&caps_list_lock); 244 spin_lock(&mdsc->caps_list_lock);
257 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n", 245 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
258 ctx, ctx->count, caps_total_count, caps_use_count, 246 ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
259 caps_reserve_count, caps_avail_count); 247 mdsc->caps_reserve_count, mdsc->caps_avail_count);
260 BUG_ON(!ctx->count); 248 BUG_ON(!ctx->count);
261 BUG_ON(ctx->count > caps_reserve_count); 249 BUG_ON(ctx->count > mdsc->caps_reserve_count);
262 BUG_ON(list_empty(&caps_list)); 250 BUG_ON(list_empty(&mdsc->caps_list));
263 251
264 ctx->count--; 252 ctx->count--;
265 caps_reserve_count--; 253 mdsc->caps_reserve_count--;
266 caps_use_count++; 254 mdsc->caps_use_count++;
267 255
268 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item); 256 cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
269 list_del(&cap->caps_item); 257 list_del(&cap->caps_item);
270 258
271 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 259 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
272 caps_avail_count); 260 mdsc->caps_reserve_count + mdsc->caps_avail_count);
273 spin_unlock(&caps_list_lock); 261 spin_unlock(&mdsc->caps_list_lock);
274 return cap; 262 return cap;
275} 263}
276 264
277void ceph_put_cap(struct ceph_cap *cap) 265void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
278{ 266{
279 spin_lock(&caps_list_lock); 267 spin_lock(&mdsc->caps_list_lock);
280 dout("put_cap %p %d = %d used + %d resv + %d avail\n", 268 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
281 cap, caps_total_count, caps_use_count, 269 cap, mdsc->caps_total_count, mdsc->caps_use_count,
282 caps_reserve_count, caps_avail_count); 270 mdsc->caps_reserve_count, mdsc->caps_avail_count);
283 caps_use_count--; 271 mdsc->caps_use_count--;
284 /* 272 /*
285 * Keep some preallocated caps around (ceph_min_count), to 273 * Keep some preallocated caps around (ceph_min_count), to
286 * avoid lots of free/alloc churn. 274 * avoid lots of free/alloc churn.
287 */ 275 */
288 if (caps_avail_count >= caps_reserve_count + caps_min_count) { 276 if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
289 caps_total_count--; 277 mdsc->caps_min_count) {
278 mdsc->caps_total_count--;
290 kmem_cache_free(ceph_cap_cachep, cap); 279 kmem_cache_free(ceph_cap_cachep, cap);
291 } else { 280 } else {
292 caps_avail_count++; 281 mdsc->caps_avail_count++;
293 list_add(&cap->caps_item, &caps_list); 282 list_add(&cap->caps_item, &mdsc->caps_list);
294 } 283 }
295 284
296 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count + 285 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
297 caps_avail_count); 286 mdsc->caps_reserve_count + mdsc->caps_avail_count);
298 spin_unlock(&caps_list_lock); 287 spin_unlock(&mdsc->caps_list_lock);
299} 288}
300 289
301void ceph_reservation_status(struct ceph_client *client, 290void ceph_reservation_status(struct ceph_client *client,
302 int *total, int *avail, int *used, int *reserved, 291 int *total, int *avail, int *used, int *reserved,
303 int *min) 292 int *min)
304{ 293{
294 struct ceph_mds_client *mdsc = &client->mdsc;
295
305 if (total) 296 if (total)
306 *total = caps_total_count; 297 *total = mdsc->caps_total_count;
307 if (avail) 298 if (avail)
308 *avail = caps_avail_count; 299 *avail = mdsc->caps_avail_count;
309 if (used) 300 if (used)
310 *used = caps_use_count; 301 *used = mdsc->caps_use_count;
311 if (reserved) 302 if (reserved)
312 *reserved = caps_reserve_count; 303 *reserved = mdsc->caps_reserve_count;
313 if (min) 304 if (min)
314 *min = caps_min_count; 305 *min = mdsc->caps_min_count;
315} 306}
316 307
317/* 308/*
@@ -336,22 +327,29 @@ static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
336 return NULL; 327 return NULL;
337} 328}
338 329
330struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
331{
332 struct ceph_cap *cap;
333
334 spin_lock(&ci->vfs_inode.i_lock);
335 cap = __get_cap_for_mds(ci, mds);
336 spin_unlock(&ci->vfs_inode.i_lock);
337 return cap;
338}
339
339/* 340/*
340 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else 341 * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
341 * -1.
342 */ 342 */
343static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq) 343static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
344{ 344{
345 struct ceph_cap *cap; 345 struct ceph_cap *cap;
346 int mds = -1; 346 int mds = -1;
347 struct rb_node *p; 347 struct rb_node *p;
348 348
349 /* prefer mds with WR|WRBUFFER|EXCL caps */ 349 /* prefer mds with WR|BUFFER|EXCL caps */
350 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { 350 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
351 cap = rb_entry(p, struct ceph_cap, ci_node); 351 cap = rb_entry(p, struct ceph_cap, ci_node);
352 mds = cap->mds; 352 mds = cap->mds;
353 if (mseq)
354 *mseq = cap->mseq;
355 if (cap->issued & (CEPH_CAP_FILE_WR | 353 if (cap->issued & (CEPH_CAP_FILE_WR |
356 CEPH_CAP_FILE_BUFFER | 354 CEPH_CAP_FILE_BUFFER |
357 CEPH_CAP_FILE_EXCL)) 355 CEPH_CAP_FILE_EXCL))
@@ -364,7 +362,7 @@ int ceph_get_cap_mds(struct inode *inode)
364{ 362{
365 int mds; 363 int mds;
366 spin_lock(&inode->i_lock); 364 spin_lock(&inode->i_lock);
367 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL); 365 mds = __ceph_get_cap_mds(ceph_inode(inode));
368 spin_unlock(&inode->i_lock); 366 spin_unlock(&inode->i_lock);
369 return mds; 367 return mds;
370} 368}
@@ -483,8 +481,8 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
483 * Each time we receive FILE_CACHE anew, we increment 481 * Each time we receive FILE_CACHE anew, we increment
484 * i_rdcache_gen. 482 * i_rdcache_gen.
485 */ 483 */
486 if ((issued & CEPH_CAP_FILE_CACHE) && 484 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
487 (had & CEPH_CAP_FILE_CACHE) == 0) 485 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
488 ci->i_rdcache_gen++; 486 ci->i_rdcache_gen++;
489 487
490 /* 488 /*
@@ -543,7 +541,7 @@ retry:
543 new_cap = NULL; 541 new_cap = NULL;
544 } else { 542 } else {
545 spin_unlock(&inode->i_lock); 543 spin_unlock(&inode->i_lock);
546 new_cap = get_cap(caps_reservation); 544 new_cap = get_cap(mdsc, caps_reservation);
547 if (new_cap == NULL) 545 if (new_cap == NULL)
548 return -ENOMEM; 546 return -ENOMEM;
549 goto retry; 547 goto retry;
@@ -588,6 +586,7 @@ retry:
588 } else { 586 } else {
589 pr_err("ceph_add_cap: couldn't find snap realm %llx\n", 587 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
590 realmino); 588 realmino);
589 WARN_ON(!realm);
591 } 590 }
592 } 591 }
593 592
@@ -831,7 +830,7 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
831{ 830{
832 int want = 0; 831 int want = 0;
833 int mode; 832 int mode;
834 for (mode = 0; mode < 4; mode++) 833 for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
835 if (ci->i_nr_by_mode[mode]) 834 if (ci->i_nr_by_mode[mode])
836 want |= ceph_caps_for_mode(mode); 835 want |= ceph_caps_for_mode(mode);
837 return want; 836 return want;
@@ -901,7 +900,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
901 ci->i_auth_cap = NULL; 900 ci->i_auth_cap = NULL;
902 901
903 if (removed) 902 if (removed)
904 ceph_put_cap(cap); 903 ceph_put_cap(mdsc, cap);
905 904
906 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { 905 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
907 struct ceph_snap_realm *realm = ci->i_snap_realm; 906 struct ceph_snap_realm *realm = ci->i_snap_realm;
@@ -1197,6 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1197 */ 1196 */
1198void __ceph_flush_snaps(struct ceph_inode_info *ci, 1197void __ceph_flush_snaps(struct ceph_inode_info *ci,
1199 struct ceph_mds_session **psession) 1198 struct ceph_mds_session **psession)
1199 __releases(ci->vfs_inode->i_lock)
1200 __acquires(ci->vfs_inode->i_lock)
1200{ 1201{
1201 struct inode *inode = &ci->vfs_inode; 1202 struct inode *inode = &ci->vfs_inode;
1202 int mds; 1203 int mds;
@@ -1232,7 +1233,13 @@ retry:
1232 BUG_ON(capsnap->dirty == 0); 1233 BUG_ON(capsnap->dirty == 0);
1233 1234
1234 /* pick mds, take s_mutex */ 1235 /* pick mds, take s_mutex */
1235 mds = __ceph_get_cap_mds(ci, &mseq); 1236 if (ci->i_auth_cap == NULL) {
1237 dout("no auth cap (migrating?), doing nothing\n");
1238 goto out;
1239 }
1240 mds = ci->i_auth_cap->session->s_mds;
1241 mseq = ci->i_auth_cap->mseq;
1242
1236 if (session && session->s_mds != mds) { 1243 if (session && session->s_mds != mds) {
1237 dout("oops, wrong session %p mutex\n", session); 1244 dout("oops, wrong session %p mutex\n", session);
1238 mutex_unlock(&session->s_mutex); 1245 mutex_unlock(&session->s_mutex);
@@ -1251,8 +1258,8 @@ retry:
1251 } 1258 }
1252 /* 1259 /*
1253 * if session == NULL, we raced against a cap 1260 * if session == NULL, we raced against a cap
1254 * deletion. retry, and we'll get a better 1261 * deletion or migration. retry, and we'll
1255 * @mds value next time. 1262 * get a better @mds value next time.
1256 */ 1263 */
1257 spin_lock(&inode->i_lock); 1264 spin_lock(&inode->i_lock);
1258 goto retry; 1265 goto retry;
@@ -1290,6 +1297,7 @@ retry:
1290 list_del_init(&ci->i_snap_flush_item); 1297 list_del_init(&ci->i_snap_flush_item);
1291 spin_unlock(&mdsc->snap_flush_lock); 1298 spin_unlock(&mdsc->snap_flush_lock);
1292 1299
1300out:
1293 if (psession) 1301 if (psession)
1294 *psession = session; 1302 *psession = session;
1295 else if (session) { 1303 else if (session) {
@@ -1435,7 +1443,6 @@ static int try_nonblocking_invalidate(struct inode *inode)
1435 */ 1443 */
1436void ceph_check_caps(struct ceph_inode_info *ci, int flags, 1444void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1437 struct ceph_mds_session *session) 1445 struct ceph_mds_session *session)
1438 __releases(session->s_mutex)
1439{ 1446{
1440 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); 1447 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1441 struct ceph_mds_client *mdsc = &client->mdsc; 1448 struct ceph_mds_client *mdsc = &client->mdsc;
@@ -1510,11 +1517,13 @@ retry_locked:
1510 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ 1517 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1511 ci->i_rdcache_gen && /* may have cached pages */ 1518 ci->i_rdcache_gen && /* may have cached pages */
1512 (file_wanted == 0 || /* no open files */ 1519 (file_wanted == 0 || /* no open files */
1513 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */ 1520 (revoking & (CEPH_CAP_FILE_CACHE|
1521 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
1514 !tried_invalidate) { 1522 !tried_invalidate) {
1515 dout("check_caps trying to invalidate on %p\n", inode); 1523 dout("check_caps trying to invalidate on %p\n", inode);
1516 if (try_nonblocking_invalidate(inode) < 0) { 1524 if (try_nonblocking_invalidate(inode) < 0) {
1517 if (revoking & CEPH_CAP_FILE_CACHE) { 1525 if (revoking & (CEPH_CAP_FILE_CACHE|
1526 CEPH_CAP_FILE_LAZYIO)) {
1518 dout("check_caps queuing invalidate\n"); 1527 dout("check_caps queuing invalidate\n");
1519 queue_invalidate = 1; 1528 queue_invalidate = 1;
1520 ci->i_rdcache_revoking = ci->i_rdcache_gen; 1529 ci->i_rdcache_revoking = ci->i_rdcache_gen;
@@ -2250,8 +2259,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2250 struct ceph_mds_session *session, 2259 struct ceph_mds_session *session,
2251 struct ceph_cap *cap, 2260 struct ceph_cap *cap,
2252 struct ceph_buffer *xattr_buf) 2261 struct ceph_buffer *xattr_buf)
2253 __releases(inode->i_lock) 2262 __releases(inode->i_lock)
2254 __releases(session->s_mutex)
2255{ 2263{
2256 struct ceph_inode_info *ci = ceph_inode(inode); 2264 struct ceph_inode_info *ci = ceph_inode(inode);
2257 int mds = session->s_mds; 2265 int mds = session->s_mds;
@@ -2278,6 +2286,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2278 * will invalidate _after_ writeback.) 2286 * will invalidate _after_ writeback.)
2279 */ 2287 */
2280 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && 2288 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2289 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2281 !ci->i_wrbuffer_ref) { 2290 !ci->i_wrbuffer_ref) {
2282 if (try_nonblocking_invalidate(inode) == 0) { 2291 if (try_nonblocking_invalidate(inode) == 0) {
2283 revoked_rdcache = 1; 2292 revoked_rdcache = 1;
@@ -2369,15 +2378,22 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2369 2378
2370 /* revocation, grant, or no-op? */ 2379 /* revocation, grant, or no-op? */
2371 if (cap->issued & ~newcaps) { 2380 if (cap->issued & ~newcaps) {
2372 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued), 2381 int revoking = cap->issued & ~newcaps;
2373 ceph_cap_string(newcaps)); 2382
2374 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER) 2383 dout("revocation: %s -> %s (revoking %s)\n",
2375 writeback = 1; /* will delay ack */ 2384 ceph_cap_string(cap->issued),
2376 else if (dirty & ~newcaps) 2385 ceph_cap_string(newcaps),
2377 check_caps = 1; /* initiate writeback in check_caps */ 2386 ceph_cap_string(revoking));
2378 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 || 2387 if (revoking & used & CEPH_CAP_FILE_BUFFER)
2379 revoked_rdcache) 2388 writeback = 1; /* initiate writeback; will delay ack */
2380 check_caps = 2; /* send revoke ack in check_caps */ 2389 else if (revoking == CEPH_CAP_FILE_CACHE &&
2390 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
2391 queue_invalidate)
2392 ; /* do nothing yet, invalidation will be queued */
2393 else if (cap == ci->i_auth_cap)
2394 check_caps = 1; /* check auth cap only */
2395 else
2396 check_caps = 2; /* check all caps */
2381 cap->issued = newcaps; 2397 cap->issued = newcaps;
2382 cap->implemented |= newcaps; 2398 cap->implemented |= newcaps;
2383 } else if (cap->issued == newcaps) { 2399 } else if (cap->issued == newcaps) {
@@ -2568,7 +2584,8 @@ static void handle_cap_trunc(struct inode *inode,
2568 * caller holds s_mutex 2584 * caller holds s_mutex
2569 */ 2585 */
2570static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, 2586static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2571 struct ceph_mds_session *session) 2587 struct ceph_mds_session *session,
2588 int *open_target_sessions)
2572{ 2589{
2573 struct ceph_inode_info *ci = ceph_inode(inode); 2590 struct ceph_inode_info *ci = ceph_inode(inode);
2574 int mds = session->s_mds; 2591 int mds = session->s_mds;
@@ -2600,6 +2617,12 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2600 ci->i_cap_exporting_mds = mds; 2617 ci->i_cap_exporting_mds = mds;
2601 ci->i_cap_exporting_mseq = mseq; 2618 ci->i_cap_exporting_mseq = mseq;
2602 ci->i_cap_exporting_issued = cap->issued; 2619 ci->i_cap_exporting_issued = cap->issued;
2620
2621 /*
2622 * make sure we have open sessions with all possible
2623 * export targets, so that we get the matching IMPORT
2624 */
2625 *open_target_sessions = 1;
2603 } 2626 }
2604 __ceph_remove_cap(cap); 2627 __ceph_remove_cap(cap);
2605 } 2628 }
@@ -2675,6 +2698,10 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2675 u64 size, max_size; 2698 u64 size, max_size;
2676 u64 tid; 2699 u64 tid;
2677 void *snaptrace; 2700 void *snaptrace;
2701 size_t snaptrace_len;
2702 void *flock;
2703 u32 flock_len;
2704 int open_target_sessions = 0;
2678 2705
2679 dout("handle_caps from mds%d\n", mds); 2706 dout("handle_caps from mds%d\n", mds);
2680 2707
@@ -2683,7 +2710,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2683 if (msg->front.iov_len < sizeof(*h)) 2710 if (msg->front.iov_len < sizeof(*h))
2684 goto bad; 2711 goto bad;
2685 h = msg->front.iov_base; 2712 h = msg->front.iov_base;
2686 snaptrace = h + 1;
2687 op = le32_to_cpu(h->op); 2713 op = le32_to_cpu(h->op);
2688 vino.ino = le64_to_cpu(h->ino); 2714 vino.ino = le64_to_cpu(h->ino);
2689 vino.snap = CEPH_NOSNAP; 2715 vino.snap = CEPH_NOSNAP;
@@ -2693,6 +2719,21 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2693 size = le64_to_cpu(h->size); 2719 size = le64_to_cpu(h->size);
2694 max_size = le64_to_cpu(h->max_size); 2720 max_size = le64_to_cpu(h->max_size);
2695 2721
2722 snaptrace = h + 1;
2723 snaptrace_len = le32_to_cpu(h->snap_trace_len);
2724
2725 if (le16_to_cpu(msg->hdr.version) >= 2) {
2726 void *p, *end;
2727
2728 p = snaptrace + snaptrace_len;
2729 end = msg->front.iov_base + msg->front.iov_len;
2730 ceph_decode_32_safe(&p, end, flock_len, bad);
2731 flock = p;
2732 } else {
2733 flock = NULL;
2734 flock_len = 0;
2735 }
2736
2696 mutex_lock(&session->s_mutex); 2737 mutex_lock(&session->s_mutex);
2697 session->s_seq++; 2738 session->s_seq++;
2698 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, 2739 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
@@ -2714,7 +2755,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2714 * along for the mds (who clearly thinks we still have this 2755 * along for the mds (who clearly thinks we still have this
2715 * cap). 2756 * cap).
2716 */ 2757 */
2717 ceph_add_cap_releases(mdsc, session, -1); 2758 ceph_add_cap_releases(mdsc, session);
2718 ceph_send_cap_releases(mdsc, session); 2759 ceph_send_cap_releases(mdsc, session);
2719 goto done; 2760 goto done;
2720 } 2761 }
@@ -2726,12 +2767,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2726 goto done; 2767 goto done;
2727 2768
2728 case CEPH_CAP_OP_EXPORT: 2769 case CEPH_CAP_OP_EXPORT:
2729 handle_cap_export(inode, h, session); 2770 handle_cap_export(inode, h, session, &open_target_sessions);
2730 goto done; 2771 goto done;
2731 2772
2732 case CEPH_CAP_OP_IMPORT: 2773 case CEPH_CAP_OP_IMPORT:
2733 handle_cap_import(mdsc, inode, h, session, 2774 handle_cap_import(mdsc, inode, h, session,
2734 snaptrace, le32_to_cpu(h->snap_trace_len)); 2775 snaptrace, snaptrace_len);
2735 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, 2776 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
2736 session); 2777 session);
2737 goto done_unlocked; 2778 goto done_unlocked;
@@ -2773,6 +2814,8 @@ done:
2773done_unlocked: 2814done_unlocked:
2774 if (inode) 2815 if (inode)
2775 iput(inode); 2816 iput(inode);
2817 if (open_target_sessions)
2818 ceph_mdsc_open_export_target_sessions(mdsc, session);
2776 return; 2819 return;
2777 2820
2778bad: 2821bad:
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
index 793f50cb7c22..5babb8e95352 100644
--- a/fs/ceph/ceph_frag.h
+++ b/fs/ceph/ceph_frag.h
@@ -1,5 +1,5 @@
1#ifndef _FS_CEPH_FRAG_H 1#ifndef FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H 2#define FS_CEPH_FRAG_H
3 3
4/* 4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space, 5 * "Frags" are a way to describe a subset of a 32-bit number space,
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
index 79d76bc4303f..3ac6cc7c1156 100644
--- a/fs/ceph/ceph_fs.c
+++ b/fs/ceph/ceph_fs.c
@@ -29,46 +29,44 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
29 29
30int ceph_flags_to_mode(int flags) 30int ceph_flags_to_mode(int flags)
31{ 31{
32 int mode;
33
32#ifdef O_DIRECTORY /* fixme */ 34#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY) 35 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN; 36 return CEPH_FILE_MODE_PIN;
35#endif 37#endif
38 if ((flags & O_APPEND) == O_APPEND)
39 flags |= O_WRONLY;
40
41 if ((flags & O_ACCMODE) == O_RDWR)
42 mode = CEPH_FILE_MODE_RDWR;
43 else if ((flags & O_ACCMODE) == O_WRONLY)
44 mode = CEPH_FILE_MODE_WR;
45 else
46 mode = CEPH_FILE_MODE_RD;
47
36#ifdef O_LAZY 48#ifdef O_LAZY
37 if (flags & O_LAZY) 49 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY; 50 mode |= CEPH_FILE_MODE_LAZY;
39#endif 51#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42 52
43 flags &= O_ACCMODE; 53 return mode;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49} 54}
50 55
51int ceph_caps_for_mode(int mode) 56int ceph_caps_for_mode(int mode)
52{ 57{
53 switch (mode) { 58 int caps = CEPH_CAP_PIN;
54 case CEPH_FILE_MODE_PIN: 59
55 return CEPH_CAP_PIN; 60 if (mode & CEPH_FILE_MODE_RD)
56 case CEPH_FILE_MODE_RD: 61 caps |= CEPH_CAP_FILE_SHARED |
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE; 62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR: 63 if (mode & CEPH_FILE_MODE_WR)
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED | 64 caps |= CEPH_CAP_FILE_EXCL |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | 65 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL | 66 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; 67 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 } 68 if (mode & CEPH_FILE_MODE_LAZY)
73 return 0; 69 caps |= CEPH_CAP_FILE_LAZYIO;
70
71 return caps;
74} 72}
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 2fa992eaf7da..d5619ac86711 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -9,27 +9,13 @@
9 * LGPL2 9 * LGPL2
10 */ 10 */
11 11
12#ifndef _FS_CEPH_CEPH_FS_H 12#ifndef CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H 13#define CEPH_FS_H
14 14
15#include "msgr.h" 15#include "msgr.h"
16#include "rados.h" 16#include "rados.h"
17 17
18/* 18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 20
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific messages types or high-level 19 * subprotocol versions. when specific messages types or high-level
34 * protocols change, bump the affected components. we keep rev 20 * protocols change, bump the affected components. we keep rev
35 * internal cluster protocols separately from the public, 21 * internal cluster protocols separately from the public,
@@ -53,18 +39,10 @@
53/* 39/*
54 * feature bits 40 * feature bits
55 */ 41 */
56#define CEPH_FEATURE_UID 1 42#define CEPH_FEATURE_UID (1<<0)
57#define CEPH_FEATURE_NOSRCADDR 2 43#define CEPH_FEATURE_NOSRCADDR (1<<1)
58#define CEPH_FEATURE_FLOCK 4 44#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
59 45#define CEPH_FEATURE_FLOCK (1<<3)
60#define CEPH_FEATURE_SUPPORTED_MON CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
61#define CEPH_FEATURE_REQUIRED_MON CEPH_FEATURE_UID
62#define CEPH_FEATURE_SUPPORTED_MDS CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK
63#define CEPH_FEATURE_REQUIRED_MDS CEPH_FEATURE_UID
64#define CEPH_FEATURE_SUPPORTED_OSD CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR
65#define CEPH_FEATURE_REQUIRED_OSD CEPH_FEATURE_UID
66#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
67#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
68 46
69 47
70/* 48/*
@@ -96,6 +74,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
96#define CEPH_CRYPTO_NONE 0x0 74#define CEPH_CRYPTO_NONE 0x0
97#define CEPH_CRYPTO_AES 0x1 75#define CEPH_CRYPTO_AES 0x1
98 76
77#define CEPH_AES_IV "cephsageyudagreg"
78
99/* security/authentication protocols */ 79/* security/authentication protocols */
100#define CEPH_AUTH_UNKNOWN 0x0 80#define CEPH_AUTH_UNKNOWN 0x0
101#define CEPH_AUTH_NONE 0x1 81#define CEPH_AUTH_NONE 0x1
@@ -275,6 +255,7 @@ extern const char *ceph_mds_state_name(int s);
275#define CEPH_LOCK_IDFT 512 /* dir frag tree */ 255#define CEPH_LOCK_IDFT 512 /* dir frag tree */
276#define CEPH_LOCK_INEST 1024 /* mds internal */ 256#define CEPH_LOCK_INEST 1024 /* mds internal */
277#define CEPH_LOCK_IXATTR 2048 257#define CEPH_LOCK_IXATTR 2048
258#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
278#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ 259#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
279 260
280/* client_session ops */ 261/* client_session ops */
@@ -316,6 +297,8 @@ enum {
316 CEPH_MDS_OP_RMXATTR = 0x01106, 297 CEPH_MDS_OP_RMXATTR = 0x01106,
317 CEPH_MDS_OP_SETLAYOUT = 0x01107, 298 CEPH_MDS_OP_SETLAYOUT = 0x01107,
318 CEPH_MDS_OP_SETATTR = 0x01108, 299 CEPH_MDS_OP_SETATTR = 0x01108,
300 CEPH_MDS_OP_SETFILELOCK= 0x01109,
301 CEPH_MDS_OP_GETFILELOCK= 0x00110,
319 302
320 CEPH_MDS_OP_MKNOD = 0x01201, 303 CEPH_MDS_OP_MKNOD = 0x01201,
321 CEPH_MDS_OP_LINK = 0x01202, 304 CEPH_MDS_OP_LINK = 0x01202,
@@ -386,6 +369,15 @@ union ceph_mds_request_args {
386 struct { 369 struct {
387 struct ceph_file_layout layout; 370 struct ceph_file_layout layout;
388 } __attribute__ ((packed)) setlayout; 371 } __attribute__ ((packed)) setlayout;
372 struct {
373 __u8 rule; /* currently fcntl or flock */
374 __u8 type; /* shared, exclusive, remove*/
375 __le64 pid; /* process id requesting the lock */
376 __le64 pid_namespace;
377 __le64 start; /* initial location to lock */
378 __le64 length; /* num bytes to lock from start */
379 __u8 wait; /* will caller wait for lock to become available? */
380 } __attribute__ ((packed)) filelock_change;
389} __attribute__ ((packed)); 381} __attribute__ ((packed));
390 382
391#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */ 383#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
@@ -480,6 +472,23 @@ struct ceph_mds_reply_dirfrag {
480 __le32 dist[]; 472 __le32 dist[];
481} __attribute__ ((packed)); 473} __attribute__ ((packed));
482 474
475#define CEPH_LOCK_FCNTL 1
476#define CEPH_LOCK_FLOCK 2
477
478#define CEPH_LOCK_SHARED 1
479#define CEPH_LOCK_EXCL 2
480#define CEPH_LOCK_UNLOCK 4
481
482struct ceph_filelock {
483 __le64 start;/* file offset to start lock at */
484 __le64 length; /* num bytes to lock; 0 for all following start */
485 __le64 client; /* which client holds the lock */
486 __le64 pid; /* process id holding the lock on the client */
487 __le64 pid_namespace;
488 __u8 type; /* shared lock, exclusive lock, or unlock */
489} __attribute__ ((packed));
490
491
483/* file access modes */ 492/* file access modes */
484#define CEPH_FILE_MODE_PIN 0 493#define CEPH_FILE_MODE_PIN 0
485#define CEPH_FILE_MODE_RD 1 494#define CEPH_FILE_MODE_RD 1
@@ -508,9 +517,10 @@ int ceph_flags_to_mode(int flags);
508#define CEPH_CAP_SAUTH 2 517#define CEPH_CAP_SAUTH 2
509#define CEPH_CAP_SLINK 4 518#define CEPH_CAP_SLINK 4
510#define CEPH_CAP_SXATTR 6 519#define CEPH_CAP_SXATTR 6
511#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */ 520#define CEPH_CAP_SFILE 8
521#define CEPH_CAP_SFLOCK 20
512 522
513#define CEPH_CAP_BITS 16 523#define CEPH_CAP_BITS 22
514 524
515/* composed values */ 525/* composed values */
516#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH) 526#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
@@ -528,6 +538,9 @@ int ceph_flags_to_mode(int flags);
528#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE) 538#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
529#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE) 539#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
530#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE) 540#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
541#define CEPH_CAP_FLOCK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFLOCK)
542#define CEPH_CAP_FLOCK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFLOCK)
543
531 544
532/* cap masks (for getattr) */ 545/* cap masks (for getattr) */
533#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN 546#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
@@ -563,7 +576,8 @@ int ceph_flags_to_mode(int flags);
563 CEPH_CAP_FILE_EXCL) 576 CEPH_CAP_FILE_EXCL)
564#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR) 577#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
565#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \ 578#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
566 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN) 579 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
580 CEPH_CAP_PIN)
567 581
568#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \ 582#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
569 CEPH_LOCK_IXATTR) 583 CEPH_LOCK_IXATTR)
@@ -653,12 +667,21 @@ struct ceph_mds_cap_reconnect {
653 __le64 cap_id; 667 __le64 cap_id;
654 __le32 wanted; 668 __le32 wanted;
655 __le32 issued; 669 __le32 issued;
670 __le64 snaprealm;
671 __le64 pathbase; /* base ino for our path to this ino */
672 __le32 flock_len; /* size of flock state blob, if any */
673} __attribute__ ((packed));
674/* followed by flock blob */
675
676struct ceph_mds_cap_reconnect_v1 {
677 __le64 cap_id;
678 __le32 wanted;
679 __le32 issued;
656 __le64 size; 680 __le64 size;
657 struct ceph_timespec mtime, atime; 681 struct ceph_timespec mtime, atime;
658 __le64 snaprealm; 682 __le64 snaprealm;
659 __le64 pathbase; /* base ino for our path to this ino */ 683 __le64 pathbase; /* base ino for our path to this ino */
660} __attribute__ ((packed)); 684} __attribute__ ((packed));
661/* followed by encoded string */
662 685
663struct ceph_mds_snaprealm_reconnect { 686struct ceph_mds_snaprealm_reconnect {
664 __le64 ino; /* snap realm base */ 687 __le64 ino; /* snap realm base */
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
index 5ac470c433c9..d099c3f90236 100644
--- a/fs/ceph/ceph_hash.h
+++ b/fs/ceph/ceph_hash.h
@@ -1,5 +1,5 @@
1#ifndef _FS_CEPH_HASH_H 1#ifndef FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H 2#define FS_CEPH_HASH_H
3 3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */ 4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */ 5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 7503aee828ce..c6179d3a26a2 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -28,6 +28,7 @@ const char *ceph_osd_op_name(int op)
28 case CEPH_OSD_OP_TRUNCATE: return "truncate"; 28 case CEPH_OSD_OP_TRUNCATE: return "truncate";
29 case CEPH_OSD_OP_ZERO: return "zero"; 29 case CEPH_OSD_OP_ZERO: return "zero";
30 case CEPH_OSD_OP_WRITEFULL: return "writefull"; 30 case CEPH_OSD_OP_WRITEFULL: return "writefull";
31 case CEPH_OSD_OP_ROLLBACK: return "rollback";
31 32
32 case CEPH_OSD_OP_APPEND: return "append"; 33 case CEPH_OSD_OP_APPEND: return "append";
33 case CEPH_OSD_OP_STARTSYNC: return "startsync"; 34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
@@ -129,6 +130,8 @@ const char *ceph_mds_op_name(int op)
129 case CEPH_MDS_OP_LSSNAP: return "lssnap"; 130 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap"; 131 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap"; 132 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
133 case CEPH_MDS_OP_SETFILELOCK: return "setfilelock";
134 case CEPH_MDS_OP_GETFILELOCK: return "getfilelock";
132 } 135 }
133 return "???"; 136 return "???";
134} 137}
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
index dcd7e7523700..97e435b191f4 100644
--- a/fs/ceph/crush/crush.h
+++ b/fs/ceph/crush/crush.h
@@ -1,5 +1,5 @@
1#ifndef _CRUSH_CRUSH_H 1#ifndef CEPH_CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H 2#define CEPH_CRUSH_CRUSH_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5 5
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
index ff48e110e4bb..91e884230d5d 100644
--- a/fs/ceph/crush/hash.h
+++ b/fs/ceph/crush/hash.h
@@ -1,5 +1,5 @@
1#ifndef _CRUSH_HASH_H 1#ifndef CEPH_CRUSH_HASH_H
2#define _CRUSH_HASH_H 2#define CEPH_CRUSH_HASH_H
3 3
4#define CRUSH_HASH_RJENKINS1 0 4#define CRUSH_HASH_RJENKINS1 0
5 5
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
index 98e90046fd9f..c46b99c18bb0 100644
--- a/fs/ceph/crush/mapper.h
+++ b/fs/ceph/crush/mapper.h
@@ -1,5 +1,5 @@
1#ifndef _CRUSH_MAPPER_H 1#ifndef CEPH_CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H 2#define CEPH_CRUSH_MAPPER_H
3 3
4/* 4/*
5 * CRUSH functions for find rules and then mapping an input to an 5 * CRUSH functions for find rules and then mapping an input to an
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
index f704b3b62424..a3e627f63293 100644
--- a/fs/ceph/crypto.c
+++ b/fs/ceph/crypto.c
@@ -75,10 +75,11 @@ static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); 75 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
76} 76}
77 77
78const u8 *aes_iv = "cephsageyudagreg"; 78static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
79 79
80int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len, 80static int ceph_aes_encrypt(const void *key, int key_len,
81 const void *src, size_t src_len) 81 void *dst, size_t *dst_len,
82 const void *src, size_t src_len)
82{ 83{
83 struct scatterlist sg_in[2], sg_out[1]; 84 struct scatterlist sg_in[2], sg_out[1];
84 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 85 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -126,9 +127,10 @@ int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
126 return 0; 127 return 0;
127} 128}
128 129
129int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len, 130static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
130 const void *src1, size_t src1_len, 131 size_t *dst_len,
131 const void *src2, size_t src2_len) 132 const void *src1, size_t src1_len,
133 const void *src2, size_t src2_len)
132{ 134{
133 struct scatterlist sg_in[3], sg_out[1]; 135 struct scatterlist sg_in[3], sg_out[1];
134 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 136 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -179,8 +181,9 @@ int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
179 return 0; 181 return 0;
180} 182}
181 183
182int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len, 184static int ceph_aes_decrypt(const void *key, int key_len,
183 const void *src, size_t src_len) 185 void *dst, size_t *dst_len,
186 const void *src, size_t src_len)
184{ 187{
185 struct scatterlist sg_in[1], sg_out[2]; 188 struct scatterlist sg_in[1], sg_out[2];
186 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 189 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
@@ -238,10 +241,10 @@ int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
238 return 0; 241 return 0;
239} 242}
240 243
241int ceph_aes_decrypt2(const void *key, int key_len, 244static int ceph_aes_decrypt2(const void *key, int key_len,
242 void *dst1, size_t *dst1_len, 245 void *dst1, size_t *dst1_len,
243 void *dst2, size_t *dst2_len, 246 void *dst2, size_t *dst2_len,
244 const void *src, size_t src_len) 247 const void *src, size_t src_len)
245{ 248{
246 struct scatterlist sg_in[1], sg_out[3]; 249 struct scatterlist sg_in[1], sg_out[3];
247 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher(); 250 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
index 40b502e6bd89..bdf38607323c 100644
--- a/fs/ceph/crypto.h
+++ b/fs/ceph/crypto.h
@@ -42,7 +42,7 @@ extern int ceph_encrypt2(struct ceph_crypto_key *secret,
42 const void *src2, size_t src2_len); 42 const void *src2, size_t src2_len);
43 43
44/* armor.c */ 44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end); 45extern int ceph_armor(char *dst, const char *src, const char *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end); 46extern int ceph_unarmor(char *dst, const char *src, const char *end);
47 47
48#endif 48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f2f5332ddbba..360c4f22718d 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -291,7 +291,7 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
291 return 0; 291 return 0;
292} 292}
293 293
294#define DEFINE_SHOW_FUNC(name) \ 294#define DEFINE_SHOW_FUNC(name) \
295static int name##_open(struct inode *inode, struct file *file) \ 295static int name##_open(struct inode *inode, struct file *file) \
296{ \ 296{ \
297 struct seq_file *sf; \ 297 struct seq_file *sf; \
@@ -361,8 +361,8 @@ int ceph_debugfs_client_init(struct ceph_client *client)
361 int ret = 0; 361 int ret = 0;
362 char name[80]; 362 char name[80];
363 363
364 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld", 364 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
365 PR_FSID(&client->fsid), client->monc.auth->global_id); 365 client->monc.auth->global_id);
366 366
367 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); 367 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
368 if (!client->debugfs_dir) 368 if (!client->debugfs_dir)
@@ -432,11 +432,12 @@ int ceph_debugfs_client_init(struct ceph_client *client)
432 if (!client->debugfs_caps) 432 if (!client->debugfs_caps)
433 goto out; 433 goto out;
434 434
435 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb", 435 client->debugfs_congestion_kb =
436 0600, 436 debugfs_create_file("writeback_congestion_kb",
437 client->debugfs_dir, 437 0600,
438 client, 438 client->debugfs_dir,
439 &congestion_kb_fops); 439 client,
440 &congestion_kb_fops);
440 if (!client->debugfs_congestion_kb) 441 if (!client->debugfs_congestion_kb)
441 goto out; 442 goto out;
442 443
@@ -466,7 +467,7 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
466 debugfs_remove(client->debugfs_dir); 467 debugfs_remove(client->debugfs_dir);
467} 468}
468 469
469#else // CONFIG_DEBUG_FS 470#else /* CONFIG_DEBUG_FS */
470 471
471int __init ceph_debugfs_init(void) 472int __init ceph_debugfs_init(void)
472{ 473{
@@ -486,4 +487,4 @@ void ceph_debugfs_client_cleanup(struct ceph_client *client)
486{ 487{
487} 488}
488 489
489#endif // CONFIG_DEBUG_FS 490#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
index 65b3e022eaf5..3d25415afe63 100644
--- a/fs/ceph/decode.h
+++ b/fs/ceph/decode.h
@@ -99,11 +99,13 @@ static inline void ceph_encode_timespec(struct ceph_timespec *tv,
99 */ 99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a) 100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{ 101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family); 102 __be16 ss_family = htons(a->in_addr.ss_family);
103 a->in_addr.ss_family = *(__u16 *)&ss_family;
103} 104}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a) 105static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{ 106{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family); 107 __be16 ss_family = *(__be16 *)&a->in_addr.ss_family;
108 a->in_addr.ss_family = ntohs(ss_family);
107 WARN_ON(a->in_addr.ss_family == 512); 109 WARN_ON(a->in_addr.ss_family == 512);
108} 110}
109 111
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f94ed3c7f6a5..67bbb41d5526 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -27,7 +27,7 @@
27 27
28const struct inode_operations ceph_dir_iops; 28const struct inode_operations ceph_dir_iops;
29const struct file_operations ceph_dir_fops; 29const struct file_operations ceph_dir_fops;
30struct dentry_operations ceph_dentry_ops; 30const struct dentry_operations ceph_dentry_ops;
31 31
32/* 32/*
33 * Initialize ceph dentry state. 33 * Initialize ceph dentry state.
@@ -94,6 +94,8 @@ static unsigned fpos_off(loff_t p)
94 */ 94 */
95static int __dcache_readdir(struct file *filp, 95static int __dcache_readdir(struct file *filp,
96 void *dirent, filldir_t filldir) 96 void *dirent, filldir_t filldir)
97 __releases(inode->i_lock)
98 __acquires(inode->i_lock)
97{ 99{
98 struct inode *inode = filp->f_dentry->d_inode; 100 struct inode *inode = filp->f_dentry->d_inode;
99 struct ceph_file_info *fi = filp->private_data; 101 struct ceph_file_info *fi = filp->private_data;
@@ -1239,16 +1241,16 @@ const struct inode_operations ceph_dir_iops = {
1239 .create = ceph_create, 1241 .create = ceph_create,
1240}; 1242};
1241 1243
1242struct dentry_operations ceph_dentry_ops = { 1244const struct dentry_operations ceph_dentry_ops = {
1243 .d_revalidate = ceph_d_revalidate, 1245 .d_revalidate = ceph_d_revalidate,
1244 .d_release = ceph_dentry_release, 1246 .d_release = ceph_dentry_release,
1245}; 1247};
1246 1248
1247struct dentry_operations ceph_snapdir_dentry_ops = { 1249const struct dentry_operations ceph_snapdir_dentry_ops = {
1248 .d_revalidate = ceph_snapdir_d_revalidate, 1250 .d_revalidate = ceph_snapdir_d_revalidate,
1249 .d_release = ceph_dentry_release, 1251 .d_release = ceph_dentry_release,
1250}; 1252};
1251 1253
1252struct dentry_operations ceph_snap_dentry_ops = { 1254const struct dentry_operations ceph_snap_dentry_ops = {
1253 .d_release = ceph_dentry_release, 1255 .d_release = ceph_dentry_release,
1254}; 1256};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7c08698fad3e..8c044a4f0457 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -317,7 +317,7 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
317/* 317/*
318 * allocate a vector new pages 318 * allocate a vector new pages
319 */ 319 */
320struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags) 320static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{ 321{
322 struct page **pages; 322 struct page **pages;
323 int i; 323 int i;
@@ -665,7 +665,7 @@ more:
665 * throw out any page cache pages in this range. this 665 * throw out any page cache pages in this range. this
666 * may block. 666 * may block.
667 */ 667 */
668 truncate_inode_pages_range(inode->i_mapping, pos, 668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1)); 669 (pos+len) | (PAGE_CACHE_SIZE-1));
670 } else { 670 } else {
671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 671 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
@@ -740,28 +740,32 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
740 unsigned long nr_segs, loff_t pos) 740 unsigned long nr_segs, loff_t pos)
741{ 741{
742 struct file *filp = iocb->ki_filp; 742 struct file *filp = iocb->ki_filp;
743 struct ceph_file_info *fi = filp->private_data;
743 loff_t *ppos = &iocb->ki_pos; 744 loff_t *ppos = &iocb->ki_pos;
744 size_t len = iov->iov_len; 745 size_t len = iov->iov_len;
745 struct inode *inode = filp->f_dentry->d_inode; 746 struct inode *inode = filp->f_dentry->d_inode;
746 struct ceph_inode_info *ci = ceph_inode(inode); 747 struct ceph_inode_info *ci = ceph_inode(inode);
747 void *base = iov->iov_base; 748 void __user *base = iov->iov_base;
748 ssize_t ret; 749 ssize_t ret;
749 int got = 0; 750 int want, got = 0;
750 int checkeof = 0, read = 0; 751 int checkeof = 0, read = 0;
751 752
752 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 753 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
753 inode, ceph_vinop(inode), pos, (unsigned)len, inode); 754 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
754again: 755again:
755 __ceph_do_pending_vmtruncate(inode); 756 __ceph_do_pending_vmtruncate(inode);
756 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, 757 if (fi->fmode & CEPH_FILE_MODE_LAZY)
757 &got, -1); 758 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
759 else
760 want = CEPH_CAP_FILE_CACHE;
761 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1);
758 if (ret < 0) 762 if (ret < 0)
759 goto out; 763 goto out;
760 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 764 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
761 inode, ceph_vinop(inode), pos, (unsigned)len, 765 inode, ceph_vinop(inode), pos, (unsigned)len,
762 ceph_cap_string(got)); 766 ceph_cap_string(got));
763 767
764 if ((got & CEPH_CAP_FILE_CACHE) == 0 || 768 if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 ||
765 (iocb->ki_filp->f_flags & O_DIRECT) || 769 (iocb->ki_filp->f_flags & O_DIRECT) ||
766 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) 770 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
767 /* hmm, this isn't really async... */ 771 /* hmm, this isn't really async... */
@@ -807,11 +811,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
807 unsigned long nr_segs, loff_t pos) 811 unsigned long nr_segs, loff_t pos)
808{ 812{
809 struct file *file = iocb->ki_filp; 813 struct file *file = iocb->ki_filp;
814 struct ceph_file_info *fi = file->private_data;
810 struct inode *inode = file->f_dentry->d_inode; 815 struct inode *inode = file->f_dentry->d_inode;
811 struct ceph_inode_info *ci = ceph_inode(inode); 816 struct ceph_inode_info *ci = ceph_inode(inode);
812 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 817 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
813 loff_t endoff = pos + iov->iov_len; 818 loff_t endoff = pos + iov->iov_len;
814 int got = 0; 819 int want, got = 0;
815 int ret, err; 820 int ret, err;
816 821
817 if (ceph_snap(inode) != CEPH_NOSNAP) 822 if (ceph_snap(inode) != CEPH_NOSNAP)
@@ -824,8 +829,11 @@ retry_snap:
824 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", 829 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
825 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 830 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
826 inode->i_size); 831 inode->i_size);
827 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 832 if (fi->fmode & CEPH_FILE_MODE_LAZY)
828 &got, endoff); 833 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
834 else
835 want = CEPH_CAP_FILE_BUFFER;
836 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff);
829 if (ret < 0) 837 if (ret < 0)
830 goto out; 838 goto out;
831 839
@@ -833,7 +841,7 @@ retry_snap:
833 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, 841 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
834 ceph_cap_string(got)); 842 ceph_cap_string(got));
835 843
836 if ((got & CEPH_CAP_FILE_BUFFER) == 0 || 844 if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 ||
837 (iocb->ki_filp->f_flags & O_DIRECT) || 845 (iocb->ki_filp->f_flags & O_DIRECT) ||
838 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) { 846 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
839 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, 847 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
@@ -930,6 +938,8 @@ const struct file_operations ceph_file_fops = {
930 .aio_write = ceph_aio_write, 938 .aio_write = ceph_aio_write,
931 .mmap = ceph_mmap, 939 .mmap = ceph_mmap,
932 .fsync = ceph_fsync, 940 .fsync = ceph_fsync,
941 .lock = ceph_lock,
942 .flock = ceph_flock,
933 .splice_read = generic_file_splice_read, 943 .splice_read = generic_file_splice_read,
934 .splice_write = generic_file_splice_write, 944 .splice_write = generic_file_splice_write,
935 .unlocked_ioctl = ceph_ioctl, 945 .unlocked_ioctl = ceph_ioctl,
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 389f9dbd9949..5d893d31e399 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -442,8 +442,9 @@ int ceph_fill_file_size(struct inode *inode, int issued,
442 * the file is either opened or mmaped 442 * the file is either opened or mmaped
443 */ 443 */
444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD| 444 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER| 445 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
446 CEPH_CAP_FILE_EXCL)) || 446 CEPH_CAP_FILE_EXCL|
447 CEPH_CAP_FILE_LAZYIO)) ||
447 mapping_mapped(inode->i_mapping) || 448 mapping_mapped(inode->i_mapping) ||
448 __ceph_caps_file_wanted(ci)) { 449 __ceph_caps_file_wanted(ci)) {
449 ci->i_truncate_pending++; 450 ci->i_truncate_pending++;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index d085f07756b4..76e307d2aba1 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -143,6 +143,27 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
143 return 0; 143 return 0;
144} 144}
145 145
146static long ceph_ioctl_lazyio(struct file *file)
147{
148 struct ceph_file_info *fi = file->private_data;
149 struct inode *inode = file->f_dentry->d_inode;
150 struct ceph_inode_info *ci = ceph_inode(inode);
151
152 if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
153 spin_lock(&inode->i_lock);
154 ci->i_nr_by_mode[fi->fmode]--;
155 fi->fmode |= CEPH_FILE_MODE_LAZY;
156 ci->i_nr_by_mode[fi->fmode]++;
157 spin_unlock(&inode->i_lock);
158 dout("ioctl_layzio: file %p marked lazy\n", file);
159
160 ceph_check_caps(ci, 0, NULL);
161 } else {
162 dout("ioctl_layzio: file %p already lazy\n", file);
163 }
164 return 0;
165}
166
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 167long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{ 168{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg); 169 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
@@ -155,6 +176,9 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
155 176
156 case CEPH_IOC_GET_DATALOC: 177 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg); 178 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
179
180 case CEPH_IOC_LAZYIO:
181 return ceph_ioctl_lazyio(file);
158 } 182 }
159 return -ENOTTY; 183 return -ENOTTY;
160} 184}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 25e4f1a9d059..88451a3b6857 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -37,4 +37,6 @@ struct ceph_ioctl_dataloc {
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \ 37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc) 38 struct ceph_ioctl_dataloc)
39 39
40#define CEPH_IOC_LAZYIO _IO(CEPH_IOCTL_MAGIC, 4)
41
40#endif 42#endif
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
new file mode 100644
index 000000000000..ae85af06454f
--- /dev/null
+++ b/fs/ceph/locks.c
@@ -0,0 +1,256 @@
1#include "ceph_debug.h"
2
3#include <linux/file.h>
4#include <linux/namei.h>
5
6#include "super.h"
7#include "mds_client.h"
8#include "pagelist.h"
9
10/**
11 * Implement fcntl and flock locking functions.
12 */
13static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
14 u64 pid, u64 pid_ns,
15 int cmd, u64 start, u64 length, u8 wait)
16{
17 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc =
19 &ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req;
21 int err;
22
23 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS);
24 if (IS_ERR(req))
25 return PTR_ERR(req);
26 req->r_inode = igrab(inode);
27
28 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
29 "length: %llu, wait: %d, type`: %d", (int)lock_type,
30 (int)operation, pid, start, length, wait, cmd);
31
32 req->r_args.filelock_change.rule = lock_type;
33 req->r_args.filelock_change.type = cmd;
34 req->r_args.filelock_change.pid = cpu_to_le64(pid);
35 /* This should be adjusted, but I'm not sure if
36 namespaces actually get id numbers*/
37 req->r_args.filelock_change.pid_namespace =
38 cpu_to_le64((u64)pid_ns);
39 req->r_args.filelock_change.start = cpu_to_le64(start);
40 req->r_args.filelock_change.length = cpu_to_le64(length);
41 req->r_args.filelock_change.wait = wait;
42
43 err = ceph_mdsc_do_request(mdsc, inode, req);
44 ceph_mdsc_put_request(req);
45 dout("ceph_lock_message: rule: %d, op: %d, pid: %llu, start: %llu, "
46 "length: %llu, wait: %d, type`: %d err code %d", (int)lock_type,
47 (int)operation, pid, start, length, wait, cmd, err);
48 return err;
49}
50
51/**
52 * Attempt to set an fcntl lock.
53 * For now, this just goes away to the server. Later it may be more awesome.
54 */
55int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
56{
57 u64 length;
58 u8 lock_cmd;
59 int err;
60 u8 wait = 0;
61 u16 op = CEPH_MDS_OP_SETFILELOCK;
62
63 fl->fl_nspid = get_pid(task_tgid(current));
64 dout("ceph_lock, fl_pid:%d", fl->fl_pid);
65
66 /* set wait bit as appropriate, then make command as Ceph expects it*/
67 if (F_SETLKW == cmd)
68 wait = 1;
69 if (F_GETLK == cmd)
70 op = CEPH_MDS_OP_GETFILELOCK;
71
72 if (F_RDLCK == fl->fl_type)
73 lock_cmd = CEPH_LOCK_SHARED;
74 else if (F_WRLCK == fl->fl_type)
75 lock_cmd = CEPH_LOCK_EXCL;
76 else
77 lock_cmd = CEPH_LOCK_UNLOCK;
78
79 if (LLONG_MAX == fl->fl_end)
80 length = 0;
81 else
82 length = fl->fl_end - fl->fl_start + 1;
83
84 err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
85 (u64)fl->fl_pid, (u64)fl->fl_nspid,
86 lock_cmd, fl->fl_start,
87 length, wait);
88 if (!err) {
89 dout("mds locked, locking locally");
90 err = posix_lock_file(file, fl, NULL);
91 if (err && (CEPH_MDS_OP_SETFILELOCK == op)) {
92 /* undo! This should only happen if the kernel detects
93 * local deadlock. */
94 ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
95 (u64)fl->fl_pid, (u64)fl->fl_nspid,
96 CEPH_LOCK_UNLOCK, fl->fl_start,
97 length, 0);
98 dout("got %d on posix_lock_file, undid lock", err);
99 }
100 } else {
101 dout("mds returned error code %d", err);
102 }
103 return err;
104}
105
106int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
107{
108 u64 length;
109 u8 lock_cmd;
110 int err;
111 u8 wait = 1;
112
113 fl->fl_nspid = get_pid(task_tgid(current));
114 dout("ceph_flock, fl_pid:%d", fl->fl_pid);
115
116 /* set wait bit, then clear it out of cmd*/
117 if (cmd & LOCK_NB)
118 wait = 0;
119 cmd = cmd & (LOCK_SH | LOCK_EX | LOCK_UN);
120 /* set command sequence that Ceph wants to see:
121 shared lock, exclusive lock, or unlock */
122 if (LOCK_SH == cmd)
123 lock_cmd = CEPH_LOCK_SHARED;
124 else if (LOCK_EX == cmd)
125 lock_cmd = CEPH_LOCK_EXCL;
126 else
127 lock_cmd = CEPH_LOCK_UNLOCK;
128 /* mds requires start and length rather than start and end */
129 if (LLONG_MAX == fl->fl_end)
130 length = 0;
131 else
132 length = fl->fl_end - fl->fl_start + 1;
133
134 err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
135 file, (u64)fl->fl_pid, (u64)fl->fl_nspid,
136 lock_cmd, fl->fl_start,
137 length, wait);
138 if (!err) {
139 err = flock_lock_file_wait(file, fl);
140 if (err) {
141 ceph_lock_message(CEPH_LOCK_FLOCK,
142 CEPH_MDS_OP_SETFILELOCK,
143 file, (u64)fl->fl_pid,
144 (u64)fl->fl_nspid,
145 CEPH_LOCK_UNLOCK, fl->fl_start,
146 length, 0);
147 dout("got %d on flock_lock_file_wait, undid lock", err);
148 }
149 } else {
150 dout("mds error code %d", err);
151 }
152 return err;
153}
154
155/**
156 * Must be called with BKL already held. Fills in the passed
157 * counter variables, so you can prepare pagelist metadata before calling
158 * ceph_encode_locks.
159 */
160void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
161{
162 struct file_lock *lock;
163
164 *fcntl_count = 0;
165 *flock_count = 0;
166
167 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
168 if (lock->fl_flags & FL_POSIX)
169 ++(*fcntl_count);
170 else if (lock->fl_flags & FL_FLOCK)
171 ++(*flock_count);
172 }
173 dout("counted %d flock locks and %d fcntl locks",
174 *flock_count, *fcntl_count);
175}
176
177/**
178 * Encode the flock and fcntl locks for the given inode into the pagelist.
179 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
180 * sequential flock locks.
181 * Must be called with BLK already held, and the lock numbers should have
182 * been gathered under the same lock holding window.
183 */
184int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
185 int num_fcntl_locks, int num_flock_locks)
186{
187 struct file_lock *lock;
188 struct ceph_filelock cephlock;
189 int err = 0;
190
191 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
192 num_fcntl_locks);
193 err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32));
194 if (err)
195 goto fail;
196 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
197 if (lock->fl_flags & FL_POSIX) {
198 err = lock_to_ceph_filelock(lock, &cephlock);
199 if (err)
200 goto fail;
201 err = ceph_pagelist_append(pagelist, &cephlock,
202 sizeof(struct ceph_filelock));
203 }
204 if (err)
205 goto fail;
206 }
207
208 err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32));
209 if (err)
210 goto fail;
211 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
212 if (lock->fl_flags & FL_FLOCK) {
213 err = lock_to_ceph_filelock(lock, &cephlock);
214 if (err)
215 goto fail;
216 err = ceph_pagelist_append(pagelist, &cephlock,
217 sizeof(struct ceph_filelock));
218 }
219 if (err)
220 goto fail;
221 }
222fail:
223 return err;
224}
225
226/*
227 * Given a pointer to a lock, convert it to a ceph filelock
228 */
229int lock_to_ceph_filelock(struct file_lock *lock,
230 struct ceph_filelock *cephlock)
231{
232 int err = 0;
233
234 cephlock->start = cpu_to_le64(lock->fl_start);
235 cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
236 cephlock->client = cpu_to_le64(0);
237 cephlock->pid = cpu_to_le64(lock->fl_pid);
238 cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid);
239
240 switch (lock->fl_type) {
241 case F_RDLCK:
242 cephlock->type = CEPH_LOCK_SHARED;
243 break;
244 case F_WRLCK:
245 cephlock->type = CEPH_LOCK_EXCL;
246 break;
247 case F_UNLCK:
248 cephlock->type = CEPH_LOCK_UNLOCK;
249 break;
250 default:
251 dout("Have unknown lock type %d", lock->fl_type);
252 err = -EINVAL;
253 }
254
255 return err;
256}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index dd440bd438a9..a75ddbf9fe37 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3,6 +3,7 @@
3#include <linux/wait.h> 3#include <linux/wait.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/sched.h> 5#include <linux/sched.h>
6#include <linux/smp_lock.h>
6 7
7#include "mds_client.h" 8#include "mds_client.h"
8#include "mon_client.h" 9#include "mon_client.h"
@@ -37,6 +38,11 @@
37 * are no longer valid. 38 * are no longer valid.
38 */ 39 */
39 40
41struct ceph_reconnect_state {
42 struct ceph_pagelist *pagelist;
43 bool flock;
44};
45
40static void __wake_requests(struct ceph_mds_client *mdsc, 46static void __wake_requests(struct ceph_mds_client *mdsc,
41 struct list_head *head); 47 struct list_head *head);
42 48
@@ -449,7 +455,7 @@ void ceph_mdsc_release_request(struct kref *kref)
449 kfree(req->r_path1); 455 kfree(req->r_path1);
450 kfree(req->r_path2); 456 kfree(req->r_path2);
451 put_request_session(req); 457 put_request_session(req);
452 ceph_unreserve_caps(&req->r_caps_reservation); 458 ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation);
453 kfree(req); 459 kfree(req);
454} 460}
455 461
@@ -512,7 +518,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
512{ 518{
513 req->r_tid = ++mdsc->last_tid; 519 req->r_tid = ++mdsc->last_tid;
514 if (req->r_num_caps) 520 if (req->r_num_caps)
515 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps); 521 ceph_reserve_caps(mdsc, &req->r_caps_reservation,
522 req->r_num_caps);
516 dout("__register_request %p tid %lld\n", req, req->r_tid); 523 dout("__register_request %p tid %lld\n", req, req->r_tid);
517 ceph_mdsc_get_request(req); 524 ceph_mdsc_get_request(req);
518 __insert_request(mdsc, req); 525 __insert_request(mdsc, req);
@@ -704,6 +711,51 @@ static int __open_session(struct ceph_mds_client *mdsc,
704} 711}
705 712
706/* 713/*
714 * open sessions for any export targets for the given mds
715 *
716 * called under mdsc->mutex
717 */
718static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
719 struct ceph_mds_session *session)
720{
721 struct ceph_mds_info *mi;
722 struct ceph_mds_session *ts;
723 int i, mds = session->s_mds;
724 int target;
725
726 if (mds >= mdsc->mdsmap->m_max_mds)
727 return;
728 mi = &mdsc->mdsmap->m_info[mds];
729 dout("open_export_target_sessions for mds%d (%d targets)\n",
730 session->s_mds, mi->num_export_targets);
731
732 for (i = 0; i < mi->num_export_targets; i++) {
733 target = mi->export_targets[i];
734 ts = __ceph_lookup_mds_session(mdsc, target);
735 if (!ts) {
736 ts = register_session(mdsc, target);
737 if (IS_ERR(ts))
738 return;
739 }
740 if (session->s_state == CEPH_MDS_SESSION_NEW ||
741 session->s_state == CEPH_MDS_SESSION_CLOSING)
742 __open_session(mdsc, session);
743 else
744 dout(" mds%d target mds%d %p is %s\n", session->s_mds,
745 i, ts, session_state_name(ts->s_state));
746 ceph_put_mds_session(ts);
747 }
748}
749
750void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
751 struct ceph_mds_session *session)
752{
753 mutex_lock(&mdsc->mutex);
754 __open_export_target_sessions(mdsc, session);
755 mutex_unlock(&mdsc->mutex);
756}
757
758/*
707 * session caps 759 * session caps
708 */ 760 */
709 761
@@ -764,7 +816,7 @@ static int iterate_session_caps(struct ceph_mds_session *session,
764 last_inode = NULL; 816 last_inode = NULL;
765 } 817 }
766 if (old_cap) { 818 if (old_cap) {
767 ceph_put_cap(old_cap); 819 ceph_put_cap(session->s_mdsc, old_cap);
768 old_cap = NULL; 820 old_cap = NULL;
769 } 821 }
770 822
@@ -793,7 +845,7 @@ out:
793 if (last_inode) 845 if (last_inode)
794 iput(last_inode); 846 iput(last_inode);
795 if (old_cap) 847 if (old_cap)
796 ceph_put_cap(old_cap); 848 ceph_put_cap(session->s_mdsc, old_cap);
797 849
798 return ret; 850 return ret;
799} 851}
@@ -1067,15 +1119,16 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1067 * Called under s_mutex. 1119 * Called under s_mutex.
1068 */ 1120 */
1069int ceph_add_cap_releases(struct ceph_mds_client *mdsc, 1121int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1070 struct ceph_mds_session *session, 1122 struct ceph_mds_session *session)
1071 int extra)
1072{ 1123{
1073 struct ceph_msg *msg; 1124 struct ceph_msg *msg, *partial = NULL;
1074 struct ceph_mds_cap_release *head; 1125 struct ceph_mds_cap_release *head;
1075 int err = -ENOMEM; 1126 int err = -ENOMEM;
1127 int extra = mdsc->client->mount_args->cap_release_safety;
1128 int num;
1076 1129
1077 if (extra < 0) 1130 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
1078 extra = mdsc->client->mount_args->cap_release_safety; 1131 extra);
1079 1132
1080 spin_lock(&session->s_cap_lock); 1133 spin_lock(&session->s_cap_lock);
1081 1134
@@ -1084,9 +1137,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1084 struct ceph_msg, 1137 struct ceph_msg,
1085 list_head); 1138 list_head);
1086 head = msg->front.iov_base; 1139 head = msg->front.iov_base;
1087 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num); 1140 num = le32_to_cpu(head->num);
1141 if (num) {
1142 dout(" partial %p with (%d/%d)\n", msg, num,
1143 (int)CEPH_CAPS_PER_RELEASE);
1144 extra += CEPH_CAPS_PER_RELEASE - num;
1145 partial = msg;
1146 }
1088 } 1147 }
1089
1090 while (session->s_num_cap_releases < session->s_nr_caps + extra) { 1148 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1091 spin_unlock(&session->s_cap_lock); 1149 spin_unlock(&session->s_cap_lock);
1092 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, 1150 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
@@ -1103,19 +1161,14 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1103 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; 1161 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1104 } 1162 }
1105 1163
1106 if (!list_empty(&session->s_cap_releases)) { 1164 if (partial) {
1107 msg = list_first_entry(&session->s_cap_releases, 1165 head = partial->front.iov_base;
1108 struct ceph_msg, 1166 num = le32_to_cpu(head->num);
1109 list_head); 1167 dout(" queueing partial %p with %d/%d\n", partial, num,
1110 head = msg->front.iov_base; 1168 (int)CEPH_CAPS_PER_RELEASE);
1111 if (head->num) { 1169 list_move_tail(&partial->list_head,
1112 dout(" queueing non-full %p (%d)\n", msg, 1170 &session->s_cap_releases_done);
1113 le32_to_cpu(head->num)); 1171 session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num;
1114 list_move_tail(&msg->list_head,
1115 &session->s_cap_releases_done);
1116 session->s_num_cap_releases -=
1117 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1118 }
1119 } 1172 }
1120 err = 0; 1173 err = 0;
1121 spin_unlock(&session->s_cap_lock); 1174 spin_unlock(&session->s_cap_lock);
@@ -1250,6 +1303,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1250 return ERR_PTR(-ENOMEM); 1303 return ERR_PTR(-ENOMEM);
1251 1304
1252 mutex_init(&req->r_fill_mutex); 1305 mutex_init(&req->r_fill_mutex);
1306 req->r_mdsc = mdsc;
1253 req->r_started = jiffies; 1307 req->r_started = jiffies;
1254 req->r_resend_mds = -1; 1308 req->r_resend_mds = -1;
1255 INIT_LIST_HEAD(&req->r_unsafe_dir_item); 1309 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1580,6 +1634,15 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1580 1634
1581 req->r_mds = mds; 1635 req->r_mds = mds;
1582 req->r_attempts++; 1636 req->r_attempts++;
1637 if (req->r_inode) {
1638 struct ceph_cap *cap =
1639 ceph_get_cap_for_mds(ceph_inode(req->r_inode), mds);
1640
1641 if (cap)
1642 req->r_sent_on_mseq = cap->mseq;
1643 else
1644 req->r_sent_on_mseq = -1;
1645 }
1583 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req, 1646 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1584 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts); 1647 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1585 1648
@@ -1914,21 +1977,40 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1914 result = le32_to_cpu(head->result); 1977 result = le32_to_cpu(head->result);
1915 1978
1916 /* 1979 /*
1917 * Tolerate 2 consecutive ESTALEs from the same mds. 1980 * Handle an ESTALE
1918 * FIXME: we should be looking at the cap migrate_seq. 1981 * if we're not talking to the authority, send to them
1982 * if the authority has changed while we weren't looking,
1983 * send to new authority
1984 * Otherwise we just have to return an ESTALE
1919 */ 1985 */
1920 if (result == -ESTALE) { 1986 if (result == -ESTALE) {
1921 req->r_direct_mode = USE_AUTH_MDS; 1987 dout("got ESTALE on request %llu", req->r_tid);
1922 req->r_num_stale++; 1988 if (!req->r_inode) {
1923 if (req->r_num_stale <= 2) { 1989 /* do nothing; not an authority problem */
1990 } else if (req->r_direct_mode != USE_AUTH_MDS) {
1991 dout("not using auth, setting for that now");
1992 req->r_direct_mode = USE_AUTH_MDS;
1924 __do_request(mdsc, req); 1993 __do_request(mdsc, req);
1925 mutex_unlock(&mdsc->mutex); 1994 mutex_unlock(&mdsc->mutex);
1926 goto out; 1995 goto out;
1996 } else {
1997 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
1998 struct ceph_cap *cap =
1999 ceph_get_cap_for_mds(ci, req->r_mds);;
2000
2001 dout("already using auth");
2002 if ((!cap || cap != ci->i_auth_cap) ||
2003 (cap->mseq != req->r_sent_on_mseq)) {
2004 dout("but cap changed, so resending");
2005 __do_request(mdsc, req);
2006 mutex_unlock(&mdsc->mutex);
2007 goto out;
2008 }
1927 } 2009 }
1928 } else { 2010 dout("have to return ESTALE on request %llu", req->r_tid);
1929 req->r_num_stale = 0;
1930 } 2011 }
1931 2012
2013
1932 if (head->safe) { 2014 if (head->safe) {
1933 req->r_got_safe = true; 2015 req->r_got_safe = true;
1934 __unregister_request(mdsc, req); 2016 __unregister_request(mdsc, req);
@@ -1985,7 +2067,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1985 if (err == 0) { 2067 if (err == 0) {
1986 if (result == 0 && rinfo->dir_nr) 2068 if (result == 0 && rinfo->dir_nr)
1987 ceph_readdir_prepopulate(req, req->r_session); 2069 ceph_readdir_prepopulate(req, req->r_session);
1988 ceph_unreserve_caps(&req->r_caps_reservation); 2070 ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
1989 } 2071 }
1990 mutex_unlock(&req->r_fill_mutex); 2072 mutex_unlock(&req->r_fill_mutex);
1991 2073
@@ -2005,7 +2087,7 @@ out_err:
2005 } 2087 }
2006 mutex_unlock(&mdsc->mutex); 2088 mutex_unlock(&mdsc->mutex);
2007 2089
2008 ceph_add_cap_releases(mdsc, req->r_session, -1); 2090 ceph_add_cap_releases(mdsc, req->r_session);
2009 mutex_unlock(&session->s_mutex); 2091 mutex_unlock(&session->s_mutex);
2010 2092
2011 /* kick calling process */ 2093 /* kick calling process */
@@ -2193,9 +2275,14 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2193static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, 2275static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2194 void *arg) 2276 void *arg)
2195{ 2277{
2196 struct ceph_mds_cap_reconnect rec; 2278 union {
2279 struct ceph_mds_cap_reconnect v2;
2280 struct ceph_mds_cap_reconnect_v1 v1;
2281 } rec;
2282 size_t reclen;
2197 struct ceph_inode_info *ci; 2283 struct ceph_inode_info *ci;
2198 struct ceph_pagelist *pagelist = arg; 2284 struct ceph_reconnect_state *recon_state = arg;
2285 struct ceph_pagelist *pagelist = recon_state->pagelist;
2199 char *path; 2286 char *path;
2200 int pathlen, err; 2287 int pathlen, err;
2201 u64 pathbase; 2288 u64 pathbase;
@@ -2228,17 +2315,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2228 spin_lock(&inode->i_lock); 2315 spin_lock(&inode->i_lock);
2229 cap->seq = 0; /* reset cap seq */ 2316 cap->seq = 0; /* reset cap seq */
2230 cap->issue_seq = 0; /* and issue_seq */ 2317 cap->issue_seq = 0; /* and issue_seq */
2231 rec.cap_id = cpu_to_le64(cap->cap_id); 2318
2232 rec.pathbase = cpu_to_le64(pathbase); 2319 if (recon_state->flock) {
2233 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); 2320 rec.v2.cap_id = cpu_to_le64(cap->cap_id);
2234 rec.issued = cpu_to_le32(cap->issued); 2321 rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2235 rec.size = cpu_to_le64(inode->i_size); 2322 rec.v2.issued = cpu_to_le32(cap->issued);
2236 ceph_encode_timespec(&rec.mtime, &inode->i_mtime); 2323 rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2237 ceph_encode_timespec(&rec.atime, &inode->i_atime); 2324 rec.v2.pathbase = cpu_to_le64(pathbase);
2238 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); 2325 rec.v2.flock_len = 0;
2326 reclen = sizeof(rec.v2);
2327 } else {
2328 rec.v1.cap_id = cpu_to_le64(cap->cap_id);
2329 rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2330 rec.v1.issued = cpu_to_le32(cap->issued);
2331 rec.v1.size = cpu_to_le64(inode->i_size);
2332 ceph_encode_timespec(&rec.v1.mtime, &inode->i_mtime);
2333 ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
2334 rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2335 rec.v1.pathbase = cpu_to_le64(pathbase);
2336 reclen = sizeof(rec.v1);
2337 }
2239 spin_unlock(&inode->i_lock); 2338 spin_unlock(&inode->i_lock);
2240 2339
2241 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec)); 2340 if (recon_state->flock) {
2341 int num_fcntl_locks, num_flock_locks;
2342
2343 lock_kernel();
2344 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2345 rec.v2.flock_len = (2*sizeof(u32) +
2346 (num_fcntl_locks+num_flock_locks) *
2347 sizeof(struct ceph_filelock));
2348
2349 err = ceph_pagelist_append(pagelist, &rec, reclen);
2350 if (!err)
2351 err = ceph_encode_locks(inode, pagelist,
2352 num_fcntl_locks,
2353 num_flock_locks);
2354 unlock_kernel();
2355 }
2242 2356
2243out: 2357out:
2244 kfree(path); 2358 kfree(path);
@@ -2267,6 +2381,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2267 int mds = session->s_mds; 2381 int mds = session->s_mds;
2268 int err = -ENOMEM; 2382 int err = -ENOMEM;
2269 struct ceph_pagelist *pagelist; 2383 struct ceph_pagelist *pagelist;
2384 struct ceph_reconnect_state recon_state;
2270 2385
2271 pr_info("mds%d reconnect start\n", mds); 2386 pr_info("mds%d reconnect start\n", mds);
2272 2387
@@ -2301,7 +2416,10 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2301 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps); 2416 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2302 if (err) 2417 if (err)
2303 goto fail; 2418 goto fail;
2304 err = iterate_session_caps(session, encode_caps_cb, pagelist); 2419
2420 recon_state.pagelist = pagelist;
2421 recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
2422 err = iterate_session_caps(session, encode_caps_cb, &recon_state);
2305 if (err < 0) 2423 if (err < 0)
2306 goto fail; 2424 goto fail;
2307 2425
@@ -2326,6 +2444,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2326 } 2444 }
2327 2445
2328 reply->pagelist = pagelist; 2446 reply->pagelist = pagelist;
2447 if (recon_state.flock)
2448 reply->hdr.version = cpu_to_le16(2);
2329 reply->hdr.data_len = cpu_to_le32(pagelist->length); 2449 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2330 reply->nr_pages = calc_pages_for(0, pagelist->length); 2450 reply->nr_pages = calc_pages_for(0, pagelist->length);
2331 ceph_con_send(&session->s_con, reply); 2451 ceph_con_send(&session->s_con, reply);
@@ -2376,9 +2496,11 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2376 oldstate = ceph_mdsmap_get_state(oldmap, i); 2496 oldstate = ceph_mdsmap_get_state(oldmap, i);
2377 newstate = ceph_mdsmap_get_state(newmap, i); 2497 newstate = ceph_mdsmap_get_state(newmap, i);
2378 2498
2379 dout("check_new_map mds%d state %s -> %s (session %s)\n", 2499 dout("check_new_map mds%d state %s%s -> %s%s (session %s)\n",
2380 i, ceph_mds_state_name(oldstate), 2500 i, ceph_mds_state_name(oldstate),
2501 ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "",
2381 ceph_mds_state_name(newstate), 2502 ceph_mds_state_name(newstate),
2503 ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
2382 session_state_name(s->s_state)); 2504 session_state_name(s->s_state));
2383 2505
2384 if (memcmp(ceph_mdsmap_get_addr(oldmap, i), 2506 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
@@ -2428,6 +2550,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2428 wake_up_session_caps(s, 1); 2550 wake_up_session_caps(s, 1);
2429 } 2551 }
2430 } 2552 }
2553
2554 for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) {
2555 s = mdsc->sessions[i];
2556 if (!s)
2557 continue;
2558 if (!ceph_mdsmap_is_laggy(newmap, i))
2559 continue;
2560 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2561 s->s_state == CEPH_MDS_SESSION_HUNG ||
2562 s->s_state == CEPH_MDS_SESSION_CLOSING) {
2563 dout(" connecting to export targets of laggy mds%d\n",
2564 i);
2565 __open_export_target_sessions(mdsc, s);
2566 }
2567 }
2431} 2568}
2432 2569
2433 2570
@@ -2715,7 +2852,7 @@ static void delayed_work(struct work_struct *work)
2715 send_renew_caps(mdsc, s); 2852 send_renew_caps(mdsc, s);
2716 else 2853 else
2717 ceph_con_keepalive(&s->s_con); 2854 ceph_con_keepalive(&s->s_con);
2718 ceph_add_cap_releases(mdsc, s, -1); 2855 ceph_add_cap_releases(mdsc, s);
2719 if (s->s_state == CEPH_MDS_SESSION_OPEN || 2856 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2720 s->s_state == CEPH_MDS_SESSION_HUNG) 2857 s->s_state == CEPH_MDS_SESSION_HUNG)
2721 ceph_send_cap_releases(mdsc, s); 2858 ceph_send_cap_releases(mdsc, s);
@@ -2764,6 +2901,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2764 spin_lock_init(&mdsc->dentry_lru_lock); 2901 spin_lock_init(&mdsc->dentry_lru_lock);
2765 INIT_LIST_HEAD(&mdsc->dentry_lru); 2902 INIT_LIST_HEAD(&mdsc->dentry_lru);
2766 2903
2904 ceph_caps_init(mdsc);
2905 ceph_adjust_min_caps(mdsc, client->min_caps);
2906
2767 return 0; 2907 return 0;
2768} 2908}
2769 2909
@@ -2959,6 +3099,7 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2959 if (mdsc->mdsmap) 3099 if (mdsc->mdsmap)
2960 ceph_mdsmap_destroy(mdsc->mdsmap); 3100 ceph_mdsmap_destroy(mdsc->mdsmap);
2961 kfree(mdsc->sessions); 3101 kfree(mdsc->sessions);
3102 ceph_caps_finalize(mdsc);
2962} 3103}
2963 3104
2964 3105
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 952410c60d09..ab7e89f5e344 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -151,6 +151,7 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
151struct ceph_mds_request { 151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */ 152 u64 r_tid; /* transaction id */
153 struct rb_node r_node; 153 struct rb_node r_node;
154 struct ceph_mds_client *r_mdsc;
154 155
155 int r_op; /* mds op code */ 156 int r_op; /* mds op code */
156 int r_mds; 157 int r_mds;
@@ -207,8 +208,8 @@ struct ceph_mds_request {
207 208
208 int r_attempts; /* resend attempts */ 209 int r_attempts; /* resend attempts */
209 int r_num_fwd; /* number of forward attempts */ 210 int r_num_fwd; /* number of forward attempts */
210 int r_num_stale;
211 int r_resend_mds; /* mds to resend to next, if any*/ 211 int r_resend_mds; /* mds to resend to next, if any*/
212 u32 r_sent_on_mseq; /* cap mseq request was sent at*/
212 213
213 struct kref r_kref; 214 struct kref r_kref;
214 struct list_head r_wait; 215 struct list_head r_wait;
@@ -267,6 +268,27 @@ struct ceph_mds_client {
267 spinlock_t cap_dirty_lock; /* protects above items */ 268 spinlock_t cap_dirty_lock; /* protects above items */
268 wait_queue_head_t cap_flushing_wq; 269 wait_queue_head_t cap_flushing_wq;
269 270
271 /*
272 * Cap reservations
273 *
274 * Maintain a global pool of preallocated struct ceph_caps, referenced
275 * by struct ceph_caps_reservations. This ensures that we preallocate
276 * memory needed to successfully process an MDS response. (If an MDS
277 * sends us cap information and we fail to process it, we will have
278 * problems due to the client and MDS being out of sync.)
279 *
280 * Reservations are 'owned' by a ceph_cap_reservation context.
281 */
282 spinlock_t caps_list_lock;
283 struct list_head caps_list; /* unused (reserved or
284 unreserved) */
285 int caps_total_count; /* total caps allocated */
286 int caps_use_count; /* in use */
287 int caps_reserve_count; /* unused, reserved */
288 int caps_avail_count; /* unused, unreserved */
289 int caps_min_count; /* keep at least this many
290 (unreserved) */
291
270#ifdef CONFIG_DEBUG_FS 292#ifdef CONFIG_DEBUG_FS
271 struct dentry *debugfs_file; 293 struct dentry *debugfs_file;
272#endif 294#endif
@@ -324,8 +346,7 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
324} 346}
325 347
326extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, 348extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
327 struct ceph_mds_session *session, 349 struct ceph_mds_session *session);
328 int extra);
329extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, 350extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
330 struct ceph_mds_session *session); 351 struct ceph_mds_session *session);
331 352
@@ -343,4 +364,7 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
343extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, 364extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
344 struct ceph_msg *msg); 365 struct ceph_msg *msg);
345 366
367extern void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc,
368 struct ceph_mds_session *session);
369
346#endif 370#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index c4c498e6dfef..040be6d1150b 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -85,6 +85,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
85 struct ceph_entity_addr addr; 85 struct ceph_entity_addr addr;
86 u32 num_export_targets; 86 u32 num_export_targets;
87 void *pexport_targets = NULL; 87 void *pexport_targets = NULL;
88 struct ceph_timespec laggy_since;
88 89
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad); 90 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p); 91 global_id = ceph_decode_64(p);
@@ -103,7 +104,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
103 state_seq = ceph_decode_64(p); 104 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr)); 105 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr); 106 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec); 107 ceph_decode_copy(p, &laggy_since, sizeof(laggy_since));
107 *p += sizeof(u32); 108 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad); 109 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen; 110 *p += namelen;
@@ -122,6 +123,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
122 m->m_info[mds].global_id = global_id; 123 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state; 124 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr; 125 m->m_info[mds].addr = addr;
126 m->m_info[mds].laggy =
127 (laggy_since.tv_sec != 0 ||
128 laggy_since.tv_nsec != 0);
125 m->m_info[mds].num_export_targets = num_export_targets; 129 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) { 130 if (num_export_targets) {
127 m->m_info[mds].export_targets = 131 m->m_info[mds].export_targets =
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
index eacc131aa5cb..4c5cb0880bba 100644
--- a/fs/ceph/mdsmap.h
+++ b/fs/ceph/mdsmap.h
@@ -13,6 +13,7 @@ struct ceph_mds_info {
13 struct ceph_entity_addr addr; 13 struct ceph_entity_addr addr;
14 s32 state; 14 s32 state;
15 int num_export_targets; 15 int num_export_targets;
16 bool laggy;
16 u32 *export_targets; 17 u32 *export_targets;
17}; 18};
18 19
@@ -47,6 +48,13 @@ static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
47 return m->m_info[w].state; 48 return m->m_info[w].state;
48} 49}
49 50
51static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
52{
53 if (w >= 0 && w < m->m_max_mds)
54 return m->m_info[w].laggy;
55 return false;
56}
57
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m); 58extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end); 59extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m); 60extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 15167b2daa55..2502d76fcec1 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -108,7 +108,7 @@ void ceph_msgr_exit(void)
108 destroy_workqueue(ceph_msgr_wq); 108 destroy_workqueue(ceph_msgr_wq);
109} 109}
110 110
111void ceph_msgr_flush() 111void ceph_msgr_flush(void)
112{ 112{
113 flush_workqueue(ceph_msgr_wq); 113 flush_workqueue(ceph_msgr_wq);
114} 114}
@@ -647,7 +647,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
648 con->connect_seq, global_seq, proto); 648 con->connect_seq, global_seq, proto);
649 649
650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT); 650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
653 con->out_connect.global_seq = cpu_to_le32(global_seq); 653 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1081,11 +1081,11 @@ static int process_banner(struct ceph_connection *con)
1081 sizeof(con->peer_addr)) != 0 && 1081 sizeof(con->peer_addr)) != 0 &&
1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) && 1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) { 1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1084 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n", 1084 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1085 pr_addr(&con->peer_addr.in_addr), 1085 pr_addr(&con->peer_addr.in_addr),
1086 le64_to_cpu(con->peer_addr.nonce), 1086 (int)le32_to_cpu(con->peer_addr.nonce),
1087 pr_addr(&con->actual_peer_addr.in_addr), 1087 pr_addr(&con->actual_peer_addr.in_addr),
1088 le64_to_cpu(con->actual_peer_addr.nonce)); 1088 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1089 con->error_msg = "wrong peer at address"; 1089 con->error_msg = "wrong peer at address";
1090 return -1; 1090 return -1;
1091 } 1091 }
@@ -1123,8 +1123,8 @@ static void fail_protocol(struct ceph_connection *con)
1123 1123
1124static int process_connect(struct ceph_connection *con) 1124static int process_connect(struct ceph_connection *con)
1125{ 1125{
1126 u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT; 1126 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1127 u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT; 1127 u64 req_feat = CEPH_FEATURE_REQUIRED;
1128 u64 server_feat = le64_to_cpu(con->in_reply.features); 1128 u64 server_feat = le64_to_cpu(con->in_reply.features);
1129 1129
1130 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 1130 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1302,8 +1302,8 @@ static void process_ack(struct ceph_connection *con)
1302 1302
1303 1303
1304static int read_partial_message_section(struct ceph_connection *con, 1304static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section, unsigned int sec_len, 1305 struct kvec *section,
1306 u32 *crc) 1306 unsigned int sec_len, u32 *crc)
1307{ 1307{
1308 int left; 1308 int left;
1309 int ret; 1309 int ret;
@@ -1434,7 +1434,8 @@ static int read_partial_message(struct ceph_connection *con)
1434 1434
1435 /* middle */ 1435 /* middle */
1436 if (m->middle) { 1436 if (m->middle) {
1437 ret = read_partial_message_section(con, &m->middle->vec, middle_len, 1437 ret = read_partial_message_section(con, &m->middle->vec,
1438 middle_len,
1438 &con->in_middle_crc); 1439 &con->in_middle_crc);
1439 if (ret <= 0) 1440 if (ret <= 0)
1440 return ret; 1441 return ret;
@@ -1920,7 +1921,7 @@ out:
1920 /* 1921 /*
1921 * in case we faulted due to authentication, invalidate our 1922 * in case we faulted due to authentication, invalidate our
1922 * current tickets so that we can get new ones. 1923 * current tickets so that we can get new ones.
1923 */ 1924 */
1924 if (con->auth_retry && con->ops->invalidate_authorizer) { 1925 if (con->auth_retry && con->ops->invalidate_authorizer) {
1925 dout("calling invalidate_authorizer()\n"); 1926 dout("calling invalidate_authorizer()\n");
1926 con->ops->invalidate_authorizer(con); 1927 con->ops->invalidate_authorizer(con);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 54fe01c50706..b2a5a3e4a671 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -349,7 +349,7 @@ out:
349} 349}
350 350
351/* 351/*
352 * statfs 352 * generic requests (e.g., statfs, poolop)
353 */ 353 */
354static struct ceph_mon_generic_request *__lookup_generic_req( 354static struct ceph_mon_generic_request *__lookup_generic_req(
355 struct ceph_mon_client *monc, u64 tid) 355 struct ceph_mon_client *monc, u64 tid)
@@ -442,6 +442,35 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
442 return m; 442 return m;
443} 443}
444 444
445static int do_generic_request(struct ceph_mon_client *monc,
446 struct ceph_mon_generic_request *req)
447{
448 int err;
449
450 /* register request */
451 mutex_lock(&monc->mutex);
452 req->tid = ++monc->last_tid;
453 req->request->hdr.tid = cpu_to_le64(req->tid);
454 __insert_generic_request(monc, req);
455 monc->num_generic_requests++;
456 ceph_con_send(monc->con, ceph_msg_get(req->request));
457 mutex_unlock(&monc->mutex);
458
459 err = wait_for_completion_interruptible(&req->completion);
460
461 mutex_lock(&monc->mutex);
462 rb_erase(&req->node, &monc->generic_request_tree);
463 monc->num_generic_requests--;
464 mutex_unlock(&monc->mutex);
465
466 if (!err)
467 err = req->result;
468 return err;
469}
470
471/*
472 * statfs
473 */
445static void handle_statfs_reply(struct ceph_mon_client *monc, 474static void handle_statfs_reply(struct ceph_mon_client *monc,
446 struct ceph_msg *msg) 475 struct ceph_msg *msg)
447{ 476{
@@ -468,7 +497,7 @@ static void handle_statfs_reply(struct ceph_mon_client *monc,
468 return; 497 return;
469 498
470bad: 499bad:
471 pr_err("corrupt generic reply, no tid\n"); 500 pr_err("corrupt generic reply, tid %llu\n", tid);
472 ceph_msg_dump(msg); 501 ceph_msg_dump(msg);
473} 502}
474 503
@@ -487,6 +516,7 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
487 516
488 kref_init(&req->kref); 517 kref_init(&req->kref);
489 req->buf = buf; 518 req->buf = buf;
519 req->buf_len = sizeof(*buf);
490 init_completion(&req->completion); 520 init_completion(&req->completion);
491 521
492 err = -ENOMEM; 522 err = -ENOMEM;
@@ -504,33 +534,134 @@ int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
504 h->monhdr.session_mon_tid = 0; 534 h->monhdr.session_mon_tid = 0;
505 h->fsid = monc->monmap->fsid; 535 h->fsid = monc->monmap->fsid;
506 536
507 /* register request */ 537 err = do_generic_request(monc, req);
508 mutex_lock(&monc->mutex);
509 req->tid = ++monc->last_tid;
510 req->request->hdr.tid = cpu_to_le64(req->tid);
511 __insert_generic_request(monc, req);
512 monc->num_generic_requests++;
513 mutex_unlock(&monc->mutex);
514 538
515 /* send request and wait */ 539out:
516 ceph_con_send(monc->con, ceph_msg_get(req->request)); 540 kref_put(&req->kref, release_generic_request);
517 err = wait_for_completion_interruptible(&req->completion); 541 return err;
542}
543
544/*
545 * pool ops
546 */
547static int get_poolop_reply_buf(const char *src, size_t src_len,
548 char *dst, size_t dst_len)
549{
550 u32 buf_len;
551
552 if (src_len != sizeof(u32) + dst_len)
553 return -EINVAL;
554
555 buf_len = le32_to_cpu(*(u32 *)src);
556 if (buf_len != dst_len)
557 return -EINVAL;
558
559 memcpy(dst, src + sizeof(u32), dst_len);
560 return 0;
561}
562
563static void handle_poolop_reply(struct ceph_mon_client *monc,
564 struct ceph_msg *msg)
565{
566 struct ceph_mon_generic_request *req;
567 struct ceph_mon_poolop_reply *reply = msg->front.iov_base;
568 u64 tid = le64_to_cpu(msg->hdr.tid);
569
570 if (msg->front.iov_len < sizeof(*reply))
571 goto bad;
572 dout("handle_poolop_reply %p tid %llu\n", msg, tid);
518 573
519 mutex_lock(&monc->mutex); 574 mutex_lock(&monc->mutex);
520 rb_erase(&req->node, &monc->generic_request_tree); 575 req = __lookup_generic_req(monc, tid);
521 monc->num_generic_requests--; 576 if (req) {
577 if (req->buf_len &&
578 get_poolop_reply_buf(msg->front.iov_base + sizeof(*reply),
579 msg->front.iov_len - sizeof(*reply),
580 req->buf, req->buf_len) < 0) {
581 mutex_unlock(&monc->mutex);
582 goto bad;
583 }
584 req->result = le32_to_cpu(reply->reply_code);
585 get_generic_request(req);
586 }
522 mutex_unlock(&monc->mutex); 587 mutex_unlock(&monc->mutex);
588 if (req) {
589 complete(&req->completion);
590 put_generic_request(req);
591 }
592 return;
523 593
524 if (!err) 594bad:
525 err = req->result; 595 pr_err("corrupt generic reply, tid %llu\n", tid);
596 ceph_msg_dump(msg);
597}
598
599/*
600 * Do a synchronous pool op.
601 */
602int ceph_monc_do_poolop(struct ceph_mon_client *monc, u32 op,
603 u32 pool, u64 snapid,
604 char *buf, int len)
605{
606 struct ceph_mon_generic_request *req;
607 struct ceph_mon_poolop *h;
608 int err;
609
610 req = kzalloc(sizeof(*req), GFP_NOFS);
611 if (!req)
612 return -ENOMEM;
613
614 kref_init(&req->kref);
615 req->buf = buf;
616 req->buf_len = len;
617 init_completion(&req->completion);
618
619 err = -ENOMEM;
620 req->request = ceph_msg_new(CEPH_MSG_POOLOP, sizeof(*h), GFP_NOFS);
621 if (!req->request)
622 goto out;
623 req->reply = ceph_msg_new(CEPH_MSG_POOLOP_REPLY, 1024, GFP_NOFS);
624 if (!req->reply)
625 goto out;
626
627 /* fill out request */
628 req->request->hdr.version = cpu_to_le16(2);
629 h = req->request->front.iov_base;
630 h->monhdr.have_version = 0;
631 h->monhdr.session_mon = cpu_to_le16(-1);
632 h->monhdr.session_mon_tid = 0;
633 h->fsid = monc->monmap->fsid;
634 h->pool = cpu_to_le32(pool);
635 h->op = cpu_to_le32(op);
636 h->auid = 0;
637 h->snapid = cpu_to_le64(snapid);
638 h->name_len = 0;
639
640 err = do_generic_request(monc, req);
526 641
527out: 642out:
528 kref_put(&req->kref, release_generic_request); 643 kref_put(&req->kref, release_generic_request);
529 return err; 644 return err;
530} 645}
531 646
647int ceph_monc_create_snapid(struct ceph_mon_client *monc,
648 u32 pool, u64 *snapid)
649{
650 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
651 pool, 0, (char *)snapid, sizeof(*snapid));
652
653}
654
655int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
656 u32 pool, u64 snapid)
657{
658 return ceph_monc_do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
659 pool, snapid, 0, 0);
660
661}
662
532/* 663/*
533 * Resend pending statfs requests. 664 * Resend pending generic requests.
534 */ 665 */
535static void __resend_generic_request(struct ceph_mon_client *monc) 666static void __resend_generic_request(struct ceph_mon_client *monc)
536{ 667{
@@ -783,6 +914,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
783 handle_statfs_reply(monc, msg); 914 handle_statfs_reply(monc, msg);
784 break; 915 break;
785 916
917 case CEPH_MSG_POOLOP_REPLY:
918 handle_poolop_reply(monc, msg);
919 break;
920
786 case CEPH_MSG_MON_MAP: 921 case CEPH_MSG_MON_MAP:
787 ceph_monc_handle_map(monc, msg); 922 ceph_monc_handle_map(monc, msg);
788 break; 923 break;
@@ -820,6 +955,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
820 case CEPH_MSG_MON_SUBSCRIBE_ACK: 955 case CEPH_MSG_MON_SUBSCRIBE_ACK:
821 m = ceph_msg_get(monc->m_subscribe_ack); 956 m = ceph_msg_get(monc->m_subscribe_ack);
822 break; 957 break;
958 case CEPH_MSG_POOLOP_REPLY:
823 case CEPH_MSG_STATFS_REPLY: 959 case CEPH_MSG_STATFS_REPLY:
824 return get_generic_reply(con, hdr, skip); 960 return get_generic_reply(con, hdr, skip);
825 case CEPH_MSG_AUTH_REPLY: 961 case CEPH_MSG_AUTH_REPLY:
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index 174d794321d0..8e396f2c0963 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -50,6 +50,7 @@ struct ceph_mon_generic_request {
50 struct rb_node node; 50 struct rb_node node;
51 int result; 51 int result;
52 void *buf; 52 void *buf;
53 int buf_len;
53 struct completion completion; 54 struct completion completion;
54 struct ceph_msg *request; /* original request */ 55 struct ceph_msg *request; /* original request */
55 struct ceph_msg *reply; /* and reply */ 56 struct ceph_msg *reply; /* and reply */
@@ -111,6 +112,10 @@ extern int ceph_monc_open_session(struct ceph_mon_client *monc);
111 112
112extern int ceph_monc_validate_auth(struct ceph_mon_client *monc); 113extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
113 114
115extern int ceph_monc_create_snapid(struct ceph_mon_client *monc,
116 u32 pool, u64 *snapid);
114 117
118extern int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
119 u32 pool, u64 snapid);
115 120
116#endif 121#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 892a0298dfdf..680d3d648cac 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -1,5 +1,5 @@
1#ifndef __MSGR_H 1#ifndef CEPH_MSGR_H
2#define __MSGR_H 2#define CEPH_MSGR_H
3 3
4/* 4/*
5 * Data types for message passing layer used by Ceph. 5 * Data types for message passing layer used by Ceph.
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index e38522347898..bed6391e52c7 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1276,8 +1276,6 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1276 1276
1277 /* it may be a short read due to an object boundary */ 1277 /* it may be a short read due to an object boundary */
1278 req->r_pages = pages; 1278 req->r_pages = pages;
1279 num_pages = calc_pages_for(off, *plen);
1280 req->r_num_pages = num_pages;
1281 1279
1282 dout("readpages final extent is %llu~%llu (%d pages)\n", 1280 dout("readpages final extent is %llu~%llu (%d pages)\n",
1283 off, *plen, req->r_num_pages); 1281 off, *plen, req->r_num_pages);
@@ -1319,7 +1317,6 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1319 1317
1320 /* it may be a short write due to an object boundary */ 1318 /* it may be a short write due to an object boundary */
1321 req->r_pages = pages; 1319 req->r_pages = pages;
1322 req->r_num_pages = calc_pages_for(off, len);
1323 dout("writepages %llu~%llu (%d pages)\n", off, len, 1320 dout("writepages %llu~%llu (%d pages)\n", off, len,
1324 req->r_num_pages); 1321 req->r_num_pages);
1325 1322
@@ -1476,8 +1473,8 @@ static void put_osd_con(struct ceph_connection *con)
1476 * authentication 1473 * authentication
1477 */ 1474 */
1478static int get_authorizer(struct ceph_connection *con, 1475static int get_authorizer(struct ceph_connection *con,
1479 void **buf, int *len, int *proto, 1476 void **buf, int *len, int *proto,
1480 void **reply_buf, int *reply_len, int force_new) 1477 void **reply_buf, int *reply_len, int force_new)
1481{ 1478{
1482 struct ceph_osd *o = con->private; 1479 struct ceph_osd *o = con->private;
1483 struct ceph_osd_client *osdc = o->o_osdc; 1480 struct ceph_osd_client *osdc = o->o_osdc;
@@ -1497,7 +1494,7 @@ static int get_authorizer(struct ceph_connection *con,
1497 &o->o_authorizer_reply_buf, 1494 &o->o_authorizer_reply_buf,
1498 &o->o_authorizer_reply_buf_len); 1495 &o->o_authorizer_reply_buf_len);
1499 if (ret) 1496 if (ret)
1500 return ret; 1497 return ret;
1501 } 1498 }
1502 1499
1503 *proto = ac->protocol; 1500 *proto = ac->protocol;
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 416d46adbf87..e31f118f1392 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -424,12 +424,30 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
424 kfree(pi); 424 kfree(pi);
425} 425}
426 426
427void __decode_pool(void **p, struct ceph_pg_pool_info *pi) 427static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
428{ 428{
429 unsigned n, m;
430
429 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 431 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
430 calc_pg_masks(pi); 432 calc_pg_masks(pi);
431 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64); 433
434 /* num_snaps * snap_info_t */
435 n = le32_to_cpu(pi->v.num_snaps);
436 while (n--) {
437 ceph_decode_need(p, end, sizeof(u64) + 1 + sizeof(u64) +
438 sizeof(struct ceph_timespec), bad);
439 *p += sizeof(u64) + /* key */
440 1 + sizeof(u64) + /* u8, snapid */
441 sizeof(struct ceph_timespec);
442 m = ceph_decode_32(p); /* snap name */
443 *p += m;
444 }
445
432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; 446 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
447 return 0;
448
449bad:
450 return -EINVAL;
433} 451}
434 452
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 453static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
@@ -571,7 +589,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
571 kfree(pi); 589 kfree(pi);
572 goto bad; 590 goto bad;
573 } 591 }
574 __decode_pool(p, pi); 592 err = __decode_pool(p, end, pi);
593 if (err < 0)
594 goto bad;
575 __insert_pg_pool(&map->pg_pools, pi); 595 __insert_pg_pool(&map->pg_pools, pi);
576 } 596 }
577 597
@@ -760,7 +780,9 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
760 pi->id = pool; 780 pi->id = pool;
761 __insert_pg_pool(&map->pg_pools, pi); 781 __insert_pg_pool(&map->pg_pools, pi);
762 } 782 }
763 __decode_pool(p, pi); 783 err = __decode_pool(p, end, pi);
784 if (err < 0)
785 goto bad;
764 } 786 }
765 if (version >= 5 && __decode_pool_names(p, end, map) < 0) 787 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
766 goto bad; 788 goto bad;
@@ -833,7 +855,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
833 node)->pgid, pgid) <= 0) { 855 node)->pgid, pgid) <= 0) {
834 struct ceph_pg_mapping *cur = 856 struct ceph_pg_mapping *cur =
835 rb_entry(rbp, struct ceph_pg_mapping, node); 857 rb_entry(rbp, struct ceph_pg_mapping, node);
836 858
837 rbp = rb_next(rbp); 859 rbp = rb_next(rbp);
838 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid); 860 dout(" removed pg_temp %llx\n", *(u64 *)&cur->pgid);
839 rb_erase(&cur->node, &map->pg_temp); 861 rb_erase(&cur->node, &map->pg_temp);
@@ -1026,8 +1048,9 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1026 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset, 1048 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
1027 pool->v.type, pool->v.size); 1049 pool->v.type, pool->v.size);
1028 if (ruleno < 0) { 1050 if (ruleno < 0) {
1029 pr_err("no crush rule pool %d type %d size %d\n", 1051 pr_err("no crush rule pool %d ruleset %d type %d size %d\n",
1030 poolid, pool->v.type, pool->v.size); 1052 poolid, pool->v.crush_ruleset, pool->v.type,
1053 pool->v.size);
1031 return NULL; 1054 return NULL;
1032 } 1055 }
1033 1056
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 8fcc023056c7..6d5247f2e81b 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -1,5 +1,5 @@
1#ifndef __RADOS_H 1#ifndef CEPH_RADOS_H
2#define __RADOS_H 2#define CEPH_RADOS_H
3 3
4/* 4/*
5 * Data types for the Ceph distributed object storage layer RADOS 5 * Data types for the Ceph distributed object storage layer RADOS
@@ -203,6 +203,7 @@ enum {
203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12, 203 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
204 204
205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13, 205 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
206 CEPH_OSD_OP_ROLLBACK= CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 14,
206 207
207 /** attrs **/ 208 /** attrs **/
208 /* read */ 209 /* read */
@@ -272,6 +273,10 @@ static inline int ceph_osd_op_mode_modify(int op)
272 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR; 273 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
273} 274}
274 275
276/*
277 * note that the following tmap stuff is also defined in the ceph librados.h
278 * any modification here needs to be updated there
279 */
275#define CEPH_OSD_TMAP_HDR 'h' 280#define CEPH_OSD_TMAP_HDR 'h'
276#define CEPH_OSD_TMAP_SET 's' 281#define CEPH_OSD_TMAP_SET 's'
277#define CEPH_OSD_TMAP_RM 'r' 282#define CEPH_OSD_TMAP_RM 'r'
@@ -297,6 +302,7 @@ enum {
297 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */ 302 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
298 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */ 303 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
299 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */ 304 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
305 CEPH_OSD_FLAG_EXEC_PUBLIC = 4096, /* op may exec (public) */
300}; 306};
301 307
302enum { 308enum {
@@ -350,6 +356,9 @@ struct ceph_osd_op {
350 struct { 356 struct {
351 __le64 cookie, count; 357 __le64 cookie, count;
352 } __attribute__ ((packed)) pgls; 358 } __attribute__ ((packed)) pgls;
359 struct {
360 __le64 snapid;
361 } __attribute__ ((packed)) snap;
353 }; 362 };
354 __le32 payload_len; 363 __le32 payload_len;
355} __attribute__ ((packed)); 364} __attribute__ ((packed));
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index fa87f51e38e1..9922628532b2 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -2,6 +2,7 @@
2#include "ceph_debug.h" 2#include "ceph_debug.h"
3 3
4#include <linux/backing-dev.h> 4#include <linux/backing-dev.h>
5#include <linux/ctype.h>
5#include <linux/fs.h> 6#include <linux/fs.h>
6#include <linux/inet.h> 7#include <linux/inet.h>
7#include <linux/in6.h> 8#include <linux/in6.h>
@@ -101,12 +102,21 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
101} 102}
102 103
103 104
104static int ceph_syncfs(struct super_block *sb, int wait) 105static int ceph_sync_fs(struct super_block *sb, int wait)
105{ 106{
106 dout("sync_fs %d\n", wait); 107 struct ceph_client *client = ceph_sb_to_client(sb);
108
109 if (!wait) {
110 dout("sync_fs (non-blocking)\n");
111 ceph_flush_dirty_caps(&client->mdsc);
112 dout("sync_fs (non-blocking) done\n");
113 return 0;
114 }
115
116 dout("sync_fs (blocking)\n");
107 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); 117 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
108 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); 118 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
109 dout("sync_fs %d done\n", wait); 119 dout("sync_fs (blocking) done\n");
110 return 0; 120 return 0;
111} 121}
112 122
@@ -150,9 +160,7 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
150 struct ceph_mount_args *args = client->mount_args; 160 struct ceph_mount_args *args = client->mount_args;
151 161
152 if (args->flags & CEPH_OPT_FSID) 162 if (args->flags & CEPH_OPT_FSID)
153 seq_printf(m, ",fsidmajor=%llu,fsidminor%llu", 163 seq_printf(m, ",fsid=%pU", &args->fsid);
154 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
155 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
156 if (args->flags & CEPH_OPT_NOSHARE) 164 if (args->flags & CEPH_OPT_NOSHARE)
157 seq_puts(m, ",noshare"); 165 seq_puts(m, ",noshare");
158 if (args->flags & CEPH_OPT_DIRSTAT) 166 if (args->flags & CEPH_OPT_DIRSTAT)
@@ -279,7 +287,7 @@ static const struct super_operations ceph_super_ops = {
279 .alloc_inode = ceph_alloc_inode, 287 .alloc_inode = ceph_alloc_inode,
280 .destroy_inode = ceph_destroy_inode, 288 .destroy_inode = ceph_destroy_inode,
281 .write_inode = ceph_write_inode, 289 .write_inode = ceph_write_inode,
282 .sync_fs = ceph_syncfs, 290 .sync_fs = ceph_sync_fs,
283 .put_super = ceph_put_super, 291 .put_super = ceph_put_super,
284 .show_options = ceph_show_options, 292 .show_options = ceph_show_options,
285 .statfs = ceph_statfs, 293 .statfs = ceph_statfs,
@@ -322,9 +330,6 @@ const char *ceph_msg_type_name(int type)
322 * mount options 330 * mount options
323 */ 331 */
324enum { 332enum {
325 Opt_fsidmajor,
326 Opt_fsidminor,
327 Opt_monport,
328 Opt_wsize, 333 Opt_wsize,
329 Opt_rsize, 334 Opt_rsize,
330 Opt_osdtimeout, 335 Opt_osdtimeout,
@@ -339,6 +344,7 @@ enum {
339 Opt_congestion_kb, 344 Opt_congestion_kb,
340 Opt_last_int, 345 Opt_last_int,
341 /* int args above */ 346 /* int args above */
347 Opt_fsid,
342 Opt_snapdirname, 348 Opt_snapdirname,
343 Opt_name, 349 Opt_name,
344 Opt_secret, 350 Opt_secret,
@@ -355,9 +361,6 @@ enum {
355}; 361};
356 362
357static match_table_t arg_tokens = { 363static match_table_t arg_tokens = {
358 {Opt_fsidmajor, "fsidmajor=%ld"},
359 {Opt_fsidminor, "fsidminor=%ld"},
360 {Opt_monport, "monport=%d"},
361 {Opt_wsize, "wsize=%d"}, 364 {Opt_wsize, "wsize=%d"},
362 {Opt_rsize, "rsize=%d"}, 365 {Opt_rsize, "rsize=%d"},
363 {Opt_osdtimeout, "osdtimeout=%d"}, 366 {Opt_osdtimeout, "osdtimeout=%d"},
@@ -371,6 +374,7 @@ static match_table_t arg_tokens = {
371 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, 374 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
372 {Opt_congestion_kb, "write_congestion_kb=%d"}, 375 {Opt_congestion_kb, "write_congestion_kb=%d"},
373 /* int args above */ 376 /* int args above */
377 {Opt_fsid, "fsid=%s"},
374 {Opt_snapdirname, "snapdirname=%s"}, 378 {Opt_snapdirname, "snapdirname=%s"},
375 {Opt_name, "name=%s"}, 379 {Opt_name, "name=%s"},
376 {Opt_secret, "secret=%s"}, 380 {Opt_secret, "secret=%s"},
@@ -386,6 +390,36 @@ static match_table_t arg_tokens = {
386 {-1, NULL} 390 {-1, NULL}
387}; 391};
388 392
393static int parse_fsid(const char *str, struct ceph_fsid *fsid)
394{
395 int i = 0;
396 char tmp[3];
397 int err = -EINVAL;
398 int d;
399
400 dout("parse_fsid '%s'\n", str);
401 tmp[2] = 0;
402 while (*str && i < 16) {
403 if (ispunct(*str)) {
404 str++;
405 continue;
406 }
407 if (!isxdigit(str[0]) || !isxdigit(str[1]))
408 break;
409 tmp[0] = str[0];
410 tmp[1] = str[1];
411 if (sscanf(tmp, "%x", &d) < 1)
412 break;
413 fsid->fsid[i] = d & 0xff;
414 i++;
415 str += 2;
416 }
417
418 if (i == 16)
419 err = 0;
420 dout("parse_fsid ret %d got fsid %pU", err, fsid);
421 return err;
422}
389 423
390static struct ceph_mount_args *parse_mount_args(int flags, char *options, 424static struct ceph_mount_args *parse_mount_args(int flags, char *options,
391 const char *dev_name, 425 const char *dev_name,
@@ -469,12 +503,6 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
469 dout("got token %d\n", token); 503 dout("got token %d\n", token);
470 } 504 }
471 switch (token) { 505 switch (token) {
472 case Opt_fsidmajor:
473 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
474 break;
475 case Opt_fsidminor:
476 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
477 break;
478 case Opt_ip: 506 case Opt_ip:
479 err = ceph_parse_ips(argstr[0].from, 507 err = ceph_parse_ips(argstr[0].from,
480 argstr[0].to, 508 argstr[0].to,
@@ -485,6 +513,11 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
485 args->flags |= CEPH_OPT_MYIP; 513 args->flags |= CEPH_OPT_MYIP;
486 break; 514 break;
487 515
516 case Opt_fsid:
517 err = parse_fsid(argstr[0].from, &args->fsid);
518 if (err == 0)
519 args->flags |= CEPH_OPT_FSID;
520 break;
488 case Opt_snapdirname: 521 case Opt_snapdirname:
489 kfree(args->snapdir_name); 522 kfree(args->snapdir_name);
490 args->snapdir_name = kstrndup(argstr[0].from, 523 args->snapdir_name = kstrndup(argstr[0].from,
@@ -515,6 +548,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
515 case Opt_osdkeepalivetimeout: 548 case Opt_osdkeepalivetimeout:
516 args->osd_keepalive_timeout = intval; 549 args->osd_keepalive_timeout = intval;
517 break; 550 break;
551 case Opt_osd_idle_ttl:
552 args->osd_idle_ttl = intval;
553 break;
518 case Opt_mount_timeout: 554 case Opt_mount_timeout:
519 args->mount_timeout = intval; 555 args->mount_timeout = intval;
520 break; 556 break;
@@ -630,7 +666,6 @@ static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
630 666
631 /* caps */ 667 /* caps */
632 client->min_caps = args->max_readdir; 668 client->min_caps = args->max_readdir;
633 ceph_adjust_min_caps(client->min_caps);
634 669
635 /* subsystems */ 670 /* subsystems */
636 err = ceph_monc_init(&client->monc, client); 671 err = ceph_monc_init(&client->monc, client);
@@ -680,8 +715,6 @@ static void ceph_destroy_client(struct ceph_client *client)
680 715
681 ceph_monc_stop(&client->monc); 716 ceph_monc_stop(&client->monc);
682 717
683 ceph_adjust_min_caps(-client->min_caps);
684
685 ceph_debugfs_client_cleanup(client); 718 ceph_debugfs_client_cleanup(client);
686 destroy_workqueue(client->wb_wq); 719 destroy_workqueue(client->wb_wq);
687 destroy_workqueue(client->pg_inv_wq); 720 destroy_workqueue(client->pg_inv_wq);
@@ -706,13 +739,13 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
706{ 739{
707 if (client->have_fsid) { 740 if (client->have_fsid) {
708 if (ceph_fsid_compare(&client->fsid, fsid)) { 741 if (ceph_fsid_compare(&client->fsid, fsid)) {
709 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT, 742 pr_err("bad fsid, had %pU got %pU",
710 PR_FSID(&client->fsid), PR_FSID(fsid)); 743 &client->fsid, fsid);
711 return -1; 744 return -1;
712 } 745 }
713 } else { 746 } else {
714 pr_info("client%lld fsid " FSID_FORMAT "\n", 747 pr_info("client%lld fsid %pU\n", client->monc.auth->global_id,
715 client->monc.auth->global_id, PR_FSID(fsid)); 748 fsid);
716 memcpy(&client->fsid, fsid, sizeof(*fsid)); 749 memcpy(&client->fsid, fsid, sizeof(*fsid));
717 ceph_debugfs_client_init(client); 750 ceph_debugfs_client_init(client);
718 client->have_fsid = true; 751 client->have_fsid = true;
@@ -1043,8 +1076,6 @@ static int __init init_ceph(void)
1043 if (ret) 1076 if (ret)
1044 goto out_msgr; 1077 goto out_msgr;
1045 1078
1046 ceph_caps_init();
1047
1048 ret = register_filesystem(&ceph_fs_type); 1079 ret = register_filesystem(&ceph_fs_type);
1049 if (ret) 1080 if (ret)
1050 goto out_icache; 1081 goto out_icache;
@@ -1069,7 +1100,6 @@ static void __exit exit_ceph(void)
1069{ 1100{
1070 dout("exit_ceph\n"); 1101 dout("exit_ceph\n");
1071 unregister_filesystem(&ceph_fs_type); 1102 unregister_filesystem(&ceph_fs_type);
1072 ceph_caps_finalize();
1073 destroy_caches(); 1103 destroy_caches();
1074 ceph_msgr_exit(); 1104 ceph_msgr_exit();
1075 ceph_debugfs_cleanup(); 1105 ceph_debugfs_cleanup();
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 10a4a406e887..2482d696f0de 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -31,6 +31,12 @@
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32 32
33/* 33/*
34 * Supported features
35 */
36#define CEPH_FEATURE_SUPPORTED (CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK) /* parenthesized: keeps the OR intact when used with &, ~ or comparisons */
37#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR
38
39/*
34 * mount options 40 * mount options
35 */ 41 */
36#define CEPH_OPT_FSID (1<<0) 42#define CEPH_OPT_FSID (1<<0)
@@ -560,11 +566,13 @@ static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
560/* what the mds thinks we want */ 566/* what the mds thinks we want */
561extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci); 567extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
562 568
563extern void ceph_caps_init(void); 569extern void ceph_caps_init(struct ceph_mds_client *mdsc);
564extern void ceph_caps_finalize(void); 570extern void ceph_caps_finalize(struct ceph_mds_client *mdsc);
565extern void ceph_adjust_min_caps(int delta); 571extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta);
566extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need); 572extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
567extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx); 573 struct ceph_cap_reservation *ctx, int need);
574extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
575 struct ceph_cap_reservation *ctx);
568extern void ceph_reservation_status(struct ceph_client *client, 576extern void ceph_reservation_status(struct ceph_client *client,
569 int *total, int *avail, int *used, 577 int *total, int *avail, int *used,
570 int *reserved, int *min); 578 int *reserved, int *min);
@@ -738,13 +746,6 @@ extern struct kmem_cache *ceph_file_cachep;
738extern const char *ceph_msg_type_name(int type); 746extern const char *ceph_msg_type_name(int type);
739extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); 747extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
740 748
741#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
742 "%02x%02x%02x%02x%02x%02x"
743#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
744 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
745 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
746 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
747
748/* inode.c */ 749/* inode.c */
749extern const struct inode_operations ceph_file_iops; 750extern const struct inode_operations ceph_file_iops;
750 751
@@ -806,13 +807,16 @@ static inline void ceph_remove_cap(struct ceph_cap *cap)
806 __ceph_remove_cap(cap); 807 __ceph_remove_cap(cap);
807 spin_unlock(&inode->i_lock); 808 spin_unlock(&inode->i_lock);
808} 809}
809extern void ceph_put_cap(struct ceph_cap *cap); 810extern void ceph_put_cap(struct ceph_mds_client *mdsc,
811 struct ceph_cap *cap);
810 812
811extern void ceph_queue_caps_release(struct inode *inode); 813extern void ceph_queue_caps_release(struct inode *inode);
812extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); 814extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
813extern int ceph_fsync(struct file *file, int datasync); 815extern int ceph_fsync(struct file *file, int datasync);
814extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 816extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
815 struct ceph_mds_session *session); 817 struct ceph_mds_session *session);
818extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
819 int mds);
816extern int ceph_get_cap_mds(struct inode *inode); 820extern int ceph_get_cap_mds(struct inode *inode);
817extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); 821extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
818extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); 822extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
@@ -857,7 +861,7 @@ extern void ceph_release_page_vector(struct page **pages, int num_pages);
857/* dir.c */ 861/* dir.c */
858extern const struct file_operations ceph_dir_fops; 862extern const struct file_operations ceph_dir_fops;
859extern const struct inode_operations ceph_dir_iops; 863extern const struct inode_operations ceph_dir_iops;
860extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 864extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
861 ceph_snapdir_dentry_ops; 865 ceph_snapdir_dentry_ops;
862 866
863extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 867extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
@@ -888,6 +892,14 @@ extern void ceph_debugfs_cleanup(void);
888extern int ceph_debugfs_client_init(struct ceph_client *client); 892extern int ceph_debugfs_client_init(struct ceph_client *client);
889extern void ceph_debugfs_client_cleanup(struct ceph_client *client); 893extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
890 894
895/* locks.c */
896extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
897extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
898extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num);
899extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p,
900 int p_locks, int f_locks);
901extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c);
902
891static inline struct inode *get_dentry_parent_inode(struct dentry *dentry) 903static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
892{ 904{
893 if (dentry && dentry->d_parent) 905 if (dentry && dentry->d_parent)
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 68aeebc69681..097a2654c00f 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -337,6 +337,8 @@ void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
337} 337}
338 338
339static int __build_xattrs(struct inode *inode) 339static int __build_xattrs(struct inode *inode)
340 __releases(inode->i_lock)
341 __acquires(inode->i_lock)
340{ 342{
341 u32 namelen; 343 u32 namelen;
342 u32 numattr = 0; 344 u32 numattr = 0;