aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2015-09-11 15:33:03 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2015-09-11 15:33:03 -0400
commit e013f74b60bbd37ee8c3a55214eb351ea3101c15 (patch)
tree 096b59f550dea6df9347edf97b872dc75a79f653
parent 01cab5549c3e9a0fe7248fc5ad0fd79361cc0d39 (diff)
parent 438386853d4c0c48fe73bf05a7d61c70ca5a3bfb (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph update from Sage Weil:
 "There are a few fixes for snapshot behavior with CephFS and support
  for the new keepalive protocol from Zheng, a libceph fix that affects
  both RBD and CephFS, a few bug fixes and cleanups for RBD from Ilya,
  and several small fixes and cleanups from Jianpeng and others"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  ceph: improve readahead for file holes
  ceph: get inode size for each append write
  libceph: check data_len in ->alloc_msg()
  libceph: use keepalive2 to verify the mon session is alive
  rbd: plug rbd_dev->header.object_prefix memory leak
  rbd: fix double free on rbd_dev->header_name
  libceph: set 'exists' flag for newly up osd
  ceph: cleanup use of ceph_msg_get
  ceph: no need to get parent inode in ceph_open
  ceph: remove the useless judgement
  ceph: remove redundant test of head->safe and silence static analysis warnings
  ceph: fix queuing inode to mdsdir's snaprealm
  libceph: rename con_work() to ceph_con_workfn()
  libceph: Avoid holding the zero page on ceph_msgr_slab_init errors
  libceph: remove the unused macro AES_KEY_SIZE
  ceph: invalidate dirty pages after forced umount
  ceph: EIO all operations after forced umount
-rw-r--r-- drivers/block/rbd.c 6
-rw-r--r-- fs/ceph/addr.c 6
-rw-r--r-- fs/ceph/caps.c 8
-rw-r--r-- fs/ceph/file.c 14
-rw-r--r-- fs/ceph/mds_client.c 59
-rw-r--r-- fs/ceph/mds_client.h 1
-rw-r--r-- fs/ceph/snap.c 7
-rw-r--r-- fs/ceph/super.c 1
-rw-r--r-- include/linux/ceph/libceph.h 2
-rw-r--r-- include/linux/ceph/messenger.h 4
-rw-r--r-- include/linux/ceph/msgr.h 4
-rw-r--r-- net/ceph/ceph_common.c 1
-rw-r--r-- net/ceph/crypto.c 4
-rw-r--r-- net/ceph/messenger.c 82
-rw-r--r-- net/ceph/mon_client.c 37
-rw-r--r-- net/ceph/osd_client.c 51
-rw-r--r-- net/ceph/osdmap.c 2
17 files changed, 191 insertions, 98 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 698f761037ce..d93a0372b37b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -4673,7 +4673,10 @@ static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
4673 } 4673 }
4674 4674
4675 ret = rbd_dev_v2_snap_context(rbd_dev); 4675 ret = rbd_dev_v2_snap_context(rbd_dev);
4676 dout("rbd_dev_v2_snap_context returned %d\n", ret); 4676 if (ret && first_time) {
4677 kfree(rbd_dev->header.object_prefix);
4678 rbd_dev->header.object_prefix = NULL;
4679 }
4677 4680
4678 return ret; 4681 return ret;
4679} 4682}
@@ -5154,7 +5157,6 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
5154out_err: 5157out_err:
5155 if (parent) { 5158 if (parent) {
5156 rbd_dev_unparent(rbd_dev); 5159 rbd_dev_unparent(rbd_dev);
5157 kfree(rbd_dev->header_name);
5158 rbd_dev_destroy(parent); 5160 rbd_dev_destroy(parent);
5159 } else { 5161 } else {
5160 rbd_put_client(rbdc); 5162 rbd_put_client(rbdc);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a268abfe60ac..9d23e788d1df 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
276 for (i = 0; i < num_pages; i++) { 276 for (i = 0; i < num_pages; i++) {
277 struct page *page = osd_data->pages[i]; 277 struct page *page = osd_data->pages[i];
278 278
279 if (rc < 0) 279 if (rc < 0 && rc != ENOENT)
280 goto unlock; 280 goto unlock;
281 if (bytes < (int)PAGE_CACHE_SIZE) { 281 if (bytes < (int)PAGE_CACHE_SIZE) {
282 /* zero (remainder of) page */ 282 /* zero (remainder of) page */
@@ -717,8 +717,10 @@ static int ceph_writepages_start(struct address_space *mapping,
717 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 717 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
718 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 718 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
719 719
720 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { 720 if (ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
721 pr_warn("writepage_start %p on forced umount\n", inode); 721 pr_warn("writepage_start %p on forced umount\n", inode);
722 truncate_pagecache(inode, 0);
723 mapping_set_error(mapping, -EIO);
722 return -EIO; /* we're in a forced umount, don't write! */ 724 return -EIO; /* we're in a forced umount, don't write! */
723 } 725 }
724 if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) 726 if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ddd5e9471290..27b566874bc1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2413,6 +2413,14 @@ again:
2413 goto out_unlock; 2413 goto out_unlock;
2414 } 2414 }
2415 2415
2416 if (!__ceph_is_any_caps(ci) &&
2417 ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
2418 dout("get_cap_refs %p forced umount\n", inode);
2419 *err = -EIO;
2420 ret = 1;
2421 goto out_unlock;
2422 }
2423
2416 dout("get_cap_refs %p have %s needed %s\n", inode, 2424 dout("get_cap_refs %p have %s needed %s\n", inode,
2417 ceph_cap_string(have), ceph_cap_string(need)); 2425 ceph_cap_string(have), ceph_cap_string(need));
2418 } 2426 }
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 8b79d87eaf46..0c62868b5c56 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -136,7 +136,6 @@ int ceph_open(struct inode *inode, struct file *file)
136 struct ceph_mds_client *mdsc = fsc->mdsc; 136 struct ceph_mds_client *mdsc = fsc->mdsc;
137 struct ceph_mds_request *req; 137 struct ceph_mds_request *req;
138 struct ceph_file_info *cf = file->private_data; 138 struct ceph_file_info *cf = file->private_data;
139 struct inode *parent_inode = NULL;
140 int err; 139 int err;
141 int flags, fmode, wanted; 140 int flags, fmode, wanted;
142 141
@@ -210,10 +209,7 @@ int ceph_open(struct inode *inode, struct file *file)
210 ihold(inode); 209 ihold(inode);
211 210
212 req->r_num_caps = 1; 211 req->r_num_caps = 1;
213 if (flags & O_CREAT) 212 err = ceph_mdsc_do_request(mdsc, NULL, req);
214 parent_inode = ceph_get_dentry_parent_inode(file->f_path.dentry);
215 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
216 iput(parent_inode);
217 if (!err) 213 if (!err)
218 err = ceph_init_file(inode, file, req->r_fmode); 214 err = ceph_init_file(inode, file, req->r_fmode);
219 ceph_mdsc_put_request(req); 215 ceph_mdsc_put_request(req);
@@ -279,7 +275,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
279 if (err) 275 if (err)
280 goto out_req; 276 goto out_req;
281 277
282 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 278 if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
283 err = ceph_handle_notrace_create(dir, dentry); 279 err = ceph_handle_notrace_create(dir, dentry);
284 280
285 if (d_unhashed(dentry)) { 281 if (d_unhashed(dentry)) {
@@ -956,6 +952,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
956 /* We can write back this queue in page reclaim */ 952 /* We can write back this queue in page reclaim */
957 current->backing_dev_info = inode_to_bdi(inode); 953 current->backing_dev_info = inode_to_bdi(inode);
958 954
955 if (iocb->ki_flags & IOCB_APPEND) {
956 err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
957 if (err < 0)
958 goto out;
959 }
960
959 err = generic_write_checks(iocb, from); 961 err = generic_write_checks(iocb, from);
960 if (err <= 0) 962 if (err <= 0)
961 goto out; 963 goto out;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 6aa07af67603..51cb02da75d9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2107,7 +2107,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
2107 msg = create_request_message(mdsc, req, mds, drop_cap_releases); 2107 msg = create_request_message(mdsc, req, mds, drop_cap_releases);
2108 if (IS_ERR(msg)) { 2108 if (IS_ERR(msg)) {
2109 req->r_err = PTR_ERR(msg); 2109 req->r_err = PTR_ERR(msg);
2110 complete_request(mdsc, req);
2111 return PTR_ERR(msg); 2110 return PTR_ERR(msg);
2112 } 2111 }
2113 req->r_request = msg; 2112 req->r_request = msg;
@@ -2135,7 +2134,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
2135{ 2134{
2136 struct ceph_mds_session *session = NULL; 2135 struct ceph_mds_session *session = NULL;
2137 int mds = -1; 2136 int mds = -1;
2138 int err = -EAGAIN; 2137 int err = 0;
2139 2138
2140 if (req->r_err || req->r_got_result) { 2139 if (req->r_err || req->r_got_result) {
2141 if (req->r_aborted) 2140 if (req->r_aborted)
@@ -2149,6 +2148,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
2149 err = -EIO; 2148 err = -EIO;
2150 goto finish; 2149 goto finish;
2151 } 2150 }
2151 if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
2152 dout("do_request forced umount\n");
2153 err = -EIO;
2154 goto finish;
2155 }
2152 2156
2153 put_request_session(req); 2157 put_request_session(req);
2154 2158
@@ -2196,13 +2200,15 @@ static int __do_request(struct ceph_mds_client *mdsc,
2196 2200
2197out_session: 2201out_session:
2198 ceph_put_mds_session(session); 2202 ceph_put_mds_session(session);
2203finish:
2204 if (err) {
2205 dout("__do_request early error %d\n", err);
2206 req->r_err = err;
2207 complete_request(mdsc, req);
2208 __unregister_request(mdsc, req);
2209 }
2199out: 2210out:
2200 return err; 2211 return err;
2201
2202finish:
2203 req->r_err = err;
2204 complete_request(mdsc, req);
2205 goto out;
2206} 2212}
2207 2213
2208/* 2214/*
@@ -2289,8 +2295,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
2289 2295
2290 if (req->r_err) { 2296 if (req->r_err) {
2291 err = req->r_err; 2297 err = req->r_err;
2292 __unregister_request(mdsc, req);
2293 dout("do_request early error %d\n", err);
2294 goto out; 2298 goto out;
2295 } 2299 }
2296 2300
@@ -2411,7 +2415,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2411 mutex_unlock(&mdsc->mutex); 2415 mutex_unlock(&mdsc->mutex);
2412 goto out; 2416 goto out;
2413 } 2417 }
2414 if (req->r_got_safe && !head->safe) { 2418 if (req->r_got_safe) {
2415 pr_warn("got unsafe after safe on %llu from mds%d\n", 2419 pr_warn("got unsafe after safe on %llu from mds%d\n",
2416 tid, mds); 2420 tid, mds);
2417 mutex_unlock(&mdsc->mutex); 2421 mutex_unlock(&mdsc->mutex);
@@ -2520,8 +2524,7 @@ out_err:
2520 if (err) { 2524 if (err) {
2521 req->r_err = err; 2525 req->r_err = err;
2522 } else { 2526 } else {
2523 req->r_reply = msg; 2527 req->r_reply = ceph_msg_get(msg);
2524 ceph_msg_get(msg);
2525 req->r_got_result = true; 2528 req->r_got_result = true;
2526 } 2529 }
2527 } else { 2530 } else {
@@ -3555,7 +3558,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3555{ 3558{
3556 u64 want_tid, want_flush, want_snap; 3559 u64 want_tid, want_flush, want_snap;
3557 3560
3558 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) 3561 if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
3559 return; 3562 return;
3560 3563
3561 dout("sync\n"); 3564 dout("sync\n");
@@ -3584,7 +3587,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3584 */ 3587 */
3585static bool done_closing_sessions(struct ceph_mds_client *mdsc) 3588static bool done_closing_sessions(struct ceph_mds_client *mdsc)
3586{ 3589{
3587 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) 3590 if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
3588 return true; 3591 return true;
3589 return atomic_read(&mdsc->num_sessions) == 0; 3592 return atomic_read(&mdsc->num_sessions) == 0;
3590} 3593}
@@ -3643,6 +3646,34 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3643 dout("stopped\n"); 3646 dout("stopped\n");
3644} 3647}
3645 3648
3649void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
3650{
3651 struct ceph_mds_session *session;
3652 int mds;
3653
3654 dout("force umount\n");
3655
3656 mutex_lock(&mdsc->mutex);
3657 for (mds = 0; mds < mdsc->max_sessions; mds++) {
3658 session = __ceph_lookup_mds_session(mdsc, mds);
3659 if (!session)
3660 continue;
3661 mutex_unlock(&mdsc->mutex);
3662 mutex_lock(&session->s_mutex);
3663 __close_session(mdsc, session);
3664 if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
3665 cleanup_session_requests(mdsc, session);
3666 remove_session_caps(session);
3667 }
3668 mutex_unlock(&session->s_mutex);
3669 ceph_put_mds_session(session);
3670 mutex_lock(&mdsc->mutex);
3671 kick_requests(mdsc, mds);
3672 }
3673 __wake_requests(mdsc, &mdsc->waiting_for_map);
3674 mutex_unlock(&mdsc->mutex);
3675}
3676
3646static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) 3677static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3647{ 3678{
3648 dout("stop\n"); 3679 dout("stop\n");
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 762757e6cebf..f575eafe2261 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -366,6 +366,7 @@ extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
366 366
367extern int ceph_mdsc_init(struct ceph_fs_client *fsc); 367extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
368extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); 368extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
369extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
369extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc); 370extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
370 371
371extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); 372extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 233d906aec02..4aa7122a8d38 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -338,12 +338,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
338 return 0; 338 return 0;
339 } 339 }
340 340
341 if (num == 0 && realm->seq == ceph_empty_snapc->seq) {
342 ceph_get_snap_context(ceph_empty_snapc);
343 snapc = ceph_empty_snapc;
344 goto done;
345 }
346
347 /* alloc new snap context */ 341 /* alloc new snap context */
348 err = -ENOMEM; 342 err = -ENOMEM;
349 if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) 343 if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64))
@@ -381,7 +375,6 @@ static int build_snap_context(struct ceph_snap_realm *realm)
381 realm->ino, realm, snapc, snapc->seq, 375 realm->ino, realm, snapc, snapc->seq,
382 (unsigned int) snapc->num_snaps); 376 (unsigned int) snapc->num_snaps);
383 377
384done:
385 ceph_put_snap_context(realm->cached_context); 378 ceph_put_snap_context(realm->cached_context);
386 realm->cached_context = snapc; 379 realm->cached_context = snapc;
387 return 0; 380 return 0;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7b6bfcbf801c..f446afada328 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -708,6 +708,7 @@ static void ceph_umount_begin(struct super_block *sb)
708 if (!fsc) 708 if (!fsc)
709 return; 709 return;
710 fsc->mount_state = CEPH_MOUNT_SHUTDOWN; 710 fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
711 ceph_mdsc_force_umount(fsc->mdsc);
711 return; 712 return;
712} 713}
713 714
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 9ebee53d3bf5..397c5cd09794 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -46,6 +46,7 @@ struct ceph_options {
46 unsigned long mount_timeout; /* jiffies */ 46 unsigned long mount_timeout; /* jiffies */
47 unsigned long osd_idle_ttl; /* jiffies */ 47 unsigned long osd_idle_ttl; /* jiffies */
48 unsigned long osd_keepalive_timeout; /* jiffies */ 48 unsigned long osd_keepalive_timeout; /* jiffies */
49 unsigned long monc_ping_timeout; /* jiffies */
49 50
50 /* 51 /*
51 * any type that can't be simply compared or doesn't need need 52 * any type that can't be simply compared or doesn't need need
@@ -66,6 +67,7 @@ struct ceph_options {
66#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) 67#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
67#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) 68#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
68#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) 69#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
70#define CEPH_MONC_PING_TIMEOUT_DEFAULT msecs_to_jiffies(30 * 1000)
69 71
70#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) 72#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
71#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) 73#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 37753278987a..7e1252e97a30 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -248,6 +248,8 @@ struct ceph_connection {
248 int in_base_pos; /* bytes read */ 248 int in_base_pos; /* bytes read */
249 __le64 in_temp_ack; /* for reading an ack */ 249 __le64 in_temp_ack; /* for reading an ack */
250 250
251 struct timespec last_keepalive_ack;
252
251 struct delayed_work work; /* send|recv work */ 253 struct delayed_work work; /* send|recv work */
252 unsigned long delay; /* current delay interval */ 254 unsigned long delay; /* current delay interval */
253}; 255};
@@ -285,6 +287,8 @@ extern void ceph_msg_revoke(struct ceph_msg *msg);
285extern void ceph_msg_revoke_incoming(struct ceph_msg *msg); 287extern void ceph_msg_revoke_incoming(struct ceph_msg *msg);
286 288
287extern void ceph_con_keepalive(struct ceph_connection *con); 289extern void ceph_con_keepalive(struct ceph_connection *con);
290extern bool ceph_con_keepalive_expired(struct ceph_connection *con,
291 unsigned long interval);
288 292
289extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, 293extern void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
290 size_t length, size_t alignment); 294 size_t length, size_t alignment);
diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h
index 1c1887206ffa..0fe2656ac415 100644
--- a/include/linux/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
@@ -84,10 +84,12 @@ struct ceph_entity_inst {
84#define CEPH_MSGR_TAG_MSG 7 /* message */ 84#define CEPH_MSGR_TAG_MSG 7 /* message */
85#define CEPH_MSGR_TAG_ACK 8 /* message ack */ 85#define CEPH_MSGR_TAG_ACK 8 /* message ack */
86#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */ 86#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
87#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */ 87#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
88#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */ 88#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
89#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */ 89#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
90#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */ 90#define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */
91#define CEPH_MSGR_TAG_KEEPALIVE2 14 /* keepalive2 byte + ceph_timespec */
92#define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */
91 93
92 94
93/* 95/*
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 69a4d30a9ccf..54a00d66509e 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -357,6 +357,7 @@ ceph_parse_options(char *options, const char *dev_name,
357 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 357 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
358 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; 358 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
359 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; 359 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
360 opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT;
360 361
361 /* get mon ip(s) */ 362 /* get mon ip(s) */
362 /* ip1[:port1][,ip2[:port2]...] */ 363 /* ip1[:port1][,ip2[:port2]...] */
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 790fe89d90c0..4440edcce0d6 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -79,10 +79,6 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
79 return 0; 79 return 0;
80} 80}
81 81
82
83
84#define AES_KEY_SIZE 16
85
86static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void) 82static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
87{ 83{
88 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC); 84 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index e3be1d22a247..525f454f7531 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -163,6 +163,7 @@ static struct kmem_cache *ceph_msg_data_cache;
163static char tag_msg = CEPH_MSGR_TAG_MSG; 163static char tag_msg = CEPH_MSGR_TAG_MSG;
164static char tag_ack = CEPH_MSGR_TAG_ACK; 164static char tag_ack = CEPH_MSGR_TAG_ACK;
165static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; 165static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
166static char tag_keepalive2 = CEPH_MSGR_TAG_KEEPALIVE2;
166 167
167#ifdef CONFIG_LOCKDEP 168#ifdef CONFIG_LOCKDEP
168static struct lock_class_key socket_class; 169static struct lock_class_key socket_class;
@@ -176,7 +177,7 @@ static struct lock_class_key socket_class;
176 177
177static void queue_con(struct ceph_connection *con); 178static void queue_con(struct ceph_connection *con);
178static void cancel_con(struct ceph_connection *con); 179static void cancel_con(struct ceph_connection *con);
179static void con_work(struct work_struct *); 180static void ceph_con_workfn(struct work_struct *);
180static void con_fault(struct ceph_connection *con); 181static void con_fault(struct ceph_connection *con);
181 182
182/* 183/*
@@ -276,22 +277,22 @@ static void _ceph_msgr_exit(void)
276 ceph_msgr_wq = NULL; 277 ceph_msgr_wq = NULL;
277 } 278 }
278 279
279 ceph_msgr_slab_exit();
280
281 BUG_ON(zero_page == NULL); 280 BUG_ON(zero_page == NULL);
282 page_cache_release(zero_page); 281 page_cache_release(zero_page);
283 zero_page = NULL; 282 zero_page = NULL;
283
284 ceph_msgr_slab_exit();
284} 285}
285 286
286int ceph_msgr_init(void) 287int ceph_msgr_init(void)
287{ 288{
289 if (ceph_msgr_slab_init())
290 return -ENOMEM;
291
288 BUG_ON(zero_page != NULL); 292 BUG_ON(zero_page != NULL);
289 zero_page = ZERO_PAGE(0); 293 zero_page = ZERO_PAGE(0);
290 page_cache_get(zero_page); 294 page_cache_get(zero_page);
291 295
292 if (ceph_msgr_slab_init())
293 return -ENOMEM;
294
295 /* 296 /*
296 * The number of active work items is limited by the number of 297 * The number of active work items is limited by the number of
297 * connections, so leave @max_active at default. 298 * connections, so leave @max_active at default.
@@ -749,7 +750,7 @@ void ceph_con_init(struct ceph_connection *con, void *private,
749 mutex_init(&con->mutex); 750 mutex_init(&con->mutex);
750 INIT_LIST_HEAD(&con->out_queue); 751 INIT_LIST_HEAD(&con->out_queue);
751 INIT_LIST_HEAD(&con->out_sent); 752 INIT_LIST_HEAD(&con->out_sent);
752 INIT_DELAYED_WORK(&con->work, con_work); 753 INIT_DELAYED_WORK(&con->work, ceph_con_workfn);
753 754
754 con->state = CON_STATE_CLOSED; 755 con->state = CON_STATE_CLOSED;
755} 756}
@@ -1351,7 +1352,15 @@ static void prepare_write_keepalive(struct ceph_connection *con)
1351{ 1352{
1352 dout("prepare_write_keepalive %p\n", con); 1353 dout("prepare_write_keepalive %p\n", con);
1353 con_out_kvec_reset(con); 1354 con_out_kvec_reset(con);
1354 con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); 1355 if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) {
1356 struct timespec ts = CURRENT_TIME;
1357 struct ceph_timespec ceph_ts;
1358 ceph_encode_timespec(&ceph_ts, &ts);
1359 con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2);
1360 con_out_kvec_add(con, sizeof(ceph_ts), &ceph_ts);
1361 } else {
1362 con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive);
1363 }
1355 con_flag_set(con, CON_FLAG_WRITE_PENDING); 1364 con_flag_set(con, CON_FLAG_WRITE_PENDING);
1356} 1365}
1357 1366
@@ -1625,6 +1634,12 @@ static void prepare_read_tag(struct ceph_connection *con)
1625 con->in_tag = CEPH_MSGR_TAG_READY; 1634 con->in_tag = CEPH_MSGR_TAG_READY;
1626} 1635}
1627 1636
1637static void prepare_read_keepalive_ack(struct ceph_connection *con)
1638{
1639 dout("prepare_read_keepalive_ack %p\n", con);
1640 con->in_base_pos = 0;
1641}
1642
1628/* 1643/*
1629 * Prepare to read a message. 1644 * Prepare to read a message.
1630 */ 1645 */
@@ -2322,13 +2337,6 @@ static int read_partial_message(struct ceph_connection *con)
2322 return ret; 2337 return ret;
2323 2338
2324 BUG_ON(!con->in_msg ^ skip); 2339 BUG_ON(!con->in_msg ^ skip);
2325 if (con->in_msg && data_len > con->in_msg->data_length) {
2326 pr_warn("%s skipping long message (%u > %zd)\n",
2327 __func__, data_len, con->in_msg->data_length);
2328 ceph_msg_put(con->in_msg);
2329 con->in_msg = NULL;
2330 skip = 1;
2331 }
2332 if (skip) { 2340 if (skip) {
2333 /* skip this message */ 2341 /* skip this message */
2334 dout("alloc_msg said skip message\n"); 2342 dout("alloc_msg said skip message\n");
@@ -2457,6 +2465,17 @@ static void process_message(struct ceph_connection *con)
2457 mutex_lock(&con->mutex); 2465 mutex_lock(&con->mutex);
2458} 2466}
2459 2467
2468static int read_keepalive_ack(struct ceph_connection *con)
2469{
2470 struct ceph_timespec ceph_ts;
2471 size_t size = sizeof(ceph_ts);
2472 int ret = read_partial(con, size, size, &ceph_ts);
2473 if (ret <= 0)
2474 return ret;
2475 ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts);
2476 prepare_read_tag(con);
2477 return 1;
2478}
2460 2479
2461/* 2480/*
2462 * Write something to the socket. Called in a worker thread when the 2481 * Write something to the socket. Called in a worker thread when the
@@ -2526,6 +2545,10 @@ more_kvec:
2526 2545
2527do_next: 2546do_next:
2528 if (con->state == CON_STATE_OPEN) { 2547 if (con->state == CON_STATE_OPEN) {
2548 if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
2549 prepare_write_keepalive(con);
2550 goto more;
2551 }
2529 /* is anything else pending? */ 2552 /* is anything else pending? */
2530 if (!list_empty(&con->out_queue)) { 2553 if (!list_empty(&con->out_queue)) {
2531 prepare_write_message(con); 2554 prepare_write_message(con);
@@ -2535,10 +2558,6 @@ do_next:
2535 prepare_write_ack(con); 2558 prepare_write_ack(con);
2536 goto more; 2559 goto more;
2537 } 2560 }
2538 if (con_flag_test_and_clear(con, CON_FLAG_KEEPALIVE_PENDING)) {
2539 prepare_write_keepalive(con);
2540 goto more;
2541 }
2542 } 2561 }
2543 2562
2544 /* Nothing to do! */ 2563 /* Nothing to do! */
@@ -2641,6 +2660,9 @@ more:
2641 case CEPH_MSGR_TAG_ACK: 2660 case CEPH_MSGR_TAG_ACK:
2642 prepare_read_ack(con); 2661 prepare_read_ack(con);
2643 break; 2662 break;
2663 case CEPH_MSGR_TAG_KEEPALIVE2_ACK:
2664 prepare_read_keepalive_ack(con);
2665 break;
2644 case CEPH_MSGR_TAG_CLOSE: 2666 case CEPH_MSGR_TAG_CLOSE:
2645 con_close_socket(con); 2667 con_close_socket(con);
2646 con->state = CON_STATE_CLOSED; 2668 con->state = CON_STATE_CLOSED;
@@ -2684,6 +2706,12 @@ more:
2684 process_ack(con); 2706 process_ack(con);
2685 goto more; 2707 goto more;
2686 } 2708 }
2709 if (con->in_tag == CEPH_MSGR_TAG_KEEPALIVE2_ACK) {
2710 ret = read_keepalive_ack(con);
2711 if (ret <= 0)
2712 goto out;
2713 goto more;
2714 }
2687 2715
2688out: 2716out:
2689 dout("try_read done on %p ret %d\n", con, ret); 2717 dout("try_read done on %p ret %d\n", con, ret);
@@ -2799,7 +2827,7 @@ static void con_fault_finish(struct ceph_connection *con)
2799/* 2827/*
2800 * Do some work on a connection. Drop a connection ref when we're done. 2828 * Do some work on a connection. Drop a connection ref when we're done.
2801 */ 2829 */
2802static void con_work(struct work_struct *work) 2830static void ceph_con_workfn(struct work_struct *work)
2803{ 2831{
2804 struct ceph_connection *con = container_of(work, struct ceph_connection, 2832 struct ceph_connection *con = container_of(work, struct ceph_connection,
2805 work.work); 2833 work.work);
@@ -3101,6 +3129,20 @@ void ceph_con_keepalive(struct ceph_connection *con)
3101} 3129}
3102EXPORT_SYMBOL(ceph_con_keepalive); 3130EXPORT_SYMBOL(ceph_con_keepalive);
3103 3131
3132bool ceph_con_keepalive_expired(struct ceph_connection *con,
3133 unsigned long interval)
3134{
3135 if (interval > 0 &&
3136 (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) {
3137 struct timespec now = CURRENT_TIME;
3138 struct timespec ts;
3139 jiffies_to_timespec(interval, &ts);
3140 ts = timespec_add(con->last_keepalive_ack, ts);
3141 return timespec_compare(&now, &ts) >= 0;
3142 }
3143 return false;
3144}
3145
3104static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) 3146static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
3105{ 3147{
3106 struct ceph_msg_data *data; 3148 struct ceph_msg_data *data;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 9d6ff1215928..edda01626a45 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -149,6 +149,10 @@ static int __open_session(struct ceph_mon_client *monc)
149 CEPH_ENTITY_TYPE_MON, monc->cur_mon, 149 CEPH_ENTITY_TYPE_MON, monc->cur_mon,
150 &monc->monmap->mon_inst[monc->cur_mon].addr); 150 &monc->monmap->mon_inst[monc->cur_mon].addr);
151 151
152 /* send an initial keepalive to ensure our timestamp is
153 * valid by the time we are in an OPENED state */
154 ceph_con_keepalive(&monc->con);
155
152 /* initiatiate authentication handshake */ 156 /* initiatiate authentication handshake */
153 ret = ceph_auth_build_hello(monc->auth, 157 ret = ceph_auth_build_hello(monc->auth,
154 monc->m_auth->front.iov_base, 158 monc->m_auth->front.iov_base,
@@ -170,14 +174,19 @@ static bool __sub_expired(struct ceph_mon_client *monc)
170 */ 174 */
171static void __schedule_delayed(struct ceph_mon_client *monc) 175static void __schedule_delayed(struct ceph_mon_client *monc)
172{ 176{
173 unsigned int delay; 177 struct ceph_options *opt = monc->client->options;
178 unsigned long delay;
174 179
175 if (monc->cur_mon < 0 || __sub_expired(monc)) 180 if (monc->cur_mon < 0 || __sub_expired(monc)) {
176 delay = 10 * HZ; 181 delay = 10 * HZ;
177 else 182 } else {
178 delay = 20 * HZ; 183 delay = 20 * HZ;
179 dout("__schedule_delayed after %u\n", delay); 184 if (opt->monc_ping_timeout > 0)
180 schedule_delayed_work(&monc->delayed_work, delay); 185 delay = min(delay, opt->monc_ping_timeout / 3);
186 }
187 dout("__schedule_delayed after %lu\n", delay);
188 schedule_delayed_work(&monc->delayed_work,
189 round_jiffies_relative(delay));
181} 190}
182 191
183/* 192/*
@@ -743,11 +752,23 @@ static void delayed_work(struct work_struct *work)
743 __close_session(monc); 752 __close_session(monc);
744 __open_session(monc); /* continue hunting */ 753 __open_session(monc); /* continue hunting */
745 } else { 754 } else {
746 ceph_con_keepalive(&monc->con); 755 struct ceph_options *opt = monc->client->options;
756 int is_auth = ceph_auth_is_authenticated(monc->auth);
757 if (ceph_con_keepalive_expired(&monc->con,
758 opt->monc_ping_timeout)) {
759 dout("monc keepalive timeout\n");
760 is_auth = 0;
761 __close_session(monc);
762 monc->hunting = true;
763 __open_session(monc);
764 }
747 765
748 __validate_auth(monc); 766 if (!monc->hunting) {
767 ceph_con_keepalive(&monc->con);
768 __validate_auth(monc);
769 }
749 770
750 if (ceph_auth_is_authenticated(monc->auth)) 771 if (is_auth)
751 __send_subscribe(monc); 772 __send_subscribe(monc);
752 } 773 }
753 __schedule_delayed(monc); 774 __schedule_delayed(monc);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 50033677c0fa..80b94e37c94a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -2817,8 +2817,9 @@ out:
2817} 2817}
2818 2818
2819/* 2819/*
2820 * lookup and return message for incoming reply. set up reply message 2820 * Lookup and return message for incoming reply. Don't try to do
2821 * pages. 2821 * anything about a larger than preallocated data portion of the
2822 * message at the moment - for now, just skip the message.
2822 */ 2823 */
2823static struct ceph_msg *get_reply(struct ceph_connection *con, 2824static struct ceph_msg *get_reply(struct ceph_connection *con,
2824 struct ceph_msg_header *hdr, 2825 struct ceph_msg_header *hdr,
@@ -2836,10 +2837,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2836 mutex_lock(&osdc->request_mutex); 2837 mutex_lock(&osdc->request_mutex);
2837 req = __lookup_request(osdc, tid); 2838 req = __lookup_request(osdc, tid);
2838 if (!req) { 2839 if (!req) {
2839 *skip = 1; 2840 pr_warn("%s osd%d tid %llu unknown, skipping\n",
2841 __func__, osd->o_osd, tid);
2840 m = NULL; 2842 m = NULL;
2841 dout("get_reply unknown tid %llu from osd%d\n", tid, 2843 *skip = 1;
2842 osd->o_osd);
2843 goto out; 2844 goto out;
2844 } 2845 }
2845 2846
@@ -2849,10 +2850,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2849 ceph_msg_revoke_incoming(req->r_reply); 2850 ceph_msg_revoke_incoming(req->r_reply);
2850 2851
2851 if (front_len > req->r_reply->front_alloc_len) { 2852 if (front_len > req->r_reply->front_alloc_len) {
2852 pr_warn("get_reply front %d > preallocated %d (%u#%llu)\n", 2853 pr_warn("%s osd%d tid %llu front %d > preallocated %d\n",
2853 front_len, req->r_reply->front_alloc_len, 2854 __func__, osd->o_osd, req->r_tid, front_len,
2854 (unsigned int)con->peer_name.type, 2855 req->r_reply->front_alloc_len);
2855 le64_to_cpu(con->peer_name.num));
2856 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS, 2856 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front_len, GFP_NOFS,
2857 false); 2857 false);
2858 if (!m) 2858 if (!m)
@@ -2860,37 +2860,22 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2860 ceph_msg_put(req->r_reply); 2860 ceph_msg_put(req->r_reply);
2861 req->r_reply = m; 2861 req->r_reply = m;
2862 } 2862 }
2863 m = ceph_msg_get(req->r_reply);
2864
2865 if (data_len > 0) {
2866 struct ceph_osd_data *osd_data;
2867 2863
2868 /* 2864 if (data_len > req->r_reply->data_length) {
2869 * XXX This is assuming there is only one op containing 2865 pr_warn("%s osd%d tid %llu data %d > preallocated %zu, skipping\n",
2870 * XXX page data. Probably OK for reads, but this 2866 __func__, osd->o_osd, req->r_tid, data_len,
2871 * XXX ought to be done more generally. 2867 req->r_reply->data_length);
2872 */ 2868 m = NULL;
2873 osd_data = osd_req_op_extent_osd_data(req, 0); 2869 *skip = 1;
2874 if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) { 2870 goto out;
2875 if (osd_data->pages &&
2876 unlikely(osd_data->length < data_len)) {
2877
2878 pr_warn("tid %lld reply has %d bytes we had only %llu bytes ready\n",
2879 tid, data_len, osd_data->length);
2880 *skip = 1;
2881 ceph_msg_put(m);
2882 m = NULL;
2883 goto out;
2884 }
2885 }
2886 } 2871 }
2887 *skip = 0; 2872
2873 m = ceph_msg_get(req->r_reply);
2888 dout("get_reply tid %lld %p\n", tid, m); 2874 dout("get_reply tid %lld %p\n", tid, m);
2889 2875
2890out: 2876out:
2891 mutex_unlock(&osdc->request_mutex); 2877 mutex_unlock(&osdc->request_mutex);
2892 return m; 2878 return m;
2893
2894} 2879}
2895 2880
2896static struct ceph_msg *alloc_msg(struct ceph_connection *con, 2881static struct ceph_msg *alloc_msg(struct ceph_connection *con,
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 4a3125836b64..7d8f581d9f1f 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1300,7 +1300,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1300 ceph_decode_addr(&addr); 1300 ceph_decode_addr(&addr);
1301 pr_info("osd%d up\n", osd); 1301 pr_info("osd%d up\n", osd);
1302 BUG_ON(osd >= map->max_osd); 1302 BUG_ON(osd >= map->max_osd);
1303 map->osd_state[osd] |= CEPH_OSD_UP; 1303 map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
1304 map->osd_addr[osd] = addr; 1304 map->osd_addr[osd] = addr;
1305 } 1305 }
1306 1306