 Documentation/filesystems/00-INDEX |  2
 Documentation/filesystems/ceph.txt | 11
 MAINTAINERS                        |  2
 fs/ceph/addr.c                     | 10
 fs/ceph/auth_x.c                   | 53
 fs/ceph/caps.c                     | 73
 fs/ceph/dir.c                      |  4
 fs/ceph/inode.c                    | 16
 fs/ceph/mds_client.c               | 43
 fs/ceph/messenger.c                | 19
 fs/ceph/messenger.h                |  1
 fs/ceph/osd_client.c               | 29
 fs/ceph/osd_client.h               |  2
 fs/ceph/osdmap.c                   | 17
 fs/ceph/snap.c                     |  6
 15 files changed, 191 insertions(+), 97 deletions(-)
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 3bae418c6ad3..4303614b5add 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -16,6 +16,8 @@ befs.txt
 	- information about the BeOS filesystem for Linux.
 bfs.txt
 	- info for the SCO UnixWare Boot Filesystem (BFS).
+ceph.txt
+	- info for the Ceph Distributed File System
 cifs.txt
 	- description of the CIFS filesystem.
 coda.txt
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
index 6e03917316bd..0660c9f5deef 100644
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@@ -8,7 +8,7 @@ Basic features include:
 
  * POSIX semantics
  * Seamless scaling from 1 to many thousands of nodes
- * High availability and reliability. No single points of failure.
+ * High availability and reliability. No single point of failure.
  * N-way replication of data across storage nodes
  * Fast recovery from node failures
  * Automatic rebalancing of data on node addition/removal
@@ -94,7 +94,7 @@ Mount Options
 
   wsize=X
 	Specify the maximum write size in bytes. By default there is no
-	maximu. Ceph will normally size writes based on the file stripe
+	maximum. Ceph will normally size writes based on the file stripe
 	size.
 
   rsize=X
@@ -115,7 +115,7 @@ Mount Options
 	number of entries in that directory.
 
   nocrc
-	Disable CRC32C calculation for data writes. If set, the OSD
+	Disable CRC32C calculation for data writes. If set, the storage node
 	must rely on TCP's error correction to detect data corruption
 	in the data payload.
 
@@ -133,7 +133,8 @@ For more information on Ceph, see the home page at
 	http://ceph.newdream.net/
 
 The Linux kernel client source tree is available at
-	git://ceph.newdream.net/linux-ceph-client.git
+	git://ceph.newdream.net/git/ceph-client.git
+	git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
 
 and the source for the full system is at
-	git://ceph.newdream.net/ceph.git
+	git://ceph.newdream.net/git/ceph.git
diff --git a/MAINTAINERS b/MAINTAINERS
index 1a203f9626f6..088bd41ac71e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1443,7 +1443,7 @@ F:	arch/powerpc/platforms/cell/
 
 CEPH DISTRIBUTED FILE SYSTEM CLIENT
 M:	Sage Weil <sage@newdream.net>
-L:	ceph-devel@lists.sourceforge.net
+L:	ceph-devel@vger.kernel.org
 W:	http://ceph.newdream.net/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
 S:	Supported
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 23bb0ceabe31..ce8ef6107727 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -919,6 +919,10 @@ static int context_is_writeable_or_written(struct inode *inode,
 /*
  * We are only allowed to write into/dirty the page if the page is
  * clean, or already dirty within the same snap context.
+ *
+ * called with page locked.
+ * return success with page locked,
+ * or any failure (incl -EAGAIN) with page unlocked.
  */
 static int ceph_update_writeable_page(struct file *file,
 			    loff_t pos, unsigned len,
@@ -961,9 +965,11 @@ retry_locked:
 			snapc = ceph_get_snap_context((void *)page->private);
 			unlock_page(page);
 			ceph_queue_writeback(inode);
-			wait_event_interruptible(ci->i_cap_wq,
+			r = wait_event_interruptible(ci->i_cap_wq,
 			       context_is_writeable_or_written(inode, snapc));
 			ceph_put_snap_context(snapc);
+			if (r == -ERESTARTSYS)
+				return r;
 			return -EAGAIN;
 		}
 
@@ -1035,7 +1041,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 	int r;
 
 	do {
-		/* get a page*/
+		/* get a page */
 		page = grab_cache_page_write_begin(mapping, index, 0);
 		if (!page)
 			return -ENOMEM;
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index f0318427b6da..8d8a84964763 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -28,6 +28,12 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
 	return (ac->want_keys & xi->have_keys) == ac->want_keys;
 }
 
+static int ceph_x_encrypt_buflen(int ilen)
+{
+	return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
+		sizeof(u32);
+}
+
 static int ceph_x_encrypt(struct ceph_crypto_key *secret,
 			  void *ibuf, int ilen, void *obuf, size_t olen)
 {
@@ -150,6 +156,11 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		struct timespec validity;
 		struct ceph_crypto_key old_key;
 		void *tp, *tpend;
+		struct ceph_timespec new_validity;
+		struct ceph_crypto_key new_session_key;
+		struct ceph_buffer *new_ticket_blob;
+		unsigned long new_expires, new_renew_after;
+		u64 new_secret_id;
 
 		ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
 
@@ -182,16 +193,16 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 			goto bad;
 
 		memcpy(&old_key, &th->session_key, sizeof(old_key));
-		ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
+		ret = ceph_crypto_key_decode(&new_session_key, &dp, dend);
 		if (ret)
 			goto out;
 
-		ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
-		ceph_decode_timespec(&validity, &th->validity);
-		th->expires = get_seconds() + validity.tv_sec;
-		th->renew_after = th->expires - (validity.tv_sec / 4);
-		dout(" expires=%lu renew_after=%lu\n", th->expires,
-		     th->renew_after);
+		ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
+		ceph_decode_timespec(&validity, &new_validity);
+		new_expires = get_seconds() + validity.tv_sec;
+		new_renew_after = new_expires - (validity.tv_sec / 4);
+		dout(" expires=%lu renew_after=%lu\n", new_expires,
+		     new_renew_after);
 
 		/* ticket blob for service */
 		ceph_decode_8_safe(&p, end, is_enc, bad);
@@ -216,10 +227,21 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
 		dout(" ticket blob is %d bytes\n", dlen);
 		ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
 		struct_v = ceph_decode_8(&tp);
-		th->secret_id = ceph_decode_64(&tp);
-		ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
+		new_secret_id = ceph_decode_64(&tp);
+		ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
 		if (ret)
 			goto out;
+
+		/* all is well, update our ticket */
+		ceph_crypto_key_destroy(&th->session_key);
+		if (th->ticket_blob)
+			ceph_buffer_put(th->ticket_blob);
+		th->session_key = new_session_key;
+		th->ticket_blob = new_ticket_blob;
+		th->validity = new_validity;
+		th->secret_id = new_secret_id;
+		th->expires = new_expires;
+		th->renew_after = new_renew_after;
 		dout(" got ticket service %d (%s) secret_id %lld len %d\n",
 		     type, ceph_entity_type_name(type), th->secret_id,
 		     (int)th->ticket_blob->vec.iov_len);
@@ -242,7 +264,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 				   struct ceph_x_ticket_handler *th,
 				   struct ceph_x_authorizer *au)
 {
-	int len;
+	int maxlen;
 	struct ceph_x_authorize_a *msg_a;
 	struct ceph_x_authorize_b msg_b;
 	void *p, *end;
@@ -253,15 +275,15 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 	dout("build_authorizer for %s %p\n",
 	     ceph_entity_type_name(th->service), au);
 
-	len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
-		ticket_blob_len + 16;
-	dout(" need len %d\n", len);
-	if (au->buf && au->buf->alloc_len < len) {
+	maxlen = sizeof(*msg_a) + sizeof(msg_b) +
+		ceph_x_encrypt_buflen(ticket_blob_len);
+	dout(" need len %d\n", maxlen);
+	if (au->buf && au->buf->alloc_len < maxlen) {
 		ceph_buffer_put(au->buf);
 		au->buf = NULL;
 	}
 	if (!au->buf) {
-		au->buf = ceph_buffer_new(len, GFP_NOFS);
+		au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
 		if (!au->buf)
 			return -ENOMEM;
 	}
@@ -296,6 +318,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
 	au->buf->vec.iov_len = p - au->buf->vec.iov_base;
 	dout(" built authorizer nonce %llx len %d\n", au->nonce,
 	     (int)au->buf->vec.iov_len);
+	BUG_ON(au->buf->vec.iov_len > maxlen);
 	return 0;
 
 out_buf:
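A note on the auth_x.c sizing change above: the old authorizer code reserved sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) + ticket_blob_len + 16, which allows for cipher padding but not for the encryption header, so the encrypted ticket blob could in principle outgrow the buffer. The new ceph_x_encrypt_buflen() helper computes the worst-case encrypted length (header + payload + up to one 16-byte cipher block of padding + a u32 length prefix), and the added BUG_ON enforces it. The standalone sketch below only illustrates that arithmetic; it is not part of the patch, and the header size constant is an assumption, not the real sizeof(struct ceph_x_encrypt_header).

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative sketch of the worst-case sizing idea behind
 * ceph_x_encrypt_buflen(): ciphertext = header + payload + up to one
 * 16-byte cipher block of padding, plus a u32 length prefix.
 * ENCRYPT_HDR_LEN is an assumed value for illustration only.
 */
#define ENCRYPT_HDR_LEN 12

static int encrypt_buflen(int ilen)
{
	return ENCRYPT_HDR_LEN + ilen + 16 + (int)sizeof(uint32_t);
}

int main(void)
{
	/* e.g. a 100-byte ticket blob needs at most 132 bytes of buffer */
	printf("max encrypted size: %d\n", encrypt_buflen(100));
	return 0;
}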
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index db122bb357b8..7d0a0d0adc18 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1407,6 +1407,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
  */
 void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 		     struct ceph_mds_session *session)
+	__releases(session->s_mutex)
 {
 	struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
 	struct ceph_mds_client *mdsc = &client->mdsc;
@@ -1414,7 +1415,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
 	struct ceph_cap *cap;
 	int file_wanted, used;
 	int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
-	int drop_session_lock = session ? 0 : 1;
 	int issued, implemented, want, retain, revoking, flushing = 0;
 	int mds = -1; /* keep track of how far we've gone through i_caps list
 			 to avoid an infinite loop on retry */
@@ -1639,7 +1639,7 @@ ack:
 	if (queue_invalidate)
 		ceph_queue_invalidate(inode);
 
-	if (session && drop_session_lock)
+	if (session)
 		mutex_unlock(&session->s_mutex);
 	if (took_snap_rwsem)
 		up_read(&mdsc->snap_rwsem);
@@ -2195,18 +2195,19 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
 * actually be a revocation if it specifies a smaller cap set.)
 *
- * caller holds s_mutex.
+ * caller holds s_mutex and i_lock, we drop both.
+ *
 * return value:
 *  0 - ok
 *  1 - check_caps on auth cap only (writeback)
 *  2 - check_caps (ack revoke)
 */
-static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 			    struct ceph_mds_session *session,
 			    struct ceph_cap *cap,
 			    struct ceph_buffer *xattr_buf)
 	__releases(inode->i_lock)
-
+	__releases(session->s_mutex)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	int mds = session->s_mds;
@@ -2216,7 +2217,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 	u64 size = le64_to_cpu(grant->size);
 	u64 max_size = le64_to_cpu(grant->max_size);
 	struct timespec mtime, atime, ctime;
-	int reply = 0;
+	int check_caps = 0;
 	int wake = 0;
 	int writeback = 0;
 	int revoked_rdcache = 0;
@@ -2329,11 +2330,12 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
 			writeback = 1; /* will delay ack */
 		else if (dirty & ~newcaps)
-			reply = 1; /* initiate writeback in check_caps */
+			check_caps = 1; /* initiate writeback in check_caps */
 		else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
 			   revoked_rdcache)
-			reply = 2; /* send revoke ack in check_caps */
+			check_caps = 2; /* send revoke ack in check_caps */
 		cap->issued = newcaps;
+		cap->implemented |= newcaps;
 	} else if (cap->issued == newcaps) {
 		dout("caps unchanged: %s -> %s\n",
 		     ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
@@ -2346,6 +2348,7 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		 * pending revocation */
 		wake = 1;
 	}
+	BUG_ON(cap->issued & ~cap->implemented);
 
 	spin_unlock(&inode->i_lock);
 	if (writeback)
@@ -2359,7 +2362,14 @@ static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
 		ceph_queue_invalidate(inode);
 	if (wake)
 		wake_up(&ci->i_cap_wq);
-	return reply;
+
+	if (check_caps == 1)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+				session);
+	else if (check_caps == 2)
+		ceph_check_caps(ci, CHECK_CAPS_NODELAY, session);
+	else
+		mutex_unlock(&session->s_mutex);
 }
 
 /*
@@ -2548,9 +2558,8 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			ci->i_cap_exporting_issued = cap->issued;
 		}
 		__ceph_remove_cap(cap);
-	} else {
-		WARN_ON(!cap);
 	}
+	/* else, we already released it */
 
 	spin_unlock(&inode->i_lock);
 }
@@ -2621,9 +2630,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	u64 cap_id;
 	u64 size, max_size;
 	u64 tid;
-	int check_caps = 0;
 	void *snaptrace;
-	int r;
 
 	dout("handle_caps from mds%d\n", mds);
 
@@ -2668,8 +2675,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	case CEPH_CAP_OP_IMPORT:
 		handle_cap_import(mdsc, inode, h, session,
 				  snaptrace, le32_to_cpu(h->snap_trace_len));
-		check_caps = 1; /* we may have sent a RELEASE to the old auth */
-		goto done;
+		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY,
+				session);
+		goto done_unlocked;
 	}
 
 	/* the rest require a cap */
@@ -2686,16 +2694,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	switch (op) {
 	case CEPH_CAP_OP_REVOKE:
 	case CEPH_CAP_OP_GRANT:
-		r = handle_cap_grant(inode, h, session, cap, msg->middle);
-		if (r == 1)
-			ceph_check_caps(ceph_inode(inode),
-					CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
-					session);
-		else if (r == 2)
-			ceph_check_caps(ceph_inode(inode),
-					CHECK_CAPS_NODELAY,
-					session);
-		break;
+		handle_cap_grant(inode, h, session, cap, msg->middle);
+		goto done_unlocked;
 
 	case CEPH_CAP_OP_FLUSH_ACK:
 		handle_cap_flush_ack(inode, tid, h, session, cap);
@@ -2713,9 +2713,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 
 done:
 	mutex_unlock(&session->s_mutex);
-
-	if (check_caps)
-		ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
+done_unlocked:
 	if (inode)
 		iput(inode);
 	return;
@@ -2838,11 +2836,18 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 	struct ceph_cap *cap;
 	struct ceph_mds_request_release *rel = *p;
 	int ret = 0;
-
-	dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
-	     mds, ceph_cap_string(drop), ceph_cap_string(unless));
+	int used = 0;
 
 	spin_lock(&inode->i_lock);
+	used = __ceph_caps_used(ci);
+
+	dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
+	     mds, ceph_cap_string(used), ceph_cap_string(drop),
+	     ceph_cap_string(unless));
+
+	/* only drop unused caps */
+	drop &= ~used;
+
 	cap = __get_cap_for_mds(ci, mds);
 	if (cap && __cap_is_valid(cap)) {
 		if (force ||
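One detail of the ceph_encode_inode_release() change above is easy to miss: before encoding a release, the patch now masks the requested drop set with the caps that are currently in use (drop &= ~used), so the client never offers to give up a capability it is actively relying on. The tiny standalone sketch below only illustrates that masking; the bit values are made up for illustration and are not the real CEPH_CAP_* constants.

#include <stdio.h>

/* Hypothetical capability bits, for illustration only. */
#define CAP_FILE_RD    0x01
#define CAP_FILE_CACHE 0x02
#define CAP_FILE_WR    0x04

int main(void)
{
	int drop = CAP_FILE_RD | CAP_FILE_CACHE; /* caller asks to drop these */
	int used = CAP_FILE_CACHE;               /* but the cache cap is busy */

	drop &= ~used; /* only drop unused caps, as in the patch */
	printf("caps actually dropped: 0x%x\n", drop); /* prints 0x1 */
	return 0;
}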
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 5107384ee029..8a9116e15b70 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -288,8 +288,10 @@ more:
 			CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
 
 		/* discard old result, if any */
-		if (fi->last_readdir)
+		if (fi->last_readdir) {
 			ceph_mdsc_put_request(fi->last_readdir);
+			fi->last_readdir = NULL;
+		}
 
 		/* requery frag tree, as the frag topology may have changed */
 		frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 7abe1aed819b..aca82d55cc53 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -378,6 +378,22 @@ void ceph_destroy_inode(struct inode *inode)
 
 	ceph_queue_caps_release(inode);
 
+	/*
+	 * we may still have a snap_realm reference if there are stray
+	 * caps in i_cap_exporting_issued or i_snap_caps.
+	 */
+	if (ci->i_snap_realm) {
+		struct ceph_mds_client *mdsc =
+			&ceph_client(ci->vfs_inode.i_sb)->mdsc;
+		struct ceph_snap_realm *realm = ci->i_snap_realm;
+
+		dout(" dropping residual ref to snap realm %p\n", realm);
+		spin_lock(&realm->inodes_with_caps_lock);
+		list_del_init(&ci->i_snap_realm_item);
+		spin_unlock(&realm->inodes_with_caps_lock);
+		ceph_put_snap_realm(mdsc, realm);
+	}
+
 	kfree(ci->i_symlink);
 	while ((n = rb_first(&ci->i_fragtree)) != NULL) {
 		frag = rb_entry(n, struct ceph_inode_frag, node);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a2600101ec22..5c7920be6420 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -328,6 +328,8 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	struct ceph_mds_session *s;
 
 	s = kzalloc(sizeof(*s), GFP_NOFS);
+	if (!s)
+		return ERR_PTR(-ENOMEM);
 	s->s_mdsc = mdsc;
 	s->s_mds = mds;
 	s->s_state = CEPH_MDS_SESSION_NEW;
@@ -529,7 +531,7 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 {
 	dout("__unregister_request %p tid %lld\n", req, req->r_tid);
 	rb_erase(&req->r_node, &mdsc->request_tree);
-	ceph_mdsc_put_request(req);
+	RB_CLEAR_NODE(&req->r_node);
 
 	if (req->r_unsafe_dir) {
 		struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
@@ -538,6 +540,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 		list_del_init(&req->r_unsafe_dir_item);
 		spin_unlock(&ci->i_unsafe_lock);
 	}
+
+	ceph_mdsc_put_request(req);
 }
 
 /*
@@ -862,6 +866,7 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
 	if (time_after_eq(jiffies, session->s_cap_ttl) &&
 	    time_after_eq(session->s_cap_ttl, session->s_renew_requested))
 		pr_info("mds%d caps stale\n", session->s_mds);
+	session->s_renew_requested = jiffies;
 
 	/* do not try to renew caps until a recovering mds has reconnected
 	 * with its clients. */
@@ -874,7 +879,6 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
 
 	dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
 	     ceph_mds_state_name(state));
-	session->s_renew_requested = jiffies;
 	msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
 				 ++session->s_renew_seq);
 	if (IS_ERR(msg))
@@ -1566,8 +1570,13 @@ static int __do_request(struct ceph_mds_client *mdsc,
 
 	/* get, open session */
 	session = __ceph_lookup_mds_session(mdsc, mds);
-	if (!session)
+	if (!session) {
 		session = register_session(mdsc, mds);
+		if (IS_ERR(session)) {
+			err = PTR_ERR(session);
+			goto finish;
+		}
+	}
 	dout("do_request mds%d session %p state %s\n", mds, session,
 	     session_state_name(session->s_state));
 	if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1770,7 +1779,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	dout("handle_reply %p\n", req);
 
 	/* correct session? */
-	if (!req->r_session && req->r_session != session) {
+	if (req->r_session != session) {
 		pr_err("mdsc_handle_reply got %llu on session mds%d"
 		       " not mds%d\n", tid, session->s_mds,
 		       req->r_session ? req->r_session->s_mds : -1);
@@ -2682,29 +2691,41 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
  */
 static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
 {
-	struct ceph_mds_request *req = NULL;
+	struct ceph_mds_request *req = NULL, *nextreq;
 	struct rb_node *n;
 
 	mutex_lock(&mdsc->mutex);
 	dout("wait_unsafe_requests want %lld\n", want_tid);
+restart:
 	req = __get_oldest_req(mdsc);
 	while (req && req->r_tid <= want_tid) {
+		/* find next request */
+		n = rb_next(&req->r_node);
+		if (n)
+			nextreq = rb_entry(n, struct ceph_mds_request, r_node);
+		else
+			nextreq = NULL;
 		if ((req->r_op & CEPH_MDS_OP_WRITE)) {
 			/* write op */
 			ceph_mdsc_get_request(req);
+			if (nextreq)
+				ceph_mdsc_get_request(nextreq);
 			mutex_unlock(&mdsc->mutex);
 			dout("wait_unsafe_requests wait on %llu (want %llu)\n",
 			     req->r_tid, want_tid);
 			wait_for_completion(&req->r_safe_completion);
 			mutex_lock(&mdsc->mutex);
-			n = rb_next(&req->r_node);
 			ceph_mdsc_put_request(req);
-		} else {
-			n = rb_next(&req->r_node);
+			if (!nextreq)
+				break; /* next dne before, so we're done! */
+			if (RB_EMPTY_NODE(&nextreq->r_node)) {
+				/* next request was removed from tree */
+				ceph_mdsc_put_request(nextreq);
+				goto restart;
+			}
+			ceph_mdsc_put_request(nextreq); /* won't go away */
 		}
-		if (!n)
-			break;
-		req = rb_entry(n, struct ceph_mds_request, r_node);
+		req = nextreq;
 	}
 	mutex_unlock(&mdsc->mutex);
 	dout("wait_unsafe_requests done\n");
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 781656a49bf8..a32f0f896d9f 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -366,6 +366,14 @@ void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
 }
 
 /*
+ * return true if this connection ever successfully opened
+ */
+bool ceph_con_opened(struct ceph_connection *con)
+{
+	return con->connect_seq > 0;
+}
+
+/*
  * generic get/put
  */
 struct ceph_connection *ceph_con_get(struct ceph_connection *con)
@@ -830,13 +838,6 @@ static void prepare_read_connect(struct ceph_connection *con)
 	con->in_base_pos = 0;
 }
 
-static void prepare_read_connect_retry(struct ceph_connection *con)
-{
-	dout("prepare_read_connect_retry %p\n", con);
-	con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr)
-		+ sizeof(con->peer_addr_for_me);
-}
-
 static void prepare_read_ack(struct ceph_connection *con)
 {
 	dout("prepare_read_ack %p\n", con);
@@ -1146,7 +1147,7 @@ static int process_connect(struct ceph_connection *con)
 		}
 		con->auth_retry = 1;
 		prepare_write_connect(con->msgr, con, 0);
-		prepare_read_connect_retry(con);
+		prepare_read_connect(con);
 		break;
 
 	case CEPH_MSGR_TAG_RESETSESSION:
@@ -1843,8 +1844,6 @@ static void ceph_fault(struct ceph_connection *con)
 		goto out;
 	}
 
-	clear_bit(BUSY, &con->state); /* to avoid an improbable race */
-
 	mutex_lock(&con->mutex);
 	if (test_bit(CLOSED, &con->state))
 		goto out_unlock;
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 4caaa5911110..a343dae73cdc 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -223,6 +223,7 @@ extern void ceph_con_init(struct ceph_messenger *msgr,
 			  struct ceph_connection *con);
 extern void ceph_con_open(struct ceph_connection *con,
 			  struct ceph_entity_addr *addr);
+extern bool ceph_con_opened(struct ceph_connection *con);
 extern void ceph_con_close(struct ceph_connection *con);
 extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
 extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index dbe63db9762f..c7b4dedaace6 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -413,11 +413,22 @@ static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
 */
 static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 {
+	struct ceph_osd_request *req;
 	int ret = 0;
 
 	dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
 	if (list_empty(&osd->o_requests)) {
 		__remove_osd(osdc, osd);
+	} else if (memcmp(&osdc->osdmap->osd_addr[osd->o_osd],
+			  &osd->o_con.peer_addr,
+			  sizeof(osd->o_con.peer_addr)) == 0 &&
+		   !ceph_con_opened(&osd->o_con)) {
+		dout(" osd addr hasn't changed and connection never opened,"
+		     " letting msgr retry");
+		/* touch each r_stamp for handle_timeout()'s benfit */
+		list_for_each_entry(req, &osd->o_requests, r_osd_item)
+			req->r_stamp = jiffies;
+		ret = -EAGAIN;
 	} else {
 		ceph_con_close(&osd->o_con);
 		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
@@ -633,7 +644,7 @@ static int __send_request(struct ceph_osd_client *osdc,
 	reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
 	reqhead->reassert_version = req->r_reassert_version;
 
-	req->r_sent_stamp = jiffies;
+	req->r_stamp = jiffies;
 	list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
 
 	ceph_msg_get(req->r_request); /* send consumes a ref */
@@ -660,7 +671,7 @@ static void handle_timeout(struct work_struct *work)
 	unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
 	unsigned long keepalive =
 		osdc->client->mount_args->osd_keepalive_timeout * HZ;
-	unsigned long last_sent = 0;
+	unsigned long last_stamp = 0;
 	struct rb_node *p;
 	struct list_head slow_osds;
 
@@ -697,12 +708,12 @@ static void handle_timeout(struct work_struct *work)
 		req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
 				 r_req_lru_item);
 
-		if (time_before(jiffies, req->r_sent_stamp + timeout))
+		if (time_before(jiffies, req->r_stamp + timeout))
 			break;
 
-		BUG_ON(req == last_req && req->r_sent_stamp == last_sent);
+		BUG_ON(req == last_req && req->r_stamp == last_stamp);
 		last_req = req;
-		last_sent = req->r_sent_stamp;
+		last_stamp = req->r_stamp;
 
 		osd = req->r_osd;
 		BUG_ON(!osd);
@@ -718,7 +729,7 @@ static void handle_timeout(struct work_struct *work)
 	 */
 	INIT_LIST_HEAD(&slow_osds);
 	list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
-		if (time_before(jiffies, req->r_sent_stamp + keepalive))
+		if (time_before(jiffies, req->r_stamp + keepalive))
 			break;
 
 		osd = req->r_osd;
@@ -862,7 +873,9 @@ static int __kick_requests(struct ceph_osd_client *osdc,
 
 	dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
 	if (kickosd) {
-		__reset_osd(osdc, kickosd);
+		err = __reset_osd(osdc, kickosd);
+		if (err == -EAGAIN)
+			return 1;
 	} else {
 		for (p = rb_first(&osdc->osds); p; p = n) {
 			struct ceph_osd *osd =
@@ -913,7 +926,7 @@ static int __kick_requests(struct ceph_osd_client *osdc,
 
 kick:
 		dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
-		     req->r_osd->o_osd);
+		     req->r_osd ? req->r_osd->o_osd : -1);
 		req->r_flags |= CEPH_OSD_FLAG_RETRY;
 		err = __send_request(osdc, req);
 		if (err) {
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index 1b1a3ca43afc..b0759911e7c3 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -70,7 +70,7 @@ struct ceph_osd_request {
 
 	char              r_oid[40];          /* object name */
 	int               r_oid_len;
-	unsigned long     r_sent_stamp;
+	unsigned long     r_stamp;            /* send OR check time */
 	bool              r_resend;           /* msg send failed, needs retry */
 
 	struct ceph_file_layout r_file_layout;
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index b83f2692b835..d82fe87c2a6e 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -480,6 +480,14 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
 	return NULL;
 }
 
+void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
+{
+	ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+	calc_pg_masks(pi);
+	*p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
+	*p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
+}
+
 /*
  * decode a full map.
  */
@@ -526,12 +534,8 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 			       ev, CEPH_PG_POOL_VERSION);
 			goto bad;
 		}
-		ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+		__decode_pool(p, pi);
 		__insert_pg_pool(&map->pg_pools, pi);
-		calc_pg_masks(pi);
-		*p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
-		*p += le32_to_cpu(pi->v.num_removed_snap_intervals)
-			* sizeof(u64) * 2;
 	}
 	ceph_decode_32_safe(p, end, map->pool_max, bad);
 
@@ -714,8 +718,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 			pi->id = pool;
 			__insert_pg_pool(&map->pg_pools, pi);
 		}
-		ceph_decode_copy(p, &pi->v, sizeof(pi->v));
-		calc_pg_masks(pi);
+		__decode_pool(p, pi);
 	}
 
 	/* old_pool */
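The osdmap change above is a pure refactor: the pool decode that osdmap_decode() and osdmap_apply_incremental() used to duplicate now lives in a shared __decode_pool() helper, which copies the fixed-size pool header and then advances the cursor past the variable-length snapshot arrays it describes. Below is a rough standalone sketch of that cursor-advancing decode style, not the real kernel code; the struct layout and field names are assumptions for illustration, and endianness handling is omitted.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Assumed fixed-size header for illustration; not the real ceph_pg_pool. */
struct pool_hdr {
	uint32_t num_snaps;
	uint32_t num_removed_snap_intervals;
};

/* Copy the fixed header, then skip the variable-length arrays it describes. */
static void decode_pool(void **p, struct pool_hdr *out)
{
	memcpy(out, *p, sizeof(*out));
	*p = (char *)*p + sizeof(*out);
	*p = (char *)*p + out->num_snaps * sizeof(uint64_t);
	*p = (char *)*p + out->num_removed_snap_intervals * sizeof(uint64_t) * 2;
}

int main(void)
{
	unsigned char buf[64] = { 0 };
	struct pool_hdr hdr = { .num_snaps = 1, .num_removed_snap_intervals = 1 };
	struct pool_hdr out;
	void *p = buf;

	memcpy(buf, &hdr, sizeof(hdr)); /* fixed header at the front of the buffer */
	decode_pool(&p, &out);
	printf("consumed %ld bytes\n", (long)((char *)p - (char *)buf)); /* 32 */
	return 0;
}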
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index bf2a5f3846a4..df04e210a055 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -314,9 +314,9 @@ static int build_snap_context(struct ceph_snap_realm *realm)
 	   because we rebuild_snap_realms() works _downward_ in
 	   hierarchy after each update.) */
 	if (realm->cached_context &&
-	    realm->cached_context->seq <= realm->seq &&
+	    realm->cached_context->seq == realm->seq &&
 	    (!parent ||
-	     realm->cached_context->seq <= parent->cached_context->seq)) {
+	     realm->cached_context->seq >= parent->cached_context->seq)) {
 		dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
 		     " (unchanged)\n",
 		     realm->ino, realm, realm->cached_context,
@@ -818,7 +818,9 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 			 * queued (again) by ceph_update_snap_trace()
 			 * below. Queue it _now_, under the old context.
 			 */
+			spin_lock(&realm->inodes_with_caps_lock);
 			list_del_init(&ci->i_snap_realm_item);
+			spin_unlock(&realm->inodes_with_caps_lock);
 			spin_unlock(&inode->i_lock);
 
 			ceph_queue_cap_snap(ci,
