aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2010-04-14 21:45:31 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2010-04-14 21:45:31 -0400
commit 96e35b40c0d6206f56370f937f6f4722739eb273 (patch)
tree 2c387b6e3f628484a1f4bdc964e529f89d5f5821 /fs/ceph
parent f5c07a2d8acfc98e00d3be6298f979e5b3175953 (diff)
parent a6a5349d17f2a5c37079826f1a1474c3d08c6b53 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client:
  ceph: use separate class for ceph sockets' sk_lock
  ceph: reserve one more caps space when doing readdir
  ceph: queue_cap_snap should always queue dirty context
  ceph: fix dentry reference leak in dcache readdir
  ceph: decode v5 of osdmap (pool names) [protocol change]
  ceph: fix ack counter reset on connection reset
  ceph: fix leaked inode ref due to snap metadata writeback race
  ceph: fix snap context reference leaks
  ceph: allow writeback of snapped pages older than 'oldest' snapc
  ceph: fix dentry rehashing on virtual .snap dir
Diffstat (limited to 'fs/ceph')
-rw-r--r-- fs/ceph/addr.c 62
-rw-r--r-- fs/ceph/caps.c 42
-rw-r--r-- fs/ceph/dir.c 7
-rw-r--r-- fs/ceph/inode.c 10
-rw-r--r-- fs/ceph/messenger.c 9
-rw-r--r-- fs/ceph/osdmap.c 180
-rw-r--r-- fs/ceph/osdmap.h 1
-rw-r--r-- fs/ceph/rados.h 6
-rw-r--r-- fs/ceph/snap.c 26
-rw-r--r-- fs/ceph/super.h 3
10 files changed, 212 insertions, 134 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index aa3cd7cc3e40..412593703d1e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -337,16 +337,15 @@ out:
337/* 337/*
338 * Get ref for the oldest snapc for an inode with dirty data... that is, the 338 * Get ref for the oldest snapc for an inode with dirty data... that is, the
339 * only snap context we are allowed to write back. 339 * only snap context we are allowed to write back.
340 *
341 * Caller holds i_lock.
342 */ 340 */
343static struct ceph_snap_context *__get_oldest_context(struct inode *inode, 341static struct ceph_snap_context *get_oldest_context(struct inode *inode,
344 u64 *snap_size) 342 u64 *snap_size)
345{ 343{
346 struct ceph_inode_info *ci = ceph_inode(inode); 344 struct ceph_inode_info *ci = ceph_inode(inode);
347 struct ceph_snap_context *snapc = NULL; 345 struct ceph_snap_context *snapc = NULL;
348 struct ceph_cap_snap *capsnap = NULL; 346 struct ceph_cap_snap *capsnap = NULL;
349 347
348 spin_lock(&inode->i_lock);
350 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
351 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, 350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
352 capsnap->context, capsnap->dirty_pages); 351 capsnap->context, capsnap->dirty_pages);
@@ -357,21 +356,11 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
357 break; 356 break;
358 } 357 }
359 } 358 }
360 if (!snapc && ci->i_snap_realm) { 359 if (!snapc && ci->i_head_snapc) {
361 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); 360 snapc = ceph_get_snap_context(ci->i_head_snapc);
362 dout(" head snapc %p has %d dirty pages\n", 361 dout(" head snapc %p has %d dirty pages\n",
363 snapc, ci->i_wrbuffer_ref_head); 362 snapc, ci->i_wrbuffer_ref_head);
364 } 363 }
365 return snapc;
366}
367
368static struct ceph_snap_context *get_oldest_context(struct inode *inode,
369 u64 *snap_size)
370{
371 struct ceph_snap_context *snapc = NULL;
372
373 spin_lock(&inode->i_lock);
374 snapc = __get_oldest_context(inode, snap_size);
375 spin_unlock(&inode->i_lock); 364 spin_unlock(&inode->i_lock);
376 return snapc; 365 return snapc;
377} 366}
@@ -392,7 +381,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
392 int len = PAGE_CACHE_SIZE; 381 int len = PAGE_CACHE_SIZE;
393 loff_t i_size; 382 loff_t i_size;
394 int err = 0; 383 int err = 0;
395 struct ceph_snap_context *snapc; 384 struct ceph_snap_context *snapc, *oldest;
396 u64 snap_size = 0; 385 u64 snap_size = 0;
397 long writeback_stat; 386 long writeback_stat;
398 387
@@ -413,13 +402,16 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
413 dout("writepage %p page %p not dirty?\n", inode, page); 402 dout("writepage %p page %p not dirty?\n", inode, page);
414 goto out; 403 goto out;
415 } 404 }
416 if (snapc != get_oldest_context(inode, &snap_size)) { 405 oldest = get_oldest_context(inode, &snap_size);
406 if (snapc->seq > oldest->seq) {
417 dout("writepage %p page %p snapc %p not writeable - noop\n", 407 dout("writepage %p page %p snapc %p not writeable - noop\n",
418 inode, page, (void *)page->private); 408 inode, page, (void *)page->private);
419 /* we should only noop if called by kswapd */ 409 /* we should only noop if called by kswapd */
420 WARN_ON((current->flags & PF_MEMALLOC) == 0); 410 WARN_ON((current->flags & PF_MEMALLOC) == 0);
411 ceph_put_snap_context(oldest);
421 goto out; 412 goto out;
422 } 413 }
414 ceph_put_snap_context(oldest);
423 415
424 /* is this a partial page at end of file? */ 416 /* is this a partial page at end of file? */
425 if (snap_size) 417 if (snap_size)
@@ -458,7 +450,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
458 ClearPagePrivate(page); 450 ClearPagePrivate(page);
459 end_page_writeback(page); 451 end_page_writeback(page);
460 ceph_put_wrbuffer_cap_refs(ci, 1, snapc); 452 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
461 ceph_put_snap_context(snapc); 453 ceph_put_snap_context(snapc); /* page's reference */
462out: 454out:
463 return err; 455 return err;
464} 456}
@@ -558,9 +550,9 @@ static void writepages_finish(struct ceph_osd_request *req,
558 dout("inode %p skipping page %p\n", inode, page); 550 dout("inode %p skipping page %p\n", inode, page);
559 wbc->pages_skipped++; 551 wbc->pages_skipped++;
560 } 552 }
553 ceph_put_snap_context((void *)page->private);
561 page->private = 0; 554 page->private = 0;
562 ClearPagePrivate(page); 555 ClearPagePrivate(page);
563 ceph_put_snap_context(snapc);
564 dout("unlocking %d %p\n", i, page); 556 dout("unlocking %d %p\n", i, page);
565 end_page_writeback(page); 557 end_page_writeback(page);
566 558
@@ -618,7 +610,7 @@ static int ceph_writepages_start(struct address_space *mapping,
618 int range_whole = 0; 610 int range_whole = 0;
619 int should_loop = 1; 611 int should_loop = 1;
620 pgoff_t max_pages = 0, max_pages_ever = 0; 612 pgoff_t max_pages = 0, max_pages_ever = 0;
621 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL; 613 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
622 struct pagevec pvec; 614 struct pagevec pvec;
623 int done = 0; 615 int done = 0;
624 int rc = 0; 616 int rc = 0;
@@ -770,9 +762,10 @@ get_more_pages:
770 } 762 }
771 763
772 /* only if matching snap context */ 764 /* only if matching snap context */
773 if (snapc != (void *)page->private) { 765 pgsnapc = (void *)page->private;
774 dout("page snapc %p != oldest %p\n", 766 if (pgsnapc->seq > snapc->seq) {
775 (void *)page->private, snapc); 767 dout("page snapc %p %lld > oldest %p %lld\n",
768 pgsnapc, pgsnapc->seq, snapc, snapc->seq);
776 unlock_page(page); 769 unlock_page(page);
777 if (!locked_pages) 770 if (!locked_pages)
778 continue; /* keep looking for snap */ 771 continue; /* keep looking for snap */
@@ -914,7 +907,10 @@ static int context_is_writeable_or_written(struct inode *inode,
914 struct ceph_snap_context *snapc) 907 struct ceph_snap_context *snapc)
915{ 908{
916 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); 909 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
917 return !oldest || snapc->seq <= oldest->seq; 910 int ret = !oldest || snapc->seq <= oldest->seq;
911
912 ceph_put_snap_context(oldest);
913 return ret;
918} 914}
919 915
920/* 916/*
@@ -936,8 +932,8 @@ static int ceph_update_writeable_page(struct file *file,
936 int pos_in_page = pos & ~PAGE_CACHE_MASK; 932 int pos_in_page = pos & ~PAGE_CACHE_MASK;
937 int end_in_page = pos_in_page + len; 933 int end_in_page = pos_in_page + len;
938 loff_t i_size; 934 loff_t i_size;
939 struct ceph_snap_context *snapc;
940 int r; 935 int r;
936 struct ceph_snap_context *snapc, *oldest;
941 937
942retry_locked: 938retry_locked:
943 /* writepages currently holds page lock, but if we change that later, */ 939 /* writepages currently holds page lock, but if we change that later, */
@@ -947,23 +943,24 @@ retry_locked:
947 BUG_ON(!ci->i_snap_realm); 943 BUG_ON(!ci->i_snap_realm);
948 down_read(&mdsc->snap_rwsem); 944 down_read(&mdsc->snap_rwsem);
949 BUG_ON(!ci->i_snap_realm->cached_context); 945 BUG_ON(!ci->i_snap_realm->cached_context);
950 if (page->private && 946 snapc = (void *)page->private;
951 (void *)page->private != ci->i_snap_realm->cached_context) { 947 if (snapc && snapc != ci->i_head_snapc) {
952 /* 948 /*
953 * this page is already dirty in another (older) snap 949 * this page is already dirty in another (older) snap
954 * context! is it writeable now? 950 * context! is it writeable now?
955 */ 951 */
956 snapc = get_oldest_context(inode, NULL); 952 oldest = get_oldest_context(inode, NULL);
957 up_read(&mdsc->snap_rwsem); 953 up_read(&mdsc->snap_rwsem);
958 954
959 if (snapc != (void *)page->private) { 955 if (snapc->seq > oldest->seq) {
956 ceph_put_snap_context(oldest);
960 dout(" page %p snapc %p not current or oldest\n", 957 dout(" page %p snapc %p not current or oldest\n",
961 page, (void *)page->private); 958 page, snapc);
962 /* 959 /*
963 * queue for writeback, and wait for snapc to 960 * queue for writeback, and wait for snapc to
964 * be writeable or written 961 * be writeable or written
965 */ 962 */
966 snapc = ceph_get_snap_context((void *)page->private); 963 snapc = ceph_get_snap_context(snapc);
967 unlock_page(page); 964 unlock_page(page);
968 ceph_queue_writeback(inode); 965 ceph_queue_writeback(inode);
969 r = wait_event_interruptible(ci->i_cap_wq, 966 r = wait_event_interruptible(ci->i_cap_wq,
@@ -973,6 +970,7 @@ retry_locked:
973 return r; 970 return r;
974 return -EAGAIN; 971 return -EAGAIN;
975 } 972 }
973 ceph_put_snap_context(oldest);
976 974
977 /* yay, writeable, do it now (without dropping page lock) */ 975 /* yay, writeable, do it now (without dropping page lock) */
978 dout(" page %p snapc %p not current, but oldest\n", 976 dout(" page %p snapc %p not current, but oldest\n",
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 3710e077a857..aa2239fa9a3b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1205,6 +1205,12 @@ retry:
1205 if (capsnap->dirty_pages || capsnap->writing) 1205 if (capsnap->dirty_pages || capsnap->writing)
1206 continue; 1206 continue;
1207 1207
1208 /*
1209 * if cap writeback already occurred, we should have dropped
1210 * the capsnap in ceph_put_wrbuffer_cap_refs.
1211 */
1212 BUG_ON(capsnap->dirty == 0);
1213
1208 /* pick mds, take s_mutex */ 1214 /* pick mds, take s_mutex */
1209 mds = __ceph_get_cap_mds(ci, &mseq); 1215 mds = __ceph_get_cap_mds(ci, &mseq);
1210 if (session && session->s_mds != mds) { 1216 if (session && session->s_mds != mds) {
@@ -2118,8 +2124,8 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2118 } 2124 }
2119 spin_unlock(&inode->i_lock); 2125 spin_unlock(&inode->i_lock);
2120 2126
2121 dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had), 2127 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
2122 last ? "last" : ""); 2128 last ? " last" : "", put ? " put" : "");
2123 2129
2124 if (last && !flushsnaps) 2130 if (last && !flushsnaps)
2125 ceph_check_caps(ci, 0, NULL); 2131 ceph_check_caps(ci, 0, NULL);
@@ -2143,7 +2149,8 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2143{ 2149{
2144 struct inode *inode = &ci->vfs_inode; 2150 struct inode *inode = &ci->vfs_inode;
2145 int last = 0; 2151 int last = 0;
2146 int last_snap = 0; 2152 int complete_capsnap = 0;
2153 int drop_capsnap = 0;
2147 int found = 0; 2154 int found = 0;
2148 struct ceph_cap_snap *capsnap = NULL; 2155 struct ceph_cap_snap *capsnap = NULL;
2149 2156
@@ -2166,19 +2173,32 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2166 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 2173 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2167 if (capsnap->context == snapc) { 2174 if (capsnap->context == snapc) {
2168 found = 1; 2175 found = 1;
2169 capsnap->dirty_pages -= nr;
2170 last_snap = !capsnap->dirty_pages;
2171 break; 2176 break;
2172 } 2177 }
2173 } 2178 }
2174 BUG_ON(!found); 2179 BUG_ON(!found);
2180 capsnap->dirty_pages -= nr;
2181 if (capsnap->dirty_pages == 0) {
2182 complete_capsnap = 1;
2183 if (capsnap->dirty == 0)
2184 /* cap writeback completed before we created
2185 * the cap_snap; no FLUSHSNAP is needed */
2186 drop_capsnap = 1;
2187 }
2175 dout("put_wrbuffer_cap_refs on %p cap_snap %p " 2188 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2176 " snap %lld %d/%d -> %d/%d %s%s\n", 2189 " snap %lld %d/%d -> %d/%d %s%s%s\n",
2177 inode, capsnap, capsnap->context->seq, 2190 inode, capsnap, capsnap->context->seq,
2178 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, 2191 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2179 ci->i_wrbuffer_ref, capsnap->dirty_pages, 2192 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2180 last ? " (wrbuffer last)" : "", 2193 last ? " (wrbuffer last)" : "",
2181 last_snap ? " (capsnap last)" : ""); 2194 complete_capsnap ? " (complete capsnap)" : "",
2195 drop_capsnap ? " (drop capsnap)" : "");
2196 if (drop_capsnap) {
2197 ceph_put_snap_context(capsnap->context);
2198 list_del(&capsnap->ci_item);
2199 list_del(&capsnap->flushing_item);
2200 ceph_put_cap_snap(capsnap);
2201 }
2182 } 2202 }
2183 2203
2184 spin_unlock(&inode->i_lock); 2204 spin_unlock(&inode->i_lock);
@@ -2186,10 +2206,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2186 if (last) { 2206 if (last) {
2187 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2207 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2188 iput(inode); 2208 iput(inode);
2189 } else if (last_snap) { 2209 } else if (complete_capsnap) {
2190 ceph_flush_snaps(ci); 2210 ceph_flush_snaps(ci);
2191 wake_up(&ci->i_cap_wq); 2211 wake_up(&ci->i_cap_wq);
2192 } 2212 }
2213 if (drop_capsnap)
2214 iput(inode);
2193} 2215}
2194 2216
2195/* 2217/*
@@ -2465,8 +2487,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2465 break; 2487 break;
2466 } 2488 }
2467 WARN_ON(capsnap->dirty_pages || capsnap->writing); 2489 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2468 dout(" removing cap_snap %p follows %lld\n", 2490 dout(" removing %p cap_snap %p follows %lld\n",
2469 capsnap, follows); 2491 inode, capsnap, follows);
2470 ceph_put_snap_context(capsnap->context); 2492 ceph_put_snap_context(capsnap->context);
2471 list_del(&capsnap->ci_item); 2493 list_del(&capsnap->ci_item);
2472 list_del(&capsnap->flushing_item); 2494 list_del(&capsnap->flushing_item);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 7261dc6c2ead..ea8ee2e526aa 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -171,11 +171,11 @@ more:
171 spin_lock(&inode->i_lock); 171 spin_lock(&inode->i_lock);
172 spin_lock(&dcache_lock); 172 spin_lock(&dcache_lock);
173 173
174 last = dentry;
175
174 if (err < 0) 176 if (err < 0)
175 goto out_unlock; 177 goto out_unlock;
176 178
177 last = dentry;
178
179 p = p->prev; 179 p = p->prev;
180 filp->f_pos++; 180 filp->f_pos++;
181 181
@@ -312,7 +312,7 @@ more:
312 req->r_readdir_offset = fi->next_offset; 312 req->r_readdir_offset = fi->next_offset;
313 req->r_args.readdir.frag = cpu_to_le32(frag); 313 req->r_args.readdir.frag = cpu_to_le32(frag);
314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries); 314 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
315 req->r_num_caps = max_entries; 315 req->r_num_caps = max_entries + 1;
316 err = ceph_mdsc_do_request(mdsc, NULL, req); 316 err = ceph_mdsc_do_request(mdsc, NULL, req);
317 if (err < 0) { 317 if (err < 0) {
318 ceph_mdsc_put_request(req); 318 ceph_mdsc_put_request(req);
@@ -489,6 +489,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
489 struct inode *inode = ceph_get_snapdir(parent); 489 struct inode *inode = ceph_get_snapdir(parent);
490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", 490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
491 dentry, dentry->d_name.len, dentry->d_name.name, inode); 491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
492 BUG_ON(!d_unhashed(dentry));
492 d_add(dentry, inode); 493 d_add(dentry, inode);
493 err = 0; 494 err = 0;
494 } 495 }
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index aca82d55cc53..26f883c275e8 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -886,6 +886,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
886 struct inode *in = NULL; 886 struct inode *in = NULL;
887 struct ceph_mds_reply_inode *ininfo; 887 struct ceph_mds_reply_inode *ininfo;
888 struct ceph_vino vino; 888 struct ceph_vino vino;
889 struct ceph_client *client = ceph_sb_to_client(sb);
889 int i = 0; 890 int i = 0;
890 int err = 0; 891 int err = 0;
891 892
@@ -949,7 +950,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
949 return err; 950 return err;
950 } 951 }
951 952
952 if (rinfo->head->is_dentry && !req->r_aborted) { 953 /*
954 * ignore null lease/binding on snapdir ENOENT, or else we
955 * will have trouble splicing in the virtual snapdir later
956 */
957 if (rinfo->head->is_dentry && !req->r_aborted &&
958 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
959 client->mount_args->snapdir_name,
960 req->r_dentry->d_name.len))) {
953 /* 961 /*
954 * lookup link rename : null -> possibly existing inode 962 * lookup link rename : null -> possibly existing inode
955 * mknod symlink mkdir : null -> new inode 963 * mknod symlink mkdir : null -> new inode
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 8f1715ffbe4b..cdaaa131add3 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -30,6 +30,10 @@ static char tag_msg = CEPH_MSGR_TAG_MSG;
30static char tag_ack = CEPH_MSGR_TAG_ACK; 30static char tag_ack = CEPH_MSGR_TAG_ACK;
31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; 31static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
32 32
33#ifdef CONFIG_LOCKDEP
34static struct lock_class_key socket_class;
35#endif
36
33 37
34static void queue_con(struct ceph_connection *con); 38static void queue_con(struct ceph_connection *con);
35static void con_work(struct work_struct *); 39static void con_work(struct work_struct *);
@@ -228,6 +232,10 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
228 con->sock = sock; 232 con->sock = sock;
229 sock->sk->sk_allocation = GFP_NOFS; 233 sock->sk->sk_allocation = GFP_NOFS;
230 234
235#ifdef CONFIG_LOCKDEP
236 lockdep_set_class(&sock->sk->sk_lock, &socket_class);
237#endif
238
231 set_sock_callbacks(sock, con); 239 set_sock_callbacks(sock, con);
232 240
233 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); 241 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
@@ -333,6 +341,7 @@ static void reset_connection(struct ceph_connection *con)
333 con->out_msg = NULL; 341 con->out_msg = NULL;
334 } 342 }
335 con->in_seq = 0; 343 con->in_seq = 0;
344 con->in_seq_acked = 0;
336} 345}
337 346
338/* 347/*
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 21c6623c4b07..2e2c15eed82a 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -314,71 +314,6 @@ bad:
314 return ERR_PTR(err); 314 return ERR_PTR(err);
315} 315}
316 316
317
318/*
319 * osd map
320 */
321void ceph_osdmap_destroy(struct ceph_osdmap *map)
322{
323 dout("osdmap_destroy %p\n", map);
324 if (map->crush)
325 crush_destroy(map->crush);
326 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
327 struct ceph_pg_mapping *pg =
328 rb_entry(rb_first(&map->pg_temp),
329 struct ceph_pg_mapping, node);
330 rb_erase(&pg->node, &map->pg_temp);
331 kfree(pg);
332 }
333 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
334 struct ceph_pg_pool_info *pi =
335 rb_entry(rb_first(&map->pg_pools),
336 struct ceph_pg_pool_info, node);
337 rb_erase(&pi->node, &map->pg_pools);
338 kfree(pi);
339 }
340 kfree(map->osd_state);
341 kfree(map->osd_weight);
342 kfree(map->osd_addr);
343 kfree(map);
344}
345
346/*
347 * adjust max osd value. reallocate arrays.
348 */
349static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
350{
351 u8 *state;
352 struct ceph_entity_addr *addr;
353 u32 *weight;
354
355 state = kcalloc(max, sizeof(*state), GFP_NOFS);
356 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
357 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
358 if (state == NULL || addr == NULL || weight == NULL) {
359 kfree(state);
360 kfree(addr);
361 kfree(weight);
362 return -ENOMEM;
363 }
364
365 /* copy old? */
366 if (map->osd_state) {
367 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
368 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
369 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
370 kfree(map->osd_state);
371 kfree(map->osd_addr);
372 kfree(map->osd_weight);
373 }
374
375 map->osd_state = state;
376 map->osd_weight = weight;
377 map->osd_addr = addr;
378 map->max_osd = max;
379 return 0;
380}
381
382/* 317/*
383 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid 318 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
384 * to a set of osds) 319 * to a set of osds)
@@ -482,6 +417,13 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
482 return NULL; 417 return NULL;
483} 418}
484 419
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{
422 rb_erase(&pi->node, root);
423 kfree(pi->name);
424 kfree(pi);
425}
426
485void __decode_pool(void **p, struct ceph_pg_pool_info *pi) 427void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
486{ 428{
487 ceph_decode_copy(p, &pi->v, sizeof(pi->v)); 429 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
@@ -490,6 +432,98 @@ void __decode_pool(void **p, struct ceph_pg_pool_info *pi)
490 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; 432 *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2;
491} 433}
492 434
435static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
436{
437 struct ceph_pg_pool_info *pi;
438 u32 num, len, pool;
439
440 ceph_decode_32_safe(p, end, num, bad);
441 dout(" %d pool names\n", num);
442 while (num--) {
443 ceph_decode_32_safe(p, end, pool, bad);
444 ceph_decode_32_safe(p, end, len, bad);
445 dout(" pool %d len %d\n", pool, len);
446 pi = __lookup_pg_pool(&map->pg_pools, pool);
447 if (pi) {
448 kfree(pi->name);
449 pi->name = kmalloc(len + 1, GFP_NOFS);
450 if (pi->name) {
451 memcpy(pi->name, *p, len);
452 pi->name[len] = '\0';
453 dout(" name is %s\n", pi->name);
454 }
455 }
456 *p += len;
457 }
458 return 0;
459
460bad:
461 return -EINVAL;
462}
463
464/*
465 * osd map
466 */
467void ceph_osdmap_destroy(struct ceph_osdmap *map)
468{
469 dout("osdmap_destroy %p\n", map);
470 if (map->crush)
471 crush_destroy(map->crush);
472 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
473 struct ceph_pg_mapping *pg =
474 rb_entry(rb_first(&map->pg_temp),
475 struct ceph_pg_mapping, node);
476 rb_erase(&pg->node, &map->pg_temp);
477 kfree(pg);
478 }
479 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
480 struct ceph_pg_pool_info *pi =
481 rb_entry(rb_first(&map->pg_pools),
482 struct ceph_pg_pool_info, node);
483 __remove_pg_pool(&map->pg_pools, pi);
484 }
485 kfree(map->osd_state);
486 kfree(map->osd_weight);
487 kfree(map->osd_addr);
488 kfree(map);
489}
490
491/*
492 * adjust max osd value. reallocate arrays.
493 */
494static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
495{
496 u8 *state;
497 struct ceph_entity_addr *addr;
498 u32 *weight;
499
500 state = kcalloc(max, sizeof(*state), GFP_NOFS);
501 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
502 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
503 if (state == NULL || addr == NULL || weight == NULL) {
504 kfree(state);
505 kfree(addr);
506 kfree(weight);
507 return -ENOMEM;
508 }
509
510 /* copy old? */
511 if (map->osd_state) {
512 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
513 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
514 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
515 kfree(map->osd_state);
516 kfree(map->osd_addr);
517 kfree(map->osd_weight);
518 }
519
520 map->osd_state = state;
521 map->osd_weight = weight;
522 map->osd_addr = addr;
523 map->max_osd = max;
524 return 0;
525}
526
493/* 527/*
494 * decode a full map. 528 * decode a full map.
495 */ 529 */
@@ -526,7 +560,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
526 ceph_decode_32_safe(p, end, max, bad); 560 ceph_decode_32_safe(p, end, max, bad);
527 while (max--) { 561 while (max--) {
528 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); 562 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
529 pi = kmalloc(sizeof(*pi), GFP_NOFS); 563 pi = kzalloc(sizeof(*pi), GFP_NOFS);
530 if (!pi) 564 if (!pi)
531 goto bad; 565 goto bad;
532 pi->id = ceph_decode_32(p); 566 pi->id = ceph_decode_32(p);
@@ -539,6 +573,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
539 __decode_pool(p, pi); 573 __decode_pool(p, pi);
540 __insert_pg_pool(&map->pg_pools, pi); 574 __insert_pg_pool(&map->pg_pools, pi);
541 } 575 }
576
577 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
578 goto bad;
579
542 ceph_decode_32_safe(p, end, map->pool_max, bad); 580 ceph_decode_32_safe(p, end, map->pool_max, bad);
543 581
544 ceph_decode_32_safe(p, end, map->flags, bad); 582 ceph_decode_32_safe(p, end, map->flags, bad);
@@ -712,7 +750,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
712 } 750 }
713 pi = __lookup_pg_pool(&map->pg_pools, pool); 751 pi = __lookup_pg_pool(&map->pg_pools, pool);
714 if (!pi) { 752 if (!pi) {
715 pi = kmalloc(sizeof(*pi), GFP_NOFS); 753 pi = kzalloc(sizeof(*pi), GFP_NOFS);
716 if (!pi) { 754 if (!pi) {
717 err = -ENOMEM; 755 err = -ENOMEM;
718 goto bad; 756 goto bad;
@@ -722,6 +760,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
722 } 760 }
723 __decode_pool(p, pi); 761 __decode_pool(p, pi);
724 } 762 }
763 if (version >= 5 && __decode_pool_names(p, end, map) < 0)
764 goto bad;
725 765
726 /* old_pool */ 766 /* old_pool */
727 ceph_decode_32_safe(p, end, len, bad); 767 ceph_decode_32_safe(p, end, len, bad);
@@ -730,10 +770,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
730 770
731 ceph_decode_32_safe(p, end, pool, bad); 771 ceph_decode_32_safe(p, end, pool, bad);
732 pi = __lookup_pg_pool(&map->pg_pools, pool); 772 pi = __lookup_pg_pool(&map->pg_pools, pool);
733 if (pi) { 773 if (pi)
734 rb_erase(&pi->node, &map->pg_pools); 774 __remove_pg_pool(&map->pg_pools, pi);
735 kfree(pi);
736 }
737 } 775 }
738 776
739 /* new_up */ 777 /* new_up */
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
index 1fb55afb2642..8bc9f1e4f562 100644
--- a/fs/ceph/osdmap.h
+++ b/fs/ceph/osdmap.h
@@ -23,6 +23,7 @@ struct ceph_pg_pool_info {
23 int id; 23 int id;
24 struct ceph_pg_pool v; 24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; 25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26 char *name;
26}; 27};
27 28
28struct ceph_pg_mapping { 29struct ceph_pg_mapping {
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index 26ac8b89a676..a1fc1d017b58 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -11,8 +11,10 @@
11/* 11/*
12 * osdmap encoding versions 12 * osdmap encoding versions
13 */ 13 */
14#define CEPH_OSDMAP_INC_VERSION 4 14#define CEPH_OSDMAP_INC_VERSION 5
15#define CEPH_OSDMAP_VERSION 4 15#define CEPH_OSDMAP_INC_VERSION_EXT 5
16#define CEPH_OSDMAP_VERSION 5
17#define CEPH_OSDMAP_VERSION_EXT 5
16 18
17/* 19/*
18 * fs id 20 * fs id
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index e6f9bc57d472..2b881262ef67 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -431,8 +431,7 @@ static int dup_array(u64 **dst, __le64 *src, int num)
431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't 431 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
432 * change). 432 * change).
433 */ 433 */
434void ceph_queue_cap_snap(struct ceph_inode_info *ci, 434void ceph_queue_cap_snap(struct ceph_inode_info *ci)
435 struct ceph_snap_context *snapc)
436{ 435{
437 struct inode *inode = &ci->vfs_inode; 436 struct inode *inode = &ci->vfs_inode;
438 struct ceph_cap_snap *capsnap; 437 struct ceph_cap_snap *capsnap;
@@ -451,10 +450,11 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
451 as no new writes are allowed to start when pending, so any 450 as no new writes are allowed to start when pending, so any
452 writes in progress now were started before the previous 451 writes in progress now were started before the previous
453 cap_snap. lucky us. */ 452 cap_snap. lucky us. */
454 dout("queue_cap_snap %p snapc %p seq %llu used %d" 453 dout("queue_cap_snap %p already pending\n", inode);
455 " already pending\n", inode, snapc, snapc->seq, used);
456 kfree(capsnap); 454 kfree(capsnap);
457 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { 455 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
456 struct ceph_snap_context *snapc = ci->i_head_snapc;
457
458 igrab(inode); 458 igrab(inode);
459 459
460 atomic_set(&capsnap->nref, 1); 460 atomic_set(&capsnap->nref, 1);
@@ -463,7 +463,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
463 INIT_LIST_HEAD(&capsnap->flushing_item); 463 INIT_LIST_HEAD(&capsnap->flushing_item);
464 464
465 capsnap->follows = snapc->seq - 1; 465 capsnap->follows = snapc->seq - 1;
466 capsnap->context = ceph_get_snap_context(snapc);
467 capsnap->issued = __ceph_caps_issued(ci, NULL); 466 capsnap->issued = __ceph_caps_issued(ci, NULL);
468 capsnap->dirty = __ceph_caps_dirty(ci); 467 capsnap->dirty = __ceph_caps_dirty(ci);
469 468
@@ -480,7 +479,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci,
480 snapshot. */ 479 snapshot. */
481 capsnap->dirty_pages = ci->i_wrbuffer_ref_head; 480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
482 ci->i_wrbuffer_ref_head = 0; 481 ci->i_wrbuffer_ref_head = 0;
483 ceph_put_snap_context(ci->i_head_snapc); 482 capsnap->context = snapc;
484 ci->i_head_snapc = NULL; 483 ci->i_head_snapc = NULL;
485 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); 484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
486 485
@@ -522,15 +521,17 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
522 capsnap->ctime = inode->i_ctime; 521 capsnap->ctime = inode->i_ctime;
523 capsnap->time_warp_seq = ci->i_time_warp_seq; 522 capsnap->time_warp_seq = ci->i_time_warp_seq;
524 if (capsnap->dirty_pages) { 523 if (capsnap->dirty_pages) {
525 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu " 524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
526 "still has %d dirty pages\n", inode, capsnap, 525 "still has %d dirty pages\n", inode, capsnap,
527 capsnap->context, capsnap->context->seq, 526 capsnap->context, capsnap->context->seq,
528 capsnap->size, capsnap->dirty_pages); 527 ceph_cap_string(capsnap->dirty), capsnap->size,
528 capsnap->dirty_pages);
529 return 0; 529 return 0;
530 } 530 }
531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n", 531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
532 inode, capsnap, capsnap->context, 532 inode, capsnap, capsnap->context,
533 capsnap->context->seq, capsnap->size); 533 capsnap->context->seq, ceph_cap_string(capsnap->dirty),
534 capsnap->size);
534 535
535 spin_lock(&mdsc->snap_flush_lock); 536 spin_lock(&mdsc->snap_flush_lock);
536 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); 537 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
@@ -602,7 +603,7 @@ more:
602 if (lastinode) 603 if (lastinode)
603 iput(lastinode); 604 iput(lastinode);
604 lastinode = inode; 605 lastinode = inode;
605 ceph_queue_cap_snap(ci, realm->cached_context); 606 ceph_queue_cap_snap(ci);
606 spin_lock(&realm->inodes_with_caps_lock); 607 spin_lock(&realm->inodes_with_caps_lock);
607 } 608 }
608 spin_unlock(&realm->inodes_with_caps_lock); 609 spin_unlock(&realm->inodes_with_caps_lock);
@@ -824,8 +825,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
824 spin_unlock(&realm->inodes_with_caps_lock); 825 spin_unlock(&realm->inodes_with_caps_lock);
825 spin_unlock(&inode->i_lock); 826 spin_unlock(&inode->i_lock);
826 827
827 ceph_queue_cap_snap(ci, 828 ceph_queue_cap_snap(ci);
828 ci->i_snap_realm->cached_context);
829 829
830 iput(inode); 830 iput(inode);
831 continue; 831 continue;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index ca702c67bc66..e30dfbb056c3 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -715,8 +715,7 @@ extern int ceph_update_snap_trace(struct ceph_mds_client *m,
715extern void ceph_handle_snap(struct ceph_mds_client *mdsc, 715extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
716 struct ceph_mds_session *session, 716 struct ceph_mds_session *session,
717 struct ceph_msg *msg); 717 struct ceph_msg *msg);
718extern void ceph_queue_cap_snap(struct ceph_inode_info *ci, 718extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
719 struct ceph_snap_context *snapc);
720extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 719extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
721 struct ceph_cap_snap *capsnap); 720 struct ceph_cap_snap *capsnap);
722extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); 721extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);