aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Sage Weil <sage@newdream.net> 2010-04-01 12:33:46 -0400
committer Sage Weil <sage@newdream.net> 2010-04-01 12:34:38 -0400
commit 819ccbfa448403992ceafc05d6d7097aaa74d4c3 (patch)
tree e6b3592fd85fbb35a63d6d10c84a8d01913eddab
parent 6298a33757ba7361bb8f506c106daad77e5ac8cf (diff)
ceph: fix leaked inode ref due to snap metadata writeback race
We create a ceph_cap_snap if there is dirty cap metadata (for writeback to mds) OR dirty pages (for writeback to osd). It is thus possible that the metadata has been written back to the MDS but the OSD data has not when the cap_snap is created. This results in a cap_snap with dirty(caps) == 0. The problem is that, in this case, cap writeback to the MDS isn't necessary, and a FLUSHSNAP cap op gets no ack from the MDS. This leaves the cap_snap attached to the inode along with its inode reference.

Fix the problem by dropping the cap_snap if it becomes 'complete' (all pages written out) and dirty(caps) == 0 in ceph_put_wrbuffer_cap_refs(). Also, BUG() in __ceph_flush_snaps() if we encounter a cap_snap with dirty(caps) == 0.

Signed-off-by: Sage Weil <sage@newdream.net>
-rw-r--r-- fs/ceph/caps.c 42
-rw-r--r-- fs/ceph/snap.c 10
2 files changed, 38 insertions, 14 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7d0a0d0adc18..b6fdf010749b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1204,6 +1204,12 @@ retry:
1204 if (capsnap->dirty_pages || capsnap->writing) 1204 if (capsnap->dirty_pages || capsnap->writing)
1205 continue; 1205 continue;
1206 1206
1207 /*
1208 * if cap writeback already occurred, we should have dropped
1209 * the capsnap in ceph_put_wrbuffer_cap_refs.
1210 */
1211 BUG_ON(capsnap->dirty == 0);
1212
1207 /* pick mds, take s_mutex */ 1213 /* pick mds, take s_mutex */
1208 mds = __ceph_get_cap_mds(ci, &mseq); 1214 mds = __ceph_get_cap_mds(ci, &mseq);
1209 if (session && session->s_mds != mds) { 1215 if (session && session->s_mds != mds) {
@@ -2117,8 +2123,8 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2117 } 2123 }
2118 spin_unlock(&inode->i_lock); 2124 spin_unlock(&inode->i_lock);
2119 2125
2120 dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had), 2126 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
2121 last ? "last" : ""); 2127 last ? " last" : "", put ? " put" : "");
2122 2128
2123 if (last && !flushsnaps) 2129 if (last && !flushsnaps)
2124 ceph_check_caps(ci, 0, NULL); 2130 ceph_check_caps(ci, 0, NULL);
@@ -2142,7 +2148,8 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2142{ 2148{
2143 struct inode *inode = &ci->vfs_inode; 2149 struct inode *inode = &ci->vfs_inode;
2144 int last = 0; 2150 int last = 0;
2145 int last_snap = 0; 2151 int complete_capsnap = 0;
2152 int drop_capsnap = 0;
2146 int found = 0; 2153 int found = 0;
2147 struct ceph_cap_snap *capsnap = NULL; 2154 struct ceph_cap_snap *capsnap = NULL;
2148 2155
@@ -2165,19 +2172,32 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2165 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { 2172 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2166 if (capsnap->context == snapc) { 2173 if (capsnap->context == snapc) {
2167 found = 1; 2174 found = 1;
2168 capsnap->dirty_pages -= nr;
2169 last_snap = !capsnap->dirty_pages;
2170 break; 2175 break;
2171 } 2176 }
2172 } 2177 }
2173 BUG_ON(!found); 2178 BUG_ON(!found);
2179 capsnap->dirty_pages -= nr;
2180 if (capsnap->dirty_pages == 0) {
2181 complete_capsnap = 1;
2182 if (capsnap->dirty == 0)
2183 /* cap writeback completed before we created
2184 * the cap_snap; no FLUSHSNAP is needed */
2185 drop_capsnap = 1;
2186 }
2174 dout("put_wrbuffer_cap_refs on %p cap_snap %p " 2187 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2175 " snap %lld %d/%d -> %d/%d %s%s\n", 2188 " snap %lld %d/%d -> %d/%d %s%s%s\n",
2176 inode, capsnap, capsnap->context->seq, 2189 inode, capsnap, capsnap->context->seq,
2177 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, 2190 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2178 ci->i_wrbuffer_ref, capsnap->dirty_pages, 2191 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2179 last ? " (wrbuffer last)" : "", 2192 last ? " (wrbuffer last)" : "",
2180 last_snap ? " (capsnap last)" : ""); 2193 complete_capsnap ? " (complete capsnap)" : "",
2194 drop_capsnap ? " (drop capsnap)" : "");
2195 if (drop_capsnap) {
2196 ceph_put_snap_context(capsnap->context);
2197 list_del(&capsnap->ci_item);
2198 list_del(&capsnap->flushing_item);
2199 ceph_put_cap_snap(capsnap);
2200 }
2181 } 2201 }
2182 2202
2183 spin_unlock(&inode->i_lock); 2203 spin_unlock(&inode->i_lock);
@@ -2185,10 +2205,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2185 if (last) { 2205 if (last) {
2186 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2206 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2187 iput(inode); 2207 iput(inode);
2188 } else if (last_snap) { 2208 } else if (complete_capsnap) {
2189 ceph_flush_snaps(ci); 2209 ceph_flush_snaps(ci);
2190 wake_up(&ci->i_cap_wq); 2210 wake_up(&ci->i_cap_wq);
2191 } 2211 }
2212 if (drop_capsnap)
2213 iput(inode);
2192} 2214}
2193 2215
2194/* 2216/*
@@ -2464,8 +2486,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2464 break; 2486 break;
2465 } 2487 }
2466 WARN_ON(capsnap->dirty_pages || capsnap->writing); 2488 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2467 dout(" removing cap_snap %p follows %lld\n", 2489 dout(" removing %p cap_snap %p follows %lld\n",
2468 capsnap, follows); 2490 inode, capsnap, follows);
2469 ceph_put_snap_context(capsnap->context); 2491 ceph_put_snap_context(capsnap->context);
2470 list_del(&capsnap->ci_item); 2492 list_del(&capsnap->ci_item);
2471 list_del(&capsnap->flushing_item); 2493 list_del(&capsnap->flushing_item);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index df04e210a055..7e3e5f9edaa4 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -521,15 +521,17 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
521 capsnap->ctime = inode->i_ctime; 521 capsnap->ctime = inode->i_ctime;
522 capsnap->time_warp_seq = ci->i_time_warp_seq; 522 capsnap->time_warp_seq = ci->i_time_warp_seq;
523 if (capsnap->dirty_pages) { 523 if (capsnap->dirty_pages) {
524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu " 524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu "
525 "still has %d dirty pages\n", inode, capsnap, 525 "still has %d dirty pages\n", inode, capsnap,
526 capsnap->context, capsnap->context->seq, 526 capsnap->context, capsnap->context->seq,
527 capsnap->size, capsnap->dirty_pages); 527 ceph_cap_string(capsnap->dirty), capsnap->size,
528 capsnap->dirty_pages);
528 return 0; 529 return 0;
529 } 530 }
530 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n", 531 dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
531 inode, capsnap, capsnap->context, 532 inode, capsnap, capsnap->context,
532 capsnap->context->seq, capsnap->size); 533 capsnap->context->seq, ceph_cap_string(capsnap->dirty),
534 capsnap->size);
533 535
534 spin_lock(&mdsc->snap_flush_lock); 536 spin_lock(&mdsc->snap_flush_lock);
535 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); 537 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);