summaryrefslogtreecommitdiffstats
path: root/fs/ceph
diff options
context:
space:
mode:
authorYan, Zheng <zyan@redhat.com>2019-05-18 08:39:55 -0400
committerIlya Dryomov <idryomov@gmail.com>2019-06-05 14:34:39 -0400
commit3e1d0452edceebb903d23db53201013c940bf000 (patch)
tree1a6065a9b2fcd52395ae35778cf9de0ad43d0b30 /fs/ceph
parent1cf89a8dee5e6e9d4fcb81b571a54d40068dfbb7 (diff)
ceph: avoid iput_final() while holding mutex or in dispatch thread
iput_final() may wait for readahead pages. The wait can cause a deadlock. For example: Workqueue: ceph-msgr ceph_con_workfn [libceph] Call Trace: schedule+0x36/0x80 io_schedule+0x16/0x40 __lock_page+0x101/0x140 truncate_inode_pages_range+0x556/0x9f0 truncate_inode_pages_final+0x4d/0x60 evict+0x182/0x1a0 iput+0x1d2/0x220 iterate_session_caps+0x82/0x230 [ceph] dispatch+0x678/0xa80 [ceph] ceph_con_workfn+0x95b/0x1560 [libceph] process_one_work+0x14d/0x410 worker_thread+0x4b/0x460 kthread+0x105/0x140 ret_from_fork+0x22/0x40 Workqueue: ceph-msgr ceph_con_workfn [libceph] Call Trace: __schedule+0x3d6/0x8b0 schedule+0x36/0x80 schedule_preempt_disabled+0xe/0x10 mutex_lock+0x2f/0x40 ceph_check_caps+0x505/0xa80 [ceph] ceph_put_wrbuffer_cap_refs+0x1e5/0x2c0 [ceph] writepages_finish+0x2d3/0x410 [ceph] __complete_request+0x26/0x60 [libceph] handle_reply+0x6c8/0xa10 [libceph] dispatch+0x29a/0xbb0 [libceph] ceph_con_workfn+0x95b/0x1560 [libceph] process_one_work+0x14d/0x410 worker_thread+0x4b/0x460 kthread+0x105/0x140 ret_from_fork+0x22/0x40 In the above example, truncate_inode_pages_range() waits for readahead pages while holding s_mutex. ceph_check_caps() waits for s_mutex and blocks the OSD dispatch thread. Later OSD replies (for readahead) can't be handled. ceph_check_caps() may also lock snap_rwsem for read, so a similar deadlock can happen if iput_final() is called while holding snap_rwsem. In general, it's not good to call iput_final() inside MDS/OSD dispatch threads or while holding any mutex. The fix is to introduce ceph_async_iput(), which calls iput_final() in a workqueue. Signed-off-by: "Yan, Zheng" <zyan@redhat.com> Reviewed-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/caps.c12
-rw-r--r--fs/ceph/inode.c31
-rw-r--r--fs/ceph/mds_client.c28
-rw-r--r--fs/ceph/quota.c9
-rw-r--r--fs/ceph/snap.c16
-rw-r--r--fs/ceph/super.h2
6 files changed, 71 insertions, 27 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 72f8e1311392..52a2b90621cd 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2992,8 +2992,10 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2992 } 2992 }
2993 if (complete_capsnap) 2993 if (complete_capsnap)
2994 wake_up_all(&ci->i_cap_wq); 2994 wake_up_all(&ci->i_cap_wq);
2995 while (put-- > 0) 2995 while (put-- > 0) {
2996 iput(inode); 2996 /* avoid calling iput_final() in osd dispatch threads */
2997 ceph_async_iput(inode);
2998 }
2997} 2999}
2998 3000
2999/* 3001/*
@@ -3964,8 +3966,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
3964done: 3966done:
3965 mutex_unlock(&session->s_mutex); 3967 mutex_unlock(&session->s_mutex);
3966done_unlocked: 3968done_unlocked:
3967 iput(inode);
3968 ceph_put_string(extra_info.pool_ns); 3969 ceph_put_string(extra_info.pool_ns);
3970 /* avoid calling iput_final() in mds dispatch threads */
3971 ceph_async_iput(inode);
3969 return; 3972 return;
3970 3973
3971flush_cap_releases: 3974flush_cap_releases:
@@ -4011,7 +4014,8 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
4011 if (inode) { 4014 if (inode) {
4012 dout("check_delayed_caps on %p\n", inode); 4015 dout("check_delayed_caps on %p\n", inode);
4013 ceph_check_caps(ci, flags, NULL); 4016 ceph_check_caps(ci, flags, NULL);
4014 iput(inode); 4017 /* avoid calling iput_final() in tick thread */
4018 ceph_async_iput(inode);
4015 } 4019 }
4016 } 4020 }
4017 spin_unlock(&mdsc->cap_delay_lock); 4021 spin_unlock(&mdsc->cap_delay_lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 3acdd3cc6039..761451f36e2d 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1476,7 +1476,8 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
1476 pr_err("fill_inode badness on %p got %d\n", in, rc); 1476 pr_err("fill_inode badness on %p got %d\n", in, rc);
1477 err = rc; 1477 err = rc;
1478 } 1478 }
1479 iput(in); 1479 /* avoid calling iput_final() in mds dispatch threads */
1480 ceph_async_iput(in);
1480 } 1481 }
1481 1482
1482 return err; 1483 return err;
@@ -1674,8 +1675,11 @@ retry_lookup:
1674 &req->r_caps_reservation); 1675 &req->r_caps_reservation);
1675 if (ret < 0) { 1676 if (ret < 0) {
1676 pr_err("fill_inode badness on %p\n", in); 1677 pr_err("fill_inode badness on %p\n", in);
1677 if (d_really_is_negative(dn)) 1678 if (d_really_is_negative(dn)) {
1678 iput(in); 1679 /* avoid calling iput_final() in mds
1680 * dispatch threads */
1681 ceph_async_iput(in);
1682 }
1679 d_drop(dn); 1683 d_drop(dn);
1680 err = ret; 1684 err = ret;
1681 goto next_item; 1685 goto next_item;
@@ -1685,7 +1689,7 @@ retry_lookup:
1685 if (ceph_security_xattr_deadlock(in)) { 1689 if (ceph_security_xattr_deadlock(in)) {
1686 dout(" skip splicing dn %p to inode %p" 1690 dout(" skip splicing dn %p to inode %p"
1687 " (security xattr deadlock)\n", dn, in); 1691 " (security xattr deadlock)\n", dn, in);
1688 iput(in); 1692 ceph_async_iput(in);
1689 skipped++; 1693 skipped++;
1690 goto next_item; 1694 goto next_item;
1691 } 1695 }
@@ -1737,6 +1741,25 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size)
1737} 1741}
1738 1742
1739/* 1743/*
1744 * Put reference to inode, but avoid calling iput_final() in current thread.
1745 * iput_final() may wait for readahead pages. The wait can cause deadlock in
1746 * some contexts.
1747 */
1748void ceph_async_iput(struct inode *inode)
1749{
1750 if (!inode)
1751 return;
1752 for (;;) {
1753 if (atomic_add_unless(&inode->i_count, -1, 1))
1754 break;
1755 if (queue_work(ceph_inode_to_client(inode)->inode_wq,
1756 &ceph_inode(inode)->i_work))
1757 break;
1758 /* queue work failed, i_count must be at least 2 */
1759 }
1760}
1761
1762/*
1740 * Write back inode data in a worker thread. (This can't be done 1763 * Write back inode data in a worker thread. (This can't be done
1741 * in the message handler context.) 1764 * in the message handler context.)
1742 */ 1765 */
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 959b1bf7c327..6af2d0d4a87a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -690,11 +690,12 @@ void ceph_mdsc_release_request(struct kref *kref)
690 ceph_msg_put(req->r_reply); 690 ceph_msg_put(req->r_reply);
691 if (req->r_inode) { 691 if (req->r_inode) {
692 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); 692 ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
693 iput(req->r_inode); 693 /* avoid calling iput_final() in mds dispatch threads */
694 ceph_async_iput(req->r_inode);
694 } 695 }
695 if (req->r_parent) 696 if (req->r_parent)
696 ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); 697 ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
697 iput(req->r_target_inode); 698 ceph_async_iput(req->r_target_inode);
698 if (req->r_dentry) 699 if (req->r_dentry)
699 dput(req->r_dentry); 700 dput(req->r_dentry);
700 if (req->r_old_dentry) 701 if (req->r_old_dentry)
@@ -708,7 +709,7 @@ void ceph_mdsc_release_request(struct kref *kref)
708 */ 709 */
709 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), 710 ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir),
710 CEPH_CAP_PIN); 711 CEPH_CAP_PIN);
711 iput(req->r_old_dentry_dir); 712 ceph_async_iput(req->r_old_dentry_dir);
712 } 713 }
713 kfree(req->r_path1); 714 kfree(req->r_path1);
714 kfree(req->r_path2); 715 kfree(req->r_path2);
@@ -818,7 +819,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
818 } 819 }
819 820
820 if (req->r_unsafe_dir) { 821 if (req->r_unsafe_dir) {
821 iput(req->r_unsafe_dir); 822 /* avoid calling iput_final() in mds dispatch threads */
823 ceph_async_iput(req->r_unsafe_dir);
822 req->r_unsafe_dir = NULL; 824 req->r_unsafe_dir = NULL;
823 } 825 }
824 826
@@ -983,7 +985,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
983 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); 985 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
984 if (!cap) { 986 if (!cap) {
985 spin_unlock(&ci->i_ceph_lock); 987 spin_unlock(&ci->i_ceph_lock);
986 iput(inode); 988 ceph_async_iput(inode);
987 goto random; 989 goto random;
988 } 990 }
989 mds = cap->session->s_mds; 991 mds = cap->session->s_mds;
@@ -992,7 +994,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
992 cap == ci->i_auth_cap ? "auth " : "", cap); 994 cap == ci->i_auth_cap ? "auth " : "", cap);
993 spin_unlock(&ci->i_ceph_lock); 995 spin_unlock(&ci->i_ceph_lock);
994out: 996out:
995 iput(inode); 997 /* avoid calling iput_final() while holding mdsc->mutex or
998 * in mds dispatch threads */
999 ceph_async_iput(inode);
996 return mds; 1000 return mds;
997 1001
998random: 1002random:
@@ -1302,7 +1306,9 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session,
1302 spin_unlock(&session->s_cap_lock); 1306 spin_unlock(&session->s_cap_lock);
1303 1307
1304 if (last_inode) { 1308 if (last_inode) {
1305 iput(last_inode); 1309 /* avoid calling iput_final() while holding
1310 * s_mutex or in mds dispatch threads */
1311 ceph_async_iput(last_inode);
1306 last_inode = NULL; 1312 last_inode = NULL;
1307 } 1313 }
1308 if (old_cap) { 1314 if (old_cap) {
@@ -1335,7 +1341,7 @@ out:
1335 session->s_cap_iterator = NULL; 1341 session->s_cap_iterator = NULL;
1336 spin_unlock(&session->s_cap_lock); 1342 spin_unlock(&session->s_cap_lock);
1337 1343
1338 iput(last_inode); 1344 ceph_async_iput(last_inode);
1339 if (old_cap) 1345 if (old_cap)
1340 ceph_put_cap(session->s_mdsc, old_cap); 1346 ceph_put_cap(session->s_mdsc, old_cap);
1341 1347
@@ -1471,7 +1477,8 @@ static void remove_session_caps(struct ceph_mds_session *session)
1471 spin_unlock(&session->s_cap_lock); 1477 spin_unlock(&session->s_cap_lock);
1472 1478
1473 inode = ceph_find_inode(sb, vino); 1479 inode = ceph_find_inode(sb, vino);
1474 iput(inode); 1480 /* avoid calling iput_final() while holding s_mutex */
1481 ceph_async_iput(inode);
1475 1482
1476 spin_lock(&session->s_cap_lock); 1483 spin_lock(&session->s_cap_lock);
1477 } 1484 }
@@ -3912,8 +3919,9 @@ release:
3912 ceph_con_send(&session->s_con, msg); 3919 ceph_con_send(&session->s_con, msg);
3913 3920
3914out: 3921out:
3915 iput(inode);
3916 mutex_unlock(&session->s_mutex); 3922 mutex_unlock(&session->s_mutex);
3923 /* avoid calling iput_final() in mds dispatch threads */
3924 ceph_async_iput(inode);
3917 return; 3925 return;
3918 3926
3919bad: 3927bad:
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index c4522212872c..d629fc857450 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -74,7 +74,8 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc,
74 le64_to_cpu(h->max_files)); 74 le64_to_cpu(h->max_files));
75 spin_unlock(&ci->i_ceph_lock); 75 spin_unlock(&ci->i_ceph_lock);
76 76
77 iput(inode); 77 /* avoid calling iput_final() in dispatch thread */
78 ceph_async_iput(inode);
78} 79}
79 80
80static struct ceph_quotarealm_inode * 81static struct ceph_quotarealm_inode *
@@ -235,7 +236,8 @@ restart:
235 236
236 ci = ceph_inode(in); 237 ci = ceph_inode(in);
237 has_quota = __ceph_has_any_quota(ci); 238 has_quota = __ceph_has_any_quota(ci);
238 iput(in); 239 /* avoid calling iput_final() while holding mdsc->snap_rwsem */
240 ceph_async_iput(in);
239 241
240 next = realm->parent; 242 next = realm->parent;
241 if (has_quota || !next) 243 if (has_quota || !next)
@@ -372,7 +374,8 @@ restart:
372 pr_warn("Invalid quota check op (%d)\n", op); 374 pr_warn("Invalid quota check op (%d)\n", op);
373 exceeded = true; /* Just break the loop */ 375 exceeded = true; /* Just break the loop */
374 } 376 }
375 iput(in); 377 /* avoid calling iput_final() while holding mdsc->snap_rwsem */
378 ceph_async_iput(in);
376 379
377 next = realm->parent; 380 next = realm->parent;
378 if (exceeded || !next) 381 if (exceeded || !next)
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index b26e12cd8ec3..72c6c022f02b 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -648,13 +648,15 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
648 if (!inode) 648 if (!inode)
649 continue; 649 continue;
650 spin_unlock(&realm->inodes_with_caps_lock); 650 spin_unlock(&realm->inodes_with_caps_lock);
651 iput(lastinode); 651 /* avoid calling iput_final() while holding
652 * mdsc->snap_rwsem or in mds dispatch threads */
653 ceph_async_iput(lastinode);
652 lastinode = inode; 654 lastinode = inode;
653 ceph_queue_cap_snap(ci); 655 ceph_queue_cap_snap(ci);
654 spin_lock(&realm->inodes_with_caps_lock); 656 spin_lock(&realm->inodes_with_caps_lock);
655 } 657 }
656 spin_unlock(&realm->inodes_with_caps_lock); 658 spin_unlock(&realm->inodes_with_caps_lock);
657 iput(lastinode); 659 ceph_async_iput(lastinode);
658 660
659 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); 661 dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
660} 662}
@@ -806,7 +808,9 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
806 ihold(inode); 808 ihold(inode);
807 spin_unlock(&mdsc->snap_flush_lock); 809 spin_unlock(&mdsc->snap_flush_lock);
808 ceph_flush_snaps(ci, &session); 810 ceph_flush_snaps(ci, &session);
809 iput(inode); 811 /* avoid calling iput_final() while holding
812 * session->s_mutex or in mds dispatch threads */
813 ceph_async_iput(inode);
810 spin_lock(&mdsc->snap_flush_lock); 814 spin_lock(&mdsc->snap_flush_lock);
811 } 815 }
812 spin_unlock(&mdsc->snap_flush_lock); 816 spin_unlock(&mdsc->snap_flush_lock);
@@ -950,12 +954,14 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
950 ceph_get_snap_realm(mdsc, realm); 954 ceph_get_snap_realm(mdsc, realm);
951 ceph_put_snap_realm(mdsc, oldrealm); 955 ceph_put_snap_realm(mdsc, oldrealm);
952 956
953 iput(inode); 957 /* avoid calling iput_final() while holding
958 * mdsc->snap_rwsem or in mds dispatch threads */
959 ceph_async_iput(inode);
954 continue; 960 continue;
955 961
956skip_inode: 962skip_inode:
957 spin_unlock(&ci->i_ceph_lock); 963 spin_unlock(&ci->i_ceph_lock);
958 iput(inode); 964 ceph_async_iput(inode);
959 } 965 }
960 966
961 /* we may have taken some of the old realm's children. */ 967 /* we may have taken some of the old realm's children. */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3fb866a979ce..5f27e1f7f2d6 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -899,9 +899,9 @@ extern int ceph_inode_holds_cap(struct inode *inode, int mask);
899extern bool ceph_inode_set_size(struct inode *inode, loff_t size); 899extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
900extern void __ceph_do_pending_vmtruncate(struct inode *inode); 900extern void __ceph_do_pending_vmtruncate(struct inode *inode);
901extern void ceph_queue_vmtruncate(struct inode *inode); 901extern void ceph_queue_vmtruncate(struct inode *inode);
902
903extern void ceph_queue_invalidate(struct inode *inode); 902extern void ceph_queue_invalidate(struct inode *inode);
904extern void ceph_queue_writeback(struct inode *inode); 903extern void ceph_queue_writeback(struct inode *inode);
904extern void ceph_async_iput(struct inode *inode);
905 905
906extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, 906extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
907 int mask, bool force); 907 int mask, bool force);