diff options
author | Yan, Zheng <zyan@redhat.com> | 2019-05-18 08:39:55 -0400 |
---|---|---|
committer | Ilya Dryomov <idryomov@gmail.com> | 2019-06-05 14:34:39 -0400 |
commit | 3e1d0452edceebb903d23db53201013c940bf000 (patch) | |
tree | 1a6065a9b2fcd52395ae35778cf9de0ad43d0b30 /fs/ceph | |
parent | 1cf89a8dee5e6e9d4fcb81b571a54d40068dfbb7 (diff) |
ceph: avoid iput_final() while holding mutex or in dispatch thread
iput_final() may wait for readahead pages. The wait can cause a deadlock.
For example:
Workqueue: ceph-msgr ceph_con_workfn [libceph]
Call Trace:
schedule+0x36/0x80
io_schedule+0x16/0x40
__lock_page+0x101/0x140
truncate_inode_pages_range+0x556/0x9f0
truncate_inode_pages_final+0x4d/0x60
evict+0x182/0x1a0
iput+0x1d2/0x220
iterate_session_caps+0x82/0x230 [ceph]
dispatch+0x678/0xa80 [ceph]
ceph_con_workfn+0x95b/0x1560 [libceph]
process_one_work+0x14d/0x410
worker_thread+0x4b/0x460
kthread+0x105/0x140
ret_from_fork+0x22/0x40
Workqueue: ceph-msgr ceph_con_workfn [libceph]
Call Trace:
__schedule+0x3d6/0x8b0
schedule+0x36/0x80
schedule_preempt_disabled+0xe/0x10
mutex_lock+0x2f/0x40
ceph_check_caps+0x505/0xa80 [ceph]
ceph_put_wrbuffer_cap_refs+0x1e5/0x2c0 [ceph]
writepages_finish+0x2d3/0x410 [ceph]
__complete_request+0x26/0x60 [libceph]
handle_reply+0x6c8/0xa10 [libceph]
dispatch+0x29a/0xbb0 [libceph]
ceph_con_workfn+0x95b/0x1560 [libceph]
process_one_work+0x14d/0x410
worker_thread+0x4b/0x460
kthread+0x105/0x140
ret_from_fork+0x22/0x40
In the above example, truncate_inode_pages_range() waits for readahead pages
while holding s_mutex. ceph_check_caps() waits for s_mutex and blocks the
OSD dispatch thread. Later OSD replies (for readahead) can't be handled.
ceph_check_caps() may also lock snap_rwsem for read. So a similar deadlock
can happen if iput_final() is called while holding snap_rwsem.
In general, it's not good to call iput_final() inside MDS/OSD dispatch
threads or while holding any mutex.
The fix is to introduce ceph_async_iput(), which calls iput_final() in a
workqueue.
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Diffstat (limited to 'fs/ceph')
-rw-r--r-- | fs/ceph/caps.c | 12 | ||||
-rw-r--r-- | fs/ceph/inode.c | 31 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 28 | ||||
-rw-r--r-- | fs/ceph/quota.c | 9 | ||||
-rw-r--r-- | fs/ceph/snap.c | 16 | ||||
-rw-r--r-- | fs/ceph/super.h | 2 |
6 files changed, 71 insertions, 27 deletions
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 72f8e1311392..52a2b90621cd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c | |||
@@ -2992,8 +2992,10 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, | |||
2992 | } | 2992 | } |
2993 | if (complete_capsnap) | 2993 | if (complete_capsnap) |
2994 | wake_up_all(&ci->i_cap_wq); | 2994 | wake_up_all(&ci->i_cap_wq); |
2995 | while (put-- > 0) | 2995 | while (put-- > 0) { |
2996 | iput(inode); | 2996 | /* avoid calling iput_final() in osd dispatch threads */ |
2997 | ceph_async_iput(inode); | ||
2998 | } | ||
2997 | } | 2999 | } |
2998 | 3000 | ||
2999 | /* | 3001 | /* |
@@ -3964,8 +3966,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
3964 | done: | 3966 | done: |
3965 | mutex_unlock(&session->s_mutex); | 3967 | mutex_unlock(&session->s_mutex); |
3966 | done_unlocked: | 3968 | done_unlocked: |
3967 | iput(inode); | ||
3968 | ceph_put_string(extra_info.pool_ns); | 3969 | ceph_put_string(extra_info.pool_ns); |
3970 | /* avoid calling iput_final() in mds dispatch threads */ | ||
3971 | ceph_async_iput(inode); | ||
3969 | return; | 3972 | return; |
3970 | 3973 | ||
3971 | flush_cap_releases: | 3974 | flush_cap_releases: |
@@ -4011,7 +4014,8 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) | |||
4011 | if (inode) { | 4014 | if (inode) { |
4012 | dout("check_delayed_caps on %p\n", inode); | 4015 | dout("check_delayed_caps on %p\n", inode); |
4013 | ceph_check_caps(ci, flags, NULL); | 4016 | ceph_check_caps(ci, flags, NULL); |
4014 | iput(inode); | 4017 | /* avoid calling iput_final() in tick thread */ |
4018 | ceph_async_iput(inode); | ||
4015 | } | 4019 | } |
4016 | } | 4020 | } |
4017 | spin_unlock(&mdsc->cap_delay_lock); | 4021 | spin_unlock(&mdsc->cap_delay_lock); |
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 3acdd3cc6039..761451f36e2d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c | |||
@@ -1476,7 +1476,8 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, | |||
1476 | pr_err("fill_inode badness on %p got %d\n", in, rc); | 1476 | pr_err("fill_inode badness on %p got %d\n", in, rc); |
1477 | err = rc; | 1477 | err = rc; |
1478 | } | 1478 | } |
1479 | iput(in); | 1479 | /* avoid calling iput_final() in mds dispatch threads */ |
1480 | ceph_async_iput(in); | ||
1480 | } | 1481 | } |
1481 | 1482 | ||
1482 | return err; | 1483 | return err; |
@@ -1674,8 +1675,11 @@ retry_lookup: | |||
1674 | &req->r_caps_reservation); | 1675 | &req->r_caps_reservation); |
1675 | if (ret < 0) { | 1676 | if (ret < 0) { |
1676 | pr_err("fill_inode badness on %p\n", in); | 1677 | pr_err("fill_inode badness on %p\n", in); |
1677 | if (d_really_is_negative(dn)) | 1678 | if (d_really_is_negative(dn)) { |
1678 | iput(in); | 1679 | /* avoid calling iput_final() in mds |
1680 | * dispatch threads */ | ||
1681 | ceph_async_iput(in); | ||
1682 | } | ||
1679 | d_drop(dn); | 1683 | d_drop(dn); |
1680 | err = ret; | 1684 | err = ret; |
1681 | goto next_item; | 1685 | goto next_item; |
@@ -1685,7 +1689,7 @@ retry_lookup: | |||
1685 | if (ceph_security_xattr_deadlock(in)) { | 1689 | if (ceph_security_xattr_deadlock(in)) { |
1686 | dout(" skip splicing dn %p to inode %p" | 1690 | dout(" skip splicing dn %p to inode %p" |
1687 | " (security xattr deadlock)\n", dn, in); | 1691 | " (security xattr deadlock)\n", dn, in); |
1688 | iput(in); | 1692 | ceph_async_iput(in); |
1689 | skipped++; | 1693 | skipped++; |
1690 | goto next_item; | 1694 | goto next_item; |
1691 | } | 1695 | } |
@@ -1737,6 +1741,25 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size) | |||
1737 | } | 1741 | } |
1738 | 1742 | ||
1739 | /* | 1743 | /* |
1744 | * Put reference to inode, but avoid calling iput_final() in current thread. | ||
1745 | * iput_final() may wait for readahead pages. The wait can cause deadlock in | ||
1746 | * some contexts. | ||
1747 | */ | ||
1748 | void ceph_async_iput(struct inode *inode) | ||
1749 | { | ||
1750 | if (!inode) | ||
1751 | return; | ||
1752 | for (;;) { | ||
1753 | if (atomic_add_unless(&inode->i_count, -1, 1)) | ||
1754 | break; | ||
1755 | if (queue_work(ceph_inode_to_client(inode)->inode_wq, | ||
1756 | &ceph_inode(inode)->i_work)) | ||
1757 | break; | ||
1758 | /* queue work failed, i_count must be at least 2 */ | ||
1759 | } | ||
1760 | } | ||
1761 | |||
1762 | /* | ||
1740 | * Write back inode data in a worker thread. (This can't be done | 1763 | * Write back inode data in a worker thread. (This can't be done |
1741 | * in the message handler context.) | 1764 | * in the message handler context.) |
1742 | */ | 1765 | */ |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 959b1bf7c327..6af2d0d4a87a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
@@ -690,11 +690,12 @@ void ceph_mdsc_release_request(struct kref *kref) | |||
690 | ceph_msg_put(req->r_reply); | 690 | ceph_msg_put(req->r_reply); |
691 | if (req->r_inode) { | 691 | if (req->r_inode) { |
692 | ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); | 692 | ceph_put_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); |
693 | iput(req->r_inode); | 693 | /* avoid calling iput_final() in mds dispatch threads */ |
694 | ceph_async_iput(req->r_inode); | ||
694 | } | 695 | } |
695 | if (req->r_parent) | 696 | if (req->r_parent) |
696 | ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); | 697 | ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); |
697 | iput(req->r_target_inode); | 698 | ceph_async_iput(req->r_target_inode); |
698 | if (req->r_dentry) | 699 | if (req->r_dentry) |
699 | dput(req->r_dentry); | 700 | dput(req->r_dentry); |
700 | if (req->r_old_dentry) | 701 | if (req->r_old_dentry) |
@@ -708,7 +709,7 @@ void ceph_mdsc_release_request(struct kref *kref) | |||
708 | */ | 709 | */ |
709 | ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), | 710 | ceph_put_cap_refs(ceph_inode(req->r_old_dentry_dir), |
710 | CEPH_CAP_PIN); | 711 | CEPH_CAP_PIN); |
711 | iput(req->r_old_dentry_dir); | 712 | ceph_async_iput(req->r_old_dentry_dir); |
712 | } | 713 | } |
713 | kfree(req->r_path1); | 714 | kfree(req->r_path1); |
714 | kfree(req->r_path2); | 715 | kfree(req->r_path2); |
@@ -818,7 +819,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc, | |||
818 | } | 819 | } |
819 | 820 | ||
820 | if (req->r_unsafe_dir) { | 821 | if (req->r_unsafe_dir) { |
821 | iput(req->r_unsafe_dir); | 822 | /* avoid calling iput_final() in mds dispatch threads */ |
823 | ceph_async_iput(req->r_unsafe_dir); | ||
822 | req->r_unsafe_dir = NULL; | 824 | req->r_unsafe_dir = NULL; |
823 | } | 825 | } |
824 | 826 | ||
@@ -983,7 +985,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc, | |||
983 | cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); | 985 | cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node); |
984 | if (!cap) { | 986 | if (!cap) { |
985 | spin_unlock(&ci->i_ceph_lock); | 987 | spin_unlock(&ci->i_ceph_lock); |
986 | iput(inode); | 988 | ceph_async_iput(inode); |
987 | goto random; | 989 | goto random; |
988 | } | 990 | } |
989 | mds = cap->session->s_mds; | 991 | mds = cap->session->s_mds; |
@@ -992,7 +994,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc, | |||
992 | cap == ci->i_auth_cap ? "auth " : "", cap); | 994 | cap == ci->i_auth_cap ? "auth " : "", cap); |
993 | spin_unlock(&ci->i_ceph_lock); | 995 | spin_unlock(&ci->i_ceph_lock); |
994 | out: | 996 | out: |
995 | iput(inode); | 997 | /* avoid calling iput_final() while holding mdsc->mutex or |
998 | * in mds dispatch threads */ | ||
999 | ceph_async_iput(inode); | ||
996 | return mds; | 1000 | return mds; |
997 | 1001 | ||
998 | random: | 1002 | random: |
@@ -1302,7 +1306,9 @@ int ceph_iterate_session_caps(struct ceph_mds_session *session, | |||
1302 | spin_unlock(&session->s_cap_lock); | 1306 | spin_unlock(&session->s_cap_lock); |
1303 | 1307 | ||
1304 | if (last_inode) { | 1308 | if (last_inode) { |
1305 | iput(last_inode); | 1309 | /* avoid calling iput_final() while holding |
1310 | * s_mutex or in mds dispatch threads */ | ||
1311 | ceph_async_iput(last_inode); | ||
1306 | last_inode = NULL; | 1312 | last_inode = NULL; |
1307 | } | 1313 | } |
1308 | if (old_cap) { | 1314 | if (old_cap) { |
@@ -1335,7 +1341,7 @@ out: | |||
1335 | session->s_cap_iterator = NULL; | 1341 | session->s_cap_iterator = NULL; |
1336 | spin_unlock(&session->s_cap_lock); | 1342 | spin_unlock(&session->s_cap_lock); |
1337 | 1343 | ||
1338 | iput(last_inode); | 1344 | ceph_async_iput(last_inode); |
1339 | if (old_cap) | 1345 | if (old_cap) |
1340 | ceph_put_cap(session->s_mdsc, old_cap); | 1346 | ceph_put_cap(session->s_mdsc, old_cap); |
1341 | 1347 | ||
@@ -1471,7 +1477,8 @@ static void remove_session_caps(struct ceph_mds_session *session) | |||
1471 | spin_unlock(&session->s_cap_lock); | 1477 | spin_unlock(&session->s_cap_lock); |
1472 | 1478 | ||
1473 | inode = ceph_find_inode(sb, vino); | 1479 | inode = ceph_find_inode(sb, vino); |
1474 | iput(inode); | 1480 | /* avoid calling iput_final() while holding s_mutex */ |
1481 | ceph_async_iput(inode); | ||
1475 | 1482 | ||
1476 | spin_lock(&session->s_cap_lock); | 1483 | spin_lock(&session->s_cap_lock); |
1477 | } | 1484 | } |
@@ -3912,8 +3919,9 @@ release: | |||
3912 | ceph_con_send(&session->s_con, msg); | 3919 | ceph_con_send(&session->s_con, msg); |
3913 | 3920 | ||
3914 | out: | 3921 | out: |
3915 | iput(inode); | ||
3916 | mutex_unlock(&session->s_mutex); | 3922 | mutex_unlock(&session->s_mutex); |
3923 | /* avoid calling iput_final() in mds dispatch threads */ | ||
3924 | ceph_async_iput(inode); | ||
3917 | return; | 3925 | return; |
3918 | 3926 | ||
3919 | bad: | 3927 | bad: |
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index c4522212872c..d629fc857450 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c | |||
@@ -74,7 +74,8 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, | |||
74 | le64_to_cpu(h->max_files)); | 74 | le64_to_cpu(h->max_files)); |
75 | spin_unlock(&ci->i_ceph_lock); | 75 | spin_unlock(&ci->i_ceph_lock); |
76 | 76 | ||
77 | iput(inode); | 77 | /* avoid calling iput_final() in dispatch thread */ |
78 | ceph_async_iput(inode); | ||
78 | } | 79 | } |
79 | 80 | ||
80 | static struct ceph_quotarealm_inode * | 81 | static struct ceph_quotarealm_inode * |
@@ -235,7 +236,8 @@ restart: | |||
235 | 236 | ||
236 | ci = ceph_inode(in); | 237 | ci = ceph_inode(in); |
237 | has_quota = __ceph_has_any_quota(ci); | 238 | has_quota = __ceph_has_any_quota(ci); |
238 | iput(in); | 239 | /* avoid calling iput_final() while holding mdsc->snap_rwsem */ |
240 | ceph_async_iput(in); | ||
239 | 241 | ||
240 | next = realm->parent; | 242 | next = realm->parent; |
241 | if (has_quota || !next) | 243 | if (has_quota || !next) |
@@ -372,7 +374,8 @@ restart: | |||
372 | pr_warn("Invalid quota check op (%d)\n", op); | 374 | pr_warn("Invalid quota check op (%d)\n", op); |
373 | exceeded = true; /* Just break the loop */ | 375 | exceeded = true; /* Just break the loop */ |
374 | } | 376 | } |
375 | iput(in); | 377 | /* avoid calling iput_final() while holding mdsc->snap_rwsem */ |
378 | ceph_async_iput(in); | ||
376 | 379 | ||
377 | next = realm->parent; | 380 | next = realm->parent; |
378 | if (exceeded || !next) | 381 | if (exceeded || !next) |
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index b26e12cd8ec3..72c6c022f02b 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c | |||
@@ -648,13 +648,15 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) | |||
648 | if (!inode) | 648 | if (!inode) |
649 | continue; | 649 | continue; |
650 | spin_unlock(&realm->inodes_with_caps_lock); | 650 | spin_unlock(&realm->inodes_with_caps_lock); |
651 | iput(lastinode); | 651 | /* avoid calling iput_final() while holding |
652 | * mdsc->snap_rwsem or in mds dispatch threads */ | ||
653 | ceph_async_iput(lastinode); | ||
652 | lastinode = inode; | 654 | lastinode = inode; |
653 | ceph_queue_cap_snap(ci); | 655 | ceph_queue_cap_snap(ci); |
654 | spin_lock(&realm->inodes_with_caps_lock); | 656 | spin_lock(&realm->inodes_with_caps_lock); |
655 | } | 657 | } |
656 | spin_unlock(&realm->inodes_with_caps_lock); | 658 | spin_unlock(&realm->inodes_with_caps_lock); |
657 | iput(lastinode); | 659 | ceph_async_iput(lastinode); |
658 | 660 | ||
659 | dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); | 661 | dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); |
660 | } | 662 | } |
@@ -806,7 +808,9 @@ static void flush_snaps(struct ceph_mds_client *mdsc) | |||
806 | ihold(inode); | 808 | ihold(inode); |
807 | spin_unlock(&mdsc->snap_flush_lock); | 809 | spin_unlock(&mdsc->snap_flush_lock); |
808 | ceph_flush_snaps(ci, &session); | 810 | ceph_flush_snaps(ci, &session); |
809 | iput(inode); | 811 | /* avoid calling iput_final() while holding |
812 | * session->s_mutex or in mds dispatch threads */ | ||
813 | ceph_async_iput(inode); | ||
810 | spin_lock(&mdsc->snap_flush_lock); | 814 | spin_lock(&mdsc->snap_flush_lock); |
811 | } | 815 | } |
812 | spin_unlock(&mdsc->snap_flush_lock); | 816 | spin_unlock(&mdsc->snap_flush_lock); |
@@ -950,12 +954,14 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, | |||
950 | ceph_get_snap_realm(mdsc, realm); | 954 | ceph_get_snap_realm(mdsc, realm); |
951 | ceph_put_snap_realm(mdsc, oldrealm); | 955 | ceph_put_snap_realm(mdsc, oldrealm); |
952 | 956 | ||
953 | iput(inode); | 957 | /* avoid calling iput_final() while holding |
958 | * mdsc->snap_rwsem or in mds dispatch threads */ | ||
959 | ceph_async_iput(inode); | ||
954 | continue; | 960 | continue; |
955 | 961 | ||
956 | skip_inode: | 962 | skip_inode: |
957 | spin_unlock(&ci->i_ceph_lock); | 963 | spin_unlock(&ci->i_ceph_lock); |
958 | iput(inode); | 964 | ceph_async_iput(inode); |
959 | } | 965 | } |
960 | 966 | ||
961 | /* we may have taken some of the old realm's children. */ | 967 | /* we may have taken some of the old realm's children. */ |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3fb866a979ce..5f27e1f7f2d6 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
@@ -899,9 +899,9 @@ extern int ceph_inode_holds_cap(struct inode *inode, int mask); | |||
899 | extern bool ceph_inode_set_size(struct inode *inode, loff_t size); | 899 | extern bool ceph_inode_set_size(struct inode *inode, loff_t size); |
900 | extern void __ceph_do_pending_vmtruncate(struct inode *inode); | 900 | extern void __ceph_do_pending_vmtruncate(struct inode *inode); |
901 | extern void ceph_queue_vmtruncate(struct inode *inode); | 901 | extern void ceph_queue_vmtruncate(struct inode *inode); |
902 | |||
903 | extern void ceph_queue_invalidate(struct inode *inode); | 902 | extern void ceph_queue_invalidate(struct inode *inode); |
904 | extern void ceph_queue_writeback(struct inode *inode); | 903 | extern void ceph_queue_writeback(struct inode *inode); |
904 | extern void ceph_async_iput(struct inode *inode); | ||
905 | 905 | ||
906 | extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, | 906 | extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, |
907 | int mask, bool force); | 907 | int mask, bool force); |