diff options
author | Sage Weil <sage@newdream.net> | 2010-02-09 17:04:02 -0500 |
---|---|---|
committer | Sage Weil <sage@newdream.net> | 2010-02-11 14:48:53 -0500 |
commit | 6a026589ba333185c466c906376fe022a27a53f9 (patch) | |
tree | 7c536163e2192928978b79d5a6bbd79dd5993777 /fs/ceph/file.c | |
parent | 68c283236a1e0772e1a469dd2ffc17afc300b07b (diff) |
ceph: fix sync read eof check deadlock
If a sync read gets a short result from the OSD, it may need to do a
getattr to see whether the read was short because it reached end-of-file.
The getattr was being done while holding a reference to FILE_RD, which can
lead to a deadlock if the MDS is revoking that capability bit and can't
process the getattr until the revocation completes.
We fix this by setting a flag if EOF size validation is needed, and doing
the getattr in ceph_aio_read, after the RD cap ref is dropped. If the
read needs to be continued, we loop and continue traversing the file.
Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'fs/ceph/file.c')
-rw-r--r-- | fs/ceph/file.c | 39 |
1 files changed, 26 insertions, 13 deletions
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index bbf1ccf2d56e..2c4ae4441cab 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -431,7 +431,8 @@ static void zero_page_vector_range(int off, int len, struct page **pages) | |||
431 | */ | 431 | */ |
432 | static int striped_read(struct inode *inode, | 432 | static int striped_read(struct inode *inode, |
433 | u64 off, u64 len, | 433 | u64 off, u64 len, |
434 | struct page **pages, int num_pages) | 434 | struct page **pages, int num_pages, |
435 | int *checkeof) | ||
435 | { | 436 | { |
436 | struct ceph_client *client = ceph_inode_to_client(inode); | 437 | struct ceph_client *client = ceph_inode_to_client(inode); |
437 | struct ceph_inode_info *ci = ceph_inode(inode); | 438 | struct ceph_inode_info *ci = ceph_inode(inode); |
@@ -497,15 +498,7 @@ more: | |||
497 | } | 498 | } |
498 | 499 | ||
499 | /* check i_size */ | 500 | /* check i_size */ |
500 | ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); | 501 | *checkeof = 1; |
501 | if (ret < 0) | ||
502 | goto out; | ||
503 | |||
504 | /* hit EOF? */ | ||
505 | if (pos >= inode->i_size) | ||
506 | goto out; | ||
507 | |||
508 | goto more; | ||
509 | } | 502 | } |
510 | 503 | ||
511 | out: | 504 | out: |
@@ -522,7 +515,7 @@ out: | |||
522 | * If the read spans object boundary, just do multiple reads. | 515 | * If the read spans object boundary, just do multiple reads. |
523 | */ | 516 | */ |
524 | static ssize_t ceph_sync_read(struct file *file, char __user *data, | 517 | static ssize_t ceph_sync_read(struct file *file, char __user *data, |
525 | unsigned len, loff_t *poff) | 518 | unsigned len, loff_t *poff, int *checkeof) |
526 | { | 519 | { |
527 | struct inode *inode = file->f_dentry->d_inode; | 520 | struct inode *inode = file->f_dentry->d_inode; |
528 | struct page **pages; | 521 | struct page **pages; |
@@ -552,7 +545,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, | |||
552 | if (ret < 0) | 545 | if (ret < 0) |
553 | goto done; | 546 | goto done; |
554 | 547 | ||
555 | ret = striped_read(inode, off, len, pages, num_pages); | 548 | ret = striped_read(inode, off, len, pages, num_pages, checkeof); |
556 | 549 | ||
557 | if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) | 550 | if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) |
558 | ret = copy_page_vector_to_user(pages, data, off, ret); | 551 | ret = copy_page_vector_to_user(pages, data, off, ret); |
@@ -746,11 +739,14 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
746 | size_t len = iov->iov_len; | 739 | size_t len = iov->iov_len; |
747 | struct inode *inode = filp->f_dentry->d_inode; | 740 | struct inode *inode = filp->f_dentry->d_inode; |
748 | struct ceph_inode_info *ci = ceph_inode(inode); | 741 | struct ceph_inode_info *ci = ceph_inode(inode); |
742 | void *base = iov->iov_base; | ||
749 | ssize_t ret; | 743 | ssize_t ret; |
750 | int got = 0; | 744 | int got = 0; |
745 | int checkeof = 0, read = 0; | ||
751 | 746 | ||
752 | dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", | 747 | dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", |
753 | inode, ceph_vinop(inode), pos, (unsigned)len, inode); | 748 | inode, ceph_vinop(inode), pos, (unsigned)len, inode); |
749 | again: | ||
754 | __ceph_do_pending_vmtruncate(inode); | 750 | __ceph_do_pending_vmtruncate(inode); |
755 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, | 751 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, |
756 | &got, -1); | 752 | &got, -1); |
@@ -764,7 +760,7 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
764 | (iocb->ki_filp->f_flags & O_DIRECT) || | 760 | (iocb->ki_filp->f_flags & O_DIRECT) || |
765 | (inode->i_sb->s_flags & MS_SYNCHRONOUS)) | 761 | (inode->i_sb->s_flags & MS_SYNCHRONOUS)) |
766 | /* hmm, this isn't really async... */ | 762 | /* hmm, this isn't really async... */ |
767 | ret = ceph_sync_read(filp, iov->iov_base, len, ppos); | 763 | ret = ceph_sync_read(filp, base, len, ppos, &checkeof); |
768 | else | 764 | else |
769 | ret = generic_file_aio_read(iocb, iov, nr_segs, pos); | 765 | ret = generic_file_aio_read(iocb, iov, nr_segs, pos); |
770 | 766 | ||
@@ -772,6 +768,23 @@ out: | |||
772 | dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", | 768 | dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", |
773 | inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); | 769 | inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); |
774 | ceph_put_cap_refs(ci, got); | 770 | ceph_put_cap_refs(ci, got); |
771 | |||
772 | if (checkeof && ret >= 0) { | ||
773 | int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); | ||
774 | |||
775 | /* hit EOF or hole? */ | ||
776 | if (statret == 0 && *ppos < inode->i_size) { | ||
777 | dout("aio_read sync_read hit hole, reading more\n"); | ||
778 | read += ret; | ||
779 | base += ret; | ||
780 | len -= ret; | ||
781 | checkeof = 0; | ||
782 | goto again; | ||
783 | } | ||
784 | } | ||
785 | if (ret >= 0) | ||
786 | ret += read; | ||
787 | |||
775 | return ret; | 788 | return ret; |
776 | } | 789 | } |
777 | 790 | ||