diff options
author | Sage Weil <sage@inktank.com> | 2013-05-02 00:15:58 -0400 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2013-05-02 00:15:58 -0400 |
commit | 7971bd92baf729fcebe04d7330ac22dc668d0261 (patch) | |
tree | 78a8ea3b4e072e52840dac968dfacfff737765bd /fs/ceph | |
parent | a8673d61ad77ddf2118599507bd40cc345e95368 (diff) |
ceph: revert commit 22cddde104
Commit 22cddde104 breaks the atomicity of write operations; it also
introduces a deadlock between write and truncate.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Reviewed-by: Greg Farnum <greg@inktank.com>
Conflicts:
fs/ceph/addr.c
Diffstat (limited to 'fs/ceph')
-rw-r--r-- | fs/ceph/addr.c | 51 | ||||
-rw-r--r-- | fs/ceph/file.c | 73 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 1 |
3 files changed, 48 insertions, 77 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a60ea977af6f..2a571fb4803b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -1067,51 +1067,23 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, | |||
1067 | struct page **pagep, void **fsdata) | 1067 | struct page **pagep, void **fsdata) |
1068 | { | 1068 | { |
1069 | struct inode *inode = file_inode(file); | 1069 | struct inode *inode = file_inode(file); |
1070 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
1071 | struct ceph_file_info *fi = file->private_data; | ||
1072 | struct page *page; | 1070 | struct page *page; |
1073 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1071 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1074 | int r, want, got = 0; | 1072 | int r; |
1075 | |||
1076 | if (fi->fmode & CEPH_FILE_MODE_LAZY) | ||
1077 | want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; | ||
1078 | else | ||
1079 | want = CEPH_CAP_FILE_BUFFER; | ||
1080 | |||
1081 | dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n", | ||
1082 | inode, ceph_vinop(inode), pos, len, inode->i_size); | ||
1083 | r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); | ||
1084 | if (r < 0) | ||
1085 | return r; | ||
1086 | dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n", | ||
1087 | inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); | ||
1088 | if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { | ||
1089 | ceph_put_cap_refs(ci, got); | ||
1090 | return -EAGAIN; | ||
1091 | } | ||
1092 | 1073 | ||
1093 | do { | 1074 | do { |
1094 | /* get a page */ | 1075 | /* get a page */ |
1095 | page = grab_cache_page_write_begin(mapping, index, 0); | 1076 | page = grab_cache_page_write_begin(mapping, index, 0); |
1096 | if (!page) { | 1077 | if (!page) |
1097 | r = -ENOMEM; | 1078 | return -ENOMEM; |
1098 | break; | 1079 | *pagep = page; |
1099 | } | ||
1100 | 1080 | ||
1101 | dout("write_begin file %p inode %p page %p %d~%d\n", file, | 1081 | dout("write_begin file %p inode %p page %p %d~%d\n", file, |
1102 | inode, page, (int)pos, (int)len); | 1082 | inode, page, (int)pos, (int)len); |
1103 | 1083 | ||
1104 | r = ceph_update_writeable_page(file, pos, len, page); | 1084 | r = ceph_update_writeable_page(file, pos, len, page); |
1105 | if (r) | ||
1106 | page_cache_release(page); | ||
1107 | } while (r == -EAGAIN); | 1085 | } while (r == -EAGAIN); |
1108 | 1086 | ||
1109 | if (r) { | ||
1110 | ceph_put_cap_refs(ci, got); | ||
1111 | } else { | ||
1112 | *pagep = page; | ||
1113 | *(int *)fsdata = got; | ||
1114 | } | ||
1115 | return r; | 1087 | return r; |
1116 | } | 1088 | } |
1117 | 1089 | ||
@@ -1125,12 +1097,10 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, | |||
1125 | struct page *page, void *fsdata) | 1097 | struct page *page, void *fsdata) |
1126 | { | 1098 | { |
1127 | struct inode *inode = file_inode(file); | 1099 | struct inode *inode = file_inode(file); |
1128 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
1129 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 1100 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
1130 | struct ceph_mds_client *mdsc = fsc->mdsc; | 1101 | struct ceph_mds_client *mdsc = fsc->mdsc; |
1131 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); | 1102 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); |
1132 | int check_cap = 0; | 1103 | int check_cap = 0; |
1133 | int got = (unsigned long)fsdata; | ||
1134 | 1104 | ||
1135 | dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, | 1105 | dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, |
1136 | inode, page, (int)pos, (int)copied, (int)len); | 1106 | inode, page, (int)pos, (int)copied, (int)len); |
@@ -1153,19 +1123,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, | |||
1153 | up_read(&mdsc->snap_rwsem); | 1123 | up_read(&mdsc->snap_rwsem); |
1154 | page_cache_release(page); | 1124 | page_cache_release(page); |
1155 | 1125 | ||
1156 | if (copied > 0) { | ||
1157 | int dirty; | ||
1158 | spin_lock(&ci->i_ceph_lock); | ||
1159 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | ||
1160 | spin_unlock(&ci->i_ceph_lock); | ||
1161 | if (dirty) | ||
1162 | __mark_inode_dirty(inode, dirty); | ||
1163 | } | ||
1164 | |||
1165 | dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n", | ||
1166 | inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); | ||
1167 | ceph_put_cap_refs(ci, got); | ||
1168 | |||
1169 | if (check_cap) | 1126 | if (check_cap) |
1170 | ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); | 1127 | ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); |
1171 | 1128 | ||
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index bf338d9b67e3..b86d2a0eb145 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -718,53 +718,63 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
718 | struct ceph_osd_client *osdc = | 718 | struct ceph_osd_client *osdc = |
719 | &ceph_sb_to_client(inode->i_sb)->client->osdc; | 719 | &ceph_sb_to_client(inode->i_sb)->client->osdc; |
720 | loff_t endoff = pos + iov->iov_len; | 720 | loff_t endoff = pos + iov->iov_len; |
721 | int got = 0; | 721 | int want, got = 0; |
722 | int ret, err, written; | 722 | int ret, err; |
723 | 723 | ||
724 | if (ceph_snap(inode) != CEPH_NOSNAP) | 724 | if (ceph_snap(inode) != CEPH_NOSNAP) |
725 | return -EROFS; | 725 | return -EROFS; |
726 | 726 | ||
727 | retry_snap: | 727 | retry_snap: |
728 | written = 0; | ||
729 | if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) | 728 | if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) |
730 | return -ENOSPC; | 729 | return -ENOSPC; |
731 | __ceph_do_pending_vmtruncate(inode); | 730 | __ceph_do_pending_vmtruncate(inode); |
731 | dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", | ||
732 | inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, | ||
733 | inode->i_size); | ||
734 | if (fi->fmode & CEPH_FILE_MODE_LAZY) | ||
735 | want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; | ||
736 | else | ||
737 | want = CEPH_CAP_FILE_BUFFER; | ||
738 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); | ||
739 | if (ret < 0) | ||
740 | goto out_put; | ||
732 | 741 | ||
733 | /* | 742 | dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", |
734 | * try to do a buffered write. if we don't have sufficient | 743 | inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, |
735 | * caps, we'll get -EAGAIN from generic_file_aio_write, or a | 744 | ceph_cap_string(got)); |
736 | * short write if we only get caps for some pages. | 745 | |
737 | */ | 746 | if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || |
738 | if (!(iocb->ki_filp->f_flags & O_DIRECT) && | 747 | (iocb->ki_filp->f_flags & O_DIRECT) || |
739 | !(inode->i_sb->s_flags & MS_SYNCHRONOUS) && | 748 | (inode->i_sb->s_flags & MS_SYNCHRONOUS) || |
740 | !(fi->flags & CEPH_F_SYNC)) { | 749 | (fi->flags & CEPH_F_SYNC)) { |
741 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); | 750 | ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, |
742 | if (ret >= 0) | 751 | &iocb->ki_pos); |
743 | written = ret; | 752 | } else { |
753 | /* | ||
754 | * buffered write; drop Fw early to avoid slow | ||
755 | * revocation if we get stuck on balance_dirty_pages | ||
756 | */ | ||
757 | int dirty; | ||
744 | 758 | ||
759 | spin_lock(&ci->i_ceph_lock); | ||
760 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | ||
761 | spin_unlock(&ci->i_ceph_lock); | ||
762 | ceph_put_cap_refs(ci, got); | ||
763 | |||
764 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); | ||
745 | if ((ret >= 0 || ret == -EIOCBQUEUED) && | 765 | if ((ret >= 0 || ret == -EIOCBQUEUED) && |
746 | ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) | 766 | ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) |
747 | || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { | 767 | || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { |
748 | err = vfs_fsync_range(file, pos, pos + written - 1, 1); | 768 | err = vfs_fsync_range(file, pos, pos + ret - 1, 1); |
749 | if (err < 0) | 769 | if (err < 0) |
750 | ret = err; | 770 | ret = err; |
751 | } | 771 | } |
752 | if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff) | ||
753 | goto out; | ||
754 | } | ||
755 | 772 | ||
756 | dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", | 773 | if (dirty) |
757 | inode, ceph_vinop(inode), pos + written, | 774 | __mark_inode_dirty(inode, dirty); |
758 | (unsigned)iov->iov_len - written, inode->i_size); | ||
759 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff); | ||
760 | if (ret < 0) | ||
761 | goto out; | 775 | goto out; |
776 | } | ||
762 | 777 | ||
763 | dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", | ||
764 | inode, ceph_vinop(inode), pos + written, | ||
765 | (unsigned)iov->iov_len - written, ceph_cap_string(got)); | ||
766 | ret = ceph_sync_write(file, iov->iov_base + written, | ||
767 | iov->iov_len - written, &iocb->ki_pos); | ||
768 | if (ret >= 0) { | 778 | if (ret >= 0) { |
769 | int dirty; | 779 | int dirty; |
770 | spin_lock(&ci->i_ceph_lock); | 780 | spin_lock(&ci->i_ceph_lock); |
@@ -773,10 +783,13 @@ retry_snap: | |||
773 | if (dirty) | 783 | if (dirty) |
774 | __mark_inode_dirty(inode, dirty); | 784 | __mark_inode_dirty(inode, dirty); |
775 | } | 785 | } |
786 | |||
787 | out_put: | ||
776 | dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", | 788 | dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", |
777 | inode, ceph_vinop(inode), pos + written, | 789 | inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, |
778 | (unsigned)iov->iov_len - written, ceph_cap_string(got)); | 790 | ceph_cap_string(got)); |
779 | ceph_put_cap_refs(ci, got); | 791 | ceph_put_cap_refs(ci, got); |
792 | |||
780 | out: | 793 | out: |
781 | if (ret == -EOLDSNAPC) { | 794 | if (ret == -EOLDSNAPC) { |
782 | dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", | 795 | dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 56da380878c5..9811caae7be4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
@@ -1916,6 +1916,7 @@ static void __wake_requests(struct ceph_mds_client *mdsc, | |||
1916 | req = list_entry(tmp_list.next, | 1916 | req = list_entry(tmp_list.next, |
1917 | struct ceph_mds_request, r_wait); | 1917 | struct ceph_mds_request, r_wait); |
1918 | list_del_init(&req->r_wait); | 1918 | list_del_init(&req->r_wait); |
1919 | dout(" wake request %p tid %llu\n", req, req->r_tid); | ||
1919 | __do_request(mdsc, req); | 1920 | __do_request(mdsc, req); |
1920 | } | 1921 | } |
1921 | } | 1922 | } |