diff options
author | Sage Weil <sage@inktank.com> | 2012-11-05 14:07:23 -0500 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2012-11-05 14:07:23 -0500 |
commit | 22cddde104d715600a4c218bf9224923208afe90 (patch) | |
tree | 82fc93d9c89ef41145e52ace63484047e600f866 /fs/ceph | |
parent | 4d1d0534f53863108fdea496288cb3310f88118d (diff) |
ceph: Fix i_size update race
ceph_aio_write() has an optimization that marks cap CEPH_CAP_FILE_WR
dirty before data is copied to page cache and inode size is updated.
If ceph_check_caps() flushes the dirty cap before the inode size is
updated, MDS can miss the new inode size. The fix is to move
ceph_{get,put}_cap_refs() into ceph_write_{begin,end}() and call
__ceph_mark_dirty_caps() after inode size is updated.
Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Sage Weil <sage@inktank.com>
Diffstat (limited to 'fs/ceph')
-rw-r--r-- | fs/ceph/addr.c | 51 | ||||
-rw-r--r-- | fs/ceph/file.c | 73 |
2 files changed, 77 insertions, 47 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 22b6e4583fa..21a07187df0 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -1078,23 +1078,51 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, | |||
1078 | struct page **pagep, void **fsdata) | 1078 | struct page **pagep, void **fsdata) |
1079 | { | 1079 | { |
1080 | struct inode *inode = file->f_dentry->d_inode; | 1080 | struct inode *inode = file->f_dentry->d_inode; |
1081 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
1082 | struct ceph_file_info *fi = file->private_data; | ||
1081 | struct page *page; | 1083 | struct page *page; |
1082 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 1084 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
1083 | int r; | 1085 | int r, want, got = 0; |
1086 | |||
1087 | if (fi->fmode & CEPH_FILE_MODE_LAZY) | ||
1088 | want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; | ||
1089 | else | ||
1090 | want = CEPH_CAP_FILE_BUFFER; | ||
1091 | |||
1092 | dout("write_begin %p %llx.%llx %llu~%u getting caps. i_size %llu\n", | ||
1093 | inode, ceph_vinop(inode), pos, len, inode->i_size); | ||
1094 | r = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos+len); | ||
1095 | if (r < 0) | ||
1096 | return r; | ||
1097 | dout("write_begin %p %llx.%llx %llu~%u got cap refs on %s\n", | ||
1098 | inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); | ||
1099 | if (!(got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO))) { | ||
1100 | ceph_put_cap_refs(ci, got); | ||
1101 | return -EAGAIN; | ||
1102 | } | ||
1084 | 1103 | ||
1085 | do { | 1104 | do { |
1086 | /* get a page */ | 1105 | /* get a page */ |
1087 | page = grab_cache_page_write_begin(mapping, index, 0); | 1106 | page = grab_cache_page_write_begin(mapping, index, 0); |
1088 | if (!page) | 1107 | if (!page) { |
1089 | return -ENOMEM; | 1108 | r = -ENOMEM; |
1090 | *pagep = page; | 1109 | break; |
1110 | } | ||
1091 | 1111 | ||
1092 | dout("write_begin file %p inode %p page %p %d~%d\n", file, | 1112 | dout("write_begin file %p inode %p page %p %d~%d\n", file, |
1093 | inode, page, (int)pos, (int)len); | 1113 | inode, page, (int)pos, (int)len); |
1094 | 1114 | ||
1095 | r = ceph_update_writeable_page(file, pos, len, page); | 1115 | r = ceph_update_writeable_page(file, pos, len, page); |
1116 | if (r) | ||
1117 | page_cache_release(page); | ||
1096 | } while (r == -EAGAIN); | 1118 | } while (r == -EAGAIN); |
1097 | 1119 | ||
1120 | if (r) { | ||
1121 | ceph_put_cap_refs(ci, got); | ||
1122 | } else { | ||
1123 | *pagep = page; | ||
1124 | *(int *)fsdata = got; | ||
1125 | } | ||
1098 | return r; | 1126 | return r; |
1099 | } | 1127 | } |
1100 | 1128 | ||
@@ -1108,10 +1136,12 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, | |||
1108 | struct page *page, void *fsdata) | 1136 | struct page *page, void *fsdata) |
1109 | { | 1137 | { |
1110 | struct inode *inode = file->f_dentry->d_inode; | 1138 | struct inode *inode = file->f_dentry->d_inode; |
1139 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
1111 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 1140 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
1112 | struct ceph_mds_client *mdsc = fsc->mdsc; | 1141 | struct ceph_mds_client *mdsc = fsc->mdsc; |
1113 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); | 1142 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); |
1114 | int check_cap = 0; | 1143 | int check_cap = 0; |
1144 | int got = (unsigned long)fsdata; | ||
1115 | 1145 | ||
1116 | dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, | 1146 | dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, |
1117 | inode, page, (int)pos, (int)copied, (int)len); | 1147 | inode, page, (int)pos, (int)copied, (int)len); |
@@ -1134,6 +1164,19 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, | |||
1134 | up_read(&mdsc->snap_rwsem); | 1164 | up_read(&mdsc->snap_rwsem); |
1135 | page_cache_release(page); | 1165 | page_cache_release(page); |
1136 | 1166 | ||
1167 | if (copied > 0) { | ||
1168 | int dirty; | ||
1169 | spin_lock(&ci->i_ceph_lock); | ||
1170 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | ||
1171 | spin_unlock(&ci->i_ceph_lock); | ||
1172 | if (dirty) | ||
1173 | __mark_inode_dirty(inode, dirty); | ||
1174 | } | ||
1175 | |||
1176 | dout("write_end %p %llx.%llx %llu~%u dropping cap refs on %s\n", | ||
1177 | inode, ceph_vinop(inode), pos, len, ceph_cap_string(got)); | ||
1178 | ceph_put_cap_refs(ci, got); | ||
1179 | |||
1137 | if (check_cap) | 1180 | if (check_cap) |
1138 | ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); | 1181 | ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); |
1139 | 1182 | ||
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 5840d2aaed1..d415096800a 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -712,63 +712,53 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
712 | struct ceph_osd_client *osdc = | 712 | struct ceph_osd_client *osdc = |
713 | &ceph_sb_to_client(inode->i_sb)->client->osdc; | 713 | &ceph_sb_to_client(inode->i_sb)->client->osdc; |
714 | loff_t endoff = pos + iov->iov_len; | 714 | loff_t endoff = pos + iov->iov_len; |
715 | int want, got = 0; | 715 | int got = 0; |
716 | int ret, err; | 716 | int ret, err, written; |
717 | 717 | ||
718 | if (ceph_snap(inode) != CEPH_NOSNAP) | 718 | if (ceph_snap(inode) != CEPH_NOSNAP) |
719 | return -EROFS; | 719 | return -EROFS; |
720 | 720 | ||
721 | retry_snap: | 721 | retry_snap: |
722 | written = 0; | ||
722 | if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) | 723 | if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) |
723 | return -ENOSPC; | 724 | return -ENOSPC; |
724 | __ceph_do_pending_vmtruncate(inode); | 725 | __ceph_do_pending_vmtruncate(inode); |
725 | dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", | ||
726 | inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, | ||
727 | inode->i_size); | ||
728 | if (fi->fmode & CEPH_FILE_MODE_LAZY) | ||
729 | want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; | ||
730 | else | ||
731 | want = CEPH_CAP_FILE_BUFFER; | ||
732 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); | ||
733 | if (ret < 0) | ||
734 | goto out_put; | ||
735 | |||
736 | dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", | ||
737 | inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, | ||
738 | ceph_cap_string(got)); | ||
739 | |||
740 | if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || | ||
741 | (iocb->ki_filp->f_flags & O_DIRECT) || | ||
742 | (inode->i_sb->s_flags & MS_SYNCHRONOUS) || | ||
743 | (fi->flags & CEPH_F_SYNC)) { | ||
744 | ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, | ||
745 | &iocb->ki_pos); | ||
746 | } else { | ||
747 | /* | ||
748 | * buffered write; drop Fw early to avoid slow | ||
749 | * revocation if we get stuck on balance_dirty_pages | ||
750 | */ | ||
751 | int dirty; | ||
752 | |||
753 | spin_lock(&ci->i_ceph_lock); | ||
754 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | ||
755 | spin_unlock(&ci->i_ceph_lock); | ||
756 | ceph_put_cap_refs(ci, got); | ||
757 | 726 | ||
727 | /* | ||
728 | * try to do a buffered write. if we don't have sufficient | ||
729 | * caps, we'll get -EAGAIN from generic_file_aio_write, or a | ||
730 | * short write if we only get caps for some pages. | ||
731 | */ | ||
732 | if (!(iocb->ki_filp->f_flags & O_DIRECT) && | ||
733 | !(inode->i_sb->s_flags & MS_SYNCHRONOUS) && | ||
734 | !(fi->flags & CEPH_F_SYNC)) { | ||
758 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); | 735 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); |
736 | if (ret >= 0) | ||
737 | written = ret; | ||
738 | |||
759 | if ((ret >= 0 || ret == -EIOCBQUEUED) && | 739 | if ((ret >= 0 || ret == -EIOCBQUEUED) && |
760 | ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) | 740 | ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) |
761 | || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { | 741 | || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { |
762 | err = vfs_fsync_range(file, pos, pos + ret - 1, 1); | 742 | err = vfs_fsync_range(file, pos, pos + written - 1, 1); |
763 | if (err < 0) | 743 | if (err < 0) |
764 | ret = err; | 744 | ret = err; |
765 | } | 745 | } |
746 | if ((ret < 0 && ret != -EAGAIN) || pos + written >= endoff) | ||
747 | goto out; | ||
748 | } | ||
766 | 749 | ||
767 | if (dirty) | 750 | dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", |
768 | __mark_inode_dirty(inode, dirty); | 751 | inode, ceph_vinop(inode), pos + written, |
752 | (unsigned)iov->iov_len - written, inode->i_size); | ||
753 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, 0, &got, endoff); | ||
754 | if (ret < 0) | ||
769 | goto out; | 755 | goto out; |
770 | } | ||
771 | 756 | ||
757 | dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n", | ||
758 | inode, ceph_vinop(inode), pos + written, | ||
759 | (unsigned)iov->iov_len - written, ceph_cap_string(got)); | ||
760 | ret = ceph_sync_write(file, iov->iov_base + written, | ||
761 | iov->iov_len - written, &iocb->ki_pos); | ||
772 | if (ret >= 0) { | 762 | if (ret >= 0) { |
773 | int dirty; | 763 | int dirty; |
774 | spin_lock(&ci->i_ceph_lock); | 764 | spin_lock(&ci->i_ceph_lock); |
@@ -777,13 +767,10 @@ retry_snap: | |||
777 | if (dirty) | 767 | if (dirty) |
778 | __mark_inode_dirty(inode, dirty); | 768 | __mark_inode_dirty(inode, dirty); |
779 | } | 769 | } |
780 | |||
781 | out_put: | ||
782 | dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", | 770 | dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", |
783 | inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, | 771 | inode, ceph_vinop(inode), pos + written, |
784 | ceph_cap_string(got)); | 772 | (unsigned)iov->iov_len - written, ceph_cap_string(got)); |
785 | ceph_put_cap_refs(ci, got); | 773 | ceph_put_cap_refs(ci, got); |
786 | |||
787 | out: | 774 | out: |
788 | if (ret == -EOLDSNAPC) { | 775 | if (ret == -EOLDSNAPC) { |
789 | dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", | 776 | dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", |