aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorYan, Zheng <zyan@redhat.com>2014-11-14 09:36:18 -0500
committerIlya Dryomov <idryomov@redhat.com>2014-12-17 12:09:52 -0500
commit83701246aee8f83b4b42483051b439fbe96ed47d (patch)
tree45e0392da027efe21b3af9606ed10043ed093d1d /fs
parent3738daa68a5121ad7dd0318bca931e2a6afb0e8c (diff)
ceph: sync read inline data
We can't use getattr to fetch inline data while holding the Fr cap, because it can cause a deadlock. If we need to sync read inline data, drop cap refs first, then use getattr to fetch inline data. Signed-off-by: Yan, Zheng <zyan@redhat.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/ceph/addr.c66
-rw-r--r--fs/ceph/file.c63
2 files changed, 116 insertions, 13 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5d2b88e3ff0b..13413d7440d6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -192,17 +192,30 @@ static int readpage_nounlock(struct file *filp, struct page *page)
192 struct ceph_osd_client *osdc = 192 struct ceph_osd_client *osdc =
193 &ceph_inode_to_client(inode)->client->osdc; 193 &ceph_inode_to_client(inode)->client->osdc;
194 int err = 0; 194 int err = 0;
195 u64 off = page_offset(page);
195 u64 len = PAGE_CACHE_SIZE; 196 u64 len = PAGE_CACHE_SIZE;
196 197
197 err = ceph_readpage_from_fscache(inode, page); 198 if (off >= i_size_read(inode)) {
199 zero_user_segment(page, err, PAGE_CACHE_SIZE);
200 SetPageUptodate(page);
201 return 0;
202 }
198 203
204 /*
205 * Uptodate inline data should have been added into page cache
206 * while getting Fcr caps.
207 */
208 if (ci->i_inline_version != CEPH_INLINE_NONE)
209 return -EINVAL;
210
211 err = ceph_readpage_from_fscache(inode, page);
199 if (err == 0) 212 if (err == 0)
200 goto out; 213 goto out;
201 214
202 dout("readpage inode %p file %p page %p index %lu\n", 215 dout("readpage inode %p file %p page %p index %lu\n",
203 inode, filp, page, page->index); 216 inode, filp, page, page->index);
204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 217 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
205 (u64) page_offset(page), &len, 218 off, &len,
206 ci->i_truncate_seq, ci->i_truncate_size, 219 ci->i_truncate_seq, ci->i_truncate_size,
207 &page, 1, 0); 220 &page, 1, 0);
208 if (err == -ENOENT) 221 if (err == -ENOENT)
@@ -384,6 +397,9 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
384 int rc = 0; 397 int rc = 0;
385 int max = 0; 398 int max = 0;
386 399
400 if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
401 return -EINVAL;
402
387 rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, 403 rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list,
388 &nr_pages); 404 &nr_pages);
389 405
@@ -1219,8 +1235,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1219 want = CEPH_CAP_FILE_CACHE; 1235 want = CEPH_CAP_FILE_CACHE;
1220 while (1) { 1236 while (1) {
1221 got = 0; 1237 got = 0;
1222 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, 1238 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want,
1223 &got, &pinned_page); 1239 -1, &got, &pinned_page);
1224 if (ret == 0) 1240 if (ret == 0)
1225 break; 1241 break;
1226 if (ret != -ERESTARTSYS) { 1242 if (ret != -ERESTARTSYS) {
@@ -1231,7 +1247,11 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1231 dout("filemap_fault %p %llu~%zd got cap refs on %s\n", 1247 dout("filemap_fault %p %llu~%zd got cap refs on %s\n",
1232 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); 1248 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got));
1233 1249
1234 ret = filemap_fault(vma, vmf); 1250 if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
1251 ci->i_inline_version == CEPH_INLINE_NONE)
1252 ret = filemap_fault(vma, vmf);
1253 else
1254 ret = -EAGAIN;
1235 1255
1236 dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", 1256 dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n",
1237 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); 1257 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret);
@@ -1239,6 +1259,42 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1239 page_cache_release(pinned_page); 1259 page_cache_release(pinned_page);
1240 ceph_put_cap_refs(ci, got); 1260 ceph_put_cap_refs(ci, got);
1241 1261
1262 if (ret != -EAGAIN)
1263 return ret;
1264
1265 /* read inline data */
1266 if (off >= PAGE_CACHE_SIZE) {
1267 /* does not support inline data > PAGE_SIZE */
1268 ret = VM_FAULT_SIGBUS;
1269 } else {
1270 int ret1;
1271 struct address_space *mapping = inode->i_mapping;
1272 struct page *page = find_or_create_page(mapping, 0,
1273 mapping_gfp_mask(mapping) &
1274 ~__GFP_FS);
1275 if (!page) {
1276 ret = VM_FAULT_OOM;
1277 goto out;
1278 }
1279 ret1 = __ceph_do_getattr(inode, page,
1280 CEPH_STAT_CAP_INLINE_DATA, true);
1281 if (ret1 < 0 || off >= i_size_read(inode)) {
1282 unlock_page(page);
1283 page_cache_release(page);
1284 ret = VM_FAULT_SIGBUS;
1285 goto out;
1286 }
1287 if (ret1 < PAGE_CACHE_SIZE)
1288 zero_user_segment(page, ret1, PAGE_CACHE_SIZE);
1289 else
1290 flush_dcache_page(page);
1291 SetPageUptodate(page);
1292 vmf->page = page;
1293 ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
1294 }
1295out:
1296 dout("filemap_fault %p %llu~%zd read inline data ret %d\n",
1297 inode, off, (size_t)PAGE_CACHE_SIZE, ret);
1242 return ret; 1298 return ret;
1243} 1299}
1244 1300
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 861b9954a63a..5b092bda9284 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -333,6 +333,11 @@ int ceph_release(struct inode *inode, struct file *file)
333 return 0; 333 return 0;
334} 334}
335 335
/*
 * Follow-up operations recorded in ceph_read_iter()'s retry_op; they are
 * acted on only after the cap refs taken for the read have been dropped.
 */
336enum {
337 CHECK_EOF = 1, /* set by the striped read when pos + left > i_size: re-stat to tell EOF from a hole */
338 READ_INLINE = 2, /* set when i_inline_version != CEPH_INLINE_NONE: fetch the inline data via getattr */
339};
340
336/* 341/*
337 * Read a range of bytes striped over one or more objects. Iterate over 342 * Read a range of bytes striped over one or more objects. Iterate over
338 * objects we stripe over. (That's not atomic, but good enough for now.) 343 * objects we stripe over. (That's not atomic, but good enough for now.)
@@ -412,7 +417,7 @@ more:
412 ret = read; 417 ret = read;
413 /* did we bounce off eof? */ 418 /* did we bounce off eof? */
414 if (pos + left > inode->i_size) 419 if (pos + left > inode->i_size)
415 *checkeof = 1; 420 *checkeof = CHECK_EOF;
416 } 421 }
417 422
418 dout("striped_read returns %d\n", ret); 423 dout("striped_read returns %d\n", ret);
@@ -808,7 +813,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
808 struct page *pinned_page = NULL; 813 struct page *pinned_page = NULL;
809 ssize_t ret; 814 ssize_t ret;
810 int want, got = 0; 815 int want, got = 0;
811 int checkeof = 0, read = 0; 816 int retry_op = 0, read = 0;
812 817
813again: 818again:
814 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", 819 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
@@ -830,8 +835,12 @@ again:
830 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 835 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
831 ceph_cap_string(got)); 836 ceph_cap_string(got));
832 837
833 /* hmm, this isn't really async... */ 838 if (ci->i_inline_version == CEPH_INLINE_NONE) {
834 ret = ceph_sync_read(iocb, to, &checkeof); 839 /* hmm, this isn't really async... */
840 ret = ceph_sync_read(iocb, to, &retry_op);
841 } else {
842 retry_op = READ_INLINE;
843 }
835 } else { 844 } else {
836 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 845 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
837 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 846 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
@@ -846,12 +855,50 @@ again:
846 pinned_page = NULL; 855 pinned_page = NULL;
847 } 856 }
848 ceph_put_cap_refs(ci, got); 857 ceph_put_cap_refs(ci, got);
858 if (retry_op && ret >= 0) {
859 int statret;
860 struct page *page = NULL;
861 loff_t i_size;
862 if (retry_op == READ_INLINE) {
863 page = __page_cache_alloc(GFP_NOFS);
864 if (!page)
865 return -ENOMEM;
866 }
867
868 statret = __ceph_do_getattr(inode, page,
869 CEPH_STAT_CAP_INLINE_DATA, !!page);
870 if (statret < 0) {
871 __free_page(page);
872 if (statret == -ENODATA) {
873 BUG_ON(retry_op != READ_INLINE);
874 goto again;
875 }
876 return statret;
877 }
849 878
850 if (checkeof && ret >= 0) { 879 i_size = i_size_read(inode);
851 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); 880 if (retry_op == READ_INLINE) {
881 /* does not support inline data > PAGE_SIZE */
882 if (i_size > PAGE_CACHE_SIZE) {
883 ret = -EIO;
884 } else if (iocb->ki_pos < i_size) {
885 loff_t end = min_t(loff_t, i_size,
886 iocb->ki_pos + len);
887 if (statret < end)
888 zero_user_segment(page, statret, end);
889 ret = copy_page_to_iter(page,
890 iocb->ki_pos & ~PAGE_MASK,
891 end - iocb->ki_pos, to);
892 iocb->ki_pos += ret;
893 } else {
894 ret = 0;
895 }
896 __free_pages(page, 0);
897 return ret;
898 }
852 899
853 /* hit EOF or hole? */ 900 /* hit EOF or hole? */
854 if (statret == 0 && iocb->ki_pos < inode->i_size && 901 if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
855 ret < len) { 902 ret < len) {
856 dout("sync_read hit hole, ppos %lld < size %lld" 903 dout("sync_read hit hole, ppos %lld < size %lld"
857 ", reading more\n", iocb->ki_pos, 904 ", reading more\n", iocb->ki_pos,
@@ -859,7 +906,7 @@ again:
859 906
860 read += ret; 907 read += ret;
861 len -= ret; 908 len -= ret;
862 checkeof = 0; 909 retry_op = 0;
863 goto again; 910 goto again;
864 } 911 }
865 } 912 }