diff options
author | Yan, Zheng <zyan@redhat.com> | 2017-12-14 22:15:36 -0500 |
---|---|---|
committer | Ilya Dryomov <idryomov@gmail.com> | 2018-01-29 12:36:08 -0500 |
commit | 5d988308283ecf062fa88f20ae05c52cce0bcdca (patch) | |
tree | fdc84f449e10eab2f74123d38fc0f3dcf26e5b54 | |
parent | 5495c2d04f85da09512f5f346ed24dc0261d905d (diff) |
ceph: track read contexts in ceph_file_info
Previously ceph_read_iter() used current->journal_info to pass context info
to ceph_readpages(), so that ceph_readpages() could distinguish read(2)
from readahead(2)/fadvise(2)/madvise(2). The problem is that a page fault
can happen when copying data to userspace memory. The page fault may call
another filesystem's page_mkwrite() if the userspace memory is mapped to a
file. The latter filesystem may also want to use current->journal_info.
The fix is to define an on-stack data structure in ceph_read_iter() and add it
to the context list in ceph_file_info. ceph_readpages() searches the list
to find whether there is a context that belongs to the current thread.
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
-rw-r--r-- | fs/ceph/addr.c | 19 | ||||
-rw-r--r-- | fs/ceph/file.c | 10 | ||||
-rw-r--r-- | fs/ceph/super.h | 46 |
3 files changed, 66 insertions, 9 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index dbf07051aacd..78a1208b878e 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -299,7 +299,8 @@ unlock: | |||
299 | * start an async read(ahead) operation. return nr_pages we submitted | 299 | * start an async read(ahead) operation. return nr_pages we submitted |
300 | * a read for on success, or negative error code. | 300 | * a read for on success, or negative error code. |
301 | */ | 301 | */ |
302 | static int start_read(struct inode *inode, struct list_head *page_list, int max) | 302 | static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, |
303 | struct list_head *page_list, int max) | ||
303 | { | 304 | { |
304 | struct ceph_osd_client *osdc = | 305 | struct ceph_osd_client *osdc = |
305 | &ceph_inode_to_client(inode)->client->osdc; | 306 | &ceph_inode_to_client(inode)->client->osdc; |
@@ -316,7 +317,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) | |||
316 | int got = 0; | 317 | int got = 0; |
317 | int ret = 0; | 318 | int ret = 0; |
318 | 319 | ||
319 | if (!current->journal_info) { | 320 | if (!rw_ctx) { |
320 | /* caller of readpages does not hold buffer and read caps | 321 | /* caller of readpages does not hold buffer and read caps |
321 | * (fadvise, madvise and readahead cases) */ | 322 | * (fadvise, madvise and readahead cases) */ |
322 | int want = CEPH_CAP_FILE_CACHE; | 323 | int want = CEPH_CAP_FILE_CACHE; |
@@ -437,6 +438,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, | |||
437 | { | 438 | { |
438 | struct inode *inode = file_inode(file); | 439 | struct inode *inode = file_inode(file); |
439 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 440 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
441 | struct ceph_file_info *ci = file->private_data; | ||
442 | struct ceph_rw_context *rw_ctx; | ||
440 | int rc = 0; | 443 | int rc = 0; |
441 | int max = 0; | 444 | int max = 0; |
442 | 445 | ||
@@ -449,11 +452,12 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, | |||
449 | if (rc == 0) | 452 | if (rc == 0) |
450 | goto out; | 453 | goto out; |
451 | 454 | ||
455 | rw_ctx = ceph_find_rw_context(ci); | ||
452 | max = fsc->mount_options->rsize >> PAGE_SHIFT; | 456 | max = fsc->mount_options->rsize >> PAGE_SHIFT; |
453 | dout("readpages %p file %p nr_pages %d max %d\n", | 457 | dout("readpages %p file %p ctx %p nr_pages %d max %d\n", |
454 | inode, file, nr_pages, max); | 458 | inode, file, rw_ctx, nr_pages, max); |
455 | while (!list_empty(page_list)) { | 459 | while (!list_empty(page_list)) { |
456 | rc = start_read(inode, page_list, max); | 460 | rc = start_read(inode, rw_ctx, page_list, max); |
457 | if (rc < 0) | 461 | if (rc < 0) |
458 | goto out; | 462 | goto out; |
459 | } | 463 | } |
@@ -1450,9 +1454,10 @@ static int ceph_filemap_fault(struct vm_fault *vmf) | |||
1450 | 1454 | ||
1451 | if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || | 1455 | if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || |
1452 | ci->i_inline_version == CEPH_INLINE_NONE) { | 1456 | ci->i_inline_version == CEPH_INLINE_NONE) { |
1453 | current->journal_info = vma->vm_file; | 1457 | CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); |
1458 | ceph_add_rw_context(fi, &rw_ctx); | ||
1454 | ret = filemap_fault(vmf); | 1459 | ret = filemap_fault(vmf); |
1455 | current->journal_info = NULL; | 1460 | ceph_del_rw_context(fi, &rw_ctx); |
1456 | } else | 1461 | } else |
1457 | ret = -EAGAIN; | 1462 | ret = -EAGAIN; |
1458 | 1463 | ||
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 770dd3b413e4..6639926eed4e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -181,6 +181,10 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) | |||
181 | return -ENOMEM; | 181 | return -ENOMEM; |
182 | } | 182 | } |
183 | cf->fmode = fmode; | 183 | cf->fmode = fmode; |
184 | |||
185 | spin_lock_init(&cf->rw_contexts_lock); | ||
186 | INIT_LIST_HEAD(&cf->rw_contexts); | ||
187 | |||
184 | cf->next_offset = 2; | 188 | cf->next_offset = 2; |
185 | cf->readdir_cache_idx = -1; | 189 | cf->readdir_cache_idx = -1; |
186 | file->private_data = cf; | 190 | file->private_data = cf; |
@@ -464,6 +468,7 @@ int ceph_release(struct inode *inode, struct file *file) | |||
464 | ceph_mdsc_put_request(cf->last_readdir); | 468 | ceph_mdsc_put_request(cf->last_readdir); |
465 | kfree(cf->last_name); | 469 | kfree(cf->last_name); |
466 | kfree(cf->dir_info); | 470 | kfree(cf->dir_info); |
471 | WARN_ON(!list_empty(&cf->rw_contexts)); | ||
467 | kmem_cache_free(ceph_file_cachep, cf); | 472 | kmem_cache_free(ceph_file_cachep, cf); |
468 | 473 | ||
469 | /* wake up anyone waiting for caps on this inode */ | 474 | /* wake up anyone waiting for caps on this inode */ |
@@ -1199,12 +1204,13 @@ again: | |||
1199 | retry_op = READ_INLINE; | 1204 | retry_op = READ_INLINE; |
1200 | } | 1205 | } |
1201 | } else { | 1206 | } else { |
1207 | CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); | ||
1202 | dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", | 1208 | dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", |
1203 | inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, | 1209 | inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, |
1204 | ceph_cap_string(got)); | 1210 | ceph_cap_string(got)); |
1205 | current->journal_info = filp; | 1211 | ceph_add_rw_context(fi, &rw_ctx); |
1206 | ret = generic_file_read_iter(iocb, to); | 1212 | ret = generic_file_read_iter(iocb, to); |
1207 | current->journal_info = NULL; | 1213 | ceph_del_rw_context(fi, &rw_ctx); |
1208 | } | 1214 | } |
1209 | dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", | 1215 | dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", |
1210 | inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); | 1216 | inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 75701c199b2b..601100da738f 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
@@ -669,6 +669,9 @@ struct ceph_file_info { | |||
669 | short fmode; /* initialized on open */ | 669 | short fmode; /* initialized on open */ |
670 | short flags; /* CEPH_F_* */ | 670 | short flags; /* CEPH_F_* */ |
671 | 671 | ||
672 | spinlock_t rw_contexts_lock; | ||
673 | struct list_head rw_contexts; | ||
674 | |||
672 | /* readdir: position within the dir */ | 675 | /* readdir: position within the dir */ |
673 | u32 frag; | 676 | u32 frag; |
674 | struct ceph_mds_request *last_readdir; | 677 | struct ceph_mds_request *last_readdir; |
@@ -685,6 +688,49 @@ struct ceph_file_info { | |||
685 | int dir_info_len; | 688 | int dir_info_len; |
686 | }; | 689 | }; |
687 | 690 | ||
691 | struct ceph_rw_context { | ||
692 | struct list_head list; | ||
693 | struct task_struct *thread; | ||
694 | int caps; | ||
695 | }; | ||
696 | |||
697 | #define CEPH_DEFINE_RW_CONTEXT(_name, _caps) \ | ||
698 | struct ceph_rw_context _name = { \ | ||
699 | .thread = current, \ | ||
700 | .caps = _caps, \ | ||
701 | } | ||
702 | |||
703 | static inline void ceph_add_rw_context(struct ceph_file_info *cf, | ||
704 | struct ceph_rw_context *ctx) | ||
705 | { | ||
706 | spin_lock(&cf->rw_contexts_lock); | ||
707 | list_add(&ctx->list, &cf->rw_contexts); | ||
708 | spin_unlock(&cf->rw_contexts_lock); | ||
709 | } | ||
710 | |||
711 | static inline void ceph_del_rw_context(struct ceph_file_info *cf, | ||
712 | struct ceph_rw_context *ctx) | ||
713 | { | ||
714 | spin_lock(&cf->rw_contexts_lock); | ||
715 | list_del(&ctx->list); | ||
716 | spin_unlock(&cf->rw_contexts_lock); | ||
717 | } | ||
718 | |||
719 | static inline struct ceph_rw_context* | ||
720 | ceph_find_rw_context(struct ceph_file_info *cf) | ||
721 | { | ||
722 | struct ceph_rw_context *ctx, *found = NULL; | ||
723 | spin_lock(&cf->rw_contexts_lock); | ||
724 | list_for_each_entry(ctx, &cf->rw_contexts, list) { | ||
725 | if (ctx->thread == current) { | ||
726 | found = ctx; | ||
727 | break; | ||
728 | } | ||
729 | } | ||
730 | spin_unlock(&cf->rw_contexts_lock); | ||
731 | return found; | ||
732 | } | ||
733 | |||
688 | struct ceph_readdir_cache_control { | 734 | struct ceph_readdir_cache_control { |
689 | struct page *page; | 735 | struct page *page; |
690 | struct dentry **dentries; | 736 | struct dentry **dentries; |