diff options
author | Yan, Zheng <zyan@redhat.com> | 2017-08-29 23:36:06 -0400 |
---|---|---|
committer | Ilya Dryomov <idryomov@gmail.com> | 2017-09-06 13:56:55 -0400 |
commit | 1f934b00e907527cddb83984d0783cc4a029952a (patch) | |
tree | 3e1f89e89c990448a8dd84d7d00e743d84902679 /fs/ceph/addr.c | |
parent | b072d774664b690768bdf7e068ee95a161e5f107 (diff) |
ceph: properly get capsnap's size in get_oldest_context()
A capsnap's size is set by __ceph_finish_cap_snap(). While the capsnap is
still being written to (capsnap->writing is set), its size is still zero.
In that case, get_oldest_context() should read i_size instead. In addition,
ceph_writepages_start() should re-check the capsnap's size after the dirty
pages have been locked.
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Diffstat (limited to 'fs/ceph/addr.c')
-rw-r--r-- | fs/ceph/addr.c | 137 |
1 files changed, 80 insertions, 57 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b6ac3da9ddab..03a1ee27b33c 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -463,14 +463,20 @@ out: | |||
463 | return rc; | 463 | return rc; |
464 | } | 464 | } |
465 | 465 | ||
466 | struct ceph_writeback_ctl | ||
467 | { | ||
468 | loff_t i_size; | ||
469 | u64 truncate_size; | ||
470 | u32 truncate_seq; | ||
471 | bool size_stable; | ||
472 | }; | ||
473 | |||
466 | /* | 474 | /* |
467 | * Get ref for the oldest snapc for an inode with dirty data... that is, the | 475 | * Get ref for the oldest snapc for an inode with dirty data... that is, the |
468 | * only snap context we are allowed to write back. | 476 | * only snap context we are allowed to write back. |
469 | */ | 477 | */ |
470 | static struct ceph_snap_context *get_oldest_context(struct inode *inode, | 478 | static struct ceph_snap_context * |
471 | loff_t *snap_size, | 479 | get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl) |
472 | u64 *truncate_size, | ||
473 | u32 *truncate_seq) | ||
474 | { | 480 | { |
475 | struct ceph_inode_info *ci = ceph_inode(inode); | 481 | struct ceph_inode_info *ci = ceph_inode(inode); |
476 | struct ceph_snap_context *snapc = NULL; | 482 | struct ceph_snap_context *snapc = NULL; |
@@ -482,12 +488,17 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, | |||
482 | capsnap->context, capsnap->dirty_pages); | 488 | capsnap->context, capsnap->dirty_pages); |
483 | if (capsnap->dirty_pages) { | 489 | if (capsnap->dirty_pages) { |
484 | snapc = ceph_get_snap_context(capsnap->context); | 490 | snapc = ceph_get_snap_context(capsnap->context); |
485 | if (snap_size) | 491 | if (ctl) { |
486 | *snap_size = capsnap->size; | 492 | if (capsnap->writing) { |
487 | if (truncate_size) | 493 | ctl->i_size = i_size_read(inode); |
488 | *truncate_size = capsnap->truncate_size; | 494 | ctl->size_stable = false; |
489 | if (truncate_seq) | 495 | } else { |
490 | *truncate_seq = capsnap->truncate_seq; | 496 | ctl->i_size = capsnap->size; |
497 | ctl->size_stable = true; | ||
498 | } | ||
499 | ctl->truncate_size = capsnap->truncate_size; | ||
500 | ctl->truncate_seq = capsnap->truncate_seq; | ||
501 | } | ||
491 | break; | 502 | break; |
492 | } | 503 | } |
493 | } | 504 | } |
@@ -495,15 +506,44 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, | |||
495 | snapc = ceph_get_snap_context(ci->i_head_snapc); | 506 | snapc = ceph_get_snap_context(ci->i_head_snapc); |
496 | dout(" head snapc %p has %d dirty pages\n", | 507 | dout(" head snapc %p has %d dirty pages\n", |
497 | snapc, ci->i_wrbuffer_ref_head); | 508 | snapc, ci->i_wrbuffer_ref_head); |
498 | if (truncate_size) | 509 | if (ctl) { |
499 | *truncate_size = ci->i_truncate_size; | 510 | ctl->i_size = i_size_read(inode); |
500 | if (truncate_seq) | 511 | ctl->truncate_size = ci->i_truncate_size; |
501 | *truncate_seq = ci->i_truncate_seq; | 512 | ctl->truncate_seq = ci->i_truncate_seq; |
513 | ctl->size_stable = false; | ||
514 | } | ||
502 | } | 515 | } |
503 | spin_unlock(&ci->i_ceph_lock); | 516 | spin_unlock(&ci->i_ceph_lock); |
504 | return snapc; | 517 | return snapc; |
505 | } | 518 | } |
506 | 519 | ||
520 | static u64 get_writepages_data_length(struct inode *inode, | ||
521 | struct page *page, u64 start) | ||
522 | { | ||
523 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
524 | struct ceph_snap_context *snapc = page_snap_context(page); | ||
525 | struct ceph_cap_snap *capsnap = NULL; | ||
526 | u64 end = i_size_read(inode); | ||
527 | |||
528 | if (snapc != ci->i_head_snapc) { | ||
529 | bool found = false; | ||
530 | spin_lock(&ci->i_ceph_lock); | ||
531 | list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { | ||
532 | if (capsnap->context == snapc) { | ||
533 | if (!capsnap->writing) | ||
534 | end = capsnap->size; | ||
535 | found = true; | ||
536 | break; | ||
537 | } | ||
538 | } | ||
539 | spin_unlock(&ci->i_ceph_lock); | ||
540 | WARN_ON(!found); | ||
541 | } | ||
542 | if (end > page_offset(page) + PAGE_SIZE) | ||
543 | end = page_offset(page) + PAGE_SIZE; | ||
544 | return end > start ? end - start : 0; | ||
545 | } | ||
546 | |||
507 | /* | 547 | /* |
508 | * Write a single page, but leave the page locked. | 548 | * Write a single page, but leave the page locked. |
509 | * | 549 | * |
@@ -515,21 +555,17 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
515 | struct inode *inode; | 555 | struct inode *inode; |
516 | struct ceph_inode_info *ci; | 556 | struct ceph_inode_info *ci; |
517 | struct ceph_fs_client *fsc; | 557 | struct ceph_fs_client *fsc; |
518 | struct ceph_osd_client *osdc; | ||
519 | struct ceph_snap_context *snapc, *oldest; | 558 | struct ceph_snap_context *snapc, *oldest; |
520 | loff_t page_off = page_offset(page); | 559 | loff_t page_off = page_offset(page); |
521 | loff_t snap_size = -1; | ||
522 | long writeback_stat; | 560 | long writeback_stat; |
523 | u64 truncate_size; | ||
524 | u32 truncate_seq; | ||
525 | int err, len = PAGE_SIZE; | 561 | int err, len = PAGE_SIZE; |
562 | struct ceph_writeback_ctl ceph_wbc; | ||
526 | 563 | ||
527 | dout("writepage %p idx %lu\n", page, page->index); | 564 | dout("writepage %p idx %lu\n", page, page->index); |
528 | 565 | ||
529 | inode = page->mapping->host; | 566 | inode = page->mapping->host; |
530 | ci = ceph_inode(inode); | 567 | ci = ceph_inode(inode); |
531 | fsc = ceph_inode_to_client(inode); | 568 | fsc = ceph_inode_to_client(inode); |
532 | osdc = &fsc->client->osdc; | ||
533 | 569 | ||
534 | /* verify this is a writeable snap context */ | 570 | /* verify this is a writeable snap context */ |
535 | snapc = page_snap_context(page); | 571 | snapc = page_snap_context(page); |
@@ -537,8 +573,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
537 | dout("writepage %p page %p not dirty?\n", inode, page); | 573 | dout("writepage %p page %p not dirty?\n", inode, page); |
538 | return 0; | 574 | return 0; |
539 | } | 575 | } |
540 | oldest = get_oldest_context(inode, &snap_size, | 576 | oldest = get_oldest_context(inode, &ceph_wbc); |
541 | &truncate_size, &truncate_seq); | ||
542 | if (snapc->seq > oldest->seq) { | 577 | if (snapc->seq > oldest->seq) { |
543 | dout("writepage %p page %p snapc %p not writeable - noop\n", | 578 | dout("writepage %p page %p snapc %p not writeable - noop\n", |
544 | inode, page, snapc); | 579 | inode, page, snapc); |
@@ -550,17 +585,14 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
550 | } | 585 | } |
551 | ceph_put_snap_context(oldest); | 586 | ceph_put_snap_context(oldest); |
552 | 587 | ||
553 | if (snap_size == -1) | ||
554 | snap_size = i_size_read(inode); | ||
555 | |||
556 | /* is this a partial page at end of file? */ | 588 | /* is this a partial page at end of file? */ |
557 | if (page_off >= snap_size) { | 589 | if (page_off >= ceph_wbc.i_size) { |
558 | dout("%p page eof %llu\n", page, snap_size); | 590 | dout("%p page eof %llu\n", page, ceph_wbc.i_size); |
559 | return 0; | 591 | return 0; |
560 | } | 592 | } |
561 | 593 | ||
562 | if (snap_size < page_off + len) | 594 | if (ceph_wbc.i_size < page_off + len) |
563 | len = snap_size - page_off; | 595 | len = ceph_wbc.i_size - page_off; |
564 | 596 | ||
565 | dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n", | 597 | dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n", |
566 | inode, page, page->index, page_off, len, snapc, snapc->seq); | 598 | inode, page, page->index, page_off, len, snapc, snapc->seq); |
@@ -571,10 +603,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
571 | set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); | 603 | set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); |
572 | 604 | ||
573 | set_page_writeback(page); | 605 | set_page_writeback(page); |
574 | err = ceph_osdc_writepages(osdc, ceph_vino(inode), | 606 | err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode), |
575 | &ci->i_layout, snapc, | 607 | &ci->i_layout, snapc, page_off, len, |
576 | page_off, len, | 608 | ceph_wbc.truncate_seq, |
577 | truncate_seq, truncate_size, | 609 | ceph_wbc.truncate_size, |
578 | &inode->i_mtime, &page, 1); | 610 | &inode->i_mtime, &page, 1); |
579 | if (err < 0) { | 611 | if (err < 0) { |
580 | struct writeback_control tmp_wbc; | 612 | struct writeback_control tmp_wbc; |
@@ -745,9 +777,7 @@ static int ceph_writepages_start(struct address_space *mapping, | |||
745 | int rc = 0; | 777 | int rc = 0; |
746 | unsigned int wsize = i_blocksize(inode); | 778 | unsigned int wsize = i_blocksize(inode); |
747 | struct ceph_osd_request *req = NULL; | 779 | struct ceph_osd_request *req = NULL; |
748 | loff_t snap_size, i_size; | 780 | struct ceph_writeback_ctl ceph_wbc; |
749 | u64 truncate_size; | ||
750 | u32 truncate_seq; | ||
751 | 781 | ||
752 | dout("writepages_start %p (mode=%s)\n", inode, | 782 | dout("writepages_start %p (mode=%s)\n", inode, |
753 | wbc->sync_mode == WB_SYNC_NONE ? "NONE" : | 783 | wbc->sync_mode == WB_SYNC_NONE ? "NONE" : |
@@ -786,9 +816,7 @@ static int ceph_writepages_start(struct address_space *mapping, | |||
786 | retry: | 816 | retry: |
787 | /* find oldest snap context with dirty data */ | 817 | /* find oldest snap context with dirty data */ |
788 | ceph_put_snap_context(snapc); | 818 | ceph_put_snap_context(snapc); |
789 | snap_size = -1; | 819 | snapc = get_oldest_context(inode, &ceph_wbc); |
790 | snapc = get_oldest_context(inode, &snap_size, | ||
791 | &truncate_size, &truncate_seq); | ||
792 | if (!snapc) { | 820 | if (!snapc) { |
793 | /* hmm, why does writepages get called when there | 821 | /* hmm, why does writepages get called when there |
794 | is no dirty data? */ | 822 | is no dirty data? */ |
@@ -798,8 +826,6 @@ retry: | |||
798 | dout(" oldest snapc is %p seq %lld (%d snaps)\n", | 826 | dout(" oldest snapc is %p seq %lld (%d snaps)\n", |
799 | snapc, snapc->seq, snapc->num_snaps); | 827 | snapc, snapc->seq, snapc->num_snaps); |
800 | 828 | ||
801 | i_size = i_size_read(inode); | ||
802 | |||
803 | if (last_snapc && snapc != last_snapc) { | 829 | if (last_snapc && snapc != last_snapc) { |
804 | /* if we switched to a newer snapc, restart our scan at the | 830 | /* if we switched to a newer snapc, restart our scan at the |
805 | * start of the original file range. */ | 831 | * start of the original file range. */ |
@@ -865,10 +891,9 @@ get_more_pages: | |||
865 | dout("waiting on writeback %p\n", page); | 891 | dout("waiting on writeback %p\n", page); |
866 | wait_on_page_writeback(page); | 892 | wait_on_page_writeback(page); |
867 | } | 893 | } |
868 | if (page_offset(page) >= | 894 | if (page_offset(page) >= ceph_wbc.i_size) { |
869 | (snap_size == -1 ? i_size : snap_size)) { | 895 | dout("%p page eof %llu\n", |
870 | dout("%p page eof %llu\n", page, | 896 | page, ceph_wbc.i_size); |
871 | (snap_size == -1 ? i_size : snap_size)); | ||
872 | done = 1; | 897 | done = 1; |
873 | unlock_page(page); | 898 | unlock_page(page); |
874 | break; | 899 | break; |
@@ -996,10 +1021,9 @@ new_request: | |||
996 | req = ceph_osdc_new_request(&fsc->client->osdc, | 1021 | req = ceph_osdc_new_request(&fsc->client->osdc, |
997 | &ci->i_layout, vino, | 1022 | &ci->i_layout, vino, |
998 | offset, &len, 0, num_ops, | 1023 | offset, &len, 0, num_ops, |
999 | CEPH_OSD_OP_WRITE, | 1024 | CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, |
1000 | CEPH_OSD_FLAG_WRITE, | 1025 | snapc, ceph_wbc.truncate_seq, |
1001 | snapc, truncate_seq, | 1026 | ceph_wbc.truncate_size, false); |
1002 | truncate_size, false); | ||
1003 | if (IS_ERR(req)) { | 1027 | if (IS_ERR(req)) { |
1004 | req = ceph_osdc_new_request(&fsc->client->osdc, | 1028 | req = ceph_osdc_new_request(&fsc->client->osdc, |
1005 | &ci->i_layout, vino, | 1029 | &ci->i_layout, vino, |
@@ -1008,8 +1032,8 @@ new_request: | |||
1008 | CEPH_OSD_SLAB_OPS), | 1032 | CEPH_OSD_SLAB_OPS), |
1009 | CEPH_OSD_OP_WRITE, | 1033 | CEPH_OSD_OP_WRITE, |
1010 | CEPH_OSD_FLAG_WRITE, | 1034 | CEPH_OSD_FLAG_WRITE, |
1011 | snapc, truncate_seq, | 1035 | snapc, ceph_wbc.truncate_seq, |
1012 | truncate_size, true); | 1036 | ceph_wbc.truncate_size, true); |
1013 | BUG_ON(IS_ERR(req)); | 1037 | BUG_ON(IS_ERR(req)); |
1014 | } | 1038 | } |
1015 | BUG_ON(len < page_offset(pages[locked_pages - 1]) + | 1039 | BUG_ON(len < page_offset(pages[locked_pages - 1]) + |
@@ -1046,14 +1070,15 @@ new_request: | |||
1046 | len += PAGE_SIZE; | 1070 | len += PAGE_SIZE; |
1047 | } | 1071 | } |
1048 | 1072 | ||
1049 | if (snap_size != -1) { | 1073 | if (ceph_wbc.size_stable) { |
1050 | len = min(len, snap_size - offset); | 1074 | len = min(len, ceph_wbc.i_size - offset); |
1051 | } else if (i == locked_pages) { | 1075 | } else if (i == locked_pages) { |
1052 | /* writepages_finish() clears writeback pages | 1076 | /* writepages_finish() clears writeback pages |
1053 | * according to the data length, so make sure | 1077 | * according to the data length, so make sure |
1054 | * data length covers all locked pages */ | 1078 | * data length covers all locked pages */ |
1055 | u64 min_len = len + 1 - PAGE_SIZE; | 1079 | u64 min_len = len + 1 - PAGE_SIZE; |
1056 | len = min(len, (u64)i_size_read(inode) - offset); | 1080 | len = get_writepages_data_length(inode, pages[i - 1], |
1081 | offset); | ||
1057 | len = max(len, min_len); | 1082 | len = max(len, min_len); |
1058 | } | 1083 | } |
1059 | dout("writepages got pages at %llu~%llu\n", offset, len); | 1084 | dout("writepages got pages at %llu~%llu\n", offset, len); |
@@ -1137,8 +1162,7 @@ out: | |||
1137 | static int context_is_writeable_or_written(struct inode *inode, | 1162 | static int context_is_writeable_or_written(struct inode *inode, |
1138 | struct ceph_snap_context *snapc) | 1163 | struct ceph_snap_context *snapc) |
1139 | { | 1164 | { |
1140 | struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, | 1165 | struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); |
1141 | NULL, NULL); | ||
1142 | int ret = !oldest || snapc->seq <= oldest->seq; | 1166 | int ret = !oldest || snapc->seq <= oldest->seq; |
1143 | 1167 | ||
1144 | ceph_put_snap_context(oldest); | 1168 | ceph_put_snap_context(oldest); |
@@ -1183,8 +1207,7 @@ retry_locked: | |||
1183 | * this page is already dirty in another (older) snap | 1207 | * this page is already dirty in another (older) snap |
1184 | * context! is it writeable now? | 1208 | * context! is it writeable now? |
1185 | */ | 1209 | */ |
1186 | oldest = get_oldest_context(inode, NULL, NULL, NULL); | 1210 | oldest = get_oldest_context(inode, NULL); |
1187 | |||
1188 | if (snapc->seq > oldest->seq) { | 1211 | if (snapc->seq > oldest->seq) { |
1189 | ceph_put_snap_context(oldest); | 1212 | ceph_put_snap_context(oldest); |
1190 | dout(" page %p snapc %p not current or oldest\n", | 1213 | dout(" page %p snapc %p not current or oldest\n", |