diff options
| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-07-02 14:35:00 -0400 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-07-02 14:35:00 -0400 |
| commit | 0c76c6ba246043bbc5c0f9620a0645ae78217421 (patch) | |
| tree | 644a4db58706c4e97478951f0a3a0087ddf26e5e /fs/ceph | |
| parent | 8688d9540cc6e17df4cba71615e27f04e0378fe6 (diff) | |
| parent | 5a60e87603c4c533492c515b7f62578189b03c9c (diff) | |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil:
"We have a pile of bug fixes from Ilya, including a few patches that
sync up the CRUSH code with the latest from userspace.
There is also a long series from Zheng that fixes various issues with
snapshots, inline data, and directory fsync, some simplification and
improvement in the cap release code, and a rework of the caching of
directory contents.
To top it off there are a few small fixes and cleanups from Benoit and
Hong"
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (40 commits)
rbd: use GFP_NOIO in rbd_obj_request_create()
crush: fix a bug in tree bucket decode
libceph: Fix ceph_tcp_sendpage()'s more boolean usage
libceph: Remove spurious kunmap() of the zero page
rbd: queue_depth map option
rbd: store rbd_options in rbd_device
rbd: terminate rbd_opts_tokens with Opt_err
ceph: fix ceph_writepages_start()
rbd: bump queue_max_segments
ceph: rework dcache readdir
crush: sync up with userspace
crush: fix crash from invalid 'take' argument
ceph: switch some GFP_NOFS memory allocation to GFP_KERNEL
ceph: pre-allocate data structure that tracks caps flushing
ceph: re-send flushing caps (which are revoked) in reconnect stage
ceph: send TID of the oldest pending caps flush to MDS
ceph: track pending caps flushing globally
ceph: track pending caps flushing accurately
libceph: fix wrong name "Ceph filesystem for Linux"
ceph: fix directory fsync
...
Diffstat (limited to 'fs/ceph')
-rw-r--r-- | fs/ceph/acl.c | 4 | ||||
-rw-r--r-- | fs/ceph/addr.c | 308 | ||||
-rw-r--r-- | fs/ceph/caps.c | 836 | ||||
-rw-r--r-- | fs/ceph/dir.c | 383 | ||||
-rw-r--r-- | fs/ceph/file.c | 61 | ||||
-rw-r--r-- | fs/ceph/inode.c | 155 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 425 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 23 | ||||
-rw-r--r-- | fs/ceph/snap.c | 173 | ||||
-rw-r--r-- | fs/ceph/super.c | 25 | ||||
-rw-r--r-- | fs/ceph/super.h | 125 | ||||
-rw-r--r-- | fs/ceph/xattr.c | 65 |
12 files changed, 1689 insertions, 894 deletions
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 64fa248343f6..8f84646f10e9 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c | |||
@@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, | |||
187 | val_size2 = posix_acl_xattr_size(default_acl->a_count); | 187 | val_size2 = posix_acl_xattr_size(default_acl->a_count); |
188 | 188 | ||
189 | err = -ENOMEM; | 189 | err = -ENOMEM; |
190 | tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS); | 190 | tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); |
191 | if (!tmp_buf) | 191 | if (!tmp_buf) |
192 | goto out_err; | 192 | goto out_err; |
193 | pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS); | 193 | pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL); |
194 | if (!pagelist) | 194 | if (!pagelist) |
195 | goto out_err; | 195 | goto out_err; |
196 | ceph_pagelist_init(pagelist); | 196 | ceph_pagelist_init(pagelist); |
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e162bcd105ee..890c50971a69 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -87,17 +87,21 @@ static int ceph_set_page_dirty(struct page *page) | |||
87 | inode = mapping->host; | 87 | inode = mapping->host; |
88 | ci = ceph_inode(inode); | 88 | ci = ceph_inode(inode); |
89 | 89 | ||
90 | /* | ||
91 | * Note that we're grabbing a snapc ref here without holding | ||
92 | * any locks! | ||
93 | */ | ||
94 | snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); | ||
95 | |||
96 | /* dirty the head */ | 90 | /* dirty the head */ |
97 | spin_lock(&ci->i_ceph_lock); | 91 | spin_lock(&ci->i_ceph_lock); |
98 | if (ci->i_head_snapc == NULL) | 92 | BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference |
99 | ci->i_head_snapc = ceph_get_snap_context(snapc); | 93 | if (__ceph_have_pending_cap_snap(ci)) { |
100 | ++ci->i_wrbuffer_ref_head; | 94 | struct ceph_cap_snap *capsnap = |
95 | list_last_entry(&ci->i_cap_snaps, | ||
96 | struct ceph_cap_snap, | ||
97 | ci_item); | ||
98 | snapc = ceph_get_snap_context(capsnap->context); | ||
99 | capsnap->dirty_pages++; | ||
100 | } else { | ||
101 | BUG_ON(!ci->i_head_snapc); | ||
102 | snapc = ceph_get_snap_context(ci->i_head_snapc); | ||
103 | ++ci->i_wrbuffer_ref_head; | ||
104 | } | ||
101 | if (ci->i_wrbuffer_ref == 0) | 105 | if (ci->i_wrbuffer_ref == 0) |
102 | ihold(inode); | 106 | ihold(inode); |
103 | ++ci->i_wrbuffer_ref; | 107 | ++ci->i_wrbuffer_ref; |
@@ -346,7 +350,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) | |||
346 | 350 | ||
347 | /* build page vector */ | 351 | /* build page vector */ |
348 | nr_pages = calc_pages_for(0, len); | 352 | nr_pages = calc_pages_for(0, len); |
349 | pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); | 353 | pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL); |
350 | ret = -ENOMEM; | 354 | ret = -ENOMEM; |
351 | if (!pages) | 355 | if (!pages) |
352 | goto out; | 356 | goto out; |
@@ -358,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) | |||
358 | dout("start_read %p adding %p idx %lu\n", inode, page, | 362 | dout("start_read %p adding %p idx %lu\n", inode, page, |
359 | page->index); | 363 | page->index); |
360 | if (add_to_page_cache_lru(page, &inode->i_data, page->index, | 364 | if (add_to_page_cache_lru(page, &inode->i_data, page->index, |
361 | GFP_NOFS)) { | 365 | GFP_KERNEL)) { |
362 | ceph_fscache_uncache_page(inode, page); | 366 | ceph_fscache_uncache_page(inode, page); |
363 | page_cache_release(page); | 367 | page_cache_release(page); |
364 | dout("start_read %p add_to_page_cache failed %p\n", | 368 | dout("start_read %p add_to_page_cache failed %p\n", |
@@ -436,7 +440,7 @@ out: | |||
436 | * only snap context we are allowed to write back. | 440 | * only snap context we are allowed to write back. |
437 | */ | 441 | */ |
438 | static struct ceph_snap_context *get_oldest_context(struct inode *inode, | 442 | static struct ceph_snap_context *get_oldest_context(struct inode *inode, |
439 | u64 *snap_size) | 443 | loff_t *snap_size) |
440 | { | 444 | { |
441 | struct ceph_inode_info *ci = ceph_inode(inode); | 445 | struct ceph_inode_info *ci = ceph_inode(inode); |
442 | struct ceph_snap_context *snapc = NULL; | 446 | struct ceph_snap_context *snapc = NULL; |
@@ -476,8 +480,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
476 | struct ceph_osd_client *osdc; | 480 | struct ceph_osd_client *osdc; |
477 | struct ceph_snap_context *snapc, *oldest; | 481 | struct ceph_snap_context *snapc, *oldest; |
478 | loff_t page_off = page_offset(page); | 482 | loff_t page_off = page_offset(page); |
483 | loff_t snap_size = -1; | ||
479 | long writeback_stat; | 484 | long writeback_stat; |
480 | u64 truncate_size, snap_size = 0; | 485 | u64 truncate_size; |
481 | u32 truncate_seq; | 486 | u32 truncate_seq; |
482 | int err = 0, len = PAGE_CACHE_SIZE; | 487 | int err = 0, len = PAGE_CACHE_SIZE; |
483 | 488 | ||
@@ -512,7 +517,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
512 | spin_lock(&ci->i_ceph_lock); | 517 | spin_lock(&ci->i_ceph_lock); |
513 | truncate_seq = ci->i_truncate_seq; | 518 | truncate_seq = ci->i_truncate_seq; |
514 | truncate_size = ci->i_truncate_size; | 519 | truncate_size = ci->i_truncate_size; |
515 | if (!snap_size) | 520 | if (snap_size == -1) |
516 | snap_size = i_size_read(inode); | 521 | snap_size = i_size_read(inode); |
517 | spin_unlock(&ci->i_ceph_lock); | 522 | spin_unlock(&ci->i_ceph_lock); |
518 | 523 | ||
@@ -695,7 +700,8 @@ static int ceph_writepages_start(struct address_space *mapping, | |||
695 | unsigned wsize = 1 << inode->i_blkbits; | 700 | unsigned wsize = 1 << inode->i_blkbits; |
696 | struct ceph_osd_request *req = NULL; | 701 | struct ceph_osd_request *req = NULL; |
697 | int do_sync = 0; | 702 | int do_sync = 0; |
698 | u64 truncate_size, snap_size; | 703 | loff_t snap_size, i_size; |
704 | u64 truncate_size; | ||
699 | u32 truncate_seq; | 705 | u32 truncate_seq; |
700 | 706 | ||
701 | /* | 707 | /* |
@@ -741,7 +747,7 @@ static int ceph_writepages_start(struct address_space *mapping, | |||
741 | retry: | 747 | retry: |
742 | /* find oldest snap context with dirty data */ | 748 | /* find oldest snap context with dirty data */ |
743 | ceph_put_snap_context(snapc); | 749 | ceph_put_snap_context(snapc); |
744 | snap_size = 0; | 750 | snap_size = -1; |
745 | snapc = get_oldest_context(inode, &snap_size); | 751 | snapc = get_oldest_context(inode, &snap_size); |
746 | if (!snapc) { | 752 | if (!snapc) { |
747 | /* hmm, why does writepages get called when there | 753 | /* hmm, why does writepages get called when there |
@@ -749,16 +755,13 @@ retry: | |||
749 | dout(" no snap context with dirty data?\n"); | 755 | dout(" no snap context with dirty data?\n"); |
750 | goto out; | 756 | goto out; |
751 | } | 757 | } |
752 | if (snap_size == 0) | ||
753 | snap_size = i_size_read(inode); | ||
754 | dout(" oldest snapc is %p seq %lld (%d snaps)\n", | 758 | dout(" oldest snapc is %p seq %lld (%d snaps)\n", |
755 | snapc, snapc->seq, snapc->num_snaps); | 759 | snapc, snapc->seq, snapc->num_snaps); |
756 | 760 | ||
757 | spin_lock(&ci->i_ceph_lock); | 761 | spin_lock(&ci->i_ceph_lock); |
758 | truncate_seq = ci->i_truncate_seq; | 762 | truncate_seq = ci->i_truncate_seq; |
759 | truncate_size = ci->i_truncate_size; | 763 | truncate_size = ci->i_truncate_size; |
760 | if (!snap_size) | 764 | i_size = i_size_read(inode); |
761 | snap_size = i_size_read(inode); | ||
762 | spin_unlock(&ci->i_ceph_lock); | 765 | spin_unlock(&ci->i_ceph_lock); |
763 | 766 | ||
764 | if (last_snapc && snapc != last_snapc) { | 767 | if (last_snapc && snapc != last_snapc) { |
@@ -828,8 +831,10 @@ get_more_pages: | |||
828 | dout("waiting on writeback %p\n", page); | 831 | dout("waiting on writeback %p\n", page); |
829 | wait_on_page_writeback(page); | 832 | wait_on_page_writeback(page); |
830 | } | 833 | } |
831 | if (page_offset(page) >= snap_size) { | 834 | if (page_offset(page) >= |
832 | dout("%p page eof %llu\n", page, snap_size); | 835 | (snap_size == -1 ? i_size : snap_size)) { |
836 | dout("%p page eof %llu\n", page, | ||
837 | (snap_size == -1 ? i_size : snap_size)); | ||
833 | done = 1; | 838 | done = 1; |
834 | unlock_page(page); | 839 | unlock_page(page); |
835 | break; | 840 | break; |
@@ -884,7 +889,8 @@ get_more_pages: | |||
884 | } | 889 | } |
885 | 890 | ||
886 | if (do_sync) | 891 | if (do_sync) |
887 | osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); | 892 | osd_req_op_init(req, 1, |
893 | CEPH_OSD_OP_STARTSYNC, 0); | ||
888 | 894 | ||
889 | req->r_callback = writepages_finish; | 895 | req->r_callback = writepages_finish; |
890 | req->r_inode = inode; | 896 | req->r_inode = inode; |
@@ -944,10 +950,18 @@ get_more_pages: | |||
944 | } | 950 | } |
945 | 951 | ||
946 | /* Format the osd request message and submit the write */ | 952 | /* Format the osd request message and submit the write */ |
947 | |||
948 | offset = page_offset(pages[0]); | 953 | offset = page_offset(pages[0]); |
949 | len = min(snap_size - offset, | 954 | len = (u64)locked_pages << PAGE_CACHE_SHIFT; |
950 | (u64)locked_pages << PAGE_CACHE_SHIFT); | 955 | if (snap_size == -1) { |
956 | len = min(len, (u64)i_size_read(inode) - offset); | ||
957 | /* writepages_finish() clears writeback pages | ||
958 | * according to the data length, so make sure | ||
959 | * data length covers all locked pages */ | ||
960 | len = max(len, 1 + | ||
961 | ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT)); | ||
962 | } else { | ||
963 | len = min(len, snap_size - offset); | ||
964 | } | ||
951 | dout("writepages got %d pages at %llu~%llu\n", | 965 | dout("writepages got %d pages at %llu~%llu\n", |
952 | locked_pages, offset, len); | 966 | locked_pages, offset, len); |
953 | 967 | ||
@@ -1032,7 +1046,6 @@ static int ceph_update_writeable_page(struct file *file, | |||
1032 | { | 1046 | { |
1033 | struct inode *inode = file_inode(file); | 1047 | struct inode *inode = file_inode(file); |
1034 | struct ceph_inode_info *ci = ceph_inode(inode); | 1048 | struct ceph_inode_info *ci = ceph_inode(inode); |
1035 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; | ||
1036 | loff_t page_off = pos & PAGE_CACHE_MASK; | 1049 | loff_t page_off = pos & PAGE_CACHE_MASK; |
1037 | int pos_in_page = pos & ~PAGE_CACHE_MASK; | 1050 | int pos_in_page = pos & ~PAGE_CACHE_MASK; |
1038 | int end_in_page = pos_in_page + len; | 1051 | int end_in_page = pos_in_page + len; |
@@ -1044,10 +1057,6 @@ retry_locked: | |||
1044 | /* writepages currently holds page lock, but if we change that later, */ | 1057 | /* writepages currently holds page lock, but if we change that later, */ |
1045 | wait_on_page_writeback(page); | 1058 | wait_on_page_writeback(page); |
1046 | 1059 | ||
1047 | /* check snap context */ | ||
1048 | BUG_ON(!ci->i_snap_realm); | ||
1049 | down_read(&mdsc->snap_rwsem); | ||
1050 | BUG_ON(!ci->i_snap_realm->cached_context); | ||
1051 | snapc = page_snap_context(page); | 1060 | snapc = page_snap_context(page); |
1052 | if (snapc && snapc != ci->i_head_snapc) { | 1061 | if (snapc && snapc != ci->i_head_snapc) { |
1053 | /* | 1062 | /* |
@@ -1055,7 +1064,6 @@ retry_locked: | |||
1055 | * context! is it writeable now? | 1064 | * context! is it writeable now? |
1056 | */ | 1065 | */ |
1057 | oldest = get_oldest_context(inode, NULL); | 1066 | oldest = get_oldest_context(inode, NULL); |
1058 | up_read(&mdsc->snap_rwsem); | ||
1059 | 1067 | ||
1060 | if (snapc->seq > oldest->seq) { | 1068 | if (snapc->seq > oldest->seq) { |
1061 | ceph_put_snap_context(oldest); | 1069 | ceph_put_snap_context(oldest); |
@@ -1112,7 +1120,6 @@ retry_locked: | |||
1112 | } | 1120 | } |
1113 | 1121 | ||
1114 | /* we need to read it. */ | 1122 | /* we need to read it. */ |
1115 | up_read(&mdsc->snap_rwsem); | ||
1116 | r = readpage_nounlock(file, page); | 1123 | r = readpage_nounlock(file, page); |
1117 | if (r < 0) | 1124 | if (r < 0) |
1118 | goto fail_nosnap; | 1125 | goto fail_nosnap; |
@@ -1157,16 +1164,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, | |||
1157 | 1164 | ||
1158 | /* | 1165 | /* |
1159 | * we don't do anything in here that simple_write_end doesn't do | 1166 | * we don't do anything in here that simple_write_end doesn't do |
1160 | * except adjust dirty page accounting and drop read lock on | 1167 | * except adjust dirty page accounting |
1161 | * mdsc->snap_rwsem. | ||
1162 | */ | 1168 | */ |
1163 | static int ceph_write_end(struct file *file, struct address_space *mapping, | 1169 | static int ceph_write_end(struct file *file, struct address_space *mapping, |
1164 | loff_t pos, unsigned len, unsigned copied, | 1170 | loff_t pos, unsigned len, unsigned copied, |
1165 | struct page *page, void *fsdata) | 1171 | struct page *page, void *fsdata) |
1166 | { | 1172 | { |
1167 | struct inode *inode = file_inode(file); | 1173 | struct inode *inode = file_inode(file); |
1168 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | ||
1169 | struct ceph_mds_client *mdsc = fsc->mdsc; | ||
1170 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); | 1174 | unsigned from = pos & (PAGE_CACHE_SIZE - 1); |
1171 | int check_cap = 0; | 1175 | int check_cap = 0; |
1172 | 1176 | ||
@@ -1188,7 +1192,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, | |||
1188 | set_page_dirty(page); | 1192 | set_page_dirty(page); |
1189 | 1193 | ||
1190 | unlock_page(page); | 1194 | unlock_page(page); |
1191 | up_read(&mdsc->snap_rwsem); | ||
1192 | page_cache_release(page); | 1195 | page_cache_release(page); |
1193 | 1196 | ||
1194 | if (check_cap) | 1197 | if (check_cap) |
@@ -1314,13 +1317,17 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1314 | struct inode *inode = file_inode(vma->vm_file); | 1317 | struct inode *inode = file_inode(vma->vm_file); |
1315 | struct ceph_inode_info *ci = ceph_inode(inode); | 1318 | struct ceph_inode_info *ci = ceph_inode(inode); |
1316 | struct ceph_file_info *fi = vma->vm_file->private_data; | 1319 | struct ceph_file_info *fi = vma->vm_file->private_data; |
1317 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; | 1320 | struct ceph_cap_flush *prealloc_cf; |
1318 | struct page *page = vmf->page; | 1321 | struct page *page = vmf->page; |
1319 | loff_t off = page_offset(page); | 1322 | loff_t off = page_offset(page); |
1320 | loff_t size = i_size_read(inode); | 1323 | loff_t size = i_size_read(inode); |
1321 | size_t len; | 1324 | size_t len; |
1322 | int want, got, ret; | 1325 | int want, got, ret; |
1323 | 1326 | ||
1327 | prealloc_cf = ceph_alloc_cap_flush(); | ||
1328 | if (!prealloc_cf) | ||
1329 | return VM_FAULT_SIGBUS; | ||
1330 | |||
1324 | if (ci->i_inline_version != CEPH_INLINE_NONE) { | 1331 | if (ci->i_inline_version != CEPH_INLINE_NONE) { |
1325 | struct page *locked_page = NULL; | 1332 | struct page *locked_page = NULL; |
1326 | if (off == 0) { | 1333 | if (off == 0) { |
@@ -1330,8 +1337,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1330 | ret = ceph_uninline_data(vma->vm_file, locked_page); | 1337 | ret = ceph_uninline_data(vma->vm_file, locked_page); |
1331 | if (locked_page) | 1338 | if (locked_page) |
1332 | unlock_page(locked_page); | 1339 | unlock_page(locked_page); |
1333 | if (ret < 0) | 1340 | if (ret < 0) { |
1334 | return VM_FAULT_SIGBUS; | 1341 | ret = VM_FAULT_SIGBUS; |
1342 | goto out_free; | ||
1343 | } | ||
1335 | } | 1344 | } |
1336 | 1345 | ||
1337 | if (off + PAGE_CACHE_SIZE <= size) | 1346 | if (off + PAGE_CACHE_SIZE <= size) |
@@ -1353,7 +1362,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1353 | break; | 1362 | break; |
1354 | if (ret != -ERESTARTSYS) { | 1363 | if (ret != -ERESTARTSYS) { |
1355 | WARN_ON(1); | 1364 | WARN_ON(1); |
1356 | return VM_FAULT_SIGBUS; | 1365 | ret = VM_FAULT_SIGBUS; |
1366 | goto out_free; | ||
1357 | } | 1367 | } |
1358 | } | 1368 | } |
1359 | dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", | 1369 | dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", |
@@ -1373,7 +1383,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1373 | if (ret == 0) { | 1383 | if (ret == 0) { |
1374 | /* success. we'll keep the page locked. */ | 1384 | /* success. we'll keep the page locked. */ |
1375 | set_page_dirty(page); | 1385 | set_page_dirty(page); |
1376 | up_read(&mdsc->snap_rwsem); | ||
1377 | ret = VM_FAULT_LOCKED; | 1386 | ret = VM_FAULT_LOCKED; |
1378 | } else { | 1387 | } else { |
1379 | if (ret == -ENOMEM) | 1388 | if (ret == -ENOMEM) |
@@ -1389,7 +1398,8 @@ out: | |||
1389 | int dirty; | 1398 | int dirty; |
1390 | spin_lock(&ci->i_ceph_lock); | 1399 | spin_lock(&ci->i_ceph_lock); |
1391 | ci->i_inline_version = CEPH_INLINE_NONE; | 1400 | ci->i_inline_version = CEPH_INLINE_NONE; |
1392 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | 1401 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, |
1402 | &prealloc_cf); | ||
1393 | spin_unlock(&ci->i_ceph_lock); | 1403 | spin_unlock(&ci->i_ceph_lock); |
1394 | if (dirty) | 1404 | if (dirty) |
1395 | __mark_inode_dirty(inode, dirty); | 1405 | __mark_inode_dirty(inode, dirty); |
@@ -1398,6 +1408,8 @@ out: | |||
1398 | dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", | 1408 | dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", |
1399 | inode, off, len, ceph_cap_string(got), ret); | 1409 | inode, off, len, ceph_cap_string(got), ret); |
1400 | ceph_put_cap_refs(ci, got); | 1410 | ceph_put_cap_refs(ci, got); |
1411 | out_free: | ||
1412 | ceph_free_cap_flush(prealloc_cf); | ||
1401 | 1413 | ||
1402 | return ret; | 1414 | return ret; |
1403 | } | 1415 | } |
@@ -1509,8 +1521,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) | |||
1509 | ceph_vino(inode), 0, &len, 0, 1, | 1521 | ceph_vino(inode), 0, &len, 0, 1, |
1510 | CEPH_OSD_OP_CREATE, | 1522 | CEPH_OSD_OP_CREATE, |
1511 | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, | 1523 | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, |
1512 | ci->i_snap_realm->cached_context, | 1524 | ceph_empty_snapc, 0, 0, false); |
1513 | 0, 0, false); | ||
1514 | if (IS_ERR(req)) { | 1525 | if (IS_ERR(req)) { |
1515 | err = PTR_ERR(req); | 1526 | err = PTR_ERR(req); |
1516 | goto out; | 1527 | goto out; |
@@ -1528,7 +1539,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) | |||
1528 | ceph_vino(inode), 0, &len, 1, 3, | 1539 | ceph_vino(inode), 0, &len, 1, 3, |
1529 | CEPH_OSD_OP_WRITE, | 1540 | CEPH_OSD_OP_WRITE, |
1530 | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, | 1541 | CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, |
1531 | ci->i_snap_realm->cached_context, | 1542 | ceph_empty_snapc, |
1532 | ci->i_truncate_seq, ci->i_truncate_size, | 1543 | ci->i_truncate_seq, ci->i_truncate_size, |
1533 | false); | 1544 | false); |
1534 | if (IS_ERR(req)) { | 1545 | if (IS_ERR(req)) { |
@@ -1597,3 +1608,206 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) | |||
1597 | vma->vm_ops = &ceph_vmops; | 1608 | vma->vm_ops = &ceph_vmops; |
1598 | return 0; | 1609 | return 0; |
1599 | } | 1610 | } |
1611 | |||
1612 | enum { | ||
1613 | POOL_READ = 1, | ||
1614 | POOL_WRITE = 2, | ||
1615 | }; | ||
1616 | |||
1617 | static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) | ||
1618 | { | ||
1619 | struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); | ||
1620 | struct ceph_mds_client *mdsc = fsc->mdsc; | ||
1621 | struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; | ||
1622 | struct rb_node **p, *parent; | ||
1623 | struct ceph_pool_perm *perm; | ||
1624 | struct page **pages; | ||
1625 | int err = 0, err2 = 0, have = 0; | ||
1626 | |||
1627 | down_read(&mdsc->pool_perm_rwsem); | ||
1628 | p = &mdsc->pool_perm_tree.rb_node; | ||
1629 | while (*p) { | ||
1630 | perm = rb_entry(*p, struct ceph_pool_perm, node); | ||
1631 | if (pool < perm->pool) | ||
1632 | p = &(*p)->rb_left; | ||
1633 | else if (pool > perm->pool) | ||
1634 | p = &(*p)->rb_right; | ||
1635 | else { | ||
1636 | have = perm->perm; | ||
1637 | break; | ||
1638 | } | ||
1639 | } | ||
1640 | up_read(&mdsc->pool_perm_rwsem); | ||
1641 | if (*p) | ||
1642 | goto out; | ||
1643 | |||
1644 | dout("__ceph_pool_perm_get pool %u no perm cached\n", pool); | ||
1645 | |||
1646 | down_write(&mdsc->pool_perm_rwsem); | ||
1647 | parent = NULL; | ||
1648 | while (*p) { | ||
1649 | parent = *p; | ||
1650 | perm = rb_entry(parent, struct ceph_pool_perm, node); | ||
1651 | if (pool < perm->pool) | ||
1652 | p = &(*p)->rb_left; | ||
1653 | else if (pool > perm->pool) | ||
1654 | p = &(*p)->rb_right; | ||
1655 | else { | ||
1656 | have = perm->perm; | ||
1657 | break; | ||
1658 | } | ||
1659 | } | ||
1660 | if (*p) { | ||
1661 | up_write(&mdsc->pool_perm_rwsem); | ||
1662 | goto out; | ||
1663 | } | ||
1664 | |||
1665 | rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, | ||
1666 | ceph_empty_snapc, | ||
1667 | 1, false, GFP_NOFS); | ||
1668 | if (!rd_req) { | ||
1669 | err = -ENOMEM; | ||
1670 | goto out_unlock; | ||
1671 | } | ||
1672 | |||
1673 | rd_req->r_flags = CEPH_OSD_FLAG_READ; | ||
1674 | osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); | ||
1675 | rd_req->r_base_oloc.pool = pool; | ||
1676 | snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), | ||
1677 | "%llx.00000000", ci->i_vino.ino); | ||
1678 | rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); | ||
1679 | |||
1680 | wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, | ||
1681 | ceph_empty_snapc, | ||
1682 | 1, false, GFP_NOFS); | ||
1683 | if (!wr_req) { | ||
1684 | err = -ENOMEM; | ||
1685 | goto out_unlock; | ||
1686 | } | ||
1687 | |||
1688 | wr_req->r_flags = CEPH_OSD_FLAG_WRITE | | ||
1689 | CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; | ||
1690 | osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); | ||
1691 | wr_req->r_base_oloc.pool = pool; | ||
1692 | wr_req->r_base_oid = rd_req->r_base_oid; | ||
1693 | |||
1694 | /* one page should be large enough for STAT data */ | ||
1695 | pages = ceph_alloc_page_vector(1, GFP_KERNEL); | ||
1696 | if (IS_ERR(pages)) { | ||
1697 | err = PTR_ERR(pages); | ||
1698 | goto out_unlock; | ||
1699 | } | ||
1700 | |||
1701 | osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, | ||
1702 | 0, false, true); | ||
1703 | ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, | ||
1704 | &ci->vfs_inode.i_mtime); | ||
1705 | err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); | ||
1706 | |||
1707 | ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, | ||
1708 | &ci->vfs_inode.i_mtime); | ||
1709 | err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); | ||
1710 | |||
1711 | if (!err) | ||
1712 | err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); | ||
1713 | if (!err2) | ||
1714 | err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); | ||
1715 | |||
1716 | if (err >= 0 || err == -ENOENT) | ||
1717 | have |= POOL_READ; | ||
1718 | else if (err != -EPERM) | ||
1719 | goto out_unlock; | ||
1720 | |||
1721 | if (err2 == 0 || err2 == -EEXIST) | ||
1722 | have |= POOL_WRITE; | ||
1723 | else if (err2 != -EPERM) { | ||
1724 | err = err2; | ||
1725 | goto out_unlock; | ||
1726 | } | ||
1727 | |||
1728 | perm = kmalloc(sizeof(*perm), GFP_NOFS); | ||
1729 | if (!perm) { | ||
1730 | err = -ENOMEM; | ||
1731 | goto out_unlock; | ||
1732 | } | ||
1733 | |||
1734 | perm->pool = pool; | ||
1735 | perm->perm = have; | ||
1736 | rb_link_node(&perm->node, parent, p); | ||
1737 | rb_insert_color(&perm->node, &mdsc->pool_perm_tree); | ||
1738 | err = 0; | ||
1739 | out_unlock: | ||
1740 | up_write(&mdsc->pool_perm_rwsem); | ||
1741 | |||
1742 | if (rd_req) | ||
1743 | ceph_osdc_put_request(rd_req); | ||
1744 | if (wr_req) | ||
1745 | ceph_osdc_put_request(wr_req); | ||
1746 | out: | ||
1747 | if (!err) | ||
1748 | err = have; | ||
1749 | dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err); | ||
1750 | return err; | ||
1751 | } | ||
1752 | |||
1753 | int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) | ||
1754 | { | ||
1755 | u32 pool; | ||
1756 | int ret, flags; | ||
1757 | |||
1758 | if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), | ||
1759 | NOPOOLPERM)) | ||
1760 | return 0; | ||
1761 | |||
1762 | spin_lock(&ci->i_ceph_lock); | ||
1763 | flags = ci->i_ceph_flags; | ||
1764 | pool = ceph_file_layout_pg_pool(ci->i_layout); | ||
1765 | spin_unlock(&ci->i_ceph_lock); | ||
1766 | check: | ||
1767 | if (flags & CEPH_I_POOL_PERM) { | ||
1768 | if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { | ||
1769 | dout("ceph_pool_perm_check pool %u no read perm\n", | ||
1770 | pool); | ||
1771 | return -EPERM; | ||
1772 | } | ||
1773 | if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { | ||
1774 | dout("ceph_pool_perm_check pool %u no write perm\n", | ||
1775 | pool); | ||
1776 | return -EPERM; | ||
1777 | } | ||
1778 | return 0; | ||
1779 | } | ||
1780 | |||
1781 | ret = __ceph_pool_perm_get(ci, pool); | ||
1782 | if (ret < 0) | ||
1783 | return ret; | ||
1784 | |||
1785 | flags = CEPH_I_POOL_PERM; | ||
1786 | if (ret & POOL_READ) | ||
1787 | flags |= CEPH_I_POOL_RD; | ||
1788 | if (ret & POOL_WRITE) | ||
1789 | flags |= CEPH_I_POOL_WR; | ||
1790 | |||
1791 | spin_lock(&ci->i_ceph_lock); | ||
1792 | if (pool == ceph_file_layout_pg_pool(ci->i_layout)) { | ||
1793 | ci->i_ceph_flags = flags; | ||
1794 | } else { | ||
1795 | pool = ceph_file_layout_pg_pool(ci->i_layout); | ||
1796 | flags = ci->i_ceph_flags; | ||
1797 | } | ||
1798 | spin_unlock(&ci->i_ceph_lock); | ||
1799 | goto check; | ||
1800 | } | ||
1801 | |||
1802 | void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc) | ||
1803 | { | ||
1804 | struct ceph_pool_perm *perm; | ||
1805 | struct rb_node *n; | ||
1806 | |||
1807 | while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) { | ||
1808 | n = rb_first(&mdsc->pool_perm_tree); | ||
1809 | perm = rb_entry(n, struct ceph_pool_perm, node); | ||
1810 | rb_erase(n, &mdsc->pool_perm_tree); | ||
1811 | kfree(perm); | ||
1812 | } | ||
1813 | } | ||
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index be5ea6af8366..dc10c9dd36c1 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c | |||
@@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci) | |||
833 | used |= CEPH_CAP_PIN; | 833 | used |= CEPH_CAP_PIN; |
834 | if (ci->i_rd_ref) | 834 | if (ci->i_rd_ref) |
835 | used |= CEPH_CAP_FILE_RD; | 835 | used |= CEPH_CAP_FILE_RD; |
836 | if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) | 836 | if (ci->i_rdcache_ref || |
837 | (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */ | ||
838 | ci->vfs_inode.i_data.nrpages)) | ||
837 | used |= CEPH_CAP_FILE_CACHE; | 839 | used |= CEPH_CAP_FILE_CACHE; |
838 | if (ci->i_wr_ref) | 840 | if (ci->i_wr_ref) |
839 | used |= CEPH_CAP_FILE_WR; | 841 | used |= CEPH_CAP_FILE_WR; |
@@ -926,16 +928,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) | |||
926 | 928 | ||
927 | /* remove from session list */ | 929 | /* remove from session list */ |
928 | spin_lock(&session->s_cap_lock); | 930 | spin_lock(&session->s_cap_lock); |
929 | /* | ||
930 | * s_cap_reconnect is protected by s_cap_lock. no one changes | ||
931 | * s_cap_gen while session is in the reconnect state. | ||
932 | */ | ||
933 | if (queue_release && | ||
934 | (!session->s_cap_reconnect || | ||
935 | cap->cap_gen == session->s_cap_gen)) | ||
936 | __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, | ||
937 | cap->mseq, cap->issue_seq); | ||
938 | |||
939 | if (session->s_cap_iterator == cap) { | 931 | if (session->s_cap_iterator == cap) { |
940 | /* not yet, we are iterating over this very cap */ | 932 | /* not yet, we are iterating over this very cap */ |
941 | dout("__ceph_remove_cap delaying %p removal from session %p\n", | 933 | dout("__ceph_remove_cap delaying %p removal from session %p\n", |
@@ -948,6 +940,25 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) | |||
948 | } | 940 | } |
949 | /* protect backpointer with s_cap_lock: see iterate_session_caps */ | 941 | /* protect backpointer with s_cap_lock: see iterate_session_caps */ |
950 | cap->ci = NULL; | 942 | cap->ci = NULL; |
943 | |||
944 | /* | ||
945 | * s_cap_reconnect is protected by s_cap_lock. no one changes | ||
946 | * s_cap_gen while session is in the reconnect state. | ||
947 | */ | ||
948 | if (queue_release && | ||
949 | (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { | ||
950 | cap->queue_release = 1; | ||
951 | if (removed) { | ||
952 | list_add_tail(&cap->session_caps, | ||
953 | &session->s_cap_releases); | ||
954 | session->s_num_cap_releases++; | ||
955 | removed = 0; | ||
956 | } | ||
957 | } else { | ||
958 | cap->queue_release = 0; | ||
959 | } | ||
960 | cap->cap_ino = ci->i_vino.ino; | ||
961 | |||
951 | spin_unlock(&session->s_cap_lock); | 962 | spin_unlock(&session->s_cap_lock); |
952 | 963 | ||
953 | /* remove from inode list */ | 964 | /* remove from inode list */ |
@@ -977,8 +988,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) | |||
977 | static int send_cap_msg(struct ceph_mds_session *session, | 988 | static int send_cap_msg(struct ceph_mds_session *session, |
978 | u64 ino, u64 cid, int op, | 989 | u64 ino, u64 cid, int op, |
979 | int caps, int wanted, int dirty, | 990 | int caps, int wanted, int dirty, |
980 | u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, | 991 | u32 seq, u64 flush_tid, u64 oldest_flush_tid, |
981 | u64 size, u64 max_size, | 992 | u32 issue_seq, u32 mseq, u64 size, u64 max_size, |
982 | struct timespec *mtime, struct timespec *atime, | 993 | struct timespec *mtime, struct timespec *atime, |
983 | u64 time_warp_seq, | 994 | u64 time_warp_seq, |
984 | kuid_t uid, kgid_t gid, umode_t mode, | 995 | kuid_t uid, kgid_t gid, umode_t mode, |
@@ -992,20 +1003,23 @@ static int send_cap_msg(struct ceph_mds_session *session, | |||
992 | size_t extra_len; | 1003 | size_t extra_len; |
993 | 1004 | ||
994 | dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" | 1005 | dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" |
995 | " seq %u/%u mseq %u follows %lld size %llu/%llu" | 1006 | " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" |
996 | " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), | 1007 | " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), |
997 | cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), | 1008 | cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), |
998 | ceph_cap_string(dirty), | 1009 | ceph_cap_string(dirty), |
999 | seq, issue_seq, mseq, follows, size, max_size, | 1010 | seq, issue_seq, flush_tid, oldest_flush_tid, |
1011 | mseq, follows, size, max_size, | ||
1000 | xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); | 1012 | xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); |
1001 | 1013 | ||
1002 | /* flock buffer size + inline version + inline data size */ | 1014 | /* flock buffer size + inline version + inline data size + |
1003 | extra_len = 4 + 8 + 4; | 1015 | * osd_epoch_barrier + oldest_flush_tid */ |
1016 | extra_len = 4 + 8 + 4 + 4 + 8; | ||
1004 | msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, | 1017 | msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, |
1005 | GFP_NOFS, false); | 1018 | GFP_NOFS, false); |
1006 | if (!msg) | 1019 | if (!msg) |
1007 | return -ENOMEM; | 1020 | return -ENOMEM; |
1008 | 1021 | ||
1022 | msg->hdr.version = cpu_to_le16(6); | ||
1009 | msg->hdr.tid = cpu_to_le64(flush_tid); | 1023 | msg->hdr.tid = cpu_to_le64(flush_tid); |
1010 | 1024 | ||
1011 | fc = msg->front.iov_base; | 1025 | fc = msg->front.iov_base; |
@@ -1041,6 +1055,10 @@ static int send_cap_msg(struct ceph_mds_session *session, | |||
1041 | ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); | 1055 | ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); |
1042 | /* inline data size */ | 1056 | /* inline data size */ |
1043 | ceph_encode_32(&p, 0); | 1057 | ceph_encode_32(&p, 0); |
1058 | /* osd_epoch_barrier */ | ||
1059 | ceph_encode_32(&p, 0); | ||
1060 | /* oldest_flush_tid */ | ||
1061 | ceph_encode_64(&p, oldest_flush_tid); | ||
1044 | 1062 | ||
1045 | fc->xattr_version = cpu_to_le64(xattr_version); | 1063 | fc->xattr_version = cpu_to_le64(xattr_version); |
1046 | if (xattrs_buf) { | 1064 | if (xattrs_buf) { |
@@ -1053,44 +1071,6 @@ static int send_cap_msg(struct ceph_mds_session *session, | |||
1053 | return 0; | 1071 | return 0; |
1054 | } | 1072 | } |
1055 | 1073 | ||
1056 | void __queue_cap_release(struct ceph_mds_session *session, | ||
1057 | u64 ino, u64 cap_id, u32 migrate_seq, | ||
1058 | u32 issue_seq) | ||
1059 | { | ||
1060 | struct ceph_msg *msg; | ||
1061 | struct ceph_mds_cap_release *head; | ||
1062 | struct ceph_mds_cap_item *item; | ||
1063 | |||
1064 | BUG_ON(!session->s_num_cap_releases); | ||
1065 | msg = list_first_entry(&session->s_cap_releases, | ||
1066 | struct ceph_msg, list_head); | ||
1067 | |||
1068 | dout(" adding %llx release to mds%d msg %p (%d left)\n", | ||
1069 | ino, session->s_mds, msg, session->s_num_cap_releases); | ||
1070 | |||
1071 | BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); | ||
1072 | head = msg->front.iov_base; | ||
1073 | le32_add_cpu(&head->num, 1); | ||
1074 | item = msg->front.iov_base + msg->front.iov_len; | ||
1075 | item->ino = cpu_to_le64(ino); | ||
1076 | item->cap_id = cpu_to_le64(cap_id); | ||
1077 | item->migrate_seq = cpu_to_le32(migrate_seq); | ||
1078 | item->seq = cpu_to_le32(issue_seq); | ||
1079 | |||
1080 | session->s_num_cap_releases--; | ||
1081 | |||
1082 | msg->front.iov_len += sizeof(*item); | ||
1083 | if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { | ||
1084 | dout(" release msg %p full\n", msg); | ||
1085 | list_move_tail(&msg->list_head, &session->s_cap_releases_done); | ||
1086 | } else { | ||
1087 | dout(" release msg %p at %d/%d (%d)\n", msg, | ||
1088 | (int)le32_to_cpu(head->num), | ||
1089 | (int)CEPH_CAPS_PER_RELEASE, | ||
1090 | (int)msg->front.iov_len); | ||
1091 | } | ||
1092 | } | ||
1093 | |||
1094 | /* | 1074 | /* |
1095 | * Queue cap releases when an inode is dropped from our cache. Since | 1075 | * Queue cap releases when an inode is dropped from our cache. Since |
1096 | * inode is about to be destroyed, there is no need for i_ceph_lock. | 1076 | * inode is about to be destroyed, there is no need for i_ceph_lock. |
@@ -1127,7 +1107,7 @@ void ceph_queue_caps_release(struct inode *inode) | |||
1127 | */ | 1107 | */ |
1128 | static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | 1108 | static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, |
1129 | int op, int used, int want, int retain, int flushing, | 1109 | int op, int used, int want, int retain, int flushing, |
1130 | unsigned *pflush_tid) | 1110 | u64 flush_tid, u64 oldest_flush_tid) |
1131 | __releases(cap->ci->i_ceph_lock) | 1111 | __releases(cap->ci->i_ceph_lock) |
1132 | { | 1112 | { |
1133 | struct ceph_inode_info *ci = cap->ci; | 1113 | struct ceph_inode_info *ci = cap->ci; |
@@ -1145,8 +1125,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | |||
1145 | u64 xattr_version = 0; | 1125 | u64 xattr_version = 0; |
1146 | struct ceph_buffer *xattr_blob = NULL; | 1126 | struct ceph_buffer *xattr_blob = NULL; |
1147 | int delayed = 0; | 1127 | int delayed = 0; |
1148 | u64 flush_tid = 0; | ||
1149 | int i; | ||
1150 | int ret; | 1128 | int ret; |
1151 | bool inline_data; | 1129 | bool inline_data; |
1152 | 1130 | ||
@@ -1190,26 +1168,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | |||
1190 | cap->implemented &= cap->issued | used; | 1168 | cap->implemented &= cap->issued | used; |
1191 | cap->mds_wanted = want; | 1169 | cap->mds_wanted = want; |
1192 | 1170 | ||
1193 | if (flushing) { | 1171 | follows = flushing ? ci->i_head_snapc->seq : 0; |
1194 | /* | ||
1195 | * assign a tid for flush operations so we can avoid | ||
1196 | * flush1 -> dirty1 -> flush2 -> flushack1 -> mark | ||
1197 | * clean type races. track latest tid for every bit | ||
1198 | * so we can handle flush AxFw, flush Fw, and have the | ||
1199 | * first ack clean Ax. | ||
1200 | */ | ||
1201 | flush_tid = ++ci->i_cap_flush_last_tid; | ||
1202 | if (pflush_tid) | ||
1203 | *pflush_tid = flush_tid; | ||
1204 | dout(" cap_flush_tid %d\n", (int)flush_tid); | ||
1205 | for (i = 0; i < CEPH_CAP_BITS; i++) | ||
1206 | if (flushing & (1 << i)) | ||
1207 | ci->i_cap_flush_tid[i] = flush_tid; | ||
1208 | |||
1209 | follows = ci->i_head_snapc->seq; | ||
1210 | } else { | ||
1211 | follows = 0; | ||
1212 | } | ||
1213 | 1172 | ||
1214 | keep = cap->implemented; | 1173 | keep = cap->implemented; |
1215 | seq = cap->seq; | 1174 | seq = cap->seq; |
@@ -1237,7 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | |||
1237 | spin_unlock(&ci->i_ceph_lock); | 1196 | spin_unlock(&ci->i_ceph_lock); |
1238 | 1197 | ||
1239 | ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, | 1198 | ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, |
1240 | op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, | 1199 | op, keep, want, flushing, seq, |
1200 | flush_tid, oldest_flush_tid, issue_seq, mseq, | ||
1241 | size, max_size, &mtime, &atime, time_warp_seq, | 1201 | size, max_size, &mtime, &atime, time_warp_seq, |
1242 | uid, gid, mode, xattr_version, xattr_blob, | 1202 | uid, gid, mode, xattr_version, xattr_blob, |
1243 | follows, inline_data); | 1203 | follows, inline_data); |
@@ -1259,14 +1219,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, | |||
1259 | * asynchronously back to the MDS once sync writes complete and dirty | 1219 | * asynchronously back to the MDS once sync writes complete and dirty |
1260 | * data is written out. | 1220 | * data is written out. |
1261 | * | 1221 | * |
1262 | * Unless @again is true, skip cap_snaps that were already sent to | 1222 | * Unless @kick is true, skip cap_snaps that were already sent to |
1263 | * the MDS (i.e., during this session). | 1223 | * the MDS (i.e., during this session). |
1264 | * | 1224 | * |
1265 | * Called under i_ceph_lock. Takes s_mutex as needed. | 1225 | * Called under i_ceph_lock. Takes s_mutex as needed. |
1266 | */ | 1226 | */ |
1267 | void __ceph_flush_snaps(struct ceph_inode_info *ci, | 1227 | void __ceph_flush_snaps(struct ceph_inode_info *ci, |
1268 | struct ceph_mds_session **psession, | 1228 | struct ceph_mds_session **psession, |
1269 | int again) | 1229 | int kick) |
1270 | __releases(ci->i_ceph_lock) | 1230 | __releases(ci->i_ceph_lock) |
1271 | __acquires(ci->i_ceph_lock) | 1231 | __acquires(ci->i_ceph_lock) |
1272 | { | 1232 | { |
@@ -1297,11 +1257,8 @@ retry: | |||
1297 | if (capsnap->dirty_pages || capsnap->writing) | 1257 | if (capsnap->dirty_pages || capsnap->writing) |
1298 | break; | 1258 | break; |
1299 | 1259 | ||
1300 | /* | 1260 | /* should be removed by ceph_try_drop_cap_snap() */ |
1301 | * if cap writeback already occurred, we should have dropped | 1261 | BUG_ON(!capsnap->need_flush); |
1302 | * the capsnap in ceph_put_wrbuffer_cap_refs. | ||
1303 | */ | ||
1304 | BUG_ON(capsnap->dirty == 0); | ||
1305 | 1262 | ||
1306 | /* pick mds, take s_mutex */ | 1263 | /* pick mds, take s_mutex */ |
1307 | if (ci->i_auth_cap == NULL) { | 1264 | if (ci->i_auth_cap == NULL) { |
@@ -1310,7 +1267,7 @@ retry: | |||
1310 | } | 1267 | } |
1311 | 1268 | ||
1312 | /* only flush each capsnap once */ | 1269 | /* only flush each capsnap once */ |
1313 | if (!again && !list_empty(&capsnap->flushing_item)) { | 1270 | if (!kick && !list_empty(&capsnap->flushing_item)) { |
1314 | dout("already flushed %p, skipping\n", capsnap); | 1271 | dout("already flushed %p, skipping\n", capsnap); |
1315 | continue; | 1272 | continue; |
1316 | } | 1273 | } |
@@ -1320,6 +1277,9 @@ retry: | |||
1320 | 1277 | ||
1321 | if (session && session->s_mds != mds) { | 1278 | if (session && session->s_mds != mds) { |
1322 | dout("oops, wrong session %p mutex\n", session); | 1279 | dout("oops, wrong session %p mutex\n", session); |
1280 | if (kick) | ||
1281 | goto out; | ||
1282 | |||
1323 | mutex_unlock(&session->s_mutex); | 1283 | mutex_unlock(&session->s_mutex); |
1324 | ceph_put_mds_session(session); | 1284 | ceph_put_mds_session(session); |
1325 | session = NULL; | 1285 | session = NULL; |
@@ -1343,20 +1303,22 @@ retry: | |||
1343 | goto retry; | 1303 | goto retry; |
1344 | } | 1304 | } |
1345 | 1305 | ||
1346 | capsnap->flush_tid = ++ci->i_cap_flush_last_tid; | 1306 | spin_lock(&mdsc->cap_dirty_lock); |
1307 | capsnap->flush_tid = ++mdsc->last_cap_flush_tid; | ||
1308 | spin_unlock(&mdsc->cap_dirty_lock); | ||
1309 | |||
1347 | atomic_inc(&capsnap->nref); | 1310 | atomic_inc(&capsnap->nref); |
1348 | if (!list_empty(&capsnap->flushing_item)) | 1311 | if (list_empty(&capsnap->flushing_item)) |
1349 | list_del_init(&capsnap->flushing_item); | 1312 | list_add_tail(&capsnap->flushing_item, |
1350 | list_add_tail(&capsnap->flushing_item, | 1313 | &session->s_cap_snaps_flushing); |
1351 | &session->s_cap_snaps_flushing); | ||
1352 | spin_unlock(&ci->i_ceph_lock); | 1314 | spin_unlock(&ci->i_ceph_lock); |
1353 | 1315 | ||
1354 | dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", | 1316 | dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", |
1355 | inode, capsnap, capsnap->follows, capsnap->flush_tid); | 1317 | inode, capsnap, capsnap->follows, capsnap->flush_tid); |
1356 | send_cap_msg(session, ceph_vino(inode).ino, 0, | 1318 | send_cap_msg(session, ceph_vino(inode).ino, 0, |
1357 | CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, | 1319 | CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, |
1358 | capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, | 1320 | capsnap->dirty, 0, capsnap->flush_tid, 0, |
1359 | capsnap->size, 0, | 1321 | 0, mseq, capsnap->size, 0, |
1360 | &capsnap->mtime, &capsnap->atime, | 1322 | &capsnap->mtime, &capsnap->atime, |
1361 | capsnap->time_warp_seq, | 1323 | capsnap->time_warp_seq, |
1362 | capsnap->uid, capsnap->gid, capsnap->mode, | 1324 | capsnap->uid, capsnap->gid, capsnap->mode, |
@@ -1396,7 +1358,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) | |||
1396 | * Caller is then responsible for calling __mark_inode_dirty with the | 1358 | * Caller is then responsible for calling __mark_inode_dirty with the |
1397 | * returned flags value. | 1359 | * returned flags value. |
1398 | */ | 1360 | */ |
1399 | int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) | 1361 | int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, |
1362 | struct ceph_cap_flush **pcf) | ||
1400 | { | 1363 | { |
1401 | struct ceph_mds_client *mdsc = | 1364 | struct ceph_mds_client *mdsc = |
1402 | ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; | 1365 | ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; |
@@ -1416,9 +1379,14 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) | |||
1416 | ceph_cap_string(was | mask)); | 1379 | ceph_cap_string(was | mask)); |
1417 | ci->i_dirty_caps |= mask; | 1380 | ci->i_dirty_caps |= mask; |
1418 | if (was == 0) { | 1381 | if (was == 0) { |
1419 | if (!ci->i_head_snapc) | 1382 | WARN_ON_ONCE(ci->i_prealloc_cap_flush); |
1383 | swap(ci->i_prealloc_cap_flush, *pcf); | ||
1384 | |||
1385 | if (!ci->i_head_snapc) { | ||
1386 | WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); | ||
1420 | ci->i_head_snapc = ceph_get_snap_context( | 1387 | ci->i_head_snapc = ceph_get_snap_context( |
1421 | ci->i_snap_realm->cached_context); | 1388 | ci->i_snap_realm->cached_context); |
1389 | } | ||
1422 | dout(" inode %p now dirty snapc %p auth cap %p\n", | 1390 | dout(" inode %p now dirty snapc %p auth cap %p\n", |
1423 | &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); | 1391 | &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); |
1424 | BUG_ON(!list_empty(&ci->i_dirty_item)); | 1392 | BUG_ON(!list_empty(&ci->i_dirty_item)); |
@@ -1429,6 +1397,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) | |||
1429 | ihold(inode); | 1397 | ihold(inode); |
1430 | dirty |= I_DIRTY_SYNC; | 1398 | dirty |= I_DIRTY_SYNC; |
1431 | } | 1399 | } |
1400 | } else { | ||
1401 | WARN_ON_ONCE(!ci->i_prealloc_cap_flush); | ||
1432 | } | 1402 | } |
1433 | BUG_ON(list_empty(&ci->i_dirty_item)); | 1403 | BUG_ON(list_empty(&ci->i_dirty_item)); |
1434 | if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && | 1404 | if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && |
@@ -1438,6 +1408,74 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) | |||
1438 | return dirty; | 1408 | return dirty; |
1439 | } | 1409 | } |
1440 | 1410 | ||
1411 | static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci, | ||
1412 | struct ceph_cap_flush *cf) | ||
1413 | { | ||
1414 | struct rb_node **p = &ci->i_cap_flush_tree.rb_node; | ||
1415 | struct rb_node *parent = NULL; | ||
1416 | struct ceph_cap_flush *other = NULL; | ||
1417 | |||
1418 | while (*p) { | ||
1419 | parent = *p; | ||
1420 | other = rb_entry(parent, struct ceph_cap_flush, i_node); | ||
1421 | |||
1422 | if (cf->tid < other->tid) | ||
1423 | p = &(*p)->rb_left; | ||
1424 | else if (cf->tid > other->tid) | ||
1425 | p = &(*p)->rb_right; | ||
1426 | else | ||
1427 | BUG(); | ||
1428 | } | ||
1429 | |||
1430 | rb_link_node(&cf->i_node, parent, p); | ||
1431 | rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); | ||
1432 | } | ||
1433 | |||
1434 | static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc, | ||
1435 | struct ceph_cap_flush *cf) | ||
1436 | { | ||
1437 | struct rb_node **p = &mdsc->cap_flush_tree.rb_node; | ||
1438 | struct rb_node *parent = NULL; | ||
1439 | struct ceph_cap_flush *other = NULL; | ||
1440 | |||
1441 | while (*p) { | ||
1442 | parent = *p; | ||
1443 | other = rb_entry(parent, struct ceph_cap_flush, g_node); | ||
1444 | |||
1445 | if (cf->tid < other->tid) | ||
1446 | p = &(*p)->rb_left; | ||
1447 | else if (cf->tid > other->tid) | ||
1448 | p = &(*p)->rb_right; | ||
1449 | else | ||
1450 | BUG(); | ||
1451 | } | ||
1452 | |||
1453 | rb_link_node(&cf->g_node, parent, p); | ||
1454 | rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree); | ||
1455 | } | ||
1456 | |||
1457 | struct ceph_cap_flush *ceph_alloc_cap_flush(void) | ||
1458 | { | ||
1459 | return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); | ||
1460 | } | ||
1461 | |||
1462 | void ceph_free_cap_flush(struct ceph_cap_flush *cf) | ||
1463 | { | ||
1464 | if (cf) | ||
1465 | kmem_cache_free(ceph_cap_flush_cachep, cf); | ||
1466 | } | ||
1467 | |||
1468 | static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) | ||
1469 | { | ||
1470 | struct rb_node *n = rb_first(&mdsc->cap_flush_tree); | ||
1471 | if (n) { | ||
1472 | struct ceph_cap_flush *cf = | ||
1473 | rb_entry(n, struct ceph_cap_flush, g_node); | ||
1474 | return cf->tid; | ||
1475 | } | ||
1476 | return 0; | ||
1477 | } | ||
1478 | |||
1441 | /* | 1479 | /* |
1442 | * Add dirty inode to the flushing list. Assigned a seq number so we | 1480 | * Add dirty inode to the flushing list. Assigned a seq number so we |
1443 | * can wait for caps to flush without starving. | 1481 | * can wait for caps to flush without starving. |
@@ -1445,14 +1483,17 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) | |||
1445 | * Called under i_ceph_lock. | 1483 | * Called under i_ceph_lock. |
1446 | */ | 1484 | */ |
1447 | static int __mark_caps_flushing(struct inode *inode, | 1485 | static int __mark_caps_flushing(struct inode *inode, |
1448 | struct ceph_mds_session *session) | 1486 | struct ceph_mds_session *session, |
1487 | u64 *flush_tid, u64 *oldest_flush_tid) | ||
1449 | { | 1488 | { |
1450 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 1489 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
1451 | struct ceph_inode_info *ci = ceph_inode(inode); | 1490 | struct ceph_inode_info *ci = ceph_inode(inode); |
1491 | struct ceph_cap_flush *cf = NULL; | ||
1452 | int flushing; | 1492 | int flushing; |
1453 | 1493 | ||
1454 | BUG_ON(ci->i_dirty_caps == 0); | 1494 | BUG_ON(ci->i_dirty_caps == 0); |
1455 | BUG_ON(list_empty(&ci->i_dirty_item)); | 1495 | BUG_ON(list_empty(&ci->i_dirty_item)); |
1496 | BUG_ON(!ci->i_prealloc_cap_flush); | ||
1456 | 1497 | ||
1457 | flushing = ci->i_dirty_caps; | 1498 | flushing = ci->i_dirty_caps; |
1458 | dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", | 1499 | dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", |
@@ -1463,22 +1504,31 @@ static int __mark_caps_flushing(struct inode *inode, | |||
1463 | ci->i_dirty_caps = 0; | 1504 | ci->i_dirty_caps = 0; |
1464 | dout(" inode %p now !dirty\n", inode); | 1505 | dout(" inode %p now !dirty\n", inode); |
1465 | 1506 | ||
1507 | swap(cf, ci->i_prealloc_cap_flush); | ||
1508 | cf->caps = flushing; | ||
1509 | cf->kick = false; | ||
1510 | |||
1466 | spin_lock(&mdsc->cap_dirty_lock); | 1511 | spin_lock(&mdsc->cap_dirty_lock); |
1467 | list_del_init(&ci->i_dirty_item); | 1512 | list_del_init(&ci->i_dirty_item); |
1468 | 1513 | ||
1514 | cf->tid = ++mdsc->last_cap_flush_tid; | ||
1515 | __add_cap_flushing_to_mdsc(mdsc, cf); | ||
1516 | *oldest_flush_tid = __get_oldest_flush_tid(mdsc); | ||
1517 | |||
1469 | if (list_empty(&ci->i_flushing_item)) { | 1518 | if (list_empty(&ci->i_flushing_item)) { |
1470 | ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; | ||
1471 | list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); | 1519 | list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); |
1472 | mdsc->num_cap_flushing++; | 1520 | mdsc->num_cap_flushing++; |
1473 | dout(" inode %p now flushing seq %lld\n", inode, | 1521 | dout(" inode %p now flushing tid %llu\n", inode, cf->tid); |
1474 | ci->i_cap_flush_seq); | ||
1475 | } else { | 1522 | } else { |
1476 | list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); | 1523 | list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); |
1477 | dout(" inode %p now flushing (more) seq %lld\n", inode, | 1524 | dout(" inode %p now flushing (more) tid %llu\n", |
1478 | ci->i_cap_flush_seq); | 1525 | inode, cf->tid); |
1479 | } | 1526 | } |
1480 | spin_unlock(&mdsc->cap_dirty_lock); | 1527 | spin_unlock(&mdsc->cap_dirty_lock); |
1481 | 1528 | ||
1529 | __add_cap_flushing_to_inode(ci, cf); | ||
1530 | |||
1531 | *flush_tid = cf->tid; | ||
1482 | return flushing; | 1532 | return flushing; |
1483 | } | 1533 | } |
1484 | 1534 | ||
@@ -1524,6 +1574,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, | |||
1524 | struct ceph_mds_client *mdsc = fsc->mdsc; | 1574 | struct ceph_mds_client *mdsc = fsc->mdsc; |
1525 | struct inode *inode = &ci->vfs_inode; | 1575 | struct inode *inode = &ci->vfs_inode; |
1526 | struct ceph_cap *cap; | 1576 | struct ceph_cap *cap; |
1577 | u64 flush_tid, oldest_flush_tid; | ||
1527 | int file_wanted, used, cap_used; | 1578 | int file_wanted, used, cap_used; |
1528 | int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ | 1579 | int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ |
1529 | int issued, implemented, want, retain, revoking, flushing = 0; | 1580 | int issued, implemented, want, retain, revoking, flushing = 0; |
@@ -1553,13 +1604,13 @@ retry: | |||
1553 | retry_locked: | 1604 | retry_locked: |
1554 | file_wanted = __ceph_caps_file_wanted(ci); | 1605 | file_wanted = __ceph_caps_file_wanted(ci); |
1555 | used = __ceph_caps_used(ci); | 1606 | used = __ceph_caps_used(ci); |
1556 | want = file_wanted | used; | ||
1557 | issued = __ceph_caps_issued(ci, &implemented); | 1607 | issued = __ceph_caps_issued(ci, &implemented); |
1558 | revoking = implemented & ~issued; | 1608 | revoking = implemented & ~issued; |
1559 | 1609 | ||
1560 | retain = want | CEPH_CAP_PIN; | 1610 | want = file_wanted; |
1611 | retain = file_wanted | used | CEPH_CAP_PIN; | ||
1561 | if (!mdsc->stopping && inode->i_nlink > 0) { | 1612 | if (!mdsc->stopping && inode->i_nlink > 0) { |
1562 | if (want) { | 1613 | if (file_wanted) { |
1563 | retain |= CEPH_CAP_ANY; /* be greedy */ | 1614 | retain |= CEPH_CAP_ANY; /* be greedy */ |
1564 | } else if (S_ISDIR(inode->i_mode) && | 1615 | } else if (S_ISDIR(inode->i_mode) && |
1565 | (issued & CEPH_CAP_FILE_SHARED) && | 1616 | (issued & CEPH_CAP_FILE_SHARED) && |
@@ -1602,9 +1653,10 @@ retry_locked: | |||
1602 | * If we fail, it's because pages are locked.... try again later. | 1653 | * If we fail, it's because pages are locked.... try again later. |
1603 | */ | 1654 | */ |
1604 | if ((!is_delayed || mdsc->stopping) && | 1655 | if ((!is_delayed || mdsc->stopping) && |
1605 | ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ | 1656 | !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ |
1606 | inode->i_data.nrpages && /* have cached pages */ | 1657 | ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ |
1607 | (file_wanted == 0 || /* no open files */ | 1658 | inode->i_data.nrpages && /* have cached pages */ |
1659 | (file_wanted == 0 || /* no open files */ | ||
1608 | (revoking & (CEPH_CAP_FILE_CACHE| | 1660 | (revoking & (CEPH_CAP_FILE_CACHE| |
1609 | CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ | 1661 | CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ |
1610 | !tried_invalidate) { | 1662 | !tried_invalidate) { |
@@ -1742,17 +1794,25 @@ ack: | |||
1742 | took_snap_rwsem = 1; | 1794 | took_snap_rwsem = 1; |
1743 | } | 1795 | } |
1744 | 1796 | ||
1745 | if (cap == ci->i_auth_cap && ci->i_dirty_caps) | 1797 | if (cap == ci->i_auth_cap && ci->i_dirty_caps) { |
1746 | flushing = __mark_caps_flushing(inode, session); | 1798 | flushing = __mark_caps_flushing(inode, session, |
1747 | else | 1799 | &flush_tid, |
1800 | &oldest_flush_tid); | ||
1801 | } else { | ||
1748 | flushing = 0; | 1802 | flushing = 0; |
1803 | flush_tid = 0; | ||
1804 | spin_lock(&mdsc->cap_dirty_lock); | ||
1805 | oldest_flush_tid = __get_oldest_flush_tid(mdsc); | ||
1806 | spin_unlock(&mdsc->cap_dirty_lock); | ||
1807 | } | ||
1749 | 1808 | ||
1750 | mds = cap->mds; /* remember mds, so we don't repeat */ | 1809 | mds = cap->mds; /* remember mds, so we don't repeat */ |
1751 | sent++; | 1810 | sent++; |
1752 | 1811 | ||
1753 | /* __send_cap drops i_ceph_lock */ | 1812 | /* __send_cap drops i_ceph_lock */ |
1754 | delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, | 1813 | delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, |
1755 | want, retain, flushing, NULL); | 1814 | want, retain, flushing, |
1815 | flush_tid, oldest_flush_tid); | ||
1756 | goto retry; /* retake i_ceph_lock and restart our cap scan. */ | 1816 | goto retry; /* retake i_ceph_lock and restart our cap scan. */ |
1757 | } | 1817 | } |
1758 | 1818 | ||
@@ -1781,12 +1841,13 @@ ack: | |||
1781 | /* | 1841 | /* |
1782 | * Try to flush dirty caps back to the auth mds. | 1842 | * Try to flush dirty caps back to the auth mds. |
1783 | */ | 1843 | */ |
1784 | static int try_flush_caps(struct inode *inode, unsigned *flush_tid) | 1844 | static int try_flush_caps(struct inode *inode, u64 *ptid) |
1785 | { | 1845 | { |
1786 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 1846 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
1787 | struct ceph_inode_info *ci = ceph_inode(inode); | 1847 | struct ceph_inode_info *ci = ceph_inode(inode); |
1788 | int flushing = 0; | ||
1789 | struct ceph_mds_session *session = NULL; | 1848 | struct ceph_mds_session *session = NULL; |
1849 | int flushing = 0; | ||
1850 | u64 flush_tid = 0, oldest_flush_tid = 0; | ||
1790 | 1851 | ||
1791 | retry: | 1852 | retry: |
1792 | spin_lock(&ci->i_ceph_lock); | 1853 | spin_lock(&ci->i_ceph_lock); |
@@ -1811,42 +1872,54 @@ retry: | |||
1811 | if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) | 1872 | if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) |
1812 | goto out; | 1873 | goto out; |
1813 | 1874 | ||
1814 | flushing = __mark_caps_flushing(inode, session); | 1875 | flushing = __mark_caps_flushing(inode, session, &flush_tid, |
1876 | &oldest_flush_tid); | ||
1815 | 1877 | ||
1816 | /* __send_cap drops i_ceph_lock */ | 1878 | /* __send_cap drops i_ceph_lock */ |
1817 | delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, | 1879 | delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, |
1818 | cap->issued | cap->implemented, flushing, | 1880 | (cap->issued | cap->implemented), |
1819 | flush_tid); | 1881 | flushing, flush_tid, oldest_flush_tid); |
1820 | if (!delayed) | ||
1821 | goto out_unlocked; | ||
1822 | 1882 | ||
1823 | spin_lock(&ci->i_ceph_lock); | 1883 | if (delayed) { |
1824 | __cap_delay_requeue(mdsc, ci); | 1884 | spin_lock(&ci->i_ceph_lock); |
1885 | __cap_delay_requeue(mdsc, ci); | ||
1886 | spin_unlock(&ci->i_ceph_lock); | ||
1887 | } | ||
1888 | } else { | ||
1889 | struct rb_node *n = rb_last(&ci->i_cap_flush_tree); | ||
1890 | if (n) { | ||
1891 | struct ceph_cap_flush *cf = | ||
1892 | rb_entry(n, struct ceph_cap_flush, i_node); | ||
1893 | flush_tid = cf->tid; | ||
1894 | } | ||
1895 | flushing = ci->i_flushing_caps; | ||
1896 | spin_unlock(&ci->i_ceph_lock); | ||
1825 | } | 1897 | } |
1826 | out: | 1898 | out: |
1827 | spin_unlock(&ci->i_ceph_lock); | ||
1828 | out_unlocked: | ||
1829 | if (session) | 1899 | if (session) |
1830 | mutex_unlock(&session->s_mutex); | 1900 | mutex_unlock(&session->s_mutex); |
1901 | |||
1902 | *ptid = flush_tid; | ||
1831 | return flushing; | 1903 | return flushing; |
1832 | } | 1904 | } |
1833 | 1905 | ||
1834 | /* | 1906 | /* |
1835 | * Return true if we've flushed caps through the given flush_tid. | 1907 | * Return true if we've flushed caps through the given flush_tid. |
1836 | */ | 1908 | */ |
1837 | static int caps_are_flushed(struct inode *inode, unsigned tid) | 1909 | static int caps_are_flushed(struct inode *inode, u64 flush_tid) |
1838 | { | 1910 | { |
1839 | struct ceph_inode_info *ci = ceph_inode(inode); | 1911 | struct ceph_inode_info *ci = ceph_inode(inode); |
1840 | int i, ret = 1; | 1912 | struct ceph_cap_flush *cf; |
1913 | struct rb_node *n; | ||
1914 | int ret = 1; | ||
1841 | 1915 | ||
1842 | spin_lock(&ci->i_ceph_lock); | 1916 | spin_lock(&ci->i_ceph_lock); |
1843 | for (i = 0; i < CEPH_CAP_BITS; i++) | 1917 | n = rb_first(&ci->i_cap_flush_tree); |
1844 | if ((ci->i_flushing_caps & (1 << i)) && | 1918 | if (n) { |
1845 | ci->i_cap_flush_tid[i] <= tid) { | 1919 | cf = rb_entry(n, struct ceph_cap_flush, i_node); |
1846 | /* still flushing this bit */ | 1920 | if (cf->tid <= flush_tid) |
1847 | ret = 0; | 1921 | ret = 0; |
1848 | break; | 1922 | } |
1849 | } | ||
1850 | spin_unlock(&ci->i_ceph_lock); | 1923 | spin_unlock(&ci->i_ceph_lock); |
1851 | return ret; | 1924 | return ret; |
1852 | } | 1925 | } |
@@ -1864,13 +1937,16 @@ static void sync_write_wait(struct inode *inode) | |||
1864 | struct ceph_osd_request *req; | 1937 | struct ceph_osd_request *req; |
1865 | u64 last_tid; | 1938 | u64 last_tid; |
1866 | 1939 | ||
1940 | if (!S_ISREG(inode->i_mode)) | ||
1941 | return; | ||
1942 | |||
1867 | spin_lock(&ci->i_unsafe_lock); | 1943 | spin_lock(&ci->i_unsafe_lock); |
1868 | if (list_empty(head)) | 1944 | if (list_empty(head)) |
1869 | goto out; | 1945 | goto out; |
1870 | 1946 | ||
1871 | /* set upper bound as _last_ entry in chain */ | 1947 | /* set upper bound as _last_ entry in chain */ |
1872 | req = list_entry(head->prev, struct ceph_osd_request, | 1948 | req = list_last_entry(head, struct ceph_osd_request, |
1873 | r_unsafe_item); | 1949 | r_unsafe_item); |
1874 | last_tid = req->r_tid; | 1950 | last_tid = req->r_tid; |
1875 | 1951 | ||
1876 | do { | 1952 | do { |
@@ -1888,18 +1964,64 @@ static void sync_write_wait(struct inode *inode) | |||
1888 | */ | 1964 | */ |
1889 | if (list_empty(head)) | 1965 | if (list_empty(head)) |
1890 | break; | 1966 | break; |
1891 | req = list_entry(head->next, struct ceph_osd_request, | 1967 | req = list_first_entry(head, struct ceph_osd_request, |
1892 | r_unsafe_item); | 1968 | r_unsafe_item); |
1893 | } while (req->r_tid < last_tid); | 1969 | } while (req->r_tid < last_tid); |
1894 | out: | 1970 | out: |
1895 | spin_unlock(&ci->i_unsafe_lock); | 1971 | spin_unlock(&ci->i_unsafe_lock); |
1896 | } | 1972 | } |
1897 | 1973 | ||
1974 | /* | ||
1975 | * wait for any uncommitted directory operations to commit. | ||
1976 | */ | ||
1977 | static int unsafe_dirop_wait(struct inode *inode) | ||
1978 | { | ||
1979 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
1980 | struct list_head *head = &ci->i_unsafe_dirops; | ||
1981 | struct ceph_mds_request *req; | ||
1982 | u64 last_tid; | ||
1983 | int ret = 0; | ||
1984 | |||
1985 | if (!S_ISDIR(inode->i_mode)) | ||
1986 | return 0; | ||
1987 | |||
1988 | spin_lock(&ci->i_unsafe_lock); | ||
1989 | if (list_empty(head)) | ||
1990 | goto out; | ||
1991 | |||
1992 | req = list_last_entry(head, struct ceph_mds_request, | ||
1993 | r_unsafe_dir_item); | ||
1994 | last_tid = req->r_tid; | ||
1995 | |||
1996 | do { | ||
1997 | ceph_mdsc_get_request(req); | ||
1998 | spin_unlock(&ci->i_unsafe_lock); | ||
1999 | |||
2000 | dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n", | ||
2001 | inode, req->r_tid, last_tid); | ||
2002 | ret = !wait_for_completion_timeout(&req->r_safe_completion, | ||
2003 | ceph_timeout_jiffies(req->r_timeout)); | ||
2004 | if (ret) | ||
2005 | ret = -EIO; /* timed out */ | ||
2006 | |||
2007 | ceph_mdsc_put_request(req); | ||
2008 | |||
2009 | spin_lock(&ci->i_unsafe_lock); | ||
2010 | if (ret || list_empty(head)) | ||
2011 | break; | ||
2012 | req = list_first_entry(head, struct ceph_mds_request, | ||
2013 | r_unsafe_dir_item); | ||
2014 | } while (req->r_tid < last_tid); | ||
2015 | out: | ||
2016 | spin_unlock(&ci->i_unsafe_lock); | ||
2017 | return ret; | ||
2018 | } | ||
2019 | |||
1898 | int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) | 2020 | int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) |
1899 | { | 2021 | { |
1900 | struct inode *inode = file->f_mapping->host; | 2022 | struct inode *inode = file->f_mapping->host; |
1901 | struct ceph_inode_info *ci = ceph_inode(inode); | 2023 | struct ceph_inode_info *ci = ceph_inode(inode); |
1902 | unsigned flush_tid; | 2024 | u64 flush_tid; |
1903 | int ret; | 2025 | int ret; |
1904 | int dirty; | 2026 | int dirty; |
1905 | 2027 | ||
@@ -1908,25 +2030,30 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) | |||
1908 | 2030 | ||
1909 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | 2031 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); |
1910 | if (ret < 0) | 2032 | if (ret < 0) |
1911 | return ret; | 2033 | goto out; |
2034 | |||
2035 | if (datasync) | ||
2036 | goto out; | ||
2037 | |||
1912 | mutex_lock(&inode->i_mutex); | 2038 | mutex_lock(&inode->i_mutex); |
1913 | 2039 | ||
1914 | dirty = try_flush_caps(inode, &flush_tid); | 2040 | dirty = try_flush_caps(inode, &flush_tid); |
1915 | dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); | 2041 | dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); |
1916 | 2042 | ||
2043 | ret = unsafe_dirop_wait(inode); | ||
2044 | |||
1917 | /* | 2045 | /* |
1918 | * only wait on non-file metadata writeback (the mds | 2046 | * only wait on non-file metadata writeback (the mds |
1919 | * can recover size and mtime, so we don't need to | 2047 | * can recover size and mtime, so we don't need to |
1920 | * wait for that) | 2048 | * wait for that) |
1921 | */ | 2049 | */ |
1922 | if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { | 2050 | if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { |
1923 | dout("fsync waiting for flush_tid %u\n", flush_tid); | ||
1924 | ret = wait_event_interruptible(ci->i_cap_wq, | 2051 | ret = wait_event_interruptible(ci->i_cap_wq, |
1925 | caps_are_flushed(inode, flush_tid)); | 2052 | caps_are_flushed(inode, flush_tid)); |
1926 | } | 2053 | } |
1927 | |||
1928 | dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); | ||
1929 | mutex_unlock(&inode->i_mutex); | 2054 | mutex_unlock(&inode->i_mutex); |
2055 | out: | ||
2056 | dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); | ||
1930 | return ret; | 2057 | return ret; |
1931 | } | 2058 | } |
1932 | 2059 | ||
@@ -1939,7 +2066,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) | |||
1939 | int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) | 2066 | int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) |
1940 | { | 2067 | { |
1941 | struct ceph_inode_info *ci = ceph_inode(inode); | 2068 | struct ceph_inode_info *ci = ceph_inode(inode); |
1942 | unsigned flush_tid; | 2069 | u64 flush_tid; |
1943 | int err = 0; | 2070 | int err = 0; |
1944 | int dirty; | 2071 | int dirty; |
1945 | int wait = wbc->sync_mode == WB_SYNC_ALL; | 2072 | int wait = wbc->sync_mode == WB_SYNC_ALL; |
@@ -1994,6 +2121,104 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, | |||
1994 | } | 2121 | } |
1995 | } | 2122 | } |
1996 | 2123 | ||
2124 | static int __kick_flushing_caps(struct ceph_mds_client *mdsc, | ||
2125 | struct ceph_mds_session *session, | ||
2126 | struct ceph_inode_info *ci, | ||
2127 | bool kick_all) | ||
2128 | { | ||
2129 | struct inode *inode = &ci->vfs_inode; | ||
2130 | struct ceph_cap *cap; | ||
2131 | struct ceph_cap_flush *cf; | ||
2132 | struct rb_node *n; | ||
2133 | int delayed = 0; | ||
2134 | u64 first_tid = 0; | ||
2135 | u64 oldest_flush_tid; | ||
2136 | |||
2137 | spin_lock(&mdsc->cap_dirty_lock); | ||
2138 | oldest_flush_tid = __get_oldest_flush_tid(mdsc); | ||
2139 | spin_unlock(&mdsc->cap_dirty_lock); | ||
2140 | |||
2141 | while (true) { | ||
2142 | spin_lock(&ci->i_ceph_lock); | ||
2143 | cap = ci->i_auth_cap; | ||
2144 | if (!(cap && cap->session == session)) { | ||
2145 | pr_err("%p auth cap %p not mds%d ???\n", inode, | ||
2146 | cap, session->s_mds); | ||
2147 | spin_unlock(&ci->i_ceph_lock); | ||
2148 | break; | ||
2149 | } | ||
2150 | |||
2151 | for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { | ||
2152 | cf = rb_entry(n, struct ceph_cap_flush, i_node); | ||
2153 | if (cf->tid < first_tid) | ||
2154 | continue; | ||
2155 | if (kick_all || cf->kick) | ||
2156 | break; | ||
2157 | } | ||
2158 | if (!n) { | ||
2159 | spin_unlock(&ci->i_ceph_lock); | ||
2160 | break; | ||
2161 | } | ||
2162 | |||
2163 | cf = rb_entry(n, struct ceph_cap_flush, i_node); | ||
2164 | cf->kick = false; | ||
2165 | |||
2166 | first_tid = cf->tid + 1; | ||
2167 | |||
2168 | dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, | ||
2169 | cap, cf->tid, ceph_cap_string(cf->caps)); | ||
2170 | delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, | ||
2171 | __ceph_caps_used(ci), | ||
2172 | __ceph_caps_wanted(ci), | ||
2173 | cap->issued | cap->implemented, | ||
2174 | cf->caps, cf->tid, oldest_flush_tid); | ||
2175 | } | ||
2176 | return delayed; | ||
2177 | } | ||
2178 | |||
2179 | void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, | ||
2180 | struct ceph_mds_session *session) | ||
2181 | { | ||
2182 | struct ceph_inode_info *ci; | ||
2183 | struct ceph_cap *cap; | ||
2184 | struct ceph_cap_flush *cf; | ||
2185 | struct rb_node *n; | ||
2186 | |||
2187 | dout("early_kick_flushing_caps mds%d\n", session->s_mds); | ||
2188 | list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { | ||
2189 | spin_lock(&ci->i_ceph_lock); | ||
2190 | cap = ci->i_auth_cap; | ||
2191 | if (!(cap && cap->session == session)) { | ||
2192 | pr_err("%p auth cap %p not mds%d ???\n", | ||
2193 | &ci->vfs_inode, cap, session->s_mds); | ||
2194 | spin_unlock(&ci->i_ceph_lock); | ||
2195 | continue; | ||
2196 | } | ||
2197 | |||
2198 | |||
2199 | /* | ||
2200 | * if flushing caps were revoked, we re-send the cap flush | ||
2201 | * in client reconnect stage. This guarantees MDS processes | ||
2202 | * the cap flush message before issuing the flushing caps to | ||
2203 | * other client. | ||
2204 | */ | ||
2205 | if ((cap->issued & ci->i_flushing_caps) != | ||
2206 | ci->i_flushing_caps) { | ||
2207 | spin_unlock(&ci->i_ceph_lock); | ||
2208 | if (!__kick_flushing_caps(mdsc, session, ci, true)) | ||
2209 | continue; | ||
2210 | spin_lock(&ci->i_ceph_lock); | ||
2211 | } | ||
2212 | |||
2213 | for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { | ||
2214 | cf = rb_entry(n, struct ceph_cap_flush, i_node); | ||
2215 | cf->kick = true; | ||
2216 | } | ||
2217 | |||
2218 | spin_unlock(&ci->i_ceph_lock); | ||
2219 | } | ||
2220 | } | ||
2221 | |||
1997 | void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, | 2222 | void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, |
1998 | struct ceph_mds_session *session) | 2223 | struct ceph_mds_session *session) |
1999 | { | 2224 | { |
@@ -2003,28 +2228,10 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, | |||
2003 | 2228 | ||
2004 | dout("kick_flushing_caps mds%d\n", session->s_mds); | 2229 | dout("kick_flushing_caps mds%d\n", session->s_mds); |
2005 | list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { | 2230 | list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { |
2006 | struct inode *inode = &ci->vfs_inode; | 2231 | int delayed = __kick_flushing_caps(mdsc, session, ci, false); |
2007 | struct ceph_cap *cap; | 2232 | if (delayed) { |
2008 | int delayed = 0; | 2233 | spin_lock(&ci->i_ceph_lock); |
2009 | 2234 | __cap_delay_requeue(mdsc, ci); | |
2010 | spin_lock(&ci->i_ceph_lock); | ||
2011 | cap = ci->i_auth_cap; | ||
2012 | if (cap && cap->session == session) { | ||
2013 | dout("kick_flushing_caps %p cap %p %s\n", inode, | ||
2014 | cap, ceph_cap_string(ci->i_flushing_caps)); | ||
2015 | delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, | ||
2016 | __ceph_caps_used(ci), | ||
2017 | __ceph_caps_wanted(ci), | ||
2018 | cap->issued | cap->implemented, | ||
2019 | ci->i_flushing_caps, NULL); | ||
2020 | if (delayed) { | ||
2021 | spin_lock(&ci->i_ceph_lock); | ||
2022 | __cap_delay_requeue(mdsc, ci); | ||
2023 | spin_unlock(&ci->i_ceph_lock); | ||
2024 | } | ||
2025 | } else { | ||
2026 | pr_err("%p auth cap %p not mds%d ???\n", inode, | ||
2027 | cap, session->s_mds); | ||
2028 | spin_unlock(&ci->i_ceph_lock); | 2235 | spin_unlock(&ci->i_ceph_lock); |
2029 | } | 2236 | } |
2030 | } | 2237 | } |
@@ -2036,26 +2243,25 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, | |||
2036 | { | 2243 | { |
2037 | struct ceph_inode_info *ci = ceph_inode(inode); | 2244 | struct ceph_inode_info *ci = ceph_inode(inode); |
2038 | struct ceph_cap *cap; | 2245 | struct ceph_cap *cap; |
2039 | int delayed = 0; | ||
2040 | 2246 | ||
2041 | spin_lock(&ci->i_ceph_lock); | 2247 | spin_lock(&ci->i_ceph_lock); |
2042 | cap = ci->i_auth_cap; | 2248 | cap = ci->i_auth_cap; |
2043 | dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, | 2249 | dout("kick_flushing_inode_caps %p flushing %s\n", inode, |
2044 | ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); | 2250 | ceph_cap_string(ci->i_flushing_caps)); |
2045 | 2251 | ||
2046 | __ceph_flush_snaps(ci, &session, 1); | 2252 | __ceph_flush_snaps(ci, &session, 1); |
2047 | 2253 | ||
2048 | if (ci->i_flushing_caps) { | 2254 | if (ci->i_flushing_caps) { |
2255 | int delayed; | ||
2256 | |||
2049 | spin_lock(&mdsc->cap_dirty_lock); | 2257 | spin_lock(&mdsc->cap_dirty_lock); |
2050 | list_move_tail(&ci->i_flushing_item, | 2258 | list_move_tail(&ci->i_flushing_item, |
2051 | &cap->session->s_cap_flushing); | 2259 | &cap->session->s_cap_flushing); |
2052 | spin_unlock(&mdsc->cap_dirty_lock); | 2260 | spin_unlock(&mdsc->cap_dirty_lock); |
2053 | 2261 | ||
2054 | delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, | 2262 | spin_unlock(&ci->i_ceph_lock); |
2055 | __ceph_caps_used(ci), | 2263 | |
2056 | __ceph_caps_wanted(ci), | 2264 | delayed = __kick_flushing_caps(mdsc, session, ci, true); |
2057 | cap->issued | cap->implemented, | ||
2058 | ci->i_flushing_caps, NULL); | ||
2059 | if (delayed) { | 2265 | if (delayed) { |
2060 | spin_lock(&ci->i_ceph_lock); | 2266 | spin_lock(&ci->i_ceph_lock); |
2061 | __cap_delay_requeue(mdsc, ci); | 2267 | __cap_delay_requeue(mdsc, ci); |
@@ -2073,7 +2279,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, | |||
2073 | * | 2279 | * |
2074 | * Protected by i_ceph_lock. | 2280 | * Protected by i_ceph_lock. |
2075 | */ | 2281 | */ |
2076 | static void __take_cap_refs(struct ceph_inode_info *ci, int got) | 2282 | static void __take_cap_refs(struct ceph_inode_info *ci, int got, |
2283 | bool snap_rwsem_locked) | ||
2077 | { | 2284 | { |
2078 | if (got & CEPH_CAP_PIN) | 2285 | if (got & CEPH_CAP_PIN) |
2079 | ci->i_pin_ref++; | 2286 | ci->i_pin_ref++; |
@@ -2081,8 +2288,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) | |||
2081 | ci->i_rd_ref++; | 2288 | ci->i_rd_ref++; |
2082 | if (got & CEPH_CAP_FILE_CACHE) | 2289 | if (got & CEPH_CAP_FILE_CACHE) |
2083 | ci->i_rdcache_ref++; | 2290 | ci->i_rdcache_ref++; |
2084 | if (got & CEPH_CAP_FILE_WR) | 2291 | if (got & CEPH_CAP_FILE_WR) { |
2292 | if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { | ||
2293 | BUG_ON(!snap_rwsem_locked); | ||
2294 | ci->i_head_snapc = ceph_get_snap_context( | ||
2295 | ci->i_snap_realm->cached_context); | ||
2296 | } | ||
2085 | ci->i_wr_ref++; | 2297 | ci->i_wr_ref++; |
2298 | } | ||
2086 | if (got & CEPH_CAP_FILE_BUFFER) { | 2299 | if (got & CEPH_CAP_FILE_BUFFER) { |
2087 | if (ci->i_wb_ref == 0) | 2300 | if (ci->i_wb_ref == 0) |
2088 | ihold(&ci->vfs_inode); | 2301 | ihold(&ci->vfs_inode); |
@@ -2100,16 +2313,19 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) | |||
2100 | * requested from the MDS. | 2313 | * requested from the MDS. |
2101 | */ | 2314 | */ |
2102 | static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, | 2315 | static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, |
2103 | loff_t endoff, int *got, int *check_max, int *err) | 2316 | loff_t endoff, bool nonblock, int *got, int *err) |
2104 | { | 2317 | { |
2105 | struct inode *inode = &ci->vfs_inode; | 2318 | struct inode *inode = &ci->vfs_inode; |
2319 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; | ||
2106 | int ret = 0; | 2320 | int ret = 0; |
2107 | int have, implemented; | 2321 | int have, implemented; |
2108 | int file_wanted; | 2322 | int file_wanted; |
2323 | bool snap_rwsem_locked = false; | ||
2109 | 2324 | ||
2110 | dout("get_cap_refs %p need %s want %s\n", inode, | 2325 | dout("get_cap_refs %p need %s want %s\n", inode, |
2111 | ceph_cap_string(need), ceph_cap_string(want)); | 2326 | ceph_cap_string(need), ceph_cap_string(want)); |
2112 | 2327 | ||
2328 | again: | ||
2113 | spin_lock(&ci->i_ceph_lock); | 2329 | spin_lock(&ci->i_ceph_lock); |
2114 | 2330 | ||
2115 | /* make sure file is actually open */ | 2331 | /* make sure file is actually open */ |
@@ -2125,6 +2341,10 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, | |||
2125 | /* finish pending truncate */ | 2341 | /* finish pending truncate */ |
2126 | while (ci->i_truncate_pending) { | 2342 | while (ci->i_truncate_pending) { |
2127 | spin_unlock(&ci->i_ceph_lock); | 2343 | spin_unlock(&ci->i_ceph_lock); |
2344 | if (snap_rwsem_locked) { | ||
2345 | up_read(&mdsc->snap_rwsem); | ||
2346 | snap_rwsem_locked = false; | ||
2347 | } | ||
2128 | __ceph_do_pending_vmtruncate(inode); | 2348 | __ceph_do_pending_vmtruncate(inode); |
2129 | spin_lock(&ci->i_ceph_lock); | 2349 | spin_lock(&ci->i_ceph_lock); |
2130 | } | 2350 | } |
@@ -2136,7 +2356,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, | |||
2136 | dout("get_cap_refs %p endoff %llu > maxsize %llu\n", | 2356 | dout("get_cap_refs %p endoff %llu > maxsize %llu\n", |
2137 | inode, endoff, ci->i_max_size); | 2357 | inode, endoff, ci->i_max_size); |
2138 | if (endoff > ci->i_requested_max_size) { | 2358 | if (endoff > ci->i_requested_max_size) { |
2139 | *check_max = 1; | 2359 | *err = -EAGAIN; |
2140 | ret = 1; | 2360 | ret = 1; |
2141 | } | 2361 | } |
2142 | goto out_unlock; | 2362 | goto out_unlock; |
@@ -2164,8 +2384,29 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, | |||
2164 | inode, ceph_cap_string(have), ceph_cap_string(not), | 2384 | inode, ceph_cap_string(have), ceph_cap_string(not), |
2165 | ceph_cap_string(revoking)); | 2385 | ceph_cap_string(revoking)); |
2166 | if ((revoking & not) == 0) { | 2386 | if ((revoking & not) == 0) { |
2387 | if (!snap_rwsem_locked && | ||
2388 | !ci->i_head_snapc && | ||
2389 | (need & CEPH_CAP_FILE_WR)) { | ||
2390 | if (!down_read_trylock(&mdsc->snap_rwsem)) { | ||
2391 | /* | ||
2392 | * we can not call down_read() when | ||
2393 | * task isn't in TASK_RUNNING state | ||
2394 | */ | ||
2395 | if (nonblock) { | ||
2396 | *err = -EAGAIN; | ||
2397 | ret = 1; | ||
2398 | goto out_unlock; | ||
2399 | } | ||
2400 | |||
2401 | spin_unlock(&ci->i_ceph_lock); | ||
2402 | down_read(&mdsc->snap_rwsem); | ||
2403 | snap_rwsem_locked = true; | ||
2404 | goto again; | ||
2405 | } | ||
2406 | snap_rwsem_locked = true; | ||
2407 | } | ||
2167 | *got = need | (have & want); | 2408 | *got = need | (have & want); |
2168 | __take_cap_refs(ci, *got); | 2409 | __take_cap_refs(ci, *got, true); |
2169 | ret = 1; | 2410 | ret = 1; |
2170 | } | 2411 | } |
2171 | } else { | 2412 | } else { |
@@ -2189,6 +2430,8 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, | |||
2189 | } | 2430 | } |
2190 | out_unlock: | 2431 | out_unlock: |
2191 | spin_unlock(&ci->i_ceph_lock); | 2432 | spin_unlock(&ci->i_ceph_lock); |
2433 | if (snap_rwsem_locked) | ||
2434 | up_read(&mdsc->snap_rwsem); | ||
2192 | 2435 | ||
2193 | dout("get_cap_refs %p ret %d got %s\n", inode, | 2436 | dout("get_cap_refs %p ret %d got %s\n", inode, |
2194 | ret, ceph_cap_string(*got)); | 2437 | ret, ceph_cap_string(*got)); |
@@ -2231,50 +2474,70 @@ static void check_max_size(struct inode *inode, loff_t endoff) | |||
2231 | int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, | 2474 | int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, |
2232 | loff_t endoff, int *got, struct page **pinned_page) | 2475 | loff_t endoff, int *got, struct page **pinned_page) |
2233 | { | 2476 | { |
2234 | int _got, check_max, ret, err = 0; | 2477 | int _got, ret, err = 0; |
2235 | 2478 | ||
2236 | retry: | 2479 | ret = ceph_pool_perm_check(ci, need); |
2237 | if (endoff > 0) | ||
2238 | check_max_size(&ci->vfs_inode, endoff); | ||
2239 | _got = 0; | ||
2240 | check_max = 0; | ||
2241 | ret = wait_event_interruptible(ci->i_cap_wq, | ||
2242 | try_get_cap_refs(ci, need, want, endoff, | ||
2243 | &_got, &check_max, &err)); | ||
2244 | if (err) | ||
2245 | ret = err; | ||
2246 | if (ret < 0) | 2480 | if (ret < 0) |
2247 | return ret; | 2481 | return ret; |
2248 | 2482 | ||
2249 | if (check_max) | 2483 | while (true) { |
2250 | goto retry; | 2484 | if (endoff > 0) |
2485 | check_max_size(&ci->vfs_inode, endoff); | ||
2251 | 2486 | ||
2252 | if (ci->i_inline_version != CEPH_INLINE_NONE && | 2487 | err = 0; |
2253 | (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && | ||
2254 | i_size_read(&ci->vfs_inode) > 0) { | ||
2255 | struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0); | ||
2256 | if (page) { | ||
2257 | if (PageUptodate(page)) { | ||
2258 | *pinned_page = page; | ||
2259 | goto out; | ||
2260 | } | ||
2261 | page_cache_release(page); | ||
2262 | } | ||
2263 | /* | ||
2264 | * drop cap refs first because getattr while holding | ||
2265 | * caps refs can cause deadlock. | ||
2266 | */ | ||
2267 | ceph_put_cap_refs(ci, _got); | ||
2268 | _got = 0; | 2488 | _got = 0; |
2489 | ret = try_get_cap_refs(ci, need, want, endoff, | ||
2490 | false, &_got, &err); | ||
2491 | if (ret) { | ||
2492 | if (err == -EAGAIN) | ||
2493 | continue; | ||
2494 | if (err < 0) | ||
2495 | return err; | ||
2496 | } else { | ||
2497 | ret = wait_event_interruptible(ci->i_cap_wq, | ||
2498 | try_get_cap_refs(ci, need, want, endoff, | ||
2499 | true, &_got, &err)); | ||
2500 | if (err == -EAGAIN) | ||
2501 | continue; | ||
2502 | if (err < 0) | ||
2503 | ret = err; | ||
2504 | if (ret < 0) | ||
2505 | return ret; | ||
2506 | } | ||
2269 | 2507 | ||
2270 | /* getattr request will bring inline data into page cache */ | 2508 | if (ci->i_inline_version != CEPH_INLINE_NONE && |
2271 | ret = __ceph_do_getattr(&ci->vfs_inode, NULL, | 2509 | (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && |
2272 | CEPH_STAT_CAP_INLINE_DATA, true); | 2510 | i_size_read(&ci->vfs_inode) > 0) { |
2273 | if (ret < 0) | 2511 | struct page *page = |
2274 | return ret; | 2512 | find_get_page(ci->vfs_inode.i_mapping, 0); |
2275 | goto retry; | 2513 | if (page) { |
2514 | if (PageUptodate(page)) { | ||
2515 | *pinned_page = page; | ||
2516 | break; | ||
2517 | } | ||
2518 | page_cache_release(page); | ||
2519 | } | ||
2520 | /* | ||
2521 | * drop cap refs first because getattr while | ||
2522 | * holding caps refs can cause deadlock. | ||
2523 | */ | ||
2524 | ceph_put_cap_refs(ci, _got); | ||
2525 | _got = 0; | ||
2526 | |||
2527 | /* | ||
2528 | * getattr request will bring inline data into | ||
2529 | * page cache | ||
2530 | */ | ||
2531 | ret = __ceph_do_getattr(&ci->vfs_inode, NULL, | ||
2532 | CEPH_STAT_CAP_INLINE_DATA, | ||
2533 | true); | ||
2534 | if (ret < 0) | ||
2535 | return ret; | ||
2536 | continue; | ||
2537 | } | ||
2538 | break; | ||
2276 | } | 2539 | } |
2277 | out: | 2540 | |
2278 | *got = _got; | 2541 | *got = _got; |
2279 | return 0; | 2542 | return 0; |
2280 | } | 2543 | } |
@@ -2286,10 +2549,31 @@ out: | |||
2286 | void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) | 2549 | void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) |
2287 | { | 2550 | { |
2288 | spin_lock(&ci->i_ceph_lock); | 2551 | spin_lock(&ci->i_ceph_lock); |
2289 | __take_cap_refs(ci, caps); | 2552 | __take_cap_refs(ci, caps, false); |
2290 | spin_unlock(&ci->i_ceph_lock); | 2553 | spin_unlock(&ci->i_ceph_lock); |
2291 | } | 2554 | } |
2292 | 2555 | ||
2556 | |||
2557 | /* | ||
2558 | * drop cap_snap that is not associated with any snapshot. | ||
2559 | * we don't need to send FLUSHSNAP message for it. | ||
2560 | */ | ||
2561 | static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap) | ||
2562 | { | ||
2563 | if (!capsnap->need_flush && | ||
2564 | !capsnap->writing && !capsnap->dirty_pages) { | ||
2565 | |||
2566 | dout("dropping cap_snap %p follows %llu\n", | ||
2567 | capsnap, capsnap->follows); | ||
2568 | ceph_put_snap_context(capsnap->context); | ||
2569 | list_del(&capsnap->ci_item); | ||
2570 | list_del(&capsnap->flushing_item); | ||
2571 | ceph_put_cap_snap(capsnap); | ||
2572 | return 1; | ||
2573 | } | ||
2574 | return 0; | ||
2575 | } | ||
2576 | |||
2293 | /* | 2577 | /* |
2294 | * Release cap refs. | 2578 | * Release cap refs. |
2295 | * | 2579 | * |
@@ -2303,7 +2587,6 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) | |||
2303 | { | 2587 | { |
2304 | struct inode *inode = &ci->vfs_inode; | 2588 | struct inode *inode = &ci->vfs_inode; |
2305 | int last = 0, put = 0, flushsnaps = 0, wake = 0; | 2589 | int last = 0, put = 0, flushsnaps = 0, wake = 0; |
2306 | struct ceph_cap_snap *capsnap; | ||
2307 | 2590 | ||
2308 | spin_lock(&ci->i_ceph_lock); | 2591 | spin_lock(&ci->i_ceph_lock); |
2309 | if (had & CEPH_CAP_PIN) | 2592 | if (had & CEPH_CAP_PIN) |
@@ -2325,17 +2608,24 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) | |||
2325 | if (had & CEPH_CAP_FILE_WR) | 2608 | if (had & CEPH_CAP_FILE_WR) |
2326 | if (--ci->i_wr_ref == 0) { | 2609 | if (--ci->i_wr_ref == 0) { |
2327 | last++; | 2610 | last++; |
2328 | if (!list_empty(&ci->i_cap_snaps)) { | 2611 | if (__ceph_have_pending_cap_snap(ci)) { |
2329 | capsnap = list_first_entry(&ci->i_cap_snaps, | 2612 | struct ceph_cap_snap *capsnap = |
2330 | struct ceph_cap_snap, | 2613 | list_last_entry(&ci->i_cap_snaps, |
2331 | ci_item); | 2614 | struct ceph_cap_snap, |
2332 | if (capsnap->writing) { | 2615 | ci_item); |
2333 | capsnap->writing = 0; | 2616 | capsnap->writing = 0; |
2334 | flushsnaps = | 2617 | if (ceph_try_drop_cap_snap(capsnap)) |
2335 | __ceph_finish_cap_snap(ci, | 2618 | put++; |
2336 | capsnap); | 2619 | else if (__ceph_finish_cap_snap(ci, capsnap)) |
2337 | wake = 1; | 2620 | flushsnaps = 1; |
2338 | } | 2621 | wake = 1; |
2622 | } | ||
2623 | if (ci->i_wrbuffer_ref_head == 0 && | ||
2624 | ci->i_dirty_caps == 0 && | ||
2625 | ci->i_flushing_caps == 0) { | ||
2626 | BUG_ON(!ci->i_head_snapc); | ||
2627 | ceph_put_snap_context(ci->i_head_snapc); | ||
2628 | ci->i_head_snapc = NULL; | ||
2339 | } | 2629 | } |
2340 | /* see comment in __ceph_remove_cap() */ | 2630 | /* see comment in __ceph_remove_cap() */ |
2341 | if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) | 2631 | if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) |
@@ -2352,7 +2642,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) | |||
2352 | ceph_flush_snaps(ci); | 2642 | ceph_flush_snaps(ci); |
2353 | if (wake) | 2643 | if (wake) |
2354 | wake_up_all(&ci->i_cap_wq); | 2644 | wake_up_all(&ci->i_cap_wq); |
2355 | if (put) | 2645 | while (put-- > 0) |
2356 | iput(inode); | 2646 | iput(inode); |
2357 | } | 2647 | } |
2358 | 2648 | ||
@@ -2380,7 +2670,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, | |||
2380 | if (ci->i_head_snapc == snapc) { | 2670 | if (ci->i_head_snapc == snapc) { |
2381 | ci->i_wrbuffer_ref_head -= nr; | 2671 | ci->i_wrbuffer_ref_head -= nr; |
2382 | if (ci->i_wrbuffer_ref_head == 0 && | 2672 | if (ci->i_wrbuffer_ref_head == 0 && |
2383 | ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { | 2673 | ci->i_wr_ref == 0 && |
2674 | ci->i_dirty_caps == 0 && | ||
2675 | ci->i_flushing_caps == 0) { | ||
2384 | BUG_ON(!ci->i_head_snapc); | 2676 | BUG_ON(!ci->i_head_snapc); |
2385 | ceph_put_snap_context(ci->i_head_snapc); | 2677 | ceph_put_snap_context(ci->i_head_snapc); |
2386 | ci->i_head_snapc = NULL; | 2678 | ci->i_head_snapc = NULL; |
@@ -2401,25 +2693,15 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, | |||
2401 | capsnap->dirty_pages -= nr; | 2693 | capsnap->dirty_pages -= nr; |
2402 | if (capsnap->dirty_pages == 0) { | 2694 | if (capsnap->dirty_pages == 0) { |
2403 | complete_capsnap = 1; | 2695 | complete_capsnap = 1; |
2404 | if (capsnap->dirty == 0) | 2696 | drop_capsnap = ceph_try_drop_cap_snap(capsnap); |
2405 | /* cap writeback completed before we created | ||
2406 | * the cap_snap; no FLUSHSNAP is needed */ | ||
2407 | drop_capsnap = 1; | ||
2408 | } | 2697 | } |
2409 | dout("put_wrbuffer_cap_refs on %p cap_snap %p " | 2698 | dout("put_wrbuffer_cap_refs on %p cap_snap %p " |
2410 | " snap %lld %d/%d -> %d/%d %s%s%s\n", | 2699 | " snap %lld %d/%d -> %d/%d %s%s\n", |
2411 | inode, capsnap, capsnap->context->seq, | 2700 | inode, capsnap, capsnap->context->seq, |
2412 | ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, | 2701 | ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, |
2413 | ci->i_wrbuffer_ref, capsnap->dirty_pages, | 2702 | ci->i_wrbuffer_ref, capsnap->dirty_pages, |
2414 | last ? " (wrbuffer last)" : "", | 2703 | last ? " (wrbuffer last)" : "", |
2415 | complete_capsnap ? " (complete capsnap)" : "", | 2704 | complete_capsnap ? " (complete capsnap)" : ""); |
2416 | drop_capsnap ? " (drop capsnap)" : ""); | ||
2417 | if (drop_capsnap) { | ||
2418 | ceph_put_snap_context(capsnap->context); | ||
2419 | list_del(&capsnap->ci_item); | ||
2420 | list_del(&capsnap->flushing_item); | ||
2421 | ceph_put_cap_snap(capsnap); | ||
2422 | } | ||
2423 | } | 2705 | } |
2424 | 2706 | ||
2425 | spin_unlock(&ci->i_ceph_lock); | 2707 | spin_unlock(&ci->i_ceph_lock); |
@@ -2526,7 +2808,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, | |||
2526 | * try to invalidate (once). (If there are dirty buffers, we | 2808 | * try to invalidate (once). (If there are dirty buffers, we |
2527 | * will invalidate _after_ writeback.) | 2809 | * will invalidate _after_ writeback.) |
2528 | */ | 2810 | */ |
2529 | if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && | 2811 | if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ |
2812 | ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && | ||
2530 | (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && | 2813 | (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && |
2531 | !ci->i_wrbuffer_ref) { | 2814 | !ci->i_wrbuffer_ref) { |
2532 | if (try_nonblocking_invalidate(inode)) { | 2815 | if (try_nonblocking_invalidate(inode)) { |
@@ -2732,16 +3015,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, | |||
2732 | { | 3015 | { |
2733 | struct ceph_inode_info *ci = ceph_inode(inode); | 3016 | struct ceph_inode_info *ci = ceph_inode(inode); |
2734 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 3017 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
3018 | struct ceph_cap_flush *cf; | ||
3019 | struct rb_node *n; | ||
3020 | LIST_HEAD(to_remove); | ||
2735 | unsigned seq = le32_to_cpu(m->seq); | 3021 | unsigned seq = le32_to_cpu(m->seq); |
2736 | int dirty = le32_to_cpu(m->dirty); | 3022 | int dirty = le32_to_cpu(m->dirty); |
2737 | int cleaned = 0; | 3023 | int cleaned = 0; |
2738 | int drop = 0; | 3024 | int drop = 0; |
2739 | int i; | ||
2740 | 3025 | ||
2741 | for (i = 0; i < CEPH_CAP_BITS; i++) | 3026 | n = rb_first(&ci->i_cap_flush_tree); |
2742 | if ((dirty & (1 << i)) && | 3027 | while (n) { |
2743 | (u16)flush_tid == ci->i_cap_flush_tid[i]) | 3028 | cf = rb_entry(n, struct ceph_cap_flush, i_node); |
2744 | cleaned |= 1 << i; | 3029 | n = rb_next(&cf->i_node); |
3030 | if (cf->tid == flush_tid) | ||
3031 | cleaned = cf->caps; | ||
3032 | if (cf->tid <= flush_tid) { | ||
3033 | rb_erase(&cf->i_node, &ci->i_cap_flush_tree); | ||
3034 | list_add_tail(&cf->list, &to_remove); | ||
3035 | } else { | ||
3036 | cleaned &= ~cf->caps; | ||
3037 | if (!cleaned) | ||
3038 | break; | ||
3039 | } | ||
3040 | } | ||
2745 | 3041 | ||
2746 | dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," | 3042 | dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," |
2747 | " flushing %s -> %s\n", | 3043 | " flushing %s -> %s\n", |
@@ -2749,12 +3045,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, | |||
2749 | ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), | 3045 | ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), |
2750 | ceph_cap_string(ci->i_flushing_caps & ~cleaned)); | 3046 | ceph_cap_string(ci->i_flushing_caps & ~cleaned)); |
2751 | 3047 | ||
2752 | if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) | 3048 | if (list_empty(&to_remove) && !cleaned) |
2753 | goto out; | 3049 | goto out; |
2754 | 3050 | ||
2755 | ci->i_flushing_caps &= ~cleaned; | 3051 | ci->i_flushing_caps &= ~cleaned; |
2756 | 3052 | ||
2757 | spin_lock(&mdsc->cap_dirty_lock); | 3053 | spin_lock(&mdsc->cap_dirty_lock); |
3054 | |||
3055 | if (!list_empty(&to_remove)) { | ||
3056 | list_for_each_entry(cf, &to_remove, list) | ||
3057 | rb_erase(&cf->g_node, &mdsc->cap_flush_tree); | ||
3058 | |||
3059 | n = rb_first(&mdsc->cap_flush_tree); | ||
3060 | cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; | ||
3061 | if (!cf || cf->tid > flush_tid) | ||
3062 | wake_up_all(&mdsc->cap_flushing_wq); | ||
3063 | } | ||
3064 | |||
2758 | if (ci->i_flushing_caps == 0) { | 3065 | if (ci->i_flushing_caps == 0) { |
2759 | list_del_init(&ci->i_flushing_item); | 3066 | list_del_init(&ci->i_flushing_item); |
2760 | if (!list_empty(&session->s_cap_flushing)) | 3067 | if (!list_empty(&session->s_cap_flushing)) |
@@ -2764,14 +3071,14 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, | |||
2764 | struct ceph_inode_info, | 3071 | struct ceph_inode_info, |
2765 | i_flushing_item)->vfs_inode); | 3072 | i_flushing_item)->vfs_inode); |
2766 | mdsc->num_cap_flushing--; | 3073 | mdsc->num_cap_flushing--; |
2767 | wake_up_all(&mdsc->cap_flushing_wq); | ||
2768 | dout(" inode %p now !flushing\n", inode); | 3074 | dout(" inode %p now !flushing\n", inode); |
2769 | 3075 | ||
2770 | if (ci->i_dirty_caps == 0) { | 3076 | if (ci->i_dirty_caps == 0) { |
2771 | dout(" inode %p now clean\n", inode); | 3077 | dout(" inode %p now clean\n", inode); |
2772 | BUG_ON(!list_empty(&ci->i_dirty_item)); | 3078 | BUG_ON(!list_empty(&ci->i_dirty_item)); |
2773 | drop = 1; | 3079 | drop = 1; |
2774 | if (ci->i_wrbuffer_ref_head == 0) { | 3080 | if (ci->i_wr_ref == 0 && |
3081 | ci->i_wrbuffer_ref_head == 0) { | ||
2775 | BUG_ON(!ci->i_head_snapc); | 3082 | BUG_ON(!ci->i_head_snapc); |
2776 | ceph_put_snap_context(ci->i_head_snapc); | 3083 | ceph_put_snap_context(ci->i_head_snapc); |
2777 | ci->i_head_snapc = NULL; | 3084 | ci->i_head_snapc = NULL; |
@@ -2785,6 +3092,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, | |||
2785 | 3092 | ||
2786 | out: | 3093 | out: |
2787 | spin_unlock(&ci->i_ceph_lock); | 3094 | spin_unlock(&ci->i_ceph_lock); |
3095 | |||
3096 | while (!list_empty(&to_remove)) { | ||
3097 | cf = list_first_entry(&to_remove, | ||
3098 | struct ceph_cap_flush, list); | ||
3099 | list_del(&cf->list); | ||
3100 | ceph_free_cap_flush(cf); | ||
3101 | } | ||
2788 | if (drop) | 3102 | if (drop) |
2789 | iput(inode); | 3103 | iput(inode); |
2790 | } | 3104 | } |
@@ -2800,6 +3114,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, | |||
2800 | struct ceph_mds_session *session) | 3114 | struct ceph_mds_session *session) |
2801 | { | 3115 | { |
2802 | struct ceph_inode_info *ci = ceph_inode(inode); | 3116 | struct ceph_inode_info *ci = ceph_inode(inode); |
3117 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | ||
2803 | u64 follows = le64_to_cpu(m->snap_follows); | 3118 | u64 follows = le64_to_cpu(m->snap_follows); |
2804 | struct ceph_cap_snap *capsnap; | 3119 | struct ceph_cap_snap *capsnap; |
2805 | int drop = 0; | 3120 | int drop = 0; |
@@ -2823,6 +3138,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, | |||
2823 | list_del(&capsnap->ci_item); | 3138 | list_del(&capsnap->ci_item); |
2824 | list_del(&capsnap->flushing_item); | 3139 | list_del(&capsnap->flushing_item); |
2825 | ceph_put_cap_snap(capsnap); | 3140 | ceph_put_cap_snap(capsnap); |
3141 | wake_up_all(&mdsc->cap_flushing_wq); | ||
2826 | drop = 1; | 3142 | drop = 1; |
2827 | break; | 3143 | break; |
2828 | } else { | 3144 | } else { |
@@ -2971,7 +3287,6 @@ retry: | |||
2971 | mutex_lock_nested(&session->s_mutex, | 3287 | mutex_lock_nested(&session->s_mutex, |
2972 | SINGLE_DEPTH_NESTING); | 3288 | SINGLE_DEPTH_NESTING); |
2973 | } | 3289 | } |
2974 | ceph_add_cap_releases(mdsc, tsession); | ||
2975 | new_cap = ceph_get_cap(mdsc, NULL); | 3290 | new_cap = ceph_get_cap(mdsc, NULL); |
2976 | } else { | 3291 | } else { |
2977 | WARN_ON(1); | 3292 | WARN_ON(1); |
@@ -3167,16 +3482,20 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
3167 | dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, | 3482 | dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, |
3168 | (unsigned)seq); | 3483 | (unsigned)seq); |
3169 | 3484 | ||
3170 | if (op == CEPH_CAP_OP_IMPORT) | ||
3171 | ceph_add_cap_releases(mdsc, session); | ||
3172 | |||
3173 | if (!inode) { | 3485 | if (!inode) { |
3174 | dout(" i don't have ino %llx\n", vino.ino); | 3486 | dout(" i don't have ino %llx\n", vino.ino); |
3175 | 3487 | ||
3176 | if (op == CEPH_CAP_OP_IMPORT) { | 3488 | if (op == CEPH_CAP_OP_IMPORT) { |
3489 | cap = ceph_get_cap(mdsc, NULL); | ||
3490 | cap->cap_ino = vino.ino; | ||
3491 | cap->queue_release = 1; | ||
3492 | cap->cap_id = cap_id; | ||
3493 | cap->mseq = mseq; | ||
3494 | cap->seq = seq; | ||
3177 | spin_lock(&session->s_cap_lock); | 3495 | spin_lock(&session->s_cap_lock); |
3178 | __queue_cap_release(session, vino.ino, cap_id, | 3496 | list_add_tail(&cap->session_caps, |
3179 | mseq, seq); | 3497 | &session->s_cap_releases); |
3498 | session->s_num_cap_releases++; | ||
3180 | spin_unlock(&session->s_cap_lock); | 3499 | spin_unlock(&session->s_cap_lock); |
3181 | } | 3500 | } |
3182 | goto flush_cap_releases; | 3501 | goto flush_cap_releases; |
@@ -3252,11 +3571,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, | |||
3252 | 3571 | ||
3253 | flush_cap_releases: | 3572 | flush_cap_releases: |
3254 | /* | 3573 | /* |
3255 | * send any full release message to try to move things | 3574 | * send any cap release message to try to move things |
3256 | * along for the mds (who clearly thinks we still have this | 3575 | * along for the mds (who clearly thinks we still have this |
3257 | * cap). | 3576 | * cap). |
3258 | */ | 3577 | */ |
3259 | ceph_add_cap_releases(mdsc, session); | ||
3260 | ceph_send_cap_releases(mdsc, session); | 3578 | ceph_send_cap_releases(mdsc, session); |
3261 | 3579 | ||
3262 | done: | 3580 | done: |
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4248307fea90..9314b4ea2375 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c | |||
@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry) | |||
38 | if (dentry->d_fsdata) | 38 | if (dentry->d_fsdata) |
39 | return 0; | 39 | return 0; |
40 | 40 | ||
41 | di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); | 41 | di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO); |
42 | if (!di) | 42 | if (!di) |
43 | return -ENOMEM; /* oh well */ | 43 | return -ENOMEM; /* oh well */ |
44 | 44 | ||
@@ -107,6 +107,27 @@ static int fpos_cmp(loff_t l, loff_t r) | |||
107 | } | 107 | } |
108 | 108 | ||
109 | /* | 109 | /* |
110 | * make note of the last dentry we read, so we can | ||
111 | * continue at the same lexicographical point, | ||
112 | * regardless of what dir changes take place on the | ||
113 | * server. | ||
114 | */ | ||
115 | static int note_last_dentry(struct ceph_file_info *fi, const char *name, | ||
116 | int len, unsigned next_offset) | ||
117 | { | ||
118 | char *buf = kmalloc(len+1, GFP_KERNEL); | ||
119 | if (!buf) | ||
120 | return -ENOMEM; | ||
121 | kfree(fi->last_name); | ||
122 | fi->last_name = buf; | ||
123 | memcpy(fi->last_name, name, len); | ||
124 | fi->last_name[len] = 0; | ||
125 | fi->next_offset = next_offset; | ||
126 | dout("note_last_dentry '%s'\n", fi->last_name); | ||
127 | return 0; | ||
128 | } | ||
129 | |||
130 | /* | ||
110 | * When possible, we try to satisfy a readdir by peeking at the | 131 | * When possible, we try to satisfy a readdir by peeking at the |
111 | * dcache. We make this work by carefully ordering dentries on | 132 | * dcache. We make this work by carefully ordering dentries on |
112 | * d_child when we initially get results back from the MDS, and | 133 | * d_child when we initially get results back from the MDS, and |
@@ -123,123 +144,113 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, | |||
123 | struct ceph_file_info *fi = file->private_data; | 144 | struct ceph_file_info *fi = file->private_data; |
124 | struct dentry *parent = file->f_path.dentry; | 145 | struct dentry *parent = file->f_path.dentry; |
125 | struct inode *dir = d_inode(parent); | 146 | struct inode *dir = d_inode(parent); |
126 | struct list_head *p; | 147 | struct dentry *dentry, *last = NULL; |
127 | struct dentry *dentry, *last; | ||
128 | struct ceph_dentry_info *di; | 148 | struct ceph_dentry_info *di; |
149 | unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *); | ||
129 | int err = 0; | 150 | int err = 0; |
151 | loff_t ptr_pos = 0; | ||
152 | struct ceph_readdir_cache_control cache_ctl = {}; | ||
130 | 153 | ||
131 | /* claim ref on last dentry we returned */ | 154 | dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); |
132 | last = fi->dentry; | ||
133 | fi->dentry = NULL; | ||
134 | |||
135 | dout("__dcache_readdir %p v%u at %llu (last %p)\n", | ||
136 | dir, shared_gen, ctx->pos, last); | ||
137 | 155 | ||
138 | spin_lock(&parent->d_lock); | 156 | /* we can calculate cache index for the first dirfrag */ |
139 | 157 | if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) { | |
140 | /* start at beginning? */ | 158 | cache_ctl.index = fpos_off(ctx->pos) - 2; |
141 | if (ctx->pos == 2 || last == NULL || | 159 | BUG_ON(cache_ctl.index < 0); |
142 | fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { | 160 | ptr_pos = cache_ctl.index * sizeof(struct dentry *); |
143 | if (list_empty(&parent->d_subdirs)) | ||
144 | goto out_unlock; | ||
145 | p = parent->d_subdirs.prev; | ||
146 | dout(" initial p %p/%p\n", p->prev, p->next); | ||
147 | } else { | ||
148 | p = last->d_child.prev; | ||
149 | } | 161 | } |
150 | 162 | ||
151 | more: | 163 | while (true) { |
152 | dentry = list_entry(p, struct dentry, d_child); | 164 | pgoff_t pgoff; |
153 | di = ceph_dentry(dentry); | 165 | bool emit_dentry; |
154 | while (1) { | 166 | |
155 | dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, | 167 | if (ptr_pos >= i_size_read(dir)) { |
156 | d_unhashed(dentry) ? "!hashed" : "hashed", | ||
157 | parent->d_subdirs.prev, parent->d_subdirs.next); | ||
158 | if (p == &parent->d_subdirs) { | ||
159 | fi->flags |= CEPH_F_ATEND; | 168 | fi->flags |= CEPH_F_ATEND; |
160 | goto out_unlock; | 169 | err = 0; |
170 | break; | ||
171 | } | ||
172 | |||
173 | err = -EAGAIN; | ||
174 | pgoff = ptr_pos >> PAGE_CACHE_SHIFT; | ||
175 | if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) { | ||
176 | ceph_readdir_cache_release(&cache_ctl); | ||
177 | cache_ctl.page = find_lock_page(&dir->i_data, pgoff); | ||
178 | if (!cache_ctl.page) { | ||
179 | dout(" page %lu not found\n", pgoff); | ||
180 | break; | ||
181 | } | ||
182 | /* reading/filling the cache are serialized by | ||
183 | * i_mutex, no need to use page lock */ | ||
184 | unlock_page(cache_ctl.page); | ||
185 | cache_ctl.dentries = kmap(cache_ctl.page); | ||
161 | } | 186 | } |
162 | spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); | 187 | |
188 | rcu_read_lock(); | ||
189 | spin_lock(&parent->d_lock); | ||
190 | /* check i_size again here, because empty directory can be | ||
191 | * marked as complete while not holding the i_mutex. */ | ||
192 | if (ceph_dir_is_complete_ordered(dir) && | ||
193 | ptr_pos < i_size_read(dir)) | ||
194 | dentry = cache_ctl.dentries[cache_ctl.index % nsize]; | ||
195 | else | ||
196 | dentry = NULL; | ||
197 | spin_unlock(&parent->d_lock); | ||
198 | if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) | ||
199 | dentry = NULL; | ||
200 | rcu_read_unlock(); | ||
201 | if (!dentry) | ||
202 | break; | ||
203 | |||
204 | emit_dentry = false; | ||
205 | di = ceph_dentry(dentry); | ||
206 | spin_lock(&dentry->d_lock); | ||
163 | if (di->lease_shared_gen == shared_gen && | 207 | if (di->lease_shared_gen == shared_gen && |
164 | !d_unhashed(dentry) && d_really_is_positive(dentry) && | 208 | d_really_is_positive(dentry) && |
165 | ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && | 209 | ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && |
166 | ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && | 210 | ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && |
167 | fpos_cmp(ctx->pos, di->offset) <= 0) | 211 | fpos_cmp(ctx->pos, di->offset) <= 0) { |
168 | break; | 212 | emit_dentry = true; |
169 | dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry, | 213 | } |
170 | dentry, di->offset, | ||
171 | ctx->pos, d_unhashed(dentry) ? " unhashed" : "", | ||
172 | !d_inode(dentry) ? " null" : ""); | ||
173 | spin_unlock(&dentry->d_lock); | 214 | spin_unlock(&dentry->d_lock); |
174 | p = p->prev; | ||
175 | dentry = list_entry(p, struct dentry, d_child); | ||
176 | di = ceph_dentry(dentry); | ||
177 | } | ||
178 | |||
179 | dget_dlock(dentry); | ||
180 | spin_unlock(&dentry->d_lock); | ||
181 | spin_unlock(&parent->d_lock); | ||
182 | 215 | ||
183 | /* make sure a dentry wasn't dropped while we didn't have parent lock */ | 216 | if (emit_dentry) { |
184 | if (!ceph_dir_is_complete_ordered(dir)) { | 217 | dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, |
185 | dout(" lost dir complete on %p; falling back to mds\n", dir); | 218 | dentry, dentry, d_inode(dentry)); |
186 | dput(dentry); | 219 | ctx->pos = di->offset; |
187 | err = -EAGAIN; | 220 | if (!dir_emit(ctx, dentry->d_name.name, |
188 | goto out; | 221 | dentry->d_name.len, |
189 | } | 222 | ceph_translate_ino(dentry->d_sb, |
223 | d_inode(dentry)->i_ino), | ||
224 | d_inode(dentry)->i_mode >> 12)) { | ||
225 | dput(dentry); | ||
226 | err = 0; | ||
227 | break; | ||
228 | } | ||
229 | ctx->pos++; | ||
190 | 230 | ||
191 | dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, | 231 | if (last) |
192 | dentry, dentry, d_inode(dentry)); | 232 | dput(last); |
193 | if (!dir_emit(ctx, dentry->d_name.name, | 233 | last = dentry; |
194 | dentry->d_name.len, | 234 | } else { |
195 | ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino), | 235 | dput(dentry); |
196 | d_inode(dentry)->i_mode >> 12)) { | ||
197 | if (last) { | ||
198 | /* remember our position */ | ||
199 | fi->dentry = last; | ||
200 | fi->next_offset = fpos_off(di->offset); | ||
201 | } | 236 | } |
202 | dput(dentry); | ||
203 | return 0; | ||
204 | } | ||
205 | |||
206 | ctx->pos = di->offset + 1; | ||
207 | 237 | ||
208 | if (last) | 238 | cache_ctl.index++; |
209 | dput(last); | 239 | ptr_pos += sizeof(struct dentry *); |
210 | last = dentry; | 240 | } |
211 | 241 | ceph_readdir_cache_release(&cache_ctl); | |
212 | spin_lock(&parent->d_lock); | 242 | if (last) { |
213 | p = p->prev; /* advance to next dentry */ | 243 | int ret; |
214 | goto more; | 244 | di = ceph_dentry(last); |
215 | 245 | ret = note_last_dentry(fi, last->d_name.name, last->d_name.len, | |
216 | out_unlock: | 246 | fpos_off(di->offset) + 1); |
217 | spin_unlock(&parent->d_lock); | 247 | if (ret < 0) |
218 | out: | 248 | err = ret; |
219 | if (last) | ||
220 | dput(last); | 249 | dput(last); |
250 | } | ||
221 | return err; | 251 | return err; |
222 | } | 252 | } |
223 | 253 | ||
224 | /* | ||
225 | * make note of the last dentry we read, so we can | ||
226 | * continue at the same lexicographical point, | ||
227 | * regardless of what dir changes take place on the | ||
228 | * server. | ||
229 | */ | ||
230 | static int note_last_dentry(struct ceph_file_info *fi, const char *name, | ||
231 | int len) | ||
232 | { | ||
233 | kfree(fi->last_name); | ||
234 | fi->last_name = kmalloc(len+1, GFP_NOFS); | ||
235 | if (!fi->last_name) | ||
236 | return -ENOMEM; | ||
237 | memcpy(fi->last_name, name, len); | ||
238 | fi->last_name[len] = 0; | ||
239 | dout("note_last_dentry '%s'\n", fi->last_name); | ||
240 | return 0; | ||
241 | } | ||
242 | |||
243 | static int ceph_readdir(struct file *file, struct dir_context *ctx) | 254 | static int ceph_readdir(struct file *file, struct dir_context *ctx) |
244 | { | 255 | { |
245 | struct ceph_file_info *fi = file->private_data; | 256 | struct ceph_file_info *fi = file->private_data; |
@@ -280,8 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) | |||
280 | 291 | ||
281 | /* can we use the dcache? */ | 292 | /* can we use the dcache? */ |
282 | spin_lock(&ci->i_ceph_lock); | 293 | spin_lock(&ci->i_ceph_lock); |
283 | if ((ctx->pos == 2 || fi->dentry) && | 294 | if (ceph_test_mount_opt(fsc, DCACHE) && |
284 | ceph_test_mount_opt(fsc, DCACHE) && | ||
285 | !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && | 295 | !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && |
286 | ceph_snap(inode) != CEPH_SNAPDIR && | 296 | ceph_snap(inode) != CEPH_SNAPDIR && |
287 | __ceph_dir_is_complete_ordered(ci) && | 297 | __ceph_dir_is_complete_ordered(ci) && |
@@ -296,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) | |||
296 | } else { | 306 | } else { |
297 | spin_unlock(&ci->i_ceph_lock); | 307 | spin_unlock(&ci->i_ceph_lock); |
298 | } | 308 | } |
299 | if (fi->dentry) { | ||
300 | err = note_last_dentry(fi, fi->dentry->d_name.name, | ||
301 | fi->dentry->d_name.len); | ||
302 | if (err) | ||
303 | return err; | ||
304 | dput(fi->dentry); | ||
305 | fi->dentry = NULL; | ||
306 | } | ||
307 | 309 | ||
308 | /* proceed with a normal readdir */ | 310 | /* proceed with a normal readdir */ |
309 | |||
310 | if (ctx->pos == 2) { | ||
311 | /* note dir version at start of readdir so we can tell | ||
312 | * if any dentries get dropped */ | ||
313 | fi->dir_release_count = atomic_read(&ci->i_release_count); | ||
314 | fi->dir_ordered_count = ci->i_ordered_count; | ||
315 | } | ||
316 | |||
317 | more: | 311 | more: |
318 | /* do we have the correct frag content buffered? */ | 312 | /* do we have the correct frag content buffered? */ |
319 | if (fi->frag != frag || fi->last_readdir == NULL) { | 313 | if (fi->frag != frag || fi->last_readdir == NULL) { |
@@ -342,12 +336,15 @@ more: | |||
342 | req->r_direct_hash = ceph_frag_value(frag); | 336 | req->r_direct_hash = ceph_frag_value(frag); |
343 | req->r_direct_is_hash = true; | 337 | req->r_direct_is_hash = true; |
344 | if (fi->last_name) { | 338 | if (fi->last_name) { |
345 | req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); | 339 | req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); |
346 | if (!req->r_path2) { | 340 | if (!req->r_path2) { |
347 | ceph_mdsc_put_request(req); | 341 | ceph_mdsc_put_request(req); |
348 | return -ENOMEM; | 342 | return -ENOMEM; |
349 | } | 343 | } |
350 | } | 344 | } |
345 | req->r_dir_release_cnt = fi->dir_release_count; | ||
346 | req->r_dir_ordered_cnt = fi->dir_ordered_count; | ||
347 | req->r_readdir_cache_idx = fi->readdir_cache_idx; | ||
351 | req->r_readdir_offset = fi->next_offset; | 348 | req->r_readdir_offset = fi->next_offset; |
352 | req->r_args.readdir.frag = cpu_to_le32(frag); | 349 | req->r_args.readdir.frag = cpu_to_le32(frag); |
353 | 350 | ||
@@ -364,26 +361,38 @@ more: | |||
364 | (int)req->r_reply_info.dir_end, | 361 | (int)req->r_reply_info.dir_end, |
365 | (int)req->r_reply_info.dir_complete); | 362 | (int)req->r_reply_info.dir_complete); |
366 | 363 | ||
367 | if (!req->r_did_prepopulate) { | ||
368 | dout("readdir !did_prepopulate"); | ||
369 | /* preclude from marking dir complete */ | ||
370 | fi->dir_release_count--; | ||
371 | } | ||
372 | 364 | ||
373 | /* note next offset and last dentry name */ | 365 | /* note next offset and last dentry name */ |
374 | rinfo = &req->r_reply_info; | 366 | rinfo = &req->r_reply_info; |
375 | if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { | 367 | if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { |
376 | frag = le32_to_cpu(rinfo->dir_dir->frag); | 368 | frag = le32_to_cpu(rinfo->dir_dir->frag); |
377 | if (ceph_frag_is_leftmost(frag)) | 369 | off = req->r_readdir_offset; |
378 | fi->next_offset = 2; | 370 | fi->next_offset = off; |
379 | else | ||
380 | fi->next_offset = 0; | ||
381 | off = fi->next_offset; | ||
382 | } | 371 | } |
372 | |||
383 | fi->frag = frag; | 373 | fi->frag = frag; |
384 | fi->offset = fi->next_offset; | 374 | fi->offset = fi->next_offset; |
385 | fi->last_readdir = req; | 375 | fi->last_readdir = req; |
386 | 376 | ||
377 | if (req->r_did_prepopulate) { | ||
378 | fi->readdir_cache_idx = req->r_readdir_cache_idx; | ||
379 | if (fi->readdir_cache_idx < 0) { | ||
380 | /* preclude from marking dir ordered */ | ||
381 | fi->dir_ordered_count = 0; | ||
382 | } else if (ceph_frag_is_leftmost(frag) && off == 2) { | ||
383 | /* note dir version at start of readdir so | ||
384 | * we can tell if any dentries get dropped */ | ||
385 | fi->dir_release_count = req->r_dir_release_cnt; | ||
386 | fi->dir_ordered_count = req->r_dir_ordered_cnt; | ||
387 | } | ||
388 | } else { | ||
389 | dout("readdir !did_prepopulate"); | ||
390 | /* disable readdir cache */ | ||
391 | fi->readdir_cache_idx = -1; | ||
392 | /* preclude from marking dir complete */ | ||
393 | fi->dir_release_count = 0; | ||
394 | } | ||
395 | |||
387 | if (req->r_reply_info.dir_end) { | 396 | if (req->r_reply_info.dir_end) { |
388 | kfree(fi->last_name); | 397 | kfree(fi->last_name); |
389 | fi->last_name = NULL; | 398 | fi->last_name = NULL; |
@@ -394,10 +403,10 @@ more: | |||
394 | } else { | 403 | } else { |
395 | err = note_last_dentry(fi, | 404 | err = note_last_dentry(fi, |
396 | rinfo->dir_dname[rinfo->dir_nr-1], | 405 | rinfo->dir_dname[rinfo->dir_nr-1], |
397 | rinfo->dir_dname_len[rinfo->dir_nr-1]); | 406 | rinfo->dir_dname_len[rinfo->dir_nr-1], |
407 | fi->next_offset + rinfo->dir_nr); | ||
398 | if (err) | 408 | if (err) |
399 | return err; | 409 | return err; |
400 | fi->next_offset += rinfo->dir_nr; | ||
401 | } | 410 | } |
402 | } | 411 | } |
403 | 412 | ||
@@ -453,16 +462,22 @@ more: | |||
453 | * were released during the whole readdir, and we should have | 462 | * were released during the whole readdir, and we should have |
454 | * the complete dir contents in our cache. | 463 | * the complete dir contents in our cache. |
455 | */ | 464 | */ |
456 | spin_lock(&ci->i_ceph_lock); | 465 | if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) { |
457 | if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { | 466 | spin_lock(&ci->i_ceph_lock); |
458 | if (ci->i_ordered_count == fi->dir_ordered_count) | 467 | if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) { |
459 | dout(" marking %p complete and ordered\n", inode); | 468 | dout(" marking %p complete and ordered\n", inode); |
460 | else | 469 | /* use i_size to track number of entries in |
470 | * readdir cache */ | ||
471 | BUG_ON(fi->readdir_cache_idx < 0); | ||
472 | i_size_write(inode, fi->readdir_cache_idx * | ||
473 | sizeof(struct dentry*)); | ||
474 | } else { | ||
461 | dout(" marking %p complete\n", inode); | 475 | dout(" marking %p complete\n", inode); |
476 | } | ||
462 | __ceph_dir_set_complete(ci, fi->dir_release_count, | 477 | __ceph_dir_set_complete(ci, fi->dir_release_count, |
463 | fi->dir_ordered_count); | 478 | fi->dir_ordered_count); |
479 | spin_unlock(&ci->i_ceph_lock); | ||
464 | } | 480 | } |
465 | spin_unlock(&ci->i_ceph_lock); | ||
466 | 481 | ||
467 | dout("readdir %p file %p done.\n", inode, file); | 482 | dout("readdir %p file %p done.\n", inode, file); |
468 | return 0; | 483 | return 0; |
@@ -476,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) | |||
476 | } | 491 | } |
477 | kfree(fi->last_name); | 492 | kfree(fi->last_name); |
478 | fi->last_name = NULL; | 493 | fi->last_name = NULL; |
494 | fi->dir_release_count = 0; | ||
495 | fi->readdir_cache_idx = -1; | ||
479 | if (ceph_frag_is_leftmost(frag)) | 496 | if (ceph_frag_is_leftmost(frag)) |
480 | fi->next_offset = 2; /* compensate for . and .. */ | 497 | fi->next_offset = 2; /* compensate for . and .. */ |
481 | else | 498 | else |
482 | fi->next_offset = 0; | 499 | fi->next_offset = 0; |
483 | if (fi->dentry) { | ||
484 | dput(fi->dentry); | ||
485 | fi->dentry = NULL; | ||
486 | } | ||
487 | fi->flags &= ~CEPH_F_ATEND; | 500 | fi->flags &= ~CEPH_F_ATEND; |
488 | } | 501 | } |
489 | 502 | ||
@@ -497,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) | |||
497 | mutex_lock(&inode->i_mutex); | 510 | mutex_lock(&inode->i_mutex); |
498 | retval = -EINVAL; | 511 | retval = -EINVAL; |
499 | switch (whence) { | 512 | switch (whence) { |
500 | case SEEK_END: | ||
501 | offset += inode->i_size + 2; /* FIXME */ | ||
502 | break; | ||
503 | case SEEK_CUR: | 513 | case SEEK_CUR: |
504 | offset += file->f_pos; | 514 | offset += file->f_pos; |
505 | case SEEK_SET: | 515 | case SEEK_SET: |
506 | break; | 516 | break; |
517 | case SEEK_END: | ||
518 | retval = -EOPNOTSUPP; | ||
507 | default: | 519 | default: |
508 | goto out; | 520 | goto out; |
509 | } | 521 | } |
@@ -516,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) | |||
516 | } | 528 | } |
517 | retval = offset; | 529 | retval = offset; |
518 | 530 | ||
519 | /* | ||
520 | * discard buffered readdir content on seekdir(0), or | ||
521 | * seek to new frag, or seek prior to current chunk. | ||
522 | */ | ||
523 | if (offset == 0 || | 531 | if (offset == 0 || |
524 | fpos_frag(offset) != fi->frag || | 532 | fpos_frag(offset) != fi->frag || |
525 | fpos_off(offset) < fi->offset) { | 533 | fpos_off(offset) < fi->offset) { |
534 | /* discard buffered readdir content on seekdir(0), or | ||
535 | * seek to new frag, or seek prior to current chunk */ | ||
526 | dout("dir_llseek dropping %p content\n", file); | 536 | dout("dir_llseek dropping %p content\n", file); |
527 | reset_readdir(fi, fpos_frag(offset)); | 537 | reset_readdir(fi, fpos_frag(offset)); |
538 | } else if (fpos_cmp(offset, old_offset) > 0) { | ||
539 | /* reset dir_release_count if we did a forward seek */ | ||
540 | fi->dir_release_count = 0; | ||
541 | fi->readdir_cache_idx = -1; | ||
528 | } | 542 | } |
529 | |||
530 | /* bump dir_release_count if we did a forward seek */ | ||
531 | if (fpos_cmp(offset, old_offset) > 0) | ||
532 | fi->dir_release_count--; | ||
533 | } | 543 | } |
534 | out: | 544 | out: |
535 | mutex_unlock(&inode->i_mutex); | 545 | mutex_unlock(&inode->i_mutex); |
@@ -764,7 +774,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, | |||
764 | err = PTR_ERR(req); | 774 | err = PTR_ERR(req); |
765 | goto out; | 775 | goto out; |
766 | } | 776 | } |
767 | req->r_path2 = kstrdup(dest, GFP_NOFS); | 777 | req->r_path2 = kstrdup(dest, GFP_KERNEL); |
768 | if (!req->r_path2) { | 778 | if (!req->r_path2) { |
769 | err = -ENOMEM; | 779 | err = -ENOMEM; |
770 | ceph_mdsc_put_request(req); | 780 | ceph_mdsc_put_request(req); |
@@ -985,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
985 | * to do it here. | 995 | * to do it here. |
986 | */ | 996 | */ |
987 | 997 | ||
998 | /* d_move screws up sibling dentries' offsets */ | ||
999 | ceph_dir_clear_complete(old_dir); | ||
1000 | ceph_dir_clear_complete(new_dir); | ||
1001 | |||
988 | d_move(old_dentry, new_dentry); | 1002 | d_move(old_dentry, new_dentry); |
989 | 1003 | ||
990 | /* ensure target dentry is invalidated, despite | 1004 | /* ensure target dentry is invalidated, despite |
991 | rehashing bug in vfs_rename_dir */ | 1005 | rehashing bug in vfs_rename_dir */ |
992 | ceph_invalidate_dentry_lease(new_dentry); | 1006 | ceph_invalidate_dentry_lease(new_dentry); |
993 | |||
994 | /* d_move screws up sibling dentries' offsets */ | ||
995 | ceph_dir_clear_complete(old_dir); | ||
996 | ceph_dir_clear_complete(new_dir); | ||
997 | |||
998 | } | 1007 | } |
999 | ceph_mdsc_put_request(req); | 1008 | ceph_mdsc_put_request(req); |
1000 | return err; | 1009 | return err; |
@@ -1189,7 +1198,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, | |||
1189 | return -EISDIR; | 1198 | return -EISDIR; |
1190 | 1199 | ||
1191 | if (!cf->dir_info) { | 1200 | if (!cf->dir_info) { |
1192 | cf->dir_info = kmalloc(bufsize, GFP_NOFS); | 1201 | cf->dir_info = kmalloc(bufsize, GFP_KERNEL); |
1193 | if (!cf->dir_info) | 1202 | if (!cf->dir_info) |
1194 | return -ENOMEM; | 1203 | return -ENOMEM; |
1195 | cf->dir_info_len = | 1204 | cf->dir_info_len = |
@@ -1224,66 +1233,6 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, | |||
1224 | } | 1233 | } |
1225 | 1234 | ||
1226 | /* | 1235 | /* |
1227 | * an fsync() on a dir will wait for any uncommitted directory | ||
1228 | * operations to commit. | ||
1229 | */ | ||
1230 | static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, | ||
1231 | int datasync) | ||
1232 | { | ||
1233 | struct inode *inode = file_inode(file); | ||
1234 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
1235 | struct list_head *head = &ci->i_unsafe_dirops; | ||
1236 | struct ceph_mds_request *req; | ||
1237 | u64 last_tid; | ||
1238 | int ret = 0; | ||
1239 | |||
1240 | dout("dir_fsync %p\n", inode); | ||
1241 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | ||
1242 | if (ret) | ||
1243 | return ret; | ||
1244 | mutex_lock(&inode->i_mutex); | ||
1245 | |||
1246 | spin_lock(&ci->i_unsafe_lock); | ||
1247 | if (list_empty(head)) | ||
1248 | goto out; | ||
1249 | |||
1250 | req = list_entry(head->prev, | ||
1251 | struct ceph_mds_request, r_unsafe_dir_item); | ||
1252 | last_tid = req->r_tid; | ||
1253 | |||
1254 | do { | ||
1255 | ceph_mdsc_get_request(req); | ||
1256 | spin_unlock(&ci->i_unsafe_lock); | ||
1257 | |||
1258 | dout("dir_fsync %p wait on tid %llu (until %llu)\n", | ||
1259 | inode, req->r_tid, last_tid); | ||
1260 | if (req->r_timeout) { | ||
1261 | unsigned long time_left = wait_for_completion_timeout( | ||
1262 | &req->r_safe_completion, | ||
1263 | req->r_timeout); | ||
1264 | if (time_left > 0) | ||
1265 | ret = 0; | ||
1266 | else | ||
1267 | ret = -EIO; /* timed out */ | ||
1268 | } else { | ||
1269 | wait_for_completion(&req->r_safe_completion); | ||
1270 | } | ||
1271 | ceph_mdsc_put_request(req); | ||
1272 | |||
1273 | spin_lock(&ci->i_unsafe_lock); | ||
1274 | if (ret || list_empty(head)) | ||
1275 | break; | ||
1276 | req = list_entry(head->next, | ||
1277 | struct ceph_mds_request, r_unsafe_dir_item); | ||
1278 | } while (req->r_tid < last_tid); | ||
1279 | out: | ||
1280 | spin_unlock(&ci->i_unsafe_lock); | ||
1281 | mutex_unlock(&inode->i_mutex); | ||
1282 | |||
1283 | return ret; | ||
1284 | } | ||
1285 | |||
1286 | /* | ||
1287 | * We maintain a private dentry LRU. | 1236 | * We maintain a private dentry LRU. |
1288 | * | 1237 | * |
1289 | * FIXME: this needs to be changed to a per-mds lru to be useful. | 1238 | * FIXME: this needs to be changed to a per-mds lru to be useful. |
@@ -1353,7 +1302,7 @@ const struct file_operations ceph_dir_fops = { | |||
1353 | .open = ceph_open, | 1302 | .open = ceph_open, |
1354 | .release = ceph_release, | 1303 | .release = ceph_release, |
1355 | .unlocked_ioctl = ceph_ioctl, | 1304 | .unlocked_ioctl = ceph_ioctl, |
1356 | .fsync = ceph_dir_fsync, | 1305 | .fsync = ceph_fsync, |
1357 | }; | 1306 | }; |
1358 | 1307 | ||
1359 | const struct file_operations ceph_snapdir_fops = { | 1308 | const struct file_operations ceph_snapdir_fops = { |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3b6b522b4b31..faf92095e105 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) | |||
89 | case S_IFDIR: | 89 | case S_IFDIR: |
90 | dout("init_file %p %p 0%o (regular)\n", inode, file, | 90 | dout("init_file %p %p 0%o (regular)\n", inode, file, |
91 | inode->i_mode); | 91 | inode->i_mode); |
92 | cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); | 92 | cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO); |
93 | if (cf == NULL) { | 93 | if (cf == NULL) { |
94 | ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ | 94 | ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ |
95 | return -ENOMEM; | 95 | return -ENOMEM; |
96 | } | 96 | } |
97 | cf->fmode = fmode; | 97 | cf->fmode = fmode; |
98 | cf->next_offset = 2; | 98 | cf->next_offset = 2; |
99 | cf->readdir_cache_idx = -1; | ||
99 | file->private_data = cf; | 100 | file->private_data = cf; |
100 | BUG_ON(inode->i_fop->release != ceph_release); | 101 | BUG_ON(inode->i_fop->release != ceph_release); |
101 | break; | 102 | break; |
@@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file) | |||
324 | ceph_mdsc_put_request(cf->last_readdir); | 325 | ceph_mdsc_put_request(cf->last_readdir); |
325 | kfree(cf->last_name); | 326 | kfree(cf->last_name); |
326 | kfree(cf->dir_info); | 327 | kfree(cf->dir_info); |
327 | dput(cf->dentry); | ||
328 | kmem_cache_free(ceph_file_cachep, cf); | 328 | kmem_cache_free(ceph_file_cachep, cf); |
329 | 329 | ||
330 | /* wake up anyone waiting for caps on this inode */ | 330 | /* wake up anyone waiting for caps on this inode */ |
@@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, | |||
483 | } | 483 | } |
484 | } else { | 484 | } else { |
485 | num_pages = calc_pages_for(off, len); | 485 | num_pages = calc_pages_for(off, len); |
486 | pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); | 486 | pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); |
487 | if (IS_ERR(pages)) | 487 | if (IS_ERR(pages)) |
488 | return PTR_ERR(pages); | 488 | return PTR_ERR(pages); |
489 | ret = striped_read(inode, off, len, pages, | 489 | ret = striped_read(inode, off, len, pages, |
@@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) | |||
557 | * objects, rollback on failure, etc.) | 557 | * objects, rollback on failure, etc.) |
558 | */ | 558 | */ |
559 | static ssize_t | 559 | static ssize_t |
560 | ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) | 560 | ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, |
561 | struct ceph_snap_context *snapc) | ||
561 | { | 562 | { |
562 | struct file *file = iocb->ki_filp; | 563 | struct file *file = iocb->ki_filp; |
563 | struct inode *inode = file_inode(file); | 564 | struct inode *inode = file_inode(file); |
564 | struct ceph_inode_info *ci = ceph_inode(inode); | 565 | struct ceph_inode_info *ci = ceph_inode(inode); |
565 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 566 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
566 | struct ceph_snap_context *snapc; | ||
567 | struct ceph_vino vino; | 567 | struct ceph_vino vino; |
568 | struct ceph_osd_request *req; | 568 | struct ceph_osd_request *req; |
569 | struct page **pages; | 569 | struct page **pages; |
@@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) | |||
600 | size_t start; | 600 | size_t start; |
601 | ssize_t n; | 601 | ssize_t n; |
602 | 602 | ||
603 | snapc = ci->i_snap_realm->cached_context; | ||
604 | vino = ceph_vino(inode); | 603 | vino = ceph_vino(inode); |
605 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, | 604 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, |
606 | vino, pos, &len, 0, | 605 | vino, pos, &len, 0, |
@@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) | |||
614 | break; | 613 | break; |
615 | } | 614 | } |
616 | 615 | ||
617 | osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); | 616 | osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); |
618 | 617 | ||
619 | n = iov_iter_get_pages_alloc(from, &pages, len, &start); | 618 | n = iov_iter_get_pages_alloc(from, &pages, len, &start); |
620 | if (unlikely(n < 0)) { | 619 | if (unlikely(n < 0)) { |
@@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) | |||
674 | * objects, rollback on failure, etc.) | 673 | * objects, rollback on failure, etc.) |
675 | */ | 674 | */ |
676 | static ssize_t | 675 | static ssize_t |
677 | ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) | 676 | ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, |
677 | struct ceph_snap_context *snapc) | ||
678 | { | 678 | { |
679 | struct file *file = iocb->ki_filp; | 679 | struct file *file = iocb->ki_filp; |
680 | struct inode *inode = file_inode(file); | 680 | struct inode *inode = file_inode(file); |
681 | struct ceph_inode_info *ci = ceph_inode(inode); | 681 | struct ceph_inode_info *ci = ceph_inode(inode); |
682 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 682 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
683 | struct ceph_snap_context *snapc; | ||
684 | struct ceph_vino vino; | 683 | struct ceph_vino vino; |
685 | struct ceph_osd_request *req; | 684 | struct ceph_osd_request *req; |
686 | struct page **pages; | 685 | struct page **pages; |
@@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) | |||
717 | size_t left; | 716 | size_t left; |
718 | int n; | 717 | int n; |
719 | 718 | ||
720 | snapc = ci->i_snap_realm->cached_context; | ||
721 | vino = ceph_vino(inode); | 719 | vino = ceph_vino(inode); |
722 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, | 720 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, |
723 | vino, pos, &len, 0, 1, | 721 | vino, pos, &len, 0, 1, |
@@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) | |||
736 | */ | 734 | */ |
737 | num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 735 | num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
738 | 736 | ||
739 | pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); | 737 | pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); |
740 | if (IS_ERR(pages)) { | 738 | if (IS_ERR(pages)) { |
741 | ret = PTR_ERR(pages); | 739 | ret = PTR_ERR(pages); |
742 | goto out; | 740 | goto out; |
@@ -860,7 +858,7 @@ again: | |||
860 | struct page *page = NULL; | 858 | struct page *page = NULL; |
861 | loff_t i_size; | 859 | loff_t i_size; |
862 | if (retry_op == READ_INLINE) { | 860 | if (retry_op == READ_INLINE) { |
863 | page = __page_cache_alloc(GFP_NOFS); | 861 | page = __page_cache_alloc(GFP_KERNEL); |
864 | if (!page) | 862 | if (!page) |
865 | return -ENOMEM; | 863 | return -ENOMEM; |
866 | } | 864 | } |
@@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
941 | struct ceph_inode_info *ci = ceph_inode(inode); | 939 | struct ceph_inode_info *ci = ceph_inode(inode); |
942 | struct ceph_osd_client *osdc = | 940 | struct ceph_osd_client *osdc = |
943 | &ceph_sb_to_client(inode->i_sb)->client->osdc; | 941 | &ceph_sb_to_client(inode->i_sb)->client->osdc; |
942 | struct ceph_cap_flush *prealloc_cf; | ||
944 | ssize_t count, written = 0; | 943 | ssize_t count, written = 0; |
945 | int err, want, got; | 944 | int err, want, got; |
946 | loff_t pos; | 945 | loff_t pos; |
@@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
948 | if (ceph_snap(inode) != CEPH_NOSNAP) | 947 | if (ceph_snap(inode) != CEPH_NOSNAP) |
949 | return -EROFS; | 948 | return -EROFS; |
950 | 949 | ||
950 | prealloc_cf = ceph_alloc_cap_flush(); | ||
951 | if (!prealloc_cf) | ||
952 | return -ENOMEM; | ||
953 | |||
951 | mutex_lock(&inode->i_mutex); | 954 | mutex_lock(&inode->i_mutex); |
952 | 955 | ||
953 | /* We can write back this queue in page reclaim */ | 956 | /* We can write back this queue in page reclaim */ |
@@ -996,14 +999,30 @@ retry_snap: | |||
996 | 999 | ||
997 | if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || | 1000 | if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || |
998 | (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { | 1001 | (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { |
1002 | struct ceph_snap_context *snapc; | ||
999 | struct iov_iter data; | 1003 | struct iov_iter data; |
1000 | mutex_unlock(&inode->i_mutex); | 1004 | mutex_unlock(&inode->i_mutex); |
1005 | |||
1006 | spin_lock(&ci->i_ceph_lock); | ||
1007 | if (__ceph_have_pending_cap_snap(ci)) { | ||
1008 | struct ceph_cap_snap *capsnap = | ||
1009 | list_last_entry(&ci->i_cap_snaps, | ||
1010 | struct ceph_cap_snap, | ||
1011 | ci_item); | ||
1012 | snapc = ceph_get_snap_context(capsnap->context); | ||
1013 | } else { | ||
1014 | BUG_ON(!ci->i_head_snapc); | ||
1015 | snapc = ceph_get_snap_context(ci->i_head_snapc); | ||
1016 | } | ||
1017 | spin_unlock(&ci->i_ceph_lock); | ||
1018 | |||
1001 | /* we might need to revert back to that point */ | 1019 | /* we might need to revert back to that point */ |
1002 | data = *from; | 1020 | data = *from; |
1003 | if (iocb->ki_flags & IOCB_DIRECT) | 1021 | if (iocb->ki_flags & IOCB_DIRECT) |
1004 | written = ceph_sync_direct_write(iocb, &data, pos); | 1022 | written = ceph_sync_direct_write(iocb, &data, pos, |
1023 | snapc); | ||
1005 | else | 1024 | else |
1006 | written = ceph_sync_write(iocb, &data, pos); | 1025 | written = ceph_sync_write(iocb, &data, pos, snapc); |
1007 | if (written == -EOLDSNAPC) { | 1026 | if (written == -EOLDSNAPC) { |
1008 | dout("aio_write %p %llx.%llx %llu~%u" | 1027 | dout("aio_write %p %llx.%llx %llu~%u" |
1009 | "got EOLDSNAPC, retrying\n", | 1028 | "got EOLDSNAPC, retrying\n", |
@@ -1014,6 +1033,7 @@ retry_snap: | |||
1014 | } | 1033 | } |
1015 | if (written > 0) | 1034 | if (written > 0) |
1016 | iov_iter_advance(from, written); | 1035 | iov_iter_advance(from, written); |
1036 | ceph_put_snap_context(snapc); | ||
1017 | } else { | 1037 | } else { |
1018 | loff_t old_size = inode->i_size; | 1038 | loff_t old_size = inode->i_size; |
1019 | /* | 1039 | /* |
@@ -1035,7 +1055,8 @@ retry_snap: | |||
1035 | int dirty; | 1055 | int dirty; |
1036 | spin_lock(&ci->i_ceph_lock); | 1056 | spin_lock(&ci->i_ceph_lock); |
1037 | ci->i_inline_version = CEPH_INLINE_NONE; | 1057 | ci->i_inline_version = CEPH_INLINE_NONE; |
1038 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | 1058 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, |
1059 | &prealloc_cf); | ||
1039 | spin_unlock(&ci->i_ceph_lock); | 1060 | spin_unlock(&ci->i_ceph_lock); |
1040 | if (dirty) | 1061 | if (dirty) |
1041 | __mark_inode_dirty(inode, dirty); | 1062 | __mark_inode_dirty(inode, dirty); |
@@ -1059,6 +1080,7 @@ retry_snap: | |||
1059 | out: | 1080 | out: |
1060 | mutex_unlock(&inode->i_mutex); | 1081 | mutex_unlock(&inode->i_mutex); |
1061 | out_unlocked: | 1082 | out_unlocked: |
1083 | ceph_free_cap_flush(prealloc_cf); | ||
1062 | current->backing_dev_info = NULL; | 1084 | current->backing_dev_info = NULL; |
1063 | return written ? written : err; | 1085 | return written ? written : err; |
1064 | } | 1086 | } |
@@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode, | |||
1255 | struct ceph_inode_info *ci = ceph_inode(inode); | 1277 | struct ceph_inode_info *ci = ceph_inode(inode); |
1256 | struct ceph_osd_client *osdc = | 1278 | struct ceph_osd_client *osdc = |
1257 | &ceph_inode_to_client(inode)->client->osdc; | 1279 | &ceph_inode_to_client(inode)->client->osdc; |
1280 | struct ceph_cap_flush *prealloc_cf; | ||
1258 | int want, got = 0; | 1281 | int want, got = 0; |
1259 | int dirty; | 1282 | int dirty; |
1260 | int ret = 0; | 1283 | int ret = 0; |
@@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode, | |||
1267 | if (!S_ISREG(inode->i_mode)) | 1290 | if (!S_ISREG(inode->i_mode)) |
1268 | return -EOPNOTSUPP; | 1291 | return -EOPNOTSUPP; |
1269 | 1292 | ||
1293 | prealloc_cf = ceph_alloc_cap_flush(); | ||
1294 | if (!prealloc_cf) | ||
1295 | return -ENOMEM; | ||
1296 | |||
1270 | mutex_lock(&inode->i_mutex); | 1297 | mutex_lock(&inode->i_mutex); |
1271 | 1298 | ||
1272 | if (ceph_snap(inode) != CEPH_NOSNAP) { | 1299 | if (ceph_snap(inode) != CEPH_NOSNAP) { |
@@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode, | |||
1313 | if (!ret) { | 1340 | if (!ret) { |
1314 | spin_lock(&ci->i_ceph_lock); | 1341 | spin_lock(&ci->i_ceph_lock); |
1315 | ci->i_inline_version = CEPH_INLINE_NONE; | 1342 | ci->i_inline_version = CEPH_INLINE_NONE; |
1316 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); | 1343 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, |
1344 | &prealloc_cf); | ||
1317 | spin_unlock(&ci->i_ceph_lock); | 1345 | spin_unlock(&ci->i_ceph_lock); |
1318 | if (dirty) | 1346 | if (dirty) |
1319 | __mark_inode_dirty(inode, dirty); | 1347 | __mark_inode_dirty(inode, dirty); |
@@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode, | |||
1322 | ceph_put_cap_refs(ci, got); | 1350 | ceph_put_cap_refs(ci, got); |
1323 | unlock: | 1351 | unlock: |
1324 | mutex_unlock(&inode->i_mutex); | 1352 | mutex_unlock(&inode->i_mutex); |
1353 | ceph_free_cap_flush(prealloc_cf); | ||
1325 | return ret; | 1354 | return ret; |
1326 | } | 1355 | } |
1327 | 1356 | ||
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 571acd88606c..96d2bd829902 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c | |||
@@ -389,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) | |||
389 | ci->i_inline_version = 0; | 389 | ci->i_inline_version = 0; |
390 | ci->i_time_warp_seq = 0; | 390 | ci->i_time_warp_seq = 0; |
391 | ci->i_ceph_flags = 0; | 391 | ci->i_ceph_flags = 0; |
392 | ci->i_ordered_count = 0; | 392 | atomic64_set(&ci->i_ordered_count, 1); |
393 | atomic_set(&ci->i_release_count, 1); | 393 | atomic64_set(&ci->i_release_count, 1); |
394 | atomic_set(&ci->i_complete_count, 0); | 394 | atomic64_set(&ci->i_complete_seq[0], 0); |
395 | atomic64_set(&ci->i_complete_seq[1], 0); | ||
395 | ci->i_symlink = NULL; | 396 | ci->i_symlink = NULL; |
396 | 397 | ||
397 | memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); | 398 | memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); |
@@ -415,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) | |||
415 | ci->i_flushing_caps = 0; | 416 | ci->i_flushing_caps = 0; |
416 | INIT_LIST_HEAD(&ci->i_dirty_item); | 417 | INIT_LIST_HEAD(&ci->i_dirty_item); |
417 | INIT_LIST_HEAD(&ci->i_flushing_item); | 418 | INIT_LIST_HEAD(&ci->i_flushing_item); |
418 | ci->i_cap_flush_seq = 0; | 419 | ci->i_prealloc_cap_flush = NULL; |
419 | ci->i_cap_flush_last_tid = 0; | 420 | ci->i_cap_flush_tree = RB_ROOT; |
420 | memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid)); | ||
421 | init_waitqueue_head(&ci->i_cap_wq); | 421 | init_waitqueue_head(&ci->i_cap_wq); |
422 | ci->i_hold_caps_min = 0; | 422 | ci->i_hold_caps_min = 0; |
423 | ci->i_hold_caps_max = 0; | 423 | ci->i_hold_caps_max = 0; |
@@ -752,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, | |||
752 | 752 | ||
753 | if (new_version || | 753 | if (new_version || |
754 | (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { | 754 | (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { |
755 | if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool) | ||
756 | ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; | ||
755 | ci->i_layout = info->layout; | 757 | ci->i_layout = info->layout; |
758 | |||
756 | queue_trunc = ceph_fill_file_size(inode, issued, | 759 | queue_trunc = ceph_fill_file_size(inode, issued, |
757 | le32_to_cpu(info->truncate_seq), | 760 | le32_to_cpu(info->truncate_seq), |
758 | le64_to_cpu(info->truncate_size), | 761 | le64_to_cpu(info->truncate_size), |
@@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, | |||
858 | (issued & CEPH_CAP_FILE_EXCL) == 0 && | 861 | (issued & CEPH_CAP_FILE_EXCL) == 0 && |
859 | !__ceph_dir_is_complete(ci)) { | 862 | !__ceph_dir_is_complete(ci)) { |
860 | dout(" marking %p complete (empty)\n", inode); | 863 | dout(" marking %p complete (empty)\n", inode); |
864 | i_size_write(inode, 0); | ||
861 | __ceph_dir_set_complete(ci, | 865 | __ceph_dir_set_complete(ci, |
862 | atomic_read(&ci->i_release_count), | 866 | atomic64_read(&ci->i_release_count), |
863 | ci->i_ordered_count); | 867 | atomic64_read(&ci->i_ordered_count)); |
864 | } | 868 | } |
865 | 869 | ||
866 | wake = true; | 870 | wake = true; |
@@ -1212,6 +1216,10 @@ retry_lookup: | |||
1212 | dout("fill_trace doing d_move %p -> %p\n", | 1216 | dout("fill_trace doing d_move %p -> %p\n", |
1213 | req->r_old_dentry, dn); | 1217 | req->r_old_dentry, dn); |
1214 | 1218 | ||
1219 | /* d_move screws up sibling dentries' offsets */ | ||
1220 | ceph_dir_clear_ordered(dir); | ||
1221 | ceph_dir_clear_ordered(olddir); | ||
1222 | |||
1215 | d_move(req->r_old_dentry, dn); | 1223 | d_move(req->r_old_dentry, dn); |
1216 | dout(" src %p '%pd' dst %p '%pd'\n", | 1224 | dout(" src %p '%pd' dst %p '%pd'\n", |
1217 | req->r_old_dentry, | 1225 | req->r_old_dentry, |
@@ -1222,10 +1230,6 @@ retry_lookup: | |||
1222 | rehashing bug in vfs_rename_dir */ | 1230 | rehashing bug in vfs_rename_dir */ |
1223 | ceph_invalidate_dentry_lease(dn); | 1231 | ceph_invalidate_dentry_lease(dn); |
1224 | 1232 | ||
1225 | /* d_move screws up sibling dentries' offsets */ | ||
1226 | ceph_dir_clear_ordered(dir); | ||
1227 | ceph_dir_clear_ordered(olddir); | ||
1228 | |||
1229 | dout("dn %p gets new offset %lld\n", req->r_old_dentry, | 1233 | dout("dn %p gets new offset %lld\n", req->r_old_dentry, |
1230 | ceph_dentry(req->r_old_dentry)->offset); | 1234 | ceph_dentry(req->r_old_dentry)->offset); |
1231 | 1235 | ||
@@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, | |||
1333 | return err; | 1337 | return err; |
1334 | } | 1338 | } |
1335 | 1339 | ||
1340 | void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) | ||
1341 | { | ||
1342 | if (ctl->page) { | ||
1343 | kunmap(ctl->page); | ||
1344 | page_cache_release(ctl->page); | ||
1345 | ctl->page = NULL; | ||
1346 | } | ||
1347 | } | ||
1348 | |||
1349 | static int fill_readdir_cache(struct inode *dir, struct dentry *dn, | ||
1350 | struct ceph_readdir_cache_control *ctl, | ||
1351 | struct ceph_mds_request *req) | ||
1352 | { | ||
1353 | struct ceph_inode_info *ci = ceph_inode(dir); | ||
1354 | unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*); | ||
1355 | unsigned idx = ctl->index % nsize; | ||
1356 | pgoff_t pgoff = ctl->index / nsize; | ||
1357 | |||
1358 | if (!ctl->page || pgoff != page_index(ctl->page)) { | ||
1359 | ceph_readdir_cache_release(ctl); | ||
1360 | ctl->page = grab_cache_page(&dir->i_data, pgoff); | ||
1361 | if (!ctl->page) { | ||
1362 | ctl->index = -1; | ||
1363 | return -ENOMEM; | ||
1364 | } | ||
1365 | /* reading/filling the cache are serialized by | ||
1366 | * i_mutex, no need to use page lock */ | ||
1367 | unlock_page(ctl->page); | ||
1368 | ctl->dentries = kmap(ctl->page); | ||
1369 | } | ||
1370 | |||
1371 | if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && | ||
1372 | req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) { | ||
1373 | dout("readdir cache dn %p idx %d\n", dn, ctl->index); | ||
1374 | ctl->dentries[idx] = dn; | ||
1375 | ctl->index++; | ||
1376 | } else { | ||
1377 | dout("disable readdir cache\n"); | ||
1378 | ctl->index = -1; | ||
1379 | } | ||
1380 | return 0; | ||
1381 | } | ||
1382 | |||
1336 | int ceph_readdir_prepopulate(struct ceph_mds_request *req, | 1383 | int ceph_readdir_prepopulate(struct ceph_mds_request *req, |
1337 | struct ceph_mds_session *session) | 1384 | struct ceph_mds_session *session) |
1338 | { | 1385 | { |
@@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, | |||
1345 | struct inode *snapdir = NULL; | 1392 | struct inode *snapdir = NULL; |
1346 | struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; | 1393 | struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; |
1347 | struct ceph_dentry_info *di; | 1394 | struct ceph_dentry_info *di; |
1348 | u64 r_readdir_offset = req->r_readdir_offset; | ||
1349 | u32 frag = le32_to_cpu(rhead->args.readdir.frag); | 1395 | u32 frag = le32_to_cpu(rhead->args.readdir.frag); |
1396 | struct ceph_readdir_cache_control cache_ctl = {}; | ||
1397 | |||
1398 | if (req->r_aborted) | ||
1399 | return readdir_prepopulate_inodes_only(req, session); | ||
1350 | 1400 | ||
1351 | if (rinfo->dir_dir && | 1401 | if (rinfo->dir_dir && |
1352 | le32_to_cpu(rinfo->dir_dir->frag) != frag) { | 1402 | le32_to_cpu(rinfo->dir_dir->frag) != frag) { |
@@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, | |||
1354 | frag, le32_to_cpu(rinfo->dir_dir->frag)); | 1404 | frag, le32_to_cpu(rinfo->dir_dir->frag)); |
1355 | frag = le32_to_cpu(rinfo->dir_dir->frag); | 1405 | frag = le32_to_cpu(rinfo->dir_dir->frag); |
1356 | if (ceph_frag_is_leftmost(frag)) | 1406 | if (ceph_frag_is_leftmost(frag)) |
1357 | r_readdir_offset = 2; | 1407 | req->r_readdir_offset = 2; |
1358 | else | 1408 | else |
1359 | r_readdir_offset = 0; | 1409 | req->r_readdir_offset = 0; |
1360 | } | 1410 | } |
1361 | 1411 | ||
1362 | if (req->r_aborted) | ||
1363 | return readdir_prepopulate_inodes_only(req, session); | ||
1364 | |||
1365 | if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { | 1412 | if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { |
1366 | snapdir = ceph_get_snapdir(d_inode(parent)); | 1413 | snapdir = ceph_get_snapdir(d_inode(parent)); |
1367 | parent = d_find_alias(snapdir); | 1414 | parent = d_find_alias(snapdir); |
@@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, | |||
1374 | ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); | 1421 | ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); |
1375 | } | 1422 | } |
1376 | 1423 | ||
1424 | if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { | ||
1425 | /* note dir version at start of readdir so we can tell | ||
1426 | * if any dentries get dropped */ | ||
1427 | struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); | ||
1428 | req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); | ||
1429 | req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); | ||
1430 | req->r_readdir_cache_idx = 0; | ||
1431 | } | ||
1432 | |||
1433 | cache_ctl.index = req->r_readdir_cache_idx; | ||
1434 | |||
1377 | /* FIXME: release caps/leases if error occurs */ | 1435 | /* FIXME: release caps/leases if error occurs */ |
1378 | for (i = 0; i < rinfo->dir_nr; i++) { | 1436 | for (i = 0; i < rinfo->dir_nr; i++) { |
1379 | struct ceph_vino vino; | 1437 | struct ceph_vino vino; |
@@ -1413,13 +1471,6 @@ retry_lookup: | |||
1413 | d_delete(dn); | 1471 | d_delete(dn); |
1414 | dput(dn); | 1472 | dput(dn); |
1415 | goto retry_lookup; | 1473 | goto retry_lookup; |
1416 | } else { | ||
1417 | /* reorder parent's d_subdirs */ | ||
1418 | spin_lock(&parent->d_lock); | ||
1419 | spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); | ||
1420 | list_move(&dn->d_child, &parent->d_subdirs); | ||
1421 | spin_unlock(&dn->d_lock); | ||
1422 | spin_unlock(&parent->d_lock); | ||
1423 | } | 1474 | } |
1424 | 1475 | ||
1425 | /* inode */ | 1476 | /* inode */ |
@@ -1436,13 +1487,15 @@ retry_lookup: | |||
1436 | } | 1487 | } |
1437 | } | 1488 | } |
1438 | 1489 | ||
1439 | if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, | 1490 | ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, |
1440 | req->r_request_started, -1, | 1491 | req->r_request_started, -1, |
1441 | &req->r_caps_reservation) < 0) { | 1492 | &req->r_caps_reservation); |
1493 | if (ret < 0) { | ||
1442 | pr_err("fill_inode badness on %p\n", in); | 1494 | pr_err("fill_inode badness on %p\n", in); |
1443 | if (d_really_is_negative(dn)) | 1495 | if (d_really_is_negative(dn)) |
1444 | iput(in); | 1496 | iput(in); |
1445 | d_drop(dn); | 1497 | d_drop(dn); |
1498 | err = ret; | ||
1446 | goto next_item; | 1499 | goto next_item; |
1447 | } | 1500 | } |
1448 | 1501 | ||
@@ -1458,19 +1511,28 @@ retry_lookup: | |||
1458 | } | 1511 | } |
1459 | 1512 | ||
1460 | di = dn->d_fsdata; | 1513 | di = dn->d_fsdata; |
1461 | di->offset = ceph_make_fpos(frag, i + r_readdir_offset); | 1514 | di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); |
1462 | 1515 | ||
1463 | update_dentry_lease(dn, rinfo->dir_dlease[i], | 1516 | update_dentry_lease(dn, rinfo->dir_dlease[i], |
1464 | req->r_session, | 1517 | req->r_session, |
1465 | req->r_request_started); | 1518 | req->r_request_started); |
1519 | |||
1520 | if (err == 0 && cache_ctl.index >= 0) { | ||
1521 | ret = fill_readdir_cache(d_inode(parent), dn, | ||
1522 | &cache_ctl, req); | ||
1523 | if (ret < 0) | ||
1524 | err = ret; | ||
1525 | } | ||
1466 | next_item: | 1526 | next_item: |
1467 | if (dn) | 1527 | if (dn) |
1468 | dput(dn); | 1528 | dput(dn); |
1469 | } | 1529 | } |
1470 | if (err == 0) | ||
1471 | req->r_did_prepopulate = true; | ||
1472 | |||
1473 | out: | 1530 | out: |
1531 | if (err == 0) { | ||
1532 | req->r_did_prepopulate = true; | ||
1533 | req->r_readdir_cache_idx = cache_ctl.index; | ||
1534 | } | ||
1535 | ceph_readdir_cache_release(&cache_ctl); | ||
1474 | if (snapdir) { | 1536 | if (snapdir) { |
1475 | iput(snapdir); | 1537 | iput(snapdir); |
1476 | dput(parent); | 1538 | dput(parent); |
@@ -1712,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
1712 | const unsigned int ia_valid = attr->ia_valid; | 1774 | const unsigned int ia_valid = attr->ia_valid; |
1713 | struct ceph_mds_request *req; | 1775 | struct ceph_mds_request *req; |
1714 | struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; | 1776 | struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; |
1777 | struct ceph_cap_flush *prealloc_cf; | ||
1715 | int issued; | 1778 | int issued; |
1716 | int release = 0, dirtied = 0; | 1779 | int release = 0, dirtied = 0; |
1717 | int mask = 0; | 1780 | int mask = 0; |
1718 | int err = 0; | 1781 | int err = 0; |
1719 | int inode_dirty_flags = 0; | 1782 | int inode_dirty_flags = 0; |
1783 | bool lock_snap_rwsem = false; | ||
1720 | 1784 | ||
1721 | if (ceph_snap(inode) != CEPH_NOSNAP) | 1785 | if (ceph_snap(inode) != CEPH_NOSNAP) |
1722 | return -EROFS; | 1786 | return -EROFS; |
@@ -1725,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
1725 | if (err != 0) | 1789 | if (err != 0) |
1726 | return err; | 1790 | return err; |
1727 | 1791 | ||
1792 | prealloc_cf = ceph_alloc_cap_flush(); | ||
1793 | if (!prealloc_cf) | ||
1794 | return -ENOMEM; | ||
1795 | |||
1728 | req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, | 1796 | req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, |
1729 | USE_AUTH_MDS); | 1797 | USE_AUTH_MDS); |
1730 | if (IS_ERR(req)) | 1798 | if (IS_ERR(req)) { |
1799 | ceph_free_cap_flush(prealloc_cf); | ||
1731 | return PTR_ERR(req); | 1800 | return PTR_ERR(req); |
1801 | } | ||
1732 | 1802 | ||
1733 | spin_lock(&ci->i_ceph_lock); | 1803 | spin_lock(&ci->i_ceph_lock); |
1734 | issued = __ceph_caps_issued(ci, NULL); | 1804 | issued = __ceph_caps_issued(ci, NULL); |
1805 | |||
1806 | if (!ci->i_head_snapc && | ||
1807 | (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) { | ||
1808 | lock_snap_rwsem = true; | ||
1809 | if (!down_read_trylock(&mdsc->snap_rwsem)) { | ||
1810 | spin_unlock(&ci->i_ceph_lock); | ||
1811 | down_read(&mdsc->snap_rwsem); | ||
1812 | spin_lock(&ci->i_ceph_lock); | ||
1813 | issued = __ceph_caps_issued(ci, NULL); | ||
1814 | } | ||
1815 | } | ||
1816 | |||
1735 | dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); | 1817 | dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); |
1736 | 1818 | ||
1737 | if (ia_valid & ATTR_UID) { | 1819 | if (ia_valid & ATTR_UID) { |
@@ -1874,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
1874 | dout("setattr %p ATTR_FILE ... hrm!\n", inode); | 1956 | dout("setattr %p ATTR_FILE ... hrm!\n", inode); |
1875 | 1957 | ||
1876 | if (dirtied) { | 1958 | if (dirtied) { |
1877 | inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); | 1959 | inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, |
1960 | &prealloc_cf); | ||
1878 | inode->i_ctime = CURRENT_TIME; | 1961 | inode->i_ctime = CURRENT_TIME; |
1879 | } | 1962 | } |
1880 | 1963 | ||
1881 | release &= issued; | 1964 | release &= issued; |
1882 | spin_unlock(&ci->i_ceph_lock); | 1965 | spin_unlock(&ci->i_ceph_lock); |
1966 | if (lock_snap_rwsem) | ||
1967 | up_read(&mdsc->snap_rwsem); | ||
1883 | 1968 | ||
1884 | if (inode_dirty_flags) | 1969 | if (inode_dirty_flags) |
1885 | __mark_inode_dirty(inode, inode_dirty_flags); | 1970 | __mark_inode_dirty(inode, inode_dirty_flags); |
@@ -1904,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) | |||
1904 | ceph_mdsc_put_request(req); | 1989 | ceph_mdsc_put_request(req); |
1905 | if (mask & CEPH_SETATTR_SIZE) | 1990 | if (mask & CEPH_SETATTR_SIZE) |
1906 | __ceph_do_pending_vmtruncate(inode); | 1991 | __ceph_do_pending_vmtruncate(inode); |
1992 | ceph_free_cap_flush(prealloc_cf); | ||
1907 | return err; | 1993 | return err; |
1908 | out_put: | 1994 | out_put: |
1909 | ceph_mdsc_put_request(req); | 1995 | ceph_mdsc_put_request(req); |
1996 | ceph_free_cap_flush(prealloc_cf); | ||
1910 | return err; | 1997 | return err; |
1911 | } | 1998 | } |
1912 | 1999 | ||
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 84f37f34f9aa..6aa07af67603 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
@@ -8,6 +8,7 @@ | |||
8 | #include <linux/debugfs.h> | 8 | #include <linux/debugfs.h> |
9 | #include <linux/seq_file.h> | 9 | #include <linux/seq_file.h> |
10 | #include <linux/utsname.h> | 10 | #include <linux/utsname.h> |
11 | #include <linux/ratelimit.h> | ||
11 | 12 | ||
12 | #include "super.h" | 13 | #include "super.h" |
13 | #include "mds_client.h" | 14 | #include "mds_client.h" |
@@ -458,7 +459,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, | |||
458 | s->s_cap_reconnect = 0; | 459 | s->s_cap_reconnect = 0; |
459 | s->s_cap_iterator = NULL; | 460 | s->s_cap_iterator = NULL; |
460 | INIT_LIST_HEAD(&s->s_cap_releases); | 461 | INIT_LIST_HEAD(&s->s_cap_releases); |
461 | INIT_LIST_HEAD(&s->s_cap_releases_done); | ||
462 | INIT_LIST_HEAD(&s->s_cap_flushing); | 462 | INIT_LIST_HEAD(&s->s_cap_flushing); |
463 | INIT_LIST_HEAD(&s->s_cap_snaps_flushing); | 463 | INIT_LIST_HEAD(&s->s_cap_snaps_flushing); |
464 | 464 | ||
@@ -629,6 +629,9 @@ static void __register_request(struct ceph_mds_client *mdsc, | |||
629 | req->r_uid = current_fsuid(); | 629 | req->r_uid = current_fsuid(); |
630 | req->r_gid = current_fsgid(); | 630 | req->r_gid = current_fsgid(); |
631 | 631 | ||
632 | if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK) | ||
633 | mdsc->oldest_tid = req->r_tid; | ||
634 | |||
632 | if (dir) { | 635 | if (dir) { |
633 | struct ceph_inode_info *ci = ceph_inode(dir); | 636 | struct ceph_inode_info *ci = ceph_inode(dir); |
634 | 637 | ||
@@ -644,6 +647,21 @@ static void __unregister_request(struct ceph_mds_client *mdsc, | |||
644 | struct ceph_mds_request *req) | 647 | struct ceph_mds_request *req) |
645 | { | 648 | { |
646 | dout("__unregister_request %p tid %lld\n", req, req->r_tid); | 649 | dout("__unregister_request %p tid %lld\n", req, req->r_tid); |
650 | |||
651 | if (req->r_tid == mdsc->oldest_tid) { | ||
652 | struct rb_node *p = rb_next(&req->r_node); | ||
653 | mdsc->oldest_tid = 0; | ||
654 | while (p) { | ||
655 | struct ceph_mds_request *next_req = | ||
656 | rb_entry(p, struct ceph_mds_request, r_node); | ||
657 | if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) { | ||
658 | mdsc->oldest_tid = next_req->r_tid; | ||
659 | break; | ||
660 | } | ||
661 | p = rb_next(p); | ||
662 | } | ||
663 | } | ||
664 | |||
647 | rb_erase(&req->r_node, &mdsc->request_tree); | 665 | rb_erase(&req->r_node, &mdsc->request_tree); |
648 | RB_CLEAR_NODE(&req->r_node); | 666 | RB_CLEAR_NODE(&req->r_node); |
649 | 667 | ||
@@ -998,27 +1016,25 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, | |||
998 | * session caps | 1016 | * session caps |
999 | */ | 1017 | */ |
1000 | 1018 | ||
1001 | /* | 1019 | /* caller holds s_cap_lock, we drop it */ |
1002 | * Free preallocated cap messages assigned to this session | 1020 | static void cleanup_cap_releases(struct ceph_mds_client *mdsc, |
1003 | */ | 1021 | struct ceph_mds_session *session) |
1004 | static void cleanup_cap_releases(struct ceph_mds_session *session) | 1022 | __releases(session->s_cap_lock) |
1005 | { | 1023 | { |
1006 | struct ceph_msg *msg; | 1024 | LIST_HEAD(tmp_list); |
1025 | list_splice_init(&session->s_cap_releases, &tmp_list); | ||
1026 | session->s_num_cap_releases = 0; | ||
1027 | spin_unlock(&session->s_cap_lock); | ||
1007 | 1028 | ||
1008 | spin_lock(&session->s_cap_lock); | 1029 | dout("cleanup_cap_releases mds%d\n", session->s_mds); |
1009 | while (!list_empty(&session->s_cap_releases)) { | 1030 | while (!list_empty(&tmp_list)) { |
1010 | msg = list_first_entry(&session->s_cap_releases, | 1031 | struct ceph_cap *cap; |
1011 | struct ceph_msg, list_head); | 1032 | /* zero out the in-progress message */ |
1012 | list_del_init(&msg->list_head); | 1033 | cap = list_first_entry(&tmp_list, |
1013 | ceph_msg_put(msg); | 1034 | struct ceph_cap, session_caps); |
1014 | } | 1035 | list_del(&cap->session_caps); |
1015 | while (!list_empty(&session->s_cap_releases_done)) { | 1036 | ceph_put_cap(mdsc, cap); |
1016 | msg = list_first_entry(&session->s_cap_releases_done, | ||
1017 | struct ceph_msg, list_head); | ||
1018 | list_del_init(&msg->list_head); | ||
1019 | ceph_msg_put(msg); | ||
1020 | } | 1037 | } |
1021 | spin_unlock(&session->s_cap_lock); | ||
1022 | } | 1038 | } |
1023 | 1039 | ||
1024 | static void cleanup_session_requests(struct ceph_mds_client *mdsc, | 1040 | static void cleanup_session_requests(struct ceph_mds_client *mdsc, |
@@ -1033,7 +1049,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, | |||
1033 | req = list_first_entry(&session->s_unsafe, | 1049 | req = list_first_entry(&session->s_unsafe, |
1034 | struct ceph_mds_request, r_unsafe_item); | 1050 | struct ceph_mds_request, r_unsafe_item); |
1035 | list_del_init(&req->r_unsafe_item); | 1051 | list_del_init(&req->r_unsafe_item); |
1036 | pr_info(" dropping unsafe request %llu\n", req->r_tid); | 1052 | pr_warn_ratelimited(" dropping unsafe request %llu\n", |
1053 | req->r_tid); | ||
1037 | __unregister_request(mdsc, req); | 1054 | __unregister_request(mdsc, req); |
1038 | } | 1055 | } |
1039 | /* zero r_attempts, so kick_requests() will re-send requests */ | 1056 | /* zero r_attempts, so kick_requests() will re-send requests */ |
@@ -1095,10 +1112,16 @@ static int iterate_session_caps(struct ceph_mds_session *session, | |||
1095 | dout("iterate_session_caps finishing cap %p removal\n", | 1112 | dout("iterate_session_caps finishing cap %p removal\n", |
1096 | cap); | 1113 | cap); |
1097 | BUG_ON(cap->session != session); | 1114 | BUG_ON(cap->session != session); |
1115 | cap->session = NULL; | ||
1098 | list_del_init(&cap->session_caps); | 1116 | list_del_init(&cap->session_caps); |
1099 | session->s_nr_caps--; | 1117 | session->s_nr_caps--; |
1100 | cap->session = NULL; | 1118 | if (cap->queue_release) { |
1101 | old_cap = cap; /* put_cap it w/o locks held */ | 1119 | list_add_tail(&cap->session_caps, |
1120 | &session->s_cap_releases); | ||
1121 | session->s_num_cap_releases++; | ||
1122 | } else { | ||
1123 | old_cap = cap; /* put_cap it w/o locks held */ | ||
1124 | } | ||
1102 | } | 1125 | } |
1103 | if (ret < 0) | 1126 | if (ret < 0) |
1104 | goto out; | 1127 | goto out; |
@@ -1119,6 +1142,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1119 | void *arg) | 1142 | void *arg) |
1120 | { | 1143 | { |
1121 | struct ceph_inode_info *ci = ceph_inode(inode); | 1144 | struct ceph_inode_info *ci = ceph_inode(inode); |
1145 | LIST_HEAD(to_remove); | ||
1122 | int drop = 0; | 1146 | int drop = 0; |
1123 | 1147 | ||
1124 | dout("removing cap %p, ci is %p, inode is %p\n", | 1148 | dout("removing cap %p, ci is %p, inode is %p\n", |
@@ -1126,12 +1150,27 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1126 | spin_lock(&ci->i_ceph_lock); | 1150 | spin_lock(&ci->i_ceph_lock); |
1127 | __ceph_remove_cap(cap, false); | 1151 | __ceph_remove_cap(cap, false); |
1128 | if (!ci->i_auth_cap) { | 1152 | if (!ci->i_auth_cap) { |
1153 | struct ceph_cap_flush *cf; | ||
1129 | struct ceph_mds_client *mdsc = | 1154 | struct ceph_mds_client *mdsc = |
1130 | ceph_sb_to_client(inode->i_sb)->mdsc; | 1155 | ceph_sb_to_client(inode->i_sb)->mdsc; |
1131 | 1156 | ||
1157 | while (true) { | ||
1158 | struct rb_node *n = rb_first(&ci->i_cap_flush_tree); | ||
1159 | if (!n) | ||
1160 | break; | ||
1161 | cf = rb_entry(n, struct ceph_cap_flush, i_node); | ||
1162 | rb_erase(&cf->i_node, &ci->i_cap_flush_tree); | ||
1163 | list_add(&cf->list, &to_remove); | ||
1164 | } | ||
1165 | |||
1132 | spin_lock(&mdsc->cap_dirty_lock); | 1166 | spin_lock(&mdsc->cap_dirty_lock); |
1167 | |||
1168 | list_for_each_entry(cf, &to_remove, list) | ||
1169 | rb_erase(&cf->g_node, &mdsc->cap_flush_tree); | ||
1170 | |||
1133 | if (!list_empty(&ci->i_dirty_item)) { | 1171 | if (!list_empty(&ci->i_dirty_item)) { |
1134 | pr_info(" dropping dirty %s state for %p %lld\n", | 1172 | pr_warn_ratelimited( |
1173 | " dropping dirty %s state for %p %lld\n", | ||
1135 | ceph_cap_string(ci->i_dirty_caps), | 1174 | ceph_cap_string(ci->i_dirty_caps), |
1136 | inode, ceph_ino(inode)); | 1175 | inode, ceph_ino(inode)); |
1137 | ci->i_dirty_caps = 0; | 1176 | ci->i_dirty_caps = 0; |
@@ -1139,7 +1178,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1139 | drop = 1; | 1178 | drop = 1; |
1140 | } | 1179 | } |
1141 | if (!list_empty(&ci->i_flushing_item)) { | 1180 | if (!list_empty(&ci->i_flushing_item)) { |
1142 | pr_info(" dropping dirty+flushing %s state for %p %lld\n", | 1181 | pr_warn_ratelimited( |
1182 | " dropping dirty+flushing %s state for %p %lld\n", | ||
1143 | ceph_cap_string(ci->i_flushing_caps), | 1183 | ceph_cap_string(ci->i_flushing_caps), |
1144 | inode, ceph_ino(inode)); | 1184 | inode, ceph_ino(inode)); |
1145 | ci->i_flushing_caps = 0; | 1185 | ci->i_flushing_caps = 0; |
@@ -1148,8 +1188,20 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1148 | drop = 1; | 1188 | drop = 1; |
1149 | } | 1189 | } |
1150 | spin_unlock(&mdsc->cap_dirty_lock); | 1190 | spin_unlock(&mdsc->cap_dirty_lock); |
1191 | |||
1192 | if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { | ||
1193 | list_add(&ci->i_prealloc_cap_flush->list, &to_remove); | ||
1194 | ci->i_prealloc_cap_flush = NULL; | ||
1195 | } | ||
1151 | } | 1196 | } |
1152 | spin_unlock(&ci->i_ceph_lock); | 1197 | spin_unlock(&ci->i_ceph_lock); |
1198 | while (!list_empty(&to_remove)) { | ||
1199 | struct ceph_cap_flush *cf; | ||
1200 | cf = list_first_entry(&to_remove, | ||
1201 | struct ceph_cap_flush, list); | ||
1202 | list_del(&cf->list); | ||
1203 | ceph_free_cap_flush(cf); | ||
1204 | } | ||
1153 | while (drop--) | 1205 | while (drop--) |
1154 | iput(inode); | 1206 | iput(inode); |
1155 | return 0; | 1207 | return 0; |
@@ -1191,11 +1243,12 @@ static void remove_session_caps(struct ceph_mds_session *session) | |||
1191 | spin_lock(&session->s_cap_lock); | 1243 | spin_lock(&session->s_cap_lock); |
1192 | } | 1244 | } |
1193 | } | 1245 | } |
1194 | spin_unlock(&session->s_cap_lock); | 1246 | |
1247 | // drop cap expires and unlock s_cap_lock | ||
1248 | cleanup_cap_releases(session->s_mdsc, session); | ||
1195 | 1249 | ||
1196 | BUG_ON(session->s_nr_caps > 0); | 1250 | BUG_ON(session->s_nr_caps > 0); |
1197 | BUG_ON(!list_empty(&session->s_cap_flushing)); | 1251 | BUG_ON(!list_empty(&session->s_cap_flushing)); |
1198 | cleanup_cap_releases(session); | ||
1199 | } | 1252 | } |
1200 | 1253 | ||
1201 | /* | 1254 | /* |
@@ -1371,7 +1424,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) | |||
1371 | inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), | 1424 | inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), |
1372 | ceph_cap_string(used), ceph_cap_string(wanted)); | 1425 | ceph_cap_string(used), ceph_cap_string(wanted)); |
1373 | if (cap == ci->i_auth_cap) { | 1426 | if (cap == ci->i_auth_cap) { |
1374 | if (ci->i_dirty_caps | ci->i_flushing_caps) | 1427 | if (ci->i_dirty_caps || ci->i_flushing_caps || |
1428 | !list_empty(&ci->i_cap_snaps)) | ||
1375 | goto out; | 1429 | goto out; |
1376 | if ((used | wanted) & CEPH_CAP_ANY_WR) | 1430 | if ((used | wanted) & CEPH_CAP_ANY_WR) |
1377 | goto out; | 1431 | goto out; |
@@ -1417,121 +1471,80 @@ static int trim_caps(struct ceph_mds_client *mdsc, | |||
1417 | session->s_trim_caps = 0; | 1471 | session->s_trim_caps = 0; |
1418 | } | 1472 | } |
1419 | 1473 | ||
1420 | ceph_add_cap_releases(mdsc, session); | ||
1421 | ceph_send_cap_releases(mdsc, session); | 1474 | ceph_send_cap_releases(mdsc, session); |
1422 | return 0; | 1475 | return 0; |
1423 | } | 1476 | } |
1424 | 1477 | ||
1425 | /* | 1478 | static int check_capsnap_flush(struct ceph_inode_info *ci, |
1426 | * Allocate cap_release messages. If there is a partially full message | 1479 | u64 want_snap_seq) |
1427 | * in the queue, try to allocate enough to cover it's remainder, so that | ||
1428 | * we can send it immediately. | ||
1429 | * | ||
1430 | * Called under s_mutex. | ||
1431 | */ | ||
1432 | int ceph_add_cap_releases(struct ceph_mds_client *mdsc, | ||
1433 | struct ceph_mds_session *session) | ||
1434 | { | 1480 | { |
1435 | struct ceph_msg *msg, *partial = NULL; | 1481 | int ret = 1; |
1436 | struct ceph_mds_cap_release *head; | 1482 | spin_lock(&ci->i_ceph_lock); |
1437 | int err = -ENOMEM; | 1483 | if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) { |
1438 | int extra = mdsc->fsc->mount_options->cap_release_safety; | 1484 | struct ceph_cap_snap *capsnap = |
1439 | int num; | 1485 | list_first_entry(&ci->i_cap_snaps, |
1440 | 1486 | struct ceph_cap_snap, ci_item); | |
1441 | dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, | 1487 | ret = capsnap->follows >= want_snap_seq; |
1442 | extra); | ||
1443 | |||
1444 | spin_lock(&session->s_cap_lock); | ||
1445 | |||
1446 | if (!list_empty(&session->s_cap_releases)) { | ||
1447 | msg = list_first_entry(&session->s_cap_releases, | ||
1448 | struct ceph_msg, | ||
1449 | list_head); | ||
1450 | head = msg->front.iov_base; | ||
1451 | num = le32_to_cpu(head->num); | ||
1452 | if (num) { | ||
1453 | dout(" partial %p with (%d/%d)\n", msg, num, | ||
1454 | (int)CEPH_CAPS_PER_RELEASE); | ||
1455 | extra += CEPH_CAPS_PER_RELEASE - num; | ||
1456 | partial = msg; | ||
1457 | } | ||
1458 | } | ||
1459 | while (session->s_num_cap_releases < session->s_nr_caps + extra) { | ||
1460 | spin_unlock(&session->s_cap_lock); | ||
1461 | msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, | ||
1462 | GFP_NOFS, false); | ||
1463 | if (!msg) | ||
1464 | goto out_unlocked; | ||
1465 | dout("add_cap_releases %p msg %p now %d\n", session, msg, | ||
1466 | (int)msg->front.iov_len); | ||
1467 | head = msg->front.iov_base; | ||
1468 | head->num = cpu_to_le32(0); | ||
1469 | msg->front.iov_len = sizeof(*head); | ||
1470 | spin_lock(&session->s_cap_lock); | ||
1471 | list_add(&msg->list_head, &session->s_cap_releases); | ||
1472 | session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; | ||
1473 | } | ||
1474 | |||
1475 | if (partial) { | ||
1476 | head = partial->front.iov_base; | ||
1477 | num = le32_to_cpu(head->num); | ||
1478 | dout(" queueing partial %p with %d/%d\n", partial, num, | ||
1479 | (int)CEPH_CAPS_PER_RELEASE); | ||
1480 | list_move_tail(&partial->list_head, | ||
1481 | &session->s_cap_releases_done); | ||
1482 | session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num; | ||
1483 | } | 1488 | } |
1484 | err = 0; | 1489 | spin_unlock(&ci->i_ceph_lock); |
1485 | spin_unlock(&session->s_cap_lock); | 1490 | return ret; |
1486 | out_unlocked: | ||
1487 | return err; | ||
1488 | } | 1491 | } |
1489 | 1492 | ||
1490 | static int check_cap_flush(struct inode *inode, u64 want_flush_seq) | 1493 | static int check_caps_flush(struct ceph_mds_client *mdsc, |
1494 | u64 want_flush_tid) | ||
1491 | { | 1495 | { |
1492 | struct ceph_inode_info *ci = ceph_inode(inode); | 1496 | struct rb_node *n; |
1493 | int ret; | 1497 | struct ceph_cap_flush *cf; |
1494 | spin_lock(&ci->i_ceph_lock); | 1498 | int ret = 1; |
1495 | if (ci->i_flushing_caps) | 1499 | |
1496 | ret = ci->i_cap_flush_seq >= want_flush_seq; | 1500 | spin_lock(&mdsc->cap_dirty_lock); |
1497 | else | 1501 | n = rb_first(&mdsc->cap_flush_tree); |
1498 | ret = 1; | 1502 | cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; |
1499 | spin_unlock(&ci->i_ceph_lock); | 1503 | if (cf && cf->tid <= want_flush_tid) { |
1504 | dout("check_caps_flush still flushing tid %llu <= %llu\n", | ||
1505 | cf->tid, want_flush_tid); | ||
1506 | ret = 0; | ||
1507 | } | ||
1508 | spin_unlock(&mdsc->cap_dirty_lock); | ||
1500 | return ret; | 1509 | return ret; |
1501 | } | 1510 | } |
1502 | 1511 | ||
1503 | /* | 1512 | /* |
1504 | * flush all dirty inode data to disk. | 1513 | * flush all dirty inode data to disk. |
1505 | * | 1514 | * |
1506 | * returns true if we've flushed through want_flush_seq | 1515 | * returns true if we've flushed through want_flush_tid |
1507 | */ | 1516 | */ |
1508 | static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) | 1517 | static void wait_caps_flush(struct ceph_mds_client *mdsc, |
1518 | u64 want_flush_tid, u64 want_snap_seq) | ||
1509 | { | 1519 | { |
1510 | int mds; | 1520 | int mds; |
1511 | 1521 | ||
1512 | dout("check_cap_flush want %lld\n", want_flush_seq); | 1522 | dout("check_caps_flush want %llu snap want %llu\n", |
1523 | want_flush_tid, want_snap_seq); | ||
1513 | mutex_lock(&mdsc->mutex); | 1524 | mutex_lock(&mdsc->mutex); |
1514 | for (mds = 0; mds < mdsc->max_sessions; mds++) { | 1525 | for (mds = 0; mds < mdsc->max_sessions; ) { |
1515 | struct ceph_mds_session *session = mdsc->sessions[mds]; | 1526 | struct ceph_mds_session *session = mdsc->sessions[mds]; |
1516 | struct inode *inode = NULL; | 1527 | struct inode *inode = NULL; |
1517 | 1528 | ||
1518 | if (!session) | 1529 | if (!session) { |
1530 | mds++; | ||
1519 | continue; | 1531 | continue; |
1532 | } | ||
1520 | get_session(session); | 1533 | get_session(session); |
1521 | mutex_unlock(&mdsc->mutex); | 1534 | mutex_unlock(&mdsc->mutex); |
1522 | 1535 | ||
1523 | mutex_lock(&session->s_mutex); | 1536 | mutex_lock(&session->s_mutex); |
1524 | if (!list_empty(&session->s_cap_flushing)) { | 1537 | if (!list_empty(&session->s_cap_snaps_flushing)) { |
1525 | struct ceph_inode_info *ci = | 1538 | struct ceph_cap_snap *capsnap = |
1526 | list_entry(session->s_cap_flushing.next, | 1539 | list_first_entry(&session->s_cap_snaps_flushing, |
1527 | struct ceph_inode_info, | 1540 | struct ceph_cap_snap, |
1528 | i_flushing_item); | 1541 | flushing_item); |
1529 | 1542 | struct ceph_inode_info *ci = capsnap->ci; | |
1530 | if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) { | 1543 | if (!check_capsnap_flush(ci, want_snap_seq)) { |
1531 | dout("check_cap_flush still flushing %p " | 1544 | dout("check_cap_flush still flushing snap %p " |
1532 | "seq %lld <= %lld to mds%d\n", | 1545 | "follows %lld <= %lld to mds%d\n", |
1533 | &ci->vfs_inode, ci->i_cap_flush_seq, | 1546 | &ci->vfs_inode, capsnap->follows, |
1534 | want_flush_seq, session->s_mds); | 1547 | want_snap_seq, mds); |
1535 | inode = igrab(&ci->vfs_inode); | 1548 | inode = igrab(&ci->vfs_inode); |
1536 | } | 1549 | } |
1537 | } | 1550 | } |
@@ -1540,15 +1553,21 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) | |||
1540 | 1553 | ||
1541 | if (inode) { | 1554 | if (inode) { |
1542 | wait_event(mdsc->cap_flushing_wq, | 1555 | wait_event(mdsc->cap_flushing_wq, |
1543 | check_cap_flush(inode, want_flush_seq)); | 1556 | check_capsnap_flush(ceph_inode(inode), |
1557 | want_snap_seq)); | ||
1544 | iput(inode); | 1558 | iput(inode); |
1559 | } else { | ||
1560 | mds++; | ||
1545 | } | 1561 | } |
1546 | 1562 | ||
1547 | mutex_lock(&mdsc->mutex); | 1563 | mutex_lock(&mdsc->mutex); |
1548 | } | 1564 | } |
1549 | |||
1550 | mutex_unlock(&mdsc->mutex); | 1565 | mutex_unlock(&mdsc->mutex); |
1551 | dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); | 1566 | |
1567 | wait_event(mdsc->cap_flushing_wq, | ||
1568 | check_caps_flush(mdsc, want_flush_tid)); | ||
1569 | |||
1570 | dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid); | ||
1552 | } | 1571 | } |
1553 | 1572 | ||
1554 | /* | 1573 | /* |
@@ -1557,60 +1576,74 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) | |||
1557 | void ceph_send_cap_releases(struct ceph_mds_client *mdsc, | 1576 | void ceph_send_cap_releases(struct ceph_mds_client *mdsc, |
1558 | struct ceph_mds_session *session) | 1577 | struct ceph_mds_session *session) |
1559 | { | 1578 | { |
1560 | struct ceph_msg *msg; | 1579 | struct ceph_msg *msg = NULL; |
1580 | struct ceph_mds_cap_release *head; | ||
1581 | struct ceph_mds_cap_item *item; | ||
1582 | struct ceph_cap *cap; | ||
1583 | LIST_HEAD(tmp_list); | ||
1584 | int num_cap_releases; | ||
1561 | 1585 | ||
1562 | dout("send_cap_releases mds%d\n", session->s_mds); | ||
1563 | spin_lock(&session->s_cap_lock); | 1586 | spin_lock(&session->s_cap_lock); |
1564 | while (!list_empty(&session->s_cap_releases_done)) { | 1587 | again: |
1565 | msg = list_first_entry(&session->s_cap_releases_done, | 1588 | list_splice_init(&session->s_cap_releases, &tmp_list); |
1566 | struct ceph_msg, list_head); | 1589 | num_cap_releases = session->s_num_cap_releases; |
1567 | list_del_init(&msg->list_head); | 1590 | session->s_num_cap_releases = 0; |
1568 | spin_unlock(&session->s_cap_lock); | ||
1569 | msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); | ||
1570 | dout("send_cap_releases mds%d %p\n", session->s_mds, msg); | ||
1571 | ceph_con_send(&session->s_con, msg); | ||
1572 | spin_lock(&session->s_cap_lock); | ||
1573 | } | ||
1574 | spin_unlock(&session->s_cap_lock); | 1591 | spin_unlock(&session->s_cap_lock); |
1575 | } | ||
1576 | 1592 | ||
1577 | static void discard_cap_releases(struct ceph_mds_client *mdsc, | 1593 | while (!list_empty(&tmp_list)) { |
1578 | struct ceph_mds_session *session) | 1594 | if (!msg) { |
1579 | { | 1595 | msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, |
1580 | struct ceph_msg *msg; | 1596 | PAGE_CACHE_SIZE, GFP_NOFS, false); |
1581 | struct ceph_mds_cap_release *head; | 1597 | if (!msg) |
1582 | unsigned num; | 1598 | goto out_err; |
1583 | 1599 | head = msg->front.iov_base; | |
1584 | dout("discard_cap_releases mds%d\n", session->s_mds); | 1600 | head->num = cpu_to_le32(0); |
1601 | msg->front.iov_len = sizeof(*head); | ||
1602 | } | ||
1603 | cap = list_first_entry(&tmp_list, struct ceph_cap, | ||
1604 | session_caps); | ||
1605 | list_del(&cap->session_caps); | ||
1606 | num_cap_releases--; | ||
1585 | 1607 | ||
1586 | if (!list_empty(&session->s_cap_releases)) { | ||
1587 | /* zero out the in-progress message */ | ||
1588 | msg = list_first_entry(&session->s_cap_releases, | ||
1589 | struct ceph_msg, list_head); | ||
1590 | head = msg->front.iov_base; | 1608 | head = msg->front.iov_base; |
1591 | num = le32_to_cpu(head->num); | 1609 | le32_add_cpu(&head->num, 1); |
1592 | dout("discard_cap_releases mds%d %p %u\n", | 1610 | item = msg->front.iov_base + msg->front.iov_len; |
1593 | session->s_mds, msg, num); | 1611 | item->ino = cpu_to_le64(cap->cap_ino); |
1594 | head->num = cpu_to_le32(0); | 1612 | item->cap_id = cpu_to_le64(cap->cap_id); |
1595 | msg->front.iov_len = sizeof(*head); | 1613 | item->migrate_seq = cpu_to_le32(cap->mseq); |
1596 | session->s_num_cap_releases += num; | 1614 | item->seq = cpu_to_le32(cap->issue_seq); |
1615 | msg->front.iov_len += sizeof(*item); | ||
1616 | |||
1617 | ceph_put_cap(mdsc, cap); | ||
1618 | |||
1619 | if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { | ||
1620 | msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); | ||
1621 | dout("send_cap_releases mds%d %p\n", session->s_mds, msg); | ||
1622 | ceph_con_send(&session->s_con, msg); | ||
1623 | msg = NULL; | ||
1624 | } | ||
1597 | } | 1625 | } |
1598 | 1626 | ||
1599 | /* requeue completed messages */ | 1627 | BUG_ON(num_cap_releases != 0); |
1600 | while (!list_empty(&session->s_cap_releases_done)) { | ||
1601 | msg = list_first_entry(&session->s_cap_releases_done, | ||
1602 | struct ceph_msg, list_head); | ||
1603 | list_del_init(&msg->list_head); | ||
1604 | 1628 | ||
1605 | head = msg->front.iov_base; | 1629 | spin_lock(&session->s_cap_lock); |
1606 | num = le32_to_cpu(head->num); | 1630 | if (!list_empty(&session->s_cap_releases)) |
1607 | dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, | 1631 | goto again; |
1608 | num); | 1632 | spin_unlock(&session->s_cap_lock); |
1609 | session->s_num_cap_releases += num; | 1633 | |
1610 | head->num = cpu_to_le32(0); | 1634 | if (msg) { |
1611 | msg->front.iov_len = sizeof(*head); | 1635 | msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); |
1612 | list_add(&msg->list_head, &session->s_cap_releases); | 1636 | dout("send_cap_releases mds%d %p\n", session->s_mds, msg); |
1637 | ceph_con_send(&session->s_con, msg); | ||
1613 | } | 1638 | } |
1639 | return; | ||
1640 | out_err: | ||
1641 | pr_err("send_cap_releases mds%d, failed to allocate message\n", | ||
1642 | session->s_mds); | ||
1643 | spin_lock(&session->s_cap_lock); | ||
1644 | list_splice(&tmp_list, &session->s_cap_releases); | ||
1645 | session->s_num_cap_releases += num_cap_releases; | ||
1646 | spin_unlock(&session->s_cap_lock); | ||
1614 | } | 1647 | } |
1615 | 1648 | ||
1616 | /* | 1649 | /* |
@@ -1635,7 +1668,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, | |||
1635 | 1668 | ||
1636 | order = get_order(size * num_entries); | 1669 | order = get_order(size * num_entries); |
1637 | while (order >= 0) { | 1670 | while (order >= 0) { |
1638 | rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, | 1671 | rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | |
1672 | __GFP_NOWARN, | ||
1639 | order); | 1673 | order); |
1640 | if (rinfo->dir_in) | 1674 | if (rinfo->dir_in) |
1641 | break; | 1675 | break; |
@@ -1697,13 +1731,9 @@ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) | |||
1697 | struct ceph_mds_request, r_node); | 1731 | struct ceph_mds_request, r_node); |
1698 | } | 1732 | } |
1699 | 1733 | ||
1700 | static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) | 1734 | static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) |
1701 | { | 1735 | { |
1702 | struct ceph_mds_request *req = __get_oldest_req(mdsc); | 1736 | return mdsc->oldest_tid; |
1703 | |||
1704 | if (req) | ||
1705 | return req->r_tid; | ||
1706 | return 0; | ||
1707 | } | 1737 | } |
1708 | 1738 | ||
1709 | /* | 1739 | /* |
@@ -2267,15 +2297,18 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, | |||
2267 | /* wait */ | 2297 | /* wait */ |
2268 | mutex_unlock(&mdsc->mutex); | 2298 | mutex_unlock(&mdsc->mutex); |
2269 | dout("do_request waiting\n"); | 2299 | dout("do_request waiting\n"); |
2270 | if (req->r_timeout) { | 2300 | if (!req->r_timeout && req->r_wait_for_completion) { |
2271 | err = (long)wait_for_completion_killable_timeout( | ||
2272 | &req->r_completion, req->r_timeout); | ||
2273 | if (err == 0) | ||
2274 | err = -EIO; | ||
2275 | } else if (req->r_wait_for_completion) { | ||
2276 | err = req->r_wait_for_completion(mdsc, req); | 2301 | err = req->r_wait_for_completion(mdsc, req); |
2277 | } else { | 2302 | } else { |
2278 | err = wait_for_completion_killable(&req->r_completion); | 2303 | long timeleft = wait_for_completion_killable_timeout( |
2304 | &req->r_completion, | ||
2305 | ceph_timeout_jiffies(req->r_timeout)); | ||
2306 | if (timeleft > 0) | ||
2307 | err = 0; | ||
2308 | else if (!timeleft) | ||
2309 | err = -EIO; /* timed out */ | ||
2310 | else | ||
2311 | err = timeleft; /* killed */ | ||
2279 | } | 2312 | } |
2280 | dout("do_request waited, got %d\n", err); | 2313 | dout("do_request waited, got %d\n", err); |
2281 | mutex_lock(&mdsc->mutex); | 2314 | mutex_lock(&mdsc->mutex); |
@@ -2496,7 +2529,6 @@ out_err: | |||
2496 | } | 2529 | } |
2497 | mutex_unlock(&mdsc->mutex); | 2530 | mutex_unlock(&mdsc->mutex); |
2498 | 2531 | ||
2499 | ceph_add_cap_releases(mdsc, req->r_session); | ||
2500 | mutex_unlock(&session->s_mutex); | 2532 | mutex_unlock(&session->s_mutex); |
2501 | 2533 | ||
2502 | /* kick calling process */ | 2534 | /* kick calling process */ |
@@ -2888,8 +2920,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, | |||
2888 | */ | 2920 | */ |
2889 | session->s_cap_reconnect = 1; | 2921 | session->s_cap_reconnect = 1; |
2890 | /* drop old cap expires; we're about to reestablish that state */ | 2922 | /* drop old cap expires; we're about to reestablish that state */ |
2891 | discard_cap_releases(mdsc, session); | 2923 | cleanup_cap_releases(mdsc, session); |
2892 | spin_unlock(&session->s_cap_lock); | ||
2893 | 2924 | ||
2894 | /* trim unused caps to reduce MDS's cache rejoin time */ | 2925 | /* trim unused caps to reduce MDS's cache rejoin time */ |
2895 | if (mdsc->fsc->sb->s_root) | 2926 | if (mdsc->fsc->sb->s_root) |
@@ -2956,6 +2987,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, | |||
2956 | 2987 | ||
2957 | reply->hdr.data_len = cpu_to_le32(pagelist->length); | 2988 | reply->hdr.data_len = cpu_to_le32(pagelist->length); |
2958 | ceph_msg_data_add_pagelist(reply, pagelist); | 2989 | ceph_msg_data_add_pagelist(reply, pagelist); |
2990 | |||
2991 | ceph_early_kick_flushing_caps(mdsc, session); | ||
2992 | |||
2959 | ceph_con_send(&session->s_con, reply); | 2993 | ceph_con_send(&session->s_con, reply); |
2960 | 2994 | ||
2961 | mutex_unlock(&session->s_mutex); | 2995 | mutex_unlock(&session->s_mutex); |
@@ -3352,7 +3386,6 @@ static void delayed_work(struct work_struct *work) | |||
3352 | send_renew_caps(mdsc, s); | 3386 | send_renew_caps(mdsc, s); |
3353 | else | 3387 | else |
3354 | ceph_con_keepalive(&s->s_con); | 3388 | ceph_con_keepalive(&s->s_con); |
3355 | ceph_add_cap_releases(mdsc, s); | ||
3356 | if (s->s_state == CEPH_MDS_SESSION_OPEN || | 3389 | if (s->s_state == CEPH_MDS_SESSION_OPEN || |
3357 | s->s_state == CEPH_MDS_SESSION_HUNG) | 3390 | s->s_state == CEPH_MDS_SESSION_HUNG) |
3358 | ceph_send_cap_releases(mdsc, s); | 3391 | ceph_send_cap_releases(mdsc, s); |
@@ -3390,11 +3423,13 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) | |||
3390 | atomic_set(&mdsc->num_sessions, 0); | 3423 | atomic_set(&mdsc->num_sessions, 0); |
3391 | mdsc->max_sessions = 0; | 3424 | mdsc->max_sessions = 0; |
3392 | mdsc->stopping = 0; | 3425 | mdsc->stopping = 0; |
3426 | mdsc->last_snap_seq = 0; | ||
3393 | init_rwsem(&mdsc->snap_rwsem); | 3427 | init_rwsem(&mdsc->snap_rwsem); |
3394 | mdsc->snap_realms = RB_ROOT; | 3428 | mdsc->snap_realms = RB_ROOT; |
3395 | INIT_LIST_HEAD(&mdsc->snap_empty); | 3429 | INIT_LIST_HEAD(&mdsc->snap_empty); |
3396 | spin_lock_init(&mdsc->snap_empty_lock); | 3430 | spin_lock_init(&mdsc->snap_empty_lock); |
3397 | mdsc->last_tid = 0; | 3431 | mdsc->last_tid = 0; |
3432 | mdsc->oldest_tid = 0; | ||
3398 | mdsc->request_tree = RB_ROOT; | 3433 | mdsc->request_tree = RB_ROOT; |
3399 | INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); | 3434 | INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); |
3400 | mdsc->last_renew_caps = jiffies; | 3435 | mdsc->last_renew_caps = jiffies; |
@@ -3402,7 +3437,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) | |||
3402 | spin_lock_init(&mdsc->cap_delay_lock); | 3437 | spin_lock_init(&mdsc->cap_delay_lock); |
3403 | INIT_LIST_HEAD(&mdsc->snap_flush_list); | 3438 | INIT_LIST_HEAD(&mdsc->snap_flush_list); |
3404 | spin_lock_init(&mdsc->snap_flush_lock); | 3439 | spin_lock_init(&mdsc->snap_flush_lock); |
3405 | mdsc->cap_flush_seq = 0; | 3440 | mdsc->last_cap_flush_tid = 1; |
3441 | mdsc->cap_flush_tree = RB_ROOT; | ||
3406 | INIT_LIST_HEAD(&mdsc->cap_dirty); | 3442 | INIT_LIST_HEAD(&mdsc->cap_dirty); |
3407 | INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); | 3443 | INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); |
3408 | mdsc->num_cap_flushing = 0; | 3444 | mdsc->num_cap_flushing = 0; |
@@ -3414,6 +3450,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) | |||
3414 | ceph_caps_init(mdsc); | 3450 | ceph_caps_init(mdsc); |
3415 | ceph_adjust_min_caps(mdsc, fsc->min_caps); | 3451 | ceph_adjust_min_caps(mdsc, fsc->min_caps); |
3416 | 3452 | ||
3453 | init_rwsem(&mdsc->pool_perm_rwsem); | ||
3454 | mdsc->pool_perm_tree = RB_ROOT; | ||
3455 | |||
3417 | return 0; | 3456 | return 0; |
3418 | } | 3457 | } |
3419 | 3458 | ||
@@ -3423,8 +3462,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) | |||
3423 | */ | 3462 | */ |
3424 | static void wait_requests(struct ceph_mds_client *mdsc) | 3463 | static void wait_requests(struct ceph_mds_client *mdsc) |
3425 | { | 3464 | { |
3465 | struct ceph_options *opts = mdsc->fsc->client->options; | ||
3426 | struct ceph_mds_request *req; | 3466 | struct ceph_mds_request *req; |
3427 | struct ceph_fs_client *fsc = mdsc->fsc; | ||
3428 | 3467 | ||
3429 | mutex_lock(&mdsc->mutex); | 3468 | mutex_lock(&mdsc->mutex); |
3430 | if (__get_oldest_req(mdsc)) { | 3469 | if (__get_oldest_req(mdsc)) { |
@@ -3432,7 +3471,7 @@ static void wait_requests(struct ceph_mds_client *mdsc) | |||
3432 | 3471 | ||
3433 | dout("wait_requests waiting for requests\n"); | 3472 | dout("wait_requests waiting for requests\n"); |
3434 | wait_for_completion_timeout(&mdsc->safe_umount_waiters, | 3473 | wait_for_completion_timeout(&mdsc->safe_umount_waiters, |
3435 | fsc->client->options->mount_timeout * HZ); | 3474 | ceph_timeout_jiffies(opts->mount_timeout)); |
3436 | 3475 | ||
3437 | /* tear down remaining requests */ | 3476 | /* tear down remaining requests */ |
3438 | mutex_lock(&mdsc->mutex); | 3477 | mutex_lock(&mdsc->mutex); |
@@ -3485,7 +3524,8 @@ restart: | |||
3485 | nextreq = rb_entry(n, struct ceph_mds_request, r_node); | 3524 | nextreq = rb_entry(n, struct ceph_mds_request, r_node); |
3486 | else | 3525 | else |
3487 | nextreq = NULL; | 3526 | nextreq = NULL; |
3488 | if ((req->r_op & CEPH_MDS_OP_WRITE)) { | 3527 | if (req->r_op != CEPH_MDS_OP_SETFILELOCK && |
3528 | (req->r_op & CEPH_MDS_OP_WRITE)) { | ||
3489 | /* write op */ | 3529 | /* write op */ |
3490 | ceph_mdsc_get_request(req); | 3530 | ceph_mdsc_get_request(req); |
3491 | if (nextreq) | 3531 | if (nextreq) |
@@ -3513,7 +3553,7 @@ restart: | |||
3513 | 3553 | ||
3514 | void ceph_mdsc_sync(struct ceph_mds_client *mdsc) | 3554 | void ceph_mdsc_sync(struct ceph_mds_client *mdsc) |
3515 | { | 3555 | { |
3516 | u64 want_tid, want_flush; | 3556 | u64 want_tid, want_flush, want_snap; |
3517 | 3557 | ||
3518 | if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) | 3558 | if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) |
3519 | return; | 3559 | return; |
@@ -3525,13 +3565,18 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) | |||
3525 | 3565 | ||
3526 | ceph_flush_dirty_caps(mdsc); | 3566 | ceph_flush_dirty_caps(mdsc); |
3527 | spin_lock(&mdsc->cap_dirty_lock); | 3567 | spin_lock(&mdsc->cap_dirty_lock); |
3528 | want_flush = mdsc->cap_flush_seq; | 3568 | want_flush = mdsc->last_cap_flush_tid; |
3529 | spin_unlock(&mdsc->cap_dirty_lock); | 3569 | spin_unlock(&mdsc->cap_dirty_lock); |
3530 | 3570 | ||
3531 | dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); | 3571 | down_read(&mdsc->snap_rwsem); |
3572 | want_snap = mdsc->last_snap_seq; | ||
3573 | up_read(&mdsc->snap_rwsem); | ||
3574 | |||
3575 | dout("sync want tid %lld flush_seq %lld snap_seq %lld\n", | ||
3576 | want_tid, want_flush, want_snap); | ||
3532 | 3577 | ||
3533 | wait_unsafe_requests(mdsc, want_tid); | 3578 | wait_unsafe_requests(mdsc, want_tid); |
3534 | wait_caps_flush(mdsc, want_flush); | 3579 | wait_caps_flush(mdsc, want_flush, want_snap); |
3535 | } | 3580 | } |
3536 | 3581 | ||
3537 | /* | 3582 | /* |
@@ -3549,10 +3594,9 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc) | |||
3549 | */ | 3594 | */ |
3550 | void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) | 3595 | void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) |
3551 | { | 3596 | { |
3597 | struct ceph_options *opts = mdsc->fsc->client->options; | ||
3552 | struct ceph_mds_session *session; | 3598 | struct ceph_mds_session *session; |
3553 | int i; | 3599 | int i; |
3554 | struct ceph_fs_client *fsc = mdsc->fsc; | ||
3555 | unsigned long timeout = fsc->client->options->mount_timeout * HZ; | ||
3556 | 3600 | ||
3557 | dout("close_sessions\n"); | 3601 | dout("close_sessions\n"); |
3558 | 3602 | ||
@@ -3573,7 +3617,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) | |||
3573 | 3617 | ||
3574 | dout("waiting for sessions to close\n"); | 3618 | dout("waiting for sessions to close\n"); |
3575 | wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), | 3619 | wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), |
3576 | timeout); | 3620 | ceph_timeout_jiffies(opts->mount_timeout)); |
3577 | 3621 | ||
3578 | /* tear down remaining sessions */ | 3622 | /* tear down remaining sessions */ |
3579 | mutex_lock(&mdsc->mutex); | 3623 | mutex_lock(&mdsc->mutex); |
@@ -3607,6 +3651,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) | |||
3607 | ceph_mdsmap_destroy(mdsc->mdsmap); | 3651 | ceph_mdsmap_destroy(mdsc->mdsmap); |
3608 | kfree(mdsc->sessions); | 3652 | kfree(mdsc->sessions); |
3609 | ceph_caps_finalize(mdsc); | 3653 | ceph_caps_finalize(mdsc); |
3654 | ceph_pool_perm_destroy(mdsc); | ||
3610 | } | 3655 | } |
3611 | 3656 | ||
3612 | void ceph_mdsc_destroy(struct ceph_fs_client *fsc) | 3657 | void ceph_mdsc_destroy(struct ceph_fs_client *fsc) |
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 1875b5d985c6..762757e6cebf 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h | |||
@@ -139,7 +139,6 @@ struct ceph_mds_session { | |||
139 | int s_cap_reconnect; | 139 | int s_cap_reconnect; |
140 | int s_readonly; | 140 | int s_readonly; |
141 | struct list_head s_cap_releases; /* waiting cap_release messages */ | 141 | struct list_head s_cap_releases; /* waiting cap_release messages */ |
142 | struct list_head s_cap_releases_done; /* ready to send */ | ||
143 | struct ceph_cap *s_cap_iterator; | 142 | struct ceph_cap *s_cap_iterator; |
144 | 143 | ||
145 | /* protected by mutex */ | 144 | /* protected by mutex */ |
@@ -228,7 +227,7 @@ struct ceph_mds_request { | |||
228 | int r_err; | 227 | int r_err; |
229 | bool r_aborted; | 228 | bool r_aborted; |
230 | 229 | ||
231 | unsigned long r_timeout; /* optional. jiffies */ | 230 | unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ |
232 | unsigned long r_started; /* start time to measure timeout against */ | 231 | unsigned long r_started; /* start time to measure timeout against */ |
233 | unsigned long r_request_started; /* start time for mds request only, | 232 | unsigned long r_request_started; /* start time for mds request only, |
234 | used to measure lease durations */ | 233 | used to measure lease durations */ |
@@ -254,12 +253,21 @@ struct ceph_mds_request { | |||
254 | bool r_got_unsafe, r_got_safe, r_got_result; | 253 | bool r_got_unsafe, r_got_safe, r_got_result; |
255 | 254 | ||
256 | bool r_did_prepopulate; | 255 | bool r_did_prepopulate; |
256 | long long r_dir_release_cnt; | ||
257 | long long r_dir_ordered_cnt; | ||
258 | int r_readdir_cache_idx; | ||
257 | u32 r_readdir_offset; | 259 | u32 r_readdir_offset; |
258 | 260 | ||
259 | struct ceph_cap_reservation r_caps_reservation; | 261 | struct ceph_cap_reservation r_caps_reservation; |
260 | int r_num_caps; | 262 | int r_num_caps; |
261 | }; | 263 | }; |
262 | 264 | ||
265 | struct ceph_pool_perm { | ||
266 | struct rb_node node; | ||
267 | u32 pool; | ||
268 | int perm; | ||
269 | }; | ||
270 | |||
263 | /* | 271 | /* |
264 | * mds client state | 272 | * mds client state |
265 | */ | 273 | */ |
@@ -284,12 +292,15 @@ struct ceph_mds_client { | |||
284 | * references (implying they contain no inodes with caps) that | 292 | * references (implying they contain no inodes with caps) that |
285 | * should be destroyed. | 293 | * should be destroyed. |
286 | */ | 294 | */ |
295 | u64 last_snap_seq; | ||
287 | struct rw_semaphore snap_rwsem; | 296 | struct rw_semaphore snap_rwsem; |
288 | struct rb_root snap_realms; | 297 | struct rb_root snap_realms; |
289 | struct list_head snap_empty; | 298 | struct list_head snap_empty; |
290 | spinlock_t snap_empty_lock; /* protect snap_empty */ | 299 | spinlock_t snap_empty_lock; /* protect snap_empty */ |
291 | 300 | ||
292 | u64 last_tid; /* most recent mds request */ | 301 | u64 last_tid; /* most recent mds request */ |
302 | u64 oldest_tid; /* oldest incomplete mds request, | ||
303 | excluding setfilelock requests */ | ||
293 | struct rb_root request_tree; /* pending mds requests */ | 304 | struct rb_root request_tree; /* pending mds requests */ |
294 | struct delayed_work delayed_work; /* delayed work */ | 305 | struct delayed_work delayed_work; /* delayed work */ |
295 | unsigned long last_renew_caps; /* last time we renewed our caps */ | 306 | unsigned long last_renew_caps; /* last time we renewed our caps */ |
@@ -298,7 +309,8 @@ struct ceph_mds_client { | |||
298 | struct list_head snap_flush_list; /* cap_snaps ready to flush */ | 309 | struct list_head snap_flush_list; /* cap_snaps ready to flush */ |
299 | spinlock_t snap_flush_lock; | 310 | spinlock_t snap_flush_lock; |
300 | 311 | ||
301 | u64 cap_flush_seq; | 312 | u64 last_cap_flush_tid; |
313 | struct rb_root cap_flush_tree; | ||
302 | struct list_head cap_dirty; /* inodes with dirty caps */ | 314 | struct list_head cap_dirty; /* inodes with dirty caps */ |
303 | struct list_head cap_dirty_migrating; /* ...that are migration... */ | 315 | struct list_head cap_dirty_migrating; /* ...that are migration... */ |
304 | int num_cap_flushing; /* # caps we are flushing */ | 316 | int num_cap_flushing; /* # caps we are flushing */ |
@@ -328,6 +340,9 @@ struct ceph_mds_client { | |||
328 | spinlock_t dentry_lru_lock; | 340 | spinlock_t dentry_lru_lock; |
329 | struct list_head dentry_lru; | 341 | struct list_head dentry_lru; |
330 | int num_dentry; | 342 | int num_dentry; |
343 | |||
344 | struct rw_semaphore pool_perm_rwsem; | ||
345 | struct rb_root pool_perm_tree; | ||
331 | }; | 346 | }; |
332 | 347 | ||
333 | extern const char *ceph_mds_op_name(int op); | 348 | extern const char *ceph_mds_op_name(int op); |
@@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) | |||
379 | kref_put(&req->r_kref, ceph_mdsc_release_request); | 394 | kref_put(&req->r_kref, ceph_mdsc_release_request); |
380 | } | 395 | } |
381 | 396 | ||
382 | extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, | ||
383 | struct ceph_mds_session *session); | ||
384 | extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, | 397 | extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, |
385 | struct ceph_mds_session *session); | 398 | struct ceph_mds_session *session); |
386 | 399 | ||
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index a97e39f09ba6..233d906aec02 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c | |||
@@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b) | |||
296 | } | 296 | } |
297 | 297 | ||
298 | 298 | ||
299 | static struct ceph_snap_context *empty_snapc; | 299 | struct ceph_snap_context *ceph_empty_snapc; |
300 | 300 | ||
301 | /* | 301 | /* |
302 | * build the snap context for a given realm. | 302 | * build the snap context for a given realm. |
@@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) | |||
338 | return 0; | 338 | return 0; |
339 | } | 339 | } |
340 | 340 | ||
341 | if (num == 0 && realm->seq == empty_snapc->seq) { | 341 | if (num == 0 && realm->seq == ceph_empty_snapc->seq) { |
342 | ceph_get_snap_context(empty_snapc); | 342 | ceph_get_snap_context(ceph_empty_snapc); |
343 | snapc = empty_snapc; | 343 | snapc = ceph_empty_snapc; |
344 | goto done; | 344 | goto done; |
345 | } | 345 | } |
346 | 346 | ||
@@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num) | |||
436 | return 0; | 436 | return 0; |
437 | } | 437 | } |
438 | 438 | ||
439 | static bool has_new_snaps(struct ceph_snap_context *o, | ||
440 | struct ceph_snap_context *n) | ||
441 | { | ||
442 | if (n->num_snaps == 0) | ||
443 | return false; | ||
444 | /* snaps are in descending order */ | ||
445 | return n->snaps[0] > o->seq; | ||
446 | } | ||
439 | 447 | ||
440 | /* | 448 | /* |
441 | * When a snapshot is applied, the size/mtime inode metadata is queued | 449 | * When a snapshot is applied, the size/mtime inode metadata is queued |
@@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) | |||
455 | { | 463 | { |
456 | struct inode *inode = &ci->vfs_inode; | 464 | struct inode *inode = &ci->vfs_inode; |
457 | struct ceph_cap_snap *capsnap; | 465 | struct ceph_cap_snap *capsnap; |
466 | struct ceph_snap_context *old_snapc, *new_snapc; | ||
458 | int used, dirty; | 467 | int used, dirty; |
459 | 468 | ||
460 | capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); | 469 | capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); |
@@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) | |||
467 | used = __ceph_caps_used(ci); | 476 | used = __ceph_caps_used(ci); |
468 | dirty = __ceph_caps_dirty(ci); | 477 | dirty = __ceph_caps_dirty(ci); |
469 | 478 | ||
479 | old_snapc = ci->i_head_snapc; | ||
480 | new_snapc = ci->i_snap_realm->cached_context; | ||
481 | |||
470 | /* | 482 | /* |
471 | * If there is a write in progress, treat that as a dirty Fw, | 483 | * If there is a write in progress, treat that as a dirty Fw, |
472 | * even though it hasn't completed yet; by the time we finish | 484 | * even though it hasn't completed yet; by the time we finish |
@@ -481,76 +493,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) | |||
481 | writes in progress now were started before the previous | 493 | writes in progress now were started before the previous |
482 | cap_snap. lucky us. */ | 494 | cap_snap. lucky us. */ |
483 | dout("queue_cap_snap %p already pending\n", inode); | 495 | dout("queue_cap_snap %p already pending\n", inode); |
484 | kfree(capsnap); | 496 | goto update_snapc; |
485 | } else if (ci->i_snap_realm->cached_context == empty_snapc) { | 497 | } |
486 | dout("queue_cap_snap %p empty snapc\n", inode); | 498 | if (ci->i_wrbuffer_ref_head == 0 && |
487 | kfree(capsnap); | 499 | !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { |
488 | } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| | 500 | dout("queue_cap_snap %p nothing dirty|writing\n", inode); |
489 | CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { | 501 | goto update_snapc; |
490 | struct ceph_snap_context *snapc = ci->i_head_snapc; | 502 | } |
491 | |||
492 | /* | ||
493 | * if we are a sync write, we may need to go to the snaprealm | ||
494 | * to get the current snapc. | ||
495 | */ | ||
496 | if (!snapc) | ||
497 | snapc = ci->i_snap_realm->cached_context; | ||
498 | 503 | ||
499 | dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", | 504 | BUG_ON(!old_snapc); |
500 | inode, capsnap, snapc, ceph_cap_string(dirty)); | ||
501 | ihold(inode); | ||
502 | 505 | ||
503 | atomic_set(&capsnap->nref, 1); | 506 | /* |
504 | capsnap->ci = ci; | 507 | * There is no need to send FLUSHSNAP message to MDS if there is |
505 | INIT_LIST_HEAD(&capsnap->ci_item); | 508 | * no new snapshot. But when there is dirty pages or on-going |
506 | INIT_LIST_HEAD(&capsnap->flushing_item); | 509 | * writes, we still need to create cap_snap. cap_snap is needed |
507 | 510 | * by the write path and page writeback path. | |
508 | capsnap->follows = snapc->seq; | 511 | * |
509 | capsnap->issued = __ceph_caps_issued(ci, NULL); | 512 | * also see ceph_try_drop_cap_snap() |
510 | capsnap->dirty = dirty; | 513 | */ |
511 | 514 | if (has_new_snaps(old_snapc, new_snapc)) { | |
512 | capsnap->mode = inode->i_mode; | 515 | if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR)) |
513 | capsnap->uid = inode->i_uid; | 516 | capsnap->need_flush = true; |
514 | capsnap->gid = inode->i_gid; | 517 | } else { |
515 | 518 | if (!(used & CEPH_CAP_FILE_WR) && | |
516 | if (dirty & CEPH_CAP_XATTR_EXCL) { | 519 | ci->i_wrbuffer_ref_head == 0) { |
517 | __ceph_build_xattrs_blob(ci); | 520 | dout("queue_cap_snap %p " |
518 | capsnap->xattr_blob = | 521 | "no new_snap|dirty_page|writing\n", inode); |
519 | ceph_buffer_get(ci->i_xattrs.blob); | 522 | goto update_snapc; |
520 | capsnap->xattr_version = ci->i_xattrs.version; | ||
521 | } else { | ||
522 | capsnap->xattr_blob = NULL; | ||
523 | capsnap->xattr_version = 0; | ||
524 | } | 523 | } |
524 | } | ||
525 | 525 | ||
526 | capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; | 526 | dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n", |
527 | 527 | inode, capsnap, old_snapc, ceph_cap_string(dirty), | |
528 | /* dirty page count moved from _head to this cap_snap; | 528 | capsnap->need_flush ? "" : "no_flush"); |
529 | all subsequent writes page dirties occur _after_ this | 529 | ihold(inode); |
530 | snapshot. */ | 530 | |
531 | capsnap->dirty_pages = ci->i_wrbuffer_ref_head; | 531 | atomic_set(&capsnap->nref, 1); |
532 | ci->i_wrbuffer_ref_head = 0; | 532 | capsnap->ci = ci; |
533 | capsnap->context = snapc; | 533 | INIT_LIST_HEAD(&capsnap->ci_item); |
534 | ci->i_head_snapc = | 534 | INIT_LIST_HEAD(&capsnap->flushing_item); |
535 | ceph_get_snap_context(ci->i_snap_realm->cached_context); | 535 | |
536 | dout(" new snapc is %p\n", ci->i_head_snapc); | 536 | capsnap->follows = old_snapc->seq; |
537 | list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); | 537 | capsnap->issued = __ceph_caps_issued(ci, NULL); |
538 | 538 | capsnap->dirty = dirty; | |
539 | if (used & CEPH_CAP_FILE_WR) { | 539 | |
540 | dout("queue_cap_snap %p cap_snap %p snapc %p" | 540 | capsnap->mode = inode->i_mode; |
541 | " seq %llu used WR, now pending\n", inode, | 541 | capsnap->uid = inode->i_uid; |
542 | capsnap, snapc, snapc->seq); | 542 | capsnap->gid = inode->i_gid; |
543 | capsnap->writing = 1; | 543 | |
544 | } else { | 544 | if (dirty & CEPH_CAP_XATTR_EXCL) { |
545 | /* note mtime, size NOW. */ | 545 | __ceph_build_xattrs_blob(ci); |
546 | __ceph_finish_cap_snap(ci, capsnap); | 546 | capsnap->xattr_blob = |
547 | } | 547 | ceph_buffer_get(ci->i_xattrs.blob); |
548 | capsnap->xattr_version = ci->i_xattrs.version; | ||
548 | } else { | 549 | } else { |
549 | dout("queue_cap_snap %p nothing dirty|writing\n", inode); | 550 | capsnap->xattr_blob = NULL; |
550 | kfree(capsnap); | 551 | capsnap->xattr_version = 0; |
551 | } | 552 | } |
552 | 553 | ||
554 | capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; | ||
555 | |||
556 | /* dirty page count moved from _head to this cap_snap; | ||
557 | all subsequent writes page dirties occur _after_ this | ||
558 | snapshot. */ | ||
559 | capsnap->dirty_pages = ci->i_wrbuffer_ref_head; | ||
560 | ci->i_wrbuffer_ref_head = 0; | ||
561 | capsnap->context = old_snapc; | ||
562 | list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); | ||
563 | old_snapc = NULL; | ||
564 | |||
565 | if (used & CEPH_CAP_FILE_WR) { | ||
566 | dout("queue_cap_snap %p cap_snap %p snapc %p" | ||
567 | " seq %llu used WR, now pending\n", inode, | ||
568 | capsnap, old_snapc, old_snapc->seq); | ||
569 | capsnap->writing = 1; | ||
570 | } else { | ||
571 | /* note mtime, size NOW. */ | ||
572 | __ceph_finish_cap_snap(ci, capsnap); | ||
573 | } | ||
574 | capsnap = NULL; | ||
575 | |||
576 | update_snapc: | ||
577 | if (ci->i_head_snapc) { | ||
578 | ci->i_head_snapc = ceph_get_snap_context(new_snapc); | ||
579 | dout(" new snapc is %p\n", new_snapc); | ||
580 | } | ||
553 | spin_unlock(&ci->i_ceph_lock); | 581 | spin_unlock(&ci->i_ceph_lock); |
582 | |||
583 | kfree(capsnap); | ||
584 | ceph_put_snap_context(old_snapc); | ||
554 | } | 585 | } |
555 | 586 | ||
556 | /* | 587 | /* |
@@ -699,6 +730,8 @@ more: | |||
699 | 730 | ||
700 | /* queue realm for cap_snap creation */ | 731 | /* queue realm for cap_snap creation */ |
701 | list_add(&realm->dirty_item, &dirty_realms); | 732 | list_add(&realm->dirty_item, &dirty_realms); |
733 | if (realm->seq > mdsc->last_snap_seq) | ||
734 | mdsc->last_snap_seq = realm->seq; | ||
702 | 735 | ||
703 | invalidate = 1; | 736 | invalidate = 1; |
704 | } else if (!realm->cached_context) { | 737 | } else if (!realm->cached_context) { |
@@ -964,14 +997,14 @@ out: | |||
964 | 997 | ||
965 | int __init ceph_snap_init(void) | 998 | int __init ceph_snap_init(void) |
966 | { | 999 | { |
967 | empty_snapc = ceph_create_snap_context(0, GFP_NOFS); | 1000 | ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS); |
968 | if (!empty_snapc) | 1001 | if (!ceph_empty_snapc) |
969 | return -ENOMEM; | 1002 | return -ENOMEM; |
970 | empty_snapc->seq = 1; | 1003 | ceph_empty_snapc->seq = 1; |
971 | return 0; | 1004 | return 0; |
972 | } | 1005 | } |
973 | 1006 | ||
974 | void ceph_snap_exit(void) | 1007 | void ceph_snap_exit(void) |
975 | { | 1008 | { |
976 | ceph_put_snap_context(empty_snapc); | 1009 | ceph_put_snap_context(ceph_empty_snapc); |
977 | } | 1010 | } |
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 4e9905374078..d1c833c321b9 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
@@ -134,10 +134,12 @@ enum { | |||
134 | Opt_noino32, | 134 | Opt_noino32, |
135 | Opt_fscache, | 135 | Opt_fscache, |
136 | Opt_nofscache, | 136 | Opt_nofscache, |
137 | Opt_poolperm, | ||
138 | Opt_nopoolperm, | ||
137 | #ifdef CONFIG_CEPH_FS_POSIX_ACL | 139 | #ifdef CONFIG_CEPH_FS_POSIX_ACL |
138 | Opt_acl, | 140 | Opt_acl, |
139 | #endif | 141 | #endif |
140 | Opt_noacl | 142 | Opt_noacl, |
141 | }; | 143 | }; |
142 | 144 | ||
143 | static match_table_t fsopt_tokens = { | 145 | static match_table_t fsopt_tokens = { |
@@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = { | |||
165 | {Opt_noino32, "noino32"}, | 167 | {Opt_noino32, "noino32"}, |
166 | {Opt_fscache, "fsc"}, | 168 | {Opt_fscache, "fsc"}, |
167 | {Opt_nofscache, "nofsc"}, | 169 | {Opt_nofscache, "nofsc"}, |
170 | {Opt_poolperm, "poolperm"}, | ||
171 | {Opt_nopoolperm, "nopoolperm"}, | ||
168 | #ifdef CONFIG_CEPH_FS_POSIX_ACL | 172 | #ifdef CONFIG_CEPH_FS_POSIX_ACL |
169 | {Opt_acl, "acl"}, | 173 | {Opt_acl, "acl"}, |
170 | #endif | 174 | #endif |
@@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private) | |||
268 | case Opt_nofscache: | 272 | case Opt_nofscache: |
269 | fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; | 273 | fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; |
270 | break; | 274 | break; |
275 | case Opt_poolperm: | ||
276 | fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; | ||
277 | printk ("pool perm"); | ||
278 | break; | ||
279 | case Opt_nopoolperm: | ||
280 | fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; | ||
281 | break; | ||
271 | #ifdef CONFIG_CEPH_FS_POSIX_ACL | 282 | #ifdef CONFIG_CEPH_FS_POSIX_ACL |
272 | case Opt_acl: | 283 | case Opt_acl: |
273 | fsopt->sb_flags |= MS_POSIXACL; | 284 | fsopt->sb_flags |= MS_POSIXACL; |
@@ -436,6 +447,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) | |||
436 | seq_puts(m, ",nodcache"); | 447 | seq_puts(m, ",nodcache"); |
437 | if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) | 448 | if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) |
438 | seq_puts(m, ",fsc"); | 449 | seq_puts(m, ",fsc"); |
450 | if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) | ||
451 | seq_puts(m, ",nopoolperm"); | ||
439 | 452 | ||
440 | #ifdef CONFIG_CEPH_FS_POSIX_ACL | 453 | #ifdef CONFIG_CEPH_FS_POSIX_ACL |
441 | if (fsopt->sb_flags & MS_POSIXACL) | 454 | if (fsopt->sb_flags & MS_POSIXACL) |
@@ -609,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) | |||
609 | */ | 622 | */ |
610 | struct kmem_cache *ceph_inode_cachep; | 623 | struct kmem_cache *ceph_inode_cachep; |
611 | struct kmem_cache *ceph_cap_cachep; | 624 | struct kmem_cache *ceph_cap_cachep; |
625 | struct kmem_cache *ceph_cap_flush_cachep; | ||
612 | struct kmem_cache *ceph_dentry_cachep; | 626 | struct kmem_cache *ceph_dentry_cachep; |
613 | struct kmem_cache *ceph_file_cachep; | 627 | struct kmem_cache *ceph_file_cachep; |
614 | 628 | ||
@@ -634,6 +648,10 @@ static int __init init_caches(void) | |||
634 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); | 648 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); |
635 | if (ceph_cap_cachep == NULL) | 649 | if (ceph_cap_cachep == NULL) |
636 | goto bad_cap; | 650 | goto bad_cap; |
651 | ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, | ||
652 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); | ||
653 | if (ceph_cap_flush_cachep == NULL) | ||
654 | goto bad_cap_flush; | ||
637 | 655 | ||
638 | ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, | 656 | ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, |
639 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); | 657 | SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); |
@@ -652,6 +670,8 @@ static int __init init_caches(void) | |||
652 | bad_file: | 670 | bad_file: |
653 | kmem_cache_destroy(ceph_dentry_cachep); | 671 | kmem_cache_destroy(ceph_dentry_cachep); |
654 | bad_dentry: | 672 | bad_dentry: |
673 | kmem_cache_destroy(ceph_cap_flush_cachep); | ||
674 | bad_cap_flush: | ||
655 | kmem_cache_destroy(ceph_cap_cachep); | 675 | kmem_cache_destroy(ceph_cap_cachep); |
656 | bad_cap: | 676 | bad_cap: |
657 | kmem_cache_destroy(ceph_inode_cachep); | 677 | kmem_cache_destroy(ceph_inode_cachep); |
@@ -668,6 +688,7 @@ static void destroy_caches(void) | |||
668 | 688 | ||
669 | kmem_cache_destroy(ceph_inode_cachep); | 689 | kmem_cache_destroy(ceph_inode_cachep); |
670 | kmem_cache_destroy(ceph_cap_cachep); | 690 | kmem_cache_destroy(ceph_cap_cachep); |
691 | kmem_cache_destroy(ceph_cap_flush_cachep); | ||
671 | kmem_cache_destroy(ceph_dentry_cachep); | 692 | kmem_cache_destroy(ceph_dentry_cachep); |
672 | kmem_cache_destroy(ceph_file_cachep); | 693 | kmem_cache_destroy(ceph_file_cachep); |
673 | 694 | ||
@@ -729,7 +750,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, | |||
729 | req->r_ino1.ino = CEPH_INO_ROOT; | 750 | req->r_ino1.ino = CEPH_INO_ROOT; |
730 | req->r_ino1.snap = CEPH_NOSNAP; | 751 | req->r_ino1.snap = CEPH_NOSNAP; |
731 | req->r_started = started; | 752 | req->r_started = started; |
732 | req->r_timeout = fsc->client->options->mount_timeout * HZ; | 753 | req->r_timeout = fsc->client->options->mount_timeout; |
733 | req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); | 754 | req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); |
734 | req->r_num_caps = 2; | 755 | req->r_num_caps = 2; |
735 | err = ceph_mdsc_do_request(mdsc, NULL, req); | 756 | err = ceph_mdsc_do_request(mdsc, NULL, req); |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fa20e1318939..860cc016e70d 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
@@ -35,6 +35,7 @@ | |||
35 | #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ | 35 | #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ |
36 | #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ | 36 | #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ |
37 | #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ | 37 | #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ |
38 | #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ | ||
38 | 39 | ||
39 | #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ | 40 | #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ |
40 | CEPH_MOUNT_OPT_DCACHE) | 41 | CEPH_MOUNT_OPT_DCACHE) |
@@ -121,11 +122,21 @@ struct ceph_cap { | |||
121 | struct rb_node ci_node; /* per-ci cap tree */ | 122 | struct rb_node ci_node; /* per-ci cap tree */ |
122 | struct ceph_mds_session *session; | 123 | struct ceph_mds_session *session; |
123 | struct list_head session_caps; /* per-session caplist */ | 124 | struct list_head session_caps; /* per-session caplist */ |
124 | int mds; | ||
125 | u64 cap_id; /* unique cap id (mds provided) */ | 125 | u64 cap_id; /* unique cap id (mds provided) */ |
126 | int issued; /* latest, from the mds */ | 126 | union { |
127 | int implemented; /* implemented superset of issued (for revocation) */ | 127 | /* in-use caps */ |
128 | int mds_wanted; | 128 | struct { |
129 | int issued; /* latest, from the mds */ | ||
130 | int implemented; /* implemented superset of | ||
131 | issued (for revocation) */ | ||
132 | int mds, mds_wanted; | ||
133 | }; | ||
134 | /* caps to release */ | ||
135 | struct { | ||
136 | u64 cap_ino; | ||
137 | int queue_release; | ||
138 | }; | ||
139 | }; | ||
129 | u32 seq, issue_seq, mseq; | 140 | u32 seq, issue_seq, mseq; |
130 | u32 cap_gen; /* active/stale cycle */ | 141 | u32 cap_gen; /* active/stale cycle */ |
131 | unsigned long last_used; | 142 | unsigned long last_used; |
@@ -163,6 +174,7 @@ struct ceph_cap_snap { | |||
163 | int writing; /* a sync write is still in progress */ | 174 | int writing; /* a sync write is still in progress */ |
164 | int dirty_pages; /* dirty pages awaiting writeback */ | 175 | int dirty_pages; /* dirty pages awaiting writeback */ |
165 | bool inline_data; | 176 | bool inline_data; |
177 | bool need_flush; | ||
166 | }; | 178 | }; |
167 | 179 | ||
168 | static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) | 180 | static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) |
@@ -174,6 +186,17 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) | |||
174 | } | 186 | } |
175 | } | 187 | } |
176 | 188 | ||
189 | struct ceph_cap_flush { | ||
190 | u64 tid; | ||
191 | int caps; | ||
192 | bool kick; | ||
193 | struct rb_node g_node; // global | ||
194 | union { | ||
195 | struct rb_node i_node; // inode | ||
196 | struct list_head list; | ||
197 | }; | ||
198 | }; | ||
199 | |||
177 | /* | 200 | /* |
178 | * The frag tree describes how a directory is fragmented, potentially across | 201 | * The frag tree describes how a directory is fragmented, potentially across |
179 | * multiple metadata servers. It is also used to indicate points where | 202 | * multiple metadata servers. It is also used to indicate points where |
@@ -259,9 +282,9 @@ struct ceph_inode_info { | |||
259 | u32 i_time_warp_seq; | 282 | u32 i_time_warp_seq; |
260 | 283 | ||
261 | unsigned i_ceph_flags; | 284 | unsigned i_ceph_flags; |
262 | int i_ordered_count; | 285 | atomic64_t i_release_count; |
263 | atomic_t i_release_count; | 286 | atomic64_t i_ordered_count; |
264 | atomic_t i_complete_count; | 287 | atomic64_t i_complete_seq[2]; |
265 | 288 | ||
266 | struct ceph_dir_layout i_dir_layout; | 289 | struct ceph_dir_layout i_dir_layout; |
267 | struct ceph_file_layout i_layout; | 290 | struct ceph_file_layout i_layout; |
@@ -283,11 +306,11 @@ struct ceph_inode_info { | |||
283 | struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ | 306 | struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ |
284 | unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ | 307 | unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ |
285 | struct list_head i_dirty_item, i_flushing_item; | 308 | struct list_head i_dirty_item, i_flushing_item; |
286 | u64 i_cap_flush_seq; | ||
287 | /* we need to track cap writeback on a per-cap-bit basis, to allow | 309 | /* we need to track cap writeback on a per-cap-bit basis, to allow |
288 | * overlapping, pipelined cap flushes to the mds. we can probably | 310 | * overlapping, pipelined cap flushes to the mds. we can probably |
289 | * reduce the tid to 8 bits if we're concerned about inode size. */ | 311 | * reduce the tid to 8 bits if we're concerned about inode size. */ |
290 | u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS]; | 312 | struct ceph_cap_flush *i_prealloc_cap_flush; |
313 | struct rb_root i_cap_flush_tree; | ||
291 | wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ | 314 | wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ |
292 | unsigned long i_hold_caps_min; /* jiffies */ | 315 | unsigned long i_hold_caps_min; /* jiffies */ |
293 | unsigned long i_hold_caps_max; /* jiffies */ | 316 | unsigned long i_hold_caps_max; /* jiffies */ |
@@ -438,36 +461,46 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, | |||
438 | /* | 461 | /* |
439 | * Ceph inode. | 462 | * Ceph inode. |
440 | */ | 463 | */ |
441 | #define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */ | 464 | #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ |
442 | #define CEPH_I_NODELAY 4 /* do not delay cap release */ | 465 | #define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ |
443 | #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ | 466 | #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ |
444 | #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ | 467 | #define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ |
468 | #define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ | ||
469 | #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ | ||
470 | #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ | ||
471 | |||
445 | 472 | ||
446 | static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, | 473 | static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, |
447 | int release_count, int ordered_count) | 474 | long long release_count, |
475 | long long ordered_count) | ||
448 | { | 476 | { |
449 | atomic_set(&ci->i_complete_count, release_count); | 477 | smp_mb__before_atomic(); |
450 | if (ci->i_ordered_count == ordered_count) | 478 | atomic64_set(&ci->i_complete_seq[0], release_count); |
451 | ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; | 479 | atomic64_set(&ci->i_complete_seq[1], ordered_count); |
452 | else | ||
453 | ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; | ||
454 | } | 480 | } |
455 | 481 | ||
456 | static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) | 482 | static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) |
457 | { | 483 | { |
458 | atomic_inc(&ci->i_release_count); | 484 | atomic64_inc(&ci->i_release_count); |
485 | } | ||
486 | |||
487 | static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci) | ||
488 | { | ||
489 | atomic64_inc(&ci->i_ordered_count); | ||
459 | } | 490 | } |
460 | 491 | ||
461 | static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) | 492 | static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) |
462 | { | 493 | { |
463 | return atomic_read(&ci->i_complete_count) == | 494 | return atomic64_read(&ci->i_complete_seq[0]) == |
464 | atomic_read(&ci->i_release_count); | 495 | atomic64_read(&ci->i_release_count); |
465 | } | 496 | } |
466 | 497 | ||
467 | static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) | 498 | static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) |
468 | { | 499 | { |
469 | return __ceph_dir_is_complete(ci) && | 500 | return atomic64_read(&ci->i_complete_seq[0]) == |
470 | (ci->i_ceph_flags & CEPH_I_DIR_ORDERED); | 501 | atomic64_read(&ci->i_release_count) && |
502 | atomic64_read(&ci->i_complete_seq[1]) == | ||
503 | atomic64_read(&ci->i_ordered_count); | ||
471 | } | 504 | } |
472 | 505 | ||
473 | static inline void ceph_dir_clear_complete(struct inode *inode) | 506 | static inline void ceph_dir_clear_complete(struct inode *inode) |
@@ -477,20 +510,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode) | |||
477 | 510 | ||
478 | static inline void ceph_dir_clear_ordered(struct inode *inode) | 511 | static inline void ceph_dir_clear_ordered(struct inode *inode) |
479 | { | 512 | { |
480 | struct ceph_inode_info *ci = ceph_inode(inode); | 513 | __ceph_dir_clear_ordered(ceph_inode(inode)); |
481 | spin_lock(&ci->i_ceph_lock); | ||
482 | ci->i_ordered_count++; | ||
483 | ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; | ||
484 | spin_unlock(&ci->i_ceph_lock); | ||
485 | } | 514 | } |
486 | 515 | ||
487 | static inline bool ceph_dir_is_complete_ordered(struct inode *inode) | 516 | static inline bool ceph_dir_is_complete_ordered(struct inode *inode) |
488 | { | 517 | { |
489 | struct ceph_inode_info *ci = ceph_inode(inode); | 518 | bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode)); |
490 | bool ret; | 519 | smp_rmb(); |
491 | spin_lock(&ci->i_ceph_lock); | ||
492 | ret = __ceph_dir_is_complete_ordered(ci); | ||
493 | spin_unlock(&ci->i_ceph_lock); | ||
494 | return ret; | 520 | return ret; |
495 | } | 521 | } |
496 | 522 | ||
@@ -552,7 +578,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) | |||
552 | { | 578 | { |
553 | return ci->i_dirty_caps | ci->i_flushing_caps; | 579 | return ci->i_dirty_caps | ci->i_flushing_caps; |
554 | } | 580 | } |
555 | extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); | 581 | extern struct ceph_cap_flush *ceph_alloc_cap_flush(void); |
582 | extern void ceph_free_cap_flush(struct ceph_cap_flush *cf); | ||
583 | extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, | ||
584 | struct ceph_cap_flush **pcf); | ||
556 | 585 | ||
557 | extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, | 586 | extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, |
558 | struct ceph_cap *ocap, int mask); | 587 | struct ceph_cap *ocap, int mask); |
@@ -606,16 +635,20 @@ struct ceph_file_info { | |||
606 | unsigned offset; /* offset of last chunk, adjusted for . and .. */ | 635 | unsigned offset; /* offset of last chunk, adjusted for . and .. */ |
607 | unsigned next_offset; /* offset of next chunk (last_name's + 1) */ | 636 | unsigned next_offset; /* offset of next chunk (last_name's + 1) */ |
608 | char *last_name; /* last entry in previous chunk */ | 637 | char *last_name; /* last entry in previous chunk */ |
609 | struct dentry *dentry; /* next dentry (for dcache readdir) */ | 638 | long long dir_release_count; |
610 | int dir_release_count; | 639 | long long dir_ordered_count; |
611 | int dir_ordered_count; | 640 | int readdir_cache_idx; |
612 | 641 | ||
613 | /* used for -o dirstat read() on directory thing */ | 642 | /* used for -o dirstat read() on directory thing */ |
614 | char *dir_info; | 643 | char *dir_info; |
615 | int dir_info_len; | 644 | int dir_info_len; |
616 | }; | 645 | }; |
617 | 646 | ||
618 | 647 | struct ceph_readdir_cache_control { | |
648 | struct page *page; | ||
649 | struct dentry **dentries; | ||
650 | int index; | ||
651 | }; | ||
619 | 652 | ||
620 | /* | 653 | /* |
621 | * A "snap realm" describes a subset of the file hierarchy sharing | 654 | * A "snap realm" describes a subset of the file hierarchy sharing |
@@ -687,6 +720,7 @@ static inline int default_congestion_kb(void) | |||
687 | 720 | ||
688 | 721 | ||
689 | /* snap.c */ | 722 | /* snap.c */ |
723 | extern struct ceph_snap_context *ceph_empty_snapc; | ||
690 | struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, | 724 | struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, |
691 | u64 ino); | 725 | u64 ino); |
692 | extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, | 726 | extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, |
@@ -713,8 +747,8 @@ extern void ceph_snap_exit(void); | |||
713 | static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) | 747 | static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) |
714 | { | 748 | { |
715 | return !list_empty(&ci->i_cap_snaps) && | 749 | return !list_empty(&ci->i_cap_snaps) && |
716 | list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap, | 750 | list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, |
717 | ci_item)->writing; | 751 | ci_item)->writing; |
718 | } | 752 | } |
719 | 753 | ||
720 | /* inode.c */ | 754 | /* inode.c */ |
@@ -838,12 +872,12 @@ extern void ceph_put_cap(struct ceph_mds_client *mdsc, | |||
838 | struct ceph_cap *cap); | 872 | struct ceph_cap *cap); |
839 | extern int ceph_is_any_caps(struct inode *inode); | 873 | extern int ceph_is_any_caps(struct inode *inode); |
840 | 874 | ||
841 | extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, | ||
842 | u64 cap_id, u32 migrate_seq, u32 issue_seq); | ||
843 | extern void ceph_queue_caps_release(struct inode *inode); | 875 | extern void ceph_queue_caps_release(struct inode *inode); |
844 | extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); | 876 | extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); |
845 | extern int ceph_fsync(struct file *file, loff_t start, loff_t end, | 877 | extern int ceph_fsync(struct file *file, loff_t start, loff_t end, |
846 | int datasync); | 878 | int datasync); |
879 | extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, | ||
880 | struct ceph_mds_session *session); | ||
847 | extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, | 881 | extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, |
848 | struct ceph_mds_session *session); | 882 | struct ceph_mds_session *session); |
849 | extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, | 883 | extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, |
@@ -879,6 +913,9 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); | |||
879 | /* addr.c */ | 913 | /* addr.c */ |
880 | extern const struct address_space_operations ceph_aops; | 914 | extern const struct address_space_operations ceph_aops; |
881 | extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); | 915 | extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); |
916 | extern int ceph_uninline_data(struct file *filp, struct page *locked_page); | ||
917 | extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); | ||
918 | extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); | ||
882 | 919 | ||
883 | /* file.c */ | 920 | /* file.c */ |
884 | extern const struct file_operations ceph_file_fops; | 921 | extern const struct file_operations ceph_file_fops; |
@@ -890,7 +927,6 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, | |||
890 | extern int ceph_release(struct inode *inode, struct file *filp); | 927 | extern int ceph_release(struct inode *inode, struct file *filp); |
891 | extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, | 928 | extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, |
892 | char *data, size_t len); | 929 | char *data, size_t len); |
893 | int ceph_uninline_data(struct file *filp, struct page *locked_page); | ||
894 | /* dir.c */ | 930 | /* dir.c */ |
895 | extern const struct file_operations ceph_dir_fops; | 931 | extern const struct file_operations ceph_dir_fops; |
896 | extern const struct file_operations ceph_snapdir_fops; | 932 | extern const struct file_operations ceph_snapdir_fops; |
@@ -911,6 +947,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn); | |||
911 | extern void ceph_invalidate_dentry_lease(struct dentry *dentry); | 947 | extern void ceph_invalidate_dentry_lease(struct dentry *dentry); |
912 | extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); | 948 | extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); |
913 | extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); | 949 | extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); |
950 | extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); | ||
914 | 951 | ||
915 | /* | 952 | /* |
916 | * our d_ops vary depending on whether the inode is live, | 953 | * our d_ops vary depending on whether the inode is live, |
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index cd7ffad4041d..819163d8313b 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c | |||
@@ -911,6 +911,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, | |||
911 | struct inode *inode = d_inode(dentry); | 911 | struct inode *inode = d_inode(dentry); |
912 | struct ceph_vxattr *vxattr; | 912 | struct ceph_vxattr *vxattr; |
913 | struct ceph_inode_info *ci = ceph_inode(inode); | 913 | struct ceph_inode_info *ci = ceph_inode(inode); |
914 | struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; | ||
915 | struct ceph_cap_flush *prealloc_cf = NULL; | ||
914 | int issued; | 916 | int issued; |
915 | int err; | 917 | int err; |
916 | int dirty = 0; | 918 | int dirty = 0; |
@@ -920,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, | |||
920 | char *newval = NULL; | 922 | char *newval = NULL; |
921 | struct ceph_inode_xattr *xattr = NULL; | 923 | struct ceph_inode_xattr *xattr = NULL; |
922 | int required_blob_size; | 924 | int required_blob_size; |
925 | bool lock_snap_rwsem = false; | ||
923 | 926 | ||
924 | if (!ceph_is_valid_xattr(name)) | 927 | if (!ceph_is_valid_xattr(name)) |
925 | return -EOPNOTSUPP; | 928 | return -EOPNOTSUPP; |
@@ -948,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, | |||
948 | if (!xattr) | 951 | if (!xattr) |
949 | goto out; | 952 | goto out; |
950 | 953 | ||
954 | prealloc_cf = ceph_alloc_cap_flush(); | ||
955 | if (!prealloc_cf) | ||
956 | goto out; | ||
957 | |||
951 | spin_lock(&ci->i_ceph_lock); | 958 | spin_lock(&ci->i_ceph_lock); |
952 | retry: | 959 | retry: |
953 | issued = __ceph_caps_issued(ci, NULL); | 960 | issued = __ceph_caps_issued(ci, NULL); |
954 | dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); | ||
955 | if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) | 961 | if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) |
956 | goto do_sync; | 962 | goto do_sync; |
963 | |||
964 | if (!lock_snap_rwsem && !ci->i_head_snapc) { | ||
965 | lock_snap_rwsem = true; | ||
966 | if (!down_read_trylock(&mdsc->snap_rwsem)) { | ||
967 | spin_unlock(&ci->i_ceph_lock); | ||
968 | down_read(&mdsc->snap_rwsem); | ||
969 | spin_lock(&ci->i_ceph_lock); | ||
970 | goto retry; | ||
971 | } | ||
972 | } | ||
973 | |||
974 | dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); | ||
957 | __build_xattrs(inode); | 975 | __build_xattrs(inode); |
958 | 976 | ||
959 | required_blob_size = __get_required_blob_size(ci, name_len, val_len); | 977 | required_blob_size = __get_required_blob_size(ci, name_len, val_len); |
@@ -966,7 +984,7 @@ retry: | |||
966 | dout(" preaallocating new blob size=%d\n", required_blob_size); | 984 | dout(" preaallocating new blob size=%d\n", required_blob_size); |
967 | blob = ceph_buffer_new(required_blob_size, GFP_NOFS); | 985 | blob = ceph_buffer_new(required_blob_size, GFP_NOFS); |
968 | if (!blob) | 986 | if (!blob) |
969 | goto out; | 987 | goto do_sync_unlocked; |
970 | spin_lock(&ci->i_ceph_lock); | 988 | spin_lock(&ci->i_ceph_lock); |
971 | if (ci->i_xattrs.prealloc_blob) | 989 | if (ci->i_xattrs.prealloc_blob) |
972 | ceph_buffer_put(ci->i_xattrs.prealloc_blob); | 990 | ceph_buffer_put(ci->i_xattrs.prealloc_blob); |
@@ -978,21 +996,28 @@ retry: | |||
978 | flags, value ? 1 : -1, &xattr); | 996 | flags, value ? 1 : -1, &xattr); |
979 | 997 | ||
980 | if (!err) { | 998 | if (!err) { |
981 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); | 999 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, |
1000 | &prealloc_cf); | ||
982 | ci->i_xattrs.dirty = true; | 1001 | ci->i_xattrs.dirty = true; |
983 | inode->i_ctime = CURRENT_TIME; | 1002 | inode->i_ctime = CURRENT_TIME; |
984 | } | 1003 | } |
985 | 1004 | ||
986 | spin_unlock(&ci->i_ceph_lock); | 1005 | spin_unlock(&ci->i_ceph_lock); |
1006 | if (lock_snap_rwsem) | ||
1007 | up_read(&mdsc->snap_rwsem); | ||
987 | if (dirty) | 1008 | if (dirty) |
988 | __mark_inode_dirty(inode, dirty); | 1009 | __mark_inode_dirty(inode, dirty); |
1010 | ceph_free_cap_flush(prealloc_cf); | ||
989 | return err; | 1011 | return err; |
990 | 1012 | ||
991 | do_sync: | 1013 | do_sync: |
992 | spin_unlock(&ci->i_ceph_lock); | 1014 | spin_unlock(&ci->i_ceph_lock); |
993 | do_sync_unlocked: | 1015 | do_sync_unlocked: |
1016 | if (lock_snap_rwsem) | ||
1017 | up_read(&mdsc->snap_rwsem); | ||
994 | err = ceph_sync_setxattr(dentry, name, value, size, flags); | 1018 | err = ceph_sync_setxattr(dentry, name, value, size, flags); |
995 | out: | 1019 | out: |
1020 | ceph_free_cap_flush(prealloc_cf); | ||
996 | kfree(newname); | 1021 | kfree(newname); |
997 | kfree(newval); | 1022 | kfree(newval); |
998 | kfree(xattr); | 1023 | kfree(xattr); |
@@ -1044,10 +1069,13 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) | |||
1044 | struct inode *inode = d_inode(dentry); | 1069 | struct inode *inode = d_inode(dentry); |
1045 | struct ceph_vxattr *vxattr; | 1070 | struct ceph_vxattr *vxattr; |
1046 | struct ceph_inode_info *ci = ceph_inode(inode); | 1071 | struct ceph_inode_info *ci = ceph_inode(inode); |
1072 | struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; | ||
1073 | struct ceph_cap_flush *prealloc_cf = NULL; | ||
1047 | int issued; | 1074 | int issued; |
1048 | int err; | 1075 | int err; |
1049 | int required_blob_size; | 1076 | int required_blob_size; |
1050 | int dirty; | 1077 | int dirty; |
1078 | bool lock_snap_rwsem = false; | ||
1051 | 1079 | ||
1052 | if (!ceph_is_valid_xattr(name)) | 1080 | if (!ceph_is_valid_xattr(name)) |
1053 | return -EOPNOTSUPP; | 1081 | return -EOPNOTSUPP; |
@@ -1060,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) | |||
1060 | if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) | 1088 | if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) |
1061 | goto do_sync_unlocked; | 1089 | goto do_sync_unlocked; |
1062 | 1090 | ||
1091 | prealloc_cf = ceph_alloc_cap_flush(); | ||
1092 | if (!prealloc_cf) | ||
1093 | return -ENOMEM; | ||
1094 | |||
1063 | err = -ENOMEM; | 1095 | err = -ENOMEM; |
1064 | spin_lock(&ci->i_ceph_lock); | 1096 | spin_lock(&ci->i_ceph_lock); |
1065 | retry: | 1097 | retry: |
1066 | issued = __ceph_caps_issued(ci, NULL); | 1098 | issued = __ceph_caps_issued(ci, NULL); |
1067 | dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); | ||
1068 | |||
1069 | if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) | 1099 | if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) |
1070 | goto do_sync; | 1100 | goto do_sync; |
1101 | |||
1102 | if (!lock_snap_rwsem && !ci->i_head_snapc) { | ||
1103 | lock_snap_rwsem = true; | ||
1104 | if (!down_read_trylock(&mdsc->snap_rwsem)) { | ||
1105 | spin_unlock(&ci->i_ceph_lock); | ||
1106 | down_read(&mdsc->snap_rwsem); | ||
1107 | spin_lock(&ci->i_ceph_lock); | ||
1108 | goto retry; | ||
1109 | } | ||
1110 | } | ||
1111 | |||
1112 | dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); | ||
1113 | |||
1071 | __build_xattrs(inode); | 1114 | __build_xattrs(inode); |
1072 | 1115 | ||
1073 | required_blob_size = __get_required_blob_size(ci, 0, 0); | 1116 | required_blob_size = __get_required_blob_size(ci, 0, 0); |
@@ -1080,7 +1123,7 @@ retry: | |||
1080 | dout(" preaallocating new blob size=%d\n", required_blob_size); | 1123 | dout(" preaallocating new blob size=%d\n", required_blob_size); |
1081 | blob = ceph_buffer_new(required_blob_size, GFP_NOFS); | 1124 | blob = ceph_buffer_new(required_blob_size, GFP_NOFS); |
1082 | if (!blob) | 1125 | if (!blob) |
1083 | goto out; | 1126 | goto do_sync_unlocked; |
1084 | spin_lock(&ci->i_ceph_lock); | 1127 | spin_lock(&ci->i_ceph_lock); |
1085 | if (ci->i_xattrs.prealloc_blob) | 1128 | if (ci->i_xattrs.prealloc_blob) |
1086 | ceph_buffer_put(ci->i_xattrs.prealloc_blob); | 1129 | ceph_buffer_put(ci->i_xattrs.prealloc_blob); |
@@ -1090,18 +1133,24 @@ retry: | |||
1090 | 1133 | ||
1091 | err = __remove_xattr_by_name(ceph_inode(inode), name); | 1134 | err = __remove_xattr_by_name(ceph_inode(inode), name); |
1092 | 1135 | ||
1093 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); | 1136 | dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, |
1137 | &prealloc_cf); | ||
1094 | ci->i_xattrs.dirty = true; | 1138 | ci->i_xattrs.dirty = true; |
1095 | inode->i_ctime = CURRENT_TIME; | 1139 | inode->i_ctime = CURRENT_TIME; |
1096 | spin_unlock(&ci->i_ceph_lock); | 1140 | spin_unlock(&ci->i_ceph_lock); |
1141 | if (lock_snap_rwsem) | ||
1142 | up_read(&mdsc->snap_rwsem); | ||
1097 | if (dirty) | 1143 | if (dirty) |
1098 | __mark_inode_dirty(inode, dirty); | 1144 | __mark_inode_dirty(inode, dirty); |
1145 | ceph_free_cap_flush(prealloc_cf); | ||
1099 | return err; | 1146 | return err; |
1100 | do_sync: | 1147 | do_sync: |
1101 | spin_unlock(&ci->i_ceph_lock); | 1148 | spin_unlock(&ci->i_ceph_lock); |
1102 | do_sync_unlocked: | 1149 | do_sync_unlocked: |
1150 | if (lock_snap_rwsem) | ||
1151 | up_read(&mdsc->snap_rwsem); | ||
1152 | ceph_free_cap_flush(prealloc_cf); | ||
1103 | err = ceph_send_removexattr(dentry, name); | 1153 | err = ceph_send_removexattr(dentry, name); |
1104 | out: | ||
1105 | return err; | 1154 | return err; |
1106 | } | 1155 | } |
1107 | 1156 | ||