diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-25 13:21:13 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-25 13:21:13 -0400 |
commit | f41def397161053eb0d3ed6861ef65985efbf293 (patch) | |
tree | 28c03e8f26fc975ab059ff407b0c3d9165bc489f | |
parent | 7b1373dd6e86f3a222590ae404a400e699b32884 (diff) | |
parent | 3ee5a7015c8b7cb4de21f7345f8381946f2fce55 (diff) |
Merge tag 'ceph-for-5.4-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:
"The highlights are:
- automatic recovery of a blacklisted filesystem session (Zheng Yan).
This is disabled by default and can be enabled by mounting with the
new "recover_session=clean" option.
- serialize buffered reads and O_DIRECT writes (Jeff Layton). Care is
taken to avoid serializing O_DIRECT reads and writes with each
other; this is based on the exclusion scheme from NFS.
- handle large osdmaps better in the face of fragmented memory
(myself)
- don't limit what security.* xattrs can be retrieved or set (Jeff Layton).
We were overly restrictive here, unnecessarily preventing things
like file capability sets stored in security.capability from
working.
- allow copy_file_range() within the same inode and across different
filesystems within the same cluster (Luis Henriques)"
* tag 'ceph-for-5.4-rc1' of git://github.com/ceph/ceph-client: (41 commits)
ceph: call ceph_mdsc_destroy from destroy_fs_client
libceph: use ceph_kvmalloc() for osdmap arrays
libceph: avoid a __vmalloc() deadlock in ceph_kvmalloc()
ceph: allow object copies across different filesystems in the same cluster
ceph: include ceph_debug.h in cache.c
ceph: move static keyword to the front of declarations
rbd: pull rbd_img_request_create() dout out into the callers
ceph: reconnect connection if session hang in opening state
libceph: drop unused con parameter of calc_target()
ceph: use release_pages() directly
rbd: fix response length parameter for encoded strings
ceph: allow arbitrary security.* xattrs
ceph: only set CEPH_I_SEC_INITED if we got a MAC label
ceph: turn ceph_security_invalidate_secctx into static inline
ceph: add buffered/direct exclusionary locking for reads and writes
libceph: handle OSD op ceph_pagelist_append() errors
ceph: don't return a value from void function
ceph: don't freeze during write page faults
ceph: update the mtime when truncating up
ceph: fix indentation in __get_snap_name()
...
-rw-r--r-- | Documentation/filesystems/ceph.txt | 14 | ||||
-rw-r--r-- | drivers/block/rbd.c | 18 | ||||
-rw-r--r-- | fs/ceph/Makefile | 2 | ||||
-rw-r--r-- | fs/ceph/addr.c | 61 | ||||
-rw-r--r-- | fs/ceph/cache.c | 2 | ||||
-rw-r--r-- | fs/ceph/caps.c | 173 | ||||
-rw-r--r-- | fs/ceph/debugfs.c | 1 | ||||
-rw-r--r-- | fs/ceph/export.c | 60 | ||||
-rw-r--r-- | fs/ceph/file.c | 104 | ||||
-rw-r--r-- | fs/ceph/inode.c | 50 | ||||
-rw-r--r-- | fs/ceph/io.c | 163 | ||||
-rw-r--r-- | fs/ceph/io.h | 12 | ||||
-rw-r--r-- | fs/ceph/locks.c | 8 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 110 | ||||
-rw-r--r-- | fs/ceph/mds_client.h | 8 | ||||
-rw-r--r-- | fs/ceph/super.c | 52 | ||||
-rw-r--r-- | fs/ceph/super.h | 49 | ||||
-rw-r--r-- | fs/ceph/xattr.c | 76 | ||||
-rw-r--r-- | include/linux/ceph/libceph.h | 1 | ||||
-rw-r--r-- | include/linux/ceph/messenger.h | 1 | ||||
-rw-r--r-- | include/linux/ceph/mon_client.h | 1 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h | 2 | ||||
-rw-r--r-- | net/ceph/ceph_common.c | 37 | ||||
-rw-r--r-- | net/ceph/messenger.c | 6 | ||||
-rw-r--r-- | net/ceph/mon_client.c | 7 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 65 | ||||
-rw-r--r-- | net/ceph/osdmap.c | 69 |
27 files changed, 767 insertions, 385 deletions
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index d2c6a5ccf0f5..b19b6a03f91c 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt | |||
@@ -158,6 +158,20 @@ Mount Options | |||
158 | copies. Currently, it's only used in copy_file_range, which will revert | 158 | copies. Currently, it's only used in copy_file_range, which will revert |
159 | to the default VFS implementation if this option is used. | 159 | to the default VFS implementation if this option is used. |
160 | 160 | ||
161 | recover_session=<no|clean> | ||
162 | Set auto reconnect mode in the case where the client is blacklisted. The | ||
163 | available modes are "no" and "clean". The default is "no". | ||
164 | |||
165 | * no: never attempt to reconnect when client detects that it has been | ||
166 | blacklisted. Operations will generally fail after being blacklisted. | ||
167 | |||
168 | * clean: client reconnects to the ceph cluster automatically when it | ||
169 | detects that it has been blacklisted. During reconnect, client drops | ||
170 | dirty data/metadata, invalidates page caches and writable file handles. | ||
171 | After reconnect, file locks become stale because the MDS loses track | ||
172 | of them. If an inode contains any stale file locks, read/write on the | ||
173 | inode is not allowed until applications release all stale file locks. | ||
174 | |||
161 | More Information | 175 | More Information |
162 | ================ | 176 | ================ |
163 | 177 | ||
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c8fb886aebd4..7c4350c0fb77 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c | |||
@@ -1754,8 +1754,6 @@ static struct rbd_img_request *rbd_img_request_create( | |||
1754 | mutex_init(&img_request->state_mutex); | 1754 | mutex_init(&img_request->state_mutex); |
1755 | kref_init(&img_request->kref); | 1755 | kref_init(&img_request->kref); |
1756 | 1756 | ||
1757 | dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev, | ||
1758 | obj_op_name(op_type), img_request); | ||
1759 | return img_request; | 1757 | return img_request; |
1760 | } | 1758 | } |
1761 | 1759 | ||
@@ -2944,6 +2942,9 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) | |||
2944 | __set_bit(IMG_REQ_CHILD, &child_img_req->flags); | 2942 | __set_bit(IMG_REQ_CHILD, &child_img_req->flags); |
2945 | child_img_req->obj_request = obj_req; | 2943 | child_img_req->obj_request = obj_req; |
2946 | 2944 | ||
2945 | dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req, | ||
2946 | obj_req); | ||
2947 | |||
2947 | if (!rbd_img_is_write(img_req)) { | 2948 | if (!rbd_img_is_write(img_req)) { |
2948 | switch (img_req->data_type) { | 2949 | switch (img_req->data_type) { |
2949 | case OBJ_REQUEST_BIO: | 2950 | case OBJ_REQUEST_BIO: |
@@ -4877,6 +4878,9 @@ static void rbd_queue_workfn(struct work_struct *work) | |||
4877 | img_request->rq = rq; | 4878 | img_request->rq = rq; |
4878 | snapc = NULL; /* img_request consumes a ref */ | 4879 | snapc = NULL; /* img_request consumes a ref */ |
4879 | 4880 | ||
4881 | dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev, | ||
4882 | img_request, obj_op_name(op_type), offset, length); | ||
4883 | |||
4880 | if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) | 4884 | if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) |
4881 | result = rbd_img_fill_nodata(img_request, offset, length); | 4885 | result = rbd_img_fill_nodata(img_request, offset, length); |
4882 | else | 4886 | else |
@@ -5669,17 +5673,20 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev) | |||
5669 | 5673 | ||
5670 | static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) | 5674 | static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) |
5671 | { | 5675 | { |
5676 | size_t size; | ||
5672 | void *reply_buf; | 5677 | void *reply_buf; |
5673 | int ret; | 5678 | int ret; |
5674 | void *p; | 5679 | void *p; |
5675 | 5680 | ||
5676 | reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); | 5681 | /* Response will be an encoded string, which includes a length */ |
5682 | size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX; | ||
5683 | reply_buf = kzalloc(size, GFP_KERNEL); | ||
5677 | if (!reply_buf) | 5684 | if (!reply_buf) |
5678 | return -ENOMEM; | 5685 | return -ENOMEM; |
5679 | 5686 | ||
5680 | ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, | 5687 | ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, |
5681 | &rbd_dev->header_oloc, "get_object_prefix", | 5688 | &rbd_dev->header_oloc, "get_object_prefix", |
5682 | NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); | 5689 | NULL, 0, reply_buf, size); |
5683 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 5690 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
5684 | if (ret < 0) | 5691 | if (ret < 0) |
5685 | goto out; | 5692 | goto out; |
@@ -6696,7 +6703,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) | |||
6696 | dout("rbd id object name is %s\n", oid.name); | 6703 | dout("rbd id object name is %s\n", oid.name); |
6697 | 6704 | ||
6698 | /* Response will be an encoded string, which includes a length */ | 6705 | /* Response will be an encoded string, which includes a length */ |
6699 | |||
6700 | size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; | 6706 | size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; |
6701 | response = kzalloc(size, GFP_NOIO); | 6707 | response = kzalloc(size, GFP_NOIO); |
6702 | if (!response) { | 6708 | if (!response) { |
@@ -6708,7 +6714,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) | |||
6708 | 6714 | ||
6709 | ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, | 6715 | ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, |
6710 | "get_id", NULL, 0, | 6716 | "get_id", NULL, 0, |
6711 | response, RBD_IMAGE_ID_LEN_MAX); | 6717 | response, size); |
6712 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); | 6718 | dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); |
6713 | if (ret == -ENOENT) { | 6719 | if (ret == -ENOENT) { |
6714 | image_id = kstrdup("", GFP_KERNEL); | 6720 | image_id = kstrdup("", GFP_KERNEL); |
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index a699e320393f..c1da294418d1 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile | |||
@@ -6,7 +6,7 @@ | |||
6 | obj-$(CONFIG_CEPH_FS) += ceph.o | 6 | obj-$(CONFIG_CEPH_FS) += ceph.o |
7 | 7 | ||
8 | ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ | 8 | ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ |
9 | export.o caps.o snap.o xattr.o quota.o \ | 9 | export.o caps.o snap.o xattr.o quota.o io.o \ |
10 | mds_client.o mdsmap.o strings.o ceph_frag.o \ | 10 | mds_client.o mdsmap.o strings.o ceph_frag.o \ |
11 | debugfs.o | 11 | debugfs.o |
12 | 12 | ||
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b3c8b886bf64..7ab616601141 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page) | |||
189 | { | 189 | { |
190 | struct inode *inode = file_inode(filp); | 190 | struct inode *inode = file_inode(filp); |
191 | struct ceph_inode_info *ci = ceph_inode(inode); | 191 | struct ceph_inode_info *ci = ceph_inode(inode); |
192 | struct ceph_osd_client *osdc = | 192 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
193 | &ceph_inode_to_client(inode)->client->osdc; | ||
194 | int err = 0; | 193 | int err = 0; |
195 | u64 off = page_offset(page); | 194 | u64 off = page_offset(page); |
196 | u64 len = PAGE_SIZE; | 195 | u64 len = PAGE_SIZE; |
@@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) | |||
219 | 218 | ||
220 | dout("readpage inode %p file %p page %p index %lu\n", | 219 | dout("readpage inode %p file %p page %p index %lu\n", |
221 | inode, filp, page, page->index); | 220 | inode, filp, page, page->index); |
222 | err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, | 221 | err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), |
223 | off, &len, | 222 | &ci->i_layout, off, &len, |
224 | ci->i_truncate_seq, ci->i_truncate_size, | 223 | ci->i_truncate_seq, ci->i_truncate_size, |
225 | &page, 1, 0); | 224 | &page, 1, 0); |
226 | if (err == -ENOENT) | 225 | if (err == -ENOENT) |
@@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) | |||
228 | if (err < 0) { | 227 | if (err < 0) { |
229 | SetPageError(page); | 228 | SetPageError(page); |
230 | ceph_fscache_readpage_cancel(inode, page); | 229 | ceph_fscache_readpage_cancel(inode, page); |
230 | if (err == -EBLACKLISTED) | ||
231 | fsc->blacklisted = true; | ||
231 | goto out; | 232 | goto out; |
232 | } | 233 | } |
233 | if (err < PAGE_SIZE) | 234 | if (err < PAGE_SIZE) |
@@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req) | |||
266 | int i; | 267 | int i; |
267 | 268 | ||
268 | dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); | 269 | dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); |
270 | if (rc == -EBLACKLISTED) | ||
271 | ceph_inode_to_client(inode)->blacklisted = true; | ||
269 | 272 | ||
270 | /* unlock all pages, zeroing any data we didn't read */ | 273 | /* unlock all pages, zeroing any data we didn't read */ |
271 | osd_data = osd_req_op_extent_osd_data(req, 0); | 274 | osd_data = osd_req_op_extent_osd_data(req, 0); |
@@ -323,7 +326,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, | |||
323 | /* caller of readpages does not hold buffer and read caps | 326 | /* caller of readpages does not hold buffer and read caps |
324 | * (fadvise, madvise and readahead cases) */ | 327 | * (fadvise, madvise and readahead cases) */ |
325 | int want = CEPH_CAP_FILE_CACHE; | 328 | int want = CEPH_CAP_FILE_CACHE; |
326 | ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); | 329 | ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, |
330 | true, &got); | ||
327 | if (ret < 0) { | 331 | if (ret < 0) { |
328 | dout("start_read %p, error getting cap\n", inode); | 332 | dout("start_read %p, error getting cap\n", inode); |
329 | } else if (!(got & want)) { | 333 | } else if (!(got & want)) { |
@@ -569,7 +573,7 @@ static u64 get_writepages_data_length(struct inode *inode, | |||
569 | /* | 573 | /* |
570 | * Write a single page, but leave the page locked. | 574 | * Write a single page, but leave the page locked. |
571 | * | 575 | * |
572 | * If we get a write error, set the page error bit, but still adjust the | 576 | * If we get a write error, mark the mapping for error, but still adjust the |
573 | * dirty page accounting (i.e., page is no longer dirty). | 577 | * dirty page accounting (i.e., page is no longer dirty). |
574 | */ | 578 | */ |
575 | static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | 579 | static int writepage_nounlock(struct page *page, struct writeback_control *wbc) |
@@ -640,9 +644,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) | |||
640 | end_page_writeback(page); | 644 | end_page_writeback(page); |
641 | return err; | 645 | return err; |
642 | } | 646 | } |
647 | if (err == -EBLACKLISTED) | ||
648 | fsc->blacklisted = true; | ||
643 | dout("writepage setting page/mapping error %d %p\n", | 649 | dout("writepage setting page/mapping error %d %p\n", |
644 | err, page); | 650 | err, page); |
645 | SetPageError(page); | ||
646 | mapping_set_error(&inode->i_data, err); | 651 | mapping_set_error(&inode->i_data, err); |
647 | wbc->pages_skipped++; | 652 | wbc->pages_skipped++; |
648 | } else { | 653 | } else { |
@@ -680,23 +685,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) | |||
680 | } | 685 | } |
681 | 686 | ||
682 | /* | 687 | /* |
683 | * lame release_pages helper. release_pages() isn't exported to | ||
684 | * modules. | ||
685 | */ | ||
686 | static void ceph_release_pages(struct page **pages, int num) | ||
687 | { | ||
688 | struct pagevec pvec; | ||
689 | int i; | ||
690 | |||
691 | pagevec_init(&pvec); | ||
692 | for (i = 0; i < num; i++) { | ||
693 | if (pagevec_add(&pvec, pages[i]) == 0) | ||
694 | pagevec_release(&pvec); | ||
695 | } | ||
696 | pagevec_release(&pvec); | ||
697 | } | ||
698 | |||
699 | /* | ||
700 | * async writeback completion handler. | 688 | * async writeback completion handler. |
701 | * | 689 | * |
702 | * If we get an error, set the mapping error bit, but not the individual | 690 | * If we get an error, set the mapping error bit, but not the individual |
@@ -720,6 +708,8 @@ static void writepages_finish(struct ceph_osd_request *req) | |||
720 | if (rc < 0) { | 708 | if (rc < 0) { |
721 | mapping_set_error(mapping, rc); | 709 | mapping_set_error(mapping, rc); |
722 | ceph_set_error_write(ci); | 710 | ceph_set_error_write(ci); |
711 | if (rc == -EBLACKLISTED) | ||
712 | fsc->blacklisted = true; | ||
723 | } else { | 713 | } else { |
724 | ceph_clear_error_write(ci); | 714 | ceph_clear_error_write(ci); |
725 | } | 715 | } |
@@ -769,7 +759,7 @@ static void writepages_finish(struct ceph_osd_request *req) | |||
769 | dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", | 759 | dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", |
770 | inode, osd_data->length, rc >= 0 ? num_pages : 0); | 760 | inode, osd_data->length, rc >= 0 ? num_pages : 0); |
771 | 761 | ||
772 | ceph_release_pages(osd_data->pages, num_pages); | 762 | release_pages(osd_data->pages, num_pages); |
773 | } | 763 | } |
774 | 764 | ||
775 | ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); | 765 | ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); |
@@ -1452,7 +1442,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) | |||
1452 | want = CEPH_CAP_FILE_CACHE; | 1442 | want = CEPH_CAP_FILE_CACHE; |
1453 | 1443 | ||
1454 | got = 0; | 1444 | got = 0; |
1455 | err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); | 1445 | err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, |
1446 | &got, &pinned_page); | ||
1456 | if (err < 0) | 1447 | if (err < 0) |
1457 | goto out_restore; | 1448 | goto out_restore; |
1458 | 1449 | ||
@@ -1540,6 +1531,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) | |||
1540 | if (!prealloc_cf) | 1531 | if (!prealloc_cf) |
1541 | return VM_FAULT_OOM; | 1532 | return VM_FAULT_OOM; |
1542 | 1533 | ||
1534 | sb_start_pagefault(inode->i_sb); | ||
1543 | ceph_block_sigs(&oldset); | 1535 | ceph_block_sigs(&oldset); |
1544 | 1536 | ||
1545 | if (ci->i_inline_version != CEPH_INLINE_NONE) { | 1537 | if (ci->i_inline_version != CEPH_INLINE_NONE) { |
@@ -1568,7 +1560,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) | |||
1568 | want = CEPH_CAP_FILE_BUFFER; | 1560 | want = CEPH_CAP_FILE_BUFFER; |
1569 | 1561 | ||
1570 | got = 0; | 1562 | got = 0; |
1571 | err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, | 1563 | err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, |
1572 | &got, NULL); | 1564 | &got, NULL); |
1573 | if (err < 0) | 1565 | if (err < 0) |
1574 | goto out_free; | 1566 | goto out_free; |
@@ -1614,6 +1606,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) | |||
1614 | ceph_put_cap_refs(ci, got); | 1606 | ceph_put_cap_refs(ci, got); |
1615 | out_free: | 1607 | out_free: |
1616 | ceph_restore_sigs(&oldset); | 1608 | ceph_restore_sigs(&oldset); |
1609 | sb_end_pagefault(inode->i_sb); | ||
1617 | ceph_free_cap_flush(prealloc_cf); | 1610 | ceph_free_cap_flush(prealloc_cf); |
1618 | if (err < 0) | 1611 | if (err < 0) |
1619 | ret = vmf_error(err); | 1612 | ret = vmf_error(err); |
@@ -1946,12 +1939,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, | |||
1946 | 1939 | ||
1947 | if (err >= 0 || err == -ENOENT) | 1940 | if (err >= 0 || err == -ENOENT) |
1948 | have |= POOL_READ; | 1941 | have |= POOL_READ; |
1949 | else if (err != -EPERM) | 1942 | else if (err != -EPERM) { |
1943 | if (err == -EBLACKLISTED) | ||
1944 | fsc->blacklisted = true; | ||
1950 | goto out_unlock; | 1945 | goto out_unlock; |
1946 | } | ||
1951 | 1947 | ||
1952 | if (err2 == 0 || err2 == -EEXIST) | 1948 | if (err2 == 0 || err2 == -EEXIST) |
1953 | have |= POOL_WRITE; | 1949 | have |= POOL_WRITE; |
1954 | else if (err2 != -EPERM) { | 1950 | else if (err2 != -EPERM) { |
1951 | if (err2 == -EBLACKLISTED) | ||
1952 | fsc->blacklisted = true; | ||
1955 | err = err2; | 1953 | err = err2; |
1956 | goto out_unlock; | 1954 | goto out_unlock; |
1957 | } | 1955 | } |
@@ -1989,10 +1987,11 @@ out: | |||
1989 | return err; | 1987 | return err; |
1990 | } | 1988 | } |
1991 | 1989 | ||
1992 | int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) | 1990 | int ceph_pool_perm_check(struct inode *inode, int need) |
1993 | { | 1991 | { |
1994 | s64 pool; | 1992 | struct ceph_inode_info *ci = ceph_inode(inode); |
1995 | struct ceph_string *pool_ns; | 1993 | struct ceph_string *pool_ns; |
1994 | s64 pool; | ||
1996 | int ret, flags; | 1995 | int ret, flags; |
1997 | 1996 | ||
1998 | if (ci->i_vino.snap != CEPH_NOSNAP) { | 1997 | if (ci->i_vino.snap != CEPH_NOSNAP) { |
@@ -2004,7 +2003,7 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) | |||
2004 | return 0; | 2003 | return 0; |
2005 | } | 2004 | } |
2006 | 2005 | ||
2007 | if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), | 2006 | if (ceph_test_mount_opt(ceph_inode_to_client(inode), |
2008 | NOPOOLPERM)) | 2007 | NOPOOLPERM)) |
2009 | return 0; | 2008 | return 0; |
2010 | 2009 | ||
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index bc90cf6ad7ed..b2ec29eeb4c4 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c | |||
@@ -6,6 +6,8 @@ | |||
6 | * Written by Milosz Tanski (milosz@adfin.com) | 6 | * Written by Milosz Tanski (milosz@adfin.com) |
7 | */ | 7 | */ |
8 | 8 | ||
9 | #include <linux/ceph/ceph_debug.h> | ||
10 | |||
9 | #include "super.h" | 11 | #include "super.h" |
10 | #include "cache.h" | 12 | #include "cache.h" |
11 | 13 | ||
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ce0f5658720a..d3b9c9d5c1bd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c | |||
@@ -458,37 +458,6 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) | |||
458 | } | 458 | } |
459 | 459 | ||
460 | /* | 460 | /* |
461 | * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. | ||
462 | */ | ||
463 | static int __ceph_get_cap_mds(struct ceph_inode_info *ci) | ||
464 | { | ||
465 | struct ceph_cap *cap; | ||
466 | int mds = -1; | ||
467 | struct rb_node *p; | ||
468 | |||
469 | /* prefer mds with WR|BUFFER|EXCL caps */ | ||
470 | for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { | ||
471 | cap = rb_entry(p, struct ceph_cap, ci_node); | ||
472 | mds = cap->mds; | ||
473 | if (cap->issued & (CEPH_CAP_FILE_WR | | ||
474 | CEPH_CAP_FILE_BUFFER | | ||
475 | CEPH_CAP_FILE_EXCL)) | ||
476 | break; | ||
477 | } | ||
478 | return mds; | ||
479 | } | ||
480 | |||
481 | int ceph_get_cap_mds(struct inode *inode) | ||
482 | { | ||
483 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
484 | int mds; | ||
485 | spin_lock(&ci->i_ceph_lock); | ||
486 | mds = __ceph_get_cap_mds(ceph_inode(inode)); | ||
487 | spin_unlock(&ci->i_ceph_lock); | ||
488 | return mds; | ||
489 | } | ||
490 | |||
491 | /* | ||
492 | * Called under i_ceph_lock. | 461 | * Called under i_ceph_lock. |
493 | */ | 462 | */ |
494 | static void __insert_cap_node(struct ceph_inode_info *ci, | 463 | static void __insert_cap_node(struct ceph_inode_info *ci, |
@@ -628,7 +597,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, | |||
628 | /* | 597 | /* |
629 | * Add a capability under the given MDS session. | 598 | * Add a capability under the given MDS session. |
630 | * | 599 | * |
631 | * Caller should hold session snap_rwsem (read) and s_mutex. | 600 | * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock |
632 | * | 601 | * |
633 | * @fmode is the open file mode, if we are opening a file, otherwise | 602 | * @fmode is the open file mode, if we are opening a file, otherwise |
634 | * it is < 0. (This is so we can atomically add the cap and add an | 603 | * it is < 0. (This is so we can atomically add the cap and add an |
@@ -645,6 +614,9 @@ void ceph_add_cap(struct inode *inode, | |||
645 | struct ceph_cap *cap; | 614 | struct ceph_cap *cap; |
646 | int mds = session->s_mds; | 615 | int mds = session->s_mds; |
647 | int actual_wanted; | 616 | int actual_wanted; |
617 | u32 gen; | ||
618 | |||
619 | lockdep_assert_held(&ci->i_ceph_lock); | ||
648 | 620 | ||
649 | dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, | 621 | dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, |
650 | session->s_mds, cap_id, ceph_cap_string(issued), seq); | 622 | session->s_mds, cap_id, ceph_cap_string(issued), seq); |
@@ -656,6 +628,10 @@ void ceph_add_cap(struct inode *inode, | |||
656 | if (fmode >= 0) | 628 | if (fmode >= 0) |
657 | wanted |= ceph_caps_for_mode(fmode); | 629 | wanted |= ceph_caps_for_mode(fmode); |
658 | 630 | ||
631 | spin_lock(&session->s_gen_ttl_lock); | ||
632 | gen = session->s_cap_gen; | ||
633 | spin_unlock(&session->s_gen_ttl_lock); | ||
634 | |||
659 | cap = __get_cap_for_mds(ci, mds); | 635 | cap = __get_cap_for_mds(ci, mds); |
660 | if (!cap) { | 636 | if (!cap) { |
661 | cap = *new_cap; | 637 | cap = *new_cap; |
@@ -681,7 +657,7 @@ void ceph_add_cap(struct inode *inode, | |||
681 | list_move_tail(&cap->session_caps, &session->s_caps); | 657 | list_move_tail(&cap->session_caps, &session->s_caps); |
682 | spin_unlock(&session->s_cap_lock); | 658 | spin_unlock(&session->s_cap_lock); |
683 | 659 | ||
684 | if (cap->cap_gen < session->s_cap_gen) | 660 | if (cap->cap_gen < gen) |
685 | cap->issued = cap->implemented = CEPH_CAP_PIN; | 661 | cap->issued = cap->implemented = CEPH_CAP_PIN; |
686 | 662 | ||
687 | /* | 663 | /* |
@@ -775,7 +751,7 @@ void ceph_add_cap(struct inode *inode, | |||
775 | cap->seq = seq; | 751 | cap->seq = seq; |
776 | cap->issue_seq = seq; | 752 | cap->issue_seq = seq; |
777 | cap->mseq = mseq; | 753 | cap->mseq = mseq; |
778 | cap->cap_gen = session->s_cap_gen; | 754 | cap->cap_gen = gen; |
779 | 755 | ||
780 | if (fmode >= 0) | 756 | if (fmode >= 0) |
781 | __ceph_get_fmode(ci, fmode); | 757 | __ceph_get_fmode(ci, fmode); |
@@ -1284,10 +1260,6 @@ void __ceph_remove_caps(struct ceph_inode_info *ci) | |||
1284 | * Make note of max_size reported/requested from mds, revoked caps | 1260 | * Make note of max_size reported/requested from mds, revoked caps |
1285 | * that have now been implemented. | 1261 | * that have now been implemented. |
1286 | * | 1262 | * |
1287 | * Make half-hearted attempt ot to invalidate page cache if we are | ||
1288 | * dropping RDCACHE. Note that this will leave behind locked pages | ||
1289 | * that we'll then need to deal with elsewhere. | ||
1290 | * | ||
1291 | * Return non-zero if delayed release, or we experienced an error | 1263 | * Return non-zero if delayed release, or we experienced an error |
1292 | * such that the caller should requeue + retry later. | 1264 | * such that the caller should requeue + retry later. |
1293 | * | 1265 | * |
@@ -1746,11 +1718,11 @@ static bool __finish_cap_flush(struct ceph_mds_client *mdsc, | |||
1746 | * Add dirty inode to the flushing list. Assigned a seq number so we | 1718 | * Add dirty inode to the flushing list. Assigned a seq number so we |
1747 | * can wait for caps to flush without starving. | 1719 | * can wait for caps to flush without starving. |
1748 | * | 1720 | * |
1749 | * Called under i_ceph_lock. | 1721 | * Called under i_ceph_lock. Returns the flush tid. |
1750 | */ | 1722 | */ |
1751 | static int __mark_caps_flushing(struct inode *inode, | 1723 | static u64 __mark_caps_flushing(struct inode *inode, |
1752 | struct ceph_mds_session *session, bool wake, | 1724 | struct ceph_mds_session *session, bool wake, |
1753 | u64 *flush_tid, u64 *oldest_flush_tid) | 1725 | u64 *oldest_flush_tid) |
1754 | { | 1726 | { |
1755 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 1727 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
1756 | struct ceph_inode_info *ci = ceph_inode(inode); | 1728 | struct ceph_inode_info *ci = ceph_inode(inode); |
@@ -1789,8 +1761,7 @@ static int __mark_caps_flushing(struct inode *inode, | |||
1789 | 1761 | ||
1790 | list_add_tail(&cf->i_list, &ci->i_cap_flush_list); | 1762 | list_add_tail(&cf->i_list, &ci->i_cap_flush_list); |
1791 | 1763 | ||
1792 | *flush_tid = cf->tid; | 1764 | return cf->tid; |
1793 | return flushing; | ||
1794 | } | 1765 | } |
1795 | 1766 | ||
1796 | /* | 1767 | /* |
@@ -2028,11 +1999,6 @@ retry_locked: | |||
2028 | } | 1999 | } |
2029 | 2000 | ||
2030 | ack: | 2001 | ack: |
2031 | if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { | ||
2032 | dout(" skipping %p I_NOFLUSH set\n", inode); | ||
2033 | continue; | ||
2034 | } | ||
2035 | |||
2036 | if (session && session != cap->session) { | 2002 | if (session && session != cap->session) { |
2037 | dout("oops, wrong session %p mutex\n", session); | 2003 | dout("oops, wrong session %p mutex\n", session); |
2038 | mutex_unlock(&session->s_mutex); | 2004 | mutex_unlock(&session->s_mutex); |
@@ -2080,9 +2046,9 @@ ack: | |||
2080 | } | 2046 | } |
2081 | 2047 | ||
2082 | if (cap == ci->i_auth_cap && ci->i_dirty_caps) { | 2048 | if (cap == ci->i_auth_cap && ci->i_dirty_caps) { |
2083 | flushing = __mark_caps_flushing(inode, session, false, | 2049 | flushing = ci->i_dirty_caps; |
2084 | &flush_tid, | 2050 | flush_tid = __mark_caps_flushing(inode, session, false, |
2085 | &oldest_flush_tid); | 2051 | &oldest_flush_tid); |
2086 | } else { | 2052 | } else { |
2087 | flushing = 0; | 2053 | flushing = 0; |
2088 | flush_tid = 0; | 2054 | flush_tid = 0; |
@@ -2130,16 +2096,11 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) | |||
2130 | retry: | 2096 | retry: |
2131 | spin_lock(&ci->i_ceph_lock); | 2097 | spin_lock(&ci->i_ceph_lock); |
2132 | retry_locked: | 2098 | retry_locked: |
2133 | if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { | ||
2134 | spin_unlock(&ci->i_ceph_lock); | ||
2135 | dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); | ||
2136 | goto out; | ||
2137 | } | ||
2138 | if (ci->i_dirty_caps && ci->i_auth_cap) { | 2099 | if (ci->i_dirty_caps && ci->i_auth_cap) { |
2139 | struct ceph_cap *cap = ci->i_auth_cap; | 2100 | struct ceph_cap *cap = ci->i_auth_cap; |
2140 | int delayed; | 2101 | int delayed; |
2141 | 2102 | ||
2142 | if (!session || session != cap->session) { | 2103 | if (session != cap->session) { |
2143 | spin_unlock(&ci->i_ceph_lock); | 2104 | spin_unlock(&ci->i_ceph_lock); |
2144 | if (session) | 2105 | if (session) |
2145 | mutex_unlock(&session->s_mutex); | 2106 | mutex_unlock(&session->s_mutex); |
@@ -2161,8 +2122,9 @@ retry_locked: | |||
2161 | goto retry_locked; | 2122 | goto retry_locked; |
2162 | } | 2123 | } |
2163 | 2124 | ||
2164 | flushing = __mark_caps_flushing(inode, session, true, | 2125 | flushing = ci->i_dirty_caps; |
2165 | &flush_tid, &oldest_flush_tid); | 2126 | flush_tid = __mark_caps_flushing(inode, session, true, |
2127 | &oldest_flush_tid); | ||
2166 | 2128 | ||
2167 | /* __send_cap drops i_ceph_lock */ | 2129 | /* __send_cap drops i_ceph_lock */ |
2168 | delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, | 2130 | delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, |
@@ -2261,35 +2223,45 @@ static int unsafe_request_wait(struct inode *inode) | |||
2261 | 2223 | ||
2262 | int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) | 2224 | int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) |
2263 | { | 2225 | { |
2226 | struct ceph_file_info *fi = file->private_data; | ||
2264 | struct inode *inode = file->f_mapping->host; | 2227 | struct inode *inode = file->f_mapping->host; |
2265 | struct ceph_inode_info *ci = ceph_inode(inode); | 2228 | struct ceph_inode_info *ci = ceph_inode(inode); |
2266 | u64 flush_tid; | 2229 | u64 flush_tid; |
2267 | int ret; | 2230 | int ret, err; |
2268 | int dirty; | 2231 | int dirty; |
2269 | 2232 | ||
2270 | dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); | 2233 | dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); |
2271 | 2234 | ||
2272 | ret = file_write_and_wait_range(file, start, end); | 2235 | ret = file_write_and_wait_range(file, start, end); |
2273 | if (ret < 0) | ||
2274 | goto out; | ||
2275 | |||
2276 | if (datasync) | 2236 | if (datasync) |
2277 | goto out; | 2237 | goto out; |
2278 | 2238 | ||
2279 | dirty = try_flush_caps(inode, &flush_tid); | 2239 | dirty = try_flush_caps(inode, &flush_tid); |
2280 | dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); | 2240 | dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); |
2281 | 2241 | ||
2282 | ret = unsafe_request_wait(inode); | 2242 | err = unsafe_request_wait(inode); |
2283 | 2243 | ||
2284 | /* | 2244 | /* |
2285 | * only wait on non-file metadata writeback (the mds | 2245 | * only wait on non-file metadata writeback (the mds |
2286 | * can recover size and mtime, so we don't need to | 2246 | * can recover size and mtime, so we don't need to |
2287 | * wait for that) | 2247 | * wait for that) |
2288 | */ | 2248 | */ |
2289 | if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { | 2249 | if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { |
2290 | ret = wait_event_interruptible(ci->i_cap_wq, | 2250 | err = wait_event_interruptible(ci->i_cap_wq, |
2291 | caps_are_flushed(inode, flush_tid)); | 2251 | caps_are_flushed(inode, flush_tid)); |
2292 | } | 2252 | } |
2253 | |||
2254 | if (err < 0) | ||
2255 | ret = err; | ||
2256 | |||
2257 | if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) { | ||
2258 | spin_lock(&file->f_lock); | ||
2259 | err = errseq_check_and_advance(&ci->i_meta_err, | ||
2260 | &fi->meta_err); | ||
2261 | spin_unlock(&file->f_lock); | ||
2262 | if (err < 0) | ||
2263 | ret = err; | ||
2264 | } | ||
2293 | out: | 2265 | out: |
2294 | dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); | 2266 | dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); |
2295 | return ret; | 2267 | return ret; |
@@ -2560,10 +2532,15 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, | |||
2560 | * | 2532 | * |
2561 | * FIXME: how does a 0 return differ from -EAGAIN? | 2533 | * FIXME: how does a 0 return differ from -EAGAIN? |
2562 | */ | 2534 | */ |
2563 | static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, | 2535 | enum { |
2564 | loff_t endoff, bool nonblock, int *got) | 2536 | NON_BLOCKING = 1, |
2537 | CHECK_FILELOCK = 2, | ||
2538 | }; | ||
2539 | |||
2540 | static int try_get_cap_refs(struct inode *inode, int need, int want, | ||
2541 | loff_t endoff, int flags, int *got) | ||
2565 | { | 2542 | { |
2566 | struct inode *inode = &ci->vfs_inode; | 2543 | struct ceph_inode_info *ci = ceph_inode(inode); |
2567 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; | 2544 | struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; |
2568 | int ret = 0; | 2545 | int ret = 0; |
2569 | int have, implemented; | 2546 | int have, implemented; |
@@ -2576,6 +2553,13 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, | |||
2576 | again: | 2553 | again: |
2577 | spin_lock(&ci->i_ceph_lock); | 2554 | spin_lock(&ci->i_ceph_lock); |
2578 | 2555 | ||
2556 | if ((flags & CHECK_FILELOCK) && | ||
2557 | (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { | ||
2558 | dout("try_get_cap_refs %p error filelock\n", inode); | ||
2559 | ret = -EIO; | ||
2560 | goto out_unlock; | ||
2561 | } | ||
2562 | |||
2579 | /* make sure file is actually open */ | 2563 | /* make sure file is actually open */ |
2580 | file_wanted = __ceph_caps_file_wanted(ci); | 2564 | file_wanted = __ceph_caps_file_wanted(ci); |
2581 | if ((file_wanted & need) != need) { | 2565 | if ((file_wanted & need) != need) { |
@@ -2637,7 +2621,7 @@ again: | |||
2637 | * we can not call down_read() when | 2621 | * we can not call down_read() when |
2638 | * task isn't in TASK_RUNNING state | 2622 | * task isn't in TASK_RUNNING state |
2639 | */ | 2623 | */ |
2640 | if (nonblock) { | 2624 | if (flags & NON_BLOCKING) { |
2641 | ret = -EAGAIN; | 2625 | ret = -EAGAIN; |
2642 | goto out_unlock; | 2626 | goto out_unlock; |
2643 | } | 2627 | } |
@@ -2731,18 +2715,19 @@ static void check_max_size(struct inode *inode, loff_t endoff) | |||
2731 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); | 2715 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
2732 | } | 2716 | } |
2733 | 2717 | ||
2734 | int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, | 2718 | int ceph_try_get_caps(struct inode *inode, int need, int want, |
2735 | bool nonblock, int *got) | 2719 | bool nonblock, int *got) |
2736 | { | 2720 | { |
2737 | int ret; | 2721 | int ret; |
2738 | 2722 | ||
2739 | BUG_ON(need & ~CEPH_CAP_FILE_RD); | 2723 | BUG_ON(need & ~CEPH_CAP_FILE_RD); |
2740 | BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); | 2724 | BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); |
2741 | ret = ceph_pool_perm_check(ci, need); | 2725 | ret = ceph_pool_perm_check(inode, need); |
2742 | if (ret < 0) | 2726 | if (ret < 0) |
2743 | return ret; | 2727 | return ret; |
2744 | 2728 | ||
2745 | ret = try_get_cap_refs(ci, need, want, 0, nonblock, got); | 2729 | ret = try_get_cap_refs(inode, need, want, 0, |
2730 | (nonblock ? NON_BLOCKING : 0), got); | ||
2746 | return ret == -EAGAIN ? 0 : ret; | 2731 | return ret == -EAGAIN ? 0 : ret; |
2747 | } | 2732 | } |
2748 | 2733 | ||
@@ -2751,30 +2736,40 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, | |||
2751 | * due to a small max_size, make sure we check_max_size (and possibly | 2736 | * due to a small max_size, make sure we check_max_size (and possibly |
2752 | * ask the mds) so we don't get hung up indefinitely. | 2737 | * ask the mds) so we don't get hung up indefinitely. |
2753 | */ | 2738 | */ |
2754 | int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, | 2739 | int ceph_get_caps(struct file *filp, int need, int want, |
2755 | loff_t endoff, int *got, struct page **pinned_page) | 2740 | loff_t endoff, int *got, struct page **pinned_page) |
2756 | { | 2741 | { |
2757 | int _got, ret; | 2742 | struct ceph_file_info *fi = filp->private_data; |
2743 | struct inode *inode = file_inode(filp); | ||
2744 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
2745 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | ||
2746 | int ret, _got, flags; | ||
2758 | 2747 | ||
2759 | ret = ceph_pool_perm_check(ci, need); | 2748 | ret = ceph_pool_perm_check(inode, need); |
2760 | if (ret < 0) | 2749 | if (ret < 0) |
2761 | return ret; | 2750 | return ret; |
2762 | 2751 | ||
2752 | if ((fi->fmode & CEPH_FILE_MODE_WR) && | ||
2753 | fi->filp_gen != READ_ONCE(fsc->filp_gen)) | ||
2754 | return -EBADF; | ||
2755 | |||
2763 | while (true) { | 2756 | while (true) { |
2764 | if (endoff > 0) | 2757 | if (endoff > 0) |
2765 | check_max_size(&ci->vfs_inode, endoff); | 2758 | check_max_size(inode, endoff); |
2766 | 2759 | ||
2760 | flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0; | ||
2767 | _got = 0; | 2761 | _got = 0; |
2768 | ret = try_get_cap_refs(ci, need, want, endoff, | 2762 | ret = try_get_cap_refs(inode, need, want, endoff, |
2769 | false, &_got); | 2763 | flags, &_got); |
2770 | if (ret == -EAGAIN) | 2764 | if (ret == -EAGAIN) |
2771 | continue; | 2765 | continue; |
2772 | if (!ret) { | 2766 | if (!ret) { |
2773 | DEFINE_WAIT_FUNC(wait, woken_wake_function); | 2767 | DEFINE_WAIT_FUNC(wait, woken_wake_function); |
2774 | add_wait_queue(&ci->i_cap_wq, &wait); | 2768 | add_wait_queue(&ci->i_cap_wq, &wait); |
2775 | 2769 | ||
2776 | while (!(ret = try_get_cap_refs(ci, need, want, endoff, | 2770 | flags |= NON_BLOCKING; |
2777 | true, &_got))) { | 2771 | while (!(ret = try_get_cap_refs(inode, need, want, |
2772 | endoff, flags, &_got))) { | ||
2778 | if (signal_pending(current)) { | 2773 | if (signal_pending(current)) { |
2779 | ret = -ERESTARTSYS; | 2774 | ret = -ERESTARTSYS; |
2780 | break; | 2775 | break; |
@@ -2786,10 +2781,18 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, | |||
2786 | if (ret == -EAGAIN) | 2781 | if (ret == -EAGAIN) |
2787 | continue; | 2782 | continue; |
2788 | } | 2783 | } |
2784 | |||
2785 | if ((fi->fmode & CEPH_FILE_MODE_WR) && | ||
2786 | fi->filp_gen != READ_ONCE(fsc->filp_gen)) { | ||
2787 | if (ret >= 0 && _got) | ||
2788 | ceph_put_cap_refs(ci, _got); | ||
2789 | return -EBADF; | ||
2790 | } | ||
2791 | |||
2789 | if (ret < 0) { | 2792 | if (ret < 0) { |
2790 | if (ret == -ESTALE) { | 2793 | if (ret == -ESTALE) { |
2791 | /* session was killed, try renew caps */ | 2794 | /* session was killed, try renew caps */ |
2792 | ret = ceph_renew_caps(&ci->vfs_inode); | 2795 | ret = ceph_renew_caps(inode); |
2793 | if (ret == 0) | 2796 | if (ret == 0) |
2794 | continue; | 2797 | continue; |
2795 | } | 2798 | } |
@@ -2798,9 +2801,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, | |||
2798 | 2801 | ||
2799 | if (ci->i_inline_version != CEPH_INLINE_NONE && | 2802 | if (ci->i_inline_version != CEPH_INLINE_NONE && |
2800 | (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && | 2803 | (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && |
2801 | i_size_read(&ci->vfs_inode) > 0) { | 2804 | i_size_read(inode) > 0) { |
2802 | struct page *page = | 2805 | struct page *page = |
2803 | find_get_page(ci->vfs_inode.i_mapping, 0); | 2806 | find_get_page(inode->i_mapping, 0); |
2804 | if (page) { | 2807 | if (page) { |
2805 | if (PageUptodate(page)) { | 2808 | if (PageUptodate(page)) { |
2806 | *pinned_page = page; | 2809 | *pinned_page = page; |
@@ -2819,7 +2822,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, | |||
2819 | * getattr request will bring inline data into | 2822 | * getattr request will bring inline data into |
2820 | * page cache | 2823 | * page cache |
2821 | */ | 2824 | */ |
2822 | ret = __ceph_do_getattr(&ci->vfs_inode, NULL, | 2825 | ret = __ceph_do_getattr(inode, NULL, |
2823 | CEPH_STAT_CAP_INLINE_DATA, | 2826 | CEPH_STAT_CAP_INLINE_DATA, |
2824 | true); | 2827 | true); |
2825 | if (ret < 0) | 2828 | if (ret < 0) |
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 2eb88ed22993..facb387c2735 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c | |||
@@ -294,7 +294,6 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) | |||
294 | 294 | ||
295 | void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) | 295 | void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) |
296 | { | 296 | { |
297 | return 0; | ||
298 | } | 297 | } |
299 | 298 | ||
300 | void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) | 299 | void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) |
diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 15ff1b09cfa2..b6bfa94332c3 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c | |||
@@ -35,7 +35,7 @@ struct ceph_nfs_snapfh { | |||
35 | static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, | 35 | static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, |
36 | struct inode *parent_inode) | 36 | struct inode *parent_inode) |
37 | { | 37 | { |
38 | const static int snap_handle_length = | 38 | static const int snap_handle_length = |
39 | sizeof(struct ceph_nfs_snapfh) >> 2; | 39 | sizeof(struct ceph_nfs_snapfh) >> 2; |
40 | struct ceph_nfs_snapfh *sfh = (void *)rawfh; | 40 | struct ceph_nfs_snapfh *sfh = (void *)rawfh; |
41 | u64 snapid = ceph_snap(inode); | 41 | u64 snapid = ceph_snap(inode); |
@@ -85,9 +85,9 @@ out: | |||
85 | static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, | 85 | static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, |
86 | struct inode *parent_inode) | 86 | struct inode *parent_inode) |
87 | { | 87 | { |
88 | const static int handle_length = | 88 | static const int handle_length = |
89 | sizeof(struct ceph_nfs_fh) >> 2; | 89 | sizeof(struct ceph_nfs_fh) >> 2; |
90 | const static int connected_handle_length = | 90 | static const int connected_handle_length = |
91 | sizeof(struct ceph_nfs_confh) >> 2; | 91 | sizeof(struct ceph_nfs_confh) >> 2; |
92 | int type; | 92 | int type; |
93 | 93 | ||
@@ -458,33 +458,33 @@ static int __get_snap_name(struct dentry *parent, char *name, | |||
458 | if (err < 0) | 458 | if (err < 0) |
459 | goto out; | 459 | goto out; |
460 | 460 | ||
461 | rinfo = &req->r_reply_info; | 461 | rinfo = &req->r_reply_info; |
462 | for (i = 0; i < rinfo->dir_nr; i++) { | 462 | for (i = 0; i < rinfo->dir_nr; i++) { |
463 | rde = rinfo->dir_entries + i; | 463 | rde = rinfo->dir_entries + i; |
464 | BUG_ON(!rde->inode.in); | 464 | BUG_ON(!rde->inode.in); |
465 | if (ceph_snap(inode) == | 465 | if (ceph_snap(inode) == |
466 | le64_to_cpu(rde->inode.in->snapid)) { | 466 | le64_to_cpu(rde->inode.in->snapid)) { |
467 | memcpy(name, rde->name, rde->name_len); | 467 | memcpy(name, rde->name, rde->name_len); |
468 | name[rde->name_len] = '\0'; | 468 | name[rde->name_len] = '\0'; |
469 | err = 0; | 469 | err = 0; |
470 | goto out; | 470 | goto out; |
471 | } | 471 | } |
472 | } | 472 | } |
473 | 473 | ||
474 | if (rinfo->dir_end) | 474 | if (rinfo->dir_end) |
475 | break; | 475 | break; |
476 | 476 | ||
477 | BUG_ON(rinfo->dir_nr <= 0); | 477 | BUG_ON(rinfo->dir_nr <= 0); |
478 | rde = rinfo->dir_entries + (rinfo->dir_nr - 1); | 478 | rde = rinfo->dir_entries + (rinfo->dir_nr - 1); |
479 | next_offset += rinfo->dir_nr; | 479 | next_offset += rinfo->dir_nr; |
480 | last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); | 480 | last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); |
481 | if (!last_name) { | 481 | if (!last_name) { |
482 | err = -ENOMEM; | 482 | err = -ENOMEM; |
483 | goto out; | 483 | goto out; |
484 | } | 484 | } |
485 | 485 | ||
486 | ceph_mdsc_put_request(req); | 486 | ceph_mdsc_put_request(req); |
487 | req = NULL; | 487 | req = NULL; |
488 | } | 488 | } |
489 | err = -ENOENT; | 489 | err = -ENOENT; |
490 | out: | 490 | out: |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 685a03cc4b77..d277f71abe0b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -15,6 +15,7 @@ | |||
15 | #include "super.h" | 15 | #include "super.h" |
16 | #include "mds_client.h" | 16 | #include "mds_client.h" |
17 | #include "cache.h" | 17 | #include "cache.h" |
18 | #include "io.h" | ||
18 | 19 | ||
19 | static __le32 ceph_flags_sys2wire(u32 flags) | 20 | static __le32 ceph_flags_sys2wire(u32 flags) |
20 | { | 21 | { |
@@ -201,6 +202,7 @@ out: | |||
201 | static int ceph_init_file_info(struct inode *inode, struct file *file, | 202 | static int ceph_init_file_info(struct inode *inode, struct file *file, |
202 | int fmode, bool isdir) | 203 | int fmode, bool isdir) |
203 | { | 204 | { |
205 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
204 | struct ceph_file_info *fi; | 206 | struct ceph_file_info *fi; |
205 | 207 | ||
206 | dout("%s %p %p 0%o (%s)\n", __func__, inode, file, | 208 | dout("%s %p %p 0%o (%s)\n", __func__, inode, file, |
@@ -211,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, | |||
211 | struct ceph_dir_file_info *dfi = | 213 | struct ceph_dir_file_info *dfi = |
212 | kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); | 214 | kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); |
213 | if (!dfi) { | 215 | if (!dfi) { |
214 | ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ | 216 | ceph_put_fmode(ci, fmode); /* clean up */ |
215 | return -ENOMEM; | 217 | return -ENOMEM; |
216 | } | 218 | } |
217 | 219 | ||
@@ -222,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, | |||
222 | } else { | 224 | } else { |
223 | fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); | 225 | fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); |
224 | if (!fi) { | 226 | if (!fi) { |
225 | ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ | 227 | ceph_put_fmode(ci, fmode); /* clean up */ |
226 | return -ENOMEM; | 228 | return -ENOMEM; |
227 | } | 229 | } |
228 | 230 | ||
@@ -232,6 +234,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, | |||
232 | fi->fmode = fmode; | 234 | fi->fmode = fmode; |
233 | spin_lock_init(&fi->rw_contexts_lock); | 235 | spin_lock_init(&fi->rw_contexts_lock); |
234 | INIT_LIST_HEAD(&fi->rw_contexts); | 236 | INIT_LIST_HEAD(&fi->rw_contexts); |
237 | fi->meta_err = errseq_sample(&ci->i_meta_err); | ||
238 | fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); | ||
235 | 239 | ||
236 | return 0; | 240 | return 0; |
237 | } | 241 | } |
@@ -695,7 +699,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, | |||
695 | ceph_release_page_vector(pages, num_pages); | 699 | ceph_release_page_vector(pages, num_pages); |
696 | } | 700 | } |
697 | 701 | ||
698 | if (ret <= 0 || off >= i_size || !more) | 702 | if (ret < 0) { |
703 | if (ret == -EBLACKLISTED) | ||
704 | fsc->blacklisted = true; | ||
705 | break; | ||
706 | } | ||
707 | |||
708 | if (off >= i_size || !more) | ||
699 | break; | 709 | break; |
700 | } | 710 | } |
701 | 711 | ||
@@ -921,7 +931,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, | |||
921 | struct ceph_aio_request *aio_req = NULL; | 931 | struct ceph_aio_request *aio_req = NULL; |
922 | int num_pages = 0; | 932 | int num_pages = 0; |
923 | int flags; | 933 | int flags; |
924 | int ret; | 934 | int ret = 0; |
925 | struct timespec64 mtime = current_time(inode); | 935 | struct timespec64 mtime = current_time(inode); |
926 | size_t count = iov_iter_count(iter); | 936 | size_t count = iov_iter_count(iter); |
927 | loff_t pos = iocb->ki_pos; | 937 | loff_t pos = iocb->ki_pos; |
@@ -935,11 +945,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, | |||
935 | (write ? "write" : "read"), file, pos, (unsigned)count, | 945 | (write ? "write" : "read"), file, pos, (unsigned)count, |
936 | snapc, snapc ? snapc->seq : 0); | 946 | snapc, snapc ? snapc->seq : 0); |
937 | 947 | ||
938 | ret = filemap_write_and_wait_range(inode->i_mapping, | ||
939 | pos, pos + count - 1); | ||
940 | if (ret < 0) | ||
941 | return ret; | ||
942 | |||
943 | if (write) { | 948 | if (write) { |
944 | int ret2 = invalidate_inode_pages2_range(inode->i_mapping, | 949 | int ret2 = invalidate_inode_pages2_range(inode->i_mapping, |
945 | pos >> PAGE_SHIFT, | 950 | pos >> PAGE_SHIFT, |
@@ -1260,7 +1265,8 @@ again: | |||
1260 | want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; | 1265 | want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; |
1261 | else | 1266 | else |
1262 | want = CEPH_CAP_FILE_CACHE; | 1267 | want = CEPH_CAP_FILE_CACHE; |
1263 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); | 1268 | ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, |
1269 | &got, &pinned_page); | ||
1264 | if (ret < 0) | 1270 | if (ret < 0) |
1265 | return ret; | 1271 | return ret; |
1266 | 1272 | ||
@@ -1274,12 +1280,16 @@ again: | |||
1274 | 1280 | ||
1275 | if (ci->i_inline_version == CEPH_INLINE_NONE) { | 1281 | if (ci->i_inline_version == CEPH_INLINE_NONE) { |
1276 | if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { | 1282 | if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { |
1283 | ceph_start_io_direct(inode); | ||
1277 | ret = ceph_direct_read_write(iocb, to, | 1284 | ret = ceph_direct_read_write(iocb, to, |
1278 | NULL, NULL); | 1285 | NULL, NULL); |
1286 | ceph_end_io_direct(inode); | ||
1279 | if (ret >= 0 && ret < len) | 1287 | if (ret >= 0 && ret < len) |
1280 | retry_op = CHECK_EOF; | 1288 | retry_op = CHECK_EOF; |
1281 | } else { | 1289 | } else { |
1290 | ceph_start_io_read(inode); | ||
1282 | ret = ceph_sync_read(iocb, to, &retry_op); | 1291 | ret = ceph_sync_read(iocb, to, &retry_op); |
1292 | ceph_end_io_read(inode); | ||
1283 | } | 1293 | } |
1284 | } else { | 1294 | } else { |
1285 | retry_op = READ_INLINE; | 1295 | retry_op = READ_INLINE; |
@@ -1290,7 +1300,9 @@ again: | |||
1290 | inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, | 1300 | inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, |
1291 | ceph_cap_string(got)); | 1301 | ceph_cap_string(got)); |
1292 | ceph_add_rw_context(fi, &rw_ctx); | 1302 | ceph_add_rw_context(fi, &rw_ctx); |
1303 | ceph_start_io_read(inode); | ||
1293 | ret = generic_file_read_iter(iocb, to); | 1304 | ret = generic_file_read_iter(iocb, to); |
1305 | ceph_end_io_read(inode); | ||
1294 | ceph_del_rw_context(fi, &rw_ctx); | 1306 | ceph_del_rw_context(fi, &rw_ctx); |
1295 | } | 1307 | } |
1296 | dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", | 1308 | dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", |
@@ -1399,7 +1411,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) | |||
1399 | return -ENOMEM; | 1411 | return -ENOMEM; |
1400 | 1412 | ||
1401 | retry_snap: | 1413 | retry_snap: |
1402 | inode_lock(inode); | 1414 | if (iocb->ki_flags & IOCB_DIRECT) |
1415 | ceph_start_io_direct(inode); | ||
1416 | else | ||
1417 | ceph_start_io_write(inode); | ||
1403 | 1418 | ||
1404 | /* We can write back this queue in page reclaim */ | 1419 | /* We can write back this queue in page reclaim */ |
1405 | current->backing_dev_info = inode_to_bdi(inode); | 1420 | current->backing_dev_info = inode_to_bdi(inode); |
@@ -1457,7 +1472,7 @@ retry_snap: | |||
1457 | else | 1472 | else |
1458 | want = CEPH_CAP_FILE_BUFFER; | 1473 | want = CEPH_CAP_FILE_BUFFER; |
1459 | got = 0; | 1474 | got = 0; |
1460 | err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, | 1475 | err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, |
1461 | &got, NULL); | 1476 | &got, NULL); |
1462 | if (err < 0) | 1477 | if (err < 0) |
1463 | goto out; | 1478 | goto out; |
@@ -1470,7 +1485,6 @@ retry_snap: | |||
1470 | (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { | 1485 | (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { |
1471 | struct ceph_snap_context *snapc; | 1486 | struct ceph_snap_context *snapc; |
1472 | struct iov_iter data; | 1487 | struct iov_iter data; |
1473 | inode_unlock(inode); | ||
1474 | 1488 | ||
1475 | spin_lock(&ci->i_ceph_lock); | 1489 | spin_lock(&ci->i_ceph_lock); |
1476 | if (__ceph_have_pending_cap_snap(ci)) { | 1490 | if (__ceph_have_pending_cap_snap(ci)) { |
@@ -1487,11 +1501,14 @@ retry_snap: | |||
1487 | 1501 | ||
1488 | /* we might need to revert back to that point */ | 1502 | /* we might need to revert back to that point */ |
1489 | data = *from; | 1503 | data = *from; |
1490 | if (iocb->ki_flags & IOCB_DIRECT) | 1504 | if (iocb->ki_flags & IOCB_DIRECT) { |
1491 | written = ceph_direct_read_write(iocb, &data, snapc, | 1505 | written = ceph_direct_read_write(iocb, &data, snapc, |
1492 | &prealloc_cf); | 1506 | &prealloc_cf); |
1493 | else | 1507 | ceph_end_io_direct(inode); |
1508 | } else { | ||
1494 | written = ceph_sync_write(iocb, &data, pos, snapc); | 1509 | written = ceph_sync_write(iocb, &data, pos, snapc); |
1510 | ceph_end_io_write(inode); | ||
1511 | } | ||
1495 | if (written > 0) | 1512 | if (written > 0) |
1496 | iov_iter_advance(from, written); | 1513 | iov_iter_advance(from, written); |
1497 | ceph_put_snap_context(snapc); | 1514 | ceph_put_snap_context(snapc); |
@@ -1506,7 +1523,7 @@ retry_snap: | |||
1506 | written = generic_perform_write(file, from, pos); | 1523 | written = generic_perform_write(file, from, pos); |
1507 | if (likely(written >= 0)) | 1524 | if (likely(written >= 0)) |
1508 | iocb->ki_pos = pos + written; | 1525 | iocb->ki_pos = pos + written; |
1509 | inode_unlock(inode); | 1526 | ceph_end_io_write(inode); |
1510 | } | 1527 | } |
1511 | 1528 | ||
1512 | if (written >= 0) { | 1529 | if (written >= 0) { |
@@ -1541,9 +1558,11 @@ retry_snap: | |||
1541 | } | 1558 | } |
1542 | 1559 | ||
1543 | goto out_unlocked; | 1560 | goto out_unlocked; |
1544 | |||
1545 | out: | 1561 | out: |
1546 | inode_unlock(inode); | 1562 | if (iocb->ki_flags & IOCB_DIRECT) |
1563 | ceph_end_io_direct(inode); | ||
1564 | else | ||
1565 | ceph_end_io_write(inode); | ||
1547 | out_unlocked: | 1566 | out_unlocked: |
1548 | ceph_free_cap_flush(prealloc_cf); | 1567 | ceph_free_cap_flush(prealloc_cf); |
1549 | current->backing_dev_info = NULL; | 1568 | current->backing_dev_info = NULL; |
@@ -1781,7 +1800,7 @@ static long ceph_fallocate(struct file *file, int mode, | |||
1781 | else | 1800 | else |
1782 | want = CEPH_CAP_FILE_BUFFER; | 1801 | want = CEPH_CAP_FILE_BUFFER; |
1783 | 1802 | ||
1784 | ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); | 1803 | ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); |
1785 | if (ret < 0) | 1804 | if (ret < 0) |
1786 | goto unlock; | 1805 | goto unlock; |
1787 | 1806 | ||
@@ -1810,16 +1829,15 @@ unlock: | |||
1810 | * src_ci. Two attempts are made to obtain both caps, and an error is return if | 1829 | * src_ci. Two attempts are made to obtain both caps, and an error is return if |
1811 | * this fails; zero is returned on success. | 1830 | * this fails; zero is returned on success. |
1812 | */ | 1831 | */ |
1813 | static int get_rd_wr_caps(struct ceph_inode_info *src_ci, | 1832 | static int get_rd_wr_caps(struct file *src_filp, int *src_got, |
1814 | loff_t src_endoff, int *src_got, | 1833 | struct file *dst_filp, |
1815 | struct ceph_inode_info *dst_ci, | ||
1816 | loff_t dst_endoff, int *dst_got) | 1834 | loff_t dst_endoff, int *dst_got) |
1817 | { | 1835 | { |
1818 | int ret = 0; | 1836 | int ret = 0; |
1819 | bool retrying = false; | 1837 | bool retrying = false; |
1820 | 1838 | ||
1821 | retry_caps: | 1839 | retry_caps: |
1822 | ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, | 1840 | ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, |
1823 | dst_endoff, dst_got, NULL); | 1841 | dst_endoff, dst_got, NULL); |
1824 | if (ret < 0) | 1842 | if (ret < 0) |
1825 | return ret; | 1843 | return ret; |
@@ -1829,24 +1847,24 @@ retry_caps: | |||
1829 | * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some | 1847 | * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some |
1830 | * retry dance instead to try to get both capabilities. | 1848 | * retry dance instead to try to get both capabilities. |
1831 | */ | 1849 | */ |
1832 | ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, | 1850 | ret = ceph_try_get_caps(file_inode(src_filp), |
1851 | CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, | ||
1833 | false, src_got); | 1852 | false, src_got); |
1834 | if (ret <= 0) { | 1853 | if (ret <= 0) { |
1835 | /* Start by dropping dst_ci caps and getting src_ci caps */ | 1854 | /* Start by dropping dst_ci caps and getting src_ci caps */ |
1836 | ceph_put_cap_refs(dst_ci, *dst_got); | 1855 | ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); |
1837 | if (retrying) { | 1856 | if (retrying) { |
1838 | if (!ret) | 1857 | if (!ret) |
1839 | /* ceph_try_get_caps masks EAGAIN */ | 1858 | /* ceph_try_get_caps masks EAGAIN */ |
1840 | ret = -EAGAIN; | 1859 | ret = -EAGAIN; |
1841 | return ret; | 1860 | return ret; |
1842 | } | 1861 | } |
1843 | ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, | 1862 | ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, |
1844 | CEPH_CAP_FILE_SHARED, src_endoff, | 1863 | CEPH_CAP_FILE_SHARED, -1, src_got, NULL); |
1845 | src_got, NULL); | ||
1846 | if (ret < 0) | 1864 | if (ret < 0) |
1847 | return ret; | 1865 | return ret; |
1848 | /*... drop src_ci caps too, and retry */ | 1866 | /*... drop src_ci caps too, and retry */ |
1849 | ceph_put_cap_refs(src_ci, *src_got); | 1867 | ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); |
1850 | retrying = true; | 1868 | retrying = true; |
1851 | goto retry_caps; | 1869 | goto retry_caps; |
1852 | } | 1870 | } |
@@ -1904,6 +1922,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, | |||
1904 | struct ceph_inode_info *src_ci = ceph_inode(src_inode); | 1922 | struct ceph_inode_info *src_ci = ceph_inode(src_inode); |
1905 | struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); | 1923 | struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); |
1906 | struct ceph_cap_flush *prealloc_cf; | 1924 | struct ceph_cap_flush *prealloc_cf; |
1925 | struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); | ||
1907 | struct ceph_object_locator src_oloc, dst_oloc; | 1926 | struct ceph_object_locator src_oloc, dst_oloc; |
1908 | struct ceph_object_id src_oid, dst_oid; | 1927 | struct ceph_object_id src_oid, dst_oid; |
1909 | loff_t endoff = 0, size; | 1928 | loff_t endoff = 0, size; |
@@ -1913,10 +1932,16 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, | |||
1913 | int src_got = 0, dst_got = 0, err, dirty; | 1932 | int src_got = 0, dst_got = 0, err, dirty; |
1914 | bool do_final_copy = false; | 1933 | bool do_final_copy = false; |
1915 | 1934 | ||
1916 | if (src_inode == dst_inode) | 1935 | if (src_inode->i_sb != dst_inode->i_sb) { |
1917 | return -EINVAL; | 1936 | struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); |
1918 | if (src_inode->i_sb != dst_inode->i_sb) | 1937 | |
1919 | return -EXDEV; | 1938 | if (ceph_fsid_compare(&src_fsc->client->fsid, |
1939 | &dst_fsc->client->fsid)) { | ||
1940 | dout("Copying files across clusters: src: %pU dst: %pU\n", | ||
1941 | &src_fsc->client->fsid, &dst_fsc->client->fsid); | ||
1942 | return -EXDEV; | ||
1943 | } | ||
1944 | } | ||
1920 | if (ceph_snap(dst_inode) != CEPH_NOSNAP) | 1945 | if (ceph_snap(dst_inode) != CEPH_NOSNAP) |
1921 | return -EROFS; | 1946 | return -EROFS; |
1922 | 1947 | ||
@@ -1928,7 +1953,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, | |||
1928 | * efficient). | 1953 | * efficient). |
1929 | */ | 1954 | */ |
1930 | 1955 | ||
1931 | if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM)) | 1956 | if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) |
1932 | return -EOPNOTSUPP; | 1957 | return -EOPNOTSUPP; |
1933 | 1958 | ||
1934 | if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || | 1959 | if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || |
@@ -1960,8 +1985,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, | |||
1960 | * clients may have dirty data in their caches. And OSDs know nothing | 1985 | * clients may have dirty data in their caches. And OSDs know nothing |
1961 | * about caps, so they can't safely do the remote object copies. | 1986 | * about caps, so they can't safely do the remote object copies. |
1962 | */ | 1987 | */ |
1963 | err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, | 1988 | err = get_rd_wr_caps(src_file, &src_got, |
1964 | dst_ci, (dst_off + len), &dst_got); | 1989 | dst_file, (dst_off + len), &dst_got); |
1965 | if (err < 0) { | 1990 | if (err < 0) { |
1966 | dout("get_rd_wr_caps returned %d\n", err); | 1991 | dout("get_rd_wr_caps returned %d\n", err); |
1967 | ret = -EOPNOTSUPP; | 1992 | ret = -EOPNOTSUPP; |
@@ -2018,9 +2043,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, | |||
2018 | goto out; | 2043 | goto out; |
2019 | } | 2044 | } |
2020 | len -= ret; | 2045 | len -= ret; |
2021 | err = get_rd_wr_caps(src_ci, (src_off + len), | 2046 | err = get_rd_wr_caps(src_file, &src_got, |
2022 | &src_got, dst_ci, | 2047 | dst_file, (dst_off + len), &dst_got); |
2023 | (dst_off + len), &dst_got); | ||
2024 | if (err < 0) | 2048 | if (err < 0) |
2025 | goto out; | 2049 | goto out; |
2026 | err = is_file_size_ok(src_inode, dst_inode, | 2050 | err = is_file_size_ok(src_inode, dst_inode, |
@@ -2044,7 +2068,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, | |||
2044 | dst_ci->i_vino.ino, dst_objnum); | 2068 | dst_ci->i_vino.ino, dst_objnum); |
2045 | /* Do an object remote copy */ | 2069 | /* Do an object remote copy */ |
2046 | err = ceph_osdc_copy_from( | 2070 | err = ceph_osdc_copy_from( |
2047 | &ceph_inode_to_client(src_inode)->client->osdc, | 2071 | &src_fsc->client->osdc, |
2048 | src_ci->i_vino.snap, 0, | 2072 | src_ci->i_vino.snap, 0, |
2049 | &src_oid, &src_oloc, | 2073 | &src_oid, &src_oloc, |
2050 | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | | 2074 | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | |
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 18500edefc56..9f135624ae47 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c | |||
@@ -515,6 +515,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) | |||
515 | 515 | ||
516 | ceph_fscache_inode_init(ci); | 516 | ceph_fscache_inode_init(ci); |
517 | 517 | ||
518 | ci->i_meta_err = 0; | ||
519 | |||
518 | return &ci->vfs_inode; | 520 | return &ci->vfs_inode; |
519 | } | 521 | } |
520 | 522 | ||
@@ -801,7 +803,12 @@ static int fill_inode(struct inode *inode, struct page *locked_page, | |||
801 | 803 | ||
802 | /* update inode */ | 804 | /* update inode */ |
803 | inode->i_rdev = le32_to_cpu(info->rdev); | 805 | inode->i_rdev = le32_to_cpu(info->rdev); |
804 | inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; | 806 | /* directories have fl_stripe_unit set to zero */ |
807 | if (le32_to_cpu(info->layout.fl_stripe_unit)) | ||
808 | inode->i_blkbits = | ||
809 | fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; | ||
810 | else | ||
811 | inode->i_blkbits = CEPH_BLOCK_SHIFT; | ||
805 | 812 | ||
806 | __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); | 813 | __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); |
807 | 814 | ||
@@ -1982,7 +1989,7 @@ static const struct inode_operations ceph_symlink_iops = { | |||
1982 | int __ceph_setattr(struct inode *inode, struct iattr *attr) | 1989 | int __ceph_setattr(struct inode *inode, struct iattr *attr) |
1983 | { | 1990 | { |
1984 | struct ceph_inode_info *ci = ceph_inode(inode); | 1991 | struct ceph_inode_info *ci = ceph_inode(inode); |
1985 | const unsigned int ia_valid = attr->ia_valid; | 1992 | unsigned int ia_valid = attr->ia_valid; |
1986 | struct ceph_mds_request *req; | 1993 | struct ceph_mds_request *req; |
1987 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; | 1994 | struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; |
1988 | struct ceph_cap_flush *prealloc_cf; | 1995 | struct ceph_cap_flush *prealloc_cf; |
@@ -2087,6 +2094,26 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) | |||
2087 | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; | 2094 | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; |
2088 | } | 2095 | } |
2089 | } | 2096 | } |
2097 | if (ia_valid & ATTR_SIZE) { | ||
2098 | dout("setattr %p size %lld -> %lld\n", inode, | ||
2099 | inode->i_size, attr->ia_size); | ||
2100 | if ((issued & CEPH_CAP_FILE_EXCL) && | ||
2101 | attr->ia_size > inode->i_size) { | ||
2102 | i_size_write(inode, attr->ia_size); | ||
2103 | inode->i_blocks = calc_inode_blocks(attr->ia_size); | ||
2104 | ci->i_reported_size = attr->ia_size; | ||
2105 | dirtied |= CEPH_CAP_FILE_EXCL; | ||
2106 | ia_valid |= ATTR_MTIME; | ||
2107 | } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || | ||
2108 | attr->ia_size != inode->i_size) { | ||
2109 | req->r_args.setattr.size = cpu_to_le64(attr->ia_size); | ||
2110 | req->r_args.setattr.old_size = | ||
2111 | cpu_to_le64(inode->i_size); | ||
2112 | mask |= CEPH_SETATTR_SIZE; | ||
2113 | release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | | ||
2114 | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; | ||
2115 | } | ||
2116 | } | ||
2090 | if (ia_valid & ATTR_MTIME) { | 2117 | if (ia_valid & ATTR_MTIME) { |
2091 | dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, | 2118 | dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, |
2092 | inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, | 2119 | inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, |
@@ -2109,25 +2136,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) | |||
2109 | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; | 2136 | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; |
2110 | } | 2137 | } |
2111 | } | 2138 | } |
2112 | if (ia_valid & ATTR_SIZE) { | ||
2113 | dout("setattr %p size %lld -> %lld\n", inode, | ||
2114 | inode->i_size, attr->ia_size); | ||
2115 | if ((issued & CEPH_CAP_FILE_EXCL) && | ||
2116 | attr->ia_size > inode->i_size) { | ||
2117 | i_size_write(inode, attr->ia_size); | ||
2118 | inode->i_blocks = calc_inode_blocks(attr->ia_size); | ||
2119 | ci->i_reported_size = attr->ia_size; | ||
2120 | dirtied |= CEPH_CAP_FILE_EXCL; | ||
2121 | } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || | ||
2122 | attr->ia_size != inode->i_size) { | ||
2123 | req->r_args.setattr.size = cpu_to_le64(attr->ia_size); | ||
2124 | req->r_args.setattr.old_size = | ||
2125 | cpu_to_le64(inode->i_size); | ||
2126 | mask |= CEPH_SETATTR_SIZE; | ||
2127 | release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | | ||
2128 | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; | ||
2129 | } | ||
2130 | } | ||
2131 | 2139 | ||
2132 | /* these do nothing */ | 2140 | /* these do nothing */ |
2133 | if (ia_valid & ATTR_CTIME) { | 2141 | if (ia_valid & ATTR_CTIME) { |
diff --git a/fs/ceph/io.c b/fs/ceph/io.c new file mode 100644 index 000000000000..97602ea92ff4 --- /dev/null +++ b/fs/ceph/io.c | |||
@@ -0,0 +1,163 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | /* | ||
3 | * Copyright (c) 2016 Trond Myklebust | ||
4 | * Copyright (c) 2019 Jeff Layton | ||
5 | * | ||
6 | * I/O and data path helper functionality. | ||
7 | * | ||
8 | * Heavily borrowed from equivalent code in fs/nfs/io.c | ||
9 | */ | ||
10 | |||
11 | #include <linux/ceph/ceph_debug.h> | ||
12 | |||
13 | #include <linux/types.h> | ||
14 | #include <linux/kernel.h> | ||
15 | #include <linux/rwsem.h> | ||
16 | #include <linux/fs.h> | ||
17 | |||
18 | #include "super.h" | ||
19 | #include "io.h" | ||
20 | |||
21 | /* Call with exclusively locked inode->i_rwsem */ | ||
22 | static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) | ||
23 | { | ||
24 | lockdep_assert_held_write(&inode->i_rwsem); | ||
25 | |||
26 | if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) { | ||
27 | spin_lock(&ci->i_ceph_lock); | ||
28 | ci->i_ceph_flags &= ~CEPH_I_ODIRECT; | ||
29 | spin_unlock(&ci->i_ceph_lock); | ||
30 | inode_dio_wait(inode); | ||
31 | } | ||
32 | } | ||
33 | |||
34 | /** | ||
35 | * ceph_start_io_read - declare the file is being used for buffered reads | ||
36 | * @inode: file inode | ||
37 | * | ||
38 | * Declare that a buffered read operation is about to start, and ensure | ||
39 | * that we block all direct I/O. | ||
40 | * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset, | ||
41 | * and holds a shared lock on inode->i_rwsem to ensure that the flag | ||
42 | * cannot be changed. | ||
43 | * In practice, this means that buffered read operations are allowed to | ||
44 | * execute in parallel, thanks to the shared lock, whereas direct I/O | ||
45 | * operations need to wait to grab an exclusive lock in order to set | ||
46 | * CEPH_I_ODIRECT. | ||
47 | * Note that buffered writes and truncates both take a write lock on | ||
48 | * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. | ||
49 | */ | ||
50 | void | ||
51 | ceph_start_io_read(struct inode *inode) | ||
52 | { | ||
53 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
54 | |||
55 | /* Be an optimist! */ | ||
56 | down_read(&inode->i_rwsem); | ||
57 | if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) | ||
58 | return; | ||
59 | up_read(&inode->i_rwsem); | ||
60 | /* Slow path.... */ | ||
61 | down_write(&inode->i_rwsem); | ||
62 | ceph_block_o_direct(ci, inode); | ||
63 | downgrade_write(&inode->i_rwsem); | ||
64 | } | ||
65 | |||
66 | /** | ||
67 | * ceph_end_io_read - declare that the buffered read operation is done | ||
68 | * @inode: file inode | ||
69 | * | ||
70 | * Declare that a buffered read operation is done, and release the shared | ||
71 | * lock on inode->i_rwsem. | ||
72 | */ | ||
73 | void | ||
74 | ceph_end_io_read(struct inode *inode) | ||
75 | { | ||
76 | up_read(&inode->i_rwsem); | ||
77 | } | ||
78 | |||
79 | /** | ||
80 | * ceph_start_io_write - declare the file is being used for buffered writes | ||
81 | * @inode: file inode | ||
82 | * | ||
83 | * Declare that a buffered write operation is about to start, and ensure | ||
84 | * that we block all direct I/O. | ||
85 | */ | ||
86 | void | ||
87 | ceph_start_io_write(struct inode *inode) | ||
88 | { | ||
89 | down_write(&inode->i_rwsem); | ||
90 | ceph_block_o_direct(ceph_inode(inode), inode); | ||
91 | } | ||
92 | |||
93 | /** | ||
94 | * ceph_end_io_write - declare that the buffered write operation is done | ||
95 | * @inode: file inode | ||
96 | * | ||
97 | * Declare that a buffered write operation is done, and release the | ||
98 | * lock on inode->i_rwsem. | ||
99 | */ | ||
100 | void | ||
101 | ceph_end_io_write(struct inode *inode) | ||
102 | { | ||
103 | up_write(&inode->i_rwsem); | ||
104 | } | ||
105 | |||
106 | /* Call with exclusively locked inode->i_rwsem */ | ||
107 | static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) | ||
108 | { | ||
109 | lockdep_assert_held_write(&inode->i_rwsem); | ||
110 | |||
111 | if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) { | ||
112 | spin_lock(&ci->i_ceph_lock); | ||
113 | ci->i_ceph_flags |= CEPH_I_ODIRECT; | ||
114 | spin_unlock(&ci->i_ceph_lock); | ||
115 | /* FIXME: unmap_mapping_range? */ | ||
116 | filemap_write_and_wait(inode->i_mapping); | ||
117 | } | ||
118 | } | ||
119 | |||
120 | /** | ||
121 | * ceph_start_io_direct - declare the file is being used for direct i/o | ||
122 | * @inode: file inode | ||
123 | * | ||
124 | * Declare that a direct I/O operation is about to start, and ensure | ||
125 | * that we block all buffered I/O. | ||
126 | * On exit, the function ensures that the CEPH_I_ODIRECT flag is set, | ||
127 | * and holds a shared lock on inode->i_rwsem to ensure that the flag | ||
128 | * cannot be changed. | ||
129 | * In practice, this means that direct I/O operations are allowed to | ||
130 | * execute in parallel, thanks to the shared lock, whereas buffered I/O | ||
131 | * operations need to wait to grab an exclusive lock in order to clear | ||
132 | * CEPH_I_ODIRECT. | ||
133 | * Note that buffered writes and truncates both take a write lock on | ||
134 | * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. | ||
135 | */ | ||
136 | void | ||
137 | ceph_start_io_direct(struct inode *inode) | ||
138 | { | ||
139 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
140 | |||
141 | /* Be an optimist! */ | ||
142 | down_read(&inode->i_rwsem); | ||
143 | if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) | ||
144 | return; | ||
145 | up_read(&inode->i_rwsem); | ||
146 | /* Slow path.... */ | ||
147 | down_write(&inode->i_rwsem); | ||
148 | ceph_block_buffered(ci, inode); | ||
149 | downgrade_write(&inode->i_rwsem); | ||
150 | } | ||
151 | |||
152 | /** | ||
153 | * ceph_end_io_direct - declare that the direct i/o operation is done | ||
154 | * @inode: file inode | ||
155 | * | ||
156 | * Declare that a direct I/O operation is done, and release the shared | ||
157 | * lock on inode->i_rwsem. | ||
158 | */ | ||
159 | void | ||
160 | ceph_end_io_direct(struct inode *inode) | ||
161 | { | ||
162 | up_read(&inode->i_rwsem); | ||
163 | } | ||
diff --git a/fs/ceph/io.h b/fs/ceph/io.h new file mode 100644 index 000000000000..fa594cd77348 --- /dev/null +++ b/fs/ceph/io.h | |||
@@ -0,0 +1,12 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | ||
2 | #ifndef _FS_CEPH_IO_H | ||
3 | #define _FS_CEPH_IO_H | ||
4 | |||
5 | void ceph_start_io_read(struct inode *inode); | ||
6 | void ceph_end_io_read(struct inode *inode); | ||
7 | void ceph_start_io_write(struct inode *inode); | ||
8 | void ceph_end_io_write(struct inode *inode); | ||
9 | void ceph_start_io_direct(struct inode *inode); | ||
10 | void ceph_end_io_direct(struct inode *inode); | ||
11 | |||
12 | #endif /* _FS_CEPH_IO_H */ | ||
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 5083e238ad15..544e9e85b120 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c | |||
@@ -32,14 +32,18 @@ void __init ceph_flock_init(void) | |||
32 | 32 | ||
33 | static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) | 33 | static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) |
34 | { | 34 | { |
35 | struct inode *inode = file_inode(src->fl_file); | 35 | struct ceph_file_info *fi = dst->fl_file->private_data; |
36 | struct inode *inode = file_inode(dst->fl_file); | ||
36 | atomic_inc(&ceph_inode(inode)->i_filelock_ref); | 37 | atomic_inc(&ceph_inode(inode)->i_filelock_ref); |
38 | atomic_inc(&fi->num_locks); | ||
37 | } | 39 | } |
38 | 40 | ||
39 | static void ceph_fl_release_lock(struct file_lock *fl) | 41 | static void ceph_fl_release_lock(struct file_lock *fl) |
40 | { | 42 | { |
43 | struct ceph_file_info *fi = fl->fl_file->private_data; | ||
41 | struct inode *inode = file_inode(fl->fl_file); | 44 | struct inode *inode = file_inode(fl->fl_file); |
42 | struct ceph_inode_info *ci = ceph_inode(inode); | 45 | struct ceph_inode_info *ci = ceph_inode(inode); |
46 | atomic_dec(&fi->num_locks); | ||
43 | if (atomic_dec_and_test(&ci->i_filelock_ref)) { | 47 | if (atomic_dec_and_test(&ci->i_filelock_ref)) { |
44 | /* clear error when all locks are released */ | 48 | /* clear error when all locks are released */ |
45 | spin_lock(&ci->i_ceph_lock); | 49 | spin_lock(&ci->i_ceph_lock); |
@@ -73,7 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, | |||
73 | * window. Caller function will decrease the counter. | 77 | * window. Caller function will decrease the counter. |
74 | */ | 78 | */ |
75 | fl->fl_ops = &ceph_fl_lock_ops; | 79 | fl->fl_ops = &ceph_fl_lock_ops; |
76 | atomic_inc(&ceph_inode(inode)->i_filelock_ref); | 80 | fl->fl_ops->fl_copy_lock(fl, NULL); |
77 | } | 81 | } |
78 | 82 | ||
79 | if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) | 83 | if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 920e9f048bd8..a8a8f84f3bbf 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
@@ -639,7 +639,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, | |||
639 | s->s_renew_seq = 0; | 639 | s->s_renew_seq = 0; |
640 | INIT_LIST_HEAD(&s->s_caps); | 640 | INIT_LIST_HEAD(&s->s_caps); |
641 | s->s_nr_caps = 0; | 641 | s->s_nr_caps = 0; |
642 | s->s_trim_caps = 0; | ||
643 | refcount_set(&s->s_ref, 1); | 642 | refcount_set(&s->s_ref, 1); |
644 | INIT_LIST_HEAD(&s->s_waiting); | 643 | INIT_LIST_HEAD(&s->s_waiting); |
645 | INIT_LIST_HEAD(&s->s_unsafe); | 644 | INIT_LIST_HEAD(&s->s_unsafe); |
@@ -1270,6 +1269,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, | |||
1270 | { | 1269 | { |
1271 | struct ceph_mds_request *req; | 1270 | struct ceph_mds_request *req; |
1272 | struct rb_node *p; | 1271 | struct rb_node *p; |
1272 | struct ceph_inode_info *ci; | ||
1273 | 1273 | ||
1274 | dout("cleanup_session_requests mds%d\n", session->s_mds); | 1274 | dout("cleanup_session_requests mds%d\n", session->s_mds); |
1275 | mutex_lock(&mdsc->mutex); | 1275 | mutex_lock(&mdsc->mutex); |
@@ -1278,6 +1278,16 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, | |||
1278 | struct ceph_mds_request, r_unsafe_item); | 1278 | struct ceph_mds_request, r_unsafe_item); |
1279 | pr_warn_ratelimited(" dropping unsafe request %llu\n", | 1279 | pr_warn_ratelimited(" dropping unsafe request %llu\n", |
1280 | req->r_tid); | 1280 | req->r_tid); |
1281 | if (req->r_target_inode) { | ||
1282 | /* dropping unsafe change of inode's attributes */ | ||
1283 | ci = ceph_inode(req->r_target_inode); | ||
1284 | errseq_set(&ci->i_meta_err, -EIO); | ||
1285 | } | ||
1286 | if (req->r_unsafe_dir) { | ||
1287 | /* dropping unsafe directory operation */ | ||
1288 | ci = ceph_inode(req->r_unsafe_dir); | ||
1289 | errseq_set(&ci->i_meta_err, -EIO); | ||
1290 | } | ||
1281 | __unregister_request(mdsc, req); | 1291 | __unregister_request(mdsc, req); |
1282 | } | 1292 | } |
1283 | /* zero r_attempts, so kick_requests() will re-send requests */ | 1293 | /* zero r_attempts, so kick_requests() will re-send requests */ |
@@ -1370,7 +1380,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1370 | struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; | 1380 | struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; |
1371 | struct ceph_inode_info *ci = ceph_inode(inode); | 1381 | struct ceph_inode_info *ci = ceph_inode(inode); |
1372 | LIST_HEAD(to_remove); | 1382 | LIST_HEAD(to_remove); |
1373 | bool drop = false; | 1383 | bool dirty_dropped = false; |
1374 | bool invalidate = false; | 1384 | bool invalidate = false; |
1375 | 1385 | ||
1376 | dout("removing cap %p, ci is %p, inode is %p\n", | 1386 | dout("removing cap %p, ci is %p, inode is %p\n", |
@@ -1383,9 +1393,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1383 | struct ceph_cap_flush *cf; | 1393 | struct ceph_cap_flush *cf; |
1384 | struct ceph_mds_client *mdsc = fsc->mdsc; | 1394 | struct ceph_mds_client *mdsc = fsc->mdsc; |
1385 | 1395 | ||
1386 | if (ci->i_wrbuffer_ref > 0 && | 1396 | if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { |
1387 | READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) | 1397 | if (inode->i_data.nrpages > 0) |
1388 | invalidate = true; | 1398 | invalidate = true; |
1399 | if (ci->i_wrbuffer_ref > 0) | ||
1400 | mapping_set_error(&inode->i_data, -EIO); | ||
1401 | } | ||
1389 | 1402 | ||
1390 | while (!list_empty(&ci->i_cap_flush_list)) { | 1403 | while (!list_empty(&ci->i_cap_flush_list)) { |
1391 | cf = list_first_entry(&ci->i_cap_flush_list, | 1404 | cf = list_first_entry(&ci->i_cap_flush_list, |
@@ -1405,7 +1418,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1405 | inode, ceph_ino(inode)); | 1418 | inode, ceph_ino(inode)); |
1406 | ci->i_dirty_caps = 0; | 1419 | ci->i_dirty_caps = 0; |
1407 | list_del_init(&ci->i_dirty_item); | 1420 | list_del_init(&ci->i_dirty_item); |
1408 | drop = true; | 1421 | dirty_dropped = true; |
1409 | } | 1422 | } |
1410 | if (!list_empty(&ci->i_flushing_item)) { | 1423 | if (!list_empty(&ci->i_flushing_item)) { |
1411 | pr_warn_ratelimited( | 1424 | pr_warn_ratelimited( |
@@ -1415,10 +1428,22 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1415 | ci->i_flushing_caps = 0; | 1428 | ci->i_flushing_caps = 0; |
1416 | list_del_init(&ci->i_flushing_item); | 1429 | list_del_init(&ci->i_flushing_item); |
1417 | mdsc->num_cap_flushing--; | 1430 | mdsc->num_cap_flushing--; |
1418 | drop = true; | 1431 | dirty_dropped = true; |
1419 | } | 1432 | } |
1420 | spin_unlock(&mdsc->cap_dirty_lock); | 1433 | spin_unlock(&mdsc->cap_dirty_lock); |
1421 | 1434 | ||
1435 | if (dirty_dropped) { | ||
1436 | errseq_set(&ci->i_meta_err, -EIO); | ||
1437 | |||
1438 | if (ci->i_wrbuffer_ref_head == 0 && | ||
1439 | ci->i_wr_ref == 0 && | ||
1440 | ci->i_dirty_caps == 0 && | ||
1441 | ci->i_flushing_caps == 0) { | ||
1442 | ceph_put_snap_context(ci->i_head_snapc); | ||
1443 | ci->i_head_snapc = NULL; | ||
1444 | } | ||
1445 | } | ||
1446 | |||
1422 | if (atomic_read(&ci->i_filelock_ref) > 0) { | 1447 | if (atomic_read(&ci->i_filelock_ref) > 0) { |
1423 | /* make further file lock syscall return -EIO */ | 1448 | /* make further file lock syscall return -EIO */ |
1424 | ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; | 1449 | ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; |
@@ -1430,15 +1455,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1430 | list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); | 1455 | list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); |
1431 | ci->i_prealloc_cap_flush = NULL; | 1456 | ci->i_prealloc_cap_flush = NULL; |
1432 | } | 1457 | } |
1433 | |||
1434 | if (drop && | ||
1435 | ci->i_wrbuffer_ref_head == 0 && | ||
1436 | ci->i_wr_ref == 0 && | ||
1437 | ci->i_dirty_caps == 0 && | ||
1438 | ci->i_flushing_caps == 0) { | ||
1439 | ceph_put_snap_context(ci->i_head_snapc); | ||
1440 | ci->i_head_snapc = NULL; | ||
1441 | } | ||
1442 | } | 1458 | } |
1443 | spin_unlock(&ci->i_ceph_lock); | 1459 | spin_unlock(&ci->i_ceph_lock); |
1444 | while (!list_empty(&to_remove)) { | 1460 | while (!list_empty(&to_remove)) { |
@@ -1452,7 +1468,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, | |||
1452 | wake_up_all(&ci->i_cap_wq); | 1468 | wake_up_all(&ci->i_cap_wq); |
1453 | if (invalidate) | 1469 | if (invalidate) |
1454 | ceph_queue_invalidate(inode); | 1470 | ceph_queue_invalidate(inode); |
1455 | if (drop) | 1471 | if (dirty_dropped) |
1456 | iput(inode); | 1472 | iput(inode); |
1457 | return 0; | 1473 | return 0; |
1458 | } | 1474 | } |
@@ -1705,11 +1721,11 @@ out: | |||
1705 | */ | 1721 | */ |
1706 | static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) | 1722 | static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) |
1707 | { | 1723 | { |
1708 | struct ceph_mds_session *session = arg; | 1724 | int *remaining = arg; |
1709 | struct ceph_inode_info *ci = ceph_inode(inode); | 1725 | struct ceph_inode_info *ci = ceph_inode(inode); |
1710 | int used, wanted, oissued, mine; | 1726 | int used, wanted, oissued, mine; |
1711 | 1727 | ||
1712 | if (session->s_trim_caps <= 0) | 1728 | if (*remaining <= 0) |
1713 | return -1; | 1729 | return -1; |
1714 | 1730 | ||
1715 | spin_lock(&ci->i_ceph_lock); | 1731 | spin_lock(&ci->i_ceph_lock); |
@@ -1746,7 +1762,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) | |||
1746 | if (oissued) { | 1762 | if (oissued) { |
1747 | /* we aren't the only cap.. just remove us */ | 1763 | /* we aren't the only cap.. just remove us */ |
1748 | __ceph_remove_cap(cap, true); | 1764 | __ceph_remove_cap(cap, true); |
1749 | session->s_trim_caps--; | 1765 | (*remaining)--; |
1750 | } else { | 1766 | } else { |
1751 | struct dentry *dentry; | 1767 | struct dentry *dentry; |
1752 | /* try dropping referring dentries */ | 1768 | /* try dropping referring dentries */ |
@@ -1758,7 +1774,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) | |||
1758 | d_prune_aliases(inode); | 1774 | d_prune_aliases(inode); |
1759 | count = atomic_read(&inode->i_count); | 1775 | count = atomic_read(&inode->i_count); |
1760 | if (count == 1) | 1776 | if (count == 1) |
1761 | session->s_trim_caps--; | 1777 | (*remaining)--; |
1762 | dout("trim_caps_cb %p cap %p pruned, count now %d\n", | 1778 | dout("trim_caps_cb %p cap %p pruned, count now %d\n", |
1763 | inode, cap, count); | 1779 | inode, cap, count); |
1764 | } else { | 1780 | } else { |
@@ -1784,12 +1800,12 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, | |||
1784 | dout("trim_caps mds%d start: %d / %d, trim %d\n", | 1800 | dout("trim_caps mds%d start: %d / %d, trim %d\n", |
1785 | session->s_mds, session->s_nr_caps, max_caps, trim_caps); | 1801 | session->s_mds, session->s_nr_caps, max_caps, trim_caps); |
1786 | if (trim_caps > 0) { | 1802 | if (trim_caps > 0) { |
1787 | session->s_trim_caps = trim_caps; | 1803 | int remaining = trim_caps; |
1788 | ceph_iterate_session_caps(session, trim_caps_cb, session); | 1804 | |
1805 | ceph_iterate_session_caps(session, trim_caps_cb, &remaining); | ||
1789 | dout("trim_caps mds%d done: %d / %d, trimmed %d\n", | 1806 | dout("trim_caps mds%d done: %d / %d, trimmed %d\n", |
1790 | session->s_mds, session->s_nr_caps, max_caps, | 1807 | session->s_mds, session->s_nr_caps, max_caps, |
1791 | trim_caps - session->s_trim_caps); | 1808 | trim_caps - remaining); |
1792 | session->s_trim_caps = 0; | ||
1793 | } | 1809 | } |
1794 | 1810 | ||
1795 | ceph_flush_cap_releases(mdsc, session); | 1811 | ceph_flush_cap_releases(mdsc, session); |
@@ -3015,18 +3031,23 @@ bad: | |||
3015 | pr_err("mdsc_handle_forward decode error err=%d\n", err); | 3031 | pr_err("mdsc_handle_forward decode error err=%d\n", err); |
3016 | } | 3032 | } |
3017 | 3033 | ||
3018 | static int __decode_and_drop_session_metadata(void **p, void *end) | 3034 | static int __decode_session_metadata(void **p, void *end, |
3035 | bool *blacklisted) | ||
3019 | { | 3036 | { |
3020 | /* map<string,string> */ | 3037 | /* map<string,string> */ |
3021 | u32 n; | 3038 | u32 n; |
3039 | bool err_str; | ||
3022 | ceph_decode_32_safe(p, end, n, bad); | 3040 | ceph_decode_32_safe(p, end, n, bad); |
3023 | while (n-- > 0) { | 3041 | while (n-- > 0) { |
3024 | u32 len; | 3042 | u32 len; |
3025 | ceph_decode_32_safe(p, end, len, bad); | 3043 | ceph_decode_32_safe(p, end, len, bad); |
3026 | ceph_decode_need(p, end, len, bad); | 3044 | ceph_decode_need(p, end, len, bad); |
3045 | err_str = !strncmp(*p, "error_string", len); | ||
3027 | *p += len; | 3046 | *p += len; |
3028 | ceph_decode_32_safe(p, end, len, bad); | 3047 | ceph_decode_32_safe(p, end, len, bad); |
3029 | ceph_decode_need(p, end, len, bad); | 3048 | ceph_decode_need(p, end, len, bad); |
3049 | if (err_str && strnstr(*p, "blacklisted", len)) | ||
3050 | *blacklisted = true; | ||
3030 | *p += len; | 3051 | *p += len; |
3031 | } | 3052 | } |
3032 | return 0; | 3053 | return 0; |
@@ -3050,6 +3071,7 @@ static void handle_session(struct ceph_mds_session *session, | |||
3050 | u64 seq; | 3071 | u64 seq; |
3051 | unsigned long features = 0; | 3072 | unsigned long features = 0; |
3052 | int wake = 0; | 3073 | int wake = 0; |
3074 | bool blacklisted = false; | ||
3053 | 3075 | ||
3054 | /* decode */ | 3076 | /* decode */ |
3055 | ceph_decode_need(&p, end, sizeof(*h), bad); | 3077 | ceph_decode_need(&p, end, sizeof(*h), bad); |
@@ -3062,7 +3084,7 @@ static void handle_session(struct ceph_mds_session *session, | |||
3062 | if (msg_version >= 3) { | 3084 | if (msg_version >= 3) { |
3063 | u32 len; | 3085 | u32 len; |
3064 | /* version >= 2, metadata */ | 3086 | /* version >= 2, metadata */ |
3065 | if (__decode_and_drop_session_metadata(&p, end) < 0) | 3087 | if (__decode_session_metadata(&p, end, &blacklisted) < 0) |
3066 | goto bad; | 3088 | goto bad; |
3067 | /* version >= 3, feature bits */ | 3089 | /* version >= 3, feature bits */ |
3068 | ceph_decode_32_safe(&p, end, len, bad); | 3090 | ceph_decode_32_safe(&p, end, len, bad); |
@@ -3149,6 +3171,8 @@ static void handle_session(struct ceph_mds_session *session, | |||
3149 | session->s_state = CEPH_MDS_SESSION_REJECTED; | 3171 | session->s_state = CEPH_MDS_SESSION_REJECTED; |
3150 | cleanup_session_requests(mdsc, session); | 3172 | cleanup_session_requests(mdsc, session); |
3151 | remove_session_caps(session); | 3173 | remove_session_caps(session); |
3174 | if (blacklisted) | ||
3175 | mdsc->fsc->blacklisted = true; | ||
3152 | wake = 2; /* for good measure */ | 3176 | wake = 2; /* for good measure */ |
3153 | break; | 3177 | break; |
3154 | 3178 | ||
@@ -3998,7 +4022,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc) | |||
3998 | mutex_unlock(&mdsc->mutex); | 4022 | mutex_unlock(&mdsc->mutex); |
3999 | } | 4023 | } |
4000 | 4024 | ||
4025 | static void maybe_recover_session(struct ceph_mds_client *mdsc) | ||
4026 | { | ||
4027 | struct ceph_fs_client *fsc = mdsc->fsc; | ||
4028 | |||
4029 | if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) | ||
4030 | return; | ||
4031 | |||
4032 | if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) | ||
4033 | return; | ||
4034 | |||
4035 | if (!READ_ONCE(fsc->blacklisted)) | ||
4036 | return; | ||
4037 | |||
4038 | if (fsc->last_auto_reconnect && | ||
4039 | time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) | ||
4040 | return; | ||
4001 | 4041 | ||
4042 | pr_info("auto reconnect after blacklisted\n"); | ||
4043 | fsc->last_auto_reconnect = jiffies; | ||
4044 | ceph_force_reconnect(fsc->sb); | ||
4045 | } | ||
4002 | 4046 | ||
4003 | /* | 4047 | /* |
4004 | * delayed work -- periodically trim expired leases, renew caps with mds | 4048 | * delayed work -- periodically trim expired leases, renew caps with mds |
@@ -4044,7 +4088,9 @@ static void delayed_work(struct work_struct *work) | |||
4044 | pr_info("mds%d hung\n", s->s_mds); | 4088 | pr_info("mds%d hung\n", s->s_mds); |
4045 | } | 4089 | } |
4046 | } | 4090 | } |
4047 | if (s->s_state < CEPH_MDS_SESSION_OPEN) { | 4091 | if (s->s_state == CEPH_MDS_SESSION_NEW || |
4092 | s->s_state == CEPH_MDS_SESSION_RESTARTING || | ||
4093 | s->s_state == CEPH_MDS_SESSION_REJECTED) { | ||
4048 | /* this mds is failed or recovering, just wait */ | 4094 | /* this mds is failed or recovering, just wait */ |
4049 | ceph_put_mds_session(s); | 4095 | ceph_put_mds_session(s); |
4050 | continue; | 4096 | continue; |
@@ -4072,6 +4118,8 @@ static void delayed_work(struct work_struct *work) | |||
4072 | 4118 | ||
4073 | ceph_trim_snapid_map(mdsc); | 4119 | ceph_trim_snapid_map(mdsc); |
4074 | 4120 | ||
4121 | maybe_recover_session(mdsc); | ||
4122 | |||
4075 | schedule_delayed(mdsc); | 4123 | schedule_delayed(mdsc); |
4076 | } | 4124 | } |
4077 | 4125 | ||
@@ -4355,7 +4403,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) | |||
4355 | session = __ceph_lookup_mds_session(mdsc, mds); | 4403 | session = __ceph_lookup_mds_session(mdsc, mds); |
4356 | if (!session) | 4404 | if (!session) |
4357 | continue; | 4405 | continue; |
4406 | |||
4407 | if (session->s_state == CEPH_MDS_SESSION_REJECTED) | ||
4408 | __unregister_session(mdsc, session); | ||
4409 | __wake_requests(mdsc, &session->s_waiting); | ||
4358 | mutex_unlock(&mdsc->mutex); | 4410 | mutex_unlock(&mdsc->mutex); |
4411 | |||
4359 | mutex_lock(&session->s_mutex); | 4412 | mutex_lock(&session->s_mutex); |
4360 | __close_session(mdsc, session); | 4413 | __close_session(mdsc, session); |
4361 | if (session->s_state == CEPH_MDS_SESSION_CLOSING) { | 4414 | if (session->s_state == CEPH_MDS_SESSION_CLOSING) { |
@@ -4364,6 +4417,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) | |||
4364 | } | 4417 | } |
4365 | mutex_unlock(&session->s_mutex); | 4418 | mutex_unlock(&session->s_mutex); |
4366 | ceph_put_mds_session(session); | 4419 | ceph_put_mds_session(session); |
4420 | |||
4367 | mutex_lock(&mdsc->mutex); | 4421 | mutex_lock(&mdsc->mutex); |
4368 | kick_requests(mdsc, mds); | 4422 | kick_requests(mdsc, mds); |
4369 | } | 4423 | } |
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index f7c8603484fe..5cd131b41d84 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h | |||
@@ -148,9 +148,9 @@ enum { | |||
148 | CEPH_MDS_SESSION_OPENING = 2, | 148 | CEPH_MDS_SESSION_OPENING = 2, |
149 | CEPH_MDS_SESSION_OPEN = 3, | 149 | CEPH_MDS_SESSION_OPEN = 3, |
150 | CEPH_MDS_SESSION_HUNG = 4, | 150 | CEPH_MDS_SESSION_HUNG = 4, |
151 | CEPH_MDS_SESSION_CLOSING = 5, | 151 | CEPH_MDS_SESSION_RESTARTING = 5, |
152 | CEPH_MDS_SESSION_RESTARTING = 6, | 152 | CEPH_MDS_SESSION_RECONNECTING = 6, |
153 | CEPH_MDS_SESSION_RECONNECTING = 7, | 153 | CEPH_MDS_SESSION_CLOSING = 7, |
154 | CEPH_MDS_SESSION_REJECTED = 8, | 154 | CEPH_MDS_SESSION_REJECTED = 8, |
155 | }; | 155 | }; |
156 | 156 | ||
@@ -176,7 +176,7 @@ struct ceph_mds_session { | |||
176 | spinlock_t s_cap_lock; | 176 | spinlock_t s_cap_lock; |
177 | struct list_head s_caps; /* all caps issued by this session */ | 177 | struct list_head s_caps; /* all caps issued by this session */ |
178 | struct ceph_cap *s_cap_iterator; | 178 | struct ceph_cap *s_cap_iterator; |
179 | int s_nr_caps, s_trim_caps; | 179 | int s_nr_caps; |
180 | int s_num_cap_releases; | 180 | int s_num_cap_releases; |
181 | int s_cap_reconnect; | 181 | int s_cap_reconnect; |
182 | int s_readonly; | 182 | int s_readonly; |
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 377fafc76f20..edfd643a8205 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
@@ -143,6 +143,7 @@ enum { | |||
143 | Opt_snapdirname, | 143 | Opt_snapdirname, |
144 | Opt_mds_namespace, | 144 | Opt_mds_namespace, |
145 | Opt_fscache_uniq, | 145 | Opt_fscache_uniq, |
146 | Opt_recover_session, | ||
146 | Opt_last_string, | 147 | Opt_last_string, |
147 | /* string args above */ | 148 | /* string args above */ |
148 | Opt_dirstat, | 149 | Opt_dirstat, |
@@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = { | |||
184 | /* int args above */ | 185 | /* int args above */ |
185 | {Opt_snapdirname, "snapdirname=%s"}, | 186 | {Opt_snapdirname, "snapdirname=%s"}, |
186 | {Opt_mds_namespace, "mds_namespace=%s"}, | 187 | {Opt_mds_namespace, "mds_namespace=%s"}, |
188 | {Opt_recover_session, "recover_session=%s"}, | ||
187 | {Opt_fscache_uniq, "fsc=%s"}, | 189 | {Opt_fscache_uniq, "fsc=%s"}, |
188 | /* string args above */ | 190 | /* string args above */ |
189 | {Opt_dirstat, "dirstat"}, | 191 | {Opt_dirstat, "dirstat"}, |
@@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private) | |||
254 | if (!fsopt->mds_namespace) | 256 | if (!fsopt->mds_namespace) |
255 | return -ENOMEM; | 257 | return -ENOMEM; |
256 | break; | 258 | break; |
259 | case Opt_recover_session: | ||
260 | if (!strncmp(argstr[0].from, "no", | ||
261 | argstr[0].to - argstr[0].from)) { | ||
262 | fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; | ||
263 | } else if (!strncmp(argstr[0].from, "clean", | ||
264 | argstr[0].to - argstr[0].from)) { | ||
265 | fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; | ||
266 | } else { | ||
267 | return -EINVAL; | ||
268 | } | ||
269 | break; | ||
257 | case Opt_fscache_uniq: | 270 | case Opt_fscache_uniq: |
258 | kfree(fsopt->fscache_uniq); | 271 | kfree(fsopt->fscache_uniq); |
259 | fsopt->fscache_uniq = kstrndup(argstr[0].from, | 272 | fsopt->fscache_uniq = kstrndup(argstr[0].from, |
@@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) | |||
576 | 589 | ||
577 | if (fsopt->mds_namespace) | 590 | if (fsopt->mds_namespace) |
578 | seq_show_option(m, "mds_namespace", fsopt->mds_namespace); | 591 | seq_show_option(m, "mds_namespace", fsopt->mds_namespace); |
592 | |||
593 | if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) | ||
594 | seq_show_option(m, "recover_session", "clean"); | ||
595 | |||
579 | if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) | 596 | if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) |
580 | seq_printf(m, ",wsize=%d", fsopt->wsize); | 597 | seq_printf(m, ",wsize=%d", fsopt->wsize); |
581 | if (fsopt->rsize != CEPH_MAX_READ_SIZE) | 598 | if (fsopt->rsize != CEPH_MAX_READ_SIZE) |
@@ -664,6 +681,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, | |||
664 | 681 | ||
665 | fsc->sb = NULL; | 682 | fsc->sb = NULL; |
666 | fsc->mount_state = CEPH_MOUNT_MOUNTING; | 683 | fsc->mount_state = CEPH_MOUNT_MOUNTING; |
684 | fsc->filp_gen = 1; | ||
667 | 685 | ||
668 | atomic_long_set(&fsc->writeback_count, 0); | 686 | atomic_long_set(&fsc->writeback_count, 0); |
669 | 687 | ||
@@ -713,6 +731,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) | |||
713 | { | 731 | { |
714 | dout("destroy_fs_client %p\n", fsc); | 732 | dout("destroy_fs_client %p\n", fsc); |
715 | 733 | ||
734 | ceph_mdsc_destroy(fsc); | ||
716 | destroy_workqueue(fsc->inode_wq); | 735 | destroy_workqueue(fsc->inode_wq); |
717 | destroy_workqueue(fsc->cap_wq); | 736 | destroy_workqueue(fsc->cap_wq); |
718 | 737 | ||
@@ -829,7 +848,7 @@ static void ceph_umount_begin(struct super_block *sb) | |||
829 | fsc->mount_state = CEPH_MOUNT_SHUTDOWN; | 848 | fsc->mount_state = CEPH_MOUNT_SHUTDOWN; |
830 | ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); | 849 | ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); |
831 | ceph_mdsc_force_umount(fsc->mdsc); | 850 | ceph_mdsc_force_umount(fsc->mdsc); |
832 | return; | 851 | fsc->filp_gen++; // invalidate open files |
833 | } | 852 | } |
834 | 853 | ||
835 | static int ceph_remount(struct super_block *sb, int *flags, char *data) | 854 | static int ceph_remount(struct super_block *sb, int *flags, char *data) |
@@ -1089,7 +1108,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, | |||
1089 | } | 1108 | } |
1090 | 1109 | ||
1091 | if (ceph_sb_to_client(sb) != fsc) { | 1110 | if (ceph_sb_to_client(sb) != fsc) { |
1092 | ceph_mdsc_destroy(fsc); | ||
1093 | destroy_fs_client(fsc); | 1111 | destroy_fs_client(fsc); |
1094 | fsc = ceph_sb_to_client(sb); | 1112 | fsc = ceph_sb_to_client(sb); |
1095 | dout("get_sb got existing client %p\n", fsc); | 1113 | dout("get_sb got existing client %p\n", fsc); |
@@ -1115,7 +1133,6 @@ out_splat: | |||
1115 | goto out_final; | 1133 | goto out_final; |
1116 | 1134 | ||
1117 | out: | 1135 | out: |
1118 | ceph_mdsc_destroy(fsc); | ||
1119 | destroy_fs_client(fsc); | 1136 | destroy_fs_client(fsc); |
1120 | out_final: | 1137 | out_final: |
1121 | dout("ceph_mount fail %ld\n", PTR_ERR(res)); | 1138 | dout("ceph_mount fail %ld\n", PTR_ERR(res)); |
@@ -1139,8 +1156,6 @@ static void ceph_kill_sb(struct super_block *s) | |||
1139 | 1156 | ||
1140 | ceph_fscache_unregister_fs(fsc); | 1157 | ceph_fscache_unregister_fs(fsc); |
1141 | 1158 | ||
1142 | ceph_mdsc_destroy(fsc); | ||
1143 | |||
1144 | destroy_fs_client(fsc); | 1159 | destroy_fs_client(fsc); |
1145 | free_anon_bdev(dev); | 1160 | free_anon_bdev(dev); |
1146 | } | 1161 | } |
@@ -1154,6 +1169,33 @@ static struct file_system_type ceph_fs_type = { | |||
1154 | }; | 1169 | }; |
1155 | MODULE_ALIAS_FS("ceph"); | 1170 | MODULE_ALIAS_FS("ceph"); |
1156 | 1171 | ||
1172 | int ceph_force_reconnect(struct super_block *sb) | ||
1173 | { | ||
1174 | struct ceph_fs_client *fsc = ceph_sb_to_client(sb); | ||
1175 | int err = 0; | ||
1176 | |||
1177 | ceph_umount_begin(sb); | ||
1178 | |||
1179 | /* Make sure all page caches get invalidated. | ||
1180 | * see remove_session_caps_cb() */ | ||
1181 | flush_workqueue(fsc->inode_wq); | ||
1182 | |||
1183 | /* In case we were blacklisted. This also resets | ||
1184 | * all mon/osd connections */ | ||
1185 | ceph_reset_client_addr(fsc->client); | ||
1186 | |||
1187 | ceph_osdc_clear_abort_err(&fsc->client->osdc); | ||
1188 | |||
1189 | fsc->blacklisted = false; | ||
1190 | fsc->mount_state = CEPH_MOUNT_MOUNTED; | ||
1191 | |||
1192 | if (sb->s_root) { | ||
1193 | err = __ceph_do_getattr(d_inode(sb->s_root), NULL, | ||
1194 | CEPH_STAT_CAP_INODE, true); | ||
1195 | } | ||
1196 | return err; | ||
1197 | } | ||
1198 | |||
1157 | static int __init init_ceph(void) | 1199 | static int __init init_ceph(void) |
1158 | { | 1200 | { |
1159 | int ret = init_caches(); | 1201 | int ret = init_caches(); |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6b9f1ee7de85..f98d9247f9cb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
17 | #include <linux/posix_acl.h> | 17 | #include <linux/posix_acl.h> |
18 | #include <linux/refcount.h> | 18 | #include <linux/refcount.h> |
19 | #include <linux/security.h> | ||
19 | 20 | ||
20 | #include <linux/ceph/libceph.h> | 21 | #include <linux/ceph/libceph.h> |
21 | 22 | ||
@@ -31,6 +32,7 @@ | |||
31 | #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ | 32 | #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ |
32 | #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) | 33 | #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) |
33 | 34 | ||
35 | #define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reconnect (clean mode) after being blacklisted */ | ||
34 | #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ | 36 | #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ |
35 | #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ | 37 | #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ |
36 | #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ | 38 | #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ |
@@ -101,6 +103,11 @@ struct ceph_fs_client { | |||
101 | struct ceph_client *client; | 103 | struct ceph_client *client; |
102 | 104 | ||
103 | unsigned long mount_state; | 105 | unsigned long mount_state; |
106 | |||
107 | unsigned long last_auto_reconnect; | ||
108 | bool blacklisted; | ||
109 | |||
110 | u32 filp_gen; | ||
104 | loff_t max_file_size; | 111 | loff_t max_file_size; |
105 | 112 | ||
106 | struct ceph_mds_client *mdsc; | 113 | struct ceph_mds_client *mdsc; |
@@ -395,6 +402,8 @@ struct ceph_inode_info { | |||
395 | struct fscache_cookie *fscache; | 402 | struct fscache_cookie *fscache; |
396 | u32 i_fscache_gen; | 403 | u32 i_fscache_gen; |
397 | #endif | 404 | #endif |
405 | errseq_t i_meta_err; | ||
406 | |||
398 | struct inode vfs_inode; /* at end */ | 407 | struct inode vfs_inode; /* at end */ |
399 | }; | 408 | }; |
400 | 409 | ||
@@ -499,17 +508,16 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, | |||
499 | #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ | 508 | #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ |
500 | #define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ | 509 | #define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ |
501 | #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ | 510 | #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ |
502 | #define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ | 511 | #define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ |
503 | #define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ | 512 | #define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ |
504 | #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ | 513 | #define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ |
505 | #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ | 514 | #define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ |
506 | #define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ | 515 | #define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ |
507 | #define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ | 516 | #define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ |
508 | #define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ | 517 | #define CEPH_I_FLUSH_SNAPS (1 << 9) /* need to flush snaps */ |
509 | #define CEPH_I_FLUSH_SNAPS (1 << 10) /* need to flush snaps */ | 518 | #define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ |
510 | #define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ | 519 | #define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ |
511 | #define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ | 520 | #define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */ |
512 | |||
513 | 521 | ||
514 | /* | 522 | /* |
515 | * Masks of ceph inode work. | 523 | * Masks of ceph inode work. |
@@ -703,6 +711,10 @@ struct ceph_file_info { | |||
703 | 711 | ||
704 | spinlock_t rw_contexts_lock; | 712 | spinlock_t rw_contexts_lock; |
705 | struct list_head rw_contexts; | 713 | struct list_head rw_contexts; |
714 | |||
715 | errseq_t meta_err; | ||
716 | u32 filp_gen; | ||
717 | atomic_t num_locks; | ||
706 | }; | 718 | }; |
707 | 719 | ||
708 | struct ceph_dir_file_info { | 720 | struct ceph_dir_file_info { |
@@ -842,7 +854,8 @@ static inline int default_congestion_kb(void) | |||
842 | } | 854 | } |
843 | 855 | ||
844 | 856 | ||
845 | 857 | /* super.c */ | |
858 | extern int ceph_force_reconnect(struct super_block *sb); | ||
846 | /* snap.c */ | 859 | /* snap.c */ |
847 | struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, | 860 | struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, |
848 | u64 ino); | 861 | u64 ino); |
@@ -959,7 +972,10 @@ static inline bool ceph_security_xattr_wanted(struct inode *in) | |||
959 | #ifdef CONFIG_CEPH_FS_SECURITY_LABEL | 972 | #ifdef CONFIG_CEPH_FS_SECURITY_LABEL |
960 | extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, | 973 | extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, |
961 | struct ceph_acl_sec_ctx *ctx); | 974 | struct ceph_acl_sec_ctx *ctx); |
962 | extern void ceph_security_invalidate_secctx(struct inode *inode); | 975 | static inline void ceph_security_invalidate_secctx(struct inode *inode) |
976 | { | ||
977 | security_inode_invalidate_secctx(inode); | ||
978 | } | ||
963 | #else | 979 | #else |
964 | static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, | 980 | static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, |
965 | struct ceph_acl_sec_ctx *ctx) | 981 | struct ceph_acl_sec_ctx *ctx) |
@@ -1039,7 +1055,6 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, | |||
1039 | struct ceph_mds_session *session); | 1055 | struct ceph_mds_session *session); |
1040 | extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, | 1056 | extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, |
1041 | int mds); | 1057 | int mds); |
1042 | extern int ceph_get_cap_mds(struct inode *inode); | ||
1043 | extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); | 1058 | extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); |
1044 | extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); | 1059 | extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); |
1045 | extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, | 1060 | extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, |
@@ -1058,9 +1073,9 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, | |||
1058 | struct inode *dir, | 1073 | struct inode *dir, |
1059 | int mds, int drop, int unless); | 1074 | int mds, int drop, int unless); |
1060 | 1075 | ||
1061 | extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, | 1076 | extern int ceph_get_caps(struct file *filp, int need, int want, |
1062 | loff_t endoff, int *got, struct page **pinned_page); | 1077 | loff_t endoff, int *got, struct page **pinned_page); |
1063 | extern int ceph_try_get_caps(struct ceph_inode_info *ci, | 1078 | extern int ceph_try_get_caps(struct inode *inode, |
1064 | int need, int want, bool nonblock, int *got); | 1079 | int need, int want, bool nonblock, int *got); |
1065 | 1080 | ||
1066 | /* for counting open files by mode */ | 1081 | /* for counting open files by mode */ |
@@ -1071,7 +1086,7 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); | |||
1071 | extern const struct address_space_operations ceph_aops; | 1086 | extern const struct address_space_operations ceph_aops; |
1072 | extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); | 1087 | extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); |
1073 | extern int ceph_uninline_data(struct file *filp, struct page *locked_page); | 1088 | extern int ceph_uninline_data(struct file *filp, struct page *locked_page); |
1074 | extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); | 1089 | extern int ceph_pool_perm_check(struct inode *inode, int need); |
1075 | extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); | 1090 | extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); |
1076 | 1091 | ||
1077 | /* file.c */ | 1092 | /* file.c */ |
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 939eab7aa219..cb18ee637cb7 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c | |||
@@ -20,7 +20,8 @@ static int __remove_xattr(struct ceph_inode_info *ci, | |||
20 | 20 | ||
21 | static bool ceph_is_valid_xattr(const char *name) | 21 | static bool ceph_is_valid_xattr(const char *name) |
22 | { | 22 | { |
23 | return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || | 23 | return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || |
24 | !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || | ||
24 | !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || | 25 | !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || |
25 | !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); | 26 | !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); |
26 | } | 27 | } |
@@ -892,7 +893,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, | |||
892 | memcpy(value, xattr->val, xattr->val_len); | 893 | memcpy(value, xattr->val, xattr->val_len); |
893 | 894 | ||
894 | if (current->journal_info && | 895 | if (current->journal_info && |
895 | !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) | 896 | !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && |
897 | security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN)) | ||
896 | ci->i_ceph_flags |= CEPH_I_SEC_INITED; | 898 | ci->i_ceph_flags |= CEPH_I_SEC_INITED; |
897 | out: | 899 | out: |
898 | spin_unlock(&ci->i_ceph_lock); | 900 | spin_unlock(&ci->i_ceph_lock); |
@@ -903,11 +905,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) | |||
903 | { | 905 | { |
904 | struct inode *inode = d_inode(dentry); | 906 | struct inode *inode = d_inode(dentry); |
905 | struct ceph_inode_info *ci = ceph_inode(inode); | 907 | struct ceph_inode_info *ci = ceph_inode(inode); |
906 | struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); | ||
907 | bool len_only = (size == 0); | 908 | bool len_only = (size == 0); |
908 | u32 namelen; | 909 | u32 namelen; |
909 | int err; | 910 | int err; |
910 | int i; | ||
911 | 911 | ||
912 | spin_lock(&ci->i_ceph_lock); | 912 | spin_lock(&ci->i_ceph_lock); |
913 | dout("listxattr %p ver=%lld index_ver=%lld\n", inode, | 913 | dout("listxattr %p ver=%lld index_ver=%lld\n", inode, |
@@ -936,33 +936,6 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) | |||
936 | names = __copy_xattr_names(ci, names); | 936 | names = __copy_xattr_names(ci, names); |
937 | size -= namelen; | 937 | size -= namelen; |
938 | } | 938 | } |
939 | |||
940 | |||
941 | /* virtual xattr names, too */ | ||
942 | if (vxattrs) { | ||
943 | for (i = 0; vxattrs[i].name; i++) { | ||
944 | size_t this_len; | ||
945 | |||
946 | if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN) | ||
947 | continue; | ||
948 | if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci)) | ||
949 | continue; | ||
950 | |||
951 | this_len = strlen(vxattrs[i].name) + 1; | ||
952 | namelen += this_len; | ||
953 | if (len_only) | ||
954 | continue; | ||
955 | |||
956 | if (this_len > size) { | ||
957 | err = -ERANGE; | ||
958 | goto out; | ||
959 | } | ||
960 | |||
961 | memcpy(names, vxattrs[i].name, this_len); | ||
962 | names += this_len; | ||
963 | size -= this_len; | ||
964 | } | ||
965 | } | ||
966 | err = namelen; | 939 | err = namelen; |
967 | out: | 940 | out: |
968 | spin_unlock(&ci->i_ceph_lock); | 941 | spin_unlock(&ci->i_ceph_lock); |
@@ -1293,42 +1266,8 @@ out: | |||
1293 | ceph_pagelist_release(pagelist); | 1266 | ceph_pagelist_release(pagelist); |
1294 | return err; | 1267 | return err; |
1295 | } | 1268 | } |
1296 | 1269 | #endif /* CONFIG_CEPH_FS_SECURITY_LABEL */ | |
1297 | void ceph_security_invalidate_secctx(struct inode *inode) | 1270 | #endif /* CONFIG_SECURITY */ |
1298 | { | ||
1299 | security_inode_invalidate_secctx(inode); | ||
1300 | } | ||
1301 | |||
1302 | static int ceph_xattr_set_security_label(const struct xattr_handler *handler, | ||
1303 | struct dentry *unused, struct inode *inode, | ||
1304 | const char *key, const void *buf, | ||
1305 | size_t buflen, int flags) | ||
1306 | { | ||
1307 | if (security_ismaclabel(key)) { | ||
1308 | const char *name = xattr_full_name(handler, key); | ||
1309 | return __ceph_setxattr(inode, name, buf, buflen, flags); | ||
1310 | } | ||
1311 | return -EOPNOTSUPP; | ||
1312 | } | ||
1313 | |||
1314 | static int ceph_xattr_get_security_label(const struct xattr_handler *handler, | ||
1315 | struct dentry *unused, struct inode *inode, | ||
1316 | const char *key, void *buf, size_t buflen) | ||
1317 | { | ||
1318 | if (security_ismaclabel(key)) { | ||
1319 | const char *name = xattr_full_name(handler, key); | ||
1320 | return __ceph_getxattr(inode, name, buf, buflen); | ||
1321 | } | ||
1322 | return -EOPNOTSUPP; | ||
1323 | } | ||
1324 | |||
1325 | static const struct xattr_handler ceph_security_label_handler = { | ||
1326 | .prefix = XATTR_SECURITY_PREFIX, | ||
1327 | .get = ceph_xattr_get_security_label, | ||
1328 | .set = ceph_xattr_set_security_label, | ||
1329 | }; | ||
1330 | #endif | ||
1331 | #endif | ||
1332 | 1271 | ||
1333 | void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) | 1272 | void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) |
1334 | { | 1273 | { |
@@ -1352,9 +1291,6 @@ const struct xattr_handler *ceph_xattr_handlers[] = { | |||
1352 | &posix_acl_access_xattr_handler, | 1291 | &posix_acl_access_xattr_handler, |
1353 | &posix_acl_default_xattr_handler, | 1292 | &posix_acl_default_xattr_handler, |
1354 | #endif | 1293 | #endif |
1355 | #ifdef CONFIG_CEPH_FS_SECURITY_LABEL | ||
1356 | &ceph_security_label_handler, | ||
1357 | #endif | ||
1358 | &ceph_other_xattr_handler, | 1294 | &ceph_other_xattr_handler, |
1359 | NULL, | 1295 | NULL, |
1360 | }; | 1296 | }; |
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 82156da3c650..b9dbda1c26aa 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h | |||
@@ -293,6 +293,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private); | |||
293 | struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); | 293 | struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); |
294 | u64 ceph_client_gid(struct ceph_client *client); | 294 | u64 ceph_client_gid(struct ceph_client *client); |
295 | extern void ceph_destroy_client(struct ceph_client *client); | 295 | extern void ceph_destroy_client(struct ceph_client *client); |
296 | extern void ceph_reset_client_addr(struct ceph_client *client); | ||
296 | extern int __ceph_open_session(struct ceph_client *client, | 297 | extern int __ceph_open_session(struct ceph_client *client, |
297 | unsigned long started); | 298 | unsigned long started); |
298 | extern int ceph_open_session(struct ceph_client *client); | 299 | extern int ceph_open_session(struct ceph_client *client); |
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 23895d178149..c4458dc6a757 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h | |||
@@ -337,6 +337,7 @@ extern void ceph_msgr_flush(void); | |||
337 | extern void ceph_messenger_init(struct ceph_messenger *msgr, | 337 | extern void ceph_messenger_init(struct ceph_messenger *msgr, |
338 | struct ceph_entity_addr *myaddr); | 338 | struct ceph_entity_addr *myaddr); |
339 | extern void ceph_messenger_fini(struct ceph_messenger *msgr); | 339 | extern void ceph_messenger_fini(struct ceph_messenger *msgr); |
340 | extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr); | ||
340 | 341 | ||
341 | extern void ceph_con_init(struct ceph_connection *con, void *private, | 342 | extern void ceph_con_init(struct ceph_connection *con, void *private, |
342 | const struct ceph_connection_operations *ops, | 343 | const struct ceph_connection_operations *ops, |
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index b4d134d3312a..dbb8a6959a73 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h | |||
@@ -109,6 +109,7 @@ extern int ceph_monmap_contains(struct ceph_monmap *m, | |||
109 | 109 | ||
110 | extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); | 110 | extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); |
111 | extern void ceph_monc_stop(struct ceph_mon_client *monc); | 111 | extern void ceph_monc_stop(struct ceph_mon_client *monc); |
112 | extern void ceph_monc_reopen_session(struct ceph_mon_client *monc); | ||
112 | 113 | ||
113 | enum { | 114 | enum { |
114 | CEPH_SUB_MONMAP = 0, | 115 | CEPH_SUB_MONMAP = 0, |
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index ad7fe5d10dcd..eaffbdddf89a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h | |||
@@ -381,6 +381,7 @@ extern void ceph_osdc_cleanup(void); | |||
381 | extern int ceph_osdc_init(struct ceph_osd_client *osdc, | 381 | extern int ceph_osdc_init(struct ceph_osd_client *osdc, |
382 | struct ceph_client *client); | 382 | struct ceph_client *client); |
383 | extern void ceph_osdc_stop(struct ceph_osd_client *osdc); | 383 | extern void ceph_osdc_stop(struct ceph_osd_client *osdc); |
384 | extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc); | ||
384 | 385 | ||
385 | extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, | 386 | extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, |
386 | struct ceph_msg *msg); | 387 | struct ceph_msg *msg); |
@@ -388,6 +389,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, | |||
388 | struct ceph_msg *msg); | 389 | struct ceph_msg *msg); |
389 | void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); | 390 | void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); |
390 | void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); | 391 | void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); |
392 | void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc); | ||
391 | 393 | ||
392 | #define osd_req_op_data(oreq, whch, typ, fld) \ | 394 | #define osd_req_op_data(oreq, whch, typ, fld) \ |
393 | ({ \ | 395 | ({ \ |
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4eeea4d5c3ef..2d568246803f 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/nsproxy.h> | 13 | #include <linux/nsproxy.h> |
14 | #include <linux/parser.h> | 14 | #include <linux/parser.h> |
15 | #include <linux/sched.h> | 15 | #include <linux/sched.h> |
16 | #include <linux/sched/mm.h> | ||
16 | #include <linux/seq_file.h> | 17 | #include <linux/seq_file.h> |
17 | #include <linux/slab.h> | 18 | #include <linux/slab.h> |
18 | #include <linux/statfs.h> | 19 | #include <linux/statfs.h> |
@@ -185,18 +186,34 @@ int ceph_compare_options(struct ceph_options *new_opt, | |||
185 | } | 186 | } |
186 | EXPORT_SYMBOL(ceph_compare_options); | 187 | EXPORT_SYMBOL(ceph_compare_options); |
187 | 188 | ||
189 | /* | ||
190 | * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are | ||
191 | * compatible with (a superset of) GFP_KERNEL. This is because while the | ||
192 | * actual pages are allocated with the specified flags, the page table pages | ||
193 | * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take | ||
194 | * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc(). | ||
195 | * | ||
196 | * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. | ||
197 | */ | ||
188 | void *ceph_kvmalloc(size_t size, gfp_t flags) | 198 | void *ceph_kvmalloc(size_t size, gfp_t flags) |
189 | { | 199 | { |
190 | if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { | 200 | void *p; |
191 | void *ptr = kmalloc(size, flags | __GFP_NOWARN); | 201 | |
192 | if (ptr) | 202 | if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) { |
193 | return ptr; | 203 | p = kvmalloc(size, flags); |
204 | } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) { | ||
205 | unsigned int nofs_flag = memalloc_nofs_save(); | ||
206 | p = kvmalloc(size, GFP_KERNEL); | ||
207 | memalloc_nofs_restore(nofs_flag); | ||
208 | } else { | ||
209 | unsigned int noio_flag = memalloc_noio_save(); | ||
210 | p = kvmalloc(size, GFP_KERNEL); | ||
211 | memalloc_noio_restore(noio_flag); | ||
194 | } | 212 | } |
195 | 213 | ||
196 | return __vmalloc(size, flags, PAGE_KERNEL); | 214 | return p; |
197 | } | 215 | } |
198 | 216 | ||
199 | |||
200 | static int parse_fsid(const char *str, struct ceph_fsid *fsid) | 217 | static int parse_fsid(const char *str, struct ceph_fsid *fsid) |
201 | { | 218 | { |
202 | int i = 0; | 219 | int i = 0; |
@@ -694,6 +711,14 @@ void ceph_destroy_client(struct ceph_client *client) | |||
694 | } | 711 | } |
695 | EXPORT_SYMBOL(ceph_destroy_client); | 712 | EXPORT_SYMBOL(ceph_destroy_client); |
696 | 713 | ||
714 | void ceph_reset_client_addr(struct ceph_client *client) | ||
715 | { | ||
716 | ceph_messenger_reset_nonce(&client->msgr); | ||
717 | ceph_monc_reopen_session(&client->monc); | ||
718 | ceph_osdc_reopen_osds(&client->osdc); | ||
719 | } | ||
720 | EXPORT_SYMBOL(ceph_reset_client_addr); | ||
721 | |||
697 | /* | 722 | /* |
698 | * true if we have the mon map (and have thus joined the cluster) | 723 | * true if we have the mon map (and have thus joined the cluster) |
699 | */ | 724 | */ |
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 962f521c863e..e4cb3db2ee77 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c | |||
@@ -3031,6 +3031,12 @@ static void con_fault(struct ceph_connection *con) | |||
3031 | } | 3031 | } |
3032 | 3032 | ||
3033 | 3033 | ||
3034 | void ceph_messenger_reset_nonce(struct ceph_messenger *msgr) | ||
3035 | { | ||
3036 | u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000; | ||
3037 | msgr->inst.addr.nonce = cpu_to_le32(nonce); | ||
3038 | encode_my_addr(msgr); | ||
3039 | } | ||
3034 | 3040 | ||
3035 | /* | 3041 | /* |
3036 | * initialize a new messenger instance | 3042 | * initialize a new messenger instance |
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 0520bf9825aa..7256c402ebaa 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c | |||
@@ -213,6 +213,13 @@ static void reopen_session(struct ceph_mon_client *monc) | |||
213 | __open_session(monc); | 213 | __open_session(monc); |
214 | } | 214 | } |
215 | 215 | ||
216 | void ceph_monc_reopen_session(struct ceph_mon_client *monc) | ||
217 | { | ||
218 | mutex_lock(&monc->mutex); | ||
219 | reopen_session(monc); | ||
220 | mutex_unlock(&monc->mutex); | ||
221 | } | ||
222 | |||
216 | static void un_backoff(struct ceph_mon_client *monc) | 223 | static void un_backoff(struct ceph_mon_client *monc) |
217 | { | 224 | { |
218 | monc->hunt_mult /= 2; /* reduce by 50% */ | 225 | monc->hunt_mult /= 2; /* reduce by 50% */ |
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 78ae6e8c953d..ba45b074a362 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c | |||
@@ -841,6 +841,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, | |||
841 | struct ceph_pagelist *pagelist; | 841 | struct ceph_pagelist *pagelist; |
842 | size_t payload_len = 0; | 842 | size_t payload_len = 0; |
843 | size_t size; | 843 | size_t size; |
844 | int ret; | ||
844 | 845 | ||
845 | op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); | 846 | op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); |
846 | 847 | ||
@@ -852,20 +853,27 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, | |||
852 | size = strlen(class); | 853 | size = strlen(class); |
853 | BUG_ON(size > (size_t) U8_MAX); | 854 | BUG_ON(size > (size_t) U8_MAX); |
854 | op->cls.class_len = size; | 855 | op->cls.class_len = size; |
855 | ceph_pagelist_append(pagelist, class, size); | 856 | ret = ceph_pagelist_append(pagelist, class, size); |
857 | if (ret) | ||
858 | goto err_pagelist_free; | ||
856 | payload_len += size; | 859 | payload_len += size; |
857 | 860 | ||
858 | op->cls.method_name = method; | 861 | op->cls.method_name = method; |
859 | size = strlen(method); | 862 | size = strlen(method); |
860 | BUG_ON(size > (size_t) U8_MAX); | 863 | BUG_ON(size > (size_t) U8_MAX); |
861 | op->cls.method_len = size; | 864 | op->cls.method_len = size; |
862 | ceph_pagelist_append(pagelist, method, size); | 865 | ret = ceph_pagelist_append(pagelist, method, size); |
866 | if (ret) | ||
867 | goto err_pagelist_free; | ||
863 | payload_len += size; | 868 | payload_len += size; |
864 | 869 | ||
865 | osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); | 870 | osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); |
866 | |||
867 | op->indata_len = payload_len; | 871 | op->indata_len = payload_len; |
868 | return 0; | 872 | return 0; |
873 | |||
874 | err_pagelist_free: | ||
875 | ceph_pagelist_release(pagelist); | ||
876 | return ret; | ||
869 | } | 877 | } |
870 | EXPORT_SYMBOL(osd_req_op_cls_init); | 878 | EXPORT_SYMBOL(osd_req_op_cls_init); |
871 | 879 | ||
@@ -877,6 +885,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, | |||
877 | opcode, 0); | 885 | opcode, 0); |
878 | struct ceph_pagelist *pagelist; | 886 | struct ceph_pagelist *pagelist; |
879 | size_t payload_len; | 887 | size_t payload_len; |
888 | int ret; | ||
880 | 889 | ||
881 | BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); | 890 | BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); |
882 | 891 | ||
@@ -886,10 +895,14 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, | |||
886 | 895 | ||
887 | payload_len = strlen(name); | 896 | payload_len = strlen(name); |
888 | op->xattr.name_len = payload_len; | 897 | op->xattr.name_len = payload_len; |
889 | ceph_pagelist_append(pagelist, name, payload_len); | 898 | ret = ceph_pagelist_append(pagelist, name, payload_len); |
899 | if (ret) | ||
900 | goto err_pagelist_free; | ||
890 | 901 | ||
891 | op->xattr.value_len = size; | 902 | op->xattr.value_len = size; |
892 | ceph_pagelist_append(pagelist, value, size); | 903 | ret = ceph_pagelist_append(pagelist, value, size); |
904 | if (ret) | ||
905 | goto err_pagelist_free; | ||
893 | payload_len += size; | 906 | payload_len += size; |
894 | 907 | ||
895 | op->xattr.cmp_op = cmp_op; | 908 | op->xattr.cmp_op = cmp_op; |
@@ -898,6 +911,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, | |||
898 | ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); | 911 | ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); |
899 | op->indata_len = payload_len; | 912 | op->indata_len = payload_len; |
900 | return 0; | 913 | return 0; |
914 | |||
915 | err_pagelist_free: | ||
916 | ceph_pagelist_release(pagelist); | ||
917 | return ret; | ||
901 | } | 918 | } |
902 | EXPORT_SYMBOL(osd_req_op_xattr_init); | 919 | EXPORT_SYMBOL(osd_req_op_xattr_init); |
903 | 920 | ||
@@ -1488,7 +1505,6 @@ enum calc_target_result { | |||
1488 | 1505 | ||
1489 | static enum calc_target_result calc_target(struct ceph_osd_client *osdc, | 1506 | static enum calc_target_result calc_target(struct ceph_osd_client *osdc, |
1490 | struct ceph_osd_request_target *t, | 1507 | struct ceph_osd_request_target *t, |
1491 | struct ceph_connection *con, | ||
1492 | bool any_change) | 1508 | bool any_change) |
1493 | { | 1509 | { |
1494 | struct ceph_pg_pool_info *pi; | 1510 | struct ceph_pg_pool_info *pi; |
@@ -2272,7 +2288,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) | |||
2272 | dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); | 2288 | dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); |
2273 | 2289 | ||
2274 | again: | 2290 | again: |
2275 | ct_res = calc_target(osdc, &req->r_t, NULL, false); | 2291 | ct_res = calc_target(osdc, &req->r_t, false); |
2276 | if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) | 2292 | if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) |
2277 | goto promote; | 2293 | goto promote; |
2278 | 2294 | ||
@@ -2476,6 +2492,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err) | |||
2476 | } | 2492 | } |
2477 | EXPORT_SYMBOL(ceph_osdc_abort_requests); | 2493 | EXPORT_SYMBOL(ceph_osdc_abort_requests); |
2478 | 2494 | ||
2495 | void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc) | ||
2496 | { | ||
2497 | down_write(&osdc->lock); | ||
2498 | osdc->abort_err = 0; | ||
2499 | up_write(&osdc->lock); | ||
2500 | } | ||
2501 | EXPORT_SYMBOL(ceph_osdc_clear_abort_err); | ||
2502 | |||
2479 | static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) | 2503 | static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) |
2480 | { | 2504 | { |
2481 | if (likely(eb > osdc->epoch_barrier)) { | 2505 | if (likely(eb > osdc->epoch_barrier)) { |
@@ -3087,7 +3111,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) | |||
3087 | lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; | 3111 | lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; |
3088 | } | 3112 | } |
3089 | 3113 | ||
3090 | calc_target(osdc, &lreq->t, NULL, false); | 3114 | calc_target(osdc, &lreq->t, false); |
3091 | osd = lookup_create_osd(osdc, lreq->t.osd, true); | 3115 | osd = lookup_create_osd(osdc, lreq->t.osd, true); |
3092 | link_linger(osd, lreq); | 3116 | link_linger(osd, lreq); |
3093 | 3117 | ||
@@ -3704,7 +3728,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq) | |||
3704 | struct ceph_osd_client *osdc = lreq->osdc; | 3728 | struct ceph_osd_client *osdc = lreq->osdc; |
3705 | enum calc_target_result ct_res; | 3729 | enum calc_target_result ct_res; |
3706 | 3730 | ||
3707 | ct_res = calc_target(osdc, &lreq->t, NULL, true); | 3731 | ct_res = calc_target(osdc, &lreq->t, true); |
3708 | if (ct_res == CALC_TARGET_NEED_RESEND) { | 3732 | if (ct_res == CALC_TARGET_NEED_RESEND) { |
3709 | struct ceph_osd *osd; | 3733 | struct ceph_osd *osd; |
3710 | 3734 | ||
@@ -3776,8 +3800,7 @@ static void scan_requests(struct ceph_osd *osd, | |||
3776 | n = rb_next(n); /* unlink_request(), check_pool_dne() */ | 3800 | n = rb_next(n); /* unlink_request(), check_pool_dne() */ |
3777 | 3801 | ||
3778 | dout("%s req %p tid %llu\n", __func__, req, req->r_tid); | 3802 | dout("%s req %p tid %llu\n", __func__, req, req->r_tid); |
3779 | ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, | 3803 | ct_res = calc_target(osdc, &req->r_t, false); |
3780 | false); | ||
3781 | switch (ct_res) { | 3804 | switch (ct_res) { |
3782 | case CALC_TARGET_NO_ACTION: | 3805 | case CALC_TARGET_NO_ACTION: |
3783 | force_resend_writes = cleared_full || | 3806 | force_resend_writes = cleared_full || |
@@ -3886,7 +3909,7 @@ static void kick_requests(struct ceph_osd_client *osdc, | |||
3886 | n = rb_next(n); | 3909 | n = rb_next(n); |
3887 | 3910 | ||
3888 | if (req->r_t.epoch < osdc->osdmap->epoch) { | 3911 | if (req->r_t.epoch < osdc->osdmap->epoch) { |
3889 | ct_res = calc_target(osdc, &req->r_t, NULL, false); | 3912 | ct_res = calc_target(osdc, &req->r_t, false); |
3890 | if (ct_res == CALC_TARGET_POOL_DNE) { | 3913 | if (ct_res == CALC_TARGET_POOL_DNE) { |
3891 | erase_request(need_resend, req); | 3914 | erase_request(need_resend, req); |
3892 | check_pool_dne(req); | 3915 | check_pool_dne(req); |
@@ -5087,6 +5110,24 @@ out_put_req: | |||
5087 | EXPORT_SYMBOL(ceph_osdc_call); | 5110 | EXPORT_SYMBOL(ceph_osdc_call); |
5088 | 5111 | ||
5089 | /* | 5112 | /* |
5113 | * reset all osd connections | ||
5114 | */ | ||
5115 | void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc) | ||
5116 | { | ||
5117 | struct rb_node *n; | ||
5118 | |||
5119 | down_write(&osdc->lock); | ||
5120 | for (n = rb_first(&osdc->osds); n; ) { | ||
5121 | struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); | ||
5122 | |||
5123 | n = rb_next(n); | ||
5124 | if (!reopen_osd(osd)) | ||
5125 | kick_osd_requests(osd); | ||
5126 | } | ||
5127 | up_write(&osdc->lock); | ||
5128 | } | ||
5129 | |||
5130 | /* | ||
5090 | * init, shutdown | 5131 | * init, shutdown |
5091 | */ | 5132 | */ |
5092 | int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) | 5133 | int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) |
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 90437906b7bc..4e0de14f80bb 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c | |||
@@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) | |||
973 | struct ceph_pg_pool_info, node); | 973 | struct ceph_pg_pool_info, node); |
974 | __remove_pg_pool(&map->pg_pools, pi); | 974 | __remove_pg_pool(&map->pg_pools, pi); |
975 | } | 975 | } |
976 | kfree(map->osd_state); | 976 | kvfree(map->osd_state); |
977 | kfree(map->osd_weight); | 977 | kvfree(map->osd_weight); |
978 | kfree(map->osd_addr); | 978 | kvfree(map->osd_addr); |
979 | kfree(map->osd_primary_affinity); | 979 | kvfree(map->osd_primary_affinity); |
980 | kfree(map->crush_workspace); | 980 | kvfree(map->crush_workspace); |
981 | kfree(map); | 981 | kfree(map); |
982 | } | 982 | } |
983 | 983 | ||
@@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) | |||
986 | * | 986 | * |
987 | * The new elements are properly initialized. | 987 | * The new elements are properly initialized. |
988 | */ | 988 | */ |
989 | static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) | 989 | static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) |
990 | { | 990 | { |
991 | u32 *state; | 991 | u32 *state; |
992 | u32 *weight; | 992 | u32 *weight; |
993 | struct ceph_entity_addr *addr; | 993 | struct ceph_entity_addr *addr; |
994 | u32 to_copy; | ||
994 | int i; | 995 | int i; |
995 | 996 | ||
996 | state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); | 997 | dout("%s old %u new %u\n", __func__, map->max_osd, max); |
997 | if (!state) | 998 | if (max == map->max_osd) |
998 | return -ENOMEM; | 999 | return 0; |
999 | map->osd_state = state; | ||
1000 | 1000 | ||
1001 | weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); | 1001 | state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); |
1002 | if (!weight) | 1002 | weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); |
1003 | addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); | ||
1004 | if (!state || !weight || !addr) { | ||
1005 | kvfree(state); | ||
1006 | kvfree(weight); | ||
1007 | kvfree(addr); | ||
1003 | return -ENOMEM; | 1008 | return -ENOMEM; |
1004 | map->osd_weight = weight; | 1009 | } |
1005 | 1010 | ||
1006 | addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); | 1011 | to_copy = min(map->max_osd, max); |
1007 | if (!addr) | 1012 | if (map->osd_state) { |
1008 | return -ENOMEM; | 1013 | memcpy(state, map->osd_state, to_copy * sizeof(*state)); |
1009 | map->osd_addr = addr; | 1014 | memcpy(weight, map->osd_weight, to_copy * sizeof(*weight)); |
1015 | memcpy(addr, map->osd_addr, to_copy * sizeof(*addr)); | ||
1016 | kvfree(map->osd_state); | ||
1017 | kvfree(map->osd_weight); | ||
1018 | kvfree(map->osd_addr); | ||
1019 | } | ||
1010 | 1020 | ||
1021 | map->osd_state = state; | ||
1022 | map->osd_weight = weight; | ||
1023 | map->osd_addr = addr; | ||
1011 | for (i = map->max_osd; i < max; i++) { | 1024 | for (i = map->max_osd; i < max; i++) { |
1012 | map->osd_state[i] = 0; | 1025 | map->osd_state[i] = 0; |
1013 | map->osd_weight[i] = CEPH_OSD_OUT; | 1026 | map->osd_weight[i] = CEPH_OSD_OUT; |
@@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) | |||
1017 | if (map->osd_primary_affinity) { | 1030 | if (map->osd_primary_affinity) { |
1018 | u32 *affinity; | 1031 | u32 *affinity; |
1019 | 1032 | ||
1020 | affinity = krealloc(map->osd_primary_affinity, | 1033 | affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), |
1021 | max*sizeof(*affinity), GFP_NOFS); | 1034 | GFP_NOFS); |
1022 | if (!affinity) | 1035 | if (!affinity) |
1023 | return -ENOMEM; | 1036 | return -ENOMEM; |
1024 | map->osd_primary_affinity = affinity; | ||
1025 | 1037 | ||
1038 | memcpy(affinity, map->osd_primary_affinity, | ||
1039 | to_copy * sizeof(*affinity)); | ||
1040 | kvfree(map->osd_primary_affinity); | ||
1041 | |||
1042 | map->osd_primary_affinity = affinity; | ||
1026 | for (i = map->max_osd; i < max; i++) | 1043 | for (i = map->max_osd; i < max; i++) |
1027 | map->osd_primary_affinity[i] = | 1044 | map->osd_primary_affinity[i] = |
1028 | CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; | 1045 | CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; |
@@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) | |||
1043 | 1060 | ||
1044 | work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); | 1061 | work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); |
1045 | dout("%s work_size %zu bytes\n", __func__, work_size); | 1062 | dout("%s work_size %zu bytes\n", __func__, work_size); |
1046 | workspace = kmalloc(work_size, GFP_NOIO); | 1063 | workspace = ceph_kvmalloc(work_size, GFP_NOIO); |
1047 | if (!workspace) { | 1064 | if (!workspace) { |
1048 | crush_destroy(crush); | 1065 | crush_destroy(crush); |
1049 | return -ENOMEM; | 1066 | return -ENOMEM; |
@@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) | |||
1052 | 1069 | ||
1053 | if (map->crush) | 1070 | if (map->crush) |
1054 | crush_destroy(map->crush); | 1071 | crush_destroy(map->crush); |
1055 | kfree(map->crush_workspace); | 1072 | kvfree(map->crush_workspace); |
1056 | map->crush = crush; | 1073 | map->crush = crush; |
1057 | map->crush_workspace = workspace; | 1074 | map->crush_workspace = workspace; |
1058 | return 0; | 1075 | return 0; |
@@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) | |||
1298 | if (!map->osd_primary_affinity) { | 1315 | if (!map->osd_primary_affinity) { |
1299 | int i; | 1316 | int i; |
1300 | 1317 | ||
1301 | map->osd_primary_affinity = kmalloc_array(map->max_osd, | 1318 | map->osd_primary_affinity = ceph_kvmalloc( |
1302 | sizeof(u32), | 1319 | array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), |
1303 | GFP_NOFS); | 1320 | GFP_NOFS); |
1304 | if (!map->osd_primary_affinity) | 1321 | if (!map->osd_primary_affinity) |
1305 | return -ENOMEM; | 1322 | return -ENOMEM; |
1306 | 1323 | ||
@@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end, | |||
1321 | 1338 | ||
1322 | ceph_decode_32_safe(p, end, len, e_inval); | 1339 | ceph_decode_32_safe(p, end, len, e_inval); |
1323 | if (len == 0) { | 1340 | if (len == 0) { |
1324 | kfree(map->osd_primary_affinity); | 1341 | kvfree(map->osd_primary_affinity); |
1325 | map->osd_primary_affinity = NULL; | 1342 | map->osd_primary_affinity = NULL; |
1326 | return 0; | 1343 | return 0; |
1327 | } | 1344 | } |