summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2019-09-25 13:21:13 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2019-09-25 13:21:13 -0400
commitf41def397161053eb0d3ed6861ef65985efbf293 (patch)
tree28c03e8f26fc975ab059ff407b0c3d9165bc489f
parent7b1373dd6e86f3a222590ae404a400e699b32884 (diff)
parent3ee5a7015c8b7cb4de21f7345f8381946f2fce55 (diff)
Merge tag 'ceph-for-5.4-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov: "The highlights are: - automatic recovery of a blacklisted filesystem session (Zheng Yan). This is disabled by default and can be enabled by mounting with the new "recover_session=clean" option. - serialize buffered reads and O_DIRECT writes (Jeff Layton). Care is taken to avoid serializing O_DIRECT reads and writes with each other, this is based on the exclusion scheme from NFS. - handle large osdmaps better in the face of fragmented memory (myself) - don't limit what security.* xattrs can be get or set (Jeff Layton). We were overly restrictive here, unnecessarily preventing things like file capability sets stored in security.capability from working. - allow copy_file_range() within the same inode and across different filesystems within the same cluster (Luis Henriques)" * tag 'ceph-for-5.4-rc1' of git://github.com/ceph/ceph-client: (41 commits) ceph: call ceph_mdsc_destroy from destroy_fs_client libceph: use ceph_kvmalloc() for osdmap arrays libceph: avoid a __vmalloc() deadlock in ceph_kvmalloc() ceph: allow object copies across different filesystems in the same cluster ceph: include ceph_debug.h in cache.c ceph: move static keyword to the front of declarations rbd: pull rbd_img_request_create() dout out into the callers ceph: reconnect connection if session hang in opening state libceph: drop unused con parameter of calc_target() ceph: use release_pages() directly rbd: fix response length parameter for encoded strings ceph: allow arbitrary security.* xattrs ceph: only set CEPH_I_SEC_INITED if we got a MAC label ceph: turn ceph_security_invalidate_secctx into static inline ceph: add buffered/direct exclusionary locking for reads and writes libceph: handle OSD op ceph_pagelist_append() errors ceph: don't return a value from void function ceph: don't freeze during write page faults ceph: update the mtime when truncating up ceph: fix indentation in __get_snap_name() ...
-rw-r--r--Documentation/filesystems/ceph.txt14
-rw-r--r--drivers/block/rbd.c18
-rw-r--r--fs/ceph/Makefile2
-rw-r--r--fs/ceph/addr.c61
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c173
-rw-r--r--fs/ceph/debugfs.c1
-rw-r--r--fs/ceph/export.c60
-rw-r--r--fs/ceph/file.c104
-rw-r--r--fs/ceph/inode.c50
-rw-r--r--fs/ceph/io.c163
-rw-r--r--fs/ceph/io.h12
-rw-r--r--fs/ceph/locks.c8
-rw-r--r--fs/ceph/mds_client.c110
-rw-r--r--fs/ceph/mds_client.h8
-rw-r--r--fs/ceph/super.c52
-rw-r--r--fs/ceph/super.h49
-rw-r--r--fs/ceph/xattr.c76
-rw-r--r--include/linux/ceph/libceph.h1
-rw-r--r--include/linux/ceph/messenger.h1
-rw-r--r--include/linux/ceph/mon_client.h1
-rw-r--r--include/linux/ceph/osd_client.h2
-rw-r--r--net/ceph/ceph_common.c37
-rw-r--r--net/ceph/messenger.c6
-rw-r--r--net/ceph/mon_client.c7
-rw-r--r--net/ceph/osd_client.c65
-rw-r--r--net/ceph/osdmap.c69
27 files changed, 767 insertions, 385 deletions
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
index d2c6a5ccf0f5..b19b6a03f91c 100644
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@@ -158,6 +158,20 @@ Mount Options
158 copies. Currently, it's only used in copy_file_range, which will revert 158 copies. Currently, it's only used in copy_file_range, which will revert
159 to the default VFS implementation if this option is used. 159 to the default VFS implementation if this option is used.
160 160
161 recover_session=<no|clean>
162 Set auto reconnect mode in the case where the client is blacklisted. The
163 available modes are "no" and "clean". The default is "no".
164
165 * no: never attempt to reconnect when client detects that it has been
166 blacklisted. Operations will generally fail after being blacklisted.
167
168 * clean: client reconnects to the ceph cluster automatically when it
169 detects that it has been blacklisted. During reconnect, client drops
170 dirty data/metadata, invalidates page caches and writable file handles.
171 After reconnect, file locks become stale because the MDS loses track
172 of them. If an inode contains any stale file locks, read/write on the
173 inode is not allowed until applications release all stale file locks.
174
161More Information 175More Information
162================ 176================
163 177
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c8fb886aebd4..7c4350c0fb77 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1754,8 +1754,6 @@ static struct rbd_img_request *rbd_img_request_create(
1754 mutex_init(&img_request->state_mutex); 1754 mutex_init(&img_request->state_mutex);
1755 kref_init(&img_request->kref); 1755 kref_init(&img_request->kref);
1756 1756
1757 dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1758 obj_op_name(op_type), img_request);
1759 return img_request; 1757 return img_request;
1760} 1758}
1761 1759
@@ -2944,6 +2942,9 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
2944 __set_bit(IMG_REQ_CHILD, &child_img_req->flags); 2942 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2945 child_img_req->obj_request = obj_req; 2943 child_img_req->obj_request = obj_req;
2946 2944
2945 dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
2946 obj_req);
2947
2947 if (!rbd_img_is_write(img_req)) { 2948 if (!rbd_img_is_write(img_req)) {
2948 switch (img_req->data_type) { 2949 switch (img_req->data_type) {
2949 case OBJ_REQUEST_BIO: 2950 case OBJ_REQUEST_BIO:
@@ -4877,6 +4878,9 @@ static void rbd_queue_workfn(struct work_struct *work)
4877 img_request->rq = rq; 4878 img_request->rq = rq;
4878 snapc = NULL; /* img_request consumes a ref */ 4879 snapc = NULL; /* img_request consumes a ref */
4879 4880
4881 dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
4882 img_request, obj_op_name(op_type), offset, length);
4883
4880 if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) 4884 if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
4881 result = rbd_img_fill_nodata(img_request, offset, length); 4885 result = rbd_img_fill_nodata(img_request, offset, length);
4882 else 4886 else
@@ -5669,17 +5673,20 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5669 5673
5670static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 5674static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5671{ 5675{
5676 size_t size;
5672 void *reply_buf; 5677 void *reply_buf;
5673 int ret; 5678 int ret;
5674 void *p; 5679 void *p;
5675 5680
5676 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 5681 /* Response will be an encoded string, which includes a length */
5682 size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
5683 reply_buf = kzalloc(size, GFP_KERNEL);
5677 if (!reply_buf) 5684 if (!reply_buf)
5678 return -ENOMEM; 5685 return -ENOMEM;
5679 5686
5680 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5687 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5681 &rbd_dev->header_oloc, "get_object_prefix", 5688 &rbd_dev->header_oloc, "get_object_prefix",
5682 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 5689 NULL, 0, reply_buf, size);
5683 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5690 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5684 if (ret < 0) 5691 if (ret < 0)
5685 goto out; 5692 goto out;
@@ -6696,7 +6703,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6696 dout("rbd id object name is %s\n", oid.name); 6703 dout("rbd id object name is %s\n", oid.name);
6697 6704
6698 /* Response will be an encoded string, which includes a length */ 6705 /* Response will be an encoded string, which includes a length */
6699
6700 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 6706 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6701 response = kzalloc(size, GFP_NOIO); 6707 response = kzalloc(size, GFP_NOIO);
6702 if (!response) { 6708 if (!response) {
@@ -6708,7 +6714,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6708 6714
6709 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 6715 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6710 "get_id", NULL, 0, 6716 "get_id", NULL, 0,
6711 response, RBD_IMAGE_ID_LEN_MAX); 6717 response, size);
6712 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 6718 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6713 if (ret == -ENOENT) { 6719 if (ret == -ENOENT) {
6714 image_id = kstrdup("", GFP_KERNEL); 6720 image_id = kstrdup("", GFP_KERNEL);
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index a699e320393f..c1da294418d1 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@
6obj-$(CONFIG_CEPH_FS) += ceph.o 6obj-$(CONFIG_CEPH_FS) += ceph.o
7 7
8ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 8ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
9 export.o caps.o snap.o xattr.o quota.o \ 9 export.o caps.o snap.o xattr.o quota.o io.o \
10 mds_client.o mdsmap.o strings.o ceph_frag.o \ 10 mds_client.o mdsmap.o strings.o ceph_frag.o \
11 debugfs.o 11 debugfs.o
12 12
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index b3c8b886bf64..7ab616601141 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
189{ 189{
190 struct inode *inode = file_inode(filp); 190 struct inode *inode = file_inode(filp);
191 struct ceph_inode_info *ci = ceph_inode(inode); 191 struct ceph_inode_info *ci = ceph_inode(inode);
192 struct ceph_osd_client *osdc = 192 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
193 &ceph_inode_to_client(inode)->client->osdc;
194 int err = 0; 193 int err = 0;
195 u64 off = page_offset(page); 194 u64 off = page_offset(page);
196 u64 len = PAGE_SIZE; 195 u64 len = PAGE_SIZE;
@@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
219 218
220 dout("readpage inode %p file %p page %p index %lu\n", 219 dout("readpage inode %p file %p page %p index %lu\n",
221 inode, filp, page, page->index); 220 inode, filp, page, page->index);
222 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 221 err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
223 off, &len, 222 &ci->i_layout, off, &len,
224 ci->i_truncate_seq, ci->i_truncate_size, 223 ci->i_truncate_seq, ci->i_truncate_size,
225 &page, 1, 0); 224 &page, 1, 0);
226 if (err == -ENOENT) 225 if (err == -ENOENT)
@@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
228 if (err < 0) { 227 if (err < 0) {
229 SetPageError(page); 228 SetPageError(page);
230 ceph_fscache_readpage_cancel(inode, page); 229 ceph_fscache_readpage_cancel(inode, page);
230 if (err == -EBLACKLISTED)
231 fsc->blacklisted = true;
231 goto out; 232 goto out;
232 } 233 }
233 if (err < PAGE_SIZE) 234 if (err < PAGE_SIZE)
@@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req)
266 int i; 267 int i;
267 268
268 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); 269 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
270 if (rc == -EBLACKLISTED)
271 ceph_inode_to_client(inode)->blacklisted = true;
269 272
270 /* unlock all pages, zeroing any data we didn't read */ 273 /* unlock all pages, zeroing any data we didn't read */
271 osd_data = osd_req_op_extent_osd_data(req, 0); 274 osd_data = osd_req_op_extent_osd_data(req, 0);
@@ -323,7 +326,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
323 /* caller of readpages does not hold buffer and read caps 326 /* caller of readpages does not hold buffer and read caps
324 * (fadvise, madvise and readahead cases) */ 327 * (fadvise, madvise and readahead cases) */
325 int want = CEPH_CAP_FILE_CACHE; 328 int want = CEPH_CAP_FILE_CACHE;
326 ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); 329 ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want,
330 true, &got);
327 if (ret < 0) { 331 if (ret < 0) {
328 dout("start_read %p, error getting cap\n", inode); 332 dout("start_read %p, error getting cap\n", inode);
329 } else if (!(got & want)) { 333 } else if (!(got & want)) {
@@ -569,7 +573,7 @@ static u64 get_writepages_data_length(struct inode *inode,
569/* 573/*
570 * Write a single page, but leave the page locked. 574 * Write a single page, but leave the page locked.
571 * 575 *
572 * If we get a write error, set the page error bit, but still adjust the 576 * If we get a write error, mark the mapping for error, but still adjust the
573 * dirty page accounting (i.e., page is no longer dirty). 577 * dirty page accounting (i.e., page is no longer dirty).
574 */ 578 */
575static int writepage_nounlock(struct page *page, struct writeback_control *wbc) 579static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
@@ -640,9 +644,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
640 end_page_writeback(page); 644 end_page_writeback(page);
641 return err; 645 return err;
642 } 646 }
647 if (err == -EBLACKLISTED)
648 fsc->blacklisted = true;
643 dout("writepage setting page/mapping error %d %p\n", 649 dout("writepage setting page/mapping error %d %p\n",
644 err, page); 650 err, page);
645 SetPageError(page);
646 mapping_set_error(&inode->i_data, err); 651 mapping_set_error(&inode->i_data, err);
647 wbc->pages_skipped++; 652 wbc->pages_skipped++;
648 } else { 653 } else {
@@ -680,23 +685,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
680} 685}
681 686
682/* 687/*
683 * lame release_pages helper. release_pages() isn't exported to
684 * modules.
685 */
686static void ceph_release_pages(struct page **pages, int num)
687{
688 struct pagevec pvec;
689 int i;
690
691 pagevec_init(&pvec);
692 for (i = 0; i < num; i++) {
693 if (pagevec_add(&pvec, pages[i]) == 0)
694 pagevec_release(&pvec);
695 }
696 pagevec_release(&pvec);
697}
698
699/*
700 * async writeback completion handler. 688 * async writeback completion handler.
701 * 689 *
702 * If we get an error, set the mapping error bit, but not the individual 690 * If we get an error, set the mapping error bit, but not the individual
@@ -720,6 +708,8 @@ static void writepages_finish(struct ceph_osd_request *req)
720 if (rc < 0) { 708 if (rc < 0) {
721 mapping_set_error(mapping, rc); 709 mapping_set_error(mapping, rc);
722 ceph_set_error_write(ci); 710 ceph_set_error_write(ci);
711 if (rc == -EBLACKLISTED)
712 fsc->blacklisted = true;
723 } else { 713 } else {
724 ceph_clear_error_write(ci); 714 ceph_clear_error_write(ci);
725 } 715 }
@@ -769,7 +759,7 @@ static void writepages_finish(struct ceph_osd_request *req)
769 dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", 759 dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
770 inode, osd_data->length, rc >= 0 ? num_pages : 0); 760 inode, osd_data->length, rc >= 0 ? num_pages : 0);
771 761
772 ceph_release_pages(osd_data->pages, num_pages); 762 release_pages(osd_data->pages, num_pages);
773 } 763 }
774 764
775 ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); 765 ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
@@ -1452,7 +1442,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
1452 want = CEPH_CAP_FILE_CACHE; 1442 want = CEPH_CAP_FILE_CACHE;
1453 1443
1454 got = 0; 1444 got = 0;
1455 err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); 1445 err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1,
1446 &got, &pinned_page);
1456 if (err < 0) 1447 if (err < 0)
1457 goto out_restore; 1448 goto out_restore;
1458 1449
@@ -1540,6 +1531,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
1540 if (!prealloc_cf) 1531 if (!prealloc_cf)
1541 return VM_FAULT_OOM; 1532 return VM_FAULT_OOM;
1542 1533
1534 sb_start_pagefault(inode->i_sb);
1543 ceph_block_sigs(&oldset); 1535 ceph_block_sigs(&oldset);
1544 1536
1545 if (ci->i_inline_version != CEPH_INLINE_NONE) { 1537 if (ci->i_inline_version != CEPH_INLINE_NONE) {
@@ -1568,7 +1560,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
1568 want = CEPH_CAP_FILE_BUFFER; 1560 want = CEPH_CAP_FILE_BUFFER;
1569 1561
1570 got = 0; 1562 got = 0;
1571 err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, 1563 err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len,
1572 &got, NULL); 1564 &got, NULL);
1573 if (err < 0) 1565 if (err < 0)
1574 goto out_free; 1566 goto out_free;
@@ -1614,6 +1606,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
1614 ceph_put_cap_refs(ci, got); 1606 ceph_put_cap_refs(ci, got);
1615out_free: 1607out_free:
1616 ceph_restore_sigs(&oldset); 1608 ceph_restore_sigs(&oldset);
1609 sb_end_pagefault(inode->i_sb);
1617 ceph_free_cap_flush(prealloc_cf); 1610 ceph_free_cap_flush(prealloc_cf);
1618 if (err < 0) 1611 if (err < 0)
1619 ret = vmf_error(err); 1612 ret = vmf_error(err);
@@ -1946,12 +1939,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
1946 1939
1947 if (err >= 0 || err == -ENOENT) 1940 if (err >= 0 || err == -ENOENT)
1948 have |= POOL_READ; 1941 have |= POOL_READ;
1949 else if (err != -EPERM) 1942 else if (err != -EPERM) {
1943 if (err == -EBLACKLISTED)
1944 fsc->blacklisted = true;
1950 goto out_unlock; 1945 goto out_unlock;
1946 }
1951 1947
1952 if (err2 == 0 || err2 == -EEXIST) 1948 if (err2 == 0 || err2 == -EEXIST)
1953 have |= POOL_WRITE; 1949 have |= POOL_WRITE;
1954 else if (err2 != -EPERM) { 1950 else if (err2 != -EPERM) {
1951 if (err2 == -EBLACKLISTED)
1952 fsc->blacklisted = true;
1955 err = err2; 1953 err = err2;
1956 goto out_unlock; 1954 goto out_unlock;
1957 } 1955 }
@@ -1989,10 +1987,11 @@ out:
1989 return err; 1987 return err;
1990} 1988}
1991 1989
1992int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) 1990int ceph_pool_perm_check(struct inode *inode, int need)
1993{ 1991{
1994 s64 pool; 1992 struct ceph_inode_info *ci = ceph_inode(inode);
1995 struct ceph_string *pool_ns; 1993 struct ceph_string *pool_ns;
1994 s64 pool;
1996 int ret, flags; 1995 int ret, flags;
1997 1996
1998 if (ci->i_vino.snap != CEPH_NOSNAP) { 1997 if (ci->i_vino.snap != CEPH_NOSNAP) {
@@ -2004,7 +2003,7 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
2004 return 0; 2003 return 0;
2005 } 2004 }
2006 2005
2007 if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), 2006 if (ceph_test_mount_opt(ceph_inode_to_client(inode),
2008 NOPOOLPERM)) 2007 NOPOOLPERM))
2009 return 0; 2008 return 0;
2010 2009
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index bc90cf6ad7ed..b2ec29eeb4c4 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -6,6 +6,8 @@
6 * Written by Milosz Tanski (milosz@adfin.com) 6 * Written by Milosz Tanski (milosz@adfin.com)
7 */ 7 */
8 8
9#include <linux/ceph/ceph_debug.h>
10
9#include "super.h" 11#include "super.h"
10#include "cache.h" 12#include "cache.h"
11 13
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ce0f5658720a..d3b9c9d5c1bd 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -458,37 +458,6 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
458} 458}
459 459
460/* 460/*
461 * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
462 */
463static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
464{
465 struct ceph_cap *cap;
466 int mds = -1;
467 struct rb_node *p;
468
469 /* prefer mds with WR|BUFFER|EXCL caps */
470 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
471 cap = rb_entry(p, struct ceph_cap, ci_node);
472 mds = cap->mds;
473 if (cap->issued & (CEPH_CAP_FILE_WR |
474 CEPH_CAP_FILE_BUFFER |
475 CEPH_CAP_FILE_EXCL))
476 break;
477 }
478 return mds;
479}
480
481int ceph_get_cap_mds(struct inode *inode)
482{
483 struct ceph_inode_info *ci = ceph_inode(inode);
484 int mds;
485 spin_lock(&ci->i_ceph_lock);
486 mds = __ceph_get_cap_mds(ceph_inode(inode));
487 spin_unlock(&ci->i_ceph_lock);
488 return mds;
489}
490
491/*
492 * Called under i_ceph_lock. 461 * Called under i_ceph_lock.
493 */ 462 */
494static void __insert_cap_node(struct ceph_inode_info *ci, 463static void __insert_cap_node(struct ceph_inode_info *ci,
@@ -628,7 +597,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
628/* 597/*
629 * Add a capability under the given MDS session. 598 * Add a capability under the given MDS session.
630 * 599 *
631 * Caller should hold session snap_rwsem (read) and s_mutex. 600 * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
632 * 601 *
633 * @fmode is the open file mode, if we are opening a file, otherwise 602 * @fmode is the open file mode, if we are opening a file, otherwise
634 * it is < 0. (This is so we can atomically add the cap and add an 603 * it is < 0. (This is so we can atomically add the cap and add an
@@ -645,6 +614,9 @@ void ceph_add_cap(struct inode *inode,
645 struct ceph_cap *cap; 614 struct ceph_cap *cap;
646 int mds = session->s_mds; 615 int mds = session->s_mds;
647 int actual_wanted; 616 int actual_wanted;
617 u32 gen;
618
619 lockdep_assert_held(&ci->i_ceph_lock);
648 620
649 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, 621 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
650 session->s_mds, cap_id, ceph_cap_string(issued), seq); 622 session->s_mds, cap_id, ceph_cap_string(issued), seq);
@@ -656,6 +628,10 @@ void ceph_add_cap(struct inode *inode,
656 if (fmode >= 0) 628 if (fmode >= 0)
657 wanted |= ceph_caps_for_mode(fmode); 629 wanted |= ceph_caps_for_mode(fmode);
658 630
631 spin_lock(&session->s_gen_ttl_lock);
632 gen = session->s_cap_gen;
633 spin_unlock(&session->s_gen_ttl_lock);
634
659 cap = __get_cap_for_mds(ci, mds); 635 cap = __get_cap_for_mds(ci, mds);
660 if (!cap) { 636 if (!cap) {
661 cap = *new_cap; 637 cap = *new_cap;
@@ -681,7 +657,7 @@ void ceph_add_cap(struct inode *inode,
681 list_move_tail(&cap->session_caps, &session->s_caps); 657 list_move_tail(&cap->session_caps, &session->s_caps);
682 spin_unlock(&session->s_cap_lock); 658 spin_unlock(&session->s_cap_lock);
683 659
684 if (cap->cap_gen < session->s_cap_gen) 660 if (cap->cap_gen < gen)
685 cap->issued = cap->implemented = CEPH_CAP_PIN; 661 cap->issued = cap->implemented = CEPH_CAP_PIN;
686 662
687 /* 663 /*
@@ -775,7 +751,7 @@ void ceph_add_cap(struct inode *inode,
775 cap->seq = seq; 751 cap->seq = seq;
776 cap->issue_seq = seq; 752 cap->issue_seq = seq;
777 cap->mseq = mseq; 753 cap->mseq = mseq;
778 cap->cap_gen = session->s_cap_gen; 754 cap->cap_gen = gen;
779 755
780 if (fmode >= 0) 756 if (fmode >= 0)
781 __ceph_get_fmode(ci, fmode); 757 __ceph_get_fmode(ci, fmode);
@@ -1284,10 +1260,6 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
1284 * Make note of max_size reported/requested from mds, revoked caps 1260 * Make note of max_size reported/requested from mds, revoked caps
1285 * that have now been implemented. 1261 * that have now been implemented.
1286 * 1262 *
1287 * Make half-hearted attempt ot to invalidate page cache if we are
1288 * dropping RDCACHE. Note that this will leave behind locked pages
1289 * that we'll then need to deal with elsewhere.
1290 *
1291 * Return non-zero if delayed release, or we experienced an error 1263 * Return non-zero if delayed release, or we experienced an error
1292 * such that the caller should requeue + retry later. 1264 * such that the caller should requeue + retry later.
1293 * 1265 *
@@ -1746,11 +1718,11 @@ static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
1746 * Add dirty inode to the flushing list. Assigned a seq number so we 1718 * Add dirty inode to the flushing list. Assigned a seq number so we
1747 * can wait for caps to flush without starving. 1719 * can wait for caps to flush without starving.
1748 * 1720 *
1749 * Called under i_ceph_lock. 1721 * Called under i_ceph_lock. Returns the flush tid.
1750 */ 1722 */
1751static int __mark_caps_flushing(struct inode *inode, 1723static u64 __mark_caps_flushing(struct inode *inode,
1752 struct ceph_mds_session *session, bool wake, 1724 struct ceph_mds_session *session, bool wake,
1753 u64 *flush_tid, u64 *oldest_flush_tid) 1725 u64 *oldest_flush_tid)
1754{ 1726{
1755 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1727 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1756 struct ceph_inode_info *ci = ceph_inode(inode); 1728 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1789,8 +1761,7 @@ static int __mark_caps_flushing(struct inode *inode,
1789 1761
1790 list_add_tail(&cf->i_list, &ci->i_cap_flush_list); 1762 list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
1791 1763
1792 *flush_tid = cf->tid; 1764 return cf->tid;
1793 return flushing;
1794} 1765}
1795 1766
1796/* 1767/*
@@ -2028,11 +1999,6 @@ retry_locked:
2028 } 1999 }
2029 2000
2030ack: 2001ack:
2031 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
2032 dout(" skipping %p I_NOFLUSH set\n", inode);
2033 continue;
2034 }
2035
2036 if (session && session != cap->session) { 2002 if (session && session != cap->session) {
2037 dout("oops, wrong session %p mutex\n", session); 2003 dout("oops, wrong session %p mutex\n", session);
2038 mutex_unlock(&session->s_mutex); 2004 mutex_unlock(&session->s_mutex);
@@ -2080,9 +2046,9 @@ ack:
2080 } 2046 }
2081 2047
2082 if (cap == ci->i_auth_cap && ci->i_dirty_caps) { 2048 if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
2083 flushing = __mark_caps_flushing(inode, session, false, 2049 flushing = ci->i_dirty_caps;
2084 &flush_tid, 2050 flush_tid = __mark_caps_flushing(inode, session, false,
2085 &oldest_flush_tid); 2051 &oldest_flush_tid);
2086 } else { 2052 } else {
2087 flushing = 0; 2053 flushing = 0;
2088 flush_tid = 0; 2054 flush_tid = 0;
@@ -2130,16 +2096,11 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
2130retry: 2096retry:
2131 spin_lock(&ci->i_ceph_lock); 2097 spin_lock(&ci->i_ceph_lock);
2132retry_locked: 2098retry_locked:
2133 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
2134 spin_unlock(&ci->i_ceph_lock);
2135 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
2136 goto out;
2137 }
2138 if (ci->i_dirty_caps && ci->i_auth_cap) { 2099 if (ci->i_dirty_caps && ci->i_auth_cap) {
2139 struct ceph_cap *cap = ci->i_auth_cap; 2100 struct ceph_cap *cap = ci->i_auth_cap;
2140 int delayed; 2101 int delayed;
2141 2102
2142 if (!session || session != cap->session) { 2103 if (session != cap->session) {
2143 spin_unlock(&ci->i_ceph_lock); 2104 spin_unlock(&ci->i_ceph_lock);
2144 if (session) 2105 if (session)
2145 mutex_unlock(&session->s_mutex); 2106 mutex_unlock(&session->s_mutex);
@@ -2161,8 +2122,9 @@ retry_locked:
2161 goto retry_locked; 2122 goto retry_locked;
2162 } 2123 }
2163 2124
2164 flushing = __mark_caps_flushing(inode, session, true, 2125 flushing = ci->i_dirty_caps;
2165 &flush_tid, &oldest_flush_tid); 2126 flush_tid = __mark_caps_flushing(inode, session, true,
2127 &oldest_flush_tid);
2166 2128
2167 /* __send_cap drops i_ceph_lock */ 2129 /* __send_cap drops i_ceph_lock */
2168 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, 2130 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
@@ -2261,35 +2223,45 @@ static int unsafe_request_wait(struct inode *inode)
2261 2223
2262int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) 2224int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2263{ 2225{
2226 struct ceph_file_info *fi = file->private_data;
2264 struct inode *inode = file->f_mapping->host; 2227 struct inode *inode = file->f_mapping->host;
2265 struct ceph_inode_info *ci = ceph_inode(inode); 2228 struct ceph_inode_info *ci = ceph_inode(inode);
2266 u64 flush_tid; 2229 u64 flush_tid;
2267 int ret; 2230 int ret, err;
2268 int dirty; 2231 int dirty;
2269 2232
2270 dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); 2233 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
2271 2234
2272 ret = file_write_and_wait_range(file, start, end); 2235 ret = file_write_and_wait_range(file, start, end);
2273 if (ret < 0)
2274 goto out;
2275
2276 if (datasync) 2236 if (datasync)
2277 goto out; 2237 goto out;
2278 2238
2279 dirty = try_flush_caps(inode, &flush_tid); 2239 dirty = try_flush_caps(inode, &flush_tid);
2280 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); 2240 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
2281 2241
2282 ret = unsafe_request_wait(inode); 2242 err = unsafe_request_wait(inode);
2283 2243
2284 /* 2244 /*
2285 * only wait on non-file metadata writeback (the mds 2245 * only wait on non-file metadata writeback (the mds
2286 * can recover size and mtime, so we don't need to 2246 * can recover size and mtime, so we don't need to
2287 * wait for that) 2247 * wait for that)
2288 */ 2248 */
2289 if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { 2249 if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
2290 ret = wait_event_interruptible(ci->i_cap_wq, 2250 err = wait_event_interruptible(ci->i_cap_wq,
2291 caps_are_flushed(inode, flush_tid)); 2251 caps_are_flushed(inode, flush_tid));
2292 } 2252 }
2253
2254 if (err < 0)
2255 ret = err;
2256
2257 if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) {
2258 spin_lock(&file->f_lock);
2259 err = errseq_check_and_advance(&ci->i_meta_err,
2260 &fi->meta_err);
2261 spin_unlock(&file->f_lock);
2262 if (err < 0)
2263 ret = err;
2264 }
2293out: 2265out:
2294 dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); 2266 dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
2295 return ret; 2267 return ret;
@@ -2560,10 +2532,15 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
2560 * 2532 *
2561 * FIXME: how does a 0 return differ from -EAGAIN? 2533 * FIXME: how does a 0 return differ from -EAGAIN?
2562 */ 2534 */
2563static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, 2535enum {
2564 loff_t endoff, bool nonblock, int *got) 2536 NON_BLOCKING = 1,
2537 CHECK_FILELOCK = 2,
2538};
2539
2540static int try_get_cap_refs(struct inode *inode, int need, int want,
2541 loff_t endoff, int flags, int *got)
2565{ 2542{
2566 struct inode *inode = &ci->vfs_inode; 2543 struct ceph_inode_info *ci = ceph_inode(inode);
2567 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 2544 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
2568 int ret = 0; 2545 int ret = 0;
2569 int have, implemented; 2546 int have, implemented;
@@ -2576,6 +2553,13 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2576again: 2553again:
2577 spin_lock(&ci->i_ceph_lock); 2554 spin_lock(&ci->i_ceph_lock);
2578 2555
2556 if ((flags & CHECK_FILELOCK) &&
2557 (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
2558 dout("try_get_cap_refs %p error filelock\n", inode);
2559 ret = -EIO;
2560 goto out_unlock;
2561 }
2562
2579 /* make sure file is actually open */ 2563 /* make sure file is actually open */
2580 file_wanted = __ceph_caps_file_wanted(ci); 2564 file_wanted = __ceph_caps_file_wanted(ci);
2581 if ((file_wanted & need) != need) { 2565 if ((file_wanted & need) != need) {
@@ -2637,7 +2621,7 @@ again:
2637 * we can not call down_read() when 2621 * we can not call down_read() when
2638 * task isn't in TASK_RUNNING state 2622 * task isn't in TASK_RUNNING state
2639 */ 2623 */
2640 if (nonblock) { 2624 if (flags & NON_BLOCKING) {
2641 ret = -EAGAIN; 2625 ret = -EAGAIN;
2642 goto out_unlock; 2626 goto out_unlock;
2643 } 2627 }
@@ -2731,18 +2715,19 @@ static void check_max_size(struct inode *inode, loff_t endoff)
2731 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2715 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2732} 2716}
2733 2717
2734int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, 2718int ceph_try_get_caps(struct inode *inode, int need, int want,
2735 bool nonblock, int *got) 2719 bool nonblock, int *got)
2736{ 2720{
2737 int ret; 2721 int ret;
2738 2722
2739 BUG_ON(need & ~CEPH_CAP_FILE_RD); 2723 BUG_ON(need & ~CEPH_CAP_FILE_RD);
2740 BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); 2724 BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
2741 ret = ceph_pool_perm_check(ci, need); 2725 ret = ceph_pool_perm_check(inode, need);
2742 if (ret < 0) 2726 if (ret < 0)
2743 return ret; 2727 return ret;
2744 2728
2745 ret = try_get_cap_refs(ci, need, want, 0, nonblock, got); 2729 ret = try_get_cap_refs(inode, need, want, 0,
2730 (nonblock ? NON_BLOCKING : 0), got);
2746 return ret == -EAGAIN ? 0 : ret; 2731 return ret == -EAGAIN ? 0 : ret;
2747} 2732}
2748 2733
@@ -2751,30 +2736,40 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
2751 * due to a small max_size, make sure we check_max_size (and possibly 2736 * due to a small max_size, make sure we check_max_size (and possibly
2752 * ask the mds) so we don't get hung up indefinitely. 2737 * ask the mds) so we don't get hung up indefinitely.
2753 */ 2738 */
2754int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 2739int ceph_get_caps(struct file *filp, int need, int want,
2755 loff_t endoff, int *got, struct page **pinned_page) 2740 loff_t endoff, int *got, struct page **pinned_page)
2756{ 2741{
2757 int _got, ret; 2742 struct ceph_file_info *fi = filp->private_data;
2743 struct inode *inode = file_inode(filp);
2744 struct ceph_inode_info *ci = ceph_inode(inode);
2745 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2746 int ret, _got, flags;
2758 2747
2759 ret = ceph_pool_perm_check(ci, need); 2748 ret = ceph_pool_perm_check(inode, need);
2760 if (ret < 0) 2749 if (ret < 0)
2761 return ret; 2750 return ret;
2762 2751
2752 if ((fi->fmode & CEPH_FILE_MODE_WR) &&
2753 fi->filp_gen != READ_ONCE(fsc->filp_gen))
2754 return -EBADF;
2755
2763 while (true) { 2756 while (true) {
2764 if (endoff > 0) 2757 if (endoff > 0)
2765 check_max_size(&ci->vfs_inode, endoff); 2758 check_max_size(inode, endoff);
2766 2759
2760 flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
2767 _got = 0; 2761 _got = 0;
2768 ret = try_get_cap_refs(ci, need, want, endoff, 2762 ret = try_get_cap_refs(inode, need, want, endoff,
2769 false, &_got); 2763 flags, &_got);
2770 if (ret == -EAGAIN) 2764 if (ret == -EAGAIN)
2771 continue; 2765 continue;
2772 if (!ret) { 2766 if (!ret) {
2773 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2767 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2774 add_wait_queue(&ci->i_cap_wq, &wait); 2768 add_wait_queue(&ci->i_cap_wq, &wait);
2775 2769
2776 while (!(ret = try_get_cap_refs(ci, need, want, endoff, 2770 flags |= NON_BLOCKING;
2777 true, &_got))) { 2771 while (!(ret = try_get_cap_refs(inode, need, want,
2772 endoff, flags, &_got))) {
2778 if (signal_pending(current)) { 2773 if (signal_pending(current)) {
2779 ret = -ERESTARTSYS; 2774 ret = -ERESTARTSYS;
2780 break; 2775 break;
@@ -2786,10 +2781,18 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
2786 if (ret == -EAGAIN) 2781 if (ret == -EAGAIN)
2787 continue; 2782 continue;
2788 } 2783 }
2784
2785 if ((fi->fmode & CEPH_FILE_MODE_WR) &&
2786 fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
2787 if (ret >= 0 && _got)
2788 ceph_put_cap_refs(ci, _got);
2789 return -EBADF;
2790 }
2791
2789 if (ret < 0) { 2792 if (ret < 0) {
2790 if (ret == -ESTALE) { 2793 if (ret == -ESTALE) {
2791 /* session was killed, try renew caps */ 2794 /* session was killed, try renew caps */
2792 ret = ceph_renew_caps(&ci->vfs_inode); 2795 ret = ceph_renew_caps(inode);
2793 if (ret == 0) 2796 if (ret == 0)
2794 continue; 2797 continue;
2795 } 2798 }
@@ -2798,9 +2801,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
2798 2801
2799 if (ci->i_inline_version != CEPH_INLINE_NONE && 2802 if (ci->i_inline_version != CEPH_INLINE_NONE &&
2800 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 2803 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
2801 i_size_read(&ci->vfs_inode) > 0) { 2804 i_size_read(inode) > 0) {
2802 struct page *page = 2805 struct page *page =
2803 find_get_page(ci->vfs_inode.i_mapping, 0); 2806 find_get_page(inode->i_mapping, 0);
2804 if (page) { 2807 if (page) {
2805 if (PageUptodate(page)) { 2808 if (PageUptodate(page)) {
2806 *pinned_page = page; 2809 *pinned_page = page;
@@ -2819,7 +2822,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
2819 * getattr request will bring inline data into 2822 * getattr request will bring inline data into
2820 * page cache 2823 * page cache
2821 */ 2824 */
2822 ret = __ceph_do_getattr(&ci->vfs_inode, NULL, 2825 ret = __ceph_do_getattr(inode, NULL,
2823 CEPH_STAT_CAP_INLINE_DATA, 2826 CEPH_STAT_CAP_INLINE_DATA,
2824 true); 2827 true);
2825 if (ret < 0) 2828 if (ret < 0)
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 2eb88ed22993..facb387c2735 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -294,7 +294,6 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
294 294
295void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) 295void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
296{ 296{
297 return 0;
298} 297}
299 298
300void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) 299void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 15ff1b09cfa2..b6bfa94332c3 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -35,7 +35,7 @@ struct ceph_nfs_snapfh {
35static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, 35static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
36 struct inode *parent_inode) 36 struct inode *parent_inode)
37{ 37{
38 const static int snap_handle_length = 38 static const int snap_handle_length =
39 sizeof(struct ceph_nfs_snapfh) >> 2; 39 sizeof(struct ceph_nfs_snapfh) >> 2;
40 struct ceph_nfs_snapfh *sfh = (void *)rawfh; 40 struct ceph_nfs_snapfh *sfh = (void *)rawfh;
41 u64 snapid = ceph_snap(inode); 41 u64 snapid = ceph_snap(inode);
@@ -85,9 +85,9 @@ out:
85static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, 85static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
86 struct inode *parent_inode) 86 struct inode *parent_inode)
87{ 87{
88 const static int handle_length = 88 static const int handle_length =
89 sizeof(struct ceph_nfs_fh) >> 2; 89 sizeof(struct ceph_nfs_fh) >> 2;
90 const static int connected_handle_length = 90 static const int connected_handle_length =
91 sizeof(struct ceph_nfs_confh) >> 2; 91 sizeof(struct ceph_nfs_confh) >> 2;
92 int type; 92 int type;
93 93
@@ -458,33 +458,33 @@ static int __get_snap_name(struct dentry *parent, char *name,
458 if (err < 0) 458 if (err < 0)
459 goto out; 459 goto out;
460 460
461 rinfo = &req->r_reply_info; 461 rinfo = &req->r_reply_info;
462 for (i = 0; i < rinfo->dir_nr; i++) { 462 for (i = 0; i < rinfo->dir_nr; i++) {
463 rde = rinfo->dir_entries + i; 463 rde = rinfo->dir_entries + i;
464 BUG_ON(!rde->inode.in); 464 BUG_ON(!rde->inode.in);
465 if (ceph_snap(inode) == 465 if (ceph_snap(inode) ==
466 le64_to_cpu(rde->inode.in->snapid)) { 466 le64_to_cpu(rde->inode.in->snapid)) {
467 memcpy(name, rde->name, rde->name_len); 467 memcpy(name, rde->name, rde->name_len);
468 name[rde->name_len] = '\0'; 468 name[rde->name_len] = '\0';
469 err = 0; 469 err = 0;
470 goto out; 470 goto out;
471 } 471 }
472 } 472 }
473 473
474 if (rinfo->dir_end) 474 if (rinfo->dir_end)
475 break; 475 break;
476 476
477 BUG_ON(rinfo->dir_nr <= 0); 477 BUG_ON(rinfo->dir_nr <= 0);
478 rde = rinfo->dir_entries + (rinfo->dir_nr - 1); 478 rde = rinfo->dir_entries + (rinfo->dir_nr - 1);
479 next_offset += rinfo->dir_nr; 479 next_offset += rinfo->dir_nr;
480 last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); 480 last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL);
481 if (!last_name) { 481 if (!last_name) {
482 err = -ENOMEM; 482 err = -ENOMEM;
483 goto out; 483 goto out;
484 } 484 }
485 485
486 ceph_mdsc_put_request(req); 486 ceph_mdsc_put_request(req);
487 req = NULL; 487 req = NULL;
488 } 488 }
489 err = -ENOENT; 489 err = -ENOENT;
490out: 490out:
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 685a03cc4b77..d277f71abe0b 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -15,6 +15,7 @@
15#include "super.h" 15#include "super.h"
16#include "mds_client.h" 16#include "mds_client.h"
17#include "cache.h" 17#include "cache.h"
18#include "io.h"
18 19
19static __le32 ceph_flags_sys2wire(u32 flags) 20static __le32 ceph_flags_sys2wire(u32 flags)
20{ 21{
@@ -201,6 +202,7 @@ out:
201static int ceph_init_file_info(struct inode *inode, struct file *file, 202static int ceph_init_file_info(struct inode *inode, struct file *file,
202 int fmode, bool isdir) 203 int fmode, bool isdir)
203{ 204{
205 struct ceph_inode_info *ci = ceph_inode(inode);
204 struct ceph_file_info *fi; 206 struct ceph_file_info *fi;
205 207
206 dout("%s %p %p 0%o (%s)\n", __func__, inode, file, 208 dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
@@ -211,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
211 struct ceph_dir_file_info *dfi = 213 struct ceph_dir_file_info *dfi =
212 kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); 214 kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
213 if (!dfi) { 215 if (!dfi) {
214 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 216 ceph_put_fmode(ci, fmode); /* clean up */
215 return -ENOMEM; 217 return -ENOMEM;
216 } 218 }
217 219
@@ -222,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
222 } else { 224 } else {
223 fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 225 fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
224 if (!fi) { 226 if (!fi) {
225 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 227 ceph_put_fmode(ci, fmode); /* clean up */
226 return -ENOMEM; 228 return -ENOMEM;
227 } 229 }
228 230
@@ -232,6 +234,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
232 fi->fmode = fmode; 234 fi->fmode = fmode;
233 spin_lock_init(&fi->rw_contexts_lock); 235 spin_lock_init(&fi->rw_contexts_lock);
234 INIT_LIST_HEAD(&fi->rw_contexts); 236 INIT_LIST_HEAD(&fi->rw_contexts);
237 fi->meta_err = errseq_sample(&ci->i_meta_err);
238 fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
235 239
236 return 0; 240 return 0;
237} 241}
@@ -695,7 +699,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
695 ceph_release_page_vector(pages, num_pages); 699 ceph_release_page_vector(pages, num_pages);
696 } 700 }
697 701
698 if (ret <= 0 || off >= i_size || !more) 702 if (ret < 0) {
703 if (ret == -EBLACKLISTED)
704 fsc->blacklisted = true;
705 break;
706 }
707
708 if (off >= i_size || !more)
699 break; 709 break;
700 } 710 }
701 711
@@ -921,7 +931,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
921 struct ceph_aio_request *aio_req = NULL; 931 struct ceph_aio_request *aio_req = NULL;
922 int num_pages = 0; 932 int num_pages = 0;
923 int flags; 933 int flags;
924 int ret; 934 int ret = 0;
925 struct timespec64 mtime = current_time(inode); 935 struct timespec64 mtime = current_time(inode);
926 size_t count = iov_iter_count(iter); 936 size_t count = iov_iter_count(iter);
927 loff_t pos = iocb->ki_pos; 937 loff_t pos = iocb->ki_pos;
@@ -935,11 +945,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
935 (write ? "write" : "read"), file, pos, (unsigned)count, 945 (write ? "write" : "read"), file, pos, (unsigned)count,
936 snapc, snapc ? snapc->seq : 0); 946 snapc, snapc ? snapc->seq : 0);
937 947
938 ret = filemap_write_and_wait_range(inode->i_mapping,
939 pos, pos + count - 1);
940 if (ret < 0)
941 return ret;
942
943 if (write) { 948 if (write) {
944 int ret2 = invalidate_inode_pages2_range(inode->i_mapping, 949 int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
945 pos >> PAGE_SHIFT, 950 pos >> PAGE_SHIFT,
@@ -1260,7 +1265,8 @@ again:
1260 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 1265 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
1261 else 1266 else
1262 want = CEPH_CAP_FILE_CACHE; 1267 want = CEPH_CAP_FILE_CACHE;
1263 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); 1268 ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1,
1269 &got, &pinned_page);
1264 if (ret < 0) 1270 if (ret < 0)
1265 return ret; 1271 return ret;
1266 1272
@@ -1274,12 +1280,16 @@ again:
1274 1280
1275 if (ci->i_inline_version == CEPH_INLINE_NONE) { 1281 if (ci->i_inline_version == CEPH_INLINE_NONE) {
1276 if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { 1282 if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
1283 ceph_start_io_direct(inode);
1277 ret = ceph_direct_read_write(iocb, to, 1284 ret = ceph_direct_read_write(iocb, to,
1278 NULL, NULL); 1285 NULL, NULL);
1286 ceph_end_io_direct(inode);
1279 if (ret >= 0 && ret < len) 1287 if (ret >= 0 && ret < len)
1280 retry_op = CHECK_EOF; 1288 retry_op = CHECK_EOF;
1281 } else { 1289 } else {
1290 ceph_start_io_read(inode);
1282 ret = ceph_sync_read(iocb, to, &retry_op); 1291 ret = ceph_sync_read(iocb, to, &retry_op);
1292 ceph_end_io_read(inode);
1283 } 1293 }
1284 } else { 1294 } else {
1285 retry_op = READ_INLINE; 1295 retry_op = READ_INLINE;
@@ -1290,7 +1300,9 @@ again:
1290 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 1300 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
1291 ceph_cap_string(got)); 1301 ceph_cap_string(got));
1292 ceph_add_rw_context(fi, &rw_ctx); 1302 ceph_add_rw_context(fi, &rw_ctx);
1303 ceph_start_io_read(inode);
1293 ret = generic_file_read_iter(iocb, to); 1304 ret = generic_file_read_iter(iocb, to);
1305 ceph_end_io_read(inode);
1294 ceph_del_rw_context(fi, &rw_ctx); 1306 ceph_del_rw_context(fi, &rw_ctx);
1295 } 1307 }
1296 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 1308 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
@@ -1399,7 +1411,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
1399 return -ENOMEM; 1411 return -ENOMEM;
1400 1412
1401retry_snap: 1413retry_snap:
1402 inode_lock(inode); 1414 if (iocb->ki_flags & IOCB_DIRECT)
1415 ceph_start_io_direct(inode);
1416 else
1417 ceph_start_io_write(inode);
1403 1418
1404 /* We can write back this queue in page reclaim */ 1419 /* We can write back this queue in page reclaim */
1405 current->backing_dev_info = inode_to_bdi(inode); 1420 current->backing_dev_info = inode_to_bdi(inode);
@@ -1457,7 +1472,7 @@ retry_snap:
1457 else 1472 else
1458 want = CEPH_CAP_FILE_BUFFER; 1473 want = CEPH_CAP_FILE_BUFFER;
1459 got = 0; 1474 got = 0;
1460 err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, 1475 err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count,
1461 &got, NULL); 1476 &got, NULL);
1462 if (err < 0) 1477 if (err < 0)
1463 goto out; 1478 goto out;
@@ -1470,7 +1485,6 @@ retry_snap:
1470 (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { 1485 (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
1471 struct ceph_snap_context *snapc; 1486 struct ceph_snap_context *snapc;
1472 struct iov_iter data; 1487 struct iov_iter data;
1473 inode_unlock(inode);
1474 1488
1475 spin_lock(&ci->i_ceph_lock); 1489 spin_lock(&ci->i_ceph_lock);
1476 if (__ceph_have_pending_cap_snap(ci)) { 1490 if (__ceph_have_pending_cap_snap(ci)) {
@@ -1487,11 +1501,14 @@ retry_snap:
1487 1501
1488 /* we might need to revert back to that point */ 1502 /* we might need to revert back to that point */
1489 data = *from; 1503 data = *from;
1490 if (iocb->ki_flags & IOCB_DIRECT) 1504 if (iocb->ki_flags & IOCB_DIRECT) {
1491 written = ceph_direct_read_write(iocb, &data, snapc, 1505 written = ceph_direct_read_write(iocb, &data, snapc,
1492 &prealloc_cf); 1506 &prealloc_cf);
1493 else 1507 ceph_end_io_direct(inode);
1508 } else {
1494 written = ceph_sync_write(iocb, &data, pos, snapc); 1509 written = ceph_sync_write(iocb, &data, pos, snapc);
1510 ceph_end_io_write(inode);
1511 }
1495 if (written > 0) 1512 if (written > 0)
1496 iov_iter_advance(from, written); 1513 iov_iter_advance(from, written);
1497 ceph_put_snap_context(snapc); 1514 ceph_put_snap_context(snapc);
@@ -1506,7 +1523,7 @@ retry_snap:
1506 written = generic_perform_write(file, from, pos); 1523 written = generic_perform_write(file, from, pos);
1507 if (likely(written >= 0)) 1524 if (likely(written >= 0))
1508 iocb->ki_pos = pos + written; 1525 iocb->ki_pos = pos + written;
1509 inode_unlock(inode); 1526 ceph_end_io_write(inode);
1510 } 1527 }
1511 1528
1512 if (written >= 0) { 1529 if (written >= 0) {
@@ -1541,9 +1558,11 @@ retry_snap:
1541 } 1558 }
1542 1559
1543 goto out_unlocked; 1560 goto out_unlocked;
1544
1545out: 1561out:
1546 inode_unlock(inode); 1562 if (iocb->ki_flags & IOCB_DIRECT)
1563 ceph_end_io_direct(inode);
1564 else
1565 ceph_end_io_write(inode);
1547out_unlocked: 1566out_unlocked:
1548 ceph_free_cap_flush(prealloc_cf); 1567 ceph_free_cap_flush(prealloc_cf);
1549 current->backing_dev_info = NULL; 1568 current->backing_dev_info = NULL;
@@ -1781,7 +1800,7 @@ static long ceph_fallocate(struct file *file, int mode,
1781 else 1800 else
1782 want = CEPH_CAP_FILE_BUFFER; 1801 want = CEPH_CAP_FILE_BUFFER;
1783 1802
1784 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); 1803 ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
1785 if (ret < 0) 1804 if (ret < 0)
1786 goto unlock; 1805 goto unlock;
1787 1806
@@ -1810,16 +1829,15 @@ unlock:
1810 * src_ci. Two attempts are made to obtain both caps, and an error is return if 1829 * src_ci. Two attempts are made to obtain both caps, and an error is return if
1811 * this fails; zero is returned on success. 1830 * this fails; zero is returned on success.
1812 */ 1831 */
1813static int get_rd_wr_caps(struct ceph_inode_info *src_ci, 1832static int get_rd_wr_caps(struct file *src_filp, int *src_got,
1814 loff_t src_endoff, int *src_got, 1833 struct file *dst_filp,
1815 struct ceph_inode_info *dst_ci,
1816 loff_t dst_endoff, int *dst_got) 1834 loff_t dst_endoff, int *dst_got)
1817{ 1835{
1818 int ret = 0; 1836 int ret = 0;
1819 bool retrying = false; 1837 bool retrying = false;
1820 1838
1821retry_caps: 1839retry_caps:
1822 ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 1840 ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
1823 dst_endoff, dst_got, NULL); 1841 dst_endoff, dst_got, NULL);
1824 if (ret < 0) 1842 if (ret < 0)
1825 return ret; 1843 return ret;
@@ -1829,24 +1847,24 @@ retry_caps:
1829 * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some 1847 * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some
1830 * retry dance instead to try to get both capabilities. 1848 * retry dance instead to try to get both capabilities.
1831 */ 1849 */
1832 ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, 1850 ret = ceph_try_get_caps(file_inode(src_filp),
1851 CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
1833 false, src_got); 1852 false, src_got);
1834 if (ret <= 0) { 1853 if (ret <= 0) {
1835 /* Start by dropping dst_ci caps and getting src_ci caps */ 1854 /* Start by dropping dst_ci caps and getting src_ci caps */
1836 ceph_put_cap_refs(dst_ci, *dst_got); 1855 ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got);
1837 if (retrying) { 1856 if (retrying) {
1838 if (!ret) 1857 if (!ret)
1839 /* ceph_try_get_caps masks EAGAIN */ 1858 /* ceph_try_get_caps masks EAGAIN */
1840 ret = -EAGAIN; 1859 ret = -EAGAIN;
1841 return ret; 1860 return ret;
1842 } 1861 }
1843 ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, 1862 ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
1844 CEPH_CAP_FILE_SHARED, src_endoff, 1863 CEPH_CAP_FILE_SHARED, -1, src_got, NULL);
1845 src_got, NULL);
1846 if (ret < 0) 1864 if (ret < 0)
1847 return ret; 1865 return ret;
1848 /*... drop src_ci caps too, and retry */ 1866 /*... drop src_ci caps too, and retry */
1849 ceph_put_cap_refs(src_ci, *src_got); 1867 ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got);
1850 retrying = true; 1868 retrying = true;
1851 goto retry_caps; 1869 goto retry_caps;
1852 } 1870 }
@@ -1904,6 +1922,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
1904 struct ceph_inode_info *src_ci = ceph_inode(src_inode); 1922 struct ceph_inode_info *src_ci = ceph_inode(src_inode);
1905 struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); 1923 struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
1906 struct ceph_cap_flush *prealloc_cf; 1924 struct ceph_cap_flush *prealloc_cf;
1925 struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
1907 struct ceph_object_locator src_oloc, dst_oloc; 1926 struct ceph_object_locator src_oloc, dst_oloc;
1908 struct ceph_object_id src_oid, dst_oid; 1927 struct ceph_object_id src_oid, dst_oid;
1909 loff_t endoff = 0, size; 1928 loff_t endoff = 0, size;
@@ -1913,10 +1932,16 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
1913 int src_got = 0, dst_got = 0, err, dirty; 1932 int src_got = 0, dst_got = 0, err, dirty;
1914 bool do_final_copy = false; 1933 bool do_final_copy = false;
1915 1934
1916 if (src_inode == dst_inode) 1935 if (src_inode->i_sb != dst_inode->i_sb) {
1917 return -EINVAL; 1936 struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
1918 if (src_inode->i_sb != dst_inode->i_sb) 1937
1919 return -EXDEV; 1938 if (ceph_fsid_compare(&src_fsc->client->fsid,
1939 &dst_fsc->client->fsid)) {
1940 dout("Copying files across clusters: src: %pU dst: %pU\n",
1941 &src_fsc->client->fsid, &dst_fsc->client->fsid);
1942 return -EXDEV;
1943 }
1944 }
1920 if (ceph_snap(dst_inode) != CEPH_NOSNAP) 1945 if (ceph_snap(dst_inode) != CEPH_NOSNAP)
1921 return -EROFS; 1946 return -EROFS;
1922 1947
@@ -1928,7 +1953,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
1928 * efficient). 1953 * efficient).
1929 */ 1954 */
1930 1955
1931 if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM)) 1956 if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
1932 return -EOPNOTSUPP; 1957 return -EOPNOTSUPP;
1933 1958
1934 if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || 1959 if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
@@ -1960,8 +1985,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
1960 * clients may have dirty data in their caches. And OSDs know nothing 1985 * clients may have dirty data in their caches. And OSDs know nothing
1961 * about caps, so they can't safely do the remote object copies. 1986 * about caps, so they can't safely do the remote object copies.
1962 */ 1987 */
1963 err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, 1988 err = get_rd_wr_caps(src_file, &src_got,
1964 dst_ci, (dst_off + len), &dst_got); 1989 dst_file, (dst_off + len), &dst_got);
1965 if (err < 0) { 1990 if (err < 0) {
1966 dout("get_rd_wr_caps returned %d\n", err); 1991 dout("get_rd_wr_caps returned %d\n", err);
1967 ret = -EOPNOTSUPP; 1992 ret = -EOPNOTSUPP;
@@ -2018,9 +2043,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
2018 goto out; 2043 goto out;
2019 } 2044 }
2020 len -= ret; 2045 len -= ret;
2021 err = get_rd_wr_caps(src_ci, (src_off + len), 2046 err = get_rd_wr_caps(src_file, &src_got,
2022 &src_got, dst_ci, 2047 dst_file, (dst_off + len), &dst_got);
2023 (dst_off + len), &dst_got);
2024 if (err < 0) 2048 if (err < 0)
2025 goto out; 2049 goto out;
2026 err = is_file_size_ok(src_inode, dst_inode, 2050 err = is_file_size_ok(src_inode, dst_inode,
@@ -2044,7 +2068,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
2044 dst_ci->i_vino.ino, dst_objnum); 2068 dst_ci->i_vino.ino, dst_objnum);
2045 /* Do an object remote copy */ 2069 /* Do an object remote copy */
2046 err = ceph_osdc_copy_from( 2070 err = ceph_osdc_copy_from(
2047 &ceph_inode_to_client(src_inode)->client->osdc, 2071 &src_fsc->client->osdc,
2048 src_ci->i_vino.snap, 0, 2072 src_ci->i_vino.snap, 0,
2049 &src_oid, &src_oloc, 2073 &src_oid, &src_oloc,
2050 CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | 2074 CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 18500edefc56..9f135624ae47 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -515,6 +515,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
515 515
516 ceph_fscache_inode_init(ci); 516 ceph_fscache_inode_init(ci);
517 517
518 ci->i_meta_err = 0;
519
518 return &ci->vfs_inode; 520 return &ci->vfs_inode;
519} 521}
520 522
@@ -801,7 +803,12 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
801 803
802 /* update inode */ 804 /* update inode */
803 inode->i_rdev = le32_to_cpu(info->rdev); 805 inode->i_rdev = le32_to_cpu(info->rdev);
804 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 806 /* directories have fl_stripe_unit set to zero */
807 if (le32_to_cpu(info->layout.fl_stripe_unit))
808 inode->i_blkbits =
809 fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
810 else
811 inode->i_blkbits = CEPH_BLOCK_SHIFT;
805 812
806 __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); 813 __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
807 814
@@ -1982,7 +1989,7 @@ static const struct inode_operations ceph_symlink_iops = {
1982int __ceph_setattr(struct inode *inode, struct iattr *attr) 1989int __ceph_setattr(struct inode *inode, struct iattr *attr)
1983{ 1990{
1984 struct ceph_inode_info *ci = ceph_inode(inode); 1991 struct ceph_inode_info *ci = ceph_inode(inode);
1985 const unsigned int ia_valid = attr->ia_valid; 1992 unsigned int ia_valid = attr->ia_valid;
1986 struct ceph_mds_request *req; 1993 struct ceph_mds_request *req;
1987 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1994 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1988 struct ceph_cap_flush *prealloc_cf; 1995 struct ceph_cap_flush *prealloc_cf;
@@ -2087,6 +2094,26 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
2087 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2094 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2088 } 2095 }
2089 } 2096 }
2097 if (ia_valid & ATTR_SIZE) {
2098 dout("setattr %p size %lld -> %lld\n", inode,
2099 inode->i_size, attr->ia_size);
2100 if ((issued & CEPH_CAP_FILE_EXCL) &&
2101 attr->ia_size > inode->i_size) {
2102 i_size_write(inode, attr->ia_size);
2103 inode->i_blocks = calc_inode_blocks(attr->ia_size);
2104 ci->i_reported_size = attr->ia_size;
2105 dirtied |= CEPH_CAP_FILE_EXCL;
2106 ia_valid |= ATTR_MTIME;
2107 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
2108 attr->ia_size != inode->i_size) {
2109 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
2110 req->r_args.setattr.old_size =
2111 cpu_to_le64(inode->i_size);
2112 mask |= CEPH_SETATTR_SIZE;
2113 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2114 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2115 }
2116 }
2090 if (ia_valid & ATTR_MTIME) { 2117 if (ia_valid & ATTR_MTIME) {
2091 dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, 2118 dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
2092 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, 2119 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
@@ -2109,25 +2136,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
2109 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2136 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2110 } 2137 }
2111 } 2138 }
2112 if (ia_valid & ATTR_SIZE) {
2113 dout("setattr %p size %lld -> %lld\n", inode,
2114 inode->i_size, attr->ia_size);
2115 if ((issued & CEPH_CAP_FILE_EXCL) &&
2116 attr->ia_size > inode->i_size) {
2117 i_size_write(inode, attr->ia_size);
2118 inode->i_blocks = calc_inode_blocks(attr->ia_size);
2119 ci->i_reported_size = attr->ia_size;
2120 dirtied |= CEPH_CAP_FILE_EXCL;
2121 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
2122 attr->ia_size != inode->i_size) {
2123 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
2124 req->r_args.setattr.old_size =
2125 cpu_to_le64(inode->i_size);
2126 mask |= CEPH_SETATTR_SIZE;
2127 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2128 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2129 }
2130 }
2131 2139
2132 /* these do nothing */ 2140 /* these do nothing */
2133 if (ia_valid & ATTR_CTIME) { 2141 if (ia_valid & ATTR_CTIME) {
diff --git a/fs/ceph/io.c b/fs/ceph/io.c
new file mode 100644
index 000000000000..97602ea92ff4
--- /dev/null
+++ b/fs/ceph/io.c
@@ -0,0 +1,163 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2016 Trond Myklebust
4 * Copyright (c) 2019 Jeff Layton
5 *
6 * I/O and data path helper functionality.
7 *
8 * Heavily borrowed from equivalent code in fs/nfs/io.c
9 */
10
11#include <linux/ceph/ceph_debug.h>
12
13#include <linux/types.h>
14#include <linux/kernel.h>
15#include <linux/rwsem.h>
16#include <linux/fs.h>
17
18#include "super.h"
19#include "io.h"
20
21/* Call with exclusively locked inode->i_rwsem */
22static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
23{
24 lockdep_assert_held_write(&inode->i_rwsem);
25
26 if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
27 spin_lock(&ci->i_ceph_lock);
28 ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
29 spin_unlock(&ci->i_ceph_lock);
30 inode_dio_wait(inode);
31 }
32}
33
34/**
35 * ceph_start_io_read - declare the file is being used for buffered reads
36 * @inode: file inode
37 *
38 * Declare that a buffered read operation is about to start, and ensure
39 * that we block all direct I/O.
40 * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset,
41 * and holds a shared lock on inode->i_rwsem to ensure that the flag
42 * cannot be changed.
43 * In practice, this means that buffered read operations are allowed to
44 * execute in parallel, thanks to the shared lock, whereas direct I/O
45 * operations need to wait to grab an exclusive lock in order to set
46 * CEPH_I_ODIRECT.
47 * Note that buffered writes and truncates both take a write lock on
48 * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
49 */
50void
51ceph_start_io_read(struct inode *inode)
52{
53 struct ceph_inode_info *ci = ceph_inode(inode);
54
55 /* Be an optimist! */
56 down_read(&inode->i_rwsem);
57 if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
58 return;
59 up_read(&inode->i_rwsem);
60 /* Slow path.... */
61 down_write(&inode->i_rwsem);
62 ceph_block_o_direct(ci, inode);
63 downgrade_write(&inode->i_rwsem);
64}
65
66/**
67 * ceph_end_io_read - declare that the buffered read operation is done
68 * @inode: file inode
69 *
70 * Declare that a buffered read operation is done, and release the shared
71 * lock on inode->i_rwsem.
72 */
73void
74ceph_end_io_read(struct inode *inode)
75{
76 up_read(&inode->i_rwsem);
77}
78
79/**
80 * ceph_start_io_write - declare the file is being used for buffered writes
81 * @inode: file inode
82 *
83 * Declare that a buffered write operation is about to start, and ensure
84 * that we block all direct I/O.
85 */
86void
87ceph_start_io_write(struct inode *inode)
88{
89 down_write(&inode->i_rwsem);
90 ceph_block_o_direct(ceph_inode(inode), inode);
91}
92
93/**
94 * ceph_end_io_write - declare that the buffered write operation is done
95 * @inode: file inode
96 *
97 * Declare that a buffered write operation is done, and release the
98 * lock on inode->i_rwsem.
99 */
100void
101ceph_end_io_write(struct inode *inode)
102{
103 up_write(&inode->i_rwsem);
104}
105
106/* Call with exclusively locked inode->i_rwsem */
107static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
108{
109 lockdep_assert_held_write(&inode->i_rwsem);
110
111 if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) {
112 spin_lock(&ci->i_ceph_lock);
113 ci->i_ceph_flags |= CEPH_I_ODIRECT;
114 spin_unlock(&ci->i_ceph_lock);
115 /* FIXME: unmap_mapping_range? */
116 filemap_write_and_wait(inode->i_mapping);
117 }
118}
119
120/**
121 * ceph_end_io_direct - declare the file is being used for direct i/o
122 * @inode: file inode
123 *
124 * Declare that a direct I/O operation is about to start, and ensure
125 * that we block all buffered I/O.
126 * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
127 * and holds a shared lock on inode->i_rwsem to ensure that the flag
128 * cannot be changed.
129 * In practice, this means that direct I/O operations are allowed to
130 * execute in parallel, thanks to the shared lock, whereas buffered I/O
131 * operations need to wait to grab an exclusive lock in order to clear
132 * CEPH_I_ODIRECT.
133 * Note that buffered writes and truncates both take a write lock on
134 * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
135 */
136void
137ceph_start_io_direct(struct inode *inode)
138{
139 struct ceph_inode_info *ci = ceph_inode(inode);
140
141 /* Be an optimist! */
142 down_read(&inode->i_rwsem);
143 if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
144 return;
145 up_read(&inode->i_rwsem);
146 /* Slow path.... */
147 down_write(&inode->i_rwsem);
148 ceph_block_buffered(ci, inode);
149 downgrade_write(&inode->i_rwsem);
150}
151
152/**
153 * ceph_end_io_direct - declare that the direct i/o operation is done
154 * @inode: file inode
155 *
156 * Declare that a direct I/O operation is done, and release the shared
157 * lock on inode->i_rwsem.
158 */
159void
160ceph_end_io_direct(struct inode *inode)
161{
162 up_read(&inode->i_rwsem);
163}
diff --git a/fs/ceph/io.h b/fs/ceph/io.h
new file mode 100644
index 000000000000..fa594cd77348
--- /dev/null
+++ b/fs/ceph/io.h
@@ -0,0 +1,12 @@
1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _FS_CEPH_IO_H
3#define _FS_CEPH_IO_H
4
5void ceph_start_io_read(struct inode *inode);
6void ceph_end_io_read(struct inode *inode);
7void ceph_start_io_write(struct inode *inode);
8void ceph_end_io_write(struct inode *inode);
9void ceph_start_io_direct(struct inode *inode);
10void ceph_end_io_direct(struct inode *inode);
11
12#endif /* _FS_CEPH_IO_H */
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 5083e238ad15..544e9e85b120 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -32,14 +32,18 @@ void __init ceph_flock_init(void)
32 32
33static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) 33static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
34{ 34{
35 struct inode *inode = file_inode(src->fl_file); 35 struct ceph_file_info *fi = dst->fl_file->private_data;
36 struct inode *inode = file_inode(dst->fl_file);
36 atomic_inc(&ceph_inode(inode)->i_filelock_ref); 37 atomic_inc(&ceph_inode(inode)->i_filelock_ref);
38 atomic_inc(&fi->num_locks);
37} 39}
38 40
39static void ceph_fl_release_lock(struct file_lock *fl) 41static void ceph_fl_release_lock(struct file_lock *fl)
40{ 42{
43 struct ceph_file_info *fi = fl->fl_file->private_data;
41 struct inode *inode = file_inode(fl->fl_file); 44 struct inode *inode = file_inode(fl->fl_file);
42 struct ceph_inode_info *ci = ceph_inode(inode); 45 struct ceph_inode_info *ci = ceph_inode(inode);
46 atomic_dec(&fi->num_locks);
43 if (atomic_dec_and_test(&ci->i_filelock_ref)) { 47 if (atomic_dec_and_test(&ci->i_filelock_ref)) {
44 /* clear error when all locks are released */ 48 /* clear error when all locks are released */
45 spin_lock(&ci->i_ceph_lock); 49 spin_lock(&ci->i_ceph_lock);
@@ -73,7 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
73 * window. Caller function will decrease the counter. 77 * window. Caller function will decrease the counter.
74 */ 78 */
75 fl->fl_ops = &ceph_fl_lock_ops; 79 fl->fl_ops = &ceph_fl_lock_ops;
76 atomic_inc(&ceph_inode(inode)->i_filelock_ref); 80 fl->fl_ops->fl_copy_lock(fl, NULL);
77 } 81 }
78 82
79 if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) 83 if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 920e9f048bd8..a8a8f84f3bbf 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -639,7 +639,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
639 s->s_renew_seq = 0; 639 s->s_renew_seq = 0;
640 INIT_LIST_HEAD(&s->s_caps); 640 INIT_LIST_HEAD(&s->s_caps);
641 s->s_nr_caps = 0; 641 s->s_nr_caps = 0;
642 s->s_trim_caps = 0;
643 refcount_set(&s->s_ref, 1); 642 refcount_set(&s->s_ref, 1);
644 INIT_LIST_HEAD(&s->s_waiting); 643 INIT_LIST_HEAD(&s->s_waiting);
645 INIT_LIST_HEAD(&s->s_unsafe); 644 INIT_LIST_HEAD(&s->s_unsafe);
@@ -1270,6 +1269,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
1270{ 1269{
1271 struct ceph_mds_request *req; 1270 struct ceph_mds_request *req;
1272 struct rb_node *p; 1271 struct rb_node *p;
1272 struct ceph_inode_info *ci;
1273 1273
1274 dout("cleanup_session_requests mds%d\n", session->s_mds); 1274 dout("cleanup_session_requests mds%d\n", session->s_mds);
1275 mutex_lock(&mdsc->mutex); 1275 mutex_lock(&mdsc->mutex);
@@ -1278,6 +1278,16 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
1278 struct ceph_mds_request, r_unsafe_item); 1278 struct ceph_mds_request, r_unsafe_item);
1279 pr_warn_ratelimited(" dropping unsafe request %llu\n", 1279 pr_warn_ratelimited(" dropping unsafe request %llu\n",
1280 req->r_tid); 1280 req->r_tid);
1281 if (req->r_target_inode) {
1282 /* dropping unsafe change of inode's attributes */
1283 ci = ceph_inode(req->r_target_inode);
1284 errseq_set(&ci->i_meta_err, -EIO);
1285 }
1286 if (req->r_unsafe_dir) {
1287 /* dropping unsafe directory operation */
1288 ci = ceph_inode(req->r_unsafe_dir);
1289 errseq_set(&ci->i_meta_err, -EIO);
1290 }
1281 __unregister_request(mdsc, req); 1291 __unregister_request(mdsc, req);
1282 } 1292 }
1283 /* zero r_attempts, so kick_requests() will re-send requests */ 1293 /* zero r_attempts, so kick_requests() will re-send requests */
@@ -1370,7 +1380,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1370 struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; 1380 struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
1371 struct ceph_inode_info *ci = ceph_inode(inode); 1381 struct ceph_inode_info *ci = ceph_inode(inode);
1372 LIST_HEAD(to_remove); 1382 LIST_HEAD(to_remove);
1373 bool drop = false; 1383 bool dirty_dropped = false;
1374 bool invalidate = false; 1384 bool invalidate = false;
1375 1385
1376 dout("removing cap %p, ci is %p, inode is %p\n", 1386 dout("removing cap %p, ci is %p, inode is %p\n",
@@ -1383,9 +1393,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1383 struct ceph_cap_flush *cf; 1393 struct ceph_cap_flush *cf;
1384 struct ceph_mds_client *mdsc = fsc->mdsc; 1394 struct ceph_mds_client *mdsc = fsc->mdsc;
1385 1395
1386 if (ci->i_wrbuffer_ref > 0 && 1396 if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
1387 READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) 1397 if (inode->i_data.nrpages > 0)
1388 invalidate = true; 1398 invalidate = true;
1399 if (ci->i_wrbuffer_ref > 0)
1400 mapping_set_error(&inode->i_data, -EIO);
1401 }
1389 1402
1390 while (!list_empty(&ci->i_cap_flush_list)) { 1403 while (!list_empty(&ci->i_cap_flush_list)) {
1391 cf = list_first_entry(&ci->i_cap_flush_list, 1404 cf = list_first_entry(&ci->i_cap_flush_list,
@@ -1405,7 +1418,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1405 inode, ceph_ino(inode)); 1418 inode, ceph_ino(inode));
1406 ci->i_dirty_caps = 0; 1419 ci->i_dirty_caps = 0;
1407 list_del_init(&ci->i_dirty_item); 1420 list_del_init(&ci->i_dirty_item);
1408 drop = true; 1421 dirty_dropped = true;
1409 } 1422 }
1410 if (!list_empty(&ci->i_flushing_item)) { 1423 if (!list_empty(&ci->i_flushing_item)) {
1411 pr_warn_ratelimited( 1424 pr_warn_ratelimited(
@@ -1415,10 +1428,22 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1415 ci->i_flushing_caps = 0; 1428 ci->i_flushing_caps = 0;
1416 list_del_init(&ci->i_flushing_item); 1429 list_del_init(&ci->i_flushing_item);
1417 mdsc->num_cap_flushing--; 1430 mdsc->num_cap_flushing--;
1418 drop = true; 1431 dirty_dropped = true;
1419 } 1432 }
1420 spin_unlock(&mdsc->cap_dirty_lock); 1433 spin_unlock(&mdsc->cap_dirty_lock);
1421 1434
1435 if (dirty_dropped) {
1436 errseq_set(&ci->i_meta_err, -EIO);
1437
1438 if (ci->i_wrbuffer_ref_head == 0 &&
1439 ci->i_wr_ref == 0 &&
1440 ci->i_dirty_caps == 0 &&
1441 ci->i_flushing_caps == 0) {
1442 ceph_put_snap_context(ci->i_head_snapc);
1443 ci->i_head_snapc = NULL;
1444 }
1445 }
1446
1422 if (atomic_read(&ci->i_filelock_ref) > 0) { 1447 if (atomic_read(&ci->i_filelock_ref) > 0) {
1423 /* make further file lock syscall return -EIO */ 1448 /* make further file lock syscall return -EIO */
1424 ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; 1449 ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
@@ -1430,15 +1455,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1430 list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); 1455 list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
1431 ci->i_prealloc_cap_flush = NULL; 1456 ci->i_prealloc_cap_flush = NULL;
1432 } 1457 }
1433
1434 if (drop &&
1435 ci->i_wrbuffer_ref_head == 0 &&
1436 ci->i_wr_ref == 0 &&
1437 ci->i_dirty_caps == 0 &&
1438 ci->i_flushing_caps == 0) {
1439 ceph_put_snap_context(ci->i_head_snapc);
1440 ci->i_head_snapc = NULL;
1441 }
1442 } 1458 }
1443 spin_unlock(&ci->i_ceph_lock); 1459 spin_unlock(&ci->i_ceph_lock);
1444 while (!list_empty(&to_remove)) { 1460 while (!list_empty(&to_remove)) {
@@ -1452,7 +1468,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1452 wake_up_all(&ci->i_cap_wq); 1468 wake_up_all(&ci->i_cap_wq);
1453 if (invalidate) 1469 if (invalidate)
1454 ceph_queue_invalidate(inode); 1470 ceph_queue_invalidate(inode);
1455 if (drop) 1471 if (dirty_dropped)
1456 iput(inode); 1472 iput(inode);
1457 return 0; 1473 return 0;
1458} 1474}
@@ -1705,11 +1721,11 @@ out:
1705 */ 1721 */
1706static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) 1722static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1707{ 1723{
1708 struct ceph_mds_session *session = arg; 1724 int *remaining = arg;
1709 struct ceph_inode_info *ci = ceph_inode(inode); 1725 struct ceph_inode_info *ci = ceph_inode(inode);
1710 int used, wanted, oissued, mine; 1726 int used, wanted, oissued, mine;
1711 1727
1712 if (session->s_trim_caps <= 0) 1728 if (*remaining <= 0)
1713 return -1; 1729 return -1;
1714 1730
1715 spin_lock(&ci->i_ceph_lock); 1731 spin_lock(&ci->i_ceph_lock);
@@ -1746,7 +1762,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1746 if (oissued) { 1762 if (oissued) {
1747 /* we aren't the only cap.. just remove us */ 1763 /* we aren't the only cap.. just remove us */
1748 __ceph_remove_cap(cap, true); 1764 __ceph_remove_cap(cap, true);
1749 session->s_trim_caps--; 1765 (*remaining)--;
1750 } else { 1766 } else {
1751 struct dentry *dentry; 1767 struct dentry *dentry;
1752 /* try dropping referring dentries */ 1768 /* try dropping referring dentries */
@@ -1758,7 +1774,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1758 d_prune_aliases(inode); 1774 d_prune_aliases(inode);
1759 count = atomic_read(&inode->i_count); 1775 count = atomic_read(&inode->i_count);
1760 if (count == 1) 1776 if (count == 1)
1761 session->s_trim_caps--; 1777 (*remaining)--;
1762 dout("trim_caps_cb %p cap %p pruned, count now %d\n", 1778 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
1763 inode, cap, count); 1779 inode, cap, count);
1764 } else { 1780 } else {
@@ -1784,12 +1800,12 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
1784 dout("trim_caps mds%d start: %d / %d, trim %d\n", 1800 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1785 session->s_mds, session->s_nr_caps, max_caps, trim_caps); 1801 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1786 if (trim_caps > 0) { 1802 if (trim_caps > 0) {
1787 session->s_trim_caps = trim_caps; 1803 int remaining = trim_caps;
1788 ceph_iterate_session_caps(session, trim_caps_cb, session); 1804
1805 ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
1789 dout("trim_caps mds%d done: %d / %d, trimmed %d\n", 1806 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1790 session->s_mds, session->s_nr_caps, max_caps, 1807 session->s_mds, session->s_nr_caps, max_caps,
1791 trim_caps - session->s_trim_caps); 1808 trim_caps - remaining);
1792 session->s_trim_caps = 0;
1793 } 1809 }
1794 1810
1795 ceph_flush_cap_releases(mdsc, session); 1811 ceph_flush_cap_releases(mdsc, session);
@@ -3015,18 +3031,23 @@ bad:
3015 pr_err("mdsc_handle_forward decode error err=%d\n", err); 3031 pr_err("mdsc_handle_forward decode error err=%d\n", err);
3016} 3032}
3017 3033
3018static int __decode_and_drop_session_metadata(void **p, void *end) 3034static int __decode_session_metadata(void **p, void *end,
3035 bool *blacklisted)
3019{ 3036{
3020 /* map<string,string> */ 3037 /* map<string,string> */
3021 u32 n; 3038 u32 n;
3039 bool err_str;
3022 ceph_decode_32_safe(p, end, n, bad); 3040 ceph_decode_32_safe(p, end, n, bad);
3023 while (n-- > 0) { 3041 while (n-- > 0) {
3024 u32 len; 3042 u32 len;
3025 ceph_decode_32_safe(p, end, len, bad); 3043 ceph_decode_32_safe(p, end, len, bad);
3026 ceph_decode_need(p, end, len, bad); 3044 ceph_decode_need(p, end, len, bad);
3045 err_str = !strncmp(*p, "error_string", len);
3027 *p += len; 3046 *p += len;
3028 ceph_decode_32_safe(p, end, len, bad); 3047 ceph_decode_32_safe(p, end, len, bad);
3029 ceph_decode_need(p, end, len, bad); 3048 ceph_decode_need(p, end, len, bad);
3049 if (err_str && strnstr(*p, "blacklisted", len))
3050 *blacklisted = true;
3030 *p += len; 3051 *p += len;
3031 } 3052 }
3032 return 0; 3053 return 0;
@@ -3050,6 +3071,7 @@ static void handle_session(struct ceph_mds_session *session,
3050 u64 seq; 3071 u64 seq;
3051 unsigned long features = 0; 3072 unsigned long features = 0;
3052 int wake = 0; 3073 int wake = 0;
3074 bool blacklisted = false;
3053 3075
3054 /* decode */ 3076 /* decode */
3055 ceph_decode_need(&p, end, sizeof(*h), bad); 3077 ceph_decode_need(&p, end, sizeof(*h), bad);
@@ -3062,7 +3084,7 @@ static void handle_session(struct ceph_mds_session *session,
3062 if (msg_version >= 3) { 3084 if (msg_version >= 3) {
3063 u32 len; 3085 u32 len;
3064 /* version >= 2, metadata */ 3086 /* version >= 2, metadata */
3065 if (__decode_and_drop_session_metadata(&p, end) < 0) 3087 if (__decode_session_metadata(&p, end, &blacklisted) < 0)
3066 goto bad; 3088 goto bad;
3067 /* version >= 3, feature bits */ 3089 /* version >= 3, feature bits */
3068 ceph_decode_32_safe(&p, end, len, bad); 3090 ceph_decode_32_safe(&p, end, len, bad);
@@ -3149,6 +3171,8 @@ static void handle_session(struct ceph_mds_session *session,
3149 session->s_state = CEPH_MDS_SESSION_REJECTED; 3171 session->s_state = CEPH_MDS_SESSION_REJECTED;
3150 cleanup_session_requests(mdsc, session); 3172 cleanup_session_requests(mdsc, session);
3151 remove_session_caps(session); 3173 remove_session_caps(session);
3174 if (blacklisted)
3175 mdsc->fsc->blacklisted = true;
3152 wake = 2; /* for good measure */ 3176 wake = 2; /* for good measure */
3153 break; 3177 break;
3154 3178
@@ -3998,7 +4022,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
3998 mutex_unlock(&mdsc->mutex); 4022 mutex_unlock(&mdsc->mutex);
3999} 4023}
4000 4024
4025static void maybe_recover_session(struct ceph_mds_client *mdsc)
4026{
4027 struct ceph_fs_client *fsc = mdsc->fsc;
4028
4029 if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
4030 return;
4031
4032 if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
4033 return;
4034
4035 if (!READ_ONCE(fsc->blacklisted))
4036 return;
4037
4038 if (fsc->last_auto_reconnect &&
4039 time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
4040 return;
4001 4041
4042 pr_info("auto reconnect after blacklisted\n");
4043 fsc->last_auto_reconnect = jiffies;
4044 ceph_force_reconnect(fsc->sb);
4045}
4002 4046
4003/* 4047/*
4004 * delayed work -- periodically trim expired leases, renew caps with mds 4048 * delayed work -- periodically trim expired leases, renew caps with mds
@@ -4044,7 +4088,9 @@ static void delayed_work(struct work_struct *work)
4044 pr_info("mds%d hung\n", s->s_mds); 4088 pr_info("mds%d hung\n", s->s_mds);
4045 } 4089 }
4046 } 4090 }
4047 if (s->s_state < CEPH_MDS_SESSION_OPEN) { 4091 if (s->s_state == CEPH_MDS_SESSION_NEW ||
4092 s->s_state == CEPH_MDS_SESSION_RESTARTING ||
4093 s->s_state == CEPH_MDS_SESSION_REJECTED) {
4048 /* this mds is failed or recovering, just wait */ 4094 /* this mds is failed or recovering, just wait */
4049 ceph_put_mds_session(s); 4095 ceph_put_mds_session(s);
4050 continue; 4096 continue;
@@ -4072,6 +4118,8 @@ static void delayed_work(struct work_struct *work)
4072 4118
4073 ceph_trim_snapid_map(mdsc); 4119 ceph_trim_snapid_map(mdsc);
4074 4120
4121 maybe_recover_session(mdsc);
4122
4075 schedule_delayed(mdsc); 4123 schedule_delayed(mdsc);
4076} 4124}
4077 4125
@@ -4355,7 +4403,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
4355 session = __ceph_lookup_mds_session(mdsc, mds); 4403 session = __ceph_lookup_mds_session(mdsc, mds);
4356 if (!session) 4404 if (!session)
4357 continue; 4405 continue;
4406
4407 if (session->s_state == CEPH_MDS_SESSION_REJECTED)
4408 __unregister_session(mdsc, session);
4409 __wake_requests(mdsc, &session->s_waiting);
4358 mutex_unlock(&mdsc->mutex); 4410 mutex_unlock(&mdsc->mutex);
4411
4359 mutex_lock(&session->s_mutex); 4412 mutex_lock(&session->s_mutex);
4360 __close_session(mdsc, session); 4413 __close_session(mdsc, session);
4361 if (session->s_state == CEPH_MDS_SESSION_CLOSING) { 4414 if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
@@ -4364,6 +4417,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
4364 } 4417 }
4365 mutex_unlock(&session->s_mutex); 4418 mutex_unlock(&session->s_mutex);
4366 ceph_put_mds_session(session); 4419 ceph_put_mds_session(session);
4420
4367 mutex_lock(&mdsc->mutex); 4421 mutex_lock(&mdsc->mutex);
4368 kick_requests(mdsc, mds); 4422 kick_requests(mdsc, mds);
4369 } 4423 }
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index f7c8603484fe..5cd131b41d84 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -148,9 +148,9 @@ enum {
148 CEPH_MDS_SESSION_OPENING = 2, 148 CEPH_MDS_SESSION_OPENING = 2,
149 CEPH_MDS_SESSION_OPEN = 3, 149 CEPH_MDS_SESSION_OPEN = 3,
150 CEPH_MDS_SESSION_HUNG = 4, 150 CEPH_MDS_SESSION_HUNG = 4,
151 CEPH_MDS_SESSION_CLOSING = 5, 151 CEPH_MDS_SESSION_RESTARTING = 5,
152 CEPH_MDS_SESSION_RESTARTING = 6, 152 CEPH_MDS_SESSION_RECONNECTING = 6,
153 CEPH_MDS_SESSION_RECONNECTING = 7, 153 CEPH_MDS_SESSION_CLOSING = 7,
154 CEPH_MDS_SESSION_REJECTED = 8, 154 CEPH_MDS_SESSION_REJECTED = 8,
155}; 155};
156 156
@@ -176,7 +176,7 @@ struct ceph_mds_session {
176 spinlock_t s_cap_lock; 176 spinlock_t s_cap_lock;
177 struct list_head s_caps; /* all caps issued by this session */ 177 struct list_head s_caps; /* all caps issued by this session */
178 struct ceph_cap *s_cap_iterator; 178 struct ceph_cap *s_cap_iterator;
179 int s_nr_caps, s_trim_caps; 179 int s_nr_caps;
180 int s_num_cap_releases; 180 int s_num_cap_releases;
181 int s_cap_reconnect; 181 int s_cap_reconnect;
182 int s_readonly; 182 int s_readonly;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 377fafc76f20..edfd643a8205 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -143,6 +143,7 @@ enum {
143 Opt_snapdirname, 143 Opt_snapdirname,
144 Opt_mds_namespace, 144 Opt_mds_namespace,
145 Opt_fscache_uniq, 145 Opt_fscache_uniq,
146 Opt_recover_session,
146 Opt_last_string, 147 Opt_last_string,
147 /* string args above */ 148 /* string args above */
148 Opt_dirstat, 149 Opt_dirstat,
@@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = {
184 /* int args above */ 185 /* int args above */
185 {Opt_snapdirname, "snapdirname=%s"}, 186 {Opt_snapdirname, "snapdirname=%s"},
186 {Opt_mds_namespace, "mds_namespace=%s"}, 187 {Opt_mds_namespace, "mds_namespace=%s"},
188 {Opt_recover_session, "recover_session=%s"},
187 {Opt_fscache_uniq, "fsc=%s"}, 189 {Opt_fscache_uniq, "fsc=%s"},
188 /* string args above */ 190 /* string args above */
189 {Opt_dirstat, "dirstat"}, 191 {Opt_dirstat, "dirstat"},
@@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private)
254 if (!fsopt->mds_namespace) 256 if (!fsopt->mds_namespace)
255 return -ENOMEM; 257 return -ENOMEM;
256 break; 258 break;
259 case Opt_recover_session:
260 if (!strncmp(argstr[0].from, "no",
261 argstr[0].to - argstr[0].from)) {
262 fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER;
263 } else if (!strncmp(argstr[0].from, "clean",
264 argstr[0].to - argstr[0].from)) {
265 fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER;
266 } else {
267 return -EINVAL;
268 }
269 break;
257 case Opt_fscache_uniq: 270 case Opt_fscache_uniq:
258 kfree(fsopt->fscache_uniq); 271 kfree(fsopt->fscache_uniq);
259 fsopt->fscache_uniq = kstrndup(argstr[0].from, 272 fsopt->fscache_uniq = kstrndup(argstr[0].from,
@@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
576 589
577 if (fsopt->mds_namespace) 590 if (fsopt->mds_namespace)
578 seq_show_option(m, "mds_namespace", fsopt->mds_namespace); 591 seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
592
593 if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
594 seq_show_option(m, "recover_session", "clean");
595
579 if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) 596 if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
580 seq_printf(m, ",wsize=%d", fsopt->wsize); 597 seq_printf(m, ",wsize=%d", fsopt->wsize);
581 if (fsopt->rsize != CEPH_MAX_READ_SIZE) 598 if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -664,6 +681,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
664 681
665 fsc->sb = NULL; 682 fsc->sb = NULL;
666 fsc->mount_state = CEPH_MOUNT_MOUNTING; 683 fsc->mount_state = CEPH_MOUNT_MOUNTING;
684 fsc->filp_gen = 1;
667 685
668 atomic_long_set(&fsc->writeback_count, 0); 686 atomic_long_set(&fsc->writeback_count, 0);
669 687
@@ -713,6 +731,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
713{ 731{
714 dout("destroy_fs_client %p\n", fsc); 732 dout("destroy_fs_client %p\n", fsc);
715 733
734 ceph_mdsc_destroy(fsc);
716 destroy_workqueue(fsc->inode_wq); 735 destroy_workqueue(fsc->inode_wq);
717 destroy_workqueue(fsc->cap_wq); 736 destroy_workqueue(fsc->cap_wq);
718 737
@@ -829,7 +848,7 @@ static void ceph_umount_begin(struct super_block *sb)
829 fsc->mount_state = CEPH_MOUNT_SHUTDOWN; 848 fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
830 ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); 849 ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
831 ceph_mdsc_force_umount(fsc->mdsc); 850 ceph_mdsc_force_umount(fsc->mdsc);
832 return; 851 fsc->filp_gen++; // invalidate open files
833} 852}
834 853
835static int ceph_remount(struct super_block *sb, int *flags, char *data) 854static int ceph_remount(struct super_block *sb, int *flags, char *data)
@@ -1089,7 +1108,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
1089 } 1108 }
1090 1109
1091 if (ceph_sb_to_client(sb) != fsc) { 1110 if (ceph_sb_to_client(sb) != fsc) {
1092 ceph_mdsc_destroy(fsc);
1093 destroy_fs_client(fsc); 1111 destroy_fs_client(fsc);
1094 fsc = ceph_sb_to_client(sb); 1112 fsc = ceph_sb_to_client(sb);
1095 dout("get_sb got existing client %p\n", fsc); 1113 dout("get_sb got existing client %p\n", fsc);
@@ -1115,7 +1133,6 @@ out_splat:
1115 goto out_final; 1133 goto out_final;
1116 1134
1117out: 1135out:
1118 ceph_mdsc_destroy(fsc);
1119 destroy_fs_client(fsc); 1136 destroy_fs_client(fsc);
1120out_final: 1137out_final:
1121 dout("ceph_mount fail %ld\n", PTR_ERR(res)); 1138 dout("ceph_mount fail %ld\n", PTR_ERR(res));
@@ -1139,8 +1156,6 @@ static void ceph_kill_sb(struct super_block *s)
1139 1156
1140 ceph_fscache_unregister_fs(fsc); 1157 ceph_fscache_unregister_fs(fsc);
1141 1158
1142 ceph_mdsc_destroy(fsc);
1143
1144 destroy_fs_client(fsc); 1159 destroy_fs_client(fsc);
1145 free_anon_bdev(dev); 1160 free_anon_bdev(dev);
1146} 1161}
@@ -1154,6 +1169,33 @@ static struct file_system_type ceph_fs_type = {
1154}; 1169};
1155MODULE_ALIAS_FS("ceph"); 1170MODULE_ALIAS_FS("ceph");
1156 1171
1172int ceph_force_reconnect(struct super_block *sb)
1173{
1174 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
1175 int err = 0;
1176
1177 ceph_umount_begin(sb);
1178
1179 /* Make sure all page caches get invalidated.
1180 * see remove_session_caps_cb() */
1181 flush_workqueue(fsc->inode_wq);
1182
1183 /* In case we were blacklisted. This also resets
1184 * all mon/osd connections */
1185 ceph_reset_client_addr(fsc->client);
1186
1187 ceph_osdc_clear_abort_err(&fsc->client->osdc);
1188
1189 fsc->blacklisted = false;
1190 fsc->mount_state = CEPH_MOUNT_MOUNTED;
1191
1192 if (sb->s_root) {
1193 err = __ceph_do_getattr(d_inode(sb->s_root), NULL,
1194 CEPH_STAT_CAP_INODE, true);
1195 }
1196 return err;
1197}
1198
1157static int __init init_ceph(void) 1199static int __init init_ceph(void)
1158{ 1200{
1159 int ret = init_caches(); 1201 int ret = init_caches();
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6b9f1ee7de85..f98d9247f9cb 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -16,6 +16,7 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/posix_acl.h> 17#include <linux/posix_acl.h>
18#include <linux/refcount.h> 18#include <linux/refcount.h>
19#include <linux/security.h>
19 20
20#include <linux/ceph/libceph.h> 21#include <linux/ceph/libceph.h>
21 22
@@ -31,6 +32,7 @@
31#define CEPH_BLOCK_SHIFT 22 /* 4 MB */ 32#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
32#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 33#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
33 34
35#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reconnect (clean mode) after blacklisted */
34#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ 36#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
35#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ 37#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
36#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ 38#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
@@ -101,6 +103,11 @@ struct ceph_fs_client {
101 struct ceph_client *client; 103 struct ceph_client *client;
102 104
103 unsigned long mount_state; 105 unsigned long mount_state;
106
107 unsigned long last_auto_reconnect;
108 bool blacklisted;
109
110 u32 filp_gen;
104 loff_t max_file_size; 111 loff_t max_file_size;
105 112
106 struct ceph_mds_client *mdsc; 113 struct ceph_mds_client *mdsc;
@@ -395,6 +402,8 @@ struct ceph_inode_info {
395 struct fscache_cookie *fscache; 402 struct fscache_cookie *fscache;
396 u32 i_fscache_gen; 403 u32 i_fscache_gen;
397#endif 404#endif
405 errseq_t i_meta_err;
406
398 struct inode vfs_inode; /* at end */ 407 struct inode vfs_inode; /* at end */
399}; 408};
400 409
@@ -499,17 +508,16 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
499#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ 508#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
500#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ 509#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */
501#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ 510#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
502#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ 511#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */
503#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ 512#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */
504#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ 513#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */
505#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ 514#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */
506#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ 515#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */
507#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ 516#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */
508#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ 517#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snaps */
509#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snaps */ 518#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */
510#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ 519#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */
511#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ 520#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */
512
513 521
514/* 522/*
515 * Masks of ceph inode work. 523 * Masks of ceph inode work.
@@ -703,6 +711,10 @@ struct ceph_file_info {
703 711
704 spinlock_t rw_contexts_lock; 712 spinlock_t rw_contexts_lock;
705 struct list_head rw_contexts; 713 struct list_head rw_contexts;
714
715 errseq_t meta_err;
716 u32 filp_gen;
717 atomic_t num_locks;
706}; 718};
707 719
708struct ceph_dir_file_info { 720struct ceph_dir_file_info {
@@ -842,7 +854,8 @@ static inline int default_congestion_kb(void)
842} 854}
843 855
844 856
845 857/* super.c */
858extern int ceph_force_reconnect(struct super_block *sb);
846/* snap.c */ 859/* snap.c */
847struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 860struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
848 u64 ino); 861 u64 ino);
@@ -959,7 +972,10 @@ static inline bool ceph_security_xattr_wanted(struct inode *in)
959#ifdef CONFIG_CEPH_FS_SECURITY_LABEL 972#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
960extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, 973extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
961 struct ceph_acl_sec_ctx *ctx); 974 struct ceph_acl_sec_ctx *ctx);
962extern void ceph_security_invalidate_secctx(struct inode *inode); 975static inline void ceph_security_invalidate_secctx(struct inode *inode)
976{
977 security_inode_invalidate_secctx(inode);
978}
963#else 979#else
964static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, 980static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
965 struct ceph_acl_sec_ctx *ctx) 981 struct ceph_acl_sec_ctx *ctx)
@@ -1039,7 +1055,6 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1039 struct ceph_mds_session *session); 1055 struct ceph_mds_session *session);
1040extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, 1056extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
1041 int mds); 1057 int mds);
1042extern int ceph_get_cap_mds(struct inode *inode);
1043extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); 1058extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
1044extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); 1059extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
1045extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, 1060extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
@@ -1058,9 +1073,9 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
1058 struct inode *dir, 1073 struct inode *dir,
1059 int mds, int drop, int unless); 1074 int mds, int drop, int unless);
1060 1075
1061extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 1076extern int ceph_get_caps(struct file *filp, int need, int want,
1062 loff_t endoff, int *got, struct page **pinned_page); 1077 loff_t endoff, int *got, struct page **pinned_page);
1063extern int ceph_try_get_caps(struct ceph_inode_info *ci, 1078extern int ceph_try_get_caps(struct inode *inode,
1064 int need, int want, bool nonblock, int *got); 1079 int need, int want, bool nonblock, int *got);
1065 1080
1066/* for counting open files by mode */ 1081/* for counting open files by mode */
@@ -1071,7 +1086,7 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
1071extern const struct address_space_operations ceph_aops; 1086extern const struct address_space_operations ceph_aops;
1072extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); 1087extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
1073extern int ceph_uninline_data(struct file *filp, struct page *locked_page); 1088extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
1074extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); 1089extern int ceph_pool_perm_check(struct inode *inode, int need);
1075extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); 1090extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
1076 1091
1077/* file.c */ 1092/* file.c */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 939eab7aa219..cb18ee637cb7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -20,7 +20,8 @@ static int __remove_xattr(struct ceph_inode_info *ci,
20 20
21static bool ceph_is_valid_xattr(const char *name) 21static bool ceph_is_valid_xattr(const char *name)
22{ 22{
23 return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || 23 return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
24 !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
24 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 25 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
25 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 26 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
26} 27}
@@ -892,7 +893,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
892 memcpy(value, xattr->val, xattr->val_len); 893 memcpy(value, xattr->val, xattr->val_len);
893 894
894 if (current->journal_info && 895 if (current->journal_info &&
895 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) 896 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
897 security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN))
896 ci->i_ceph_flags |= CEPH_I_SEC_INITED; 898 ci->i_ceph_flags |= CEPH_I_SEC_INITED;
897out: 899out:
898 spin_unlock(&ci->i_ceph_lock); 900 spin_unlock(&ci->i_ceph_lock);
@@ -903,11 +905,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
903{ 905{
904 struct inode *inode = d_inode(dentry); 906 struct inode *inode = d_inode(dentry);
905 struct ceph_inode_info *ci = ceph_inode(inode); 907 struct ceph_inode_info *ci = ceph_inode(inode);
906 struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
907 bool len_only = (size == 0); 908 bool len_only = (size == 0);
908 u32 namelen; 909 u32 namelen;
909 int err; 910 int err;
910 int i;
911 911
912 spin_lock(&ci->i_ceph_lock); 912 spin_lock(&ci->i_ceph_lock);
913 dout("listxattr %p ver=%lld index_ver=%lld\n", inode, 913 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
@@ -936,33 +936,6 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
936 names = __copy_xattr_names(ci, names); 936 names = __copy_xattr_names(ci, names);
937 size -= namelen; 937 size -= namelen;
938 } 938 }
939
940
941 /* virtual xattr names, too */
942 if (vxattrs) {
943 for (i = 0; vxattrs[i].name; i++) {
944 size_t this_len;
945
946 if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN)
947 continue;
948 if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci))
949 continue;
950
951 this_len = strlen(vxattrs[i].name) + 1;
952 namelen += this_len;
953 if (len_only)
954 continue;
955
956 if (this_len > size) {
957 err = -ERANGE;
958 goto out;
959 }
960
961 memcpy(names, vxattrs[i].name, this_len);
962 names += this_len;
963 size -= this_len;
964 }
965 }
966 err = namelen; 939 err = namelen;
967out: 940out:
968 spin_unlock(&ci->i_ceph_lock); 941 spin_unlock(&ci->i_ceph_lock);
@@ -1293,42 +1266,8 @@ out:
1293 ceph_pagelist_release(pagelist); 1266 ceph_pagelist_release(pagelist);
1294 return err; 1267 return err;
1295} 1268}
1296 1269#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */
1297void ceph_security_invalidate_secctx(struct inode *inode) 1270#endif /* CONFIG_SECURITY */
1298{
1299 security_inode_invalidate_secctx(inode);
1300}
1301
1302static int ceph_xattr_set_security_label(const struct xattr_handler *handler,
1303 struct dentry *unused, struct inode *inode,
1304 const char *key, const void *buf,
1305 size_t buflen, int flags)
1306{
1307 if (security_ismaclabel(key)) {
1308 const char *name = xattr_full_name(handler, key);
1309 return __ceph_setxattr(inode, name, buf, buflen, flags);
1310 }
1311 return -EOPNOTSUPP;
1312}
1313
1314static int ceph_xattr_get_security_label(const struct xattr_handler *handler,
1315 struct dentry *unused, struct inode *inode,
1316 const char *key, void *buf, size_t buflen)
1317{
1318 if (security_ismaclabel(key)) {
1319 const char *name = xattr_full_name(handler, key);
1320 return __ceph_getxattr(inode, name, buf, buflen);
1321 }
1322 return -EOPNOTSUPP;
1323}
1324
1325static const struct xattr_handler ceph_security_label_handler = {
1326 .prefix = XATTR_SECURITY_PREFIX,
1327 .get = ceph_xattr_get_security_label,
1328 .set = ceph_xattr_set_security_label,
1329};
1330#endif
1331#endif
1332 1271
1333void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) 1272void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
1334{ 1273{
@@ -1352,9 +1291,6 @@ const struct xattr_handler *ceph_xattr_handlers[] = {
1352 &posix_acl_access_xattr_handler, 1291 &posix_acl_access_xattr_handler,
1353 &posix_acl_default_xattr_handler, 1292 &posix_acl_default_xattr_handler,
1354#endif 1293#endif
1355#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
1356 &ceph_security_label_handler,
1357#endif
1358 &ceph_other_xattr_handler, 1294 &ceph_other_xattr_handler,
1359 NULL, 1295 NULL,
1360}; 1296};
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 82156da3c650..b9dbda1c26aa 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -293,6 +293,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private);
293struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); 293struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client);
294u64 ceph_client_gid(struct ceph_client *client); 294u64 ceph_client_gid(struct ceph_client *client);
295extern void ceph_destroy_client(struct ceph_client *client); 295extern void ceph_destroy_client(struct ceph_client *client);
296extern void ceph_reset_client_addr(struct ceph_client *client);
296extern int __ceph_open_session(struct ceph_client *client, 297extern int __ceph_open_session(struct ceph_client *client,
297 unsigned long started); 298 unsigned long started);
298extern int ceph_open_session(struct ceph_client *client); 299extern int ceph_open_session(struct ceph_client *client);
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 23895d178149..c4458dc6a757 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -337,6 +337,7 @@ extern void ceph_msgr_flush(void);
337extern void ceph_messenger_init(struct ceph_messenger *msgr, 337extern void ceph_messenger_init(struct ceph_messenger *msgr,
338 struct ceph_entity_addr *myaddr); 338 struct ceph_entity_addr *myaddr);
339extern void ceph_messenger_fini(struct ceph_messenger *msgr); 339extern void ceph_messenger_fini(struct ceph_messenger *msgr);
340extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr);
340 341
341extern void ceph_con_init(struct ceph_connection *con, void *private, 342extern void ceph_con_init(struct ceph_connection *con, void *private,
342 const struct ceph_connection_operations *ops, 343 const struct ceph_connection_operations *ops,
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index b4d134d3312a..dbb8a6959a73 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -109,6 +109,7 @@ extern int ceph_monmap_contains(struct ceph_monmap *m,
109 109
110extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); 110extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
111extern void ceph_monc_stop(struct ceph_mon_client *monc); 111extern void ceph_monc_stop(struct ceph_mon_client *monc);
112extern void ceph_monc_reopen_session(struct ceph_mon_client *monc);
112 113
113enum { 114enum {
114 CEPH_SUB_MONMAP = 0, 115 CEPH_SUB_MONMAP = 0,
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index ad7fe5d10dcd..eaffbdddf89a 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -381,6 +381,7 @@ extern void ceph_osdc_cleanup(void);
381extern int ceph_osdc_init(struct ceph_osd_client *osdc, 381extern int ceph_osdc_init(struct ceph_osd_client *osdc,
382 struct ceph_client *client); 382 struct ceph_client *client);
383extern void ceph_osdc_stop(struct ceph_osd_client *osdc); 383extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
384extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc);
384 385
385extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, 386extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
386 struct ceph_msg *msg); 387 struct ceph_msg *msg);
@@ -388,6 +389,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
388 struct ceph_msg *msg); 389 struct ceph_msg *msg);
389void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); 390void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
390void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); 391void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err);
392void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc);
391 393
392#define osd_req_op_data(oreq, whch, typ, fld) \ 394#define osd_req_op_data(oreq, whch, typ, fld) \
393({ \ 395({ \
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4eeea4d5c3ef..2d568246803f 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -13,6 +13,7 @@
13#include <linux/nsproxy.h> 13#include <linux/nsproxy.h>
14#include <linux/parser.h> 14#include <linux/parser.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/sched/mm.h>
16#include <linux/seq_file.h> 17#include <linux/seq_file.h>
17#include <linux/slab.h> 18#include <linux/slab.h>
18#include <linux/statfs.h> 19#include <linux/statfs.h>
@@ -185,18 +186,34 @@ int ceph_compare_options(struct ceph_options *new_opt,
185} 186}
186EXPORT_SYMBOL(ceph_compare_options); 187EXPORT_SYMBOL(ceph_compare_options);
187 188
189/*
190 * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are
191 * compatible with (a superset of) GFP_KERNEL. This is because while the
192 * actual pages are allocated with the specified flags, the page table pages
193 * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take
194 * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc().
195 *
196 * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO.
197 */
188void *ceph_kvmalloc(size_t size, gfp_t flags) 198void *ceph_kvmalloc(size_t size, gfp_t flags)
189{ 199{
190 if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 200 void *p;
191 void *ptr = kmalloc(size, flags | __GFP_NOWARN); 201
192 if (ptr) 202 if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) {
193 return ptr; 203 p = kvmalloc(size, flags);
204 } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) {
205 unsigned int nofs_flag = memalloc_nofs_save();
206 p = kvmalloc(size, GFP_KERNEL);
207 memalloc_nofs_restore(nofs_flag);
208 } else {
209 unsigned int noio_flag = memalloc_noio_save();
210 p = kvmalloc(size, GFP_KERNEL);
211 memalloc_noio_restore(noio_flag);
194 } 212 }
195 213
196 return __vmalloc(size, flags, PAGE_KERNEL); 214 return p;
197} 215}
198 216
199
200static int parse_fsid(const char *str, struct ceph_fsid *fsid) 217static int parse_fsid(const char *str, struct ceph_fsid *fsid)
201{ 218{
202 int i = 0; 219 int i = 0;
@@ -694,6 +711,14 @@ void ceph_destroy_client(struct ceph_client *client)
694} 711}
695EXPORT_SYMBOL(ceph_destroy_client); 712EXPORT_SYMBOL(ceph_destroy_client);
696 713
714void ceph_reset_client_addr(struct ceph_client *client)
715{
716 ceph_messenger_reset_nonce(&client->msgr);
717 ceph_monc_reopen_session(&client->monc);
718 ceph_osdc_reopen_osds(&client->osdc);
719}
720EXPORT_SYMBOL(ceph_reset_client_addr);
721
697/* 722/*
698 * true if we have the mon map (and have thus joined the cluster) 723 * true if we have the mon map (and have thus joined the cluster)
699 */ 724 */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 962f521c863e..e4cb3db2ee77 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -3031,6 +3031,12 @@ static void con_fault(struct ceph_connection *con)
3031} 3031}
3032 3032
3033 3033
3034void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
3035{
3036 u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
3037 msgr->inst.addr.nonce = cpu_to_le32(nonce);
3038 encode_my_addr(msgr);
3039}
3034 3040
3035/* 3041/*
3036 * initialize a new messenger instance 3042 * initialize a new messenger instance
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 0520bf9825aa..7256c402ebaa 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -213,6 +213,13 @@ static void reopen_session(struct ceph_mon_client *monc)
213 __open_session(monc); 213 __open_session(monc);
214} 214}
215 215
216void ceph_monc_reopen_session(struct ceph_mon_client *monc)
217{
218 mutex_lock(&monc->mutex);
219 reopen_session(monc);
220 mutex_unlock(&monc->mutex);
221}
222
216static void un_backoff(struct ceph_mon_client *monc) 223static void un_backoff(struct ceph_mon_client *monc)
217{ 224{
218 monc->hunt_mult /= 2; /* reduce by 50% */ 225 monc->hunt_mult /= 2; /* reduce by 50% */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 78ae6e8c953d..ba45b074a362 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -841,6 +841,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
841 struct ceph_pagelist *pagelist; 841 struct ceph_pagelist *pagelist;
842 size_t payload_len = 0; 842 size_t payload_len = 0;
843 size_t size; 843 size_t size;
844 int ret;
844 845
845 op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); 846 op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0);
846 847
@@ -852,20 +853,27 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
852 size = strlen(class); 853 size = strlen(class);
853 BUG_ON(size > (size_t) U8_MAX); 854 BUG_ON(size > (size_t) U8_MAX);
854 op->cls.class_len = size; 855 op->cls.class_len = size;
855 ceph_pagelist_append(pagelist, class, size); 856 ret = ceph_pagelist_append(pagelist, class, size);
857 if (ret)
858 goto err_pagelist_free;
856 payload_len += size; 859 payload_len += size;
857 860
858 op->cls.method_name = method; 861 op->cls.method_name = method;
859 size = strlen(method); 862 size = strlen(method);
860 BUG_ON(size > (size_t) U8_MAX); 863 BUG_ON(size > (size_t) U8_MAX);
861 op->cls.method_len = size; 864 op->cls.method_len = size;
862 ceph_pagelist_append(pagelist, method, size); 865 ret = ceph_pagelist_append(pagelist, method, size);
866 if (ret)
867 goto err_pagelist_free;
863 payload_len += size; 868 payload_len += size;
864 869
865 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); 870 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
866
867 op->indata_len = payload_len; 871 op->indata_len = payload_len;
868 return 0; 872 return 0;
873
874err_pagelist_free:
875 ceph_pagelist_release(pagelist);
876 return ret;
869} 877}
870EXPORT_SYMBOL(osd_req_op_cls_init); 878EXPORT_SYMBOL(osd_req_op_cls_init);
871 879
@@ -877,6 +885,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
877 opcode, 0); 885 opcode, 0);
878 struct ceph_pagelist *pagelist; 886 struct ceph_pagelist *pagelist;
879 size_t payload_len; 887 size_t payload_len;
888 int ret;
880 889
881 BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); 890 BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
882 891
@@ -886,10 +895,14 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
886 895
887 payload_len = strlen(name); 896 payload_len = strlen(name);
888 op->xattr.name_len = payload_len; 897 op->xattr.name_len = payload_len;
889 ceph_pagelist_append(pagelist, name, payload_len); 898 ret = ceph_pagelist_append(pagelist, name, payload_len);
899 if (ret)
900 goto err_pagelist_free;
890 901
891 op->xattr.value_len = size; 902 op->xattr.value_len = size;
892 ceph_pagelist_append(pagelist, value, size); 903 ret = ceph_pagelist_append(pagelist, value, size);
904 if (ret)
905 goto err_pagelist_free;
893 payload_len += size; 906 payload_len += size;
894 907
895 op->xattr.cmp_op = cmp_op; 908 op->xattr.cmp_op = cmp_op;
@@ -898,6 +911,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
898 ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); 911 ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
899 op->indata_len = payload_len; 912 op->indata_len = payload_len;
900 return 0; 913 return 0;
914
915err_pagelist_free:
916 ceph_pagelist_release(pagelist);
917 return ret;
901} 918}
902EXPORT_SYMBOL(osd_req_op_xattr_init); 919EXPORT_SYMBOL(osd_req_op_xattr_init);
903 920
@@ -1488,7 +1505,6 @@ enum calc_target_result {
1488 1505
1489static enum calc_target_result calc_target(struct ceph_osd_client *osdc, 1506static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
1490 struct ceph_osd_request_target *t, 1507 struct ceph_osd_request_target *t,
1491 struct ceph_connection *con,
1492 bool any_change) 1508 bool any_change)
1493{ 1509{
1494 struct ceph_pg_pool_info *pi; 1510 struct ceph_pg_pool_info *pi;
@@ -2272,7 +2288,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
2272 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); 2288 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
2273 2289
2274again: 2290again:
2275 ct_res = calc_target(osdc, &req->r_t, NULL, false); 2291 ct_res = calc_target(osdc, &req->r_t, false);
2276 if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) 2292 if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
2277 goto promote; 2293 goto promote;
2278 2294
@@ -2476,6 +2492,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err)
2476} 2492}
2477EXPORT_SYMBOL(ceph_osdc_abort_requests); 2493EXPORT_SYMBOL(ceph_osdc_abort_requests);
2478 2494
2495void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc)
2496{
2497 down_write(&osdc->lock);
2498 osdc->abort_err = 0;
2499 up_write(&osdc->lock);
2500}
2501EXPORT_SYMBOL(ceph_osdc_clear_abort_err);
2502
2479static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) 2503static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
2480{ 2504{
2481 if (likely(eb > osdc->epoch_barrier)) { 2505 if (likely(eb > osdc->epoch_barrier)) {
@@ -3087,7 +3111,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
3087 lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; 3111 lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id;
3088 } 3112 }
3089 3113
3090 calc_target(osdc, &lreq->t, NULL, false); 3114 calc_target(osdc, &lreq->t, false);
3091 osd = lookup_create_osd(osdc, lreq->t.osd, true); 3115 osd = lookup_create_osd(osdc, lreq->t.osd, true);
3092 link_linger(osd, lreq); 3116 link_linger(osd, lreq);
3093 3117
@@ -3704,7 +3728,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq)
3704 struct ceph_osd_client *osdc = lreq->osdc; 3728 struct ceph_osd_client *osdc = lreq->osdc;
3705 enum calc_target_result ct_res; 3729 enum calc_target_result ct_res;
3706 3730
3707 ct_res = calc_target(osdc, &lreq->t, NULL, true); 3731 ct_res = calc_target(osdc, &lreq->t, true);
3708 if (ct_res == CALC_TARGET_NEED_RESEND) { 3732 if (ct_res == CALC_TARGET_NEED_RESEND) {
3709 struct ceph_osd *osd; 3733 struct ceph_osd *osd;
3710 3734
@@ -3776,8 +3800,7 @@ static void scan_requests(struct ceph_osd *osd,
3776 n = rb_next(n); /* unlink_request(), check_pool_dne() */ 3800 n = rb_next(n); /* unlink_request(), check_pool_dne() */
3777 3801
3778 dout("%s req %p tid %llu\n", __func__, req, req->r_tid); 3802 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
3779 ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, 3803 ct_res = calc_target(osdc, &req->r_t, false);
3780 false);
3781 switch (ct_res) { 3804 switch (ct_res) {
3782 case CALC_TARGET_NO_ACTION: 3805 case CALC_TARGET_NO_ACTION:
3783 force_resend_writes = cleared_full || 3806 force_resend_writes = cleared_full ||
@@ -3886,7 +3909,7 @@ static void kick_requests(struct ceph_osd_client *osdc,
3886 n = rb_next(n); 3909 n = rb_next(n);
3887 3910
3888 if (req->r_t.epoch < osdc->osdmap->epoch) { 3911 if (req->r_t.epoch < osdc->osdmap->epoch) {
3889 ct_res = calc_target(osdc, &req->r_t, NULL, false); 3912 ct_res = calc_target(osdc, &req->r_t, false);
3890 if (ct_res == CALC_TARGET_POOL_DNE) { 3913 if (ct_res == CALC_TARGET_POOL_DNE) {
3891 erase_request(need_resend, req); 3914 erase_request(need_resend, req);
3892 check_pool_dne(req); 3915 check_pool_dne(req);
@@ -5087,6 +5110,24 @@ out_put_req:
5087EXPORT_SYMBOL(ceph_osdc_call); 5110EXPORT_SYMBOL(ceph_osdc_call);
5088 5111
5089/* 5112/*
5113 * reset all osd connections
5114 */
5115void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
5116{
5117 struct rb_node *n;
5118
5119 down_write(&osdc->lock);
5120 for (n = rb_first(&osdc->osds); n; ) {
5121 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
5122
5123 n = rb_next(n);
5124 if (!reopen_osd(osd))
5125 kick_osd_requests(osd);
5126 }
5127 up_write(&osdc->lock);
5128}
5129
5130/*
5090 * init, shutdown 5131 * init, shutdown
5091 */ 5132 */
5092int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) 5133int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 90437906b7bc..4e0de14f80bb 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
973 struct ceph_pg_pool_info, node); 973 struct ceph_pg_pool_info, node);
974 __remove_pg_pool(&map->pg_pools, pi); 974 __remove_pg_pool(&map->pg_pools, pi);
975 } 975 }
976 kfree(map->osd_state); 976 kvfree(map->osd_state);
977 kfree(map->osd_weight); 977 kvfree(map->osd_weight);
978 kfree(map->osd_addr); 978 kvfree(map->osd_addr);
979 kfree(map->osd_primary_affinity); 979 kvfree(map->osd_primary_affinity);
980 kfree(map->crush_workspace); 980 kvfree(map->crush_workspace);
981 kfree(map); 981 kfree(map);
982} 982}
983 983
@@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
986 * 986 *
987 * The new elements are properly initialized. 987 * The new elements are properly initialized.
988 */ 988 */
989static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) 989static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
990{ 990{
991 u32 *state; 991 u32 *state;
992 u32 *weight; 992 u32 *weight;
993 struct ceph_entity_addr *addr; 993 struct ceph_entity_addr *addr;
994 u32 to_copy;
994 int i; 995 int i;
995 996
996 state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); 997 dout("%s old %u new %u\n", __func__, map->max_osd, max);
997 if (!state) 998 if (max == map->max_osd)
998 return -ENOMEM; 999 return 0;
999 map->osd_state = state;
1000 1000
1001 weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); 1001 state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
1002 if (!weight) 1002 weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
1003 addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
1004 if (!state || !weight || !addr) {
1005 kvfree(state);
1006 kvfree(weight);
1007 kvfree(addr);
1003 return -ENOMEM; 1008 return -ENOMEM;
1004 map->osd_weight = weight; 1009 }
1005 1010
1006 addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); 1011 to_copy = min(map->max_osd, max);
1007 if (!addr) 1012 if (map->osd_state) {
1008 return -ENOMEM; 1013 memcpy(state, map->osd_state, to_copy * sizeof(*state));
1009 map->osd_addr = addr; 1014 memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
1015 memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
1016 kvfree(map->osd_state);
1017 kvfree(map->osd_weight);
1018 kvfree(map->osd_addr);
1019 }
1010 1020
1021 map->osd_state = state;
1022 map->osd_weight = weight;
1023 map->osd_addr = addr;
1011 for (i = map->max_osd; i < max; i++) { 1024 for (i = map->max_osd; i < max; i++) {
1012 map->osd_state[i] = 0; 1025 map->osd_state[i] = 0;
1013 map->osd_weight[i] = CEPH_OSD_OUT; 1026 map->osd_weight[i] = CEPH_OSD_OUT;
@@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
1017 if (map->osd_primary_affinity) { 1030 if (map->osd_primary_affinity) {
1018 u32 *affinity; 1031 u32 *affinity;
1019 1032
1020 affinity = krealloc(map->osd_primary_affinity, 1033 affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)),
1021 max*sizeof(*affinity), GFP_NOFS); 1034 GFP_NOFS);
1022 if (!affinity) 1035 if (!affinity)
1023 return -ENOMEM; 1036 return -ENOMEM;
1024 map->osd_primary_affinity = affinity;
1025 1037
1038 memcpy(affinity, map->osd_primary_affinity,
1039 to_copy * sizeof(*affinity));
1040 kvfree(map->osd_primary_affinity);
1041
1042 map->osd_primary_affinity = affinity;
1026 for (i = map->max_osd; i < max; i++) 1043 for (i = map->max_osd; i < max; i++)
1027 map->osd_primary_affinity[i] = 1044 map->osd_primary_affinity[i] =
1028 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; 1045 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
@@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
1043 1060
1044 work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); 1061 work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
1045 dout("%s work_size %zu bytes\n", __func__, work_size); 1062 dout("%s work_size %zu bytes\n", __func__, work_size);
1046 workspace = kmalloc(work_size, GFP_NOIO); 1063 workspace = ceph_kvmalloc(work_size, GFP_NOIO);
1047 if (!workspace) { 1064 if (!workspace) {
1048 crush_destroy(crush); 1065 crush_destroy(crush);
1049 return -ENOMEM; 1066 return -ENOMEM;
@@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
1052 1069
1053 if (map->crush) 1070 if (map->crush)
1054 crush_destroy(map->crush); 1071 crush_destroy(map->crush);
1055 kfree(map->crush_workspace); 1072 kvfree(map->crush_workspace);
1056 map->crush = crush; 1073 map->crush = crush;
1057 map->crush_workspace = workspace; 1074 map->crush_workspace = workspace;
1058 return 0; 1075 return 0;
@@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
1298 if (!map->osd_primary_affinity) { 1315 if (!map->osd_primary_affinity) {
1299 int i; 1316 int i;
1300 1317
1301 map->osd_primary_affinity = kmalloc_array(map->max_osd, 1318 map->osd_primary_affinity = ceph_kvmalloc(
1302 sizeof(u32), 1319 array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
1303 GFP_NOFS); 1320 GFP_NOFS);
1304 if (!map->osd_primary_affinity) 1321 if (!map->osd_primary_affinity)
1305 return -ENOMEM; 1322 return -ENOMEM;
1306 1323
@@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end,
1321 1338
1322 ceph_decode_32_safe(p, end, len, e_inval); 1339 ceph_decode_32_safe(p, end, len, e_inval);
1323 if (len == 0) { 1340 if (len == 0) {
1324 kfree(map->osd_primary_affinity); 1341 kvfree(map->osd_primary_affinity);
1325 map->osd_primary_affinity = NULL; 1342 map->osd_primary_affinity = NULL;
1326 return 0; 1343 return 0;
1327 } 1344 }