summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/ceph.txt14
-rw-r--r--MAINTAINERS2
-rw-r--r--drivers/block/rbd.c18
-rw-r--r--drivers/char/tpm/tpm-interface.c23
-rw-r--r--fs/ceph/Makefile2
-rw-r--r--fs/ceph/addr.c61
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c173
-rw-r--r--fs/ceph/debugfs.c1
-rw-r--r--fs/ceph/export.c60
-rw-r--r--fs/ceph/file.c104
-rw-r--r--fs/ceph/inode.c50
-rw-r--r--fs/ceph/io.c163
-rw-r--r--fs/ceph/io.h12
-rw-r--r--fs/ceph/locks.c8
-rw-r--r--fs/ceph/mds_client.c110
-rw-r--r--fs/ceph/mds_client.h8
-rw-r--r--fs/ceph/super.c52
-rw-r--r--fs/ceph/super.h49
-rw-r--r--fs/ceph/xattr.c76
-rw-r--r--fs/fs_context.c14
-rw-r--r--fs/fuse/cuse.c101
-rw-r--r--fs/fuse/dev.c654
-rw-r--r--fs/fuse/dir.c283
-rw-r--r--fs/fuse/file.c1227
-rw-r--r--fs/fuse/fuse_i.h350
-rw-r--r--fs/fuse/inode.c553
-rw-r--r--fs/fuse/readdir.c72
-rw-r--r--fs/fuse/xattr.c76
-rw-r--r--fs/iomap/direct-io.c24
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/proc_namespace.c2
-rw-r--r--fs/super.c5
-rw-r--r--fs/xfs/xfs_file.c14
-rw-r--r--include/linux/ceph/libceph.h1
-rw-r--r--include/linux/ceph/messenger.h1
-rw-r--r--include/linux/ceph/mon_client.h1
-rw-r--r--include/linux/ceph/osd_client.h2
-rw-r--r--include/linux/fs_context.h1
-rw-r--r--include/linux/iomap.h10
-rw-r--r--include/uapi/linux/fuse.h4
-rw-r--r--net/ceph/ceph_common.c37
-rw-r--r--net/ceph/messenger.c6
-rw-r--r--net/ceph/mon_client.c7
-rw-r--r--net/ceph/osd_client.c65
-rw-r--r--net/ceph/osdmap.c69
-rw-r--r--security/keys/trusted.c5
-rw-r--r--tools/testing/selftests/.gitignore2
-rw-r--r--tools/testing/selftests/tpm2/Makefile1
49 files changed, 2544 insertions, 2033 deletions
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
index d2c6a5ccf0f5..b19b6a03f91c 100644
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@@ -158,6 +158,20 @@ Mount Options
158 copies. Currently, it's only used in copy_file_range, which will revert 158 copies. Currently, it's only used in copy_file_range, which will revert
159 to the default VFS implementation if this option is used. 159 to the default VFS implementation if this option is used.
160 160
161 recover_session=<no|clean>
162 Set auto reconnect mode in the case where the client is blacklisted. The
163 available modes are "no" and "clean". The default is "no".
164
165 * no: never attempt to reconnect when client detects that it has been
166 blacklisted. Operations will generally fail after being blacklisted.
167
168 * clean: client reconnects to the ceph cluster automatically when it
169 detects that it has been blacklisted. During reconnect, client drops
170 dirty data/metadata, invalidates page caches and writable file handles.
171 After reconnect, file locks become stale because the MDS loses track
172 of them. If an inode contains any stale file locks, read/write on the
173 inode is not allowed until applications release all stale file locks.
174
161More Information 175More Information
162================ 176================
163 177
diff --git a/MAINTAINERS b/MAINTAINERS
index a8d193a74692..a97f1be63b9d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9056,7 +9056,7 @@ S: Supported
9056F: Documentation/security/keys/trusted-encrypted.rst 9056F: Documentation/security/keys/trusted-encrypted.rst
9057F: include/keys/trusted-type.h 9057F: include/keys/trusted-type.h
9058F: security/keys/trusted.c 9058F: security/keys/trusted.c
9059F: security/keys/trusted.h 9059F: include/keys/trusted.h
9060 9060
9061KEYS/KEYRINGS: 9061KEYS/KEYRINGS:
9062M: David Howells <dhowells@redhat.com> 9062M: David Howells <dhowells@redhat.com>
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c8fb886aebd4..7c4350c0fb77 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1754,8 +1754,6 @@ static struct rbd_img_request *rbd_img_request_create(
1754 mutex_init(&img_request->state_mutex); 1754 mutex_init(&img_request->state_mutex);
1755 kref_init(&img_request->kref); 1755 kref_init(&img_request->kref);
1756 1756
1757 dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1758 obj_op_name(op_type), img_request);
1759 return img_request; 1757 return img_request;
1760} 1758}
1761 1759
@@ -2944,6 +2942,9 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
2944 __set_bit(IMG_REQ_CHILD, &child_img_req->flags); 2942 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2945 child_img_req->obj_request = obj_req; 2943 child_img_req->obj_request = obj_req;
2946 2944
2945 dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
2946 obj_req);
2947
2947 if (!rbd_img_is_write(img_req)) { 2948 if (!rbd_img_is_write(img_req)) {
2948 switch (img_req->data_type) { 2949 switch (img_req->data_type) {
2949 case OBJ_REQUEST_BIO: 2950 case OBJ_REQUEST_BIO:
@@ -4877,6 +4878,9 @@ static void rbd_queue_workfn(struct work_struct *work)
4877 img_request->rq = rq; 4878 img_request->rq = rq;
4878 snapc = NULL; /* img_request consumes a ref */ 4879 snapc = NULL; /* img_request consumes a ref */
4879 4880
4881 dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
4882 img_request, obj_op_name(op_type), offset, length);
4883
4880 if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT) 4884 if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
4881 result = rbd_img_fill_nodata(img_request, offset, length); 4885 result = rbd_img_fill_nodata(img_request, offset, length);
4882 else 4886 else
@@ -5669,17 +5673,20 @@ static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5669 5673
5670static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev) 5674static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5671{ 5675{
5676 size_t size;
5672 void *reply_buf; 5677 void *reply_buf;
5673 int ret; 5678 int ret;
5674 void *p; 5679 void *p;
5675 5680
5676 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL); 5681 /* Response will be an encoded string, which includes a length */
5682 size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
5683 reply_buf = kzalloc(size, GFP_KERNEL);
5677 if (!reply_buf) 5684 if (!reply_buf)
5678 return -ENOMEM; 5685 return -ENOMEM;
5679 5686
5680 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid, 5687 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5681 &rbd_dev->header_oloc, "get_object_prefix", 5688 &rbd_dev->header_oloc, "get_object_prefix",
5682 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX); 5689 NULL, 0, reply_buf, size);
5683 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 5690 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5684 if (ret < 0) 5691 if (ret < 0)
5685 goto out; 5692 goto out;
@@ -6696,7 +6703,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6696 dout("rbd id object name is %s\n", oid.name); 6703 dout("rbd id object name is %s\n", oid.name);
6697 6704
6698 /* Response will be an encoded string, which includes a length */ 6705 /* Response will be an encoded string, which includes a length */
6699
6700 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX; 6706 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6701 response = kzalloc(size, GFP_NOIO); 6707 response = kzalloc(size, GFP_NOIO);
6702 if (!response) { 6708 if (!response) {
@@ -6708,7 +6714,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6708 6714
6709 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc, 6715 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6710 "get_id", NULL, 0, 6716 "get_id", NULL, 0,
6711 response, RBD_IMAGE_ID_LEN_MAX); 6717 response, size);
6712 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret); 6718 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
6713 if (ret == -ENOENT) { 6719 if (ret == -ENOENT) {
6714 image_id = kstrdup("", GFP_KERNEL); 6720 image_id = kstrdup("", GFP_KERNEL);
diff --git a/drivers/char/tpm/tpm-interface.c b/drivers/char/tpm/tpm-interface.c
index 1b4f95c13e00..d7a3888ad80f 100644
--- a/drivers/char/tpm/tpm-interface.c
+++ b/drivers/char/tpm/tpm-interface.c
@@ -320,18 +320,22 @@ int tpm_pcr_extend(struct tpm_chip *chip, u32 pcr_idx,
320 if (!chip) 320 if (!chip)
321 return -ENODEV; 321 return -ENODEV;
322 322
323 for (i = 0; i < chip->nr_allocated_banks; i++) 323 for (i = 0; i < chip->nr_allocated_banks; i++) {
324 if (digests[i].alg_id != chip->allocated_banks[i].alg_id) 324 if (digests[i].alg_id != chip->allocated_banks[i].alg_id) {
325 return -EINVAL; 325 rc = EINVAL;
326 goto out;
327 }
328 }
326 329
327 if (chip->flags & TPM_CHIP_FLAG_TPM2) { 330 if (chip->flags & TPM_CHIP_FLAG_TPM2) {
328 rc = tpm2_pcr_extend(chip, pcr_idx, digests); 331 rc = tpm2_pcr_extend(chip, pcr_idx, digests);
329 tpm_put_ops(chip); 332 goto out;
330 return rc;
331 } 333 }
332 334
333 rc = tpm1_pcr_extend(chip, pcr_idx, digests[0].digest, 335 rc = tpm1_pcr_extend(chip, pcr_idx, digests[0].digest,
334 "attempting extend a PCR value"); 336 "attempting extend a PCR value");
337
338out:
335 tpm_put_ops(chip); 339 tpm_put_ops(chip);
336 return rc; 340 return rc;
337} 341}
@@ -354,14 +358,9 @@ int tpm_send(struct tpm_chip *chip, void *cmd, size_t buflen)
354 if (!chip) 358 if (!chip)
355 return -ENODEV; 359 return -ENODEV;
356 360
357 rc = tpm_buf_init(&buf, 0, 0); 361 buf.data = cmd;
358 if (rc)
359 goto out;
360
361 memcpy(buf.data, cmd, buflen);
362 rc = tpm_transmit_cmd(chip, &buf, 0, "attempting to a send a command"); 362 rc = tpm_transmit_cmd(chip, &buf, 0, "attempting to a send a command");
363 tpm_buf_destroy(&buf); 363
364out:
365 tpm_put_ops(chip); 364 tpm_put_ops(chip);
366 return rc; 365 return rc;
367} 366}
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index a699e320393f..c1da294418d1 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -6,7 +6,7 @@
6obj-$(CONFIG_CEPH_FS) += ceph.o 6obj-$(CONFIG_CEPH_FS) += ceph.o
7 7
8ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 8ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
9 export.o caps.o snap.o xattr.o quota.o \ 9 export.o caps.o snap.o xattr.o quota.o io.o \
10 mds_client.o mdsmap.o strings.o ceph_frag.o \ 10 mds_client.o mdsmap.o strings.o ceph_frag.o \
11 debugfs.o 11 debugfs.o
12 12
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index b3c8b886bf64..7ab616601141 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
189{ 189{
190 struct inode *inode = file_inode(filp); 190 struct inode *inode = file_inode(filp);
191 struct ceph_inode_info *ci = ceph_inode(inode); 191 struct ceph_inode_info *ci = ceph_inode(inode);
192 struct ceph_osd_client *osdc = 192 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
193 &ceph_inode_to_client(inode)->client->osdc;
194 int err = 0; 193 int err = 0;
195 u64 off = page_offset(page); 194 u64 off = page_offset(page);
196 u64 len = PAGE_SIZE; 195 u64 len = PAGE_SIZE;
@@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
219 218
220 dout("readpage inode %p file %p page %p index %lu\n", 219 dout("readpage inode %p file %p page %p index %lu\n",
221 inode, filp, page, page->index); 220 inode, filp, page, page->index);
222 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 221 err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
223 off, &len, 222 &ci->i_layout, off, &len,
224 ci->i_truncate_seq, ci->i_truncate_size, 223 ci->i_truncate_seq, ci->i_truncate_size,
225 &page, 1, 0); 224 &page, 1, 0);
226 if (err == -ENOENT) 225 if (err == -ENOENT)
@@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page)
228 if (err < 0) { 227 if (err < 0) {
229 SetPageError(page); 228 SetPageError(page);
230 ceph_fscache_readpage_cancel(inode, page); 229 ceph_fscache_readpage_cancel(inode, page);
230 if (err == -EBLACKLISTED)
231 fsc->blacklisted = true;
231 goto out; 232 goto out;
232 } 233 }
233 if (err < PAGE_SIZE) 234 if (err < PAGE_SIZE)
@@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req)
266 int i; 267 int i;
267 268
268 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); 269 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
270 if (rc == -EBLACKLISTED)
271 ceph_inode_to_client(inode)->blacklisted = true;
269 272
270 /* unlock all pages, zeroing any data we didn't read */ 273 /* unlock all pages, zeroing any data we didn't read */
271 osd_data = osd_req_op_extent_osd_data(req, 0); 274 osd_data = osd_req_op_extent_osd_data(req, 0);
@@ -323,7 +326,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
323 /* caller of readpages does not hold buffer and read caps 326 /* caller of readpages does not hold buffer and read caps
324 * (fadvise, madvise and readahead cases) */ 327 * (fadvise, madvise and readahead cases) */
325 int want = CEPH_CAP_FILE_CACHE; 328 int want = CEPH_CAP_FILE_CACHE;
326 ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); 329 ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want,
330 true, &got);
327 if (ret < 0) { 331 if (ret < 0) {
328 dout("start_read %p, error getting cap\n", inode); 332 dout("start_read %p, error getting cap\n", inode);
329 } else if (!(got & want)) { 333 } else if (!(got & want)) {
@@ -569,7 +573,7 @@ static u64 get_writepages_data_length(struct inode *inode,
569/* 573/*
570 * Write a single page, but leave the page locked. 574 * Write a single page, but leave the page locked.
571 * 575 *
572 * If we get a write error, set the page error bit, but still adjust the 576 * If we get a write error, mark the mapping for error, but still adjust the
573 * dirty page accounting (i.e., page is no longer dirty). 577 * dirty page accounting (i.e., page is no longer dirty).
574 */ 578 */
575static int writepage_nounlock(struct page *page, struct writeback_control *wbc) 579static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
@@ -640,9 +644,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
640 end_page_writeback(page); 644 end_page_writeback(page);
641 return err; 645 return err;
642 } 646 }
647 if (err == -EBLACKLISTED)
648 fsc->blacklisted = true;
643 dout("writepage setting page/mapping error %d %p\n", 649 dout("writepage setting page/mapping error %d %p\n",
644 err, page); 650 err, page);
645 SetPageError(page);
646 mapping_set_error(&inode->i_data, err); 651 mapping_set_error(&inode->i_data, err);
647 wbc->pages_skipped++; 652 wbc->pages_skipped++;
648 } else { 653 } else {
@@ -680,23 +685,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
680} 685}
681 686
682/* 687/*
683 * lame release_pages helper. release_pages() isn't exported to
684 * modules.
685 */
686static void ceph_release_pages(struct page **pages, int num)
687{
688 struct pagevec pvec;
689 int i;
690
691 pagevec_init(&pvec);
692 for (i = 0; i < num; i++) {
693 if (pagevec_add(&pvec, pages[i]) == 0)
694 pagevec_release(&pvec);
695 }
696 pagevec_release(&pvec);
697}
698
699/*
700 * async writeback completion handler. 688 * async writeback completion handler.
701 * 689 *
702 * If we get an error, set the mapping error bit, but not the individual 690 * If we get an error, set the mapping error bit, but not the individual
@@ -720,6 +708,8 @@ static void writepages_finish(struct ceph_osd_request *req)
720 if (rc < 0) { 708 if (rc < 0) {
721 mapping_set_error(mapping, rc); 709 mapping_set_error(mapping, rc);
722 ceph_set_error_write(ci); 710 ceph_set_error_write(ci);
711 if (rc == -EBLACKLISTED)
712 fsc->blacklisted = true;
723 } else { 713 } else {
724 ceph_clear_error_write(ci); 714 ceph_clear_error_write(ci);
725 } 715 }
@@ -769,7 +759,7 @@ static void writepages_finish(struct ceph_osd_request *req)
769 dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", 759 dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
770 inode, osd_data->length, rc >= 0 ? num_pages : 0); 760 inode, osd_data->length, rc >= 0 ? num_pages : 0);
771 761
772 ceph_release_pages(osd_data->pages, num_pages); 762 release_pages(osd_data->pages, num_pages);
773 } 763 }
774 764
775 ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); 765 ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
@@ -1452,7 +1442,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
1452 want = CEPH_CAP_FILE_CACHE; 1442 want = CEPH_CAP_FILE_CACHE;
1453 1443
1454 got = 0; 1444 got = 0;
1455 err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); 1445 err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1,
1446 &got, &pinned_page);
1456 if (err < 0) 1447 if (err < 0)
1457 goto out_restore; 1448 goto out_restore;
1458 1449
@@ -1540,6 +1531,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
1540 if (!prealloc_cf) 1531 if (!prealloc_cf)
1541 return VM_FAULT_OOM; 1532 return VM_FAULT_OOM;
1542 1533
1534 sb_start_pagefault(inode->i_sb);
1543 ceph_block_sigs(&oldset); 1535 ceph_block_sigs(&oldset);
1544 1536
1545 if (ci->i_inline_version != CEPH_INLINE_NONE) { 1537 if (ci->i_inline_version != CEPH_INLINE_NONE) {
@@ -1568,7 +1560,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
1568 want = CEPH_CAP_FILE_BUFFER; 1560 want = CEPH_CAP_FILE_BUFFER;
1569 1561
1570 got = 0; 1562 got = 0;
1571 err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, 1563 err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len,
1572 &got, NULL); 1564 &got, NULL);
1573 if (err < 0) 1565 if (err < 0)
1574 goto out_free; 1566 goto out_free;
@@ -1614,6 +1606,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
1614 ceph_put_cap_refs(ci, got); 1606 ceph_put_cap_refs(ci, got);
1615out_free: 1607out_free:
1616 ceph_restore_sigs(&oldset); 1608 ceph_restore_sigs(&oldset);
1609 sb_end_pagefault(inode->i_sb);
1617 ceph_free_cap_flush(prealloc_cf); 1610 ceph_free_cap_flush(prealloc_cf);
1618 if (err < 0) 1611 if (err < 0)
1619 ret = vmf_error(err); 1612 ret = vmf_error(err);
@@ -1946,12 +1939,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
1946 1939
1947 if (err >= 0 || err == -ENOENT) 1940 if (err >= 0 || err == -ENOENT)
1948 have |= POOL_READ; 1941 have |= POOL_READ;
1949 else if (err != -EPERM) 1942 else if (err != -EPERM) {
1943 if (err == -EBLACKLISTED)
1944 fsc->blacklisted = true;
1950 goto out_unlock; 1945 goto out_unlock;
1946 }
1951 1947
1952 if (err2 == 0 || err2 == -EEXIST) 1948 if (err2 == 0 || err2 == -EEXIST)
1953 have |= POOL_WRITE; 1949 have |= POOL_WRITE;
1954 else if (err2 != -EPERM) { 1950 else if (err2 != -EPERM) {
1951 if (err2 == -EBLACKLISTED)
1952 fsc->blacklisted = true;
1955 err = err2; 1953 err = err2;
1956 goto out_unlock; 1954 goto out_unlock;
1957 } 1955 }
@@ -1989,10 +1987,11 @@ out:
1989 return err; 1987 return err;
1990} 1988}
1991 1989
1992int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) 1990int ceph_pool_perm_check(struct inode *inode, int need)
1993{ 1991{
1994 s64 pool; 1992 struct ceph_inode_info *ci = ceph_inode(inode);
1995 struct ceph_string *pool_ns; 1993 struct ceph_string *pool_ns;
1994 s64 pool;
1996 int ret, flags; 1995 int ret, flags;
1997 1996
1998 if (ci->i_vino.snap != CEPH_NOSNAP) { 1997 if (ci->i_vino.snap != CEPH_NOSNAP) {
@@ -2004,7 +2003,7 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
2004 return 0; 2003 return 0;
2005 } 2004 }
2006 2005
2007 if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), 2006 if (ceph_test_mount_opt(ceph_inode_to_client(inode),
2008 NOPOOLPERM)) 2007 NOPOOLPERM))
2009 return 0; 2008 return 0;
2010 2009
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index bc90cf6ad7ed..b2ec29eeb4c4 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -6,6 +6,8 @@
6 * Written by Milosz Tanski (milosz@adfin.com) 6 * Written by Milosz Tanski (milosz@adfin.com)
7 */ 7 */
8 8
9#include <linux/ceph/ceph_debug.h>
10
9#include "super.h" 11#include "super.h"
10#include "cache.h" 12#include "cache.h"
11 13
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ce0f5658720a..d3b9c9d5c1bd 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -458,37 +458,6 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
458} 458}
459 459
460/* 460/*
461 * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1.
462 */
463static int __ceph_get_cap_mds(struct ceph_inode_info *ci)
464{
465 struct ceph_cap *cap;
466 int mds = -1;
467 struct rb_node *p;
468
469 /* prefer mds with WR|BUFFER|EXCL caps */
470 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
471 cap = rb_entry(p, struct ceph_cap, ci_node);
472 mds = cap->mds;
473 if (cap->issued & (CEPH_CAP_FILE_WR |
474 CEPH_CAP_FILE_BUFFER |
475 CEPH_CAP_FILE_EXCL))
476 break;
477 }
478 return mds;
479}
480
481int ceph_get_cap_mds(struct inode *inode)
482{
483 struct ceph_inode_info *ci = ceph_inode(inode);
484 int mds;
485 spin_lock(&ci->i_ceph_lock);
486 mds = __ceph_get_cap_mds(ceph_inode(inode));
487 spin_unlock(&ci->i_ceph_lock);
488 return mds;
489}
490
491/*
492 * Called under i_ceph_lock. 461 * Called under i_ceph_lock.
493 */ 462 */
494static void __insert_cap_node(struct ceph_inode_info *ci, 463static void __insert_cap_node(struct ceph_inode_info *ci,
@@ -628,7 +597,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
628/* 597/*
629 * Add a capability under the given MDS session. 598 * Add a capability under the given MDS session.
630 * 599 *
631 * Caller should hold session snap_rwsem (read) and s_mutex. 600 * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
632 * 601 *
633 * @fmode is the open file mode, if we are opening a file, otherwise 602 * @fmode is the open file mode, if we are opening a file, otherwise
634 * it is < 0. (This is so we can atomically add the cap and add an 603 * it is < 0. (This is so we can atomically add the cap and add an
@@ -645,6 +614,9 @@ void ceph_add_cap(struct inode *inode,
645 struct ceph_cap *cap; 614 struct ceph_cap *cap;
646 int mds = session->s_mds; 615 int mds = session->s_mds;
647 int actual_wanted; 616 int actual_wanted;
617 u32 gen;
618
619 lockdep_assert_held(&ci->i_ceph_lock);
648 620
649 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, 621 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
650 session->s_mds, cap_id, ceph_cap_string(issued), seq); 622 session->s_mds, cap_id, ceph_cap_string(issued), seq);
@@ -656,6 +628,10 @@ void ceph_add_cap(struct inode *inode,
656 if (fmode >= 0) 628 if (fmode >= 0)
657 wanted |= ceph_caps_for_mode(fmode); 629 wanted |= ceph_caps_for_mode(fmode);
658 630
631 spin_lock(&session->s_gen_ttl_lock);
632 gen = session->s_cap_gen;
633 spin_unlock(&session->s_gen_ttl_lock);
634
659 cap = __get_cap_for_mds(ci, mds); 635 cap = __get_cap_for_mds(ci, mds);
660 if (!cap) { 636 if (!cap) {
661 cap = *new_cap; 637 cap = *new_cap;
@@ -681,7 +657,7 @@ void ceph_add_cap(struct inode *inode,
681 list_move_tail(&cap->session_caps, &session->s_caps); 657 list_move_tail(&cap->session_caps, &session->s_caps);
682 spin_unlock(&session->s_cap_lock); 658 spin_unlock(&session->s_cap_lock);
683 659
684 if (cap->cap_gen < session->s_cap_gen) 660 if (cap->cap_gen < gen)
685 cap->issued = cap->implemented = CEPH_CAP_PIN; 661 cap->issued = cap->implemented = CEPH_CAP_PIN;
686 662
687 /* 663 /*
@@ -775,7 +751,7 @@ void ceph_add_cap(struct inode *inode,
775 cap->seq = seq; 751 cap->seq = seq;
776 cap->issue_seq = seq; 752 cap->issue_seq = seq;
777 cap->mseq = mseq; 753 cap->mseq = mseq;
778 cap->cap_gen = session->s_cap_gen; 754 cap->cap_gen = gen;
779 755
780 if (fmode >= 0) 756 if (fmode >= 0)
781 __ceph_get_fmode(ci, fmode); 757 __ceph_get_fmode(ci, fmode);
@@ -1284,10 +1260,6 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
1284 * Make note of max_size reported/requested from mds, revoked caps 1260 * Make note of max_size reported/requested from mds, revoked caps
1285 * that have now been implemented. 1261 * that have now been implemented.
1286 * 1262 *
1287 * Make half-hearted attempt ot to invalidate page cache if we are
1288 * dropping RDCACHE. Note that this will leave behind locked pages
1289 * that we'll then need to deal with elsewhere.
1290 *
1291 * Return non-zero if delayed release, or we experienced an error 1263 * Return non-zero if delayed release, or we experienced an error
1292 * such that the caller should requeue + retry later. 1264 * such that the caller should requeue + retry later.
1293 * 1265 *
@@ -1746,11 +1718,11 @@ static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
1746 * Add dirty inode to the flushing list. Assigned a seq number so we 1718 * Add dirty inode to the flushing list. Assigned a seq number so we
1747 * can wait for caps to flush without starving. 1719 * can wait for caps to flush without starving.
1748 * 1720 *
1749 * Called under i_ceph_lock. 1721 * Called under i_ceph_lock. Returns the flush tid.
1750 */ 1722 */
1751static int __mark_caps_flushing(struct inode *inode, 1723static u64 __mark_caps_flushing(struct inode *inode,
1752 struct ceph_mds_session *session, bool wake, 1724 struct ceph_mds_session *session, bool wake,
1753 u64 *flush_tid, u64 *oldest_flush_tid) 1725 u64 *oldest_flush_tid)
1754{ 1726{
1755 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1727 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1756 struct ceph_inode_info *ci = ceph_inode(inode); 1728 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1789,8 +1761,7 @@ static int __mark_caps_flushing(struct inode *inode,
1789 1761
1790 list_add_tail(&cf->i_list, &ci->i_cap_flush_list); 1762 list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
1791 1763
1792 *flush_tid = cf->tid; 1764 return cf->tid;
1793 return flushing;
1794} 1765}
1795 1766
1796/* 1767/*
@@ -2028,11 +1999,6 @@ retry_locked:
2028 } 1999 }
2029 2000
2030ack: 2001ack:
2031 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
2032 dout(" skipping %p I_NOFLUSH set\n", inode);
2033 continue;
2034 }
2035
2036 if (session && session != cap->session) { 2002 if (session && session != cap->session) {
2037 dout("oops, wrong session %p mutex\n", session); 2003 dout("oops, wrong session %p mutex\n", session);
2038 mutex_unlock(&session->s_mutex); 2004 mutex_unlock(&session->s_mutex);
@@ -2080,9 +2046,9 @@ ack:
2080 } 2046 }
2081 2047
2082 if (cap == ci->i_auth_cap && ci->i_dirty_caps) { 2048 if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
2083 flushing = __mark_caps_flushing(inode, session, false, 2049 flushing = ci->i_dirty_caps;
2084 &flush_tid, 2050 flush_tid = __mark_caps_flushing(inode, session, false,
2085 &oldest_flush_tid); 2051 &oldest_flush_tid);
2086 } else { 2052 } else {
2087 flushing = 0; 2053 flushing = 0;
2088 flush_tid = 0; 2054 flush_tid = 0;
@@ -2130,16 +2096,11 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
2130retry: 2096retry:
2131 spin_lock(&ci->i_ceph_lock); 2097 spin_lock(&ci->i_ceph_lock);
2132retry_locked: 2098retry_locked:
2133 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
2134 spin_unlock(&ci->i_ceph_lock);
2135 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
2136 goto out;
2137 }
2138 if (ci->i_dirty_caps && ci->i_auth_cap) { 2099 if (ci->i_dirty_caps && ci->i_auth_cap) {
2139 struct ceph_cap *cap = ci->i_auth_cap; 2100 struct ceph_cap *cap = ci->i_auth_cap;
2140 int delayed; 2101 int delayed;
2141 2102
2142 if (!session || session != cap->session) { 2103 if (session != cap->session) {
2143 spin_unlock(&ci->i_ceph_lock); 2104 spin_unlock(&ci->i_ceph_lock);
2144 if (session) 2105 if (session)
2145 mutex_unlock(&session->s_mutex); 2106 mutex_unlock(&session->s_mutex);
@@ -2161,8 +2122,9 @@ retry_locked:
2161 goto retry_locked; 2122 goto retry_locked;
2162 } 2123 }
2163 2124
2164 flushing = __mark_caps_flushing(inode, session, true, 2125 flushing = ci->i_dirty_caps;
2165 &flush_tid, &oldest_flush_tid); 2126 flush_tid = __mark_caps_flushing(inode, session, true,
2127 &oldest_flush_tid);
2166 2128
2167 /* __send_cap drops i_ceph_lock */ 2129 /* __send_cap drops i_ceph_lock */
2168 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, 2130 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
@@ -2261,35 +2223,45 @@ static int unsafe_request_wait(struct inode *inode)
2261 2223
2262int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) 2224int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
2263{ 2225{
2226 struct ceph_file_info *fi = file->private_data;
2264 struct inode *inode = file->f_mapping->host; 2227 struct inode *inode = file->f_mapping->host;
2265 struct ceph_inode_info *ci = ceph_inode(inode); 2228 struct ceph_inode_info *ci = ceph_inode(inode);
2266 u64 flush_tid; 2229 u64 flush_tid;
2267 int ret; 2230 int ret, err;
2268 int dirty; 2231 int dirty;
2269 2232
2270 dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); 2233 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
2271 2234
2272 ret = file_write_and_wait_range(file, start, end); 2235 ret = file_write_and_wait_range(file, start, end);
2273 if (ret < 0)
2274 goto out;
2275
2276 if (datasync) 2236 if (datasync)
2277 goto out; 2237 goto out;
2278 2238
2279 dirty = try_flush_caps(inode, &flush_tid); 2239 dirty = try_flush_caps(inode, &flush_tid);
2280 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); 2240 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
2281 2241
2282 ret = unsafe_request_wait(inode); 2242 err = unsafe_request_wait(inode);
2283 2243
2284 /* 2244 /*
2285 * only wait on non-file metadata writeback (the mds 2245 * only wait on non-file metadata writeback (the mds
2286 * can recover size and mtime, so we don't need to 2246 * can recover size and mtime, so we don't need to
2287 * wait for that) 2247 * wait for that)
2288 */ 2248 */
2289 if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { 2249 if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
2290 ret = wait_event_interruptible(ci->i_cap_wq, 2250 err = wait_event_interruptible(ci->i_cap_wq,
2291 caps_are_flushed(inode, flush_tid)); 2251 caps_are_flushed(inode, flush_tid));
2292 } 2252 }
2253
2254 if (err < 0)
2255 ret = err;
2256
2257 if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) {
2258 spin_lock(&file->f_lock);
2259 err = errseq_check_and_advance(&ci->i_meta_err,
2260 &fi->meta_err);
2261 spin_unlock(&file->f_lock);
2262 if (err < 0)
2263 ret = err;
2264 }
2293out: 2265out:
2294 dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); 2266 dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
2295 return ret; 2267 return ret;
@@ -2560,10 +2532,15 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got,
2560 * 2532 *
2561 * FIXME: how does a 0 return differ from -EAGAIN? 2533 * FIXME: how does a 0 return differ from -EAGAIN?
2562 */ 2534 */
2563static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, 2535enum {
2564 loff_t endoff, bool nonblock, int *got) 2536 NON_BLOCKING = 1,
2537 CHECK_FILELOCK = 2,
2538};
2539
2540static int try_get_cap_refs(struct inode *inode, int need, int want,
2541 loff_t endoff, int flags, int *got)
2565{ 2542{
2566 struct inode *inode = &ci->vfs_inode; 2543 struct ceph_inode_info *ci = ceph_inode(inode);
2567 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; 2544 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
2568 int ret = 0; 2545 int ret = 0;
2569 int have, implemented; 2546 int have, implemented;
@@ -2576,6 +2553,13 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2576again: 2553again:
2577 spin_lock(&ci->i_ceph_lock); 2554 spin_lock(&ci->i_ceph_lock);
2578 2555
2556 if ((flags & CHECK_FILELOCK) &&
2557 (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
2558 dout("try_get_cap_refs %p error filelock\n", inode);
2559 ret = -EIO;
2560 goto out_unlock;
2561 }
2562
2579 /* make sure file is actually open */ 2563 /* make sure file is actually open */
2580 file_wanted = __ceph_caps_file_wanted(ci); 2564 file_wanted = __ceph_caps_file_wanted(ci);
2581 if ((file_wanted & need) != need) { 2565 if ((file_wanted & need) != need) {
@@ -2637,7 +2621,7 @@ again:
2637 * we can not call down_read() when 2621 * we can not call down_read() when
2638 * task isn't in TASK_RUNNING state 2622 * task isn't in TASK_RUNNING state
2639 */ 2623 */
2640 if (nonblock) { 2624 if (flags & NON_BLOCKING) {
2641 ret = -EAGAIN; 2625 ret = -EAGAIN;
2642 goto out_unlock; 2626 goto out_unlock;
2643 } 2627 }
@@ -2731,18 +2715,19 @@ static void check_max_size(struct inode *inode, loff_t endoff)
2731 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); 2715 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2732} 2716}
2733 2717
2734int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, 2718int ceph_try_get_caps(struct inode *inode, int need, int want,
2735 bool nonblock, int *got) 2719 bool nonblock, int *got)
2736{ 2720{
2737 int ret; 2721 int ret;
2738 2722
2739 BUG_ON(need & ~CEPH_CAP_FILE_RD); 2723 BUG_ON(need & ~CEPH_CAP_FILE_RD);
2740 BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); 2724 BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
2741 ret = ceph_pool_perm_check(ci, need); 2725 ret = ceph_pool_perm_check(inode, need);
2742 if (ret < 0) 2726 if (ret < 0)
2743 return ret; 2727 return ret;
2744 2728
2745 ret = try_get_cap_refs(ci, need, want, 0, nonblock, got); 2729 ret = try_get_cap_refs(inode, need, want, 0,
2730 (nonblock ? NON_BLOCKING : 0), got);
2746 return ret == -EAGAIN ? 0 : ret; 2731 return ret == -EAGAIN ? 0 : ret;
2747} 2732}
2748 2733
@@ -2751,30 +2736,40 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
2751 * due to a small max_size, make sure we check_max_size (and possibly 2736 * due to a small max_size, make sure we check_max_size (and possibly
2752 * ask the mds) so we don't get hung up indefinitely. 2737 * ask the mds) so we don't get hung up indefinitely.
2753 */ 2738 */
2754int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 2739int ceph_get_caps(struct file *filp, int need, int want,
2755 loff_t endoff, int *got, struct page **pinned_page) 2740 loff_t endoff, int *got, struct page **pinned_page)
2756{ 2741{
2757 int _got, ret; 2742 struct ceph_file_info *fi = filp->private_data;
2743 struct inode *inode = file_inode(filp);
2744 struct ceph_inode_info *ci = ceph_inode(inode);
2745 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
2746 int ret, _got, flags;
2758 2747
2759 ret = ceph_pool_perm_check(ci, need); 2748 ret = ceph_pool_perm_check(inode, need);
2760 if (ret < 0) 2749 if (ret < 0)
2761 return ret; 2750 return ret;
2762 2751
2752 if ((fi->fmode & CEPH_FILE_MODE_WR) &&
2753 fi->filp_gen != READ_ONCE(fsc->filp_gen))
2754 return -EBADF;
2755
2763 while (true) { 2756 while (true) {
2764 if (endoff > 0) 2757 if (endoff > 0)
2765 check_max_size(&ci->vfs_inode, endoff); 2758 check_max_size(inode, endoff);
2766 2759
2760 flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0;
2767 _got = 0; 2761 _got = 0;
2768 ret = try_get_cap_refs(ci, need, want, endoff, 2762 ret = try_get_cap_refs(inode, need, want, endoff,
2769 false, &_got); 2763 flags, &_got);
2770 if (ret == -EAGAIN) 2764 if (ret == -EAGAIN)
2771 continue; 2765 continue;
2772 if (!ret) { 2766 if (!ret) {
2773 DEFINE_WAIT_FUNC(wait, woken_wake_function); 2767 DEFINE_WAIT_FUNC(wait, woken_wake_function);
2774 add_wait_queue(&ci->i_cap_wq, &wait); 2768 add_wait_queue(&ci->i_cap_wq, &wait);
2775 2769
2776 while (!(ret = try_get_cap_refs(ci, need, want, endoff, 2770 flags |= NON_BLOCKING;
2777 true, &_got))) { 2771 while (!(ret = try_get_cap_refs(inode, need, want,
2772 endoff, flags, &_got))) {
2778 if (signal_pending(current)) { 2773 if (signal_pending(current)) {
2779 ret = -ERESTARTSYS; 2774 ret = -ERESTARTSYS;
2780 break; 2775 break;
@@ -2786,10 +2781,18 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
2786 if (ret == -EAGAIN) 2781 if (ret == -EAGAIN)
2787 continue; 2782 continue;
2788 } 2783 }
2784
2785 if ((fi->fmode & CEPH_FILE_MODE_WR) &&
2786 fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
2787 if (ret >= 0 && _got)
2788 ceph_put_cap_refs(ci, _got);
2789 return -EBADF;
2790 }
2791
2789 if (ret < 0) { 2792 if (ret < 0) {
2790 if (ret == -ESTALE) { 2793 if (ret == -ESTALE) {
2791 /* session was killed, try renew caps */ 2794 /* session was killed, try renew caps */
2792 ret = ceph_renew_caps(&ci->vfs_inode); 2795 ret = ceph_renew_caps(inode);
2793 if (ret == 0) 2796 if (ret == 0)
2794 continue; 2797 continue;
2795 } 2798 }
@@ -2798,9 +2801,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
2798 2801
2799 if (ci->i_inline_version != CEPH_INLINE_NONE && 2802 if (ci->i_inline_version != CEPH_INLINE_NONE &&
2800 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 2803 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
2801 i_size_read(&ci->vfs_inode) > 0) { 2804 i_size_read(inode) > 0) {
2802 struct page *page = 2805 struct page *page =
2803 find_get_page(ci->vfs_inode.i_mapping, 0); 2806 find_get_page(inode->i_mapping, 0);
2804 if (page) { 2807 if (page) {
2805 if (PageUptodate(page)) { 2808 if (PageUptodate(page)) {
2806 *pinned_page = page; 2809 *pinned_page = page;
@@ -2819,7 +2822,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
2819 * getattr request will bring inline data into 2822 * getattr request will bring inline data into
2820 * page cache 2823 * page cache
2821 */ 2824 */
2822 ret = __ceph_do_getattr(&ci->vfs_inode, NULL, 2825 ret = __ceph_do_getattr(inode, NULL,
2823 CEPH_STAT_CAP_INLINE_DATA, 2826 CEPH_STAT_CAP_INLINE_DATA,
2824 true); 2827 true);
2825 if (ret < 0) 2828 if (ret < 0)
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 2eb88ed22993..facb387c2735 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -294,7 +294,6 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
294 294
295void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) 295void ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
296{ 296{
297 return 0;
298} 297}
299 298
300void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) 299void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 15ff1b09cfa2..b6bfa94332c3 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -35,7 +35,7 @@ struct ceph_nfs_snapfh {
35static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, 35static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len,
36 struct inode *parent_inode) 36 struct inode *parent_inode)
37{ 37{
38 const static int snap_handle_length = 38 static const int snap_handle_length =
39 sizeof(struct ceph_nfs_snapfh) >> 2; 39 sizeof(struct ceph_nfs_snapfh) >> 2;
40 struct ceph_nfs_snapfh *sfh = (void *)rawfh; 40 struct ceph_nfs_snapfh *sfh = (void *)rawfh;
41 u64 snapid = ceph_snap(inode); 41 u64 snapid = ceph_snap(inode);
@@ -85,9 +85,9 @@ out:
85static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, 85static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
86 struct inode *parent_inode) 86 struct inode *parent_inode)
87{ 87{
88 const static int handle_length = 88 static const int handle_length =
89 sizeof(struct ceph_nfs_fh) >> 2; 89 sizeof(struct ceph_nfs_fh) >> 2;
90 const static int connected_handle_length = 90 static const int connected_handle_length =
91 sizeof(struct ceph_nfs_confh) >> 2; 91 sizeof(struct ceph_nfs_confh) >> 2;
92 int type; 92 int type;
93 93
@@ -458,33 +458,33 @@ static int __get_snap_name(struct dentry *parent, char *name,
458 if (err < 0) 458 if (err < 0)
459 goto out; 459 goto out;
460 460
461 rinfo = &req->r_reply_info; 461 rinfo = &req->r_reply_info;
462 for (i = 0; i < rinfo->dir_nr; i++) { 462 for (i = 0; i < rinfo->dir_nr; i++) {
463 rde = rinfo->dir_entries + i; 463 rde = rinfo->dir_entries + i;
464 BUG_ON(!rde->inode.in); 464 BUG_ON(!rde->inode.in);
465 if (ceph_snap(inode) == 465 if (ceph_snap(inode) ==
466 le64_to_cpu(rde->inode.in->snapid)) { 466 le64_to_cpu(rde->inode.in->snapid)) {
467 memcpy(name, rde->name, rde->name_len); 467 memcpy(name, rde->name, rde->name_len);
468 name[rde->name_len] = '\0'; 468 name[rde->name_len] = '\0';
469 err = 0; 469 err = 0;
470 goto out; 470 goto out;
471 } 471 }
472 } 472 }
473 473
474 if (rinfo->dir_end) 474 if (rinfo->dir_end)
475 break; 475 break;
476 476
477 BUG_ON(rinfo->dir_nr <= 0); 477 BUG_ON(rinfo->dir_nr <= 0);
478 rde = rinfo->dir_entries + (rinfo->dir_nr - 1); 478 rde = rinfo->dir_entries + (rinfo->dir_nr - 1);
479 next_offset += rinfo->dir_nr; 479 next_offset += rinfo->dir_nr;
480 last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); 480 last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL);
481 if (!last_name) { 481 if (!last_name) {
482 err = -ENOMEM; 482 err = -ENOMEM;
483 goto out; 483 goto out;
484 } 484 }
485 485
486 ceph_mdsc_put_request(req); 486 ceph_mdsc_put_request(req);
487 req = NULL; 487 req = NULL;
488 } 488 }
489 err = -ENOENT; 489 err = -ENOENT;
490out: 490out:
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 685a03cc4b77..d277f71abe0b 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -15,6 +15,7 @@
15#include "super.h" 15#include "super.h"
16#include "mds_client.h" 16#include "mds_client.h"
17#include "cache.h" 17#include "cache.h"
18#include "io.h"
18 19
19static __le32 ceph_flags_sys2wire(u32 flags) 20static __le32 ceph_flags_sys2wire(u32 flags)
20{ 21{
@@ -201,6 +202,7 @@ out:
201static int ceph_init_file_info(struct inode *inode, struct file *file, 202static int ceph_init_file_info(struct inode *inode, struct file *file,
202 int fmode, bool isdir) 203 int fmode, bool isdir)
203{ 204{
205 struct ceph_inode_info *ci = ceph_inode(inode);
204 struct ceph_file_info *fi; 206 struct ceph_file_info *fi;
205 207
206 dout("%s %p %p 0%o (%s)\n", __func__, inode, file, 208 dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
@@ -211,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
211 struct ceph_dir_file_info *dfi = 213 struct ceph_dir_file_info *dfi =
212 kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); 214 kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL);
213 if (!dfi) { 215 if (!dfi) {
214 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 216 ceph_put_fmode(ci, fmode); /* clean up */
215 return -ENOMEM; 217 return -ENOMEM;
216 } 218 }
217 219
@@ -222,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
222 } else { 224 } else {
223 fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); 225 fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
224 if (!fi) { 226 if (!fi) {
225 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ 227 ceph_put_fmode(ci, fmode); /* clean up */
226 return -ENOMEM; 228 return -ENOMEM;
227 } 229 }
228 230
@@ -232,6 +234,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
232 fi->fmode = fmode; 234 fi->fmode = fmode;
233 spin_lock_init(&fi->rw_contexts_lock); 235 spin_lock_init(&fi->rw_contexts_lock);
234 INIT_LIST_HEAD(&fi->rw_contexts); 236 INIT_LIST_HEAD(&fi->rw_contexts);
237 fi->meta_err = errseq_sample(&ci->i_meta_err);
238 fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
235 239
236 return 0; 240 return 0;
237} 241}
@@ -695,7 +699,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to,
695 ceph_release_page_vector(pages, num_pages); 699 ceph_release_page_vector(pages, num_pages);
696 } 700 }
697 701
698 if (ret <= 0 || off >= i_size || !more) 702 if (ret < 0) {
703 if (ret == -EBLACKLISTED)
704 fsc->blacklisted = true;
705 break;
706 }
707
708 if (off >= i_size || !more)
699 break; 709 break;
700 } 710 }
701 711
@@ -921,7 +931,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
921 struct ceph_aio_request *aio_req = NULL; 931 struct ceph_aio_request *aio_req = NULL;
922 int num_pages = 0; 932 int num_pages = 0;
923 int flags; 933 int flags;
924 int ret; 934 int ret = 0;
925 struct timespec64 mtime = current_time(inode); 935 struct timespec64 mtime = current_time(inode);
926 size_t count = iov_iter_count(iter); 936 size_t count = iov_iter_count(iter);
927 loff_t pos = iocb->ki_pos; 937 loff_t pos = iocb->ki_pos;
@@ -935,11 +945,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
935 (write ? "write" : "read"), file, pos, (unsigned)count, 945 (write ? "write" : "read"), file, pos, (unsigned)count,
936 snapc, snapc ? snapc->seq : 0); 946 snapc, snapc ? snapc->seq : 0);
937 947
938 ret = filemap_write_and_wait_range(inode->i_mapping,
939 pos, pos + count - 1);
940 if (ret < 0)
941 return ret;
942
943 if (write) { 948 if (write) {
944 int ret2 = invalidate_inode_pages2_range(inode->i_mapping, 949 int ret2 = invalidate_inode_pages2_range(inode->i_mapping,
945 pos >> PAGE_SHIFT, 950 pos >> PAGE_SHIFT,
@@ -1260,7 +1265,8 @@ again:
1260 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 1265 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
1261 else 1266 else
1262 want = CEPH_CAP_FILE_CACHE; 1267 want = CEPH_CAP_FILE_CACHE;
1263 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); 1268 ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1,
1269 &got, &pinned_page);
1264 if (ret < 0) 1270 if (ret < 0)
1265 return ret; 1271 return ret;
1266 1272
@@ -1274,12 +1280,16 @@ again:
1274 1280
1275 if (ci->i_inline_version == CEPH_INLINE_NONE) { 1281 if (ci->i_inline_version == CEPH_INLINE_NONE) {
1276 if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { 1282 if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
1283 ceph_start_io_direct(inode);
1277 ret = ceph_direct_read_write(iocb, to, 1284 ret = ceph_direct_read_write(iocb, to,
1278 NULL, NULL); 1285 NULL, NULL);
1286 ceph_end_io_direct(inode);
1279 if (ret >= 0 && ret < len) 1287 if (ret >= 0 && ret < len)
1280 retry_op = CHECK_EOF; 1288 retry_op = CHECK_EOF;
1281 } else { 1289 } else {
1290 ceph_start_io_read(inode);
1282 ret = ceph_sync_read(iocb, to, &retry_op); 1291 ret = ceph_sync_read(iocb, to, &retry_op);
1292 ceph_end_io_read(inode);
1283 } 1293 }
1284 } else { 1294 } else {
1285 retry_op = READ_INLINE; 1295 retry_op = READ_INLINE;
@@ -1290,7 +1300,9 @@ again:
1290 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 1300 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
1291 ceph_cap_string(got)); 1301 ceph_cap_string(got));
1292 ceph_add_rw_context(fi, &rw_ctx); 1302 ceph_add_rw_context(fi, &rw_ctx);
1303 ceph_start_io_read(inode);
1293 ret = generic_file_read_iter(iocb, to); 1304 ret = generic_file_read_iter(iocb, to);
1305 ceph_end_io_read(inode);
1294 ceph_del_rw_context(fi, &rw_ctx); 1306 ceph_del_rw_context(fi, &rw_ctx);
1295 } 1307 }
1296 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 1308 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
@@ -1399,7 +1411,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
1399 return -ENOMEM; 1411 return -ENOMEM;
1400 1412
1401retry_snap: 1413retry_snap:
1402 inode_lock(inode); 1414 if (iocb->ki_flags & IOCB_DIRECT)
1415 ceph_start_io_direct(inode);
1416 else
1417 ceph_start_io_write(inode);
1403 1418
1404 /* We can write back this queue in page reclaim */ 1419 /* We can write back this queue in page reclaim */
1405 current->backing_dev_info = inode_to_bdi(inode); 1420 current->backing_dev_info = inode_to_bdi(inode);
@@ -1457,7 +1472,7 @@ retry_snap:
1457 else 1472 else
1458 want = CEPH_CAP_FILE_BUFFER; 1473 want = CEPH_CAP_FILE_BUFFER;
1459 got = 0; 1474 got = 0;
1460 err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, 1475 err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count,
1461 &got, NULL); 1476 &got, NULL);
1462 if (err < 0) 1477 if (err < 0)
1463 goto out; 1478 goto out;
@@ -1470,7 +1485,6 @@ retry_snap:
1470 (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { 1485 (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) {
1471 struct ceph_snap_context *snapc; 1486 struct ceph_snap_context *snapc;
1472 struct iov_iter data; 1487 struct iov_iter data;
1473 inode_unlock(inode);
1474 1488
1475 spin_lock(&ci->i_ceph_lock); 1489 spin_lock(&ci->i_ceph_lock);
1476 if (__ceph_have_pending_cap_snap(ci)) { 1490 if (__ceph_have_pending_cap_snap(ci)) {
@@ -1487,11 +1501,14 @@ retry_snap:
1487 1501
1488 /* we might need to revert back to that point */ 1502 /* we might need to revert back to that point */
1489 data = *from; 1503 data = *from;
1490 if (iocb->ki_flags & IOCB_DIRECT) 1504 if (iocb->ki_flags & IOCB_DIRECT) {
1491 written = ceph_direct_read_write(iocb, &data, snapc, 1505 written = ceph_direct_read_write(iocb, &data, snapc,
1492 &prealloc_cf); 1506 &prealloc_cf);
1493 else 1507 ceph_end_io_direct(inode);
1508 } else {
1494 written = ceph_sync_write(iocb, &data, pos, snapc); 1509 written = ceph_sync_write(iocb, &data, pos, snapc);
1510 ceph_end_io_write(inode);
1511 }
1495 if (written > 0) 1512 if (written > 0)
1496 iov_iter_advance(from, written); 1513 iov_iter_advance(from, written);
1497 ceph_put_snap_context(snapc); 1514 ceph_put_snap_context(snapc);
@@ -1506,7 +1523,7 @@ retry_snap:
1506 written = generic_perform_write(file, from, pos); 1523 written = generic_perform_write(file, from, pos);
1507 if (likely(written >= 0)) 1524 if (likely(written >= 0))
1508 iocb->ki_pos = pos + written; 1525 iocb->ki_pos = pos + written;
1509 inode_unlock(inode); 1526 ceph_end_io_write(inode);
1510 } 1527 }
1511 1528
1512 if (written >= 0) { 1529 if (written >= 0) {
@@ -1541,9 +1558,11 @@ retry_snap:
1541 } 1558 }
1542 1559
1543 goto out_unlocked; 1560 goto out_unlocked;
1544
1545out: 1561out:
1546 inode_unlock(inode); 1562 if (iocb->ki_flags & IOCB_DIRECT)
1563 ceph_end_io_direct(inode);
1564 else
1565 ceph_end_io_write(inode);
1547out_unlocked: 1566out_unlocked:
1548 ceph_free_cap_flush(prealloc_cf); 1567 ceph_free_cap_flush(prealloc_cf);
1549 current->backing_dev_info = NULL; 1568 current->backing_dev_info = NULL;
@@ -1781,7 +1800,7 @@ static long ceph_fallocate(struct file *file, int mode,
1781 else 1800 else
1782 want = CEPH_CAP_FILE_BUFFER; 1801 want = CEPH_CAP_FILE_BUFFER;
1783 1802
1784 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); 1803 ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL);
1785 if (ret < 0) 1804 if (ret < 0)
1786 goto unlock; 1805 goto unlock;
1787 1806
@@ -1810,16 +1829,15 @@ unlock:
1810 * src_ci. Two attempts are made to obtain both caps, and an error is return if 1829 * src_ci. Two attempts are made to obtain both caps, and an error is return if
1811 * this fails; zero is returned on success. 1830 * this fails; zero is returned on success.
1812 */ 1831 */
1813static int get_rd_wr_caps(struct ceph_inode_info *src_ci, 1832static int get_rd_wr_caps(struct file *src_filp, int *src_got,
1814 loff_t src_endoff, int *src_got, 1833 struct file *dst_filp,
1815 struct ceph_inode_info *dst_ci,
1816 loff_t dst_endoff, int *dst_got) 1834 loff_t dst_endoff, int *dst_got)
1817{ 1835{
1818 int ret = 0; 1836 int ret = 0;
1819 bool retrying = false; 1837 bool retrying = false;
1820 1838
1821retry_caps: 1839retry_caps:
1822 ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, 1840 ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
1823 dst_endoff, dst_got, NULL); 1841 dst_endoff, dst_got, NULL);
1824 if (ret < 0) 1842 if (ret < 0)
1825 return ret; 1843 return ret;
@@ -1829,24 +1847,24 @@ retry_caps:
1829 * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some 1847 * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some
1830 * retry dance instead to try to get both capabilities. 1848 * retry dance instead to try to get both capabilities.
1831 */ 1849 */
1832 ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, 1850 ret = ceph_try_get_caps(file_inode(src_filp),
1851 CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED,
1833 false, src_got); 1852 false, src_got);
1834 if (ret <= 0) { 1853 if (ret <= 0) {
1835 /* Start by dropping dst_ci caps and getting src_ci caps */ 1854 /* Start by dropping dst_ci caps and getting src_ci caps */
1836 ceph_put_cap_refs(dst_ci, *dst_got); 1855 ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got);
1837 if (retrying) { 1856 if (retrying) {
1838 if (!ret) 1857 if (!ret)
1839 /* ceph_try_get_caps masks EAGAIN */ 1858 /* ceph_try_get_caps masks EAGAIN */
1840 ret = -EAGAIN; 1859 ret = -EAGAIN;
1841 return ret; 1860 return ret;
1842 } 1861 }
1843 ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, 1862 ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD,
1844 CEPH_CAP_FILE_SHARED, src_endoff, 1863 CEPH_CAP_FILE_SHARED, -1, src_got, NULL);
1845 src_got, NULL);
1846 if (ret < 0) 1864 if (ret < 0)
1847 return ret; 1865 return ret;
1848 /*... drop src_ci caps too, and retry */ 1866 /*... drop src_ci caps too, and retry */
1849 ceph_put_cap_refs(src_ci, *src_got); 1867 ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got);
1850 retrying = true; 1868 retrying = true;
1851 goto retry_caps; 1869 goto retry_caps;
1852 } 1870 }
@@ -1904,6 +1922,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
1904 struct ceph_inode_info *src_ci = ceph_inode(src_inode); 1922 struct ceph_inode_info *src_ci = ceph_inode(src_inode);
1905 struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); 1923 struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
1906 struct ceph_cap_flush *prealloc_cf; 1924 struct ceph_cap_flush *prealloc_cf;
1925 struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
1907 struct ceph_object_locator src_oloc, dst_oloc; 1926 struct ceph_object_locator src_oloc, dst_oloc;
1908 struct ceph_object_id src_oid, dst_oid; 1927 struct ceph_object_id src_oid, dst_oid;
1909 loff_t endoff = 0, size; 1928 loff_t endoff = 0, size;
@@ -1913,10 +1932,16 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
1913 int src_got = 0, dst_got = 0, err, dirty; 1932 int src_got = 0, dst_got = 0, err, dirty;
1914 bool do_final_copy = false; 1933 bool do_final_copy = false;
1915 1934
1916 if (src_inode == dst_inode) 1935 if (src_inode->i_sb != dst_inode->i_sb) {
1917 return -EINVAL; 1936 struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
1918 if (src_inode->i_sb != dst_inode->i_sb) 1937
1919 return -EXDEV; 1938 if (ceph_fsid_compare(&src_fsc->client->fsid,
1939 &dst_fsc->client->fsid)) {
1940 dout("Copying files across clusters: src: %pU dst: %pU\n",
1941 &src_fsc->client->fsid, &dst_fsc->client->fsid);
1942 return -EXDEV;
1943 }
1944 }
1920 if (ceph_snap(dst_inode) != CEPH_NOSNAP) 1945 if (ceph_snap(dst_inode) != CEPH_NOSNAP)
1921 return -EROFS; 1946 return -EROFS;
1922 1947
@@ -1928,7 +1953,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
1928 * efficient). 1953 * efficient).
1929 */ 1954 */
1930 1955
1931 if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM)) 1956 if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
1932 return -EOPNOTSUPP; 1957 return -EOPNOTSUPP;
1933 1958
1934 if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || 1959 if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
@@ -1960,8 +1985,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
1960 * clients may have dirty data in their caches. And OSDs know nothing 1985 * clients may have dirty data in their caches. And OSDs know nothing
1961 * about caps, so they can't safely do the remote object copies. 1986 * about caps, so they can't safely do the remote object copies.
1962 */ 1987 */
1963 err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, 1988 err = get_rd_wr_caps(src_file, &src_got,
1964 dst_ci, (dst_off + len), &dst_got); 1989 dst_file, (dst_off + len), &dst_got);
1965 if (err < 0) { 1990 if (err < 0) {
1966 dout("get_rd_wr_caps returned %d\n", err); 1991 dout("get_rd_wr_caps returned %d\n", err);
1967 ret = -EOPNOTSUPP; 1992 ret = -EOPNOTSUPP;
@@ -2018,9 +2043,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
2018 goto out; 2043 goto out;
2019 } 2044 }
2020 len -= ret; 2045 len -= ret;
2021 err = get_rd_wr_caps(src_ci, (src_off + len), 2046 err = get_rd_wr_caps(src_file, &src_got,
2022 &src_got, dst_ci, 2047 dst_file, (dst_off + len), &dst_got);
2023 (dst_off + len), &dst_got);
2024 if (err < 0) 2048 if (err < 0)
2025 goto out; 2049 goto out;
2026 err = is_file_size_ok(src_inode, dst_inode, 2050 err = is_file_size_ok(src_inode, dst_inode,
@@ -2044,7 +2068,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
2044 dst_ci->i_vino.ino, dst_objnum); 2068 dst_ci->i_vino.ino, dst_objnum);
2045 /* Do an object remote copy */ 2069 /* Do an object remote copy */
2046 err = ceph_osdc_copy_from( 2070 err = ceph_osdc_copy_from(
2047 &ceph_inode_to_client(src_inode)->client->osdc, 2071 &src_fsc->client->osdc,
2048 src_ci->i_vino.snap, 0, 2072 src_ci->i_vino.snap, 0,
2049 &src_oid, &src_oloc, 2073 &src_oid, &src_oloc,
2050 CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | 2074 CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 18500edefc56..9f135624ae47 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -515,6 +515,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
515 515
516 ceph_fscache_inode_init(ci); 516 ceph_fscache_inode_init(ci);
517 517
518 ci->i_meta_err = 0;
519
518 return &ci->vfs_inode; 520 return &ci->vfs_inode;
519} 521}
520 522
@@ -801,7 +803,12 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
801 803
802 /* update inode */ 804 /* update inode */
803 inode->i_rdev = le32_to_cpu(info->rdev); 805 inode->i_rdev = le32_to_cpu(info->rdev);
804 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 806 /* directories have fl_stripe_unit set to zero */
807 if (le32_to_cpu(info->layout.fl_stripe_unit))
808 inode->i_blkbits =
809 fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
810 else
811 inode->i_blkbits = CEPH_BLOCK_SHIFT;
805 812
806 __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); 813 __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files);
807 814
@@ -1982,7 +1989,7 @@ static const struct inode_operations ceph_symlink_iops = {
1982int __ceph_setattr(struct inode *inode, struct iattr *attr) 1989int __ceph_setattr(struct inode *inode, struct iattr *attr)
1983{ 1990{
1984 struct ceph_inode_info *ci = ceph_inode(inode); 1991 struct ceph_inode_info *ci = ceph_inode(inode);
1985 const unsigned int ia_valid = attr->ia_valid; 1992 unsigned int ia_valid = attr->ia_valid;
1986 struct ceph_mds_request *req; 1993 struct ceph_mds_request *req;
1987 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; 1994 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1988 struct ceph_cap_flush *prealloc_cf; 1995 struct ceph_cap_flush *prealloc_cf;
@@ -2087,6 +2094,26 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
2087 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2094 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2088 } 2095 }
2089 } 2096 }
2097 if (ia_valid & ATTR_SIZE) {
2098 dout("setattr %p size %lld -> %lld\n", inode,
2099 inode->i_size, attr->ia_size);
2100 if ((issued & CEPH_CAP_FILE_EXCL) &&
2101 attr->ia_size > inode->i_size) {
2102 i_size_write(inode, attr->ia_size);
2103 inode->i_blocks = calc_inode_blocks(attr->ia_size);
2104 ci->i_reported_size = attr->ia_size;
2105 dirtied |= CEPH_CAP_FILE_EXCL;
2106 ia_valid |= ATTR_MTIME;
2107 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
2108 attr->ia_size != inode->i_size) {
2109 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
2110 req->r_args.setattr.old_size =
2111 cpu_to_le64(inode->i_size);
2112 mask |= CEPH_SETATTR_SIZE;
2113 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2114 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2115 }
2116 }
2090 if (ia_valid & ATTR_MTIME) { 2117 if (ia_valid & ATTR_MTIME) {
2091 dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, 2118 dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode,
2092 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, 2119 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
@@ -2109,25 +2136,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr)
2109 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; 2136 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2110 } 2137 }
2111 } 2138 }
2112 if (ia_valid & ATTR_SIZE) {
2113 dout("setattr %p size %lld -> %lld\n", inode,
2114 inode->i_size, attr->ia_size);
2115 if ((issued & CEPH_CAP_FILE_EXCL) &&
2116 attr->ia_size > inode->i_size) {
2117 i_size_write(inode, attr->ia_size);
2118 inode->i_blocks = calc_inode_blocks(attr->ia_size);
2119 ci->i_reported_size = attr->ia_size;
2120 dirtied |= CEPH_CAP_FILE_EXCL;
2121 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
2122 attr->ia_size != inode->i_size) {
2123 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
2124 req->r_args.setattr.old_size =
2125 cpu_to_le64(inode->i_size);
2126 mask |= CEPH_SETATTR_SIZE;
2127 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2128 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
2129 }
2130 }
2131 2139
2132 /* these do nothing */ 2140 /* these do nothing */
2133 if (ia_valid & ATTR_CTIME) { 2141 if (ia_valid & ATTR_CTIME) {
diff --git a/fs/ceph/io.c b/fs/ceph/io.c
new file mode 100644
index 000000000000..97602ea92ff4
--- /dev/null
+++ b/fs/ceph/io.c
@@ -0,0 +1,163 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2016 Trond Myklebust
4 * Copyright (c) 2019 Jeff Layton
5 *
6 * I/O and data path helper functionality.
7 *
8 * Heavily borrowed from equivalent code in fs/nfs/io.c
9 */
10
11#include <linux/ceph/ceph_debug.h>
12
13#include <linux/types.h>
14#include <linux/kernel.h>
15#include <linux/rwsem.h>
16#include <linux/fs.h>
17
18#include "super.h"
19#include "io.h"
20
21/* Call with exclusively locked inode->i_rwsem */
22static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode)
23{
24 lockdep_assert_held_write(&inode->i_rwsem);
25
26 if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) {
27 spin_lock(&ci->i_ceph_lock);
28 ci->i_ceph_flags &= ~CEPH_I_ODIRECT;
29 spin_unlock(&ci->i_ceph_lock);
30 inode_dio_wait(inode);
31 }
32}
33
34/**
35 * ceph_start_io_read - declare the file is being used for buffered reads
36 * @inode: file inode
37 *
38 * Declare that a buffered read operation is about to start, and ensure
39 * that we block all direct I/O.
40 * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset,
41 * and holds a shared lock on inode->i_rwsem to ensure that the flag
42 * cannot be changed.
43 * In practice, this means that buffered read operations are allowed to
44 * execute in parallel, thanks to the shared lock, whereas direct I/O
45 * operations need to wait to grab an exclusive lock in order to set
46 * CEPH_I_ODIRECT.
47 * Note that buffered writes and truncates both take a write lock on
48 * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
49 */
50void
51ceph_start_io_read(struct inode *inode)
52{
53 struct ceph_inode_info *ci = ceph_inode(inode);
54
55 /* Be an optimist! */
56 down_read(&inode->i_rwsem);
57 if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT))
58 return;
59 up_read(&inode->i_rwsem);
60 /* Slow path.... */
61 down_write(&inode->i_rwsem);
62 ceph_block_o_direct(ci, inode);
63 downgrade_write(&inode->i_rwsem);
64}
65
66/**
67 * ceph_end_io_read - declare that the buffered read operation is done
68 * @inode: file inode
69 *
70 * Declare that a buffered read operation is done, and release the shared
71 * lock on inode->i_rwsem.
72 */
73void
74ceph_end_io_read(struct inode *inode)
75{
76 up_read(&inode->i_rwsem);
77}
78
79/**
80 * ceph_start_io_write - declare the file is being used for buffered writes
81 * @inode: file inode
82 *
83 * Declare that a buffered write operation is about to start, and ensure
84 * that we block all direct I/O.
85 */
86void
87ceph_start_io_write(struct inode *inode)
88{
89 down_write(&inode->i_rwsem);
90 ceph_block_o_direct(ceph_inode(inode), inode);
91}
92
93/**
94 * ceph_end_io_write - declare that the buffered write operation is done
95 * @inode: file inode
96 *
97 * Declare that a buffered write operation is done, and release the
98 * lock on inode->i_rwsem.
99 */
100void
101ceph_end_io_write(struct inode *inode)
102{
103 up_write(&inode->i_rwsem);
104}
105
106/* Call with exclusively locked inode->i_rwsem */
107static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode)
108{
109 lockdep_assert_held_write(&inode->i_rwsem);
110
111 if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) {
112 spin_lock(&ci->i_ceph_lock);
113 ci->i_ceph_flags |= CEPH_I_ODIRECT;
114 spin_unlock(&ci->i_ceph_lock);
115 /* FIXME: unmap_mapping_range? */
116 filemap_write_and_wait(inode->i_mapping);
117 }
118}
119
120/**
121 * ceph_end_io_direct - declare the file is being used for direct i/o
122 * @inode: file inode
123 *
124 * Declare that a direct I/O operation is about to start, and ensure
125 * that we block all buffered I/O.
126 * On exit, the function ensures that the CEPH_I_ODIRECT flag is set,
127 * and holds a shared lock on inode->i_rwsem to ensure that the flag
128 * cannot be changed.
129 * In practice, this means that direct I/O operations are allowed to
130 * execute in parallel, thanks to the shared lock, whereas buffered I/O
131 * operations need to wait to grab an exclusive lock in order to clear
132 * CEPH_I_ODIRECT.
133 * Note that buffered writes and truncates both take a write lock on
134 * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
135 */
136void
137ceph_start_io_direct(struct inode *inode)
138{
139 struct ceph_inode_info *ci = ceph_inode(inode);
140
141 /* Be an optimist! */
142 down_read(&inode->i_rwsem);
143 if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)
144 return;
145 up_read(&inode->i_rwsem);
146 /* Slow path.... */
147 down_write(&inode->i_rwsem);
148 ceph_block_buffered(ci, inode);
149 downgrade_write(&inode->i_rwsem);
150}
151
152/**
153 * ceph_end_io_direct - declare that the direct i/o operation is done
154 * @inode: file inode
155 *
156 * Declare that a direct I/O operation is done, and release the shared
157 * lock on inode->i_rwsem.
158 */
159void
160ceph_end_io_direct(struct inode *inode)
161{
162 up_read(&inode->i_rwsem);
163}
diff --git a/fs/ceph/io.h b/fs/ceph/io.h
new file mode 100644
index 000000000000..fa594cd77348
--- /dev/null
+++ b/fs/ceph/io.h
@@ -0,0 +1,12 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FS_CEPH_IO_H
#define _FS_CEPH_IO_H

/*
 * Helpers that serialise buffered and O_DIRECT I/O on a ceph inode.
 * Each start/end pair brackets one I/O operation; see fs/ceph/io.c.
 */
void ceph_start_io_read(struct inode *inode);
void ceph_end_io_read(struct inode *inode);
void ceph_start_io_write(struct inode *inode);
void ceph_end_io_write(struct inode *inode);
void ceph_start_io_direct(struct inode *inode);
void ceph_end_io_direct(struct inode *inode);

#endif /* _FS_CEPH_IO_H */
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 5083e238ad15..544e9e85b120 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -32,14 +32,18 @@ void __init ceph_flock_init(void)
32 32
33static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) 33static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
34{ 34{
35 struct inode *inode = file_inode(src->fl_file); 35 struct ceph_file_info *fi = dst->fl_file->private_data;
36 struct inode *inode = file_inode(dst->fl_file);
36 atomic_inc(&ceph_inode(inode)->i_filelock_ref); 37 atomic_inc(&ceph_inode(inode)->i_filelock_ref);
38 atomic_inc(&fi->num_locks);
37} 39}
38 40
39static void ceph_fl_release_lock(struct file_lock *fl) 41static void ceph_fl_release_lock(struct file_lock *fl)
40{ 42{
43 struct ceph_file_info *fi = fl->fl_file->private_data;
41 struct inode *inode = file_inode(fl->fl_file); 44 struct inode *inode = file_inode(fl->fl_file);
42 struct ceph_inode_info *ci = ceph_inode(inode); 45 struct ceph_inode_info *ci = ceph_inode(inode);
46 atomic_dec(&fi->num_locks);
43 if (atomic_dec_and_test(&ci->i_filelock_ref)) { 47 if (atomic_dec_and_test(&ci->i_filelock_ref)) {
44 /* clear error when all locks are released */ 48 /* clear error when all locks are released */
45 spin_lock(&ci->i_ceph_lock); 49 spin_lock(&ci->i_ceph_lock);
@@ -73,7 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
73 * window. Caller function will decrease the counter. 77 * window. Caller function will decrease the counter.
74 */ 78 */
75 fl->fl_ops = &ceph_fl_lock_ops; 79 fl->fl_ops = &ceph_fl_lock_ops;
76 atomic_inc(&ceph_inode(inode)->i_filelock_ref); 80 fl->fl_ops->fl_copy_lock(fl, NULL);
77 } 81 }
78 82
79 if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) 83 if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 920e9f048bd8..a8a8f84f3bbf 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -639,7 +639,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
639 s->s_renew_seq = 0; 639 s->s_renew_seq = 0;
640 INIT_LIST_HEAD(&s->s_caps); 640 INIT_LIST_HEAD(&s->s_caps);
641 s->s_nr_caps = 0; 641 s->s_nr_caps = 0;
642 s->s_trim_caps = 0;
643 refcount_set(&s->s_ref, 1); 642 refcount_set(&s->s_ref, 1);
644 INIT_LIST_HEAD(&s->s_waiting); 643 INIT_LIST_HEAD(&s->s_waiting);
645 INIT_LIST_HEAD(&s->s_unsafe); 644 INIT_LIST_HEAD(&s->s_unsafe);
@@ -1270,6 +1269,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
1270{ 1269{
1271 struct ceph_mds_request *req; 1270 struct ceph_mds_request *req;
1272 struct rb_node *p; 1271 struct rb_node *p;
1272 struct ceph_inode_info *ci;
1273 1273
1274 dout("cleanup_session_requests mds%d\n", session->s_mds); 1274 dout("cleanup_session_requests mds%d\n", session->s_mds);
1275 mutex_lock(&mdsc->mutex); 1275 mutex_lock(&mdsc->mutex);
@@ -1278,6 +1278,16 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
1278 struct ceph_mds_request, r_unsafe_item); 1278 struct ceph_mds_request, r_unsafe_item);
1279 pr_warn_ratelimited(" dropping unsafe request %llu\n", 1279 pr_warn_ratelimited(" dropping unsafe request %llu\n",
1280 req->r_tid); 1280 req->r_tid);
1281 if (req->r_target_inode) {
1282 /* dropping unsafe change of inode's attributes */
1283 ci = ceph_inode(req->r_target_inode);
1284 errseq_set(&ci->i_meta_err, -EIO);
1285 }
1286 if (req->r_unsafe_dir) {
1287 /* dropping unsafe directory operation */
1288 ci = ceph_inode(req->r_unsafe_dir);
1289 errseq_set(&ci->i_meta_err, -EIO);
1290 }
1281 __unregister_request(mdsc, req); 1291 __unregister_request(mdsc, req);
1282 } 1292 }
1283 /* zero r_attempts, so kick_requests() will re-send requests */ 1293 /* zero r_attempts, so kick_requests() will re-send requests */
@@ -1370,7 +1380,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1370 struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; 1380 struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg;
1371 struct ceph_inode_info *ci = ceph_inode(inode); 1381 struct ceph_inode_info *ci = ceph_inode(inode);
1372 LIST_HEAD(to_remove); 1382 LIST_HEAD(to_remove);
1373 bool drop = false; 1383 bool dirty_dropped = false;
1374 bool invalidate = false; 1384 bool invalidate = false;
1375 1385
1376 dout("removing cap %p, ci is %p, inode is %p\n", 1386 dout("removing cap %p, ci is %p, inode is %p\n",
@@ -1383,9 +1393,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1383 struct ceph_cap_flush *cf; 1393 struct ceph_cap_flush *cf;
1384 struct ceph_mds_client *mdsc = fsc->mdsc; 1394 struct ceph_mds_client *mdsc = fsc->mdsc;
1385 1395
1386 if (ci->i_wrbuffer_ref > 0 && 1396 if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
1387 READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) 1397 if (inode->i_data.nrpages > 0)
1388 invalidate = true; 1398 invalidate = true;
1399 if (ci->i_wrbuffer_ref > 0)
1400 mapping_set_error(&inode->i_data, -EIO);
1401 }
1389 1402
1390 while (!list_empty(&ci->i_cap_flush_list)) { 1403 while (!list_empty(&ci->i_cap_flush_list)) {
1391 cf = list_first_entry(&ci->i_cap_flush_list, 1404 cf = list_first_entry(&ci->i_cap_flush_list,
@@ -1405,7 +1418,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1405 inode, ceph_ino(inode)); 1418 inode, ceph_ino(inode));
1406 ci->i_dirty_caps = 0; 1419 ci->i_dirty_caps = 0;
1407 list_del_init(&ci->i_dirty_item); 1420 list_del_init(&ci->i_dirty_item);
1408 drop = true; 1421 dirty_dropped = true;
1409 } 1422 }
1410 if (!list_empty(&ci->i_flushing_item)) { 1423 if (!list_empty(&ci->i_flushing_item)) {
1411 pr_warn_ratelimited( 1424 pr_warn_ratelimited(
@@ -1415,10 +1428,22 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1415 ci->i_flushing_caps = 0; 1428 ci->i_flushing_caps = 0;
1416 list_del_init(&ci->i_flushing_item); 1429 list_del_init(&ci->i_flushing_item);
1417 mdsc->num_cap_flushing--; 1430 mdsc->num_cap_flushing--;
1418 drop = true; 1431 dirty_dropped = true;
1419 } 1432 }
1420 spin_unlock(&mdsc->cap_dirty_lock); 1433 spin_unlock(&mdsc->cap_dirty_lock);
1421 1434
1435 if (dirty_dropped) {
1436 errseq_set(&ci->i_meta_err, -EIO);
1437
1438 if (ci->i_wrbuffer_ref_head == 0 &&
1439 ci->i_wr_ref == 0 &&
1440 ci->i_dirty_caps == 0 &&
1441 ci->i_flushing_caps == 0) {
1442 ceph_put_snap_context(ci->i_head_snapc);
1443 ci->i_head_snapc = NULL;
1444 }
1445 }
1446
1422 if (atomic_read(&ci->i_filelock_ref) > 0) { 1447 if (atomic_read(&ci->i_filelock_ref) > 0) {
1423 /* make further file lock syscall return -EIO */ 1448 /* make further file lock syscall return -EIO */
1424 ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; 1449 ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK;
@@ -1430,15 +1455,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1430 list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); 1455 list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
1431 ci->i_prealloc_cap_flush = NULL; 1456 ci->i_prealloc_cap_flush = NULL;
1432 } 1457 }
1433
1434 if (drop &&
1435 ci->i_wrbuffer_ref_head == 0 &&
1436 ci->i_wr_ref == 0 &&
1437 ci->i_dirty_caps == 0 &&
1438 ci->i_flushing_caps == 0) {
1439 ceph_put_snap_context(ci->i_head_snapc);
1440 ci->i_head_snapc = NULL;
1441 }
1442 } 1458 }
1443 spin_unlock(&ci->i_ceph_lock); 1459 spin_unlock(&ci->i_ceph_lock);
1444 while (!list_empty(&to_remove)) { 1460 while (!list_empty(&to_remove)) {
@@ -1452,7 +1468,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
1452 wake_up_all(&ci->i_cap_wq); 1468 wake_up_all(&ci->i_cap_wq);
1453 if (invalidate) 1469 if (invalidate)
1454 ceph_queue_invalidate(inode); 1470 ceph_queue_invalidate(inode);
1455 if (drop) 1471 if (dirty_dropped)
1456 iput(inode); 1472 iput(inode);
1457 return 0; 1473 return 0;
1458} 1474}
@@ -1705,11 +1721,11 @@ out:
1705 */ 1721 */
1706static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) 1722static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1707{ 1723{
1708 struct ceph_mds_session *session = arg; 1724 int *remaining = arg;
1709 struct ceph_inode_info *ci = ceph_inode(inode); 1725 struct ceph_inode_info *ci = ceph_inode(inode);
1710 int used, wanted, oissued, mine; 1726 int used, wanted, oissued, mine;
1711 1727
1712 if (session->s_trim_caps <= 0) 1728 if (*remaining <= 0)
1713 return -1; 1729 return -1;
1714 1730
1715 spin_lock(&ci->i_ceph_lock); 1731 spin_lock(&ci->i_ceph_lock);
@@ -1746,7 +1762,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1746 if (oissued) { 1762 if (oissued) {
1747 /* we aren't the only cap.. just remove us */ 1763 /* we aren't the only cap.. just remove us */
1748 __ceph_remove_cap(cap, true); 1764 __ceph_remove_cap(cap, true);
1749 session->s_trim_caps--; 1765 (*remaining)--;
1750 } else { 1766 } else {
1751 struct dentry *dentry; 1767 struct dentry *dentry;
1752 /* try dropping referring dentries */ 1768 /* try dropping referring dentries */
@@ -1758,7 +1774,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
1758 d_prune_aliases(inode); 1774 d_prune_aliases(inode);
1759 count = atomic_read(&inode->i_count); 1775 count = atomic_read(&inode->i_count);
1760 if (count == 1) 1776 if (count == 1)
1761 session->s_trim_caps--; 1777 (*remaining)--;
1762 dout("trim_caps_cb %p cap %p pruned, count now %d\n", 1778 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
1763 inode, cap, count); 1779 inode, cap, count);
1764 } else { 1780 } else {
@@ -1784,12 +1800,12 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc,
1784 dout("trim_caps mds%d start: %d / %d, trim %d\n", 1800 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1785 session->s_mds, session->s_nr_caps, max_caps, trim_caps); 1801 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1786 if (trim_caps > 0) { 1802 if (trim_caps > 0) {
1787 session->s_trim_caps = trim_caps; 1803 int remaining = trim_caps;
1788 ceph_iterate_session_caps(session, trim_caps_cb, session); 1804
1805 ceph_iterate_session_caps(session, trim_caps_cb, &remaining);
1789 dout("trim_caps mds%d done: %d / %d, trimmed %d\n", 1806 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1790 session->s_mds, session->s_nr_caps, max_caps, 1807 session->s_mds, session->s_nr_caps, max_caps,
1791 trim_caps - session->s_trim_caps); 1808 trim_caps - remaining);
1792 session->s_trim_caps = 0;
1793 } 1809 }
1794 1810
1795 ceph_flush_cap_releases(mdsc, session); 1811 ceph_flush_cap_releases(mdsc, session);
@@ -3015,18 +3031,23 @@ bad:
3015 pr_err("mdsc_handle_forward decode error err=%d\n", err); 3031 pr_err("mdsc_handle_forward decode error err=%d\n", err);
3016} 3032}
3017 3033
3018static int __decode_and_drop_session_metadata(void **p, void *end) 3034static int __decode_session_metadata(void **p, void *end,
3035 bool *blacklisted)
3019{ 3036{
3020 /* map<string,string> */ 3037 /* map<string,string> */
3021 u32 n; 3038 u32 n;
3039 bool err_str;
3022 ceph_decode_32_safe(p, end, n, bad); 3040 ceph_decode_32_safe(p, end, n, bad);
3023 while (n-- > 0) { 3041 while (n-- > 0) {
3024 u32 len; 3042 u32 len;
3025 ceph_decode_32_safe(p, end, len, bad); 3043 ceph_decode_32_safe(p, end, len, bad);
3026 ceph_decode_need(p, end, len, bad); 3044 ceph_decode_need(p, end, len, bad);
3045 err_str = !strncmp(*p, "error_string", len);
3027 *p += len; 3046 *p += len;
3028 ceph_decode_32_safe(p, end, len, bad); 3047 ceph_decode_32_safe(p, end, len, bad);
3029 ceph_decode_need(p, end, len, bad); 3048 ceph_decode_need(p, end, len, bad);
3049 if (err_str && strnstr(*p, "blacklisted", len))
3050 *blacklisted = true;
3030 *p += len; 3051 *p += len;
3031 } 3052 }
3032 return 0; 3053 return 0;
@@ -3050,6 +3071,7 @@ static void handle_session(struct ceph_mds_session *session,
3050 u64 seq; 3071 u64 seq;
3051 unsigned long features = 0; 3072 unsigned long features = 0;
3052 int wake = 0; 3073 int wake = 0;
3074 bool blacklisted = false;
3053 3075
3054 /* decode */ 3076 /* decode */
3055 ceph_decode_need(&p, end, sizeof(*h), bad); 3077 ceph_decode_need(&p, end, sizeof(*h), bad);
@@ -3062,7 +3084,7 @@ static void handle_session(struct ceph_mds_session *session,
3062 if (msg_version >= 3) { 3084 if (msg_version >= 3) {
3063 u32 len; 3085 u32 len;
3064 /* version >= 2, metadata */ 3086 /* version >= 2, metadata */
3065 if (__decode_and_drop_session_metadata(&p, end) < 0) 3087 if (__decode_session_metadata(&p, end, &blacklisted) < 0)
3066 goto bad; 3088 goto bad;
3067 /* version >= 3, feature bits */ 3089 /* version >= 3, feature bits */
3068 ceph_decode_32_safe(&p, end, len, bad); 3090 ceph_decode_32_safe(&p, end, len, bad);
@@ -3149,6 +3171,8 @@ static void handle_session(struct ceph_mds_session *session,
3149 session->s_state = CEPH_MDS_SESSION_REJECTED; 3171 session->s_state = CEPH_MDS_SESSION_REJECTED;
3150 cleanup_session_requests(mdsc, session); 3172 cleanup_session_requests(mdsc, session);
3151 remove_session_caps(session); 3173 remove_session_caps(session);
3174 if (blacklisted)
3175 mdsc->fsc->blacklisted = true;
3152 wake = 2; /* for good measure */ 3176 wake = 2; /* for good measure */
3153 break; 3177 break;
3154 3178
@@ -3998,7 +4022,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
3998 mutex_unlock(&mdsc->mutex); 4022 mutex_unlock(&mdsc->mutex);
3999} 4023}
4000 4024
4025static void maybe_recover_session(struct ceph_mds_client *mdsc)
4026{
4027 struct ceph_fs_client *fsc = mdsc->fsc;
4028
4029 if (!ceph_test_mount_opt(fsc, CLEANRECOVER))
4030 return;
4031
4032 if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED)
4033 return;
4034
4035 if (!READ_ONCE(fsc->blacklisted))
4036 return;
4037
4038 if (fsc->last_auto_reconnect &&
4039 time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30))
4040 return;
4001 4041
4042 pr_info("auto reconnect after blacklisted\n");
4043 fsc->last_auto_reconnect = jiffies;
4044 ceph_force_reconnect(fsc->sb);
4045}
4002 4046
4003/* 4047/*
4004 * delayed work -- periodically trim expired leases, renew caps with mds 4048 * delayed work -- periodically trim expired leases, renew caps with mds
@@ -4044,7 +4088,9 @@ static void delayed_work(struct work_struct *work)
4044 pr_info("mds%d hung\n", s->s_mds); 4088 pr_info("mds%d hung\n", s->s_mds);
4045 } 4089 }
4046 } 4090 }
4047 if (s->s_state < CEPH_MDS_SESSION_OPEN) { 4091 if (s->s_state == CEPH_MDS_SESSION_NEW ||
4092 s->s_state == CEPH_MDS_SESSION_RESTARTING ||
4093 s->s_state == CEPH_MDS_SESSION_REJECTED) {
4048 /* this mds is failed or recovering, just wait */ 4094 /* this mds is failed or recovering, just wait */
4049 ceph_put_mds_session(s); 4095 ceph_put_mds_session(s);
4050 continue; 4096 continue;
@@ -4072,6 +4118,8 @@ static void delayed_work(struct work_struct *work)
4072 4118
4073 ceph_trim_snapid_map(mdsc); 4119 ceph_trim_snapid_map(mdsc);
4074 4120
4121 maybe_recover_session(mdsc);
4122
4075 schedule_delayed(mdsc); 4123 schedule_delayed(mdsc);
4076} 4124}
4077 4125
@@ -4355,7 +4403,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
4355 session = __ceph_lookup_mds_session(mdsc, mds); 4403 session = __ceph_lookup_mds_session(mdsc, mds);
4356 if (!session) 4404 if (!session)
4357 continue; 4405 continue;
4406
4407 if (session->s_state == CEPH_MDS_SESSION_REJECTED)
4408 __unregister_session(mdsc, session);
4409 __wake_requests(mdsc, &session->s_waiting);
4358 mutex_unlock(&mdsc->mutex); 4410 mutex_unlock(&mdsc->mutex);
4411
4359 mutex_lock(&session->s_mutex); 4412 mutex_lock(&session->s_mutex);
4360 __close_session(mdsc, session); 4413 __close_session(mdsc, session);
4361 if (session->s_state == CEPH_MDS_SESSION_CLOSING) { 4414 if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
@@ -4364,6 +4417,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
4364 } 4417 }
4365 mutex_unlock(&session->s_mutex); 4418 mutex_unlock(&session->s_mutex);
4366 ceph_put_mds_session(session); 4419 ceph_put_mds_session(session);
4420
4367 mutex_lock(&mdsc->mutex); 4421 mutex_lock(&mdsc->mutex);
4368 kick_requests(mdsc, mds); 4422 kick_requests(mdsc, mds);
4369 } 4423 }
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index f7c8603484fe..5cd131b41d84 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -148,9 +148,9 @@ enum {
148 CEPH_MDS_SESSION_OPENING = 2, 148 CEPH_MDS_SESSION_OPENING = 2,
149 CEPH_MDS_SESSION_OPEN = 3, 149 CEPH_MDS_SESSION_OPEN = 3,
150 CEPH_MDS_SESSION_HUNG = 4, 150 CEPH_MDS_SESSION_HUNG = 4,
151 CEPH_MDS_SESSION_CLOSING = 5, 151 CEPH_MDS_SESSION_RESTARTING = 5,
152 CEPH_MDS_SESSION_RESTARTING = 6, 152 CEPH_MDS_SESSION_RECONNECTING = 6,
153 CEPH_MDS_SESSION_RECONNECTING = 7, 153 CEPH_MDS_SESSION_CLOSING = 7,
154 CEPH_MDS_SESSION_REJECTED = 8, 154 CEPH_MDS_SESSION_REJECTED = 8,
155}; 155};
156 156
@@ -176,7 +176,7 @@ struct ceph_mds_session {
176 spinlock_t s_cap_lock; 176 spinlock_t s_cap_lock;
177 struct list_head s_caps; /* all caps issued by this session */ 177 struct list_head s_caps; /* all caps issued by this session */
178 struct ceph_cap *s_cap_iterator; 178 struct ceph_cap *s_cap_iterator;
179 int s_nr_caps, s_trim_caps; 179 int s_nr_caps;
180 int s_num_cap_releases; 180 int s_num_cap_releases;
181 int s_cap_reconnect; 181 int s_cap_reconnect;
182 int s_readonly; 182 int s_readonly;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 377fafc76f20..edfd643a8205 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -143,6 +143,7 @@ enum {
143 Opt_snapdirname, 143 Opt_snapdirname,
144 Opt_mds_namespace, 144 Opt_mds_namespace,
145 Opt_fscache_uniq, 145 Opt_fscache_uniq,
146 Opt_recover_session,
146 Opt_last_string, 147 Opt_last_string,
147 /* string args above */ 148 /* string args above */
148 Opt_dirstat, 149 Opt_dirstat,
@@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = {
184 /* int args above */ 185 /* int args above */
185 {Opt_snapdirname, "snapdirname=%s"}, 186 {Opt_snapdirname, "snapdirname=%s"},
186 {Opt_mds_namespace, "mds_namespace=%s"}, 187 {Opt_mds_namespace, "mds_namespace=%s"},
188 {Opt_recover_session, "recover_session=%s"},
187 {Opt_fscache_uniq, "fsc=%s"}, 189 {Opt_fscache_uniq, "fsc=%s"},
188 /* string args above */ 190 /* string args above */
189 {Opt_dirstat, "dirstat"}, 191 {Opt_dirstat, "dirstat"},
@@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private)
254 if (!fsopt->mds_namespace) 256 if (!fsopt->mds_namespace)
255 return -ENOMEM; 257 return -ENOMEM;
256 break; 258 break;
259 case Opt_recover_session:
260 if (!strncmp(argstr[0].from, "no",
261 argstr[0].to - argstr[0].from)) {
262 fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER;
263 } else if (!strncmp(argstr[0].from, "clean",
264 argstr[0].to - argstr[0].from)) {
265 fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER;
266 } else {
267 return -EINVAL;
268 }
269 break;
257 case Opt_fscache_uniq: 270 case Opt_fscache_uniq:
258 kfree(fsopt->fscache_uniq); 271 kfree(fsopt->fscache_uniq);
259 fsopt->fscache_uniq = kstrndup(argstr[0].from, 272 fsopt->fscache_uniq = kstrndup(argstr[0].from,
@@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
576 589
577 if (fsopt->mds_namespace) 590 if (fsopt->mds_namespace)
578 seq_show_option(m, "mds_namespace", fsopt->mds_namespace); 591 seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
592
593 if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER)
594 seq_show_option(m, "recover_session", "clean");
595
579 if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) 596 if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
580 seq_printf(m, ",wsize=%d", fsopt->wsize); 597 seq_printf(m, ",wsize=%d", fsopt->wsize);
581 if (fsopt->rsize != CEPH_MAX_READ_SIZE) 598 if (fsopt->rsize != CEPH_MAX_READ_SIZE)
@@ -664,6 +681,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
664 681
665 fsc->sb = NULL; 682 fsc->sb = NULL;
666 fsc->mount_state = CEPH_MOUNT_MOUNTING; 683 fsc->mount_state = CEPH_MOUNT_MOUNTING;
684 fsc->filp_gen = 1;
667 685
668 atomic_long_set(&fsc->writeback_count, 0); 686 atomic_long_set(&fsc->writeback_count, 0);
669 687
@@ -713,6 +731,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
713{ 731{
714 dout("destroy_fs_client %p\n", fsc); 732 dout("destroy_fs_client %p\n", fsc);
715 733
734 ceph_mdsc_destroy(fsc);
716 destroy_workqueue(fsc->inode_wq); 735 destroy_workqueue(fsc->inode_wq);
717 destroy_workqueue(fsc->cap_wq); 736 destroy_workqueue(fsc->cap_wq);
718 737
@@ -829,7 +848,7 @@ static void ceph_umount_begin(struct super_block *sb)
829 fsc->mount_state = CEPH_MOUNT_SHUTDOWN; 848 fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
830 ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); 849 ceph_osdc_abort_requests(&fsc->client->osdc, -EIO);
831 ceph_mdsc_force_umount(fsc->mdsc); 850 ceph_mdsc_force_umount(fsc->mdsc);
832 return; 851 fsc->filp_gen++; // invalidate open files
833} 852}
834 853
835static int ceph_remount(struct super_block *sb, int *flags, char *data) 854static int ceph_remount(struct super_block *sb, int *flags, char *data)
@@ -1089,7 +1108,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
1089 } 1108 }
1090 1109
1091 if (ceph_sb_to_client(sb) != fsc) { 1110 if (ceph_sb_to_client(sb) != fsc) {
1092 ceph_mdsc_destroy(fsc);
1093 destroy_fs_client(fsc); 1111 destroy_fs_client(fsc);
1094 fsc = ceph_sb_to_client(sb); 1112 fsc = ceph_sb_to_client(sb);
1095 dout("get_sb got existing client %p\n", fsc); 1113 dout("get_sb got existing client %p\n", fsc);
@@ -1115,7 +1133,6 @@ out_splat:
1115 goto out_final; 1133 goto out_final;
1116 1134
1117out: 1135out:
1118 ceph_mdsc_destroy(fsc);
1119 destroy_fs_client(fsc); 1136 destroy_fs_client(fsc);
1120out_final: 1137out_final:
1121 dout("ceph_mount fail %ld\n", PTR_ERR(res)); 1138 dout("ceph_mount fail %ld\n", PTR_ERR(res));
@@ -1139,8 +1156,6 @@ static void ceph_kill_sb(struct super_block *s)
1139 1156
1140 ceph_fscache_unregister_fs(fsc); 1157 ceph_fscache_unregister_fs(fsc);
1141 1158
1142 ceph_mdsc_destroy(fsc);
1143
1144 destroy_fs_client(fsc); 1159 destroy_fs_client(fsc);
1145 free_anon_bdev(dev); 1160 free_anon_bdev(dev);
1146} 1161}
@@ -1154,6 +1169,33 @@ static struct file_system_type ceph_fs_type = {
1154}; 1169};
1155MODULE_ALIAS_FS("ceph"); 1170MODULE_ALIAS_FS("ceph");
1156 1171
1172int ceph_force_reconnect(struct super_block *sb)
1173{
1174 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
1175 int err = 0;
1176
1177 ceph_umount_begin(sb);
1178
1179 /* Make sure all page caches get invalidated.
1180 * see remove_session_caps_cb() */
1181 flush_workqueue(fsc->inode_wq);
1182
 1183 /* In case that we were blacklisted. This also resets
1184 * all mon/osd connections */
1185 ceph_reset_client_addr(fsc->client);
1186
1187 ceph_osdc_clear_abort_err(&fsc->client->osdc);
1188
1189 fsc->blacklisted = false;
1190 fsc->mount_state = CEPH_MOUNT_MOUNTED;
1191
1192 if (sb->s_root) {
1193 err = __ceph_do_getattr(d_inode(sb->s_root), NULL,
1194 CEPH_STAT_CAP_INODE, true);
1195 }
1196 return err;
1197}
1198
1157static int __init init_ceph(void) 1199static int __init init_ceph(void)
1158{ 1200{
1159 int ret = init_caches(); 1201 int ret = init_caches();
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6b9f1ee7de85..f98d9247f9cb 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -16,6 +16,7 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/posix_acl.h> 17#include <linux/posix_acl.h>
18#include <linux/refcount.h> 18#include <linux/refcount.h>
19#include <linux/security.h>
19 20
20#include <linux/ceph/libceph.h> 21#include <linux/ceph/libceph.h>
21 22
@@ -31,6 +32,7 @@
31#define CEPH_BLOCK_SHIFT 22 /* 4 MB */ 32#define CEPH_BLOCK_SHIFT 22 /* 4 MB */
32#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 33#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
33 34
 35#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reconnect (clean mode) after blacklisted */
34#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ 36#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
35#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ 37#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
36#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ 38#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
@@ -101,6 +103,11 @@ struct ceph_fs_client {
101 struct ceph_client *client; 103 struct ceph_client *client;
102 104
103 unsigned long mount_state; 105 unsigned long mount_state;
106
107 unsigned long last_auto_reconnect;
108 bool blacklisted;
109
110 u32 filp_gen;
104 loff_t max_file_size; 111 loff_t max_file_size;
105 112
106 struct ceph_mds_client *mdsc; 113 struct ceph_mds_client *mdsc;
@@ -395,6 +402,8 @@ struct ceph_inode_info {
395 struct fscache_cookie *fscache; 402 struct fscache_cookie *fscache;
396 u32 i_fscache_gen; 403 u32 i_fscache_gen;
397#endif 404#endif
405 errseq_t i_meta_err;
406
398 struct inode vfs_inode; /* at end */ 407 struct inode vfs_inode; /* at end */
399}; 408};
400 409
@@ -499,17 +508,16 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
499#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ 508#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */
500#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ 509#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */
501#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ 510#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */
502#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ 511#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */
503#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ 512#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */
504#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ 513#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */
505#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ 514#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */
506#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ 515#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */
507#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ 516#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */
508#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ 517#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */
509#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ 518#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */
510#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ 519#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */
511#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ 520#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */
512
513 521
514/* 522/*
515 * Masks of ceph inode work. 523 * Masks of ceph inode work.
@@ -703,6 +711,10 @@ struct ceph_file_info {
703 711
704 spinlock_t rw_contexts_lock; 712 spinlock_t rw_contexts_lock;
705 struct list_head rw_contexts; 713 struct list_head rw_contexts;
714
715 errseq_t meta_err;
716 u32 filp_gen;
717 atomic_t num_locks;
706}; 718};
707 719
708struct ceph_dir_file_info { 720struct ceph_dir_file_info {
@@ -842,7 +854,8 @@ static inline int default_congestion_kb(void)
842} 854}
843 855
844 856
845 857/* super.c */
858extern int ceph_force_reconnect(struct super_block *sb);
846/* snap.c */ 859/* snap.c */
847struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 860struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
848 u64 ino); 861 u64 ino);
@@ -959,7 +972,10 @@ static inline bool ceph_security_xattr_wanted(struct inode *in)
959#ifdef CONFIG_CEPH_FS_SECURITY_LABEL 972#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
960extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, 973extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
961 struct ceph_acl_sec_ctx *ctx); 974 struct ceph_acl_sec_ctx *ctx);
962extern void ceph_security_invalidate_secctx(struct inode *inode); 975static inline void ceph_security_invalidate_secctx(struct inode *inode)
976{
977 security_inode_invalidate_secctx(inode);
978}
963#else 979#else
964static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, 980static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode,
965 struct ceph_acl_sec_ctx *ctx) 981 struct ceph_acl_sec_ctx *ctx)
@@ -1039,7 +1055,6 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1039 struct ceph_mds_session *session); 1055 struct ceph_mds_session *session);
1040extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, 1056extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci,
1041 int mds); 1057 int mds);
1042extern int ceph_get_cap_mds(struct inode *inode);
1043extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); 1058extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
1044extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); 1059extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
1045extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, 1060extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
@@ -1058,9 +1073,9 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
1058 struct inode *dir, 1073 struct inode *dir,
1059 int mds, int drop, int unless); 1074 int mds, int drop, int unless);
1060 1075
1061extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 1076extern int ceph_get_caps(struct file *filp, int need, int want,
1062 loff_t endoff, int *got, struct page **pinned_page); 1077 loff_t endoff, int *got, struct page **pinned_page);
1063extern int ceph_try_get_caps(struct ceph_inode_info *ci, 1078extern int ceph_try_get_caps(struct inode *inode,
1064 int need, int want, bool nonblock, int *got); 1079 int need, int want, bool nonblock, int *got);
1065 1080
1066/* for counting open files by mode */ 1081/* for counting open files by mode */
@@ -1071,7 +1086,7 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
1071extern const struct address_space_operations ceph_aops; 1086extern const struct address_space_operations ceph_aops;
1072extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); 1087extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
1073extern int ceph_uninline_data(struct file *filp, struct page *locked_page); 1088extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
1074extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); 1089extern int ceph_pool_perm_check(struct inode *inode, int need);
1075extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); 1090extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
1076 1091
1077/* file.c */ 1092/* file.c */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 939eab7aa219..cb18ee637cb7 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -20,7 +20,8 @@ static int __remove_xattr(struct ceph_inode_info *ci,
20 20
21static bool ceph_is_valid_xattr(const char *name) 21static bool ceph_is_valid_xattr(const char *name)
22{ 22{
23 return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || 23 return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) ||
24 !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) ||
24 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 25 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
25 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); 26 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
26} 27}
@@ -892,7 +893,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
892 memcpy(value, xattr->val, xattr->val_len); 893 memcpy(value, xattr->val, xattr->val_len);
893 894
894 if (current->journal_info && 895 if (current->journal_info &&
895 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) 896 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) &&
897 security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN))
896 ci->i_ceph_flags |= CEPH_I_SEC_INITED; 898 ci->i_ceph_flags |= CEPH_I_SEC_INITED;
897out: 899out:
898 spin_unlock(&ci->i_ceph_lock); 900 spin_unlock(&ci->i_ceph_lock);
@@ -903,11 +905,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
903{ 905{
904 struct inode *inode = d_inode(dentry); 906 struct inode *inode = d_inode(dentry);
905 struct ceph_inode_info *ci = ceph_inode(inode); 907 struct ceph_inode_info *ci = ceph_inode(inode);
906 struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode);
907 bool len_only = (size == 0); 908 bool len_only = (size == 0);
908 u32 namelen; 909 u32 namelen;
909 int err; 910 int err;
910 int i;
911 911
912 spin_lock(&ci->i_ceph_lock); 912 spin_lock(&ci->i_ceph_lock);
913 dout("listxattr %p ver=%lld index_ver=%lld\n", inode, 913 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
@@ -936,33 +936,6 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
936 names = __copy_xattr_names(ci, names); 936 names = __copy_xattr_names(ci, names);
937 size -= namelen; 937 size -= namelen;
938 } 938 }
939
940
941 /* virtual xattr names, too */
942 if (vxattrs) {
943 for (i = 0; vxattrs[i].name; i++) {
944 size_t this_len;
945
946 if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN)
947 continue;
948 if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci))
949 continue;
950
951 this_len = strlen(vxattrs[i].name) + 1;
952 namelen += this_len;
953 if (len_only)
954 continue;
955
956 if (this_len > size) {
957 err = -ERANGE;
958 goto out;
959 }
960
961 memcpy(names, vxattrs[i].name, this_len);
962 names += this_len;
963 size -= this_len;
964 }
965 }
966 err = namelen; 939 err = namelen;
967out: 940out:
968 spin_unlock(&ci->i_ceph_lock); 941 spin_unlock(&ci->i_ceph_lock);
@@ -1293,42 +1266,8 @@ out:
1293 ceph_pagelist_release(pagelist); 1266 ceph_pagelist_release(pagelist);
1294 return err; 1267 return err;
1295} 1268}
1296 1269#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */
1297void ceph_security_invalidate_secctx(struct inode *inode) 1270#endif /* CONFIG_SECURITY */
1298{
1299 security_inode_invalidate_secctx(inode);
1300}
1301
1302static int ceph_xattr_set_security_label(const struct xattr_handler *handler,
1303 struct dentry *unused, struct inode *inode,
1304 const char *key, const void *buf,
1305 size_t buflen, int flags)
1306{
1307 if (security_ismaclabel(key)) {
1308 const char *name = xattr_full_name(handler, key);
1309 return __ceph_setxattr(inode, name, buf, buflen, flags);
1310 }
1311 return -EOPNOTSUPP;
1312}
1313
1314static int ceph_xattr_get_security_label(const struct xattr_handler *handler,
1315 struct dentry *unused, struct inode *inode,
1316 const char *key, void *buf, size_t buflen)
1317{
1318 if (security_ismaclabel(key)) {
1319 const char *name = xattr_full_name(handler, key);
1320 return __ceph_getxattr(inode, name, buf, buflen);
1321 }
1322 return -EOPNOTSUPP;
1323}
1324
1325static const struct xattr_handler ceph_security_label_handler = {
1326 .prefix = XATTR_SECURITY_PREFIX,
1327 .get = ceph_xattr_get_security_label,
1328 .set = ceph_xattr_set_security_label,
1329};
1330#endif
1331#endif
1332 1271
1333void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) 1272void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx)
1334{ 1273{
@@ -1352,9 +1291,6 @@ const struct xattr_handler *ceph_xattr_handlers[] = {
1352 &posix_acl_access_xattr_handler, 1291 &posix_acl_access_xattr_handler,
1353 &posix_acl_default_xattr_handler, 1292 &posix_acl_default_xattr_handler,
1354#endif 1293#endif
1355#ifdef CONFIG_CEPH_FS_SECURITY_LABEL
1356 &ceph_security_label_handler,
1357#endif
1358 &ceph_other_xattr_handler, 1294 &ceph_other_xattr_handler,
1359 NULL, 1295 NULL,
1360}; 1296};
diff --git a/fs/fs_context.c b/fs/fs_context.c
index 87c2c9687d90..138b5b4d621d 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -504,7 +504,6 @@ void put_fs_context(struct fs_context *fc)
504 put_net(fc->net_ns); 504 put_net(fc->net_ns);
505 put_user_ns(fc->user_ns); 505 put_user_ns(fc->user_ns);
506 put_cred(fc->cred); 506 put_cred(fc->cred);
507 kfree(fc->subtype);
508 put_fc_log(fc); 507 put_fc_log(fc);
509 put_filesystem(fc->fs_type); 508 put_filesystem(fc->fs_type);
510 kfree(fc->source); 509 kfree(fc->source);
@@ -571,17 +570,6 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
571 return 0; 570 return 0;
572 } 571 }
573 572
574 if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) &&
575 strcmp(param->key, "subtype") == 0) {
576 if (param->type != fs_value_is_string)
577 return invalf(fc, "VFS: Legacy: Non-string subtype");
578 if (fc->subtype)
579 return invalf(fc, "VFS: Legacy: Multiple subtype");
580 fc->subtype = param->string;
581 param->string = NULL;
582 return 0;
583 }
584
585 if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) 573 if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS)
586 return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); 574 return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options");
587 575
@@ -738,8 +726,6 @@ void vfs_clean_context(struct fs_context *fc)
738 fc->s_fs_info = NULL; 726 fc->s_fs_info = NULL;
739 fc->sb_flags = 0; 727 fc->sb_flags = 0;
740 security_free_mnt_opts(&fc->security); 728 security_free_mnt_opts(&fc->security);
741 kfree(fc->subtype);
742 fc->subtype = NULL;
743 kfree(fc->source); 729 kfree(fc->source);
744 fc->source = NULL; 730 fc->source = NULL;
745 731
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index bab7a0db81dd..00015d851382 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -142,11 +142,10 @@ static int cuse_open(struct inode *inode, struct file *file)
142 142
143static int cuse_release(struct inode *inode, struct file *file) 143static int cuse_release(struct inode *inode, struct file *file)
144{ 144{
145 struct fuse_inode *fi = get_fuse_inode(inode);
146 struct fuse_file *ff = file->private_data; 145 struct fuse_file *ff = file->private_data;
147 struct fuse_conn *fc = ff->fc; 146 struct fuse_conn *fc = ff->fc;
148 147
149 fuse_sync_release(fi, ff, file->f_flags); 148 fuse_sync_release(NULL, ff, file->f_flags);
150 fuse_conn_put(fc); 149 fuse_conn_put(fc);
151 150
152 return 0; 151 return 0;
@@ -299,6 +298,14 @@ static void cuse_gendev_release(struct device *dev)
299 kfree(dev); 298 kfree(dev);
300} 299}
301 300
301struct cuse_init_args {
302 struct fuse_args_pages ap;
303 struct cuse_init_in in;
304 struct cuse_init_out out;
305 struct page *page;
306 struct fuse_page_desc desc;
307};
308
302/** 309/**
303 * cuse_process_init_reply - finish initializing CUSE channel 310 * cuse_process_init_reply - finish initializing CUSE channel
304 * 311 *
@@ -306,21 +313,22 @@ static void cuse_gendev_release(struct device *dev)
306 * required data structures for it. Please read the comment at the 313 * required data structures for it. Please read the comment at the
307 * top of this file for high level overview. 314 * top of this file for high level overview.
308 */ 315 */
309static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) 316static void cuse_process_init_reply(struct fuse_conn *fc,
317 struct fuse_args *args, int error)
310{ 318{
319 struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args);
320 struct fuse_args_pages *ap = &ia->ap;
311 struct cuse_conn *cc = fc_to_cc(fc), *pos; 321 struct cuse_conn *cc = fc_to_cc(fc), *pos;
312 struct cuse_init_out *arg = req->out.args[0].value; 322 struct cuse_init_out *arg = &ia->out;
313 struct page *page = req->pages[0]; 323 struct page *page = ap->pages[0];
314 struct cuse_devinfo devinfo = { }; 324 struct cuse_devinfo devinfo = { };
315 struct device *dev; 325 struct device *dev;
316 struct cdev *cdev; 326 struct cdev *cdev;
317 dev_t devt; 327 dev_t devt;
318 int rc, i; 328 int rc, i;
319 329
320 if (req->out.h.error || 330 if (error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11)
321 arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
322 goto err; 331 goto err;
323 }
324 332
325 fc->minor = arg->minor; 333 fc->minor = arg->minor;
326 fc->max_read = max_t(unsigned, arg->max_read, 4096); 334 fc->max_read = max_t(unsigned, arg->max_read, 4096);
@@ -329,7 +337,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
329 /* parse init reply */ 337 /* parse init reply */
330 cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; 338 cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL;
331 339
332 rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size, 340 rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size,
333 &devinfo); 341 &devinfo);
334 if (rc) 342 if (rc)
335 goto err; 343 goto err;
@@ -396,7 +404,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
396 dev_set_uevent_suppress(dev, 0); 404 dev_set_uevent_suppress(dev, 0);
397 kobject_uevent(&dev->kobj, KOBJ_ADD); 405 kobject_uevent(&dev->kobj, KOBJ_ADD);
398out: 406out:
399 kfree(arg); 407 kfree(ia);
400 __free_page(page); 408 __free_page(page);
401 return; 409 return;
402 410
@@ -415,55 +423,49 @@ err:
415static int cuse_send_init(struct cuse_conn *cc) 423static int cuse_send_init(struct cuse_conn *cc)
416{ 424{
417 int rc; 425 int rc;
418 struct fuse_req *req;
419 struct page *page; 426 struct page *page;
420 struct fuse_conn *fc = &cc->fc; 427 struct fuse_conn *fc = &cc->fc;
421 struct cuse_init_in *arg; 428 struct cuse_init_args *ia;
422 void *outarg; 429 struct fuse_args_pages *ap;
423 430
424 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); 431 BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
425 432
426 req = fuse_get_req_for_background(fc, 1);
427 if (IS_ERR(req)) {
428 rc = PTR_ERR(req);
429 goto err;
430 }
431
432 rc = -ENOMEM; 433 rc = -ENOMEM;
433 page = alloc_page(GFP_KERNEL | __GFP_ZERO); 434 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
434 if (!page) 435 if (!page)
435 goto err_put_req; 436 goto err;
436 437
437 outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL); 438 ia = kzalloc(sizeof(*ia), GFP_KERNEL);
438 if (!outarg) 439 if (!ia)
439 goto err_free_page; 440 goto err_free_page;
440 441
441 arg = &req->misc.cuse_init_in; 442 ap = &ia->ap;
442 arg->major = FUSE_KERNEL_VERSION; 443 ia->in.major = FUSE_KERNEL_VERSION;
443 arg->minor = FUSE_KERNEL_MINOR_VERSION; 444 ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
444 arg->flags |= CUSE_UNRESTRICTED_IOCTL; 445 ia->in.flags |= CUSE_UNRESTRICTED_IOCTL;
445 req->in.h.opcode = CUSE_INIT; 446 ap->args.opcode = CUSE_INIT;
446 req->in.numargs = 1; 447 ap->args.in_numargs = 1;
447 req->in.args[0].size = sizeof(struct cuse_init_in); 448 ap->args.in_args[0].size = sizeof(ia->in);
448 req->in.args[0].value = arg; 449 ap->args.in_args[0].value = &ia->in;
449 req->out.numargs = 2; 450 ap->args.out_numargs = 2;
450 req->out.args[0].size = sizeof(struct cuse_init_out); 451 ap->args.out_args[0].size = sizeof(ia->out);
451 req->out.args[0].value = outarg; 452 ap->args.out_args[0].value = &ia->out;
452 req->out.args[1].size = CUSE_INIT_INFO_MAX; 453 ap->args.out_args[1].size = CUSE_INIT_INFO_MAX;
453 req->out.argvar = 1; 454 ap->args.out_argvar = 1;
454 req->out.argpages = 1; 455 ap->args.out_pages = 1;
455 req->pages[0] = page; 456 ap->num_pages = 1;
456 req->page_descs[0].length = req->out.args[1].size; 457 ap->pages = &ia->page;
457 req->num_pages = 1; 458 ap->descs = &ia->desc;
458 req->end = cuse_process_init_reply; 459 ia->page = page;
459 fuse_request_send_background(fc, req); 460 ia->desc.length = ap->args.out_args[1].size;
460 461 ap->args.end = cuse_process_init_reply;
461 return 0; 462
462 463 rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
464 if (rc) {
465 kfree(ia);
463err_free_page: 466err_free_page:
464 __free_page(page); 467 __free_page(page);
465err_put_req: 468 }
466 fuse_put_request(fc, req);
467err: 469err:
468 return rc; 470 return rc;
469} 471}
@@ -504,9 +506,9 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
504 * Limit the cuse channel to requests that can 506 * Limit the cuse channel to requests that can
505 * be represented in file->f_cred->user_ns. 507 * be represented in file->f_cred->user_ns.
506 */ 508 */
507 fuse_conn_init(&cc->fc, file->f_cred->user_ns); 509 fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL);
508 510
509 fud = fuse_dev_alloc(&cc->fc); 511 fud = fuse_dev_alloc_install(&cc->fc);
510 if (!fud) { 512 if (!fud) {
511 kfree(cc); 513 kfree(cc);
512 return -ENOMEM; 514 return -ENOMEM;
@@ -519,6 +521,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file)
519 rc = cuse_send_init(cc); 521 rc = cuse_send_init(cc);
520 if (rc) { 522 if (rc) {
521 fuse_dev_free(fud); 523 fuse_dev_free(fud);
524 fuse_conn_put(&cc->fc);
522 return rc; 525 return rc;
523 } 526 }
524 file->private_data = fud; 527 file->private_data = fud;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ea8237513dfa..dadd617d826c 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -40,107 +40,30 @@ static struct fuse_dev *fuse_get_dev(struct file *file)
40 return READ_ONCE(file->private_data); 40 return READ_ONCE(file->private_data);
41} 41}
42 42
43static void fuse_request_init(struct fuse_req *req, struct page **pages, 43static void fuse_request_init(struct fuse_req *req)
44 struct fuse_page_desc *page_descs,
45 unsigned npages)
46{ 44{
47 INIT_LIST_HEAD(&req->list); 45 INIT_LIST_HEAD(&req->list);
48 INIT_LIST_HEAD(&req->intr_entry); 46 INIT_LIST_HEAD(&req->intr_entry);
49 init_waitqueue_head(&req->waitq); 47 init_waitqueue_head(&req->waitq);
50 refcount_set(&req->count, 1); 48 refcount_set(&req->count, 1);
51 req->pages = pages;
52 req->page_descs = page_descs;
53 req->max_pages = npages;
54 __set_bit(FR_PENDING, &req->flags); 49 __set_bit(FR_PENDING, &req->flags);
55} 50}
56 51
57static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags, 52static struct fuse_req *fuse_request_alloc(gfp_t flags)
58 struct fuse_page_desc **desc)
59{
60 struct page **pages;
61
62 pages = kzalloc(npages * (sizeof(struct page *) +
63 sizeof(struct fuse_page_desc)), flags);
64 *desc = (void *) pages + npages * sizeof(struct page *);
65
66 return pages;
67}
68
69static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
70{ 53{
71 struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); 54 struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags);
72 if (req) { 55 if (req)
73 struct page **pages = NULL; 56 fuse_request_init(req);
74 struct fuse_page_desc *page_descs = NULL;
75
76 WARN_ON(npages > FUSE_MAX_MAX_PAGES);
77 if (npages > FUSE_REQ_INLINE_PAGES) {
78 pages = fuse_req_pages_alloc(npages, flags,
79 &page_descs);
80 if (!pages) {
81 kmem_cache_free(fuse_req_cachep, req);
82 return NULL;
83 }
84 } else if (npages) {
85 pages = req->inline_pages;
86 page_descs = req->inline_page_descs;
87 }
88 57
89 fuse_request_init(req, pages, page_descs, npages);
90 }
91 return req; 58 return req;
92} 59}
93 60
94struct fuse_req *fuse_request_alloc(unsigned npages) 61static void fuse_request_free(struct fuse_req *req)
95{
96 return __fuse_request_alloc(npages, GFP_KERNEL);
97}
98EXPORT_SYMBOL_GPL(fuse_request_alloc);
99
100struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
101{
102 return __fuse_request_alloc(npages, GFP_NOFS);
103}
104
105static void fuse_req_pages_free(struct fuse_req *req)
106{
107 if (req->pages != req->inline_pages)
108 kfree(req->pages);
109}
110
111bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req,
112 gfp_t flags)
113{
114 struct page **pages;
115 struct fuse_page_desc *page_descs;
116 unsigned int npages = min_t(unsigned int,
117 max_t(unsigned int, req->max_pages * 2,
118 FUSE_DEFAULT_MAX_PAGES_PER_REQ),
119 fc->max_pages);
120 WARN_ON(npages <= req->max_pages);
121
122 pages = fuse_req_pages_alloc(npages, flags, &page_descs);
123 if (!pages)
124 return false;
125
126 memcpy(pages, req->pages, sizeof(struct page *) * req->max_pages);
127 memcpy(page_descs, req->page_descs,
128 sizeof(struct fuse_page_desc) * req->max_pages);
129 fuse_req_pages_free(req);
130 req->pages = pages;
131 req->page_descs = page_descs;
132 req->max_pages = npages;
133
134 return true;
135}
136
137void fuse_request_free(struct fuse_req *req)
138{ 62{
139 fuse_req_pages_free(req);
140 kmem_cache_free(fuse_req_cachep, req); 63 kmem_cache_free(fuse_req_cachep, req);
141} 64}
142 65
143void __fuse_get_request(struct fuse_req *req) 66static void __fuse_get_request(struct fuse_req *req)
144{ 67{
145 refcount_inc(&req->count); 68 refcount_inc(&req->count);
146} 69}
@@ -177,8 +100,9 @@ static void fuse_drop_waiting(struct fuse_conn *fc)
177 } 100 }
178} 101}
179 102
180static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, 103static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
181 bool for_background) 104
105static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background)
182{ 106{
183 struct fuse_req *req; 107 struct fuse_req *req;
184 int err; 108 int err;
@@ -201,7 +125,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
201 if (fc->conn_error) 125 if (fc->conn_error)
202 goto out; 126 goto out;
203 127
204 req = fuse_request_alloc(npages); 128 req = fuse_request_alloc(GFP_KERNEL);
205 err = -ENOMEM; 129 err = -ENOMEM;
206 if (!req) { 130 if (!req) {
207 if (for_background) 131 if (for_background)
@@ -229,101 +153,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
229 return ERR_PTR(err); 153 return ERR_PTR(err);
230} 154}
231 155
232struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) 156static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
233{
234 return __fuse_get_req(fc, npages, false);
235}
236EXPORT_SYMBOL_GPL(fuse_get_req);
237
238struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
239 unsigned npages)
240{
241 return __fuse_get_req(fc, npages, true);
242}
243EXPORT_SYMBOL_GPL(fuse_get_req_for_background);
244
245/*
246 * Return request in fuse_file->reserved_req. However that may
247 * currently be in use. If that is the case, wait for it to become
248 * available.
249 */
250static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
251 struct file *file)
252{
253 struct fuse_req *req = NULL;
254 struct fuse_inode *fi = get_fuse_inode(file_inode(file));
255 struct fuse_file *ff = file->private_data;
256
257 do {
258 wait_event(fc->reserved_req_waitq, ff->reserved_req);
259 spin_lock(&fi->lock);
260 if (ff->reserved_req) {
261 req = ff->reserved_req;
262 ff->reserved_req = NULL;
263 req->stolen_file = get_file(file);
264 }
265 spin_unlock(&fi->lock);
266 } while (!req);
267
268 return req;
269}
270
271/*
272 * Put stolen request back into fuse_file->reserved_req
273 */
274static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
275{
276 struct file *file = req->stolen_file;
277 struct fuse_inode *fi = get_fuse_inode(file_inode(file));
278 struct fuse_file *ff = file->private_data;
279
280 WARN_ON(req->max_pages);
281 spin_lock(&fi->lock);
282 memset(req, 0, sizeof(*req));
283 fuse_request_init(req, NULL, NULL, 0);
284 BUG_ON(ff->reserved_req);
285 ff->reserved_req = req;
286 wake_up_all(&fc->reserved_req_waitq);
287 spin_unlock(&fi->lock);
288 fput(file);
289}
290
291/*
292 * Gets a requests for a file operation, always succeeds
293 *
294 * This is used for sending the FLUSH request, which must get to
295 * userspace, due to POSIX locks which may need to be unlocked.
296 *
297 * If allocation fails due to OOM, use the reserved request in
298 * fuse_file.
299 *
300 * This is very unlikely to deadlock accidentally, since the
301 * filesystem should not have it's own file open. If deadlock is
302 * intentional, it can still be broken by "aborting" the filesystem.
303 */
304struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
305 struct file *file)
306{
307 struct fuse_req *req;
308
309 atomic_inc(&fc->num_waiting);
310 wait_event(fc->blocked_waitq, fc->initialized);
311 /* Matches smp_wmb() in fuse_set_initialized() */
312 smp_rmb();
313 req = fuse_request_alloc(0);
314 if (!req)
315 req = get_reserved_req(fc, file);
316
317 req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
318 req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
319 req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
320
321 __set_bit(FR_WAITING, &req->flags);
322 __clear_bit(FR_BACKGROUND, &req->flags);
323 return req;
324}
325
326void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
327{ 157{
328 if (refcount_dec_and_test(&req->count)) { 158 if (refcount_dec_and_test(&req->count)) {
329 if (test_bit(FR_BACKGROUND, &req->flags)) { 159 if (test_bit(FR_BACKGROUND, &req->flags)) {
@@ -342,15 +172,11 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
342 fuse_drop_waiting(fc); 172 fuse_drop_waiting(fc);
343 } 173 }
344 174
345 if (req->stolen_file) 175 fuse_request_free(req);
346 put_reserved_req(fc, req);
347 else
348 fuse_request_free(req);
349 } 176 }
350} 177}
351EXPORT_SYMBOL_GPL(fuse_put_request);
352 178
353static unsigned len_args(unsigned numargs, struct fuse_arg *args) 179unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args)
354{ 180{
355 unsigned nbytes = 0; 181 unsigned nbytes = 0;
356 unsigned i; 182 unsigned i;
@@ -360,25 +186,47 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args)
360 186
361 return nbytes; 187 return nbytes;
362} 188}
189EXPORT_SYMBOL_GPL(fuse_len_args);
363 190
364static u64 fuse_get_unique(struct fuse_iqueue *fiq) 191u64 fuse_get_unique(struct fuse_iqueue *fiq)
365{ 192{
366 fiq->reqctr += FUSE_REQ_ID_STEP; 193 fiq->reqctr += FUSE_REQ_ID_STEP;
367 return fiq->reqctr; 194 return fiq->reqctr;
368} 195}
196EXPORT_SYMBOL_GPL(fuse_get_unique);
369 197
370static unsigned int fuse_req_hash(u64 unique) 198static unsigned int fuse_req_hash(u64 unique)
371{ 199{
372 return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); 200 return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS);
373} 201}
374 202
375static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) 203/**
204 * A new request is available, wake fiq->waitq
205 */
206static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq)
207__releases(fiq->lock)
208{
209 wake_up(&fiq->waitq);
210 kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
211 spin_unlock(&fiq->lock);
212}
213
214const struct fuse_iqueue_ops fuse_dev_fiq_ops = {
215 .wake_forget_and_unlock = fuse_dev_wake_and_unlock,
216 .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock,
217 .wake_pending_and_unlock = fuse_dev_wake_and_unlock,
218};
219EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops);
220
221static void queue_request_and_unlock(struct fuse_iqueue *fiq,
222 struct fuse_req *req)
223__releases(fiq->lock)
376{ 224{
377 req->in.h.len = sizeof(struct fuse_in_header) + 225 req->in.h.len = sizeof(struct fuse_in_header) +
378 len_args(req->in.numargs, (struct fuse_arg *) req->in.args); 226 fuse_len_args(req->args->in_numargs,
227 (struct fuse_arg *) req->args->in_args);
379 list_add_tail(&req->list, &fiq->pending); 228 list_add_tail(&req->list, &fiq->pending);
380 wake_up_locked(&fiq->waitq); 229 fiq->ops->wake_pending_and_unlock(fiq);
381 kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
382} 230}
383 231
384void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, 232void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
@@ -389,16 +237,15 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
389 forget->forget_one.nodeid = nodeid; 237 forget->forget_one.nodeid = nodeid;
390 forget->forget_one.nlookup = nlookup; 238 forget->forget_one.nlookup = nlookup;
391 239
392 spin_lock(&fiq->waitq.lock); 240 spin_lock(&fiq->lock);
393 if (fiq->connected) { 241 if (fiq->connected) {
394 fiq->forget_list_tail->next = forget; 242 fiq->forget_list_tail->next = forget;
395 fiq->forget_list_tail = forget; 243 fiq->forget_list_tail = forget;
396 wake_up_locked(&fiq->waitq); 244 fiq->ops->wake_forget_and_unlock(fiq);
397 kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
398 } else { 245 } else {
399 kfree(forget); 246 kfree(forget);
247 spin_unlock(&fiq->lock);
400 } 248 }
401 spin_unlock(&fiq->waitq.lock);
402} 249}
403 250
404static void flush_bg_queue(struct fuse_conn *fc) 251static void flush_bg_queue(struct fuse_conn *fc)
@@ -412,10 +259,9 @@ static void flush_bg_queue(struct fuse_conn *fc)
412 req = list_first_entry(&fc->bg_queue, struct fuse_req, list); 259 req = list_first_entry(&fc->bg_queue, struct fuse_req, list);
413 list_del(&req->list); 260 list_del(&req->list);
414 fc->active_background++; 261 fc->active_background++;
415 spin_lock(&fiq->waitq.lock); 262 spin_lock(&fiq->lock);
416 req->in.h.unique = fuse_get_unique(fiq); 263 req->in.h.unique = fuse_get_unique(fiq);
417 queue_request(fiq, req); 264 queue_request_and_unlock(fiq, req);
418 spin_unlock(&fiq->waitq.lock);
419 } 265 }
420} 266}
421 267
@@ -427,9 +273,10 @@ static void flush_bg_queue(struct fuse_conn *fc)
427 * the 'end' callback is called if given, else the reference to the 273 * the 'end' callback is called if given, else the reference to the
428 * request is released 274 * request is released
429 */ 275 */
430static void request_end(struct fuse_conn *fc, struct fuse_req *req) 276void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req)
431{ 277{
432 struct fuse_iqueue *fiq = &fc->iq; 278 struct fuse_iqueue *fiq = &fc->iq;
279 bool async = req->args->end;
433 280
434 if (test_and_set_bit(FR_FINISHED, &req->flags)) 281 if (test_and_set_bit(FR_FINISHED, &req->flags))
435 goto put_request; 282 goto put_request;
@@ -439,9 +286,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
439 * smp_mb() from queue_interrupt(). 286 * smp_mb() from queue_interrupt().
440 */ 287 */
441 if (!list_empty(&req->intr_entry)) { 288 if (!list_empty(&req->intr_entry)) {
442 spin_lock(&fiq->waitq.lock); 289 spin_lock(&fiq->lock);
443 list_del_init(&req->intr_entry); 290 list_del_init(&req->intr_entry);
444 spin_unlock(&fiq->waitq.lock); 291 spin_unlock(&fiq->lock);
445 } 292 }
446 WARN_ON(test_bit(FR_PENDING, &req->flags)); 293 WARN_ON(test_bit(FR_PENDING, &req->flags));
447 WARN_ON(test_bit(FR_SENT, &req->flags)); 294 WARN_ON(test_bit(FR_SENT, &req->flags));
@@ -475,18 +322,19 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
475 wake_up(&req->waitq); 322 wake_up(&req->waitq);
476 } 323 }
477 324
478 if (req->end) 325 if (async)
479 req->end(fc, req); 326 req->args->end(fc, req->args, req->out.h.error);
480put_request: 327put_request:
481 fuse_put_request(fc, req); 328 fuse_put_request(fc, req);
482} 329}
330EXPORT_SYMBOL_GPL(fuse_request_end);
483 331
484static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) 332static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
485{ 333{
486 spin_lock(&fiq->waitq.lock); 334 spin_lock(&fiq->lock);
487 /* Check for we've sent request to interrupt this req */ 335 /* Check for we've sent request to interrupt this req */
488 if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { 336 if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) {
489 spin_unlock(&fiq->waitq.lock); 337 spin_unlock(&fiq->lock);
490 return -EINVAL; 338 return -EINVAL;
491 } 339 }
492 340
@@ -499,13 +347,13 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req)
499 smp_mb(); 347 smp_mb();
500 if (test_bit(FR_FINISHED, &req->flags)) { 348 if (test_bit(FR_FINISHED, &req->flags)) {
501 list_del_init(&req->intr_entry); 349 list_del_init(&req->intr_entry);
502 spin_unlock(&fiq->waitq.lock); 350 spin_unlock(&fiq->lock);
503 return 0; 351 return 0;
504 } 352 }
505 wake_up_locked(&fiq->waitq); 353 fiq->ops->wake_interrupt_and_unlock(fiq);
506 kill_fasync(&fiq->fasync, SIGIO, POLL_IN); 354 } else {
355 spin_unlock(&fiq->lock);
507 } 356 }
508 spin_unlock(&fiq->waitq.lock);
509 return 0; 357 return 0;
510} 358}
511 359
@@ -535,16 +383,16 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
535 if (!err) 383 if (!err)
536 return; 384 return;
537 385
538 spin_lock(&fiq->waitq.lock); 386 spin_lock(&fiq->lock);
539 /* Request is not yet in userspace, bail out */ 387 /* Request is not yet in userspace, bail out */
540 if (test_bit(FR_PENDING, &req->flags)) { 388 if (test_bit(FR_PENDING, &req->flags)) {
541 list_del(&req->list); 389 list_del(&req->list);
542 spin_unlock(&fiq->waitq.lock); 390 spin_unlock(&fiq->lock);
543 __fuse_put_request(req); 391 __fuse_put_request(req);
544 req->out.h.error = -EINTR; 392 req->out.h.error = -EINTR;
545 return; 393 return;
546 } 394 }
547 spin_unlock(&fiq->waitq.lock); 395 spin_unlock(&fiq->lock);
548 } 396 }
549 397
550 /* 398 /*
@@ -559,101 +407,110 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
559 struct fuse_iqueue *fiq = &fc->iq; 407 struct fuse_iqueue *fiq = &fc->iq;
560 408
561 BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); 409 BUG_ON(test_bit(FR_BACKGROUND, &req->flags));
562 spin_lock(&fiq->waitq.lock); 410 spin_lock(&fiq->lock);
563 if (!fiq->connected) { 411 if (!fiq->connected) {
564 spin_unlock(&fiq->waitq.lock); 412 spin_unlock(&fiq->lock);
565 req->out.h.error = -ENOTCONN; 413 req->out.h.error = -ENOTCONN;
566 } else { 414 } else {
567 req->in.h.unique = fuse_get_unique(fiq); 415 req->in.h.unique = fuse_get_unique(fiq);
568 queue_request(fiq, req);
569 /* acquire extra reference, since request is still needed 416 /* acquire extra reference, since request is still needed
570 after request_end() */ 417 after fuse_request_end() */
571 __fuse_get_request(req); 418 __fuse_get_request(req);
572 spin_unlock(&fiq->waitq.lock); 419 queue_request_and_unlock(fiq, req);
573 420
574 request_wait_answer(fc, req); 421 request_wait_answer(fc, req);
575 /* Pairs with smp_wmb() in request_end() */ 422 /* Pairs with smp_wmb() in fuse_request_end() */
576 smp_rmb(); 423 smp_rmb();
577 } 424 }
578} 425}
579 426
580void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
581{
582 __set_bit(FR_ISREPLY, &req->flags);
583 if (!test_bit(FR_WAITING, &req->flags)) {
584 __set_bit(FR_WAITING, &req->flags);
585 atomic_inc(&fc->num_waiting);
586 }
587 __fuse_request_send(fc, req);
588}
589EXPORT_SYMBOL_GPL(fuse_request_send);
590
591static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) 427static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
592{ 428{
593 if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS) 429 if (fc->minor < 4 && args->opcode == FUSE_STATFS)
594 args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE; 430 args->out_args[0].size = FUSE_COMPAT_STATFS_SIZE;
595 431
596 if (fc->minor < 9) { 432 if (fc->minor < 9) {
597 switch (args->in.h.opcode) { 433 switch (args->opcode) {
598 case FUSE_LOOKUP: 434 case FUSE_LOOKUP:
599 case FUSE_CREATE: 435 case FUSE_CREATE:
600 case FUSE_MKNOD: 436 case FUSE_MKNOD:
601 case FUSE_MKDIR: 437 case FUSE_MKDIR:
602 case FUSE_SYMLINK: 438 case FUSE_SYMLINK:
603 case FUSE_LINK: 439 case FUSE_LINK:
604 args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; 440 args->out_args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
605 break; 441 break;
606 case FUSE_GETATTR: 442 case FUSE_GETATTR:
607 case FUSE_SETATTR: 443 case FUSE_SETATTR:
608 args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; 444 args->out_args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
609 break; 445 break;
610 } 446 }
611 } 447 }
612 if (fc->minor < 12) { 448 if (fc->minor < 12) {
613 switch (args->in.h.opcode) { 449 switch (args->opcode) {
614 case FUSE_CREATE: 450 case FUSE_CREATE:
615 args->in.args[0].size = sizeof(struct fuse_open_in); 451 args->in_args[0].size = sizeof(struct fuse_open_in);
616 break; 452 break;
617 case FUSE_MKNOD: 453 case FUSE_MKNOD:
618 args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; 454 args->in_args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE;
619 break; 455 break;
620 } 456 }
621 } 457 }
622} 458}
623 459
460static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req)
461{
462 req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid());
463 req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid());
464 req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns);
465}
466
467static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
468{
469 req->in.h.opcode = args->opcode;
470 req->in.h.nodeid = args->nodeid;
471 req->args = args;
472}
473
624ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) 474ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
625{ 475{
626 struct fuse_req *req; 476 struct fuse_req *req;
627 ssize_t ret; 477 ssize_t ret;
628 478
629 req = fuse_get_req(fc, 0); 479 if (args->force) {
630 if (IS_ERR(req)) 480 atomic_inc(&fc->num_waiting);
631 return PTR_ERR(req); 481 req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL);
482
483 if (!args->nocreds)
484 fuse_force_creds(fc, req);
485
486 __set_bit(FR_WAITING, &req->flags);
487 __set_bit(FR_FORCE, &req->flags);
488 } else {
489 WARN_ON(args->nocreds);
490 req = fuse_get_req(fc, false);
491 if (IS_ERR(req))
492 return PTR_ERR(req);
493 }
632 494
633 /* Needs to be done after fuse_get_req() so that fc->minor is valid */ 495 /* Needs to be done after fuse_get_req() so that fc->minor is valid */
634 fuse_adjust_compat(fc, args); 496 fuse_adjust_compat(fc, args);
497 fuse_args_to_req(req, args);
635 498
636 req->in.h.opcode = args->in.h.opcode; 499 if (!args->noreply)
637 req->in.h.nodeid = args->in.h.nodeid; 500 __set_bit(FR_ISREPLY, &req->flags);
638 req->in.numargs = args->in.numargs; 501 __fuse_request_send(fc, req);
639 memcpy(req->in.args, args->in.args,
640 args->in.numargs * sizeof(struct fuse_in_arg));
641 req->out.argvar = args->out.argvar;
642 req->out.numargs = args->out.numargs;
643 memcpy(req->out.args, args->out.args,
644 args->out.numargs * sizeof(struct fuse_arg));
645 fuse_request_send(fc, req);
646 ret = req->out.h.error; 502 ret = req->out.h.error;
647 if (!ret && args->out.argvar) { 503 if (!ret && args->out_argvar) {
648 BUG_ON(args->out.numargs != 1); 504 BUG_ON(args->out_numargs == 0);
649 ret = req->out.args[0].size; 505 ret = args->out_args[args->out_numargs - 1].size;
650 } 506 }
651 fuse_put_request(fc, req); 507 fuse_put_request(fc, req);
652 508
653 return ret; 509 return ret;
654} 510}
655 511
656bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) 512static bool fuse_request_queue_background(struct fuse_conn *fc,
513 struct fuse_req *req)
657{ 514{
658 bool queued = false; 515 bool queued = false;
659 516
@@ -681,56 +538,63 @@ bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req)
681 return queued; 538 return queued;
682} 539}
683 540
684void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) 541int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
542 gfp_t gfp_flags)
685{ 543{
686 WARN_ON(!req->end); 544 struct fuse_req *req;
545
546 if (args->force) {
547 WARN_ON(!args->nocreds);
548 req = fuse_request_alloc(gfp_flags);
549 if (!req)
550 return -ENOMEM;
551 __set_bit(FR_BACKGROUND, &req->flags);
552 } else {
553 WARN_ON(args->nocreds);
554 req = fuse_get_req(fc, true);
555 if (IS_ERR(req))
556 return PTR_ERR(req);
557 }
558
559 fuse_args_to_req(req, args);
560
687 if (!fuse_request_queue_background(fc, req)) { 561 if (!fuse_request_queue_background(fc, req)) {
688 req->out.h.error = -ENOTCONN;
689 req->end(fc, req);
690 fuse_put_request(fc, req); 562 fuse_put_request(fc, req);
563 return -ENOTCONN;
691 } 564 }
565
566 return 0;
692} 567}
693EXPORT_SYMBOL_GPL(fuse_request_send_background); 568EXPORT_SYMBOL_GPL(fuse_simple_background);
694 569
695static int fuse_request_send_notify_reply(struct fuse_conn *fc, 570static int fuse_simple_notify_reply(struct fuse_conn *fc,
696 struct fuse_req *req, u64 unique) 571 struct fuse_args *args, u64 unique)
697{ 572{
698 int err = -ENODEV; 573 struct fuse_req *req;
699 struct fuse_iqueue *fiq = &fc->iq; 574 struct fuse_iqueue *fiq = &fc->iq;
575 int err = 0;
576
577 req = fuse_get_req(fc, false);
578 if (IS_ERR(req))
579 return PTR_ERR(req);
700 580
701 __clear_bit(FR_ISREPLY, &req->flags); 581 __clear_bit(FR_ISREPLY, &req->flags);
702 req->in.h.unique = unique; 582 req->in.h.unique = unique;
703 spin_lock(&fiq->waitq.lock); 583
584 fuse_args_to_req(req, args);
585
586 spin_lock(&fiq->lock);
704 if (fiq->connected) { 587 if (fiq->connected) {
705 queue_request(fiq, req); 588 queue_request_and_unlock(fiq, req);
706 err = 0; 589 } else {
590 err = -ENODEV;
591 spin_unlock(&fiq->lock);
592 fuse_put_request(fc, req);
707 } 593 }
708 spin_unlock(&fiq->waitq.lock);
709 594
710 return err; 595 return err;
711} 596}
712 597
713void fuse_force_forget(struct file *file, u64 nodeid)
714{
715 struct inode *inode = file_inode(file);
716 struct fuse_conn *fc = get_fuse_conn(inode);
717 struct fuse_req *req;
718 struct fuse_forget_in inarg;
719
720 memset(&inarg, 0, sizeof(inarg));
721 inarg.nlookup = 1;
722 req = fuse_get_req_nofail_nopages(fc, file);
723 req->in.h.opcode = FUSE_FORGET;
724 req->in.h.nodeid = nodeid;
725 req->in.numargs = 1;
726 req->in.args[0].size = sizeof(inarg);
727 req->in.args[0].value = &inarg;
728 __clear_bit(FR_ISREPLY, &req->flags);
729 __fuse_request_send(fc, req);
730 /* ignore errors */
731 fuse_put_request(fc, req);
732}
733
734/* 598/*
735 * Lock the request. Up to the next unlock_request() there mustn't be 599 * Lock the request. Up to the next unlock_request() there mustn't be
736 * anything that could cause a page-fault. If the request was already 600 * anything that could cause a page-fault. If the request was already
@@ -1084,14 +948,15 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
1084{ 948{
1085 unsigned i; 949 unsigned i;
1086 struct fuse_req *req = cs->req; 950 struct fuse_req *req = cs->req;
951 struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args);
952
1087 953
1088 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { 954 for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) {
1089 int err; 955 int err;
1090 unsigned offset = req->page_descs[i].offset; 956 unsigned int offset = ap->descs[i].offset;
1091 unsigned count = min(nbytes, req->page_descs[i].length); 957 unsigned int count = min(nbytes, ap->descs[i].length);
1092 958
1093 err = fuse_copy_page(cs, &req->pages[i], offset, count, 959 err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing);
1094 zeroing);
1095 if (err) 960 if (err)
1096 return err; 961 return err;
1097 962
@@ -1149,12 +1014,12 @@ static int request_pending(struct fuse_iqueue *fiq)
1149 * Unlike other requests this is assembled on demand, without a need 1014 * Unlike other requests this is assembled on demand, without a need
1150 * to allocate a separate fuse_req structure. 1015 * to allocate a separate fuse_req structure.
1151 * 1016 *
1152 * Called with fiq->waitq.lock held, releases it 1017 * Called with fiq->lock held, releases it
1153 */ 1018 */
1154static int fuse_read_interrupt(struct fuse_iqueue *fiq, 1019static int fuse_read_interrupt(struct fuse_iqueue *fiq,
1155 struct fuse_copy_state *cs, 1020 struct fuse_copy_state *cs,
1156 size_t nbytes, struct fuse_req *req) 1021 size_t nbytes, struct fuse_req *req)
1157__releases(fiq->waitq.lock) 1022__releases(fiq->lock)
1158{ 1023{
1159 struct fuse_in_header ih; 1024 struct fuse_in_header ih;
1160 struct fuse_interrupt_in arg; 1025 struct fuse_interrupt_in arg;
@@ -1169,7 +1034,7 @@ __releases(fiq->waitq.lock)
1169 ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT); 1034 ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT);
1170 arg.unique = req->in.h.unique; 1035 arg.unique = req->in.h.unique;
1171 1036
1172 spin_unlock(&fiq->waitq.lock); 1037 spin_unlock(&fiq->lock);
1173 if (nbytes < reqsize) 1038 if (nbytes < reqsize)
1174 return -EINVAL; 1039 return -EINVAL;
1175 1040
@@ -1181,9 +1046,9 @@ __releases(fiq->waitq.lock)
1181 return err ? err : reqsize; 1046 return err ? err : reqsize;
1182} 1047}
1183 1048
1184static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, 1049struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
1185 unsigned max, 1050 unsigned int max,
1186 unsigned *countp) 1051 unsigned int *countp)
1187{ 1052{
1188 struct fuse_forget_link *head = fiq->forget_list_head.next; 1053 struct fuse_forget_link *head = fiq->forget_list_head.next;
1189 struct fuse_forget_link **newhead = &head; 1054 struct fuse_forget_link **newhead = &head;
@@ -1202,14 +1067,15 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq,
1202 1067
1203 return head; 1068 return head;
1204} 1069}
1070EXPORT_SYMBOL(fuse_dequeue_forget);
1205 1071
1206static int fuse_read_single_forget(struct fuse_iqueue *fiq, 1072static int fuse_read_single_forget(struct fuse_iqueue *fiq,
1207 struct fuse_copy_state *cs, 1073 struct fuse_copy_state *cs,
1208 size_t nbytes) 1074 size_t nbytes)
1209__releases(fiq->waitq.lock) 1075__releases(fiq->lock)
1210{ 1076{
1211 int err; 1077 int err;
1212 struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL); 1078 struct fuse_forget_link *forget = fuse_dequeue_forget(fiq, 1, NULL);
1213 struct fuse_forget_in arg = { 1079 struct fuse_forget_in arg = {
1214 .nlookup = forget->forget_one.nlookup, 1080 .nlookup = forget->forget_one.nlookup,
1215 }; 1081 };
@@ -1220,7 +1086,7 @@ __releases(fiq->waitq.lock)
1220 .len = sizeof(ih) + sizeof(arg), 1086 .len = sizeof(ih) + sizeof(arg),
1221 }; 1087 };
1222 1088
1223 spin_unlock(&fiq->waitq.lock); 1089 spin_unlock(&fiq->lock);
1224 kfree(forget); 1090 kfree(forget);
1225 if (nbytes < ih.len) 1091 if (nbytes < ih.len)
1226 return -EINVAL; 1092 return -EINVAL;
@@ -1238,7 +1104,7 @@ __releases(fiq->waitq.lock)
1238 1104
1239static int fuse_read_batch_forget(struct fuse_iqueue *fiq, 1105static int fuse_read_batch_forget(struct fuse_iqueue *fiq,
1240 struct fuse_copy_state *cs, size_t nbytes) 1106 struct fuse_copy_state *cs, size_t nbytes)
1241__releases(fiq->waitq.lock) 1107__releases(fiq->lock)
1242{ 1108{
1243 int err; 1109 int err;
1244 unsigned max_forgets; 1110 unsigned max_forgets;
@@ -1252,13 +1118,13 @@ __releases(fiq->waitq.lock)
1252 }; 1118 };
1253 1119
1254 if (nbytes < ih.len) { 1120 if (nbytes < ih.len) {
1255 spin_unlock(&fiq->waitq.lock); 1121 spin_unlock(&fiq->lock);
1256 return -EINVAL; 1122 return -EINVAL;
1257 } 1123 }
1258 1124
1259 max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); 1125 max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
1260 head = dequeue_forget(fiq, max_forgets, &count); 1126 head = fuse_dequeue_forget(fiq, max_forgets, &count);
1261 spin_unlock(&fiq->waitq.lock); 1127 spin_unlock(&fiq->lock);
1262 1128
1263 arg.count = count; 1129 arg.count = count;
1264 ih.len += count * sizeof(struct fuse_forget_one); 1130 ih.len += count * sizeof(struct fuse_forget_one);
@@ -1288,7 +1154,7 @@ __releases(fiq->waitq.lock)
1288static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq, 1154static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq,
1289 struct fuse_copy_state *cs, 1155 struct fuse_copy_state *cs,
1290 size_t nbytes) 1156 size_t nbytes)
1291__releases(fiq->waitq.lock) 1157__releases(fiq->lock)
1292{ 1158{
1293 if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL) 1159 if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL)
1294 return fuse_read_single_forget(fiq, cs, nbytes); 1160 return fuse_read_single_forget(fiq, cs, nbytes);
@@ -1302,7 +1168,7 @@ __releases(fiq->waitq.lock)
1302 * the pending list and copies request data to userspace buffer. If 1168 * the pending list and copies request data to userspace buffer. If
1303 * no reply is needed (FORGET) or request has been aborted or there 1169 * no reply is needed (FORGET) or request has been aborted or there
1304 * was an error during the copying then it's finished by calling 1170 * was an error during the copying then it's finished by calling
1305 * request_end(). Otherwise add it to the processing list, and set 1171 * fuse_request_end(). Otherwise add it to the processing list, and set
1306 * the 'sent' flag. 1172 * the 'sent' flag.
1307 */ 1173 */
1308static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, 1174static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
@@ -1313,21 +1179,42 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
1313 struct fuse_iqueue *fiq = &fc->iq; 1179 struct fuse_iqueue *fiq = &fc->iq;
1314 struct fuse_pqueue *fpq = &fud->pq; 1180 struct fuse_pqueue *fpq = &fud->pq;
1315 struct fuse_req *req; 1181 struct fuse_req *req;
1316 struct fuse_in *in; 1182 struct fuse_args *args;
1317 unsigned reqsize; 1183 unsigned reqsize;
1318 unsigned int hash; 1184 unsigned int hash;
1319 1185
1186 /*
1187 * Require sane minimum read buffer - that has capacity for fixed part
1188 * of any request header + negotiated max_write room for data.
1189 *
1190 * Historically libfuse reserves 4K for fixed header room, but e.g.
1191 * GlusterFS reserves only 80 bytes
1192 *
1193 * = `sizeof(fuse_in_header) + sizeof(fuse_write_in)`
1194 *
1195 * which is the absolute minimum any sane filesystem should be using
1196 * for header room.
1197 */
1198 if (nbytes < max_t(size_t, FUSE_MIN_READ_BUFFER,
1199 sizeof(struct fuse_in_header) +
1200 sizeof(struct fuse_write_in) +
1201 fc->max_write))
1202 return -EINVAL;
1203
1320 restart: 1204 restart:
1321 spin_lock(&fiq->waitq.lock); 1205 for (;;) {
1322 err = -EAGAIN; 1206 spin_lock(&fiq->lock);
1323 if ((file->f_flags & O_NONBLOCK) && fiq->connected && 1207 if (!fiq->connected || request_pending(fiq))
1324 !request_pending(fiq)) 1208 break;
1325 goto err_unlock; 1209 spin_unlock(&fiq->lock);
1326 1210
1327 err = wait_event_interruptible_exclusive_locked(fiq->waitq, 1211 if (file->f_flags & O_NONBLOCK)
1212 return -EAGAIN;
1213 err = wait_event_interruptible_exclusive(fiq->waitq,
1328 !fiq->connected || request_pending(fiq)); 1214 !fiq->connected || request_pending(fiq));
1329 if (err) 1215 if (err)
1330 goto err_unlock; 1216 return err;
1217 }
1331 1218
1332 if (!fiq->connected) { 1219 if (!fiq->connected) {
1333 err = fc->aborted ? -ECONNABORTED : -ENODEV; 1220 err = fc->aborted ? -ECONNABORTED : -ENODEV;
@@ -1351,28 +1238,28 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
1351 req = list_entry(fiq->pending.next, struct fuse_req, list); 1238 req = list_entry(fiq->pending.next, struct fuse_req, list);
1352 clear_bit(FR_PENDING, &req->flags); 1239 clear_bit(FR_PENDING, &req->flags);
1353 list_del_init(&req->list); 1240 list_del_init(&req->list);
1354 spin_unlock(&fiq->waitq.lock); 1241 spin_unlock(&fiq->lock);
1355 1242
1356 in = &req->in; 1243 args = req->args;
1357 reqsize = in->h.len; 1244 reqsize = req->in.h.len;
1358 1245
1359 /* If request is too large, reply with an error and restart the read */ 1246 /* If request is too large, reply with an error and restart the read */
1360 if (nbytes < reqsize) { 1247 if (nbytes < reqsize) {
1361 req->out.h.error = -EIO; 1248 req->out.h.error = -EIO;
1362 /* SETXATTR is special, since it may contain too large data */ 1249 /* SETXATTR is special, since it may contain too large data */
1363 if (in->h.opcode == FUSE_SETXATTR) 1250 if (args->opcode == FUSE_SETXATTR)
1364 req->out.h.error = -E2BIG; 1251 req->out.h.error = -E2BIG;
1365 request_end(fc, req); 1252 fuse_request_end(fc, req);
1366 goto restart; 1253 goto restart;
1367 } 1254 }
1368 spin_lock(&fpq->lock); 1255 spin_lock(&fpq->lock);
1369 list_add(&req->list, &fpq->io); 1256 list_add(&req->list, &fpq->io);
1370 spin_unlock(&fpq->lock); 1257 spin_unlock(&fpq->lock);
1371 cs->req = req; 1258 cs->req = req;
1372 err = fuse_copy_one(cs, &in->h, sizeof(in->h)); 1259 err = fuse_copy_one(cs, &req->in.h, sizeof(req->in.h));
1373 if (!err) 1260 if (!err)
1374 err = fuse_copy_args(cs, in->numargs, in->argpages, 1261 err = fuse_copy_args(cs, args->in_numargs, args->in_pages,
1375 (struct fuse_arg *) in->args, 0); 1262 (struct fuse_arg *) args->in_args, 0);
1376 fuse_copy_finish(cs); 1263 fuse_copy_finish(cs);
1377 spin_lock(&fpq->lock); 1264 spin_lock(&fpq->lock);
1378 clear_bit(FR_LOCKED, &req->flags); 1265 clear_bit(FR_LOCKED, &req->flags);
@@ -1405,11 +1292,11 @@ out_end:
1405 if (!test_bit(FR_PRIVATE, &req->flags)) 1292 if (!test_bit(FR_PRIVATE, &req->flags))
1406 list_del_init(&req->list); 1293 list_del_init(&req->list);
1407 spin_unlock(&fpq->lock); 1294 spin_unlock(&fpq->lock);
1408 request_end(fc, req); 1295 fuse_request_end(fc, req);
1409 return err; 1296 return err;
1410 1297
1411 err_unlock: 1298 err_unlock:
1412 spin_unlock(&fiq->waitq.lock); 1299 spin_unlock(&fiq->lock);
1413 return err; 1300 return err;
1414} 1301}
1415 1302
@@ -1728,9 +1615,19 @@ out_finish:
1728 return err; 1615 return err;
1729} 1616}
1730 1617
1731static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) 1618struct fuse_retrieve_args {
1619 struct fuse_args_pages ap;
1620 struct fuse_notify_retrieve_in inarg;
1621};
1622
1623static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args,
1624 int error)
1732{ 1625{
1733 release_pages(req->pages, req->num_pages); 1626 struct fuse_retrieve_args *ra =
1627 container_of(args, typeof(*ra), ap.args);
1628
1629 release_pages(ra->ap.pages, ra->ap.num_pages);
1630 kfree(ra);
1734} 1631}
1735 1632
1736static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, 1633static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
@@ -1738,13 +1635,16 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1738{ 1635{
1739 int err; 1636 int err;
1740 struct address_space *mapping = inode->i_mapping; 1637 struct address_space *mapping = inode->i_mapping;
1741 struct fuse_req *req;
1742 pgoff_t index; 1638 pgoff_t index;
1743 loff_t file_size; 1639 loff_t file_size;
1744 unsigned int num; 1640 unsigned int num;
1745 unsigned int offset; 1641 unsigned int offset;
1746 size_t total_len = 0; 1642 size_t total_len = 0;
1747 unsigned int num_pages; 1643 unsigned int num_pages;
1644 struct fuse_retrieve_args *ra;
1645 size_t args_size = sizeof(*ra);
1646 struct fuse_args_pages *ap;
1647 struct fuse_args *args;
1748 1648
1749 offset = outarg->offset & ~PAGE_MASK; 1649 offset = outarg->offset & ~PAGE_MASK;
1750 file_size = i_size_read(inode); 1650 file_size = i_size_read(inode);
@@ -1758,19 +1658,26 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1758 num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 1658 num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
1759 num_pages = min(num_pages, fc->max_pages); 1659 num_pages = min(num_pages, fc->max_pages);
1760 1660
1761 req = fuse_get_req(fc, num_pages); 1661 args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0]));
1762 if (IS_ERR(req))
1763 return PTR_ERR(req);
1764 1662
1765 req->in.h.opcode = FUSE_NOTIFY_REPLY; 1663 ra = kzalloc(args_size, GFP_KERNEL);
1766 req->in.h.nodeid = outarg->nodeid; 1664 if (!ra)
1767 req->in.numargs = 2; 1665 return -ENOMEM;
1768 req->in.argpages = 1; 1666
1769 req->end = fuse_retrieve_end; 1667 ap = &ra->ap;
1668 ap->pages = (void *) (ra + 1);
1669 ap->descs = (void *) (ap->pages + num_pages);
1670
1671 args = &ap->args;
1672 args->nodeid = outarg->nodeid;
1673 args->opcode = FUSE_NOTIFY_REPLY;
1674 args->in_numargs = 2;
1675 args->in_pages = true;
1676 args->end = fuse_retrieve_end;
1770 1677
1771 index = outarg->offset >> PAGE_SHIFT; 1678 index = outarg->offset >> PAGE_SHIFT;
1772 1679
1773 while (num && req->num_pages < num_pages) { 1680 while (num && ap->num_pages < num_pages) {
1774 struct page *page; 1681 struct page *page;
1775 unsigned int this_num; 1682 unsigned int this_num;
1776 1683
@@ -1779,27 +1686,25 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
1779 break; 1686 break;
1780 1687
1781 this_num = min_t(unsigned, num, PAGE_SIZE - offset); 1688 this_num = min_t(unsigned, num, PAGE_SIZE - offset);
1782 req->pages[req->num_pages] = page; 1689 ap->pages[ap->num_pages] = page;
1783 req->page_descs[req->num_pages].offset = offset; 1690 ap->descs[ap->num_pages].offset = offset;
1784 req->page_descs[req->num_pages].length = this_num; 1691 ap->descs[ap->num_pages].length = this_num;
1785 req->num_pages++; 1692 ap->num_pages++;
1786 1693
1787 offset = 0; 1694 offset = 0;
1788 num -= this_num; 1695 num -= this_num;
1789 total_len += this_num; 1696 total_len += this_num;
1790 index++; 1697 index++;
1791 } 1698 }
1792 req->misc.retrieve_in.offset = outarg->offset; 1699 ra->inarg.offset = outarg->offset;
1793 req->misc.retrieve_in.size = total_len; 1700 ra->inarg.size = total_len;
1794 req->in.args[0].size = sizeof(req->misc.retrieve_in); 1701 args->in_args[0].size = sizeof(ra->inarg);
1795 req->in.args[0].value = &req->misc.retrieve_in; 1702 args->in_args[0].value = &ra->inarg;
1796 req->in.args[1].size = total_len; 1703 args->in_args[1].size = total_len;
1797 1704
1798 err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); 1705 err = fuse_simple_notify_reply(fc, args, outarg->notify_unique);
1799 if (err) { 1706 if (err)
1800 fuse_retrieve_end(fc, req); 1707 fuse_retrieve_end(fc, args, err);
1801 fuse_put_request(fc, req);
1802 }
1803 1708
1804 return err; 1709 return err;
1805} 1710}
@@ -1885,27 +1790,25 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique)
1885 return NULL; 1790 return NULL;
1886} 1791}
1887 1792
1888static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, 1793static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args,
1889 unsigned nbytes) 1794 unsigned nbytes)
1890{ 1795{
1891 unsigned reqsize = sizeof(struct fuse_out_header); 1796 unsigned reqsize = sizeof(struct fuse_out_header);
1892 1797
1893 if (out->h.error) 1798 reqsize += fuse_len_args(args->out_numargs, args->out_args);
1894 return nbytes != reqsize ? -EINVAL : 0;
1895
1896 reqsize += len_args(out->numargs, out->args);
1897 1799
1898 if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) 1800 if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar))
1899 return -EINVAL; 1801 return -EINVAL;
1900 else if (reqsize > nbytes) { 1802 else if (reqsize > nbytes) {
1901 struct fuse_arg *lastarg = &out->args[out->numargs-1]; 1803 struct fuse_arg *lastarg = &args->out_args[args->out_numargs-1];
1902 unsigned diffsize = reqsize - nbytes; 1804 unsigned diffsize = reqsize - nbytes;
1805
1903 if (diffsize > lastarg->size) 1806 if (diffsize > lastarg->size)
1904 return -EINVAL; 1807 return -EINVAL;
1905 lastarg->size -= diffsize; 1808 lastarg->size -= diffsize;
1906 } 1809 }
1907 return fuse_copy_args(cs, out->numargs, out->argpages, out->args, 1810 return fuse_copy_args(cs, args->out_numargs, args->out_pages,
1908 out->page_zeroing); 1811 args->out_args, args->page_zeroing);
1909} 1812}
1910 1813
1911/* 1814/*
@@ -1913,7 +1816,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
1913 * the write buffer. The request is then searched on the processing 1816 * the write buffer. The request is then searched on the processing
1914 * list by the unique ID found in the header. If found, then remove 1817 * list by the unique ID found in the header. If found, then remove
1915 * it from the list and copy the rest of the buffer to the request. 1818 * it from the list and copy the rest of the buffer to the request.
1916 * The request is finished by calling request_end() 1819 * The request is finished by calling fuse_request_end().
1917 */ 1820 */
1918static ssize_t fuse_dev_do_write(struct fuse_dev *fud, 1821static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
1919 struct fuse_copy_state *cs, size_t nbytes) 1822 struct fuse_copy_state *cs, size_t nbytes)
@@ -1984,10 +1887,13 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
1984 set_bit(FR_LOCKED, &req->flags); 1887 set_bit(FR_LOCKED, &req->flags);
1985 spin_unlock(&fpq->lock); 1888 spin_unlock(&fpq->lock);
1986 cs->req = req; 1889 cs->req = req;
1987 if (!req->out.page_replace) 1890 if (!req->args->page_replace)
1988 cs->move_pages = 0; 1891 cs->move_pages = 0;
1989 1892
1990 err = copy_out_args(cs, &req->out, nbytes); 1893 if (oh.error)
1894 err = nbytes != sizeof(oh) ? -EINVAL : 0;
1895 else
1896 err = copy_out_args(cs, req->args, nbytes);
1991 fuse_copy_finish(cs); 1897 fuse_copy_finish(cs);
1992 1898
1993 spin_lock(&fpq->lock); 1899 spin_lock(&fpq->lock);
@@ -2000,7 +1906,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
2000 list_del_init(&req->list); 1906 list_del_init(&req->list);
2001 spin_unlock(&fpq->lock); 1907 spin_unlock(&fpq->lock);
2002 1908
2003 request_end(fc, req); 1909 fuse_request_end(fc, req);
2004out: 1910out:
2005 return err ? err : nbytes; 1911 return err ? err : nbytes;
2006 1912
@@ -2121,12 +2027,12 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait)
2121 fiq = &fud->fc->iq; 2027 fiq = &fud->fc->iq;
2122 poll_wait(file, &fiq->waitq, wait); 2028 poll_wait(file, &fiq->waitq, wait);
2123 2029
2124 spin_lock(&fiq->waitq.lock); 2030 spin_lock(&fiq->lock);
2125 if (!fiq->connected) 2031 if (!fiq->connected)
2126 mask = EPOLLERR; 2032 mask = EPOLLERR;
2127 else if (request_pending(fiq)) 2033 else if (request_pending(fiq))
2128 mask |= EPOLLIN | EPOLLRDNORM; 2034 mask |= EPOLLIN | EPOLLRDNORM;
2129 spin_unlock(&fiq->waitq.lock); 2035 spin_unlock(&fiq->lock);
2130 2036
2131 return mask; 2037 return mask;
2132} 2038}
@@ -2140,7 +2046,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
2140 req->out.h.error = -ECONNABORTED; 2046 req->out.h.error = -ECONNABORTED;
2141 clear_bit(FR_SENT, &req->flags); 2047 clear_bit(FR_SENT, &req->flags);
2142 list_del_init(&req->list); 2048 list_del_init(&req->list);
2143 request_end(fc, req); 2049 fuse_request_end(fc, req);
2144 } 2050 }
2145} 2051}
2146 2052
@@ -2221,15 +2127,15 @@ void fuse_abort_conn(struct fuse_conn *fc)
2221 flush_bg_queue(fc); 2127 flush_bg_queue(fc);
2222 spin_unlock(&fc->bg_lock); 2128 spin_unlock(&fc->bg_lock);
2223 2129
2224 spin_lock(&fiq->waitq.lock); 2130 spin_lock(&fiq->lock);
2225 fiq->connected = 0; 2131 fiq->connected = 0;
2226 list_for_each_entry(req, &fiq->pending, list) 2132 list_for_each_entry(req, &fiq->pending, list)
2227 clear_bit(FR_PENDING, &req->flags); 2133 clear_bit(FR_PENDING, &req->flags);
2228 list_splice_tail_init(&fiq->pending, &to_end); 2134 list_splice_tail_init(&fiq->pending, &to_end);
2229 while (forget_pending(fiq)) 2135 while (forget_pending(fiq))
2230 kfree(dequeue_forget(fiq, 1, NULL)); 2136 kfree(fuse_dequeue_forget(fiq, 1, NULL));
2231 wake_up_all_locked(&fiq->waitq); 2137 wake_up_all(&fiq->waitq);
2232 spin_unlock(&fiq->waitq.lock); 2138 spin_unlock(&fiq->lock);
2233 kill_fasync(&fiq->fasync, SIGIO, POLL_IN); 2139 kill_fasync(&fiq->fasync, SIGIO, POLL_IN);
2234 end_polls(fc); 2140 end_polls(fc);
2235 wake_up_all(&fc->blocked_waitq); 2141 wake_up_all(&fc->blocked_waitq);
@@ -2296,7 +2202,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
2296 if (new->private_data) 2202 if (new->private_data)
2297 return -EINVAL; 2203 return -EINVAL;
2298 2204
2299 fud = fuse_dev_alloc(fc); 2205 fud = fuse_dev_alloc_install(fc);
2300 if (!fud) 2206 if (!fud)
2301 return -ENOMEM; 2207 return -ENOMEM;
2302 2208
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index dd0f64f7bc06..d572c900bb0f 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -24,20 +24,54 @@ static void fuse_advise_use_readdirplus(struct inode *dir)
24 set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); 24 set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
25} 25}
26 26
27#if BITS_PER_LONG >= 64
28static inline void __fuse_dentry_settime(struct dentry *entry, u64 time)
29{
30 entry->d_fsdata = (void *) time;
31}
32
33static inline u64 fuse_dentry_time(const struct dentry *entry)
34{
35 return (u64)entry->d_fsdata;
36}
37
38#else
27union fuse_dentry { 39union fuse_dentry {
28 u64 time; 40 u64 time;
29 struct rcu_head rcu; 41 struct rcu_head rcu;
30}; 42};
31 43
32static inline void fuse_dentry_settime(struct dentry *entry, u64 time) 44static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time)
33{ 45{
34 ((union fuse_dentry *) entry->d_fsdata)->time = time; 46 ((union fuse_dentry *) dentry->d_fsdata)->time = time;
35} 47}
36 48
37static inline u64 fuse_dentry_time(struct dentry *entry) 49static inline u64 fuse_dentry_time(const struct dentry *entry)
38{ 50{
39 return ((union fuse_dentry *) entry->d_fsdata)->time; 51 return ((union fuse_dentry *) entry->d_fsdata)->time;
40} 52}
53#endif
54
55static void fuse_dentry_settime(struct dentry *dentry, u64 time)
56{
57 struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb);
58 bool delete = !time && fc->delete_stale;
59 /*
60 * Mess with DCACHE_OP_DELETE because dput() will be faster without it.
61 * Don't care about races, either way it's just an optimization
62 */
63 if ((!delete && (dentry->d_flags & DCACHE_OP_DELETE)) ||
64 (delete && !(dentry->d_flags & DCACHE_OP_DELETE))) {
65 spin_lock(&dentry->d_lock);
66 if (!delete)
67 dentry->d_flags &= ~DCACHE_OP_DELETE;
68 else
69 dentry->d_flags |= DCACHE_OP_DELETE;
70 spin_unlock(&dentry->d_lock);
71 }
72
73 __fuse_dentry_settime(dentry, time);
74}
41 75
42/* 76/*
43 * FUSE caches dentries and attributes with separate timeout. The 77 * FUSE caches dentries and attributes with separate timeout. The
@@ -139,14 +173,14 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
139 struct fuse_entry_out *outarg) 173 struct fuse_entry_out *outarg)
140{ 174{
141 memset(outarg, 0, sizeof(struct fuse_entry_out)); 175 memset(outarg, 0, sizeof(struct fuse_entry_out));
142 args->in.h.opcode = FUSE_LOOKUP; 176 args->opcode = FUSE_LOOKUP;
143 args->in.h.nodeid = nodeid; 177 args->nodeid = nodeid;
144 args->in.numargs = 1; 178 args->in_numargs = 1;
145 args->in.args[0].size = name->len + 1; 179 args->in_args[0].size = name->len + 1;
146 args->in.args[0].value = name->name; 180 args->in_args[0].value = name->name;
147 args->out.numargs = 1; 181 args->out_numargs = 1;
148 args->out.args[0].size = sizeof(struct fuse_entry_out); 182 args->out_args[0].size = sizeof(struct fuse_entry_out);
149 args->out.args[0].value = outarg; 183 args->out_args[0].value = outarg;
150} 184}
151 185
152/* 186/*
@@ -242,9 +276,11 @@ invalid:
242 goto out; 276 goto out;
243} 277}
244 278
279#if BITS_PER_LONG < 64
245static int fuse_dentry_init(struct dentry *dentry) 280static int fuse_dentry_init(struct dentry *dentry)
246{ 281{
247 dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); 282 dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry),
283 GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE);
248 284
249 return dentry->d_fsdata ? 0 : -ENOMEM; 285 return dentry->d_fsdata ? 0 : -ENOMEM;
250} 286}
@@ -254,16 +290,27 @@ static void fuse_dentry_release(struct dentry *dentry)
254 290
255 kfree_rcu(fd, rcu); 291 kfree_rcu(fd, rcu);
256} 292}
293#endif
294
295static int fuse_dentry_delete(const struct dentry *dentry)
296{
297 return time_before64(fuse_dentry_time(dentry), get_jiffies_64());
298}
257 299
258const struct dentry_operations fuse_dentry_operations = { 300const struct dentry_operations fuse_dentry_operations = {
259 .d_revalidate = fuse_dentry_revalidate, 301 .d_revalidate = fuse_dentry_revalidate,
302 .d_delete = fuse_dentry_delete,
303#if BITS_PER_LONG < 64
260 .d_init = fuse_dentry_init, 304 .d_init = fuse_dentry_init,
261 .d_release = fuse_dentry_release, 305 .d_release = fuse_dentry_release,
306#endif
262}; 307};
263 308
264const struct dentry_operations fuse_root_dentry_operations = { 309const struct dentry_operations fuse_root_dentry_operations = {
310#if BITS_PER_LONG < 64
265 .d_init = fuse_dentry_init, 311 .d_init = fuse_dentry_init,
266 .d_release = fuse_dentry_release, 312 .d_release = fuse_dentry_release,
313#endif
267}; 314};
268 315
269int fuse_valid_type(int m) 316int fuse_valid_type(int m)
@@ -410,18 +457,18 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
410 inarg.flags = flags; 457 inarg.flags = flags;
411 inarg.mode = mode; 458 inarg.mode = mode;
412 inarg.umask = current_umask(); 459 inarg.umask = current_umask();
413 args.in.h.opcode = FUSE_CREATE; 460 args.opcode = FUSE_CREATE;
414 args.in.h.nodeid = get_node_id(dir); 461 args.nodeid = get_node_id(dir);
415 args.in.numargs = 2; 462 args.in_numargs = 2;
416 args.in.args[0].size = sizeof(inarg); 463 args.in_args[0].size = sizeof(inarg);
417 args.in.args[0].value = &inarg; 464 args.in_args[0].value = &inarg;
418 args.in.args[1].size = entry->d_name.len + 1; 465 args.in_args[1].size = entry->d_name.len + 1;
419 args.in.args[1].value = entry->d_name.name; 466 args.in_args[1].value = entry->d_name.name;
420 args.out.numargs = 2; 467 args.out_numargs = 2;
421 args.out.args[0].size = sizeof(outentry); 468 args.out_args[0].size = sizeof(outentry);
422 args.out.args[0].value = &outentry; 469 args.out_args[0].value = &outentry;
423 args.out.args[1].size = sizeof(outopen); 470 args.out_args[1].size = sizeof(outopen);
424 args.out.args[1].value = &outopen; 471 args.out_args[1].value = &outopen;
425 err = fuse_simple_request(fc, &args); 472 err = fuse_simple_request(fc, &args);
426 if (err) 473 if (err)
427 goto out_free_ff; 474 goto out_free_ff;
@@ -526,10 +573,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
526 return -ENOMEM; 573 return -ENOMEM;
527 574
528 memset(&outarg, 0, sizeof(outarg)); 575 memset(&outarg, 0, sizeof(outarg));
529 args->in.h.nodeid = get_node_id(dir); 576 args->nodeid = get_node_id(dir);
530 args->out.numargs = 1; 577 args->out_numargs = 1;
531 args->out.args[0].size = sizeof(outarg); 578 args->out_args[0].size = sizeof(outarg);
532 args->out.args[0].value = &outarg; 579 args->out_args[0].value = &outarg;
533 err = fuse_simple_request(fc, args); 580 err = fuse_simple_request(fc, args);
534 if (err) 581 if (err)
535 goto out_put_forget_req; 582 goto out_put_forget_req;
@@ -582,12 +629,12 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
582 inarg.mode = mode; 629 inarg.mode = mode;
583 inarg.rdev = new_encode_dev(rdev); 630 inarg.rdev = new_encode_dev(rdev);
584 inarg.umask = current_umask(); 631 inarg.umask = current_umask();
585 args.in.h.opcode = FUSE_MKNOD; 632 args.opcode = FUSE_MKNOD;
586 args.in.numargs = 2; 633 args.in_numargs = 2;
587 args.in.args[0].size = sizeof(inarg); 634 args.in_args[0].size = sizeof(inarg);
588 args.in.args[0].value = &inarg; 635 args.in_args[0].value = &inarg;
589 args.in.args[1].size = entry->d_name.len + 1; 636 args.in_args[1].size = entry->d_name.len + 1;
590 args.in.args[1].value = entry->d_name.name; 637 args.in_args[1].value = entry->d_name.name;
591 return create_new_entry(fc, &args, dir, entry, mode); 638 return create_new_entry(fc, &args, dir, entry, mode);
592} 639}
593 640
@@ -609,12 +656,12 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
609 memset(&inarg, 0, sizeof(inarg)); 656 memset(&inarg, 0, sizeof(inarg));
610 inarg.mode = mode; 657 inarg.mode = mode;
611 inarg.umask = current_umask(); 658 inarg.umask = current_umask();
612 args.in.h.opcode = FUSE_MKDIR; 659 args.opcode = FUSE_MKDIR;
613 args.in.numargs = 2; 660 args.in_numargs = 2;
614 args.in.args[0].size = sizeof(inarg); 661 args.in_args[0].size = sizeof(inarg);
615 args.in.args[0].value = &inarg; 662 args.in_args[0].value = &inarg;
616 args.in.args[1].size = entry->d_name.len + 1; 663 args.in_args[1].size = entry->d_name.len + 1;
617 args.in.args[1].value = entry->d_name.name; 664 args.in_args[1].value = entry->d_name.name;
618 return create_new_entry(fc, &args, dir, entry, S_IFDIR); 665 return create_new_entry(fc, &args, dir, entry, S_IFDIR);
619} 666}
620 667
@@ -625,12 +672,12 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
625 unsigned len = strlen(link) + 1; 672 unsigned len = strlen(link) + 1;
626 FUSE_ARGS(args); 673 FUSE_ARGS(args);
627 674
628 args.in.h.opcode = FUSE_SYMLINK; 675 args.opcode = FUSE_SYMLINK;
629 args.in.numargs = 2; 676 args.in_numargs = 2;
630 args.in.args[0].size = entry->d_name.len + 1; 677 args.in_args[0].size = entry->d_name.len + 1;
631 args.in.args[0].value = entry->d_name.name; 678 args.in_args[0].value = entry->d_name.name;
632 args.in.args[1].size = len; 679 args.in_args[1].size = len;
633 args.in.args[1].value = link; 680 args.in_args[1].value = link;
634 return create_new_entry(fc, &args, dir, entry, S_IFLNK); 681 return create_new_entry(fc, &args, dir, entry, S_IFLNK);
635} 682}
636 683
@@ -648,11 +695,11 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
648 struct fuse_conn *fc = get_fuse_conn(dir); 695 struct fuse_conn *fc = get_fuse_conn(dir);
649 FUSE_ARGS(args); 696 FUSE_ARGS(args);
650 697
651 args.in.h.opcode = FUSE_UNLINK; 698 args.opcode = FUSE_UNLINK;
652 args.in.h.nodeid = get_node_id(dir); 699 args.nodeid = get_node_id(dir);
653 args.in.numargs = 1; 700 args.in_numargs = 1;
654 args.in.args[0].size = entry->d_name.len + 1; 701 args.in_args[0].size = entry->d_name.len + 1;
655 args.in.args[0].value = entry->d_name.name; 702 args.in_args[0].value = entry->d_name.name;
656 err = fuse_simple_request(fc, &args); 703 err = fuse_simple_request(fc, &args);
657 if (!err) { 704 if (!err) {
658 struct inode *inode = d_inode(entry); 705 struct inode *inode = d_inode(entry);
@@ -684,11 +731,11 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
684 struct fuse_conn *fc = get_fuse_conn(dir); 731 struct fuse_conn *fc = get_fuse_conn(dir);
685 FUSE_ARGS(args); 732 FUSE_ARGS(args);
686 733
687 args.in.h.opcode = FUSE_RMDIR; 734 args.opcode = FUSE_RMDIR;
688 args.in.h.nodeid = get_node_id(dir); 735 args.nodeid = get_node_id(dir);
689 args.in.numargs = 1; 736 args.in_numargs = 1;
690 args.in.args[0].size = entry->d_name.len + 1; 737 args.in_args[0].size = entry->d_name.len + 1;
691 args.in.args[0].value = entry->d_name.name; 738 args.in_args[0].value = entry->d_name.name;
692 err = fuse_simple_request(fc, &args); 739 err = fuse_simple_request(fc, &args);
693 if (!err) { 740 if (!err) {
694 clear_nlink(d_inode(entry)); 741 clear_nlink(d_inode(entry));
@@ -711,15 +758,15 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
711 memset(&inarg, 0, argsize); 758 memset(&inarg, 0, argsize);
712 inarg.newdir = get_node_id(newdir); 759 inarg.newdir = get_node_id(newdir);
713 inarg.flags = flags; 760 inarg.flags = flags;
714 args.in.h.opcode = opcode; 761 args.opcode = opcode;
715 args.in.h.nodeid = get_node_id(olddir); 762 args.nodeid = get_node_id(olddir);
716 args.in.numargs = 3; 763 args.in_numargs = 3;
717 args.in.args[0].size = argsize; 764 args.in_args[0].size = argsize;
718 args.in.args[0].value = &inarg; 765 args.in_args[0].value = &inarg;
719 args.in.args[1].size = oldent->d_name.len + 1; 766 args.in_args[1].size = oldent->d_name.len + 1;
720 args.in.args[1].value = oldent->d_name.name; 767 args.in_args[1].value = oldent->d_name.name;
721 args.in.args[2].size = newent->d_name.len + 1; 768 args.in_args[2].size = newent->d_name.len + 1;
722 args.in.args[2].value = newent->d_name.name; 769 args.in_args[2].value = newent->d_name.name;
723 err = fuse_simple_request(fc, &args); 770 err = fuse_simple_request(fc, &args);
724 if (!err) { 771 if (!err) {
725 /* ctime changes */ 772 /* ctime changes */
@@ -796,12 +843,12 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
796 843
797 memset(&inarg, 0, sizeof(inarg)); 844 memset(&inarg, 0, sizeof(inarg));
798 inarg.oldnodeid = get_node_id(inode); 845 inarg.oldnodeid = get_node_id(inode);
799 args.in.h.opcode = FUSE_LINK; 846 args.opcode = FUSE_LINK;
800 args.in.numargs = 2; 847 args.in_numargs = 2;
801 args.in.args[0].size = sizeof(inarg); 848 args.in_args[0].size = sizeof(inarg);
802 args.in.args[0].value = &inarg; 849 args.in_args[0].value = &inarg;
803 args.in.args[1].size = newent->d_name.len + 1; 850 args.in_args[1].size = newent->d_name.len + 1;
804 args.in.args[1].value = newent->d_name.name; 851 args.in_args[1].value = newent->d_name.name;
805 err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); 852 err = create_new_entry(fc, &args, newdir, newent, inode->i_mode);
806 /* Contrary to "normal" filesystems it can happen that link 853 /* Contrary to "normal" filesystems it can happen that link
807 makes two "logical" inodes point to the same "physical" 854 makes two "logical" inodes point to the same "physical"
@@ -884,14 +931,14 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
884 inarg.getattr_flags |= FUSE_GETATTR_FH; 931 inarg.getattr_flags |= FUSE_GETATTR_FH;
885 inarg.fh = ff->fh; 932 inarg.fh = ff->fh;
886 } 933 }
887 args.in.h.opcode = FUSE_GETATTR; 934 args.opcode = FUSE_GETATTR;
888 args.in.h.nodeid = get_node_id(inode); 935 args.nodeid = get_node_id(inode);
889 args.in.numargs = 1; 936 args.in_numargs = 1;
890 args.in.args[0].size = sizeof(inarg); 937 args.in_args[0].size = sizeof(inarg);
891 args.in.args[0].value = &inarg; 938 args.in_args[0].value = &inarg;
892 args.out.numargs = 1; 939 args.out_numargs = 1;
893 args.out.args[0].size = sizeof(outarg); 940 args.out_args[0].size = sizeof(outarg);
894 args.out.args[0].value = &outarg; 941 args.out_args[0].value = &outarg;
895 err = fuse_simple_request(fc, &args); 942 err = fuse_simple_request(fc, &args);
896 if (!err) { 943 if (!err) {
897 if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { 944 if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
@@ -1056,11 +1103,11 @@ static int fuse_access(struct inode *inode, int mask)
1056 1103
1057 memset(&inarg, 0, sizeof(inarg)); 1104 memset(&inarg, 0, sizeof(inarg));
1058 inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); 1105 inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC);
1059 args.in.h.opcode = FUSE_ACCESS; 1106 args.opcode = FUSE_ACCESS;
1060 args.in.h.nodeid = get_node_id(inode); 1107 args.nodeid = get_node_id(inode);
1061 args.in.numargs = 1; 1108 args.in_numargs = 1;
1062 args.in.args[0].size = sizeof(inarg); 1109 args.in_args[0].size = sizeof(inarg);
1063 args.in.args[0].value = &inarg; 1110 args.in_args[0].value = &inarg;
1064 err = fuse_simple_request(fc, &args); 1111 err = fuse_simple_request(fc, &args);
1065 if (err == -ENOSYS) { 1112 if (err == -ENOSYS) {
1066 fc->no_access = 1; 1113 fc->no_access = 1;
@@ -1152,38 +1199,36 @@ static int fuse_permission(struct inode *inode, int mask)
1152static int fuse_readlink_page(struct inode *inode, struct page *page) 1199static int fuse_readlink_page(struct inode *inode, struct page *page)
1153{ 1200{
1154 struct fuse_conn *fc = get_fuse_conn(inode); 1201 struct fuse_conn *fc = get_fuse_conn(inode);
1155 struct fuse_req *req; 1202 struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 };
1156 int err; 1203 struct fuse_args_pages ap = {
1204 .num_pages = 1,
1205 .pages = &page,
1206 .descs = &desc,
1207 };
1208 char *link;
1209 ssize_t res;
1210
1211 ap.args.opcode = FUSE_READLINK;
1212 ap.args.nodeid = get_node_id(inode);
1213 ap.args.out_pages = true;
1214 ap.args.out_argvar = true;
1215 ap.args.page_zeroing = true;
1216 ap.args.out_numargs = 1;
1217 ap.args.out_args[0].size = desc.length;
1218 res = fuse_simple_request(fc, &ap.args);
1157 1219
1158 req = fuse_get_req(fc, 1); 1220 fuse_invalidate_atime(inode);
1159 if (IS_ERR(req))
1160 return PTR_ERR(req);
1161
1162 req->out.page_zeroing = 1;
1163 req->out.argpages = 1;
1164 req->num_pages = 1;
1165 req->pages[0] = page;
1166 req->page_descs[0].length = PAGE_SIZE - 1;
1167 req->in.h.opcode = FUSE_READLINK;
1168 req->in.h.nodeid = get_node_id(inode);
1169 req->out.argvar = 1;
1170 req->out.numargs = 1;
1171 req->out.args[0].size = PAGE_SIZE - 1;
1172 fuse_request_send(fc, req);
1173 err = req->out.h.error;
1174 1221
1175 if (!err) { 1222 if (res < 0)
1176 char *link = page_address(page); 1223 return res;
1177 size_t len = req->out.args[0].size;
1178 1224
1179 BUG_ON(len >= PAGE_SIZE); 1225 if (WARN_ON(res >= PAGE_SIZE))
1180 link[len] = '\0'; 1226 return -EIO;
1181 }
1182 1227
1183 fuse_put_request(fc, req); 1228 link = page_address(page);
1184 fuse_invalidate_atime(inode); 1229 link[res] = '\0';
1185 1230
1186 return err; 1231 return 0;
1187} 1232}
1188 1233
1189static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, 1234static const char *fuse_get_link(struct dentry *dentry, struct inode *inode,
@@ -1383,14 +1428,14 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
1383 struct fuse_setattr_in *inarg_p, 1428 struct fuse_setattr_in *inarg_p,
1384 struct fuse_attr_out *outarg_p) 1429 struct fuse_attr_out *outarg_p)
1385{ 1430{
1386 args->in.h.opcode = FUSE_SETATTR; 1431 args->opcode = FUSE_SETATTR;
1387 args->in.h.nodeid = get_node_id(inode); 1432 args->nodeid = get_node_id(inode);
1388 args->in.numargs = 1; 1433 args->in_numargs = 1;
1389 args->in.args[0].size = sizeof(*inarg_p); 1434 args->in_args[0].size = sizeof(*inarg_p);
1390 args->in.args[0].value = inarg_p; 1435 args->in_args[0].value = inarg_p;
1391 args->out.numargs = 1; 1436 args->out_numargs = 1;
1392 args->out.args[0].size = sizeof(*outarg_p); 1437 args->out_args[0].size = sizeof(*outarg_p);
1393 args->out.args[0].value = outarg_p; 1438 args->out_args[0].value = outarg_p;
1394} 1439}
1395 1440
1396/* 1441/*
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5ae2828beb00..0f0225686aee 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,6 +19,18 @@
19#include <linux/falloc.h> 19#include <linux/falloc.h>
20#include <linux/uio.h> 20#include <linux/uio.h>
21 21
22static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
23 struct fuse_page_desc **desc)
24{
25 struct page **pages;
26
27 pages = kzalloc(npages * (sizeof(struct page *) +
28 sizeof(struct fuse_page_desc)), flags);
29 *desc = (void *) (pages + npages);
30
31 return pages;
32}
33
22static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, 34static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
23 int opcode, struct fuse_open_out *outargp) 35 int opcode, struct fuse_open_out *outargp)
24{ 36{
@@ -29,29 +41,36 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
29 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); 41 inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
30 if (!fc->atomic_o_trunc) 42 if (!fc->atomic_o_trunc)
31 inarg.flags &= ~O_TRUNC; 43 inarg.flags &= ~O_TRUNC;
32 args.in.h.opcode = opcode; 44 args.opcode = opcode;
33 args.in.h.nodeid = nodeid; 45 args.nodeid = nodeid;
34 args.in.numargs = 1; 46 args.in_numargs = 1;
35 args.in.args[0].size = sizeof(inarg); 47 args.in_args[0].size = sizeof(inarg);
36 args.in.args[0].value = &inarg; 48 args.in_args[0].value = &inarg;
37 args.out.numargs = 1; 49 args.out_numargs = 1;
38 args.out.args[0].size = sizeof(*outargp); 50 args.out_args[0].size = sizeof(*outargp);
39 args.out.args[0].value = outargp; 51 args.out_args[0].value = outargp;
40 52
41 return fuse_simple_request(fc, &args); 53 return fuse_simple_request(fc, &args);
42} 54}
43 55
56struct fuse_release_args {
57 struct fuse_args args;
58 struct fuse_release_in inarg;
59 struct inode *inode;
60};
61
44struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) 62struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
45{ 63{
46 struct fuse_file *ff; 64 struct fuse_file *ff;
47 65
48 ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); 66 ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
49 if (unlikely(!ff)) 67 if (unlikely(!ff))
50 return NULL; 68 return NULL;
51 69
52 ff->fc = fc; 70 ff->fc = fc;
53 ff->reserved_req = fuse_request_alloc(0); 71 ff->release_args = kzalloc(sizeof(*ff->release_args),
54 if (unlikely(!ff->reserved_req)) { 72 GFP_KERNEL_ACCOUNT);
73 if (!ff->release_args) {
55 kfree(ff); 74 kfree(ff);
56 return NULL; 75 return NULL;
57 } 76 }
@@ -69,7 +88,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
69 88
70void fuse_file_free(struct fuse_file *ff) 89void fuse_file_free(struct fuse_file *ff)
71{ 90{
72 fuse_request_free(ff->reserved_req); 91 kfree(ff->release_args);
73 mutex_destroy(&ff->readdir.lock); 92 mutex_destroy(&ff->readdir.lock);
74 kfree(ff); 93 kfree(ff);
75} 94}
@@ -80,34 +99,31 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff)
80 return ff; 99 return ff;
81} 100}
82 101
83static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) 102static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args,
103 int error)
84{ 104{
85 iput(req->misc.release.inode); 105 struct fuse_release_args *ra = container_of(args, typeof(*ra), args);
106
107 iput(ra->inode);
108 kfree(ra);
86} 109}
87 110
88static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) 111static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
89{ 112{
90 if (refcount_dec_and_test(&ff->count)) { 113 if (refcount_dec_and_test(&ff->count)) {
91 struct fuse_req *req = ff->reserved_req; 114 struct fuse_args *args = &ff->release_args->args;
92 115
93 if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { 116 if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
94 /* 117 /* Do nothing when client does not implement 'open' */
95 * Drop the release request when client does not 118 fuse_release_end(ff->fc, args, 0);
96 * implement 'open'
97 */
98 __clear_bit(FR_BACKGROUND, &req->flags);
99 iput(req->misc.release.inode);
100 fuse_put_request(ff->fc, req);
101 } else if (sync) { 119 } else if (sync) {
102 __set_bit(FR_FORCE, &req->flags); 120 fuse_simple_request(ff->fc, args);
103 __clear_bit(FR_BACKGROUND, &req->flags); 121 fuse_release_end(ff->fc, args, 0);
104 fuse_request_send(ff->fc, req);
105 iput(req->misc.release.inode);
106 fuse_put_request(ff->fc, req);
107 } else { 122 } else {
108 req->end = fuse_release_end; 123 args->end = fuse_release_end;
109 __set_bit(FR_BACKGROUND, &req->flags); 124 if (fuse_simple_background(ff->fc, args,
110 fuse_request_send_background(ff->fc, req); 125 GFP_KERNEL | __GFP_NOFAIL))
126 fuse_release_end(ff->fc, args, -ENOTCONN);
111 } 127 }
112 kfree(ff); 128 kfree(ff);
113 } 129 }
@@ -227,8 +243,7 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
227 int flags, int opcode) 243 int flags, int opcode)
228{ 244{
229 struct fuse_conn *fc = ff->fc; 245 struct fuse_conn *fc = ff->fc;
230 struct fuse_req *req = ff->reserved_req; 246 struct fuse_release_args *ra = ff->release_args;
231 struct fuse_release_in *inarg = &req->misc.release.in;
232 247
233 /* Inode is NULL on error path of fuse_create_open() */ 248 /* Inode is NULL on error path of fuse_create_open() */
234 if (likely(fi)) { 249 if (likely(fi)) {
@@ -243,32 +258,33 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
243 258
244 wake_up_interruptible_all(&ff->poll_wait); 259 wake_up_interruptible_all(&ff->poll_wait);
245 260
246 inarg->fh = ff->fh; 261 ra->inarg.fh = ff->fh;
247 inarg->flags = flags; 262 ra->inarg.flags = flags;
248 req->in.h.opcode = opcode; 263 ra->args.in_numargs = 1;
249 req->in.h.nodeid = ff->nodeid; 264 ra->args.in_args[0].size = sizeof(struct fuse_release_in);
250 req->in.numargs = 1; 265 ra->args.in_args[0].value = &ra->inarg;
251 req->in.args[0].size = sizeof(struct fuse_release_in); 266 ra->args.opcode = opcode;
252 req->in.args[0].value = inarg; 267 ra->args.nodeid = ff->nodeid;
268 ra->args.force = true;
269 ra->args.nocreds = true;
253} 270}
254 271
255void fuse_release_common(struct file *file, bool isdir) 272void fuse_release_common(struct file *file, bool isdir)
256{ 273{
257 struct fuse_inode *fi = get_fuse_inode(file_inode(file)); 274 struct fuse_inode *fi = get_fuse_inode(file_inode(file));
258 struct fuse_file *ff = file->private_data; 275 struct fuse_file *ff = file->private_data;
259 struct fuse_req *req = ff->reserved_req; 276 struct fuse_release_args *ra = ff->release_args;
260 int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; 277 int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
261 278
262 fuse_prepare_release(fi, ff, file->f_flags, opcode); 279 fuse_prepare_release(fi, ff, file->f_flags, opcode);
263 280
264 if (ff->flock) { 281 if (ff->flock) {
265 struct fuse_release_in *inarg = &req->misc.release.in; 282 ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
266 inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; 283 ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc,
267 inarg->lock_owner = fuse_lock_owner_id(ff->fc, 284 (fl_owner_t) file);
268 (fl_owner_t) file);
269 } 285 }
270 /* Hold inode until release is finished */ 286 /* Hold inode until release is finished */
271 req->misc.release.inode = igrab(file_inode(file)); 287 ra->inode = igrab(file_inode(file));
272 288
273 /* 289 /*
274 * Normally this will send the RELEASE request, however if 290 * Normally this will send the RELEASE request, however if
@@ -279,7 +295,7 @@ void fuse_release_common(struct file *file, bool isdir)
279 * synchronous RELEASE is allowed (and desirable) in this case 295 * synchronous RELEASE is allowed (and desirable) in this case
280 * because the server can be trusted not to screw up. 296 * because the server can be trusted not to screw up.
281 */ 297 */
282 fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir); 298 fuse_file_put(ff, ff->fc->destroy, isdir);
283} 299}
284 300
285static int fuse_open(struct inode *inode, struct file *file) 301static int fuse_open(struct inode *inode, struct file *file)
@@ -335,19 +351,27 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
335 return (u64) v0 + ((u64) v1 << 32); 351 return (u64) v0 + ((u64) v1 << 32);
336} 352}
337 353
338static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi, 354struct fuse_writepage_args {
355 struct fuse_io_args ia;
356 struct list_head writepages_entry;
357 struct list_head queue_entry;
358 struct fuse_writepage_args *next;
359 struct inode *inode;
360};
361
362static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
339 pgoff_t idx_from, pgoff_t idx_to) 363 pgoff_t idx_from, pgoff_t idx_to)
340{ 364{
341 struct fuse_req *req; 365 struct fuse_writepage_args *wpa;
342 366
343 list_for_each_entry(req, &fi->writepages, writepages_entry) { 367 list_for_each_entry(wpa, &fi->writepages, writepages_entry) {
344 pgoff_t curr_index; 368 pgoff_t curr_index;
345 369
346 WARN_ON(get_fuse_inode(req->inode) != fi); 370 WARN_ON(get_fuse_inode(wpa->inode) != fi);
347 curr_index = req->misc.write.in.offset >> PAGE_SHIFT; 371 curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
348 if (idx_from < curr_index + req->num_pages && 372 if (idx_from < curr_index + wpa->ia.ap.num_pages &&
349 curr_index <= idx_to) { 373 curr_index <= idx_to) {
350 return req; 374 return wpa;
351 } 375 }
352 } 376 }
353 return NULL; 377 return NULL;
@@ -383,12 +407,11 @@ static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
383 * Since fuse doesn't rely on the VM writeback tracking, this has to 407 * Since fuse doesn't rely on the VM writeback tracking, this has to
384 * use some other means. 408 * use some other means.
385 */ 409 */
386static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) 410static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
387{ 411{
388 struct fuse_inode *fi = get_fuse_inode(inode); 412 struct fuse_inode *fi = get_fuse_inode(inode);
389 413
390 wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); 414 wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
391 return 0;
392} 415}
393 416
394/* 417/*
@@ -411,8 +434,8 @@ static int fuse_flush(struct file *file, fl_owner_t id)
411 struct inode *inode = file_inode(file); 434 struct inode *inode = file_inode(file);
412 struct fuse_conn *fc = get_fuse_conn(inode); 435 struct fuse_conn *fc = get_fuse_conn(inode);
413 struct fuse_file *ff = file->private_data; 436 struct fuse_file *ff = file->private_data;
414 struct fuse_req *req;
415 struct fuse_flush_in inarg; 437 struct fuse_flush_in inarg;
438 FUSE_ARGS(args);
416 int err; 439 int err;
417 440
418 if (is_bad_inode(inode)) 441 if (is_bad_inode(inode))
@@ -433,19 +456,17 @@ static int fuse_flush(struct file *file, fl_owner_t id)
433 if (err) 456 if (err)
434 return err; 457 return err;
435 458
436 req = fuse_get_req_nofail_nopages(fc, file);
437 memset(&inarg, 0, sizeof(inarg)); 459 memset(&inarg, 0, sizeof(inarg));
438 inarg.fh = ff->fh; 460 inarg.fh = ff->fh;
439 inarg.lock_owner = fuse_lock_owner_id(fc, id); 461 inarg.lock_owner = fuse_lock_owner_id(fc, id);
440 req->in.h.opcode = FUSE_FLUSH; 462 args.opcode = FUSE_FLUSH;
441 req->in.h.nodeid = get_node_id(inode); 463 args.nodeid = get_node_id(inode);
442 req->in.numargs = 1; 464 args.in_numargs = 1;
443 req->in.args[0].size = sizeof(inarg); 465 args.in_args[0].size = sizeof(inarg);
444 req->in.args[0].value = &inarg; 466 args.in_args[0].value = &inarg;
445 __set_bit(FR_FORCE, &req->flags); 467 args.force = true;
446 fuse_request_send(fc, req); 468
447 err = req->out.h.error; 469 err = fuse_simple_request(fc, &args);
448 fuse_put_request(fc, req);
449 if (err == -ENOSYS) { 470 if (err == -ENOSYS) {
450 fc->no_flush = 1; 471 fc->no_flush = 1;
451 err = 0; 472 err = 0;
@@ -465,11 +486,11 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
465 memset(&inarg, 0, sizeof(inarg)); 486 memset(&inarg, 0, sizeof(inarg));
466 inarg.fh = ff->fh; 487 inarg.fh = ff->fh;
467 inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0; 488 inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
468 args.in.h.opcode = opcode; 489 args.opcode = opcode;
469 args.in.h.nodeid = get_node_id(inode); 490 args.nodeid = get_node_id(inode);
470 args.in.numargs = 1; 491 args.in_numargs = 1;
471 args.in.args[0].size = sizeof(inarg); 492 args.in_args[0].size = sizeof(inarg);
472 args.in.args[0].value = &inarg; 493 args.in_args[0].value = &inarg;
473 return fuse_simple_request(fc, &args); 494 return fuse_simple_request(fc, &args);
474} 495}
475 496
@@ -523,35 +544,35 @@ out:
523 return err; 544 return err;
524} 545}
525 546
526void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, 547void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
527 size_t count, int opcode) 548 size_t count, int opcode)
528{ 549{
529 struct fuse_read_in *inarg = &req->misc.read.in;
530 struct fuse_file *ff = file->private_data; 550 struct fuse_file *ff = file->private_data;
551 struct fuse_args *args = &ia->ap.args;
531 552
532 inarg->fh = ff->fh; 553 ia->read.in.fh = ff->fh;
533 inarg->offset = pos; 554 ia->read.in.offset = pos;
534 inarg->size = count; 555 ia->read.in.size = count;
535 inarg->flags = file->f_flags; 556 ia->read.in.flags = file->f_flags;
536 req->in.h.opcode = opcode; 557 args->opcode = opcode;
537 req->in.h.nodeid = ff->nodeid; 558 args->nodeid = ff->nodeid;
538 req->in.numargs = 1; 559 args->in_numargs = 1;
539 req->in.args[0].size = sizeof(struct fuse_read_in); 560 args->in_args[0].size = sizeof(ia->read.in);
540 req->in.args[0].value = inarg; 561 args->in_args[0].value = &ia->read.in;
541 req->out.argvar = 1; 562 args->out_argvar = true;
542 req->out.numargs = 1; 563 args->out_numargs = 1;
543 req->out.args[0].size = count; 564 args->out_args[0].size = count;
544} 565}
545 566
546static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) 567static void fuse_release_user_pages(struct fuse_args_pages *ap,
568 bool should_dirty)
547{ 569{
548 unsigned i; 570 unsigned int i;
549 571
550 for (i = 0; i < req->num_pages; i++) { 572 for (i = 0; i < ap->num_pages; i++) {
551 struct page *page = req->pages[i];
552 if (should_dirty) 573 if (should_dirty)
553 set_page_dirty_lock(page); 574 set_page_dirty_lock(ap->pages[i]);
554 put_page(page); 575 put_page(ap->pages[i]);
555 } 576 }
556} 577}
557 578
@@ -621,64 +642,94 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
621 kref_put(&io->refcnt, fuse_io_release); 642 kref_put(&io->refcnt, fuse_io_release);
622} 643}
623 644
624static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) 645static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
646 unsigned int npages)
647{
648 struct fuse_io_args *ia;
649
650 ia = kzalloc(sizeof(*ia), GFP_KERNEL);
651 if (ia) {
652 ia->io = io;
653 ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
654 &ia->ap.descs);
655 if (!ia->ap.pages) {
656 kfree(ia);
657 ia = NULL;
658 }
659 }
660 return ia;
661}
662
663static void fuse_io_free(struct fuse_io_args *ia)
664{
665 kfree(ia->ap.pages);
666 kfree(ia);
667}
668
669static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
670 int err)
625{ 671{
626 struct fuse_io_priv *io = req->io; 672 struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
673 struct fuse_io_priv *io = ia->io;
627 ssize_t pos = -1; 674 ssize_t pos = -1;
628 675
629 fuse_release_user_pages(req, io->should_dirty); 676 fuse_release_user_pages(&ia->ap, io->should_dirty);
630 677
631 if (io->write) { 678 if (err) {
632 if (req->misc.write.in.size != req->misc.write.out.size) 679 /* Nothing */
633 pos = req->misc.write.in.offset - io->offset + 680 } else if (io->write) {
634 req->misc.write.out.size; 681 if (ia->write.out.size > ia->write.in.size) {
682 err = -EIO;
683 } else if (ia->write.in.size != ia->write.out.size) {
684 pos = ia->write.in.offset - io->offset +
685 ia->write.out.size;
686 }
635 } else { 687 } else {
636 if (req->misc.read.in.size != req->out.args[0].size) 688 u32 outsize = args->out_args[0].size;
637 pos = req->misc.read.in.offset - io->offset + 689
638 req->out.args[0].size; 690 if (ia->read.in.size != outsize)
691 pos = ia->read.in.offset - io->offset + outsize;
639 } 692 }
640 693
641 fuse_aio_complete(io, req->out.h.error, pos); 694 fuse_aio_complete(io, err, pos);
695 fuse_io_free(ia);
642} 696}
643 697
644static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, 698static ssize_t fuse_async_req_send(struct fuse_conn *fc,
645 size_t num_bytes, struct fuse_io_priv *io) 699 struct fuse_io_args *ia, size_t num_bytes)
646{ 700{
701 ssize_t err;
702 struct fuse_io_priv *io = ia->io;
703
647 spin_lock(&io->lock); 704 spin_lock(&io->lock);
648 kref_get(&io->refcnt); 705 kref_get(&io->refcnt);
649 io->size += num_bytes; 706 io->size += num_bytes;
650 io->reqs++; 707 io->reqs++;
651 spin_unlock(&io->lock); 708 spin_unlock(&io->lock);
652 709
653 req->io = io; 710 ia->ap.args.end = fuse_aio_complete_req;
654 req->end = fuse_aio_complete_req; 711 err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
655 712
656 __fuse_get_request(req); 713 return err ?: num_bytes;
657 fuse_request_send_background(fc, req);
658
659 return num_bytes;
660} 714}
661 715
662static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io, 716static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
663 loff_t pos, size_t count, fl_owner_t owner) 717 fl_owner_t owner)
664{ 718{
665 struct file *file = io->iocb->ki_filp; 719 struct file *file = ia->io->iocb->ki_filp;
666 struct fuse_file *ff = file->private_data; 720 struct fuse_file *ff = file->private_data;
667 struct fuse_conn *fc = ff->fc; 721 struct fuse_conn *fc = ff->fc;
668 722
669 fuse_read_fill(req, file, pos, count, FUSE_READ); 723 fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
670 if (owner != NULL) { 724 if (owner != NULL) {
671 struct fuse_read_in *inarg = &req->misc.read.in; 725 ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
672 726 ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner);
673 inarg->read_flags |= FUSE_READ_LOCKOWNER;
674 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
675 } 727 }
676 728
677 if (io->async) 729 if (ia->io->async)
678 return fuse_async_req_send(fc, req, count, io); 730 return fuse_async_req_send(fc, ia, count);
679 731
680 fuse_request_send(fc, req); 732 return fuse_simple_request(fc, &ia->ap.args);
681 return req->out.args[0].size;
682} 733}
683 734
684static void fuse_read_update_size(struct inode *inode, loff_t size, 735static void fuse_read_update_size(struct inode *inode, loff_t size,
@@ -696,10 +747,9 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
696 spin_unlock(&fi->lock); 747 spin_unlock(&fi->lock);
697} 748}
698 749
699static void fuse_short_read(struct fuse_req *req, struct inode *inode, 750static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
700 u64 attr_ver) 751 struct fuse_args_pages *ap)
701{ 752{
702 size_t num_read = req->out.args[0].size;
703 struct fuse_conn *fc = get_fuse_conn(inode); 753 struct fuse_conn *fc = get_fuse_conn(inode);
704 754
705 if (fc->writeback_cache) { 755 if (fc->writeback_cache) {
@@ -712,28 +762,31 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode,
712 int start_idx = num_read >> PAGE_SHIFT; 762 int start_idx = num_read >> PAGE_SHIFT;
713 size_t off = num_read & (PAGE_SIZE - 1); 763 size_t off = num_read & (PAGE_SIZE - 1);
714 764
715 for (i = start_idx; i < req->num_pages; i++) { 765 for (i = start_idx; i < ap->num_pages; i++) {
716 zero_user_segment(req->pages[i], off, PAGE_SIZE); 766 zero_user_segment(ap->pages[i], off, PAGE_SIZE);
717 off = 0; 767 off = 0;
718 } 768 }
719 } else { 769 } else {
720 loff_t pos = page_offset(req->pages[0]) + num_read; 770 loff_t pos = page_offset(ap->pages[0]) + num_read;
721 fuse_read_update_size(inode, pos, attr_ver); 771 fuse_read_update_size(inode, pos, attr_ver);
722 } 772 }
723} 773}
724 774
725static int fuse_do_readpage(struct file *file, struct page *page) 775static int fuse_do_readpage(struct file *file, struct page *page)
726{ 776{
727 struct kiocb iocb;
728 struct fuse_io_priv io;
729 struct inode *inode = page->mapping->host; 777 struct inode *inode = page->mapping->host;
730 struct fuse_conn *fc = get_fuse_conn(inode); 778 struct fuse_conn *fc = get_fuse_conn(inode);
731 struct fuse_req *req;
732 size_t num_read;
733 loff_t pos = page_offset(page); 779 loff_t pos = page_offset(page);
734 size_t count = PAGE_SIZE; 780 struct fuse_page_desc desc = { .length = PAGE_SIZE };
781 struct fuse_io_args ia = {
782 .ap.args.page_zeroing = true,
783 .ap.args.out_pages = true,
784 .ap.num_pages = 1,
785 .ap.pages = &page,
786 .ap.descs = &desc,
787 };
788 ssize_t res;
735 u64 attr_ver; 789 u64 attr_ver;
736 int err;
737 790
738 /* 791 /*
739 * Page writeback can extend beyond the lifetime of the 792 * Page writeback can extend beyond the lifetime of the
@@ -742,35 +795,21 @@ static int fuse_do_readpage(struct file *file, struct page *page)
742 */ 795 */
743 fuse_wait_on_page_writeback(inode, page->index); 796 fuse_wait_on_page_writeback(inode, page->index);
744 797
745 req = fuse_get_req(fc, 1);
746 if (IS_ERR(req))
747 return PTR_ERR(req);
748
749 attr_ver = fuse_get_attr_version(fc); 798 attr_ver = fuse_get_attr_version(fc);
750 799
751 req->out.page_zeroing = 1; 800 fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
752 req->out.argpages = 1; 801 res = fuse_simple_request(fc, &ia.ap.args);
753 req->num_pages = 1; 802 if (res < 0)
754 req->pages[0] = page; 803 return res;
755 req->page_descs[0].length = count; 804 /*
756 init_sync_kiocb(&iocb, file); 805 * Short read means EOF. If file size is larger, truncate it
757 io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb); 806 */
758 num_read = fuse_send_read(req, &io, pos, count, NULL); 807 if (res < desc.length)
759 err = req->out.h.error; 808 fuse_short_read(inode, attr_ver, res, &ia.ap);
760
761 if (!err) {
762 /*
763 * Short read means EOF. If file size is larger, truncate it
764 */
765 if (num_read < count)
766 fuse_short_read(req, inode, attr_ver);
767
768 SetPageUptodate(page);
769 }
770 809
771 fuse_put_request(fc, req); 810 SetPageUptodate(page);
772 811
773 return err; 812 return 0;
774} 813}
775 814
776static int fuse_readpage(struct file *file, struct page *page) 815static int fuse_readpage(struct file *file, struct page *page)
@@ -789,15 +828,18 @@ static int fuse_readpage(struct file *file, struct page *page)
789 return err; 828 return err;
790} 829}
791 830
792static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) 831static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
832 int err)
793{ 833{
794 int i; 834 int i;
795 size_t count = req->misc.read.in.size; 835 struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
796 size_t num_read = req->out.args[0].size; 836 struct fuse_args_pages *ap = &ia->ap;
837 size_t count = ia->read.in.size;
838 size_t num_read = args->out_args[0].size;
797 struct address_space *mapping = NULL; 839 struct address_space *mapping = NULL;
798 840
799 for (i = 0; mapping == NULL && i < req->num_pages; i++) 841 for (i = 0; mapping == NULL && i < ap->num_pages; i++)
800 mapping = req->pages[i]->mapping; 842 mapping = ap->pages[i]->mapping;
801 843
802 if (mapping) { 844 if (mapping) {
803 struct inode *inode = mapping->host; 845 struct inode *inode = mapping->host;
@@ -805,93 +847,97 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
805 /* 847 /*
806 * Short read means EOF. If file size is larger, truncate it 848 * Short read means EOF. If file size is larger, truncate it
807 */ 849 */
808 if (!req->out.h.error && num_read < count) 850 if (!err && num_read < count)
809 fuse_short_read(req, inode, req->misc.read.attr_ver); 851 fuse_short_read(inode, ia->read.attr_ver, num_read, ap);
810 852
811 fuse_invalidate_atime(inode); 853 fuse_invalidate_atime(inode);
812 } 854 }
813 855
814 for (i = 0; i < req->num_pages; i++) { 856 for (i = 0; i < ap->num_pages; i++) {
815 struct page *page = req->pages[i]; 857 struct page *page = ap->pages[i];
816 if (!req->out.h.error) 858
859 if (!err)
817 SetPageUptodate(page); 860 SetPageUptodate(page);
818 else 861 else
819 SetPageError(page); 862 SetPageError(page);
820 unlock_page(page); 863 unlock_page(page);
821 put_page(page); 864 put_page(page);
822 } 865 }
823 if (req->ff) 866 if (ia->ff)
824 fuse_file_put(req->ff, false, false); 867 fuse_file_put(ia->ff, false, false);
868
869 fuse_io_free(ia);
825} 870}
826 871
827static void fuse_send_readpages(struct fuse_req *req, struct file *file) 872static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
828{ 873{
829 struct fuse_file *ff = file->private_data; 874 struct fuse_file *ff = file->private_data;
830 struct fuse_conn *fc = ff->fc; 875 struct fuse_conn *fc = ff->fc;
831 loff_t pos = page_offset(req->pages[0]); 876 struct fuse_args_pages *ap = &ia->ap;
832 size_t count = req->num_pages << PAGE_SHIFT; 877 loff_t pos = page_offset(ap->pages[0]);
833 878 size_t count = ap->num_pages << PAGE_SHIFT;
834 req->out.argpages = 1; 879 int err;
835 req->out.page_zeroing = 1; 880
836 req->out.page_replace = 1; 881 ap->args.out_pages = true;
837 fuse_read_fill(req, file, pos, count, FUSE_READ); 882 ap->args.page_zeroing = true;
838 req->misc.read.attr_ver = fuse_get_attr_version(fc); 883 ap->args.page_replace = true;
884 fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
885 ia->read.attr_ver = fuse_get_attr_version(fc);
839 if (fc->async_read) { 886 if (fc->async_read) {
840 req->ff = fuse_file_get(ff); 887 ia->ff = fuse_file_get(ff);
841 req->end = fuse_readpages_end; 888 ap->args.end = fuse_readpages_end;
842 fuse_request_send_background(fc, req); 889 err = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
890 if (!err)
891 return;
843 } else { 892 } else {
844 fuse_request_send(fc, req); 893 err = fuse_simple_request(fc, &ap->args);
845 fuse_readpages_end(fc, req);
846 fuse_put_request(fc, req);
847 } 894 }
895 fuse_readpages_end(fc, &ap->args, err);
848} 896}
849 897
850struct fuse_fill_data { 898struct fuse_fill_data {
851 struct fuse_req *req; 899 struct fuse_io_args *ia;
852 struct file *file; 900 struct file *file;
853 struct inode *inode; 901 struct inode *inode;
854 unsigned nr_pages; 902 unsigned int nr_pages;
903 unsigned int max_pages;
855}; 904};
856 905
857static int fuse_readpages_fill(void *_data, struct page *page) 906static int fuse_readpages_fill(void *_data, struct page *page)
858{ 907{
859 struct fuse_fill_data *data = _data; 908 struct fuse_fill_data *data = _data;
860 struct fuse_req *req = data->req; 909 struct fuse_io_args *ia = data->ia;
910 struct fuse_args_pages *ap = &ia->ap;
861 struct inode *inode = data->inode; 911 struct inode *inode = data->inode;
862 struct fuse_conn *fc = get_fuse_conn(inode); 912 struct fuse_conn *fc = get_fuse_conn(inode);
863 913
864 fuse_wait_on_page_writeback(inode, page->index); 914 fuse_wait_on_page_writeback(inode, page->index);
865 915
866 if (req->num_pages && 916 if (ap->num_pages &&
867 (req->num_pages == fc->max_pages || 917 (ap->num_pages == fc->max_pages ||
868 (req->num_pages + 1) * PAGE_SIZE > fc->max_read || 918 (ap->num_pages + 1) * PAGE_SIZE > fc->max_read ||
869 req->pages[req->num_pages - 1]->index + 1 != page->index)) { 919 ap->pages[ap->num_pages - 1]->index + 1 != page->index)) {
870 unsigned int nr_alloc = min_t(unsigned int, data->nr_pages, 920 data->max_pages = min_t(unsigned int, data->nr_pages,
871 fc->max_pages); 921 fc->max_pages);
872 fuse_send_readpages(req, data->file); 922 fuse_send_readpages(ia, data->file);
873 if (fc->async_read) 923 data->ia = ia = fuse_io_alloc(NULL, data->max_pages);
874 req = fuse_get_req_for_background(fc, nr_alloc); 924 if (!ia) {
875 else
876 req = fuse_get_req(fc, nr_alloc);
877
878 data->req = req;
879 if (IS_ERR(req)) {
880 unlock_page(page); 925 unlock_page(page);
881 return PTR_ERR(req); 926 return -ENOMEM;
882 } 927 }
928 ap = &ia->ap;
883 } 929 }
884 930
885 if (WARN_ON(req->num_pages >= req->max_pages)) { 931 if (WARN_ON(ap->num_pages >= data->max_pages)) {
886 unlock_page(page); 932 unlock_page(page);
887 fuse_put_request(fc, req); 933 fuse_io_free(ia);
888 return -EIO; 934 return -EIO;
889 } 935 }
890 936
891 get_page(page); 937 get_page(page);
892 req->pages[req->num_pages] = page; 938 ap->pages[ap->num_pages] = page;
893 req->page_descs[req->num_pages].length = PAGE_SIZE; 939 ap->descs[ap->num_pages].length = PAGE_SIZE;
894 req->num_pages++; 940 ap->num_pages++;
895 data->nr_pages--; 941 data->nr_pages--;
896 return 0; 942 return 0;
897} 943}
@@ -903,7 +949,6 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
903 struct fuse_conn *fc = get_fuse_conn(inode); 949 struct fuse_conn *fc = get_fuse_conn(inode);
904 struct fuse_fill_data data; 950 struct fuse_fill_data data;
905 int err; 951 int err;
906 unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages);
907 952
908 err = -EIO; 953 err = -EIO;
909 if (is_bad_inode(inode)) 954 if (is_bad_inode(inode))
@@ -911,21 +956,20 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
911 956
912 data.file = file; 957 data.file = file;
913 data.inode = inode; 958 data.inode = inode;
914 if (fc->async_read)
915 data.req = fuse_get_req_for_background(fc, nr_alloc);
916 else
917 data.req = fuse_get_req(fc, nr_alloc);
918 data.nr_pages = nr_pages; 959 data.nr_pages = nr_pages;
919 err = PTR_ERR(data.req); 960 data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages);
920 if (IS_ERR(data.req)) 961;
962 data.ia = fuse_io_alloc(NULL, data.max_pages);
963 err = -ENOMEM;
964 if (!data.ia)
921 goto out; 965 goto out;
922 966
923 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); 967 err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
924 if (!err) { 968 if (!err) {
925 if (data.req->num_pages) 969 if (data.ia->ap.num_pages)
926 fuse_send_readpages(data.req, file); 970 fuse_send_readpages(data.ia, file);
927 else 971 else
928 fuse_put_request(fc, data.req); 972 fuse_io_free(data.ia);
929 } 973 }
930out: 974out:
931 return err; 975 return err;
@@ -952,54 +996,65 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
952 return generic_file_read_iter(iocb, to); 996 return generic_file_read_iter(iocb, to);
953} 997}
954 998
955static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, 999static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
956 loff_t pos, size_t count) 1000 loff_t pos, size_t count)
957{ 1001{
958 struct fuse_write_in *inarg = &req->misc.write.in; 1002 struct fuse_args *args = &ia->ap.args;
959 struct fuse_write_out *outarg = &req->misc.write.out;
960 1003
961 inarg->fh = ff->fh; 1004 ia->write.in.fh = ff->fh;
962 inarg->offset = pos; 1005 ia->write.in.offset = pos;
963 inarg->size = count; 1006 ia->write.in.size = count;
964 req->in.h.opcode = FUSE_WRITE; 1007 args->opcode = FUSE_WRITE;
965 req->in.h.nodeid = ff->nodeid; 1008 args->nodeid = ff->nodeid;
966 req->in.numargs = 2; 1009 args->in_numargs = 2;
967 if (ff->fc->minor < 9) 1010 if (ff->fc->minor < 9)
968 req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; 1011 args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
969 else 1012 else
970 req->in.args[0].size = sizeof(struct fuse_write_in); 1013 args->in_args[0].size = sizeof(ia->write.in);
971 req->in.args[0].value = inarg; 1014 args->in_args[0].value = &ia->write.in;
972 req->in.args[1].size = count; 1015 args->in_args[1].size = count;
973 req->out.numargs = 1; 1016 args->out_numargs = 1;
974 req->out.args[0].size = sizeof(struct fuse_write_out); 1017 args->out_args[0].size = sizeof(ia->write.out);
975 req->out.args[0].value = outarg; 1018 args->out_args[0].value = &ia->write.out;
976} 1019}
977 1020
978static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, 1021static unsigned int fuse_write_flags(struct kiocb *iocb)
979 loff_t pos, size_t count, fl_owner_t owner)
980{ 1022{
981 struct kiocb *iocb = io->iocb; 1023 unsigned int flags = iocb->ki_filp->f_flags;
1024
1025 if (iocb->ki_flags & IOCB_DSYNC)
1026 flags |= O_DSYNC;
1027 if (iocb->ki_flags & IOCB_SYNC)
1028 flags |= O_SYNC;
1029
1030 return flags;
1031}
1032
1033static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
1034 size_t count, fl_owner_t owner)
1035{
1036 struct kiocb *iocb = ia->io->iocb;
982 struct file *file = iocb->ki_filp; 1037 struct file *file = iocb->ki_filp;
983 struct fuse_file *ff = file->private_data; 1038 struct fuse_file *ff = file->private_data;
984 struct fuse_conn *fc = ff->fc; 1039 struct fuse_conn *fc = ff->fc;
985 struct fuse_write_in *inarg = &req->misc.write.in; 1040 struct fuse_write_in *inarg = &ia->write.in;
1041 ssize_t err;
986 1042
987 fuse_write_fill(req, ff, pos, count); 1043 fuse_write_args_fill(ia, ff, pos, count);
988 inarg->flags = file->f_flags; 1044 inarg->flags = fuse_write_flags(iocb);
989 if (iocb->ki_flags & IOCB_DSYNC)
990 inarg->flags |= O_DSYNC;
991 if (iocb->ki_flags & IOCB_SYNC)
992 inarg->flags |= O_SYNC;
993 if (owner != NULL) { 1045 if (owner != NULL) {
994 inarg->write_flags |= FUSE_WRITE_LOCKOWNER; 1046 inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
995 inarg->lock_owner = fuse_lock_owner_id(fc, owner); 1047 inarg->lock_owner = fuse_lock_owner_id(fc, owner);
996 } 1048 }
997 1049
998 if (io->async) 1050 if (ia->io->async)
999 return fuse_async_req_send(fc, req, count, io); 1051 return fuse_async_req_send(fc, ia, count);
1052
1053 err = fuse_simple_request(fc, &ia->ap.args);
1054 if (!err && ia->write.out.size > count)
1055 err = -EIO;
1000 1056
1001 fuse_request_send(fc, req); 1057 return err ?: ia->write.out.size;
1002 return req->misc.write.out.size;
1003} 1058}
1004 1059
1005bool fuse_write_update_size(struct inode *inode, loff_t pos) 1060bool fuse_write_update_size(struct inode *inode, loff_t pos)
@@ -1019,26 +1074,31 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos)
1019 return ret; 1074 return ret;
1020} 1075}
1021 1076
1022static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, 1077static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
1023 struct inode *inode, loff_t pos, 1078 struct kiocb *iocb, struct inode *inode,
1024 size_t count) 1079 loff_t pos, size_t count)
1025{ 1080{
1026 size_t res; 1081 struct fuse_args_pages *ap = &ia->ap;
1027 unsigned offset; 1082 struct file *file = iocb->ki_filp;
1028 unsigned i; 1083 struct fuse_file *ff = file->private_data;
1029 struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); 1084 struct fuse_conn *fc = ff->fc;
1085 unsigned int offset, i;
1086 int err;
1030 1087
1031 for (i = 0; i < req->num_pages; i++) 1088 for (i = 0; i < ap->num_pages; i++)
1032 fuse_wait_on_page_writeback(inode, req->pages[i]->index); 1089 fuse_wait_on_page_writeback(inode, ap->pages[i]->index);
1033 1090
1034 res = fuse_send_write(req, &io, pos, count, NULL); 1091 fuse_write_args_fill(ia, ff, pos, count);
1092 ia->write.in.flags = fuse_write_flags(iocb);
1035 1093
1036 offset = req->page_descs[0].offset; 1094 err = fuse_simple_request(fc, &ap->args);
1037 count = res;
1038 for (i = 0; i < req->num_pages; i++) {
1039 struct page *page = req->pages[i];
1040 1095
1041 if (!req->out.h.error && !offset && count >= PAGE_SIZE) 1096 offset = ap->descs[0].offset;
1097 count = ia->write.out.size;
1098 for (i = 0; i < ap->num_pages; i++) {
1099 struct page *page = ap->pages[i];
1100
1101 if (!err && !offset && count >= PAGE_SIZE)
1042 SetPageUptodate(page); 1102 SetPageUptodate(page);
1043 1103
1044 if (count > PAGE_SIZE - offset) 1104 if (count > PAGE_SIZE - offset)
@@ -1051,20 +1111,21 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb,
1051 put_page(page); 1111 put_page(page);
1052 } 1112 }
1053 1113
1054 return res; 1114 return err;
1055} 1115}
1056 1116
1057static ssize_t fuse_fill_write_pages(struct fuse_req *req, 1117static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
1058 struct address_space *mapping, 1118 struct address_space *mapping,
1059 struct iov_iter *ii, loff_t pos) 1119 struct iov_iter *ii, loff_t pos,
1120 unsigned int max_pages)
1060{ 1121{
1061 struct fuse_conn *fc = get_fuse_conn(mapping->host); 1122 struct fuse_conn *fc = get_fuse_conn(mapping->host);
1062 unsigned offset = pos & (PAGE_SIZE - 1); 1123 unsigned offset = pos & (PAGE_SIZE - 1);
1063 size_t count = 0; 1124 size_t count = 0;
1064 int err; 1125 int err;
1065 1126
1066 req->in.argpages = 1; 1127 ap->args.in_pages = true;
1067 req->page_descs[0].offset = offset; 1128 ap->descs[0].offset = offset;
1068 1129
1069 do { 1130 do {
1070 size_t tmp; 1131 size_t tmp;
@@ -1100,9 +1161,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
1100 } 1161 }
1101 1162
1102 err = 0; 1163 err = 0;
1103 req->pages[req->num_pages] = page; 1164 ap->pages[ap->num_pages] = page;
1104 req->page_descs[req->num_pages].length = tmp; 1165 ap->descs[ap->num_pages].length = tmp;
1105 req->num_pages++; 1166 ap->num_pages++;
1106 1167
1107 count += tmp; 1168 count += tmp;
1108 pos += tmp; 1169 pos += tmp;
@@ -1113,7 +1174,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
1113 if (!fc->big_writes) 1174 if (!fc->big_writes)
1114 break; 1175 break;
1115 } while (iov_iter_count(ii) && count < fc->max_write && 1176 } while (iov_iter_count(ii) && count < fc->max_write &&
1116 req->num_pages < req->max_pages && offset == 0); 1177 ap->num_pages < max_pages && offset == 0);
1117 1178
1118 return count > 0 ? count : err; 1179 return count > 0 ? count : err;
1119} 1180}
@@ -1141,27 +1202,27 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
1141 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 1202 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
1142 1203
1143 do { 1204 do {
1144 struct fuse_req *req;
1145 ssize_t count; 1205 ssize_t count;
1206 struct fuse_io_args ia = {};
1207 struct fuse_args_pages *ap = &ia.ap;
1146 unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), 1208 unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
1147 fc->max_pages); 1209 fc->max_pages);
1148 1210
1149 req = fuse_get_req(fc, nr_pages); 1211 ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
1150 if (IS_ERR(req)) { 1212 if (!ap->pages) {
1151 err = PTR_ERR(req); 1213 err = -ENOMEM;
1152 break; 1214 break;
1153 } 1215 }
1154 1216
1155 count = fuse_fill_write_pages(req, mapping, ii, pos); 1217 count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages);
1156 if (count <= 0) { 1218 if (count <= 0) {
1157 err = count; 1219 err = count;
1158 } else { 1220 } else {
1159 size_t num_written; 1221 err = fuse_send_write_pages(&ia, iocb, inode,
1160 1222 pos, count);
1161 num_written = fuse_send_write_pages(req, iocb, inode,
1162 pos, count);
1163 err = req->out.h.error;
1164 if (!err) { 1223 if (!err) {
1224 size_t num_written = ia.write.out.size;
1225
1165 res += num_written; 1226 res += num_written;
1166 pos += num_written; 1227 pos += num_written;
1167 1228
@@ -1170,7 +1231,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
1170 err = -EIO; 1231 err = -EIO;
1171 } 1232 }
1172 } 1233 }
1173 fuse_put_request(fc, req); 1234 kfree(ap->pages);
1174 } while (!err && iov_iter_count(ii)); 1235 } while (!err && iov_iter_count(ii));
1175 1236
1176 if (res > 0) 1237 if (res > 0)
@@ -1258,14 +1319,14 @@ out:
1258 return written ? written : err; 1319 return written ? written : err;
1259} 1320}
1260 1321
1261static inline void fuse_page_descs_length_init(struct fuse_req *req, 1322static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
1262 unsigned index, unsigned nr_pages) 1323 unsigned int index,
1324 unsigned int nr_pages)
1263{ 1325{
1264 int i; 1326 int i;
1265 1327
1266 for (i = index; i < index + nr_pages; i++) 1328 for (i = index; i < index + nr_pages; i++)
1267 req->page_descs[i].length = PAGE_SIZE - 1329 descs[i].length = PAGE_SIZE - descs[i].offset;
1268 req->page_descs[i].offset;
1269} 1330}
1270 1331
1271static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) 1332static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
@@ -1279,8 +1340,9 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
1279 return min(iov_iter_single_seg_count(ii), max_size); 1340 return min(iov_iter_single_seg_count(ii), max_size);
1280} 1341}
1281 1342
1282static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, 1343static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
1283 size_t *nbytesp, int write) 1344 size_t *nbytesp, int write,
1345 unsigned int max_pages)
1284{ 1346{
1285 size_t nbytes = 0; /* # bytes already packed in req */ 1347 size_t nbytes = 0; /* # bytes already packed in req */
1286 ssize_t ret = 0; 1348 ssize_t ret = 0;
@@ -1291,21 +1353,21 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1291 size_t frag_size = fuse_get_frag_size(ii, *nbytesp); 1353 size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
1292 1354
1293 if (write) 1355 if (write)
1294 req->in.args[1].value = (void *) user_addr; 1356 ap->args.in_args[1].value = (void *) user_addr;
1295 else 1357 else
1296 req->out.args[0].value = (void *) user_addr; 1358 ap->args.out_args[0].value = (void *) user_addr;
1297 1359
1298 iov_iter_advance(ii, frag_size); 1360 iov_iter_advance(ii, frag_size);
1299 *nbytesp = frag_size; 1361 *nbytesp = frag_size;
1300 return 0; 1362 return 0;
1301 } 1363 }
1302 1364
1303 while (nbytes < *nbytesp && req->num_pages < req->max_pages) { 1365 while (nbytes < *nbytesp && ap->num_pages < max_pages) {
1304 unsigned npages; 1366 unsigned npages;
1305 size_t start; 1367 size_t start;
1306 ret = iov_iter_get_pages(ii, &req->pages[req->num_pages], 1368 ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
1307 *nbytesp - nbytes, 1369 *nbytesp - nbytes,
1308 req->max_pages - req->num_pages, 1370 max_pages - ap->num_pages,
1309 &start); 1371 &start);
1310 if (ret < 0) 1372 if (ret < 0)
1311 break; 1373 break;
@@ -1316,18 +1378,18 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
1316 ret += start; 1378 ret += start;
1317 npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; 1379 npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
1318 1380
1319 req->page_descs[req->num_pages].offset = start; 1381 ap->descs[ap->num_pages].offset = start;
1320 fuse_page_descs_length_init(req, req->num_pages, npages); 1382 fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);
1321 1383
1322 req->num_pages += npages; 1384 ap->num_pages += npages;
1323 req->page_descs[req->num_pages - 1].length -= 1385 ap->descs[ap->num_pages - 1].length -=
1324 (PAGE_SIZE - ret) & (PAGE_SIZE - 1); 1386 (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
1325 } 1387 }
1326 1388
1327 if (write) 1389 if (write)
1328 req->in.argpages = 1; 1390 ap->args.in_pages = 1;
1329 else 1391 else
1330 req->out.argpages = 1; 1392 ap->args.out_pages = 1;
1331 1393
1332 *nbytesp = nbytes; 1394 *nbytesp = nbytes;
1333 1395
@@ -1349,17 +1411,16 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1349 pgoff_t idx_from = pos >> PAGE_SHIFT; 1411 pgoff_t idx_from = pos >> PAGE_SHIFT;
1350 pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT; 1412 pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
1351 ssize_t res = 0; 1413 ssize_t res = 0;
1352 struct fuse_req *req;
1353 int err = 0; 1414 int err = 0;
1415 struct fuse_io_args *ia;
1416 unsigned int max_pages;
1354 1417
1355 if (io->async) 1418 max_pages = iov_iter_npages(iter, fc->max_pages);
1356 req = fuse_get_req_for_background(fc, iov_iter_npages(iter, 1419 ia = fuse_io_alloc(io, max_pages);
1357 fc->max_pages)); 1420 if (!ia)
1358 else 1421 return -ENOMEM;
1359 req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages));
1360 if (IS_ERR(req))
1361 return PTR_ERR(req);
1362 1422
1423 ia->io = io;
1363 if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { 1424 if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
1364 if (!write) 1425 if (!write)
1365 inode_lock(inode); 1426 inode_lock(inode);
@@ -1370,54 +1431,49 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
1370 1431
1371 io->should_dirty = !write && iter_is_iovec(iter); 1432 io->should_dirty = !write && iter_is_iovec(iter);
1372 while (count) { 1433 while (count) {
1373 size_t nres; 1434 ssize_t nres;
1374 fl_owner_t owner = current->files; 1435 fl_owner_t owner = current->files;
1375 size_t nbytes = min(count, nmax); 1436 size_t nbytes = min(count, nmax);
1376 err = fuse_get_user_pages(req, iter, &nbytes, write); 1437
1438 err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
1439 max_pages);
1377 if (err && !nbytes) 1440 if (err && !nbytes)
1378 break; 1441 break;
1379 1442
1380 if (write) { 1443 if (write) {
1381 if (!capable(CAP_FSETID)) { 1444 if (!capable(CAP_FSETID))
1382 struct fuse_write_in *inarg; 1445 ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV;
1383 1446
1384 inarg = &req->misc.write.in; 1447 nres = fuse_send_write(ia, pos, nbytes, owner);
1385 inarg->write_flags |= FUSE_WRITE_KILL_PRIV;
1386 }
1387 nres = fuse_send_write(req, io, pos, nbytes, owner);
1388 } else { 1448 } else {
1389 nres = fuse_send_read(req, io, pos, nbytes, owner); 1449 nres = fuse_send_read(ia, pos, nbytes, owner);
1390 } 1450 }
1391 1451
1392 if (!io->async) 1452 if (!io->async || nres < 0) {
1393 fuse_release_user_pages(req, io->should_dirty); 1453 fuse_release_user_pages(&ia->ap, io->should_dirty);
1394 if (req->out.h.error) { 1454 fuse_io_free(ia);
1395 err = req->out.h.error; 1455 }
1396 break; 1456 ia = NULL;
1397 } else if (nres > nbytes) { 1457 if (nres < 0) {
1398 res = 0; 1458 err = nres;
1399 err = -EIO;
1400 break; 1459 break;
1401 } 1460 }
1461 WARN_ON(nres > nbytes);
1462
1402 count -= nres; 1463 count -= nres;
1403 res += nres; 1464 res += nres;
1404 pos += nres; 1465 pos += nres;
1405 if (nres != nbytes) 1466 if (nres != nbytes)
1406 break; 1467 break;
1407 if (count) { 1468 if (count) {
1408 fuse_put_request(fc, req); 1469 max_pages = iov_iter_npages(iter, fc->max_pages);
1409 if (io->async) 1470 ia = fuse_io_alloc(io, max_pages);
1410 req = fuse_get_req_for_background(fc, 1471 if (!ia)
1411 iov_iter_npages(iter, fc->max_pages));
1412 else
1413 req = fuse_get_req(fc, iov_iter_npages(iter,
1414 fc->max_pages));
1415 if (IS_ERR(req))
1416 break; 1472 break;
1417 } 1473 }
1418 } 1474 }
1419 if (!IS_ERR(req)) 1475 if (ia)
1420 fuse_put_request(fc, req); 1476 fuse_io_free(ia);
1421 if (res > 0) 1477 if (res > 0)
1422 *ppos = pos; 1478 *ppos = pos;
1423 1479
@@ -1509,45 +1565,53 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1509 return fuse_direct_write_iter(iocb, from); 1565 return fuse_direct_write_iter(iocb, from);
1510} 1566}
1511 1567
1512static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) 1568static void fuse_writepage_free(struct fuse_writepage_args *wpa)
1513{ 1569{
1570 struct fuse_args_pages *ap = &wpa->ia.ap;
1514 int i; 1571 int i;
1515 1572
1516 for (i = 0; i < req->num_pages; i++) 1573 for (i = 0; i < ap->num_pages; i++)
1517 __free_page(req->pages[i]); 1574 __free_page(ap->pages[i]);
1575
1576 if (wpa->ia.ff)
1577 fuse_file_put(wpa->ia.ff, false, false);
1518 1578
1519 if (req->ff) 1579 kfree(ap->pages);
1520 fuse_file_put(req->ff, false, false); 1580 kfree(wpa);
1521} 1581}
1522 1582
1523static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) 1583static void fuse_writepage_finish(struct fuse_conn *fc,
1584 struct fuse_writepage_args *wpa)
1524{ 1585{
1525 struct inode *inode = req->inode; 1586 struct fuse_args_pages *ap = &wpa->ia.ap;
1587 struct inode *inode = wpa->inode;
1526 struct fuse_inode *fi = get_fuse_inode(inode); 1588 struct fuse_inode *fi = get_fuse_inode(inode);
1527 struct backing_dev_info *bdi = inode_to_bdi(inode); 1589 struct backing_dev_info *bdi = inode_to_bdi(inode);
1528 int i; 1590 int i;
1529 1591
1530 list_del(&req->writepages_entry); 1592 list_del(&wpa->writepages_entry);
1531 for (i = 0; i < req->num_pages; i++) { 1593 for (i = 0; i < ap->num_pages; i++) {
1532 dec_wb_stat(&bdi->wb, WB_WRITEBACK); 1594 dec_wb_stat(&bdi->wb, WB_WRITEBACK);
1533 dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP); 1595 dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
1534 wb_writeout_inc(&bdi->wb); 1596 wb_writeout_inc(&bdi->wb);
1535 } 1597 }
1536 wake_up(&fi->page_waitq); 1598 wake_up(&fi->page_waitq);
1537} 1599}
1538 1600
1539/* Called under fi->lock, may release and reacquire it */ 1601/* Called under fi->lock, may release and reacquire it */
1540static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, 1602static void fuse_send_writepage(struct fuse_conn *fc,
1541 loff_t size) 1603 struct fuse_writepage_args *wpa, loff_t size)
1542__releases(fi->lock) 1604__releases(fi->lock)
1543__acquires(fi->lock) 1605__acquires(fi->lock)
1544{ 1606{
1545 struct fuse_req *aux, *next; 1607 struct fuse_writepage_args *aux, *next;
1546 struct fuse_inode *fi = get_fuse_inode(req->inode); 1608 struct fuse_inode *fi = get_fuse_inode(wpa->inode);
1547 struct fuse_write_in *inarg = &req->misc.write.in; 1609 struct fuse_write_in *inarg = &wpa->ia.write.in;
1548 __u64 data_size = req->num_pages * PAGE_SIZE; 1610 struct fuse_args *args = &wpa->ia.ap.args;
1549 bool queued; 1611 __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
1612 int err;
1550 1613
1614 fi->writectr++;
1551 if (inarg->offset + data_size <= size) { 1615 if (inarg->offset + data_size <= size) {
1552 inarg->size = data_size; 1616 inarg->size = data_size;
1553 } else if (inarg->offset < size) { 1617 } else if (inarg->offset < size) {
@@ -1557,29 +1621,36 @@ __acquires(fi->lock)
1557 goto out_free; 1621 goto out_free;
1558 } 1622 }
1559 1623
1560 req->in.args[1].size = inarg->size; 1624 args->in_args[1].size = inarg->size;
1561 queued = fuse_request_queue_background(fc, req); 1625 args->force = true;
1626 args->nocreds = true;
1627
1628 err = fuse_simple_background(fc, args, GFP_ATOMIC);
1629 if (err == -ENOMEM) {
1630 spin_unlock(&fi->lock);
1631 err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL);
1632 spin_lock(&fi->lock);
1633 }
1634
1562 /* Fails on broken connection only */ 1635 /* Fails on broken connection only */
1563 if (unlikely(!queued)) 1636 if (unlikely(err))
1564 goto out_free; 1637 goto out_free;
1565 1638
1566 fi->writectr++;
1567 return; 1639 return;
1568 1640
1569 out_free: 1641 out_free:
1570 fuse_writepage_finish(fc, req); 1642 fi->writectr--;
1643 fuse_writepage_finish(fc, wpa);
1571 spin_unlock(&fi->lock); 1644 spin_unlock(&fi->lock);
1572 1645
1573 /* After fuse_writepage_finish() aux request list is private */ 1646 /* After fuse_writepage_finish() aux request list is private */
1574 for (aux = req->misc.write.next; aux; aux = next) { 1647 for (aux = wpa->next; aux; aux = next) {
1575 next = aux->misc.write.next; 1648 next = aux->next;
1576 aux->misc.write.next = NULL; 1649 aux->next = NULL;
1577 fuse_writepage_free(fc, aux); 1650 fuse_writepage_free(aux);
1578 fuse_put_request(fc, aux);
1579 } 1651 }
1580 1652
1581 fuse_writepage_free(fc, req); 1653 fuse_writepage_free(wpa);
1582 fuse_put_request(fc, req);
1583 spin_lock(&fi->lock); 1654 spin_lock(&fi->lock);
1584} 1655}
1585 1656
@@ -1596,29 +1667,34 @@ __acquires(fi->lock)
1596 struct fuse_conn *fc = get_fuse_conn(inode); 1667 struct fuse_conn *fc = get_fuse_conn(inode);
1597 struct fuse_inode *fi = get_fuse_inode(inode); 1668 struct fuse_inode *fi = get_fuse_inode(inode);
1598 loff_t crop = i_size_read(inode); 1669 loff_t crop = i_size_read(inode);
1599 struct fuse_req *req; 1670 struct fuse_writepage_args *wpa;
1600 1671
1601 while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { 1672 while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
1602 req = list_entry(fi->queued_writes.next, struct fuse_req, list); 1673 wpa = list_entry(fi->queued_writes.next,
1603 list_del_init(&req->list); 1674 struct fuse_writepage_args, queue_entry);
1604 fuse_send_writepage(fc, req, crop); 1675 list_del_init(&wpa->queue_entry);
1676 fuse_send_writepage(fc, wpa, crop);
1605 } 1677 }
1606} 1678}
1607 1679
1608static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) 1680static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
1681 int error)
1609{ 1682{
1610 struct inode *inode = req->inode; 1683 struct fuse_writepage_args *wpa =
1684 container_of(args, typeof(*wpa), ia.ap.args);
1685 struct inode *inode = wpa->inode;
1611 struct fuse_inode *fi = get_fuse_inode(inode); 1686 struct fuse_inode *fi = get_fuse_inode(inode);
1612 1687
1613 mapping_set_error(inode->i_mapping, req->out.h.error); 1688 mapping_set_error(inode->i_mapping, error);
1614 spin_lock(&fi->lock); 1689 spin_lock(&fi->lock);
1615 while (req->misc.write.next) { 1690 while (wpa->next) {
1616 struct fuse_conn *fc = get_fuse_conn(inode); 1691 struct fuse_conn *fc = get_fuse_conn(inode);
1617 struct fuse_write_in *inarg = &req->misc.write.in; 1692 struct fuse_write_in *inarg = &wpa->ia.write.in;
1618 struct fuse_req *next = req->misc.write.next; 1693 struct fuse_writepage_args *next = wpa->next;
1619 req->misc.write.next = next->misc.write.next; 1694
1620 next->misc.write.next = NULL; 1695 wpa->next = next->next;
1621 next->ff = fuse_file_get(req->ff); 1696 next->next = NULL;
1697 next->ia.ff = fuse_file_get(wpa->ia.ff);
1622 list_add(&next->writepages_entry, &fi->writepages); 1698 list_add(&next->writepages_entry, &fi->writepages);
1623 1699
1624 /* 1700 /*
@@ -1647,9 +1723,9 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
1647 fuse_send_writepage(fc, next, inarg->offset + inarg->size); 1723 fuse_send_writepage(fc, next, inarg->offset + inarg->size);
1648 } 1724 }
1649 fi->writectr--; 1725 fi->writectr--;
1650 fuse_writepage_finish(fc, req); 1726 fuse_writepage_finish(fc, wpa);
1651 spin_unlock(&fi->lock); 1727 spin_unlock(&fi->lock);
1652 fuse_writepage_free(fc, req); 1728 fuse_writepage_free(wpa);
1653} 1729}
1654 1730
1655static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, 1731static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
@@ -1691,52 +1767,71 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
1691 return err; 1767 return err;
1692} 1768}
1693 1769
1770static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
1771{
1772 struct fuse_writepage_args *wpa;
1773 struct fuse_args_pages *ap;
1774
1775 wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
1776 if (wpa) {
1777 ap = &wpa->ia.ap;
1778 ap->num_pages = 0;
1779 ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
1780 if (!ap->pages) {
1781 kfree(wpa);
1782 wpa = NULL;
1783 }
1784 }
1785 return wpa;
1786
1787}
1788
1694static int fuse_writepage_locked(struct page *page) 1789static int fuse_writepage_locked(struct page *page)
1695{ 1790{
1696 struct address_space *mapping = page->mapping; 1791 struct address_space *mapping = page->mapping;
1697 struct inode *inode = mapping->host; 1792 struct inode *inode = mapping->host;
1698 struct fuse_conn *fc = get_fuse_conn(inode); 1793 struct fuse_conn *fc = get_fuse_conn(inode);
1699 struct fuse_inode *fi = get_fuse_inode(inode); 1794 struct fuse_inode *fi = get_fuse_inode(inode);
1700 struct fuse_req *req; 1795 struct fuse_writepage_args *wpa;
1796 struct fuse_args_pages *ap;
1701 struct page *tmp_page; 1797 struct page *tmp_page;
1702 int error = -ENOMEM; 1798 int error = -ENOMEM;
1703 1799
1704 set_page_writeback(page); 1800 set_page_writeback(page);
1705 1801
1706 req = fuse_request_alloc_nofs(1); 1802 wpa = fuse_writepage_args_alloc();
1707 if (!req) 1803 if (!wpa)
1708 goto err; 1804 goto err;
1805 ap = &wpa->ia.ap;
1709 1806
1710 /* writeback always goes to bg_queue */
1711 __set_bit(FR_BACKGROUND, &req->flags);
1712 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 1807 tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1713 if (!tmp_page) 1808 if (!tmp_page)
1714 goto err_free; 1809 goto err_free;
1715 1810
1716 error = -EIO; 1811 error = -EIO;
1717 req->ff = fuse_write_file_get(fc, fi); 1812 wpa->ia.ff = fuse_write_file_get(fc, fi);
1718 if (!req->ff) 1813 if (!wpa->ia.ff)
1719 goto err_nofile; 1814 goto err_nofile;
1720 1815
1721 fuse_write_fill(req, req->ff, page_offset(page), 0); 1816 fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
1722 1817
1723 copy_highpage(tmp_page, page); 1818 copy_highpage(tmp_page, page);
1724 req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; 1819 wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
1725 req->misc.write.next = NULL; 1820 wpa->next = NULL;
1726 req->in.argpages = 1; 1821 ap->args.in_pages = true;
1727 req->num_pages = 1; 1822 ap->num_pages = 1;
1728 req->pages[0] = tmp_page; 1823 ap->pages[0] = tmp_page;
1729 req->page_descs[0].offset = 0; 1824 ap->descs[0].offset = 0;
1730 req->page_descs[0].length = PAGE_SIZE; 1825 ap->descs[0].length = PAGE_SIZE;
1731 req->end = fuse_writepage_end; 1826 ap->args.end = fuse_writepage_end;
1732 req->inode = inode; 1827 wpa->inode = inode;
1733 1828
1734 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); 1829 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
1735 inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); 1830 inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
1736 1831
1737 spin_lock(&fi->lock); 1832 spin_lock(&fi->lock);
1738 list_add(&req->writepages_entry, &fi->writepages); 1833 list_add(&wpa->writepages_entry, &fi->writepages);
1739 list_add_tail(&req->list, &fi->queued_writes); 1834 list_add_tail(&wpa->queue_entry, &fi->queued_writes);
1740 fuse_flush_writepages(inode); 1835 fuse_flush_writepages(inode);
1741 spin_unlock(&fi->lock); 1836 spin_unlock(&fi->lock);
1742 1837
@@ -1747,7 +1842,7 @@ static int fuse_writepage_locked(struct page *page)
1747err_nofile: 1842err_nofile:
1748 __free_page(tmp_page); 1843 __free_page(tmp_page);
1749err_free: 1844err_free:
1750 fuse_request_free(req); 1845 kfree(wpa);
1751err: 1846err:
1752 mapping_set_error(page->mapping, error); 1847 mapping_set_error(page->mapping, error);
1753 end_page_writeback(page); 1848 end_page_writeback(page);
@@ -1767,6 +1862,7 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc)
1767 WARN_ON(wbc->sync_mode == WB_SYNC_ALL); 1862 WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
1768 1863
1769 redirty_page_for_writepage(wbc, page); 1864 redirty_page_for_writepage(wbc, page);
1865 unlock_page(page);
1770 return 0; 1866 return 0;
1771 } 1867 }
1772 1868
@@ -1777,23 +1873,50 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc)
1777} 1873}
1778 1874
1779struct fuse_fill_wb_data { 1875struct fuse_fill_wb_data {
1780 struct fuse_req *req; 1876 struct fuse_writepage_args *wpa;
1781 struct fuse_file *ff; 1877 struct fuse_file *ff;
1782 struct inode *inode; 1878 struct inode *inode;
1783 struct page **orig_pages; 1879 struct page **orig_pages;
1880 unsigned int max_pages;
1784}; 1881};
1785 1882
1883static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
1884{
1885 struct fuse_args_pages *ap = &data->wpa->ia.ap;
1886 struct fuse_conn *fc = get_fuse_conn(data->inode);
1887 struct page **pages;
1888 struct fuse_page_desc *descs;
1889 unsigned int npages = min_t(unsigned int,
1890 max_t(unsigned int, data->max_pages * 2,
1891 FUSE_DEFAULT_MAX_PAGES_PER_REQ),
1892 fc->max_pages);
1893 WARN_ON(npages <= data->max_pages);
1894
1895 pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
1896 if (!pages)
1897 return false;
1898
1899 memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
1900 memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
1901 kfree(ap->pages);
1902 ap->pages = pages;
1903 ap->descs = descs;
1904 data->max_pages = npages;
1905
1906 return true;
1907}
1908
1786static void fuse_writepages_send(struct fuse_fill_wb_data *data) 1909static void fuse_writepages_send(struct fuse_fill_wb_data *data)
1787{ 1910{
1788 struct fuse_req *req = data->req; 1911 struct fuse_writepage_args *wpa = data->wpa;
1789 struct inode *inode = data->inode; 1912 struct inode *inode = data->inode;
1790 struct fuse_inode *fi = get_fuse_inode(inode); 1913 struct fuse_inode *fi = get_fuse_inode(inode);
1791 int num_pages = req->num_pages; 1914 int num_pages = wpa->ia.ap.num_pages;
1792 int i; 1915 int i;
1793 1916
1794 req->ff = fuse_file_get(data->ff); 1917 wpa->ia.ff = fuse_file_get(data->ff);
1795 spin_lock(&fi->lock); 1918 spin_lock(&fi->lock);
1796 list_add_tail(&req->list, &fi->queued_writes); 1919 list_add_tail(&wpa->queue_entry, &fi->queued_writes);
1797 fuse_flush_writepages(inode); 1920 fuse_flush_writepages(inode);
1798 spin_unlock(&fi->lock); 1921 spin_unlock(&fi->lock);
1799 1922
@@ -1808,54 +1931,52 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data)
1808 * this new request onto the auxiliary list, otherwise reuse the existing one by 1931 * this new request onto the auxiliary list, otherwise reuse the existing one by
1809 * copying the new page contents over to the old temporary page. 1932 * copying the new page contents over to the old temporary page.
1810 */ 1933 */
1811static bool fuse_writepage_in_flight(struct fuse_req *new_req, 1934static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
1812 struct page *page) 1935 struct page *page)
1813{ 1936{
1814 struct fuse_conn *fc = get_fuse_conn(new_req->inode); 1937 struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
1815 struct fuse_inode *fi = get_fuse_inode(new_req->inode); 1938 struct fuse_writepage_args *tmp;
1816 struct fuse_req *tmp; 1939 struct fuse_writepage_args *old_wpa;
1817 struct fuse_req *old_req; 1940 struct fuse_args_pages *new_ap = &new_wpa->ia.ap;
1818 1941
1819 WARN_ON(new_req->num_pages != 0); 1942 WARN_ON(new_ap->num_pages != 0);
1820 1943
1821 spin_lock(&fi->lock); 1944 spin_lock(&fi->lock);
1822 list_del(&new_req->writepages_entry); 1945 list_del(&new_wpa->writepages_entry);
1823 old_req = fuse_find_writeback(fi, page->index, page->index); 1946 old_wpa = fuse_find_writeback(fi, page->index, page->index);
1824 if (!old_req) { 1947 if (!old_wpa) {
1825 list_add(&new_req->writepages_entry, &fi->writepages); 1948 list_add(&new_wpa->writepages_entry, &fi->writepages);
1826 spin_unlock(&fi->lock); 1949 spin_unlock(&fi->lock);
1827 return false; 1950 return false;
1828 } 1951 }
1829 1952
1830 new_req->num_pages = 1; 1953 new_ap->num_pages = 1;
1831 for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) { 1954 for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
1832 pgoff_t curr_index; 1955 pgoff_t curr_index;
1833 1956
1834 WARN_ON(tmp->inode != new_req->inode); 1957 WARN_ON(tmp->inode != new_wpa->inode);
1835 curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT; 1958 curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
1836 if (curr_index == page->index) { 1959 if (curr_index == page->index) {
1837 WARN_ON(tmp->num_pages != 1); 1960 WARN_ON(tmp->ia.ap.num_pages != 1);
1838 WARN_ON(!test_bit(FR_PENDING, &tmp->flags)); 1961 swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
1839 swap(tmp->pages[0], new_req->pages[0]);
1840 break; 1962 break;
1841 } 1963 }
1842 } 1964 }
1843 1965
1844 if (!tmp) { 1966 if (!tmp) {
1845 new_req->misc.write.next = old_req->misc.write.next; 1967 new_wpa->next = old_wpa->next;
1846 old_req->misc.write.next = new_req; 1968 old_wpa->next = new_wpa;
1847 } 1969 }
1848 1970
1849 spin_unlock(&fi->lock); 1971 spin_unlock(&fi->lock);
1850 1972
1851 if (tmp) { 1973 if (tmp) {
1852 struct backing_dev_info *bdi = inode_to_bdi(new_req->inode); 1974 struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);
1853 1975
1854 dec_wb_stat(&bdi->wb, WB_WRITEBACK); 1976 dec_wb_stat(&bdi->wb, WB_WRITEBACK);
1855 dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); 1977 dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
1856 wb_writeout_inc(&bdi->wb); 1978 wb_writeout_inc(&bdi->wb);
1857 fuse_writepage_free(fc, new_req); 1979 fuse_writepage_free(new_wpa);
1858 fuse_request_free(new_req);
1859 } 1980 }
1860 1981
1861 return true; 1982 return true;
@@ -1865,7 +1986,8 @@ static int fuse_writepages_fill(struct page *page,
1865 struct writeback_control *wbc, void *_data) 1986 struct writeback_control *wbc, void *_data)
1866{ 1987{
1867 struct fuse_fill_wb_data *data = _data; 1988 struct fuse_fill_wb_data *data = _data;
1868 struct fuse_req *req = data->req; 1989 struct fuse_writepage_args *wpa = data->wpa;
1990 struct fuse_args_pages *ap = &wpa->ia.ap;
1869 struct inode *inode = data->inode; 1991 struct inode *inode = data->inode;
1870 struct fuse_inode *fi = get_fuse_inode(inode); 1992 struct fuse_inode *fi = get_fuse_inode(inode);
1871 struct fuse_conn *fc = get_fuse_conn(inode); 1993 struct fuse_conn *fc = get_fuse_conn(inode);
@@ -1888,16 +2010,16 @@ static int fuse_writepages_fill(struct page *page,
1888 */ 2010 */
1889 is_writeback = fuse_page_is_writeback(inode, page->index); 2011 is_writeback = fuse_page_is_writeback(inode, page->index);
1890 2012
1891 if (req && req->num_pages && 2013 if (wpa && ap->num_pages &&
1892 (is_writeback || req->num_pages == fc->max_pages || 2014 (is_writeback || ap->num_pages == fc->max_pages ||
1893 (req->num_pages + 1) * PAGE_SIZE > fc->max_write || 2015 (ap->num_pages + 1) * PAGE_SIZE > fc->max_write ||
1894 data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { 2016 data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) {
1895 fuse_writepages_send(data); 2017 fuse_writepages_send(data);
1896 data->req = NULL; 2018 data->wpa = NULL;
1897 } else if (req && req->num_pages == req->max_pages) { 2019 } else if (wpa && ap->num_pages == data->max_pages) {
1898 if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) { 2020 if (!fuse_pages_realloc(data)) {
1899 fuse_writepages_send(data); 2021 fuse_writepages_send(data);
1900 req = data->req = NULL; 2022 data->wpa = NULL;
1901 } 2023 }
1902 } 2024 }
1903 2025
@@ -1915,59 +2037,60 @@ static int fuse_writepages_fill(struct page *page,
1915 * This is ensured by holding the page lock in page_mkwrite() while 2037 * This is ensured by holding the page lock in page_mkwrite() while
1916 * checking fuse_page_is_writeback(). We already hold the page lock 2038 * checking fuse_page_is_writeback(). We already hold the page lock
1917 * since clear_page_dirty_for_io() and keep it held until we add the 2039 * since clear_page_dirty_for_io() and keep it held until we add the
1918 * request to the fi->writepages list and increment req->num_pages. 2040 * request to the fi->writepages list and increment ap->num_pages.
1919 * After this fuse_page_is_writeback() will indicate that the page is 2041 * After this fuse_page_is_writeback() will indicate that the page is
1920 * under writeback, so we can release the page lock. 2042 * under writeback, so we can release the page lock.
1921 */ 2043 */
1922 if (data->req == NULL) { 2044 if (data->wpa == NULL) {
1923 struct fuse_inode *fi = get_fuse_inode(inode); 2045 struct fuse_inode *fi = get_fuse_inode(inode);
1924 2046
1925 err = -ENOMEM; 2047 err = -ENOMEM;
1926 req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES); 2048 wpa = fuse_writepage_args_alloc();
1927 if (!req) { 2049 if (!wpa) {
1928 __free_page(tmp_page); 2050 __free_page(tmp_page);
1929 goto out_unlock; 2051 goto out_unlock;
1930 } 2052 }
2053 data->max_pages = 1;
1931 2054
1932 fuse_write_fill(req, data->ff, page_offset(page), 0); 2055 ap = &wpa->ia.ap;
1933 req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; 2056 fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0);
1934 req->misc.write.next = NULL; 2057 wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
1935 req->in.argpages = 1; 2058 wpa->next = NULL;
1936 __set_bit(FR_BACKGROUND, &req->flags); 2059 ap->args.in_pages = true;
1937 req->num_pages = 0; 2060 ap->args.end = fuse_writepage_end;
1938 req->end = fuse_writepage_end; 2061 ap->num_pages = 0;
1939 req->inode = inode; 2062 wpa->inode = inode;
1940 2063
1941 spin_lock(&fi->lock); 2064 spin_lock(&fi->lock);
1942 list_add(&req->writepages_entry, &fi->writepages); 2065 list_add(&wpa->writepages_entry, &fi->writepages);
1943 spin_unlock(&fi->lock); 2066 spin_unlock(&fi->lock);
1944 2067
1945 data->req = req; 2068 data->wpa = wpa;
1946 } 2069 }
1947 set_page_writeback(page); 2070 set_page_writeback(page);
1948 2071
1949 copy_highpage(tmp_page, page); 2072 copy_highpage(tmp_page, page);
1950 req->pages[req->num_pages] = tmp_page; 2073 ap->pages[ap->num_pages] = tmp_page;
1951 req->page_descs[req->num_pages].offset = 0; 2074 ap->descs[ap->num_pages].offset = 0;
1952 req->page_descs[req->num_pages].length = PAGE_SIZE; 2075 ap->descs[ap->num_pages].length = PAGE_SIZE;
1953 2076
1954 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); 2077 inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
1955 inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); 2078 inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
1956 2079
1957 err = 0; 2080 err = 0;
1958 if (is_writeback && fuse_writepage_in_flight(req, page)) { 2081 if (is_writeback && fuse_writepage_in_flight(wpa, page)) {
1959 end_page_writeback(page); 2082 end_page_writeback(page);
1960 data->req = NULL; 2083 data->wpa = NULL;
1961 goto out_unlock; 2084 goto out_unlock;
1962 } 2085 }
1963 data->orig_pages[req->num_pages] = page; 2086 data->orig_pages[ap->num_pages] = page;
1964 2087
1965 /* 2088 /*
1966 * Protected by fi->lock against concurrent access by 2089 * Protected by fi->lock against concurrent access by
1967 * fuse_page_is_writeback(). 2090 * fuse_page_is_writeback().
1968 */ 2091 */
1969 spin_lock(&fi->lock); 2092 spin_lock(&fi->lock);
1970 req->num_pages++; 2093 ap->num_pages++;
1971 spin_unlock(&fi->lock); 2094 spin_unlock(&fi->lock);
1972 2095
1973out_unlock: 2096out_unlock:
@@ -1989,7 +2112,7 @@ static int fuse_writepages(struct address_space *mapping,
1989 goto out; 2112 goto out;
1990 2113
1991 data.inode = inode; 2114 data.inode = inode;
1992 data.req = NULL; 2115 data.wpa = NULL;
1993 data.ff = NULL; 2116 data.ff = NULL;
1994 2117
1995 err = -ENOMEM; 2118 err = -ENOMEM;
@@ -2000,9 +2123,9 @@ static int fuse_writepages(struct address_space *mapping,
2000 goto out; 2123 goto out;
2001 2124
2002 err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); 2125 err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
2003 if (data.req) { 2126 if (data.wpa) {
2004 /* Ignore errors if we can write at least one page */ 2127 /* Ignore errors if we can write at least one page */
2005 BUG_ON(!data.req->num_pages); 2128 WARN_ON(!data.wpa->ia.ap.num_pages);
2006 fuse_writepages_send(&data); 2129 fuse_writepages_send(&data);
2007 err = 0; 2130 err = 0;
2008 } 2131 }
@@ -2222,11 +2345,11 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file,
2222 inarg->lk.pid = pid; 2345 inarg->lk.pid = pid;
2223 if (flock) 2346 if (flock)
2224 inarg->lk_flags |= FUSE_LK_FLOCK; 2347 inarg->lk_flags |= FUSE_LK_FLOCK;
2225 args->in.h.opcode = opcode; 2348 args->opcode = opcode;
2226 args->in.h.nodeid = get_node_id(inode); 2349 args->nodeid = get_node_id(inode);
2227 args->in.numargs = 1; 2350 args->in_numargs = 1;
2228 args->in.args[0].size = sizeof(*inarg); 2351 args->in_args[0].size = sizeof(*inarg);
2229 args->in.args[0].value = inarg; 2352 args->in_args[0].value = inarg;
2230} 2353}
2231 2354
2232static int fuse_getlk(struct file *file, struct file_lock *fl) 2355static int fuse_getlk(struct file *file, struct file_lock *fl)
@@ -2239,9 +2362,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl)
2239 int err; 2362 int err;
2240 2363
2241 fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); 2364 fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
2242 args.out.numargs = 1; 2365 args.out_numargs = 1;
2243 args.out.args[0].size = sizeof(outarg); 2366 args.out_args[0].size = sizeof(outarg);
2244 args.out.args[0].value = &outarg; 2367 args.out_args[0].value = &outarg;
2245 err = fuse_simple_request(fc, &args); 2368 err = fuse_simple_request(fc, &args);
2246 if (!err) 2369 if (!err)
2247 err = convert_fuse_file_lock(fc, &outarg.lk, fl); 2370 err = convert_fuse_file_lock(fc, &outarg.lk, fl);
@@ -2336,14 +2459,14 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
2336 memset(&inarg, 0, sizeof(inarg)); 2459 memset(&inarg, 0, sizeof(inarg));
2337 inarg.block = block; 2460 inarg.block = block;
2338 inarg.blocksize = inode->i_sb->s_blocksize; 2461 inarg.blocksize = inode->i_sb->s_blocksize;
2339 args.in.h.opcode = FUSE_BMAP; 2462 args.opcode = FUSE_BMAP;
2340 args.in.h.nodeid = get_node_id(inode); 2463 args.nodeid = get_node_id(inode);
2341 args.in.numargs = 1; 2464 args.in_numargs = 1;
2342 args.in.args[0].size = sizeof(inarg); 2465 args.in_args[0].size = sizeof(inarg);
2343 args.in.args[0].value = &inarg; 2466 args.in_args[0].value = &inarg;
2344 args.out.numargs = 1; 2467 args.out_numargs = 1;
2345 args.out.args[0].size = sizeof(outarg); 2468 args.out_args[0].size = sizeof(outarg);
2346 args.out.args[0].value = &outarg; 2469 args.out_args[0].value = &outarg;
2347 err = fuse_simple_request(fc, &args); 2470 err = fuse_simple_request(fc, &args);
2348 if (err == -ENOSYS) 2471 if (err == -ENOSYS)
2349 fc->no_bmap = 1; 2472 fc->no_bmap = 1;
@@ -2368,14 +2491,14 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
2368 if (fc->no_lseek) 2491 if (fc->no_lseek)
2369 goto fallback; 2492 goto fallback;
2370 2493
2371 args.in.h.opcode = FUSE_LSEEK; 2494 args.opcode = FUSE_LSEEK;
2372 args.in.h.nodeid = ff->nodeid; 2495 args.nodeid = ff->nodeid;
2373 args.in.numargs = 1; 2496 args.in_numargs = 1;
2374 args.in.args[0].size = sizeof(inarg); 2497 args.in_args[0].size = sizeof(inarg);
2375 args.in.args[0].value = &inarg; 2498 args.in_args[0].value = &inarg;
2376 args.out.numargs = 1; 2499 args.out_numargs = 1;
2377 args.out.args[0].size = sizeof(outarg); 2500 args.out_args[0].size = sizeof(outarg);
2378 args.out.args[0].value = &outarg; 2501 args.out_args[0].value = &outarg;
2379 err = fuse_simple_request(fc, &args); 2502 err = fuse_simple_request(fc, &args);
2380 if (err) { 2503 if (err) {
2381 if (err == -ENOSYS) { 2504 if (err == -ENOSYS) {
@@ -2573,14 +2696,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
2573 .flags = flags 2696 .flags = flags
2574 }; 2697 };
2575 struct fuse_ioctl_out outarg; 2698 struct fuse_ioctl_out outarg;
2576 struct fuse_req *req = NULL;
2577 struct page **pages = NULL;
2578 struct iovec *iov_page = NULL; 2699 struct iovec *iov_page = NULL;
2579 struct iovec *in_iov = NULL, *out_iov = NULL; 2700 struct iovec *in_iov = NULL, *out_iov = NULL;
2580 unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; 2701 unsigned int in_iovs = 0, out_iovs = 0, max_pages;
2581 size_t in_size, out_size, transferred, c; 2702 size_t in_size, out_size, c;
2703 ssize_t transferred;
2582 int err, i; 2704 int err, i;
2583 struct iov_iter ii; 2705 struct iov_iter ii;
2706 struct fuse_args_pages ap = {};
2584 2707
2585#if BITS_PER_LONG == 32 2708#if BITS_PER_LONG == 32
2586 inarg.flags |= FUSE_IOCTL_32BIT; 2709 inarg.flags |= FUSE_IOCTL_32BIT;
@@ -2598,11 +2721,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
2598 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); 2721 BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
2599 2722
2600 err = -ENOMEM; 2723 err = -ENOMEM;
2601 pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL); 2724 ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
2602 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); 2725 iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
2603 if (!pages || !iov_page) 2726 if (!ap.pages || !iov_page)
2604 goto out; 2727 goto out;
2605 2728
2729 fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);
2730
2606 /* 2731 /*
2607 * If restricted, initialize IO parameters as encoded in @cmd. 2732 * If restricted, initialize IO parameters as encoded in @cmd.
2608 * RETRY from server is not allowed. 2733 * RETRY from server is not allowed.
@@ -2639,56 +2764,44 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
2639 err = -ENOMEM; 2764 err = -ENOMEM;
2640 if (max_pages > fc->max_pages) 2765 if (max_pages > fc->max_pages)
2641 goto out; 2766 goto out;
2642 while (num_pages < max_pages) { 2767 while (ap.num_pages < max_pages) {
2643 pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); 2768 ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
2644 if (!pages[num_pages]) 2769 if (!ap.pages[ap.num_pages])
2645 goto out; 2770 goto out;
2646 num_pages++; 2771 ap.num_pages++;
2647 } 2772 }
2648 2773
2649 req = fuse_get_req(fc, num_pages);
2650 if (IS_ERR(req)) {
2651 err = PTR_ERR(req);
2652 req = NULL;
2653 goto out;
2654 }
2655 memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
2656 req->num_pages = num_pages;
2657 fuse_page_descs_length_init(req, 0, req->num_pages);
2658 2774
2659 /* okay, let's send it to the client */ 2775 /* okay, let's send it to the client */
2660 req->in.h.opcode = FUSE_IOCTL; 2776 ap.args.opcode = FUSE_IOCTL;
2661 req->in.h.nodeid = ff->nodeid; 2777 ap.args.nodeid = ff->nodeid;
2662 req->in.numargs = 1; 2778 ap.args.in_numargs = 1;
2663 req->in.args[0].size = sizeof(inarg); 2779 ap.args.in_args[0].size = sizeof(inarg);
2664 req->in.args[0].value = &inarg; 2780 ap.args.in_args[0].value = &inarg;
2665 if (in_size) { 2781 if (in_size) {
2666 req->in.numargs++; 2782 ap.args.in_numargs++;
2667 req->in.args[1].size = in_size; 2783 ap.args.in_args[1].size = in_size;
2668 req->in.argpages = 1; 2784 ap.args.in_pages = true;
2669 2785
2670 err = -EFAULT; 2786 err = -EFAULT;
2671 iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); 2787 iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
2672 for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { 2788 for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
2673 c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii); 2789 c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
2674 if (c != PAGE_SIZE && iov_iter_count(&ii)) 2790 if (c != PAGE_SIZE && iov_iter_count(&ii))
2675 goto out; 2791 goto out;
2676 } 2792 }
2677 } 2793 }
2678 2794
2679 req->out.numargs = 2; 2795 ap.args.out_numargs = 2;
2680 req->out.args[0].size = sizeof(outarg); 2796 ap.args.out_args[0].size = sizeof(outarg);
2681 req->out.args[0].value = &outarg; 2797 ap.args.out_args[0].value = &outarg;
2682 req->out.args[1].size = out_size; 2798 ap.args.out_args[1].size = out_size;
2683 req->out.argpages = 1; 2799 ap.args.out_pages = true;
2684 req->out.argvar = 1; 2800 ap.args.out_argvar = true;
2685 2801
2686 fuse_request_send(fc, req); 2802 transferred = fuse_simple_request(fc, &ap.args);
2687 err = req->out.h.error; 2803 err = transferred;
2688 transferred = req->out.args[1].size; 2804 if (transferred < 0)
2689 fuse_put_request(fc, req);
2690 req = NULL;
2691 if (err)
2692 goto out; 2805 goto out;
2693 2806
2694 /* did it ask for retry? */ 2807 /* did it ask for retry? */
@@ -2713,7 +2826,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
2713 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) 2826 in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
2714 goto out; 2827 goto out;
2715 2828
2716 vaddr = kmap_atomic(pages[0]); 2829 vaddr = kmap_atomic(ap.pages[0]);
2717 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, 2830 err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
2718 transferred, in_iovs + out_iovs, 2831 transferred, in_iovs + out_iovs,
2719 (flags & FUSE_IOCTL_COMPAT) != 0); 2832 (flags & FUSE_IOCTL_COMPAT) != 0);
@@ -2741,19 +2854,17 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
2741 2854
2742 err = -EFAULT; 2855 err = -EFAULT;
2743 iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); 2856 iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
2744 for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { 2857 for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
2745 c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii); 2858 c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
2746 if (c != PAGE_SIZE && iov_iter_count(&ii)) 2859 if (c != PAGE_SIZE && iov_iter_count(&ii))
2747 goto out; 2860 goto out;
2748 } 2861 }
2749 err = 0; 2862 err = 0;
2750 out: 2863 out:
2751 if (req)
2752 fuse_put_request(fc, req);
2753 free_page((unsigned long) iov_page); 2864 free_page((unsigned long) iov_page);
2754 while (num_pages) 2865 while (ap.num_pages)
2755 __free_page(pages[--num_pages]); 2866 __free_page(ap.pages[--ap.num_pages]);
2756 kfree(pages); 2867 kfree(ap.pages);
2757 2868
2758 return err ? err : outarg.result; 2869 return err ? err : outarg.result;
2759} 2870}
@@ -2861,14 +2972,14 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait)
2861 fuse_register_polled_file(fc, ff); 2972 fuse_register_polled_file(fc, ff);
2862 } 2973 }
2863 2974
2864 args.in.h.opcode = FUSE_POLL; 2975 args.opcode = FUSE_POLL;
2865 args.in.h.nodeid = ff->nodeid; 2976 args.nodeid = ff->nodeid;
2866 args.in.numargs = 1; 2977 args.in_numargs = 1;
2867 args.in.args[0].size = sizeof(inarg); 2978 args.in_args[0].size = sizeof(inarg);
2868 args.in.args[0].value = &inarg; 2979 args.in_args[0].value = &inarg;
2869 args.out.numargs = 1; 2980 args.out_numargs = 1;
2870 args.out.args[0].size = sizeof(outarg); 2981 args.out_args[0].size = sizeof(outarg);
2871 args.out.args[0].value = &outarg; 2982 args.out_args[0].value = &outarg;
2872 err = fuse_simple_request(fc, &args); 2983 err = fuse_simple_request(fc, &args);
2873 2984
2874 if (!err) 2985 if (!err)
@@ -3076,11 +3187,11 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
3076 if (!(mode & FALLOC_FL_KEEP_SIZE)) 3187 if (!(mode & FALLOC_FL_KEEP_SIZE))
3077 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); 3188 set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
3078 3189
3079 args.in.h.opcode = FUSE_FALLOCATE; 3190 args.opcode = FUSE_FALLOCATE;
3080 args.in.h.nodeid = ff->nodeid; 3191 args.nodeid = ff->nodeid;
3081 args.in.numargs = 1; 3192 args.in_numargs = 1;
3082 args.in.args[0].size = sizeof(inarg); 3193 args.in_args[0].size = sizeof(inarg);
3083 args.in.args[0].value = &inarg; 3194 args.in_args[0].value = &inarg;
3084 err = fuse_simple_request(fc, &args); 3195 err = fuse_simple_request(fc, &args);
3085 if (err == -ENOSYS) { 3196 if (err == -ENOSYS) {
3086 fc->no_fallocate = 1; 3197 fc->no_fallocate = 1;
@@ -3168,14 +3279,14 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
3168 if (is_unstable) 3279 if (is_unstable)
3169 set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); 3280 set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
3170 3281
3171 args.in.h.opcode = FUSE_COPY_FILE_RANGE; 3282 args.opcode = FUSE_COPY_FILE_RANGE;
3172 args.in.h.nodeid = ff_in->nodeid; 3283 args.nodeid = ff_in->nodeid;
3173 args.in.numargs = 1; 3284 args.in_numargs = 1;
3174 args.in.args[0].size = sizeof(inarg); 3285 args.in_args[0].size = sizeof(inarg);
3175 args.in.args[0].value = &inarg; 3286 args.in_args[0].value = &inarg;
3176 args.out.numargs = 1; 3287 args.out_numargs = 1;
3177 args.out.args[0].size = sizeof(outarg); 3288 args.out_args[0].size = sizeof(outarg);
3178 args.out.args[0].value = &outarg; 3289 args.out_args[0].value = &outarg;
3179 err = fuse_simple_request(fc, &args); 3290 err = fuse_simple_request(fc, &args);
3180 if (err == -ENOSYS) { 3291 if (err == -ENOSYS) {
3181 fc->no_copy_file_range = 1; 3292 fc->no_copy_file_range = 1;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 24dbca777775..fc89cb40e874 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -47,9 +47,6 @@
47/** Number of dentries for each connection in the control filesystem */ 47/** Number of dentries for each connection in the control filesystem */
48#define FUSE_CTL_NUM_DENTRIES 5 48#define FUSE_CTL_NUM_DENTRIES 5
49 49
50/** Number of page pointers embedded in fuse_req */
51#define FUSE_REQ_INLINE_PAGES 1
52
53/** List of active connections */ 50/** List of active connections */
54extern struct list_head fuse_conn_list; 51extern struct list_head fuse_conn_list;
55 52
@@ -164,17 +161,15 @@ enum {
164}; 161};
165 162
166struct fuse_conn; 163struct fuse_conn;
164struct fuse_release_args;
167 165
168/** FUSE specific file data */ 166/** FUSE specific file data */
169struct fuse_file { 167struct fuse_file {
170 /** Fuse connection for this file */ 168 /** Fuse connection for this file */
171 struct fuse_conn *fc; 169 struct fuse_conn *fc;
172 170
173 /* 171 /* Argument space reserved for release */
174 * Request reserved for flush and release. 172 struct fuse_release_args *release_args;
175 * Modified under relative fuse_inode::lock.
176 */
177 struct fuse_req *reserved_req;
178 173
179 /** Kernel file handle guaranteed to be unique */ 174 /** Kernel file handle guaranteed to be unique */
180 u64 kh; 175 u64 kh;
@@ -229,57 +224,12 @@ struct fuse_in_arg {
229 const void *value; 224 const void *value;
230}; 225};
231 226
232/** The request input */
233struct fuse_in {
234 /** The request header */
235 struct fuse_in_header h;
236
237 /** True if the data for the last argument is in req->pages */
238 unsigned argpages:1;
239
240 /** Number of arguments */
241 unsigned numargs;
242
243 /** Array of arguments */
244 struct fuse_in_arg args[3];
245};
246
247/** One output argument of a request */ 227/** One output argument of a request */
248struct fuse_arg { 228struct fuse_arg {
249 unsigned size; 229 unsigned size;
250 void *value; 230 void *value;
251}; 231};
252 232
253/** The request output */
254struct fuse_out {
255 /** Header returned from userspace */
256 struct fuse_out_header h;
257
258 /*
259 * The following bitfields are not changed during the request
260 * processing
261 */
262
263 /** Last argument is variable length (can be shorter than
264 arg->size) */
265 unsigned argvar:1;
266
267 /** Last argument is a list of pages to copy data to */
268 unsigned argpages:1;
269
270 /** Zero partially or not copied pages */
271 unsigned page_zeroing:1;
272
273 /** Pages may be replaced with new ones */
274 unsigned page_replace:1;
275
276 /** Number or arguments */
277 unsigned numargs;
278
279 /** Array of arguments */
280 struct fuse_arg args[2];
281};
282
283/** FUSE page descriptor */ 233/** FUSE page descriptor */
284struct fuse_page_desc { 234struct fuse_page_desc {
285 unsigned int length; 235 unsigned int length;
@@ -287,20 +237,28 @@ struct fuse_page_desc {
287}; 237};
288 238
289struct fuse_args { 239struct fuse_args {
290 struct { 240 uint64_t nodeid;
291 struct { 241 uint32_t opcode;
292 uint32_t opcode; 242 unsigned short in_numargs;
293 uint64_t nodeid; 243 unsigned short out_numargs;
294 } h; 244 bool force:1;
295 unsigned numargs; 245 bool noreply:1;
296 struct fuse_in_arg args[3]; 246 bool nocreds:1;
247 bool in_pages:1;
248 bool out_pages:1;
249 bool out_argvar:1;
250 bool page_zeroing:1;
251 bool page_replace:1;
252 struct fuse_in_arg in_args[3];
253 struct fuse_arg out_args[2];
254 void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error);
255};
297 256
298 } in; 257struct fuse_args_pages {
299 struct { 258 struct fuse_args args;
300 unsigned argvar:1; 259 struct page **pages;
301 unsigned numargs; 260 struct fuse_page_desc *descs;
302 struct fuse_arg args[2]; 261 unsigned int num_pages;
303 } out;
304}; 262};
305 263
306#define FUSE_ARGS(args) struct fuse_args args = {} 264#define FUSE_ARGS(args) struct fuse_args args = {}
@@ -373,83 +331,70 @@ struct fuse_req {
373 /** Entry on the interrupts list */ 331 /** Entry on the interrupts list */
374 struct list_head intr_entry; 332 struct list_head intr_entry;
375 333
334 /* Input/output arguments */
335 struct fuse_args *args;
336
376 /** refcount */ 337 /** refcount */
377 refcount_t count; 338 refcount_t count;
378 339
379 /* Request flags, updated with test/set/clear_bit() */ 340 /* Request flags, updated with test/set/clear_bit() */
380 unsigned long flags; 341 unsigned long flags;
381 342
382 /** The request input */ 343 /* The request input header */
383 struct fuse_in in; 344 struct {
345 struct fuse_in_header h;
346 } in;
384 347
385 /** The request output */ 348 /* The request output header */
386 struct fuse_out out; 349 struct {
350 struct fuse_out_header h;
351 } out;
387 352
388 /** Used to wake up the task waiting for completion of request*/ 353 /** Used to wake up the task waiting for completion of request*/
389 wait_queue_head_t waitq; 354 wait_queue_head_t waitq;
390 355
391 /** Data for asynchronous requests */ 356};
392 union {
393 struct {
394 struct fuse_release_in in;
395 struct inode *inode;
396 } release;
397 struct fuse_init_in init_in;
398 struct fuse_init_out init_out;
399 struct cuse_init_in cuse_init_in;
400 struct {
401 struct fuse_read_in in;
402 u64 attr_ver;
403 } read;
404 struct {
405 struct fuse_write_in in;
406 struct fuse_write_out out;
407 struct fuse_req *next;
408 } write;
409 struct fuse_notify_retrieve_in retrieve_in;
410 } misc;
411
412 /** page vector */
413 struct page **pages;
414
415 /** page-descriptor vector */
416 struct fuse_page_desc *page_descs;
417
418 /** size of the 'pages' array */
419 unsigned max_pages;
420
421 /** inline page vector */
422 struct page *inline_pages[FUSE_REQ_INLINE_PAGES];
423
424 /** inline page-descriptor vector */
425 struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];
426
427 /** number of pages in vector */
428 unsigned num_pages;
429
430 /** File used in the request (or NULL) */
431 struct fuse_file *ff;
432
433 /** Inode used in the request or NULL */
434 struct inode *inode;
435 357
436 /** AIO control block */ 358struct fuse_iqueue;
437 struct fuse_io_priv *io;
438 359
439 /** Link on fi->writepages */ 360/**
440 struct list_head writepages_entry; 361 * Input queue callbacks
362 *
363 * Input queue signalling is device-specific. For example, the /dev/fuse file
364 * uses fiq->waitq and fasync to wake processes that are waiting on queue
365 * readiness. These callbacks allow other device types to respond to input
366 * queue activity.
367 */
368struct fuse_iqueue_ops {
369 /**
370 * Signal that a forget has been queued
371 */
372 void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq)
373 __releases(fiq->lock);
441 374
442 /** Request completion callback */ 375 /**
443 void (*end)(struct fuse_conn *, struct fuse_req *); 376 * Signal that an INTERRUPT request has been queued
377 */
378 void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq)
379 __releases(fiq->lock);
444 380
445 /** Request is stolen from fuse_file->reserved_req */ 381 /**
446 struct file *stolen_file; 382 * Signal that a request has been queued
383 */
384 void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq)
385 __releases(fiq->lock);
447}; 386};
448 387
388/** /dev/fuse input queue operations */
389extern const struct fuse_iqueue_ops fuse_dev_fiq_ops;
390
449struct fuse_iqueue { 391struct fuse_iqueue {
450 /** Connection established */ 392 /** Connection established */
451 unsigned connected; 393 unsigned connected;
452 394
395 /** Lock protecting accesses to members of this structure */
396 spinlock_t lock;
397
453 /** Readers of the connection are waiting on this */ 398 /** Readers of the connection are waiting on this */
454 wait_queue_head_t waitq; 399 wait_queue_head_t waitq;
455 400
@@ -471,6 +416,12 @@ struct fuse_iqueue {
471 416
472 /** O_ASYNC requests */ 417 /** O_ASYNC requests */
473 struct fasync_struct *fasync; 418 struct fasync_struct *fasync;
419
420 /** Device-specific callbacks */
421 const struct fuse_iqueue_ops *ops;
422
423 /** Device-specific state */
424 void *priv;
474}; 425};
475 426
476#define FUSE_PQ_HASH_BITS 8 427#define FUSE_PQ_HASH_BITS 8
@@ -504,6 +455,29 @@ struct fuse_dev {
504 struct list_head entry; 455 struct list_head entry;
505}; 456};
506 457
458struct fuse_fs_context {
459 int fd;
460 unsigned int rootmode;
461 kuid_t user_id;
462 kgid_t group_id;
463 bool is_bdev:1;
464 bool fd_present:1;
465 bool rootmode_present:1;
466 bool user_id_present:1;
467 bool group_id_present:1;
468 bool default_permissions:1;
469 bool allow_other:1;
470 bool destroy:1;
471 bool no_control:1;
472 bool no_force_umount:1;
473 unsigned int max_read;
474 unsigned int blksize;
475 const char *subtype;
476
477 /* fuse_dev pointer to fill in, should contain NULL on entry */
478 void **fudptr;
479};
480
507/** 481/**
508 * A Fuse connection. 482 * A Fuse connection.
509 * 483 *
@@ -584,9 +558,6 @@ struct fuse_conn {
584 /** waitq for blocked connection */ 558 /** waitq for blocked connection */
585 wait_queue_head_t blocked_waitq; 559 wait_queue_head_t blocked_waitq;
586 560
587 /** waitq for reserved requests */
588 wait_queue_head_t reserved_req_waitq;
589
590 /** Connection established, cleared on umount, connection 561 /** Connection established, cleared on umount, connection
591 abort and device release */ 562 abort and device release */
592 unsigned connected; 563 unsigned connected;
@@ -721,6 +692,18 @@ struct fuse_conn {
721 /** Does the filesystem support copy_file_range? */ 692 /** Does the filesystem support copy_file_range? */
722 unsigned no_copy_file_range:1; 693 unsigned no_copy_file_range:1;
723 694
695 /* Send DESTROY request */
696 unsigned int destroy:1;
697
698 /* Delete dentries that have gone stale */
699 unsigned int delete_stale:1;
700
701 /** Do not create entry in fusectl fs */
702 unsigned int no_control:1;
703
704 /** Do not allow MNT_FORCE umount */
705 unsigned int no_force_umount:1;
706
724 /** The number of requests waiting for completion */ 707 /** The number of requests waiting for completion */
725 atomic_t num_waiting; 708 atomic_t num_waiting;
726 709
@@ -742,9 +725,6 @@ struct fuse_conn {
742 /** Key for lock owner ID scrambling */ 725 /** Key for lock owner ID scrambling */
743 u32 scramble_key[4]; 726 u32 scramble_key[4];
744 727
745 /** Reserved request for the DESTROY message */
746 struct fuse_req *destroy_req;
747
748 /** Version counter for attribute changes */ 728 /** Version counter for attribute changes */
749 atomic64_t attr_version; 729 atomic64_t attr_version;
750 730
@@ -820,14 +800,32 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
820 800
821struct fuse_forget_link *fuse_alloc_forget(void); 801struct fuse_forget_link *fuse_alloc_forget(void);
822 802
823/* Used by READDIRPLUS */ 803struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq,
824void fuse_force_forget(struct file *file, u64 nodeid); 804 unsigned int max,
805 unsigned int *countp);
825 806
826/** 807/*
827 * Initialize READ or READDIR request 808 * Initialize READ or READDIR request
828 */ 809 */
829void fuse_read_fill(struct fuse_req *req, struct file *file, 810struct fuse_io_args {
830 loff_t pos, size_t count, int opcode); 811 union {
812 struct {
813 struct fuse_read_in in;
814 u64 attr_ver;
815 } read;
816 struct {
817 struct fuse_write_in in;
818 struct fuse_write_out out;
819 } write;
820 };
821 struct fuse_args_pages ap;
822 struct fuse_io_priv *io;
823 struct fuse_file *ff;
824};
825
826void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
827 size_t count, int opcode);
828
831 829
832/** 830/**
833 * Send OPEN or OPENDIR request 831 * Send OPEN or OPENDIR request
@@ -900,61 +898,16 @@ int fuse_ctl_init(void);
900void __exit fuse_ctl_cleanup(void); 898void __exit fuse_ctl_cleanup(void);
901 899
902/** 900/**
903 * Allocate a request
904 */
905struct fuse_req *fuse_request_alloc(unsigned npages);
906
907struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
908
909bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req,
910 gfp_t flags);
911
912
913/**
914 * Free a request
915 */
916void fuse_request_free(struct fuse_req *req);
917
918/**
919 * Get a request, may fail with -ENOMEM,
920 * caller should specify # elements in req->pages[] explicitly
921 */
922struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages);
923struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
924 unsigned npages);
925
926/*
927 * Increment reference count on request
928 */
929void __fuse_get_request(struct fuse_req *req);
930
931/**
932 * Gets a requests for a file operation, always succeeds
933 */
934struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
935 struct file *file);
936
937/**
938 * Decrement reference count of a request. If count goes to zero free
939 * the request.
940 */
941void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
942
943/**
944 * Send a request (synchronous)
945 */
946void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
947
948/**
949 * Simple request sending that does request allocation and freeing 901 * Simple request sending that does request allocation and freeing
950 */ 902 */
951ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); 903ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
904int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args,
905 gfp_t gfp_flags);
952 906
953/** 907/**
954 * Send a request in the background 908 * End a finished request
955 */ 909 */
956void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); 910void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req);
957bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req);
958 911
959/* Abort all requests */ 912/* Abort all requests */
960void fuse_abort_conn(struct fuse_conn *fc); 913void fuse_abort_conn(struct fuse_conn *fc);
@@ -980,15 +933,33 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
980/** 933/**
981 * Initialize fuse_conn 934 * Initialize fuse_conn
982 */ 935 */
983void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns); 936void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
937 const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv);
984 938
985/** 939/**
986 * Release reference to fuse_conn 940 * Release reference to fuse_conn
987 */ 941 */
988void fuse_conn_put(struct fuse_conn *fc); 942void fuse_conn_put(struct fuse_conn *fc);
989 943
990struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); 944struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc);
945struct fuse_dev *fuse_dev_alloc(void);
946void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc);
991void fuse_dev_free(struct fuse_dev *fud); 947void fuse_dev_free(struct fuse_dev *fud);
948void fuse_send_init(struct fuse_conn *fc);
949
950/**
951 * Fill in superblock and initialize fuse connection
952 * @sb: partially-initialized superblock to fill in
953 * @ctx: mount context
954 */
955int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx);
956
957/**
958 * Disassociate fuse connection from superblock and kill the superblock
959 *
960 * Calls kill_anon_super(), do not use with bdev mounts.
961 */
962void fuse_kill_sb_anon(struct super_block *sb);
992 963
993/** 964/**
994 * Add connection to control filesystem 965 * Add connection to control filesystem
@@ -1093,4 +1064,15 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type);
1093/* readdir.c */ 1064/* readdir.c */
1094int fuse_readdir(struct file *file, struct dir_context *ctx); 1065int fuse_readdir(struct file *file, struct dir_context *ctx);
1095 1066
1067/**
1068 * Return the number of bytes in an arguments list
1069 */
1070unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args);
1071
1072/**
1073 * Get the next unique ID for a request
1074 */
1075u64 fuse_get_unique(struct fuse_iqueue *fiq);
1076void fuse_free_conn(struct fuse_conn *fc);
1077
1096#endif /* _FS_FUSE_I_H */ 1078#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 4bb885b0f032..51cb471f4dc3 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -15,7 +15,8 @@
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
18#include <linux/parser.h> 18#include <linux/fs_context.h>
19#include <linux/fs_parser.h>
19#include <linux/statfs.h> 20#include <linux/statfs.h>
20#include <linux/random.h> 21#include <linux/random.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
@@ -59,24 +60,13 @@ MODULE_PARM_DESC(max_user_congthresh,
59/** Congestion starts at 75% of maximum */ 60/** Congestion starts at 75% of maximum */
60#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) 61#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
61 62
62struct fuse_mount_data { 63#ifdef CONFIG_BLOCK
63 int fd; 64static struct file_system_type fuseblk_fs_type;
64 unsigned rootmode; 65#endif
65 kuid_t user_id;
66 kgid_t group_id;
67 unsigned fd_present:1;
68 unsigned rootmode_present:1;
69 unsigned user_id_present:1;
70 unsigned group_id_present:1;
71 unsigned default_permissions:1;
72 unsigned allow_other:1;
73 unsigned max_read;
74 unsigned blksize;
75};
76 66
77struct fuse_forget_link *fuse_alloc_forget(void) 67struct fuse_forget_link *fuse_alloc_forget(void)
78{ 68{
79 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); 69 return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT);
80} 70}
81 71
82static struct inode *fuse_alloc_inode(struct super_block *sb) 72static struct inode *fuse_alloc_inode(struct super_block *sb)
@@ -374,19 +364,21 @@ void fuse_unlock_inode(struct inode *inode, bool locked)
374 364
375static void fuse_umount_begin(struct super_block *sb) 365static void fuse_umount_begin(struct super_block *sb)
376{ 366{
377 fuse_abort_conn(get_fuse_conn_super(sb)); 367 struct fuse_conn *fc = get_fuse_conn_super(sb);
368
369 if (!fc->no_force_umount)
370 fuse_abort_conn(fc);
378} 371}
379 372
380static void fuse_send_destroy(struct fuse_conn *fc) 373static void fuse_send_destroy(struct fuse_conn *fc)
381{ 374{
382 struct fuse_req *req = fc->destroy_req; 375 if (fc->conn_init) {
383 if (req && fc->conn_init) { 376 FUSE_ARGS(args);
384 fc->destroy_req = NULL; 377
385 req->in.h.opcode = FUSE_DESTROY; 378 args.opcode = FUSE_DESTROY;
386 __set_bit(FR_FORCE, &req->flags); 379 args.force = true;
387 __clear_bit(FR_BACKGROUND, &req->flags); 380 args.nocreds = true;
388 fuse_request_send(fc, req); 381 fuse_simple_request(fc, &args);
389 fuse_put_request(fc, req);
390 } 382 }
391} 383}
392 384
@@ -430,12 +422,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
430 } 422 }
431 423
432 memset(&outarg, 0, sizeof(outarg)); 424 memset(&outarg, 0, sizeof(outarg));
433 args.in.numargs = 0; 425 args.in_numargs = 0;
434 args.in.h.opcode = FUSE_STATFS; 426 args.opcode = FUSE_STATFS;
435 args.in.h.nodeid = get_node_id(d_inode(dentry)); 427 args.nodeid = get_node_id(d_inode(dentry));
436 args.out.numargs = 1; 428 args.out_numargs = 1;
437 args.out.args[0].size = sizeof(outarg); 429 args.out_args[0].size = sizeof(outarg);
438 args.out.args[0].value = &outarg; 430 args.out_args[0].value = &outarg;
439 err = fuse_simple_request(fc, &args); 431 err = fuse_simple_request(fc, &args);
440 if (!err) 432 if (!err)
441 convert_fuse_statfs(buf, &outarg.st); 433 convert_fuse_statfs(buf, &outarg.st);
@@ -443,6 +435,8 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
443} 435}
444 436
445enum { 437enum {
438 OPT_SOURCE,
439 OPT_SUBTYPE,
446 OPT_FD, 440 OPT_FD,
447 OPT_ROOTMODE, 441 OPT_ROOTMODE,
448 OPT_USER_ID, 442 OPT_USER_ID,
@@ -454,111 +448,109 @@ enum {
454 OPT_ERR 448 OPT_ERR
455}; 449};
456 450
457static const match_table_t tokens = { 451static const struct fs_parameter_spec fuse_param_specs[] = {
458 {OPT_FD, "fd=%u"}, 452 fsparam_string ("source", OPT_SOURCE),
459 {OPT_ROOTMODE, "rootmode=%o"}, 453 fsparam_u32 ("fd", OPT_FD),
460 {OPT_USER_ID, "user_id=%u"}, 454 fsparam_u32oct ("rootmode", OPT_ROOTMODE),
461 {OPT_GROUP_ID, "group_id=%u"}, 455 fsparam_u32 ("user_id", OPT_USER_ID),
462 {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, 456 fsparam_u32 ("group_id", OPT_GROUP_ID),
463 {OPT_ALLOW_OTHER, "allow_other"}, 457 fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS),
464 {OPT_MAX_READ, "max_read=%u"}, 458 fsparam_flag ("allow_other", OPT_ALLOW_OTHER),
465 {OPT_BLKSIZE, "blksize=%u"}, 459 fsparam_u32 ("max_read", OPT_MAX_READ),
466 {OPT_ERR, NULL} 460 fsparam_u32 ("blksize", OPT_BLKSIZE),
461 fsparam_string ("subtype", OPT_SUBTYPE),
462 {}
463};
464
465static const struct fs_parameter_description fuse_fs_parameters = {
466 .name = "fuse",
467 .specs = fuse_param_specs,
467}; 468};
468 469
469static int fuse_match_uint(substring_t *s, unsigned int *res) 470static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param)
470{ 471{
471 int err = -ENOMEM; 472 struct fs_parse_result result;
472 char *buf = match_strdup(s); 473 struct fuse_fs_context *ctx = fc->fs_private;
473 if (buf) { 474 int opt;
474 err = kstrtouint(buf, 10, res); 475
475 kfree(buf); 476 opt = fs_parse(fc, &fuse_fs_parameters, param, &result);
477 if (opt < 0)
478 return opt;
479
480 switch (opt) {
481 case OPT_SOURCE:
482 if (fc->source)
483 return invalf(fc, "fuse: Multiple sources specified");
484 fc->source = param->string;
485 param->string = NULL;
486 break;
487
488 case OPT_SUBTYPE:
489 if (ctx->subtype)
490 return invalf(fc, "fuse: Multiple subtypes specified");
491 ctx->subtype = param->string;
492 param->string = NULL;
493 return 0;
494
495 case OPT_FD:
496 ctx->fd = result.uint_32;
497 ctx->fd_present = 1;
498 break;
499
500 case OPT_ROOTMODE:
501 if (!fuse_valid_type(result.uint_32))
502 return invalf(fc, "fuse: Invalid rootmode");
503 ctx->rootmode = result.uint_32;
504 ctx->rootmode_present = 1;
505 break;
506
507 case OPT_USER_ID:
508 ctx->user_id = make_kuid(fc->user_ns, result.uint_32);
509 if (!uid_valid(ctx->user_id))
510 return invalf(fc, "fuse: Invalid user_id");
511 ctx->user_id_present = 1;
512 break;
513
514 case OPT_GROUP_ID:
515 ctx->group_id = make_kgid(fc->user_ns, result.uint_32);
516 if (!gid_valid(ctx->group_id))
517 return invalf(fc, "fuse: Invalid group_id");
518 ctx->group_id_present = 1;
519 break;
520
521 case OPT_DEFAULT_PERMISSIONS:
522 ctx->default_permissions = 1;
523 break;
524
525 case OPT_ALLOW_OTHER:
526 ctx->allow_other = 1;
527 break;
528
529 case OPT_MAX_READ:
530 ctx->max_read = result.uint_32;
531 break;
532
533 case OPT_BLKSIZE:
534 if (!ctx->is_bdev)
535 return invalf(fc, "fuse: blksize only supported for fuseblk");
536 ctx->blksize = result.uint_32;
537 break;
538
539 default:
540 return -EINVAL;
476 } 541 }
477 return err; 542
543 return 0;
478} 544}
479 545
480static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, 546static void fuse_free_fc(struct fs_context *fc)
481 struct user_namespace *user_ns)
482{ 547{
483 char *p; 548 struct fuse_fs_context *ctx = fc->fs_private;
484 memset(d, 0, sizeof(struct fuse_mount_data));
485 d->max_read = ~0;
486 d->blksize = FUSE_DEFAULT_BLKSIZE;
487
488 while ((p = strsep(&opt, ",")) != NULL) {
489 int token;
490 int value;
491 unsigned uv;
492 substring_t args[MAX_OPT_ARGS];
493 if (!*p)
494 continue;
495
496 token = match_token(p, tokens, args);
497 switch (token) {
498 case OPT_FD:
499 if (match_int(&args[0], &value))
500 return 0;
501 d->fd = value;
502 d->fd_present = 1;
503 break;
504
505 case OPT_ROOTMODE:
506 if (match_octal(&args[0], &value))
507 return 0;
508 if (!fuse_valid_type(value))
509 return 0;
510 d->rootmode = value;
511 d->rootmode_present = 1;
512 break;
513
514 case OPT_USER_ID:
515 if (fuse_match_uint(&args[0], &uv))
516 return 0;
517 d->user_id = make_kuid(user_ns, uv);
518 if (!uid_valid(d->user_id))
519 return 0;
520 d->user_id_present = 1;
521 break;
522
523 case OPT_GROUP_ID:
524 if (fuse_match_uint(&args[0], &uv))
525 return 0;
526 d->group_id = make_kgid(user_ns, uv);
527 if (!gid_valid(d->group_id))
528 return 0;
529 d->group_id_present = 1;
530 break;
531
532 case OPT_DEFAULT_PERMISSIONS:
533 d->default_permissions = 1;
534 break;
535
536 case OPT_ALLOW_OTHER:
537 d->allow_other = 1;
538 break;
539
540 case OPT_MAX_READ:
541 if (match_int(&args[0], &value))
542 return 0;
543 d->max_read = value;
544 break;
545
546 case OPT_BLKSIZE:
547 if (!is_bdev || match_int(&args[0], &value))
548 return 0;
549 d->blksize = value;
550 break;
551
552 default:
553 return 0;
554 }
555 }
556 549
557 if (!d->fd_present || !d->rootmode_present || 550 if (ctx) {
558 !d->user_id_present || !d->group_id_present) 551 kfree(ctx->subtype);
559 return 0; 552 kfree(ctx);
560 553 }
561 return 1;
562} 554}
563 555
564static int fuse_show_options(struct seq_file *m, struct dentry *root) 556static int fuse_show_options(struct seq_file *m, struct dentry *root)
@@ -579,14 +571,19 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
579 return 0; 571 return 0;
580} 572}
581 573
582static void fuse_iqueue_init(struct fuse_iqueue *fiq) 574static void fuse_iqueue_init(struct fuse_iqueue *fiq,
575 const struct fuse_iqueue_ops *ops,
576 void *priv)
583{ 577{
584 memset(fiq, 0, sizeof(struct fuse_iqueue)); 578 memset(fiq, 0, sizeof(struct fuse_iqueue));
579 spin_lock_init(&fiq->lock);
585 init_waitqueue_head(&fiq->waitq); 580 init_waitqueue_head(&fiq->waitq);
586 INIT_LIST_HEAD(&fiq->pending); 581 INIT_LIST_HEAD(&fiq->pending);
587 INIT_LIST_HEAD(&fiq->interrupts); 582 INIT_LIST_HEAD(&fiq->interrupts);
588 fiq->forget_list_tail = &fiq->forget_list_head; 583 fiq->forget_list_tail = &fiq->forget_list_head;
589 fiq->connected = 1; 584 fiq->connected = 1;
585 fiq->ops = ops;
586 fiq->priv = priv;
590} 587}
591 588
592static void fuse_pqueue_init(struct fuse_pqueue *fpq) 589static void fuse_pqueue_init(struct fuse_pqueue *fpq)
@@ -600,7 +597,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq)
600 fpq->connected = 1; 597 fpq->connected = 1;
601} 598}
602 599
603void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) 600void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns,
601 const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv)
604{ 602{
605 memset(fc, 0, sizeof(*fc)); 603 memset(fc, 0, sizeof(*fc));
606 spin_lock_init(&fc->lock); 604 spin_lock_init(&fc->lock);
@@ -609,8 +607,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns)
609 refcount_set(&fc->count, 1); 607 refcount_set(&fc->count, 1);
610 atomic_set(&fc->dev_count, 1); 608 atomic_set(&fc->dev_count, 1);
611 init_waitqueue_head(&fc->blocked_waitq); 609 init_waitqueue_head(&fc->blocked_waitq);
612 init_waitqueue_head(&fc->reserved_req_waitq); 610 fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv);
613 fuse_iqueue_init(&fc->iq);
614 INIT_LIST_HEAD(&fc->bg_queue); 611 INIT_LIST_HEAD(&fc->bg_queue);
615 INIT_LIST_HEAD(&fc->entry); 612 INIT_LIST_HEAD(&fc->entry);
616 INIT_LIST_HEAD(&fc->devices); 613 INIT_LIST_HEAD(&fc->devices);
@@ -633,8 +630,6 @@ EXPORT_SYMBOL_GPL(fuse_conn_init);
633void fuse_conn_put(struct fuse_conn *fc) 630void fuse_conn_put(struct fuse_conn *fc)
634{ 631{
635 if (refcount_dec_and_test(&fc->count)) { 632 if (refcount_dec_and_test(&fc->count)) {
636 if (fc->destroy_req)
637 fuse_request_free(fc->destroy_req);
638 put_pid_ns(fc->pid_ns); 633 put_pid_ns(fc->pid_ns);
639 put_user_ns(fc->user_ns); 634 put_user_ns(fc->user_ns);
640 fc->release(fc); 635 fc->release(fc);
@@ -822,9 +817,12 @@ static const struct super_operations fuse_super_operations = {
822 817
823static void sanitize_global_limit(unsigned *limit) 818static void sanitize_global_limit(unsigned *limit)
824{ 819{
820 /*
821 * The default maximum number of async requests is calculated to consume
822 * 1/2^13 of the total memory, assuming 392 bytes per request.
823 */
825 if (*limit == 0) 824 if (*limit == 0)
826 *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 825 *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392;
827 sizeof(struct fuse_req);
828 826
829 if (*limit >= 1 << 16) 827 if (*limit >= 1 << 16)
830 *limit = (1 << 16) - 1; 828 *limit = (1 << 16) - 1;
@@ -870,11 +868,19 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
870 spin_unlock(&fc->bg_lock); 868 spin_unlock(&fc->bg_lock);
871} 869}
872 870
873static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) 871struct fuse_init_args {
872 struct fuse_args args;
873 struct fuse_init_in in;
874 struct fuse_init_out out;
875};
876
877static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args,
878 int error)
874{ 879{
875 struct fuse_init_out *arg = &req->misc.init_out; 880 struct fuse_init_args *ia = container_of(args, typeof(*ia), args);
881 struct fuse_init_out *arg = &ia->out;
876 882
877 if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) 883 if (error || arg->major != FUSE_KERNEL_VERSION)
878 fc->conn_error = 1; 884 fc->conn_error = 1;
879 else { 885 else {
880 unsigned long ra_pages; 886 unsigned long ra_pages;
@@ -951,18 +957,23 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
951 fc->max_write = max_t(unsigned, 4096, fc->max_write); 957 fc->max_write = max_t(unsigned, 4096, fc->max_write);
952 fc->conn_init = 1; 958 fc->conn_init = 1;
953 } 959 }
960 kfree(ia);
961
954 fuse_set_initialized(fc); 962 fuse_set_initialized(fc);
955 wake_up_all(&fc->blocked_waitq); 963 wake_up_all(&fc->blocked_waitq);
956} 964}
957 965
958static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) 966void fuse_send_init(struct fuse_conn *fc)
959{ 967{
960 struct fuse_init_in *arg = &req->misc.init_in; 968 struct fuse_init_args *ia;
969
970 ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL);
961 971
962 arg->major = FUSE_KERNEL_VERSION; 972 ia->in.major = FUSE_KERNEL_VERSION;
963 arg->minor = FUSE_KERNEL_MINOR_VERSION; 973 ia->in.minor = FUSE_KERNEL_MINOR_VERSION;
964 arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; 974 ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE;
965 arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | 975 ia->in.flags |=
976 FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
966 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | 977 FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
967 FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | 978 FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
968 FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | 979 FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
@@ -971,26 +982,32 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
971 FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | 982 FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
972 FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | 983 FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
973 FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA; 984 FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA;
974 req->in.h.opcode = FUSE_INIT; 985 ia->args.opcode = FUSE_INIT;
975 req->in.numargs = 1; 986 ia->args.in_numargs = 1;
976 req->in.args[0].size = sizeof(*arg); 987 ia->args.in_args[0].size = sizeof(ia->in);
977 req->in.args[0].value = arg; 988 ia->args.in_args[0].value = &ia->in;
978 req->out.numargs = 1; 989 ia->args.out_numargs = 1;
979 /* Variable length argument used for backward compatibility 990 /* Variable length argument used for backward compatibility
980 with interface version < 7.5. Rest of init_out is zeroed 991 with interface version < 7.5. Rest of init_out is zeroed
981 by do_get_request(), so a short reply is not a problem */ 992 by do_get_request(), so a short reply is not a problem */
982 req->out.argvar = 1; 993 ia->args.out_argvar = 1;
983 req->out.args[0].size = sizeof(struct fuse_init_out); 994 ia->args.out_args[0].size = sizeof(ia->out);
984 req->out.args[0].value = &req->misc.init_out; 995 ia->args.out_args[0].value = &ia->out;
985 req->end = process_init_reply; 996 ia->args.force = true;
986 fuse_request_send_background(fc, req); 997 ia->args.nocreds = true;
998 ia->args.end = process_init_reply;
999
1000 if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0)
1001 process_init_reply(fc, &ia->args, -ENOTCONN);
987} 1002}
1003EXPORT_SYMBOL_GPL(fuse_send_init);
988 1004
989static void fuse_free_conn(struct fuse_conn *fc) 1005void fuse_free_conn(struct fuse_conn *fc)
990{ 1006{
991 WARN_ON(!list_empty(&fc->devices)); 1007 WARN_ON(!list_empty(&fc->devices));
992 kfree_rcu(fc, rcu); 1008 kfree_rcu(fc, rcu);
993} 1009}
1010EXPORT_SYMBOL_GPL(fuse_free_conn);
994 1011
995static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) 1012static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
996{ 1013{
@@ -1032,7 +1049,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
1032 return 0; 1049 return 0;
1033} 1050}
1034 1051
1035struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) 1052struct fuse_dev *fuse_dev_alloc(void)
1036{ 1053{
1037 struct fuse_dev *fud; 1054 struct fuse_dev *fud;
1038 struct list_head *pq; 1055 struct list_head *pq;
@@ -1048,16 +1065,33 @@ struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc)
1048 } 1065 }
1049 1066
1050 fud->pq.processing = pq; 1067 fud->pq.processing = pq;
1051 fud->fc = fuse_conn_get(fc);
1052 fuse_pqueue_init(&fud->pq); 1068 fuse_pqueue_init(&fud->pq);
1053 1069
1070 return fud;
1071}
1072EXPORT_SYMBOL_GPL(fuse_dev_alloc);
1073
1074void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc)
1075{
1076 fud->fc = fuse_conn_get(fc);
1054 spin_lock(&fc->lock); 1077 spin_lock(&fc->lock);
1055 list_add_tail(&fud->entry, &fc->devices); 1078 list_add_tail(&fud->entry, &fc->devices);
1056 spin_unlock(&fc->lock); 1079 spin_unlock(&fc->lock);
1080}
1081EXPORT_SYMBOL_GPL(fuse_dev_install);
1057 1082
1083struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc)
1084{
1085 struct fuse_dev *fud;
1086
1087 fud = fuse_dev_alloc();
1088 if (!fud)
1089 return NULL;
1090
1091 fuse_dev_install(fud, fc);
1058 return fud; 1092 return fud;
1059} 1093}
1060EXPORT_SYMBOL_GPL(fuse_dev_alloc); 1094EXPORT_SYMBOL_GPL(fuse_dev_alloc_install);
1061 1095
1062void fuse_dev_free(struct fuse_dev *fud) 1096void fuse_dev_free(struct fuse_dev *fud)
1063{ 1097{
@@ -1075,17 +1109,13 @@ void fuse_dev_free(struct fuse_dev *fud)
1075} 1109}
1076EXPORT_SYMBOL_GPL(fuse_dev_free); 1110EXPORT_SYMBOL_GPL(fuse_dev_free);
1077 1111
1078static int fuse_fill_super(struct super_block *sb, void *data, int silent) 1112int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx)
1079{ 1113{
1080 struct fuse_dev *fud; 1114 struct fuse_dev *fud;
1081 struct fuse_conn *fc; 1115 struct fuse_conn *fc = get_fuse_conn_super(sb);
1082 struct inode *root; 1116 struct inode *root;
1083 struct fuse_mount_data d;
1084 struct file *file;
1085 struct dentry *root_dentry; 1117 struct dentry *root_dentry;
1086 struct fuse_req *init_req;
1087 int err; 1118 int err;
1088 int is_bdev = sb->s_bdev != NULL;
1089 1119
1090 err = -EINVAL; 1120 err = -EINVAL;
1091 if (sb->s_flags & SB_MANDLOCK) 1121 if (sb->s_flags & SB_MANDLOCK)
@@ -1093,19 +1123,19 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
1093 1123
1094 sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); 1124 sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION);
1095 1125
1096 if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) 1126 if (ctx->is_bdev) {
1097 goto err;
1098
1099 if (is_bdev) {
1100#ifdef CONFIG_BLOCK 1127#ifdef CONFIG_BLOCK
1101 err = -EINVAL; 1128 err = -EINVAL;
1102 if (!sb_set_blocksize(sb, d.blksize)) 1129 if (!sb_set_blocksize(sb, ctx->blksize))
1103 goto err; 1130 goto err;
1104#endif 1131#endif
1105 } else { 1132 } else {
1106 sb->s_blocksize = PAGE_SIZE; 1133 sb->s_blocksize = PAGE_SIZE;
1107 sb->s_blocksize_bits = PAGE_SHIFT; 1134 sb->s_blocksize_bits = PAGE_SHIFT;
1108 } 1135 }
1136
1137 sb->s_subtype = ctx->subtype;
1138 ctx->subtype = NULL;
1109 sb->s_magic = FUSE_SUPER_MAGIC; 1139 sb->s_magic = FUSE_SUPER_MAGIC;
1110 sb->s_op = &fuse_super_operations; 1140 sb->s_op = &fuse_super_operations;
1111 sb->s_xattr = fuse_xattr_handlers; 1141 sb->s_xattr = fuse_xattr_handlers;
@@ -1116,19 +1146,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
1116 if (sb->s_user_ns != &init_user_ns) 1146 if (sb->s_user_ns != &init_user_ns)
1117 sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; 1147 sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER;
1118 1148
1119 file = fget(d.fd);
1120 err = -EINVAL;
1121 if (!file)
1122 goto err;
1123
1124 /*
1125 * Require mount to happen from the same user namespace which
1126 * opened /dev/fuse to prevent potential attacks.
1127 */
1128 if (file->f_op != &fuse_dev_operations ||
1129 file->f_cred->user_ns != sb->s_user_ns)
1130 goto err_fput;
1131
1132 /* 1149 /*
1133 * If we are not in the initial user namespace posix 1150 * If we are not in the initial user namespace posix
1134 * acls must be translated. 1151 * acls must be translated.
@@ -1136,17 +1153,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
1136 if (sb->s_user_ns != &init_user_ns) 1153 if (sb->s_user_ns != &init_user_ns)
1137 sb->s_xattr = fuse_no_acl_xattr_handlers; 1154 sb->s_xattr = fuse_no_acl_xattr_handlers;
1138 1155
1139 fc = kmalloc(sizeof(*fc), GFP_KERNEL); 1156 fud = fuse_dev_alloc_install(fc);
1140 err = -ENOMEM;
1141 if (!fc)
1142 goto err_fput;
1143
1144 fuse_conn_init(fc, sb->s_user_ns);
1145 fc->release = fuse_free_conn;
1146
1147 fud = fuse_dev_alloc(fc);
1148 if (!fud) 1157 if (!fud)
1149 goto err_put_conn; 1158 goto err;
1150 1159
1151 fc->dev = sb->s_dev; 1160 fc->dev = sb->s_dev;
1152 fc->sb = sb; 1161 fc->sb = sb;
@@ -1159,17 +1168,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
1159 fc->dont_mask = 1; 1168 fc->dont_mask = 1;
1160 sb->s_flags |= SB_POSIXACL; 1169 sb->s_flags |= SB_POSIXACL;
1161 1170
1162 fc->default_permissions = d.default_permissions; 1171 fc->default_permissions = ctx->default_permissions;
1163 fc->allow_other = d.allow_other; 1172 fc->allow_other = ctx->allow_other;
1164 fc->user_id = d.user_id; 1173 fc->user_id = ctx->user_id;
1165 fc->group_id = d.group_id; 1174 fc->group_id = ctx->group_id;
1166 fc->max_read = max_t(unsigned, 4096, d.max_read); 1175 fc->max_read = max_t(unsigned, 4096, ctx->max_read);
1167 1176 fc->destroy = ctx->destroy;
1168 /* Used by get_root_inode() */ 1177 fc->no_control = ctx->no_control;
1169 sb->s_fs_info = fc; 1178 fc->no_force_umount = ctx->no_force_umount;
1170 1179
1171 err = -ENOMEM; 1180 err = -ENOMEM;
1172 root = fuse_get_root_inode(sb, d.rootmode); 1181 root = fuse_get_root_inode(sb, ctx->rootmode);
1173 sb->s_d_op = &fuse_root_dentry_operations; 1182 sb->s_d_op = &fuse_root_dentry_operations;
1174 root_dentry = d_make_root(root); 1183 root_dentry = d_make_root(root);
1175 if (!root_dentry) 1184 if (!root_dentry)
@@ -1177,20 +1186,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
1177 /* Root dentry doesn't have .d_revalidate */ 1186 /* Root dentry doesn't have .d_revalidate */
1178 sb->s_d_op = &fuse_dentry_operations; 1187 sb->s_d_op = &fuse_dentry_operations;
1179 1188
1180 init_req = fuse_request_alloc(0);
1181 if (!init_req)
1182 goto err_put_root;
1183 __set_bit(FR_BACKGROUND, &init_req->flags);
1184
1185 if (is_bdev) {
1186 fc->destroy_req = fuse_request_alloc(0);
1187 if (!fc->destroy_req)
1188 goto err_free_init_req;
1189 }
1190
1191 mutex_lock(&fuse_mutex); 1189 mutex_lock(&fuse_mutex);
1192 err = -EINVAL; 1190 err = -EINVAL;
1193 if (file->private_data) 1191 if (*ctx->fudptr)
1194 goto err_unlock; 1192 goto err_unlock;
1195 1193
1196 err = fuse_ctl_add_conn(fc); 1194 err = fuse_ctl_add_conn(fc);
@@ -1199,27 +1197,62 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
1199 1197
1200 list_add_tail(&fc->entry, &fuse_conn_list); 1198 list_add_tail(&fc->entry, &fuse_conn_list);
1201 sb->s_root = root_dentry; 1199 sb->s_root = root_dentry;
1202 file->private_data = fud; 1200 *ctx->fudptr = fud;
1203 mutex_unlock(&fuse_mutex); 1201 mutex_unlock(&fuse_mutex);
1202 return 0;
1203
1204 err_unlock:
1205 mutex_unlock(&fuse_mutex);
1206 dput(root_dentry);
1207 err_dev_free:
1208 fuse_dev_free(fud);
1209 err:
1210 return err;
1211}
1212EXPORT_SYMBOL_GPL(fuse_fill_super_common);
1213
1214static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc)
1215{
1216 struct fuse_fs_context *ctx = fsc->fs_private;
1217 struct file *file;
1218 int err;
1219 struct fuse_conn *fc;
1220
1221 err = -EINVAL;
1222 file = fget(ctx->fd);
1223 if (!file)
1224 goto err;
1225
1226 /*
1227 * Require mount to happen from the same user namespace which
1228 * opened /dev/fuse to prevent potential attacks.
1229 */
1230 if ((file->f_op != &fuse_dev_operations) ||
1231 (file->f_cred->user_ns != sb->s_user_ns))
1232 goto err_fput;
1233 ctx->fudptr = &file->private_data;
1234
1235 fc = kmalloc(sizeof(*fc), GFP_KERNEL);
1236 err = -ENOMEM;
1237 if (!fc)
1238 goto err_fput;
1239
1240 fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL);
1241 fc->release = fuse_free_conn;
1242 sb->s_fs_info = fc;
1243
1244 err = fuse_fill_super_common(sb, ctx);
1245 if (err)
1246 goto err_put_conn;
1204 /* 1247 /*
1205 * atomic_dec_and_test() in fput() provides the necessary 1248 * atomic_dec_and_test() in fput() provides the necessary
1206 * memory barrier for file->private_data to be visible on all 1249 * memory barrier for file->private_data to be visible on all
1207 * CPUs after this 1250 * CPUs after this
1208 */ 1251 */
1209 fput(file); 1252 fput(file);
1210 1253 fuse_send_init(get_fuse_conn_super(sb));
1211 fuse_send_init(fc, init_req);
1212
1213 return 0; 1254 return 0;
1214 1255
1215 err_unlock:
1216 mutex_unlock(&fuse_mutex);
1217 err_free_init_req:
1218 fuse_request_free(init_req);
1219 err_put_root:
1220 dput(root_dentry);
1221 err_dev_free:
1222 fuse_dev_free(fud);
1223 err_put_conn: 1256 err_put_conn:
1224 fuse_conn_put(fc); 1257 fuse_conn_put(fc);
1225 sb->s_fs_info = NULL; 1258 sb->s_fs_info = NULL;
@@ -1229,11 +1262,52 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
1229 return err; 1262 return err;
1230} 1263}
1231 1264
1232static struct dentry *fuse_mount(struct file_system_type *fs_type, 1265static int fuse_get_tree(struct fs_context *fc)
1233 int flags, const char *dev_name, 1266{
1234 void *raw_data) 1267 struct fuse_fs_context *ctx = fc->fs_private;
1268
1269 if (!ctx->fd_present || !ctx->rootmode_present ||
1270 !ctx->user_id_present || !ctx->group_id_present)
1271 return -EINVAL;
1272
1273#ifdef CONFIG_BLOCK
1274 if (ctx->is_bdev)
1275 return get_tree_bdev(fc, fuse_fill_super);
1276#endif
1277
1278 return get_tree_nodev(fc, fuse_fill_super);
1279}
1280
1281static const struct fs_context_operations fuse_context_ops = {
1282 .free = fuse_free_fc,
1283 .parse_param = fuse_parse_param,
1284 .get_tree = fuse_get_tree,
1285};
1286
1287/*
1288 * Set up the filesystem mount context.
1289 */
1290static int fuse_init_fs_context(struct fs_context *fc)
1235{ 1291{
1236 return mount_nodev(fs_type, flags, raw_data, fuse_fill_super); 1292 struct fuse_fs_context *ctx;
1293
1294 ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL);
1295 if (!ctx)
1296 return -ENOMEM;
1297
1298 ctx->max_read = ~0;
1299 ctx->blksize = FUSE_DEFAULT_BLKSIZE;
1300
1301#ifdef CONFIG_BLOCK
1302 if (fc->fs_type == &fuseblk_fs_type) {
1303 ctx->is_bdev = true;
1304 ctx->destroy = true;
1305 }
1306#endif
1307
1308 fc->fs_private = ctx;
1309 fc->ops = &fuse_context_ops;
1310 return 0;
1237} 1311}
1238 1312
1239static void fuse_sb_destroy(struct super_block *sb) 1313static void fuse_sb_destroy(struct super_block *sb)
@@ -1241,7 +1315,8 @@ static void fuse_sb_destroy(struct super_block *sb)
1241 struct fuse_conn *fc = get_fuse_conn_super(sb); 1315 struct fuse_conn *fc = get_fuse_conn_super(sb);
1242 1316
1243 if (fc) { 1317 if (fc) {
1244 fuse_send_destroy(fc); 1318 if (fc->destroy)
1319 fuse_send_destroy(fc);
1245 1320
1246 fuse_abort_conn(fc); 1321 fuse_abort_conn(fc);
1247 fuse_wait_aborted(fc); 1322 fuse_wait_aborted(fc);
@@ -1252,29 +1327,24 @@ static void fuse_sb_destroy(struct super_block *sb)
1252 } 1327 }
1253} 1328}
1254 1329
1255static void fuse_kill_sb_anon(struct super_block *sb) 1330void fuse_kill_sb_anon(struct super_block *sb)
1256{ 1331{
1257 fuse_sb_destroy(sb); 1332 fuse_sb_destroy(sb);
1258 kill_anon_super(sb); 1333 kill_anon_super(sb);
1259} 1334}
1335EXPORT_SYMBOL_GPL(fuse_kill_sb_anon);
1260 1336
1261static struct file_system_type fuse_fs_type = { 1337static struct file_system_type fuse_fs_type = {
1262 .owner = THIS_MODULE, 1338 .owner = THIS_MODULE,
1263 .name = "fuse", 1339 .name = "fuse",
1264 .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, 1340 .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
1265 .mount = fuse_mount, 1341 .init_fs_context = fuse_init_fs_context,
1342 .parameters = &fuse_fs_parameters,
1266 .kill_sb = fuse_kill_sb_anon, 1343 .kill_sb = fuse_kill_sb_anon,
1267}; 1344};
1268MODULE_ALIAS_FS("fuse"); 1345MODULE_ALIAS_FS("fuse");
1269 1346
1270#ifdef CONFIG_BLOCK 1347#ifdef CONFIG_BLOCK
1271static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
1272 int flags, const char *dev_name,
1273 void *raw_data)
1274{
1275 return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
1276}
1277
1278static void fuse_kill_sb_blk(struct super_block *sb) 1348static void fuse_kill_sb_blk(struct super_block *sb)
1279{ 1349{
1280 fuse_sb_destroy(sb); 1350 fuse_sb_destroy(sb);
@@ -1284,7 +1354,8 @@ static void fuse_kill_sb_blk(struct super_block *sb)
1284static struct file_system_type fuseblk_fs_type = { 1354static struct file_system_type fuseblk_fs_type = {
1285 .owner = THIS_MODULE, 1355 .owner = THIS_MODULE,
1286 .name = "fuseblk", 1356 .name = "fuseblk",
1287 .mount = fuse_mount_blk, 1357 .init_fs_context = fuse_init_fs_context,
1358 .parameters = &fuse_fs_parameters,
1288 .kill_sb = fuse_kill_sb_blk, 1359 .kill_sb = fuse_kill_sb_blk,
1289 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, 1360 .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
1290}; 1361};
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 574d03f8a573..5c38b9d84c6e 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -249,6 +249,27 @@ retry:
249 return 0; 249 return 0;
250} 250}
251 251
252static void fuse_force_forget(struct file *file, u64 nodeid)
253{
254 struct inode *inode = file_inode(file);
255 struct fuse_conn *fc = get_fuse_conn(inode);
256 struct fuse_forget_in inarg;
257 FUSE_ARGS(args);
258
259 memset(&inarg, 0, sizeof(inarg));
260 inarg.nlookup = 1;
261 args.opcode = FUSE_FORGET;
262 args.nodeid = nodeid;
263 args.in_numargs = 1;
264 args.in_args[0].size = sizeof(inarg);
265 args.in_args[0].value = &inarg;
266 args.force = true;
267 args.noreply = true;
268
269 fuse_simple_request(fc, &args);
270 /* ignore errors */
271}
272
252static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, 273static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
253 struct dir_context *ctx, u64 attr_version) 274 struct dir_context *ctx, u64 attr_version)
254{ 275{
@@ -295,62 +316,55 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
295 316
296static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) 317static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
297{ 318{
298 int plus, err; 319 int plus;
299 size_t nbytes; 320 ssize_t res;
300 struct page *page; 321 struct page *page;
301 struct inode *inode = file_inode(file); 322 struct inode *inode = file_inode(file);
302 struct fuse_conn *fc = get_fuse_conn(inode); 323 struct fuse_conn *fc = get_fuse_conn(inode);
303 struct fuse_req *req; 324 struct fuse_io_args ia = {};
325 struct fuse_args_pages *ap = &ia.ap;
326 struct fuse_page_desc desc = { .length = PAGE_SIZE };
304 u64 attr_version = 0; 327 u64 attr_version = 0;
305 bool locked; 328 bool locked;
306 329
307 req = fuse_get_req(fc, 1);
308 if (IS_ERR(req))
309 return PTR_ERR(req);
310
311 page = alloc_page(GFP_KERNEL); 330 page = alloc_page(GFP_KERNEL);
312 if (!page) { 331 if (!page)
313 fuse_put_request(fc, req);
314 return -ENOMEM; 332 return -ENOMEM;
315 }
316 333
317 plus = fuse_use_readdirplus(inode, ctx); 334 plus = fuse_use_readdirplus(inode, ctx);
318 req->out.argpages = 1; 335 ap->args.out_pages = 1;
319 req->num_pages = 1; 336 ap->num_pages = 1;
320 req->pages[0] = page; 337 ap->pages = &page;
321 req->page_descs[0].length = PAGE_SIZE; 338 ap->descs = &desc;
322 if (plus) { 339 if (plus) {
323 attr_version = fuse_get_attr_version(fc); 340 attr_version = fuse_get_attr_version(fc);
324 fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, 341 fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
325 FUSE_READDIRPLUS); 342 FUSE_READDIRPLUS);
326 } else { 343 } else {
327 fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, 344 fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
328 FUSE_READDIR); 345 FUSE_READDIR);
329 } 346 }
330 locked = fuse_lock_inode(inode); 347 locked = fuse_lock_inode(inode);
331 fuse_request_send(fc, req); 348 res = fuse_simple_request(fc, &ap->args);
332 fuse_unlock_inode(inode, locked); 349 fuse_unlock_inode(inode, locked);
333 nbytes = req->out.args[0].size; 350 if (res >= 0) {
334 err = req->out.h.error; 351 if (!res) {
335 fuse_put_request(fc, req);
336 if (!err) {
337 if (!nbytes) {
338 struct fuse_file *ff = file->private_data; 352 struct fuse_file *ff = file->private_data;
339 353
340 if (ff->open_flags & FOPEN_CACHE_DIR) 354 if (ff->open_flags & FOPEN_CACHE_DIR)
341 fuse_readdir_cache_end(file, ctx->pos); 355 fuse_readdir_cache_end(file, ctx->pos);
342 } else if (plus) { 356 } else if (plus) {
343 err = parse_dirplusfile(page_address(page), nbytes, 357 res = parse_dirplusfile(page_address(page), res,
344 file, ctx, attr_version); 358 file, ctx, attr_version);
345 } else { 359 } else {
346 err = parse_dirfile(page_address(page), nbytes, file, 360 res = parse_dirfile(page_address(page), res, file,
347 ctx); 361 ctx);
348 } 362 }
349 } 363 }
350 364
351 __free_page(page); 365 __free_page(page);
352 fuse_invalidate_atime(inode); 366 fuse_invalidate_atime(inode);
353 return err; 367 return res;
354} 368}
355 369
356enum fuse_parse_result { 370enum fuse_parse_result {
@@ -372,11 +386,13 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff,
372 for (;;) { 386 for (;;) {
373 struct fuse_dirent *dirent = addr + offset; 387 struct fuse_dirent *dirent = addr + offset;
374 unsigned int nbytes = size - offset; 388 unsigned int nbytes = size - offset;
375 size_t reclen = FUSE_DIRENT_SIZE(dirent); 389 size_t reclen;
376 390
377 if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen) 391 if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen)
378 break; 392 break;
379 393
394 reclen = FUSE_DIRENT_SIZE(dirent); /* derefs ->namelen */
395
380 if (WARN_ON(dirent->namelen > FUSE_NAME_MAX)) 396 if (WARN_ON(dirent->namelen > FUSE_NAME_MAX))
381 return FOUND_ERR; 397 return FOUND_ERR;
382 if (WARN_ON(reclen > nbytes)) 398 if (WARN_ON(reclen > nbytes))
diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c
index 433717640f78..20d052e08b3b 100644
--- a/fs/fuse/xattr.c
+++ b/fs/fuse/xattr.c
@@ -25,15 +25,15 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value,
25 memset(&inarg, 0, sizeof(inarg)); 25 memset(&inarg, 0, sizeof(inarg));
26 inarg.size = size; 26 inarg.size = size;
27 inarg.flags = flags; 27 inarg.flags = flags;
28 args.in.h.opcode = FUSE_SETXATTR; 28 args.opcode = FUSE_SETXATTR;
29 args.in.h.nodeid = get_node_id(inode); 29 args.nodeid = get_node_id(inode);
30 args.in.numargs = 3; 30 args.in_numargs = 3;
31 args.in.args[0].size = sizeof(inarg); 31 args.in_args[0].size = sizeof(inarg);
32 args.in.args[0].value = &inarg; 32 args.in_args[0].value = &inarg;
33 args.in.args[1].size = strlen(name) + 1; 33 args.in_args[1].size = strlen(name) + 1;
34 args.in.args[1].value = name; 34 args.in_args[1].value = name;
35 args.in.args[2].size = size; 35 args.in_args[2].size = size;
36 args.in.args[2].value = value; 36 args.in_args[2].value = value;
37 err = fuse_simple_request(fc, &args); 37 err = fuse_simple_request(fc, &args);
38 if (err == -ENOSYS) { 38 if (err == -ENOSYS) {
39 fc->no_setxattr = 1; 39 fc->no_setxattr = 1;
@@ -60,22 +60,22 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value,
60 60
61 memset(&inarg, 0, sizeof(inarg)); 61 memset(&inarg, 0, sizeof(inarg));
62 inarg.size = size; 62 inarg.size = size;
63 args.in.h.opcode = FUSE_GETXATTR; 63 args.opcode = FUSE_GETXATTR;
64 args.in.h.nodeid = get_node_id(inode); 64 args.nodeid = get_node_id(inode);
65 args.in.numargs = 2; 65 args.in_numargs = 2;
66 args.in.args[0].size = sizeof(inarg); 66 args.in_args[0].size = sizeof(inarg);
67 args.in.args[0].value = &inarg; 67 args.in_args[0].value = &inarg;
68 args.in.args[1].size = strlen(name) + 1; 68 args.in_args[1].size = strlen(name) + 1;
69 args.in.args[1].value = name; 69 args.in_args[1].value = name;
70 /* This is really two different operations rolled into one */ 70 /* This is really two different operations rolled into one */
71 args.out.numargs = 1; 71 args.out_numargs = 1;
72 if (size) { 72 if (size) {
73 args.out.argvar = 1; 73 args.out_argvar = true;
74 args.out.args[0].size = size; 74 args.out_args[0].size = size;
75 args.out.args[0].value = value; 75 args.out_args[0].value = value;
76 } else { 76 } else {
77 args.out.args[0].size = sizeof(outarg); 77 args.out_args[0].size = sizeof(outarg);
78 args.out.args[0].value = &outarg; 78 args.out_args[0].value = &outarg;
79 } 79 }
80 ret = fuse_simple_request(fc, &args); 80 ret = fuse_simple_request(fc, &args);
81 if (!ret && !size) 81 if (!ret && !size)
@@ -121,20 +121,20 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
121 121
122 memset(&inarg, 0, sizeof(inarg)); 122 memset(&inarg, 0, sizeof(inarg));
123 inarg.size = size; 123 inarg.size = size;
124 args.in.h.opcode = FUSE_LISTXATTR; 124 args.opcode = FUSE_LISTXATTR;
125 args.in.h.nodeid = get_node_id(inode); 125 args.nodeid = get_node_id(inode);
126 args.in.numargs = 1; 126 args.in_numargs = 1;
127 args.in.args[0].size = sizeof(inarg); 127 args.in_args[0].size = sizeof(inarg);
128 args.in.args[0].value = &inarg; 128 args.in_args[0].value = &inarg;
129 /* This is really two different operations rolled into one */ 129 /* This is really two different operations rolled into one */
130 args.out.numargs = 1; 130 args.out_numargs = 1;
131 if (size) { 131 if (size) {
132 args.out.argvar = 1; 132 args.out_argvar = true;
133 args.out.args[0].size = size; 133 args.out_args[0].size = size;
134 args.out.args[0].value = list; 134 args.out_args[0].value = list;
135 } else { 135 } else {
136 args.out.args[0].size = sizeof(outarg); 136 args.out_args[0].size = sizeof(outarg);
137 args.out.args[0].value = &outarg; 137 args.out_args[0].value = &outarg;
138 } 138 }
139 ret = fuse_simple_request(fc, &args); 139 ret = fuse_simple_request(fc, &args);
140 if (!ret && !size) 140 if (!ret && !size)
@@ -157,11 +157,11 @@ int fuse_removexattr(struct inode *inode, const char *name)
157 if (fc->no_removexattr) 157 if (fc->no_removexattr)
158 return -EOPNOTSUPP; 158 return -EOPNOTSUPP;
159 159
160 args.in.h.opcode = FUSE_REMOVEXATTR; 160 args.opcode = FUSE_REMOVEXATTR;
161 args.in.h.nodeid = get_node_id(inode); 161 args.nodeid = get_node_id(inode);
162 args.in.numargs = 1; 162 args.in_numargs = 1;
163 args.in.args[0].size = strlen(name) + 1; 163 args.in_args[0].size = strlen(name) + 1;
164 args.in.args[0].value = name; 164 args.in_args[0].value = name;
165 err = fuse_simple_request(fc, &args); 165 err = fuse_simple_request(fc, &args);
166 if (err == -ENOSYS) { 166 if (err == -ENOSYS) {
167 fc->no_removexattr = 1; 167 fc->no_removexattr = 1;
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 10517cea9682..1fc28c2da279 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -24,7 +24,7 @@
24 24
25struct iomap_dio { 25struct iomap_dio {
26 struct kiocb *iocb; 26 struct kiocb *iocb;
27 iomap_dio_end_io_t *end_io; 27 const struct iomap_dio_ops *dops;
28 loff_t i_size; 28 loff_t i_size;
29 loff_t size; 29 loff_t size;
30 atomic_t ref; 30 atomic_t ref;
@@ -72,18 +72,14 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap,
72 72
73static ssize_t iomap_dio_complete(struct iomap_dio *dio) 73static ssize_t iomap_dio_complete(struct iomap_dio *dio)
74{ 74{
75 const struct iomap_dio_ops *dops = dio->dops;
75 struct kiocb *iocb = dio->iocb; 76 struct kiocb *iocb = dio->iocb;
76 struct inode *inode = file_inode(iocb->ki_filp); 77 struct inode *inode = file_inode(iocb->ki_filp);
77 loff_t offset = iocb->ki_pos; 78 loff_t offset = iocb->ki_pos;
78 ssize_t ret; 79 ssize_t ret = dio->error;
79 80
80 if (dio->end_io) { 81 if (dops && dops->end_io)
81 ret = dio->end_io(iocb, 82 ret = dops->end_io(iocb, dio->size, ret, dio->flags);
82 dio->error ? dio->error : dio->size,
83 dio->flags);
84 } else {
85 ret = dio->error;
86 }
87 83
88 if (likely(!ret)) { 84 if (likely(!ret)) {
89 ret = dio->size; 85 ret = dio->size;
@@ -101,9 +97,9 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio)
101 * one is a pretty crazy thing to do, so we don't support it 100%. If 97 * one is a pretty crazy thing to do, so we don't support it 100%. If
102 * this invalidation fails, tough, the write still worked... 98 * this invalidation fails, tough, the write still worked...
103 * 99 *
104 * And this page cache invalidation has to be after dio->end_io(), as 100 * And this page cache invalidation has to be after ->end_io(), as some
105 * some filesystems convert unwritten extents to real allocations in 101 * filesystems convert unwritten extents to real allocations in
106 * end_io() when necessary, otherwise a racing buffer read would cache 102 * ->end_io() when necessary, otherwise a racing buffer read would cache
107 * zeros from unwritten extents. 103 * zeros from unwritten extents.
108 */ 104 */
109 if (!dio->error && 105 if (!dio->error &&
@@ -396,7 +392,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
396 */ 392 */
397ssize_t 393ssize_t
398iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, 394iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
399 const struct iomap_ops *ops, iomap_dio_end_io_t end_io) 395 const struct iomap_ops *ops, const struct iomap_dio_ops *dops)
400{ 396{
401 struct address_space *mapping = iocb->ki_filp->f_mapping; 397 struct address_space *mapping = iocb->ki_filp->f_mapping;
402 struct inode *inode = file_inode(iocb->ki_filp); 398 struct inode *inode = file_inode(iocb->ki_filp);
@@ -421,7 +417,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
421 atomic_set(&dio->ref, 1); 417 atomic_set(&dio->ref, 1);
422 dio->size = 0; 418 dio->size = 0;
423 dio->i_size = i_size_read(inode); 419 dio->i_size = i_size_read(inode);
424 dio->end_io = end_io; 420 dio->dops = dops;
425 dio->error = 0; 421 dio->error = 0;
426 dio->flags = 0; 422 dio->flags = 0;
427 423
diff --git a/fs/namespace.c b/fs/namespace.c
index abcdc5f44865..fe0e9e1410fe 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2802,8 +2802,6 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
2802 put_filesystem(type); 2802 put_filesystem(type);
2803 return -EINVAL; 2803 return -EINVAL;
2804 } 2804 }
2805 } else {
2806 subtype = "";
2807 } 2805 }
2808 } 2806 }
2809 2807
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index e16fb8f2049e..273ee82d8aa9 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -88,7 +88,7 @@ static inline void mangle(struct seq_file *m, const char *s)
88static void show_type(struct seq_file *m, struct super_block *sb) 88static void show_type(struct seq_file *m, struct super_block *sb)
89{ 89{
90 mangle(m, sb->s_type->name); 90 mangle(m, sb->s_type->name);
91 if (sb->s_subtype && sb->s_subtype[0]) { 91 if (sb->s_subtype) {
92 seq_putc(m, '.'); 92 seq_putc(m, '.');
93 mangle(m, sb->s_subtype); 93 mangle(m, sb->s_subtype);
94 } 94 }
diff --git a/fs/super.c b/fs/super.c
index 8020974b2a68..f627b7c53d2b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1555,11 +1555,6 @@ int vfs_get_tree(struct fs_context *fc)
1555 sb = fc->root->d_sb; 1555 sb = fc->root->d_sb;
1556 WARN_ON(!sb->s_bdi); 1556 WARN_ON(!sb->s_bdi);
1557 1557
1558 if (fc->subtype && !sb->s_subtype) {
1559 sb->s_subtype = fc->subtype;
1560 fc->subtype = NULL;
1561 }
1562
1563 /* 1558 /*
1564 * Write barrier is for super_cache_count(). We place it before setting 1559 * Write barrier is for super_cache_count(). We place it before setting
1565 * SB_BORN as the data dependency between the two functions is the 1560 * SB_BORN as the data dependency between the two functions is the
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index d952d5962e93..1ffb179f35d2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -370,21 +370,23 @@ static int
370xfs_dio_write_end_io( 370xfs_dio_write_end_io(
371 struct kiocb *iocb, 371 struct kiocb *iocb,
372 ssize_t size, 372 ssize_t size,
373 int error,
373 unsigned flags) 374 unsigned flags)
374{ 375{
375 struct inode *inode = file_inode(iocb->ki_filp); 376 struct inode *inode = file_inode(iocb->ki_filp);
376 struct xfs_inode *ip = XFS_I(inode); 377 struct xfs_inode *ip = XFS_I(inode);
377 loff_t offset = iocb->ki_pos; 378 loff_t offset = iocb->ki_pos;
378 unsigned int nofs_flag; 379 unsigned int nofs_flag;
379 int error = 0;
380 380
381 trace_xfs_end_io_direct_write(ip, offset, size); 381 trace_xfs_end_io_direct_write(ip, offset, size);
382 382
383 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 383 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
384 return -EIO; 384 return -EIO;
385 385
386 if (size <= 0) 386 if (error)
387 return size; 387 return error;
388 if (!size)
389 return 0;
388 390
389 /* 391 /*
390 * Capture amount written on completion as we can't reliably account 392 * Capture amount written on completion as we can't reliably account
@@ -441,6 +443,10 @@ out:
441 return error; 443 return error;
442} 444}
443 445
446static const struct iomap_dio_ops xfs_dio_write_ops = {
447 .end_io = xfs_dio_write_end_io,
448};
449
444/* 450/*
445 * xfs_file_dio_aio_write - handle direct IO writes 451 * xfs_file_dio_aio_write - handle direct IO writes
446 * 452 *
@@ -541,7 +547,7 @@ xfs_file_dio_aio_write(
541 } 547 }
542 548
543 trace_xfs_file_direct_write(ip, count, iocb->ki_pos); 549 trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
544 ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); 550 ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops);
545 551
546 /* 552 /*
547 * If unaligned, this is the only IO in-flight. If it has not yet 553 * If unaligned, this is the only IO in-flight. If it has not yet
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 82156da3c650..b9dbda1c26aa 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -293,6 +293,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private);
293struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); 293struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client);
294u64 ceph_client_gid(struct ceph_client *client); 294u64 ceph_client_gid(struct ceph_client *client);
295extern void ceph_destroy_client(struct ceph_client *client); 295extern void ceph_destroy_client(struct ceph_client *client);
296extern void ceph_reset_client_addr(struct ceph_client *client);
296extern int __ceph_open_session(struct ceph_client *client, 297extern int __ceph_open_session(struct ceph_client *client,
297 unsigned long started); 298 unsigned long started);
298extern int ceph_open_session(struct ceph_client *client); 299extern int ceph_open_session(struct ceph_client *client);
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 23895d178149..c4458dc6a757 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -337,6 +337,7 @@ extern void ceph_msgr_flush(void);
337extern void ceph_messenger_init(struct ceph_messenger *msgr, 337extern void ceph_messenger_init(struct ceph_messenger *msgr,
338 struct ceph_entity_addr *myaddr); 338 struct ceph_entity_addr *myaddr);
339extern void ceph_messenger_fini(struct ceph_messenger *msgr); 339extern void ceph_messenger_fini(struct ceph_messenger *msgr);
340extern void ceph_messenger_reset_nonce(struct ceph_messenger *msgr);
340 341
341extern void ceph_con_init(struct ceph_connection *con, void *private, 342extern void ceph_con_init(struct ceph_connection *con, void *private,
342 const struct ceph_connection_operations *ops, 343 const struct ceph_connection_operations *ops,
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index b4d134d3312a..dbb8a6959a73 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -109,6 +109,7 @@ extern int ceph_monmap_contains(struct ceph_monmap *m,
109 109
110extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); 110extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
111extern void ceph_monc_stop(struct ceph_mon_client *monc); 111extern void ceph_monc_stop(struct ceph_mon_client *monc);
112extern void ceph_monc_reopen_session(struct ceph_mon_client *monc);
112 113
113enum { 114enum {
114 CEPH_SUB_MONMAP = 0, 115 CEPH_SUB_MONMAP = 0,
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index ad7fe5d10dcd..eaffbdddf89a 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -381,6 +381,7 @@ extern void ceph_osdc_cleanup(void);
381extern int ceph_osdc_init(struct ceph_osd_client *osdc, 381extern int ceph_osdc_init(struct ceph_osd_client *osdc,
382 struct ceph_client *client); 382 struct ceph_client *client);
383extern void ceph_osdc_stop(struct ceph_osd_client *osdc); 383extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
384extern void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc);
384 385
385extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, 386extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
386 struct ceph_msg *msg); 387 struct ceph_msg *msg);
@@ -388,6 +389,7 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
388 struct ceph_msg *msg); 389 struct ceph_msg *msg);
389void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); 390void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb);
390void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err); 391void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err);
392void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc);
391 393
392#define osd_req_op_data(oreq, whch, typ, fld) \ 394#define osd_req_op_data(oreq, whch, typ, fld) \
393({ \ 395({ \
diff --git a/include/linux/fs_context.h b/include/linux/fs_context.h
index 0424df7f6e6b..e5c14e2c53d3 100644
--- a/include/linux/fs_context.h
+++ b/include/linux/fs_context.h
@@ -95,7 +95,6 @@ struct fs_context {
95 const struct cred *cred; /* The mounter's credentials */ 95 const struct cred *cred; /* The mounter's credentials */
96 struct fc_log *log; /* Logging buffer */ 96 struct fc_log *log; /* Logging buffer */
97 const char *source; /* The source name (eg. dev path) */ 97 const char *source; /* The source name (eg. dev path) */
98 const char *subtype; /* The subtype to set on the superblock */
99 void *security; /* Linux S&M options */ 98 void *security; /* Linux S&M options */
100 void *s_fs_info; /* Proposed s_fs_info */ 99 void *s_fs_info; /* Proposed s_fs_info */
101 unsigned int sb_flags; /* Proposed superblock flags (SB_*) */ 100 unsigned int sb_flags; /* Proposed superblock flags (SB_*) */
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index bc499ceae392..7aa5d6117936 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -188,10 +188,14 @@ sector_t iomap_bmap(struct address_space *mapping, sector_t bno,
188 */ 188 */
189#define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */ 189#define IOMAP_DIO_UNWRITTEN (1 << 0) /* covers unwritten extent(s) */
190#define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */ 190#define IOMAP_DIO_COW (1 << 1) /* covers COW extent(s) */
191typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret, 191
192 unsigned flags); 192struct iomap_dio_ops {
193 int (*end_io)(struct kiocb *iocb, ssize_t size, int error,
194 unsigned flags);
195};
196
193ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, 197ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
194 const struct iomap_ops *ops, iomap_dio_end_io_t end_io); 198 const struct iomap_ops *ops, const struct iomap_dio_ops *dops);
195int iomap_dio_iopoll(struct kiocb *kiocb, bool spin); 199int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
196 200
197#ifdef CONFIG_SWAP 201#ifdef CONFIG_SWAP
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 2971d29a42e4..df2e12fb3381 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -425,6 +425,10 @@ enum fuse_opcode {
425 425
426 /* CUSE specific operations */ 426 /* CUSE specific operations */
427 CUSE_INIT = 4096, 427 CUSE_INIT = 4096,
428
429 /* Reserved opcodes: helpful to detect structure endian-ness */
430 CUSE_INIT_BSWAP_RESERVED = 1048576, /* CUSE_INIT << 8 */
431 FUSE_INIT_BSWAP_RESERVED = 436207616, /* FUSE_INIT << 24 */
428}; 432};
429 433
430enum fuse_notify_code { 434enum fuse_notify_code {
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 4eeea4d5c3ef..2d568246803f 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -13,6 +13,7 @@
13#include <linux/nsproxy.h> 13#include <linux/nsproxy.h>
14#include <linux/parser.h> 14#include <linux/parser.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/sched/mm.h>
16#include <linux/seq_file.h> 17#include <linux/seq_file.h>
17#include <linux/slab.h> 18#include <linux/slab.h>
18#include <linux/statfs.h> 19#include <linux/statfs.h>
@@ -185,18 +186,34 @@ int ceph_compare_options(struct ceph_options *new_opt,
185} 186}
186EXPORT_SYMBOL(ceph_compare_options); 187EXPORT_SYMBOL(ceph_compare_options);
187 188
189/*
190 * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are
191 * compatible with (a superset of) GFP_KERNEL. This is because while the
192 * actual pages are allocated with the specified flags, the page table pages
193 * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take
194 * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc().
195 *
196 * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO.
197 */
188void *ceph_kvmalloc(size_t size, gfp_t flags) 198void *ceph_kvmalloc(size_t size, gfp_t flags)
189{ 199{
190 if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { 200 void *p;
191 void *ptr = kmalloc(size, flags | __GFP_NOWARN); 201
192 if (ptr) 202 if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) {
193 return ptr; 203 p = kvmalloc(size, flags);
204 } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) {
205 unsigned int nofs_flag = memalloc_nofs_save();
206 p = kvmalloc(size, GFP_KERNEL);
207 memalloc_nofs_restore(nofs_flag);
208 } else {
209 unsigned int noio_flag = memalloc_noio_save();
210 p = kvmalloc(size, GFP_KERNEL);
211 memalloc_noio_restore(noio_flag);
194 } 212 }
195 213
196 return __vmalloc(size, flags, PAGE_KERNEL); 214 return p;
197} 215}
198 216
199
200static int parse_fsid(const char *str, struct ceph_fsid *fsid) 217static int parse_fsid(const char *str, struct ceph_fsid *fsid)
201{ 218{
202 int i = 0; 219 int i = 0;
@@ -694,6 +711,14 @@ void ceph_destroy_client(struct ceph_client *client)
694} 711}
695EXPORT_SYMBOL(ceph_destroy_client); 712EXPORT_SYMBOL(ceph_destroy_client);
696 713
714void ceph_reset_client_addr(struct ceph_client *client)
715{
716 ceph_messenger_reset_nonce(&client->msgr);
717 ceph_monc_reopen_session(&client->monc);
718 ceph_osdc_reopen_osds(&client->osdc);
719}
720EXPORT_SYMBOL(ceph_reset_client_addr);
721
697/* 722/*
698 * true if we have the mon map (and have thus joined the cluster) 723 * true if we have the mon map (and have thus joined the cluster)
699 */ 724 */
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 962f521c863e..e4cb3db2ee77 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -3031,6 +3031,12 @@ static void con_fault(struct ceph_connection *con)
3031} 3031}
3032 3032
3033 3033
3034void ceph_messenger_reset_nonce(struct ceph_messenger *msgr)
3035{
3036 u32 nonce = le32_to_cpu(msgr->inst.addr.nonce) + 1000000;
3037 msgr->inst.addr.nonce = cpu_to_le32(nonce);
3038 encode_my_addr(msgr);
3039}
3034 3040
3035/* 3041/*
3036 * initialize a new messenger instance 3042 * initialize a new messenger instance
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 0520bf9825aa..7256c402ebaa 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -213,6 +213,13 @@ static void reopen_session(struct ceph_mon_client *monc)
213 __open_session(monc); 213 __open_session(monc);
214} 214}
215 215
216void ceph_monc_reopen_session(struct ceph_mon_client *monc)
217{
218 mutex_lock(&monc->mutex);
219 reopen_session(monc);
220 mutex_unlock(&monc->mutex);
221}
222
216static void un_backoff(struct ceph_mon_client *monc) 223static void un_backoff(struct ceph_mon_client *monc)
217{ 224{
218 monc->hunt_mult /= 2; /* reduce by 50% */ 225 monc->hunt_mult /= 2; /* reduce by 50% */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 78ae6e8c953d..ba45b074a362 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -841,6 +841,7 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
841 struct ceph_pagelist *pagelist; 841 struct ceph_pagelist *pagelist;
842 size_t payload_len = 0; 842 size_t payload_len = 0;
843 size_t size; 843 size_t size;
844 int ret;
844 845
845 op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); 846 op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0);
846 847
@@ -852,20 +853,27 @@ int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
852 size = strlen(class); 853 size = strlen(class);
853 BUG_ON(size > (size_t) U8_MAX); 854 BUG_ON(size > (size_t) U8_MAX);
854 op->cls.class_len = size; 855 op->cls.class_len = size;
855 ceph_pagelist_append(pagelist, class, size); 856 ret = ceph_pagelist_append(pagelist, class, size);
857 if (ret)
858 goto err_pagelist_free;
856 payload_len += size; 859 payload_len += size;
857 860
858 op->cls.method_name = method; 861 op->cls.method_name = method;
859 size = strlen(method); 862 size = strlen(method);
860 BUG_ON(size > (size_t) U8_MAX); 863 BUG_ON(size > (size_t) U8_MAX);
861 op->cls.method_len = size; 864 op->cls.method_len = size;
862 ceph_pagelist_append(pagelist, method, size); 865 ret = ceph_pagelist_append(pagelist, method, size);
866 if (ret)
867 goto err_pagelist_free;
863 payload_len += size; 868 payload_len += size;
864 869
865 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist); 870 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
866
867 op->indata_len = payload_len; 871 op->indata_len = payload_len;
868 return 0; 872 return 0;
873
874err_pagelist_free:
875 ceph_pagelist_release(pagelist);
876 return ret;
869} 877}
870EXPORT_SYMBOL(osd_req_op_cls_init); 878EXPORT_SYMBOL(osd_req_op_cls_init);
871 879
@@ -877,6 +885,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
877 opcode, 0); 885 opcode, 0);
878 struct ceph_pagelist *pagelist; 886 struct ceph_pagelist *pagelist;
879 size_t payload_len; 887 size_t payload_len;
888 int ret;
880 889
881 BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); 890 BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR);
882 891
@@ -886,10 +895,14 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
886 895
887 payload_len = strlen(name); 896 payload_len = strlen(name);
888 op->xattr.name_len = payload_len; 897 op->xattr.name_len = payload_len;
889 ceph_pagelist_append(pagelist, name, payload_len); 898 ret = ceph_pagelist_append(pagelist, name, payload_len);
899 if (ret)
900 goto err_pagelist_free;
890 901
891 op->xattr.value_len = size; 902 op->xattr.value_len = size;
892 ceph_pagelist_append(pagelist, value, size); 903 ret = ceph_pagelist_append(pagelist, value, size);
904 if (ret)
905 goto err_pagelist_free;
893 payload_len += size; 906 payload_len += size;
894 907
895 op->xattr.cmp_op = cmp_op; 908 op->xattr.cmp_op = cmp_op;
@@ -898,6 +911,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
898 ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); 911 ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
899 op->indata_len = payload_len; 912 op->indata_len = payload_len;
900 return 0; 913 return 0;
914
915err_pagelist_free:
916 ceph_pagelist_release(pagelist);
917 return ret;
901} 918}
902EXPORT_SYMBOL(osd_req_op_xattr_init); 919EXPORT_SYMBOL(osd_req_op_xattr_init);
903 920
@@ -1488,7 +1505,6 @@ enum calc_target_result {
1488 1505
1489static enum calc_target_result calc_target(struct ceph_osd_client *osdc, 1506static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
1490 struct ceph_osd_request_target *t, 1507 struct ceph_osd_request_target *t,
1491 struct ceph_connection *con,
1492 bool any_change) 1508 bool any_change)
1493{ 1509{
1494 struct ceph_pg_pool_info *pi; 1510 struct ceph_pg_pool_info *pi;
@@ -2272,7 +2288,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
2272 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); 2288 dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
2273 2289
2274again: 2290again:
2275 ct_res = calc_target(osdc, &req->r_t, NULL, false); 2291 ct_res = calc_target(osdc, &req->r_t, false);
2276 if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) 2292 if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
2277 goto promote; 2293 goto promote;
2278 2294
@@ -2476,6 +2492,14 @@ void ceph_osdc_abort_requests(struct ceph_osd_client *osdc, int err)
2476} 2492}
2477EXPORT_SYMBOL(ceph_osdc_abort_requests); 2493EXPORT_SYMBOL(ceph_osdc_abort_requests);
2478 2494
2495void ceph_osdc_clear_abort_err(struct ceph_osd_client *osdc)
2496{
2497 down_write(&osdc->lock);
2498 osdc->abort_err = 0;
2499 up_write(&osdc->lock);
2500}
2501EXPORT_SYMBOL(ceph_osdc_clear_abort_err);
2502
2479static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) 2503static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb)
2480{ 2504{
2481 if (likely(eb > osdc->epoch_barrier)) { 2505 if (likely(eb > osdc->epoch_barrier)) {
@@ -3087,7 +3111,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
3087 lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; 3111 lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id;
3088 } 3112 }
3089 3113
3090 calc_target(osdc, &lreq->t, NULL, false); 3114 calc_target(osdc, &lreq->t, false);
3091 osd = lookup_create_osd(osdc, lreq->t.osd, true); 3115 osd = lookup_create_osd(osdc, lreq->t.osd, true);
3092 link_linger(osd, lreq); 3116 link_linger(osd, lreq);
3093 3117
@@ -3704,7 +3728,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq)
3704 struct ceph_osd_client *osdc = lreq->osdc; 3728 struct ceph_osd_client *osdc = lreq->osdc;
3705 enum calc_target_result ct_res; 3729 enum calc_target_result ct_res;
3706 3730
3707 ct_res = calc_target(osdc, &lreq->t, NULL, true); 3731 ct_res = calc_target(osdc, &lreq->t, true);
3708 if (ct_res == CALC_TARGET_NEED_RESEND) { 3732 if (ct_res == CALC_TARGET_NEED_RESEND) {
3709 struct ceph_osd *osd; 3733 struct ceph_osd *osd;
3710 3734
@@ -3776,8 +3800,7 @@ static void scan_requests(struct ceph_osd *osd,
3776 n = rb_next(n); /* unlink_request(), check_pool_dne() */ 3800 n = rb_next(n); /* unlink_request(), check_pool_dne() */
3777 3801
3778 dout("%s req %p tid %llu\n", __func__, req, req->r_tid); 3802 dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
3779 ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con, 3803 ct_res = calc_target(osdc, &req->r_t, false);
3780 false);
3781 switch (ct_res) { 3804 switch (ct_res) {
3782 case CALC_TARGET_NO_ACTION: 3805 case CALC_TARGET_NO_ACTION:
3783 force_resend_writes = cleared_full || 3806 force_resend_writes = cleared_full ||
@@ -3886,7 +3909,7 @@ static void kick_requests(struct ceph_osd_client *osdc,
3886 n = rb_next(n); 3909 n = rb_next(n);
3887 3910
3888 if (req->r_t.epoch < osdc->osdmap->epoch) { 3911 if (req->r_t.epoch < osdc->osdmap->epoch) {
3889 ct_res = calc_target(osdc, &req->r_t, NULL, false); 3912 ct_res = calc_target(osdc, &req->r_t, false);
3890 if (ct_res == CALC_TARGET_POOL_DNE) { 3913 if (ct_res == CALC_TARGET_POOL_DNE) {
3891 erase_request(need_resend, req); 3914 erase_request(need_resend, req);
3892 check_pool_dne(req); 3915 check_pool_dne(req);
@@ -5087,6 +5110,24 @@ out_put_req:
5087EXPORT_SYMBOL(ceph_osdc_call); 5110EXPORT_SYMBOL(ceph_osdc_call);
5088 5111
5089/* 5112/*
5113 * reset all osd connections
5114 */
5115void ceph_osdc_reopen_osds(struct ceph_osd_client *osdc)
5116{
5117 struct rb_node *n;
5118
5119 down_write(&osdc->lock);
5120 for (n = rb_first(&osdc->osds); n; ) {
5121 struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
5122
5123 n = rb_next(n);
5124 if (!reopen_osd(osd))
5125 kick_osd_requests(osd);
5126 }
5127 up_write(&osdc->lock);
5128}
5129
5130/*
5090 * init, shutdown 5131 * init, shutdown
5091 */ 5132 */
5092int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) 5133int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 90437906b7bc..4e0de14f80bb 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -973,11 +973,11 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
973 struct ceph_pg_pool_info, node); 973 struct ceph_pg_pool_info, node);
974 __remove_pg_pool(&map->pg_pools, pi); 974 __remove_pg_pool(&map->pg_pools, pi);
975 } 975 }
976 kfree(map->osd_state); 976 kvfree(map->osd_state);
977 kfree(map->osd_weight); 977 kvfree(map->osd_weight);
978 kfree(map->osd_addr); 978 kvfree(map->osd_addr);
979 kfree(map->osd_primary_affinity); 979 kvfree(map->osd_primary_affinity);
980 kfree(map->crush_workspace); 980 kvfree(map->crush_workspace);
981 kfree(map); 981 kfree(map);
982} 982}
983 983
@@ -986,28 +986,41 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
986 * 986 *
987 * The new elements are properly initialized. 987 * The new elements are properly initialized.
988 */ 988 */
989static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) 989static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max)
990{ 990{
991 u32 *state; 991 u32 *state;
992 u32 *weight; 992 u32 *weight;
993 struct ceph_entity_addr *addr; 993 struct ceph_entity_addr *addr;
994 u32 to_copy;
994 int i; 995 int i;
995 996
996 state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS); 997 dout("%s old %u new %u\n", __func__, map->max_osd, max);
997 if (!state) 998 if (max == map->max_osd)
998 return -ENOMEM; 999 return 0;
999 map->osd_state = state;
1000 1000
1001 weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS); 1001 state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS);
1002 if (!weight) 1002 weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS);
1003 addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS);
1004 if (!state || !weight || !addr) {
1005 kvfree(state);
1006 kvfree(weight);
1007 kvfree(addr);
1003 return -ENOMEM; 1008 return -ENOMEM;
1004 map->osd_weight = weight; 1009 }
1005 1010
1006 addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS); 1011 to_copy = min(map->max_osd, max);
1007 if (!addr) 1012 if (map->osd_state) {
1008 return -ENOMEM; 1013 memcpy(state, map->osd_state, to_copy * sizeof(*state));
1009 map->osd_addr = addr; 1014 memcpy(weight, map->osd_weight, to_copy * sizeof(*weight));
1015 memcpy(addr, map->osd_addr, to_copy * sizeof(*addr));
1016 kvfree(map->osd_state);
1017 kvfree(map->osd_weight);
1018 kvfree(map->osd_addr);
1019 }
1010 1020
1021 map->osd_state = state;
1022 map->osd_weight = weight;
1023 map->osd_addr = addr;
1011 for (i = map->max_osd; i < max; i++) { 1024 for (i = map->max_osd; i < max; i++) {
1012 map->osd_state[i] = 0; 1025 map->osd_state[i] = 0;
1013 map->osd_weight[i] = CEPH_OSD_OUT; 1026 map->osd_weight[i] = CEPH_OSD_OUT;
@@ -1017,12 +1030,16 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
1017 if (map->osd_primary_affinity) { 1030 if (map->osd_primary_affinity) {
1018 u32 *affinity; 1031 u32 *affinity;
1019 1032
1020 affinity = krealloc(map->osd_primary_affinity, 1033 affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)),
1021 max*sizeof(*affinity), GFP_NOFS); 1034 GFP_NOFS);
1022 if (!affinity) 1035 if (!affinity)
1023 return -ENOMEM; 1036 return -ENOMEM;
1024 map->osd_primary_affinity = affinity;
1025 1037
1038 memcpy(affinity, map->osd_primary_affinity,
1039 to_copy * sizeof(*affinity));
1040 kvfree(map->osd_primary_affinity);
1041
1042 map->osd_primary_affinity = affinity;
1026 for (i = map->max_osd; i < max; i++) 1043 for (i = map->max_osd; i < max; i++)
1027 map->osd_primary_affinity[i] = 1044 map->osd_primary_affinity[i] =
1028 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; 1045 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
@@ -1043,7 +1060,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
1043 1060
1044 work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); 1061 work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE);
1045 dout("%s work_size %zu bytes\n", __func__, work_size); 1062 dout("%s work_size %zu bytes\n", __func__, work_size);
1046 workspace = kmalloc(work_size, GFP_NOIO); 1063 workspace = ceph_kvmalloc(work_size, GFP_NOIO);
1047 if (!workspace) { 1064 if (!workspace) {
1048 crush_destroy(crush); 1065 crush_destroy(crush);
1049 return -ENOMEM; 1066 return -ENOMEM;
@@ -1052,7 +1069,7 @@ static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush)
1052 1069
1053 if (map->crush) 1070 if (map->crush)
1054 crush_destroy(map->crush); 1071 crush_destroy(map->crush);
1055 kfree(map->crush_workspace); 1072 kvfree(map->crush_workspace);
1056 map->crush = crush; 1073 map->crush = crush;
1057 map->crush_workspace = workspace; 1074 map->crush_workspace = workspace;
1058 return 0; 1075 return 0;
@@ -1298,9 +1315,9 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
1298 if (!map->osd_primary_affinity) { 1315 if (!map->osd_primary_affinity) {
1299 int i; 1316 int i;
1300 1317
1301 map->osd_primary_affinity = kmalloc_array(map->max_osd, 1318 map->osd_primary_affinity = ceph_kvmalloc(
1302 sizeof(u32), 1319 array_size(map->max_osd, sizeof(*map->osd_primary_affinity)),
1303 GFP_NOFS); 1320 GFP_NOFS);
1304 if (!map->osd_primary_affinity) 1321 if (!map->osd_primary_affinity)
1305 return -ENOMEM; 1322 return -ENOMEM;
1306 1323
@@ -1321,7 +1338,7 @@ static int decode_primary_affinity(void **p, void *end,
1321 1338
1322 ceph_decode_32_safe(p, end, len, e_inval); 1339 ceph_decode_32_safe(p, end, len, e_inval);
1323 if (len == 0) { 1340 if (len == 0) {
1324 kfree(map->osd_primary_affinity); 1341 kvfree(map->osd_primary_affinity);
1325 map->osd_primary_affinity = NULL; 1342 map->osd_primary_affinity = NULL;
1326 return 0; 1343 return 0;
1327 } 1344 }
diff --git a/security/keys/trusted.c b/security/keys/trusted.c
index ade699131065..1fbd77816610 100644
--- a/security/keys/trusted.c
+++ b/security/keys/trusted.c
@@ -1228,11 +1228,16 @@ hashalg_fail:
1228 1228
1229static int __init init_digests(void) 1229static int __init init_digests(void)
1230{ 1230{
1231 int i;
1232
1231 digests = kcalloc(chip->nr_allocated_banks, sizeof(*digests), 1233 digests = kcalloc(chip->nr_allocated_banks, sizeof(*digests),
1232 GFP_KERNEL); 1234 GFP_KERNEL);
1233 if (!digests) 1235 if (!digests)
1234 return -ENOMEM; 1236 return -ENOMEM;
1235 1237
1238 for (i = 0; i < chip->nr_allocated_banks; i++)
1239 digests[i].alg_id = chip->allocated_banks[i].alg_id;
1240
1236 return 0; 1241 return 0;
1237} 1242}
1238 1243
diff --git a/tools/testing/selftests/.gitignore b/tools/testing/selftests/.gitignore
index 8059ce834247..61df01cdf0b2 100644
--- a/tools/testing/selftests/.gitignore
+++ b/tools/testing/selftests/.gitignore
@@ -2,3 +2,5 @@ gpiogpio-event-mon
2gpiogpio-hammer 2gpiogpio-hammer
3gpioinclude/ 3gpioinclude/
4gpiolsgpio 4gpiolsgpio
5tpm2/SpaceTest.log
6tpm2/*.pyc
diff --git a/tools/testing/selftests/tpm2/Makefile b/tools/testing/selftests/tpm2/Makefile
index 9dd848427a7b..bf401f725eef 100644
--- a/tools/testing/selftests/tpm2/Makefile
+++ b/tools/testing/selftests/tpm2/Makefile
@@ -2,3 +2,4 @@
2include ../lib.mk 2include ../lib.mk
3 3
4TEST_PROGS := test_smoke.sh test_space.sh 4TEST_PROGS := test_smoke.sh test_space.sh
5TEST_FILES := tpm2.py tpm2_tests.py