diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-31 17:42:31 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-10-31 17:42:31 -0400 |
commit | 31990f0f5366a8f66688edae8688723b22034108 (patch) | |
tree | 07078a732a5f02d2330f3cb873286f9ac53ea969 | |
parent | a9ac6cc47bbb0fdd042012044f737ba13da10cb4 (diff) | |
parent | ea4cdc548e5e74a529cdd1aea885d74b4aa8f1b3 (diff) |
Merge tag 'ceph-for-4.20-rc1' of git://github.com/ceph/ceph-client
Pull ceph updates from Ilya Dryomov:
"The highlights are:
- a series that fixes some old memory allocation issues in libceph
(myself). We no longer allocate memory in places where allocation
failures cannot be handled and BUG when the allocation fails.
- support for copy_file_range() syscall (Luis Henriques). If size and
alignment conditions are met, it leverages RADOS copy-from
operation. Otherwise, a local copy is performed.
- a patch that reduces memory requirement of ceph_sync_read() from
the size of the entire read to the size of one object (Zheng Yan).
- fallocate() syscall is now restricted to FALLOC_FL_PUNCH_HOLE (Luis
Henriques)"
* tag 'ceph-for-4.20-rc1' of git://github.com/ceph/ceph-client: (25 commits)
ceph: new mount option to disable usage of copy-from op
ceph: support copy_file_range file operation
libceph: support the RADOS copy-from operation
ceph: add non-blocking parameter to ceph_try_get_caps()
libceph: check reply num_data_items in setup_request_data()
libceph: preallocate message data items
libceph, rbd, ceph: move ceph_osdc_alloc_messages() calls
libceph: introduce alloc_watch_request()
libceph: assign cookies in linger_submit()
libceph: enable fallback to ceph_msg_new() in ceph_msgpool_get()
ceph: num_ops is off by one in ceph_aio_retry_work()
libceph: no need to call osd_req_opcode_valid() in osd_req_encode_op()
ceph: set timeout conditionally in __cap_delay_requeue
libceph: don't consume a ref on pagelist in ceph_msg_data_add_pagelist()
libceph: introduce ceph_pagelist_alloc()
libceph: osd_req_op_cls_init() doesn't need to take opcode
libceph: bump CEPH_MSG_MAX_DATA_LEN
ceph: only allow punch hole mode in fallocate
ceph: refactor ceph_sync_read()
ceph: check if LOOKUPNAME request was aborted when filling trace
...
-rw-r--r-- | Documentation/filesystems/ceph.txt | 5 | ||||
-rw-r--r-- | drivers/block/rbd.c | 28 | ||||
-rw-r--r-- | fs/ceph/acl.c | 13 | ||||
-rw-r--r-- | fs/ceph/addr.c | 2 | ||||
-rw-r--r-- | fs/ceph/caps.c | 21 | ||||
-rw-r--r-- | fs/ceph/file.c | 573 | ||||
-rw-r--r-- | fs/ceph/inode.c | 13 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 9 | ||||
-rw-r--r-- | fs/ceph/super.c | 13 | ||||
-rw-r--r-- | fs/ceph/super.h | 3 | ||||
-rw-r--r-- | fs/ceph/xattr.c | 3 | ||||
-rw-r--r-- | include/linux/ceph/libceph.h | 8 | ||||
-rw-r--r-- | include/linux/ceph/messenger.h | 24 | ||||
-rw-r--r-- | include/linux/ceph/msgpool.h | 11 | ||||
-rw-r--r-- | include/linux/ceph/osd_client.h | 22 | ||||
-rw-r--r-- | include/linux/ceph/pagelist.h | 11 | ||||
-rw-r--r-- | include/linux/ceph/rados.h | 28 | ||||
-rw-r--r-- | net/ceph/messenger.c | 107 | ||||
-rw-r--r-- | net/ceph/msgpool.c | 27 | ||||
-rw-r--r-- | net/ceph/osd_client.c | 363 | ||||
-rw-r--r-- | net/ceph/pagelist.c | 20 |
21 files changed, 900 insertions, 404 deletions
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index 8bf62240e10d..1177052701e1 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt | |||
@@ -151,6 +151,11 @@ Mount Options | |||
151 | Report overall filesystem usage in statfs instead of using the root | 151 | Report overall filesystem usage in statfs instead of using the root |
152 | directory quota. | 152 | directory quota. |
153 | 153 | ||
154 | nocopyfrom | ||
155 | Don't use the RADOS 'copy-from' operation to perform remote object | ||
156 | copies. Currently, it's only used in copy_file_range, which will revert | ||
157 | to the default VFS implementation if this option is used. | ||
158 | |||
154 | More Information | 159 | More Information |
155 | ================ | 160 | ================ |
156 | 161 | ||
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 73ed5f3a862d..8e5140bbf241 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c | |||
@@ -1500,9 +1500,6 @@ rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops) | |||
1500 | rbd_dev->header.object_prefix, obj_req->ex.oe_objno)) | 1500 | rbd_dev->header.object_prefix, obj_req->ex.oe_objno)) |
1501 | goto err_req; | 1501 | goto err_req; |
1502 | 1502 | ||
1503 | if (ceph_osdc_alloc_messages(req, GFP_NOIO)) | ||
1504 | goto err_req; | ||
1505 | |||
1506 | return req; | 1503 | return req; |
1507 | 1504 | ||
1508 | err_req: | 1505 | err_req: |
@@ -1945,6 +1942,10 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req) | |||
1945 | } | 1942 | } |
1946 | if (ret) | 1943 | if (ret) |
1947 | return ret; | 1944 | return ret; |
1945 | |||
1946 | ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); | ||
1947 | if (ret) | ||
1948 | return ret; | ||
1948 | } | 1949 | } |
1949 | 1950 | ||
1950 | return 0; | 1951 | return 0; |
@@ -2374,8 +2375,7 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) | |||
2374 | if (!obj_req->osd_req) | 2375 | if (!obj_req->osd_req) |
2375 | return -ENOMEM; | 2376 | return -ENOMEM; |
2376 | 2377 | ||
2377 | ret = osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd", | 2378 | ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup"); |
2378 | "copyup"); | ||
2379 | if (ret) | 2379 | if (ret) |
2380 | return ret; | 2380 | return ret; |
2381 | 2381 | ||
@@ -2405,6 +2405,10 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes) | |||
2405 | rbd_assert(0); | 2405 | rbd_assert(0); |
2406 | } | 2406 | } |
2407 | 2407 | ||
2408 | ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO); | ||
2409 | if (ret) | ||
2410 | return ret; | ||
2411 | |||
2408 | rbd_obj_request_submit(obj_req); | 2412 | rbd_obj_request_submit(obj_req); |
2409 | return 0; | 2413 | return 0; |
2410 | } | 2414 | } |
@@ -3784,10 +3788,6 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, | |||
3784 | ceph_oloc_copy(&req->r_base_oloc, oloc); | 3788 | ceph_oloc_copy(&req->r_base_oloc, oloc); |
3785 | req->r_flags = CEPH_OSD_FLAG_READ; | 3789 | req->r_flags = CEPH_OSD_FLAG_READ; |
3786 | 3790 | ||
3787 | ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); | ||
3788 | if (ret) | ||
3789 | goto out_req; | ||
3790 | |||
3791 | pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); | 3791 | pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); |
3792 | if (IS_ERR(pages)) { | 3792 | if (IS_ERR(pages)) { |
3793 | ret = PTR_ERR(pages); | 3793 | ret = PTR_ERR(pages); |
@@ -3798,6 +3798,10 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, | |||
3798 | osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, | 3798 | osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false, |
3799 | true); | 3799 | true); |
3800 | 3800 | ||
3801 | ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); | ||
3802 | if (ret) | ||
3803 | goto out_req; | ||
3804 | |||
3801 | ceph_osdc_start_request(osdc, req, false); | 3805 | ceph_osdc_start_request(osdc, req, false); |
3802 | ret = ceph_osdc_wait_request(osdc, req); | 3806 | ret = ceph_osdc_wait_request(osdc, req); |
3803 | if (ret >= 0) | 3807 | if (ret >= 0) |
@@ -6067,7 +6071,7 @@ static ssize_t rbd_remove_single_major(struct bus_type *bus, | |||
6067 | * create control files in sysfs | 6071 | * create control files in sysfs |
6068 | * /sys/bus/rbd/... | 6072 | * /sys/bus/rbd/... |
6069 | */ | 6073 | */ |
6070 | static int rbd_sysfs_init(void) | 6074 | static int __init rbd_sysfs_init(void) |
6071 | { | 6075 | { |
6072 | int ret; | 6076 | int ret; |
6073 | 6077 | ||
@@ -6082,13 +6086,13 @@ static int rbd_sysfs_init(void) | |||
6082 | return ret; | 6086 | return ret; |
6083 | } | 6087 | } |
6084 | 6088 | ||
6085 | static void rbd_sysfs_cleanup(void) | 6089 | static void __exit rbd_sysfs_cleanup(void) |
6086 | { | 6090 | { |
6087 | bus_unregister(&rbd_bus_type); | 6091 | bus_unregister(&rbd_bus_type); |
6088 | device_unregister(&rbd_root_dev); | 6092 | device_unregister(&rbd_root_dev); |
6089 | } | 6093 | } |
6090 | 6094 | ||
6091 | static int rbd_slab_init(void) | 6095 | static int __init rbd_slab_init(void) |
6092 | { | 6096 | { |
6093 | rbd_assert(!rbd_img_request_cache); | 6097 | rbd_assert(!rbd_img_request_cache); |
6094 | rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); | 6098 | rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0); |
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 027408d55aee..5f0103f40079 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c | |||
@@ -104,6 +104,11 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) | |||
104 | struct timespec64 old_ctime = inode->i_ctime; | 104 | struct timespec64 old_ctime = inode->i_ctime; |
105 | umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; | 105 | umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; |
106 | 106 | ||
107 | if (ceph_snap(inode) != CEPH_NOSNAP) { | ||
108 | ret = -EROFS; | ||
109 | goto out; | ||
110 | } | ||
111 | |||
107 | switch (type) { | 112 | switch (type) { |
108 | case ACL_TYPE_ACCESS: | 113 | case ACL_TYPE_ACCESS: |
109 | name = XATTR_NAME_POSIX_ACL_ACCESS; | 114 | name = XATTR_NAME_POSIX_ACL_ACCESS; |
@@ -138,11 +143,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) | |||
138 | goto out_free; | 143 | goto out_free; |
139 | } | 144 | } |
140 | 145 | ||
141 | if (ceph_snap(inode) != CEPH_NOSNAP) { | ||
142 | ret = -EROFS; | ||
143 | goto out_free; | ||
144 | } | ||
145 | |||
146 | if (new_mode != old_mode) { | 146 | if (new_mode != old_mode) { |
147 | newattrs.ia_ctime = current_time(inode); | 147 | newattrs.ia_ctime = current_time(inode); |
148 | newattrs.ia_mode = new_mode; | 148 | newattrs.ia_mode = new_mode; |
@@ -206,10 +206,9 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, | |||
206 | tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); | 206 | tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); |
207 | if (!tmp_buf) | 207 | if (!tmp_buf) |
208 | goto out_err; | 208 | goto out_err; |
209 | pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL); | 209 | pagelist = ceph_pagelist_alloc(GFP_KERNEL); |
210 | if (!pagelist) | 210 | if (!pagelist) |
211 | goto out_err; | 211 | goto out_err; |
212 | ceph_pagelist_init(pagelist); | ||
213 | 212 | ||
214 | err = ceph_pagelist_reserve(pagelist, PAGE_SIZE); | 213 | err = ceph_pagelist_reserve(pagelist, PAGE_SIZE); |
215 | if (err) | 214 | if (err) |
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 9c332a6f6667..8eade7a993c1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c | |||
@@ -322,7 +322,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, | |||
322 | /* caller of readpages does not hold buffer and read caps | 322 | /* caller of readpages does not hold buffer and read caps |
323 | * (fadvise, madvise and readahead cases) */ | 323 | * (fadvise, madvise and readahead cases) */ |
324 | int want = CEPH_CAP_FILE_CACHE; | 324 | int want = CEPH_CAP_FILE_CACHE; |
325 | ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got); | 325 | ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); |
326 | if (ret < 0) { | 326 | if (ret < 0) { |
327 | dout("start_read %p, error getting cap\n", inode); | 327 | dout("start_read %p, error getting cap\n", inode); |
328 | } else if (!(got & want)) { | 328 | } else if (!(got & want)) { |
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index dd7dfdd2ba13..f3496db4bb3e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c | |||
@@ -519,9 +519,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, | |||
519 | * -> we take mdsc->cap_delay_lock | 519 | * -> we take mdsc->cap_delay_lock |
520 | */ | 520 | */ |
521 | static void __cap_delay_requeue(struct ceph_mds_client *mdsc, | 521 | static void __cap_delay_requeue(struct ceph_mds_client *mdsc, |
522 | struct ceph_inode_info *ci) | 522 | struct ceph_inode_info *ci, |
523 | bool set_timeout) | ||
523 | { | 524 | { |
524 | __cap_set_timeouts(mdsc, ci); | ||
525 | dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, | 525 | dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode, |
526 | ci->i_ceph_flags, ci->i_hold_caps_max); | 526 | ci->i_ceph_flags, ci->i_hold_caps_max); |
527 | if (!mdsc->stopping) { | 527 | if (!mdsc->stopping) { |
@@ -531,6 +531,8 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, | |||
531 | goto no_change; | 531 | goto no_change; |
532 | list_del_init(&ci->i_cap_delay_list); | 532 | list_del_init(&ci->i_cap_delay_list); |
533 | } | 533 | } |
534 | if (set_timeout) | ||
535 | __cap_set_timeouts(mdsc, ci); | ||
534 | list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); | 536 | list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); |
535 | no_change: | 537 | no_change: |
536 | spin_unlock(&mdsc->cap_delay_lock); | 538 | spin_unlock(&mdsc->cap_delay_lock); |
@@ -720,7 +722,7 @@ void ceph_add_cap(struct inode *inode, | |||
720 | dout(" issued %s, mds wanted %s, actual %s, queueing\n", | 722 | dout(" issued %s, mds wanted %s, actual %s, queueing\n", |
721 | ceph_cap_string(issued), ceph_cap_string(wanted), | 723 | ceph_cap_string(issued), ceph_cap_string(wanted), |
722 | ceph_cap_string(actual_wanted)); | 724 | ceph_cap_string(actual_wanted)); |
723 | __cap_delay_requeue(mdsc, ci); | 725 | __cap_delay_requeue(mdsc, ci, true); |
724 | } | 726 | } |
725 | 727 | ||
726 | if (flags & CEPH_CAP_FLAG_AUTH) { | 728 | if (flags & CEPH_CAP_FLAG_AUTH) { |
@@ -1647,7 +1649,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, | |||
1647 | if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && | 1649 | if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && |
1648 | (mask & CEPH_CAP_FILE_BUFFER)) | 1650 | (mask & CEPH_CAP_FILE_BUFFER)) |
1649 | dirty |= I_DIRTY_DATASYNC; | 1651 | dirty |= I_DIRTY_DATASYNC; |
1650 | __cap_delay_requeue(mdsc, ci); | 1652 | __cap_delay_requeue(mdsc, ci, true); |
1651 | return dirty; | 1653 | return dirty; |
1652 | } | 1654 | } |
1653 | 1655 | ||
@@ -2065,7 +2067,7 @@ ack: | |||
2065 | 2067 | ||
2066 | /* Reschedule delayed caps release if we delayed anything */ | 2068 | /* Reschedule delayed caps release if we delayed anything */ |
2067 | if (delayed) | 2069 | if (delayed) |
2068 | __cap_delay_requeue(mdsc, ci); | 2070 | __cap_delay_requeue(mdsc, ci, false); |
2069 | 2071 | ||
2070 | spin_unlock(&ci->i_ceph_lock); | 2072 | spin_unlock(&ci->i_ceph_lock); |
2071 | 2073 | ||
@@ -2125,7 +2127,7 @@ retry: | |||
2125 | 2127 | ||
2126 | if (delayed) { | 2128 | if (delayed) { |
2127 | spin_lock(&ci->i_ceph_lock); | 2129 | spin_lock(&ci->i_ceph_lock); |
2128 | __cap_delay_requeue(mdsc, ci); | 2130 | __cap_delay_requeue(mdsc, ci, true); |
2129 | spin_unlock(&ci->i_ceph_lock); | 2131 | spin_unlock(&ci->i_ceph_lock); |
2130 | } | 2132 | } |
2131 | } else { | 2133 | } else { |
@@ -2671,17 +2673,18 @@ static void check_max_size(struct inode *inode, loff_t endoff) | |||
2671 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); | 2673 | ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); |
2672 | } | 2674 | } |
2673 | 2675 | ||
2674 | int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got) | 2676 | int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, |
2677 | bool nonblock, int *got) | ||
2675 | { | 2678 | { |
2676 | int ret, err = 0; | 2679 | int ret, err = 0; |
2677 | 2680 | ||
2678 | BUG_ON(need & ~CEPH_CAP_FILE_RD); | 2681 | BUG_ON(need & ~CEPH_CAP_FILE_RD); |
2679 | BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); | 2682 | BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); |
2680 | ret = ceph_pool_perm_check(ci, need); | 2683 | ret = ceph_pool_perm_check(ci, need); |
2681 | if (ret < 0) | 2684 | if (ret < 0) |
2682 | return ret; | 2685 | return ret; |
2683 | 2686 | ||
2684 | ret = try_get_cap_refs(ci, need, want, 0, true, got, &err); | 2687 | ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err); |
2685 | if (ret) { | 2688 | if (ret) { |
2686 | if (err == -EAGAIN) { | 2689 | if (err == -EAGAIN) { |
2687 | ret = 0; | 2690 | ret = 0; |
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 92ab20433682..f788496fafcc 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -1,5 +1,6 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/ceph/ceph_debug.h> | 2 | #include <linux/ceph/ceph_debug.h> |
3 | #include <linux/ceph/striper.h> | ||
3 | 4 | ||
4 | #include <linux/module.h> | 5 | #include <linux/module.h> |
5 | #include <linux/sched.h> | 6 | #include <linux/sched.h> |
@@ -557,90 +558,26 @@ enum { | |||
557 | }; | 558 | }; |
558 | 559 | ||
559 | /* | 560 | /* |
560 | * Read a range of bytes striped over one or more objects. Iterate over | ||
561 | * objects we stripe over. (That's not atomic, but good enough for now.) | ||
562 | * | ||
563 | * If we get a short result from the OSD, check against i_size; we need to | ||
564 | * only return a short read to the caller if we hit EOF. | ||
565 | */ | ||
566 | static int striped_read(struct inode *inode, | ||
567 | u64 pos, u64 len, | ||
568 | struct page **pages, int num_pages, | ||
569 | int page_align, int *checkeof) | ||
570 | { | ||
571 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | ||
572 | struct ceph_inode_info *ci = ceph_inode(inode); | ||
573 | u64 this_len; | ||
574 | loff_t i_size; | ||
575 | int page_idx; | ||
576 | int ret, read = 0; | ||
577 | bool hit_stripe, was_short; | ||
578 | |||
579 | /* | ||
580 | * we may need to do multiple reads. not atomic, unfortunately. | ||
581 | */ | ||
582 | more: | ||
583 | this_len = len; | ||
584 | page_idx = (page_align + read) >> PAGE_SHIFT; | ||
585 | ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), | ||
586 | &ci->i_layout, pos, &this_len, | ||
587 | ci->i_truncate_seq, ci->i_truncate_size, | ||
588 | pages + page_idx, num_pages - page_idx, | ||
589 | ((page_align + read) & ~PAGE_MASK)); | ||
590 | if (ret == -ENOENT) | ||
591 | ret = 0; | ||
592 | hit_stripe = this_len < len; | ||
593 | was_short = ret >= 0 && ret < this_len; | ||
594 | dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read, | ||
595 | ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); | ||
596 | |||
597 | i_size = i_size_read(inode); | ||
598 | if (ret >= 0) { | ||
599 | if (was_short && (pos + ret < i_size)) { | ||
600 | int zlen = min(this_len - ret, i_size - pos - ret); | ||
601 | int zoff = page_align + read + ret; | ||
602 | dout(" zero gap %llu to %llu\n", | ||
603 | pos + ret, pos + ret + zlen); | ||
604 | ceph_zero_page_vector_range(zoff, zlen, pages); | ||
605 | ret += zlen; | ||
606 | } | ||
607 | |||
608 | read += ret; | ||
609 | pos += ret; | ||
610 | len -= ret; | ||
611 | |||
612 | /* hit stripe and need continue*/ | ||
613 | if (len && hit_stripe && pos < i_size) | ||
614 | goto more; | ||
615 | } | ||
616 | |||
617 | if (read > 0) { | ||
618 | ret = read; | ||
619 | /* did we bounce off eof? */ | ||
620 | if (pos + len > i_size) | ||
621 | *checkeof = CHECK_EOF; | ||
622 | } | ||
623 | |||
624 | dout("striped_read returns %d\n", ret); | ||
625 | return ret; | ||
626 | } | ||
627 | |||
628 | /* | ||
629 | * Completely synchronous read and write methods. Direct from __user | 561 | * Completely synchronous read and write methods. Direct from __user |
630 | * buffer to osd, or directly to user pages (if O_DIRECT). | 562 | * buffer to osd, or directly to user pages (if O_DIRECT). |
631 | * | 563 | * |
632 | * If the read spans object boundary, just do multiple reads. | 564 | * If the read spans object boundary, just do multiple reads. (That's not |
565 | * atomic, but good enough for now.) | ||
566 | * | ||
567 | * If we get a short result from the OSD, check against i_size; we need to | ||
568 | * only return a short read to the caller if we hit EOF. | ||
633 | */ | 569 | */ |
634 | static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, | 570 | static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, |
635 | int *checkeof) | 571 | int *retry_op) |
636 | { | 572 | { |
637 | struct file *file = iocb->ki_filp; | 573 | struct file *file = iocb->ki_filp; |
638 | struct inode *inode = file_inode(file); | 574 | struct inode *inode = file_inode(file); |
639 | struct page **pages; | 575 | struct ceph_inode_info *ci = ceph_inode(inode); |
640 | u64 off = iocb->ki_pos; | 576 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
641 | int num_pages; | 577 | struct ceph_osd_client *osdc = &fsc->client->osdc; |
642 | ssize_t ret; | 578 | ssize_t ret; |
643 | size_t len = iov_iter_count(to); | 579 | u64 off = iocb->ki_pos; |
580 | u64 len = iov_iter_count(to); | ||
644 | 581 | ||
645 | dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, | 582 | dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, |
646 | (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); | 583 | (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); |
@@ -653,61 +590,118 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, | |||
653 | * but it will at least behave sensibly when they are | 590 | * but it will at least behave sensibly when they are |
654 | * in sequence. | 591 | * in sequence. |
655 | */ | 592 | */ |
656 | ret = filemap_write_and_wait_range(inode->i_mapping, off, | 593 | ret = filemap_write_and_wait_range(inode->i_mapping, off, off + len); |
657 | off + len); | ||
658 | if (ret < 0) | 594 | if (ret < 0) |
659 | return ret; | 595 | return ret; |
660 | 596 | ||
661 | if (unlikely(to->type & ITER_PIPE)) { | 597 | ret = 0; |
598 | while ((len = iov_iter_count(to)) > 0) { | ||
599 | struct ceph_osd_request *req; | ||
600 | struct page **pages; | ||
601 | int num_pages; | ||
662 | size_t page_off; | 602 | size_t page_off; |
663 | ret = iov_iter_get_pages_alloc(to, &pages, len, | 603 | u64 i_size; |
664 | &page_off); | 604 | bool more; |
665 | if (ret <= 0) | 605 | |
666 | return -ENOMEM; | 606 | req = ceph_osdc_new_request(osdc, &ci->i_layout, |
667 | num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); | 607 | ci->i_vino, off, &len, 0, 1, |
608 | CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, | ||
609 | NULL, ci->i_truncate_seq, | ||
610 | ci->i_truncate_size, false); | ||
611 | if (IS_ERR(req)) { | ||
612 | ret = PTR_ERR(req); | ||
613 | break; | ||
614 | } | ||
615 | |||
616 | more = len < iov_iter_count(to); | ||
668 | 617 | ||
669 | ret = striped_read(inode, off, ret, pages, num_pages, | 618 | if (unlikely(to->type & ITER_PIPE)) { |
670 | page_off, checkeof); | 619 | ret = iov_iter_get_pages_alloc(to, &pages, len, |
671 | if (ret > 0) { | 620 | &page_off); |
672 | iov_iter_advance(to, ret); | 621 | if (ret <= 0) { |
673 | off += ret; | 622 | ceph_osdc_put_request(req); |
623 | ret = -ENOMEM; | ||
624 | break; | ||
625 | } | ||
626 | num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); | ||
627 | if (ret < len) { | ||
628 | len = ret; | ||
629 | osd_req_op_extent_update(req, 0, len); | ||
630 | more = false; | ||
631 | } | ||
674 | } else { | 632 | } else { |
675 | iov_iter_advance(to, 0); | 633 | num_pages = calc_pages_for(off, len); |
634 | page_off = off & ~PAGE_MASK; | ||
635 | pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); | ||
636 | if (IS_ERR(pages)) { | ||
637 | ceph_osdc_put_request(req); | ||
638 | ret = PTR_ERR(pages); | ||
639 | break; | ||
640 | } | ||
676 | } | 641 | } |
677 | ceph_put_page_vector(pages, num_pages, false); | 642 | |
678 | } else { | 643 | osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, |
679 | num_pages = calc_pages_for(off, len); | 644 | false, false); |
680 | pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); | 645 | ret = ceph_osdc_start_request(osdc, req, false); |
681 | if (IS_ERR(pages)) | 646 | if (!ret) |
682 | return PTR_ERR(pages); | 647 | ret = ceph_osdc_wait_request(osdc, req); |
683 | 648 | ceph_osdc_put_request(req); | |
684 | ret = striped_read(inode, off, len, pages, num_pages, | 649 | |
685 | (off & ~PAGE_MASK), checkeof); | 650 | i_size = i_size_read(inode); |
686 | if (ret > 0) { | 651 | dout("sync_read %llu~%llu got %zd i_size %llu%s\n", |
687 | int l, k = 0; | 652 | off, len, ret, i_size, (more ? " MORE" : "")); |
688 | size_t left = ret; | 653 | |
689 | 654 | if (ret == -ENOENT) | |
690 | while (left) { | 655 | ret = 0; |
691 | size_t page_off = off & ~PAGE_MASK; | 656 | if (ret >= 0 && ret < len && (off + ret < i_size)) { |
692 | size_t copy = min_t(size_t, left, | 657 | int zlen = min(len - ret, i_size - off - ret); |
693 | PAGE_SIZE - page_off); | 658 | int zoff = page_off + ret; |
694 | l = copy_page_to_iter(pages[k++], page_off, | 659 | dout("sync_read zero gap %llu~%llu\n", |
695 | copy, to); | 660 | off + ret, off + ret + zlen); |
696 | off += l; | 661 | ceph_zero_page_vector_range(zoff, zlen, pages); |
697 | left -= l; | 662 | ret += zlen; |
698 | if (l < copy) | 663 | } |
664 | |||
665 | if (unlikely(to->type & ITER_PIPE)) { | ||
666 | if (ret > 0) { | ||
667 | iov_iter_advance(to, ret); | ||
668 | off += ret; | ||
669 | } else { | ||
670 | iov_iter_advance(to, 0); | ||
671 | } | ||
672 | ceph_put_page_vector(pages, num_pages, false); | ||
673 | } else { | ||
674 | int idx = 0; | ||
675 | size_t left = ret > 0 ? ret : 0; | ||
676 | while (left > 0) { | ||
677 | size_t len, copied; | ||
678 | page_off = off & ~PAGE_MASK; | ||
679 | len = min_t(size_t, left, PAGE_SIZE - page_off); | ||
680 | copied = copy_page_to_iter(pages[idx++], | ||
681 | page_off, len, to); | ||
682 | off += copied; | ||
683 | left -= copied; | ||
684 | if (copied < len) { | ||
685 | ret = -EFAULT; | ||
699 | break; | 686 | break; |
687 | } | ||
700 | } | 688 | } |
689 | ceph_release_page_vector(pages, num_pages); | ||
701 | } | 690 | } |
702 | ceph_release_page_vector(pages, num_pages); | 691 | |
692 | if (ret <= 0 || off >= i_size || !more) | ||
693 | break; | ||
703 | } | 694 | } |
704 | 695 | ||
705 | if (off > iocb->ki_pos) { | 696 | if (off > iocb->ki_pos) { |
697 | if (ret >= 0 && | ||
698 | iov_iter_count(to) > 0 && off >= i_size_read(inode)) | ||
699 | *retry_op = CHECK_EOF; | ||
706 | ret = off - iocb->ki_pos; | 700 | ret = off - iocb->ki_pos; |
707 | iocb->ki_pos = off; | 701 | iocb->ki_pos = off; |
708 | } | 702 | } |
709 | 703 | ||
710 | dout("sync_read result %zd\n", ret); | 704 | dout("sync_read result %zd retry_op %d\n", ret, *retry_op); |
711 | return ret; | 705 | return ret; |
712 | } | 706 | } |
713 | 707 | ||
@@ -865,7 +859,7 @@ static void ceph_aio_retry_work(struct work_struct *work) | |||
865 | } | 859 | } |
866 | spin_unlock(&ci->i_ceph_lock); | 860 | spin_unlock(&ci->i_ceph_lock); |
867 | 861 | ||
868 | req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, | 862 | req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 1, |
869 | false, GFP_NOFS); | 863 | false, GFP_NOFS); |
870 | if (!req) { | 864 | if (!req) { |
871 | ret = -ENOMEM; | 865 | ret = -ENOMEM; |
@@ -877,6 +871,11 @@ static void ceph_aio_retry_work(struct work_struct *work) | |||
877 | ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); | 871 | ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); |
878 | ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); | 872 | ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); |
879 | 873 | ||
874 | req->r_ops[0] = orig_req->r_ops[0]; | ||
875 | |||
876 | req->r_mtime = aio_req->mtime; | ||
877 | req->r_data_offset = req->r_ops[0].extent.offset; | ||
878 | |||
880 | ret = ceph_osdc_alloc_messages(req, GFP_NOFS); | 879 | ret = ceph_osdc_alloc_messages(req, GFP_NOFS); |
881 | if (ret) { | 880 | if (ret) { |
882 | ceph_osdc_put_request(req); | 881 | ceph_osdc_put_request(req); |
@@ -884,11 +883,6 @@ static void ceph_aio_retry_work(struct work_struct *work) | |||
884 | goto out; | 883 | goto out; |
885 | } | 884 | } |
886 | 885 | ||
887 | req->r_ops[0] = orig_req->r_ops[0]; | ||
888 | |||
889 | req->r_mtime = aio_req->mtime; | ||
890 | req->r_data_offset = req->r_ops[0].extent.offset; | ||
891 | |||
892 | ceph_osdc_put_request(orig_req); | 886 | ceph_osdc_put_request(orig_req); |
893 | 887 | ||
894 | req->r_callback = ceph_aio_complete_req; | 888 | req->r_callback = ceph_aio_complete_req; |
@@ -1735,7 +1729,6 @@ static long ceph_fallocate(struct file *file, int mode, | |||
1735 | struct ceph_file_info *fi = file->private_data; | 1729 | struct ceph_file_info *fi = file->private_data; |
1736 | struct inode *inode = file_inode(file); | 1730 | struct inode *inode = file_inode(file); |
1737 | struct ceph_inode_info *ci = ceph_inode(inode); | 1731 | struct ceph_inode_info *ci = ceph_inode(inode); |
1738 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | ||
1739 | struct ceph_cap_flush *prealloc_cf; | 1732 | struct ceph_cap_flush *prealloc_cf; |
1740 | int want, got = 0; | 1733 | int want, got = 0; |
1741 | int dirty; | 1734 | int dirty; |
@@ -1743,10 +1736,7 @@ static long ceph_fallocate(struct file *file, int mode, | |||
1743 | loff_t endoff = 0; | 1736 | loff_t endoff = 0; |
1744 | loff_t size; | 1737 | loff_t size; |
1745 | 1738 | ||
1746 | if ((offset + length) > max(i_size_read(inode), fsc->max_file_size)) | 1739 | if (mode != (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) |
1747 | return -EFBIG; | ||
1748 | |||
1749 | if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) | ||
1750 | return -EOPNOTSUPP; | 1740 | return -EOPNOTSUPP; |
1751 | 1741 | ||
1752 | if (!S_ISREG(inode->i_mode)) | 1742 | if (!S_ISREG(inode->i_mode)) |
@@ -1763,18 +1753,6 @@ static long ceph_fallocate(struct file *file, int mode, | |||
1763 | goto unlock; | 1753 | goto unlock; |
1764 | } | 1754 | } |
1765 | 1755 | ||
1766 | if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) && | ||
1767 | ceph_quota_is_max_bytes_exceeded(inode, offset + length)) { | ||
1768 | ret = -EDQUOT; | ||
1769 | goto unlock; | ||
1770 | } | ||
1771 | |||
1772 | if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL) && | ||
1773 | !(mode & FALLOC_FL_PUNCH_HOLE)) { | ||
1774 | ret = -ENOSPC; | ||
1775 | goto unlock; | ||
1776 | } | ||
1777 | |||
1778 | if (ci->i_inline_version != CEPH_INLINE_NONE) { | 1756 | if (ci->i_inline_version != CEPH_INLINE_NONE) { |
1779 | ret = ceph_uninline_data(file, NULL); | 1757 | ret = ceph_uninline_data(file, NULL); |
1780 | if (ret < 0) | 1758 | if (ret < 0) |
@@ -1782,12 +1760,12 @@ static long ceph_fallocate(struct file *file, int mode, | |||
1782 | } | 1760 | } |
1783 | 1761 | ||
1784 | size = i_size_read(inode); | 1762 | size = i_size_read(inode); |
1785 | if (!(mode & FALLOC_FL_KEEP_SIZE)) { | 1763 | |
1786 | endoff = offset + length; | 1764 | /* Are we punching a hole beyond EOF? */ |
1787 | ret = inode_newsize_ok(inode, endoff); | 1765 | if (offset >= size) |
1788 | if (ret) | 1766 | goto unlock; |
1789 | goto unlock; | 1767 | if ((offset + length) > size) |
1790 | } | 1768 | length = size - offset; |
1791 | 1769 | ||
1792 | if (fi->fmode & CEPH_FILE_MODE_LAZY) | 1770 | if (fi->fmode & CEPH_FILE_MODE_LAZY) |
1793 | want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; | 1771 | want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; |
@@ -1798,16 +1776,8 @@ static long ceph_fallocate(struct file *file, int mode, | |||
1798 | if (ret < 0) | 1776 | if (ret < 0) |
1799 | goto unlock; | 1777 | goto unlock; |
1800 | 1778 | ||
1801 | if (mode & FALLOC_FL_PUNCH_HOLE) { | 1779 | ceph_zero_pagecache_range(inode, offset, length); |
1802 | if (offset < size) | 1780 | ret = ceph_zero_objects(inode, offset, length); |
1803 | ceph_zero_pagecache_range(inode, offset, length); | ||
1804 | ret = ceph_zero_objects(inode, offset, length); | ||
1805 | } else if (endoff > size) { | ||
1806 | truncate_pagecache_range(inode, size, -1); | ||
1807 | if (ceph_inode_set_size(inode, endoff)) | ||
1808 | ceph_check_caps(ceph_inode(inode), | ||
1809 | CHECK_CAPS_AUTHONLY, NULL); | ||
1810 | } | ||
1811 | 1781 | ||
1812 | if (!ret) { | 1782 | if (!ret) { |
1813 | spin_lock(&ci->i_ceph_lock); | 1783 | spin_lock(&ci->i_ceph_lock); |
@@ -1817,9 +1787,6 @@ static long ceph_fallocate(struct file *file, int mode, | |||
1817 | spin_unlock(&ci->i_ceph_lock); | 1787 | spin_unlock(&ci->i_ceph_lock); |
1818 | if (dirty) | 1788 | if (dirty) |
1819 | __mark_inode_dirty(inode, dirty); | 1789 | __mark_inode_dirty(inode, dirty); |
1820 | if ((endoff > size) && | ||
1821 | ceph_quota_is_max_bytes_approaching(inode, endoff)) | ||
1822 | ceph_check_caps(ci, CHECK_CAPS_NODELAY, NULL); | ||
1823 | } | 1790 | } |
1824 | 1791 | ||
1825 | ceph_put_cap_refs(ci, got); | 1792 | ceph_put_cap_refs(ci, got); |
@@ -1829,6 +1796,300 @@ unlock: | |||
1829 | return ret; | 1796 | return ret; |
1830 | } | 1797 | } |
1831 | 1798 | ||
1799 | /* | ||
1800 | * This function tries to get FILE_WR capabilities for dst_ci and FILE_RD for | ||
1801 | * src_ci. Two attempts are made to obtain both caps, and an error is returned if | ||
1802 | * this fails; zero is returned on success. | ||
1803 | */ | ||
1804 | static int get_rd_wr_caps(struct ceph_inode_info *src_ci, | ||
1805 | loff_t src_endoff, int *src_got, | ||
1806 | struct ceph_inode_info *dst_ci, | ||
1807 | loff_t dst_endoff, int *dst_got) | ||
1808 | { | ||
1809 | int ret = 0; | ||
1810 | bool retrying = false; | ||
1811 | |||
1812 | retry_caps: | ||
1813 | ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, | ||
1814 | dst_endoff, dst_got, NULL); | ||
1815 | if (ret < 0) | ||
1816 | return ret; | ||
1817 | |||
1818 | /* | ||
1819 | * Since we're already holding the FILE_WR capability for the dst file, | ||
1820 | * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some | ||
1821 | * retry dance instead to try to get both capabilities. | ||
1822 | */ | ||
1823 | ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, | ||
1824 | false, src_got); | ||
1825 | if (ret <= 0) { | ||
1826 | /* Start by dropping dst_ci caps and getting src_ci caps */ | ||
1827 | ceph_put_cap_refs(dst_ci, *dst_got); | ||
1828 | if (retrying) { | ||
1829 | if (!ret) | ||
1830 | /* ceph_try_get_caps masks EAGAIN */ | ||
1831 | ret = -EAGAIN; | ||
1832 | return ret; | ||
1833 | } | ||
1834 | ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, | ||
1835 | CEPH_CAP_FILE_SHARED, src_endoff, | ||
1836 | src_got, NULL); | ||
1837 | if (ret < 0) | ||
1838 | return ret; | ||
1839 | /*... drop src_ci caps too, and retry */ | ||
1840 | ceph_put_cap_refs(src_ci, *src_got); | ||
1841 | retrying = true; | ||
1842 | goto retry_caps; | ||
1843 | } | ||
1844 | return ret; | ||
1845 | } | ||
1846 | |||
1847 | static void put_rd_wr_caps(struct ceph_inode_info *src_ci, int src_got, | ||
1848 | struct ceph_inode_info *dst_ci, int dst_got) | ||
1849 | { | ||
1850 | ceph_put_cap_refs(src_ci, src_got); | ||
1851 | ceph_put_cap_refs(dst_ci, dst_got); | ||
1852 | } | ||
1853 | |||
1854 | /* | ||
1855 | * This function does several size-related checks, returning an error if: | ||
1856 | * - source file is smaller than off+len | ||
1857 | * - destination file size is not OK (inode_newsize_ok()) | ||
1858 | * - max bytes quota is exceeded | ||
1859 | */ | ||
1860 | static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, | ||
1861 | loff_t src_off, loff_t dst_off, size_t len) | ||
1862 | { | ||
1863 | loff_t size, endoff; | ||
1864 | |||
1865 | size = i_size_read(src_inode); | ||
1866 | /* | ||
1867 | * Don't copy beyond source file EOF. Instead of simply setting length | ||
1868 | * to (size - src_off), just drop to VFS default implementation, as the | ||
1869 | * local i_size may be stale due to other clients writing to the source | ||
1870 | * inode. | ||
1871 | */ | ||
1872 | if (src_off + len > size) { | ||
1873 | dout("Copy beyond EOF (%llu + %zu > %llu)\n", | ||
1874 | src_off, len, size); | ||
1875 | return -EOPNOTSUPP; | ||
1876 | } | ||
1877 | size = i_size_read(dst_inode); | ||
1878 | |||
1879 | endoff = dst_off + len; | ||
1880 | if (inode_newsize_ok(dst_inode, endoff)) | ||
1881 | return -EOPNOTSUPP; | ||
1882 | |||
1883 | if (ceph_quota_is_max_bytes_exceeded(dst_inode, endoff)) | ||
1884 | return -EDQUOT; | ||
1885 | |||
1886 | return 0; | ||
1887 | } | ||
1888 | |||
1889 | static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off, | ||
1890 | struct file *dst_file, loff_t dst_off, | ||
1891 | size_t len, unsigned int flags) | ||
1892 | { | ||
1893 | struct inode *src_inode = file_inode(src_file); | ||
1894 | struct inode *dst_inode = file_inode(dst_file); | ||
1895 | struct ceph_inode_info *src_ci = ceph_inode(src_inode); | ||
1896 | struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); | ||
1897 | struct ceph_cap_flush *prealloc_cf; | ||
1898 | struct ceph_object_locator src_oloc, dst_oloc; | ||
1899 | struct ceph_object_id src_oid, dst_oid; | ||
1900 | loff_t endoff = 0, size; | ||
1901 | ssize_t ret = -EIO; | ||
1902 | u64 src_objnum, dst_objnum, src_objoff, dst_objoff; | ||
1903 | u32 src_objlen, dst_objlen, object_size; | ||
1904 | int src_got = 0, dst_got = 0, err, dirty; | ||
1905 | bool do_final_copy = false; | ||
1906 | |||
1907 | if (src_inode == dst_inode) | ||
1908 | return -EINVAL; | ||
1909 | if (ceph_snap(dst_inode) != CEPH_NOSNAP) | ||
1910 | return -EROFS; | ||
1911 | |||
1912 | /* | ||
1913 | * Some of the checks below will return -EOPNOTSUPP, which will force a | ||
1914 | * fallback to the default VFS copy_file_range implementation. This is | ||
1915 | * desirable in several cases (for ex, the 'len' is smaller than the | ||
1916 | * size of the objects, or in cases where that would be more | ||
1917 | * efficient). | ||
1918 | */ | ||
1919 | |||
1920 | if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM)) | ||
1921 | return -EOPNOTSUPP; | ||
1922 | |||
1923 | if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || | ||
1924 | (src_ci->i_layout.stripe_count != dst_ci->i_layout.stripe_count) || | ||
1925 | (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) | ||
1926 | return -EOPNOTSUPP; | ||
1927 | |||
1928 | if (len < src_ci->i_layout.object_size) | ||
1929 | return -EOPNOTSUPP; /* no remote copy will be done */ | ||
1930 | |||
1931 | prealloc_cf = ceph_alloc_cap_flush(); | ||
1932 | if (!prealloc_cf) | ||
1933 | return -ENOMEM; | ||
1934 | |||
1935 | /* Start by sync'ing the source file */ | ||
1936 | ret = file_write_and_wait_range(src_file, src_off, (src_off + len)); | ||
1937 | if (ret < 0) | ||
1938 | goto out; | ||
1939 | |||
1940 | /* | ||
1941 | * We need FILE_WR caps for dst_ci and FILE_RD for src_ci as other | ||
1942 | * clients may have dirty data in their caches. And OSDs know nothing | ||
1943 | * about caps, so they can't safely do the remote object copies. | ||
1944 | */ | ||
1945 | err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, | ||
1946 | dst_ci, (dst_off + len), &dst_got); | ||
1947 | if (err < 0) { | ||
1948 | dout("get_rd_wr_caps returned %d\n", err); | ||
1949 | ret = -EOPNOTSUPP; | ||
1950 | goto out; | ||
1951 | } | ||
1952 | |||
1953 | ret = is_file_size_ok(src_inode, dst_inode, src_off, dst_off, len); | ||
1954 | if (ret < 0) | ||
1955 | goto out_caps; | ||
1956 | |||
1957 | size = i_size_read(dst_inode); | ||
1958 | endoff = dst_off + len; | ||
1959 | |||
1960 | /* Drop dst file cached pages */ | ||
1961 | ret = invalidate_inode_pages2_range(dst_inode->i_mapping, | ||
1962 | dst_off >> PAGE_SHIFT, | ||
1963 | endoff >> PAGE_SHIFT); | ||
1964 | if (ret < 0) { | ||
1965 | dout("Failed to invalidate inode pages (%zd)\n", ret); | ||
1966 | ret = 0; /* XXX */ | ||
1967 | } | ||
1968 | src_oloc.pool = src_ci->i_layout.pool_id; | ||
1969 | src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); | ||
1970 | dst_oloc.pool = dst_ci->i_layout.pool_id; | ||
1971 | dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); | ||
1972 | |||
1973 | ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, | ||
1974 | src_ci->i_layout.object_size, | ||
1975 | &src_objnum, &src_objoff, &src_objlen); | ||
1976 | ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, | ||
1977 | dst_ci->i_layout.object_size, | ||
1978 | &dst_objnum, &dst_objoff, &dst_objlen); | ||
1979 | /* object-level offsets need to be the same */ | ||
1980 | if (src_objoff != dst_objoff) { | ||
1981 | ret = -EOPNOTSUPP; | ||
1982 | goto out_caps; | ||
1983 | } | ||
1984 | |||
1985 | /* | ||
1986 | * Do a manual copy if the object offset isn't object aligned. | ||
1987 | * 'src_objlen' contains the bytes left until the end of the object, | ||
1988 | * starting at the src_off | ||
1989 | */ | ||
1990 | if (src_objoff) { | ||
1991 | /* | ||
1992 | * we need to temporarily drop all caps as we'll be calling | ||
1993 | * {read,write}_iter, which will get caps again. | ||
1994 | */ | ||
1995 | put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); | ||
1996 | ret = do_splice_direct(src_file, &src_off, dst_file, | ||
1997 | &dst_off, src_objlen, flags); | ||
1998 | if (ret < 0) { | ||
1999 | dout("do_splice_direct returned %d\n", err); | ||
2000 | goto out; | ||
2001 | } | ||
2002 | len -= ret; | ||
2003 | err = get_rd_wr_caps(src_ci, (src_off + len), | ||
2004 | &src_got, dst_ci, | ||
2005 | (dst_off + len), &dst_got); | ||
2006 | if (err < 0) | ||
2007 | goto out; | ||
2008 | err = is_file_size_ok(src_inode, dst_inode, | ||
2009 | src_off, dst_off, len); | ||
2010 | if (err < 0) | ||
2011 | goto out_caps; | ||
2012 | } | ||
2013 | object_size = src_ci->i_layout.object_size; | ||
2014 | while (len >= object_size) { | ||
2015 | ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, | ||
2016 | object_size, &src_objnum, | ||
2017 | &src_objoff, &src_objlen); | ||
2018 | ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off, | ||
2019 | object_size, &dst_objnum, | ||
2020 | &dst_objoff, &dst_objlen); | ||
2021 | ceph_oid_init(&src_oid); | ||
2022 | ceph_oid_printf(&src_oid, "%llx.%08llx", | ||
2023 | src_ci->i_vino.ino, src_objnum); | ||
2024 | ceph_oid_init(&dst_oid); | ||
2025 | ceph_oid_printf(&dst_oid, "%llx.%08llx", | ||
2026 | dst_ci->i_vino.ino, dst_objnum); | ||
2027 | /* Do an object remote copy */ | ||
2028 | err = ceph_osdc_copy_from( | ||
2029 | &ceph_inode_to_client(src_inode)->client->osdc, | ||
2030 | src_ci->i_vino.snap, 0, | ||
2031 | &src_oid, &src_oloc, | ||
2032 | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | | ||
2033 | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, | ||
2034 | &dst_oid, &dst_oloc, | ||
2035 | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | | ||
2036 | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0); | ||
2037 | if (err) { | ||
2038 | dout("ceph_osdc_copy_from returned %d\n", err); | ||
2039 | if (!ret) | ||
2040 | ret = err; | ||
2041 | goto out_caps; | ||
2042 | } | ||
2043 | len -= object_size; | ||
2044 | src_off += object_size; | ||
2045 | dst_off += object_size; | ||
2046 | ret += object_size; | ||
2047 | } | ||
2048 | |||
2049 | if (len) | ||
2050 | /* We still need one final local copy */ | ||
2051 | do_final_copy = true; | ||
2052 | |||
2053 | file_update_time(dst_file); | ||
2054 | if (endoff > size) { | ||
2055 | int caps_flags = 0; | ||
2056 | |||
2057 | /* Let the MDS know about dst file size change */ | ||
2058 | if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff)) | ||
2059 | caps_flags |= CHECK_CAPS_NODELAY; | ||
2060 | if (ceph_inode_set_size(dst_inode, endoff)) | ||
2061 | caps_flags |= CHECK_CAPS_AUTHONLY; | ||
2062 | if (caps_flags) | ||
2063 | ceph_check_caps(dst_ci, caps_flags, NULL); | ||
2064 | } | ||
2065 | /* Mark Fw dirty */ | ||
2066 | spin_lock(&dst_ci->i_ceph_lock); | ||
2067 | dst_ci->i_inline_version = CEPH_INLINE_NONE; | ||
2068 | dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf); | ||
2069 | spin_unlock(&dst_ci->i_ceph_lock); | ||
2070 | if (dirty) | ||
2071 | __mark_inode_dirty(dst_inode, dirty); | ||
2072 | |||
2073 | out_caps: | ||
2074 | put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); | ||
2075 | |||
2076 | if (do_final_copy) { | ||
2077 | err = do_splice_direct(src_file, &src_off, dst_file, | ||
2078 | &dst_off, len, flags); | ||
2079 | if (err < 0) { | ||
2080 | dout("do_splice_direct returned %d\n", err); | ||
2081 | goto out; | ||
2082 | } | ||
2083 | len -= err; | ||
2084 | ret += err; | ||
2085 | } | ||
2086 | |||
2087 | out: | ||
2088 | ceph_free_cap_flush(prealloc_cf); | ||
2089 | |||
2090 | return ret; | ||
2091 | } | ||
2092 | |||
1832 | const struct file_operations ceph_file_fops = { | 2093 | const struct file_operations ceph_file_fops = { |
1833 | .open = ceph_open, | 2094 | .open = ceph_open, |
1834 | .release = ceph_release, | 2095 | .release = ceph_release, |
@@ -1844,5 +2105,5 @@ const struct file_operations ceph_file_fops = { | |||
1844 | .unlocked_ioctl = ceph_ioctl, | 2105 | .unlocked_ioctl = ceph_ioctl, |
1845 | .compat_ioctl = ceph_ioctl, | 2106 | .compat_ioctl = ceph_ioctl, |
1846 | .fallocate = ceph_fallocate, | 2107 | .fallocate = ceph_fallocate, |
2108 | .copy_file_range = ceph_copy_file_range, | ||
1847 | }; | 2109 | }; |
1848 | |||
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ebc7bdaed2d0..79dd5e6ed755 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c | |||
@@ -1132,8 +1132,12 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in) | |||
1132 | if (IS_ERR(realdn)) { | 1132 | if (IS_ERR(realdn)) { |
1133 | pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", | 1133 | pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n", |
1134 | PTR_ERR(realdn), dn, in, ceph_vinop(in)); | 1134 | PTR_ERR(realdn), dn, in, ceph_vinop(in)); |
1135 | dput(dn); | 1135 | dn = realdn; |
1136 | dn = realdn; /* note realdn contains the error */ | 1136 | /* |
1137 | * Caller should release 'dn' in the case of error. | ||
1138 | * If 'req->r_dentry' is passed to this function, | ||
1139 | * caller should leave 'req->r_dentry' untouched. | ||
1140 | */ | ||
1137 | goto out; | 1141 | goto out; |
1138 | } else if (realdn) { | 1142 | } else if (realdn) { |
1139 | dout("dn %p (%d) spliced with %p (%d) " | 1143 | dout("dn %p (%d) spliced with %p (%d) " |
@@ -1196,7 +1200,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req) | |||
1196 | WARN_ON_ONCE(1); | 1200 | WARN_ON_ONCE(1); |
1197 | } | 1201 | } |
1198 | 1202 | ||
1199 | if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) { | 1203 | if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME && |
1204 | test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) && | ||
1205 | !test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) { | ||
1200 | struct qstr dname; | 1206 | struct qstr dname; |
1201 | struct dentry *dn, *parent; | 1207 | struct dentry *dn, *parent; |
1202 | 1208 | ||
@@ -1677,7 +1683,6 @@ retry_lookup: | |||
1677 | if (IS_ERR(realdn)) { | 1683 | if (IS_ERR(realdn)) { |
1678 | err = PTR_ERR(realdn); | 1684 | err = PTR_ERR(realdn); |
1679 | d_drop(dn); | 1685 | d_drop(dn); |
1680 | dn = NULL; | ||
1681 | goto next_item; | 1686 | goto next_item; |
1682 | } | 1687 | } |
1683 | dn = realdn; | 1688 | dn = realdn; |
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index bc43c822426a..67a9aeb2f4ec 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c | |||
@@ -2071,7 +2071,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, | |||
2071 | if (req->r_old_dentry_drop) | 2071 | if (req->r_old_dentry_drop) |
2072 | len += req->r_old_dentry->d_name.len; | 2072 | len += req->r_old_dentry->d_name.len; |
2073 | 2073 | ||
2074 | msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false); | 2074 | msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false); |
2075 | if (!msg) { | 2075 | if (!msg) { |
2076 | msg = ERR_PTR(-ENOMEM); | 2076 | msg = ERR_PTR(-ENOMEM); |
2077 | goto out_free2; | 2077 | goto out_free2; |
@@ -2136,7 +2136,6 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, | |||
2136 | 2136 | ||
2137 | if (req->r_pagelist) { | 2137 | if (req->r_pagelist) { |
2138 | struct ceph_pagelist *pagelist = req->r_pagelist; | 2138 | struct ceph_pagelist *pagelist = req->r_pagelist; |
2139 | refcount_inc(&pagelist->refcnt); | ||
2140 | ceph_msg_data_add_pagelist(msg, pagelist); | 2139 | ceph_msg_data_add_pagelist(msg, pagelist); |
2141 | msg->hdr.data_len = cpu_to_le32(pagelist->length); | 2140 | msg->hdr.data_len = cpu_to_le32(pagelist->length); |
2142 | } else { | 2141 | } else { |
@@ -3126,12 +3125,11 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, | |||
3126 | 3125 | ||
3127 | pr_info("mds%d reconnect start\n", mds); | 3126 | pr_info("mds%d reconnect start\n", mds); |
3128 | 3127 | ||
3129 | pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); | 3128 | pagelist = ceph_pagelist_alloc(GFP_NOFS); |
3130 | if (!pagelist) | 3129 | if (!pagelist) |
3131 | goto fail_nopagelist; | 3130 | goto fail_nopagelist; |
3132 | ceph_pagelist_init(pagelist); | ||
3133 | 3131 | ||
3134 | reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false); | 3132 | reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false); |
3135 | if (!reply) | 3133 | if (!reply) |
3136 | goto fail_nomsg; | 3134 | goto fail_nomsg; |
3137 | 3135 | ||
@@ -3241,6 +3239,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, | |||
3241 | mutex_unlock(&mdsc->mutex); | 3239 | mutex_unlock(&mdsc->mutex); |
3242 | 3240 | ||
3243 | up_read(&mdsc->snap_rwsem); | 3241 | up_read(&mdsc->snap_rwsem); |
3242 | ceph_pagelist_release(pagelist); | ||
3244 | return; | 3243 | return; |
3245 | 3244 | ||
3246 | fail: | 3245 | fail: |
diff --git a/fs/ceph/super.c b/fs/ceph/super.c index eab1359d0553..b5ecd6f50360 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c | |||
@@ -165,6 +165,8 @@ enum { | |||
165 | Opt_noacl, | 165 | Opt_noacl, |
166 | Opt_quotadf, | 166 | Opt_quotadf, |
167 | Opt_noquotadf, | 167 | Opt_noquotadf, |
168 | Opt_copyfrom, | ||
169 | Opt_nocopyfrom, | ||
168 | }; | 170 | }; |
169 | 171 | ||
170 | static match_table_t fsopt_tokens = { | 172 | static match_table_t fsopt_tokens = { |
@@ -203,6 +205,8 @@ static match_table_t fsopt_tokens = { | |||
203 | {Opt_noacl, "noacl"}, | 205 | {Opt_noacl, "noacl"}, |
204 | {Opt_quotadf, "quotadf"}, | 206 | {Opt_quotadf, "quotadf"}, |
205 | {Opt_noquotadf, "noquotadf"}, | 207 | {Opt_noquotadf, "noquotadf"}, |
208 | {Opt_copyfrom, "copyfrom"}, | ||
209 | {Opt_nocopyfrom, "nocopyfrom"}, | ||
206 | {-1, NULL} | 210 | {-1, NULL} |
207 | }; | 211 | }; |
208 | 212 | ||
@@ -355,6 +359,12 @@ static int parse_fsopt_token(char *c, void *private) | |||
355 | case Opt_noquotadf: | 359 | case Opt_noquotadf: |
356 | fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; | 360 | fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF; |
357 | break; | 361 | break; |
362 | case Opt_copyfrom: | ||
363 | fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM; | ||
364 | break; | ||
365 | case Opt_nocopyfrom: | ||
366 | fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM; | ||
367 | break; | ||
358 | #ifdef CONFIG_CEPH_FS_POSIX_ACL | 368 | #ifdef CONFIG_CEPH_FS_POSIX_ACL |
359 | case Opt_acl: | 369 | case Opt_acl: |
360 | fsopt->sb_flags |= SB_POSIXACL; | 370 | fsopt->sb_flags |= SB_POSIXACL; |
@@ -553,6 +563,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) | |||
553 | seq_puts(m, ",noacl"); | 563 | seq_puts(m, ",noacl"); |
554 | #endif | 564 | #endif |
555 | 565 | ||
566 | if (fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) | ||
567 | seq_puts(m, ",nocopyfrom"); | ||
568 | |||
556 | if (fsopt->mds_namespace) | 569 | if (fsopt->mds_namespace) |
557 | seq_show_option(m, "mds_namespace", fsopt->mds_namespace); | 570 | seq_show_option(m, "mds_namespace", fsopt->mds_namespace); |
558 | if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) | 571 | if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) |
diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 582e28fd1b7b..c005a5400f2e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h | |||
@@ -40,6 +40,7 @@ | |||
40 | #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ | 40 | #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ |
41 | #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ | 41 | #define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ |
42 | #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ | 42 | #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ |
43 | #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ | ||
43 | 44 | ||
44 | #define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE | 45 | #define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE |
45 | 46 | ||
@@ -1008,7 +1009,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, | |||
1008 | extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, | 1009 | extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, |
1009 | loff_t endoff, int *got, struct page **pinned_page); | 1010 | loff_t endoff, int *got, struct page **pinned_page); |
1010 | extern int ceph_try_get_caps(struct ceph_inode_info *ci, | 1011 | extern int ceph_try_get_caps(struct ceph_inode_info *ci, |
1011 | int need, int want, int *got); | 1012 | int need, int want, bool nonblock, int *got); |
1012 | 1013 | ||
1013 | /* for counting open files by mode */ | 1014 | /* for counting open files by mode */ |
1014 | extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); | 1015 | extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); |
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5cc8b94f8206..316f6ad10644 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c | |||
@@ -951,11 +951,10 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name, | |||
951 | 951 | ||
952 | if (size > 0) { | 952 | if (size > 0) { |
953 | /* copy value into pagelist */ | 953 | /* copy value into pagelist */ |
954 | pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); | 954 | pagelist = ceph_pagelist_alloc(GFP_NOFS); |
955 | if (!pagelist) | 955 | if (!pagelist) |
956 | return -ENOMEM; | 956 | return -ENOMEM; |
957 | 957 | ||
958 | ceph_pagelist_init(pagelist); | ||
959 | err = ceph_pagelist_append(pagelist, value, size); | 958 | err = ceph_pagelist_append(pagelist, value, size); |
960 | if (err) | 959 | if (err) |
961 | goto out; | 960 | goto out; |
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 49c93b9308d7..68bb09c29ce8 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h | |||
@@ -81,7 +81,13 @@ struct ceph_options { | |||
81 | 81 | ||
82 | #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) | 82 | #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) |
83 | #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) | 83 | #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) |
84 | #define CEPH_MSG_MAX_DATA_LEN (16*1024*1024) | 84 | |
85 | /* | ||
86 | * Handle the largest possible rbd object in one message. | ||
87 | * There is no limit on the size of cephfs objects, but it has to obey | ||
88 | * rsize and wsize mount options anyway. | ||
89 | */ | ||
90 | #define CEPH_MSG_MAX_DATA_LEN (32*1024*1024) | ||
85 | 91 | ||
86 | #define CEPH_AUTH_NAME_DEFAULT "guest" | 92 | #define CEPH_AUTH_NAME_DEFAULT "guest" |
87 | 93 | ||
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index fc2b4491ee0a..800a2128d411 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h | |||
@@ -82,22 +82,6 @@ enum ceph_msg_data_type { | |||
82 | CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */ | 82 | CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */ |
83 | }; | 83 | }; |
84 | 84 | ||
85 | static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type) | ||
86 | { | ||
87 | switch (type) { | ||
88 | case CEPH_MSG_DATA_NONE: | ||
89 | case CEPH_MSG_DATA_PAGES: | ||
90 | case CEPH_MSG_DATA_PAGELIST: | ||
91 | #ifdef CONFIG_BLOCK | ||
92 | case CEPH_MSG_DATA_BIO: | ||
93 | #endif /* CONFIG_BLOCK */ | ||
94 | case CEPH_MSG_DATA_BVECS: | ||
95 | return true; | ||
96 | default: | ||
97 | return false; | ||
98 | } | ||
99 | } | ||
100 | |||
101 | #ifdef CONFIG_BLOCK | 85 | #ifdef CONFIG_BLOCK |
102 | 86 | ||
103 | struct ceph_bio_iter { | 87 | struct ceph_bio_iter { |
@@ -181,7 +165,6 @@ struct ceph_bvec_iter { | |||
181 | } while (0) | 165 | } while (0) |
182 | 166 | ||
183 | struct ceph_msg_data { | 167 | struct ceph_msg_data { |
184 | struct list_head links; /* ceph_msg->data */ | ||
185 | enum ceph_msg_data_type type; | 168 | enum ceph_msg_data_type type; |
186 | union { | 169 | union { |
187 | #ifdef CONFIG_BLOCK | 170 | #ifdef CONFIG_BLOCK |
@@ -202,7 +185,6 @@ struct ceph_msg_data { | |||
202 | 185 | ||
203 | struct ceph_msg_data_cursor { | 186 | struct ceph_msg_data_cursor { |
204 | size_t total_resid; /* across all data items */ | 187 | size_t total_resid; /* across all data items */ |
205 | struct list_head *data_head; /* = &ceph_msg->data */ | ||
206 | 188 | ||
207 | struct ceph_msg_data *data; /* current data item */ | 189 | struct ceph_msg_data *data; /* current data item */ |
208 | size_t resid; /* bytes not yet consumed */ | 190 | size_t resid; /* bytes not yet consumed */ |
@@ -240,7 +222,9 @@ struct ceph_msg { | |||
240 | struct ceph_buffer *middle; | 222 | struct ceph_buffer *middle; |
241 | 223 | ||
242 | size_t data_length; | 224 | size_t data_length; |
243 | struct list_head data; | 225 | struct ceph_msg_data *data; |
226 | int num_data_items; | ||
227 | int max_data_items; | ||
244 | struct ceph_msg_data_cursor cursor; | 228 | struct ceph_msg_data_cursor cursor; |
245 | 229 | ||
246 | struct ceph_connection *con; | 230 | struct ceph_connection *con; |
@@ -381,6 +365,8 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, | |||
381 | void ceph_msg_data_add_bvecs(struct ceph_msg *msg, | 365 | void ceph_msg_data_add_bvecs(struct ceph_msg *msg, |
382 | struct ceph_bvec_iter *bvec_pos); | 366 | struct ceph_bvec_iter *bvec_pos); |
383 | 367 | ||
368 | struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, | ||
369 | gfp_t flags, bool can_fail); | ||
384 | extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, | 370 | extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, |
385 | bool can_fail); | 371 | bool can_fail); |
386 | 372 | ||
diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h index 76c98a512758..729cdf700eae 100644 --- a/include/linux/ceph/msgpool.h +++ b/include/linux/ceph/msgpool.h | |||
@@ -13,14 +13,15 @@ struct ceph_msgpool { | |||
13 | mempool_t *pool; | 13 | mempool_t *pool; |
14 | int type; /* preallocated message type */ | 14 | int type; /* preallocated message type */ |
15 | int front_len; /* preallocated payload size */ | 15 | int front_len; /* preallocated payload size */ |
16 | int max_data_items; | ||
16 | }; | 17 | }; |
17 | 18 | ||
18 | extern int ceph_msgpool_init(struct ceph_msgpool *pool, int type, | 19 | int ceph_msgpool_init(struct ceph_msgpool *pool, int type, |
19 | int front_len, int size, bool blocking, | 20 | int front_len, int max_data_items, int size, |
20 | const char *name); | 21 | const char *name); |
21 | extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); | 22 | extern void ceph_msgpool_destroy(struct ceph_msgpool *pool); |
22 | extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *, | 23 | struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len, |
23 | int front_len); | 24 | int max_data_items); |
24 | extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); | 25 | extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *); |
25 | 26 | ||
26 | #endif | 27 | #endif |
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 02096da01845..7a2af5034278 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h | |||
@@ -136,6 +136,13 @@ struct ceph_osd_req_op { | |||
136 | u64 expected_object_size; | 136 | u64 expected_object_size; |
137 | u64 expected_write_size; | 137 | u64 expected_write_size; |
138 | } alloc_hint; | 138 | } alloc_hint; |
139 | struct { | ||
140 | u64 snapid; | ||
141 | u64 src_version; | ||
142 | u8 flags; | ||
143 | u32 src_fadvise_flags; | ||
144 | struct ceph_osd_data osd_data; | ||
145 | } copy_from; | ||
139 | }; | 146 | }; |
140 | }; | 147 | }; |
141 | 148 | ||
@@ -444,9 +451,8 @@ extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *, | |||
444 | struct page **pages, u64 length, | 451 | struct page **pages, u64 length, |
445 | u32 alignment, bool pages_from_pool, | 452 | u32 alignment, bool pages_from_pool, |
446 | bool own_pages); | 453 | bool own_pages); |
447 | extern int osd_req_op_cls_init(struct ceph_osd_request *osd_req, | 454 | int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, |
448 | unsigned int which, u16 opcode, | 455 | const char *class, const char *method); |
449 | const char *class, const char *method); | ||
450 | extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, | 456 | extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, |
451 | u16 opcode, const char *name, const void *value, | 457 | u16 opcode, const char *name, const void *value, |
452 | size_t size, u8 cmp_op, u8 cmp_mode); | 458 | size_t size, u8 cmp_op, u8 cmp_mode); |
@@ -511,6 +517,16 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, | |||
511 | struct timespec64 *mtime, | 517 | struct timespec64 *mtime, |
512 | struct page **pages, int nr_pages); | 518 | struct page **pages, int nr_pages); |
513 | 519 | ||
520 | int ceph_osdc_copy_from(struct ceph_osd_client *osdc, | ||
521 | u64 src_snapid, u64 src_version, | ||
522 | struct ceph_object_id *src_oid, | ||
523 | struct ceph_object_locator *src_oloc, | ||
524 | u32 src_fadvise_flags, | ||
525 | struct ceph_object_id *dst_oid, | ||
526 | struct ceph_object_locator *dst_oloc, | ||
527 | u32 dst_fadvise_flags, | ||
528 | u8 copy_from_flags); | ||
529 | |||
514 | /* watch/notify */ | 530 | /* watch/notify */ |
515 | struct ceph_osd_linger_request * | 531 | struct ceph_osd_linger_request * |
516 | ceph_osdc_watch(struct ceph_osd_client *osdc, | 532 | ceph_osdc_watch(struct ceph_osd_client *osdc, |
diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h index d0223364349f..5dead8486fd8 100644 --- a/include/linux/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h | |||
@@ -23,16 +23,7 @@ struct ceph_pagelist_cursor { | |||
23 | size_t room; /* room remaining to reset to */ | 23 | size_t room; /* room remaining to reset to */ |
24 | }; | 24 | }; |
25 | 25 | ||
26 | static inline void ceph_pagelist_init(struct ceph_pagelist *pl) | 26 | struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags); |
27 | { | ||
28 | INIT_LIST_HEAD(&pl->head); | ||
29 | pl->mapped_tail = NULL; | ||
30 | pl->length = 0; | ||
31 | pl->room = 0; | ||
32 | INIT_LIST_HEAD(&pl->free_list); | ||
33 | pl->num_pages_free = 0; | ||
34 | refcount_set(&pl->refcnt, 1); | ||
35 | } | ||
36 | 27 | ||
37 | extern void ceph_pagelist_release(struct ceph_pagelist *pl); | 28 | extern void ceph_pagelist_release(struct ceph_pagelist *pl); |
38 | 29 | ||
diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index f1988387c5ad..3eb0e55665b4 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h | |||
@@ -410,6 +410,14 @@ enum { | |||
410 | enum { | 410 | enum { |
411 | CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ | 411 | CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */ |
412 | CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */ | 412 | CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */ |
413 | CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */ | ||
414 | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */ | ||
415 | CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in | ||
416 | the near future */ | ||
417 | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed | ||
418 | in the near future */ | ||
419 | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40,/* data will be accessed only | ||
420 | once by this client */ | ||
413 | }; | 421 | }; |
414 | 422 | ||
415 | #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ | 423 | #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ |
@@ -432,6 +440,15 @@ enum { | |||
432 | }; | 440 | }; |
433 | 441 | ||
434 | enum { | 442 | enum { |
443 | CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */ | ||
444 | CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */ | ||
445 | CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */ | ||
446 | CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to | ||
447 | * cloneid */ | ||
448 | CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */ | ||
449 | }; | ||
450 | |||
451 | enum { | ||
435 | CEPH_OSD_WATCH_OP_UNWATCH = 0, | 452 | CEPH_OSD_WATCH_OP_UNWATCH = 0, |
436 | CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, | 453 | CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1, |
437 | /* note: use only ODD ids to prevent pre-giant code from | 454 | /* note: use only ODD ids to prevent pre-giant code from |
@@ -497,6 +514,17 @@ struct ceph_osd_op { | |||
497 | __le64 expected_object_size; | 514 | __le64 expected_object_size; |
498 | __le64 expected_write_size; | 515 | __le64 expected_write_size; |
499 | } __attribute__ ((packed)) alloc_hint; | 516 | } __attribute__ ((packed)) alloc_hint; |
517 | struct { | ||
518 | __le64 snapid; | ||
519 | __le64 src_version; | ||
520 | __u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */ | ||
521 | /* | ||
522 | * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags | ||
523 | * for src object, flags for dest object are in | ||
524 | * ceph_osd_op::flags. | ||
525 | */ | ||
526 | __le32 src_fadvise_flags; | ||
527 | } __attribute__ ((packed)) copy_from; | ||
500 | }; | 528 | }; |
501 | __le32 payload_len; | 529 | __le32 payload_len; |
502 | } __attribute__ ((packed)); | 530 | } __attribute__ ((packed)); |
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 0a187196aeed..88e35830198c 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c | |||
@@ -156,7 +156,6 @@ static bool con_flag_test_and_set(struct ceph_connection *con, | |||
156 | /* Slab caches for frequently-allocated structures */ | 156 | /* Slab caches for frequently-allocated structures */ |
157 | 157 | ||
158 | static struct kmem_cache *ceph_msg_cache; | 158 | static struct kmem_cache *ceph_msg_cache; |
159 | static struct kmem_cache *ceph_msg_data_cache; | ||
160 | 159 | ||
161 | /* static tag bytes (protocol control messages) */ | 160 | /* static tag bytes (protocol control messages) */ |
162 | static char tag_msg = CEPH_MSGR_TAG_MSG; | 161 | static char tag_msg = CEPH_MSGR_TAG_MSG; |
@@ -235,23 +234,11 @@ static int ceph_msgr_slab_init(void) | |||
235 | if (!ceph_msg_cache) | 234 | if (!ceph_msg_cache) |
236 | return -ENOMEM; | 235 | return -ENOMEM; |
237 | 236 | ||
238 | BUG_ON(ceph_msg_data_cache); | 237 | return 0; |
239 | ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0); | ||
240 | if (ceph_msg_data_cache) | ||
241 | return 0; | ||
242 | |||
243 | kmem_cache_destroy(ceph_msg_cache); | ||
244 | ceph_msg_cache = NULL; | ||
245 | |||
246 | return -ENOMEM; | ||
247 | } | 238 | } |
248 | 239 | ||
249 | static void ceph_msgr_slab_exit(void) | 240 | static void ceph_msgr_slab_exit(void) |
250 | { | 241 | { |
251 | BUG_ON(!ceph_msg_data_cache); | ||
252 | kmem_cache_destroy(ceph_msg_data_cache); | ||
253 | ceph_msg_data_cache = NULL; | ||
254 | |||
255 | BUG_ON(!ceph_msg_cache); | 242 | BUG_ON(!ceph_msg_cache); |
256 | kmem_cache_destroy(ceph_msg_cache); | 243 | kmem_cache_destroy(ceph_msg_cache); |
257 | ceph_msg_cache = NULL; | 244 | ceph_msg_cache = NULL; |
@@ -1141,16 +1128,13 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor) | |||
1141 | static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) | 1128 | static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length) |
1142 | { | 1129 | { |
1143 | struct ceph_msg_data_cursor *cursor = &msg->cursor; | 1130 | struct ceph_msg_data_cursor *cursor = &msg->cursor; |
1144 | struct ceph_msg_data *data; | ||
1145 | 1131 | ||
1146 | BUG_ON(!length); | 1132 | BUG_ON(!length); |
1147 | BUG_ON(length > msg->data_length); | 1133 | BUG_ON(length > msg->data_length); |
1148 | BUG_ON(list_empty(&msg->data)); | 1134 | BUG_ON(!msg->num_data_items); |
1149 | 1135 | ||
1150 | cursor->data_head = &msg->data; | ||
1151 | cursor->total_resid = length; | 1136 | cursor->total_resid = length; |
1152 | data = list_first_entry(&msg->data, struct ceph_msg_data, links); | 1137 | cursor->data = msg->data; |
1153 | cursor->data = data; | ||
1154 | 1138 | ||
1155 | __ceph_msg_data_cursor_init(cursor); | 1139 | __ceph_msg_data_cursor_init(cursor); |
1156 | } | 1140 | } |
@@ -1231,8 +1215,7 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, | |||
1231 | 1215 | ||
1232 | if (!cursor->resid && cursor->total_resid) { | 1216 | if (!cursor->resid && cursor->total_resid) { |
1233 | WARN_ON(!cursor->last_piece); | 1217 | WARN_ON(!cursor->last_piece); |
1234 | BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); | 1218 | cursor->data++; |
1235 | cursor->data = list_next_entry(cursor->data, links); | ||
1236 | __ceph_msg_data_cursor_init(cursor); | 1219 | __ceph_msg_data_cursor_init(cursor); |
1237 | new_piece = true; | 1220 | new_piece = true; |
1238 | } | 1221 | } |
@@ -1248,9 +1231,6 @@ static size_t sizeof_footer(struct ceph_connection *con) | |||
1248 | 1231 | ||
1249 | static void prepare_message_data(struct ceph_msg *msg, u32 data_len) | 1232 | static void prepare_message_data(struct ceph_msg *msg, u32 data_len) |
1250 | { | 1233 | { |
1251 | BUG_ON(!msg); | ||
1252 | BUG_ON(!data_len); | ||
1253 | |||
1254 | /* Initialize data cursor */ | 1234 | /* Initialize data cursor */ |
1255 | 1235 | ||
1256 | ceph_msg_data_cursor_init(msg, (size_t)data_len); | 1236 | ceph_msg_data_cursor_init(msg, (size_t)data_len); |
@@ -1590,7 +1570,7 @@ static int write_partial_message_data(struct ceph_connection *con) | |||
1590 | 1570 | ||
1591 | dout("%s %p msg %p\n", __func__, con, msg); | 1571 | dout("%s %p msg %p\n", __func__, con, msg); |
1592 | 1572 | ||
1593 | if (list_empty(&msg->data)) | 1573 | if (!msg->num_data_items) |
1594 | return -EINVAL; | 1574 | return -EINVAL; |
1595 | 1575 | ||
1596 | /* | 1576 | /* |
@@ -2347,8 +2327,7 @@ static int read_partial_msg_data(struct ceph_connection *con) | |||
2347 | u32 crc = 0; | 2327 | u32 crc = 0; |
2348 | int ret; | 2328 | int ret; |
2349 | 2329 | ||
2350 | BUG_ON(!msg); | 2330 | if (!msg->num_data_items) |
2351 | if (list_empty(&msg->data)) | ||
2352 | return -EIO; | 2331 | return -EIO; |
2353 | 2332 | ||
2354 | if (do_datacrc) | 2333 | if (do_datacrc) |
@@ -3256,32 +3235,16 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con, | |||
3256 | return false; | 3235 | return false; |
3257 | } | 3236 | } |
3258 | 3237 | ||
3259 | static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type) | 3238 | static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg) |
3260 | { | 3239 | { |
3261 | struct ceph_msg_data *data; | 3240 | BUG_ON(msg->num_data_items >= msg->max_data_items); |
3262 | 3241 | return &msg->data[msg->num_data_items++]; | |
3263 | if (WARN_ON(!ceph_msg_data_type_valid(type))) | ||
3264 | return NULL; | ||
3265 | |||
3266 | data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS); | ||
3267 | if (!data) | ||
3268 | return NULL; | ||
3269 | |||
3270 | data->type = type; | ||
3271 | INIT_LIST_HEAD(&data->links); | ||
3272 | |||
3273 | return data; | ||
3274 | } | 3242 | } |
3275 | 3243 | ||
3276 | static void ceph_msg_data_destroy(struct ceph_msg_data *data) | 3244 | static void ceph_msg_data_destroy(struct ceph_msg_data *data) |
3277 | { | 3245 | { |
3278 | if (!data) | ||
3279 | return; | ||
3280 | |||
3281 | WARN_ON(!list_empty(&data->links)); | ||
3282 | if (data->type == CEPH_MSG_DATA_PAGELIST) | 3246 | if (data->type == CEPH_MSG_DATA_PAGELIST) |
3283 | ceph_pagelist_release(data->pagelist); | 3247 | ceph_pagelist_release(data->pagelist); |
3284 | kmem_cache_free(ceph_msg_data_cache, data); | ||
3285 | } | 3248 | } |
3286 | 3249 | ||
3287 | void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, | 3250 | void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, |
@@ -3292,13 +3255,12 @@ void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages, | |||
3292 | BUG_ON(!pages); | 3255 | BUG_ON(!pages); |
3293 | BUG_ON(!length); | 3256 | BUG_ON(!length); |
3294 | 3257 | ||
3295 | data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES); | 3258 | data = ceph_msg_data_add(msg); |
3296 | BUG_ON(!data); | 3259 | data->type = CEPH_MSG_DATA_PAGES; |
3297 | data->pages = pages; | 3260 | data->pages = pages; |
3298 | data->length = length; | 3261 | data->length = length; |
3299 | data->alignment = alignment & ~PAGE_MASK; | 3262 | data->alignment = alignment & ~PAGE_MASK; |
3300 | 3263 | ||
3301 | list_add_tail(&data->links, &msg->data); | ||
3302 | msg->data_length += length; | 3264 | msg->data_length += length; |
3303 | } | 3265 | } |
3304 | EXPORT_SYMBOL(ceph_msg_data_add_pages); | 3266 | EXPORT_SYMBOL(ceph_msg_data_add_pages); |
@@ -3311,11 +3273,11 @@ void ceph_msg_data_add_pagelist(struct ceph_msg *msg, | |||
3311 | BUG_ON(!pagelist); | 3273 | BUG_ON(!pagelist); |
3312 | BUG_ON(!pagelist->length); | 3274 | BUG_ON(!pagelist->length); |
3313 | 3275 | ||
3314 | data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST); | 3276 | data = ceph_msg_data_add(msg); |
3315 | BUG_ON(!data); | 3277 | data->type = CEPH_MSG_DATA_PAGELIST; |
3278 | refcount_inc(&pagelist->refcnt); | ||
3316 | data->pagelist = pagelist; | 3279 | data->pagelist = pagelist; |
3317 | 3280 | ||
3318 | list_add_tail(&data->links, &msg->data); | ||
3319 | msg->data_length += pagelist->length; | 3281 | msg->data_length += pagelist->length; |
3320 | } | 3282 | } |
3321 | EXPORT_SYMBOL(ceph_msg_data_add_pagelist); | 3283 | EXPORT_SYMBOL(ceph_msg_data_add_pagelist); |
@@ -3326,12 +3288,11 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos, | |||
3326 | { | 3288 | { |
3327 | struct ceph_msg_data *data; | 3289 | struct ceph_msg_data *data; |
3328 | 3290 | ||
3329 | data = ceph_msg_data_create(CEPH_MSG_DATA_BIO); | 3291 | data = ceph_msg_data_add(msg); |
3330 | BUG_ON(!data); | 3292 | data->type = CEPH_MSG_DATA_BIO; |
3331 | data->bio_pos = *bio_pos; | 3293 | data->bio_pos = *bio_pos; |
3332 | data->bio_length = length; | 3294 | data->bio_length = length; |
3333 | 3295 | ||
3334 | list_add_tail(&data->links, &msg->data); | ||
3335 | msg->data_length += length; | 3296 | msg->data_length += length; |
3336 | } | 3297 | } |
3337 | EXPORT_SYMBOL(ceph_msg_data_add_bio); | 3298 | EXPORT_SYMBOL(ceph_msg_data_add_bio); |
@@ -3342,11 +3303,10 @@ void ceph_msg_data_add_bvecs(struct ceph_msg *msg, | |||
3342 | { | 3303 | { |
3343 | struct ceph_msg_data *data; | 3304 | struct ceph_msg_data *data; |
3344 | 3305 | ||
3345 | data = ceph_msg_data_create(CEPH_MSG_DATA_BVECS); | 3306 | data = ceph_msg_data_add(msg); |
3346 | BUG_ON(!data); | 3307 | data->type = CEPH_MSG_DATA_BVECS; |
3347 | data->bvec_pos = *bvec_pos; | 3308 | data->bvec_pos = *bvec_pos; |
3348 | 3309 | ||
3349 | list_add_tail(&data->links, &msg->data); | ||
3350 | msg->data_length += bvec_pos->iter.bi_size; | 3310 | msg->data_length += bvec_pos->iter.bi_size; |
3351 | } | 3311 | } |
3352 | EXPORT_SYMBOL(ceph_msg_data_add_bvecs); | 3312 | EXPORT_SYMBOL(ceph_msg_data_add_bvecs); |
@@ -3355,8 +3315,8 @@ EXPORT_SYMBOL(ceph_msg_data_add_bvecs); | |||
3355 | * construct a new message with given type, size | 3315 | * construct a new message with given type, size |
3356 | * the new msg has a ref count of 1. | 3316 | * the new msg has a ref count of 1. |
3357 | */ | 3317 | */ |
3358 | struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, | 3318 | struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, |
3359 | bool can_fail) | 3319 | gfp_t flags, bool can_fail) |
3360 | { | 3320 | { |
3361 | struct ceph_msg *m; | 3321 | struct ceph_msg *m; |
3362 | 3322 | ||
@@ -3370,7 +3330,6 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, | |||
3370 | 3330 | ||
3371 | INIT_LIST_HEAD(&m->list_head); | 3331 | INIT_LIST_HEAD(&m->list_head); |
3372 | kref_init(&m->kref); | 3332 | kref_init(&m->kref); |
3373 | INIT_LIST_HEAD(&m->data); | ||
3374 | 3333 | ||
3375 | /* front */ | 3334 | /* front */ |
3376 | if (front_len) { | 3335 | if (front_len) { |
@@ -3385,6 +3344,15 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, | |||
3385 | } | 3344 | } |
3386 | m->front_alloc_len = m->front.iov_len = front_len; | 3345 | m->front_alloc_len = m->front.iov_len = front_len; |
3387 | 3346 | ||
3347 | if (max_data_items) { | ||
3348 | m->data = kmalloc_array(max_data_items, sizeof(*m->data), | ||
3349 | flags); | ||
3350 | if (!m->data) | ||
3351 | goto out2; | ||
3352 | |||
3353 | m->max_data_items = max_data_items; | ||
3354 | } | ||
3355 | |||
3388 | dout("ceph_msg_new %p front %d\n", m, front_len); | 3356 | dout("ceph_msg_new %p front %d\n", m, front_len); |
3389 | return m; | 3357 | return m; |
3390 | 3358 | ||
@@ -3401,6 +3369,13 @@ out: | |||
3401 | } | 3369 | } |
3402 | return NULL; | 3370 | return NULL; |
3403 | } | 3371 | } |
3372 | EXPORT_SYMBOL(ceph_msg_new2); | ||
3373 | |||
3374 | struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags, | ||
3375 | bool can_fail) | ||
3376 | { | ||
3377 | return ceph_msg_new2(type, front_len, 0, flags, can_fail); | ||
3378 | } | ||
3404 | EXPORT_SYMBOL(ceph_msg_new); | 3379 | EXPORT_SYMBOL(ceph_msg_new); |
3405 | 3380 | ||
3406 | /* | 3381 | /* |
@@ -3496,13 +3471,14 @@ static void ceph_msg_free(struct ceph_msg *m) | |||
3496 | { | 3471 | { |
3497 | dout("%s %p\n", __func__, m); | 3472 | dout("%s %p\n", __func__, m); |
3498 | kvfree(m->front.iov_base); | 3473 | kvfree(m->front.iov_base); |
3474 | kfree(m->data); | ||
3499 | kmem_cache_free(ceph_msg_cache, m); | 3475 | kmem_cache_free(ceph_msg_cache, m); |
3500 | } | 3476 | } |
3501 | 3477 | ||
3502 | static void ceph_msg_release(struct kref *kref) | 3478 | static void ceph_msg_release(struct kref *kref) |
3503 | { | 3479 | { |
3504 | struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); | 3480 | struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); |
3505 | struct ceph_msg_data *data, *next; | 3481 | int i; |
3506 | 3482 | ||
3507 | dout("%s %p\n", __func__, m); | 3483 | dout("%s %p\n", __func__, m); |
3508 | WARN_ON(!list_empty(&m->list_head)); | 3484 | WARN_ON(!list_empty(&m->list_head)); |
@@ -3515,11 +3491,8 @@ static void ceph_msg_release(struct kref *kref) | |||
3515 | m->middle = NULL; | 3491 | m->middle = NULL; |
3516 | } | 3492 | } |
3517 | 3493 | ||
3518 | list_for_each_entry_safe(data, next, &m->data, links) { | 3494 | for (i = 0; i < m->num_data_items; i++) |
3519 | list_del_init(&data->links); | 3495 | ceph_msg_data_destroy(&m->data[i]); |
3520 | ceph_msg_data_destroy(data); | ||
3521 | } | ||
3522 | m->data_length = 0; | ||
3523 | 3496 | ||
3524 | if (m->pool) | 3497 | if (m->pool) |
3525 | ceph_msgpool_put(m->pool, m); | 3498 | ceph_msgpool_put(m->pool, m); |
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c index 72571535883f..e3ecb80cd182 100644 --- a/net/ceph/msgpool.c +++ b/net/ceph/msgpool.c | |||
@@ -14,7 +14,8 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg) | |||
14 | struct ceph_msgpool *pool = arg; | 14 | struct ceph_msgpool *pool = arg; |
15 | struct ceph_msg *msg; | 15 | struct ceph_msg *msg; |
16 | 16 | ||
17 | msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true); | 17 | msg = ceph_msg_new2(pool->type, pool->front_len, pool->max_data_items, |
18 | gfp_mask, true); | ||
18 | if (!msg) { | 19 | if (!msg) { |
19 | dout("msgpool_alloc %s failed\n", pool->name); | 20 | dout("msgpool_alloc %s failed\n", pool->name); |
20 | } else { | 21 | } else { |
@@ -35,11 +36,13 @@ static void msgpool_free(void *element, void *arg) | |||
35 | } | 36 | } |
36 | 37 | ||
37 | int ceph_msgpool_init(struct ceph_msgpool *pool, int type, | 38 | int ceph_msgpool_init(struct ceph_msgpool *pool, int type, |
38 | int front_len, int size, bool blocking, const char *name) | 39 | int front_len, int max_data_items, int size, |
40 | const char *name) | ||
39 | { | 41 | { |
40 | dout("msgpool %s init\n", name); | 42 | dout("msgpool %s init\n", name); |
41 | pool->type = type; | 43 | pool->type = type; |
42 | pool->front_len = front_len; | 44 | pool->front_len = front_len; |
45 | pool->max_data_items = max_data_items; | ||
43 | pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool); | 46 | pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool); |
44 | if (!pool->pool) | 47 | if (!pool->pool) |
45 | return -ENOMEM; | 48 | return -ENOMEM; |
@@ -53,18 +56,21 @@ void ceph_msgpool_destroy(struct ceph_msgpool *pool) | |||
53 | mempool_destroy(pool->pool); | 56 | mempool_destroy(pool->pool); |
54 | } | 57 | } |
55 | 58 | ||
56 | struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, | 59 | struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len, |
57 | int front_len) | 60 | int max_data_items) |
58 | { | 61 | { |
59 | struct ceph_msg *msg; | 62 | struct ceph_msg *msg; |
60 | 63 | ||
61 | if (front_len > pool->front_len) { | 64 | if (front_len > pool->front_len || |
62 | dout("msgpool_get %s need front %d, pool size is %d\n", | 65 | max_data_items > pool->max_data_items) { |
63 | pool->name, front_len, pool->front_len); | 66 | pr_warn_ratelimited("%s need %d/%d, pool %s has %d/%d\n", |
64 | WARN_ON(1); | 67 | __func__, front_len, max_data_items, pool->name, |
68 | pool->front_len, pool->max_data_items); | ||
69 | WARN_ON_ONCE(1); | ||
65 | 70 | ||
66 | /* try to alloc a fresh message */ | 71 | /* try to alloc a fresh message */ |
67 | return ceph_msg_new(pool->type, front_len, GFP_NOFS, false); | 72 | return ceph_msg_new2(pool->type, front_len, max_data_items, |
73 | GFP_NOFS, false); | ||
68 | } | 74 | } |
69 | 75 | ||
70 | msg = mempool_alloc(pool->pool, GFP_NOFS); | 76 | msg = mempool_alloc(pool->pool, GFP_NOFS); |
@@ -80,6 +86,9 @@ void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg) | |||
80 | msg->front.iov_len = pool->front_len; | 86 | msg->front.iov_len = pool->front_len; |
81 | msg->hdr.front_len = cpu_to_le32(pool->front_len); | 87 | msg->hdr.front_len = cpu_to_le32(pool->front_len); |
82 | 88 | ||
89 | msg->data_length = 0; | ||
90 | msg->num_data_items = 0; | ||
91 | |||
83 | kref_init(&msg->kref); /* retake single ref */ | 92 | kref_init(&msg->kref); /* retake single ref */ |
84 | mempool_free(msg, pool->pool); | 93 | mempool_free(msg, pool->pool); |
85 | } | 94 | } |
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 60934bd8796c..d23a9f81f3d7 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c | |||
@@ -126,6 +126,9 @@ static void ceph_osd_data_init(struct ceph_osd_data *osd_data) | |||
126 | osd_data->type = CEPH_OSD_DATA_TYPE_NONE; | 126 | osd_data->type = CEPH_OSD_DATA_TYPE_NONE; |
127 | } | 127 | } |
128 | 128 | ||
129 | /* | ||
130 | * Consumes @pages if @own_pages is true. | ||
131 | */ | ||
129 | static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, | 132 | static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, |
130 | struct page **pages, u64 length, u32 alignment, | 133 | struct page **pages, u64 length, u32 alignment, |
131 | bool pages_from_pool, bool own_pages) | 134 | bool pages_from_pool, bool own_pages) |
@@ -138,6 +141,9 @@ static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data, | |||
138 | osd_data->own_pages = own_pages; | 141 | osd_data->own_pages = own_pages; |
139 | } | 142 | } |
140 | 143 | ||
144 | /* | ||
145 | * Consumes a ref on @pagelist. | ||
146 | */ | ||
141 | static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, | 147 | static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data, |
142 | struct ceph_pagelist *pagelist) | 148 | struct ceph_pagelist *pagelist) |
143 | { | 149 | { |
@@ -362,6 +368,8 @@ static void ceph_osd_data_release(struct ceph_osd_data *osd_data) | |||
362 | num_pages = calc_pages_for((u64)osd_data->alignment, | 368 | num_pages = calc_pages_for((u64)osd_data->alignment, |
363 | (u64)osd_data->length); | 369 | (u64)osd_data->length); |
364 | ceph_release_page_vector(osd_data->pages, num_pages); | 370 | ceph_release_page_vector(osd_data->pages, num_pages); |
371 | } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) { | ||
372 | ceph_pagelist_release(osd_data->pagelist); | ||
365 | } | 373 | } |
366 | ceph_osd_data_init(osd_data); | 374 | ceph_osd_data_init(osd_data); |
367 | } | 375 | } |
@@ -402,6 +410,9 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, | |||
402 | case CEPH_OSD_OP_LIST_WATCHERS: | 410 | case CEPH_OSD_OP_LIST_WATCHERS: |
403 | ceph_osd_data_release(&op->list_watchers.response_data); | 411 | ceph_osd_data_release(&op->list_watchers.response_data); |
404 | break; | 412 | break; |
413 | case CEPH_OSD_OP_COPY_FROM: | ||
414 | ceph_osd_data_release(&op->copy_from.osd_data); | ||
415 | break; | ||
405 | default: | 416 | default: |
406 | break; | 417 | break; |
407 | } | 418 | } |
@@ -606,12 +617,15 @@ static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc) | |||
606 | return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0); | 617 | return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0); |
607 | } | 618 | } |
608 | 619 | ||
609 | int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) | 620 | static int __ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp, |
621 | int num_request_data_items, | ||
622 | int num_reply_data_items) | ||
610 | { | 623 | { |
611 | struct ceph_osd_client *osdc = req->r_osdc; | 624 | struct ceph_osd_client *osdc = req->r_osdc; |
612 | struct ceph_msg *msg; | 625 | struct ceph_msg *msg; |
613 | int msg_size; | 626 | int msg_size; |
614 | 627 | ||
628 | WARN_ON(req->r_request || req->r_reply); | ||
615 | WARN_ON(ceph_oid_empty(&req->r_base_oid)); | 629 | WARN_ON(ceph_oid_empty(&req->r_base_oid)); |
616 | WARN_ON(ceph_oloc_empty(&req->r_base_oloc)); | 630 | WARN_ON(ceph_oloc_empty(&req->r_base_oloc)); |
617 | 631 | ||
@@ -633,9 +647,11 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) | |||
633 | msg_size += 4 + 8; /* retry_attempt, features */ | 647 | msg_size += 4 + 8; /* retry_attempt, features */ |
634 | 648 | ||
635 | if (req->r_mempool) | 649 | if (req->r_mempool) |
636 | msg = ceph_msgpool_get(&osdc->msgpool_op, 0); | 650 | msg = ceph_msgpool_get(&osdc->msgpool_op, msg_size, |
651 | num_request_data_items); | ||
637 | else | 652 | else |
638 | msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp, true); | 653 | msg = ceph_msg_new2(CEPH_MSG_OSD_OP, msg_size, |
654 | num_request_data_items, gfp, true); | ||
639 | if (!msg) | 655 | if (!msg) |
640 | return -ENOMEM; | 656 | return -ENOMEM; |
641 | 657 | ||
@@ -648,9 +664,11 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) | |||
648 | msg_size += req->r_num_ops * sizeof(struct ceph_osd_op); | 664 | msg_size += req->r_num_ops * sizeof(struct ceph_osd_op); |
649 | 665 | ||
650 | if (req->r_mempool) | 666 | if (req->r_mempool) |
651 | msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); | 667 | msg = ceph_msgpool_get(&osdc->msgpool_op_reply, msg_size, |
668 | num_reply_data_items); | ||
652 | else | 669 | else |
653 | msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size, gfp, true); | 670 | msg = ceph_msg_new2(CEPH_MSG_OSD_OPREPLY, msg_size, |
671 | num_reply_data_items, gfp, true); | ||
654 | if (!msg) | 672 | if (!msg) |
655 | return -ENOMEM; | 673 | return -ENOMEM; |
656 | 674 | ||
@@ -658,7 +676,6 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) | |||
658 | 676 | ||
659 | return 0; | 677 | return 0; |
660 | } | 678 | } |
661 | EXPORT_SYMBOL(ceph_osdc_alloc_messages); | ||
662 | 679 | ||
663 | static bool osd_req_opcode_valid(u16 opcode) | 680 | static bool osd_req_opcode_valid(u16 opcode) |
664 | { | 681 | { |
@@ -671,6 +688,65 @@ __CEPH_FORALL_OSD_OPS(GENERATE_CASE) | |||
671 | } | 688 | } |
672 | } | 689 | } |
673 | 690 | ||
691 | static void get_num_data_items(struct ceph_osd_request *req, | ||
692 | int *num_request_data_items, | ||
693 | int *num_reply_data_items) | ||
694 | { | ||
695 | struct ceph_osd_req_op *op; | ||
696 | |||
697 | *num_request_data_items = 0; | ||
698 | *num_reply_data_items = 0; | ||
699 | |||
700 | for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) { | ||
701 | switch (op->op) { | ||
702 | /* request */ | ||
703 | case CEPH_OSD_OP_WRITE: | ||
704 | case CEPH_OSD_OP_WRITEFULL: | ||
705 | case CEPH_OSD_OP_SETXATTR: | ||
706 | case CEPH_OSD_OP_CMPXATTR: | ||
707 | case CEPH_OSD_OP_NOTIFY_ACK: | ||
708 | case CEPH_OSD_OP_COPY_FROM: | ||
709 | *num_request_data_items += 1; | ||
710 | break; | ||
711 | |||
712 | /* reply */ | ||
713 | case CEPH_OSD_OP_STAT: | ||
714 | case CEPH_OSD_OP_READ: | ||
715 | case CEPH_OSD_OP_LIST_WATCHERS: | ||
716 | *num_reply_data_items += 1; | ||
717 | break; | ||
718 | |||
719 | /* both */ | ||
720 | case CEPH_OSD_OP_NOTIFY: | ||
721 | *num_request_data_items += 1; | ||
722 | *num_reply_data_items += 1; | ||
723 | break; | ||
724 | case CEPH_OSD_OP_CALL: | ||
725 | *num_request_data_items += 2; | ||
726 | *num_reply_data_items += 1; | ||
727 | break; | ||
728 | |||
729 | default: | ||
730 | WARN_ON(!osd_req_opcode_valid(op->op)); | ||
731 | break; | ||
732 | } | ||
733 | } | ||
734 | } | ||
735 | |||
736 | /* | ||
737 | * oid, oloc and OSD op opcode(s) must be filled in before this function | ||
738 | * is called. | ||
739 | */ | ||
740 | int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) | ||
741 | { | ||
742 | int num_request_data_items, num_reply_data_items; | ||
743 | |||
744 | get_num_data_items(req, &num_request_data_items, &num_reply_data_items); | ||
745 | return __ceph_osdc_alloc_messages(req, gfp, num_request_data_items, | ||
746 | num_reply_data_items); | ||
747 | } | ||
748 | EXPORT_SYMBOL(ceph_osdc_alloc_messages); | ||
749 | |||
674 | /* | 750 | /* |
675 | * This is an osd op init function for opcodes that have no data or | 751 | * This is an osd op init function for opcodes that have no data or |
676 | * other information associated with them. It also serves as a | 752 | * other information associated with them. It also serves as a |
@@ -767,22 +843,19 @@ void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req, | |||
767 | EXPORT_SYMBOL(osd_req_op_extent_dup_last); | 843 | EXPORT_SYMBOL(osd_req_op_extent_dup_last); |
768 | 844 | ||
769 | int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, | 845 | int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, |
770 | u16 opcode, const char *class, const char *method) | 846 | const char *class, const char *method) |
771 | { | 847 | { |
772 | struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, | 848 | struct ceph_osd_req_op *op; |
773 | opcode, 0); | ||
774 | struct ceph_pagelist *pagelist; | 849 | struct ceph_pagelist *pagelist; |
775 | size_t payload_len = 0; | 850 | size_t payload_len = 0; |
776 | size_t size; | 851 | size_t size; |
777 | 852 | ||
778 | BUG_ON(opcode != CEPH_OSD_OP_CALL); | 853 | op = _osd_req_op_init(osd_req, which, CEPH_OSD_OP_CALL, 0); |
779 | 854 | ||
780 | pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS); | 855 | pagelist = ceph_pagelist_alloc(GFP_NOFS); |
781 | if (!pagelist) | 856 | if (!pagelist) |
782 | return -ENOMEM; | 857 | return -ENOMEM; |
783 | 858 | ||
784 | ceph_pagelist_init(pagelist); | ||
785 | |||
786 | op->cls.class_name = class; | 859 | op->cls.class_name = class; |
787 | size = strlen(class); | 860 | size = strlen(class); |
788 | BUG_ON(size > (size_t) U8_MAX); | 861 | BUG_ON(size > (size_t) U8_MAX); |
@@ -815,12 +888,10 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, | |||
815 | 888 | ||
816 | BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); | 889 | BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); |
817 | 890 | ||
818 | pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); | 891 | pagelist = ceph_pagelist_alloc(GFP_NOFS); |
819 | if (!pagelist) | 892 | if (!pagelist) |
820 | return -ENOMEM; | 893 | return -ENOMEM; |
821 | 894 | ||
822 | ceph_pagelist_init(pagelist); | ||
823 | |||
824 | payload_len = strlen(name); | 895 | payload_len = strlen(name); |
825 | op->xattr.name_len = payload_len; | 896 | op->xattr.name_len = payload_len; |
826 | ceph_pagelist_append(pagelist, name, payload_len); | 897 | ceph_pagelist_append(pagelist, name, payload_len); |
@@ -900,12 +971,6 @@ static void ceph_osdc_msg_data_add(struct ceph_msg *msg, | |||
900 | static u32 osd_req_encode_op(struct ceph_osd_op *dst, | 971 | static u32 osd_req_encode_op(struct ceph_osd_op *dst, |
901 | const struct ceph_osd_req_op *src) | 972 | const struct ceph_osd_req_op *src) |
902 | { | 973 | { |
903 | if (WARN_ON(!osd_req_opcode_valid(src->op))) { | ||
904 | pr_err("unrecognized osd opcode %d\n", src->op); | ||
905 | |||
906 | return 0; | ||
907 | } | ||
908 | |||
909 | switch (src->op) { | 974 | switch (src->op) { |
910 | case CEPH_OSD_OP_STAT: | 975 | case CEPH_OSD_OP_STAT: |
911 | break; | 976 | break; |
@@ -955,6 +1020,14 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, | |||
955 | case CEPH_OSD_OP_CREATE: | 1020 | case CEPH_OSD_OP_CREATE: |
956 | case CEPH_OSD_OP_DELETE: | 1021 | case CEPH_OSD_OP_DELETE: |
957 | break; | 1022 | break; |
1023 | case CEPH_OSD_OP_COPY_FROM: | ||
1024 | dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid); | ||
1025 | dst->copy_from.src_version = | ||
1026 | cpu_to_le64(src->copy_from.src_version); | ||
1027 | dst->copy_from.flags = src->copy_from.flags; | ||
1028 | dst->copy_from.src_fadvise_flags = | ||
1029 | cpu_to_le32(src->copy_from.src_fadvise_flags); | ||
1030 | break; | ||
958 | default: | 1031 | default: |
959 | pr_err("unsupported osd opcode %s\n", | 1032 | pr_err("unsupported osd opcode %s\n", |
960 | ceph_osd_op_name(src->op)); | 1033 | ceph_osd_op_name(src->op)); |
@@ -1038,7 +1111,15 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, | |||
1038 | if (flags & CEPH_OSD_FLAG_WRITE) | 1111 | if (flags & CEPH_OSD_FLAG_WRITE) |
1039 | req->r_data_offset = off; | 1112 | req->r_data_offset = off; |
1040 | 1113 | ||
1041 | r = ceph_osdc_alloc_messages(req, GFP_NOFS); | 1114 | if (num_ops > 1) |
1115 | /* | ||
1116 | * This is a special case for ceph_writepages_start(), but it | ||
1117 | * also covers ceph_uninline_data(). If more multi-op request | ||
1118 | * use cases emerge, we will need a separate helper. | ||
1119 | */ | ||
1120 | r = __ceph_osdc_alloc_messages(req, GFP_NOFS, num_ops, 0); | ||
1121 | else | ||
1122 | r = ceph_osdc_alloc_messages(req, GFP_NOFS); | ||
1042 | if (r) | 1123 | if (r) |
1043 | goto fail; | 1124 | goto fail; |
1044 | 1125 | ||
@@ -1845,48 +1926,55 @@ static bool should_plug_request(struct ceph_osd_request *req) | |||
1845 | return true; | 1926 | return true; |
1846 | } | 1927 | } |
1847 | 1928 | ||
1848 | static void setup_request_data(struct ceph_osd_request *req, | 1929 | /* |
1849 | struct ceph_msg *msg) | 1930 | * Keep get_num_data_items() in sync with this function. |
1931 | */ | ||
1932 | static void setup_request_data(struct ceph_osd_request *req) | ||
1850 | { | 1933 | { |
1851 | u32 data_len = 0; | 1934 | struct ceph_msg *request_msg = req->r_request; |
1852 | int i; | 1935 | struct ceph_msg *reply_msg = req->r_reply; |
1936 | struct ceph_osd_req_op *op; | ||
1853 | 1937 | ||
1854 | if (!list_empty(&msg->data)) | 1938 | if (req->r_request->num_data_items || req->r_reply->num_data_items) |
1855 | return; | 1939 | return; |
1856 | 1940 | ||
1857 | WARN_ON(msg->data_length); | 1941 | WARN_ON(request_msg->data_length || reply_msg->data_length); |
1858 | for (i = 0; i < req->r_num_ops; i++) { | 1942 | for (op = req->r_ops; op != &req->r_ops[req->r_num_ops]; op++) { |
1859 | struct ceph_osd_req_op *op = &req->r_ops[i]; | ||
1860 | |||
1861 | switch (op->op) { | 1943 | switch (op->op) { |
1862 | /* request */ | 1944 | /* request */ |
1863 | case CEPH_OSD_OP_WRITE: | 1945 | case CEPH_OSD_OP_WRITE: |
1864 | case CEPH_OSD_OP_WRITEFULL: | 1946 | case CEPH_OSD_OP_WRITEFULL: |
1865 | WARN_ON(op->indata_len != op->extent.length); | 1947 | WARN_ON(op->indata_len != op->extent.length); |
1866 | ceph_osdc_msg_data_add(msg, &op->extent.osd_data); | 1948 | ceph_osdc_msg_data_add(request_msg, |
1949 | &op->extent.osd_data); | ||
1867 | break; | 1950 | break; |
1868 | case CEPH_OSD_OP_SETXATTR: | 1951 | case CEPH_OSD_OP_SETXATTR: |
1869 | case CEPH_OSD_OP_CMPXATTR: | 1952 | case CEPH_OSD_OP_CMPXATTR: |
1870 | WARN_ON(op->indata_len != op->xattr.name_len + | 1953 | WARN_ON(op->indata_len != op->xattr.name_len + |
1871 | op->xattr.value_len); | 1954 | op->xattr.value_len); |
1872 | ceph_osdc_msg_data_add(msg, &op->xattr.osd_data); | 1955 | ceph_osdc_msg_data_add(request_msg, |
1956 | &op->xattr.osd_data); | ||
1873 | break; | 1957 | break; |
1874 | case CEPH_OSD_OP_NOTIFY_ACK: | 1958 | case CEPH_OSD_OP_NOTIFY_ACK: |
1875 | ceph_osdc_msg_data_add(msg, | 1959 | ceph_osdc_msg_data_add(request_msg, |
1876 | &op->notify_ack.request_data); | 1960 | &op->notify_ack.request_data); |
1877 | break; | 1961 | break; |
1962 | case CEPH_OSD_OP_COPY_FROM: | ||
1963 | ceph_osdc_msg_data_add(request_msg, | ||
1964 | &op->copy_from.osd_data); | ||
1965 | break; | ||
1878 | 1966 | ||
1879 | /* reply */ | 1967 | /* reply */ |
1880 | case CEPH_OSD_OP_STAT: | 1968 | case CEPH_OSD_OP_STAT: |
1881 | ceph_osdc_msg_data_add(req->r_reply, | 1969 | ceph_osdc_msg_data_add(reply_msg, |
1882 | &op->raw_data_in); | 1970 | &op->raw_data_in); |
1883 | break; | 1971 | break; |
1884 | case CEPH_OSD_OP_READ: | 1972 | case CEPH_OSD_OP_READ: |
1885 | ceph_osdc_msg_data_add(req->r_reply, | 1973 | ceph_osdc_msg_data_add(reply_msg, |
1886 | &op->extent.osd_data); | 1974 | &op->extent.osd_data); |
1887 | break; | 1975 | break; |
1888 | case CEPH_OSD_OP_LIST_WATCHERS: | 1976 | case CEPH_OSD_OP_LIST_WATCHERS: |
1889 | ceph_osdc_msg_data_add(req->r_reply, | 1977 | ceph_osdc_msg_data_add(reply_msg, |
1890 | &op->list_watchers.response_data); | 1978 | &op->list_watchers.response_data); |
1891 | break; | 1979 | break; |
1892 | 1980 | ||
@@ -1895,25 +1983,23 @@ static void setup_request_data(struct ceph_osd_request *req, | |||
1895 | WARN_ON(op->indata_len != op->cls.class_len + | 1983 | WARN_ON(op->indata_len != op->cls.class_len + |
1896 | op->cls.method_len + | 1984 | op->cls.method_len + |
1897 | op->cls.indata_len); | 1985 | op->cls.indata_len); |
1898 | ceph_osdc_msg_data_add(msg, &op->cls.request_info); | 1986 | ceph_osdc_msg_data_add(request_msg, |
1987 | &op->cls.request_info); | ||
1899 | /* optional, can be NONE */ | 1988 | /* optional, can be NONE */ |
1900 | ceph_osdc_msg_data_add(msg, &op->cls.request_data); | 1989 | ceph_osdc_msg_data_add(request_msg, |
1990 | &op->cls.request_data); | ||
1901 | /* optional, can be NONE */ | 1991 | /* optional, can be NONE */ |
1902 | ceph_osdc_msg_data_add(req->r_reply, | 1992 | ceph_osdc_msg_data_add(reply_msg, |
1903 | &op->cls.response_data); | 1993 | &op->cls.response_data); |
1904 | break; | 1994 | break; |
1905 | case CEPH_OSD_OP_NOTIFY: | 1995 | case CEPH_OSD_OP_NOTIFY: |
1906 | ceph_osdc_msg_data_add(msg, | 1996 | ceph_osdc_msg_data_add(request_msg, |
1907 | &op->notify.request_data); | 1997 | &op->notify.request_data); |
1908 | ceph_osdc_msg_data_add(req->r_reply, | 1998 | ceph_osdc_msg_data_add(reply_msg, |
1909 | &op->notify.response_data); | 1999 | &op->notify.response_data); |
1910 | break; | 2000 | break; |
1911 | } | 2001 | } |
1912 | |||
1913 | data_len += op->indata_len; | ||
1914 | } | 2002 | } |
1915 | |||
1916 | WARN_ON(data_len != msg->data_length); | ||
1917 | } | 2003 | } |
1918 | 2004 | ||
1919 | static void encode_pgid(void **p, const struct ceph_pg *pgid) | 2005 | static void encode_pgid(void **p, const struct ceph_pg *pgid) |
@@ -1961,7 +2047,7 @@ static void encode_request_partial(struct ceph_osd_request *req, | |||
1961 | req->r_data_offset || req->r_snapc); | 2047 | req->r_data_offset || req->r_snapc); |
1962 | } | 2048 | } |
1963 | 2049 | ||
1964 | setup_request_data(req, msg); | 2050 | setup_request_data(req); |
1965 | 2051 | ||
1966 | encode_spgid(&p, &req->r_t.spgid); /* actual spg */ | 2052 | encode_spgid(&p, &req->r_t.spgid); /* actual spg */ |
1967 | ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */ | 2053 | ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */ |
@@ -3001,11 +3087,21 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) | |||
3001 | struct ceph_osd_client *osdc = lreq->osdc; | 3087 | struct ceph_osd_client *osdc = lreq->osdc; |
3002 | struct ceph_osd *osd; | 3088 | struct ceph_osd *osd; |
3003 | 3089 | ||
3090 | down_write(&osdc->lock); | ||
3091 | linger_register(lreq); | ||
3092 | if (lreq->is_watch) { | ||
3093 | lreq->reg_req->r_ops[0].watch.cookie = lreq->linger_id; | ||
3094 | lreq->ping_req->r_ops[0].watch.cookie = lreq->linger_id; | ||
3095 | } else { | ||
3096 | lreq->reg_req->r_ops[0].notify.cookie = lreq->linger_id; | ||
3097 | } | ||
3098 | |||
3004 | calc_target(osdc, &lreq->t, NULL, false); | 3099 | calc_target(osdc, &lreq->t, NULL, false); |
3005 | osd = lookup_create_osd(osdc, lreq->t.osd, true); | 3100 | osd = lookup_create_osd(osdc, lreq->t.osd, true); |
3006 | link_linger(osd, lreq); | 3101 | link_linger(osd, lreq); |
3007 | 3102 | ||
3008 | send_linger(lreq); | 3103 | send_linger(lreq); |
3104 | up_write(&osdc->lock); | ||
3009 | } | 3105 | } |
3010 | 3106 | ||
3011 | static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) | 3107 | static void cancel_linger_map_check(struct ceph_osd_linger_request *lreq) |
@@ -4318,9 +4414,7 @@ static void handle_watch_notify(struct ceph_osd_client *osdc, | |||
4318 | lreq->notify_id, notify_id); | 4414 | lreq->notify_id, notify_id); |
4319 | } else if (!completion_done(&lreq->notify_finish_wait)) { | 4415 | } else if (!completion_done(&lreq->notify_finish_wait)) { |
4320 | struct ceph_msg_data *data = | 4416 | struct ceph_msg_data *data = |
4321 | list_first_entry_or_null(&msg->data, | 4417 | msg->num_data_items ? &msg->data[0] : NULL; |
4322 | struct ceph_msg_data, | ||
4323 | links); | ||
4324 | 4418 | ||
4325 | if (data) { | 4419 | if (data) { |
4326 | if (lreq->preply_pages) { | 4420 | if (lreq->preply_pages) { |
@@ -4476,6 +4570,23 @@ alloc_linger_request(struct ceph_osd_linger_request *lreq) | |||
4476 | 4570 | ||
4477 | ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); | 4571 | ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); |
4478 | ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); | 4572 | ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); |
4573 | return req; | ||
4574 | } | ||
4575 | |||
4576 | static struct ceph_osd_request * | ||
4577 | alloc_watch_request(struct ceph_osd_linger_request *lreq, u8 watch_opcode) | ||
4578 | { | ||
4579 | struct ceph_osd_request *req; | ||
4580 | |||
4581 | req = alloc_linger_request(lreq); | ||
4582 | if (!req) | ||
4583 | return NULL; | ||
4584 | |||
4585 | /* | ||
4586 | * Pass 0 for cookie because we don't know it yet, it will be | ||
4587 | * filled in by linger_submit(). | ||
4588 | */ | ||
4589 | osd_req_op_watch_init(req, 0, 0, watch_opcode); | ||
4479 | 4590 | ||
4480 | if (ceph_osdc_alloc_messages(req, GFP_NOIO)) { | 4591 | if (ceph_osdc_alloc_messages(req, GFP_NOIO)) { |
4481 | ceph_osdc_put_request(req); | 4592 | ceph_osdc_put_request(req); |
@@ -4514,27 +4625,19 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, | |||
4514 | lreq->t.flags = CEPH_OSD_FLAG_WRITE; | 4625 | lreq->t.flags = CEPH_OSD_FLAG_WRITE; |
4515 | ktime_get_real_ts64(&lreq->mtime); | 4626 | ktime_get_real_ts64(&lreq->mtime); |
4516 | 4627 | ||
4517 | lreq->reg_req = alloc_linger_request(lreq); | 4628 | lreq->reg_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_WATCH); |
4518 | if (!lreq->reg_req) { | 4629 | if (!lreq->reg_req) { |
4519 | ret = -ENOMEM; | 4630 | ret = -ENOMEM; |
4520 | goto err_put_lreq; | 4631 | goto err_put_lreq; |
4521 | } | 4632 | } |
4522 | 4633 | ||
4523 | lreq->ping_req = alloc_linger_request(lreq); | 4634 | lreq->ping_req = alloc_watch_request(lreq, CEPH_OSD_WATCH_OP_PING); |
4524 | if (!lreq->ping_req) { | 4635 | if (!lreq->ping_req) { |
4525 | ret = -ENOMEM; | 4636 | ret = -ENOMEM; |
4526 | goto err_put_lreq; | 4637 | goto err_put_lreq; |
4527 | } | 4638 | } |
4528 | 4639 | ||
4529 | down_write(&osdc->lock); | ||
4530 | linger_register(lreq); /* before osd_req_op_* */ | ||
4531 | osd_req_op_watch_init(lreq->reg_req, 0, lreq->linger_id, | ||
4532 | CEPH_OSD_WATCH_OP_WATCH); | ||
4533 | osd_req_op_watch_init(lreq->ping_req, 0, lreq->linger_id, | ||
4534 | CEPH_OSD_WATCH_OP_PING); | ||
4535 | linger_submit(lreq); | 4640 | linger_submit(lreq); |
4536 | up_write(&osdc->lock); | ||
4537 | |||
4538 | ret = linger_reg_commit_wait(lreq); | 4641 | ret = linger_reg_commit_wait(lreq); |
4539 | if (ret) { | 4642 | if (ret) { |
4540 | linger_cancel(lreq); | 4643 | linger_cancel(lreq); |
@@ -4599,11 +4702,10 @@ static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, | |||
4599 | 4702 | ||
4600 | op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); | 4703 | op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY_ACK, 0); |
4601 | 4704 | ||
4602 | pl = kmalloc(sizeof(*pl), GFP_NOIO); | 4705 | pl = ceph_pagelist_alloc(GFP_NOIO); |
4603 | if (!pl) | 4706 | if (!pl) |
4604 | return -ENOMEM; | 4707 | return -ENOMEM; |
4605 | 4708 | ||
4606 | ceph_pagelist_init(pl); | ||
4607 | ret = ceph_pagelist_encode_64(pl, notify_id); | 4709 | ret = ceph_pagelist_encode_64(pl, notify_id); |
4608 | ret |= ceph_pagelist_encode_64(pl, cookie); | 4710 | ret |= ceph_pagelist_encode_64(pl, cookie); |
4609 | if (payload) { | 4711 | if (payload) { |
@@ -4641,12 +4743,12 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, | |||
4641 | ceph_oloc_copy(&req->r_base_oloc, oloc); | 4743 | ceph_oloc_copy(&req->r_base_oloc, oloc); |
4642 | req->r_flags = CEPH_OSD_FLAG_READ; | 4744 | req->r_flags = CEPH_OSD_FLAG_READ; |
4643 | 4745 | ||
4644 | ret = ceph_osdc_alloc_messages(req, GFP_NOIO); | 4746 | ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload, |
4747 | payload_len); | ||
4645 | if (ret) | 4748 | if (ret) |
4646 | goto out_put_req; | 4749 | goto out_put_req; |
4647 | 4750 | ||
4648 | ret = osd_req_op_notify_ack_init(req, 0, notify_id, cookie, payload, | 4751 | ret = ceph_osdc_alloc_messages(req, GFP_NOIO); |
4649 | payload_len); | ||
4650 | if (ret) | 4752 | if (ret) |
4651 | goto out_put_req; | 4753 | goto out_put_req; |
4652 | 4754 | ||
@@ -4670,11 +4772,10 @@ static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, | |||
4670 | op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); | 4772 | op = _osd_req_op_init(req, which, CEPH_OSD_OP_NOTIFY, 0); |
4671 | op->notify.cookie = cookie; | 4773 | op->notify.cookie = cookie; |
4672 | 4774 | ||
4673 | pl = kmalloc(sizeof(*pl), GFP_NOIO); | 4775 | pl = ceph_pagelist_alloc(GFP_NOIO); |
4674 | if (!pl) | 4776 | if (!pl) |
4675 | return -ENOMEM; | 4777 | return -ENOMEM; |
4676 | 4778 | ||
4677 | ceph_pagelist_init(pl); | ||
4678 | ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */ | 4779 | ret = ceph_pagelist_encode_32(pl, 1); /* prot_ver */ |
4679 | ret |= ceph_pagelist_encode_32(pl, timeout); | 4780 | ret |= ceph_pagelist_encode_32(pl, timeout); |
4680 | ret |= ceph_pagelist_encode_32(pl, payload_len); | 4781 | ret |= ceph_pagelist_encode_32(pl, payload_len); |
@@ -4733,29 +4834,30 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, | |||
4733 | goto out_put_lreq; | 4834 | goto out_put_lreq; |
4734 | } | 4835 | } |
4735 | 4836 | ||
4837 | /* | ||
4838 | * Pass 0 for cookie because we don't know it yet, it will be | ||
4839 | * filled in by linger_submit(). | ||
4840 | */ | ||
4841 | ret = osd_req_op_notify_init(lreq->reg_req, 0, 0, 1, timeout, | ||
4842 | payload, payload_len); | ||
4843 | if (ret) | ||
4844 | goto out_put_lreq; | ||
4845 | |||
4736 | /* for notify_id */ | 4846 | /* for notify_id */ |
4737 | pages = ceph_alloc_page_vector(1, GFP_NOIO); | 4847 | pages = ceph_alloc_page_vector(1, GFP_NOIO); |
4738 | if (IS_ERR(pages)) { | 4848 | if (IS_ERR(pages)) { |
4739 | ret = PTR_ERR(pages); | 4849 | ret = PTR_ERR(pages); |
4740 | goto out_put_lreq; | 4850 | goto out_put_lreq; |
4741 | } | 4851 | } |
4742 | |||
4743 | down_write(&osdc->lock); | ||
4744 | linger_register(lreq); /* before osd_req_op_* */ | ||
4745 | ret = osd_req_op_notify_init(lreq->reg_req, 0, lreq->linger_id, 1, | ||
4746 | timeout, payload, payload_len); | ||
4747 | if (ret) { | ||
4748 | linger_unregister(lreq); | ||
4749 | up_write(&osdc->lock); | ||
4750 | ceph_release_page_vector(pages, 1); | ||
4751 | goto out_put_lreq; | ||
4752 | } | ||
4753 | ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify, | 4852 | ceph_osd_data_pages_init(osd_req_op_data(lreq->reg_req, 0, notify, |
4754 | response_data), | 4853 | response_data), |
4755 | pages, PAGE_SIZE, 0, false, true); | 4854 | pages, PAGE_SIZE, 0, false, true); |
4756 | linger_submit(lreq); | ||
4757 | up_write(&osdc->lock); | ||
4758 | 4855 | ||
4856 | ret = ceph_osdc_alloc_messages(lreq->reg_req, GFP_NOIO); | ||
4857 | if (ret) | ||
4858 | goto out_put_lreq; | ||
4859 | |||
4860 | linger_submit(lreq); | ||
4759 | ret = linger_reg_commit_wait(lreq); | 4861 | ret = linger_reg_commit_wait(lreq); |
4760 | if (!ret) | 4862 | if (!ret) |
4761 | ret = linger_notify_finish_wait(lreq); | 4863 | ret = linger_notify_finish_wait(lreq); |
@@ -4881,10 +4983,6 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, | |||
4881 | ceph_oloc_copy(&req->r_base_oloc, oloc); | 4983 | ceph_oloc_copy(&req->r_base_oloc, oloc); |
4882 | req->r_flags = CEPH_OSD_FLAG_READ; | 4984 | req->r_flags = CEPH_OSD_FLAG_READ; |
4883 | 4985 | ||
4884 | ret = ceph_osdc_alloc_messages(req, GFP_NOIO); | ||
4885 | if (ret) | ||
4886 | goto out_put_req; | ||
4887 | |||
4888 | pages = ceph_alloc_page_vector(1, GFP_NOIO); | 4986 | pages = ceph_alloc_page_vector(1, GFP_NOIO); |
4889 | if (IS_ERR(pages)) { | 4987 | if (IS_ERR(pages)) { |
4890 | ret = PTR_ERR(pages); | 4988 | ret = PTR_ERR(pages); |
@@ -4896,6 +4994,10 @@ int ceph_osdc_list_watchers(struct ceph_osd_client *osdc, | |||
4896 | response_data), | 4994 | response_data), |
4897 | pages, PAGE_SIZE, 0, false, true); | 4995 | pages, PAGE_SIZE, 0, false, true); |
4898 | 4996 | ||
4997 | ret = ceph_osdc_alloc_messages(req, GFP_NOIO); | ||
4998 | if (ret) | ||
4999 | goto out_put_req; | ||
5000 | |||
4899 | ceph_osdc_start_request(osdc, req, false); | 5001 | ceph_osdc_start_request(osdc, req, false); |
4900 | ret = ceph_osdc_wait_request(osdc, req); | 5002 | ret = ceph_osdc_wait_request(osdc, req); |
4901 | if (ret >= 0) { | 5003 | if (ret >= 0) { |
@@ -4958,11 +5060,7 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, | |||
4958 | ceph_oloc_copy(&req->r_base_oloc, oloc); | 5060 | ceph_oloc_copy(&req->r_base_oloc, oloc); |
4959 | req->r_flags = flags; | 5061 | req->r_flags = flags; |
4960 | 5062 | ||
4961 | ret = ceph_osdc_alloc_messages(req, GFP_NOIO); | 5063 | ret = osd_req_op_cls_init(req, 0, class, method); |
4962 | if (ret) | ||
4963 | goto out_put_req; | ||
4964 | |||
4965 | ret = osd_req_op_cls_init(req, 0, CEPH_OSD_OP_CALL, class, method); | ||
4966 | if (ret) | 5064 | if (ret) |
4967 | goto out_put_req; | 5065 | goto out_put_req; |
4968 | 5066 | ||
@@ -4973,6 +5071,10 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, | |||
4973 | osd_req_op_cls_response_data_pages(req, 0, &resp_page, | 5071 | osd_req_op_cls_response_data_pages(req, 0, &resp_page, |
4974 | *resp_len, 0, false, false); | 5072 | *resp_len, 0, false, false); |
4975 | 5073 | ||
5074 | ret = ceph_osdc_alloc_messages(req, GFP_NOIO); | ||
5075 | if (ret) | ||
5076 | goto out_put_req; | ||
5077 | |||
4976 | ceph_osdc_start_request(osdc, req, false); | 5078 | ceph_osdc_start_request(osdc, req, false); |
4977 | ret = ceph_osdc_wait_request(osdc, req); | 5079 | ret = ceph_osdc_wait_request(osdc, req); |
4978 | if (ret >= 0) { | 5080 | if (ret >= 0) { |
@@ -5021,11 +5123,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) | |||
5021 | goto out_map; | 5123 | goto out_map; |
5022 | 5124 | ||
5023 | err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, | 5125 | err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP, |
5024 | PAGE_SIZE, 10, true, "osd_op"); | 5126 | PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, "osd_op"); |
5025 | if (err < 0) | 5127 | if (err < 0) |
5026 | goto out_mempool; | 5128 | goto out_mempool; |
5027 | err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, | 5129 | err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY, |
5028 | PAGE_SIZE, 10, true, "osd_op_reply"); | 5130 | PAGE_SIZE, CEPH_OSD_SLAB_OPS, 10, |
5131 | "osd_op_reply"); | ||
5029 | if (err < 0) | 5132 | if (err < 0) |
5030 | goto out_msgpool; | 5133 | goto out_msgpool; |
5031 | 5134 | ||
@@ -5168,6 +5271,80 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, | |||
5168 | } | 5271 | } |
5169 | EXPORT_SYMBOL(ceph_osdc_writepages); | 5272 | EXPORT_SYMBOL(ceph_osdc_writepages); |
5170 | 5273 | ||
5274 | static int osd_req_op_copy_from_init(struct ceph_osd_request *req, | ||
5275 | u64 src_snapid, u64 src_version, | ||
5276 | struct ceph_object_id *src_oid, | ||
5277 | struct ceph_object_locator *src_oloc, | ||
5278 | u32 src_fadvise_flags, | ||
5279 | u32 dst_fadvise_flags, | ||
5280 | u8 copy_from_flags) | ||
5281 | { | ||
5282 | struct ceph_osd_req_op *op; | ||
5283 | struct page **pages; | ||
5284 | void *p, *end; | ||
5285 | |||
5286 | pages = ceph_alloc_page_vector(1, GFP_KERNEL); | ||
5287 | if (IS_ERR(pages)) | ||
5288 | return PTR_ERR(pages); | ||
5289 | |||
5290 | op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, dst_fadvise_flags); | ||
5291 | op->copy_from.snapid = src_snapid; | ||
5292 | op->copy_from.src_version = src_version; | ||
5293 | op->copy_from.flags = copy_from_flags; | ||
5294 | op->copy_from.src_fadvise_flags = src_fadvise_flags; | ||
5295 | |||
5296 | p = page_address(pages[0]); | ||
5297 | end = p + PAGE_SIZE; | ||
5298 | ceph_encode_string(&p, end, src_oid->name, src_oid->name_len); | ||
5299 | encode_oloc(&p, end, src_oloc); | ||
5300 | op->indata_len = PAGE_SIZE - (end - p); | ||
5301 | |||
5302 | ceph_osd_data_pages_init(&op->copy_from.osd_data, pages, | ||
5303 | op->indata_len, 0, false, true); | ||
5304 | return 0; | ||
5305 | } | ||
5306 | |||
5307 | int ceph_osdc_copy_from(struct ceph_osd_client *osdc, | ||
5308 | u64 src_snapid, u64 src_version, | ||
5309 | struct ceph_object_id *src_oid, | ||
5310 | struct ceph_object_locator *src_oloc, | ||
5311 | u32 src_fadvise_flags, | ||
5312 | struct ceph_object_id *dst_oid, | ||
5313 | struct ceph_object_locator *dst_oloc, | ||
5314 | u32 dst_fadvise_flags, | ||
5315 | u8 copy_from_flags) | ||
5316 | { | ||
5317 | struct ceph_osd_request *req; | ||
5318 | int ret; | ||
5319 | |||
5320 | req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); | ||
5321 | if (!req) | ||
5322 | return -ENOMEM; | ||
5323 | |||
5324 | req->r_flags = CEPH_OSD_FLAG_WRITE; | ||
5325 | |||
5326 | ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc); | ||
5327 | ceph_oid_copy(&req->r_t.base_oid, dst_oid); | ||
5328 | |||
5329 | ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid, | ||
5330 | src_oloc, src_fadvise_flags, | ||
5331 | dst_fadvise_flags, copy_from_flags); | ||
5332 | if (ret) | ||
5333 | goto out; | ||
5334 | |||
5335 | ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); | ||
5336 | if (ret) | ||
5337 | goto out; | ||
5338 | |||
5339 | ceph_osdc_start_request(osdc, req, false); | ||
5340 | ret = ceph_osdc_wait_request(osdc, req); | ||
5341 | |||
5342 | out: | ||
5343 | ceph_osdc_put_request(req); | ||
5344 | return ret; | ||
5345 | } | ||
5346 | EXPORT_SYMBOL(ceph_osdc_copy_from); | ||
5347 | |||
5171 | int __init ceph_osdc_setup(void) | 5348 | int __init ceph_osdc_setup(void) |
5172 | { | 5349 | { |
5173 | size_t size = sizeof(struct ceph_osd_request) + | 5350 | size_t size = sizeof(struct ceph_osd_request) + |
@@ -5295,7 +5472,7 @@ static struct ceph_msg *alloc_msg_with_page_vector(struct ceph_msg_header *hdr) | |||
5295 | u32 front_len = le32_to_cpu(hdr->front_len); | 5472 | u32 front_len = le32_to_cpu(hdr->front_len); |
5296 | u32 data_len = le32_to_cpu(hdr->data_len); | 5473 | u32 data_len = le32_to_cpu(hdr->data_len); |
5297 | 5474 | ||
5298 | m = ceph_msg_new(type, front_len, GFP_NOIO, false); | 5475 | m = ceph_msg_new2(type, front_len, 1, GFP_NOIO, false); |
5299 | if (!m) | 5476 | if (!m) |
5300 | return NULL; | 5477 | return NULL; |
5301 | 5478 | ||
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 2ea0564771d2..65e34f78b05d 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c | |||
@@ -6,6 +6,26 @@ | |||
6 | #include <linux/highmem.h> | 6 | #include <linux/highmem.h> |
7 | #include <linux/ceph/pagelist.h> | 7 | #include <linux/ceph/pagelist.h> |
8 | 8 | ||
9 | struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags) | ||
10 | { | ||
11 | struct ceph_pagelist *pl; | ||
12 | |||
13 | pl = kmalloc(sizeof(*pl), gfp_flags); | ||
14 | if (!pl) | ||
15 | return NULL; | ||
16 | |||
17 | INIT_LIST_HEAD(&pl->head); | ||
18 | pl->mapped_tail = NULL; | ||
19 | pl->length = 0; | ||
20 | pl->room = 0; | ||
21 | INIT_LIST_HEAD(&pl->free_list); | ||
22 | pl->num_pages_free = 0; | ||
23 | refcount_set(&pl->refcnt, 1); | ||
24 | |||
25 | return pl; | ||
26 | } | ||
27 | EXPORT_SYMBOL(ceph_pagelist_alloc); | ||
28 | |||
9 | static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) | 29 | static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) |
10 | { | 30 | { |
11 | if (pl->mapped_tail) { | 31 | if (pl->mapped_tail) { |