diff options
author | Ilya Dryomov <idryomov@gmail.com> | 2018-02-06 13:26:35 -0500 |
---|---|---|
committer | Ilya Dryomov <idryomov@gmail.com> | 2018-04-02 04:12:44 -0400 |
commit | afb978884c3ec17227626eb371130a97671e5238 (patch) | |
tree | 2857a22c333bae9a944cdca4c1c8f934c12290a7 | |
parent | e93aca0abb8b9f8fd23675dc9110b7517964657a (diff) |
rbd: introduce OWN_BVECS data type

If the layout is "fancy", we need to be able to rearrange the provided
bio_vecs in stripe unit chunks to make it possible for the messenger to
read/write directly from/to the provided data buffer, without employing
a temporary data buffer for assembling the result.

Higher level bio_vec arrays are generally immutable, so this requires
copying into a private array. Only the bio_vecs themselves are shuffled
around, not the actual data. OWN_BVECS doesn't own any pages.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
-rw-r--r-- | drivers/block/rbd.c | 156 |
1 files changed, 149 insertions, 7 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 5fa4e1aced04..056865cfc596 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -215,6 +215,7 @@ enum obj_request_type { | |||
215 | OBJ_REQUEST_NODATA = 1, | 215 | OBJ_REQUEST_NODATA = 1, |
216 | OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ | 216 | OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ |
217 | OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */ | 217 | OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */ |
218 | OBJ_REQUEST_OWN_BVECS, /* private bio_vec array, doesn't own pages */ | ||
218 | }; | 219 | }; |
219 | 220 | ||
220 | enum obj_operation_type { | 221 | enum obj_operation_type { |
@@ -261,6 +262,7 @@ struct rbd_obj_request { | |||
261 | struct { | 262 | struct { |
262 | struct ceph_bvec_iter bvec_pos; | 263 | struct ceph_bvec_iter bvec_pos; |
263 | u32 bvec_count; | 264 | u32 bvec_count; |
265 | u32 bvec_idx; | ||
264 | }; | 266 | }; |
265 | }; | 267 | }; |
266 | struct bio_vec *copyup_bvecs; | 268 | struct bio_vec *copyup_bvecs; |
@@ -1238,7 +1240,7 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes) | |||
1238 | 1240 | ||
1239 | /* | 1241 | /* |
1240 | * Zero a range in @obj_req data buffer defined by a bio (list) or | 1242 | * Zero a range in @obj_req data buffer defined by a bio (list) or |
1241 | * bio_vec array. | 1243 | * (private) bio_vec array. |
1242 | * | 1244 | * |
1243 | * @off is relative to the start of the data buffer. | 1245 | * @off is relative to the start of the data buffer. |
1244 | */ | 1246 | */ |
@@ -1250,6 +1252,7 @@ static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off, | |||
1250 | zero_bios(&obj_req->bio_pos, off, bytes); | 1252 | zero_bios(&obj_req->bio_pos, off, bytes); |
1251 | break; | 1253 | break; |
1252 | case OBJ_REQUEST_BVECS: | 1254 | case OBJ_REQUEST_BVECS: |
1255 | case OBJ_REQUEST_OWN_BVECS: | ||
1253 | zero_bvecs(&obj_req->bvec_pos, off, bytes); | 1256 | zero_bvecs(&obj_req->bvec_pos, off, bytes); |
1254 | break; | 1257 | break; |
1255 | default: | 1258 | default: |
@@ -1485,6 +1488,9 @@ static void rbd_obj_request_destroy(struct kref *kref) | |||
1485 | case OBJ_REQUEST_BIO: | 1488 | case OBJ_REQUEST_BIO: |
1486 | case OBJ_REQUEST_BVECS: | 1489 | case OBJ_REQUEST_BVECS: |
1487 | break; /* Nothing to do */ | 1490 | break; /* Nothing to do */ |
1491 | case OBJ_REQUEST_OWN_BVECS: | ||
1492 | kfree(obj_request->bvec_pos.bvecs); | ||
1493 | break; | ||
1488 | default: | 1494 | default: |
1489 | rbd_assert(0); | 1495 | rbd_assert(0); |
1490 | } | 1496 | } |
@@ -1679,8 +1685,10 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which) | |||
1679 | obj_req->ex.oe_len); | 1685 | obj_req->ex.oe_len); |
1680 | break; | 1686 | break; |
1681 | case OBJ_REQUEST_BVECS: | 1687 | case OBJ_REQUEST_BVECS: |
1688 | case OBJ_REQUEST_OWN_BVECS: | ||
1682 | rbd_assert(obj_req->bvec_pos.iter.bi_size == | 1689 | rbd_assert(obj_req->bvec_pos.iter.bi_size == |
1683 | obj_req->ex.oe_len); | 1690 | obj_req->ex.oe_len); |
1691 | rbd_assert(obj_req->bvec_idx == obj_req->bvec_count); | ||
1684 | osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which, | 1692 | osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which, |
1685 | &obj_req->bvec_pos); | 1693 | &obj_req->bvec_pos); |
1686 | break; | 1694 | break; |
@@ -1893,6 +1901,8 @@ struct rbd_img_fill_ctx { | |||
1893 | union rbd_img_fill_iter *pos; | 1901 | union rbd_img_fill_iter *pos; |
1894 | union rbd_img_fill_iter iter; | 1902 | union rbd_img_fill_iter iter; |
1895 | ceph_object_extent_fn_t set_pos_fn; | 1903 | ceph_object_extent_fn_t set_pos_fn; |
1904 | ceph_object_extent_fn_t count_fn; | ||
1905 | ceph_object_extent_fn_t copy_fn; | ||
1896 | }; | 1906 | }; |
1897 | 1907 | ||
1898 | static struct ceph_object_extent *alloc_object_extent(void *arg) | 1908 | static struct ceph_object_extent *alloc_object_extent(void *arg) |
@@ -1909,12 +1919,57 @@ static struct ceph_object_extent *alloc_object_extent(void *arg) | |||
1909 | } | 1919 | } |
1910 | 1920 | ||
1911 | /* | 1921 | /* |
1922 | * While su != os && sc == 1 is technically not fancy (it's the same | ||
1923 | * layout as su == os && sc == 1), we can't use the nocopy path for it | ||
1924 | * because ->set_pos_fn() should be called only once per object. | ||
1925 | * ceph_file_to_extents() invokes action_fn once per stripe unit, so | ||
1926 | * treat su != os && sc == 1 as fancy. | ||
1927 | */ | ||
1928 | static bool rbd_layout_is_fancy(struct ceph_file_layout *l) | ||
1929 | { | ||
1930 | return l->stripe_unit != l->object_size; | ||
1931 | } | ||
1932 | |||
1933 | static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req, | ||
1934 | struct ceph_file_extent *img_extents, | ||
1935 | u32 num_img_extents, | ||
1936 | struct rbd_img_fill_ctx *fctx) | ||
1937 | { | ||
1938 | u32 i; | ||
1939 | int ret; | ||
1940 | |||
1941 | img_req->data_type = fctx->pos_type; | ||
1942 | |||
1943 | /* | ||
1944 | * Create object requests and set each object request's starting | ||
1945 | * position in the provided bio (list) or bio_vec array. | ||
1946 | */ | ||
1947 | fctx->iter = *fctx->pos; | ||
1948 | for (i = 0; i < num_img_extents; i++) { | ||
1949 | ret = ceph_file_to_extents(&img_req->rbd_dev->layout, | ||
1950 | img_extents[i].fe_off, | ||
1951 | img_extents[i].fe_len, | ||
1952 | &img_req->object_extents, | ||
1953 | alloc_object_extent, img_req, | ||
1954 | fctx->set_pos_fn, &fctx->iter); | ||
1955 | if (ret) | ||
1956 | return ret; | ||
1957 | } | ||
1958 | |||
1959 | return __rbd_img_fill_request(img_req); | ||
1960 | } | ||
1961 | |||
1962 | /* | ||
1912 | * Map a list of image extents to a list of object extents, create the | 1963 | * Map a list of image extents to a list of object extents, create the |
1913 | * corresponding object requests (normally each to a different object, | 1964 | * corresponding object requests (normally each to a different object, |
1914 | * but not always) and add them to @img_req. For each object request, | 1965 | * but not always) and add them to @img_req. For each object request, |
1915 | * set up its data descriptor to point to the corresponding chunk of | 1966 | * set up its data descriptor to point to the corresponding chunk(s) of |
1916 | * @fctx->pos data buffer. | 1967 | * @fctx->pos data buffer. |
1917 | * | 1968 | * |
1969 | * Because ceph_file_to_extents() will merge adjacent object extents | ||
1970 | * together, each object request's data descriptor may point to multiple | ||
1971 | * different chunks of @fctx->pos data buffer. | ||
1972 | * | ||
1918 | * @fctx->pos data buffer is assumed to be large enough. | 1973 | * @fctx->pos data buffer is assumed to be large enough. |
1919 | */ | 1974 | */ |
1920 | static int rbd_img_fill_request(struct rbd_img_request *img_req, | 1975 | static int rbd_img_fill_request(struct rbd_img_request *img_req, |
@@ -1922,23 +1977,56 @@ static int rbd_img_fill_request(struct rbd_img_request *img_req, | |||
1922 | u32 num_img_extents, | 1977 | u32 num_img_extents, |
1923 | struct rbd_img_fill_ctx *fctx) | 1978 | struct rbd_img_fill_ctx *fctx) |
1924 | { | 1979 | { |
1980 | struct rbd_device *rbd_dev = img_req->rbd_dev; | ||
1981 | struct rbd_obj_request *obj_req; | ||
1925 | u32 i; | 1982 | u32 i; |
1926 | int ret; | 1983 | int ret; |
1927 | 1984 | ||
1928 | img_req->data_type = fctx->pos_type; | 1985 | if (fctx->pos_type == OBJ_REQUEST_NODATA || |
1986 | !rbd_layout_is_fancy(&rbd_dev->layout)) | ||
1987 | return rbd_img_fill_request_nocopy(img_req, img_extents, | ||
1988 | num_img_extents, fctx); | ||
1989 | |||
1990 | img_req->data_type = OBJ_REQUEST_OWN_BVECS; | ||
1929 | 1991 | ||
1930 | /* | 1992 | /* |
1931 | * Create object requests and set each object request's starting | 1993 | * Create object requests and determine ->bvec_count for each object |
1932 | * position in the provided bio (list) or bio_vec array. | 1994 | * request. Note that ->bvec_count sum over all object requests may |
1995 | * be greater than the number of bio_vecs in the provided bio (list) | ||
1996 | * or bio_vec array because when mapped, those bio_vecs can straddle | ||
1997 | * stripe unit boundaries. | ||
1933 | */ | 1998 | */ |
1934 | fctx->iter = *fctx->pos; | 1999 | fctx->iter = *fctx->pos; |
1935 | for (i = 0; i < num_img_extents; i++) { | 2000 | for (i = 0; i < num_img_extents; i++) { |
1936 | ret = ceph_file_to_extents(&img_req->rbd_dev->layout, | 2001 | ret = ceph_file_to_extents(&rbd_dev->layout, |
1937 | img_extents[i].fe_off, | 2002 | img_extents[i].fe_off, |
1938 | img_extents[i].fe_len, | 2003 | img_extents[i].fe_len, |
1939 | &img_req->object_extents, | 2004 | &img_req->object_extents, |
1940 | alloc_object_extent, img_req, | 2005 | alloc_object_extent, img_req, |
1941 | fctx->set_pos_fn, &fctx->iter); | 2006 | fctx->count_fn, &fctx->iter); |
2007 | if (ret) | ||
2008 | return ret; | ||
2009 | } | ||
2010 | |||
2011 | for_each_obj_request(img_req, obj_req) { | ||
2012 | obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count, | ||
2013 | sizeof(*obj_req->bvec_pos.bvecs), | ||
2014 | GFP_NOIO); | ||
2015 | if (!obj_req->bvec_pos.bvecs) | ||
2016 | return -ENOMEM; | ||
2017 | } | ||
2018 | |||
2019 | /* | ||
2020 | * Fill in each object request's private bio_vec array, splitting and | ||
2021 | * rearranging the provided bio_vecs in stripe unit chunks as needed. | ||
2022 | */ | ||
2023 | fctx->iter = *fctx->pos; | ||
2024 | for (i = 0; i < num_img_extents; i++) { | ||
2025 | ret = ceph_iterate_extents(&rbd_dev->layout, | ||
2026 | img_extents[i].fe_off, | ||
2027 | img_extents[i].fe_len, | ||
2028 | &img_req->object_extents, | ||
2029 | fctx->copy_fn, &fctx->iter); | ||
1942 | if (ret) | 2030 | if (ret) |
1943 | return ret; | 2031 | return ret; |
1944 | } | 2032 | } |
@@ -1970,6 +2058,32 @@ static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) | |||
1970 | ceph_bio_iter_advance(it, bytes); | 2058 | ceph_bio_iter_advance(it, bytes); |
1971 | } | 2059 | } |
1972 | 2060 | ||
2061 | static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) | ||
2062 | { | ||
2063 | struct rbd_obj_request *obj_req = | ||
2064 | container_of(ex, struct rbd_obj_request, ex); | ||
2065 | struct ceph_bio_iter *it = arg; | ||
2066 | |||
2067 | dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); | ||
2068 | ceph_bio_iter_advance_step(it, bytes, ({ | ||
2069 | obj_req->bvec_count++; | ||
2070 | })); | ||
2071 | |||
2072 | } | ||
2073 | |||
2074 | static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) | ||
2075 | { | ||
2076 | struct rbd_obj_request *obj_req = | ||
2077 | container_of(ex, struct rbd_obj_request, ex); | ||
2078 | struct ceph_bio_iter *it = arg; | ||
2079 | |||
2080 | dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes); | ||
2081 | ceph_bio_iter_advance_step(it, bytes, ({ | ||
2082 | obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; | ||
2083 | obj_req->bvec_pos.iter.bi_size += bv.bv_len; | ||
2084 | })); | ||
2085 | } | ||
2086 | |||
1973 | static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, | 2087 | static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, |
1974 | struct ceph_file_extent *img_extents, | 2088 | struct ceph_file_extent *img_extents, |
1975 | u32 num_img_extents, | 2089 | u32 num_img_extents, |
@@ -1979,6 +2093,8 @@ static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, | |||
1979 | .pos_type = OBJ_REQUEST_BIO, | 2093 | .pos_type = OBJ_REQUEST_BIO, |
1980 | .pos = (union rbd_img_fill_iter *)bio_pos, | 2094 | .pos = (union rbd_img_fill_iter *)bio_pos, |
1981 | .set_pos_fn = set_bio_pos, | 2095 | .set_pos_fn = set_bio_pos, |
2096 | .count_fn = count_bio_bvecs, | ||
2097 | .copy_fn = copy_bio_bvecs, | ||
1982 | }; | 2098 | }; |
1983 | 2099 | ||
1984 | return rbd_img_fill_request(img_req, img_extents, num_img_extents, | 2100 | return rbd_img_fill_request(img_req, img_extents, num_img_extents, |
@@ -2005,6 +2121,29 @@ static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg) | |||
2005 | ceph_bvec_iter_advance(it, bytes); | 2121 | ceph_bvec_iter_advance(it, bytes); |
2006 | } | 2122 | } |
2007 | 2123 | ||
2124 | static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) | ||
2125 | { | ||
2126 | struct rbd_obj_request *obj_req = | ||
2127 | container_of(ex, struct rbd_obj_request, ex); | ||
2128 | struct ceph_bvec_iter *it = arg; | ||
2129 | |||
2130 | ceph_bvec_iter_advance_step(it, bytes, ({ | ||
2131 | obj_req->bvec_count++; | ||
2132 | })); | ||
2133 | } | ||
2134 | |||
2135 | static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg) | ||
2136 | { | ||
2137 | struct rbd_obj_request *obj_req = | ||
2138 | container_of(ex, struct rbd_obj_request, ex); | ||
2139 | struct ceph_bvec_iter *it = arg; | ||
2140 | |||
2141 | ceph_bvec_iter_advance_step(it, bytes, ({ | ||
2142 | obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv; | ||
2143 | obj_req->bvec_pos.iter.bi_size += bv.bv_len; | ||
2144 | })); | ||
2145 | } | ||
2146 | |||
2008 | static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, | 2147 | static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, |
2009 | struct ceph_file_extent *img_extents, | 2148 | struct ceph_file_extent *img_extents, |
2010 | u32 num_img_extents, | 2149 | u32 num_img_extents, |
@@ -2014,6 +2153,8 @@ static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, | |||
2014 | .pos_type = OBJ_REQUEST_BVECS, | 2153 | .pos_type = OBJ_REQUEST_BVECS, |
2015 | .pos = (union rbd_img_fill_iter *)bvec_pos, | 2154 | .pos = (union rbd_img_fill_iter *)bvec_pos, |
2016 | .set_pos_fn = set_bvec_pos, | 2155 | .set_pos_fn = set_bvec_pos, |
2156 | .count_fn = count_bvecs, | ||
2157 | .copy_fn = copy_bvecs, | ||
2017 | }; | 2158 | }; |
2018 | 2159 | ||
2019 | return rbd_img_fill_request(img_req, img_extents, num_img_extents, | 2160 | return rbd_img_fill_request(img_req, img_extents, num_img_extents, |
@@ -2071,6 +2212,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req) | |||
2071 | &obj_req->bio_pos); | 2212 | &obj_req->bio_pos); |
2072 | break; | 2213 | break; |
2073 | case OBJ_REQUEST_BVECS: | 2214 | case OBJ_REQUEST_BVECS: |
2215 | case OBJ_REQUEST_OWN_BVECS: | ||
2074 | ret = __rbd_img_fill_from_bvecs(child_img_req, | 2216 | ret = __rbd_img_fill_from_bvecs(child_img_req, |
2075 | obj_req->img_extents, | 2217 | obj_req->img_extents, |
2076 | obj_req->num_img_extents, | 2218 | obj_req->num_img_extents, |