aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Ilya Dryomov <idryomov@gmail.com> 2018-02-06 13:26:35 -0500
committer Ilya Dryomov <idryomov@gmail.com> 2018-04-02 04:12:44 -0400
commit afb978884c3ec17227626eb371130a97671e5238 (patch)
tree 2857a22c333bae9a944cdca4c1c8f934c12290a7
parent e93aca0abb8b9f8fd23675dc9110b7517964657a (diff)
rbd: introduce OWN_BVECS data type
If the layout is "fancy", we need to be able to rearrange the provided bio_vecs in stripe unit chunks to make it possible for the messenger to read/write directly from/to the provided data buffer, without employing a temporary data buffer for assembling the result. Higher level bio_vec arrays are generally immutable, so this requires copying into a private array. Only the bio_vecs themselves are shuffled around, not the actual data. OWN_BVECS doesn't own any pages.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
-rw-r--r-- drivers/block/rbd.c 156
1 file changed, 149 insertions, 7 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 5fa4e1aced04..056865cfc596 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -215,6 +215,7 @@ enum obj_request_type {
215 OBJ_REQUEST_NODATA = 1, 215 OBJ_REQUEST_NODATA = 1,
216 OBJ_REQUEST_BIO, /* pointer into provided bio (list) */ 216 OBJ_REQUEST_BIO, /* pointer into provided bio (list) */
217 OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */ 217 OBJ_REQUEST_BVECS, /* pointer into provided bio_vec array */
218 OBJ_REQUEST_OWN_BVECS, /* private bio_vec array, doesn't own pages */
218}; 219};
219 220
220enum obj_operation_type { 221enum obj_operation_type {
@@ -261,6 +262,7 @@ struct rbd_obj_request {
261 struct { 262 struct {
262 struct ceph_bvec_iter bvec_pos; 263 struct ceph_bvec_iter bvec_pos;
263 u32 bvec_count; 264 u32 bvec_count;
265 u32 bvec_idx;
264 }; 266 };
265 }; 267 };
266 struct bio_vec *copyup_bvecs; 268 struct bio_vec *copyup_bvecs;
@@ -1238,7 +1240,7 @@ static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
1238 1240
1239/* 1241/*
1240 * Zero a range in @obj_req data buffer defined by a bio (list) or 1242 * Zero a range in @obj_req data buffer defined by a bio (list) or
1241 * bio_vec array. 1243 * (private) bio_vec array.
1242 * 1244 *
1243 * @off is relative to the start of the data buffer. 1245 * @off is relative to the start of the data buffer.
1244 */ 1246 */
@@ -1250,6 +1252,7 @@ static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1250 zero_bios(&obj_req->bio_pos, off, bytes); 1252 zero_bios(&obj_req->bio_pos, off, bytes);
1251 break; 1253 break;
1252 case OBJ_REQUEST_BVECS: 1254 case OBJ_REQUEST_BVECS:
1255 case OBJ_REQUEST_OWN_BVECS:
1253 zero_bvecs(&obj_req->bvec_pos, off, bytes); 1256 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1254 break; 1257 break;
1255 default: 1258 default:
@@ -1485,6 +1488,9 @@ static void rbd_obj_request_destroy(struct kref *kref)
1485 case OBJ_REQUEST_BIO: 1488 case OBJ_REQUEST_BIO:
1486 case OBJ_REQUEST_BVECS: 1489 case OBJ_REQUEST_BVECS:
1487 break; /* Nothing to do */ 1490 break; /* Nothing to do */
1491 case OBJ_REQUEST_OWN_BVECS:
1492 kfree(obj_request->bvec_pos.bvecs);
1493 break;
1488 default: 1494 default:
1489 rbd_assert(0); 1495 rbd_assert(0);
1490 } 1496 }
@@ -1679,8 +1685,10 @@ static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
1679 obj_req->ex.oe_len); 1685 obj_req->ex.oe_len);
1680 break; 1686 break;
1681 case OBJ_REQUEST_BVECS: 1687 case OBJ_REQUEST_BVECS:
1688 case OBJ_REQUEST_OWN_BVECS:
1682 rbd_assert(obj_req->bvec_pos.iter.bi_size == 1689 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
1683 obj_req->ex.oe_len); 1690 obj_req->ex.oe_len);
1691 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
1684 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which, 1692 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
1685 &obj_req->bvec_pos); 1693 &obj_req->bvec_pos);
1686 break; 1694 break;
@@ -1893,6 +1901,8 @@ struct rbd_img_fill_ctx {
1893 union rbd_img_fill_iter *pos; 1901 union rbd_img_fill_iter *pos;
1894 union rbd_img_fill_iter iter; 1902 union rbd_img_fill_iter iter;
1895 ceph_object_extent_fn_t set_pos_fn; 1903 ceph_object_extent_fn_t set_pos_fn;
1904 ceph_object_extent_fn_t count_fn;
1905 ceph_object_extent_fn_t copy_fn;
1896}; 1906};
1897 1907
1898static struct ceph_object_extent *alloc_object_extent(void *arg) 1908static struct ceph_object_extent *alloc_object_extent(void *arg)
@@ -1909,12 +1919,57 @@ static struct ceph_object_extent *alloc_object_extent(void *arg)
1909} 1919}
1910 1920
1911/* 1921/*
1922 * While su != os && sc == 1 is technically not fancy (it's the same
1923 * layout as su == os && sc == 1), we can't use the nocopy path for it
1924 * because ->set_pos_fn() should be called only once per object.
1925 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
1926 * treat su != os && sc == 1 as fancy.
1927 */
1928static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
1929{
1930 return l->stripe_unit != l->object_size;
1931}
1932
1933static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
1934 struct ceph_file_extent *img_extents,
1935 u32 num_img_extents,
1936 struct rbd_img_fill_ctx *fctx)
1937{
1938 u32 i;
1939 int ret;
1940
1941 img_req->data_type = fctx->pos_type;
1942
1943 /*
1944 * Create object requests and set each object request's starting
1945 * position in the provided bio (list) or bio_vec array.
1946 */
1947 fctx->iter = *fctx->pos;
1948 for (i = 0; i < num_img_extents; i++) {
1949 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
1950 img_extents[i].fe_off,
1951 img_extents[i].fe_len,
1952 &img_req->object_extents,
1953 alloc_object_extent, img_req,
1954 fctx->set_pos_fn, &fctx->iter);
1955 if (ret)
1956 return ret;
1957 }
1958
1959 return __rbd_img_fill_request(img_req);
1960}
1961
1962/*
1912 * Map a list of image extents to a list of object extents, create the 1963 * Map a list of image extents to a list of object extents, create the
1913 * corresponding object requests (normally each to a different object, 1964 * corresponding object requests (normally each to a different object,
1914 * but not always) and add them to @img_req. For each object request, 1965 * but not always) and add them to @img_req. For each object request,
1915 * set up its data descriptor to point to the corresponding chunk of 1966 * set up its data descriptor to point to the corresponding chunk(s) of
1916 * @fctx->pos data buffer. 1967 * @fctx->pos data buffer.
1917 * 1968 *
1969 * Because ceph_file_to_extents() will merge adjacent object extents
1970 * together, each object request's data descriptor may point to multiple
1971 * different chunks of @fctx->pos data buffer.
1972 *
1918 * @fctx->pos data buffer is assumed to be large enough. 1973 * @fctx->pos data buffer is assumed to be large enough.
1919 */ 1974 */
1920static int rbd_img_fill_request(struct rbd_img_request *img_req, 1975static int rbd_img_fill_request(struct rbd_img_request *img_req,
@@ -1922,23 +1977,56 @@ static int rbd_img_fill_request(struct rbd_img_request *img_req,
1922 u32 num_img_extents, 1977 u32 num_img_extents,
1923 struct rbd_img_fill_ctx *fctx) 1978 struct rbd_img_fill_ctx *fctx)
1924{ 1979{
1980 struct rbd_device *rbd_dev = img_req->rbd_dev;
1981 struct rbd_obj_request *obj_req;
1925 u32 i; 1982 u32 i;
1926 int ret; 1983 int ret;
1927 1984
1928 img_req->data_type = fctx->pos_type; 1985 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
1986 !rbd_layout_is_fancy(&rbd_dev->layout))
1987 return rbd_img_fill_request_nocopy(img_req, img_extents,
1988 num_img_extents, fctx);
1989
1990 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
1929 1991
1930 /* 1992 /*
1931 * Create object requests and set each object request's starting 1993 * Create object requests and determine ->bvec_count for each object
1932 * position in the provided bio (list) or bio_vec array. 1994 * request. Note that ->bvec_count sum over all object requests may
1995 * be greater than the number of bio_vecs in the provided bio (list)
1996 * or bio_vec array because when mapped, those bio_vecs can straddle
1997 * stripe unit boundaries.
1933 */ 1998 */
1934 fctx->iter = *fctx->pos; 1999 fctx->iter = *fctx->pos;
1935 for (i = 0; i < num_img_extents; i++) { 2000 for (i = 0; i < num_img_extents; i++) {
1936 ret = ceph_file_to_extents(&img_req->rbd_dev->layout, 2001 ret = ceph_file_to_extents(&rbd_dev->layout,
1937 img_extents[i].fe_off, 2002 img_extents[i].fe_off,
1938 img_extents[i].fe_len, 2003 img_extents[i].fe_len,
1939 &img_req->object_extents, 2004 &img_req->object_extents,
1940 alloc_object_extent, img_req, 2005 alloc_object_extent, img_req,
1941 fctx->set_pos_fn, &fctx->iter); 2006 fctx->count_fn, &fctx->iter);
2007 if (ret)
2008 return ret;
2009 }
2010
2011 for_each_obj_request(img_req, obj_req) {
2012 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2013 sizeof(*obj_req->bvec_pos.bvecs),
2014 GFP_NOIO);
2015 if (!obj_req->bvec_pos.bvecs)
2016 return -ENOMEM;
2017 }
2018
2019 /*
2020 * Fill in each object request's private bio_vec array, splitting and
2021 * rearranging the provided bio_vecs in stripe unit chunks as needed.
2022 */
2023 fctx->iter = *fctx->pos;
2024 for (i = 0; i < num_img_extents; i++) {
2025 ret = ceph_iterate_extents(&rbd_dev->layout,
2026 img_extents[i].fe_off,
2027 img_extents[i].fe_len,
2028 &img_req->object_extents,
2029 fctx->copy_fn, &fctx->iter);
1942 if (ret) 2030 if (ret)
1943 return ret; 2031 return ret;
1944 } 2032 }
@@ -1970,6 +2058,32 @@ static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
1970 ceph_bio_iter_advance(it, bytes); 2058 ceph_bio_iter_advance(it, bytes);
1971} 2059}
1972 2060
2061static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2062{
2063 struct rbd_obj_request *obj_req =
2064 container_of(ex, struct rbd_obj_request, ex);
2065 struct ceph_bio_iter *it = arg;
2066
2067 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2068 ceph_bio_iter_advance_step(it, bytes, ({
2069 obj_req->bvec_count++;
2070 }));
2071
2072}
2073
2074static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2075{
2076 struct rbd_obj_request *obj_req =
2077 container_of(ex, struct rbd_obj_request, ex);
2078 struct ceph_bio_iter *it = arg;
2079
2080 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2081 ceph_bio_iter_advance_step(it, bytes, ({
2082 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2083 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2084 }));
2085}
2086
1973static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req, 2087static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
1974 struct ceph_file_extent *img_extents, 2088 struct ceph_file_extent *img_extents,
1975 u32 num_img_extents, 2089 u32 num_img_extents,
@@ -1979,6 +2093,8 @@ static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
1979 .pos_type = OBJ_REQUEST_BIO, 2093 .pos_type = OBJ_REQUEST_BIO,
1980 .pos = (union rbd_img_fill_iter *)bio_pos, 2094 .pos = (union rbd_img_fill_iter *)bio_pos,
1981 .set_pos_fn = set_bio_pos, 2095 .set_pos_fn = set_bio_pos,
2096 .count_fn = count_bio_bvecs,
2097 .copy_fn = copy_bio_bvecs,
1982 }; 2098 };
1983 2099
1984 return rbd_img_fill_request(img_req, img_extents, num_img_extents, 2100 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
@@ -2005,6 +2121,29 @@ static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2005 ceph_bvec_iter_advance(it, bytes); 2121 ceph_bvec_iter_advance(it, bytes);
2006} 2122}
2007 2123
2124static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2125{
2126 struct rbd_obj_request *obj_req =
2127 container_of(ex, struct rbd_obj_request, ex);
2128 struct ceph_bvec_iter *it = arg;
2129
2130 ceph_bvec_iter_advance_step(it, bytes, ({
2131 obj_req->bvec_count++;
2132 }));
2133}
2134
2135static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2136{
2137 struct rbd_obj_request *obj_req =
2138 container_of(ex, struct rbd_obj_request, ex);
2139 struct ceph_bvec_iter *it = arg;
2140
2141 ceph_bvec_iter_advance_step(it, bytes, ({
2142 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2143 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2144 }));
2145}
2146
2008static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req, 2147static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2009 struct ceph_file_extent *img_extents, 2148 struct ceph_file_extent *img_extents,
2010 u32 num_img_extents, 2149 u32 num_img_extents,
@@ -2014,6 +2153,8 @@ static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2014 .pos_type = OBJ_REQUEST_BVECS, 2153 .pos_type = OBJ_REQUEST_BVECS,
2015 .pos = (union rbd_img_fill_iter *)bvec_pos, 2154 .pos = (union rbd_img_fill_iter *)bvec_pos,
2016 .set_pos_fn = set_bvec_pos, 2155 .set_pos_fn = set_bvec_pos,
2156 .count_fn = count_bvecs,
2157 .copy_fn = copy_bvecs,
2017 }; 2158 };
2018 2159
2019 return rbd_img_fill_request(img_req, img_extents, num_img_extents, 2160 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
@@ -2071,6 +2212,7 @@ static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
2071 &obj_req->bio_pos); 2212 &obj_req->bio_pos);
2072 break; 2213 break;
2073 case OBJ_REQUEST_BVECS: 2214 case OBJ_REQUEST_BVECS:
2215 case OBJ_REQUEST_OWN_BVECS:
2074 ret = __rbd_img_fill_from_bvecs(child_img_req, 2216 ret = __rbd_img_fill_from_bvecs(child_img_req,
2075 obj_req->img_extents, 2217 obj_req->img_extents,
2076 obj_req->num_img_extents, 2218 obj_req->num_img_extents,