From 7dd440c9e0711d828442c3e129ab8bcb9aeeac23 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 11 Sep 2014 18:49:18 +0400 Subject: rbd: do not return -ERANGE on auth failures Trying to map an image out of a pool for which we don't have an 'x' permission bit fails with -ERANGE from ceph_extract_encoded_string() due to an unsigned vs signed bug. Fix it and get rid of the -EINVAL sink, thus propagating rbd::get_id cls method errors. (I've seen a bunch of unexplained -ERANGE reports, I bet this is it). Signed-off-by: Ilya Dryomov Reviewed-by: Alex Elder --- drivers/block/rbd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4b97baf8afa3..ce457db5d847 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4924,7 +4924,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = image_id ? 0 : -ENOMEM; if (!ret) rbd_dev->image_format = 1; - } else if (ret > sizeof (__le32)) { + } else if (ret >= 0) { void *p = response; image_id = ceph_extract_encoded_string(&p, p + ret, @@ -4932,8 +4932,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev) ret = PTR_ERR_OR_ZERO(image_id); if (!ret) rbd_dev->image_format = 2; - } else { - ret = -EINVAL; } if (!ret) { -- cgit v1.2.2 From 4e752f0ab0e8114f4edd7574081dc625d679dd15 Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Tue, 8 Apr 2014 11:12:11 -0700 Subject: rbd: access snapshot context and mapping size safely These fields may both change while the image is mapped if a snapshot is created or deleted or the image is resized. They are guarded by rbd_dev->header_rwsem, so hold that while reading them, and store a local copy to refer to outside of the critical section. The local copy will stay consistent since the snapshot context is reference counted, and the mapping size is just a u64. This prevents torn loads from giving us inconsistent values. 
Move reading header.snapc into the caller of rbd_img_request_create() so that we only need to take the semaphore once. The read-only caller, rbd_parent_request_create() can just pass NULL for snapc, since the snapshot context is only relevant for writes. Signed-off-by: Josh Durgin --- drivers/block/rbd.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ce457db5d847..eea44ce2d537 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2057,7 +2057,8 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) static struct rbd_img_request *rbd_img_request_create( struct rbd_device *rbd_dev, u64 offset, u64 length, - bool write_request) + bool write_request, + struct ceph_snap_context *snapc) { struct rbd_img_request *img_request; @@ -2065,12 +2066,6 @@ static struct rbd_img_request *rbd_img_request_create( if (!img_request) return NULL; - if (write_request) { - down_read(&rbd_dev->header_rwsem); - ceph_get_snap_context(rbd_dev->header.snapc); - up_read(&rbd_dev->header_rwsem); - } - img_request->rq = NULL; img_request->rbd_dev = rbd_dev; img_request->offset = offset; @@ -2078,7 +2073,7 @@ static struct rbd_img_request *rbd_img_request_create( img_request->flags = 0; if (write_request) { img_request_write_set(img_request); - img_request->snapc = rbd_dev->header.snapc; + img_request->snapc = snapc; } else { img_request->snap_id = rbd_dev->spec->snap_id; } @@ -2134,8 +2129,8 @@ static struct rbd_img_request *rbd_parent_request_create( rbd_assert(obj_request->img_request); rbd_dev = obj_request->img_request->rbd_dev; - parent_request = rbd_img_request_create(rbd_dev->parent, - img_offset, length, false); + parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, + length, false, NULL); if (!parent_request) return NULL; @@ -3183,9 +3178,11 @@ out: static void rbd_handle_request(struct rbd_device *rbd_dev, struct request 
*rq) { struct rbd_img_request *img_request; + struct ceph_snap_context *snapc = NULL; u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; u64 length = blk_rq_bytes(rq); bool wr = rq_data_dir(rq) == WRITE; + u64 mapping_size; int result; /* Ignore/skip any zero-length requests */ @@ -3226,14 +3223,23 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) goto err_rq; /* Shouldn't happen */ } - if (offset + length > rbd_dev->mapping.size) { + down_read(&rbd_dev->header_rwsem); + mapping_size = rbd_dev->mapping.size; + if (wr) { + snapc = rbd_dev->header.snapc; + ceph_get_snap_context(snapc); + } + up_read(&rbd_dev->header_rwsem); + + if (offset + length > mapping_size) { rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, - length, rbd_dev->mapping.size); + length, mapping_size); result = -EIO; goto err_rq; } - img_request = rbd_img_request_create(rbd_dev, offset, length, wr); + img_request = rbd_img_request_create(rbd_dev, offset, length, wr, + snapc); if (!img_request) { result = -ENOMEM; goto err_rq; @@ -3256,6 +3262,8 @@ err_rq: if (result) rbd_warn(rbd_dev, "%s %llx at %llx result %d", wr ? "write" : "read", length, offset, result); + if (snapc) + ceph_put_snap_context(snapc); blk_end_request_all(rq, result); } -- cgit v1.2.2 From 70d045f660c7331bce8c9377929b52a9738a12cb Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 12 Sep 2014 16:02:01 +0400 Subject: rbd: add img_obj_request_simple() helper To clarify the conditions and make it easier to add new ones. 
Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index eea44ce2d537..6dae6586a8a9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2743,11 +2743,10 @@ out: return ret; } -static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) +static bool img_obj_request_simple(struct rbd_obj_request *obj_request) { struct rbd_img_request *img_request; struct rbd_device *rbd_dev; - bool known; rbd_assert(obj_request_img_data_test(obj_request)); @@ -2755,22 +2754,35 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) rbd_assert(img_request); rbd_dev = img_request->rbd_dev; + /* Reads */ + if (!img_request_write_test(img_request)) + return true; + + /* Non-layered writes */ + if (!img_request_layered_test(img_request)) + return true; + /* - * Only writes to layered images need special handling. - * Reads and non-layered writes are simple object requests. - * Layered writes that start beyond the end of the overlap - * with the parent have no parent data, so they too are - * simple object requests. Finally, if the target object is - * known to already exist, its parent data has already been - * copied, so a write to the object can also be handled as a - * simple object request. + * Layered writes outside of the parent overlap range don't + * share any data with the parent. */ - if (!img_request_write_test(img_request) || - !img_request_layered_test(img_request) || - !obj_request_overlaps_parent(obj_request) || - ((known = obj_request_known_test(obj_request)) && - obj_request_exists_test(obj_request))) { + if (!obj_request_overlaps_parent(obj_request)) + return true; + /* + * If the object is known to already exist, its parent data has + * already been copied. 
+ */ + if (obj_request_known_test(obj_request) && + obj_request_exists_test(obj_request)) + return true; + + return false; +} + +static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) +{ + if (img_obj_request_simple(obj_request)) { struct rbd_device *rbd_dev; struct ceph_osd_client *osdc; @@ -2786,7 +2798,7 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) * start by reading the data for the full target object from * the parent so we can use it for a copyup to the target. */ - if (known) + if (obj_request_known_test(obj_request)) return rbd_img_obj_parent_read_full(obj_request); /* We don't know whether the target exists. Go find out. */ -- cgit v1.2.2 From c622d226155b12276ae3d29d546f4b314d7cd68c Mon Sep 17 00:00:00 2001 From: Guangliang Zhao Date: Tue, 1 Apr 2014 22:22:15 +0800 Subject: rbd: skip the copyup when an entire object writing A layered write normally needs to copy up the parent's content first, but a write that covers an entire object would overwrite that content anyway, so skip the copyup in that case. Signed-off-by: Guangliang Zhao Reviewed-by: Josh Durgin Reviewed-by: Alex Elder --- drivers/block/rbd.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6dae6586a8a9..16eb247cb5fb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2769,6 +2769,14 @@ static bool img_obj_request_simple(struct rbd_obj_request *obj_request) if (!obj_request_overlaps_parent(obj_request)) return true; + /* + * Entire-object layered writes - we will overwrite whatever + * parent data there is anyway. + */ + if (!obj_request->offset && + obj_request->length == rbd_obj_bytes(&rbd_dev->header)) + return true; + /* * If the object is known to already exist, its parent data has * already been copied.
-- cgit v1.2.2 From 6d2940c881aeb9f46baac548dc4e906a53957dba Mon Sep 17 00:00:00 2001 From: Guangliang Zhao Date: Thu, 13 Mar 2014 11:21:35 +0800 Subject: rbd: extend the operation type The driver can currently handle only read and write operations; extend the operation type for the coming discard support. Signed-off-by: Guangliang Zhao Reviewed-by: Josh Durgin Reviewed-by: Alex Elder --- drivers/block/rbd.c | 97 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 34 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 16eb247cb5fb..d68c937d0a12 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -210,6 +210,11 @@ enum obj_request_type { OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES }; +enum obj_operation_type { + OBJ_OP_WRITE, + OBJ_OP_READ, +}; + enum obj_req_flags { OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ @@ -785,6 +790,18 @@ static int parse_rbd_opts_token(char *c, void *private) return 0; } +static char* obj_op_name(enum obj_operation_type op_type) +{ + switch (op_type) { + case OBJ_OP_READ: + return "read"; + case OBJ_OP_WRITE: + return "write"; + default: + return "???"; + } +} + /* * Get a ceph client with specific addr and configuration, if one does * not exist create it.
Either way, ceph_opts is consumed by this @@ -1823,7 +1840,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) */ static struct ceph_osd_request *rbd_osd_req_create( struct rbd_device *rbd_dev, - bool write_request, + enum obj_operation_type op_type, unsigned int num_ops, struct rbd_obj_request *obj_request) { @@ -1831,16 +1848,14 @@ static struct ceph_osd_request *rbd_osd_req_create( struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; - if (obj_request_img_data_test(obj_request)) { + if (obj_request_img_data_test(obj_request) && op_type == OBJ_OP_WRITE) { struct rbd_img_request *img_request = obj_request->img_request; - rbd_assert(write_request == - img_request_write_test(img_request)); - if (write_request) - snapc = img_request->snapc; + rbd_assert(img_request_write_test(img_request)); + snapc = img_request->snapc; } - rbd_assert(num_ops == 1 || (write_request && num_ops == 2)); + rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2)); /* Allocate and initialize the request, for the num_ops ops */ @@ -1850,7 +1865,7 @@ static struct ceph_osd_request *rbd_osd_req_create( if (!osd_req) return NULL; /* ENOMEM */ - if (write_request) + if (op_type == OBJ_OP_WRITE) osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; else osd_req->r_flags = CEPH_OSD_FLAG_READ; @@ -2057,7 +2072,7 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) static struct rbd_img_request *rbd_img_request_create( struct rbd_device *rbd_dev, u64 offset, u64 length, - bool write_request, + enum obj_operation_type op_type, struct ceph_snap_context *snapc) { struct rbd_img_request *img_request; @@ -2071,7 +2086,7 @@ static struct rbd_img_request *rbd_img_request_create( img_request->offset = offset; img_request->length = length; img_request->flags = 0; - if (write_request) { + if (op_type == OBJ_OP_WRITE) { img_request_write_set(img_request); img_request->snapc = snapc; } else { @@ -2088,8 +2103,7 @@ static struct 
rbd_img_request *rbd_img_request_create( kref_init(&img_request->kref); dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, - write_request ? "write" : "read", offset, length, - img_request); + obj_op_name(op_type), offset, length, img_request); return img_request; } @@ -2130,7 +2144,7 @@ static struct rbd_img_request *rbd_parent_request_create( rbd_dev = obj_request->img_request->rbd_dev; parent_request = rbd_img_request_create(rbd_dev->parent, img_offset, - length, false, NULL); + length, OBJ_OP_READ, NULL); if (!parent_request) return NULL; @@ -2171,11 +2185,14 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) result = obj_request->result; if (result) { struct rbd_device *rbd_dev = img_request->rbd_dev; + enum obj_operation_type op_type; + + op_type = img_request_write_test(img_request) ? OBJ_OP_WRITE : + OBJ_OP_READ; rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", - img_request_write_test(img_request) ? "write" : "read", - obj_request->length, obj_request->img_offset, - obj_request->offset); + obj_op_name(op_type), obj_request->length, + obj_request->img_offset, obj_request->offset); rbd_warn(rbd_dev, " result %d xferred %x", result, xferred); if (!img_request->result) @@ -2254,10 +2271,10 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, struct rbd_device *rbd_dev = img_request->rbd_dev; struct rbd_obj_request *obj_request = NULL; struct rbd_obj_request *next_obj_request; - bool write_request = img_request_write_test(img_request); struct bio *bio_list = NULL; unsigned int bio_offset = 0; struct page **pages = NULL; + enum obj_operation_type op_type; u64 img_offset; u64 resid; u16 opcode; @@ -2265,7 +2282,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, dout("%s: img %p type %d data_desc %p\n", __func__, img_request, (int)type, data_desc); - opcode = write_request ? 
CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ; img_offset = img_request->offset; resid = img_request->length; rbd_assert(resid > 0); @@ -2327,16 +2343,24 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, pages += page_count; } - osd_req = rbd_osd_req_create(rbd_dev, write_request, - (write_request ? 2 : 1), - obj_request); + if (img_request_write_test(img_request)) { + op_type = OBJ_OP_WRITE; + opcode = CEPH_OSD_OP_WRITE; + } else { + op_type = OBJ_OP_READ; + opcode = CEPH_OSD_OP_READ; + } + + osd_req = rbd_osd_req_create(rbd_dev, op_type, + (op_type == OBJ_OP_WRITE) ? 2 : 1, + obj_request); if (!osd_req) goto out_unwind; obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; rbd_img_request_get(img_request); - if (write_request) { + if (op_type == OBJ_OP_WRITE) { osd_req_op_alloc_hint_init(osd_req, which, rbd_obj_bytes(&rbd_dev->header), rbd_obj_bytes(&rbd_dev->header)); @@ -2353,7 +2377,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, obj_request->pages, length, offset & ~PAGE_MASK, false, false); - if (write_request) + if (op_type == OBJ_OP_WRITE) rbd_osd_req_format_write(obj_request); else rbd_osd_req_format_read(obj_request); @@ -2723,7 +2747,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request) rbd_assert(obj_request->img_request); rbd_dev = obj_request->img_request->rbd_dev; - stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, stat_request); if (!stat_request->osd_req) goto out; @@ -2947,7 +2971,7 @@ static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id) return -ENOMEM; ret = -ENOMEM; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, obj_request); if (!obj_request->osd_req) goto out; @@ -3010,7 +3034,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper( if (!obj_request) return 
ERR_PTR(-ENOMEM); - obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, + obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1, obj_request); if (!obj_request->osd_req) { ret = -ENOMEM; @@ -3148,7 +3172,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, obj_request); if (!obj_request->osd_req) goto out; @@ -3201,10 +3225,15 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) struct ceph_snap_context *snapc = NULL; u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; u64 length = blk_rq_bytes(rq); - bool wr = rq_data_dir(rq) == WRITE; + enum obj_operation_type op_type; u64 mapping_size; int result; + if (rq->cmd_flags & REQ_WRITE) + op_type = OBJ_OP_WRITE; + else + op_type = OBJ_OP_READ; + /* Ignore/skip any zero-length requests */ if (!length) { @@ -3213,9 +3242,9 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) goto err_rq; } - /* Disallow writes to a read-only device */ + /* Only reads are allowed to a read-only device */ - if (wr) { + if (op_type != OBJ_OP_READ) { if (rbd_dev->mapping.read_only) { result = -EROFS; goto err_rq; @@ -3245,7 +3274,7 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) down_read(&rbd_dev->header_rwsem); mapping_size = rbd_dev->mapping.size; - if (wr) { + if (op_type != OBJ_OP_READ) { snapc = rbd_dev->header.snapc; ceph_get_snap_context(snapc); } @@ -3258,7 +3287,7 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) goto err_rq; } - img_request = rbd_img_request_create(rbd_dev, offset, length, wr, + img_request = rbd_img_request_create(rbd_dev, offset, length, op_type, snapc); if (!img_request) { result = -ENOMEM; @@ -3281,7 +3310,7 @@ err_img_request: err_rq: if (result) rbd_warn(rbd_dev,
"%s %llx at %llx result %d", - wr ? "write" : "read", length, offset, result); + obj_op_name(op_type), length, offset, result); if (snapc) ceph_put_snap_context(snapc); blk_end_request_all(rq, result); @@ -3421,7 +3450,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev, obj_request->pages = pages; obj_request->page_count = page_count; - obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, + obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, obj_request); if (!obj_request->osd_req) goto out; -- cgit v1.2.2 From 90e98c5229c0adfadf2c2ad2c91d72902bf61bc4 Mon Sep 17 00:00:00 2001 From: Guangliang Zhao Date: Tue, 1 Apr 2014 22:22:16 +0800 Subject: rbd: initial discard bits from Guangliang Zhao This patch adds discard support to the rbd driver. There are three types of operation in the driver: 1. Objects are removed if they are completely contained within the discard range. 2. Objects are truncated if they are partly contained within the discard range and are aligned with their boundary. 3. Others are zeroed. A discard request from blkdev_issue_discard() is defined as having both REQ_WRITE and REQ_DISCARD set and carrying no data, so we must check REQ_DISCARD first when getting the request type. This resolves: http://tracker.ceph.com/issues/190 [ Ilya Dryomov: This is incomplete and somewhat buggy, see follow up commits by Josh Durgin for refinements and fixes which weren't folded in to preserve authorship.
] Signed-off-by: Guangliang Zhao Reviewed-by: Josh Durgin Reviewed-by: Alex Elder --- drivers/block/rbd.c | 104 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 89 insertions(+), 15 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d68c937d0a12..e2f7a708e20d 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -213,6 +213,7 @@ enum obj_request_type { enum obj_operation_type { OBJ_OP_WRITE, OBJ_OP_READ, + OBJ_OP_DISCARD, }; enum obj_req_flags { @@ -281,6 +282,7 @@ enum img_req_flags { IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ + IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */ }; struct rbd_img_request { @@ -797,6 +799,8 @@ static char* obj_op_name(enum obj_operation_type op_type) return "read"; case OBJ_OP_WRITE: return "write"; + case OBJ_OP_DISCARD: + return "discard"; default: return "???"; } @@ -1617,6 +1621,21 @@ static bool img_request_write_test(struct rbd_img_request *img_request) return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; } +/* + * Set the discard flag when the img_request is an discard request + */ +static void img_request_discard_set(struct rbd_img_request *img_request) +{ + set_bit(IMG_REQ_DISCARD, &img_request->flags); + smp_mb(); +} + +static bool img_request_discard_test(struct rbd_img_request *img_request) +{ + smp_mb(); + return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0; +} + static void img_request_child_set(struct rbd_img_request *img_request) { set_bit(IMG_REQ_CHILD, &img_request->flags); @@ -1739,6 +1758,18 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request) obj_request_done_set(obj_request); } +static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) +{ + dout("%s: obj %p result %d %llu\n", __func__, obj_request, + obj_request->result, 
obj_request->length); + /* + * There is no such thing as a successful short discard. Set + * it to our originally-requested length. + */ + obj_request->xferred = obj_request->length; + obj_request_done_set(obj_request); +} + /* * For a simple stat call there's nothing to do. We'll do more if * this is part of a write sequence for a layered image. @@ -1790,6 +1821,11 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, case CEPH_OSD_OP_STAT: rbd_osd_stat_callback(obj_request); break; + case CEPH_OSD_OP_DELETE: + case CEPH_OSD_OP_TRUNCATE: + case CEPH_OSD_OP_ZERO: + rbd_osd_discard_callback(obj_request); + break; case CEPH_OSD_OP_CALL: case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_WATCH: @@ -1848,10 +1884,14 @@ static struct ceph_osd_request *rbd_osd_req_create( struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; - if (obj_request_img_data_test(obj_request) && op_type == OBJ_OP_WRITE) { + if (obj_request_img_data_test(obj_request) && + (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) { struct rbd_img_request *img_request = obj_request->img_request; - - rbd_assert(img_request_write_test(img_request)); + if (op_type == OBJ_OP_WRITE) { + rbd_assert(img_request_write_test(img_request)); + } else { + rbd_assert(img_request_discard_test(img_request)); + } snapc = img_request->snapc; } @@ -1865,7 +1905,7 @@ static struct ceph_osd_request *rbd_osd_req_create( if (!osd_req) return NULL; /* ENOMEM */ - if (op_type == OBJ_OP_WRITE) + if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; else osd_req->r_flags = CEPH_OSD_FLAG_READ; @@ -2086,7 +2126,10 @@ static struct rbd_img_request *rbd_img_request_create( img_request->offset = offset; img_request->length = length; img_request->flags = 0; - if (op_type == OBJ_OP_WRITE) { + if (op_type == OBJ_OP_DISCARD) { + img_request_discard_set(img_request); + img_request->snapc = snapc; + } else if (op_type == OBJ_OP_WRITE) { 
img_request_write_set(img_request); img_request->snapc = snapc; } else { @@ -2187,8 +2230,12 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request) struct rbd_device *rbd_dev = img_request->rbd_dev; enum obj_operation_type op_type; - op_type = img_request_write_test(img_request) ? OBJ_OP_WRITE : - OBJ_OP_READ; + if (img_request_discard_test(img_request)) + op_type = OBJ_OP_DISCARD; + else if (img_request_write_test(img_request)) + op_type = OBJ_OP_WRITE; + else + op_type = OBJ_OP_READ; rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", obj_op_name(op_type), obj_request->length, @@ -2275,7 +2322,9 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, unsigned int bio_offset = 0; struct page **pages = NULL; enum obj_operation_type op_type; + u64 object_size = rbd_obj_bytes(&rbd_dev->header); u64 img_offset; + u64 img_end; u64 resid; u16 opcode; @@ -2283,6 +2332,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, (int)type, data_desc); img_offset = img_request->offset; + img_end = rbd_dev->header.image_size; resid = img_request->length; rbd_assert(resid > 0); @@ -2290,8 +2340,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, bio_list = data_desc; rbd_assert(img_offset == bio_list->bi_iter.bi_sector << SECTOR_SHIFT); - } else { - rbd_assert(type == OBJ_REQUEST_PAGES); + } else if (type == OBJ_REQUEST_PAGES) { pages = data_desc; } @@ -2332,7 +2381,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, GFP_ATOMIC); if (!obj_request->bio_list) goto out_unwind; - } else { + } else if (type == OBJ_REQUEST_PAGES) { unsigned int page_count; obj_request->pages = pages; @@ -2343,7 +2392,19 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, pages += page_count; } - if (img_request_write_test(img_request)) { + if (img_request_discard_test(img_request)) { + op_type = OBJ_OP_DISCARD; + if (!offset && (length == object_size) + && 
(!img_request_layered_test(img_request) || + (rbd_dev->parent_overlap <= + obj_request->img_offset))) + opcode = CEPH_OSD_OP_DELETE; + else if ((offset + length == object_size) || + (obj_request->img_offset + length == img_end)) + opcode = CEPH_OSD_OP_TRUNCATE; + else + opcode = CEPH_OSD_OP_ZERO; + } else if (img_request_write_test(img_request)) { op_type = OBJ_OP_WRITE; opcode = CEPH_OSD_OP_WRITE; } else { @@ -2372,12 +2433,13 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, if (type == OBJ_REQUEST_BIO) osd_req_op_extent_osd_data_bio(osd_req, which, obj_request->bio_list, length); - else + else if (type == OBJ_REQUEST_PAGES) osd_req_op_extent_osd_data_pages(osd_req, which, obj_request->pages, length, offset & ~PAGE_MASK, false, false); - if (op_type == OBJ_OP_WRITE) + /* Discards are also writes */ + if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) rbd_osd_req_format_write(obj_request); else rbd_osd_req_format_read(obj_request); @@ -3229,7 +3291,9 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) u64 mapping_size; int result; - if (rq->cmd_flags & REQ_WRITE) + if (rq->cmd_flags & REQ_DISCARD) + op_type = OBJ_OP_DISCARD; + else if (rq->cmd_flags & REQ_WRITE) op_type = OBJ_OP_WRITE; else op_type = OBJ_OP_READ; @@ -3295,7 +3359,12 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) } img_request->rq = rq; - result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio); + if (op_type == OBJ_OP_DISCARD) + result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA, + NULL); + else + result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, + rq->bio); if (result) goto err_img_request; @@ -3667,6 +3736,11 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) blk_queue_io_min(q, segment_size); blk_queue_io_opt(q, segment_size); + /* enable the discard support */ + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + q->limits.discard_granularity = segment_size; + 
q->limits.discard_alignment = segment_size; + blk_queue_merge_bvec(q, rbd_merge_bvec); disk->queue = q; -- cgit v1.2.2 From 3c5df89367761d09d76454a2c4301a73bf2d46ce Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Fri, 4 Apr 2014 12:06:32 -0700 Subject: rbd: read image size for discard check safely In rbd_img_request_fill() the image size is only checked to determine whether we can truncate an object instead of zeroing it for discard requests. Take rbd_dev->header_rwsem while reading the image size, and move this read into the discard check, so that non-discard ops don't need to take the semaphore in this function. Signed-off-by: Josh Durgin --- drivers/block/rbd.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e2f7a708e20d..31ace3dd33e4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2332,7 +2332,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, (int)type, data_desc); img_offset = img_request->offset; - img_end = rbd_dev->header.image_size; resid = img_request->length; rbd_assert(resid > 0); @@ -2397,13 +2396,20 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, if (!offset && (length == object_size) && (!img_request_layered_test(img_request) || (rbd_dev->parent_overlap <= - obj_request->img_offset))) + obj_request->img_offset))) { opcode = CEPH_OSD_OP_DELETE; - else if ((offset + length == object_size) || - (obj_request->img_offset + length == img_end)) + } else if ((offset + length == object_size)) { opcode = CEPH_OSD_OP_TRUNCATE; - else - opcode = CEPH_OSD_OP_ZERO; + } else { + down_read(&rbd_dev->header_rwsem); + img_end = rbd_dev->header.image_size; + up_read(&rbd_dev->header_rwsem); + + if (obj_request->img_offset + length == img_end) + opcode = CEPH_OSD_OP_TRUNCATE; + else + opcode = CEPH_OSD_OP_ZERO; + } } else if (img_request_write_test(img_request)) { op_type = OBJ_OP_WRITE; opcode 
= CEPH_OSD_OP_WRITE; -- cgit v1.2.2 From bef95455a44e2533fcea376740bb1a5cbd71269f Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Fri, 4 Apr 2014 17:47:52 -0700 Subject: rbd: fix snapshot context reference count for discards Discards take a reference to the snapshot context of an image when they are created. This reference needs to be cleaned up when the request is done just as it is for regular writes. Signed-off-by: Josh Durgin --- drivers/block/rbd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 31ace3dd33e4..de1520ccc0d4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2170,7 +2170,8 @@ static void rbd_img_request_destroy(struct kref *kref) rbd_dev_parent_put(img_request->rbd_dev); } - if (img_request_write_test(img_request)) + if (img_request_write_test(img_request) || + img_request_discard_test(img_request)) ceph_put_snap_context(img_request->snapc); kmem_cache_free(rbd_img_request_cache, img_request); -- cgit v1.2.2 From d0265de7c358d71a494dcd1ee28206b32754bb0f Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Mon, 7 Apr 2014 16:54:10 -0700 Subject: rbd: tolerate -ENOENT for discard operations Discard may try to delete an object from a non-layered image that does not exist. If this occurs, the image already has no data in that range, so change the result to success. Signed-off-by: Josh Durgin --- drivers/block/rbd.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index de1520ccc0d4..835a96a09a6b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1767,6 +1767,9 @@ static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request) * it to our originally-requested length. 
*/ obj_request->xferred = obj_request->length; + /* discarding a non-existent object is not a problem */ + if (obj_request->result == -ENOENT) + obj_request->result = 0; obj_request_done_set(obj_request); } -- cgit v1.2.2 From 1c220881e307b62cc2f77d911219de332aa3f61e Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Fri, 4 Apr 2014 17:49:12 -0700 Subject: rbd: make discard trigger copy-on-write Discard requests are a form of write, so they should go through the same process as plain write requests and trigger copy-on-write for layered images. Signed-off-by: Josh Durgin --- drivers/block/rbd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 835a96a09a6b..6fb93cd6957f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2851,7 +2851,8 @@ static bool img_obj_request_simple(struct rbd_obj_request *obj_request) rbd_dev = img_request->rbd_dev; /* Reads */ - if (!img_request_write_test(img_request)) + if (!img_request_write_test(img_request) && + !img_request_discard_test(img_request)) return true; /* Non-layered writes */ -- cgit v1.2.2 From 3b434a2aff38029ea053ce6c8fced53b2d01f7f0 Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Fri, 4 Apr 2014 17:32:15 -0700 Subject: rbd: extract a method for adding object operations rbd_img_request_fill() creates a ceph_osd_request and has logic for adding the appropriate osd ops to it based on the request type and image properties. For layered images, the original rbd_obj_request is resent with a copyup operation in front, using a new ceph_osd_request. The logic for adding the original operations should be the same as when first sending them, so move it to a helper function. op_type only needs to be checked once, so create a helper for that as well and call it outside the loop in rbd_img_request_fill(). 
Signed-off-by: Josh Durgin --- drivers/block/rbd.c | 133 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 78 insertions(+), 55 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6fb93cd6957f..c07cb1dbc1c5 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1672,6 +1672,17 @@ static bool img_request_layered_test(struct rbd_img_request *img_request) return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; } +static enum obj_operation_type +rbd_img_request_op_type(struct rbd_img_request *img_request) +{ + if (img_request_write_test(img_request)) + return OBJ_OP_WRITE; + else if (img_request_discard_test(img_request)) + return OBJ_OP_DISCARD; + else + return OBJ_OP_READ; +} + static void rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) { @@ -2307,6 +2318,68 @@ out: rbd_img_request_complete(img_request); } +/* + * Add individual osd ops to the given ceph_osd_request and prepare + * them for submission. num_ops is the current number of + * osd operations already added to the object request.
+ */ +static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, + struct ceph_osd_request *osd_request, + enum obj_operation_type op_type, + unsigned int num_ops) +{ + struct rbd_img_request *img_request = obj_request->img_request; + struct rbd_device *rbd_dev = img_request->rbd_dev; + u64 object_size = rbd_obj_bytes(&rbd_dev->header); + u64 offset = obj_request->offset; + u64 length = obj_request->length; + u64 img_end; + u16 opcode; + + if (op_type == OBJ_OP_DISCARD) { + if (!offset && (length == object_size) + && (!img_request_layered_test(img_request) || + (rbd_dev->parent_overlap <= + obj_request->img_offset))) { + opcode = CEPH_OSD_OP_DELETE; + } else if ((offset + length == object_size)) { + opcode = CEPH_OSD_OP_TRUNCATE; + } else { + down_read(&rbd_dev->header_rwsem); + img_end = rbd_dev->header.image_size; + up_read(&rbd_dev->header_rwsem); + + if (obj_request->img_offset + length == img_end) + opcode = CEPH_OSD_OP_TRUNCATE; + else + opcode = CEPH_OSD_OP_ZERO; + } + } else if (op_type == OBJ_OP_WRITE) { + opcode = CEPH_OSD_OP_WRITE; + osd_req_op_alloc_hint_init(osd_request, num_ops, + object_size, object_size); + num_ops++; + } else { + opcode = CEPH_OSD_OP_READ; + } + + osd_req_op_extent_init(osd_request, num_ops, opcode, offset, length, + 0, 0); + if (obj_request->type == OBJ_REQUEST_BIO) + osd_req_op_extent_osd_data_bio(osd_request, num_ops, + obj_request->bio_list, length); + else if (obj_request->type == OBJ_REQUEST_PAGES) + osd_req_op_extent_osd_data_pages(osd_request, num_ops, + obj_request->pages, length, + offset & ~PAGE_MASK, false, false); + + /* Discards are also writes */ + if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) + rbd_osd_req_format_write(obj_request); + else + rbd_osd_req_format_read(obj_request); +} + /* * Split up an image request into one or more object requests, each * to a different object. 
The "type" parameter indicates whether @@ -2326,11 +2399,8 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, unsigned int bio_offset = 0; struct page **pages = NULL; enum obj_operation_type op_type; - u64 object_size = rbd_obj_bytes(&rbd_dev->header); u64 img_offset; - u64 img_end; u64 resid; - u16 opcode; dout("%s: img %p type %d data_desc %p\n", __func__, img_request, (int)type, data_desc); @@ -2338,6 +2408,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, img_offset = img_request->offset; resid = img_request->length; rbd_assert(resid > 0); + op_type = rbd_img_request_op_type(img_request); if (type == OBJ_REQUEST_BIO) { bio_list = data_desc; @@ -2352,7 +2423,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, const char *object_name; u64 offset; u64 length; - unsigned int which = 0; object_name = rbd_segment_name(rbd_dev, img_offset); if (!object_name) @@ -2395,66 +2465,19 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request, pages += page_count; } - if (img_request_discard_test(img_request)) { - op_type = OBJ_OP_DISCARD; - if (!offset && (length == object_size) - && (!img_request_layered_test(img_request) || - (rbd_dev->parent_overlap <= - obj_request->img_offset))) { - opcode = CEPH_OSD_OP_DELETE; - } else if ((offset + length == object_size)) { - opcode = CEPH_OSD_OP_TRUNCATE; - } else { - down_read(&rbd_dev->header_rwsem); - img_end = rbd_dev->header.image_size; - up_read(&rbd_dev->header_rwsem); - - if (obj_request->img_offset + length == img_end) - opcode = CEPH_OSD_OP_TRUNCATE; - else - opcode = CEPH_OSD_OP_ZERO; - } - } else if (img_request_write_test(img_request)) { - op_type = OBJ_OP_WRITE; - opcode = CEPH_OSD_OP_WRITE; - } else { - op_type = OBJ_OP_READ; - opcode = CEPH_OSD_OP_READ; - } - osd_req = rbd_osd_req_create(rbd_dev, op_type, (op_type == OBJ_OP_WRITE) ? 
2 : 1, obj_request); if (!osd_req) goto out_unwind; + obj_request->osd_req = osd_req; obj_request->callback = rbd_img_obj_callback; - rbd_img_request_get(img_request); - - if (op_type == OBJ_OP_WRITE) { - osd_req_op_alloc_hint_init(osd_req, which, - rbd_obj_bytes(&rbd_dev->header), - rbd_obj_bytes(&rbd_dev->header)); - which++; - } - - osd_req_op_extent_init(osd_req, which, opcode, offset, length, - 0, 0); - if (type == OBJ_REQUEST_BIO) - osd_req_op_extent_osd_data_bio(osd_req, which, - obj_request->bio_list, length); - else if (type == OBJ_REQUEST_PAGES) - osd_req_op_extent_osd_data_pages(osd_req, which, - obj_request->pages, length, - offset & ~PAGE_MASK, false, false); + obj_request->img_offset = img_offset; - /* Discards are also writes */ - if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) - rbd_osd_req_format_write(obj_request); - else - rbd_osd_req_format_read(obj_request); + rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0); - obj_request->img_offset = img_offset; + rbd_img_request_get(img_request); img_offset += length; resid -= length; -- cgit v1.2.2 From d3246fb0da5d70838469c01d5b6b11163b49cd86 Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Mon, 7 Apr 2014 16:49:21 -0700 Subject: rbd: use helpers to handle discard for layered images correctly Only allocate two osd ops for discard requests, since the preallocation hint is only added for regular writes. Use rbd_img_obj_request_fill() to recreate the original write or discard osd operations, isolating that logic to one place, and change the assert in rbd_osd_req_create_copyup() to accept discard requests as well. 
Signed-off-by: Josh Durgin --- drivers/block/rbd.c | 54 ++++++++++++++++++++++------------------------------- 1 file changed, 22 insertions(+), 32 deletions(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c07cb1dbc1c5..e1dcd36ae072 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1934,9 +1934,10 @@ static struct ceph_osd_request *rbd_osd_req_create( } /* - * Create a copyup osd request based on the information in the - * object request supplied. A copyup request has three osd ops, - * a copyup method call, a hint op, and a write op. + * Create a copyup osd request based on the information in the object + * request supplied. A copyup request has two or three osd ops, a + * copyup method call, potentially a hint op, and a write or truncate + * or zero op. */ static struct ceph_osd_request * rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) @@ -1946,18 +1947,24 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) struct rbd_device *rbd_dev; struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; + int num_osd_ops = 3; rbd_assert(obj_request_img_data_test(obj_request)); img_request = obj_request->img_request; rbd_assert(img_request); - rbd_assert(img_request_write_test(img_request)); + rbd_assert(img_request_write_test(img_request) || + img_request_discard_test(img_request)); - /* Allocate and initialize the request, for the three ops */ + if (img_request_discard_test(img_request)) + num_osd_ops = 2; + + /* Allocate and initialize the request, for all the ops */ snapc = img_request->snapc; rbd_dev = img_request->rbd_dev; osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC); + osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops, + false, GFP_ATOMIC); if (!osd_req) return NULL; /* ENOMEM */ @@ -2337,10 +2344,9 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, u16 opcode; if (op_type == 
OBJ_OP_DISCARD) { - if (!offset && (length == object_size) - && (!img_request_layered_test(img_request) || - (rbd_dev->parent_overlap <= - obj_request->img_offset))) { + if (!offset && length == object_size && + (!img_request_layered_test(img_request) || + !obj_request_overlaps_parent(obj_request))) { opcode = CEPH_OSD_OP_DELETE; } else if ((offset + length == object_size)) { opcode = CEPH_OSD_OP_TRUNCATE; @@ -2500,7 +2506,8 @@ rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request) struct page **pages; u32 page_count; - rbd_assert(obj_request->type == OBJ_REQUEST_BIO); + rbd_assert(obj_request->type == OBJ_REQUEST_BIO || + obj_request->type == OBJ_REQUEST_NODATA); rbd_assert(obj_request_img_data_test(obj_request)); img_request = obj_request->img_request; rbd_assert(img_request); @@ -2538,11 +2545,10 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) struct ceph_osd_client *osdc; struct rbd_device *rbd_dev; struct page **pages; + enum obj_operation_type op_type; u32 page_count; int img_result; u64 parent_length; - u64 offset; - u64 length; rbd_assert(img_request_child_test(img_request)); @@ -2606,26 +2612,10 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request) osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, false, false); - /* Then the hint op */ - - osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header), - rbd_obj_bytes(&rbd_dev->header)); - - /* And the original write request op */ - - offset = orig_request->offset; - length = orig_request->length; - osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE, - offset, length, 0, 0); - if (orig_request->type == OBJ_REQUEST_BIO) - osd_req_op_extent_osd_data_bio(osd_req, 2, - orig_request->bio_list, length); - else - osd_req_op_extent_osd_data_pages(osd_req, 2, - orig_request->pages, length, - offset & ~PAGE_MASK, false, false); + /* Add the other op(s) */ - rbd_osd_req_format_write(orig_request); + op_type = 
rbd_img_request_op_type(orig_request->img_request); + rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1); /* All set, send it off. */ -- cgit v1.2.2 From b76f82398c1017e303d87760e22125714010207f Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Mon, 7 Apr 2014 16:52:03 -0700 Subject: rbd: set the remaining discard properties to enable support max_discard_sectors must be set for the queue to support discard. Operations implementing discard for rbd zero data, so report that. Signed-off-by: Josh Durgin --- drivers/block/rbd.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e1dcd36ae072..7712ae65753c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3764,6 +3764,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); q->limits.discard_granularity = segment_size; q->limits.discard_alignment = segment_size; + q->limits.max_discard_sectors = segment_size / SECTOR_SIZE; + q->limits.discard_zeroes_data = 1; blk_queue_merge_bvec(q, rbd_merge_bvec); disk->queue = q; -- cgit v1.2.2 From 792c3a914910bd34302c5345578f85cfcb5e2c01 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 10 Oct 2014 18:36:07 +0400 Subject: rbd: rbd workqueues need a rescue worker Need to use WQ_MEM_RECLAIM for our workqueues to prevent I/O lockups under memory pressure - we sit on the memory reclaim path.
Cc: stable@vger.kernel.org # 3.17, needs backporting for 3.16 Signed-off-by: Ilya Dryomov Tested-by: Micha Krause Reviewed-by: Sage Weil --- drivers/block/rbd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/block') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7712ae65753c..0a54c588e433 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5242,7 +5242,8 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); - rbd_dev->rq_wq = alloc_workqueue("%s", 0, 0, rbd_dev->disk->disk_name); + rbd_dev->rq_wq = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0, + rbd_dev->disk->disk_name); if (!rbd_dev->rq_wq) { ret = -ENOMEM; goto err_out_mapping; -- cgit v1.2.2