aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-10-15 00:46:01 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-10-15 00:46:01 -0400
commit6b0490816671b2f4126a99998c9bf3c8c0472de2 (patch)
tree016543455c2bdbe47b422fed6a3b4ffb991c97d6 /drivers/block
parentce9d7f7b45930ed16c512aabcfe651d44f1c8619 (diff)
parent0bc62284ee3f2a228c64902ed818b6ba8e04159b (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client
Pull Ceph updates from Sage Weil:
 "There is the long-awaited discard support for RBD (Guangliang Zhao,
  Josh Durgin), a pile of RBD bug fixes that didn't belong in late -rc's
  (Ilya Dryomov, Li RongQing), a pile of fs/ceph bug fixes and
  performance and debugging improvements (Yan, Zheng, John Spray), and a
  smattering of cleanups (Chao Yu, Fabian Frederick, Joe Perches)"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (40 commits)
  ceph: fix divide-by-zero in __validate_layout()
  rbd: rbd workqueues need a resque worker
  libceph: ceph-msgr workqueue needs a resque worker
  ceph: fix bool assignments
  libceph: separate multiple ops with commas in debugfs output
  libceph: sync osd op definitions in rados.h
  libceph: remove redundant declaration
  ceph: additional debugfs output
  ceph: export ceph_session_state_name function
  ceph: include the initial ACL in create/mkdir/mknod MDS requests
  ceph: use pagelist to present MDS request data
  libceph: reference counting pagelist
  ceph: fix llistxattr on symlink
  ceph: send client metadata to MDS
  ceph: remove redundant code for max file size verification
  ceph: remove redundant io_iter_advance()
  ceph: move ceph_find_inode() outside the s_mutex
  ceph: request xattrs if xattr_version is zero
  rbd: set the remaining discard properties to enable support
  rbd: use helpers to handle discard for layered images correctly
  ...
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/rbd.c396
1 files changed, 276 insertions, 120 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4b97baf8afa3..0a54c588e433 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -210,6 +210,12 @@ enum obj_request_type {
210 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES 210 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
211}; 211};
212 212
/*
 * The kind of operation a request carries out.  The enumerator order
 * (and therefore the numeric values) is kept exactly as declared.
 */
enum obj_operation_type {
	OBJ_OP_WRITE,		/* write data to the object(s) */
	OBJ_OP_READ,		/* read data from the object(s) */
	OBJ_OP_DISCARD,		/* discard a range of the object(s) */
};
218
213enum obj_req_flags { 219enum obj_req_flags {
214 OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */ 220 OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */
215 OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */ 221 OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */
@@ -276,6 +282,7 @@ enum img_req_flags {
276 IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */ 282 IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */
277 IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */ 283 IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */
278 IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */ 284 IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
285 IMG_REQ_DISCARD, /* discard: normal = 0, discard request = 1 */
279}; 286};
280 287
281struct rbd_img_request { 288struct rbd_img_request {
@@ -785,6 +792,20 @@ static int parse_rbd_opts_token(char *c, void *private)
785 return 0; 792 return 0;
786} 793}
787 794
795static char* obj_op_name(enum obj_operation_type op_type)
796{
797 switch (op_type) {
798 case OBJ_OP_READ:
799 return "read";
800 case OBJ_OP_WRITE:
801 return "write";
802 case OBJ_OP_DISCARD:
803 return "discard";
804 default:
805 return "???";
806 }
807}
808
788/* 809/*
789 * Get a ceph client with specific addr and configuration, if one does 810 * Get a ceph client with specific addr and configuration, if one does
790 * not exist create it. Either way, ceph_opts is consumed by this 811 * not exist create it. Either way, ceph_opts is consumed by this
@@ -1600,6 +1621,21 @@ static bool img_request_write_test(struct rbd_img_request *img_request)
1600 return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0; 1621 return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1601} 1622}
1602 1623
1624/*
1625 * Set the discard flag when the img_request is an discard request
1626 */
1627static void img_request_discard_set(struct rbd_img_request *img_request)
1628{
1629 set_bit(IMG_REQ_DISCARD, &img_request->flags);
1630 smp_mb();
1631}
1632
1633static bool img_request_discard_test(struct rbd_img_request *img_request)
1634{
1635 smp_mb();
1636 return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
1637}
1638
1603static void img_request_child_set(struct rbd_img_request *img_request) 1639static void img_request_child_set(struct rbd_img_request *img_request)
1604{ 1640{
1605 set_bit(IMG_REQ_CHILD, &img_request->flags); 1641 set_bit(IMG_REQ_CHILD, &img_request->flags);
@@ -1636,6 +1672,17 @@ static bool img_request_layered_test(struct rbd_img_request *img_request)
1636 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0; 1672 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1637} 1673}
1638 1674
1675static enum obj_operation_type
1676rbd_img_request_op_type(struct rbd_img_request *img_request)
1677{
1678 if (img_request_write_test(img_request))
1679 return OBJ_OP_WRITE;
1680 else if (img_request_discard_test(img_request))
1681 return OBJ_OP_DISCARD;
1682 else
1683 return OBJ_OP_READ;
1684}
1685
1639static void 1686static void
1640rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request) 1687rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1641{ 1688{
@@ -1722,6 +1769,21 @@ static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
1722 obj_request_done_set(obj_request); 1769 obj_request_done_set(obj_request);
1723} 1770}
1724 1771
1772static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
1773{
1774 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1775 obj_request->result, obj_request->length);
1776 /*
1777 * There is no such thing as a successful short discard. Set
1778 * it to our originally-requested length.
1779 */
1780 obj_request->xferred = obj_request->length;
1781 /* discarding a non-existent object is not a problem */
1782 if (obj_request->result == -ENOENT)
1783 obj_request->result = 0;
1784 obj_request_done_set(obj_request);
1785}
1786
1725/* 1787/*
1726 * For a simple stat call there's nothing to do. We'll do more if 1788 * For a simple stat call there's nothing to do. We'll do more if
1727 * this is part of a write sequence for a layered image. 1789 * this is part of a write sequence for a layered image.
@@ -1773,6 +1835,11 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1773 case CEPH_OSD_OP_STAT: 1835 case CEPH_OSD_OP_STAT:
1774 rbd_osd_stat_callback(obj_request); 1836 rbd_osd_stat_callback(obj_request);
1775 break; 1837 break;
1838 case CEPH_OSD_OP_DELETE:
1839 case CEPH_OSD_OP_TRUNCATE:
1840 case CEPH_OSD_OP_ZERO:
1841 rbd_osd_discard_callback(obj_request);
1842 break;
1776 case CEPH_OSD_OP_CALL: 1843 case CEPH_OSD_OP_CALL:
1777 case CEPH_OSD_OP_NOTIFY_ACK: 1844 case CEPH_OSD_OP_NOTIFY_ACK:
1778 case CEPH_OSD_OP_WATCH: 1845 case CEPH_OSD_OP_WATCH:
@@ -1823,7 +1890,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1823 */ 1890 */
1824static struct ceph_osd_request *rbd_osd_req_create( 1891static struct ceph_osd_request *rbd_osd_req_create(
1825 struct rbd_device *rbd_dev, 1892 struct rbd_device *rbd_dev,
1826 bool write_request, 1893 enum obj_operation_type op_type,
1827 unsigned int num_ops, 1894 unsigned int num_ops,
1828 struct rbd_obj_request *obj_request) 1895 struct rbd_obj_request *obj_request)
1829{ 1896{
@@ -1831,16 +1898,18 @@ static struct ceph_osd_request *rbd_osd_req_create(
1831 struct ceph_osd_client *osdc; 1898 struct ceph_osd_client *osdc;
1832 struct ceph_osd_request *osd_req; 1899 struct ceph_osd_request *osd_req;
1833 1900
1834 if (obj_request_img_data_test(obj_request)) { 1901 if (obj_request_img_data_test(obj_request) &&
1902 (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
1835 struct rbd_img_request *img_request = obj_request->img_request; 1903 struct rbd_img_request *img_request = obj_request->img_request;
1836 1904 if (op_type == OBJ_OP_WRITE) {
1837 rbd_assert(write_request == 1905 rbd_assert(img_request_write_test(img_request));
1838 img_request_write_test(img_request)); 1906 } else {
1839 if (write_request) 1907 rbd_assert(img_request_discard_test(img_request));
1840 snapc = img_request->snapc; 1908 }
1909 snapc = img_request->snapc;
1841 } 1910 }
1842 1911
1843 rbd_assert(num_ops == 1 || (write_request && num_ops == 2)); 1912 rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));
1844 1913
1845 /* Allocate and initialize the request, for the num_ops ops */ 1914 /* Allocate and initialize the request, for the num_ops ops */
1846 1915
@@ -1850,7 +1919,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
1850 if (!osd_req) 1919 if (!osd_req)
1851 return NULL; /* ENOMEM */ 1920 return NULL; /* ENOMEM */
1852 1921
1853 if (write_request) 1922 if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
1854 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK; 1923 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1855 else 1924 else
1856 osd_req->r_flags = CEPH_OSD_FLAG_READ; 1925 osd_req->r_flags = CEPH_OSD_FLAG_READ;
@@ -1865,9 +1934,10 @@ static struct ceph_osd_request *rbd_osd_req_create(
1865} 1934}
1866 1935
1867/* 1936/*
1868 * Create a copyup osd request based on the information in the 1937 * Create a copyup osd request based on the information in the object
1869 * object request supplied. A copyup request has three osd ops, 1938 * request supplied. A copyup request has two or three osd ops, a
1870 * a copyup method call, a hint op, and a write op. 1939 * copyup method call, potentially a hint op, and a write or truncate
1940 * or zero op.
1871 */ 1941 */
1872static struct ceph_osd_request * 1942static struct ceph_osd_request *
1873rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request) 1943rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
@@ -1877,18 +1947,24 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1877 struct rbd_device *rbd_dev; 1947 struct rbd_device *rbd_dev;
1878 struct ceph_osd_client *osdc; 1948 struct ceph_osd_client *osdc;
1879 struct ceph_osd_request *osd_req; 1949 struct ceph_osd_request *osd_req;
1950 int num_osd_ops = 3;
1880 1951
1881 rbd_assert(obj_request_img_data_test(obj_request)); 1952 rbd_assert(obj_request_img_data_test(obj_request));
1882 img_request = obj_request->img_request; 1953 img_request = obj_request->img_request;
1883 rbd_assert(img_request); 1954 rbd_assert(img_request);
1884 rbd_assert(img_request_write_test(img_request)); 1955 rbd_assert(img_request_write_test(img_request) ||
1956 img_request_discard_test(img_request));
1957
1958 if (img_request_discard_test(img_request))
1959 num_osd_ops = 2;
1885 1960
1886 /* Allocate and initialize the request, for the three ops */ 1961 /* Allocate and initialize the request, for all the ops */
1887 1962
1888 snapc = img_request->snapc; 1963 snapc = img_request->snapc;
1889 rbd_dev = img_request->rbd_dev; 1964 rbd_dev = img_request->rbd_dev;
1890 osdc = &rbd_dev->rbd_client->client->osdc; 1965 osdc = &rbd_dev->rbd_client->client->osdc;
1891 osd_req = ceph_osdc_alloc_request(osdc, snapc, 3, false, GFP_ATOMIC); 1966 osd_req = ceph_osdc_alloc_request(osdc, snapc, num_osd_ops,
1967 false, GFP_ATOMIC);
1892 if (!osd_req) 1968 if (!osd_req)
1893 return NULL; /* ENOMEM */ 1969 return NULL; /* ENOMEM */
1894 1970
@@ -2057,7 +2133,8 @@ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2057static struct rbd_img_request *rbd_img_request_create( 2133static struct rbd_img_request *rbd_img_request_create(
2058 struct rbd_device *rbd_dev, 2134 struct rbd_device *rbd_dev,
2059 u64 offset, u64 length, 2135 u64 offset, u64 length,
2060 bool write_request) 2136 enum obj_operation_type op_type,
2137 struct ceph_snap_context *snapc)
2061{ 2138{
2062 struct rbd_img_request *img_request; 2139 struct rbd_img_request *img_request;
2063 2140
@@ -2065,20 +2142,17 @@ static struct rbd_img_request *rbd_img_request_create(
2065 if (!img_request) 2142 if (!img_request)
2066 return NULL; 2143 return NULL;
2067 2144
2068 if (write_request) {
2069 down_read(&rbd_dev->header_rwsem);
2070 ceph_get_snap_context(rbd_dev->header.snapc);
2071 up_read(&rbd_dev->header_rwsem);
2072 }
2073
2074 img_request->rq = NULL; 2145 img_request->rq = NULL;
2075 img_request->rbd_dev = rbd_dev; 2146 img_request->rbd_dev = rbd_dev;
2076 img_request->offset = offset; 2147 img_request->offset = offset;
2077 img_request->length = length; 2148 img_request->length = length;
2078 img_request->flags = 0; 2149 img_request->flags = 0;
2079 if (write_request) { 2150 if (op_type == OBJ_OP_DISCARD) {
2151 img_request_discard_set(img_request);
2152 img_request->snapc = snapc;
2153 } else if (op_type == OBJ_OP_WRITE) {
2080 img_request_write_set(img_request); 2154 img_request_write_set(img_request);
2081 img_request->snapc = rbd_dev->header.snapc; 2155 img_request->snapc = snapc;
2082 } else { 2156 } else {
2083 img_request->snap_id = rbd_dev->spec->snap_id; 2157 img_request->snap_id = rbd_dev->spec->snap_id;
2084 } 2158 }
@@ -2093,8 +2167,7 @@ static struct rbd_img_request *rbd_img_request_create(
2093 kref_init(&img_request->kref); 2167 kref_init(&img_request->kref);
2094 2168
2095 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev, 2169 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
2096 write_request ? "write" : "read", offset, length, 2170 obj_op_name(op_type), offset, length, img_request);
2097 img_request);
2098 2171
2099 return img_request; 2172 return img_request;
2100} 2173}
@@ -2118,7 +2191,8 @@ static void rbd_img_request_destroy(struct kref *kref)
2118 rbd_dev_parent_put(img_request->rbd_dev); 2191 rbd_dev_parent_put(img_request->rbd_dev);
2119 } 2192 }
2120 2193
2121 if (img_request_write_test(img_request)) 2194 if (img_request_write_test(img_request) ||
2195 img_request_discard_test(img_request))
2122 ceph_put_snap_context(img_request->snapc); 2196 ceph_put_snap_context(img_request->snapc);
2123 2197
2124 kmem_cache_free(rbd_img_request_cache, img_request); 2198 kmem_cache_free(rbd_img_request_cache, img_request);
@@ -2134,8 +2208,8 @@ static struct rbd_img_request *rbd_parent_request_create(
2134 rbd_assert(obj_request->img_request); 2208 rbd_assert(obj_request->img_request);
2135 rbd_dev = obj_request->img_request->rbd_dev; 2209 rbd_dev = obj_request->img_request->rbd_dev;
2136 2210
2137 parent_request = rbd_img_request_create(rbd_dev->parent, 2211 parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
2138 img_offset, length, false); 2212 length, OBJ_OP_READ, NULL);
2139 if (!parent_request) 2213 if (!parent_request)
2140 return NULL; 2214 return NULL;
2141 2215
@@ -2176,11 +2250,18 @@ static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
2176 result = obj_request->result; 2250 result = obj_request->result;
2177 if (result) { 2251 if (result) {
2178 struct rbd_device *rbd_dev = img_request->rbd_dev; 2252 struct rbd_device *rbd_dev = img_request->rbd_dev;
2253 enum obj_operation_type op_type;
2254
2255 if (img_request_discard_test(img_request))
2256 op_type = OBJ_OP_DISCARD;
2257 else if (img_request_write_test(img_request))
2258 op_type = OBJ_OP_WRITE;
2259 else
2260 op_type = OBJ_OP_READ;
2179 2261
2180 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)", 2262 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
2181 img_request_write_test(img_request) ? "write" : "read", 2263 obj_op_name(op_type), obj_request->length,
2182 obj_request->length, obj_request->img_offset, 2264 obj_request->img_offset, obj_request->offset);
2183 obj_request->offset);
2184 rbd_warn(rbd_dev, " result %d xferred %x", 2265 rbd_warn(rbd_dev, " result %d xferred %x",
2185 result, xferred); 2266 result, xferred);
2186 if (!img_request->result) 2267 if (!img_request->result)
@@ -2245,6 +2326,67 @@ out:
2245} 2326}
2246 2327
2247/* 2328/*
2329 * Add individual osd ops to the given ceph_osd_request and prepare
2330 * them for submission. num_ops is the current number of
2331 * osd operations already added to the object request.
2332 */
2333static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
2334 struct ceph_osd_request *osd_request,
2335 enum obj_operation_type op_type,
2336 unsigned int num_ops)
2337{
2338 struct rbd_img_request *img_request = obj_request->img_request;
2339 struct rbd_device *rbd_dev = img_request->rbd_dev;
2340 u64 object_size = rbd_obj_bytes(&rbd_dev->header);
2341 u64 offset = obj_request->offset;
2342 u64 length = obj_request->length;
2343 u64 img_end;
2344 u16 opcode;
2345
2346 if (op_type == OBJ_OP_DISCARD) {
2347 if (!offset && length == object_size &&
2348 (!img_request_layered_test(img_request) ||
2349 !obj_request_overlaps_parent(obj_request))) {
2350 opcode = CEPH_OSD_OP_DELETE;
2351 } else if ((offset + length == object_size)) {
2352 opcode = CEPH_OSD_OP_TRUNCATE;
2353 } else {
2354 down_read(&rbd_dev->header_rwsem);
2355 img_end = rbd_dev->header.image_size;
2356 up_read(&rbd_dev->header_rwsem);
2357
2358 if (obj_request->img_offset + length == img_end)
2359 opcode = CEPH_OSD_OP_TRUNCATE;
2360 else
2361 opcode = CEPH_OSD_OP_ZERO;
2362 }
2363 } else if (op_type == OBJ_OP_WRITE) {
2364 opcode = CEPH_OSD_OP_WRITE;
2365 osd_req_op_alloc_hint_init(osd_request, num_ops,
2366 object_size, object_size);
2367 num_ops++;
2368 } else {
2369 opcode = CEPH_OSD_OP_READ;
2370 }
2371
2372 osd_req_op_extent_init(osd_request, num_ops, opcode, offset, length,
2373 0, 0);
2374 if (obj_request->type == OBJ_REQUEST_BIO)
2375 osd_req_op_extent_osd_data_bio(osd_request, num_ops,
2376 obj_request->bio_list, length);
2377 else if (obj_request->type == OBJ_REQUEST_PAGES)
2378 osd_req_op_extent_osd_data_pages(osd_request, num_ops,
2379 obj_request->pages, length,
2380 offset & ~PAGE_MASK, false, false);
2381
2382 /* Discards are also writes */
2383 if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
2384 rbd_osd_req_format_write(obj_request);
2385 else
2386 rbd_osd_req_format_read(obj_request);
2387}
2388
2389/*
2248 * Split up an image request into one or more object requests, each 2390 * Split up an image request into one or more object requests, each
2249 * to a different object. The "type" parameter indicates whether 2391 * to a different object. The "type" parameter indicates whether
2250 * "data_desc" is the pointer to the head of a list of bio 2392 * "data_desc" is the pointer to the head of a list of bio
@@ -2259,28 +2401,26 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
2259 struct rbd_device *rbd_dev = img_request->rbd_dev; 2401 struct rbd_device *rbd_dev = img_request->rbd_dev;
2260 struct rbd_obj_request *obj_request = NULL; 2402 struct rbd_obj_request *obj_request = NULL;
2261 struct rbd_obj_request *next_obj_request; 2403 struct rbd_obj_request *next_obj_request;
2262 bool write_request = img_request_write_test(img_request);
2263 struct bio *bio_list = NULL; 2404 struct bio *bio_list = NULL;
2264 unsigned int bio_offset = 0; 2405 unsigned int bio_offset = 0;
2265 struct page **pages = NULL; 2406 struct page **pages = NULL;
2407 enum obj_operation_type op_type;
2266 u64 img_offset; 2408 u64 img_offset;
2267 u64 resid; 2409 u64 resid;
2268 u16 opcode;
2269 2410
2270 dout("%s: img %p type %d data_desc %p\n", __func__, img_request, 2411 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2271 (int)type, data_desc); 2412 (int)type, data_desc);
2272 2413
2273 opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
2274 img_offset = img_request->offset; 2414 img_offset = img_request->offset;
2275 resid = img_request->length; 2415 resid = img_request->length;
2276 rbd_assert(resid > 0); 2416 rbd_assert(resid > 0);
2417 op_type = rbd_img_request_op_type(img_request);
2277 2418
2278 if (type == OBJ_REQUEST_BIO) { 2419 if (type == OBJ_REQUEST_BIO) {
2279 bio_list = data_desc; 2420 bio_list = data_desc;
2280 rbd_assert(img_offset == 2421 rbd_assert(img_offset ==
2281 bio_list->bi_iter.bi_sector << SECTOR_SHIFT); 2422 bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
2282 } else { 2423 } else if (type == OBJ_REQUEST_PAGES) {
2283 rbd_assert(type == OBJ_REQUEST_PAGES);
2284 pages = data_desc; 2424 pages = data_desc;
2285 } 2425 }
2286 2426
@@ -2289,7 +2429,6 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
2289 const char *object_name; 2429 const char *object_name;
2290 u64 offset; 2430 u64 offset;
2291 u64 length; 2431 u64 length;
2292 unsigned int which = 0;
2293 2432
2294 object_name = rbd_segment_name(rbd_dev, img_offset); 2433 object_name = rbd_segment_name(rbd_dev, img_offset);
2295 if (!object_name) 2434 if (!object_name)
@@ -2321,7 +2460,7 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
2321 GFP_ATOMIC); 2460 GFP_ATOMIC);
2322 if (!obj_request->bio_list) 2461 if (!obj_request->bio_list)
2323 goto out_unwind; 2462 goto out_unwind;
2324 } else { 2463 } else if (type == OBJ_REQUEST_PAGES) {
2325 unsigned int page_count; 2464 unsigned int page_count;
2326 2465
2327 obj_request->pages = pages; 2466 obj_request->pages = pages;
@@ -2332,38 +2471,19 @@ static int rbd_img_request_fill(struct rbd_img_request *img_request,
2332 pages += page_count; 2471 pages += page_count;
2333 } 2472 }
2334 2473
2335 osd_req = rbd_osd_req_create(rbd_dev, write_request, 2474 osd_req = rbd_osd_req_create(rbd_dev, op_type,
2336 (write_request ? 2 : 1), 2475 (op_type == OBJ_OP_WRITE) ? 2 : 1,
2337 obj_request); 2476 obj_request);
2338 if (!osd_req) 2477 if (!osd_req)
2339 goto out_unwind; 2478 goto out_unwind;
2479
2340 obj_request->osd_req = osd_req; 2480 obj_request->osd_req = osd_req;
2341 obj_request->callback = rbd_img_obj_callback; 2481 obj_request->callback = rbd_img_obj_callback;
2342 rbd_img_request_get(img_request); 2482 obj_request->img_offset = img_offset;
2343
2344 if (write_request) {
2345 osd_req_op_alloc_hint_init(osd_req, which,
2346 rbd_obj_bytes(&rbd_dev->header),
2347 rbd_obj_bytes(&rbd_dev->header));
2348 which++;
2349 }
2350
2351 osd_req_op_extent_init(osd_req, which, opcode, offset, length,
2352 0, 0);
2353 if (type == OBJ_REQUEST_BIO)
2354 osd_req_op_extent_osd_data_bio(osd_req, which,
2355 obj_request->bio_list, length);
2356 else
2357 osd_req_op_extent_osd_data_pages(osd_req, which,
2358 obj_request->pages, length,
2359 offset & ~PAGE_MASK, false, false);
2360 2483
2361 if (write_request) 2484 rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
2362 rbd_osd_req_format_write(obj_request);
2363 else
2364 rbd_osd_req_format_read(obj_request);
2365 2485
2366 obj_request->img_offset = img_offset; 2486 rbd_img_request_get(img_request);
2367 2487
2368 img_offset += length; 2488 img_offset += length;
2369 resid -= length; 2489 resid -= length;
@@ -2386,7 +2506,8 @@ rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2386 struct page **pages; 2506 struct page **pages;
2387 u32 page_count; 2507 u32 page_count;
2388 2508
2389 rbd_assert(obj_request->type == OBJ_REQUEST_BIO); 2509 rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2510 obj_request->type == OBJ_REQUEST_NODATA);
2390 rbd_assert(obj_request_img_data_test(obj_request)); 2511 rbd_assert(obj_request_img_data_test(obj_request));
2391 img_request = obj_request->img_request; 2512 img_request = obj_request->img_request;
2392 rbd_assert(img_request); 2513 rbd_assert(img_request);
@@ -2424,11 +2545,10 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2424 struct ceph_osd_client *osdc; 2545 struct ceph_osd_client *osdc;
2425 struct rbd_device *rbd_dev; 2546 struct rbd_device *rbd_dev;
2426 struct page **pages; 2547 struct page **pages;
2548 enum obj_operation_type op_type;
2427 u32 page_count; 2549 u32 page_count;
2428 int img_result; 2550 int img_result;
2429 u64 parent_length; 2551 u64 parent_length;
2430 u64 offset;
2431 u64 length;
2432 2552
2433 rbd_assert(img_request_child_test(img_request)); 2553 rbd_assert(img_request_child_test(img_request));
2434 2554
@@ -2492,26 +2612,10 @@ rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2492 osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0, 2612 osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
2493 false, false); 2613 false, false);
2494 2614
2495 /* Then the hint op */ 2615 /* Add the other op(s) */
2496 2616
2497 osd_req_op_alloc_hint_init(osd_req, 1, rbd_obj_bytes(&rbd_dev->header), 2617 op_type = rbd_img_request_op_type(orig_request->img_request);
2498 rbd_obj_bytes(&rbd_dev->header)); 2618 rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
2499
2500 /* And the original write request op */
2501
2502 offset = orig_request->offset;
2503 length = orig_request->length;
2504 osd_req_op_extent_init(osd_req, 2, CEPH_OSD_OP_WRITE,
2505 offset, length, 0, 0);
2506 if (orig_request->type == OBJ_REQUEST_BIO)
2507 osd_req_op_extent_osd_data_bio(osd_req, 2,
2508 orig_request->bio_list, length);
2509 else
2510 osd_req_op_extent_osd_data_pages(osd_req, 2,
2511 orig_request->pages, length,
2512 offset & ~PAGE_MASK, false, false);
2513
2514 rbd_osd_req_format_write(orig_request);
2515 2619
2516 /* All set, send it off. */ 2620 /* All set, send it off. */
2517 2621
@@ -2728,7 +2832,7 @@ static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2728 2832
2729 rbd_assert(obj_request->img_request); 2833 rbd_assert(obj_request->img_request);
2730 rbd_dev = obj_request->img_request->rbd_dev; 2834 rbd_dev = obj_request->img_request->rbd_dev;
2731 stat_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, 2835 stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2732 stat_request); 2836 stat_request);
2733 if (!stat_request->osd_req) 2837 if (!stat_request->osd_req)
2734 goto out; 2838 goto out;
@@ -2748,11 +2852,10 @@ out:
2748 return ret; 2852 return ret;
2749} 2853}
2750 2854
2751static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request) 2855static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
2752{ 2856{
2753 struct rbd_img_request *img_request; 2857 struct rbd_img_request *img_request;
2754 struct rbd_device *rbd_dev; 2858 struct rbd_device *rbd_dev;
2755 bool known;
2756 2859
2757 rbd_assert(obj_request_img_data_test(obj_request)); 2860 rbd_assert(obj_request_img_data_test(obj_request));
2758 2861
@@ -2760,22 +2863,44 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2760 rbd_assert(img_request); 2863 rbd_assert(img_request);
2761 rbd_dev = img_request->rbd_dev; 2864 rbd_dev = img_request->rbd_dev;
2762 2865
2866 /* Reads */
2867 if (!img_request_write_test(img_request) &&
2868 !img_request_discard_test(img_request))
2869 return true;
2870
2871 /* Non-layered writes */
2872 if (!img_request_layered_test(img_request))
2873 return true;
2874
2875 /*
2876 * Layered writes outside of the parent overlap range don't
2877 * share any data with the parent.
2878 */
2879 if (!obj_request_overlaps_parent(obj_request))
2880 return true;
2881
2763 /* 2882 /*
2764 * Only writes to layered images need special handling. 2883 * Entire-object layered writes - we will overwrite whatever
2765 * Reads and non-layered writes are simple object requests. 2884 * parent data there is anyway.
2766 * Layered writes that start beyond the end of the overlap
2767 * with the parent have no parent data, so they too are
2768 * simple object requests. Finally, if the target object is
2769 * known to already exist, its parent data has already been
2770 * copied, so a write to the object can also be handled as a
2771 * simple object request.
2772 */ 2885 */
2773 if (!img_request_write_test(img_request) || 2886 if (!obj_request->offset &&
2774 !img_request_layered_test(img_request) || 2887 obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2775 !obj_request_overlaps_parent(obj_request) || 2888 return true;
2776 ((known = obj_request_known_test(obj_request)) && 2889
2777 obj_request_exists_test(obj_request))) { 2890 /*
2891 * If the object is known to already exist, its parent data has
2892 * already been copied.
2893 */
2894 if (obj_request_known_test(obj_request) &&
2895 obj_request_exists_test(obj_request))
2896 return true;
2897
2898 return false;
2899}
2778 2900
2901static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2902{
2903 if (img_obj_request_simple(obj_request)) {
2779 struct rbd_device *rbd_dev; 2904 struct rbd_device *rbd_dev;
2780 struct ceph_osd_client *osdc; 2905 struct ceph_osd_client *osdc;
2781 2906
@@ -2791,7 +2916,7 @@ static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2791 * start by reading the data for the full target object from 2916 * start by reading the data for the full target object from
2792 * the parent so we can use it for a copyup to the target. 2917 * the parent so we can use it for a copyup to the target.
2793 */ 2918 */
2794 if (known) 2919 if (obj_request_known_test(obj_request))
2795 return rbd_img_obj_parent_read_full(obj_request); 2920 return rbd_img_obj_parent_read_full(obj_request);
2796 2921
2797 /* We don't know whether the target exists. Go find out. */ 2922 /* We don't know whether the target exists. Go find out. */
@@ -2932,7 +3057,7 @@ static int rbd_obj_notify_ack_sync(struct rbd_device *rbd_dev, u64 notify_id)
2932 return -ENOMEM; 3057 return -ENOMEM;
2933 3058
2934 ret = -ENOMEM; 3059 ret = -ENOMEM;
2935 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, 3060 obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2936 obj_request); 3061 obj_request);
2937 if (!obj_request->osd_req) 3062 if (!obj_request->osd_req)
2938 goto out; 3063 goto out;
@@ -2995,7 +3120,7 @@ static struct rbd_obj_request *rbd_obj_watch_request_helper(
2995 if (!obj_request) 3120 if (!obj_request)
2996 return ERR_PTR(-ENOMEM); 3121 return ERR_PTR(-ENOMEM);
2997 3122
2998 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, 1, 3123 obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 1,
2999 obj_request); 3124 obj_request);
3000 if (!obj_request->osd_req) { 3125 if (!obj_request->osd_req) {
3001 ret = -ENOMEM; 3126 ret = -ENOMEM;
@@ -3133,7 +3258,7 @@ static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
3133 obj_request->pages = pages; 3258 obj_request->pages = pages;
3134 obj_request->page_count = page_count; 3259 obj_request->page_count = page_count;
3135 3260
3136 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, 3261 obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3137 obj_request); 3262 obj_request);
3138 if (!obj_request->osd_req) 3263 if (!obj_request->osd_req)
3139 goto out; 3264 goto out;
@@ -3183,11 +3308,20 @@ out:
3183static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq) 3308static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
3184{ 3309{
3185 struct rbd_img_request *img_request; 3310 struct rbd_img_request *img_request;
3311 struct ceph_snap_context *snapc = NULL;
3186 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT; 3312 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3187 u64 length = blk_rq_bytes(rq); 3313 u64 length = blk_rq_bytes(rq);
3188 bool wr = rq_data_dir(rq) == WRITE; 3314 enum obj_operation_type op_type;
3315 u64 mapping_size;
3189 int result; 3316 int result;
3190 3317
3318 if (rq->cmd_flags & REQ_DISCARD)
3319 op_type = OBJ_OP_DISCARD;
3320 else if (rq->cmd_flags & REQ_WRITE)
3321 op_type = OBJ_OP_WRITE;
3322 else
3323 op_type = OBJ_OP_READ;
3324
3191 /* Ignore/skip any zero-length requests */ 3325 /* Ignore/skip any zero-length requests */
3192 3326
3193 if (!length) { 3327 if (!length) {
@@ -3196,9 +3330,9 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
3196 goto err_rq; 3330 goto err_rq;
3197 } 3331 }
3198 3332
3199 /* Disallow writes to a read-only device */ 3333 /* Only reads are allowed to a read-only device */
3200 3334
3201 if (wr) { 3335 if (op_type != OBJ_OP_READ) {
3202 if (rbd_dev->mapping.read_only) { 3336 if (rbd_dev->mapping.read_only) {
3203 result = -EROFS; 3337 result = -EROFS;
3204 goto err_rq; 3338 goto err_rq;
@@ -3226,21 +3360,35 @@ static void rbd_handle_request(struct rbd_device *rbd_dev, struct request *rq)
3226 goto err_rq; /* Shouldn't happen */ 3360 goto err_rq; /* Shouldn't happen */
3227 } 3361 }
3228 3362
3229 if (offset + length > rbd_dev->mapping.size) { 3363 down_read(&rbd_dev->header_rwsem);
3364 mapping_size = rbd_dev->mapping.size;
3365 if (op_type != OBJ_OP_READ) {
3366 snapc = rbd_dev->header.snapc;
3367 ceph_get_snap_context(snapc);
3368 }
3369 up_read(&rbd_dev->header_rwsem);
3370
3371 if (offset + length > mapping_size) {
3230 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset, 3372 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
3231 length, rbd_dev->mapping.size); 3373 length, mapping_size);
3232 result = -EIO; 3374 result = -EIO;
3233 goto err_rq; 3375 goto err_rq;
3234 } 3376 }
3235 3377
3236 img_request = rbd_img_request_create(rbd_dev, offset, length, wr); 3378 img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
3379 snapc);
3237 if (!img_request) { 3380 if (!img_request) {
3238 result = -ENOMEM; 3381 result = -ENOMEM;
3239 goto err_rq; 3382 goto err_rq;
3240 } 3383 }
3241 img_request->rq = rq; 3384 img_request->rq = rq;
3242 3385
3243 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO, rq->bio); 3386 if (op_type == OBJ_OP_DISCARD)
3387 result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
3388 NULL);
3389 else
3390 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3391 rq->bio);
3244 if (result) 3392 if (result)
3245 goto err_img_request; 3393 goto err_img_request;
3246 3394
@@ -3255,7 +3403,9 @@ err_img_request:
3255 err_rq: 3403 err_rq:
3256 if (result) 3404 if (result)
3257 rbd_warn(rbd_dev, "%s %llx at %llx result %d", 3405 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
3258 wr ? "write" : "read", length, offset, result); 3406 obj_op_name(op_type), length, offset, result);
3407 if (snapc)
3408 ceph_put_snap_context(snapc);
3259 blk_end_request_all(rq, result); 3409 blk_end_request_all(rq, result);
3260} 3410}
3261 3411
@@ -3393,7 +3543,7 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
3393 obj_request->pages = pages; 3543 obj_request->pages = pages;
3394 obj_request->page_count = page_count; 3544 obj_request->page_count = page_count;
3395 3545
3396 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, 1, 3546 obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
3397 obj_request); 3547 obj_request);
3398 if (!obj_request->osd_req) 3548 if (!obj_request->osd_req)
3399 goto out; 3549 goto out;
@@ -3610,6 +3760,13 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
3610 blk_queue_io_min(q, segment_size); 3760 blk_queue_io_min(q, segment_size);
3611 blk_queue_io_opt(q, segment_size); 3761 blk_queue_io_opt(q, segment_size);
3612 3762
3763 /* enable the discard support */
3764 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
3765 q->limits.discard_granularity = segment_size;
3766 q->limits.discard_alignment = segment_size;
3767 q->limits.max_discard_sectors = segment_size / SECTOR_SIZE;
3768 q->limits.discard_zeroes_data = 1;
3769
3613 blk_queue_merge_bvec(q, rbd_merge_bvec); 3770 blk_queue_merge_bvec(q, rbd_merge_bvec);
3614 disk->queue = q; 3771 disk->queue = q;
3615 3772
@@ -4924,7 +5081,7 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4924 ret = image_id ? 0 : -ENOMEM; 5081 ret = image_id ? 0 : -ENOMEM;
4925 if (!ret) 5082 if (!ret)
4926 rbd_dev->image_format = 1; 5083 rbd_dev->image_format = 1;
4927 } else if (ret > sizeof (__le32)) { 5084 } else if (ret >= 0) {
4928 void *p = response; 5085 void *p = response;
4929 5086
4930 image_id = ceph_extract_encoded_string(&p, p + ret, 5087 image_id = ceph_extract_encoded_string(&p, p + ret,
@@ -4932,8 +5089,6 @@ static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4932 ret = PTR_ERR_OR_ZERO(image_id); 5089 ret = PTR_ERR_OR_ZERO(image_id);
4933 if (!ret) 5090 if (!ret)
4934 rbd_dev->image_format = 2; 5091 rbd_dev->image_format = 2;
4935 } else {
4936 ret = -EINVAL;
4937 } 5092 }
4938 5093
4939 if (!ret) { 5094 if (!ret) {
@@ -5087,7 +5242,8 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
5087 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); 5242 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
5088 set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); 5243 set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
5089 5244
5090 rbd_dev->rq_wq = alloc_workqueue("%s", 0, 0, rbd_dev->disk->disk_name); 5245 rbd_dev->rq_wq = alloc_workqueue("%s", WQ_MEM_RECLAIM, 0,
5246 rbd_dev->disk->disk_name);
5091 if (!rbd_dev->rq_wq) { 5247 if (!rbd_dev->rq_wq) {
5092 ret = -ENOMEM; 5248 ret = -ENOMEM;
5093 goto err_out_mapping; 5249 goto err_out_mapping;