aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIlya Dryomov <idryomov@gmail.com>2016-07-18 21:50:28 -0400
committerIlya Dryomov <idryomov@gmail.com>2016-07-22 09:17:40 -0400
commit930c532869774ebf8af9efe9484c597f896a7d46 (patch)
tree68072d2a5c42dbf347514b8737f1abd0df044966
parent92d21ac74a9e3c09b0b01c764e530657e4c85c49 (diff)
libceph: apply new_state before new_up_client on incrementals
Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state. A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Josh Durgin <jdurgin@redhat.com>
-rw-r--r--net/ceph/osdmap.c156
1 files changed, 113 insertions, 43 deletions
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 03062bb763b3..7e480bf75bcf 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1261,6 +1261,115 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1261} 1261}
1262 1262
1263/* 1263/*
1264 * Encoding order is (new_up_client, new_state, new_weight). Need to
1265 * apply in the (new_weight, new_state, new_up_client) order, because
1266 * an incremental map may look like e.g.
1267 *
1268 * new_up_client: { osd=6, addr=... } # set osd_state and addr
1269 * new_state: { osd=6, xorstate=EXISTS } # clear osd_state
1270 */
1271static int decode_new_up_state_weight(void **p, void *end,
1272 struct ceph_osdmap *map)
1273{
1274 void *new_up_client;
1275 void *new_state;
1276 void *new_weight_end;
1277 u32 len;
1278
1279 new_up_client = *p;
1280 ceph_decode_32_safe(p, end, len, e_inval);
1281 len *= sizeof(u32) + sizeof(struct ceph_entity_addr);
1282 ceph_decode_need(p, end, len, e_inval);
1283 *p += len;
1284
1285 new_state = *p;
1286 ceph_decode_32_safe(p, end, len, e_inval);
1287 len *= sizeof(u32) + sizeof(u8);
1288 ceph_decode_need(p, end, len, e_inval);
1289 *p += len;
1290
1291 /* new_weight */
1292 ceph_decode_32_safe(p, end, len, e_inval);
1293 while (len--) {
1294 s32 osd;
1295 u32 w;
1296
1297 ceph_decode_need(p, end, 2*sizeof(u32), e_inval);
1298 osd = ceph_decode_32(p);
1299 w = ceph_decode_32(p);
1300 BUG_ON(osd >= map->max_osd);
1301 pr_info("osd%d weight 0x%x %s\n", osd, w,
1302 w == CEPH_OSD_IN ? "(in)" :
1303 (w == CEPH_OSD_OUT ? "(out)" : ""));
1304 map->osd_weight[osd] = w;
1305
1306 /*
1307 * If we are marking in, set the EXISTS, and clear the
1308 * AUTOOUT and NEW bits.
1309 */
1310 if (w) {
1311 map->osd_state[osd] |= CEPH_OSD_EXISTS;
1312 map->osd_state[osd] &= ~(CEPH_OSD_AUTOOUT |
1313 CEPH_OSD_NEW);
1314 }
1315 }
1316 new_weight_end = *p;
1317
1318 /* new_state (up/down) */
1319 *p = new_state;
1320 len = ceph_decode_32(p);
1321 while (len--) {
1322 s32 osd;
1323 u8 xorstate;
1324 int ret;
1325
1326 osd = ceph_decode_32(p);
1327 xorstate = ceph_decode_8(p);
1328 if (xorstate == 0)
1329 xorstate = CEPH_OSD_UP;
1330 BUG_ON(osd >= map->max_osd);
1331 if ((map->osd_state[osd] & CEPH_OSD_UP) &&
1332 (xorstate & CEPH_OSD_UP))
1333 pr_info("osd%d down\n", osd);
1334 if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
1335 (xorstate & CEPH_OSD_EXISTS)) {
1336 pr_info("osd%d does not exist\n", osd);
1337 map->osd_weight[osd] = CEPH_OSD_IN;
1338 ret = set_primary_affinity(map, osd,
1339 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
1340 if (ret)
1341 return ret;
1342 memset(map->osd_addr + osd, 0, sizeof(*map->osd_addr));
1343 map->osd_state[osd] = 0;
1344 } else {
1345 map->osd_state[osd] ^= xorstate;
1346 }
1347 }
1348
1349 /* new_up_client */
1350 *p = new_up_client;
1351 len = ceph_decode_32(p);
1352 while (len--) {
1353 s32 osd;
1354 struct ceph_entity_addr addr;
1355
1356 osd = ceph_decode_32(p);
1357 ceph_decode_copy(p, &addr, sizeof(addr));
1358 ceph_decode_addr(&addr);
1359 BUG_ON(osd >= map->max_osd);
1360 pr_info("osd%d up\n", osd);
1361 map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP;
1362 map->osd_addr[osd] = addr;
1363 }
1364
1365 *p = new_weight_end;
1366 return 0;
1367
1368e_inval:
1369 return -EINVAL;
1370}
1371
1372/*
1264 * decode and apply an incremental map update. 1373 * decode and apply an incremental map update.
1265 */ 1374 */
1266struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, 1375struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
@@ -1358,49 +1467,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
1358 __remove_pg_pool(&map->pg_pools, pi); 1467 __remove_pg_pool(&map->pg_pools, pi);
1359 } 1468 }
1360 1469
1361 /* new_up */ 1470 /* new_up_client, new_state, new_weight */
1362 ceph_decode_32_safe(p, end, len, e_inval); 1471 err = decode_new_up_state_weight(p, end, map);
1363 while (len--) { 1472 if (err)
1364 u32 osd; 1473 goto bad;
1365 struct ceph_entity_addr addr;
1366 ceph_decode_32_safe(p, end, osd, e_inval);
1367 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
1368 ceph_decode_addr(&addr);
1369 pr_info("osd%d up\n", osd);
1370 BUG_ON(osd >= map->max_osd);
1371 map->osd_state[osd] |= CEPH_OSD_UP | CEPH_OSD_EXISTS;
1372 map->osd_addr[osd] = addr;
1373 }
1374
1375 /* new_state */
1376 ceph_decode_32_safe(p, end, len, e_inval);
1377 while (len--) {
1378 u32 osd;
1379 u8 xorstate;
1380 ceph_decode_32_safe(p, end, osd, e_inval);
1381 xorstate = **(u8 **)p;
1382 (*p)++; /* clean flag */
1383 if (xorstate == 0)
1384 xorstate = CEPH_OSD_UP;
1385 if (xorstate & CEPH_OSD_UP)
1386 pr_info("osd%d down\n", osd);
1387 if (osd < map->max_osd)
1388 map->osd_state[osd] ^= xorstate;
1389 }
1390
1391 /* new_weight */
1392 ceph_decode_32_safe(p, end, len, e_inval);
1393 while (len--) {
1394 u32 osd, off;
1395 ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
1396 osd = ceph_decode_32(p);
1397 off = ceph_decode_32(p);
1398 pr_info("osd%d weight 0x%x %s\n", osd, off,
1399 off == CEPH_OSD_IN ? "(in)" :
1400 (off == CEPH_OSD_OUT ? "(out)" : ""));
1401 if (osd < map->max_osd)
1402 map->osd_weight[osd] = off;
1403 }
1404 1474
1405 /* new_pg_temp */ 1475 /* new_pg_temp */
1406 err = decode_new_pg_temp(p, end, map); 1476 err = decode_new_pg_temp(p, end, map);