Diffstat (limited to 'net')
-rw-r--r--  net/bridge/netfilter/ebtables.c            |    5
-rw-r--r--  net/ceph/crush/mapper.c                    |   85
-rw-r--r--  net/ceph/debugfs.c                         |   55
-rw-r--r--  net/ceph/messenger.c                       |    6
-rw-r--r--  net/ceph/osd_client.c                      |   41
-rw-r--r--  net/ceph/osdmap.c                          |  993
-rw-r--r--  net/core/dev.c                             |   14
-rw-r--r--  net/core/ethtool.c                         |    1
-rw-r--r--  net/core/filter.c                          |   32
-rw-r--r--  net/core/flow.c                            |    8
-rw-r--r--  net/core/pktgen.c                          |    2
-rw-r--r--  net/ipv4/netfilter/arp_tables.c            |    6
-rw-r--r--  net/ipv4/netfilter/ip_tables.c             |    6
-rw-r--r--  net/ipv4/route.c                           |    2
-rw-r--r--  net/ipv6/netfilter/ip6_tables.c            |    6
-rw-r--r--  net/iucv/iucv.c                            |  127
-rw-r--r--  net/mac802154/mib.c                        |    1
-rw-r--r--  net/netfilter/nf_tables_api.c              |    7
-rw-r--r--  net/netfilter/xt_cgroup.c                  |    3
-rw-r--r--  net/netfilter/xt_connlimit.c               |   25
-rw-r--r--  net/netfilter/xt_osf.c                     |    2
-rw-r--r--  net/packet/af_packet.c                     |    3
-rw-r--r--  net/sctp/socket.c                          |   36
-rw-r--r--  net/sunrpc/Kconfig                         |   39
-rw-r--r--  net/sunrpc/Makefile                        |    3
-rw-r--r--  net/sunrpc/backchannel_rqst.c              |   93
-rw-r--r--  net/sunrpc/clnt.c                          |   81
-rw-r--r--  net/sunrpc/sched.c                         |    3
-rw-r--r--  net/sunrpc/svcsock.c                       |   16
-rw-r--r--  net/sunrpc/xdr.c                           |   22
-rw-r--r--  net/sunrpc/xprt.c                          |   12
-rw-r--r--  net/sunrpc/xprtrdma/Makefile               |    4
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c             |    4
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_recvfrom.c    |   12
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_sendto.c      |    2
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_transport.c   |    3
-rw-r--r--  net/sunrpc/xprtrdma/transport.c            |   10
-rw-r--r--  net/sunrpc/xprtsock.c                      |   62
-rw-r--r--  net/tipc/net.c                             |    3
-rw-r--r--  net/tipc/socket.c                          |    3
40 files changed, 1251 insertions, 587 deletions
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 0e474b13463b..1059ed3bc255 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1044,10 +1044,9 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
1044 if (repl->num_counters && 1044 if (repl->num_counters &&
1045 copy_to_user(repl->counters, counterstmp, 1045 copy_to_user(repl->counters, counterstmp,
1046 repl->num_counters * sizeof(struct ebt_counter))) { 1046 repl->num_counters * sizeof(struct ebt_counter))) {
1047 ret = -EFAULT; 1047 /* Silent error, can't fail, new table is already in place */
1048 net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n");
1048 } 1049 }
1049 else
1050 ret = 0;
1051 1050
1052 /* decrease module count and free resources */ 1051 /* decrease module count and free resources */
1053 EBT_ENTRY_ITERATE(table->entries, table->entries_size, 1052 EBT_ENTRY_ITERATE(table->entries, table->entries_size,
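A minimal sketch of the pattern this hunk adopts, with an illustrative helper name that is not part of the patch: once the replacement table is live, a failure to copy the counters back to user space is logged with a ratelimited warning instead of being turned into an error return, because the kernel-side change can no longer be undone.

        /* illustrative only: report-but-continue after the commit point */
        static int copy_counters_back(struct ebt_replace *repl,
                                      struct ebt_counter *counterstmp)
        {
                if (repl->num_counters &&
                    copy_to_user(repl->counters, counterstmp,
                                 repl->num_counters * sizeof(struct ebt_counter)))
                        /* silent error: the new table is already in place */
                        net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n");
                return 0;
        }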
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index b703790b4e44..a1ef53c04415 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -292,10 +292,12 @@ static int is_out(const struct crush_map *map,
292 * @outpos: our position in that vector 292 * @outpos: our position in that vector
293 * @tries: number of attempts to make 293 * @tries: number of attempts to make
294 * @recurse_tries: number of attempts to have recursive chooseleaf make 294 * @recurse_tries: number of attempts to have recursive chooseleaf make
295 * @local_tries: localized retries 295 * @local_retries: localized retries
296 * @local_fallback_tries: localized fallback retries 296 * @local_fallback_retries: localized fallback retries
297 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) 297 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
298 * @vary_r: pass r to recursive calls
298 * @out2: second output vector for leaf items (if @recurse_to_leaf) 299 * @out2: second output vector for leaf items (if @recurse_to_leaf)
300 * @parent_r: r value passed from the parent
299 */ 301 */
300static int crush_choose_firstn(const struct crush_map *map, 302static int crush_choose_firstn(const struct crush_map *map,
301 struct crush_bucket *bucket, 303 struct crush_bucket *bucket,
@@ -304,10 +306,12 @@ static int crush_choose_firstn(const struct crush_map *map,
304 int *out, int outpos, 306 int *out, int outpos,
305 unsigned int tries, 307 unsigned int tries,
306 unsigned int recurse_tries, 308 unsigned int recurse_tries,
307 unsigned int local_tries, 309 unsigned int local_retries,
308 unsigned int local_fallback_tries, 310 unsigned int local_fallback_retries,
309 int recurse_to_leaf, 311 int recurse_to_leaf,
310 int *out2) 312 unsigned int vary_r,
313 int *out2,
314 int parent_r)
311{ 315{
312 int rep; 316 int rep;
313 unsigned int ftotal, flocal; 317 unsigned int ftotal, flocal;
@@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map,
319 int itemtype; 323 int itemtype;
320 int collide, reject; 324 int collide, reject;
321 325
322 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", 326 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
323 bucket->id, x, outpos, numrep); 327 recurse_to_leaf ? "_LEAF" : "",
328 bucket->id, x, outpos, numrep,
329 tries, recurse_tries, local_retries, local_fallback_retries,
330 parent_r);
324 331
325 for (rep = outpos; rep < numrep; rep++) { 332 for (rep = outpos; rep < numrep; rep++) {
326 /* keep trying until we get a non-out, non-colliding item */ 333 /* keep trying until we get a non-out, non-colliding item */
@@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map,
335 do { 342 do {
336 collide = 0; 343 collide = 0;
337 retry_bucket = 0; 344 retry_bucket = 0;
338 r = rep; 345 r = rep + parent_r;
339 /* r' = r + f_total */ 346 /* r' = r + f_total */
340 r += ftotal; 347 r += ftotal;
341 348
@@ -344,9 +351,9 @@ static int crush_choose_firstn(const struct crush_map *map,
344 reject = 1; 351 reject = 1;
345 goto reject; 352 goto reject;
346 } 353 }
347 if (local_fallback_tries > 0 && 354 if (local_fallback_retries > 0 &&
348 flocal >= (in->size>>1) && 355 flocal >= (in->size>>1) &&
349 flocal > local_fallback_tries) 356 flocal > local_fallback_retries)
350 item = bucket_perm_choose(in, x, r); 357 item = bucket_perm_choose(in, x, r);
351 else 358 else
352 item = crush_bucket_choose(in, x, r); 359 item = crush_bucket_choose(in, x, r);
@@ -387,16 +394,23 @@ static int crush_choose_firstn(const struct crush_map *map,
387 reject = 0; 394 reject = 0;
388 if (!collide && recurse_to_leaf) { 395 if (!collide && recurse_to_leaf) {
389 if (item < 0) { 396 if (item < 0) {
397 int sub_r;
398 if (vary_r)
399 sub_r = r >> (vary_r-1);
400 else
401 sub_r = 0;
390 if (crush_choose_firstn(map, 402 if (crush_choose_firstn(map,
391 map->buckets[-1-item], 403 map->buckets[-1-item],
392 weight, weight_max, 404 weight, weight_max,
393 x, outpos+1, 0, 405 x, outpos+1, 0,
394 out2, outpos, 406 out2, outpos,
395 recurse_tries, 0, 407 recurse_tries, 0,
396 local_tries, 408 local_retries,
397 local_fallback_tries, 409 local_fallback_retries,
398 0, 410 0,
399 NULL) <= outpos) 411 vary_r,
412 NULL,
413 sub_r) <= outpos)
400 /* didn't get leaf */ 414 /* didn't get leaf */
401 reject = 1; 415 reject = 1;
402 } else { 416 } else {
@@ -420,14 +434,14 @@ reject:
420 ftotal++; 434 ftotal++;
421 flocal++; 435 flocal++;
422 436
423 if (collide && flocal <= local_tries) 437 if (collide && flocal <= local_retries)
424 /* retry locally a few times */ 438 /* retry locally a few times */
425 retry_bucket = 1; 439 retry_bucket = 1;
426 else if (local_fallback_tries > 0 && 440 else if (local_fallback_retries > 0 &&
427 flocal <= in->size + local_fallback_tries) 441 flocal <= in->size + local_fallback_retries)
428 /* exhaustive bucket search */ 442 /* exhaustive bucket search */
429 retry_bucket = 1; 443 retry_bucket = 1;
430 else if (ftotal <= tries) 444 else if (ftotal < tries)
431 /* then retry descent */ 445 /* then retry descent */
432 retry_descent = 1; 446 retry_descent = 1;
433 else 447 else
@@ -640,10 +654,20 @@ int crush_do_rule(const struct crush_map *map,
640 __u32 step; 654 __u32 step;
641 int i, j; 655 int i, j;
642 int numrep; 656 int numrep;
643 int choose_tries = map->choose_total_tries; 657 /*
644 int choose_local_tries = map->choose_local_tries; 658 * the original choose_total_tries value was off by one (it
645 int choose_local_fallback_tries = map->choose_local_fallback_tries; 659 * counted "retries" and not "tries"). add one.
660 */
661 int choose_tries = map->choose_total_tries + 1;
646 int choose_leaf_tries = 0; 662 int choose_leaf_tries = 0;
663 /*
664 * the local tries values were counted as "retries", though,
665 * and need no adjustment
666 */
667 int choose_local_retries = map->choose_local_tries;
668 int choose_local_fallback_retries = map->choose_local_fallback_tries;
669
670 int vary_r = map->chooseleaf_vary_r;
647 671
648 if ((__u32)ruleno >= map->max_rules) { 672 if ((__u32)ruleno >= map->max_rules) {
649 dprintk(" bad ruleno %d\n", ruleno); 673 dprintk(" bad ruleno %d\n", ruleno);
@@ -676,13 +700,18 @@ int crush_do_rule(const struct crush_map *map,
676 break; 700 break;
677 701
678 case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: 702 case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
679 if (curstep->arg1 > 0) 703 if (curstep->arg1 >= 0)
680 choose_local_tries = curstep->arg1; 704 choose_local_retries = curstep->arg1;
681 break; 705 break;
682 706
683 case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: 707 case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
684 if (curstep->arg1 > 0) 708 if (curstep->arg1 >= 0)
685 choose_local_fallback_tries = curstep->arg1; 709 choose_local_fallback_retries = curstep->arg1;
710 break;
711
712 case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
713 if (curstep->arg1 >= 0)
714 vary_r = curstep->arg1;
686 break; 715 break;
687 716
688 case CRUSH_RULE_CHOOSELEAF_FIRSTN: 717 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
@@ -734,10 +763,12 @@ int crush_do_rule(const struct crush_map *map,
734 o+osize, j, 763 o+osize, j,
735 choose_tries, 764 choose_tries,
736 recurse_tries, 765 recurse_tries,
737 choose_local_tries, 766 choose_local_retries,
738 choose_local_fallback_tries, 767 choose_local_fallback_retries,
739 recurse_to_leaf, 768 recurse_to_leaf,
740 c+osize); 769 vary_r,
770 c+osize,
771 0);
741 } else { 772 } else {
742 crush_choose_indep( 773 crush_choose_indep(
743 map, 774 map,
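A hedged sketch of the retry accounting after this change (the helper functions below are illustrative; only the arithmetic mirrors the hunks above): total descent attempts are now counted as "tries", so the legacy choose_total_tries value gets +1 and the loop condition becomes ftotal < tries, while the local values keep their "retry" semantics. When vary_r is set, the recursive chooseleaf call is fed a shifted r instead of 0 so replicas are less likely to collapse onto the same leaf.

        /* illustrative helpers; logic mirrors crush_choose_firstn() above */
        static inline int chooseleaf_sub_r(int r, unsigned int vary_r)
        {
                return vary_r ? r >> (vary_r - 1) : 0;  /* 0 == legacy behaviour */
        }

        static inline int descent_exhausted(unsigned int ftotal, unsigned int tries)
        {
                return ftotal >= tries;  /* counted as "tries", hence the +1 above */
        }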
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 258a382e75ed..10421a4b76f8 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -53,34 +53,55 @@ static int osdmap_show(struct seq_file *s, void *p)
53{ 53{
54 int i; 54 int i;
55 struct ceph_client *client = s->private; 55 struct ceph_client *client = s->private;
56 struct ceph_osdmap *map = client->osdc.osdmap;
56 struct rb_node *n; 57 struct rb_node *n;
57 58
58 if (client->osdc.osdmap == NULL) 59 if (map == NULL)
59 return 0; 60 return 0;
60 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); 61
62 seq_printf(s, "epoch %d\n", map->epoch);
61 seq_printf(s, "flags%s%s\n", 63 seq_printf(s, "flags%s%s\n",
62 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? 64 (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
63 " NEARFULL" : "", 65 (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
64 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? 66
65 " FULL" : ""); 67 for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
66 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
67 struct ceph_pg_pool_info *pool = 68 struct ceph_pg_pool_info *pool =
68 rb_entry(n, struct ceph_pg_pool_info, node); 69 rb_entry(n, struct ceph_pg_pool_info, node);
69 seq_printf(s, "pg_pool %llu pg_num %d / %d\n", 70
70 (unsigned long long)pool->id, pool->pg_num, 71 seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
71 pool->pg_num_mask); 72 pool->id, pool->pg_num, pool->pg_num_mask,
73 pool->read_tier, pool->write_tier);
72 } 74 }
73 for (i = 0; i < client->osdc.osdmap->max_osd; i++) { 75 for (i = 0; i < map->max_osd; i++) {
74 struct ceph_entity_addr *addr = 76 struct ceph_entity_addr *addr = &map->osd_addr[i];
75 &client->osdc.osdmap->osd_addr[i]; 77 int state = map->osd_state[i];
76 int state = client->osdc.osdmap->osd_state[i];
77 char sb[64]; 78 char sb[64];
78 79
79 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", 80 seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
80 i, ceph_pr_addr(&addr->in_addr), 81 i, ceph_pr_addr(&addr->in_addr),
81 ((client->osdc.osdmap->osd_weight[i]*100) >> 16), 82 ((map->osd_weight[i]*100) >> 16),
82 ceph_osdmap_state_str(sb, sizeof(sb), state)); 83 ceph_osdmap_state_str(sb, sizeof(sb), state),
84 ((ceph_get_primary_affinity(map, i)*100) >> 16));
85 }
86 for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) {
87 struct ceph_pg_mapping *pg =
88 rb_entry(n, struct ceph_pg_mapping, node);
89
90 seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool,
91 pg->pgid.seed);
92 for (i = 0; i < pg->pg_temp.len; i++)
93 seq_printf(s, "%s%d", (i == 0 ? "" : ","),
94 pg->pg_temp.osds[i]);
95 seq_printf(s, "]\n");
83 } 96 }
97 for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) {
98 struct ceph_pg_mapping *pg =
99 rb_entry(n, struct ceph_pg_mapping, node);
100
101 seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool,
102 pg->pgid.seed, pg->primary_temp.osd);
103 }
104
84 return 0; 105 return 0;
85} 106}
86 107
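For orientation, a hypothetical osdmap_show dump in the new layout; every value below is invented, only the format follows the seq_printf calls above (per-pool tiering info, a per-osd primary-affinity column, and the pg_temp/primary_temp listings):

        epoch 105
        flags NEARFULL
        pool 1 pg_num 64 (63) read_tier -1 write_tier -1
        osd0    192.168.0.10:6800       100%    (exists, up)    100%
        osd1    192.168.0.11:6800        66%    (exists, up)     50%
        pg_temp 1.2a [1,0]
        primary_temp 1.2a 1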
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 30efc5c18622..4f55f9ce63fa 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -919,6 +919,9 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
919 if (!bytes || cursor->page_offset) 919 if (!bytes || cursor->page_offset)
920 return false; /* more bytes to process in the current page */ 920 return false; /* more bytes to process in the current page */
921 921
922 if (!cursor->resid)
923 return false; /* no more data */
924
922 /* Move on to the next page; offset is already at 0 */ 925 /* Move on to the next page; offset is already at 0 */
923 926
924 BUG_ON(cursor->page_index >= cursor->page_count); 927 BUG_ON(cursor->page_index >= cursor->page_count);
@@ -1004,6 +1007,9 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
1004 if (!bytes || cursor->offset & ~PAGE_MASK) 1007 if (!bytes || cursor->offset & ~PAGE_MASK)
1005 return false; /* more bytes to process in the current page */ 1008 return false; /* more bytes to process in the current page */
1006 1009
1010 if (!cursor->resid)
1011 return false; /* no more data */
1012
1007 /* Move on to the next page */ 1013 /* Move on to the next page */
1008 1014
1009 BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); 1015 BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 82750f915865..b0dfce77656a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
436 case CEPH_OSD_OP_OMAPCLEAR: 436 case CEPH_OSD_OP_OMAPCLEAR:
437 case CEPH_OSD_OP_OMAPRMKEYS: 437 case CEPH_OSD_OP_OMAPRMKEYS:
438 case CEPH_OSD_OP_OMAP_CMP: 438 case CEPH_OSD_OP_OMAP_CMP:
439 case CEPH_OSD_OP_SETALLOCHINT:
439 case CEPH_OSD_OP_CLONERANGE: 440 case CEPH_OSD_OP_CLONERANGE:
440 case CEPH_OSD_OP_ASSERT_SRC_VERSION: 441 case CEPH_OSD_OP_ASSERT_SRC_VERSION:
441 case CEPH_OSD_OP_SRC_CMPXATTR: 442 case CEPH_OSD_OP_SRC_CMPXATTR:
@@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
591} 592}
592EXPORT_SYMBOL(osd_req_op_watch_init); 593EXPORT_SYMBOL(osd_req_op_watch_init);
593 594
595void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
596 unsigned int which,
597 u64 expected_object_size,
598 u64 expected_write_size)
599{
600 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
601 CEPH_OSD_OP_SETALLOCHINT);
602
603 op->alloc_hint.expected_object_size = expected_object_size;
604 op->alloc_hint.expected_write_size = expected_write_size;
605
606 /*
607 * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
608 * not worth a feature bit. Set FAILOK per-op flag to make
609 * sure older osds don't trip over an unsupported opcode.
610 */
611 op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
612}
613EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
614
594static void ceph_osdc_msg_data_add(struct ceph_msg *msg, 615static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
595 struct ceph_osd_data *osd_data) 616 struct ceph_osd_data *osd_data)
596{ 617{
@@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
681 dst->watch.ver = cpu_to_le64(src->watch.ver); 702 dst->watch.ver = cpu_to_le64(src->watch.ver);
682 dst->watch.flag = src->watch.flag; 703 dst->watch.flag = src->watch.flag;
683 break; 704 break;
705 case CEPH_OSD_OP_SETALLOCHINT:
706 dst->alloc_hint.expected_object_size =
707 cpu_to_le64(src->alloc_hint.expected_object_size);
708 dst->alloc_hint.expected_write_size =
709 cpu_to_le64(src->alloc_hint.expected_write_size);
710 break;
684 default: 711 default:
685 pr_err("unsupported osd opcode %s\n", 712 pr_err("unsupported osd opcode %s\n",
686 ceph_osd_op_name(src->op)); 713 ceph_osd_op_name(src->op));
@@ -688,7 +715,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
688 715
689 return 0; 716 return 0;
690 } 717 }
718
691 dst->op = cpu_to_le16(src->op); 719 dst->op = cpu_to_le16(src->op);
720 dst->flags = cpu_to_le32(src->flags);
692 dst->payload_len = cpu_to_le32(src->payload_len); 721 dst->payload_len = cpu_to_le32(src->payload_len);
693 722
694 return request_data_len; 723 return request_data_len;
@@ -1304,7 +1333,7 @@ static int __map_request(struct ceph_osd_client *osdc,
1304{ 1333{
1305 struct ceph_pg pgid; 1334 struct ceph_pg pgid;
1306 int acting[CEPH_PG_MAX_SIZE]; 1335 int acting[CEPH_PG_MAX_SIZE];
1307 int o = -1, num = 0; 1336 int num, o;
1308 int err; 1337 int err;
1309 bool was_paused; 1338 bool was_paused;
1310 1339
@@ -1317,11 +1346,9 @@ static int __map_request(struct ceph_osd_client *osdc,
1317 } 1346 }
1318 req->r_pgid = pgid; 1347 req->r_pgid = pgid;
1319 1348
1320 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); 1349 num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
1321 if (err > 0) { 1350 if (num < 0)
1322 o = acting[0]; 1351 num = 0;
1323 num = err;
1324 }
1325 1352
1326 was_paused = req->r_paused; 1353 was_paused = req->r_paused;
1327 req->r_paused = __req_should_be_paused(osdc, req); 1354 req->r_paused = __req_should_be_paused(osdc, req);
@@ -2033,7 +2060,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2033 int skipped_map = 0; 2060 int skipped_map = 0;
2034 2061
2035 dout("taking full map %u len %d\n", epoch, maplen); 2062 dout("taking full map %u len %d\n", epoch, maplen);
2036 newmap = osdmap_decode(&p, p+maplen); 2063 newmap = ceph_osdmap_decode(&p, p+maplen);
2037 if (IS_ERR(newmap)) { 2064 if (IS_ERR(newmap)) {
2038 err = PTR_ERR(newmap); 2065 err = PTR_ERR(newmap);
2039 goto bad; 2066 goto bad;
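A hedged usage sketch for the new helper; the surrounding request setup is illustrative (the expected caller is rbd, which is outside this diff), only the exported function and its argument order come from the hunk above:

        /* illustrative: advise the OSD about sizing before issuing a write */
        static void setup_write_with_hint(struct ceph_osd_request *req,
                                          u64 object_size)
        {
                /* op 0: CEPH_OSD_OP_SETALLOCHINT, marked FAILOK internally */
                osd_req_op_alloc_hint_init(req, 0, object_size, object_size);
                /* the actual write op(s) would follow as op 1 onwards */
        }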
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index aade4a5c1c07..e632b5a52f5b 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -343,7 +343,7 @@ bad:
343 343
344/* 344/*
345 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid 345 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
346 * to a set of osds) 346 * to a set of osds) and primary_temp (explicit primary setting)
347 */ 347 */
348static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) 348static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
349{ 349{
@@ -506,7 +506,7 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
506 kfree(pi); 506 kfree(pi);
507} 507}
508 508
509static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) 509static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
510{ 510{
511 u8 ev, cv; 511 u8 ev, cv;
512 unsigned len, num; 512 unsigned len, num;
@@ -587,7 +587,7 @@ bad:
587 return -EINVAL; 587 return -EINVAL;
588} 588}
589 589
590static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 590static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
591{ 591{
592 struct ceph_pg_pool_info *pi; 592 struct ceph_pg_pool_info *pi;
593 u32 num, len; 593 u32 num, len;
@@ -633,6 +633,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
633 rb_erase(&pg->node, &map->pg_temp); 633 rb_erase(&pg->node, &map->pg_temp);
634 kfree(pg); 634 kfree(pg);
635 } 635 }
636 while (!RB_EMPTY_ROOT(&map->primary_temp)) {
637 struct ceph_pg_mapping *pg =
638 rb_entry(rb_first(&map->primary_temp),
639 struct ceph_pg_mapping, node);
640 rb_erase(&pg->node, &map->primary_temp);
641 kfree(pg);
642 }
636 while (!RB_EMPTY_ROOT(&map->pg_pools)) { 643 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
637 struct ceph_pg_pool_info *pi = 644 struct ceph_pg_pool_info *pi =
638 rb_entry(rb_first(&map->pg_pools), 645 rb_entry(rb_first(&map->pg_pools),
@@ -642,186 +649,516 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
642 kfree(map->osd_state); 649 kfree(map->osd_state);
643 kfree(map->osd_weight); 650 kfree(map->osd_weight);
644 kfree(map->osd_addr); 651 kfree(map->osd_addr);
652 kfree(map->osd_primary_affinity);
645 kfree(map); 653 kfree(map);
646} 654}
647 655
648/* 656/*
649 * adjust max osd value. reallocate arrays. 657 * Adjust max_osd value, (re)allocate arrays.
658 *
659 * The new elements are properly initialized.
650 */ 660 */
651static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) 661static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
652{ 662{
653 u8 *state; 663 u8 *state;
654 struct ceph_entity_addr *addr;
655 u32 *weight; 664 u32 *weight;
665 struct ceph_entity_addr *addr;
666 int i;
656 667
657 state = kcalloc(max, sizeof(*state), GFP_NOFS); 668 state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
658 addr = kcalloc(max, sizeof(*addr), GFP_NOFS); 669 weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
659 weight = kcalloc(max, sizeof(*weight), GFP_NOFS); 670 addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
660 if (state == NULL || addr == NULL || weight == NULL) { 671 if (!state || !weight || !addr) {
661 kfree(state); 672 kfree(state);
662 kfree(addr);
663 kfree(weight); 673 kfree(weight);
674 kfree(addr);
675
664 return -ENOMEM; 676 return -ENOMEM;
665 } 677 }
666 678
667 /* copy old? */ 679 for (i = map->max_osd; i < max; i++) {
668 if (map->osd_state) { 680 state[i] = 0;
669 memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); 681 weight[i] = CEPH_OSD_OUT;
670 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); 682 memset(addr + i, 0, sizeof(*addr));
671 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
672 kfree(map->osd_state);
673 kfree(map->osd_addr);
674 kfree(map->osd_weight);
675 } 683 }
676 684
677 map->osd_state = state; 685 map->osd_state = state;
678 map->osd_weight = weight; 686 map->osd_weight = weight;
679 map->osd_addr = addr; 687 map->osd_addr = addr;
688
689 if (map->osd_primary_affinity) {
690 u32 *affinity;
691
692 affinity = krealloc(map->osd_primary_affinity,
693 max*sizeof(*affinity), GFP_NOFS);
694 if (!affinity)
695 return -ENOMEM;
696
697 for (i = map->max_osd; i < max; i++)
698 affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
699
700 map->osd_primary_affinity = affinity;
701 }
702
680 map->max_osd = max; 703 map->max_osd = max;
704
681 return 0; 705 return 0;
682} 706}
683 707
708#define OSDMAP_WRAPPER_COMPAT_VER 7
709#define OSDMAP_CLIENT_DATA_COMPAT_VER 1
710
684/* 711/*
685 * decode a full map. 712 * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps,
713 * to struct_v of the client_data section for new (v7 and above)
714 * osdmaps.
686 */ 715 */
687struct ceph_osdmap *osdmap_decode(void **p, void *end) 716static int get_osdmap_client_data_v(void **p, void *end,
717 const char *prefix, u8 *v)
688{ 718{
689 struct ceph_osdmap *map; 719 u8 struct_v;
690 u16 version; 720
691 u32 len, max, i; 721 ceph_decode_8_safe(p, end, struct_v, e_inval);
692 int err = -EINVAL; 722 if (struct_v >= 7) {
693 void *start = *p; 723 u8 struct_compat;
694 struct ceph_pg_pool_info *pi; 724
725 ceph_decode_8_safe(p, end, struct_compat, e_inval);
726 if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
727 pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n",
728 struct_v, struct_compat,
729 OSDMAP_WRAPPER_COMPAT_VER, prefix);
730 return -EINVAL;
731 }
732 *p += 4; /* ignore wrapper struct_len */
733
734 ceph_decode_8_safe(p, end, struct_v, e_inval);
735 ceph_decode_8_safe(p, end, struct_compat, e_inval);
736 if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
737 pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n",
738 struct_v, struct_compat,
739 OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
740 return -EINVAL;
741 }
742 *p += 4; /* ignore client data struct_len */
743 } else {
744 u16 version;
745
746 *p -= 1;
747 ceph_decode_16_safe(p, end, version, e_inval);
748 if (version < 6) {
749 pr_warning("got v %d < 6 of %s ceph_osdmap\n", version,
750 prefix);
751 return -EINVAL;
752 }
695 753
696 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); 754 /* old osdmap enconding */
755 struct_v = 0;
756 }
697 757
698 map = kzalloc(sizeof(*map), GFP_NOFS); 758 *v = struct_v;
699 if (map == NULL) 759 return 0;
700 return ERR_PTR(-ENOMEM);
701 map->pg_temp = RB_ROOT;
702 760
703 ceph_decode_16_safe(p, end, version, bad); 761e_inval:
704 if (version > 6) { 762 return -EINVAL;
705 pr_warning("got unknown v %d > 6 of osdmap\n", version); 763}
706 goto bad; 764
765static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
766 bool incremental)
767{
768 u32 n;
769
770 ceph_decode_32_safe(p, end, n, e_inval);
771 while (n--) {
772 struct ceph_pg_pool_info *pi;
773 u64 pool;
774 int ret;
775
776 ceph_decode_64_safe(p, end, pool, e_inval);
777
778 pi = __lookup_pg_pool(&map->pg_pools, pool);
779 if (!incremental || !pi) {
780 pi = kzalloc(sizeof(*pi), GFP_NOFS);
781 if (!pi)
782 return -ENOMEM;
783
784 pi->id = pool;
785
786 ret = __insert_pg_pool(&map->pg_pools, pi);
787 if (ret) {
788 kfree(pi);
789 return ret;
790 }
791 }
792
793 ret = decode_pool(p, end, pi);
794 if (ret)
795 return ret;
707 } 796 }
708 if (version < 6) { 797
709 pr_warning("got old v %d < 6 of osdmap\n", version); 798 return 0;
710 goto bad; 799
800e_inval:
801 return -EINVAL;
802}
803
804static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
805{
806 return __decode_pools(p, end, map, false);
807}
808
809static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
810{
811 return __decode_pools(p, end, map, true);
812}
813
814static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map,
815 bool incremental)
816{
817 u32 n;
818
819 ceph_decode_32_safe(p, end, n, e_inval);
820 while (n--) {
821 struct ceph_pg pgid;
822 u32 len, i;
823 int ret;
824
825 ret = ceph_decode_pgid(p, end, &pgid);
826 if (ret)
827 return ret;
828
829 ceph_decode_32_safe(p, end, len, e_inval);
830
831 ret = __remove_pg_mapping(&map->pg_temp, pgid);
832 BUG_ON(!incremental && ret != -ENOENT);
833
834 if (!incremental || len > 0) {
835 struct ceph_pg_mapping *pg;
836
837 ceph_decode_need(p, end, len*sizeof(u32), e_inval);
838
839 if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
840 return -EINVAL;
841
842 pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS);
843 if (!pg)
844 return -ENOMEM;
845
846 pg->pgid = pgid;
847 pg->pg_temp.len = len;
848 for (i = 0; i < len; i++)
849 pg->pg_temp.osds[i] = ceph_decode_32(p);
850
851 ret = __insert_pg_mapping(pg, &map->pg_temp);
852 if (ret) {
853 kfree(pg);
854 return ret;
855 }
856 }
711 } 857 }
712 858
713 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); 859 return 0;
860
861e_inval:
862 return -EINVAL;
863}
864
865static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
866{
867 return __decode_pg_temp(p, end, map, false);
868}
869
870static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
871{
872 return __decode_pg_temp(p, end, map, true);
873}
874
875static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
876 bool incremental)
877{
878 u32 n;
879
880 ceph_decode_32_safe(p, end, n, e_inval);
881 while (n--) {
882 struct ceph_pg pgid;
883 u32 osd;
884 int ret;
885
886 ret = ceph_decode_pgid(p, end, &pgid);
887 if (ret)
888 return ret;
889
890 ceph_decode_32_safe(p, end, osd, e_inval);
891
892 ret = __remove_pg_mapping(&map->primary_temp, pgid);
893 BUG_ON(!incremental && ret != -ENOENT);
894
895 if (!incremental || osd != (u32)-1) {
896 struct ceph_pg_mapping *pg;
897
898 pg = kzalloc(sizeof(*pg), GFP_NOFS);
899 if (!pg)
900 return -ENOMEM;
901
902 pg->pgid = pgid;
903 pg->primary_temp.osd = osd;
904
905 ret = __insert_pg_mapping(pg, &map->primary_temp);
906 if (ret) {
907 kfree(pg);
908 return ret;
909 }
910 }
911 }
912
913 return 0;
914
915e_inval:
916 return -EINVAL;
917}
918
919static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
920{
921 return __decode_primary_temp(p, end, map, false);
922}
923
924static int decode_new_primary_temp(void **p, void *end,
925 struct ceph_osdmap *map)
926{
927 return __decode_primary_temp(p, end, map, true);
928}
929
930u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
931{
932 BUG_ON(osd >= map->max_osd);
933
934 if (!map->osd_primary_affinity)
935 return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
936
937 return map->osd_primary_affinity[osd];
938}
939
940static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
941{
942 BUG_ON(osd >= map->max_osd);
943
944 if (!map->osd_primary_affinity) {
945 int i;
946
947 map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32),
948 GFP_NOFS);
949 if (!map->osd_primary_affinity)
950 return -ENOMEM;
951
952 for (i = 0; i < map->max_osd; i++)
953 map->osd_primary_affinity[i] =
954 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
955 }
956
957 map->osd_primary_affinity[osd] = aff;
958
959 return 0;
960}
961
962static int decode_primary_affinity(void **p, void *end,
963 struct ceph_osdmap *map)
964{
965 u32 len, i;
966
967 ceph_decode_32_safe(p, end, len, e_inval);
968 if (len == 0) {
969 kfree(map->osd_primary_affinity);
970 map->osd_primary_affinity = NULL;
971 return 0;
972 }
973 if (len != map->max_osd)
974 goto e_inval;
975
976 ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
977
978 for (i = 0; i < map->max_osd; i++) {
979 int ret;
980
981 ret = set_primary_affinity(map, i, ceph_decode_32(p));
982 if (ret)
983 return ret;
984 }
985
986 return 0;
987
988e_inval:
989 return -EINVAL;
990}
991
992static int decode_new_primary_affinity(void **p, void *end,
993 struct ceph_osdmap *map)
994{
995 u32 n;
996
997 ceph_decode_32_safe(p, end, n, e_inval);
998 while (n--) {
999 u32 osd, aff;
1000 int ret;
1001
1002 ceph_decode_32_safe(p, end, osd, e_inval);
1003 ceph_decode_32_safe(p, end, aff, e_inval);
1004
1005 ret = set_primary_affinity(map, osd, aff);
1006 if (ret)
1007 return ret;
1008
1009 pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
1010 }
1011
1012 return 0;
1013
1014e_inval:
1015 return -EINVAL;
1016}
1017
1018/*
1019 * decode a full map.
1020 */
1021static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
1022{
1023 u8 struct_v;
1024 u32 epoch = 0;
1025 void *start = *p;
1026 u32 max;
1027 u32 len, i;
1028 int err;
1029
1030 dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
1031
1032 err = get_osdmap_client_data_v(p, end, "full", &struct_v);
1033 if (err)
1034 goto bad;
1035
1036 /* fsid, epoch, created, modified */
1037 ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
1038 sizeof(map->created) + sizeof(map->modified), e_inval);
714 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); 1039 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
715 map->epoch = ceph_decode_32(p); 1040 epoch = map->epoch = ceph_decode_32(p);
716 ceph_decode_copy(p, &map->created, sizeof(map->created)); 1041 ceph_decode_copy(p, &map->created, sizeof(map->created));
717 ceph_decode_copy(p, &map->modified, sizeof(map->modified)); 1042 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
718 1043
719 ceph_decode_32_safe(p, end, max, bad); 1044 /* pools */
720 while (max--) { 1045 err = decode_pools(p, end, map);
721 ceph_decode_need(p, end, 8 + 2, bad); 1046 if (err)
722 err = -ENOMEM; 1047 goto bad;
723 pi = kzalloc(sizeof(*pi), GFP_NOFS);
724 if (!pi)
725 goto bad;
726 pi->id = ceph_decode_64(p);
727 err = __decode_pool(p, end, pi);
728 if (err < 0) {
729 kfree(pi);
730 goto bad;
731 }
732 __insert_pg_pool(&map->pg_pools, pi);
733 }
734 1048
735 err = __decode_pool_names(p, end, map); 1049 /* pool_name */
736 if (err < 0) { 1050 err = decode_pool_names(p, end, map);
737 dout("fail to decode pool names"); 1051 if (err)
738 goto bad; 1052 goto bad;
739 }
740 1053
741 ceph_decode_32_safe(p, end, map->pool_max, bad); 1054 ceph_decode_32_safe(p, end, map->pool_max, e_inval);
742 1055
743 ceph_decode_32_safe(p, end, map->flags, bad); 1056 ceph_decode_32_safe(p, end, map->flags, e_inval);
744 1057
745 max = ceph_decode_32(p); 1058 /* max_osd */
1059 ceph_decode_32_safe(p, end, max, e_inval);
746 1060
747 /* (re)alloc osd arrays */ 1061 /* (re)alloc osd arrays */
748 err = osdmap_set_max_osd(map, max); 1062 err = osdmap_set_max_osd(map, max);
749 if (err < 0) 1063 if (err)
750 goto bad; 1064 goto bad;
751 dout("osdmap_decode max_osd = %d\n", map->max_osd);
752 1065
753 /* osds */ 1066 /* osd_state, osd_weight, osd_addrs->client_addr */
754 err = -EINVAL;
755 ceph_decode_need(p, end, 3*sizeof(u32) + 1067 ceph_decode_need(p, end, 3*sizeof(u32) +
756 map->max_osd*(1 + sizeof(*map->osd_weight) + 1068 map->max_osd*(1 + sizeof(*map->osd_weight) +
757 sizeof(*map->osd_addr)), bad); 1069 sizeof(*map->osd_addr)), e_inval);
758 *p += 4; /* skip length field (should match max) */ 1070
1071 if (ceph_decode_32(p) != map->max_osd)
1072 goto e_inval;
1073
759 ceph_decode_copy(p, map->osd_state, map->max_osd); 1074 ceph_decode_copy(p, map->osd_state, map->max_osd);
760 1075
761 *p += 4; /* skip length field (should match max) */ 1076 if (ceph_decode_32(p) != map->max_osd)
1077 goto e_inval;
1078
762 for (i = 0; i < map->max_osd; i++) 1079 for (i = 0; i < map->max_osd; i++)
763 map->osd_weight[i] = ceph_decode_32(p); 1080 map->osd_weight[i] = ceph_decode_32(p);
764 1081
765 *p += 4; /* skip length field (should match max) */ 1082 if (ceph_decode_32(p) != map->max_osd)
1083 goto e_inval;
1084
766 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); 1085 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
767 for (i = 0; i < map->max_osd; i++) 1086 for (i = 0; i < map->max_osd; i++)
768 ceph_decode_addr(&map->osd_addr[i]); 1087 ceph_decode_addr(&map->osd_addr[i]);
769 1088
770 /* pg_temp */ 1089 /* pg_temp */
771 ceph_decode_32_safe(p, end, len, bad); 1090 err = decode_pg_temp(p, end, map);
772 for (i = 0; i < len; i++) { 1091 if (err)
773 int n, j; 1092 goto bad;
774 struct ceph_pg pgid;
775 struct ceph_pg_mapping *pg;
776 1093
777 err = ceph_decode_pgid(p, end, &pgid); 1094 /* primary_temp */
1095 if (struct_v >= 1) {
1096 err = decode_primary_temp(p, end, map);
778 if (err) 1097 if (err)
779 goto bad; 1098 goto bad;
780 ceph_decode_need(p, end, sizeof(u32), bad); 1099 }
781 n = ceph_decode_32(p);
782 err = -EINVAL;
783 if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
784 goto bad;
785 ceph_decode_need(p, end, n * sizeof(u32), bad);
786 err = -ENOMEM;
787 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
788 if (!pg)
789 goto bad;
790 pg->pgid = pgid;
791 pg->len = n;
792 for (j = 0; j < n; j++)
793 pg->osds[j] = ceph_decode_32(p);
794 1100
795 err = __insert_pg_mapping(pg, &map->pg_temp); 1101 /* primary_affinity */
1102 if (struct_v >= 2) {
1103 err = decode_primary_affinity(p, end, map);
796 if (err) 1104 if (err)
797 goto bad; 1105 goto bad;
798 dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed, 1106 } else {
799 len); 1107 /* XXX can this happen? */
1108 kfree(map->osd_primary_affinity);
1109 map->osd_primary_affinity = NULL;
800 } 1110 }
801 1111
802 /* crush */ 1112 /* crush */
803 ceph_decode_32_safe(p, end, len, bad); 1113 ceph_decode_32_safe(p, end, len, e_inval);
804 dout("osdmap_decode crush len %d from off 0x%x\n", len, 1114 map->crush = crush_decode(*p, min(*p + len, end));
805 (int)(*p - start));
806 ceph_decode_need(p, end, len, bad);
807 map->crush = crush_decode(*p, end);
808 *p += len;
809 if (IS_ERR(map->crush)) { 1115 if (IS_ERR(map->crush)) {
810 err = PTR_ERR(map->crush); 1116 err = PTR_ERR(map->crush);
811 map->crush = NULL; 1117 map->crush = NULL;
812 goto bad; 1118 goto bad;
813 } 1119 }
1120 *p += len;
814 1121
815 /* ignore the rest of the map */ 1122 /* ignore the rest */
816 *p = end; 1123 *p = end;
817 1124
818 dout("osdmap_decode done %p %p\n", *p, end); 1125 dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
819 return map; 1126 return 0;
820 1127
1128e_inval:
1129 err = -EINVAL;
821bad: 1130bad:
822 dout("osdmap_decode fail err %d\n", err); 1131 pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
823 ceph_osdmap_destroy(map); 1132 err, epoch, (int)(*p - start), *p, start, end);
824 return ERR_PTR(err); 1133 print_hex_dump(KERN_DEBUG, "osdmap: ",
1134 DUMP_PREFIX_OFFSET, 16, 1,
1135 start, end - start, true);
1136 return err;
1137}
1138
1139/*
1140 * Allocate and decode a full map.
1141 */
1142struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1143{
1144 struct ceph_osdmap *map;
1145 int ret;
1146
1147 map = kzalloc(sizeof(*map), GFP_NOFS);
1148 if (!map)
1149 return ERR_PTR(-ENOMEM);
1150
1151 map->pg_temp = RB_ROOT;
1152 map->primary_temp = RB_ROOT;
1153 mutex_init(&map->crush_scratch_mutex);
1154
1155 ret = osdmap_decode(p, end, map);
1156 if (ret) {
1157 ceph_osdmap_destroy(map);
1158 return ERR_PTR(ret);
1159 }
1160
1161 return map;
825} 1162}
826 1163
827/* 1164/*
@@ -840,17 +1177,18 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
840 __s64 new_pool_max; 1177 __s64 new_pool_max;
841 __s32 new_flags, max; 1178 __s32 new_flags, max;
842 void *start = *p; 1179 void *start = *p;
843 int err = -EINVAL; 1180 int err;
844 u16 version; 1181 u8 struct_v;
1182
1183 dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
845 1184
846 ceph_decode_16_safe(p, end, version, bad); 1185 err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
847 if (version != 6) { 1186 if (err)
848 pr_warning("got unknown v %d != 6 of inc osdmap\n", version);
849 goto bad; 1187 goto bad;
850 }
851 1188
852 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), 1189 /* fsid, epoch, modified, new_pool_max, new_flags */
853 bad); 1190 ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
1191 sizeof(u64) + sizeof(u32), e_inval);
854 ceph_decode_copy(p, &fsid, sizeof(fsid)); 1192 ceph_decode_copy(p, &fsid, sizeof(fsid));
855 epoch = ceph_decode_32(p); 1193 epoch = ceph_decode_32(p);
856 BUG_ON(epoch != map->epoch+1); 1194 BUG_ON(epoch != map->epoch+1);
@@ -859,21 +1197,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
859 new_flags = ceph_decode_32(p); 1197 new_flags = ceph_decode_32(p);
860 1198
861 /* full map? */ 1199 /* full map? */
862 ceph_decode_32_safe(p, end, len, bad); 1200 ceph_decode_32_safe(p, end, len, e_inval);
863 if (len > 0) { 1201 if (len > 0) {
864 dout("apply_incremental full map len %d, %p to %p\n", 1202 dout("apply_incremental full map len %d, %p to %p\n",
865 len, *p, end); 1203 len, *p, end);
866 return osdmap_decode(p, min(*p+len, end)); 1204 return ceph_osdmap_decode(p, min(*p+len, end));
867 } 1205 }
868 1206
869 /* new crush? */ 1207 /* new crush? */
870 ceph_decode_32_safe(p, end, len, bad); 1208 ceph_decode_32_safe(p, end, len, e_inval);
871 if (len > 0) { 1209 if (len > 0) {
872 dout("apply_incremental new crush map len %d, %p to %p\n",
873 len, *p, end);
874 newcrush = crush_decode(*p, min(*p+len, end)); 1210 newcrush = crush_decode(*p, min(*p+len, end));
875 if (IS_ERR(newcrush)) 1211 if (IS_ERR(newcrush)) {
876 return ERR_CAST(newcrush); 1212 err = PTR_ERR(newcrush);
1213 newcrush = NULL;
1214 goto bad;
1215 }
877 *p += len; 1216 *p += len;
878 } 1217 }
879 1218
@@ -883,13 +1222,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
883 if (new_pool_max >= 0) 1222 if (new_pool_max >= 0)
884 map->pool_max = new_pool_max; 1223 map->pool_max = new_pool_max;
885 1224
886 ceph_decode_need(p, end, 5*sizeof(u32), bad);
887
888 /* new max? */ 1225 /* new max? */
889 max = ceph_decode_32(p); 1226 ceph_decode_32_safe(p, end, max, e_inval);
890 if (max >= 0) { 1227 if (max >= 0) {
891 err = osdmap_set_max_osd(map, max); 1228 err = osdmap_set_max_osd(map, max);
892 if (err < 0) 1229 if (err)
893 goto bad; 1230 goto bad;
894 } 1231 }
895 1232
@@ -902,51 +1239,34 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
902 newcrush = NULL; 1239 newcrush = NULL;
903 } 1240 }
904 1241
905 /* new_pool */ 1242 /* new_pools */
906 ceph_decode_32_safe(p, end, len, bad); 1243 err = decode_new_pools(p, end, map);
907 while (len--) { 1244 if (err)
908 struct ceph_pg_pool_info *pi; 1245 goto bad;
909 1246
910 ceph_decode_64_safe(p, end, pool, bad); 1247 /* new_pool_names */
911 pi = __lookup_pg_pool(&map->pg_pools, pool); 1248 err = decode_pool_names(p, end, map);
912 if (!pi) { 1249 if (err)
913 pi = kzalloc(sizeof(*pi), GFP_NOFS); 1250 goto bad;
914 if (!pi) {
915 err = -ENOMEM;
916 goto bad;
917 }
918 pi->id = pool;
919 __insert_pg_pool(&map->pg_pools, pi);
920 }
921 err = __decode_pool(p, end, pi);
922 if (err < 0)
923 goto bad;
924 }
925 if (version >= 5) {
926 err = __decode_pool_names(p, end, map);
927 if (err < 0)
928 goto bad;
929 }
930 1251
931 /* old_pool */ 1252 /* old_pool */
932 ceph_decode_32_safe(p, end, len, bad); 1253 ceph_decode_32_safe(p, end, len, e_inval);
933 while (len--) { 1254 while (len--) {
934 struct ceph_pg_pool_info *pi; 1255 struct ceph_pg_pool_info *pi;
935 1256
936 ceph_decode_64_safe(p, end, pool, bad); 1257 ceph_decode_64_safe(p, end, pool, e_inval);
937 pi = __lookup_pg_pool(&map->pg_pools, pool); 1258 pi = __lookup_pg_pool(&map->pg_pools, pool);
938 if (pi) 1259 if (pi)
939 __remove_pg_pool(&map->pg_pools, pi); 1260 __remove_pg_pool(&map->pg_pools, pi);
940 } 1261 }
941 1262
942 /* new_up */ 1263 /* new_up */
943 err = -EINVAL; 1264 ceph_decode_32_safe(p, end, len, e_inval);
944 ceph_decode_32_safe(p, end, len, bad);
945 while (len--) { 1265 while (len--) {
946 u32 osd; 1266 u32 osd;
947 struct ceph_entity_addr addr; 1267 struct ceph_entity_addr addr;
948 ceph_decode_32_safe(p, end, osd, bad); 1268 ceph_decode_32_safe(p, end, osd, e_inval);
949 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); 1269 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
950 ceph_decode_addr(&addr); 1270 ceph_decode_addr(&addr);
951 pr_info("osd%d up\n", osd); 1271 pr_info("osd%d up\n", osd);
952 BUG_ON(osd >= map->max_osd); 1272 BUG_ON(osd >= map->max_osd);
@@ -955,11 +1275,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
955 } 1275 }
956 1276
957 /* new_state */ 1277 /* new_state */
958 ceph_decode_32_safe(p, end, len, bad); 1278 ceph_decode_32_safe(p, end, len, e_inval);
959 while (len--) { 1279 while (len--) {
960 u32 osd; 1280 u32 osd;
961 u8 xorstate; 1281 u8 xorstate;
962 ceph_decode_32_safe(p, end, osd, bad); 1282 ceph_decode_32_safe(p, end, osd, e_inval);
963 xorstate = **(u8 **)p; 1283 xorstate = **(u8 **)p;
964 (*p)++; /* clean flag */ 1284 (*p)++; /* clean flag */
965 if (xorstate == 0) 1285 if (xorstate == 0)
@@ -971,10 +1291,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
971 } 1291 }
972 1292
973 /* new_weight */ 1293 /* new_weight */
974 ceph_decode_32_safe(p, end, len, bad); 1294 ceph_decode_32_safe(p, end, len, e_inval);
975 while (len--) { 1295 while (len--) {
976 u32 osd, off; 1296 u32 osd, off;
977 ceph_decode_need(p, end, sizeof(u32)*2, bad); 1297 ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
978 osd = ceph_decode_32(p); 1298 osd = ceph_decode_32(p);
979 off = ceph_decode_32(p); 1299 off = ceph_decode_32(p);
980 pr_info("osd%d weight 0x%x %s\n", osd, off, 1300 pr_info("osd%d weight 0x%x %s\n", osd, off,
@@ -985,56 +1305,35 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
985 } 1305 }
986 1306
987 /* new_pg_temp */ 1307 /* new_pg_temp */
988 ceph_decode_32_safe(p, end, len, bad); 1308 err = decode_new_pg_temp(p, end, map);
989 while (len--) { 1309 if (err)
990 struct ceph_pg_mapping *pg; 1310 goto bad;
991 int j;
992 struct ceph_pg pgid;
993 u32 pglen;
994 1311
995 err = ceph_decode_pgid(p, end, &pgid); 1312 /* new_primary_temp */
1313 if (struct_v >= 1) {
1314 err = decode_new_primary_temp(p, end, map);
996 if (err) 1315 if (err)
997 goto bad; 1316 goto bad;
998 ceph_decode_need(p, end, sizeof(u32), bad); 1317 }
999 pglen = ceph_decode_32(p);
1000 if (pglen) {
1001 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
1002
1003 /* removing existing (if any) */
1004 (void) __remove_pg_mapping(&map->pg_temp, pgid);
1005 1318
1006 /* insert */ 1319 /* new_primary_affinity */
1007 err = -EINVAL; 1320 if (struct_v >= 2) {
1008 if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) 1321 err = decode_new_primary_affinity(p, end, map);
1009 goto bad; 1322 if (err)
1010 err = -ENOMEM; 1323 goto bad;
1011 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
1012 if (!pg)
1013 goto bad;
1014 pg->pgid = pgid;
1015 pg->len = pglen;
1016 for (j = 0; j < pglen; j++)
1017 pg->osds[j] = ceph_decode_32(p);
1018 err = __insert_pg_mapping(pg, &map->pg_temp);
1019 if (err) {
1020 kfree(pg);
1021 goto bad;
1022 }
1023 dout(" added pg_temp %lld.%x len %d\n", pgid.pool,
1024 pgid.seed, pglen);
1025 } else {
1026 /* remove */
1027 __remove_pg_mapping(&map->pg_temp, pgid);
1028 }
1029 } 1324 }
1030 1325
1031 /* ignore the rest */ 1326 /* ignore the rest */
1032 *p = end; 1327 *p = end;
1328
1329 dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
1033 return map; 1330 return map;
1034 1331
1332e_inval:
1333 err = -EINVAL;
1035bad: 1334bad:
1036 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", 1335 pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
1037 epoch, (int)(*p - start), *p, start, end); 1336 err, epoch, (int)(*p - start), *p, start, end);
1038 print_hex_dump(KERN_DEBUG, "osdmap: ", 1337 print_hex_dump(KERN_DEBUG, "osdmap: ",
1039 DUMP_PREFIX_OFFSET, 16, 1, 1338 DUMP_PREFIX_OFFSET, 16, 1,
1040 start, end - start, true); 1339 start, end - start, true);
@@ -1142,61 +1441,249 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
1142} 1441}
1143EXPORT_SYMBOL(ceph_oloc_oid_to_pg); 1442EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
1144 1443
1145static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, 1444static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1146 int *result, int result_max, 1445 int *result, int result_max,
1147 const __u32 *weight, int weight_max) 1446 const __u32 *weight, int weight_max)
1148{ 1447{
1149 int scratch[result_max * 3]; 1448 int r;
1150 1449
1151 return crush_do_rule(map, ruleno, x, result, result_max, 1450 BUG_ON(result_max > CEPH_PG_MAX_SIZE);
1152 weight, weight_max, scratch); 1451
1452 mutex_lock(&map->crush_scratch_mutex);
1453 r = crush_do_rule(map->crush, ruleno, x, result, result_max,
1454 weight, weight_max, map->crush_scratch_ary);
1455 mutex_unlock(&map->crush_scratch_mutex);
1456
1457 return r;
1153} 1458}
1154 1459
1155/* 1460/*
1156 * Calculate raw osd vector for the given pgid. Return pointer to osd 1461 * Calculate raw (crush) set for given pgid.
1157 * array, or NULL on failure. 1462 *
1463 * Return raw set length, or error.
1158 */ 1464 */
1159static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 1465static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
1160 int *osds, int *num) 1466 struct ceph_pg_pool_info *pool,
1467 struct ceph_pg pgid, u32 pps, int *osds)
1161{ 1468{
1162 struct ceph_pg_mapping *pg;
1163 struct ceph_pg_pool_info *pool;
1164 int ruleno; 1469 int ruleno;
1165 int r; 1470 int len;
1166 u32 pps;
1167 1471
1168 pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); 1472 /* crush */
1169 if (!pool) 1473 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
1170 return NULL; 1474 pool->type, pool->size);
1475 if (ruleno < 0) {
1476 pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
1477 pgid.pool, pool->crush_ruleset, pool->type,
1478 pool->size);
1479 return -ENOENT;
1480 }
1171 1481
1172 /* pg_temp? */ 1482 len = do_crush(osdmap, ruleno, pps, osds,
1483 min_t(int, pool->size, CEPH_PG_MAX_SIZE),
1484 osdmap->osd_weight, osdmap->max_osd);
1485 if (len < 0) {
1486 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
1487 len, ruleno, pgid.pool, pool->crush_ruleset,
1488 pool->type, pool->size);
1489 return len;
1490 }
1491
1492 return len;
1493}
1494
1495/*
1496 * Given raw set, calculate up set and up primary.
1497 *
1498 * Return up set length. *primary is set to up primary osd id, or -1
1499 * if up set is empty.
1500 */
1501static int raw_to_up_osds(struct ceph_osdmap *osdmap,
1502 struct ceph_pg_pool_info *pool,
1503 int *osds, int len, int *primary)
1504{
1505 int up_primary = -1;
1506 int i;
1507
1508 if (ceph_can_shift_osds(pool)) {
1509 int removed = 0;
1510
1511 for (i = 0; i < len; i++) {
1512 if (ceph_osd_is_down(osdmap, osds[i])) {
1513 removed++;
1514 continue;
1515 }
1516 if (removed)
1517 osds[i - removed] = osds[i];
1518 }
1519
1520 len -= removed;
1521 if (len > 0)
1522 up_primary = osds[0];
1523 } else {
1524 for (i = len - 1; i >= 0; i--) {
1525 if (ceph_osd_is_down(osdmap, osds[i]))
1526 osds[i] = CRUSH_ITEM_NONE;
1527 else
1528 up_primary = osds[i];
1529 }
1530 }
1531
1532 *primary = up_primary;
1533 return len;
1534}
1535
1536static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1537 struct ceph_pg_pool_info *pool,
1538 int *osds, int len, int *primary)
1539{
1540 int i;
1541 int pos = -1;
1542
1543 /*
1544 * Do we have any non-default primary_affinity values for these
1545 * osds?
1546 */
1547 if (!osdmap->osd_primary_affinity)
1548 return;
1549
1550 for (i = 0; i < len; i++) {
1551 if (osds[i] != CRUSH_ITEM_NONE &&
1552 osdmap->osd_primary_affinity[i] !=
1553 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1554 break;
1555 }
1556 }
1557 if (i == len)
1558 return;
1559
1560 /*
1561 * Pick the primary. Feed both the seed (for the pg) and the
1562 * osd into the hash/rng so that a proportional fraction of an
1563 * osd's pgs get rejected as primary.
1564 */
1565 for (i = 0; i < len; i++) {
1566 int osd;
1567 u32 aff;
1568
1569 osd = osds[i];
1570 if (osd == CRUSH_ITEM_NONE)
1571 continue;
1572
1573 aff = osdmap->osd_primary_affinity[osd];
1574 if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
1575 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
1576 pps, osd) >> 16) >= aff) {
1577 /*
1578 * We chose not to use this primary. Note it
1579 * anyway as a fallback in case we don't pick
1580 * anyone else, but keep looking.
1581 */
1582 if (pos < 0)
1583 pos = i;
1584 } else {
1585 pos = i;
1586 break;
1587 }
1588 }
1589 if (pos < 0)
1590 return;
1591
1592 *primary = osds[pos];
1593
1594 if (ceph_can_shift_osds(pool) && pos > 0) {
1595 /* move the new primary to the front */
1596 for (i = pos; i > 0; i--)
1597 osds[i] = osds[i - 1];
1598 osds[0] = *primary;
1599 }
1600}
1601
1602/*
1603 * Given up set, apply pg_temp and primary_temp mappings.
1604 *
1605 * Return acting set length. *primary is set to acting primary osd id,
1606 * or -1 if acting set is empty.
1607 */
1608static int apply_temps(struct ceph_osdmap *osdmap,
1609 struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
1610 int *osds, int len, int *primary)
1611{
1612 struct ceph_pg_mapping *pg;
1613 int temp_len;
1614 int temp_primary;
1615 int i;
1616
1617 /* raw_pg -> pg */
1173 pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, 1618 pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
1174 pool->pg_num_mask); 1619 pool->pg_num_mask);
1620
1621 /* pg_temp? */
1175 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1622 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1176 if (pg) { 1623 if (pg) {
1177 *num = pg->len; 1624 temp_len = 0;
1178 return pg->osds; 1625 temp_primary = -1;
1626
1627 for (i = 0; i < pg->pg_temp.len; i++) {
1628 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
1629 if (ceph_can_shift_osds(pool))
1630 continue;
1631 else
1632 osds[temp_len++] = CRUSH_ITEM_NONE;
1633 } else {
1634 osds[temp_len++] = pg->pg_temp.osds[i];
1635 }
1636 }
1637
1638 /* apply pg_temp's primary */
1639 for (i = 0; i < temp_len; i++) {
1640 if (osds[i] != CRUSH_ITEM_NONE) {
1641 temp_primary = osds[i];
1642 break;
1643 }
1644 }
1645 } else {
1646 temp_len = len;
1647 temp_primary = *primary;
1179 } 1648 }
1180 1649
1181 /* crush */ 1650 /* primary_temp? */
1182 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, 1651 pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
1183 pool->type, pool->size); 1652 if (pg)
1184 if (ruleno < 0) { 1653 temp_primary = pg->primary_temp.osd;
1185 pr_err("no crush rule pool %lld ruleset %d type %d size %d\n", 1654
1186 pgid.pool, pool->crush_ruleset, pool->type, 1655 *primary = temp_primary;
1187 pool->size); 1656 return temp_len;
1188 return NULL; 1657}
1658
1659/*
1660 * Calculate acting set for given pgid.
1661 *
1662 * Return acting set length, or error. *primary is set to acting
1663 * primary osd id, or -1 if acting set is empty or on error.
1664 */
1665int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1666 int *osds, int *primary)
1667{
1668 struct ceph_pg_pool_info *pool;
1669 u32 pps;
1670 int len;
1671
1672 pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
1673 if (!pool) {
1674 *primary = -1;
1675 return -ENOENT;
1189 } 1676 }
1190 1677
1191 if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { 1678 if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
1192 /* hash pool id and seed sothat pool PGs do not overlap */ 1679 /* hash pool id and seed so that pool PGs do not overlap */
1193 pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, 1680 pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
1194 ceph_stable_mod(pgid.seed, pool->pgp_num, 1681 ceph_stable_mod(pgid.seed, pool->pgp_num,
1195 pool->pgp_num_mask), 1682 pool->pgp_num_mask),
1196 pgid.pool); 1683 pgid.pool);
1197 } else { 1684 } else {
1198 /* 1685 /*
1199 * legacy ehavior: add ps and pool together. this is 1686 * legacy behavior: add ps and pool together. this is
1200 * not a great approach because the PGs from each pool 1687 * not a great approach because the PGs from each pool
1201 * will overlap on top of each other: 0.5 == 1.4 == 1688 * will overlap on top of each other: 0.5 == 1.4 ==
1202 * 2.3 == ... 1689 * 2.3 == ...
@@ -1205,38 +1692,20 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1205 pool->pgp_num_mask) + 1692 pool->pgp_num_mask) +
1206 (unsigned)pgid.pool; 1693 (unsigned)pgid.pool;
1207 } 1694 }
1208 r = crush_do_rule_ary(osdmap->crush, ruleno, pps, 1695
1209 osds, min_t(int, pool->size, *num), 1696 len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
1210 osdmap->osd_weight, osdmap->max_osd); 1697 if (len < 0) {
1211 if (r < 0) { 1698 *primary = -1;
1212 pr_err("error %d from crush rule: pool %lld ruleset %d type %d" 1699 return len;
1213 " size %d\n", r, pgid.pool, pool->crush_ruleset,
1214 pool->type, pool->size);
1215 return NULL;
1216 } 1700 }
1217 *num = r;
1218 return osds;
1219}
1220 1701
1221/* 1702 len = raw_to_up_osds(osdmap, pool, osds, len, primary);
1222 * Return acting set for given pgid.
1223 */
1224int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1225 int *acting)
1226{
1227 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1228 int i, o, num = CEPH_PG_MAX_SIZE;
1229 1703
1230 osds = calc_pg_raw(osdmap, pgid, rawosds, &num); 1704 apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
1231 if (!osds)
1232 return -1;
1233 1705
1234 /* primary is first up osd */ 1706 len = apply_temps(osdmap, pool, pgid, osds, len, primary);
1235 o = 0; 1707
1236 for (i = 0; i < num; i++) 1708 return len;
1237 if (ceph_osd_is_up(osdmap, osds[i]))
1238 acting[o++] = osds[i];
1239 return o;
1240} 1709}
1241 1710
1242/* 1711/*
@@ -1244,17 +1713,11 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1244 */ 1713 */
1245int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) 1714int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1246{ 1715{
1247 int rawosds[CEPH_PG_MAX_SIZE], *osds; 1716 int osds[CEPH_PG_MAX_SIZE];
1248 int i, num = CEPH_PG_MAX_SIZE; 1717 int primary;
1249 1718
1250 osds = calc_pg_raw(osdmap, pgid, rawosds, &num); 1719 ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
1251 if (!osds)
1252 return -1;
1253 1720
1254 /* primary is first up osd */ 1721 return primary;
1255 for (i = 0; i < num; i++)
1256 if (ceph_osd_is_up(osdmap, osds[i]))
1257 return osds[i];
1258 return -1;
1259} 1722}
1260EXPORT_SYMBOL(ceph_calc_pg_primary); 1723EXPORT_SYMBOL(ceph_calc_pg_primary);
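
The reworked ceph_calc_pg_acting() now returns the acting set length and reports the acting primary through a pointer, so ceph_calc_pg_primary() becomes a thin wrapper around it. A minimal caller sketch based on the new signatures above; the error-handling policy shown is an assumption, not part of this patch:

	int osds[CEPH_PG_MAX_SIZE];
	int primary, len;

	len = ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
	if (len < 0)
		return len;		/* e.g. -ENOENT: pool does not exist */
	if (primary == -1)
		return -EAGAIN;		/* hypothetical: acting set is empty */
	/* osds[0..len-1] is the acting set, 'primary' its primary osd id */
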
diff --git a/net/core/dev.c b/net/core/dev.c
index 757063420ce0..14dac0654f28 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4043,6 +4043,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4043 skb->vlan_tci = 0; 4043 skb->vlan_tci = 0;
4044 skb->dev = napi->dev; 4044 skb->dev = napi->dev;
4045 skb->skb_iif = 0; 4045 skb->skb_iif = 0;
4046 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4046 4047
4047 napi->skb = skb; 4048 napi->skb = skb;
4048} 4049}
@@ -4588,8 +4589,7 @@ void *netdev_lower_get_next_private(struct net_device *dev,
4588 if (&lower->list == &dev->adj_list.lower) 4589 if (&lower->list == &dev->adj_list.lower)
4589 return NULL; 4590 return NULL;
4590 4591
4591 if (iter) 4592 *iter = lower->list.next;
4592 *iter = lower->list.next;
4593 4593
4594 return lower->private; 4594 return lower->private;
4595} 4595}
@@ -4617,8 +4617,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4617 if (&lower->list == &dev->adj_list.lower) 4617 if (&lower->list == &dev->adj_list.lower)
4618 return NULL; 4618 return NULL;
4619 4619
4620 if (iter) 4620 *iter = &lower->list;
4621 *iter = &lower->list;
4622 4621
4623 return lower->private; 4622 return lower->private;
4624} 4623}
@@ -5696,6 +5695,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
5696 } 5695 }
5697 } 5696 }
5698 5697
5698#ifdef CONFIG_NET_RX_BUSY_POLL
5699 if (dev->netdev_ops->ndo_busy_poll)
5700 features |= NETIF_F_BUSY_POLL;
5701 else
5702#endif
5703 features &= ~NETIF_F_BUSY_POLL;
5704
5699 return features; 5705 return features;
5700} 5706}
5701 5707
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 30071dec287a..640ba0e5831c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -97,6 +97,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
97 [NETIF_F_RXFCS_BIT] = "rx-fcs", 97 [NETIF_F_RXFCS_BIT] = "rx-fcs",
98 [NETIF_F_RXALL_BIT] = "rx-all", 98 [NETIF_F_RXALL_BIT] = "rx-all",
99 [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", 99 [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
100 [NETIF_F_BUSY_POLL_BIT] = "busy-poll",
100}; 101};
101 102
102static int ethtool_get_features(struct net_device *dev, void __user *useraddr) 103static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
diff --git a/net/core/filter.c b/net/core/filter.c
index 765556ba32ef..e08b3822c72a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -295,43 +295,43 @@ select_insn:
295 (*(s64 *) &A) >>= K; 295 (*(s64 *) &A) >>= K;
296 CONT; 296 CONT;
297 BPF_ALU64_BPF_MOD_BPF_X: 297 BPF_ALU64_BPF_MOD_BPF_X:
298 if (unlikely(X == 0))
299 return 0;
298 tmp = A; 300 tmp = A;
299 if (X) 301 A = do_div(tmp, X);
300 A = do_div(tmp, X);
301 CONT; 302 CONT;
302 BPF_ALU_BPF_MOD_BPF_X: 303 BPF_ALU_BPF_MOD_BPF_X:
304 if (unlikely(X == 0))
305 return 0;
303 tmp = (u32) A; 306 tmp = (u32) A;
304 if (X) 307 A = do_div(tmp, (u32) X);
305 A = do_div(tmp, (u32) X);
306 CONT; 308 CONT;
307 BPF_ALU64_BPF_MOD_BPF_K: 309 BPF_ALU64_BPF_MOD_BPF_K:
308 tmp = A; 310 tmp = A;
309 if (K) 311 A = do_div(tmp, K);
310 A = do_div(tmp, K);
311 CONT; 312 CONT;
312 BPF_ALU_BPF_MOD_BPF_K: 313 BPF_ALU_BPF_MOD_BPF_K:
313 tmp = (u32) A; 314 tmp = (u32) A;
314 if (K) 315 A = do_div(tmp, (u32) K);
315 A = do_div(tmp, (u32) K);
316 CONT; 316 CONT;
317 BPF_ALU64_BPF_DIV_BPF_X: 317 BPF_ALU64_BPF_DIV_BPF_X:
318 if (X) 318 if (unlikely(X == 0))
319 do_div(A, X); 319 return 0;
320 do_div(A, X);
320 CONT; 321 CONT;
321 BPF_ALU_BPF_DIV_BPF_X: 322 BPF_ALU_BPF_DIV_BPF_X:
323 if (unlikely(X == 0))
324 return 0;
322 tmp = (u32) A; 325 tmp = (u32) A;
323 if (X) 326 do_div(tmp, (u32) X);
324 do_div(tmp, (u32) X);
325 A = (u32) tmp; 327 A = (u32) tmp;
326 CONT; 328 CONT;
327 BPF_ALU64_BPF_DIV_BPF_K: 329 BPF_ALU64_BPF_DIV_BPF_K:
328 if (K) 330 do_div(A, K);
329 do_div(A, K);
330 CONT; 331 CONT;
331 BPF_ALU_BPF_DIV_BPF_K: 332 BPF_ALU_BPF_DIV_BPF_K:
332 tmp = (u32) A; 333 tmp = (u32) A;
333 if (K) 334 do_div(tmp, (u32) K);
334 do_div(tmp, (u32) K);
335 A = (u32) tmp; 335 A = (u32) tmp;
336 CONT; 336 CONT;
337 BPF_ALU_BPF_END_BPF_TO_BE: 337 BPF_ALU_BPF_END_BPF_TO_BE:
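
These hunks change the interpreter so that a zero divisor terminates the program with a return value of 0 instead of silently skipping the DIV/MOD. A small user-space model of the behavioral difference (not the kernel code itself):

	#include <stdint.h>
	#include <stdio.h>

	/* old behavior: skip the operation when the divisor is zero */
	static uint64_t div_old(uint64_t a, uint64_t x)
	{
		if (x)
			a /= x;
		return a;		/* execution continues with a stale A */
	}

	/* new behavior: a zero divisor ends the whole program, returning 0 */
	static int div_new(uint64_t *a, uint64_t x)
	{
		if (x == 0)
			return 0;	/* filter result 0, e.g. drop the packet */
		*a /= x;
		return 1;		/* keep executing */
	}

	int main(void)
	{
		uint64_t a = 42;

		printf("old: A stays %llu\n", (unsigned long long)div_old(a, 0));
		printf("new: %s\n", div_new(&a, 0) ? "continue" : "program returns 0");
		return 0;
	}
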
diff --git a/net/core/flow.c b/net/core/flow.c
index 31cfb365e0c6..a0348fde1fdf 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -455,6 +455,8 @@ int flow_cache_init(struct net *net)
455 if (!fc->percpu) 455 if (!fc->percpu)
456 return -ENOMEM; 456 return -ENOMEM;
457 457
458 cpu_notifier_register_begin();
459
458 for_each_online_cpu(i) { 460 for_each_online_cpu(i) {
459 if (flow_cache_cpu_prepare(fc, i)) 461 if (flow_cache_cpu_prepare(fc, i))
460 goto err; 462 goto err;
@@ -462,7 +464,9 @@ int flow_cache_init(struct net *net)
462 fc->hotcpu_notifier = (struct notifier_block){ 464 fc->hotcpu_notifier = (struct notifier_block){
463 .notifier_call = flow_cache_cpu, 465 .notifier_call = flow_cache_cpu,
464 }; 466 };
465 register_hotcpu_notifier(&fc->hotcpu_notifier); 467 __register_hotcpu_notifier(&fc->hotcpu_notifier);
468
469 cpu_notifier_register_done();
466 470
467 setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, 471 setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
468 (unsigned long) fc); 472 (unsigned long) fc);
@@ -478,6 +482,8 @@ err:
478 fcp->hash_table = NULL; 482 fcp->hash_table = NULL;
479 } 483 }
480 484
485 cpu_notifier_register_done();
486
481 free_percpu(fc->percpu); 487 free_percpu(fc->percpu);
482 fc->percpu = NULL; 488 fc->percpu = NULL;
483 489
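
The switch to __register_hotcpu_notifier() only works because the per-CPU setup loop and the registration are bracketed by cpu_notifier_register_begin()/cpu_notifier_register_done(), so no CPU can be added or removed in between. The shape of the pattern, with a hypothetical setup_cpu() helper standing in for flow_cache_cpu_prepare():

	cpu_notifier_register_begin();

	for_each_online_cpu(cpu) {
		if (setup_cpu(cpu))		/* hypothetical per-CPU init */
			goto err;
	}
	__register_hotcpu_notifier(&nb);

	cpu_notifier_register_done();
	return 0;

err:
	cpu_notifier_register_done();		/* drop the lock on failure too */
	return -ENOMEM;
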
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index d0dac57291af..d068ec25db1e 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3340,7 +3340,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
3340 3340
3341 __netif_tx_lock_bh(txq); 3341 __netif_tx_lock_bh(txq);
3342 3342
3343 if (unlikely(netif_xmit_frozen_or_stopped(txq))) { 3343 if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
3344 ret = NETDEV_TX_BUSY; 3344 ret = NETDEV_TX_BUSY;
3345 pkt_dev->last_ok = 0; 3345 pkt_dev->last_ok = 0;
3346 goto unlock; 3346 goto unlock;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 59da7cde0724..f95b6f93814b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1044,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name,
1044 1044
1045 xt_free_table_info(oldinfo); 1045 xt_free_table_info(oldinfo);
1046 if (copy_to_user(counters_ptr, counters, 1046 if (copy_to_user(counters_ptr, counters,
1047 sizeof(struct xt_counters) * num_counters) != 0) 1047 sizeof(struct xt_counters) * num_counters) != 0) {
1048 ret = -EFAULT; 1048 /* Silent error, can't fail, new table is already in place */
1049 net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
1050 }
1049 vfree(counters); 1051 vfree(counters);
1050 xt_table_unlock(t); 1052 xt_table_unlock(t);
1051 return ret; 1053 return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 718dfbd30cbe..99e810f84671 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1231,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1231 1231
1232 xt_free_table_info(oldinfo); 1232 xt_free_table_info(oldinfo);
1233 if (copy_to_user(counters_ptr, counters, 1233 if (copy_to_user(counters_ptr, counters,
1234 sizeof(struct xt_counters) * num_counters) != 0) 1234 sizeof(struct xt_counters) * num_counters) != 0) {
1235 ret = -EFAULT; 1235 /* Silent error, can't fail, new table is already in place */
1236 net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
1237 }
1236 vfree(counters); 1238 vfree(counters);
1237 xt_table_unlock(t); 1239 xt_table_unlock(t);
1238 return ret; 1240 return ret;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1be9e990514d..34d094cadb11 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -188,7 +188,7 @@ const __u8 ip_tos2prio[16] = {
188EXPORT_SYMBOL(ip_tos2prio); 188EXPORT_SYMBOL(ip_tos2prio);
189 189
190static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 190static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
191#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) 191#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
192 192
193#ifdef CONFIG_PROC_FS 193#ifdef CONFIG_PROC_FS
194static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) 194static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 710238f58aa9..e080fbbbc0e5 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1241,8 +1241,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1241 1241
1242 xt_free_table_info(oldinfo); 1242 xt_free_table_info(oldinfo);
1243 if (copy_to_user(counters_ptr, counters, 1243 if (copy_to_user(counters_ptr, counters,
1244 sizeof(struct xt_counters) * num_counters) != 0) 1244 sizeof(struct xt_counters) * num_counters) != 0) {
1245 ret = -EFAULT; 1245 /* Silent error, can't fail, new table is already in place */
1246 net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n");
1247 }
1246 vfree(counters); 1248 vfree(counters);
1247 xt_table_unlock(t); 1249 xt_table_unlock(t);
1248 return ret; 1250 return ret;
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index cd5b8ec9be04..da787930df0a 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -621,6 +621,42 @@ static void iucv_disable(void)
621 put_online_cpus(); 621 put_online_cpus();
622} 622}
623 623
624static void free_iucv_data(int cpu)
625{
626 kfree(iucv_param_irq[cpu]);
627 iucv_param_irq[cpu] = NULL;
628 kfree(iucv_param[cpu]);
629 iucv_param[cpu] = NULL;
630 kfree(iucv_irq_data[cpu]);
631 iucv_irq_data[cpu] = NULL;
632}
633
634static int alloc_iucv_data(int cpu)
635{
636 /* Note: GFP_DMA used to get memory below 2G */
637 iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
638 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
639 if (!iucv_irq_data[cpu])
640 goto out_free;
641
642 /* Allocate parameter blocks. */
643 iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param),
644 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
645 if (!iucv_param[cpu])
646 goto out_free;
647
648 iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param),
649 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
650 if (!iucv_param_irq[cpu])
651 goto out_free;
652
653 return 0;
654
655out_free:
656 free_iucv_data(cpu);
657 return -ENOMEM;
658}
659
624static int iucv_cpu_notify(struct notifier_block *self, 660static int iucv_cpu_notify(struct notifier_block *self,
625 unsigned long action, void *hcpu) 661 unsigned long action, void *hcpu)
626{ 662{
@@ -630,38 +666,14 @@ static int iucv_cpu_notify(struct notifier_block *self,
630 switch (action) { 666 switch (action) {
631 case CPU_UP_PREPARE: 667 case CPU_UP_PREPARE:
632 case CPU_UP_PREPARE_FROZEN: 668 case CPU_UP_PREPARE_FROZEN:
633 iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data), 669 if (alloc_iucv_data(cpu))
634 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
635 if (!iucv_irq_data[cpu])
636 return notifier_from_errno(-ENOMEM);
637
638 iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param),
639 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
640 if (!iucv_param[cpu]) {
641 kfree(iucv_irq_data[cpu]);
642 iucv_irq_data[cpu] = NULL;
643 return notifier_from_errno(-ENOMEM); 670 return notifier_from_errno(-ENOMEM);
644 }
645 iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param),
646 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
647 if (!iucv_param_irq[cpu]) {
648 kfree(iucv_param[cpu]);
649 iucv_param[cpu] = NULL;
650 kfree(iucv_irq_data[cpu]);
651 iucv_irq_data[cpu] = NULL;
652 return notifier_from_errno(-ENOMEM);
653 }
654 break; 671 break;
655 case CPU_UP_CANCELED: 672 case CPU_UP_CANCELED:
656 case CPU_UP_CANCELED_FROZEN: 673 case CPU_UP_CANCELED_FROZEN:
657 case CPU_DEAD: 674 case CPU_DEAD:
658 case CPU_DEAD_FROZEN: 675 case CPU_DEAD_FROZEN:
659 kfree(iucv_param_irq[cpu]); 676 free_iucv_data(cpu);
660 iucv_param_irq[cpu] = NULL;
661 kfree(iucv_param[cpu]);
662 iucv_param[cpu] = NULL;
663 kfree(iucv_irq_data[cpu]);
664 iucv_irq_data[cpu] = NULL;
665 break; 677 break;
666 case CPU_ONLINE: 678 case CPU_ONLINE:
667 case CPU_ONLINE_FROZEN: 679 case CPU_ONLINE_FROZEN:
@@ -2016,7 +2028,7 @@ static int __init iucv_init(void)
2016 rc = iucv_query_maxconn(); 2028 rc = iucv_query_maxconn();
2017 if (rc) 2029 if (rc)
2018 goto out_ctl; 2030 goto out_ctl;
2019 rc = register_external_interrupt(0x4000, iucv_external_interrupt); 2031 rc = register_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
2020 if (rc) 2032 if (rc)
2021 goto out_ctl; 2033 goto out_ctl;
2022 iucv_root = root_device_register("iucv"); 2034 iucv_root = root_device_register("iucv");
@@ -2025,33 +2037,20 @@ static int __init iucv_init(void)
2025 goto out_int; 2037 goto out_int;
2026 } 2038 }
2027 2039
2028 for_each_online_cpu(cpu) { 2040 cpu_notifier_register_begin();
2029 /* Note: GFP_DMA used to get memory below 2G */
2030 iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
2031 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
2032 if (!iucv_irq_data[cpu]) {
2033 rc = -ENOMEM;
2034 goto out_free;
2035 }
2036 2041
2037 /* Allocate parameter blocks. */ 2042 for_each_online_cpu(cpu) {
2038 iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param), 2043 if (alloc_iucv_data(cpu)) {
2039 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
2040 if (!iucv_param[cpu]) {
2041 rc = -ENOMEM;
2042 goto out_free;
2043 }
2044 iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param),
2045 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
2046 if (!iucv_param_irq[cpu]) {
2047 rc = -ENOMEM; 2044 rc = -ENOMEM;
2048 goto out_free; 2045 goto out_free;
2049 } 2046 }
2050
2051 } 2047 }
2052 rc = register_hotcpu_notifier(&iucv_cpu_notifier); 2048 rc = __register_hotcpu_notifier(&iucv_cpu_notifier);
2053 if (rc) 2049 if (rc)
2054 goto out_free; 2050 goto out_free;
2051
2052 cpu_notifier_register_done();
2053
2055 rc = register_reboot_notifier(&iucv_reboot_notifier); 2054 rc = register_reboot_notifier(&iucv_reboot_notifier);
2056 if (rc) 2055 if (rc)
2057 goto out_cpu; 2056 goto out_cpu;
@@ -2069,19 +2068,17 @@ static int __init iucv_init(void)
2069out_reboot: 2068out_reboot:
2070 unregister_reboot_notifier(&iucv_reboot_notifier); 2069 unregister_reboot_notifier(&iucv_reboot_notifier);
2071out_cpu: 2070out_cpu:
2072 unregister_hotcpu_notifier(&iucv_cpu_notifier); 2071 cpu_notifier_register_begin();
2072 __unregister_hotcpu_notifier(&iucv_cpu_notifier);
2073out_free: 2073out_free:
2074 for_each_possible_cpu(cpu) { 2074 for_each_possible_cpu(cpu)
2075 kfree(iucv_param_irq[cpu]); 2075 free_iucv_data(cpu);
2076 iucv_param_irq[cpu] = NULL; 2076
2077 kfree(iucv_param[cpu]); 2077 cpu_notifier_register_done();
2078 iucv_param[cpu] = NULL; 2078
2079 kfree(iucv_irq_data[cpu]);
2080 iucv_irq_data[cpu] = NULL;
2081 }
2082 root_device_unregister(iucv_root); 2079 root_device_unregister(iucv_root);
2083out_int: 2080out_int:
2084 unregister_external_interrupt(0x4000, iucv_external_interrupt); 2081 unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
2085out_ctl: 2082out_ctl:
2086 ctl_clear_bit(0, 1); 2083 ctl_clear_bit(0, 1);
2087out: 2084out:
@@ -2105,18 +2102,14 @@ static void __exit iucv_exit(void)
2105 kfree(p); 2102 kfree(p);
2106 spin_unlock_irq(&iucv_queue_lock); 2103 spin_unlock_irq(&iucv_queue_lock);
2107 unregister_reboot_notifier(&iucv_reboot_notifier); 2104 unregister_reboot_notifier(&iucv_reboot_notifier);
2108 unregister_hotcpu_notifier(&iucv_cpu_notifier); 2105 cpu_notifier_register_begin();
2109 for_each_possible_cpu(cpu) { 2106 __unregister_hotcpu_notifier(&iucv_cpu_notifier);
2110 kfree(iucv_param_irq[cpu]); 2107 for_each_possible_cpu(cpu)
2111 iucv_param_irq[cpu] = NULL; 2108 free_iucv_data(cpu);
2112 kfree(iucv_param[cpu]); 2109 cpu_notifier_register_done();
2113 iucv_param[cpu] = NULL;
2114 kfree(iucv_irq_data[cpu]);
2115 iucv_irq_data[cpu] = NULL;
2116 }
2117 root_device_unregister(iucv_root); 2110 root_device_unregister(iucv_root);
2118 bus_unregister(&iucv_bus); 2111 bus_unregister(&iucv_bus);
2119 unregister_external_interrupt(0x4000, iucv_external_interrupt); 2112 unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
2120} 2113}
2121 2114
2122subsys_initcall(iucv_init); 2115subsys_initcall(iucv_init);
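
alloc_iucv_data() funnels every failure to one cleanup label, and because kfree(NULL) is a no-op the same free_iucv_data() helper doubles as the rollback path. A standalone sketch of that allocate-or-unwind pattern (user-space stand-in, names hypothetical):

	#include <stdlib.h>

	static void *a, *b, *c;

	static void free_all(void)
	{
		free(c); c = NULL;	/* free(NULL) is a no-op, so this also */
		free(b); b = NULL;	/* unwinds a partially completed       */
		free(a); a = NULL;	/* allocation sequence                 */
	}

	static int alloc_all(void)
	{
		a = malloc(64);
		if (!a)
			goto out_free;
		b = malloc(64);
		if (!b)
			goto out_free;
		c = malloc(64);
		if (!c)
			goto out_free;
		return 0;

	out_free:
		free_all();
		return -1;		/* kernel code would return -ENOMEM */
	}

	int main(void)
	{
		return alloc_all();
	}
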
diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c
index 153bd1ddbfbb..f0991f2344d4 100644
--- a/net/mac802154/mib.c
+++ b/net/mac802154/mib.c
@@ -26,7 +26,6 @@
26#include <net/mac802154.h> 26#include <net/mac802154.h>
27#include <net/ieee802154_netdev.h> 27#include <net/ieee802154_netdev.h>
28#include <net/wpan-phy.h> 28#include <net/wpan-phy.h>
29#include <net/ieee802154_netdev.h>
30 29
31#include "mac802154.h" 30#include "mac802154.h"
32 31
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 33045a562297..3fd159db9f06 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -152,8 +152,8 @@ nf_tables_chain_type_lookup(const struct nft_af_info *afi,
152#ifdef CONFIG_MODULES 152#ifdef CONFIG_MODULES
153 if (autoload) { 153 if (autoload) {
154 nfnl_unlock(NFNL_SUBSYS_NFTABLES); 154 nfnl_unlock(NFNL_SUBSYS_NFTABLES);
155 request_module("nft-chain-%u-%*.s", afi->family, 155 request_module("nft-chain-%u-%.*s", afi->family,
156 nla_len(nla)-1, (const char *)nla_data(nla)); 156 nla_len(nla), (const char *)nla_data(nla));
157 nfnl_lock(NFNL_SUBSYS_NFTABLES); 157 nfnl_lock(NFNL_SUBSYS_NFTABLES);
158 type = __nf_tables_chain_type_lookup(afi->family, nla); 158 type = __nf_tables_chain_type_lookup(afi->family, nla);
159 if (type != NULL) 159 if (type != NULL)
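
The old format "%*.s" consumes the length argument as a field width and uses an implicit precision of zero, so the chain type name was never printed; "%.*s" takes the length as the precision and prints at most that many bytes of a possibly unterminated attribute. A runnable illustration of the two conversions:

	#include <stdio.h>

	int main(void)
	{
		const char name[3] = { 'n', 'a', 't' };	/* not NUL-terminated */

		printf("[%*.s]\n", 3, name);	/* width 3, precision 0 -> "[   ]" */
		printf("[%.*s]\n", 3, name);	/* precision 3          -> "[nat]" */
		return 0;
	}
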
@@ -1946,7 +1946,8 @@ static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const
1946 1946
1947static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { 1947static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
1948 [NFTA_SET_TABLE] = { .type = NLA_STRING }, 1948 [NFTA_SET_TABLE] = { .type = NLA_STRING },
1949 [NFTA_SET_NAME] = { .type = NLA_STRING }, 1949 [NFTA_SET_NAME] = { .type = NLA_STRING,
1950 .len = IFNAMSIZ - 1 },
1950 [NFTA_SET_FLAGS] = { .type = NLA_U32 }, 1951 [NFTA_SET_FLAGS] = { .type = NLA_U32 },
1951 [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, 1952 [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 },
1952 [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, 1953 [NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 9a8e77e7f8d4..f4e833005320 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -54,7 +54,8 @@ static struct xt_match cgroup_mt_reg __read_mostly = {
54 .matchsize = sizeof(struct xt_cgroup_info), 54 .matchsize = sizeof(struct xt_cgroup_info),
55 .me = THIS_MODULE, 55 .me = THIS_MODULE,
56 .hooks = (1 << NF_INET_LOCAL_OUT) | 56 .hooks = (1 << NF_INET_LOCAL_OUT) |
57 (1 << NF_INET_POST_ROUTING), 57 (1 << NF_INET_POST_ROUTING) |
58 (1 << NF_INET_LOCAL_IN),
58}; 59};
59 60
60static int __init cgroup_mt_init(void) 61static int __init cgroup_mt_init(void)
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 458464e7bd7a..fbc66bb250d5 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -32,8 +32,14 @@
32#include <net/netfilter/nf_conntrack_tuple.h> 32#include <net/netfilter/nf_conntrack_tuple.h>
33#include <net/netfilter/nf_conntrack_zones.h> 33#include <net/netfilter/nf_conntrack_zones.h>
34 34
35#define CONNLIMIT_SLOTS 32 35#define CONNLIMIT_SLOTS 256U
36#define CONNLIMIT_LOCK_SLOTS 32 36
37#ifdef CONFIG_LOCKDEP
38#define CONNLIMIT_LOCK_SLOTS 8U
39#else
40#define CONNLIMIT_LOCK_SLOTS 256U
41#endif
42
37#define CONNLIMIT_GC_MAX_NODES 8 43#define CONNLIMIT_GC_MAX_NODES 8
38 44
39/* we will save the tuples of all connections we care about */ 45/* we will save the tuples of all connections we care about */
@@ -49,10 +55,11 @@ struct xt_connlimit_rb {
49 union nf_inet_addr addr; /* search key */ 55 union nf_inet_addr addr; /* search key */
50}; 56};
51 57
58static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp;
59
52struct xt_connlimit_data { 60struct xt_connlimit_data {
53 struct rb_root climit_root4[CONNLIMIT_SLOTS]; 61 struct rb_root climit_root4[CONNLIMIT_SLOTS];
54 struct rb_root climit_root6[CONNLIMIT_SLOTS]; 62 struct rb_root climit_root6[CONNLIMIT_SLOTS];
55 spinlock_t locks[CONNLIMIT_LOCK_SLOTS];
56}; 63};
57 64
58static u_int32_t connlimit_rnd __read_mostly; 65static u_int32_t connlimit_rnd __read_mostly;
@@ -297,11 +304,11 @@ static int count_them(struct net *net,
297 root = &data->climit_root4[hash]; 304 root = &data->climit_root4[hash];
298 } 305 }
299 306
300 spin_lock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]); 307 spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
301 308
302 count = count_tree(net, root, tuple, addr, mask, family); 309 count = count_tree(net, root, tuple, addr, mask, family);
303 310
304 spin_unlock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]); 311 spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
305 312
306 return count; 313 return count;
307} 314}
@@ -377,9 +384,6 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
377 return -ENOMEM; 384 return -ENOMEM;
378 } 385 }
379 386
380 for (i = 0; i < ARRAY_SIZE(info->data->locks); ++i)
381 spin_lock_init(&info->data->locks[i]);
382
383 for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) 387 for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
384 info->data->climit_root4[i] = RB_ROOT; 388 info->data->climit_root4[i] = RB_ROOT;
385 for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) 389 for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
@@ -435,11 +439,14 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
435 439
436static int __init connlimit_mt_init(void) 440static int __init connlimit_mt_init(void)
437{ 441{
438 int ret; 442 int ret, i;
439 443
440 BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS); 444 BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS);
441 BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0); 445 BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0);
442 446
447 for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i)
448 spin_lock_init(&xt_connlimit_locks[i]);
449
443 connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn", 450 connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn",
444 sizeof(struct xt_connlimit_conn), 451 sizeof(struct xt_connlimit_conn),
445 0, 0, NULL); 452 0, 0, NULL);
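
Moving the spinlocks out of xt_connlimit_data into a fixed global array means several rbtree slots share one lock: with 256 slots and (under lockdep) 8 locks, each lock covers 32 trees, which is why the BUILD_BUG_ONs insist the slot count is a multiple of the lock count. A sketch of the slot-to-lock mapping; the hash shown is illustrative, the real code hashes the peer address:

	unsigned int hash = jhash_1word(key, connlimit_rnd) % CONNLIMIT_SLOTS;

	spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
	/* walk or update climit_root4[hash] / climit_root6[hash] here */
	spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
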
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 7174611bd672..c529161cdbf8 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -422,4 +422,6 @@ module_exit(xt_osf_fini);
422MODULE_LICENSE("GPL"); 422MODULE_LICENSE("GPL");
423MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); 423MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
424MODULE_DESCRIPTION("Passive OS fingerprint matching."); 424MODULE_DESCRIPTION("Passive OS fingerprint matching.");
425MODULE_ALIAS("ipt_osf");
426MODULE_ALIAS("ip6t_osf");
425MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); 427MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 01039d2b1695..72e0c71fb01d 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -261,7 +261,7 @@ static int packet_direct_xmit(struct sk_buff *skb)
261 local_bh_disable(); 261 local_bh_disable();
262 262
263 HARD_TX_LOCK(dev, txq, smp_processor_id()); 263 HARD_TX_LOCK(dev, txq, smp_processor_id());
264 if (!netif_xmit_frozen_or_stopped(txq)) { 264 if (!netif_xmit_frozen_or_drv_stopped(txq)) {
265 ret = ops->ndo_start_xmit(skb, dev); 265 ret = ops->ndo_start_xmit(skb, dev);
266 if (ret == NETDEV_TX_OK) 266 if (ret == NETDEV_TX_OK)
267 txq_trans_update(txq); 267 txq_trans_update(txq);
@@ -275,6 +275,7 @@ static int packet_direct_xmit(struct sk_buff *skb)
275 275
276 return ret; 276 return ret;
277drop: 277drop:
278 atomic_long_inc(&dev->tx_dropped);
278 kfree_skb(skb); 279 kfree_skb(skb);
279 return NET_XMIT_DROP; 280 return NET_XMIT_DROP;
280} 281}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 981aaf8b6ace..5f83a6a2fa67 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6593,6 +6593,40 @@ static void __sctp_write_space(struct sctp_association *asoc)
6593 } 6593 }
6594} 6594}
6595 6595
6596static void sctp_wake_up_waiters(struct sock *sk,
6597 struct sctp_association *asoc)
6598{
6599 struct sctp_association *tmp = asoc;
6600
6601 /* We do accounting for the sndbuf space per association,
6602 * so we only need to wake our own association.
6603 */
6604 if (asoc->ep->sndbuf_policy)
6605 return __sctp_write_space(asoc);
6606
6607 /* Accounting for the sndbuf space is per socket, so we
6608 * need to wake up others, try to be fair and in case of
6609 * other associations, let them have a go first instead
6610 * of just doing a sctp_write_space() call.
6611 *
6612 * Note that we reach sctp_wake_up_waiters() only when
6613 * associations free up queued chunks, thus we are under
6614 * lock and the list of associations on a socket is
6615 * guaranteed not to change.
6616 */
6617 for (tmp = list_next_entry(tmp, asocs); 1;
6618 tmp = list_next_entry(tmp, asocs)) {
6619 /* Manually skip the head element. */
6620 if (&tmp->asocs == &((sctp_sk(sk))->ep->asocs))
6621 continue;
6622 /* Wake up association. */
6623 __sctp_write_space(tmp);
6624 /* We've reached the end. */
6625 if (tmp == asoc)
6626 break;
6627 }
6628}
6629
6596/* Do accounting for the sndbuf space. 6630/* Do accounting for the sndbuf space.
6597 * Decrement the used sndbuf space of the corresponding association by the 6631 * Decrement the used sndbuf space of the corresponding association by the
6598 * data size which was just transmitted(freed). 6632 * data size which was just transmitted(freed).
@@ -6620,7 +6654,7 @@ static void sctp_wfree(struct sk_buff *skb)
6620 sk_mem_uncharge(sk, skb->truesize); 6654 sk_mem_uncharge(sk, skb->truesize);
6621 6655
6622 sock_wfree(skb); 6656 sock_wfree(skb);
6623 __sctp_write_space(asoc); 6657 sctp_wake_up_waiters(sk, asoc);
6624 6658
6625 sctp_association_put(asoc); 6659 sctp_association_put(asoc);
6626} 6660}
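
sctp_wake_up_waiters() walks the socket's association list starting just after asoc, skips the list head, and only wakes asoc itself when the loop comes back around to it, so the other associations get a turn first. The same round-robin traversal as a generic sketch, with hypothetical struct and helper names:

	struct item *it = start;

	for (it = list_next_entry(it, node); ; it = list_next_entry(it, node)) {
		if (&it->node == head)	/* skip the list head element */
			continue;
		visit(it);		/* hypothetical per-item work */
		if (it == start)	/* back at the starting entry: done */
			break;
	}
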
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 241b54f30204..0754d0f466d2 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -9,19 +9,6 @@ config SUNRPC_BACKCHANNEL
9 bool 9 bool
10 depends on SUNRPC 10 depends on SUNRPC
11 11
12config SUNRPC_XPRT_RDMA
13 tristate
14 depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
15 default SUNRPC && INFINIBAND
16 help
17 This option allows the NFS client and server to support
18 an RDMA-enabled transport.
19
20 To compile RPC client RDMA transport support as a module,
21 choose M here: the module will be called xprtrdma.
22
23 If unsure, say N.
24
25config SUNRPC_SWAP 12config SUNRPC_SWAP
26 bool 13 bool
27 depends on SUNRPC 14 depends on SUNRPC
@@ -57,3 +44,29 @@ config SUNRPC_DEBUG
57 but makes troubleshooting NFS issues significantly harder. 44 but makes troubleshooting NFS issues significantly harder.
58 45
59 If unsure, say Y. 46 If unsure, say Y.
47
48config SUNRPC_XPRT_RDMA_CLIENT
49 tristate "RPC over RDMA Client Support"
50 depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
51 default SUNRPC && INFINIBAND
52 help
53 This option allows the NFS client to support an RDMA-enabled
54 transport.
55
56 To compile RPC client RDMA transport support as a module,
57 choose M here: the module will be called xprtrdma.
58
59 If unsure, say N.
60
61config SUNRPC_XPRT_RDMA_SERVER
62 tristate "RPC over RDMA Server Support"
63 depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
64 default SUNRPC && INFINIBAND
65 help
66 This option allows the NFS server to support an RDMA-enabled
67 transport.
68
69 To compile RPC server RDMA transport support as a module,
70 choose M here: the module will be called svcrdma.
71
72 If unsure, say N.
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 8209a0411bca..e5a7a1cac8f3 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -5,7 +5,8 @@
5 5
6obj-$(CONFIG_SUNRPC) += sunrpc.o 6obj-$(CONFIG_SUNRPC) += sunrpc.o
7obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ 7obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
8obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ 8
9obj-y += xprtrdma/
9 10
10sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ 11sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
11 auth.o auth_null.o auth_unix.o auth_generic.o \ 12 auth.o auth_null.o auth_unix.o auth_generic.o \
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index e860d4f7ed2a..3513d559bc45 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -212,39 +212,23 @@ out:
212} 212}
213EXPORT_SYMBOL_GPL(xprt_destroy_backchannel); 213EXPORT_SYMBOL_GPL(xprt_destroy_backchannel);
214 214
215/* 215static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
216 * One or more rpc_rqst structure have been preallocated during the
217 * backchannel setup. Buffer space for the send and private XDR buffers
218 * has been preallocated as well. Use xprt_alloc_bc_request to allocate
219 * to this request. Use xprt_free_bc_request to return it.
220 *
221 * We know that we're called in soft interrupt context, grab the spin_lock
222 * since there is no need to grab the bottom half spin_lock.
223 *
224 * Return an available rpc_rqst, otherwise NULL if non are available.
225 */
226struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt)
227{ 216{
228 struct rpc_rqst *req; 217 struct rpc_rqst *req = NULL;
229 218
230 dprintk("RPC: allocate a backchannel request\n"); 219 dprintk("RPC: allocate a backchannel request\n");
231 spin_lock(&xprt->bc_pa_lock); 220 if (list_empty(&xprt->bc_pa_list))
232 if (!list_empty(&xprt->bc_pa_list)) { 221 goto not_found;
233 req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
234 rq_bc_pa_list);
235 list_del(&req->rq_bc_pa_list);
236 } else {
237 req = NULL;
238 }
239 spin_unlock(&xprt->bc_pa_lock);
240 222
241 if (req != NULL) { 223 req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
242 set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); 224 rq_bc_pa_list);
243 req->rq_reply_bytes_recvd = 0; 225 req->rq_reply_bytes_recvd = 0;
244 req->rq_bytes_sent = 0; 226 req->rq_bytes_sent = 0;
245 memcpy(&req->rq_private_buf, &req->rq_rcv_buf, 227 memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
246 sizeof(req->rq_private_buf)); 228 sizeof(req->rq_private_buf));
247 } 229 req->rq_xid = xid;
230 req->rq_connect_cookie = xprt->connect_cookie;
231not_found:
248 dprintk("RPC: backchannel req=%p\n", req); 232 dprintk("RPC: backchannel req=%p\n", req);
249 return req; 233 return req;
250} 234}
@@ -259,6 +243,7 @@ void xprt_free_bc_request(struct rpc_rqst *req)
259 243
260 dprintk("RPC: free backchannel req=%p\n", req); 244 dprintk("RPC: free backchannel req=%p\n", req);
261 245
246 req->rq_connect_cookie = xprt->connect_cookie - 1;
262 smp_mb__before_clear_bit(); 247 smp_mb__before_clear_bit();
263 WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); 248 WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
264 clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); 249 clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
@@ -281,7 +266,57 @@ void xprt_free_bc_request(struct rpc_rqst *req)
281 * may be reused by a new callback request. 266 * may be reused by a new callback request.
282 */ 267 */
283 spin_lock_bh(&xprt->bc_pa_lock); 268 spin_lock_bh(&xprt->bc_pa_lock);
284 list_add(&req->rq_bc_pa_list, &xprt->bc_pa_list); 269 list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
285 spin_unlock_bh(&xprt->bc_pa_lock); 270 spin_unlock_bh(&xprt->bc_pa_lock);
286} 271}
287 272
273/*
274 * One or more rpc_rqst structure have been preallocated during the
275 * backchannel setup. Buffer space for the send and private XDR buffers
276 * has been preallocated as well. Use xprt_alloc_bc_request to allocate
277 * to this request. Use xprt_free_bc_request to return it.
278 *
279 * We know that we're called in soft interrupt context, grab the spin_lock
280 * since there is no need to grab the bottom half spin_lock.
281 *
 282 * Return an available rpc_rqst, otherwise NULL if none are available.
283 */
284struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid)
285{
286 struct rpc_rqst *req;
287
288 spin_lock(&xprt->bc_pa_lock);
289 list_for_each_entry(req, &xprt->bc_pa_list, rq_bc_pa_list) {
290 if (req->rq_connect_cookie != xprt->connect_cookie)
291 continue;
292 if (req->rq_xid == xid)
293 goto found;
294 }
295 req = xprt_alloc_bc_request(xprt, xid);
296found:
297 spin_unlock(&xprt->bc_pa_lock);
298 return req;
299}
300
301/*
302 * Add callback request to callback list. The callback
303 * service sleeps on the sv_cb_waitq waiting for new
 304 * requests. Wake it up after enqueueing the
305 * request.
306 */
307void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
308{
309 struct rpc_xprt *xprt = req->rq_xprt;
310 struct svc_serv *bc_serv = xprt->bc_serv;
311
312 req->rq_private_buf.len = copied;
313 set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
314
315 dprintk("RPC: add callback request to list\n");
316 spin_lock(&bc_serv->sv_cb_lock);
317 list_del(&req->rq_bc_pa_list);
318 list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
319 wake_up(&bc_serv->sv_cb_waitq);
320 spin_unlock(&bc_serv->sv_cb_lock);
321}
322
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0edada973434..2e6ab10734f6 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -438,6 +438,38 @@ out_no_rpciod:
438 return ERR_PTR(err); 438 return ERR_PTR(err);
439} 439}
440 440
441struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
442 struct rpc_xprt *xprt)
443{
444 struct rpc_clnt *clnt = NULL;
445
446 clnt = rpc_new_client(args, xprt, NULL);
447 if (IS_ERR(clnt))
448 return clnt;
449
450 if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
451 int err = rpc_ping(clnt);
452 if (err != 0) {
453 rpc_shutdown_client(clnt);
454 return ERR_PTR(err);
455 }
456 }
457
458 clnt->cl_softrtry = 1;
459 if (args->flags & RPC_CLNT_CREATE_HARDRTRY)
460 clnt->cl_softrtry = 0;
461
462 if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
463 clnt->cl_autobind = 1;
464 if (args->flags & RPC_CLNT_CREATE_DISCRTRY)
465 clnt->cl_discrtry = 1;
466 if (!(args->flags & RPC_CLNT_CREATE_QUIET))
467 clnt->cl_chatty = 1;
468
469 return clnt;
470}
471EXPORT_SYMBOL_GPL(rpc_create_xprt);
472
441/** 473/**
442 * rpc_create - create an RPC client and transport with one call 474 * rpc_create - create an RPC client and transport with one call
443 * @args: rpc_clnt create argument structure 475 * @args: rpc_clnt create argument structure
@@ -451,7 +483,6 @@ out_no_rpciod:
451struct rpc_clnt *rpc_create(struct rpc_create_args *args) 483struct rpc_clnt *rpc_create(struct rpc_create_args *args)
452{ 484{
453 struct rpc_xprt *xprt; 485 struct rpc_xprt *xprt;
454 struct rpc_clnt *clnt;
455 struct xprt_create xprtargs = { 486 struct xprt_create xprtargs = {
456 .net = args->net, 487 .net = args->net,
457 .ident = args->protocol, 488 .ident = args->protocol,
@@ -515,30 +546,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
515 if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT) 546 if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
516 xprt->resvport = 0; 547 xprt->resvport = 0;
517 548
518 clnt = rpc_new_client(args, xprt, NULL); 549 return rpc_create_xprt(args, xprt);
519 if (IS_ERR(clnt))
520 return clnt;
521
522 if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
523 int err = rpc_ping(clnt);
524 if (err != 0) {
525 rpc_shutdown_client(clnt);
526 return ERR_PTR(err);
527 }
528 }
529
530 clnt->cl_softrtry = 1;
531 if (args->flags & RPC_CLNT_CREATE_HARDRTRY)
532 clnt->cl_softrtry = 0;
533
534 if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
535 clnt->cl_autobind = 1;
536 if (args->flags & RPC_CLNT_CREATE_DISCRTRY)
537 clnt->cl_discrtry = 1;
538 if (!(args->flags & RPC_CLNT_CREATE_QUIET))
539 clnt->cl_chatty = 1;
540
541 return clnt;
542} 550}
543EXPORT_SYMBOL_GPL(rpc_create); 551EXPORT_SYMBOL_GPL(rpc_create);
544 552
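
Factoring the tail of rpc_create() into rpc_create_xprt() lets a caller that already owns an rpc_xprt attach a client to it and get the same ping and flag handling. A hedged usage sketch; where the transport comes from, and the argument values, are assumptions for illustration only:

	struct rpc_create_args args = {
		.net		= net,
		.program	= program,
		.version	= vers,
		.authflavor	= RPC_AUTH_UNIX,
		.flags		= RPC_CLNT_CREATE_NOPING,
	};
	struct rpc_clnt *clnt;

	clnt = rpc_create_xprt(&args, xprt);	/* xprt obtained elsewhere */
	if (IS_ERR(clnt))
		return PTR_ERR(clnt);
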
@@ -1363,6 +1371,7 @@ rpc_restart_call_prepare(struct rpc_task *task)
1363 if (RPC_ASSASSINATED(task)) 1371 if (RPC_ASSASSINATED(task))
1364 return 0; 1372 return 0;
1365 task->tk_action = call_start; 1373 task->tk_action = call_start;
1374 task->tk_status = 0;
1366 if (task->tk_ops->rpc_call_prepare != NULL) 1375 if (task->tk_ops->rpc_call_prepare != NULL)
1367 task->tk_action = rpc_prepare_task; 1376 task->tk_action = rpc_prepare_task;
1368 return 1; 1377 return 1;
@@ -1379,6 +1388,7 @@ rpc_restart_call(struct rpc_task *task)
1379 if (RPC_ASSASSINATED(task)) 1388 if (RPC_ASSASSINATED(task))
1380 return 0; 1389 return 0;
1381 task->tk_action = call_start; 1390 task->tk_action = call_start;
1391 task->tk_status = 0;
1382 return 1; 1392 return 1;
1383} 1393}
1384EXPORT_SYMBOL_GPL(rpc_restart_call); 1394EXPORT_SYMBOL_GPL(rpc_restart_call);
@@ -1728,9 +1738,7 @@ call_bind_status(struct rpc_task *task)
1728 case -EPROTONOSUPPORT: 1738 case -EPROTONOSUPPORT:
1729 dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n", 1739 dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n",
1730 task->tk_pid); 1740 task->tk_pid);
1731 task->tk_status = 0; 1741 goto retry_timeout;
1732 task->tk_action = call_bind;
1733 return;
1734 case -ECONNREFUSED: /* connection problems */ 1742 case -ECONNREFUSED: /* connection problems */
1735 case -ECONNRESET: 1743 case -ECONNRESET:
1736 case -ECONNABORTED: 1744 case -ECONNABORTED:
@@ -1756,6 +1764,7 @@ call_bind_status(struct rpc_task *task)
1756 return; 1764 return;
1757 1765
1758retry_timeout: 1766retry_timeout:
1767 task->tk_status = 0;
1759 task->tk_action = call_timeout; 1768 task->tk_action = call_timeout;
1760} 1769}
1761 1770
@@ -1798,21 +1807,19 @@ call_connect_status(struct rpc_task *task)
1798 trace_rpc_connect_status(task, status); 1807 trace_rpc_connect_status(task, status);
1799 task->tk_status = 0; 1808 task->tk_status = 0;
1800 switch (status) { 1809 switch (status) {
1801 /* if soft mounted, test if we've timed out */
1802 case -ETIMEDOUT:
1803 task->tk_action = call_timeout;
1804 return;
1805 case -ECONNREFUSED: 1810 case -ECONNREFUSED:
1806 case -ECONNRESET: 1811 case -ECONNRESET:
1807 case -ECONNABORTED: 1812 case -ECONNABORTED:
1808 case -ENETUNREACH: 1813 case -ENETUNREACH:
1809 case -EHOSTUNREACH: 1814 case -EHOSTUNREACH:
1810 /* retry with existing socket, after a delay */
1811 rpc_delay(task, 3*HZ);
1812 if (RPC_IS_SOFTCONN(task)) 1815 if (RPC_IS_SOFTCONN(task))
1813 break; 1816 break;
1817 /* retry with existing socket, after a delay */
1818 rpc_delay(task, 3*HZ);
1814 case -EAGAIN: 1819 case -EAGAIN:
1815 task->tk_action = call_bind; 1820 /* Check for timeouts before looping back to call_bind */
1821 case -ETIMEDOUT:
1822 task->tk_action = call_timeout;
1816 return; 1823 return;
1817 case 0: 1824 case 0:
1818 clnt->cl_stats->netreconn++; 1825 clnt->cl_stats->netreconn++;
@@ -2007,6 +2014,10 @@ call_status(struct rpc_task *task)
2007 case -EHOSTDOWN: 2014 case -EHOSTDOWN:
2008 case -EHOSTUNREACH: 2015 case -EHOSTUNREACH:
2009 case -ENETUNREACH: 2016 case -ENETUNREACH:
2017 if (RPC_IS_SOFTCONN(task)) {
2018 rpc_exit(task, status);
2019 break;
2020 }
2010 /* 2021 /*
2011 * Delay any retries for 3 seconds, then handle as if it 2022 * Delay any retries for 3 seconds, then handle as if it
2012 * were a timeout. 2023 * were a timeout.
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index ff3cc4bf4b24..25578afe1548 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -637,7 +637,8 @@ static void __rpc_queue_timer_fn(unsigned long ptr)
637 637
638static void __rpc_atrun(struct rpc_task *task) 638static void __rpc_atrun(struct rpc_task *task)
639{ 639{
640 task->tk_status = 0; 640 if (task->tk_status == -ETIMEDOUT)
641 task->tk_status = 0;
641} 642}
642 643
643/* 644/*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index b6e59f0a9475..d06cb8752dcd 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1397,6 +1397,22 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1397 return svsk; 1397 return svsk;
1398} 1398}
1399 1399
1400bool svc_alien_sock(struct net *net, int fd)
1401{
1402 int err;
1403 struct socket *sock = sockfd_lookup(fd, &err);
1404 bool ret = false;
1405
1406 if (!sock)
1407 goto out;
1408 if (sock_net(sock->sk) != net)
1409 ret = true;
1410 sockfd_put(sock);
1411out:
1412 return ret;
1413}
1414EXPORT_SYMBOL_GPL(svc_alien_sock);
1415
1400/** 1416/**
1401 * svc_addsock - add a listener socket to an RPC service 1417 * svc_addsock - add a listener socket to an RPC service
1402 * @serv: pointer to RPC service to which to add a new listener 1418 * @serv: pointer to RPC service to which to add a new listener
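
svc_alien_sock() tells the caller whether a user-supplied fd refers to a socket created in a different network namespace than the one the RPC service runs in. A minimal caller sketch; rejecting with -EINVAL and the name buffer are assumptions, not dictated by the helper:

	char name[64];

	if (svc_alien_sock(net, fd))
		return -EINVAL;		/* socket belongs to another netns */

	err = svc_addsock(serv, fd, name, sizeof(name));
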
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 1504bb11e4f3..dd97ba3c4456 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -833,8 +833,20 @@ xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
833} 833}
834EXPORT_SYMBOL_GPL(xdr_buf_from_iov); 834EXPORT_SYMBOL_GPL(xdr_buf_from_iov);
835 835
836/* Sets subbuf to the portion of buf of length len beginning base bytes 836/**
837 * from the start of buf. Returns -1 if base of length are out of bounds. */ 837 * xdr_buf_subsegment - set subbuf to a portion of buf
838 * @buf: an xdr buffer
839 * @subbuf: the result buffer
840 * @base: beginning of range in bytes
841 * @len: length of range in bytes
842 *
843 * sets @subbuf to an xdr buffer representing the portion of @buf of
844 * length @len starting at offset @base.
845 *
846 * @buf and @subbuf may be pointers to the same struct xdr_buf.
847 *
 848 * Returns -1 if base or length are out of bounds.
849 */
838int 850int
839xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, 851xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
840 unsigned int base, unsigned int len) 852 unsigned int base, unsigned int len)
@@ -847,9 +859,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
847 len -= subbuf->head[0].iov_len; 859 len -= subbuf->head[0].iov_len;
848 base = 0; 860 base = 0;
849 } else { 861 } else {
850 subbuf->head[0].iov_base = NULL;
851 subbuf->head[0].iov_len = 0;
852 base -= buf->head[0].iov_len; 862 base -= buf->head[0].iov_len;
863 subbuf->head[0].iov_len = 0;
853 } 864 }
854 865
855 if (base < buf->page_len) { 866 if (base < buf->page_len) {
@@ -871,9 +882,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
871 len -= subbuf->tail[0].iov_len; 882 len -= subbuf->tail[0].iov_len;
872 base = 0; 883 base = 0;
873 } else { 884 } else {
874 subbuf->tail[0].iov_base = NULL;
875 subbuf->tail[0].iov_len = 0;
876 base -= buf->tail[0].iov_len; 885 base -= buf->tail[0].iov_len;
886 subbuf->tail[0].iov_len = 0;
877 } 887 }
878 888
879 if (base || len) 889 if (base || len)
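
With the kernel-doc above, a typical use of xdr_buf_subsegment() is carving a fixed range out of a received buffer; @buf and @subbuf may even point to the same xdr_buf. A short sketch, where taking the range from rqstp->rq_arg and the offset values are illustrative assumptions:

	struct xdr_buf sub;

	/* expose bytes [base, base + len) of the argument buffer */
	if (xdr_buf_subsegment(&rqstp->rq_arg, &sub, base, len) < 0)
		return -EINVAL;		/* requested range is out of bounds */
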
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 7d4df99f761f..d173f79947c6 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1383,15 +1383,3 @@ void xprt_put(struct rpc_xprt *xprt)
1383 if (atomic_dec_and_test(&xprt->count)) 1383 if (atomic_dec_and_test(&xprt->count))
1384 xprt_destroy(xprt); 1384 xprt_destroy(xprt);
1385} 1385}
1386
1387/**
1388 * xprt_get - return a reference to an RPC transport.
1389 * @xprt: pointer to the transport
1390 *
1391 */
1392struct rpc_xprt *xprt_get(struct rpc_xprt *xprt)
1393{
1394 if (atomic_inc_not_zero(&xprt->count))
1395 return xprt;
1396 return NULL;
1397}
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 5a8f268bdd30..da5136fd5694 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,8 +1,8 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o 1obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
2 2
3xprtrdma-y := transport.o rpc_rdma.o verbs.o 3xprtrdma-y := transport.o rpc_rdma.o verbs.o
4 4
5obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o 5obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
6 6
7svcrdma-y := svc_rdma.o svc_rdma_transport.o \ 7svcrdma-y := svc_rdma.o svc_rdma_transport.o \
8 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o 8 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e03725bfe2b8..96ead526b125 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -649,9 +649,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
649 break; 649 break;
650 page_base = 0; 650 page_base = 0;
651 } 651 }
652 rqst->rq_rcv_buf.page_len = olen - copy_len; 652 }
653 } else
654 rqst->rq_rcv_buf.page_len = 0;
655 653
656 if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { 654 if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
657 curlen = copy_len; 655 curlen = copy_len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 0ce75524ed21..8d904e4eef15 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -90,6 +90,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
90 sge_no++; 90 sge_no++;
91 } 91 }
92 rqstp->rq_respages = &rqstp->rq_pages[sge_no]; 92 rqstp->rq_respages = &rqstp->rq_pages[sge_no];
93 rqstp->rq_next_page = rqstp->rq_respages + 1;
93 94
94 /* We should never run out of SGE because the limit is defined to 95 /* We should never run out of SGE because the limit is defined to
95 * support the max allowed RPC data length 96 * support the max allowed RPC data length
@@ -169,6 +170,7 @@ static int map_read_chunks(struct svcxprt_rdma *xprt,
169 */ 170 */
170 head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; 171 head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
171 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; 172 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
173 rqstp->rq_next_page = rqstp->rq_respages + 1;
172 174
173 byte_count -= sge_bytes; 175 byte_count -= sge_bytes;
174 ch_bytes -= sge_bytes; 176 ch_bytes -= sge_bytes;
@@ -276,6 +278,7 @@ static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
276 278
277 /* rq_respages points one past arg pages */ 279 /* rq_respages points one past arg pages */
278 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; 280 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
281 rqstp->rq_next_page = rqstp->rq_respages + 1;
279 282
280 /* Create the reply and chunk maps */ 283 /* Create the reply and chunk maps */
281 offset = 0; 284 offset = 0;
@@ -520,13 +523,6 @@ next_sge:
520 for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) 523 for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
521 rqstp->rq_pages[ch_no] = NULL; 524 rqstp->rq_pages[ch_no] = NULL;
522 525
523 /*
524 * Detach res pages. If svc_release sees any it will attempt to
525 * put them.
526 */
527 while (rqstp->rq_next_page != rqstp->rq_respages)
528 *(--rqstp->rq_next_page) = NULL;
529
530 return err; 526 return err;
531} 527}
532 528
@@ -550,7 +546,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
550 546
551 /* rq_respages starts after the last arg page */ 547 /* rq_respages starts after the last arg page */
552 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; 548 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
553 rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no]; 549 rqstp->rq_next_page = rqstp->rq_respages + 1;
554 550
555 /* Rebuild rq_arg head and tail. */ 551 /* Rebuild rq_arg head and tail. */
556 rqstp->rq_arg.head[0] = head->arg.head[0]; 552 rqstp->rq_arg.head[0] = head->arg.head[0];
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index c1d124dc772b..7e024a51617e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -265,6 +265,7 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
265 xdr_off -= xdr->head[0].iov_len; 265 xdr_off -= xdr->head[0].iov_len;
266 if (xdr_off < xdr->page_len) { 266 if (xdr_off < xdr->page_len) {
267 /* This offset is in the page list */ 267 /* This offset is in the page list */
268 xdr_off += xdr->page_base;
268 page = xdr->pages[xdr_off >> PAGE_SHIFT]; 269 page = xdr->pages[xdr_off >> PAGE_SHIFT];
269 xdr_off &= ~PAGE_MASK; 270 xdr_off &= ~PAGE_MASK;
270 } else { 271 } else {
@@ -625,6 +626,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
625 if (page_no+1 >= sge_no) 626 if (page_no+1 >= sge_no)
626 ctxt->sge[page_no+1].length = 0; 627 ctxt->sge[page_no+1].length = 0;
627 } 628 }
629 rqstp->rq_next_page = rqstp->rq_respages + 1;
628 BUG_ON(sge_no > rdma->sc_max_sge); 630 BUG_ON(sge_no > rdma->sc_max_sge);
629 memset(&send_wr, 0, sizeof send_wr); 631 memset(&send_wr, 0, sizeof send_wr);
630 ctxt->wr_op = IB_WR_SEND; 632 ctxt->wr_op = IB_WR_SEND;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 62e4f9bcc387..25688fa2207f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -477,8 +477,7 @@ struct page *svc_rdma_get_page(void)
477 477
478 while ((page = alloc_page(GFP_KERNEL)) == NULL) { 478 while ((page = alloc_page(GFP_KERNEL)) == NULL) {
479 /* If we can't get memory, wait a bit and try again */ 479 /* If we can't get memory, wait a bit and try again */
480 printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 " 480 printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n");
481 "jiffies.\n");
482 schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); 481 schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
483 } 482 }
484 return page; 483 return page;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 285dc0884115..1eb9c468d0c9 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -733,7 +733,7 @@ static void __exit xprt_rdma_cleanup(void)
733{ 733{
734 int rc; 734 int rc;
735 735
736 dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n"); 736 dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
737#ifdef RPC_DEBUG 737#ifdef RPC_DEBUG
738 if (sunrpc_table_header) { 738 if (sunrpc_table_header) {
739 unregister_sysctl_table(sunrpc_table_header); 739 unregister_sysctl_table(sunrpc_table_header);
@@ -755,14 +755,14 @@ static int __init xprt_rdma_init(void)
755 if (rc) 755 if (rc)
756 return rc; 756 return rc;
757 757
758 dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n"); 758 dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
759 759
760 dprintk(KERN_INFO "Defaults:\n"); 760 dprintk("Defaults:\n");
761 dprintk(KERN_INFO "\tSlots %d\n" 761 dprintk("\tSlots %d\n"
762 "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", 762 "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
763 xprt_rdma_slot_table_entries, 763 xprt_rdma_slot_table_entries,
764 xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); 764 xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
765 dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n", 765 dprintk("\tPadding %d\n\tMemreg %d\n",
766 xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); 766 xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
767 767
768#ifdef RPC_DEBUG 768#ifdef RPC_DEBUG
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 0addefca8e77..6735e1d1e9bb 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -909,6 +909,12 @@ static void xs_tcp_close(struct rpc_xprt *xprt)
909 xs_tcp_shutdown(xprt); 909 xs_tcp_shutdown(xprt);
910} 910}
911 911
912static void xs_xprt_free(struct rpc_xprt *xprt)
913{
914 xs_free_peer_addresses(xprt);
915 xprt_free(xprt);
916}
917
912/** 918/**
913 * xs_destroy - prepare to shutdown a transport 919 * xs_destroy - prepare to shutdown a transport
914 * @xprt: doomed transport 920 * @xprt: doomed transport
@@ -919,8 +925,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
919 dprintk("RPC: xs_destroy xprt %p\n", xprt); 925 dprintk("RPC: xs_destroy xprt %p\n", xprt);
920 926
921 xs_close(xprt); 927 xs_close(xprt);
922 xs_free_peer_addresses(xprt); 928 xs_xprt_free(xprt);
923 xprt_free(xprt);
924 module_put(THIS_MODULE); 929 module_put(THIS_MODULE);
925} 930}
926 931
@@ -1306,41 +1311,29 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
1306 * If we're unable to obtain the rpc_rqst we schedule the closing of the 1311 * If we're unable to obtain the rpc_rqst we schedule the closing of the
1307 * connection and return -1. 1312 * connection and return -1.
1308 */ 1313 */
1309static inline int xs_tcp_read_callback(struct rpc_xprt *xprt, 1314static int xs_tcp_read_callback(struct rpc_xprt *xprt,
1310 struct xdr_skb_reader *desc) 1315 struct xdr_skb_reader *desc)
1311{ 1316{
1312 struct sock_xprt *transport = 1317 struct sock_xprt *transport =
1313 container_of(xprt, struct sock_xprt, xprt); 1318 container_of(xprt, struct sock_xprt, xprt);
1314 struct rpc_rqst *req; 1319 struct rpc_rqst *req;
1315 1320
1316 req = xprt_alloc_bc_request(xprt); 1321 /* Look up and lock the request corresponding to the given XID */
1322 spin_lock(&xprt->transport_lock);
1323 req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
1317 if (req == NULL) { 1324 if (req == NULL) {
1325 spin_unlock(&xprt->transport_lock);
1318 printk(KERN_WARNING "Callback slot table overflowed\n"); 1326 printk(KERN_WARNING "Callback slot table overflowed\n");
1319 xprt_force_disconnect(xprt); 1327 xprt_force_disconnect(xprt);
1320 return -1; 1328 return -1;
1321 } 1329 }
1322 1330
1323 req->rq_xid = transport->tcp_xid;
1324 dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid)); 1331 dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid));
1325 xs_tcp_read_common(xprt, desc, req); 1332 xs_tcp_read_common(xprt, desc, req);
1326 1333
1327 if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) { 1334 if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
1328 struct svc_serv *bc_serv = xprt->bc_serv; 1335 xprt_complete_bc_request(req, transport->tcp_copied);
1329 1336 spin_unlock(&xprt->transport_lock);
1330 /*
1331 * Add callback request to callback list. The callback
1332 * service sleeps on the sv_cb_waitq waiting for new
1333 * requests. Wake it up after adding enqueing the
1334 * request.
1335 */
1336 dprintk("RPC: add callback request to list\n");
1337 spin_lock(&bc_serv->sv_cb_lock);
1338 list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
1339 spin_unlock(&bc_serv->sv_cb_lock);
1340 wake_up(&bc_serv->sv_cb_waitq);
1341 }
1342
1343 req->rq_private_buf.len = transport->tcp_copied;
1344 1337
1345 return 0; 1338 return 0;
1346} 1339}
@@ -2544,6 +2537,10 @@ static void bc_close(struct rpc_xprt *xprt)
2544 2537
2545static void bc_destroy(struct rpc_xprt *xprt) 2538static void bc_destroy(struct rpc_xprt *xprt)
2546{ 2539{
2540 dprintk("RPC: bc_destroy xprt %p\n", xprt);
2541
2542 xs_xprt_free(xprt);
2543 module_put(THIS_MODULE);
2547} 2544}
2548 2545
2549static struct rpc_xprt_ops xs_local_ops = { 2546static struct rpc_xprt_ops xs_local_ops = {
@@ -2744,7 +2741,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
2744 return xprt; 2741 return xprt;
2745 ret = ERR_PTR(-EINVAL); 2742 ret = ERR_PTR(-EINVAL);
2746out_err: 2743out_err:
2747 xprt_free(xprt); 2744 xs_xprt_free(xprt);
2748 return ret; 2745 return ret;
2749} 2746}
2750 2747
@@ -2822,7 +2819,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
2822 return xprt; 2819 return xprt;
2823 ret = ERR_PTR(-EINVAL); 2820 ret = ERR_PTR(-EINVAL);
2824out_err: 2821out_err:
2825 xprt_free(xprt); 2822 xs_xprt_free(xprt);
2826 return ret; 2823 return ret;
2827} 2824}
2828 2825
@@ -2897,12 +2894,11 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
2897 xprt->address_strings[RPC_DISPLAY_ADDR], 2894 xprt->address_strings[RPC_DISPLAY_ADDR],
2898 xprt->address_strings[RPC_DISPLAY_PROTO]); 2895 xprt->address_strings[RPC_DISPLAY_PROTO]);
2899 2896
2900
2901 if (try_module_get(THIS_MODULE)) 2897 if (try_module_get(THIS_MODULE))
2902 return xprt; 2898 return xprt;
2903 ret = ERR_PTR(-EINVAL); 2899 ret = ERR_PTR(-EINVAL);
2904out_err: 2900out_err:
2905 xprt_free(xprt); 2901 xs_xprt_free(xprt);
2906 return ret; 2902 return ret;
2907} 2903}
2908 2904
@@ -2919,15 +2915,6 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
2919 struct svc_sock *bc_sock; 2915 struct svc_sock *bc_sock;
2920 struct rpc_xprt *ret; 2916 struct rpc_xprt *ret;
2921 2917
2922 if (args->bc_xprt->xpt_bc_xprt) {
2923 /*
2924 * This server connection already has a backchannel
2925 * transport; we can't create a new one, as we wouldn't
2926 * be able to match replies based on xid any more. So,
2927 * reuse the already-existing one:
2928 */
2929 return args->bc_xprt->xpt_bc_xprt;
2930 }
2931 xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, 2918 xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries,
2932 xprt_tcp_slot_table_entries); 2919 xprt_tcp_slot_table_entries);
2933 if (IS_ERR(xprt)) 2920 if (IS_ERR(xprt))
@@ -2985,13 +2972,14 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
2985 */ 2972 */
2986 xprt_set_connected(xprt); 2973 xprt_set_connected(xprt);
2987 2974
2988
2989 if (try_module_get(THIS_MODULE)) 2975 if (try_module_get(THIS_MODULE))
2990 return xprt; 2976 return xprt;
2977
2978 args->bc_xprt->xpt_bc_xprt = NULL;
2991 xprt_put(xprt); 2979 xprt_put(xprt);
2992 ret = ERR_PTR(-EINVAL); 2980 ret = ERR_PTR(-EINVAL);
2993out_err: 2981out_err:
2994 xprt_free(xprt); 2982 xs_xprt_free(xprt);
2995 return ret; 2983 return ret;
2996} 2984}
2997 2985
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 0374a817631e..4c564eb69e1a 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -182,6 +182,8 @@ void tipc_net_start(u32 addr)
182 tipc_bclink_init(); 182 tipc_bclink_init();
183 write_unlock_bh(&tipc_net_lock); 183 write_unlock_bh(&tipc_net_lock);
184 184
185 tipc_nametbl_publish(TIPC_CFG_SRV, tipc_own_addr, tipc_own_addr,
186 TIPC_ZONE_SCOPE, 0, tipc_own_addr);
185 pr_info("Started in network mode\n"); 187 pr_info("Started in network mode\n");
186 pr_info("Own node address %s, network identity %u\n", 188 pr_info("Own node address %s, network identity %u\n",
187 tipc_addr_string_fill(addr_string, tipc_own_addr), tipc_net_id); 189 tipc_addr_string_fill(addr_string, tipc_own_addr), tipc_net_id);
@@ -192,6 +194,7 @@ void tipc_net_stop(void)
192 if (!tipc_own_addr) 194 if (!tipc_own_addr)
193 return; 195 return;
194 196
197 tipc_nametbl_withdraw(TIPC_CFG_SRV, tipc_own_addr, 0, tipc_own_addr);
195 write_lock_bh(&tipc_net_lock); 198 write_lock_bh(&tipc_net_lock);
196 tipc_bearer_stop(); 199 tipc_bearer_stop();
197 tipc_bclink_stop(); 200 tipc_bclink_stop();
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 29b7f26a12cf..adc12e227303 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -301,7 +301,6 @@ static int tipc_release(struct socket *sock)
301 struct tipc_sock *tsk; 301 struct tipc_sock *tsk;
302 struct tipc_port *port; 302 struct tipc_port *port;
303 struct sk_buff *buf; 303 struct sk_buff *buf;
304 int res;
305 304
306 /* 305 /*
307 * Exit if socket isn't fully initialized (occurs when a failed accept() 306 * Exit if socket isn't fully initialized (occurs when a failed accept()
@@ -349,7 +348,7 @@ static int tipc_release(struct socket *sock)
349 sock_put(sk); 348 sock_put(sk);
350 sock->sk = NULL; 349 sock->sk = NULL;
351 350
352 return res; 351 return 0;
353} 352}
354 353
355/** 354/**