Diffstat (limited to 'net')
-rw-r--r--  net/bridge/netfilter/ebtables.c            |    5
-rw-r--r--  net/ceph/crush/mapper.c                    |   85
-rw-r--r--  net/ceph/debugfs.c                         |   55
-rw-r--r--  net/ceph/messenger.c                       |    6
-rw-r--r--  net/ceph/osd_client.c                      |   41
-rw-r--r--  net/ceph/osdmap.c                          |  993
-rw-r--r--  net/core/dev.c                             |   14
-rw-r--r--  net/core/ethtool.c                         |    1
-rw-r--r--  net/core/filter.c                          |   32
-rw-r--r--  net/core/flow.c                            |    8
-rw-r--r--  net/core/pktgen.c                          |    2
-rw-r--r--  net/ipv4/netfilter/arp_tables.c            |    6
-rw-r--r--  net/ipv4/netfilter/ip_tables.c             |    6
-rw-r--r--  net/ipv4/route.c                           |    2
-rw-r--r--  net/ipv6/netfilter/ip6_tables.c            |    6
-rw-r--r--  net/iucv/iucv.c                            |  127
-rw-r--r--  net/mac802154/mib.c                        |    1
-rw-r--r--  net/netfilter/nf_tables_api.c              |    7
-rw-r--r--  net/netfilter/xt_cgroup.c                  |    3
-rw-r--r--  net/netfilter/xt_connlimit.c               |   25
-rw-r--r--  net/netfilter/xt_osf.c                     |    2
-rw-r--r--  net/packet/af_packet.c                     |    3
-rw-r--r--  net/sctp/socket.c                          |   36
-rw-r--r--  net/sunrpc/Kconfig                         |   39
-rw-r--r--  net/sunrpc/Makefile                        |    3
-rw-r--r--  net/sunrpc/backchannel_rqst.c              |   93
-rw-r--r--  net/sunrpc/clnt.c                          |   81
-rw-r--r--  net/sunrpc/sched.c                         |    3
-rw-r--r--  net/sunrpc/svcsock.c                       |   16
-rw-r--r--  net/sunrpc/xdr.c                           |   22
-rw-r--r--  net/sunrpc/xprt.c                          |   12
-rw-r--r--  net/sunrpc/xprtrdma/Makefile               |    4
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c             |    4
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_recvfrom.c    |   12
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_sendto.c      |    2
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_transport.c   |    3
-rw-r--r--  net/sunrpc/xprtrdma/transport.c            |   10
-rw-r--r--  net/sunrpc/xprtsock.c                      |   62
-rw-r--r--  net/tipc/net.c                             |    3
-rw-r--r--  net/tipc/socket.c                          |    3
40 files changed, 1251 insertions, 587 deletions
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 0e474b13463b..1059ed3bc255 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1044,10 +1044,9 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
1044 if (repl->num_counters && 1044 if (repl->num_counters &&
1045 copy_to_user(repl->counters, counterstmp, 1045 copy_to_user(repl->counters, counterstmp,
1046 repl->num_counters * sizeof(struct ebt_counter))) { 1046 repl->num_counters * sizeof(struct ebt_counter))) {
1047 ret = -EFAULT; 1047 /* Silent error, can't fail, new table is already in place */
1048 net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n");
1048 } 1049 }
1049 else
1050 ret = 0;
1051 1050
1052 /* decrease module count and free resources */ 1051 /* decrease module count and free resources */
1053 EBT_ENTRY_ITERATE(table->entries, table->entries_size, 1052 EBT_ENTRY_ITERATE(table->entries, table->entries_size,
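A minimal sketch of the pattern this hunk adopts, with an illustrative helper name that is not part of the patch: once the replacement table is live, a failure to copy the counters back to user space is logged with a ratelimited warning instead of being turned into an error return, because the kernel-side change can no longer be undone.

        /* illustrative only: report-but-continue after the commit point */
        static int copy_counters_back(struct ebt_replace *repl,
                                      struct ebt_counter *counterstmp)
        {
                if (repl->num_counters &&
                    copy_to_user(repl->counters, counterstmp,
                                 repl->num_counters * sizeof(struct ebt_counter)))
                        /* silent error: the new table is already in place */
                        net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n");
                return 0;
        }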
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index b703790b4e44..a1ef53c04415 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -292,10 +292,12 @@ static int is_out(const struct crush_map *map,
292 * @outpos: our position in that vector 292 * @outpos: our position in that vector
293 * @tries: number of attempts to make 293 * @tries: number of attempts to make
294 * @recurse_tries: number of attempts to have recursive chooseleaf make 294 * @recurse_tries: number of attempts to have recursive chooseleaf make
295 * @local_tries: localized retries 295 * @local_retries: localized retries
296 * @local_fallback_tries: localized fallback retries 296 * @local_fallback_retries: localized fallback retries
297 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose) 297 * @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
298 * @vary_r: pass r to recursive calls
298 * @out2: second output vector for leaf items (if @recurse_to_leaf) 299 * @out2: second output vector for leaf items (if @recurse_to_leaf)
300 * @parent_r: r value passed from the parent
299 */ 301 */
300static int crush_choose_firstn(const struct crush_map *map, 302static int crush_choose_firstn(const struct crush_map *map,
301 struct crush_bucket *bucket, 303 struct crush_bucket *bucket,
@@ -304,10 +306,12 @@ static int crush_choose_firstn(const struct crush_map *map,
304 int *out, int outpos, 306 int *out, int outpos,
305 unsigned int tries, 307 unsigned int tries,
306 unsigned int recurse_tries, 308 unsigned int recurse_tries,
307 unsigned int local_tries, 309 unsigned int local_retries,
308 unsigned int local_fallback_tries, 310 unsigned int local_fallback_retries,
309 int recurse_to_leaf, 311 int recurse_to_leaf,
310 int *out2) 312 unsigned int vary_r,
313 int *out2,
314 int parent_r)
311{ 315{
312 int rep; 316 int rep;
313 unsigned int ftotal, flocal; 317 unsigned int ftotal, flocal;
@@ -319,8 +323,11 @@ static int crush_choose_firstn(const struct crush_map *map,
319 int itemtype; 323 int itemtype;
320 int collide, reject; 324 int collide, reject;
321 325
322 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "", 326 dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
323 bucket->id, x, outpos, numrep); 327 recurse_to_leaf ? "_LEAF" : "",
328 bucket->id, x, outpos, numrep,
329 tries, recurse_tries, local_retries, local_fallback_retries,
330 parent_r);
324 331
325 for (rep = outpos; rep < numrep; rep++) { 332 for (rep = outpos; rep < numrep; rep++) {
326 /* keep trying until we get a non-out, non-colliding item */ 333 /* keep trying until we get a non-out, non-colliding item */
@@ -335,7 +342,7 @@ static int crush_choose_firstn(const struct crush_map *map,
335 do { 342 do {
336 collide = 0; 343 collide = 0;
337 retry_bucket = 0; 344 retry_bucket = 0;
338 r = rep; 345 r = rep + parent_r;
339 /* r' = r + f_total */ 346 /* r' = r + f_total */
340 r += ftotal; 347 r += ftotal;
341 348
@@ -344,9 +351,9 @@ static int crush_choose_firstn(const struct crush_map *map,
344 reject = 1; 351 reject = 1;
345 goto reject; 352 goto reject;
346 } 353 }
347 if (local_fallback_tries > 0 && 354 if (local_fallback_retries > 0 &&
348 flocal >= (in->size>>1) && 355 flocal >= (in->size>>1) &&
349 flocal > local_fallback_tries) 356 flocal > local_fallback_retries)
350 item = bucket_perm_choose(in, x, r); 357 item = bucket_perm_choose(in, x, r);
351 else 358 else
352 item = crush_bucket_choose(in, x, r); 359 item = crush_bucket_choose(in, x, r);
@@ -387,16 +394,23 @@ static int crush_choose_firstn(const struct crush_map *map,
387 reject = 0; 394 reject = 0;
388 if (!collide && recurse_to_leaf) { 395 if (!collide && recurse_to_leaf) {
389 if (item < 0) { 396 if (item < 0) {
397 int sub_r;
398 if (vary_r)
399 sub_r = r >> (vary_r-1);
400 else
401 sub_r = 0;
390 if (crush_choose_firstn(map, 402 if (crush_choose_firstn(map,
391 map->buckets[-1-item], 403 map->buckets[-1-item],
392 weight, weight_max, 404 weight, weight_max,
393 x, outpos+1, 0, 405 x, outpos+1, 0,
394 out2, outpos, 406 out2, outpos,
395 recurse_tries, 0, 407 recurse_tries, 0,
396 local_tries, 408 local_retries,
397 local_fallback_tries, 409 local_fallback_retries,
398 0, 410 0,
399 NULL) <= outpos) 411 vary_r,
412 NULL,
413 sub_r) <= outpos)
400 /* didn't get leaf */ 414 /* didn't get leaf */
401 reject = 1; 415 reject = 1;
402 } else { 416 } else {
@@ -420,14 +434,14 @@ reject:
420 ftotal++; 434 ftotal++;
421 flocal++; 435 flocal++;
422 436
423 if (collide && flocal <= local_tries) 437 if (collide && flocal <= local_retries)
424 /* retry locally a few times */ 438 /* retry locally a few times */
425 retry_bucket = 1; 439 retry_bucket = 1;
426 else if (local_fallback_tries > 0 && 440 else if (local_fallback_retries > 0 &&
427 flocal <= in->size + local_fallback_tries) 441 flocal <= in->size + local_fallback_retries)
428 /* exhaustive bucket search */ 442 /* exhaustive bucket search */
429 retry_bucket = 1; 443 retry_bucket = 1;
430 else if (ftotal <= tries) 444 else if (ftotal < tries)
431 /* then retry descent */ 445 /* then retry descent */
432 retry_descent = 1; 446 retry_descent = 1;
433 else 447 else
@@ -640,10 +654,20 @@ int crush_do_rule(const struct crush_map *map,
640 __u32 step; 654 __u32 step;
641 int i, j; 655 int i, j;
642 int numrep; 656 int numrep;
643 int choose_tries = map->choose_total_tries; 657 /*
644 int choose_local_tries = map->choose_local_tries; 658 * the original choose_total_tries value was off by one (it
645 int choose_local_fallback_tries = map->choose_local_fallback_tries; 659 * counted "retries" and not "tries"). add one.
660 */
661 int choose_tries = map->choose_total_tries + 1;
646 int choose_leaf_tries = 0; 662 int choose_leaf_tries = 0;
663 /*
664 * the local tries values were counted as "retries", though,
665 * and need no adjustment
666 */
667 int choose_local_retries = map->choose_local_tries;
668 int choose_local_fallback_retries = map->choose_local_fallback_tries;
669
670 int vary_r = map->chooseleaf_vary_r;
647 671
648 if ((__u32)ruleno >= map->max_rules) { 672 if ((__u32)ruleno >= map->max_rules) {
649 dprintk(" bad ruleno %d\n", ruleno); 673 dprintk(" bad ruleno %d\n", ruleno);
@@ -676,13 +700,18 @@ int crush_do_rule(const struct crush_map *map,
676 break; 700 break;
677 701
678 case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES: 702 case CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES:
679 if (curstep->arg1 > 0) 703 if (curstep->arg1 >= 0)
680 choose_local_tries = curstep->arg1; 704 choose_local_retries = curstep->arg1;
681 break; 705 break;
682 706
683 case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES: 707 case CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES:
684 if (curstep->arg1 > 0) 708 if (curstep->arg1 >= 0)
685 choose_local_fallback_tries = curstep->arg1; 709 choose_local_fallback_retries = curstep->arg1;
710 break;
711
712 case CRUSH_RULE_SET_CHOOSELEAF_VARY_R:
713 if (curstep->arg1 >= 0)
714 vary_r = curstep->arg1;
686 break; 715 break;
687 716
688 case CRUSH_RULE_CHOOSELEAF_FIRSTN: 717 case CRUSH_RULE_CHOOSELEAF_FIRSTN:
@@ -734,10 +763,12 @@ int crush_do_rule(const struct crush_map *map,
734 o+osize, j, 763 o+osize, j,
735 choose_tries, 764 choose_tries,
736 recurse_tries, 765 recurse_tries,
737 choose_local_tries, 766 choose_local_retries,
738 choose_local_fallback_tries, 767 choose_local_fallback_retries,
739 recurse_to_leaf, 768 recurse_to_leaf,
740 c+osize); 769 vary_r,
770 c+osize,
771 0);
741 } else { 772 } else {
742 crush_choose_indep( 773 crush_choose_indep(
743 map, 774 map,
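A hedged sketch of the retry accounting after this change (the helper functions below are illustrative; only the arithmetic mirrors the hunks above): total descent attempts are now counted as "tries", so the legacy choose_total_tries value gets +1 and the loop condition becomes ftotal < tries, while the local values keep their "retry" semantics. When vary_r is set, the recursive chooseleaf call is fed a shifted r instead of 0 so replicas are less likely to collapse onto the same leaf.

        /* illustrative helpers; logic mirrors crush_choose_firstn() above */
        static inline int chooseleaf_sub_r(int r, unsigned int vary_r)
        {
                return vary_r ? r >> (vary_r - 1) : 0;  /* 0 == legacy behaviour */
        }

        static inline int descent_exhausted(unsigned int ftotal, unsigned int tries)
        {
                return ftotal >= tries;  /* counted as "tries", hence the +1 above */
        }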
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 258a382e75ed..10421a4b76f8 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -53,34 +53,55 @@ static int osdmap_show(struct seq_file *s, void *p)
53{ 53{
54 int i; 54 int i;
55 struct ceph_client *client = s->private; 55 struct ceph_client *client = s->private;
56 struct ceph_osdmap *map = client->osdc.osdmap;
56 struct rb_node *n; 57 struct rb_node *n;
57 58
58 if (client->osdc.osdmap == NULL) 59 if (map == NULL)
59 return 0; 60 return 0;
60 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch); 61
62 seq_printf(s, "epoch %d\n", map->epoch);
61 seq_printf(s, "flags%s%s\n", 63 seq_printf(s, "flags%s%s\n",
62 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ? 64 (map->flags & CEPH_OSDMAP_NEARFULL) ? " NEARFULL" : "",
63 " NEARFULL" : "", 65 (map->flags & CEPH_OSDMAP_FULL) ? " FULL" : "");
64 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ? 66
65 " FULL" : ""); 67 for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) {
66 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
67 struct ceph_pg_pool_info *pool = 68 struct ceph_pg_pool_info *pool =
68 rb_entry(n, struct ceph_pg_pool_info, node); 69 rb_entry(n, struct ceph_pg_pool_info, node);
69 seq_printf(s, "pg_pool %llu pg_num %d / %d\n", 70
70 (unsigned long long)pool->id, pool->pg_num, 71 seq_printf(s, "pool %lld pg_num %u (%d) read_tier %lld write_tier %lld\n",
71 pool->pg_num_mask); 72 pool->id, pool->pg_num, pool->pg_num_mask,
73 pool->read_tier, pool->write_tier);
72 } 74 }
73 for (i = 0; i < client->osdc.osdmap->max_osd; i++) { 75 for (i = 0; i < map->max_osd; i++) {
74 struct ceph_entity_addr *addr = 76 struct ceph_entity_addr *addr = &map->osd_addr[i];
75 &client->osdc.osdmap->osd_addr[i]; 77 int state = map->osd_state[i];
76 int state = client->osdc.osdmap->osd_state[i];
77 char sb[64]; 78 char sb[64];
78 79
79 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n", 80 seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
80 i, ceph_pr_addr(&addr->in_addr), 81 i, ceph_pr_addr(&addr->in_addr),
81 ((client->osdc.osdmap->osd_weight[i]*100) >> 16), 82 ((map->osd_weight[i]*100) >> 16),
82 ceph_osdmap_state_str(sb, sizeof(sb), state)); 83 ceph_osdmap_state_str(sb, sizeof(sb), state),
84 ((ceph_get_primary_affinity(map, i)*100) >> 16));
85 }
86 for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) {
87 struct ceph_pg_mapping *pg =
88 rb_entry(n, struct ceph_pg_mapping, node);
89
90 seq_printf(s, "pg_temp %llu.%x [", pg->pgid.pool,
91 pg->pgid.seed);
92 for (i = 0; i < pg->pg_temp.len; i++)
93 seq_printf(s, "%s%d", (i == 0 ? "" : ","),
94 pg->pg_temp.osds[i]);
95 seq_printf(s, "]\n");
83 } 96 }
97 for (n = rb_first(&map->primary_temp); n; n = rb_next(n)) {
98 struct ceph_pg_mapping *pg =
99 rb_entry(n, struct ceph_pg_mapping, node);
100
101 seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool,
102 pg->pgid.seed, pg->primary_temp.osd);
103 }
104
84 return 0; 105 return 0;
85} 106}
86 107
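For orientation, a hypothetical osdmap_show dump in the new layout; every value below is invented, only the format follows the seq_printf calls above (per-pool tiering info, a per-osd primary-affinity column, and the pg_temp/primary_temp listings):

        epoch 105
        flags NEARFULL
        pool 1 pg_num 64 (63) read_tier -1 write_tier -1
        osd0    192.168.0.10:6800       100%    (exists, up)    100%
        osd1    192.168.0.11:6800        66%    (exists, up)     50%
        pg_temp 1.2a [1,0]
        primary_temp 1.2a 1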
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 30efc5c18622..4f55f9ce63fa 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -919,6 +919,9 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
919 if (!bytes || cursor->page_offset) 919 if (!bytes || cursor->page_offset)
920 return false; /* more bytes to process in the current page */ 920 return false; /* more bytes to process in the current page */
921 921
922 if (!cursor->resid)
923 return false; /* no more data */
924
922 /* Move on to the next page; offset is already at 0 */ 925 /* Move on to the next page; offset is already at 0 */
923 926
924 BUG_ON(cursor->page_index >= cursor->page_count); 927 BUG_ON(cursor->page_index >= cursor->page_count);
@@ -1004,6 +1007,9 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
1004 if (!bytes || cursor->offset & ~PAGE_MASK) 1007 if (!bytes || cursor->offset & ~PAGE_MASK)
1005 return false; /* more bytes to process in the current page */ 1008 return false; /* more bytes to process in the current page */
1006 1009
1010 if (!cursor->resid)
1011 return false; /* no more data */
1012
1007 /* Move on to the next page */ 1013 /* Move on to the next page */
1008 1014
1009 BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); 1015 BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 82750f915865..b0dfce77656a 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -436,6 +436,7 @@ static bool osd_req_opcode_valid(u16 opcode)
436 case CEPH_OSD_OP_OMAPCLEAR: 436 case CEPH_OSD_OP_OMAPCLEAR:
437 case CEPH_OSD_OP_OMAPRMKEYS: 437 case CEPH_OSD_OP_OMAPRMKEYS:
438 case CEPH_OSD_OP_OMAP_CMP: 438 case CEPH_OSD_OP_OMAP_CMP:
439 case CEPH_OSD_OP_SETALLOCHINT:
439 case CEPH_OSD_OP_CLONERANGE: 440 case CEPH_OSD_OP_CLONERANGE:
440 case CEPH_OSD_OP_ASSERT_SRC_VERSION: 441 case CEPH_OSD_OP_ASSERT_SRC_VERSION:
441 case CEPH_OSD_OP_SRC_CMPXATTR: 442 case CEPH_OSD_OP_SRC_CMPXATTR:
@@ -591,6 +592,26 @@ void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
591} 592}
592EXPORT_SYMBOL(osd_req_op_watch_init); 593EXPORT_SYMBOL(osd_req_op_watch_init);
593 594
595void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req,
596 unsigned int which,
597 u64 expected_object_size,
598 u64 expected_write_size)
599{
600 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which,
601 CEPH_OSD_OP_SETALLOCHINT);
602
603 op->alloc_hint.expected_object_size = expected_object_size;
604 op->alloc_hint.expected_write_size = expected_write_size;
605
606 /*
607 * CEPH_OSD_OP_SETALLOCHINT op is advisory and therefore deemed
608 * not worth a feature bit. Set FAILOK per-op flag to make
609 * sure older osds don't trip over an unsupported opcode.
610 */
611 op->flags |= CEPH_OSD_OP_FLAG_FAILOK;
612}
613EXPORT_SYMBOL(osd_req_op_alloc_hint_init);
614
594static void ceph_osdc_msg_data_add(struct ceph_msg *msg, 615static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
595 struct ceph_osd_data *osd_data) 616 struct ceph_osd_data *osd_data)
596{ 617{
@@ -681,6 +702,12 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
681 dst->watch.ver = cpu_to_le64(src->watch.ver); 702 dst->watch.ver = cpu_to_le64(src->watch.ver);
682 dst->watch.flag = src->watch.flag; 703 dst->watch.flag = src->watch.flag;
683 break; 704 break;
705 case CEPH_OSD_OP_SETALLOCHINT:
706 dst->alloc_hint.expected_object_size =
707 cpu_to_le64(src->alloc_hint.expected_object_size);
708 dst->alloc_hint.expected_write_size =
709 cpu_to_le64(src->alloc_hint.expected_write_size);
710 break;
684 default: 711 default:
685 pr_err("unsupported osd opcode %s\n", 712 pr_err("unsupported osd opcode %s\n",
686 ceph_osd_op_name(src->op)); 713 ceph_osd_op_name(src->op));
@@ -688,7 +715,9 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
688 715
689 return 0; 716 return 0;
690 } 717 }
718
691 dst->op = cpu_to_le16(src->op); 719 dst->op = cpu_to_le16(src->op);
720 dst->flags = cpu_to_le32(src->flags);
692 dst->payload_len = cpu_to_le32(src->payload_len); 721 dst->payload_len = cpu_to_le32(src->payload_len);
693 722
694 return request_data_len; 723 return request_data_len;
@@ -1304,7 +1333,7 @@ static int __map_request(struct ceph_osd_client *osdc,
1304{ 1333{
1305 struct ceph_pg pgid; 1334 struct ceph_pg pgid;
1306 int acting[CEPH_PG_MAX_SIZE]; 1335 int acting[CEPH_PG_MAX_SIZE];
1307 int o = -1, num = 0; 1336 int num, o;
1308 int err; 1337 int err;
1309 bool was_paused; 1338 bool was_paused;
1310 1339
@@ -1317,11 +1346,9 @@ static int __map_request(struct ceph_osd_client *osdc,
1317 } 1346 }
1318 req->r_pgid = pgid; 1347 req->r_pgid = pgid;
1319 1348
1320 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting); 1349 num = ceph_calc_pg_acting(osdc->osdmap, pgid, acting, &o);
1321 if (err > 0) { 1350 if (num < 0)
1322 o = acting[0]; 1351 num = 0;
1323 num = err;
1324 }
1325 1352
1326 was_paused = req->r_paused; 1353 was_paused = req->r_paused;
1327 req->r_paused = __req_should_be_paused(osdc, req); 1354 req->r_paused = __req_should_be_paused(osdc, req);
@@ -2033,7 +2060,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
2033 int skipped_map = 0; 2060 int skipped_map = 0;
2034 2061
2035 dout("taking full map %u len %d\n", epoch, maplen); 2062 dout("taking full map %u len %d\n", epoch, maplen);
2036 newmap = osdmap_decode(&p, p+maplen); 2063 newmap = ceph_osdmap_decode(&p, p+maplen);
2037 if (IS_ERR(newmap)) { 2064 if (IS_ERR(newmap)) {
2038 err = PTR_ERR(newmap); 2065 err = PTR_ERR(newmap);
2039 goto bad; 2066 goto bad;
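A hedged usage sketch for the new helper; the surrounding request setup is illustrative (the expected caller is rbd, which is outside this diff), only the exported function and its argument order come from the hunk above:

        /* illustrative: advise the OSD about sizing before issuing a write */
        static void setup_write_with_hint(struct ceph_osd_request *req,
                                          u64 object_size)
        {
                /* op 0: CEPH_OSD_OP_SETALLOCHINT, marked FAILOK internally */
                osd_req_op_alloc_hint_init(req, 0, object_size, object_size);
                /* the actual write op(s) would follow as op 1 onwards */
        }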
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index aade4a5c1c07..e632b5a52f5b 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -343,7 +343,7 @@ bad:
343 343
344/* 344/*
345 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid 345 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
346 * to a set of osds) 346 * to a set of osds) and primary_temp (explicit primary setting)
347 */ 347 */
348static int pgid_cmp(struct ceph_pg l, struct ceph_pg r) 348static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
349{ 349{
@@ -506,7 +506,7 @@ static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
506 kfree(pi); 506 kfree(pi);
507} 507}
508 508
509static int __decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) 509static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
510{ 510{
511 u8 ev, cv; 511 u8 ev, cv;
512 unsigned len, num; 512 unsigned len, num;
@@ -587,7 +587,7 @@ bad:
587 return -EINVAL; 587 return -EINVAL;
588} 588}
589 589
590static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) 590static int decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
591{ 591{
592 struct ceph_pg_pool_info *pi; 592 struct ceph_pg_pool_info *pi;
593 u32 num, len; 593 u32 num, len;
@@ -633,6 +633,13 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
633 rb_erase(&pg->node, &map->pg_temp); 633 rb_erase(&pg->node, &map->pg_temp);
634 kfree(pg); 634 kfree(pg);
635 } 635 }
636 while (!RB_EMPTY_ROOT(&map->primary_temp)) {
637 struct ceph_pg_mapping *pg =
638 rb_entry(rb_first(&map->primary_temp),
639 struct ceph_pg_mapping, node);
640 rb_erase(&pg->node, &map->primary_temp);
641 kfree(pg);
642 }
636 while (!RB_EMPTY_ROOT(&map->pg_pools)) { 643 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
637 struct ceph_pg_pool_info *pi = 644 struct ceph_pg_pool_info *pi =
638 rb_entry(rb_first(&map->pg_pools), 645 rb_entry(rb_first(&map->pg_pools),
@@ -642,186 +649,516 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
642 kfree(map->osd_state); 649 kfree(map->osd_state);
643 kfree(map->osd_weight); 650 kfree(map->osd_weight);
644 kfree(map->osd_addr); 651 kfree(map->osd_addr);
652 kfree(map->osd_primary_affinity);
645 kfree(map); 653 kfree(map);
646} 654}
647 655
648/* 656/*
649 * adjust max osd value. reallocate arrays. 657 * Adjust max_osd value, (re)allocate arrays.
658 *
659 * The new elements are properly initialized.
650 */ 660 */
651static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) 661static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
652{ 662{
653 u8 *state; 663 u8 *state;
654 struct ceph_entity_addr *addr;
655 u32 *weight; 664 u32 *weight;
665 struct ceph_entity_addr *addr;
666 int i;
656 667
657 state = kcalloc(max, sizeof(*state), GFP_NOFS); 668 state = krealloc(map->osd_state, max*sizeof(*state), GFP_NOFS);
658 addr = kcalloc(max, sizeof(*addr), GFP_NOFS); 669 weight = krealloc(map->osd_weight, max*sizeof(*weight), GFP_NOFS);
659 weight = kcalloc(max, sizeof(*weight), GFP_NOFS); 670 addr = krealloc(map->osd_addr, max*sizeof(*addr), GFP_NOFS);
660 if (state == NULL || addr == NULL || weight == NULL) { 671 if (!state || !weight || !addr) {
661 kfree(state); 672 kfree(state);
662 kfree(addr);
663 kfree(weight); 673 kfree(weight);
674 kfree(addr);
675
664 return -ENOMEM; 676 return -ENOMEM;
665 } 677 }
666 678
667 /* copy old? */ 679 for (i = map->max_osd; i < max; i++) {
668 if (map->osd_state) { 680 state[i] = 0;
669 memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); 681 weight[i] = CEPH_OSD_OUT;
670 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); 682 memset(addr + i, 0, sizeof(*addr));
671 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
672 kfree(map->osd_state);
673 kfree(map->osd_addr);
674 kfree(map->osd_weight);
675 } 683 }
676 684
677 map->osd_state = state; 685 map->osd_state = state;
678 map->osd_weight = weight; 686 map->osd_weight = weight;
679 map->osd_addr = addr; 687 map->osd_addr = addr;
688
689 if (map->osd_primary_affinity) {
690 u32 *affinity;
691
692 affinity = krealloc(map->osd_primary_affinity,
693 max*sizeof(*affinity), GFP_NOFS);
694 if (!affinity)
695 return -ENOMEM;
696
697 for (i = map->max_osd; i < max; i++)
698 affinity[i] = CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
699
700 map->osd_primary_affinity = affinity;
701 }
702
680 map->max_osd = max; 703 map->max_osd = max;
704
681 return 0; 705 return 0;
682} 706}
683 707
708#define OSDMAP_WRAPPER_COMPAT_VER 7
709#define OSDMAP_CLIENT_DATA_COMPAT_VER 1
710
684/* 711/*
685 * decode a full map. 712 * Return 0 or error. On success, *v is set to 0 for old (v6) osdmaps,
713 * to struct_v of the client_data section for new (v7 and above)
714 * osdmaps.
686 */ 715 */
687struct ceph_osdmap *osdmap_decode(void **p, void *end) 716static int get_osdmap_client_data_v(void **p, void *end,
717 const char *prefix, u8 *v)
688{ 718{
689 struct ceph_osdmap *map; 719 u8 struct_v;
690 u16 version; 720
691 u32 len, max, i; 721 ceph_decode_8_safe(p, end, struct_v, e_inval);
692 int err = -EINVAL; 722 if (struct_v >= 7) {
693 void *start = *p; 723 u8 struct_compat;
694 struct ceph_pg_pool_info *pi; 724
725 ceph_decode_8_safe(p, end, struct_compat, e_inval);
726 if (struct_compat > OSDMAP_WRAPPER_COMPAT_VER) {
727 pr_warning("got v %d cv %d > %d of %s ceph_osdmap\n",
728 struct_v, struct_compat,
729 OSDMAP_WRAPPER_COMPAT_VER, prefix);
730 return -EINVAL;
731 }
732 *p += 4; /* ignore wrapper struct_len */
733
734 ceph_decode_8_safe(p, end, struct_v, e_inval);
735 ceph_decode_8_safe(p, end, struct_compat, e_inval);
736 if (struct_compat > OSDMAP_CLIENT_DATA_COMPAT_VER) {
737 pr_warning("got v %d cv %d > %d of %s ceph_osdmap client data\n",
738 struct_v, struct_compat,
739 OSDMAP_CLIENT_DATA_COMPAT_VER, prefix);
740 return -EINVAL;
741 }
742 *p += 4; /* ignore client data struct_len */
743 } else {
744 u16 version;
745
746 *p -= 1;
747 ceph_decode_16_safe(p, end, version, e_inval);
748 if (version < 6) {
749 pr_warning("got v %d < 6 of %s ceph_osdmap\n", version,
750 prefix);
751 return -EINVAL;
752 }
695 753
696 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p)); 754 /* old osdmap enconding */
755 struct_v = 0;
756 }
697 757
698 map = kzalloc(sizeof(*map), GFP_NOFS); 758 *v = struct_v;
699 if (map == NULL) 759 return 0;
700 return ERR_PTR(-ENOMEM);
701 map->pg_temp = RB_ROOT;
702 760
703 ceph_decode_16_safe(p, end, version, bad); 761e_inval:
704 if (version > 6) { 762 return -EINVAL;
705 pr_warning("got unknown v %d > 6 of osdmap\n", version); 763}
706 goto bad; 764
765static int __decode_pools(void **p, void *end, struct ceph_osdmap *map,
766 bool incremental)
767{
768 u32 n;
769
770 ceph_decode_32_safe(p, end, n, e_inval);
771 while (n--) {
772 struct ceph_pg_pool_info *pi;
773 u64 pool;
774 int ret;
775
776 ceph_decode_64_safe(p, end, pool, e_inval);
777
778 pi = __lookup_pg_pool(&map->pg_pools, pool);
779 if (!incremental || !pi) {
780 pi = kzalloc(sizeof(*pi), GFP_NOFS);
781 if (!pi)
782 return -ENOMEM;
783
784 pi->id = pool;
785
786 ret = __insert_pg_pool(&map->pg_pools, pi);
787 if (ret) {
788 kfree(pi);
789 return ret;
790 }
791 }
792
793 ret = decode_pool(p, end, pi);
794 if (ret)
795 return ret;
707 } 796 }
708 if (version < 6) { 797
709 pr_warning("got old v %d < 6 of osdmap\n", version); 798 return 0;
710 goto bad; 799
800e_inval:
801 return -EINVAL;
802}
803
804static int decode_pools(void **p, void *end, struct ceph_osdmap *map)
805{
806 return __decode_pools(p, end, map, false);
807}
808
809static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
810{
811 return __decode_pools(p, end, map, true);
812}
813
814static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map,
815 bool incremental)
816{
817 u32 n;
818
819 ceph_decode_32_safe(p, end, n, e_inval);
820 while (n--) {
821 struct ceph_pg pgid;
822 u32 len, i;
823 int ret;
824
825 ret = ceph_decode_pgid(p, end, &pgid);
826 if (ret)
827 return ret;
828
829 ceph_decode_32_safe(p, end, len, e_inval);
830
831 ret = __remove_pg_mapping(&map->pg_temp, pgid);
832 BUG_ON(!incremental && ret != -ENOENT);
833
834 if (!incremental || len > 0) {
835 struct ceph_pg_mapping *pg;
836
837 ceph_decode_need(p, end, len*sizeof(u32), e_inval);
838
839 if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
840 return -EINVAL;
841
842 pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS);
843 if (!pg)
844 return -ENOMEM;
845
846 pg->pgid = pgid;
847 pg->pg_temp.len = len;
848 for (i = 0; i < len; i++)
849 pg->pg_temp.osds[i] = ceph_decode_32(p);
850
851 ret = __insert_pg_mapping(pg, &map->pg_temp);
852 if (ret) {
853 kfree(pg);
854 return ret;
855 }
856 }
711 } 857 }
712 858
713 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad); 859 return 0;
860
861e_inval:
862 return -EINVAL;
863}
864
865static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
866{
867 return __decode_pg_temp(p, end, map, false);
868}
869
870static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
871{
872 return __decode_pg_temp(p, end, map, true);
873}
874
875static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map,
876 bool incremental)
877{
878 u32 n;
879
880 ceph_decode_32_safe(p, end, n, e_inval);
881 while (n--) {
882 struct ceph_pg pgid;
883 u32 osd;
884 int ret;
885
886 ret = ceph_decode_pgid(p, end, &pgid);
887 if (ret)
888 return ret;
889
890 ceph_decode_32_safe(p, end, osd, e_inval);
891
892 ret = __remove_pg_mapping(&map->primary_temp, pgid);
893 BUG_ON(!incremental && ret != -ENOENT);
894
895 if (!incremental || osd != (u32)-1) {
896 struct ceph_pg_mapping *pg;
897
898 pg = kzalloc(sizeof(*pg), GFP_NOFS);
899 if (!pg)
900 return -ENOMEM;
901
902 pg->pgid = pgid;
903 pg->primary_temp.osd = osd;
904
905 ret = __insert_pg_mapping(pg, &map->primary_temp);
906 if (ret) {
907 kfree(pg);
908 return ret;
909 }
910 }
911 }
912
913 return 0;
914
915e_inval:
916 return -EINVAL;
917}
918
919static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
920{
921 return __decode_primary_temp(p, end, map, false);
922}
923
924static int decode_new_primary_temp(void **p, void *end,
925 struct ceph_osdmap *map)
926{
927 return __decode_primary_temp(p, end, map, true);
928}
929
930u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
931{
932 BUG_ON(osd >= map->max_osd);
933
934 if (!map->osd_primary_affinity)
935 return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
936
937 return map->osd_primary_affinity[osd];
938}
939
940static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff)
941{
942 BUG_ON(osd >= map->max_osd);
943
944 if (!map->osd_primary_affinity) {
945 int i;
946
947 map->osd_primary_affinity = kmalloc(map->max_osd*sizeof(u32),
948 GFP_NOFS);
949 if (!map->osd_primary_affinity)
950 return -ENOMEM;
951
952 for (i = 0; i < map->max_osd; i++)
953 map->osd_primary_affinity[i] =
954 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY;
955 }
956
957 map->osd_primary_affinity[osd] = aff;
958
959 return 0;
960}
961
962static int decode_primary_affinity(void **p, void *end,
963 struct ceph_osdmap *map)
964{
965 u32 len, i;
966
967 ceph_decode_32_safe(p, end, len, e_inval);
968 if (len == 0) {
969 kfree(map->osd_primary_affinity);
970 map->osd_primary_affinity = NULL;
971 return 0;
972 }
973 if (len != map->max_osd)
974 goto e_inval;
975
976 ceph_decode_need(p, end, map->max_osd*sizeof(u32), e_inval);
977
978 for (i = 0; i < map->max_osd; i++) {
979 int ret;
980
981 ret = set_primary_affinity(map, i, ceph_decode_32(p));
982 if (ret)
983 return ret;
984 }
985
986 return 0;
987
988e_inval:
989 return -EINVAL;
990}
991
992static int decode_new_primary_affinity(void **p, void *end,
993 struct ceph_osdmap *map)
994{
995 u32 n;
996
997 ceph_decode_32_safe(p, end, n, e_inval);
998 while (n--) {
999 u32 osd, aff;
1000 int ret;
1001
1002 ceph_decode_32_safe(p, end, osd, e_inval);
1003 ceph_decode_32_safe(p, end, aff, e_inval);
1004
1005 ret = set_primary_affinity(map, osd, aff);
1006 if (ret)
1007 return ret;
1008
1009 pr_info("osd%d primary-affinity 0x%x\n", osd, aff);
1010 }
1011
1012 return 0;
1013
1014e_inval:
1015 return -EINVAL;
1016}
1017
1018/*
1019 * decode a full map.
1020 */
1021static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
1022{
1023 u8 struct_v;
1024 u32 epoch = 0;
1025 void *start = *p;
1026 u32 max;
1027 u32 len, i;
1028 int err;
1029
1030 dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
1031
1032 err = get_osdmap_client_data_v(p, end, "full", &struct_v);
1033 if (err)
1034 goto bad;
1035
1036 /* fsid, epoch, created, modified */
1037 ceph_decode_need(p, end, sizeof(map->fsid) + sizeof(u32) +
1038 sizeof(map->created) + sizeof(map->modified), e_inval);
714 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid)); 1039 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
715 map->epoch = ceph_decode_32(p); 1040 epoch = map->epoch = ceph_decode_32(p);
716 ceph_decode_copy(p, &map->created, sizeof(map->created)); 1041 ceph_decode_copy(p, &map->created, sizeof(map->created));
717 ceph_decode_copy(p, &map->modified, sizeof(map->modified)); 1042 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
718 1043
719 ceph_decode_32_safe(p, end, max, bad); 1044 /* pools */
720 while (max--) { 1045 err = decode_pools(p, end, map);
721 ceph_decode_need(p, end, 8 + 2, bad); 1046 if (err)
722 err = -ENOMEM; 1047 goto bad;
723 pi = kzalloc(sizeof(*pi), GFP_NOFS);
724 if (!pi)
725 goto bad;
726 pi->id = ceph_decode_64(p);
727 err = __decode_pool(p, end, pi);
728 if (err < 0) {
729 kfree(pi);
730 goto bad;
731 }
732 __insert_pg_pool(&map->pg_pools, pi);
733 }
734 1048
735 err = __decode_pool_names(p, end, map); 1049 /* pool_name */
736 if (err < 0) { 1050 err = decode_pool_names(p, end, map);
737 dout("fail to decode pool names"); 1051 if (err)
738 goto bad; 1052 goto bad;
739 }
740 1053
741 ceph_decode_32_safe(p, end, map->pool_max, bad); 1054 ceph_decode_32_safe(p, end, map->pool_max, e_inval);
742 1055
743 ceph_decode_32_safe(p, end, map->flags, bad); 1056 ceph_decode_32_safe(p, end, map->flags, e_inval);
744 1057
745 max = ceph_decode_32(p); 1058 /* max_osd */
1059 ceph_decode_32_safe(p, end, max, e_inval);
746 1060
747 /* (re)alloc osd arrays */ 1061 /* (re)alloc osd arrays */
748 err = osdmap_set_max_osd(map, max); 1062 err = osdmap_set_max_osd(map, max);
749 if (err < 0) 1063 if (err)
750 goto bad; 1064 goto bad;
751 dout("osdmap_decode max_osd = %d\n", map->max_osd);
752 1065
753 /* osds */ 1066 /* osd_state, osd_weight, osd_addrs->client_addr */
754 err = -EINVAL;
755 ceph_decode_need(p, end, 3*sizeof(u32) + 1067 ceph_decode_need(p, end, 3*sizeof(u32) +
756 map->max_osd*(1 + sizeof(*map->osd_weight) + 1068 map->max_osd*(1 + sizeof(*map->osd_weight) +
757 sizeof(*map->osd_addr)), bad); 1069 sizeof(*map->osd_addr)), e_inval);
758 *p += 4; /* skip length field (should match max) */ 1070
1071 if (ceph_decode_32(p) != map->max_osd)
1072 goto e_inval;
1073
759 ceph_decode_copy(p, map->osd_state, map->max_osd); 1074 ceph_decode_copy(p, map->osd_state, map->max_osd);
760 1075
761 *p += 4; /* skip length field (should match max) */ 1076 if (ceph_decode_32(p) != map->max_osd)
1077 goto e_inval;
1078
762 for (i = 0; i < map->max_osd; i++) 1079 for (i = 0; i < map->max_osd; i++)
763 map->osd_weight[i] = ceph_decode_32(p); 1080 map->osd_weight[i] = ceph_decode_32(p);
764 1081
765 *p += 4; /* skip length field (should match max) */ 1082 if (ceph_decode_32(p) != map->max_osd)
1083 goto e_inval;
1084
766 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr)); 1085 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
767 for (i = 0; i < map->max_osd; i++) 1086 for (i = 0; i < map->max_osd; i++)
768 ceph_decode_addr(&map->osd_addr[i]); 1087 ceph_decode_addr(&map->osd_addr[i]);
769 1088
770 /* pg_temp */ 1089 /* pg_temp */
771 ceph_decode_32_safe(p, end, len, bad); 1090 err = decode_pg_temp(p, end, map);
772 for (i = 0; i < len; i++) { 1091 if (err)
773 int n, j; 1092 goto bad;
774 struct ceph_pg pgid;
775 struct ceph_pg_mapping *pg;
776 1093
777 err = ceph_decode_pgid(p, end, &pgid); 1094 /* primary_temp */
1095 if (struct_v >= 1) {
1096 err = decode_primary_temp(p, end, map);
778 if (err) 1097 if (err)
779 goto bad; 1098 goto bad;
780 ceph_decode_need(p, end, sizeof(u32), bad); 1099 }
781 n = ceph_decode_32(p);
782 err = -EINVAL;
783 if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
784 goto bad;
785 ceph_decode_need(p, end, n * sizeof(u32), bad);
786 err = -ENOMEM;
787 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
788 if (!pg)
789 goto bad;
790 pg->pgid = pgid;
791 pg->len = n;
792 for (j = 0; j < n; j++)
793 pg->osds[j] = ceph_decode_32(p);
794 1100
795 err = __insert_pg_mapping(pg, &map->pg_temp); 1101 /* primary_affinity */
1102 if (struct_v >= 2) {
1103 err = decode_primary_affinity(p, end, map);
796 if (err) 1104 if (err)
797 goto bad; 1105 goto bad;
798 dout(" added pg_temp %lld.%x len %d\n", pgid.pool, pgid.seed, 1106 } else {
799 len); 1107 /* XXX can this happen? */
1108 kfree(map->osd_primary_affinity);
1109 map->osd_primary_affinity = NULL;
800 } 1110 }
801 1111
802 /* crush */ 1112 /* crush */
803 ceph_decode_32_safe(p, end, len, bad); 1113 ceph_decode_32_safe(p, end, len, e_inval);
804 dout("osdmap_decode crush len %d from off 0x%x\n", len, 1114 map->crush = crush_decode(*p, min(*p + len, end));
805 (int)(*p - start));
806 ceph_decode_need(p, end, len, bad);
807 map->crush = crush_decode(*p, end);
808 *p += len;
809 if (IS_ERR(map->crush)) { 1115 if (IS_ERR(map->crush)) {
810 err = PTR_ERR(map->crush); 1116 err = PTR_ERR(map->crush);
811 map->crush = NULL; 1117 map->crush = NULL;
812 goto bad; 1118 goto bad;
813 } 1119 }
1120 *p += len;
814 1121
815 /* ignore the rest of the map */ 1122 /* ignore the rest */
816 *p = end; 1123 *p = end;
817 1124
818 dout("osdmap_decode done %p %p\n", *p, end); 1125 dout("full osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
819 return map; 1126 return 0;
820 1127
1128e_inval:
1129 err = -EINVAL;
821bad: 1130bad:
822 dout("osdmap_decode fail err %d\n", err); 1131 pr_err("corrupt full osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
823 ceph_osdmap_destroy(map); 1132 err, epoch, (int)(*p - start), *p, start, end);
824 return ERR_PTR(err); 1133 print_hex_dump(KERN_DEBUG, "osdmap: ",
1134 DUMP_PREFIX_OFFSET, 16, 1,
1135 start, end - start, true);
1136 return err;
1137}
1138
1139/*
1140 * Allocate and decode a full map.
1141 */
1142struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
1143{
1144 struct ceph_osdmap *map;
1145 int ret;
1146
1147 map = kzalloc(sizeof(*map), GFP_NOFS);
1148 if (!map)
1149 return ERR_PTR(-ENOMEM);
1150
1151 map->pg_temp = RB_ROOT;
1152 map->primary_temp = RB_ROOT;
1153 mutex_init(&map->crush_scratch_mutex);
1154
1155 ret = osdmap_decode(p, end, map);
1156 if (ret) {
1157 ceph_osdmap_destroy(map);
1158 return ERR_PTR(ret);
1159 }
1160
1161 return map;
825} 1162}
826 1163
827/* 1164/*
@@ -840,17 +1177,18 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
840 __s64 new_pool_max; 1177 __s64 new_pool_max;
841 __s32 new_flags, max; 1178 __s32 new_flags, max;
842 void *start = *p; 1179 void *start = *p;
843 int err = -EINVAL; 1180 int err;
844 u16 version; 1181 u8 struct_v;
1182
1183 dout("%s %p to %p len %d\n", __func__, *p, end, (int)(end - *p));
845 1184
846 ceph_decode_16_safe(p, end, version, bad); 1185 err = get_osdmap_client_data_v(p, end, "inc", &struct_v);
847 if (version != 6) { 1186 if (err)
848 pr_warning("got unknown v %d != 6 of inc osdmap\n", version);
849 goto bad; 1187 goto bad;
850 }
851 1188
852 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32), 1189 /* fsid, epoch, modified, new_pool_max, new_flags */
853 bad); 1190 ceph_decode_need(p, end, sizeof(fsid) + sizeof(u32) + sizeof(modified) +
1191 sizeof(u64) + sizeof(u32), e_inval);
854 ceph_decode_copy(p, &fsid, sizeof(fsid)); 1192 ceph_decode_copy(p, &fsid, sizeof(fsid));
855 epoch = ceph_decode_32(p); 1193 epoch = ceph_decode_32(p);
856 BUG_ON(epoch != map->epoch+1); 1194 BUG_ON(epoch != map->epoch+1);
@@ -859,21 +1197,22 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
859 new_flags = ceph_decode_32(p); 1197 new_flags = ceph_decode_32(p);
860 1198
861 /* full map? */ 1199 /* full map? */
862 ceph_decode_32_safe(p, end, len, bad); 1200 ceph_decode_32_safe(p, end, len, e_inval);
863 if (len > 0) { 1201 if (len > 0) {
864 dout("apply_incremental full map len %d, %p to %p\n", 1202 dout("apply_incremental full map len %d, %p to %p\n",
865 len, *p, end); 1203 len, *p, end);
866 return osdmap_decode(p, min(*p+len, end)); 1204 return ceph_osdmap_decode(p, min(*p+len, end));
867 } 1205 }
868 1206
869 /* new crush? */ 1207 /* new crush? */
870 ceph_decode_32_safe(p, end, len, bad); 1208 ceph_decode_32_safe(p, end, len, e_inval);
871 if (len > 0) { 1209 if (len > 0) {
872 dout("apply_incremental new crush map len %d, %p to %p\n",
873 len, *p, end);
874 newcrush = crush_decode(*p, min(*p+len, end)); 1210 newcrush = crush_decode(*p, min(*p+len, end));
875 if (IS_ERR(newcrush)) 1211 if (IS_ERR(newcrush)) {
876 return ERR_CAST(newcrush); 1212 err = PTR_ERR(newcrush);
1213 newcrush = NULL;
1214 goto bad;
1215 }
877 *p += len; 1216 *p += len;
878 } 1217 }
879 1218
@@ -883,13 +1222,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
883 if (new_pool_max >= 0) 1222 if (new_pool_max >= 0)
884 map->pool_max = new_pool_max; 1223 map->pool_max = new_pool_max;
885 1224
886 ceph_decode_need(p, end, 5*sizeof(u32), bad);
887
888 /* new max? */ 1225 /* new max? */
889 max = ceph_decode_32(p); 1226 ceph_decode_32_safe(p, end, max, e_inval);
890 if (max >= 0) { 1227 if (max >= 0) {
891 err = osdmap_set_max_osd(map, max); 1228 err = osdmap_set_max_osd(map, max);
892 if (err < 0) 1229 if (err)
893 goto bad; 1230 goto bad;
894 } 1231 }
895 1232
@@ -902,51 +1239,34 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
902 newcrush = NULL; 1239 newcrush = NULL;
903 } 1240 }
904 1241
905 /* new_pool */ 1242 /* new_pools */
906 ceph_decode_32_safe(p, end, len, bad); 1243 err = decode_new_pools(p, end, map);
907 while (len--) { 1244 if (err)
908 struct ceph_pg_pool_info *pi; 1245 goto bad;
909 1246
910 ceph_decode_64_safe(p, end, pool, bad); 1247 /* new_pool_names */
911 pi = __lookup_pg_pool(&map->pg_pools, pool); 1248 err = decode_pool_names(p, end, map);
912 if (!pi) { 1249 if (err)
913 pi = kzalloc(sizeof(*pi), GFP_NOFS); 1250 goto bad;
914 if (!pi) {
915 err = -ENOMEM;
916 goto bad;
917 }
918 pi->id = pool;
919 __insert_pg_pool(&map->pg_pools, pi);
920 }
921 err = __decode_pool(p, end, pi);
922 if (err < 0)
923 goto bad;
924 }
925 if (version >= 5) {
926 err = __decode_pool_names(p, end, map);
927 if (err < 0)
928 goto bad;
929 }
930 1251
931 /* old_pool */ 1252 /* old_pool */
932 ceph_decode_32_safe(p, end, len, bad); 1253 ceph_decode_32_safe(p, end, len, e_inval);
933 while (len--) { 1254 while (len--) {
934 struct ceph_pg_pool_info *pi; 1255 struct ceph_pg_pool_info *pi;
935 1256
936 ceph_decode_64_safe(p, end, pool, bad); 1257 ceph_decode_64_safe(p, end, pool, e_inval);
937 pi = __lookup_pg_pool(&map->pg_pools, pool); 1258 pi = __lookup_pg_pool(&map->pg_pools, pool);
938 if (pi) 1259 if (pi)
939 __remove_pg_pool(&map->pg_pools, pi); 1260 __remove_pg_pool(&map->pg_pools, pi);
940 } 1261 }
941 1262
942 /* new_up */ 1263 /* new_up */
943 err = -EINVAL; 1264 ceph_decode_32_safe(p, end, len, e_inval);
944 ceph_decode_32_safe(p, end, len, bad);
945 while (len--) { 1265 while (len--) {
946 u32 osd; 1266 u32 osd;
947 struct ceph_entity_addr addr; 1267 struct ceph_entity_addr addr;
948 ceph_decode_32_safe(p, end, osd, bad); 1268 ceph_decode_32_safe(p, end, osd, e_inval);
949 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad); 1269 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), e_inval);
950 ceph_decode_addr(&addr); 1270 ceph_decode_addr(&addr);
951 pr_info("osd%d up\n", osd); 1271 pr_info("osd%d up\n", osd);
952 BUG_ON(osd >= map->max_osd); 1272 BUG_ON(osd >= map->max_osd);
@@ -955,11 +1275,11 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
955 } 1275 }
956 1276
957 /* new_state */ 1277 /* new_state */
958 ceph_decode_32_safe(p, end, len, bad); 1278 ceph_decode_32_safe(p, end, len, e_inval);
959 while (len--) { 1279 while (len--) {
960 u32 osd; 1280 u32 osd;
961 u8 xorstate; 1281 u8 xorstate;
962 ceph_decode_32_safe(p, end, osd, bad); 1282 ceph_decode_32_safe(p, end, osd, e_inval);
963 xorstate = **(u8 **)p; 1283 xorstate = **(u8 **)p;
964 (*p)++; /* clean flag */ 1284 (*p)++; /* clean flag */
965 if (xorstate == 0) 1285 if (xorstate == 0)
@@ -971,10 +1291,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
971 } 1291 }
972 1292
973 /* new_weight */ 1293 /* new_weight */
974 ceph_decode_32_safe(p, end, len, bad); 1294 ceph_decode_32_safe(p, end, len, e_inval);
975 while (len--) { 1295 while (len--) {
976 u32 osd, off; 1296 u32 osd, off;
977 ceph_decode_need(p, end, sizeof(u32)*2, bad); 1297 ceph_decode_need(p, end, sizeof(u32)*2, e_inval);
978 osd = ceph_decode_32(p); 1298 osd = ceph_decode_32(p);
979 off = ceph_decode_32(p); 1299 off = ceph_decode_32(p);
980 pr_info("osd%d weight 0x%x %s\n", osd, off, 1300 pr_info("osd%d weight 0x%x %s\n", osd, off,
@@ -985,56 +1305,35 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
985 } 1305 }
986 1306
987 /* new_pg_temp */ 1307 /* new_pg_temp */
988 ceph_decode_32_safe(p, end, len, bad); 1308 err = decode_new_pg_temp(p, end, map);
989 while (len--) { 1309 if (err)
990 struct ceph_pg_mapping *pg; 1310 goto bad;
991 int j;
992 struct ceph_pg pgid;
993 u32 pglen;
994 1311
995 err = ceph_decode_pgid(p, end, &pgid); 1312 /* new_primary_temp */
1313 if (struct_v >= 1) {
1314 err = decode_new_primary_temp(p, end, map);
996 if (err) 1315 if (err)
997 goto bad; 1316 goto bad;
998 ceph_decode_need(p, end, sizeof(u32), bad); 1317 }
999 pglen = ceph_decode_32(p);
1000 if (pglen) {
1001 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
1002
1003 /* removing existing (if any) */
1004 (void) __remove_pg_mapping(&map->pg_temp, pgid);
1005 1318
1006 /* insert */ 1319 /* new_primary_affinity */
1007 err = -EINVAL; 1320 if (struct_v >= 2) {
1008 if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) 1321 err = decode_new_primary_affinity(p, end, map);
1009 goto bad; 1322 if (err)
1010 err = -ENOMEM; 1323 goto bad;
1011 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
1012 if (!pg)
1013 goto bad;
1014 pg->pgid = pgid;
1015 pg->len = pglen;
1016 for (j = 0; j < pglen; j++)
1017 pg->osds[j] = ceph_decode_32(p);
1018 err = __insert_pg_mapping(pg, &map->pg_temp);
1019 if (err) {
1020 kfree(pg);
1021 goto bad;
1022 }
1023 dout(" added pg_temp %lld.%x len %d\n", pgid.pool,
1024 pgid.seed, pglen);
1025 } else {
1026 /* remove */
1027 __remove_pg_mapping(&map->pg_temp, pgid);
1028 }
1029 } 1324 }
1030 1325
1031 /* ignore the rest */ 1326 /* ignore the rest */
1032 *p = end; 1327 *p = end;
1328
1329 dout("inc osdmap epoch %d max_osd %d\n", map->epoch, map->max_osd);
1033 return map; 1330 return map;
1034 1331
1332e_inval:
1333 err = -EINVAL;
1035bad: 1334bad:
1036 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n", 1335 pr_err("corrupt inc osdmap (%d) epoch %d off %d (%p of %p-%p)\n",
1037 epoch, (int)(*p - start), *p, start, end); 1336 err, epoch, (int)(*p - start), *p, start, end);
1038 print_hex_dump(KERN_DEBUG, "osdmap: ", 1337 print_hex_dump(KERN_DEBUG, "osdmap: ",
1039 DUMP_PREFIX_OFFSET, 16, 1, 1338 DUMP_PREFIX_OFFSET, 16, 1,
1040 start, end - start, true); 1339 start, end - start, true);
@@ -1142,61 +1441,249 @@ int ceph_oloc_oid_to_pg(struct ceph_osdmap *osdmap,
1142} 1441}
1143EXPORT_SYMBOL(ceph_oloc_oid_to_pg); 1442EXPORT_SYMBOL(ceph_oloc_oid_to_pg);
1144 1443
1145static int crush_do_rule_ary(const struct crush_map *map, int ruleno, int x, 1444static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
1146 int *result, int result_max, 1445 int *result, int result_max,
1147 const __u32 *weight, int weight_max) 1446 const __u32 *weight, int weight_max)
1148{ 1447{
1149 int scratch[result_max * 3]; 1448 int r;
1150 1449
1151 return crush_do_rule(map, ruleno, x, result, result_max, 1450 BUG_ON(result_max > CEPH_PG_MAX_SIZE);
1152 weight, weight_max, scratch); 1451
1452 mutex_lock(&map->crush_scratch_mutex);
1453 r = crush_do_rule(map->crush, ruleno, x, result, result_max,
1454 weight, weight_max, map->crush_scratch_ary);
1455 mutex_unlock(&map->crush_scratch_mutex);
1456
1457 return r;
1153} 1458}
1154 1459
1155/* 1460/*
1156 * Calculate raw osd vector for the given pgid. Return pointer to osd 1461 * Calculate raw (crush) set for given pgid.
1157 * array, or NULL on failure. 1462 *
1463 * Return raw set length, or error.
1158 */ 1464 */
1159static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid, 1465static int pg_to_raw_osds(struct ceph_osdmap *osdmap,
1160 int *osds, int *num) 1466 struct ceph_pg_pool_info *pool,
1467 struct ceph_pg pgid, u32 pps, int *osds)
1161{ 1468{
1162 struct ceph_pg_mapping *pg;
1163 struct ceph_pg_pool_info *pool;
1164 int ruleno; 1469 int ruleno;
1165 int r; 1470 int len;
1166 u32 pps;
1167 1471
1168 pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool); 1472 /* crush */
1169 if (!pool) 1473 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset,
1170 return NULL; 1474 pool->type, pool->size);
1475 if (ruleno < 0) {
1476 pr_err("no crush rule: pool %lld ruleset %d type %d size %d\n",
1477 pgid.pool, pool->crush_ruleset, pool->type,
1478 pool->size);
1479 return -ENOENT;
1480 }
1171 1481
1172 /* pg_temp? */ 1482 len = do_crush(osdmap, ruleno, pps, osds,
1483 min_t(int, pool->size, CEPH_PG_MAX_SIZE),
1484 osdmap->osd_weight, osdmap->max_osd);
1485 if (len < 0) {
1486 pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
1487 len, ruleno, pgid.pool, pool->crush_ruleset,
1488 pool->type, pool->size);
1489 return len;
1490 }
1491
1492 return len;
1493}
1494
1495/*
1496 * Given raw set, calculate up set and up primary.
1497 *
1498 * Return up set length. *primary is set to up primary osd id, or -1
1499 * if up set is empty.
1500 */
1501static int raw_to_up_osds(struct ceph_osdmap *osdmap,
1502 struct ceph_pg_pool_info *pool,
1503 int *osds, int len, int *primary)
1504{
1505 int up_primary = -1;
1506 int i;
1507
1508 if (ceph_can_shift_osds(pool)) {
1509 int removed = 0;
1510
1511 for (i = 0; i < len; i++) {
1512 if (ceph_osd_is_down(osdmap, osds[i])) {
1513 removed++;
1514 continue;
1515 }
1516 if (removed)
1517 osds[i - removed] = osds[i];
1518 }
1519
1520 len -= removed;
1521 if (len > 0)
1522 up_primary = osds[0];
1523 } else {
1524 for (i = len - 1; i >= 0; i--) {
1525 if (ceph_osd_is_down(osdmap, osds[i]))
1526 osds[i] = CRUSH_ITEM_NONE;
1527 else
1528 up_primary = osds[i];
1529 }
1530 }
1531
1532 *primary = up_primary;
1533 return len;
1534}
1535
1536static void apply_primary_affinity(struct ceph_osdmap *osdmap, u32 pps,
1537 struct ceph_pg_pool_info *pool,
1538 int *osds, int len, int *primary)
1539{
1540 int i;
1541 int pos = -1;
1542
1543 /*
1544 * Do we have any non-default primary_affinity values for these
1545 * osds?
1546 */
1547 if (!osdmap->osd_primary_affinity)
1548 return;
1549
1550 for (i = 0; i < len; i++) {
1551 if (osds[i] != CRUSH_ITEM_NONE &&
1552 osdmap->osd_primary_affinity[i] !=
1553 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY) {
1554 break;
1555 }
1556 }
1557 if (i == len)
1558 return;
1559
1560 /*
1561 * Pick the primary. Feed both the seed (for the pg) and the
1562 * osd into the hash/rng so that a proportional fraction of an
1563 * osd's pgs get rejected as primary.
1564 */
1565 for (i = 0; i < len; i++) {
1566 int osd;
1567 u32 aff;
1568
1569 osd = osds[i];
1570 if (osd == CRUSH_ITEM_NONE)
1571 continue;
1572
1573 aff = osdmap->osd_primary_affinity[osd];
1574 if (aff < CEPH_OSD_MAX_PRIMARY_AFFINITY &&
1575 (crush_hash32_2(CRUSH_HASH_RJENKINS1,
1576 pps, osd) >> 16) >= aff) {
1577 /*
1578 * We chose not to use this primary. Note it
1579 * anyway as a fallback in case we don't pick
1580 * anyone else, but keep looking.
1581 */
1582 if (pos < 0)
1583 pos = i;
1584 } else {
1585 pos = i;
1586 break;
1587 }
1588 }
1589 if (pos < 0)
1590 return;
1591
1592 *primary = osds[pos];
1593
1594 if (ceph_can_shift_osds(pool) && pos > 0) {
1595 /* move the new primary to the front */
1596 for (i = pos; i > 0; i--)
1597 osds[i] = osds[i - 1];
1598 osds[0] = *primary;
1599 }
1600}
1601
1602/*
1603 * Given up set, apply pg_temp and primary_temp mappings.
1604 *
1605 * Return acting set length. *primary is set to acting primary osd id,
1606 * or -1 if acting set is empty.
1607 */
1608static int apply_temps(struct ceph_osdmap *osdmap,
1609 struct ceph_pg_pool_info *pool, struct ceph_pg pgid,
1610 int *osds, int len, int *primary)
1611{
1612 struct ceph_pg_mapping *pg;
1613 int temp_len;
1614 int temp_primary;
1615 int i;
1616
1617 /* raw_pg -> pg */
1173 pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num, 1618 pgid.seed = ceph_stable_mod(pgid.seed, pool->pg_num,
1174 pool->pg_num_mask); 1619 pool->pg_num_mask);
1620
1621 /* pg_temp? */
1175 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); 1622 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
1176 if (pg) { 1623 if (pg) {
1177 *num = pg->len; 1624 temp_len = 0;
1178 return pg->osds; 1625 temp_primary = -1;
1626
1627 for (i = 0; i < pg->pg_temp.len; i++) {
1628 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
1629 if (ceph_can_shift_osds(pool))
1630 continue;
1631 else
1632 osds[temp_len++] = CRUSH_ITEM_NONE;
1633 } else {
1634 osds[temp_len++] = pg->pg_temp.osds[i];
1635 }
1636 }
1637
1638 /* apply pg_temp's primary */
1639 for (i = 0; i < temp_len; i++) {
1640 if (osds[i] != CRUSH_ITEM_NONE) {
1641 temp_primary = osds[i];
1642 break;
1643 }
1644 }
1645 } else {
1646 temp_len = len;
1647 temp_primary = *primary;
1179 } 1648 }
1180 1649
1181 /* crush */ 1650 /* primary_temp? */
1182 ruleno = crush_find_rule(osdmap->crush, pool->crush_ruleset, 1651 pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid);
1183 pool->type, pool->size); 1652 if (pg)
1184 if (ruleno < 0) { 1653 temp_primary = pg->primary_temp.osd;
1185 pr_err("no crush rule pool %lld ruleset %d type %d size %d\n", 1654
1186 pgid.pool, pool->crush_ruleset, pool->type, 1655 *primary = temp_primary;
1187 pool->size); 1656 return temp_len;
1188 return NULL; 1657}
1658
1659/*
1660 * Calculate acting set for given pgid.
1661 *
1662 * Return acting set length, or error. *primary is set to acting
1663 * primary osd id, or -1 if acting set is empty or on error.
1664 */
1665int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1666 int *osds, int *primary)
1667{
1668 struct ceph_pg_pool_info *pool;
1669 u32 pps;
1670 int len;
1671
1672 pool = __lookup_pg_pool(&osdmap->pg_pools, pgid.pool);
1673 if (!pool) {
1674 *primary = -1;
1675 return -ENOENT;
1189 } 1676 }
1190 1677
1191 if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) { 1678 if (pool->flags & CEPH_POOL_FLAG_HASHPSPOOL) {
1192 /* hash pool id and seed sothat pool PGs do not overlap */ 1679 /* hash pool id and seed so that pool PGs do not overlap */
1193 pps = crush_hash32_2(CRUSH_HASH_RJENKINS1, 1680 pps = crush_hash32_2(CRUSH_HASH_RJENKINS1,
1194 ceph_stable_mod(pgid.seed, pool->pgp_num, 1681 ceph_stable_mod(pgid.seed, pool->pgp_num,
1195 pool->pgp_num_mask), 1682 pool->pgp_num_mask),
1196 pgid.pool); 1683 pgid.pool);
1197 } else { 1684 } else {
1198 /* 1685 /*
1199 * legacy ehavior: add ps and pool together. this is 1686 * legacy behavior: add ps and pool together. this is
1200 * not a great approach because the PGs from each pool 1687 * not a great approach because the PGs from each pool
1201 * will overlap on top of each other: 0.5 == 1.4 == 1688 * will overlap on top of each other: 0.5 == 1.4 ==
1202 * 2.3 == ... 1689 * 2.3 == ...
@@ -1205,38 +1692,20 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1205 pool->pgp_num_mask) + 1692 pool->pgp_num_mask) +
1206 (unsigned)pgid.pool; 1693 (unsigned)pgid.pool;
1207 } 1694 }
1208 r = crush_do_rule_ary(osdmap->crush, ruleno, pps, 1695
1209 osds, min_t(int, pool->size, *num), 1696 len = pg_to_raw_osds(osdmap, pool, pgid, pps, osds);
1210 osdmap->osd_weight, osdmap->max_osd); 1697 if (len < 0) {
1211 if (r < 0) { 1698 *primary = -1;
1212 pr_err("error %d from crush rule: pool %lld ruleset %d type %d" 1699 return len;
1213 " size %d\n", r, pgid.pool, pool->crush_ruleset,
1214 pool->type, pool->size);
1215 return NULL;
1216 } 1700 }
1217 *num = r;
1218 return osds;
1219}
1220 1701
1221/* 1702 len = raw_to_up_osds(osdmap, pool, osds, len, primary);
1222 * Return acting set for given pgid.
1223 */
1224int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1225 int *acting)
1226{
1227 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1228 int i, o, num = CEPH_PG_MAX_SIZE;
1229 1703
1230 osds = calc_pg_raw(osdmap, pgid, rawosds, &num); 1704 apply_primary_affinity(osdmap, pps, pool, osds, len, primary);
1231 if (!osds)
1232 return -1;
1233 1705
1234 /* primary is first up osd */ 1706 len = apply_temps(osdmap, pool, pgid, osds, len, primary);
1235 o = 0; 1707
1236 for (i = 0; i < num; i++) 1708 return len;
1237 if (ceph_osd_is_up(osdmap, osds[i]))
1238 acting[o++] = osds[i];
1239 return o;
1240} 1709}
1241 1710
1242/* 1711/*
@@ -1244,17 +1713,11 @@ int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1244 */ 1713 */
1245int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) 1714int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1246{ 1715{
1247 int rawosds[CEPH_PG_MAX_SIZE], *osds; 1716 int osds[CEPH_PG_MAX_SIZE];
1248 int i, num = CEPH_PG_MAX_SIZE; 1717 int primary;
1249 1718
1250 osds = calc_pg_raw(osdmap, pgid, rawosds, &num); 1719 ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
1251 if (!osds)
1252 return -1;
1253 1720
1254 /* primary is first up osd */ 1721 return primary;
1255 for (i = 0; i < num; i++)
1256 if (ceph_osd_is_up(osdmap, osds[i]))
1257 return osds[i];
1258 return -1;
1259} 1722}
1260EXPORT_SYMBOL(ceph_calc_pg_primary); 1723EXPORT_SYMBOL(ceph_calc_pg_primary);
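
The reworked ceph_calc_pg_acting() now returns the acting set length and reports the acting primary through a pointer, so ceph_calc_pg_primary() becomes a thin wrapper around it. A minimal caller sketch based on the new signatures above; the error-handling policy shown is an assumption, not part of this patch:

	int osds[CEPH_PG_MAX_SIZE];
	int primary, len;

	len = ceph_calc_pg_acting(osdmap, pgid, osds, &primary);
	if (len < 0)
		return len;		/* e.g. -ENOENT: pool does not exist */
	if (primary == -1)
		return -EAGAIN;		/* hypothetical: acting set is empty */
	/* osds[0..len-1] is the acting set, 'primary' its primary osd id */
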
diff --git a/net/core/dev.c b/net/core/dev.c
index 757063420ce0..14dac0654f28 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4043,6 +4043,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4043 skb->vlan_tci = 0; 4043 skb->vlan_tci = 0;
4044 skb->dev = napi->dev; 4044 skb->dev = napi->dev;
4045 skb->skb_iif = 0; 4045 skb->skb_iif = 0;
4046 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4046 4047
4047 napi->skb = skb; 4048 napi->skb = skb;
4048} 4049}
@@ -4588,8 +4589,7 @@ void *netdev_lower_get_next_private(struct net_device *dev,
4588 if (&lower->list == &dev->adj_list.lower) 4589 if (&lower->list == &dev->adj_list.lower)
4589 return NULL; 4590 return NULL;
4590 4591
4591 if (iter) 4592 *iter = lower->list.next;
4592 *iter = lower->list.next;
4593 4593
4594 return lower->private; 4594 return lower->private;
4595} 4595}
@@ -4617,8 +4617,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev,
4617 if (&lower->list == &dev->adj_list.lower) 4617 if (&lower->list == &dev->adj_list.lower)
4618 return NULL; 4618 return NULL;
4619 4619
4620 if (iter) 4620 *iter = &lower->list;
4621 *iter = &lower->list;
4622 4621
4623 return lower->private; 4622 return lower->private;
4624} 4623}
@@ -5696,6 +5695,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
5696 } 5695 }
5697 } 5696 }
5698 5697
5698#ifdef CONFIG_NET_RX_BUSY_POLL
5699 if (dev->netdev_ops->ndo_busy_poll)
5700 features |= NETIF_F_BUSY_POLL;
5701 else
5702#endif
5703 features &= ~NETIF_F_BUSY_POLL;
5704
5699 return features; 5705 return features;
5700} 5706}
5701 5707
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 30071dec287a..640ba0e5831c 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -97,6 +97,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
97 [NETIF_F_RXFCS_BIT] = "rx-fcs", 97 [NETIF_F_RXFCS_BIT] = "rx-fcs",
98 [NETIF_F_RXALL_BIT] = "rx-all", 98 [NETIF_F_RXALL_BIT] = "rx-all",
99 [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", 99 [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
100 [NETIF_F_BUSY_POLL_BIT] = "busy-poll",
100}; 101};
101 102
102static int ethtool_get_features(struct net_device *dev, void __user *useraddr) 103static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
diff --git a/net/core/filter.c b/net/core/filter.c
index 765556ba32ef..e08b3822c72a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -295,43 +295,43 @@ select_insn:
295 (*(s64 *) &A) >>= K; 295 (*(s64 *) &A) >>= K;
296 CONT; 296 CONT;
297 BPF_ALU64_BPF_MOD_BPF_X: 297 BPF_ALU64_BPF_MOD_BPF_X:
298 if (unlikely(X == 0))
299 return 0;
298 tmp = A; 300 tmp = A;
299 if (X) 301 A = do_div(tmp, X);
300 A = do_div(tmp, X);
301 CONT; 302 CONT;
302 BPF_ALU_BPF_MOD_BPF_X: 303 BPF_ALU_BPF_MOD_BPF_X:
304 if (unlikely(X == 0))
305 return 0;
303 tmp = (u32) A; 306 tmp = (u32) A;
304 if (X) 307 A = do_div(tmp, (u32) X);
305 A = do_div(tmp, (u32) X);
306 CONT; 308 CONT;
307 BPF_ALU64_BPF_MOD_BPF_K: 309 BPF_ALU64_BPF_MOD_BPF_K:
308 tmp = A; 310 tmp = A;
309 if (K) 311 A = do_div(tmp, K);
310 A = do_div(tmp, K);
311 CONT; 312 CONT;
312 BPF_ALU_BPF_MOD_BPF_K: 313 BPF_ALU_BPF_MOD_BPF_K:
313 tmp = (u32) A; 314 tmp = (u32) A;
314 if (K) 315 A = do_div(tmp, (u32) K);
315 A = do_div(tmp, (u32) K);
316 CONT; 316 CONT;
317 BPF_ALU64_BPF_DIV_BPF_X: 317 BPF_ALU64_BPF_DIV_BPF_X:
318 if (X) 318 if (unlikely(X == 0))
319 do_div(A, X); 319 return 0;
320 do_div(A, X);
320 CONT; 321 CONT;
321 BPF_ALU_BPF_DIV_BPF_X: 322 BPF_ALU_BPF_DIV_BPF_X:
323 if (unlikely(X == 0))
324 return 0;
322 tmp = (u32) A; 325 tmp = (u32) A;
323 if (X) 326 do_div(tmp, (u32) X);
324 do_div(tmp, (u32) X);
325 A = (u32) tmp; 327 A = (u32) tmp;
326 CONT; 328 CONT;
327 BPF_ALU64_BPF_DIV_BPF_K: 329 BPF_ALU64_BPF_DIV_BPF_K:
328 if (K) 330 do_div(A, K);
329 do_div(A, K);
330 CONT; 331 CONT;
331 BPF_ALU_BPF_DIV_BPF_K: 332 BPF_ALU_BPF_DIV_BPF_K:
332 tmp = (u32) A; 333 tmp = (u32) A;
333 if (K) 334 do_div(tmp, (u32) K);
334 do_div(tmp, (u32) K);
335 A = (u32) tmp; 335 A = (u32) tmp;
336 CONT; 336 CONT;
337 BPF_ALU_BPF_END_BPF_TO_BE: 337 BPF_ALU_BPF_END_BPF_TO_BE:
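
These hunks change the interpreter so that a zero divisor terminates the program with a return value of 0 instead of silently skipping the DIV/MOD. A small user-space model of the behavioral difference (not the kernel code itself):

	#include <stdint.h>
	#include <stdio.h>

	/* old behavior: skip the operation when the divisor is zero */
	static uint64_t div_old(uint64_t a, uint64_t x)
	{
		if (x)
			a /= x;
		return a;		/* execution continues with a stale A */
	}

	/* new behavior: a zero divisor ends the whole program, returning 0 */
	static int div_new(uint64_t *a, uint64_t x)
	{
		if (x == 0)
			return 0;	/* filter result 0, e.g. drop the packet */
		*a /= x;
		return 1;		/* keep executing */
	}

	int main(void)
	{
		uint64_t a = 42;

		printf("old: A stays %llu\n", (unsigned long long)div_old(a, 0));
		printf("new: %s\n", div_new(&a, 0) ? "continue" : "program returns 0");
		return 0;
	}
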
diff --git a/net/core/flow.c b/net/core/flow.c
index 31cfb365e0c6..a0348fde1fdf 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -455,6 +455,8 @@ int flow_cache_init(struct net *net)
455 if (!fc->percpu) 455 if (!fc->percpu)
456 return -ENOMEM; 456 return -ENOMEM;
457 457
458 cpu_notifier_register_begin();
459
458 for_each_online_cpu(i) { 460 for_each_online_cpu(i) {
459 if (flow_cache_cpu_prepare(fc, i)) 461 if (flow_cache_cpu_prepare(fc, i))
460 goto err; 462 goto err;
@@ -462,7 +464,9 @@ int flow_cache_init(struct net *net)
462 fc->hotcpu_notifier = (struct notifier_block){ 464 fc->hotcpu_notifier = (struct notifier_block){
463 .notifier_call = flow_cache_cpu, 465 .notifier_call = flow_cache_cpu,
464 }; 466 };
465 register_hotcpu_notifier(&fc->hotcpu_notifier); 467 __register_hotcpu_notifier(&fc->hotcpu_notifier);
468
469 cpu_notifier_register_done();
466 470
467 setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, 471 setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd,
468 (unsigned long) fc); 472 (unsigned long) fc);
@@ -478,6 +482,8 @@ err:
478 fcp->hash_table = NULL; 482 fcp->hash_table = NULL;
479 } 483 }
480 484
485 cpu_notifier_register_done();
486
481 free_percpu(fc->percpu); 487 free_percpu(fc->percpu);
482 fc->percpu = NULL; 488 fc->percpu = NULL;
483 489
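
The switch to __register_hotcpu_notifier() only works because the per-CPU setup loop and the registration are bracketed by cpu_notifier_register_begin()/cpu_notifier_register_done(), so no CPU can be added or removed in between. The shape of the pattern, with a hypothetical setup_cpu() helper standing in for flow_cache_cpu_prepare():

	cpu_notifier_register_begin();

	for_each_online_cpu(cpu) {
		if (setup_cpu(cpu))		/* hypothetical per-CPU init */
			goto err;
	}
	__register_hotcpu_notifier(&nb);

	cpu_notifier_register_done();
	return 0;

err:
	cpu_notifier_register_done();		/* drop the lock on failure too */
	return -ENOMEM;
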
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index d0dac57291af..d068ec25db1e 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3340,7 +3340,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
3340 3340
3341 __netif_tx_lock_bh(txq); 3341 __netif_tx_lock_bh(txq);
3342 3342
3343 if (unlikely(netif_xmit_frozen_or_stopped(txq))) { 3343 if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) {
3344 ret = NETDEV_TX_BUSY; 3344 ret = NETDEV_TX_BUSY;
3345 pkt_dev->last_ok = 0; 3345 pkt_dev->last_ok = 0;
3346 goto unlock; 3346 goto unlock;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 59da7cde0724..f95b6f93814b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1044,8 +1044,10 @@ static int __do_replace(struct net *net, const char *name,
1044 1044
1045 xt_free_table_info(oldinfo); 1045 xt_free_table_info(oldinfo);
1046 if (copy_to_user(counters_ptr, counters, 1046 if (copy_to_user(counters_ptr, counters,
1047 sizeof(struct xt_counters) * num_counters) != 0) 1047 sizeof(struct xt_counters) * num_counters) != 0) {
1048 ret = -EFAULT; 1048 /* Silent error, can't fail, new table is already in place */
1049 net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n");
1050 }
1049 vfree(counters); 1051 vfree(counters);
1050 xt_table_unlock(t); 1052 xt_table_unlock(t);
1051 return ret; 1053 return ret;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 718dfbd30cbe..99e810f84671 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1231,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1231 1231
1232 xt_free_table_info(oldinfo); 1232 xt_free_table_info(oldinfo);
1233 if (copy_to_user(counters_ptr, counters, 1233 if (copy_to_user(counters_ptr, counters,
1234 sizeof(struct xt_counters) * num_counters) != 0) 1234 sizeof(struct xt_counters) * num_counters) != 0) {
1235 ret = -EFAULT; 1235 /* Silent error, can't fail, new table is already in place */
1236 net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n");
1237 }
1236 vfree(counters); 1238 vfree(counters);
1237 xt_table_unlock(t); 1239 xt_table_unlock(t);
1238 return ret; 1240 return ret;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1be9e990514d..34d094cadb11 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -188,7 +188,7 @@ const __u8 ip_tos2prio[16] = {
188EXPORT_SYMBOL(ip_tos2prio); 188EXPORT_SYMBOL(ip_tos2prio);
189 189
190static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat); 190static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
191#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field) 191#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
192 192
193#ifdef CONFIG_PROC_FS 193#ifdef CONFIG_PROC_FS
194static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) 194static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 710238f58aa9..e080fbbbc0e5 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1241,8 +1241,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
1241 1241
1242 xt_free_table_info(oldinfo); 1242 xt_free_table_info(oldinfo);
1243 if (copy_to_user(counters_ptr, counters, 1243 if (copy_to_user(counters_ptr, counters,
1244 sizeof(struct xt_counters) * num_counters) != 0) 1244 sizeof(struct xt_counters) * num_counters) != 0) {
1245 ret = -EFAULT; 1245 /* Silent error, can't fail, new table is already in place */
1246 net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n");
1247 }
1246 vfree(counters); 1248 vfree(counters);
1247 xt_table_unlock(t); 1249 xt_table_unlock(t);
1248 return ret; 1250 return ret;
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index cd5b8ec9be04..da787930df0a 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -621,6 +621,42 @@ static void iucv_disable(void)
621 put_online_cpus(); 621 put_online_cpus();
622} 622}
623 623
624static void free_iucv_data(int cpu)
625{
626 kfree(iucv_param_irq[cpu]);
627 iucv_param_irq[cpu] = NULL;
628 kfree(iucv_param[cpu]);
629 iucv_param[cpu] = NULL;
630 kfree(iucv_irq_data[cpu]);
631 iucv_irq_data[cpu] = NULL;
632}
633
634static int alloc_iucv_data(int cpu)
635{
636 /* Note: GFP_DMA used to get memory below 2G */
637 iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
638 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
639 if (!iucv_irq_data[cpu])
640 goto out_free;
641
642 /* Allocate parameter blocks. */
643 iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param),
644 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
645 if (!iucv_param[cpu])
646 goto out_free;
647
648 iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param),
649 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
650 if (!iucv_param_irq[cpu])
651 goto out_free;
652
653 return 0;
654
655out_free:
656 free_iucv_data(cpu);
657 return -ENOMEM;
658}
659
624static int iucv_cpu_notify(struct notifier_block *self, 660static int iucv_cpu_notify(struct notifier_block *self,
625 unsigned long action, void *hcpu) 661 unsigned long action, void *hcpu)
626{ 662{
@@ -630,38 +666,14 @@ static int iucv_cpu_notify(struct notifier_block *self,
630 switch (action) { 666 switch (action) {
631 case CPU_UP_PREPARE: 667 case CPU_UP_PREPARE:
632 case CPU_UP_PREPARE_FROZEN: 668 case CPU_UP_PREPARE_FROZEN:
633 iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data), 669 if (alloc_iucv_data(cpu))
634 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
635 if (!iucv_irq_data[cpu])
636 return notifier_from_errno(-ENOMEM);
637
638 iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param),
639 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
640 if (!iucv_param[cpu]) {
641 kfree(iucv_irq_data[cpu]);
642 iucv_irq_data[cpu] = NULL;
643 return notifier_from_errno(-ENOMEM); 670 return notifier_from_errno(-ENOMEM);
644 }
645 iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param),
646 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
647 if (!iucv_param_irq[cpu]) {
648 kfree(iucv_param[cpu]);
649 iucv_param[cpu] = NULL;
650 kfree(iucv_irq_data[cpu]);
651 iucv_irq_data[cpu] = NULL;
652 return notifier_from_errno(-ENOMEM);
653 }
654 break; 671 break;
655 case CPU_UP_CANCELED: 672 case CPU_UP_CANCELED:
656 case CPU_UP_CANCELED_FROZEN: 673 case CPU_UP_CANCELED_FROZEN:
657 case CPU_DEAD: 674 case CPU_DEAD:
658 case CPU_DEAD_FROZEN: 675 case CPU_DEAD_FROZEN:
659 kfree(iucv_param_irq[cpu]); 676 free_iucv_data(cpu);
660 iucv_param_irq[cpu] = NULL;
661 kfree(iucv_param[cpu]);
662 iucv_param[cpu] = NULL;
663 kfree(iucv_irq_data[cpu]);
664 iucv_irq_data[cpu] = NULL;
665 break; 677 break;
666 case CPU_ONLINE: 678 case CPU_ONLINE:
667 case CPU_ONLINE_FROZEN: 679 case CPU_ONLINE_FROZEN:
@@ -2016,7 +2028,7 @@ static int __init iucv_init(void)
2016 rc = iucv_query_maxconn(); 2028 rc = iucv_query_maxconn();
2017 if (rc) 2029 if (rc)
2018 goto out_ctl; 2030 goto out_ctl;
2019 rc = register_external_interrupt(0x4000, iucv_external_interrupt); 2031 rc = register_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
2020 if (rc) 2032 if (rc)
2021 goto out_ctl; 2033 goto out_ctl;
2022 iucv_root = root_device_register("iucv"); 2034 iucv_root = root_device_register("iucv");
@@ -2025,33 +2037,20 @@ static int __init iucv_init(void)
2025 goto out_int; 2037 goto out_int;
2026 } 2038 }
2027 2039
2028 for_each_online_cpu(cpu) { 2040 cpu_notifier_register_begin();
2029 /* Note: GFP_DMA used to get memory below 2G */
2030 iucv_irq_data[cpu] = kmalloc_node(sizeof(struct iucv_irq_data),
2031 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
2032 if (!iucv_irq_data[cpu]) {
2033 rc = -ENOMEM;
2034 goto out_free;
2035 }
2036 2041
2037 /* Allocate parameter blocks. */ 2042 for_each_online_cpu(cpu) {
2038 iucv_param[cpu] = kmalloc_node(sizeof(union iucv_param), 2043 if (alloc_iucv_data(cpu)) {
2039 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
2040 if (!iucv_param[cpu]) {
2041 rc = -ENOMEM;
2042 goto out_free;
2043 }
2044 iucv_param_irq[cpu] = kmalloc_node(sizeof(union iucv_param),
2045 GFP_KERNEL|GFP_DMA, cpu_to_node(cpu));
2046 if (!iucv_param_irq[cpu]) {
2047 rc = -ENOMEM; 2044 rc = -ENOMEM;
2048 goto out_free; 2045 goto out_free;
2049 } 2046 }
2050
2051 } 2047 }
2052 rc = register_hotcpu_notifier(&iucv_cpu_notifier); 2048 rc = __register_hotcpu_notifier(&iucv_cpu_notifier);
2053 if (rc) 2049 if (rc)
2054 goto out_free; 2050 goto out_free;
2051
2052 cpu_notifier_register_done();
2053
2055 rc = register_reboot_notifier(&iucv_reboot_notifier); 2054 rc = register_reboot_notifier(&iucv_reboot_notifier);
2056 if (rc) 2055 if (rc)
2057 goto out_cpu; 2056 goto out_cpu;
@@ -2069,19 +2068,17 @@ static int __init iucv_init(void)
2069out_reboot: 2068out_reboot:
2070 unregister_reboot_notifier(&iucv_reboot_notifier); 2069 unregister_reboot_notifier(&iucv_reboot_notifier);
2071out_cpu: 2070out_cpu:
2072 unregister_hotcpu_notifier(&iucv_cpu_notifier); 2071 cpu_notifier_register_begin();
2072 __unregister_hotcpu_notifier(&iucv_cpu_notifier);
2073out_free: 2073out_free:
2074 for_each_possible_cpu(cpu) { 2074 for_each_possible_cpu(cpu)
2075 kfree(iucv_param_irq[cpu]); 2075 free_iucv_data(cpu);
2076 iucv_param_irq[cpu] = NULL; 2076
2077 kfree(iucv_param[cpu]); 2077 cpu_notifier_register_done();
2078 iucv_param[cpu] = NULL; 2078
2079 kfree(iucv_irq_data[cpu]);
2080 iucv_irq_data[cpu] = NULL;
2081 }
2082 root_device_unregister(iucv_root); 2079 root_device_unregister(iucv_root);
2083out_int: 2080out_int:
2084 unregister_external_interrupt(0x4000, iucv_external_interrupt); 2081 unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
2085out_ctl: 2082out_ctl:
2086 ctl_clear_bit(0, 1); 2083 ctl_clear_bit(0, 1);
2087out: 2084out:
@@ -2105,18 +2102,14 @@ static void __exit iucv_exit(void)
2105 kfree(p); 2102 kfree(p);
2106 spin_unlock_irq(&iucv_queue_lock); 2103 spin_unlock_irq(&iucv_queue_lock);
2107 unregister_reboot_notifier(&iucv_reboot_notifier); 2104 unregister_reboot_notifier(&iucv_reboot_notifier);
2108 unregister_hotcpu_notifier(&iucv_cpu_notifier); 2105 cpu_notifier_register_begin();
2109 for_each_possible_cpu(cpu) { 2106 __unregister_hotcpu_notifier(&iucv_cpu_notifier);
2110 kfree(iucv_param_irq[cpu]); 2107 for_each_possible_cpu(cpu)
2111 iucv_param_irq[cpu] = NULL; 2108 free_iucv_data(cpu);
2112 kfree(iucv_param[cpu]); 2109 cpu_notifier_register_done();
2113 iucv_param[cpu] = NULL;
2114 kfree(iucv_irq_data[cpu]);
2115 iucv_irq_data[cpu] = NULL;
2116 }
2117 root_device_unregister(iucv_root); 2110 root_device_unregister(iucv_root);
2118 bus_unregister(&iucv_bus); 2111 bus_unregister(&iucv_bus);
2119 unregister_external_interrupt(0x4000, iucv_external_interrupt); 2112 unregister_external_irq(EXT_IRQ_IUCV, iucv_external_interrupt);
2120} 2113}
2121 2114
2122subsys_initcall(iucv_init); 2115subsys_initcall(iucv_init);
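
alloc_iucv_data() funnels every failure to one cleanup label, and because kfree(NULL) is a no-op the same free_iucv_data() helper doubles as the rollback path. A standalone sketch of that allocate-or-unwind pattern (user-space stand-in, names hypothetical):

	#include <stdlib.h>

	static void *a, *b, *c;

	static void free_all(void)
	{
		free(c); c = NULL;	/* free(NULL) is a no-op, so this also */
		free(b); b = NULL;	/* unwinds a partially completed       */
		free(a); a = NULL;	/* allocation sequence                 */
	}

	static int alloc_all(void)
	{
		a = malloc(64);
		if (!a)
			goto out_free;
		b = malloc(64);
		if (!b)
			goto out_free;
		c = malloc(64);
		if (!c)
			goto out_free;
		return 0;

	out_free:
		free_all();
		return -1;		/* kernel code would return -ENOMEM */
	}

	int main(void)
	{
		return alloc_all();
	}
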
diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c
index 153bd1ddbfbb..f0991f2344d4 100644
--- a/net/mac802154/mib.c
+++ b/net/mac802154/mib.c
@@ -26,7 +26,6 @@
26#include <net/mac802154.h> 26#include <net/mac802154.h>
27#include <net/ieee802154_netdev.h> 27#include <net/ieee802154_netdev.h>
28#include <net/wpan-phy.h> 28#include <net/wpan-phy.h>
29#include <net/ieee802154_netdev.h>
30 29
31#include "mac802154.h" 30#include "mac802154.h"
32 31
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 33045a562297..3fd159db9f06 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -152,8 +152,8 @@ nf_tables_chain_type_lookup(const struct nft_af_info *afi,
152#ifdef CONFIG_MODULES 152#ifdef CONFIG_MODULES
153 if (autoload) { 153 if (autoload) {
154 nfnl_unlock(NFNL_SUBSYS_NFTABLES); 154 nfnl_unlock(NFNL_SUBSYS_NFTABLES);
155 request_module("nft-chain-%u-%*.s", afi->family, 155 request_module("nft-chain-%u-%.*s", afi->family,
156 nla_len(nla)-1, (const char *)nla_data(nla)); 156 nla_len(nla), (const char *)nla_data(nla));
157 nfnl_lock(NFNL_SUBSYS_NFTABLES); 157 nfnl_lock(NFNL_SUBSYS_NFTABLES);
158 type = __nf_tables_chain_type_lookup(afi->family, nla); 158 type = __nf_tables_chain_type_lookup(afi->family, nla);
159 if (type != NULL) 159 if (type != NULL)
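
The old format "%*.s" consumes the length argument as a field width and uses an implicit precision of zero, so the chain type name was never printed; "%.*s" takes the length as the precision and prints at most that many bytes of a possibly unterminated attribute. A runnable illustration of the two conversions:

	#include <stdio.h>

	int main(void)
	{
		const char name[3] = { 'n', 'a', 't' };	/* not NUL-terminated */

		printf("[%*.s]\n", 3, name);	/* width 3, precision 0 -> "[   ]" */
		printf("[%.*s]\n", 3, name);	/* precision 3          -> "[nat]" */
		return 0;
	}
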
@@ -1946,7 +1946,8 @@ static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const
1946 1946
1947static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { 1947static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
1948 [NFTA_SET_TABLE] = { .type = NLA_STRING }, 1948 [NFTA_SET_TABLE] = { .type = NLA_STRING },
1949 [NFTA_SET_NAME] = { .type = NLA_STRING }, 1949 [NFTA_SET_NAME] = { .type = NLA_STRING,
1950 .len = IFNAMSIZ - 1 },
1950 [NFTA_SET_FLAGS] = { .type = NLA_U32 }, 1951 [NFTA_SET_FLAGS] = { .type = NLA_U32 },
1951 [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, 1952 [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 },
1952 [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, 1953 [NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index 9a8e77e7f8d4..f4e833005320 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -54,7 +54,8 @@ static struct xt_match cgroup_mt_reg __read_mostly = {
54 .matchsize = sizeof(struct xt_cgroup_info), 54 .matchsize = sizeof(struct xt_cgroup_info),
55 .me = THIS_MODULE, 55 .me = THIS_MODULE,
56 .hooks = (1 << NF_INET_LOCAL_OUT) | 56 .hooks = (1 << NF_INET_LOCAL_OUT) |
57 (1 << NF_INET_POST_ROUTING), 57 (1 << NF_INET_POST_ROUTING) |
58 (1 << NF_INET_LOCAL_IN),
58}; 59};
59 60
60static int __init cgroup_mt_init(void) 61static int __init cgroup_mt_init(void)
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 458464e7bd7a..fbc66bb250d5 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -32,8 +32,14 @@
32#include <net/netfilter/nf_conntrack_tuple.h> 32#include <net/netfilter/nf_conntrack_tuple.h>
33#include <net/netfilter/nf_conntrack_zones.h> 33#include <net/netfilter/nf_conntrack_zones.h>
34 34
35#define CONNLIMIT_SLOTS 32 35#define CONNLIMIT_SLOTS 256U
36#define CONNLIMIT_LOCK_SLOTS 32 36
37#ifdef CONFIG_LOCKDEP
38#define CONNLIMIT_LOCK_SLOTS 8U
39#else
40#define CONNLIMIT_LOCK_SLOTS 256U
41#endif
42
37#define CONNLIMIT_GC_MAX_NODES 8 43#define CONNLIMIT_GC_MAX_NODES 8
38 44
39/* we will save the tuples of all connections we care about */ 45/* we will save the tuples of all connections we care about */
@@ -49,10 +55,11 @@ struct xt_connlimit_rb {
49 union nf_inet_addr addr; /* search key */ 55 union nf_inet_addr addr; /* search key */
50}; 56};
51 57
58static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp;
59
52struct xt_connlimit_data { 60struct xt_connlimit_data {
53 struct rb_root climit_root4[CONNLIMIT_SLOTS]; 61 struct rb_root climit_root4[CONNLIMIT_SLOTS];
54 struct rb_root climit_root6[CONNLIMIT_SLOTS]; 62 struct rb_root climit_root6[CONNLIMIT_SLOTS];
55 spinlock_t locks[CONNLIMIT_LOCK_SLOTS];
56}; 63};
57 64
58static u_int32_t connlimit_rnd __read_mostly; 65static u_int32_t connlimit_rnd __read_mostly;
@@ -297,11 +304,11 @@ static int count_them(struct net *net,
297 root = &data->climit_root4[hash]; 304 root = &data->climit_root4[hash];
298 } 305 }
299 306
300 spin_lock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]); 307 spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
301 308
302 count = count_tree(net, root, tuple, addr, mask, family); 309 count = count_tree(net, root, tuple, addr, mask, family);
303 310
304 spin_unlock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]); 311 spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
305 312
306 return count; 313 return count;
307} 314}
@@ -377,9 +384,6 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
377 return -ENOMEM; 384 return -ENOMEM;
378 } 385 }
379 386
380 for (i = 0; i < ARRAY_SIZE(info->data->locks); ++i)
381 spin_lock_init(&info->data->locks[i]);
382
383 for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) 387 for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
384 info->data->climit_root4[i] = RB_ROOT; 388 info->data->climit_root4[i] = RB_ROOT;
385 for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) 389 for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
@@ -435,11 +439,14 @@ static struct xt_match connlimit_mt_reg __read_mostly = {
435 439
436static int __init connlimit_mt_init(void) 440static int __init connlimit_mt_init(void)
437{ 441{
438 int ret; 442 int ret, i;
439 443
440 BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS); 444 BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS);
441 BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0); 445 BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0);
442 446
447 for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i)
448 spin_lock_init(&xt_connlimit_locks[i]);
449
443 connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn", 450 connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn",
444 sizeof(struct xt_connlimit_conn), 451 sizeof(struct xt_connlimit_conn),
445 0, 0, NULL); 452 0, 0, NULL);
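
Moving the spinlocks out of xt_connlimit_data into a fixed global array means several rbtree slots share one lock: with 256 slots and (under lockdep) 8 locks, each lock covers 32 trees, which is why the BUILD_BUG_ONs insist the slot count is a multiple of the lock count. A sketch of the slot-to-lock mapping; the hash shown is illustrative, the real code hashes the peer address:

	unsigned int hash = jhash_1word(key, connlimit_rnd) % CONNLIMIT_SLOTS;

	spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
	/* walk or update climit_root4[hash] / climit_root6[hash] here */
	spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]);
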
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 7174611bd672..c529161cdbf8 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -422,4 +422,6 @@ module_exit(xt_osf_fini);
422MODULE_LICENSE("GPL"); 422MODULE_LICENSE("GPL");
423MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); 423MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
424MODULE_DESCRIPTION("Passive OS fingerprint matching."); 424MODULE_DESCRIPTION("Passive OS fingerprint matching.");
425MODULE_ALIAS("ipt_osf");
426MODULE_ALIAS("ip6t_osf");
425MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); 427MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 01039d2b1695..72e0c71fb01d 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -261,7 +261,7 @@ static int packet_direct_xmit(struct sk_buff *skb)
261 local_bh_disable(); 261 local_bh_disable();
262 262
263 HARD_TX_LOCK(dev, txq, smp_processor_id()); 263 HARD_TX_LOCK(dev, txq, smp_processor_id());
264 if (!netif_xmit_frozen_or_stopped(txq)) { 264 if (!netif_xmit_frozen_or_drv_stopped(txq)) {
265 ret = ops->ndo_start_xmit(skb, dev); 265 ret = ops->ndo_start_xmit(skb, dev);
266 if (ret == NETDEV_TX_OK) 266 if (ret == NETDEV_TX_OK)
267 txq_trans_update(txq); 267 txq_trans_update(txq);
@@ -275,6 +275,7 @@ static int packet_direct_xmit(struct sk_buff *skb)
275 275
276 return ret; 276 return ret;
277drop: 277drop:
278 atomic_long_inc(&dev->tx_dropped);
278 kfree_skb(skb); 279 kfree_skb(skb);
279 return NET_XMIT_DROP; 280 return NET_XMIT_DROP;
280} 281}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 981aaf8b6ace..5f83a6a2fa67 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -6593,6 +6593,40 @@ static void __sctp_write_space(struct sctp_association *asoc)
6593 } 6593 }
6594} 6594}
6595 6595
6596static void sctp_wake_up_waiters(struct sock *sk,
6597 struct sctp_association *asoc)
6598{
6599 struct sctp_association *tmp = asoc;
6600
6601 /* We do accounting for the sndbuf space per association,
6602 * so we only need to wake our own association.
6603 */
6604 if (asoc->ep->sndbuf_policy)
6605 return __sctp_write_space(asoc);
6606
6607 /* Accounting for the sndbuf space is per socket, so we
6608 * need to wake up others, try to be fair and in case of
6609 * other associations, let them have a go first instead
6610 * of just doing a sctp_write_space() call.
6611 *
6612 * Note that we reach sctp_wake_up_waiters() only when
6613 * associations free up queued chunks, thus we are under
6614 * lock and the list of associations on a socket is
6615 * guaranteed not to change.
6616 */
6617 for (tmp = list_next_entry(tmp, asocs); 1;
6618 tmp = list_next_entry(tmp, asocs)) {
6619 /* Manually skip the head element. */
6620 if (&tmp->asocs == &((sctp_sk(sk))->ep->asocs))
6621 continue;
6622 /* Wake up association. */
6623 __sctp_write_space(tmp);
6624 /* We've reached the end. */
6625 if (tmp == asoc)
6626 break;
6627 }
6628}
6629
6596/* Do accounting for the sndbuf space. 6630/* Do accounting for the sndbuf space.
6597 * Decrement the used sndbuf space of the corresponding association by the 6631 * Decrement the used sndbuf space of the corresponding association by the
6598 * data size which was just transmitted(freed). 6632 * data size which was just transmitted(freed).
@@ -6620,7 +6654,7 @@ static void sctp_wfree(struct sk_buff *skb)
6620 sk_mem_uncharge(sk, skb->truesize); 6654 sk_mem_uncharge(sk, skb->truesize);
6621 6655
6622 sock_wfree(skb); 6656 sock_wfree(skb);
6623 __sctp_write_space(asoc); 6657 sctp_wake_up_waiters(sk, asoc);
6624 6658
6625 sctp_association_put(asoc); 6659 sctp_association_put(asoc);
6626} 6660}
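
sctp_wake_up_waiters() walks the socket's association list starting just after asoc, skips the list head, and only wakes asoc itself when the loop comes back around to it, so the other associations get a turn first. The same round-robin traversal as a generic sketch, with hypothetical struct and helper names:

	struct item *it = start;

	for (it = list_next_entry(it, node); ; it = list_next_entry(it, node)) {
		if (&it->node == head)	/* skip the list head element */
			continue;
		visit(it);		/* hypothetical per-item work */
		if (it == start)	/* back at the starting entry: done */
			break;
	}
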
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 241b54f30204..0754d0f466d2 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -9,19 +9,6 @@ config SUNRPC_BACKCHANNEL
9 bool 9 bool
10 depends on SUNRPC 10 depends on SUNRPC
11 11
12config SUNRPC_XPRT_RDMA
13 tristate
14 depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
15 default SUNRPC && INFINIBAND
16 help
17 This option allows the NFS client and server to support
18 an RDMA-enabled transport.
19
20 To compile RPC client RDMA transport support as a module,
21 choose M here: the module will be called xprtrdma.
22
23 If unsure, say N.
24
25config SUNRPC_SWAP 12config SUNRPC_SWAP
26 bool 13 bool
27 depends on SUNRPC 14 depends on SUNRPC
@@ -57,3 +44,29 @@ config SUNRPC_DEBUG
57 but makes troubleshooting NFS issues significantly harder. 44 but makes troubleshooting NFS issues significantly harder.
58 45
59 If unsure, say Y. 46 If unsure, say Y.
47
48config SUNRPC_XPRT_RDMA_CLIENT
49 tristate "RPC over RDMA Client Support"
50 depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
51 default SUNRPC && INFINIBAND
52 help
53 This option allows the NFS client to support an RDMA-enabled
54 transport.
55
56 To compile RPC client RDMA transport support as a module,
57 choose M here: the module will be called xprtrdma.
58
59 If unsure, say N.
60
61config SUNRPC_XPRT_RDMA_SERVER
62 tristate "RPC over RDMA Server Support"
63 depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS
64 default SUNRPC && INFINIBAND
65 help
66 This option allows the NFS server to support an RDMA-enabled
67 transport.
68
69 To compile RPC server RDMA transport support as a module,
70 choose M here: the module will be called svcrdma.
71
72 If unsure, say N.
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index 8209a0411bca..e5a7a1cac8f3 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -5,7 +5,8 @@
5 5
6obj-$(CONFIG_SUNRPC) += sunrpc.o 6obj-$(CONFIG_SUNRPC) += sunrpc.o
7obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ 7obj-$(CONFIG_SUNRPC_GSS) += auth_gss/
8obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma/ 8
9obj-y += xprtrdma/
9 10
10sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ 11sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
11 auth.o auth_null.o auth_unix.o auth_generic.o \ 12 auth.o auth_null.o auth_unix.o auth_generic.o \
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index e860d4f7ed2a..3513d559bc45 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -212,39 +212,23 @@ out:
212} 212}
213EXPORT_SYMBOL_GPL(xprt_destroy_backchannel); 213EXPORT_SYMBOL_GPL(xprt_destroy_backchannel);
214 214
215/* 215static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
216 * One or more rpc_rqst structure have been preallocated during the
217 * backchannel setup. Buffer space for the send and private XDR buffers
218 * has been preallocated as well. Use xprt_alloc_bc_request to allocate
219 * to this request. Use xprt_free_bc_request to return it.
220 *
221 * We know that we're called in soft interrupt context, grab the spin_lock
222 * since there is no need to grab the bottom half spin_lock.
223 *
224 * Return an available rpc_rqst, otherwise NULL if non are available.
225 */
226struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt)
227{ 216{
228 struct rpc_rqst *req; 217 struct rpc_rqst *req = NULL;
229 218
230 dprintk("RPC: allocate a backchannel request\n"); 219 dprintk("RPC: allocate a backchannel request\n");
231 spin_lock(&xprt->bc_pa_lock); 220 if (list_empty(&xprt->bc_pa_list))
232 if (!list_empty(&xprt->bc_pa_list)) { 221 goto not_found;
233 req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
234 rq_bc_pa_list);
235 list_del(&req->rq_bc_pa_list);
236 } else {
237 req = NULL;
238 }
239 spin_unlock(&xprt->bc_pa_lock);
240 222
241 if (req != NULL) { 223 req = list_first_entry(&xprt->bc_pa_list, struct rpc_rqst,
242 set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); 224 rq_bc_pa_list);
243 req->rq_reply_bytes_recvd = 0; 225 req->rq_reply_bytes_recvd = 0;
244 req->rq_bytes_sent = 0; 226 req->rq_bytes_sent = 0;
245 memcpy(&req->rq_private_buf, &req->rq_rcv_buf, 227 memcpy(&req->rq_private_buf, &req->rq_rcv_buf,
246 sizeof(req->rq_private_buf)); 228 sizeof(req->rq_private_buf));
247 } 229 req->rq_xid = xid;
230 req->rq_connect_cookie = xprt->connect_cookie;
231not_found:
248 dprintk("RPC: backchannel req=%p\n", req); 232 dprintk("RPC: backchannel req=%p\n", req);
249 return req; 233 return req;
250} 234}
@@ -259,6 +243,7 @@ void xprt_free_bc_request(struct rpc_rqst *req)
259 243
260 dprintk("RPC: free backchannel req=%p\n", req); 244 dprintk("RPC: free backchannel req=%p\n", req);
261 245
246 req->rq_connect_cookie = xprt->connect_cookie - 1;
262 smp_mb__before_clear_bit(); 247 smp_mb__before_clear_bit();
263 WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state)); 248 WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state));
264 clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state); 249 clear_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
@@ -281,7 +266,57 @@ void xprt_free_bc_request(struct rpc_rqst *req)
281 * may be reused by a new callback request. 266 * may be reused by a new callback request.
282 */ 267 */
283 spin_lock_bh(&xprt->bc_pa_lock); 268 spin_lock_bh(&xprt->bc_pa_lock);
284 list_add(&req->rq_bc_pa_list, &xprt->bc_pa_list); 269 list_add_tail(&req->rq_bc_pa_list, &xprt->bc_pa_list);
285 spin_unlock_bh(&xprt->bc_pa_lock); 270 spin_unlock_bh(&xprt->bc_pa_lock);
286} 271}
287 272
273/*
274 * One or more rpc_rqst structure have been preallocated during the
275 * backchannel setup. Buffer space for the send and private XDR buffers
276 * has been preallocated as well. Use xprt_alloc_bc_request to allocate
277 * to this request. Use xprt_free_bc_request to return it.
278 *
279 * We know that we're called in soft interrupt context, grab the spin_lock
280 * since there is no need to grab the bottom half spin_lock.
281 *
 282 * Return an available rpc_rqst, otherwise NULL if none are available.
283 */
284struct rpc_rqst *xprt_lookup_bc_request(struct rpc_xprt *xprt, __be32 xid)
285{
286 struct rpc_rqst *req;
287
288 spin_lock(&xprt->bc_pa_lock);
289 list_for_each_entry(req, &xprt->bc_pa_list, rq_bc_pa_list) {
290 if (req->rq_connect_cookie != xprt->connect_cookie)
291 continue;
292 if (req->rq_xid == xid)
293 goto found;
294 }
295 req = xprt_alloc_bc_request(xprt, xid);
296found:
297 spin_unlock(&xprt->bc_pa_lock);
298 return req;
299}
300
301/*
302 * Add callback request to callback list. The callback
303 * service sleeps on the sv_cb_waitq waiting for new
 304 * requests. Wake it up after enqueueing the
305 * request.
306 */
307void xprt_complete_bc_request(struct rpc_rqst *req, uint32_t copied)
308{
309 struct rpc_xprt *xprt = req->rq_xprt;
310 struct svc_serv *bc_serv = xprt->bc_serv;
311
312 req->rq_private_buf.len = copied;
313 set_bit(RPC_BC_PA_IN_USE, &req->rq_bc_pa_state);
314
315 dprintk("RPC: add callback request to list\n");
316 spin_lock(&bc_serv->sv_cb_lock);
317 list_del(&req->rq_bc_pa_list);
318 list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
319 wake_up(&bc_serv->sv_cb_waitq);
320 spin_unlock(&bc_serv->sv_cb_lock);
321}
322
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0edada973434..2e6ab10734f6 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -438,6 +438,38 @@ out_no_rpciod:
438 return ERR_PTR(err); 438 return ERR_PTR(err);
439} 439}
440 440
441struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
442 struct rpc_xprt *xprt)
443{
444 struct rpc_clnt *clnt = NULL;
445
446 clnt = rpc_new_client(args, xprt, NULL);
447 if (IS_ERR(clnt))
448 return clnt;
449
450 if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
451 int err = rpc_ping(clnt);
452 if (err != 0) {
453 rpc_shutdown_client(clnt);
454 return ERR_PTR(err);
455 }
456 }
457
458 clnt->cl_softrtry = 1;
459 if (args->flags & RPC_CLNT_CREATE_HARDRTRY)
460 clnt->cl_softrtry = 0;
461
462 if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
463 clnt->cl_autobind = 1;
464 if (args->flags & RPC_CLNT_CREATE_DISCRTRY)
465 clnt->cl_discrtry = 1;
466 if (!(args->flags & RPC_CLNT_CREATE_QUIET))
467 clnt->cl_chatty = 1;
468
469 return clnt;
470}
471EXPORT_SYMBOL_GPL(rpc_create_xprt);
472
441/** 473/**
442 * rpc_create - create an RPC client and transport with one call 474 * rpc_create - create an RPC client and transport with one call
443 * @args: rpc_clnt create argument structure 475 * @args: rpc_clnt create argument structure
@@ -451,7 +483,6 @@ out_no_rpciod:
451struct rpc_clnt *rpc_create(struct rpc_create_args *args) 483struct rpc_clnt *rpc_create(struct rpc_create_args *args)
452{ 484{
453 struct rpc_xprt *xprt; 485 struct rpc_xprt *xprt;
454 struct rpc_clnt *clnt;
455 struct xprt_create xprtargs = { 486 struct xprt_create xprtargs = {
456 .net = args->net, 487 .net = args->net,
457 .ident = args->protocol, 488 .ident = args->protocol,
@@ -515,30 +546,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
515 if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT) 546 if (args->flags & RPC_CLNT_CREATE_NONPRIVPORT)
516 xprt->resvport = 0; 547 xprt->resvport = 0;
517 548
518 clnt = rpc_new_client(args, xprt, NULL); 549 return rpc_create_xprt(args, xprt);
519 if (IS_ERR(clnt))
520 return clnt;
521
522 if (!(args->flags & RPC_CLNT_CREATE_NOPING)) {
523 int err = rpc_ping(clnt);
524 if (err != 0) {
525 rpc_shutdown_client(clnt);
526 return ERR_PTR(err);
527 }
528 }
529
530 clnt->cl_softrtry = 1;
531 if (args->flags & RPC_CLNT_CREATE_HARDRTRY)
532 clnt->cl_softrtry = 0;
533
534 if (args->flags & RPC_CLNT_CREATE_AUTOBIND)
535 clnt->cl_autobind = 1;
536 if (args->flags & RPC_CLNT_CREATE_DISCRTRY)
537 clnt->cl_discrtry = 1;
538 if (!(args->flags & RPC_CLNT_CREATE_QUIET))
539 clnt->cl_chatty = 1;
540
541 return clnt;
542} 550}
543EXPORT_SYMBOL_GPL(rpc_create); 551EXPORT_SYMBOL_GPL(rpc_create);
544 552
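
Factoring the tail of rpc_create() into rpc_create_xprt() lets a caller that already owns an rpc_xprt attach a client to it and get the same ping and flag handling. A hedged usage sketch; where the transport comes from, and the argument values, are assumptions for illustration only:

	struct rpc_create_args args = {
		.net		= net,
		.program	= program,
		.version	= vers,
		.authflavor	= RPC_AUTH_UNIX,
		.flags		= RPC_CLNT_CREATE_NOPING,
	};
	struct rpc_clnt *clnt;

	clnt = rpc_create_xprt(&args, xprt);	/* xprt obtained elsewhere */
	if (IS_ERR(clnt))
		return PTR_ERR(clnt);
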
@@ -1363,6 +1371,7 @@ rpc_restart_call_prepare(struct rpc_task *task)
1363 if (RPC_ASSASSINATED(task)) 1371 if (RPC_ASSASSINATED(task))
1364 return 0; 1372 return 0;
1365 task->tk_action = call_start; 1373 task->tk_action = call_start;
1374 task->tk_status = 0;
1366 if (task->tk_ops->rpc_call_prepare != NULL) 1375 if (task->tk_ops->rpc_call_prepare != NULL)
1367 task->tk_action = rpc_prepare_task; 1376 task->tk_action = rpc_prepare_task;
1368 return 1; 1377 return 1;
@@ -1379,6 +1388,7 @@ rpc_restart_call(struct rpc_task *task)
1379 if (RPC_ASSASSINATED(task)) 1388 if (RPC_ASSASSINATED(task))
1380 return 0; 1389 return 0;
1381 task->tk_action = call_start; 1390 task->tk_action = call_start;
1391 task->tk_status = 0;
1382 return 1; 1392 return 1;
1383} 1393}
1384EXPORT_SYMBOL_GPL(rpc_restart_call); 1394EXPORT_SYMBOL_GPL(rpc_restart_call);
@@ -1728,9 +1738,7 @@ call_bind_status(struct rpc_task *task)
1728 case -EPROTONOSUPPORT: 1738 case -EPROTONOSUPPORT:
1729 dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n", 1739 dprintk("RPC: %5u remote rpcbind version unavailable, retrying\n",
1730 task->tk_pid); 1740 task->tk_pid);
1731 task->tk_status = 0; 1741 goto retry_timeout;
1732 task->tk_action = call_bind;
1733 return;
1734 case -ECONNREFUSED: /* connection problems */ 1742 case -ECONNREFUSED: /* connection problems */
1735 case -ECONNRESET: 1743 case -ECONNRESET:
1736 case -ECONNABORTED: 1744 case -ECONNABORTED:
@@ -1756,6 +1764,7 @@ call_bind_status(struct rpc_task *task)
1756 return; 1764 return;
1757 1765
1758retry_timeout: 1766retry_timeout:
1767 task->tk_status = 0;
1759 task->tk_action = call_timeout; 1768 task->tk_action = call_timeout;
1760} 1769}
1761 1770
@@ -1798,21 +1807,19 @@ call_connect_status(struct rpc_task *task)
1798 trace_rpc_connect_status(task, status); 1807 trace_rpc_connect_status(task, status);
1799 task->tk_status = 0; 1808 task->tk_status = 0;
1800 switch (status) { 1809 switch (status) {
1801 /* if soft mounted, test if we've timed out */
1802 case -ETIMEDOUT:
1803 task->tk_action = call_timeout;
1804 return;
1805 case -ECONNREFUSED: 1810 case -ECONNREFUSED:
1806 case -ECONNRESET: 1811 case -ECONNRESET:
1807 case -ECONNABORTED: 1812 case -ECONNABORTED:
1808 case -ENETUNREACH: 1813 case -ENETUNREACH:
1809 case -EHOSTUNREACH: 1814 case -EHOSTUNREACH:
1810 /* retry with existing socket, after a delay */
1811 rpc_delay(task, 3*HZ);
1812 if (RPC_IS_SOFTCONN(task)) 1815 if (RPC_IS_SOFTCONN(task))
1813 break; 1816 break;
1817 /* retry with existing socket, after a delay */
1818 rpc_delay(task, 3*HZ);
1814 case -EAGAIN: 1819 case -EAGAIN:
1815 task->tk_action = call_bind; 1820 /* Check for timeouts before looping back to call_bind */
1821 case -ETIMEDOUT:
1822 task->tk_action = call_timeout;
1816 return; 1823 return;
1817 case 0: 1824 case 0:
1818 clnt->cl_stats->netreconn++; 1825 clnt->cl_stats->netreconn++;
@@ -2007,6 +2014,10 @@ call_status(struct rpc_task *task)
2007 case -EHOSTDOWN: 2014 case -EHOSTDOWN:
2008 case -EHOSTUNREACH: 2015 case -EHOSTUNREACH:
2009 case -ENETUNREACH: 2016 case -ENETUNREACH:
2017 if (RPC_IS_SOFTCONN(task)) {
2018 rpc_exit(task, status);
2019 break;
2020 }
2010 /* 2021 /*
2011 * Delay any retries for 3 seconds, then handle as if it 2022 * Delay any retries for 3 seconds, then handle as if it
2012 * were a timeout. 2023 * were a timeout.
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index ff3cc4bf4b24..25578afe1548 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -637,7 +637,8 @@ static void __rpc_queue_timer_fn(unsigned long ptr)
637 637
638static void __rpc_atrun(struct rpc_task *task) 638static void __rpc_atrun(struct rpc_task *task)
639{ 639{
640 task->tk_status = 0; 640 if (task->tk_status == -ETIMEDOUT)
641 task->tk_status = 0;
641} 642}
642 643
643/* 644/*
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index b6e59f0a9475..d06cb8752dcd 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1397,6 +1397,22 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
1397 return svsk; 1397 return svsk;
1398} 1398}
1399 1399
1400bool svc_alien_sock(struct net *net, int fd)
1401{
1402 int err;
1403 struct socket *sock = sockfd_lookup(fd, &err);
1404 bool ret = false;
1405
1406 if (!sock)
1407 goto out;
1408 if (sock_net(sock->sk) != net)
1409 ret = true;
1410 sockfd_put(sock);
1411out:
1412 return ret;
1413}
1414EXPORT_SYMBOL_GPL(svc_alien_sock);
1415
1400/** 1416/**
1401 * svc_addsock - add a listener socket to an RPC service 1417 * svc_addsock - add a listener socket to an RPC service
1402 * @serv: pointer to RPC service to which to add a new listener 1418 * @serv: pointer to RPC service to which to add a new listener
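
svc_alien_sock() tells the caller whether a user-supplied fd refers to a socket created in a different network namespace than the one the RPC service runs in. A minimal caller sketch; rejecting with -EINVAL and the name buffer are assumptions, not dictated by the helper:

	char name[64];

	if (svc_alien_sock(net, fd))
		return -EINVAL;		/* socket belongs to another netns */

	err = svc_addsock(serv, fd, name, sizeof(name));
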
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 1504bb11e4f3..dd97ba3c4456 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -833,8 +833,20 @@ xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf)
833} 833}
834EXPORT_SYMBOL_GPL(xdr_buf_from_iov); 834EXPORT_SYMBOL_GPL(xdr_buf_from_iov);
835 835
836/* Sets subbuf to the portion of buf of length len beginning base bytes 836/**
837 * from the start of buf. Returns -1 if base of length are out of bounds. */ 837 * xdr_buf_subsegment - set subbuf to a portion of buf
838 * @buf: an xdr buffer
839 * @subbuf: the result buffer
840 * @base: beginning of range in bytes
841 * @len: length of range in bytes
842 *
843 * sets @subbuf to an xdr buffer representing the portion of @buf of
844 * length @len starting at offset @base.
845 *
846 * @buf and @subbuf may be pointers to the same struct xdr_buf.
847 *
 848 * Returns -1 if base or length are out of bounds.
849 */
838int 850int
839xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, 851xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
840 unsigned int base, unsigned int len) 852 unsigned int base, unsigned int len)
@@ -847,9 +859,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
847 len -= subbuf->head[0].iov_len; 859 len -= subbuf->head[0].iov_len;
848 base = 0; 860 base = 0;
849 } else { 861 } else {
850 subbuf->head[0].iov_base = NULL;
851 subbuf->head[0].iov_len = 0;
852 base -= buf->head[0].iov_len; 862 base -= buf->head[0].iov_len;
863 subbuf->head[0].iov_len = 0;
853 } 864 }
854 865
855 if (base < buf->page_len) { 866 if (base < buf->page_len) {
@@ -871,9 +882,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
871 len -= subbuf->tail[0].iov_len; 882 len -= subbuf->tail[0].iov_len;
872 base = 0; 883 base = 0;
873 } else { 884 } else {
874 subbuf->tail[0].iov_base = NULL;
875 subbuf->tail[0].iov_len = 0;
876 base -= buf->tail[0].iov_len; 885 base -= buf->tail[0].iov_len;
886 subbuf->tail[0].iov_len = 0;
877 } 887 }
878 888
879 if (base || len) 889 if (base || len)
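
With the kernel-doc above, a typical use of xdr_buf_subsegment() is carving a fixed range out of a received buffer; @buf and @subbuf may even point to the same xdr_buf. A short sketch, where taking the range from rqstp->rq_arg and the offset values are illustrative assumptions:

	struct xdr_buf sub;

	/* expose bytes [base, base + len) of the argument buffer */
	if (xdr_buf_subsegment(&rqstp->rq_arg, &sub, base, len) < 0)
		return -EINVAL;		/* requested range is out of bounds */
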
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 7d4df99f761f..d173f79947c6 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1383,15 +1383,3 @@ void xprt_put(struct rpc_xprt *xprt)
1383 if (atomic_dec_and_test(&xprt->count)) 1383 if (atomic_dec_and_test(&xprt->count))
1384 xprt_destroy(xprt); 1384 xprt_destroy(xprt);
1385} 1385}
1386
1387/**
1388 * xprt_get - return a reference to an RPC transport.
1389 * @xprt: pointer to the transport
1390 *
1391 */
1392struct rpc_xprt *xprt_get(struct rpc_xprt *xprt)
1393{
1394 if (atomic_inc_not_zero(&xprt->count))
1395 return xprt;
1396 return NULL;
1397}
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 5a8f268bdd30..da5136fd5694 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -1,8 +1,8 @@
1obj-$(CONFIG_SUNRPC_XPRT_RDMA) += xprtrdma.o 1obj-$(CONFIG_SUNRPC_XPRT_RDMA_CLIENT) += xprtrdma.o
2 2
3xprtrdma-y := transport.o rpc_rdma.o verbs.o 3xprtrdma-y := transport.o rpc_rdma.o verbs.o
4 4
5obj-$(CONFIG_SUNRPC_XPRT_RDMA) += svcrdma.o 5obj-$(CONFIG_SUNRPC_XPRT_RDMA_SERVER) += svcrdma.o
6 6
7svcrdma-y := svc_rdma.o svc_rdma_transport.o \ 7svcrdma-y := svc_rdma.o svc_rdma_transport.o \
8 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o 8 svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index e03725bfe2b8..96ead526b125 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -649,9 +649,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
649 break; 649 break;
650 page_base = 0; 650 page_base = 0;
651 } 651 }
652 rqst->rq_rcv_buf.page_len = olen - copy_len; 652 }
653 } else
654 rqst->rq_rcv_buf.page_len = 0;
655 653
656 if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) { 654 if (copy_len && rqst->rq_rcv_buf.tail[0].iov_len) {
657 curlen = copy_len; 655 curlen = copy_len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 0ce75524ed21..8d904e4eef15 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -90,6 +90,7 @@ static void rdma_build_arg_xdr(struct svc_rqst *rqstp,
90 sge_no++; 90 sge_no++;
91 } 91 }
92 rqstp->rq_respages = &rqstp->rq_pages[sge_no]; 92 rqstp->rq_respages = &rqstp->rq_pages[sge_no];
93 rqstp->rq_next_page = rqstp->rq_respages + 1;
93 94
94 /* We should never run out of SGE because the limit is defined to 95 /* We should never run out of SGE because the limit is defined to
95 * support the max allowed RPC data length 96 * support the max allowed RPC data length
@@ -169,6 +170,7 @@ static int map_read_chunks(struct svcxprt_rdma *xprt,
169 */ 170 */
170 head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no]; 171 head->arg.pages[page_no] = rqstp->rq_arg.pages[page_no];
171 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1]; 172 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no+1];
173 rqstp->rq_next_page = rqstp->rq_respages + 1;
172 174
173 byte_count -= sge_bytes; 175 byte_count -= sge_bytes;
174 ch_bytes -= sge_bytes; 176 ch_bytes -= sge_bytes;
@@ -276,6 +278,7 @@ static int fast_reg_read_chunks(struct svcxprt_rdma *xprt,
276 278
277 /* rq_respages points one past arg pages */ 279 /* rq_respages points one past arg pages */
278 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; 280 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
281 rqstp->rq_next_page = rqstp->rq_respages + 1;
279 282
280 /* Create the reply and chunk maps */ 283 /* Create the reply and chunk maps */
281 offset = 0; 284 offset = 0;
@@ -520,13 +523,6 @@ next_sge:
520 for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++) 523 for (ch_no = 0; &rqstp->rq_pages[ch_no] < rqstp->rq_respages; ch_no++)
521 rqstp->rq_pages[ch_no] = NULL; 524 rqstp->rq_pages[ch_no] = NULL;
522 525
523 /*
524 * Detach res pages. If svc_release sees any it will attempt to
525 * put them.
526 */
527 while (rqstp->rq_next_page != rqstp->rq_respages)
528 *(--rqstp->rq_next_page) = NULL;
529
530 return err; 526 return err;
531} 527}
532 528
@@ -550,7 +546,7 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
550 546
551 /* rq_respages starts after the last arg page */ 547 /* rq_respages starts after the last arg page */
552 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no]; 548 rqstp->rq_respages = &rqstp->rq_arg.pages[page_no];
553 rqstp->rq_next_page = &rqstp->rq_arg.pages[page_no]; 549 rqstp->rq_next_page = rqstp->rq_respages + 1;
554 550
555 /* Rebuild rq_arg head and tail. */ 551 /* Rebuild rq_arg head and tail. */
556 rqstp->rq_arg.head[0] = head->arg.head[0]; 552 rqstp->rq_arg.head[0] = head->arg.head[0];
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index c1d124dc772b..7e024a51617e 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -265,6 +265,7 @@ static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt,
265 xdr_off -= xdr->head[0].iov_len; 265 xdr_off -= xdr->head[0].iov_len;
266 if (xdr_off < xdr->page_len) { 266 if (xdr_off < xdr->page_len) {
267 /* This offset is in the page list */ 267 /* This offset is in the page list */
268 xdr_off += xdr->page_base;
268 page = xdr->pages[xdr_off >> PAGE_SHIFT]; 269 page = xdr->pages[xdr_off >> PAGE_SHIFT];
269 xdr_off &= ~PAGE_MASK; 270 xdr_off &= ~PAGE_MASK;
270 } else { 271 } else {
@@ -625,6 +626,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
625 if (page_no+1 >= sge_no) 626 if (page_no+1 >= sge_no)
626 ctxt->sge[page_no+1].length = 0; 627 ctxt->sge[page_no+1].length = 0;
627 } 628 }
629 rqstp->rq_next_page = rqstp->rq_respages + 1;
628 BUG_ON(sge_no > rdma->sc_max_sge); 630 BUG_ON(sge_no > rdma->sc_max_sge);
629 memset(&send_wr, 0, sizeof send_wr); 631 memset(&send_wr, 0, sizeof send_wr);
630 ctxt->wr_op = IB_WR_SEND; 632 ctxt->wr_op = IB_WR_SEND;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 62e4f9bcc387..25688fa2207f 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -477,8 +477,7 @@ struct page *svc_rdma_get_page(void)
477 477
478 while ((page = alloc_page(GFP_KERNEL)) == NULL) { 478 while ((page = alloc_page(GFP_KERNEL)) == NULL) {
479 /* If we can't get memory, wait a bit and try again */ 479 /* If we can't get memory, wait a bit and try again */
480 printk(KERN_INFO "svcrdma: out of memory...retrying in 1000 " 480 printk(KERN_INFO "svcrdma: out of memory...retrying in 1s\n");
481 "jiffies.\n");
482 schedule_timeout_uninterruptible(msecs_to_jiffies(1000)); 481 schedule_timeout_uninterruptible(msecs_to_jiffies(1000));
483 } 482 }
484 return page; 483 return page;
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 285dc0884115..1eb9c468d0c9 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -733,7 +733,7 @@ static void __exit xprt_rdma_cleanup(void)
733{ 733{
734 int rc; 734 int rc;
735 735
736 dprintk(KERN_INFO "RPCRDMA Module Removed, deregister RPC RDMA transport\n"); 736 dprintk("RPCRDMA Module Removed, deregister RPC RDMA transport\n");
737#ifdef RPC_DEBUG 737#ifdef RPC_DEBUG
738 if (sunrpc_table_header) { 738 if (sunrpc_table_header) {
739 unregister_sysctl_table(sunrpc_table_header); 739 unregister_sysctl_table(sunrpc_table_header);
@@ -755,14 +755,14 @@ static int __init xprt_rdma_init(void)
755 if (rc) 755 if (rc)
756 return rc; 756 return rc;
757 757
758 dprintk(KERN_INFO "RPCRDMA Module Init, register RPC RDMA transport\n"); 758 dprintk("RPCRDMA Module Init, register RPC RDMA transport\n");
759 759
760 dprintk(KERN_INFO "Defaults:\n"); 760 dprintk("Defaults:\n");
761 dprintk(KERN_INFO "\tSlots %d\n" 761 dprintk("\tSlots %d\n"
762 "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n", 762 "\tMaxInlineRead %d\n\tMaxInlineWrite %d\n",
763 xprt_rdma_slot_table_entries, 763 xprt_rdma_slot_table_entries,
764 xprt_rdma_max_inline_read, xprt_rdma_max_inline_write); 764 xprt_rdma_max_inline_read, xprt_rdma_max_inline_write);
765 dprintk(KERN_INFO "\tPadding %d\n\tMemreg %d\n", 765 dprintk("\tPadding %d\n\tMemreg %d\n",
766 xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy); 766 xprt_rdma_inline_write_padding, xprt_rdma_memreg_strategy);
767 767
768#ifdef RPC_DEBUG 768#ifdef RPC_DEBUG
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 0addefca8e77..6735e1d1e9bb 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -909,6 +909,12 @@ static void xs_tcp_close(struct rpc_xprt *xprt)
909 xs_tcp_shutdown(xprt); 909 xs_tcp_shutdown(xprt);
910} 910}
911 911
912static void xs_xprt_free(struct rpc_xprt *xprt)
913{
914 xs_free_peer_addresses(xprt);
915 xprt_free(xprt);
916}
917
912/** 918/**
913 * xs_destroy - prepare to shutdown a transport 919 * xs_destroy - prepare to shutdown a transport
914 * @xprt: doomed transport 920 * @xprt: doomed transport
@@ -919,8 +925,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
919 dprintk("RPC: xs_destroy xprt %p\n", xprt); 925 dprintk("RPC: xs_destroy xprt %p\n", xprt);
920 926
921 xs_close(xprt); 927 xs_close(xprt);
922 xs_free_peer_addresses(xprt); 928 xs_xprt_free(xprt);
923 xprt_free(xprt);
924 module_put(THIS_MODULE); 929 module_put(THIS_MODULE);
925} 930}
926 931
@@ -1306,41 +1311,29 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
1306 * If we're unable to obtain the rpc_rqst we schedule the closing of the 1311 * If we're unable to obtain the rpc_rqst we schedule the closing of the
1307 * connection and return -1. 1312 * connection and return -1.
1308 */ 1313 */
1309static inline int xs_tcp_read_callback(struct rpc_xprt *xprt, 1314static int xs_tcp_read_callback(struct rpc_xprt *xprt,
1310 struct xdr_skb_reader *desc) 1315 struct xdr_skb_reader *desc)
1311{ 1316{
1312 struct sock_xprt *transport = 1317 struct sock_xprt *transport =
1313 container_of(xprt, struct sock_xprt, xprt); 1318 container_of(xprt, struct sock_xprt, xprt);
1314 struct rpc_rqst *req; 1319 struct rpc_rqst *req;
1315 1320
1316 req = xprt_alloc_bc_request(xprt); 1321 /* Look up and lock the request corresponding to the given XID */
1322 spin_lock(&xprt->transport_lock);
1323 req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
1317 if (req == NULL) { 1324 if (req == NULL) {
1325 spin_unlock(&xprt->transport_lock);
1318 printk(KERN_WARNING "Callback slot table overflowed\n"); 1326 printk(KERN_WARNING "Callback slot table overflowed\n");
1319 xprt_force_disconnect(xprt); 1327 xprt_force_disconnect(xprt);
1320 return -1; 1328 return -1;
1321 } 1329 }
1322 1330
1323 req->rq_xid = transport->tcp_xid;
1324 dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid)); 1331 dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid));
1325 xs_tcp_read_common(xprt, desc, req); 1332 xs_tcp_read_common(xprt, desc, req);
1326 1333
1327 if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) { 1334 if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
1328 struct svc_serv *bc_serv = xprt->bc_serv; 1335 xprt_complete_bc_request(req, transport->tcp_copied);
1329 1336 spin_unlock(&xprt->transport_lock);
1330 /*
1331 * Add callback request to callback list. The callback
1332 * service sleeps on the sv_cb_waitq waiting for new
1333 * requests. Wake it up after adding enqueing the
1334 * request.
1335 */
1336 dprintk("RPC: add callback request to list\n");
1337 spin_lock(&bc_serv->sv_cb_lock);
1338 list_add(&req->rq_bc_list, &bc_serv->sv_cb_list);
1339 spin_unlock(&bc_serv->sv_cb_lock);
1340 wake_up(&bc_serv->sv_cb_waitq);
1341 }
1342
1343 req->rq_private_buf.len = transport->tcp_copied;
1344 1337
1345 return 0; 1338 return 0;
1346} 1339}
@@ -2544,6 +2537,10 @@ static void bc_close(struct rpc_xprt *xprt)
2544 2537
2545static void bc_destroy(struct rpc_xprt *xprt) 2538static void bc_destroy(struct rpc_xprt *xprt)
2546{ 2539{
2540 dprintk("RPC: bc_destroy xprt %p\n", xprt);
2541
2542 xs_xprt_free(xprt);
2543 module_put(THIS_MODULE);
2547} 2544}
2548 2545
2549static struct rpc_xprt_ops xs_local_ops = { 2546static struct rpc_xprt_ops xs_local_ops = {
@@ -2744,7 +2741,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
2744 return xprt; 2741 return xprt;
2745 ret = ERR_PTR(-EINVAL); 2742 ret = ERR_PTR(-EINVAL);
2746out_err: 2743out_err:
2747 xprt_free(xprt); 2744 xs_xprt_free(xprt);
2748 return ret; 2745 return ret;
2749} 2746}
2750 2747
@@ -2822,7 +2819,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
2822 return xprt; 2819 return xprt;
2823 ret = ERR_PTR(-EINVAL); 2820 ret = ERR_PTR(-EINVAL);
2824out_err: 2821out_err:
2825 xprt_free(xprt); 2822 xs_xprt_free(xprt);
2826 return ret; 2823 return ret;
2827} 2824}
2828 2825
@@ -2897,12 +2894,11 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
2897 xprt->address_strings[RPC_DISPLAY_ADDR], 2894 xprt->address_strings[RPC_DISPLAY_ADDR],
2898 xprt->address_strings[RPC_DISPLAY_PROTO]); 2895 xprt->address_strings[RPC_DISPLAY_PROTO]);
2899 2896
2900
2901 if (try_module_get(THIS_MODULE)) 2897 if (try_module_get(THIS_MODULE))
2902 return xprt; 2898 return xprt;
2903 ret = ERR_PTR(-EINVAL); 2899 ret = ERR_PTR(-EINVAL);
2904out_err: 2900out_err:
2905 xprt_free(xprt); 2901 xs_xprt_free(xprt);
2906 return ret; 2902 return ret;
2907} 2903}
2908 2904
@@ -2919,15 +2915,6 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
2919 struct svc_sock *bc_sock; 2915 struct svc_sock *bc_sock;
2920 struct rpc_xprt *ret; 2916 struct rpc_xprt *ret;
2921 2917
2922 if (args->bc_xprt->xpt_bc_xprt) {
2923 /*
2924 * This server connection already has a backchannel
2925 * transport; we can't create a new one, as we wouldn't
2926 * be able to match replies based on xid any more. So,
2927 * reuse the already-existing one:
2928 */
2929 return args->bc_xprt->xpt_bc_xprt;
2930 }
2931 xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries, 2918 xprt = xs_setup_xprt(args, xprt_tcp_slot_table_entries,
2932 xprt_tcp_slot_table_entries); 2919 xprt_tcp_slot_table_entries);
2933 if (IS_ERR(xprt)) 2920 if (IS_ERR(xprt))
@@ -2985,13 +2972,14 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args)
2985 */ 2972 */
2986 xprt_set_connected(xprt); 2973 xprt_set_connected(xprt);
2987 2974
2988
2989 if (try_module_get(THIS_MODULE)) 2975 if (try_module_get(THIS_MODULE))
2990 return xprt; 2976 return xprt;
2977
2978 args->bc_xprt->xpt_bc_xprt = NULL;
2991 xprt_put(xprt); 2979 xprt_put(xprt);
2992 ret = ERR_PTR(-EINVAL); 2980 ret = ERR_PTR(-EINVAL);
2993out_err: 2981out_err:
2994 xprt_free(xprt); 2982 xs_xprt_free(xprt);
2995 return ret; 2983 return ret;
2996} 2984}
2997 2985
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 0374a817631e..4c564eb69e1a 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -182,6 +182,8 @@ void tipc_net_start(u32 addr)
182 tipc_bclink_init(); 182 tipc_bclink_init();
183 write_unlock_bh(&tipc_net_lock); 183 write_unlock_bh(&tipc_net_lock);
184 184
185 tipc_nametbl_publish(TIPC_CFG_SRV, tipc_own_addr, tipc_own_addr,
186 TIPC_ZONE_SCOPE, 0, tipc_own_addr);
185 pr_info("Started in network mode\n"); 187 pr_info("Started in network mode\n");
186 pr_info("Own node address %s, network identity %u\n", 188 pr_info("Own node address %s, network identity %u\n",
187 tipc_addr_string_fill(addr_string, tipc_own_addr), tipc_net_id); 189 tipc_addr_string_fill(addr_string, tipc_own_addr), tipc_net_id);
@@ -192,6 +194,7 @@ void tipc_net_stop(void)
192 if (!tipc_own_addr) 194 if (!tipc_own_addr)
193 return; 195 return;
194 196
197 tipc_nametbl_withdraw(TIPC_CFG_SRV, tipc_own_addr, 0, tipc_own_addr);
195 write_lock_bh(&tipc_net_lock); 198 write_lock_bh(&tipc_net_lock);
196 tipc_bearer_stop(); 199 tipc_bearer_stop();
197 tipc_bclink_stop(); 200 tipc_bclink_stop();
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 29b7f26a12cf..adc12e227303 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -301,7 +301,6 @@ static int tipc_release(struct socket *sock)
301 struct tipc_sock *tsk; 301 struct tipc_sock *tsk;
302 struct tipc_port *port; 302 struct tipc_port *port;
303 struct sk_buff *buf; 303 struct sk_buff *buf;
304 int res;
305 304
306 /* 305 /*
307 * Exit if socket isn't fully initialized (occurs when a failed accept() 306 * Exit if socket isn't fully initialized (occurs when a failed accept()
@@ -349,7 +348,7 @@ static int tipc_release(struct socket *sock)
349 sock_put(sk); 348 sock_put(sk);
350 sock->sk = NULL; 349 sock->sk = NULL;
351 350
352 return res; 351 return 0;
353} 352}
354 353
355/** 354/**