aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2013-06-19 06:55:31 -0400
committerIngo Molnar <mingo@kernel.org>2013-06-19 06:55:31 -0400
commitd81344c50824a4d28a9397e97135d60075ac37ff (patch)
treed25c443fb4a764cd788db857c49dd3d3f8f722d3 /net
parent0de358f1c2642710d41190b73fbc295e675c4ab8 (diff)
parent29bb9e5a75684106a37593ad75ec75ff8312731b (diff)
Merge branch 'sched/urgent' into sched/core
Merge in fixes before applying ongoing new work. Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'net')
-rw-r--r--net/802/mrp.c4
-rw-r--r--net/batman-adv/distributed-arp-table.c13
-rw-r--r--net/batman-adv/main.c19
-rw-r--r--net/batman-adv/network-coding.c8
-rw-r--r--net/batman-adv/originator.c16
-rw-r--r--net/batman-adv/originator.h1
-rw-r--r--net/batman-adv/soft-interface.c1
-rw-r--r--net/batman-adv/translation-table.c7
-rw-r--r--net/bridge/netfilter/ebt_log.c11
-rw-r--r--net/bridge/netfilter/ebt_ulog.c18
-rw-r--r--net/ceph/Makefile2
-rw-r--r--net/ceph/auth.c117
-rw-r--r--net/ceph/auth_x.c24
-rw-r--r--net/ceph/auth_x.h1
-rw-r--r--net/ceph/ceph_common.c7
-rw-r--r--net/ceph/debugfs.c4
-rw-r--r--net/ceph/messenger.c1019
-rw-r--r--net/ceph/mon_client.c7
-rw-r--r--net/ceph/osd_client.c1090
-rw-r--r--net/ceph/osdmap.c45
-rw-r--r--net/ceph/snapshot.c78
-rw-r--r--net/core/dev.c11
-rw-r--r--net/core/iovec.c50
-rw-r--r--net/core/net-sysfs.c12
-rw-r--r--net/core/netpoll.c2
-rw-r--r--net/core/sock.c12
-rw-r--r--net/ipv4/fib_trie.c13
-rw-r--r--net/ipv4/gre.c8
-rw-r--r--net/ipv4/inet_fragment.c1
-rw-r--r--net/ipv4/ip_gre.c3
-rw-r--r--net/ipv4/ip_output.c2
-rw-r--r--net/ipv4/netfilter/ipt_ULOG.c13
-rw-r--r--net/ipv4/tcp.c29
-rw-r--r--net/ipv4/tcp_input.c23
-rw-r--r--net/ipv4/tcp_metrics.c15
-rw-r--r--net/ipv4/tcp_output.c10
-rw-r--r--net/ipv4/udp.c4
-rw-r--r--net/ipv6/ip6_gre.c2
-rw-r--r--net/ipv6/ip6_output.c2
-rw-r--r--net/ipv6/tcp_ipv6.c12
-rw-r--r--net/ipv6/udp.c13
-rw-r--r--net/ipv6/udp_impl.h2
-rw-r--r--net/ipv6/udplite.c2
-rw-r--r--net/ipv6/xfrm6_policy.c4
-rw-r--r--net/irda/irlap_frame.c2
-rw-r--r--net/mac80211/ieee80211_i.h1
-rw-r--r--net/mac80211/mlme.c61
-rw-r--r--net/mac80211/rate.c9
-rw-r--r--net/mac80211/rx.c3
-rw-r--r--net/mac80211/tkip.c4
-rw-r--r--net/mac80211/util.c7
-rw-r--r--net/netfilter/nf_log.c7
-rw-r--r--net/netfilter/nfnetlink_log.c6
-rw-r--r--net/netfilter/nfnetlink_queue_core.c2
-rw-r--r--net/netfilter/xt_LOG.c13
-rw-r--r--net/netfilter/xt_NFLOG.c3
-rw-r--r--net/netfilter/xt_TCPOPTSTRIP.c17
-rw-r--r--net/netlabel/netlabel_domainhash.c69
-rw-r--r--net/socket.c6
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c65
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_xdr.c58
-rw-r--r--net/sunrpc/clnt.c2
-rw-r--r--net/sunrpc/netns.h4
-rw-r--r--net/sunrpc/rpc_pipe.c5
-rw-r--r--net/sunrpc/sched.c8
-rw-r--r--net/tipc/link.c11
-rw-r--r--net/wireless/core.c17
-rw-r--r--net/wireless/nl80211.c4
-rw-r--r--net/wireless/sme.c3
-rw-r--r--net/wireless/trace.h23
-rw-r--r--net/xfrm/xfrm_output.c1
71 files changed, 2166 insertions, 982 deletions
diff --git a/net/802/mrp.c b/net/802/mrp.c
index e085bcc754f6..1eb05d80b07b 100644
--- a/net/802/mrp.c
+++ b/net/802/mrp.c
@@ -871,10 +871,10 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl)
871 */ 871 */
872 del_timer_sync(&app->join_timer); 872 del_timer_sync(&app->join_timer);
873 873
874 spin_lock(&app->lock); 874 spin_lock_bh(&app->lock);
875 mrp_mad_event(app, MRP_EVENT_TX); 875 mrp_mad_event(app, MRP_EVENT_TX);
876 mrp_pdu_queue(app); 876 mrp_pdu_queue(app);
877 spin_unlock(&app->lock); 877 spin_unlock_bh(&app->lock);
878 878
879 mrp_queue_xmit(app); 879 mrp_queue_xmit(app);
880 880
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 8e15d966d9b0..239992021b1d 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -837,6 +837,19 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
837 837
838 dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst); 838 dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_dst);
839 if (dat_entry) { 839 if (dat_entry) {
840 /* If the ARP request is destined for a local client the local
841 * client will answer itself. DAT would only generate a
842 * duplicate packet.
843 *
844 * Moreover, if the soft-interface is enslaved into a bridge, an
845 * additional DAT answer may trigger kernel warnings about
846 * a packet coming from the wrong port.
847 */
848 if (batadv_is_my_client(bat_priv, dat_entry->mac_addr)) {
849 ret = true;
850 goto out;
851 }
852
840 skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src, 853 skb_new = arp_create(ARPOP_REPLY, ETH_P_ARP, ip_src,
841 bat_priv->soft_iface, ip_dst, hw_src, 854 bat_priv->soft_iface, ip_dst, hw_src,
842 dat_entry->mac_addr, hw_src); 855 dat_entry->mac_addr, hw_src);
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 3e30a0f1b908..51aafd669cbb 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -163,16 +163,25 @@ void batadv_mesh_free(struct net_device *soft_iface)
163 batadv_vis_quit(bat_priv); 163 batadv_vis_quit(bat_priv);
164 164
165 batadv_gw_node_purge(bat_priv); 165 batadv_gw_node_purge(bat_priv);
166 batadv_originator_free(bat_priv);
167 batadv_nc_free(bat_priv); 166 batadv_nc_free(bat_priv);
167 batadv_dat_free(bat_priv);
168 batadv_bla_free(bat_priv);
168 169
170 /* Free the TT and the originator tables only after having terminated
171 * all the other depending components which may use these structures for
172 * their purposes.
173 */
169 batadv_tt_free(bat_priv); 174 batadv_tt_free(bat_priv);
170 175
171 batadv_bla_free(bat_priv); 176 /* Since the originator table clean up routine is accessing the TT
172 177 * tables as well, it has to be invoked after the TT tables have been
173 batadv_dat_free(bat_priv); 178 * freed and marked as empty. This ensures that no cleanup RCU callbacks
179 * accessing the TT data are scheduled for later execution.
180 */
181 batadv_originator_free(bat_priv);
174 182
175 free_percpu(bat_priv->bat_counters); 183 free_percpu(bat_priv->bat_counters);
184 bat_priv->bat_counters = NULL;
176 185
177 atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE); 186 atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE);
178} 187}
@@ -475,7 +484,7 @@ static int batadv_param_set_ra(const char *val, const struct kernel_param *kp)
475 char *algo_name = (char *)val; 484 char *algo_name = (char *)val;
476 size_t name_len = strlen(algo_name); 485 size_t name_len = strlen(algo_name);
477 486
478 if (algo_name[name_len - 1] == '\n') 487 if (name_len > 0 && algo_name[name_len - 1] == '\n')
479 algo_name[name_len - 1] = '\0'; 488 algo_name[name_len - 1] = '\0';
480 489
481 bat_algo_ops = batadv_algo_get(algo_name); 490 bat_algo_ops = batadv_algo_get(algo_name);
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index f7c54305a918..e84629ece9b7 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1514,6 +1514,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
1514 struct ethhdr *ethhdr, ethhdr_tmp; 1514 struct ethhdr *ethhdr, ethhdr_tmp;
1515 uint8_t *orig_dest, ttl, ttvn; 1515 uint8_t *orig_dest, ttl, ttvn;
1516 unsigned int coding_len; 1516 unsigned int coding_len;
1517 int err;
1517 1518
1518 /* Save headers temporarily */ 1519 /* Save headers temporarily */
1519 memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp)); 1520 memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp));
@@ -1568,8 +1569,11 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
1568 coding_len); 1569 coding_len);
1569 1570
1570 /* Resize decoded skb if decoded with larger packet */ 1571 /* Resize decoded skb if decoded with larger packet */
1571 if (nc_packet->skb->len > coding_len + h_size) 1572 if (nc_packet->skb->len > coding_len + h_size) {
1572 pskb_trim_rcsum(skb, coding_len + h_size); 1573 err = pskb_trim_rcsum(skb, coding_len + h_size);
1574 if (err)
1575 return NULL;
1576 }
1573 1577
1574 /* Create decoded unicast packet */ 1578 /* Create decoded unicast packet */
1575 unicast_packet = (struct batadv_unicast_packet *)skb->data; 1579 unicast_packet = (struct batadv_unicast_packet *)skb->data;
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 2f3452546636..fad1a2093e15 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -156,12 +156,28 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
156 kfree(orig_node); 156 kfree(orig_node);
157} 157}
158 158
159/**
160 * batadv_orig_node_free_ref - decrement the orig node refcounter and possibly
161 * schedule an rcu callback for freeing it
162 * @orig_node: the orig node to free
163 */
159void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node) 164void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node)
160{ 165{
161 if (atomic_dec_and_test(&orig_node->refcount)) 166 if (atomic_dec_and_test(&orig_node->refcount))
162 call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); 167 call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu);
163} 168}
164 169
170/**
171 * batadv_orig_node_free_ref_now - decrement the orig node refcounter and
172 * possibly free it (without rcu callback)
173 * @orig_node: the orig node to free
174 */
175void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node)
176{
177 if (atomic_dec_and_test(&orig_node->refcount))
178 batadv_orig_node_free_rcu(&orig_node->rcu);
179}
180
165void batadv_originator_free(struct batadv_priv *bat_priv) 181void batadv_originator_free(struct batadv_priv *bat_priv)
166{ 182{
167 struct batadv_hashtable *hash = bat_priv->orig_hash; 183 struct batadv_hashtable *hash = bat_priv->orig_hash;
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 7df48fa7669d..734e5a3d8a5b 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -26,6 +26,7 @@ int batadv_originator_init(struct batadv_priv *bat_priv);
26void batadv_originator_free(struct batadv_priv *bat_priv); 26void batadv_originator_free(struct batadv_priv *bat_priv);
27void batadv_purge_orig_ref(struct batadv_priv *bat_priv); 27void batadv_purge_orig_ref(struct batadv_priv *bat_priv);
28void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node); 28void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node);
29void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node);
29struct batadv_orig_node *batadv_get_orig_node(struct batadv_priv *bat_priv, 30struct batadv_orig_node *batadv_get_orig_node(struct batadv_priv *bat_priv,
30 const uint8_t *addr); 31 const uint8_t *addr);
31struct batadv_neigh_node * 32struct batadv_neigh_node *
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 6f20d339e33a..819dfb006cdf 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -505,6 +505,7 @@ unreg_debugfs:
505 batadv_debugfs_del_meshif(dev); 505 batadv_debugfs_del_meshif(dev);
506free_bat_counters: 506free_bat_counters:
507 free_percpu(bat_priv->bat_counters); 507 free_percpu(bat_priv->bat_counters);
508 bat_priv->bat_counters = NULL;
508 509
509 return ret; 510 return ret;
510} 511}
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 5e89deeb9542..9e8748575845 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -144,7 +144,12 @@ static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu)
144 struct batadv_tt_orig_list_entry *orig_entry; 144 struct batadv_tt_orig_list_entry *orig_entry;
145 145
146 orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu); 146 orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu);
147 batadv_orig_node_free_ref(orig_entry->orig_node); 147
148 /* We are in an rcu callback here, therefore we cannot use
149 * batadv_orig_node_free_ref() and its call_rcu():
150 * An rcu_barrier() wouldn't wait for that to finish
151 */
152 batadv_orig_node_free_ref_now(orig_entry->orig_node);
148 kfree(orig_entry); 153 kfree(orig_entry);
149} 154}
150 155
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 9878eb8204c5..19c37a4929bc 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -72,13 +72,12 @@ print_ports(const struct sk_buff *skb, uint8_t protocol, int offset)
72} 72}
73 73
74static void 74static void
75ebt_log_packet(u_int8_t pf, unsigned int hooknum, 75ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
76 const struct sk_buff *skb, const struct net_device *in, 76 const struct sk_buff *skb, const struct net_device *in,
77 const struct net_device *out, const struct nf_loginfo *loginfo, 77 const struct net_device *out, const struct nf_loginfo *loginfo,
78 const char *prefix) 78 const char *prefix)
79{ 79{
80 unsigned int bitmask; 80 unsigned int bitmask;
81 struct net *net = dev_net(in ? in : out);
82 81
83 /* FIXME: Disabled from containers until syslog ns is supported */ 82 /* FIXME: Disabled from containers until syslog ns is supported */
84 if (!net_eq(net, &init_net)) 83 if (!net_eq(net, &init_net))
@@ -191,7 +190,7 @@ ebt_log_tg(struct sk_buff *skb, const struct xt_action_param *par)
191 nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, 190 nf_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb,
192 par->in, par->out, &li, "%s", info->prefix); 191 par->in, par->out, &li, "%s", info->prefix);
193 else 192 else
194 ebt_log_packet(NFPROTO_BRIDGE, par->hooknum, skb, par->in, 193 ebt_log_packet(net, NFPROTO_BRIDGE, par->hooknum, skb, par->in,
195 par->out, &li, info->prefix); 194 par->out, &li, info->prefix);
196 return EBT_CONTINUE; 195 return EBT_CONTINUE;
197} 196}
diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c
index fc1905c51417..df0364aa12d5 100644
--- a/net/bridge/netfilter/ebt_ulog.c
+++ b/net/bridge/netfilter/ebt_ulog.c
@@ -131,14 +131,16 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
131 return skb; 131 return skb;
132} 132}
133 133
134static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, 134static void ebt_ulog_packet(struct net *net, unsigned int hooknr,
135 const struct net_device *in, const struct net_device *out, 135 const struct sk_buff *skb,
136 const struct ebt_ulog_info *uloginfo, const char *prefix) 136 const struct net_device *in,
137 const struct net_device *out,
138 const struct ebt_ulog_info *uloginfo,
139 const char *prefix)
137{ 140{
138 ebt_ulog_packet_msg_t *pm; 141 ebt_ulog_packet_msg_t *pm;
139 size_t size, copy_len; 142 size_t size, copy_len;
140 struct nlmsghdr *nlh; 143 struct nlmsghdr *nlh;
141 struct net *net = dev_net(in ? in : out);
142 struct ebt_ulog_net *ebt = ebt_ulog_pernet(net); 144 struct ebt_ulog_net *ebt = ebt_ulog_pernet(net);
143 unsigned int group = uloginfo->nlgroup; 145 unsigned int group = uloginfo->nlgroup;
144 ebt_ulog_buff_t *ub = &ebt->ulog_buffers[group]; 146 ebt_ulog_buff_t *ub = &ebt->ulog_buffers[group];
@@ -233,7 +235,7 @@ unlock:
233} 235}
234 236
235/* this function is registered with the netfilter core */ 237/* this function is registered with the netfilter core */
236static void ebt_log_packet(u_int8_t pf, unsigned int hooknum, 238static void ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
237 const struct sk_buff *skb, const struct net_device *in, 239 const struct sk_buff *skb, const struct net_device *in,
238 const struct net_device *out, const struct nf_loginfo *li, 240 const struct net_device *out, const struct nf_loginfo *li,
239 const char *prefix) 241 const char *prefix)
@@ -252,13 +254,15 @@ static void ebt_log_packet(u_int8_t pf, unsigned int hooknum,
252 strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); 254 strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
253 } 255 }
254 256
255 ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); 257 ebt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix);
256} 258}
257 259
258static unsigned int 260static unsigned int
259ebt_ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) 261ebt_ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
260{ 262{
261 ebt_ulog_packet(par->hooknum, skb, par->in, par->out, 263 struct net *net = dev_net(par->in ? par->in : par->out);
264
265 ebt_ulog_packet(net, par->hooknum, skb, par->in, par->out,
262 par->targinfo, NULL); 266 par->targinfo, NULL);
263 return EBT_CONTINUE; 267 return EBT_CONTINUE;
264} 268}
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index e87ef435e11b..958d9856912c 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
11 crypto.o armor.o \ 11 crypto.o armor.o \
12 auth_x.o \ 12 auth_x.o \
13 ceph_fs.o ceph_strings.o ceph_hash.o \ 13 ceph_fs.o ceph_strings.o ceph_hash.o \
14 pagevec.o 14 pagevec.o snapshot.o
15 15
diff --git a/net/ceph/auth.c b/net/ceph/auth.c
index b4bf4ac090f1..6b923bcaa2a4 100644
--- a/net/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -47,6 +47,7 @@ struct ceph_auth_client *ceph_auth_init(const char *name, const struct ceph_cryp
47 if (!ac) 47 if (!ac)
48 goto out; 48 goto out;
49 49
50 mutex_init(&ac->mutex);
50 ac->negotiating = true; 51 ac->negotiating = true;
51 if (name) 52 if (name)
52 ac->name = name; 53 ac->name = name;
@@ -73,10 +74,12 @@ void ceph_auth_destroy(struct ceph_auth_client *ac)
73 */ 74 */
74void ceph_auth_reset(struct ceph_auth_client *ac) 75void ceph_auth_reset(struct ceph_auth_client *ac)
75{ 76{
77 mutex_lock(&ac->mutex);
76 dout("auth_reset %p\n", ac); 78 dout("auth_reset %p\n", ac);
77 if (ac->ops && !ac->negotiating) 79 if (ac->ops && !ac->negotiating)
78 ac->ops->reset(ac); 80 ac->ops->reset(ac);
79 ac->negotiating = true; 81 ac->negotiating = true;
82 mutex_unlock(&ac->mutex);
80} 83}
81 84
82int ceph_entity_name_encode(const char *name, void **p, void *end) 85int ceph_entity_name_encode(const char *name, void **p, void *end)
@@ -102,6 +105,7 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
102 int i, num; 105 int i, num;
103 int ret; 106 int ret;
104 107
108 mutex_lock(&ac->mutex);
105 dout("auth_build_hello\n"); 109 dout("auth_build_hello\n");
106 monhdr->have_version = 0; 110 monhdr->have_version = 0;
107 monhdr->session_mon = cpu_to_le16(-1); 111 monhdr->session_mon = cpu_to_le16(-1);
@@ -122,15 +126,19 @@ int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
122 126
123 ret = ceph_entity_name_encode(ac->name, &p, end); 127 ret = ceph_entity_name_encode(ac->name, &p, end);
124 if (ret < 0) 128 if (ret < 0)
125 return ret; 129 goto out;
126 ceph_decode_need(&p, end, sizeof(u64), bad); 130 ceph_decode_need(&p, end, sizeof(u64), bad);
127 ceph_encode_64(&p, ac->global_id); 131 ceph_encode_64(&p, ac->global_id);
128 132
129 ceph_encode_32(&lenp, p - lenp - sizeof(u32)); 133 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
130 return p - buf; 134 ret = p - buf;
135out:
136 mutex_unlock(&ac->mutex);
137 return ret;
131 138
132bad: 139bad:
133 return -ERANGE; 140 ret = -ERANGE;
141 goto out;
134} 142}
135 143
136static int ceph_build_auth_request(struct ceph_auth_client *ac, 144static int ceph_build_auth_request(struct ceph_auth_client *ac,
@@ -151,11 +159,13 @@ static int ceph_build_auth_request(struct ceph_auth_client *ac,
151 if (ret < 0) { 159 if (ret < 0) {
152 pr_err("error %d building auth method %s request\n", ret, 160 pr_err("error %d building auth method %s request\n", ret,
153 ac->ops->name); 161 ac->ops->name);
154 return ret; 162 goto out;
155 } 163 }
156 dout(" built request %d bytes\n", ret); 164 dout(" built request %d bytes\n", ret);
157 ceph_encode_32(&p, ret); 165 ceph_encode_32(&p, ret);
158 return p + ret - msg_buf; 166 ret = p + ret - msg_buf;
167out:
168 return ret;
159} 169}
160 170
161/* 171/*
@@ -176,6 +186,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
176 int result_msg_len; 186 int result_msg_len;
177 int ret = -EINVAL; 187 int ret = -EINVAL;
178 188
189 mutex_lock(&ac->mutex);
179 dout("handle_auth_reply %p %p\n", p, end); 190 dout("handle_auth_reply %p %p\n", p, end);
180 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad); 191 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
181 protocol = ceph_decode_32(&p); 192 protocol = ceph_decode_32(&p);
@@ -227,33 +238,103 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
227 238
228 ret = ac->ops->handle_reply(ac, result, payload, payload_end); 239 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
229 if (ret == -EAGAIN) { 240 if (ret == -EAGAIN) {
230 return ceph_build_auth_request(ac, reply_buf, reply_len); 241 ret = ceph_build_auth_request(ac, reply_buf, reply_len);
231 } else if (ret) { 242 } else if (ret) {
232 pr_err("auth method '%s' error %d\n", ac->ops->name, ret); 243 pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
233 return ret;
234 } 244 }
235 return 0;
236 245
237bad:
238 pr_err("failed to decode auth msg\n");
239out: 246out:
247 mutex_unlock(&ac->mutex);
240 return ret; 248 return ret;
249
250bad:
251 pr_err("failed to decode auth msg\n");
252 ret = -EINVAL;
253 goto out;
241} 254}
242 255
243int ceph_build_auth(struct ceph_auth_client *ac, 256int ceph_build_auth(struct ceph_auth_client *ac,
244 void *msg_buf, size_t msg_len) 257 void *msg_buf, size_t msg_len)
245{ 258{
259 int ret = 0;
260
261 mutex_lock(&ac->mutex);
246 if (!ac->protocol) 262 if (!ac->protocol)
247 return ceph_auth_build_hello(ac, msg_buf, msg_len); 263 ret = ceph_auth_build_hello(ac, msg_buf, msg_len);
248 BUG_ON(!ac->ops); 264 else if (ac->ops->should_authenticate(ac))
249 if (ac->ops->should_authenticate(ac)) 265 ret = ceph_build_auth_request(ac, msg_buf, msg_len);
250 return ceph_build_auth_request(ac, msg_buf, msg_len); 266 mutex_unlock(&ac->mutex);
251 return 0; 267 return ret;
252} 268}
253 269
254int ceph_auth_is_authenticated(struct ceph_auth_client *ac) 270int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
255{ 271{
256 if (!ac->ops) 272 int ret = 0;
257 return 0; 273
258 return ac->ops->is_authenticated(ac); 274 mutex_lock(&ac->mutex);
275 if (ac->ops)
276 ret = ac->ops->is_authenticated(ac);
277 mutex_unlock(&ac->mutex);
278 return ret;
279}
280EXPORT_SYMBOL(ceph_auth_is_authenticated);
281
282int ceph_auth_create_authorizer(struct ceph_auth_client *ac,
283 int peer_type,
284 struct ceph_auth_handshake *auth)
285{
286 int ret = 0;
287
288 mutex_lock(&ac->mutex);
289 if (ac->ops && ac->ops->create_authorizer)
290 ret = ac->ops->create_authorizer(ac, peer_type, auth);
291 mutex_unlock(&ac->mutex);
292 return ret;
293}
294EXPORT_SYMBOL(ceph_auth_create_authorizer);
295
296void ceph_auth_destroy_authorizer(struct ceph_auth_client *ac,
297 struct ceph_authorizer *a)
298{
299 mutex_lock(&ac->mutex);
300 if (ac->ops && ac->ops->destroy_authorizer)
301 ac->ops->destroy_authorizer(ac, a);
302 mutex_unlock(&ac->mutex);
303}
304EXPORT_SYMBOL(ceph_auth_destroy_authorizer);
305
306int ceph_auth_update_authorizer(struct ceph_auth_client *ac,
307 int peer_type,
308 struct ceph_auth_handshake *a)
309{
310 int ret = 0;
311
312 mutex_lock(&ac->mutex);
313 if (ac->ops && ac->ops->update_authorizer)
314 ret = ac->ops->update_authorizer(ac, peer_type, a);
315 mutex_unlock(&ac->mutex);
316 return ret;
317}
318EXPORT_SYMBOL(ceph_auth_update_authorizer);
319
320int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac,
321 struct ceph_authorizer *a, size_t len)
322{
323 int ret = 0;
324
325 mutex_lock(&ac->mutex);
326 if (ac->ops && ac->ops->verify_authorizer_reply)
327 ret = ac->ops->verify_authorizer_reply(ac, a, len);
328 mutex_unlock(&ac->mutex);
329 return ret;
330}
331EXPORT_SYMBOL(ceph_auth_verify_authorizer_reply);
332
333void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, int peer_type)
334{
335 mutex_lock(&ac->mutex);
336 if (ac->ops && ac->ops->invalidate_authorizer)
337 ac->ops->invalidate_authorizer(ac, peer_type);
338 mutex_unlock(&ac->mutex);
259} 339}
340EXPORT_SYMBOL(ceph_auth_invalidate_authorizer);
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index a16bf14eb027..96238ba95f2b 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -298,6 +298,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
298 return -ENOMEM; 298 return -ENOMEM;
299 } 299 }
300 au->service = th->service; 300 au->service = th->service;
301 au->secret_id = th->secret_id;
301 302
302 msg_a = au->buf->vec.iov_base; 303 msg_a = au->buf->vec.iov_base;
303 msg_a->struct_v = 1; 304 msg_a->struct_v = 1;
@@ -555,6 +556,26 @@ static int ceph_x_create_authorizer(
555 return 0; 556 return 0;
556} 557}
557 558
559static int ceph_x_update_authorizer(
560 struct ceph_auth_client *ac, int peer_type,
561 struct ceph_auth_handshake *auth)
562{
563 struct ceph_x_authorizer *au;
564 struct ceph_x_ticket_handler *th;
565
566 th = get_ticket_handler(ac, peer_type);
567 if (IS_ERR(th))
568 return PTR_ERR(th);
569
570 au = (struct ceph_x_authorizer *)auth->authorizer;
571 if (au->secret_id < th->secret_id) {
572 dout("ceph_x_update_authorizer service %u secret %llu < %llu\n",
573 au->service, au->secret_id, th->secret_id);
574 return ceph_x_build_authorizer(ac, th, au);
575 }
576 return 0;
577}
578
558static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, 579static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
559 struct ceph_authorizer *a, size_t len) 580 struct ceph_authorizer *a, size_t len)
560{ 581{
@@ -630,7 +651,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
630 651
631 th = get_ticket_handler(ac, peer_type); 652 th = get_ticket_handler(ac, peer_type);
632 if (!IS_ERR(th)) 653 if (!IS_ERR(th))
633 remove_ticket_handler(ac, th); 654 memset(&th->validity, 0, sizeof(th->validity));
634} 655}
635 656
636 657
@@ -641,6 +662,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = {
641 .build_request = ceph_x_build_request, 662 .build_request = ceph_x_build_request,
642 .handle_reply = ceph_x_handle_reply, 663 .handle_reply = ceph_x_handle_reply,
643 .create_authorizer = ceph_x_create_authorizer, 664 .create_authorizer = ceph_x_create_authorizer,
665 .update_authorizer = ceph_x_update_authorizer,
644 .verify_authorizer_reply = ceph_x_verify_authorizer_reply, 666 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
645 .destroy_authorizer = ceph_x_destroy_authorizer, 667 .destroy_authorizer = ceph_x_destroy_authorizer,
646 .invalidate_authorizer = ceph_x_invalidate_authorizer, 668 .invalidate_authorizer = ceph_x_invalidate_authorizer,
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index f459e93b774f..c5a058da7ac8 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -29,6 +29,7 @@ struct ceph_x_authorizer {
29 struct ceph_buffer *buf; 29 struct ceph_buffer *buf;
30 unsigned int service; 30 unsigned int service;
31 u64 nonce; 31 u64 nonce;
32 u64 secret_id;
32 char reply_buf[128]; /* big enough for encrypted blob */ 33 char reply_buf[128]; /* big enough for encrypted blob */
33}; 34};
34 35
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index e65e6e4be38b..34b11ee8124e 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -606,11 +606,17 @@ static int __init init_ceph_lib(void)
606 if (ret < 0) 606 if (ret < 0)
607 goto out_crypto; 607 goto out_crypto;
608 608
609 ret = ceph_osdc_setup();
610 if (ret < 0)
611 goto out_msgr;
612
609 pr_info("loaded (mon/osd proto %d/%d)\n", 613 pr_info("loaded (mon/osd proto %d/%d)\n",
610 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL); 614 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);
611 615
612 return 0; 616 return 0;
613 617
618out_msgr:
619 ceph_msgr_exit();
614out_crypto: 620out_crypto:
615 ceph_crypto_shutdown(); 621 ceph_crypto_shutdown();
616out_debugfs: 622out_debugfs:
@@ -622,6 +628,7 @@ out:
622static void __exit exit_ceph_lib(void) 628static void __exit exit_ceph_lib(void)
623{ 629{
624 dout("exit_ceph_lib\n"); 630 dout("exit_ceph_lib\n");
631 ceph_osdc_cleanup();
625 ceph_msgr_exit(); 632 ceph_msgr_exit();
626 ceph_crypto_shutdown(); 633 ceph_crypto_shutdown();
627 ceph_debugfs_cleanup(); 634 ceph_debugfs_cleanup();
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 00d051f4894e..83661cdc0766 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -123,8 +123,8 @@ static int osdc_show(struct seq_file *s, void *pp)
123 mutex_lock(&osdc->request_mutex); 123 mutex_lock(&osdc->request_mutex);
124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { 124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
125 struct ceph_osd_request *req; 125 struct ceph_osd_request *req;
126 unsigned int i;
126 int opcode; 127 int opcode;
127 int i;
128 128
129 req = rb_entry(p, struct ceph_osd_request, r_node); 129 req = rb_entry(p, struct ceph_osd_request, r_node);
130 130
@@ -142,7 +142,7 @@ static int osdc_show(struct seq_file *s, void *pp)
142 seq_printf(s, "\t"); 142 seq_printf(s, "\t");
143 143
144 for (i = 0; i < req->r_num_ops; i++) { 144 for (i = 0; i < req->r_num_ops; i++) {
145 opcode = le16_to_cpu(req->r_request_ops[i].op); 145 opcode = req->r_ops[i].op;
146 seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); 146 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
147 } 147 }
148 148
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 2c0669fb54e3..eb0a46a49bd4 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -21,6 +21,9 @@
21#include <linux/ceph/pagelist.h> 21#include <linux/ceph/pagelist.h>
22#include <linux/export.h> 22#include <linux/export.h>
23 23
24#define list_entry_next(pos, member) \
25 list_entry(pos->member.next, typeof(*pos), member)
26
24/* 27/*
25 * Ceph uses the messenger to exchange ceph_msg messages with other 28 * Ceph uses the messenger to exchange ceph_msg messages with other
26 * hosts in the system. The messenger provides ordered and reliable 29 * hosts in the system. The messenger provides ordered and reliable
@@ -149,6 +152,11 @@ static bool con_flag_test_and_set(struct ceph_connection *con,
149 return test_and_set_bit(con_flag, &con->flags); 152 return test_and_set_bit(con_flag, &con->flags);
150} 153}
151 154
155/* Slab caches for frequently-allocated structures */
156
157static struct kmem_cache *ceph_msg_cache;
158static struct kmem_cache *ceph_msg_data_cache;
159
152/* static tag bytes (protocol control messages) */ 160/* static tag bytes (protocol control messages) */
153static char tag_msg = CEPH_MSGR_TAG_MSG; 161static char tag_msg = CEPH_MSGR_TAG_MSG;
154static char tag_ack = CEPH_MSGR_TAG_ACK; 162static char tag_ack = CEPH_MSGR_TAG_ACK;
@@ -223,6 +231,41 @@ static void encode_my_addr(struct ceph_messenger *msgr)
223 */ 231 */
224static struct workqueue_struct *ceph_msgr_wq; 232static struct workqueue_struct *ceph_msgr_wq;
225 233
234static int ceph_msgr_slab_init(void)
235{
236 BUG_ON(ceph_msg_cache);
237 ceph_msg_cache = kmem_cache_create("ceph_msg",
238 sizeof (struct ceph_msg),
239 __alignof__(struct ceph_msg), 0, NULL);
240
241 if (!ceph_msg_cache)
242 return -ENOMEM;
243
244 BUG_ON(ceph_msg_data_cache);
245 ceph_msg_data_cache = kmem_cache_create("ceph_msg_data",
246 sizeof (struct ceph_msg_data),
247 __alignof__(struct ceph_msg_data),
248 0, NULL);
249 if (ceph_msg_data_cache)
250 return 0;
251
252 kmem_cache_destroy(ceph_msg_cache);
253 ceph_msg_cache = NULL;
254
255 return -ENOMEM;
256}
257
258static void ceph_msgr_slab_exit(void)
259{
260 BUG_ON(!ceph_msg_data_cache);
261 kmem_cache_destroy(ceph_msg_data_cache);
262 ceph_msg_data_cache = NULL;
263
264 BUG_ON(!ceph_msg_cache);
265 kmem_cache_destroy(ceph_msg_cache);
266 ceph_msg_cache = NULL;
267}
268
226static void _ceph_msgr_exit(void) 269static void _ceph_msgr_exit(void)
227{ 270{
228 if (ceph_msgr_wq) { 271 if (ceph_msgr_wq) {
@@ -230,6 +273,8 @@ static void _ceph_msgr_exit(void)
230 ceph_msgr_wq = NULL; 273 ceph_msgr_wq = NULL;
231 } 274 }
232 275
276 ceph_msgr_slab_exit();
277
233 BUG_ON(zero_page == NULL); 278 BUG_ON(zero_page == NULL);
234 kunmap(zero_page); 279 kunmap(zero_page);
235 page_cache_release(zero_page); 280 page_cache_release(zero_page);
@@ -242,6 +287,9 @@ int ceph_msgr_init(void)
242 zero_page = ZERO_PAGE(0); 287 zero_page = ZERO_PAGE(0);
243 page_cache_get(zero_page); 288 page_cache_get(zero_page);
244 289
290 if (ceph_msgr_slab_init())
291 return -ENOMEM;
292
245 ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0); 293 ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0);
246 if (ceph_msgr_wq) 294 if (ceph_msgr_wq)
247 return 0; 295 return 0;
@@ -471,6 +519,22 @@ static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
471 return r; 519 return r;
472} 520}
473 521
522static int ceph_tcp_recvpage(struct socket *sock, struct page *page,
523 int page_offset, size_t length)
524{
525 void *kaddr;
526 int ret;
527
528 BUG_ON(page_offset + length > PAGE_SIZE);
529
530 kaddr = kmap(page);
531 BUG_ON(!kaddr);
532 ret = ceph_tcp_recvmsg(sock, kaddr + page_offset, length);
533 kunmap(page);
534
535 return ret;
536}
537
474/* 538/*
475 * write something. @more is true if caller will be sending more data 539 * write something. @more is true if caller will be sending more data
476 * shortly. 540 * shortly.
@@ -493,7 +557,7 @@ static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
493} 557}
494 558
495static int ceph_tcp_sendpage(struct socket *sock, struct page *page, 559static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
496 int offset, size_t size, int more) 560 int offset, size_t size, bool more)
497{ 561{
498 int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR); 562 int flags = MSG_DONTWAIT | MSG_NOSIGNAL | (more ? MSG_MORE : MSG_EOR);
499 int ret; 563 int ret;
@@ -697,50 +761,397 @@ static void con_out_kvec_add(struct ceph_connection *con,
697} 761}
698 762
699#ifdef CONFIG_BLOCK 763#ifdef CONFIG_BLOCK
700static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) 764
765/*
766 * For a bio data item, a piece is whatever remains of the next
767 * entry in the current bio iovec, or the first entry in the next
768 * bio in the list.
769 */
770static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor,
771 size_t length)
701{ 772{
702 if (!bio) { 773 struct ceph_msg_data *data = cursor->data;
703 *iter = NULL; 774 struct bio *bio;
704 *seg = 0; 775
705 return; 776 BUG_ON(data->type != CEPH_MSG_DATA_BIO);
777
778 bio = data->bio;
779 BUG_ON(!bio);
780 BUG_ON(!bio->bi_vcnt);
781
782 cursor->resid = min(length, data->bio_length);
783 cursor->bio = bio;
784 cursor->vector_index = 0;
785 cursor->vector_offset = 0;
786 cursor->last_piece = length <= bio->bi_io_vec[0].bv_len;
787}
788
789static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor,
790 size_t *page_offset,
791 size_t *length)
792{
793 struct ceph_msg_data *data = cursor->data;
794 struct bio *bio;
795 struct bio_vec *bio_vec;
796 unsigned int index;
797
798 BUG_ON(data->type != CEPH_MSG_DATA_BIO);
799
800 bio = cursor->bio;
801 BUG_ON(!bio);
802
803 index = cursor->vector_index;
804 BUG_ON(index >= (unsigned int) bio->bi_vcnt);
805
806 bio_vec = &bio->bi_io_vec[index];
807 BUG_ON(cursor->vector_offset >= bio_vec->bv_len);
808 *page_offset = (size_t) (bio_vec->bv_offset + cursor->vector_offset);
809 BUG_ON(*page_offset >= PAGE_SIZE);
810 if (cursor->last_piece) /* pagelist offset is always 0 */
811 *length = cursor->resid;
812 else
813 *length = (size_t) (bio_vec->bv_len - cursor->vector_offset);
814 BUG_ON(*length > cursor->resid);
815 BUG_ON(*page_offset + *length > PAGE_SIZE);
816
817 return bio_vec->bv_page;
818}
819
820static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor,
821 size_t bytes)
822{
823 struct bio *bio;
824 struct bio_vec *bio_vec;
825 unsigned int index;
826
827 BUG_ON(cursor->data->type != CEPH_MSG_DATA_BIO);
828
829 bio = cursor->bio;
830 BUG_ON(!bio);
831
832 index = cursor->vector_index;
833 BUG_ON(index >= (unsigned int) bio->bi_vcnt);
834 bio_vec = &bio->bi_io_vec[index];
835
836 /* Advance the cursor offset */
837
838 BUG_ON(cursor->resid < bytes);
839 cursor->resid -= bytes;
840 cursor->vector_offset += bytes;
841 if (cursor->vector_offset < bio_vec->bv_len)
842 return false; /* more bytes to process in this segment */
843 BUG_ON(cursor->vector_offset != bio_vec->bv_len);
844
845 /* Move on to the next segment, and possibly the next bio */
846
847 if (++index == (unsigned int) bio->bi_vcnt) {
848 bio = bio->bi_next;
849 index = 0;
706 } 850 }
707 *iter = bio; 851 cursor->bio = bio;
708 *seg = bio->bi_idx; 852 cursor->vector_index = index;
853 cursor->vector_offset = 0;
854
855 if (!cursor->last_piece) {
856 BUG_ON(!cursor->resid);
857 BUG_ON(!bio);
858 /* A short read is OK, so use <= rather than == */
859 if (cursor->resid <= bio->bi_io_vec[index].bv_len)
860 cursor->last_piece = true;
861 }
862
863 return true;
709} 864}
865#endif /* CONFIG_BLOCK */
710 866
711static void iter_bio_next(struct bio **bio_iter, int *seg) 867/*
868 * For a page array, a piece comes from the first page in the array
869 * that has not already been fully consumed.
870 */
871static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor,
872 size_t length)
712{ 873{
713 if (*bio_iter == NULL) 874 struct ceph_msg_data *data = cursor->data;
714 return; 875 int page_count;
876
877 BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
715 878
716 BUG_ON(*seg >= (*bio_iter)->bi_vcnt); 879 BUG_ON(!data->pages);
880 BUG_ON(!data->length);
717 881
718 (*seg)++; 882 cursor->resid = min(length, data->length);
719 if (*seg == (*bio_iter)->bi_vcnt) 883 page_count = calc_pages_for(data->alignment, (u64)data->length);
720 init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); 884 cursor->page_offset = data->alignment & ~PAGE_MASK;
885 cursor->page_index = 0;
886 BUG_ON(page_count > (int)USHRT_MAX);
887 cursor->page_count = (unsigned short)page_count;
888 BUG_ON(length > SIZE_MAX - cursor->page_offset);
889 cursor->last_piece = (size_t)cursor->page_offset + length <= PAGE_SIZE;
721} 890}
722#endif
723 891
724static void prepare_write_message_data(struct ceph_connection *con) 892static struct page *
893ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor,
894 size_t *page_offset, size_t *length)
725{ 895{
726 struct ceph_msg *msg = con->out_msg; 896 struct ceph_msg_data *data = cursor->data;
727 897
728 BUG_ON(!msg); 898 BUG_ON(data->type != CEPH_MSG_DATA_PAGES);
729 BUG_ON(!msg->hdr.data_len); 899
900 BUG_ON(cursor->page_index >= cursor->page_count);
901 BUG_ON(cursor->page_offset >= PAGE_SIZE);
902
903 *page_offset = cursor->page_offset;
904 if (cursor->last_piece)
905 *length = cursor->resid;
906 else
907 *length = PAGE_SIZE - *page_offset;
908
909 return data->pages[cursor->page_index];
910}
911
912static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor,
913 size_t bytes)
914{
915 BUG_ON(cursor->data->type != CEPH_MSG_DATA_PAGES);
916
917 BUG_ON(cursor->page_offset + bytes > PAGE_SIZE);
918
919 /* Advance the cursor page offset */
920
921 cursor->resid -= bytes;
922 cursor->page_offset = (cursor->page_offset + bytes) & ~PAGE_MASK;
923 if (!bytes || cursor->page_offset)
924 return false; /* more bytes to process in the current page */
925
926 /* Move on to the next page; offset is already at 0 */
927
928 BUG_ON(cursor->page_index >= cursor->page_count);
929 cursor->page_index++;
930 cursor->last_piece = cursor->resid <= PAGE_SIZE;
931
932 return true;
933}
934
935/*
936 * For a pagelist, a piece is whatever remains to be consumed in the
937 * first page in the list, or the front of the next page.
938 */
939static void
940ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor,
941 size_t length)
942{
943 struct ceph_msg_data *data = cursor->data;
944 struct ceph_pagelist *pagelist;
945 struct page *page;
946
947 BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
948
949 pagelist = data->pagelist;
950 BUG_ON(!pagelist);
951
952 if (!length)
953 return; /* pagelist can be assigned but empty */
954
955 BUG_ON(list_empty(&pagelist->head));
956 page = list_first_entry(&pagelist->head, struct page, lru);
957
958 cursor->resid = min(length, pagelist->length);
959 cursor->page = page;
960 cursor->offset = 0;
961 cursor->last_piece = cursor->resid <= PAGE_SIZE;
962}
963
964static struct page *
965ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor,
966 size_t *page_offset, size_t *length)
967{
968 struct ceph_msg_data *data = cursor->data;
969 struct ceph_pagelist *pagelist;
970
971 BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
730 972
731 /* initialize page iterator */ 973 pagelist = data->pagelist;
732 con->out_msg_pos.page = 0; 974 BUG_ON(!pagelist);
733 if (msg->pages) 975
734 con->out_msg_pos.page_pos = msg->page_alignment; 976 BUG_ON(!cursor->page);
977 BUG_ON(cursor->offset + cursor->resid != pagelist->length);
978
979 /* offset of first page in pagelist is always 0 */
980 *page_offset = cursor->offset & ~PAGE_MASK;
981 if (cursor->last_piece)
982 *length = cursor->resid;
735 else 983 else
736 con->out_msg_pos.page_pos = 0; 984 *length = PAGE_SIZE - *page_offset;
985
986 return cursor->page;
987}
988
989static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
990 size_t bytes)
991{
992 struct ceph_msg_data *data = cursor->data;
993 struct ceph_pagelist *pagelist;
994
995 BUG_ON(data->type != CEPH_MSG_DATA_PAGELIST);
996
997 pagelist = data->pagelist;
998 BUG_ON(!pagelist);
999
1000 BUG_ON(cursor->offset + cursor->resid != pagelist->length);
1001 BUG_ON((cursor->offset & ~PAGE_MASK) + bytes > PAGE_SIZE);
1002
1003 /* Advance the cursor offset */
1004
1005 cursor->resid -= bytes;
1006 cursor->offset += bytes;
1007 /* offset of first page in pagelist is always 0 */
1008 if (!bytes || cursor->offset & ~PAGE_MASK)
1009 return false; /* more bytes to process in the current page */
1010
1011 /* Move on to the next page */
1012
1013 BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
1014 cursor->page = list_entry_next(cursor->page, lru);
1015 cursor->last_piece = cursor->resid <= PAGE_SIZE;
1016
1017 return true;
1018}
1019
1020/*
1021 * Message data is handled (sent or received) in pieces, where each
1022 * piece resides on a single page. The network layer might not
1023 * consume an entire piece at once. A data item's cursor keeps
1024 * track of which piece is next to process and how much remains to
1025 * be processed in that piece. It also tracks whether the current
1026 * piece is the last one in the data item.
1027 */
1028static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
1029{
1030 size_t length = cursor->total_resid;
1031
1032 switch (cursor->data->type) {
1033 case CEPH_MSG_DATA_PAGELIST:
1034 ceph_msg_data_pagelist_cursor_init(cursor, length);
1035 break;
1036 case CEPH_MSG_DATA_PAGES:
1037 ceph_msg_data_pages_cursor_init(cursor, length);
1038 break;
737#ifdef CONFIG_BLOCK 1039#ifdef CONFIG_BLOCK
738 if (msg->bio) 1040 case CEPH_MSG_DATA_BIO:
739 init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); 1041 ceph_msg_data_bio_cursor_init(cursor, length);
740#endif 1042 break;
741 con->out_msg_pos.data_pos = 0; 1043#endif /* CONFIG_BLOCK */
742 con->out_msg_pos.did_page_crc = false; 1044 case CEPH_MSG_DATA_NONE:
743 con->out_more = 1; /* data + footer will follow */ 1045 default:
1046 /* BUG(); */
1047 break;
1048 }
1049 cursor->need_crc = true;
1050}
1051
1052static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
1053{
1054 struct ceph_msg_data_cursor *cursor = &msg->cursor;
1055 struct ceph_msg_data *data;
1056
1057 BUG_ON(!length);
1058 BUG_ON(length > msg->data_length);
1059 BUG_ON(list_empty(&msg->data));
1060
1061 cursor->data_head = &msg->data;
1062 cursor->total_resid = length;
1063 data = list_first_entry(&msg->data, struct ceph_msg_data, links);
1064 cursor->data = data;
1065
1066 __ceph_msg_data_cursor_init(cursor);
1067}
1068
1069/*
1070 * Return the page containing the next piece to process for a given
1071 * data item, and supply the page offset and length of that piece.
1072 * Indicate whether this is the last piece in this data item.
1073 */
1074static struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor,
1075 size_t *page_offset, size_t *length,
1076 bool *last_piece)
1077{
1078 struct page *page;
1079
1080 switch (cursor->data->type) {
1081 case CEPH_MSG_DATA_PAGELIST:
1082 page = ceph_msg_data_pagelist_next(cursor, page_offset, length);
1083 break;
1084 case CEPH_MSG_DATA_PAGES:
1085 page = ceph_msg_data_pages_next(cursor, page_offset, length);
1086 break;
1087#ifdef CONFIG_BLOCK
1088 case CEPH_MSG_DATA_BIO:
1089 page = ceph_msg_data_bio_next(cursor, page_offset, length);
1090 break;
1091#endif /* CONFIG_BLOCK */
1092 case CEPH_MSG_DATA_NONE:
1093 default:
1094 page = NULL;
1095 break;
1096 }
1097 BUG_ON(!page);
1098 BUG_ON(*page_offset + *length > PAGE_SIZE);
1099 BUG_ON(!*length);
1100 if (last_piece)
1101 *last_piece = cursor->last_piece;
1102
1103 return page;
1104}
1105
1106/*
1107 * Returns true if the result moves the cursor on to the next piece
1108 * of the data item.
1109 */
1110static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
1111 size_t bytes)
1112{
1113 bool new_piece;
1114
1115 BUG_ON(bytes > cursor->resid);
1116 switch (cursor->data->type) {
1117 case CEPH_MSG_DATA_PAGELIST:
1118 new_piece = ceph_msg_data_pagelist_advance(cursor, bytes);
1119 break;
1120 case CEPH_MSG_DATA_PAGES:
1121 new_piece = ceph_msg_data_pages_advance(cursor, bytes);
1122 break;
1123#ifdef CONFIG_BLOCK
1124 case CEPH_MSG_DATA_BIO:
1125 new_piece = ceph_msg_data_bio_advance(cursor, bytes);
1126 break;
1127#endif /* CONFIG_BLOCK */
1128 case CEPH_MSG_DATA_NONE:
1129 default:
1130 BUG();
1131 break;
1132 }
1133 cursor->total_resid -= bytes;
1134
1135 if (!cursor->resid && cursor->total_resid) {
1136 WARN_ON(!cursor->last_piece);
1137 BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
1138 cursor->data = list_entry_next(cursor->data, links);
1139 __ceph_msg_data_cursor_init(cursor);
1140 new_piece = true;
1141 }
1142 cursor->need_crc = new_piece;
1143
1144 return new_piece;
1145}
1146
1147static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
1148{
1149 BUG_ON(!msg);
1150 BUG_ON(!data_len);
1151
1152 /* Initialize data cursor */
1153
1154 ceph_msg_data_cursor_init(msg, (size_t)data_len);
744} 1155}
745 1156
746/* 1157/*
@@ -803,16 +1214,12 @@ static void prepare_write_message(struct ceph_connection *con)
803 m->hdr.seq = cpu_to_le64(++con->out_seq); 1214 m->hdr.seq = cpu_to_le64(++con->out_seq);
804 m->needs_out_seq = false; 1215 m->needs_out_seq = false;
805 } 1216 }
806#ifdef CONFIG_BLOCK 1217 WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
807 else
808 m->bio_iter = NULL;
809#endif
810 1218
811 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", 1219 dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
812 m, con->out_seq, le16_to_cpu(m->hdr.type), 1220 m, con->out_seq, le16_to_cpu(m->hdr.type),
813 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), 1221 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
814 le32_to_cpu(m->hdr.data_len), 1222 m->data_length);
815 m->nr_pages);
816 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); 1223 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
817 1224
818 /* tag + hdr + front + middle */ 1225 /* tag + hdr + front + middle */
@@ -843,11 +1250,13 @@ static void prepare_write_message(struct ceph_connection *con)
843 1250
844 /* is there a data payload? */ 1251 /* is there a data payload? */
845 con->out_msg->footer.data_crc = 0; 1252 con->out_msg->footer.data_crc = 0;
846 if (m->hdr.data_len) 1253 if (m->data_length) {
847 prepare_write_message_data(con); 1254 prepare_message_data(con->out_msg, m->data_length);
848 else 1255 con->out_more = 1; /* data + footer will follow */
1256 } else {
849 /* no, queue up footer too and be done */ 1257 /* no, queue up footer too and be done */
850 prepare_write_message_footer(con); 1258 prepare_write_message_footer(con);
1259 }
851 1260
852 con_flag_set(con, CON_FLAG_WRITE_PENDING); 1261 con_flag_set(con, CON_FLAG_WRITE_PENDING);
853} 1262}
@@ -874,6 +1283,24 @@ static void prepare_write_ack(struct ceph_connection *con)
874} 1283}
875 1284
876/* 1285/*
1286 * Prepare to share the seq during handshake
1287 */
1288static void prepare_write_seq(struct ceph_connection *con)
1289{
1290 dout("prepare_write_seq %p %llu -> %llu\n", con,
1291 con->in_seq_acked, con->in_seq);
1292 con->in_seq_acked = con->in_seq;
1293
1294 con_out_kvec_reset(con);
1295
1296 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
1297 con_out_kvec_add(con, sizeof (con->out_temp_ack),
1298 &con->out_temp_ack);
1299
1300 con_flag_set(con, CON_FLAG_WRITE_PENDING);
1301}
1302
1303/*
877 * Prepare to write keepalive byte. 1304 * Prepare to write keepalive byte.
878 */ 1305 */
879static void prepare_write_keepalive(struct ceph_connection *con) 1306static void prepare_write_keepalive(struct ceph_connection *con)
@@ -1022,35 +1449,19 @@ out:
1022 return ret; /* done! */ 1449 return ret; /* done! */
1023} 1450}
1024 1451
1025static void out_msg_pos_next(struct ceph_connection *con, struct page *page, 1452static u32 ceph_crc32c_page(u32 crc, struct page *page,
1026 size_t len, size_t sent, bool in_trail) 1453 unsigned int page_offset,
1454 unsigned int length)
1027{ 1455{
1028 struct ceph_msg *msg = con->out_msg; 1456 char *kaddr;
1029 1457
1030 BUG_ON(!msg); 1458 kaddr = kmap(page);
1031 BUG_ON(!sent); 1459 BUG_ON(kaddr == NULL);
1032 1460 crc = crc32c(crc, kaddr + page_offset, length);
1033 con->out_msg_pos.data_pos += sent; 1461 kunmap(page);
1034 con->out_msg_pos.page_pos += sent;
1035 if (sent < len)
1036 return;
1037 1462
1038 BUG_ON(sent != len); 1463 return crc;
1039 con->out_msg_pos.page_pos = 0;
1040 con->out_msg_pos.page++;
1041 con->out_msg_pos.did_page_crc = false;
1042 if (in_trail)
1043 list_move_tail(&page->lru,
1044 &msg->trail->head);
1045 else if (msg->pagelist)
1046 list_move_tail(&page->lru,
1047 &msg->pagelist->head);
1048#ifdef CONFIG_BLOCK
1049 else if (msg->bio)
1050 iter_bio_next(&msg->bio_iter, &msg->bio_seg);
1051#endif
1052} 1464}
1053
1054/* 1465/*
1055 * Write as much message data payload as we can. If we finish, queue 1466 * Write as much message data payload as we can. If we finish, queue
1056 * up the footer. 1467 * up the footer.
@@ -1058,21 +1469,17 @@ static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
1058 * 0 -> socket full, but more to do 1469 * 0 -> socket full, but more to do
1059 * <0 -> error 1470 * <0 -> error
1060 */ 1471 */
1061static int write_partial_msg_pages(struct ceph_connection *con) 1472static int write_partial_message_data(struct ceph_connection *con)
1062{ 1473{
1063 struct ceph_msg *msg = con->out_msg; 1474 struct ceph_msg *msg = con->out_msg;
1064 unsigned int data_len = le32_to_cpu(msg->hdr.data_len); 1475 struct ceph_msg_data_cursor *cursor = &msg->cursor;
1065 size_t len;
1066 bool do_datacrc = !con->msgr->nocrc; 1476 bool do_datacrc = !con->msgr->nocrc;
1067 int ret; 1477 u32 crc;
1068 int total_max_write;
1069 bool in_trail = false;
1070 const size_t trail_len = (msg->trail ? msg->trail->length : 0);
1071 const size_t trail_off = data_len - trail_len;
1072 1478
1073 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", 1479 dout("%s %p msg %p\n", __func__, con, msg);
1074 con, msg, con->out_msg_pos.page, msg->nr_pages, 1480
1075 con->out_msg_pos.page_pos); 1481 if (list_empty(&msg->data))
1482 return -EINVAL;
1076 1483
1077 /* 1484 /*
1078 * Iterate through each page that contains data to be 1485 * Iterate through each page that contains data to be
@@ -1082,72 +1489,41 @@ static int write_partial_msg_pages(struct ceph_connection *con)
1082 * need to map the page. If we have no pages, they have 1489 * need to map the page. If we have no pages, they have
1083 * been revoked, so use the zero page. 1490 * been revoked, so use the zero page.
1084 */ 1491 */
1085 while (data_len > con->out_msg_pos.data_pos) { 1492 crc = do_datacrc ? le32_to_cpu(msg->footer.data_crc) : 0;
1086 struct page *page = NULL; 1493 while (cursor->resid) {
1087 int max_write = PAGE_SIZE; 1494 struct page *page;
1088 int bio_offset = 0; 1495 size_t page_offset;
1089 1496 size_t length;
1090 in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off; 1497 bool last_piece;
1091 if (!in_trail) 1498 bool need_crc;
1092 total_max_write = trail_off - con->out_msg_pos.data_pos; 1499 int ret;
1093
1094 if (in_trail) {
1095 total_max_write = data_len - con->out_msg_pos.data_pos;
1096
1097 page = list_first_entry(&msg->trail->head,
1098 struct page, lru);
1099 } else if (msg->pages) {
1100 page = msg->pages[con->out_msg_pos.page];
1101 } else if (msg->pagelist) {
1102 page = list_first_entry(&msg->pagelist->head,
1103 struct page, lru);
1104#ifdef CONFIG_BLOCK
1105 } else if (msg->bio) {
1106 struct bio_vec *bv;
1107 1500
1108 bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg); 1501 page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
1109 page = bv->bv_page; 1502 &last_piece);
1110 bio_offset = bv->bv_offset; 1503 ret = ceph_tcp_sendpage(con->sock, page, page_offset,
1111 max_write = bv->bv_len; 1504 length, last_piece);
1112#endif 1505 if (ret <= 0) {
1113 } else { 1506 if (do_datacrc)
1114 page = zero_page; 1507 msg->footer.data_crc = cpu_to_le32(crc);
1115 }
1116 len = min_t(int, max_write - con->out_msg_pos.page_pos,
1117 total_max_write);
1118
1119 if (do_datacrc && !con->out_msg_pos.did_page_crc) {
1120 void *base;
1121 u32 crc = le32_to_cpu(msg->footer.data_crc);
1122 char *kaddr;
1123
1124 kaddr = kmap(page);
1125 BUG_ON(kaddr == NULL);
1126 base = kaddr + con->out_msg_pos.page_pos + bio_offset;
1127 crc = crc32c(crc, base, len);
1128 kunmap(page);
1129 msg->footer.data_crc = cpu_to_le32(crc);
1130 con->out_msg_pos.did_page_crc = true;
1131 }
1132 ret = ceph_tcp_sendpage(con->sock, page,
1133 con->out_msg_pos.page_pos + bio_offset,
1134 len, 1);
1135 if (ret <= 0)
1136 goto out;
1137 1508
1138 out_msg_pos_next(con, page, len, (size_t) ret, in_trail); 1509 return ret;
1510 }
1511 if (do_datacrc && cursor->need_crc)
1512 crc = ceph_crc32c_page(crc, page, page_offset, length);
1513 need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret);
1139 } 1514 }
1140 1515
1141 dout("write_partial_msg_pages %p msg %p done\n", con, msg); 1516 dout("%s %p msg %p done\n", __func__, con, msg);
1142 1517
1143 /* prepare and queue up footer, too */ 1518 /* prepare and queue up footer, too */
1144 if (!do_datacrc) 1519 if (do_datacrc)
1520 msg->footer.data_crc = cpu_to_le32(crc);
1521 else
1145 msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; 1522 msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
1146 con_out_kvec_reset(con); 1523 con_out_kvec_reset(con);
1147 prepare_write_message_footer(con); 1524 prepare_write_message_footer(con);
1148 ret = 1; 1525
1149out: 1526 return 1; /* must return > 0 to indicate success */
1150 return ret;
1151} 1527}
1152 1528
1153/* 1529/*
@@ -1160,7 +1536,7 @@ static int write_partial_skip(struct ceph_connection *con)
1160 while (con->out_skip > 0) { 1536 while (con->out_skip > 0) {
1161 size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); 1537 size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
1162 1538
1163 ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, 1); 1539 ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true);
1164 if (ret <= 0) 1540 if (ret <= 0)
1165 goto out; 1541 goto out;
1166 con->out_skip -= ret; 1542 con->out_skip -= ret;
@@ -1191,6 +1567,13 @@ static void prepare_read_ack(struct ceph_connection *con)
1191 con->in_base_pos = 0; 1567 con->in_base_pos = 0;
1192} 1568}
1193 1569
1570static void prepare_read_seq(struct ceph_connection *con)
1571{
1572 dout("prepare_read_seq %p\n", con);
1573 con->in_base_pos = 0;
1574 con->in_tag = CEPH_MSGR_TAG_SEQ;
1575}
1576
1194static void prepare_read_tag(struct ceph_connection *con) 1577static void prepare_read_tag(struct ceph_connection *con)
1195{ 1578{
1196 dout("prepare_read_tag %p\n", con); 1579 dout("prepare_read_tag %p\n", con);
@@ -1597,7 +1980,6 @@ static int process_connect(struct ceph_connection *con)
1597 con->error_msg = "connect authorization failure"; 1980 con->error_msg = "connect authorization failure";
1598 return -1; 1981 return -1;
1599 } 1982 }
1600 con->auth_retry = 1;
1601 con_out_kvec_reset(con); 1983 con_out_kvec_reset(con);
1602 ret = prepare_write_connect(con); 1984 ret = prepare_write_connect(con);
1603 if (ret < 0) 1985 if (ret < 0)
@@ -1668,6 +2050,7 @@ static int process_connect(struct ceph_connection *con)
1668 prepare_read_connect(con); 2050 prepare_read_connect(con);
1669 break; 2051 break;
1670 2052
2053 case CEPH_MSGR_TAG_SEQ:
1671 case CEPH_MSGR_TAG_READY: 2054 case CEPH_MSGR_TAG_READY:
1672 if (req_feat & ~server_feat) { 2055 if (req_feat & ~server_feat) {
1673 pr_err("%s%lld %s protocol feature mismatch," 2056 pr_err("%s%lld %s protocol feature mismatch,"
@@ -1682,7 +2065,7 @@ static int process_connect(struct ceph_connection *con)
1682 2065
1683 WARN_ON(con->state != CON_STATE_NEGOTIATING); 2066 WARN_ON(con->state != CON_STATE_NEGOTIATING);
1684 con->state = CON_STATE_OPEN; 2067 con->state = CON_STATE_OPEN;
1685 2068 con->auth_retry = 0; /* we authenticated; clear flag */
1686 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 2069 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1687 con->connect_seq++; 2070 con->connect_seq++;
1688 con->peer_features = server_feat; 2071 con->peer_features = server_feat;
@@ -1698,7 +2081,12 @@ static int process_connect(struct ceph_connection *con)
1698 2081
1699 con->delay = 0; /* reset backoff memory */ 2082 con->delay = 0; /* reset backoff memory */
1700 2083
1701 prepare_read_tag(con); 2084 if (con->in_reply.tag == CEPH_MSGR_TAG_SEQ) {
2085 prepare_write_seq(con);
2086 prepare_read_seq(con);
2087 } else {
2088 prepare_read_tag(con);
2089 }
1702 break; 2090 break;
1703 2091
1704 case CEPH_MSGR_TAG_WAIT: 2092 case CEPH_MSGR_TAG_WAIT:
@@ -1732,7 +2120,6 @@ static int read_partial_ack(struct ceph_connection *con)
1732 return read_partial(con, end, size, &con->in_temp_ack); 2120 return read_partial(con, end, size, &con->in_temp_ack);
1733} 2121}
1734 2122
1735
1736/* 2123/*
1737 * We can finally discard anything that's been acked. 2124 * We can finally discard anything that's been acked.
1738 */ 2125 */
@@ -1757,8 +2144,6 @@ static void process_ack(struct ceph_connection *con)
1757} 2144}
1758 2145
1759 2146
1760
1761
1762static int read_partial_message_section(struct ceph_connection *con, 2147static int read_partial_message_section(struct ceph_connection *con,
1763 struct kvec *section, 2148 struct kvec *section,
1764 unsigned int sec_len, u32 *crc) 2149 unsigned int sec_len, u32 *crc)
@@ -1782,77 +2167,49 @@ static int read_partial_message_section(struct ceph_connection *con,
1782 return 1; 2167 return 1;
1783} 2168}
1784 2169
1785static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip); 2170static int read_partial_msg_data(struct ceph_connection *con)
1786
1787static int read_partial_message_pages(struct ceph_connection *con,
1788 struct page **pages,
1789 unsigned int data_len, bool do_datacrc)
1790{ 2171{
1791 void *p; 2172 struct ceph_msg *msg = con->in_msg;
2173 struct ceph_msg_data_cursor *cursor = &msg->cursor;
2174 const bool do_datacrc = !con->msgr->nocrc;
2175 struct page *page;
2176 size_t page_offset;
2177 size_t length;
2178 u32 crc = 0;
1792 int ret; 2179 int ret;
1793 int left;
1794 2180
1795 left = min((int)(data_len - con->in_msg_pos.data_pos), 2181 BUG_ON(!msg);
1796 (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); 2182 if (list_empty(&msg->data))
1797 /* (page) data */ 2183 return -EIO;
1798 BUG_ON(pages == NULL);
1799 p = kmap(pages[con->in_msg_pos.page]);
1800 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1801 left);
1802 if (ret > 0 && do_datacrc)
1803 con->in_data_crc =
1804 crc32c(con->in_data_crc,
1805 p + con->in_msg_pos.page_pos, ret);
1806 kunmap(pages[con->in_msg_pos.page]);
1807 if (ret <= 0)
1808 return ret;
1809 con->in_msg_pos.data_pos += ret;
1810 con->in_msg_pos.page_pos += ret;
1811 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1812 con->in_msg_pos.page_pos = 0;
1813 con->in_msg_pos.page++;
1814 }
1815
1816 return ret;
1817}
1818
1819#ifdef CONFIG_BLOCK
1820static int read_partial_message_bio(struct ceph_connection *con,
1821 struct bio **bio_iter, int *bio_seg,
1822 unsigned int data_len, bool do_datacrc)
1823{
1824 struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
1825 void *p;
1826 int ret, left;
1827 2184
1828 left = min((int)(data_len - con->in_msg_pos.data_pos), 2185 if (do_datacrc)
1829 (int)(bv->bv_len - con->in_msg_pos.page_pos)); 2186 crc = con->in_data_crc;
2187 while (cursor->resid) {
2188 page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
2189 NULL);
2190 ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
2191 if (ret <= 0) {
2192 if (do_datacrc)
2193 con->in_data_crc = crc;
1830 2194
1831 p = kmap(bv->bv_page) + bv->bv_offset; 2195 return ret;
2196 }
1832 2197
1833 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, 2198 if (do_datacrc)
1834 left); 2199 crc = ceph_crc32c_page(crc, page, page_offset, ret);
1835 if (ret > 0 && do_datacrc) 2200 (void) ceph_msg_data_advance(&msg->cursor, (size_t)ret);
1836 con->in_data_crc =
1837 crc32c(con->in_data_crc,
1838 p + con->in_msg_pos.page_pos, ret);
1839 kunmap(bv->bv_page);
1840 if (ret <= 0)
1841 return ret;
1842 con->in_msg_pos.data_pos += ret;
1843 con->in_msg_pos.page_pos += ret;
1844 if (con->in_msg_pos.page_pos == bv->bv_len) {
1845 con->in_msg_pos.page_pos = 0;
1846 iter_bio_next(bio_iter, bio_seg);
1847 } 2201 }
2202 if (do_datacrc)
2203 con->in_data_crc = crc;
1848 2204
1849 return ret; 2205 return 1; /* must return > 0 to indicate success */
1850} 2206}
1851#endif
1852 2207
1853/* 2208/*
1854 * read (part of) a message. 2209 * read (part of) a message.
1855 */ 2210 */
2211static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
2212
1856static int read_partial_message(struct ceph_connection *con) 2213static int read_partial_message(struct ceph_connection *con)
1857{ 2214{
1858 struct ceph_msg *m = con->in_msg; 2215 struct ceph_msg *m = con->in_msg;
@@ -1885,7 +2242,7 @@ static int read_partial_message(struct ceph_connection *con)
1885 if (front_len > CEPH_MSG_MAX_FRONT_LEN) 2242 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1886 return -EIO; 2243 return -EIO;
1887 middle_len = le32_to_cpu(con->in_hdr.middle_len); 2244 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1888 if (middle_len > CEPH_MSG_MAX_DATA_LEN) 2245 if (middle_len > CEPH_MSG_MAX_MIDDLE_LEN)
1889 return -EIO; 2246 return -EIO;
1890 data_len = le32_to_cpu(con->in_hdr.data_len); 2247 data_len = le32_to_cpu(con->in_hdr.data_len);
1891 if (data_len > CEPH_MSG_MAX_DATA_LEN) 2248 if (data_len > CEPH_MSG_MAX_DATA_LEN)
@@ -1914,14 +2271,22 @@ static int read_partial_message(struct ceph_connection *con)
1914 int skip = 0; 2271 int skip = 0;
1915 2272
1916 dout("got hdr type %d front %d data %d\n", con->in_hdr.type, 2273 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1917 con->in_hdr.front_len, con->in_hdr.data_len); 2274 front_len, data_len);
1918 ret = ceph_con_in_msg_alloc(con, &skip); 2275 ret = ceph_con_in_msg_alloc(con, &skip);
1919 if (ret < 0) 2276 if (ret < 0)
1920 return ret; 2277 return ret;
2278
2279 BUG_ON(!con->in_msg ^ skip);
2280 if (con->in_msg && data_len > con->in_msg->data_length) {
2281 pr_warning("%s skipping long message (%u > %zd)\n",
2282 __func__, data_len, con->in_msg->data_length);
2283 ceph_msg_put(con->in_msg);
2284 con->in_msg = NULL;
2285 skip = 1;
2286 }
1921 if (skip) { 2287 if (skip) {
1922 /* skip this message */ 2288 /* skip this message */
1923 dout("alloc_msg said skip message\n"); 2289 dout("alloc_msg said skip message\n");
1924 BUG_ON(con->in_msg);
1925 con->in_base_pos = -front_len - middle_len - data_len - 2290 con->in_base_pos = -front_len - middle_len - data_len -
1926 sizeof(m->footer); 2291 sizeof(m->footer);
1927 con->in_tag = CEPH_MSGR_TAG_READY; 2292 con->in_tag = CEPH_MSGR_TAG_READY;
@@ -1936,17 +2301,10 @@ static int read_partial_message(struct ceph_connection *con)
1936 if (m->middle) 2301 if (m->middle)
1937 m->middle->vec.iov_len = 0; 2302 m->middle->vec.iov_len = 0;
1938 2303
1939 con->in_msg_pos.page = 0; 2304 /* prepare for data payload, if any */
1940 if (m->pages)
1941 con->in_msg_pos.page_pos = m->page_alignment;
1942 else
1943 con->in_msg_pos.page_pos = 0;
1944 con->in_msg_pos.data_pos = 0;
1945 2305
1946#ifdef CONFIG_BLOCK 2306 if (data_len)
1947 if (m->bio) 2307 prepare_message_data(con->in_msg, data_len);
1948 init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
1949#endif
1950 } 2308 }
1951 2309
1952 /* front */ 2310 /* front */
@@ -1965,24 +2323,10 @@ static int read_partial_message(struct ceph_connection *con)
1965 } 2323 }
1966 2324
1967 /* (page) data */ 2325 /* (page) data */
1968 while (con->in_msg_pos.data_pos < data_len) { 2326 if (data_len) {
1969 if (m->pages) { 2327 ret = read_partial_msg_data(con);
1970 ret = read_partial_message_pages(con, m->pages, 2328 if (ret <= 0)
1971 data_len, do_datacrc); 2329 return ret;
1972 if (ret <= 0)
1973 return ret;
1974#ifdef CONFIG_BLOCK
1975 } else if (m->bio) {
1976 BUG_ON(!m->bio_iter);
1977 ret = read_partial_message_bio(con,
1978 &m->bio_iter, &m->bio_seg,
1979 data_len, do_datacrc);
1980 if (ret <= 0)
1981 return ret;
1982#endif
1983 } else {
1984 BUG_ON(1);
1985 }
1986 } 2330 }
1987 2331
1988 /* footer */ 2332 /* footer */
@@ -2108,13 +2452,13 @@ more_kvec:
2108 goto do_next; 2452 goto do_next;
2109 } 2453 }
2110 2454
2111 ret = write_partial_msg_pages(con); 2455 ret = write_partial_message_data(con);
2112 if (ret == 1) 2456 if (ret == 1)
2113 goto more_kvec; /* we need to send the footer, too! */ 2457 goto more_kvec; /* we need to send the footer, too! */
2114 if (ret == 0) 2458 if (ret == 0)
2115 goto out; 2459 goto out;
2116 if (ret < 0) { 2460 if (ret < 0) {
2117 dout("try_write write_partial_msg_pages err %d\n", 2461 dout("try_write write_partial_message_data err %d\n",
2118 ret); 2462 ret);
2119 goto out; 2463 goto out;
2120 } 2464 }
@@ -2266,7 +2610,12 @@ more:
2266 prepare_read_tag(con); 2610 prepare_read_tag(con);
2267 goto more; 2611 goto more;
2268 } 2612 }
2269 if (con->in_tag == CEPH_MSGR_TAG_ACK) { 2613 if (con->in_tag == CEPH_MSGR_TAG_ACK ||
2614 con->in_tag == CEPH_MSGR_TAG_SEQ) {
2615 /*
2616 * the final handshake seq exchange is semantically
2617 * equivalent to an ACK
2618 */
2270 ret = read_partial_ack(con); 2619 ret = read_partial_ack(con);
2271 if (ret <= 0) 2620 if (ret <= 0)
2272 goto out; 2621 goto out;
@@ -2672,6 +3021,88 @@ void ceph_con_keepalive(struct ceph_connection *con)
2672} 3021}
2673EXPORT_SYMBOL(ceph_con_keepalive); 3022EXPORT_SYMBOL(ceph_con_keepalive);
2674 3023
3024static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
3025{
3026 struct ceph_msg_data *data;
3027
3028 if (WARN_ON(!ceph_msg_data_type_valid(type)))
3029 return NULL;
3030
3031 data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS);
3032 if (data)
3033 data->type = type;
3034 INIT_LIST_HEAD(&data->links);
3035
3036 return data;
3037}
3038
3039static void ceph_msg_data_destroy(struct ceph_msg_data *data)
3040{
3041 if (!data)
3042 return;
3043
3044 WARN_ON(!list_empty(&data->links));
3045 if (data->type == CEPH_MSG_DATA_PAGELIST) {
3046 ceph_pagelist_release(data->pagelist);
3047 kfree(data->pagelist);
3048 }
3049 kmem_cache_free(ceph_msg_data_cache, data);
3050}
3051
3052void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
3053 size_t length, size_t alignment)
3054{
3055 struct ceph_msg_data *data;
3056
3057 BUG_ON(!pages);
3058 BUG_ON(!length);
3059
3060 data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES);
3061 BUG_ON(!data);
3062 data->pages = pages;
3063 data->length = length;
3064 data->alignment = alignment & ~PAGE_MASK;
3065
3066 list_add_tail(&data->links, &msg->data);
3067 msg->data_length += length;
3068}
3069EXPORT_SYMBOL(ceph_msg_data_add_pages);
3070
3071void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
3072 struct ceph_pagelist *pagelist)
3073{
3074 struct ceph_msg_data *data;
3075
3076 BUG_ON(!pagelist);
3077 BUG_ON(!pagelist->length);
3078
3079 data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST);
3080 BUG_ON(!data);
3081 data->pagelist = pagelist;
3082
3083 list_add_tail(&data->links, &msg->data);
3084 msg->data_length += pagelist->length;
3085}
3086EXPORT_SYMBOL(ceph_msg_data_add_pagelist);
3087
#ifdef CONFIG_BLOCK
/*
 * Append a bio data item of @length bytes to @msg and add @length to
 * the message's data_length.  BUGs on a NULL bio or a failed item
 * allocation.
 */
void ceph_msg_data_add_bio(struct ceph_msg *msg, struct bio *bio,
		size_t length)
{
	struct ceph_msg_data *item;

	BUG_ON(!bio);

	item = ceph_msg_data_create(CEPH_MSG_DATA_BIO);
	BUG_ON(!item);

	item->bio_length = length;
	item->bio = bio;

	msg->data_length += length;
	list_add_tail(&item->links, &msg->data);
}
EXPORT_SYMBOL(ceph_msg_data_add_bio);
#endif	/* CONFIG_BLOCK */
2675 3106
2676/* 3107/*
2677 * construct a new message with given type, size 3108 * construct a new message with given type, size
@@ -2682,49 +3113,20 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
2682{ 3113{
2683 struct ceph_msg *m; 3114 struct ceph_msg *m;
2684 3115
2685 m = kmalloc(sizeof(*m), flags); 3116 m = kmem_cache_zalloc(ceph_msg_cache, flags);
2686 if (m == NULL) 3117 if (m == NULL)
2687 goto out; 3118 goto out;
2688 kref_init(&m->kref);
2689 3119
2690 m->con = NULL;
2691 INIT_LIST_HEAD(&m->list_head);
2692
2693 m->hdr.tid = 0;
2694 m->hdr.type = cpu_to_le16(type); 3120 m->hdr.type = cpu_to_le16(type);
2695 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); 3121 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2696 m->hdr.version = 0;
2697 m->hdr.front_len = cpu_to_le32(front_len); 3122 m->hdr.front_len = cpu_to_le32(front_len);
2698 m->hdr.middle_len = 0;
2699 m->hdr.data_len = 0;
2700 m->hdr.data_off = 0;
2701 m->hdr.reserved = 0;
2702 m->footer.front_crc = 0;
2703 m->footer.middle_crc = 0;
2704 m->footer.data_crc = 0;
2705 m->footer.flags = 0;
2706 m->front_max = front_len;
2707 m->front_is_vmalloc = false;
2708 m->more_to_follow = false;
2709 m->ack_stamp = 0;
2710 m->pool = NULL;
2711
2712 /* middle */
2713 m->middle = NULL;
2714 3123
2715 /* data */ 3124 INIT_LIST_HEAD(&m->list_head);
2716 m->nr_pages = 0; 3125 kref_init(&m->kref);
2717 m->page_alignment = 0; 3126 INIT_LIST_HEAD(&m->data);
2718 m->pages = NULL;
2719 m->pagelist = NULL;
2720#ifdef CONFIG_BLOCK
2721 m->bio = NULL;
2722 m->bio_iter = NULL;
2723 m->bio_seg = 0;
2724#endif /* CONFIG_BLOCK */
2725 m->trail = NULL;
2726 3127
2727 /* front */ 3128 /* front */
3129 m->front_max = front_len;
2728 if (front_len) { 3130 if (front_len) {
2729 if (front_len > PAGE_CACHE_SIZE) { 3131 if (front_len > PAGE_CACHE_SIZE) {
2730 m->front.iov_base = __vmalloc(front_len, flags, 3132 m->front.iov_base = __vmalloc(front_len, flags,
@@ -2802,49 +3204,37 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2802static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip) 3204static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
2803{ 3205{
2804 struct ceph_msg_header *hdr = &con->in_hdr; 3206 struct ceph_msg_header *hdr = &con->in_hdr;
2805 int type = le16_to_cpu(hdr->type);
2806 int front_len = le32_to_cpu(hdr->front_len);
2807 int middle_len = le32_to_cpu(hdr->middle_len); 3207 int middle_len = le32_to_cpu(hdr->middle_len);
3208 struct ceph_msg *msg;
2808 int ret = 0; 3209 int ret = 0;
2809 3210
2810 BUG_ON(con->in_msg != NULL); 3211 BUG_ON(con->in_msg != NULL);
3212 BUG_ON(!con->ops->alloc_msg);
2811 3213
2812 if (con->ops->alloc_msg) { 3214 mutex_unlock(&con->mutex);
2813 struct ceph_msg *msg; 3215 msg = con->ops->alloc_msg(con, hdr, skip);
2814 3216 mutex_lock(&con->mutex);
2815 mutex_unlock(&con->mutex); 3217 if (con->state != CON_STATE_OPEN) {
2816 msg = con->ops->alloc_msg(con, hdr, skip); 3218 if (msg)
2817 mutex_lock(&con->mutex); 3219 ceph_msg_put(msg);
2818 if (con->state != CON_STATE_OPEN) { 3220 return -EAGAIN;
2819 if (msg)
2820 ceph_msg_put(msg);
2821 return -EAGAIN;
2822 }
2823 con->in_msg = msg;
2824 if (con->in_msg) {
2825 con->in_msg->con = con->ops->get(con);
2826 BUG_ON(con->in_msg->con == NULL);
2827 }
2828 if (*skip) {
2829 con->in_msg = NULL;
2830 return 0;
2831 }
2832 if (!con->in_msg) {
2833 con->error_msg =
2834 "error allocating memory for incoming message";
2835 return -ENOMEM;
2836 }
2837 } 3221 }
2838 if (!con->in_msg) { 3222 if (msg) {
2839 con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false); 3223 BUG_ON(*skip);
2840 if (!con->in_msg) { 3224 con->in_msg = msg;
2841 pr_err("unable to allocate msg type %d len %d\n",
2842 type, front_len);
2843 return -ENOMEM;
2844 }
2845 con->in_msg->con = con->ops->get(con); 3225 con->in_msg->con = con->ops->get(con);
2846 BUG_ON(con->in_msg->con == NULL); 3226 BUG_ON(con->in_msg->con == NULL);
2847 con->in_msg->page_alignment = le16_to_cpu(hdr->data_off); 3227 } else {
3228 /*
3229 * Null message pointer means either we should skip
3230 * this message or we couldn't allocate memory. The
3231 * former is not an error.
3232 */
3233 if (*skip)
3234 return 0;
3235 con->error_msg = "error allocating memory for incoming message";
3236
3237 return -ENOMEM;
2848 } 3238 }
2849 memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 3239 memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2850 3240
@@ -2870,7 +3260,7 @@ void ceph_msg_kfree(struct ceph_msg *m)
2870 vfree(m->front.iov_base); 3260 vfree(m->front.iov_base);
2871 else 3261 else
2872 kfree(m->front.iov_base); 3262 kfree(m->front.iov_base);
2873 kfree(m); 3263 kmem_cache_free(ceph_msg_cache, m);
2874} 3264}
2875 3265
2876/* 3266/*
@@ -2879,6 +3269,9 @@ void ceph_msg_kfree(struct ceph_msg *m)
2879void ceph_msg_last_put(struct kref *kref) 3269void ceph_msg_last_put(struct kref *kref)
2880{ 3270{
2881 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); 3271 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
3272 LIST_HEAD(data);
3273 struct list_head *links;
3274 struct list_head *next;
2882 3275
2883 dout("ceph_msg_put last one on %p\n", m); 3276 dout("ceph_msg_put last one on %p\n", m);
2884 WARN_ON(!list_empty(&m->list_head)); 3277 WARN_ON(!list_empty(&m->list_head));
@@ -2888,16 +3281,16 @@ void ceph_msg_last_put(struct kref *kref)
2888 ceph_buffer_put(m->middle); 3281 ceph_buffer_put(m->middle);
2889 m->middle = NULL; 3282 m->middle = NULL;
2890 } 3283 }
2891 m->nr_pages = 0;
2892 m->pages = NULL;
2893 3284
2894 if (m->pagelist) { 3285 list_splice_init(&m->data, &data);
2895 ceph_pagelist_release(m->pagelist); 3286 list_for_each_safe(links, next, &data) {
2896 kfree(m->pagelist); 3287 struct ceph_msg_data *data;
2897 m->pagelist = NULL;
2898 }
2899 3288
2900 m->trail = NULL; 3289 data = list_entry(links, struct ceph_msg_data, links);
3290 list_del_init(links);
3291 ceph_msg_data_destroy(data);
3292 }
3293 m->data_length = 0;
2901 3294
2902 if (m->pool) 3295 if (m->pool)
2903 ceph_msgpool_put(m->pool, m); 3296 ceph_msgpool_put(m->pool, m);
@@ -2908,8 +3301,8 @@ EXPORT_SYMBOL(ceph_msg_last_put);
2908 3301
2909void ceph_msg_dump(struct ceph_msg *msg) 3302void ceph_msg_dump(struct ceph_msg *msg)
2910{ 3303{
2911 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg, 3304 pr_debug("msg_dump %p (front_max %d length %zd)\n", msg,
2912 msg->front_max, msg->nr_pages); 3305 msg->front_max, msg->data_length);
2913 print_hex_dump(KERN_DEBUG, "header: ", 3306 print_hex_dump(KERN_DEBUG, "header: ",
2914 DUMP_PREFIX_OFFSET, 16, 1, 3307 DUMP_PREFIX_OFFSET, 16, 1,
2915 &msg->hdr, sizeof(msg->hdr), true); 3308 &msg->hdr, sizeof(msg->hdr), true);
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index aef5b1062bee..1fe25cd29d0e 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -737,7 +737,7 @@ static void delayed_work(struct work_struct *work)
737 737
738 __validate_auth(monc); 738 __validate_auth(monc);
739 739
740 if (monc->auth->ops->is_authenticated(monc->auth)) 740 if (ceph_auth_is_authenticated(monc->auth))
741 __send_subscribe(monc); 741 __send_subscribe(monc);
742 } 742 }
743 __schedule_delayed(monc); 743 __schedule_delayed(monc);
@@ -892,8 +892,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
892 892
893 mutex_lock(&monc->mutex); 893 mutex_lock(&monc->mutex);
894 had_debugfs_info = have_debugfs_info(monc); 894 had_debugfs_info = have_debugfs_info(monc);
895 if (monc->auth->ops) 895 was_auth = ceph_auth_is_authenticated(monc->auth);
896 was_auth = monc->auth->ops->is_authenticated(monc->auth);
897 monc->pending_auth = 0; 896 monc->pending_auth = 0;
898 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 897 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
899 msg->front.iov_len, 898 msg->front.iov_len,
@@ -904,7 +903,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
904 wake_up_all(&monc->client->auth_wq); 903 wake_up_all(&monc->client->auth_wq);
905 } else if (ret > 0) { 904 } else if (ret > 0) {
906 __send_prepared_auth_request(monc, ret); 905 __send_prepared_auth_request(monc, ret);
907 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { 906 } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
908 dout("authenticated, starting session\n"); 907 dout("authenticated, starting session\n");
909 908
910 monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 909 monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index d730dd4d8eb2..d5953b87918c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1,3 +1,4 @@
1
1#include <linux/ceph/ceph_debug.h> 2#include <linux/ceph/ceph_debug.h>
2 3
3#include <linux/module.h> 4#include <linux/module.h>
@@ -21,6 +22,8 @@
21#define OSD_OP_FRONT_LEN 4096 22#define OSD_OP_FRONT_LEN 4096
22#define OSD_OPREPLY_FRONT_LEN 512 23#define OSD_OPREPLY_FRONT_LEN 512
23 24
25static struct kmem_cache *ceph_osd_request_cache;
26
24static const struct ceph_connection_operations osd_con_ops; 27static const struct ceph_connection_operations osd_con_ops;
25 28
26static void __send_queued(struct ceph_osd_client *osdc); 29static void __send_queued(struct ceph_osd_client *osdc);
@@ -32,12 +35,6 @@ static void __unregister_linger_request(struct ceph_osd_client *osdc,
32static void __send_request(struct ceph_osd_client *osdc, 35static void __send_request(struct ceph_osd_client *osdc,
33 struct ceph_osd_request *req); 36 struct ceph_osd_request *req);
34 37
35static int op_has_extent(int op)
36{
37 return (op == CEPH_OSD_OP_READ ||
38 op == CEPH_OSD_OP_WRITE);
39}
40
41/* 38/*
42 * Implement client access to distributed object storage cluster. 39 * Implement client access to distributed object storage cluster.
43 * 40 *
@@ -63,53 +60,238 @@ static int op_has_extent(int op)
63 * 60 *
64 * fill osd op in request message. 61 * fill osd op in request message.
65 */ 62 */
66static int calc_layout(struct ceph_vino vino, 63static int calc_layout(struct ceph_file_layout *layout, u64 off, u64 *plen,
67 struct ceph_file_layout *layout, 64 u64 *objnum, u64 *objoff, u64 *objlen)
68 u64 off, u64 *plen,
69 struct ceph_osd_request *req,
70 struct ceph_osd_req_op *op)
71{ 65{
72 u64 orig_len = *plen; 66 u64 orig_len = *plen;
73 u64 bno = 0;
74 u64 objoff = 0;
75 u64 objlen = 0;
76 int r; 67 int r;
77 68
78 /* object extent? */ 69 /* object extent? */
79 r = ceph_calc_file_object_mapping(layout, off, orig_len, &bno, 70 r = ceph_calc_file_object_mapping(layout, off, orig_len, objnum,
80 &objoff, &objlen); 71 objoff, objlen);
81 if (r < 0) 72 if (r < 0)
82 return r; 73 return r;
83 if (objlen < orig_len) { 74 if (*objlen < orig_len) {
84 *plen = objlen; 75 *plen = *objlen;
85 dout(" skipping last %llu, final file extent %llu~%llu\n", 76 dout(" skipping last %llu, final file extent %llu~%llu\n",
86 orig_len - *plen, off, *plen); 77 orig_len - *plen, off, *plen);
87 } 78 }
88 79
89 if (op_has_extent(op->op)) { 80 dout("calc_layout objnum=%llx %llu~%llu\n", *objnum, *objoff, *objlen);
90 u32 osize = le32_to_cpu(layout->fl_object_size); 81
91 op->extent.offset = objoff; 82 return 0;
92 op->extent.length = objlen; 83}
93 if (op->extent.truncate_size <= off - objoff) { 84
94 op->extent.truncate_size = 0; 85static void ceph_osd_data_init(struct ceph_osd_data *osd_data)
95 } else { 86{
96 op->extent.truncate_size -= off - objoff; 87 memset(osd_data, 0, sizeof (*osd_data));
97 if (op->extent.truncate_size > osize) 88 osd_data->type = CEPH_OSD_DATA_TYPE_NONE;
98 op->extent.truncate_size = osize; 89}
99 } 90
91static void ceph_osd_data_pages_init(struct ceph_osd_data *osd_data,
92 struct page **pages, u64 length, u32 alignment,
93 bool pages_from_pool, bool own_pages)
94{
95 osd_data->type = CEPH_OSD_DATA_TYPE_PAGES;
96 osd_data->pages = pages;
97 osd_data->length = length;
98 osd_data->alignment = alignment;
99 osd_data->pages_from_pool = pages_from_pool;
100 osd_data->own_pages = own_pages;
101}
102
103static void ceph_osd_data_pagelist_init(struct ceph_osd_data *osd_data,
104 struct ceph_pagelist *pagelist)
105{
106 osd_data->type = CEPH_OSD_DATA_TYPE_PAGELIST;
107 osd_data->pagelist = pagelist;
108}
109
#ifdef CONFIG_BLOCK
/* Mark @osd_data as bio data: record @bio and its byte length. */
static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
			struct bio *bio, size_t bio_length)
{
	osd_data->bio_length = bio_length;
	osd_data->bio = bio;
	osd_data->type = CEPH_OSD_DATA_TYPE_BIO;
}
#endif	/* CONFIG_BLOCK */
119
/*
 * Yield a pointer to field @fld of union member @typ in op number
 * @whch of request @oreq, after checking @whch against r_num_ops.
 *
 * The arguments are captured in locals so that each is evaluated
 * exactly once, even if the caller passes an expression with side
 * effects.
 */
#define osd_req_op_data(oreq, whch, typ, fld)				\
	({								\
		struct ceph_osd_request *__oreq = (oreq);		\
		unsigned int __whch = (whch);				\
		BUG_ON(__whch >= __oreq->r_num_ops);			\
		&__oreq->r_ops[__whch].typ.fld;				\
	})
125
126static struct ceph_osd_data *
127osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
128{
129 BUG_ON(which >= osd_req->r_num_ops);
130
131 return &osd_req->r_ops[which].raw_data_in;
132}
133
134struct ceph_osd_data *
135osd_req_op_extent_osd_data(struct ceph_osd_request *osd_req,
136 unsigned int which)
137{
138 return osd_req_op_data(osd_req, which, extent, osd_data);
139}
140EXPORT_SYMBOL(osd_req_op_extent_osd_data);
141
142struct ceph_osd_data *
143osd_req_op_cls_response_data(struct ceph_osd_request *osd_req,
144 unsigned int which)
145{
146 return osd_req_op_data(osd_req, which, cls, response_data);
147}
148EXPORT_SYMBOL(osd_req_op_cls_response_data); /* ??? */
149
150void osd_req_op_raw_data_in_pages(struct ceph_osd_request *osd_req,
151 unsigned int which, struct page **pages,
152 u64 length, u32 alignment,
153 bool pages_from_pool, bool own_pages)
154{
155 struct ceph_osd_data *osd_data;
156
157 osd_data = osd_req_op_raw_data_in(osd_req, which);
158 ceph_osd_data_pages_init(osd_data, pages, length, alignment,
159 pages_from_pool, own_pages);
160}
161EXPORT_SYMBOL(osd_req_op_raw_data_in_pages);
162
163void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req,
164 unsigned int which, struct page **pages,
165 u64 length, u32 alignment,
166 bool pages_from_pool, bool own_pages)
167{
168 struct ceph_osd_data *osd_data;
169
170 osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
171 ceph_osd_data_pages_init(osd_data, pages, length, alignment,
172 pages_from_pool, own_pages);
173}
174EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages);
175
176void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req,
177 unsigned int which, struct ceph_pagelist *pagelist)
178{
179 struct ceph_osd_data *osd_data;
180
181 osd_data = osd_req_op_data(osd_req, which, extent, osd_data);
182 ceph_osd_data_pagelist_init(osd_data, pagelist);
183}
184EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist);
185
#ifdef CONFIG_BLOCK
/*
 * Set the extent osd_data field of op number @which to @bio, with
 * @bio_length recorded as its length.
 */
void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req,
			unsigned int which, struct bio *bio, size_t bio_length)
{
	struct ceph_osd_data *target =
		osd_req_op_data(osd_req, which, extent, osd_data);

	ceph_osd_data_bio_init(target, bio, bio_length);
}
EXPORT_SYMBOL(osd_req_op_extent_osd_data_bio);
#endif	/* CONFIG_BLOCK */
197
198static void osd_req_op_cls_request_info_pagelist(
199 struct ceph_osd_request *osd_req,
200 unsigned int which, struct ceph_pagelist *pagelist)
201{
202 struct ceph_osd_data *osd_data;
203
204 osd_data = osd_req_op_data(osd_req, which, cls, request_info);
205 ceph_osd_data_pagelist_init(osd_data, pagelist);
206}
207
208void osd_req_op_cls_request_data_pagelist(
209 struct ceph_osd_request *osd_req,
210 unsigned int which, struct ceph_pagelist *pagelist)
211{
212 struct ceph_osd_data *osd_data;
213
214 osd_data = osd_req_op_data(osd_req, which, cls, request_data);
215 ceph_osd_data_pagelist_init(osd_data, pagelist);
216}
217EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist);
218
219void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req,
220 unsigned int which, struct page **pages, u64 length,
221 u32 alignment, bool pages_from_pool, bool own_pages)
222{
223 struct ceph_osd_data *osd_data;
224
225 osd_data = osd_req_op_data(osd_req, which, cls, request_data);
226 ceph_osd_data_pages_init(osd_data, pages, length, alignment,
227 pages_from_pool, own_pages);
228}
229EXPORT_SYMBOL(osd_req_op_cls_request_data_pages);
230
231void osd_req_op_cls_response_data_pages(struct ceph_osd_request *osd_req,
232 unsigned int which, struct page **pages, u64 length,
233 u32 alignment, bool pages_from_pool, bool own_pages)
234{
235 struct ceph_osd_data *osd_data;
236
237 osd_data = osd_req_op_data(osd_req, which, cls, response_data);
238 ceph_osd_data_pages_init(osd_data, pages, length, alignment,
239 pages_from_pool, own_pages);
240}
241EXPORT_SYMBOL(osd_req_op_cls_response_data_pages);
242
243static u64 ceph_osd_data_length(struct ceph_osd_data *osd_data)
244{
245 switch (osd_data->type) {
246 case CEPH_OSD_DATA_TYPE_NONE:
247 return 0;
248 case CEPH_OSD_DATA_TYPE_PAGES:
249 return osd_data->length;
250 case CEPH_OSD_DATA_TYPE_PAGELIST:
251 return (u64)osd_data->pagelist->length;
252#ifdef CONFIG_BLOCK
253 case CEPH_OSD_DATA_TYPE_BIO:
254 return (u64)osd_data->bio_length;
255#endif /* CONFIG_BLOCK */
256 default:
257 WARN(true, "unrecognized data type %d\n", (int)osd_data->type);
258 return 0;
100 } 259 }
101 req->r_num_pages = calc_pages_for(off, *plen); 260}
102 req->r_page_alignment = off & ~PAGE_MASK;
103 if (op->op == CEPH_OSD_OP_WRITE)
104 op->payload_len = *plen;
105 261
106 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", 262static void ceph_osd_data_release(struct ceph_osd_data *osd_data)
107 bno, objoff, objlen, req->r_num_pages); 263{
264 if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES && osd_data->own_pages) {
265 int num_pages;
108 266
109 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx", vino.ino, bno); 267 num_pages = calc_pages_for((u64)osd_data->alignment,
110 req->r_oid_len = strlen(req->r_oid); 268 (u64)osd_data->length);
269 ceph_release_page_vector(osd_data->pages, num_pages);
270 }
271 ceph_osd_data_init(osd_data);
272}
273
274static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
275 unsigned int which)
276{
277 struct ceph_osd_req_op *op;
278
279 BUG_ON(which >= osd_req->r_num_ops);
280 op = &osd_req->r_ops[which];
111 281
112 return r; 282 switch (op->op) {
283 case CEPH_OSD_OP_READ:
284 case CEPH_OSD_OP_WRITE:
285 ceph_osd_data_release(&op->extent.osd_data);
286 break;
287 case CEPH_OSD_OP_CALL:
288 ceph_osd_data_release(&op->cls.request_info);
289 ceph_osd_data_release(&op->cls.request_data);
290 ceph_osd_data_release(&op->cls.response_data);
291 break;
292 default:
293 break;
294 }
113} 295}
114 296
115/* 297/*
@@ -117,30 +299,26 @@ static int calc_layout(struct ceph_vino vino,
117 */ 299 */
118void ceph_osdc_release_request(struct kref *kref) 300void ceph_osdc_release_request(struct kref *kref)
119{ 301{
120 struct ceph_osd_request *req = container_of(kref, 302 struct ceph_osd_request *req;
121 struct ceph_osd_request, 303 unsigned int which;
122 r_kref);
123 304
305 req = container_of(kref, struct ceph_osd_request, r_kref);
124 if (req->r_request) 306 if (req->r_request)
125 ceph_msg_put(req->r_request); 307 ceph_msg_put(req->r_request);
126 if (req->r_con_filling_msg) { 308 if (req->r_reply) {
127 dout("%s revoking msg %p from con %p\n", __func__,
128 req->r_reply, req->r_con_filling_msg);
129 ceph_msg_revoke_incoming(req->r_reply); 309 ceph_msg_revoke_incoming(req->r_reply);
130 req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
131 req->r_con_filling_msg = NULL;
132 }
133 if (req->r_reply)
134 ceph_msg_put(req->r_reply); 310 ceph_msg_put(req->r_reply);
135 if (req->r_own_pages) 311 }
136 ceph_release_page_vector(req->r_pages, 312
137 req->r_num_pages); 313 for (which = 0; which < req->r_num_ops; which++)
314 osd_req_op_data_release(req, which);
315
138 ceph_put_snap_context(req->r_snapc); 316 ceph_put_snap_context(req->r_snapc);
139 ceph_pagelist_release(&req->r_trail);
140 if (req->r_mempool) 317 if (req->r_mempool)
141 mempool_free(req, req->r_osdc->req_mempool); 318 mempool_free(req, req->r_osdc->req_mempool);
142 else 319 else
143 kfree(req); 320 kmem_cache_free(ceph_osd_request_cache, req);
321
144} 322}
145EXPORT_SYMBOL(ceph_osdc_release_request); 323EXPORT_SYMBOL(ceph_osdc_release_request);
146 324
@@ -154,6 +332,9 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
154 struct ceph_msg *msg; 332 struct ceph_msg *msg;
155 size_t msg_size; 333 size_t msg_size;
156 334
335 BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX);
336 BUG_ON(num_ops > CEPH_OSD_MAX_OP);
337
157 msg_size = 4 + 4 + 8 + 8 + 4+8; 338 msg_size = 4 + 4 + 8 + 8 + 4+8;
158 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */ 339 msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
159 msg_size += 1 + 8 + 4 + 4; /* pg_t */ 340 msg_size += 1 + 8 + 4 + 4; /* pg_t */
@@ -168,13 +349,14 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
168 req = mempool_alloc(osdc->req_mempool, gfp_flags); 349 req = mempool_alloc(osdc->req_mempool, gfp_flags);
169 memset(req, 0, sizeof(*req)); 350 memset(req, 0, sizeof(*req));
170 } else { 351 } else {
171 req = kzalloc(sizeof(*req), gfp_flags); 352 req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags);
172 } 353 }
173 if (req == NULL) 354 if (req == NULL)
174 return NULL; 355 return NULL;
175 356
176 req->r_osdc = osdc; 357 req->r_osdc = osdc;
177 req->r_mempool = use_mempool; 358 req->r_mempool = use_mempool;
359 req->r_num_ops = num_ops;
178 360
179 kref_init(&req->r_kref); 361 kref_init(&req->r_kref);
180 init_completion(&req->r_completion); 362 init_completion(&req->r_completion);
@@ -198,8 +380,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
198 } 380 }
199 req->r_reply = msg; 381 req->r_reply = msg;
200 382
201 ceph_pagelist_init(&req->r_trail);
202
203 /* create request message; allow space for oid */ 383 /* create request message; allow space for oid */
204 if (use_mempool) 384 if (use_mempool)
205 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 385 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
@@ -218,60 +398,24 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
218} 398}
219EXPORT_SYMBOL(ceph_osdc_alloc_request); 399EXPORT_SYMBOL(ceph_osdc_alloc_request);
220 400
221static void osd_req_encode_op(struct ceph_osd_request *req, 401static bool osd_req_opcode_valid(u16 opcode)
222 struct ceph_osd_op *dst,
223 struct ceph_osd_req_op *src)
224{ 402{
225 dst->op = cpu_to_le16(src->op); 403 switch (opcode) {
226
227 switch (src->op) {
228 case CEPH_OSD_OP_STAT:
229 break;
230 case CEPH_OSD_OP_READ: 404 case CEPH_OSD_OP_READ:
231 case CEPH_OSD_OP_WRITE: 405 case CEPH_OSD_OP_STAT:
232 dst->extent.offset =
233 cpu_to_le64(src->extent.offset);
234 dst->extent.length =
235 cpu_to_le64(src->extent.length);
236 dst->extent.truncate_size =
237 cpu_to_le64(src->extent.truncate_size);
238 dst->extent.truncate_seq =
239 cpu_to_le32(src->extent.truncate_seq);
240 break;
241 case CEPH_OSD_OP_CALL:
242 dst->cls.class_len = src->cls.class_len;
243 dst->cls.method_len = src->cls.method_len;
244 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
245
246 ceph_pagelist_append(&req->r_trail, src->cls.class_name,
247 src->cls.class_len);
248 ceph_pagelist_append(&req->r_trail, src->cls.method_name,
249 src->cls.method_len);
250 ceph_pagelist_append(&req->r_trail, src->cls.indata,
251 src->cls.indata_len);
252 break;
253 case CEPH_OSD_OP_STARTSYNC:
254 break;
255 case CEPH_OSD_OP_NOTIFY_ACK:
256 case CEPH_OSD_OP_WATCH:
257 dst->watch.cookie = cpu_to_le64(src->watch.cookie);
258 dst->watch.ver = cpu_to_le64(src->watch.ver);
259 dst->watch.flag = src->watch.flag;
260 break;
261 default:
262 pr_err("unrecognized osd opcode %d\n", dst->op);
263 WARN_ON(1);
264 break;
265 case CEPH_OSD_OP_MAPEXT: 406 case CEPH_OSD_OP_MAPEXT:
266 case CEPH_OSD_OP_MASKTRUNC: 407 case CEPH_OSD_OP_MASKTRUNC:
267 case CEPH_OSD_OP_SPARSE_READ: 408 case CEPH_OSD_OP_SPARSE_READ:
268 case CEPH_OSD_OP_NOTIFY: 409 case CEPH_OSD_OP_NOTIFY:
410 case CEPH_OSD_OP_NOTIFY_ACK:
269 case CEPH_OSD_OP_ASSERT_VER: 411 case CEPH_OSD_OP_ASSERT_VER:
412 case CEPH_OSD_OP_WRITE:
270 case CEPH_OSD_OP_WRITEFULL: 413 case CEPH_OSD_OP_WRITEFULL:
271 case CEPH_OSD_OP_TRUNCATE: 414 case CEPH_OSD_OP_TRUNCATE:
272 case CEPH_OSD_OP_ZERO: 415 case CEPH_OSD_OP_ZERO:
273 case CEPH_OSD_OP_DELETE: 416 case CEPH_OSD_OP_DELETE:
274 case CEPH_OSD_OP_APPEND: 417 case CEPH_OSD_OP_APPEND:
418 case CEPH_OSD_OP_STARTSYNC:
275 case CEPH_OSD_OP_SETTRUNC: 419 case CEPH_OSD_OP_SETTRUNC:
276 case CEPH_OSD_OP_TRIMTRUNC: 420 case CEPH_OSD_OP_TRIMTRUNC:
277 case CEPH_OSD_OP_TMAPUP: 421 case CEPH_OSD_OP_TMAPUP:
@@ -279,11 +423,11 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
279 case CEPH_OSD_OP_TMAPGET: 423 case CEPH_OSD_OP_TMAPGET:
280 case CEPH_OSD_OP_CREATE: 424 case CEPH_OSD_OP_CREATE:
281 case CEPH_OSD_OP_ROLLBACK: 425 case CEPH_OSD_OP_ROLLBACK:
426 case CEPH_OSD_OP_WATCH:
282 case CEPH_OSD_OP_OMAPGETKEYS: 427 case CEPH_OSD_OP_OMAPGETKEYS:
283 case CEPH_OSD_OP_OMAPGETVALS: 428 case CEPH_OSD_OP_OMAPGETVALS:
284 case CEPH_OSD_OP_OMAPGETHEADER: 429 case CEPH_OSD_OP_OMAPGETHEADER:
285 case CEPH_OSD_OP_OMAPGETVALSBYKEYS: 430 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
286 case CEPH_OSD_OP_MODE_RD:
287 case CEPH_OSD_OP_OMAPSETVALS: 431 case CEPH_OSD_OP_OMAPSETVALS:
288 case CEPH_OSD_OP_OMAPSETHEADER: 432 case CEPH_OSD_OP_OMAPSETHEADER:
289 case CEPH_OSD_OP_OMAPCLEAR: 433 case CEPH_OSD_OP_OMAPCLEAR:
@@ -314,113 +458,233 @@ static void osd_req_encode_op(struct ceph_osd_request *req,
314 case CEPH_OSD_OP_RDUNLOCK: 458 case CEPH_OSD_OP_RDUNLOCK:
315 case CEPH_OSD_OP_UPLOCK: 459 case CEPH_OSD_OP_UPLOCK:
316 case CEPH_OSD_OP_DNLOCK: 460 case CEPH_OSD_OP_DNLOCK:
461 case CEPH_OSD_OP_CALL:
317 case CEPH_OSD_OP_PGLS: 462 case CEPH_OSD_OP_PGLS:
318 case CEPH_OSD_OP_PGLS_FILTER: 463 case CEPH_OSD_OP_PGLS_FILTER:
319 pr_err("unsupported osd opcode %s\n", 464 return true;
320 ceph_osd_op_name(dst->op)); 465 default:
321 WARN_ON(1); 466 return false;
322 break;
323 } 467 }
324 dst->payload_len = cpu_to_le32(src->payload_len);
325} 468}
326 469
327/* 470/*
328 * build new request AND message 471 * This is an osd op init function for opcodes that have no data or
329 * 472 * other information associated with them. It also serves as a
473 * common init routine for all the other init functions, below.
330 */ 474 */
331void ceph_osdc_build_request(struct ceph_osd_request *req, 475static struct ceph_osd_req_op *
332 u64 off, u64 len, unsigned int num_ops, 476_osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which,
333 struct ceph_osd_req_op *src_ops, 477 u16 opcode)
334 struct ceph_snap_context *snapc, u64 snap_id,
335 struct timespec *mtime)
336{ 478{
337 struct ceph_msg *msg = req->r_request; 479 struct ceph_osd_req_op *op;
338 struct ceph_osd_req_op *src_op;
339 void *p;
340 size_t msg_size;
341 int flags = req->r_flags;
342 u64 data_len;
343 int i;
344 480
345 req->r_num_ops = num_ops; 481 BUG_ON(which >= osd_req->r_num_ops);
346 req->r_snapid = snap_id; 482 BUG_ON(!osd_req_opcode_valid(opcode));
347 req->r_snapc = ceph_get_snap_context(snapc);
348 483
349 /* encode request */ 484 op = &osd_req->r_ops[which];
350 msg->hdr.version = cpu_to_le16(4); 485 memset(op, 0, sizeof (*op));
486 op->op = opcode;
351 487
352 p = msg->front.iov_base; 488 return op;
353 ceph_encode_32(&p, 1); /* client_inc is always 1 */ 489}
354 req->r_request_osdmap_epoch = p;
355 p += 4;
356 req->r_request_flags = p;
357 p += 4;
358 if (req->r_flags & CEPH_OSD_FLAG_WRITE)
359 ceph_encode_timespec(p, mtime);
360 p += sizeof(struct ceph_timespec);
361 req->r_request_reassert_version = p;
362 p += sizeof(struct ceph_eversion); /* will get filled in */
363 490
364 /* oloc */ 491void osd_req_op_init(struct ceph_osd_request *osd_req,
365 ceph_encode_8(&p, 4); 492 unsigned int which, u16 opcode)
366 ceph_encode_8(&p, 4); 493{
367 ceph_encode_32(&p, 8 + 4 + 4); 494 (void)_osd_req_op_init(osd_req, which, opcode);
368 req->r_request_pool = p; 495}
369 p += 8; 496EXPORT_SYMBOL(osd_req_op_init);
370 ceph_encode_32(&p, -1); /* preferred */
371 ceph_encode_32(&p, 0); /* key len */
372 497
373 ceph_encode_8(&p, 1); 498void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
374 req->r_request_pgid = p; 499 unsigned int which, u16 opcode,
375 p += 8 + 4; 500 u64 offset, u64 length,
376 ceph_encode_32(&p, -1); /* preferred */ 501 u64 truncate_size, u32 truncate_seq)
502{
503 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
504 size_t payload_len = 0;
377 505
378 /* oid */ 506 BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE);
379 ceph_encode_32(&p, req->r_oid_len);
380 memcpy(p, req->r_oid, req->r_oid_len);
381 dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
382 p += req->r_oid_len;
383 507
384 /* ops */ 508 op->extent.offset = offset;
385 ceph_encode_16(&p, num_ops); 509 op->extent.length = length;
386 src_op = src_ops; 510 op->extent.truncate_size = truncate_size;
387 req->r_request_ops = p; 511 op->extent.truncate_seq = truncate_seq;
388 for (i = 0; i < num_ops; i++, src_op++) { 512 if (opcode == CEPH_OSD_OP_WRITE)
389 osd_req_encode_op(req, p, src_op); 513 payload_len += length;
390 p += sizeof(struct ceph_osd_op);
391 }
392 514
393 /* snaps */ 515 op->payload_len = payload_len;
394 ceph_encode_64(&p, req->r_snapid); 516}
395 ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0); 517EXPORT_SYMBOL(osd_req_op_extent_init);
396 ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0); 518
397 if (req->r_snapc) { 519void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
398 for (i = 0; i < snapc->num_snaps; i++) { 520 unsigned int which, u64 length)
399 ceph_encode_64(&p, req->r_snapc->snaps[i]); 521{
400 } 522 struct ceph_osd_req_op *op;
523 u64 previous;
524
525 BUG_ON(which >= osd_req->r_num_ops);
526 op = &osd_req->r_ops[which];
527 previous = op->extent.length;
528
529 if (length == previous)
530 return; /* Nothing to do */
531 BUG_ON(length > previous);
532
533 op->extent.length = length;
534 op->payload_len -= previous - length;
535}
536EXPORT_SYMBOL(osd_req_op_extent_update);
537
538void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
539 u16 opcode, const char *class, const char *method)
540{
541 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
542 struct ceph_pagelist *pagelist;
543 size_t payload_len = 0;
544 size_t size;
545
546 BUG_ON(opcode != CEPH_OSD_OP_CALL);
547
548 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
549 BUG_ON(!pagelist);
550 ceph_pagelist_init(pagelist);
551
552 op->cls.class_name = class;
553 size = strlen(class);
554 BUG_ON(size > (size_t) U8_MAX);
555 op->cls.class_len = size;
556 ceph_pagelist_append(pagelist, class, size);
557 payload_len += size;
558
559 op->cls.method_name = method;
560 size = strlen(method);
561 BUG_ON(size > (size_t) U8_MAX);
562 op->cls.method_len = size;
563 ceph_pagelist_append(pagelist, method, size);
564 payload_len += size;
565
566 osd_req_op_cls_request_info_pagelist(osd_req, which, pagelist);
567
568 op->cls.argc = 0; /* currently unused */
569
570 op->payload_len = payload_len;
571}
572EXPORT_SYMBOL(osd_req_op_cls_init);
573
574void osd_req_op_watch_init(struct ceph_osd_request *osd_req,
575 unsigned int which, u16 opcode,
576 u64 cookie, u64 version, int flag)
577{
578 struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode);
579
580 BUG_ON(opcode != CEPH_OSD_OP_NOTIFY_ACK && opcode != CEPH_OSD_OP_WATCH);
581
582 op->watch.cookie = cookie;
583 op->watch.ver = version;
584 if (opcode == CEPH_OSD_OP_WATCH && flag)
585 op->watch.flag = (u8)1;
586}
587EXPORT_SYMBOL(osd_req_op_watch_init);
588
589static void ceph_osdc_msg_data_add(struct ceph_msg *msg,
590 struct ceph_osd_data *osd_data)
591{
592 u64 length = ceph_osd_data_length(osd_data);
593
594 if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
595 BUG_ON(length > (u64) SIZE_MAX);
596 if (length)
597 ceph_msg_data_add_pages(msg, osd_data->pages,
598 length, osd_data->alignment);
599 } else if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGELIST) {
600 BUG_ON(!length);
601 ceph_msg_data_add_pagelist(msg, osd_data->pagelist);
602#ifdef CONFIG_BLOCK
603 } else if (osd_data->type == CEPH_OSD_DATA_TYPE_BIO) {
604 ceph_msg_data_add_bio(msg, osd_data->bio, length);
605#endif
606 } else {
607 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_NONE);
401 } 608 }
609}
402 610
403 req->r_request_attempts = p; 611static u64 osd_req_encode_op(struct ceph_osd_request *req,
404 p += 4; 612 struct ceph_osd_op *dst, unsigned int which)
613{
614 struct ceph_osd_req_op *src;
615 struct ceph_osd_data *osd_data;
616 u64 request_data_len = 0;
617 u64 data_length;
405 618
406 data_len = req->r_trail.length; 619 BUG_ON(which >= req->r_num_ops);
407 if (flags & CEPH_OSD_FLAG_WRITE) { 620 src = &req->r_ops[which];
408 req->r_request->hdr.data_off = cpu_to_le16(off); 621 if (WARN_ON(!osd_req_opcode_valid(src->op))) {
409 data_len += len; 622 pr_err("unrecognized osd opcode %d\n", src->op);
623
624 return 0;
410 } 625 }
411 req->r_request->hdr.data_len = cpu_to_le32(data_len);
412 req->r_request->page_alignment = req->r_page_alignment;
413 626
414 BUG_ON(p > msg->front.iov_base + msg->front.iov_len); 627 switch (src->op) {
415 msg_size = p - msg->front.iov_base; 628 case CEPH_OSD_OP_STAT:
416 msg->front.iov_len = msg_size; 629 osd_data = &src->raw_data_in;
417 msg->hdr.front_len = cpu_to_le32(msg_size); 630 ceph_osdc_msg_data_add(req->r_reply, osd_data);
631 break;
632 case CEPH_OSD_OP_READ:
633 case CEPH_OSD_OP_WRITE:
634 if (src->op == CEPH_OSD_OP_WRITE)
635 request_data_len = src->extent.length;
636 dst->extent.offset = cpu_to_le64(src->extent.offset);
637 dst->extent.length = cpu_to_le64(src->extent.length);
638 dst->extent.truncate_size =
639 cpu_to_le64(src->extent.truncate_size);
640 dst->extent.truncate_seq =
641 cpu_to_le32(src->extent.truncate_seq);
642 osd_data = &src->extent.osd_data;
643 if (src->op == CEPH_OSD_OP_WRITE)
644 ceph_osdc_msg_data_add(req->r_request, osd_data);
645 else
646 ceph_osdc_msg_data_add(req->r_reply, osd_data);
647 break;
648 case CEPH_OSD_OP_CALL:
649 dst->cls.class_len = src->cls.class_len;
650 dst->cls.method_len = src->cls.method_len;
651 osd_data = &src->cls.request_info;
652 ceph_osdc_msg_data_add(req->r_request, osd_data);
653 BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGELIST);
654 request_data_len = osd_data->pagelist->length;
655
656 osd_data = &src->cls.request_data;
657 data_length = ceph_osd_data_length(osd_data);
658 if (data_length) {
659 BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
660 dst->cls.indata_len = cpu_to_le32(data_length);
661 ceph_osdc_msg_data_add(req->r_request, osd_data);
662 src->payload_len += data_length;
663 request_data_len += data_length;
664 }
665 osd_data = &src->cls.response_data;
666 ceph_osdc_msg_data_add(req->r_reply, osd_data);
667 break;
668 case CEPH_OSD_OP_STARTSYNC:
669 break;
670 case CEPH_OSD_OP_NOTIFY_ACK:
671 case CEPH_OSD_OP_WATCH:
672 dst->watch.cookie = cpu_to_le64(src->watch.cookie);
673 dst->watch.ver = cpu_to_le64(src->watch.ver);
674 dst->watch.flag = src->watch.flag;
675 break;
676 default:
677 pr_err("unsupported osd opcode %s\n",
678 ceph_osd_op_name(src->op));
679 WARN_ON(1);
418 680
419 dout("build_request msg_size was %d num_ops %d\n", (int)msg_size, 681 return 0;
420 num_ops); 682 }
421 return; 683 dst->op = cpu_to_le16(src->op);
684 dst->payload_len = cpu_to_le32(src->payload_len);
685
686 return request_data_len;
422} 687}
423EXPORT_SYMBOL(ceph_osdc_build_request);
424 688
425/* 689/*
426 * build new request AND message, calculate layout, and adjust file 690 * build new request AND message, calculate layout, and adjust file
@@ -436,51 +700,63 @@ EXPORT_SYMBOL(ceph_osdc_build_request);
436struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, 700struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
437 struct ceph_file_layout *layout, 701 struct ceph_file_layout *layout,
438 struct ceph_vino vino, 702 struct ceph_vino vino,
439 u64 off, u64 *plen, 703 u64 off, u64 *plen, int num_ops,
440 int opcode, int flags, 704 int opcode, int flags,
441 struct ceph_snap_context *snapc, 705 struct ceph_snap_context *snapc,
442 int do_sync,
443 u32 truncate_seq, 706 u32 truncate_seq,
444 u64 truncate_size, 707 u64 truncate_size,
445 struct timespec *mtime, 708 bool use_mempool)
446 bool use_mempool,
447 int page_align)
448{ 709{
449 struct ceph_osd_req_op ops[2];
450 struct ceph_osd_request *req; 710 struct ceph_osd_request *req;
451 unsigned int num_op = 1; 711 u64 objnum = 0;
712 u64 objoff = 0;
713 u64 objlen = 0;
714 u32 object_size;
715 u64 object_base;
452 int r; 716 int r;
453 717
454 memset(&ops, 0, sizeof ops); 718 BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE);
455
456 ops[0].op = opcode;
457 ops[0].extent.truncate_seq = truncate_seq;
458 ops[0].extent.truncate_size = truncate_size;
459
460 if (do_sync) {
461 ops[1].op = CEPH_OSD_OP_STARTSYNC;
462 num_op++;
463 }
464 719
465 req = ceph_osdc_alloc_request(osdc, snapc, num_op, use_mempool, 720 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool,
466 GFP_NOFS); 721 GFP_NOFS);
467 if (!req) 722 if (!req)
468 return ERR_PTR(-ENOMEM); 723 return ERR_PTR(-ENOMEM);
724
469 req->r_flags = flags; 725 req->r_flags = flags;
470 726
471 /* calculate max write size */ 727 /* calculate max write size */
472 r = calc_layout(vino, layout, off, plen, req, ops); 728 r = calc_layout(layout, off, plen, &objnum, &objoff, &objlen);
473 if (r < 0) 729 if (r < 0) {
730 ceph_osdc_put_request(req);
474 return ERR_PTR(r); 731 return ERR_PTR(r);
475 req->r_file_layout = *layout; /* keep a copy */ 732 }
476 733
477 /* in case it differs from natural (file) alignment that 734 object_size = le32_to_cpu(layout->fl_object_size);
478 calc_layout filled in for us */ 735 object_base = off - objoff;
479 req->r_num_pages = calc_pages_for(page_align, *plen); 736 if (truncate_size <= object_base) {
480 req->r_page_alignment = page_align; 737 truncate_size = 0;
738 } else {
739 truncate_size -= object_base;
740 if (truncate_size > object_size)
741 truncate_size = object_size;
742 }
743
744 osd_req_op_extent_init(req, 0, opcode, objoff, objlen,
745 truncate_size, truncate_seq);
746
747 /*
748 * A second op in the ops array means the caller wants to
749 * also include a 'startsync' command so that the
750 * osd will flush data quickly.
751 */
752 if (num_ops > 1)
753 osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC);
754
755 req->r_file_layout = *layout; /* keep a copy */
481 756
482 ceph_osdc_build_request(req, off, *plen, num_op, ops, 757 snprintf(req->r_oid, sizeof(req->r_oid), "%llx.%08llx",
483 snapc, vino.snap, mtime); 758 vino.ino, objnum);
759 req->r_oid_len = strlen(req->r_oid);
484 760
485 return req; 761 return req;
486} 762}
@@ -558,21 +834,46 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
558 struct ceph_osd *osd) 834 struct ceph_osd *osd)
559{ 835{
560 struct ceph_osd_request *req, *nreq; 836 struct ceph_osd_request *req, *nreq;
837 LIST_HEAD(resend);
561 int err; 838 int err;
562 839
563 dout("__kick_osd_requests osd%d\n", osd->o_osd); 840 dout("__kick_osd_requests osd%d\n", osd->o_osd);
564 err = __reset_osd(osdc, osd); 841 err = __reset_osd(osdc, osd);
565 if (err) 842 if (err)
566 return; 843 return;
567 844 /*
845 * Build up a list of requests to resend by traversing the
846 * osd's list of requests. Requests for a given object are
847 * sent in tid order, and that is also the order they're
848 * kept on this list. Therefore all requests that are in
849 * flight will be found first, followed by all requests that
850 * have not yet been sent. And to resend requests while
851 * preserving this order we will want to put any sent
852 * requests back on the front of the osd client's unsent
853 * list.
854 *
855 * So we build a separate ordered list of already-sent
856 * requests for the affected osd and splice it onto the
857 * front of the osd client's unsent list. Once we've seen a
858 * request that has not yet been sent we're done. Those
859 * requests are already sitting right where they belong.
860 */
568 list_for_each_entry(req, &osd->o_requests, r_osd_item) { 861 list_for_each_entry(req, &osd->o_requests, r_osd_item) {
569 list_move(&req->r_req_lru_item, &osdc->req_unsent); 862 if (!req->r_sent)
570 dout("requeued %p tid %llu osd%d\n", req, req->r_tid, 863 break;
864 list_move_tail(&req->r_req_lru_item, &resend);
865 dout("requeueing %p tid %llu osd%d\n", req, req->r_tid,
571 osd->o_osd); 866 osd->o_osd);
572 if (!req->r_linger) 867 if (!req->r_linger)
573 req->r_flags |= CEPH_OSD_FLAG_RETRY; 868 req->r_flags |= CEPH_OSD_FLAG_RETRY;
574 } 869 }
870 list_splice(&resend, &osdc->req_unsent);
575 871
872 /*
873 * Linger requests are re-registered before sending, which
874 * sets up a new tid for each. We add them to the unsent
875 * list at the end to keep things in tid order.
876 */
576 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests, 877 list_for_each_entry_safe(req, nreq, &osd->o_linger_requests,
577 r_linger_osd) { 878 r_linger_osd) {
578 /* 879 /*
@@ -581,8 +882,8 @@ static void __kick_osd_requests(struct ceph_osd_client *osdc,
581 */ 882 */
582 BUG_ON(!list_empty(&req->r_req_lru_item)); 883 BUG_ON(!list_empty(&req->r_req_lru_item));
583 __register_request(osdc, req); 884 __register_request(osdc, req);
584 list_add(&req->r_req_lru_item, &osdc->req_unsent); 885 list_add_tail(&req->r_req_lru_item, &osdc->req_unsent);
585 list_add(&req->r_osd_item, &req->r_osd->o_requests); 886 list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
586 __unregister_linger_request(osdc, req); 887 __unregister_linger_request(osdc, req);
587 dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid, 888 dout("requeued lingering %p tid %llu osd%d\n", req, req->r_tid,
588 osd->o_osd); 889 osd->o_osd);
@@ -654,8 +955,7 @@ static void put_osd(struct ceph_osd *osd)
654 if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) { 955 if (atomic_dec_and_test(&osd->o_ref) && osd->o_auth.authorizer) {
655 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth; 956 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
656 957
657 if (ac->ops && ac->ops->destroy_authorizer) 958 ceph_auth_destroy_authorizer(ac, osd->o_auth.authorizer);
658 ac->ops->destroy_authorizer(ac, osd->o_auth.authorizer);
659 kfree(osd); 959 kfree(osd);
660 } 960 }
661} 961}
@@ -820,14 +1120,6 @@ static void __register_request(struct ceph_osd_client *osdc,
820 } 1120 }
821} 1121}
822 1122
823static void register_request(struct ceph_osd_client *osdc,
824 struct ceph_osd_request *req)
825{
826 mutex_lock(&osdc->request_mutex);
827 __register_request(osdc, req);
828 mutex_unlock(&osdc->request_mutex);
829}
830
831/* 1123/*
832 * called under osdc->request_mutex 1124 * called under osdc->request_mutex
833 */ 1125 */
@@ -912,6 +1204,7 @@ void ceph_osdc_unregister_linger_request(struct ceph_osd_client *osdc,
912 mutex_lock(&osdc->request_mutex); 1204 mutex_lock(&osdc->request_mutex);
913 if (req->r_linger) { 1205 if (req->r_linger) {
914 __unregister_linger_request(osdc, req); 1206 __unregister_linger_request(osdc, req);
1207 req->r_linger = 0;
915 ceph_osdc_put_request(req); 1208 ceph_osdc_put_request(req);
916 } 1209 }
917 mutex_unlock(&osdc->request_mutex); 1210 mutex_unlock(&osdc->request_mutex);
@@ -952,8 +1245,8 @@ static int __map_request(struct ceph_osd_client *osdc,
952 int err; 1245 int err;
953 1246
954 dout("map_request %p tid %lld\n", req, req->r_tid); 1247 dout("map_request %p tid %lld\n", req, req->r_tid);
955 err = ceph_calc_object_layout(&pgid, req->r_oid, 1248 err = ceph_calc_ceph_pg(&pgid, req->r_oid, osdc->osdmap,
956 &req->r_file_layout, osdc->osdmap); 1249 ceph_file_layout_pg_pool(req->r_file_layout));
957 if (err) { 1250 if (err) {
958 list_move(&req->r_req_lru_item, &osdc->req_notarget); 1251 list_move(&req->r_req_lru_item, &osdc->req_notarget);
959 return err; 1252 return err;
@@ -1007,10 +1300,10 @@ static int __map_request(struct ceph_osd_client *osdc,
1007 1300
1008 if (req->r_osd) { 1301 if (req->r_osd) {
1009 __remove_osd_from_lru(req->r_osd); 1302 __remove_osd_from_lru(req->r_osd);
1010 list_add(&req->r_osd_item, &req->r_osd->o_requests); 1303 list_add_tail(&req->r_osd_item, &req->r_osd->o_requests);
1011 list_move(&req->r_req_lru_item, &osdc->req_unsent); 1304 list_move_tail(&req->r_req_lru_item, &osdc->req_unsent);
1012 } else { 1305 } else {
1013 list_move(&req->r_req_lru_item, &osdc->req_notarget); 1306 list_move_tail(&req->r_req_lru_item, &osdc->req_notarget);
1014 } 1307 }
1015 err = 1; /* osd or pg changed */ 1308 err = 1; /* osd or pg changed */
1016 1309
@@ -1045,8 +1338,14 @@ static void __send_request(struct ceph_osd_client *osdc,
1045 list_move_tail(&req->r_req_lru_item, &osdc->req_lru); 1338 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
1046 1339
1047 ceph_msg_get(req->r_request); /* send consumes a ref */ 1340 ceph_msg_get(req->r_request); /* send consumes a ref */
1048 ceph_con_send(&req->r_osd->o_con, req->r_request); 1341
1342 /* Mark the request unsafe if this is the first time it's being sent. */
1343
1344 if (!req->r_sent && req->r_unsafe_callback)
1345 req->r_unsafe_callback(req, true);
1049 req->r_sent = req->r_osd->o_incarnation; 1346 req->r_sent = req->r_osd->o_incarnation;
1347
1348 ceph_con_send(&req->r_osd->o_con, req->r_request);
1050} 1349}
1051 1350
1052/* 1351/*
@@ -1134,31 +1433,11 @@ static void handle_osds_timeout(struct work_struct *work)
1134 1433
1135static void complete_request(struct ceph_osd_request *req) 1434static void complete_request(struct ceph_osd_request *req)
1136{ 1435{
1137 if (req->r_safe_callback) 1436 if (req->r_unsafe_callback)
1138 req->r_safe_callback(req, NULL); 1437 req->r_unsafe_callback(req, false);
1139 complete_all(&req->r_safe_completion); /* fsync waiter */ 1438 complete_all(&req->r_safe_completion); /* fsync waiter */
1140} 1439}
1141 1440
1142static int __decode_pgid(void **p, void *end, struct ceph_pg *pgid)
1143{
1144 __u8 v;
1145
1146 ceph_decode_need(p, end, 1 + 8 + 4 + 4, bad);
1147 v = ceph_decode_8(p);
1148 if (v > 1) {
1149 pr_warning("do not understand pg encoding %d > 1", v);
1150 return -EINVAL;
1151 }
1152 pgid->pool = ceph_decode_64(p);
1153 pgid->seed = ceph_decode_32(p);
1154 *p += 4;
1155 return 0;
1156
1157bad:
1158 pr_warning("incomplete pg encoding");
1159 return -EINVAL;
1160}
1161
1162/* 1441/*
1163 * handle osd op reply. either call the callback if it is specified, 1442 * handle osd op reply. either call the callback if it is specified,
1164 * or do the completion to wake up the waiting thread. 1443 * or do the completion to wake up the waiting thread.
@@ -1170,7 +1449,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1170 struct ceph_osd_request *req; 1449 struct ceph_osd_request *req;
1171 u64 tid; 1450 u64 tid;
1172 int object_len; 1451 int object_len;
1173 int numops, payload_len, flags; 1452 unsigned int numops;
1453 int payload_len, flags;
1174 s32 result; 1454 s32 result;
1175 s32 retry_attempt; 1455 s32 retry_attempt;
1176 struct ceph_pg pg; 1456 struct ceph_pg pg;
@@ -1178,7 +1458,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1178 u32 reassert_epoch; 1458 u32 reassert_epoch;
1179 u64 reassert_version; 1459 u64 reassert_version;
1180 u32 osdmap_epoch; 1460 u32 osdmap_epoch;
1181 int i; 1461 int already_completed;
1462 u32 bytes;
1463 unsigned int i;
1182 1464
1183 tid = le64_to_cpu(msg->hdr.tid); 1465 tid = le64_to_cpu(msg->hdr.tid);
1184 dout("handle_reply %p tid %llu\n", msg, tid); 1466 dout("handle_reply %p tid %llu\n", msg, tid);
@@ -1191,7 +1473,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1191 ceph_decode_need(&p, end, object_len, bad); 1473 ceph_decode_need(&p, end, object_len, bad);
1192 p += object_len; 1474 p += object_len;
1193 1475
1194 err = __decode_pgid(&p, end, &pg); 1476 err = ceph_decode_pgid(&p, end, &pg);
1195 if (err) 1477 if (err)
1196 goto bad; 1478 goto bad;
1197 1479
@@ -1207,8 +1489,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1207 req = __lookup_request(osdc, tid); 1489 req = __lookup_request(osdc, tid);
1208 if (req == NULL) { 1490 if (req == NULL) {
1209 dout("handle_reply tid %llu dne\n", tid); 1491 dout("handle_reply tid %llu dne\n", tid);
1210 mutex_unlock(&osdc->request_mutex); 1492 goto bad_mutex;
1211 return;
1212 } 1493 }
1213 ceph_osdc_get_request(req); 1494 ceph_osdc_get_request(req);
1214 1495
@@ -1233,9 +1514,10 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1233 payload_len += len; 1514 payload_len += len;
1234 p += sizeof(*op); 1515 p += sizeof(*op);
1235 } 1516 }
1236 if (payload_len != le32_to_cpu(msg->hdr.data_len)) { 1517 bytes = le32_to_cpu(msg->hdr.data_len);
1518 if (payload_len != bytes) {
1237 pr_warning("sum of op payload lens %d != data_len %d", 1519 pr_warning("sum of op payload lens %d != data_len %d",
1238 payload_len, le32_to_cpu(msg->hdr.data_len)); 1520 payload_len, bytes);
1239 goto bad_put; 1521 goto bad_put;
1240 } 1522 }
1241 1523
@@ -1244,21 +1526,9 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1244 for (i = 0; i < numops; i++) 1526 for (i = 0; i < numops; i++)
1245 req->r_reply_op_result[i] = ceph_decode_32(&p); 1527 req->r_reply_op_result[i] = ceph_decode_32(&p);
1246 1528
1247 /*
1248 * if this connection filled our message, drop our reference now, to
1249 * avoid a (safe but slower) revoke later.
1250 */
1251 if (req->r_con_filling_msg == con && req->r_reply == msg) {
1252 dout(" dropping con_filling_msg ref %p\n", con);
1253 req->r_con_filling_msg = NULL;
1254 con->ops->put(con);
1255 }
1256
1257 if (!req->r_got_reply) { 1529 if (!req->r_got_reply) {
1258 unsigned int bytes;
1259 1530
1260 req->r_result = result; 1531 req->r_result = result;
1261 bytes = le32_to_cpu(msg->hdr.data_len);
1262 dout("handle_reply result %d bytes %d\n", req->r_result, 1532 dout("handle_reply result %d bytes %d\n", req->r_result,
1263 bytes); 1533 bytes);
1264 if (req->r_result == 0) 1534 if (req->r_result == 0)
@@ -1286,7 +1556,11 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
1286 ((flags & CEPH_OSD_FLAG_WRITE) == 0)) 1556 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
1287 __unregister_request(osdc, req); 1557 __unregister_request(osdc, req);
1288 1558
1559 already_completed = req->r_completed;
1560 req->r_completed = 1;
1289 mutex_unlock(&osdc->request_mutex); 1561 mutex_unlock(&osdc->request_mutex);
1562 if (already_completed)
1563 goto done;
1290 1564
1291 if (req->r_callback) 1565 if (req->r_callback)
1292 req->r_callback(req, msg); 1566 req->r_callback(req, msg);
@@ -1303,6 +1577,8 @@ done:
1303 1577
1304bad_put: 1578bad_put:
1305 ceph_osdc_put_request(req); 1579 ceph_osdc_put_request(req);
1580bad_mutex:
1581 mutex_unlock(&osdc->request_mutex);
1306bad: 1582bad:
1307 pr_err("corrupt osd_op_reply got %d %d\n", 1583 pr_err("corrupt osd_op_reply got %d %d\n",
1308 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len)); 1584 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len));
@@ -1736,6 +2012,104 @@ bad:
1736} 2012}
1737 2013
1738/* 2014/*
2015 * build new request AND message
2016 *
2017 */
2018void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off,
2019 struct ceph_snap_context *snapc, u64 snap_id,
2020 struct timespec *mtime)
2021{
2022 struct ceph_msg *msg = req->r_request;
2023 void *p;
2024 size_t msg_size;
2025 int flags = req->r_flags;
2026 u64 data_len;
2027 unsigned int i;
2028
2029 req->r_snapid = snap_id;
2030 req->r_snapc = ceph_get_snap_context(snapc);
2031
2032 /* encode request */
2033 msg->hdr.version = cpu_to_le16(4);
2034
2035 p = msg->front.iov_base;
2036 ceph_encode_32(&p, 1); /* client_inc is always 1 */
2037 req->r_request_osdmap_epoch = p;
2038 p += 4;
2039 req->r_request_flags = p;
2040 p += 4;
2041 if (req->r_flags & CEPH_OSD_FLAG_WRITE)
2042 ceph_encode_timespec(p, mtime);
2043 p += sizeof(struct ceph_timespec);
2044 req->r_request_reassert_version = p;
2045 p += sizeof(struct ceph_eversion); /* will get filled in */
2046
2047 /* oloc */
2048 ceph_encode_8(&p, 4);
2049 ceph_encode_8(&p, 4);
2050 ceph_encode_32(&p, 8 + 4 + 4);
2051 req->r_request_pool = p;
2052 p += 8;
2053 ceph_encode_32(&p, -1); /* preferred */
2054 ceph_encode_32(&p, 0); /* key len */
2055
2056 ceph_encode_8(&p, 1);
2057 req->r_request_pgid = p;
2058 p += 8 + 4;
2059 ceph_encode_32(&p, -1); /* preferred */
2060
2061 /* oid */
2062 ceph_encode_32(&p, req->r_oid_len);
2063 memcpy(p, req->r_oid, req->r_oid_len);
2064 dout("oid '%.*s' len %d\n", req->r_oid_len, req->r_oid, req->r_oid_len);
2065 p += req->r_oid_len;
2066
2067 /* ops--can imply data */
2068 ceph_encode_16(&p, (u16)req->r_num_ops);
2069 data_len = 0;
2070 for (i = 0; i < req->r_num_ops; i++) {
2071 data_len += osd_req_encode_op(req, p, i);
2072 p += sizeof(struct ceph_osd_op);
2073 }
2074
2075 /* snaps */
2076 ceph_encode_64(&p, req->r_snapid);
2077 ceph_encode_64(&p, req->r_snapc ? req->r_snapc->seq : 0);
2078 ceph_encode_32(&p, req->r_snapc ? req->r_snapc->num_snaps : 0);
2079 if (req->r_snapc) {
2080 for (i = 0; i < snapc->num_snaps; i++) {
2081 ceph_encode_64(&p, req->r_snapc->snaps[i]);
2082 }
2083 }
2084
2085 req->r_request_attempts = p;
2086 p += 4;
2087
2088 /* data */
2089 if (flags & CEPH_OSD_FLAG_WRITE) {
2090 u16 data_off;
2091
2092 /*
2093 * The header "data_off" is a hint to the receiver
2094 * allowing it to align received data into its
2095 * buffers such that there's no need to re-copy
2096 * it before writing it to disk (direct I/O).
2097 */
2098 data_off = (u16) (off & 0xffff);
2099 req->r_request->hdr.data_off = cpu_to_le16(data_off);
2100 }
2101 req->r_request->hdr.data_len = cpu_to_le32(data_len);
2102
2103 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
2104 msg_size = p - msg->front.iov_base;
2105 msg->front.iov_len = msg_size;
2106 msg->hdr.front_len = cpu_to_le32(msg_size);
2107
2108 dout("build_request msg_size was %d\n", (int)msg_size);
2109}
2110EXPORT_SYMBOL(ceph_osdc_build_request);
2111
2112/*
1739 * Register request, send initial attempt. 2113 * Register request, send initial attempt.
1740 */ 2114 */
1741int ceph_osdc_start_request(struct ceph_osd_client *osdc, 2115int ceph_osdc_start_request(struct ceph_osd_client *osdc,
@@ -1744,41 +2118,28 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1744{ 2118{
1745 int rc = 0; 2119 int rc = 0;
1746 2120
1747 req->r_request->pages = req->r_pages;
1748 req->r_request->nr_pages = req->r_num_pages;
1749#ifdef CONFIG_BLOCK
1750 req->r_request->bio = req->r_bio;
1751#endif
1752 req->r_request->trail = &req->r_trail;
1753
1754 register_request(osdc, req);
1755
1756 down_read(&osdc->map_sem); 2121 down_read(&osdc->map_sem);
1757 mutex_lock(&osdc->request_mutex); 2122 mutex_lock(&osdc->request_mutex);
1758 /* 2123 __register_request(osdc, req);
1759 * a racing kick_requests() may have sent the message for us 2124 req->r_sent = 0;
1760 * while we dropped request_mutex above, so only send now if 2125 req->r_got_reply = 0;
1761 * the request still hasn't been touched yet. 2126 req->r_completed = 0;
1762 */ 2127 rc = __map_request(osdc, req, 0);
1763 if (req->r_sent == 0) { 2128 if (rc < 0) {
1764 rc = __map_request(osdc, req, 0); 2129 if (nofail) {
1765 if (rc < 0) { 2130 dout("osdc_start_request failed map, "
1766 if (nofail) { 2131 " will retry %lld\n", req->r_tid);
1767 dout("osdc_start_request failed map, " 2132 rc = 0;
1768 " will retry %lld\n", req->r_tid);
1769 rc = 0;
1770 }
1771 goto out_unlock;
1772 }
1773 if (req->r_osd == NULL) {
1774 dout("send_request %p no up osds in pg\n", req);
1775 ceph_monc_request_next_osdmap(&osdc->client->monc);
1776 } else {
1777 __send_request(osdc, req);
1778 } 2133 }
1779 rc = 0; 2134 goto out_unlock;
1780 } 2135 }
1781 2136 if (req->r_osd == NULL) {
2137 dout("send_request %p no up osds in pg\n", req);
2138 ceph_monc_request_next_osdmap(&osdc->client->monc);
2139 } else {
2140 __send_queued(osdc);
2141 }
2142 rc = 0;
1782out_unlock: 2143out_unlock:
1783 mutex_unlock(&osdc->request_mutex); 2144 mutex_unlock(&osdc->request_mutex);
1784 up_read(&osdc->map_sem); 2145 up_read(&osdc->map_sem);
@@ -1940,18 +2301,22 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1940 2301
1941 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, 2302 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1942 vino.snap, off, *plen); 2303 vino.snap, off, *plen);
1943 req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 2304 req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1,
1944 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 2305 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1945 NULL, 0, truncate_seq, truncate_size, NULL, 2306 NULL, truncate_seq, truncate_size,
1946 false, page_align); 2307 false);
1947 if (IS_ERR(req)) 2308 if (IS_ERR(req))
1948 return PTR_ERR(req); 2309 return PTR_ERR(req);
1949 2310
1950 /* it may be a short read due to an object boundary */ 2311 /* it may be a short read due to an object boundary */
1951 req->r_pages = pages;
1952 2312
1953 dout("readpages final extent is %llu~%llu (%d pages align %d)\n", 2313 osd_req_op_extent_osd_data_pages(req, 0,
1954 off, *plen, req->r_num_pages, page_align); 2314 pages, *plen, page_align, false, false);
2315
2316 dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n",
2317 off, *plen, *plen, page_align);
2318
2319 ceph_osdc_build_request(req, off, NULL, vino.snap, NULL);
1955 2320
1956 rc = ceph_osdc_start_request(osdc, req, false); 2321 rc = ceph_osdc_start_request(osdc, req, false);
1957 if (!rc) 2322 if (!rc)
@@ -1978,20 +2343,21 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1978 int rc = 0; 2343 int rc = 0;
1979 int page_align = off & ~PAGE_MASK; 2344 int page_align = off & ~PAGE_MASK;
1980 2345
1981 BUG_ON(vino.snap != CEPH_NOSNAP); 2346 BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */
1982 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 2347 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1,
1983 CEPH_OSD_OP_WRITE, 2348 CEPH_OSD_OP_WRITE,
1984 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 2349 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
1985 snapc, 0, 2350 snapc, truncate_seq, truncate_size,
1986 truncate_seq, truncate_size, mtime, 2351 true);
1987 true, page_align);
1988 if (IS_ERR(req)) 2352 if (IS_ERR(req))
1989 return PTR_ERR(req); 2353 return PTR_ERR(req);
1990 2354
1991 /* it may be a short write due to an object boundary */ 2355 /* it may be a short write due to an object boundary */
1992 req->r_pages = pages; 2356 osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align,
1993 dout("writepages %llu~%llu (%d pages)\n", off, len, 2357 false, false);
1994 req->r_num_pages); 2358 dout("writepages %llu~%llu (%llu bytes)\n", off, len, len);
2359
2360 ceph_osdc_build_request(req, off, snapc, CEPH_NOSNAP, mtime);
1995 2361
1996 rc = ceph_osdc_start_request(osdc, req, true); 2362 rc = ceph_osdc_start_request(osdc, req, true);
1997 if (!rc) 2363 if (!rc)
@@ -2005,6 +2371,26 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
2005} 2371}
2006EXPORT_SYMBOL(ceph_osdc_writepages); 2372EXPORT_SYMBOL(ceph_osdc_writepages);
2007 2373
2374int ceph_osdc_setup(void)
2375{
2376 BUG_ON(ceph_osd_request_cache);
2377 ceph_osd_request_cache = kmem_cache_create("ceph_osd_request",
2378 sizeof (struct ceph_osd_request),
2379 __alignof__(struct ceph_osd_request),
2380 0, NULL);
2381
2382 return ceph_osd_request_cache ? 0 : -ENOMEM;
2383}
2384EXPORT_SYMBOL(ceph_osdc_setup);
2385
2386void ceph_osdc_cleanup(void)
2387{
2388 BUG_ON(!ceph_osd_request_cache);
2389 kmem_cache_destroy(ceph_osd_request_cache);
2390 ceph_osd_request_cache = NULL;
2391}
2392EXPORT_SYMBOL(ceph_osdc_cleanup);
2393
2008/* 2394/*
2009 * handle incoming message 2395 * handle incoming message
2010 */ 2396 */
@@ -2064,13 +2450,10 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2064 goto out; 2450 goto out;
2065 } 2451 }
2066 2452
2067 if (req->r_con_filling_msg) { 2453 if (req->r_reply->con)
2068 dout("%s revoking msg %p from old con %p\n", __func__, 2454 dout("%s revoking msg %p from old con %p\n", __func__,
2069 req->r_reply, req->r_con_filling_msg); 2455 req->r_reply, req->r_reply->con);
2070 ceph_msg_revoke_incoming(req->r_reply); 2456 ceph_msg_revoke_incoming(req->r_reply);
2071 req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
2072 req->r_con_filling_msg = NULL;
2073 }
2074 2457
2075 if (front > req->r_reply->front.iov_len) { 2458 if (front > req->r_reply->front.iov_len) {
2076 pr_warning("get_reply front %d > preallocated %d\n", 2459 pr_warning("get_reply front %d > preallocated %d\n",
@@ -2084,26 +2467,29 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
2084 m = ceph_msg_get(req->r_reply); 2467 m = ceph_msg_get(req->r_reply);
2085 2468
2086 if (data_len > 0) { 2469 if (data_len > 0) {
2087 int want = calc_pages_for(req->r_page_alignment, data_len); 2470 struct ceph_osd_data *osd_data;
2088 2471
2089 if (req->r_pages && unlikely(req->r_num_pages < want)) { 2472 /*
2090 pr_warning("tid %lld reply has %d bytes %d pages, we" 2473 * XXX This is assuming there is only one op containing
2091 " had only %d pages ready\n", tid, data_len, 2474 * XXX page data. Probably OK for reads, but this
2092 want, req->r_num_pages); 2475 * XXX ought to be done more generally.
2093 *skip = 1; 2476 */
2094 ceph_msg_put(m); 2477 osd_data = osd_req_op_extent_osd_data(req, 0);
2095 m = NULL; 2478 if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
2096 goto out; 2479 if (osd_data->pages &&
2480 unlikely(osd_data->length < data_len)) {
2481
2482 pr_warning("tid %lld reply has %d bytes "
2483 "we had only %llu bytes ready\n",
2484 tid, data_len, osd_data->length);
2485 *skip = 1;
2486 ceph_msg_put(m);
2487 m = NULL;
2488 goto out;
2489 }
2097 } 2490 }
2098 m->pages = req->r_pages;
2099 m->nr_pages = req->r_num_pages;
2100 m->page_alignment = req->r_page_alignment;
2101#ifdef CONFIG_BLOCK
2102 m->bio = req->r_bio;
2103#endif
2104 } 2491 }
2105 *skip = 0; 2492 *skip = 0;
2106 req->r_con_filling_msg = con->ops->get(con);
2107 dout("get_reply tid %lld %p\n", tid, m); 2493 dout("get_reply tid %lld %p\n", tid, m);
2108 2494
2109out: 2495out:
@@ -2168,13 +2554,17 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con,
2168 struct ceph_auth_handshake *auth = &o->o_auth; 2554 struct ceph_auth_handshake *auth = &o->o_auth;
2169 2555
2170 if (force_new && auth->authorizer) { 2556 if (force_new && auth->authorizer) {
2171 if (ac->ops && ac->ops->destroy_authorizer) 2557 ceph_auth_destroy_authorizer(ac, auth->authorizer);
2172 ac->ops->destroy_authorizer(ac, auth->authorizer);
2173 auth->authorizer = NULL; 2558 auth->authorizer = NULL;
2174 } 2559 }
2175 if (!auth->authorizer && ac->ops && ac->ops->create_authorizer) { 2560 if (!auth->authorizer) {
2176 int ret = ac->ops->create_authorizer(ac, CEPH_ENTITY_TYPE_OSD, 2561 int ret = ceph_auth_create_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
2177 auth); 2562 auth);
2563 if (ret)
2564 return ERR_PTR(ret);
2565 } else {
2566 int ret = ceph_auth_update_authorizer(ac, CEPH_ENTITY_TYPE_OSD,
2567 auth);
2178 if (ret) 2568 if (ret)
2179 return ERR_PTR(ret); 2569 return ERR_PTR(ret);
2180 } 2570 }
@@ -2190,11 +2580,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
2190 struct ceph_osd_client *osdc = o->o_osdc; 2580 struct ceph_osd_client *osdc = o->o_osdc;
2191 struct ceph_auth_client *ac = osdc->client->monc.auth; 2581 struct ceph_auth_client *ac = osdc->client->monc.auth;
2192 2582
2193 /* 2583 return ceph_auth_verify_authorizer_reply(ac, o->o_auth.authorizer, len);
2194 * XXX If ac->ops or ac->ops->verify_authorizer_reply is null,
2195 * XXX which do we do: succeed or fail?
2196 */
2197 return ac->ops->verify_authorizer_reply(ac, o->o_auth.authorizer, len);
2198} 2584}
2199 2585
2200static int invalidate_authorizer(struct ceph_connection *con) 2586static int invalidate_authorizer(struct ceph_connection *con)
@@ -2203,9 +2589,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
2203 struct ceph_osd_client *osdc = o->o_osdc; 2589 struct ceph_osd_client *osdc = o->o_osdc;
2204 struct ceph_auth_client *ac = osdc->client->monc.auth; 2590 struct ceph_auth_client *ac = osdc->client->monc.auth;
2205 2591
2206 if (ac->ops && ac->ops->invalidate_authorizer) 2592 ceph_auth_invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
2207 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
2208
2209 return ceph_monc_validate_auth(&osdc->client->monc); 2593 return ceph_monc_validate_auth(&osdc->client->monc);
2210} 2594}
2211 2595
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 4543b9aba40c..603ddd92db19 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -654,24 +654,6 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
654 return 0; 654 return 0;
655} 655}
656 656
657static int __decode_pgid(void **p, void *end, struct ceph_pg *pg)
658{
659 u8 v;
660
661 ceph_decode_need(p, end, 1+8+4+4, bad);
662 v = ceph_decode_8(p);
663 if (v != 1)
664 goto bad;
665 pg->pool = ceph_decode_64(p);
666 pg->seed = ceph_decode_32(p);
667 *p += 4; /* skip preferred */
668 return 0;
669
670bad:
671 dout("error decoding pgid\n");
672 return -EINVAL;
673}
674
675/* 657/*
676 * decode a full map. 658 * decode a full map.
677 */ 659 */
@@ -765,7 +747,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
765 struct ceph_pg pgid; 747 struct ceph_pg pgid;
766 struct ceph_pg_mapping *pg; 748 struct ceph_pg_mapping *pg;
767 749
768 err = __decode_pgid(p, end, &pgid); 750 err = ceph_decode_pgid(p, end, &pgid);
769 if (err) 751 if (err)
770 goto bad; 752 goto bad;
771 ceph_decode_need(p, end, sizeof(u32), bad); 753 ceph_decode_need(p, end, sizeof(u32), bad);
@@ -983,7 +965,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
983 struct ceph_pg pgid; 965 struct ceph_pg pgid;
984 u32 pglen; 966 u32 pglen;
985 967
986 err = __decode_pgid(p, end, &pgid); 968 err = ceph_decode_pgid(p, end, &pgid);
987 if (err) 969 if (err)
988 goto bad; 970 goto bad;
989 ceph_decode_need(p, end, sizeof(u32), bad); 971 ceph_decode_need(p, end, sizeof(u32), bad);
@@ -1111,27 +1093,22 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping);
1111 * calculate an object layout (i.e. pgid) from an oid, 1093 * calculate an object layout (i.e. pgid) from an oid,
1112 * file_layout, and osdmap 1094 * file_layout, and osdmap
1113 */ 1095 */
1114int ceph_calc_object_layout(struct ceph_pg *pg, 1096int ceph_calc_ceph_pg(struct ceph_pg *pg, const char *oid,
1115 const char *oid, 1097 struct ceph_osdmap *osdmap, uint64_t pool)
1116 struct ceph_file_layout *fl,
1117 struct ceph_osdmap *osdmap)
1118{ 1098{
1119 unsigned int num, num_mask; 1099 struct ceph_pg_pool_info *pool_info;
1120 struct ceph_pg_pool_info *pool;
1121 1100
1122 BUG_ON(!osdmap); 1101 BUG_ON(!osdmap);
1123 pg->pool = le32_to_cpu(fl->fl_pg_pool); 1102 pool_info = __lookup_pg_pool(&osdmap->pg_pools, pool);
1124 pool = __lookup_pg_pool(&osdmap->pg_pools, pg->pool); 1103 if (!pool_info)
1125 if (!pool)
1126 return -EIO; 1104 return -EIO;
1127 pg->seed = ceph_str_hash(pool->object_hash, oid, strlen(oid)); 1105 pg->pool = pool;
1128 num = pool->pg_num; 1106 pg->seed = ceph_str_hash(pool_info->object_hash, oid, strlen(oid));
1129 num_mask = pool->pg_num_mask;
1130 1107
1131 dout("calc_object_layout '%s' pgid %lld.%x\n", oid, pg->pool, pg->seed); 1108 dout("%s '%s' pgid %lld.%x\n", __func__, oid, pg->pool, pg->seed);
1132 return 0; 1109 return 0;
1133} 1110}
1134EXPORT_SYMBOL(ceph_calc_object_layout); 1111EXPORT_SYMBOL(ceph_calc_ceph_pg);
1135 1112
1136/* 1113/*
1137 * Calculate raw osd vector for the given pgid. Return pointer to osd 1114 * Calculate raw osd vector for the given pgid. Return pointer to osd
diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c
new file mode 100644
index 000000000000..154683f5f14c
--- /dev/null
+++ b/net/ceph/snapshot.c
@@ -0,0 +1,78 @@
1/*
2 * snapshot.c Ceph snapshot context utility routines (part of libceph)
3 *
4 * Copyright (C) 2013 Inktank Storage, Inc.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * version 2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
18 * 02110-1301, USA.
19 */
20
21#include <stddef.h>
22
23#include <linux/types.h>
24#include <linux/export.h>
25#include <linux/ceph/libceph.h>
26
27/*
28 * Ceph snapshot contexts are reference counted objects, and the
29 * returned structure holds a single reference. Acquire additional
30 * references with ceph_get_snap_context(), and release them with
31 * ceph_put_snap_context(). When the reference count reaches zero
32 * the entire structure is freed.
33 */
34
35/*
36 * Create a new ceph snapshot context large enough to hold the
37 * indicated number of snapshot ids (which can be 0). Caller has
38 * to fill in snapc->seq and snapc->snaps[0..snap_count-1].
39 *
40 * Returns a null pointer if an error occurs.
41 */
42struct ceph_snap_context *ceph_create_snap_context(u32 snap_count,
43 gfp_t gfp_flags)
44{
45 struct ceph_snap_context *snapc;
46 size_t size;
47
48 size = sizeof (struct ceph_snap_context);
49 size += snap_count * sizeof (snapc->snaps[0]);
50 snapc = kzalloc(size, gfp_flags);
51 if (!snapc)
52 return NULL;
53
54 atomic_set(&snapc->nref, 1);
55 snapc->num_snaps = snap_count;
56
57 return snapc;
58}
59EXPORT_SYMBOL(ceph_create_snap_context);
60
61struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc)
62{
63 if (sc)
64 atomic_inc(&sc->nref);
65 return sc;
66}
67EXPORT_SYMBOL(ceph_get_snap_context);
68
69void ceph_put_snap_context(struct ceph_snap_context *sc)
70{
71 if (!sc)
72 return;
73 if (atomic_dec_and_test(&sc->nref)) {
74 /*printk(" deleting snap_context %p\n", sc);*/
75 kfree(sc);
76 }
77}
78EXPORT_SYMBOL(ceph_put_snap_context);
diff --git a/net/core/dev.c b/net/core/dev.c
index 40b1fadaf637..fc1e289397f5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2213,6 +2213,17 @@ __be16 skb_network_protocol(struct sk_buff *skb)
2213 __be16 type = skb->protocol; 2213 __be16 type = skb->protocol;
2214 int vlan_depth = ETH_HLEN; 2214 int vlan_depth = ETH_HLEN;
2215 2215
2216 /* Tunnel gso handlers can set protocol to ethernet. */
2217 if (type == htons(ETH_P_TEB)) {
2218 struct ethhdr *eth;
2219
2220 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2221 return 0;
2222
2223 eth = (struct ethhdr *)skb_mac_header(skb);
2224 type = eth->h_proto;
2225 }
2226
2216 while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) { 2227 while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2217 struct vlan_hdr *vh; 2228 struct vlan_hdr *vh;
2218 2229
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 7e7aeb01de45..de178e462682 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -75,31 +75,6 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr_storage *a
75 75
76/* 76/*
77 * Copy kernel to iovec. Returns -EFAULT on error. 77 * Copy kernel to iovec. Returns -EFAULT on error.
78 *
79 * Note: this modifies the original iovec.
80 */
81
82int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len)
83{
84 while (len > 0) {
85 if (iov->iov_len) {
86 int copy = min_t(unsigned int, iov->iov_len, len);
87 if (copy_to_user(iov->iov_base, kdata, copy))
88 return -EFAULT;
89 kdata += copy;
90 len -= copy;
91 iov->iov_len -= copy;
92 iov->iov_base += copy;
93 }
94 iov++;
95 }
96
97 return 0;
98}
99EXPORT_SYMBOL(memcpy_toiovec);
100
101/*
102 * Copy kernel to iovec. Returns -EFAULT on error.
103 */ 78 */
104 79
105int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata, 80int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
@@ -125,31 +100,6 @@ int memcpy_toiovecend(const struct iovec *iov, unsigned char *kdata,
125EXPORT_SYMBOL(memcpy_toiovecend); 100EXPORT_SYMBOL(memcpy_toiovecend);
126 101
127/* 102/*
128 * Copy iovec to kernel. Returns -EFAULT on error.
129 *
130 * Note: this modifies the original iovec.
131 */
132
133int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len)
134{
135 while (len > 0) {
136 if (iov->iov_len) {
137 int copy = min_t(unsigned int, len, iov->iov_len);
138 if (copy_from_user(kdata, iov->iov_base, copy))
139 return -EFAULT;
140 len -= copy;
141 kdata += copy;
142 iov->iov_base += copy;
143 iov->iov_len -= copy;
144 }
145 iov++;
146 }
147
148 return 0;
149}
150EXPORT_SYMBOL(memcpy_fromiovec);
151
152/*
153 * Copy iovec from kernel. Returns -EFAULT on error. 103 * Copy iovec from kernel. Returns -EFAULT on error.
154 */ 104 */
155 105
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 7427ab5e27d8..981fed397d1d 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -606,21 +606,11 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
606 return sprintf(buf, "%lu\n", val); 606 return sprintf(buf, "%lu\n", val);
607} 607}
608 608
609static void rps_dev_flow_table_release_work(struct work_struct *work)
610{
611 struct rps_dev_flow_table *table = container_of(work,
612 struct rps_dev_flow_table, free_work);
613
614 vfree(table);
615}
616
617static void rps_dev_flow_table_release(struct rcu_head *rcu) 609static void rps_dev_flow_table_release(struct rcu_head *rcu)
618{ 610{
619 struct rps_dev_flow_table *table = container_of(rcu, 611 struct rps_dev_flow_table *table = container_of(rcu,
620 struct rps_dev_flow_table, rcu); 612 struct rps_dev_flow_table, rcu);
621 613 vfree(table);
622 INIT_WORK(&table->free_work, rps_dev_flow_table_release_work);
623 schedule_work(&table->free_work);
624} 614}
625 615
626static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, 616static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index a5802a8b652f..cec074be8c43 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -206,7 +206,7 @@ static void netpoll_poll_dev(struct net_device *dev)
206 * the dev_open/close paths use this to block netpoll activity 206 * the dev_open/close paths use this to block netpoll activity
207 * while changing device state 207 * while changing device state
208 */ 208 */
209 if (!down_trylock(&ni->dev_lock)) 209 if (down_trylock(&ni->dev_lock))
210 return; 210 return;
211 211
212 if (!netif_running(dev)) { 212 if (!netif_running(dev)) {
diff --git a/net/core/sock.c b/net/core/sock.c
index d4f4cea726e7..6ba327da79e1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1217,18 +1217,6 @@ static void sock_copy(struct sock *nsk, const struct sock *osk)
1217#endif 1217#endif
1218} 1218}
1219 1219
1220/*
1221 * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes
1222 * un-modified. Special care is taken when initializing object to zero.
1223 */
1224static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1225{
1226 if (offsetof(struct sock, sk_node.next) != 0)
1227 memset(sk, 0, offsetof(struct sock, sk_node.next));
1228 memset(&sk->sk_node.pprev, 0,
1229 size - offsetof(struct sock, sk_node.pprev));
1230}
1231
1232void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) 1220void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1233{ 1221{
1234 unsigned long nulls1, nulls2; 1222 unsigned long nulls1, nulls2;
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ff06b7543d9f..49616fed9340 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -125,7 +125,6 @@ struct tnode {
125 unsigned int empty_children; /* KEYLENGTH bits needed */ 125 unsigned int empty_children; /* KEYLENGTH bits needed */
126 union { 126 union {
127 struct rcu_head rcu; 127 struct rcu_head rcu;
128 struct work_struct work;
129 struct tnode *tnode_free; 128 struct tnode *tnode_free;
130 }; 129 };
131 struct rt_trie_node __rcu *child[0]; 130 struct rt_trie_node __rcu *child[0];
@@ -383,12 +382,6 @@ static struct tnode *tnode_alloc(size_t size)
383 return vzalloc(size); 382 return vzalloc(size);
384} 383}
385 384
386static void __tnode_vfree(struct work_struct *arg)
387{
388 struct tnode *tn = container_of(arg, struct tnode, work);
389 vfree(tn);
390}
391
392static void __tnode_free_rcu(struct rcu_head *head) 385static void __tnode_free_rcu(struct rcu_head *head)
393{ 386{
394 struct tnode *tn = container_of(head, struct tnode, rcu); 387 struct tnode *tn = container_of(head, struct tnode, rcu);
@@ -397,10 +390,8 @@ static void __tnode_free_rcu(struct rcu_head *head)
397 390
398 if (size <= PAGE_SIZE) 391 if (size <= PAGE_SIZE)
399 kfree(tn); 392 kfree(tn);
400 else { 393 else
401 INIT_WORK(&tn->work, __tnode_vfree); 394 vfree(tn);
402 schedule_work(&tn->work);
403 }
404} 395}
405 396
406static inline void tnode_free(struct tnode *tn) 397static inline void tnode_free(struct tnode *tn)
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index cc22363965d2..b2e805af9b87 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -150,13 +150,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
150 csum = false; 150 csum = false;
151 151
152 /* setup inner skb. */ 152 /* setup inner skb. */
153 if (greh->protocol == htons(ETH_P_TEB)) { 153 skb->protocol = greh->protocol;
154 struct ethhdr *eth = (struct ethhdr *)skb_inner_mac_header(skb);
155 skb->protocol = eth->h_proto;
156 } else {
157 skb->protocol = greh->protocol;
158 }
159
160 skb->encapsulation = 0; 154 skb->encapsulation = 0;
161 155
162 if (unlikely(!pskb_may_pull(skb, ghl))) 156 if (unlikely(!pskb_may_pull(skb, ghl)))
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index e97d66a1fdde..7e06641e36ae 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -305,6 +305,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
305 setup_timer(&q->timer, f->frag_expire, (unsigned long)q); 305 setup_timer(&q->timer, f->frag_expire, (unsigned long)q);
306 spin_lock_init(&q->lock); 306 spin_lock_init(&q->lock);
307 atomic_set(&q->refcnt, 1); 307 atomic_set(&q->refcnt, 1);
308 INIT_LIST_HEAD(&q->lru_list);
308 309
309 return q; 310 return q;
310} 311}
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index c625e4dad4b0..2a83591492dd 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -235,7 +235,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
235 */ 235 */
236 struct net *net = dev_net(skb->dev); 236 struct net *net = dev_net(skb->dev);
237 struct ip_tunnel_net *itn; 237 struct ip_tunnel_net *itn;
238 const struct iphdr *iph = (const struct iphdr *)skb->data; 238 const struct iphdr *iph;
239 const int type = icmp_hdr(skb)->type; 239 const int type = icmp_hdr(skb)->type;
240 const int code = icmp_hdr(skb)->code; 240 const int code = icmp_hdr(skb)->code;
241 struct ip_tunnel *t; 241 struct ip_tunnel *t;
@@ -281,6 +281,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
281 else 281 else
282 itn = net_generic(net, ipgre_net_id); 282 itn = net_generic(net, ipgre_net_id);
283 283
284 iph = (const struct iphdr *)skb->data;
284 t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags, 285 t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
285 iph->daddr, iph->saddr, tpi.key); 286 iph->daddr, iph->saddr, tpi.key);
286 287
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 147abf5275aa..4bcabf3ab4ca 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -84,7 +84,7 @@ int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
84EXPORT_SYMBOL(sysctl_ip_default_ttl); 84EXPORT_SYMBOL(sysctl_ip_default_ttl);
85 85
86/* Generate a checksum for an outgoing IP datagram. */ 86/* Generate a checksum for an outgoing IP datagram. */
87__inline__ void ip_send_check(struct iphdr *iph) 87void ip_send_check(struct iphdr *iph)
88{ 88{
89 iph->check = 0; 89 iph->check = 0;
90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); 90 iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c
index f8a222cb6448..cf08218ddbcf 100644
--- a/net/ipv4/netfilter/ipt_ULOG.c
+++ b/net/ipv4/netfilter/ipt_ULOG.c
@@ -162,7 +162,8 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size)
162 return skb; 162 return skb;
163} 163}
164 164
165static void ipt_ulog_packet(unsigned int hooknum, 165static void ipt_ulog_packet(struct net *net,
166 unsigned int hooknum,
166 const struct sk_buff *skb, 167 const struct sk_buff *skb,
167 const struct net_device *in, 168 const struct net_device *in,
168 const struct net_device *out, 169 const struct net_device *out,
@@ -174,7 +175,6 @@ static void ipt_ulog_packet(unsigned int hooknum,
174 size_t size, copy_len; 175 size_t size, copy_len;
175 struct nlmsghdr *nlh; 176 struct nlmsghdr *nlh;
176 struct timeval tv; 177 struct timeval tv;
177 struct net *net = dev_net(in ? in : out);
178 struct ulog_net *ulog = ulog_pernet(net); 178 struct ulog_net *ulog = ulog_pernet(net);
179 179
180 /* ffs == find first bit set, necessary because userspace 180 /* ffs == find first bit set, necessary because userspace
@@ -291,12 +291,15 @@ alloc_failure:
291static unsigned int 291static unsigned int
292ulog_tg(struct sk_buff *skb, const struct xt_action_param *par) 292ulog_tg(struct sk_buff *skb, const struct xt_action_param *par)
293{ 293{
294 ipt_ulog_packet(par->hooknum, skb, par->in, par->out, 294 struct net *net = dev_net(par->in ? par->in : par->out);
295
296 ipt_ulog_packet(net, par->hooknum, skb, par->in, par->out,
295 par->targinfo, NULL); 297 par->targinfo, NULL);
296 return XT_CONTINUE; 298 return XT_CONTINUE;
297} 299}
298 300
299static void ipt_logfn(u_int8_t pf, 301static void ipt_logfn(struct net *net,
302 u_int8_t pf,
300 unsigned int hooknum, 303 unsigned int hooknum,
301 const struct sk_buff *skb, 304 const struct sk_buff *skb,
302 const struct net_device *in, 305 const struct net_device *in,
@@ -318,7 +321,7 @@ static void ipt_logfn(u_int8_t pf,
318 strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); 321 strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix));
319 } 322 }
320 323
321 ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); 324 ipt_ulog_packet(net, hooknum, skb, in, out, &loginfo, prefix);
322} 325}
323 326
324static int ulog_tg_check(const struct xt_tgchk_param *par) 327static int ulog_tg_check(const struct xt_tgchk_param *par)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index dcb116dde216..ab450c099aa4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2887,6 +2887,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
2887 unsigned int mss; 2887 unsigned int mss;
2888 struct sk_buff *gso_skb = skb; 2888 struct sk_buff *gso_skb = skb;
2889 __sum16 newcheck; 2889 __sum16 newcheck;
2890 bool ooo_okay, copy_destructor;
2890 2891
2891 if (!pskb_may_pull(skb, sizeof(*th))) 2892 if (!pskb_may_pull(skb, sizeof(*th)))
2892 goto out; 2893 goto out;
@@ -2927,10 +2928,18 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
2927 goto out; 2928 goto out;
2928 } 2929 }
2929 2930
2931 copy_destructor = gso_skb->destructor == tcp_wfree;
2932 ooo_okay = gso_skb->ooo_okay;
2933 /* All segments but the first should have ooo_okay cleared */
2934 skb->ooo_okay = 0;
2935
2930 segs = skb_segment(skb, features); 2936 segs = skb_segment(skb, features);
2931 if (IS_ERR(segs)) 2937 if (IS_ERR(segs))
2932 goto out; 2938 goto out;
2933 2939
2940 /* Only first segment might have ooo_okay set */
2941 segs->ooo_okay = ooo_okay;
2942
2934 delta = htonl(oldlen + (thlen + mss)); 2943 delta = htonl(oldlen + (thlen + mss));
2935 2944
2936 skb = segs; 2945 skb = segs;
@@ -2950,6 +2959,17 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
2950 thlen, skb->csum)); 2959 thlen, skb->csum));
2951 2960
2952 seq += mss; 2961 seq += mss;
2962 if (copy_destructor) {
2963 skb->destructor = gso_skb->destructor;
2964 skb->sk = gso_skb->sk;
2965 /* {tcp|sock}_wfree() use exact truesize accounting :
2966 * sum(skb->truesize) MUST be exactly be gso_skb->truesize
2967 * So we account mss bytes of 'true size' for each segment.
2968 * The last segment will contain the remaining.
2969 */
2970 skb->truesize = mss;
2971 gso_skb->truesize -= mss;
2972 }
2953 skb = skb->next; 2973 skb = skb->next;
2954 th = tcp_hdr(skb); 2974 th = tcp_hdr(skb);
2955 2975
@@ -2962,7 +2982,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb,
2962 * is freed at TX completion, and not right now when gso_skb 2982 * is freed at TX completion, and not right now when gso_skb
2963 * is freed by GSO engine 2983 * is freed by GSO engine
2964 */ 2984 */
2965 if (gso_skb->destructor == tcp_wfree) { 2985 if (copy_destructor) {
2966 swap(gso_skb->sk, skb->sk); 2986 swap(gso_skb->sk, skb->sk);
2967 swap(gso_skb->destructor, skb->destructor); 2987 swap(gso_skb->destructor, skb->destructor);
2968 swap(gso_skb->truesize, skb->truesize); 2988 swap(gso_skb->truesize, skb->truesize);
@@ -3269,8 +3289,11 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
3269 3289
3270 for (i = 0; i < shi->nr_frags; ++i) { 3290 for (i = 0; i < shi->nr_frags; ++i) {
3271 const struct skb_frag_struct *f = &shi->frags[i]; 3291 const struct skb_frag_struct *f = &shi->frags[i];
3272 struct page *page = skb_frag_page(f); 3292 unsigned int offset = f->page_offset;
3273 sg_set_page(&sg, page, skb_frag_size(f), f->page_offset); 3293 struct page *page = skb_frag_page(f) + (offset >> PAGE_SHIFT);
3294
3295 sg_set_page(&sg, page, skb_frag_size(f),
3296 offset_in_page(offset));
3274 if (crypto_hash_update(desc, &sg, skb_frag_size(f))) 3297 if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
3275 return 1; 3298 return 1;
3276 } 3299 }
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 08bbe6096528..9c6225780bd5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2743,8 +2743,8 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
2743 * tcp_xmit_retransmit_queue(). 2743 * tcp_xmit_retransmit_queue().
2744 */ 2744 */
2745static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, 2745static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2746 int prior_sacked, bool is_dupack, 2746 int prior_sacked, int prior_packets,
2747 int flag) 2747 bool is_dupack, int flag)
2748{ 2748{
2749 struct inet_connection_sock *icsk = inet_csk(sk); 2749 struct inet_connection_sock *icsk = inet_csk(sk);
2750 struct tcp_sock *tp = tcp_sk(sk); 2750 struct tcp_sock *tp = tcp_sk(sk);
@@ -2804,7 +2804,8 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2804 tcp_add_reno_sack(sk); 2804 tcp_add_reno_sack(sk);
2805 } else 2805 } else
2806 do_lost = tcp_try_undo_partial(sk, pkts_acked); 2806 do_lost = tcp_try_undo_partial(sk, pkts_acked);
2807 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; 2807 newly_acked_sacked = prior_packets - tp->packets_out +
2808 tp->sacked_out - prior_sacked;
2808 break; 2809 break;
2809 case TCP_CA_Loss: 2810 case TCP_CA_Loss:
2810 tcp_process_loss(sk, flag, is_dupack); 2811 tcp_process_loss(sk, flag, is_dupack);
@@ -2818,7 +2819,8 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2818 if (is_dupack) 2819 if (is_dupack)
2819 tcp_add_reno_sack(sk); 2820 tcp_add_reno_sack(sk);
2820 } 2821 }
2821 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; 2822 newly_acked_sacked = prior_packets - tp->packets_out +
2823 tp->sacked_out - prior_sacked;
2822 2824
2823 if (icsk->icsk_ca_state <= TCP_CA_Disorder) 2825 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
2824 tcp_try_undo_dsack(sk); 2826 tcp_try_undo_dsack(sk);
@@ -3330,9 +3332,10 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3330 bool is_dupack = false; 3332 bool is_dupack = false;
3331 u32 prior_in_flight; 3333 u32 prior_in_flight;
3332 u32 prior_fackets; 3334 u32 prior_fackets;
3333 int prior_packets; 3335 int prior_packets = tp->packets_out;
3334 int prior_sacked = tp->sacked_out; 3336 int prior_sacked = tp->sacked_out;
3335 int pkts_acked = 0; 3337 int pkts_acked = 0;
3338 int previous_packets_out = 0;
3336 3339
3337 /* If the ack is older than previous acks 3340 /* If the ack is older than previous acks
3338 * then we can probably ignore it. 3341 * then we can probably ignore it.
@@ -3403,14 +3406,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3403 sk->sk_err_soft = 0; 3406 sk->sk_err_soft = 0;
3404 icsk->icsk_probes_out = 0; 3407 icsk->icsk_probes_out = 0;
3405 tp->rcv_tstamp = tcp_time_stamp; 3408 tp->rcv_tstamp = tcp_time_stamp;
3406 prior_packets = tp->packets_out;
3407 if (!prior_packets) 3409 if (!prior_packets)
3408 goto no_queue; 3410 goto no_queue;
3409 3411
3410 /* See if we can take anything off of the retransmit queue. */ 3412 /* See if we can take anything off of the retransmit queue. */
3413 previous_packets_out = tp->packets_out;
3411 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); 3414 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
3412 3415
3413 pkts_acked = prior_packets - tp->packets_out; 3416 pkts_acked = previous_packets_out - tp->packets_out;
3414 3417
3415 if (tcp_ack_is_dubious(sk, flag)) { 3418 if (tcp_ack_is_dubious(sk, flag)) {
3416 /* Advance CWND, if state allows this. */ 3419 /* Advance CWND, if state allows this. */
@@ -3418,7 +3421,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3418 tcp_cong_avoid(sk, ack, prior_in_flight); 3421 tcp_cong_avoid(sk, ack, prior_in_flight);
3419 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3422 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3420 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, 3423 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3421 is_dupack, flag); 3424 prior_packets, is_dupack, flag);
3422 } else { 3425 } else {
3423 if (flag & FLAG_DATA_ACKED) 3426 if (flag & FLAG_DATA_ACKED)
3424 tcp_cong_avoid(sk, ack, prior_in_flight); 3427 tcp_cong_avoid(sk, ack, prior_in_flight);
@@ -3441,7 +3444,7 @@ no_queue:
3441 /* If data was DSACKed, see if we can undo a cwnd reduction. */ 3444 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3442 if (flag & FLAG_DSACKING_ACK) 3445 if (flag & FLAG_DSACKING_ACK)
3443 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, 3446 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3444 is_dupack, flag); 3447 prior_packets, is_dupack, flag);
3445 /* If this ack opens up a zero window, clear backoff. It was 3448 /* If this ack opens up a zero window, clear backoff. It was
3446 * being used to time the probes, and is probably far higher than 3449 * being used to time the probes, and is probably far higher than
3447 * it needs to be for normal retransmission. 3450 * it needs to be for normal retransmission.
@@ -3464,7 +3467,7 @@ old_ack:
3464 if (TCP_SKB_CB(skb)->sacked) { 3467 if (TCP_SKB_CB(skb)->sacked) {
3465 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3468 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
3466 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, 3469 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3467 is_dupack, flag); 3470 prior_packets, is_dupack, flag);
3468 } 3471 }
3469 3472
3470 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt); 3473 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index f696d7c2e9fa..f6a005c485a9 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -96,7 +96,8 @@ struct tcpm_hash_bucket {
96 96
97static DEFINE_SPINLOCK(tcp_metrics_lock); 97static DEFINE_SPINLOCK(tcp_metrics_lock);
98 98
99static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst) 99static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst,
100 bool fastopen_clear)
100{ 101{
101 u32 val; 102 u32 val;
102 103
@@ -122,9 +123,11 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, struct dst_entry *dst)
122 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); 123 tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING);
123 tm->tcpm_ts = 0; 124 tm->tcpm_ts = 0;
124 tm->tcpm_ts_stamp = 0; 125 tm->tcpm_ts_stamp = 0;
125 tm->tcpm_fastopen.mss = 0; 126 if (fastopen_clear) {
126 tm->tcpm_fastopen.syn_loss = 0; 127 tm->tcpm_fastopen.mss = 0;
127 tm->tcpm_fastopen.cookie.len = 0; 128 tm->tcpm_fastopen.syn_loss = 0;
129 tm->tcpm_fastopen.cookie.len = 0;
130 }
128} 131}
129 132
130static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, 133static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
@@ -154,7 +157,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
154 } 157 }
155 tm->tcpm_addr = *addr; 158 tm->tcpm_addr = *addr;
156 159
157 tcpm_suck_dst(tm, dst); 160 tcpm_suck_dst(tm, dst, true);
158 161
159 if (likely(!reclaim)) { 162 if (likely(!reclaim)) {
160 tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain; 163 tm->tcpm_next = net->ipv4.tcp_metrics_hash[hash].chain;
@@ -171,7 +174,7 @@ out_unlock:
171static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst) 174static void tcpm_check_stamp(struct tcp_metrics_block *tm, struct dst_entry *dst)
172{ 175{
173 if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT))) 176 if (tm && unlikely(time_after(jiffies, tm->tcpm_stamp + TCP_METRICS_TIMEOUT)))
174 tcpm_suck_dst(tm, dst); 177 tcpm_suck_dst(tm, dst, false);
175} 178}
176 179
177#define TCP_METRICS_RECLAIM_DEPTH 5 180#define TCP_METRICS_RECLAIM_DEPTH 5
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 536d40929ba6..ec335fabd5cc 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -874,11 +874,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
874 &md5); 874 &md5);
875 tcp_header_size = tcp_options_size + sizeof(struct tcphdr); 875 tcp_header_size = tcp_options_size + sizeof(struct tcphdr);
876 876
877 if (tcp_packets_in_flight(tp) == 0) { 877 if (tcp_packets_in_flight(tp) == 0)
878 tcp_ca_event(sk, CA_EVENT_TX_START); 878 tcp_ca_event(sk, CA_EVENT_TX_START);
879 skb->ooo_okay = 1; 879
880 } else 880 /* if no packet is in qdisc/device queue, then allow XPS to select
881 skb->ooo_okay = 0; 881 * another queue.
882 */
883 skb->ooo_okay = sk_wmem_alloc_get(sk) == 0;
882 884
883 skb_push(skb, tcp_header_size); 885 skb_push(skb, tcp_header_size);
884 skb_reset_transport_header(skb); 886 skb_reset_transport_header(skb);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 0ae038a4c7a8..0bf5d399a03c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2311,7 +2311,6 @@ static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2311 struct sk_buff *segs = ERR_PTR(-EINVAL); 2311 struct sk_buff *segs = ERR_PTR(-EINVAL);
2312 int mac_len = skb->mac_len; 2312 int mac_len = skb->mac_len;
2313 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); 2313 int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
2314 struct ethhdr *inner_eth = (struct ethhdr *)skb_inner_mac_header(skb);
2315 __be16 protocol = skb->protocol; 2314 __be16 protocol = skb->protocol;
2316 netdev_features_t enc_features; 2315 netdev_features_t enc_features;
2317 int outer_hlen; 2316 int outer_hlen;
@@ -2324,8 +2323,7 @@ static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb,
2324 skb_reset_mac_header(skb); 2323 skb_reset_mac_header(skb);
2325 skb_set_network_header(skb, skb_inner_network_offset(skb)); 2324 skb_set_network_header(skb, skb_inner_network_offset(skb));
2326 skb->mac_len = skb_inner_network_offset(skb); 2325 skb->mac_len = skb_inner_network_offset(skb);
2327 inner_eth = (struct ethhdr *)skb_mac_header(skb); 2326 skb->protocol = htons(ETH_P_TEB);
2328 skb->protocol = inner_eth->h_proto;
2329 2327
2330 /* segment inner packet. */ 2328 /* segment inner packet. */
2331 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); 2329 enc_features = skb->dev->hw_enc_features & netif_skb_features(skb);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index d3ddd8400354..ecd60733e5e2 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1081,6 +1081,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
1081 } 1081 }
1082 if (t == NULL) 1082 if (t == NULL)
1083 t = netdev_priv(dev); 1083 t = netdev_priv(dev);
1084 memset(&p, 0, sizeof(p));
1084 ip6gre_tnl_parm_to_user(&p, &t->parms); 1085 ip6gre_tnl_parm_to_user(&p, &t->parms);
1085 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) 1086 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1086 err = -EFAULT; 1087 err = -EFAULT;
@@ -1128,6 +1129,7 @@ static int ip6gre_tunnel_ioctl(struct net_device *dev,
1128 if (t) { 1129 if (t) {
1129 err = 0; 1130 err = 0;
1130 1131
1132 memset(&p, 0, sizeof(p));
1131 ip6gre_tnl_parm_to_user(&p, &t->parms); 1133 ip6gre_tnl_parm_to_user(&p, &t->parms);
1132 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) 1134 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
1133 err = -EFAULT; 1135 err = -EFAULT;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d2eedf192330..dae1949019d7 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1147,7 +1147,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1147 if (WARN_ON(np->cork.opt)) 1147 if (WARN_ON(np->cork.opt))
1148 return -EINVAL; 1148 return -EINVAL;
1149 1149
1150 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation); 1150 np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1151 if (unlikely(np->cork.opt == NULL)) 1151 if (unlikely(np->cork.opt == NULL))
1152 return -ENOBUFS; 1152 return -ENOBUFS;
1153 1153
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 71167069b394..0a17ed9eaf39 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1890,6 +1890,17 @@ void tcp6_proc_exit(struct net *net)
1890} 1890}
1891#endif 1891#endif
1892 1892
1893static void tcp_v6_clear_sk(struct sock *sk, int size)
1894{
1895 struct inet_sock *inet = inet_sk(sk);
1896
1897 /* we do not want to clear pinet6 field, because of RCU lookups */
1898 sk_prot_clear_nulls(sk, offsetof(struct inet_sock, pinet6));
1899
1900 size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
1901 memset(&inet->pinet6 + 1, 0, size);
1902}
1903
1893struct proto tcpv6_prot = { 1904struct proto tcpv6_prot = {
1894 .name = "TCPv6", 1905 .name = "TCPv6",
1895 .owner = THIS_MODULE, 1906 .owner = THIS_MODULE,
@@ -1933,6 +1944,7 @@ struct proto tcpv6_prot = {
1933#ifdef CONFIG_MEMCG_KMEM 1944#ifdef CONFIG_MEMCG_KMEM
1934 .proto_cgroup = tcp_proto_cgroup, 1945 .proto_cgroup = tcp_proto_cgroup,
1935#endif 1946#endif
1947 .clear_sk = tcp_v6_clear_sk,
1936}; 1948};
1937 1949
1938static const struct inet6_protocol tcpv6_protocol = { 1950static const struct inet6_protocol tcpv6_protocol = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d4defdd44937..42923b14dfa6 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1432,6 +1432,17 @@ void udp6_proc_exit(struct net *net) {
1432} 1432}
1433#endif /* CONFIG_PROC_FS */ 1433#endif /* CONFIG_PROC_FS */
1434 1434
1435void udp_v6_clear_sk(struct sock *sk, int size)
1436{
1437 struct inet_sock *inet = inet_sk(sk);
1438
1439 /* we do not want to clear pinet6 field, because of RCU lookups */
1440 sk_prot_clear_portaddr_nulls(sk, offsetof(struct inet_sock, pinet6));
1441
1442 size -= offsetof(struct inet_sock, pinet6) + sizeof(inet->pinet6);
1443 memset(&inet->pinet6 + 1, 0, size);
1444}
1445
1435/* ------------------------------------------------------------------------ */ 1446/* ------------------------------------------------------------------------ */
1436 1447
1437struct proto udpv6_prot = { 1448struct proto udpv6_prot = {
@@ -1462,7 +1473,7 @@ struct proto udpv6_prot = {
1462 .compat_setsockopt = compat_udpv6_setsockopt, 1473 .compat_setsockopt = compat_udpv6_setsockopt,
1463 .compat_getsockopt = compat_udpv6_getsockopt, 1474 .compat_getsockopt = compat_udpv6_getsockopt,
1464#endif 1475#endif
1465 .clear_sk = sk_prot_clear_portaddr_nulls, 1476 .clear_sk = udp_v6_clear_sk,
1466}; 1477};
1467 1478
1468static struct inet_protosw udpv6_protosw = { 1479static struct inet_protosw udpv6_protosw = {
diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h
index d7571046bfc4..4691ed50a928 100644
--- a/net/ipv6/udp_impl.h
+++ b/net/ipv6/udp_impl.h
@@ -31,6 +31,8 @@ extern int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk,
31extern int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb); 31extern int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb);
32extern void udpv6_destroy_sock(struct sock *sk); 32extern void udpv6_destroy_sock(struct sock *sk);
33 33
34extern void udp_v6_clear_sk(struct sock *sk, int size);
35
34#ifdef CONFIG_PROC_FS 36#ifdef CONFIG_PROC_FS
35extern int udp6_seq_show(struct seq_file *seq, void *v); 37extern int udp6_seq_show(struct seq_file *seq, void *v);
36#endif 38#endif
diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c
index 1d08e21d9f69..dfcc4be46898 100644
--- a/net/ipv6/udplite.c
+++ b/net/ipv6/udplite.c
@@ -56,7 +56,7 @@ struct proto udplitev6_prot = {
56 .compat_setsockopt = compat_udpv6_setsockopt, 56 .compat_setsockopt = compat_udpv6_setsockopt,
57 .compat_getsockopt = compat_udpv6_getsockopt, 57 .compat_getsockopt = compat_udpv6_getsockopt,
58#endif 58#endif
59 .clear_sk = sk_prot_clear_portaddr_nulls, 59 .clear_sk = udp_v6_clear_sk,
60}; 60};
61 61
62static struct inet_protosw udplite6_protosw = { 62static struct inet_protosw udplite6_protosw = {
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 4ef7bdb65440..23ed03d786c8 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -103,8 +103,10 @@ static int xfrm6_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
103 dev_hold(dev); 103 dev_hold(dev);
104 104
105 xdst->u.rt6.rt6i_idev = in6_dev_get(dev); 105 xdst->u.rt6.rt6i_idev = in6_dev_get(dev);
106 if (!xdst->u.rt6.rt6i_idev) 106 if (!xdst->u.rt6.rt6i_idev) {
107 dev_put(dev);
107 return -ENODEV; 108 return -ENODEV;
109 }
108 110
109 rt6_transfer_peer(&xdst->u.rt6, rt); 111 rt6_transfer_peer(&xdst->u.rt6, rt);
110 112
diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c
index 8c004161a843..9ea0c933b9ff 100644
--- a/net/irda/irlap_frame.c
+++ b/net/irda/irlap_frame.c
@@ -544,7 +544,7 @@ static void irlap_recv_discovery_xid_cmd(struct irlap_cb *self,
544 /* 544 /*
545 * We now have some discovery info to deliver! 545 * We now have some discovery info to deliver!
546 */ 546 */
547 discovery = kmalloc(sizeof(discovery_t), GFP_ATOMIC); 547 discovery = kzalloc(sizeof(discovery_t), GFP_ATOMIC);
548 if (!discovery) { 548 if (!discovery) {
549 IRDA_WARNING("%s: unable to malloc!\n", __func__); 549 IRDA_WARNING("%s: unable to malloc!\n", __func__);
550 return; 550 return;
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 158e6eb188d3..44be28cfc6c4 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -1267,6 +1267,7 @@ void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata);
1267void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata); 1267void ieee80211_mgd_stop(struct ieee80211_sub_if_data *sdata);
1268void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata, 1268void ieee80211_mgd_conn_tx_status(struct ieee80211_sub_if_data *sdata,
1269 __le16 fc, bool acked); 1269 __le16 fc, bool acked);
1270void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata);
1270 1271
1271/* IBSS code */ 1272/* IBSS code */
1272void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); 1273void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local);
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 29620bfc7a69..a46e490f20dd 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1015,7 +1015,8 @@ static void ieee80211_chswitch_timer(unsigned long data)
1015 1015
1016static void 1016static void
1017ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata, 1017ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
1018 u64 timestamp, struct ieee802_11_elems *elems) 1018 u64 timestamp, struct ieee802_11_elems *elems,
1019 bool beacon)
1019{ 1020{
1020 struct ieee80211_local *local = sdata->local; 1021 struct ieee80211_local *local = sdata->local;
1021 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; 1022 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -1032,6 +1033,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
1032 struct cfg80211_chan_def new_vht_chandef = {}; 1033 struct cfg80211_chan_def new_vht_chandef = {};
1033 const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; 1034 const struct ieee80211_sec_chan_offs_ie *sec_chan_offs;
1034 const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie; 1035 const struct ieee80211_wide_bw_chansw_ie *wide_bw_chansw_ie;
1036 const struct ieee80211_ht_operation *ht_oper;
1035 int secondary_channel_offset = -1; 1037 int secondary_channel_offset = -1;
1036 1038
1037 ASSERT_MGD_MTX(ifmgd); 1039 ASSERT_MGD_MTX(ifmgd);
@@ -1048,11 +1050,14 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
1048 1050
1049 sec_chan_offs = elems->sec_chan_offs; 1051 sec_chan_offs = elems->sec_chan_offs;
1050 wide_bw_chansw_ie = elems->wide_bw_chansw_ie; 1052 wide_bw_chansw_ie = elems->wide_bw_chansw_ie;
1053 ht_oper = elems->ht_operation;
1051 1054
1052 if (ifmgd->flags & (IEEE80211_STA_DISABLE_HT | 1055 if (ifmgd->flags & (IEEE80211_STA_DISABLE_HT |
1053 IEEE80211_STA_DISABLE_40MHZ)) { 1056 IEEE80211_STA_DISABLE_40MHZ)) {
1054 sec_chan_offs = NULL; 1057 sec_chan_offs = NULL;
1055 wide_bw_chansw_ie = NULL; 1058 wide_bw_chansw_ie = NULL;
1059 /* only used for bandwidth here */
1060 ht_oper = NULL;
1056 } 1061 }
1057 1062
1058 if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT) 1063 if (ifmgd->flags & IEEE80211_STA_DISABLE_VHT)
@@ -1094,10 +1099,20 @@ ieee80211_sta_process_chanswitch(struct ieee80211_sub_if_data *sdata,
1094 return; 1099 return;
1095 } 1100 }
1096 1101
1097 if (sec_chan_offs) { 1102 if (!beacon && sec_chan_offs) {
1098 secondary_channel_offset = sec_chan_offs->sec_chan_offs; 1103 secondary_channel_offset = sec_chan_offs->sec_chan_offs;
1104 } else if (beacon && ht_oper) {
1105 secondary_channel_offset =
1106 ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET;
1099 } else if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) { 1107 } else if (!(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) {
1100 /* if HT is enabled and the IE not present, it's still HT */ 1108 /*
1109 * If it's not a beacon, HT is enabled and the IE not present,
1110 * it's 20 MHz, 802.11-2012 8.5.2.6:
1111 * This element [the Secondary Channel Offset Element] is
1112 * present when switching to a 40 MHz channel. It may be
1113 * present when switching to a 20 MHz channel (in which
1114 * case the secondary channel offset is set to SCN).
1115 */
1101 secondary_channel_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE; 1116 secondary_channel_offset = IEEE80211_HT_PARAM_CHA_SEC_NONE;
1102 } 1117 }
1103 1118
@@ -2796,7 +2811,8 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
2796 mutex_unlock(&local->iflist_mtx); 2811 mutex_unlock(&local->iflist_mtx);
2797 } 2812 }
2798 2813
2799 ieee80211_sta_process_chanswitch(sdata, rx_status->mactime, elems); 2814 ieee80211_sta_process_chanswitch(sdata, rx_status->mactime,
2815 elems, true);
2800 2816
2801} 2817}
2802 2818
@@ -3210,7 +3226,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
3210 3226
3211 ieee80211_sta_process_chanswitch(sdata, 3227 ieee80211_sta_process_chanswitch(sdata,
3212 rx_status->mactime, 3228 rx_status->mactime,
3213 &elems); 3229 &elems, false);
3214 } else if (mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) { 3230 } else if (mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) {
3215 ies_len = skb->len - 3231 ies_len = skb->len -
3216 offsetof(struct ieee80211_mgmt, 3232 offsetof(struct ieee80211_mgmt,
@@ -3232,7 +3248,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
3232 3248
3233 ieee80211_sta_process_chanswitch(sdata, 3249 ieee80211_sta_process_chanswitch(sdata,
3234 rx_status->mactime, 3250 rx_status->mactime,
3235 &elems); 3251 &elems, false);
3236 } 3252 }
3237 break; 3253 break;
3238 } 3254 }
@@ -3623,6 +3639,31 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
3623 } 3639 }
3624} 3640}
3625 3641
3642#ifdef CONFIG_PM
3643void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata)
3644{
3645 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
3646
3647 mutex_lock(&ifmgd->mtx);
3648 if (!ifmgd->associated) {
3649 mutex_unlock(&ifmgd->mtx);
3650 return;
3651 }
3652
3653 if (sdata->flags & IEEE80211_SDATA_DISCONNECT_RESUME) {
3654 sdata->flags &= ~IEEE80211_SDATA_DISCONNECT_RESUME;
3655 mlme_dbg(sdata, "driver requested disconnect after resume\n");
3656 ieee80211_sta_connection_lost(sdata,
3657 ifmgd->associated->bssid,
3658 WLAN_REASON_UNSPECIFIED,
3659 true);
3660 mutex_unlock(&ifmgd->mtx);
3661 return;
3662 }
3663 mutex_unlock(&ifmgd->mtx);
3664}
3665#endif
3666
3626/* interface setup */ 3667/* interface setup */
3627void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) 3668void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
3628{ 3669{
@@ -4329,7 +4370,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
4329 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; 4370 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
4330 u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN]; 4371 u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
4331 bool tx = !req->local_state_change; 4372 bool tx = !req->local_state_change;
4332 bool sent_frame = false; 4373 bool report_frame = false;
4333 4374
4334 mutex_lock(&ifmgd->mtx); 4375 mutex_lock(&ifmgd->mtx);
4335 4376
@@ -4346,7 +4387,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
4346 ieee80211_destroy_auth_data(sdata, false); 4387 ieee80211_destroy_auth_data(sdata, false);
4347 mutex_unlock(&ifmgd->mtx); 4388 mutex_unlock(&ifmgd->mtx);
4348 4389
4349 sent_frame = tx; 4390 report_frame = true;
4350 goto out; 4391 goto out;
4351 } 4392 }
4352 4393
@@ -4354,12 +4395,12 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata,
4354 ether_addr_equal(ifmgd->associated->bssid, req->bssid)) { 4395 ether_addr_equal(ifmgd->associated->bssid, req->bssid)) {
4355 ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH, 4396 ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
4356 req->reason_code, tx, frame_buf); 4397 req->reason_code, tx, frame_buf);
4357 sent_frame = tx; 4398 report_frame = true;
4358 } 4399 }
4359 mutex_unlock(&ifmgd->mtx); 4400 mutex_unlock(&ifmgd->mtx);
4360 4401
4361 out: 4402 out:
4362 if (sent_frame) 4403 if (report_frame)
4363 __cfg80211_send_deauth(sdata->dev, frame_buf, 4404 __cfg80211_send_deauth(sdata->dev, frame_buf,
4364 IEEE80211_DEAUTH_FRAME_LEN); 4405 IEEE80211_DEAUTH_FRAME_LEN);
4365 4406
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index 0d51877efdb7..d3f414fe67e0 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -688,8 +688,15 @@ int rate_control_set_rates(struct ieee80211_hw *hw,
688 struct ieee80211_sta *pubsta, 688 struct ieee80211_sta *pubsta,
689 struct ieee80211_sta_rates *rates) 689 struct ieee80211_sta_rates *rates)
690{ 690{
691 struct ieee80211_sta_rates *old = rcu_dereference(pubsta->rates); 691 struct ieee80211_sta_rates *old;
692 692
693 /*
694 * mac80211 guarantees that this function will not be called
695 * concurrently, so the following RCU access is safe, even without
696 * extra locking. This can not be checked easily, so we just set
697 * the condition to true.
698 */
699 old = rcu_dereference_protected(pubsta->rates, true);
693 rcu_assign_pointer(pubsta->rates, rates); 700 rcu_assign_pointer(pubsta->rates, rates);
694 if (old) 701 if (old)
695 kfree_rcu(old, rcu_head); 702 kfree_rcu(old, rcu_head);
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index c8447af76ead..8e2952620256 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -3036,6 +3036,9 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx,
3036 * and location updates. Note that mac80211 3036 * and location updates. Note that mac80211
3037 * itself never looks at these frames. 3037 * itself never looks at these frames.
3038 */ 3038 */
3039 if (!multicast &&
3040 !ether_addr_equal(sdata->vif.addr, hdr->addr1))
3041 return 0;
3039 if (ieee80211_is_public_action(hdr, skb->len)) 3042 if (ieee80211_is_public_action(hdr, skb->len))
3040 return 1; 3043 return 1;
3041 if (!ieee80211_is_beacon(hdr->frame_control)) 3044 if (!ieee80211_is_beacon(hdr->frame_control))
diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c
index 3ed801d90f1e..124b1fdc20d0 100644
--- a/net/mac80211/tkip.c
+++ b/net/mac80211/tkip.c
@@ -208,10 +208,10 @@ void ieee80211_get_tkip_p2k(struct ieee80211_key_conf *keyconf,
208 u32 iv32 = get_unaligned_le32(&data[4]); 208 u32 iv32 = get_unaligned_le32(&data[4]);
209 u16 iv16 = data[2] | (data[0] << 8); 209 u16 iv16 = data[2] | (data[0] << 8);
210 210
211 spin_lock_bh(&key->u.tkip.txlock); 211 spin_lock(&key->u.tkip.txlock);
212 ieee80211_compute_tkip_p1k(key, iv32); 212 ieee80211_compute_tkip_p1k(key, iv32);
213 tkip_mixing_phase2(tk, ctx, iv16, p2k); 213 tkip_mixing_phase2(tk, ctx, iv16, p2k);
214 spin_unlock_bh(&key->u.tkip.txlock); 214 spin_unlock(&key->u.tkip.txlock);
215} 215}
216EXPORT_SYMBOL(ieee80211_get_tkip_p2k); 216EXPORT_SYMBOL(ieee80211_get_tkip_p2k);
217 217
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 3f87fa468b1f..27e07150eb46 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1740,6 +1740,13 @@ int ieee80211_reconfig(struct ieee80211_local *local)
1740 mb(); 1740 mb();
1741 local->resuming = false; 1741 local->resuming = false;
1742 1742
1743 list_for_each_entry(sdata, &local->interfaces, list) {
1744 if (!ieee80211_sdata_running(sdata))
1745 continue;
1746 if (sdata->vif.type == NL80211_IFTYPE_STATION)
1747 ieee80211_sta_restart(sdata);
1748 }
1749
1743 mod_timer(&local->sta_cleanup, jiffies + 1); 1750 mod_timer(&local->sta_cleanup, jiffies + 1);
1744#else 1751#else
1745 WARN_ON(1); 1752 WARN_ON(1);
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 388656d5a9ec..3b18dd1be7d9 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -148,7 +148,7 @@ void nf_log_packet(struct net *net,
148 va_start(args, fmt); 148 va_start(args, fmt);
149 vsnprintf(prefix, sizeof(prefix), fmt, args); 149 vsnprintf(prefix, sizeof(prefix), fmt, args);
150 va_end(args); 150 va_end(args);
151 logger->logfn(pf, hooknum, skb, in, out, loginfo, prefix); 151 logger->logfn(net, pf, hooknum, skb, in, out, loginfo, prefix);
152 } 152 }
153 rcu_read_unlock(); 153 rcu_read_unlock();
154} 154}
@@ -368,17 +368,20 @@ static int __net_init nf_log_net_init(struct net *net)
368 return 0; 368 return 0;
369 369
370out_sysctl: 370out_sysctl:
371#ifdef CONFIG_PROC_FS
371 /* For init_net: errors will trigger panic, don't unroll on error. */ 372 /* For init_net: errors will trigger panic, don't unroll on error. */
372 if (!net_eq(net, &init_net)) 373 if (!net_eq(net, &init_net))
373 remove_proc_entry("nf_log", net->nf.proc_netfilter); 374 remove_proc_entry("nf_log", net->nf.proc_netfilter);
374 375#endif
375 return ret; 376 return ret;
376} 377}
377 378
378static void __net_exit nf_log_net_exit(struct net *net) 379static void __net_exit nf_log_net_exit(struct net *net)
379{ 380{
380 netfilter_log_sysctl_exit(net); 381 netfilter_log_sysctl_exit(net);
382#ifdef CONFIG_PROC_FS
381 remove_proc_entry("nf_log", net->nf.proc_netfilter); 383 remove_proc_entry("nf_log", net->nf.proc_netfilter);
384#endif
382} 385}
383 386
384static struct pernet_operations nf_log_net_ops = { 387static struct pernet_operations nf_log_net_ops = {
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index faf1e9300d8a..962e9792e317 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -602,7 +602,8 @@ static struct nf_loginfo default_loginfo = {
602 602
603/* log handler for internal netfilter logging api */ 603/* log handler for internal netfilter logging api */
604void 604void
605nfulnl_log_packet(u_int8_t pf, 605nfulnl_log_packet(struct net *net,
606 u_int8_t pf,
606 unsigned int hooknum, 607 unsigned int hooknum,
607 const struct sk_buff *skb, 608 const struct sk_buff *skb,
608 const struct net_device *in, 609 const struct net_device *in,
@@ -615,7 +616,6 @@ nfulnl_log_packet(u_int8_t pf,
615 const struct nf_loginfo *li; 616 const struct nf_loginfo *li;
616 unsigned int qthreshold; 617 unsigned int qthreshold;
617 unsigned int plen; 618 unsigned int plen;
618 struct net *net = dev_net(in ? in : out);
619 struct nfnl_log_net *log = nfnl_log_pernet(net); 619 struct nfnl_log_net *log = nfnl_log_pernet(net);
620 620
621 if (li_user && li_user->type == NF_LOG_TYPE_ULOG) 621 if (li_user && li_user->type == NF_LOG_TYPE_ULOG)
@@ -1045,7 +1045,9 @@ static int __net_init nfnl_log_net_init(struct net *net)
1045 1045
1046static void __net_exit nfnl_log_net_exit(struct net *net) 1046static void __net_exit nfnl_log_net_exit(struct net *net)
1047{ 1047{
1048#ifdef CONFIG_PROC_FS
1048 remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter); 1049 remove_proc_entry("nfnetlink_log", net->nf.proc_netfilter);
1050#endif
1049} 1051}
1050 1052
1051static struct pernet_operations nfnl_log_net_ops = { 1053static struct pernet_operations nfnl_log_net_ops = {
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 2e0e835baf72..4e27fa035814 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -1285,7 +1285,9 @@ static int __net_init nfnl_queue_net_init(struct net *net)
1285 1285
1286static void __net_exit nfnl_queue_net_exit(struct net *net) 1286static void __net_exit nfnl_queue_net_exit(struct net *net)
1287{ 1287{
1288#ifdef CONFIG_PROC_FS
1288 remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); 1289 remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter);
1290#endif
1289} 1291}
1290 1292
1291static struct pernet_operations nfnl_queue_net_ops = { 1293static struct pernet_operations nfnl_queue_net_ops = {
diff --git a/net/netfilter/xt_LOG.c b/net/netfilter/xt_LOG.c
index fe573f6c9e91..491c7d821a0b 100644
--- a/net/netfilter/xt_LOG.c
+++ b/net/netfilter/xt_LOG.c
@@ -466,7 +466,8 @@ log_packet_common(struct sbuff *m,
466 466
467 467
468static void 468static void
469ipt_log_packet(u_int8_t pf, 469ipt_log_packet(struct net *net,
470 u_int8_t pf,
470 unsigned int hooknum, 471 unsigned int hooknum,
471 const struct sk_buff *skb, 472 const struct sk_buff *skb,
472 const struct net_device *in, 473 const struct net_device *in,
@@ -475,7 +476,6 @@ ipt_log_packet(u_int8_t pf,
475 const char *prefix) 476 const char *prefix)
476{ 477{
477 struct sbuff *m; 478 struct sbuff *m;
478 struct net *net = dev_net(in ? in : out);
479 479
480 /* FIXME: Disabled from containers until syslog ns is supported */ 480 /* FIXME: Disabled from containers until syslog ns is supported */
481 if (!net_eq(net, &init_net)) 481 if (!net_eq(net, &init_net))
@@ -797,7 +797,8 @@ fallback:
797} 797}
798 798
799static void 799static void
800ip6t_log_packet(u_int8_t pf, 800ip6t_log_packet(struct net *net,
801 u_int8_t pf,
801 unsigned int hooknum, 802 unsigned int hooknum,
802 const struct sk_buff *skb, 803 const struct sk_buff *skb,
803 const struct net_device *in, 804 const struct net_device *in,
@@ -806,7 +807,6 @@ ip6t_log_packet(u_int8_t pf,
806 const char *prefix) 807 const char *prefix)
807{ 808{
808 struct sbuff *m; 809 struct sbuff *m;
809 struct net *net = dev_net(in ? in : out);
810 810
811 /* FIXME: Disabled from containers until syslog ns is supported */ 811 /* FIXME: Disabled from containers until syslog ns is supported */
812 if (!net_eq(net, &init_net)) 812 if (!net_eq(net, &init_net))
@@ -833,17 +833,18 @@ log_tg(struct sk_buff *skb, const struct xt_action_param *par)
833{ 833{
834 const struct xt_log_info *loginfo = par->targinfo; 834 const struct xt_log_info *loginfo = par->targinfo;
835 struct nf_loginfo li; 835 struct nf_loginfo li;
836 struct net *net = dev_net(par->in ? par->in : par->out);
836 837
837 li.type = NF_LOG_TYPE_LOG; 838 li.type = NF_LOG_TYPE_LOG;
838 li.u.log.level = loginfo->level; 839 li.u.log.level = loginfo->level;
839 li.u.log.logflags = loginfo->logflags; 840 li.u.log.logflags = loginfo->logflags;
840 841
841 if (par->family == NFPROTO_IPV4) 842 if (par->family == NFPROTO_IPV4)
842 ipt_log_packet(NFPROTO_IPV4, par->hooknum, skb, par->in, 843 ipt_log_packet(net, NFPROTO_IPV4, par->hooknum, skb, par->in,
843 par->out, &li, loginfo->prefix); 844 par->out, &li, loginfo->prefix);
844#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) 845#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
845 else if (par->family == NFPROTO_IPV6) 846 else if (par->family == NFPROTO_IPV6)
846 ip6t_log_packet(NFPROTO_IPV6, par->hooknum, skb, par->in, 847 ip6t_log_packet(net, NFPROTO_IPV6, par->hooknum, skb, par->in,
847 par->out, &li, loginfo->prefix); 848 par->out, &li, loginfo->prefix);
848#endif 849#endif
849 else 850 else
diff --git a/net/netfilter/xt_NFLOG.c b/net/netfilter/xt_NFLOG.c
index a17dd0f589b2..fb7497c928a0 100644
--- a/net/netfilter/xt_NFLOG.c
+++ b/net/netfilter/xt_NFLOG.c
@@ -26,13 +26,14 @@ nflog_tg(struct sk_buff *skb, const struct xt_action_param *par)
26{ 26{
27 const struct xt_nflog_info *info = par->targinfo; 27 const struct xt_nflog_info *info = par->targinfo;
28 struct nf_loginfo li; 28 struct nf_loginfo li;
29 struct net *net = dev_net(par->in ? par->in : par->out);
29 30
30 li.type = NF_LOG_TYPE_ULOG; 31 li.type = NF_LOG_TYPE_ULOG;
31 li.u.ulog.copy_len = info->len; 32 li.u.ulog.copy_len = info->len;
32 li.u.ulog.group = info->group; 33 li.u.ulog.group = info->group;
33 li.u.ulog.qthreshold = info->threshold; 34 li.u.ulog.qthreshold = info->threshold;
34 35
35 nfulnl_log_packet(par->family, par->hooknum, skb, par->in, 36 nfulnl_log_packet(net, par->family, par->hooknum, skb, par->in,
36 par->out, &li, info->prefix); 37 par->out, &li, info->prefix);
37 return XT_CONTINUE; 38 return XT_CONTINUE;
38} 39}
diff --git a/net/netfilter/xt_TCPOPTSTRIP.c b/net/netfilter/xt_TCPOPTSTRIP.c
index 25fd1c4e1eec..1eb1a44bfd3d 100644
--- a/net/netfilter/xt_TCPOPTSTRIP.c
+++ b/net/netfilter/xt_TCPOPTSTRIP.c
@@ -30,17 +30,28 @@ static inline unsigned int optlen(const u_int8_t *opt, unsigned int offset)
30 30
31static unsigned int 31static unsigned int
32tcpoptstrip_mangle_packet(struct sk_buff *skb, 32tcpoptstrip_mangle_packet(struct sk_buff *skb,
33 const struct xt_tcpoptstrip_target_info *info, 33 const struct xt_action_param *par,
34 unsigned int tcphoff, unsigned int minlen) 34 unsigned int tcphoff, unsigned int minlen)
35{ 35{
36 const struct xt_tcpoptstrip_target_info *info = par->targinfo;
36 unsigned int optl, i, j; 37 unsigned int optl, i, j;
37 struct tcphdr *tcph; 38 struct tcphdr *tcph;
38 u_int16_t n, o; 39 u_int16_t n, o;
39 u_int8_t *opt; 40 u_int8_t *opt;
41 int len;
42
43 /* This is a fragment, no TCP header is available */
44 if (par->fragoff != 0)
45 return XT_CONTINUE;
40 46
41 if (!skb_make_writable(skb, skb->len)) 47 if (!skb_make_writable(skb, skb->len))
42 return NF_DROP; 48 return NF_DROP;
43 49
50 len = skb->len - tcphoff;
51 if (len < (int)sizeof(struct tcphdr) ||
52 tcp_hdr(skb)->doff * 4 > len)
53 return NF_DROP;
54
44 tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff); 55 tcph = (struct tcphdr *)(skb_network_header(skb) + tcphoff);
45 opt = (u_int8_t *)tcph; 56 opt = (u_int8_t *)tcph;
46 57
@@ -76,7 +87,7 @@ tcpoptstrip_mangle_packet(struct sk_buff *skb,
76static unsigned int 87static unsigned int
77tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par) 88tcpoptstrip_tg4(struct sk_buff *skb, const struct xt_action_param *par)
78{ 89{
79 return tcpoptstrip_mangle_packet(skb, par->targinfo, ip_hdrlen(skb), 90 return tcpoptstrip_mangle_packet(skb, par, ip_hdrlen(skb),
80 sizeof(struct iphdr) + sizeof(struct tcphdr)); 91 sizeof(struct iphdr) + sizeof(struct tcphdr));
81} 92}
82 93
@@ -94,7 +105,7 @@ tcpoptstrip_tg6(struct sk_buff *skb, const struct xt_action_param *par)
94 if (tcphoff < 0) 105 if (tcphoff < 0)
95 return NF_DROP; 106 return NF_DROP;
96 107
97 return tcpoptstrip_mangle_packet(skb, par->targinfo, tcphoff, 108 return tcpoptstrip_mangle_packet(skb, par, tcphoff,
98 sizeof(*ipv6h) + sizeof(struct tcphdr)); 109 sizeof(*ipv6h) + sizeof(struct tcphdr));
99} 110}
100#endif 111#endif
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index d8d424337550..6bb1d42f0fac 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -245,6 +245,71 @@ static void netlbl_domhsh_audit_add(struct netlbl_dom_map *entry,
245 } 245 }
246} 246}
247 247
248/**
249 * netlbl_domhsh_validate - Validate a new domain mapping entry
250 * @entry: the entry to validate
251 *
252 * This function validates the new domain mapping entry to ensure that it is
253 * a valid entry. Returns zero on success, negative values on failure.
254 *
255 */
256static int netlbl_domhsh_validate(const struct netlbl_dom_map *entry)
257{
258 struct netlbl_af4list *iter4;
259 struct netlbl_domaddr4_map *map4;
260#if IS_ENABLED(CONFIG_IPV6)
261 struct netlbl_af6list *iter6;
262 struct netlbl_domaddr6_map *map6;
263#endif /* IPv6 */
264
265 if (entry == NULL)
266 return -EINVAL;
267
268 switch (entry->type) {
269 case NETLBL_NLTYPE_UNLABELED:
270 if (entry->type_def.cipsov4 != NULL ||
271 entry->type_def.addrsel != NULL)
272 return -EINVAL;
273 break;
274 case NETLBL_NLTYPE_CIPSOV4:
275 if (entry->type_def.cipsov4 == NULL)
276 return -EINVAL;
277 break;
278 case NETLBL_NLTYPE_ADDRSELECT:
279 netlbl_af4list_foreach(iter4, &entry->type_def.addrsel->list4) {
280 map4 = netlbl_domhsh_addr4_entry(iter4);
281 switch (map4->type) {
282 case NETLBL_NLTYPE_UNLABELED:
283 if (map4->type_def.cipsov4 != NULL)
284 return -EINVAL;
285 break;
286 case NETLBL_NLTYPE_CIPSOV4:
287 if (map4->type_def.cipsov4 == NULL)
288 return -EINVAL;
289 break;
290 default:
291 return -EINVAL;
292 }
293 }
294#if IS_ENABLED(CONFIG_IPV6)
295 netlbl_af6list_foreach(iter6, &entry->type_def.addrsel->list6) {
296 map6 = netlbl_domhsh_addr6_entry(iter6);
297 switch (map6->type) {
298 case NETLBL_NLTYPE_UNLABELED:
299 break;
300 default:
301 return -EINVAL;
302 }
303 }
304#endif /* IPv6 */
305 break;
306 default:
307 return -EINVAL;
308 }
309
310 return 0;
311}
312
248/* 313/*
249 * Domain Hash Table Functions 314 * Domain Hash Table Functions
250 */ 315 */
@@ -311,6 +376,10 @@ int netlbl_domhsh_add(struct netlbl_dom_map *entry,
311 struct netlbl_af6list *tmp6; 376 struct netlbl_af6list *tmp6;
312#endif /* IPv6 */ 377#endif /* IPv6 */
313 378
379 ret_val = netlbl_domhsh_validate(entry);
380 if (ret_val != 0)
381 return ret_val;
382
314 /* XXX - we can remove this RCU read lock as the spinlock protects the 383 /* XXX - we can remove this RCU read lock as the spinlock protects the
315 * entire function, but before we do we need to fixup the 384 * entire function, but before we do we need to fixup the
316 * netlbl_af[4,6]list RCU functions to do "the right thing" with 385 * netlbl_af[4,6]list RCU functions to do "the right thing" with
diff --git a/net/socket.c b/net/socket.c
index b416093997da..6b94633ca61d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2412,7 +2412,7 @@ static const unsigned char nargs[21] = {
2412 2412
2413SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) 2413SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
2414{ 2414{
2415 unsigned long a[6]; 2415 unsigned long a[AUDITSC_ARGS];
2416 unsigned long a0, a1; 2416 unsigned long a0, a1;
2417 int err; 2417 int err;
2418 unsigned int len; 2418 unsigned int len;
@@ -2428,7 +2428,9 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
2428 if (copy_from_user(a, args, len)) 2428 if (copy_from_user(a, args, len))
2429 return -EFAULT; 2429 return -EFAULT;
2430 2430
2431 audit_socketcall(nargs[call] / sizeof(unsigned long), a); 2431 err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
2432 if (err)
2433 return err;
2432 2434
2433 a0 = a[0]; 2435 a0 = a[0];
2434 a1 = a[1]; 2436 a1 = a[1];
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index a764e227fdde..fc2f78d6a9b4 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -52,6 +52,8 @@
52#include <linux/sunrpc/gss_api.h> 52#include <linux/sunrpc/gss_api.h>
53#include <asm/uaccess.h> 53#include <asm/uaccess.h>
54 54
55#include "../netns.h"
56
55static const struct rpc_authops authgss_ops; 57static const struct rpc_authops authgss_ops;
56 58
57static const struct rpc_credops gss_credops; 59static const struct rpc_credops gss_credops;
@@ -85,8 +87,6 @@ struct gss_auth {
85}; 87};
86 88
87/* pipe_version >= 0 if and only if someone has a pipe open. */ 89/* pipe_version >= 0 if and only if someone has a pipe open. */
88static int pipe_version = -1;
89static atomic_t pipe_users = ATOMIC_INIT(0);
90static DEFINE_SPINLOCK(pipe_version_lock); 90static DEFINE_SPINLOCK(pipe_version_lock);
91static struct rpc_wait_queue pipe_version_rpc_waitqueue; 91static struct rpc_wait_queue pipe_version_rpc_waitqueue;
92static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue); 92static DECLARE_WAIT_QUEUE_HEAD(pipe_version_waitqueue);
@@ -266,24 +266,27 @@ struct gss_upcall_msg {
266 char databuf[UPCALL_BUF_LEN]; 266 char databuf[UPCALL_BUF_LEN];
267}; 267};
268 268
269static int get_pipe_version(void) 269static int get_pipe_version(struct net *net)
270{ 270{
271 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
271 int ret; 272 int ret;
272 273
273 spin_lock(&pipe_version_lock); 274 spin_lock(&pipe_version_lock);
274 if (pipe_version >= 0) { 275 if (sn->pipe_version >= 0) {
275 atomic_inc(&pipe_users); 276 atomic_inc(&sn->pipe_users);
276 ret = pipe_version; 277 ret = sn->pipe_version;
277 } else 278 } else
278 ret = -EAGAIN; 279 ret = -EAGAIN;
279 spin_unlock(&pipe_version_lock); 280 spin_unlock(&pipe_version_lock);
280 return ret; 281 return ret;
281} 282}
282 283
283static void put_pipe_version(void) 284static void put_pipe_version(struct net *net)
284{ 285{
285 if (atomic_dec_and_lock(&pipe_users, &pipe_version_lock)) { 286 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
286 pipe_version = -1; 287
288 if (atomic_dec_and_lock(&sn->pipe_users, &pipe_version_lock)) {
289 sn->pipe_version = -1;
287 spin_unlock(&pipe_version_lock); 290 spin_unlock(&pipe_version_lock);
288 } 291 }
289} 292}
@@ -291,9 +294,10 @@ static void put_pipe_version(void)
291static void 294static void
292gss_release_msg(struct gss_upcall_msg *gss_msg) 295gss_release_msg(struct gss_upcall_msg *gss_msg)
293{ 296{
297 struct net *net = rpc_net_ns(gss_msg->auth->client);
294 if (!atomic_dec_and_test(&gss_msg->count)) 298 if (!atomic_dec_and_test(&gss_msg->count))
295 return; 299 return;
296 put_pipe_version(); 300 put_pipe_version(net);
297 BUG_ON(!list_empty(&gss_msg->list)); 301 BUG_ON(!list_empty(&gss_msg->list));
298 if (gss_msg->ctx != NULL) 302 if (gss_msg->ctx != NULL)
299 gss_put_ctx(gss_msg->ctx); 303 gss_put_ctx(gss_msg->ctx);
@@ -439,7 +443,10 @@ static void gss_encode_msg(struct gss_upcall_msg *gss_msg,
439 struct rpc_clnt *clnt, 443 struct rpc_clnt *clnt,
440 const char *service_name) 444 const char *service_name)
441{ 445{
442 if (pipe_version == 0) 446 struct net *net = rpc_net_ns(clnt);
447 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
448
449 if (sn->pipe_version == 0)
443 gss_encode_v0_msg(gss_msg); 450 gss_encode_v0_msg(gss_msg);
444 else /* pipe_version == 1 */ 451 else /* pipe_version == 1 */
445 gss_encode_v1_msg(gss_msg, clnt, service_name); 452 gss_encode_v1_msg(gss_msg, clnt, service_name);
@@ -455,7 +462,7 @@ gss_alloc_msg(struct gss_auth *gss_auth, struct rpc_clnt *clnt,
455 gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS); 462 gss_msg = kzalloc(sizeof(*gss_msg), GFP_NOFS);
456 if (gss_msg == NULL) 463 if (gss_msg == NULL)
457 return ERR_PTR(-ENOMEM); 464 return ERR_PTR(-ENOMEM);
458 vers = get_pipe_version(); 465 vers = get_pipe_version(rpc_net_ns(clnt));
459 if (vers < 0) { 466 if (vers < 0) {
460 kfree(gss_msg); 467 kfree(gss_msg);
461 return ERR_PTR(vers); 468 return ERR_PTR(vers);
@@ -559,24 +566,34 @@ out:
559static inline int 566static inline int
560gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) 567gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred)
561{ 568{
569 struct net *net = rpc_net_ns(gss_auth->client);
570 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
562 struct rpc_pipe *pipe; 571 struct rpc_pipe *pipe;
563 struct rpc_cred *cred = &gss_cred->gc_base; 572 struct rpc_cred *cred = &gss_cred->gc_base;
564 struct gss_upcall_msg *gss_msg; 573 struct gss_upcall_msg *gss_msg;
574 unsigned long timeout;
565 DEFINE_WAIT(wait); 575 DEFINE_WAIT(wait);
566 int err = 0; 576 int err;
567 577
568 dprintk("RPC: %s for uid %u\n", 578 dprintk("RPC: %s for uid %u\n",
569 __func__, from_kuid(&init_user_ns, cred->cr_uid)); 579 __func__, from_kuid(&init_user_ns, cred->cr_uid));
570retry: 580retry:
581 err = 0;
582 /* Default timeout is 15s unless we know that gssd is not running */
583 timeout = 15 * HZ;
584 if (!sn->gssd_running)
585 timeout = HZ >> 2;
571 gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred); 586 gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred);
572 if (PTR_ERR(gss_msg) == -EAGAIN) { 587 if (PTR_ERR(gss_msg) == -EAGAIN) {
573 err = wait_event_interruptible_timeout(pipe_version_waitqueue, 588 err = wait_event_interruptible_timeout(pipe_version_waitqueue,
574 pipe_version >= 0, 15*HZ); 589 sn->pipe_version >= 0, timeout);
575 if (pipe_version < 0) { 590 if (sn->pipe_version < 0) {
591 if (err == 0)
592 sn->gssd_running = 0;
576 warn_gssd(); 593 warn_gssd();
577 err = -EACCES; 594 err = -EACCES;
578 } 595 }
579 if (err) 596 if (err < 0)
580 goto out; 597 goto out;
581 goto retry; 598 goto retry;
582 } 599 }
@@ -707,20 +724,22 @@ out:
707 724
708static int gss_pipe_open(struct inode *inode, int new_version) 725static int gss_pipe_open(struct inode *inode, int new_version)
709{ 726{
727 struct net *net = inode->i_sb->s_fs_info;
728 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
710 int ret = 0; 729 int ret = 0;
711 730
712 spin_lock(&pipe_version_lock); 731 spin_lock(&pipe_version_lock);
713 if (pipe_version < 0) { 732 if (sn->pipe_version < 0) {
714 /* First open of any gss pipe determines the version: */ 733 /* First open of any gss pipe determines the version: */
715 pipe_version = new_version; 734 sn->pipe_version = new_version;
716 rpc_wake_up(&pipe_version_rpc_waitqueue); 735 rpc_wake_up(&pipe_version_rpc_waitqueue);
717 wake_up(&pipe_version_waitqueue); 736 wake_up(&pipe_version_waitqueue);
718 } else if (pipe_version != new_version) { 737 } else if (sn->pipe_version != new_version) {
719 /* Trying to open a pipe of a different version */ 738 /* Trying to open a pipe of a different version */
720 ret = -EBUSY; 739 ret = -EBUSY;
721 goto out; 740 goto out;
722 } 741 }
723 atomic_inc(&pipe_users); 742 atomic_inc(&sn->pipe_users);
724out: 743out:
725 spin_unlock(&pipe_version_lock); 744 spin_unlock(&pipe_version_lock);
726 return ret; 745 return ret;
@@ -740,6 +759,7 @@ static int gss_pipe_open_v1(struct inode *inode)
740static void 759static void
741gss_pipe_release(struct inode *inode) 760gss_pipe_release(struct inode *inode)
742{ 761{
762 struct net *net = inode->i_sb->s_fs_info;
743 struct rpc_pipe *pipe = RPC_I(inode)->pipe; 763 struct rpc_pipe *pipe = RPC_I(inode)->pipe;
744 struct gss_upcall_msg *gss_msg; 764 struct gss_upcall_msg *gss_msg;
745 765
@@ -758,7 +778,7 @@ restart:
758 } 778 }
759 spin_unlock(&pipe->lock); 779 spin_unlock(&pipe->lock);
760 780
761 put_pipe_version(); 781 put_pipe_version(net);
762} 782}
763 783
764static void 784static void
@@ -867,8 +887,7 @@ gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor)
867 err = -EINVAL; 887 err = -EINVAL;
868 gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); 888 gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor);
869 if (!gss_auth->mech) { 889 if (!gss_auth->mech) {
870 printk(KERN_WARNING "%s: Pseudoflavor %d not found!\n", 890 dprintk("RPC: Pseudoflavor %d not found!\n", flavor);
871 __func__, flavor);
872 goto err_free; 891 goto err_free;
873 } 892 }
874 gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); 893 gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor);
diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c
index 5c4c61d527e2..357f613df7ff 100644
--- a/net/sunrpc/auth_gss/gss_rpc_xdr.c
+++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c
@@ -21,16 +21,6 @@
21#include <linux/sunrpc/svcauth.h> 21#include <linux/sunrpc/svcauth.h>
22#include "gss_rpc_xdr.h" 22#include "gss_rpc_xdr.h"
23 23
24static bool gssx_check_pointer(struct xdr_stream *xdr)
25{
26 __be32 *p;
27
28 p = xdr_reserve_space(xdr, 4);
29 if (unlikely(p == NULL))
30 return -ENOSPC;
31 return *p?true:false;
32}
33
34static int gssx_enc_bool(struct xdr_stream *xdr, int v) 24static int gssx_enc_bool(struct xdr_stream *xdr, int v)
35{ 25{
36 __be32 *p; 26 __be32 *p;
@@ -264,25 +254,27 @@ static int gssx_dec_option_array(struct xdr_stream *xdr,
264 if (unlikely(p == NULL)) 254 if (unlikely(p == NULL))
265 return -ENOSPC; 255 return -ENOSPC;
266 count = be32_to_cpup(p++); 256 count = be32_to_cpup(p++);
267 if (count != 0) { 257 if (!count)
268 /* we recognize only 1 currently: CREDS_VALUE */ 258 return 0;
269 oa->count = 1;
270 259
271 oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL); 260 /* we recognize only 1 currently: CREDS_VALUE */
272 if (!oa->data) 261 oa->count = 1;
273 return -ENOMEM;
274 262
275 creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL); 263 oa->data = kmalloc(sizeof(struct gssx_option), GFP_KERNEL);
276 if (!creds) { 264 if (!oa->data)
277 kfree(oa->data); 265 return -ENOMEM;
278 return -ENOMEM;
279 }
280 266
281 oa->data[0].option.data = CREDS_VALUE; 267 creds = kmalloc(sizeof(struct svc_cred), GFP_KERNEL);
282 oa->data[0].option.len = sizeof(CREDS_VALUE); 268 if (!creds) {
283 oa->data[0].value.data = (void *)creds; 269 kfree(oa->data);
284 oa->data[0].value.len = 0; 270 return -ENOMEM;
285 } 271 }
272
273 oa->data[0].option.data = CREDS_VALUE;
274 oa->data[0].option.len = sizeof(CREDS_VALUE);
275 oa->data[0].value.data = (void *)creds;
276 oa->data[0].value.len = 0;
277
286 for (i = 0; i < count; i++) { 278 for (i = 0; i < count; i++) {
287 gssx_buffer dummy = { 0, NULL }; 279 gssx_buffer dummy = { 0, NULL };
288 u32 length; 280 u32 length;
@@ -800,6 +792,7 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
800 struct xdr_stream *xdr, 792 struct xdr_stream *xdr,
801 struct gssx_res_accept_sec_context *res) 793 struct gssx_res_accept_sec_context *res)
802{ 794{
795 u32 value_follows;
803 int err; 796 int err;
804 797
805 /* res->status */ 798 /* res->status */
@@ -808,7 +801,10 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
808 return err; 801 return err;
809 802
810 /* res->context_handle */ 803 /* res->context_handle */
811 if (gssx_check_pointer(xdr)) { 804 err = gssx_dec_bool(xdr, &value_follows);
805 if (err)
806 return err;
807 if (value_follows) {
812 err = gssx_dec_ctx(xdr, res->context_handle); 808 err = gssx_dec_ctx(xdr, res->context_handle);
813 if (err) 809 if (err)
814 return err; 810 return err;
@@ -817,7 +813,10 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
817 } 813 }
818 814
819 /* res->output_token */ 815 /* res->output_token */
820 if (gssx_check_pointer(xdr)) { 816 err = gssx_dec_bool(xdr, &value_follows);
817 if (err)
818 return err;
819 if (value_follows) {
821 err = gssx_dec_buffer(xdr, res->output_token); 820 err = gssx_dec_buffer(xdr, res->output_token);
822 if (err) 821 if (err)
823 return err; 822 return err;
@@ -826,7 +825,10 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp,
826 } 825 }
827 826
828 /* res->delegated_cred_handle */ 827 /* res->delegated_cred_handle */
829 if (gssx_check_pointer(xdr)) { 828 err = gssx_dec_bool(xdr, &value_follows);
829 if (err)
830 return err;
831 if (value_follows) {
830 /* we do not support upcall servers sending this data. */ 832 /* we do not support upcall servers sending this data. */
831 return -EINVAL; 833 return -EINVAL;
832 } 834 }
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 3f7930f938cc..5a750b9c3640 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -360,7 +360,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, stru
360 360
361 auth = rpcauth_create(args->authflavor, clnt); 361 auth = rpcauth_create(args->authflavor, clnt);
362 if (IS_ERR(auth)) { 362 if (IS_ERR(auth)) {
363 printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n", 363 dprintk("RPC: Couldn't create auth handle (flavor %u)\n",
364 args->authflavor); 364 args->authflavor);
365 err = PTR_ERR(auth); 365 err = PTR_ERR(auth);
366 goto out_no_auth; 366 goto out_no_auth;
diff --git a/net/sunrpc/netns.h b/net/sunrpc/netns.h
index 7111a4c9113b..74d948f5d5a1 100644
--- a/net/sunrpc/netns.h
+++ b/net/sunrpc/netns.h
@@ -28,7 +28,11 @@ struct sunrpc_net {
28 wait_queue_head_t gssp_wq; 28 wait_queue_head_t gssp_wq;
29 struct rpc_clnt *gssp_clnt; 29 struct rpc_clnt *gssp_clnt;
30 int use_gss_proxy; 30 int use_gss_proxy;
31 int pipe_version;
32 atomic_t pipe_users;
31 struct proc_dir_entry *use_gssp_proc; 33 struct proc_dir_entry *use_gssp_proc;
34
35 unsigned int gssd_running;
32}; 36};
33 37
34extern int sunrpc_net_id; 38extern int sunrpc_net_id;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index a9129f8d7070..e7ce4b3eb0bd 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -216,11 +216,14 @@ rpc_destroy_inode(struct inode *inode)
216static int 216static int
217rpc_pipe_open(struct inode *inode, struct file *filp) 217rpc_pipe_open(struct inode *inode, struct file *filp)
218{ 218{
219 struct net *net = inode->i_sb->s_fs_info;
220 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
219 struct rpc_pipe *pipe; 221 struct rpc_pipe *pipe;
220 int first_open; 222 int first_open;
221 int res = -ENXIO; 223 int res = -ENXIO;
222 224
223 mutex_lock(&inode->i_mutex); 225 mutex_lock(&inode->i_mutex);
226 sn->gssd_running = 1;
224 pipe = RPC_I(inode)->pipe; 227 pipe = RPC_I(inode)->pipe;
225 if (pipe == NULL) 228 if (pipe == NULL)
226 goto out; 229 goto out;
@@ -1069,6 +1072,8 @@ void rpc_pipefs_init_net(struct net *net)
1069 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id); 1072 struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
1070 1073
1071 mutex_init(&sn->pipefs_sb_lock); 1074 mutex_init(&sn->pipefs_sb_lock);
1075 sn->gssd_running = 1;
1076 sn->pipe_version = -1;
1072} 1077}
1073 1078
1074/* 1079/*
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index f8529fc8e542..5356b120dbf8 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -324,11 +324,17 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
324 * Note: If the task is ASYNC, and is being made runnable after sitting on an 324 * Note: If the task is ASYNC, and is being made runnable after sitting on an
325 * rpc_wait_queue, this must be called with the queue spinlock held to protect 325 * rpc_wait_queue, this must be called with the queue spinlock held to protect
326 * the wait queue operation. 326 * the wait queue operation.
327 * Note the ordering of rpc_test_and_set_running() and rpc_clear_queued(),
328 * which is needed to ensure that __rpc_execute() doesn't loop (due to the
329 * lockless RPC_IS_QUEUED() test) before we've had a chance to test
330 * the RPC_TASK_RUNNING flag.
327 */ 331 */
328static void rpc_make_runnable(struct rpc_task *task) 332static void rpc_make_runnable(struct rpc_task *task)
329{ 333{
334 bool need_wakeup = !rpc_test_and_set_running(task);
335
330 rpc_clear_queued(task); 336 rpc_clear_queued(task);
331 if (rpc_test_and_set_running(task)) 337 if (!need_wakeup)
332 return; 338 return;
333 if (RPC_IS_ASYNC(task)) { 339 if (RPC_IS_ASYNC(task)) {
334 INIT_WORK(&task->u.tk_work, rpc_async_schedule); 340 INIT_WORK(&task->u.tk_work, rpc_async_schedule);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index daa6080a2a0c..a80feee5197a 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -2306,8 +2306,11 @@ static int link_recv_changeover_msg(struct tipc_link **l_ptr,
2306 struct tipc_msg *tunnel_msg = buf_msg(tunnel_buf); 2306 struct tipc_msg *tunnel_msg = buf_msg(tunnel_buf);
2307 u32 msg_typ = msg_type(tunnel_msg); 2307 u32 msg_typ = msg_type(tunnel_msg);
2308 u32 msg_count = msg_msgcnt(tunnel_msg); 2308 u32 msg_count = msg_msgcnt(tunnel_msg);
2309 u32 bearer_id = msg_bearer_id(tunnel_msg);
2309 2310
2310 dest_link = (*l_ptr)->owner->links[msg_bearer_id(tunnel_msg)]; 2311 if (bearer_id >= MAX_BEARERS)
2312 goto exit;
2313 dest_link = (*l_ptr)->owner->links[bearer_id];
2311 if (!dest_link) 2314 if (!dest_link)
2312 goto exit; 2315 goto exit;
2313 if (dest_link == *l_ptr) { 2316 if (dest_link == *l_ptr) {
@@ -2521,14 +2524,16 @@ int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb,
2521 struct tipc_msg *imsg = (struct tipc_msg *)msg_data(fragm); 2524 struct tipc_msg *imsg = (struct tipc_msg *)msg_data(fragm);
2522 u32 msg_sz = msg_size(imsg); 2525 u32 msg_sz = msg_size(imsg);
2523 u32 fragm_sz = msg_data_sz(fragm); 2526 u32 fragm_sz = msg_data_sz(fragm);
2524 u32 exp_fragm_cnt = msg_sz/fragm_sz + !!(msg_sz % fragm_sz); 2527 u32 exp_fragm_cnt;
2525 u32 max = TIPC_MAX_USER_MSG_SIZE + NAMED_H_SIZE; 2528 u32 max = TIPC_MAX_USER_MSG_SIZE + NAMED_H_SIZE;
2529
2526 if (msg_type(imsg) == TIPC_MCAST_MSG) 2530 if (msg_type(imsg) == TIPC_MCAST_MSG)
2527 max = TIPC_MAX_USER_MSG_SIZE + MCAST_H_SIZE; 2531 max = TIPC_MAX_USER_MSG_SIZE + MCAST_H_SIZE;
2528 if (msg_size(imsg) > max) { 2532 if (fragm_sz == 0 || msg_size(imsg) > max) {
2529 kfree_skb(fbuf); 2533 kfree_skb(fbuf);
2530 return 0; 2534 return 0;
2531 } 2535 }
2536 exp_fragm_cnt = msg_sz / fragm_sz + !!(msg_sz % fragm_sz);
2532 pbuf = tipc_buf_acquire(msg_size(imsg)); 2537 pbuf = tipc_buf_acquire(msg_size(imsg));
2533 if (pbuf != NULL) { 2538 if (pbuf != NULL) {
2534 pbuf->next = *pending; 2539 pbuf->next = *pending;
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 84c9ad7e1dca..73405e00c800 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -638,17 +638,21 @@ int wiphy_register(struct wiphy *wiphy)
638 * cfg80211_mutex lock 638 * cfg80211_mutex lock
639 */ 639 */
640 res = rfkill_register(rdev->rfkill); 640 res = rfkill_register(rdev->rfkill);
641 if (res) 641 if (res) {
642 goto out_rm_dev; 642 device_del(&rdev->wiphy.dev);
643
644 mutex_lock(&cfg80211_mutex);
645 debugfs_remove_recursive(rdev->wiphy.debugfsdir);
646 list_del_rcu(&rdev->list);
647 wiphy_regulatory_deregister(wiphy);
648 mutex_unlock(&cfg80211_mutex);
649 return res;
650 }
643 651
644 rtnl_lock(); 652 rtnl_lock();
645 rdev->wiphy.registered = true; 653 rdev->wiphy.registered = true;
646 rtnl_unlock(); 654 rtnl_unlock();
647 return 0; 655 return 0;
648
649out_rm_dev:
650 device_del(&rdev->wiphy.dev);
651 return res;
652} 656}
653EXPORT_SYMBOL(wiphy_register); 657EXPORT_SYMBOL(wiphy_register);
654 658
@@ -866,7 +870,6 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev,
866#endif 870#endif
867 __cfg80211_disconnect(rdev, dev, 871 __cfg80211_disconnect(rdev, dev,
868 WLAN_REASON_DEAUTH_LEAVING, true); 872 WLAN_REASON_DEAUTH_LEAVING, true);
869 cfg80211_mlme_down(rdev, dev);
870 wdev_unlock(wdev); 873 wdev_unlock(wdev);
871 break; 874 break;
872 case NL80211_IFTYPE_MESH_POINT: 875 case NL80211_IFTYPE_MESH_POINT:
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index afa283841e8c..dfdb5e643211 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7577,6 +7577,8 @@ static int nl80211_send_wowlan_tcp(struct sk_buff *msg,
7577 &tcp->payload_tok)) 7577 &tcp->payload_tok))
7578 return -ENOBUFS; 7578 return -ENOBUFS;
7579 7579
7580 nla_nest_end(msg, nl_tcp);
7581
7580 return 0; 7582 return 0;
7581} 7583}
7582 7584
@@ -9970,6 +9972,7 @@ int nl80211_send_mgmt(struct cfg80211_registered_device *rdev,
9970 if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || 9972 if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
9971 (netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, 9973 (netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
9972 netdev->ifindex)) || 9974 netdev->ifindex)) ||
9975 nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) ||
9973 nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) || 9976 nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, freq) ||
9974 (sig_dbm && 9977 (sig_dbm &&
9975 nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) || 9978 nla_put_u32(msg, NL80211_ATTR_RX_SIGNAL_DBM, sig_dbm)) ||
@@ -10010,6 +10013,7 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
10010 if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || 10013 if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
10011 (netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX, 10014 (netdev && nla_put_u32(msg, NL80211_ATTR_IFINDEX,
10012 netdev->ifindex)) || 10015 netdev->ifindex)) ||
10016 nla_put_u64(msg, NL80211_ATTR_WDEV, wdev_id(wdev)) ||
10013 nla_put(msg, NL80211_ATTR_FRAME, len, buf) || 10017 nla_put(msg, NL80211_ATTR_FRAME, len, buf) ||
10014 nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie) || 10018 nla_put_u64(msg, NL80211_ATTR_COOKIE, cookie) ||
10015 (ack && nla_put_flag(msg, NL80211_ATTR_ACK))) 10019 (ack && nla_put_flag(msg, NL80211_ATTR_ACK)))
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index a9dc5c736df0..8b5eddfba1e5 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -961,7 +961,7 @@ int __cfg80211_disconnect(struct cfg80211_registered_device *rdev,
961 /* was it connected by userspace SME? */ 961 /* was it connected by userspace SME? */
962 if (!wdev->conn) { 962 if (!wdev->conn) {
963 cfg80211_mlme_down(rdev, dev); 963 cfg80211_mlme_down(rdev, dev);
964 return 0; 964 goto disconnect;
965 } 965 }
966 966
967 if (wdev->sme_state == CFG80211_SME_CONNECTING && 967 if (wdev->sme_state == CFG80211_SME_CONNECTING &&
@@ -987,6 +987,7 @@ int __cfg80211_disconnect(struct cfg80211_registered_device *rdev,
987 return err; 987 return err;
988 } 988 }
989 989
990 disconnect:
990 if (wdev->sme_state == CFG80211_SME_CONNECTED) 991 if (wdev->sme_state == CFG80211_SME_CONNECTED)
991 __cfg80211_disconnected(dev, NULL, 0, 0, false); 992 __cfg80211_disconnected(dev, NULL, 0, 0, false);
992 else if (wdev->sme_state == CFG80211_SME_CONNECTING) 993 else if (wdev->sme_state == CFG80211_SME_CONNECTING)
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index ecd4fcec3c94..5755bc14abbd 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2441,6 +2441,7 @@ TRACE_EVENT(cfg80211_report_wowlan_wakeup,
2441 TP_STRUCT__entry( 2441 TP_STRUCT__entry(
2442 WIPHY_ENTRY 2442 WIPHY_ENTRY
2443 WDEV_ENTRY 2443 WDEV_ENTRY
2444 __field(bool, non_wireless)
2444 __field(bool, disconnect) 2445 __field(bool, disconnect)
2445 __field(bool, magic_pkt) 2446 __field(bool, magic_pkt)
2446 __field(bool, gtk_rekey_failure) 2447 __field(bool, gtk_rekey_failure)
@@ -2449,20 +2450,22 @@ TRACE_EVENT(cfg80211_report_wowlan_wakeup,
2449 __field(bool, rfkill_release) 2450 __field(bool, rfkill_release)
2450 __field(s32, pattern_idx) 2451 __field(s32, pattern_idx)
2451 __field(u32, packet_len) 2452 __field(u32, packet_len)
2452 __dynamic_array(u8, packet, wakeup->packet_present_len) 2453 __dynamic_array(u8, packet,
2454 wakeup ? wakeup->packet_present_len : 0)
2453 ), 2455 ),
2454 TP_fast_assign( 2456 TP_fast_assign(
2455 WIPHY_ASSIGN; 2457 WIPHY_ASSIGN;
2456 WDEV_ASSIGN; 2458 WDEV_ASSIGN;
2457 __entry->disconnect = wakeup->disconnect; 2459 __entry->non_wireless = !wakeup;
2458 __entry->magic_pkt = wakeup->magic_pkt; 2460 __entry->disconnect = wakeup ? wakeup->disconnect : false;
2459 __entry->gtk_rekey_failure = wakeup->gtk_rekey_failure; 2461 __entry->magic_pkt = wakeup ? wakeup->magic_pkt : false;
2460 __entry->eap_identity_req = wakeup->eap_identity_req; 2462 __entry->gtk_rekey_failure = wakeup ? wakeup->gtk_rekey_failure : false;
2461 __entry->four_way_handshake = wakeup->four_way_handshake; 2463 __entry->eap_identity_req = wakeup ? wakeup->eap_identity_req : false;
2462 __entry->rfkill_release = wakeup->rfkill_release; 2464 __entry->four_way_handshake = wakeup ? wakeup->four_way_handshake : false;
2463 __entry->pattern_idx = wakeup->pattern_idx; 2465 __entry->rfkill_release = wakeup ? wakeup->rfkill_release : false;
2464 __entry->packet_len = wakeup->packet_len; 2466 __entry->pattern_idx = wakeup ? wakeup->pattern_idx : false;
2465 if (wakeup->packet && wakeup->packet_present_len) 2467 __entry->packet_len = wakeup ? wakeup->packet_len : false;
2468 if (wakeup && wakeup->packet && wakeup->packet_present_len)
2466 memcpy(__get_dynamic_array(packet), wakeup->packet, 2469 memcpy(__get_dynamic_array(packet), wakeup->packet,
2467 wakeup->packet_present_len); 2470 wakeup->packet_present_len);
2468 ), 2471 ),
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index bcfda8921b5b..0cf003dfa8fc 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -64,6 +64,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
64 64
65 if (unlikely(x->km.state != XFRM_STATE_VALID)) { 65 if (unlikely(x->km.state != XFRM_STATE_VALID)) {
66 XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEINVALID); 66 XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEINVALID);
67 err = -EINVAL;
67 goto error; 68 goto error;
68 } 69 }
69 70