Diffstat (limited to 'net')
-rw-r--r--  net/8021q/vlan_dev.c | 52
-rw-r--r--  net/atm/common.c | 1
-rw-r--r--  net/atm/pvc.c | 1
-rw-r--r--  net/batman-adv/gateway_client.c | 6
-rw-r--r--  net/batman-adv/translation-table.c | 1
-rw-r--r--  net/bluetooth/hci_event.c | 28
-rw-r--r--  net/bluetooth/hci_sock.c | 2
-rw-r--r--  net/bluetooth/l2cap_core.c | 1
-rw-r--r--  net/bluetooth/l2cap_sock.c | 3
-rw-r--r--  net/bluetooth/rfcomm/sock.c | 2
-rw-r--r--  net/bluetooth/rfcomm/tty.c | 2
-rw-r--r--  net/bluetooth/sco.c | 19
-rw-r--r--  net/bluetooth/smp.c | 5
-rw-r--r--  net/bridge/br_device.c | 30
-rw-r--r--  net/bridge/br_forward.c | 2
-rw-r--r--  net/bridge/br_if.c | 6
-rw-r--r--  net/bridge/br_private.h | 4
-rw-r--r--  net/bridge/br_sysfs_if.c | 6
-rw-r--r--  net/caif/caif_socket.c | 2
-rw-r--r--  net/caif/chnl_net.c | 4
-rw-r--r--  net/ceph/ceph_common.c | 26
-rw-r--r--  net/ceph/crush/mapper.c | 13
-rw-r--r--  net/ceph/crypto.c | 1
-rw-r--r--  net/ceph/crypto.h | 3
-rw-r--r--  net/ceph/debugfs.c | 4
-rw-r--r--  net/ceph/messenger.c | 932
-rw-r--r--  net/ceph/mon_client.c | 127
-rw-r--r--  net/ceph/msgpool.c | 7
-rw-r--r--  net/ceph/osd_client.c | 77
-rw-r--r--  net/ceph/osdmap.c | 59
-rw-r--r--  net/core/dev.c | 84
-rw-r--r--  net/core/dst.c | 10
-rw-r--r--  net/core/filter.c | 8
-rw-r--r--  net/core/netpoll.c | 99
-rw-r--r--  net/core/netprio_cgroup.c | 30
-rw-r--r--  net/core/rtnetlink.c | 17
-rw-r--r--  net/core/scm.c | 4
-rw-r--r--  net/core/skbuff.c | 124
-rw-r--r--  net/core/sock.c | 60
-rw-r--r--  net/dccp/ccid.h | 4
-rw-r--r--  net/dccp/ccids/ccid3.c | 1
-rw-r--r--  net/ipv4/Makefile | 2
-rw-r--r--  net/ipv4/arp.c | 2
-rw-r--r--  net/ipv4/fib_frontend.c | 1
-rw-r--r--  net/ipv4/fib_semantics.c | 42
-rw-r--r--  net/ipv4/fib_trie.c | 55
-rw-r--r--  net/ipv4/inet_connection_sock.c | 7
-rw-r--r--  net/ipv4/ip_fragment.c | 4
-rw-r--r--  net/ipv4/ip_input.c | 12
-rw-r--r--  net/ipv4/ip_output.c | 10
-rw-r--r--  net/ipv4/ipmr.c | 14
-rw-r--r--  net/ipv4/netfilter/nf_nat_sip.c | 14
-rw-r--r--  net/ipv4/route.c | 242
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 15
-rw-r--r--  net/ipv4/tcp.c | 9
-rw-r--r--  net/ipv4/tcp_cong.c | 3
-rw-r--r--  net/ipv4/tcp_input.c | 44
-rw-r--r--  net/ipv4/tcp_ipv4.c | 48
-rw-r--r--  net/ipv4/tcp_metrics.c | 12
-rw-r--r--  net/ipv4/tcp_minisocks.c | 2
-rw-r--r--  net/ipv4/tcp_output.c | 49
-rw-r--r--  net/ipv4/tcp_timer.c | 6
-rw-r--r--  net/ipv4/udp.c | 2
-rw-r--r--  net/ipv4/xfrm4_input.c | 4
-rw-r--r--  net/ipv4/xfrm4_policy.c | 1
-rw-r--r--  net/ipv6/addrconf.c | 4
-rw-r--r--  net/ipv6/esp6.c | 6
-rw-r--r--  net/ipv6/ip6_input.c | 11
-rw-r--r--  net/ipv6/proc.c | 4
-rw-r--r--  net/ipv6/route.c | 8
-rw-r--r--  net/ipv6/tcp_ipv6.c | 73
-rw-r--r--  net/ipv6/xfrm6_policy.c | 8
-rw-r--r--  net/l2tp/l2tp_core.c | 3
-rw-r--r--  net/l2tp/l2tp_core.h | 1
-rw-r--r--  net/l2tp/l2tp_ip6.c | 1
-rw-r--r--  net/llc/af_llc.c | 8
-rw-r--r--  net/llc/llc_input.c | 21
-rw-r--r--  net/llc/llc_station.c | 29
-rw-r--r--  net/mac80211/led.c | 2
-rw-r--r--  net/mac80211/mesh.c | 3
-rw-r--r--  net/mac80211/mlme.c | 2
-rw-r--r--  net/mac80211/scan.c | 3
-rw-r--r--  net/mac80211/tx.c | 38
-rw-r--r--  net/netfilter/ipvs/ip_vs_ctl.c | 5
-rw-r--r--  net/netfilter/nf_conntrack_core.c | 16
-rw-r--r--  net/netfilter/nf_conntrack_expect.c | 29
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c | 10
-rw-r--r--  net/netfilter/nf_conntrack_sip.c | 92
-rw-r--r--  net/netfilter/nfnetlink_log.c | 6
-rw-r--r--  net/netlink/af_netlink.c | 6
-rw-r--r--  net/openvswitch/actions.c | 3
-rw-r--r--  net/packet/af_packet.c | 12
-rw-r--r--  net/sched/act_gact.c | 14
-rw-r--r--  net/sched/act_ipt.c | 7
-rw-r--r--  net/sched/act_mirred.c | 11
-rw-r--r--  net/sched/act_pedit.c | 5
-rw-r--r--  net/sched/act_simple.c | 5
-rw-r--r--  net/sched/sch_qfq.c | 95
-rw-r--r--  net/sctp/ulpevent.c | 3
-rw-r--r--  net/socket.c | 5
-rw-r--r--  net/sunrpc/Kconfig | 5
-rw-r--r--  net/sunrpc/auth.c | 54
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c | 1
-rw-r--r--  net/sunrpc/auth_gss/gss_mech_switch.c | 20
-rw-r--r--  net/sunrpc/cache.c | 5
-rw-r--r--  net/sunrpc/clnt.c | 12
-rw-r--r--  net/sunrpc/rpcb_clnt.c | 4
-rw-r--r--  net/sunrpc/sched.c | 14
-rw-r--r--  net/sunrpc/svc_xprt.c | 10
-rw-r--r--  net/sunrpc/svcsock.c | 2
-rw-r--r--  net/sunrpc/xdr.c | 127
-rw-r--r--  net/sunrpc/xprtrdma/transport.c | 3
-rw-r--r--  net/sunrpc/xprtsock.c | 53
-rw-r--r--  net/unix/af_unix.c | 97
-rw-r--r--  net/wanrouter/wanmain.c | 51
-rw-r--r--  net/wireless/core.c | 5
-rw-r--r--  net/wireless/core.h | 1
-rw-r--r--  net/wireless/reg.c | 19
-rw-r--r--  net/wireless/util.c | 2
-rw-r--r--  net/xfrm/xfrm_policy.c | 2
-rw-r--r--  net/xfrm/xfrm_state.c | 25
121 files changed, 2346 insertions(+), 1182 deletions(-)
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 73a2a83ee2da..402442402af7 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -137,9 +137,21 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev,
 	return rc;
 }
 
+static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb)
+{
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	if (vlan->netpoll)
+		netpoll_send_skb(vlan->netpoll, skb);
+#else
+	BUG();
+#endif
+	return NETDEV_TX_OK;
+}
+
 static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 					    struct net_device *dev)
 {
+	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
 	struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data);
 	unsigned int len;
 	int ret;
@@ -150,29 +162,30 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 	 * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs...
 	 */
 	if (veth->h_vlan_proto != htons(ETH_P_8021Q) ||
-	    vlan_dev_priv(dev)->flags & VLAN_FLAG_REORDER_HDR) {
+	    vlan->flags & VLAN_FLAG_REORDER_HDR) {
 		u16 vlan_tci;
-		vlan_tci = vlan_dev_priv(dev)->vlan_id;
+		vlan_tci = vlan->vlan_id;
 		vlan_tci |= vlan_dev_get_egress_qos_mask(dev, skb);
 		skb = __vlan_hwaccel_put_tag(skb, vlan_tci);
 	}
 
-	skb->dev = vlan_dev_priv(dev)->real_dev;
+	skb->dev = vlan->real_dev;
 	len = skb->len;
-	if (netpoll_tx_running(dev))
-		return skb->dev->netdev_ops->ndo_start_xmit(skb, skb->dev);
+	if (unlikely(netpoll_tx_running(dev)))
+		return vlan_netpoll_send_skb(vlan, skb);
+
 	ret = dev_queue_xmit(skb);
 
 	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
 		struct vlan_pcpu_stats *stats;
 
-		stats = this_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats);
+		stats = this_cpu_ptr(vlan->vlan_pcpu_stats);
 		u64_stats_update_begin(&stats->syncp);
 		stats->tx_packets++;
 		stats->tx_bytes += len;
 		u64_stats_update_end(&stats->syncp);
 	} else {
-		this_cpu_inc(vlan_dev_priv(dev)->vlan_pcpu_stats->tx_dropped);
+		this_cpu_inc(vlan->vlan_pcpu_stats->tx_dropped);
 	}
 
 	return ret;
@@ -669,25 +682,26 @@ static void vlan_dev_poll_controller(struct net_device *dev)
 	return;
 }
 
-static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo)
+static int vlan_dev_netpoll_setup(struct net_device *dev, struct netpoll_info *npinfo,
+				  gfp_t gfp)
 {
-	struct vlan_dev_priv *info = vlan_dev_priv(dev);
-	struct net_device *real_dev = info->real_dev;
+	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+	struct net_device *real_dev = vlan->real_dev;
 	struct netpoll *netpoll;
 	int err = 0;
 
-	netpoll = kzalloc(sizeof(*netpoll), GFP_KERNEL);
+	netpoll = kzalloc(sizeof(*netpoll), gfp);
 	err = -ENOMEM;
 	if (!netpoll)
 		goto out;
 
-	err = __netpoll_setup(netpoll, real_dev);
+	err = __netpoll_setup(netpoll, real_dev, gfp);
 	if (err) {
 		kfree(netpoll);
 		goto out;
 	}
 
-	info->netpoll = netpoll;
+	vlan->netpoll = netpoll;
 
 out:
 	return err;
@@ -695,19 +709,15 @@ out:
 
 static void vlan_dev_netpoll_cleanup(struct net_device *dev)
 {
-	struct vlan_dev_priv *info = vlan_dev_priv(dev);
-	struct netpoll *netpoll = info->netpoll;
+	struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
+	struct netpoll *netpoll = vlan->netpoll;
 
 	if (!netpoll)
 		return;
 
-	info->netpoll = NULL;
-
-	/* Wait for transmitting packets to finish before freeing. */
-	synchronize_rcu_bh();
+	vlan->netpoll = NULL;
 
-	__netpoll_cleanup(netpoll);
-	kfree(netpoll);
+	__netpoll_free_rcu(netpoll);
 }
 #endif /* CONFIG_NET_POLL_CONTROLLER */
 
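Note: the tx path above bumps vlan->vlan_pcpu_stats under u64_stats_update_begin()/u64_stats_update_end(). For context, this is how a reader side typically folds such per-CPU 64-bit counters; a sketch modeled on the usual stats64 pattern, not code from this patch:

/* Sketch (not part of this patch): snapshotting the per-CPU counters
 * that vlan_dev_hard_start_xmit() updates.  The seqcount in
 * stats->syncp lets 64-bit counters be read consistently on 32-bit
 * SMP without taking a lock in the transmit hot path.
 */
static void vlan_fold_stats_sketch(const struct vlan_dev_priv *vlan,
				   u64 *tx_packets, u64 *tx_bytes)
{
	int cpu;

	*tx_packets = 0;
	*tx_bytes = 0;
	for_each_possible_cpu(cpu) {
		const struct vlan_pcpu_stats *stats;
		u64 packets, bytes;
		unsigned int start;

		stats = per_cpu_ptr(vlan->vlan_pcpu_stats, cpu);
		do {
			start = u64_stats_fetch_begin(&stats->syncp);
			packets = stats->tx_packets;
			bytes = stats->tx_bytes;
		} while (u64_stats_fetch_retry(&stats->syncp, start));
		*tx_packets += packets;
		*tx_bytes += bytes;
	}
}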
diff --git a/net/atm/common.c b/net/atm/common.c
index b4b44dbed645..0c0ad930a632 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -812,6 +812,7 @@ int vcc_getsockopt(struct socket *sock, int level, int optname,
 
 	if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags))
 		return -ENOTCONN;
+	memset(&pvc, 0, sizeof(pvc));
 	pvc.sap_family = AF_ATMPVC;
 	pvc.sap_addr.itf = vcc->dev->number;
 	pvc.sap_addr.vpi = vcc->vpi;
diff --git a/net/atm/pvc.c b/net/atm/pvc.c
index 3a734919c36c..ae0324021407 100644
--- a/net/atm/pvc.c
+++ b/net/atm/pvc.c
@@ -95,6 +95,7 @@ static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr,
 		return -ENOTCONN;
 	*sockaddr_len = sizeof(struct sockaddr_atmpvc);
 	addr = (struct sockaddr_atmpvc *)sockaddr;
+	memset(addr, 0, sizeof(*addr));
 	addr->sap_family = AF_ATMPVC;
 	addr->sap_addr.itf = vcc->dev->number;
 	addr->sap_addr.vpi = vcc->vpi;
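Note: both ATM hunks are the same class of fix. A sockaddr struct contains compiler-inserted padding, and assigning each named field leaves that padding holding stale kernel stack bytes, which a later copy to userspace leaks. A small standalone C illustration of why the memset is needed (the struct layout here is made up; only the padding behaviour matters):

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for a sockaddr: on common ABIs the compiler
 * inserts 2 bytes of padding after each short member. */
struct fake_sockaddr_pvc {
	short family;
	int itf;
	short vpi;
	int vci;
};

int main(void)
{
	unsigned char stale[sizeof(struct fake_sockaddr_pvc)];
	struct fake_sockaddr_pvc sa;
	unsigned char *p = (unsigned char *)&sa;
	size_t i;

	memset(stale, 0xAA, sizeof(stale));	/* simulate old stack data */
	memcpy(&sa, stale, sizeof(sa));

	/* The buggy pattern: every field set, padding untouched. */
	sa.family = 1;
	sa.itf = 2;
	sa.vpi = 3;
	sa.vci = 4;

	for (i = 0; i < sizeof(sa); i++)
		if (p[i] == 0xAA)
			printf("byte %zu still holds stale data\n", i);

	/* The fix applied above: zero the whole object first. */
	memset(&sa, 0, sizeof(sa));
	sa.family = 1;
	return 0;
}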
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index b421cc49d2cd..fc866f2e4528 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -200,11 +200,11 @@ void batadv_gw_election(struct batadv_priv *bat_priv)
 	if (atomic_read(&bat_priv->gw_mode) != BATADV_GW_MODE_CLIENT)
 		goto out;
 
-	if (!batadv_atomic_dec_not_zero(&bat_priv->gw_reselect))
-		goto out;
-
 	curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
 
+	if (!batadv_atomic_dec_not_zero(&bat_priv->gw_reselect) && curr_gw)
+		goto out;
+
 	next_gw = batadv_gw_get_best_gw_node(bat_priv);
 
 	if (curr_gw == next_gw)
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index a438f4b582fc..99dd8f75b3ff 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -197,6 +197,7 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv,
 del:
 	list_del(&entry->list);
 	kfree(entry);
+	kfree(tt_change_node);
 	event_removed = true;
 	goto unlock;
 	}
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 41ff978a33f9..715d7e33fba0 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1365,6 +1365,9 @@ static bool hci_resolve_next_name(struct hci_dev *hdev)
 		return false;
 
 	e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, NAME_NEEDED);
+	if (!e)
+		return false;
+
 	if (hci_resolve_name(hdev, e) == 0) {
 		e->name_state = NAME_PENDING;
 		return true;
@@ -1393,12 +1396,20 @@ static void hci_check_pending_name(struct hci_dev *hdev, struct hci_conn *conn,
 		return;
 
 	e = hci_inquiry_cache_lookup_resolve(hdev, bdaddr, NAME_PENDING);
-	if (e) {
+	/* If the device was not found in a list of found devices names of which
+	 * are pending. there is no need to continue resolving a next name as it
+	 * will be done upon receiving another Remote Name Request Complete
+	 * Event */
+	if (!e)
+		return;
+
+	list_del(&e->list);
+	if (name) {
 		e->name_state = NAME_KNOWN;
-		list_del(&e->list);
-		if (name)
-			mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00,
-					 e->data.rssi, name, name_len);
+		mgmt_remote_name(hdev, bdaddr, ACL_LINK, 0x00,
+				 e->data.rssi, name, name_len);
+	} else {
+		e->name_state = NAME_NOT_KNOWN;
 	}
 
 	if (hci_resolve_next_name(hdev))
@@ -1762,7 +1773,12 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
 	if (conn->type == ACL_LINK) {
 		conn->state = BT_CONFIG;
 		hci_conn_hold(conn);
-		conn->disc_timeout = HCI_DISCONN_TIMEOUT;
+
+		if (!conn->out && !hci_conn_ssp_enabled(conn) &&
+		    !hci_find_link_key(hdev, &ev->bdaddr))
+			conn->disc_timeout = HCI_PAIRING_TIMEOUT;
+		else
+			conn->disc_timeout = HCI_DISCONN_TIMEOUT;
 	} else
 		conn->state = BT_CONNECTED;
 
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index a7f04de03d79..19fdac78e555 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -694,6 +694,7 @@ static int hci_sock_getname(struct socket *sock, struct sockaddr *addr,
 	*addr_len = sizeof(*haddr);
 	haddr->hci_family = AF_BLUETOOTH;
 	haddr->hci_dev    = hdev->id;
+	haddr->hci_channel= 0;
 
 	release_sock(sk);
 	return 0;
@@ -1009,6 +1010,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname,
 	{
 		struct hci_filter *f = &hci_pi(sk)->filter;
 
+		memset(&uf, 0, sizeof(uf));
 		uf.type_mask = f->type_mask;
 		uf.opcode    = f->opcode;
 		uf.event_mask[0] = *((u32 *) f->event_mask + 0);
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index a8964db04bfb..daa149b7003c 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -1181,6 +1181,7 @@ static void l2cap_le_conn_ready(struct l2cap_conn *conn)
 	sk = chan->sk;
 
 	hci_conn_hold(conn->hcon);
+	conn->hcon->disc_timeout = HCI_DISCONN_TIMEOUT;
 
 	bacpy(&bt_sk(sk)->src, conn->src);
 	bacpy(&bt_sk(sk)->dst, conn->dst);
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index a4bb27e8427e..1497edd191a2 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -245,6 +245,7 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *l
 
 	BT_DBG("sock %p, sk %p", sock, sk);
 
+	memset(la, 0, sizeof(struct sockaddr_l2));
 	addr->sa_family = AF_BLUETOOTH;
 	*len = sizeof(struct sockaddr_l2);
 
@@ -1174,7 +1175,7 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int p
 
 	chan = l2cap_chan_create();
 	if (!chan) {
-		l2cap_sock_kill(sk);
+		sk_free(sk);
 		return NULL;
 	}
 
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index 7e1e59645c05..1a17850d093c 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -528,6 +528,7 @@ static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int *
 
 	BT_DBG("sock %p, sk %p", sock, sk);
 
+	memset(sa, 0, sizeof(*sa));
 	sa->rc_family  = AF_BLUETOOTH;
 	sa->rc_channel = rfcomm_pi(sk)->channel;
 	if (peer)
@@ -822,6 +823,7 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c
 	}
 
 	sec.level = rfcomm_pi(sk)->sec_level;
+	sec.key_size = 0;
 
 	len = min_t(unsigned int, len, sizeof(sec));
 	if (copy_to_user(optval, (char *) &sec, len))
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index cb960773c002..56f182393c4c 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -456,7 +456,7 @@ static int rfcomm_get_dev_list(void __user *arg)
 
 	size = sizeof(*dl) + dev_num * sizeof(*di);
 
-	dl = kmalloc(size, GFP_KERNEL);
+	dl = kzalloc(size, GFP_KERNEL);
 	if (!dl)
 		return -ENOMEM;
 
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 40bbe25dcff7..3589e21edb09 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -131,6 +131,15 @@ static int sco_conn_del(struct hci_conn *hcon, int err)
 		sco_sock_clear_timer(sk);
 		sco_chan_del(sk, err);
 		bh_unlock_sock(sk);
+
+		sco_conn_lock(conn);
+		conn->sk = NULL;
+		sco_pi(sk)->conn = NULL;
+		sco_conn_unlock(conn);
+
+		if (conn->hcon)
+			hci_conn_put(conn->hcon);
+
 		sco_sock_kill(sk);
 	}
 
@@ -821,16 +830,6 @@ static void sco_chan_del(struct sock *sk, int err)
 
 	BT_DBG("sk %p, conn %p, err %d", sk, conn, err);
 
-	if (conn) {
-		sco_conn_lock(conn);
-		conn->sk = NULL;
-		sco_pi(sk)->conn = NULL;
-		sco_conn_unlock(conn);
-
-		if (conn->hcon)
-			hci_conn_put(conn->hcon);
-	}
-
 	sk->sk_state = BT_CLOSED;
 	sk->sk_err = err;
 	sk->sk_state_change(sk);
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index 16ef0dc85a0a..901a616c8083 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -579,8 +579,11 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
 
 	if (!test_and_set_bit(HCI_CONN_LE_SMP_PEND, &conn->hcon->flags))
 		smp = smp_chan_create(conn);
+	else
+		smp = conn->smp_chan;
 
-	smp = conn->smp_chan;
+	if (!smp)
+		return SMP_UNSPECIFIED;
 
 	smp->preq[0] = SMP_CMD_PAIRING_REQ;
 	memcpy(&smp->preq[1], req, sizeof(*req));
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 333484537600..070e8a68cfc6 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -31,9 +31,11 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	struct net_bridge_mdb_entry *mdst;
 	struct br_cpu_netstats *brstats = this_cpu_ptr(br->stats);
 
+	rcu_read_lock();
 #ifdef CONFIG_BRIDGE_NETFILTER
 	if (skb->nf_bridge && (skb->nf_bridge->mask & BRNF_BRIDGED_DNAT)) {
 		br_nf_pre_routing_finish_bridge_slow(skb);
+		rcu_read_unlock();
 		return NETDEV_TX_OK;
 	}
 #endif
@@ -48,7 +50,6 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
 	skb_reset_mac_header(skb);
 	skb_pull(skb, ETH_HLEN);
 
-	rcu_read_lock();
 	if (is_broadcast_ether_addr(dest))
 		br_flood_deliver(br, skb);
 	else if (is_multicast_ether_addr(dest)) {
@@ -206,24 +207,23 @@ static void br_poll_controller(struct net_device *br_dev)
 static void br_netpoll_cleanup(struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
-	struct net_bridge_port *p, *n;
+	struct net_bridge_port *p;
 
-	list_for_each_entry_safe(p, n, &br->port_list, list) {
+	list_for_each_entry(p, &br->port_list, list)
 		br_netpoll_disable(p);
-	}
 }
 
-static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni)
+static int br_netpoll_setup(struct net_device *dev, struct netpoll_info *ni,
+			    gfp_t gfp)
 {
 	struct net_bridge *br = netdev_priv(dev);
-	struct net_bridge_port *p, *n;
+	struct net_bridge_port *p;
 	int err = 0;
 
-	list_for_each_entry_safe(p, n, &br->port_list, list) {
+	list_for_each_entry(p, &br->port_list, list) {
 		if (!p->dev)
 			continue;
-
-		err = br_netpoll_enable(p);
+		err = br_netpoll_enable(p, gfp);
 		if (err)
 			goto fail;
 	}
@@ -236,17 +236,17 @@ fail:
 	goto out;
 }
 
-int br_netpoll_enable(struct net_bridge_port *p)
+int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp)
 {
 	struct netpoll *np;
 	int err = 0;
 
-	np = kzalloc(sizeof(*p->np), GFP_KERNEL);
+	np = kzalloc(sizeof(*p->np), gfp);
 	err = -ENOMEM;
 	if (!np)
 		goto out;
 
-	err = __netpoll_setup(np, p->dev);
+	err = __netpoll_setup(np, p->dev, gfp);
 	if (err) {
 		kfree(np);
 		goto out;
@@ -267,11 +267,7 @@ void br_netpoll_disable(struct net_bridge_port *p)
 
 	p->np = NULL;
 
-	/* Wait for transmitting packets to finish before freeing. */
-	synchronize_rcu_bh();
-
-	__netpoll_cleanup(np);
-	kfree(np);
+	__netpoll_free_rcu(np);
 }
 
 #endif
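Note: both the bridge and VLAN cleanups above replace a blocking synchronize_rcu_bh() + kfree() sequence with __netpoll_free_rcu(), so the caller never waits for a grace period. A sketch of the general deferred-free idiom behind such a helper (names are illustrative, not the actual netpoll implementation):

/* Embed an rcu_head in the object and let call_rcu() free it after a
 * grace period; readers inside an RCU section keep a valid object
 * until the grace period ends, and the writer never blocks. */
struct my_obj {
	/* ... payload ... */
	struct rcu_head rcu;
};

static void my_obj_free_rcu(struct rcu_head *head)
{
	struct my_obj *obj = container_of(head, struct my_obj, rcu);

	kfree(obj);
}

static void my_obj_release(struct my_obj *obj)
{
	/* unpublish obj first (e.g. rcu_assign_pointer(slot, NULL)),
	 * then schedule the free instead of waiting for it. */
	call_rcu(&obj->rcu, my_obj_free_rcu);
}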
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index e9466d412707..02015a505d2a 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -65,7 +65,7 @@ static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb)
 {
 	skb->dev = to->dev;
 
-	if (unlikely(netpoll_tx_running(to->dev))) {
+	if (unlikely(netpoll_tx_running(to->br->dev))) {
 		if (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))
 			kfree_skb(skb);
 		else {
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index e1144e1617be..1c8fdc3558cd 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -361,7 +361,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
 	if (err)
 		goto err2;
 
-	if (br_netpoll_info(br) && ((err = br_netpoll_enable(p))))
+	if (br_netpoll_info(br) && ((err = br_netpoll_enable(p, GFP_KERNEL))))
 		goto err3;
 
 	err = netdev_set_master(dev, br->dev);
@@ -427,6 +427,10 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
 	if (!p || p->br != br)
 		return -EINVAL;
 
+	/* Since more than one interface can be attached to a bridge,
+	 * there still maybe an alternate path for netconsole to use;
+	 * therefore there is no reason for a NETDEV_RELEASE event.
+	 */
 	del_nbp(p);
 
 	spin_lock_bh(&br->lock);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index a768b2408edf..f507d2af9646 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -316,7 +316,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
 	netpoll_send_skb(np, skb);
 }
 
-extern int br_netpoll_enable(struct net_bridge_port *p);
+extern int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp);
 extern void br_netpoll_disable(struct net_bridge_port *p);
 #else
 static inline struct netpoll_info *br_netpoll_info(struct net_bridge *br)
@@ -329,7 +329,7 @@ static inline void br_netpoll_send_skb(const struct net_bridge_port *p,
 {
 }
 
-static inline int br_netpoll_enable(struct net_bridge_port *p)
+static inline int br_netpoll_enable(struct net_bridge_port *p, gfp_t gfp)
 {
 	return 0;
 }
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 6229b62749e8..13b36bdc76a7 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -27,7 +27,7 @@ struct brport_attribute {
 };
 
 #define BRPORT_ATTR(_name,_mode,_show,_store)			\
-struct brport_attribute brport_attr_##_name = {			\
+const struct brport_attribute brport_attr_##_name = {		\
 	.attr = {.name = __stringify(_name),			\
 		 .mode = _mode },				\
 	.show	= _show,					\
@@ -164,7 +164,7 @@ static BRPORT_ATTR(multicast_router, S_IRUGO | S_IWUSR, show_multicast_router,
 		   store_multicast_router);
 #endif
 
-static struct brport_attribute *brport_attrs[] = {
+static const struct brport_attribute *brport_attrs[] = {
 	&brport_attr_path_cost,
 	&brport_attr_priority,
 	&brport_attr_port_id,
@@ -241,7 +241,7 @@ const struct sysfs_ops brport_sysfs_ops = {
 int br_sysfs_addif(struct net_bridge_port *p)
 {
 	struct net_bridge *br = p->br;
-	struct brport_attribute **a;
+	const struct brport_attribute **a;
 	int err;
 
 	err = sysfs_create_link(&p->kobj, &br->dev->dev.kobj,
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 78f1cdad5b33..095259f83902 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -141,7 +141,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	err = sk_filter(sk, skb);
 	if (err)
 		return err;
-	if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) {
+	if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) {
 		set_rx_flow_off(cf_sk);
 		net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n");
 		caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 69771c04ba8f..e597733affb8 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -94,6 +94,10 @@ static int chnl_recv_cb(struct cflayer *layr, struct cfpkt *pkt)
 
 	/* check the version of IP */
 	ip_version = skb_header_pointer(skb, 0, 1, &buf);
+	if (!ip_version) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
 
 	switch (*ip_version >> 4) {
 	case 4:
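Note: skb_header_pointer() returns NULL when the skb is shorter than the requested range; the fix above stops chnl_recv_cb() from dereferencing that NULL on a zero-length packet. A userspace analogue of the bounds-checked accessor and the caller's obligation to check it (illustrative code, not the kernel implementation):

#include <stdio.h>
#include <string.h>

/* Analogue of skb_header_pointer(): return a pointer to `len` bytes
 * at `offset`, copied into `buffer`, or NULL when the packet is too
 * short. */
static const void *header_pointer(const unsigned char *pkt, size_t pkt_len,
				  size_t offset, size_t len, void *buffer)
{
	if (offset + len > pkt_len)
		return NULL;		/* truncated packet */
	memcpy(buffer, pkt + offset, len);
	return buffer;
}

int main(void)
{
	unsigned char empty_pkt[1];
	unsigned char buf;
	const unsigned char *ip_version;

	ip_version = header_pointer(empty_pkt, 0, 0, 1, &buf);
	if (!ip_version) {
		fprintf(stderr, "dropping zero-length packet\n");
		return 1;		/* mirrors kfree_skb() + -EINVAL */
	}
	printf("IP version %u\n", *ip_version >> 4);
	return 0;
}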
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index ba4323bce0e9..a8020293f342 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -17,6 +17,7 @@
 #include <linux/string.h>
 
 
+#include <linux/ceph/ceph_features.h>
 #include <linux/ceph/libceph.h>
 #include <linux/ceph/debugfs.h>
 #include <linux/ceph/decode.h>
@@ -83,7 +84,6 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
 			return -1;
 		}
 	} else {
-		pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
 		memcpy(&client->fsid, fsid, sizeof(*fsid));
 	}
 	return 0;
@@ -460,27 +460,23 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
 	client->auth_err = 0;
 
 	client->extra_mon_dispatch = NULL;
-	client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT |
+	client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |
 		supported_features;
-	client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT |
+	client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT |
 		required_features;
 
 	/* msgr */
 	if (ceph_test_opt(client, MYIP))
 		myaddr = &client->options->my_addr;
-	client->msgr = ceph_messenger_create(myaddr,
-					     client->supported_features,
-					     client->required_features);
-	if (IS_ERR(client->msgr)) {
-		err = PTR_ERR(client->msgr);
-		goto fail;
-	}
-	client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+	ceph_messenger_init(&client->msgr, myaddr,
+		client->supported_features,
+		client->required_features,
+		ceph_test_opt(client, NOCRC));
 
 	/* subsystems */
 	err = ceph_monc_init(&client->monc, client);
 	if (err < 0)
-		goto fail_msgr;
+		goto fail;
 	err = ceph_osdc_init(&client->osdc, client);
 	if (err < 0)
 		goto fail_monc;
@@ -489,8 +485,6 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
 
 fail_monc:
 	ceph_monc_stop(&client->monc);
-fail_msgr:
-	ceph_messenger_destroy(client->msgr);
 fail:
 	kfree(client);
 	return ERR_PTR(err);
@@ -501,6 +495,8 @@ void ceph_destroy_client(struct ceph_client *client)
 {
 	dout("destroy_client %p\n", client);
 
+	atomic_set(&client->msgr.stopping, 1);
+
 	/* unmount */
 	ceph_osdc_stop(&client->osdc);
 
@@ -508,8 +504,6 @@ void ceph_destroy_client(struct ceph_client *client)
 
 	ceph_debugfs_client_cleanup(client);
 
-	ceph_messenger_destroy(client->msgr);
-
 	ceph_destroy_options(client->options);
 
 	kfree(client);
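Note: the ceph_messenger_create()/ceph_messenger_destroy() pair is replaced by ceph_messenger_init() on a messenger embedded in struct ceph_client, which removes both the allocation-failure path and the matching destroy call. A schematic contrast under made-up types (not libceph code):

#include <stdio.h>

struct messenger { int nocrc; };

/* Before: a pointer member that must be allocated, error-checked,
 * and destroyed separately from its owner. */
struct client_old { struct messenger *msgr; };

/* After: a plain member initialized in place; it lives and dies with
 * the client, so creation loses an error path and teardown loses a
 * destroy call. */
struct client_new { struct messenger msgr; };

static void messenger_init(struct messenger *m, int nocrc)
{
	m->nocrc = nocrc;
}

int main(void)
{
	struct client_new client;

	messenger_init(&client.msgr, 1);	/* cannot fail: no allocation */
	printf("nocrc=%d\n", client.msgr.nocrc);
	return 0;
}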
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index d7edc24333b8..35fce755ce10 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
 	int item = 0;
 	int itemtype;
 	int collide, reject;
-	const unsigned int orig_tries = 5; /* attempts before we fall back to search */
 
 	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
 		bucket->id, x, outpos, numrep);
@@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
 					reject = 1;
 					goto reject;
 				}
-				if (flocal >= (in->size>>1) &&
-				    flocal > orig_tries)
+				if (map->choose_local_fallback_tries > 0 &&
+				    flocal >= (in->size>>1) &&
+				    flocal > map->choose_local_fallback_tries)
 					item = bucket_perm_choose(in, x, r);
 				else
 					item = crush_bucket_choose(in, x, r);
@@ -422,13 +422,14 @@ reject:
 				ftotal++;
 				flocal++;
 
-				if (collide && flocal < 3)
+				if (collide && flocal <= map->choose_local_tries)
 					/* retry locally a few times */
 					retry_bucket = 1;
-				else if (flocal <= in->size + orig_tries)
+				else if (map->choose_local_fallback_tries > 0 &&
+					 flocal <= in->size + map->choose_local_fallback_tries)
 					/* exhaustive bucket search */
 					retry_bucket = 1;
-				else if (ftotal < 20)
+				else if (ftotal <= map->choose_total_tries)
 					/* then retry descent */
 					retry_descent = 1;
 				else
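Note: this CRUSH change replaces three hardcoded retry limits with per-map tunables, and the comparisons change from < to <=. A standalone sketch of just the retry-decision logic (not the placement algorithm itself); the legacy values shown reproduce the old constants under the new operators:

#include <stdio.h>

struct crush_tunables {
	unsigned int choose_local_tries;		/* flocal < 3 becomes flocal <= 2 */
	unsigned int choose_local_fallback_tries;	/* was orig_tries = 5 */
	unsigned int choose_total_tries;		/* ftotal < 20 becomes ftotal <= 19 */
};

enum retry { GIVE_UP, RETRY_BUCKET, RETRY_DESCENT };

static enum retry next_step(const struct crush_tunables *t,
			    int collide, unsigned int flocal,
			    unsigned int ftotal, unsigned int bucket_size)
{
	if (collide && flocal <= t->choose_local_tries)
		return RETRY_BUCKET;		/* retry locally a few times */
	if (t->choose_local_fallback_tries > 0 &&
	    flocal <= bucket_size + t->choose_local_fallback_tries)
		return RETRY_BUCKET;		/* exhaustive bucket search */
	if (ftotal <= t->choose_total_tries)
		return RETRY_DESCENT;		/* then retry descent */
	return GIVE_UP;
}

int main(void)
{
	struct crush_tunables legacy = { 2, 5, 19 };

	printf("%d\n", next_step(&legacy, 1, 1, 1, 4));	/* 1 = RETRY_BUCKET */
	printf("%d\n", next_step(&legacy, 0, 30, 25, 4));	/* 0 = GIVE_UP */
	return 0;
}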
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index b780cb7947dd..9da7fdd3cd8a 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -466,6 +466,7 @@ void ceph_key_destroy(struct key *key) {
 	struct ceph_crypto_key *ckey = key->payload.data;
 
 	ceph_crypto_key_destroy(ckey);
+	kfree(ckey);
 }
 
 struct key_type key_type_ceph = {
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
index 1919d1550d75..3572dc518bc9 100644
--- a/net/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -16,7 +16,8 @@ struct ceph_crypto_key {
 
 static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
 {
-	kfree(key->key);
+	if (key)
+		kfree(key->key);
 }
 
 extern int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 54b531a01121..38b5dc1823d4 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -189,6 +189,9 @@ int ceph_debugfs_client_init(struct ceph_client *client)
 	snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
 		 client->monc.auth->global_id);
 
+	dout("ceph_debugfs_client_init %p %s\n", client, name);
+
+	BUG_ON(client->debugfs_dir);
 	client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
 	if (!client->debugfs_dir)
 		goto out;
@@ -234,6 +237,7 @@ out:
 
 void ceph_debugfs_client_cleanup(struct ceph_client *client)
 {
+	dout("ceph_debugfs_client_cleanup %p\n", client);
 	debugfs_remove(client->debugfs_osdmap);
 	debugfs_remove(client->debugfs_monmap);
 	debugfs_remove(client->osdc.debugfs_file);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 10255e81be79..24c5eea8c45b 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -29,6 +29,74 @@
  * the sender.
  */
 
+/*
+ * We track the state of the socket on a given connection using
+ * values defined below.  The transition to a new socket state is
+ * handled by a function which verifies we aren't coming from an
+ * unexpected state.
+ *
+ *      --------
+ *      | NEW* |  transient initial state
+ *      --------
+ *          | con_sock_state_init()
+ *          v
+ *      ----------
+ *      | CLOSED |  initialized, but no socket (and no
+ *      ----------  TCP connection)
+ *       ^      \
+ *       |       \ con_sock_state_connecting()
+ *       |        ----------------------
+ *       |                              \
+ *       + con_sock_state_closed()       \
+ *       |+---------------------------    \
+ *       | \                          \    \
+ *       |  -----------               \    \
+ *       |  | CLOSING |  socket event; \    \
+ *       |  -----------  await close    \    \
+ *       |       ^                       \   |
+ *       |       |                        \  |
+ *       |       + con_sock_state_closing()\ |
+ *       |      / \                        | |
+ *       |     /   ---------------         | |
+ *       |    /                   \        v v
+ *       |   /                    --------------
+ *       |  /    -----------------| CONNECTING |  socket created, TCP
+ *       |  |   /                 --------------  connect initiated
+ *       |  |   | con_sock_state_connected()
+ *       |  |   v
+ *      -------------
+ *      | CONNECTED |  TCP connection established
+ *      -------------
+ *
+ * State values for ceph_connection->sock_state; NEW is assumed to be 0.
+ */
+
+#define CON_SOCK_STATE_NEW		0	/* -> CLOSED */
+#define CON_SOCK_STATE_CLOSED		1	/* -> CONNECTING */
+#define CON_SOCK_STATE_CONNECTING	2	/* -> CONNECTED or -> CLOSING */
+#define CON_SOCK_STATE_CONNECTED	3	/* -> CLOSING or -> CLOSED */
+#define CON_SOCK_STATE_CLOSING		4	/* -> CLOSED */
+
+/*
+ * connection states
+ */
+#define CON_STATE_CLOSED	1	/* -> PREOPEN */
+#define CON_STATE_PREOPEN	2	/* -> CONNECTING, CLOSED */
+#define CON_STATE_CONNECTING	3	/* -> NEGOTIATING, CLOSED */
+#define CON_STATE_NEGOTIATING	4	/* -> OPEN, CLOSED */
+#define CON_STATE_OPEN		5	/* -> STANDBY, CLOSED */
+#define CON_STATE_STANDBY	6	/* -> PREOPEN, CLOSED */
+
+/*
+ * ceph_connection flag bits
+ */
+#define CON_FLAG_LOSSYTX	   0	/* we can close channel or drop
+					 * messages on errors */
+#define CON_FLAG_KEEPALIVE_PENDING 1	/* we need to send a keepalive */
+#define CON_FLAG_WRITE_PENDING	   2	/* we have data ready to send */
+#define CON_FLAG_SOCK_CLOSED	   3	/* socket state changed to closed */
+#define CON_FLAG_BACKOFF	   4	/* need to retry queuing delayed work */
+
 /* static tag bytes (protocol control messages) */
 static char tag_msg = CEPH_MSGR_TAG_MSG;
 static char tag_ack = CEPH_MSGR_TAG_ACK;
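Note: the con_sock_state_*() helpers introduced in the next hunk implement each transition as an atomic exchange followed by a check that the previous state was a legal predecessor, so illegal transitions are reported rather than silently taken. A minimal userspace C11 sketch of the same idiom:

#include <stdio.h>
#include <stdatomic.h>

#define SOCK_STATE_NEW        0
#define SOCK_STATE_CLOSED     1
#define SOCK_STATE_CONNECTING 2

static atomic_int sock_state = SOCK_STATE_NEW;

/* Atomically swap in the new state and verify where we came from,
 * mirroring the atomic_xchg() + WARN_ON() pattern in the patch. */
static void sock_state_connecting(void)
{
	int old_state = atomic_exchange(&sock_state, SOCK_STATE_CONNECTING);

	if (old_state != SOCK_STATE_CLOSED)
		fprintf(stderr, "unexpected old state %d\n", old_state);
}

int main(void)
{
	/* NEW -> CONNECTING skips CLOSED, so the check fires. */
	sock_state_connecting();
	return 0;
}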
@@ -147,72 +215,130 @@ void ceph_msgr_flush(void)
 }
 EXPORT_SYMBOL(ceph_msgr_flush);
 
+/* Connection socket state transition functions */
+
+static void con_sock_state_init(struct ceph_connection *con)
+{
+	int old_state;
+
+	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
+	if (WARN_ON(old_state != CON_SOCK_STATE_NEW))
+		printk("%s: unexpected old state %d\n", __func__, old_state);
+	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+	     CON_SOCK_STATE_CLOSED);
+}
+
+static void con_sock_state_connecting(struct ceph_connection *con)
+{
+	int old_state;
+
+	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTING);
+	if (WARN_ON(old_state != CON_SOCK_STATE_CLOSED))
+		printk("%s: unexpected old state %d\n", __func__, old_state);
+	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+	     CON_SOCK_STATE_CONNECTING);
+}
+
+static void con_sock_state_connected(struct ceph_connection *con)
+{
+	int old_state;
+
+	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CONNECTED);
+	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING))
+		printk("%s: unexpected old state %d\n", __func__, old_state);
+	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+	     CON_SOCK_STATE_CONNECTED);
+}
+
+static void con_sock_state_closing(struct ceph_connection *con)
+{
+	int old_state;
+
+	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSING);
+	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTING &&
+		    old_state != CON_SOCK_STATE_CONNECTED &&
+		    old_state != CON_SOCK_STATE_CLOSING))
+		printk("%s: unexpected old state %d\n", __func__, old_state);
+	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+	     CON_SOCK_STATE_CLOSING);
+}
+
+static void con_sock_state_closed(struct ceph_connection *con)
+{
+	int old_state;
+
+	old_state = atomic_xchg(&con->sock_state, CON_SOCK_STATE_CLOSED);
+	if (WARN_ON(old_state != CON_SOCK_STATE_CONNECTED &&
+		    old_state != CON_SOCK_STATE_CLOSING &&
+		    old_state != CON_SOCK_STATE_CONNECTING &&
+		    old_state != CON_SOCK_STATE_CLOSED))
+		printk("%s: unexpected old state %d\n", __func__, old_state);
+	dout("%s con %p sock %d -> %d\n", __func__, con, old_state,
+	     CON_SOCK_STATE_CLOSED);
+}
 
 /*
  * socket callback functions
  */
 
 /* data available on socket, or listen socket received a connect */
-static void ceph_data_ready(struct sock *sk, int count_unused)
+static void ceph_sock_data_ready(struct sock *sk, int count_unused)
 {
 	struct ceph_connection *con = sk->sk_user_data;
+	if (atomic_read(&con->msgr->stopping)) {
+		return;
+	}
 
 	if (sk->sk_state != TCP_CLOSE_WAIT) {
-		dout("ceph_data_ready on %p state = %lu, queueing work\n",
-		     con, con->state);
+		dout("%s on %p state = %lu, queueing work\n", __func__,
+		     con, con->state);
 		queue_con(con);
 	}
 }
 
 /* socket has buffer space for writing */
-static void ceph_write_space(struct sock *sk)
+static void ceph_sock_write_space(struct sock *sk)
 {
 	struct ceph_connection *con = sk->sk_user_data;
 
 	/* only queue to workqueue if there is data we want to write,
 	 * and there is sufficient space in the socket buffer to accept
-	 * more data.  clear SOCK_NOSPACE so that ceph_write_space()
+	 * more data.  clear SOCK_NOSPACE so that ceph_sock_write_space()
 	 * doesn't get called again until try_write() fills the socket
 	 * buffer. See net/ipv4/tcp_input.c:tcp_check_space()
 	 * and net/core/stream.c:sk_stream_write_space().
 	 */
-	if (test_bit(WRITE_PENDING, &con->state)) {
+	if (test_bit(CON_FLAG_WRITE_PENDING, &con->flags)) {
 		if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
-			dout("ceph_write_space %p queueing write work\n", con);
+			dout("%s %p queueing write work\n", __func__, con);
 			clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 			queue_con(con);
 		}
 	} else {
-		dout("ceph_write_space %p nothing to write\n", con);
+		dout("%s %p nothing to write\n", __func__, con);
 	}
 }
 
 /* socket's state has changed */
-static void ceph_state_change(struct sock *sk)
+static void ceph_sock_state_change(struct sock *sk)
 {
 	struct ceph_connection *con = sk->sk_user_data;
 
-	dout("ceph_state_change %p state = %lu sk_state = %u\n",
+	dout("%s %p state = %lu sk_state = %u\n", __func__,
 	     con, con->state, sk->sk_state);
 
-	if (test_bit(CLOSED, &con->state))
-		return;
-
 	switch (sk->sk_state) {
 	case TCP_CLOSE:
-		dout("ceph_state_change TCP_CLOSE\n");
+		dout("%s TCP_CLOSE\n", __func__);
 	case TCP_CLOSE_WAIT:
-		dout("ceph_state_change TCP_CLOSE_WAIT\n");
-		if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
-			if (test_bit(CONNECTING, &con->state))
-				con->error_msg = "connection failed";
-			else
-				con->error_msg = "socket closed";
-			queue_con(con);
-		}
+		dout("%s TCP_CLOSE_WAIT\n", __func__);
+		con_sock_state_closing(con);
+		set_bit(CON_FLAG_SOCK_CLOSED, &con->flags);
+		queue_con(con);
 		break;
 	case TCP_ESTABLISHED:
-		dout("ceph_state_change TCP_ESTABLISHED\n");
+		dout("%s TCP_ESTABLISHED\n", __func__);
+		con_sock_state_connected(con);
 		queue_con(con);
 		break;
 	default:	/* Everything else is uninteresting */
@@ -228,9 +354,9 @@ static void set_sock_callbacks(struct socket *sock,
 {
 	struct sock *sk = sock->sk;
 	sk->sk_user_data = con;
-	sk->sk_data_ready = ceph_data_ready;
-	sk->sk_write_space = ceph_write_space;
-	sk->sk_state_change = ceph_state_change;
+	sk->sk_data_ready = ceph_sock_data_ready;
+	sk->sk_write_space = ceph_sock_write_space;
+	sk->sk_state_change = ceph_sock_state_change;
 }
 
 
@@ -262,6 +388,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 
 	dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
 
+	con_sock_state_connecting(con);
 	ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
 				 O_NONBLOCK);
 	if (ret == -EINPROGRESS) {
@@ -277,7 +404,6 @@ static int ceph_tcp_connect(struct ceph_connection *con)
 		return ret;
 	}
 	con->sock = sock;
-
 	return 0;
 }
 
@@ -333,16 +459,24 @@ static int ceph_tcp_sendpage(struct socket *sock, struct page *page,
  */
 static int con_close_socket(struct ceph_connection *con)
 {
-	int rc;
+	int rc = 0;
 
 	dout("con_close_socket on %p sock %p\n", con, con->sock);
-	if (!con->sock)
-		return 0;
-	set_bit(SOCK_CLOSED, &con->state);
-	rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
-	sock_release(con->sock);
-	con->sock = NULL;
-	clear_bit(SOCK_CLOSED, &con->state);
+	if (con->sock) {
+		rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+		sock_release(con->sock);
+		con->sock = NULL;
+	}
+
+	/*
+	 * Forcibly clear the SOCK_CLOSED flag.  It gets set
+	 * independent of the connection mutex, and we could have
+	 * received a socket close event before we had the chance to
+	 * shut the socket down.
+	 */
+	clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags);
+
+	con_sock_state_closed(con);
 	return rc;
 }
 
@@ -353,6 +487,10 @@ static int con_close_socket(struct ceph_connection *con)
 static void ceph_msg_remove(struct ceph_msg *msg)
 {
 	list_del_init(&msg->list_head);
+	BUG_ON(msg->con == NULL);
+	msg->con->ops->put(msg->con);
+	msg->con = NULL;
+
 	ceph_msg_put(msg);
 }
 static void ceph_msg_remove_list(struct list_head *head)
@@ -372,8 +510,11 @@ static void reset_connection(struct ceph_connection *con)
 	ceph_msg_remove_list(&con->out_sent);
 
 	if (con->in_msg) {
+		BUG_ON(con->in_msg->con != con);
+		con->in_msg->con = NULL;
 		ceph_msg_put(con->in_msg);
 		con->in_msg = NULL;
+		con->ops->put(con);
 	}
 
 	con->connect_seq = 0;
@@ -391,32 +532,44 @@ static void reset_connection(struct ceph_connection *con)
  */
 void ceph_con_close(struct ceph_connection *con)
 {
+	mutex_lock(&con->mutex);
 	dout("con_close %p peer %s\n", con,
 	     ceph_pr_addr(&con->peer_addr.in_addr));
-	set_bit(CLOSED, &con->state);  /* in case there's queued work */
-	clear_bit(STANDBY, &con->state);  /* avoid connect_seq bump */
-	clear_bit(LOSSYTX, &con->state);  /* so we retry next connect */
-	clear_bit(KEEPALIVE_PENDING, &con->state);
-	clear_bit(WRITE_PENDING, &con->state);
-	mutex_lock(&con->mutex);
+	con->state = CON_STATE_CLOSED;
+
+	clear_bit(CON_FLAG_LOSSYTX, &con->flags);  /* so we retry next connect */
+	clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags);
+	clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
+	clear_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags);
+	clear_bit(CON_FLAG_BACKOFF, &con->flags);
+
 	reset_connection(con);
 	con->peer_global_seq = 0;
 	cancel_delayed_work(&con->work);
+	con_close_socket(con);
 	mutex_unlock(&con->mutex);
-	queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_close);
 
 /*
  * Reopen a closed connection, with a new peer address.
  */
-void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+void ceph_con_open(struct ceph_connection *con,
+		   __u8 entity_type, __u64 entity_num,
+		   struct ceph_entity_addr *addr)
 {
+	mutex_lock(&con->mutex);
 	dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
-	set_bit(OPENING, &con->state);
-	clear_bit(CLOSED, &con->state);
+
+	BUG_ON(con->state != CON_STATE_CLOSED);
+	con->state = CON_STATE_PREOPEN;
+
+	con->peer_name.type = (__u8) entity_type;
+	con->peer_name.num = cpu_to_le64(entity_num);
+
 	memcpy(&con->peer_addr, addr, sizeof(*addr));
 	con->delay = 0;      /* reset backoff memory */
+	mutex_unlock(&con->mutex);
 	queue_con(con);
 }
 EXPORT_SYMBOL(ceph_con_open);
@@ -430,42 +583,26 @@ bool ceph_con_opened(struct ceph_connection *con)
 }
 
 /*
- * generic get/put
- */
-struct ceph_connection *ceph_con_get(struct ceph_connection *con)
-{
-	int nref = __atomic_add_unless(&con->nref, 1, 0);
-
-	dout("con_get %p nref = %d -> %d\n", con, nref, nref + 1);
-
-	return nref ? con : NULL;
-}
-
-void ceph_con_put(struct ceph_connection *con)
-{
-	int nref = atomic_dec_return(&con->nref);
-
-	BUG_ON(nref < 0);
-	if (nref == 0) {
-		BUG_ON(con->sock);
-		kfree(con);
-	}
-	dout("con_put %p nref = %d -> %d\n", con, nref + 1, nref);
-}
-
-/*
  * initialize a new connection.
  */
-void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+void ceph_con_init(struct ceph_connection *con, void *private,
+		   const struct ceph_connection_operations *ops,
+		   struct ceph_messenger *msgr)
 {
 	dout("con_init %p\n", con);
 	memset(con, 0, sizeof(*con));
-	atomic_set(&con->nref, 1);
+	con->private = private;
+	con->ops = ops;
 	con->msgr = msgr;
+
+	con_sock_state_init(con);
+
 	mutex_init(&con->mutex);
 	INIT_LIST_HEAD(&con->out_queue);
 	INIT_LIST_HEAD(&con->out_sent);
 	INIT_DELAYED_WORK(&con->work, con_work);
+
+	con->state = CON_STATE_CLOSED;
 }
 EXPORT_SYMBOL(ceph_con_init);
 
@@ -486,14 +623,14 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
 	return ret;
 }
 
-static void ceph_con_out_kvec_reset(struct ceph_connection *con)
+static void con_out_kvec_reset(struct ceph_connection *con)
 {
 	con->out_kvec_left = 0;
 	con->out_kvec_bytes = 0;
 	con->out_kvec_cur = &con->out_kvec[0];
 }
 
-static void ceph_con_out_kvec_add(struct ceph_connection *con,
+static void con_out_kvec_add(struct ceph_connection *con,
 			size_t size, void *data)
 {
 	int index;
@@ -507,6 +644,53 @@ static void ceph_con_out_kvec_add(struct ceph_connection *con,
 	con->out_kvec_bytes += size;
 }
 
+#ifdef CONFIG_BLOCK
+static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
+{
+	if (!bio) {
+		*iter = NULL;
+		*seg = 0;
+		return;
+	}
+	*iter = bio;
+	*seg = bio->bi_idx;
+}
+
+static void iter_bio_next(struct bio **bio_iter, int *seg)
+{
+	if (*bio_iter == NULL)
+		return;
+
+	BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
+
+	(*seg)++;
+	if (*seg == (*bio_iter)->bi_vcnt)
+		init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
+}
+#endif
+
+static void prepare_write_message_data(struct ceph_connection *con)
+{
+	struct ceph_msg *msg = con->out_msg;
+
+	BUG_ON(!msg);
+	BUG_ON(!msg->hdr.data_len);
+
+	/* initialize page iterator */
+	con->out_msg_pos.page = 0;
+	if (msg->pages)
+		con->out_msg_pos.page_pos = msg->page_alignment;
+	else
+		con->out_msg_pos.page_pos = 0;
+#ifdef CONFIG_BLOCK
+	if (msg->bio)
+		init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
+#endif
+	con->out_msg_pos.data_pos = 0;
+	con->out_msg_pos.did_page_crc = false;
+	con->out_more = 1;  /* data + footer will follow */
+}
+
 /*
  * Prepare footer for currently outgoing message, and finish things
  * off.  Assumes out_kvec* are already valid.. we just add on to the end.
@@ -516,6 +700,8 @@ static void prepare_write_message_footer(struct ceph_connection *con)
 	struct ceph_msg *m = con->out_msg;
 	int v = con->out_kvec_left;
 
+	m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
+
 	dout("prepare_write_message_footer %p\n", con);
 	con->out_kvec_is_msg = true;
 	con->out_kvec[v].iov_base = &m->footer;
@@ -534,7 +720,7 @@ static void prepare_write_message(struct ceph_connection *con)
534 struct ceph_msg *m; 720 struct ceph_msg *m;
535 u32 crc; 721 u32 crc;
536 722
537 ceph_con_out_kvec_reset(con); 723 con_out_kvec_reset(con);
538 con->out_kvec_is_msg = true; 724 con->out_kvec_is_msg = true;
539 con->out_msg_done = false; 725 con->out_msg_done = false;
540 726
@@ -542,14 +728,16 @@ static void prepare_write_message(struct ceph_connection *con)
542 * TCP packet that's a good thing. */ 728 * TCP packet that's a good thing. */
543 if (con->in_seq > con->in_seq_acked) { 729 if (con->in_seq > con->in_seq_acked) {
544 con->in_seq_acked = con->in_seq; 730 con->in_seq_acked = con->in_seq;
545 ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); 731 con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
546 con->out_temp_ack = cpu_to_le64(con->in_seq_acked); 732 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
547 ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), 733 con_out_kvec_add(con, sizeof (con->out_temp_ack),
548 &con->out_temp_ack); 734 &con->out_temp_ack);
549 } 735 }
550 736
737 BUG_ON(list_empty(&con->out_queue));
551 m = list_first_entry(&con->out_queue, struct ceph_msg, list_head); 738 m = list_first_entry(&con->out_queue, struct ceph_msg, list_head);
552 con->out_msg = m; 739 con->out_msg = m;
740 BUG_ON(m->con != con);
553 741
554 /* put message on sent list */ 742 /* put message on sent list */
555 ceph_msg_get(m); 743 ceph_msg_get(m);
@@ -576,18 +764,18 @@ static void prepare_write_message(struct ceph_connection *con)
576 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); 764 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
577 765
578 /* tag + hdr + front + middle */ 766 /* tag + hdr + front + middle */
579 ceph_con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); 767 con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
580 ceph_con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); 768 con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
581 ceph_con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); 769 con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
582 770
583 if (m->middle) 771 if (m->middle)
584 ceph_con_out_kvec_add(con, m->middle->vec.iov_len, 772 con_out_kvec_add(con, m->middle->vec.iov_len,
585 m->middle->vec.iov_base); 773 m->middle->vec.iov_base);
586 774
587 /* fill in crc (except data pages), footer */ 775 /* fill in crc (except data pages), footer */
588 crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); 776 crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
589 con->out_msg->hdr.crc = cpu_to_le32(crc); 777 con->out_msg->hdr.crc = cpu_to_le32(crc);
590 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE; 778 con->out_msg->footer.flags = 0;
591 779
592 crc = crc32c(0, m->front.iov_base, m->front.iov_len); 780 crc = crc32c(0, m->front.iov_base, m->front.iov_len);
593 con->out_msg->footer.front_crc = cpu_to_le32(crc); 781 con->out_msg->footer.front_crc = cpu_to_le32(crc);
@@ -597,28 +785,19 @@ static void prepare_write_message(struct ceph_connection *con)
597 con->out_msg->footer.middle_crc = cpu_to_le32(crc); 785 con->out_msg->footer.middle_crc = cpu_to_le32(crc);
598 } else 786 } else
599 con->out_msg->footer.middle_crc = 0; 787 con->out_msg->footer.middle_crc = 0;
600 con->out_msg->footer.data_crc = 0; 788 dout("%s front_crc %u middle_crc %u\n", __func__,
601 dout("prepare_write_message front_crc %u data_crc %u\n",
602 le32_to_cpu(con->out_msg->footer.front_crc), 789 le32_to_cpu(con->out_msg->footer.front_crc),
603 le32_to_cpu(con->out_msg->footer.middle_crc)); 790 le32_to_cpu(con->out_msg->footer.middle_crc));
604 791
605 /* is there a data payload? */ 792 /* is there a data payload? */
606 if (le32_to_cpu(m->hdr.data_len) > 0) { 793 con->out_msg->footer.data_crc = 0;
607 /* initialize page iterator */ 794 if (m->hdr.data_len)
608 con->out_msg_pos.page = 0; 795 prepare_write_message_data(con);
609 if (m->pages) 796 else
610 con->out_msg_pos.page_pos = m->page_alignment;
611 else
612 con->out_msg_pos.page_pos = 0;
613 con->out_msg_pos.data_pos = 0;
614 con->out_msg_pos.did_page_crc = false;
615 con->out_more = 1; /* data + footer will follow */
616 } else {
617 /* no, queue up footer too and be done */ 797 /* no, queue up footer too and be done */
618 prepare_write_message_footer(con); 798 prepare_write_message_footer(con);
619 }
620 799
621 set_bit(WRITE_PENDING, &con->state); 800 set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
622} 801}
623 802
624/* 803/*
@@ -630,16 +809,16 @@ static void prepare_write_ack(struct ceph_connection *con)
630 con->in_seq_acked, con->in_seq); 809 con->in_seq_acked, con->in_seq);
631 con->in_seq_acked = con->in_seq; 810 con->in_seq_acked = con->in_seq;
632 811
633 ceph_con_out_kvec_reset(con); 812 con_out_kvec_reset(con);
634 813
635 ceph_con_out_kvec_add(con, sizeof (tag_ack), &tag_ack); 814 con_out_kvec_add(con, sizeof (tag_ack), &tag_ack);
636 815
637 con->out_temp_ack = cpu_to_le64(con->in_seq_acked); 816 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
638 ceph_con_out_kvec_add(con, sizeof (con->out_temp_ack), 817 con_out_kvec_add(con, sizeof (con->out_temp_ack),
639 &con->out_temp_ack); 818 &con->out_temp_ack);
640 819
641 con->out_more = 1; /* more will follow.. eventually.. */ 820 con->out_more = 1; /* more will follow.. eventually.. */
642 set_bit(WRITE_PENDING, &con->state); 821 set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
643} 822}
644 823
645/* 824/*
@@ -648,9 +827,9 @@ static void prepare_write_ack(struct ceph_connection *con)
648static void prepare_write_keepalive(struct ceph_connection *con) 827static void prepare_write_keepalive(struct ceph_connection *con)
649{ 828{
650 dout("prepare_write_keepalive %p\n", con); 829 dout("prepare_write_keepalive %p\n", con);
651 ceph_con_out_kvec_reset(con); 830 con_out_kvec_reset(con);
652 ceph_con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive); 831 con_out_kvec_add(con, sizeof (tag_keepalive), &tag_keepalive);
653 set_bit(WRITE_PENDING, &con->state); 832 set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
654} 833}
655 834
656/* 835/*
@@ -665,27 +844,21 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
665 if (!con->ops->get_authorizer) { 844 if (!con->ops->get_authorizer) {
666 con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; 845 con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN;
667 con->out_connect.authorizer_len = 0; 846 con->out_connect.authorizer_len = 0;
668
669 return NULL; 847 return NULL;
670 } 848 }
671 849
672 /* Can't hold the mutex while getting authorizer */ 850 /* Can't hold the mutex while getting authorizer */
673
674 mutex_unlock(&con->mutex); 851 mutex_unlock(&con->mutex);
675
676 auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); 852 auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry);
677
678 mutex_lock(&con->mutex); 853 mutex_lock(&con->mutex);
679 854
680 if (IS_ERR(auth)) 855 if (IS_ERR(auth))
681 return auth; 856 return auth;
682 if (test_bit(CLOSED, &con->state) || test_bit(OPENING, &con->state)) 857 if (con->state != CON_STATE_NEGOTIATING)
683 return ERR_PTR(-EAGAIN); 858 return ERR_PTR(-EAGAIN);
684 859
685 con->auth_reply_buf = auth->authorizer_reply_buf; 860 con->auth_reply_buf = auth->authorizer_reply_buf;
686 con->auth_reply_buf_len = auth->authorizer_reply_buf_len; 861 con->auth_reply_buf_len = auth->authorizer_reply_buf_len;
687
688
689 return auth; 862 return auth;
690} 863}
691 864
@@ -694,12 +867,12 @@ static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection
694 */ 867 */
695static void prepare_write_banner(struct ceph_connection *con) 868static void prepare_write_banner(struct ceph_connection *con)
696{ 869{
697 ceph_con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER); 870 con_out_kvec_add(con, strlen(CEPH_BANNER), CEPH_BANNER);
698 ceph_con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr), 871 con_out_kvec_add(con, sizeof (con->msgr->my_enc_addr),
699 &con->msgr->my_enc_addr); 872 &con->msgr->my_enc_addr);
700 873
701 con->out_more = 0; 874 con->out_more = 0;
702 set_bit(WRITE_PENDING, &con->state); 875 set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
703} 876}
704 877
705static int prepare_write_connect(struct ceph_connection *con) 878static int prepare_write_connect(struct ceph_connection *con)
@@ -742,14 +915,14 @@ static int prepare_write_connect(struct ceph_connection *con)
742 con->out_connect.authorizer_len = auth ? 915 con->out_connect.authorizer_len = auth ?
743 cpu_to_le32(auth->authorizer_buf_len) : 0; 916 cpu_to_le32(auth->authorizer_buf_len) : 0;
744 917
745 ceph_con_out_kvec_add(con, sizeof (con->out_connect), 918 con_out_kvec_add(con, sizeof (con->out_connect),
746 &con->out_connect); 919 &con->out_connect);
747 if (auth && auth->authorizer_buf_len) 920 if (auth && auth->authorizer_buf_len)
748 ceph_con_out_kvec_add(con, auth->authorizer_buf_len, 921 con_out_kvec_add(con, auth->authorizer_buf_len,
749 auth->authorizer_buf); 922 auth->authorizer_buf);
750 923
751 con->out_more = 0; 924 con->out_more = 0;
752 set_bit(WRITE_PENDING, &con->state); 925 set_bit(CON_FLAG_WRITE_PENDING, &con->flags);
753 926
754 return 0; 927 return 0;
755} 928}
@@ -797,30 +970,34 @@ out:
797 return ret; /* done! */ 970 return ret; /* done! */
798} 971}
799 972
800#ifdef CONFIG_BLOCK 973static void out_msg_pos_next(struct ceph_connection *con, struct page *page,
801static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg) 974 size_t len, size_t sent, bool in_trail)
802{ 975{
803 if (!bio) { 976 struct ceph_msg *msg = con->out_msg;
804 *iter = NULL;
805 *seg = 0;
806 return;
807 }
808 *iter = bio;
809 *seg = bio->bi_idx;
810}
811 977
812static void iter_bio_next(struct bio **bio_iter, int *seg) 978 BUG_ON(!msg);
813{ 979 BUG_ON(!sent);
814 if (*bio_iter == NULL)
815 return;
816 980
817 BUG_ON(*seg >= (*bio_iter)->bi_vcnt); 981 con->out_msg_pos.data_pos += sent;
982 con->out_msg_pos.page_pos += sent;
983 if (sent < len)
984 return;
818 985
819 (*seg)++; 986 BUG_ON(sent != len);
820 if (*seg == (*bio_iter)->bi_vcnt) 987 con->out_msg_pos.page_pos = 0;
821 init_bio_iter((*bio_iter)->bi_next, bio_iter, seg); 988 con->out_msg_pos.page++;
822} 989 con->out_msg_pos.did_page_crc = false;
990 if (in_trail)
991 list_move_tail(&page->lru,
992 &msg->trail->head);
993 else if (msg->pagelist)
994 list_move_tail(&page->lru,
995 &msg->pagelist->head);
996#ifdef CONFIG_BLOCK
997 else if (msg->bio)
998 iter_bio_next(&msg->bio_iter, &msg->bio_seg);
823#endif 999#endif
1000}
824 1001
825/* 1002/*
826 * Write as much message data payload as we can. If we finish, queue 1003 * Write as much message data payload as we can. If we finish, queue
@@ -837,41 +1014,36 @@ static int write_partial_msg_pages(struct ceph_connection *con)
837 bool do_datacrc = !con->msgr->nocrc; 1014 bool do_datacrc = !con->msgr->nocrc;
838 int ret; 1015 int ret;
839 int total_max_write; 1016 int total_max_write;
840 int in_trail = 0; 1017 bool in_trail = false;
841 size_t trail_len = (msg->trail ? msg->trail->length : 0); 1018 const size_t trail_len = (msg->trail ? msg->trail->length : 0);
1019 const size_t trail_off = data_len - trail_len;
842 1020
843 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", 1021 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
844 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, 1022 con, msg, con->out_msg_pos.page, msg->nr_pages,
845 con->out_msg_pos.page_pos); 1023 con->out_msg_pos.page_pos);
846 1024
847#ifdef CONFIG_BLOCK 1025 /*
848 if (msg->bio && !msg->bio_iter) 1026 * Iterate through each page that contains data to be
849 init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg); 1027 * written, and send as much as possible for each.
850#endif 1028 *
851 1029 * If we are calculating the data crc (the default), we will
1030 * need to map the page. If we have no pages, they have
1031 * been revoked, so use the zero page.
1032 */
852 while (data_len > con->out_msg_pos.data_pos) { 1033 while (data_len > con->out_msg_pos.data_pos) {
853 struct page *page = NULL; 1034 struct page *page = NULL;
854 int max_write = PAGE_SIZE; 1035 int max_write = PAGE_SIZE;
855 int bio_offset = 0; 1036 int bio_offset = 0;
856 1037
857 total_max_write = data_len - trail_len - 1038 in_trail = in_trail || con->out_msg_pos.data_pos >= trail_off;
858 con->out_msg_pos.data_pos; 1039 if (!in_trail)
859 1040 total_max_write = trail_off - con->out_msg_pos.data_pos;
860 /*
861 * if we are calculating the data crc (the default), we need
862 * to map the page. if our pages[] has been revoked, use the
863 * zero page.
864 */
865
866 /* have we reached the trail part of the data? */
867 if (con->out_msg_pos.data_pos >= data_len - trail_len) {
868 in_trail = 1;
869 1041
1042 if (in_trail) {
870 total_max_write = data_len - con->out_msg_pos.data_pos; 1043 total_max_write = data_len - con->out_msg_pos.data_pos;
871 1044
872 page = list_first_entry(&msg->trail->head, 1045 page = list_first_entry(&msg->trail->head,
873 struct page, lru); 1046 struct page, lru);
874 max_write = PAGE_SIZE;
875 } else if (msg->pages) { 1047 } else if (msg->pages) {
876 page = msg->pages[con->out_msg_pos.page]; 1048 page = msg->pages[con->out_msg_pos.page];
877 } else if (msg->pagelist) { 1049 } else if (msg->pagelist) {
@@ -894,15 +1066,14 @@ static int write_partial_msg_pages(struct ceph_connection *con)
894 1066
895 if (do_datacrc && !con->out_msg_pos.did_page_crc) { 1067 if (do_datacrc && !con->out_msg_pos.did_page_crc) {
896 void *base; 1068 void *base;
897 u32 crc; 1069 u32 crc = le32_to_cpu(msg->footer.data_crc);
898 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
899 char *kaddr; 1070 char *kaddr;
900 1071
901 kaddr = kmap(page); 1072 kaddr = kmap(page);
902 BUG_ON(kaddr == NULL); 1073 BUG_ON(kaddr == NULL);
903 base = kaddr + con->out_msg_pos.page_pos + bio_offset; 1074 base = kaddr + con->out_msg_pos.page_pos + bio_offset;
904 crc = crc32c(tmpcrc, base, len); 1075 crc = crc32c(crc, base, len);
905 con->out_msg->footer.data_crc = cpu_to_le32(crc); 1076 msg->footer.data_crc = cpu_to_le32(crc);
906 con->out_msg_pos.did_page_crc = true; 1077 con->out_msg_pos.did_page_crc = true;
907 } 1078 }
908 ret = ceph_tcp_sendpage(con->sock, page, 1079 ret = ceph_tcp_sendpage(con->sock, page,
@@ -915,31 +1086,15 @@ static int write_partial_msg_pages(struct ceph_connection *con)
915 if (ret <= 0) 1086 if (ret <= 0)
916 goto out; 1087 goto out;
917 1088
918 con->out_msg_pos.data_pos += ret; 1089 out_msg_pos_next(con, page, len, (size_t) ret, in_trail);
919 con->out_msg_pos.page_pos += ret;
920 if (ret == len) {
921 con->out_msg_pos.page_pos = 0;
922 con->out_msg_pos.page++;
923 con->out_msg_pos.did_page_crc = false;
924 if (in_trail)
925 list_move_tail(&page->lru,
926 &msg->trail->head);
927 else if (msg->pagelist)
928 list_move_tail(&page->lru,
929 &msg->pagelist->head);
930#ifdef CONFIG_BLOCK
931 else if (msg->bio)
932 iter_bio_next(&msg->bio_iter, &msg->bio_seg);
933#endif
934 }
935 } 1090 }
936 1091
937 dout("write_partial_msg_pages %p msg %p done\n", con, msg); 1092 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
938 1093
939 /* prepare and queue up footer, too */ 1094 /* prepare and queue up footer, too */
940 if (!do_datacrc) 1095 if (!do_datacrc)
941 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; 1096 msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
942 ceph_con_out_kvec_reset(con); 1097 con_out_kvec_reset(con);
943 prepare_write_message_footer(con); 1098 prepare_write_message_footer(con);
944 ret = 1; 1099 ret = 1;
945out: 1100out:
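[Editor's note] The did_page_crc rework makes the data CRC a plain incremental fold: footer.data_crc starts at 0 (set in prepare_write_message) and each page-sized chunk is chained through crc32c() exactly once before being sent. The same accumulation in isolation, as a sketch using <linux/crc32c.h>:

	static u32 fold_data_crc(const u8 *buf, size_t len, size_t chunk)
	{
		u32 crc = 0;			/* matches footer.data_crc = 0 */

		while (len) {
			size_t n = min(len, chunk);

			crc = crc32c(crc, buf, n);	/* chain previous value */
			buf += n;
			len -= n;
		}
		return crc;
	}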
@@ -1351,20 +1506,14 @@ static int process_banner(struct ceph_connection *con)
1351 ceph_pr_addr(&con->msgr->inst.addr.in_addr)); 1506 ceph_pr_addr(&con->msgr->inst.addr.in_addr));
1352 } 1507 }
1353 1508
1354 set_bit(NEGOTIATING, &con->state);
1355 prepare_read_connect(con);
1356 return 0; 1509 return 0;
1357} 1510}
1358 1511
1359static void fail_protocol(struct ceph_connection *con) 1512static void fail_protocol(struct ceph_connection *con)
1360{ 1513{
1361 reset_connection(con); 1514 reset_connection(con);
1362 set_bit(CLOSED, &con->state); /* in case there's queued work */ 1515 BUG_ON(con->state != CON_STATE_NEGOTIATING);
1363 1516 con->state = CON_STATE_CLOSED;
1364 mutex_unlock(&con->mutex);
1365 if (con->ops->bad_proto)
1366 con->ops->bad_proto(con);
1367 mutex_lock(&con->mutex);
1368} 1517}
1369 1518
1370static int process_connect(struct ceph_connection *con) 1519static int process_connect(struct ceph_connection *con)
@@ -1407,7 +1556,7 @@ static int process_connect(struct ceph_connection *con)
1407 return -1; 1556 return -1;
1408 } 1557 }
1409 con->auth_retry = 1; 1558 con->auth_retry = 1;
1410 ceph_con_out_kvec_reset(con); 1559 con_out_kvec_reset(con);
1411 ret = prepare_write_connect(con); 1560 ret = prepare_write_connect(con);
1412 if (ret < 0) 1561 if (ret < 0)
1413 return ret; 1562 return ret;
@@ -1428,7 +1577,7 @@ static int process_connect(struct ceph_connection *con)
1428 ENTITY_NAME(con->peer_name), 1577 ENTITY_NAME(con->peer_name),
1429 ceph_pr_addr(&con->peer_addr.in_addr)); 1578 ceph_pr_addr(&con->peer_addr.in_addr));
1430 reset_connection(con); 1579 reset_connection(con);
1431 ceph_con_out_kvec_reset(con); 1580 con_out_kvec_reset(con);
1432 ret = prepare_write_connect(con); 1581 ret = prepare_write_connect(con);
1433 if (ret < 0) 1582 if (ret < 0)
1434 return ret; 1583 return ret;
@@ -1440,8 +1589,7 @@ static int process_connect(struct ceph_connection *con)
1440 if (con->ops->peer_reset) 1589 if (con->ops->peer_reset)
1441 con->ops->peer_reset(con); 1590 con->ops->peer_reset(con);
1442 mutex_lock(&con->mutex); 1591 mutex_lock(&con->mutex);
1443 if (test_bit(CLOSED, &con->state) || 1592 if (con->state != CON_STATE_NEGOTIATING)
1444 test_bit(OPENING, &con->state))
1445 return -EAGAIN; 1593 return -EAGAIN;
1446 break; 1594 break;
1447 1595
@@ -1454,7 +1602,7 @@ static int process_connect(struct ceph_connection *con)
1454 le32_to_cpu(con->out_connect.connect_seq), 1602 le32_to_cpu(con->out_connect.connect_seq),
1455 le32_to_cpu(con->in_reply.connect_seq)); 1603 le32_to_cpu(con->in_reply.connect_seq));
1456 con->connect_seq = le32_to_cpu(con->in_reply.connect_seq); 1604 con->connect_seq = le32_to_cpu(con->in_reply.connect_seq);
1457 ceph_con_out_kvec_reset(con); 1605 con_out_kvec_reset(con);
1458 ret = prepare_write_connect(con); 1606 ret = prepare_write_connect(con);
1459 if (ret < 0) 1607 if (ret < 0)
1460 return ret; 1608 return ret;
@@ -1471,7 +1619,7 @@ static int process_connect(struct ceph_connection *con)
1471 le32_to_cpu(con->in_reply.global_seq)); 1619 le32_to_cpu(con->in_reply.global_seq));
1472 get_global_seq(con->msgr, 1620 get_global_seq(con->msgr,
1473 le32_to_cpu(con->in_reply.global_seq)); 1621 le32_to_cpu(con->in_reply.global_seq));
1474 ceph_con_out_kvec_reset(con); 1622 con_out_kvec_reset(con);
1475 ret = prepare_write_connect(con); 1623 ret = prepare_write_connect(con);
1476 if (ret < 0) 1624 if (ret < 0)
1477 return ret; 1625 return ret;
@@ -1489,7 +1637,10 @@ static int process_connect(struct ceph_connection *con)
1489 fail_protocol(con); 1637 fail_protocol(con);
1490 return -1; 1638 return -1;
1491 } 1639 }
1492 clear_bit(CONNECTING, &con->state); 1640
1641 BUG_ON(con->state != CON_STATE_NEGOTIATING);
1642 con->state = CON_STATE_OPEN;
1643
1493 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq); 1644 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1494 con->connect_seq++; 1645 con->connect_seq++;
1495 con->peer_features = server_feat; 1646 con->peer_features = server_feat;
@@ -1501,7 +1652,9 @@ static int process_connect(struct ceph_connection *con)
1501 le32_to_cpu(con->in_reply.connect_seq)); 1652 le32_to_cpu(con->in_reply.connect_seq));
1502 1653
1503 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY) 1654 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1504 set_bit(LOSSYTX, &con->state); 1655 set_bit(CON_FLAG_LOSSYTX, &con->flags);
1656
1657 con->delay = 0; /* reset backoff memory */
1505 1658
1506 prepare_read_tag(con); 1659 prepare_read_tag(con);
1507 break; 1660 break;
@@ -1587,10 +1740,7 @@ static int read_partial_message_section(struct ceph_connection *con,
1587 return 1; 1740 return 1;
1588} 1741}
1589 1742
1590static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, 1743static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip);
1591 struct ceph_msg_header *hdr,
1592 int *skip);
1593
1594 1744
1595static int read_partial_message_pages(struct ceph_connection *con, 1745static int read_partial_message_pages(struct ceph_connection *con,
1596 struct page **pages, 1746 struct page **pages,
@@ -1633,9 +1783,6 @@ static int read_partial_message_bio(struct ceph_connection *con,
1633 void *p; 1783 void *p;
1634 int ret, left; 1784 int ret, left;
1635 1785
1636 if (IS_ERR(bv))
1637 return PTR_ERR(bv);
1638
1639 left = min((int)(data_len - con->in_msg_pos.data_pos), 1786 left = min((int)(data_len - con->in_msg_pos.data_pos),
1640 (int)(bv->bv_len - con->in_msg_pos.page_pos)); 1787 (int)(bv->bv_len - con->in_msg_pos.page_pos));
1641 1788
@@ -1672,7 +1819,6 @@ static int read_partial_message(struct ceph_connection *con)
1672 int ret; 1819 int ret;
1673 unsigned int front_len, middle_len, data_len; 1820 unsigned int front_len, middle_len, data_len;
1674 bool do_datacrc = !con->msgr->nocrc; 1821 bool do_datacrc = !con->msgr->nocrc;
1675 int skip;
1676 u64 seq; 1822 u64 seq;
1677 u32 crc; 1823 u32 crc;
1678 1824
@@ -1723,10 +1869,13 @@ static int read_partial_message(struct ceph_connection *con)
1723 1869
1724 /* allocate message? */ 1870 /* allocate message? */
1725 if (!con->in_msg) { 1871 if (!con->in_msg) {
1872 int skip = 0;
1873
1726 dout("got hdr type %d front %d data %d\n", con->in_hdr.type, 1874 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1727 con->in_hdr.front_len, con->in_hdr.data_len); 1875 con->in_hdr.front_len, con->in_hdr.data_len);
1728 skip = 0; 1876 ret = ceph_con_in_msg_alloc(con, &skip);
1729 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip); 1877 if (ret < 0)
1878 return ret;
1730 if (skip) { 1879 if (skip) {
1731 /* skip this message */ 1880 /* skip this message */
1732 dout("alloc_msg said skip message\n"); 1881 dout("alloc_msg said skip message\n");
@@ -1737,11 +1886,9 @@ static int read_partial_message(struct ceph_connection *con)
1737 con->in_seq++; 1886 con->in_seq++;
1738 return 0; 1887 return 0;
1739 } 1888 }
1740 if (!con->in_msg) { 1889
1741 con->error_msg = 1890 BUG_ON(!con->in_msg);
1742 "error allocating memory for incoming message"; 1891 BUG_ON(con->in_msg->con != con);
1743 return -ENOMEM;
1744 }
1745 m = con->in_msg; 1892 m = con->in_msg;
1746 m->front.iov_len = 0; /* haven't read it yet */ 1893 m->front.iov_len = 0; /* haven't read it yet */
1747 if (m->middle) 1894 if (m->middle)
@@ -1753,6 +1900,11 @@ static int read_partial_message(struct ceph_connection *con)
1753 else 1900 else
1754 con->in_msg_pos.page_pos = 0; 1901 con->in_msg_pos.page_pos = 0;
1755 con->in_msg_pos.data_pos = 0; 1902 con->in_msg_pos.data_pos = 0;
1903
1904#ifdef CONFIG_BLOCK
1905 if (m->bio)
1906 init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
1907#endif
1756 } 1908 }
1757 1909
1758 /* front */ 1910 /* front */
@@ -1769,10 +1921,6 @@ static int read_partial_message(struct ceph_connection *con)
1769 if (ret <= 0) 1921 if (ret <= 0)
1770 return ret; 1922 return ret;
1771 } 1923 }
1772#ifdef CONFIG_BLOCK
1773 if (m->bio && !m->bio_iter)
1774 init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
1775#endif
1776 1924
1777 /* (page) data */ 1925 /* (page) data */
1778 while (con->in_msg_pos.data_pos < data_len) { 1926 while (con->in_msg_pos.data_pos < data_len) {
@@ -1783,7 +1931,7 @@ static int read_partial_message(struct ceph_connection *con)
1783 return ret; 1931 return ret;
1784#ifdef CONFIG_BLOCK 1932#ifdef CONFIG_BLOCK
1785 } else if (m->bio) { 1933 } else if (m->bio) {
1786 1934 BUG_ON(!m->bio_iter);
1787 ret = read_partial_message_bio(con, 1935 ret = read_partial_message_bio(con,
1788 &m->bio_iter, &m->bio_seg, 1936 &m->bio_iter, &m->bio_seg,
1789 data_len, do_datacrc); 1937 data_len, do_datacrc);
@@ -1837,8 +1985,11 @@ static void process_message(struct ceph_connection *con)
1837{ 1985{
1838 struct ceph_msg *msg; 1986 struct ceph_msg *msg;
1839 1987
1988 BUG_ON(con->in_msg->con != con);
1989 con->in_msg->con = NULL;
1840 msg = con->in_msg; 1990 msg = con->in_msg;
1841 con->in_msg = NULL; 1991 con->in_msg = NULL;
1992 con->ops->put(con);
1842 1993
1843 /* if first message, set peer_name */ 1994 /* if first message, set peer_name */
1844 if (con->peer_name.type == 0) 1995 if (con->peer_name.type == 0)
@@ -1858,7 +2009,6 @@ static void process_message(struct ceph_connection *con)
1858 con->ops->dispatch(con, msg); 2009 con->ops->dispatch(con, msg);
1859 2010
1860 mutex_lock(&con->mutex); 2011 mutex_lock(&con->mutex);
1861 prepare_read_tag(con);
1862} 2012}
1863 2013
1864 2014
@@ -1870,22 +2020,19 @@ static int try_write(struct ceph_connection *con)
1870{ 2020{
1871 int ret = 1; 2021 int ret = 1;
1872 2022
1873 dout("try_write start %p state %lu nref %d\n", con, con->state, 2023 dout("try_write start %p state %lu\n", con, con->state);
1874 atomic_read(&con->nref));
1875 2024
1876more: 2025more:
1877 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes); 2026 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1878 2027
1879 /* open the socket first? */ 2028 /* open the socket first? */
1880 if (con->sock == NULL) { 2029 if (con->state == CON_STATE_PREOPEN) {
1881 ceph_con_out_kvec_reset(con); 2030 BUG_ON(con->sock);
2031 con->state = CON_STATE_CONNECTING;
2032
2033 con_out_kvec_reset(con);
1882 prepare_write_banner(con); 2034 prepare_write_banner(con);
1883 ret = prepare_write_connect(con);
1884 if (ret < 0)
1885 goto out;
1886 prepare_read_banner(con); 2035 prepare_read_banner(con);
1887 set_bit(CONNECTING, &con->state);
1888 clear_bit(NEGOTIATING, &con->state);
1889 2036
1890 BUG_ON(con->in_msg); 2037 BUG_ON(con->in_msg);
1891 con->in_tag = CEPH_MSGR_TAG_READY; 2038 con->in_tag = CEPH_MSGR_TAG_READY;
@@ -1932,7 +2079,7 @@ more_kvec:
1932 } 2079 }
1933 2080
1934do_next: 2081do_next:
1935 if (!test_bit(CONNECTING, &con->state)) { 2082 if (con->state == CON_STATE_OPEN) {
1936 /* is anything else pending? */ 2083 /* is anything else pending? */
1937 if (!list_empty(&con->out_queue)) { 2084 if (!list_empty(&con->out_queue)) {
1938 prepare_write_message(con); 2085 prepare_write_message(con);
@@ -1942,14 +2089,15 @@ do_next:
1942 prepare_write_ack(con); 2089 prepare_write_ack(con);
1943 goto more; 2090 goto more;
1944 } 2091 }
1945 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) { 2092 if (test_and_clear_bit(CON_FLAG_KEEPALIVE_PENDING,
2093 &con->flags)) {
1946 prepare_write_keepalive(con); 2094 prepare_write_keepalive(con);
1947 goto more; 2095 goto more;
1948 } 2096 }
1949 } 2097 }
1950 2098
1951 /* Nothing to do! */ 2099 /* Nothing to do! */
1952 clear_bit(WRITE_PENDING, &con->state); 2100 clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
1953 dout("try_write nothing else to write.\n"); 2101 dout("try_write nothing else to write.\n");
1954 ret = 0; 2102 ret = 0;
1955out: 2103out:
@@ -1966,38 +2114,46 @@ static int try_read(struct ceph_connection *con)
1966{ 2114{
1967 int ret = -1; 2115 int ret = -1;
1968 2116
1969 if (!con->sock) 2117more:
1970 return 0; 2118 dout("try_read start on %p state %lu\n", con, con->state);
1971 2119 if (con->state != CON_STATE_CONNECTING &&
1972 if (test_bit(STANDBY, &con->state)) 2120 con->state != CON_STATE_NEGOTIATING &&
2121 con->state != CON_STATE_OPEN)
1973 return 0; 2122 return 0;
1974 2123
1975 dout("try_read start on %p\n", con); 2124 BUG_ON(!con->sock);
1976 2125
1977more:
1978 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag, 2126 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1979 con->in_base_pos); 2127 con->in_base_pos);
1980 2128
1981 /* 2129 if (con->state == CON_STATE_CONNECTING) {
1982 * process_connect and process_message drop and re-take 2130 dout("try_read connecting\n");
1983 * con->mutex. make sure we handle a racing close or reopen. 2131 ret = read_partial_banner(con);
1984 */ 2132 if (ret <= 0)
1985 if (test_bit(CLOSED, &con->state) || 2133 goto out;
1986 test_bit(OPENING, &con->state)) { 2134 ret = process_banner(con);
1987 ret = -EAGAIN; 2135 if (ret < 0)
2136 goto out;
2137
2138 BUG_ON(con->state != CON_STATE_CONNECTING);
2139 con->state = CON_STATE_NEGOTIATING;
2140
2141 /*
2142 * Received banner is good, exchange connection info.
2143 * Do not reset out_kvec, as sending our banner raced
2144 * with receiving peer banner after connect completed.
2145 */
2146 ret = prepare_write_connect(con);
2147 if (ret < 0)
2148 goto out;
2149 prepare_read_connect(con);
2150
2151 /* Send connection info before awaiting response */
1988 goto out; 2152 goto out;
1989 } 2153 }
1990 2154
1991 if (test_bit(CONNECTING, &con->state)) { 2155 if (con->state == CON_STATE_NEGOTIATING) {
1992 if (!test_bit(NEGOTIATING, &con->state)) { 2156 dout("try_read negotiating\n");
1993 dout("try_read connecting\n");
1994 ret = read_partial_banner(con);
1995 if (ret <= 0)
1996 goto out;
1997 ret = process_banner(con);
1998 if (ret < 0)
1999 goto out;
2000 }
2001 ret = read_partial_connect(con); 2157 ret = read_partial_connect(con);
2002 if (ret <= 0) 2158 if (ret <= 0)
2003 goto out; 2159 goto out;
@@ -2007,6 +2163,8 @@ more:
2007 goto more; 2163 goto more;
2008 } 2164 }
2009 2165
2166 BUG_ON(con->state != CON_STATE_OPEN);
2167
2010 if (con->in_base_pos < 0) { 2168 if (con->in_base_pos < 0) {
2011 /* 2169 /*
2012 * skipping + discarding content. 2170 * skipping + discarding content.
@@ -2040,7 +2198,8 @@ more:
2040 prepare_read_ack(con); 2198 prepare_read_ack(con);
2041 break; 2199 break;
2042 case CEPH_MSGR_TAG_CLOSE: 2200 case CEPH_MSGR_TAG_CLOSE:
2043 set_bit(CLOSED, &con->state); /* fixme */ 2201 con_close_socket(con);
2202 con->state = CON_STATE_CLOSED;
2044 goto out; 2203 goto out;
2045 default: 2204 default:
2046 goto bad_tag; 2205 goto bad_tag;
@@ -2063,6 +2222,8 @@ more:
2063 if (con->in_tag == CEPH_MSGR_TAG_READY) 2222 if (con->in_tag == CEPH_MSGR_TAG_READY)
2064 goto more; 2223 goto more;
2065 process_message(con); 2224 process_message(con);
2225 if (con->state == CON_STATE_OPEN)
2226 prepare_read_tag(con);
2066 goto more; 2227 goto more;
2067 } 2228 }
2068 if (con->in_tag == CEPH_MSGR_TAG_ACK) { 2229 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
@@ -2091,12 +2252,6 @@ bad_tag:
2091 */ 2252 */
2092static void queue_con(struct ceph_connection *con) 2253static void queue_con(struct ceph_connection *con)
2093{ 2254{
2094 if (test_bit(DEAD, &con->state)) {
2095 dout("queue_con %p ignoring: DEAD\n",
2096 con);
2097 return;
2098 }
2099
2100 if (!con->ops->get(con)) { 2255 if (!con->ops->get(con)) {
2101 dout("queue_con %p ref count 0\n", con); 2256 dout("queue_con %p ref count 0\n", con);
2102 return; 2257 return;
@@ -2121,7 +2276,26 @@ static void con_work(struct work_struct *work)
2121 2276
2122 mutex_lock(&con->mutex); 2277 mutex_lock(&con->mutex);
2123restart: 2278restart:
2124 if (test_and_clear_bit(BACKOFF, &con->state)) { 2279 if (test_and_clear_bit(CON_FLAG_SOCK_CLOSED, &con->flags)) {
2280 switch (con->state) {
2281 case CON_STATE_CONNECTING:
2282 con->error_msg = "connection failed";
2283 break;
2284 case CON_STATE_NEGOTIATING:
2285 con->error_msg = "negotiation failed";
2286 break;
2287 case CON_STATE_OPEN:
2288 con->error_msg = "socket closed";
2289 break;
2290 default:
2291 dout("unrecognized con state %d\n", (int)con->state);
2292 con->error_msg = "unrecognized con state";
2293 BUG();
2294 }
2295 goto fault;
2296 }
2297
2298 if (test_and_clear_bit(CON_FLAG_BACKOFF, &con->flags)) {
2125 dout("con_work %p backing off\n", con); 2299 dout("con_work %p backing off\n", con);
2126 if (queue_delayed_work(ceph_msgr_wq, &con->work, 2300 if (queue_delayed_work(ceph_msgr_wq, &con->work,
2127 round_jiffies_relative(con->delay))) { 2301 round_jiffies_relative(con->delay))) {
@@ -2135,35 +2309,35 @@ restart:
2135 } 2309 }
2136 } 2310 }
2137 2311
2138 if (test_bit(STANDBY, &con->state)) { 2312 if (con->state == CON_STATE_STANDBY) {
2139 dout("con_work %p STANDBY\n", con); 2313 dout("con_work %p STANDBY\n", con);
2140 goto done; 2314 goto done;
2141 } 2315 }
2142 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */ 2316 if (con->state == CON_STATE_CLOSED) {
2143 dout("con_work CLOSED\n"); 2317 dout("con_work %p CLOSED\n", con);
2144 con_close_socket(con); 2318 BUG_ON(con->sock);
2145 goto done; 2319 goto done;
2146 } 2320 }
2147 if (test_and_clear_bit(OPENING, &con->state)) { 2321 if (con->state == CON_STATE_PREOPEN) {
2148 /* reopen w/ new peer */
2149 dout("con_work OPENING\n"); 2322 dout("con_work OPENING\n");
2150 con_close_socket(con); 2323 BUG_ON(con->sock);
2151 } 2324 }
2152 2325
2153 if (test_and_clear_bit(SOCK_CLOSED, &con->state))
2154 goto fault;
2155
2156 ret = try_read(con); 2326 ret = try_read(con);
2157 if (ret == -EAGAIN) 2327 if (ret == -EAGAIN)
2158 goto restart; 2328 goto restart;
2159 if (ret < 0) 2329 if (ret < 0) {
2330 con->error_msg = "socket error on read";
2160 goto fault; 2331 goto fault;
2332 }
2161 2333
2162 ret = try_write(con); 2334 ret = try_write(con);
2163 if (ret == -EAGAIN) 2335 if (ret == -EAGAIN)
2164 goto restart; 2336 goto restart;
2165 if (ret < 0) 2337 if (ret < 0) {
2338 con->error_msg = "socket error on write";
2166 goto fault; 2339 goto fault;
2340 }
2167 2341
2168done: 2342done:
2169 mutex_unlock(&con->mutex); 2343 mutex_unlock(&con->mutex);
@@ -2172,7 +2346,6 @@ done_unlocked:
2172 return; 2346 return;
2173 2347
2174fault: 2348fault:
2175 mutex_unlock(&con->mutex);
2176 ceph_fault(con); /* error/fault path */ 2349 ceph_fault(con); /* error/fault path */
2177 goto done_unlocked; 2350 goto done_unlocked;
2178} 2351}
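[Editor's note] Running through all of these hunks is one structural change: the old con->state bitmask, which mixed exclusive lifecycle states (CONNECTING, STANDBY, CLOSED, ...) with independent conditions (WRITE_PENDING, LOSSYTX, ...), is split in two. The shape, with names taken from the patch and bit values illustrative only:

	/* con->state: exactly one lifecycle state at a time */
	enum {
		CON_STATE_CLOSED,
		CON_STATE_PREOPEN,	/* ceph_con_open() called, no socket yet */
		CON_STATE_CONNECTING,	/* banner exchange in progress */
		CON_STATE_NEGOTIATING,	/* connect/connect_reply exchange */
		CON_STATE_OPEN,
		CON_STATE_STANDBY,	/* idle after a fault, nothing queued */
	};

	/* con->flags: independent bits, still tested/set/cleared atomically */
	#define CON_FLAG_LOSSYTX		0	/* values illustrative */
	#define CON_FLAG_KEEPALIVE_PENDING	1
	#define CON_FLAG_WRITE_PENDING		2
	#define CON_FLAG_SOCK_CLOSED		3
	#define CON_FLAG_BACKOFF		4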
@@ -2183,26 +2356,31 @@ fault:
2183 * exponential backoff 2356 * exponential backoff
2184 */ 2357 */
2185static void ceph_fault(struct ceph_connection *con) 2358static void ceph_fault(struct ceph_connection *con)
2359 __releases(con->mutex)
2186{ 2360{
2187 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), 2361 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
2188 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg); 2362 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
2189 dout("fault %p state %lu to peer %s\n", 2363 dout("fault %p state %lu to peer %s\n",
2190 con, con->state, ceph_pr_addr(&con->peer_addr.in_addr)); 2364 con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
2191 2365
2192 if (test_bit(LOSSYTX, &con->state)) { 2366 BUG_ON(con->state != CON_STATE_CONNECTING &&
2193 dout("fault on LOSSYTX channel\n"); 2367 con->state != CON_STATE_NEGOTIATING &&
2194 goto out; 2368 con->state != CON_STATE_OPEN);
2195 }
2196
2197 mutex_lock(&con->mutex);
2198 if (test_bit(CLOSED, &con->state))
2199 goto out_unlock;
2200 2369
2201 con_close_socket(con); 2370 con_close_socket(con);
2202 2371
2372 if (test_bit(CON_FLAG_LOSSYTX, &con->flags)) {
2373 dout("fault on LOSSYTX channel, marking CLOSED\n");
2374 con->state = CON_STATE_CLOSED;
2375 goto out_unlock;
2376 }
2377
2203 if (con->in_msg) { 2378 if (con->in_msg) {
2379 BUG_ON(con->in_msg->con != con);
2380 con->in_msg->con = NULL;
2204 ceph_msg_put(con->in_msg); 2381 ceph_msg_put(con->in_msg);
2205 con->in_msg = NULL; 2382 con->in_msg = NULL;
2383 con->ops->put(con);
2206 } 2384 }
2207 2385
2208 /* Requeue anything that hasn't been acked */ 2386 /* Requeue anything that hasn't been acked */
@@ -2211,12 +2389,13 @@ static void ceph_fault(struct ceph_connection *con)
2211 /* If there are no messages queued or keepalive pending, place 2389 /* If there are no messages queued or keepalive pending, place
2212 * the connection in a STANDBY state */ 2390 * the connection in a STANDBY state */
2213 if (list_empty(&con->out_queue) && 2391 if (list_empty(&con->out_queue) &&
2214 !test_bit(KEEPALIVE_PENDING, &con->state)) { 2392 !test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags)) {
2215 dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con); 2393 dout("fault %p setting STANDBY clearing WRITE_PENDING\n", con);
2216 clear_bit(WRITE_PENDING, &con->state); 2394 clear_bit(CON_FLAG_WRITE_PENDING, &con->flags);
2217 set_bit(STANDBY, &con->state); 2395 con->state = CON_STATE_STANDBY;
2218 } else { 2396 } else {
2219 /* retry after a delay. */ 2397 /* retry after a delay. */
2398 con->state = CON_STATE_PREOPEN;
2220 if (con->delay == 0) 2399 if (con->delay == 0)
2221 con->delay = BASE_DELAY_INTERVAL; 2400 con->delay = BASE_DELAY_INTERVAL;
2222 else if (con->delay < MAX_DELAY_INTERVAL) 2401 else if (con->delay < MAX_DELAY_INTERVAL)
@@ -2237,13 +2416,12 @@ static void ceph_fault(struct ceph_connection *con)
2237 * that when con_work restarts we schedule the 2416 * that when con_work restarts we schedule the
2238 * delay then. 2417 * delay then.
2239 */ 2418 */
2240 set_bit(BACKOFF, &con->state); 2419 set_bit(CON_FLAG_BACKOFF, &con->flags);
2241 } 2420 }
2242 } 2421 }
2243 2422
2244out_unlock: 2423out_unlock:
2245 mutex_unlock(&con->mutex); 2424 mutex_unlock(&con->mutex);
2246out:
2247 /* 2425 /*
2248 * in case we faulted due to authentication, invalidate our 2426 * in case we faulted due to authentication, invalidate our
2249 * current tickets so that we can get new ones. 2427 * current tickets so that we can get new ones.
@@ -2260,18 +2438,14 @@ out:
2260 2438
2261 2439
2262/* 2440/*
2263 * create a new messenger instance 2441 * initialize a new messenger instance
2264 */ 2442 */
2265struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr, 2443void ceph_messenger_init(struct ceph_messenger *msgr,
2266 u32 supported_features, 2444 struct ceph_entity_addr *myaddr,
2267 u32 required_features) 2445 u32 supported_features,
2446 u32 required_features,
2447 bool nocrc)
2268{ 2448{
2269 struct ceph_messenger *msgr;
2270
2271 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
2272 if (msgr == NULL)
2273 return ERR_PTR(-ENOMEM);
2274
2275 msgr->supported_features = supported_features; 2449 msgr->supported_features = supported_features;
2276 msgr->required_features = required_features; 2450 msgr->required_features = required_features;
2277 2451
@@ -2284,30 +2458,23 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
2284 msgr->inst.addr.type = 0; 2458 msgr->inst.addr.type = 0;
2285 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce)); 2459 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
2286 encode_my_addr(msgr); 2460 encode_my_addr(msgr);
2461 msgr->nocrc = nocrc;
2287 2462
2288 dout("messenger_create %p\n", msgr); 2463 atomic_set(&msgr->stopping, 0);
2289 return msgr;
2290}
2291EXPORT_SYMBOL(ceph_messenger_create);
2292 2464
2293void ceph_messenger_destroy(struct ceph_messenger *msgr) 2465 dout("%s %p\n", __func__, msgr);
2294{
2295 dout("destroy %p\n", msgr);
2296 kfree(msgr);
2297 dout("destroyed messenger %p\n", msgr);
2298} 2466}
2299EXPORT_SYMBOL(ceph_messenger_destroy); 2467EXPORT_SYMBOL(ceph_messenger_init);
2300 2468
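[Editor's note] With ceph_messenger_create()/ceph_messenger_destroy() gone, the messenger is embedded in its client and set up in place; the nocrc option now arrives as a parameter instead of being poked in afterwards. Caller-side sketch (the client layout is assumed):

	/* before: heap object, error pointer on failure */
	msgr = ceph_messenger_create(myaddr, supported, required);

	/* after: cannot fail, storage owned by the caller */
	ceph_messenger_init(&client->msgr, myaddr,
			    supported, required, nocrc);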
2301static void clear_standby(struct ceph_connection *con) 2469static void clear_standby(struct ceph_connection *con)
2302{ 2470{
2303 /* come back from STANDBY? */ 2471 /* come back from STANDBY? */
2304 if (test_and_clear_bit(STANDBY, &con->state)) { 2472 if (con->state == CON_STATE_STANDBY) {
2305 mutex_lock(&con->mutex);
2306 dout("clear_standby %p and ++connect_seq\n", con); 2473 dout("clear_standby %p and ++connect_seq\n", con);
2474 con->state = CON_STATE_PREOPEN;
2307 con->connect_seq++; 2475 con->connect_seq++;
2308 WARN_ON(test_bit(WRITE_PENDING, &con->state)); 2476 WARN_ON(test_bit(CON_FLAG_WRITE_PENDING, &con->flags));
2309 WARN_ON(test_bit(KEEPALIVE_PENDING, &con->state)); 2477 WARN_ON(test_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags));
2310 mutex_unlock(&con->mutex);
2311 } 2478 }
2312} 2479}
2313 2480
@@ -2316,21 +2483,24 @@ static void clear_standby(struct ceph_connection *con)
2316 */ 2483 */
2317void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg) 2484void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
2318{ 2485{
2319 if (test_bit(CLOSED, &con->state)) {
2320 dout("con_send %p closed, dropping %p\n", con, msg);
2321 ceph_msg_put(msg);
2322 return;
2323 }
2324
2325 /* set src+dst */ 2486 /* set src+dst */
2326 msg->hdr.src = con->msgr->inst.name; 2487 msg->hdr.src = con->msgr->inst.name;
2327
2328 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); 2488 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
2329
2330 msg->needs_out_seq = true; 2489 msg->needs_out_seq = true;
2331 2490
2332 /* queue */
2333 mutex_lock(&con->mutex); 2491 mutex_lock(&con->mutex);
2492
2493 if (con->state == CON_STATE_CLOSED) {
2494 dout("con_send %p closed, dropping %p\n", con, msg);
2495 ceph_msg_put(msg);
2496 mutex_unlock(&con->mutex);
2497 return;
2498 }
2499
2500 BUG_ON(msg->con != NULL);
2501 msg->con = con->ops->get(con);
2502 BUG_ON(msg->con == NULL);
2503
2334 BUG_ON(!list_empty(&msg->list_head)); 2504 BUG_ON(!list_empty(&msg->list_head));
2335 list_add_tail(&msg->list_head, &con->out_queue); 2505 list_add_tail(&msg->list_head, &con->out_queue);
2336 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg, 2506 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
@@ -2339,12 +2509,13 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
2339 le32_to_cpu(msg->hdr.front_len), 2509 le32_to_cpu(msg->hdr.front_len),
2340 le32_to_cpu(msg->hdr.middle_len), 2510 le32_to_cpu(msg->hdr.middle_len),
2341 le32_to_cpu(msg->hdr.data_len)); 2511 le32_to_cpu(msg->hdr.data_len));
2512
2513 clear_standby(con);
2342 mutex_unlock(&con->mutex); 2514 mutex_unlock(&con->mutex);
2343 2515
2344 /* if there wasn't anything waiting to send before, queue 2516 /* if there wasn't anything waiting to send before, queue
2345 * new work */ 2517 * new work */
2346 clear_standby(con); 2518 if (test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0)
2347 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2348 queue_con(con); 2519 queue_con(con);
2349} 2520}
2350EXPORT_SYMBOL(ceph_con_send); 2521EXPORT_SYMBOL(ceph_con_send);
@@ -2352,24 +2523,34 @@ EXPORT_SYMBOL(ceph_con_send);
2352/* 2523/*
2353 * Revoke a message that was previously queued for send 2524 * Revoke a message that was previously queued for send
2354 */ 2525 */
2355void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg) 2526void ceph_msg_revoke(struct ceph_msg *msg)
2356{ 2527{
2528 struct ceph_connection *con = msg->con;
2529
2530 if (!con)
2531 return; /* Message not in our possession */
2532
2357 mutex_lock(&con->mutex); 2533 mutex_lock(&con->mutex);
2358 if (!list_empty(&msg->list_head)) { 2534 if (!list_empty(&msg->list_head)) {
2359 dout("con_revoke %p msg %p - was on queue\n", con, msg); 2535 dout("%s %p msg %p - was on queue\n", __func__, con, msg);
2360 list_del_init(&msg->list_head); 2536 list_del_init(&msg->list_head);
2361 ceph_msg_put(msg); 2537 BUG_ON(msg->con == NULL);
2538 msg->con->ops->put(msg->con);
2539 msg->con = NULL;
2362 msg->hdr.seq = 0; 2540 msg->hdr.seq = 0;
2541
2542 ceph_msg_put(msg);
2363 } 2543 }
2364 if (con->out_msg == msg) { 2544 if (con->out_msg == msg) {
2365 dout("con_revoke %p msg %p - was sending\n", con, msg); 2545 dout("%s %p msg %p - was sending\n", __func__, con, msg);
2366 con->out_msg = NULL; 2546 con->out_msg = NULL;
2367 if (con->out_kvec_is_msg) { 2547 if (con->out_kvec_is_msg) {
2368 con->out_skip = con->out_kvec_bytes; 2548 con->out_skip = con->out_kvec_bytes;
2369 con->out_kvec_is_msg = false; 2549 con->out_kvec_is_msg = false;
2370 } 2550 }
2371 ceph_msg_put(msg);
2372 msg->hdr.seq = 0; 2551 msg->hdr.seq = 0;
2552
2553 ceph_msg_put(msg);
2373 } 2554 }
2374 mutex_unlock(&con->mutex); 2555 mutex_unlock(&con->mutex);
2375} 2556}
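[Editor's note] Since the message now carries its connection back-pointer, revocation needs only the message; existing callers change mechanically, exactly as in the mon_client.c hunks below:

	/* before */
	ceph_con_revoke(monc->con, monc->m_auth);

	/* after */
	ceph_msg_revoke(monc->m_auth);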
@@ -2377,17 +2558,27 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
2377/* 2558/*
2378 * Revoke a message that we may be reading data into 2559 * Revoke a message that we may be reading data into
2379 */ 2560 */
2380void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) 2561void ceph_msg_revoke_incoming(struct ceph_msg *msg)
2381{ 2562{
2563 struct ceph_connection *con;
2564
2565 BUG_ON(msg == NULL);
2566 if (!msg->con) {
2567 dout("%s msg %p null con\n", __func__, msg);
2568
2569 return; /* Message not in our possession */
2570 }
2571
2572 con = msg->con;
2382 mutex_lock(&con->mutex); 2573 mutex_lock(&con->mutex);
2383 if (con->in_msg && con->in_msg == msg) { 2574 if (con->in_msg == msg) {
2384 unsigned int front_len = le32_to_cpu(con->in_hdr.front_len); 2575 unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
2385 unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len); 2576 unsigned int middle_len = le32_to_cpu(con->in_hdr.middle_len);
2386 unsigned int data_len = le32_to_cpu(con->in_hdr.data_len); 2577 unsigned int data_len = le32_to_cpu(con->in_hdr.data_len);
2387 2578
2388 /* skip rest of message */ 2579 /* skip rest of message */
2389 dout("con_revoke_pages %p msg %p revoked\n", con, msg); 2580 dout("%s %p msg %p revoked\n", __func__, con, msg);
2390 con->in_base_pos = con->in_base_pos - 2581 con->in_base_pos = con->in_base_pos -
2391 sizeof(struct ceph_msg_header) - 2582 sizeof(struct ceph_msg_header) -
2392 front_len - 2583 front_len -
2393 middle_len - 2584 middle_len -
@@ -2398,8 +2589,8 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2398 con->in_tag = CEPH_MSGR_TAG_READY; 2589 con->in_tag = CEPH_MSGR_TAG_READY;
2399 con->in_seq++; 2590 con->in_seq++;
2400 } else { 2591 } else {
2401 dout("con_revoke_pages %p msg %p pages %p no-op\n", 2592 dout("%s %p in_msg %p msg %p no-op\n",
2402 con, con->in_msg, msg); 2593 __func__, con, con->in_msg, msg);
2403 } 2594 }
2404 mutex_unlock(&con->mutex); 2595 mutex_unlock(&con->mutex);
2405} 2596}
@@ -2410,9 +2601,11 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2410void ceph_con_keepalive(struct ceph_connection *con) 2601void ceph_con_keepalive(struct ceph_connection *con)
2411{ 2602{
2412 dout("con_keepalive %p\n", con); 2603 dout("con_keepalive %p\n", con);
2604 mutex_lock(&con->mutex);
2413 clear_standby(con); 2605 clear_standby(con);
2414 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 && 2606 mutex_unlock(&con->mutex);
2415 test_and_set_bit(WRITE_PENDING, &con->state) == 0) 2607 if (test_and_set_bit(CON_FLAG_KEEPALIVE_PENDING, &con->flags) == 0 &&
2608 test_and_set_bit(CON_FLAG_WRITE_PENDING, &con->flags) == 0)
2416 queue_con(con); 2609 queue_con(con);
2417} 2610}
2418EXPORT_SYMBOL(ceph_con_keepalive); 2611EXPORT_SYMBOL(ceph_con_keepalive);
@@ -2431,6 +2624,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
2431 if (m == NULL) 2624 if (m == NULL)
2432 goto out; 2625 goto out;
2433 kref_init(&m->kref); 2626 kref_init(&m->kref);
2627
2628 m->con = NULL;
2434 INIT_LIST_HEAD(&m->list_head); 2629 INIT_LIST_HEAD(&m->list_head);
2435 2630
2436 m->hdr.tid = 0; 2631 m->hdr.tid = 0;
@@ -2526,46 +2721,77 @@ static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2526} 2721}
2527 2722
2528/* 2723/*
2529 * Generic message allocator, for incoming messages. 2724 * Allocate a message for receiving an incoming message on a
2725 * connection, and save the result in con->in_msg. Uses the
2726 * connection's private alloc_msg op if available.
2727 *
2728 * Returns 0 on success, or a negative error code.
2729 *
2730 * On success, if we set *skip = 1:
2731 * - the next message should be skipped and ignored.
2732 * - con->in_msg == NULL
2733 * or if we set *skip = 0:
2734 * - con->in_msg is non-null.
2735 * On error (ENOMEM, EAGAIN, ...),
2736 * - con->in_msg == NULL
2530 */ 2737 */
2531static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, 2738static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
2532 struct ceph_msg_header *hdr,
2533 int *skip)
2534{ 2739{
2740 struct ceph_msg_header *hdr = &con->in_hdr;
2535 int type = le16_to_cpu(hdr->type); 2741 int type = le16_to_cpu(hdr->type);
2536 int front_len = le32_to_cpu(hdr->front_len); 2742 int front_len = le32_to_cpu(hdr->front_len);
2537 int middle_len = le32_to_cpu(hdr->middle_len); 2743 int middle_len = le32_to_cpu(hdr->middle_len);
2538 struct ceph_msg *msg = NULL; 2744 int ret = 0;
2539 int ret; 2745
2746 BUG_ON(con->in_msg != NULL);
2540 2747
2541 if (con->ops->alloc_msg) { 2748 if (con->ops->alloc_msg) {
2749 struct ceph_msg *msg;
2750
2542 mutex_unlock(&con->mutex); 2751 mutex_unlock(&con->mutex);
2543 msg = con->ops->alloc_msg(con, hdr, skip); 2752 msg = con->ops->alloc_msg(con, hdr, skip);
2544 mutex_lock(&con->mutex); 2753 mutex_lock(&con->mutex);
2545 if (!msg || *skip) 2754 if (con->state != CON_STATE_OPEN) {
2546 return NULL; 2755 ceph_msg_put(msg);
2756 return -EAGAIN;
2757 }
2758 con->in_msg = msg;
2759 if (con->in_msg) {
2760 con->in_msg->con = con->ops->get(con);
2761 BUG_ON(con->in_msg->con == NULL);
2762 }
2763 if (*skip) {
2764 con->in_msg = NULL;
2765 return 0;
2766 }
2767 if (!con->in_msg) {
2768 con->error_msg =
2769 "error allocating memory for incoming message";
2770 return -ENOMEM;
2771 }
2547 } 2772 }
2548 if (!msg) { 2773 if (!con->in_msg) {
2549 *skip = 0; 2774 con->in_msg = ceph_msg_new(type, front_len, GFP_NOFS, false);
2550 msg = ceph_msg_new(type, front_len, GFP_NOFS, false); 2775 if (!con->in_msg) {
2551 if (!msg) {
2552 pr_err("unable to allocate msg type %d len %d\n", 2776 pr_err("unable to allocate msg type %d len %d\n",
2553 type, front_len); 2777 type, front_len);
2554 return NULL; 2778 return -ENOMEM;
2555 } 2779 }
2556 msg->page_alignment = le16_to_cpu(hdr->data_off); 2780 con->in_msg->con = con->ops->get(con);
2781 BUG_ON(con->in_msg->con == NULL);
2782 con->in_msg->page_alignment = le16_to_cpu(hdr->data_off);
2557 } 2783 }
2558 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 2784 memcpy(&con->in_msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2559 2785
2560 if (middle_len && !msg->middle) { 2786 if (middle_len && !con->in_msg->middle) {
2561 ret = ceph_alloc_middle(con, msg); 2787 ret = ceph_alloc_middle(con, con->in_msg);
2562 if (ret < 0) { 2788 if (ret < 0) {
2563 ceph_msg_put(msg); 2789 ceph_msg_put(con->in_msg);
2564 return NULL; 2790 con->in_msg = NULL;
2565 } 2791 }
2566 } 2792 }
2567 2793
2568 return msg; 2794 return ret;
2569} 2795}
2570 2796
2571 2797
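[Editor's note] The rewritten allocator returns the tri-state spelled out in the comment above; a sketch of how the caller in read_partial_message (earlier in this patch) consumes it — the skip_message label is hypothetical:

	int skip = 0;
	int ret = ceph_con_in_msg_alloc(con, &skip);

	if (ret < 0)
		return ret;		/* error path: con->in_msg stays NULL */
	if (skip)
		goto skip_message;	/* *skip set: con->in_msg is NULL too */
	BUG_ON(!con->in_msg);		/* success: message attached, con ref held */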
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index d0649a9655be..900ea0f043fc 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -106,9 +106,9 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
106 monc->pending_auth = 1; 106 monc->pending_auth = 1;
107 monc->m_auth->front.iov_len = len; 107 monc->m_auth->front.iov_len = len;
108 monc->m_auth->hdr.front_len = cpu_to_le32(len); 108 monc->m_auth->hdr.front_len = cpu_to_le32(len);
109 ceph_con_revoke(monc->con, monc->m_auth); 109 ceph_msg_revoke(monc->m_auth);
110 ceph_msg_get(monc->m_auth); /* keep our ref */ 110 ceph_msg_get(monc->m_auth); /* keep our ref */
111 ceph_con_send(monc->con, monc->m_auth); 111 ceph_con_send(&monc->con, monc->m_auth);
112} 112}
113 113
114/* 114/*
@@ -117,8 +117,11 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
117static void __close_session(struct ceph_mon_client *monc) 117static void __close_session(struct ceph_mon_client *monc)
118{ 118{
119 dout("__close_session closing mon%d\n", monc->cur_mon); 119 dout("__close_session closing mon%d\n", monc->cur_mon);
120 ceph_con_revoke(monc->con, monc->m_auth); 120 ceph_msg_revoke(monc->m_auth);
121 ceph_con_close(monc->con); 121 ceph_msg_revoke_incoming(monc->m_auth_reply);
122 ceph_msg_revoke(monc->m_subscribe);
123 ceph_msg_revoke_incoming(monc->m_subscribe_ack);
124 ceph_con_close(&monc->con);
122 monc->cur_mon = -1; 125 monc->cur_mon = -1;
123 monc->pending_auth = 0; 126 monc->pending_auth = 0;
124 ceph_auth_reset(monc->auth); 127 ceph_auth_reset(monc->auth);
@@ -142,9 +145,8 @@ static int __open_session(struct ceph_mon_client *monc)
142 monc->want_next_osdmap = !!monc->want_next_osdmap; 145 monc->want_next_osdmap = !!monc->want_next_osdmap;
143 146
144 dout("open_session mon%d opening\n", monc->cur_mon); 147 dout("open_session mon%d opening\n", monc->cur_mon);
145 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON; 148 ceph_con_open(&monc->con,
146 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon); 149 CEPH_ENTITY_TYPE_MON, monc->cur_mon,
147 ceph_con_open(monc->con,
148 &monc->monmap->mon_inst[monc->cur_mon].addr); 150 &monc->monmap->mon_inst[monc->cur_mon].addr);
149 151
 150 /* initiate authentication handshake */ 152
@@ -226,8 +228,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
226 228
227 msg->front.iov_len = p - msg->front.iov_base; 229 msg->front.iov_len = p - msg->front.iov_base;
228 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 230 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
229 ceph_con_revoke(monc->con, msg); 231 ceph_msg_revoke(msg);
230 ceph_con_send(monc->con, ceph_msg_get(msg)); 232 ceph_con_send(&monc->con, ceph_msg_get(msg));
231 233
232 monc->sub_sent = jiffies | 1; /* never 0 */ 234 monc->sub_sent = jiffies | 1; /* never 0 */
233 } 235 }
@@ -247,7 +249,7 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
247 if (monc->hunting) { 249 if (monc->hunting) {
248 pr_info("mon%d %s session established\n", 250 pr_info("mon%d %s session established\n",
249 monc->cur_mon, 251 monc->cur_mon,
250 ceph_pr_addr(&monc->con->peer_addr.in_addr)); 252 ceph_pr_addr(&monc->con.peer_addr.in_addr));
251 monc->hunting = false; 253 monc->hunting = false;
252 } 254 }
253 dout("handle_subscribe_ack after %d seconds\n", seconds); 255 dout("handle_subscribe_ack after %d seconds\n", seconds);
@@ -309,6 +311,17 @@ int ceph_monc_open_session(struct ceph_mon_client *monc)
309EXPORT_SYMBOL(ceph_monc_open_session); 311EXPORT_SYMBOL(ceph_monc_open_session);
310 312
311/* 313/*
314 * We require the fsid and global_id in order to initialize our
315 * debugfs dir.
316 */
317static bool have_debugfs_info(struct ceph_mon_client *monc)
318{
319 dout("have_debugfs_info fsid %d globalid %lld\n",
320 (int)monc->client->have_fsid, monc->auth->global_id);
321 return monc->client->have_fsid && monc->auth->global_id > 0;
322}
323
324/*
 312 * The monitor responds with a mount ack to indicate mount success. The 325
313 * included client ticket allows the client to talk to MDSs and OSDs. 326 * included client ticket allows the client to talk to MDSs and OSDs.
314 */ 327 */
@@ -318,9 +331,12 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
318 struct ceph_client *client = monc->client; 331 struct ceph_client *client = monc->client;
319 struct ceph_monmap *monmap = NULL, *old = monc->monmap; 332 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
320 void *p, *end; 333 void *p, *end;
334 int had_debugfs_info, init_debugfs = 0;
321 335
322 mutex_lock(&monc->mutex); 336 mutex_lock(&monc->mutex);
323 337
338 had_debugfs_info = have_debugfs_info(monc);
339
324 dout("handle_monmap\n"); 340 dout("handle_monmap\n");
325 p = msg->front.iov_base; 341 p = msg->front.iov_base;
326 end = p + msg->front.iov_len; 342 end = p + msg->front.iov_len;
@@ -342,12 +358,22 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
342 358
343 if (!client->have_fsid) { 359 if (!client->have_fsid) {
344 client->have_fsid = true; 360 client->have_fsid = true;
361 if (!had_debugfs_info && have_debugfs_info(monc)) {
362 pr_info("client%lld fsid %pU\n",
363 ceph_client_id(monc->client),
364 &monc->client->fsid);
365 init_debugfs = 1;
366 }
345 mutex_unlock(&monc->mutex); 367 mutex_unlock(&monc->mutex);
346 /* 368
347 * do debugfs initialization without mutex to avoid 369 if (init_debugfs) {
348 * creating a locking dependency 370 /*
349 */ 371 * do debugfs initialization without mutex to avoid
350 ceph_debugfs_client_init(client); 372 * creating a locking dependency
373 */
374 ceph_debugfs_client_init(monc->client);
375 }
376
351 goto out_unlocked; 377 goto out_unlocked;
352 } 378 }
353out: 379out:
@@ -439,6 +465,7 @@ static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
439 m = NULL; 465 m = NULL;
440 } else { 466 } else {
441 dout("get_generic_reply %lld got %p\n", tid, req->reply); 467 dout("get_generic_reply %lld got %p\n", tid, req->reply);
468 *skip = 0;
442 m = ceph_msg_get(req->reply); 469 m = ceph_msg_get(req->reply);
443 /* 470 /*
444 * we don't need to track the connection reading into 471 * we don't need to track the connection reading into
@@ -461,7 +488,7 @@ static int do_generic_request(struct ceph_mon_client *monc,
461 req->request->hdr.tid = cpu_to_le64(req->tid); 488 req->request->hdr.tid = cpu_to_le64(req->tid);
462 __insert_generic_request(monc, req); 489 __insert_generic_request(monc, req);
463 monc->num_generic_requests++; 490 monc->num_generic_requests++;
464 ceph_con_send(monc->con, ceph_msg_get(req->request)); 491 ceph_con_send(&monc->con, ceph_msg_get(req->request));
465 mutex_unlock(&monc->mutex); 492 mutex_unlock(&monc->mutex);
466 493
467 err = wait_for_completion_interruptible(&req->completion); 494 err = wait_for_completion_interruptible(&req->completion);
@@ -684,8 +711,9 @@ static void __resend_generic_request(struct ceph_mon_client *monc)
684 711
685 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) { 712 for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
686 req = rb_entry(p, struct ceph_mon_generic_request, node); 713 req = rb_entry(p, struct ceph_mon_generic_request, node);
687 ceph_con_revoke(monc->con, req->request); 714 ceph_msg_revoke(req->request);
688 ceph_con_send(monc->con, ceph_msg_get(req->request)); 715 ceph_msg_revoke_incoming(req->reply);
716 ceph_con_send(&monc->con, ceph_msg_get(req->request));
689 } 717 }
690} 718}
691 719
@@ -705,7 +733,7 @@ static void delayed_work(struct work_struct *work)
705 __close_session(monc); 733 __close_session(monc);
706 __open_session(monc); /* continue hunting */ 734 __open_session(monc); /* continue hunting */
707 } else { 735 } else {
708 ceph_con_keepalive(monc->con); 736 ceph_con_keepalive(&monc->con);
709 737
710 __validate_auth(monc); 738 __validate_auth(monc);
711 739
@@ -760,19 +788,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
760 goto out; 788 goto out;
761 789
762 /* connection */ 790 /* connection */
763 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
764 if (!monc->con)
765 goto out_monmap;
766 ceph_con_init(monc->client->msgr, monc->con);
767 monc->con->private = monc;
768 monc->con->ops = &mon_con_ops;
769
770 /* authentication */ 791 /* authentication */
771 monc->auth = ceph_auth_init(cl->options->name, 792 monc->auth = ceph_auth_init(cl->options->name,
772 cl->options->key); 793 cl->options->key);
773 if (IS_ERR(monc->auth)) { 794 if (IS_ERR(monc->auth)) {
774 err = PTR_ERR(monc->auth); 795 err = PTR_ERR(monc->auth);
775 goto out_con; 796 goto out_monmap;
776 } 797 }
777 monc->auth->want_keys = 798 monc->auth->want_keys =
778 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON | 799 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
@@ -801,6 +822,9 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
801 if (!monc->m_auth) 822 if (!monc->m_auth)
802 goto out_auth_reply; 823 goto out_auth_reply;
803 824
825 ceph_con_init(&monc->con, monc, &mon_con_ops,
826 &monc->client->msgr);
827
804 monc->cur_mon = -1; 828 monc->cur_mon = -1;
805 monc->hunting = true; 829 monc->hunting = true;
806 monc->sub_renew_after = jiffies; 830 monc->sub_renew_after = jiffies;
@@ -824,8 +848,6 @@ out_subscribe_ack:
824 ceph_msg_put(monc->m_subscribe_ack); 848 ceph_msg_put(monc->m_subscribe_ack);
825out_auth: 849out_auth:
826 ceph_auth_destroy(monc->auth); 850 ceph_auth_destroy(monc->auth);
827out_con:
828 monc->con->ops->put(monc->con);
829out_monmap: 851out_monmap:
830 kfree(monc->monmap); 852 kfree(monc->monmap);
831out: 853out:
@@ -841,10 +863,6 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
841 mutex_lock(&monc->mutex); 863 mutex_lock(&monc->mutex);
842 __close_session(monc); 864 __close_session(monc);
843 865
844 monc->con->private = NULL;
845 monc->con->ops->put(monc->con);
846 monc->con = NULL;
847
848 mutex_unlock(&monc->mutex); 866 mutex_unlock(&monc->mutex);
849 867
850 /* 868 /*
@@ -871,8 +889,10 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
871{ 889{
872 int ret; 890 int ret;
873 int was_auth = 0; 891 int was_auth = 0;
892 int had_debugfs_info, init_debugfs = 0;
874 893
875 mutex_lock(&monc->mutex); 894 mutex_lock(&monc->mutex);
895 had_debugfs_info = have_debugfs_info(monc);
876 if (monc->auth->ops) 896 if (monc->auth->ops)
877 was_auth = monc->auth->ops->is_authenticated(monc->auth); 897 was_auth = monc->auth->ops->is_authenticated(monc->auth);
878 monc->pending_auth = 0; 898 monc->pending_auth = 0;
@@ -888,14 +908,29 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
888 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) { 908 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
889 dout("authenticated, starting session\n"); 909 dout("authenticated, starting session\n");
890 910
891 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 911 monc->client->msgr.inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
892 monc->client->msgr->inst.name.num = 912 monc->client->msgr.inst.name.num =
893 cpu_to_le64(monc->auth->global_id); 913 cpu_to_le64(monc->auth->global_id);
894 914
895 __send_subscribe(monc); 915 __send_subscribe(monc);
896 __resend_generic_request(monc); 916 __resend_generic_request(monc);
897 } 917 }
918
919 if (!had_debugfs_info && have_debugfs_info(monc)) {
920 pr_info("client%lld fsid %pU\n",
921 ceph_client_id(monc->client),
922 &monc->client->fsid);
923 init_debugfs = 1;
924 }
898 mutex_unlock(&monc->mutex); 925 mutex_unlock(&monc->mutex);
926
927 if (init_debugfs) {
928 /*
929 * do debugfs initialization without mutex to avoid
930 * creating a locking dependency
931 */
932 ceph_debugfs_client_init(monc->client);
933 }
899} 934}
900 935
901static int __validate_auth(struct ceph_mon_client *monc) 936static int __validate_auth(struct ceph_mon_client *monc)
@@ -1000,6 +1035,8 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
1000 case CEPH_MSG_MDS_MAP: 1035 case CEPH_MSG_MDS_MAP:
1001 case CEPH_MSG_OSD_MAP: 1036 case CEPH_MSG_OSD_MAP:
1002 m = ceph_msg_new(type, front_len, GFP_NOFS, false); 1037 m = ceph_msg_new(type, front_len, GFP_NOFS, false);
1038 if (!m)
1039 return NULL; /* ENOMEM--return skip == 0 */
1003 break; 1040 break;
1004 } 1041 }
1005 1042
@@ -1029,7 +1066,7 @@ static void mon_fault(struct ceph_connection *con)
1029 if (!monc->hunting) 1066 if (!monc->hunting)
1030 pr_info("mon%d %s session lost, " 1067 pr_info("mon%d %s session lost, "
1031 "hunting for new mon\n", monc->cur_mon, 1068 "hunting for new mon\n", monc->cur_mon,
1032 ceph_pr_addr(&monc->con->peer_addr.in_addr)); 1069 ceph_pr_addr(&monc->con.peer_addr.in_addr));
1033 1070
1034 __close_session(monc); 1071 __close_session(monc);
1035 if (!monc->hunting) { 1072 if (!monc->hunting) {
@@ -1044,9 +1081,23 @@ out:
1044 mutex_unlock(&monc->mutex); 1081 mutex_unlock(&monc->mutex);
1045} 1082}
1046 1083
1084/*
1085 * We can ignore refcounting on the connection struct, as all references
1086 * will come from the messenger workqueue, which is drained prior to
1087 * mon_client destruction.
1088 */
1089static struct ceph_connection *con_get(struct ceph_connection *con)
1090{
1091 return con;
1092}
1093
1094static void con_put(struct ceph_connection *con)
1095{
1096}
1097
1047static const struct ceph_connection_operations mon_con_ops = { 1098static const struct ceph_connection_operations mon_con_ops = {
1048 .get = ceph_con_get, 1099 .get = con_get,
1049 .put = ceph_con_put, 1100 .put = con_put,
1050 .dispatch = dispatch, 1101 .dispatch = dispatch,
1051 .fault = mon_fault, 1102 .fault = mon_fault,
1052 .alloc_msg = mon_alloc_msg, 1103 .alloc_msg = mon_alloc_msg,
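
The mon_client hunks above replace a separately allocated, refcounted connection with one embedded in the client, which is why mon_con_ops can switch to no-op get/put. A minimal userspace sketch of that embed-and-init pattern (names are illustrative, not the libceph API):

	#include <stdio.h>

	struct conn_ops;

	struct conn {
		void *private_data;
		const struct conn_ops *ops;
	};

	struct conn_ops {
		struct conn *(*get)(struct conn *);
		void (*put)(struct conn *);
	};

	struct mon_client {
		struct conn con;	/* embedded: no separate kmalloc/kfree */
	};

	/* lifetime is the owner's lifetime, so refcounting degenerates */
	static struct conn *con_get(struct conn *con) { return con; }
	static void con_put(struct conn *con) { (void)con; }

	static const struct conn_ops mon_ops = { con_get, con_put };

	static void conn_init(struct conn *con, void *priv,
			      const struct conn_ops *ops)
	{
		con->private_data = priv;
		con->ops = ops;
	}

	int main(void)
	{
		struct mon_client monc;

		conn_init(&monc.con, &monc, &mon_ops);	/* one call, as ceph_con_init() now is */
		printf("owner back-pointer ok: %d\n",
		       monc.con.private_data == &monc);
		return 0;
	}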
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
index 11d5f4196a73..ddec1c10ac80 100644
--- a/net/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -12,7 +12,7 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
 	struct ceph_msgpool *pool = arg;
 	struct ceph_msg *msg;
 
-	msg = ceph_msg_new(0, pool->front_len, gfp_mask, true);
+	msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true);
 	if (!msg) {
 		dout("msgpool_alloc %s failed\n", pool->name);
 	} else {
@@ -32,10 +32,11 @@ static void msgpool_free(void *element, void *arg)
 	ceph_msg_put(msg);
 }
 
-int ceph_msgpool_init(struct ceph_msgpool *pool,
+int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
 		      int front_len, int size, bool blocking, const char *name)
 {
 	dout("msgpool %s init\n", name);
+	pool->type = type;
 	pool->front_len = front_len;
 	pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool);
 	if (!pool->pool)
@@ -61,7 +62,7 @@ struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
 		WARN_ON(1);
 
 		/* try to alloc a fresh message */
-		return ceph_msg_new(0, front_len, GFP_NOFS, false);
+		return ceph_msg_new(pool->type, front_len, GFP_NOFS, false);
 	}
 
 	msg = mempool_alloc(pool->pool, GFP_NOFS);
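
Recording the message type in the pool at init time means every message the pool hands out, pooled or freshly allocated, is stamped consistently, and callers no longer pass 0 and patch the header afterwards. A hedged sketch of the idea with hypothetical names:

	#include <stdlib.h>

	struct msg {
		int type;
		size_t front_len;
	};

	struct msgpool {
		int type;		/* stamped into every message */
		size_t front_len;
	};

	static struct msg *msg_new(int type, size_t front_len)
	{
		struct msg *m = malloc(sizeof(*m));

		if (m) {
			m->type = type;
			m->front_len = front_len;
		}
		return m;
	}

	/* both the mempool refill and the fallback path use pool->type */
	static struct msg *msgpool_get(struct msgpool *pool)
	{
		return msg_new(pool->type, pool->front_len);
	}

	int main(void)
	{
		struct msgpool pool = { 42 /* e.g. an OSD-op type id */, 512 };
		struct msg *m = msgpool_get(&pool);

		free(m);
		return 0;
	}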
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index ca59e66c9787..42119c05e82c 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -140,10 +140,9 @@ void ceph_osdc_release_request(struct kref *kref)
 	if (req->r_request)
 		ceph_msg_put(req->r_request);
 	if (req->r_con_filling_msg) {
-		dout("release_request revoking pages %p from con %p\n",
+		dout("%s revoking pages %p from con %p\n", __func__,
 		     req->r_pages, req->r_con_filling_msg);
-		ceph_con_revoke_message(req->r_con_filling_msg,
-					req->r_reply);
+		ceph_msg_revoke_incoming(req->r_reply);
 		req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
 	}
 	if (req->r_reply)
@@ -214,10 +213,13 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 	kref_init(&req->r_kref);
 	init_completion(&req->r_completion);
 	init_completion(&req->r_safe_completion);
+	rb_init_node(&req->r_node);
 	INIT_LIST_HEAD(&req->r_unsafe_item);
 	INIT_LIST_HEAD(&req->r_linger_item);
 	INIT_LIST_HEAD(&req->r_linger_osd);
 	INIT_LIST_HEAD(&req->r_req_lru_item);
+	INIT_LIST_HEAD(&req->r_osd_item);
+
 	req->r_flags = flags;
 
 	WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
@@ -243,6 +245,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 		}
 		ceph_pagelist_init(req->r_trail);
 	}
+
 	/* create request message; allow space for oid */
 	msg_size += MAX_OBJ_NAME_SIZE;
 	if (snapc)
@@ -256,7 +259,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
 		return NULL;
 	}
 
-	msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
 	memset(msg->front.iov_base, 0, msg->front.iov_len);
 
 	req->r_request = msg;
@@ -624,7 +626,7 @@ static void osd_reset(struct ceph_connection *con)
 /*
  * Track open sessions with osds.
  */
-static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum)
 {
 	struct ceph_osd *osd;
 
@@ -634,15 +636,13 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
 
 	atomic_set(&osd->o_ref, 1);
 	osd->o_osdc = osdc;
+	osd->o_osd = onum;
 	INIT_LIST_HEAD(&osd->o_requests);
 	INIT_LIST_HEAD(&osd->o_linger_requests);
 	INIT_LIST_HEAD(&osd->o_osd_lru);
 	osd->o_incarnation = 1;
 
-	ceph_con_init(osdc->client->msgr, &osd->o_con);
-	osd->o_con.private = osd;
-	osd->o_con.ops = &osd_con_ops;
-	osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+	ceph_con_init(&osd->o_con, osd, &osd_con_ops, &osdc->client->msgr);
 
 	INIT_LIST_HEAD(&osd->o_keepalive_item);
 	return osd;
@@ -688,7 +688,7 @@ static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 
 static void remove_all_osds(struct ceph_osd_client *osdc)
 {
-	dout("__remove_old_osds %p\n", osdc);
+	dout("%s %p\n", __func__, osdc);
 	mutex_lock(&osdc->request_mutex);
 	while (!RB_EMPTY_ROOT(&osdc->osds)) {
 		struct ceph_osd *osd = rb_entry(rb_first(&osdc->osds),
@@ -752,7 +752,8 @@ static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
 		ret = -EAGAIN;
 	} else {
 		ceph_con_close(&osd->o_con);
-		ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+		ceph_con_open(&osd->o_con, CEPH_ENTITY_TYPE_OSD, osd->o_osd,
+			      &osdc->osdmap->osd_addr[osd->o_osd]);
 		osd->o_incarnation++;
 	}
 	return ret;
@@ -853,7 +854,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 
 	if (req->r_osd) {
 		/* make sure the original request isn't in flight. */
-		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+		ceph_msg_revoke(req->r_request);
 
 		list_del_init(&req->r_osd_item);
 		if (list_empty(&req->r_osd->o_requests) &&
@@ -880,7 +881,7 @@ static void __unregister_request(struct ceph_osd_client *osdc,
 static void __cancel_request(struct ceph_osd_request *req)
 {
 	if (req->r_sent && req->r_osd) {
-		ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+		ceph_msg_revoke(req->r_request);
 		req->r_sent = 0;
 	}
 }
@@ -890,7 +891,9 @@ static void __register_linger_request(struct ceph_osd_client *osdc,
 {
 	dout("__register_linger_request %p\n", req);
 	list_add_tail(&req->r_linger_item, &osdc->req_linger);
-	list_add_tail(&req->r_linger_osd, &req->r_osd->o_linger_requests);
+	if (req->r_osd)
+		list_add_tail(&req->r_linger_osd,
+			      &req->r_osd->o_linger_requests);
 }
 
 static void __unregister_linger_request(struct ceph_osd_client *osdc,
@@ -998,18 +1001,18 @@ static int __map_request(struct ceph_osd_client *osdc,
 	req->r_osd = __lookup_osd(osdc, o);
 	if (!req->r_osd && o >= 0) {
 		err = -ENOMEM;
-		req->r_osd = create_osd(osdc);
+		req->r_osd = create_osd(osdc, o);
 		if (!req->r_osd) {
 			list_move(&req->r_req_lru_item, &osdc->req_notarget);
 			goto out;
 		}
 
 		dout("map_request osd %p is osd%d\n", req->r_osd, o);
-		req->r_osd->o_osd = o;
-		req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
 		__insert_osd(osdc, req->r_osd);
 
-		ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+		ceph_con_open(&req->r_osd->o_con,
+			      CEPH_ENTITY_TYPE_OSD, o,
+			      &osdc->osdmap->osd_addr[o]);
 	}
 
 	if (req->r_osd) {
@@ -1304,8 +1307,9 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
 
 	dout("kick_requests %s\n", force_resend ? " (force resend)" : "");
 	mutex_lock(&osdc->request_mutex);
-	for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+	for (p = rb_first(&osdc->requests); p; ) {
 		req = rb_entry(p, struct ceph_osd_request, r_node);
+		p = rb_next(p);
 		err = __map_request(osdc, req, force_resend);
 		if (err < 0)
 			continue;	/* error */
@@ -1313,10 +1317,23 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
 			dout("%p tid %llu maps to no osd\n", req, req->r_tid);
 			needmap++;  /* request a newer map */
 		} else if (err > 0) {
-			dout("%p tid %llu requeued on osd%d\n", req, req->r_tid,
-			     req->r_osd ? req->r_osd->o_osd : -1);
-			if (!req->r_linger)
+			if (!req->r_linger) {
+				dout("%p tid %llu requeued on osd%d\n", req,
+				     req->r_tid,
+				     req->r_osd ? req->r_osd->o_osd : -1);
 				req->r_flags |= CEPH_OSD_FLAG_RETRY;
+			}
+		}
+		if (req->r_linger && list_empty(&req->r_linger_item)) {
+			/*
+			 * register as a linger so that we will
+			 * re-submit below and get a new tid
+			 */
+			dout("%p tid %llu restart on osd%d\n",
+			     req, req->r_tid,
+			     req->r_osd ? req->r_osd->o_osd : -1);
+			__register_linger_request(osdc, req);
+			__unregister_request(osdc, req);
 		}
 	}
 
@@ -1391,7 +1408,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
 			   epoch, maplen);
 			newmap = osdmap_apply_incremental(&p, next,
 							  osdc->osdmap,
-							  osdc->client->msgr);
+							  &osdc->client->msgr);
 			if (IS_ERR(newmap)) {
 				err = PTR_ERR(newmap);
 				goto bad;
@@ -1839,11 +1856,12 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
 	if (!osdc->req_mempool)
 		goto out;
 
-	err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+	err = ceph_msgpool_init(&osdc->msgpool_op, CEPH_MSG_OSD_OP,
+				OSD_OP_FRONT_LEN, 10, true,
 				"osd_op");
 	if (err < 0)
 		goto out_mempool;
-	err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+	err = ceph_msgpool_init(&osdc->msgpool_op_reply, CEPH_MSG_OSD_OPREPLY,
 				OSD_OPREPLY_FRONT_LEN, 10, true,
 				"osd_op_reply");
 	if (err < 0)
@@ -2019,15 +2037,15 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
 	if (!req) {
 		*skip = 1;
 		m = NULL;
-		pr_info("get_reply unknown tid %llu from osd%d\n", tid,
+		dout("get_reply unknown tid %llu from osd%d\n", tid,
 		     osd->o_osd);
 		goto out;
 	}
 
 	if (req->r_con_filling_msg) {
-		dout("get_reply revoking msg %p from old con %p\n",
+		dout("%s revoking msg %p from old con %p\n", __func__,
 		     req->r_reply, req->r_con_filling_msg);
-		ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+		ceph_msg_revoke_incoming(req->r_reply);
 		req->r_con_filling_msg->ops->put(req->r_con_filling_msg);
 		req->r_con_filling_msg = NULL;
 	}
@@ -2080,6 +2098,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
 	int type = le16_to_cpu(hdr->type);
 	int front = le32_to_cpu(hdr->front_len);
 
+	*skip = 0;
 	switch (type) {
 	case CEPH_MSG_OSD_MAP:
 	case CEPH_MSG_WATCH_NOTIFY:
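
The kick_requests() change illustrates a general rule: when the loop body may unlink the current node (here, __unregister_request() can remove it from the rb-tree), fetch the successor before running the body. A sketch of the same discipline, modeled on a singly linked list rather than the kernel rb-tree API:

	struct req {
		int tid;
		int linger;
		struct req *next;
	};

	/* stand-in for the re-registration work; would unlink r in real code */
	static void requeue_as_linger(struct req *r) { (void)r; }

	static void kick(struct req *head)
	{
		struct req *p = head;

		while (p) {
			struct req *next = p->next;	/* grab before p may be unlinked */

			if (p->linger)
				requeue_as_linger(p);	/* may detach p */
			p = next;			/* still valid */
		}
	}

	int main(void)
	{
		struct req c = { 3, 1, 0 }, b = { 2, 0, &c }, a = { 1, 1, &b };

		kick(&a);
		return 0;
	}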
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 81e3b84a77ef..3124b71a8883 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -135,6 +135,21 @@ bad:
 	return -EINVAL;
 }
 
+static int skip_name_map(void **p, void *end)
+{
+	int len;
+
+	ceph_decode_32_safe(p, end, len, bad);
+	while (len--) {
+		int strlen;
+
+		*p += sizeof(u32);
+		ceph_decode_32_safe(p, end, strlen, bad);
+		*p += strlen;
+	}
+	return 0;
+bad:
+	return -EINVAL;
+}
+
 static struct crush_map *crush_decode(void *pbyval, void *end)
 {
 	struct crush_map *c;
@@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	void **p = &pbyval;
 	void *start = pbyval;
 	u32 magic;
+	u32 num_name_maps;
 
 	dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
 
@@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	if (c == NULL)
 		return ERR_PTR(-ENOMEM);
 
+	/* set tunables to default values */
+	c->choose_local_tries = 2;
+	c->choose_local_fallback_tries = 5;
+	c->choose_total_tries = 19;
+
 	ceph_decode_need(p, end, 4*sizeof(u32), bad);
 	magic = ceph_decode_32(p);
 	if (magic != CRUSH_MAGIC) {
@@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
 	}
 
 	/* ignore trailing name maps. */
+	for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
+		err = skip_name_map(p, end);
+		if (err < 0)
+			goto done;
+	}
+
+	/* tunables */
+	ceph_decode_need(p, end, 3*sizeof(u32), done);
+	c->choose_local_tries = ceph_decode_32(p);
+	c->choose_local_fallback_tries = ceph_decode_32(p);
+	c->choose_total_tries = ceph_decode_32(p);
+	dout("crush decode tunable choose_local_tries = %d\n",
+	     c->choose_local_tries);
+	dout("crush decode tunable choose_local_fallback_tries = %d\n",
+	     c->choose_local_fallback_tries);
+	dout("crush decode tunable choose_total_tries = %d\n",
+	     c->choose_total_tries);
 
+done:
 	dout("crush_decode success\n");
 	return c;
 
@@ -488,15 +527,16 @@ static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map)
 		ceph_decode_32_safe(p, end, pool, bad);
 		ceph_decode_32_safe(p, end, len, bad);
 		dout("  pool %d len %d\n", pool, len);
+		ceph_decode_need(p, end, len, bad);
 		pi = __lookup_pg_pool(&map->pg_pools, pool);
 		if (pi) {
+			char *name = kstrndup(*p, len, GFP_NOFS);
+
+			if (!name)
+				return -ENOMEM;
 			kfree(pi->name);
-			pi->name = kmalloc(len + 1, GFP_NOFS);
-			if (pi->name) {
-				memcpy(pi->name, *p, len);
-				pi->name[len] = '\0';
-				dout("  name is %s\n", pi->name);
-			}
+			pi->name = name;
+			dout("  name is %s\n", pi->name);
 		}
 		*p += len;
 	}
@@ -666,6 +706,9 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
 		ceph_decode_copy(p, &pgid, sizeof(pgid));
 		n = ceph_decode_32(p);
+		err = -EINVAL;
+		if (n > (UINT_MAX - sizeof(*pg)) / sizeof(u32))
+			goto bad;
 		ceph_decode_need(p, end, n * sizeof(u32), bad);
 		err = -ENOMEM;
 		pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
@@ -889,6 +932,10 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		(void) __remove_pg_mapping(&map->pg_temp, pgid);
 
 		/* insert */
+		if (pglen > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) {
+			err = -EINVAL;
+			goto bad;
+		}
 		pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
 		if (!pg) {
 			err = -ENOMEM;
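
The two new bounds checks in osdmap.c guard the classic kmalloc(header + n * element) integer overflow: an attacker-controlled n can wrap the size computation and produce a too-small allocation. The same guard in isolation, as a standalone sketch:

	#include <limits.h>
	#include <stdint.h>
	#include <stdlib.h>

	struct pg_mapping {
		uint64_t pgid;
		uint32_t len;
		uint32_t osds[];	/* n trailing u32 entries */
	};

	static struct pg_mapping *pg_alloc(uint32_t n)
	{
		/* reject any n that would wrap the size arithmetic */
		if (n > (UINT_MAX - sizeof(struct pg_mapping)) / sizeof(uint32_t))
			return NULL;	/* the kernel version returns -EINVAL */

		return malloc(sizeof(struct pg_mapping) + n * sizeof(uint32_t));
	}

	int main(void)
	{
		struct pg_mapping *pg = pg_alloc(4);	/* small n succeeds */

		free(pg);
		return 0;
	}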
diff --git a/net/core/dev.c b/net/core/dev.c
index 0ebaea16632f..83988362805e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1055,6 +1055,8 @@ rollback:
  */
 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 {
+	char *new_ifalias;
+
 	ASSERT_RTNL();
 
 	if (len >= IFALIASZ)
@@ -1068,9 +1070,10 @@ int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 		return 0;
 	}
 
-	dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
-	if (!dev->ifalias)
+	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
+	if (!new_ifalias)
 		return -ENOMEM;
+	dev->ifalias = new_ifalias;
 
 	strlcpy(dev->ifalias, alias, len+1);
 	return len;
@@ -1172,6 +1175,7 @@ static int __dev_open(struct net_device *dev)
 		net_dmaengine_get();
 		dev_set_rx_mode(dev);
 		dev_activate(dev);
+		add_device_randomness(dev->dev_addr, dev->addr_len);
 	}
 
 	return ret;
@@ -1638,6 +1642,19 @@ static inline int deliver_skb(struct sk_buff *skb,
 	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 }
 
+static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
+{
+	if (ptype->af_packet_priv == NULL)
+		return false;
+
+	if (ptype->id_match)
+		return ptype->id_match(ptype, skb->sk);
+	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
+		return true;
+
+	return false;
+}
+
 /*
  *	Support routine. Sends outgoing frames to any network
  *	taps currently in use.
@@ -1655,8 +1672,7 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 		 * they originated from - MvS (miquels@drinkel.ow.org)
 		 */
 		if ((ptype->dev == dev || !ptype->dev) &&
-		    (ptype->af_packet_priv == NULL ||
-		     (struct sock *)ptype->af_packet_priv != skb->sk)) {
+		    (!skb_loop_sk(ptype, skb))) {
 			if (pt_prev) {
 				deliver_skb(skb2, pt_prev, skb->dev);
 				pt_prev = ptype;
@@ -2133,6 +2149,9 @@ netdev_features_t netif_skb_features(struct sk_buff *skb)
 	__be16 protocol = skb->protocol;
 	netdev_features_t features = skb->dev->features;
 
+	if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
+		features &= ~NETIF_F_GSO_MASK;
+
 	if (protocol == htons(ETH_P_8021Q)) {
 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
 		protocol = veh->h_vlan_encapsulated_proto;
@@ -3155,6 +3174,23 @@ void netdev_rx_handler_unregister(struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
 
+/*
+ * Limit the use of PFMEMALLOC reserves to those protocols that implement
+ * the special handling of PFMEMALLOC skbs.
+ */
+static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
+{
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_ARP):
+	case __constant_htons(ETH_P_IP):
+	case __constant_htons(ETH_P_IPV6):
+	case __constant_htons(ETH_P_8021Q):
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int __netif_receive_skb(struct sk_buff *skb)
 {
 	struct packet_type *ptype, *pt_prev;
@@ -3164,14 +3200,27 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	bool deliver_exact = false;
 	int ret = NET_RX_DROP;
 	__be16 type;
+	unsigned long pflags = current->flags;
 
 	net_timestamp_check(!netdev_tstamp_prequeue, skb);
 
 	trace_netif_receive_skb(skb);
 
+	/*
+	 * PFMEMALLOC skbs are special, they should
+	 * - be delivered to SOCK_MEMALLOC sockets only
+	 * - stay away from userspace
+	 * - have bounded memory usage
+	 *
+	 * Use PF_MEMALLOC as this saves us from propagating the allocation
+	 * context down to all allocation sites.
	 */
+	if (sk_memalloc_socks() && skb_pfmemalloc(skb))
+		current->flags |= PF_MEMALLOC;
+
 	/* if we've gotten here through NAPI, check netpoll */
 	if (netpoll_receive_skb(skb))
-		return NET_RX_DROP;
+		goto out;
 
 	orig_dev = skb->dev;
 
@@ -3191,7 +3240,7 @@ another_round:
 	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
 		skb = vlan_untag(skb);
 		if (unlikely(!skb))
-			goto out;
+			goto unlock;
 	}
 
 #ifdef CONFIG_NET_CLS_ACT
@@ -3201,6 +3250,9 @@ another_round:
 	}
 #endif
 
+	if (sk_memalloc_socks() && skb_pfmemalloc(skb))
+		goto skip_taps;
+
 	list_for_each_entry_rcu(ptype, &ptype_all, list) {
 		if (!ptype->dev || ptype->dev == skb->dev) {
 			if (pt_prev)
@@ -3209,13 +3261,18 @@ another_round:
 		}
 	}
 
+skip_taps:
 #ifdef CONFIG_NET_CLS_ACT
 	skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
 	if (!skb)
-		goto out;
+		goto unlock;
 ncls:
 #endif
 
+	if (sk_memalloc_socks() && skb_pfmemalloc(skb)
+				&& !skb_pfmemalloc_protocol(skb))
+		goto drop;
+
 	rx_handler = rcu_dereference(skb->dev->rx_handler);
 	if (vlan_tx_tag_present(skb)) {
 		if (pt_prev) {
@@ -3225,7 +3282,7 @@ ncls:
 		if (vlan_do_receive(&skb, !rx_handler))
 			goto another_round;
 		else if (unlikely(!skb))
-			goto out;
+			goto unlock;
 	}
 
 	if (rx_handler) {
@@ -3235,7 +3292,7 @@ ncls:
 	}
 	switch (rx_handler(&skb)) {
 	case RX_HANDLER_CONSUMED:
-		goto out;
+		goto unlock;
 	case RX_HANDLER_ANOTHER:
 		goto another_round;
 	case RX_HANDLER_EXACT:
@@ -3268,6 +3325,7 @@ ncls:
 		else
 			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
+drop:
 		atomic_long_inc(&skb->dev->rx_dropped);
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
@@ -3276,8 +3334,10 @@ ncls:
 		ret = NET_RX_DROP;
 	}
 
-out:
+unlock:
 	rcu_read_unlock();
+out:
+	tsk_restore_flags(current, pflags, PF_MEMALLOC);
 	return ret;
 }
 
@@ -4801,6 +4861,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
 	err = ops->ndo_set_mac_address(dev, sa);
 	if (!err)
 		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+	add_device_randomness(dev->dev_addr, dev->addr_len);
 	return err;
 }
 EXPORT_SYMBOL(dev_set_mac_address);
@@ -5579,6 +5640,7 @@ int register_netdevice(struct net_device *dev)
 	dev_init_scheduler(dev);
 	dev_hold(dev);
 	list_netdevice(dev);
+	add_device_randomness(dev->dev_addr, dev->addr_len);
 
 	/* Notify protocols, that a new device appeared. */
 	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
@@ -5682,6 +5744,7 @@ EXPORT_SYMBOL(netdev_refcnt_read);
 
 /**
  * netdev_wait_allrefs - wait until all references are gone.
+ * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
@@ -5942,6 +6005,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	dev_net_set(dev, &init_net);
 
 	dev->gso_max_size = GSO_MAX_SIZE;
+	dev->gso_max_segs = GSO_MAX_SEGS;
 
 	INIT_LIST_HEAD(&dev->napi_list);
 	INIT_LIST_HEAD(&dev->unreg_list);
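
The __netif_receive_skb() rework brackets the whole receive path with a save/modify/restore of current->flags so pfmemalloc packets can allocate from reserves without propagating a gfp context everywhere. A sketch of that save/restore discipline on a plain flags word (tsk_restore_flags() performs the same masking in the kernel):

	#include <stdio.h>

	#define PF_MEMALLOC 0x0800UL

	static unsigned long task_flags;	/* stands in for current->flags */

	static void receive_one(int pfmemalloc_skb)
	{
		unsigned long pflags = task_flags;	/* save */

		if (pfmemalloc_skb)
			task_flags |= PF_MEMALLOC;	/* may already be set */

		/* ... deliver the packet; every allocation sees the flag ... */

		/* restore only the bit we may have touched, keep the rest */
		task_flags = (task_flags & ~PF_MEMALLOC) | (pflags & PF_MEMALLOC);
	}

	int main(void)
	{
		receive_one(1);
		printf("PF_MEMALLOC restored: %d\n", !(task_flags & PF_MEMALLOC));
		return 0;
	}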
diff --git a/net/core/dst.c b/net/core/dst.c
index 069d51d29414..56d63612e1e4 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -149,7 +149,15 @@ int dst_discard(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(dst_discard);
 
-const u32 dst_default_metrics[RTAX_MAX];
+const u32 dst_default_metrics[RTAX_MAX + 1] = {
+	/* This initializer is needed to force linker to place this variable
+	 * into const section. Otherwise it might end into bss section.
+	 * We really want to avoid false sharing on this variable, and catch
+	 * any writes on it.
+	 */
+	[RTAX_MAX] = 0xdeadbeef,
+};
+
 
 void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
 		int initial_ref, int initial_obsolete, unsigned short flags)
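
The dst.c change leans on a C/ELF subtlety spelled out in the new comment: a const array with no initializer is a tentative definition and may be placed in writable .bss, while giving it any nonzero element pins it in the read-only section, so stray writes fault instead of silently corrupting the defaults. A two-line sketch of the difference (assuming typical GCC/ELF behavior):

	const unsigned int maybe_bss[16];		/* tentative: may land in .bss */
	const unsigned int pinned[16 + 1] = {		/* forced into .rodata */
		[16] = 0xdeadbeefU,			/* sentinel slot, never read */
	};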
diff --git a/net/core/filter.c b/net/core/filter.c
index d4ce2dc712e3..907efd27ec77 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -83,6 +83,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
 	int err;
 	struct sk_filter *filter;
 
+	/*
+	 * If the skb was allocated from pfmemalloc reserves, only
+	 * allow SOCK_MEMALLOC sockets to use it as this socket is
+	 * helping free memory
+	 */
+	if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC))
+		return -ENOMEM;
+
 	err = security_sock_rcv_skb(sk, skb);
 	if (err)
 		return err;
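
sk_filter() now refuses to hand reserve-backed packets to ordinary sockets, since those packets exist only to help a SOCK_MEMALLOC socket make forward progress. The predicate reduces to one comparison; a hedged standalone sketch:

	#include <stdbool.h>
	#include <errno.h>

	struct pkt  { bool pfmemalloc; };	/* skb_pfmemalloc() analog */
	struct sock { bool memalloc;   };	/* SOCK_MEMALLOC analog */

	/* 0 = keep processing, -ENOMEM = drop before running the filter */
	static int sk_filter_guard(const struct sock *sk, const struct pkt *p)
	{
		if (p->pfmemalloc && !sk->memalloc)
			return -ENOMEM;
		return 0;
	}

	int main(void)
	{
		struct pkt p = { true };
		struct sock s = { false };

		return sk_filter_guard(&s, &p) ? 0 : 1;	/* guard trips: drop */
	}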
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index b4c90e42b443..e4ba3e70c174 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -26,6 +26,7 @@
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/if_vlan.h>
 #include <net/tcp.h>
 #include <net/udp.h>
 #include <asm/unaligned.h>
@@ -54,7 +55,7 @@ static atomic_t trapped;
 	 MAX_UDP_CHUNK)
 
 static void zap_completion_queue(void);
-static void arp_reply(struct sk_buff *skb);
+static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo);
 
 static unsigned int carrier_timeout = 4;
 module_param(carrier_timeout, uint, 0644);
@@ -170,7 +171,8 @@ static void poll_napi(struct net_device *dev)
 	list_for_each_entry(napi, &dev->napi_list, dev_list) {
 		if (napi->poll_owner != smp_processor_id() &&
 		    spin_trylock(&napi->poll_lock)) {
-			budget = poll_one_napi(dev->npinfo, napi, budget);
+			budget = poll_one_napi(rcu_dereference_bh(dev->npinfo),
+					       napi, budget);
 			spin_unlock(&napi->poll_lock);
 
 			if (!budget)
@@ -185,13 +187,14 @@ static void service_arp_queue(struct netpoll_info *npi)
 		struct sk_buff *skb;
 
 		while ((skb = skb_dequeue(&npi->arp_tx)))
-			arp_reply(skb);
+			netpoll_arp_reply(skb, npi);
 	}
 }
 
 static void netpoll_poll_dev(struct net_device *dev)
 {
 	const struct net_device_ops *ops;
+	struct netpoll_info *ni = rcu_dereference_bh(dev->npinfo);
 
 	if (!dev || !netif_running(dev))
 		return;
@@ -206,17 +209,18 @@ static void netpoll_poll_dev(struct net_device *dev)
 	poll_napi(dev);
 
 	if (dev->flags & IFF_SLAVE) {
-		if (dev->npinfo) {
+		if (ni) {
 			struct net_device *bond_dev = dev->master;
 			struct sk_buff *skb;
-			while ((skb = skb_dequeue(&dev->npinfo->arp_tx))) {
+			struct netpoll_info *bond_ni = rcu_dereference_bh(bond_dev->npinfo);
+			while ((skb = skb_dequeue(&ni->arp_tx))) {
 				skb->dev = bond_dev;
-				skb_queue_tail(&bond_dev->npinfo->arp_tx, skb);
+				skb_queue_tail(&bond_ni->arp_tx, skb);
 			}
 		}
 	}
 
-	service_arp_queue(dev->npinfo);
+	service_arp_queue(ni);
 
 	zap_completion_queue();
 }
@@ -302,6 +306,7 @@ static int netpoll_owner_active(struct net_device *dev)
 	return 0;
 }
 
+/* call with IRQ disabled */
 void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 			     struct net_device *dev)
 {
@@ -309,8 +314,11 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 	unsigned long tries;
 	const struct net_device_ops *ops = dev->netdev_ops;
 	/* It is up to the caller to keep npinfo alive. */
-	struct netpoll_info *npinfo = np->dev->npinfo;
+	struct netpoll_info *npinfo;
+
+	WARN_ON_ONCE(!irqs_disabled());
 
+	npinfo = rcu_dereference_bh(np->dev->npinfo);
 	if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
 		__kfree_skb(skb);
 		return;
@@ -319,16 +327,22 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 	/* don't get messages out of order, and no recursion */
 	if (skb_queue_len(&npinfo->txq) == 0 && !netpoll_owner_active(dev)) {
 		struct netdev_queue *txq;
-		unsigned long flags;
 
 		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
 
-		local_irq_save(flags);
 		/* try until next clock tick */
 		for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
 		     tries > 0; --tries) {
 			if (__netif_tx_trylock(txq)) {
 				if (!netif_xmit_stopped(txq)) {
+					if (vlan_tx_tag_present(skb) &&
+					    !(netif_skb_features(skb) & NETIF_F_HW_VLAN_TX)) {
+						skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
+						if (unlikely(!skb))
+							break;
+						skb->vlan_tci = 0;
+					}
+
 					status = ops->ndo_start_xmit(skb, dev);
 					if (status == NETDEV_TX_OK)
 						txq_trans_update(txq);
@@ -347,10 +361,9 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
 		}
 
 		WARN_ONCE(!irqs_disabled(),
-			  "netpoll_send_skb(): %s enabled interrupts in poll (%pF)\n",
+			  "netpoll_send_skb_on_dev(): %s enabled interrupts in poll (%pF)\n",
 			  dev->name, ops->ndo_start_xmit);
 
-		local_irq_restore(flags);
 	}
 
 	if (status != NETDEV_TX_OK) {
@@ -423,9 +436,8 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
 }
 EXPORT_SYMBOL(netpoll_send_udp);
 
-static void arp_reply(struct sk_buff *skb)
+static void netpoll_arp_reply(struct sk_buff *skb, struct netpoll_info *npinfo)
 {
-	struct netpoll_info *npinfo = skb->dev->npinfo;
 	struct arphdr *arp;
 	unsigned char *arp_ptr;
 	int size, type = ARPOP_REPLY, ptype = ETH_P_ARP;
@@ -543,13 +555,12 @@ static void arp_reply(struct sk_buff *skb)
 	spin_unlock_irqrestore(&npinfo->rx_lock, flags);
 }
 
-int __netpoll_rx(struct sk_buff *skb)
+int __netpoll_rx(struct sk_buff *skb, struct netpoll_info *npinfo)
 {
 	int proto, len, ulen;
 	int hits = 0;
 	const struct iphdr *iph;
 	struct udphdr *uh;
-	struct netpoll_info *npinfo = skb->dev->npinfo;
 	struct netpoll *np, *tmp;
 
 	if (list_empty(&npinfo->rx_np))
@@ -565,6 +576,12 @@ int __netpoll_rx(struct sk_buff *skb)
 		return 1;
 	}
 
+	if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
+		skb = vlan_untag(skb);
+		if (unlikely(!skb))
+			goto out;
+	}
+
 	proto = ntohs(eth_hdr(skb)->h_proto);
 	if (proto != ETH_P_IP)
 		goto out;
@@ -715,7 +732,7 @@ int netpoll_parse_options(struct netpoll *np, char *opt)
 }
 EXPORT_SYMBOL(netpoll_parse_options);
 
-int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
+int __netpoll_setup(struct netpoll *np, struct net_device *ndev, gfp_t gfp)
 {
 	struct netpoll_info *npinfo;
 	const struct net_device_ops *ops;
@@ -734,7 +751,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
 	}
 
 	if (!ndev->npinfo) {
-		npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
+		npinfo = kmalloc(sizeof(*npinfo), gfp);
 		if (!npinfo) {
 			err = -ENOMEM;
 			goto out;
@@ -752,7 +769,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
 
 	ops = np->dev->netdev_ops;
 	if (ops->ndo_netpoll_setup) {
-		err = ops->ndo_netpoll_setup(ndev, npinfo);
+		err = ops->ndo_netpoll_setup(ndev, npinfo, gfp);
 		if (err)
 			goto free_npinfo;
 	}
@@ -857,7 +874,7 @@ int netpoll_setup(struct netpoll *np)
 	refill_skbs();
 
 	rtnl_lock();
-	err = __netpoll_setup(np, ndev);
+	err = __netpoll_setup(np, ndev, GFP_KERNEL);
 	rtnl_unlock();
 
 	if (err)
@@ -878,6 +895,24 @@ static int __init netpoll_init(void)
 }
 core_initcall(netpoll_init);
 
+static void rcu_cleanup_netpoll_info(struct rcu_head *rcu_head)
+{
+	struct netpoll_info *npinfo =
+			container_of(rcu_head, struct netpoll_info, rcu);
+
+	skb_queue_purge(&npinfo->arp_tx);
+	skb_queue_purge(&npinfo->txq);
+
+	/* we can't call cancel_delayed_work_sync here, as we are in softirq */
+	cancel_delayed_work(&npinfo->tx_work);
+
+	/* clean after last, unfinished work */
+	__skb_queue_purge(&npinfo->txq);
+	/* now cancel it again */
+	cancel_delayed_work(&npinfo->tx_work);
+	kfree(npinfo);
+}
+
 void __netpoll_cleanup(struct netpoll *np)
 {
 	struct netpoll_info *npinfo;
@@ -903,20 +938,24 @@ void __netpoll_cleanup(struct netpoll *np)
 			ops->ndo_netpoll_cleanup(np->dev);
 
 		RCU_INIT_POINTER(np->dev->npinfo, NULL);
+		call_rcu_bh(&npinfo->rcu, rcu_cleanup_netpoll_info);
+	}
+}
+EXPORT_SYMBOL_GPL(__netpoll_cleanup);
 
-		/* avoid racing with NAPI reading npinfo */
-		synchronize_rcu_bh();
+static void rcu_cleanup_netpoll(struct rcu_head *rcu_head)
+{
+	struct netpoll *np = container_of(rcu_head, struct netpoll, rcu);
 
-		skb_queue_purge(&npinfo->arp_tx);
-		skb_queue_purge(&npinfo->txq);
-		cancel_delayed_work_sync(&npinfo->tx_work);
+	__netpoll_cleanup(np);
+	kfree(np);
+}
 
-		/* clean after last, unfinished work */
-		__skb_queue_purge(&npinfo->txq);
-		kfree(npinfo);
-	}
+void __netpoll_free_rcu(struct netpoll *np)
+{
+	call_rcu_bh(&np->rcu, rcu_cleanup_netpoll);
 }
-EXPORT_SYMBOL_GPL(__netpoll_cleanup);
+EXPORT_SYMBOL_GPL(__netpoll_free_rcu);
 
 void netpoll_cleanup(struct netpoll *np)
 {
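
The netpoll cleanup now frees npinfo from an RCU callback instead of blocking in __netpoll_cleanup(). The shape of such a callback is always the same: container_of() from the embedded rcu_head back to the enclosing object. A toy userspace sketch; the toy_call_rcu stand-in fires immediately instead of waiting out a grace period, which a real implementation must do:

	#include <stddef.h>
	#include <stdlib.h>

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct rcu_head {
		struct rcu_head *next;
		void (*func)(struct rcu_head *head);
	};

	struct npinfo {
		struct rcu_head rcu;
		/* queues, deferred work, ... */
	};

	static void npinfo_free_rcu(struct rcu_head *head)
	{
		struct npinfo *ni = container_of(head, struct npinfo, rcu);

		free(ni);	/* safe: all readers are gone by now */
	}

	/* toy stand-in for call_rcu_bh(): no grace period, runs at once */
	static void toy_call_rcu(struct rcu_head *head,
				 void (*func)(struct rcu_head *head))
	{
		head->func = func;
		head->func(head);
	}

	int main(void)
	{
		struct npinfo *ni = malloc(sizeof(*ni));

		if (ni)
			toy_call_rcu(&ni->rcu, npinfo_free_rcu);
		return 0;
	}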
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index ed0c0431fcd8..c75e3f9d060f 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -101,12 +101,10 @@ static int write_update_netdev_table(struct net_device *dev)
 	u32 max_len;
 	struct netprio_map *map;
 
-	rtnl_lock();
 	max_len = atomic_read(&max_prioidx) + 1;
 	map = rtnl_dereference(dev->priomap);
 	if (!map || map->priomap_len < max_len)
 		ret = extend_netdev_table(dev, max_len);
-	rtnl_unlock();
 
 	return ret;
 }
@@ -256,17 +254,17 @@ static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
 	if (!dev)
 		goto out_free_devname;
 
+	rtnl_lock();
 	ret = write_update_netdev_table(dev);
 	if (ret < 0)
 		goto out_put_dev;
 
-	rcu_read_lock();
-	map = rcu_dereference(dev->priomap);
+	map = rtnl_dereference(dev->priomap);
 	if (map)
 		map->priomap[prioidx] = priority;
-	rcu_read_unlock();
 
 out_put_dev:
+	rtnl_unlock();
 	dev_put(dev);
 
 out_free_devname:
@@ -277,12 +275,6 @@ out_free_devname:
 void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
 	struct task_struct *p;
-	char *tmp = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
-
-	if (!tmp) {
-		pr_warn("Unable to attach cgrp due to alloc failure!\n");
-		return;
-	}
 
 	cgroup_taskset_for_each(p, cgrp, tset) {
 		unsigned int fd;
@@ -296,32 +288,24 @@ void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 			continue;
 		}
 
-		rcu_read_lock();
+		spin_lock(&files->file_lock);
 		fdt = files_fdtable(files);
 		for (fd = 0; fd < fdt->max_fds; fd++) {
-			char *path;
 			struct file *file;
 			struct socket *sock;
-			unsigned long s;
-			int rv, err = 0;
+			int err;
 
 			file = fcheck_files(files, fd);
 			if (!file)
 				continue;
 
-			path = d_path(&file->f_path, tmp, PAGE_SIZE);
-			rv = sscanf(path, "socket:[%lu]", &s);
-			if (rv <= 0)
-				continue;
-
 			sock = sock_from_file(file, &err);
-			if (!err)
+			if (sock)
 				sock_update_netprioidx(sock->sk, p);
 		}
-		rcu_read_unlock();
+		spin_unlock(&files->file_lock);
 		task_unlock(p);
 	}
-	kfree(tmp);
 }
 
 static struct cftype ss_files[] = {
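
The rewritten attach path drops the d_path()/sscanf "socket:[%lu]" trick and asks the file directly via sock_from_file(). Userspace code that needs the same answer can ask the kernel the same way: fstat() plus S_ISSOCK() beats parsing /proc/self/fd link text. A small sketch:

	#include <stdbool.h>
	#include <stdio.h>
	#include <sys/stat.h>

	static bool fd_is_socket(int fd)
	{
		struct stat st;

		/* type is in st_mode; no string parsing required */
		return fstat(fd, &st) == 0 && S_ISSOCK(st.st_mode);
	}

	int main(void)
	{
		printf("stdin is a socket: %d\n", fd_is_socket(0));
		return 0;
	}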
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 334b930e0de3..2c5a0a06c4ce 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -625,9 +625,13 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
 		.rta_id =  id,
 	};
 
-	if (expires)
-		ci.rta_expires = jiffies_to_clock_t(expires);
+	if (expires) {
+		unsigned long clock;
 
+		clock = jiffies_to_clock_t(abs(expires));
+		clock = min_t(unsigned long, clock, INT_MAX);
+		ci.rta_expires = (expires > 0) ? clock : -clock;
+	}
 	return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci);
 }
 EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo);
@@ -659,6 +663,12 @@ static void set_operstate(struct net_device *dev, unsigned char transition)
 	}
 }
 
+static unsigned int rtnl_dev_get_flags(const struct net_device *dev)
+{
+	return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) |
+	       (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI));
+}
+
 static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
 					   const struct ifinfomsg *ifm)
 {
@@ -667,7 +677,7 @@ static unsigned int rtnl_dev_combine_flags(const struct net_device *dev,
 	/* bugwards compatibility: ifi_change == 0 is treated as ~0 */
 	if (ifm->ifi_change)
 		flags = (flags & ifm->ifi_change) |
-			(dev->flags & ~ifm->ifi_change);
+			(rtnl_dev_get_flags(dev) & ~ifm->ifi_change);
 
 	return flags;
 }
@@ -1371,6 +1381,7 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 			goto errout;
 		send_addr_notify = 1;
 		modified = 1;
+		add_device_randomness(dev->dev_addr, dev->addr_len);
 	}
 
 	if (tb[IFLA_MTU]) {
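
rtnl_put_cacheinfo() now clamps the converted expiry to INT_MAX and preserves its sign, so a large jiffies value can no longer wrap the 32-bit netlink field. The arithmetic in isolation, with jiffies_to_clock_t() stubbed 1:1 for the sketch:

	#include <limits.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	static long jiffies_to_clock_t(long j) { return j; }	/* stub */

	static int32_t encode_expires(long expires)
	{
		unsigned long clock = jiffies_to_clock_t(labs(expires));

		if (clock > INT_MAX)	/* min_t(unsigned long, clock, INT_MAX) */
			clock = INT_MAX;
		return (expires > 0) ? (int32_t)clock : -(int32_t)clock;
	}

	int main(void)
	{
		/* huge negative input clamps to -INT_MAX instead of wrapping */
		printf("%d %d\n", encode_expires(5L), encode_expires(-LONG_MAX));
		return 0;
	}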
diff --git a/net/core/scm.c b/net/core/scm.c
index 8f6ccfd68ef4..040cebeed45b 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -265,6 +265,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
 	for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i<fdmax;
 	     i++, cmfptr++)
 	{
+		struct socket *sock;
 		int new_fd;
 		err = security_file_receive(fp[i]);
 		if (err)
@@ -281,6 +282,9 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
 		}
 		/* Bump the usage count and install the file. */
 		get_file(fp[i]);
+		sock = sock_from_file(fp[i], &err);
+		if (sock)
+			sock_update_netprioidx(sock->sk, current);
 		fd_install(new_fd, fp[i]);
 	}
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 368f65c15e4f..fe00d1208167 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -145,6 +145,43 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 	BUG();
 }
 
+
+/*
+ * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
+ * the caller if emergency pfmemalloc reserves are being used. If it is and
+ * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves
+ * may be used. Otherwise, the packet data may be discarded until enough
+ * memory is free
+ */
+#define kmalloc_reserve(size, gfp, node, pfmemalloc) \
+	__kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc)
+void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip,
+			bool *pfmemalloc)
+{
+	void *obj;
+	bool ret_pfmemalloc = false;
+
+	/*
+	 * Try a regular allocation, when that fails and we're not entitled
+	 * to the reserves, fail.
+	 */
+	obj = kmalloc_node_track_caller(size,
+					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+					node);
+	if (obj || !(gfp_pfmemalloc_allowed(flags)))
+		goto out;
+
+	/* Try again but now we are using pfmemalloc reserves */
+	ret_pfmemalloc = true;
+	obj = kmalloc_node_track_caller(size, flags, node);
+
+out:
+	if (pfmemalloc)
+		*pfmemalloc = ret_pfmemalloc;
+
+	return obj;
+}
+
 /*	Allocate a new skbuff. We do this ourselves so we can fill in a few
 *	'private' fields and also do memory statistics to find all the
 *	[BEEP] leaks.
@@ -155,8 +192,10 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 *	__alloc_skb	-	allocate a network buffer
 *	@size: size to allocate
 *	@gfp_mask: allocation mask
- *	@fclone: allocate from fclone cache instead of head cache
- *		and allocate a cloned (child) skb
+ *	@flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache
+ *		instead of head cache and allocate a cloned (child) skb.
+ *		If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
+ *		allocations in case the data is required for writeback
 *	@node: numa node to allocate memory on
 *
 *	Allocate a new &sk_buff. The returned buffer has no headroom and a
@@ -167,14 +206,19 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here)
 *	%GFP_ATOMIC.
 */
 struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
-			    int fclone, int node)
+			    int flags, int node)
 {
 	struct kmem_cache *cache;
 	struct skb_shared_info *shinfo;
 	struct sk_buff *skb;
 	u8 *data;
+	bool pfmemalloc;
 
-	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+	cache = (flags & SKB_ALLOC_FCLONE)
+		? skbuff_fclone_cache : skbuff_head_cache;
+
+	if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
+		gfp_mask |= __GFP_MEMALLOC;
 
 	/* Get the HEAD */
 	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
@@ -189,7 +233,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	 */
 	size = SKB_DATA_ALIGN(size);
 	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	data = kmalloc_node_track_caller(size, gfp_mask, node);
+	data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc);
 	if (!data)
 		goto nodata;
 	/* kmalloc(size) might give us more room than requested.
@@ -207,6 +251,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	memset(skb, 0, offsetof(struct sk_buff, tail));
 	/* Account for allocated memory : skb + skb->head */
209 skb->truesize = SKB_TRUESIZE(size); 253 skb->truesize = SKB_TRUESIZE(size);
254 skb->pfmemalloc = pfmemalloc;
210 atomic_set(&skb->users, 1); 255 atomic_set(&skb->users, 1);
211 skb->head = data; 256 skb->head = data;
212 skb->data = data; 257 skb->data = data;
@@ -222,7 +267,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
222 atomic_set(&shinfo->dataref, 1); 267 atomic_set(&shinfo->dataref, 1);
223 kmemcheck_annotate_variable(shinfo->destructor_arg); 268 kmemcheck_annotate_variable(shinfo->destructor_arg);
224 269
225 if (fclone) { 270 if (flags & SKB_ALLOC_FCLONE) {
226 struct sk_buff *child = skb + 1; 271 struct sk_buff *child = skb + 1;
227 atomic_t *fclone_ref = (atomic_t *) (child + 1); 272 atomic_t *fclone_ref = (atomic_t *) (child + 1);
228 273
@@ -232,6 +277,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
232 atomic_set(fclone_ref, 1); 277 atomic_set(fclone_ref, 1);
233 278
234 child->fclone = SKB_FCLONE_UNAVAILABLE; 279 child->fclone = SKB_FCLONE_UNAVAILABLE;
280 child->pfmemalloc = pfmemalloc;
235 } 281 }
236out: 282out:
237 return skb; 283 return skb;
@@ -302,14 +348,7 @@ static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
302 348
303#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES) 349#define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES)
304 350
305/** 351static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
306 * netdev_alloc_frag - allocate a page fragment
307 * @fragsz: fragment size
308 *
309 * Allocates a frag from a page for receive buffer.
310 * Uses GFP_ATOMIC allocations.
311 */
312void *netdev_alloc_frag(unsigned int fragsz)
313{ 352{
314 struct netdev_alloc_cache *nc; 353 struct netdev_alloc_cache *nc;
315 void *data = NULL; 354 void *data = NULL;
@@ -319,7 +358,7 @@ void *netdev_alloc_frag(unsigned int fragsz)
319 nc = &__get_cpu_var(netdev_alloc_cache); 358 nc = &__get_cpu_var(netdev_alloc_cache);
320 if (unlikely(!nc->page)) { 359 if (unlikely(!nc->page)) {
321refill: 360refill:
322 nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD); 361 nc->page = alloc_page(gfp_mask);
323 if (unlikely(!nc->page)) 362 if (unlikely(!nc->page))
324 goto end; 363 goto end;
325recycle: 364recycle:
@@ -343,6 +382,18 @@ end:
343 local_irq_restore(flags); 382 local_irq_restore(flags);
344 return data; 383 return data;
345} 384}
385
386/**
387 * netdev_alloc_frag - allocate a page fragment
388 * @fragsz: fragment size
389 *
390 * Allocates a frag from a page for receive buffer.
391 * Uses GFP_ATOMIC allocations.
392 */
393void *netdev_alloc_frag(unsigned int fragsz)
394{
395 return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD);
396}
346EXPORT_SYMBOL(netdev_alloc_frag); 397EXPORT_SYMBOL(netdev_alloc_frag);
347 398
348/** 399/**
@@ -366,7 +417,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
366 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); 417 SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
367 418
368 if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { 419 if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
369 void *data = netdev_alloc_frag(fragsz); 420 void *data;
421
422 if (sk_memalloc_socks())
423 gfp_mask |= __GFP_MEMALLOC;
424
425 data = __netdev_alloc_frag(fragsz, gfp_mask);
370 426
371 if (likely(data)) { 427 if (likely(data)) {
372 skb = build_skb(data, fragsz); 428 skb = build_skb(data, fragsz);
@@ -374,7 +430,8 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
374 put_page(virt_to_head_page(data)); 430 put_page(virt_to_head_page(data));
375 } 431 }
376 } else { 432 } else {
377 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); 433 skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask,
434 SKB_ALLOC_RX, NUMA_NO_NODE);
378 } 435 }
379 if (likely(skb)) { 436 if (likely(skb)) {
380 skb_reserve(skb, NET_SKB_PAD); 437 skb_reserve(skb, NET_SKB_PAD);
@@ -656,6 +713,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
656#if IS_ENABLED(CONFIG_IP_VS) 713#if IS_ENABLED(CONFIG_IP_VS)
657 new->ipvs_property = old->ipvs_property; 714 new->ipvs_property = old->ipvs_property;
658#endif 715#endif
716 new->pfmemalloc = old->pfmemalloc;
659 new->protocol = old->protocol; 717 new->protocol = old->protocol;
660 new->mark = old->mark; 718 new->mark = old->mark;
661 new->skb_iif = old->skb_iif; 719 new->skb_iif = old->skb_iif;
@@ -814,6 +872,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
814 n->fclone = SKB_FCLONE_CLONE; 872 n->fclone = SKB_FCLONE_CLONE;
815 atomic_inc(fclone_ref); 873 atomic_inc(fclone_ref);
816 } else { 874 } else {
875 if (skb_pfmemalloc(skb))
876 gfp_mask |= __GFP_MEMALLOC;
877
817 n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); 878 n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
818 if (!n) 879 if (!n)
819 return NULL; 880 return NULL;
@@ -850,6 +911,13 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
850 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; 911 skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type;
851} 912}
852 913
914static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
915{
916 if (skb_pfmemalloc(skb))
917 return SKB_ALLOC_RX;
918 return 0;
919}
920
853/** 921/**
854 * skb_copy - create private copy of an sk_buff 922 * skb_copy - create private copy of an sk_buff
855 * @skb: buffer to copy 923 * @skb: buffer to copy
@@ -871,7 +939,8 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
871{ 939{
872 int headerlen = skb_headroom(skb); 940 int headerlen = skb_headroom(skb);
873 unsigned int size = skb_end_offset(skb) + skb->data_len; 941 unsigned int size = skb_end_offset(skb) + skb->data_len;
874 struct sk_buff *n = alloc_skb(size, gfp_mask); 942 struct sk_buff *n = __alloc_skb(size, gfp_mask,
943 skb_alloc_rx_flag(skb), NUMA_NO_NODE);
875 944
876 if (!n) 945 if (!n)
877 return NULL; 946 return NULL;
@@ -906,7 +975,8 @@ EXPORT_SYMBOL(skb_copy);
906struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) 975struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
907{ 976{
908 unsigned int size = skb_headlen(skb) + headroom; 977 unsigned int size = skb_headlen(skb) + headroom;
909 struct sk_buff *n = alloc_skb(size, gfp_mask); 978 struct sk_buff *n = __alloc_skb(size, gfp_mask,
979 skb_alloc_rx_flag(skb), NUMA_NO_NODE);
910 980
911 if (!n) 981 if (!n)
912 goto out; 982 goto out;
@@ -979,8 +1049,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
979 1049
980 size = SKB_DATA_ALIGN(size); 1050 size = SKB_DATA_ALIGN(size);
981 1051
982 data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), 1052 if (skb_pfmemalloc(skb))
983 gfp_mask); 1053 gfp_mask |= __GFP_MEMALLOC;
1054 data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
1055 gfp_mask, NUMA_NO_NODE, NULL);
984 if (!data) 1056 if (!data)
985 goto nodata; 1057 goto nodata;
986 size = SKB_WITH_OVERHEAD(ksize(data)); 1058 size = SKB_WITH_OVERHEAD(ksize(data));
@@ -1092,8 +1164,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
1092 /* 1164 /*
1093 * Allocate the copy buffer 1165 * Allocate the copy buffer
1094 */ 1166 */
1095 struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, 1167 struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
1096 gfp_mask); 1168 gfp_mask, skb_alloc_rx_flag(skb),
1169 NUMA_NO_NODE);
1097 int oldheadroom = skb_headroom(skb); 1170 int oldheadroom = skb_headroom(skb);
1098 int head_copy_len, head_copy_off; 1171 int head_copy_len, head_copy_off;
1099 int off; 1172 int off;
@@ -2775,8 +2848,9 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
2775 skb_release_head_state(nskb); 2848 skb_release_head_state(nskb);
2776 __skb_push(nskb, doffset); 2849 __skb_push(nskb, doffset);
2777 } else { 2850 } else {
2778 nskb = alloc_skb(hsize + doffset + headroom, 2851 nskb = __alloc_skb(hsize + doffset + headroom,
2779 GFP_ATOMIC); 2852 GFP_ATOMIC, skb_alloc_rx_flag(skb),
2853 NUMA_NO_NODE);
2780 2854
2781 if (unlikely(!nskb)) 2855 if (unlikely(!nskb))
2782 goto err; 2856 goto err;
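
[skbuff] The common thread in the skbuff.c hunks is tracking which buffers were satisfied from PFMEMALLOC reserves: __kmalloc_reserve() first tries an ordinary allocation with __GFP_NOMEMALLOC (so it can never touch reserves), retries with reserves only for entitled callers, and reports the outcome through *pfmemalloc; the bit then follows the skb through every clone/copy/expand path so a reserve-backed skb is never silently laundered into a normal one. A userspace analogue of the two-stage pattern; the pool helpers are hypothetical stand-ins, not kernel API:

/* Sketch: the try-ordinary-then-reserves pattern of __kmalloc_reserve(),
 * transplanted to userspace. */
#include <stdbool.h>
#include <stdlib.h>

static bool reserves_allowed(void)           /* stand-in for gfp_pfmemalloc_allowed() */
{
        return true;
}

static void *reserve_pool_alloc(size_t size) /* stand-in emergency pool */
{
        return malloc(size);
}

void *alloc_reserve(size_t size, bool *from_reserve)
{
        void *obj = malloc(size);   /* ordinary attempt: must not touch reserves */

        *from_reserve = false;
        if (obj || !reserves_allowed())
                return obj;

        obj = reserve_pool_alloc(size);  /* entitled caller: dip into reserves */
        if (obj)
                *from_reserve = true;    /* caller tags the buffer (skb->pfmemalloc) */
        return obj;
}
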
diff --git a/net/core/sock.c b/net/core/sock.c
index 2676a88f533e..8f67ced8d6a8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -142,7 +142,7 @@
142static DEFINE_MUTEX(proto_list_mutex); 142static DEFINE_MUTEX(proto_list_mutex);
143static LIST_HEAD(proto_list); 143static LIST_HEAD(proto_list);
144 144
145#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 145#ifdef CONFIG_MEMCG_KMEM
146int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 146int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
147{ 147{
148 struct proto *proto; 148 struct proto *proto;
@@ -271,6 +271,61 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
271int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 271int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
272EXPORT_SYMBOL(sysctl_optmem_max); 272EXPORT_SYMBOL(sysctl_optmem_max);
273 273
274struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
275EXPORT_SYMBOL_GPL(memalloc_socks);
276
277/**
278 * sk_set_memalloc - sets %SOCK_MEMALLOC
279 * @sk: socket to set it on
280 *
281 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
282 * It's the responsibility of the admin to adjust min_free_kbytes
283 * to meet the requirements
284 */
285void sk_set_memalloc(struct sock *sk)
286{
287 sock_set_flag(sk, SOCK_MEMALLOC);
288 sk->sk_allocation |= __GFP_MEMALLOC;
289 static_key_slow_inc(&memalloc_socks);
290}
291EXPORT_SYMBOL_GPL(sk_set_memalloc);
292
293void sk_clear_memalloc(struct sock *sk)
294{
295 sock_reset_flag(sk, SOCK_MEMALLOC);
296 sk->sk_allocation &= ~__GFP_MEMALLOC;
297 static_key_slow_dec(&memalloc_socks);
298
299 /*
300 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
301 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
302 * it has rmem allocations there is a risk that the user of the
303 * socket cannot make forward progress due to exceeding the rmem
304 * limits. By rights, sk_clear_memalloc() should only be called
305 * on sockets being torn down but warn and reset the accounting if
306 * that assumption breaks.
307 */
308 if (WARN_ON(sk->sk_forward_alloc))
309 sk_mem_reclaim(sk);
310}
311EXPORT_SYMBOL_GPL(sk_clear_memalloc);
312
313int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
314{
315 int ret;
316 unsigned long pflags = current->flags;
317
318 /* these should have been dropped before queueing */
319 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
320
321 current->flags |= PF_MEMALLOC;
322 ret = sk->sk_backlog_rcv(sk, skb);
323 tsk_restore_flags(current, pflags, PF_MEMALLOC);
324
325 return ret;
326}
327EXPORT_SYMBOL(__sk_backlog_rcv);
328
274#if defined(CONFIG_CGROUPS) 329#if defined(CONFIG_CGROUPS)
275#if !defined(CONFIG_NET_CLS_CGROUP) 330#if !defined(CONFIG_NET_CLS_CGROUP)
276int net_cls_subsys_id = -1; 331int net_cls_subsys_id = -1;
@@ -353,7 +408,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
353 if (err) 408 if (err)
354 return err; 409 return err;
355 410
356 if (!sk_rmem_schedule(sk, skb->truesize)) { 411 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
357 atomic_inc(&sk->sk_drops); 412 atomic_inc(&sk->sk_drops);
358 return -ENOBUFS; 413 return -ENOBUFS;
359 } 414 }
@@ -1403,6 +1458,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1403 } else { 1458 } else {
1404 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 1459 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1405 sk->sk_gso_max_size = dst->dev->gso_max_size; 1460 sk->sk_gso_max_size = dst->dev->gso_max_size;
1461 sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1406 } 1462 }
1407 } 1463 }
1408} 1464}
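
[sock] sk_set_memalloc()/sk_clear_memalloc() are the opt-in switch for the above: a protocol whose traffic must make progress during reclaim (swap over a network block device, for instance) tags its socket SOCK_MEMALLOC, and __sk_backlog_rcv() then processes its backlog with PF_MEMALLOC set. A hedged kernel-style sketch of the intended usage; storage_connect() is a hypothetical name, error handling is elided, and this will not build outside a kernel tree:

/* Sketch: how a swap-capable network storage driver opts its
 * transport socket into the reserves. */
static int storage_connect(struct socket *sock)
{
        /* ... kernel_connect() and friends ... */

        /* The receive path for this socket may now use PFMEMALLOC
         * reserves, so writeback to the remote store cannot deadlock
         * under memory pressure; per the comment above, the admin
         * must size min_free_kbytes accordingly. */
        sk_set_memalloc(sock->sk);
        return 0;
}
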
diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h
index 75c3582a7678..fb85d371a8de 100644
--- a/net/dccp/ccid.h
+++ b/net/dccp/ccid.h
@@ -246,7 +246,7 @@ static inline int ccid_hc_rx_getsockopt(struct ccid *ccid, struct sock *sk,
246 u32 __user *optval, int __user *optlen) 246 u32 __user *optval, int __user *optlen)
247{ 247{
248 int rc = -ENOPROTOOPT; 248 int rc = -ENOPROTOOPT;
249 if (ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL) 249 if (ccid != NULL && ccid->ccid_ops->ccid_hc_rx_getsockopt != NULL)
250 rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len, 250 rc = ccid->ccid_ops->ccid_hc_rx_getsockopt(sk, optname, len,
251 optval, optlen); 251 optval, optlen);
252 return rc; 252 return rc;
@@ -257,7 +257,7 @@ static inline int ccid_hc_tx_getsockopt(struct ccid *ccid, struct sock *sk,
257 u32 __user *optval, int __user *optlen) 257 u32 __user *optval, int __user *optlen)
258{ 258{
259 int rc = -ENOPROTOOPT; 259 int rc = -ENOPROTOOPT;
260 if (ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL) 260 if (ccid != NULL && ccid->ccid_ops->ccid_hc_tx_getsockopt != NULL)
261 rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len, 261 rc = ccid->ccid_ops->ccid_hc_tx_getsockopt(sk, optname, len,
262 optval, optlen); 262 optval, optlen);
263 return rc; 263 return rc;
diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c
index d65e98798eca..119c04317d48 100644
--- a/net/dccp/ccids/ccid3.c
+++ b/net/dccp/ccids/ccid3.c
@@ -535,6 +535,7 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len,
535 case DCCP_SOCKOPT_CCID_TX_INFO: 535 case DCCP_SOCKOPT_CCID_TX_INFO:
536 if (len < sizeof(tfrc)) 536 if (len < sizeof(tfrc))
537 return -EINVAL; 537 return -EINVAL;
538 memset(&tfrc, 0, sizeof(tfrc));
538 tfrc.tfrctx_x = hc->tx_x; 539 tfrc.tfrctx_x = hc->tx_x;
539 tfrc.tfrctx_x_recv = hc->tx_x_recv; 540 tfrc.tfrctx_x_recv = hc->tx_x_recv;
540 tfrc.tfrctx_x_calc = hc->tx_x_calc; 541 tfrc.tfrctx_x_calc = hc->tx_x_calc;
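
[dccp] The added memset() in ccid3_hc_tx_getsockopt() is an information-leak fix: tfrc lives on the kernel stack and is copied out to userspace, so compiler padding and any unassigned field would otherwise carry stale stack bytes. The same rule in a compilable userspace illustration:

/* Member assignment does not initialize padding, so a struct copied
 * out to an untrusted reader must be zeroed first -- exactly what
 * the added memset() guarantees. */
#include <stdint.h>
#include <string.h>

struct report {
        uint8_t  flag;   /* typically followed by 3 padding bytes */
        uint32_t value;
};

static void fill_report(struct report *r, uint32_t v)
{
        memset(r, 0, sizeof(*r));  /* scrub padding before any copy-out */
        r->flag  = 1;
        r->value = v;
}
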
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ae2ccf2890e4..15ca63ec604e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -49,7 +49,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
49obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o 49obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
50obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o 50obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
51obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o 51obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
52obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o 52obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
53obj-$(CONFIG_NETLABEL) += cipso_ipv4.o 53obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
54 54
55obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ 55obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index a0124eb7dbea..77e87aff419a 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -827,7 +827,7 @@ static int arp_process(struct sk_buff *skb)
827 } 827 }
828 828
829 if (arp->ar_op == htons(ARPOP_REQUEST) && 829 if (arp->ar_op == htons(ARPOP_REQUEST) &&
830 ip_route_input(skb, tip, sip, 0, dev) == 0) { 830 ip_route_input_noref(skb, tip, sip, 0, dev) == 0) {
831 831
832 rt = skb_rtable(skb); 832 rt = skb_rtable(skb);
833 addr_type = rt->rt_type; 833 addr_type = rt->rt_type;
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 8732cc7920ed..c43ae3fba792 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1046,6 +1046,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
1046 1046
1047 if (event == NETDEV_UNREGISTER) { 1047 if (event == NETDEV_UNREGISTER) {
1048 fib_disable_ip(dev, 2, -1); 1048 fib_disable_ip(dev, 2, -1);
1049 rt_flush_dev(dev);
1049 return NOTIFY_DONE; 1050 return NOTIFY_DONE;
1050 } 1051 }
1051 1052
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index e55171f184f9..da80dc14cc76 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -140,6 +140,21 @@ const struct fib_prop fib_props[RTN_MAX + 1] = {
140 }, 140 },
141}; 141};
142 142
143static void rt_fibinfo_free(struct rtable __rcu **rtp)
144{
145 struct rtable *rt = rcu_dereference_protected(*rtp, 1);
146
147 if (!rt)
148 return;
149
150 /* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
151 * because we waited an RCU grace period before calling
152 * free_fib_info_rcu()
153 */
154
155 dst_free(&rt->dst);
156}
157
143static void free_nh_exceptions(struct fib_nh *nh) 158static void free_nh_exceptions(struct fib_nh *nh)
144{ 159{
145 struct fnhe_hash_bucket *hash = nh->nh_exceptions; 160 struct fnhe_hash_bucket *hash = nh->nh_exceptions;
@@ -153,6 +168,9 @@ static void free_nh_exceptions(struct fib_nh *nh)
153 struct fib_nh_exception *next; 168 struct fib_nh_exception *next;
154 169
155 next = rcu_dereference_protected(fnhe->fnhe_next, 1); 170 next = rcu_dereference_protected(fnhe->fnhe_next, 1);
171
172 rt_fibinfo_free(&fnhe->fnhe_rth);
173
156 kfree(fnhe); 174 kfree(fnhe);
157 175
158 fnhe = next; 176 fnhe = next;
@@ -161,6 +179,23 @@ static void free_nh_exceptions(struct fib_nh *nh)
161 kfree(hash); 179 kfree(hash);
162} 180}
163 181
182static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
183{
184 int cpu;
185
186 if (!rtp)
187 return;
188
189 for_each_possible_cpu(cpu) {
190 struct rtable *rt;
191
192 rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
193 if (rt)
194 dst_free(&rt->dst);
195 }
196 free_percpu(rtp);
197}
198
164/* Release a nexthop info record */ 199/* Release a nexthop info record */
165static void free_fib_info_rcu(struct rcu_head *head) 200static void free_fib_info_rcu(struct rcu_head *head)
166{ 201{
@@ -171,10 +206,8 @@ static void free_fib_info_rcu(struct rcu_head *head)
171 dev_put(nexthop_nh->nh_dev); 206 dev_put(nexthop_nh->nh_dev);
172 if (nexthop_nh->nh_exceptions) 207 if (nexthop_nh->nh_exceptions)
173 free_nh_exceptions(nexthop_nh); 208 free_nh_exceptions(nexthop_nh);
174 if (nexthop_nh->nh_rth_output) 209 rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
175 dst_release(&nexthop_nh->nh_rth_output->dst); 210 rt_fibinfo_free(&nexthop_nh->nh_rth_input);
176 if (nexthop_nh->nh_rth_input)
177 dst_release(&nexthop_nh->nh_rth_input->dst);
178 } endfor_nexthops(fi); 211 } endfor_nexthops(fi);
179 212
180 release_net(fi->fib_net); 213 release_net(fi->fib_net);
@@ -804,6 +837,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
804 fi->fib_nhs = nhs; 837 fi->fib_nhs = nhs;
805 change_nexthops(fi) { 838 change_nexthops(fi) {
806 nexthop_nh->nh_parent = fi; 839 nexthop_nh->nh_parent = fi;
840 nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
807 } endfor_nexthops(fi) 841 } endfor_nexthops(fi)
808 842
809 if (cfg->fc_mx) { 843 if (cfg->fc_mx) {
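
[fib_semantics] fib_create_info() now gives every nexthop a per-CPU array of cached output routes (nh_pcpu_rth_output) instead of one shared pointer, and free_fib_info_rcu() drains each CPU's slot through rt_fibinfo_free_cpus(). Each CPU touches only its own slot on the fast path, removing cross-CPU cache-line bouncing on the cached dst. A hedged kernel-style sketch of the read side this enables; nh_cached_output() is a hypothetical helper name:

/* Each CPU dereferences only its own slot, under rcu_read_lock(). */
static struct rtable *nh_cached_output(struct fib_nh *nh)
{
        struct rtable __rcu **slot;

        if (!nh->nh_pcpu_rth_output)    /* the percpu allocation may have failed */
                return NULL;
        slot = (struct rtable __rcu **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
        return rcu_dereference(*slot);  /* caller holds rcu_read_lock() */
}
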
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 18cbc15b20d5..57bd978483e1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -159,7 +159,6 @@ struct trie {
159#endif 159#endif
160}; 160};
161 161
162static void put_child(struct trie *t, struct tnode *tn, int i, struct rt_trie_node *n);
163static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n, 162static void tnode_put_child_reorg(struct tnode *tn, int i, struct rt_trie_node *n,
164 int wasfull); 163 int wasfull);
165static struct rt_trie_node *resize(struct trie *t, struct tnode *tn); 164static struct rt_trie_node *resize(struct trie *t, struct tnode *tn);
@@ -368,7 +367,7 @@ static void __leaf_free_rcu(struct rcu_head *head)
368 367
369static inline void free_leaf(struct leaf *l) 368static inline void free_leaf(struct leaf *l)
370{ 369{
371 call_rcu_bh(&l->rcu, __leaf_free_rcu); 370 call_rcu(&l->rcu, __leaf_free_rcu);
372} 371}
373 372
374static inline void free_leaf_info(struct leaf_info *leaf) 373static inline void free_leaf_info(struct leaf_info *leaf)
@@ -473,7 +472,7 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
473 } 472 }
474 473
475 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode), 474 pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
476 sizeof(struct rt_trie_node) << bits); 475 sizeof(struct rt_trie_node *) << bits);
477 return tn; 476 return tn;
478} 477}
479 478
@@ -490,7 +489,7 @@ static inline int tnode_full(const struct tnode *tn, const struct rt_trie_node *
490 return ((struct tnode *) n)->pos == tn->pos + tn->bits; 489 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
491} 490}
492 491
493static inline void put_child(struct trie *t, struct tnode *tn, int i, 492static inline void put_child(struct tnode *tn, int i,
494 struct rt_trie_node *n) 493 struct rt_trie_node *n)
495{ 494{
496 tnode_put_child_reorg(tn, i, n, -1); 495 tnode_put_child_reorg(tn, i, n, -1);
@@ -754,8 +753,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
754 goto nomem; 753 goto nomem;
755 } 754 }
756 755
757 put_child(t, tn, 2*i, (struct rt_trie_node *) left); 756 put_child(tn, 2*i, (struct rt_trie_node *) left);
758 put_child(t, tn, 2*i+1, (struct rt_trie_node *) right); 757 put_child(tn, 2*i+1, (struct rt_trie_node *) right);
759 } 758 }
760 } 759 }
761 760
@@ -776,9 +775,9 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
776 if (tkey_extract_bits(node->key, 775 if (tkey_extract_bits(node->key,
777 oldtnode->pos + oldtnode->bits, 776 oldtnode->pos + oldtnode->bits,
778 1) == 0) 777 1) == 0)
779 put_child(t, tn, 2*i, node); 778 put_child(tn, 2*i, node);
780 else 779 else
781 put_child(t, tn, 2*i+1, node); 780 put_child(tn, 2*i+1, node);
782 continue; 781 continue;
783 } 782 }
784 783
@@ -786,8 +785,8 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
786 inode = (struct tnode *) node; 785 inode = (struct tnode *) node;
787 786
788 if (inode->bits == 1) { 787 if (inode->bits == 1) {
789 put_child(t, tn, 2*i, rtnl_dereference(inode->child[0])); 788 put_child(tn, 2*i, rtnl_dereference(inode->child[0]));
790 put_child(t, tn, 2*i+1, rtnl_dereference(inode->child[1])); 789 put_child(tn, 2*i+1, rtnl_dereference(inode->child[1]));
791 790
792 tnode_free_safe(inode); 791 tnode_free_safe(inode);
793 continue; 792 continue;
@@ -817,22 +816,22 @@ static struct tnode *inflate(struct trie *t, struct tnode *tn)
817 */ 816 */
818 817
819 left = (struct tnode *) tnode_get_child(tn, 2*i); 818 left = (struct tnode *) tnode_get_child(tn, 2*i);
820 put_child(t, tn, 2*i, NULL); 819 put_child(tn, 2*i, NULL);
821 820
822 BUG_ON(!left); 821 BUG_ON(!left);
823 822
824 right = (struct tnode *) tnode_get_child(tn, 2*i+1); 823 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
825 put_child(t, tn, 2*i+1, NULL); 824 put_child(tn, 2*i+1, NULL);
826 825
827 BUG_ON(!right); 826 BUG_ON(!right);
828 827
829 size = tnode_child_length(left); 828 size = tnode_child_length(left);
830 for (j = 0; j < size; j++) { 829 for (j = 0; j < size; j++) {
831 put_child(t, left, j, rtnl_dereference(inode->child[j])); 830 put_child(left, j, rtnl_dereference(inode->child[j]));
832 put_child(t, right, j, rtnl_dereference(inode->child[j + size])); 831 put_child(right, j, rtnl_dereference(inode->child[j + size]));
833 } 832 }
834 put_child(t, tn, 2*i, resize(t, left)); 833 put_child(tn, 2*i, resize(t, left));
835 put_child(t, tn, 2*i+1, resize(t, right)); 834 put_child(tn, 2*i+1, resize(t, right));
836 835
837 tnode_free_safe(inode); 836 tnode_free_safe(inode);
838 } 837 }
@@ -877,7 +876,7 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
877 if (!newn) 876 if (!newn)
878 goto nomem; 877 goto nomem;
879 878
880 put_child(t, tn, i/2, (struct rt_trie_node *)newn); 879 put_child(tn, i/2, (struct rt_trie_node *)newn);
881 } 880 }
882 881
883 } 882 }
@@ -892,21 +891,21 @@ static struct tnode *halve(struct trie *t, struct tnode *tn)
892 if (left == NULL) { 891 if (left == NULL) {
893 if (right == NULL) /* Both are empty */ 892 if (right == NULL) /* Both are empty */
894 continue; 893 continue;
895 put_child(t, tn, i/2, right); 894 put_child(tn, i/2, right);
896 continue; 895 continue;
897 } 896 }
898 897
899 if (right == NULL) { 898 if (right == NULL) {
900 put_child(t, tn, i/2, left); 899 put_child(tn, i/2, left);
901 continue; 900 continue;
902 } 901 }
903 902
904 /* Two nonempty children */ 903 /* Two nonempty children */
905 newBinNode = (struct tnode *) tnode_get_child(tn, i/2); 904 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
906 put_child(t, tn, i/2, NULL); 905 put_child(tn, i/2, NULL);
907 put_child(t, newBinNode, 0, left); 906 put_child(newBinNode, 0, left);
908 put_child(t, newBinNode, 1, right); 907 put_child(newBinNode, 1, right);
909 put_child(t, tn, i/2, resize(t, newBinNode)); 908 put_child(tn, i/2, resize(t, newBinNode));
910 } 909 }
911 tnode_free_safe(oldtnode); 910 tnode_free_safe(oldtnode);
912 return tn; 911 return tn;
@@ -1125,7 +1124,7 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1125 node_set_parent((struct rt_trie_node *)l, tp); 1124 node_set_parent((struct rt_trie_node *)l, tp);
1126 1125
1127 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1126 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1128 put_child(t, tp, cindex, (struct rt_trie_node *)l); 1127 put_child(tp, cindex, (struct rt_trie_node *)l);
1129 } else { 1128 } else {
1130 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */ 1129 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
1131 /* 1130 /*
@@ -1155,12 +1154,12 @@ static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
1155 node_set_parent((struct rt_trie_node *)tn, tp); 1154 node_set_parent((struct rt_trie_node *)tn, tp);
1156 1155
1157 missbit = tkey_extract_bits(key, newpos, 1); 1156 missbit = tkey_extract_bits(key, newpos, 1);
1158 put_child(t, tn, missbit, (struct rt_trie_node *)l); 1157 put_child(tn, missbit, (struct rt_trie_node *)l);
1159 put_child(t, tn, 1-missbit, n); 1158 put_child(tn, 1-missbit, n);
1160 1159
1161 if (tp) { 1160 if (tp) {
1162 cindex = tkey_extract_bits(key, tp->pos, tp->bits); 1161 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1163 put_child(t, tp, cindex, (struct rt_trie_node *)tn); 1162 put_child(tp, cindex, (struct rt_trie_node *)tn);
1164 } else { 1163 } else {
1165 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn); 1164 rcu_assign_pointer(t->trie, (struct rt_trie_node *)tn);
1166 tp = tn; 1165 tp = tn;
@@ -1619,7 +1618,7 @@ static void trie_leaf_remove(struct trie *t, struct leaf *l)
1619 1618
1620 if (tp) { 1619 if (tp) {
1621 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits); 1620 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
1622 put_child(t, tp, cindex, NULL); 1621 put_child(tp, cindex, NULL);
1623 trie_rebalance(t, tp); 1622 trie_rebalance(t, tp);
1624 } else 1623 } else
1625 RCU_INIT_POINTER(t->trie, NULL); 1624 RCU_INIT_POINTER(t->trie, NULL);
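
[fib_trie] Besides dropping the unused struct trie argument from put_child() and switching leaf freeing from call_rcu_bh() to call_rcu(), this diff fixes the pr_debug() size report: a tnode stores 1 << bits child pointers, so the array occupies sizeof(pointer) << bits bytes, not sizeof(struct) << bits. The difference, compiled standalone:

/* Standalone illustration of the pr_debug() size fix. */
#include <stdio.h>

struct fake_node { long pad[8]; };               /* 64 bytes on LP64 */

int main(void)
{
        int bits = 4;                            /* 16 children */

        printf("overstated: %zu actual: %zu\n",
               sizeof(struct fake_node)   << bits,  /* what the old debug printed */
               sizeof(struct fake_node *) << bits); /* what is really allocated */
        return 0;
}
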
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index db0cf17c00f7..7f75f21d7b83 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -404,12 +404,15 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
404{ 404{
405 const struct inet_request_sock *ireq = inet_rsk(req); 405 const struct inet_request_sock *ireq = inet_rsk(req);
406 struct inet_sock *newinet = inet_sk(newsk); 406 struct inet_sock *newinet = inet_sk(newsk);
407 struct ip_options_rcu *opt = ireq->opt; 407 struct ip_options_rcu *opt;
408 struct net *net = sock_net(sk); 408 struct net *net = sock_net(sk);
409 struct flowi4 *fl4; 409 struct flowi4 *fl4;
410 struct rtable *rt; 410 struct rtable *rt;
411 411
412 fl4 = &newinet->cork.fl.u.ip4; 412 fl4 = &newinet->cork.fl.u.ip4;
413
414 rcu_read_lock();
415 opt = rcu_dereference(newinet->inet_opt);
413 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark, 416 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
414 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, 417 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
415 sk->sk_protocol, inet_sk_flowi_flags(sk), 418 sk->sk_protocol, inet_sk_flowi_flags(sk),
@@ -421,11 +424,13 @@ struct dst_entry *inet_csk_route_child_sock(struct sock *sk,
421 goto no_route; 424 goto no_route;
422 if (opt && opt->opt.is_strictroute && rt->rt_gateway) 425 if (opt && opt->opt.is_strictroute && rt->rt_gateway)
423 goto route_err; 426 goto route_err;
427 rcu_read_unlock();
424 return &rt->dst; 428 return &rt->dst;
425 429
426route_err: 430route_err:
427 ip_rt_put(rt); 431 ip_rt_put(rt);
428no_route: 432no_route:
433 rcu_read_unlock();
429 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); 434 IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES);
430 return NULL; 435 return NULL;
431} 436}
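
[inet_connection_sock] The child-route lookup previously used ireq->opt directly; it now takes rcu_read_lock() and fetches newinet->inet_opt with rcu_dereference(), holding the read section across every use of opt so a concurrent option update cannot free the blob mid-lookup. The read-side shape in isolation, as a kernel-style sketch; opts_strict() is a hypothetical name:

/* Every dereference of an RCU-managed options blob must sit inside
 * the read section; caching `opt` past rcu_read_unlock() would race
 * with the writer's deferred free. */
static bool opts_strict(struct inet_sock *isk)
{
        struct ip_options_rcu *opt;
        bool strict;

        rcu_read_lock();
        opt = rcu_dereference(isk->inet_opt);
        strict = opt && opt->opt.is_strictroute;
        rcu_read_unlock();

        return strict;
}
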
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 7ad88e5e7110..8d07c973409c 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -258,8 +258,8 @@ static void ip_expire(unsigned long arg)
258 /* skb dst is stale, drop it, and perform route lookup again */ 258 /* skb dst is stale, drop it, and perform route lookup again */
259 skb_dst_drop(head); 259 skb_dst_drop(head);
260 iph = ip_hdr(head); 260 iph = ip_hdr(head);
261 err = ip_route_input(head, iph->daddr, iph->saddr, 261 err = ip_route_input_noref(head, iph->daddr, iph->saddr,
262 iph->tos, head->dev); 262 iph->tos, head->dev);
263 if (err) 263 if (err)
264 goto out_rcu_unlock; 264 goto out_rcu_unlock;
265 265
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 4ebc6feee250..f1395a6fb35f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -314,6 +314,7 @@ drop:
314} 314}
315 315
316int sysctl_ip_early_demux __read_mostly = 1; 316int sysctl_ip_early_demux __read_mostly = 1;
317EXPORT_SYMBOL(sysctl_ip_early_demux);
317 318
318static int ip_rcv_finish(struct sk_buff *skb) 319static int ip_rcv_finish(struct sk_buff *skb)
319{ 320{
@@ -324,11 +325,12 @@ static int ip_rcv_finish(struct sk_buff *skb)
324 const struct net_protocol *ipprot; 325 const struct net_protocol *ipprot;
325 int protocol = iph->protocol; 326 int protocol = iph->protocol;
326 327
327 rcu_read_lock();
328 ipprot = rcu_dereference(inet_protos[protocol]); 328 ipprot = rcu_dereference(inet_protos[protocol]);
329 if (ipprot && ipprot->early_demux) 329 if (ipprot && ipprot->early_demux) {
330 ipprot->early_demux(skb); 330 ipprot->early_demux(skb);
331 rcu_read_unlock(); 331 /* must reload iph, skb->head might have changed */
332 iph = ip_hdr(skb);
333 }
332 } 334 }
333 335
334 /* 336 /*
@@ -336,8 +338,8 @@ static int ip_rcv_finish(struct sk_buff *skb)
336 * how the packet travels inside Linux networking. 338 * how the packet travels inside Linux networking.
337 */ 339 */
338 if (!skb_dst(skb)) { 340 if (!skb_dst(skb)) {
339 int err = ip_route_input(skb, iph->daddr, iph->saddr, 341 int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
340 iph->tos, skb->dev); 342 iph->tos, skb->dev);
341 if (unlikely(err)) { 343 if (unlikely(err)) {
342 if (err == -EXDEV) 344 if (err == -EXDEV)
343 NET_INC_STATS_BH(dev_net(skb->dev), 345 NET_INC_STATS_BH(dev_net(skb->dev),
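
[ip_input] early_demux handlers may pull headers and reallocate skb->head, so ip_rcv_finish() now reloads iph = ip_hdr(skb) after the callback; any pointer computed into packet data before such a call is stale afterwards. The same hazard in a userspace analogue built on realloc(3):

/* realloc(3) may move the block, so every pointer into it must be
 * re-derived -- the reason iph is reloaded after early_demux above. */
#include <stdlib.h>
#include <string.h>

static char *append(char *buf, size_t *len, const char *s)
{
        size_t add = strlen(s);
        char *nb = realloc(buf, *len + add + 1);  /* may move the block */

        if (!nb)
                return buf;                       /* old pointers still valid */
        memcpy(nb + *len, s, add + 1);
        *len += add;
        return nb;      /* caller must drop every pointer into the old buf */
}
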
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ba39a52d18c1..c196d749daf2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -197,7 +197,7 @@ static inline int ip_finish_output2(struct sk_buff *skb)
197 neigh = __ipv4_neigh_lookup_noref(dev, nexthop); 197 neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
198 if (unlikely(!neigh)) 198 if (unlikely(!neigh))
199 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false); 199 neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
200 if (neigh) { 200 if (!IS_ERR(neigh)) {
201 int res = dst_neigh_output(dst, neigh, skb); 201 int res = dst_neigh_output(dst, neigh, skb);
202 202
203 rcu_read_unlock_bh(); 203 rcu_read_unlock_bh();
@@ -1338,10 +1338,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
1338 iph->ihl = 5; 1338 iph->ihl = 5;
1339 iph->tos = inet->tos; 1339 iph->tos = inet->tos;
1340 iph->frag_off = df; 1340 iph->frag_off = df;
1341 ip_select_ident(iph, &rt->dst, sk);
1342 iph->ttl = ttl; 1341 iph->ttl = ttl;
1343 iph->protocol = sk->sk_protocol; 1342 iph->protocol = sk->sk_protocol;
1344 ip_copy_addrs(iph, fl4); 1343 ip_copy_addrs(iph, fl4);
1344 ip_select_ident(iph, &rt->dst, sk);
1345 1345
1346 if (opt) { 1346 if (opt) {
1347 iph->ihl += opt->optlen>>2; 1347 iph->ihl += opt->optlen>>2;
@@ -1366,9 +1366,8 @@ out:
1366 return skb; 1366 return skb;
1367} 1367}
1368 1368
1369int ip_send_skb(struct sk_buff *skb) 1369int ip_send_skb(struct net *net, struct sk_buff *skb)
1370{ 1370{
1371 struct net *net = sock_net(skb->sk);
1372 int err; 1371 int err;
1373 1372
1374 err = ip_local_out(skb); 1373 err = ip_local_out(skb);
@@ -1391,7 +1390,7 @@ int ip_push_pending_frames(struct sock *sk, struct flowi4 *fl4)
1391 return 0; 1390 return 0;
1392 1391
1393 /* Netfilter gets whole the not fragmented skb. */ 1392 /* Netfilter gets whole the not fragmented skb. */
1394 return ip_send_skb(skb); 1393 return ip_send_skb(sock_net(sk), skb);
1395} 1394}
1396 1395
1397/* 1396/*
@@ -1536,6 +1535,7 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, __be32 daddr,
1536 arg->csumoffset) = csum_fold(csum_add(nskb->csum, 1535 arg->csumoffset) = csum_fold(csum_add(nskb->csum,
1537 arg->csum)); 1536 arg->csum));
1538 nskb->ip_summed = CHECKSUM_NONE; 1537 nskb->ip_summed = CHECKSUM_NONE;
1538 skb_orphan(nskb);
1539 skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); 1539 skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
1540 ip_push_pending_frames(sk, &fl4); 1540 ip_push_pending_frames(sk, &fl4);
1541 } 1541 }
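
[ip_output] Two ordering fixes here: ip_select_ident() moves after ip_copy_addrs(), presumably because ident selection consults the header's addresses, which were not yet written at the old call site; and ip_send_skb() now takes struct net explicitly since ip_send_unicast_reply() orphans the skb, so sock_net(skb->sk) can no longer be trusted at send time. A toy sketch of the first dependency; pick_id() is a hypothetical stand-in, not the kernel's ident algorithm:

/* The ID is derived from header fields, so it can only be picked
 * once the addresses are in place. */
struct toy_hdr {
        unsigned int saddr, daddr, id;
};

static unsigned int pick_id(const struct toy_hdr *h)
{
        return (h->saddr ^ h->daddr) & 0xffff;  /* depends on both addresses */
}

static void build_hdr(struct toy_hdr *h, unsigned int s, unsigned int d)
{
        h->saddr = s;
        h->daddr = d;
        h->id = pick_id(h);     /* after the addresses are written, never before */
}
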
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8eec8f4a0536..ebdf06f938bf 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -124,6 +124,8 @@ static DEFINE_SPINLOCK(mfc_unres_lock);
124static struct kmem_cache *mrt_cachep __read_mostly; 124static struct kmem_cache *mrt_cachep __read_mostly;
125 125
126static struct mr_table *ipmr_new_table(struct net *net, u32 id); 126static struct mr_table *ipmr_new_table(struct net *net, u32 id);
127static void ipmr_free_table(struct mr_table *mrt);
128
127static int ip_mr_forward(struct net *net, struct mr_table *mrt, 129static int ip_mr_forward(struct net *net, struct mr_table *mrt,
128 struct sk_buff *skb, struct mfc_cache *cache, 130 struct sk_buff *skb, struct mfc_cache *cache,
129 int local); 131 int local);
@@ -131,6 +133,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
131 struct sk_buff *pkt, vifi_t vifi, int assert); 133 struct sk_buff *pkt, vifi_t vifi, int assert);
132static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, 134static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
133 struct mfc_cache *c, struct rtmsg *rtm); 135 struct mfc_cache *c, struct rtmsg *rtm);
136static void mroute_clean_tables(struct mr_table *mrt);
134static void ipmr_expire_process(unsigned long arg); 137static void ipmr_expire_process(unsigned long arg);
135 138
136#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES 139#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -271,7 +274,7 @@ static void __net_exit ipmr_rules_exit(struct net *net)
271 274
272 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { 275 list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
273 list_del(&mrt->list); 276 list_del(&mrt->list);
274 kfree(mrt); 277 ipmr_free_table(mrt);
275 } 278 }
276 fib_rules_unregister(net->ipv4.mr_rules_ops); 279 fib_rules_unregister(net->ipv4.mr_rules_ops);
277} 280}
@@ -299,7 +302,7 @@ static int __net_init ipmr_rules_init(struct net *net)
299 302
300static void __net_exit ipmr_rules_exit(struct net *net) 303static void __net_exit ipmr_rules_exit(struct net *net)
301{ 304{
302 kfree(net->ipv4.mrt); 305 ipmr_free_table(net->ipv4.mrt);
303} 306}
304#endif 307#endif
305 308
@@ -336,6 +339,13 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
336 return mrt; 339 return mrt;
337} 340}
338 341
342static void ipmr_free_table(struct mr_table *mrt)
343{
344 del_timer_sync(&mrt->ipmr_expire_timer);
345 mroute_clean_tables(mrt);
346 kfree(mrt);
347}
348
339/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ 349/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
340 350
341static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v) 351static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
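
[ipmr] ipmr_free_table() replaces the bare kfree() on table teardown and encodes the required ordering: quiesce the expiry timer synchronously, drain the route entries, and only then free the structure. Freeing first would leave an armed timer pointing at freed memory. The new helper, annotated step by step:

static void ipmr_free_table(struct mr_table *mrt)
{
        del_timer_sync(&mrt->ipmr_expire_timer); /* 1: wait out any running callback */
        mroute_clean_tables(mrt);                /* 2: drop vifs and cache entries   */
        kfree(mrt);                              /* 3: only now is the free safe     */
}
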
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index ea4a23813d26..9c87cde28ff8 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -148,7 +148,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
148 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen, 148 if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
149 hdr, NULL, &matchoff, &matchlen, 149 hdr, NULL, &matchoff, &matchlen,
150 &addr, &port) > 0) { 150 &addr, &port) > 0) {
151 unsigned int matchend, poff, plen, buflen, n; 151 unsigned int olen, matchend, poff, plen, buflen, n;
152 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; 152 char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
153 153
154 /* We're only interested in headers related to this 154 /* We're only interested in headers related to this
@@ -163,17 +163,18 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
163 goto next; 163 goto next;
164 } 164 }
165 165
166 olen = *datalen;
166 if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen, 167 if (!map_addr(skb, dataoff, dptr, datalen, matchoff, matchlen,
167 &addr, port)) 168 &addr, port))
168 return NF_DROP; 169 return NF_DROP;
169 170
170 matchend = matchoff + matchlen; 171 matchend = matchoff + matchlen + *datalen - olen;
171 172
172 /* The maddr= parameter (RFC 2361) specifies where to send 173 /* The maddr= parameter (RFC 2361) specifies where to send
173 * the reply. */ 174 * the reply. */
174 if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, 175 if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
175 "maddr=", &poff, &plen, 176 "maddr=", &poff, &plen,
176 &addr) > 0 && 177 &addr, true) > 0 &&
177 addr.ip == ct->tuplehash[dir].tuple.src.u3.ip && 178 addr.ip == ct->tuplehash[dir].tuple.src.u3.ip &&
178 addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) { 179 addr.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) {
179 buflen = sprintf(buffer, "%pI4", 180 buflen = sprintf(buffer, "%pI4",
@@ -187,7 +188,7 @@ static unsigned int ip_nat_sip(struct sk_buff *skb, unsigned int dataoff,
187 * from which the server received the request. */ 188 * from which the server received the request. */
188 if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen, 189 if (ct_sip_parse_address_param(ct, *dptr, matchend, *datalen,
189 "received=", &poff, &plen, 190 "received=", &poff, &plen,
190 &addr) > 0 && 191 &addr, false) > 0 &&
191 addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip && 192 addr.ip == ct->tuplehash[dir].tuple.dst.u3.ip &&
192 addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { 193 addr.ip != ct->tuplehash[!dir].tuple.src.u3.ip) {
193 buflen = sprintf(buffer, "%pI4", 194 buflen = sprintf(buffer, "%pI4",
@@ -501,7 +502,10 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
501 ret = nf_ct_expect_related(rtcp_exp); 502 ret = nf_ct_expect_related(rtcp_exp);
502 if (ret == 0) 503 if (ret == 0)
503 break; 504 break;
504 else if (ret != -EBUSY) { 505 else if (ret == -EBUSY) {
506 nf_ct_unexpect_related(rtp_exp);
507 continue;
508 } else if (ret < 0) {
505 nf_ct_unexpect_related(rtp_exp); 509 nf_ct_unexpect_related(rtp_exp);
506 port = 0; 510 port = 0;
507 break; 511 break;
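
[nf_nat_sip] The header-rewrite fix snapshots olen = *datalen before map_addr() edits the URI in place; since the substituted address can differ in length, matchend must be shifted by the delta (*datalen - olen), or every later parameter search (maddr=, received=) scans from the wrong offset. The fixup in a generic, compilable form:

/* After an in-place edit changes the buffer length, any offset
 * recorded before the edit and lying beyond the edited span shifts
 * by the length delta -- the role olen plays above. */
#include <stddef.h>

struct edit {
        size_t off;      /* where the edit happened */
        size_t old_len;  /* bytes replaced */
        size_t new_len;  /* bytes substituted */
};

static size_t fixup_offset(size_t pos, const struct edit *e)
{
        if (pos >= e->off + e->old_len)
                return pos + e->new_len - e->old_len;
        return pos;      /* positions before the edit are unaffected */
}
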
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6bcb8fc71cbc..82cf2a722b23 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -70,7 +70,6 @@
70#include <linux/types.h> 70#include <linux/types.h>
71#include <linux/kernel.h> 71#include <linux/kernel.h>
72#include <linux/mm.h> 72#include <linux/mm.h>
73#include <linux/bootmem.h>
74#include <linux/string.h> 73#include <linux/string.h>
75#include <linux/socket.h> 74#include <linux/socket.h>
76#include <linux/sockios.h> 75#include <linux/sockios.h>
@@ -80,7 +79,6 @@
80#include <linux/netdevice.h> 79#include <linux/netdevice.h>
81#include <linux/proc_fs.h> 80#include <linux/proc_fs.h>
82#include <linux/init.h> 81#include <linux/init.h>
83#include <linux/workqueue.h>
84#include <linux/skbuff.h> 82#include <linux/skbuff.h>
85#include <linux/inetdevice.h> 83#include <linux/inetdevice.h>
86#include <linux/igmp.h> 84#include <linux/igmp.h>
@@ -88,11 +86,9 @@
88#include <linux/mroute.h> 86#include <linux/mroute.h>
89#include <linux/netfilter_ipv4.h> 87#include <linux/netfilter_ipv4.h>
90#include <linux/random.h> 88#include <linux/random.h>
91#include <linux/jhash.h>
92#include <linux/rcupdate.h> 89#include <linux/rcupdate.h>
93#include <linux/times.h> 90#include <linux/times.h>
94#include <linux/slab.h> 91#include <linux/slab.h>
95#include <linux/prefetch.h>
96#include <net/dst.h> 92#include <net/dst.h>
97#include <net/net_namespace.h> 93#include <net/net_namespace.h>
98#include <net/protocol.h> 94#include <net/protocol.h>
@@ -147,6 +143,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
147 struct sk_buff *skb, u32 mtu); 143 struct sk_buff *skb, u32 mtu);
148static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, 144static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
149 struct sk_buff *skb); 145 struct sk_buff *skb);
146static void ipv4_dst_destroy(struct dst_entry *dst);
150 147
151static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 148static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
152 int how) 149 int how)
@@ -170,6 +167,7 @@ static struct dst_ops ipv4_dst_ops = {
170 .default_advmss = ipv4_default_advmss, 167 .default_advmss = ipv4_default_advmss,
171 .mtu = ipv4_mtu, 168 .mtu = ipv4_mtu,
172 .cow_metrics = ipv4_cow_metrics, 169 .cow_metrics = ipv4_cow_metrics,
170 .destroy = ipv4_dst_destroy,
173 .ifdown = ipv4_dst_ifdown, 171 .ifdown = ipv4_dst_ifdown,
174 .negative_advice = ipv4_negative_advice, 172 .negative_advice = ipv4_negative_advice,
175 .link_failure = ipv4_link_failure, 173 .link_failure = ipv4_link_failure,
@@ -444,7 +442,7 @@ static inline int ip_rt_proc_init(void)
444} 442}
445#endif /* CONFIG_PROC_FS */ 443#endif /* CONFIG_PROC_FS */
446 444
447static inline int rt_is_expired(struct rtable *rth) 445static inline bool rt_is_expired(const struct rtable *rth)
448{ 446{
449 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev)); 447 return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
450} 448}
@@ -587,11 +585,17 @@ static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
587 build_sk_flow_key(fl4, sk); 585 build_sk_flow_key(fl4, sk);
588} 586}
589 587
590static DEFINE_SEQLOCK(fnhe_seqlock); 588static inline void rt_free(struct rtable *rt)
589{
590 call_rcu(&rt->dst.rcu_head, dst_rcu_free);
591}
592
593static DEFINE_SPINLOCK(fnhe_lock);
591 594
592static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash) 595static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
593{ 596{
594 struct fib_nh_exception *fnhe, *oldest; 597 struct fib_nh_exception *fnhe, *oldest;
598 struct rtable *orig;
595 599
596 oldest = rcu_dereference(hash->chain); 600 oldest = rcu_dereference(hash->chain);
597 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe; 601 for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
@@ -599,6 +603,11 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
599 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) 603 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
600 oldest = fnhe; 604 oldest = fnhe;
601 } 605 }
606 orig = rcu_dereference(oldest->fnhe_rth);
607 if (orig) {
608 RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
609 rt_free(orig);
610 }
602 return oldest; 611 return oldest;
603} 612}
604 613
@@ -620,7 +629,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
620 int depth; 629 int depth;
621 u32 hval = fnhe_hashfun(daddr); 630 u32 hval = fnhe_hashfun(daddr);
622 631
623 write_seqlock_bh(&fnhe_seqlock); 632 spin_lock_bh(&fnhe_lock);
624 633
625 hash = nh->nh_exceptions; 634 hash = nh->nh_exceptions;
626 if (!hash) { 635 if (!hash) {
@@ -667,7 +676,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
667 fnhe->fnhe_stamp = jiffies; 676 fnhe->fnhe_stamp = jiffies;
668 677
669out_unlock: 678out_unlock:
670 write_sequnlock_bh(&fnhe_seqlock); 679 spin_unlock_bh(&fnhe_lock);
671 return; 680 return;
672} 681}
673 682
@@ -925,12 +934,14 @@ static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
925 if (mtu < ip_rt_min_pmtu) 934 if (mtu < ip_rt_min_pmtu)
926 mtu = ip_rt_min_pmtu; 935 mtu = ip_rt_min_pmtu;
927 936
937 rcu_read_lock();
928 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) { 938 if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
929 struct fib_nh *nh = &FIB_RES_NH(res); 939 struct fib_nh *nh = &FIB_RES_NH(res);
930 940
931 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, 941 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
932 jiffies + ip_rt_mtu_expires); 942 jiffies + ip_rt_mtu_expires);
933 } 943 }
944 rcu_read_unlock();
934 return mtu; 945 return mtu;
935} 946}
936 947
@@ -947,7 +958,7 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
947 dst->obsolete = DST_OBSOLETE_KILL; 958 dst->obsolete = DST_OBSOLETE_KILL;
948 } else { 959 } else {
949 rt->rt_pmtu = mtu; 960 rt->rt_pmtu = mtu;
950 dst_set_expires(&rt->dst, ip_rt_mtu_expires); 961 rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
951 } 962 }
952} 963}
953 964
@@ -1164,67 +1175,126 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1164 return NULL; 1175 return NULL;
1165} 1176}
1166 1177
1167static void rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe, 1178static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1168 __be32 daddr) 1179 __be32 daddr)
1169{ 1180{
1170 __be32 fnhe_daddr, gw; 1181 bool ret = false;
1171 unsigned long expires; 1182
1172 unsigned int seq; 1183 spin_lock_bh(&fnhe_lock);
1173 u32 pmtu;
1174
1175restart:
1176 seq = read_seqbegin(&fnhe_seqlock);
1177 fnhe_daddr = fnhe->fnhe_daddr;
1178 gw = fnhe->fnhe_gw;
1179 pmtu = fnhe->fnhe_pmtu;
1180 expires = fnhe->fnhe_expires;
1181 if (read_seqretry(&fnhe_seqlock, seq))
1182 goto restart;
1183
1184 if (daddr != fnhe_daddr)
1185 return;
1186 1184
1187 if (pmtu) { 1185 if (daddr == fnhe->fnhe_daddr) {
1188 unsigned long diff = expires - jiffies; 1186 struct rtable *orig;
1189 1187
1190 if (time_before(jiffies, expires)) { 1188 if (fnhe->fnhe_pmtu) {
1191 rt->rt_pmtu = pmtu; 1189 unsigned long expires = fnhe->fnhe_expires;
1192 dst_set_expires(&rt->dst, diff); 1190 unsigned long diff = expires - jiffies;
1191
1192 if (time_before(jiffies, expires)) {
1193 rt->rt_pmtu = fnhe->fnhe_pmtu;
1194 dst_set_expires(&rt->dst, diff);
1195 }
1193 } 1196 }
1197 if (fnhe->fnhe_gw) {
1198 rt->rt_flags |= RTCF_REDIRECTED;
1199 rt->rt_gateway = fnhe->fnhe_gw;
1200 }
1201
1202 orig = rcu_dereference(fnhe->fnhe_rth);
1203 rcu_assign_pointer(fnhe->fnhe_rth, rt);
1204 if (orig)
1205 rt_free(orig);
1206
1207 fnhe->fnhe_stamp = jiffies;
1208 ret = true;
1209 } else {
1210 /* Routes we intend to cache in nexthop exception have
1211 * the DST_NOCACHE bit clear. However, if we are
1212 * unsuccessful at storing this route into the cache
1213 * we really need to set it.
1214 */
1215 rt->dst.flags |= DST_NOCACHE;
1194 } 1216 }
1195 if (gw) { 1217 spin_unlock_bh(&fnhe_lock);
1196 rt->rt_flags |= RTCF_REDIRECTED;
1197 rt->rt_gateway = gw;
1198 }
1199 fnhe->fnhe_stamp = jiffies;
1200}
1201 1218
1202static inline void rt_release_rcu(struct rcu_head *head) 1219 return ret;
1203{
1204 struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head);
1205 dst_release(dst);
1206} 1220}
1207 1221
1208static void rt_cache_route(struct fib_nh *nh, struct rtable *rt) 1222static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1209{ 1223{
1210 struct rtable *orig, *prev, **p = &nh->nh_rth_output; 1224 struct rtable *orig, *prev, **p;
1211 1225 bool ret = true;
1212 if (rt_is_input_route(rt))
1213 p = &nh->nh_rth_input;
1214 1226
1227 if (rt_is_input_route(rt)) {
1228 p = (struct rtable **)&nh->nh_rth_input;
1229 } else {
1230 if (!nh->nh_pcpu_rth_output)
1231 goto nocache;
1232 p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
1233 }
1215 orig = *p; 1234 orig = *p;
1216 1235
1217 prev = cmpxchg(p, orig, rt); 1236 prev = cmpxchg(p, orig, rt);
1218 if (prev == orig) { 1237 if (prev == orig) {
1219 dst_clone(&rt->dst);
1220 if (orig) 1238 if (orig)
1221 call_rcu_bh(&orig->dst.rcu_head, rt_release_rcu); 1239 rt_free(orig);
1240 } else {
1241 /* Routes we intend to cache in the FIB nexthop have
1242 * the DST_NOCACHE bit clear. However, if we are
1243 * unsuccessful at storing this route into the cache
1244 * we really need to set it.
1245 */
1246nocache:
1247 rt->dst.flags |= DST_NOCACHE;
1248 ret = false;
1249 }
1250
1251 return ret;
1252}
1253
1254static DEFINE_SPINLOCK(rt_uncached_lock);
1255static LIST_HEAD(rt_uncached_list);
1256
1257static void rt_add_uncached_list(struct rtable *rt)
1258{
1259 spin_lock_bh(&rt_uncached_lock);
1260 list_add_tail(&rt->rt_uncached, &rt_uncached_list);
1261 spin_unlock_bh(&rt_uncached_lock);
1262}
1263
1264static void ipv4_dst_destroy(struct dst_entry *dst)
1265{
1266 struct rtable *rt = (struct rtable *) dst;
1267
1268 if (!list_empty(&rt->rt_uncached)) {
1269 spin_lock_bh(&rt_uncached_lock);
1270 list_del(&rt->rt_uncached);
1271 spin_unlock_bh(&rt_uncached_lock);
1222 } 1272 }
1223} 1273}
1224 1274
1225static bool rt_cache_valid(struct rtable *rt) 1275void rt_flush_dev(struct net_device *dev)
1226{ 1276{
1227 return (rt && rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK); 1277 if (!list_empty(&rt_uncached_list)) {
1278 struct net *net = dev_net(dev);
1279 struct rtable *rt;
1280
1281 spin_lock_bh(&rt_uncached_lock);
1282 list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
1283 if (rt->dst.dev != dev)
1284 continue;
1285 rt->dst.dev = net->loopback_dev;
1286 dev_hold(rt->dst.dev);
1287 dev_put(dev);
1288 }
1289 spin_unlock_bh(&rt_uncached_lock);
1290 }
1291}
1292
1293static bool rt_cache_valid(const struct rtable *rt)
1294{
1295 return rt &&
1296 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1297 !rt_is_expired(rt);
1228} 1298}
1229 1299
1230static void rt_set_nexthop(struct rtable *rt, __be32 daddr, 1300static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
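
[route] This hunk is the heart of the series: cached routes are installed with a lockless cmpxchg() in rt_cache_route(), losers of the race and routes that cannot be cached are flagged DST_NOCACHE and parked on rt_uncached_list, and rt_flush_dev() (called from the fib_frontend.c NETDEV_UNREGISTER hook above) retargets any still-live uncached route at the loopback device so the unregistering device can actually go away. The install pattern in isolation, as a kernel-style sketch; install_cached() is a hypothetical name:

/* Exactly one writer wins the cmpxchg; the loser marks its route
 * DST_NOCACHE so it is freed with its last reference instead of
 * leaking into the cache. */
static bool install_cached(struct rtable **slot, struct rtable *rt)
{
        struct rtable *orig = *slot;

        if (cmpxchg(slot, orig, rt) == orig) {
                if (orig)
                        rt_free(orig);          /* call_rcu-deferred dst free */
                return true;
        }
        rt->dst.flags |= DST_NOCACHE;           /* lost the race: don't cache */
        return false;
}
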
@@ -1232,20 +1302,24 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1232 struct fib_nh_exception *fnhe, 1302 struct fib_nh_exception *fnhe,
1233 struct fib_info *fi, u16 type, u32 itag) 1303 struct fib_info *fi, u16 type, u32 itag)
1234{ 1304{
1305 bool cached = false;
1306
1235 if (fi) { 1307 if (fi) {
1236 struct fib_nh *nh = &FIB_RES_NH(*res); 1308 struct fib_nh *nh = &FIB_RES_NH(*res);
1237 1309
1238 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) 1310 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
1239 rt->rt_gateway = nh->nh_gw; 1311 rt->rt_gateway = nh->nh_gw;
1240 if (unlikely(fnhe))
1241 rt_bind_exception(rt, fnhe, daddr);
1242 dst_init_metrics(&rt->dst, fi->fib_metrics, true); 1312 dst_init_metrics(&rt->dst, fi->fib_metrics, true);
1243#ifdef CONFIG_IP_ROUTE_CLASSID 1313#ifdef CONFIG_IP_ROUTE_CLASSID
1244 rt->dst.tclassid = nh->nh_tclassid; 1314 rt->dst.tclassid = nh->nh_tclassid;
1245#endif 1315#endif
1246 if (!(rt->dst.flags & DST_HOST)) 1316 if (unlikely(fnhe))
1247 rt_cache_route(nh, rt); 1317 cached = rt_bind_exception(rt, fnhe, daddr);
1318 else if (!(rt->dst.flags & DST_NOCACHE))
1319 cached = rt_cache_route(nh, rt);
1248 } 1320 }
1321 if (unlikely(!cached))
1322 rt_add_uncached_list(rt);
1249 1323
1250#ifdef CONFIG_IP_ROUTE_CLASSID 1324#ifdef CONFIG_IP_ROUTE_CLASSID
1251#ifdef CONFIG_IP_MULTIPLE_TABLES 1325#ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1259,7 +1333,7 @@ static struct rtable *rt_dst_alloc(struct net_device *dev,
1259 bool nopolicy, bool noxfrm, bool will_cache) 1333 bool nopolicy, bool noxfrm, bool will_cache)
1260{ 1334{
1261 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK, 1335 return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1262 (will_cache ? 0 : DST_HOST) | DST_NOCACHE | 1336 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1263 (nopolicy ? DST_NOPOLICY : 0) | 1337 (nopolicy ? DST_NOPOLICY : 0) |
1264 (noxfrm ? DST_NOXFRM : 0)); 1338 (noxfrm ? DST_NOXFRM : 0));
1265} 1339}
@@ -1312,6 +1386,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1312 rth->rt_iif = 0; 1386 rth->rt_iif = 0;
1313 rth->rt_pmtu = 0; 1387 rth->rt_pmtu = 0;
1314 rth->rt_gateway = 0; 1388 rth->rt_gateway = 0;
1389 INIT_LIST_HEAD(&rth->rt_uncached);
1315 if (our) { 1390 if (our) {
1316 rth->dst.input= ip_local_deliver; 1391 rth->dst.input= ip_local_deliver;
1317 rth->rt_flags |= RTCF_LOCAL; 1392 rth->rt_flags |= RTCF_LOCAL;
@@ -1364,8 +1439,7 @@ static void ip_handle_martian_source(struct net_device *dev,
1364static int __mkroute_input(struct sk_buff *skb, 1439static int __mkroute_input(struct sk_buff *skb,
1365 const struct fib_result *res, 1440 const struct fib_result *res,
1366 struct in_device *in_dev, 1441 struct in_device *in_dev,
1367 __be32 daddr, __be32 saddr, u32 tos, 1442 __be32 daddr, __be32 saddr, u32 tos)
1368 struct rtable **result)
1369{ 1443{
1370 struct rtable *rth; 1444 struct rtable *rth;
1371 int err; 1445 int err;
@@ -1414,9 +1488,9 @@ static int __mkroute_input(struct sk_buff *skb,
1414 do_cache = false; 1488 do_cache = false;
1415 if (res->fi) { 1489 if (res->fi) {
1416 if (!itag) { 1490 if (!itag) {
1417 rth = FIB_RES_NH(*res).nh_rth_input; 1491 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1418 if (rt_cache_valid(rth)) { 1492 if (rt_cache_valid(rth)) {
1419 dst_hold(&rth->dst); 1493 skb_dst_set_noref(skb, &rth->dst);
1420 goto out; 1494 goto out;
1421 } 1495 }
1422 do_cache = true; 1496 do_cache = true;
@@ -1438,13 +1512,14 @@ static int __mkroute_input(struct sk_buff *skb,
1438 rth->rt_iif = 0; 1512 rth->rt_iif = 0;
1439 rth->rt_pmtu = 0; 1513 rth->rt_pmtu = 0;
1440 rth->rt_gateway = 0; 1514 rth->rt_gateway = 0;
1515 INIT_LIST_HEAD(&rth->rt_uncached);
1441 1516
1442 rth->dst.input = ip_forward; 1517 rth->dst.input = ip_forward;
1443 rth->dst.output = ip_output; 1518 rth->dst.output = ip_output;
1444 1519
1445 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag); 1520 rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
1521 skb_dst_set(skb, &rth->dst);
1446out: 1522out:
1447 *result = rth;
1448 err = 0; 1523 err = 0;
1449 cleanup: 1524 cleanup:
1450 return err; 1525 return err;
@@ -1456,21 +1531,13 @@ static int ip_mkroute_input(struct sk_buff *skb,
1456 struct in_device *in_dev, 1531 struct in_device *in_dev,
1457 __be32 daddr, __be32 saddr, u32 tos) 1532 __be32 daddr, __be32 saddr, u32 tos)
1458{ 1533{
1459 struct rtable *rth = NULL;
1460 int err;
1461
1462#ifdef CONFIG_IP_ROUTE_MULTIPATH 1534#ifdef CONFIG_IP_ROUTE_MULTIPATH
1463 if (res->fi && res->fi->fib_nhs > 1) 1535 if (res->fi && res->fi->fib_nhs > 1)
1464 fib_select_multipath(res); 1536 fib_select_multipath(res);
1465#endif 1537#endif
1466 1538
1467 /* create a routing cache entry */ 1539 /* create a routing cache entry */
1468 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); 1540 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1469 if (err)
1470 return err;
1471
1472 skb_dst_set(skb, &rth->dst);
1473 return 0;
1474} 1541}
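
The input-path lookups above replace dst_hold() on the per-nexthop cached route with skb_dst_set_noref(): the skb borrows the dst instead of taking a reference. A minimal sketch of the pattern, assuming (as the rename to ip_route_input_noref() further down indicates) that the whole input path runs under rcu_read_lock() and that anything queueing the skb beyond it takes a real reference, e.g. via skb_dst_force():

	rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
	if (rt_cache_valid(rth)) {
		/* borrow under RCU: no atomic refcount bump per packet */
		skb_dst_set_noref(skb, &rth->dst);
		goto out;
	}
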
1475 1542
1476/* 1543/*
@@ -1584,10 +1651,11 @@ local_input:
1584 do_cache = false; 1651 do_cache = false;
1585 if (res.fi) { 1652 if (res.fi) {
1586 if (!itag) { 1653 if (!itag) {
1587 rth = FIB_RES_NH(res).nh_rth_input; 1654 rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1588 if (rt_cache_valid(rth)) { 1655 if (rt_cache_valid(rth)) {
1589 dst_hold(&rth->dst); 1656 skb_dst_set_noref(skb, &rth->dst);
1590 goto set_and_out; 1657 err = 0;
1658 goto out;
1591 } 1659 }
1592 do_cache = true; 1660 do_cache = true;
1593 } 1661 }
@@ -1611,6 +1679,7 @@ local_input:
1611 rth->rt_iif = 0; 1679 rth->rt_iif = 0;
1612 rth->rt_pmtu = 0; 1680 rth->rt_pmtu = 0;
1613 rth->rt_gateway = 0; 1681 rth->rt_gateway = 0;
1682 INIT_LIST_HEAD(&rth->rt_uncached);
1614 if (res.type == RTN_UNREACHABLE) { 1683 if (res.type == RTN_UNREACHABLE) {
1615 rth->dst.input= ip_error; 1684 rth->dst.input= ip_error;
1616 rth->dst.error= -err; 1685 rth->dst.error= -err;
@@ -1618,7 +1687,6 @@ local_input:
1618 } 1687 }
1619 if (do_cache) 1688 if (do_cache)
1620 rt_cache_route(&FIB_RES_NH(res), rth); 1689 rt_cache_route(&FIB_RES_NH(res), rth);
1621set_and_out:
1622 skb_dst_set(skb, &rth->dst); 1690 skb_dst_set(skb, &rth->dst);
1623 err = 0; 1691 err = 0;
1624 goto out; 1692 goto out;
@@ -1656,8 +1724,8 @@ martian_source_keep_err:
1656 goto out; 1724 goto out;
1657} 1725}
1658 1726
1659int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, 1727int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1660 u8 tos, struct net_device *dev) 1728 u8 tos, struct net_device *dev)
1661{ 1729{
1662 int res; 1730 int res;
1663 1731
@@ -1700,7 +1768,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1700 rcu_read_unlock(); 1768 rcu_read_unlock();
1701 return res; 1769 return res;
1702} 1770}
1703EXPORT_SYMBOL(ip_route_input); 1771EXPORT_SYMBOL(ip_route_input_noref);
1704 1772
1705/* called with rcu_read_lock() */ 1773/* called with rcu_read_lock() */
1706static struct rtable *__mkroute_output(const struct fib_result *res, 1774static struct rtable *__mkroute_output(const struct fib_result *res,
@@ -1750,19 +1818,23 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
1750 1818
1751 fnhe = NULL; 1819 fnhe = NULL;
1752 if (fi) { 1820 if (fi) {
1821 struct rtable __rcu **prth;
1822
1753 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr); 1823 fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
1754 if (!fnhe) { 1824 if (fnhe)
1755 rth = FIB_RES_NH(*res).nh_rth_output; 1825 prth = &fnhe->fnhe_rth;
1756 if (rt_cache_valid(rth)) { 1826 else
1757 dst_hold(&rth->dst); 1827 prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
1758 return rth; 1828 rth = rcu_dereference(*prth);
1759 } 1829 if (rt_cache_valid(rth)) {
1830 dst_hold(&rth->dst);
1831 return rth;
1760 } 1832 }
1761 } 1833 }
1762 rth = rt_dst_alloc(dev_out, 1834 rth = rt_dst_alloc(dev_out,
1763 IN_DEV_CONF_GET(in_dev, NOPOLICY), 1835 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1764 IN_DEV_CONF_GET(in_dev, NOXFRM), 1836 IN_DEV_CONF_GET(in_dev, NOXFRM),
1765 fi && !fnhe); 1837 fi);
1766 if (!rth) 1838 if (!rth)
1767 return ERR_PTR(-ENOBUFS); 1839 return ERR_PTR(-ENOBUFS);
1768 1840
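
On the output side, the single shared nh_rth_output slot gives way to a per-CPU slot plus a per-exception slot: the lookup picks fnhe->fnhe_rth when a next-hop exception (PMTU/redirect state) matches, otherwise this CPU's nh_pcpu_rth_output. A condensed sketch of the selection as the hunk wires it:

	struct rtable __rcu **prth;

	if (fnhe)
		prth = &fnhe->fnhe_rth;	/* exception routes get their own slot */
	else
		prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
	rth = rcu_dereference(*prth);
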
@@ -1775,6 +1847,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
1775 rth->rt_iif = orig_oif ? : 0; 1847 rth->rt_iif = orig_oif ? : 0;
1776 rth->rt_pmtu = 0; 1848 rth->rt_pmtu = 0;
1777 rth->rt_gateway = 0; 1849 rth->rt_gateway = 0;
1850 INIT_LIST_HEAD(&rth->rt_uncached);
1778 1851
1779 RT_CACHE_STAT_INC(out_slow_tot); 1852 RT_CACHE_STAT_INC(out_slow_tot);
1780 1853
@@ -1957,7 +2030,6 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
1957 } 2030 }
1958 dev_out = net->loopback_dev; 2031 dev_out = net->loopback_dev;
1959 fl4->flowi4_oif = dev_out->ifindex; 2032 fl4->flowi4_oif = dev_out->ifindex;
1960 res.fi = NULL;
1961 flags |= RTCF_LOCAL; 2033 flags |= RTCF_LOCAL;
1962 goto make_route; 2034 goto make_route;
1963 } 2035 }
@@ -2054,6 +2126,8 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
2054 rt->rt_type = ort->rt_type; 2126 rt->rt_type = ort->rt_type;
2055 rt->rt_gateway = ort->rt_gateway; 2127 rt->rt_gateway = ort->rt_gateway;
2056 2128
2129 INIT_LIST_HEAD(&rt->rt_uncached);
2130
2057 dst_free(new); 2131 dst_free(new);
2058 } 2132 }
2059 2133
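
Taken together, the route.c hunks add an "uncached" fallback: any rtable that could not be parked in a nexthop cache slot is threaded onto rt_uncached_list, which is what lets the rt_flush_dev() loop at the top of this section retarget such routes at the loopback device when their device is unregistered. rt_add_uncached_list() itself is outside the visible context, so the body below is a sketch inferred from the list and lock names used above:

	static LIST_HEAD(rt_uncached_list);
	static DEFINE_SPINLOCK(rt_uncached_lock);

	static void rt_add_uncached_list(struct rtable *rt)
	{
		spin_lock_bh(&rt_uncached_lock);
		list_add(&rt->rt_uncached, &rt_uncached_list);
		spin_unlock_bh(&rt_uncached_lock);
	}
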
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 5840c3255721..1b5ce96707a3 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -184,7 +184,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
184 int ret; 184 int ret;
185 unsigned long vec[3]; 185 unsigned long vec[3];
186 struct net *net = current->nsproxy->net_ns; 186 struct net *net = current->nsproxy->net_ns;
187#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 187#ifdef CONFIG_MEMCG_KMEM
188 struct mem_cgroup *memcg; 188 struct mem_cgroup *memcg;
189#endif 189#endif
190 190
@@ -203,7 +203,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write,
203 if (ret) 203 if (ret)
204 return ret; 204 return ret;
205 205
206#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 206#ifdef CONFIG_MEMCG_KMEM
207 rcu_read_lock(); 207 rcu_read_lock();
208 memcg = mem_cgroup_from_task(current); 208 memcg = mem_cgroup_from_task(current);
209 209
@@ -784,13 +784,6 @@ static struct ctl_table ipv4_net_table[] = {
784 .proc_handler = proc_dointvec 784 .proc_handler = proc_dointvec
785 }, 785 },
786 { 786 {
787 .procname = "rt_cache_rebuild_count",
788 .data = &init_net.ipv4.sysctl_rt_cache_rebuild_count,
789 .maxlen = sizeof(int),
790 .mode = 0644,
791 .proc_handler = proc_dointvec
792 },
793 {
794 .procname = "ping_group_range", 787 .procname = "ping_group_range",
795 .data = &init_net.ipv4.sysctl_ping_group_range, 788 .data = &init_net.ipv4.sysctl_ping_group_range,
796 .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range), 789 .maxlen = sizeof(init_net.ipv4.sysctl_ping_group_range),
@@ -829,8 +822,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
829 table[5].data = 822 table[5].data =
830 &net->ipv4.sysctl_icmp_ratemask; 823 &net->ipv4.sysctl_icmp_ratemask;
831 table[6].data = 824 table[6].data =
832 &net->ipv4.sysctl_rt_cache_rebuild_count;
833 table[7].data =
834 &net->ipv4.sysctl_ping_group_range; 825 &net->ipv4.sysctl_ping_group_range;
835 826
836 } 827 }
@@ -842,8 +833,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
842 net->ipv4.sysctl_ping_group_range[0] = 1; 833 net->ipv4.sysctl_ping_group_range[0] = 1;
843 net->ipv4.sysctl_ping_group_range[1] = 0; 834 net->ipv4.sysctl_ping_group_range[1] = 0;
844 835
845 net->ipv4.sysctl_rt_cache_rebuild_count = 4;
846
847 tcp_init_mem(net); 836 tcp_init_mem(net);
848 837
849 net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); 838 net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 581ecf02c6b5..2109ff4a1daf 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -811,7 +811,9 @@ static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
811 old_size_goal + mss_now > xmit_size_goal)) { 811 old_size_goal + mss_now > xmit_size_goal)) {
812 xmit_size_goal = old_size_goal; 812 xmit_size_goal = old_size_goal;
813 } else { 813 } else {
814 tp->xmit_size_goal_segs = xmit_size_goal / mss_now; 814 tp->xmit_size_goal_segs =
815 min_t(u16, xmit_size_goal / mss_now,
816 sk->sk_gso_max_segs);
815 xmit_size_goal = tp->xmit_size_goal_segs * mss_now; 817 xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
816 } 818 }
817 } 819 }
@@ -2681,7 +2683,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
2681 /* Cap the max timeout in ms TCP will retry/retrans 2683 /* Cap the max timeout in ms TCP will retry/retrans
2682 * before giving up and aborting (ETIMEDOUT) a connection. 2684 * before giving up and aborting (ETIMEDOUT) a connection.
2683 */ 2685 */
2684 icsk->icsk_user_timeout = msecs_to_jiffies(val); 2686 if (val < 0)
2687 err = -EINVAL;
2688 else
2689 icsk->icsk_user_timeout = msecs_to_jiffies(val);
2685 break; 2690 break;
2686 default: 2691 default:
2687 err = -ENOPROTOOPT; 2692 err = -ENOPROTOOPT;
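
Both tcp.c changes are small guards: TCP_USER_TIMEOUT now rejects negative values, and the transmit size goal is capped by the device-advertised sk_gso_max_segs rather than by buffer size alone. A worked example of the cap, with illustrative numbers (not from the patch):

	/* mss_now = 1448, driver sets sk_gso_max_segs = 16 */
	tp->xmit_size_goal_segs = min_t(u16, xmit_size_goal / mss_now,
					sk->sk_gso_max_segs);	/* 45 -> 16 */
	xmit_size_goal = tp->xmit_size_goal_segs * mss_now;	/* 23168 bytes */
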
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4d4db16e336e..1432cdb0644c 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -291,7 +291,8 @@ bool tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight)
291 left = tp->snd_cwnd - in_flight; 291 left = tp->snd_cwnd - in_flight;
292 if (sk_can_gso(sk) && 292 if (sk_can_gso(sk) &&
293 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd && 293 left * sysctl_tcp_tso_win_divisor < tp->snd_cwnd &&
294 left * tp->mss_cache < sk->sk_gso_max_size) 294 left * tp->mss_cache < sk->sk_gso_max_size &&
295 left < sk->sk_gso_max_segs)
295 return true; 296 return true;
296 return left <= tcp_max_tso_deferred_mss(tp); 297 return left <= tcp_max_tso_deferred_mss(tp);
297} 298}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 3e07a64ca44e..6e38c6c23caa 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2926,13 +2926,14 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
2926 * tcp_xmit_retransmit_queue(). 2926 * tcp_xmit_retransmit_queue().
2927 */ 2927 */
2928static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, 2928static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2929 int newly_acked_sacked, bool is_dupack, 2929 int prior_sacked, bool is_dupack,
2930 int flag) 2930 int flag)
2931{ 2931{
2932 struct inet_connection_sock *icsk = inet_csk(sk); 2932 struct inet_connection_sock *icsk = inet_csk(sk);
2933 struct tcp_sock *tp = tcp_sk(sk); 2933 struct tcp_sock *tp = tcp_sk(sk);
2934 int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) && 2934 int do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
2935 (tcp_fackets_out(tp) > tp->reordering)); 2935 (tcp_fackets_out(tp) > tp->reordering));
2936 int newly_acked_sacked = 0;
2936 int fast_rexmit = 0; 2937 int fast_rexmit = 0;
2937 2938
2938 if (WARN_ON(!tp->packets_out && tp->sacked_out)) 2939 if (WARN_ON(!tp->packets_out && tp->sacked_out))
@@ -2992,6 +2993,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
2992 tcp_add_reno_sack(sk); 2993 tcp_add_reno_sack(sk);
2993 } else 2994 } else
2994 do_lost = tcp_try_undo_partial(sk, pkts_acked); 2995 do_lost = tcp_try_undo_partial(sk, pkts_acked);
2996 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
2995 break; 2997 break;
2996 case TCP_CA_Loss: 2998 case TCP_CA_Loss:
2997 if (flag & FLAG_DATA_ACKED) 2999 if (flag & FLAG_DATA_ACKED)
@@ -3013,6 +3015,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked,
3013 if (is_dupack) 3015 if (is_dupack)
3014 tcp_add_reno_sack(sk); 3016 tcp_add_reno_sack(sk);
3015 } 3017 }
3018 newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
3016 3019
3017 if (icsk->icsk_ca_state <= TCP_CA_Disorder) 3020 if (icsk->icsk_ca_state <= TCP_CA_Disorder)
3018 tcp_try_undo_dsack(sk); 3021 tcp_try_undo_dsack(sk);
@@ -3590,7 +3593,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3590 int prior_packets; 3593 int prior_packets;
3591 int prior_sacked = tp->sacked_out; 3594 int prior_sacked = tp->sacked_out;
3592 int pkts_acked = 0; 3595 int pkts_acked = 0;
3593 int newly_acked_sacked = 0;
3594 bool frto_cwnd = false; 3596 bool frto_cwnd = false;
3595 3597
3596 /* If the ack is older than previous acks 3598 /* If the ack is older than previous acks
@@ -3666,8 +3668,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3666 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una); 3668 flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una);
3667 3669
3668 pkts_acked = prior_packets - tp->packets_out; 3670 pkts_acked = prior_packets - tp->packets_out;
3669 newly_acked_sacked = (prior_packets - prior_sacked) -
3670 (tp->packets_out - tp->sacked_out);
3671 3671
3672 if (tp->frto_counter) 3672 if (tp->frto_counter)
3673 frto_cwnd = tcp_process_frto(sk, flag); 3673 frto_cwnd = tcp_process_frto(sk, flag);
@@ -3681,7 +3681,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3681 tcp_may_raise_cwnd(sk, flag)) 3681 tcp_may_raise_cwnd(sk, flag))
3682 tcp_cong_avoid(sk, ack, prior_in_flight); 3682 tcp_cong_avoid(sk, ack, prior_in_flight);
3683 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); 3683 is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
3684 tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, 3684 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3685 is_dupack, flag); 3685 is_dupack, flag);
3686 } else { 3686 } else {
3687 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) 3687 if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
@@ -3698,7 +3698,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
3698no_queue: 3698no_queue:
3699 /* If data was DSACKed, see if we can undo a cwnd reduction. */ 3699 /* If data was DSACKed, see if we can undo a cwnd reduction. */
3700 if (flag & FLAG_DSACKING_ACK) 3700 if (flag & FLAG_DSACKING_ACK)
3701 tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked, 3701 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3702 is_dupack, flag); 3702 is_dupack, flag);
3703 /* If this ack opens up a zero window, clear backoff. It was 3703 /* If this ack opens up a zero window, clear backoff. It was
3704 * being used to time the probes, and is probably far higher than 3704 * being used to time the probes, and is probably far higher than
@@ -3718,8 +3718,7 @@ old_ack:
3718 */ 3718 */
3719 if (TCP_SKB_CB(skb)->sacked) { 3719 if (TCP_SKB_CB(skb)->sacked) {
3720 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3720 flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
3721 newly_acked_sacked = tp->sacked_out - prior_sacked; 3721 tcp_fastretrans_alert(sk, pkts_acked, prior_sacked,
3722 tcp_fastretrans_alert(sk, pkts_acked, newly_acked_sacked,
3723 is_dupack, flag); 3722 is_dupack, flag);
3724 } 3723 }
3725 3724
@@ -4351,19 +4350,20 @@ static void tcp_ofo_queue(struct sock *sk)
4351static bool tcp_prune_ofo_queue(struct sock *sk); 4350static bool tcp_prune_ofo_queue(struct sock *sk);
4352static int tcp_prune_queue(struct sock *sk); 4351static int tcp_prune_queue(struct sock *sk);
4353 4352
4354static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) 4353static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb,
4354 unsigned int size)
4355{ 4355{
4356 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || 4356 if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
4357 !sk_rmem_schedule(sk, size)) { 4357 !sk_rmem_schedule(sk, skb, size)) {
4358 4358
4359 if (tcp_prune_queue(sk) < 0) 4359 if (tcp_prune_queue(sk) < 0)
4360 return -1; 4360 return -1;
4361 4361
4362 if (!sk_rmem_schedule(sk, size)) { 4362 if (!sk_rmem_schedule(sk, skb, size)) {
4363 if (!tcp_prune_ofo_queue(sk)) 4363 if (!tcp_prune_ofo_queue(sk))
4364 return -1; 4364 return -1;
4365 4365
4366 if (!sk_rmem_schedule(sk, size)) 4366 if (!sk_rmem_schedule(sk, skb, size))
4367 return -1; 4367 return -1;
4368 } 4368 }
4369 } 4369 }
@@ -4418,7 +4418,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
4418 4418
4419 TCP_ECN_check_ce(tp, skb); 4419 TCP_ECN_check_ce(tp, skb);
4420 4420
4421 if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) { 4421 if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) {
4422 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); 4422 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP);
4423 __kfree_skb(skb); 4423 __kfree_skb(skb);
4424 return; 4424 return;
@@ -4552,17 +4552,17 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
4552 4552
4553int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) 4553int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
4554{ 4554{
4555 struct sk_buff *skb; 4555 struct sk_buff *skb = NULL;
4556 struct tcphdr *th; 4556 struct tcphdr *th;
4557 bool fragstolen; 4557 bool fragstolen;
4558 4558
4559 if (tcp_try_rmem_schedule(sk, size + sizeof(*th)))
4560 goto err;
4561
4562 skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); 4559 skb = alloc_skb(size + sizeof(*th), sk->sk_allocation);
4563 if (!skb) 4560 if (!skb)
4564 goto err; 4561 goto err;
4565 4562
4563 if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th)))
4564 goto err_free;
4565
4566 th = (struct tcphdr *)skb_put(skb, sizeof(*th)); 4566 th = (struct tcphdr *)skb_put(skb, sizeof(*th));
4567 skb_reset_transport_header(skb); 4567 skb_reset_transport_header(skb);
4568 memset(th, 0, sizeof(*th)); 4568 memset(th, 0, sizeof(*th));
@@ -4633,7 +4633,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4633 if (eaten <= 0) { 4633 if (eaten <= 0) {
4634queue_and_out: 4634queue_and_out:
4635 if (eaten < 0 && 4635 if (eaten < 0 &&
4636 tcp_try_rmem_schedule(sk, skb->truesize)) 4636 tcp_try_rmem_schedule(sk, skb, skb->truesize))
4637 goto drop; 4637 goto drop;
4638 4638
4639 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); 4639 eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
@@ -5391,6 +5391,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5391{ 5391{
5392 struct tcp_sock *tp = tcp_sk(sk); 5392 struct tcp_sock *tp = tcp_sk(sk);
5393 5393
5394 if (unlikely(sk->sk_rx_dst == NULL))
5395 inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb);
5394 /* 5396 /*
5395 * Header prediction. 5397 * Header prediction.
5396 * The code loosely follows the one in the famous 5398 * The code loosely follows the one in the famous
@@ -5475,7 +5477,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5475 if (tp->copied_seq == tp->rcv_nxt && 5477 if (tp->copied_seq == tp->rcv_nxt &&
5476 len - tcp_header_len <= tp->ucopy.len) { 5478 len - tcp_header_len <= tp->ucopy.len) {
5477#ifdef CONFIG_NET_DMA 5479#ifdef CONFIG_NET_DMA
5478 if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { 5480 if (tp->ucopy.task == current &&
5481 sock_owned_by_user(sk) &&
5482 tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
5479 copied_early = 1; 5483 copied_early = 1;
5480 eaten = 1; 5484 eaten = 1;
5481 } 5485 }
@@ -5602,7 +5606,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
5602 tcp_set_state(sk, TCP_ESTABLISHED); 5606 tcp_set_state(sk, TCP_ESTABLISHED);
5603 5607
5604 if (skb != NULL) { 5608 if (skb != NULL) {
5605 sk->sk_rx_dst = dst_clone(skb_dst(skb)); 5609 icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
5606 security_inet_conn_established(sk, skb); 5610 security_inet_conn_established(sk, skb);
5607 } 5611 }
5608 5612
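
Among the tcp_input.c changes, the subtlest is moving newly_acked_sacked out of tcp_ack() and into tcp_fastretrans_alert(): it is now derived from the prior_sacked snapshot only after tcp_clean_rtx_queue() and the reno-sack adjustments have updated tp->sacked_out, so the count reflects those paths. The relation the relocated code relies on, spelled out:

	/* packets newly delivered or newly SACKed since the pre-ACK snapshot */
	newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked;
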
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 3e30548ac32a..00a748d14062 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -417,10 +417,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
417 417
418 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ 418 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
419 tp->mtu_info = info; 419 tp->mtu_info = info;
420 if (!sock_owned_by_user(sk)) 420 if (!sock_owned_by_user(sk)) {
421 tcp_v4_mtu_reduced(sk); 421 tcp_v4_mtu_reduced(sk);
422 else 422 } else {
423 set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); 423 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags))
424 sock_hold(sk);
425 }
424 goto out; 426 goto out;
425 } 427 }
426 428
@@ -1462,6 +1464,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1462 goto exit_nonewsk; 1464 goto exit_nonewsk;
1463 1465
1464 newsk->sk_gso_type = SKB_GSO_TCPV4; 1466 newsk->sk_gso_type = SKB_GSO_TCPV4;
1467 inet_sk_rx_dst_set(newsk, skb);
1465 1468
1466 newtp = tcp_sk(newsk); 1469 newtp = tcp_sk(newsk);
1467 newinet = inet_sk(newsk); 1470 newinet = inet_sk(newsk);
@@ -1617,21 +1620,16 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1617#endif 1620#endif
1618 1621
1619 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1622 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1623 struct dst_entry *dst = sk->sk_rx_dst;
1624
1620 sock_rps_save_rxhash(sk, skb); 1625 sock_rps_save_rxhash(sk, skb);
1621 if (sk->sk_rx_dst) { 1626 if (dst) {
1622 struct dst_entry *dst = sk->sk_rx_dst; 1627 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1623 if (dst->ops->check(dst, 0) == NULL) { 1628 dst->ops->check(dst, 0) == NULL) {
1624 dst_release(dst); 1629 dst_release(dst);
1625 sk->sk_rx_dst = NULL; 1630 sk->sk_rx_dst = NULL;
1626 } 1631 }
1627 } 1632 }
1628 if (unlikely(sk->sk_rx_dst == NULL)) {
1629 struct inet_sock *icsk = inet_sk(sk);
1630 struct rtable *rt = skb_rtable(skb);
1631
1632 sk->sk_rx_dst = dst_clone(&rt->dst);
1633 icsk->rx_dst_ifindex = inet_iif(skb);
1634 }
1635 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { 1633 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1636 rsk = sk; 1634 rsk = sk;
1637 goto reset; 1635 goto reset;
@@ -1686,7 +1684,6 @@ void tcp_v4_early_demux(struct sk_buff *skb)
1686 struct net *net = dev_net(skb->dev); 1684 struct net *net = dev_net(skb->dev);
1687 const struct iphdr *iph; 1685 const struct iphdr *iph;
1688 const struct tcphdr *th; 1686 const struct tcphdr *th;
1689 struct net_device *dev;
1690 struct sock *sk; 1687 struct sock *sk;
1691 1688
1692 if (skb->pkt_type != PACKET_HOST) 1689 if (skb->pkt_type != PACKET_HOST)
@@ -1701,24 +1698,20 @@ void tcp_v4_early_demux(struct sk_buff *skb)
1701 if (th->doff < sizeof(struct tcphdr) / 4) 1698 if (th->doff < sizeof(struct tcphdr) / 4)
1702 return; 1699 return;
1703 1700
1704 if (!pskb_may_pull(skb, ip_hdrlen(skb) + th->doff * 4))
1705 return;
1706
1707 dev = skb->dev;
1708 sk = __inet_lookup_established(net, &tcp_hashinfo, 1701 sk = __inet_lookup_established(net, &tcp_hashinfo,
1709 iph->saddr, th->source, 1702 iph->saddr, th->source,
1710 iph->daddr, ntohs(th->dest), 1703 iph->daddr, ntohs(th->dest),
1711 dev->ifindex); 1704 skb->skb_iif);
1712 if (sk) { 1705 if (sk) {
1713 skb->sk = sk; 1706 skb->sk = sk;
1714 skb->destructor = sock_edemux; 1707 skb->destructor = sock_edemux;
1715 if (sk->sk_state != TCP_TIME_WAIT) { 1708 if (sk->sk_state != TCP_TIME_WAIT) {
1716 struct dst_entry *dst = sk->sk_rx_dst; 1709 struct dst_entry *dst = sk->sk_rx_dst;
1717 struct inet_sock *icsk = inet_sk(sk); 1710
1718 if (dst) 1711 if (dst)
1719 dst = dst_check(dst, 0); 1712 dst = dst_check(dst, 0);
1720 if (dst && 1713 if (dst &&
1721 icsk->rx_dst_ifindex == dev->ifindex) 1714 inet_sk(sk)->rx_dst_ifindex == skb->skb_iif)
1722 skb_dst_set_noref(skb, dst); 1715 skb_dst_set_noref(skb, dst);
1723 } 1716 }
1724 } 1717 }
@@ -1879,10 +1872,21 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = {
1879 .twsk_destructor= tcp_twsk_destructor, 1872 .twsk_destructor= tcp_twsk_destructor,
1880}; 1873};
1881 1874
1875void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
1876{
1877 struct dst_entry *dst = skb_dst(skb);
1878
1879 dst_hold(dst);
1880 sk->sk_rx_dst = dst;
1881 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
1882}
1883EXPORT_SYMBOL(inet_sk_rx_dst_set);
1884
1882const struct inet_connection_sock_af_ops ipv4_specific = { 1885const struct inet_connection_sock_af_ops ipv4_specific = {
1883 .queue_xmit = ip_queue_xmit, 1886 .queue_xmit = ip_queue_xmit,
1884 .send_check = tcp_v4_send_check, 1887 .send_check = tcp_v4_send_check,
1885 .rebuild_header = inet_sk_rebuild_header, 1888 .rebuild_header = inet_sk_rebuild_header,
1889 .sk_rx_dst_set = inet_sk_rx_dst_set,
1886 .conn_request = tcp_v4_conn_request, 1890 .conn_request = tcp_v4_conn_request,
1887 .syn_recv_sock = tcp_v4_syn_recv_sock, 1891 .syn_recv_sock = tcp_v4_syn_recv_sock,
1888 .net_header_len = sizeof(struct iphdr), 1892 .net_header_len = sizeof(struct iphdr),
@@ -2640,7 +2644,7 @@ struct proto tcp_prot = {
2640 .compat_setsockopt = compat_tcp_setsockopt, 2644 .compat_setsockopt = compat_tcp_setsockopt,
2641 .compat_getsockopt = compat_tcp_getsockopt, 2645 .compat_getsockopt = compat_tcp_getsockopt,
2642#endif 2646#endif
2643#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 2647#ifdef CONFIG_MEMCG_KMEM
2644 .init_cgroup = tcp_init_cgroup, 2648 .init_cgroup = tcp_init_cgroup,
2645 .destroy_cgroup = tcp_destroy_cgroup, 2649 .destroy_cgroup = tcp_destroy_cgroup,
2646 .proto_cgroup = tcp_proto_cgroup, 2650 .proto_cgroup = tcp_proto_cgroup,
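
The tcp_ipv4.c hunks establish the full lifecycle of the cached receive dst: inet_sk_rx_dst_set() installs it when a child socket is created or when tcp_rcv_established() finds it missing, tcp_v4_early_demux() attaches it noref per packet, and the ESTABLISHED fast path revalidates it before use. The validate-or-drop step, condensed from the hunk above:

	struct dst_entry *dst = sk->sk_rx_dst;

	if (dst &&
	    (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
	     dst->ops->check(dst, 0) == NULL)) {
		dst_release(dst);
		sk->sk_rx_dst = NULL;	/* tcp_rcv_established() re-sets it */
	}
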
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index 2288a6399e1e..0abe67bb4d3a 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -731,6 +731,18 @@ static int __net_init tcp_net_metrics_init(struct net *net)
731 731
732static void __net_exit tcp_net_metrics_exit(struct net *net) 732static void __net_exit tcp_net_metrics_exit(struct net *net)
733{ 733{
734 unsigned int i;
735
736 for (i = 0; i < (1U << net->ipv4.tcp_metrics_hash_log) ; i++) {
737 struct tcp_metrics_block *tm, *next;
738
739 tm = rcu_dereference_protected(net->ipv4.tcp_metrics_hash[i].chain, 1);
740 while (tm) {
741 next = rcu_dereference_protected(tm->tcpm_next, 1);
742 kfree(tm);
743 tm = next;
744 }
745 }
734 kfree(net->ipv4.tcp_metrics_hash); 746 kfree(net->ipv4.tcp_metrics_hash);
735} 747}
736 748
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 5912ac3fd240..6ff7f10dce9d 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -387,8 +387,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
387 struct tcp_sock *oldtp = tcp_sk(sk); 387 struct tcp_sock *oldtp = tcp_sk(sk);
388 struct tcp_cookie_values *oldcvp = oldtp->cookie_values; 388 struct tcp_cookie_values *oldcvp = oldtp->cookie_values;
389 389
390 newsk->sk_rx_dst = dst_clone(skb_dst(skb));
391
392 /* TCP Cookie Transactions require space for the cookie pair, 390 /* TCP Cookie Transactions require space for the cookie pair,
393 * as it differs for each connection. There is no need to 391 * as it differs for each connection. There is no need to
394 * copy any s_data_payload stored at the original socket. 392 * copy any s_data_payload stored at the original socket.
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 33cd065cfbd8..d04632673a9e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -910,14 +910,18 @@ void tcp_release_cb(struct sock *sk)
910 if (flags & (1UL << TCP_TSQ_DEFERRED)) 910 if (flags & (1UL << TCP_TSQ_DEFERRED))
911 tcp_tsq_handler(sk); 911 tcp_tsq_handler(sk);
912 912
913 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) 913 if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
914 tcp_write_timer_handler(sk); 914 tcp_write_timer_handler(sk);
915 915 __sock_put(sk);
916 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) 916 }
917 if (flags & (1UL << TCP_DELACK_TIMER_DEFERRED)) {
917 tcp_delack_timer_handler(sk); 918 tcp_delack_timer_handler(sk);
918 919 __sock_put(sk);
919 if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) 920 }
921 if (flags & (1UL << TCP_MTU_REDUCED_DEFERRED)) {
920 sk->sk_prot->mtu_reduced(sk); 922 sk->sk_prot->mtu_reduced(sk);
923 __sock_put(sk);
924 }
921} 925}
922EXPORT_SYMBOL(tcp_release_cb); 926EXPORT_SYMBOL(tcp_release_cb);
923 927
@@ -940,7 +944,7 @@ void __init tcp_tasklet_init(void)
940 * We can't xmit new skbs from this context, as we might already 944
941 * hold qdisc lock. 945 * hold qdisc lock.
942 */ 946 */
943void tcp_wfree(struct sk_buff *skb) 947static void tcp_wfree(struct sk_buff *skb)
944{ 948{
945 struct sock *sk = skb->sk; 949 struct sock *sk = skb->sk;
946 struct tcp_sock *tp = tcp_sk(sk); 950 struct tcp_sock *tp = tcp_sk(sk);
@@ -1522,21 +1526,21 @@ static void tcp_cwnd_validate(struct sock *sk)
1522 * when we would be allowed to send the split-due-to-Nagle skb fully. 1526 * when we would be allowed to send the split-due-to-Nagle skb fully.
1523 */ 1527 */
1524static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb, 1528static unsigned int tcp_mss_split_point(const struct sock *sk, const struct sk_buff *skb,
1525 unsigned int mss_now, unsigned int cwnd) 1529 unsigned int mss_now, unsigned int max_segs)
1526{ 1530{
1527 const struct tcp_sock *tp = tcp_sk(sk); 1531 const struct tcp_sock *tp = tcp_sk(sk);
1528 u32 needed, window, cwnd_len; 1532 u32 needed, window, max_len;
1529 1533
1530 window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; 1534 window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
1531 cwnd_len = mss_now * cwnd; 1535 max_len = mss_now * max_segs;
1532 1536
1533 if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) 1537 if (likely(max_len <= window && skb != tcp_write_queue_tail(sk)))
1534 return cwnd_len; 1538 return max_len;
1535 1539
1536 needed = min(skb->len, window); 1540 needed = min(skb->len, window);
1537 1541
1538 if (cwnd_len <= needed) 1542 if (max_len <= needed)
1539 return cwnd_len; 1543 return max_len;
1540 1544
1541 return needed - needed % mss_now; 1545 return needed - needed % mss_now;
1542} 1546}
@@ -1765,7 +1769,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1765 limit = min(send_win, cong_win); 1769 limit = min(send_win, cong_win);
1766 1770
1767 /* If a full-sized TSO skb can be sent, do it. */ 1771 /* If a full-sized TSO skb can be sent, do it. */
1768 if (limit >= sk->sk_gso_max_size) 1772 if (limit >= min_t(unsigned int, sk->sk_gso_max_size,
1773 sk->sk_gso_max_segs * tp->mss_cache))
1769 goto send_now; 1774 goto send_now;
1770 1775
1771 /* Middle in queue won't get any more data, full sendable already? */ 1776 /* Middle in queue won't get any more data, full sendable already? */
@@ -1999,7 +2004,9 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
1999 limit = mss_now; 2004 limit = mss_now;
2000 if (tso_segs > 1 && !tcp_urg_mode(tp)) 2005 if (tso_segs > 1 && !tcp_urg_mode(tp))
2001 limit = tcp_mss_split_point(sk, skb, mss_now, 2006 limit = tcp_mss_split_point(sk, skb, mss_now,
2002 cwnd_quota); 2007 min_t(unsigned int,
2008 cwnd_quota,
2009 sk->sk_gso_max_segs));
2003 2010
2004 if (skb->len > limit && 2011 if (skb->len > limit &&
2005 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp))) 2012 unlikely(tso_fragment(sk, skb, limit, mss_now, gfp)))
@@ -2045,7 +2052,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
2045 if (unlikely(sk->sk_state == TCP_CLOSE)) 2052 if (unlikely(sk->sk_state == TCP_CLOSE))
2046 return; 2053 return;
2047 2054
2048 if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) 2055 if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
2056 sk_gfp_atomic(sk, GFP_ATOMIC)))
2049 tcp_check_probe_timer(sk); 2057 tcp_check_probe_timer(sk);
2050} 2058}
2051 2059
@@ -2666,7 +2674,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
2666 2674
2667 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) 2675 if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired)
2668 s_data_desired = cvp->s_data_desired; 2676 s_data_desired = cvp->s_data_desired;
2669 skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC); 2677 skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired,
2678 sk_gfp_atomic(sk, GFP_ATOMIC));
2670 if (unlikely(!skb)) { 2679 if (unlikely(!skb)) {
2671 dst_release(dst); 2680 dst_release(dst);
2672 return NULL; 2681 return NULL;
@@ -3064,7 +3073,7 @@ void tcp_send_ack(struct sock *sk)
3064 * tcp_transmit_skb() will set the ownership to this 3073 * tcp_transmit_skb() will set the ownership to this
3065 * sock. 3074 * sock.
3066 */ 3075 */
3067 buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); 3076 buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3068 if (buff == NULL) { 3077 if (buff == NULL) {
3069 inet_csk_schedule_ack(sk); 3078 inet_csk_schedule_ack(sk);
3070 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; 3079 inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
@@ -3079,7 +3088,7 @@ void tcp_send_ack(struct sock *sk)
3079 3088
3080 /* Send it off, this clears delayed acks for us. */ 3089 /* Send it off, this clears delayed acks for us. */
3081 TCP_SKB_CB(buff)->when = tcp_time_stamp; 3090 TCP_SKB_CB(buff)->when = tcp_time_stamp;
3082 tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); 3091 tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
3083} 3092}
3084 3093
3085/* This routine sends a packet with an out of date sequence 3094/* This routine sends a packet with an out of date sequence
@@ -3099,7 +3108,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
3099 struct sk_buff *skb; 3108 struct sk_buff *skb;
3100 3109
3101 /* We don't queue it, tcp_transmit_skb() sets ownership. */ 3110 /* We don't queue it, tcp_transmit_skb() sets ownership. */
3102 skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); 3111 skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
3103 if (skb == NULL) 3112 if (skb == NULL)
3104 return -1; 3113 return -1;
3105 3114
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6df36ad55a38..b774a03bd1dc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -252,7 +252,8 @@ static void tcp_delack_timer(unsigned long data)
252 inet_csk(sk)->icsk_ack.blocked = 1; 252 inet_csk(sk)->icsk_ack.blocked = 1;
253 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED); 253 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_DELAYEDACKLOCKED);
254 /* delegate our work to tcp_release_cb() */ 254
255 set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); 255 if (!test_and_set_bit(TCP_DELACK_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
256 sock_hold(sk);
256 } 257 }
257 bh_unlock_sock(sk); 258 bh_unlock_sock(sk);
258 sock_put(sk); 259 sock_put(sk);
@@ -481,7 +482,8 @@ static void tcp_write_timer(unsigned long data)
481 tcp_write_timer_handler(sk); 482 tcp_write_timer_handler(sk);
482 } else { 483 } else {
483 /* delegate our work to tcp_release_cb() */ 484
484 set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags); 485 if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tcp_sk(sk)->tsq_flags))
486 sock_hold(sk);
485 } 487 }
486 bh_unlock_sock(sk); 488 bh_unlock_sock(sk);
487 sock_put(sk); 489 sock_put(sk);
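
The tcp_output.c and tcp_timer.c hunks form one fix: a handler deferred to tcp_release_cb() now pins the socket. The timer side takes a reference only on the 0->1 transition of the flag bit, and tcp_release_cb() drops it once per bit it services, so the pairing stays exact even if the timer fires repeatedly while the socket is owned. The shape of the pairing, condensed:

	/* timer context (socket owned by user): defer and pin once per bit */
	if (!test_and_set_bit(TCP_WRITE_TIMER_DEFERRED, &tp->tsq_flags))
		sock_hold(sk);

	/* tcp_release_cb(): service and unpin */
	if (flags & (1UL << TCP_WRITE_TIMER_DEFERRED)) {
		tcp_write_timer_handler(sk);
		__sock_put(sk);		/* pairs with the sock_hold() above */
	}
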
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index b4c3582a991f..6f6d1aca3c3d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -758,7 +758,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4)
758 uh->check = CSUM_MANGLED_0; 758 uh->check = CSUM_MANGLED_0;
759 759
760send: 760send:
761 err = ip_send_skb(skb); 761 err = ip_send_skb(sock_net(sk), skb);
762 if (err) { 762 if (err) {
763 if (err == -ENOBUFS && !inet->recverr) { 763 if (err == -ENOBUFS && !inet->recverr) {
764 UDP_INC_STATS_USER(sock_net(sk), 764 UDP_INC_STATS_USER(sock_net(sk),
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 58d23a572509..06814b6216dc 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -27,8 +27,8 @@ static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
27 if (skb_dst(skb) == NULL) { 27 if (skb_dst(skb) == NULL) {
28 const struct iphdr *iph = ip_hdr(skb); 28 const struct iphdr *iph = ip_hdr(skb);
29 29
30 if (ip_route_input(skb, iph->daddr, iph->saddr, 30 if (ip_route_input_noref(skb, iph->daddr, iph->saddr,
31 iph->tos, skb->dev)) 31 iph->tos, skb->dev))
32 goto drop; 32 goto drop;
33 } 33 }
34 return dst_input(skb); 34 return dst_input(skb);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index c6281847f16a..681ea2f413e2 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -92,6 +92,7 @@ static int xfrm4_fill_dst(struct xfrm_dst *xdst, struct net_device *dev,
92 xdst->u.rt.rt_type = rt->rt_type; 92 xdst->u.rt.rt_type = rt->rt_type;
93 xdst->u.rt.rt_gateway = rt->rt_gateway; 93 xdst->u.rt.rt_gateway = rt->rt_gateway;
94 xdst->u.rt.rt_pmtu = rt->rt_pmtu; 94 xdst->u.rt.rt_pmtu = rt->rt_pmtu;
95 INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
95 96
96 return 0; 97 return 0;
97} 98}
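
A related detail: every rtable built by copying rather than through rt_dst_alloc() now initialises its rt_uncached list head, here for xfrm bundles and earlier for ipv4_blackhole_route(), so the uncached-list walkers always see a valid (if empty) node instead of an uninitialised list head.
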
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 79181819a24f..6bc85f7c31e3 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -494,8 +494,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
494 struct net_device *dev; 494 struct net_device *dev;
495 struct inet6_dev *idev; 495 struct inet6_dev *idev;
496 496
497 rcu_read_lock(); 497 for_each_netdev(net, dev) {
498 for_each_netdev_rcu(net, dev) {
499 idev = __in6_dev_get(dev); 498 idev = __in6_dev_get(dev);
500 if (idev) { 499 if (idev) {
501 int changed = (!idev->cnf.forwarding) ^ (!newf); 500 int changed = (!idev->cnf.forwarding) ^ (!newf);
@@ -504,7 +503,6 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
504 dev_forward_change(idev); 503 dev_forward_change(idev);
505 } 504 }
506 } 505 }
507 rcu_read_unlock();
508} 506}
509 507
510static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) 508static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf)
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 6dc7fd353ef5..282f3723ee19 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -167,8 +167,6 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
167 struct esp_data *esp = x->data; 167 struct esp_data *esp = x->data;
168 168
169 /* skb is pure payload to encrypt */ 169 /* skb is pure payload to encrypt */
170 err = -ENOMEM;
171
172 aead = esp->aead; 170 aead = esp->aead;
173 alen = crypto_aead_authsize(aead); 171 alen = crypto_aead_authsize(aead);
174 172
@@ -203,8 +201,10 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
203 } 201 }
204 202
205 tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen); 203 tmp = esp_alloc_tmp(aead, nfrags + sglists, seqhilen);
206 if (!tmp) 204 if (!tmp) {
205 err = -ENOMEM;
207 goto error; 206 goto error;
207 }
208 208
209 seqhi = esp_tmp_seqhi(tmp); 209 seqhi = esp_tmp_seqhi(tmp);
210 iv = esp_tmp_iv(aead, tmp, seqhilen); 210 iv = esp_tmp_iv(aead, tmp, seqhilen);
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 5ab923e51af3..a52d864d562b 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -47,9 +47,16 @@
47 47
48 48
49 49
50inline int ip6_rcv_finish( struct sk_buff *skb) 50int ip6_rcv_finish(struct sk_buff *skb)
51{ 51{
52 if (skb_dst(skb) == NULL) 52 if (sysctl_ip_early_demux && !skb_dst(skb)) {
53 const struct inet6_protocol *ipprot;
54
55 ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
56 if (ipprot && ipprot->early_demux)
57 ipprot->early_demux(skb);
58 }
59 if (!skb_dst(skb))
53 ip6_route_input(skb); 60 ip6_route_input(skb);
54 61
55 return dst_input(skb); 62 return dst_input(skb);
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index da2e92d05c15..745a32042950 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -307,10 +307,10 @@ static int __net_init ipv6_proc_init_net(struct net *net)
307 goto proc_dev_snmp6_fail; 307 goto proc_dev_snmp6_fail;
308 return 0; 308 return 0;
309 309
310proc_dev_snmp6_fail:
311 proc_net_remove(net, "snmp6");
310proc_snmp6_fail: 312proc_snmp6_fail:
311 proc_net_remove(net, "sockstat6"); 313 proc_net_remove(net, "sockstat6");
312proc_dev_snmp6_fail:
313 proc_net_remove(net, "dev_snmp6");
314 return -ENOMEM; 314 return -ENOMEM;
315} 315}
316 316
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index cf02cb97bbdd..8e80fd279100 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2480,12 +2480,8 @@ static int rt6_fill_node(struct net *net,
2480 goto nla_put_failure; 2480 goto nla_put_failure;
2481 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric)) 2481 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
2482 goto nla_put_failure; 2482 goto nla_put_failure;
2483 if (!(rt->rt6i_flags & RTF_EXPIRES)) 2483
2484 expires = 0; 2484 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
2485 else if (rt->dst.expires - jiffies < INT_MAX)
2486 expires = rt->dst.expires - jiffies;
2487 else
2488 expires = INT_MAX;
2489 2485
2490 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0) 2486 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
2491 goto nla_put_failure; 2487 goto nla_put_failure;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f49476e2d884..a3e60cc04a8a 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -94,6 +94,18 @@ static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(struct sock *sk,
94} 94}
95#endif 95#endif
96 96
97static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
98{
99 struct dst_entry *dst = skb_dst(skb);
100 const struct rt6_info *rt = (const struct rt6_info *)dst;
101
102 dst_hold(dst);
103 sk->sk_rx_dst = dst;
104 inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
105 if (rt->rt6i_node)
106 inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;
107}
108
97static void tcp_v6_hash(struct sock *sk) 109static void tcp_v6_hash(struct sock *sk)
98{ 110{
99 if (sk->sk_state != TCP_CLOSE) { 111 if (sk->sk_state != TCP_CLOSE) {
@@ -1270,6 +1282,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1270 1282
1271 newsk->sk_gso_type = SKB_GSO_TCPV6; 1283 newsk->sk_gso_type = SKB_GSO_TCPV6;
1272 __ip6_dst_store(newsk, dst, NULL, NULL); 1284 __ip6_dst_store(newsk, dst, NULL, NULL);
1285 inet6_sk_rx_dst_set(newsk, skb);
1273 1286
1274 newtcp6sk = (struct tcp6_sock *)newsk; 1287 newtcp6sk = (struct tcp6_sock *)newsk;
1275 inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; 1288 inet_sk(newsk)->pinet6 = &newtcp6sk->inet6;
@@ -1299,7 +1312,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1299 /* Clone pktoptions received with SYN */ 1312 /* Clone pktoptions received with SYN */
1300 newnp->pktoptions = NULL; 1313 newnp->pktoptions = NULL;
1301 if (treq->pktopts != NULL) { 1314 if (treq->pktopts != NULL) {
1302 newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC); 1315 newnp->pktoptions = skb_clone(treq->pktopts,
1316 sk_gfp_atomic(sk, GFP_ATOMIC));
1303 consume_skb(treq->pktopts); 1317 consume_skb(treq->pktopts);
1304 treq->pktopts = NULL; 1318 treq->pktopts = NULL;
1305 if (newnp->pktoptions) 1319 if (newnp->pktoptions)
@@ -1349,7 +1363,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1349 * across. Shucks. 1363 * across. Shucks.
1350 */ 1364 */
1351 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newnp->daddr, 1365 tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newnp->daddr,
1352 AF_INET6, key->key, key->keylen, GFP_ATOMIC); 1366 AF_INET6, key->key, key->keylen,
1367 sk_gfp_atomic(sk, GFP_ATOMIC));
1353 } 1368 }
1354#endif 1369#endif
1355 1370
@@ -1442,10 +1457,20 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
1442 --ANK (980728) 1457 --ANK (980728)
1443 */ 1458 */
1444 if (np->rxopt.all) 1459 if (np->rxopt.all)
1445 opt_skb = skb_clone(skb, GFP_ATOMIC); 1460 opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC));
1446 1461
1447 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ 1462 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1463 struct dst_entry *dst = sk->sk_rx_dst;
1464
1448 sock_rps_save_rxhash(sk, skb); 1465 sock_rps_save_rxhash(sk, skb);
1466 if (dst) {
1467 if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif ||
1468 dst->ops->check(dst, np->rx_dst_cookie) == NULL) {
1469 dst_release(dst);
1470 sk->sk_rx_dst = NULL;
1471 }
1472 }
1473
1449 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) 1474 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len))
1450 goto reset; 1475 goto reset;
1451 if (opt_skb) 1476 if (opt_skb)
@@ -1674,6 +1699,43 @@ do_time_wait:
1674 goto discard_it; 1699 goto discard_it;
1675} 1700}
1676 1701
1702static void tcp_v6_early_demux(struct sk_buff *skb)
1703{
1704 const struct ipv6hdr *hdr;
1705 const struct tcphdr *th;
1706 struct sock *sk;
1707
1708 if (skb->pkt_type != PACKET_HOST)
1709 return;
1710
1711 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1712 return;
1713
1714 hdr = ipv6_hdr(skb);
1715 th = tcp_hdr(skb);
1716
1717 if (th->doff < sizeof(struct tcphdr) / 4)
1718 return;
1719
1720 sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1721 &hdr->saddr, th->source,
1722 &hdr->daddr, ntohs(th->dest),
1723 inet6_iif(skb));
1724 if (sk) {
1725 skb->sk = sk;
1726 skb->destructor = sock_edemux;
1727 if (sk->sk_state != TCP_TIME_WAIT) {
1728 struct dst_entry *dst = sk->sk_rx_dst;
1729 struct inet_sock *icsk = inet_sk(sk);
1730 if (dst)
1731 dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
1732 if (dst &&
1733 icsk->rx_dst_ifindex == skb->skb_iif)
1734 skb_dst_set_noref(skb, dst);
1735 }
1736 }
1737}
1738
1677static struct timewait_sock_ops tcp6_timewait_sock_ops = { 1739static struct timewait_sock_ops tcp6_timewait_sock_ops = {
1678 .twsk_obj_size = sizeof(struct tcp6_timewait_sock), 1740 .twsk_obj_size = sizeof(struct tcp6_timewait_sock),
1679 .twsk_unique = tcp_twsk_unique, 1741 .twsk_unique = tcp_twsk_unique,
@@ -1684,6 +1746,7 @@ static const struct inet_connection_sock_af_ops ipv6_specific = {
1684 .queue_xmit = inet6_csk_xmit, 1746 .queue_xmit = inet6_csk_xmit,
1685 .send_check = tcp_v6_send_check, 1747 .send_check = tcp_v6_send_check,
1686 .rebuild_header = inet6_sk_rebuild_header, 1748 .rebuild_header = inet6_sk_rebuild_header,
1749 .sk_rx_dst_set = inet6_sk_rx_dst_set,
1687 .conn_request = tcp_v6_conn_request, 1750 .conn_request = tcp_v6_conn_request,
1688 .syn_recv_sock = tcp_v6_syn_recv_sock, 1751 .syn_recv_sock = tcp_v6_syn_recv_sock,
1689 .net_header_len = sizeof(struct ipv6hdr), 1752 .net_header_len = sizeof(struct ipv6hdr),
@@ -1715,6 +1778,7 @@ static const struct inet_connection_sock_af_ops ipv6_mapped = {
1715 .queue_xmit = ip_queue_xmit, 1778 .queue_xmit = ip_queue_xmit,
1716 .send_check = tcp_v4_send_check, 1779 .send_check = tcp_v4_send_check,
1717 .rebuild_header = inet_sk_rebuild_header, 1780 .rebuild_header = inet_sk_rebuild_header,
1781 .sk_rx_dst_set = inet_sk_rx_dst_set,
1718 .conn_request = tcp_v6_conn_request, 1782 .conn_request = tcp_v6_conn_request,
1719 .syn_recv_sock = tcp_v6_syn_recv_sock, 1783 .syn_recv_sock = tcp_v6_syn_recv_sock,
1720 .net_header_len = sizeof(struct iphdr), 1784 .net_header_len = sizeof(struct iphdr),
@@ -1978,12 +2042,13 @@ struct proto tcpv6_prot = {
1978 .compat_setsockopt = compat_tcp_setsockopt, 2042 .compat_setsockopt = compat_tcp_setsockopt,
1979 .compat_getsockopt = compat_tcp_getsockopt, 2043 .compat_getsockopt = compat_tcp_getsockopt,
1980#endif 2044#endif
1981#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 2045#ifdef CONFIG_MEMCG_KMEM
1982 .proto_cgroup = tcp_proto_cgroup, 2046 .proto_cgroup = tcp_proto_cgroup,
1983#endif 2047#endif
1984}; 2048};
1985 2049
1986static const struct inet6_protocol tcpv6_protocol = { 2050static const struct inet6_protocol tcpv6_protocol = {
2051 .early_demux = tcp_v6_early_demux,
1987 .handler = tcp_v6_rcv, 2052 .handler = tcp_v6_rcv,
1988 .err_handler = tcp_v6_err, 2053 .err_handler = tcp_v6_err,
1989 .gso_send_check = tcp_v6_gso_send_check, 2054 .gso_send_check = tcp_v6_gso_send_check,
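
tcp_ipv6.c mirrors the IPv4 demux work with one extra ingredient: the cached dst is validated against a generation cookie, the fib6 node serial number captured when the dst was installed, so routing-table changes invalidate it. Condensed from the hunks above:

	/* set: remember which routing-tree generation produced the dst */
	if (rt->rt6i_node)
		inet6_sk(sk)->rx_dst_cookie = rt->rt6i_node->fn_sernum;

	/* check: a table change bumps fn_sernum and fails dst_check() */
	dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
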
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index ef39812107b1..f8c4c08ffb60 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -73,6 +73,13 @@ static int xfrm6_get_tos(const struct flowi *fl)
73 return 0; 73 return 0;
74} 74}
75 75
76static void xfrm6_init_dst(struct net *net, struct xfrm_dst *xdst)
77{
78 struct rt6_info *rt = (struct rt6_info *)xdst;
79
80 rt6_init_peer(rt, net->ipv6.peers);
81}
82
76static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst, 83static int xfrm6_init_path(struct xfrm_dst *path, struct dst_entry *dst,
77 int nfheader_len) 84 int nfheader_len)
78{ 85{
@@ -286,6 +293,7 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
286 .get_saddr = xfrm6_get_saddr, 293 .get_saddr = xfrm6_get_saddr,
287 .decode_session = _decode_session6, 294 .decode_session = _decode_session6,
288 .get_tos = xfrm6_get_tos, 295 .get_tos = xfrm6_get_tos,
296 .init_dst = xfrm6_init_dst,
289 .init_path = xfrm6_init_path, 297 .init_path = xfrm6_init_path,
290 .fill_dst = xfrm6_fill_dst, 298 .fill_dst = xfrm6_fill_dst,
291 .blackhole_route = ip6_blackhole_route, 299 .blackhole_route = ip6_blackhole_route,
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 393355d37b47..513cab08a986 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -1347,11 +1347,10 @@ static void l2tp_tunnel_free(struct l2tp_tunnel *tunnel)
1347 /* Remove from tunnel list */ 1347 /* Remove from tunnel list */
1348 spin_lock_bh(&pn->l2tp_tunnel_list_lock); 1348 spin_lock_bh(&pn->l2tp_tunnel_list_lock);
1349 list_del_rcu(&tunnel->list); 1349 list_del_rcu(&tunnel->list);
1350 kfree_rcu(tunnel, rcu);
1350 spin_unlock_bh(&pn->l2tp_tunnel_list_lock); 1351 spin_unlock_bh(&pn->l2tp_tunnel_list_lock);
1351 synchronize_rcu();
1352 1352
1353 atomic_dec(&l2tp_tunnel_count); 1353 atomic_dec(&l2tp_tunnel_count);
1354 kfree(tunnel);
1355} 1354}
1356 1355
1357/* Create a socket for the tunnel, if one isn't set up by 1356/* Create a socket for the tunnel, if one isn't set up by
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index a38ec6cdeee1..56d583e083a7 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -163,6 +163,7 @@ struct l2tp_tunnel_cfg {
163 163
164struct l2tp_tunnel { 164struct l2tp_tunnel {
165 int magic; /* Should be L2TP_TUNNEL_MAGIC */ 165 int magic; /* Should be L2TP_TUNNEL_MAGIC */
166 struct rcu_head rcu;
166 rwlock_t hlist_lock; /* protect session_hlist */ 167 rwlock_t hlist_lock; /* protect session_hlist */
167 struct hlist_head session_hlist[L2TP_HASH_SIZE]; 168 struct hlist_head session_hlist[L2TP_HASH_SIZE];
168 /* hashed list of sessions, 169 /* hashed list of sessions,
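
The two l2tp hunks are a textbook synchronize_rcu()-to-kfree_rcu() conversion: embed an rcu_head in the object, unlink under the list lock, and let the free happen after a grace period instead of blocking the caller. A generic sketch of the pattern, with hypothetical obj/obj_lock names:

	static DEFINE_SPINLOCK(obj_lock);

	struct obj {
		struct list_head list;
		struct rcu_head rcu;	/* storage for the deferred free */
	};

	static void obj_unlink_and_free(struct obj *o)
	{
		spin_lock_bh(&obj_lock);
		list_del_rcu(&o->list);
		kfree_rcu(o, rcu);	/* freed after a grace period; never blocks */
		spin_unlock_bh(&obj_lock);
	}
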
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 35e1e4bde587..927547171bc7 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -410,6 +410,7 @@ static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
410 lsa->l2tp_family = AF_INET6; 410 lsa->l2tp_family = AF_INET6;
411 lsa->l2tp_flowinfo = 0; 411 lsa->l2tp_flowinfo = 0;
412 lsa->l2tp_scope_id = 0; 412 lsa->l2tp_scope_id = 0;
413 lsa->l2tp_unused = 0;
413 if (peer) { 414 if (peer) {
414 if (!lsk->peer_conn_id) 415 if (!lsk->peer_conn_id)
415 return -ENOTCONN; 416 return -ENOTCONN;
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index f6fe4d400502..c2190005a114 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -969,14 +969,13 @@ static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr,
969 struct sockaddr_llc sllc; 969 struct sockaddr_llc sllc;
970 struct sock *sk = sock->sk; 970 struct sock *sk = sock->sk;
971 struct llc_sock *llc = llc_sk(sk); 971 struct llc_sock *llc = llc_sk(sk);
972 int rc = 0; 972 int rc = -EBADF;
973 973
974 memset(&sllc, 0, sizeof(sllc)); 974 memset(&sllc, 0, sizeof(sllc));
975 lock_sock(sk); 975 lock_sock(sk);
976 if (sock_flag(sk, SOCK_ZAPPED)) 976 if (sock_flag(sk, SOCK_ZAPPED))
977 goto out; 977 goto out;
978 *uaddrlen = sizeof(sllc); 978 *uaddrlen = sizeof(sllc);
979 memset(uaddr, 0, *uaddrlen);
980 if (peer) { 979 if (peer) {
981 rc = -ENOTCONN; 980 rc = -ENOTCONN;
982 if (sk->sk_state != TCP_ESTABLISHED) 981 if (sk->sk_state != TCP_ESTABLISHED)
@@ -1206,7 +1205,7 @@ static int __init llc2_init(void)
1206 rc = llc_proc_init(); 1205 rc = llc_proc_init();
1207 if (rc != 0) { 1206 if (rc != 0) {
1208 printk(llc_proc_err_msg); 1207 printk(llc_proc_err_msg);
1209 goto out_unregister_llc_proto; 1208 goto out_station;
1210 } 1209 }
1211 rc = llc_sysctl_init(); 1210 rc = llc_sysctl_init();
1212 if (rc) { 1211 if (rc) {
@@ -1226,7 +1225,8 @@ out_sysctl:
1226 llc_sysctl_exit(); 1225 llc_sysctl_exit();
1227out_proc: 1226out_proc:
1228 llc_proc_exit(); 1227 llc_proc_exit();
1229out_unregister_llc_proto: 1228out_station:
1229 llc_station_exit();
1230 proto_unregister(&llc_proto); 1230 proto_unregister(&llc_proto);
1231 goto out; 1231 goto out;
1232} 1232}
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
index e32cab44ea95..dd3e83328ad5 100644
--- a/net/llc/llc_input.c
+++ b/net/llc/llc_input.c
@@ -42,6 +42,7 @@ static void (*llc_type_handlers[2])(struct llc_sap *sap,
42void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, 42void llc_add_pack(int type, void (*handler)(struct llc_sap *sap,
43 struct sk_buff *skb)) 43 struct sk_buff *skb))
44{ 44{
45 smp_wmb(); /* ensure initialisation is complete before it's called */
45 if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) 46 if (type == LLC_DEST_SAP || type == LLC_DEST_CONN)
46 llc_type_handlers[type - 1] = handler; 47 llc_type_handlers[type - 1] = handler;
47} 48}
@@ -50,11 +51,19 @@ void llc_remove_pack(int type)
50{ 51{
51 if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) 52 if (type == LLC_DEST_SAP || type == LLC_DEST_CONN)
52 llc_type_handlers[type - 1] = NULL; 53 llc_type_handlers[type - 1] = NULL;
54 synchronize_net();
53} 55}
54 56
55void llc_set_station_handler(void (*handler)(struct sk_buff *skb)) 57void llc_set_station_handler(void (*handler)(struct sk_buff *skb))
56{ 58{
59 /* Ensure initialisation is complete before it's called */
60 if (handler)
61 smp_wmb();
62
57 llc_station_handler = handler; 63 llc_station_handler = handler;
64
65 if (!handler)
66 synchronize_net();
58} 67}
59 68
60/** 69/**
@@ -150,6 +159,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
150 int dest; 159 int dest;
151 int (*rcv)(struct sk_buff *, struct net_device *, 160 int (*rcv)(struct sk_buff *, struct net_device *,
152 struct packet_type *, struct net_device *); 161 struct packet_type *, struct net_device *);
162 void (*sta_handler)(struct sk_buff *skb);
163 void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb);
153 164
154 if (!net_eq(dev_net(dev), &init_net)) 165 if (!net_eq(dev_net(dev), &init_net))
155 goto drop; 166 goto drop;
@@ -182,7 +193,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
182 */ 193 */
183 rcv = rcu_dereference(sap->rcv_func); 194 rcv = rcu_dereference(sap->rcv_func);
184 dest = llc_pdu_type(skb); 195 dest = llc_pdu_type(skb);
185 if (unlikely(!dest || !llc_type_handlers[dest - 1])) { 196 sap_handler = dest ? ACCESS_ONCE(llc_type_handlers[dest - 1]) : NULL;
197 if (unlikely(!sap_handler)) {
186 if (rcv) 198 if (rcv)
187 rcv(skb, dev, pt, orig_dev); 199 rcv(skb, dev, pt, orig_dev);
188 else 200 else
@@ -193,7 +205,7 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev,
193 if (cskb) 205 if (cskb)
194 rcv(cskb, dev, pt, orig_dev); 206 rcv(cskb, dev, pt, orig_dev);
195 } 207 }
196 llc_type_handlers[dest - 1](sap, skb); 208 sap_handler(sap, skb);
197 } 209 }
198 llc_sap_put(sap); 210 llc_sap_put(sap);
199out: 211out:
@@ -202,9 +214,10 @@ drop:
202 kfree_skb(skb); 214 kfree_skb(skb);
203 goto out; 215 goto out;
204handle_station: 216handle_station:
205 if (!llc_station_handler) 217 sta_handler = ACCESS_ONCE(llc_station_handler);
218 if (!sta_handler)
206 goto drop; 219 goto drop;
207 llc_station_handler(skb); 220 sta_handler(skb);
208 goto out; 221 goto out;
209} 222}
210 223
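
Note: the three llc_input.c changes form one publication pattern: the writer issues smp_wmb() before storing a handler pointer, llc_rcv() snapshots the pointer once with ACCESS_ONCE() so the test and the call see the same value, and clearing a handler is followed by synchronize_net() so in-flight receive softirqs drain before the handler's owner goes away. A standalone sketch of the pattern, with a hypothetical pub_handler in place of the LLC globals:

    static void (*pub_handler)(struct sk_buff *skb);

    static void publish(void (*h)(struct sk_buff *skb))
    {
            smp_wmb();              /* handler state visible before the pointer */
            pub_handler = h;
    }

    static void unpublish(void)
    {
            pub_handler = NULL;
            synchronize_net();      /* wait for packet-RX readers to finish */
    }

    static void reader(struct sk_buff *skb)         /* softirq context */
    {
            void (*h)(struct sk_buff *skb) = ACCESS_ONCE(pub_handler);

            if (h)
                    h(skb);         /* the snapshot cannot change under us */
    }
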
diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c
index 39a8d8924b9c..b2f2bac2c2a2 100644
--- a/net/llc/llc_station.c
+++ b/net/llc/llc_station.c
@@ -268,7 +268,7 @@ static int llc_station_ac_send_null_dsap_xid_c(struct sk_buff *skb)
268out: 268out:
269 return rc; 269 return rc;
270free: 270free:
271 kfree_skb(skb); 271 kfree_skb(nskb);
272 goto out; 272 goto out;
273} 273}
274 274
@@ -293,7 +293,7 @@ static int llc_station_ac_send_xid_r(struct sk_buff *skb)
293out: 293out:
294 return rc; 294 return rc;
295free: 295free:
296 kfree_skb(skb); 296 kfree_skb(nskb);
297 goto out; 297 goto out;
298} 298}
299 299
@@ -322,7 +322,7 @@ static int llc_station_ac_send_test_r(struct sk_buff *skb)
322out: 322out:
323 return rc; 323 return rc;
324free: 324free:
325 kfree_skb(skb); 325 kfree_skb(nskb);
326 goto out; 326 goto out;
327} 327}
328 328
@@ -687,12 +687,8 @@ static void llc_station_rcv(struct sk_buff *skb)
687 llc_station_state_process(skb); 687 llc_station_state_process(skb);
688} 688}
689 689
690int __init llc_station_init(void) 690void __init llc_station_init(void)
691{ 691{
692 int rc = -ENOBUFS;
693 struct sk_buff *skb;
694 struct llc_station_state_ev *ev;
695
696 skb_queue_head_init(&llc_main_station.mac_pdu_q); 692 skb_queue_head_init(&llc_main_station.mac_pdu_q);
697 skb_queue_head_init(&llc_main_station.ev_q.list); 693 skb_queue_head_init(&llc_main_station.ev_q.list);
698 spin_lock_init(&llc_main_station.ev_q.lock); 694 spin_lock_init(&llc_main_station.ev_q.lock);
@@ -700,23 +696,12 @@ int __init llc_station_init(void)
700 (unsigned long)&llc_main_station); 696 (unsigned long)&llc_main_station);
701 llc_main_station.ack_timer.expires = jiffies + 697 llc_main_station.ack_timer.expires = jiffies +
702 sysctl_llc_station_ack_timeout; 698 sysctl_llc_station_ack_timeout;
703 skb = alloc_skb(0, GFP_ATOMIC);
704 if (!skb)
705 goto out;
706 rc = 0;
707 llc_set_station_handler(llc_station_rcv);
708 ev = llc_station_ev(skb);
709 memset(ev, 0, sizeof(*ev));
710 llc_main_station.maximum_retry = 1; 699 llc_main_station.maximum_retry = 1;
711 llc_main_station.state = LLC_STATION_STATE_DOWN; 700 llc_main_station.state = LLC_STATION_STATE_UP;
712 ev->type = LLC_STATION_EV_TYPE_SIMPLE; 701 llc_set_station_handler(llc_station_rcv);
713 ev->prim_type = LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK;
714 rc = llc_station_next_state(skb);
715out:
716 return rc;
717} 702}
718 703
719void __exit llc_station_exit(void) 704void llc_station_exit(void)
720{ 705{
721 llc_set_station_handler(NULL); 706 llc_set_station_handler(NULL);
722} 707}
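
Note: two separate fixes here. The send helpers were freeing skb, the received frame the caller still owns, instead of nskb, the reply they had just allocated; and station init no longer queues a duplicate-address-check event, so it cannot fail, starts directly in LLC_STATION_STATE_UP, and installs the receive handler only once everything else is set up. The alloc/free pairing, sketched with hypothetical fill_reply()/queue_reply() helpers:

    static int send_reply(struct sk_buff *skb)      /* skb owned by the caller */
    {
            struct sk_buff *nskb = alloc_skb(128, GFP_ATOMIC);

            if (!nskb)
                    return -ENOBUFS;
            if (fill_reply(nskb, skb) < 0) {        /* hypothetical */
                    kfree_skb(nskb);                /* free what *we* allocated */
                    return -EINVAL;
            }
            return queue_reply(nskb);               /* hypothetical */
    }
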
diff --git a/net/mac80211/led.c b/net/mac80211/led.c
index 1bf7903496f8..bcffa6903129 100644
--- a/net/mac80211/led.c
+++ b/net/mac80211/led.c
@@ -276,7 +276,7 @@ static void ieee80211_stop_tpt_led_trig(struct ieee80211_local *local)
276 276
277 read_lock(&tpt_trig->trig.leddev_list_lock); 277 read_lock(&tpt_trig->trig.leddev_list_lock);
278 list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list) 278 list_for_each_entry(led_cdev, &tpt_trig->trig.led_cdevs, trig_list)
279 led_brightness_set(led_cdev, LED_OFF); 279 led_set_brightness(led_cdev, LED_OFF);
280 read_unlock(&tpt_trig->trig.leddev_list_lock); 280 read_unlock(&tpt_trig->trig.leddev_list_lock);
281} 281}
282 282
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index 6fac18c0423f..85572353a7e3 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -622,6 +622,7 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata)
622 622
623 del_timer_sync(&sdata->u.mesh.housekeeping_timer); 623 del_timer_sync(&sdata->u.mesh.housekeeping_timer);
624 del_timer_sync(&sdata->u.mesh.mesh_path_root_timer); 624 del_timer_sync(&sdata->u.mesh.mesh_path_root_timer);
625 del_timer_sync(&sdata->u.mesh.mesh_path_timer);
625 /* 626 /*
626 * If the timer fired while we waited for it, it will have 627 * If the timer fired while we waited for it, it will have
627 * requeued the work. Now the work will be running again 628 * requeued the work. Now the work will be running again
@@ -634,6 +635,8 @@ void ieee80211_stop_mesh(struct ieee80211_sub_if_data *sdata)
634 local->fif_other_bss--; 635 local->fif_other_bss--;
635 atomic_dec(&local->iff_allmultis); 636 atomic_dec(&local->iff_allmultis);
636 ieee80211_configure_filter(local); 637 ieee80211_configure_filter(local);
638
639 sdata->u.mesh.timers_running = 0;
637} 640}
638 641
639static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, 642static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index cef0c9e79aba..a4a5acdbaa4d 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -1430,6 +1430,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
1430 del_timer_sync(&sdata->u.mgd.bcn_mon_timer); 1430 del_timer_sync(&sdata->u.mgd.bcn_mon_timer);
1431 del_timer_sync(&sdata->u.mgd.timer); 1431 del_timer_sync(&sdata->u.mgd.timer);
1432 del_timer_sync(&sdata->u.mgd.chswitch_timer); 1432 del_timer_sync(&sdata->u.mgd.chswitch_timer);
1433
1434 sdata->u.mgd.timers_running = 0;
1433} 1435}
1434 1436
1435void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, 1437void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata,
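
Note: here, as in the mesh teardown above, the interface records which timers are running so the suspend path can quiesce them and resume can rearm them; clearing timers_running after the del_timer_sync() calls keeps a later restart from rearming timers on an interface that has since disassociated (or stopped its mesh). The quiesce shape, in outline:

    del_timer_sync(&sdata->u.mgd.timer);    /* timers guaranteed idle */
    del_timer_sync(&sdata->u.mgd.chswitch_timer);
    sdata->u.mgd.timers_running = 0;        /* nothing left for resume to restart */
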
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index bcaee5d12839..839dd9737989 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -299,7 +299,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted,
299 if (local->scan_req != local->int_scan_req) 299 if (local->scan_req != local->int_scan_req)
300 cfg80211_scan_done(local->scan_req, aborted); 300 cfg80211_scan_done(local->scan_req, aborted);
301 local->scan_req = NULL; 301 local->scan_req = NULL;
302 local->scan_sdata = NULL; 302 rcu_assign_pointer(local->scan_sdata, NULL);
303 303
304 local->scanning = 0; 304 local->scanning = 0;
305 local->scan_channel = NULL; 305 local->scan_channel = NULL;
@@ -984,7 +984,6 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_sub_if_data *sdata)
984 kfree(local->sched_scan_ies.ie[i]); 984 kfree(local->sched_scan_ies.ie[i]);
985 985
986 drv_sched_scan_stop(local, sdata); 986 drv_sched_scan_stop(local, sdata);
987 rcu_assign_pointer(local->sched_scan_sdata, NULL);
988 } 987 }
989out: 988out:
990 mutex_unlock(&local->mtx); 989 mutex_unlock(&local->mtx);
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index acf712ffb5e6..c5e8c9c31f76 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1811,37 +1811,31 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb,
1811 meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr, 1811 meshhdrlen = ieee80211_new_mesh_header(&mesh_hdr,
1812 sdata, NULL, NULL); 1812 sdata, NULL, NULL);
1813 } else { 1813 } else {
1814 int is_mesh_mcast = 1; 1814 /* DS -> MBSS (802.11-2012 13.11.3.3).
1815 const u8 *mesh_da; 1815 * For unicast with unknown forwarding information,
1816 * destination might be in the MBSS or if that fails
1817 * forwarded to another mesh gate. In either case
1818 * resolution will be handled in ieee80211_xmit(), so
1819 * leave the original DA. This also works for mcast */
1820 const u8 *mesh_da = skb->data;
1821
1822 if (mppath)
1823 mesh_da = mppath->mpp;
1824 else if (mpath)
1825 mesh_da = mpath->dst;
1826 rcu_read_unlock();
1816 1827
1817 if (is_multicast_ether_addr(skb->data))
1818 /* DA TA mSA AE:SA */
1819 mesh_da = skb->data;
1820 else {
1821 static const u8 bcast[ETH_ALEN] =
1822 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
1823 if (mppath) {
1824 /* RA TA mDA mSA AE:DA SA */
1825 mesh_da = mppath->mpp;
1826 is_mesh_mcast = 0;
1827 } else if (mpath) {
1828 mesh_da = mpath->dst;
1829 is_mesh_mcast = 0;
1830 } else {
1831 /* DA TA mSA AE:SA */
1832 mesh_da = bcast;
1833 }
1834 }
1835 hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc, 1828 hdrlen = ieee80211_fill_mesh_addresses(&hdr, &fc,
1836 mesh_da, sdata->vif.addr); 1829 mesh_da, sdata->vif.addr);
1837 rcu_read_unlock(); 1830 if (is_multicast_ether_addr(mesh_da))
1838 if (is_mesh_mcast) 1831 /* DA TA mSA AE:SA */
1839 meshhdrlen = 1832 meshhdrlen =
1840 ieee80211_new_mesh_header(&mesh_hdr, 1833 ieee80211_new_mesh_header(&mesh_hdr,
1841 sdata, 1834 sdata,
1842 skb->data + ETH_ALEN, 1835 skb->data + ETH_ALEN,
1843 NULL); 1836 NULL);
1844 else 1837 else
1838 /* RA TA mDA mSA AE:DA SA */
1845 meshhdrlen = 1839 meshhdrlen =
1846 ieee80211_new_mesh_header(&mesh_hdr, 1840 ieee80211_new_mesh_header(&mesh_hdr,
1847 sdata, 1841 sdata,
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 84444dda194b..f51013c07b9f 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1171,8 +1171,10 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1171 goto out_err; 1171 goto out_err;
1172 } 1172 }
1173 svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); 1173 svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1174 if (!svc->stats.cpustats) 1174 if (!svc->stats.cpustats) {
1175 ret = -ENOMEM;
1175 goto out_err; 1176 goto out_err;
1177 }
1176 1178
1177 /* I'm the first user of the service */ 1179 /* I'm the first user of the service */
1178 atomic_set(&svc->usecnt, 0); 1180 atomic_set(&svc->usecnt, 0);
@@ -2759,6 +2761,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2759 { 2761 {
2760 struct ip_vs_timeout_user t; 2762 struct ip_vs_timeout_user t;
2761 2763
2764 memset(&t, 0, sizeof(t));
2762 __ip_vs_get_timeouts(net, &t); 2765 __ip_vs_get_timeouts(net, &t);
2763 if (copy_to_user(user, &t, sizeof(t)) != 0) 2766 if (copy_to_user(user, &t, sizeof(t)) != 0)
2764 ret = -EFAULT; 2767 ret = -EFAULT;
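
Note: the memset closes a kernel-to-user infoleak. A struct copied out with copy_to_user() can contain padding, or fields the kernel never wrote, so uninitialised stack bytes would otherwise reach userspace. The general shape:

    struct ip_vs_timeout_user t;

    memset(&t, 0, sizeof(t));       /* zero padding and unwritten fields */
    __ip_vs_get_timeouts(net, &t);
    if (copy_to_user(user, &t, sizeof(t)))
            return -EFAULT;
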
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index cf4875565d67..2ceec64b19f9 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -249,12 +249,15 @@ static void death_by_event(unsigned long ul_conntrack)
249{ 249{
250 struct nf_conn *ct = (void *)ul_conntrack; 250 struct nf_conn *ct = (void *)ul_conntrack;
251 struct net *net = nf_ct_net(ct); 251 struct net *net = nf_ct_net(ct);
252 struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct);
253
254 BUG_ON(ecache == NULL);
252 255
253 if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) { 256 if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
254 /* bad luck, let's retry again */ 257 /* bad luck, let's retry again */
255 ct->timeout.expires = jiffies + 258 ecache->timeout.expires = jiffies +
256 (random32() % net->ct.sysctl_events_retry_timeout); 259 (random32() % net->ct.sysctl_events_retry_timeout);
257 add_timer(&ct->timeout); 260 add_timer(&ecache->timeout);
258 return; 261 return;
259 } 262 }
260 /* we've got the event delivered, now it's dying */ 263 /* we've got the event delivered, now it's dying */
@@ -268,6 +271,9 @@ static void death_by_event(unsigned long ul_conntrack)
268void nf_ct_insert_dying_list(struct nf_conn *ct) 271void nf_ct_insert_dying_list(struct nf_conn *ct)
269{ 272{
270 struct net *net = nf_ct_net(ct); 273 struct net *net = nf_ct_net(ct);
274 struct nf_conntrack_ecache *ecache = nf_ct_ecache_find(ct);
275
276 BUG_ON(ecache == NULL);
271 277
272 /* add this conntrack to the dying list */ 278 /* add this conntrack to the dying list */
273 spin_lock_bh(&nf_conntrack_lock); 279 spin_lock_bh(&nf_conntrack_lock);
@@ -275,10 +281,10 @@ void nf_ct_insert_dying_list(struct nf_conn *ct)
275 &net->ct.dying); 281 &net->ct.dying);
276 spin_unlock_bh(&nf_conntrack_lock); 282 spin_unlock_bh(&nf_conntrack_lock);
277 /* set a new timer to retry event delivery */ 283 /* set a new timer to retry event delivery */
278 setup_timer(&ct->timeout, death_by_event, (unsigned long)ct); 284 setup_timer(&ecache->timeout, death_by_event, (unsigned long)ct);
279 ct->timeout.expires = jiffies + 285 ecache->timeout.expires = jiffies +
280 (random32() % net->ct.sysctl_events_retry_timeout); 286 (random32() % net->ct.sysctl_events_retry_timeout);
281 add_timer(&ct->timeout); 287 add_timer(&ecache->timeout);
282} 288}
283EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list); 289EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
284 290
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 45cf602a76bc..527651a53a45 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -361,23 +361,6 @@ static void evict_oldest_expect(struct nf_conn *master,
361 } 361 }
362} 362}
363 363
364static inline int refresh_timer(struct nf_conntrack_expect *i)
365{
366 struct nf_conn_help *master_help = nfct_help(i->master);
367 const struct nf_conntrack_expect_policy *p;
368
369 if (!del_timer(&i->timeout))
370 return 0;
371
372 p = &rcu_dereference_protected(
373 master_help->helper,
374 lockdep_is_held(&nf_conntrack_lock)
375 )->expect_policy[i->class];
376 i->timeout.expires = jiffies + p->timeout * HZ;
377 add_timer(&i->timeout);
378 return 1;
379}
380
381static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) 364static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
382{ 365{
383 const struct nf_conntrack_expect_policy *p; 366 const struct nf_conntrack_expect_policy *p;
@@ -386,7 +369,7 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
386 struct nf_conn_help *master_help = nfct_help(master); 369 struct nf_conn_help *master_help = nfct_help(master);
387 struct nf_conntrack_helper *helper; 370 struct nf_conntrack_helper *helper;
388 struct net *net = nf_ct_exp_net(expect); 371 struct net *net = nf_ct_exp_net(expect);
389 struct hlist_node *n; 372 struct hlist_node *n, *next;
390 unsigned int h; 373 unsigned int h;
391 int ret = 1; 374 int ret = 1;
392 375
@@ -395,12 +378,12 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
395 goto out; 378 goto out;
396 } 379 }
397 h = nf_ct_expect_dst_hash(&expect->tuple); 380 h = nf_ct_expect_dst_hash(&expect->tuple);
398 hlist_for_each_entry(i, n, &net->ct.expect_hash[h], hnode) { 381 hlist_for_each_entry_safe(i, n, next, &net->ct.expect_hash[h], hnode) {
399 if (expect_matches(i, expect)) { 382 if (expect_matches(i, expect)) {
400 /* Refresh timer: if it's dying, ignore.. */ 383 if (del_timer(&i->timeout)) {
401 if (refresh_timer(i)) { 384 nf_ct_unlink_expect(i);
402 ret = 0; 385 nf_ct_expect_put(i);
403 goto out; 386 break;
404 } 387 }
405 } else if (expect_clash(i, expect)) { 388 } else if (expect_clash(i, expect)) {
406 ret = -EBUSY; 389 ret = -EBUSY;
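
Note: refresh_timer() is gone; a matching expectation is now removed so the caller's fresh one replaces it, instead of re-arming the old entry with stale data. The key idiom is that del_timer() returns nonzero only for the caller that actually deactivated a pending timer, so it doubles as a race-free ownership test; against a hypothetical object:

    if (del_timer(&obj->timeout)) {
            /* we stopped the pending timer, so teardown is ours */
            unlink(obj);            /* hypothetical */
            put(obj);               /* hypothetical */
    }
    /* else the timer handler already fired and owns the teardown */
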
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 14f67a2cbcb5..9807f3278fcb 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1896,10 +1896,15 @@ static int
1896ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct) 1896ctnetlink_nfqueue_parse(const struct nlattr *attr, struct nf_conn *ct)
1897{ 1897{
1898 struct nlattr *cda[CTA_MAX+1]; 1898 struct nlattr *cda[CTA_MAX+1];
1899 int ret;
1899 1900
1900 nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy); 1901 nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy);
1901 1902
1902 return ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct); 1903 spin_lock_bh(&nf_conntrack_lock);
1904 ret = ctnetlink_nfqueue_parse_ct((const struct nlattr **)cda, ct);
1905 spin_unlock_bh(&nf_conntrack_lock);
1906
1907 return ret;
1903} 1908}
1904 1909
1905static struct nfq_ct_hook ctnetlink_nfqueue_hook = { 1910static struct nfq_ct_hook ctnetlink_nfqueue_hook = {
@@ -2785,7 +2790,8 @@ static int __init ctnetlink_init(void)
2785 goto err_unreg_subsys; 2790 goto err_unreg_subsys;
2786 } 2791 }
2787 2792
2788 if (register_pernet_subsys(&ctnetlink_net_ops)) { 2793 ret = register_pernet_subsys(&ctnetlink_net_ops);
2794 if (ret < 0) {
2789 pr_err("ctnetlink_init: cannot register pernet operations\n"); 2795 pr_err("ctnetlink_init: cannot register pernet operations\n");
2790 goto err_unreg_exp_subsys; 2796 goto err_unreg_exp_subsys;
2791 } 2797 }
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 758a1bacc126..5c0a112aeee6 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -183,12 +183,12 @@ static int media_len(const struct nf_conn *ct, const char *dptr,
183 return len + digits_len(ct, dptr, limit, shift); 183 return len + digits_len(ct, dptr, limit, shift);
184} 184}
185 185
186static int parse_addr(const struct nf_conn *ct, const char *cp, 186static int sip_parse_addr(const struct nf_conn *ct, const char *cp,
187 const char **endp, union nf_inet_addr *addr, 187 const char **endp, union nf_inet_addr *addr,
188 const char *limit) 188 const char *limit, bool delim)
189{ 189{
190 const char *end; 190 const char *end;
191 int ret = 0; 191 int ret;
192 192
193 if (!ct) 193 if (!ct)
194 return 0; 194 return 0;
@@ -197,16 +197,28 @@ static int parse_addr(const struct nf_conn *ct, const char *cp,
197 switch (nf_ct_l3num(ct)) { 197 switch (nf_ct_l3num(ct)) {
198 case AF_INET: 198 case AF_INET:
199 ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end); 199 ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end);
200 if (ret == 0)
201 return 0;
200 break; 202 break;
201 case AF_INET6: 203 case AF_INET6:
204 if (cp < limit && *cp == '[')
205 cp++;
206 else if (delim)
207 return 0;
208
202 ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end); 209 ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end);
210 if (ret == 0)
211 return 0;
212
213 if (end < limit && *end == ']')
214 end++;
215 else if (delim)
216 return 0;
203 break; 217 break;
204 default: 218 default:
205 BUG(); 219 BUG();
206 } 220 }
207 221
208 if (ret == 0 || end == cp)
209 return 0;
210 if (endp) 222 if (endp)
211 *endp = end; 223 *endp = end;
212 return 1; 224 return 1;
@@ -219,7 +231,7 @@ static int epaddr_len(const struct nf_conn *ct, const char *dptr,
219 union nf_inet_addr addr; 231 union nf_inet_addr addr;
220 const char *aux = dptr; 232 const char *aux = dptr;
221 233
222 if (!parse_addr(ct, dptr, &dptr, &addr, limit)) { 234 if (!sip_parse_addr(ct, dptr, &dptr, &addr, limit, true)) {
223 pr_debug("ip: %s parse failed.!\n", dptr); 235 pr_debug("ip: %s parse failed.!\n", dptr);
224 return 0; 236 return 0;
225 } 237 }
@@ -296,7 +308,7 @@ int ct_sip_parse_request(const struct nf_conn *ct,
296 return 0; 308 return 0;
297 dptr += shift; 309 dptr += shift;
298 310
299 if (!parse_addr(ct, dptr, &end, addr, limit)) 311 if (!sip_parse_addr(ct, dptr, &end, addr, limit, true))
300 return -1; 312 return -1;
301 if (end < limit && *end == ':') { 313 if (end < limit && *end == ':') {
302 end++; 314 end++;
@@ -550,7 +562,7 @@ int ct_sip_parse_header_uri(const struct nf_conn *ct, const char *dptr,
550 if (ret == 0) 562 if (ret == 0)
551 return ret; 563 return ret;
552 564
553 if (!parse_addr(ct, dptr + *matchoff, &c, addr, limit)) 565 if (!sip_parse_addr(ct, dptr + *matchoff, &c, addr, limit, true))
554 return -1; 566 return -1;
555 if (*c == ':') { 567 if (*c == ':') {
556 c++; 568 c++;
@@ -599,7 +611,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr,
599 unsigned int dataoff, unsigned int datalen, 611 unsigned int dataoff, unsigned int datalen,
600 const char *name, 612 const char *name,
601 unsigned int *matchoff, unsigned int *matchlen, 613 unsigned int *matchoff, unsigned int *matchlen,
602 union nf_inet_addr *addr) 614 union nf_inet_addr *addr, bool delim)
603{ 615{
604 const char *limit = dptr + datalen; 616 const char *limit = dptr + datalen;
605 const char *start, *end; 617 const char *start, *end;
@@ -613,7 +625,7 @@ int ct_sip_parse_address_param(const struct nf_conn *ct, const char *dptr,
613 return 0; 625 return 0;
614 626
615 start += strlen(name); 627 start += strlen(name);
616 if (!parse_addr(ct, start, &end, addr, limit)) 628 if (!sip_parse_addr(ct, start, &end, addr, limit, delim))
617 return 0; 629 return 0;
618 *matchoff = start - dptr; 630 *matchoff = start - dptr;
619 *matchlen = end - start; 631 *matchlen = end - start;
@@ -675,6 +687,47 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr,
675 return 1; 687 return 1;
676} 688}
677 689
690static int sdp_parse_addr(const struct nf_conn *ct, const char *cp,
691 const char **endp, union nf_inet_addr *addr,
692 const char *limit)
693{
694 const char *end;
695 int ret;
696
697 memset(addr, 0, sizeof(*addr));
698 switch (nf_ct_l3num(ct)) {
699 case AF_INET:
700 ret = in4_pton(cp, limit - cp, (u8 *)&addr->ip, -1, &end);
701 break;
702 case AF_INET6:
703 ret = in6_pton(cp, limit - cp, (u8 *)&addr->ip6, -1, &end);
704 break;
705 default:
706 BUG();
707 }
708
709 if (ret == 0)
710 return 0;
711 if (endp)
712 *endp = end;
713 return 1;
714}
715
716/* skip ip address. returns its length. */
717static int sdp_addr_len(const struct nf_conn *ct, const char *dptr,
718 const char *limit, int *shift)
719{
720 union nf_inet_addr addr;
721 const char *aux = dptr;
722
723 if (!sdp_parse_addr(ct, dptr, &dptr, &addr, limit)) {
724 pr_debug("ip: %s parse failed.!\n", dptr);
725 return 0;
726 }
727
728 return dptr - aux;
729}
730
678/* SDP header parsing: a SDP session description contains an ordered set of 731/* SDP header parsing: a SDP session description contains an ordered set of
679 * headers, starting with a section containing general session parameters, 732 * headers, starting with a section containing general session parameters,
680 * optionally followed by multiple media descriptions. 733 * optionally followed by multiple media descriptions.
@@ -686,10 +739,10 @@ static int ct_sip_parse_transport(struct nf_conn *ct, const char *dptr,
686 */ 739 */
687static const struct sip_header ct_sdp_hdrs[] = { 740static const struct sip_header ct_sdp_hdrs[] = {
688 [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len), 741 [SDP_HDR_VERSION] = SDP_HDR("v=", NULL, digits_len),
689 [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", epaddr_len), 742 [SDP_HDR_OWNER_IP4] = SDP_HDR("o=", "IN IP4 ", sdp_addr_len),
690 [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", epaddr_len), 743 [SDP_HDR_CONNECTION_IP4] = SDP_HDR("c=", "IN IP4 ", sdp_addr_len),
691 [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", epaddr_len), 744 [SDP_HDR_OWNER_IP6] = SDP_HDR("o=", "IN IP6 ", sdp_addr_len),
692 [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", epaddr_len), 745 [SDP_HDR_CONNECTION_IP6] = SDP_HDR("c=", "IN IP6 ", sdp_addr_len),
693 [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len), 746 [SDP_HDR_MEDIA] = SDP_HDR("m=", NULL, media_len),
694}; 747};
695 748
@@ -775,8 +828,8 @@ static int ct_sip_parse_sdp_addr(const struct nf_conn *ct, const char *dptr,
775 if (ret <= 0) 828 if (ret <= 0)
776 return ret; 829 return ret;
777 830
778 if (!parse_addr(ct, dptr + *matchoff, NULL, addr, 831 if (!sdp_parse_addr(ct, dptr + *matchoff, NULL, addr,
779 dptr + *matchoff + *matchlen)) 832 dptr + *matchoff + *matchlen))
780 return -1; 833 return -1;
781 return 1; 834 return 1;
782} 835}
@@ -1515,7 +1568,6 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,
1515} 1568}
1516 1569
1517static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly; 1570static struct nf_conntrack_helper sip[MAX_PORTS][4] __read_mostly;
1518static char sip_names[MAX_PORTS][4][sizeof("sip-65535")] __read_mostly;
1519 1571
1520static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = { 1572static const struct nf_conntrack_expect_policy sip_exp_policy[SIP_EXPECT_MAX + 1] = {
1521 [SIP_EXPECT_SIGNALLING] = { 1573 [SIP_EXPECT_SIGNALLING] = {
@@ -1585,9 +1637,9 @@ static int __init nf_conntrack_sip_init(void)
1585 sip[i][j].me = THIS_MODULE; 1637 sip[i][j].me = THIS_MODULE;
1586 1638
1587 if (ports[i] == SIP_PORT) 1639 if (ports[i] == SIP_PORT)
1588 sprintf(sip_names[i][j], "sip"); 1640 sprintf(sip[i][j].name, "sip");
1589 else 1641 else
1590 sprintf(sip_names[i][j], "sip-%u", i); 1642 sprintf(sip[i][j].name, "sip-%u", i);
1591 1643
1592 pr_debug("port #%u: %u\n", i, ports[i]); 1644 pr_debug("port #%u: %u\n", i, ports[i]);
1593 1645
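
Note: the parse_addr() split encodes a grammar difference. SIP URIs delimit IPv6 literals with brackets ("[2001:db8::1]:5060"), which the delim flag enforces, while SDP "o="/"c=" fields carry bare addresses and now go through sdp_parse_addr()/sdp_addr_len() instead. Intended behaviour, with illustrative inputs only:

    /* delim = true: a SIP host must bracket IPv6 literals
     *   sip_parse_addr(ct, "[2001:db8::1]:5060", &end, &addr, limit, true)
     *     -> 1, and *end is the ':' after the closing ']'
     *   sip_parse_addr(ct, "2001:db8::1", &end, &addr, limit, true)
     *     -> 0, rejected: no brackets
     * SDP fields carry bare addresses:
     *   sdp_parse_addr(ct, "2001:db8::1", &end, &addr, limit)
     *     -> 1
     */
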
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 169ab59ed9d4..14e2f3903142 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -480,7 +480,7 @@ __build_packet_message(struct nfulnl_instance *inst,
480 } 480 }
481 481
482 if (indev && skb_mac_header_was_set(skb)) { 482 if (indev && skb_mac_header_was_set(skb)) {
483 if (nla_put_be32(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) || 483 if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)) ||
484 nla_put_be16(inst->skb, NFULA_HWLEN, 484 nla_put_be16(inst->skb, NFULA_HWLEN,
485 htons(skb->dev->hard_header_len)) || 485 htons(skb->dev->hard_header_len)) ||
486 nla_put(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len, 486 nla_put(inst->skb, NFULA_HWHEADER, skb->dev->hard_header_len,
@@ -996,8 +996,10 @@ static int __init nfnetlink_log_init(void)
996 996
997#ifdef CONFIG_PROC_FS 997#ifdef CONFIG_PROC_FS
998 if (!proc_create("nfnetlink_log", 0440, 998 if (!proc_create("nfnetlink_log", 0440,
999 proc_net_netfilter, &nful_file_ops)) 999 proc_net_netfilter, &nful_file_ops)) {
1000 status = -ENOMEM;
1000 goto cleanup_logger; 1001 goto cleanup_logger;
1002 }
1001#endif 1003#endif
1002 return status; 1004 return status;
1003 1005
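
Note: the NFULA_HWTYPE fix is about attribute length. htons(skb->dev->type) is a 16-bit value, but nla_put_be32() declared a 4-byte payload for it, so parsers reading the attribute as a u16 saw a length (and byte layout) they did not expect. After the fix the attribute matches its value:

    if (nla_put_be16(inst->skb, NFULA_HWTYPE, htons(skb->dev->type)))
            goto nla_put_failure;   /* 2-byte attribute, 2-byte value */
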
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 5463969da45b..527023823b5c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1362,7 +1362,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
1362 if (NULL == siocb->scm) 1362 if (NULL == siocb->scm)
1363 siocb->scm = &scm; 1363 siocb->scm = &scm;
1364 1364
1365 err = scm_send(sock, msg, siocb->scm); 1365 err = scm_send(sock, msg, siocb->scm, true);
1366 if (err < 0) 1366 if (err < 0)
1367 return err; 1367 return err;
1368 1368
@@ -1373,7 +1373,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
1373 dst_pid = addr->nl_pid; 1373 dst_pid = addr->nl_pid;
1374 dst_group = ffs(addr->nl_groups); 1374 dst_group = ffs(addr->nl_groups);
1375 err = -EPERM; 1375 err = -EPERM;
1376 if (dst_group && !netlink_capable(sock, NL_NONROOT_SEND)) 1376 if ((dst_group || dst_pid) &&
1377 !netlink_capable(sock, NL_NONROOT_SEND))
1377 goto out; 1378 goto out;
1378 } else { 1379 } else {
1379 dst_pid = nlk->dst_pid; 1380 dst_pid = nlk->dst_pid;
@@ -2147,6 +2148,7 @@ static void __init netlink_add_usersock_entry(void)
2147 rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); 2148 rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);
2148 nl_table[NETLINK_USERSOCK].module = THIS_MODULE; 2149 nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
2149 nl_table[NETLINK_USERSOCK].registered = 1; 2150 nl_table[NETLINK_USERSOCK].registered = 1;
2151 nl_table[NETLINK_USERSOCK].nl_nonroot = NL_NONROOT_SEND;
2150 2152
2151 netlink_table_ungrab(); 2153 netlink_table_ungrab();
2152} 2154}
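
Note: the two netlink changes work as a pair. Explicitly addressed sends now require send capability whether the destination is a multicast group or a unicast pid, closing the hole where an unprivileged sender could unicast to arbitrary sockets; and NETLINK_USERSOCK is flagged NL_NONROOT_SEND so legitimate unprivileged user-to-user messaging keeps working. The resulting gate, in outline:

    if ((dst_group || dst_pid) &&
        !netlink_capable(sock, NL_NONROOT_SEND))
            return -EPERM;          /* unless the family opted in, e.g. usersock */
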
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 320fa0e6951a..f3f96badf5aa 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -325,9 +325,6 @@ static int sample(struct datapath *dp, struct sk_buff *skb,
325 } 325 }
326 } 326 }
327 327
328 if (!acts_list)
329 return 0;
330
331 return do_execute_actions(dp, skb, nla_data(acts_list), 328 return do_execute_actions(dp, skb, nla_data(acts_list),
332 nla_len(acts_list), true); 329 nla_len(acts_list), true);
333} 330}
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index ceaca7c134a0..c5c9e2a54218 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1079,7 +1079,7 @@ static void *packet_current_rx_frame(struct packet_sock *po,
1079 default: 1079 default:
1080 WARN(1, "TPACKET version not supported\n"); 1080 WARN(1, "TPACKET version not supported\n");
1081 BUG(); 1081 BUG();
1082 return 0; 1082 return NULL;
1083 } 1083 }
1084} 1084}
1085 1085
@@ -1273,6 +1273,14 @@ static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
1273 spin_unlock(&f->lock); 1273 spin_unlock(&f->lock);
1274} 1274}
1275 1275
1276static bool match_fanout_group(struct packet_type *ptype, struct sock * sk)
1277{
1278 if (ptype->af_packet_priv == (void*)((struct packet_sock *)sk)->fanout)
1279 return true;
1280
1281 return false;
1282}
1283
1276static int fanout_add(struct sock *sk, u16 id, u16 type_flags) 1284static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1277{ 1285{
1278 struct packet_sock *po = pkt_sk(sk); 1286 struct packet_sock *po = pkt_sk(sk);
@@ -1325,6 +1333,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
1325 match->prot_hook.dev = po->prot_hook.dev; 1333 match->prot_hook.dev = po->prot_hook.dev;
1326 match->prot_hook.func = packet_rcv_fanout; 1334 match->prot_hook.func = packet_rcv_fanout;
1327 match->prot_hook.af_packet_priv = match; 1335 match->prot_hook.af_packet_priv = match;
1336 match->prot_hook.id_match = match_fanout_group;
1328 dev_add_pack(&match->prot_hook); 1337 dev_add_pack(&match->prot_hook);
1329 list_add(&match->list, &fanout_list); 1338 list_add(&match->list, &fanout_list);
1330 } 1339 }
@@ -1936,7 +1945,6 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
1936 1945
1937 if (likely(po->tx_ring.pg_vec)) { 1946 if (likely(po->tx_ring.pg_vec)) {
1938 ph = skb_shinfo(skb)->destructor_arg; 1947 ph = skb_shinfo(skb)->destructor_arg;
1939 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
1940 BUG_ON(atomic_read(&po->tx_ring.pending) == 0); 1948 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
1941 atomic_dec(&po->tx_ring.pending); 1949 atomic_dec(&po->tx_ring.pending);
1942 __packet_set_status(po, ph, TP_STATUS_AVAILABLE); 1950 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
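
Note: for a fanout, prot_hook.af_packet_priv points at the shared fanout group rather than at the socket, so the core's plain pointer comparison can no longer tie the hook back to an originating socket; the new id_match callback restores that association (so, e.g., a sender's own hook can still be recognised) by comparing against the sender's fanout group. Restated compactly:

    static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
    {
            /* true when this hook belongs to the sending socket's fanout */
            return ptype->af_packet_priv == pkt_sk(sk)->fanout;
    }
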
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index f10fb8256442..05d60859d8e3 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -67,6 +67,9 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
67 struct tcf_common *pc; 67 struct tcf_common *pc;
68 int ret = 0; 68 int ret = 0;
69 int err; 69 int err;
70#ifdef CONFIG_GACT_PROB
71 struct tc_gact_p *p_parm = NULL;
72#endif
70 73
71 if (nla == NULL) 74 if (nla == NULL)
72 return -EINVAL; 75 return -EINVAL;
@@ -82,6 +85,12 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
82#ifndef CONFIG_GACT_PROB 85#ifndef CONFIG_GACT_PROB
83 if (tb[TCA_GACT_PROB] != NULL) 86 if (tb[TCA_GACT_PROB] != NULL)
84 return -EOPNOTSUPP; 87 return -EOPNOTSUPP;
88#else
89 if (tb[TCA_GACT_PROB]) {
90 p_parm = nla_data(tb[TCA_GACT_PROB]);
91 if (p_parm->ptype >= MAX_RAND)
92 return -EINVAL;
93 }
85#endif 94#endif
86 95
87 pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info); 96 pc = tcf_hash_check(parm->index, a, bind, &gact_hash_info);
@@ -103,8 +112,7 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
103 spin_lock_bh(&gact->tcf_lock); 112 spin_lock_bh(&gact->tcf_lock);
104 gact->tcf_action = parm->action; 113 gact->tcf_action = parm->action;
105#ifdef CONFIG_GACT_PROB 114#ifdef CONFIG_GACT_PROB
106 if (tb[TCA_GACT_PROB] != NULL) { 115 if (p_parm) {
107 struct tc_gact_p *p_parm = nla_data(tb[TCA_GACT_PROB]);
108 gact->tcfg_paction = p_parm->paction; 116 gact->tcfg_paction = p_parm->paction;
109 gact->tcfg_pval = p_parm->pval; 117 gact->tcfg_pval = p_parm->pval;
110 gact->tcfg_ptype = p_parm->ptype; 118 gact->tcfg_ptype = p_parm->ptype;
@@ -133,7 +141,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
133 141
134 spin_lock(&gact->tcf_lock); 142 spin_lock(&gact->tcf_lock);
135#ifdef CONFIG_GACT_PROB 143#ifdef CONFIG_GACT_PROB
136 if (gact->tcfg_ptype && gact_rand[gact->tcfg_ptype] != NULL) 144 if (gact->tcfg_ptype)
137 action = gact_rand[gact->tcfg_ptype](gact); 145 action = gact_rand[gact->tcfg_ptype](gact);
138 else 146 else
139 action = gact->tcf_action; 147 action = gact->tcf_action;
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index 60e281ad0f07..58fb3c7aab9e 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -185,7 +185,12 @@ err3:
185err2: 185err2:
186 kfree(tname); 186 kfree(tname);
187err1: 187err1:
188 kfree(pc); 188 if (ret == ACT_P_CREATED) {
189 if (est)
190 gen_kill_estimator(&pc->tcfc_bstats,
191 &pc->tcfc_rate_est);
192 kfree_rcu(pc, tcfc_rcu);
193 }
189 return err; 194 return err;
190} 195}
191 196
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index fe81cc18e9e0..9c0fd0c78814 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -200,13 +200,12 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
200out: 200out:
201 if (err) { 201 if (err) {
202 m->tcf_qstats.overlimits++; 202 m->tcf_qstats.overlimits++;
203 /* should we be asking for packet to be dropped? 203 if (m->tcfm_eaction != TCA_EGRESS_MIRROR)
204 * may make sense for redirect case only 204 retval = TC_ACT_SHOT;
205 */ 205 else
206 retval = TC_ACT_SHOT; 206 retval = m->tcf_action;
207 } else { 207 } else
208 retval = m->tcf_action; 208 retval = m->tcf_action;
209 }
210 spin_unlock(&m->tcf_lock); 209 spin_unlock(&m->tcf_lock);
211 210
212 return retval; 211 return retval;
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 26aa2f6ce257..45c53ab067a6 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -74,7 +74,10 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,
74 p = to_pedit(pc); 74 p = to_pedit(pc);
75 keys = kmalloc(ksize, GFP_KERNEL); 75 keys = kmalloc(ksize, GFP_KERNEL);
76 if (keys == NULL) { 76 if (keys == NULL) {
77 kfree(pc); 77 if (est)
78 gen_kill_estimator(&pc->tcfc_bstats,
79 &pc->tcfc_rate_est);
80 kfree_rcu(pc, tcfc_rcu);
78 return -ENOMEM; 81 return -ENOMEM;
79 } 82 }
80 ret = ACT_P_CREATED; 83 ret = ACT_P_CREATED;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 3922f2a2821b..3714f60f0b3c 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -131,7 +131,10 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
131 d = to_defact(pc); 131 d = to_defact(pc);
132 ret = alloc_defdata(d, defdata); 132 ret = alloc_defdata(d, defdata);
133 if (ret < 0) { 133 if (ret < 0) {
134 kfree(pc); 134 if (est)
135 gen_kill_estimator(&pc->tcfc_bstats,
136 &pc->tcfc_rate_est);
137 kfree_rcu(pc, tcfc_rcu);
135 return ret; 138 return ret;
136 } 139 }
137 d->tcf_action = parm->action; 140 d->tcf_action = parm->action;
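
Note: act_ipt, act_pedit and act_simple all get the same create-path fix. Once tcf_hash_create() has set the action up, a later failure must tear down the rate estimator it attached (when one was requested) and free through the action's RCU head, matching the normal release path; the old bare kfree(pc) leaked the estimator and freed the object immediately. The shared unwind, as a sketch:

    if (ret == ACT_P_CREATED) {
            if (est)
                    gen_kill_estimator(&pc->tcfc_bstats, &pc->tcfc_rate_est);
            kfree_rcu(pc, tcfc_rcu);        /* not kfree(): respect RCU readers */
    }
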
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 9af01f3df18c..e4723d31fdd5 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -203,6 +203,34 @@ out:
203 return index; 203 return index;
204} 204}
205 205
206/* Length of the next packet (0 if the queue is empty). */
207static unsigned int qdisc_peek_len(struct Qdisc *sch)
208{
209 struct sk_buff *skb;
210
211 skb = sch->ops->peek(sch);
212 return skb ? qdisc_pkt_len(skb) : 0;
213}
214
215static void qfq_deactivate_class(struct qfq_sched *, struct qfq_class *);
216static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl,
217 unsigned int len);
218
219static void qfq_update_class_params(struct qfq_sched *q, struct qfq_class *cl,
220 u32 lmax, u32 inv_w, int delta_w)
221{
222 int i;
223
224 /* update qfq-specific data */
225 cl->lmax = lmax;
226 cl->inv_w = inv_w;
227 i = qfq_calc_index(cl->inv_w, cl->lmax);
228
229 cl->grp = &q->groups[i];
230
231 q->wsum += delta_w;
232}
233
206static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, 234static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
207 struct nlattr **tca, unsigned long *arg) 235 struct nlattr **tca, unsigned long *arg)
208{ 236{
@@ -250,6 +278,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
250 lmax = 1UL << QFQ_MTU_SHIFT; 278 lmax = 1UL << QFQ_MTU_SHIFT;
251 279
252 if (cl != NULL) { 280 if (cl != NULL) {
281 bool need_reactivation = false;
282
253 if (tca[TCA_RATE]) { 283 if (tca[TCA_RATE]) {
254 err = gen_replace_estimator(&cl->bstats, &cl->rate_est, 284 err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
255 qdisc_root_sleeping_lock(sch), 285 qdisc_root_sleeping_lock(sch),
@@ -258,12 +288,29 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
258 return err; 288 return err;
259 } 289 }
260 290
261 if (inv_w != cl->inv_w) { 291 if (lmax == cl->lmax && inv_w == cl->inv_w)
262 sch_tree_lock(sch); 292 return 0; /* nothing to update */
263 q->wsum += delta_w; 293
264 cl->inv_w = inv_w; 294 i = qfq_calc_index(inv_w, lmax);
265 sch_tree_unlock(sch); 295 sch_tree_lock(sch);
296 if (&q->groups[i] != cl->grp && cl->qdisc->q.qlen > 0) {
297 /*
298 * shift cl->F back, to not charge the
299 * class for the not-yet-served head
300 * packet
301 */
302 cl->F = cl->S;
303 /* remove class from its slot in the old group */
304 qfq_deactivate_class(q, cl);
305 need_reactivation = true;
266 } 306 }
307
308 qfq_update_class_params(q, cl, lmax, inv_w, delta_w);
309
310 if (need_reactivation) /* activate in new group */
311 qfq_activate_class(q, cl, qdisc_peek_len(cl->qdisc));
312 sch_tree_unlock(sch);
313
267 return 0; 314 return 0;
268 } 315 }
269 316
@@ -273,11 +320,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
273 320
274 cl->refcnt = 1; 321 cl->refcnt = 1;
275 cl->common.classid = classid; 322 cl->common.classid = classid;
276 cl->lmax = lmax;
277 cl->inv_w = inv_w;
278 i = qfq_calc_index(cl->inv_w, cl->lmax);
279 323
280 cl->grp = &q->groups[i]; 324 qfq_update_class_params(q, cl, lmax, inv_w, delta_w);
281 325
282 cl->qdisc = qdisc_create_dflt(sch->dev_queue, 326 cl->qdisc = qdisc_create_dflt(sch->dev_queue,
283 &pfifo_qdisc_ops, classid); 327 &pfifo_qdisc_ops, classid);
@@ -294,7 +338,6 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
294 return err; 338 return err;
295 } 339 }
296 } 340 }
297 q->wsum += weight;
298 341
299 sch_tree_lock(sch); 342 sch_tree_lock(sch);
300 qdisc_class_hash_insert(&q->clhash, &cl->common); 343 qdisc_class_hash_insert(&q->clhash, &cl->common);
@@ -711,15 +754,6 @@ static void qfq_update_eligible(struct qfq_sched *q, u64 old_V)
711 } 754 }
712} 755}
713 756
714/* What is length of next packet in queue (0 if queue is empty) */
715static unsigned int qdisc_peek_len(struct Qdisc *sch)
716{
717 struct sk_buff *skb;
718
719 skb = sch->ops->peek(sch);
720 return skb ? qdisc_pkt_len(skb) : 0;
721}
722
723/* 757/*
724 * Updates the class, returns true if also the group needs to be updated. 758 * Updates the class, returns true if also the group needs to be updated.
725 */ 759 */
@@ -843,11 +877,8 @@ static void qfq_update_start(struct qfq_sched *q, struct qfq_class *cl)
843static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) 877static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
844{ 878{
845 struct qfq_sched *q = qdisc_priv(sch); 879 struct qfq_sched *q = qdisc_priv(sch);
846 struct qfq_group *grp;
847 struct qfq_class *cl; 880 struct qfq_class *cl;
848 int err; 881 int err;
849 u64 roundedS;
850 int s;
851 882
852 cl = qfq_classify(skb, sch, &err); 883 cl = qfq_classify(skb, sch, &err);
853 if (cl == NULL) { 884 if (cl == NULL) {
@@ -876,11 +907,25 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
876 return err; 907 return err;
877 908
878 /* If reach this point, queue q was idle */ 909 /* If reach this point, queue q was idle */
879 grp = cl->grp; 910 qfq_activate_class(q, cl, qdisc_pkt_len(skb));
911
912 return err;
913}
914
915/*
916 * Handle class switch from idle to backlogged.
917 */
918static void qfq_activate_class(struct qfq_sched *q, struct qfq_class *cl,
919 unsigned int pkt_len)
920{
921 struct qfq_group *grp = cl->grp;
922 u64 roundedS;
923 int s;
924
880 qfq_update_start(q, cl); 925 qfq_update_start(q, cl);
881 926
882 /* compute new finish time and rounded start. */ 927 /* compute new finish time and rounded start. */
883 cl->F = cl->S + (u64)qdisc_pkt_len(skb) * cl->inv_w; 928 cl->F = cl->S + (u64)pkt_len * cl->inv_w;
884 roundedS = qfq_round_down(cl->S, grp->slot_shift); 929 roundedS = qfq_round_down(cl->S, grp->slot_shift);
885 930
886 /* 931 /*
@@ -917,8 +962,6 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
917 962
918skip_update: 963skip_update:
919 qfq_slot_insert(grp, cl, roundedS); 964 qfq_slot_insert(grp, cl, roundedS);
920
921 return err;
922} 965}
923 966
924 967
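
Note: the qfq changes fix reconfiguration of a live class. A new lmax or weight can move the class to a different group (the index comes from qfq_calc_index(inv_w, lmax)), and if the class is backlogged it must leave its slot in the old group and be re-inserted into the new one; resetting cl->F to cl->S first avoids charging the class for the head packet it has not yet served. The update path, in outline:

    sch_tree_lock(sch);
    if (&q->groups[qfq_calc_index(inv_w, lmax)] != cl->grp &&
        cl->qdisc->q.qlen > 0) {
            cl->F = cl->S;                  /* don't bill the unserved head */
            qfq_deactivate_class(q, cl);    /* leave the old group's slot */
            need_reactivation = true;
    }
    qfq_update_class_params(q, cl, lmax, inv_w, delta_w);
    if (need_reactivation)
            qfq_activate_class(q, cl, qdisc_peek_len(cl->qdisc));
    sch_tree_unlock(sch);
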
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 33d894776192..10c018a5b9fe 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -702,7 +702,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
702 if (rx_count >= asoc->base.sk->sk_rcvbuf) { 702 if (rx_count >= asoc->base.sk->sk_rcvbuf) {
703 703
704 if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || 704 if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
705 (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize))) 705 (!sk_rmem_schedule(asoc->base.sk, chunk->skb,
706 chunk->skb->truesize)))
706 goto fail; 707 goto fail;
707 } 708 }
708 709
diff --git a/net/socket.c b/net/socket.c
index dfe5b66c97e0..edc3c4af9085 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2604,7 +2604,7 @@ static int do_siocgstamp(struct net *net, struct socket *sock,
2604 err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv); 2604 err = sock_do_ioctl(net, sock, cmd, (unsigned long)&ktv);
2605 set_fs(old_fs); 2605 set_fs(old_fs);
2606 if (!err) 2606 if (!err)
2607 err = compat_put_timeval(up, &ktv); 2607 err = compat_put_timeval(&ktv, up);
2608 2608
2609 return err; 2609 return err;
2610} 2610}
@@ -2620,7 +2620,7 @@ static int do_siocgstampns(struct net *net, struct socket *sock,
2620 err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts); 2620 err = sock_do_ioctl(net, sock, cmd, (unsigned long)&kts);
2621 set_fs(old_fs); 2621 set_fs(old_fs);
2622 if (!err) 2622 if (!err)
2623 err = compat_put_timespec(up, &kts); 2623 err = compat_put_timespec(&kts, up);
2624 2624
2625 return err; 2625 return err;
2626} 2626}
@@ -2657,6 +2657,7 @@ static int dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
2657 if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf))) 2657 if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf)))
2658 return -EFAULT; 2658 return -EFAULT;
2659 2659
2660 memset(&ifc, 0, sizeof(ifc));
2660 if (ifc32.ifcbuf == 0) { 2661 if (ifc32.ifcbuf == 0) {
2661 ifc32.ifc_len = 0; 2662 ifc32.ifc_len = 0;
2662 ifc.ifc_len = 0; 2663 ifc.ifc_len = 0;
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 9fe8857d8d59..03d03e37a7d5 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -21,6 +21,11 @@ config SUNRPC_XPRT_RDMA
21 21
22 If unsure, say N. 22 If unsure, say N.
23 23
24config SUNRPC_SWAP
25 bool
26 depends on SUNRPC
27 select NETVM
28
24config RPCSEC_GSS_KRB5 29config RPCSEC_GSS_KRB5
25 tristate "Secure RPC: Kerberos V mechanism" 30 tristate "Secure RPC: Kerberos V mechanism"
26 depends on SUNRPC && CRYPTO 31 depends on SUNRPC && CRYPTO
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index 727e506cacda..b5c067bccc45 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -13,6 +13,7 @@
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/hash.h> 14#include <linux/hash.h>
15#include <linux/sunrpc/clnt.h> 15#include <linux/sunrpc/clnt.h>
16#include <linux/sunrpc/gss_api.h>
16#include <linux/spinlock.h> 17#include <linux/spinlock.h>
17 18
18#ifdef RPC_DEBUG 19#ifdef RPC_DEBUG
@@ -122,6 +123,59 @@ rpcauth_unregister(const struct rpc_authops *ops)
122} 123}
123EXPORT_SYMBOL_GPL(rpcauth_unregister); 124EXPORT_SYMBOL_GPL(rpcauth_unregister);
124 125
126/**
127 * rpcauth_list_flavors - discover registered flavors and pseudoflavors
128 * @array: array to fill in
129 * @size: size of "array"
130 *
131 * Returns the number of array items filled in, or a negative errno.
132 *
133 * The returned array is not sorted by any policy. Callers should not
134 * rely on the order of the items in the returned array.
135 */
136int
137rpcauth_list_flavors(rpc_authflavor_t *array, int size)
138{
139 rpc_authflavor_t flavor;
140 int result = 0;
141
142 spin_lock(&rpc_authflavor_lock);
143 for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) {
144 const struct rpc_authops *ops = auth_flavors[flavor];
145 rpc_authflavor_t pseudos[4];
146 int i, len;
147
148 if (result >= size) {
149 result = -ENOMEM;
150 break;
151 }
152
153 if (ops == NULL)
154 continue;
155 if (ops->list_pseudoflavors == NULL) {
156 array[result++] = ops->au_flavor;
157 continue;
158 }
159 len = ops->list_pseudoflavors(pseudos, ARRAY_SIZE(pseudos));
160 if (len < 0) {
161 result = len;
162 break;
163 }
164 for (i = 0; i < len; i++) {
165 if (result >= size) {
166 result = -ENOMEM;
167 break;
168 }
169 array[result++] = pseudos[i];
170 }
171 }
172 spin_unlock(&rpc_authflavor_lock);
173
174 dprintk("RPC: %s returns %d\n", __func__, result);
175 return result;
176}
177EXPORT_SYMBOL_GPL(rpcauth_list_flavors);
178
125struct rpc_auth * 179struct rpc_auth *
126rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt) 180rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt)
127{ 181{
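
Note: a hypothetical caller of the new rpcauth_list_flavors(), to show the contract: the array is caller-sized (the +4 below is arbitrary headroom for pseudoflavors), the return value is the number of entries filled in, and -ENOMEM means the array was too small:

    rpc_authflavor_t flavors[RPC_AUTH_MAXFLAVOR + 4];
    int i, n;

    n = rpcauth_list_flavors(flavors, ARRAY_SIZE(flavors));
    if (n < 0)
            return n;               /* e.g. -ENOMEM: array too small */
    for (i = 0; i < n; i++)
            pr_debug("flavor[%d] = %u\n", i, flavors[i]);
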
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index d3ad81f8da5b..34c522021004 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1619,6 +1619,7 @@ static const struct rpc_authops authgss_ops = {
1619 .crcreate = gss_create_cred, 1619 .crcreate = gss_create_cred,
1620 .pipes_create = gss_pipes_dentries_create, 1620 .pipes_create = gss_pipes_dentries_create,
1621 .pipes_destroy = gss_pipes_dentries_destroy, 1621 .pipes_destroy = gss_pipes_dentries_destroy,
1622 .list_pseudoflavors = gss_mech_list_pseudoflavors,
1622}; 1623};
1623 1624
1624static const struct rpc_credops gss_credops = { 1625static const struct rpc_credops gss_credops = {
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 782bfe1b6465..b174fcd9ff4c 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -239,14 +239,28 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor)
239 239
240EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor); 240EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor);
241 241
242int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr) 242/**
243 * gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors
244 * @array: array to fill in
245 * @size: size of "array"
246 *
247 * Returns the number of array items filled in, or a negative errno.
248 *
249 * The returned array is not sorted by any policy. Callers should not
250 * rely on the order of the items in the returned array.
251 */
252int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size)
243{ 253{
244 struct gss_api_mech *pos = NULL; 254 struct gss_api_mech *pos = NULL;
245 int j, i = 0; 255 int j, i = 0;
246 256
247 spin_lock(&registered_mechs_lock); 257 spin_lock(&registered_mechs_lock);
248 list_for_each_entry(pos, &registered_mechs, gm_list) { 258 list_for_each_entry(pos, &registered_mechs, gm_list) {
249 for (j=0; j < pos->gm_pf_num; j++) { 259 for (j = 0; j < pos->gm_pf_num; j++) {
260 if (i >= size) {
261 spin_unlock(&registered_mechs_lock);
262 return -ENOMEM;
263 }
250 array_ptr[i++] = pos->gm_pfs[j].pseudoflavor; 264 array_ptr[i++] = pos->gm_pfs[j].pseudoflavor;
251 } 265 }
252 } 266 }
@@ -254,8 +268,6 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr)
254 return i; 268 return i;
255} 269}
256 270
257EXPORT_SYMBOL_GPL(gss_mech_list_pseudoflavors);
258
259u32 271u32
260gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service) 272gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service)
261{ 273{
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 47ad2666fdf6..2afd2a84dc35 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1349,8 +1349,11 @@ static int c_show(struct seq_file *m, void *p)
1349 if (cache_check(cd, cp, NULL)) 1349 if (cache_check(cd, cp, NULL))
1350 /* cache_check does a cache_put on failure */ 1350 /* cache_check does a cache_put on failure */
1351 seq_printf(m, "# "); 1351 seq_printf(m, "# ");
1352 else 1352 else {
1353 if (cache_is_expired(cd, cp))
1354 seq_printf(m, "# ");
1353 cache_put(cp, cd); 1355 cache_put(cp, cd);
1356 }
1354 1357
1355 return cd->cache_show(m, cd, cp); 1358 return cd->cache_show(m, cd, cp);
1356} 1359}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 00eb859b7de5..fa48c60aef23 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -717,6 +717,15 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
717 atomic_inc(&clnt->cl_count); 717 atomic_inc(&clnt->cl_count);
718 if (clnt->cl_softrtry) 718 if (clnt->cl_softrtry)
719 task->tk_flags |= RPC_TASK_SOFT; 719 task->tk_flags |= RPC_TASK_SOFT;
720 if (sk_memalloc_socks()) {
721 struct rpc_xprt *xprt;
722
723 rcu_read_lock();
724 xprt = rcu_dereference(clnt->cl_xprt);
725 if (xprt->swapper)
726 task->tk_flags |= RPC_TASK_SWAPPER;
727 rcu_read_unlock();
728 }
720 /* Add to the client's list of all tasks */ 729 /* Add to the client's list of all tasks */
721 spin_lock(&clnt->cl_lock); 730 spin_lock(&clnt->cl_lock);
722 list_add_tail(&task->tk_task, &clnt->cl_tasks); 731 list_add_tail(&task->tk_task, &clnt->cl_tasks);
@@ -1844,12 +1853,13 @@ call_timeout(struct rpc_task *task)
1844 return; 1853 return;
1845 } 1854 }
1846 if (RPC_IS_SOFT(task)) { 1855 if (RPC_IS_SOFT(task)) {
1847 if (clnt->cl_chatty) 1856 if (clnt->cl_chatty) {
1848 rcu_read_lock(); 1857 rcu_read_lock();
1849 printk(KERN_NOTICE "%s: server %s not responding, timed out\n", 1858 printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
1850 clnt->cl_protname, 1859 clnt->cl_protname,
1851 rcu_dereference(clnt->cl_xprt)->servername); 1860 rcu_dereference(clnt->cl_xprt)->servername);
1852 rcu_read_unlock(); 1861 rcu_read_unlock();
1862 }
1853 if (task->tk_flags & RPC_TASK_TIMEOUT) 1863 if (task->tk_flags & RPC_TASK_TIMEOUT)
1854 rpc_exit(task, -ETIMEDOUT); 1864 rpc_exit(task, -ETIMEDOUT);
1855 else 1865 else
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 92509ffe15fc..a70acae496e4 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -251,7 +251,7 @@ static int rpcb_create_local_unix(struct net *net)
251 if (IS_ERR(clnt)) { 251 if (IS_ERR(clnt)) {
252 dprintk("RPC: failed to create AF_LOCAL rpcbind " 252 dprintk("RPC: failed to create AF_LOCAL rpcbind "
253 "client (errno %ld).\n", PTR_ERR(clnt)); 253 "client (errno %ld).\n", PTR_ERR(clnt));
254 result = -PTR_ERR(clnt); 254 result = PTR_ERR(clnt);
255 goto out; 255 goto out;
256 } 256 }
257 257
@@ -298,7 +298,7 @@ static int rpcb_create_local_net(struct net *net)
298 if (IS_ERR(clnt)) { 298 if (IS_ERR(clnt)) {
299 dprintk("RPC: failed to create local rpcbind " 299 dprintk("RPC: failed to create local rpcbind "
300 "client (errno %ld).\n", PTR_ERR(clnt)); 300 "client (errno %ld).\n", PTR_ERR(clnt));
301 result = -PTR_ERR(clnt); 301 result = PTR_ERR(clnt);
302 goto out; 302 goto out;
303 } 303 }
304 304
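
Note: the rpcbind fix is a sign error. PTR_ERR() on an error pointer already yields a negative errno, so negating it produced a positive result and callers testing result < 0 treated the failed create as success. The correct idiom:

    clnt = rpc_create(&args);       /* returns ERR_PTR(-errno) on failure */
    if (IS_ERR(clnt))
            return PTR_ERR(clnt);   /* already negative: do not negate */
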
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 994cfea2bad6..128494ec9a64 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -300,8 +300,9 @@ EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
300/* 300/*
301 * Make an RPC task runnable. 301 * Make an RPC task runnable.
302 * 302 *
303 * Note: If the task is ASYNC, this must be called with 303 * Note: If the task is ASYNC, and is being made runnable after sitting on an
304 * the spinlock held to protect the wait queue operation. 304 * rpc_wait_queue, this must be called with the queue spinlock held to protect
305 * the wait queue operation.
305 */ 306 */
306static void rpc_make_runnable(struct rpc_task *task) 307static void rpc_make_runnable(struct rpc_task *task)
307{ 308{
@@ -790,7 +791,9 @@ void rpc_execute(struct rpc_task *task)
790 791
791static void rpc_async_schedule(struct work_struct *work) 792static void rpc_async_schedule(struct work_struct *work)
792{ 793{
794 current->flags |= PF_FSTRANS;
793 __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); 795 __rpc_execute(container_of(work, struct rpc_task, u.tk_work));
796 current->flags &= ~PF_FSTRANS;
794} 797}
795 798
796/** 799/**
@@ -812,7 +815,10 @@ static void rpc_async_schedule(struct work_struct *work)
812void *rpc_malloc(struct rpc_task *task, size_t size) 815void *rpc_malloc(struct rpc_task *task, size_t size)
813{ 816{
814 struct rpc_buffer *buf; 817 struct rpc_buffer *buf;
815 gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT; 818 gfp_t gfp = GFP_NOWAIT;
819
820 if (RPC_IS_SWAPPER(task))
821 gfp |= __GFP_MEMALLOC;
816 822
817 size += sizeof(struct rpc_buffer); 823 size += sizeof(struct rpc_buffer);
818 if (size <= RPC_BUFFER_MAXSIZE) 824 if (size <= RPC_BUFFER_MAXSIZE)
@@ -886,7 +892,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
886static struct rpc_task * 892static struct rpc_task *
887rpc_alloc_task(void) 893rpc_alloc_task(void)
888{ 894{
889 return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); 895 return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO);
890} 896}
891 897
892/* 898/*
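
Note: the allocator changes here support swap-over-NFS. An RPC that is writing out a page under memory pressure must not block waiting for the memory it is helping to free, so rpc_malloc() now takes GFP_NOWAIT and, for swapper tasks, adds __GFP_MEMALLOC to dip into the memalloc reserves, rather than using full GFP_ATOMIC (rpc_async_schedule marking itself PF_FSTRANS serves the same goal). In outline:

    gfp_t gfp = GFP_NOWAIT;                 /* never sleep on the send path */

    if (RPC_IS_SWAPPER(task))
            gfp |= __GFP_MEMALLOC;          /* may use the memalloc reserves */
    buf = kmalloc(size + sizeof(struct rpc_buffer), gfp);
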
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 88f2bf671960..bac973a31367 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -316,7 +316,6 @@ static bool svc_xprt_has_something_to_do(struct svc_xprt *xprt)
316 */ 316 */
317void svc_xprt_enqueue(struct svc_xprt *xprt) 317void svc_xprt_enqueue(struct svc_xprt *xprt)
318{ 318{
319 struct svc_serv *serv = xprt->xpt_server;
320 struct svc_pool *pool; 319 struct svc_pool *pool;
321 struct svc_rqst *rqstp; 320 struct svc_rqst *rqstp;
322 int cpu; 321 int cpu;
@@ -362,8 +361,6 @@ void svc_xprt_enqueue(struct svc_xprt *xprt)
362 rqstp, rqstp->rq_xprt); 361 rqstp, rqstp->rq_xprt);
363 rqstp->rq_xprt = xprt; 362 rqstp->rq_xprt = xprt;
364 svc_xprt_get(xprt); 363 svc_xprt_get(xprt);
365 rqstp->rq_reserved = serv->sv_max_mesg;
366 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
367 pool->sp_stats.threads_woken++; 364 pool->sp_stats.threads_woken++;
368 wake_up(&rqstp->rq_wait); 365 wake_up(&rqstp->rq_wait);
369 } else { 366 } else {
@@ -640,8 +637,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
640 if (xprt) { 637 if (xprt) {
641 rqstp->rq_xprt = xprt; 638 rqstp->rq_xprt = xprt;
642 svc_xprt_get(xprt); 639 svc_xprt_get(xprt);
643 rqstp->rq_reserved = serv->sv_max_mesg;
644 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
645 640
646 /* As there is a shortage of threads and this request 641 /* As there is a shortage of threads and this request
647 * had to be queued, don't allow the thread to wait so 642 * had to be queued, don't allow the thread to wait so
@@ -738,6 +733,8 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
738 else 733 else
739 len = xprt->xpt_ops->xpo_recvfrom(rqstp); 734 len = xprt->xpt_ops->xpo_recvfrom(rqstp);
740 dprintk("svc: got len=%d\n", len); 735 dprintk("svc: got len=%d\n", len);
736 rqstp->rq_reserved = serv->sv_max_mesg;
737 atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved);
741 } 738 }
742 svc_xprt_received(xprt); 739 svc_xprt_received(xprt);
743 740
@@ -794,7 +791,8 @@ int svc_send(struct svc_rqst *rqstp)
794 791
795 /* Grab mutex to serialize outgoing data. */ 792 /* Grab mutex to serialize outgoing data. */
796 mutex_lock(&xprt->xpt_mutex); 793 mutex_lock(&xprt->xpt_mutex);
797 if (test_bit(XPT_DEAD, &xprt->xpt_flags)) 794 if (test_bit(XPT_DEAD, &xprt->xpt_flags)
795 || test_bit(XPT_CLOSE, &xprt->xpt_flags))
798 len = -ENOTCONN; 796 len = -ENOTCONN;
799 else 797 else
800 len = xprt->xpt_ops->xpo_sendto(rqstp); 798 len = xprt->xpt_ops->xpo_sendto(rqstp);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 18bc130255a7..998aa8c1807c 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1129,9 +1129,9 @@ static int svc_tcp_recvfrom(struct svc_rqst *rqstp)
1129 if (len >= 0) 1129 if (len >= 0)
1130 svsk->sk_tcplen += len; 1130 svsk->sk_tcplen += len;
1131 if (len != want) { 1131 if (len != want) {
1132 svc_tcp_save_pages(svsk, rqstp);
1132 if (len < 0 && len != -EAGAIN) 1133 if (len < 0 && len != -EAGAIN)
1133 goto err_other; 1134 goto err_other;
1134 svc_tcp_save_pages(svsk, rqstp);
1135 dprintk("svc: incomplete TCP record (%d of %d)\n", 1135 dprintk("svc: incomplete TCP record (%d of %d)\n",
1136 svsk->sk_tcplen, svsk->sk_reclen); 1136 svsk->sk_tcplen, svsk->sk_reclen);
1137 goto err_noclose; 1137 goto err_noclose;
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 0cf165580d8d..0afba1b4b656 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -129,34 +129,6 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len)
 EXPORT_SYMBOL_GPL(xdr_terminate_string);
 
 void
-xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base,
-                 unsigned int len)
-{
-        struct kvec *tail = xdr->tail;
-        u32 *p;
-
-        xdr->pages = pages;
-        xdr->page_base = base;
-        xdr->page_len = len;
-
-        p = (u32 *)xdr->head[0].iov_base + XDR_QUADLEN(xdr->head[0].iov_len);
-        tail->iov_base = p;
-        tail->iov_len = 0;
-
-        if (len & 3) {
-                unsigned int pad = 4 - (len & 3);
-
-                *p = 0;
-                tail->iov_base = (char *)p + (len & 3);
-                tail->iov_len = pad;
-                len += pad;
-        }
-        xdr->buflen += len;
-        xdr->len += len;
-}
-EXPORT_SYMBOL_GPL(xdr_encode_pages);
-
-void
 xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset,
                  struct page **pages, unsigned int base, unsigned int len)
 {
@@ -457,6 +429,16 @@ xdr_shift_buf(struct xdr_buf *buf, size_t len)
 EXPORT_SYMBOL_GPL(xdr_shift_buf);
 
 /**
+ * xdr_stream_pos - Return the current offset from the start of the xdr_stream
+ * @xdr: pointer to struct xdr_stream
+ */
+unsigned int xdr_stream_pos(const struct xdr_stream *xdr)
+{
+        return (unsigned int)(XDR_QUADLEN(xdr->buf->len) - xdr->nwords) << 2;
+}
+EXPORT_SYMBOL_GPL(xdr_stream_pos);
+
+/**
  * xdr_init_encode - Initialize a struct xdr_stream for sending data.
  * @xdr: pointer to xdr_stream struct
  * @buf: pointer to XDR buffer in which to encode data
@@ -556,13 +538,11 @@ void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int b
 EXPORT_SYMBOL_GPL(xdr_write_pages);
 
 static void xdr_set_iov(struct xdr_stream *xdr, struct kvec *iov,
-                __be32 *p, unsigned int len)
+                unsigned int len)
 {
         if (len > iov->iov_len)
                 len = iov->iov_len;
-        if (p == NULL)
-                p = (__be32*)iov->iov_base;
-        xdr->p = p;
+        xdr->p = (__be32*)iov->iov_base;
         xdr->end = (__be32*)(iov->iov_base + len);
         xdr->iov = iov;
         xdr->page_ptr = NULL;
@@ -609,7 +589,7 @@ static void xdr_set_next_page(struct xdr_stream *xdr)
         newbase -= xdr->buf->page_base;
 
         if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
-                xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len);
+                xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
 }
 
 static bool xdr_set_next_buffer(struct xdr_stream *xdr)
@@ -618,7 +598,7 @@ static bool xdr_set_next_buffer(struct xdr_stream *xdr)
                 xdr_set_next_page(xdr);
         else if (xdr->iov == xdr->buf->head) {
                 if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
-                        xdr_set_iov(xdr, xdr->buf->tail, NULL, xdr->buf->len);
+                        xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
         }
         return xdr->p != xdr->end;
 }
@@ -634,10 +614,15 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p)
         xdr->buf = buf;
         xdr->scratch.iov_base = NULL;
         xdr->scratch.iov_len = 0;
+        xdr->nwords = XDR_QUADLEN(buf->len);
         if (buf->head[0].iov_len != 0)
-                xdr_set_iov(xdr, buf->head, p, buf->len);
+                xdr_set_iov(xdr, buf->head, buf->len);
         else if (buf->page_len != 0)
                 xdr_set_page_base(xdr, 0, buf->len);
+        if (p != NULL && p > xdr->p && xdr->end >= p) {
+                xdr->nwords -= p - xdr->p;
+                xdr->p = p;
+        }
 }
 EXPORT_SYMBOL_GPL(xdr_init_decode);
 
@@ -662,12 +647,14 @@ EXPORT_SYMBOL_GPL(xdr_init_decode_pages);
 
 static __be32 * __xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
 {
+        unsigned int nwords = XDR_QUADLEN(nbytes);
         __be32 *p = xdr->p;
-        __be32 *q = p + XDR_QUADLEN(nbytes);
+        __be32 *q = p + nwords;
 
-        if (unlikely(q > xdr->end || q < p))
+        if (unlikely(nwords > xdr->nwords || q > xdr->end || q < p))
                 return NULL;
         xdr->p = q;
+        xdr->nwords -= nwords;
         return p;
 }
 
@@ -734,6 +721,31 @@ __be32 * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes)
 }
 EXPORT_SYMBOL_GPL(xdr_inline_decode);
 
+static unsigned int xdr_align_pages(struct xdr_stream *xdr, unsigned int len)
+{
+        struct xdr_buf *buf = xdr->buf;
+        struct kvec *iov;
+        unsigned int nwords = XDR_QUADLEN(len);
+        unsigned int cur = xdr_stream_pos(xdr);
+
+        if (xdr->nwords == 0)
+                return 0;
+        if (nwords > xdr->nwords) {
+                nwords = xdr->nwords;
+                len = nwords << 2;
+        }
+        /* Realign pages to current pointer position */
+        iov = buf->head;
+        if (iov->iov_len > cur)
+                xdr_shrink_bufhead(buf, iov->iov_len - cur);
+
+        /* Truncate page data and move it into the tail */
+        if (buf->page_len > len)
+                xdr_shrink_pagelen(buf, buf->page_len - len);
+        xdr->nwords = XDR_QUADLEN(buf->len - cur);
+        return len;
+}
+
 /**
  * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position
  * @xdr: pointer to xdr_stream struct
@@ -742,39 +754,37 @@ EXPORT_SYMBOL_GPL(xdr_inline_decode);
  * Moves data beyond the current pointer position from the XDR head[] buffer
  * into the page list. Any data that lies beyond current position + "len"
  * bytes is moved into the XDR tail[].
+ *
+ * Returns the number of XDR encoded bytes now contained in the pages
  */
-void xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
+unsigned int xdr_read_pages(struct xdr_stream *xdr, unsigned int len)
 {
         struct xdr_buf *buf = xdr->buf;
         struct kvec *iov;
-        ssize_t shift;
+        unsigned int nwords;
         unsigned int end;
-        int padding;
+        unsigned int padding;
 
-        /* Realign pages to current pointer position */
-        iov = buf->head;
-        shift = iov->iov_len + (char *)iov->iov_base - (char *)xdr->p;
-        if (shift > 0)
-                xdr_shrink_bufhead(buf, shift);
-
-        /* Truncate page data and move it into the tail */
-        if (buf->page_len > len)
-                xdr_shrink_pagelen(buf, buf->page_len - len);
-        padding = (XDR_QUADLEN(len) << 2) - len;
+        len = xdr_align_pages(xdr, len);
+        if (len == 0)
+                return 0;
+        nwords = XDR_QUADLEN(len);
+        padding = (nwords << 2) - len;
         xdr->iov = iov = buf->tail;
         /* Compute remaining message length. */
-        end = iov->iov_len;
-        shift = buf->buflen - buf->len;
-        if (shift < end)
-                end -= shift;
-        else if (shift > 0)
-                end = 0;
+        end = ((xdr->nwords - nwords) << 2) + padding;
+        if (end > iov->iov_len)
+                end = iov->iov_len;
+
         /*
          * Position current pointer at beginning of tail, and
          * set remaining message length.
          */
         xdr->p = (__be32 *)((char *)iov->iov_base + padding);
         xdr->end = (__be32 *)((char *)iov->iov_base + end);
+        xdr->page_ptr = NULL;
+        xdr->nwords = XDR_QUADLEN(end - padding);
+        return len;
 }
 EXPORT_SYMBOL_GPL(xdr_read_pages);
 
@@ -790,12 +800,13 @@ EXPORT_SYMBOL_GPL(xdr_read_pages);
  */
 void xdr_enter_page(struct xdr_stream *xdr, unsigned int len)
 {
-        xdr_read_pages(xdr, len);
+        len = xdr_align_pages(xdr, len);
         /*
          * Position current pointer at beginning of tail, and
          * set remaining message length.
          */
-        xdr_set_page_base(xdr, 0, len);
+        if (len != 0)
+                xdr_set_page_base(xdr, 0, len);
 }
 EXPORT_SYMBOL_GPL(xdr_enter_page);
 
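The xdr.c changes above replace ad hoc end-pointer checks with a running count of unconsumed 32-bit words (nwords). A self-contained sketch of that bookkeeping, using a locally defined XDR_QUADLEN rather than the kernel header:

    #include <stdio.h>

    #define XDR_QUADLEN(l)  (((l) + 3) >> 2)    /* bytes -> 4-byte words */

    struct xdr_sketch {
            unsigned int buf_len;   /* total encoded length in bytes */
            unsigned int nwords;    /* 32-bit words not yet consumed */
    };

    static unsigned int stream_pos(const struct xdr_sketch *x)
    {
            return (XDR_QUADLEN(x->buf_len) - x->nwords) << 2;
    }

    static int inline_decode(struct xdr_sketch *x, size_t nbytes)
    {
            unsigned int nwords = XDR_QUADLEN(nbytes);

            if (nwords > x->nwords)
                    return -1;      /* would run past the message end */
            x->nwords -= nwords;
            return 0;
    }

    int main(void)
    {
            struct xdr_sketch x = { .buf_len = 20, .nwords = XDR_QUADLEN(20) };

            inline_decode(&x, 7);                   /* consumes 2 words */
            printf("pos=%u\n", stream_pos(&x));     /* prints pos=8 */
            return 0;
    }

Bounding decodes by nwords catches reads that stay inside the current iovec but run past the real end of the message.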
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index b446e100286f..06cdbff79e4a 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -200,6 +200,7 @@ xprt_rdma_connect_worker(struct work_struct *work)
         int rc = 0;
 
         if (!xprt->shutdown) {
+                current->flags |= PF_FSTRANS;
                 xprt_clear_connected(xprt);
 
                 dprintk("RPC: %s: %sconnect\n", __func__,
@@ -212,10 +213,10 @@ xprt_rdma_connect_worker(struct work_struct *work)
 
 out:
         xprt_wake_pending_tasks(xprt, rc);
-
 out_clear:
         dprintk("RPC: %s: exit\n", __func__);
         xprt_clear_connecting(xprt);
+        current->flags &= ~PF_FSTRANS;
 }
 
 /*
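This connect worker, like the socket workers further down, now brackets itself with PF_FSTRANS so that memory reclaim triggered by its allocations avoids recursing into filesystem writeback, which matters for swap-over-NFS. A hedged sketch of the pattern with local stand-ins (`current` and the flag bit are modeled here, not taken from kernel headers):

    #define PF_FSTRANS_SKETCH 0x1   /* illustrative bit, not the kernel value */

    struct task_sketch { unsigned int flags; };

    static void connect_worker(struct task_sketch *current_task)
    {
            current_task->flags |= PF_FSTRANS_SKETCH;  /* allocations: no fs reclaim */

            /* ... tear down and re-establish the transport here ... */

            current_task->flags &= ~PF_FSTRANS_SKETCH; /* restore on every exit path */
    }

    int main(void)
    {
            struct task_sketch t = { 0 };

            connect_worker(&t);
            return (int)t.flags;    /* 0: flag restored */
    }

Note that the flag must be cleared on every return path, which is why the hunks below touch the early-return cases too.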
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 62d0dac8f780..400567243f84 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1892,6 +1892,8 @@ static void xs_local_setup_socket(struct work_struct *work)
         if (xprt->shutdown)
                 goto out;
 
+        current->flags |= PF_FSTRANS;
+
         clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
         status = __sock_create(xprt->xprt_net, AF_LOCAL,
                                         SOCK_STREAM, 0, &sock, 1);
@@ -1925,7 +1927,47 @@ static void xs_local_setup_socket(struct work_struct *work)
 out:
         xprt_clear_connecting(xprt);
         xprt_wake_pending_tasks(xprt, status);
+        current->flags &= ~PF_FSTRANS;
+}
+
+#ifdef CONFIG_SUNRPC_SWAP
+static void xs_set_memalloc(struct rpc_xprt *xprt)
+{
+        struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+                        xprt);
+
+        if (xprt->swapper)
+                sk_set_memalloc(transport->inet);
+}
+
+/**
+ * xs_swapper - Tag this transport as being used for swap.
+ * @xprt: transport to tag
+ * @enable: enable/disable
+ *
+ */
+int xs_swapper(struct rpc_xprt *xprt, int enable)
+{
+        struct sock_xprt *transport = container_of(xprt, struct sock_xprt,
+                        xprt);
+        int err = 0;
+
+        if (enable) {
+                xprt->swapper++;
+                xs_set_memalloc(xprt);
+        } else if (xprt->swapper) {
+                xprt->swapper--;
+                sk_clear_memalloc(transport->inet);
+        }
+
+        return err;
+}
+EXPORT_SYMBOL_GPL(xs_swapper);
+#else
+static void xs_set_memalloc(struct rpc_xprt *xprt)
+{
 }
+#endif
 
 static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
 {
@@ -1951,6 +1993,8 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
                 transport->sock = sock;
                 transport->inet = sk;
 
+                xs_set_memalloc(xprt);
+
                 write_unlock_bh(&sk->sk_callback_lock);
         }
         xs_udp_do_set_buffer_size(xprt);
@@ -1967,6 +2011,8 @@ static void xs_udp_setup_socket(struct work_struct *work)
         if (xprt->shutdown)
                 goto out;
 
+        current->flags |= PF_FSTRANS;
+
         /* Start by resetting any existing state */
         xs_reset_transport(transport);
         sock = xs_create_sock(xprt, transport,
@@ -1985,6 +2031,7 @@ static void xs_udp_setup_socket(struct work_struct *work)
 out:
         xprt_clear_connecting(xprt);
         xprt_wake_pending_tasks(xprt, status);
+        current->flags &= ~PF_FSTRANS;
 }
 
 /*
@@ -2075,6 +2122,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
         if (!xprt_bound(xprt))
                 goto out;
 
+        xs_set_memalloc(xprt);
+
         /* Tell the socket layer to start connecting... */
         xprt->stat.connect_count++;
         xprt->stat.connect_start = jiffies;
@@ -2110,6 +2159,8 @@ static void xs_tcp_setup_socket(struct work_struct *work)
         if (xprt->shutdown)
                 goto out;
 
+        current->flags |= PF_FSTRANS;
+
         if (!sock) {
                 clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
                 sock = xs_create_sock(xprt, transport,
@@ -2159,6 +2210,7 @@ static void xs_tcp_setup_socket(struct work_struct *work)
         case -EINPROGRESS:
         case -EALREADY:
                 xprt_clear_connecting(xprt);
+                current->flags &= ~PF_FSTRANS;
                 return;
         case -EINVAL:
                 /* Happens, for instance, if the user specified a link
@@ -2171,6 +2223,7 @@ out_eagain:
 out:
         xprt_clear_connecting(xprt);
         xprt_wake_pending_tasks(xprt, status);
+        current->flags &= ~PF_FSTRANS;
 }
 
 /**
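The CONFIG_SUNRPC_SWAP block added above counts swap users per transport and, while any remain, marks the underlying socket SOCK_MEMALLOC so its packets may dip into reserve memory rather than deadlock under reclaim. The logic in isolation, with local stand-ins for the socket and transport types:

    struct sock_sketch { int memalloc; };
    struct swap_xprt_sketch {
            int swapper;                    /* swap files using this transport */
            struct sock_sketch *inet;
    };

    static void set_memalloc(struct swap_xprt_sketch *x)
    {
            if (x->swapper)
                    x->inet->memalloc = 1;  /* sk_set_memalloc() analogue */
    }

    static int swapper(struct swap_xprt_sketch *x, int enable)
    {
            if (enable) {
                    x->swapper++;
                    set_memalloc(x);
            } else if (x->swapper) {
                    x->swapper--;
                    x->inet->memalloc = 0;  /* sk_clear_memalloc() analogue */
            }
            return 0;
    }

xs_set_memalloc() is also called from the UDP and TCP finish-connecting paths above so that a reconnect re-marks the freshly created socket.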
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 79981d97bc9c..c5ee4ff61364 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -823,6 +823,34 @@ fail:
         return NULL;
 }
 
+static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
+{
+        struct dentry *dentry;
+        struct path path;
+        int err = 0;
+        /*
+         * Get the parent directory, calculate the hash for last
+         * component.
+         */
+        dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
+        err = PTR_ERR(dentry);
+        if (IS_ERR(dentry))
+                return err;
+
+        /*
+         * All right, let's create it.
+         */
+        err = security_path_mknod(&path, dentry, mode, 0);
+        if (!err) {
+                err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
+                if (!err) {
+                        res->mnt = mntget(path.mnt);
+                        res->dentry = dget(dentry);
+                }
+        }
+        done_path_create(&path, dentry);
+        return err;
+}
 
 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
@@ -831,8 +859,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
         struct unix_sock *u = unix_sk(sk);
         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
         char *sun_path = sunaddr->sun_path;
-        struct dentry *dentry = NULL;
-        struct path path;
         int err;
         unsigned int hash;
         struct unix_address *addr;
@@ -869,43 +895,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
         atomic_set(&addr->refcnt, 1);
 
         if (sun_path[0]) {
-                umode_t mode;
-                err = 0;
-                /*
-                 * Get the parent directory, calculate the hash for last
-                 * component.
-                 */
-                dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
-                err = PTR_ERR(dentry);
-                if (IS_ERR(dentry))
-                        goto out_mknod_parent;
-
-                /*
-                 * All right, let's create it.
-                 */
-                mode = S_IFSOCK |
+                struct path path;
+                umode_t mode = S_IFSOCK |
                        (SOCK_INODE(sock)->i_mode & ~current_umask());
-                err = mnt_want_write(path.mnt);
-                if (err)
-                        goto out_mknod_dput;
-                err = security_path_mknod(&path, dentry, mode, 0);
-                if (err)
-                        goto out_mknod_drop_write;
-                err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
-out_mknod_drop_write:
-                mnt_drop_write(path.mnt);
-                if (err)
-                        goto out_mknod_dput;
-                mutex_unlock(&path.dentry->d_inode->i_mutex);
-                dput(path.dentry);
-                path.dentry = dentry;
-
+                err = unix_mknod(sun_path, mode, &path);
+                if (err) {
+                        if (err == -EEXIST)
+                                err = -EADDRINUSE;
+                        unix_release_addr(addr);
+                        goto out_up;
+                }
                 addr->hash = UNIX_HASH_SIZE;
-        }
-
-        spin_lock(&unix_table_lock);
-
-        if (!sun_path[0]) {
+                hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
+                spin_lock(&unix_table_lock);
+                u->path = path;
+                list = &unix_socket_table[hash];
+        } else {
+                spin_lock(&unix_table_lock);
                 err = -EADDRINUSE;
                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
                                               sk->sk_type, hash)) {
@@ -914,9 +920,6 @@ out_mknod_drop_write:
                 }
 
                 list = &unix_socket_table[addr->hash];
-        } else {
-                list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
-                u->path = path;
         }
 
         err = 0;
@@ -930,16 +933,6 @@ out_up:
         mutex_unlock(&u->readlock);
 out:
         return err;
-
-out_mknod_dput:
-        dput(dentry);
-        mutex_unlock(&path.dentry->d_inode->i_mutex);
-        path_put(&path);
-out_mknod_parent:
-        if (err == -EEXIST)
-                err = -EADDRINUSE;
-        unix_release_addr(addr);
-        goto out_up;
 }
 
 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
@@ -1457,7 +1450,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
         if (NULL == siocb->scm)
                 siocb->scm = &tmp_scm;
         wait_for_unix_gc();
-        err = scm_send(sock, msg, siocb->scm);
+        err = scm_send(sock, msg, siocb->scm, false);
         if (err < 0)
                 return err;
 
@@ -1626,7 +1619,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
         if (NULL == siocb->scm)
                 siocb->scm = &tmp_scm;
         wait_for_unix_gc();
-        err = scm_send(sock, msg, siocb->scm);
+        err = scm_send(sock, msg, siocb->scm, false);
         if (err < 0)
                 return err;
 
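The unix_mknod() helper factored out above keeps the old behaviour visible at the syscall boundary: creating the socket node fails with EEXIST when the path already exists, which unix_bind() reports as EADDRINUSE. A small user-space demonstration (the socket path is an arbitrary choice for the example):

    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/un.h>
    #include <unistd.h>

    int main(void)
    {
            struct sockaddr_un addr = { .sun_family = AF_UNIX };
            int s1 = socket(AF_UNIX, SOCK_STREAM, 0);
            int s2 = socket(AF_UNIX, SOCK_STREAM, 0);

            strncpy(addr.sun_path, "/tmp/demo.sock", sizeof(addr.sun_path) - 1);
            unlink(addr.sun_path);

            bind(s1, (struct sockaddr *)&addr, sizeof(addr));       /* ok */
            if (bind(s2, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                    perror("second bind");  /* -> "Address already in use" */

            close(s1);
            close(s2);
            unlink(addr.sun_path);
            return 0;
    }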
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index 788a12c1eb5d..2ab785064b7e 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -602,36 +602,31 @@ static int wanrouter_device_new_if(struct wan_device *wandev,
          * successfully, add it to the interface list.
          */
 
-        if (dev->name == NULL) {
-                err = -EINVAL;
-        } else {
+#ifdef WANDEBUG
+        printk(KERN_INFO "%s: registering interface %s...\n",
+               wanrouter_modname, dev->name);
+#endif
 
-                #ifdef WANDEBUG
-                printk(KERN_INFO "%s: registering interface %s...\n",
-                       wanrouter_modname, dev->name);
-                #endif
-
-                err = register_netdev(dev);
-                if (!err) {
-                        struct net_device *slave = NULL;
-                        unsigned long smp_flags=0;
-
-                        lock_adapter_irq(&wandev->lock, &smp_flags);
-
-                        if (wandev->dev == NULL) {
-                                wandev->dev = dev;
-                        } else {
-                                for (slave=wandev->dev;
-                                     DEV_TO_SLAVE(slave);
-                                     slave = DEV_TO_SLAVE(slave))
-                                        DEV_TO_SLAVE(slave) = dev;
-                        }
-                        ++wandev->ndev;
-
-                        unlock_adapter_irq(&wandev->lock, &smp_flags);
-                        err = 0;        /* done !!! */
-                        goto out;
-                }
+        err = register_netdev(dev);
+        if (!err) {
+                struct net_device *slave = NULL;
+                unsigned long smp_flags=0;
+
+                lock_adapter_irq(&wandev->lock, &smp_flags);
+
+                if (wandev->dev == NULL) {
+                        wandev->dev = dev;
+                } else {
+                        for (slave=wandev->dev;
+                             DEV_TO_SLAVE(slave);
+                             slave = DEV_TO_SLAVE(slave))
+                                DEV_TO_SLAVE(slave) = dev;
+                }
+                ++wandev->ndev;
+
+                unlock_adapter_irq(&wandev->lock, &smp_flags);
+                err = 0;        /* done !!! */
+                goto out;
         }
         if (wandev->del_if)
                 wandev->del_if(wandev, dev);
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 31b40cc4a9c3..dcd64d5b07aa 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -952,6 +952,11 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
                  */
                 synchronize_rcu();
                 INIT_LIST_HEAD(&wdev->list);
+                /*
+                 * Ensure that all events have been processed and
+                 * freed.
+                 */
+                cfg80211_process_wdev_events(wdev);
                 break;
         case NETDEV_PRE_UP:
                 if (!(wdev->wiphy->interface_modes & BIT(wdev->iftype)))
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 5206c6844fd7..bc7430b54771 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -426,6 +426,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
                           struct net_device *dev, enum nl80211_iftype ntype,
                           u32 *flags, struct vif_params *params);
 void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
+void cfg80211_process_wdev_events(struct wireless_dev *wdev);
 
 int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev,
                                  struct wireless_dev *wdev,
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 2303ee73b50a..2ded3c7fad06 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -680,6 +680,8 @@ static u32 map_regdom_flags(u32 rd_flags)
                 channel_flags |= IEEE80211_CHAN_NO_IBSS;
         if (rd_flags & NL80211_RRF_DFS)
                 channel_flags |= IEEE80211_CHAN_RADAR;
+        if (rd_flags & NL80211_RRF_NO_OFDM)
+                channel_flags |= IEEE80211_CHAN_NO_OFDM;
         return channel_flags;
 }
 
@@ -901,7 +903,21 @@ static void handle_channel(struct wiphy *wiphy,
         chan->max_antenna_gain = min(chan->orig_mag,
                                      (int) MBI_TO_DBI(power_rule->max_antenna_gain));
         chan->max_reg_power = (int) MBM_TO_DBM(power_rule->max_eirp);
-        chan->max_power = min(chan->max_power, chan->max_reg_power);
+        if (chan->orig_mpwr) {
+                /*
+                 * Devices that have their own custom regulatory domain
+                 * but also use WIPHY_FLAG_STRICT_REGULATORY will follow the
+                 * passed country IE power settings.
+                 */
+                if (initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE &&
+                    wiphy->flags & WIPHY_FLAG_CUSTOM_REGULATORY &&
+                    wiphy->flags & WIPHY_FLAG_STRICT_REGULATORY)
+                        chan->max_power = chan->max_reg_power;
+                else
+                        chan->max_power = min(chan->orig_mpwr,
+                                              chan->max_reg_power);
+        } else
+                chan->max_power = chan->max_reg_power;
 }
 
 static void handle_band(struct wiphy *wiphy,
@@ -1885,6 +1901,7 @@ static void restore_custom_reg_settings(struct wiphy *wiphy)
                         chan->flags = chan->orig_flags;
                         chan->max_antenna_gain = chan->orig_mag;
                         chan->max_power = chan->orig_mpwr;
+                        chan->beacon_found = false;
                 }
         }
 }
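The handle_channel() hunk above replaces a single min() with a three-way choice. Extracted as a pure function with local stand-in flag bits (not the cfg80211 definitions), treating orig_mpwr == 0 as "no device-supplied power recorded":

    #define FLAG_CUSTOM_REG 0x1     /* stand-in for WIPHY_FLAG_CUSTOM_REGULATORY */
    #define FLAG_STRICT_REG 0x2     /* stand-in for WIPHY_FLAG_STRICT_REGULATORY */

    static int pick_max_power(int orig_mpwr, int max_reg_power,
                              unsigned int wiphy_flags, int set_by_country_ie)
    {
            if (!orig_mpwr)
                    return max_reg_power;
            /* custom + strict regulatory devices follow country-IE power */
            if (set_by_country_ie &&
                (wiphy_flags & FLAG_CUSTOM_REG) &&
                (wiphy_flags & FLAG_STRICT_REG))
                    return max_reg_power;
            return orig_mpwr < max_reg_power ? orig_mpwr : max_reg_power;
    }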
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 26f8cd30f712..994e2f0cc7a8 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -735,7 +735,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev)
         wdev->connect_keys = NULL;
 }
 
-static void cfg80211_process_wdev_events(struct wireless_dev *wdev)
+void cfg80211_process_wdev_events(struct wireless_dev *wdev)
 {
         struct cfg80211_event *ev;
         unsigned long flags;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index c5a5165a5927..5a2aa17e4d3c 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -1357,6 +1357,8 @@ static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family)
 
                 memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst));
                 xdst->flo.ops = &xfrm_bundle_fc_ops;
+                if (afinfo->init_dst)
+                        afinfo->init_dst(net, xdst);
         } else
                 xdst = ERR_PTR(-ENOBUFS);
 
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 5b228f97d4b3..210be48d8ae3 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -415,8 +415,17 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me)
                 if (x->lft.hard_add_expires_seconds) {
                         long tmo = x->lft.hard_add_expires_seconds +
                                 x->curlft.add_time - now;
-                        if (tmo <= 0)
-                                goto expired;
+                        if (tmo <= 0) {
+                                if (x->xflags & XFRM_SOFT_EXPIRE) {
+                                        /* enter hard expire without soft expire first?!
+                                         * setting a new date could trigger this.
+                                         * workaround: fix x->curlft.add_time as below:
+                                         */
+                                        x->curlft.add_time = now - x->saved_tmo - 1;
+                                        tmo = x->lft.hard_add_expires_seconds - x->saved_tmo;
+                                } else
+                                        goto expired;
+                        }
                         if (tmo < next)
                                 next = tmo;
                 }
@@ -433,10 +442,14 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer * me)
                 if (x->lft.soft_add_expires_seconds) {
                         long tmo = x->lft.soft_add_expires_seconds +
                                 x->curlft.add_time - now;
-                        if (tmo <= 0)
+                        if (tmo <= 0) {
                                 warn = 1;
-                        else if (tmo < next)
+                                x->xflags &= ~XFRM_SOFT_EXPIRE;
+                        } else if (tmo < next) {
                                 next = tmo;
+                                x->xflags |= XFRM_SOFT_EXPIRE;
+                                x->saved_tmo = tmo;
+                        }
                 }
                 if (x->lft.soft_use_expires_seconds) {
                         long tmo = x->lft.soft_use_expires_seconds +
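A worked example of the timer fix above, under the assumption that saved_tmo holds the soft timeout computed on the previous timer run: if the clock is stepped past the hard limit without the soft event ever firing, add_time is rewritten so the state still soft-expires first and the hard timeout becomes hard_add - saved_tmo.

    #include <stdio.h>

    int main(void)
    {
            long hard_add = 60, saved_tmo = 10, now = 1000;

            /* the workaround from the hunk above, in plain arithmetic */
            long add_time = now - saved_tmo - 1;    /* rewritten add_time */
            long tmo = hard_add - saved_tmo;        /* new hard timeout: 50 */

            printf("add_time=%ld tmo=%ld\n", add_time, tmo);
            return 0;
    }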
@@ -1981,8 +1994,10 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay)
                 goto error;
 
         x->outer_mode = xfrm_get_mode(x->props.mode, family);
-        if (x->outer_mode == NULL)
+        if (x->outer_mode == NULL) {
+                err = -EPROTONOSUPPORT;
                 goto error;
+        }
 
         if (init_replay) {
                 err = xfrm_init_replay(x);