path: root/net
Diffstat (limited to 'net')
-rw-r--r--  net/802/psnap.c | 13
-rw-r--r--  net/802/tr.c | 4
-rw-r--r--  net/8021q/vlan.c | 2
-rw-r--r--  net/8021q/vlan_core.c | 12
-rw-r--r--  net/8021q/vlan_dev.c | 3
-rw-r--r--  net/9p/trans_fd.c | 2
-rw-r--r--  net/Kconfig | 15
-rw-r--r--  net/Makefile | 1
-rw-r--r--  net/appletalk/ddp.c | 6
-rw-r--r--  net/atm/clip.c | 9
-rw-r--r--  net/atm/lec.c | 2
-rw-r--r--  net/atm/mpc.c | 32
-rw-r--r--  net/atm/mpc.h | 5
-rw-r--r--  net/ax25/af_ax25.c | 19
-rw-r--r--  net/bluetooth/af_bluetooth.c | 17
-rw-r--r--  net/bluetooth/cmtp/core.c | 3
-rw-r--r--  net/bluetooth/hci_conn.c | 64
-rw-r--r--  net/bluetooth/hci_core.c | 3
-rw-r--r--  net/bluetooth/hci_event.c | 26
-rw-r--r--  net/bluetooth/l2cap.c | 602
-rw-r--r--  net/bluetooth/rfcomm/core.c | 179
-rw-r--r--  net/bluetooth/rfcomm/sock.c | 189
-rw-r--r--  net/bluetooth/sco.c | 57
-rw-r--r--  net/bridge/br_netlink.c | 3
-rw-r--r--  net/can/af_can.c | 3
-rw-r--r--  net/core/Makefile | 3
-rw-r--r--  net/core/datagram.c | 2
-rw-r--r--  net/core/dev.c | 110
-rw-r--r--  net/core/drop_monitor.c | 263
-rw-r--r--  net/core/ethtool.c | 58
-rw-r--r--  net/core/fib_rules.c | 3
-rw-r--r--  net/core/neighbour.c | 15
-rw-r--r--  net/core/net-sysfs.c | 6
-rw-r--r--  net/core/net-traces.c | 29
-rw-r--r--  net/core/net_namespace.c | 89
-rw-r--r--  net/core/pktgen.c | 18
-rw-r--r--  net/core/rtnetlink.c | 9
-rw-r--r--  net/core/skbuff.c | 33
-rw-r--r--  net/core/sock.c | 9
-rw-r--r--  net/core/sysctl_net_core.c | 1
-rw-r--r--  net/dccp/ackvec.h | 3
-rw-r--r--  net/dccp/dccp.h | 5
-rw-r--r--  net/dccp/output.c | 37
-rw-r--r--  net/decnet/af_decnet.c | 23
-rw-r--r--  net/decnet/dn_dev.c | 6
-rw-r--r--  net/decnet/dn_route.c | 6
-rw-r--r--  net/decnet/dn_table.c | 3
-rw-r--r--  net/decnet/sysctl_net_decnet.c | 2
-rw-r--r--  net/dsa/Kconfig | 6
-rw-r--r--  net/dsa/dsa.c | 177
-rw-r--r--  net/dsa/dsa_priv.h | 97
-rw-r--r--  net/dsa/mv88e6060.c | 12
-rw-r--r--  net/dsa/mv88e6123_61_65.c | 92
-rw-r--r--  net/dsa/mv88e6131.c | 96
-rw-r--r--  net/dsa/slave.c | 34
-rw-r--r--  net/dsa/tag_dsa.c | 32
-rw-r--r--  net/dsa/tag_edsa.c | 32
-rw-r--r--  net/dsa/tag_trailer.c | 12
-rw-r--r--  net/econet/af_econet.c | 2
-rw-r--r--  net/ipv4/Kconfig | 52
-rw-r--r--  net/ipv4/af_inet.c | 2
-rw-r--r--  net/ipv4/arp.c | 11
-rw-r--r--  net/ipv4/cipso_ipv4.c | 9
-rw-r--r--  net/ipv4/devinet.c | 3
-rw-r--r--  net/ipv4/fib_frontend.c | 2
-rw-r--r--  net/ipv4/fib_semantics.c | 5
-rw-r--r--  net/ipv4/icmp.c | 2
-rw-r--r--  net/ipv4/inet_fragment.c | 1
-rw-r--r--  net/ipv4/ip_fragment.c | 3
-rw-r--r--  net/ipv4/ip_gre.c | 5
-rw-r--r--  net/ipv4/ipip.c | 7
-rw-r--r--  net/ipv4/tcp.c | 62
-rw-r--r--  net/ipv4/tcp_bic.c | 11
-rw-r--r--  net/ipv4/tcp_cong.c | 21
-rw-r--r--  net/ipv4/tcp_cubic.c | 11
-rw-r--r--  net/ipv4/tcp_htcp.c | 3
-rw-r--r--  net/ipv4/tcp_input.c | 207
-rw-r--r--  net/ipv4/tcp_ipv4.c | 11
-rw-r--r--  net/ipv4/tcp_minisocks.c | 9
-rw-r--r--  net/ipv4/tcp_output.c | 95
-rw-r--r--  net/ipv4/tcp_probe.c | 5
-rw-r--r--  net/ipv4/tcp_scalable.c | 12
-rw-r--r--  net/ipv4/tcp_timer.c | 23
-rw-r--r--  net/ipv4/tcp_veno.c | 7
-rw-r--r--  net/ipv4/tcp_yeah.c | 9
-rw-r--r--  net/ipv4/udp.c | 2
-rw-r--r--  net/ipv6/addrconf.c | 90
-rw-r--r--  net/ipv6/af_inet6.c | 26
-rw-r--r--  net/ipv6/inet6_hashtables.c | 4
-rw-r--r--  net/ipv6/ipv6_sockglue.c | 3
-rw-r--r--  net/ipv6/ndisc.c | 6
-rw-r--r--  net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 5
-rw-r--r--  net/ipv6/netfilter/nf_conntrack_reasm.c | 8
-rw-r--r--  net/ipv6/reassembly.c | 7
-rw-r--r--  net/ipv6/route.c | 5
-rw-r--r--  net/ipv6/sit.c | 7
-rw-r--r--  net/ipv6/tcp_ipv6.c | 6
-rw-r--r--  net/ipv6/xfrm6_state.c | 2
-rw-r--r--  net/ipx/af_ipx.c | 16
-rw-r--r--  net/irda/irda_device.c | 5
-rw-r--r--  net/irda/irlan/irlan_eth.c | 19
-rw-r--r--  net/irda/irmod.c | 2
-rw-r--r--  net/iucv/af_iucv.c | 3
-rw-r--r--  net/key/af_key.c | 6
-rw-r--r--  net/llc/af_llc.c | 6
-rw-r--r--  net/llc/llc_conn.c | 3
-rw-r--r--  net/llc/llc_core.c | 4
-rw-r--r--  net/mac80211/Makefile | 1
-rw-r--r--  net/mac80211/agg-rx.c | 6
-rw-r--r--  net/mac80211/agg-tx.c | 191
-rw-r--r--  net/mac80211/cfg.c | 52
-rw-r--r--  net/mac80211/debugfs_netdev.c | 48
-rw-r--r--  net/mac80211/ht.c | 19
-rw-r--r--  net/mac80211/ibss.c | 907
-rw-r--r--  net/mac80211/ieee80211_i.h | 141
-rw-r--r--  net/mac80211/iface.c | 91
-rw-r--r--  net/mac80211/key.c | 2
-rw-r--r--  net/mac80211/main.c | 24
-rw-r--r--  net/mac80211/mlme.c | 1709
-rw-r--r--  net/mac80211/rate.h | 12
-rw-r--r--  net/mac80211/rx.c | 37
-rw-r--r--  net/mac80211/scan.c | 66
-rw-r--r--  net/mac80211/spectmgmt.c | 26
-rw-r--r--  net/mac80211/sta_info.c | 15
-rw-r--r--  net/mac80211/sta_info.h | 5
-rw-r--r--  net/mac80211/tx.c | 31
-rw-r--r--  net/mac80211/util.c | 254
-rw-r--r--  net/mac80211/wext.c | 290
-rw-r--r--  net/mac80211/wme.c | 170
-rw-r--r--  net/mac80211/wme.h | 6
-rw-r--r--  net/netfilter/nf_conntrack_core.c | 2
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c | 1
-rw-r--r--  net/netfilter/nf_conntrack_proto_tcp.c | 4
-rw-r--r--  net/netfilter/nfnetlink_log.c | 8
-rw-r--r--  net/netfilter/x_tables.c | 199
-rw-r--r--  net/netfilter/xt_recent.c | 2
-rw-r--r--  net/netlink/af_netlink.c | 55
-rw-r--r--  net/netrom/af_netrom.c | 11
-rw-r--r--  net/packet/af_packet.c | 5
-rw-r--r--  net/phonet/af_phonet.c | 3
-rw-r--r--  net/phonet/pn_netlink.c | 5
-rw-r--r--  net/rds/Kconfig | 14
-rw-r--r--  net/rds/Makefile | 14
-rw-r--r--  net/rds/af_rds.c | 586
-rw-r--r--  net/rds/bind.c | 199
-rw-r--r--  net/rds/cong.c | 404
-rw-r--r--  net/rds/connection.c | 487
-rw-r--r--  net/rds/ib.c | 323
-rw-r--r--  net/rds/ib.h | 367
-rw-r--r--  net/rds/ib_cm.c | 726
-rw-r--r--  net/rds/ib_rdma.c | 641
-rw-r--r--  net/rds/ib_recv.c | 869
-rw-r--r--  net/rds/ib_ring.c | 168
-rw-r--r--  net/rds/ib_send.c | 874
-rw-r--r--  net/rds/ib_stats.c | 95
-rw-r--r--  net/rds/ib_sysctl.c | 137
-rw-r--r--  net/rds/info.c | 241
-rw-r--r--  net/rds/info.h | 30
-rw-r--r--  net/rds/iw.c | 333
-rw-r--r--  net/rds/iw.h | 395
-rw-r--r--  net/rds/iw_cm.c | 750
-rw-r--r--  net/rds/iw_rdma.c | 888
-rw-r--r--  net/rds/iw_recv.c | 869
-rw-r--r--  net/rds/iw_ring.c | 169
-rw-r--r--  net/rds/iw_send.c | 975
-rw-r--r--  net/rds/iw_stats.c | 95
-rw-r--r--  net/rds/iw_sysctl.c | 137
-rw-r--r--  net/rds/loop.c | 188
-rw-r--r--  net/rds/loop.h | 9
-rw-r--r--  net/rds/message.c | 402
-rw-r--r--  net/rds/page.c | 221
-rw-r--r--  net/rds/rdma.c | 679
-rw-r--r--  net/rds/rdma.h | 84
-rw-r--r--  net/rds/rdma_transport.c | 214
-rw-r--r--  net/rds/rdma_transport.h | 28
-rw-r--r--  net/rds/rds.h | 686
-rw-r--r--  net/rds/recv.c | 542
-rw-r--r--  net/rds/send.c | 1003
-rw-r--r--  net/rds/stats.c | 148
-rw-r--r--  net/rds/sysctl.c | 122
-rw-r--r--  net/rds/threads.c | 265
-rw-r--r--  net/rds/transport.c | 117
-rw-r--r--  net/sched/act_police.c | 13
-rw-r--r--  net/sched/sch_cbq.c | 7
-rw-r--r--  net/sched/sch_drr.c | 13
-rw-r--r--  net/sched/sch_hfsc.c | 7
-rw-r--r--  net/sched/sch_htb.c | 7
-rw-r--r--  net/sched/sch_tbf.c | 9
-rw-r--r--  net/sctp/debug.c | 4
-rw-r--r--  net/sctp/endpointola.c | 3
-rw-r--r--  net/sctp/output.c | 5
-rw-r--r--  net/sctp/outqueue.c | 6
-rw-r--r--  net/sctp/protocol.c | 16
-rw-r--r--  net/sctp/sm_make_chunk.c | 33
-rw-r--r--  net/sctp/sm_sideeffect.c | 86
-rw-r--r--  net/sctp/sm_statefuns.c | 22
-rw-r--r--  net/sctp/socket.c | 161
-rw-r--r--  net/sctp/transport.c | 7
-rw-r--r--  net/tipc/bcast.c | 4
-rw-r--r--  net/tipc/bcast.h | 2
-rw-r--r--  net/tipc/dbg.c | 2
-rw-r--r--  net/tipc/node.c | 2
-rw-r--r--  net/unix/af_unix.c | 3
-rw-r--r--  net/wanrouter/wanmain.c | 8
-rw-r--r--  net/wanrouter/wanproc.c | 2
-rw-r--r--  net/wireless/Kconfig | 10
-rw-r--r--  net/wireless/core.c | 116
-rw-r--r--  net/wireless/core.h | 39
-rw-r--r--  net/wireless/lib80211_crypt_ccmp.c | 2
-rw-r--r--  net/wireless/lib80211_crypt_tkip.c | 4
-rw-r--r--  net/wireless/nl80211.c | 148
-rw-r--r--  net/wireless/nl80211.h | 9
-rw-r--r--  net/wireless/reg.c | 1034
-rw-r--r--  net/wireless/reg.h | 36
-rw-r--r--  net/wireless/scan.c | 64
-rw-r--r--  net/wireless/sysfs.c | 9
-rw-r--r--  net/wireless/wext-compat.c | 97
-rw-r--r--  net/x25/af_x25.c | 13
-rw-r--r--  net/xfrm/xfrm_state.c | 90
219 files changed, 21713 insertions(+), 3691 deletions(-)
diff --git a/net/802/psnap.c b/net/802/psnap.c
index 6ed711748f26..6fea0750662b 100644
--- a/net/802/psnap.c
+++ b/net/802/psnap.c
@@ -29,7 +29,7 @@ static struct llc_sap *snap_sap;
29/* 29/*
30 * Find a snap client by matching the 5 bytes. 30 * Find a snap client by matching the 5 bytes.
31 */ 31 */
32static struct datalink_proto *find_snap_client(unsigned char *desc) 32static struct datalink_proto *find_snap_client(const unsigned char *desc)
33{ 33{
34 struct datalink_proto *proto = NULL, *p; 34 struct datalink_proto *proto = NULL, *p;
35 35
@@ -95,15 +95,16 @@ static int snap_request(struct datalink_proto *dl,
95EXPORT_SYMBOL(register_snap_client); 95EXPORT_SYMBOL(register_snap_client);
96EXPORT_SYMBOL(unregister_snap_client); 96EXPORT_SYMBOL(unregister_snap_client);
97 97
98static char snap_err_msg[] __initdata = 98static const char snap_err_msg[] __initconst =
99 KERN_CRIT "SNAP - unable to register with 802.2\n"; 99 KERN_CRIT "SNAP - unable to register with 802.2\n";
100 100
101static int __init snap_init(void) 101static int __init snap_init(void)
102{ 102{
103 snap_sap = llc_sap_open(0xAA, snap_rcv); 103 snap_sap = llc_sap_open(0xAA, snap_rcv);
104 104 if (!snap_sap) {
105 if (!snap_sap)
106 printk(snap_err_msg); 105 printk(snap_err_msg);
106 return -EBUSY;
107 }
107 108
108 return 0; 109 return 0;
109} 110}
@@ -121,7 +122,7 @@ module_exit(snap_exit);
121/* 122/*
122 * Register SNAP clients. We don't yet use this for IP. 123 * Register SNAP clients. We don't yet use this for IP.
123 */ 124 */
124struct datalink_proto *register_snap_client(unsigned char *desc, 125struct datalink_proto *register_snap_client(const unsigned char *desc,
125 int (*rcvfunc)(struct sk_buff *, 126 int (*rcvfunc)(struct sk_buff *,
126 struct net_device *, 127 struct net_device *,
127 struct packet_type *, 128 struct packet_type *,
@@ -136,7 +137,7 @@ struct datalink_proto *register_snap_client(unsigned char *desc,
136 137
137 proto = kmalloc(sizeof(*proto), GFP_ATOMIC); 138 proto = kmalloc(sizeof(*proto), GFP_ATOMIC);
138 if (proto) { 139 if (proto) {
139 memcpy(proto->type, desc,5); 140 memcpy(proto->type, desc, 5);
140 proto->rcvfunc = rcvfunc; 141 proto->rcvfunc = rcvfunc;
141 proto->header_length = 5 + 3; /* snap + 802.2 */ 142 proto->header_length = 5 + 3; /* snap + 802.2 */
142 proto->request = snap_request; 143 proto->request = snap_request;
diff --git a/net/802/tr.c b/net/802/tr.c
index 158150fee462..e7eb13084d71 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -486,6 +486,7 @@ static struct rif_cache *rif_get_idx(loff_t pos)
486} 486}
487 487
488static void *rif_seq_start(struct seq_file *seq, loff_t *pos) 488static void *rif_seq_start(struct seq_file *seq, loff_t *pos)
489 __acquires(&rif_lock)
489{ 490{
490 spin_lock_irq(&rif_lock); 491 spin_lock_irq(&rif_lock);
491 492
@@ -517,6 +518,7 @@ static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
517} 518}
518 519
519static void rif_seq_stop(struct seq_file *seq, void *v) 520static void rif_seq_stop(struct seq_file *seq, void *v)
521 __releases(&rif_lock)
520{ 522{
521 spin_unlock_irq(&rif_lock); 523 spin_unlock_irq(&rif_lock);
522} 524}
@@ -668,3 +670,5 @@ module_init(rif_init);
668 670
669EXPORT_SYMBOL(tr_type_trans); 671EXPORT_SYMBOL(tr_type_trans);
670EXPORT_SYMBOL(alloc_trdev); 672EXPORT_SYMBOL(alloc_trdev);
673
674MODULE_LICENSE("GPL");
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 4163ea65bf41..2b7390e377b3 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -51,7 +51,7 @@ const char vlan_version[] = DRV_VERSION;
51static const char vlan_copyright[] = "Ben Greear <greearb@candelatech.com>"; 51static const char vlan_copyright[] = "Ben Greear <greearb@candelatech.com>";
52static const char vlan_buggyright[] = "David S. Miller <davem@redhat.com>"; 52static const char vlan_buggyright[] = "David S. Miller <davem@redhat.com>";
53 53
54static struct packet_type vlan_packet_type = { 54static struct packet_type vlan_packet_type __read_mostly = {
55 .type = cpu_to_be16(ETH_P_8021Q), 55 .type = cpu_to_be16(ETH_P_8021Q),
56 .func = vlan_skb_recv, /* VLAN receive method */ 56 .func = vlan_skb_recv, /* VLAN receive method */
57}; 57};
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 70435af153f2..654e45f5719d 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -1,12 +1,16 @@
1#include <linux/skbuff.h> 1#include <linux/skbuff.h>
2#include <linux/netdevice.h> 2#include <linux/netdevice.h>
3#include <linux/if_vlan.h> 3#include <linux/if_vlan.h>
4#include <linux/netpoll.h>
4#include "vlan.h" 5#include "vlan.h"
5 6
6/* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). */ 7/* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). */
7int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, 8int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
8 u16 vlan_tci, int polling) 9 u16 vlan_tci, int polling)
9{ 10{
11 if (netpoll_rx(skb))
12 return NET_RX_DROP;
13
10 if (skb_bond_should_drop(skb)) 14 if (skb_bond_should_drop(skb))
11 goto drop; 15 goto drop;
12 16
@@ -94,12 +98,15 @@ static int vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
94 return dev_gro_receive(napi, skb); 98 return dev_gro_receive(napi, skb);
95 99
96drop: 100drop:
97 return 2; 101 return GRO_DROP;
98} 102}
99 103
100int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, 104int vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp,
101 unsigned int vlan_tci, struct sk_buff *skb) 105 unsigned int vlan_tci, struct sk_buff *skb)
102{ 106{
107 if (netpoll_rx_on(skb))
108 return vlan_hwaccel_receive_skb(skb, grp, vlan_tci);
109
103 skb_gro_reset_offset(skb); 110 skb_gro_reset_offset(skb);
104 111
105 return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb); 112 return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb);
@@ -114,6 +121,9 @@ int vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp,
114 if (!skb) 121 if (!skb)
115 return NET_RX_DROP; 122 return NET_RX_DROP;
116 123
124 if (netpoll_rx_on(skb))
125 return vlan_hwaccel_receive_skb(skb, grp, vlan_tci);
126
117 return napi_frags_finish(napi, skb, 127 return napi_frags_finish(napi, skb,
118 vlan_gro_common(napi, grp, vlan_tci, skb)); 128 vlan_gro_common(napi, grp, vlan_tci, skb));
119} 129}
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 4a19acd3a32b..1b34135cf990 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -553,7 +553,7 @@ static int vlan_dev_neigh_setup(struct net_device *dev, struct neigh_parms *pa)
553 int err = 0; 553 int err = 0;
554 554
555 if (netif_device_present(real_dev) && ops->ndo_neigh_setup) 555 if (netif_device_present(real_dev) && ops->ndo_neigh_setup)
556 err = ops->ndo_neigh_setup(dev, pa); 556 err = ops->ndo_neigh_setup(real_dev, pa);
557 557
558 return err; 558 return err;
559} 559}
@@ -639,6 +639,7 @@ static int vlan_dev_init(struct net_device *dev)
639 dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN; 639 dev->hard_header_len = real_dev->hard_header_len + VLAN_HLEN;
640 dev->netdev_ops = &vlan_netdev_ops; 640 dev->netdev_ops = &vlan_netdev_ops;
641 } 641 }
642 netdev_resync_ops(dev);
642 643
643 if (is_vlan_dev(real_dev)) 644 if (is_vlan_dev(real_dev))
644 subclass = 1; 645 subclass = 1;
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index 1df0356f242b..c613ed08a5ee 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -417,7 +417,7 @@ static int p9_fd_write(struct p9_client *client, void *v, int len)
417 oldfs = get_fs(); 417 oldfs = get_fs();
418 set_fs(get_ds()); 418 set_fs(get_ds());
419 /* The cast to a user pointer is valid due to the set_fs() */ 419 /* The cast to a user pointer is valid due to the set_fs() */
420 ret = vfs_write(ts->wr, (void __user *)v, len, &ts->wr->f_pos); 420 ret = vfs_write(ts->wr, (__force void __user *)v, len, &ts->wr->f_pos);
421 set_fs(oldfs); 421 set_fs(oldfs);
422 422
423 if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN) 423 if (ret <= 0 && ret != -ERESTARTSYS && ret != -EAGAIN)
diff --git a/net/Kconfig b/net/Kconfig
index a12bae0e3fe9..93998a9c39c2 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -24,9 +24,6 @@ if NET
24 24
25menu "Networking options" 25menu "Networking options"
26 26
27config COMPAT_NET_DEV_OPS
28 def_bool y
29
30source "net/packet/Kconfig" 27source "net/packet/Kconfig"
31source "net/unix/Kconfig" 28source "net/unix/Kconfig"
32source "net/xfrm/Kconfig" 29source "net/xfrm/Kconfig"
@@ -171,6 +168,7 @@ endif
171 168
172source "net/dccp/Kconfig" 169source "net/dccp/Kconfig"
173source "net/sctp/Kconfig" 170source "net/sctp/Kconfig"
171source "net/rds/Kconfig"
174source "net/tipc/Kconfig" 172source "net/tipc/Kconfig"
175source "net/atm/Kconfig" 173source "net/atm/Kconfig"
176source "net/802/Kconfig" 174source "net/802/Kconfig"
@@ -221,6 +219,17 @@ config NET_TCPPROBE
221 To compile this code as a module, choose M here: the 219 To compile this code as a module, choose M here: the
222 module will be called tcp_probe. 220 module will be called tcp_probe.
223 221
222config NET_DROP_MONITOR
223 boolean "Network packet drop alerting service"
224 depends on INET && EXPERIMENTAL && TRACEPOINTS
225 ---help---
226 This feature provides an alerting service to userspace in the
227 event that packets are discarded in the network stack. Alerts
228 are broadcast via netlink socket to any listening user space
229 process. If you don't need network drop alerts, or if you are ok
230 just checking the various proc files and other utilities for
231 drop statistics, say N here.
232
224endmenu 233endmenu
225 234
226endmenu 235endmenu
diff --git a/net/Makefile b/net/Makefile
index 0fcce89d7169..9e00a55a901b 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -49,6 +49,7 @@ obj-y += 8021q/
49endif 49endif
50obj-$(CONFIG_IP_DCCP) += dccp/ 50obj-$(CONFIG_IP_DCCP) += dccp/
51obj-$(CONFIG_IP_SCTP) += sctp/ 51obj-$(CONFIG_IP_SCTP) += sctp/
52obj-$(CONFIG_RDS) += rds/
52obj-y += wireless/ 53obj-y += wireless/
53obj-$(CONFIG_MAC80211) += mac80211/ 54obj-$(CONFIG_MAC80211) += mac80211/
54obj-$(CONFIG_TIPC) += tipc/ 55obj-$(CONFIG_TIPC) += tipc/
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index 510a6782da8f..3e0671df3a3f 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1860,12 +1860,12 @@ static struct notifier_block ddp_notifier = {
1860 .notifier_call = ddp_device_event, 1860 .notifier_call = ddp_device_event,
1861}; 1861};
1862 1862
1863static struct packet_type ltalk_packet_type = { 1863static struct packet_type ltalk_packet_type __read_mostly = {
1864 .type = cpu_to_be16(ETH_P_LOCALTALK), 1864 .type = cpu_to_be16(ETH_P_LOCALTALK),
1865 .func = ltalk_rcv, 1865 .func = ltalk_rcv,
1866}; 1866};
1867 1867
1868static struct packet_type ppptalk_packet_type = { 1868static struct packet_type ppptalk_packet_type __read_mostly = {
1869 .type = cpu_to_be16(ETH_P_PPPTALK), 1869 .type = cpu_to_be16(ETH_P_PPPTALK),
1870 .func = atalk_rcv, 1870 .func = atalk_rcv,
1871}; 1871};
@@ -1877,7 +1877,7 @@ EXPORT_SYMBOL(aarp_send_ddp);
1877EXPORT_SYMBOL(atrtr_get_dev); 1877EXPORT_SYMBOL(atrtr_get_dev);
1878EXPORT_SYMBOL(atalk_find_dev_addr); 1878EXPORT_SYMBOL(atalk_find_dev_addr);
1879 1879
1880static char atalk_err_snap[] __initdata = 1880static const char atalk_err_snap[] __initconst =
1881 KERN_CRIT "Unable to register DDP with SNAP.\n"; 1881 KERN_CRIT "Unable to register DDP with SNAP.\n";
1882 1882
1883/* Called by proto.c on kernel start up */ 1883/* Called by proto.c on kernel start up */
diff --git a/net/atm/clip.c b/net/atm/clip.c
index da42fd06b61f..3dc0a3a42a57 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -552,10 +552,13 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
552 return error; 552 return error;
553} 553}
554 554
555static const struct net_device_ops clip_netdev_ops = {
556 .ndo_start_xmit = clip_start_xmit,
557};
558
555static void clip_setup(struct net_device *dev) 559static void clip_setup(struct net_device *dev)
556{ 560{
557 dev->hard_start_xmit = clip_start_xmit; 561 dev->netdev_ops = &clip_netdev_ops;
558 /* sg_xmit ... */
559 dev->type = ARPHRD_ATM; 562 dev->type = ARPHRD_ATM;
560 dev->hard_header_len = RFC1483LLC_LEN; 563 dev->hard_header_len = RFC1483LLC_LEN;
561 dev->mtu = RFC1626_MTU; 564 dev->mtu = RFC1626_MTU;
@@ -615,7 +618,7 @@ static int clip_device_event(struct notifier_block *this, unsigned long event,
615 } 618 }
616 619
617 /* ignore non-CLIP devices */ 620 /* ignore non-CLIP devices */
618 if (dev->type != ARPHRD_ATM || dev->hard_start_xmit != clip_start_xmit) 621 if (dev->type != ARPHRD_ATM || dev->netdev_ops != &clip_netdev_ops)
619 return NOTIFY_DONE; 622 return NOTIFY_DONE;
620 623
621 switch (event) { 624 switch (event) {
diff --git a/net/atm/lec.c b/net/atm/lec.c
index c0cba9a037e8..199b6bb79f42 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -502,7 +502,7 @@ static int lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb)
502 priv->lane2_ops = NULL; 502 priv->lane2_ops = NULL;
503 if (priv->lane_version > 1) 503 if (priv->lane_version > 1)
504 priv->lane2_ops = &lane2_ops; 504 priv->lane2_ops = &lane2_ops;
505 if (dev->change_mtu(dev, mesg->content.config.mtu)) 505 if (dev_set_mtu(dev, mesg->content.config.mtu))
506 printk("%s: change_mtu to %d failed\n", dev->name, 506 printk("%s: change_mtu to %d failed\n", dev->name,
507 mesg->content.config.mtu); 507 mesg->content.config.mtu);
508 priv->is_proxy = mesg->content.config.is_proxy; 508 priv->is_proxy = mesg->content.config.is_proxy;
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 039d5cc72c3d..e5bf11453a18 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -286,33 +286,32 @@ static void start_mpc(struct mpoa_client *mpc, struct net_device *dev)
286{ 286{
287 287
288 dprintk("mpoa: (%s) start_mpc:\n", mpc->dev->name); 288 dprintk("mpoa: (%s) start_mpc:\n", mpc->dev->name);
289 if (dev->hard_start_xmit == NULL) { 289 if (!dev->netdev_ops)
290 printk("mpoa: (%s) start_mpc: dev->hard_start_xmit == NULL, not starting\n", 290 printk("mpoa: (%s) start_mpc not starting\n", dev->name);
291 dev->name); 291 else {
292 return; 292 mpc->old_ops = dev->netdev_ops;
293 mpc->new_ops = *mpc->old_ops;
294 mpc->new_ops.ndo_start_xmit = mpc_send_packet;
295 dev->netdev_ops = &mpc->new_ops;
293 } 296 }
294 mpc->old_hard_start_xmit = dev->hard_start_xmit;
295 dev->hard_start_xmit = mpc_send_packet;
296
297 return;
298} 297}
299 298
300static void stop_mpc(struct mpoa_client *mpc) 299static void stop_mpc(struct mpoa_client *mpc)
301{ 300{
302 301 struct net_device *dev = mpc->dev;
303 dprintk("mpoa: (%s) stop_mpc:", mpc->dev->name); 302 dprintk("mpoa: (%s) stop_mpc:", mpc->dev->name);
304 303
305 /* Lets not nullify lec device's dev->hard_start_xmit */ 304 /* Lets not nullify lec device's dev->hard_start_xmit */
306 if (mpc->dev->hard_start_xmit != mpc_send_packet) { 305 if (dev->netdev_ops != &mpc->new_ops) {
307 dprintk(" mpc already stopped, not fatal\n"); 306 dprintk(" mpc already stopped, not fatal\n");
308 return; 307 return;
309 } 308 }
310 dprintk("\n"); 309 dprintk("\n");
311 mpc->dev->hard_start_xmit = mpc->old_hard_start_xmit;
312 mpc->old_hard_start_xmit = NULL;
313 /* close_shortcuts(mpc); ??? FIXME */
314 310
315 return; 311 dev->netdev_ops = mpc->old_ops;
312 mpc->old_ops = NULL;
313
314 /* close_shortcuts(mpc); ??? FIXME */
316} 315}
317 316
318static const char *mpoa_device_type_string(char type) __attribute__ ((unused)); 317static const char *mpoa_device_type_string(char type) __attribute__ ((unused));
@@ -531,7 +530,6 @@ static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc)
531 */ 530 */
532static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev) 531static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev)
533{ 532{
534 int retval;
535 struct mpoa_client *mpc; 533 struct mpoa_client *mpc;
536 struct ethhdr *eth; 534 struct ethhdr *eth;
537 int i = 0; 535 int i = 0;
@@ -561,9 +559,7 @@ static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev)
561 } 559 }
562 560
563 non_ip: 561 non_ip:
564 retval = mpc->old_hard_start_xmit(skb,dev); 562 return mpc->old_ops->ndo_start_xmit(skb,dev);
565
566 return retval;
567} 563}
568 564
569static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) 565static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg)
diff --git a/net/atm/mpc.h b/net/atm/mpc.h
index 24c386c35f57..0919a88bbc70 100644
--- a/net/atm/mpc.h
+++ b/net/atm/mpc.h
@@ -15,7 +15,7 @@ struct mpoa_client {
15 struct mpoa_client *next; 15 struct mpoa_client *next;
16 struct net_device *dev; /* lec in question */ 16 struct net_device *dev; /* lec in question */
17 int dev_num; /* e.g. 2 for lec2 */ 17 int dev_num; /* e.g. 2 for lec2 */
18 int (*old_hard_start_xmit)(struct sk_buff *skb, struct net_device *dev); 18
19 struct atm_vcc *mpoad_vcc; /* control channel to mpoad */ 19 struct atm_vcc *mpoad_vcc; /* control channel to mpoad */
20 uint8_t mps_ctrl_addr[ATM_ESA_LEN]; /* MPS control ATM address */ 20 uint8_t mps_ctrl_addr[ATM_ESA_LEN]; /* MPS control ATM address */
21 uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */ 21 uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */
@@ -31,6 +31,9 @@ struct mpoa_client {
31 uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */ 31 uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */
32 int number_of_mps_macs; /* number of the above MAC addresses */ 32 int number_of_mps_macs; /* number of the above MAC addresses */
33 struct mpc_parameters parameters; /* parameters for this client */ 33 struct mpc_parameters parameters; /* parameters for this client */
34
35 const struct net_device_ops *old_ops;
36 struct net_device_ops new_ops;
34}; 37};
35 38
36 39
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index d127fd3ba5c6..7da5ebb84e97 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1435,6 +1435,11 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
1435 size_t size; 1435 size_t size;
1436 int lv, err, addr_len = msg->msg_namelen; 1436 int lv, err, addr_len = msg->msg_namelen;
1437 1437
1438 /* AX.25 empty data frame has no meaning : don't send */
1439 if (len == 0) {
1440 return (0);
1441 }
1442
1438 if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) 1443 if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
1439 return -EINVAL; 1444 return -EINVAL;
1440 1445
@@ -1529,10 +1534,8 @@ static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock,
1529 dp = ax25->digipeat; 1534 dp = ax25->digipeat;
1530 } 1535 }
1531 1536
1532 SOCK_DEBUG(sk, "AX.25: sendto: Addresses built.\n");
1533
1534 /* Build a packet */ 1537 /* Build a packet */
1535 SOCK_DEBUG(sk, "AX.25: sendto: building packet.\n"); 1538 SOCK_DEBUG(sk, "AX.25: sendto: Addresses built. Building packet.\n");
1536 1539
1537 /* Assume the worst case */ 1540 /* Assume the worst case */
1538 size = len + ax25->ax25_dev->dev->hard_header_len; 1541 size = len + ax25->ax25_dev->dev->hard_header_len;
@@ -1636,6 +1639,13 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock,
1636 skb_reset_transport_header(skb); 1639 skb_reset_transport_header(skb);
1637 copied = skb->len; 1640 copied = skb->len;
1638 1641
1642 /* AX.25 empty data frame has no meaning : ignore it */
1643 if (copied == 0) {
1644 err = copied;
1645 skb_free_datagram(sk, skb);
1646 goto out;
1647 }
1648
1639 if (copied > size) { 1649 if (copied > size) {
1640 copied = size; 1650 copied = size;
1641 msg->msg_flags |= MSG_TRUNC; 1651 msg->msg_flags |= MSG_TRUNC;
@@ -1985,9 +1995,8 @@ static const struct proto_ops ax25_proto_ops = {
1985/* 1995/*
1986 * Called by socket.c on kernel start up 1996 * Called by socket.c on kernel start up
1987 */ 1997 */
1988static struct packet_type ax25_packet_type = { 1998static struct packet_type ax25_packet_type __read_mostly = {
1989 .type = cpu_to_be16(ETH_P_AX25), 1999 .type = cpu_to_be16(ETH_P_AX25),
1990 .dev = NULL, /* All devices */
1991 .func = ax25_kiss_rcv, 2000 .func = ax25_kiss_rcv,
1992}; 2001};
1993 2002
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 744ed3f07ef3..02b9baa1930b 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -41,14 +41,13 @@
41 41
42#include <net/bluetooth/bluetooth.h> 42#include <net/bluetooth/bluetooth.h>
43 43
44#define VERSION "2.14" 44#define VERSION "2.15"
45 45
46/* Bluetooth sockets */ 46/* Bluetooth sockets */
47#define BT_MAX_PROTO 8 47#define BT_MAX_PROTO 8
48static struct net_proto_family *bt_proto[BT_MAX_PROTO]; 48static struct net_proto_family *bt_proto[BT_MAX_PROTO];
49static DEFINE_RWLOCK(bt_proto_lock); 49static DEFINE_RWLOCK(bt_proto_lock);
50 50
51#ifdef CONFIG_DEBUG_LOCK_ALLOC
52static struct lock_class_key bt_lock_key[BT_MAX_PROTO]; 51static struct lock_class_key bt_lock_key[BT_MAX_PROTO];
53static const char *bt_key_strings[BT_MAX_PROTO] = { 52static const char *bt_key_strings[BT_MAX_PROTO] = {
54 "sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP", 53 "sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP",
@@ -86,11 +85,6 @@ static inline void bt_sock_reclassify_lock(struct socket *sock, int proto)
86 bt_slock_key_strings[proto], &bt_slock_key[proto], 85 bt_slock_key_strings[proto], &bt_slock_key[proto],
87 bt_key_strings[proto], &bt_lock_key[proto]); 86 bt_key_strings[proto], &bt_lock_key[proto]);
88} 87}
89#else
90static inline void bt_sock_reclassify_lock(struct socket *sock, int proto)
91{
92}
93#endif
94 88
95int bt_sock_register(int proto, struct net_proto_family *ops) 89int bt_sock_register(int proto, struct net_proto_family *ops)
96{ 90{
@@ -217,7 +211,8 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock)
217 continue; 211 continue;
218 } 212 }
219 213
220 if (sk->sk_state == BT_CONNECTED || !newsock) { 214 if (sk->sk_state == BT_CONNECTED || !newsock ||
215 bt_sk(parent)->defer_setup) {
221 bt_accept_unlink(sk); 216 bt_accept_unlink(sk);
222 if (newsock) 217 if (newsock)
223 sock_graft(sk, newsock); 218 sock_graft(sk, newsock);
@@ -232,7 +227,7 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock)
232EXPORT_SYMBOL(bt_accept_dequeue); 227EXPORT_SYMBOL(bt_accept_dequeue);
233 228
234int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, 229int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
235 struct msghdr *msg, size_t len, int flags) 230 struct msghdr *msg, size_t len, int flags)
236{ 231{
237 int noblock = flags & MSG_DONTWAIT; 232 int noblock = flags & MSG_DONTWAIT;
238 struct sock *sk = sock->sk; 233 struct sock *sk = sock->sk;
@@ -277,7 +272,9 @@ static inline unsigned int bt_accept_poll(struct sock *parent)
277 272
278 list_for_each_safe(p, n, &bt_sk(parent)->accept_q) { 273 list_for_each_safe(p, n, &bt_sk(parent)->accept_q) {
279 sk = (struct sock *) list_entry(p, struct bt_sock, accept_q); 274 sk = (struct sock *) list_entry(p, struct bt_sock, accept_q);
280 if (sk->sk_state == BT_CONNECTED) 275 if (sk->sk_state == BT_CONNECTED ||
276 (bt_sk(parent)->defer_setup &&
277 sk->sk_state == BT_CONNECT2))
281 return POLLIN | POLLRDNORM; 278 return POLLIN | POLLRDNORM;
282 } 279 }
283 280
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index c9cac7719efe..0073ec8495da 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -126,8 +126,7 @@ static inline void cmtp_add_msgpart(struct cmtp_session *session, int id, const
126 126
127 session->reassembly[id] = nskb; 127 session->reassembly[id] = nskb;
128 128
129 if (skb) 129 kfree_skb(skb);
130 kfree_skb(skb);
131} 130}
132 131
133static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *skb) 132static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *skb)
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index a4a789f24c8d..1181db08d9de 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -123,6 +123,8 @@ void hci_add_sco(struct hci_conn *conn, __u16 handle)
123 conn->state = BT_CONNECT; 123 conn->state = BT_CONNECT;
124 conn->out = 1; 124 conn->out = 1;
125 125
126 conn->attempt++;
127
126 cp.handle = cpu_to_le16(handle); 128 cp.handle = cpu_to_le16(handle);
127 cp.pkt_type = cpu_to_le16(conn->pkt_type); 129 cp.pkt_type = cpu_to_le16(conn->pkt_type);
128 130
@@ -139,6 +141,8 @@ void hci_setup_sync(struct hci_conn *conn, __u16 handle)
139 conn->state = BT_CONNECT; 141 conn->state = BT_CONNECT;
140 conn->out = 1; 142 conn->out = 1;
141 143
144 conn->attempt++;
145
142 cp.handle = cpu_to_le16(handle); 146 cp.handle = cpu_to_le16(handle);
143 cp.pkt_type = cpu_to_le16(conn->pkt_type); 147 cp.pkt_type = cpu_to_le16(conn->pkt_type);
144 148
@@ -155,6 +159,7 @@ static void hci_conn_timeout(unsigned long arg)
155{ 159{
156 struct hci_conn *conn = (void *) arg; 160 struct hci_conn *conn = (void *) arg;
157 struct hci_dev *hdev = conn->hdev; 161 struct hci_dev *hdev = conn->hdev;
162 __u8 reason;
158 163
159 BT_DBG("conn %p state %d", conn, conn->state); 164 BT_DBG("conn %p state %d", conn, conn->state);
160 165
@@ -173,7 +178,8 @@ static void hci_conn_timeout(unsigned long arg)
173 break; 178 break;
174 case BT_CONFIG: 179 case BT_CONFIG:
175 case BT_CONNECTED: 180 case BT_CONNECTED:
176 hci_acl_disconn(conn, 0x13); 181 reason = hci_proto_disconn_ind(conn);
182 hci_acl_disconn(conn, reason);
177 break; 183 break;
178 default: 184 default:
179 conn->state = BT_CLOSED; 185 conn->state = BT_CLOSED;
@@ -216,12 +222,13 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst)
216 break; 222 break;
217 case SCO_LINK: 223 case SCO_LINK:
218 if (lmp_esco_capable(hdev)) 224 if (lmp_esco_capable(hdev))
219 conn->pkt_type = hdev->esco_type & SCO_ESCO_MASK; 225 conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
226 (hdev->esco_type & EDR_ESCO_MASK);
220 else 227 else
221 conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK; 228 conn->pkt_type = hdev->pkt_type & SCO_PTYPE_MASK;
222 break; 229 break;
223 case ESCO_LINK: 230 case ESCO_LINK:
224 conn->pkt_type = hdev->esco_type; 231 conn->pkt_type = hdev->esco_type & ~EDR_ESCO_MASK;
225 break; 232 break;
226 } 233 }
227 234
@@ -280,6 +287,8 @@ int hci_conn_del(struct hci_conn *conn)
280 287
281 skb_queue_purge(&conn->data_q); 288 skb_queue_purge(&conn->data_q);
282 289
290 hci_conn_del_sysfs(conn);
291
283 return 0; 292 return 0;
284} 293}
285 294
@@ -325,7 +334,7 @@ EXPORT_SYMBOL(hci_get_route);
325 334
326/* Create SCO or ACL connection. 335/* Create SCO or ACL connection.
327 * Device _must_ be locked */ 336 * Device _must_ be locked */
328struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 auth_type) 337struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 sec_level, __u8 auth_type)
329{ 338{
330 struct hci_conn *acl; 339 struct hci_conn *acl;
331 struct hci_conn *sco; 340 struct hci_conn *sco;
@@ -340,6 +349,7 @@ struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8
340 hci_conn_hold(acl); 349 hci_conn_hold(acl);
341 350
342 if (acl->state == BT_OPEN || acl->state == BT_CLOSED) { 351 if (acl->state == BT_OPEN || acl->state == BT_CLOSED) {
352 acl->sec_level = sec_level;
343 acl->auth_type = auth_type; 353 acl->auth_type = auth_type;
344 hci_acl_connect(acl); 354 hci_acl_connect(acl);
345 } 355 }
@@ -385,51 +395,59 @@ int hci_conn_check_link_mode(struct hci_conn *conn)
385EXPORT_SYMBOL(hci_conn_check_link_mode); 395EXPORT_SYMBOL(hci_conn_check_link_mode);
386 396
387/* Authenticate remote device */ 397/* Authenticate remote device */
388int hci_conn_auth(struct hci_conn *conn) 398static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
389{ 399{
390 BT_DBG("conn %p", conn); 400 BT_DBG("conn %p", conn);
391 401
392 if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) { 402 if (sec_level > conn->sec_level)
393 if (!(conn->auth_type & 0x01)) { 403 conn->sec_level = sec_level;
394 conn->auth_type |= 0x01; 404 else if (conn->link_mode & HCI_LM_AUTH)
395 conn->link_mode &= ~HCI_LM_AUTH;
396 }
397 }
398
399 if (conn->link_mode & HCI_LM_AUTH)
400 return 1; 405 return 1;
401 406
407 conn->auth_type = auth_type;
408
402 if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { 409 if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) {
403 struct hci_cp_auth_requested cp; 410 struct hci_cp_auth_requested cp;
404 cp.handle = cpu_to_le16(conn->handle); 411 cp.handle = cpu_to_le16(conn->handle);
405 hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, 412 hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED,
406 sizeof(cp), &cp); 413 sizeof(cp), &cp);
407 } 414 }
415
408 return 0; 416 return 0;
409} 417}
410EXPORT_SYMBOL(hci_conn_auth);
411 418
412/* Enable encryption */ 419/* Enable security */
413int hci_conn_encrypt(struct hci_conn *conn) 420int hci_conn_security(struct hci_conn *conn, __u8 sec_level, __u8 auth_type)
414{ 421{
415 BT_DBG("conn %p", conn); 422 BT_DBG("conn %p", conn);
416 423
424 if (sec_level == BT_SECURITY_SDP)
425 return 1;
426
427 if (sec_level == BT_SECURITY_LOW) {
428 if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0)
429 return hci_conn_auth(conn, sec_level, auth_type);
430 else
431 return 1;
432 }
433
417 if (conn->link_mode & HCI_LM_ENCRYPT) 434 if (conn->link_mode & HCI_LM_ENCRYPT)
418 return hci_conn_auth(conn); 435 return hci_conn_auth(conn, sec_level, auth_type);
419 436
420 if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) 437 if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend))
421 return 0; 438 return 0;
422 439
423 if (hci_conn_auth(conn)) { 440 if (hci_conn_auth(conn, sec_level, auth_type)) {
424 struct hci_cp_set_conn_encrypt cp; 441 struct hci_cp_set_conn_encrypt cp;
425 cp.handle = cpu_to_le16(conn->handle); 442 cp.handle = cpu_to_le16(conn->handle);
426 cp.encrypt = 1; 443 cp.encrypt = 1;
427 hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT, 444 hci_send_cmd(conn->hdev, HCI_OP_SET_CONN_ENCRYPT,
428 sizeof(cp), &cp); 445 sizeof(cp), &cp);
429 } 446 }
447
430 return 0; 448 return 0;
431} 449}
432EXPORT_SYMBOL(hci_conn_encrypt); 450EXPORT_SYMBOL(hci_conn_security);
433 451
434/* Change link key */ 452/* Change link key */
435int hci_conn_change_link_key(struct hci_conn *conn) 453int hci_conn_change_link_key(struct hci_conn *conn)
@@ -442,12 +460,13 @@ int hci_conn_change_link_key(struct hci_conn *conn)
442 hci_send_cmd(conn->hdev, HCI_OP_CHANGE_CONN_LINK_KEY, 460 hci_send_cmd(conn->hdev, HCI_OP_CHANGE_CONN_LINK_KEY,
443 sizeof(cp), &cp); 461 sizeof(cp), &cp);
444 } 462 }
463
445 return 0; 464 return 0;
446} 465}
447EXPORT_SYMBOL(hci_conn_change_link_key); 466EXPORT_SYMBOL(hci_conn_change_link_key);
448 467
449/* Switch role */ 468/* Switch role */
450int hci_conn_switch_role(struct hci_conn *conn, uint8_t role) 469int hci_conn_switch_role(struct hci_conn *conn, __u8 role)
451{ 470{
452 BT_DBG("conn %p", conn); 471 BT_DBG("conn %p", conn);
453 472
@@ -460,6 +479,7 @@ int hci_conn_switch_role(struct hci_conn *conn, uint8_t role)
460 cp.role = role; 479 cp.role = role;
461 hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp); 480 hci_send_cmd(conn->hdev, HCI_OP_SWITCH_ROLE, sizeof(cp), &cp);
462 } 481 }
482
463 return 0; 483 return 0;
464} 484}
465EXPORT_SYMBOL(hci_conn_switch_role); 485EXPORT_SYMBOL(hci_conn_switch_role);
@@ -542,9 +562,7 @@ void hci_conn_hash_flush(struct hci_dev *hdev)
542 562
543 c->state = BT_CLOSED; 563 c->state = BT_CLOSED;
544 564
545 hci_conn_del_sysfs(c); 565 hci_proto_disconn_cfm(c, 0x16);
546
547 hci_proto_disconn_ind(c, 0x16);
548 hci_conn_del(c); 566 hci_conn_del(c);
549 } 567 }
550} 568}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index ba78cc1eb8d9..cd061510b6bd 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -1565,8 +1565,7 @@ static void hci_cmd_task(unsigned long arg)
1565 1565
1566 /* Send queued commands */ 1566 /* Send queued commands */
1567 if (atomic_read(&hdev->cmd_cnt) && (skb = skb_dequeue(&hdev->cmd_q))) { 1567 if (atomic_read(&hdev->cmd_cnt) && (skb = skb_dequeue(&hdev->cmd_q))) {
1568 if (hdev->sent_cmd) 1568 kfree_skb(hdev->sent_cmd);
1569 kfree_skb(hdev->sent_cmd);
1570 1569
1571 if ((hdev->sent_cmd = skb_clone(skb, GFP_ATOMIC))) { 1570 if ((hdev->sent_cmd = skb_clone(skb, GFP_ATOMIC))) {
1572 atomic_dec(&hdev->cmd_cnt); 1571 atomic_dec(&hdev->cmd_cnt);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index f91ba690f5d2..55534244c3a0 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -484,6 +484,15 @@ static void hci_cc_read_local_features(struct hci_dev *hdev, struct sk_buff *skb
484 if (hdev->features[4] & LMP_EV5) 484 if (hdev->features[4] & LMP_EV5)
485 hdev->esco_type |= (ESCO_EV5); 485 hdev->esco_type |= (ESCO_EV5);
486 486
487 if (hdev->features[5] & LMP_EDR_ESCO_2M)
488 hdev->esco_type |= (ESCO_2EV3);
489
490 if (hdev->features[5] & LMP_EDR_ESCO_3M)
491 hdev->esco_type |= (ESCO_3EV3);
492
493 if (hdev->features[5] & LMP_EDR_3S_ESCO)
494 hdev->esco_type |= (ESCO_2EV5 | ESCO_3EV5);
495
487 BT_DBG("%s features 0x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x", hdev->name, 496 BT_DBG("%s features 0x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x", hdev->name,
488 hdev->features[0], hdev->features[1], 497 hdev->features[0], hdev->features[1],
489 hdev->features[2], hdev->features[3], 498 hdev->features[2], hdev->features[3],
@@ -914,7 +923,8 @@ static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *s
914 if (ev->status) { 923 if (ev->status) {
915 hci_proto_connect_cfm(conn, ev->status); 924 hci_proto_connect_cfm(conn, ev->status);
916 hci_conn_del(conn); 925 hci_conn_del(conn);
917 } 926 } else if (ev->link_type != ACL_LINK)
927 hci_proto_connect_cfm(conn, ev->status);
918 928
919unlock: 929unlock:
920 hci_dev_unlock(hdev); 930 hci_dev_unlock(hdev);
@@ -1009,9 +1019,7 @@ static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff
1009 if (conn) { 1019 if (conn) {
1010 conn->state = BT_CLOSED; 1020 conn->state = BT_CLOSED;
1011 1021
1012 hci_conn_del_sysfs(conn); 1022 hci_proto_disconn_cfm(conn, ev->reason);
1013
1014 hci_proto_disconn_ind(conn, ev->reason);
1015 hci_conn_del(conn); 1023 hci_conn_del(conn);
1016 } 1024 }
1017 1025
@@ -1600,7 +1608,8 @@ static inline void hci_remote_ext_features_evt(struct hci_dev *hdev, struct sk_b
1600 1608
1601 if (conn->state == BT_CONFIG) { 1609 if (conn->state == BT_CONFIG) {
1602 if (!ev->status && hdev->ssp_mode > 0 && 1610 if (!ev->status && hdev->ssp_mode > 0 &&
1603 conn->ssp_mode > 0 && conn->out) { 1611 conn->ssp_mode > 0 && conn->out &&
1612 conn->sec_level != BT_SECURITY_SDP) {
1604 struct hci_cp_auth_requested cp; 1613 struct hci_cp_auth_requested cp;
1605 cp.handle = ev->handle; 1614 cp.handle = ev->handle;
1606 hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED, 1615 hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED,
@@ -1637,6 +1646,13 @@ static inline void hci_sync_conn_complete_evt(struct hci_dev *hdev, struct sk_bu
1637 conn->type = SCO_LINK; 1646 conn->type = SCO_LINK;
1638 } 1647 }
1639 1648
1649 if (conn->out && ev->status == 0x1c && conn->attempt < 2) {
1650 conn->pkt_type = (hdev->esco_type & SCO_ESCO_MASK) |
1651 (hdev->esco_type & EDR_ESCO_MASK);
1652 hci_setup_sync(conn, conn->link->handle);
1653 goto unlock;
1654 }
1655
1640 if (!ev->status) { 1656 if (!ev->status) {
1641 conn->handle = __le16_to_cpu(ev->handle); 1657 conn->handle = __le16_to_cpu(ev->handle);
1642 conn->state = BT_CONNECTED; 1658 conn->state = BT_CONNECTED;
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index b93748e224ff..ca4d3b40d5ce 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -50,9 +50,10 @@
50#include <net/bluetooth/hci_core.h> 50#include <net/bluetooth/hci_core.h>
51#include <net/bluetooth/l2cap.h> 51#include <net/bluetooth/l2cap.h>
52 52
53#define VERSION "2.11" 53#define VERSION "2.13"
54 54
55static u32 l2cap_feat_mask = 0x0000; 55static u32 l2cap_feat_mask = 0x0080;
56static u8 l2cap_fixed_chan[8] = { 0x02, };
56 57
57static const struct proto_ops l2cap_sock_ops; 58static const struct proto_ops l2cap_sock_ops;
58 59
@@ -77,9 +78,10 @@ static void l2cap_sock_timeout(unsigned long arg)
77 78
78 bh_lock_sock(sk); 79 bh_lock_sock(sk);
79 80
80 if (sk->sk_state == BT_CONNECT && 81 if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONFIG)
81 (l2cap_pi(sk)->link_mode & (L2CAP_LM_AUTH | 82 reason = ECONNREFUSED;
82 L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE))) 83 else if (sk->sk_state == BT_CONNECT &&
84 l2cap_pi(sk)->sec_level != BT_SECURITY_SDP)
83 reason = ECONNREFUSED; 85 reason = ECONNREFUSED;
84 else 86 else
85 reason = ETIMEDOUT; 87 reason = ETIMEDOUT;
@@ -204,6 +206,8 @@ static void __l2cap_chan_add(struct l2cap_conn *conn, struct sock *sk, struct so
204 206
205 BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn, l2cap_pi(sk)->psm, l2cap_pi(sk)->dcid); 207 BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn, l2cap_pi(sk)->psm, l2cap_pi(sk)->dcid);
206 208
209 conn->disc_reason = 0x13;
210
207 l2cap_pi(sk)->conn = conn; 211 l2cap_pi(sk)->conn = conn;
208 212
209 if (sk->sk_type == SOCK_SEQPACKET) { 213 if (sk->sk_type == SOCK_SEQPACKET) {
@@ -259,18 +263,35 @@ static void l2cap_chan_del(struct sock *sk, int err)
259} 263}
260 264
261/* Service level security */ 265/* Service level security */
262static inline int l2cap_check_link_mode(struct sock *sk) 266static inline int l2cap_check_security(struct sock *sk)
263{ 267{
264 struct l2cap_conn *conn = l2cap_pi(sk)->conn; 268 struct l2cap_conn *conn = l2cap_pi(sk)->conn;
269 __u8 auth_type;
265 270
266 if ((l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT) || 271 if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) {
267 (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE)) 272 if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH)
268 return hci_conn_encrypt(conn->hcon); 273 auth_type = HCI_AT_NO_BONDING_MITM;
274 else
275 auth_type = HCI_AT_NO_BONDING;
269 276
270 if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH) 277 if (l2cap_pi(sk)->sec_level == BT_SECURITY_LOW)
271 return hci_conn_auth(conn->hcon); 278 l2cap_pi(sk)->sec_level = BT_SECURITY_SDP;
279 } else {
280 switch (l2cap_pi(sk)->sec_level) {
281 case BT_SECURITY_HIGH:
282 auth_type = HCI_AT_GENERAL_BONDING_MITM;
283 break;
284 case BT_SECURITY_MEDIUM:
285 auth_type = HCI_AT_GENERAL_BONDING;
286 break;
287 default:
288 auth_type = HCI_AT_NO_BONDING;
289 break;
290 }
291 }
272 292
273 return 1; 293 return hci_conn_security(conn->hcon, l2cap_pi(sk)->sec_level,
294 auth_type);
274} 295}
275 296
276static inline u8 l2cap_get_ident(struct l2cap_conn *conn) 297static inline u8 l2cap_get_ident(struct l2cap_conn *conn)
@@ -312,7 +333,10 @@ static void l2cap_do_start(struct sock *sk)
312 struct l2cap_conn *conn = l2cap_pi(sk)->conn; 333 struct l2cap_conn *conn = l2cap_pi(sk)->conn;
313 334
314 if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) { 335 if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) {
315 if (l2cap_check_link_mode(sk)) { 336 if (!(conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE))
337 return;
338
339 if (l2cap_check_security(sk)) {
316 struct l2cap_conn_req req; 340 struct l2cap_conn_req req;
317 req.scid = cpu_to_le16(l2cap_pi(sk)->scid); 341 req.scid = cpu_to_le16(l2cap_pi(sk)->scid);
318 req.psm = l2cap_pi(sk)->psm; 342 req.psm = l2cap_pi(sk)->psm;
@@ -356,7 +380,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
356 } 380 }
357 381
358 if (sk->sk_state == BT_CONNECT) { 382 if (sk->sk_state == BT_CONNECT) {
359 if (l2cap_check_link_mode(sk)) { 383 if (l2cap_check_security(sk)) {
360 struct l2cap_conn_req req; 384 struct l2cap_conn_req req;
361 req.scid = cpu_to_le16(l2cap_pi(sk)->scid); 385 req.scid = cpu_to_le16(l2cap_pi(sk)->scid);
362 req.psm = l2cap_pi(sk)->psm; 386 req.psm = l2cap_pi(sk)->psm;
@@ -371,10 +395,18 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
371 rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); 395 rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid);
372 rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); 396 rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid);
373 397
374 if (l2cap_check_link_mode(sk)) { 398 if (l2cap_check_security(sk)) {
375 sk->sk_state = BT_CONFIG; 399 if (bt_sk(sk)->defer_setup) {
376 rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS); 400 struct sock *parent = bt_sk(sk)->parent;
377 rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); 401 rsp.result = cpu_to_le16(L2CAP_CR_PEND);
402 rsp.status = cpu_to_le16(L2CAP_CS_AUTHOR_PEND);
403 parent->sk_data_ready(parent, 0);
404
405 } else {
406 sk->sk_state = BT_CONFIG;
407 rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
408 rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
409 }
378 } else { 410 } else {
379 rsp.result = cpu_to_le16(L2CAP_CR_PEND); 411 rsp.result = cpu_to_le16(L2CAP_CR_PEND);
380 rsp.status = cpu_to_le16(L2CAP_CS_AUTHEN_PEND); 412 rsp.status = cpu_to_le16(L2CAP_CS_AUTHEN_PEND);
@@ -426,7 +458,7 @@ static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err)
426 read_lock(&l->lock); 458 read_lock(&l->lock);
427 459
428 for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { 460 for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
429 if (l2cap_pi(sk)->link_mode & L2CAP_LM_RELIABLE) 461 if (l2cap_pi(sk)->force_reliable)
430 sk->sk_err = err; 462 sk->sk_err = err;
431 } 463 }
432 464
@@ -437,6 +469,7 @@ static void l2cap_info_timeout(unsigned long arg)
437{ 469{
438 struct l2cap_conn *conn = (void *) arg; 470 struct l2cap_conn *conn = (void *) arg;
439 471
472 conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
440 conn->info_ident = 0; 473 conn->info_ident = 0;
441 474
442 l2cap_conn_start(conn); 475 l2cap_conn_start(conn);
@@ -470,6 +503,8 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status)
470 spin_lock_init(&conn->lock); 503 spin_lock_init(&conn->lock);
471 rwlock_init(&conn->chan_list.lock); 504 rwlock_init(&conn->chan_list.lock);
472 505
506 conn->disc_reason = 0x13;
507
473 return conn; 508 return conn;
474} 509}
475 510
@@ -483,8 +518,7 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
483 518
484 BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); 519 BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
485 520
486 if (conn->rx_skb) 521 kfree_skb(conn->rx_skb);
487 kfree_skb(conn->rx_skb);
488 522
489 /* Kill channels */ 523 /* Kill channels */
490 while ((sk = conn->chan_list.head)) { 524 while ((sk = conn->chan_list.head)) {
@@ -608,7 +642,6 @@ static void __l2cap_sock_close(struct sock *sk, int reason)
608 642
609 case BT_CONNECTED: 643 case BT_CONNECTED:
610 case BT_CONFIG: 644 case BT_CONFIG:
611 case BT_CONNECT2:
612 if (sk->sk_type == SOCK_SEQPACKET) { 645 if (sk->sk_type == SOCK_SEQPACKET) {
613 struct l2cap_conn *conn = l2cap_pi(sk)->conn; 646 struct l2cap_conn *conn = l2cap_pi(sk)->conn;
614 struct l2cap_disconn_req req; 647 struct l2cap_disconn_req req;
@@ -624,6 +657,27 @@ static void __l2cap_sock_close(struct sock *sk, int reason)
624 l2cap_chan_del(sk, reason); 657 l2cap_chan_del(sk, reason);
625 break; 658 break;
626 659
660 case BT_CONNECT2:
661 if (sk->sk_type == SOCK_SEQPACKET) {
662 struct l2cap_conn *conn = l2cap_pi(sk)->conn;
663 struct l2cap_conn_rsp rsp;
664 __u16 result;
665
666 if (bt_sk(sk)->defer_setup)
667 result = L2CAP_CR_SEC_BLOCK;
668 else
669 result = L2CAP_CR_BAD_PSM;
670
671 rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid);
672 rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid);
673 rsp.result = cpu_to_le16(result);
674 rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
675 l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
676 L2CAP_CONN_RSP, sizeof(rsp), &rsp);
677 } else
678 l2cap_chan_del(sk, reason);
679 break;
680
627 case BT_CONNECT: 681 case BT_CONNECT:
628 case BT_DISCONN: 682 case BT_DISCONN:
629 l2cap_chan_del(sk, reason); 683 l2cap_chan_del(sk, reason);
@@ -653,13 +707,19 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent)
653 707
654 if (parent) { 708 if (parent) {
655 sk->sk_type = parent->sk_type; 709 sk->sk_type = parent->sk_type;
710 bt_sk(sk)->defer_setup = bt_sk(parent)->defer_setup;
711
656 pi->imtu = l2cap_pi(parent)->imtu; 712 pi->imtu = l2cap_pi(parent)->imtu;
657 pi->omtu = l2cap_pi(parent)->omtu; 713 pi->omtu = l2cap_pi(parent)->omtu;
658 pi->link_mode = l2cap_pi(parent)->link_mode; 714 pi->sec_level = l2cap_pi(parent)->sec_level;
715 pi->role_switch = l2cap_pi(parent)->role_switch;
716 pi->force_reliable = l2cap_pi(parent)->force_reliable;
659 } else { 717 } else {
660 pi->imtu = L2CAP_DEFAULT_MTU; 718 pi->imtu = L2CAP_DEFAULT_MTU;
661 pi->omtu = 0; 719 pi->omtu = 0;
662 pi->link_mode = 0; 720 pi->sec_level = BT_SECURITY_LOW;
721 pi->role_switch = 0;
722 pi->force_reliable = 0;
663 } 723 }
664 724
665 /* Default config options */ 725 /* Default config options */
@@ -723,17 +783,24 @@ static int l2cap_sock_create(struct net *net, struct socket *sock, int protocol)
723 return 0; 783 return 0;
724} 784}
725 785
726static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) 786static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
727{ 787{
728 struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
729 struct sock *sk = sock->sk; 788 struct sock *sk = sock->sk;
730 int err = 0; 789 struct sockaddr_l2 la;
790 int len, err = 0;
731 791
732 BT_DBG("sk %p, %s %d", sk, batostr(&la->l2_bdaddr), la->l2_psm); 792 BT_DBG("sk %p", sk);
733 793
734 if (!addr || addr->sa_family != AF_BLUETOOTH) 794 if (!addr || addr->sa_family != AF_BLUETOOTH)
735 return -EINVAL; 795 return -EINVAL;
736 796
797 memset(&la, 0, sizeof(la));
798 len = min_t(unsigned int, sizeof(la), alen);
799 memcpy(&la, addr, len);
800
801 if (la.l2_cid)
802 return -EINVAL;
803
737 lock_sock(sk); 804 lock_sock(sk);
738 805
739 if (sk->sk_state != BT_OPEN) { 806 if (sk->sk_state != BT_OPEN) {
@@ -741,7 +808,7 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_
741 goto done; 808 goto done;
742 } 809 }
743 810
744 if (la->l2_psm && btohs(la->l2_psm) < 0x1001 && 811 if (la.l2_psm && btohs(la.l2_psm) < 0x1001 &&
745 !capable(CAP_NET_BIND_SERVICE)) { 812 !capable(CAP_NET_BIND_SERVICE)) {
746 err = -EACCES; 813 err = -EACCES;
747 goto done; 814 goto done;
@@ -749,14 +816,17 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_
749 816
750 write_lock_bh(&l2cap_sk_list.lock); 817 write_lock_bh(&l2cap_sk_list.lock);
751 818
752 if (la->l2_psm && __l2cap_get_sock_by_addr(la->l2_psm, &la->l2_bdaddr)) { 819 if (la.l2_psm && __l2cap_get_sock_by_addr(la.l2_psm, &la.l2_bdaddr)) {
753 err = -EADDRINUSE; 820 err = -EADDRINUSE;
754 } else { 821 } else {
755 /* Save source address */ 822 /* Save source address */
756 bacpy(&bt_sk(sk)->src, &la->l2_bdaddr); 823 bacpy(&bt_sk(sk)->src, &la.l2_bdaddr);
757 l2cap_pi(sk)->psm = la->l2_psm; 824 l2cap_pi(sk)->psm = la.l2_psm;
758 l2cap_pi(sk)->sport = la->l2_psm; 825 l2cap_pi(sk)->sport = la.l2_psm;
759 sk->sk_state = BT_BOUND; 826 sk->sk_state = BT_BOUND;
827
828 if (btohs(la.l2_psm) == 0x0001 || btohs(la.l2_psm) == 0x0003)
829 l2cap_pi(sk)->sec_level = BT_SECURITY_SDP;
760 } 830 }
761 831
762 write_unlock_bh(&l2cap_sk_list.lock); 832 write_unlock_bh(&l2cap_sk_list.lock);
@@ -776,7 +846,8 @@ static int l2cap_do_connect(struct sock *sk)
776 __u8 auth_type; 846 __u8 auth_type;
777 int err = 0; 847 int err = 0;
778 848
779 BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst), l2cap_pi(sk)->psm); 849 BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst),
850 l2cap_pi(sk)->psm);
780 851
781 if (!(hdev = hci_get_route(dst, src))) 852 if (!(hdev = hci_get_route(dst, src)))
782 return -EHOSTUNREACH; 853 return -EHOSTUNREACH;
@@ -785,21 +856,42 @@ static int l2cap_do_connect(struct sock *sk)
785 856
786 err = -ENOMEM; 857 err = -ENOMEM;
787 858
788 if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH || 859 if (sk->sk_type == SOCK_RAW) {
789 l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT || 860 switch (l2cap_pi(sk)->sec_level) {
790 l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE) { 861 case BT_SECURITY_HIGH:
791 if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) 862 auth_type = HCI_AT_DEDICATED_BONDING_MITM;
863 break;
864 case BT_SECURITY_MEDIUM:
865 auth_type = HCI_AT_DEDICATED_BONDING;
866 break;
867 default:
868 auth_type = HCI_AT_NO_BONDING;
869 break;
870 }
871 } else if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) {
872 if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH)
792 auth_type = HCI_AT_NO_BONDING_MITM; 873 auth_type = HCI_AT_NO_BONDING_MITM;
793 else 874 else
794 auth_type = HCI_AT_GENERAL_BONDING_MITM;
795 } else {
796 if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001))
797 auth_type = HCI_AT_NO_BONDING; 875 auth_type = HCI_AT_NO_BONDING;
798 else 876
877 if (l2cap_pi(sk)->sec_level == BT_SECURITY_LOW)
878 l2cap_pi(sk)->sec_level = BT_SECURITY_SDP;
879 } else {
880 switch (l2cap_pi(sk)->sec_level) {
881 case BT_SECURITY_HIGH:
882 auth_type = HCI_AT_GENERAL_BONDING_MITM;
883 break;
884 case BT_SECURITY_MEDIUM:
799 auth_type = HCI_AT_GENERAL_BONDING; 885 auth_type = HCI_AT_GENERAL_BONDING;
886 break;
887 default:
888 auth_type = HCI_AT_NO_BONDING;
889 break;
890 }
800 } 891 }
801 892
802 hcon = hci_connect(hdev, ACL_LINK, dst, auth_type); 893 hcon = hci_connect(hdev, ACL_LINK, dst,
894 l2cap_pi(sk)->sec_level, auth_type);
803 if (!hcon) 895 if (!hcon)
804 goto done; 896 goto done;
805 897
@@ -835,20 +927,25 @@ done:
835 927
836static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) 928static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags)
837{ 929{
838 struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr;
839 struct sock *sk = sock->sk; 930 struct sock *sk = sock->sk;
840 int err = 0; 931 struct sockaddr_l2 la;
841 932 int len, err = 0;
842 lock_sock(sk);
843 933
844 BT_DBG("sk %p", sk); 934 BT_DBG("sk %p", sk);
845 935
846 if (addr->sa_family != AF_BLUETOOTH || alen < sizeof(struct sockaddr_l2)) { 936 if (!addr || addr->sa_family != AF_BLUETOOTH)
847 err = -EINVAL; 937 return -EINVAL;
848 goto done; 938
849 } 939 memset(&la, 0, sizeof(la));
940 len = min_t(unsigned int, sizeof(la), alen);
941 memcpy(&la, addr, len);
942
943 if (la.l2_cid)
944 return -EINVAL;
945
946 lock_sock(sk);
850 947
851 if (sk->sk_type == SOCK_SEQPACKET && !la->l2_psm) { 948 if (sk->sk_type == SOCK_SEQPACKET && !la.l2_psm) {
852 err = -EINVAL; 949 err = -EINVAL;
853 goto done; 950 goto done;
854 } 951 }
@@ -875,8 +972,8 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int al
875 } 972 }
876 973
877 /* Set destination address and psm */ 974 /* Set destination address and psm */
878 bacpy(&bt_sk(sk)->dst, &la->l2_bdaddr); 975 bacpy(&bt_sk(sk)->dst, &la.l2_bdaddr);
879 l2cap_pi(sk)->psm = la->l2_psm; 976 l2cap_pi(sk)->psm = la.l2_psm;
880 977
881 if ((err = l2cap_do_connect(sk))) 978 if ((err = l2cap_do_connect(sk)))
882 goto done; 979 goto done;
@@ -1000,12 +1097,16 @@ static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *l
1000 addr->sa_family = AF_BLUETOOTH; 1097 addr->sa_family = AF_BLUETOOTH;
1001 *len = sizeof(struct sockaddr_l2); 1098 *len = sizeof(struct sockaddr_l2);
1002 1099
1003 if (peer) 1100 if (peer) {
1101 la->l2_psm = l2cap_pi(sk)->psm;
1004 bacpy(&la->l2_bdaddr, &bt_sk(sk)->dst); 1102 bacpy(&la->l2_bdaddr, &bt_sk(sk)->dst);
1005 else 1103 la->l2_cid = htobs(l2cap_pi(sk)->dcid);
1104 } else {
1105 la->l2_psm = l2cap_pi(sk)->sport;
1006 bacpy(&la->l2_bdaddr, &bt_sk(sk)->src); 1106 bacpy(&la->l2_bdaddr, &bt_sk(sk)->src);
1107 la->l2_cid = htobs(l2cap_pi(sk)->scid);
1108 }
1007 1109
1008 la->l2_psm = l2cap_pi(sk)->psm;
1009 return 0; 1110 return 0;
1010} 1111}
1011 1112
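As context (not part of the diff): the reworked bind and connect paths above copy the user-supplied sockaddr with a bounded memcpy and reject a non-zero l2_cid, so userspace is expected to zero the whole structure before filling it in. A hedged sketch of a client connect under that convention, assuming the usual struct sockaddr_l2 layout (l2_family, l2_psm, l2_bdaddr, l2_cid) and the BlueZ userspace headers; the helper name is hypothetical:

    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <bluetooth/bluetooth.h>
    #include <bluetooth/l2cap.h>

    /* Hypothetical helper: connect an L2CAP socket to "dst" on "psm". */
    static int l2cap_client_connect(const bdaddr_t *dst, unsigned short psm)
    {
            struct sockaddr_l2 addr;
            int fd;

            fd = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP);
            if (fd < 0)
                    return -1;

            memset(&addr, 0, sizeof(addr));        /* leaves l2_cid == 0 */
            addr.l2_family = AF_BLUETOOTH;
            addr.l2_psm    = htobs(psm);
            bacpy(&addr.l2_bdaddr, dst);

            if (connect(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
                    close(fd);
                    return -1;
            }
            return fd;
    }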
@@ -1106,11 +1207,38 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms
1106 return err; 1207 return err;
1107} 1208}
1108 1209
1109static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) 1210static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len, int flags)
1211{
1212 struct sock *sk = sock->sk;
1213
1214 lock_sock(sk);
1215
1216 if (sk->sk_state == BT_CONNECT2 && bt_sk(sk)->defer_setup) {
1217 struct l2cap_conn_rsp rsp;
1218
1219 sk->sk_state = BT_CONFIG;
1220
1221 rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid);
1222 rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid);
1223 rsp.result = cpu_to_le16(L2CAP_CR_SUCCESS);
1224 rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO);
1225 l2cap_send_cmd(l2cap_pi(sk)->conn, l2cap_pi(sk)->ident,
1226 L2CAP_CONN_RSP, sizeof(rsp), &rsp);
1227
1228 release_sock(sk);
1229 return 0;
1230 }
1231
1232 release_sock(sk);
1233
1234 return bt_sock_recvmsg(iocb, sock, msg, len, flags);
1235}
1236
1237static int l2cap_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, int optlen)
1110{ 1238{
1111 struct sock *sk = sock->sk; 1239 struct sock *sk = sock->sk;
1112 struct l2cap_options opts; 1240 struct l2cap_options opts;
1113 int err = 0, len; 1241 int len, err = 0;
1114 u32 opt; 1242 u32 opt;
1115 1243
1116 BT_DBG("sk %p", sk); 1244 BT_DBG("sk %p", sk);
@@ -1140,7 +1268,15 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch
1140 break; 1268 break;
1141 } 1269 }
1142 1270
1143 l2cap_pi(sk)->link_mode = opt; 1271 if (opt & L2CAP_LM_AUTH)
1272 l2cap_pi(sk)->sec_level = BT_SECURITY_LOW;
1273 if (opt & L2CAP_LM_ENCRYPT)
1274 l2cap_pi(sk)->sec_level = BT_SECURITY_MEDIUM;
1275 if (opt & L2CAP_LM_SECURE)
1276 l2cap_pi(sk)->sec_level = BT_SECURITY_HIGH;
1277
1278 l2cap_pi(sk)->role_switch = (opt & L2CAP_LM_MASTER);
1279 l2cap_pi(sk)->force_reliable = (opt & L2CAP_LM_RELIABLE);
1144 break; 1280 break;
1145 1281
1146 default: 1282 default:
@@ -1152,12 +1288,77 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, ch
1152 return err; 1288 return err;
1153} 1289}
1154 1290
1155static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) 1291static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1292{
1293 struct sock *sk = sock->sk;
1294 struct bt_security sec;
1295 int len, err = 0;
1296 u32 opt;
1297
1298 BT_DBG("sk %p", sk);
1299
1300 if (level == SOL_L2CAP)
1301 return l2cap_sock_setsockopt_old(sock, optname, optval, optlen);
1302
1303 if (level != SOL_BLUETOOTH)
1304 return -ENOPROTOOPT;
1305
1306 lock_sock(sk);
1307
1308 switch (optname) {
1309 case BT_SECURITY:
1310 if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_RAW) {
1311 err = -EINVAL;
1312 break;
1313 }
1314
1315 sec.level = BT_SECURITY_LOW;
1316
1317 len = min_t(unsigned int, sizeof(sec), optlen);
1318 if (copy_from_user((char *) &sec, optval, len)) {
1319 err = -EFAULT;
1320 break;
1321 }
1322
1323 if (sec.level < BT_SECURITY_LOW ||
1324 sec.level > BT_SECURITY_HIGH) {
1325 err = -EINVAL;
1326 break;
1327 }
1328
1329 l2cap_pi(sk)->sec_level = sec.level;
1330 break;
1331
1332 case BT_DEFER_SETUP:
1333 if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
1334 err = -EINVAL;
1335 break;
1336 }
1337
1338 if (get_user(opt, (u32 __user *) optval)) {
1339 err = -EFAULT;
1340 break;
1341 }
1342
1343 bt_sk(sk)->defer_setup = opt;
1344 break;
1345
1346 default:
1347 err = -ENOPROTOOPT;
1348 break;
1349 }
1350
1351 release_sock(sk);
1352 return err;
1353}
1354
1355static int l2cap_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
1156{ 1356{
1157 struct sock *sk = sock->sk; 1357 struct sock *sk = sock->sk;
1158 struct l2cap_options opts; 1358 struct l2cap_options opts;
1159 struct l2cap_conninfo cinfo; 1359 struct l2cap_conninfo cinfo;
1160 int len, err = 0; 1360 int len, err = 0;
1361 u32 opt;
1161 1362
1162 BT_DBG("sk %p", sk); 1363 BT_DBG("sk %p", sk);
1163 1364
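As context (not part of the diff): the new SOL_BLUETOOTH handler above takes a struct bt_security for BT_SECURITY and a u32 flag for BT_DEFER_SETUP. A hedged userspace sketch, assuming bt_security carries a single level field and that the BT_SECURITY_* and option constants are exported to userspace by the matching header change (not shown in this diff):

    #include <stdint.h>
    #include <sys/socket.h>
    #include <bluetooth/bluetooth.h>

    /* Hypothetical snippet: request medium security and deferred setup on a
     * bound or listening L2CAP socket "fd". */
    static int l2cap_set_security(int fd)
    {
            struct bt_security sec = { .level = BT_SECURITY_MEDIUM };
            uint32_t defer = 1;

            if (setsockopt(fd, SOL_BLUETOOTH, BT_SECURITY, &sec, sizeof(sec)) < 0)
                    return -1;

            /* Per the handler above, only valid while the socket is in
             * BT_BOUND or BT_LISTEN state. */
            if (setsockopt(fd, SOL_BLUETOOTH, BT_DEFER_SETUP, &defer, sizeof(defer)) < 0)
                    return -1;

            return 0;
    }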
@@ -1180,12 +1381,36 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, ch
1180 break; 1381 break;
1181 1382
1182 case L2CAP_LM: 1383 case L2CAP_LM:
1183 if (put_user(l2cap_pi(sk)->link_mode, (u32 __user *) optval)) 1384 switch (l2cap_pi(sk)->sec_level) {
1385 case BT_SECURITY_LOW:
1386 opt = L2CAP_LM_AUTH;
1387 break;
1388 case BT_SECURITY_MEDIUM:
1389 opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT;
1390 break;
1391 case BT_SECURITY_HIGH:
1392 opt = L2CAP_LM_AUTH | L2CAP_LM_ENCRYPT |
1393 L2CAP_LM_SECURE;
1394 break;
1395 default:
1396 opt = 0;
1397 break;
1398 }
1399
1400 if (l2cap_pi(sk)->role_switch)
1401 opt |= L2CAP_LM_MASTER;
1402
1403 if (l2cap_pi(sk)->force_reliable)
1404 opt |= L2CAP_LM_RELIABLE;
1405
1406 if (put_user(opt, (u32 __user *) optval))
1184 err = -EFAULT; 1407 err = -EFAULT;
1185 break; 1408 break;
1186 1409
1187 case L2CAP_CONNINFO: 1410 case L2CAP_CONNINFO:
1188 if (sk->sk_state != BT_CONNECTED) { 1411 if (sk->sk_state != BT_CONNECTED &&
1412 !(sk->sk_state == BT_CONNECT2 &&
1413 bt_sk(sk)->defer_setup)) {
1189 err = -ENOTCONN; 1414 err = -ENOTCONN;
1190 break; 1415 break;
1191 } 1416 }
@@ -1208,6 +1433,60 @@ static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, ch
1208 return err; 1433 return err;
1209} 1434}
1210 1435
1436static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
1437{
1438 struct sock *sk = sock->sk;
1439 struct bt_security sec;
1440 int len, err = 0;
1441
1442 BT_DBG("sk %p", sk);
1443
1444 if (level == SOL_L2CAP)
1445 return l2cap_sock_getsockopt_old(sock, optname, optval, optlen);
1446
1447 if (level != SOL_BLUETOOTH)
1448 return -ENOPROTOOPT;
1449
1450 if (get_user(len, optlen))
1451 return -EFAULT;
1452
1453 lock_sock(sk);
1454
1455 switch (optname) {
1456 case BT_SECURITY:
1457 if (sk->sk_type != SOCK_SEQPACKET && sk->sk_type != SOCK_RAW) {
1458 err = -EINVAL;
1459 break;
1460 }
1461
1462 sec.level = l2cap_pi(sk)->sec_level;
1463
1464 len = min_t(unsigned int, len, sizeof(sec));
1465 if (copy_to_user(optval, (char *) &sec, len))
1466 err = -EFAULT;
1467
1468 break;
1469
1470 case BT_DEFER_SETUP:
1471 if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
1472 err = -EINVAL;
1473 break;
1474 }
1475
1476 if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval))
1477 err = -EFAULT;
1478
1479 break;
1480
1481 default:
1482 err = -ENOPROTOOPT;
1483 break;
1484 }
1485
1486 release_sock(sk);
1487 return err;
1488}
1489
1211static int l2cap_sock_shutdown(struct socket *sock, int how) 1490static int l2cap_sock_shutdown(struct socket *sock, int how)
1212{ 1491{
1213 struct sock *sk = sock->sk; 1492 struct sock *sk = sock->sk;
@@ -1270,11 +1549,6 @@ static void l2cap_chan_ready(struct sock *sk)
1270 */ 1549 */
1271 parent->sk_data_ready(parent, 0); 1550 parent->sk_data_ready(parent, 0);
1272 } 1551 }
1273
1274 if (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE) {
1275 struct l2cap_conn *conn = l2cap_pi(sk)->conn;
1276 hci_conn_change_link_key(conn->hcon);
1277 }
1278} 1552}
1279 1553
1280/* Copy frame to all raw sockets on that connection */ 1554/* Copy frame to all raw sockets on that connection */
@@ -1549,8 +1823,11 @@ static inline int l2cap_command_rej(struct l2cap_conn *conn, struct l2cap_cmd_hd
1549 1823
1550 if ((conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) && 1824 if ((conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) &&
1551 cmd->ident == conn->info_ident) { 1825 cmd->ident == conn->info_ident) {
1552 conn->info_ident = 0;
1553 del_timer(&conn->info_timer); 1826 del_timer(&conn->info_timer);
1827
1828 conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
1829 conn->info_ident = 0;
1830
1554 l2cap_conn_start(conn); 1831 l2cap_conn_start(conn);
1555 } 1832 }
1556 1833
@@ -1580,6 +1857,7 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd
1580 /* Check if the ACL is secure enough (if not SDP) */ 1857 /* Check if the ACL is secure enough (if not SDP) */
1581 if (psm != cpu_to_le16(0x0001) && 1858 if (psm != cpu_to_le16(0x0001) &&
1582 !hci_conn_check_link_mode(conn->hcon)) { 1859 !hci_conn_check_link_mode(conn->hcon)) {
1860 conn->disc_reason = 0x05;
1583 result = L2CAP_CR_SEC_BLOCK; 1861 result = L2CAP_CR_SEC_BLOCK;
1584 goto response; 1862 goto response;
1585 } 1863 }
@@ -1621,11 +1899,18 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd
1621 1899
1622 l2cap_pi(sk)->ident = cmd->ident; 1900 l2cap_pi(sk)->ident = cmd->ident;
1623 1901
1624 if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) { 1902 if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) {
1625 if (l2cap_check_link_mode(sk)) { 1903 if (l2cap_check_security(sk)) {
1626 sk->sk_state = BT_CONFIG; 1904 if (bt_sk(sk)->defer_setup) {
1627 result = L2CAP_CR_SUCCESS; 1905 sk->sk_state = BT_CONNECT2;
1628 status = L2CAP_CS_NO_INFO; 1906 result = L2CAP_CR_PEND;
1907 status = L2CAP_CS_AUTHOR_PEND;
1908 parent->sk_data_ready(parent, 0);
1909 } else {
1910 sk->sk_state = BT_CONFIG;
1911 result = L2CAP_CR_SUCCESS;
1912 status = L2CAP_CS_NO_INFO;
1913 }
1629 } else { 1914 } else {
1630 sk->sk_state = BT_CONNECT2; 1915 sk->sk_state = BT_CONNECT2;
1631 result = L2CAP_CR_PEND; 1916 result = L2CAP_CR_PEND;
@@ -1695,11 +1980,14 @@ static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hd
1695 l2cap_pi(sk)->dcid = dcid; 1980 l2cap_pi(sk)->dcid = dcid;
1696 l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT; 1981 l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT;
1697 1982
1983 l2cap_pi(sk)->conf_state &= ~L2CAP_CONF_CONNECT_PEND;
1984
1698 l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, 1985 l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ,
1699 l2cap_build_conf_req(sk, req), req); 1986 l2cap_build_conf_req(sk, req), req);
1700 break; 1987 break;
1701 1988
1702 case L2CAP_CR_PEND: 1989 case L2CAP_CR_PEND:
1990 l2cap_pi(sk)->conf_state |= L2CAP_CONF_CONNECT_PEND;
1703 break; 1991 break;
1704 1992
1705 default: 1993 default:
@@ -1908,6 +2196,14 @@ static inline int l2cap_information_req(struct l2cap_conn *conn, struct l2cap_cm
1908 put_unaligned(cpu_to_le32(l2cap_feat_mask), (__le32 *) rsp->data); 2196 put_unaligned(cpu_to_le32(l2cap_feat_mask), (__le32 *) rsp->data);
1909 l2cap_send_cmd(conn, cmd->ident, 2197 l2cap_send_cmd(conn, cmd->ident,
1910 L2CAP_INFO_RSP, sizeof(buf), buf); 2198 L2CAP_INFO_RSP, sizeof(buf), buf);
2199 } else if (type == L2CAP_IT_FIXED_CHAN) {
2200 u8 buf[12];
2201 struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) buf;
2202 rsp->type = cpu_to_le16(L2CAP_IT_FIXED_CHAN);
2203 rsp->result = cpu_to_le16(L2CAP_IR_SUCCESS);
2204 memcpy(buf + 4, l2cap_fixed_chan, 8);
2205 l2cap_send_cmd(conn, cmd->ident,
2206 L2CAP_INFO_RSP, sizeof(buf), buf);
1911 } else { 2207 } else {
1912 struct l2cap_info_rsp rsp; 2208 struct l2cap_info_rsp rsp;
1913 rsp.type = cpu_to_le16(type); 2209 rsp.type = cpu_to_le16(type);
@@ -1929,14 +2225,31 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cm
1929 2225
1930 BT_DBG("type 0x%4.4x result 0x%2.2x", type, result); 2226 BT_DBG("type 0x%4.4x result 0x%2.2x", type, result);
1931 2227
1932 conn->info_ident = 0;
1933
1934 del_timer(&conn->info_timer); 2228 del_timer(&conn->info_timer);
1935 2229
1936 if (type == L2CAP_IT_FEAT_MASK) 2230 if (type == L2CAP_IT_FEAT_MASK) {
1937 conn->feat_mask = get_unaligned_le32(rsp->data); 2231 conn->feat_mask = get_unaligned_le32(rsp->data);
1938 2232
1939 l2cap_conn_start(conn); 2233 if (conn->feat_mask & 0x0080) {
2234 struct l2cap_info_req req;
2235 req.type = cpu_to_le16(L2CAP_IT_FIXED_CHAN);
2236
2237 conn->info_ident = l2cap_get_ident(conn);
2238
2239 l2cap_send_cmd(conn, conn->info_ident,
2240 L2CAP_INFO_REQ, sizeof(req), &req);
2241 } else {
2242 conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
2243 conn->info_ident = 0;
2244
2245 l2cap_conn_start(conn);
2246 }
2247 } else if (type == L2CAP_IT_FIXED_CHAN) {
2248 conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
2249 conn->info_ident = 0;
2250
2251 l2cap_conn_start(conn);
2252 }
1940 2253
1941 return 0; 2254 return 0;
1942} 2255}
@@ -2143,10 +2456,15 @@ static int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type)
2143 continue; 2456 continue;
2144 2457
2145 if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr)) { 2458 if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr)) {
2146 lm1 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode); 2459 lm1 |= HCI_LM_ACCEPT;
2460 if (l2cap_pi(sk)->role_switch)
2461 lm1 |= HCI_LM_MASTER;
2147 exact++; 2462 exact++;
2148 } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) 2463 } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) {
2149 lm2 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode); 2464 lm2 |= HCI_LM_ACCEPT;
2465 if (l2cap_pi(sk)->role_switch)
2466 lm2 |= HCI_LM_MASTER;
2467 }
2150 } 2468 }
2151 read_unlock(&l2cap_sk_list.lock); 2469 read_unlock(&l2cap_sk_list.lock);
2152 2470
@@ -2172,89 +2490,48 @@ static int l2cap_connect_cfm(struct hci_conn *hcon, u8 status)
2172 return 0; 2490 return 0;
2173} 2491}
2174 2492
2175static int l2cap_disconn_ind(struct hci_conn *hcon, u8 reason) 2493static int l2cap_disconn_ind(struct hci_conn *hcon)
2176{ 2494{
2177 BT_DBG("hcon %p reason %d", hcon, reason); 2495 struct l2cap_conn *conn = hcon->l2cap_data;
2178 2496
2179 if (hcon->type != ACL_LINK) 2497 BT_DBG("hcon %p", hcon);
2180 return 0;
2181 2498
2182 l2cap_conn_del(hcon, bt_err(reason)); 2499 if (hcon->type != ACL_LINK || !conn)
2500 return 0x13;
2183 2501
2184 return 0; 2502 return conn->disc_reason;
2185} 2503}
2186 2504
2187static int l2cap_auth_cfm(struct hci_conn *hcon, u8 status) 2505static int l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason)
2188{ 2506{
2189 struct l2cap_chan_list *l; 2507 BT_DBG("hcon %p reason %d", hcon, reason);
2190 struct l2cap_conn *conn = hcon->l2cap_data;
2191 struct sock *sk;
2192 2508
2193 if (!conn) 2509 if (hcon->type != ACL_LINK)
2194 return 0; 2510 return 0;
2195 2511
2196 l = &conn->chan_list; 2512 l2cap_conn_del(hcon, bt_err(reason));
2197
2198 BT_DBG("conn %p", conn);
2199
2200 read_lock(&l->lock);
2201
2202 for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
2203 struct l2cap_pinfo *pi = l2cap_pi(sk);
2204
2205 bh_lock_sock(sk);
2206
2207 if ((pi->link_mode & (L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)) &&
2208 !(hcon->link_mode & HCI_LM_ENCRYPT) &&
2209 !status) {
2210 bh_unlock_sock(sk);
2211 continue;
2212 }
2213
2214 if (sk->sk_state == BT_CONNECT) {
2215 if (!status) {
2216 struct l2cap_conn_req req;
2217 req.scid = cpu_to_le16(l2cap_pi(sk)->scid);
2218 req.psm = l2cap_pi(sk)->psm;
2219
2220 l2cap_pi(sk)->ident = l2cap_get_ident(conn);
2221
2222 l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
2223 L2CAP_CONN_REQ, sizeof(req), &req);
2224 } else {
2225 l2cap_sock_clear_timer(sk);
2226 l2cap_sock_set_timer(sk, HZ / 10);
2227 }
2228 } else if (sk->sk_state == BT_CONNECT2) {
2229 struct l2cap_conn_rsp rsp;
2230 __u16 result;
2231 2513
2232 if (!status) { 2514 return 0;
2233 sk->sk_state = BT_CONFIG; 2515}
2234 result = L2CAP_CR_SUCCESS;
2235 } else {
2236 sk->sk_state = BT_DISCONN;
2237 l2cap_sock_set_timer(sk, HZ / 10);
2238 result = L2CAP_CR_SEC_BLOCK;
2239 }
2240 2516
2241 rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); 2517static inline void l2cap_check_encryption(struct sock *sk, u8 encrypt)
2242 rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); 2518{
2243 rsp.result = cpu_to_le16(result); 2519 if (sk->sk_type != SOCK_SEQPACKET)
2244 rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); 2520 return;
2245 l2cap_send_cmd(conn, l2cap_pi(sk)->ident,
2246 L2CAP_CONN_RSP, sizeof(rsp), &rsp);
2247 }
2248 2521
2249 bh_unlock_sock(sk); 2522 if (encrypt == 0x00) {
2523 if (l2cap_pi(sk)->sec_level == BT_SECURITY_MEDIUM) {
2524 l2cap_sock_clear_timer(sk);
2525 l2cap_sock_set_timer(sk, HZ * 5);
2526 } else if (l2cap_pi(sk)->sec_level == BT_SECURITY_HIGH)
2527 __l2cap_sock_close(sk, ECONNREFUSED);
2528 } else {
2529 if (l2cap_pi(sk)->sec_level == BT_SECURITY_MEDIUM)
2530 l2cap_sock_clear_timer(sk);
2250 } 2531 }
2251
2252 read_unlock(&l->lock);
2253
2254 return 0;
2255} 2532}
2256 2533
2257static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) 2534static int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
2258{ 2535{
2259 struct l2cap_chan_list *l; 2536 struct l2cap_chan_list *l;
2260 struct l2cap_conn *conn = hcon->l2cap_data; 2537 struct l2cap_conn *conn = hcon->l2cap_data;
@@ -2270,15 +2547,16 @@ static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
2270 read_lock(&l->lock); 2547 read_lock(&l->lock);
2271 2548
2272 for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { 2549 for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) {
2273 struct l2cap_pinfo *pi = l2cap_pi(sk);
2274
2275 bh_lock_sock(sk); 2550 bh_lock_sock(sk);
2276 2551
2277 if ((pi->link_mode & (L2CAP_LM_ENCRYPT | L2CAP_LM_SECURE)) && 2552 if (l2cap_pi(sk)->conf_state & L2CAP_CONF_CONNECT_PEND) {
2278 (sk->sk_state == BT_CONNECTED || 2553 bh_unlock_sock(sk);
2279 sk->sk_state == BT_CONFIG) && 2554 continue;
2280 !status && encrypt == 0x00) { 2555 }
2281 __l2cap_sock_close(sk, ECONNREFUSED); 2556
2557 if (!status && (sk->sk_state == BT_CONNECTED ||
2558 sk->sk_state == BT_CONFIG)) {
2559 l2cap_check_encryption(sk, encrypt);
2282 bh_unlock_sock(sk); 2560 bh_unlock_sock(sk);
2283 continue; 2561 continue;
2284 } 2562 }
@@ -2376,7 +2654,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
2376 goto drop; 2654 goto drop;
2377 2655
2378 skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), 2656 skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
2379 skb->len); 2657 skb->len);
2380 conn->rx_len = len - skb->len; 2658 conn->rx_len = len - skb->len;
2381 } else { 2659 } else {
2382 BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len); 2660 BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len);
@@ -2398,7 +2676,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl
2398 } 2676 }
2399 2677
2400 skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), 2678 skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
2401 skb->len); 2679 skb->len);
2402 conn->rx_len -= skb->len; 2680 conn->rx_len -= skb->len;
2403 2681
2404 if (!conn->rx_len) { 2682 if (!conn->rx_len) {
@@ -2424,10 +2702,10 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf)
2424 sk_for_each(sk, node, &l2cap_sk_list.head) { 2702 sk_for_each(sk, node, &l2cap_sk_list.head) {
2425 struct l2cap_pinfo *pi = l2cap_pi(sk); 2703 struct l2cap_pinfo *pi = l2cap_pi(sk);
2426 2704
2427 str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n", 2705 str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d %d\n",
2428 batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), 2706 batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
2429 sk->sk_state, btohs(pi->psm), pi->scid, pi->dcid, 2707 sk->sk_state, btohs(pi->psm), pi->scid, pi->dcid,
2430 pi->imtu, pi->omtu, pi->link_mode); 2708 pi->imtu, pi->omtu, pi->sec_level);
2431 } 2709 }
2432 2710
2433 read_unlock_bh(&l2cap_sk_list.lock); 2711 read_unlock_bh(&l2cap_sk_list.lock);
@@ -2447,7 +2725,7 @@ static const struct proto_ops l2cap_sock_ops = {
2447 .accept = l2cap_sock_accept, 2725 .accept = l2cap_sock_accept,
2448 .getname = l2cap_sock_getname, 2726 .getname = l2cap_sock_getname,
2449 .sendmsg = l2cap_sock_sendmsg, 2727 .sendmsg = l2cap_sock_sendmsg,
2450 .recvmsg = bt_sock_recvmsg, 2728 .recvmsg = l2cap_sock_recvmsg,
2451 .poll = bt_sock_poll, 2729 .poll = bt_sock_poll,
2452 .ioctl = bt_sock_ioctl, 2730 .ioctl = bt_sock_ioctl,
2453 .mmap = sock_no_mmap, 2731 .mmap = sock_no_mmap,
@@ -2469,8 +2747,8 @@ static struct hci_proto l2cap_hci_proto = {
2469 .connect_ind = l2cap_connect_ind, 2747 .connect_ind = l2cap_connect_ind,
2470 .connect_cfm = l2cap_connect_cfm, 2748 .connect_cfm = l2cap_connect_cfm,
2471 .disconn_ind = l2cap_disconn_ind, 2749 .disconn_ind = l2cap_disconn_ind,
2472 .auth_cfm = l2cap_auth_cfm, 2750 .disconn_cfm = l2cap_disconn_cfm,
2473 .encrypt_cfm = l2cap_encrypt_cfm, 2751 .security_cfm = l2cap_security_cfm,
2474 .recv_acldata = l2cap_recv_acldata 2752 .recv_acldata = l2cap_recv_acldata
2475}; 2753};
2476 2754
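As context (not part of the diff): taken together, the l2cap.c changes mean a server that enables BT_DEFER_SETUP gets the accepted socket back while the channel is still in BT_CONNECT2 (the connect request was answered with L2CAP_CR_PEND / L2CAP_CS_AUTHOR_PEND), and the new l2cap_sock_recvmsg() sends the final connection response on the first read. A hedged sketch of that authorization step as it appears from this diff; peer_is_allowed() is a hypothetical application callback and bind/listen boilerplate is omitted:

    #include <unistd.h>
    #include <sys/socket.h>

    extern int peer_is_allowed(int fd);   /* hypothetical policy hook */

    /* Hypothetical authorization step for a deferred-setup L2CAP server.
     * "lfd" is a listening socket with BT_DEFER_SETUP already enabled. */
    static void l2cap_authorize_one(int lfd)
    {
            char c;
            int nfd = accept(lfd, NULL, NULL);

            if (nfd < 0)
                    return;

            if (peer_is_allowed(nfd)) {
                    /* Any read on the deferred socket completes the L2CAP
                     * connection setup (see l2cap_sock_recvmsg above); a
                     * zero-length read is enough. */
                    recv(nfd, &c, 0, 0);
                    /* ... use nfd as a normal connected L2CAP socket ... */
            } else {
                    /* Closing the socket rejects the pending connection. */
                    close(nfd);
            }
    }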
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index acd84fd524b8..1d0fb0f23c63 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -46,7 +46,7 @@
46#include <net/bluetooth/l2cap.h> 46#include <net/bluetooth/l2cap.h>
47#include <net/bluetooth/rfcomm.h> 47#include <net/bluetooth/rfcomm.h>
48 48
49#define VERSION "1.10" 49#define VERSION "1.11"
50 50
51static int disable_cfc = 0; 51static int disable_cfc = 0;
52static int channel_mtu = -1; 52static int channel_mtu = -1;
@@ -223,19 +223,25 @@ static int rfcomm_l2sock_create(struct socket **sock)
223 return err; 223 return err;
224} 224}
225 225
226static inline int rfcomm_check_link_mode(struct rfcomm_dlc *d) 226static inline int rfcomm_check_security(struct rfcomm_dlc *d)
227{ 227{
228 struct sock *sk = d->session->sock->sk; 228 struct sock *sk = d->session->sock->sk;
229 __u8 auth_type;
229 230
230 if (d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) { 231 switch (d->sec_level) {
231 if (!hci_conn_encrypt(l2cap_pi(sk)->conn->hcon)) 232 case BT_SECURITY_HIGH:
232 return 1; 233 auth_type = HCI_AT_GENERAL_BONDING_MITM;
233 } else if (d->link_mode & RFCOMM_LM_AUTH) { 234 break;
234 if (!hci_conn_auth(l2cap_pi(sk)->conn->hcon)) 235 case BT_SECURITY_MEDIUM:
235 return 1; 236 auth_type = HCI_AT_GENERAL_BONDING;
237 break;
238 default:
239 auth_type = HCI_AT_NO_BONDING;
240 break;
236 } 241 }
237 242
238 return 0; 243 return hci_conn_security(l2cap_pi(sk)->conn->hcon, d->sec_level,
244 auth_type);
239} 245}
240 246
241/* ---- RFCOMM DLCs ---- */ 247/* ---- RFCOMM DLCs ---- */
@@ -388,10 +394,10 @@ static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst,
388 d->cfc = (s->cfc == RFCOMM_CFC_UNKNOWN) ? 0 : s->cfc; 394 d->cfc = (s->cfc == RFCOMM_CFC_UNKNOWN) ? 0 : s->cfc;
389 395
390 if (s->state == BT_CONNECTED) { 396 if (s->state == BT_CONNECTED) {
391 if (rfcomm_check_link_mode(d)) 397 if (rfcomm_check_security(d))
392 set_bit(RFCOMM_AUTH_PENDING, &d->flags);
393 else
394 rfcomm_send_pn(s, 1, d); 398 rfcomm_send_pn(s, 1, d);
399 else
400 set_bit(RFCOMM_AUTH_PENDING, &d->flags);
395 } 401 }
396 402
397 rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT); 403 rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT);
@@ -421,9 +427,16 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
421 d, d->state, d->dlci, err, s); 427 d, d->state, d->dlci, err, s);
422 428
423 switch (d->state) { 429 switch (d->state) {
424 case BT_CONNECTED:
425 case BT_CONFIG:
426 case BT_CONNECT: 430 case BT_CONNECT:
431 case BT_CONFIG:
432 if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
433 set_bit(RFCOMM_AUTH_REJECT, &d->flags);
434 rfcomm_schedule(RFCOMM_SCHED_AUTH);
435 break;
436 }
437 /* Fall through */
438
439 case BT_CONNECTED:
427 d->state = BT_DISCONN; 440 d->state = BT_DISCONN;
428 if (skb_queue_empty(&d->tx_queue)) { 441 if (skb_queue_empty(&d->tx_queue)) {
429 rfcomm_send_disc(s, d->dlci); 442 rfcomm_send_disc(s, d->dlci);
@@ -434,6 +447,15 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err)
434 } 447 }
435 break; 448 break;
436 449
450 case BT_OPEN:
451 case BT_CONNECT2:
452 if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
453 set_bit(RFCOMM_AUTH_REJECT, &d->flags);
454 rfcomm_schedule(RFCOMM_SCHED_AUTH);
455 break;
456 }
457 /* Fall through */
458
437 default: 459 default:
438 rfcomm_dlc_clear_timer(d); 460 rfcomm_dlc_clear_timer(d);
439 461
@@ -636,6 +658,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
636 bacpy(&addr.l2_bdaddr, src); 658 bacpy(&addr.l2_bdaddr, src);
637 addr.l2_family = AF_BLUETOOTH; 659 addr.l2_family = AF_BLUETOOTH;
638 addr.l2_psm = 0; 660 addr.l2_psm = 0;
661 addr.l2_cid = 0;
639 *err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); 662 *err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
640 if (*err < 0) 663 if (*err < 0)
641 goto failed; 664 goto failed;
@@ -657,6 +680,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst
657 bacpy(&addr.l2_bdaddr, dst); 680 bacpy(&addr.l2_bdaddr, dst);
658 addr.l2_family = AF_BLUETOOTH; 681 addr.l2_family = AF_BLUETOOTH;
659 addr.l2_psm = htobs(RFCOMM_PSM); 682 addr.l2_psm = htobs(RFCOMM_PSM);
683 addr.l2_cid = 0;
660 *err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK); 684 *err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK);
661 if (*err == 0 || *err == -EINPROGRESS) 685 if (*err == 0 || *err == -EINPROGRESS)
662 return s; 686 return s;
@@ -1162,7 +1186,7 @@ static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci)
1162 return 0; 1186 return 0;
1163} 1187}
1164 1188
1165static void rfcomm_dlc_accept(struct rfcomm_dlc *d) 1189void rfcomm_dlc_accept(struct rfcomm_dlc *d)
1166{ 1190{
1167 struct sock *sk = d->session->sock->sk; 1191 struct sock *sk = d->session->sock->sk;
1168 1192
@@ -1175,12 +1199,31 @@ static void rfcomm_dlc_accept(struct rfcomm_dlc *d)
1175 d->state_change(d, 0); 1199 d->state_change(d, 0);
1176 rfcomm_dlc_unlock(d); 1200 rfcomm_dlc_unlock(d);
1177 1201
1178 if (d->link_mode & RFCOMM_LM_MASTER) 1202 if (d->role_switch)
1179 hci_conn_switch_role(l2cap_pi(sk)->conn->hcon, 0x00); 1203 hci_conn_switch_role(l2cap_pi(sk)->conn->hcon, 0x00);
1180 1204
1181 rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig); 1205 rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig);
1182} 1206}
1183 1207
1208static void rfcomm_check_accept(struct rfcomm_dlc *d)
1209{
1210 if (rfcomm_check_security(d)) {
1211 if (d->defer_setup) {
1212 set_bit(RFCOMM_DEFER_SETUP, &d->flags);
1213 rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
1214
1215 rfcomm_dlc_lock(d);
1216 d->state = BT_CONNECT2;
1217 d->state_change(d, 0);
1218 rfcomm_dlc_unlock(d);
1219 } else
1220 rfcomm_dlc_accept(d);
1221 } else {
1222 set_bit(RFCOMM_AUTH_PENDING, &d->flags);
1223 rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
1224 }
1225}
1226
1184static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci) 1227static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
1185{ 1228{
1186 struct rfcomm_dlc *d; 1229 struct rfcomm_dlc *d;
@@ -1203,11 +1246,7 @@ static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
1203 if (d) { 1246 if (d) {
1204 if (d->state == BT_OPEN) { 1247 if (d->state == BT_OPEN) {
1205 /* DLC was previously opened by PN request */ 1248 /* DLC was previously opened by PN request */
1206 if (rfcomm_check_link_mode(d)) { 1249 rfcomm_check_accept(d);
1207 set_bit(RFCOMM_AUTH_PENDING, &d->flags);
1208 rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
1209 } else
1210 rfcomm_dlc_accept(d);
1211 } 1250 }
1212 return 0; 1251 return 0;
1213 } 1252 }
@@ -1219,11 +1258,7 @@ static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci)
1219 d->addr = __addr(s->initiator, dlci); 1258 d->addr = __addr(s->initiator, dlci);
1220 rfcomm_dlc_link(s, d); 1259 rfcomm_dlc_link(s, d);
1221 1260
1222 if (rfcomm_check_link_mode(d)) { 1261 rfcomm_check_accept(d);
1223 set_bit(RFCOMM_AUTH_PENDING, &d->flags);
1224 rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
1225 } else
1226 rfcomm_dlc_accept(d);
1227 } else { 1262 } else {
1228 rfcomm_send_dm(s, dlci); 1263 rfcomm_send_dm(s, dlci);
1229 } 1264 }
@@ -1637,11 +1672,12 @@ static void rfcomm_process_connect(struct rfcomm_session *s)
1637 d = list_entry(p, struct rfcomm_dlc, list); 1672 d = list_entry(p, struct rfcomm_dlc, list);
1638 if (d->state == BT_CONFIG) { 1673 if (d->state == BT_CONFIG) {
1639 d->mtu = s->mtu; 1674 d->mtu = s->mtu;
1640 if (rfcomm_check_link_mode(d)) { 1675 if (rfcomm_check_security(d)) {
1676 rfcomm_send_pn(s, 1, d);
1677 } else {
1641 set_bit(RFCOMM_AUTH_PENDING, &d->flags); 1678 set_bit(RFCOMM_AUTH_PENDING, &d->flags);
1642 rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); 1679 rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
1643 } else 1680 }
1644 rfcomm_send_pn(s, 1, d);
1645 } 1681 }
1646 } 1682 }
1647} 1683}
@@ -1717,11 +1753,17 @@ static inline void rfcomm_process_dlcs(struct rfcomm_session *s)
1717 if (d->out) { 1753 if (d->out) {
1718 rfcomm_send_pn(s, 1, d); 1754 rfcomm_send_pn(s, 1, d);
1719 rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT); 1755 rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT);
1720 } else 1756 } else {
1721 rfcomm_dlc_accept(d); 1757 if (d->defer_setup) {
1722 if (d->link_mode & RFCOMM_LM_SECURE) { 1758 set_bit(RFCOMM_DEFER_SETUP, &d->flags);
1723 struct sock *sk = s->sock->sk; 1759 rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
1724 hci_conn_change_link_key(l2cap_pi(sk)->conn->hcon); 1760
1761 rfcomm_dlc_lock(d);
1762 d->state = BT_CONNECT2;
1763 d->state_change(d, 0);
1764 rfcomm_dlc_unlock(d);
1765 } else
1766 rfcomm_dlc_accept(d);
1725 } 1767 }
1726 continue; 1768 continue;
1727 } else if (test_and_clear_bit(RFCOMM_AUTH_REJECT, &d->flags)) { 1769 } else if (test_and_clear_bit(RFCOMM_AUTH_REJECT, &d->flags)) {
@@ -1734,6 +1776,9 @@ static inline void rfcomm_process_dlcs(struct rfcomm_session *s)
1734 continue; 1776 continue;
1735 } 1777 }
1736 1778
1779 if (test_bit(RFCOMM_SEC_PENDING, &d->flags))
1780 continue;
1781
1737 if (test_bit(RFCOMM_TX_THROTTLED, &s->flags)) 1782 if (test_bit(RFCOMM_TX_THROTTLED, &s->flags))
1738 continue; 1783 continue;
1739 1784
@@ -1876,6 +1921,7 @@ static int rfcomm_add_listener(bdaddr_t *ba)
1876 bacpy(&addr.l2_bdaddr, ba); 1921 bacpy(&addr.l2_bdaddr, ba);
1877 addr.l2_family = AF_BLUETOOTH; 1922 addr.l2_family = AF_BLUETOOTH;
1878 addr.l2_psm = htobs(RFCOMM_PSM); 1923 addr.l2_psm = htobs(RFCOMM_PSM);
1924 addr.l2_cid = 0;
1879 err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); 1925 err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr));
1880 if (err < 0) { 1926 if (err < 0) {
1881 BT_ERR("Bind failed %d", err); 1927 BT_ERR("Bind failed %d", err);
@@ -1947,42 +1993,7 @@ static int rfcomm_run(void *unused)
1947 return 0; 1993 return 0;
1948} 1994}
1949 1995
1950static void rfcomm_auth_cfm(struct hci_conn *conn, u8 status) 1996static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
1951{
1952 struct rfcomm_session *s;
1953 struct rfcomm_dlc *d;
1954 struct list_head *p, *n;
1955
1956 BT_DBG("conn %p status 0x%02x", conn, status);
1957
1958 s = rfcomm_session_get(&conn->hdev->bdaddr, &conn->dst);
1959 if (!s)
1960 return;
1961
1962 rfcomm_session_hold(s);
1963
1964 list_for_each_safe(p, n, &s->dlcs) {
1965 d = list_entry(p, struct rfcomm_dlc, list);
1966
1967 if ((d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) &&
1968 !(conn->link_mode & HCI_LM_ENCRYPT) && !status)
1969 continue;
1970
1971 if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags))
1972 continue;
1973
1974 if (!status)
1975 set_bit(RFCOMM_AUTH_ACCEPT, &d->flags);
1976 else
1977 set_bit(RFCOMM_AUTH_REJECT, &d->flags);
1978 }
1979
1980 rfcomm_session_put(s);
1981
1982 rfcomm_schedule(RFCOMM_SCHED_AUTH);
1983}
1984
1985static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
1986{ 1997{
1987 struct rfcomm_session *s; 1998 struct rfcomm_session *s;
1988 struct rfcomm_dlc *d; 1999 struct rfcomm_dlc *d;
@@ -1999,18 +2010,29 @@ static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
1999 list_for_each_safe(p, n, &s->dlcs) { 2010 list_for_each_safe(p, n, &s->dlcs) {
2000 d = list_entry(p, struct rfcomm_dlc, list); 2011 d = list_entry(p, struct rfcomm_dlc, list);
2001 2012
2002 if ((d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) && 2013 if (test_and_clear_bit(RFCOMM_SEC_PENDING, &d->flags)) {
2003 (d->state == BT_CONNECTED || 2014 rfcomm_dlc_clear_timer(d);
2004 d->state == BT_CONFIG) && 2015 if (status || encrypt == 0x00) {
2005 !status && encrypt == 0x00) { 2016 __rfcomm_dlc_close(d, ECONNREFUSED);
2006 __rfcomm_dlc_close(d, ECONNREFUSED); 2017 continue;
2007 continue; 2018 }
2019 }
2020
2021 if (d->state == BT_CONNECTED && !status && encrypt == 0x00) {
2022 if (d->sec_level == BT_SECURITY_MEDIUM) {
2023 set_bit(RFCOMM_SEC_PENDING, &d->flags);
2024 rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT);
2025 continue;
2026 } else if (d->sec_level == BT_SECURITY_HIGH) {
2027 __rfcomm_dlc_close(d, ECONNREFUSED);
2028 continue;
2029 }
2008 } 2030 }
2009 2031
2010 if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags)) 2032 if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags))
2011 continue; 2033 continue;
2012 2034
2013 if (!status && encrypt) 2035 if (!status)
2014 set_bit(RFCOMM_AUTH_ACCEPT, &d->flags); 2036 set_bit(RFCOMM_AUTH_ACCEPT, &d->flags);
2015 else 2037 else
2016 set_bit(RFCOMM_AUTH_REJECT, &d->flags); 2038 set_bit(RFCOMM_AUTH_REJECT, &d->flags);
@@ -2023,8 +2045,7 @@ static void rfcomm_encrypt_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
2023 2045
2024static struct hci_cb rfcomm_cb = { 2046static struct hci_cb rfcomm_cb = {
2025 .name = "RFCOMM", 2047 .name = "RFCOMM",
2026 .auth_cfm = rfcomm_auth_cfm, 2048 .security_cfm = rfcomm_security_cfm
2027 .encrypt_cfm = rfcomm_encrypt_cfm
2028}; 2049};
2029 2050
2030static ssize_t rfcomm_dlc_sysfs_show(struct class *dev, char *buf) 2051static ssize_t rfcomm_dlc_sysfs_show(struct class *dev, char *buf)
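As context (not part of the diff): the RFCOMM callback table above drops the separate auth_cfm and encrypt_cfm hooks in favour of a single security_cfm, which the HCI core is expected to invoke for both authentication and encryption results (the hci_event.c side of that change is not part of this hunk). A hedged kernel-side sketch of a protocol registering such a callback, assuming the struct hci_cb layout and hci_register_cb()/hci_unregister_cb() as used elsewhere in net/bluetooth:

    #include <net/bluetooth/bluetooth.h>
    #include <net/bluetooth/hci_core.h>

    /* Hypothetical consumer of the unified security callback. */
    static void example_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
    {
            /* status != 0 means the security procedure failed; encrypt reports
             * the current encryption state of the ACL link. */
            if (status)
                    BT_DBG("security procedure failed 0x%2.2x", status);
            else if (!encrypt)
                    BT_DBG("link authenticated but not encrypted");
    }

    static struct hci_cb example_cb = {
            .name         = "example",
            .security_cfm = example_security_cfm,
    };

    /* Registered once at init time with hci_register_cb(&example_cb) and
     * removed again with hci_unregister_cb(&example_cb). */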
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index d3fc6fca38d0..7f482784e9f7 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -261,12 +261,19 @@ static void rfcomm_sock_init(struct sock *sk, struct sock *parent)
261 261
262 if (parent) { 262 if (parent) {
263 sk->sk_type = parent->sk_type; 263 sk->sk_type = parent->sk_type;
264 pi->link_mode = rfcomm_pi(parent)->link_mode; 264 pi->dlc->defer_setup = bt_sk(parent)->defer_setup;
265
266 pi->sec_level = rfcomm_pi(parent)->sec_level;
267 pi->role_switch = rfcomm_pi(parent)->role_switch;
265 } else { 268 } else {
266 pi->link_mode = 0; 269 pi->dlc->defer_setup = 0;
270
271 pi->sec_level = BT_SECURITY_LOW;
272 pi->role_switch = 0;
267 } 273 }
268 274
269 pi->dlc->link_mode = pi->link_mode; 275 pi->dlc->sec_level = pi->sec_level;
276 pi->dlc->role_switch = pi->role_switch;
270} 277}
271 278
272static struct proto rfcomm_proto = { 279static struct proto rfcomm_proto = {
@@ -406,7 +413,8 @@ static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int a
406 bacpy(&bt_sk(sk)->dst, &sa->rc_bdaddr); 413 bacpy(&bt_sk(sk)->dst, &sa->rc_bdaddr);
407 rfcomm_pi(sk)->channel = sa->rc_channel; 414 rfcomm_pi(sk)->channel = sa->rc_channel;
408 415
409 d->link_mode = rfcomm_pi(sk)->link_mode; 416 d->sec_level = rfcomm_pi(sk)->sec_level;
417 d->role_switch = rfcomm_pi(sk)->role_switch;
410 418
411 err = rfcomm_dlc_open(d, &bt_sk(sk)->src, &sa->rc_bdaddr, sa->rc_channel); 419 err = rfcomm_dlc_open(d, &bt_sk(sk)->src, &sa->rc_bdaddr, sa->rc_channel);
412 if (!err) 420 if (!err)
@@ -554,6 +562,9 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
554 struct sk_buff *skb; 562 struct sk_buff *skb;
555 int sent = 0; 563 int sent = 0;
556 564
565 if (test_bit(RFCOMM_DEFER_SETUP, &d->flags))
566 return -ENOTCONN;
567
557 if (msg->msg_flags & MSG_OOB) 568 if (msg->msg_flags & MSG_OOB)
558 return -EOPNOTSUPP; 569 return -EOPNOTSUPP;
559 570
@@ -570,8 +581,11 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock,
570 581
571 skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE, 582 skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE,
572 msg->msg_flags & MSG_DONTWAIT, &err); 583 msg->msg_flags & MSG_DONTWAIT, &err);
573 if (!skb) 584 if (!skb) {
585 if (sent == 0)
586 sent = err;
574 break; 587 break;
588 }
575 skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE); 589 skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE);
576 590
577 err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); 591 err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
@@ -630,10 +644,16 @@ static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
630 struct msghdr *msg, size_t size, int flags) 644 struct msghdr *msg, size_t size, int flags)
631{ 645{
632 struct sock *sk = sock->sk; 646 struct sock *sk = sock->sk;
647 struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc;
633 int err = 0; 648 int err = 0;
634 size_t target, copied = 0; 649 size_t target, copied = 0;
635 long timeo; 650 long timeo;
636 651
652 if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) {
653 rfcomm_dlc_accept(d);
654 return 0;
655 }
656
637 if (flags & MSG_OOB) 657 if (flags & MSG_OOB)
638 return -EOPNOTSUPP; 658 return -EOPNOTSUPP;
639 659
@@ -710,7 +730,7 @@ out:
710 return copied ? : err; 730 return copied ? : err;
711} 731}
712 732
713static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) 733static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, int optlen)
714{ 734{
715 struct sock *sk = sock->sk; 735 struct sock *sk = sock->sk;
716 int err = 0; 736 int err = 0;
@@ -727,7 +747,14 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c
727 break; 747 break;
728 } 748 }
729 749
730 rfcomm_pi(sk)->link_mode = opt; 750 if (opt & RFCOMM_LM_AUTH)
751 rfcomm_pi(sk)->sec_level = BT_SECURITY_LOW;
752 if (opt & RFCOMM_LM_ENCRYPT)
753 rfcomm_pi(sk)->sec_level = BT_SECURITY_MEDIUM;
754 if (opt & RFCOMM_LM_SECURE)
755 rfcomm_pi(sk)->sec_level = BT_SECURITY_HIGH;
756
757 rfcomm_pi(sk)->role_switch = (opt & RFCOMM_LM_MASTER);
731 break; 758 break;
732 759
733 default: 760 default:
@@ -739,12 +766,76 @@ static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, c
739 return err; 766 return err;
740} 767}
741 768
742static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) 769static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
770{
771 struct sock *sk = sock->sk;
772 struct bt_security sec;
773 int len, err = 0;
774 u32 opt;
775
776 BT_DBG("sk %p", sk);
777
778 if (level == SOL_RFCOMM)
779 return rfcomm_sock_setsockopt_old(sock, optname, optval, optlen);
780
781 if (level != SOL_BLUETOOTH)
782 return -ENOPROTOOPT;
783
784 lock_sock(sk);
785
786 switch (optname) {
787 case BT_SECURITY:
788 if (sk->sk_type != SOCK_STREAM) {
789 err = -EINVAL;
790 break;
791 }
792
793 sec.level = BT_SECURITY_LOW;
794
795 len = min_t(unsigned int, sizeof(sec), optlen);
796 if (copy_from_user((char *) &sec, optval, len)) {
797 err = -EFAULT;
798 break;
799 }
800
801 if (sec.level > BT_SECURITY_HIGH) {
802 err = -EINVAL;
803 break;
804 }
805
806 rfcomm_pi(sk)->sec_level = sec.level;
807 break;
808
809 case BT_DEFER_SETUP:
810 if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
811 err = -EINVAL;
812 break;
813 }
814
815 if (get_user(opt, (u32 __user *) optval)) {
816 err = -EFAULT;
817 break;
818 }
819
820 bt_sk(sk)->defer_setup = opt;
821 break;
822
823 default:
824 err = -ENOPROTOOPT;
825 break;
826 }
827
828 release_sock(sk);
829 return err;
830}
831
832static int rfcomm_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
743{ 833{
744 struct sock *sk = sock->sk; 834 struct sock *sk = sock->sk;
745 struct sock *l2cap_sk; 835 struct sock *l2cap_sk;
746 struct rfcomm_conninfo cinfo; 836 struct rfcomm_conninfo cinfo;
747 int len, err = 0; 837 int len, err = 0;
838 u32 opt;
748 839
749 BT_DBG("sk %p", sk); 840 BT_DBG("sk %p", sk);
750 841
@@ -755,12 +846,32 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c
755 846
756 switch (optname) { 847 switch (optname) {
757 case RFCOMM_LM: 848 case RFCOMM_LM:
758 if (put_user(rfcomm_pi(sk)->link_mode, (u32 __user *) optval)) 849 switch (rfcomm_pi(sk)->sec_level) {
850 case BT_SECURITY_LOW:
851 opt = RFCOMM_LM_AUTH;
852 break;
853 case BT_SECURITY_MEDIUM:
854 opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT;
855 break;
856 case BT_SECURITY_HIGH:
857 opt = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT |
858 RFCOMM_LM_SECURE;
859 break;
860 default:
861 opt = 0;
862 break;
863 }
864
865 if (rfcomm_pi(sk)->role_switch)
866 opt |= RFCOMM_LM_MASTER;
867
868 if (put_user(opt, (u32 __user *) optval))
759 err = -EFAULT; 869 err = -EFAULT;
760 break; 870 break;
761 871
762 case RFCOMM_CONNINFO: 872 case RFCOMM_CONNINFO:
763 if (sk->sk_state != BT_CONNECTED) { 873 if (sk->sk_state != BT_CONNECTED &&
874 !rfcomm_pi(sk)->dlc->defer_setup) {
764 err = -ENOTCONN; 875 err = -ENOTCONN;
765 break; 876 break;
766 } 877 }
@@ -785,6 +896,60 @@ static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, c
785 return err; 896 return err;
786} 897}
787 898
899static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
900{
901 struct sock *sk = sock->sk;
902 struct bt_security sec;
903 int len, err = 0;
904
905 BT_DBG("sk %p", sk);
906
907 if (level == SOL_RFCOMM)
908 return rfcomm_sock_getsockopt_old(sock, optname, optval, optlen);
909
910 if (level != SOL_BLUETOOTH)
911 return -ENOPROTOOPT;
912
913 if (get_user(len, optlen))
914 return -EFAULT;
915
916 lock_sock(sk);
917
918 switch (optname) {
919 case BT_SECURITY:
920 if (sk->sk_type != SOCK_STREAM) {
921 err = -EINVAL;
922 break;
923 }
924
925 sec.level = rfcomm_pi(sk)->sec_level;
926
927 len = min_t(unsigned int, len, sizeof(sec));
928 if (copy_to_user(optval, (char *) &sec, len))
929 err = -EFAULT;
930
931 break;
932
933 case BT_DEFER_SETUP:
934 if (sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) {
935 err = -EINVAL;
936 break;
937 }
938
939 if (put_user(bt_sk(sk)->defer_setup, (u32 __user *) optval))
940 err = -EFAULT;
941
942 break;
943
944 default:
945 err = -ENOPROTOOPT;
946 break;
947 }
948
949 release_sock(sk);
950 return err;
951}
952
788static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 953static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
789{ 954{
790 struct sock *sk __maybe_unused = sock->sk; 955 struct sock *sk __maybe_unused = sock->sk;
@@ -888,6 +1053,10 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc *
888 1053
889done: 1054done:
890 bh_unlock_sock(parent); 1055 bh_unlock_sock(parent);
1056
1057 if (bt_sk(parent)->defer_setup)
1058 parent->sk_state_change(parent);
1059
891 return result; 1060 return result;
892} 1061}
893 1062
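As context (not part of the diff): rfcomm/sock.c grows the same split as L2CAP, keeping the legacy SOL_RFCOMM/RFCOMM_LM option through the *_old helpers while adding BT_SECURITY and BT_DEFER_SETUP at SOL_BLUETOOTH. A hedged sketch of the two equivalent ways a client could ask for an encrypted link, assuming the userspace constants match the kernel ones used above:

    #include <stdint.h>
    #include <sys/socket.h>
    #include <bluetooth/bluetooth.h>
    #include <bluetooth/rfcomm.h>

    /* Hypothetical snippet for an RFCOMM socket "fd". */
    static int rfcomm_request_encryption(int fd)
    {
            struct bt_security sec = { .level = BT_SECURITY_MEDIUM };
            uint32_t lm = RFCOMM_LM_AUTH | RFCOMM_LM_ENCRYPT;

            /* New style: security level at SOL_BLUETOOTH. */
            if (setsockopt(fd, SOL_BLUETOOTH, BT_SECURITY, &sec, sizeof(sec)) == 0)
                    return 0;

            /* Old style: link-mode bits at SOL_RFCOMM, still accepted and
             * translated to a security level by rfcomm_sock_setsockopt_old(). */
            return setsockopt(fd, SOL_RFCOMM, RFCOMM_LM, &lm, sizeof(lm));
    }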
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 46fd8bf9a690..51ae0c3e470a 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -195,7 +195,7 @@ static int sco_connect(struct sock *sk)
195 else 195 else
196 type = SCO_LINK; 196 type = SCO_LINK;
197 197
198 hcon = hci_connect(hdev, type, dst, HCI_AT_NO_BONDING); 198 hcon = hci_connect(hdev, type, dst, BT_SECURITY_LOW, HCI_AT_NO_BONDING);
199 if (!hcon) 199 if (!hcon)
200 goto done; 200 goto done;
201 201
@@ -668,7 +668,7 @@ static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char
668 return err; 668 return err;
669} 669}
670 670
671static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) 671static int sco_sock_getsockopt_old(struct socket *sock, int optname, char __user *optval, int __user *optlen)
672{ 672{
673 struct sock *sk = sock->sk; 673 struct sock *sk = sock->sk;
674 struct sco_options opts; 674 struct sco_options opts;
@@ -723,6 +723,31 @@ static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char
723 return err; 723 return err;
724} 724}
725 725
726static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen)
727{
728 struct sock *sk = sock->sk;
729 int len, err = 0;
730
731 BT_DBG("sk %p", sk);
732
733 if (level == SOL_SCO)
734 return sco_sock_getsockopt_old(sock, optname, optval, optlen);
735
736 if (get_user(len, optlen))
737 return -EFAULT;
738
739 lock_sock(sk);
740
741 switch (optname) {
742 default:
743 err = -ENOPROTOOPT;
744 break;
745 }
746
747 release_sock(sk);
748 return err;
749}
750
726static int sco_sock_release(struct socket *sock) 751static int sco_sock_release(struct socket *sock)
727{ 752{
728 struct sock *sk = sock->sk; 753 struct sock *sk = sock->sk;
@@ -832,10 +857,30 @@ done:
832/* ----- SCO interface with lower layer (HCI) ----- */ 857/* ----- SCO interface with lower layer (HCI) ----- */
833static int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type) 858static int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type)
834{ 859{
860 register struct sock *sk;
861 struct hlist_node *node;
862 int lm = 0;
863
864 if (type != SCO_LINK && type != ESCO_LINK)
865 return 0;
866
835 BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr)); 867 BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr));
836 868
837 /* Always accept connection */ 869 /* Find listening sockets */
838 return HCI_LM_ACCEPT; 870 read_lock(&sco_sk_list.lock);
871 sk_for_each(sk, node, &sco_sk_list.head) {
872 if (sk->sk_state != BT_LISTEN)
873 continue;
874
875 if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr) ||
876 !bacmp(&bt_sk(sk)->src, BDADDR_ANY)) {
877 lm |= HCI_LM_ACCEPT;
878 break;
879 }
880 }
881 read_unlock(&sco_sk_list.lock);
882
883 return lm;
839} 884}
840 885
841static int sco_connect_cfm(struct hci_conn *hcon, __u8 status) 886static int sco_connect_cfm(struct hci_conn *hcon, __u8 status)
@@ -857,7 +902,7 @@ static int sco_connect_cfm(struct hci_conn *hcon, __u8 status)
857 return 0; 902 return 0;
858} 903}
859 904
860static int sco_disconn_ind(struct hci_conn *hcon, __u8 reason) 905static int sco_disconn_cfm(struct hci_conn *hcon, __u8 reason)
861{ 906{
862 BT_DBG("hcon %p reason %d", hcon, reason); 907 BT_DBG("hcon %p reason %d", hcon, reason);
863 908
@@ -940,7 +985,7 @@ static struct hci_proto sco_hci_proto = {
940 .id = HCI_PROTO_SCO, 985 .id = HCI_PROTO_SCO,
941 .connect_ind = sco_connect_ind, 986 .connect_ind = sco_connect_ind,
942 .connect_cfm = sco_connect_cfm, 987 .connect_cfm = sco_connect_cfm,
943 .disconn_ind = sco_disconn_ind, 988 .disconn_cfm = sco_disconn_cfm,
944 .recv_scodata = sco_recv_scodata 989 .recv_scodata = sco_recv_scodata
945}; 990};
946 991
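As context (not part of the diff): sco_connect_ind() above no longer returns HCI_LM_ACCEPT unconditionally; an incoming SCO or eSCO connection is accepted only if some socket is listening on the target adapter or on BDADDR_ANY. A hedged userspace sketch of the listener that an audio daemon would now need, assuming the conventional struct sockaddr_sco layout:

    #include <string.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <bluetooth/bluetooth.h>
    #include <bluetooth/sco.h>

    /* Hypothetical helper: listen for incoming SCO connections on any adapter. */
    static int sco_listen_any(void)
    {
            struct sockaddr_sco addr;
            int fd = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_SCO);

            if (fd < 0)
                    return -1;

            memset(&addr, 0, sizeof(addr));
            addr.sco_family = AF_BLUETOOTH;
            bacpy(&addr.sco_bdaddr, BDADDR_ANY);

            if (bind(fd, (struct sockaddr *) &addr, sizeof(addr)) < 0 ||
                listen(fd, 1) < 0) {
                    close(fd);
                    return -1;
            }
            return fd;
    }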
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index ba7be195803c..fcffb3fb1177 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -98,7 +98,8 @@ void br_ifinfo_notify(int event, struct net_bridge_port *port)
98 kfree_skb(skb); 98 kfree_skb(skb);
99 goto errout; 99 goto errout;
100 } 100 }
101 err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); 101 rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC);
102 return;
102errout: 103errout:
103 if (err < 0) 104 if (err < 0)
104 rtnl_set_sk_err(net, RTNLGRP_LINK, err); 105 rtnl_set_sk_err(net, RTNLGRP_LINK, err);
diff --git a/net/can/af_can.c b/net/can/af_can.c
index d90e8dd975fc..547bafc79e28 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -273,8 +273,7 @@ int can_send(struct sk_buff *skb, int loop)
273 err = net_xmit_errno(err); 273 err = net_xmit_errno(err);
274 274
275 if (err) { 275 if (err) {
276 if (newskb) 276 kfree_skb(newskb);
277 kfree_skb(newskb);
278 return err; 277 return err;
279 } 278 }
280 279
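As context (not part of the diff): the af_can.c hunk relies on kfree_skb() accepting a NULL pointer, like kfree(), so the explicit check around it was redundant. A minimal sketch of the resulting pattern:

    #include <linux/skbuff.h>

    /* Hypothetical error path mirroring the can_send() change. */
    static int example_error_path(struct sk_buff *newskb, int err)
    {
            if (err) {
                    kfree_skb(newskb);      /* safe even if newskb is NULL */
                    return err;
            }
            return 0;
    }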
diff --git a/net/core/Makefile b/net/core/Makefile
index 26a37cb31923..796f46eece5f 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -17,3 +17,6 @@ obj-$(CONFIG_NET_PKTGEN) += pktgen.o
17obj-$(CONFIG_NETPOLL) += netpoll.o 17obj-$(CONFIG_NETPOLL) += netpoll.o
18obj-$(CONFIG_NET_DMA) += user_dma.o 18obj-$(CONFIG_NET_DMA) += user_dma.o
19obj-$(CONFIG_FIB_RULES) += fib_rules.o 19obj-$(CONFIG_FIB_RULES) += fib_rules.o
20obj-$(CONFIG_TRACEPOINTS) += net-traces.o
21obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
22
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 5e2ac0c4b07c..d0de644b378d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -208,7 +208,7 @@ struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
208 208
209void skb_free_datagram(struct sock *sk, struct sk_buff *skb) 209void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
210{ 210{
211 kfree_skb(skb); 211 consume_skb(skb);
212 sk_mem_reclaim_partial(sk); 212 sk_mem_reclaim_partial(sk);
213} 213}
214 214
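As context (not part of the diff): skb_free_datagram() switching from kfree_skb() to consume_skb() matters for the drop monitor added later in this series: kfree_skb() is treated, and traced, as a packet drop, while consume_skb() marks a normal, successful free. A hedged sketch of the convention:

    #include <linux/skbuff.h>

    /* Hypothetical receive helper showing the free-vs-drop convention. */
    static void example_rx(struct sk_buff *skb, bool valid)
    {
            if (!valid) {
                    kfree_skb(skb);     /* an actual drop: hits the kfree_skb tracepoint */
                    return;
            }

            /* ... hand the payload to the upper layer ... */

            consume_skb(skb);           /* normal end of life: not reported as a drop */
    }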
diff --git a/net/core/dev.c b/net/core/dev.c
index d393fc997cd9..052dd478d3e1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,14 +135,6 @@
135/* This should be increased if a protocol with a bigger head is added. */ 135/* This should be increased if a protocol with a bigger head is added. */
136#define GRO_MAX_HEAD (MAX_HEADER + 128) 136#define GRO_MAX_HEAD (MAX_HEADER + 128)
137 137
138enum {
139 GRO_MERGED,
140 GRO_MERGED_FREE,
141 GRO_HELD,
142 GRO_NORMAL,
143 GRO_DROP,
144};
145
146/* 138/*
147 * The list of packet types we will receive (as opposed to discard) 139 * The list of packet types we will receive (as opposed to discard)
148 * and the routines to invoke. 140 * and the routines to invoke.
@@ -1672,23 +1664,12 @@ static int dev_gso_segment(struct sk_buff *skb)
1672 return 0; 1664 return 0;
1673} 1665}
1674 1666
1675static void tstamp_tx(struct sk_buff *skb)
1676{
1677 union skb_shared_tx *shtx =
1678 skb_tx(skb);
1679 if (unlikely(shtx->software &&
1680 !shtx->in_progress)) {
1681 skb_tstamp_tx(skb, NULL);
1682 }
1683}
1684
1685int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, 1667int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1686 struct netdev_queue *txq) 1668 struct netdev_queue *txq)
1687{ 1669{
1688 const struct net_device_ops *ops = dev->netdev_ops; 1670 const struct net_device_ops *ops = dev->netdev_ops;
1689 int rc; 1671 int rc;
1690 1672
1691 prefetch(&dev->netdev_ops->ndo_start_xmit);
1692 if (likely(!skb->next)) { 1673 if (likely(!skb->next)) {
1693 if (!list_empty(&ptype_all)) 1674 if (!list_empty(&ptype_all))
1694 dev_queue_xmit_nit(skb, dev); 1675 dev_queue_xmit_nit(skb, dev);
@@ -1715,8 +1696,6 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1715 * the skb destructor before the call and restoring it 1696 * the skb destructor before the call and restoring it
1716 * afterwards, then doing the skb_orphan() ourselves? 1697 * afterwards, then doing the skb_orphan() ourselves?
1717 */ 1698 */
1718 if (likely(!rc))
1719 tstamp_tx(skb);
1720 return rc; 1699 return rc;
1721 } 1700 }
1722 1701
@@ -1732,7 +1711,6 @@ gso:
1732 skb->next = nskb; 1711 skb->next = nskb;
1733 return rc; 1712 return rc;
1734 } 1713 }
1735 tstamp_tx(skb);
1736 if (unlikely(netif_tx_queue_stopped(txq) && skb->next)) 1714 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1737 return NETDEV_TX_BUSY; 1715 return NETDEV_TX_BUSY;
1738 } while (skb->next); 1716 } while (skb->next);
@@ -1745,17 +1723,11 @@ out_kfree_skb:
1745} 1723}
1746 1724
1747static u32 skb_tx_hashrnd; 1725static u32 skb_tx_hashrnd;
1748static int skb_tx_hashrnd_initialized = 0;
1749 1726
1750static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb) 1727u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1751{ 1728{
1752 u32 hash; 1729 u32 hash;
1753 1730
1754 if (unlikely(!skb_tx_hashrnd_initialized)) {
1755 get_random_bytes(&skb_tx_hashrnd, 4);
1756 skb_tx_hashrnd_initialized = 1;
1757 }
1758
1759 if (skb_rx_queue_recorded(skb)) { 1731 if (skb_rx_queue_recorded(skb)) {
1760 hash = skb_get_rx_queue(skb); 1732 hash = skb_get_rx_queue(skb);
1761 } else if (skb->sk && skb->sk->sk_hash) { 1733 } else if (skb->sk && skb->sk->sk_hash) {
@@ -1767,6 +1739,7 @@ static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb)
1767 1739
1768 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32); 1740 return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1769} 1741}
1742EXPORT_SYMBOL(skb_tx_hash);
1770 1743
1771static struct netdev_queue *dev_pick_tx(struct net_device *dev, 1744static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1772 struct sk_buff *skb) 1745 struct sk_buff *skb)
@@ -2273,12 +2246,6 @@ int netif_receive_skb(struct sk_buff *skb)
2273 2246
2274 rcu_read_lock(); 2247 rcu_read_lock();
2275 2248
2276 /* Don't receive packets in an exiting network namespace */
2277 if (!net_alive(dev_net(skb->dev))) {
2278 kfree_skb(skb);
2279 goto out;
2280 }
2281
2282#ifdef CONFIG_NET_CLS_ACT 2249#ifdef CONFIG_NET_CLS_ACT
2283 if (skb->tc_verd & TC_NCLS) { 2250 if (skb->tc_verd & TC_NCLS) {
2284 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); 2251 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
@@ -2499,6 +2466,9 @@ static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2499{ 2466{
2500 struct sk_buff *p; 2467 struct sk_buff *p;
2501 2468
2469 if (netpoll_rx_on(skb))
2470 return GRO_NORMAL;
2471
2502 for (p = napi->gro_list; p; p = p->next) { 2472 for (p = napi->gro_list; p; p = p->next) {
2503 NAPI_GRO_CB(p)->same_flow = !compare_ether_header( 2473 NAPI_GRO_CB(p)->same_flow = !compare_ether_header(
2504 skb_mac_header(p), skb_gro_mac_header(skb)); 2474 skb_mac_header(p), skb_gro_mac_header(skb));
@@ -2657,9 +2627,9 @@ static int process_backlog(struct napi_struct *napi, int quota)
2657 local_irq_disable(); 2627 local_irq_disable();
2658 skb = __skb_dequeue(&queue->input_pkt_queue); 2628 skb = __skb_dequeue(&queue->input_pkt_queue);
2659 if (!skb) { 2629 if (!skb) {
2660 __napi_complete(napi);
2661 local_irq_enable(); 2630 local_irq_enable();
2662 break; 2631 napi_complete(napi);
2632 goto out;
2663 } 2633 }
2664 local_irq_enable(); 2634 local_irq_enable();
2665 2635
@@ -2668,6 +2638,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
2668 2638
2669 napi_gro_flush(napi); 2639 napi_gro_flush(napi);
2670 2640
2641out:
2671 return work; 2642 return work;
2672} 2643}
2673 2644
@@ -2741,7 +2712,7 @@ void netif_napi_del(struct napi_struct *napi)
2741 struct sk_buff *skb, *next; 2712 struct sk_buff *skb, *next;
2742 2713
2743 list_del_init(&napi->dev_list); 2714 list_del_init(&napi->dev_list);
2744 kfree(napi->skb); 2715 kfree_skb(napi->skb);
2745 2716
2746 for (skb = napi->gro_list; skb; skb = next) { 2717 for (skb = napi->gro_list; skb; skb = next) {
2747 next = skb->next; 2718 next = skb->next;
@@ -4355,6 +4326,39 @@ unsigned long netdev_fix_features(unsigned long features, const char *name)
4355} 4326}
4356EXPORT_SYMBOL(netdev_fix_features); 4327EXPORT_SYMBOL(netdev_fix_features);
4357 4328
4329/* Some devices need to (re-)set their netdev_ops inside
4330 * ->init() or similar. If that happens, we have to setup
4331 * the compat pointers again.
4332 */
4333void netdev_resync_ops(struct net_device *dev)
4334{
4335#ifdef CONFIG_COMPAT_NET_DEV_OPS
4336 const struct net_device_ops *ops = dev->netdev_ops;
4337
4338 dev->init = ops->ndo_init;
4339 dev->uninit = ops->ndo_uninit;
4340 dev->open = ops->ndo_open;
4341 dev->change_rx_flags = ops->ndo_change_rx_flags;
4342 dev->set_rx_mode = ops->ndo_set_rx_mode;
4343 dev->set_multicast_list = ops->ndo_set_multicast_list;
4344 dev->set_mac_address = ops->ndo_set_mac_address;
4345 dev->validate_addr = ops->ndo_validate_addr;
4346 dev->do_ioctl = ops->ndo_do_ioctl;
4347 dev->set_config = ops->ndo_set_config;
4348 dev->change_mtu = ops->ndo_change_mtu;
4349 dev->neigh_setup = ops->ndo_neigh_setup;
4350 dev->tx_timeout = ops->ndo_tx_timeout;
4351 dev->get_stats = ops->ndo_get_stats;
4352 dev->vlan_rx_register = ops->ndo_vlan_rx_register;
4353 dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
4354 dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
4355#ifdef CONFIG_NET_POLL_CONTROLLER
4356 dev->poll_controller = ops->ndo_poll_controller;
4357#endif
4358#endif
4359}
4360EXPORT_SYMBOL(netdev_resync_ops);
4361
4358/** 4362/**
4359 * register_netdevice - register a network device 4363 * register_netdevice - register a network device
4360 * @dev: device to register 4364 * @dev: device to register
@@ -4399,27 +4403,7 @@ int register_netdevice(struct net_device *dev)
4399 * This is temporary until all network devices are converted. 4403 * This is temporary until all network devices are converted.
4400 */ 4404 */
4401 if (dev->netdev_ops) { 4405 if (dev->netdev_ops) {
4402 const struct net_device_ops *ops = dev->netdev_ops; 4406 netdev_resync_ops(dev);
4403
4404 dev->init = ops->ndo_init;
4405 dev->uninit = ops->ndo_uninit;
4406 dev->open = ops->ndo_open;
4407 dev->change_rx_flags = ops->ndo_change_rx_flags;
4408 dev->set_rx_mode = ops->ndo_set_rx_mode;
4409 dev->set_multicast_list = ops->ndo_set_multicast_list;
4410 dev->set_mac_address = ops->ndo_set_mac_address;
4411 dev->validate_addr = ops->ndo_validate_addr;
4412 dev->do_ioctl = ops->ndo_do_ioctl;
4413 dev->set_config = ops->ndo_set_config;
4414 dev->change_mtu = ops->ndo_change_mtu;
4415 dev->tx_timeout = ops->ndo_tx_timeout;
4416 dev->get_stats = ops->ndo_get_stats;
4417 dev->vlan_rx_register = ops->ndo_vlan_rx_register;
4418 dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
4419 dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
4420#ifdef CONFIG_NET_POLL_CONTROLLER
4421 dev->poll_controller = ops->ndo_poll_controller;
4422#endif
4423 } else { 4407 } else {
4424 char drivername[64]; 4408 char drivername[64];
4425 pr_info("%s (%s): not using net_device_ops yet\n", 4409 pr_info("%s (%s): not using net_device_ops yet\n",
@@ -5291,6 +5275,14 @@ out:
5291 5275
5292subsys_initcall(net_dev_init); 5276subsys_initcall(net_dev_init);
5293 5277
5278static int __init initialize_hashrnd(void)
5279{
5280 get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
5281 return 0;
5282}
5283
5284late_initcall_sync(initialize_hashrnd);
5285
5294EXPORT_SYMBOL(__dev_get_by_index); 5286EXPORT_SYMBOL(__dev_get_by_index);
5295EXPORT_SYMBOL(__dev_get_by_name); 5287EXPORT_SYMBOL(__dev_get_by_name);
5296EXPORT_SYMBOL(__dev_remove_pack); 5288EXPORT_SYMBOL(__dev_remove_pack);
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
new file mode 100644
index 000000000000..9fd0dc3cca99
--- /dev/null
+++ b/net/core/drop_monitor.c
@@ -0,0 +1,263 @@
1/*
2 * Monitoring code for network dropped packet alerts
3 *
4 * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
5 */
6
7#include <linux/netdevice.h>
8#include <linux/etherdevice.h>
9#include <linux/string.h>
10#include <linux/if_arp.h>
11#include <linux/inetdevice.h>
12#include <linux/inet.h>
13#include <linux/interrupt.h>
14#include <linux/netpoll.h>
15#include <linux/sched.h>
16#include <linux/delay.h>
17#include <linux/types.h>
18#include <linux/workqueue.h>
19#include <linux/netlink.h>
20#include <linux/net_dropmon.h>
21#include <linux/percpu.h>
22#include <linux/timer.h>
23#include <linux/bitops.h>
24#include <net/genetlink.h>
25
26#include <trace/skb.h>
27
28#include <asm/unaligned.h>
29
30#define TRACE_ON 1
31#define TRACE_OFF 0
32
33static void send_dm_alert(struct work_struct *unused);
34
35
36/*
37 * Globals, our netlink socket pointer
38 * and the work handle that will send up
39 * netlink alerts
40 */
41struct sock *dm_sock;
42
43struct per_cpu_dm_data {
44 struct work_struct dm_alert_work;
45 struct sk_buff *skb;
46 atomic_t dm_hit_count;
47 struct timer_list send_timer;
48};
49
50static struct genl_family net_drop_monitor_family = {
51 .id = GENL_ID_GENERATE,
52 .hdrsize = 0,
53 .name = "NET_DM",
54 .version = 1,
55 .maxattr = NET_DM_CMD_MAX,
56};
57
58static DEFINE_PER_CPU(struct per_cpu_dm_data, dm_cpu_data);
59
60static int dm_hit_limit = 64;
61static int dm_delay = 1;
62
63
64static void reset_per_cpu_data(struct per_cpu_dm_data *data)
65{
66 size_t al;
67 struct net_dm_alert_msg *msg;
68
69 al = sizeof(struct net_dm_alert_msg);
70 al += dm_hit_limit * sizeof(struct net_dm_drop_point);
71 data->skb = genlmsg_new(al, GFP_KERNEL);
72 genlmsg_put(data->skb, 0, 0, &net_drop_monitor_family,
73 0, NET_DM_CMD_ALERT);
74 msg = __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_alert_msg));
75 memset(msg, 0, al);
76 atomic_set(&data->dm_hit_count, dm_hit_limit);
77}
78
79static void send_dm_alert(struct work_struct *unused)
80{
81 struct sk_buff *skb;
82 struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
83
84 /*
85 * Grab the skb we're about to send
86 */
87 skb = data->skb;
88
89 /*
90 * Replace it with a new one
91 */
92 reset_per_cpu_data(data);
93
94 /*
95 * Ship it!
96 */
97 genlmsg_multicast(skb, 0, NET_DM_GRP_ALERT, GFP_KERNEL);
98
99}
100
101/*
102 * This is the timer function to delay the sending of an alert
103 * in the event that more drops will arrive during the
104 * hysteresis period. Note that it operates under the timer interrupt
105 * so we don't need to disable preemption here
106 */
107static void sched_send_work(unsigned long unused)
108{
109 struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
110
111 schedule_work(&data->dm_alert_work);
112}
113
114static void trace_kfree_skb_hit(struct sk_buff *skb, void *location)
115{
116 struct net_dm_alert_msg *msg;
117 struct nlmsghdr *nlh;
118 int i;
119 struct per_cpu_dm_data *data = &__get_cpu_var(dm_cpu_data);
120
121
122 if (!atomic_add_unless(&data->dm_hit_count, -1, 0)) {
123 /*
124 * we're already at zero, discard this hit
125 */
126 goto out;
127 }
128
129 nlh = (struct nlmsghdr *)data->skb->data;
130 msg = genlmsg_data(nlmsg_data(nlh));
131 for (i = 0; i < msg->entries; i++) {
132 if (!memcmp(&location, msg->points[i].pc, sizeof(void *))) {
133 msg->points[i].count++;
134 goto out;
135 }
136 }
137
138 /*
139 * We need to create a new entry
140 */
141 __nla_reserve_nohdr(data->skb, sizeof(struct net_dm_drop_point));
142 memcpy(msg->points[msg->entries].pc, &location, sizeof(void *));
143 msg->points[msg->entries].count = 1;
144 msg->entries++;
145
146 if (!timer_pending(&data->send_timer)) {
147 data->send_timer.expires = jiffies + dm_delay * HZ;
148 add_timer_on(&data->send_timer, smp_processor_id());
149 }
150
151out:
152 return;
153}
154
155static int set_all_monitor_traces(int state)
156{
157 int rc = 0;
158
159 switch (state) {
160 case TRACE_ON:
161 rc |= register_trace_kfree_skb(trace_kfree_skb_hit);
162 break;
163 case TRACE_OFF:
164 rc |= unregister_trace_kfree_skb(trace_kfree_skb_hit);
165
166 tracepoint_synchronize_unregister();
167 break;
168 default:
169 rc = 1;
170 break;
171 }
172
173 if (rc)
174 return -EINPROGRESS;
175 return rc;
176}
177
178
179static int net_dm_cmd_config(struct sk_buff *skb,
180 struct genl_info *info)
181{
182 return -ENOTSUPP;
183}
184
185static int net_dm_cmd_trace(struct sk_buff *skb,
186 struct genl_info *info)
187{
188 switch (info->genlhdr->cmd) {
189 case NET_DM_CMD_START:
190 return set_all_monitor_traces(TRACE_ON);
191 break;
192 case NET_DM_CMD_STOP:
193 return set_all_monitor_traces(TRACE_OFF);
194 break;
195 }
196
197 return -ENOTSUPP;
198}
199
200
201static struct genl_ops dropmon_ops[] = {
202 {
203 .cmd = NET_DM_CMD_CONFIG,
204 .doit = net_dm_cmd_config,
205 },
206 {
207 .cmd = NET_DM_CMD_START,
208 .doit = net_dm_cmd_trace,
209 },
210 {
211 .cmd = NET_DM_CMD_STOP,
212 .doit = net_dm_cmd_trace,
213 },
214};
215
216static int __init init_net_drop_monitor(void)
217{
218 int cpu;
219 int rc, i, ret;
220 struct per_cpu_dm_data *data;
 221	printk(KERN_INFO "Initializing network drop monitor service\n");
222
223 if (sizeof(void *) > 8) {
 224		printk(KERN_ERR "Unable to store program counters on this arch; drop monitor failed\n");
225 return -ENOSPC;
226 }
227
228 if (genl_register_family(&net_drop_monitor_family) < 0) {
229 printk(KERN_ERR "Could not create drop monitor netlink family\n");
230 return -EFAULT;
231 }
232
233 rc = -EFAULT;
234
235 for (i = 0; i < ARRAY_SIZE(dropmon_ops); i++) {
236 ret = genl_register_ops(&net_drop_monitor_family,
237 &dropmon_ops[i]);
238 if (ret) {
239 printk(KERN_CRIT "failed to register operation %d\n",
240 dropmon_ops[i].cmd);
241 goto out_unreg;
242 }
243 }
244
245 rc = 0;
246
247 for_each_present_cpu(cpu) {
248 data = &per_cpu(dm_cpu_data, cpu);
249 reset_per_cpu_data(data);
250 INIT_WORK(&data->dm_alert_work, send_dm_alert);
251 init_timer(&data->send_timer);
252 data->send_timer.data = cpu;
253 data->send_timer.function = sched_send_work;
254 }
255 goto out;
256
257out_unreg:
258 genl_unregister_family(&net_drop_monitor_family);
259out:
260 return rc;
261}
262
263late_initcall(init_net_drop_monitor);
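The drop monitor above batches events per CPU rather than emitting one netlink message per dropped skb: each trace hit decrements dm_hit_count, repeated program counters only bump a per-point counter, and a timer of dm_delay seconds holds back the multicast so a burst of drops collapses into a single alert. A stand-alone user-space sketch of that hysteresis idea, with all names invented for illustration:

#include <stdio.h>
#include <time.h>

#define MAX_POINTS 64

static struct { void *pc; unsigned count; } points[MAX_POINTS];
static int entries;
static int budget = MAX_POINTS;		/* plays the role of dm_hit_count */
static time_t deadline;			/* plays the role of send_timer */

/* Record one drop at 'pc'; merge repeats instead of alerting every time. */
static void record_drop(void *pc)
{
	int i;

	if (budget <= 0)
		return;			/* saturated, discard this hit */
	budget--;

	for (i = 0; i < entries; i++)
		if (points[i].pc == pc) {
			points[i].count++;
			return;
		}
	points[entries].pc = pc;
	points[entries++].count = 1;

	if (!deadline)
		deadline = time(NULL) + 1;	/* flush at most once per second */
}

/* Poll periodically: emit one batched report once the delay has elapsed. */
static void maybe_flush(void)
{
	int i;

	if (!deadline || time(NULL) < deadline)
		return;
	for (i = 0; i < entries; i++)
		printf("drop at %p x%u\n", points[i].pc, points[i].count);
	entries = 0;
	budget = MAX_POINTS;
	deadline = 0;
}

int main(void)
{
	record_drop((void *)0x1234);
	record_drop((void *)0x1234);
	record_drop((void *)0x5678);
	deadline = time(NULL);		/* force an immediate flush for the demo */
	maybe_flush();
	return 0;
}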
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 947710a36ced..244ca56dffac 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -209,34 +209,62 @@ static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr)
209 return 0; 209 return 0;
210} 210}
211 211
212static int ethtool_set_rxhash(struct net_device *dev, void __user *useraddr) 212static int ethtool_set_rxnfc(struct net_device *dev, void __user *useraddr)
213{ 213{
214 struct ethtool_rxnfc cmd; 214 struct ethtool_rxnfc cmd;
215 215
216 if (!dev->ethtool_ops->set_rxhash) 216 if (!dev->ethtool_ops->set_rxnfc)
217 return -EOPNOTSUPP; 217 return -EOPNOTSUPP;
218 218
219 if (copy_from_user(&cmd, useraddr, sizeof(cmd))) 219 if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
220 return -EFAULT; 220 return -EFAULT;
221 221
222 return dev->ethtool_ops->set_rxhash(dev, &cmd); 222 return dev->ethtool_ops->set_rxnfc(dev, &cmd);
223} 223}
224 224
225static int ethtool_get_rxhash(struct net_device *dev, void __user *useraddr) 225static int ethtool_get_rxnfc(struct net_device *dev, void __user *useraddr)
226{ 226{
227 struct ethtool_rxnfc info; 227 struct ethtool_rxnfc info;
228 const struct ethtool_ops *ops = dev->ethtool_ops;
229 int ret;
230 void *rule_buf = NULL;
228 231
229 if (!dev->ethtool_ops->get_rxhash) 232 if (!ops->get_rxnfc)
230 return -EOPNOTSUPP; 233 return -EOPNOTSUPP;
231 234
232 if (copy_from_user(&info, useraddr, sizeof(info))) 235 if (copy_from_user(&info, useraddr, sizeof(info)))
233 return -EFAULT; 236 return -EFAULT;
234 237
235 dev->ethtool_ops->get_rxhash(dev, &info); 238 if (info.cmd == ETHTOOL_GRXCLSRLALL) {
239 if (info.rule_cnt > 0) {
240 rule_buf = kmalloc(info.rule_cnt * sizeof(u32),
241 GFP_USER);
242 if (!rule_buf)
243 return -ENOMEM;
244 }
245 }
236 246
247 ret = ops->get_rxnfc(dev, &info, rule_buf);
248 if (ret < 0)
249 goto err_out;
250
251 ret = -EFAULT;
237 if (copy_to_user(useraddr, &info, sizeof(info))) 252 if (copy_to_user(useraddr, &info, sizeof(info)))
238 return -EFAULT; 253 goto err_out;
239 return 0; 254
255 if (rule_buf) {
256 useraddr += offsetof(struct ethtool_rxnfc, rule_locs);
257 if (copy_to_user(useraddr, rule_buf,
258 info.rule_cnt * sizeof(u32)))
259 goto err_out;
260 }
261 ret = 0;
262
263err_out:
264 if (rule_buf)
265 kfree(rule_buf);
266
267 return ret;
240} 268}
241 269
242static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) 270static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
@@ -901,6 +929,10 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
901 case ETHTOOL_GFLAGS: 929 case ETHTOOL_GFLAGS:
902 case ETHTOOL_GPFLAGS: 930 case ETHTOOL_GPFLAGS:
903 case ETHTOOL_GRXFH: 931 case ETHTOOL_GRXFH:
932 case ETHTOOL_GRXRINGS:
933 case ETHTOOL_GRXCLSRLCNT:
934 case ETHTOOL_GRXCLSRULE:
935 case ETHTOOL_GRXCLSRLALL:
904 break; 936 break;
905 default: 937 default:
906 if (!capable(CAP_NET_ADMIN)) 938 if (!capable(CAP_NET_ADMIN))
@@ -1052,10 +1084,16 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
1052 dev->ethtool_ops->set_priv_flags); 1084 dev->ethtool_ops->set_priv_flags);
1053 break; 1085 break;
1054 case ETHTOOL_GRXFH: 1086 case ETHTOOL_GRXFH:
1055 rc = ethtool_get_rxhash(dev, useraddr); 1087 case ETHTOOL_GRXRINGS:
1088 case ETHTOOL_GRXCLSRLCNT:
1089 case ETHTOOL_GRXCLSRULE:
1090 case ETHTOOL_GRXCLSRLALL:
1091 rc = ethtool_get_rxnfc(dev, useraddr);
1056 break; 1092 break;
1057 case ETHTOOL_SRXFH: 1093 case ETHTOOL_SRXFH:
1058 rc = ethtool_set_rxhash(dev, useraddr); 1094 case ETHTOOL_SRXCLSRLDEL:
1095 case ETHTOOL_SRXCLSRLINS:
1096 rc = ethtool_set_rxnfc(dev, useraddr);
1059 break; 1097 break;
1060 case ETHTOOL_GGRO: 1098 case ETHTOOL_GGRO:
1061 rc = ethtool_get_gro(dev, useraddr); 1099 rc = ethtool_get_gro(dev, useraddr);
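For ETHTOOL_GRXCLSRLALL the new handler copies the fixed ethtool_rxnfc header back first and then appends rule_cnt u32 rule locations directly after it, so the caller has to size its buffer accordingly. A hedged user-space sketch of driving this through SIOCETHTOOL (error handling trimmed; it assumes the <linux/ethtool.h> of this period, which already declares rule_cnt and the trailing rule_locs array, and "eth0" is purely an example interface):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct ifreq ifr;
	struct ethtool_rxnfc cnt = { .cmd = ETHTOOL_GRXCLSRLCNT };
	struct ethtool_rxnfc *all;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);

	/* First ask how many classification rules are installed. */
	ifr.ifr_data = (char *)&cnt;
	if (fd < 0 || ioctl(fd, SIOCETHTOOL, &ifr) < 0)
		return 1;

	/* Then fetch them all: fixed header plus rule_cnt trailing u32s. */
	all = calloc(1, sizeof(*all) + cnt.rule_cnt * sizeof(__u32));
	all->cmd = ETHTOOL_GRXCLSRLALL;
	all->rule_cnt = cnt.rule_cnt;
	ifr.ifr_data = (char *)all;
	if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
		printf("%u rules reported\n", all->rule_cnt);

	free(all);
	return 0;
}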
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 32b3a0152d7a..98691e1466b8 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -588,7 +588,8 @@ static void notify_rule_change(int event, struct fib_rule *rule,
588 goto errout; 588 goto errout;
589 } 589 }
590 590
591 err = rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL); 591 rtnl_notify(skb, net, pid, ops->nlgroup, nlh, GFP_KERNEL);
592 return;
592errout: 593errout:
593 if (err < 0) 594 if (err < 0)
594 rtnl_set_sk_err(net, ops->nlgroup, err); 595 rtnl_set_sk_err(net, ops->nlgroup, err);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 278a142d1047..a1cbce7fdae5 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -871,8 +871,7 @@ static void neigh_timer_handler(unsigned long arg)
871 write_unlock(&neigh->lock); 871 write_unlock(&neigh->lock);
872 neigh->ops->solicit(neigh, skb); 872 neigh->ops->solicit(neigh, skb);
873 atomic_inc(&neigh->probes); 873 atomic_inc(&neigh->probes);
874 if (skb) 874 kfree_skb(skb);
875 kfree_skb(skb);
876 } else { 875 } else {
877out: 876out:
878 write_unlock(&neigh->lock); 877 write_unlock(&neigh->lock);
@@ -908,8 +907,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
908 neigh->updated = jiffies; 907 neigh->updated = jiffies;
909 write_unlock_bh(&neigh->lock); 908 write_unlock_bh(&neigh->lock);
910 909
911 if (skb) 910 kfree_skb(skb);
912 kfree_skb(skb);
913 return 1; 911 return 1;
914 } 912 }
915 } else if (neigh->nud_state & NUD_STALE) { 913 } else if (neigh->nud_state & NUD_STALE) {
@@ -1656,7 +1654,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
1656 flags &= ~NEIGH_UPDATE_F_OVERRIDE; 1654 flags &= ~NEIGH_UPDATE_F_OVERRIDE;
1657 } 1655 }
1658 1656
1659 err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); 1657 if (ndm->ndm_flags & NTF_USE) {
1658 neigh_event_send(neigh, NULL);
1659 err = 0;
1660 } else
1661 err = neigh_update(neigh, lladdr, ndm->ndm_state, flags);
1660 neigh_release(neigh); 1662 neigh_release(neigh);
1661 goto out_dev_put; 1663 goto out_dev_put;
1662 } 1664 }
@@ -2534,7 +2536,8 @@ static void __neigh_notify(struct neighbour *n, int type, int flags)
2534 kfree_skb(skb); 2536 kfree_skb(skb);
2535 goto errout; 2537 goto errout;
2536 } 2538 }
2537 err = rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); 2539 rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
2540 return;
2538errout: 2541errout:
2539 if (err < 0) 2542 if (err < 0)
2540 rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); 2543 rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 6ac29a46e23e..2da59a0ac4ac 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -77,7 +77,9 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
77 if (endp == buf) 77 if (endp == buf)
78 goto err; 78 goto err;
79 79
80 rtnl_lock(); 80 if (!rtnl_trylock())
81 return -ERESTARTSYS;
82
81 if (dev_isalive(net)) { 83 if (dev_isalive(net)) {
82 if ((ret = (*set)(net, new)) == 0) 84 if ((ret = (*set)(net, new)) == 0)
83 ret = len; 85 ret = len;
@@ -496,7 +498,7 @@ int netdev_register_kobject(struct net_device *net)
496 dev->groups = groups; 498 dev->groups = groups;
497 499
498 BUILD_BUG_ON(BUS_ID_SIZE < IFNAMSIZ); 500 BUILD_BUG_ON(BUS_ID_SIZE < IFNAMSIZ);
499 dev_set_name(dev, net->name); 501 dev_set_name(dev, "%s", net->name);
500 502
501#ifdef CONFIG_SYSFS 503#ifdef CONFIG_SYSFS
502 *groups++ = &netstat_group; 504 *groups++ = &netstat_group;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
new file mode 100644
index 000000000000..c8fb45665e4f
--- /dev/null
+++ b/net/core/net-traces.c
@@ -0,0 +1,29 @@
1/*
2 * consolidates trace point definitions
3 *
4 * Copyright (C) 2009 Neil Horman <nhorman@tuxdriver.com>
5 */
6
7#include <linux/netdevice.h>
8#include <linux/etherdevice.h>
9#include <linux/string.h>
10#include <linux/if_arp.h>
11#include <linux/inetdevice.h>
12#include <linux/inet.h>
13#include <linux/interrupt.h>
14#include <linux/netpoll.h>
15#include <linux/sched.h>
16#include <linux/delay.h>
17#include <linux/rcupdate.h>
18#include <linux/types.h>
19#include <linux/workqueue.h>
20#include <linux/netlink.h>
21#include <linux/net_dropmon.h>
22#include <trace/skb.h>
23
24#include <asm/unaligned.h>
25#include <asm/bitops.h>
26
27
28DEFINE_TRACE(kfree_skb);
29EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 55151faaf90c..e3bebd36f053 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -32,24 +32,14 @@ static __net_init int setup_net(struct net *net)
32{ 32{
33 /* Must be called with net_mutex held */ 33 /* Must be called with net_mutex held */
34 struct pernet_operations *ops; 34 struct pernet_operations *ops;
35 int error; 35 int error = 0;
36 struct net_generic *ng;
37 36
38 atomic_set(&net->count, 1); 37 atomic_set(&net->count, 1);
38
39#ifdef NETNS_REFCNT_DEBUG 39#ifdef NETNS_REFCNT_DEBUG
40 atomic_set(&net->use_count, 0); 40 atomic_set(&net->use_count, 0);
41#endif 41#endif
42 42
43 error = -ENOMEM;
44 ng = kzalloc(sizeof(struct net_generic) +
45 INITIAL_NET_GEN_PTRS * sizeof(void *), GFP_KERNEL);
46 if (ng == NULL)
47 goto out;
48
49 ng->len = INITIAL_NET_GEN_PTRS;
50 rcu_assign_pointer(net->gen, ng);
51
52 error = 0;
53 list_for_each_entry(ops, &pernet_list, list) { 43 list_for_each_entry(ops, &pernet_list, list) {
54 if (ops->init) { 44 if (ops->init) {
55 error = ops->init(net); 45 error = ops->init(net);
@@ -70,24 +60,50 @@ out_undo:
70 } 60 }
71 61
72 rcu_barrier(); 62 rcu_barrier();
73 kfree(ng);
74 goto out; 63 goto out;
75} 64}
76 65
66static struct net_generic *net_alloc_generic(void)
67{
68 struct net_generic *ng;
69 size_t generic_size = sizeof(struct net_generic) +
70 INITIAL_NET_GEN_PTRS * sizeof(void *);
71
72 ng = kzalloc(generic_size, GFP_KERNEL);
73 if (ng)
74 ng->len = INITIAL_NET_GEN_PTRS;
75
76 return ng;
77}
78
77#ifdef CONFIG_NET_NS 79#ifdef CONFIG_NET_NS
78static struct kmem_cache *net_cachep; 80static struct kmem_cache *net_cachep;
79static struct workqueue_struct *netns_wq; 81static struct workqueue_struct *netns_wq;
80 82
81static struct net *net_alloc(void) 83static struct net *net_alloc(void)
82{ 84{
83 return kmem_cache_zalloc(net_cachep, GFP_KERNEL); 85 struct net *net = NULL;
86 struct net_generic *ng;
87
88 ng = net_alloc_generic();
89 if (!ng)
90 goto out;
91
92 net = kmem_cache_zalloc(net_cachep, GFP_KERNEL);
93 if (!net)
94 goto out_free;
95
96 rcu_assign_pointer(net->gen, ng);
97out:
98 return net;
99
100out_free:
101 kfree(ng);
102 goto out;
84} 103}
85 104
86static void net_free(struct net *net) 105static void net_free(struct net *net)
87{ 106{
88 if (!net)
89 return;
90
91#ifdef NETNS_REFCNT_DEBUG 107#ifdef NETNS_REFCNT_DEBUG
92 if (unlikely(atomic_read(&net->use_count) != 0)) { 108 if (unlikely(atomic_read(&net->use_count) != 0)) {
93 printk(KERN_EMERG "network namespace not free! Usage: %d\n", 109 printk(KERN_EMERG "network namespace not free! Usage: %d\n",
@@ -112,27 +128,28 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
112 err = -ENOMEM; 128 err = -ENOMEM;
113 new_net = net_alloc(); 129 new_net = net_alloc();
114 if (!new_net) 130 if (!new_net)
115 goto out; 131 goto out_err;
116 132
117 mutex_lock(&net_mutex); 133 mutex_lock(&net_mutex);
118 err = setup_net(new_net); 134 err = setup_net(new_net);
119 if (err) 135 if (!err) {
120 goto out_unlock; 136 rtnl_lock();
121 137 list_add_tail(&new_net->list, &net_namespace_list);
122 rtnl_lock(); 138 rtnl_unlock();
123 list_add_tail(&new_net->list, &net_namespace_list); 139 }
124 rtnl_unlock();
125
126
127out_unlock:
128 mutex_unlock(&net_mutex); 140 mutex_unlock(&net_mutex);
141
142 if (err)
143 goto out_free;
129out: 144out:
130 put_net(old_net); 145 put_net(old_net);
131 if (err) {
132 net_free(new_net);
133 new_net = ERR_PTR(err);
134 }
135 return new_net; 146 return new_net;
147
148out_free:
149 net_free(new_net);
150out_err:
151 new_net = ERR_PTR(err);
152 goto out;
136} 153}
137 154
138static void cleanup_net(struct work_struct *work) 155static void cleanup_net(struct work_struct *work)
@@ -140,9 +157,6 @@ static void cleanup_net(struct work_struct *work)
140 struct pernet_operations *ops; 157 struct pernet_operations *ops;
141 struct net *net; 158 struct net *net;
142 159
143 /* Be very certain incoming network packets will not find us */
144 rcu_barrier();
145
146 net = container_of(work, struct net, work); 160 net = container_of(work, struct net, work);
147 161
148 mutex_lock(&net_mutex); 162 mutex_lock(&net_mutex);
@@ -188,6 +202,7 @@ struct net *copy_net_ns(unsigned long flags, struct net *old_net)
188 202
189static int __init net_ns_init(void) 203static int __init net_ns_init(void)
190{ 204{
205 struct net_generic *ng;
191 int err; 206 int err;
192 207
193 printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net)); 208 printk(KERN_INFO "net_namespace: %zd bytes\n", sizeof(struct net));
@@ -202,6 +217,12 @@ static int __init net_ns_init(void)
202 panic("Could not create netns workq"); 217 panic("Could not create netns workq");
203#endif 218#endif
204 219
220 ng = net_alloc_generic();
221 if (!ng)
222 panic("Could not allocate generic netns");
223
224 rcu_assign_pointer(init_net.gen, ng);
225
205 mutex_lock(&net_mutex); 226 mutex_lock(&net_mutex);
206 err = setup_net(&init_net); 227 err = setup_net(&init_net);
207 228
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 65498483325a..32d419f5ac98 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3275,8 +3275,7 @@ static void pktgen_stop(struct pktgen_thread *t)
3275 3275
3276 list_for_each_entry(pkt_dev, &t->if_list, list) { 3276 list_for_each_entry(pkt_dev, &t->if_list, list) {
3277 pktgen_stop_device(pkt_dev); 3277 pktgen_stop_device(pkt_dev);
3278 if (pkt_dev->skb) 3278 kfree_skb(pkt_dev->skb);
3279 kfree_skb(pkt_dev->skb);
3280 3279
3281 pkt_dev->skb = NULL; 3280 pkt_dev->skb = NULL;
3282 } 3281 }
@@ -3303,8 +3302,7 @@ static void pktgen_rem_one_if(struct pktgen_thread *t)
3303 if (!cur->removal_mark) 3302 if (!cur->removal_mark)
3304 continue; 3303 continue;
3305 3304
3306 if (cur->skb) 3305 kfree_skb(cur->skb);
3307 kfree_skb(cur->skb);
3308 cur->skb = NULL; 3306 cur->skb = NULL;
3309 3307
3310 pktgen_remove_device(t, cur); 3308 pktgen_remove_device(t, cur);
@@ -3328,8 +3326,7 @@ static void pktgen_rem_all_ifs(struct pktgen_thread *t)
3328 list_for_each_safe(q, n, &t->if_list) { 3326 list_for_each_safe(q, n, &t->if_list) {
3329 cur = list_entry(q, struct pktgen_dev, list); 3327 cur = list_entry(q, struct pktgen_dev, list);
3330 3328
3331 if (cur->skb) 3329 kfree_skb(cur->skb);
3332 kfree_skb(cur->skb);
3333 cur->skb = NULL; 3330 cur->skb = NULL;
3334 3331
3335 pktgen_remove_device(t, cur); 3332 pktgen_remove_device(t, cur);
@@ -3393,8 +3390,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
3393 3390
3394 if (!netif_running(odev)) { 3391 if (!netif_running(odev)) {
3395 pktgen_stop_device(pkt_dev); 3392 pktgen_stop_device(pkt_dev);
3396 if (pkt_dev->skb) 3393 kfree_skb(pkt_dev->skb);
3397 kfree_skb(pkt_dev->skb);
3398 pkt_dev->skb = NULL; 3394 pkt_dev->skb = NULL;
3399 goto out; 3395 goto out;
3400 } 3396 }
@@ -3415,8 +3411,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
3415 if ((++pkt_dev->clone_count >= pkt_dev->clone_skb) 3411 if ((++pkt_dev->clone_count >= pkt_dev->clone_skb)
3416 || (!pkt_dev->skb)) { 3412 || (!pkt_dev->skb)) {
3417 /* build a new pkt */ 3413 /* build a new pkt */
3418 if (pkt_dev->skb) 3414 kfree_skb(pkt_dev->skb);
3419 kfree_skb(pkt_dev->skb);
3420 3415
3421 pkt_dev->skb = fill_packet(odev, pkt_dev); 3416 pkt_dev->skb = fill_packet(odev, pkt_dev);
3422 if (pkt_dev->skb == NULL) { 3417 if (pkt_dev->skb == NULL) {
@@ -3498,8 +3493,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev)
3498 3493
3499 /* Done with this */ 3494 /* Done with this */
3500 pktgen_stop_device(pkt_dev); 3495 pktgen_stop_device(pkt_dev);
3501 if (pkt_dev->skb) 3496 kfree_skb(pkt_dev->skb);
3502 kfree_skb(pkt_dev->skb);
3503 pkt_dev->skb = NULL; 3497 pkt_dev->skb = NULL;
3504 } 3498 }
3505out:; 3499out:;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 790dd205bb5d..d78030f88bd0 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -455,8 +455,8 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid)
455 return nlmsg_unicast(rtnl, skb, pid); 455 return nlmsg_unicast(rtnl, skb, pid);
456} 456}
457 457
458int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, 458void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
459 struct nlmsghdr *nlh, gfp_t flags) 459 struct nlmsghdr *nlh, gfp_t flags)
460{ 460{
461 struct sock *rtnl = net->rtnl; 461 struct sock *rtnl = net->rtnl;
462 int report = 0; 462 int report = 0;
@@ -464,7 +464,7 @@ int rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group,
464 if (nlh) 464 if (nlh)
465 report = nlmsg_report(nlh); 465 report = nlmsg_report(nlh);
466 466
467 return nlmsg_notify(rtnl, skb, pid, group, report, flags); 467 nlmsg_notify(rtnl, skb, pid, group, report, flags);
468} 468}
469 469
470void rtnl_set_sk_err(struct net *net, u32 group, int error) 470void rtnl_set_sk_err(struct net *net, u32 group, int error)
@@ -1246,7 +1246,8 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
1246 kfree_skb(skb); 1246 kfree_skb(skb);
1247 goto errout; 1247 goto errout;
1248 } 1248 }
1249 err = rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL); 1249 rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_KERNEL);
1250 return;
1250errout: 1251errout:
1251 if (err < 0) 1252 if (err < 0)
1252 rtnl_set_sk_err(net, RTNLGRP_LINK, err); 1253 rtnl_set_sk_err(net, RTNLGRP_LINK, err);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e5a8351ff12d..6acbf9e79eb1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -65,6 +65,7 @@
65 65
66#include <asm/uaccess.h> 66#include <asm/uaccess.h>
67#include <asm/system.h> 67#include <asm/system.h>
68#include <trace/skb.h>
68 69
69#include "kmap_skb.h" 70#include "kmap_skb.h"
70 71
@@ -146,14 +147,6 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
146} 147}
147EXPORT_SYMBOL(skb_under_panic); 148EXPORT_SYMBOL(skb_under_panic);
148 149
149void skb_truesize_bug(struct sk_buff *skb)
150{
151 WARN(net_ratelimit(), KERN_ERR "SKB BUG: Invalid truesize (%u) "
152 "len=%u, sizeof(sk_buff)=%Zd\n",
153 skb->truesize, skb->len, sizeof(struct sk_buff));
154}
155EXPORT_SYMBOL(skb_truesize_bug);
156
157/* Allocate a new skbuff. We do this ourselves so we can fill in a few 150/* Allocate a new skbuff. We do this ourselves so we can fill in a few
158 * 'private' fields and also do memory statistics to find all the 151 * 'private' fields and also do memory statistics to find all the
159 * [BEEP] leaks. 152 * [BEEP] leaks.
@@ -450,11 +443,32 @@ void kfree_skb(struct sk_buff *skb)
450 smp_rmb(); 443 smp_rmb();
451 else if (likely(!atomic_dec_and_test(&skb->users))) 444 else if (likely(!atomic_dec_and_test(&skb->users)))
452 return; 445 return;
446 trace_kfree_skb(skb, __builtin_return_address(0));
453 __kfree_skb(skb); 447 __kfree_skb(skb);
454} 448}
455EXPORT_SYMBOL(kfree_skb); 449EXPORT_SYMBOL(kfree_skb);
456 450
457/** 451/**
452 * consume_skb - free an skbuff
453 * @skb: buffer to free
454 *
 455 * Drop a ref to the buffer and free it if the usage count has hit zero.
 456 * Functions identically to kfree_skb, but kfree_skb assumes the frame is
 457 * being dropped after a failure, which is noted via the kfree_skb tracepoint.
458 */
459void consume_skb(struct sk_buff *skb)
460{
461 if (unlikely(!skb))
462 return;
463 if (likely(atomic_read(&skb->users) == 1))
464 smp_rmb();
465 else if (likely(!atomic_dec_and_test(&skb->users)))
466 return;
467 __kfree_skb(skb);
468}
469EXPORT_SYMBOL(consume_skb);
470
471/**
458 * skb_recycle_check - check if skb can be reused for receive 472 * skb_recycle_check - check if skb can be reused for receive
459 * @skb: buffer 473 * @skb: buffer
460 * @skb_size: minimum receive buffer size 474 * @skb_size: minimum receive buffer size
@@ -1216,8 +1230,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
1216 insp = list; 1230 insp = list;
1217 } 1231 }
1218 if (!pskb_pull(list, eat)) { 1232 if (!pskb_pull(list, eat)) {
1219 if (clone) 1233 kfree_skb(clone);
1220 kfree_skb(clone);
1221 return NULL; 1234 return NULL;
1222 } 1235 }
1223 break; 1236 break;
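Since kfree_skb() now fires the kfree_skb tracepoint (and so feeds the drop monitor introduced above), the new consume_skb() lets callers free a successfully delivered buffer without it being recorded as a drop. A small illustrative kernel-style sketch of the intended split in a TX-completion path (the function and flag names are made up):

/* Illustrative only: free TX skbs according to how they finished. */
static void example_tx_complete(struct sk_buff *skb, bool transmitted_ok)
{
	if (transmitted_ok)
		consume_skb(skb);	/* normal consumption, no drop recorded */
	else
		kfree_skb(skb);		/* genuine drop, hits the kfree_skb tracepoint */
}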
diff --git a/net/core/sock.c b/net/core/sock.c
index 40887e76652c..0620046e4eba 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -150,7 +150,7 @@ static const char *af_family_key_strings[AF_MAX+1] = {
150 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" , 150 "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
151 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" , 151 "sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
152 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" , 152 "sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
153 "sk_lock-21" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" , 153 "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
154 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" , 154 "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
155 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" , 155 "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
156 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" , 156 "sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
@@ -165,7 +165,7 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = {
165 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" , 165 "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
166 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" , 166 "slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
167 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" , 167 "slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
168 "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" , 168 "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
169 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" , 169 "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
170 "slock-27" , "slock-28" , "slock-AF_CAN" , 170 "slock-27" , "slock-28" , "slock-AF_CAN" ,
171 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" , 171 "slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
@@ -180,7 +180,7 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = {
180 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" , 180 "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
181 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" , 181 "clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
182 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" , 182 "clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
183 "clock-21" , "clock-AF_SNA" , "clock-AF_IRDA" , 183 "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
184 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , 184 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
185 "clock-27" , "clock-28" , "clock-AF_CAN" , 185 "clock-27" , "clock-28" , "clock-AF_CAN" ,
186 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , 186 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
@@ -725,7 +725,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
725 if (len < 0) 725 if (len < 0)
726 return -EINVAL; 726 return -EINVAL;
727 727
728 v.val = 0; 728 memset(&v, 0, sizeof(v));
729 729
730 switch(optname) { 730 switch(optname) {
731 case SO_DEBUG: 731 case SO_DEBUG:
@@ -1185,7 +1185,6 @@ void sock_rfree(struct sk_buff *skb)
1185{ 1185{
1186 struct sock *sk = skb->sk; 1186 struct sock *sk = skb->sk;
1187 1187
1188 skb_truesize_check(skb);
1189 atomic_sub(skb->truesize, &sk->sk_rmem_alloc); 1188 atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1190 sk_mem_uncharge(skb->sk, skb->truesize); 1189 sk_mem_uncharge(skb->sk, skb->truesize);
1191} 1190}
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 83d3398559ea..7db1de0497c6 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -11,6 +11,7 @@
11#include <linux/socket.h> 11#include <linux/socket.h>
12#include <linux/netdevice.h> 12#include <linux/netdevice.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <net/ip.h>
14#include <net/sock.h> 15#include <net/sock.h>
15 16
16static struct ctl_table net_core_table[] = { 17static struct ctl_table net_core_table[] = {
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 45f95e55f873..7ea557b7c6b1 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -20,6 +20,9 @@
20/* We can spread an ack vector across multiple options */ 20/* We can spread an ack vector across multiple options */
21#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2) 21#define DCCP_MAX_ACKVEC_LEN (DCCP_SINGLE_OPT_MAXLEN * 2)
22 22
23/* Estimated minimum average Ack Vector length - used for updating MPS */
24#define DCCPAV_MIN_OPTLEN 16
25
23#define DCCP_ACKVEC_STATE_RECEIVED 0 26#define DCCP_ACKVEC_STATE_RECEIVED 0
24#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6) 27#define DCCP_ACKVEC_STATE_ECN_MARKED (1 << 6)
25#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6) 28#define DCCP_ACKVEC_STATE_NOT_RECEIVED (3 << 6)
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 08a569ff02d1..d6bc47363b1c 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -63,11 +63,14 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo);
63 * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields 63 * - DCCP-Reset with ACK Subheader and 4 bytes of Reset Code fields
64 * Hence a safe upper bound for the maximum option length is 1020-28 = 992 64 * Hence a safe upper bound for the maximum option length is 1020-28 = 992
65 */ 65 */
66#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(int)) 66#define MAX_DCCP_SPECIFIC_HEADER (255 * sizeof(uint32_t))
67#define DCCP_MAX_PACKET_HDR 28 67#define DCCP_MAX_PACKET_HDR 28
68#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR) 68#define DCCP_MAX_OPT_LEN (MAX_DCCP_SPECIFIC_HEADER - DCCP_MAX_PACKET_HDR)
69#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER) 69#define MAX_DCCP_HEADER (MAX_DCCP_SPECIFIC_HEADER + MAX_HEADER)
70 70
71/* Upper bound for initial feature-negotiation overhead (padded to 32 bits) */
72#define DCCP_FEATNEG_OVERHEAD (32 * sizeof(uint32_t))
73
71#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT 74#define DCCP_TIMEWAIT_LEN (60 * HZ) /* how long to wait to destroy TIME-WAIT
72 * state, about 60 seconds */ 75 * state, about 60 seconds */
73 76
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 22a618af4893..36bcc00654d3 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -161,21 +161,27 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu)
161 struct inet_connection_sock *icsk = inet_csk(sk); 161 struct inet_connection_sock *icsk = inet_csk(sk);
162 struct dccp_sock *dp = dccp_sk(sk); 162 struct dccp_sock *dp = dccp_sk(sk);
163 u32 ccmps = dccp_determine_ccmps(dp); 163 u32 ccmps = dccp_determine_ccmps(dp);
164 int cur_mps = ccmps ? min(pmtu, ccmps) : pmtu; 164 u32 cur_mps = ccmps ? min(pmtu, ccmps) : pmtu;
165 165
166 /* Account for header lengths and IPv4/v6 option overhead */ 166 /* Account for header lengths and IPv4/v6 option overhead */
167 cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len + 167 cur_mps -= (icsk->icsk_af_ops->net_header_len + icsk->icsk_ext_hdr_len +
168 sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext)); 168 sizeof(struct dccp_hdr) + sizeof(struct dccp_hdr_ext));
169 169
170 /* 170 /*
171 * FIXME: this should come from the CCID infrastructure, where, say, 171 * Leave enough headroom for common DCCP header options.
172 * TFRC will say it wants TIMESTAMPS, ELAPSED time, etc, for now lets 172 * This only considers options which may appear on DCCP-Data packets, as
173 * put a rough estimate for NDP + TIMESTAMP + TIMESTAMP_ECHO + ELAPSED 173 * per table 3 in RFC 4340, 5.8. When running out of space for other
174 * TIME + TFRC_OPT_LOSS_EVENT_RATE + TFRC_OPT_RECEIVE_RATE + padding to 174 * options (eg. Ack Vector which can take up to 255 bytes), it is better
175 * make it a multiple of 4 175 * to schedule a separate Ack. Thus we leave headroom for the following:
176 * - 1 byte for Slow Receiver (11.6)
177 * - 6 bytes for Timestamp (13.1)
178 * - 10 bytes for Timestamp Echo (13.3)
179 * - 8 bytes for NDP count (7.7, when activated)
180 * - 6 bytes for Data Checksum (9.3)
181 * - %DCCPAV_MIN_OPTLEN bytes for Ack Vector size (11.4, when enabled)
176 */ 182 */
177 183 cur_mps -= roundup(1 + 6 + 10 + dp->dccps_send_ndp_count * 8 + 6 +
178 cur_mps -= roundup(5 + 6 + 10 + 6 + 6 + 6, 4); 184 (dp->dccps_hc_rx_ackvec ? DCCPAV_MIN_OPTLEN : 0), 4);
179 185
180 /* And store cached results */ 186 /* And store cached results */
181 icsk->icsk_pmtu_cookie = pmtu; 187 icsk->icsk_pmtu_cookie = pmtu;
@@ -270,7 +276,20 @@ void dccp_write_xmit(struct sock *sk, int block)
270 const int len = skb->len; 276 const int len = skb->len;
271 277
272 if (sk->sk_state == DCCP_PARTOPEN) { 278 if (sk->sk_state == DCCP_PARTOPEN) {
273 /* See 8.1.5. Handshake Completion */ 279 const u32 cur_mps = dp->dccps_mss_cache - DCCP_FEATNEG_OVERHEAD;
280 /*
281 * See 8.1.5 - Handshake Completion.
282 *
283 * For robustness we resend Confirm options until the client has
284 * entered OPEN. During the initial feature negotiation, the MPS
285 * is smaller than usual, reduced by the Change/Confirm options.
286 */
287 if (!list_empty(&dp->dccps_featneg) && len > cur_mps) {
288 DCCP_WARN("Payload too large (%d) for featneg.\n", len);
289 dccp_send_ack(sk);
290 dccp_feat_list_purge(&dp->dccps_featneg);
291 }
292
274 inet_csk_schedule_ack(sk); 293 inet_csk_schedule_ack(sk);
275 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, 294 inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
276 inet_csk(sk)->icsk_rto, 295 inet_csk(sk)->icsk_rto,
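For concreteness, the reservation in dccp_sync_mss() above works out as follows: with NDP counts active (dccps_send_ndp_count = 1) and an Ack Vector in use, it subtracts roundup(1 + 6 + 10 + 8 + 6 + 16, 4) = roundup(47, 4) = 48 bytes of option headroom; with neither feature enabled it falls to roundup(1 + 6 + 10 + 6, 4) = 24 bytes, compared with the flat roundup(39, 4) = 40 bytes of the old rough estimate.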
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 12bf7d4c16c6..9647d911f916 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -1246,11 +1246,12 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1246 1246
1247 case TIOCINQ: 1247 case TIOCINQ:
1248 lock_sock(sk); 1248 lock_sock(sk);
1249 if ((skb = skb_peek(&scp->other_receive_queue)) != NULL) { 1249 skb = skb_peek(&scp->other_receive_queue);
1250 if (skb) {
1250 amount = skb->len; 1251 amount = skb->len;
1251 } else { 1252 } else {
1252 struct sk_buff *skb = sk->sk_receive_queue.next; 1253 skb = sk->sk_receive_queue.next;
1253 for(;;) { 1254 for (;;) {
1254 if (skb == 1255 if (skb ==
1255 (struct sk_buff *)&sk->sk_receive_queue) 1256 (struct sk_buff *)&sk->sk_receive_queue)
1256 break; 1257 break;
@@ -1579,16 +1580,16 @@ static int __dn_getsockopt(struct socket *sock, int level,int optname, char __us
1579 default: 1580 default:
1580#ifdef CONFIG_NETFILTER 1581#ifdef CONFIG_NETFILTER
1581 { 1582 {
1582 int val, len; 1583 int ret, len;
1583 1584
1584 if(get_user(len, optlen)) 1585 if(get_user(len, optlen))
1585 return -EFAULT; 1586 return -EFAULT;
1586 1587
1587 val = nf_getsockopt(sk, PF_DECnet, optname, 1588 ret = nf_getsockopt(sk, PF_DECnet, optname,
1588 optval, &len); 1589 optval, &len);
1589 if (val >= 0) 1590 if (ret >= 0)
1590 val = put_user(len, optlen); 1591 ret = put_user(len, optlen);
1591 return val; 1592 return ret;
1592 } 1593 }
1593#endif 1594#endif
1594 case DSO_STREAM: 1595 case DSO_STREAM:
@@ -2071,8 +2072,7 @@ static int dn_sendmsg(struct kiocb *iocb, struct socket *sock,
2071 } 2072 }
2072out: 2073out:
2073 2074
2074 if (skb) 2075 kfree_skb(skb);
2075 kfree_skb(skb);
2076 2076
2077 release_sock(sk); 2077 release_sock(sk);
2078 2078
@@ -2112,9 +2112,8 @@ static struct notifier_block dn_dev_notifier = {
2112 2112
2113extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); 2113extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *);
2114 2114
2115static struct packet_type dn_dix_packet_type = { 2115static struct packet_type dn_dix_packet_type __read_mostly = {
2116 .type = cpu_to_be16(ETH_P_DNA_RT), 2116 .type = cpu_to_be16(ETH_P_DNA_RT),
2117 .dev = NULL, /* All devices */
2118 .func = dn_route_rcv, 2117 .func = dn_route_rcv,
2119}; 2118};
2120 2119
diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c
index daf2b98b15fe..1c6a5bb6f0c8 100644
--- a/net/decnet/dn_dev.c
+++ b/net/decnet/dn_dev.c
@@ -684,7 +684,6 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
684 return -ENODEV; 684 return -ENODEV;
685 685
686 if ((dn_db = dev->dn_ptr) == NULL) { 686 if ((dn_db = dev->dn_ptr) == NULL) {
687 int err;
688 dn_db = dn_dev_create(dev, &err); 687 dn_db = dn_dev_create(dev, &err);
689 if (!dn_db) 688 if (!dn_db)
690 return err; 689 return err;
@@ -769,7 +768,8 @@ static void dn_ifaddr_notify(int event, struct dn_ifaddr *ifa)
769 kfree_skb(skb); 768 kfree_skb(skb);
770 goto errout; 769 goto errout;
771 } 770 }
772 err = rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL); 771 rtnl_notify(skb, &init_net, 0, RTNLGRP_DECnet_IFADDR, NULL, GFP_KERNEL);
772 return;
773errout: 773errout:
774 if (err < 0) 774 if (err < 0)
775 rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err); 775 rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_IFADDR, err);
@@ -1322,6 +1322,7 @@ static inline int is_dn_dev(struct net_device *dev)
1322} 1322}
1323 1323
1324static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos) 1324static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos)
1325 __acquires(&dev_base_lock)
1325{ 1326{
1326 int i; 1327 int i;
1327 struct net_device *dev; 1328 struct net_device *dev;
@@ -1364,6 +1365,7 @@ static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1364} 1365}
1365 1366
1366static void dn_dev_seq_stop(struct seq_file *seq, void *v) 1367static void dn_dev_seq_stop(struct seq_file *seq, void *v)
1368 __releases(&dev_base_lock)
1367{ 1369{
1368 read_unlock(&dev_base_lock); 1370 read_unlock(&dev_base_lock);
1369} 1371}
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 5130dee0b384..0cc4394117df 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -380,7 +380,6 @@ static int dn_return_short(struct sk_buff *skb)
380 unsigned char *ptr; 380 unsigned char *ptr;
381 __le16 *src; 381 __le16 *src;
382 __le16 *dst; 382 __le16 *dst;
383 __le16 tmp;
384 383
385 /* Add back headers */ 384 /* Add back headers */
386 skb_push(skb, skb->data - skb_network_header(skb)); 385 skb_push(skb, skb->data - skb_network_header(skb));
@@ -399,10 +398,7 @@ static int dn_return_short(struct sk_buff *skb)
399 ptr += 2; 398 ptr += 2;
400 *ptr = 0; /* Zero hop count */ 399 *ptr = 0; /* Zero hop count */
401 400
402 /* Swap source and destination */ 401 swap(*src, *dst);
403 tmp = *src;
404 *src = *dst;
405 *dst = tmp;
406 402
407 skb->pkt_type = PACKET_OUTGOING; 403 skb->pkt_type = PACKET_OUTGOING;
408 dn_rt_finish_output(skb, NULL, NULL); 404 dn_rt_finish_output(skb, NULL, NULL);
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index 69ad9280c693..67054b0d550f 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -375,7 +375,8 @@ static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, u32 tb_id,
375 kfree_skb(skb); 375 kfree_skb(skb);
376 goto errout; 376 goto errout;
377 } 377 }
378 err = rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL); 378 rtnl_notify(skb, &init_net, pid, RTNLGRP_DECnet_ROUTE, nlh, GFP_KERNEL);
379 return;
379errout: 380errout:
380 if (err < 0) 381 if (err < 0)
381 rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err); 382 rtnl_set_sk_err(&init_net, RTNLGRP_DECnet_ROUTE, err);
diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c
index 965397af9a80..5bcd592ae6dd 100644
--- a/net/decnet/sysctl_net_decnet.c
+++ b/net/decnet/sysctl_net_decnet.c
@@ -179,7 +179,7 @@ static int dn_node_address_handler(ctl_table *table, int write,
179 } 179 }
180 180
181 if (write) { 181 if (write) {
182 int len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1); 182 len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1);
183 183
184 if (copy_from_user(addr, buffer, len)) 184 if (copy_from_user(addr, buffer, len))
185 return -EFAULT; 185 return -EFAULT;
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index 49211b35725b..c51b55400dc5 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -41,13 +41,13 @@ config NET_DSA_MV88E6XXX_NEED_PPU
41 default n 41 default n
42 42
43config NET_DSA_MV88E6131 43config NET_DSA_MV88E6131
44 bool "Marvell 88E6131 ethernet switch chip support" 44 bool "Marvell 88E6095/6095F/6131 ethernet switch chip support"
45 select NET_DSA_MV88E6XXX 45 select NET_DSA_MV88E6XXX
46 select NET_DSA_MV88E6XXX_NEED_PPU 46 select NET_DSA_MV88E6XXX_NEED_PPU
47 select NET_DSA_TAG_DSA 47 select NET_DSA_TAG_DSA
48 ---help--- 48 ---help---
49 This enables support for the Marvell 88E6131 ethernet switch 49 This enables support for the Marvell 88E6095/6095F/6131
50 chip. 50 ethernet switch chips.
51 51
52config NET_DSA_MV88E6123_61_65 52config NET_DSA_MV88E6123_61_65
53 bool "Marvell 88E6123/6161/6165 ethernet switch chip support" 53 bool "Marvell 88E6123/6161/6165 ethernet switch chip support"
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 33e99462023a..71489f69a42c 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * net/dsa/dsa.c - Hardware switch handling 2 * net/dsa/dsa.c - Hardware switch handling
3 * Copyright (c) 2008 Marvell Semiconductor 3 * Copyright (c) 2008-2009 Marvell Semiconductor
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -67,12 +67,13 @@ dsa_switch_probe(struct mii_bus *bus, int sw_addr, char **_name)
67 67
68/* basic switch operations **************************************************/ 68/* basic switch operations **************************************************/
69static struct dsa_switch * 69static struct dsa_switch *
70dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd, 70dsa_switch_setup(struct dsa_switch_tree *dst, int index,
71 struct mii_bus *bus, struct net_device *dev) 71 struct device *parent, struct mii_bus *bus)
72{ 72{
73 struct dsa_chip_data *pd = dst->pd->chip + index;
74 struct dsa_switch_driver *drv;
73 struct dsa_switch *ds; 75 struct dsa_switch *ds;
74 int ret; 76 int ret;
75 struct dsa_switch_driver *drv;
76 char *name; 77 char *name;
77 int i; 78 int i;
78 79
@@ -81,11 +82,12 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
81 */ 82 */
82 drv = dsa_switch_probe(bus, pd->sw_addr, &name); 83 drv = dsa_switch_probe(bus, pd->sw_addr, &name);
83 if (drv == NULL) { 84 if (drv == NULL) {
84 printk(KERN_ERR "%s: could not detect attached switch\n", 85 printk(KERN_ERR "%s[%d]: could not detect attached switch\n",
85 dev->name); 86 dst->master_netdev->name, index);
86 return ERR_PTR(-EINVAL); 87 return ERR_PTR(-EINVAL);
87 } 88 }
88 printk(KERN_INFO "%s: detected a %s switch\n", dev->name, name); 89 printk(KERN_INFO "%s[%d]: detected a %s switch\n",
90 dst->master_netdev->name, index, name);
89 91
90 92
91 /* 93 /*
@@ -95,18 +97,16 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
95 if (ds == NULL) 97 if (ds == NULL)
96 return ERR_PTR(-ENOMEM); 98 return ERR_PTR(-ENOMEM);
97 99
98 ds->pd = pd; 100 ds->dst = dst;
99 ds->master_netdev = dev; 101 ds->index = index;
100 ds->master_mii_bus = bus; 102 ds->pd = dst->pd->chip + index;
101
102 ds->drv = drv; 103 ds->drv = drv;
103 ds->tag_protocol = drv->tag_protocol; 104 ds->master_mii_bus = bus;
104 105
105 106
106 /* 107 /*
107 * Validate supplied switch configuration. 108 * Validate supplied switch configuration.
108 */ 109 */
109 ds->cpu_port = -1;
110 for (i = 0; i < DSA_MAX_PORTS; i++) { 110 for (i = 0; i < DSA_MAX_PORTS; i++) {
111 char *name; 111 char *name;
112 112
@@ -115,32 +115,28 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
115 continue; 115 continue;
116 116
117 if (!strcmp(name, "cpu")) { 117 if (!strcmp(name, "cpu")) {
118 if (ds->cpu_port != -1) { 118 if (dst->cpu_switch != -1) {
119 printk(KERN_ERR "multiple cpu ports?!\n"); 119 printk(KERN_ERR "multiple cpu ports?!\n");
120 ret = -EINVAL; 120 ret = -EINVAL;
121 goto out; 121 goto out;
122 } 122 }
123 ds->cpu_port = i; 123 dst->cpu_switch = index;
124 dst->cpu_port = i;
125 } else if (!strcmp(name, "dsa")) {
126 ds->dsa_port_mask |= 1 << i;
124 } else { 127 } else {
125 ds->valid_port_mask |= 1 << i; 128 ds->phys_port_mask |= 1 << i;
126 } 129 }
127 } 130 }
128 131
129 if (ds->cpu_port == -1) {
130 printk(KERN_ERR "no cpu port?!\n");
131 ret = -EINVAL;
132 goto out;
133 }
134
135 132
136 /* 133 /*
137 * If we use a tagging format that doesn't have an ethertype 134 * If the CPU connects to this switch, set the switch tree
138 * field, make sure that all packets from this point on get 135 * tagging protocol to the preferred tagging format of this
139 * sent to the tag format's receive function. (Which will 136 * switch.
140 * discard received packets until we set ds->ports[] below.)
141 */ 137 */
142 wmb(); 138 if (ds->dst->cpu_switch == index)
143 dev->dsa_ptr = (void *)ds; 139 ds->dst->tag_protocol = drv->tag_protocol;
144 140
145 141
146 /* 142 /*
@@ -150,7 +146,7 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
150 if (ret < 0) 146 if (ret < 0)
151 goto out; 147 goto out;
152 148
153 ret = drv->set_addr(ds, dev->dev_addr); 149 ret = drv->set_addr(ds, dst->master_netdev->dev_addr);
154 if (ret < 0) 150 if (ret < 0)
155 goto out; 151 goto out;
156 152
@@ -169,18 +165,18 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
169 /* 165 /*
170 * Create network devices for physical switch ports. 166 * Create network devices for physical switch ports.
171 */ 167 */
172 wmb();
173 for (i = 0; i < DSA_MAX_PORTS; i++) { 168 for (i = 0; i < DSA_MAX_PORTS; i++) {
174 struct net_device *slave_dev; 169 struct net_device *slave_dev;
175 170
176 if (!(ds->valid_port_mask & (1 << i))) 171 if (!(ds->phys_port_mask & (1 << i)))
177 continue; 172 continue;
178 173
179 slave_dev = dsa_slave_create(ds, parent, i, pd->port_names[i]); 174 slave_dev = dsa_slave_create(ds, parent, i, pd->port_names[i]);
180 if (slave_dev == NULL) { 175 if (slave_dev == NULL) {
181 printk(KERN_ERR "%s: can't create dsa slave " 176 printk(KERN_ERR "%s[%d]: can't create dsa "
182 "device for port %d(%s)\n", 177 "slave device for port %d(%s)\n",
183 dev->name, i, pd->port_names[i]); 178 dst->master_netdev->name,
179 index, i, pd->port_names[i]);
184 continue; 180 continue;
185 } 181 }
186 182
@@ -192,7 +188,6 @@ dsa_switch_setup(struct device *parent, struct dsa_platform_data *pd,
192out_free: 188out_free:
193 mdiobus_free(ds->slave_mii_bus); 189 mdiobus_free(ds->slave_mii_bus);
194out: 190out:
195 dev->dsa_ptr = NULL;
196 kfree(ds); 191 kfree(ds);
197 return ERR_PTR(ret); 192 return ERR_PTR(ret);
198} 193}
@@ -212,35 +207,42 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
212 */ 207 */
213bool dsa_uses_dsa_tags(void *dsa_ptr) 208bool dsa_uses_dsa_tags(void *dsa_ptr)
214{ 209{
215 struct dsa_switch *ds = dsa_ptr; 210 struct dsa_switch_tree *dst = dsa_ptr;
216 211
217 return !!(ds->tag_protocol == htons(ETH_P_DSA)); 212 return !!(dst->tag_protocol == htons(ETH_P_DSA));
218} 213}
219 214
220bool dsa_uses_trailer_tags(void *dsa_ptr) 215bool dsa_uses_trailer_tags(void *dsa_ptr)
221{ 216{
222 struct dsa_switch *ds = dsa_ptr; 217 struct dsa_switch_tree *dst = dsa_ptr;
223 218
224 return !!(ds->tag_protocol == htons(ETH_P_TRAILER)); 219 return !!(dst->tag_protocol == htons(ETH_P_TRAILER));
225} 220}
226 221
227 222
228/* link polling *************************************************************/ 223/* link polling *************************************************************/
229static void dsa_link_poll_work(struct work_struct *ugly) 224static void dsa_link_poll_work(struct work_struct *ugly)
230{ 225{
231 struct dsa_switch *ds; 226 struct dsa_switch_tree *dst;
227 int i;
228
229 dst = container_of(ugly, struct dsa_switch_tree, link_poll_work);
232 230
233 ds = container_of(ugly, struct dsa_switch, link_poll_work); 231 for (i = 0; i < dst->pd->nr_chips; i++) {
232 struct dsa_switch *ds = dst->ds[i];
234 233
235 ds->drv->poll_link(ds); 234 if (ds != NULL && ds->drv->poll_link != NULL)
236 mod_timer(&ds->link_poll_timer, round_jiffies(jiffies + HZ)); 235 ds->drv->poll_link(ds);
236 }
237
238 mod_timer(&dst->link_poll_timer, round_jiffies(jiffies + HZ));
237} 239}
238 240
239static void dsa_link_poll_timer(unsigned long _ds) 241static void dsa_link_poll_timer(unsigned long _dst)
240{ 242{
241 struct dsa_switch *ds = (void *)_ds; 243 struct dsa_switch_tree *dst = (void *)_dst;
242 244
243 schedule_work(&ds->link_poll_work); 245 schedule_work(&dst->link_poll_work);
244} 246}
245 247
246 248
@@ -303,18 +305,14 @@ static int dsa_probe(struct platform_device *pdev)
303 static int dsa_version_printed; 305 static int dsa_version_printed;
304 struct dsa_platform_data *pd = pdev->dev.platform_data; 306 struct dsa_platform_data *pd = pdev->dev.platform_data;
305 struct net_device *dev; 307 struct net_device *dev;
306 struct mii_bus *bus; 308 struct dsa_switch_tree *dst;
307 struct dsa_switch *ds; 309 int i;
308 310
309 if (!dsa_version_printed++) 311 if (!dsa_version_printed++)
310 printk(KERN_NOTICE "Distributed Switch Architecture " 312 printk(KERN_NOTICE "Distributed Switch Architecture "
311 "driver version %s\n", dsa_driver_version); 313 "driver version %s\n", dsa_driver_version);
312 314
313 if (pd == NULL || pd->mii_bus == NULL || pd->netdev == NULL) 315 if (pd == NULL || pd->netdev == NULL)
314 return -EINVAL;
315
316 bus = dev_to_mii_bus(pd->mii_bus);
317 if (bus == NULL)
318 return -EINVAL; 316 return -EINVAL;
319 317
320 dev = dev_to_net_device(pd->netdev); 318 dev = dev_to_net_device(pd->netdev);
@@ -326,36 +324,79 @@ static int dsa_probe(struct platform_device *pdev)
326 return -EEXIST; 324 return -EEXIST;
327 } 325 }
328 326
329 ds = dsa_switch_setup(&pdev->dev, pd, bus, dev); 327 dst = kzalloc(sizeof(*dst), GFP_KERNEL);
330 if (IS_ERR(ds)) { 328 if (dst == NULL) {
331 dev_put(dev); 329 dev_put(dev);
332 return PTR_ERR(ds); 330 return -ENOMEM;
333 } 331 }
334 332
335 if (ds->drv->poll_link != NULL) { 333 platform_set_drvdata(pdev, dst);
336 INIT_WORK(&ds->link_poll_work, dsa_link_poll_work); 334
337 init_timer(&ds->link_poll_timer); 335 dst->pd = pd;
338 ds->link_poll_timer.data = (unsigned long)ds; 336 dst->master_netdev = dev;
339 ds->link_poll_timer.function = dsa_link_poll_timer; 337 dst->cpu_switch = -1;
340 ds->link_poll_timer.expires = round_jiffies(jiffies + HZ); 338 dst->cpu_port = -1;
341 add_timer(&ds->link_poll_timer); 339
340 for (i = 0; i < pd->nr_chips; i++) {
341 struct mii_bus *bus;
342 struct dsa_switch *ds;
343
344 bus = dev_to_mii_bus(pd->chip[i].mii_bus);
345 if (bus == NULL) {
346 printk(KERN_ERR "%s[%d]: no mii bus found for "
347 "dsa switch\n", dev->name, i);
348 continue;
349 }
350
351 ds = dsa_switch_setup(dst, i, &pdev->dev, bus);
352 if (IS_ERR(ds)) {
353 printk(KERN_ERR "%s[%d]: couldn't create dsa switch "
354 "instance (error %ld)\n", dev->name, i,
355 PTR_ERR(ds));
356 continue;
357 }
358
359 dst->ds[i] = ds;
360 if (ds->drv->poll_link != NULL)
361 dst->link_poll_needed = 1;
342 } 362 }
343 363
344 platform_set_drvdata(pdev, ds); 364 /*
365 * If we use a tagging format that doesn't have an ethertype
366 * field, make sure that all packets from this point on get
367 * sent to the tag format's receive function.
368 */
369 wmb();
370 dev->dsa_ptr = (void *)dst;
371
372 if (dst->link_poll_needed) {
373 INIT_WORK(&dst->link_poll_work, dsa_link_poll_work);
374 init_timer(&dst->link_poll_timer);
375 dst->link_poll_timer.data = (unsigned long)dst;
376 dst->link_poll_timer.function = dsa_link_poll_timer;
377 dst->link_poll_timer.expires = round_jiffies(jiffies + HZ);
378 add_timer(&dst->link_poll_timer);
379 }
345 380
346 return 0; 381 return 0;
347} 382}
348 383
349static int dsa_remove(struct platform_device *pdev) 384static int dsa_remove(struct platform_device *pdev)
350{ 385{
351 struct dsa_switch *ds = platform_get_drvdata(pdev); 386 struct dsa_switch_tree *dst = platform_get_drvdata(pdev);
387 int i;
352 388
353 if (ds->drv->poll_link != NULL) 389 if (dst->link_poll_needed)
354 del_timer_sync(&ds->link_poll_timer); 390 del_timer_sync(&dst->link_poll_timer);
355 391
356 flush_scheduled_work(); 392 flush_scheduled_work();
357 393
358 dsa_switch_destroy(ds); 394 for (i = 0; i < dst->pd->nr_chips; i++) {
395 struct dsa_switch *ds = dst->ds[i];
396
397 if (ds != NULL)
398 dsa_switch_destroy(ds);
399 }
359 400
360 return 0; 401 return 0;
361} 402}
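dsa_probe() now treats per-chip setup failures as non-fatal: dsa_switch_setup() returns an error-encoded pointer and the loop logs the failure and continues with the next chip. The sketch below is a userspace imitation of the ERR_PTR/IS_ERR/PTR_ERR convention it relies on, with a made-up switch_setup() that fails for one index; the MAX_ERRNO bound mirrors the kernel's definition, the rest is illustrative only:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Pretend per-chip setup: fails for index 1, succeeds otherwise. */
static void *switch_setup(int index, int *dummy_storage)
{
	if (index == 1)
		return ERR_PTR(-ENODEV);
	return dummy_storage;
}

int main(void)
{
	int storage[3];
	int i;

	for (i = 0; i < 3; i++) {
		void *ds = switch_setup(i, &storage[i]);

		if (IS_ERR(ds)) {
			/* Same policy as dsa_probe(): log and keep going. */
			printf("chip %d: setup failed (error %ld)\n",
			       i, PTR_ERR(ds));
			continue;
		}
		printf("chip %d: setup ok\n", i);
	}
	return 0;
}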
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 7063378a1ebf..41055f33d28a 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -1,6 +1,6 @@
1/* 1/*
2 * net/dsa/dsa_priv.h - Hardware switch handling 2 * net/dsa/dsa_priv.h - Hardware switch handling
3 * Copyright (c) 2008 Marvell Semiconductor 3 * Copyright (c) 2008-2009 Marvell Semiconductor
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -19,42 +19,107 @@
19 19
20struct dsa_switch { 20struct dsa_switch {
21 /* 21 /*
22 * Configuration data for the platform device that owns 22 * Parent switch tree, and switch index.
23 * this dsa switch instance.
24 */ 23 */
25 struct dsa_platform_data *pd; 24 struct dsa_switch_tree *dst;
25 int index;
26 26
27 /* 27 /*
28 * References to network device and mii bus to use. 28 * Configuration data for this switch.
29 */ 29 */
30 struct net_device *master_netdev; 30 struct dsa_chip_data *pd;
31 struct mii_bus *master_mii_bus;
32 31
33 /* 32 /*
34 * The used switch driver and frame tagging type. 33 * The used switch driver.
35 */ 34 */
36 struct dsa_switch_driver *drv; 35 struct dsa_switch_driver *drv;
37 __be16 tag_protocol; 36
37 /*
38 * Reference to mii bus to use.
39 */
40 struct mii_bus *master_mii_bus;
38 41
39 /* 42 /*
40 * Slave mii_bus and devices for the individual ports. 43 * Slave mii_bus and devices for the individual ports.
41 */ 44 */
42 int cpu_port; 45 u32 dsa_port_mask;
43 u32 valid_port_mask; 46 u32 phys_port_mask;
44 struct mii_bus *slave_mii_bus; 47 struct mii_bus *slave_mii_bus;
45 struct net_device *ports[DSA_MAX_PORTS]; 48 struct net_device *ports[DSA_MAX_PORTS];
49};
50
51struct dsa_switch_tree {
52 /*
53 * Configuration data for the platform device that owns
54 * this dsa switch tree instance.
55 */
56 struct dsa_platform_data *pd;
57
58 /*
59 * Reference to network device to use, and which tagging
60 * protocol to use.
61 */
62 struct net_device *master_netdev;
63 __be16 tag_protocol;
64
65 /*
66 * The switch and port to which the CPU is attached.
67 */
68 s8 cpu_switch;
69 s8 cpu_port;
46 70
47 /* 71 /*
48 * Link state polling. 72 * Link state polling.
49 */ 73 */
50 struct work_struct link_poll_work; 74 int link_poll_needed;
51 struct timer_list link_poll_timer; 75 struct work_struct link_poll_work;
76 struct timer_list link_poll_timer;
77
78 /*
79 * Data for the individual switch chips.
80 */
81 struct dsa_switch *ds[DSA_MAX_SWITCHES];
52}; 82};
53 83
84static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p)
85{
86 return !!(ds->index == ds->dst->cpu_switch && p == ds->dst->cpu_port);
87}
88
89static inline u8 dsa_upstream_port(struct dsa_switch *ds)
90{
91 struct dsa_switch_tree *dst = ds->dst;
92
93 /*
94 * If this is the root switch (i.e. the switch that connects
95 * to the CPU), return the cpu port number on this switch.
96 * Else return the (DSA) port number that connects to the
97 * switch that is one hop closer to the cpu.
98 */
99 if (dst->cpu_switch == ds->index)
100 return dst->cpu_port;
101 else
102 return ds->pd->rtable[dst->cpu_switch];
103}
104
54struct dsa_slave_priv { 105struct dsa_slave_priv {
106 /*
107 * The linux network interface corresponding to this
108 * switch port.
109 */
55 struct net_device *dev; 110 struct net_device *dev;
111
112 /*
113 * Which switch this port is a part of, and the port index
114 * for this port.
115 */
56 struct dsa_switch *parent; 116 struct dsa_switch *parent;
57 int port; 117 u8 port;
118
119 /*
120 * The phylib phy_device pointer for the PHY connected
121 * to this port.
122 */
58 struct phy_device *phy; 123 struct phy_device *phy;
59}; 124};
60 125
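The dsa_is_cpu_port()/dsa_upstream_port() helpers introduced here carry most of the multi-chip logic, so a standalone sketch may help. It models two daisy-chained chips with made-up port numbers; trimmed-down tree and chip structs stand in for dsa_switch_tree and dsa_switch, and rtable[i] holds the local port that leads towards chip i, exactly as in the helper above:

#include <stdio.h>

#define DSA_MAX_SWITCHES	4

struct tree {
	int cpu_switch;			/* which chip owns the CPU port */
	int cpu_port;			/* port on that chip wired to the CPU */
};

struct chip {
	struct tree *dst;
	int index;
	int rtable[DSA_MAX_SWITCHES];	/* local port leading to chip i */
};

static int is_cpu_port(struct chip *ds, int p)
{
	return ds->index == ds->dst->cpu_switch && p == ds->dst->cpu_port;
}

static int upstream_port(struct chip *ds)
{
	struct tree *dst = ds->dst;

	/* On the chip that owns the CPU port, "upstream" is the CPU port
	 * itself; on every other chip it is the DSA port one hop closer
	 * to the CPU, as recorded in the routing table. */
	if (dst->cpu_switch == ds->index)
		return dst->cpu_port;
	return ds->rtable[dst->cpu_switch];
}

int main(void)
{
	struct tree t = { .cpu_switch = 0, .cpu_port = 5 };
	struct chip sw0 = { .dst = &t, .index = 0, .rtable = { -1, 4 } };
	struct chip sw1 = { .dst = &t, .index = 1, .rtable = { 6, -1 } };

	printf("sw0 upstream port: %d (cpu port? %d)\n",
	       upstream_port(&sw0), is_cpu_port(&sw0, 5));
	printf("sw1 upstream port: %d\n", upstream_port(&sw1));
	return 0;
}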
diff --git a/net/dsa/mv88e6060.c b/net/dsa/mv88e6060.c
index 85081ae9fe89..83277f463af7 100644
--- a/net/dsa/mv88e6060.c
+++ b/net/dsa/mv88e6060.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * net/dsa/mv88e6060.c - Driver for Marvell 88e6060 switch chips 2 * net/dsa/mv88e6060.c - Driver for Marvell 88e6060 switch chips
3 * Copyright (c) 2008 Marvell Semiconductor 3 * Copyright (c) 2008-2009 Marvell Semiconductor
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -81,7 +81,7 @@ static int mv88e6060_switch_reset(struct dsa_switch *ds)
81 /* 81 /*
82 * Reset the switch. 82 * Reset the switch.
83 */ 83 */
84 REG_WRITE(REG_GLOBAL, 0x0A, 0xa130); 84 REG_WRITE(REG_GLOBAL, 0x0a, 0xa130);
85 85
86 /* 86 /*
87 * Wait up to one second for reset to complete. 87 * Wait up to one second for reset to complete.
@@ -128,7 +128,7 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p)
128 * state to Forwarding. Additionally, if this is the CPU 128 * state to Forwarding. Additionally, if this is the CPU
129 * port, enable Ingress and Egress Trailer tagging mode. 129 * port, enable Ingress and Egress Trailer tagging mode.
130 */ 130 */
131 REG_WRITE(addr, 0x04, (p == ds->cpu_port) ? 0x4103 : 0x0003); 131 REG_WRITE(addr, 0x04, dsa_is_cpu_port(ds, p) ? 0x4103 : 0x0003);
132 132
133 /* 133 /*
134 * Port based VLAN map: give each port its own address 134 * Port based VLAN map: give each port its own address
@@ -138,9 +138,9 @@ static int mv88e6060_setup_port(struct dsa_switch *ds, int p)
138 */ 138 */
139 REG_WRITE(addr, 0x06, 139 REG_WRITE(addr, 0x06,
140 ((p & 0xf) << 12) | 140 ((p & 0xf) << 12) |
141 ((p == ds->cpu_port) ? 141 (dsa_is_cpu_port(ds, p) ?
142 ds->valid_port_mask : 142 ds->phys_port_mask :
143 (1 << ds->cpu_port))); 143 (1 << ds->dst->cpu_port)));
144 144
145 /* 145 /*
146 * Port Association Vector: when learning source addresses 146 * Port Association Vector: when learning source addresses
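The port-based VLAN map write above now composes the register value from the port number and phys_port_mask instead of the old valid_port_mask. A small illustrative program that reproduces the arithmetic for an assumed 88E6060 layout (CPU on port 5, user ports 0-4; the layout is made up for the example):

#include <stdio.h>

int main(void)
{
	int cpu_port = 5;
	unsigned int phys_port_mask = 0x1f;	/* user ports 0-4 */
	int p;

	for (p = 0; p <= 5; p++) {
		unsigned int val;

		val = (p & 0xf) << 12;		/* per-port address database */
		if (p == cpu_port)
			val |= phys_port_mask;	/* CPU may talk to all ports */
		else
			val |= 1 << cpu_port;	/* user ports only reach the CPU */

		printf("port %d: reg 0x06 = 0x%04x\n", p, val);
	}
	return 0;
}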
diff --git a/net/dsa/mv88e6123_61_65.c b/net/dsa/mv88e6123_61_65.c
index 100318722214..52faaa21a4d9 100644
--- a/net/dsa/mv88e6123_61_65.c
+++ b/net/dsa/mv88e6123_61_65.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * net/dsa/mv88e6123_61_65.c - Marvell 88e6123/6161/6165 switch chip support 2 * net/dsa/mv88e6123_61_65.c - Marvell 88e6123/6161/6165 switch chip support
3 * Copyright (c) 2008 Marvell Semiconductor 3 * Copyright (c) 2008-2009 Marvell Semiconductor
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -98,17 +98,17 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds)
98 return ret; 98 return ret;
99 99
100 /* 100 /*
101 * Configure the cpu port, and configure the cpu port as the 101 * Configure the upstream port, and configure the upstream
102 * port to which ingress and egress monitor frames are to be 102 * port as the port to which ingress and egress monitor frames
103 * sent. 103 * are to be sent.
104 */ 104 */
105 REG_WRITE(REG_GLOBAL, 0x1a, (ds->cpu_port * 0x1110)); 105 REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1110));
106 106
107 /* 107 /*
108 * Disable remote management for now, and set the switch's 108 * Disable remote management for now, and set the switch's
109 * DSA device number to zero. 109 * DSA device number.
110 */ 110 */
111 REG_WRITE(REG_GLOBAL, 0x1c, 0x0000); 111 REG_WRITE(REG_GLOBAL, 0x1c, ds->index & 0x1f);
112 112
113 /* 113 /*
114 * Send all frames with destination addresses matching 114 * Send all frames with destination addresses matching
@@ -133,10 +133,17 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds)
133 REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff); 133 REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff);
134 134
135 /* 135 /*
136 * Map all DSA device IDs to the CPU port. 136 * Program the DSA routing table.
137 */ 137 */
138 for (i = 0; i < 32; i++) 138 for (i = 0; i < 32; i++) {
139 REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | ds->cpu_port); 139 int nexthop;
140
141 nexthop = 0x1f;
142 if (i != ds->index && i < ds->dst->pd->nr_chips)
143 nexthop = ds->pd->rtable[i] & 0x1f;
144
145 REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop);
146 }
140 147
141 /* 148 /*
142 * Clear all trunk masks. 149 * Clear all trunk masks.
@@ -176,12 +183,18 @@ static int mv88e6123_61_65_setup_global(struct dsa_switch *ds)
176static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p) 183static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p)
177{ 184{
178 int addr = REG_PORT(p); 185 int addr = REG_PORT(p);
186 u16 val;
179 187
180 /* 188 /*
181 * MAC Forcing register: don't force link, speed, duplex 189 * MAC Forcing register: don't force link, speed, duplex
182 * or flow control state to any particular values. 190 * or flow control state to any particular values on physical
191 * ports, but force the CPU port and all DSA ports to 1000 Mb/s
192 * full duplex.
183 */ 193 */
184 REG_WRITE(addr, 0x01, 0x0003); 194 if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p))
195 REG_WRITE(addr, 0x01, 0x003e);
196 else
197 REG_WRITE(addr, 0x01, 0x0003);
185 198
186 /* 199 /*
187 * Do not limit the period of time that this port can be 200 * Do not limit the period of time that this port can be
@@ -192,37 +205,50 @@ static int mv88e6123_61_65_setup_port(struct dsa_switch *ds, int p)
192 205
193 /* 206 /*
194 * Port Control: disable Drop-on-Unlock, disable Drop-on-Lock, 207 * Port Control: disable Drop-on-Unlock, disable Drop-on-Lock,
195 * configure the requested (DSA/EDSA) tagging mode if this is 208 * disable Header mode, enable IGMP/MLD snooping, disable VLAN
196 * the CPU port, disable Header mode, enable IGMP/MLD snooping, 209 * tunneling, determine priority by looking at 802.1p and IP
197 * disable VLAN tunneling, determine priority by looking at 210 * priority fields (IP prio has precedence), and set STP state
198 * 802.1p and IP priority fields (IP prio has precedence), and 211 * to Forwarding.
199 * set STP state to Forwarding. Finally, if this is the CPU 212 *
200 * port, additionally enable forwarding of unknown unicast and 213 * If this is the CPU link, use DSA or EDSA tagging depending
201 * multicast addresses. 214 * on which tagging mode was configured.
202 */ 215 *
203 REG_WRITE(addr, 0x04, 216 * If this is a link to another switch, use DSA tagging mode.
204 (p == ds->cpu_port) ? 217 *
205 (ds->tag_protocol == htons(ETH_P_DSA)) ? 218 * If this is the upstream port for this switch, enable
206 0x053f : 0x373f : 219 * forwarding of unknown unicasts and multicasts.
207 0x0433); 220 */
221 val = 0x0433;
222 if (dsa_is_cpu_port(ds, p)) {
223 if (ds->dst->tag_protocol == htons(ETH_P_EDSA))
224 val |= 0x3300;
225 else
226 val |= 0x0100;
227 }
228 if (ds->dsa_port_mask & (1 << p))
229 val |= 0x0100;
230 if (p == dsa_upstream_port(ds))
231 val |= 0x000c;
232 REG_WRITE(addr, 0x04, val);
208 233
209 /* 234 /*
210 * Port Control 1: disable trunking. Also, if this is the 235 * Port Control 1: disable trunking. Also, if this is the
211 * CPU port, enable learn messages to be sent to this port. 236 * CPU port, enable learn messages to be sent to this port.
212 */ 237 */
213 REG_WRITE(addr, 0x05, (p == ds->cpu_port) ? 0x8000 : 0x0000); 238 REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000);
214 239
215 /* 240 /*
216 * Port based VLAN map: give each port its own address 241 * Port based VLAN map: give each port its own address
217 * database, allow the CPU port to talk to each of the 'real' 242 * database, allow the CPU port to talk to each of the 'real'
218 * ports, and allow each of the 'real' ports to only talk to 243 * ports, and allow each of the 'real' ports to only talk to
219 * the CPU port. 244 * the upstream port.
220 */ 245 */
221 REG_WRITE(addr, 0x06, 246 val = (p & 0xf) << 12;
222 ((p & 0xf) << 12) | 247 if (dsa_is_cpu_port(ds, p))
223 ((p == ds->cpu_port) ? 248 val |= ds->phys_port_mask;
224 ds->valid_port_mask : 249 else
225 (1 << ds->cpu_port))); 250 val |= 1 << dsa_upstream_port(ds);
251 REG_WRITE(addr, 0x06, val);
226 252
227 /* 253 /*
228 * Default VLAN ID and priority: don't set a default VLAN 254 * Default VLAN ID and priority: don't set a default VLAN
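The DSA routing table loop in mv88e6123_61_65_setup_global() is the per-chip counterpart of the rtable consulted by dsa_upstream_port(). The sketch below reproduces the register arithmetic for an assumed two-chip system (this chip is index 0 and its port 4 leads to chip 1; both values are made up), printing the first few Global2 offset 0x06 writes:

#include <stdio.h>

int main(void)
{
	int my_index = 0;
	int nr_chips = 2;
	int rtable[2] = { -1, 4 };	/* rtable[1] = port towards chip 1 */
	int i;

	for (i = 0; i < 32; i++) {
		int nexthop = 0x1f;	/* unknown device: invalid port */
		unsigned int regval;

		if (i != my_index && i < nr_chips)
			nexthop = rtable[i] & 0x1f;

		/* 0x8000 = update bit, bits 12:8 = target device,
		 * bits 4:0 = port leading to that device. */
		regval = 0x8000 | (i << 8) | nexthop;
		if (i < 3)
			printf("device %2d -> reg 0x06 = 0x%04x\n", i, regval);
	}
	return 0;
}

Note that the chip's own entry, and any device beyond nr_chips, keeps the invalid nexthop 0x1f, matching the loop above.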
diff --git a/net/dsa/mv88e6131.c b/net/dsa/mv88e6131.c
index 70fae2444cb6..bb2b41bc854e 100644
--- a/net/dsa/mv88e6131.c
+++ b/net/dsa/mv88e6131.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * net/dsa/mv88e6131.c - Marvell 88e6131 switch chip support 2 * net/dsa/mv88e6131.c - Marvell 88e6095/6095f/6131 switch chip support
3 * Copyright (c) 2008 Marvell Semiconductor 3 * Copyright (c) 2008-2009 Marvell Semiconductor
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -21,6 +21,8 @@ static char *mv88e6131_probe(struct mii_bus *bus, int sw_addr)
21 ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03); 21 ret = __mv88e6xxx_reg_read(bus, sw_addr, REG_PORT(0), 0x03);
22 if (ret >= 0) { 22 if (ret >= 0) {
23 ret &= 0xfff0; 23 ret &= 0xfff0;
24 if (ret == 0x0950)
25 return "Marvell 88E6095/88E6095F";
24 if (ret == 0x1060) 26 if (ret == 0x1060)
25 return "Marvell 88E6131"; 27 return "Marvell 88E6131";
26 } 28 }
@@ -36,7 +38,7 @@ static int mv88e6131_switch_reset(struct dsa_switch *ds)
36 /* 38 /*
37 * Set all ports to the disabled state. 39 * Set all ports to the disabled state.
38 */ 40 */
39 for (i = 0; i < 8; i++) { 41 for (i = 0; i < 11; i++) {
40 ret = REG_READ(REG_PORT(i), 0x04); 42 ret = REG_READ(REG_PORT(i), 0x04);
41 REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc); 43 REG_WRITE(REG_PORT(i), 0x04, ret & 0xfffc);
42 } 44 }
@@ -100,17 +102,17 @@ static int mv88e6131_setup_global(struct dsa_switch *ds)
100 REG_WRITE(REG_GLOBAL, 0x19, 0x8100); 102 REG_WRITE(REG_GLOBAL, 0x19, 0x8100);
101 103
102 /* 104 /*
103 * Disable ARP mirroring, and configure the cpu port as the 105 * Disable ARP mirroring, and configure the upstream port as
104 * port to which ingress and egress monitor frames are to be 106 * the port to which ingress and egress monitor frames are to
105 * sent. 107 * be sent.
106 */ 108 */
107 REG_WRITE(REG_GLOBAL, 0x1a, (ds->cpu_port * 0x1100) | 0x00f0); 109 REG_WRITE(REG_GLOBAL, 0x1a, (dsa_upstream_port(ds) * 0x1100) | 0x00f0);
108 110
109 /* 111 /*
110 * Disable cascade port functionality, and set the switch's 112 * Disable cascade port functionality, and set the switch's
111 * DSA device number to zero. 113 * DSA device number.
112 */ 114 */
113 REG_WRITE(REG_GLOBAL, 0x1c, 0xe000); 115 REG_WRITE(REG_GLOBAL, 0x1c, 0xe000 | (ds->index & 0x1f));
114 116
115 /* 117 /*
116 * Send all frames with destination addresses matching 118 * Send all frames with destination addresses matching
@@ -127,16 +129,23 @@ static int mv88e6131_setup_global(struct dsa_switch *ds)
127 REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff); 129 REG_WRITE(REG_GLOBAL2, 0x05, 0x00ff);
128 130
129 /* 131 /*
130 * Map all DSA device IDs to the CPU port. 132 * Program the DSA routing table.
131 */ 133 */
132 for (i = 0; i < 32; i++) 134 for (i = 0; i < 32; i++) {
133 REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | ds->cpu_port); 135 int nexthop;
136
137 nexthop = 0x1f;
138 if (i != ds->index && i < ds->dst->pd->nr_chips)
139 nexthop = ds->pd->rtable[i] & 0x1f;
140
141 REG_WRITE(REG_GLOBAL2, 0x06, 0x8000 | (i << 8) | nexthop);
142 }
134 143
135 /* 144 /*
136 * Clear all trunk masks. 145 * Clear all trunk masks.
137 */ 146 */
138 for (i = 0; i < 8; i++) 147 for (i = 0; i < 8; i++)
139 REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0xff); 148 REG_WRITE(REG_GLOBAL2, 0x07, 0x8000 | (i << 12) | 0x7ff);
140 149
141 /* 150 /*
142 * Clear all trunk mappings. 151 * Clear all trunk mappings.
@@ -156,12 +165,18 @@ static int mv88e6131_setup_global(struct dsa_switch *ds)
156static int mv88e6131_setup_port(struct dsa_switch *ds, int p) 165static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
157{ 166{
158 int addr = REG_PORT(p); 167 int addr = REG_PORT(p);
168 u16 val;
159 169
160 /* 170 /*
161 * MAC Forcing register: don't force link, speed, duplex 171 * MAC Forcing register: don't force link, speed, duplex
162 * or flow control state to any particular values. 172 * or flow control state to any particular values on physical
173 * ports, but force the CPU port and all DSA ports to 1000 Mb/s
174 * full duplex.
163 */ 175 */
164 REG_WRITE(addr, 0x01, 0x0003); 176 if (dsa_is_cpu_port(ds, p) || ds->dsa_port_mask & (1 << p))
177 REG_WRITE(addr, 0x01, 0x003e);
178 else
179 REG_WRITE(addr, 0x01, 0x0003);
165 180
166 /* 181 /*
167 * Port Control: disable Core Tag, disable Drop-on-Lock, 182 * Port Control: disable Core Tag, disable Drop-on-Lock,
@@ -169,29 +184,40 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
169 * enable IGMP/MLD snoop, disable DoubleTag, disable VLAN 184 * enable IGMP/MLD snoop, disable DoubleTag, disable VLAN
170 * tunneling, determine priority by looking at 802.1p and 185 * tunneling, determine priority by looking at 802.1p and
171 * IP priority fields (IP prio has precedence), and set STP 186 * IP priority fields (IP prio has precedence), and set STP
172 * state to Forwarding. Finally, if this is the CPU port, 187 * state to Forwarding.
173 * additionally enable DSA tagging and forwarding of unknown 188 *
174 * unicast addresses. 189 * If this is the upstream port for this switch, enable
190 * forwarding of unknown unicasts, and enable DSA tagging
191 * mode.
192 *
193 * If this is the link to another switch, use DSA tagging
194 * mode, but do not enable forwarding of unknown unicasts.
175 */ 195 */
176 REG_WRITE(addr, 0x04, (p == ds->cpu_port) ? 0x0537 : 0x0433); 196 val = 0x0433;
197 if (p == dsa_upstream_port(ds))
198 val |= 0x0104;
199 if (ds->dsa_port_mask & (1 << p))
200 val |= 0x0100;
201 REG_WRITE(addr, 0x04, val);
177 202
178 /* 203 /*
179 * Port Control 1: disable trunking. Also, if this is the 204 * Port Control 1: disable trunking. Also, if this is the
180 * CPU port, enable learn messages to be sent to this port. 205 * CPU port, enable learn messages to be sent to this port.
181 */ 206 */
182 REG_WRITE(addr, 0x05, (p == ds->cpu_port) ? 0x8000 : 0x0000); 207 REG_WRITE(addr, 0x05, dsa_is_cpu_port(ds, p) ? 0x8000 : 0x0000);
183 208
184 /* 209 /*
185 * Port based VLAN map: give each port its own address 210 * Port based VLAN map: give each port its own address
186 * database, allow the CPU port to talk to each of the 'real' 211 * database, allow the CPU port to talk to each of the 'real'
187 * ports, and allow each of the 'real' ports to only talk to 212 * ports, and allow each of the 'real' ports to only talk to
188 * the CPU port. 213 * the upstream port.
189 */ 214 */
190 REG_WRITE(addr, 0x06, 215 val = (p & 0xf) << 12;
191 ((p & 0xf) << 12) | 216 if (dsa_is_cpu_port(ds, p))
192 ((p == ds->cpu_port) ? 217 val |= ds->phys_port_mask;
193 ds->valid_port_mask : 218 else
194 (1 << ds->cpu_port))); 219 val |= 1 << dsa_upstream_port(ds);
220 REG_WRITE(addr, 0x06, val);
195 221
196 /* 222 /*
197 * Default VLAN ID and priority: don't set a default VLAN 223 * Default VLAN ID and priority: don't set a default VLAN
@@ -207,13 +233,15 @@ static int mv88e6131_setup_port(struct dsa_switch *ds, int p)
207 * untagged frames on this port, do a destination address 233 * untagged frames on this port, do a destination address
208 * lookup on received packets as usual, don't send a copy 234 * lookup on received packets as usual, don't send a copy
209 * of all transmitted/received frames on this port to the 235 * of all transmitted/received frames on this port to the
210 * CPU, and configure the CPU port number. Also, if this 236 * CPU, and configure the upstream port number.
211 * is the CPU port, enable forwarding of unknown multicast 237 *
212 * addresses. 238 * If this is the upstream port for this switch, enable
239 * forwarding of unknown multicast addresses.
213 */ 240 */
214 REG_WRITE(addr, 0x08, 241 val = 0x0080 | dsa_upstream_port(ds);
215 ((p == ds->cpu_port) ? 0x00c0 : 0x0080) | 242 if (p == dsa_upstream_port(ds))
216 ds->cpu_port); 243 val |= 0x0040;
244 REG_WRITE(addr, 0x08, val);
217 245
218 /* 246 /*
219 * Rate Control: disable ingress rate limiting. 247 * Rate Control: disable ingress rate limiting.
@@ -268,7 +296,7 @@ static int mv88e6131_setup(struct dsa_switch *ds)
268 if (ret < 0) 296 if (ret < 0)
269 return ret; 297 return ret;
270 298
271 for (i = 0; i < 6; i++) { 299 for (i = 0; i < 11; i++) {
272 ret = mv88e6131_setup_port(ds, i); 300 ret = mv88e6131_setup_port(ds, i);
273 if (ret < 0) 301 if (ret < 0)
274 return ret; 302 return ret;
@@ -279,7 +307,7 @@ static int mv88e6131_setup(struct dsa_switch *ds)
279 307
280static int mv88e6131_port_to_phy_addr(int port) 308static int mv88e6131_port_to_phy_addr(int port)
281{ 309{
282 if (port >= 0 && port != 3 && port <= 7) 310 if (port >= 0 && port <= 11)
283 return port; 311 return port;
284 return -1; 312 return -1;
285} 313}
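mv88e6131_setup_port() now builds the Port Control value from flags instead of choosing between two fixed constants. The following sketch recomputes that value for an assumed layout (upstream on port 8, one DSA link on port 9, user ports elsewhere; the layout is made up); the upstream port comes out as 0x0537, the constant the old code used for the CPU port:

#include <stdio.h>

int main(void)
{
	int upstream_port = 8;
	unsigned int dsa_port_mask = 1 << 9;
	int p;

	for (p = 0; p <= 10; p++) {
		unsigned int val = 0x0433;	/* forwarding, 802.1p/IP priority */

		if (p == upstream_port)
			val |= 0x0104;	/* DSA tagging + fwd unknown unicasts */
		if (dsa_port_mask & (1 << p))
			val |= 0x0100;	/* DSA tagging towards the other switch */

		printf("port %2d: reg 0x04 = 0x%04x\n", p, val);
	}
	return 0;
}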
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index a68fd79e9eca..ed131181215d 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * net/dsa/slave.c - Slave device handling 2 * net/dsa/slave.c - Slave device handling
3 * Copyright (c) 2008 Marvell Semiconductor 3 * Copyright (c) 2008-2009 Marvell Semiconductor
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg)
19{ 19{
20 struct dsa_switch *ds = bus->priv; 20 struct dsa_switch *ds = bus->priv;
21 21
22 if (ds->valid_port_mask & (1 << addr)) 22 if (ds->phys_port_mask & (1 << addr))
23 return ds->drv->phy_read(ds, addr, reg); 23 return ds->drv->phy_read(ds, addr, reg);
24 24
25 return 0xffff; 25 return 0xffff;
@@ -29,7 +29,7 @@ static int dsa_slave_phy_write(struct mii_bus *bus, int addr, int reg, u16 val)
29{ 29{
30 struct dsa_switch *ds = bus->priv; 30 struct dsa_switch *ds = bus->priv;
31 31
32 if (ds->valid_port_mask & (1 << addr)) 32 if (ds->phys_port_mask & (1 << addr))
33 return ds->drv->phy_write(ds, addr, reg, val); 33 return ds->drv->phy_write(ds, addr, reg, val);
34 34
35 return 0; 35 return 0;
@@ -43,15 +43,24 @@ void dsa_slave_mii_bus_init(struct dsa_switch *ds)
43 ds->slave_mii_bus->write = dsa_slave_phy_write; 43 ds->slave_mii_bus->write = dsa_slave_phy_write;
44 snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "%s:%.2x", 44 snprintf(ds->slave_mii_bus->id, MII_BUS_ID_SIZE, "%s:%.2x",
45 ds->master_mii_bus->id, ds->pd->sw_addr); 45 ds->master_mii_bus->id, ds->pd->sw_addr);
46 ds->slave_mii_bus->parent = &(ds->master_mii_bus->dev); 46 ds->slave_mii_bus->parent = &ds->master_mii_bus->dev;
47} 47}
48 48
49 49
50/* slave device handling ****************************************************/ 50/* slave device handling ****************************************************/
51static int dsa_slave_init(struct net_device *dev)
52{
53 struct dsa_slave_priv *p = netdev_priv(dev);
54
55 dev->iflink = p->parent->dst->master_netdev->ifindex;
56
57 return 0;
58}
59
51static int dsa_slave_open(struct net_device *dev) 60static int dsa_slave_open(struct net_device *dev)
52{ 61{
53 struct dsa_slave_priv *p = netdev_priv(dev); 62 struct dsa_slave_priv *p = netdev_priv(dev);
54 struct net_device *master = p->parent->master_netdev; 63 struct net_device *master = p->parent->dst->master_netdev;
55 int err; 64 int err;
56 65
57 if (!(master->flags & IFF_UP)) 66 if (!(master->flags & IFF_UP))
@@ -89,7 +98,7 @@ out:
89static int dsa_slave_close(struct net_device *dev) 98static int dsa_slave_close(struct net_device *dev)
90{ 99{
91 struct dsa_slave_priv *p = netdev_priv(dev); 100 struct dsa_slave_priv *p = netdev_priv(dev);
92 struct net_device *master = p->parent->master_netdev; 101 struct net_device *master = p->parent->dst->master_netdev;
93 102
94 dev_mc_unsync(master, dev); 103 dev_mc_unsync(master, dev);
95 dev_unicast_unsync(master, dev); 104 dev_unicast_unsync(master, dev);
@@ -107,7 +116,7 @@ static int dsa_slave_close(struct net_device *dev)
107static void dsa_slave_change_rx_flags(struct net_device *dev, int change) 116static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
108{ 117{
109 struct dsa_slave_priv *p = netdev_priv(dev); 118 struct dsa_slave_priv *p = netdev_priv(dev);
110 struct net_device *master = p->parent->master_netdev; 119 struct net_device *master = p->parent->dst->master_netdev;
111 120
112 if (change & IFF_ALLMULTI) 121 if (change & IFF_ALLMULTI)
113 dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1); 122 dev_set_allmulti(master, dev->flags & IFF_ALLMULTI ? 1 : -1);
@@ -118,7 +127,7 @@ static void dsa_slave_change_rx_flags(struct net_device *dev, int change)
118static void dsa_slave_set_rx_mode(struct net_device *dev) 127static void dsa_slave_set_rx_mode(struct net_device *dev)
119{ 128{
120 struct dsa_slave_priv *p = netdev_priv(dev); 129 struct dsa_slave_priv *p = netdev_priv(dev);
121 struct net_device *master = p->parent->master_netdev; 130 struct net_device *master = p->parent->dst->master_netdev;
122 131
123 dev_mc_sync(master, dev); 132 dev_mc_sync(master, dev);
124 dev_unicast_sync(master, dev); 133 dev_unicast_sync(master, dev);
@@ -127,7 +136,7 @@ static void dsa_slave_set_rx_mode(struct net_device *dev)
127static int dsa_slave_set_mac_address(struct net_device *dev, void *a) 136static int dsa_slave_set_mac_address(struct net_device *dev, void *a)
128{ 137{
129 struct dsa_slave_priv *p = netdev_priv(dev); 138 struct dsa_slave_priv *p = netdev_priv(dev);
130 struct net_device *master = p->parent->master_netdev; 139 struct net_device *master = p->parent->dst->master_netdev;
131 struct sockaddr *addr = a; 140 struct sockaddr *addr = a;
132 int err; 141 int err;
133 142
@@ -288,6 +297,7 @@ static const struct ethtool_ops dsa_slave_ethtool_ops = {
288 297
289#ifdef CONFIG_NET_DSA_TAG_DSA 298#ifdef CONFIG_NET_DSA_TAG_DSA
290static const struct net_device_ops dsa_netdev_ops = { 299static const struct net_device_ops dsa_netdev_ops = {
300 .ndo_init = dsa_slave_init,
291 .ndo_open = dsa_slave_open, 301 .ndo_open = dsa_slave_open,
292 .ndo_stop = dsa_slave_close, 302 .ndo_stop = dsa_slave_close,
293 .ndo_start_xmit = dsa_xmit, 303 .ndo_start_xmit = dsa_xmit,
@@ -300,6 +310,7 @@ static const struct net_device_ops dsa_netdev_ops = {
300#endif 310#endif
301#ifdef CONFIG_NET_DSA_TAG_EDSA 311#ifdef CONFIG_NET_DSA_TAG_EDSA
302static const struct net_device_ops edsa_netdev_ops = { 312static const struct net_device_ops edsa_netdev_ops = {
313 .ndo_init = dsa_slave_init,
303 .ndo_open = dsa_slave_open, 314 .ndo_open = dsa_slave_open,
304 .ndo_stop = dsa_slave_close, 315 .ndo_stop = dsa_slave_close,
305 .ndo_start_xmit = edsa_xmit, 316 .ndo_start_xmit = edsa_xmit,
@@ -312,6 +323,7 @@ static const struct net_device_ops edsa_netdev_ops = {
312#endif 323#endif
313#ifdef CONFIG_NET_DSA_TAG_TRAILER 324#ifdef CONFIG_NET_DSA_TAG_TRAILER
314static const struct net_device_ops trailer_netdev_ops = { 325static const struct net_device_ops trailer_netdev_ops = {
326 .ndo_init = dsa_slave_init,
315 .ndo_open = dsa_slave_open, 327 .ndo_open = dsa_slave_open,
316 .ndo_stop = dsa_slave_close, 328 .ndo_stop = dsa_slave_close,
317 .ndo_start_xmit = trailer_xmit, 329 .ndo_start_xmit = trailer_xmit,
@@ -328,7 +340,7 @@ struct net_device *
328dsa_slave_create(struct dsa_switch *ds, struct device *parent, 340dsa_slave_create(struct dsa_switch *ds, struct device *parent,
329 int port, char *name) 341 int port, char *name)
330{ 342{
331 struct net_device *master = ds->master_netdev; 343 struct net_device *master = ds->dst->master_netdev;
332 struct net_device *slave_dev; 344 struct net_device *slave_dev;
333 struct dsa_slave_priv *p; 345 struct dsa_slave_priv *p;
334 int ret; 346 int ret;
@@ -343,7 +355,7 @@ dsa_slave_create(struct dsa_switch *ds, struct device *parent,
343 memcpy(slave_dev->dev_addr, master->dev_addr, ETH_ALEN); 355 memcpy(slave_dev->dev_addr, master->dev_addr, ETH_ALEN);
344 slave_dev->tx_queue_len = 0; 356 slave_dev->tx_queue_len = 0;
345 357
346 switch (ds->tag_protocol) { 358 switch (ds->dst->tag_protocol) {
347#ifdef CONFIG_NET_DSA_TAG_DSA 359#ifdef CONFIG_NET_DSA_TAG_DSA
348 case htons(ETH_P_DSA): 360 case htons(ETH_P_DSA):
349 slave_dev->netdev_ops = &dsa_netdev_ops; 361 slave_dev->netdev_ops = &dsa_netdev_ops;
diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c
index 63e532a69fdb..8fa25bafe6ca 100644
--- a/net/dsa/tag_dsa.c
+++ b/net/dsa/tag_dsa.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging 2 * net/dsa/tag_dsa.c - (Non-ethertype) DSA tagging
3 * Copyright (c) 2008 Marvell Semiconductor 3 * Copyright (c) 2008-2009 Marvell Semiconductor
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -36,7 +36,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev)
36 * Construct tagged FROM_CPU DSA tag from 802.1q tag. 36 * Construct tagged FROM_CPU DSA tag from 802.1q tag.
37 */ 37 */
38 dsa_header = skb->data + 2 * ETH_ALEN; 38 dsa_header = skb->data + 2 * ETH_ALEN;
39 dsa_header[0] = 0x60; 39 dsa_header[0] = 0x60 | p->parent->index;
40 dsa_header[1] = p->port << 3; 40 dsa_header[1] = p->port << 3;
41 41
42 /* 42 /*
@@ -57,7 +57,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev)
57 * Construct untagged FROM_CPU DSA tag. 57 * Construct untagged FROM_CPU DSA tag.
58 */ 58 */
59 dsa_header = skb->data + 2 * ETH_ALEN; 59 dsa_header = skb->data + 2 * ETH_ALEN;
60 dsa_header[0] = 0x40; 60 dsa_header[0] = 0x40 | p->parent->index;
61 dsa_header[1] = p->port << 3; 61 dsa_header[1] = p->port << 3;
62 dsa_header[2] = 0x00; 62 dsa_header[2] = 0x00;
63 dsa_header[3] = 0x00; 63 dsa_header[3] = 0x00;
@@ -65,7 +65,7 @@ int dsa_xmit(struct sk_buff *skb, struct net_device *dev)
65 65
66 skb->protocol = htons(ETH_P_DSA); 66 skb->protocol = htons(ETH_P_DSA);
67 67
68 skb->dev = p->parent->master_netdev; 68 skb->dev = p->parent->dst->master_netdev;
69 dev_queue_xmit(skb); 69 dev_queue_xmit(skb);
70 70
71 return NETDEV_TX_OK; 71 return NETDEV_TX_OK;
@@ -78,11 +78,13 @@ out_free:
78static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, 78static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
79 struct packet_type *pt, struct net_device *orig_dev) 79 struct packet_type *pt, struct net_device *orig_dev)
80{ 80{
81 struct dsa_switch *ds = dev->dsa_ptr; 81 struct dsa_switch_tree *dst = dev->dsa_ptr;
82 struct dsa_switch *ds;
82 u8 *dsa_header; 83 u8 *dsa_header;
84 int source_device;
83 int source_port; 85 int source_port;
84 86
85 if (unlikely(ds == NULL)) 87 if (unlikely(dst == NULL))
86 goto out_drop; 88 goto out_drop;
87 89
88 skb = skb_unshare(skb, GFP_ATOMIC); 90 skb = skb_unshare(skb, GFP_ATOMIC);
@@ -98,16 +100,24 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev,
98 dsa_header = skb->data - 2; 100 dsa_header = skb->data - 2;
99 101
100 /* 102 /*
101 * Check that frame type is either TO_CPU or FORWARD, and 103 * Check that frame type is either TO_CPU or FORWARD.
102 * that the source device is zero.
103 */ 104 */
104 if ((dsa_header[0] & 0xdf) != 0x00 && (dsa_header[0] & 0xdf) != 0xc0) 105 if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0)
105 goto out_drop; 106 goto out_drop;
106 107
107 /* 108 /*
108 * Check that the source port is a registered DSA port. 109 * Determine source device and port.
109 */ 110 */
111 source_device = dsa_header[0] & 0x1f;
110 source_port = (dsa_header[1] >> 3) & 0x1f; 112 source_port = (dsa_header[1] >> 3) & 0x1f;
113
114 /*
115 * Check that the source device exists and that the source
116 * port is a registered DSA port.
117 */
118 if (source_device >= dst->pd->nr_chips)
119 goto out_drop;
120 ds = dst->ds[source_device];
111 if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) 121 if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
112 goto out_drop; 122 goto out_drop;
113 123
@@ -175,7 +185,7 @@ out:
175 return 0; 185 return 0;
176} 186}
177 187
178static struct packet_type dsa_packet_type = { 188static struct packet_type dsa_packet_type __read_mostly = {
179 .type = cpu_to_be16(ETH_P_DSA), 189 .type = cpu_to_be16(ETH_P_DSA),
180 .func = dsa_rcv, 190 .func = dsa_rcv,
181}; 191};
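dsa_rcv() now extracts a source device as well as a source port from the tag. A standalone parse of a made-up TO_CPU tag (frame type 00 in bits 7:6 of byte 0, device in bits 4:0, port in bits 7:3 of byte 1) shows the same masking:

#include <stdio.h>

int main(void)
{
	/* Example tag: TO_CPU, source device 2, source port 5. */
	unsigned char dsa_header[4] = { 0x02, 5 << 3, 0x00, 0x00 };
	int source_device, source_port;

	/* Only TO_CPU (00) and FORWARD (11) frame types are accepted;
	 * the old code additionally required the device field to be 0. */
	if ((dsa_header[0] & 0xc0) != 0x00 && (dsa_header[0] & 0xc0) != 0xc0) {
		printf("unexpected frame type, would drop\n");
		return 1;
	}

	source_device = dsa_header[0] & 0x1f;
	source_port = (dsa_header[1] >> 3) & 0x1f;
	printf("frame came from switch %d, port %d\n",
	       source_device, source_port);
	return 0;
}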
diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c
index 6197f9a7ef42..815607bd286f 100644
--- a/net/dsa/tag_edsa.c
+++ b/net/dsa/tag_edsa.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * net/dsa/tag_edsa.c - Ethertype DSA tagging 2 * net/dsa/tag_edsa.c - Ethertype DSA tagging
3 * Copyright (c) 2008 Marvell Semiconductor 3 * Copyright (c) 2008-2009 Marvell Semiconductor
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -45,7 +45,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev)
45 edsa_header[1] = ETH_P_EDSA & 0xff; 45 edsa_header[1] = ETH_P_EDSA & 0xff;
46 edsa_header[2] = 0x00; 46 edsa_header[2] = 0x00;
47 edsa_header[3] = 0x00; 47 edsa_header[3] = 0x00;
48 edsa_header[4] = 0x60; 48 edsa_header[4] = 0x60 | p->parent->index;
49 edsa_header[5] = p->port << 3; 49 edsa_header[5] = p->port << 3;
50 50
51 /* 51 /*
@@ -70,7 +70,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev)
70 edsa_header[1] = ETH_P_EDSA & 0xff; 70 edsa_header[1] = ETH_P_EDSA & 0xff;
71 edsa_header[2] = 0x00; 71 edsa_header[2] = 0x00;
72 edsa_header[3] = 0x00; 72 edsa_header[3] = 0x00;
73 edsa_header[4] = 0x40; 73 edsa_header[4] = 0x40 | p->parent->index;
74 edsa_header[5] = p->port << 3; 74 edsa_header[5] = p->port << 3;
75 edsa_header[6] = 0x00; 75 edsa_header[6] = 0x00;
76 edsa_header[7] = 0x00; 76 edsa_header[7] = 0x00;
@@ -78,7 +78,7 @@ int edsa_xmit(struct sk_buff *skb, struct net_device *dev)
78 78
79 skb->protocol = htons(ETH_P_EDSA); 79 skb->protocol = htons(ETH_P_EDSA);
80 80
81 skb->dev = p->parent->master_netdev; 81 skb->dev = p->parent->dst->master_netdev;
82 dev_queue_xmit(skb); 82 dev_queue_xmit(skb);
83 83
84 return NETDEV_TX_OK; 84 return NETDEV_TX_OK;
@@ -91,11 +91,13 @@ out_free:
91static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, 91static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
92 struct packet_type *pt, struct net_device *orig_dev) 92 struct packet_type *pt, struct net_device *orig_dev)
93{ 93{
94 struct dsa_switch *ds = dev->dsa_ptr; 94 struct dsa_switch_tree *dst = dev->dsa_ptr;
95 struct dsa_switch *ds;
95 u8 *edsa_header; 96 u8 *edsa_header;
97 int source_device;
96 int source_port; 98 int source_port;
97 99
98 if (unlikely(ds == NULL)) 100 if (unlikely(dst == NULL))
99 goto out_drop; 101 goto out_drop;
100 102
101 skb = skb_unshare(skb, GFP_ATOMIC); 103 skb = skb_unshare(skb, GFP_ATOMIC);
@@ -111,16 +113,24 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev,
111 edsa_header = skb->data + 2; 113 edsa_header = skb->data + 2;
112 114
113 /* 115 /*
114 * Check that frame type is either TO_CPU or FORWARD, and 116 * Check that frame type is either TO_CPU or FORWARD.
115 * that the source device is zero.
116 */ 117 */
117 if ((edsa_header[0] & 0xdf) != 0x00 && (edsa_header[0] & 0xdf) != 0xc0) 118 if ((edsa_header[0] & 0xc0) != 0x00 && (edsa_header[0] & 0xc0) != 0xc0)
118 goto out_drop; 119 goto out_drop;
119 120
120 /* 121 /*
121 * Check that the source port is a registered DSA port. 122 * Determine source device and port.
122 */ 123 */
124 source_device = edsa_header[0] & 0x1f;
123 source_port = (edsa_header[1] >> 3) & 0x1f; 125 source_port = (edsa_header[1] >> 3) & 0x1f;
126
127 /*
128 * Check that the source device exists and that the source
129 * port is a registered DSA port.
130 */
131 if (source_device >= dst->pd->nr_chips)
132 goto out_drop;
133 ds = dst->ds[source_device];
124 if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL) 134 if (source_port >= DSA_MAX_PORTS || ds->ports[source_port] == NULL)
125 goto out_drop; 135 goto out_drop;
126 136
@@ -194,7 +204,7 @@ out:
194 return 0; 204 return 0;
195} 205}
196 206
197static struct packet_type edsa_packet_type = { 207static struct packet_type edsa_packet_type __read_mostly = {
198 .type = cpu_to_be16(ETH_P_EDSA), 208 .type = cpu_to_be16(ETH_P_EDSA),
199 .func = edsa_rcv, 209 .func = edsa_rcv,
200}; 210};
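edsa_xmit() encodes the same information into an 8-byte Ethertype DSA header, with the EDSA ethertype in front and the switch index now OR-ed into byte 4. A sketch of the untagged FROM_CPU layout, with arbitrary example index/port values (0xDADA is the value the kernel defines for ETH_P_EDSA):

#include <stdio.h>

#define ETH_P_EDSA	0xDADA

int main(void)
{
	unsigned char edsa_header[8];
	int index = 3, port = 2;
	int i;

	edsa_header[0] = ETH_P_EDSA >> 8;
	edsa_header[1] = ETH_P_EDSA & 0xff;
	edsa_header[2] = 0x00;
	edsa_header[3] = 0x00;
	edsa_header[4] = 0x40 | index;	/* FROM_CPU, target switch index */
	edsa_header[5] = port << 3;	/* target port */
	edsa_header[6] = 0x00;
	edsa_header[7] = 0x00;

	for (i = 0; i < 8; i++)
		printf("%02x ", edsa_header[i]);
	printf("\n");
	return 0;
}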
diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c
index d7e7f424ff0c..1c3e30c38b86 100644
--- a/net/dsa/tag_trailer.c
+++ b/net/dsa/tag_trailer.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * net/dsa/tag_trailer.c - Trailer tag format handling 2 * net/dsa/tag_trailer.c - Trailer tag format handling
3 * Copyright (c) 2008 Marvell Semiconductor 3 * Copyright (c) 2008-2009 Marvell Semiconductor
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -59,7 +59,7 @@ int trailer_xmit(struct sk_buff *skb, struct net_device *dev)
59 59
60 nskb->protocol = htons(ETH_P_TRAILER); 60 nskb->protocol = htons(ETH_P_TRAILER);
61 61
62 nskb->dev = p->parent->master_netdev; 62 nskb->dev = p->parent->dst->master_netdev;
63 dev_queue_xmit(nskb); 63 dev_queue_xmit(nskb);
64 64
65 return NETDEV_TX_OK; 65 return NETDEV_TX_OK;
@@ -68,12 +68,14 @@ int trailer_xmit(struct sk_buff *skb, struct net_device *dev)
68static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, 68static int trailer_rcv(struct sk_buff *skb, struct net_device *dev,
69 struct packet_type *pt, struct net_device *orig_dev) 69 struct packet_type *pt, struct net_device *orig_dev)
70{ 70{
71 struct dsa_switch *ds = dev->dsa_ptr; 71 struct dsa_switch_tree *dst = dev->dsa_ptr;
72 struct dsa_switch *ds;
72 u8 *trailer; 73 u8 *trailer;
73 int source_port; 74 int source_port;
74 75
75 if (unlikely(ds == NULL)) 76 if (unlikely(dst == NULL))
76 goto out_drop; 77 goto out_drop;
78 ds = dst->ds[0];
77 79
78 skb = skb_unshare(skb, GFP_ATOMIC); 80 skb = skb_unshare(skb, GFP_ATOMIC);
79 if (skb == NULL) 81 if (skb == NULL)
@@ -111,7 +113,7 @@ out:
111 return 0; 113 return 0;
112} 114}
113 115
114static struct packet_type trailer_packet_type = { 116static struct packet_type trailer_packet_type __read_mostly = {
115 .type = cpu_to_be16(ETH_P_TRAILER), 117 .type = cpu_to_be16(ETH_P_TRAILER),
116 .func = trailer_rcv, 118 .func = trailer_rcv,
117}; 119};
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index 7bf35582f656..6f479fa522c3 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -1102,7 +1102,7 @@ drop:
1102 return NET_RX_DROP; 1102 return NET_RX_DROP;
1103} 1103}
1104 1104
1105static struct packet_type econet_packet_type = { 1105static struct packet_type econet_packet_type __read_mostly = {
1106 .type = cpu_to_be16(ETH_P_ECONET), 1106 .type = cpu_to_be16(ETH_P_ECONET),
1107 .func = econet_rcv, 1107 .func = econet_rcv,
1108}; 1108};
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 691268f3a359..b2cf91e4ccaa 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -35,7 +35,7 @@ config IP_ADVANCED_ROUTER
35 35
36 at boot time after the /proc file system has been mounted. 36 at boot time after the /proc file system has been mounted.
37 37
38 If you turn on IP forwarding, you will also get the rp_filter, which 38 If you turn on IP forwarding, you should consider the rp_filter, which
39 automatically rejects incoming packets if the routing table entry 39 automatically rejects incoming packets if the routing table entry
40 for their source address doesn't match the network interface they're 40 for their source address doesn't match the network interface they're
41 arriving on. This has security advantages because it prevents the 41 arriving on. This has security advantages because it prevents the
@@ -46,12 +46,16 @@ config IP_ADVANCED_ROUTER
46 rp_filter on use: 46 rp_filter on use:
47 47
48 echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter 48 echo 1 > /proc/sys/net/ipv4/conf/<device>/rp_filter
49 or 49 and
50 echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter 50 echo 1 > /proc/sys/net/ipv4/conf/all/rp_filter
51 51
52 Note that some distributions enable it in startup scripts.
53 For details about rp_filter strict and loose mode read
54 <file:Documentation/networking/ip-sysctl.txt>.
55
52 If unsure, say N here. 56 If unsure, say N here.
53 57
54choice 58choice
55 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)" 59 prompt "Choose IP: FIB lookup algorithm (choose FIB_HASH if unsure)"
56 depends on IP_ADVANCED_ROUTER 60 depends on IP_ADVANCED_ROUTER
57 default ASK_IP_FIB_HASH 61 default ASK_IP_FIB_HASH
@@ -59,27 +63,29 @@ choice
59config ASK_IP_FIB_HASH 63config ASK_IP_FIB_HASH
60 bool "FIB_HASH" 64 bool "FIB_HASH"
61 ---help--- 65 ---help---
62 Current FIB is very proven and good enough for most users. 66 Current FIB is very proven and good enough for most users.
63 67
64config IP_FIB_TRIE 68config IP_FIB_TRIE
65 bool "FIB_TRIE" 69 bool "FIB_TRIE"
66 ---help--- 70 ---help---
67 Use new experimental LC-trie as FIB lookup algorithm. 71 Use new experimental LC-trie as FIB lookup algorithm.
68 This improves lookup performance if you have a large 72 This improves lookup performance if you have a large
69 number of routes. 73 number of routes.
70 74
71 LC-trie is a longest matching prefix lookup algorithm which 75 LC-trie is a longest matching prefix lookup algorithm which
72 performs better than FIB_HASH for large routing tables. 76 performs better than FIB_HASH for large routing tables.
73 But, it consumes more memory and is more complex. 77 But, it consumes more memory and is more complex.
74 78
75 LC-trie is described in: 79 LC-trie is described in:
76 80
77 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson 81 IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
78 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999 82 IEEE Journal on Selected Areas in Communications, 17(6):1083-1092,
79 An experimental study of compression methods for dynamic tries 83 June 1999
80 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002. 84
81 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/ 85 An experimental study of compression methods for dynamic tries
82 86 Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
87 http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
88
83endchoice 89endchoice
84 90
85config IP_FIB_HASH 91config IP_FIB_HASH
@@ -191,7 +197,7 @@ config IP_PNP_RARP
191 <file:Documentation/filesystems/nfsroot.txt> for details. 197 <file:Documentation/filesystems/nfsroot.txt> for details.
192 198
193# not yet ready.. 199# not yet ready..
194# bool ' IP: ARP support' CONFIG_IP_PNP_ARP 200# bool ' IP: ARP support' CONFIG_IP_PNP_ARP
195config NET_IPIP 201config NET_IPIP
196 tristate "IP: tunneling" 202 tristate "IP: tunneling"
197 select INET_TUNNEL 203 select INET_TUNNEL
@@ -361,7 +367,7 @@ config INET_IPCOMP
361 ---help--- 367 ---help---
362 Support for IP Payload Compression Protocol (IPComp) (RFC3173), 368 Support for IP Payload Compression Protocol (IPComp) (RFC3173),
363 typically needed for IPsec. 369 typically needed for IPsec.
364 370
365 If unsure, say Y. 371 If unsure, say Y.
366 372
367config INET_XFRM_TUNNEL 373config INET_XFRM_TUNNEL
@@ -415,7 +421,7 @@ config INET_DIAG
415 Support for INET (TCP, DCCP, etc) socket monitoring interface used by 421 Support for INET (TCP, DCCP, etc) socket monitoring interface used by
416 native Linux tools such as ss. ss is included in iproute2, currently 422 native Linux tools such as ss. ss is included in iproute2, currently
417 downloadable at <http://linux-net.osdl.org/index.php/Iproute2>. 423 downloadable at <http://linux-net.osdl.org/index.php/Iproute2>.
418 424
419 If unsure, say Y. 425 If unsure, say Y.
420 426
421config INET_TCP_DIAG 427config INET_TCP_DIAG
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 627be4dc7fb0..d5aaabbb7cb3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1500,7 +1500,7 @@ static int ipv4_proc_init(void);
1500 * IP protocol layer initialiser 1500 * IP protocol layer initialiser
1501 */ 1501 */
1502 1502
1503static struct packet_type ip_packet_type = { 1503static struct packet_type ip_packet_type __read_mostly = {
1504 .type = cpu_to_be16(ETH_P_IP), 1504 .type = cpu_to_be16(ETH_P_IP),
1505 .func = ip_rcv, 1505 .func = ip_rcv,
1506 .gso_send_check = inet_gso_send_check, 1506 .gso_send_check = inet_gso_send_check,
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 3f6b7354699b..f11931c18381 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -801,8 +801,11 @@ static int arp_process(struct sk_buff *skb)
801 * cache. 801 * cache.
802 */ 802 */
803 803
804 /* Special case: IPv4 duplicate address detection packet (RFC2131) */ 804 /*
805 if (sip == 0) { 805 * Special case: IPv4 duplicate address detection packet (RFC2131)
806 * and Gratuitous ARP/ARP Announce. (RFC3927, Section 2.4)
807 */
808 if (sip == 0 || tip == sip) {
806 if (arp->ar_op == htons(ARPOP_REQUEST) && 809 if (arp->ar_op == htons(ARPOP_REQUEST) &&
807 inet_addr_type(net, tip) == RTN_LOCAL && 810 inet_addr_type(net, tip) == RTN_LOCAL &&
808 !arp_ignore(in_dev, sip, tip)) 811 !arp_ignore(in_dev, sip, tip))
@@ -892,7 +895,7 @@ static int arp_process(struct sk_buff *skb)
892out: 895out:
893 if (in_dev) 896 if (in_dev)
894 in_dev_put(in_dev); 897 in_dev_put(in_dev);
895 kfree_skb(skb); 898 consume_skb(skb);
896 return 0; 899 return 0;
897} 900}
898 901
@@ -1225,7 +1228,7 @@ void arp_ifdown(struct net_device *dev)
1225 * Called once on startup. 1228 * Called once on startup.
1226 */ 1229 */
1227 1230
1228static struct packet_type arp_packet_type = { 1231static struct packet_type arp_packet_type __read_mostly = {
1229 .type = cpu_to_be16(ETH_P_ARP), 1232 .type = cpu_to_be16(ETH_P_ARP),
1230 .func = arp_rcv, 1233 .func = arp_rcv,
1231}; 1234};
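The widened test in arp_process() now catches both duplicate-address-detection probes and gratuitous ARP announcements. A tiny classifier showing the two conditions, with arbitrary example addresses in host byte order:

#include <stdio.h>
#include <stdint.h>

static const char *classify(uint32_t sip, uint32_t tip)
{
	if (sip == 0)
		return "DAD probe (RFC 2131)";
	if (tip == sip)
		return "gratuitous ARP / announce (RFC 3927)";
	return "ordinary ARP";
}

int main(void)
{
	printf("%s\n", classify(0x00000000, 0xc0a80001));	/* 0.0.0.0 asking about 192.168.0.1 */
	printf("%s\n", classify(0xc0a80001, 0xc0a80001));	/* 192.168.0.1 announcing itself */
	printf("%s\n", classify(0xc0a80002, 0xc0a80001));
	return 0;
}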
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 6bb2635b5ded..7bc992976d29 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -3,11 +3,16 @@
3 * 3 *
4 * This is an implementation of the CIPSO 2.2 protocol as specified in 4 * This is an implementation of the CIPSO 2.2 protocol as specified in
5 * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in 5 * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in
6 * FIPS-188, copies of both documents can be found in the Documentation 6 * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors
7 * directory. While CIPSO never became a full IETF RFC standard many vendors
8 * have chosen to adopt the protocol and over the years it has become a 7 * have chosen to adopt the protocol and over the years it has become a
9 * de-facto standard for labeled networking. 8 * de-facto standard for labeled networking.
10 * 9 *
10 * The CIPSO draft specification can be found in the kernel's Documentation
11 * directory as well as the following URL:
12 * http://netlabel.sourceforge.net/files/draft-ietf-cipso-ipsecurity-01.txt
13 * The FIPS-188 specification can be found at the following URL:
14 * http://www.itl.nist.gov/fipspubs/fip188.htm
15 *
11 * Author: Paul Moore <paul.moore@hp.com> 16 * Author: Paul Moore <paul.moore@hp.com>
12 * 17 *
13 */ 18 */
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d519a6a66726..126bb911880f 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1216,7 +1216,8 @@ static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh,
1216 kfree_skb(skb); 1216 kfree_skb(skb);
1217 goto errout; 1217 goto errout;
1218 } 1218 }
1219 err = rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL); 1219 rtnl_notify(skb, net, pid, RTNLGRP_IPV4_IFADDR, nlh, GFP_KERNEL);
1220 return;
1220errout: 1221errout:
1221 if (err < 0) 1222 if (err < 0)
1222 rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err); 1223 rtnl_set_sk_err(net, RTNLGRP_IPV4_IFADDR, err);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 741e4fa3e474..cafcc49d0993 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -275,7 +275,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
275 fib_res_put(&res); 275 fib_res_put(&res);
276 if (no_addr) 276 if (no_addr)
277 goto last_resort; 277 goto last_resort;
278 if (rpf) 278 if (rpf == 1)
279 goto e_inval; 279 goto e_inval;
280 fl.oif = dev->ifindex; 280 fl.oif = dev->ifindex;
281 281
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 4817dea3bc73..f831df500907 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -322,8 +322,9 @@ void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
322 kfree_skb(skb); 322 kfree_skb(skb);
323 goto errout; 323 goto errout;
324 } 324 }
325 err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE, 325 rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
326 info->nlh, GFP_KERNEL); 326 info->nlh, GFP_KERNEL);
327 return;
327errout: 328errout:
328 if (err < 0) 329 if (err < 0)
329 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err); 330 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 382800a62b31..3f50807237e0 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -1207,7 +1207,7 @@ static struct pernet_operations __net_initdata icmp_sk_ops = {
1207 1207
1208int __init icmp_init(void) 1208int __init icmp_init(void)
1209{ 1209{
1210 return register_pernet_device(&icmp_sk_ops); 1210 return register_pernet_subsys(&icmp_sk_ops);
1211} 1211}
1212 1212
1213EXPORT_SYMBOL(icmp_err_convert); 1213EXPORT_SYMBOL(icmp_err_convert);
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 6c52e08f786e..eaf3e2c8646a 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -267,6 +267,7 @@ static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
267 267
268struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, 268struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
269 struct inet_frags *f, void *key, unsigned int hash) 269 struct inet_frags *f, void *key, unsigned int hash)
270 __releases(&f->lock)
270{ 271{
271 struct inet_frag_queue *q; 272 struct inet_frag_queue *q;
272 struct hlist_node *n; 273 struct hlist_node *n;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 6659ac000eeb..7985346653bd 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -463,6 +463,7 @@ err:
463static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, 463static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
464 struct net_device *dev) 464 struct net_device *dev)
465{ 465{
466 struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
466 struct iphdr *iph; 467 struct iphdr *iph;
467 struct sk_buff *fp, *head = qp->q.fragments; 468 struct sk_buff *fp, *head = qp->q.fragments;
468 int len; 469 int len;
@@ -548,7 +549,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
548 iph = ip_hdr(head); 549 iph = ip_hdr(head);
549 iph->frag_off = 0; 550 iph->frag_off = 0;
550 iph->tot_len = htons(len); 551 iph->tot_len = htons(len);
551 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_REASMOKS); 552 IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS);
552 qp->q.fragments = NULL; 553 qp->q.fragments = NULL;
553 return 0; 554 return 0;
554 555
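ip_frag_reasm() now derives its struct net with container_of() from the embedded netns_frags rather than going through the device. A userspace imitation of container_of() over trimmed-down stand-in structs (not the real kernel definitions) shows the pointer arithmetic involved:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct netns_frags_stub {
	int nqueues;
};

struct net_stub {
	int id;
	struct netns_frags_stub frags;
};

int main(void)
{
	struct net_stub net = { .id = 42, .frags = { .nqueues = 0 } };
	struct netns_frags_stub *nf = &net.frags;

	/* Walk back from the embedded member to the enclosing struct. */
	struct net_stub *owner = container_of(nf, struct net_stub, frags);

	printf("recovered net id: %d\n", owner->id);
	return 0;
}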
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 07a188afb3ac..e62510d5ea5a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -491,7 +491,7 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
491 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) 491 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
492 goto out; 492 goto out;
493 493
494 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) 494 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
495 t->err_count++; 495 t->err_count++;
496 else 496 else
497 t->err_count = 1; 497 t->err_count = 1;
@@ -803,7 +803,8 @@ static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
803#endif 803#endif
804 804
805 if (tunnel->err_count > 0) { 805 if (tunnel->err_count > 0) {
806 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { 806 if (time_before(jiffies,
807 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
807 tunnel->err_count--; 808 tunnel->err_count--;
808 809
809 dst_link_failure(skb); 810 dst_link_failure(skb);
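The switch to time_before() above keeps the tunnel error-rate window correct across a jiffies wrap. Below is a userspace imitation of the macro, with small made-up values placed just before the wrap point, showing why deadlines derived from jiffies are compared through a signed difference rather than a plain '<':

#include <stdio.h>

#define time_after(a, b)	((long)((b) - (a)) < 0)
#define time_before(a, b)	time_after(b, a)

int main(void)
{
	unsigned long timeout = 100;
	unsigned long err_time = (unsigned long)-10;	/* recorded just before wrap */
	unsigned long now = (unsigned long)-6;		/* 4 ticks later, not yet wrapped */
	unsigned long deadline = err_time + timeout;	/* wraps to 90 */

	/* Plain comparison: 'now' is huge while 'deadline' wrapped to a
	 * small value, so the window looks expired after only 4 ticks. */
	printf("plain '<':     %s\n",
	       now < deadline ? "within window" : "expired");

	/* Signed-difference form used by time_before(): correct. */
	printf("time_before(): %s\n",
	       time_before(now, deadline) ? "within window" : "expired");
	return 0;
}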
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 5079dfbc6f38..9054139795af 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -327,7 +327,7 @@ static int ipip_err(struct sk_buff *skb, u32 info)
327 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) 327 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
328 goto out; 328 goto out;
329 329
330 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) 330 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
331 t->err_count++; 331 t->err_count++;
332 else 332 else
333 t->err_count = 1; 333 t->err_count = 1;
@@ -466,7 +466,8 @@ static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
466 } 466 }
467 467
468 if (tunnel->err_count > 0) { 468 if (tunnel->err_count > 0) {
469 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { 469 if (time_before(jiffies,
470 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
470 tunnel->err_count--; 471 tunnel->err_count--;
471 dst_link_failure(skb); 472 dst_link_failure(skb);
472 } else 473 } else
@@ -750,7 +751,7 @@ static struct xfrm_tunnel ipip_handler = {
750 .priority = 1, 751 .priority = 1,
751}; 752};
752 753
753static char banner[] __initdata = 754static const char banner[] __initconst =
754 KERN_INFO "IPv4 over IPv4 tunneling driver\n"; 755 KERN_INFO "IPv4 over IPv4 tunneling driver\n";
755 756
756static void ipip_destroy_tunnels(struct ipip_net *ipn) 757static void ipip_destroy_tunnels(struct ipip_net *ipn)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 90b2f3c192ff..2451aeb5ac23 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -661,6 +661,47 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
661 return NULL; 661 return NULL;
662} 662}
663 663
664static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now,
665 int large_allowed)
666{
667 struct tcp_sock *tp = tcp_sk(sk);
668 u32 xmit_size_goal, old_size_goal;
669
670 xmit_size_goal = mss_now;
671
672 if (large_allowed && sk_can_gso(sk)) {
673 xmit_size_goal = ((sk->sk_gso_max_size - 1) -
674 inet_csk(sk)->icsk_af_ops->net_header_len -
675 inet_csk(sk)->icsk_ext_hdr_len -
676 tp->tcp_header_len);
677
678 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
679
680 /* We try hard to avoid divides here */
681 old_size_goal = tp->xmit_size_goal_segs * mss_now;
682
683 if (likely(old_size_goal <= xmit_size_goal &&
684 old_size_goal + mss_now > xmit_size_goal)) {
685 xmit_size_goal = old_size_goal;
686 } else {
687 tp->xmit_size_goal_segs = xmit_size_goal / mss_now;
688 xmit_size_goal = tp->xmit_size_goal_segs * mss_now;
689 }
690 }
691
692 return max(xmit_size_goal, mss_now);
693}
694
695static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
696{
697 int mss_now;
698
699 mss_now = tcp_current_mss(sk);
700 *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB));
701
702 return mss_now;
703}
704
664static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, 705static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset,
665 size_t psize, int flags) 706 size_t psize, int flags)
666{ 707{
@@ -677,13 +718,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
677 718
678 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 719 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
679 720
680 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 721 mss_now = tcp_send_mss(sk, &size_goal, flags);
681 size_goal = tp->xmit_size_goal;
682 copied = 0; 722 copied = 0;
683 723
684 err = -EPIPE; 724 err = -EPIPE;
685 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 725 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
686 goto do_error; 726 goto out_err;
687 727
688 while (psize > 0) { 728 while (psize > 0) {
689 struct sk_buff *skb = tcp_write_queue_tail(sk); 729 struct sk_buff *skb = tcp_write_queue_tail(sk);
@@ -761,8 +801,7 @@ wait_for_memory:
761 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 801 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
762 goto do_error; 802 goto do_error;
763 803
764 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 804 mss_now = tcp_send_mss(sk, &size_goal, flags);
765 size_goal = tp->xmit_size_goal;
766 } 805 }
767 806
768out: 807out:
@@ -844,8 +883,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
844 /* This should be in poll */ 883 /* This should be in poll */
845 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 884 clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
846 885
847 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 886 mss_now = tcp_send_mss(sk, &size_goal, flags);
848 size_goal = tp->xmit_size_goal;
849 887
850 /* Ok commence sending. */ 888 /* Ok commence sending. */
851 iovlen = msg->msg_iovlen; 889 iovlen = msg->msg_iovlen;
@@ -854,7 +892,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
854 892
855 err = -EPIPE; 893 err = -EPIPE;
856 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 894 if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
857 goto do_error; 895 goto out_err;
858 896
859 while (--iovlen >= 0) { 897 while (--iovlen >= 0) {
860 int seglen = iov->iov_len; 898 int seglen = iov->iov_len;
@@ -1007,8 +1045,7 @@ wait_for_memory:
1007 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 1045 if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
1008 goto do_error; 1046 goto do_error;
1009 1047
1010 mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 1048 mss_now = tcp_send_mss(sk, &size_goal, flags);
1011 size_goal = tp->xmit_size_goal;
1012 } 1049 }
1013 } 1050 }
1014 1051
@@ -1045,8 +1082,7 @@ out_err:
1045 */ 1082 */
1046 1083
1047static int tcp_recv_urg(struct sock *sk, long timeo, 1084static int tcp_recv_urg(struct sock *sk, long timeo,
1048 struct msghdr *msg, int len, int flags, 1085 struct msghdr *msg, int len, int flags)
1049 int *addr_len)
1050{ 1086{
1051 struct tcp_sock *tp = tcp_sk(sk); 1087 struct tcp_sock *tp = tcp_sk(sk);
1052 1088
@@ -1661,7 +1697,7 @@ out:
1661 return err; 1697 return err;
1662 1698
1663recv_urg: 1699recv_urg:
1664 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); 1700 err = tcp_recv_urg(sk, timeo, msg, len, flags);
1665 goto out; 1701 goto out;
1666} 1702}
1667 1703
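
[note] The new tcp_xmit_size_goal()/tcp_send_mss() pair above moves the TSO size-goal computation out of tcp_current_mss() and caches it in whole segments so that the common path avoids a division. A compilable userspace sketch of the caching idea (struct and function names are illustrative, not the kernel's):

    /* Sketch: reuse a cached segment count while the cached goal still lies
     * within one MSS of the freshly computed bound; only when it drifts
     * outside that window is a division performed to re-derive it. */
    #include <stdio.h>

    struct goal_cache {
            unsigned int size_goal_segs;    /* cached goal, in segments */
    };

    static unsigned int size_goal(struct goal_cache *c, unsigned int mss,
                                  unsigned int bound)
    {
            unsigned int old = c->size_goal_segs * mss;

            if (old <= bound && old + mss > bound)
                    return old;                     /* cache hit, no divide */

            c->size_goal_segs = bound / mss;        /* slow path: one divide */
            return c->size_goal_segs * mss;
    }

    int main(void)
    {
            struct goal_cache c = { 0 };
            printf("%u\n", size_goal(&c, 1448, 64000));   /* recompute: 63712 */
            printf("%u\n", size_goal(&c, 1448, 64100));   /* cache hit: 63712 */
            return 0;
    }
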
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c
index 7eb7636db0d0..3b53fd1af23f 100644
--- a/net/ipv4/tcp_bic.c
+++ b/net/ipv4/tcp_bic.c
@@ -149,16 +149,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
149 tcp_slow_start(tp); 149 tcp_slow_start(tp);
150 else { 150 else {
151 bictcp_update(ca, tp->snd_cwnd); 151 bictcp_update(ca, tp->snd_cwnd);
152 152 tcp_cong_avoid_ai(tp, ca->cnt);
153 /* In dangerous area, increase slowly.
154 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
155 */
156 if (tp->snd_cwnd_cnt >= ca->cnt) {
157 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
158 tp->snd_cwnd++;
159 tp->snd_cwnd_cnt = 0;
160 } else
161 tp->snd_cwnd_cnt++;
162 } 153 }
163 154
164} 155}
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index 4ec5b4e97c4e..e92beb9e55e0 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -336,6 +336,19 @@ void tcp_slow_start(struct tcp_sock *tp)
336} 336}
337EXPORT_SYMBOL_GPL(tcp_slow_start); 337EXPORT_SYMBOL_GPL(tcp_slow_start);
338 338
339/* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */
340void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w)
341{
342 if (tp->snd_cwnd_cnt >= w) {
343 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
344 tp->snd_cwnd++;
345 tp->snd_cwnd_cnt = 0;
346 } else {
347 tp->snd_cwnd_cnt++;
348 }
349}
350EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai);
351
339/* 352/*
340 * TCP Reno congestion control 353 * TCP Reno congestion control
341 * This is special case used for fallback as well. 354 * This is special case used for fallback as well.
@@ -365,13 +378,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
365 tp->snd_cwnd++; 378 tp->snd_cwnd++;
366 } 379 }
367 } else { 380 } else {
368 /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ 381 tcp_cong_avoid_ai(tp, tp->snd_cwnd);
369 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
370 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
371 tp->snd_cwnd++;
372 tp->snd_cwnd_cnt = 0;
373 } else
374 tp->snd_cwnd_cnt++;
375 } 382 }
376} 383}
377EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); 384EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid);
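
[note] tcp_cong_avoid_ai() above becomes the shared additive-increase step that the tcp_bic, tcp_cubic, tcp_scalable, tcp_veno and tcp_yeah hunks below call instead of open-coding the same counter. A standalone sketch of the logic, with illustrative field names rather than the kernel's struct tcp_sock:

    /* Sketch: cwnd grows by one segment for every w ACKs, approximating
     * cwnd += 1/w per ACK without fractional arithmetic. */
    #include <stdio.h>

    struct cc_state {
            unsigned int cwnd;
            unsigned int cwnd_cnt;
            unsigned int cwnd_clamp;
    };

    static void cong_avoid_ai(struct cc_state *s, unsigned int w)
    {
            if (s->cwnd_cnt >= w) {
                    if (s->cwnd < s->cwnd_clamp)
                            s->cwnd++;
                    s->cwnd_cnt = 0;
            } else {
                    s->cwnd_cnt++;
            }
    }

    int main(void)
    {
            struct cc_state s = { .cwnd = 10, .cwnd_cnt = 0, .cwnd_clamp = 100 };

            /* after roughly cwnd ACKs the window grows by one segment */
            for (int ack = 0; ack < 11; ack++)
                    cong_avoid_ai(&s, s.cwnd);
            printf("cwnd = %u\n", s.cwnd);  /* 11 */
            return 0;
    }

With w equal to the current window, the effect is Reno-style growth of one segment per round trip; the individual algorithms pass their own w.
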
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index ee467ec40c4f..71d5f2f29fa6 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -294,16 +294,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
294 tcp_slow_start(tp); 294 tcp_slow_start(tp);
295 } else { 295 } else {
296 bictcp_update(ca, tp->snd_cwnd); 296 bictcp_update(ca, tp->snd_cwnd);
297 297 tcp_cong_avoid_ai(tp, ca->cnt);
298 /* In dangerous area, increase slowly.
299 * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd
300 */
301 if (tp->snd_cwnd_cnt >= ca->cnt) {
302 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
303 tp->snd_cwnd++;
304 tp->snd_cwnd_cnt = 0;
305 } else
306 tp->snd_cwnd_cnt++;
307 } 298 }
308 299
309} 300}
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c
index 937549b8a921..26d5c7fc7de5 100644
--- a/net/ipv4/tcp_htcp.c
+++ b/net/ipv4/tcp_htcp.c
@@ -115,8 +115,7 @@ static void measure_achieved_throughput(struct sock *sk, u32 pkts_acked, s32 rtt
115 return; 115 return;
116 116
117 /* achieved throughput calculations */ 117 /* achieved throughput calculations */
118 if (icsk->icsk_ca_state != TCP_CA_Open && 118 if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_Disorder))) {
119 icsk->icsk_ca_state != TCP_CA_Disorder) {
120 ca->packetcount = 0; 119 ca->packetcount = 0;
121 ca->lasttime = now; 120 ca->lasttime = now;
122 return; 121 return;
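
[note] The htcp hunk above replaces two equality tests against the congestion-avoidance state with a single mask test, a common kernel idiom for checking membership of a small enum value in a set. A minimal sketch of the idiom (enum and mask names are illustrative):

    /* Sketch: shift 1 into the value's bit position and AND against a mask
     * of the allowed states. */
    #include <stdio.h>

    enum ca_state { CA_OPEN, CA_DISORDER, CA_CWR, CA_RECOVERY, CA_LOSS };

    #define CAF_OPEN        (1 << CA_OPEN)
    #define CAF_DISORDER    (1 << CA_DISORDER)

    static int in_open_or_disorder(enum ca_state st)
    {
            return !!((1 << st) & (CAF_OPEN | CAF_DISORDER));
    }

    int main(void)
    {
            printf("%d %d\n", in_open_or_disorder(CA_DISORDER),
                   in_open_or_disorder(CA_LOSS));       /* prints: 1 0 */
            return 0;
    }
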
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a6961d75c7ea..2bc8e27a163d 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -64,6 +64,7 @@
64#include <linux/mm.h> 64#include <linux/mm.h>
65#include <linux/module.h> 65#include <linux/module.h>
66#include <linux/sysctl.h> 66#include <linux/sysctl.h>
67#include <linux/kernel.h>
67#include <net/dst.h> 68#include <net/dst.h>
68#include <net/tcp.h> 69#include <net/tcp.h>
69#include <net/inet_common.h> 70#include <net/inet_common.h>
@@ -1178,10 +1179,18 @@ static void tcp_mark_lost_retrans(struct sock *sk)
1178 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)) 1179 if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))
1179 continue; 1180 continue;
1180 1181
1181 if (after(received_upto, ack_seq) && 1182 /* TODO: We would like to get rid of tcp_is_fack(tp) only
1182 (tcp_is_fack(tp) || 1183 * constraint here (see above) but figuring out that at
1183 !before(received_upto, 1184 * least tp->reordering SACK blocks reside between ack_seq
1184 ack_seq + tp->reordering * tp->mss_cache))) { 1185 * and received_upto is not easy task to do cheaply with
1186 * the available datastructures.
1187 *
1188 * Whether FACK should check here for tp->reordering segs
1189 * in-between one could argue for either way (it would be
1190 * rather simple to implement as we could count fack_count
1191 * during the walk and do tp->fackets_out - fack_count).
1192 */
1193 if (after(received_upto, ack_seq)) {
1185 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; 1194 TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
1186 tp->retrans_out -= tcp_skb_pcount(skb); 1195 tp->retrans_out -= tcp_skb_pcount(skb);
1187 1196
@@ -1374,7 +1383,8 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk,
1374 1383
1375static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, 1384static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1376 struct tcp_sacktag_state *state, 1385 struct tcp_sacktag_state *state,
1377 unsigned int pcount, int shifted, int mss) 1386 unsigned int pcount, int shifted, int mss,
1387 int dup_sack)
1378{ 1388{
1379 struct tcp_sock *tp = tcp_sk(sk); 1389 struct tcp_sock *tp = tcp_sk(sk);
1380 struct sk_buff *prev = tcp_write_queue_prev(sk, skb); 1390 struct sk_buff *prev = tcp_write_queue_prev(sk, skb);
@@ -1410,7 +1420,7 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb,
1410 } 1420 }
1411 1421
1412 /* We discard results */ 1422 /* We discard results */
1413 tcp_sacktag_one(skb, sk, state, 0, pcount); 1423 tcp_sacktag_one(skb, sk, state, dup_sack, pcount);
1414 1424
1415 /* Difference in this won't matter, both ACKed by the same cumul. ACK */ 1425 /* Difference in this won't matter, both ACKed by the same cumul. ACK */
1416 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); 1426 TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS);
@@ -1561,7 +1571,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1561 1571
1562 if (!skb_shift(prev, skb, len)) 1572 if (!skb_shift(prev, skb, len))
1563 goto fallback; 1573 goto fallback;
1564 if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss)) 1574 if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack))
1565 goto out; 1575 goto out;
1566 1576
1567 /* Hole filled allows collapsing with the next as well, this is very 1577 /* Hole filled allows collapsing with the next as well, this is very
@@ -1580,7 +1590,7 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb,
1580 len = skb->len; 1590 len = skb->len;
1581 if (skb_shift(prev, skb, len)) { 1591 if (skb_shift(prev, skb, len)) {
1582 pcount += tcp_skb_pcount(skb); 1592 pcount += tcp_skb_pcount(skb);
1583 tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss); 1593 tcp_shifted_skb(sk, skb, state, tcp_skb_pcount(skb), len, mss, 0);
1584 } 1594 }
1585 1595
1586out: 1596out:
@@ -1793,11 +1803,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
1793 for (i = used_sacks - 1; i > 0; i--) { 1803 for (i = used_sacks - 1; i > 0; i--) {
1794 for (j = 0; j < i; j++) { 1804 for (j = 0; j < i; j++) {
1795 if (after(sp[j].start_seq, sp[j + 1].start_seq)) { 1805 if (after(sp[j].start_seq, sp[j + 1].start_seq)) {
1796 struct tcp_sack_block tmp; 1806 swap(sp[j], sp[j + 1]);
1797
1798 tmp = sp[j];
1799 sp[j] = sp[j + 1];
1800 sp[j + 1] = tmp;
1801 1807
1802 /* Track where the first SACK block goes to */ 1808 /* Track where the first SACK block goes to */
1803 if (j == first_sack_index) 1809 if (j == first_sack_index)
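
[note] The sort loop above now uses the generic swap() helper from <linux/kernel.h> (the include added at the top of this file) instead of an open-coded three-statement exchange; the same helper also replaces tcp_sack_swap() further down. A sketch of roughly how such a type-generic swap can be written, assuming a GCC-style __typeof__ extension:

    #include <stdio.h>

    /* illustrative re-creation, not the kernel header */
    #define swap(a, b) \
            do { __typeof__(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)

    struct sack_block { unsigned int start_seq, end_seq; };

    int main(void)
    {
            struct sack_block sp[2] = { { 300, 400 }, { 100, 200 } };

            if (sp[0].start_seq > sp[1].start_seq)  /* one ordering pass */
                    swap(sp[0], sp[1]);

            printf("%u %u\n", sp[0].start_seq, sp[1].start_seq);  /* 100 300 */
            return 0;
    }
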
@@ -2452,6 +2458,44 @@ static int tcp_time_to_recover(struct sock *sk)
2452 return 0; 2458 return 0;
2453} 2459}
2454 2460
2461/* New heuristics: it is possible only after we switched to restart timer
2462 * each time when something is ACKed. Hence, we can detect timed out packets
2463 * during fast retransmit without falling to slow start.
2464 *
2465 * Usefulness of this as is very questionable, since we should know which of
2466 * the segments is the next to timeout which is relatively expensive to find
2467 * in general case unless we add some data structure just for that. The
2468 * current approach certainly won't find the right one too often and when it
2469 * finally does find _something_ it usually marks large part of the window
2470 * right away (because a retransmission with a larger timestamp blocks the
2471 * loop from advancing). -ij
2472 */
2473static void tcp_timeout_skbs(struct sock *sk)
2474{
2475 struct tcp_sock *tp = tcp_sk(sk);
2476 struct sk_buff *skb;
2477
2478 if (!tcp_is_fack(tp) || !tcp_head_timedout(sk))
2479 return;
2480
2481 skb = tp->scoreboard_skb_hint;
2482 if (tp->scoreboard_skb_hint == NULL)
2483 skb = tcp_write_queue_head(sk);
2484
2485 tcp_for_write_queue_from(skb, sk) {
2486 if (skb == tcp_send_head(sk))
2487 break;
2488 if (!tcp_skb_timedout(sk, skb))
2489 break;
2490
2491 tcp_skb_mark_lost(tp, skb);
2492 }
2493
2494 tp->scoreboard_skb_hint = skb;
2495
2496 tcp_verify_left_out(tp);
2497}
2498
2455/* Mark head of queue up as lost. With RFC3517 SACK, the packets is 2499/* Mark head of queue up as lost. With RFC3517 SACK, the packets is
2456 * is against sacked "cnt", otherwise it's against facked "cnt" 2500 * is against sacked "cnt", otherwise it's against facked "cnt"
2457 */ 2501 */
@@ -2524,30 +2568,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
2524 tcp_mark_head_lost(sk, sacked_upto); 2568 tcp_mark_head_lost(sk, sacked_upto);
2525 } 2569 }
2526 2570
2527 /* New heuristics: it is possible only after we switched 2571 tcp_timeout_skbs(sk);
2528 * to restart timer each time when something is ACKed.
2529 * Hence, we can detect timed out packets during fast
2530 * retransmit without falling to slow start.
2531 */
2532 if (tcp_is_fack(tp) && tcp_head_timedout(sk)) {
2533 struct sk_buff *skb;
2534
2535 skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
2536 : tcp_write_queue_head(sk);
2537
2538 tcp_for_write_queue_from(skb, sk) {
2539 if (skb == tcp_send_head(sk))
2540 break;
2541 if (!tcp_skb_timedout(sk, skb))
2542 break;
2543
2544 tcp_skb_mark_lost(tp, skb);
2545 }
2546
2547 tp->scoreboard_skb_hint = skb;
2548
2549 tcp_verify_left_out(tp);
2550 }
2551} 2572}
2552 2573
2553/* CWND moderation, preventing bursts due to too big ACKs 2574/* CWND moderation, preventing bursts due to too big ACKs
@@ -2812,7 +2833,7 @@ static void tcp_mtup_probe_failed(struct sock *sk)
2812 icsk->icsk_mtup.probe_size = 0; 2833 icsk->icsk_mtup.probe_size = 0;
2813} 2834}
2814 2835
2815static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) 2836static void tcp_mtup_probe_success(struct sock *sk)
2816{ 2837{
2817 struct tcp_sock *tp = tcp_sk(sk); 2838 struct tcp_sock *tp = tcp_sk(sk);
2818 struct inet_connection_sock *icsk = inet_csk(sk); 2839 struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2840,7 +2861,7 @@ void tcp_simple_retransmit(struct sock *sk)
2840 const struct inet_connection_sock *icsk = inet_csk(sk); 2861 const struct inet_connection_sock *icsk = inet_csk(sk);
2841 struct tcp_sock *tp = tcp_sk(sk); 2862 struct tcp_sock *tp = tcp_sk(sk);
2842 struct sk_buff *skb; 2863 struct sk_buff *skb;
2843 unsigned int mss = tcp_current_mss(sk, 0); 2864 unsigned int mss = tcp_current_mss(sk);
2844 u32 prior_lost = tp->lost_out; 2865 u32 prior_lost = tp->lost_out;
2845 2866
2846 tcp_for_write_queue(skb, sk) { 2867 tcp_for_write_queue(skb, sk) {
@@ -3177,7 +3198,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3177 3198
3178 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { 3199 while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
3179 struct tcp_skb_cb *scb = TCP_SKB_CB(skb); 3200 struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
3180 u32 end_seq;
3181 u32 acked_pcount; 3201 u32 acked_pcount;
3182 u8 sacked = scb->sacked; 3202 u8 sacked = scb->sacked;
3183 3203
@@ -3192,16 +3212,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3192 break; 3212 break;
3193 3213
3194 fully_acked = 0; 3214 fully_acked = 0;
3195 end_seq = tp->snd_una;
3196 } else { 3215 } else {
3197 acked_pcount = tcp_skb_pcount(skb); 3216 acked_pcount = tcp_skb_pcount(skb);
3198 end_seq = scb->end_seq;
3199 }
3200
3201 /* MTU probing checks */
3202 if (fully_acked && icsk->icsk_mtup.probe_size &&
3203 !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) {
3204 tcp_mtup_probe_success(sk, skb);
3205 } 3217 }
3206 3218
3207 if (sacked & TCPCB_RETRANS) { 3219 if (sacked & TCPCB_RETRANS) {
@@ -3266,24 +3278,26 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
3266 const struct tcp_congestion_ops *ca_ops 3278 const struct tcp_congestion_ops *ca_ops
3267 = inet_csk(sk)->icsk_ca_ops; 3279 = inet_csk(sk)->icsk_ca_ops;
3268 3280
3281 if (unlikely(icsk->icsk_mtup.probe_size &&
3282 !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) {
3283 tcp_mtup_probe_success(sk);
3284 }
3285
3269 tcp_ack_update_rtt(sk, flag, seq_rtt); 3286 tcp_ack_update_rtt(sk, flag, seq_rtt);
3270 tcp_rearm_rto(sk); 3287 tcp_rearm_rto(sk);
3271 3288
3272 if (tcp_is_reno(tp)) { 3289 if (tcp_is_reno(tp)) {
3273 tcp_remove_reno_sacks(sk, pkts_acked); 3290 tcp_remove_reno_sacks(sk, pkts_acked);
3274 } else { 3291 } else {
3292 int delta;
3293
3275 /* Non-retransmitted hole got filled? That's reordering */ 3294 /* Non-retransmitted hole got filled? That's reordering */
3276 if (reord < prior_fackets) 3295 if (reord < prior_fackets)
3277 tcp_update_reordering(sk, tp->fackets_out - reord, 0); 3296 tcp_update_reordering(sk, tp->fackets_out - reord, 0);
3278 3297
3279 /* No need to care for underflows here because 3298 delta = tcp_is_fack(tp) ? pkts_acked :
3280 * the lost_skb_hint gets NULLed if we're past it 3299 prior_sacked - tp->sacked_out;
3281 * (or something non-trivial happened) 3300 tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
3282 */
3283 if (tcp_is_fack(tp))
3284 tp->lost_cnt_hint -= pkts_acked;
3285 else
3286 tp->lost_cnt_hint -= prior_sacked - tp->sacked_out;
3287 } 3301 }
3288 3302
3289 tp->fackets_out -= min(pkts_acked, tp->fackets_out); 3303 tp->fackets_out -= min(pkts_acked, tp->fackets_out);
@@ -3395,7 +3409,7 @@ static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack,
3395 3409
3396 if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { 3410 if (tcp_may_update_window(tp, ack, ack_seq, nwin)) {
3397 flag |= FLAG_WIN_UPDATE; 3411 flag |= FLAG_WIN_UPDATE;
3398 tcp_update_wl(tp, ack, ack_seq); 3412 tcp_update_wl(tp, ack_seq);
3399 3413
3400 if (tp->snd_wnd != nwin) { 3414 if (tp->snd_wnd != nwin) {
3401 tp->snd_wnd = nwin; 3415 tp->snd_wnd = nwin;
@@ -3571,15 +3585,18 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3571 int prior_packets; 3585 int prior_packets;
3572 int frto_cwnd = 0; 3586 int frto_cwnd = 0;
3573 3587
3574 /* If the ack is newer than sent or older than previous acks 3588 /* If the ack is older than previous acks
3575 * then we can probably ignore it. 3589 * then we can probably ignore it.
3576 */ 3590 */
3577 if (after(ack, tp->snd_nxt))
3578 goto uninteresting_ack;
3579
3580 if (before(ack, prior_snd_una)) 3591 if (before(ack, prior_snd_una))
3581 goto old_ack; 3592 goto old_ack;
3582 3593
3594 /* If the ack includes data we haven't sent yet, discard
3595 * this segment (RFC793 Section 3.9).
3596 */
3597 if (after(ack, tp->snd_nxt))
3598 goto invalid_ack;
3599
3583 if (after(ack, prior_snd_una)) 3600 if (after(ack, prior_snd_una))
3584 flag |= FLAG_SND_UNA_ADVANCED; 3601 flag |= FLAG_SND_UNA_ADVANCED;
3585 3602
@@ -3600,7 +3617,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
3600 * No more checks are required. 3617 * No more checks are required.
3601 * Note, we use the fact that SND.UNA>=SND.WL2. 3618 * Note, we use the fact that SND.UNA>=SND.WL2.
3602 */ 3619 */
3603 tcp_update_wl(tp, ack, ack_seq); 3620 tcp_update_wl(tp, ack_seq);
3604 tp->snd_una = ack; 3621 tp->snd_una = ack;
3605 flag |= FLAG_WIN_UPDATE; 3622 flag |= FLAG_WIN_UPDATE;
3606 3623
@@ -3669,6 +3686,10 @@ no_queue:
3669 tcp_ack_probe(sk); 3686 tcp_ack_probe(sk);
3670 return 1; 3687 return 1;
3671 3688
3689invalid_ack:
3690 SOCK_DEBUG(sk, "Ack %u after %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3691 return -1;
3692
3672old_ack: 3693old_ack:
3673 if (TCP_SKB_CB(skb)->sacked) { 3694 if (TCP_SKB_CB(skb)->sacked) {
3674 tcp_sacktag_write_queue(sk, skb, prior_snd_una); 3695 tcp_sacktag_write_queue(sk, skb, prior_snd_una);
@@ -3676,8 +3697,7 @@ old_ack:
3676 tcp_try_keep_open(sk); 3697 tcp_try_keep_open(sk);
3677 } 3698 }
3678 3699
3679uninteresting_ack: 3700 SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3680 SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
3681 return 0; 3701 return 0;
3682} 3702}
3683 3703
@@ -3865,8 +3885,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
3865 * Not only, also it occurs for expired timestamps. 3885 * Not only, also it occurs for expired timestamps.
3866 */ 3886 */
3867 3887
3868 if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || 3888 if (tcp_paws_check(&tp->rx_opt, 0))
3869 get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
3870 tcp_store_ts_recent(tp); 3889 tcp_store_ts_recent(tp);
3871 } 3890 }
3872} 3891}
@@ -3918,9 +3937,9 @@ static inline int tcp_paws_discard(const struct sock *sk,
3918 const struct sk_buff *skb) 3937 const struct sk_buff *skb)
3919{ 3938{
3920 const struct tcp_sock *tp = tcp_sk(sk); 3939 const struct tcp_sock *tp = tcp_sk(sk);
3921 return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && 3940
3922 get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && 3941 return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
3923 !tcp_disordered_ack(sk, skb)); 3942 !tcp_disordered_ack(sk, skb);
3924} 3943}
3925 3944
3926/* Check segment sequence number for validity. 3945/* Check segment sequence number for validity.
@@ -4078,7 +4097,6 @@ static void tcp_dsack_set(struct sock *sk, u32 seq, u32 end_seq)
4078 tp->rx_opt.dsack = 1; 4097 tp->rx_opt.dsack = 1;
4079 tp->duplicate_sack[0].start_seq = seq; 4098 tp->duplicate_sack[0].start_seq = seq;
4080 tp->duplicate_sack[0].end_seq = end_seq; 4099 tp->duplicate_sack[0].end_seq = end_seq;
4081 tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + 1;
4082 } 4100 }
4083} 4101}
4084 4102
@@ -4133,8 +4151,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
4133 * Decrease num_sacks. 4151 * Decrease num_sacks.
4134 */ 4152 */
4135 tp->rx_opt.num_sacks--; 4153 tp->rx_opt.num_sacks--;
4136 tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks +
4137 tp->rx_opt.dsack;
4138 for (i = this_sack; i < tp->rx_opt.num_sacks; i++) 4154 for (i = this_sack; i < tp->rx_opt.num_sacks; i++)
4139 sp[i] = sp[i + 1]; 4155 sp[i] = sp[i + 1];
4140 continue; 4156 continue;
@@ -4143,20 +4159,6 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp)
4143 } 4159 }
4144} 4160}
4145 4161
4146static inline void tcp_sack_swap(struct tcp_sack_block *sack1,
4147 struct tcp_sack_block *sack2)
4148{
4149 __u32 tmp;
4150
4151 tmp = sack1->start_seq;
4152 sack1->start_seq = sack2->start_seq;
4153 sack2->start_seq = tmp;
4154
4155 tmp = sack1->end_seq;
4156 sack1->end_seq = sack2->end_seq;
4157 sack2->end_seq = tmp;
4158}
4159
4160static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) 4162static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
4161{ 4163{
4162 struct tcp_sock *tp = tcp_sk(sk); 4164 struct tcp_sock *tp = tcp_sk(sk);
@@ -4171,7 +4173,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq)
4171 if (tcp_sack_extend(sp, seq, end_seq)) { 4173 if (tcp_sack_extend(sp, seq, end_seq)) {
4172 /* Rotate this_sack to the first one. */ 4174 /* Rotate this_sack to the first one. */
4173 for (; this_sack > 0; this_sack--, sp--) 4175 for (; this_sack > 0; this_sack--, sp--)
4174 tcp_sack_swap(sp, sp - 1); 4176 swap(*sp, *(sp - 1));
4175 if (cur_sacks > 1) 4177 if (cur_sacks > 1)
4176 tcp_sack_maybe_coalesce(tp); 4178 tcp_sack_maybe_coalesce(tp);
4177 return; 4179 return;
@@ -4197,7 +4199,6 @@ new_sack:
4197 sp->start_seq = seq; 4199 sp->start_seq = seq;
4198 sp->end_seq = end_seq; 4200 sp->end_seq = end_seq;
4199 tp->rx_opt.num_sacks++; 4201 tp->rx_opt.num_sacks++;
4200 tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
4201} 4202}
4202 4203
4203/* RCV.NXT advances, some SACKs should be eaten. */ 4204/* RCV.NXT advances, some SACKs should be eaten. */
@@ -4211,7 +4212,6 @@ static void tcp_sack_remove(struct tcp_sock *tp)
4211 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ 4212 /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */
4212 if (skb_queue_empty(&tp->out_of_order_queue)) { 4213 if (skb_queue_empty(&tp->out_of_order_queue)) {
4213 tp->rx_opt.num_sacks = 0; 4214 tp->rx_opt.num_sacks = 0;
4214 tp->rx_opt.eff_sacks = tp->rx_opt.dsack;
4215 return; 4215 return;
4216 } 4216 }
4217 4217
@@ -4232,11 +4232,7 @@ static void tcp_sack_remove(struct tcp_sock *tp)
4232 this_sack++; 4232 this_sack++;
4233 sp++; 4233 sp++;
4234 } 4234 }
4235 if (num_sacks != tp->rx_opt.num_sacks) { 4235 tp->rx_opt.num_sacks = num_sacks;
4236 tp->rx_opt.num_sacks = num_sacks;
4237 tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks +
4238 tp->rx_opt.dsack;
4239 }
4240} 4236}
4241 4237
4242/* This one checks to see if we can put data from the 4238/* This one checks to see if we can put data from the
@@ -4312,10 +4308,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
4312 4308
4313 TCP_ECN_accept_cwr(tp, skb); 4309 TCP_ECN_accept_cwr(tp, skb);
4314 4310
4315 if (tp->rx_opt.dsack) { 4311 tp->rx_opt.dsack = 0;
4316 tp->rx_opt.dsack = 0;
4317 tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
4318 }
4319 4312
4320 /* Queue data for delivery to the user. 4313 /* Queue data for delivery to the user.
4321 * Packets in sequence go to the receive queue. 4314 * Packets in sequence go to the receive queue.
@@ -4434,8 +4427,6 @@ drop:
4434 /* Initial out of order segment, build 1 SACK. */ 4427 /* Initial out of order segment, build 1 SACK. */
4435 if (tcp_is_sack(tp)) { 4428 if (tcp_is_sack(tp)) {
4436 tp->rx_opt.num_sacks = 1; 4429 tp->rx_opt.num_sacks = 1;
4437 tp->rx_opt.dsack = 0;
4438 tp->rx_opt.eff_sacks = 1;
4439 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; 4430 tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq;
4440 tp->selective_acks[0].end_seq = 4431 tp->selective_acks[0].end_seq =
4441 TCP_SKB_CB(skb)->end_seq; 4432 TCP_SKB_CB(skb)->end_seq;
@@ -5156,7 +5147,8 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
5156 */ 5147 */
5157 5148
5158 if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && 5149 if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags &&
5159 TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { 5150 TCP_SKB_CB(skb)->seq == tp->rcv_nxt &&
5151 !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) {
5160 int tcp_header_len = tp->tcp_header_len; 5152 int tcp_header_len = tp->tcp_header_len;
5161 5153
5162 /* Timestamp header prediction: tcp_header_len 5154 /* Timestamp header prediction: tcp_header_len
@@ -5309,8 +5301,8 @@ slow_path:
5309 return -res; 5301 return -res;
5310 5302
5311step5: 5303step5:
5312 if (th->ack) 5304 if (th->ack && tcp_ack(sk, skb, FLAG_SLOWPATH) < 0)
5313 tcp_ack(sk, skb, FLAG_SLOWPATH); 5305 goto discard;
5314 5306
5315 tcp_rcv_rtt_measure_ts(sk, skb); 5307 tcp_rcv_rtt_measure_ts(sk, skb);
5316 5308
@@ -5408,7 +5400,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
5408 * never scaled. 5400 * never scaled.
5409 */ 5401 */
5410 tp->snd_wnd = ntohs(th->window); 5402 tp->snd_wnd = ntohs(th->window);
5411 tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq); 5403 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5412 5404
5413 if (!tp->rx_opt.wscale_ok) { 5405 if (!tp->rx_opt.wscale_ok) {
5414 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; 5406 tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0;
@@ -5509,7 +5501,7 @@ discard:
5509 5501
5510 /* PAWS check. */ 5502 /* PAWS check. */
5511 if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && 5503 if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp &&
5512 tcp_paws_check(&tp->rx_opt, 0)) 5504 tcp_paws_reject(&tp->rx_opt, 0))
5513 goto discard_and_undo; 5505 goto discard_and_undo;
5514 5506
5515 if (th->syn) { 5507 if (th->syn) {
@@ -5647,7 +5639,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5647 5639
5648 /* step 5: check the ACK field */ 5640 /* step 5: check the ACK field */
5649 if (th->ack) { 5641 if (th->ack) {
5650 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); 5642 int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH) > 0;
5651 5643
5652 switch (sk->sk_state) { 5644 switch (sk->sk_state) {
5653 case TCP_SYN_RECV: 5645 case TCP_SYN_RECV:
@@ -5669,8 +5661,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb,
5669 tp->snd_una = TCP_SKB_CB(skb)->ack_seq; 5661 tp->snd_una = TCP_SKB_CB(skb)->ack_seq;
5670 tp->snd_wnd = ntohs(th->window) << 5662 tp->snd_wnd = ntohs(th->window) <<
5671 tp->rx_opt.snd_wscale; 5663 tp->rx_opt.snd_wscale;
5672 tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, 5664 tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
5673 TCP_SKB_CB(skb)->seq);
5674 5665
5675 /* tcp_ack considers this ACK as duplicate 5666 /* tcp_ack considers this ACK as duplicate
5676 * and does not calculate rtt. 5667 * and does not calculate rtt.
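
[note] Among the tcp_input.c changes above, tcp_ack() now distinguishes an old ACK (below SND.UNA) from an invalid one that acknowledges data never sent (above SND.NXT), returning a negative value so the slow path can discard the segment per RFC 793 section 3.9. A compilable sketch of that classification using the usual wrap-safe sequence comparisons (helper names are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
    static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

    /* returns 1 = acceptable, 0 = old (ignore), -1 = invalid (discard) */
    static int classify_ack(uint32_t ack, uint32_t snd_una, uint32_t snd_nxt)
    {
            if (seq_before(ack, snd_una))
                    return 0;
            if (seq_after(ack, snd_nxt))
                    return -1;
            return 1;
    }

    int main(void)
    {
            uint32_t una = 0xfffffff0u, nxt = 0x00000010u;  /* window wraps 2^32 */

            printf("%d %d %d\n",
                   classify_ack(0x00000005u, una, nxt),     /*  1: in window  */
                   classify_ack(0xffffffe0u, una, nxt),     /*  0: old ack    */
                   classify_ack(0x00000100u, una, nxt));    /* -1: beyond nxt */
            return 0;
    }
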
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f6b962f56ab4..d0a314879d81 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1226,15 +1226,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1226 if (want_cookie && !tmp_opt.saw_tstamp) 1226 if (want_cookie && !tmp_opt.saw_tstamp)
1227 tcp_clear_options(&tmp_opt); 1227 tcp_clear_options(&tmp_opt);
1228 1228
1229 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1230 /* Some OSes (unknown ones, but I see them on web server, which
1231 * contains information interesting only for windows'
1232 * users) do not send their stamp in SYN. It is easy case.
1233 * We simply do not advertise TS support.
1234 */
1235 tmp_opt.saw_tstamp = 0;
1236 tmp_opt.tstamp_ok = 0;
1237 }
1238 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; 1229 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1239 1230
1240 tcp_openreq_init(req, &tmp_opt, skb); 1231 tcp_openreq_init(req, &tmp_opt, skb);
@@ -2443,7 +2434,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = {
2443void __init tcp_v4_init(void) 2434void __init tcp_v4_init(void)
2444{ 2435{
2445 inet_hashinfo_init(&tcp_hashinfo); 2436 inet_hashinfo_init(&tcp_hashinfo);
2446 if (register_pernet_device(&tcp_sk_ops)) 2437 if (register_pernet_subsys(&tcp_sk_ops))
2447 panic("Failed to create the TCP control socket.\n"); 2438 panic("Failed to create the TCP control socket.\n");
2448} 2439}
2449 2440
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f67effbb102b..43bbba7926ee 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -107,7 +107,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
107 if (tmp_opt.saw_tstamp) { 107 if (tmp_opt.saw_tstamp) {
108 tmp_opt.ts_recent = tcptw->tw_ts_recent; 108 tmp_opt.ts_recent = tcptw->tw_ts_recent;
109 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; 109 tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
110 paws_reject = tcp_paws_check(&tmp_opt, th->rst); 110 paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
111 } 111 }
112 } 112 }
113 113
@@ -399,7 +399,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
399 399
400 tcp_prequeue_init(newtp); 400 tcp_prequeue_init(newtp);
401 401
402 tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn); 402 tcp_init_wl(newtp, treq->rcv_isn);
403 403
404 newtp->srtt = 0; 404 newtp->srtt = 0;
405 newtp->mdev = TCP_TIMEOUT_INIT; 405 newtp->mdev = TCP_TIMEOUT_INIT;
@@ -434,9 +434,8 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
434 newtp->rx_opt.saw_tstamp = 0; 434 newtp->rx_opt.saw_tstamp = 0;
435 435
436 newtp->rx_opt.dsack = 0; 436 newtp->rx_opt.dsack = 0;
437 newtp->rx_opt.eff_sacks = 0;
438
439 newtp->rx_opt.num_sacks = 0; 437 newtp->rx_opt.num_sacks = 0;
438
440 newtp->urg_data = 0; 439 newtp->urg_data = 0;
441 440
442 if (sock_flag(newsk, SOCK_KEEPOPEN)) 441 if (sock_flag(newsk, SOCK_KEEPOPEN))
@@ -512,7 +511,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
512 * from another data. 511 * from another data.
513 */ 512 */
514 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans); 513 tmp_opt.ts_recent_stamp = get_seconds() - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
515 paws_reject = tcp_paws_check(&tmp_opt, th->rst); 514 paws_reject = tcp_paws_reject(&tmp_opt, th->rst);
516 } 515 }
517 } 516 }
518 517
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index dda42f0bd7a3..c1f259d2d33b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -441,10 +441,7 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
441 *ptr++ = htonl(sp[this_sack].end_seq); 441 *ptr++ = htonl(sp[this_sack].end_seq);
442 } 442 }
443 443
444 if (tp->rx_opt.dsack) { 444 tp->rx_opt.dsack = 0;
445 tp->rx_opt.dsack = 0;
446 tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
447 }
448 } 445 }
449} 446}
450 447
@@ -550,6 +547,7 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
550 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; 547 struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
551 struct tcp_sock *tp = tcp_sk(sk); 548 struct tcp_sock *tp = tcp_sk(sk);
552 unsigned size = 0; 549 unsigned size = 0;
550 unsigned int eff_sacks;
553 551
554#ifdef CONFIG_TCP_MD5SIG 552#ifdef CONFIG_TCP_MD5SIG
555 *md5 = tp->af_specific->md5_lookup(sk, sk); 553 *md5 = tp->af_specific->md5_lookup(sk, sk);
@@ -568,10 +566,11 @@ static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
568 size += TCPOLEN_TSTAMP_ALIGNED; 566 size += TCPOLEN_TSTAMP_ALIGNED;
569 } 567 }
570 568
571 if (unlikely(tp->rx_opt.eff_sacks)) { 569 eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
570 if (unlikely(eff_sacks)) {
572 const unsigned remaining = MAX_TCP_OPTION_SPACE - size; 571 const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
573 opts->num_sack_blocks = 572 opts->num_sack_blocks =
574 min_t(unsigned, tp->rx_opt.eff_sacks, 573 min_t(unsigned, eff_sacks,
575 (remaining - TCPOLEN_SACK_BASE_ALIGNED) / 574 (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
576 TCPOLEN_SACK_PERBLOCK); 575 TCPOLEN_SACK_PERBLOCK);
577 size += TCPOLEN_SACK_BASE_ALIGNED + 576 size += TCPOLEN_SACK_BASE_ALIGNED +
@@ -663,10 +662,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
663 th->urg_ptr = 0; 662 th->urg_ptr = 0;
664 663
665 /* The urg_mode check is necessary during a below snd_una win probe */ 664 /* The urg_mode check is necessary during a below snd_una win probe */
666 if (unlikely(tcp_urg_mode(tp) && 665 if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
667 between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { 666 if (before(tp->snd_up, tcb->seq + 0x10000)) {
668 th->urg_ptr = htons(tp->snd_up - tcb->seq); 667 th->urg_ptr = htons(tp->snd_up - tcb->seq);
669 th->urg = 1; 668 th->urg = 1;
669 } else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
670 th->urg_ptr = 0xFFFF;
671 th->urg = 1;
672 }
670 } 673 }
671 674
672 tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); 675 tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
@@ -763,11 +766,10 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
763 struct sk_buff *buff; 766 struct sk_buff *buff;
764 int nsize, old_factor; 767 int nsize, old_factor;
765 int nlen; 768 int nlen;
766 u16 flags; 769 u8 flags;
767 770
768 BUG_ON(len > skb->len); 771 BUG_ON(len > skb->len);
769 772
770 tcp_clear_retrans_hints_partial(tp);
771 nsize = skb_headlen(skb) - len; 773 nsize = skb_headlen(skb) - len;
772 if (nsize < 0) 774 if (nsize < 0)
773 nsize = 0; 775 nsize = 0;
@@ -850,6 +852,12 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len,
850 tcp_verify_left_out(tp); 852 tcp_verify_left_out(tp);
851 } 853 }
852 tcp_adjust_fackets_out(sk, skb, diff); 854 tcp_adjust_fackets_out(sk, skb, diff);
855
856 if (tp->lost_skb_hint &&
857 before(TCP_SKB_CB(skb)->seq,
858 TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
859 (tcp_is_fack(tp) || TCP_SKB_CB(skb)->sacked))
860 tp->lost_cnt_hint -= diff;
853 } 861 }
854 862
855 /* Link BUFF into the send queue. */ 863 /* Link BUFF into the send queue. */
@@ -913,7 +921,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
913 * factor and mss. 921 * factor and mss.
914 */ 922 */
915 if (tcp_skb_pcount(skb) > 1) 923 if (tcp_skb_pcount(skb) > 1)
916 tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1)); 924 tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk));
917 925
918 return 0; 926 return 0;
919} 927}
@@ -974,15 +982,6 @@ void tcp_mtup_init(struct sock *sk)
974 icsk->icsk_mtup.probe_size = 0; 982 icsk->icsk_mtup.probe_size = 0;
975} 983}
976 984
977/* Bound MSS / TSO packet size with the half of the window */
978static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
979{
980 if (tp->max_window && pktsize > (tp->max_window >> 1))
981 return max(tp->max_window >> 1, 68U - tp->tcp_header_len);
982 else
983 return pktsize;
984}
985
986/* This function synchronize snd mss to current pmtu/exthdr set. 985/* This function synchronize snd mss to current pmtu/exthdr set.
987 986
988 tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts 987 tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
@@ -1029,22 +1028,17 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
1029/* Compute the current effective MSS, taking SACKs and IP options, 1028/* Compute the current effective MSS, taking SACKs and IP options,
1030 * and even PMTU discovery events into account. 1029 * and even PMTU discovery events into account.
1031 */ 1030 */
1032unsigned int tcp_current_mss(struct sock *sk, int large_allowed) 1031unsigned int tcp_current_mss(struct sock *sk)
1033{ 1032{
1034 struct tcp_sock *tp = tcp_sk(sk); 1033 struct tcp_sock *tp = tcp_sk(sk);
1035 struct dst_entry *dst = __sk_dst_get(sk); 1034 struct dst_entry *dst = __sk_dst_get(sk);
1036 u32 mss_now; 1035 u32 mss_now;
1037 u16 xmit_size_goal;
1038 int doing_tso = 0;
1039 unsigned header_len; 1036 unsigned header_len;
1040 struct tcp_out_options opts; 1037 struct tcp_out_options opts;
1041 struct tcp_md5sig_key *md5; 1038 struct tcp_md5sig_key *md5;
1042 1039
1043 mss_now = tp->mss_cache; 1040 mss_now = tp->mss_cache;
1044 1041
1045 if (large_allowed && sk_can_gso(sk))
1046 doing_tso = 1;
1047
1048 if (dst) { 1042 if (dst) {
1049 u32 mtu = dst_mtu(dst); 1043 u32 mtu = dst_mtu(dst);
1050 if (mtu != inet_csk(sk)->icsk_pmtu_cookie) 1044 if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
@@ -1062,19 +1056,6 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
1062 mss_now -= delta; 1056 mss_now -= delta;
1063 } 1057 }
1064 1058
1065 xmit_size_goal = mss_now;
1066
1067 if (doing_tso) {
1068 xmit_size_goal = ((sk->sk_gso_max_size - 1) -
1069 inet_csk(sk)->icsk_af_ops->net_header_len -
1070 inet_csk(sk)->icsk_ext_hdr_len -
1071 tp->tcp_header_len);
1072
1073 xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
1074 xmit_size_goal -= (xmit_size_goal % mss_now);
1075 }
1076 tp->xmit_size_goal = xmit_size_goal;
1077
1078 return mss_now; 1059 return mss_now;
1079} 1060}
1080 1061
@@ -1256,7 +1237,7 @@ int tcp_may_send_now(struct sock *sk)
1256 struct sk_buff *skb = tcp_send_head(sk); 1237 struct sk_buff *skb = tcp_send_head(sk);
1257 1238
1258 return (skb && 1239 return (skb &&
1259 tcp_snd_test(sk, skb, tcp_current_mss(sk, 1), 1240 tcp_snd_test(sk, skb, tcp_current_mss(sk),
1260 (tcp_skb_is_last(sk, skb) ? 1241 (tcp_skb_is_last(sk, skb) ?
1261 tp->nonagle : TCP_NAGLE_PUSH))); 1242 tp->nonagle : TCP_NAGLE_PUSH)));
1262} 1243}
@@ -1273,7 +1254,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
1273{ 1254{
1274 struct sk_buff *buff; 1255 struct sk_buff *buff;
1275 int nlen = skb->len - len; 1256 int nlen = skb->len - len;
1276 u16 flags; 1257 u8 flags;
1277 1258
1278 /* All of a TSO frame must be composed of paged data. */ 1259 /* All of a TSO frame must be composed of paged data. */
1279 if (skb->len != skb->data_len) 1260 if (skb->len != skb->data_len)
@@ -1352,6 +1333,10 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb)
1352 if (limit >= sk->sk_gso_max_size) 1333 if (limit >= sk->sk_gso_max_size)
1353 goto send_now; 1334 goto send_now;
1354 1335
1336 /* Middle in queue won't get any more data, full sendable already? */
1337 if ((skb != tcp_write_queue_tail(sk)) && (limit >= skb->len))
1338 goto send_now;
1339
1355 if (sysctl_tcp_tso_win_divisor) { 1340 if (sysctl_tcp_tso_win_divisor) {
1356 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); 1341 u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
1357 1342
@@ -1405,11 +1390,11 @@ static int tcp_mtu_probe(struct sock *sk)
1405 icsk->icsk_mtup.probe_size || 1390 icsk->icsk_mtup.probe_size ||
1406 inet_csk(sk)->icsk_ca_state != TCP_CA_Open || 1391 inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
1407 tp->snd_cwnd < 11 || 1392 tp->snd_cwnd < 11 ||
1408 tp->rx_opt.eff_sacks) 1393 tp->rx_opt.num_sacks || tp->rx_opt.dsack)
1409 return -1; 1394 return -1;
1410 1395
1411 /* Very simple search strategy: just double the MSS. */ 1396 /* Very simple search strategy: just double the MSS. */
1412 mss_now = tcp_current_mss(sk, 0); 1397 mss_now = tcp_current_mss(sk);
1413 probe_size = 2 * tp->mss_cache; 1398 probe_size = 2 * tp->mss_cache;
1414 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; 1399 size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
1415 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { 1400 if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
@@ -1754,11 +1739,9 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
1754 struct tcp_sock *tp = tcp_sk(sk); 1739 struct tcp_sock *tp = tcp_sk(sk);
1755 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); 1740 struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
1756 int skb_size, next_skb_size; 1741 int skb_size, next_skb_size;
1757 u16 flags;
1758 1742
1759 skb_size = skb->len; 1743 skb_size = skb->len;
1760 next_skb_size = next_skb->len; 1744 next_skb_size = next_skb->len;
1761 flags = TCP_SKB_CB(skb)->flags;
1762 1745
1763 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); 1746 BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
1764 1747
@@ -1778,9 +1761,8 @@ static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
1778 /* Update sequence range on original skb. */ 1761 /* Update sequence range on original skb. */
1779 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; 1762 TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
1780 1763
1781 /* Merge over control information. */ 1764 /* Merge over control information. This moves PSH/FIN etc. over */
1782 flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ 1765 TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags;
1783 TCP_SKB_CB(skb)->flags = flags;
1784 1766
1785 /* All done, get rid of second SKB and account for it so 1767 /* All done, get rid of second SKB and account for it so
1786 * packet counting does not break. 1768 * packet counting does not break.
@@ -1894,7 +1876,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1894 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) 1876 if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
1895 return -EHOSTUNREACH; /* Routing failure or similar. */ 1877 return -EHOSTUNREACH; /* Routing failure or similar. */
1896 1878
1897 cur_mss = tcp_current_mss(sk, 0); 1879 cur_mss = tcp_current_mss(sk);
1898 1880
1899 /* If receiver has shrunk his window, and skb is out of 1881 /* If receiver has shrunk his window, and skb is out of
1900 * new window, do not retransmit it. The exception is the 1882 * new window, do not retransmit it. The exception is the
@@ -1908,6 +1890,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
1908 if (skb->len > cur_mss) { 1890 if (skb->len > cur_mss) {
1909 if (tcp_fragment(sk, skb, cur_mss, cur_mss)) 1891 if (tcp_fragment(sk, skb, cur_mss, cur_mss))
1910 return -ENOMEM; /* We'll try again later. */ 1892 return -ENOMEM; /* We'll try again later. */
1893 } else {
1894 tcp_init_tso_segs(sk, skb, cur_mss);
1911 } 1895 }
1912 1896
1913 tcp_retrans_try_collapse(sk, skb, cur_mss); 1897 tcp_retrans_try_collapse(sk, skb, cur_mss);
@@ -2023,7 +2007,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
2023 last_lost = tp->snd_una; 2007 last_lost = tp->snd_una;
2024 } 2008 }
2025 2009
2026 /* First pass: retransmit lost packets. */
2027 tcp_for_write_queue_from(skb, sk) { 2010 tcp_for_write_queue_from(skb, sk) {
2028 __u8 sacked = TCP_SKB_CB(skb)->sacked; 2011 __u8 sacked = TCP_SKB_CB(skb)->sacked;
2029 2012
@@ -2062,7 +2045,7 @@ begin_fwd:
2062 goto begin_fwd; 2045 goto begin_fwd;
2063 2046
2064 } else if (!(sacked & TCPCB_LOST)) { 2047 } else if (!(sacked & TCPCB_LOST)) {
2065 if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS)) 2048 if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
2066 hole = skb; 2049 hole = skb;
2067 continue; 2050 continue;
2068 2051
@@ -2101,7 +2084,7 @@ void tcp_send_fin(struct sock *sk)
2101 * unsent frames. But be careful about outgoing SACKS 2084 * unsent frames. But be careful about outgoing SACKS
2102 * and IP options. 2085 * and IP options.
2103 */ 2086 */
2104 mss_now = tcp_current_mss(sk, 1); 2087 mss_now = tcp_current_mss(sk);
2105 2088
2106 if (tcp_send_head(sk) != NULL) { 2089 if (tcp_send_head(sk) != NULL) {
2107 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; 2090 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
@@ -2326,7 +2309,7 @@ static void tcp_connect_init(struct sock *sk)
2326 sk->sk_err = 0; 2309 sk->sk_err = 0;
2327 sock_reset_flag(sk, SOCK_DONE); 2310 sock_reset_flag(sk, SOCK_DONE);
2328 tp->snd_wnd = 0; 2311 tp->snd_wnd = 0;
2329 tcp_init_wl(tp, tp->write_seq, 0); 2312 tcp_init_wl(tp, 0);
2330 tp->snd_una = tp->write_seq; 2313 tp->snd_una = tp->write_seq;
2331 tp->snd_sml = tp->write_seq; 2314 tp->snd_sml = tp->write_seq;
2332 tp->snd_up = tp->write_seq; 2315 tp->snd_up = tp->write_seq;
@@ -2513,7 +2496,7 @@ int tcp_write_wakeup(struct sock *sk)
2513 if ((skb = tcp_send_head(sk)) != NULL && 2496 if ((skb = tcp_send_head(sk)) != NULL &&
2514 before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { 2497 before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
2515 int err; 2498 int err;
2516 unsigned int mss = tcp_current_mss(sk, 0); 2499 unsigned int mss = tcp_current_mss(sk);
2517 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; 2500 unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
2518 2501
2519 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) 2502 if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))
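
[note] One of the tcp_output.c changes above reworks how the 16-bit urgent pointer is filled in: when SND.UP lies more than 64K beyond the segment's sequence number the pointer is clamped to 0xFFFF, provided that offset still falls within data that has actually been sent. A small sketch of that clamping (helper names are illustrative):

    #include <stdio.h>
    #include <stdint.h>

    static int seq_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
    static int seq_after(uint32_t a, uint32_t b)  { return seq_before(b, a); }

    /* returns the urg_ptr to put in the header, or -1 for "no URG bit" */
    static long urg_ptr_for(uint32_t seq, uint32_t snd_up, uint32_t snd_nxt)
    {
            if (!seq_before(seq, snd_up))
                    return -1;                 /* urgent data already behind us */
            if (seq_before(snd_up, seq + 0x10000))
                    return snd_up - seq;       /* fits in 16 bits */
            if (seq_after(seq + 0xFFFF, snd_nxt))
                    return 0xFFFF;             /* clamp to the maximum offset */
            return -1;
    }

    int main(void)
    {
            printf("%ld\n", urg_ptr_for(1000, 1500, 200000));    /* 500   */
            printf("%ld\n", urg_ptr_for(1000, 300000, 50000));   /* 65535 */
            return 0;
    }
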
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index 25524d4e372a..59f5b5e7c566 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -165,9 +165,10 @@ static int tcpprobe_sprint(char *tbuf, int n)
165static ssize_t tcpprobe_read(struct file *file, char __user *buf, 165static ssize_t tcpprobe_read(struct file *file, char __user *buf,
166 size_t len, loff_t *ppos) 166 size_t len, loff_t *ppos)
167{ 167{
168 int error = 0, cnt = 0; 168 int error = 0;
169 size_t cnt = 0;
169 170
170 if (!buf || len < 0) 171 if (!buf)
171 return -EINVAL; 172 return -EINVAL;
172 173
173 while (cnt < len) { 174 while (cnt < len) {
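
[note] The tcp_probe.c hunk above changes the read handler's byte counter from int to size_t and drops the len < 0 test, which can never fire on an unsigned size_t parameter. A small sketch of the signed/unsigned comparison surprise the change avoids:

    #include <stdio.h>

    int main(void)
    {
            size_t len = 10;
            int cnt = -1;

            if (cnt < len)          /* -1 converts to SIZE_MAX: comparison is false */
                    printf("cnt < len\n");
            else
                    printf("cnt >= len (signed/unsigned surprise)\n");
            return 0;
    }
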
diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c
index 2747ec7bfb63..a76513779e2b 100644
--- a/net/ipv4/tcp_scalable.c
+++ b/net/ipv4/tcp_scalable.c
@@ -1,6 +1,6 @@
1/* Tom Kelly's Scalable TCP 1/* Tom Kelly's Scalable TCP
2 * 2 *
3 * See htt://www-lce.eng.cam.ac.uk/~ctk21/scalable/ 3 * See http://www.deneholme.net/tom/scalable/
4 * 4 *
5 * John Heffner <jheffner@sc.edu> 5 * John Heffner <jheffner@sc.edu>
6 */ 6 */
@@ -24,14 +24,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
24 24
25 if (tp->snd_cwnd <= tp->snd_ssthresh) 25 if (tp->snd_cwnd <= tp->snd_ssthresh)
26 tcp_slow_start(tp); 26 tcp_slow_start(tp);
27 else { 27 else
28 tp->snd_cwnd_cnt++; 28 tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT));
29 if (tp->snd_cwnd_cnt > min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)){
30 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
31 tp->snd_cwnd++;
32 tp->snd_cwnd_cnt = 0;
33 }
34 }
35} 29}
36 30
37static u32 tcp_scalable_ssthresh(struct sock *sk) 31static u32 tcp_scalable_ssthresh(struct sock *sk)
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0170e914f1b0..b144a26359bc 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -328,19 +328,16 @@ static void tcp_retransmit_timer(struct sock *sk)
328 if (icsk->icsk_retransmits == 0) { 328 if (icsk->icsk_retransmits == 0) {
329 int mib_idx; 329 int mib_idx;
330 330
331 if (icsk->icsk_ca_state == TCP_CA_Disorder || 331 if (icsk->icsk_ca_state == TCP_CA_Disorder) {
332 icsk->icsk_ca_state == TCP_CA_Recovery) { 332 if (tcp_is_sack(tp))
333 if (tcp_is_sack(tp)) { 333 mib_idx = LINUX_MIB_TCPSACKFAILURES;
334 if (icsk->icsk_ca_state == TCP_CA_Recovery) 334 else
335 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; 335 mib_idx = LINUX_MIB_TCPRENOFAILURES;
336 else 336 } else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
337 mib_idx = LINUX_MIB_TCPSACKFAILURES; 337 if (tcp_is_sack(tp))
338 } else { 338 mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
339 if (icsk->icsk_ca_state == TCP_CA_Recovery) 339 else
340 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; 340 mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
341 else
342 mib_idx = LINUX_MIB_TCPRENOFAILURES;
343 }
344 } else if (icsk->icsk_ca_state == TCP_CA_Loss) { 341 } else if (icsk->icsk_ca_state == TCP_CA_Loss) {
345 mib_idx = LINUX_MIB_TCPLOSSFAILURES; 342 mib_idx = LINUX_MIB_TCPLOSSFAILURES;
346 } else { 343 } else {
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c
index d08b2e855c22..e9bbff746488 100644
--- a/net/ipv4/tcp_veno.c
+++ b/net/ipv4/tcp_veno.c
@@ -159,12 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
159 /* In the "non-congestive state", increase cwnd 159 /* In the "non-congestive state", increase cwnd
160 * every rtt. 160 * every rtt.
161 */ 161 */
162 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { 162 tcp_cong_avoid_ai(tp, tp->snd_cwnd);
163 if (tp->snd_cwnd < tp->snd_cwnd_clamp)
164 tp->snd_cwnd++;
165 tp->snd_cwnd_cnt = 0;
166 } else
167 tp->snd_cwnd_cnt++;
168 } else { 163 } else {
169 /* In the "congestive state", increase cwnd 164 /* In the "congestive state", increase cwnd
170 * every other rtt. 165 * every other rtt.
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 9ec843a9bbb2..66b6821b984e 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -94,14 +94,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
94 94
95 } else { 95 } else {
96 /* Reno */ 96 /* Reno */
97 97 tcp_cong_avoid_ai(tp, tp->snd_cwnd);
98 if (tp->snd_cwnd_cnt < tp->snd_cwnd)
99 tp->snd_cwnd_cnt++;
100
101 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
102 tp->snd_cwnd++;
103 tp->snd_cwnd_cnt = 0;
104 }
105 } 98 }
106 99
107 /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. 100 /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt.
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 4bd178a111d5..05b7abb99f69 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1184,7 +1184,7 @@ static int __udp4_lib_mcast_deliver(struct net *net, struct sk_buff *skb,
1184 sk = sknext; 1184 sk = sknext;
1185 } while (sknext); 1185 } while (sknext);
1186 } else 1186 } else
1187 kfree_skb(skb); 1187 consume_skb(skb);
1188 spin_unlock(&hslot->lock); 1188 spin_unlock(&hslot->lock);
1189 return 0; 1189 return 0;
1190} 1190}
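
[note] The udp.c hunk above frees the multicast skb with consume_skb() rather than kfree_skb(); in this series kfree_skb() is the path that drop-monitoring hooks treat as a packet drop, while consume_skb() marks a buffer disposed of normally. A sketch of that two-wrapper pattern in plain C (names are illustrative, not the kernel API):

    #include <stdio.h>
    #include <stdlib.h>

    static unsigned long drops;

    struct buf { void *data; };

    static void free_buf(struct buf *b)     { free(b); }
    static void drop_buf(struct buf *b)     { drops++; free_buf(b); }  /* error path   */
    static void consume_buf(struct buf *b)  { free_buf(b); }           /* normal path  */

    int main(void)
    {
            consume_buf(calloc(1, sizeof(struct buf)));
            drop_buf(calloc(1, sizeof(struct buf)));
            printf("drops = %lu\n", drops);  /* 1 */
            return 0;
    }
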
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 03e2a1ad71e9..8499da9e76a2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -40,6 +40,7 @@
40 40
41#include <linux/errno.h> 41#include <linux/errno.h>
42#include <linux/types.h> 42#include <linux/types.h>
43#include <linux/kernel.h>
43#include <linux/socket.h> 44#include <linux/socket.h>
44#include <linux/sockios.h> 45#include <linux/sockios.h>
45#include <linux/net.h> 46#include <linux/net.h>
@@ -493,15 +494,17 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
493 read_unlock(&dev_base_lock); 494 read_unlock(&dev_base_lock);
494} 495}
495 496
496static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) 497static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
497{ 498{
498 struct net *net; 499 struct net *net;
499 500
500 net = (struct net *)table->extra2; 501 net = (struct net *)table->extra2;
501 if (p == &net->ipv6.devconf_dflt->forwarding) 502 if (p == &net->ipv6.devconf_dflt->forwarding)
502 return; 503 return 0;
504
505 if (!rtnl_trylock())
506 return -ERESTARTSYS;
503 507
504 rtnl_lock();
505 if (p == &net->ipv6.devconf_all->forwarding) { 508 if (p == &net->ipv6.devconf_all->forwarding) {
506 __s32 newf = net->ipv6.devconf_all->forwarding; 509 __s32 newf = net->ipv6.devconf_all->forwarding;
507 net->ipv6.devconf_dflt->forwarding = newf; 510 net->ipv6.devconf_dflt->forwarding = newf;
@@ -512,6 +515,7 @@ static void addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old)
512 515
513 if (*p) 516 if (*p)
514 rt6_purge_dflt_routers(net); 517 rt6_purge_dflt_routers(net);
518 return 1;
515} 519}
516#endif 520#endif
517 521
@@ -587,6 +591,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
587{ 591{
588 struct inet6_ifaddr *ifa = NULL; 592 struct inet6_ifaddr *ifa = NULL;
589 struct rt6_info *rt; 593 struct rt6_info *rt;
594 struct net *net = dev_net(idev->dev);
590 int hash; 595 int hash;
591 int err = 0; 596 int err = 0;
592 int addr_type = ipv6_addr_type(addr); 597 int addr_type = ipv6_addr_type(addr);
@@ -603,6 +608,11 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen,
603 goto out2; 608 goto out2;
604 } 609 }
605 610
611 if (idev->cnf.disable_ipv6 || net->ipv6.devconf_all->disable_ipv6) {
612 err = -EACCES;
613 goto out2;
614 }
615
606 write_lock(&addrconf_hash_lock); 616 write_lock(&addrconf_hash_lock);
607 617
608 /* Ignore adding duplicate addresses on an interface */ 618 /* Ignore adding duplicate addresses on an interface */
@@ -1206,16 +1216,12 @@ int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev,
1206 } 1216 }
1207 break; 1217 break;
1208 } else if (minihiscore < miniscore) { 1218 } else if (minihiscore < miniscore) {
1209 struct ipv6_saddr_score *tmp;
1210
1211 if (hiscore->ifa) 1219 if (hiscore->ifa)
1212 in6_ifa_put(hiscore->ifa); 1220 in6_ifa_put(hiscore->ifa);
1213 1221
1214 in6_ifa_hold(score->ifa); 1222 in6_ifa_hold(score->ifa);
1215 1223
1216 tmp = hiscore; 1224 swap(hiscore, score);
1217 hiscore = score;
1218 score = tmp;
1219 1225
1220 /* restore our iterator */ 1226 /* restore our iterator */
1221 score->ifa = hiscore->ifa; 1227 score->ifa = hiscore->ifa;
@@ -1430,6 +1436,11 @@ static void addrconf_dad_stop(struct inet6_ifaddr *ifp)
 void addrconf_dad_failure(struct inet6_ifaddr *ifp)
 {
 	struct inet6_dev *idev = ifp->idev;
+
+	if (net_ratelimit())
+		printk(KERN_INFO "%s: IPv6 duplicate address detected!\n",
+			ifp->idev->dev->name);
+
 	if (idev->cnf.accept_dad > 1 && !idev->cnf.disable_ipv6) {
 		struct in6_addr addr;
 
@@ -1440,11 +1451,12 @@ void addrconf_dad_failure(struct inet6_ifaddr *ifp)
 		    ipv6_addr_equal(&ifp->addr, &addr)) {
 			/* DAD failed for link-local based on MAC address */
 			idev->cnf.disable_ipv6 = 1;
+
+			printk(KERN_INFO "%s: IPv6 being disabled!\n",
+				ifp->idev->dev->name);
 		}
 	}
 
-	if (net_ratelimit())
-		printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name);
 	addrconf_dad_stop(ifp);
 }
 
@@ -2599,9 +2611,6 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 
 	ASSERT_RTNL();
 
-	if ((dev->flags & IFF_LOOPBACK) && how == 1)
-		how = 0;
-
 	rt6_ifdown(net, dev);
 	neigh_ifdown(&nd_tbl, dev);
 
@@ -2823,11 +2832,6 @@ static void addrconf_dad_timer(unsigned long data)
 		read_unlock_bh(&idev->lock);
 		goto out;
 	}
-	if (idev->cnf.accept_dad > 1 && idev->cnf.disable_ipv6) {
-		read_unlock_bh(&idev->lock);
-		addrconf_dad_failure(ifp);
-		return;
-	}
 	spin_lock_bh(&ifp->lock);
 	if (ifp->probes == 0) {
 		/*
@@ -3638,7 +3642,8 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
 		kfree_skb(skb);
 		goto errout;
 	}
-	err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+	rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+	return;
 errout:
 	if (err < 0)
 		rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
@@ -3849,7 +3854,8 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev)
 		kfree_skb(skb);
 		goto errout;
 	}
-	err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+	rtnl_notify(skb, net, 0, RTNLGRP_IPV6_IFADDR, NULL, GFP_ATOMIC);
+	return;
 errout:
 	if (err < 0)
 		rtnl_set_sk_err(net, RTNLGRP_IPV6_IFADDR, err);
@@ -3919,7 +3925,8 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev,
 		kfree_skb(skb);
 		goto errout;
 	}
-	err = rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
+	rtnl_notify(skb, net, 0, RTNLGRP_IPV6_PREFIX, NULL, GFP_ATOMIC);
+	return;
 errout:
 	if (err < 0)
 		rtnl_set_sk_err(net, RTNLGRP_IPV6_PREFIX, err);
@@ -3974,7 +3981,7 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp,
 	ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
 
 	if (write)
-		addrconf_fixup_forwarding(ctl, valp, val);
+		ret = addrconf_fixup_forwarding(ctl, valp, val);
 	return ret;
 }
3980 3987
@@ -4010,8 +4017,7 @@ static int addrconf_sysctl_forward_strategy(ctl_table *table,
 	}
 
 	*valp = new;
-	addrconf_fixup_forwarding(table, valp, val);
-	return 1;
+	return addrconf_fixup_forwarding(table, valp, val);
 }
 
 static struct addrconf_sysctl_table
@@ -4437,25 +4443,6 @@ int unregister_inet6addr_notifier(struct notifier_block *nb)
 
 EXPORT_SYMBOL(unregister_inet6addr_notifier);
 
-static void addrconf_net_exit(struct net *net)
-{
-	struct net_device *dev;
-
-	rtnl_lock();
-	/* clean dev list */
-	for_each_netdev(net, dev) {
-		if (__in6_dev_get(dev) == NULL)
-			continue;
-		addrconf_ifdown(dev, 1);
-	}
-	addrconf_ifdown(net->loopback_dev, 2);
-	rtnl_unlock();
-}
-
-static struct pernet_operations addrconf_net_ops = {
-	.exit = addrconf_net_exit,
-};
-
 /*
  * Init / cleanup code
  */
@@ -4497,10 +4484,6 @@ int __init addrconf_init(void)
 	if (err)
 		goto errlo;
 
-	err = register_pernet_device(&addrconf_net_ops);
-	if (err)
-		return err;
-
 	register_netdevice_notifier(&ipv6_dev_notf);
 
 	addrconf_verify(0);
@@ -4530,15 +4513,22 @@ errlo:
 void addrconf_cleanup(void)
 {
 	struct inet6_ifaddr *ifa;
+	struct net_device *dev;
 	int i;
 
 	unregister_netdevice_notifier(&ipv6_dev_notf);
-	unregister_pernet_device(&addrconf_net_ops);
-
 	unregister_pernet_subsys(&addrconf_ops);
 
 	rtnl_lock();
 
+	/* clean dev list */
+	for_each_netdev(&init_net, dev) {
+		if (__in6_dev_get(dev) == NULL)
+			continue;
+		addrconf_ifdown(dev, 1);
+	}
+	addrconf_ifdown(init_net.loopback_dev, 2);
+
 	/*
 	 * Check hash table.
 	 */
@@ -4559,6 +4549,4 @@ void addrconf_cleanup(void)
 
 	del_timer(&addr_chk_timer);
 	rtnl_unlock();
-
-	unregister_pernet_subsys(&addrconf_net_ops);
 }
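The addrconf.c hunks above switch addrconf_fixup_forwarding() from an unconditional rtnl_lock() to rtnl_trylock(), make it return -ERESTARTSYS so a contended sysctl write is restarted instead of blocking, and propagate that return value out of both sysctl entry points. A minimal sketch of the idiom, using hypothetical names rather than the kernel's:

/* Sketch only: the trylock-and-restart pattern used by the hunks above. */
static int example_fixup(struct ctl_table *table, int *valp, int old)
{
	if (!rtnl_trylock())
		return -ERESTARTSYS;	/* caller restarts the syscall */

	/* ... propagate the new value under RTNL ... */

	rtnl_unlock();
	return 1;
}

static int example_sysctl_forward(struct ctl_table *ctl, int write, struct file *filp,
				  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int *valp = ctl->data;
	int old = *valp;
	int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos);

	if (write)
		ret = example_fixup(ctl, valp, old);
	return ret;
}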
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index fa2ac7ee662f..fbf533cc9dce 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -72,6 +72,10 @@ MODULE_LICENSE("GPL");
72static struct list_head inetsw6[SOCK_MAX]; 72static struct list_head inetsw6[SOCK_MAX];
73static DEFINE_SPINLOCK(inetsw6_lock); 73static DEFINE_SPINLOCK(inetsw6_lock);
74 74
75static int disable_ipv6 = 0;
76module_param_named(disable, disable_ipv6, int, 0);
77MODULE_PARM_DESC(disable, "Disable IPv6 such that it is non-functional");
78
75static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) 79static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk)
76{ 80{
77 const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); 81 const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo);
@@ -889,7 +893,7 @@ out_unlock:
889 return err; 893 return err;
890} 894}
891 895
892static struct packet_type ipv6_packet_type = { 896static struct packet_type ipv6_packet_type __read_mostly = {
893 .type = cpu_to_be16(ETH_P_IPV6), 897 .type = cpu_to_be16(ETH_P_IPV6),
894 .func = ipv6_rcv, 898 .func = ipv6_rcv,
895 .gso_send_check = ipv6_gso_send_check, 899 .gso_send_check = ipv6_gso_send_check,
@@ -1001,10 +1005,21 @@ static int __init inet6_init(void)
1001{ 1005{
1002 struct sk_buff *dummy_skb; 1006 struct sk_buff *dummy_skb;
1003 struct list_head *r; 1007 struct list_head *r;
1004 int err; 1008 int err = 0;
1005 1009
1006 BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb)); 1010 BUILD_BUG_ON(sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb));
1007 1011
1012 /* Register the socket-side information for inet6_create. */
1013 for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
1014 INIT_LIST_HEAD(r);
1015
1016 if (disable_ipv6) {
1017 printk(KERN_INFO
1018 "IPv6: Loaded, but administratively disabled, "
1019 "reboot required to enable\n");
1020 goto out;
1021 }
1022
1008 err = proto_register(&tcpv6_prot, 1); 1023 err = proto_register(&tcpv6_prot, 1);
1009 if (err) 1024 if (err)
1010 goto out; 1025 goto out;
@@ -1022,10 +1037,6 @@ static int __init inet6_init(void)
1022 goto out_unregister_udplite_proto; 1037 goto out_unregister_udplite_proto;
1023 1038
1024 1039
1025 /* Register the socket-side information for inet6_create. */
1026 for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r)
1027 INIT_LIST_HEAD(r);
1028
1029 /* We MUST register RAW sockets before we create the ICMP6, 1040 /* We MUST register RAW sockets before we create the ICMP6,
1030 * IGMP6, or NDISC control sockets. 1041 * IGMP6, or NDISC control sockets.
1031 */ 1042 */
@@ -1191,6 +1202,9 @@ module_init(inet6_init);
1191 1202
1192static void __exit inet6_exit(void) 1203static void __exit inet6_exit(void)
1193{ 1204{
1205 if (disable_ipv6)
1206 return;
1207
1194 /* First of all disallow new sockets creation. */ 1208 /* First of all disallow new sockets creation. */
1195 sock_unregister(PF_INET6); 1209 sock_unregister(PF_INET6);
1196 /* Disallow any further netlink messages */ 1210 /* Disallow any further netlink messages */
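The af_inet6.c hunk adds a load-time "disable" parameter so IPv6 can be built in yet left non-functional; with ipv6 built in it is typically set as ipv6.disable=1 on the kernel command line. A hedged sketch of the same module_param pattern for a hypothetical module:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static int disable_feature;
module_param_named(disable, disable_feature, int, 0);
MODULE_PARM_DESC(disable, "Disable the feature such that it is non-functional");

static int __init example_init(void)
{
	if (disable_feature) {
		printk(KERN_INFO "example: loaded, but administratively disabled\n");
		return 0;	/* stay loaded, register nothing */
	}
	/* ... normal registrations ... */
	return 0;
}

static void __exit example_exit(void)
{
	if (disable_feature)
		return;		/* nothing was registered */
	/* ... normal teardown ... */
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");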
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 8fe267feb81e..1bcc3431859e 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -258,11 +258,11 @@ unique:
258 258
259 if (twp != NULL) { 259 if (twp != NULL) {
260 *twp = tw; 260 *twp = tw;
261 NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITRECYCLED); 261 NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
262 } else if (tw != NULL) { 262 } else if (tw != NULL) {
263 /* Silly. Should hash-dance instead... */ 263 /* Silly. Should hash-dance instead... */
264 inet_twsk_deschedule(tw, death_row); 264 inet_twsk_deschedule(tw, death_row);
265 NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITRECYCLED); 265 NET_INC_STATS_BH(net, LINUX_MIB_TIMEWAITRECYCLED);
266 266
267 inet_twsk_put(tw); 267 inet_twsk_put(tw);
268 } 268 }
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 40f324655e24..d31df0f4bc9a 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -218,8 +218,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
218 if (opt) 218 if (opt)
219 sock_kfree_s(sk, opt, opt->tot_len); 219 sock_kfree_s(sk, opt, opt->tot_len);
220 pktopt = xchg(&np->pktoptions, NULL); 220 pktopt = xchg(&np->pktoptions, NULL);
221 if (pktopt) 221 kfree_skb(pktopt);
222 kfree_skb(pktopt);
223 222
224 sk->sk_destruct = inet_sock_destruct; 223 sk->sk_destruct = inet_sock_destruct;
225 /* 224 /*
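This hunk, like the later ones in tcp_ipv6.c, af_iucv.c, af_key.c and llc_conn.c, drops a redundant NULL test before kfree_skb(): kfree_skb() is already a no-op for a NULL skb, much as kfree() is for a NULL pointer. Illustrative helper (name hypothetical):

#include <linux/skbuff.h>

static void example_drop(struct sk_buff *skb)
{
	/* No "if (skb)" guard needed; kfree_skb(NULL) simply returns. */
	kfree_skb(skb);
}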
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 3cd83b85e9ef..9f061d1adbc2 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1095,11 +1095,7 @@ static void ndisc_ra_useropt(struct sk_buff *ra, struct nd_opt_hdr *opt)
1095 &ipv6_hdr(ra)->saddr); 1095 &ipv6_hdr(ra)->saddr);
1096 nlmsg_end(skb, nlh); 1096 nlmsg_end(skb, nlh);
1097 1097
1098 err = rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, 1098 rtnl_notify(skb, net, 0, RTNLGRP_ND_USEROPT, NULL, GFP_ATOMIC);
1099 GFP_ATOMIC);
1100 if (err < 0)
1101 goto errout;
1102
1103 return; 1099 return;
1104 1100
1105nla_put_failure: 1101nla_put_failure:
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
index 165b256a6fa0..41b8a956e1be 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
@@ -205,8 +205,9 @@ icmpv6_error(struct net *net, struct sk_buff *skb, unsigned int dataoff,
205 205
206 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && 206 if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
207 nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { 207 nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) {
208 nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, 208 if (LOG_INVALID(net, IPPROTO_ICMPV6))
209 "nf_ct_icmpv6: ICMPv6 checksum failed\n"); 209 nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL,
210 "nf_ct_icmpv6: ICMPv6 checksum failed ");
210 return -NF_ACCEPT; 211 return -NF_ACCEPT;
211 } 212 }
212 213
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index ed4d79a9e4a6..058a5e4a60c3 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -528,14 +528,14 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
528 if (!ipv6_ext_hdr(nexthdr)) { 528 if (!ipv6_ext_hdr(nexthdr)) {
529 return -1; 529 return -1;
530 } 530 }
531 if (len < (int)sizeof(struct ipv6_opt_hdr)) {
532 pr_debug("too short\n");
533 return -1;
534 }
535 if (nexthdr == NEXTHDR_NONE) { 531 if (nexthdr == NEXTHDR_NONE) {
536 pr_debug("next header is none\n"); 532 pr_debug("next header is none\n");
537 return -1; 533 return -1;
538 } 534 }
535 if (len < (int)sizeof(struct ipv6_opt_hdr)) {
536 pr_debug("too short\n");
537 return -1;
538 }
539 if (skb_copy_bits(skb, start, &hdr, sizeof(hdr))) 539 if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
540 BUG(); 540 BUG();
541 if (nexthdr == NEXTHDR_AUTH) 541 if (nexthdr == NEXTHDR_AUTH)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 3c575118fca5..e9ac7a12f595 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -452,6 +452,7 @@ err:
452static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, 452static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
453 struct net_device *dev) 453 struct net_device *dev)
454{ 454{
455 struct net *net = container_of(fq->q.net, struct net, ipv6.frags);
455 struct sk_buff *fp, *head = fq->q.fragments; 456 struct sk_buff *fp, *head = fq->q.fragments;
456 int payload_len; 457 int payload_len;
457 unsigned int nhoff; 458 unsigned int nhoff;
@@ -551,8 +552,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
551 head->csum); 552 head->csum);
552 553
553 rcu_read_lock(); 554 rcu_read_lock();
554 IP6_INC_STATS_BH(dev_net(dev), 555 IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
555 __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
556 rcu_read_unlock(); 556 rcu_read_unlock();
557 fq->q.fragments = NULL; 557 fq->q.fragments = NULL;
558 return 1; 558 return 1;
@@ -566,8 +566,7 @@ out_oom:
566 printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n"); 566 printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n");
567out_fail: 567out_fail:
568 rcu_read_lock(); 568 rcu_read_lock();
569 IP6_INC_STATS_BH(dev_net(dev), 569 IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
570 __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
571 rcu_read_unlock(); 570 rcu_read_unlock();
572 return -1; 571 return -1;
573} 572}
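The reassembly.c hunk computes the struct net once with container_of() on the embedded netns_frags instead of calling dev_net(dev) in each statistics macro. container_of() walks back from a pointer to a member to the enclosing structure; a generic sketch with hypothetical types:

#include <linux/kernel.h>	/* container_of() */

struct frags_state {
	int high_thresh;
};

struct netns_example {
	struct frags_state frags;
};

static struct netns_example *example_from_frags(struct frags_state *f)
{
	/* Recover the enclosing object from the embedded member. */
	return container_of(f, struct netns_example, frags);
}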
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c3d486a3edad..1394ddb6e35c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2400,8 +2400,9 @@ void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2400 kfree_skb(skb); 2400 kfree_skb(skb);
2401 goto errout; 2401 goto errout;
2402 } 2402 }
2403 err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE, 2403 rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2404 info->nlh, gfp_any()); 2404 info->nlh, gfp_any());
2405 return;
2405errout: 2406errout:
2406 if (err < 0) 2407 if (err < 0)
2407 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); 2408 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index d3467e563f02..664ab82e03b2 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -188,9 +188,9 @@ static struct ip_tunnel * ipip6_tunnel_locate(struct net *net,
188 } 188 }
189 189
190 nt = netdev_priv(dev); 190 nt = netdev_priv(dev);
191 ipip6_tunnel_init(dev);
192 191
193 nt->parms = *parms; 192 nt->parms = *parms;
193 ipip6_tunnel_init(dev);
194 194
195 if (parms->i_flags & SIT_ISATAP) 195 if (parms->i_flags & SIT_ISATAP)
196 dev->priv_flags |= IFF_ISATAP; 196 dev->priv_flags |= IFF_ISATAP;
@@ -454,7 +454,7 @@ static int ipip6_err(struct sk_buff *skb, u32 info)
454 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) 454 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
455 goto out; 455 goto out;
456 456
457 if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) 457 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
458 t->err_count++; 458 t->err_count++;
459 else 459 else
460 t->err_count = 1; 460 t->err_count = 1;
@@ -658,7 +658,8 @@ static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
658 } 658 }
659 659
660 if (tunnel->err_count > 0) { 660 if (tunnel->err_count > 0) {
661 if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { 661 if (time_before(jiffies,
662 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
662 tunnel->err_count--; 663 tunnel->err_count--;
663 dst_link_failure(skb); 664 dst_link_failure(skb);
664 } else 665 } else
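The sit.c throttling hunks replace open-coded jiffies arithmetic with time_before(), which stays correct when the jiffies counter wraps. A small sketch of the idiom, with an illustrative timeout value:

#include <linux/jiffies.h>

#define EXAMPLE_ERR_TIMEO	(30 * HZ)	/* illustrative, not the tunnel constant */

static int example_still_throttled(unsigned long err_time)
{
	/* True while we are inside the window that started at err_time. */
	return time_before(jiffies, err_time + EXAMPLE_ERR_TIMEO);
}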
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 00f1269e11e9..4b5aa1854260 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -533,8 +533,7 @@ static inline void syn_flood_warning(struct sk_buff *skb)
533 533
534static void tcp_v6_reqsk_destructor(struct request_sock *req) 534static void tcp_v6_reqsk_destructor(struct request_sock *req)
535{ 535{
536 if (inet6_rsk(req)->pktopts) 536 kfree_skb(inet6_rsk(req)->pktopts);
537 kfree_skb(inet6_rsk(req)->pktopts);
538} 537}
539 538
540#ifdef CONFIG_TCP_MD5SIG 539#ifdef CONFIG_TCP_MD5SIG
@@ -1611,8 +1610,7 @@ ipv6_pktoptions:
1611 } 1610 }
1612 } 1611 }
1613 1612
1614 if (opt_skb) 1613 kfree_skb(opt_skb);
1615 kfree_skb(opt_skb);
1616 return 0; 1614 return 0;
1617} 1615}
1618 1616
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c
index 0e685b05496e..f417b77fa0e1 100644
--- a/net/ipv6/xfrm6_state.c
+++ b/net/ipv6/xfrm6_state.c
@@ -69,7 +69,7 @@ __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(void *p), int maxclass)
69 69
70 for (i = 0; i < n; i++) { 70 for (i = 0; i < n; i++) {
71 dst[count[class[i] - 1]++] = src[i]; 71 dst[count[class[i] - 1]++] = src[i];
72 src[i] = 0; 72 src[i] = NULL;
73 } 73 }
74 74
75 return 0; 75 return 0;
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 43d0ffc6d565..1627050e29fd 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1958,12 +1958,12 @@ static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = {
1958 1958
1959SOCKOPS_WRAP(ipx_dgram, PF_IPX); 1959SOCKOPS_WRAP(ipx_dgram, PF_IPX);
1960 1960
1961static struct packet_type ipx_8023_packet_type = { 1961static struct packet_type ipx_8023_packet_type __read_mostly = {
1962 .type = cpu_to_be16(ETH_P_802_3), 1962 .type = cpu_to_be16(ETH_P_802_3),
1963 .func = ipx_rcv, 1963 .func = ipx_rcv,
1964}; 1964};
1965 1965
1966static struct packet_type ipx_dix_packet_type = { 1966static struct packet_type ipx_dix_packet_type __read_mostly = {
1967 .type = cpu_to_be16(ETH_P_IPX), 1967 .type = cpu_to_be16(ETH_P_IPX),
1968 .func = ipx_rcv, 1968 .func = ipx_rcv,
1969}; 1969};
@@ -1975,15 +1975,15 @@ static struct notifier_block ipx_dev_notifier = {
1975extern struct datalink_proto *make_EII_client(void); 1975extern struct datalink_proto *make_EII_client(void);
1976extern void destroy_EII_client(struct datalink_proto *); 1976extern void destroy_EII_client(struct datalink_proto *);
1977 1977
1978static unsigned char ipx_8022_type = 0xE0; 1978static const unsigned char ipx_8022_type = 0xE0;
1979static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; 1979static const unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 };
1980static char ipx_EII_err_msg[] __initdata = 1980static const char ipx_EII_err_msg[] __initconst =
1981 KERN_CRIT "IPX: Unable to register with Ethernet II\n"; 1981 KERN_CRIT "IPX: Unable to register with Ethernet II\n";
1982static char ipx_8023_err_msg[] __initdata = 1982static const char ipx_8023_err_msg[] __initconst =
1983 KERN_CRIT "IPX: Unable to register with 802.3\n"; 1983 KERN_CRIT "IPX: Unable to register with 802.3\n";
1984static char ipx_llc_err_msg[] __initdata = 1984static const char ipx_llc_err_msg[] __initconst =
1985 KERN_CRIT "IPX: Unable to register with 802.2\n"; 1985 KERN_CRIT "IPX: Unable to register with 802.2\n";
1986static char ipx_snap_err_msg[] __initdata = 1986static const char ipx_snap_err_msg[] __initconst =
1987 KERN_CRIT "IPX: Unable to register with SNAP\n"; 1987 KERN_CRIT "IPX: Unable to register with SNAP\n";
1988 1988
1989static int __init ipx_init(void) 1989static int __init ipx_init(void)
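The ipx hunks (and similar ones in irda and llc below) mark registration-time globals __read_mostly and move const init-only strings from __initdata to __initconst. A hedged sketch of the same annotations on hypothetical objects:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>

/* Written once at registration, read per packet: keep it with read-mostly data. */
static struct packet_type example_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_802_3),
	/* .func = example_rcv, receive handler omitted in this sketch */
};

/* Const and only referenced from __init code, so __initconst lets the
 * section be discarded after boot. */
static const char example_err_msg[] __initconst =
	KERN_CRIT "example: unable to register packet handler\n";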
diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c
index ea319e3ddc18..bf92e1473447 100644
--- a/net/irda/irda_device.c
+++ b/net/irda/irda_device.c
@@ -149,13 +149,14 @@ int irda_device_is_receiving(struct net_device *dev)
149 149
150 IRDA_DEBUG(2, "%s()\n", __func__); 150 IRDA_DEBUG(2, "%s()\n", __func__);
151 151
152 if (!dev->do_ioctl) { 152 if (!dev->netdev_ops->ndo_do_ioctl) {
153 IRDA_ERROR("%s: do_ioctl not impl. by device driver\n", 153 IRDA_ERROR("%s: do_ioctl not impl. by device driver\n",
154 __func__); 154 __func__);
155 return -1; 155 return -1;
156 } 156 }
157 157
158 ret = dev->do_ioctl(dev, (struct ifreq *) &req, SIOCGRECEIVING); 158 ret = (dev->netdev_ops->ndo_do_ioctl)(dev, (struct ifreq *) &req,
159 SIOCGRECEIVING);
159 if (ret < 0) 160 if (ret < 0)
160 return ret; 161 return ret;
161 162
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index 05112be99569..724bcf951b80 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -45,6 +45,16 @@ static int irlan_eth_xmit(struct sk_buff *skb, struct net_device *dev);
45static void irlan_eth_set_multicast_list( struct net_device *dev); 45static void irlan_eth_set_multicast_list( struct net_device *dev);
46static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev); 46static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev);
47 47
48static const struct net_device_ops irlan_eth_netdev_ops = {
49 .ndo_open = irlan_eth_open,
50 .ndo_stop = irlan_eth_close,
51 .ndo_start_xmit = irlan_eth_xmit,
52 .ndo_get_stats = irlan_eth_get_stats,
53 .ndo_set_multicast_list = irlan_eth_set_multicast_list,
54 .ndo_change_mtu = eth_change_mtu,
55 .ndo_validate_addr = eth_validate_addr,
56};
57
48/* 58/*
49 * Function irlan_eth_setup (dev) 59 * Function irlan_eth_setup (dev)
50 * 60 *
@@ -53,14 +63,11 @@ static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev);
53 */ 63 */
54static void irlan_eth_setup(struct net_device *dev) 64static void irlan_eth_setup(struct net_device *dev)
55{ 65{
56 dev->open = irlan_eth_open; 66 ether_setup(dev);
57 dev->stop = irlan_eth_close; 67
58 dev->hard_start_xmit = irlan_eth_xmit; 68 dev->netdev_ops = &irlan_eth_netdev_ops;
59 dev->get_stats = irlan_eth_get_stats;
60 dev->set_multicast_list = irlan_eth_set_multicast_list;
61 dev->destructor = free_netdev; 69 dev->destructor = free_netdev;
62 70
63 ether_setup(dev);
64 71
65 /* 72 /*
66 * Lets do all queueing in IrTTP instead of this device driver. 73 * Lets do all queueing in IrTTP instead of this device driver.
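The irlan_eth.c hunk is part of the net_device_ops conversion: the old per-device callbacks (dev->open, dev->hard_start_xmit, ...) move into a single const ops table hung off dev->netdev_ops. A hedged sketch of a setup routine in the new style, with hypothetical handlers:

#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>

static int example_open(struct net_device *dev)  { return 0; }
static int example_close(struct net_device *dev) { return 0; }

static int example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	dev_kfree_skb(skb);	/* placeholder transmit path */
	return 0;
}

static const struct net_device_ops example_netdev_ops = {
	.ndo_open		= example_open,
	.ndo_stop		= example_close,
	.ndo_start_xmit		= example_xmit,
	.ndo_change_mtu		= eth_change_mtu,
	.ndo_validate_addr	= eth_validate_addr,
};

static void example_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->netdev_ops = &example_netdev_ops;
	dev->destructor = free_netdev;
}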
diff --git a/net/irda/irmod.c b/net/irda/irmod.c
index 1bb607f2f5c7..303a68d92731 100644
--- a/net/irda/irmod.c
+++ b/net/irda/irmod.c
@@ -55,7 +55,7 @@ EXPORT_SYMBOL(irda_debug);
55/* Packet type handler. 55/* Packet type handler.
56 * Tell the kernel how IrDA packets should be handled. 56 * Tell the kernel how IrDA packets should be handled.
57 */ 57 */
58static struct packet_type irda_packet_type = { 58static struct packet_type irda_packet_type __read_mostly = {
59 .type = cpu_to_be16(ETH_P_IRDA), 59 .type = cpu_to_be16(ETH_P_IRDA),
60 .func = irlap_driver_rcv, /* Packet type handler irlap_frame.c */ 60 .func = irlap_driver_rcv, /* Packet type handler irlap_frame.c */
61}; 61};
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index eb8a2a0b6eb7..49e786535dc8 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1171,8 +1171,7 @@ static void iucv_callback_txdone(struct iucv_path *path,
1171 1171
1172 spin_unlock_irqrestore(&list->lock, flags); 1172 spin_unlock_irqrestore(&list->lock, flags);
1173 1173
1174 if (this) 1174 kfree_skb(this);
1175 kfree_skb(this);
1176 } 1175 }
1177 BUG_ON(!this); 1176 BUG_ON(!this);
1178 1177
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 7dcbde3ea7d9..643c1be2d02e 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -313,8 +313,7 @@ static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
313 if (one_sk != NULL) 313 if (one_sk != NULL)
314 err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); 314 err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk);
315 315
316 if (skb2) 316 kfree_skb(skb2);
317 kfree_skb(skb2);
318 kfree_skb(skb); 317 kfree_skb(skb);
319 return err; 318 return err;
320} 319}
@@ -3573,8 +3572,7 @@ static int pfkey_sendmsg(struct kiocb *kiocb,
3573out: 3572out:
3574 if (err && hdr && pfkey_error(hdr, err, sk) == 0) 3573 if (err && hdr && pfkey_error(hdr, err, sk) == 0)
3575 err = 0; 3574 err = 0;
3576 if (skb) 3575 kfree_skb(skb);
3577 kfree_skb(skb);
3578 3576
3579 return err ? : len; 3577 return err ? : len;
3580} 3578}
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 56fd85ab358e..febae702685c 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -1118,11 +1118,11 @@ static const struct proto_ops llc_ui_ops = {
1118 .sendpage = sock_no_sendpage, 1118 .sendpage = sock_no_sendpage,
1119}; 1119};
1120 1120
1121static char llc_proc_err_msg[] __initdata = 1121static const char llc_proc_err_msg[] __initconst =
1122 KERN_CRIT "LLC: Unable to register the proc_fs entries\n"; 1122 KERN_CRIT "LLC: Unable to register the proc_fs entries\n";
1123static char llc_sysctl_err_msg[] __initdata = 1123static const char llc_sysctl_err_msg[] __initconst =
1124 KERN_CRIT "LLC: Unable to register the sysctl entries\n"; 1124 KERN_CRIT "LLC: Unable to register the sysctl entries\n";
1125static char llc_sock_err_msg[] __initdata = 1125static const char llc_sock_err_msg[] __initconst =
1126 KERN_CRIT "LLC: Unable to register the network family\n"; 1126 KERN_CRIT "LLC: Unable to register the network family\n";
1127 1127
1128static int __init llc2_init(void) 1128static int __init llc2_init(void)
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 5c6d89c6d51d..3477624a4906 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -332,8 +332,7 @@ int llc_conn_remove_acked_pdus(struct sock *sk, u8 nr, u16 *how_many_unacked)
332 332
333 for (i = 0; i < pdu_pos && i < q_len; i++) { 333 for (i = 0; i < pdu_pos && i < q_len; i++) {
334 skb = skb_dequeue(&llc->pdu_unack_q); 334 skb = skb_dequeue(&llc->pdu_unack_q);
335 if (skb) 335 kfree_skb(skb);
336 kfree_skb(skb);
337 nbr_acked++; 336 nbr_acked++;
338 } 337 }
339out: 338out:
diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
index a7fe1adc378d..ff4c0ab96a69 100644
--- a/net/llc/llc_core.c
+++ b/net/llc/llc_core.c
@@ -147,12 +147,12 @@ void llc_sap_close(struct llc_sap *sap)
147 kfree(sap); 147 kfree(sap);
148} 148}
149 149
150static struct packet_type llc_packet_type = { 150static struct packet_type llc_packet_type __read_mostly = {
151 .type = cpu_to_be16(ETH_P_802_2), 151 .type = cpu_to_be16(ETH_P_802_2),
152 .func = llc_rcv, 152 .func = llc_rcv,
153}; 153};
154 154
155static struct packet_type llc_tr_packet_type = { 155static struct packet_type llc_tr_packet_type __read_mostly = {
156 .type = cpu_to_be16(ETH_P_TR_802_2), 156 .type = cpu_to_be16(ETH_P_TR_802_2),
157 .func = llc_rcv, 157 .func = llc_rcv,
158}; 158};
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index 3503a3d21318..0e3ab88bb706 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -9,6 +9,7 @@ mac80211-y := \
9 wpa.o \ 9 wpa.o \
10 scan.o \ 10 scan.o \
11 ht.o agg-tx.o agg-rx.o \ 11 ht.o agg-tx.o agg-rx.o \
12 ibss.o \
12 mlme.o \ 13 mlme.o \
13 iface.o \ 14 iface.o \
14 rate.o \ 15 rate.o \
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 3112bfd441b6..a95affc94629 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -129,7 +129,6 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d
129 u8 dialog_token, u16 status, u16 policy, 129 u8 dialog_token, u16 status, u16 policy,
130 u16 buf_size, u16 timeout) 130 u16 buf_size, u16 timeout)
131{ 131{
132 struct ieee80211_if_sta *ifsta = &sdata->u.sta;
133 struct ieee80211_local *local = sdata->local; 132 struct ieee80211_local *local = sdata->local;
134 struct sk_buff *skb; 133 struct sk_buff *skb;
135 struct ieee80211_mgmt *mgmt; 134 struct ieee80211_mgmt *mgmt;
@@ -151,8 +150,9 @@ static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *d
151 if (sdata->vif.type == NL80211_IFTYPE_AP || 150 if (sdata->vif.type == NL80211_IFTYPE_AP ||
152 sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 151 sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
153 memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN); 152 memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
154 else 153 else if (sdata->vif.type == NL80211_IFTYPE_STATION)
155 memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); 154 memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
155
156 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | 156 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
157 IEEE80211_STYPE_ACTION); 157 IEEE80211_STYPE_ACTION);
158 158
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index 1232d9f01ca9..1df116d4d6e7 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -49,7 +49,6 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
49 u16 agg_size, u16 timeout) 49 u16 agg_size, u16 timeout)
50{ 50{
51 struct ieee80211_local *local = sdata->local; 51 struct ieee80211_local *local = sdata->local;
52 struct ieee80211_if_sta *ifsta = &sdata->u.sta;
53 struct sk_buff *skb; 52 struct sk_buff *skb;
54 struct ieee80211_mgmt *mgmt; 53 struct ieee80211_mgmt *mgmt;
55 u16 capab; 54 u16 capab;
@@ -69,8 +68,8 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata,
69 if (sdata->vif.type == NL80211_IFTYPE_AP || 68 if (sdata->vif.type == NL80211_IFTYPE_AP ||
70 sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 69 sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
71 memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN); 70 memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
72 else 71 else if (sdata->vif.type == NL80211_IFTYPE_STATION)
73 memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); 72 memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
74 73
75 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | 74 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
76 IEEE80211_STYPE_ACTION); 75 IEEE80211_STYPE_ACTION);
@@ -132,9 +131,24 @@ static int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
132 131
133 state = &sta->ampdu_mlme.tid_state_tx[tid]; 132 state = &sta->ampdu_mlme.tid_state_tx[tid];
134 133
135 if (local->hw.ampdu_queues) 134 if (local->hw.ampdu_queues) {
136 ieee80211_stop_queue(&local->hw, sta->tid_to_tx_q[tid]); 135 if (initiator) {
136 /*
137 * Stop the AC queue to avoid issues where we send
138 * unaggregated frames already before the delba.
139 */
140 ieee80211_stop_queue_by_reason(&local->hw,
141 local->hw.queues + sta->tid_to_tx_q[tid],
142 IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
143 }
137 144
145 /*
146 * Pretend the driver woke the queue, just in case
147 * it disabled it before the session was stopped.
148 */
149 ieee80211_wake_queue(
150 &local->hw, local->hw.queues + sta->tid_to_tx_q[tid]);
151 }
138 *state = HT_AGG_STATE_REQ_STOP_BA_MSK | 152 *state = HT_AGG_STATE_REQ_STOP_BA_MSK |
139 (initiator << HT_AGG_STATE_INITIATOR_SHIFT); 153 (initiator << HT_AGG_STATE_INITIATOR_SHIFT);
140 154
@@ -144,8 +158,6 @@ static int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
144 /* HW shall not deny going back to legacy */ 158 /* HW shall not deny going back to legacy */
145 if (WARN_ON(ret)) { 159 if (WARN_ON(ret)) {
146 *state = HT_AGG_STATE_OPERATIONAL; 160 *state = HT_AGG_STATE_OPERATIONAL;
147 if (local->hw.ampdu_queues)
148 ieee80211_wake_queue(&local->hw, sta->tid_to_tx_q[tid]);
149 } 161 }
150 162
151 return ret; 163 return ret;
@@ -189,14 +201,19 @@ static void sta_addba_resp_timer_expired(unsigned long data)
189 spin_unlock_bh(&sta->lock); 201 spin_unlock_bh(&sta->lock);
190} 202}
191 203
204static inline int ieee80211_ac_from_tid(int tid)
205{
206 return ieee802_1d_to_ac[tid & 7];
207}
208
192int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid) 209int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
193{ 210{
194 struct ieee80211_local *local = hw_to_local(hw); 211 struct ieee80211_local *local = hw_to_local(hw);
195 struct sta_info *sta; 212 struct sta_info *sta;
196 struct ieee80211_sub_if_data *sdata; 213 struct ieee80211_sub_if_data *sdata;
197 u16 start_seq_num;
198 u8 *state; 214 u8 *state;
199 int ret = 0; 215 int i, qn = -1, ret = 0;
216 u16 start_seq_num;
200 217
201 if (WARN_ON(!local->ops->ampdu_action)) 218 if (WARN_ON(!local->ops->ampdu_action))
202 return -EINVAL; 219 return -EINVAL;
@@ -209,6 +226,13 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
209 ra, tid); 226 ra, tid);
210#endif /* CONFIG_MAC80211_HT_DEBUG */ 227#endif /* CONFIG_MAC80211_HT_DEBUG */
211 228
229 if (hw->ampdu_queues && ieee80211_ac_from_tid(tid) == 0) {
230#ifdef CONFIG_MAC80211_HT_DEBUG
231 printk(KERN_DEBUG "rejecting on voice AC\n");
232#endif
233 return -EINVAL;
234 }
235
212 rcu_read_lock(); 236 rcu_read_lock();
213 237
214 sta = sta_info_get(local, ra); 238 sta = sta_info_get(local, ra);
@@ -217,7 +241,7 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
217 printk(KERN_DEBUG "Could not find the station\n"); 241 printk(KERN_DEBUG "Could not find the station\n");
218#endif 242#endif
219 ret = -ENOENT; 243 ret = -ENOENT;
220 goto exit; 244 goto unlock;
221 } 245 }
222 246
223 /* 247 /*
@@ -230,11 +254,13 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
230 sta->sdata->vif.type != NL80211_IFTYPE_AP_VLAN && 254 sta->sdata->vif.type != NL80211_IFTYPE_AP_VLAN &&
231 sta->sdata->vif.type != NL80211_IFTYPE_AP) { 255 sta->sdata->vif.type != NL80211_IFTYPE_AP) {
232 ret = -EINVAL; 256 ret = -EINVAL;
233 goto exit; 257 goto unlock;
234 } 258 }
235 259
236 spin_lock_bh(&sta->lock); 260 spin_lock_bh(&sta->lock);
237 261
262 sdata = sta->sdata;
263
238 /* we have tried too many times, receiver does not want A-MPDU */ 264 /* we have tried too many times, receiver does not want A-MPDU */
239 if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) { 265 if (sta->ampdu_mlme.addba_req_num[tid] > HT_AGG_MAX_RETRIES) {
240 ret = -EBUSY; 266 ret = -EBUSY;
@@ -252,6 +278,42 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
252 goto err_unlock_sta; 278 goto err_unlock_sta;
253 } 279 }
254 280
281 if (hw->ampdu_queues) {
282 spin_lock(&local->queue_stop_reason_lock);
283 /* reserve a new queue for this session */
284 for (i = 0; i < local->hw.ampdu_queues; i++) {
285 if (local->ampdu_ac_queue[i] < 0) {
286 qn = i;
287 local->ampdu_ac_queue[qn] =
288 ieee80211_ac_from_tid(tid);
289 break;
290 }
291 }
292 spin_unlock(&local->queue_stop_reason_lock);
293
294 if (qn < 0) {
295#ifdef CONFIG_MAC80211_HT_DEBUG
296 printk(KERN_DEBUG "BA request denied - "
297 "queue unavailable for tid %d\n", tid);
298#endif /* CONFIG_MAC80211_HT_DEBUG */
299 ret = -ENOSPC;
300 goto err_unlock_sta;
301 }
302
303 /*
304 * If we successfully allocate the session, we can't have
305 * anything going on on the queue this TID maps into, so
306 * stop it for now. This is a "virtual" stop using the same
307 * mechanism that drivers will use.
308 *
309 * XXX: queue up frames for this session in the sta_info
310 * struct instead to avoid hitting all other STAs.
311 */
312 ieee80211_stop_queue_by_reason(
313 &local->hw, hw->queues + qn,
314 IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
315 }
316
255 /* prepare A-MPDU MLME for Tx aggregation */ 317 /* prepare A-MPDU MLME for Tx aggregation */
256 sta->ampdu_mlme.tid_tx[tid] = 318 sta->ampdu_mlme.tid_tx[tid] =
257 kmalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC); 319 kmalloc(sizeof(struct tid_ampdu_tx), GFP_ATOMIC);
@@ -262,8 +324,9 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
262 tid); 324 tid);
263#endif 325#endif
264 ret = -ENOMEM; 326 ret = -ENOMEM;
265 goto err_unlock_sta; 327 goto err_return_queue;
266 } 328 }
329
267 /* Tx timer */ 330 /* Tx timer */
268 sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.function = 331 sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer.function =
269 sta_addba_resp_timer_expired; 332 sta_addba_resp_timer_expired;
@@ -271,49 +334,25 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
271 (unsigned long)&sta->timer_to_tid[tid]; 334 (unsigned long)&sta->timer_to_tid[tid];
272 init_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer); 335 init_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer);
273 336
274 if (hw->ampdu_queues) {
275 /* create a new queue for this aggregation */
276 ret = ieee80211_ht_agg_queue_add(local, sta, tid);
277
278 /* case no queue is available to aggregation
279 * don't switch to aggregation */
280 if (ret) {
281#ifdef CONFIG_MAC80211_HT_DEBUG
282 printk(KERN_DEBUG "BA request denied - "
283 "queue unavailable for tid %d\n", tid);
284#endif /* CONFIG_MAC80211_HT_DEBUG */
285 goto err_unlock_queue;
286 }
287 }
288 sdata = sta->sdata;
289
290 /* Ok, the Addba frame hasn't been sent yet, but if the driver calls the 337 /* Ok, the Addba frame hasn't been sent yet, but if the driver calls the
291 * call back right away, it must see that the flow has begun */ 338 * call back right away, it must see that the flow has begun */
292 *state |= HT_ADDBA_REQUESTED_MSK; 339 *state |= HT_ADDBA_REQUESTED_MSK;
293 340
294 /* This is slightly racy because the queue isn't stopped */
295 start_seq_num = sta->tid_seq[tid]; 341 start_seq_num = sta->tid_seq[tid];
296 342
297 ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_START, 343 ret = local->ops->ampdu_action(hw, IEEE80211_AMPDU_TX_START,
298 &sta->sta, tid, &start_seq_num); 344 &sta->sta, tid, &start_seq_num);
299 345
300 if (ret) { 346 if (ret) {
301 /* No need to requeue the packets in the agg queue, since we
302 * held the tx lock: no packet could be enqueued to the newly
303 * allocated queue */
304 if (hw->ampdu_queues)
305 ieee80211_ht_agg_queue_remove(local, sta, tid, 0);
306#ifdef CONFIG_MAC80211_HT_DEBUG 347#ifdef CONFIG_MAC80211_HT_DEBUG
307 printk(KERN_DEBUG "BA request denied - HW unavailable for" 348 printk(KERN_DEBUG "BA request denied - HW unavailable for"
308 " tid %d\n", tid); 349 " tid %d\n", tid);
309#endif /* CONFIG_MAC80211_HT_DEBUG */ 350#endif /* CONFIG_MAC80211_HT_DEBUG */
310 *state = HT_AGG_STATE_IDLE; 351 *state = HT_AGG_STATE_IDLE;
311 goto err_unlock_queue; 352 goto err_free;
312 } 353 }
354 sta->tid_to_tx_q[tid] = qn;
313 355
314 /* Will put all the packets in the new SW queue */
315 if (hw->ampdu_queues)
316 ieee80211_requeue(local, ieee802_1d_to_ac[tid]);
317 spin_unlock_bh(&sta->lock); 356 spin_unlock_bh(&sta->lock);
318 357
319 /* send an addBA request */ 358 /* send an addBA request */
@@ -322,7 +361,6 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
322 sta->ampdu_mlme.dialog_token_allocator; 361 sta->ampdu_mlme.dialog_token_allocator;
323 sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num; 362 sta->ampdu_mlme.tid_tx[tid]->ssn = start_seq_num;
324 363
325
326 ieee80211_send_addba_request(sta->sdata, ra, tid, 364 ieee80211_send_addba_request(sta->sdata, ra, tid,
327 sta->ampdu_mlme.tid_tx[tid]->dialog_token, 365 sta->ampdu_mlme.tid_tx[tid]->dialog_token,
328 sta->ampdu_mlme.tid_tx[tid]->ssn, 366 sta->ampdu_mlme.tid_tx[tid]->ssn,
@@ -334,15 +372,24 @@ int ieee80211_start_tx_ba_session(struct ieee80211_hw *hw, u8 *ra, u16 tid)
334#ifdef CONFIG_MAC80211_HT_DEBUG 372#ifdef CONFIG_MAC80211_HT_DEBUG
335 printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid); 373 printk(KERN_DEBUG "activated addBA response timer on tid %d\n", tid);
336#endif 374#endif
337 goto exit; 375 goto unlock;
338 376
339err_unlock_queue: 377 err_free:
340 kfree(sta->ampdu_mlme.tid_tx[tid]); 378 kfree(sta->ampdu_mlme.tid_tx[tid]);
341 sta->ampdu_mlme.tid_tx[tid] = NULL; 379 sta->ampdu_mlme.tid_tx[tid] = NULL;
342 ret = -EBUSY; 380 err_return_queue:
343err_unlock_sta: 381 if (qn >= 0) {
382 /* We failed, so start queue again right away. */
383 ieee80211_wake_queue_by_reason(hw, hw->queues + qn,
384 IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
385 /* give queue back to pool */
386 spin_lock(&local->queue_stop_reason_lock);
387 local->ampdu_ac_queue[qn] = -1;
388 spin_unlock(&local->queue_stop_reason_lock);
389 }
390 err_unlock_sta:
344 spin_unlock_bh(&sta->lock); 391 spin_unlock_bh(&sta->lock);
345exit: 392 unlock:
346 rcu_read_unlock(); 393 rcu_read_unlock();
347 return ret; 394 return ret;
348} 395}
@@ -375,7 +422,7 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid)
375 state = &sta->ampdu_mlme.tid_state_tx[tid]; 422 state = &sta->ampdu_mlme.tid_state_tx[tid];
376 spin_lock_bh(&sta->lock); 423 spin_lock_bh(&sta->lock);
377 424
378 if (!(*state & HT_ADDBA_REQUESTED_MSK)) { 425 if (WARN_ON(!(*state & HT_ADDBA_REQUESTED_MSK))) {
379#ifdef CONFIG_MAC80211_HT_DEBUG 426#ifdef CONFIG_MAC80211_HT_DEBUG
380 printk(KERN_DEBUG "addBA was not requested yet, state is %d\n", 427 printk(KERN_DEBUG "addBA was not requested yet, state is %d\n",
381 *state); 428 *state);
@@ -385,7 +432,8 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid)
385 return; 432 return;
386 } 433 }
387 434
388 WARN_ON_ONCE(*state & HT_ADDBA_DRV_READY_MSK); 435 if (WARN_ON(*state & HT_ADDBA_DRV_READY_MSK))
436 goto out;
389 437
390 *state |= HT_ADDBA_DRV_READY_MSK; 438 *state |= HT_ADDBA_DRV_READY_MSK;
391 439
@@ -393,9 +441,18 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u16 tid)
393#ifdef CONFIG_MAC80211_HT_DEBUG 441#ifdef CONFIG_MAC80211_HT_DEBUG
394 printk(KERN_DEBUG "Aggregation is on for tid %d \n", tid); 442 printk(KERN_DEBUG "Aggregation is on for tid %d \n", tid);
395#endif 443#endif
396 if (hw->ampdu_queues) 444 if (hw->ampdu_queues) {
397 ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]); 445 /*
446 * Wake up this queue, we stopped it earlier,
447 * this will in turn wake the entire AC.
448 */
449 ieee80211_wake_queue_by_reason(hw,
450 hw->queues + sta->tid_to_tx_q[tid],
451 IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
452 }
398 } 453 }
454
455 out:
399 spin_unlock_bh(&sta->lock); 456 spin_unlock_bh(&sta->lock);
400 rcu_read_unlock(); 457 rcu_read_unlock();
401} 458}
@@ -485,7 +542,6 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid)
485 struct ieee80211_local *local = hw_to_local(hw); 542 struct ieee80211_local *local = hw_to_local(hw);
486 struct sta_info *sta; 543 struct sta_info *sta;
487 u8 *state; 544 u8 *state;
488 int agg_queue;
489 545
490 if (tid >= STA_TID_NUM) { 546 if (tid >= STA_TID_NUM) {
491#ifdef CONFIG_MAC80211_HT_DEBUG 547#ifdef CONFIG_MAC80211_HT_DEBUG
@@ -527,19 +583,19 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_hw *hw, u8 *ra, u8 tid)
527 ieee80211_send_delba(sta->sdata, ra, tid, 583 ieee80211_send_delba(sta->sdata, ra, tid,
528 WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); 584 WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE);
529 585
530 if (hw->ampdu_queues) { 586 spin_lock_bh(&sta->lock);
531 agg_queue = sta->tid_to_tx_q[tid];
532 ieee80211_ht_agg_queue_remove(local, sta, tid, 1);
533 587
534 /* We just requeued the all the frames that were in the 588 if (*state & HT_AGG_STATE_INITIATOR_MSK &&
535 * removed queue, and since we might miss a softirq we do 589 hw->ampdu_queues) {
536 * netif_schedule_queue. ieee80211_wake_queue is not used 590 /*
537 * here as this queue is not necessarily stopped 591 * Wake up this queue, we stopped it earlier,
592 * this will in turn wake the entire AC.
538 */ 593 */
539 netif_schedule_queue(netdev_get_tx_queue(local->mdev, 594 ieee80211_wake_queue_by_reason(hw,
540 agg_queue)); 595 hw->queues + sta->tid_to_tx_q[tid],
596 IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
541 } 597 }
542 spin_lock_bh(&sta->lock); 598
543 *state = HT_AGG_STATE_IDLE; 599 *state = HT_AGG_STATE_IDLE;
544 sta->ampdu_mlme.addba_req_num[tid] = 0; 600 sta->ampdu_mlme.addba_req_num[tid] = 0;
545 kfree(sta->ampdu_mlme.tid_tx[tid]); 601 kfree(sta->ampdu_mlme.tid_tx[tid]);
@@ -613,12 +669,21 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
613#endif /* CONFIG_MAC80211_HT_DEBUG */ 669#endif /* CONFIG_MAC80211_HT_DEBUG */
614 if (le16_to_cpu(mgmt->u.action.u.addba_resp.status) 670 if (le16_to_cpu(mgmt->u.action.u.addba_resp.status)
615 == WLAN_STATUS_SUCCESS) { 671 == WLAN_STATUS_SUCCESS) {
672 u8 curstate = *state;
673
616 *state |= HT_ADDBA_RECEIVED_MSK; 674 *state |= HT_ADDBA_RECEIVED_MSK;
617 sta->ampdu_mlme.addba_req_num[tid] = 0;
618 675
619 if (*state == HT_AGG_STATE_OPERATIONAL && 676 if (hw->ampdu_queues && *state != curstate &&
620 local->hw.ampdu_queues) 677 *state == HT_AGG_STATE_OPERATIONAL) {
621 ieee80211_wake_queue(hw, sta->tid_to_tx_q[tid]); 678 /*
679 * Wake up this queue, we stopped it earlier,
680 * this will in turn wake the entire AC.
681 */
682 ieee80211_wake_queue_by_reason(hw,
683 hw->queues + sta->tid_to_tx_q[tid],
684 IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
685 }
686 sta->ampdu_mlme.addba_req_num[tid] = 0;
622 687
623 if (local->ops->ampdu_action) { 688 if (local->ops->ampdu_action) {
624 (void)local->ops->ampdu_action(hw, 689 (void)local->ops->ampdu_action(hw,
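The agg-tx.c changes reserve a hardware A-MPDU queue per session and pair every stop-by-reason with a wake on the same queue and reason, including on the error paths. A reduced sketch of that reserve/stop, wake/release discipline with hypothetical helpers (not mac80211's real API):

#include <linux/errno.h>

enum example_stop_reason { EXAMPLE_STOP_AGGREGATION };

int  example_reserve_queue(void);	/* returns -1 if no queue is free */
void example_release_queue(int qn);
void example_stop_queue(int qn, enum example_stop_reason r);
void example_wake_queue(int qn, enum example_stop_reason r);
int  example_driver_start_session(int qn);

static int example_start_session(void)
{
	int qn = example_reserve_queue();
	int ret;

	if (qn < 0)
		return -ENOSPC;

	/* Park the queue while the session is negotiated. */
	example_stop_queue(qn, EXAMPLE_STOP_AGGREGATION);

	ret = example_driver_start_session(qn);
	if (ret) {
		/* Error path undoes both the stop and the reservation. */
		example_wake_queue(qn, EXAMPLE_STOP_AGGREGATION);
		example_release_queue(qn);
		return ret;
	}
	return 0;
}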
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index c8d969be440b..58693e52d458 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -341,11 +341,15 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo)
341 sinfo->filled = STATION_INFO_INACTIVE_TIME | 341 sinfo->filled = STATION_INFO_INACTIVE_TIME |
342 STATION_INFO_RX_BYTES | 342 STATION_INFO_RX_BYTES |
343 STATION_INFO_TX_BYTES | 343 STATION_INFO_TX_BYTES |
344 STATION_INFO_RX_PACKETS |
345 STATION_INFO_TX_PACKETS |
344 STATION_INFO_TX_BITRATE; 346 STATION_INFO_TX_BITRATE;
345 347
346 sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); 348 sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx);
347 sinfo->rx_bytes = sta->rx_bytes; 349 sinfo->rx_bytes = sta->rx_bytes;
348 sinfo->tx_bytes = sta->tx_bytes; 350 sinfo->tx_bytes = sta->tx_bytes;
351 sinfo->rx_packets = sta->rx_packets;
352 sinfo->tx_packets = sta->tx_packets;
349 353
350 if (sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { 354 if (sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) {
351 sinfo->filled |= STATION_INFO_SIGNAL; 355 sinfo->filled |= STATION_INFO_SIGNAL;
@@ -447,7 +451,8 @@ static int ieee80211_config_beacon(struct ieee80211_sub_if_data *sdata,
447 * This is a kludge. beacon interval should really be part 451 * This is a kludge. beacon interval should really be part
448 * of the beacon information. 452 * of the beacon information.
449 */ 453 */
450 if (params->interval) { 454 if (params->interval && (sdata->local->hw.conf.beacon_int !=
455 params->interval)) {
451 sdata->local->hw.conf.beacon_int = params->interval; 456 sdata->local->hw.conf.beacon_int = params->interval;
452 err = ieee80211_hw_config(sdata->local, 457 err = ieee80211_hw_config(sdata->local,
453 IEEE80211_CONF_CHANGE_BEACON_INTERVAL); 458 IEEE80211_CONF_CHANGE_BEACON_INTERVAL);
@@ -1180,45 +1185,45 @@ static int set_mgmt_extra_ie_sta(struct ieee80211_sub_if_data *sdata,
1180 u8 subtype, u8 *ies, size_t ies_len) 1185 u8 subtype, u8 *ies, size_t ies_len)
1181{ 1186{
1182 struct ieee80211_local *local = sdata->local; 1187 struct ieee80211_local *local = sdata->local;
1183 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 1188 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
1184 1189
1185 switch (subtype) { 1190 switch (subtype) {
1186 case IEEE80211_STYPE_PROBE_REQ >> 4: 1191 case IEEE80211_STYPE_PROBE_REQ >> 4:
1187 if (local->ops->hw_scan) 1192 if (local->ops->hw_scan)
1188 break; 1193 break;
1189 kfree(ifsta->ie_probereq); 1194 kfree(ifmgd->ie_probereq);
1190 ifsta->ie_probereq = ies; 1195 ifmgd->ie_probereq = ies;
1191 ifsta->ie_probereq_len = ies_len; 1196 ifmgd->ie_probereq_len = ies_len;
1192 return 0; 1197 return 0;
1193 case IEEE80211_STYPE_PROBE_RESP >> 4: 1198 case IEEE80211_STYPE_PROBE_RESP >> 4:
1194 kfree(ifsta->ie_proberesp); 1199 kfree(ifmgd->ie_proberesp);
1195 ifsta->ie_proberesp = ies; 1200 ifmgd->ie_proberesp = ies;
1196 ifsta->ie_proberesp_len = ies_len; 1201 ifmgd->ie_proberesp_len = ies_len;
1197 return 0; 1202 return 0;
1198 case IEEE80211_STYPE_AUTH >> 4: 1203 case IEEE80211_STYPE_AUTH >> 4:
1199 kfree(ifsta->ie_auth); 1204 kfree(ifmgd->ie_auth);
1200 ifsta->ie_auth = ies; 1205 ifmgd->ie_auth = ies;
1201 ifsta->ie_auth_len = ies_len; 1206 ifmgd->ie_auth_len = ies_len;
1202 return 0; 1207 return 0;
1203 case IEEE80211_STYPE_ASSOC_REQ >> 4: 1208 case IEEE80211_STYPE_ASSOC_REQ >> 4:
1204 kfree(ifsta->ie_assocreq); 1209 kfree(ifmgd->ie_assocreq);
1205 ifsta->ie_assocreq = ies; 1210 ifmgd->ie_assocreq = ies;
1206 ifsta->ie_assocreq_len = ies_len; 1211 ifmgd->ie_assocreq_len = ies_len;
1207 return 0; 1212 return 0;
1208 case IEEE80211_STYPE_REASSOC_REQ >> 4: 1213 case IEEE80211_STYPE_REASSOC_REQ >> 4:
1209 kfree(ifsta->ie_reassocreq); 1214 kfree(ifmgd->ie_reassocreq);
1210 ifsta->ie_reassocreq = ies; 1215 ifmgd->ie_reassocreq = ies;
1211 ifsta->ie_reassocreq_len = ies_len; 1216 ifmgd->ie_reassocreq_len = ies_len;
1212 return 0; 1217 return 0;
1213 case IEEE80211_STYPE_DEAUTH >> 4: 1218 case IEEE80211_STYPE_DEAUTH >> 4:
1214 kfree(ifsta->ie_deauth); 1219 kfree(ifmgd->ie_deauth);
1215 ifsta->ie_deauth = ies; 1220 ifmgd->ie_deauth = ies;
1216 ifsta->ie_deauth_len = ies_len; 1221 ifmgd->ie_deauth_len = ies_len;
1217 return 0; 1222 return 0;
1218 case IEEE80211_STYPE_DISASSOC >> 4: 1223 case IEEE80211_STYPE_DISASSOC >> 4:
1219 kfree(ifsta->ie_disassoc); 1224 kfree(ifmgd->ie_disassoc);
1220 ifsta->ie_disassoc = ies; 1225 ifmgd->ie_disassoc = ies;
1221 ifsta->ie_disassoc_len = ies_len; 1226 ifmgd->ie_disassoc_len = ies_len;
1222 return 0; 1227 return 0;
1223 } 1228 }
1224 1229
@@ -1248,7 +1253,6 @@ static int ieee80211_set_mgmt_extra_ie(struct wiphy *wiphy,
1248 1253
1249 switch (sdata->vif.type) { 1254 switch (sdata->vif.type) {
1250 case NL80211_IFTYPE_STATION: 1255 case NL80211_IFTYPE_STATION:
1251 case NL80211_IFTYPE_ADHOC:
1252 ret = set_mgmt_extra_ie_sta(sdata, params->subtype, 1256 ret = set_mgmt_extra_ie_sta(sdata, params->subtype,
1253 ies, ies_len); 1257 ies, ies_len);
1254 break; 1258 break;
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index c54219301724..e3420329f4e6 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -94,31 +94,31 @@ IEEE80211_IF_FILE(drop_unencrypted, drop_unencrypted, DEC);
94IEEE80211_IF_FILE(force_unicast_rateidx, force_unicast_rateidx, DEC); 94IEEE80211_IF_FILE(force_unicast_rateidx, force_unicast_rateidx, DEC);
95IEEE80211_IF_FILE(max_ratectrl_rateidx, max_ratectrl_rateidx, DEC); 95IEEE80211_IF_FILE(max_ratectrl_rateidx, max_ratectrl_rateidx, DEC);
96 96
97/* STA/IBSS attributes */ 97/* STA attributes */
98IEEE80211_IF_FILE(state, u.sta.state, DEC); 98IEEE80211_IF_FILE(state, u.mgd.state, DEC);
99IEEE80211_IF_FILE(bssid, u.sta.bssid, MAC); 99IEEE80211_IF_FILE(bssid, u.mgd.bssid, MAC);
100IEEE80211_IF_FILE(prev_bssid, u.sta.prev_bssid, MAC); 100IEEE80211_IF_FILE(prev_bssid, u.mgd.prev_bssid, MAC);
101IEEE80211_IF_FILE(ssid_len, u.sta.ssid_len, SIZE); 101IEEE80211_IF_FILE(ssid_len, u.mgd.ssid_len, SIZE);
102IEEE80211_IF_FILE(aid, u.sta.aid, DEC); 102IEEE80211_IF_FILE(aid, u.mgd.aid, DEC);
103IEEE80211_IF_FILE(ap_capab, u.sta.ap_capab, HEX); 103IEEE80211_IF_FILE(ap_capab, u.mgd.ap_capab, HEX);
104IEEE80211_IF_FILE(capab, u.sta.capab, HEX); 104IEEE80211_IF_FILE(capab, u.mgd.capab, HEX);
105IEEE80211_IF_FILE(extra_ie_len, u.sta.extra_ie_len, SIZE); 105IEEE80211_IF_FILE(extra_ie_len, u.mgd.extra_ie_len, SIZE);
106IEEE80211_IF_FILE(auth_tries, u.sta.auth_tries, DEC); 106IEEE80211_IF_FILE(auth_tries, u.mgd.auth_tries, DEC);
107IEEE80211_IF_FILE(assoc_tries, u.sta.assoc_tries, DEC); 107IEEE80211_IF_FILE(assoc_tries, u.mgd.assoc_tries, DEC);
108IEEE80211_IF_FILE(auth_algs, u.sta.auth_algs, HEX); 108IEEE80211_IF_FILE(auth_algs, u.mgd.auth_algs, HEX);
109IEEE80211_IF_FILE(auth_alg, u.sta.auth_alg, DEC); 109IEEE80211_IF_FILE(auth_alg, u.mgd.auth_alg, DEC);
110IEEE80211_IF_FILE(auth_transaction, u.sta.auth_transaction, DEC); 110IEEE80211_IF_FILE(auth_transaction, u.mgd.auth_transaction, DEC);
111 111
112static ssize_t ieee80211_if_fmt_flags( 112static ssize_t ieee80211_if_fmt_flags(
113 const struct ieee80211_sub_if_data *sdata, char *buf, int buflen) 113 const struct ieee80211_sub_if_data *sdata, char *buf, int buflen)
114{ 114{
115 return scnprintf(buf, buflen, "%s%s%s%s%s%s%s\n", 115 return scnprintf(buf, buflen, "%s%s%s%s%s%s%s\n",
116 sdata->u.sta.flags & IEEE80211_STA_SSID_SET ? "SSID\n" : "", 116 sdata->u.mgd.flags & IEEE80211_STA_SSID_SET ? "SSID\n" : "",
117 sdata->u.sta.flags & IEEE80211_STA_BSSID_SET ? "BSSID\n" : "", 117 sdata->u.mgd.flags & IEEE80211_STA_BSSID_SET ? "BSSID\n" : "",
118 sdata->u.sta.flags & IEEE80211_STA_PREV_BSSID_SET ? "prev BSSID\n" : "", 118 sdata->u.mgd.flags & IEEE80211_STA_PREV_BSSID_SET ? "prev BSSID\n" : "",
119 sdata->u.sta.flags & IEEE80211_STA_AUTHENTICATED ? "AUTH\n" : "", 119 sdata->u.mgd.flags & IEEE80211_STA_AUTHENTICATED ? "AUTH\n" : "",
120 sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED ? "ASSOC\n" : "", 120 sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED ? "ASSOC\n" : "",
121 sdata->u.sta.flags & IEEE80211_STA_PROBEREQ_POLL ? "PROBEREQ POLL\n" : "", 121 sdata->u.mgd.flags & IEEE80211_STA_PROBEREQ_POLL ? "PROBEREQ POLL\n" : "",
122 sdata->vif.bss_conf.use_cts_prot ? "CTS prot\n" : ""); 122 sdata->vif.bss_conf.use_cts_prot ? "CTS prot\n" : "");
123} 123}
124__IEEE80211_IF_FILE(flags); 124__IEEE80211_IF_FILE(flags);
@@ -283,9 +283,11 @@ static void add_files(struct ieee80211_sub_if_data *sdata)
283#endif 283#endif
284 break; 284 break;
285 case NL80211_IFTYPE_STATION: 285 case NL80211_IFTYPE_STATION:
286 case NL80211_IFTYPE_ADHOC:
287 add_sta_files(sdata); 286 add_sta_files(sdata);
288 break; 287 break;
288 case NL80211_IFTYPE_ADHOC:
289 /* XXX */
290 break;
289 case NL80211_IFTYPE_AP: 291 case NL80211_IFTYPE_AP:
290 add_ap_files(sdata); 292 add_ap_files(sdata);
291 break; 293 break;
@@ -418,9 +420,11 @@ static void del_files(struct ieee80211_sub_if_data *sdata)
418#endif 420#endif
419 break; 421 break;
420 case NL80211_IFTYPE_STATION: 422 case NL80211_IFTYPE_STATION:
421 case NL80211_IFTYPE_ADHOC:
422 del_sta_files(sdata); 423 del_sta_files(sdata);
423 break; 424 break;
425 case NL80211_IFTYPE_ADHOC:
426 /* XXX */
427 break;
424 case NL80211_IFTYPE_AP: 428 case NL80211_IFTYPE_AP:
425 del_ap_files(sdata); 429 del_ap_files(sdata);
426 break; 430 break;
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 82ea0b63a386..4e3c72f20de7 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -17,6 +17,7 @@
17#include <net/wireless.h> 17#include <net/wireless.h>
18#include <net/mac80211.h> 18#include <net/mac80211.h>
19#include "ieee80211_i.h" 19#include "ieee80211_i.h"
20#include "rate.h"
20 21
21void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband, 22void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband,
22 struct ieee80211_ht_cap *ht_cap_ie, 23 struct ieee80211_ht_cap *ht_cap_ie,
@@ -93,7 +94,9 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata,
93{ 94{
94 struct ieee80211_local *local = sdata->local; 95 struct ieee80211_local *local = sdata->local;
95 struct ieee80211_supported_band *sband; 96 struct ieee80211_supported_band *sband;
97 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
96 struct ieee80211_bss_ht_conf ht; 98 struct ieee80211_bss_ht_conf ht;
99 struct sta_info *sta;
97 u32 changed = 0; 100 u32 changed = 0;
98 bool enable_ht = true, ht_changed; 101 bool enable_ht = true, ht_changed;
99 enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT; 102 enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT;
@@ -136,6 +139,16 @@ u32 ieee80211_enable_ht(struct ieee80211_sub_if_data *sdata,
136 if (ht_changed) { 139 if (ht_changed) {
137 /* channel_type change automatically detected */ 140 /* channel_type change automatically detected */
138 ieee80211_hw_config(local, 0); 141 ieee80211_hw_config(local, 0);
142
143 rcu_read_lock();
144
145 sta = sta_info_get(local, ifmgd->bssid);
146 if (sta)
147 rate_control_rate_update(local, sband, sta,
148 IEEE80211_RC_HT_CHANGED);
149
150 rcu_read_unlock();
151
139 } 152 }
140 153
141 /* disable HT */ 154 /* disable HT */
@@ -169,7 +182,6 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
169 u16 initiator, u16 reason_code) 182 u16 initiator, u16 reason_code)
170{ 183{
171 struct ieee80211_local *local = sdata->local; 184 struct ieee80211_local *local = sdata->local;
172 struct ieee80211_if_sta *ifsta = &sdata->u.sta;
173 struct sk_buff *skb; 185 struct sk_buff *skb;
174 struct ieee80211_mgmt *mgmt; 186 struct ieee80211_mgmt *mgmt;
175 u16 params; 187 u16 params;
@@ -190,8 +202,9 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
190 if (sdata->vif.type == NL80211_IFTYPE_AP || 202 if (sdata->vif.type == NL80211_IFTYPE_AP ||
191 sdata->vif.type == NL80211_IFTYPE_AP_VLAN) 203 sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
192 memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN); 204 memcpy(mgmt->bssid, sdata->dev->dev_addr, ETH_ALEN);
193 else 205 else if (sdata->vif.type == NL80211_IFTYPE_STATION)
194 memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); 206 memcpy(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN);
207
195 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | 208 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
196 IEEE80211_STYPE_ACTION); 209 IEEE80211_STYPE_ACTION);
197 210
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
new file mode 100644
index 000000000000..f4becc12904e
--- /dev/null
+++ b/net/mac80211/ibss.c
@@ -0,0 +1,907 @@
1/*
2 * IBSS mode implementation
3 * Copyright 2003-2008, Jouni Malinen <j@w1.fi>
4 * Copyright 2004, Instant802 Networks, Inc.
5 * Copyright 2005, Devicescape Software, Inc.
6 * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
7 * Copyright 2007, Michael Wu <flamingice@sourmilk.net>
8 * Copyright 2009, Johannes Berg <johannes@sipsolutions.net>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 */
14
15#include <linux/delay.h>
16#include <linux/if_ether.h>
17#include <linux/skbuff.h>
18#include <linux/if_arp.h>
19#include <linux/etherdevice.h>
20#include <linux/rtnetlink.h>
21#include <net/mac80211.h>
22#include <asm/unaligned.h>
23
24#include "ieee80211_i.h"
25#include "rate.h"
26
27#define IEEE80211_SCAN_INTERVAL (2 * HZ)
28#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ)
29#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ)
30
31#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ)
32#define IEEE80211_IBSS_MERGE_DELAY 0x400000
33#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ)
34
35#define IEEE80211_IBSS_MAX_STA_ENTRIES 128
36
37
38static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata,
39 struct ieee80211_mgmt *mgmt,
40 size_t len)
41{
42 u16 auth_alg, auth_transaction, status_code;
43
44 if (len < 24 + 6)
45 return;
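 /*
  * For illustration: the 24 + 6 bound corresponds to the 24-byte
  * 802.11 management header plus the three fixed little-endian u16
  * fields read below (auth_alg + auth_transaction + status_code =
  * 2 + 2 + 2 = 6 bytes) that precede any optional IEs.
  */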
46
47 auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
48 auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
49 status_code = le16_to_cpu(mgmt->u.auth.status_code);
50
51 /*
52 * The IEEE 802.11 standard does not require authentication in IBSS
53 * networks and most implementations do not seem to use it.
54 * However, try to reply to authentication attempts if someone
55 * has actually implemented this.
56 */
57 if (auth_alg == WLAN_AUTH_OPEN && auth_transaction == 1)
58 ieee80211_send_auth(sdata, 2, WLAN_AUTH_OPEN, NULL, 0,
59 sdata->u.ibss.bssid, 0);
60}
61
62static int __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
63 const u8 *bssid, const int beacon_int,
64 const int freq,
65 const size_t supp_rates_len,
66 const u8 *supp_rates,
67 const u16 capability, u64 tsf)
68{
69 struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
70 struct ieee80211_local *local = sdata->local;
71 int res = 0, rates, i, j;
72 struct sk_buff *skb;
73 struct ieee80211_mgmt *mgmt;
74 u8 *pos;
75 struct ieee80211_supported_band *sband;
76 union iwreq_data wrqu;
77
78 if (local->ops->reset_tsf) {
79 /* Reset own TSF to allow time synchronization work. */
80 local->ops->reset_tsf(local_to_hw(local));
81 }
82
83 if ((ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET) &&
84 memcmp(ifibss->bssid, bssid, ETH_ALEN) == 0)
85 return res;
86
87 skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400);
88 if (!skb) {
89 printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
90 "response\n", sdata->dev->name);
91 return -ENOMEM;
92 }
93
94 if (!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET)) {
95 /* Remove possible STA entries from other IBSS networks. */
96 sta_info_flush_delayed(sdata);
97 }
98
99 memcpy(ifibss->bssid, bssid, ETH_ALEN);
100 res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID);
101 if (res)
102 return res;
103
104 local->hw.conf.beacon_int = beacon_int >= 10 ? beacon_int : 10;
105
106 sdata->drop_unencrypted = capability &
107 WLAN_CAPABILITY_PRIVACY ? 1 : 0;
108
109 res = ieee80211_set_freq(sdata, freq);
110
111 if (res)
112 return res;
113
114 sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
115
116 /* Build IBSS probe response */
117
118 skb_reserve(skb, local->hw.extra_tx_headroom);
119
120 mgmt = (struct ieee80211_mgmt *)
121 skb_put(skb, 24 + sizeof(mgmt->u.beacon));
122 memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon));
123 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
124 IEEE80211_STYPE_PROBE_RESP);
125 memset(mgmt->da, 0xff, ETH_ALEN);
126 memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
127 memcpy(mgmt->bssid, ifibss->bssid, ETH_ALEN);
128 mgmt->u.beacon.beacon_int =
129 cpu_to_le16(local->hw.conf.beacon_int);
130 mgmt->u.beacon.timestamp = cpu_to_le64(tsf);
131 mgmt->u.beacon.capab_info = cpu_to_le16(capability);
132
133 pos = skb_put(skb, 2 + ifibss->ssid_len);
134 *pos++ = WLAN_EID_SSID;
135 *pos++ = ifibss->ssid_len;
136 memcpy(pos, ifibss->ssid, ifibss->ssid_len);
137
138 rates = supp_rates_len;
139 if (rates > 8)
140 rates = 8;
141 pos = skb_put(skb, 2 + rates);
142 *pos++ = WLAN_EID_SUPP_RATES;
143 *pos++ = rates;
144 memcpy(pos, supp_rates, rates);
145
146 if (sband->band == IEEE80211_BAND_2GHZ) {
147 pos = skb_put(skb, 2 + 1);
148 *pos++ = WLAN_EID_DS_PARAMS;
149 *pos++ = 1;
150 *pos++ = ieee80211_frequency_to_channel(freq);
151 }
152
153 pos = skb_put(skb, 2 + 2);
154 *pos++ = WLAN_EID_IBSS_PARAMS;
155 *pos++ = 2;
156 /* FIX: set ATIM window based on scan results */
157 *pos++ = 0;
158 *pos++ = 0;
159
160 if (supp_rates_len > 8) {
161 rates = supp_rates_len - 8;
162 pos = skb_put(skb, 2 + rates);
163 *pos++ = WLAN_EID_EXT_SUPP_RATES;
164 *pos++ = rates;
165 memcpy(pos, &supp_rates[8], rates);
166 }
167
168 ifibss->probe_resp = skb;
169
170 ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON |
171 IEEE80211_IFCC_BEACON_ENABLED);
172
173
174 rates = 0;
175 for (i = 0; i < supp_rates_len; i++) {
176 int bitrate = (supp_rates[i] & 0x7f) * 5;
177 for (j = 0; j < sband->n_bitrates; j++)
178 if (sband->bitrates[j].bitrate == bitrate)
179 rates |= BIT(j);
180 }
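 /*
  * Example, assuming the usual rate units: an IE byte of 0x82
  * (basic-rate flag 0x80 set, value 2 in 500 kb/s steps) masks to
  * 0x02 and scales to bitrate 10, i.e. 1.0 Mb/s in the 100 kb/s
  * units used by sband->bitrates[].bitrate.
  */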
181
182 ieee80211_sta_def_wmm_params(sdata, supp_rates_len, supp_rates);
183
184 ifibss->flags |= IEEE80211_IBSS_PREV_BSSID_SET;
185 ifibss->state = IEEE80211_IBSS_MLME_JOINED;
186 mod_timer(&ifibss->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
187
188 memset(&wrqu, 0, sizeof(wrqu));
189 memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN);
190 wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
191
192 return res;
193}
194
195static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
196 struct ieee80211_bss *bss)
197{
198 return __ieee80211_sta_join_ibss(sdata,
199 bss->cbss.bssid,
200 bss->cbss.beacon_interval,
201 bss->cbss.channel->center_freq,
202 bss->supp_rates_len, bss->supp_rates,
203 bss->cbss.capability,
204 bss->cbss.tsf);
205}
206
207static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
208 struct ieee80211_mgmt *mgmt,
209 size_t len,
210 struct ieee80211_rx_status *rx_status,
211 struct ieee802_11_elems *elems,
212 bool beacon)
213{
214 struct ieee80211_local *local = sdata->local;
215 int freq;
216 struct ieee80211_bss *bss;
217 struct sta_info *sta;
218 struct ieee80211_channel *channel;
219 u64 beacon_timestamp, rx_timestamp;
220 u32 supp_rates = 0;
221 enum ieee80211_band band = rx_status->band;
222
223 if (elems->ds_params && elems->ds_params_len == 1)
224 freq = ieee80211_channel_to_frequency(elems->ds_params[0]);
225 else
226 freq = rx_status->freq;
227
228 channel = ieee80211_get_channel(local->hw.wiphy, freq);
229
230 if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
231 return;
232
233 if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates &&
234 memcmp(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) {
235 supp_rates = ieee80211_sta_get_rates(local, elems, band);
236
237 rcu_read_lock();
238
239 sta = sta_info_get(local, mgmt->sa);
240 if (sta) {
241 u32 prev_rates;
242
243 prev_rates = sta->sta.supp_rates[band];
244 /* make sure mandatory rates are always added */
245 sta->sta.supp_rates[band] = supp_rates |
246 ieee80211_mandatory_rates(local, band);
247
248#ifdef CONFIG_MAC80211_IBSS_DEBUG
249 if (sta->sta.supp_rates[band] != prev_rates)
250 printk(KERN_DEBUG "%s: updated supp_rates set "
251 "for %pM based on beacon info (0x%llx | "
252 "0x%llx -> 0x%llx)\n",
253 sdata->dev->name,
254 sta->sta.addr,
255 (unsigned long long) prev_rates,
256 (unsigned long long) supp_rates,
257 (unsigned long long) sta->sta.supp_rates[band]);
258#endif
259 } else
260 ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates);
261
262 rcu_read_unlock();
263 }
264
265 bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
266 channel, beacon);
267 if (!bss)
268 return;
269
270 /* was just updated in ieee80211_bss_info_update */
271 beacon_timestamp = bss->cbss.tsf;
272
273 /* check if we need to merge IBSS */
274
275 /* merge only on beacons (???) */
276 if (!beacon)
277 goto put_bss;
278
279 /* we use a fixed BSSID */
280 if (sdata->u.ibss.flags & IEEE80211_IBSS_BSSID_SET)
281 goto put_bss;
282
283 /* not an IBSS */
284 if (!(bss->cbss.capability & WLAN_CAPABILITY_IBSS))
285 goto put_bss;
286
287 /* different channel */
288 if (bss->cbss.channel != local->oper_channel)
289 goto put_bss;
290
291 /* different SSID */
292 if (elems->ssid_len != sdata->u.ibss.ssid_len ||
293 memcmp(elems->ssid, sdata->u.ibss.ssid,
294 sdata->u.ibss.ssid_len))
295 goto put_bss;
296
297 /* same BSSID */
298 if (memcmp(bss->cbss.bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0)
299 goto put_bss;
300
301 if (rx_status->flag & RX_FLAG_TSFT) {
302 /*
303 * For correct IBSS merging we need mactime; since mactime is
304 * defined as the time the first data symbol of the frame hits
305 * the PHY, and the timestamp of the beacon is defined as "the
306 * time that the data symbol containing the first bit of the
307 * timestamp is transmitted to the PHY plus the transmitting
308 * STA's delays through its local PHY from the MAC-PHY
309 * interface to its interface with the WM" (802.11 11.1.2)
310 * - equals the time this bit arrives at the receiver - we have
311 * to take into account the offset between the two.
312 *
313 * E.g. at 1 MBit that means mactime is 192 usec earlier
314 * (=24 bytes * 8 usecs/byte) than the beacon timestamp.
315 */
316 int rate;
317
318 if (rx_status->flag & RX_FLAG_HT)
319 rate = 65; /* TODO: HT rates */
320 else
321 rate = local->hw.wiphy->bands[band]->
322 bitrates[rx_status->rate_idx].bitrate;
323
324 rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate);
325 } else if (local && local->ops && local->ops->get_tsf)
326 /* second best option: get current TSF */
327 rx_timestamp = local->ops->get_tsf(local_to_hw(local));
328 else
329 /* can't merge without knowing the TSF */
330 rx_timestamp = -1LLU;
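 /*
  * Worked example for the offset above (bitrate in 100 kb/s units):
  * the 24 header bytes in front of the beacon timestamp field take
  * 24 * 8 * 10 / rate usec, i.e. 192 usec at 1 Mb/s (rate == 10) as
  * noted in the comment, and 1920 / 540 = 3 usec at 54 Mb/s.
  */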
331
332#ifdef CONFIG_MAC80211_IBSS_DEBUG
333 printk(KERN_DEBUG "RX beacon SA=%pM BSSID="
334 "%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n",
335 mgmt->sa, mgmt->bssid,
336 (unsigned long long)rx_timestamp,
337 (unsigned long long)beacon_timestamp,
338 (unsigned long long)(rx_timestamp - beacon_timestamp),
339 jiffies);
340#endif
341
342 /* give slow hardware some time to do the TSF sync */
343 if (rx_timestamp < IEEE80211_IBSS_MERGE_DELAY)
344 goto put_bss;
345
346 if (beacon_timestamp > rx_timestamp) {
347#ifdef CONFIG_MAC80211_IBSS_DEBUG
348 printk(KERN_DEBUG "%s: beacon TSF higher than "
349 "local TSF - IBSS merge with BSSID %pM\n",
350 sdata->dev->name, mgmt->bssid);
351#endif
352 ieee80211_sta_join_ibss(sdata, bss);
353 ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates);
354 }
355
356 put_bss:
357 ieee80211_rx_bss_put(local, bss);
358}
359
360/*
361 * Add a new IBSS station; this will also be called by the RX code
362 * when, in IBSS mode, a frame is received from a yet-unknown station,
363 * hence it must be callable in atomic context.
364 */
365struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
366					u8 *bssid, u8 *addr, u32 supp_rates)
367{
368 struct ieee80211_local *local = sdata->local;
369 struct sta_info *sta;
370 int band = local->hw.conf.channel->band;
371
372 /* TODO: Could consider removing the least recently used entry and
373 * allow new one to be added. */
374 if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) {
375 if (net_ratelimit()) {
376 printk(KERN_DEBUG "%s: No room for a new IBSS STA "
377 "entry %pM\n", sdata->dev->name, addr);
378 }
379 return NULL;
380 }
381
382 if (compare_ether_addr(bssid, sdata->u.ibss.bssid))
383 return NULL;
384
385#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
386 printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n",
387 wiphy_name(local->hw.wiphy), addr, sdata->dev->name);
388#endif
389
390 sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
391 if (!sta)
392 return NULL;
393
394 set_sta_flags(sta, WLAN_STA_AUTHORIZED);
395
396 /* make sure mandatory rates are always added */
397 sta->sta.supp_rates[band] = supp_rates |
398 ieee80211_mandatory_rates(local, band);
399
400 rate_control_rate_init(sta);
401
402 if (sta_info_insert(sta))
403 return NULL;
404
405 return sta;
406}
407
408static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
409{
410 struct ieee80211_local *local = sdata->local;
411 int active = 0;
412 struct sta_info *sta;
413
414 rcu_read_lock();
415
416 list_for_each_entry_rcu(sta, &local->sta_list, list) {
417 if (sta->sdata == sdata &&
418 time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL,
419 jiffies)) {
420 active++;
421 break;
422 }
423 }
424
425 rcu_read_unlock();
426
427 return active;
428}
429
430
431static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata)
432{
433 struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
434
435 mod_timer(&ifibss->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
436
437 ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT);
438 if (ieee80211_sta_active_ibss(sdata))
439 return;
440
441 if ((ifibss->flags & IEEE80211_IBSS_BSSID_SET) &&
442 (!(ifibss->flags & IEEE80211_IBSS_AUTO_CHANNEL_SEL)))
443 return;
444
445 printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other "
446 "IBSS networks with same SSID (merge)\n", sdata->dev->name);
447
448 /* XXX maybe racy? */
449 if (sdata->local->scan_req)
450 return;
451
452 memcpy(sdata->local->int_scan_req.ssids[0].ssid,
453 ifibss->ssid, IEEE80211_MAX_SSID_LEN);
454 sdata->local->int_scan_req.ssids[0].ssid_len = ifibss->ssid_len;
455 ieee80211_request_scan(sdata, &sdata->local->int_scan_req);
456}
457
458static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata)
459{
460 struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
461 struct ieee80211_local *local = sdata->local;
462 struct ieee80211_supported_band *sband;
463 u8 *pos;
464 u8 bssid[ETH_ALEN];
465 u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
466 u16 capability;
467 int i;
468
469 if (ifibss->flags & IEEE80211_IBSS_BSSID_SET) {
470 memcpy(bssid, ifibss->bssid, ETH_ALEN);
471 } else {
472		/* Generate a random, non-broadcast, locally administered BSSID. Mix in
473		 * our own MAC address to make sure that devices without a proper
474		 * random number generator still get different BSSIDs. */
475 get_random_bytes(bssid, ETH_ALEN);
476 for (i = 0; i < ETH_ALEN; i++)
477 bssid[i] ^= sdata->dev->dev_addr[i];
478 bssid[0] &= ~0x01;
479 bssid[0] |= 0x02;
480 }
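 /*
  * Example with a hypothetical random first octet of 0xa5: clearing
  * the group/multicast bit (0x01) gives 0xa4, and setting the locally
  * administered bit (0x02) gives 0xa6, a valid locally administered
  * unicast BSSID.
  */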
481
482 printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %pM\n",
483 sdata->dev->name, bssid);
484
485 sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
486
487 if (local->hw.conf.beacon_int == 0)
488 local->hw.conf.beacon_int = 100;
489
490 capability = WLAN_CAPABILITY_IBSS;
491
492 if (sdata->default_key)
493 capability |= WLAN_CAPABILITY_PRIVACY;
494 else
495 sdata->drop_unencrypted = 0;
496
497 pos = supp_rates;
498 for (i = 0; i < sband->n_bitrates; i++) {
499 int rate = sband->bitrates[i].bitrate;
500 *pos++ = (u8) (rate / 5);
501 }
502
503 return __ieee80211_sta_join_ibss(sdata,
504 bssid, local->hw.conf.beacon_int,
505 local->hw.conf.channel->center_freq,
506 sband->n_bitrates, supp_rates,
507 capability, 0);
508}
509
510static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
511{
512 struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
513 struct ieee80211_local *local = sdata->local;
514 struct ieee80211_bss *bss;
515 const u8 *bssid = NULL;
516 int active_ibss;
517
518 if (ifibss->ssid_len == 0)
519 return -EINVAL;
520
521 active_ibss = ieee80211_sta_active_ibss(sdata);
522#ifdef CONFIG_MAC80211_IBSS_DEBUG
523 printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n",
524 sdata->dev->name, active_ibss);
525#endif /* CONFIG_MAC80211_IBSS_DEBUG */
526
527 if (active_ibss)
528 return 0;
529
530 if (ifibss->flags & IEEE80211_IBSS_BSSID_SET)
531 bssid = ifibss->bssid;
532 bss = (void *)cfg80211_get_bss(local->hw.wiphy, NULL, bssid,
533 ifibss->ssid, ifibss->ssid_len,
534 WLAN_CAPABILITY_IBSS,
535 WLAN_CAPABILITY_IBSS);
536
537#ifdef CONFIG_MAC80211_IBSS_DEBUG
538 if (bss)
539 printk(KERN_DEBUG " sta_find_ibss: selected %pM current "
540 "%pM\n", bss->cbss.bssid, ifibss->bssid);
541#endif /* CONFIG_MAC80211_IBSS_DEBUG */
542
543 if (bss &&
544 (!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET) ||
545 memcmp(ifibss->bssid, bss->cbss.bssid, ETH_ALEN))) {
546 int ret;
547
548 printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM"
549 " based on configured SSID\n",
550 sdata->dev->name, bss->cbss.bssid);
551
552 ret = ieee80211_sta_join_ibss(sdata, bss);
553 ieee80211_rx_bss_put(local, bss);
554 return ret;
555 } else if (bss)
556 ieee80211_rx_bss_put(local, bss);
557
558#ifdef CONFIG_MAC80211_IBSS_DEBUG
559 printk(KERN_DEBUG " did not try to join ibss\n");
560#endif /* CONFIG_MAC80211_IBSS_DEBUG */
561
562 /* Selected IBSS not found in current scan results - try to scan */
563 if (ifibss->state == IEEE80211_IBSS_MLME_JOINED &&
564 !ieee80211_sta_active_ibss(sdata)) {
565 mod_timer(&ifibss->timer, jiffies +
566 IEEE80211_IBSS_MERGE_INTERVAL);
567 } else if (time_after(jiffies, local->last_scan_completed +
568 IEEE80211_SCAN_INTERVAL)) {
569 printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to "
570 "join\n", sdata->dev->name);
571
572 /* XXX maybe racy? */
573 if (local->scan_req)
574 return -EBUSY;
575
576 memcpy(local->int_scan_req.ssids[0].ssid,
577 ifibss->ssid, IEEE80211_MAX_SSID_LEN);
578 local->int_scan_req.ssids[0].ssid_len = ifibss->ssid_len;
579 return ieee80211_request_scan(sdata, &local->int_scan_req);
580 } else if (ifibss->state != IEEE80211_IBSS_MLME_JOINED) {
581 int interval = IEEE80211_SCAN_INTERVAL;
582
583 if (time_after(jiffies, ifibss->ibss_join_req +
584 IEEE80211_IBSS_JOIN_TIMEOUT)) {
585 if (!(local->oper_channel->flags &
586 IEEE80211_CHAN_NO_IBSS))
587 return ieee80211_sta_create_ibss(sdata);
588 printk(KERN_DEBUG "%s: IBSS not allowed on"
589 " %d MHz\n", sdata->dev->name,
590 local->hw.conf.channel->center_freq);
591
592			/* No IBSS found - switch to the slow scan interval
593			 * and continue scanning. */
594 interval = IEEE80211_SCAN_INTERVAL_SLOW;
595 }
596
597 ifibss->state = IEEE80211_IBSS_MLME_SEARCH;
598 mod_timer(&ifibss->timer, jiffies + interval);
599 return 0;
600 }
601
602 return 0;
603}
604
605static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata,
606 struct ieee80211_mgmt *mgmt,
607 size_t len)
608{
609 struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
610 struct ieee80211_local *local = sdata->local;
611 int tx_last_beacon;
612 struct sk_buff *skb;
613 struct ieee80211_mgmt *resp;
614 u8 *pos, *end;
615
616 if (ifibss->state != IEEE80211_IBSS_MLME_JOINED ||
617 len < 24 + 2 || !ifibss->probe_resp)
618 return;
619
620 if (local->ops->tx_last_beacon)
621 tx_last_beacon = local->ops->tx_last_beacon(local_to_hw(local));
622 else
623 tx_last_beacon = 1;
624
625#ifdef CONFIG_MAC80211_IBSS_DEBUG
626 printk(KERN_DEBUG "%s: RX ProbeReq SA=%pM DA=%pM BSSID=%pM"
627 " (tx_last_beacon=%d)\n",
628 sdata->dev->name, mgmt->sa, mgmt->da,
629 mgmt->bssid, tx_last_beacon);
630#endif /* CONFIG_MAC80211_IBSS_DEBUG */
631
632 if (!tx_last_beacon)
633 return;
634
635 if (memcmp(mgmt->bssid, ifibss->bssid, ETH_ALEN) != 0 &&
636 memcmp(mgmt->bssid, "\xff\xff\xff\xff\xff\xff", ETH_ALEN) != 0)
637 return;
638
639 end = ((u8 *) mgmt) + len;
640 pos = mgmt->u.probe_req.variable;
641 if (pos[0] != WLAN_EID_SSID ||
642 pos + 2 + pos[1] > end) {
643#ifdef CONFIG_MAC80211_IBSS_DEBUG
644 printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq "
645 "from %pM\n",
646 sdata->dev->name, mgmt->sa);
647#endif
648 return;
649 }
650 if (pos[1] != 0 &&
651 (pos[1] != ifibss->ssid_len ||
652 memcmp(pos + 2, ifibss->ssid, ifibss->ssid_len) != 0)) {
653 /* Ignore ProbeReq for foreign SSID */
654 return;
655 }
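 /*
  * Illustrative SSID IE layouts for the checks above ("foo" is just a
  * made-up example SSID):
  *   0x00 0x03 'f' 'o' 'o'   EID 0 (SSID), length 3 - answered only if
  *                           it matches ifibss->ssid
  *   0x00 0x00               zero-length wildcard SSID - always answered
  */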
656
657 /* Reply with ProbeResp */
658 skb = skb_copy(ifibss->probe_resp, GFP_KERNEL);
659 if (!skb)
660 return;
661
662 resp = (struct ieee80211_mgmt *) skb->data;
663 memcpy(resp->da, mgmt->sa, ETH_ALEN);
664#ifdef CONFIG_MAC80211_IBSS_DEBUG
665 printk(KERN_DEBUG "%s: Sending ProbeResp to %pM\n",
666 sdata->dev->name, resp->da);
667#endif /* CONFIG_MAC80211_IBSS_DEBUG */
668 ieee80211_tx_skb(sdata, skb, 0);
669}
670
671static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
672 struct ieee80211_mgmt *mgmt,
673 size_t len,
674 struct ieee80211_rx_status *rx_status)
675{
676 size_t baselen;
677 struct ieee802_11_elems elems;
678
679 if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN))
680 return; /* ignore ProbeResp to foreign address */
681
682 baselen = (u8 *) mgmt->u.probe_resp.variable - (u8 *) mgmt;
683 if (baselen > len)
684 return;
685
686 ieee802_11_parse_elems(mgmt->u.probe_resp.variable, len - baselen,
687 &elems);
688
689 ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, false);
690}
691
692static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
693 struct ieee80211_mgmt *mgmt,
694 size_t len,
695 struct ieee80211_rx_status *rx_status)
696{
697 size_t baselen;
698 struct ieee802_11_elems elems;
699
700 /* Process beacon from the current BSS */
701 baselen = (u8 *) mgmt->u.beacon.variable - (u8 *) mgmt;
702 if (baselen > len)
703 return;
704
705 ieee802_11_parse_elems(mgmt->u.beacon.variable, len - baselen, &elems);
706
707 ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, true);
708}
709
710static void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
711 struct sk_buff *skb)
712{
713 struct ieee80211_rx_status *rx_status;
714 struct ieee80211_mgmt *mgmt;
715 u16 fc;
716
717 rx_status = (struct ieee80211_rx_status *) skb->cb;
718 mgmt = (struct ieee80211_mgmt *) skb->data;
719 fc = le16_to_cpu(mgmt->frame_control);
720
721 switch (fc & IEEE80211_FCTL_STYPE) {
722 case IEEE80211_STYPE_PROBE_REQ:
723 ieee80211_rx_mgmt_probe_req(sdata, mgmt, skb->len);
724 break;
725 case IEEE80211_STYPE_PROBE_RESP:
726 ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len,
727 rx_status);
728 break;
729 case IEEE80211_STYPE_BEACON:
730 ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
731 rx_status);
732 break;
733 case IEEE80211_STYPE_AUTH:
734 ieee80211_rx_mgmt_auth_ibss(sdata, mgmt, skb->len);
735 break;
736 }
737
738 kfree_skb(skb);
739}
740
741static void ieee80211_ibss_work(struct work_struct *work)
742{
743 struct ieee80211_sub_if_data *sdata =
744 container_of(work, struct ieee80211_sub_if_data, u.ibss.work);
745 struct ieee80211_local *local = sdata->local;
746 struct ieee80211_if_ibss *ifibss;
747 struct sk_buff *skb;
748
749 if (!netif_running(sdata->dev))
750 return;
751
752 if (local->sw_scanning || local->hw_scanning)
753 return;
754
755 if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_ADHOC))
756 return;
757 ifibss = &sdata->u.ibss;
758
759 while ((skb = skb_dequeue(&ifibss->skb_queue)))
760 ieee80211_ibss_rx_queued_mgmt(sdata, skb);
761
762 if (!test_and_clear_bit(IEEE80211_IBSS_REQ_RUN, &ifibss->request))
763 return;
764
765 switch (ifibss->state) {
766 case IEEE80211_IBSS_MLME_SEARCH:
767 ieee80211_sta_find_ibss(sdata);
768 break;
769 case IEEE80211_IBSS_MLME_JOINED:
770 ieee80211_sta_merge_ibss(sdata);
771 break;
772 default:
773 WARN_ON(1);
774 break;
775 }
776}
777
778static void ieee80211_ibss_timer(unsigned long data)
779{
780 struct ieee80211_sub_if_data *sdata =
781 (struct ieee80211_sub_if_data *) data;
782 struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
783 struct ieee80211_local *local = sdata->local;
784
785 set_bit(IEEE80211_IBSS_REQ_RUN, &ifibss->request);
786 queue_work(local->hw.workqueue, &ifibss->work);
787}
788
789void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata)
790{
791 struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
792
793 INIT_WORK(&ifibss->work, ieee80211_ibss_work);
794 setup_timer(&ifibss->timer, ieee80211_ibss_timer,
795 (unsigned long) sdata);
796 skb_queue_head_init(&ifibss->skb_queue);
797
798 ifibss->flags |= IEEE80211_IBSS_AUTO_BSSID_SEL |
799 IEEE80211_IBSS_AUTO_CHANNEL_SEL;
800}
801
802int ieee80211_ibss_commit(struct ieee80211_sub_if_data *sdata)
803{
804 struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
805
806 ifibss->flags &= ~IEEE80211_IBSS_PREV_BSSID_SET;
807
808 if (ifibss->ssid_len)
809 ifibss->flags |= IEEE80211_IBSS_SSID_SET;
810 else
811 ifibss->flags &= ~IEEE80211_IBSS_SSID_SET;
812
813 ifibss->ibss_join_req = jiffies;
814 ifibss->state = IEEE80211_IBSS_MLME_SEARCH;
815
816 return ieee80211_sta_find_ibss(sdata);
817}
818
819int ieee80211_ibss_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len)
820{
821 struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
822
823 if (len > IEEE80211_MAX_SSID_LEN)
824 return -EINVAL;
825
826 if (ifibss->ssid_len != len || memcmp(ifibss->ssid, ssid, len) != 0) {
827 memset(ifibss->ssid, 0, sizeof(ifibss->ssid));
828 memcpy(ifibss->ssid, ssid, len);
829 ifibss->ssid_len = len;
830 }
831
832 return ieee80211_ibss_commit(sdata);
833}
834
835int ieee80211_ibss_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len)
836{
837 struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
838
839 memcpy(ssid, ifibss->ssid, ifibss->ssid_len);
840 *len = ifibss->ssid_len;
841
842 return 0;
843}
844
845int ieee80211_ibss_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
846{
847 struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
848
849 if (is_valid_ether_addr(bssid)) {
850 memcpy(ifibss->bssid, bssid, ETH_ALEN);
851 ifibss->flags |= IEEE80211_IBSS_BSSID_SET;
852 } else {
853 memset(ifibss->bssid, 0, ETH_ALEN);
854 ifibss->flags &= ~IEEE80211_IBSS_BSSID_SET;
855 }
856
857 if (netif_running(sdata->dev)) {
858 if (ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID)) {
859 printk(KERN_DEBUG "%s: Failed to config new BSSID to "
860 "the low-level driver\n", sdata->dev->name);
861 }
862 }
863
864 return ieee80211_ibss_commit(sdata);
865}
866
867/* scan finished notification */
868void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local)
869{
870 struct ieee80211_sub_if_data *sdata = local->scan_sdata;
871 struct ieee80211_if_ibss *ifibss;
872
873 if (sdata && sdata->vif.type == NL80211_IFTYPE_ADHOC) {
874 ifibss = &sdata->u.ibss;
875 if ((!(ifibss->flags & IEEE80211_IBSS_PREV_BSSID_SET)) ||
876 !ieee80211_sta_active_ibss(sdata))
877 ieee80211_sta_find_ibss(sdata);
878 }
879}
880
881ieee80211_rx_result
882ieee80211_ibss_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
883 struct ieee80211_rx_status *rx_status)
884{
885 struct ieee80211_local *local = sdata->local;
886 struct ieee80211_mgmt *mgmt;
887 u16 fc;
888
889 if (skb->len < 24)
890 return RX_DROP_MONITOR;
891
892 mgmt = (struct ieee80211_mgmt *) skb->data;
893 fc = le16_to_cpu(mgmt->frame_control);
894
895 switch (fc & IEEE80211_FCTL_STYPE) {
896 case IEEE80211_STYPE_PROBE_RESP:
897 case IEEE80211_STYPE_BEACON:
898 memcpy(skb->cb, rx_status, sizeof(*rx_status));
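		/* fall through: beacons and probe responses also need
		 * rx_status saved in skb->cb before being queued for the
		 * work handler */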
899 case IEEE80211_STYPE_PROBE_REQ:
900 case IEEE80211_STYPE_AUTH:
901 skb_queue_tail(&sdata->u.ibss.skb_queue, skb);
902 queue_work(local->hw.workqueue, &sdata->u.ibss.work);
903 return RX_QUEUED;
904 }
905
906 return RX_DROP_MONITOR;
907}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 2cb743ed9f9c..fbb91f1aebb2 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -239,7 +239,7 @@ struct mesh_preq_queue {
239 u8 flags; 239 u8 flags;
240}; 240};
241 241
242/* flags used in struct ieee80211_if_sta.flags */ 242/* flags used in struct ieee80211_if_managed.flags */
243#define IEEE80211_STA_SSID_SET BIT(0) 243#define IEEE80211_STA_SSID_SET BIT(0)
244#define IEEE80211_STA_BSSID_SET BIT(1) 244#define IEEE80211_STA_BSSID_SET BIT(1)
245#define IEEE80211_STA_PREV_BSSID_SET BIT(2) 245#define IEEE80211_STA_PREV_BSSID_SET BIT(2)
@@ -262,31 +262,30 @@ struct mesh_preq_queue {
262#define IEEE80211_STA_REQ_AUTH 2 262#define IEEE80211_STA_REQ_AUTH 2
263#define IEEE80211_STA_REQ_RUN 3 263#define IEEE80211_STA_REQ_RUN 3
264 264
265/* STA/IBSS MLME states */
266enum ieee80211_sta_mlme_state {
267 IEEE80211_STA_MLME_DISABLED,
268 IEEE80211_STA_MLME_DIRECT_PROBE,
269 IEEE80211_STA_MLME_AUTHENTICATE,
270 IEEE80211_STA_MLME_ASSOCIATE,
271 IEEE80211_STA_MLME_ASSOCIATED,
272 IEEE80211_STA_MLME_IBSS_SEARCH,
273 IEEE80211_STA_MLME_IBSS_JOINED,
274};
275
276/* bitfield of allowed auth algs */ 265/* bitfield of allowed auth algs */
277#define IEEE80211_AUTH_ALG_OPEN BIT(0) 266#define IEEE80211_AUTH_ALG_OPEN BIT(0)
278#define IEEE80211_AUTH_ALG_SHARED_KEY BIT(1) 267#define IEEE80211_AUTH_ALG_SHARED_KEY BIT(1)
279#define IEEE80211_AUTH_ALG_LEAP BIT(2) 268#define IEEE80211_AUTH_ALG_LEAP BIT(2)
280 269
281struct ieee80211_if_sta { 270struct ieee80211_if_managed {
282 struct timer_list timer; 271 struct timer_list timer;
283 struct timer_list chswitch_timer; 272 struct timer_list chswitch_timer;
284 struct work_struct work; 273 struct work_struct work;
285 struct work_struct chswitch_work; 274 struct work_struct chswitch_work;
275
286 u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN]; 276 u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN];
277
287 u8 ssid[IEEE80211_MAX_SSID_LEN]; 278 u8 ssid[IEEE80211_MAX_SSID_LEN];
288 enum ieee80211_sta_mlme_state state;
289 size_t ssid_len; 279 size_t ssid_len;
280
281 enum {
282 IEEE80211_STA_MLME_DISABLED,
283 IEEE80211_STA_MLME_DIRECT_PROBE,
284 IEEE80211_STA_MLME_AUTHENTICATE,
285 IEEE80211_STA_MLME_ASSOCIATE,
286 IEEE80211_STA_MLME_ASSOCIATED,
287 } state;
288
290 u16 aid; 289 u16 aid;
291 u16 ap_capab, capab; 290 u16 ap_capab, capab;
292 u8 *extra_ie; /* to be added to the end of AssocReq */ 291 u8 *extra_ie; /* to be added to the end of AssocReq */
@@ -319,10 +318,6 @@ struct ieee80211_if_sta {
319 IEEE80211_MFP_REQUIRED 318 IEEE80211_MFP_REQUIRED
320 } mfp; /* management frame protection */ 319 } mfp; /* management frame protection */
321 320
322 unsigned long ibss_join_req;
323 struct sk_buff *probe_resp; /* ProbeResp template for IBSS */
324 u32 supp_rates_bits[IEEE80211_NUM_BANDS];
325
326 int wmm_last_param_set; 321 int wmm_last_param_set;
327 322
328 /* Extra IE data for management frames */ 323 /* Extra IE data for management frames */
@@ -342,6 +337,42 @@ struct ieee80211_if_sta {
342 size_t ie_disassoc_len; 337 size_t ie_disassoc_len;
343}; 338};
344 339
340enum ieee80211_ibss_flags {
341 IEEE80211_IBSS_AUTO_CHANNEL_SEL = BIT(0),
342 IEEE80211_IBSS_AUTO_BSSID_SEL = BIT(1),
343 IEEE80211_IBSS_BSSID_SET = BIT(2),
344 IEEE80211_IBSS_PREV_BSSID_SET = BIT(3),
345 IEEE80211_IBSS_SSID_SET = BIT(4),
346};
347
348enum ieee80211_ibss_request {
349 IEEE80211_IBSS_REQ_RUN = 0,
350};
351
352struct ieee80211_if_ibss {
353 struct timer_list timer;
354 struct work_struct work;
355
356 struct sk_buff_head skb_queue;
357
358 u8 ssid[IEEE80211_MAX_SSID_LEN];
359 u8 ssid_len;
360
361 u32 flags;
362
363 u8 bssid[ETH_ALEN];
364
365 unsigned long request;
366
367 unsigned long ibss_join_req;
368 struct sk_buff *probe_resp; /* ProbeResp template for IBSS */
369
370 enum {
371 IEEE80211_IBSS_MLME_SEARCH,
372 IEEE80211_IBSS_MLME_JOINED,
373 } state;
374};
375
345struct ieee80211_if_mesh { 376struct ieee80211_if_mesh {
346 struct work_struct work; 377 struct work_struct work;
347 struct timer_list housekeeping_timer; 378 struct timer_list housekeeping_timer;
@@ -445,7 +476,8 @@ struct ieee80211_sub_if_data {
445 struct ieee80211_if_ap ap; 476 struct ieee80211_if_ap ap;
446 struct ieee80211_if_wds wds; 477 struct ieee80211_if_wds wds;
447 struct ieee80211_if_vlan vlan; 478 struct ieee80211_if_vlan vlan;
448 struct ieee80211_if_sta sta; 479 struct ieee80211_if_managed mgd;
480 struct ieee80211_if_ibss ibss;
449#ifdef CONFIG_MAC80211_MESH 481#ifdef CONFIG_MAC80211_MESH
450 struct ieee80211_if_mesh mesh; 482 struct ieee80211_if_mesh mesh;
451#endif 483#endif
@@ -564,12 +596,10 @@ enum {
564enum queue_stop_reason { 596enum queue_stop_reason {
565 IEEE80211_QUEUE_STOP_REASON_DRIVER, 597 IEEE80211_QUEUE_STOP_REASON_DRIVER,
566 IEEE80211_QUEUE_STOP_REASON_PS, 598 IEEE80211_QUEUE_STOP_REASON_PS,
567 IEEE80211_QUEUE_STOP_REASON_CSA 599 IEEE80211_QUEUE_STOP_REASON_CSA,
600 IEEE80211_QUEUE_STOP_REASON_AGGREGATION,
568}; 601};
569 602
570/* maximum number of hardware queues we support. */
571#define QD_MAX_QUEUES (IEEE80211_MAX_AMPDU_QUEUES + IEEE80211_MAX_QUEUES)
572
573struct ieee80211_master_priv { 603struct ieee80211_master_priv {
574 struct ieee80211_local *local; 604 struct ieee80211_local *local;
575}; 605};
@@ -582,9 +612,15 @@ struct ieee80211_local {
582 612
583 const struct ieee80211_ops *ops; 613 const struct ieee80211_ops *ops;
584 614
585 unsigned long queue_pool[BITS_TO_LONGS(QD_MAX_QUEUES)]; 615 /* AC queue corresponding to each AMPDU queue */
586 unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES]; 616 s8 ampdu_ac_queue[IEEE80211_MAX_AMPDU_QUEUES];
617 unsigned int amdpu_ac_stop_refcnt[IEEE80211_MAX_AMPDU_QUEUES];
618
619 unsigned long queue_stop_reasons[IEEE80211_MAX_QUEUES +
620 IEEE80211_MAX_AMPDU_QUEUES];
621 /* also used to protect ampdu_ac_queue and amdpu_ac_stop_refcnt */
587 spinlock_t queue_stop_reason_lock; 622 spinlock_t queue_stop_reason_lock;
623
588 struct net_device *mdev; /* wmaster# - "master" 802.11 device */ 624 struct net_device *mdev; /* wmaster# - "master" 802.11 device */
589 int open_count; 625 int open_count;
590 int monitors, cooked_mntrs; 626 int monitors, cooked_mntrs;
@@ -888,34 +924,41 @@ void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx);
888void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, 924void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
889 u32 changed); 925 u32 changed);
890void ieee80211_configure_filter(struct ieee80211_local *local); 926void ieee80211_configure_filter(struct ieee80211_local *local);
927u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
891 928
892/* wireless extensions */ 929/* wireless extensions */
893extern const struct iw_handler_def ieee80211_iw_handler_def; 930extern const struct iw_handler_def ieee80211_iw_handler_def;
894 931
895/* STA/IBSS code */ 932/* STA code */
896void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata); 933void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
897void ieee80211_scan_work(struct work_struct *work); 934ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata,
898void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, 935 struct sk_buff *skb,
899 struct ieee80211_rx_status *rx_status); 936 struct ieee80211_rx_status *rx_status);
937int ieee80211_sta_commit(struct ieee80211_sub_if_data *sdata);
900int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len); 938int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len);
901int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len); 939int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len);
902int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid); 940int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid);
903void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata, 941void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata);
904 struct ieee80211_if_sta *ifsta);
905struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
906 u8 *bssid, u8 *addr, u32 supp_rates);
907int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason); 942int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason);
908int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason); 943int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason);
909u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
910u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
911 struct ieee802_11_elems *elems,
912 enum ieee80211_band band);
913void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
914 u8 *ssid, size_t ssid_len);
915void ieee80211_send_pspoll(struct ieee80211_local *local, 944void ieee80211_send_pspoll(struct ieee80211_local *local,
916 struct ieee80211_sub_if_data *sdata); 945 struct ieee80211_sub_if_data *sdata);
917 946
947/* IBSS code */
948int ieee80211_ibss_commit(struct ieee80211_sub_if_data *sdata);
949int ieee80211_ibss_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len);
950int ieee80211_ibss_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len);
951int ieee80211_ibss_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid);
952void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local);
953void ieee80211_ibss_setup_sdata(struct ieee80211_sub_if_data *sdata);
954ieee80211_rx_result
955ieee80211_ibss_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
956 struct ieee80211_rx_status *rx_status);
957struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
958 u8 *bssid, u8 *addr, u32 supp_rates);
959
918/* scan/BSS handling */ 960/* scan/BSS handling */
961void ieee80211_scan_work(struct work_struct *work);
919int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, 962int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
920 struct cfg80211_scan_request *req); 963 struct cfg80211_scan_request *req);
921int ieee80211_scan_results(struct ieee80211_local *local, 964int ieee80211_scan_results(struct ieee80211_local *local,
@@ -929,6 +972,7 @@ int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata,
929 char *ie, size_t len); 972 char *ie, size_t len);
930 973
931void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local); 974void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local);
975void ieee80211_scan_failed(struct ieee80211_local *local);
932int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata, 976int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata,
933 struct cfg80211_scan_request *req); 977 struct cfg80211_scan_request *req);
934struct ieee80211_bss * 978struct ieee80211_bss *
@@ -1042,6 +1086,25 @@ void ieee80211_wake_queues_by_reason(struct ieee80211_hw *hw,
1042 enum queue_stop_reason reason); 1086 enum queue_stop_reason reason);
1043void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw, 1087void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
1044 enum queue_stop_reason reason); 1088 enum queue_stop_reason reason);
1089void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
1090 enum queue_stop_reason reason);
1091void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
1092 enum queue_stop_reason reason);
1093
1094void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
1095 u16 transaction, u16 auth_alg,
1096 u8 *extra, size_t extra_len,
1097 const u8 *bssid, int encrypt);
1098void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
1099 u8 *ssid, size_t ssid_len,
1100 u8 *ie, size_t ie_len);
1101
1102void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
1103 const size_t supp_rates_len,
1104 const u8 *supp_rates);
1105u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
1106 struct ieee802_11_elems *elems,
1107 enum ieee80211_band band);
1045 1108
1046#ifdef CONFIG_MAC80211_NOINLINE 1109#ifdef CONFIG_MAC80211_NOINLINE
1047#define debug_noinline noinline 1110#define debug_noinline noinline
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index df94b9365264..f9f27b9cadbe 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -236,7 +236,10 @@ static int ieee80211_open(struct net_device *dev)
236 break; 236 break;
237 case NL80211_IFTYPE_STATION: 237 case NL80211_IFTYPE_STATION:
238 case NL80211_IFTYPE_ADHOC: 238 case NL80211_IFTYPE_ADHOC:
239 sdata->u.sta.flags &= ~IEEE80211_STA_PREV_BSSID_SET; 239 if (sdata->vif.type == NL80211_IFTYPE_STATION)
240 sdata->u.mgd.flags &= ~IEEE80211_STA_PREV_BSSID_SET;
241 else
242 sdata->u.ibss.flags &= ~IEEE80211_IBSS_PREV_BSSID_SET;
240 /* fall through */ 243 /* fall through */
241 default: 244 default:
242 conf.vif = &sdata->vif; 245 conf.vif = &sdata->vif;
@@ -321,11 +324,10 @@ static int ieee80211_open(struct net_device *dev)
321 * yet be effective. Trigger execution of ieee80211_sta_work 324 * yet be effective. Trigger execution of ieee80211_sta_work
322 * to fix this. 325 * to fix this.
323 */ 326 */
324 if (sdata->vif.type == NL80211_IFTYPE_STATION || 327 if (sdata->vif.type == NL80211_IFTYPE_STATION)
325 sdata->vif.type == NL80211_IFTYPE_ADHOC) { 328 queue_work(local->hw.workqueue, &sdata->u.mgd.work);
326 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 329 else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
327 queue_work(local->hw.workqueue, &ifsta->work); 330 queue_work(local->hw.workqueue, &sdata->u.ibss.work);
328 }
329 331
330 netif_tx_start_all_queues(dev); 332 netif_tx_start_all_queues(dev);
331 333
@@ -368,6 +370,18 @@ static int ieee80211_stop(struct net_device *dev)
368 rcu_read_unlock(); 370 rcu_read_unlock();
369 371
370 /* 372 /*
373	 * Announce that we are leaving the network, in case this is a
374	 * station interface type. This must be done before removing
375	 * all stations via sta_info_flush(), otherwise the STA
376	 * information is gone and no announcement can be sent.
377 */
378 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
379 if (sdata->u.mgd.state != IEEE80211_STA_MLME_DISABLED)
380 ieee80211_sta_deauthenticate(sdata,
381 WLAN_REASON_DEAUTH_LEAVING);
382 }
383
384 /*
371 * Remove all stations associated with this interface. 385 * Remove all stations associated with this interface.
372 * 386 *
373 * This must be done before calling ops->remove_interface() 387 * This must be done before calling ops->remove_interface()
@@ -452,15 +466,9 @@ static int ieee80211_stop(struct net_device *dev)
452 netif_addr_unlock_bh(local->mdev); 466 netif_addr_unlock_bh(local->mdev);
453 break; 467 break;
454 case NL80211_IFTYPE_STATION: 468 case NL80211_IFTYPE_STATION:
455 case NL80211_IFTYPE_ADHOC: 469 memset(sdata->u.mgd.bssid, 0, ETH_ALEN);
456 /* Announce that we are leaving the network. */ 470 del_timer_sync(&sdata->u.mgd.chswitch_timer);
457 if (sdata->u.sta.state != IEEE80211_STA_MLME_DISABLED) 471 del_timer_sync(&sdata->u.mgd.timer);
458 ieee80211_sta_deauthenticate(sdata,
459 WLAN_REASON_DEAUTH_LEAVING);
460
461 memset(sdata->u.sta.bssid, 0, ETH_ALEN);
462 del_timer_sync(&sdata->u.sta.chswitch_timer);
463 del_timer_sync(&sdata->u.sta.timer);
464 /* 472 /*
465 * If the timer fired while we waited for it, it will have 473 * If the timer fired while we waited for it, it will have
466 * requeued the work. Now the work will be running again 474 * requeued the work. Now the work will be running again
@@ -468,8 +476,8 @@ static int ieee80211_stop(struct net_device *dev)
468 * whether the interface is running, which, at this point, 476 * whether the interface is running, which, at this point,
469 * it no longer is. 477 * it no longer is.
470 */ 478 */
471 cancel_work_sync(&sdata->u.sta.work); 479 cancel_work_sync(&sdata->u.mgd.work);
472 cancel_work_sync(&sdata->u.sta.chswitch_work); 480 cancel_work_sync(&sdata->u.mgd.chswitch_work);
473 /* 481 /*
474 * When we get here, the interface is marked down. 482 * When we get here, the interface is marked down.
475 * Call synchronize_rcu() to wait for the RX path 483 * Call synchronize_rcu() to wait for the RX path
@@ -477,13 +485,22 @@ static int ieee80211_stop(struct net_device *dev)
477 * frames at this very time on another CPU. 485 * frames at this very time on another CPU.
478 */ 486 */
479 synchronize_rcu(); 487 synchronize_rcu();
480 skb_queue_purge(&sdata->u.sta.skb_queue); 488 skb_queue_purge(&sdata->u.mgd.skb_queue);
481 489
482 sdata->u.sta.flags &= ~(IEEE80211_STA_PRIVACY_INVOKED | 490 sdata->u.mgd.flags &= ~(IEEE80211_STA_PRIVACY_INVOKED |
483 IEEE80211_STA_TKIP_WEP_USED); 491 IEEE80211_STA_TKIP_WEP_USED);
484 kfree(sdata->u.sta.extra_ie); 492 kfree(sdata->u.mgd.extra_ie);
485 sdata->u.sta.extra_ie = NULL; 493 sdata->u.mgd.extra_ie = NULL;
486 sdata->u.sta.extra_ie_len = 0; 494 sdata->u.mgd.extra_ie_len = 0;
495 /* fall through */
496 case NL80211_IFTYPE_ADHOC:
497 if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
498 memset(sdata->u.ibss.bssid, 0, ETH_ALEN);
499 del_timer_sync(&sdata->u.ibss.timer);
500 cancel_work_sync(&sdata->u.ibss.work);
501 synchronize_rcu();
502 skb_queue_purge(&sdata->u.ibss.skb_queue);
503 }
487 /* fall through */ 504 /* fall through */
488 case NL80211_IFTYPE_MESH_POINT: 505 case NL80211_IFTYPE_MESH_POINT:
489 if (ieee80211_vif_is_mesh(&sdata->vif)) { 506 if (ieee80211_vif_is_mesh(&sdata->vif)) {
@@ -629,19 +646,20 @@ static void ieee80211_teardown_sdata(struct net_device *dev)
629 if (ieee80211_vif_is_mesh(&sdata->vif)) 646 if (ieee80211_vif_is_mesh(&sdata->vif))
630 mesh_rmc_free(sdata); 647 mesh_rmc_free(sdata);
631 break; 648 break;
632 case NL80211_IFTYPE_STATION:
633 case NL80211_IFTYPE_ADHOC: 649 case NL80211_IFTYPE_ADHOC:
634 kfree(sdata->u.sta.extra_ie); 650 kfree_skb(sdata->u.ibss.probe_resp);
635 kfree(sdata->u.sta.assocreq_ies); 651 break;
636 kfree(sdata->u.sta.assocresp_ies); 652 case NL80211_IFTYPE_STATION:
637 kfree_skb(sdata->u.sta.probe_resp); 653 kfree(sdata->u.mgd.extra_ie);
638 kfree(sdata->u.sta.ie_probereq); 654 kfree(sdata->u.mgd.assocreq_ies);
639 kfree(sdata->u.sta.ie_proberesp); 655 kfree(sdata->u.mgd.assocresp_ies);
640 kfree(sdata->u.sta.ie_auth); 656 kfree(sdata->u.mgd.ie_probereq);
641 kfree(sdata->u.sta.ie_assocreq); 657 kfree(sdata->u.mgd.ie_proberesp);
642 kfree(sdata->u.sta.ie_reassocreq); 658 kfree(sdata->u.mgd.ie_auth);
643 kfree(sdata->u.sta.ie_deauth); 659 kfree(sdata->u.mgd.ie_assocreq);
644 kfree(sdata->u.sta.ie_disassoc); 660 kfree(sdata->u.mgd.ie_reassocreq);
661 kfree(sdata->u.mgd.ie_deauth);
662 kfree(sdata->u.mgd.ie_disassoc);
645 break; 663 break;
646 case NL80211_IFTYPE_WDS: 664 case NL80211_IFTYPE_WDS:
647 case NL80211_IFTYPE_AP_VLAN: 665 case NL80211_IFTYPE_AP_VLAN:
@@ -708,9 +726,11 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata,
708 INIT_LIST_HEAD(&sdata->u.ap.vlans); 726 INIT_LIST_HEAD(&sdata->u.ap.vlans);
709 break; 727 break;
710 case NL80211_IFTYPE_STATION: 728 case NL80211_IFTYPE_STATION:
711 case NL80211_IFTYPE_ADHOC:
712 ieee80211_sta_setup_sdata(sdata); 729 ieee80211_sta_setup_sdata(sdata);
713 break; 730 break;
731 case NL80211_IFTYPE_ADHOC:
732 ieee80211_ibss_setup_sdata(sdata);
733 break;
714 case NL80211_IFTYPE_MESH_POINT: 734 case NL80211_IFTYPE_MESH_POINT:
715 if (ieee80211_vif_is_mesh(&sdata->vif)) 735 if (ieee80211_vif_is_mesh(&sdata->vif))
716 ieee80211_mesh_init_sdata(sdata); 736 ieee80211_mesh_init_sdata(sdata);
@@ -798,6 +818,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name,
798 818
799 memcpy(ndev->dev_addr, local->hw.wiphy->perm_addr, ETH_ALEN); 819 memcpy(ndev->dev_addr, local->hw.wiphy->perm_addr, ETH_ALEN);
800 SET_NETDEV_DEV(ndev, wiphy_dev(local->hw.wiphy)); 820 SET_NETDEV_DEV(ndev, wiphy_dev(local->hw.wiphy));
821 ndev->features |= NETIF_F_NETNS_LOCAL;
801 822
802 /* don't use IEEE80211_DEV_TO_SUB_IF because it checks too much */ 823 /* don't use IEEE80211_DEV_TO_SUB_IF because it checks too much */
803 sdata = netdev_priv(ndev); 824 sdata = netdev_priv(ndev);
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 19b480de4bbc..687acf23054d 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -400,7 +400,7 @@ void ieee80211_key_link(struct ieee80211_key *key,
400 */ 400 */
401 401
402 /* same here, the AP could be using QoS */ 402 /* same here, the AP could be using QoS */
403 ap = sta_info_get(key->local, key->sdata->u.sta.bssid); 403 ap = sta_info_get(key->local, key->sdata->u.mgd.bssid);
404 if (ap) { 404 if (ap) {
405 if (test_sta_flags(ap, WLAN_STA_WME)) 405 if (test_sta_flags(ap, WLAN_STA_WME))
406 key->conf.flags |= 406 key->conf.flags |=
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 5667f4e8067f..f38db4d37e5d 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -169,9 +169,10 @@ int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed)
169 169
170 memset(&conf, 0, sizeof(conf)); 170 memset(&conf, 0, sizeof(conf));
171 171
172 if (sdata->vif.type == NL80211_IFTYPE_STATION || 172 if (sdata->vif.type == NL80211_IFTYPE_STATION)
173 sdata->vif.type == NL80211_IFTYPE_ADHOC) 173 conf.bssid = sdata->u.mgd.bssid;
174 conf.bssid = sdata->u.sta.bssid; 174 else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
175 conf.bssid = sdata->u.ibss.bssid;
175 else if (sdata->vif.type == NL80211_IFTYPE_AP) 176 else if (sdata->vif.type == NL80211_IFTYPE_AP)
176 conf.bssid = sdata->dev->dev_addr; 177 conf.bssid = sdata->dev->dev_addr;
177 else if (ieee80211_vif_is_mesh(&sdata->vif)) { 178 else if (ieee80211_vif_is_mesh(&sdata->vif)) {
@@ -210,7 +211,7 @@ int ieee80211_if_config(struct ieee80211_sub_if_data *sdata, u32 changed)
210 !!rcu_dereference(sdata->u.ap.beacon); 211 !!rcu_dereference(sdata->u.ap.beacon);
211 break; 212 break;
212 case NL80211_IFTYPE_ADHOC: 213 case NL80211_IFTYPE_ADHOC:
213 conf.enable_beacon = !!sdata->u.sta.probe_resp; 214 conf.enable_beacon = !!sdata->u.ibss.probe_resp;
214 break; 215 break;
215 case NL80211_IFTYPE_MESH_POINT: 216 case NL80211_IFTYPE_MESH_POINT:
216 conf.enable_beacon = true; 217 conf.enable_beacon = true;
@@ -705,7 +706,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
705 const struct ieee80211_ops *ops) 706 const struct ieee80211_ops *ops)
706{ 707{
707 struct ieee80211_local *local; 708 struct ieee80211_local *local;
708 int priv_size; 709 int priv_size, i;
709 struct wiphy *wiphy; 710 struct wiphy *wiphy;
710 711
711 /* Ensure 32-byte alignment of our private data and hw private data. 712 /* Ensure 32-byte alignment of our private data and hw private data.
@@ -779,6 +780,11 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len,
779 setup_timer(&local->dynamic_ps_timer, 780 setup_timer(&local->dynamic_ps_timer,
780 ieee80211_dynamic_ps_timer, (unsigned long) local); 781 ieee80211_dynamic_ps_timer, (unsigned long) local);
781 782
783 for (i = 0; i < IEEE80211_MAX_AMPDU_QUEUES; i++)
784 local->ampdu_ac_queue[i] = -1;
785 /* using an s8 won't work with more than that */
786 BUILD_BUG_ON(IEEE80211_MAX_AMPDU_QUEUES > 127);
787
782 sta_info_init(local); 788 sta_info_init(local);
783 789
784 tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending, 790 tasklet_init(&local->tx_pending_tasklet, ieee80211_tx_pending,
@@ -855,6 +861,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
855 /* mac80211 always supports monitor */ 861 /* mac80211 always supports monitor */
856 local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR); 862 local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR);
857 863
864 if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
865 local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM;
866 else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
867 local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_UNSPEC;
868
858 result = wiphy_register(local->hw.wiphy); 869 result = wiphy_register(local->hw.wiphy);
859 if (result < 0) 870 if (result < 0)
860 goto fail_wiphy_register; 871 goto fail_wiphy_register;
@@ -872,7 +883,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
872 883
873 mdev = alloc_netdev_mq(sizeof(struct ieee80211_master_priv), 884 mdev = alloc_netdev_mq(sizeof(struct ieee80211_master_priv),
874 "wmaster%d", ieee80211_master_setup, 885 "wmaster%d", ieee80211_master_setup,
875 ieee80211_num_queues(hw)); 886 hw->queues);
876 if (!mdev) 887 if (!mdev)
877 goto fail_mdev_alloc; 888 goto fail_mdev_alloc;
878 889
@@ -916,6 +927,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
916 927
917 memcpy(local->mdev->dev_addr, local->hw.wiphy->perm_addr, ETH_ALEN); 928 memcpy(local->mdev->dev_addr, local->hw.wiphy->perm_addr, ETH_ALEN);
918 SET_NETDEV_DEV(local->mdev, wiphy_dev(local->hw.wiphy)); 929 SET_NETDEV_DEV(local->mdev, wiphy_dev(local->hw.wiphy));
930 local->mdev->features |= NETIF_F_NETNS_LOCAL;
919 931
920 result = register_netdevice(local->mdev); 932 result = register_netdevice(local->mdev);
921 if (result < 0) 933 if (result < 0)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index fbb766afe599..841b8450b3de 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -15,11 +15,8 @@
15#include <linux/if_ether.h> 15#include <linux/if_ether.h>
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <linux/if_arp.h> 17#include <linux/if_arp.h>
18#include <linux/wireless.h>
19#include <linux/random.h>
20#include <linux/etherdevice.h> 18#include <linux/etherdevice.h>
21#include <linux/rtnetlink.h> 19#include <linux/rtnetlink.h>
22#include <net/iw_handler.h>
23#include <net/mac80211.h> 20#include <net/mac80211.h>
24#include <asm/unaligned.h> 21#include <asm/unaligned.h>
25 22
@@ -35,15 +32,6 @@
35#define IEEE80211_MONITORING_INTERVAL (2 * HZ) 32#define IEEE80211_MONITORING_INTERVAL (2 * HZ)
36#define IEEE80211_PROBE_INTERVAL (60 * HZ) 33#define IEEE80211_PROBE_INTERVAL (60 * HZ)
37#define IEEE80211_RETRY_AUTH_INTERVAL (1 * HZ) 34#define IEEE80211_RETRY_AUTH_INTERVAL (1 * HZ)
38#define IEEE80211_SCAN_INTERVAL (2 * HZ)
39#define IEEE80211_SCAN_INTERVAL_SLOW (15 * HZ)
40#define IEEE80211_IBSS_JOIN_TIMEOUT (7 * HZ)
41
42#define IEEE80211_IBSS_MERGE_INTERVAL (30 * HZ)
43#define IEEE80211_IBSS_INACTIVITY_LIMIT (60 * HZ)
44
45#define IEEE80211_IBSS_MAX_STA_ENTRIES 128
46
47 35
48/* utils */ 36/* utils */
49static int ecw2cw(int ecw) 37static int ecw2cw(int ecw)
@@ -92,43 +80,6 @@ static int ieee80211_compatible_rates(struct ieee80211_bss *bss,
92 return count; 80 return count;
93} 81}
94 82
95/* also used by mesh code */
96u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
97 struct ieee802_11_elems *elems,
98 enum ieee80211_band band)
99{
100 struct ieee80211_supported_band *sband;
101 struct ieee80211_rate *bitrates;
102 size_t num_rates;
103 u32 supp_rates;
104 int i, j;
105 sband = local->hw.wiphy->bands[band];
106
107 if (!sband) {
108 WARN_ON(1);
109 sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
110 }
111
112 bitrates = sband->bitrates;
113 num_rates = sband->n_bitrates;
114 supp_rates = 0;
115 for (i = 0; i < elems->supp_rates_len +
116 elems->ext_supp_rates_len; i++) {
117 u8 rate = 0;
118 int own_rate;
119 if (i < elems->supp_rates_len)
120 rate = elems->supp_rates[i];
121 else if (elems->ext_supp_rates)
122 rate = elems->ext_supp_rates
123 [i - elems->supp_rates_len];
124 own_rate = 5 * (rate & 0x7f);
125 for (j = 0; j < num_rates; j++)
126 if (bitrates[j].bitrate == own_rate)
127 supp_rates |= BIT(j);
128 }
129 return supp_rates;
130}
131
132/* frame sending functions */ 83/* frame sending functions */
133 84
134static void add_extra_ies(struct sk_buff *skb, u8 *ies, size_t ies_len) 85static void add_extra_ies(struct sk_buff *skb, u8 *ies, size_t ies_len)
@@ -137,113 +88,9 @@ static void add_extra_ies(struct sk_buff *skb, u8 *ies, size_t ies_len)
137 memcpy(skb_put(skb, ies_len), ies, ies_len); 88 memcpy(skb_put(skb, ies_len), ies, ies_len);
138} 89}
139 90
140/* also used by scanning code */ 91static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
141void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
142 u8 *ssid, size_t ssid_len)
143{
144 struct ieee80211_local *local = sdata->local;
145 struct ieee80211_supported_band *sband;
146 struct sk_buff *skb;
147 struct ieee80211_mgmt *mgmt;
148 u8 *pos, *supp_rates, *esupp_rates = NULL;
149 int i;
150
151 skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 +
152 sdata->u.sta.ie_probereq_len);
153 if (!skb) {
154 printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
155 "request\n", sdata->dev->name);
156 return;
157 }
158 skb_reserve(skb, local->hw.extra_tx_headroom);
159
160 mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
161 memset(mgmt, 0, 24);
162 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
163 IEEE80211_STYPE_PROBE_REQ);
164 memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
165 if (dst) {
166 memcpy(mgmt->da, dst, ETH_ALEN);
167 memcpy(mgmt->bssid, dst, ETH_ALEN);
168 } else {
169 memset(mgmt->da, 0xff, ETH_ALEN);
170 memset(mgmt->bssid, 0xff, ETH_ALEN);
171 }
172 pos = skb_put(skb, 2 + ssid_len);
173 *pos++ = WLAN_EID_SSID;
174 *pos++ = ssid_len;
175 memcpy(pos, ssid, ssid_len);
176
177 supp_rates = skb_put(skb, 2);
178 supp_rates[0] = WLAN_EID_SUPP_RATES;
179 supp_rates[1] = 0;
180 sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
181
182 for (i = 0; i < sband->n_bitrates; i++) {
183 struct ieee80211_rate *rate = &sband->bitrates[i];
184 if (esupp_rates) {
185 pos = skb_put(skb, 1);
186 esupp_rates[1]++;
187 } else if (supp_rates[1] == 8) {
188 esupp_rates = skb_put(skb, 3);
189 esupp_rates[0] = WLAN_EID_EXT_SUPP_RATES;
190 esupp_rates[1] = 1;
191 pos = &esupp_rates[2];
192 } else {
193 pos = skb_put(skb, 1);
194 supp_rates[1]++;
195 }
196 *pos = rate->bitrate / 5;
197 }
198
199 add_extra_ies(skb, sdata->u.sta.ie_probereq,
200 sdata->u.sta.ie_probereq_len);
201
202 ieee80211_tx_skb(sdata, skb, 0);
203}
204
205static void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
206 struct ieee80211_if_sta *ifsta,
207 int transaction, u8 *extra, size_t extra_len,
208 int encrypt)
209{
210 struct ieee80211_local *local = sdata->local;
211 struct sk_buff *skb;
212 struct ieee80211_mgmt *mgmt;
213
214 skb = dev_alloc_skb(local->hw.extra_tx_headroom +
215 sizeof(*mgmt) + 6 + extra_len +
216 sdata->u.sta.ie_auth_len);
217 if (!skb) {
218 printk(KERN_DEBUG "%s: failed to allocate buffer for auth "
219 "frame\n", sdata->dev->name);
220 return;
221 }
222 skb_reserve(skb, local->hw.extra_tx_headroom);
223
224 mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
225 memset(mgmt, 0, 24 + 6);
226 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
227 IEEE80211_STYPE_AUTH);
228 if (encrypt)
229 mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
230 memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
231 memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
232 memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
233 mgmt->u.auth.auth_alg = cpu_to_le16(ifsta->auth_alg);
234 mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
235 ifsta->auth_transaction = transaction + 1;
236 mgmt->u.auth.status_code = cpu_to_le16(0);
237 if (extra)
238 memcpy(skb_put(skb, extra_len), extra, extra_len);
239 add_extra_ies(skb, sdata->u.sta.ie_auth, sdata->u.sta.ie_auth_len);
240
241 ieee80211_tx_skb(sdata, skb, encrypt);
242}
243
-static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
-				 struct ieee80211_if_sta *ifsta)
 {
+	struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
 	struct ieee80211_local *local = sdata->local;
 	struct sk_buff *skb;
 	struct ieee80211_mgmt *mgmt;
@@ -256,17 +103,17 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 	u32 rates = 0;
 	size_t e_ies_len;
 
-	if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) {
-		e_ies = sdata->u.sta.ie_reassocreq;
-		e_ies_len = sdata->u.sta.ie_reassocreq_len;
+	if (ifmgd->flags & IEEE80211_IBSS_PREV_BSSID_SET) {
+		e_ies = sdata->u.mgd.ie_reassocreq;
+		e_ies_len = sdata->u.mgd.ie_reassocreq_len;
 	} else {
-		e_ies = sdata->u.sta.ie_assocreq;
-		e_ies_len = sdata->u.sta.ie_assocreq_len;
+		e_ies = sdata->u.mgd.ie_assocreq;
+		e_ies_len = sdata->u.mgd.ie_assocreq_len;
 	}
 
 	skb = dev_alloc_skb(local->hw.extra_tx_headroom +
-			    sizeof(*mgmt) + 200 + ifsta->extra_ie_len +
-			    ifsta->ssid_len + e_ies_len);
+			    sizeof(*mgmt) + 200 + ifmgd->extra_ie_len +
+			    ifmgd->ssid_len + e_ies_len);
 	if (!skb) {
 		printk(KERN_DEBUG "%s: failed to allocate buffer for assoc "
 		       "frame\n", sdata->dev->name);
@@ -276,7 +123,7 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 
 	sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
 
-	capab = ifsta->capab;
+	capab = ifmgd->capab;
 
 	if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ) {
 		if (!(local->hw.flags & IEEE80211_HW_2GHZ_SHORT_SLOT_INCAPABLE))
@@ -285,9 +132,9 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 			capab |= WLAN_CAPABILITY_SHORT_PREAMBLE;
 	}
 
-	bss = ieee80211_rx_bss_get(local, ifsta->bssid,
+	bss = ieee80211_rx_bss_get(local, ifmgd->bssid,
 				   local->hw.conf.channel->center_freq,
-				   ifsta->ssid, ifsta->ssid_len);
+				   ifmgd->ssid, ifmgd->ssid_len);
 	if (bss) {
 		if (bss->cbss.capability & WLAN_CAPABILITY_PRIVACY)
 			capab |= WLAN_CAPABILITY_PRIVACY;
@@ -312,18 +159,18 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 
 	mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
 	memset(mgmt, 0, 24);
-	memcpy(mgmt->da, ifsta->bssid, ETH_ALEN);
+	memcpy(mgmt->da, ifmgd->bssid, ETH_ALEN);
 	memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
-	memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
+	memcpy(mgmt->bssid, ifmgd->bssid, ETH_ALEN);
 
-	if (ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) {
+	if (ifmgd->flags & IEEE80211_STA_PREV_BSSID_SET) {
 		skb_put(skb, 10);
 		mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
 						  IEEE80211_STYPE_REASSOC_REQ);
 		mgmt->u.reassoc_req.capab_info = cpu_to_le16(capab);
 		mgmt->u.reassoc_req.listen_interval =
 				cpu_to_le16(local->hw.conf.listen_interval);
-		memcpy(mgmt->u.reassoc_req.current_ap, ifsta->prev_bssid,
+		memcpy(mgmt->u.reassoc_req.current_ap, ifmgd->prev_bssid,
 		       ETH_ALEN);
 	} else {
 		skb_put(skb, 4);
@@ -335,10 +182,10 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 	}
 
 	/* SSID */
-	ies = pos = skb_put(skb, 2 + ifsta->ssid_len);
+	ies = pos = skb_put(skb, 2 + ifmgd->ssid_len);
 	*pos++ = WLAN_EID_SSID;
-	*pos++ = ifsta->ssid_len;
-	memcpy(pos, ifsta->ssid, ifsta->ssid_len);
+	*pos++ = ifmgd->ssid_len;
+	memcpy(pos, ifmgd->ssid, ifmgd->ssid_len);
 
 	/* add all rates which were marked to be used above */
 	supp_rates_len = rates_len;
@@ -393,12 +240,12 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 		}
 	}
 
-	if (ifsta->extra_ie) {
-		pos = skb_put(skb, ifsta->extra_ie_len);
-		memcpy(pos, ifsta->extra_ie, ifsta->extra_ie_len);
+	if (ifmgd->extra_ie) {
+		pos = skb_put(skb, ifmgd->extra_ie_len);
+		memcpy(pos, ifmgd->extra_ie, ifmgd->extra_ie_len);
 	}
 
-	if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) {
+	if (wmm && (ifmgd->flags & IEEE80211_STA_WMM_ENABLED)) {
 		pos = skb_put(skb, 9);
 		*pos++ = WLAN_EID_VENDOR_SPECIFIC;
 		*pos++ = 7; /* len */
@@ -418,11 +265,11 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 	 * mode (11a/b/g) if any one of these ciphers is
 	 * configured as pairwise.
 	 */
-	if (wmm && (ifsta->flags & IEEE80211_STA_WMM_ENABLED) &&
+	if (wmm && (ifmgd->flags & IEEE80211_STA_WMM_ENABLED) &&
 	    sband->ht_cap.ht_supported &&
 	    (ht_ie = ieee80211_bss_get_ie(bss, WLAN_EID_HT_INFORMATION)) &&
 	    ht_ie[1] >= sizeof(struct ieee80211_ht_info) &&
-	    (!(ifsta->flags & IEEE80211_STA_TKIP_WEP_USED))) {
+	    (!(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED))) {
 		struct ieee80211_ht_info *ht_info =
 			(struct ieee80211_ht_info *)(ht_ie + 2);
 		u16 cap = sband->ht_cap.cap;
@@ -459,11 +306,11 @@ static void ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata,
 
 	add_extra_ies(skb, e_ies, e_ies_len);
 
-	kfree(ifsta->assocreq_ies);
-	ifsta->assocreq_ies_len = (skb->data + skb->len) - ies;
-	ifsta->assocreq_ies = kmalloc(ifsta->assocreq_ies_len, GFP_KERNEL);
-	if (ifsta->assocreq_ies)
-		memcpy(ifsta->assocreq_ies, ies, ifsta->assocreq_ies_len);
+	kfree(ifmgd->assocreq_ies);
+	ifmgd->assocreq_ies_len = (skb->data + skb->len) - ies;
+	ifmgd->assocreq_ies = kmalloc(ifmgd->assocreq_ies_len, GFP_KERNEL);
+	if (ifmgd->assocreq_ies)
+		memcpy(ifmgd->assocreq_ies, ies, ifmgd->assocreq_ies_len);
 
 	ieee80211_tx_skb(sdata, skb, 0);
 }
@@ -473,18 +320,18 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
473 u16 stype, u16 reason) 320 u16 stype, u16 reason)
474{ 321{
475 struct ieee80211_local *local = sdata->local; 322 struct ieee80211_local *local = sdata->local;
476 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 323 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
477 struct sk_buff *skb; 324 struct sk_buff *skb;
478 struct ieee80211_mgmt *mgmt; 325 struct ieee80211_mgmt *mgmt;
479 u8 *ies; 326 u8 *ies;
480 size_t ies_len; 327 size_t ies_len;
481 328
482 if (stype == IEEE80211_STYPE_DEAUTH) { 329 if (stype == IEEE80211_STYPE_DEAUTH) {
483 ies = sdata->u.sta.ie_deauth; 330 ies = sdata->u.mgd.ie_deauth;
484 ies_len = sdata->u.sta.ie_deauth_len; 331 ies_len = sdata->u.mgd.ie_deauth_len;
485 } else { 332 } else {
486 ies = sdata->u.sta.ie_disassoc; 333 ies = sdata->u.mgd.ie_disassoc;
487 ies_len = sdata->u.sta.ie_disassoc_len; 334 ies_len = sdata->u.mgd.ie_disassoc_len;
488 } 335 }
489 336
490 skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 337 skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) +
@@ -498,9 +345,9 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
498 345
499 mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24); 346 mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
500 memset(mgmt, 0, 24); 347 memset(mgmt, 0, 24);
501 memcpy(mgmt->da, ifsta->bssid, ETH_ALEN); 348 memcpy(mgmt->da, ifmgd->bssid, ETH_ALEN);
502 memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN); 349 memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
503 memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN); 350 memcpy(mgmt->bssid, ifmgd->bssid, ETH_ALEN);
504 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype); 351 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | stype);
505 skb_put(skb, 2); 352 skb_put(skb, 2);
506 /* u.deauth.reason_code == u.disassoc.reason_code */ 353 /* u.deauth.reason_code == u.disassoc.reason_code */
@@ -508,13 +355,13 @@ static void ieee80211_send_deauth_disassoc(struct ieee80211_sub_if_data *sdata,
508 355
509 add_extra_ies(skb, ies, ies_len); 356 add_extra_ies(skb, ies, ies_len);
510 357
511 ieee80211_tx_skb(sdata, skb, ifsta->flags & IEEE80211_STA_MFP_ENABLED); 358 ieee80211_tx_skb(sdata, skb, ifmgd->flags & IEEE80211_STA_MFP_ENABLED);
512} 359}
513 360
514void ieee80211_send_pspoll(struct ieee80211_local *local, 361void ieee80211_send_pspoll(struct ieee80211_local *local,
515 struct ieee80211_sub_if_data *sdata) 362 struct ieee80211_sub_if_data *sdata)
516{ 363{
517 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 364 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
518 struct ieee80211_pspoll *pspoll; 365 struct ieee80211_pspoll *pspoll;
519 struct sk_buff *skb; 366 struct sk_buff *skb;
520 u16 fc; 367 u16 fc;
@@ -531,43 +378,20 @@ void ieee80211_send_pspoll(struct ieee80211_local *local,
531 memset(pspoll, 0, sizeof(*pspoll)); 378 memset(pspoll, 0, sizeof(*pspoll));
532 fc = IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL | IEEE80211_FCTL_PM; 379 fc = IEEE80211_FTYPE_CTL | IEEE80211_STYPE_PSPOLL | IEEE80211_FCTL_PM;
533 pspoll->frame_control = cpu_to_le16(fc); 380 pspoll->frame_control = cpu_to_le16(fc);
534 pspoll->aid = cpu_to_le16(ifsta->aid); 381 pspoll->aid = cpu_to_le16(ifmgd->aid);
535 382
536 /* aid in PS-Poll has its two MSBs each set to 1 */ 383 /* aid in PS-Poll has its two MSBs each set to 1 */
537 pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14); 384 pspoll->aid |= cpu_to_le16(1 << 15 | 1 << 14);
538 385
539 memcpy(pspoll->bssid, ifsta->bssid, ETH_ALEN); 386 memcpy(pspoll->bssid, ifmgd->bssid, ETH_ALEN);
540 memcpy(pspoll->ta, sdata->dev->dev_addr, ETH_ALEN); 387 memcpy(pspoll->ta, sdata->dev->dev_addr, ETH_ALEN);
541 388
542 ieee80211_tx_skb(sdata, skb, 0); 389 ieee80211_tx_skb(sdata, skb, 0);
543
544 return;
545} 390}
546 391
547/* MLME */ 392/* MLME */
548static void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
549 const size_t supp_rates_len,
550 const u8 *supp_rates)
551{
552 struct ieee80211_local *local = sdata->local;
553 int i, have_higher_than_11mbit = 0;
554
555 /* cf. IEEE 802.11 9.2.12 */
556 for (i = 0; i < supp_rates_len; i++)
557 if ((supp_rates[i] & 0x7f) * 5 > 110)
558 have_higher_than_11mbit = 1;
559
560 if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
561 have_higher_than_11mbit)
562 sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
563 else
564 sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
565
566 ieee80211_set_wmm_default(sdata);
567}
568
569static void ieee80211_sta_wmm_params(struct ieee80211_local *local, 393static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
570 struct ieee80211_if_sta *ifsta, 394 struct ieee80211_if_managed *ifmgd,
571 u8 *wmm_param, size_t wmm_param_len) 395 u8 *wmm_param, size_t wmm_param_len)
572{ 396{
573 struct ieee80211_tx_queue_params params; 397 struct ieee80211_tx_queue_params params;
@@ -575,7 +399,7 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
575 int count; 399 int count;
576 u8 *pos; 400 u8 *pos;
577 401
578 if (!(ifsta->flags & IEEE80211_STA_WMM_ENABLED)) 402 if (!(ifmgd->flags & IEEE80211_STA_WMM_ENABLED))
579 return; 403 return;
580 404
581 if (!wmm_param) 405 if (!wmm_param)
@@ -584,18 +408,15 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
584 if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1) 408 if (wmm_param_len < 8 || wmm_param[5] /* version */ != 1)
585 return; 409 return;
586 count = wmm_param[6] & 0x0f; 410 count = wmm_param[6] & 0x0f;
587 if (count == ifsta->wmm_last_param_set) 411 if (count == ifmgd->wmm_last_param_set)
588 return; 412 return;
589 ifsta->wmm_last_param_set = count; 413 ifmgd->wmm_last_param_set = count;
590 414
591 pos = wmm_param + 8; 415 pos = wmm_param + 8;
592 left = wmm_param_len - 8; 416 left = wmm_param_len - 8;
593 417
594 memset(&params, 0, sizeof(params)); 418 memset(&params, 0, sizeof(params));
595 419
596 if (!local->ops->conf_tx)
597 return;
598
599 local->wmm_acm = 0; 420 local->wmm_acm = 0;
600 for (; left >= 4; left -= 4, pos += 4) { 421 for (; left >= 4; left -= 4, pos += 4) {
601 int aci = (pos[0] >> 5) & 0x03; 422 int aci = (pos[0] >> 5) & 0x03;
@@ -603,26 +424,26 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
 		int queue;
 
 		switch (aci) {
-		case 1:
+		case 1: /* AC_BK */
 			queue = 3;
 			if (acm)
-				local->wmm_acm |= BIT(0) | BIT(3);
+				local->wmm_acm |= BIT(1) | BIT(2); /* BK/- */
 			break;
-		case 2:
+		case 2: /* AC_VI */
 			queue = 1;
 			if (acm)
-				local->wmm_acm |= BIT(4) | BIT(5);
+				local->wmm_acm |= BIT(4) | BIT(5); /* CL/VI */
 			break;
-		case 3:
+		case 3: /* AC_VO */
 			queue = 0;
 			if (acm)
-				local->wmm_acm |= BIT(6) | BIT(7);
+				local->wmm_acm |= BIT(6) | BIT(7); /* VO/NC */
 			break;
-		case 0:
+		case 0: /* AC_BE */
 		default:
 			queue = 2;
 			if (acm)
-				local->wmm_acm |= BIT(1) | BIT(2);
+				local->wmm_acm |= BIT(0) | BIT(3); /* BE/EE */
 			break;
 		}
 
@@ -636,9 +457,8 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local,
 		       local->mdev->name, queue, aci, acm, params.aifs, params.cw_min,
 		       params.cw_max, params.txop);
 #endif
-		/* TODO: handle ACM (block TX, fallback to next lowest allowed
-		 * AC for now) */
-		if (local->ops->conf_tx(local_to_hw(local), queue, &params)) {
+		if (local->ops->conf_tx &&
+		    local->ops->conf_tx(local_to_hw(local), queue, &params)) {
 			printk(KERN_DEBUG "%s: failed to set TX queue "
 			       "parameters for queue %d\n", local->mdev->name, queue);
 		}
@@ -671,7 +491,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
671{ 491{
672 struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; 492 struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
673#ifdef CONFIG_MAC80211_VERBOSE_DEBUG 493#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
674 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 494 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
675#endif 495#endif
676 u32 changed = 0; 496 u32 changed = 0;
677 bool use_protection; 497 bool use_protection;
@@ -694,7 +514,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
694 printk(KERN_DEBUG "%s: CTS protection %s (BSSID=%pM)\n", 514 printk(KERN_DEBUG "%s: CTS protection %s (BSSID=%pM)\n",
695 sdata->dev->name, 515 sdata->dev->name,
696 use_protection ? "enabled" : "disabled", 516 use_protection ? "enabled" : "disabled",
697 ifsta->bssid); 517 ifmgd->bssid);
698 } 518 }
699#endif 519#endif
700 bss_conf->use_cts_prot = use_protection; 520 bss_conf->use_cts_prot = use_protection;
@@ -708,7 +528,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
708 " (BSSID=%pM)\n", 528 " (BSSID=%pM)\n",
709 sdata->dev->name, 529 sdata->dev->name,
710 use_short_preamble ? "short" : "long", 530 use_short_preamble ? "short" : "long",
711 ifsta->bssid); 531 ifmgd->bssid);
712 } 532 }
713#endif 533#endif
714 bss_conf->use_short_preamble = use_short_preamble; 534 bss_conf->use_short_preamble = use_short_preamble;
@@ -722,7 +542,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
722 " (BSSID=%pM)\n", 542 " (BSSID=%pM)\n",
723 sdata->dev->name, 543 sdata->dev->name,
724 use_short_slot ? "short" : "long", 544 use_short_slot ? "short" : "long",
725 ifsta->bssid); 545 ifmgd->bssid);
726 } 546 }
727#endif 547#endif
728 bss_conf->use_short_slot = use_short_slot; 548 bss_conf->use_short_slot = use_short_slot;
@@ -732,57 +552,57 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata,
732 return changed; 552 return changed;
733} 553}
734 554
735static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata, 555static void ieee80211_sta_send_apinfo(struct ieee80211_sub_if_data *sdata)
736 struct ieee80211_if_sta *ifsta)
737{ 556{
738 union iwreq_data wrqu; 557 union iwreq_data wrqu;
558
739 memset(&wrqu, 0, sizeof(wrqu)); 559 memset(&wrqu, 0, sizeof(wrqu));
740 if (ifsta->flags & IEEE80211_STA_ASSOCIATED) 560 if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED)
741 memcpy(wrqu.ap_addr.sa_data, sdata->u.sta.bssid, ETH_ALEN); 561 memcpy(wrqu.ap_addr.sa_data, sdata->u.mgd.bssid, ETH_ALEN);
742 wrqu.ap_addr.sa_family = ARPHRD_ETHER; 562 wrqu.ap_addr.sa_family = ARPHRD_ETHER;
743 wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL); 563 wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
744} 564}
745 565
746static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata, 566static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata)
747 struct ieee80211_if_sta *ifsta)
748{ 567{
568 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
749 char *buf; 569 char *buf;
750 size_t len; 570 size_t len;
751 int i; 571 int i;
752 union iwreq_data wrqu; 572 union iwreq_data wrqu;
753 573
754 if (!ifsta->assocreq_ies && !ifsta->assocresp_ies) 574 if (!ifmgd->assocreq_ies && !ifmgd->assocresp_ies)
755 return; 575 return;
756 576
757 buf = kmalloc(50 + 2 * (ifsta->assocreq_ies_len + 577 buf = kmalloc(50 + 2 * (ifmgd->assocreq_ies_len +
758 ifsta->assocresp_ies_len), GFP_KERNEL); 578 ifmgd->assocresp_ies_len), GFP_KERNEL);
759 if (!buf) 579 if (!buf)
760 return; 580 return;
761 581
762 len = sprintf(buf, "ASSOCINFO("); 582 len = sprintf(buf, "ASSOCINFO(");
763 if (ifsta->assocreq_ies) { 583 if (ifmgd->assocreq_ies) {
764 len += sprintf(buf + len, "ReqIEs="); 584 len += sprintf(buf + len, "ReqIEs=");
765 for (i = 0; i < ifsta->assocreq_ies_len; i++) { 585 for (i = 0; i < ifmgd->assocreq_ies_len; i++) {
766 len += sprintf(buf + len, "%02x", 586 len += sprintf(buf + len, "%02x",
767 ifsta->assocreq_ies[i]); 587 ifmgd->assocreq_ies[i]);
768 } 588 }
769 } 589 }
770 if (ifsta->assocresp_ies) { 590 if (ifmgd->assocresp_ies) {
771 if (ifsta->assocreq_ies) 591 if (ifmgd->assocreq_ies)
772 len += sprintf(buf + len, " "); 592 len += sprintf(buf + len, " ");
773 len += sprintf(buf + len, "RespIEs="); 593 len += sprintf(buf + len, "RespIEs=");
774 for (i = 0; i < ifsta->assocresp_ies_len; i++) { 594 for (i = 0; i < ifmgd->assocresp_ies_len; i++) {
775 len += sprintf(buf + len, "%02x", 595 len += sprintf(buf + len, "%02x",
776 ifsta->assocresp_ies[i]); 596 ifmgd->assocresp_ies[i]);
777 } 597 }
778 } 598 }
779 len += sprintf(buf + len, ")"); 599 len += sprintf(buf + len, ")");
780 600
781 if (len > IW_CUSTOM_MAX) { 601 if (len > IW_CUSTOM_MAX) {
782 len = sprintf(buf, "ASSOCRESPIE="); 602 len = sprintf(buf, "ASSOCRESPIE=");
783 for (i = 0; i < ifsta->assocresp_ies_len; i++) { 603 for (i = 0; i < ifmgd->assocresp_ies_len; i++) {
784 len += sprintf(buf + len, "%02x", 604 len += sprintf(buf + len, "%02x",
785 ifsta->assocresp_ies[i]); 605 ifmgd->assocresp_ies[i]);
786 } 606 }
787 } 607 }
788 608
@@ -797,20 +617,20 @@ static void ieee80211_sta_send_associnfo(struct ieee80211_sub_if_data *sdata,
797 617
798 618
799static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, 619static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
800 struct ieee80211_if_sta *ifsta,
801 u32 bss_info_changed) 620 u32 bss_info_changed)
802{ 621{
622 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
803 struct ieee80211_local *local = sdata->local; 623 struct ieee80211_local *local = sdata->local;
804 struct ieee80211_conf *conf = &local_to_hw(local)->conf; 624 struct ieee80211_conf *conf = &local_to_hw(local)->conf;
805 625
806 struct ieee80211_bss *bss; 626 struct ieee80211_bss *bss;
807 627
808 bss_info_changed |= BSS_CHANGED_ASSOC; 628 bss_info_changed |= BSS_CHANGED_ASSOC;
809 ifsta->flags |= IEEE80211_STA_ASSOCIATED; 629 ifmgd->flags |= IEEE80211_STA_ASSOCIATED;
810 630
811 bss = ieee80211_rx_bss_get(local, ifsta->bssid, 631 bss = ieee80211_rx_bss_get(local, ifmgd->bssid,
812 conf->channel->center_freq, 632 conf->channel->center_freq,
813 ifsta->ssid, ifsta->ssid_len); 633 ifmgd->ssid, ifmgd->ssid_len);
814 if (bss) { 634 if (bss) {
815 /* set timing information */ 635 /* set timing information */
816 sdata->vif.bss_conf.beacon_int = bss->cbss.beacon_interval; 636 sdata->vif.bss_conf.beacon_int = bss->cbss.beacon_interval;
@@ -823,11 +643,11 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
823 ieee80211_rx_bss_put(local, bss); 643 ieee80211_rx_bss_put(local, bss);
824 } 644 }
825 645
826 ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET; 646 ifmgd->flags |= IEEE80211_STA_PREV_BSSID_SET;
827 memcpy(ifsta->prev_bssid, sdata->u.sta.bssid, ETH_ALEN); 647 memcpy(ifmgd->prev_bssid, sdata->u.mgd.bssid, ETH_ALEN);
828 ieee80211_sta_send_associnfo(sdata, ifsta); 648 ieee80211_sta_send_associnfo(sdata);
829 649
830 ifsta->last_probe = jiffies; 650 ifmgd->last_probe = jiffies;
831 ieee80211_led_assoc(local, 1); 651 ieee80211_led_assoc(local, 1);
832 652
833 sdata->vif.bss_conf.assoc = 1; 653 sdata->vif.bss_conf.assoc = 1;
@@ -856,70 +676,74 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
856 netif_tx_start_all_queues(sdata->dev); 676 netif_tx_start_all_queues(sdata->dev);
857 netif_carrier_on(sdata->dev); 677 netif_carrier_on(sdata->dev);
858 678
859 ieee80211_sta_send_apinfo(sdata, ifsta); 679 ieee80211_sta_send_apinfo(sdata);
860} 680}
861 681
862static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata, 682static void ieee80211_direct_probe(struct ieee80211_sub_if_data *sdata)
863 struct ieee80211_if_sta *ifsta)
864{ 683{
865 ifsta->direct_probe_tries++; 684 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
866 if (ifsta->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) { 685
686 ifmgd->direct_probe_tries++;
687 if (ifmgd->direct_probe_tries > IEEE80211_AUTH_MAX_TRIES) {
867 printk(KERN_DEBUG "%s: direct probe to AP %pM timed out\n", 688 printk(KERN_DEBUG "%s: direct probe to AP %pM timed out\n",
868 sdata->dev->name, ifsta->bssid); 689 sdata->dev->name, ifmgd->bssid);
869 ifsta->state = IEEE80211_STA_MLME_DISABLED; 690 ifmgd->state = IEEE80211_STA_MLME_DISABLED;
870 ieee80211_sta_send_apinfo(sdata, ifsta); 691 ieee80211_sta_send_apinfo(sdata);
871 692
872 /* 693 /*
873 * Most likely AP is not in the range so remove the 694 * Most likely AP is not in the range so remove the
874 * bss information associated to the AP 695 * bss information associated to the AP
875 */ 696 */
876 ieee80211_rx_bss_remove(sdata, ifsta->bssid, 697 ieee80211_rx_bss_remove(sdata, ifmgd->bssid,
877 sdata->local->hw.conf.channel->center_freq, 698 sdata->local->hw.conf.channel->center_freq,
878 ifsta->ssid, ifsta->ssid_len); 699 ifmgd->ssid, ifmgd->ssid_len);
879 return; 700 return;
880 } 701 }
881 702
882 printk(KERN_DEBUG "%s: direct probe to AP %pM try %d\n", 703 printk(KERN_DEBUG "%s: direct probe to AP %pM try %d\n",
883 sdata->dev->name, ifsta->bssid, 704 sdata->dev->name, ifmgd->bssid,
884 ifsta->direct_probe_tries); 705 ifmgd->direct_probe_tries);
885 706
886 ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE; 707 ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE;
887 708
888 set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifsta->request); 709 set_bit(IEEE80211_STA_REQ_DIRECT_PROBE, &ifmgd->request);
889 710
890 /* Direct probe is sent to broadcast address as some APs 711 /* Direct probe is sent to broadcast address as some APs
891 * will not answer to direct packet in unassociated state. 712 * will not answer to direct packet in unassociated state.
892 */ 713 */
893 ieee80211_send_probe_req(sdata, NULL, 714 ieee80211_send_probe_req(sdata, NULL,
894 ifsta->ssid, ifsta->ssid_len); 715 ifmgd->ssid, ifmgd->ssid_len, NULL, 0);
895 716
896 mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT); 717 mod_timer(&ifmgd->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
897} 718}
898 719
899 720
900static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata, 721static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata)
901 struct ieee80211_if_sta *ifsta)
902{ 722{
903 ifsta->auth_tries++; 723 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
904 if (ifsta->auth_tries > IEEE80211_AUTH_MAX_TRIES) { 724
725 ifmgd->auth_tries++;
726 if (ifmgd->auth_tries > IEEE80211_AUTH_MAX_TRIES) {
905 printk(KERN_DEBUG "%s: authentication with AP %pM" 727 printk(KERN_DEBUG "%s: authentication with AP %pM"
906 " timed out\n", 728 " timed out\n",
907 sdata->dev->name, ifsta->bssid); 729 sdata->dev->name, ifmgd->bssid);
908 ifsta->state = IEEE80211_STA_MLME_DISABLED; 730 ifmgd->state = IEEE80211_STA_MLME_DISABLED;
909 ieee80211_sta_send_apinfo(sdata, ifsta); 731 ieee80211_sta_send_apinfo(sdata);
910 ieee80211_rx_bss_remove(sdata, ifsta->bssid, 732 ieee80211_rx_bss_remove(sdata, ifmgd->bssid,
911 sdata->local->hw.conf.channel->center_freq, 733 sdata->local->hw.conf.channel->center_freq,
912 ifsta->ssid, ifsta->ssid_len); 734 ifmgd->ssid, ifmgd->ssid_len);
913 return; 735 return;
914 } 736 }
915 737
916 ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE; 738 ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE;
917 printk(KERN_DEBUG "%s: authenticate with AP %pM\n", 739 printk(KERN_DEBUG "%s: authenticate with AP %pM\n",
918 sdata->dev->name, ifsta->bssid); 740 sdata->dev->name, ifmgd->bssid);
919 741
920 ieee80211_send_auth(sdata, ifsta, 1, NULL, 0, 0); 742 ieee80211_send_auth(sdata, 1, ifmgd->auth_alg, NULL, 0,
743 ifmgd->bssid, 0);
744 ifmgd->auth_transaction = 2;
921 745
922 mod_timer(&ifsta->timer, jiffies + IEEE80211_AUTH_TIMEOUT); 746 mod_timer(&ifmgd->timer, jiffies + IEEE80211_AUTH_TIMEOUT);
923} 747}
924 748
925/* 749/*
@@ -927,27 +751,28 @@ static void ieee80211_authenticate(struct ieee80211_sub_if_data *sdata,
927 * if self disconnected or a reason code from the AP. 751 * if self disconnected or a reason code from the AP.
928 */ 752 */
929static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, 753static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
930 struct ieee80211_if_sta *ifsta, bool deauth, 754 bool deauth, bool self_disconnected,
931 bool self_disconnected, u16 reason) 755 u16 reason)
932{ 756{
757 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
933 struct ieee80211_local *local = sdata->local; 758 struct ieee80211_local *local = sdata->local;
934 struct sta_info *sta; 759 struct sta_info *sta;
935 u32 changed = 0, config_changed = 0; 760 u32 changed = 0, config_changed = 0;
936 761
937 rcu_read_lock(); 762 rcu_read_lock();
938 763
939 sta = sta_info_get(local, ifsta->bssid); 764 sta = sta_info_get(local, ifmgd->bssid);
940 if (!sta) { 765 if (!sta) {
941 rcu_read_unlock(); 766 rcu_read_unlock();
942 return; 767 return;
943 } 768 }
944 769
945 if (deauth) { 770 if (deauth) {
946 ifsta->direct_probe_tries = 0; 771 ifmgd->direct_probe_tries = 0;
947 ifsta->auth_tries = 0; 772 ifmgd->auth_tries = 0;
948 } 773 }
949 ifsta->assoc_scan_tries = 0; 774 ifmgd->assoc_scan_tries = 0;
950 ifsta->assoc_tries = 0; 775 ifmgd->assoc_tries = 0;
951 776
952 netif_tx_stop_all_queues(sdata->dev); 777 netif_tx_stop_all_queues(sdata->dev);
953 netif_carrier_off(sdata->dev); 778 netif_carrier_off(sdata->dev);
@@ -963,20 +788,20 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
963 IEEE80211_STYPE_DISASSOC, reason); 788 IEEE80211_STYPE_DISASSOC, reason);
964 } 789 }
965 790
966 ifsta->flags &= ~IEEE80211_STA_ASSOCIATED; 791 ifmgd->flags &= ~IEEE80211_STA_ASSOCIATED;
967 changed |= ieee80211_reset_erp_info(sdata); 792 changed |= ieee80211_reset_erp_info(sdata);
968 793
969 ieee80211_led_assoc(local, 0); 794 ieee80211_led_assoc(local, 0);
970 changed |= BSS_CHANGED_ASSOC; 795 changed |= BSS_CHANGED_ASSOC;
971 sdata->vif.bss_conf.assoc = false; 796 sdata->vif.bss_conf.assoc = false;
972 797
973 ieee80211_sta_send_apinfo(sdata, ifsta); 798 ieee80211_sta_send_apinfo(sdata);
974 799
975 if (self_disconnected || reason == WLAN_REASON_DISASSOC_STA_HAS_LEFT) { 800 if (self_disconnected || reason == WLAN_REASON_DISASSOC_STA_HAS_LEFT) {
976 ifsta->state = IEEE80211_STA_MLME_DISABLED; 801 ifmgd->state = IEEE80211_STA_MLME_DISABLED;
977 ieee80211_rx_bss_remove(sdata, ifsta->bssid, 802 ieee80211_rx_bss_remove(sdata, ifmgd->bssid,
978 sdata->local->hw.conf.channel->center_freq, 803 sdata->local->hw.conf.channel->center_freq,
979 ifsta->ssid, ifsta->ssid_len); 804 ifmgd->ssid, ifmgd->ssid_len);
980 } 805 }
981 806
982 rcu_read_unlock(); 807 rcu_read_unlock();
@@ -999,7 +824,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
999 824
1000 rcu_read_lock(); 825 rcu_read_lock();
1001 826
1002 sta = sta_info_get(local, ifsta->bssid); 827 sta = sta_info_get(local, ifmgd->bssid);
1003 if (!sta) { 828 if (!sta) {
1004 rcu_read_unlock(); 829 rcu_read_unlock();
1005 return; 830 return;
@@ -1020,27 +845,27 @@ static int ieee80211_sta_wep_configured(struct ieee80211_sub_if_data *sdata)
1020 return 1; 845 return 1;
1021} 846}
1022 847
1023static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata, 848static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata)
1024 struct ieee80211_if_sta *ifsta)
1025{ 849{
850 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
1026 struct ieee80211_local *local = sdata->local; 851 struct ieee80211_local *local = sdata->local;
1027 struct ieee80211_bss *bss; 852 struct ieee80211_bss *bss;
1028 int bss_privacy; 853 int bss_privacy;
1029 int wep_privacy; 854 int wep_privacy;
1030 int privacy_invoked; 855 int privacy_invoked;
1031 856
1032 if (!ifsta || (ifsta->flags & IEEE80211_STA_MIXED_CELL)) 857 if (!ifmgd || (ifmgd->flags & IEEE80211_STA_MIXED_CELL))
1033 return 0; 858 return 0;
1034 859
1035 bss = ieee80211_rx_bss_get(local, ifsta->bssid, 860 bss = ieee80211_rx_bss_get(local, ifmgd->bssid,
1036 local->hw.conf.channel->center_freq, 861 local->hw.conf.channel->center_freq,
1037 ifsta->ssid, ifsta->ssid_len); 862 ifmgd->ssid, ifmgd->ssid_len);
1038 if (!bss) 863 if (!bss)
1039 return 0; 864 return 0;
1040 865
1041 bss_privacy = !!(bss->cbss.capability & WLAN_CAPABILITY_PRIVACY); 866 bss_privacy = !!(bss->cbss.capability & WLAN_CAPABILITY_PRIVACY);
1042 wep_privacy = !!ieee80211_sta_wep_configured(sdata); 867 wep_privacy = !!ieee80211_sta_wep_configured(sdata);
1043 privacy_invoked = !!(ifsta->flags & IEEE80211_STA_PRIVACY_INVOKED); 868 privacy_invoked = !!(ifmgd->flags & IEEE80211_STA_PRIVACY_INVOKED);
1044 869
1045 ieee80211_rx_bss_put(local, bss); 870 ieee80211_rx_bss_put(local, bss);
1046 871
@@ -1050,41 +875,42 @@ static int ieee80211_privacy_mismatch(struct ieee80211_sub_if_data *sdata,
1050 return 1; 875 return 1;
1051} 876}
1052 877
1053static void ieee80211_associate(struct ieee80211_sub_if_data *sdata, 878static void ieee80211_associate(struct ieee80211_sub_if_data *sdata)
1054 struct ieee80211_if_sta *ifsta)
1055{ 879{
1056 ifsta->assoc_tries++; 880 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
1057 if (ifsta->assoc_tries > IEEE80211_ASSOC_MAX_TRIES) { 881
882 ifmgd->assoc_tries++;
883 if (ifmgd->assoc_tries > IEEE80211_ASSOC_MAX_TRIES) {
1058 printk(KERN_DEBUG "%s: association with AP %pM" 884 printk(KERN_DEBUG "%s: association with AP %pM"
1059 " timed out\n", 885 " timed out\n",
1060 sdata->dev->name, ifsta->bssid); 886 sdata->dev->name, ifmgd->bssid);
1061 ifsta->state = IEEE80211_STA_MLME_DISABLED; 887 ifmgd->state = IEEE80211_STA_MLME_DISABLED;
1062 ieee80211_sta_send_apinfo(sdata, ifsta); 888 ieee80211_sta_send_apinfo(sdata);
1063 ieee80211_rx_bss_remove(sdata, ifsta->bssid, 889 ieee80211_rx_bss_remove(sdata, ifmgd->bssid,
1064 sdata->local->hw.conf.channel->center_freq, 890 sdata->local->hw.conf.channel->center_freq,
1065 ifsta->ssid, ifsta->ssid_len); 891 ifmgd->ssid, ifmgd->ssid_len);
1066 return; 892 return;
1067 } 893 }
1068 894
1069 ifsta->state = IEEE80211_STA_MLME_ASSOCIATE; 895 ifmgd->state = IEEE80211_STA_MLME_ASSOCIATE;
1070 printk(KERN_DEBUG "%s: associate with AP %pM\n", 896 printk(KERN_DEBUG "%s: associate with AP %pM\n",
1071 sdata->dev->name, ifsta->bssid); 897 sdata->dev->name, ifmgd->bssid);
1072 if (ieee80211_privacy_mismatch(sdata, ifsta)) { 898 if (ieee80211_privacy_mismatch(sdata)) {
1073 printk(KERN_DEBUG "%s: mismatch in privacy configuration and " 899 printk(KERN_DEBUG "%s: mismatch in privacy configuration and "
1074 "mixed-cell disabled - abort association\n", sdata->dev->name); 900 "mixed-cell disabled - abort association\n", sdata->dev->name);
1075 ifsta->state = IEEE80211_STA_MLME_DISABLED; 901 ifmgd->state = IEEE80211_STA_MLME_DISABLED;
1076 return; 902 return;
1077 } 903 }
1078 904
1079 ieee80211_send_assoc(sdata, ifsta); 905 ieee80211_send_assoc(sdata);
1080 906
1081 mod_timer(&ifsta->timer, jiffies + IEEE80211_ASSOC_TIMEOUT); 907 mod_timer(&ifmgd->timer, jiffies + IEEE80211_ASSOC_TIMEOUT);
1082} 908}
1083 909
1084 910
1085static void ieee80211_associated(struct ieee80211_sub_if_data *sdata, 911static void ieee80211_associated(struct ieee80211_sub_if_data *sdata)
1086 struct ieee80211_if_sta *ifsta)
1087{ 912{
913 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
1088 struct ieee80211_local *local = sdata->local; 914 struct ieee80211_local *local = sdata->local;
1089 struct sta_info *sta; 915 struct sta_info *sta;
1090 int disassoc; 916 int disassoc;
@@ -1094,38 +920,40 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
1094 * for better APs. */ 920 * for better APs. */
1095 /* TODO: remove expired BSSes */ 921 /* TODO: remove expired BSSes */
1096 922
1097 ifsta->state = IEEE80211_STA_MLME_ASSOCIATED; 923 ifmgd->state = IEEE80211_STA_MLME_ASSOCIATED;
1098 924
1099 rcu_read_lock(); 925 rcu_read_lock();
1100 926
1101 sta = sta_info_get(local, ifsta->bssid); 927 sta = sta_info_get(local, ifmgd->bssid);
1102 if (!sta) { 928 if (!sta) {
1103 printk(KERN_DEBUG "%s: No STA entry for own AP %pM\n", 929 printk(KERN_DEBUG "%s: No STA entry for own AP %pM\n",
1104 sdata->dev->name, ifsta->bssid); 930 sdata->dev->name, ifmgd->bssid);
1105 disassoc = 1; 931 disassoc = 1;
1106 } else { 932 } else {
1107 disassoc = 0; 933 disassoc = 0;
1108 if (time_after(jiffies, 934 if (time_after(jiffies,
1109 sta->last_rx + IEEE80211_MONITORING_INTERVAL)) { 935 sta->last_rx + IEEE80211_MONITORING_INTERVAL)) {
1110 if (ifsta->flags & IEEE80211_STA_PROBEREQ_POLL) { 936 if (ifmgd->flags & IEEE80211_STA_PROBEREQ_POLL) {
1111 printk(KERN_DEBUG "%s: No ProbeResp from " 937 printk(KERN_DEBUG "%s: No ProbeResp from "
1112 "current AP %pM - assume out of " 938 "current AP %pM - assume out of "
1113 "range\n", 939 "range\n",
1114 sdata->dev->name, ifsta->bssid); 940 sdata->dev->name, ifmgd->bssid);
1115 disassoc = 1; 941 disassoc = 1;
1116 } else 942 } else
1117 ieee80211_send_probe_req(sdata, ifsta->bssid, 943 ieee80211_send_probe_req(sdata, ifmgd->bssid,
1118 ifsta->ssid, 944 ifmgd->ssid,
1119 ifsta->ssid_len); 945 ifmgd->ssid_len,
1120 ifsta->flags ^= IEEE80211_STA_PROBEREQ_POLL; 946 NULL, 0);
947 ifmgd->flags ^= IEEE80211_STA_PROBEREQ_POLL;
1121 } else { 948 } else {
1122 ifsta->flags &= ~IEEE80211_STA_PROBEREQ_POLL; 949 ifmgd->flags &= ~IEEE80211_STA_PROBEREQ_POLL;
1123 if (time_after(jiffies, ifsta->last_probe + 950 if (time_after(jiffies, ifmgd->last_probe +
1124 IEEE80211_PROBE_INTERVAL)) { 951 IEEE80211_PROBE_INTERVAL)) {
1125 ifsta->last_probe = jiffies; 952 ifmgd->last_probe = jiffies;
1126 ieee80211_send_probe_req(sdata, ifsta->bssid, 953 ieee80211_send_probe_req(sdata, ifmgd->bssid,
1127 ifsta->ssid, 954 ifmgd->ssid,
1128 ifsta->ssid_len); 955 ifmgd->ssid_len,
956 NULL, 0);
1129 } 957 }
1130 } 958 }
1131 } 959 }
@@ -1133,25 +961,25 @@ static void ieee80211_associated(struct ieee80211_sub_if_data *sdata,
1133 rcu_read_unlock(); 961 rcu_read_unlock();
1134 962
1135 if (disassoc) 963 if (disassoc)
1136 ieee80211_set_disassoc(sdata, ifsta, true, true, 964 ieee80211_set_disassoc(sdata, true, true,
1137 WLAN_REASON_PREV_AUTH_NOT_VALID); 965 WLAN_REASON_PREV_AUTH_NOT_VALID);
1138 else 966 else
1139 mod_timer(&ifsta->timer, jiffies + 967 mod_timer(&ifmgd->timer, jiffies +
1140 IEEE80211_MONITORING_INTERVAL); 968 IEEE80211_MONITORING_INTERVAL);
1141} 969}
1142 970
1143 971
1144static void ieee80211_auth_completed(struct ieee80211_sub_if_data *sdata, 972static void ieee80211_auth_completed(struct ieee80211_sub_if_data *sdata)
1145 struct ieee80211_if_sta *ifsta)
1146{ 973{
974 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
975
1147 printk(KERN_DEBUG "%s: authenticated\n", sdata->dev->name); 976 printk(KERN_DEBUG "%s: authenticated\n", sdata->dev->name);
1148 ifsta->flags |= IEEE80211_STA_AUTHENTICATED; 977 ifmgd->flags |= IEEE80211_STA_AUTHENTICATED;
1149 ieee80211_associate(sdata, ifsta); 978 ieee80211_associate(sdata);
1150} 979}
1151 980
1152 981
1153static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata, 982static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
1154 struct ieee80211_if_sta *ifsta,
1155 struct ieee80211_mgmt *mgmt, 983 struct ieee80211_mgmt *mgmt,
1156 size_t len) 984 size_t len)
1157{ 985{
@@ -1162,59 +990,37 @@ static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
1162 ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems); 990 ieee802_11_parse_elems(pos, len - (pos - (u8 *) mgmt), &elems);
1163 if (!elems.challenge) 991 if (!elems.challenge)
1164 return; 992 return;
1165 ieee80211_send_auth(sdata, ifsta, 3, elems.challenge - 2, 993 ieee80211_send_auth(sdata, 3, sdata->u.mgd.auth_alg,
1166 elems.challenge_len + 2, 1); 994 elems.challenge - 2, elems.challenge_len + 2,
1167} 995 sdata->u.mgd.bssid, 1);
1168 996 sdata->u.mgd.auth_transaction = 4;
1169static void ieee80211_rx_mgmt_auth_ibss(struct ieee80211_sub_if_data *sdata,
1170 struct ieee80211_if_sta *ifsta,
1171 struct ieee80211_mgmt *mgmt,
1172 size_t len)
1173{
1174 u16 auth_alg, auth_transaction, status_code;
1175
1176 if (len < 24 + 6)
1177 return;
1178
1179 auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
1180 auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
1181 status_code = le16_to_cpu(mgmt->u.auth.status_code);
1182
1183 /*
1184 * IEEE 802.11 standard does not require authentication in IBSS
1185 * networks and most implementations do not seem to use it.
1186 * However, try to reply to authentication attempts if someone
1187 * has actually implemented this.
1188 */
1189 if (auth_alg == WLAN_AUTH_OPEN && auth_transaction == 1)
1190 ieee80211_send_auth(sdata, ifsta, 2, NULL, 0, 0);
1191} 997}
1192 998
1193static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, 999static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
1194 struct ieee80211_if_sta *ifsta,
1195 struct ieee80211_mgmt *mgmt, 1000 struct ieee80211_mgmt *mgmt,
1196 size_t len) 1001 size_t len)
1197{ 1002{
1003 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
1198 u16 auth_alg, auth_transaction, status_code; 1004 u16 auth_alg, auth_transaction, status_code;
1199 1005
1200 if (ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE) 1006 if (ifmgd->state != IEEE80211_STA_MLME_AUTHENTICATE)
1201 return; 1007 return;
1202 1008
1203 if (len < 24 + 6) 1009 if (len < 24 + 6)
1204 return; 1010 return;
1205 1011
1206 if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN) != 0) 1012 if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN) != 0)
1207 return; 1013 return;
1208 1014
1209 if (memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0) 1015 if (memcmp(ifmgd->bssid, mgmt->bssid, ETH_ALEN) != 0)
1210 return; 1016 return;
1211 1017
1212 auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg); 1018 auth_alg = le16_to_cpu(mgmt->u.auth.auth_alg);
1213 auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction); 1019 auth_transaction = le16_to_cpu(mgmt->u.auth.auth_transaction);
1214 status_code = le16_to_cpu(mgmt->u.auth.status_code); 1020 status_code = le16_to_cpu(mgmt->u.auth.status_code);
1215 1021
1216 if (auth_alg != ifsta->auth_alg || 1022 if (auth_alg != ifmgd->auth_alg ||
1217 auth_transaction != ifsta->auth_transaction) 1023 auth_transaction != ifmgd->auth_transaction)
1218 return; 1024 return;
1219 1025
1220 if (status_code != WLAN_STATUS_SUCCESS) { 1026 if (status_code != WLAN_STATUS_SUCCESS) {
@@ -1223,15 +1029,15 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
1223 const int num_algs = ARRAY_SIZE(algs); 1029 const int num_algs = ARRAY_SIZE(algs);
1224 int i, pos; 1030 int i, pos;
1225 algs[0] = algs[1] = algs[2] = 0xff; 1031 algs[0] = algs[1] = algs[2] = 0xff;
1226 if (ifsta->auth_algs & IEEE80211_AUTH_ALG_OPEN) 1032 if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_OPEN)
1227 algs[0] = WLAN_AUTH_OPEN; 1033 algs[0] = WLAN_AUTH_OPEN;
1228 if (ifsta->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY) 1034 if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY)
1229 algs[1] = WLAN_AUTH_SHARED_KEY; 1035 algs[1] = WLAN_AUTH_SHARED_KEY;
1230 if (ifsta->auth_algs & IEEE80211_AUTH_ALG_LEAP) 1036 if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_LEAP)
1231 algs[2] = WLAN_AUTH_LEAP; 1037 algs[2] = WLAN_AUTH_LEAP;
1232 if (ifsta->auth_alg == WLAN_AUTH_OPEN) 1038 if (ifmgd->auth_alg == WLAN_AUTH_OPEN)
1233 pos = 0; 1039 pos = 0;
1234 else if (ifsta->auth_alg == WLAN_AUTH_SHARED_KEY) 1040 else if (ifmgd->auth_alg == WLAN_AUTH_SHARED_KEY)
1235 pos = 1; 1041 pos = 1;
1236 else 1042 else
1237 pos = 2; 1043 pos = 2;
@@ -1239,101 +1045,101 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
1239 pos++; 1045 pos++;
1240 if (pos >= num_algs) 1046 if (pos >= num_algs)
1241 pos = 0; 1047 pos = 0;
1242 if (algs[pos] == ifsta->auth_alg || 1048 if (algs[pos] == ifmgd->auth_alg ||
1243 algs[pos] == 0xff) 1049 algs[pos] == 0xff)
1244 continue; 1050 continue;
1245 if (algs[pos] == WLAN_AUTH_SHARED_KEY && 1051 if (algs[pos] == WLAN_AUTH_SHARED_KEY &&
1246 !ieee80211_sta_wep_configured(sdata)) 1052 !ieee80211_sta_wep_configured(sdata))
1247 continue; 1053 continue;
1248 ifsta->auth_alg = algs[pos]; 1054 ifmgd->auth_alg = algs[pos];
1249 break; 1055 break;
1250 } 1056 }
1251 } 1057 }
1252 return; 1058 return;
1253 } 1059 }
1254 1060
1255 switch (ifsta->auth_alg) { 1061 switch (ifmgd->auth_alg) {
1256 case WLAN_AUTH_OPEN: 1062 case WLAN_AUTH_OPEN:
1257 case WLAN_AUTH_LEAP: 1063 case WLAN_AUTH_LEAP:
1258 ieee80211_auth_completed(sdata, ifsta); 1064 ieee80211_auth_completed(sdata);
1259 break; 1065 break;
1260 case WLAN_AUTH_SHARED_KEY: 1066 case WLAN_AUTH_SHARED_KEY:
1261 if (ifsta->auth_transaction == 4) 1067 if (ifmgd->auth_transaction == 4)
1262 ieee80211_auth_completed(sdata, ifsta); 1068 ieee80211_auth_completed(sdata);
1263 else 1069 else
1264 ieee80211_auth_challenge(sdata, ifsta, mgmt, len); 1070 ieee80211_auth_challenge(sdata, mgmt, len);
1265 break; 1071 break;
1266 } 1072 }
1267} 1073}
1268 1074
1269 1075
1270static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, 1076static void ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata,
1271 struct ieee80211_if_sta *ifsta,
1272 struct ieee80211_mgmt *mgmt, 1077 struct ieee80211_mgmt *mgmt,
1273 size_t len) 1078 size_t len)
1274{ 1079{
1080 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
1275 u16 reason_code; 1081 u16 reason_code;
1276 1082
1277 if (len < 24 + 2) 1083 if (len < 24 + 2)
1278 return; 1084 return;
1279 1085
1280 if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN)) 1086 if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN))
1281 return; 1087 return;
1282 1088
1283 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code); 1089 reason_code = le16_to_cpu(mgmt->u.deauth.reason_code);
1284 1090
1285 if (ifsta->flags & IEEE80211_STA_AUTHENTICATED) 1091 if (ifmgd->flags & IEEE80211_STA_AUTHENTICATED)
1286 printk(KERN_DEBUG "%s: deauthenticated (Reason: %u)\n", 1092 printk(KERN_DEBUG "%s: deauthenticated (Reason: %u)\n",
1287 sdata->dev->name, reason_code); 1093 sdata->dev->name, reason_code);
1288 1094
1289 if (ifsta->state == IEEE80211_STA_MLME_AUTHENTICATE || 1095 if (ifmgd->state == IEEE80211_STA_MLME_AUTHENTICATE ||
1290 ifsta->state == IEEE80211_STA_MLME_ASSOCIATE || 1096 ifmgd->state == IEEE80211_STA_MLME_ASSOCIATE ||
1291 ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) { 1097 ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) {
1292 ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE; 1098 ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE;
1293 mod_timer(&ifsta->timer, jiffies + 1099 mod_timer(&ifmgd->timer, jiffies +
1294 IEEE80211_RETRY_AUTH_INTERVAL); 1100 IEEE80211_RETRY_AUTH_INTERVAL);
1295 } 1101 }
1296 1102
1297 ieee80211_set_disassoc(sdata, ifsta, true, false, 0); 1103 ieee80211_set_disassoc(sdata, true, false, 0);
1298 ifsta->flags &= ~IEEE80211_STA_AUTHENTICATED; 1104 ifmgd->flags &= ~IEEE80211_STA_AUTHENTICATED;
1299} 1105}
1300 1106
1301 1107
1302static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, 1108static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
1303 struct ieee80211_if_sta *ifsta,
1304 struct ieee80211_mgmt *mgmt, 1109 struct ieee80211_mgmt *mgmt,
1305 size_t len) 1110 size_t len)
1306{ 1111{
1112 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
1307 u16 reason_code; 1113 u16 reason_code;
1308 1114
1309 if (len < 24 + 2) 1115 if (len < 24 + 2)
1310 return; 1116 return;
1311 1117
1312 if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN)) 1118 if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN))
1313 return; 1119 return;
1314 1120
1315 reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); 1121 reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code);
1316 1122
1317 if (ifsta->flags & IEEE80211_STA_ASSOCIATED) 1123 if (ifmgd->flags & IEEE80211_STA_ASSOCIATED)
1318 printk(KERN_DEBUG "%s: disassociated (Reason: %u)\n", 1124 printk(KERN_DEBUG "%s: disassociated (Reason: %u)\n",
1319 sdata->dev->name, reason_code); 1125 sdata->dev->name, reason_code);
1320 1126
1321 if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) { 1127 if (ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED) {
1322 ifsta->state = IEEE80211_STA_MLME_ASSOCIATE; 1128 ifmgd->state = IEEE80211_STA_MLME_ASSOCIATE;
1323 mod_timer(&ifsta->timer, jiffies + 1129 mod_timer(&ifmgd->timer, jiffies +
1324 IEEE80211_RETRY_AUTH_INTERVAL); 1130 IEEE80211_RETRY_AUTH_INTERVAL);
1325 } 1131 }
1326 1132
1327 ieee80211_set_disassoc(sdata, ifsta, false, false, reason_code); 1133 ieee80211_set_disassoc(sdata, false, false, reason_code);
1328} 1134}
1329 1135
1330 1136
1331static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, 1137static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
1332 struct ieee80211_if_sta *ifsta,
1333 struct ieee80211_mgmt *mgmt, 1138 struct ieee80211_mgmt *mgmt,
1334 size_t len, 1139 size_t len,
1335 int reassoc) 1140 int reassoc)
1336{ 1141{
1142 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
1337 struct ieee80211_local *local = sdata->local; 1143 struct ieee80211_local *local = sdata->local;
1338 struct ieee80211_supported_band *sband; 1144 struct ieee80211_supported_band *sband;
1339 struct sta_info *sta; 1145 struct sta_info *sta;
@@ -1350,13 +1156,13 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
1350 /* AssocResp and ReassocResp have identical structure, so process both 1156 /* AssocResp and ReassocResp have identical structure, so process both
1351 * of them in this function. */ 1157 * of them in this function. */
1352 1158
1353 if (ifsta->state != IEEE80211_STA_MLME_ASSOCIATE) 1159 if (ifmgd->state != IEEE80211_STA_MLME_ASSOCIATE)
1354 return; 1160 return;
1355 1161
1356 if (len < 24 + 6) 1162 if (len < 24 + 6)
1357 return; 1163 return;
1358 1164
1359 if (memcmp(ifsta->bssid, mgmt->sa, ETH_ALEN) != 0) 1165 if (memcmp(ifmgd->bssid, mgmt->sa, ETH_ALEN) != 0)
1360 return; 1166 return;
1361 1167
1362 capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); 1168 capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
@@ -1381,7 +1187,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
1381 "comeback duration %u TU (%u ms)\n", 1187 "comeback duration %u TU (%u ms)\n",
1382 sdata->dev->name, tu, ms); 1188 sdata->dev->name, tu, ms);
1383 if (ms > IEEE80211_ASSOC_TIMEOUT) 1189 if (ms > IEEE80211_ASSOC_TIMEOUT)
1384 mod_timer(&ifsta->timer, 1190 mod_timer(&ifmgd->timer,
1385 jiffies + msecs_to_jiffies(ms)); 1191 jiffies + msecs_to_jiffies(ms));
1386 return; 1192 return;
1387 } 1193 }
@@ -1392,7 +1198,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
1392 /* if this was a reassociation, ensure we try a "full" 1198 /* if this was a reassociation, ensure we try a "full"
1393 * association next time. This works around some broken APs 1199 * association next time. This works around some broken APs
1394 * which do not correctly reject reassociation requests. */ 1200 * which do not correctly reject reassociation requests. */
1395 ifsta->flags &= ~IEEE80211_STA_PREV_BSSID_SET; 1201 ifmgd->flags &= ~IEEE80211_STA_PREV_BSSID_SET;
1396 return; 1202 return;
1397 } 1203 }
1398 1204
@@ -1408,23 +1214,23 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
1408 } 1214 }
1409 1215
1410 printk(KERN_DEBUG "%s: associated\n", sdata->dev->name); 1216 printk(KERN_DEBUG "%s: associated\n", sdata->dev->name);
1411 ifsta->aid = aid; 1217 ifmgd->aid = aid;
1412 ifsta->ap_capab = capab_info; 1218 ifmgd->ap_capab = capab_info;
1413 1219
1414 kfree(ifsta->assocresp_ies); 1220 kfree(ifmgd->assocresp_ies);
1415 ifsta->assocresp_ies_len = len - (pos - (u8 *) mgmt); 1221 ifmgd->assocresp_ies_len = len - (pos - (u8 *) mgmt);
1416 ifsta->assocresp_ies = kmalloc(ifsta->assocresp_ies_len, GFP_KERNEL); 1222 ifmgd->assocresp_ies = kmalloc(ifmgd->assocresp_ies_len, GFP_KERNEL);
1417 if (ifsta->assocresp_ies) 1223 if (ifmgd->assocresp_ies)
1418 memcpy(ifsta->assocresp_ies, pos, ifsta->assocresp_ies_len); 1224 memcpy(ifmgd->assocresp_ies, pos, ifmgd->assocresp_ies_len);
1419 1225
1420 rcu_read_lock(); 1226 rcu_read_lock();
1421 1227
1422 /* Add STA entry for the AP */ 1228 /* Add STA entry for the AP */
1423 sta = sta_info_get(local, ifsta->bssid); 1229 sta = sta_info_get(local, ifmgd->bssid);
1424 if (!sta) { 1230 if (!sta) {
1425 newsta = true; 1231 newsta = true;
1426 1232
1427 sta = sta_info_alloc(sdata, ifsta->bssid, GFP_ATOMIC); 1233 sta = sta_info_alloc(sdata, ifmgd->bssid, GFP_ATOMIC);
1428 if (!sta) { 1234 if (!sta) {
1429 printk(KERN_DEBUG "%s: failed to alloc STA entry for" 1235 printk(KERN_DEBUG "%s: failed to alloc STA entry for"
1430 " the AP\n", sdata->dev->name); 1236 " the AP\n", sdata->dev->name);
@@ -1497,7 +1303,8 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
1497 else 1303 else
1498 sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE; 1304 sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
1499 1305
1500 if (elems.ht_cap_elem) 1306 /* If TKIP/WEP is used, no need to parse AP's HT capabilities */
1307 if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED))
1501 ieee80211_ht_cap_ie_to_sta_ht_cap(sband, 1308 ieee80211_ht_cap_ie_to_sta_ht_cap(sband,
1502 elems.ht_cap_elem, &sta->sta.ht_cap); 1309 elems.ht_cap_elem, &sta->sta.ht_cap);
1503 1310
@@ -1505,7 +1312,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
1505 1312
1506 rate_control_rate_init(sta); 1313 rate_control_rate_init(sta);
1507 1314
1508 if (ifsta->flags & IEEE80211_STA_MFP_ENABLED) 1315 if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED)
1509 set_sta_flags(sta, WLAN_STA_MFP); 1316 set_sta_flags(sta, WLAN_STA_MFP);
1510 1317
1511 if (elems.wmm_param) 1318 if (elems.wmm_param)
@@ -1524,11 +1331,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
1524 rcu_read_unlock(); 1331 rcu_read_unlock();
1525 1332
1526 if (elems.wmm_param) 1333 if (elems.wmm_param)
1527 ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param, 1334 ieee80211_sta_wmm_params(local, ifmgd, elems.wmm_param,
1528 elems.wmm_param_len); 1335 elems.wmm_param_len);
1529 1336
1530 if (elems.ht_info_elem && elems.wmm_param && 1337 if (elems.ht_info_elem && elems.wmm_param &&
1531 (ifsta->flags & IEEE80211_STA_WMM_ENABLED)) 1338 (ifmgd->flags & IEEE80211_STA_WMM_ENABLED) &&
1339 !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED))
1532 changed |= ieee80211_enable_ht(sdata, elems.ht_info_elem, 1340 changed |= ieee80211_enable_ht(sdata, elems.ht_info_elem,
1533 ap_ht_cap_flags); 1341 ap_ht_cap_flags);
1534 1342
@@ -1536,163 +1344,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
1536 * ieee80211_set_associated() will tell the driver */ 1344 * ieee80211_set_associated() will tell the driver */
1537 bss_conf->aid = aid; 1345 bss_conf->aid = aid;
1538 bss_conf->assoc_capability = capab_info; 1346 bss_conf->assoc_capability = capab_info;
1539 ieee80211_set_associated(sdata, ifsta, changed); 1347 ieee80211_set_associated(sdata, changed);
1540 1348
1541 ieee80211_associated(sdata, ifsta); 1349 ieee80211_associated(sdata);
1542} 1350}
1543 1351
1544 1352
1545static int __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
1546 struct ieee80211_if_sta *ifsta,
1547 const u8 *bssid, const int beacon_int,
1548 const int freq,
1549 const size_t supp_rates_len,
1550 const u8 *supp_rates,
1551 const u16 capability)
1552{
1553 struct ieee80211_local *local = sdata->local;
1554 int res = 0, rates, i, j;
1555 struct sk_buff *skb;
1556 struct ieee80211_mgmt *mgmt;
1557 u8 *pos;
1558 struct ieee80211_supported_band *sband;
1559 union iwreq_data wrqu;
1560
1561 if (local->ops->reset_tsf) {
1562 /* Reset own TSF to allow time synchronization work. */
1563 local->ops->reset_tsf(local_to_hw(local));
1564 }
1565
1566 if ((ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) &&
1567 memcmp(ifsta->bssid, bssid, ETH_ALEN) == 0)
1568 return res;
1569
1570 skb = dev_alloc_skb(local->hw.extra_tx_headroom + 400 +
1571 sdata->u.sta.ie_proberesp_len);
1572 if (!skb) {
1573 printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
1574 "response\n", sdata->dev->name);
1575 return -ENOMEM;
1576 }
1577
1578 if (!(ifsta->flags & IEEE80211_STA_PREV_BSSID_SET)) {
1579 /* Remove possible STA entries from other IBSS networks. */
1580 sta_info_flush_delayed(sdata);
1581 }
1582
1583 memcpy(ifsta->bssid, bssid, ETH_ALEN);
1584 res = ieee80211_if_config(sdata, IEEE80211_IFCC_BSSID);
1585 if (res)
1586 return res;
1587
1588 local->hw.conf.beacon_int = beacon_int >= 10 ? beacon_int : 10;
1589
1590 sdata->drop_unencrypted = capability &
1591 WLAN_CAPABILITY_PRIVACY ? 1 : 0;
1592
1593 res = ieee80211_set_freq(sdata, freq);
1594
1595 if (res)
1596 return res;
1597
1598 sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
1599
1600 /* Build IBSS probe response */
1601
1602 skb_reserve(skb, local->hw.extra_tx_headroom);
1603
1604 mgmt = (struct ieee80211_mgmt *)
1605 skb_put(skb, 24 + sizeof(mgmt->u.beacon));
1606 memset(mgmt, 0, 24 + sizeof(mgmt->u.beacon));
1607 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
1608 IEEE80211_STYPE_PROBE_RESP);
1609 memset(mgmt->da, 0xff, ETH_ALEN);
1610 memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
1611 memcpy(mgmt->bssid, ifsta->bssid, ETH_ALEN);
1612 mgmt->u.beacon.beacon_int =
1613 cpu_to_le16(local->hw.conf.beacon_int);
1614 mgmt->u.beacon.capab_info = cpu_to_le16(capability);
1615
1616 pos = skb_put(skb, 2 + ifsta->ssid_len);
1617 *pos++ = WLAN_EID_SSID;
1618 *pos++ = ifsta->ssid_len;
1619 memcpy(pos, ifsta->ssid, ifsta->ssid_len);
1620
1621 rates = supp_rates_len;
1622 if (rates > 8)
1623 rates = 8;
1624 pos = skb_put(skb, 2 + rates);
1625 *pos++ = WLAN_EID_SUPP_RATES;
1626 *pos++ = rates;
1627 memcpy(pos, supp_rates, rates);
1628
1629 if (sband->band == IEEE80211_BAND_2GHZ) {
1630 pos = skb_put(skb, 2 + 1);
1631 *pos++ = WLAN_EID_DS_PARAMS;
1632 *pos++ = 1;
1633 *pos++ = ieee80211_frequency_to_channel(freq);
1634 }
1635
1636 pos = skb_put(skb, 2 + 2);
1637 *pos++ = WLAN_EID_IBSS_PARAMS;
1638 *pos++ = 2;
1639 /* FIX: set ATIM window based on scan results */
1640 *pos++ = 0;
1641 *pos++ = 0;
1642
1643 if (supp_rates_len > 8) {
1644 rates = supp_rates_len - 8;
1645 pos = skb_put(skb, 2 + rates);
1646 *pos++ = WLAN_EID_EXT_SUPP_RATES;
1647 *pos++ = rates;
1648 memcpy(pos, &supp_rates[8], rates);
1649 }
1650
1651 add_extra_ies(skb, sdata->u.sta.ie_proberesp,
1652 sdata->u.sta.ie_proberesp_len);
1653
1654 ifsta->probe_resp = skb;
1655
1656 ieee80211_if_config(sdata, IEEE80211_IFCC_BEACON |
1657 IEEE80211_IFCC_BEACON_ENABLED);
1658
1659
1660 rates = 0;
1661 for (i = 0; i < supp_rates_len; i++) {
1662 int bitrate = (supp_rates[i] & 0x7f) * 5;
1663 for (j = 0; j < sband->n_bitrates; j++)
1664 if (sband->bitrates[j].bitrate == bitrate)
1665 rates |= BIT(j);
1666 }
1667 ifsta->supp_rates_bits[local->hw.conf.channel->band] = rates;
1668
1669 ieee80211_sta_def_wmm_params(sdata, supp_rates_len, supp_rates);
1670
1671 ifsta->flags |= IEEE80211_STA_PREV_BSSID_SET;
1672 ifsta->state = IEEE80211_STA_MLME_IBSS_JOINED;
1673 mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
1674
1675 ieee80211_led_assoc(local, true);
1676
1677 memset(&wrqu, 0, sizeof(wrqu));
1678 memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN);
1679 wireless_send_event(sdata->dev, SIOCGIWAP, &wrqu, NULL);
1680
1681 return res;
1682}
1683
1684static int ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
1685 struct ieee80211_if_sta *ifsta,
1686 struct ieee80211_bss *bss)
1687{
1688 return __ieee80211_sta_join_ibss(sdata, ifsta,
1689 bss->cbss.bssid,
1690 bss->cbss.beacon_interval,
1691 bss->cbss.channel->center_freq,
1692 bss->supp_rates_len, bss->supp_rates,
1693 bss->cbss.capability);
1694}
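The removed IBSS join path above lays out its probe response body as a chain of 802.11 information elements, each an element ID, a length byte and a value, with the first eight rates in Supported Rates and any remainder in Extended Supported Rates. A minimal standalone sketch of that layout (plain C, standard element ID values assumed, buffer size illustrative; not the kernel code itself):

/* Standalone sketch: EID/length/value layout as in the removed IBSS code.
 * Element IDs use the standard values (SSID = 0, Supported Rates = 1,
 * Extended Supported Rates = 50); the frame buffer size is illustrative.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define EID_SSID            0
#define EID_SUPP_RATES      1
#define EID_EXT_SUPP_RATES 50

static uint8_t *put_ie(uint8_t *pos, uint8_t eid, const void *data, uint8_t len)
{
	*pos++ = eid;
	*pos++ = len;
	memcpy(pos, data, len);
	return pos + len;
}

int main(void)
{
	uint8_t frame[64], *pos = frame;
	const char ssid[] = "test-ibss";
	/* rates in 500 kb/s units: 1, 2, 5.5, 11, 6, 9, 12, 18, 24 Mb/s */
	const uint8_t rates[] = { 2, 4, 11, 22, 12, 18, 24, 36, 48 };

	pos = put_ie(pos, EID_SSID, ssid, sizeof(ssid) - 1);
	/* the first eight rates go into Supported Rates ... */
	pos = put_ie(pos, EID_SUPP_RATES, rates, 8);
	/* ... and the remainder into Extended Supported Rates */
	pos = put_ie(pos, EID_EXT_SUPP_RATES, rates + 8, sizeof(rates) - 8);

	printf("built %zu bytes of IEs\n", (size_t)(pos - frame));
	return 0;
}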
1695
1696static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, 1353static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
1697 struct ieee80211_mgmt *mgmt, 1354 struct ieee80211_mgmt *mgmt,
1698 size_t len, 1355 size_t len,
@@ -1703,11 +1360,7 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
1703 struct ieee80211_local *local = sdata->local; 1360 struct ieee80211_local *local = sdata->local;
1704 int freq; 1361 int freq;
1705 struct ieee80211_bss *bss; 1362 struct ieee80211_bss *bss;
1706 struct sta_info *sta;
1707 struct ieee80211_channel *channel; 1363 struct ieee80211_channel *channel;
1708 u64 beacon_timestamp, rx_timestamp;
1709 u32 supp_rates = 0;
1710 enum ieee80211_band band = rx_status->band;
1711 1364
1712 if (elems->ds_params && elems->ds_params_len == 1) 1365 if (elems->ds_params && elems->ds_params_len == 1)
1713 freq = ieee80211_channel_to_frequency(elems->ds_params[0]); 1366 freq = ieee80211_channel_to_frequency(elems->ds_params[0]);
@@ -1719,133 +1372,18 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata,
1719 if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) 1372 if (!channel || channel->flags & IEEE80211_CHAN_DISABLED)
1720 return; 1373 return;
1721 1374
1722 if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates &&
1723 memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0) {
1724 supp_rates = ieee80211_sta_get_rates(local, elems, band);
1725
1726 rcu_read_lock();
1727
1728 sta = sta_info_get(local, mgmt->sa);
1729 if (sta) {
1730 u32 prev_rates;
1731
1732 prev_rates = sta->sta.supp_rates[band];
1733 /* make sure mandatory rates are always added */
1734 sta->sta.supp_rates[band] = supp_rates |
1735 ieee80211_mandatory_rates(local, band);
1736
1737#ifdef CONFIG_MAC80211_IBSS_DEBUG
1738 if (sta->sta.supp_rates[band] != prev_rates)
1739 printk(KERN_DEBUG "%s: updated supp_rates set "
1740 "for %pM based on beacon info (0x%llx | "
1741 "0x%llx -> 0x%llx)\n",
1742 sdata->dev->name,
1743 sta->sta.addr,
1744 (unsigned long long) prev_rates,
1745 (unsigned long long) supp_rates,
1746 (unsigned long long) sta->sta.supp_rates[band]);
1747#endif
1748 } else {
1749 ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates);
1750 }
1751
1752 rcu_read_unlock();
1753 }
1754
1755 bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, 1375 bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems,
1756 channel, beacon); 1376 channel, beacon);
1757 if (!bss) 1377 if (!bss)
1758 return; 1378 return;
1759 1379
1760 if (elems->ch_switch_elem && (elems->ch_switch_elem_len == 3) && 1380 if (elems->ch_switch_elem && (elems->ch_switch_elem_len == 3) &&
1761 (memcmp(mgmt->bssid, sdata->u.sta.bssid, ETH_ALEN) == 0)) { 1381 (memcmp(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN) == 0)) {
1762 struct ieee80211_channel_sw_ie *sw_elem = 1382 struct ieee80211_channel_sw_ie *sw_elem =
1763 (struct ieee80211_channel_sw_ie *)elems->ch_switch_elem; 1383 (struct ieee80211_channel_sw_ie *)elems->ch_switch_elem;
1764 ieee80211_process_chanswitch(sdata, sw_elem, bss); 1384 ieee80211_process_chanswitch(sdata, sw_elem, bss);
1765 } 1385 }
1766 1386
1767 /* was just updated in ieee80211_bss_info_update */
1768 beacon_timestamp = bss->cbss.tsf;
1769
1770 if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
1771 goto put_bss;
1772
1773 /* check if we need to merge IBSS */
1774
1775 /* merge only on beacons (???) */
1776 if (!beacon)
1777 goto put_bss;
1778
1779 /* we use a fixed BSSID */
1780 if (sdata->u.sta.flags & IEEE80211_STA_BSSID_SET)
1781 goto put_bss;
1782
1783 /* not an IBSS */
1784 if (!(bss->cbss.capability & WLAN_CAPABILITY_IBSS))
1785 goto put_bss;
1786
1787 /* different channel */
1788 if (bss->cbss.channel != local->oper_channel)
1789 goto put_bss;
1790
1791 /* different SSID */
1792 if (elems->ssid_len != sdata->u.sta.ssid_len ||
1793 memcmp(elems->ssid, sdata->u.sta.ssid,
1794 sdata->u.sta.ssid_len))
1795 goto put_bss;
1796
1797 if (rx_status->flag & RX_FLAG_TSFT) {
1798 /*
1799 * For correct IBSS merging we need mactime; since mactime is
1800 * defined as the time the first data symbol of the frame hits
1801 * the PHY, and the timestamp of the beacon is defined as "the
1802 * time that the data symbol containing the first bit of the
1803 * timestamp is transmitted to the PHY plus the transmitting
1804 * STA's delays through its local PHY from the MAC-PHY
1805 * interface to its interface with the WM" (802.11 11.1.2)
1806 * - equals the time this bit arrives at the receiver - we have
1807 * to take into account the offset between the two.
1808 *
1809 * E.g. at 1 MBit that means mactime is 192 usec earlier
1810 * (=24 bytes * 8 usecs/byte) than the beacon timestamp.
1811 */
1812 int rate;
1813
1814 if (rx_status->flag & RX_FLAG_HT)
1815 rate = 65; /* TODO: HT rates */
1816 else
1817 rate = local->hw.wiphy->bands[band]->
1818 bitrates[rx_status->rate_idx].bitrate;
1819
1820 rx_timestamp = rx_status->mactime + (24 * 8 * 10 / rate);
1821 } else if (local && local->ops && local->ops->get_tsf)
1822 /* second best option: get current TSF */
1823 rx_timestamp = local->ops->get_tsf(local_to_hw(local));
1824 else
1825 /* can't merge without knowing the TSF */
1826 rx_timestamp = -1LLU;
1827
1828#ifdef CONFIG_MAC80211_IBSS_DEBUG
1829 printk(KERN_DEBUG "RX beacon SA=%pM BSSID="
1830 "%pM TSF=0x%llx BCN=0x%llx diff=%lld @%lu\n",
1831 mgmt->sa, mgmt->bssid,
1832 (unsigned long long)rx_timestamp,
1833 (unsigned long long)beacon_timestamp,
1834 (unsigned long long)(rx_timestamp - beacon_timestamp),
1835 jiffies);
1836#endif
1837
1838 if (beacon_timestamp > rx_timestamp) {
1839#ifdef CONFIG_MAC80211_IBSS_DEBUG
1840 printk(KERN_DEBUG "%s: beacon TSF higher than "
1841 "local TSF - IBSS merge with BSSID %pM\n",
1842 sdata->dev->name, mgmt->bssid);
1843#endif
1844 ieee80211_sta_join_ibss(sdata, &sdata->u.sta, bss);
1845 ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, supp_rates);
1846 }
1847
1848 put_bss:
1849 ieee80211_rx_bss_put(local, bss); 1387 ieee80211_rx_bss_put(local, bss);
1850} 1388}
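The dropped merge logic adjusts the receive mactime before comparing it with the beacon timestamp: the timestamp field sits 24 bytes into the frame, so at a bitrate expressed in 100 kb/s units the receiver adds 24 * 8 * 10 / rate microseconds, which is the 192 usec at 1 Mb/s mentioned in the removed comment. A small standalone sketch of that arithmetic (illustrative values only):

/* Standalone sketch of the timestamp adjustment in the removed IBSS merge
 * logic: mactime marks the first data symbol at the PHY, the beacon
 * timestamp field starts 24 bytes into the frame, rate is in 100 kb/s units.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t rx_beacon_timestamp(uint64_t mactime, int rate_100kbps)
{
	/* 24 header bytes * 8 bits/byte, converted to usec at this rate */
	return mactime + (24 * 8 * 10) / rate_100kbps;
}

int main(void)
{
	/* at 1 Mb/s (rate = 10) the offset is 192 usec */
	printf("offset at 1 Mb/s: %llu usec\n",
	       (unsigned long long)rx_beacon_timestamp(0, 10));

	/* merge decision: join the other IBSS if its TSF is ahead of ours */
	uint64_t beacon_timestamp = 1000500, rx_timestamp = 1000192;
	if (beacon_timestamp > rx_timestamp)
		printf("beacon TSF higher than local TSF - would merge\n");
	return 0;
}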
1851 1389
@@ -1857,7 +1395,6 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
1857{ 1395{
1858 size_t baselen; 1396 size_t baselen;
1859 struct ieee802_11_elems elems; 1397 struct ieee802_11_elems elems;
1860 struct ieee80211_if_sta *ifsta = &sdata->u.sta;
1861 1398
1862 if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN)) 1399 if (memcmp(mgmt->da, sdata->dev->dev_addr, ETH_ALEN))
1863 return; /* ignore ProbeResp to foreign address */ 1400 return; /* ignore ProbeResp to foreign address */
@@ -1873,20 +1410,19 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata,
1873 1410
1874 /* direct probe may be part of the association flow */ 1411 /* direct probe may be part of the association flow */
1875 if (test_and_clear_bit(IEEE80211_STA_REQ_DIRECT_PROBE, 1412 if (test_and_clear_bit(IEEE80211_STA_REQ_DIRECT_PROBE,
1876 &ifsta->request)) { 1413 &sdata->u.mgd.request)) {
1877 printk(KERN_DEBUG "%s direct probe responded\n", 1414 printk(KERN_DEBUG "%s direct probe responded\n",
1878 sdata->dev->name); 1415 sdata->dev->name);
1879 ieee80211_authenticate(sdata, ifsta); 1416 ieee80211_authenticate(sdata);
1880 } 1417 }
1881} 1418}
1882 1419
1883
1884static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, 1420static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
1885 struct ieee80211_mgmt *mgmt, 1421 struct ieee80211_mgmt *mgmt,
1886 size_t len, 1422 size_t len,
1887 struct ieee80211_rx_status *rx_status) 1423 struct ieee80211_rx_status *rx_status)
1888{ 1424{
1889 struct ieee80211_if_sta *ifsta; 1425 struct ieee80211_if_managed *ifmgd;
1890 size_t baselen; 1426 size_t baselen;
1891 struct ieee802_11_elems elems; 1427 struct ieee802_11_elems elems;
1892 struct ieee80211_local *local = sdata->local; 1428 struct ieee80211_local *local = sdata->local;
@@ -1905,21 +1441,21 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
1905 1441
1906 if (sdata->vif.type != NL80211_IFTYPE_STATION) 1442 if (sdata->vif.type != NL80211_IFTYPE_STATION)
1907 return; 1443 return;
1908 ifsta = &sdata->u.sta;
1909 1444
1910 if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED) || 1445 ifmgd = &sdata->u.mgd;
1911 memcmp(ifsta->bssid, mgmt->bssid, ETH_ALEN) != 0) 1446
1447 if (!(ifmgd->flags & IEEE80211_STA_ASSOCIATED) ||
1448 memcmp(ifmgd->bssid, mgmt->bssid, ETH_ALEN) != 0)
1912 return; 1449 return;
1913 1450
1914 if (rx_status->freq != local->hw.conf.channel->center_freq) 1451 if (rx_status->freq != local->hw.conf.channel->center_freq)
1915 return; 1452 return;
1916 1453
1917 ieee80211_sta_wmm_params(local, ifsta, elems.wmm_param, 1454 ieee80211_sta_wmm_params(local, ifmgd, elems.wmm_param,
1918 elems.wmm_param_len); 1455 elems.wmm_param_len);
1919 1456
1920 if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK && 1457 if (local->hw.flags & IEEE80211_HW_PS_NULLFUNC_STACK) {
1921 local->hw.conf.flags & IEEE80211_CONF_PS) { 1458 directed_tim = ieee80211_check_tim(&elems, ifmgd->aid);
1922 directed_tim = ieee80211_check_tim(&elems, ifsta->aid);
1923 1459
1924 if (directed_tim) { 1460 if (directed_tim) {
1925 if (local->hw.conf.dynamic_ps_timeout > 0) { 1461 if (local->hw.conf.dynamic_ps_timeout > 0) {
@@ -1954,14 +1490,15 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
1954 erp_valid, erp_value); 1490 erp_valid, erp_value);
1955 1491
1956 1492
1957 if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param) { 1493 if (elems.ht_cap_elem && elems.ht_info_elem && elems.wmm_param &&
1494 !(ifmgd->flags & IEEE80211_STA_TKIP_WEP_USED)) {
1958 struct sta_info *sta; 1495 struct sta_info *sta;
1959 struct ieee80211_supported_band *sband; 1496 struct ieee80211_supported_band *sband;
1960 u16 ap_ht_cap_flags; 1497 u16 ap_ht_cap_flags;
1961 1498
1962 rcu_read_lock(); 1499 rcu_read_lock();
1963 1500
1964 sta = sta_info_get(local, ifsta->bssid); 1501 sta = sta_info_get(local, ifmgd->bssid);
1965 if (!sta) { 1502 if (!sta) {
1966 rcu_read_unlock(); 1503 rcu_read_unlock();
1967 return; 1504 return;
@@ -1997,85 +1534,16 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
1997 ieee80211_bss_info_change_notify(sdata, changed); 1534 ieee80211_bss_info_change_notify(sdata, changed);
1998} 1535}
1999 1536
2000 1537ieee80211_rx_result ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata,
2001static void ieee80211_rx_mgmt_probe_req(struct ieee80211_sub_if_data *sdata, 1538 struct sk_buff *skb,
2002 struct ieee80211_if_sta *ifsta, 1539 struct ieee80211_rx_status *rx_status)
2003 struct ieee80211_mgmt *mgmt,
2004 size_t len)
2005{ 1540{
2006 struct ieee80211_local *local = sdata->local; 1541 struct ieee80211_local *local = sdata->local;
2007 int tx_last_beacon;
2008 struct sk_buff *skb;
2009 struct ieee80211_mgmt *resp;
2010 u8 *pos, *end;
2011
2012 if (ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED ||
2013 len < 24 + 2 || !ifsta->probe_resp)
2014 return;
2015
2016 if (local->ops->tx_last_beacon)
2017 tx_last_beacon = local->ops->tx_last_beacon(local_to_hw(local));
2018 else
2019 tx_last_beacon = 1;
2020
2021#ifdef CONFIG_MAC80211_IBSS_DEBUG
2022 printk(KERN_DEBUG "%s: RX ProbeReq SA=%pM DA=%pM BSSID=%pM"
2023 " (tx_last_beacon=%d)\n",
2024 sdata->dev->name, mgmt->sa, mgmt->da,
2025 mgmt->bssid, tx_last_beacon);
2026#endif /* CONFIG_MAC80211_IBSS_DEBUG */
2027
2028 if (!tx_last_beacon)
2029 return;
2030
2031 if (memcmp(mgmt->bssid, ifsta->bssid, ETH_ALEN) != 0 &&
2032 memcmp(mgmt->bssid, "\xff\xff\xff\xff\xff\xff", ETH_ALEN) != 0)
2033 return;
2034
2035 end = ((u8 *) mgmt) + len;
2036 pos = mgmt->u.probe_req.variable;
2037 if (pos[0] != WLAN_EID_SSID ||
2038 pos + 2 + pos[1] > end) {
2039#ifdef CONFIG_MAC80211_IBSS_DEBUG
2040 printk(KERN_DEBUG "%s: Invalid SSID IE in ProbeReq "
2041 "from %pM\n",
2042 sdata->dev->name, mgmt->sa);
2043#endif
2044 return;
2045 }
2046 if (pos[1] != 0 &&
2047 (pos[1] != ifsta->ssid_len ||
2048 memcmp(pos + 2, ifsta->ssid, ifsta->ssid_len) != 0)) {
2049 /* Ignore ProbeReq for foreign SSID */
2050 return;
2051 }
2052
2053 /* Reply with ProbeResp */
2054 skb = skb_copy(ifsta->probe_resp, GFP_KERNEL);
2055 if (!skb)
2056 return;
2057
2058 resp = (struct ieee80211_mgmt *) skb->data;
2059 memcpy(resp->da, mgmt->sa, ETH_ALEN);
2060#ifdef CONFIG_MAC80211_IBSS_DEBUG
2061 printk(KERN_DEBUG "%s: Sending ProbeResp to %pM\n",
2062 sdata->dev->name, resp->da);
2063#endif /* CONFIG_MAC80211_IBSS_DEBUG */
2064 ieee80211_tx_skb(sdata, skb, 0);
2065}
2066
2067void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
2068 struct ieee80211_rx_status *rx_status)
2069{
2070 struct ieee80211_local *local = sdata->local;
2071 struct ieee80211_if_sta *ifsta;
2072 struct ieee80211_mgmt *mgmt; 1542 struct ieee80211_mgmt *mgmt;
2073 u16 fc; 1543 u16 fc;
2074 1544
2075 if (skb->len < 24) 1545 if (skb->len < 24)
2076 goto fail; 1546 return RX_DROP_MONITOR;
2077
2078 ifsta = &sdata->u.sta;
2079 1547
2080 mgmt = (struct ieee80211_mgmt *) skb->data; 1548 mgmt = (struct ieee80211_mgmt *) skb->data;
2081 fc = le16_to_cpu(mgmt->frame_control); 1549 fc = le16_to_cpu(mgmt->frame_control);
@@ -2090,147 +1558,68 @@ void ieee80211_sta_rx_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *
2090 case IEEE80211_STYPE_REASSOC_RESP: 1558 case IEEE80211_STYPE_REASSOC_RESP:
2091 case IEEE80211_STYPE_DEAUTH: 1559 case IEEE80211_STYPE_DEAUTH:
2092 case IEEE80211_STYPE_DISASSOC: 1560 case IEEE80211_STYPE_DISASSOC:
2093 skb_queue_tail(&ifsta->skb_queue, skb); 1561 skb_queue_tail(&sdata->u.mgd.skb_queue, skb);
2094 queue_work(local->hw.workqueue, &ifsta->work); 1562 queue_work(local->hw.workqueue, &sdata->u.mgd.work);
2095 return; 1563 return RX_QUEUED;
2096 } 1564 }
2097 1565
2098 fail: 1566 return RX_DROP_MONITOR;
2099 kfree_skb(skb);
2100} 1567}
2101 1568
2102static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, 1569static void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata,
2103 struct sk_buff *skb) 1570 struct sk_buff *skb)
2104{ 1571{
2105 struct ieee80211_rx_status *rx_status; 1572 struct ieee80211_rx_status *rx_status;
2106 struct ieee80211_if_sta *ifsta;
2107 struct ieee80211_mgmt *mgmt; 1573 struct ieee80211_mgmt *mgmt;
2108 u16 fc; 1574 u16 fc;
2109 1575
2110 ifsta = &sdata->u.sta;
2111
2112 rx_status = (struct ieee80211_rx_status *) skb->cb; 1576 rx_status = (struct ieee80211_rx_status *) skb->cb;
2113 mgmt = (struct ieee80211_mgmt *) skb->data; 1577 mgmt = (struct ieee80211_mgmt *) skb->data;
2114 fc = le16_to_cpu(mgmt->frame_control); 1578 fc = le16_to_cpu(mgmt->frame_control);
2115 1579
2116 if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { 1580 switch (fc & IEEE80211_FCTL_STYPE) {
2117 switch (fc & IEEE80211_FCTL_STYPE) { 1581 case IEEE80211_STYPE_PROBE_RESP:
2118 case IEEE80211_STYPE_PROBE_REQ: 1582 ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len,
2119 ieee80211_rx_mgmt_probe_req(sdata, ifsta, mgmt, 1583 rx_status);
2120 skb->len); 1584 break;
2121 break; 1585 case IEEE80211_STYPE_BEACON:
2122 case IEEE80211_STYPE_PROBE_RESP: 1586 ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
2123 ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, 1587 rx_status);
2124 rx_status); 1588 break;
2125 break; 1589 case IEEE80211_STYPE_AUTH:
2126 case IEEE80211_STYPE_BEACON: 1590 ieee80211_rx_mgmt_auth(sdata, mgmt, skb->len);
2127 ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len, 1591 break;
2128 rx_status); 1592 case IEEE80211_STYPE_ASSOC_RESP:
2129 break; 1593 ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len, 0);
2130 case IEEE80211_STYPE_AUTH: 1594 break;
2131 ieee80211_rx_mgmt_auth_ibss(sdata, ifsta, mgmt, 1595 case IEEE80211_STYPE_REASSOC_RESP:
2132 skb->len); 1596 ieee80211_rx_mgmt_assoc_resp(sdata, mgmt, skb->len, 1);
2133 break; 1597 break;
2134 } 1598 case IEEE80211_STYPE_DEAUTH:
2135 } else { /* NL80211_IFTYPE_STATION */ 1599 ieee80211_rx_mgmt_deauth(sdata, mgmt, skb->len);
2136 switch (fc & IEEE80211_FCTL_STYPE) { 1600 break;
2137 case IEEE80211_STYPE_PROBE_RESP: 1601 case IEEE80211_STYPE_DISASSOC:
2138 ieee80211_rx_mgmt_probe_resp(sdata, mgmt, skb->len, 1602 ieee80211_rx_mgmt_disassoc(sdata, mgmt, skb->len);
2139 rx_status); 1603 break;
2140 break;
2141 case IEEE80211_STYPE_BEACON:
2142 ieee80211_rx_mgmt_beacon(sdata, mgmt, skb->len,
2143 rx_status);
2144 break;
2145 case IEEE80211_STYPE_AUTH:
2146 ieee80211_rx_mgmt_auth(sdata, ifsta, mgmt, skb->len);
2147 break;
2148 case IEEE80211_STYPE_ASSOC_RESP:
2149 ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt,
2150 skb->len, 0);
2151 break;
2152 case IEEE80211_STYPE_REASSOC_RESP:
2153 ieee80211_rx_mgmt_assoc_resp(sdata, ifsta, mgmt,
2154 skb->len, 1);
2155 break;
2156 case IEEE80211_STYPE_DEAUTH:
2157 ieee80211_rx_mgmt_deauth(sdata, ifsta, mgmt, skb->len);
2158 break;
2159 case IEEE80211_STYPE_DISASSOC:
2160 ieee80211_rx_mgmt_disassoc(sdata, ifsta, mgmt,
2161 skb->len);
2162 break;
2163 }
2164 } 1604 }
2165 1605
2166 kfree_skb(skb); 1606 kfree_skb(skb);
2167} 1607}
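Both the queuing path above and the queued handler key off the subtype bits of the little-endian frame control word. A standalone sketch of that dispatch using the standard 802.11 type/subtype masks (constants taken from the spec layout, not from this diff):

/* Userspace sketch of the subtype dispatch: type in bits 2-3, subtype in
 * bits 4-7 of the frame control field.
 */
#include <stdint.h>
#include <stdio.h>

#define FCTL_FTYPE        0x000c
#define FCTL_STYPE        0x00f0
#define FTYPE_MGMT        0x0000
#define STYPE_BEACON      0x0080
#define STYPE_PROBE_RESP  0x0050

static const char *classify(uint16_t fc)
{
	if ((fc & FCTL_FTYPE) != FTYPE_MGMT)
		return "not management";
	switch (fc & FCTL_STYPE) {
	case STYPE_BEACON:     return "beacon";
	case STYPE_PROBE_RESP: return "probe response";
	default:               return "other mgmt";
	}
}

int main(void)
{
	printf("%s\n", classify(0x0080));  /* beacon */
	printf("%s\n", classify(0x0050));  /* probe response */
	return 0;
}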
2168 1608
2169
2170static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata)
2171{
2172 struct ieee80211_local *local = sdata->local;
2173 int active = 0;
2174 struct sta_info *sta;
2175
2176 rcu_read_lock();
2177
2178 list_for_each_entry_rcu(sta, &local->sta_list, list) {
2179 if (sta->sdata == sdata &&
2180 time_after(sta->last_rx + IEEE80211_IBSS_MERGE_INTERVAL,
2181 jiffies)) {
2182 active++;
2183 break;
2184 }
2185 }
2186
2187 rcu_read_unlock();
2188
2189 return active;
2190}
2191
2192
2193static void ieee80211_sta_merge_ibss(struct ieee80211_sub_if_data *sdata,
2194 struct ieee80211_if_sta *ifsta)
2195{
2196 mod_timer(&ifsta->timer, jiffies + IEEE80211_IBSS_MERGE_INTERVAL);
2197
2198 ieee80211_sta_expire(sdata, IEEE80211_IBSS_INACTIVITY_LIMIT);
2199 if (ieee80211_sta_active_ibss(sdata))
2200 return;
2201
2202 if ((sdata->u.sta.flags & IEEE80211_STA_BSSID_SET) &&
2203 (!(sdata->u.sta.flags & IEEE80211_STA_AUTO_CHANNEL_SEL)))
2204 return;
2205
2206 printk(KERN_DEBUG "%s: No active IBSS STAs - trying to scan for other "
2207 "IBSS networks with same SSID (merge)\n", sdata->dev->name);
2208
2209 /* XXX maybe racy? */
2210 if (sdata->local->scan_req)
2211 return;
2212
2213 memcpy(sdata->local->int_scan_req.ssids[0].ssid,
2214 ifsta->ssid, IEEE80211_MAX_SSID_LEN);
2215 sdata->local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len;
2216 ieee80211_request_scan(sdata, &sdata->local->int_scan_req);
2217}
2218
2219
2220static void ieee80211_sta_timer(unsigned long data) 1609static void ieee80211_sta_timer(unsigned long data)
2221{ 1610{
2222 struct ieee80211_sub_if_data *sdata = 1611 struct ieee80211_sub_if_data *sdata =
2223 (struct ieee80211_sub_if_data *) data; 1612 (struct ieee80211_sub_if_data *) data;
2224 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 1613 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
2225 struct ieee80211_local *local = sdata->local; 1614 struct ieee80211_local *local = sdata->local;
2226 1615
2227 set_bit(IEEE80211_STA_REQ_RUN, &ifsta->request); 1616 set_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request);
2228 queue_work(local->hw.workqueue, &ifsta->work); 1617 queue_work(local->hw.workqueue, &ifmgd->work);
2229} 1618}
2230 1619
2231static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata, 1620static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata)
2232 struct ieee80211_if_sta *ifsta)
2233{ 1621{
1622 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
2234 struct ieee80211_local *local = sdata->local; 1623 struct ieee80211_local *local = sdata->local;
2235 1624
2236 if (local->ops->reset_tsf) { 1625 if (local->ops->reset_tsf) {
@@ -2238,191 +1627,39 @@ static void ieee80211_sta_reset_auth(struct ieee80211_sub_if_data *sdata,
2238 local->ops->reset_tsf(local_to_hw(local)); 1627 local->ops->reset_tsf(local_to_hw(local));
2239 } 1628 }
2240 1629
2241 ifsta->wmm_last_param_set = -1; /* allow any WMM update */ 1630 ifmgd->wmm_last_param_set = -1; /* allow any WMM update */
2242 1631
2243 1632
2244 if (ifsta->auth_algs & IEEE80211_AUTH_ALG_OPEN) 1633 if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_OPEN)
2245 ifsta->auth_alg = WLAN_AUTH_OPEN; 1634 ifmgd->auth_alg = WLAN_AUTH_OPEN;
2246 else if (ifsta->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY) 1635 else if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_SHARED_KEY)
2247 ifsta->auth_alg = WLAN_AUTH_SHARED_KEY; 1636 ifmgd->auth_alg = WLAN_AUTH_SHARED_KEY;
2248 else if (ifsta->auth_algs & IEEE80211_AUTH_ALG_LEAP) 1637 else if (ifmgd->auth_algs & IEEE80211_AUTH_ALG_LEAP)
2249 ifsta->auth_alg = WLAN_AUTH_LEAP; 1638 ifmgd->auth_alg = WLAN_AUTH_LEAP;
2250 else 1639 else
2251 ifsta->auth_alg = WLAN_AUTH_OPEN; 1640 ifmgd->auth_alg = WLAN_AUTH_OPEN;
2252 ifsta->auth_transaction = -1; 1641 ifmgd->auth_transaction = -1;
2253 ifsta->flags &= ~IEEE80211_STA_ASSOCIATED; 1642 ifmgd->flags &= ~IEEE80211_STA_ASSOCIATED;
2254 ifsta->assoc_scan_tries = 0; 1643 ifmgd->assoc_scan_tries = 0;
2255 ifsta->direct_probe_tries = 0; 1644 ifmgd->direct_probe_tries = 0;
2256 ifsta->auth_tries = 0; 1645 ifmgd->auth_tries = 0;
2257 ifsta->assoc_tries = 0; 1646 ifmgd->assoc_tries = 0;
2258 netif_tx_stop_all_queues(sdata->dev); 1647 netif_tx_stop_all_queues(sdata->dev);
2259 netif_carrier_off(sdata->dev); 1648 netif_carrier_off(sdata->dev);
2260} 1649}
2261 1650
2262static int ieee80211_sta_create_ibss(struct ieee80211_sub_if_data *sdata, 1651static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata)
2263 struct ieee80211_if_sta *ifsta)
2264{
2265 struct ieee80211_local *local = sdata->local;
2266 struct ieee80211_supported_band *sband;
2267 u8 *pos;
2268 u8 bssid[ETH_ALEN];
2269 u8 supp_rates[IEEE80211_MAX_SUPP_RATES];
2270 u16 capability;
2271 int i;
2272
2273 if (sdata->u.sta.flags & IEEE80211_STA_BSSID_SET) {
2274 memcpy(bssid, ifsta->bssid, ETH_ALEN);
2275 } else {
2276 /* Generate random, not broadcast, locally administered BSSID. Mix in
2277 * own MAC address to make sure that devices that do not have proper
2278 * random number generator get different BSSID. */
2279 get_random_bytes(bssid, ETH_ALEN);
2280 for (i = 0; i < ETH_ALEN; i++)
2281 bssid[i] ^= sdata->dev->dev_addr[i];
2282 bssid[0] &= ~0x01;
2283 bssid[0] |= 0x02;
2284 }
2285
2286 printk(KERN_DEBUG "%s: Creating new IBSS network, BSSID %pM\n",
2287 sdata->dev->name, bssid);
2288
2289 sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
2290
2291 if (local->hw.conf.beacon_int == 0)
2292 local->hw.conf.beacon_int = 100;
2293
2294 capability = WLAN_CAPABILITY_IBSS;
2295
2296 if (sdata->default_key)
2297 capability |= WLAN_CAPABILITY_PRIVACY;
2298 else
2299 sdata->drop_unencrypted = 0;
2300
2301 pos = supp_rates;
2302 for (i = 0; i < sband->n_bitrates; i++) {
2303 int rate = sband->bitrates[i].bitrate;
2304 *pos++ = (u8) (rate / 5);
2305 }
2306
2307 return __ieee80211_sta_join_ibss(sdata, ifsta,
2308 bssid, local->hw.conf.beacon_int,
2309 local->hw.conf.channel->center_freq,
2310 sband->n_bitrates, supp_rates,
2311 capability);
2312}
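The removed ieee80211_sta_create_ibss() derives a BSSID from random bytes mixed with the local MAC, then clears the group bit and sets the locally administered bit so the result is a valid unicast BSSID. A standalone sketch of the same address fix-up (example MAC address, ordinary rand() standing in for the kernel RNG):

/* Sketch of the BSSID generation in the removed IBSS create path. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define ETH_ALEN 6

static void generate_ibss_bssid(uint8_t bssid[ETH_ALEN],
				const uint8_t own_addr[ETH_ALEN])
{
	int i;

	for (i = 0; i < ETH_ALEN; i++)
		bssid[i] = (uint8_t)rand() ^ own_addr[i];
	bssid[0] &= ~0x01;	/* clear multicast/group bit */
	bssid[0] |= 0x02;	/* set locally administered bit */
}

int main(void)
{
	const uint8_t mac[ETH_ALEN] = { 0x00, 0x1b, 0x2f, 0xaa, 0xbb, 0xcc };
	uint8_t bssid[ETH_ALEN];

	srand((unsigned)time(NULL));
	generate_ibss_bssid(bssid, mac);
	printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
	       bssid[0], bssid[1], bssid[2], bssid[3], bssid[4], bssid[5]);
	return 0;
}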
2313
2314
2315static int ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata,
2316 struct ieee80211_if_sta *ifsta)
2317{
2318 struct ieee80211_local *local = sdata->local;
2319 struct ieee80211_bss *bss;
2320 int active_ibss;
2321
2322 if (ifsta->ssid_len == 0)
2323 return -EINVAL;
2324
2325 active_ibss = ieee80211_sta_active_ibss(sdata);
2326#ifdef CONFIG_MAC80211_IBSS_DEBUG
2327 printk(KERN_DEBUG "%s: sta_find_ibss (active_ibss=%d)\n",
2328 sdata->dev->name, active_ibss);
2329#endif /* CONFIG_MAC80211_IBSS_DEBUG */
2330
2331 if (active_ibss)
2332 return 0;
2333
2334 if (ifsta->flags & IEEE80211_STA_BSSID_SET)
2335 bss = ieee80211_rx_bss_get(local, ifsta->bssid, 0,
2336 ifsta->ssid, ifsta->ssid_len);
2337 else
2338 bss = (void *)cfg80211_get_ibss(local->hw.wiphy,
2339 NULL,
2340 ifsta->ssid, ifsta->ssid_len);
2341
2342#ifdef CONFIG_MAC80211_IBSS_DEBUG
2343 if (bss)
2344 printk(KERN_DEBUG " sta_find_ibss: selected %pM current "
2345 "%pM\n", bss->cbss.bssid, ifsta->bssid);
2346#endif /* CONFIG_MAC80211_IBSS_DEBUG */
2347
2348 if (bss &&
2349 (!(ifsta->flags & IEEE80211_STA_PREV_BSSID_SET) ||
2350 memcmp(ifsta->bssid, bss->cbss.bssid, ETH_ALEN))) {
2351 int ret;
2352
2353 printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM"
2354 " based on configured SSID\n",
2355 sdata->dev->name, bss->cbss.bssid);
2356
2357 ret = ieee80211_sta_join_ibss(sdata, ifsta, bss);
2358 ieee80211_rx_bss_put(local, bss);
2359 return ret;
2360 } else if (bss)
2361 ieee80211_rx_bss_put(local, bss);
2362
2363#ifdef CONFIG_MAC80211_IBSS_DEBUG
2364 printk(KERN_DEBUG " did not try to join ibss\n");
2365#endif /* CONFIG_MAC80211_IBSS_DEBUG */
2366
2367 /* Selected IBSS not found in current scan results - try to scan */
2368 if (ifsta->state == IEEE80211_STA_MLME_IBSS_JOINED &&
2369 !ieee80211_sta_active_ibss(sdata)) {
2370 mod_timer(&ifsta->timer, jiffies +
2371 IEEE80211_IBSS_MERGE_INTERVAL);
2372 } else if (time_after(jiffies, local->last_scan_completed +
2373 IEEE80211_SCAN_INTERVAL)) {
2374 printk(KERN_DEBUG "%s: Trigger new scan to find an IBSS to "
2375 "join\n", sdata->dev->name);
2376
2377 /* XXX maybe racy? */
2378 if (local->scan_req)
2379 return -EBUSY;
2380
2381 memcpy(local->int_scan_req.ssids[0].ssid,
2382 ifsta->ssid, IEEE80211_MAX_SSID_LEN);
2383 local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len;
2384 return ieee80211_request_scan(sdata, &local->int_scan_req);
2385 } else if (ifsta->state != IEEE80211_STA_MLME_IBSS_JOINED) {
2386 int interval = IEEE80211_SCAN_INTERVAL;
2387
2388 if (time_after(jiffies, ifsta->ibss_join_req +
2389 IEEE80211_IBSS_JOIN_TIMEOUT)) {
2390 if ((ifsta->flags & IEEE80211_STA_CREATE_IBSS) &&
2391 (!(local->oper_channel->flags &
2392 IEEE80211_CHAN_NO_IBSS)))
2393 return ieee80211_sta_create_ibss(sdata, ifsta);
2394 if (ifsta->flags & IEEE80211_STA_CREATE_IBSS) {
2395 printk(KERN_DEBUG "%s: IBSS not allowed on"
2396 " %d MHz\n", sdata->dev->name,
2397 local->hw.conf.channel->center_freq);
2398 }
2399
2400 /* No IBSS found - decrease scan interval and continue
2401 * scanning. */
2402 interval = IEEE80211_SCAN_INTERVAL_SLOW;
2403 }
2404
2405 ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH;
2406 mod_timer(&ifsta->timer, jiffies + interval);
2407 return 0;
2408 }
2409
2410 return 0;
2411}
2412
2413
2414static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
2415 struct ieee80211_if_sta *ifsta)
2416{ 1652{
1653 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
2417 struct ieee80211_local *local = sdata->local; 1654 struct ieee80211_local *local = sdata->local;
2418 struct ieee80211_bss *bss; 1655 struct ieee80211_bss *bss;
2419 u8 *bssid = ifsta->bssid, *ssid = ifsta->ssid; 1656 u8 *bssid = ifmgd->bssid, *ssid = ifmgd->ssid;
2420 u8 ssid_len = ifsta->ssid_len; 1657 u8 ssid_len = ifmgd->ssid_len;
2421 u16 capa_mask = WLAN_CAPABILITY_ESS; 1658 u16 capa_mask = WLAN_CAPABILITY_ESS;
2422 u16 capa_val = WLAN_CAPABILITY_ESS; 1659 u16 capa_val = WLAN_CAPABILITY_ESS;
2423 struct ieee80211_channel *chan = local->oper_channel; 1660 struct ieee80211_channel *chan = local->oper_channel;
2424 1661
2425 if (ifsta->flags & (IEEE80211_STA_AUTO_SSID_SEL | 1662 if (ifmgd->flags & (IEEE80211_STA_AUTO_SSID_SEL |
2426 IEEE80211_STA_AUTO_BSSID_SEL | 1663 IEEE80211_STA_AUTO_BSSID_SEL |
2427 IEEE80211_STA_AUTO_CHANNEL_SEL)) { 1664 IEEE80211_STA_AUTO_CHANNEL_SEL)) {
2428 capa_mask |= WLAN_CAPABILITY_PRIVACY; 1665 capa_mask |= WLAN_CAPABILITY_PRIVACY;
@@ -2430,13 +1667,13 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
2430 capa_val |= WLAN_CAPABILITY_PRIVACY; 1667 capa_val |= WLAN_CAPABILITY_PRIVACY;
2431 } 1668 }
2432 1669
2433 if (ifsta->flags & IEEE80211_STA_AUTO_CHANNEL_SEL) 1670 if (ifmgd->flags & IEEE80211_STA_AUTO_CHANNEL_SEL)
2434 chan = NULL; 1671 chan = NULL;
2435 1672
2436 if (ifsta->flags & IEEE80211_STA_AUTO_BSSID_SEL) 1673 if (ifmgd->flags & IEEE80211_STA_AUTO_BSSID_SEL)
2437 bssid = NULL; 1674 bssid = NULL;
2438 1675
2439 if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) { 1676 if (ifmgd->flags & IEEE80211_STA_AUTO_SSID_SEL) {
2440 ssid = NULL; 1677 ssid = NULL;
2441 ssid_len = 0; 1678 ssid_len = 0;
2442 } 1679 }
@@ -2447,16 +1684,16 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
2447 1684
2448 if (bss) { 1685 if (bss) {
2449 ieee80211_set_freq(sdata, bss->cbss.channel->center_freq); 1686 ieee80211_set_freq(sdata, bss->cbss.channel->center_freq);
2450 if (!(ifsta->flags & IEEE80211_STA_SSID_SET)) 1687 if (!(ifmgd->flags & IEEE80211_STA_SSID_SET))
2451 ieee80211_sta_set_ssid(sdata, bss->ssid, 1688 ieee80211_sta_set_ssid(sdata, bss->ssid,
2452 bss->ssid_len); 1689 bss->ssid_len);
2453 ieee80211_sta_set_bssid(sdata, bss->cbss.bssid); 1690 ieee80211_sta_set_bssid(sdata, bss->cbss.bssid);
2454 ieee80211_sta_def_wmm_params(sdata, bss->supp_rates_len, 1691 ieee80211_sta_def_wmm_params(sdata, bss->supp_rates_len,
2455 bss->supp_rates); 1692 bss->supp_rates);
2456 if (sdata->u.sta.mfp == IEEE80211_MFP_REQUIRED) 1693 if (sdata->u.mgd.mfp == IEEE80211_MFP_REQUIRED)
2457 sdata->u.sta.flags |= IEEE80211_STA_MFP_ENABLED; 1694 sdata->u.mgd.flags |= IEEE80211_STA_MFP_ENABLED;
2458 else 1695 else
2459 sdata->u.sta.flags &= ~IEEE80211_STA_MFP_ENABLED; 1696 sdata->u.mgd.flags &= ~IEEE80211_STA_MFP_ENABLED;
2460 1697
2461 /* Send out direct probe if no probe resp was received or 1698 /* Send out direct probe if no probe resp was received or
2462 * the one we have is outdated 1699 * the one we have is outdated
@@ -2464,31 +1701,34 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
2464 if (!bss->last_probe_resp || 1701 if (!bss->last_probe_resp ||
2465 time_after(jiffies, bss->last_probe_resp 1702 time_after(jiffies, bss->last_probe_resp
2466 + IEEE80211_SCAN_RESULT_EXPIRE)) 1703 + IEEE80211_SCAN_RESULT_EXPIRE))
2467 ifsta->state = IEEE80211_STA_MLME_DIRECT_PROBE; 1704 ifmgd->state = IEEE80211_STA_MLME_DIRECT_PROBE;
2468 else 1705 else
2469 ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE; 1706 ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE;
2470 1707
2471 ieee80211_rx_bss_put(local, bss); 1708 ieee80211_rx_bss_put(local, bss);
2472 ieee80211_sta_reset_auth(sdata, ifsta); 1709 ieee80211_sta_reset_auth(sdata);
2473 return 0; 1710 return 0;
2474 } else { 1711 } else {
2475 if (ifsta->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) { 1712 if (ifmgd->assoc_scan_tries < IEEE80211_ASSOC_SCANS_MAX_TRIES) {
2476 ifsta->assoc_scan_tries++; 1713 ifmgd->assoc_scan_tries++;
2477 /* XXX maybe racy? */ 1714 /* XXX maybe racy? */
2478 if (local->scan_req) 1715 if (local->scan_req)
2479 return -1; 1716 return -1;
2480 memcpy(local->int_scan_req.ssids[0].ssid, 1717 memcpy(local->int_scan_req.ssids[0].ssid,
2481 ifsta->ssid, IEEE80211_MAX_SSID_LEN); 1718 ifmgd->ssid, IEEE80211_MAX_SSID_LEN);
2482 if (ifsta->flags & IEEE80211_STA_AUTO_SSID_SEL) 1719 if (ifmgd->flags & IEEE80211_STA_AUTO_SSID_SEL)
2483 local->int_scan_req.ssids[0].ssid_len = 0; 1720 local->int_scan_req.ssids[0].ssid_len = 0;
2484 else 1721 else
2485 local->int_scan_req.ssids[0].ssid_len = ifsta->ssid_len; 1722 local->int_scan_req.ssids[0].ssid_len = ifmgd->ssid_len;
2486 ieee80211_start_scan(sdata, &local->int_scan_req); 1723
2487 ifsta->state = IEEE80211_STA_MLME_AUTHENTICATE; 1724 if (ieee80211_start_scan(sdata, &local->int_scan_req))
2488 set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request); 1725 ieee80211_scan_failed(local);
1726
1727 ifmgd->state = IEEE80211_STA_MLME_AUTHENTICATE;
1728 set_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request);
2489 } else { 1729 } else {
2490 ifsta->assoc_scan_tries = 0; 1730 ifmgd->assoc_scan_tries = 0;
2491 ifsta->state = IEEE80211_STA_MLME_DISABLED; 1731 ifmgd->state = IEEE80211_STA_MLME_DISABLED;
2492 } 1732 }
2493 } 1733 }
2494 return -1; 1734 return -1;
@@ -2498,9 +1738,9 @@ static int ieee80211_sta_config_auth(struct ieee80211_sub_if_data *sdata,
2498static void ieee80211_sta_work(struct work_struct *work) 1738static void ieee80211_sta_work(struct work_struct *work)
2499{ 1739{
2500 struct ieee80211_sub_if_data *sdata = 1740 struct ieee80211_sub_if_data *sdata =
2501 container_of(work, struct ieee80211_sub_if_data, u.sta.work); 1741 container_of(work, struct ieee80211_sub_if_data, u.mgd.work);
2502 struct ieee80211_local *local = sdata->local; 1742 struct ieee80211_local *local = sdata->local;
2503 struct ieee80211_if_sta *ifsta; 1743 struct ieee80211_if_managed *ifmgd;
2504 struct sk_buff *skb; 1744 struct sk_buff *skb;
2505 1745
2506 if (!netif_running(sdata->dev)) 1746 if (!netif_running(sdata->dev))
@@ -2509,60 +1749,60 @@ static void ieee80211_sta_work(struct work_struct *work)
2509 if (local->sw_scanning || local->hw_scanning) 1749 if (local->sw_scanning || local->hw_scanning)
2510 return; 1750 return;
2511 1751
2512 if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION && 1752 if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
2513 sdata->vif.type != NL80211_IFTYPE_ADHOC))
2514 return; 1753 return;
2515 ifsta = &sdata->u.sta; 1754 ifmgd = &sdata->u.mgd;
2516 1755
2517 while ((skb = skb_dequeue(&ifsta->skb_queue))) 1756 while ((skb = skb_dequeue(&ifmgd->skb_queue)))
2518 ieee80211_sta_rx_queued_mgmt(sdata, skb); 1757 ieee80211_sta_rx_queued_mgmt(sdata, skb);
2519 1758
2520 if (ifsta->state != IEEE80211_STA_MLME_DIRECT_PROBE && 1759 if (ifmgd->state != IEEE80211_STA_MLME_DIRECT_PROBE &&
2521 ifsta->state != IEEE80211_STA_MLME_AUTHENTICATE && 1760 ifmgd->state != IEEE80211_STA_MLME_AUTHENTICATE &&
2522 ifsta->state != IEEE80211_STA_MLME_ASSOCIATE && 1761 ifmgd->state != IEEE80211_STA_MLME_ASSOCIATE &&
2523 test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request)) { 1762 test_and_clear_bit(IEEE80211_STA_REQ_SCAN, &ifmgd->request)) {
2524 ieee80211_start_scan(sdata, local->scan_req); 1763 /*
1764 * The call to ieee80211_start_scan can fail but ieee80211_request_scan
1765 * (which queued ieee80211_sta_work) did not return an error. Thus, call
1766 * ieee80211_scan_failed here if ieee80211_start_scan fails in order to
1767 * notify the scan requester.
1768 */
1769 if (ieee80211_start_scan(sdata, local->scan_req))
1770 ieee80211_scan_failed(local);
2525 return; 1771 return;
2526 } 1772 }
2527 1773
2528 if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request)) { 1774 if (test_and_clear_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request)) {
2529 if (ieee80211_sta_config_auth(sdata, ifsta)) 1775 if (ieee80211_sta_config_auth(sdata))
2530 return; 1776 return;
2531 clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request); 1777 clear_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request);
2532 } else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifsta->request)) 1778 } else if (!test_and_clear_bit(IEEE80211_STA_REQ_RUN, &ifmgd->request))
2533 return; 1779 return;
2534 1780
2535 switch (ifsta->state) { 1781 switch (ifmgd->state) {
2536 case IEEE80211_STA_MLME_DISABLED: 1782 case IEEE80211_STA_MLME_DISABLED:
2537 break; 1783 break;
2538 case IEEE80211_STA_MLME_DIRECT_PROBE: 1784 case IEEE80211_STA_MLME_DIRECT_PROBE:
2539 ieee80211_direct_probe(sdata, ifsta); 1785 ieee80211_direct_probe(sdata);
2540 break; 1786 break;
2541 case IEEE80211_STA_MLME_AUTHENTICATE: 1787 case IEEE80211_STA_MLME_AUTHENTICATE:
2542 ieee80211_authenticate(sdata, ifsta); 1788 ieee80211_authenticate(sdata);
2543 break; 1789 break;
2544 case IEEE80211_STA_MLME_ASSOCIATE: 1790 case IEEE80211_STA_MLME_ASSOCIATE:
2545 ieee80211_associate(sdata, ifsta); 1791 ieee80211_associate(sdata);
2546 break; 1792 break;
2547 case IEEE80211_STA_MLME_ASSOCIATED: 1793 case IEEE80211_STA_MLME_ASSOCIATED:
2548 ieee80211_associated(sdata, ifsta); 1794 ieee80211_associated(sdata);
2549 break;
2550 case IEEE80211_STA_MLME_IBSS_SEARCH:
2551 ieee80211_sta_find_ibss(sdata, ifsta);
2552 break;
2553 case IEEE80211_STA_MLME_IBSS_JOINED:
2554 ieee80211_sta_merge_ibss(sdata, ifsta);
2555 break; 1795 break;
2556 default: 1796 default:
2557 WARN_ON(1); 1797 WARN_ON(1);
2558 break; 1798 break;
2559 } 1799 }
2560 1800
2561 if (ieee80211_privacy_mismatch(sdata, ifsta)) { 1801 if (ieee80211_privacy_mismatch(sdata)) {
2562 printk(KERN_DEBUG "%s: privacy configuration mismatch and " 1802 printk(KERN_DEBUG "%s: privacy configuration mismatch and "
2563 "mixed-cell disabled - disassociate\n", sdata->dev->name); 1803 "mixed-cell disabled - disassociate\n", sdata->dev->name);
2564 1804
2565 ieee80211_set_disassoc(sdata, ifsta, false, true, 1805 ieee80211_set_disassoc(sdata, false, true,
2566 WLAN_REASON_UNSPECIFIED); 1806 WLAN_REASON_UNSPECIFIED);
2567 } 1807 }
2568} 1808}
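After this change the work function only walks the managed-mode states (direct probe, authenticate, associate, associated); the IBSS search and join states move out of this file. A schematic sketch of the remaining dispatch (names and actions are illustrative, not the kernel enum):

/* Schematic of the state dispatch left in the managed-mode worker. */
#include <stdio.h>

enum sta_mlme_state {
	STA_MLME_DISABLED,
	STA_MLME_DIRECT_PROBE,
	STA_MLME_AUTHENTICATE,
	STA_MLME_ASSOCIATE,
	STA_MLME_ASSOCIATED,
};

static void mlme_step(enum sta_mlme_state state)
{
	switch (state) {
	case STA_MLME_DISABLED:
		break;
	case STA_MLME_DIRECT_PROBE:
		printf("send directed probe request\n");
		break;
	case STA_MLME_AUTHENTICATE:
		printf("send authentication frame\n");
		break;
	case STA_MLME_ASSOCIATE:
		printf("send (re)association request\n");
		break;
	case STA_MLME_ASSOCIATED:
		printf("monitor the association\n");
		break;
	}
}

int main(void)
{
	mlme_step(STA_MLME_AUTHENTICATE);
	return 0;
}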
@@ -2571,155 +1811,106 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
2571{ 1811{
2572 if (sdata->vif.type == NL80211_IFTYPE_STATION) 1812 if (sdata->vif.type == NL80211_IFTYPE_STATION)
2573 queue_work(sdata->local->hw.workqueue, 1813 queue_work(sdata->local->hw.workqueue,
2574 &sdata->u.sta.work); 1814 &sdata->u.mgd.work);
2575} 1815}
2576 1816
2577/* interface setup */ 1817/* interface setup */
2578void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) 1818void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
2579{ 1819{
2580 struct ieee80211_if_sta *ifsta; 1820 struct ieee80211_if_managed *ifmgd;
2581 1821
2582 ifsta = &sdata->u.sta; 1822 ifmgd = &sdata->u.mgd;
2583 INIT_WORK(&ifsta->work, ieee80211_sta_work); 1823 INIT_WORK(&ifmgd->work, ieee80211_sta_work);
2584 INIT_WORK(&ifsta->chswitch_work, ieee80211_chswitch_work); 1824 INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work);
2585 setup_timer(&ifsta->timer, ieee80211_sta_timer, 1825 setup_timer(&ifmgd->timer, ieee80211_sta_timer,
2586 (unsigned long) sdata); 1826 (unsigned long) sdata);
2587 setup_timer(&ifsta->chswitch_timer, ieee80211_chswitch_timer, 1827 setup_timer(&ifmgd->chswitch_timer, ieee80211_chswitch_timer,
2588 (unsigned long) sdata); 1828 (unsigned long) sdata);
2589 skb_queue_head_init(&ifsta->skb_queue); 1829 skb_queue_head_init(&ifmgd->skb_queue);
2590 1830
2591 ifsta->capab = WLAN_CAPABILITY_ESS; 1831 ifmgd->capab = WLAN_CAPABILITY_ESS;
2592 ifsta->auth_algs = IEEE80211_AUTH_ALG_OPEN | 1832 ifmgd->auth_algs = IEEE80211_AUTH_ALG_OPEN |
2593 IEEE80211_AUTH_ALG_SHARED_KEY; 1833 IEEE80211_AUTH_ALG_SHARED_KEY;
2594 ifsta->flags |= IEEE80211_STA_CREATE_IBSS | 1834 ifmgd->flags |= IEEE80211_STA_CREATE_IBSS |
2595 IEEE80211_STA_AUTO_BSSID_SEL | 1835 IEEE80211_STA_AUTO_BSSID_SEL |
2596 IEEE80211_STA_AUTO_CHANNEL_SEL; 1836 IEEE80211_STA_AUTO_CHANNEL_SEL;
2597 if (ieee80211_num_regular_queues(&sdata->local->hw) >= 4) 1837 if (ieee80211_num_regular_queues(&sdata->local->hw) >= 4)
2598 ifsta->flags |= IEEE80211_STA_WMM_ENABLED; 1838 ifmgd->flags |= IEEE80211_STA_WMM_ENABLED;
2599}
2600
2601/*
2602 * Add a new IBSS station, will also be called by the RX code when,
2603 * in IBSS mode, receiving a frame from a yet-unknown station, hence
2604 * must be callable in atomic context.
2605 */
2606struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata,
2607 u8 *bssid,u8 *addr, u32 supp_rates)
2608{
2609 struct ieee80211_local *local = sdata->local;
2610 struct sta_info *sta;
2611 int band = local->hw.conf.channel->band;
2612
2613 /* TODO: Could consider removing the least recently used entry and
2614 * allow new one to be added. */
2615 if (local->num_sta >= IEEE80211_IBSS_MAX_STA_ENTRIES) {
2616 if (net_ratelimit()) {
2617 printk(KERN_DEBUG "%s: No room for a new IBSS STA "
2618 "entry %pM\n", sdata->dev->name, addr);
2619 }
2620 return NULL;
2621 }
2622
2623 if (compare_ether_addr(bssid, sdata->u.sta.bssid))
2624 return NULL;
2625
2626#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
2627 printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n",
2628 wiphy_name(local->hw.wiphy), addr, sdata->dev->name);
2629#endif
2630
2631 sta = sta_info_alloc(sdata, addr, GFP_ATOMIC);
2632 if (!sta)
2633 return NULL;
2634
2635 set_sta_flags(sta, WLAN_STA_AUTHORIZED);
2636
2637 /* make sure mandatory rates are always added */
2638 sta->sta.supp_rates[band] = supp_rates |
2639 ieee80211_mandatory_rates(local, band);
2640
2641 rate_control_rate_init(sta);
2642
2643 if (sta_info_insert(sta))
2644 return NULL;
2645
2646 return sta;
2647} 1839}
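The removed IBSS helpers convert rate octets (500 kb/s units, high bit flagging a basic rate) into a per-band bitmap by matching against the band's bitrate table in 100 kb/s units, and then OR in the mandatory rates. A standalone sketch of the bitmap conversion (example 2.4 GHz rate table, mandatory-rate OR omitted):

/* Sketch of the supported-rates bitmap conversion used by the removed code. */
#include <stdint.h>
#include <stdio.h>

/* band bitrates in 100 kb/s units: 1, 2, 5.5, 11, 6, 9, 12, 18 Mb/s */
static const int band_bitrates[] = { 10, 20, 55, 110, 60, 90, 120, 180 };
#define N_BITRATES (int)(sizeof(band_bitrates) / sizeof(band_bitrates[0]))

static uint32_t rates_to_bitmap(const uint8_t *supp_rates, int len)
{
	uint32_t rates = 0;
	int i, j;

	for (i = 0; i < len; i++) {
		/* mask off the basic-rate flag, convert to 100 kb/s units */
		int bitrate = (supp_rates[i] & 0x7f) * 5;

		for (j = 0; j < N_BITRATES; j++)
			if (band_bitrates[j] == bitrate)
				rates |= 1u << j;
	}
	return rates;
}

int main(void)
{
	/* 1 and 2 Mb/s as basic rates, plus 5.5 and 11 Mb/s */
	const uint8_t supp[] = { 0x82, 0x84, 0x0b, 0x16 };

	printf("rate bitmap: 0x%x\n", rates_to_bitmap(supp, 4));
	return 0;
}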
2648 1840
2649/* configuration hooks */ 1841/* configuration hooks */
2650void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata, 1842void ieee80211_sta_req_auth(struct ieee80211_sub_if_data *sdata)
2651 struct ieee80211_if_sta *ifsta)
2652{ 1843{
1844 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
2653 struct ieee80211_local *local = sdata->local; 1845 struct ieee80211_local *local = sdata->local;
2654 1846
2655 if (sdata->vif.type != NL80211_IFTYPE_STATION) 1847 if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
2656 return; 1848 return;
2657 1849
2658 if ((ifsta->flags & (IEEE80211_STA_BSSID_SET | 1850 if ((ifmgd->flags & (IEEE80211_STA_BSSID_SET |
2659 IEEE80211_STA_AUTO_BSSID_SEL)) && 1851 IEEE80211_STA_AUTO_BSSID_SEL)) &&
2660 (ifsta->flags & (IEEE80211_STA_SSID_SET | 1852 (ifmgd->flags & (IEEE80211_STA_SSID_SET |
2661 IEEE80211_STA_AUTO_SSID_SEL))) { 1853 IEEE80211_STA_AUTO_SSID_SEL))) {
2662 1854
2663 if (ifsta->state == IEEE80211_STA_MLME_ASSOCIATED) 1855 if (ifmgd->state == IEEE80211_STA_MLME_ASSOCIATED)
2664 ieee80211_set_disassoc(sdata, ifsta, true, true, 1856 ieee80211_set_disassoc(sdata, true, true,
2665 WLAN_REASON_DEAUTH_LEAVING); 1857 WLAN_REASON_DEAUTH_LEAVING);
2666 1858
2667 set_bit(IEEE80211_STA_REQ_AUTH, &ifsta->request); 1859 set_bit(IEEE80211_STA_REQ_AUTH, &ifmgd->request);
2668 queue_work(local->hw.workqueue, &ifsta->work); 1860 queue_work(local->hw.workqueue, &ifmgd->work);
2669 } 1861 }
2670} 1862}
2671 1863
2672int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len) 1864int ieee80211_sta_commit(struct ieee80211_sub_if_data *sdata)
2673{ 1865{
2674 struct ieee80211_if_sta *ifsta; 1866 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
2675 1867
2676 if (len > IEEE80211_MAX_SSID_LEN) 1868 ifmgd->flags &= ~IEEE80211_STA_PREV_BSSID_SET;
2677 return -EINVAL;
2678 1869
2679 ifsta = &sdata->u.sta; 1870 if (ifmgd->ssid_len)
1871 ifmgd->flags |= IEEE80211_STA_SSID_SET;
1872 else
1873 ifmgd->flags &= ~IEEE80211_STA_SSID_SET;
2680 1874
2681 if (ifsta->ssid_len != len || memcmp(ifsta->ssid, ssid, len) != 0) { 1875 return 0;
2682 memset(ifsta->ssid, 0, sizeof(ifsta->ssid)); 1876}
2683 memcpy(ifsta->ssid, ssid, len);
2684 ifsta->ssid_len = len;
2685 }
2686 1877
2687 ifsta->flags &= ~IEEE80211_STA_PREV_BSSID_SET; 1878int ieee80211_sta_set_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t len)
1879{
1880 struct ieee80211_if_managed *ifmgd;
2688 1881
2689 if (len) 1882 if (len > IEEE80211_MAX_SSID_LEN)
2690 ifsta->flags |= IEEE80211_STA_SSID_SET; 1883 return -EINVAL;
2691 else 1884
2692 ifsta->flags &= ~IEEE80211_STA_SSID_SET; 1885 ifmgd = &sdata->u.mgd;
2693 1886
2694 if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { 1887 if (ifmgd->ssid_len != len || memcmp(ifmgd->ssid, ssid, len) != 0) {
2695 ifsta->ibss_join_req = jiffies; 1888 memset(ifmgd->ssid, 0, sizeof(ifmgd->ssid));
2696 ifsta->state = IEEE80211_STA_MLME_IBSS_SEARCH; 1889 memcpy(ifmgd->ssid, ssid, len);
2697 return ieee80211_sta_find_ibss(sdata, ifsta); 1890 ifmgd->ssid_len = len;
2698 } 1891 }
2699 1892
2700 return 0; 1893 return ieee80211_sta_commit(sdata);
2701} 1894}
2702 1895
2703int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len) 1896int ieee80211_sta_get_ssid(struct ieee80211_sub_if_data *sdata, char *ssid, size_t *len)
2704{ 1897{
2705 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 1898 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
2706 memcpy(ssid, ifsta->ssid, ifsta->ssid_len); 1899 memcpy(ssid, ifmgd->ssid, ifmgd->ssid_len);
2707 *len = ifsta->ssid_len; 1900 *len = ifmgd->ssid_len;
2708 return 0; 1901 return 0;
2709} 1902}
2710 1903
2711int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid) 1904int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
2712{ 1905{
2713 struct ieee80211_if_sta *ifsta; 1906 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
2714
2715 ifsta = &sdata->u.sta;
2716 1907
2717 if (is_valid_ether_addr(bssid)) { 1908 if (is_valid_ether_addr(bssid)) {
2718 memcpy(ifsta->bssid, bssid, ETH_ALEN); 1909 memcpy(ifmgd->bssid, bssid, ETH_ALEN);
2719 ifsta->flags |= IEEE80211_STA_BSSID_SET; 1910 ifmgd->flags |= IEEE80211_STA_BSSID_SET;
2720 } else { 1911 } else {
2721 memset(ifsta->bssid, 0, ETH_ALEN); 1912 memset(ifmgd->bssid, 0, ETH_ALEN);
2722 ifsta->flags &= ~IEEE80211_STA_BSSID_SET; 1913 ifmgd->flags &= ~IEEE80211_STA_BSSID_SET;
2723 } 1914 }
2724 1915
2725 if (netif_running(sdata->dev)) { 1916 if (netif_running(sdata->dev)) {
@@ -2729,47 +1920,44 @@ int ieee80211_sta_set_bssid(struct ieee80211_sub_if_data *sdata, u8 *bssid)
2729 } 1920 }
2730 } 1921 }
2731 1922
2732 return ieee80211_sta_set_ssid(sdata, ifsta->ssid, ifsta->ssid_len); 1923 return ieee80211_sta_commit(sdata);
2733} 1924}
2734 1925
2735int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len) 1926int ieee80211_sta_set_extra_ie(struct ieee80211_sub_if_data *sdata, char *ie, size_t len)
2736{ 1927{
2737 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 1928 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
2738 1929
2739 kfree(ifsta->extra_ie); 1930 kfree(ifmgd->extra_ie);
2740 if (len == 0) { 1931 if (len == 0) {
2741 ifsta->extra_ie = NULL; 1932 ifmgd->extra_ie = NULL;
2742 ifsta->extra_ie_len = 0; 1933 ifmgd->extra_ie_len = 0;
2743 return 0; 1934 return 0;
2744 } 1935 }
2745 ifsta->extra_ie = kmalloc(len, GFP_KERNEL); 1936 ifmgd->extra_ie = kmalloc(len, GFP_KERNEL);
2746 if (!ifsta->extra_ie) { 1937 if (!ifmgd->extra_ie) {
2747 ifsta->extra_ie_len = 0; 1938 ifmgd->extra_ie_len = 0;
2748 return -ENOMEM; 1939 return -ENOMEM;
2749 } 1940 }
2750 memcpy(ifsta->extra_ie, ie, len); 1941 memcpy(ifmgd->extra_ie, ie, len);
2751 ifsta->extra_ie_len = len; 1942 ifmgd->extra_ie_len = len;
2752 return 0; 1943 return 0;
2753} 1944}
2754 1945
2755int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason) 1946int ieee80211_sta_deauthenticate(struct ieee80211_sub_if_data *sdata, u16 reason)
2756{ 1947{
2757 struct ieee80211_if_sta *ifsta = &sdata->u.sta;
2758
2759 printk(KERN_DEBUG "%s: deauthenticating by local choice (reason=%d)\n", 1948 printk(KERN_DEBUG "%s: deauthenticating by local choice (reason=%d)\n",
2760 sdata->dev->name, reason); 1949 sdata->dev->name, reason);
2761 1950
2762 if (sdata->vif.type != NL80211_IFTYPE_STATION && 1951 if (sdata->vif.type != NL80211_IFTYPE_STATION)
2763 sdata->vif.type != NL80211_IFTYPE_ADHOC)
2764 return -EINVAL; 1952 return -EINVAL;
2765 1953
2766 ieee80211_set_disassoc(sdata, ifsta, true, true, reason); 1954 ieee80211_set_disassoc(sdata, true, true, reason);
2767 return 0; 1955 return 0;
2768} 1956}
2769 1957
2770int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason) 1958int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
2771{ 1959{
2772 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 1960 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
2773 1961
2774 printk(KERN_DEBUG "%s: disassociating by local choice (reason=%d)\n", 1962 printk(KERN_DEBUG "%s: disassociating by local choice (reason=%d)\n",
2775 sdata->dev->name, reason); 1963 sdata->dev->name, reason);
@@ -2777,10 +1965,10 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
2777 if (sdata->vif.type != NL80211_IFTYPE_STATION) 1965 if (sdata->vif.type != NL80211_IFTYPE_STATION)
2778 return -EINVAL; 1966 return -EINVAL;
2779 1967
2780 if (!(ifsta->flags & IEEE80211_STA_ASSOCIATED)) 1968 if (!(ifmgd->flags & IEEE80211_STA_ASSOCIATED))
2781 return -1; 1969 return -ENOLINK;
2782 1970
2783 ieee80211_set_disassoc(sdata, ifsta, false, true, reason); 1971 ieee80211_set_disassoc(sdata, false, true, reason);
2784 return 0; 1972 return 0;
2785} 1973}
2786 1974
@@ -2788,14 +1976,6 @@ int ieee80211_sta_disassociate(struct ieee80211_sub_if_data *sdata, u16 reason)
2788void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local) 1976void ieee80211_mlme_notify_scan_completed(struct ieee80211_local *local)
2789{ 1977{
2790 struct ieee80211_sub_if_data *sdata = local->scan_sdata; 1978 struct ieee80211_sub_if_data *sdata = local->scan_sdata;
2791 struct ieee80211_if_sta *ifsta;
2792
2793 if (sdata && sdata->vif.type == NL80211_IFTYPE_ADHOC) {
2794 ifsta = &sdata->u.sta;
2795 if ((!(ifsta->flags & IEEE80211_STA_PREV_BSSID_SET)) ||
2796 !ieee80211_sta_active_ibss(sdata))
2797 ieee80211_sta_find_ibss(sdata, ifsta);
2798 }
2799 1979
2800 /* Restart STA timers */ 1980 /* Restart STA timers */
2801 rcu_read_lock(); 1981 rcu_read_lock();
@@ -2842,3 +2022,36 @@ void ieee80211_dynamic_ps_timer(unsigned long data)
2842 2022
2843 queue_work(local->hw.workqueue, &local->dynamic_ps_enable_work); 2023 queue_work(local->hw.workqueue, &local->dynamic_ps_enable_work);
2844} 2024}
2025
2026void ieee80211_send_nullfunc(struct ieee80211_local *local,
2027 struct ieee80211_sub_if_data *sdata,
2028 int powersave)
2029{
2030 struct sk_buff *skb;
2031 struct ieee80211_hdr *nullfunc;
2032 __le16 fc;
2033
2034 if (WARN_ON(sdata->vif.type != NL80211_IFTYPE_STATION))
2035 return;
2036
2037 skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24);
2038 if (!skb) {
2039 printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc "
2040 "frame\n", sdata->dev->name);
2041 return;
2042 }
2043 skb_reserve(skb, local->hw.extra_tx_headroom);
2044
2045 nullfunc = (struct ieee80211_hdr *) skb_put(skb, 24);
2046 memset(nullfunc, 0, 24);
2047 fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC |
2048 IEEE80211_FCTL_TODS);
2049 if (powersave)
2050 fc |= cpu_to_le16(IEEE80211_FCTL_PM);
2051 nullfunc->frame_control = fc;
2052 memcpy(nullfunc->addr1, sdata->u.mgd.bssid, ETH_ALEN);
2053 memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN);
2054 memcpy(nullfunc->addr3, sdata->u.mgd.bssid, ETH_ALEN);
2055
2056 ieee80211_tx_skb(sdata, skb, 0);
2057}
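The new ieee80211_send_nullfunc() builds a 24-byte data frame of subtype Null with To-DS set and, when entering powersave, the PM bit, addressed to the BSSID with the station's own address as transmitter. A standalone sketch of that header layout (standard frame-control constants assumed, example addresses; host byte order here, where the kernel uses cpu_to_le16()):

/* Userspace sketch of the 24-byte nullfunc header built above. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define ETH_ALEN 6
#define FTYPE_DATA       0x0008
#define STYPE_NULLFUNC   0x0040
#define FCTL_TODS        0x0100
#define FCTL_PM          0x1000

struct wifi_hdr {
	uint16_t frame_control;   /* little-endian on the air */
	uint16_t duration_id;
	uint8_t  addr1[ETH_ALEN]; /* BSSID (the AP) */
	uint8_t  addr2[ETH_ALEN]; /* our own address */
	uint8_t  addr3[ETH_ALEN]; /* BSSID again for To-DS frames */
	uint16_t seq_ctrl;
} __attribute__((packed));

int main(void)
{
	const uint8_t bssid[ETH_ALEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	const uint8_t own[ETH_ALEN]   = { 0x00, 0xaa, 0xbb, 0xcc, 0xdd, 0xee };
	struct wifi_hdr nullfunc;
	int powersave = 1;
	uint16_t fc = FTYPE_DATA | STYPE_NULLFUNC | FCTL_TODS;

	if (powersave)
		fc |= FCTL_PM;	/* tell the AP we are entering powersave */

	memset(&nullfunc, 0, sizeof(nullfunc));
	nullfunc.frame_control = fc;
	memcpy(nullfunc.addr1, bssid, ETH_ALEN);
	memcpy(nullfunc.addr2, own, ETH_ALEN);
	memcpy(nullfunc.addr3, bssid, ETH_ALEN);

	printf("nullfunc header is %zu bytes, fc=0x%04x\n",
	       sizeof(nullfunc), fc);
	return 0;
}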
diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h
index 928da625e281..b9164c9a9563 100644
--- a/net/mac80211/rate.h
+++ b/net/mac80211/rate.h
@@ -62,6 +62,18 @@ static inline void rate_control_rate_init(struct sta_info *sta)
62 ref->ops->rate_init(ref->priv, sband, ista, priv_sta); 62 ref->ops->rate_init(ref->priv, sband, ista, priv_sta);
63} 63}
64 64
65static inline void rate_control_rate_update(struct ieee80211_local *local,
66 struct ieee80211_supported_band *sband,
67 struct sta_info *sta, u32 changed)
68{
69 struct rate_control_ref *ref = local->rate_ctrl;
70 struct ieee80211_sta *ista = &sta->sta;
71 void *priv_sta = sta->rate_ctrl_priv;
72
73 if (ref->ops->rate_update)
74 ref->ops->rate_update(ref->priv, sband, ista,
75 priv_sta, changed);
76}
65 77
66static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, 78static inline void *rate_control_alloc_sta(struct rate_control_ref *ref,
67 struct ieee80211_sta *sta, 79 struct ieee80211_sta *sta,
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 1327d424bf31..66f7ecf51b92 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -838,7 +838,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
838 if (rx->sdata->vif.type == NL80211_IFTYPE_ADHOC) { 838 if (rx->sdata->vif.type == NL80211_IFTYPE_ADHOC) {
839 u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len, 839 u8 *bssid = ieee80211_get_bssid(hdr, rx->skb->len,
840 NL80211_IFTYPE_ADHOC); 840 NL80211_IFTYPE_ADHOC);
841 if (compare_ether_addr(bssid, rx->sdata->u.sta.bssid) == 0) 841 if (compare_ether_addr(bssid, rx->sdata->u.ibss.bssid) == 0)
842 sta->last_rx = jiffies; 842 sta->last_rx = jiffies;
843 } else 843 } else
844 if (!is_multicast_ether_addr(hdr->addr1) || 844 if (!is_multicast_ether_addr(hdr->addr1) ||
@@ -1702,13 +1702,13 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
1702 return; 1702 return;
1703 } 1703 }
1704 1704
1705 if (compare_ether_addr(mgmt->sa, sdata->u.sta.bssid) != 0 || 1705 if (compare_ether_addr(mgmt->sa, sdata->u.mgd.bssid) != 0 ||
1706 compare_ether_addr(mgmt->bssid, sdata->u.sta.bssid) != 0) { 1706 compare_ether_addr(mgmt->bssid, sdata->u.mgd.bssid) != 0) {
1707 /* Not from the current AP. */ 1707 /* Not from the current AP. */
1708 return; 1708 return;
1709 } 1709 }
1710 1710
1711 if (sdata->u.sta.state == IEEE80211_STA_MLME_ASSOCIATE) { 1711 if (sdata->u.mgd.state == IEEE80211_STA_MLME_ASSOCIATE) {
1712 /* Association in progress; ignore SA Query */ 1712 /* Association in progress; ignore SA Query */
1713 return; 1713 return;
1714 } 1714 }
@@ -1727,7 +1727,7 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
1727 memset(resp, 0, 24); 1727 memset(resp, 0, 24);
1728 memcpy(resp->da, mgmt->sa, ETH_ALEN); 1728 memcpy(resp->da, mgmt->sa, ETH_ALEN);
1729 memcpy(resp->sa, sdata->dev->dev_addr, ETH_ALEN); 1729 memcpy(resp->sa, sdata->dev->dev_addr, ETH_ALEN);
1730 memcpy(resp->bssid, sdata->u.sta.bssid, ETH_ALEN); 1730 memcpy(resp->bssid, sdata->u.mgd.bssid, ETH_ALEN);
1731 resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | 1731 resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
1732 IEEE80211_STYPE_ACTION); 1732 IEEE80211_STYPE_ACTION);
1733 skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query)); 1733 skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query));
@@ -1745,7 +1745,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
1745{ 1745{
1746 struct ieee80211_local *local = rx->local; 1746 struct ieee80211_local *local = rx->local;
1747 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev); 1747 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(rx->dev);
1748 struct ieee80211_if_sta *ifsta = &sdata->u.sta;
1749 struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; 1748 struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data;
1750 struct ieee80211_bss *bss; 1749 struct ieee80211_bss *bss;
1751 int len = rx->skb->len; 1750 int len = rx->skb->len;
@@ -1803,6 +1802,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
1803 case WLAN_CATEGORY_SPECTRUM_MGMT: 1802 case WLAN_CATEGORY_SPECTRUM_MGMT:
1804 if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ) 1803 if (local->hw.conf.channel->band != IEEE80211_BAND_5GHZ)
1805 return RX_DROP_MONITOR; 1804 return RX_DROP_MONITOR;
1805
1806 if (sdata->vif.type != NL80211_IFTYPE_STATION)
1807 return RX_DROP_MONITOR;
1808
1806 switch (mgmt->u.action.u.measurement.action_code) { 1809 switch (mgmt->u.action.u.measurement.action_code) {
1807 case WLAN_ACTION_SPCT_MSR_REQ: 1810 case WLAN_ACTION_SPCT_MSR_REQ:
1808 if (len < (IEEE80211_MIN_ACTION_SIZE + 1811 if (len < (IEEE80211_MIN_ACTION_SIZE +
@@ -1815,12 +1818,13 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
1815 sizeof(mgmt->u.action.u.chan_switch))) 1818 sizeof(mgmt->u.action.u.chan_switch)))
1816 return RX_DROP_MONITOR; 1819 return RX_DROP_MONITOR;
1817 1820
1818 if (memcmp(mgmt->bssid, ifsta->bssid, ETH_ALEN) != 0) 1821 if (memcmp(mgmt->bssid, sdata->u.mgd.bssid, ETH_ALEN))
1819 return RX_DROP_MONITOR; 1822 return RX_DROP_MONITOR;
1820 1823
1821 bss = ieee80211_rx_bss_get(local, ifsta->bssid, 1824 bss = ieee80211_rx_bss_get(local, sdata->u.mgd.bssid,
1822 local->hw.conf.channel->center_freq, 1825 local->hw.conf.channel->center_freq,
1823 ifsta->ssid, ifsta->ssid_len); 1826 sdata->u.mgd.ssid,
1827 sdata->u.mgd.ssid_len);
1824 if (!bss) 1828 if (!bss)
1825 return RX_DROP_MONITOR; 1829 return RX_DROP_MONITOR;
1826 1830
@@ -1876,11 +1880,14 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
1876 sdata->vif.type != NL80211_IFTYPE_ADHOC) 1880 sdata->vif.type != NL80211_IFTYPE_ADHOC)
1877 return RX_DROP_MONITOR; 1881 return RX_DROP_MONITOR;
1878 1882
1879 if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
1880 return RX_DROP_MONITOR;
1881 1883
1882 ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status); 1884 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
1883 return RX_QUEUED; 1885 if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
1886 return RX_DROP_MONITOR;
1887 return ieee80211_sta_rx_mgmt(sdata, rx->skb, rx->status);
1888 }
1889
1890 return ieee80211_ibss_rx_mgmt(sdata, rx->skb, rx->status);
1884} 1891}
1885 1892
1886static void ieee80211_rx_michael_mic_report(struct net_device *dev, 1893static void ieee80211_rx_michael_mic_report(struct net_device *dev,
@@ -2083,7 +2090,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
2083 case NL80211_IFTYPE_STATION: 2090 case NL80211_IFTYPE_STATION:
2084 if (!bssid) 2091 if (!bssid)
2085 return 0; 2092 return 0;
2086 if (!ieee80211_bssid_match(bssid, sdata->u.sta.bssid)) { 2093 if (!ieee80211_bssid_match(bssid, sdata->u.mgd.bssid)) {
2087 if (!(rx->flags & IEEE80211_RX_IN_SCAN)) 2094 if (!(rx->flags & IEEE80211_RX_IN_SCAN))
2088 return 0; 2095 return 0;
2089 rx->flags &= ~IEEE80211_RX_RA_MATCH; 2096 rx->flags &= ~IEEE80211_RX_RA_MATCH;
@@ -2101,7 +2108,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata,
2101 if (ieee80211_is_beacon(hdr->frame_control)) { 2108 if (ieee80211_is_beacon(hdr->frame_control)) {
2102 return 1; 2109 return 1;
2103 } 2110 }
2104 else if (!ieee80211_bssid_match(bssid, sdata->u.sta.bssid)) { 2111 else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) {
2105 if (!(rx->flags & IEEE80211_RX_IN_SCAN)) 2112 if (!(rx->flags & IEEE80211_RX_IN_SCAN))
2106 return 0; 2113 return 0;
2107 rx->flags &= ~IEEE80211_RX_RA_MATCH; 2114 rx->flags &= ~IEEE80211_RX_RA_MATCH;
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index f883ab9f1e6e..5030a3c87509 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -63,20 +63,15 @@ ieee80211_bss_info_update(struct ieee80211_local *local,
63{ 63{
64 struct ieee80211_bss *bss; 64 struct ieee80211_bss *bss;
65 int clen; 65 int clen;
66 enum cfg80211_signal_type sigtype = CFG80211_SIGNAL_TYPE_NONE;
67 s32 signal = 0; 66 s32 signal = 0;
68 67
69 if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) { 68 if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
70 sigtype = CFG80211_SIGNAL_TYPE_MBM;
71 signal = rx_status->signal * 100; 69 signal = rx_status->signal * 100;
72 } else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) { 70 else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
73 sigtype = CFG80211_SIGNAL_TYPE_UNSPEC;
74 signal = (rx_status->signal * 100) / local->hw.max_signal; 71 signal = (rx_status->signal * 100) / local->hw.max_signal;
75 }
76 72
77 bss = (void *)cfg80211_inform_bss_frame(local->hw.wiphy, channel, 73 bss = (void *)cfg80211_inform_bss_frame(local->hw.wiphy, channel,
78 mgmt, len, signal, sigtype, 74 mgmt, len, signal, GFP_ATOMIC);
79 GFP_ATOMIC);
80 75
81 if (!bss) 76 if (!bss)
82 return NULL; 77 return NULL;
@@ -207,34 +202,16 @@ ieee80211_scan_rx(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
207 return RX_QUEUED; 202 return RX_QUEUED;
208} 203}
209 204
210void ieee80211_send_nullfunc(struct ieee80211_local *local, 205void ieee80211_scan_failed(struct ieee80211_local *local)
211 struct ieee80211_sub_if_data *sdata,
212 int powersave)
213{ 206{
214 struct sk_buff *skb; 207 if (WARN_ON(!local->scan_req))
215 struct ieee80211_hdr *nullfunc;
216 __le16 fc;
217
218 skb = dev_alloc_skb(local->hw.extra_tx_headroom + 24);
219 if (!skb) {
220 printk(KERN_DEBUG "%s: failed to allocate buffer for nullfunc "
221 "frame\n", sdata->dev->name);
222 return; 208 return;
223 } 209
224 skb_reserve(skb, local->hw.extra_tx_headroom); 210 /* notify cfg80211 about the failed scan */
225 211 if (local->scan_req != &local->int_scan_req)
226 nullfunc = (struct ieee80211_hdr *) skb_put(skb, 24); 212 cfg80211_scan_done(local->scan_req, true);
227 memset(nullfunc, 0, 24); 213
228 fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | 214 local->scan_req = NULL;
229 IEEE80211_FCTL_TODS);
230 if (powersave)
231 fc |= cpu_to_le16(IEEE80211_FCTL_PM);
232 nullfunc->frame_control = fc;
233 memcpy(nullfunc->addr1, sdata->u.sta.bssid, ETH_ALEN);
234 memcpy(nullfunc->addr2, sdata->dev->dev_addr, ETH_ALEN);
235 memcpy(nullfunc->addr3, sdata->u.sta.bssid, ETH_ALEN);
236
237 ieee80211_tx_skb(sdata, skb, 0);
238} 215}
239 216
240void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) 217void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
@@ -280,6 +257,9 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
280 netif_addr_unlock(local->mdev); 257 netif_addr_unlock(local->mdev);
281 netif_tx_unlock_bh(local->mdev); 258 netif_tx_unlock_bh(local->mdev);
282 259
260 if (local->ops->sw_scan_complete)
261 local->ops->sw_scan_complete(local_to_hw(local));
262
283 mutex_lock(&local->iflist_mtx); 263 mutex_lock(&local->iflist_mtx);
284 list_for_each_entry(sdata, &local->interfaces, list) { 264 list_for_each_entry(sdata, &local->interfaces, list) {
285 if (!netif_running(sdata->dev)) 265 if (!netif_running(sdata->dev))
@@ -287,7 +267,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
287 267
288 /* Tell AP we're back */ 268 /* Tell AP we're back */
289 if (sdata->vif.type == NL80211_IFTYPE_STATION) { 269 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
290 if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) { 270 if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED) {
291 ieee80211_send_nullfunc(local, sdata, 0); 271 ieee80211_send_nullfunc(local, sdata, 0);
292 netif_tx_wake_all_queues(sdata->dev); 272 netif_tx_wake_all_queues(sdata->dev);
293 } 273 }
@@ -305,6 +285,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
305 285
306 done: 286 done:
307 ieee80211_mlme_notify_scan_completed(local); 287 ieee80211_mlme_notify_scan_completed(local);
288 ieee80211_ibss_notify_scan_completed(local);
308 ieee80211_mesh_notify_scan_completed(local); 289 ieee80211_mesh_notify_scan_completed(local);
309} 290}
310EXPORT_SYMBOL(ieee80211_scan_completed); 291EXPORT_SYMBOL(ieee80211_scan_completed);
@@ -367,7 +348,8 @@ void ieee80211_scan_work(struct work_struct *work)
367 ieee80211_send_probe_req( 348 ieee80211_send_probe_req(
368 sdata, NULL, 349 sdata, NULL,
369 local->scan_req->ssids[i].ssid, 350 local->scan_req->ssids[i].ssid,
370 local->scan_req->ssids[i].ssid_len); 351 local->scan_req->ssids[i].ssid_len,
352 local->scan_req->ie, local->scan_req->ie_len);
371 next_delay = IEEE80211_CHANNEL_TIME; 353 next_delay = IEEE80211_CHANNEL_TIME;
372 break; 354 break;
373 } 355 }
@@ -428,6 +410,8 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata,
428 } 410 }
429 411
430 local->sw_scanning = true; 412 local->sw_scanning = true;
413 if (local->ops->sw_scan_start)
414 local->ops->sw_scan_start(local_to_hw(local));
431 415
432 mutex_lock(&local->iflist_mtx); 416 mutex_lock(&local->iflist_mtx);
433 list_for_each_entry(sdata, &local->interfaces, list) { 417 list_for_each_entry(sdata, &local->interfaces, list) {
@@ -442,7 +426,7 @@ int ieee80211_start_scan(struct ieee80211_sub_if_data *scan_sdata,
442 IEEE80211_IFCC_BEACON_ENABLED); 426 IEEE80211_IFCC_BEACON_ENABLED);
443 427
444 if (sdata->vif.type == NL80211_IFTYPE_STATION) { 428 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
445 if (sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED) { 429 if (sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED) {
446 netif_tx_stop_all_queues(sdata->dev); 430 netif_tx_stop_all_queues(sdata->dev);
447 ieee80211_send_nullfunc(local, sdata, 1); 431 ieee80211_send_nullfunc(local, sdata, 1);
448 } 432 }
@@ -477,7 +461,7 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
477 struct cfg80211_scan_request *req) 461 struct cfg80211_scan_request *req)
478{ 462{
479 struct ieee80211_local *local = sdata->local; 463 struct ieee80211_local *local = sdata->local;
480 struct ieee80211_if_sta *ifsta; 464 struct ieee80211_if_managed *ifmgd;
481 465
482 if (!req) 466 if (!req)
483 return -EINVAL; 467 return -EINVAL;
@@ -502,9 +486,9 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata,
502 return -EBUSY; 486 return -EBUSY;
503 } 487 }
504 488
505 ifsta = &sdata->u.sta; 489 ifmgd = &sdata->u.mgd;
506 set_bit(IEEE80211_STA_REQ_SCAN, &ifsta->request); 490 set_bit(IEEE80211_STA_REQ_SCAN, &ifmgd->request);
507 queue_work(local->hw.workqueue, &ifsta->work); 491 queue_work(local->hw.workqueue, &ifmgd->work);
508 492
509 return 0; 493 return 0;
510} 494}
diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c
index 47bb2aed2813..5f7a2624ed74 100644
--- a/net/mac80211/spectmgmt.c
+++ b/net/mac80211/spectmgmt.c
@@ -88,16 +88,16 @@ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata,
88void ieee80211_chswitch_work(struct work_struct *work) 88void ieee80211_chswitch_work(struct work_struct *work)
89{ 89{
90 struct ieee80211_sub_if_data *sdata = 90 struct ieee80211_sub_if_data *sdata =
91 container_of(work, struct ieee80211_sub_if_data, u.sta.chswitch_work); 91 container_of(work, struct ieee80211_sub_if_data, u.mgd.chswitch_work);
92 struct ieee80211_bss *bss; 92 struct ieee80211_bss *bss;
93 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 93 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
94 94
95 if (!netif_running(sdata->dev)) 95 if (!netif_running(sdata->dev))
96 return; 96 return;
97 97
98 bss = ieee80211_rx_bss_get(sdata->local, ifsta->bssid, 98 bss = ieee80211_rx_bss_get(sdata->local, ifmgd->bssid,
99 sdata->local->hw.conf.channel->center_freq, 99 sdata->local->hw.conf.channel->center_freq,
100 ifsta->ssid, ifsta->ssid_len); 100 ifmgd->ssid, ifmgd->ssid_len);
101 if (!bss) 101 if (!bss)
102 goto exit; 102 goto exit;
103 103
@@ -108,7 +108,7 @@ void ieee80211_chswitch_work(struct work_struct *work)
108 108
109 ieee80211_rx_bss_put(sdata->local, bss); 109 ieee80211_rx_bss_put(sdata->local, bss);
110exit: 110exit:
111 ifsta->flags &= ~IEEE80211_STA_CSA_RECEIVED; 111 ifmgd->flags &= ~IEEE80211_STA_CSA_RECEIVED;
112 ieee80211_wake_queues_by_reason(&sdata->local->hw, 112 ieee80211_wake_queues_by_reason(&sdata->local->hw,
113 IEEE80211_QUEUE_STOP_REASON_CSA); 113 IEEE80211_QUEUE_STOP_REASON_CSA);
114} 114}
@@ -117,9 +117,9 @@ void ieee80211_chswitch_timer(unsigned long data)
117{ 117{
118 struct ieee80211_sub_if_data *sdata = 118 struct ieee80211_sub_if_data *sdata =
119 (struct ieee80211_sub_if_data *) data; 119 (struct ieee80211_sub_if_data *) data;
120 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 120 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
121 121
122 queue_work(sdata->local->hw.workqueue, &ifsta->chswitch_work); 122 queue_work(sdata->local->hw.workqueue, &ifmgd->chswitch_work);
123} 123}
124 124
125void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata, 125void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata,
@@ -127,14 +127,14 @@ void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata,
127 struct ieee80211_bss *bss) 127 struct ieee80211_bss *bss)
128{ 128{
129 struct ieee80211_channel *new_ch; 129 struct ieee80211_channel *new_ch;
130 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 130 struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
131 int new_freq = ieee80211_channel_to_frequency(sw_elem->new_ch_num); 131 int new_freq = ieee80211_channel_to_frequency(sw_elem->new_ch_num);
132 132
133 /* FIXME: Handle ADHOC later */ 133 /* FIXME: Handle ADHOC later */
134 if (sdata->vif.type != NL80211_IFTYPE_STATION) 134 if (sdata->vif.type != NL80211_IFTYPE_STATION)
135 return; 135 return;
136 136
137 if (ifsta->state != IEEE80211_STA_MLME_ASSOCIATED) 137 if (ifmgd->state != IEEE80211_STA_MLME_ASSOCIATED)
138 return; 138 return;
139 139
140 if (sdata->local->sw_scanning || sdata->local->hw_scanning) 140 if (sdata->local->sw_scanning || sdata->local->hw_scanning)
@@ -143,7 +143,7 @@ void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata,
143 /* Disregard subsequent beacons if we are already running a timer 143 /* Disregard subsequent beacons if we are already running a timer
144 processing a CSA */ 144 processing a CSA */
145 145
146 if (ifsta->flags & IEEE80211_STA_CSA_RECEIVED) 146 if (ifmgd->flags & IEEE80211_STA_CSA_RECEIVED)
147 return; 147 return;
148 148
149 new_ch = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq); 149 new_ch = ieee80211_get_channel(sdata->local->hw.wiphy, new_freq);
@@ -153,12 +153,12 @@ void ieee80211_process_chanswitch(struct ieee80211_sub_if_data *sdata,
153 sdata->local->csa_channel = new_ch; 153 sdata->local->csa_channel = new_ch;
154 154
155 if (sw_elem->count <= 1) { 155 if (sw_elem->count <= 1) {
156 queue_work(sdata->local->hw.workqueue, &ifsta->chswitch_work); 156 queue_work(sdata->local->hw.workqueue, &ifmgd->chswitch_work);
157 } else { 157 } else {
158 ieee80211_stop_queues_by_reason(&sdata->local->hw, 158 ieee80211_stop_queues_by_reason(&sdata->local->hw,
159 IEEE80211_QUEUE_STOP_REASON_CSA); 159 IEEE80211_QUEUE_STOP_REASON_CSA);
160 ifsta->flags |= IEEE80211_STA_CSA_RECEIVED; 160 ifmgd->flags |= IEEE80211_STA_CSA_RECEIVED;
161 mod_timer(&ifsta->chswitch_timer, 161 mod_timer(&ifmgd->chswitch_timer,
162 jiffies + 162 jiffies +
163 msecs_to_jiffies(sw_elem->count * 163 msecs_to_jiffies(sw_elem->count *
164 bss->cbss.beacon_interval)); 164 bss->cbss.beacon_interval));
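In the channel-switch path above, a count of one or less triggers the switch work immediately; otherwise the queues are stopped with the CSA reason and a timer is armed for roughly count beacon intervals, after which ieee80211_chswitch_timer() queues chswitch_work. A rough model of the deferral arithmetic (illustrative helper, treating the beacon interval value as milliseconds exactly as the mod_timer() call does):

#include <linux/types.h>
#include <linux/jiffies.h>

/*
 * Sketch of the deferral computed above: e.g. a CSA count of 5 with a
 * beacon interval value of 100 schedules the switch roughly 500 ms
 * from now.
 */
static unsigned long csa_switch_expiry(u16 count, u16 beacon_interval)
{
	return jiffies + msecs_to_jiffies(count * beacon_interval);
}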
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 634f65c0130e..4ba3c540fcf3 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -202,6 +202,18 @@ void sta_info_destroy(struct sta_info *sta)
202 /* Make sure timer won't free the tid_rx struct, see below */ 202 /* Make sure timer won't free the tid_rx struct, see below */
203 if (tid_rx) 203 if (tid_rx)
204 tid_rx->shutdown = true; 204 tid_rx->shutdown = true;
205
206 /*
207 * The stop callback cannot find this station any more, but
208 * it didn't complete its work -- start the queue if necessary
209 */
210 if (sta->ampdu_mlme.tid_state_tx[i] & HT_AGG_STATE_INITIATOR_MSK &&
211 sta->ampdu_mlme.tid_state_tx[i] & HT_AGG_STATE_REQ_STOP_BA_MSK &&
212 local->hw.ampdu_queues)
213 ieee80211_wake_queue_by_reason(&local->hw,
214 local->hw.queues + sta->tid_to_tx_q[i],
215 IEEE80211_QUEUE_STOP_REASON_AGGREGATION);
216
205 spin_unlock_bh(&sta->lock); 217 spin_unlock_bh(&sta->lock);
206 218
207 /* 219 /*
@@ -275,8 +287,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata,
275 * enable session_timer's data differentiation. refer to 287 * enable session_timer's data differentiation. refer to
276 * sta_rx_agg_session_timer_expired for useage */ 288 * sta_rx_agg_session_timer_expired for useage */
277 sta->timer_to_tid[i] = i; 289 sta->timer_to_tid[i] = i;
278 /* tid to tx queue: initialize according to HW (0 is valid) */ 290 sta->tid_to_tx_q[i] = -1;
279 sta->tid_to_tx_q[i] = ieee80211_num_queues(&local->hw);
280 /* rx */ 291 /* rx */
281 sta->ampdu_mlme.tid_state_rx[i] = HT_AGG_STATE_IDLE; 292 sta->ampdu_mlme.tid_state_rx[i] = HT_AGG_STATE_IDLE;
282 sta->ampdu_mlme.tid_rx[i] = NULL; 293 sta->ampdu_mlme.tid_rx[i] = NULL;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index d9653231992f..1f45573c580c 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -90,6 +90,7 @@ struct tid_ampdu_tx {
90 * @buf_size: buffer size for incoming A-MPDUs 90 * @buf_size: buffer size for incoming A-MPDUs
91 * @timeout: reset timer value (in TUs). 91 * @timeout: reset timer value (in TUs).
92 * @dialog_token: dialog token for aggregation session 92 * @dialog_token: dialog token for aggregation session
93 * @shutdown: this session is being shut down due to STA removal
93 */ 94 */
94struct tid_ampdu_rx { 95struct tid_ampdu_rx {
95 struct sk_buff **reorder_buf; 96 struct sk_buff **reorder_buf;
@@ -200,7 +201,7 @@ struct sta_ampdu_mlme {
200 * @tid_seq: per-TID sequence numbers for sending to this STA 201 * @tid_seq: per-TID sequence numbers for sending to this STA
201 * @ampdu_mlme: A-MPDU state machine state 202 * @ampdu_mlme: A-MPDU state machine state
202 * @timer_to_tid: identity mapping to ID timers 203 * @timer_to_tid: identity mapping to ID timers
203 * @tid_to_tx_q: map tid to tx queue 204 * @tid_to_tx_q: map tid to tx queue (invalid == negative values)
204 * @llid: Local link ID 205 * @llid: Local link ID
205 * @plid: Peer link ID 206 * @plid: Peer link ID
206 * @reason: Cancel reason on PLINK_HOLDING state 207 * @reason: Cancel reason on PLINK_HOLDING state
@@ -275,7 +276,7 @@ struct sta_info {
275 */ 276 */
276 struct sta_ampdu_mlme ampdu_mlme; 277 struct sta_ampdu_mlme ampdu_mlme;
277 u8 timer_to_tid[STA_TID_NUM]; 278 u8 timer_to_tid[STA_TID_NUM];
278 u8 tid_to_tx_q[STA_TID_NUM]; 279 s8 tid_to_tx_q[STA_TID_NUM];
279 280
280#ifdef CONFIG_MAC80211_MESH 281#ifdef CONFIG_MAC80211_MESH
281 /* 282 /*
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 33926831c648..457238a2f3fc 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -784,6 +784,8 @@ ieee80211_tx_h_fragment(struct ieee80211_tx_data *tx)
784 skb_copy_queue_mapping(frag, first); 784 skb_copy_queue_mapping(frag, first);
785 785
786 frag->do_not_encrypt = first->do_not_encrypt; 786 frag->do_not_encrypt = first->do_not_encrypt;
787 frag->dev = first->dev;
788 frag->iif = first->iif;
787 789
788 pos += copylen; 790 pos += copylen;
789 left -= copylen; 791 left -= copylen;
@@ -876,7 +878,6 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx)
876 return TX_CONTINUE; 878 return TX_CONTINUE;
877} 879}
878 880
879
880/* actual transmit path */ 881/* actual transmit path */
881 882
882/* 883/*
@@ -1016,12 +1017,20 @@ __ieee80211_tx_prepare(struct ieee80211_tx_data *tx,
1016 tx->sta = sta_info_get(local, hdr->addr1); 1017 tx->sta = sta_info_get(local, hdr->addr1);
1017 1018
1018 if (tx->sta && ieee80211_is_data_qos(hdr->frame_control)) { 1019 if (tx->sta && ieee80211_is_data_qos(hdr->frame_control)) {
1020 unsigned long flags;
1019 qc = ieee80211_get_qos_ctl(hdr); 1021 qc = ieee80211_get_qos_ctl(hdr);
1020 tid = *qc & IEEE80211_QOS_CTL_TID_MASK; 1022 tid = *qc & IEEE80211_QOS_CTL_TID_MASK;
1021 1023
1024 spin_lock_irqsave(&tx->sta->lock, flags);
1022 state = &tx->sta->ampdu_mlme.tid_state_tx[tid]; 1025 state = &tx->sta->ampdu_mlme.tid_state_tx[tid];
1023 if (*state == HT_AGG_STATE_OPERATIONAL) 1026 if (*state == HT_AGG_STATE_OPERATIONAL) {
1024 info->flags |= IEEE80211_TX_CTL_AMPDU; 1027 info->flags |= IEEE80211_TX_CTL_AMPDU;
1028 if (local->hw.ampdu_queues)
1029 skb_set_queue_mapping(
1030 skb, tx->local->hw.queues +
1031 tx->sta->tid_to_tx_q[tid]);
1032 }
1033 spin_unlock_irqrestore(&tx->sta->lock, flags);
1025 } 1034 }
1026 1035
1027 if (is_multicast_ether_addr(hdr->addr1)) { 1036 if (is_multicast_ether_addr(hdr->addr1)) {
@@ -1085,7 +1094,8 @@ static int __ieee80211_tx(struct ieee80211_local *local, struct sk_buff *skb,
1085 int ret, i; 1094 int ret, i;
1086 1095
1087 if (skb) { 1096 if (skb) {
1088 if (netif_subqueue_stopped(local->mdev, skb)) 1097 if (ieee80211_queue_stopped(&local->hw,
1098 skb_get_queue_mapping(skb)))
1089 return IEEE80211_TX_PENDING; 1099 return IEEE80211_TX_PENDING;
1090 1100
1091 ret = local->ops->tx(local_to_hw(local), skb); 1101 ret = local->ops->tx(local_to_hw(local), skb);
@@ -1101,8 +1111,8 @@ static int __ieee80211_tx(struct ieee80211_local *local, struct sk_buff *skb,
1101 info = IEEE80211_SKB_CB(tx->extra_frag[i]); 1111 info = IEEE80211_SKB_CB(tx->extra_frag[i]);
1102 info->flags &= ~(IEEE80211_TX_CTL_CLEAR_PS_FILT | 1112 info->flags &= ~(IEEE80211_TX_CTL_CLEAR_PS_FILT |
1103 IEEE80211_TX_CTL_FIRST_FRAGMENT); 1113 IEEE80211_TX_CTL_FIRST_FRAGMENT);
1104 if (netif_subqueue_stopped(local->mdev, 1114 if (ieee80211_queue_stopped(&local->hw,
1105 tx->extra_frag[i])) 1115 skb_get_queue_mapping(tx->extra_frag[i])))
1106 return IEEE80211_TX_FRAG_AGAIN; 1116 return IEEE80211_TX_FRAG_AGAIN;
1107 1117
1108 ret = local->ops->tx(local_to_hw(local), 1118 ret = local->ops->tx(local_to_hw(local),
@@ -1625,7 +1635,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
1625 case NL80211_IFTYPE_STATION: 1635 case NL80211_IFTYPE_STATION:
1626 fc |= cpu_to_le16(IEEE80211_FCTL_TODS); 1636 fc |= cpu_to_le16(IEEE80211_FCTL_TODS);
1627 /* BSSID SA DA */ 1637 /* BSSID SA DA */
1628 memcpy(hdr.addr1, sdata->u.sta.bssid, ETH_ALEN); 1638 memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN);
1629 memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); 1639 memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
1630 memcpy(hdr.addr3, skb->data, ETH_ALEN); 1640 memcpy(hdr.addr3, skb->data, ETH_ALEN);
1631 hdrlen = 24; 1641 hdrlen = 24;
@@ -1634,7 +1644,7 @@ int ieee80211_subif_start_xmit(struct sk_buff *skb,
1634 /* DA SA BSSID */ 1644 /* DA SA BSSID */
1635 memcpy(hdr.addr1, skb->data, ETH_ALEN); 1645 memcpy(hdr.addr1, skb->data, ETH_ALEN);
1636 memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN); 1646 memcpy(hdr.addr2, skb->data + ETH_ALEN, ETH_ALEN);
1637 memcpy(hdr.addr3, sdata->u.sta.bssid, ETH_ALEN); 1647 memcpy(hdr.addr3, sdata->u.ibss.bssid, ETH_ALEN);
1638 hdrlen = 24; 1648 hdrlen = 24;
1639 break; 1649 break;
1640 default: 1650 default:
@@ -1920,7 +1930,6 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
1920 struct ieee80211_tx_info *info; 1930 struct ieee80211_tx_info *info;
1921 struct ieee80211_sub_if_data *sdata = NULL; 1931 struct ieee80211_sub_if_data *sdata = NULL;
1922 struct ieee80211_if_ap *ap = NULL; 1932 struct ieee80211_if_ap *ap = NULL;
1923 struct ieee80211_if_sta *ifsta = NULL;
1924 struct beacon_data *beacon; 1933 struct beacon_data *beacon;
1925 struct ieee80211_supported_band *sband; 1934 struct ieee80211_supported_band *sband;
1926 enum ieee80211_band band = local->hw.conf.channel->band; 1935 enum ieee80211_band band = local->hw.conf.channel->band;
@@ -1972,13 +1981,13 @@ struct sk_buff *ieee80211_beacon_get(struct ieee80211_hw *hw,
1972 } else 1981 } else
1973 goto out; 1982 goto out;
1974 } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { 1983 } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
1984 struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
1975 struct ieee80211_hdr *hdr; 1985 struct ieee80211_hdr *hdr;
1976 ifsta = &sdata->u.sta;
1977 1986
1978 if (!ifsta->probe_resp) 1987 if (!ifibss->probe_resp)
1979 goto out; 1988 goto out;
1980 1989
1981 skb = skb_copy(ifsta->probe_resp, GFP_ATOMIC); 1990 skb = skb_copy(ifibss->probe_resp, GFP_ATOMIC);
1982 if (!skb) 1991 if (!skb)
1983 goto out; 1992 goto out;
1984 1993
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 73c7d7345abd..e0431a1d218b 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -344,15 +344,36 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
344{ 344{
345 struct ieee80211_local *local = hw_to_local(hw); 345 struct ieee80211_local *local = hw_to_local(hw);
346 346
347 /* we don't need to track ampdu queues */ 347 if (queue >= hw->queues) {
348 if (queue < ieee80211_num_regular_queues(hw)) { 348 if (local->ampdu_ac_queue[queue - hw->queues] < 0)
349 __clear_bit(reason, &local->queue_stop_reasons[queue]); 349 return;
350
351 /*
352 * for virtual aggregation queues, we need to refcount the
353 * internal mac80211 disable (multiple times!), keep track of
354 * driver disable _and_ make sure the regular queue is
355 * actually enabled.
356 */
357 if (reason == IEEE80211_QUEUE_STOP_REASON_AGGREGATION)
358 local->amdpu_ac_stop_refcnt[queue - hw->queues]--;
359 else
360 __clear_bit(reason, &local->queue_stop_reasons[queue]);
350 361
351 if (local->queue_stop_reasons[queue] != 0) 362 if (local->queue_stop_reasons[queue] ||
352 /* someone still has this queue stopped */ 363 local->amdpu_ac_stop_refcnt[queue - hw->queues])
353 return; 364 return;
365
366 /* now go on to treat the corresponding regular queue */
367 queue = local->ampdu_ac_queue[queue - hw->queues];
368 reason = IEEE80211_QUEUE_STOP_REASON_AGGREGATION;
354 } 369 }
355 370
371 __clear_bit(reason, &local->queue_stop_reasons[queue]);
372
373 if (local->queue_stop_reasons[queue] != 0)
374 /* someone still has this queue stopped */
375 return;
376
356 if (test_bit(queue, local->queues_pending)) { 377 if (test_bit(queue, local->queues_pending)) {
357 set_bit(queue, local->queues_pending_run); 378 set_bit(queue, local->queues_pending_run);
358 tasklet_schedule(&local->tx_pending_tasklet); 379 tasklet_schedule(&local->tx_pending_tasklet);
@@ -361,8 +382,8 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
361 } 382 }
362} 383}
363 384
364static void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, 385void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue,
365 enum queue_stop_reason reason) 386 enum queue_stop_reason reason)
366{ 387{
367 struct ieee80211_local *local = hw_to_local(hw); 388 struct ieee80211_local *local = hw_to_local(hw);
368 unsigned long flags; 389 unsigned long flags;
@@ -384,15 +405,33 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
384{ 405{
385 struct ieee80211_local *local = hw_to_local(hw); 406 struct ieee80211_local *local = hw_to_local(hw);
386 407
387 /* we don't need to track ampdu queues */ 408 if (queue >= hw->queues) {
388 if (queue < ieee80211_num_regular_queues(hw)) 409 if (local->ampdu_ac_queue[queue - hw->queues] < 0)
389 __set_bit(reason, &local->queue_stop_reasons[queue]); 410 return;
411
412 /*
413 * for virtual aggregation queues, we need to refcount the
414 * internal mac80211 disable (multiple times!), keep track of
415 * driver disable _and_ make sure the regular queue is
416 * actually enabled.
417 */
418 if (reason == IEEE80211_QUEUE_STOP_REASON_AGGREGATION)
419 local->amdpu_ac_stop_refcnt[queue - hw->queues]++;
420 else
421 __set_bit(reason, &local->queue_stop_reasons[queue]);
422
423 /* now go on to treat the corresponding regular queue */
424 queue = local->ampdu_ac_queue[queue - hw->queues];
425 reason = IEEE80211_QUEUE_STOP_REASON_AGGREGATION;
426 }
427
428 __set_bit(reason, &local->queue_stop_reasons[queue]);
390 429
391 netif_stop_subqueue(local->mdev, queue); 430 netif_stop_subqueue(local->mdev, queue);
392} 431}
393 432
394static void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, 433void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue,
395 enum queue_stop_reason reason) 434 enum queue_stop_reason reason)
396{ 435{
397 struct ieee80211_local *local = hw_to_local(hw); 436 struct ieee80211_local *local = hw_to_local(hw);
398 unsigned long flags; 437 unsigned long flags;
@@ -418,7 +457,7 @@ void ieee80211_stop_queues_by_reason(struct ieee80211_hw *hw,
418 457
419 spin_lock_irqsave(&local->queue_stop_reason_lock, flags); 458 spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
420 459
421 for (i = 0; i < ieee80211_num_queues(hw); i++) 460 for (i = 0; i < hw->queues; i++)
422 __ieee80211_stop_queue(hw, i, reason); 461 __ieee80211_stop_queue(hw, i, reason);
423 462
424 spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); 463 spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
@@ -434,6 +473,16 @@ EXPORT_SYMBOL(ieee80211_stop_queues);
434int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue) 473int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue)
435{ 474{
436 struct ieee80211_local *local = hw_to_local(hw); 475 struct ieee80211_local *local = hw_to_local(hw);
476 unsigned long flags;
477
478 if (queue >= hw->queues) {
479 spin_lock_irqsave(&local->queue_stop_reason_lock, flags);
480 queue = local->ampdu_ac_queue[queue - hw->queues];
481 spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags);
482 if (queue < 0)
483 return true;
484 }
485
437 return __netif_subqueue_stopped(local->mdev, queue); 486 return __netif_subqueue_stopped(local->mdev, queue);
438} 487}
439EXPORT_SYMBOL(ieee80211_queue_stopped); 488EXPORT_SYMBOL(ieee80211_queue_stopped);
@@ -701,6 +750,27 @@ void ieee80211_set_wmm_default(struct ieee80211_sub_if_data *sdata)
701 local->ops->conf_tx(local_to_hw(local), i, &qparam); 750 local->ops->conf_tx(local_to_hw(local), i, &qparam);
702} 751}
703 752
753void ieee80211_sta_def_wmm_params(struct ieee80211_sub_if_data *sdata,
754 const size_t supp_rates_len,
755 const u8 *supp_rates)
756{
757 struct ieee80211_local *local = sdata->local;
758 int i, have_higher_than_11mbit = 0;
759
760 /* cf. IEEE 802.11 9.2.12 */
761 for (i = 0; i < supp_rates_len; i++)
762 if ((supp_rates[i] & 0x7f) * 5 > 110)
763 have_higher_than_11mbit = 1;
764
765 if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ &&
766 have_higher_than_11mbit)
767 sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE;
768 else
769 sdata->flags &= ~IEEE80211_SDATA_OPERATING_GMODE;
770
771 ieee80211_set_wmm_default(sdata);
772}
773
704void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, 774void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb,
705 int encrypt) 775 int encrypt)
706{ 776{
@@ -767,3 +837,161 @@ u32 ieee80211_mandatory_rates(struct ieee80211_local *local,
767 mandatory_rates |= BIT(i); 837 mandatory_rates |= BIT(i);
768 return mandatory_rates; 838 return mandatory_rates;
769} 839}
840
841void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
842 u16 transaction, u16 auth_alg,
843 u8 *extra, size_t extra_len,
844 const u8 *bssid, int encrypt)
845{
846 struct ieee80211_local *local = sdata->local;
847 struct sk_buff *skb;
848 struct ieee80211_mgmt *mgmt;
849 const u8 *ie_auth = NULL;
850 int ie_auth_len = 0;
851
852 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
853 ie_auth_len = sdata->u.mgd.ie_auth_len;
854 ie_auth = sdata->u.mgd.ie_auth;
855 }
856
857 skb = dev_alloc_skb(local->hw.extra_tx_headroom +
858 sizeof(*mgmt) + 6 + extra_len + ie_auth_len);
859 if (!skb) {
860 printk(KERN_DEBUG "%s: failed to allocate buffer for auth "
861 "frame\n", sdata->dev->name);
862 return;
863 }
864 skb_reserve(skb, local->hw.extra_tx_headroom);
865
866 mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24 + 6);
867 memset(mgmt, 0, 24 + 6);
868 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
869 IEEE80211_STYPE_AUTH);
870 if (encrypt)
871 mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
872 memcpy(mgmt->da, bssid, ETH_ALEN);
873 memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
874 memcpy(mgmt->bssid, bssid, ETH_ALEN);
875 mgmt->u.auth.auth_alg = cpu_to_le16(auth_alg);
876 mgmt->u.auth.auth_transaction = cpu_to_le16(transaction);
877 mgmt->u.auth.status_code = cpu_to_le16(0);
878 if (extra)
879 memcpy(skb_put(skb, extra_len), extra, extra_len);
880 if (ie_auth)
881 memcpy(skb_put(skb, ie_auth_len), ie_auth, ie_auth_len);
882
883 ieee80211_tx_skb(sdata, skb, encrypt);
884}
885
886void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst,
887 u8 *ssid, size_t ssid_len,
888 u8 *ie, size_t ie_len)
889{
890 struct ieee80211_local *local = sdata->local;
891 struct ieee80211_supported_band *sband;
892 struct sk_buff *skb;
893 struct ieee80211_mgmt *mgmt;
894 u8 *pos, *supp_rates, *esupp_rates = NULL, *extra_preq_ie = NULL;
895 int i, extra_preq_ie_len = 0;
896
897 switch (sdata->vif.type) {
898 case NL80211_IFTYPE_STATION:
899 extra_preq_ie_len = sdata->u.mgd.ie_probereq_len;
900 extra_preq_ie = sdata->u.mgd.ie_probereq;
901 break;
902 default:
903 break;
904 }
905
906 skb = dev_alloc_skb(local->hw.extra_tx_headroom + sizeof(*mgmt) + 200 +
907 ie_len + extra_preq_ie_len);
908 if (!skb) {
909 printk(KERN_DEBUG "%s: failed to allocate buffer for probe "
910 "request\n", sdata->dev->name);
911 return;
912 }
913 skb_reserve(skb, local->hw.extra_tx_headroom);
914
915 mgmt = (struct ieee80211_mgmt *) skb_put(skb, 24);
916 memset(mgmt, 0, 24);
917 mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
918 IEEE80211_STYPE_PROBE_REQ);
919 memcpy(mgmt->sa, sdata->dev->dev_addr, ETH_ALEN);
920 if (dst) {
921 memcpy(mgmt->da, dst, ETH_ALEN);
922 memcpy(mgmt->bssid, dst, ETH_ALEN);
923 } else {
924 memset(mgmt->da, 0xff, ETH_ALEN);
925 memset(mgmt->bssid, 0xff, ETH_ALEN);
926 }
927 pos = skb_put(skb, 2 + ssid_len);
928 *pos++ = WLAN_EID_SSID;
929 *pos++ = ssid_len;
930 memcpy(pos, ssid, ssid_len);
931
932 supp_rates = skb_put(skb, 2);
933 supp_rates[0] = WLAN_EID_SUPP_RATES;
934 supp_rates[1] = 0;
935 sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
936
937 for (i = 0; i < sband->n_bitrates; i++) {
938 struct ieee80211_rate *rate = &sband->bitrates[i];
939 if (esupp_rates) {
940 pos = skb_put(skb, 1);
941 esupp_rates[1]++;
942 } else if (supp_rates[1] == 8) {
943 esupp_rates = skb_put(skb, 3);
944 esupp_rates[0] = WLAN_EID_EXT_SUPP_RATES;
945 esupp_rates[1] = 1;
946 pos = &esupp_rates[2];
947 } else {
948 pos = skb_put(skb, 1);
949 supp_rates[1]++;
950 }
951 *pos = rate->bitrate / 5;
952 }
953
954 if (ie)
955 memcpy(skb_put(skb, ie_len), ie, ie_len);
956 if (extra_preq_ie)
957 memcpy(skb_put(skb, extra_preq_ie_len), extra_preq_ie,
958 extra_preq_ie_len);
959
960 ieee80211_tx_skb(sdata, skb, 0);
961}
962
963u32 ieee80211_sta_get_rates(struct ieee80211_local *local,
964 struct ieee802_11_elems *elems,
965 enum ieee80211_band band)
966{
967 struct ieee80211_supported_band *sband;
968 struct ieee80211_rate *bitrates;
969 size_t num_rates;
970 u32 supp_rates;
971 int i, j;
972 sband = local->hw.wiphy->bands[band];
973
974 if (!sband) {
975 WARN_ON(1);
976 sband = local->hw.wiphy->bands[local->hw.conf.channel->band];
977 }
978
979 bitrates = sband->bitrates;
980 num_rates = sband->n_bitrates;
981 supp_rates = 0;
982 for (i = 0; i < elems->supp_rates_len +
983 elems->ext_supp_rates_len; i++) {
984 u8 rate = 0;
985 int own_rate;
986 if (i < elems->supp_rates_len)
987 rate = elems->supp_rates[i];
988 else if (elems->ext_supp_rates)
989 rate = elems->ext_supp_rates
990 [i - elems->supp_rates_len];
991 own_rate = 5 * (rate & 0x7f);
992 for (j = 0; j < num_rates; j++)
993 if (bitrates[j].bitrate == own_rate)
994 supp_rates |= BIT(j);
995 }
996 return supp_rates;
997}
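Both helpers above lean on the 802.11 rate encoding: each octet of the Supported Rates / Extended Supported Rates elements carries the rate in units of 500 kbit/s in its low seven bits (so (rate & 0x7f) * 5 yields the 100 kbit/s units used by mac80211's bitrate tables, and a value above 110 means "faster than 11 Mbit/s" in the G-mode check), the top bit flags a basic rate, and the Supported Rates element holds at most eight entries before the rest spills into the extended element. A small self-contained sketch of that decoding, independent of the kernel structures:

#include <stdio.h>

/*
 * Decode one Supported Rates octet: low 7 bits are the rate in
 * 500 kbit/s units, the MSB flags a "basic" (mandatory) rate.
 */
static void decode_rate_octet(unsigned char octet)
{
	unsigned int kbps = (octet & 0x7f) * 500;
	int basic = !!(octet & 0x80);

	printf("%u.%u Mbit/s%s\n", kbps / 1000, (kbps % 1000) / 100,
	       basic ? " (basic)" : "");
}

int main(void)
{
	/* 0x82 = basic 1 Mbit/s, 0x96 = basic 11 Mbit/s, 0x6c = 54 Mbit/s */
	decode_rate_octet(0x82);
	decode_rate_octet(0x96);
	decode_rate_octet(0x6c);
	return 0;
}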
diff --git a/net/mac80211/wext.c b/net/mac80211/wext.c
index 2b023dce8b24..935c63ed3dfa 100644
--- a/net/mac80211/wext.c
+++ b/net/mac80211/wext.c
@@ -132,139 +132,37 @@ static int ieee80211_ioctl_siwgenie(struct net_device *dev,
132 if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) 132 if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME)
133 return -EOPNOTSUPP; 133 return -EOPNOTSUPP;
134 134
135 if (sdata->vif.type == NL80211_IFTYPE_STATION || 135 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
136 sdata->vif.type == NL80211_IFTYPE_ADHOC) {
137 int ret = ieee80211_sta_set_extra_ie(sdata, extra, data->length); 136 int ret = ieee80211_sta_set_extra_ie(sdata, extra, data->length);
138 if (ret) 137 if (ret)
139 return ret; 138 return ret;
140 sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; 139 sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL;
141 ieee80211_sta_req_auth(sdata, &sdata->u.sta); 140 ieee80211_sta_req_auth(sdata);
142 return 0; 141 return 0;
143 } 142 }
144 143
145 return -EOPNOTSUPP; 144 return -EOPNOTSUPP;
146} 145}
147 146
148static u8 ieee80211_get_wstats_flags(struct ieee80211_local *local)
149{
150 u8 wstats_flags = 0;
151
152 wstats_flags |= local->hw.flags & (IEEE80211_HW_SIGNAL_UNSPEC |
153 IEEE80211_HW_SIGNAL_DBM) ?
154 IW_QUAL_QUAL_UPDATED : IW_QUAL_QUAL_INVALID;
155 wstats_flags |= local->hw.flags & IEEE80211_HW_NOISE_DBM ?
156 IW_QUAL_NOISE_UPDATED : IW_QUAL_NOISE_INVALID;
157 if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
158 wstats_flags |= IW_QUAL_DBM;
159
160 return wstats_flags;
161}
162
163static int ieee80211_ioctl_giwrange(struct net_device *dev,
164 struct iw_request_info *info,
165 struct iw_point *data, char *extra)
166{
167 struct ieee80211_local *local = wdev_priv(dev->ieee80211_ptr);
168 struct iw_range *range = (struct iw_range *) extra;
169 enum ieee80211_band band;
170 int c = 0;
171
172 data->length = sizeof(struct iw_range);
173 memset(range, 0, sizeof(struct iw_range));
174
175 range->we_version_compiled = WIRELESS_EXT;
176 range->we_version_source = 21;
177 range->retry_capa = IW_RETRY_LIMIT;
178 range->retry_flags = IW_RETRY_LIMIT;
179 range->min_retry = 0;
180 range->max_retry = 255;
181 range->min_rts = 0;
182 range->max_rts = 2347;
183 range->min_frag = 256;
184 range->max_frag = 2346;
185
186 range->encoding_size[0] = 5;
187 range->encoding_size[1] = 13;
188 range->num_encoding_sizes = 2;
189 range->max_encoding_tokens = NUM_DEFAULT_KEYS;
190
191 /* cfg80211 requires this, and enforces 0..100 */
192 if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)
193 range->max_qual.level = 100;
194 else if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM)
195 range->max_qual.level = -110;
196 else
197 range->max_qual.level = 0;
198
199 if (local->hw.flags & IEEE80211_HW_NOISE_DBM)
200 range->max_qual.noise = -110;
201 else
202 range->max_qual.noise = 0;
203
204 range->max_qual.qual = 100;
205 range->max_qual.updated = ieee80211_get_wstats_flags(local);
206
207 range->avg_qual.qual = 50;
208 /* not always true but better than nothing */
209 range->avg_qual.level = range->max_qual.level / 2;
210 range->avg_qual.noise = range->max_qual.noise / 2;
211 range->avg_qual.updated = ieee80211_get_wstats_flags(local);
212
213 range->enc_capa = IW_ENC_CAPA_WPA | IW_ENC_CAPA_WPA2 |
214 IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_CIPHER_CCMP;
215
216
217 for (band = 0; band < IEEE80211_NUM_BANDS; band ++) {
218 int i;
219 struct ieee80211_supported_band *sband;
220
221 sband = local->hw.wiphy->bands[band];
222
223 if (!sband)
224 continue;
225
226 for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) {
227 struct ieee80211_channel *chan = &sband->channels[i];
228
229 if (!(chan->flags & IEEE80211_CHAN_DISABLED)) {
230 range->freq[c].i =
231 ieee80211_frequency_to_channel(
232 chan->center_freq);
233 range->freq[c].m = chan->center_freq;
234 range->freq[c].e = 6;
235 c++;
236 }
237 }
238 }
239 range->num_channels = c;
240 range->num_frequency = c;
241
242 IW_EVENT_CAPA_SET_KERNEL(range->event_capa);
243 IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP);
244 IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN);
245
246 range->scan_capa |= IW_SCAN_CAPA_ESSID;
247
248 return 0;
249}
250
251
252static int ieee80211_ioctl_siwfreq(struct net_device *dev, 147static int ieee80211_ioctl_siwfreq(struct net_device *dev,
253 struct iw_request_info *info, 148 struct iw_request_info *info,
254 struct iw_freq *freq, char *extra) 149 struct iw_freq *freq, char *extra)
255{ 150{
256 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); 151 struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
257 152
258 if (sdata->vif.type == NL80211_IFTYPE_ADHOC || 153 if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
259 sdata->vif.type == NL80211_IFTYPE_STATION) 154 sdata->u.ibss.flags &= ~IEEE80211_IBSS_AUTO_CHANNEL_SEL;
260 sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_CHANNEL_SEL; 155 else if (sdata->vif.type == NL80211_IFTYPE_STATION)
156 sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_CHANNEL_SEL;
261 157
262 /* freq->e == 0: freq->m = channel; otherwise freq = m * 10^e */ 158 /* freq->e == 0: freq->m = channel; otherwise freq = m * 10^e */
263 if (freq->e == 0) { 159 if (freq->e == 0) {
264 if (freq->m < 0) { 160 if (freq->m < 0) {
265 if (sdata->vif.type == NL80211_IFTYPE_ADHOC || 161 if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
266 sdata->vif.type == NL80211_IFTYPE_STATION) 162 sdata->u.ibss.flags |=
267 sdata->u.sta.flags |= 163 IEEE80211_IBSS_AUTO_CHANNEL_SEL;
164 else if (sdata->vif.type == NL80211_IFTYPE_STATION)
165 sdata->u.mgd.flags |=
268 IEEE80211_STA_AUTO_CHANNEL_SEL; 166 IEEE80211_STA_AUTO_CHANNEL_SEL;
269 return 0; 167 return 0;
270 } else 168 } else
@@ -301,32 +199,35 @@ static int ieee80211_ioctl_siwessid(struct net_device *dev,
301{ 199{
302 struct ieee80211_sub_if_data *sdata; 200 struct ieee80211_sub_if_data *sdata;
303 size_t len = data->length; 201 size_t len = data->length;
202 int ret;
304 203
305 /* iwconfig uses nul termination in SSID.. */ 204 /* iwconfig uses nul termination in SSID.. */
306 if (len > 0 && ssid[len - 1] == '\0') 205 if (len > 0 && ssid[len - 1] == '\0')
307 len--; 206 len--;
308 207
309 sdata = IEEE80211_DEV_TO_SUB_IF(dev); 208 sdata = IEEE80211_DEV_TO_SUB_IF(dev);
310 if (sdata->vif.type == NL80211_IFTYPE_STATION || 209 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
311 sdata->vif.type == NL80211_IFTYPE_ADHOC) {
312 int ret;
313 if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) { 210 if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) {
314 if (len > IEEE80211_MAX_SSID_LEN) 211 if (len > IEEE80211_MAX_SSID_LEN)
315 return -EINVAL; 212 return -EINVAL;
316 memcpy(sdata->u.sta.ssid, ssid, len); 213 memcpy(sdata->u.mgd.ssid, ssid, len);
317 sdata->u.sta.ssid_len = len; 214 sdata->u.mgd.ssid_len = len;
318 return 0; 215 return 0;
319 } 216 }
217
320 if (data->flags) 218 if (data->flags)
321 sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_SSID_SEL; 219 sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_SSID_SEL;
322 else 220 else
323 sdata->u.sta.flags |= IEEE80211_STA_AUTO_SSID_SEL; 221 sdata->u.mgd.flags |= IEEE80211_STA_AUTO_SSID_SEL;
222
324 ret = ieee80211_sta_set_ssid(sdata, ssid, len); 223 ret = ieee80211_sta_set_ssid(sdata, ssid, len);
325 if (ret) 224 if (ret)
326 return ret; 225 return ret;
327 ieee80211_sta_req_auth(sdata, &sdata->u.sta); 226
227 ieee80211_sta_req_auth(sdata);
328 return 0; 228 return 0;
329 } 229 } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
230 return ieee80211_ibss_set_ssid(sdata, ssid, len);
330 231
331 return -EOPNOTSUPP; 232 return -EOPNOTSUPP;
332} 233}
@@ -340,8 +241,7 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev,
340 241
341 struct ieee80211_sub_if_data *sdata; 242 struct ieee80211_sub_if_data *sdata;
342 sdata = IEEE80211_DEV_TO_SUB_IF(dev); 243 sdata = IEEE80211_DEV_TO_SUB_IF(dev);
343 if (sdata->vif.type == NL80211_IFTYPE_STATION || 244 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
344 sdata->vif.type == NL80211_IFTYPE_ADHOC) {
345 int res = ieee80211_sta_get_ssid(sdata, ssid, &len); 245 int res = ieee80211_sta_get_ssid(sdata, ssid, &len);
346 if (res == 0) { 246 if (res == 0) {
347 data->length = len; 247 data->length = len;
@@ -349,6 +249,14 @@ static int ieee80211_ioctl_giwessid(struct net_device *dev,
349 } else 249 } else
350 data->flags = 0; 250 data->flags = 0;
351 return res; 251 return res;
252 } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
253 int res = ieee80211_ibss_get_ssid(sdata, ssid, &len);
254 if (res == 0) {
255 data->length = len;
256 data->flags = 1;
257 } else
258 data->flags = 0;
259 return res;
352 } 260 }
353 261
354 return -EOPNOTSUPP; 262 return -EOPNOTSUPP;
@@ -362,26 +270,35 @@ static int ieee80211_ioctl_siwap(struct net_device *dev,
362 struct ieee80211_sub_if_data *sdata; 270 struct ieee80211_sub_if_data *sdata;
363 271
364 sdata = IEEE80211_DEV_TO_SUB_IF(dev); 272 sdata = IEEE80211_DEV_TO_SUB_IF(dev);
365 if (sdata->vif.type == NL80211_IFTYPE_STATION || 273 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
366 sdata->vif.type == NL80211_IFTYPE_ADHOC) {
367 int ret; 274 int ret;
368 if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) { 275 if (sdata->flags & IEEE80211_SDATA_USERSPACE_MLME) {
369 memcpy(sdata->u.sta.bssid, (u8 *) &ap_addr->sa_data, 276 memcpy(sdata->u.mgd.bssid, (u8 *) &ap_addr->sa_data,
370 ETH_ALEN); 277 ETH_ALEN);
371 return 0; 278 return 0;
372 } 279 }
373 if (is_zero_ether_addr((u8 *) &ap_addr->sa_data)) 280 if (is_zero_ether_addr((u8 *) &ap_addr->sa_data))
374 sdata->u.sta.flags |= IEEE80211_STA_AUTO_BSSID_SEL | 281 sdata->u.mgd.flags |= IEEE80211_STA_AUTO_BSSID_SEL |
375 IEEE80211_STA_AUTO_CHANNEL_SEL; 282 IEEE80211_STA_AUTO_CHANNEL_SEL;
376 else if (is_broadcast_ether_addr((u8 *) &ap_addr->sa_data)) 283 else if (is_broadcast_ether_addr((u8 *) &ap_addr->sa_data))
377 sdata->u.sta.flags |= IEEE80211_STA_AUTO_BSSID_SEL; 284 sdata->u.mgd.flags |= IEEE80211_STA_AUTO_BSSID_SEL;
378 else 285 else
379 sdata->u.sta.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL; 286 sdata->u.mgd.flags &= ~IEEE80211_STA_AUTO_BSSID_SEL;
380 ret = ieee80211_sta_set_bssid(sdata, (u8 *) &ap_addr->sa_data); 287 ret = ieee80211_sta_set_bssid(sdata, (u8 *) &ap_addr->sa_data);
381 if (ret) 288 if (ret)
382 return ret; 289 return ret;
383 ieee80211_sta_req_auth(sdata, &sdata->u.sta); 290 ieee80211_sta_req_auth(sdata);
384 return 0; 291 return 0;
292 } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
293 if (is_zero_ether_addr((u8 *) &ap_addr->sa_data))
294 sdata->u.ibss.flags |= IEEE80211_IBSS_AUTO_BSSID_SEL |
295 IEEE80211_IBSS_AUTO_CHANNEL_SEL;
296 else if (is_broadcast_ether_addr((u8 *) &ap_addr->sa_data))
297 sdata->u.ibss.flags |= IEEE80211_IBSS_AUTO_BSSID_SEL;
298 else
299 sdata->u.ibss.flags &= ~IEEE80211_IBSS_AUTO_BSSID_SEL;
300
301 return ieee80211_ibss_set_bssid(sdata, (u8 *) &ap_addr->sa_data);
385 } else if (sdata->vif.type == NL80211_IFTYPE_WDS) { 302 } else if (sdata->vif.type == NL80211_IFTYPE_WDS) {
386 /* 303 /*
387 * If it is necessary to update the WDS peer address 304 * If it is necessary to update the WDS peer address
@@ -410,17 +327,20 @@ static int ieee80211_ioctl_giwap(struct net_device *dev,
410 struct ieee80211_sub_if_data *sdata; 327 struct ieee80211_sub_if_data *sdata;
411 328
412 sdata = IEEE80211_DEV_TO_SUB_IF(dev); 329 sdata = IEEE80211_DEV_TO_SUB_IF(dev);
413 if (sdata->vif.type == NL80211_IFTYPE_STATION || 330 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
414 sdata->vif.type == NL80211_IFTYPE_ADHOC) { 331 if (sdata->u.mgd.state == IEEE80211_STA_MLME_ASSOCIATED) {
415 if (sdata->u.sta.state == IEEE80211_STA_MLME_ASSOCIATED ||
416 sdata->u.sta.state == IEEE80211_STA_MLME_IBSS_JOINED) {
417 ap_addr->sa_family = ARPHRD_ETHER; 332 ap_addr->sa_family = ARPHRD_ETHER;
418 memcpy(&ap_addr->sa_data, sdata->u.sta.bssid, ETH_ALEN); 333 memcpy(&ap_addr->sa_data, sdata->u.mgd.bssid, ETH_ALEN);
419 return 0; 334 } else
420 } else {
421 memset(&ap_addr->sa_data, 0, ETH_ALEN); 335 memset(&ap_addr->sa_data, 0, ETH_ALEN);
422 return 0; 336 return 0;
423 } 337 } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) {
338 if (sdata->u.ibss.state == IEEE80211_IBSS_MLME_JOINED) {
339 ap_addr->sa_family = ARPHRD_ETHER;
340 memcpy(&ap_addr->sa_data, sdata->u.ibss.bssid, ETH_ALEN);
341 } else
342 memset(&ap_addr->sa_data, 0, ETH_ALEN);
343 return 0;
424 } else if (sdata->vif.type == NL80211_IFTYPE_WDS) { 344 } else if (sdata->vif.type == NL80211_IFTYPE_WDS) {
425 ap_addr->sa_family = ARPHRD_ETHER; 345 ap_addr->sa_family = ARPHRD_ETHER;
426 memcpy(&ap_addr->sa_data, sdata->u.wds.remote_addr, ETH_ALEN); 346 memcpy(&ap_addr->sa_data, sdata->u.wds.remote_addr, ETH_ALEN);
@@ -486,7 +406,7 @@ static int ieee80211_ioctl_giwrate(struct net_device *dev,
486 406
487 rcu_read_lock(); 407 rcu_read_lock();
488 408
489 sta = sta_info_get(local, sdata->u.sta.bssid); 409 sta = sta_info_get(local, sdata->u.mgd.bssid);
490 410
491 if (sta && !(sta->last_tx_rate.flags & IEEE80211_TX_RC_MCS)) 411 if (sta && !(sta->last_tx_rate.flags & IEEE80211_TX_RC_MCS))
492 rate->value = sband->bitrates[sta->last_tx_rate.idx].bitrate; 412 rate->value = sband->bitrates[sta->last_tx_rate.idx].bitrate;
@@ -687,8 +607,7 @@ static int ieee80211_ioctl_siwmlme(struct net_device *dev,
687 struct iw_mlme *mlme = (struct iw_mlme *) extra; 607 struct iw_mlme *mlme = (struct iw_mlme *) extra;
688 608
689 sdata = IEEE80211_DEV_TO_SUB_IF(dev); 609 sdata = IEEE80211_DEV_TO_SUB_IF(dev);
690 if (sdata->vif.type != NL80211_IFTYPE_STATION && 610 if (!(sdata->vif.type == NL80211_IFTYPE_STATION))
691 sdata->vif.type != NL80211_IFTYPE_ADHOC)
692 return -EINVAL; 611 return -EINVAL;
693 612
694 switch (mlme->cmd) { 613 switch (mlme->cmd) {
@@ -784,8 +703,7 @@ static int ieee80211_ioctl_giwencode(struct net_device *dev,
784 erq->flags |= IW_ENCODE_ENABLED; 703 erq->flags |= IW_ENCODE_ENABLED;
785 704
786 if (sdata->vif.type == NL80211_IFTYPE_STATION) { 705 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
787 struct ieee80211_if_sta *ifsta = &sdata->u.sta; 706 switch (sdata->u.mgd.auth_alg) {
788 switch (ifsta->auth_alg) {
789 case WLAN_AUTH_OPEN: 707 case WLAN_AUTH_OPEN:
790 case WLAN_AUTH_LEAP: 708 case WLAN_AUTH_LEAP:
791 erq->flags |= IW_ENCODE_OPEN; 709 erq->flags |= IW_ENCODE_OPEN;
@@ -849,7 +767,7 @@ static int ieee80211_ioctl_siwpower(struct net_device *dev,
849 ret = ieee80211_hw_config(local, 767 ret = ieee80211_hw_config(local,
850 IEEE80211_CONF_CHANGE_DYNPS_TIMEOUT); 768 IEEE80211_CONF_CHANGE_DYNPS_TIMEOUT);
851 769
852 if (!(sdata->u.sta.flags & IEEE80211_STA_ASSOCIATED)) 770 if (!(sdata->u.mgd.flags & IEEE80211_STA_ASSOCIATED))
853 return ret; 771 return ret;
854 772
855 if (conf->dynamic_ps_timeout > 0 && 773 if (conf->dynamic_ps_timeout > 0 &&
@@ -908,10 +826,10 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev,
908 if (sdata->vif.type == NL80211_IFTYPE_STATION) { 826 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
909 if (data->value & (IW_AUTH_CIPHER_WEP40 | 827 if (data->value & (IW_AUTH_CIPHER_WEP40 |
910 IW_AUTH_CIPHER_WEP104 | IW_AUTH_CIPHER_TKIP)) 828 IW_AUTH_CIPHER_WEP104 | IW_AUTH_CIPHER_TKIP))
911 sdata->u.sta.flags |= 829 sdata->u.mgd.flags |=
912 IEEE80211_STA_TKIP_WEP_USED; 830 IEEE80211_STA_TKIP_WEP_USED;
913 else 831 else
914 sdata->u.sta.flags &= 832 sdata->u.mgd.flags &=
915 ~IEEE80211_STA_TKIP_WEP_USED; 833 ~IEEE80211_STA_TKIP_WEP_USED;
916 } 834 }
917 break; 835 break;
@@ -922,21 +840,20 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev,
922 if (sdata->vif.type != NL80211_IFTYPE_STATION) 840 if (sdata->vif.type != NL80211_IFTYPE_STATION)
923 ret = -EINVAL; 841 ret = -EINVAL;
924 else { 842 else {
925 sdata->u.sta.flags &= ~IEEE80211_STA_PRIVACY_INVOKED; 843 sdata->u.mgd.flags &= ~IEEE80211_STA_PRIVACY_INVOKED;
926 /* 844 /*
927 * Privacy invoked by wpa_supplicant, store the 845 * Privacy invoked by wpa_supplicant, store the
928 * value and allow associating to a protected 846 * value and allow associating to a protected
929 * network without having a key up front. 847 * network without having a key up front.
930 */ 848 */
931 if (data->value) 849 if (data->value)
932 sdata->u.sta.flags |= 850 sdata->u.mgd.flags |=
933 IEEE80211_STA_PRIVACY_INVOKED; 851 IEEE80211_STA_PRIVACY_INVOKED;
934 } 852 }
935 break; 853 break;
936 case IW_AUTH_80211_AUTH_ALG: 854 case IW_AUTH_80211_AUTH_ALG:
937 if (sdata->vif.type == NL80211_IFTYPE_STATION || 855 if (sdata->vif.type == NL80211_IFTYPE_STATION)
938 sdata->vif.type == NL80211_IFTYPE_ADHOC) 856 sdata->u.mgd.auth_algs = data->value;
939 sdata->u.sta.auth_algs = data->value;
940 else 857 else
941 ret = -EOPNOTSUPP; 858 ret = -EOPNOTSUPP;
942 break; 859 break;
@@ -945,17 +862,16 @@ static int ieee80211_ioctl_siwauth(struct net_device *dev,
945 ret = -EOPNOTSUPP; 862 ret = -EOPNOTSUPP;
946 break; 863 break;
947 } 864 }
948 if (sdata->vif.type == NL80211_IFTYPE_STATION || 865 if (sdata->vif.type == NL80211_IFTYPE_STATION) {
949 sdata->vif.type == NL80211_IFTYPE_ADHOC) {
950 switch (data->value) { 866 switch (data->value) {
951 case IW_AUTH_MFP_DISABLED: 867 case IW_AUTH_MFP_DISABLED:
952 sdata->u.sta.mfp = IEEE80211_MFP_DISABLED; 868 sdata->u.mgd.mfp = IEEE80211_MFP_DISABLED;
953 break; 869 break;
954 case IW_AUTH_MFP_OPTIONAL: 870 case IW_AUTH_MFP_OPTIONAL:
955 sdata->u.sta.mfp = IEEE80211_MFP_OPTIONAL; 871 sdata->u.mgd.mfp = IEEE80211_MFP_OPTIONAL;
956 break; 872 break;
957 case IW_AUTH_MFP_REQUIRED: 873 case IW_AUTH_MFP_REQUIRED:
958 sdata->u.sta.mfp = IEEE80211_MFP_REQUIRED; 874 sdata->u.mgd.mfp = IEEE80211_MFP_REQUIRED;
959 break; 875 break;
960 default: 876 default:
961 ret = -EINVAL; 877 ret = -EINVAL;
@@ -980,9 +896,9 @@ static struct iw_statistics *ieee80211_get_wireless_stats(struct net_device *dev
980 896
981 rcu_read_lock(); 897 rcu_read_lock();
982 898
983 if (sdata->vif.type == NL80211_IFTYPE_STATION || 899 if (sdata->vif.type == NL80211_IFTYPE_STATION)
984 sdata->vif.type == NL80211_IFTYPE_ADHOC) 900 sta = sta_info_get(local, sdata->u.mgd.bssid);
985 sta = sta_info_get(local, sdata->u.sta.bssid); 901
986 if (!sta) { 902 if (!sta) {
987 wstats->discard.fragment = 0; 903 wstats->discard.fragment = 0;
988 wstats->discard.misc = 0; 904 wstats->discard.misc = 0;
@@ -991,10 +907,45 @@ static struct iw_statistics *ieee80211_get_wireless_stats(struct net_device *dev
991 wstats->qual.noise = 0; 907 wstats->qual.noise = 0;
992 wstats->qual.updated = IW_QUAL_ALL_INVALID; 908 wstats->qual.updated = IW_QUAL_ALL_INVALID;
993 } else { 909 } else {
994 wstats->qual.level = sta->last_signal; 910 wstats->qual.updated = 0;
995 wstats->qual.qual = sta->last_qual; 911 /*
996 wstats->qual.noise = sta->last_noise; 912 * mirror what cfg80211 does for iwrange/scan results,
997 wstats->qual.updated = ieee80211_get_wstats_flags(local); 913 * otherwise userspace gets confused.
914 */
915 if (local->hw.flags & (IEEE80211_HW_SIGNAL_UNSPEC |
916 IEEE80211_HW_SIGNAL_DBM)) {
917 wstats->qual.updated |= IW_QUAL_LEVEL_UPDATED;
918 wstats->qual.updated |= IW_QUAL_QUAL_UPDATED;
919 } else {
920 wstats->qual.updated |= IW_QUAL_LEVEL_INVALID;
921 wstats->qual.updated |= IW_QUAL_QUAL_INVALID;
922 }
923
924 if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) {
925 wstats->qual.level = sta->last_signal;
926 wstats->qual.qual = sta->last_signal;
927 } else if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) {
928 int sig = sta->last_signal;
929
930 wstats->qual.updated |= IW_QUAL_DBM;
931 wstats->qual.level = sig;
932 if (sig < -110)
933 sig = -110;
934 else if (sig > -40)
935 sig = -40;
936 wstats->qual.qual = sig + 110;
937 }
938
939 if (local->hw.flags & IEEE80211_HW_NOISE_DBM) {
940 /*
941 * This assumes that if driver reports noise, it also
942 * reports signal in dBm.
943 */
944 wstats->qual.noise = sta->last_noise;
945 wstats->qual.updated |= IW_QUAL_NOISE_UPDATED;
946 } else {
947 wstats->qual.updated |= IW_QUAL_NOISE_INVALID;
948 }
998 } 949 }
999 950
1000 rcu_read_unlock(); 951 rcu_read_unlock();
@@ -1011,9 +962,8 @@ static int ieee80211_ioctl_giwauth(struct net_device *dev,
1011 962
1012 switch (data->flags & IW_AUTH_INDEX) { 963 switch (data->flags & IW_AUTH_INDEX) {
1013 case IW_AUTH_80211_AUTH_ALG: 964 case IW_AUTH_80211_AUTH_ALG:
1014 if (sdata->vif.type == NL80211_IFTYPE_STATION || 965 if (sdata->vif.type == NL80211_IFTYPE_STATION)
1015 sdata->vif.type == NL80211_IFTYPE_ADHOC) 966 data->value = sdata->u.mgd.auth_algs;
1016 data->value = sdata->u.sta.auth_algs;
1017 else 967 else
1018 ret = -EOPNOTSUPP; 968 ret = -EOPNOTSUPP;
1019 break; 969 break;
@@ -1116,7 +1066,7 @@ static const iw_handler ieee80211_handler[] =
1116 (iw_handler) NULL, /* SIOCSIWSENS */ 1066 (iw_handler) NULL, /* SIOCSIWSENS */
1117 (iw_handler) NULL, /* SIOCGIWSENS */ 1067 (iw_handler) NULL, /* SIOCGIWSENS */
1118 (iw_handler) NULL /* not used */, /* SIOCSIWRANGE */ 1068 (iw_handler) NULL /* not used */, /* SIOCSIWRANGE */
1119 (iw_handler) ieee80211_ioctl_giwrange, /* SIOCGIWRANGE */ 1069 (iw_handler) cfg80211_wext_giwrange, /* SIOCGIWRANGE */
1120 (iw_handler) NULL /* not used */, /* SIOCSIWPRIV */ 1070 (iw_handler) NULL /* not used */, /* SIOCSIWPRIV */
1121 (iw_handler) NULL /* kernel code */, /* SIOCGIWPRIV */ 1071 (iw_handler) NULL /* kernel code */, /* SIOCGIWPRIV */
1122 (iw_handler) NULL /* not used */, /* SIOCSIWSTATS */ 1072 (iw_handler) NULL /* not used */, /* SIOCSIWSTATS */
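
The rewritten wireless-stats path above fills in qual differently depending on the hardware flags: for IEEE80211_HW_SIGNAL_DBM it keeps the raw dBm reading as the level but clamps it to the [-110, -40] range and rebases it to a 0..70 "qual" value. A small illustrative sketch of just that mapping (dbm_to_qual() is a hypothetical helper, not a mac80211 function):

/* Clamp a dBm signal to [-110, -40] and rebase to 0..70, mirroring the
 * IEEE80211_HW_SIGNAL_DBM branch in ieee80211_get_wireless_stats(). */
static int dbm_to_qual(int sig_dbm)
{
	if (sig_dbm < -110)
		sig_dbm = -110;
	else if (sig_dbm > -40)
		sig_dbm = -40;
	return sig_dbm + 110;	/* -70 dBm -> 40, -40 dBm or better -> 70 */
}
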
diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c
index ac71b38f7cb5..0b8ad1f4ecdd 100644
--- a/net/mac80211/wme.c
+++ b/net/mac80211/wme.c
@@ -99,10 +99,13 @@ static u16 classify80211(struct ieee80211_local *local, struct sk_buff *skb)
99 /* in case we are a client verify acm is not set for this ac */ 99 /* in case we are a client verify acm is not set for this ac */
100 while (unlikely(local->wmm_acm & BIT(skb->priority))) { 100 while (unlikely(local->wmm_acm & BIT(skb->priority))) {
101 if (wme_downgrade_ac(skb)) { 101 if (wme_downgrade_ac(skb)) {
102 /* The old code would drop the packet in this 102 /*
103 * case. 103 * This should not really happen. The AP has marked all
104 * lower ACs to require admission control which is not
105 * a reasonable configuration. Allow the frame to be
106 * transmitted using AC_BK as a workaround.
104 */ 107 */
105 return 0; 108 break;
106 } 109 }
107 } 110 }
108 111
@@ -114,9 +117,7 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
114{ 117{
115 struct ieee80211_master_priv *mpriv = netdev_priv(dev); 118 struct ieee80211_master_priv *mpriv = netdev_priv(dev);
116 struct ieee80211_local *local = mpriv->local; 119 struct ieee80211_local *local = mpriv->local;
117 struct ieee80211_hw *hw = &local->hw;
118 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; 120 struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
119 struct sta_info *sta;
120 u16 queue; 121 u16 queue;
121 u8 tid; 122 u8 tid;
122 123
@@ -124,29 +125,11 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
124 if (unlikely(queue >= local->hw.queues)) 125 if (unlikely(queue >= local->hw.queues))
125 queue = local->hw.queues - 1; 126 queue = local->hw.queues - 1;
126 127
127 if (skb->requeue) { 128 /*
128 if (!hw->ampdu_queues) 129 * Now we know the 1d priority, fill in the QoS header if
129 return queue; 130 * there is one (and we haven't done this before).
130
131 rcu_read_lock();
132 sta = sta_info_get(local, hdr->addr1);
133 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
134 if (sta) {
135 int ampdu_queue = sta->tid_to_tx_q[tid];
136
137 if ((ampdu_queue < ieee80211_num_queues(hw)) &&
138 test_bit(ampdu_queue, local->queue_pool))
139 queue = ampdu_queue;
140 }
141 rcu_read_unlock();
142
143 return queue;
144 }
145
146 /* Now we know the 1d priority, fill in the QoS header if
147 * there is one.
148 */ 131 */
149 if (ieee80211_is_data_qos(hdr->frame_control)) { 132 if (!skb->requeue && ieee80211_is_data_qos(hdr->frame_control)) {
150 u8 *p = ieee80211_get_qos_ctl(hdr); 133 u8 *p = ieee80211_get_qos_ctl(hdr);
151 u8 ack_policy = 0; 134 u8 ack_policy = 0;
152 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; 135 tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK;
@@ -156,140 +139,7 @@ u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb)
156 /* qos header is 2 bytes, second reserved */ 139 /* qos header is 2 bytes, second reserved */
157 *p++ = ack_policy | tid; 140 *p++ = ack_policy | tid;
158 *p = 0; 141 *p = 0;
159
160 if (!hw->ampdu_queues)
161 return queue;
162
163 rcu_read_lock();
164
165 sta = sta_info_get(local, hdr->addr1);
166 if (sta) {
167 int ampdu_queue = sta->tid_to_tx_q[tid];
168
169 if ((ampdu_queue < ieee80211_num_queues(hw)) &&
170 test_bit(ampdu_queue, local->queue_pool))
171 queue = ampdu_queue;
172 }
173
174 rcu_read_unlock();
175 } 142 }
176 143
177 return queue; 144 return queue;
178} 145}
179
180int ieee80211_ht_agg_queue_add(struct ieee80211_local *local,
181 struct sta_info *sta, u16 tid)
182{
183 int i;
184
185 /* XXX: currently broken due to cb/requeue use */
186 return -EPERM;
187
188 /* prepare the filter and save it for the SW queue
189 * matching the received HW queue */
190
191 if (!local->hw.ampdu_queues)
192 return -EPERM;
193
194 /* try to get a Qdisc from the pool */
195 for (i = local->hw.queues; i < ieee80211_num_queues(&local->hw); i++)
196 if (!test_and_set_bit(i, local->queue_pool)) {
197 ieee80211_stop_queue(local_to_hw(local), i);
198 sta->tid_to_tx_q[tid] = i;
199
200 /* IF there are already pending packets
201 * on this tid first we need to drain them
202 * on the previous queue
203 * since HT is strict in order */
204#ifdef CONFIG_MAC80211_HT_DEBUG
205 if (net_ratelimit())
206 printk(KERN_DEBUG "allocated aggregation queue"
207 " %d tid %d addr %pM pool=0x%lX\n",
208 i, tid, sta->sta.addr,
209 local->queue_pool[0]);
210#endif /* CONFIG_MAC80211_HT_DEBUG */
211 return 0;
212 }
213
214 return -EAGAIN;
215}
216
217/**
218 * the caller needs to hold netdev_get_tx_queue(local->mdev, X)->lock
219 */
220void ieee80211_ht_agg_queue_remove(struct ieee80211_local *local,
221 struct sta_info *sta, u16 tid,
222 u8 requeue)
223{
224 int agg_queue = sta->tid_to_tx_q[tid];
225 struct ieee80211_hw *hw = &local->hw;
226
227 /* return the qdisc to the pool */
228 clear_bit(agg_queue, local->queue_pool);
229 sta->tid_to_tx_q[tid] = ieee80211_num_queues(hw);
230
231 if (requeue) {
232 ieee80211_requeue(local, agg_queue);
233 } else {
234 struct netdev_queue *txq;
235 spinlock_t *root_lock;
236 struct Qdisc *q;
237
238 txq = netdev_get_tx_queue(local->mdev, agg_queue);
239 q = rcu_dereference(txq->qdisc);
240 root_lock = qdisc_lock(q);
241
242 spin_lock_bh(root_lock);
243 qdisc_reset(q);
244 spin_unlock_bh(root_lock);
245 }
246}
247
248void ieee80211_requeue(struct ieee80211_local *local, int queue)
249{
250 struct netdev_queue *txq = netdev_get_tx_queue(local->mdev, queue);
251 struct sk_buff_head list;
252 spinlock_t *root_lock;
253 struct Qdisc *qdisc;
254 u32 len;
255
256 rcu_read_lock_bh();
257
258 qdisc = rcu_dereference(txq->qdisc);
259 if (!qdisc || !qdisc->dequeue)
260 goto out_unlock;
261
262 skb_queue_head_init(&list);
263
264 root_lock = qdisc_root_lock(qdisc);
265 spin_lock(root_lock);
266 for (len = qdisc->q.qlen; len > 0; len--) {
267 struct sk_buff *skb = qdisc->dequeue(qdisc);
268
269 if (skb)
270 __skb_queue_tail(&list, skb);
271 }
272 spin_unlock(root_lock);
273
274 for (len = list.qlen; len > 0; len--) {
275 struct sk_buff *skb = __skb_dequeue(&list);
276 u16 new_queue;
277
278 BUG_ON(!skb);
279 new_queue = ieee80211_select_queue(local->mdev, skb);
280 skb_set_queue_mapping(skb, new_queue);
281
282 txq = netdev_get_tx_queue(local->mdev, new_queue);
283
284
285 qdisc = rcu_dereference(txq->qdisc);
286 root_lock = qdisc_root_lock(qdisc);
287
288 spin_lock(root_lock);
289 qdisc_enqueue_root(skb, qdisc);
290 spin_unlock(root_lock);
291 }
292
293out_unlock:
294 rcu_read_unlock_bh();
295}
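
In the classify80211() hunk above, the transmit path keeps downgrading the 802.1d priority while the AP mandates admission control (ACM) for the selected access category; after this patch a frame for which even the lowest category is ACM-protected is sent on AC_BK instead of being dropped. A simplified, self-contained sketch of that policy (the downgrade mapping is condensed from mac80211's wme_downgrade_ac() and is not the literal kernel code):

#include <stdbool.h>

#define BIT(n) (1u << (n))

/* Step one access category down along VO -> VI -> BE -> BK; return true
 * when the priority already maps to the background class. */
static bool downgrade_one_ac(unsigned int *prio)
{
	switch (*prio) {
	case 6: case 7: *prio = 5; return false;	/* VO -> VI */
	case 4: case 5: *prio = 3; return false;	/* VI -> BE */
	case 0: case 3: *prio = 2; return false;	/* BE -> BK */
	default:	return true;			/* already BK */
	}
}

/* acm_bitmap has a bit set for every 802.1d priority that requires
 * admission control. Post-patch behaviour: if everything is protected,
 * break out and transmit on AC_BK rather than dropping the frame. */
static unsigned int classify_priority(unsigned int prio, unsigned int acm_bitmap)
{
	while (acm_bitmap & BIT(prio)) {
		if (downgrade_one_ac(&prio))
			break;
	}
	return prio;
}
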
diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h
index bc62f28a4d3d..7520d2e014dc 100644
--- a/net/mac80211/wme.h
+++ b/net/mac80211/wme.h
@@ -21,11 +21,5 @@
21extern const int ieee802_1d_to_ac[8]; 21extern const int ieee802_1d_to_ac[8];
22 22
23u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb); 23u16 ieee80211_select_queue(struct net_device *dev, struct sk_buff *skb);
24int ieee80211_ht_agg_queue_add(struct ieee80211_local *local,
25 struct sta_info *sta, u16 tid);
26void ieee80211_ht_agg_queue_remove(struct ieee80211_local *local,
27 struct sta_info *sta, u16 tid,
28 u8 requeue);
29void ieee80211_requeue(struct ieee80211_local *local, int queue);
30 24
31#endif /* _WME_H */ 25#endif /* _WME_H */
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 55befe59e1c0..dfb447b584da 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -728,7 +728,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
728 NF_CT_ASSERT(skb->nfct); 728 NF_CT_ASSERT(skb->nfct);
729 729
730 ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum); 730 ret = l4proto->packet(ct, skb, dataoff, ctinfo, pf, hooknum);
731 if (ret < 0) { 731 if (ret <= 0) {
732 /* Invalid: inverse of the return code tells 732 /* Invalid: inverse of the return code tells
733 * the netfilter core what to do */ 733 * the netfilter core what to do */
734 pr_debug("nf_conntrack_in: Can't track with proto module\n"); 734 pr_debug("nf_conntrack_in: Can't track with proto module\n");
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 1b75c9efb0eb..7a16bd462f82 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1763,6 +1763,7 @@ ctnetlink_create_expect(struct nlattr *cda[], u_int8_t u3, u32 pid, int report)
1763 goto out; 1763 goto out;
1764 } 1764 }
1765 1765
1766 exp->class = 0;
1766 exp->expectfn = NULL; 1767 exp->expectfn = NULL;
1767 exp->flags = 0; 1768 exp->flags = 0;
1768 exp->master = ct; 1769 exp->master = ct;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 7d3944f02ea1..e46f3b79adb3 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -861,7 +861,7 @@ static int tcp_packet(struct nf_conn *ct,
861 */ 861 */
862 if (nf_ct_kill(ct)) 862 if (nf_ct_kill(ct))
863 return -NF_REPEAT; 863 return -NF_REPEAT;
864 return -NF_DROP; 864 return NF_DROP;
865 } 865 }
866 /* Fall through */ 866 /* Fall through */
867 case TCP_CONNTRACK_IGNORE: 867 case TCP_CONNTRACK_IGNORE:
@@ -894,7 +894,7 @@ static int tcp_packet(struct nf_conn *ct,
894 nf_log_packet(pf, 0, skb, NULL, NULL, NULL, 894 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
895 "nf_ct_tcp: killing out of sync session "); 895 "nf_ct_tcp: killing out of sync session ");
896 nf_ct_kill(ct); 896 nf_ct_kill(ct);
897 return -NF_DROP; 897 return NF_DROP;
898 } 898 }
899 ct->proto.tcp.last_index = index; 899 ct->proto.tcp.last_index = index;
900 ct->proto.tcp.last_dir = dir; 900 ct->proto.tcp.last_dir = dir;
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 3eae3fca29d8..fd326ac27ec8 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -39,7 +39,7 @@
39#endif 39#endif
40 40
41#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE 41#define NFULNL_NLBUFSIZ_DEFAULT NLMSG_GOODSIZE
42#define NFULNL_TIMEOUT_DEFAULT HZ /* every second */ 42#define NFULNL_TIMEOUT_DEFAULT 100 /* every second */
43#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */ 43#define NFULNL_QTHRESH_DEFAULT 100 /* 100 packets */
44#define NFULNL_COPY_RANGE_MAX 0xFFFF /* max packet size is limited by 16-bit struct nfattr nfa_len field */ 44#define NFULNL_COPY_RANGE_MAX 0xFFFF /* max packet size is limited by 16-bit struct nfattr nfa_len field */
45 45
@@ -590,8 +590,10 @@ nfulnl_log_packet(u_int8_t pf,
590 590
591 qthreshold = inst->qthreshold; 591 qthreshold = inst->qthreshold;
592 /* per-rule qthreshold overrides per-instance */ 592 /* per-rule qthreshold overrides per-instance */
593 if (qthreshold > li->u.ulog.qthreshold) 593 if (li->u.ulog.qthreshold)
594 qthreshold = li->u.ulog.qthreshold; 594 if (qthreshold > li->u.ulog.qthreshold)
595 qthreshold = li->u.ulog.qthreshold;
596
595 597
596 switch (inst->copy_mode) { 598 switch (inst->copy_mode) {
597 case NFULNL_COPY_META: 599 case NFULNL_COPY_META:
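
The NFULNL_TIMEOUT_DEFAULT change above makes sense if the flush timeout is interpreted in hundredths of a second when the flush timer is armed: a literal 100 then means one second on any kernel, whereas the old HZ literal only meant one second when HZ happened to be 100. A hedged sketch of that conversion, assuming the usual timeout*HZ/100 form used when arming the timer elsewhere in nfnetlink_log.c:

#define HZ 1000	/* example tick rate; the point is the default no longer depends on it */

/* Convert an nfulnl flush timeout (centiseconds) to jiffies; simplified
 * sketch, not the literal kernel code. */
static unsigned long nfulnl_timeout_to_jiffies(unsigned int timeout_cs)
{
	return (unsigned long)timeout_cs * HZ / 100;	/* 100 cs == 1 s */
}
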
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index bfcac92d5563..509a95621f9f 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -843,59 +843,143 @@ static const struct file_operations xt_table_ops = {
843 .release = seq_release_net, 843 .release = seq_release_net,
844}; 844};
845 845
846static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos) 846/*
847 * Traverse state for ip{,6}_{tables,matches} for helping crossing
848 * the multi-AF mutexes.
849 */
850struct nf_mttg_trav {
851 struct list_head *head, *curr;
852 uint8_t class, nfproto;
853};
854
855enum {
856 MTTG_TRAV_INIT,
857 MTTG_TRAV_NFP_UNSPEC,
858 MTTG_TRAV_NFP_SPEC,
859 MTTG_TRAV_DONE,
860};
861
862static void *xt_mttg_seq_next(struct seq_file *seq, void *v, loff_t *ppos,
863 bool is_target)
847{ 864{
848 struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private; 865 static const uint8_t next_class[] = {
849 u_int16_t af = (unsigned long)pde->data; 866 [MTTG_TRAV_NFP_UNSPEC] = MTTG_TRAV_NFP_SPEC,
867 [MTTG_TRAV_NFP_SPEC] = MTTG_TRAV_DONE,
868 };
869 struct nf_mttg_trav *trav = seq->private;
870
871 switch (trav->class) {
872 case MTTG_TRAV_INIT:
873 trav->class = MTTG_TRAV_NFP_UNSPEC;
874 mutex_lock(&xt[NFPROTO_UNSPEC].mutex);
875 trav->head = trav->curr = is_target ?
876 &xt[NFPROTO_UNSPEC].target : &xt[NFPROTO_UNSPEC].match;
877 break;
878 case MTTG_TRAV_NFP_UNSPEC:
879 trav->curr = trav->curr->next;
880 if (trav->curr != trav->head)
881 break;
882 mutex_unlock(&xt[NFPROTO_UNSPEC].mutex);
883 mutex_lock(&xt[trav->nfproto].mutex);
884 trav->head = trav->curr = is_target ?
885 &xt[trav->nfproto].target : &xt[trav->nfproto].match;
886 trav->class = next_class[trav->class];
887 break;
888 case MTTG_TRAV_NFP_SPEC:
889 trav->curr = trav->curr->next;
890 if (trav->curr != trav->head)
891 break;
892 /* fallthru, _stop will unlock */
893 default:
894 return NULL;
895 }
850 896
851 mutex_lock(&xt[af].mutex); 897 if (ppos != NULL)
852 return seq_list_start(&xt[af].match, *pos); 898 ++*ppos;
899 return trav;
853} 900}
854 901
855static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *pos) 902static void *xt_mttg_seq_start(struct seq_file *seq, loff_t *pos,
903 bool is_target)
856{ 904{
857 struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private; 905 struct nf_mttg_trav *trav = seq->private;
858 u_int16_t af = (unsigned long)pde->data; 906 unsigned int j;
859 907
860 return seq_list_next(v, &xt[af].match, pos); 908 trav->class = MTTG_TRAV_INIT;
909 for (j = 0; j < *pos; ++j)
910 if (xt_mttg_seq_next(seq, NULL, NULL, is_target) == NULL)
911 return NULL;
912 return trav;
861} 913}
862 914
863static void xt_match_seq_stop(struct seq_file *seq, void *v) 915static void xt_mttg_seq_stop(struct seq_file *seq, void *v)
864{ 916{
865 struct proc_dir_entry *pde = seq->private; 917 struct nf_mttg_trav *trav = seq->private;
866 u_int16_t af = (unsigned long)pde->data; 918
919 switch (trav->class) {
920 case MTTG_TRAV_NFP_UNSPEC:
921 mutex_unlock(&xt[NFPROTO_UNSPEC].mutex);
922 break;
923 case MTTG_TRAV_NFP_SPEC:
924 mutex_unlock(&xt[trav->nfproto].mutex);
925 break;
926 }
927}
867 928
868 mutex_unlock(&xt[af].mutex); 929static void *xt_match_seq_start(struct seq_file *seq, loff_t *pos)
930{
931 return xt_mttg_seq_start(seq, pos, false);
869} 932}
870 933
871static int xt_match_seq_show(struct seq_file *seq, void *v) 934static void *xt_match_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
872{ 935{
873 struct xt_match *match = list_entry(v, struct xt_match, list); 936 return xt_mttg_seq_next(seq, v, ppos, false);
937}
874 938
875 if (strlen(match->name)) 939static int xt_match_seq_show(struct seq_file *seq, void *v)
876 return seq_printf(seq, "%s\n", match->name); 940{
877 else 941 const struct nf_mttg_trav *trav = seq->private;
878 return 0; 942 const struct xt_match *match;
943
944 switch (trav->class) {
945 case MTTG_TRAV_NFP_UNSPEC:
946 case MTTG_TRAV_NFP_SPEC:
947 if (trav->curr == trav->head)
948 return 0;
949 match = list_entry(trav->curr, struct xt_match, list);
950 return (*match->name == '\0') ? 0 :
951 seq_printf(seq, "%s\n", match->name);
952 }
953 return 0;
879} 954}
880 955
881static const struct seq_operations xt_match_seq_ops = { 956static const struct seq_operations xt_match_seq_ops = {
882 .start = xt_match_seq_start, 957 .start = xt_match_seq_start,
883 .next = xt_match_seq_next, 958 .next = xt_match_seq_next,
884 .stop = xt_match_seq_stop, 959 .stop = xt_mttg_seq_stop,
885 .show = xt_match_seq_show, 960 .show = xt_match_seq_show,
886}; 961};
887 962
888static int xt_match_open(struct inode *inode, struct file *file) 963static int xt_match_open(struct inode *inode, struct file *file)
889{ 964{
965 struct seq_file *seq;
966 struct nf_mttg_trav *trav;
890 int ret; 967 int ret;
891 968
892 ret = seq_open(file, &xt_match_seq_ops); 969 trav = kmalloc(sizeof(*trav), GFP_KERNEL);
893 if (!ret) { 970 if (trav == NULL)
894 struct seq_file *seq = file->private_data; 971 return -ENOMEM;
895 972
896 seq->private = PDE(inode); 973 ret = seq_open(file, &xt_match_seq_ops);
974 if (ret < 0) {
975 kfree(trav);
976 return ret;
897 } 977 }
898 return ret; 978
979 seq = file->private_data;
980 seq->private = trav;
981 trav->nfproto = (unsigned long)PDE(inode)->data;
982 return 0;
899} 983}
900 984
901static const struct file_operations xt_match_ops = { 985static const struct file_operations xt_match_ops = {
@@ -903,62 +987,63 @@ static const struct file_operations xt_match_ops = {
903 .open = xt_match_open, 987 .open = xt_match_open,
904 .read = seq_read, 988 .read = seq_read,
905 .llseek = seq_lseek, 989 .llseek = seq_lseek,
906 .release = seq_release, 990 .release = seq_release_private,
907}; 991};
908 992
909static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos) 993static void *xt_target_seq_start(struct seq_file *seq, loff_t *pos)
910{ 994{
911 struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private; 995 return xt_mttg_seq_start(seq, pos, true);
912 u_int16_t af = (unsigned long)pde->data;
913
914 mutex_lock(&xt[af].mutex);
915 return seq_list_start(&xt[af].target, *pos);
916} 996}
917 997
918static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *pos) 998static void *xt_target_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
919{ 999{
920 struct proc_dir_entry *pde = (struct proc_dir_entry *)seq->private; 1000 return xt_mttg_seq_next(seq, v, ppos, true);
921 u_int16_t af = (unsigned long)pde->data;
922
923 return seq_list_next(v, &xt[af].target, pos);
924}
925
926static void xt_target_seq_stop(struct seq_file *seq, void *v)
927{
928 struct proc_dir_entry *pde = seq->private;
929 u_int16_t af = (unsigned long)pde->data;
930
931 mutex_unlock(&xt[af].mutex);
932} 1001}
933 1002
934static int xt_target_seq_show(struct seq_file *seq, void *v) 1003static int xt_target_seq_show(struct seq_file *seq, void *v)
935{ 1004{
936 struct xt_target *target = list_entry(v, struct xt_target, list); 1005 const struct nf_mttg_trav *trav = seq->private;
937 1006 const struct xt_target *target;
938 if (strlen(target->name)) 1007
939 return seq_printf(seq, "%s\n", target->name); 1008 switch (trav->class) {
940 else 1009 case MTTG_TRAV_NFP_UNSPEC:
941 return 0; 1010 case MTTG_TRAV_NFP_SPEC:
1011 if (trav->curr == trav->head)
1012 return 0;
1013 target = list_entry(trav->curr, struct xt_target, list);
1014 return (*target->name == '\0') ? 0 :
1015 seq_printf(seq, "%s\n", target->name);
1016 }
1017 return 0;
942} 1018}
943 1019
944static const struct seq_operations xt_target_seq_ops = { 1020static const struct seq_operations xt_target_seq_ops = {
945 .start = xt_target_seq_start, 1021 .start = xt_target_seq_start,
946 .next = xt_target_seq_next, 1022 .next = xt_target_seq_next,
947 .stop = xt_target_seq_stop, 1023 .stop = xt_mttg_seq_stop,
948 .show = xt_target_seq_show, 1024 .show = xt_target_seq_show,
949}; 1025};
950 1026
951static int xt_target_open(struct inode *inode, struct file *file) 1027static int xt_target_open(struct inode *inode, struct file *file)
952{ 1028{
1029 struct seq_file *seq;
1030 struct nf_mttg_trav *trav;
953 int ret; 1031 int ret;
954 1032
955 ret = seq_open(file, &xt_target_seq_ops); 1033 trav = kmalloc(sizeof(*trav), GFP_KERNEL);
956 if (!ret) { 1034 if (trav == NULL)
957 struct seq_file *seq = file->private_data; 1035 return -ENOMEM;
958 1036
959 seq->private = PDE(inode); 1037 ret = seq_open(file, &xt_target_seq_ops);
1038 if (ret < 0) {
1039 kfree(trav);
1040 return ret;
960 } 1041 }
961 return ret; 1042
1043 seq = file->private_data;
1044 seq->private = trav;
1045 trav->nfproto = (unsigned long)PDE(inode)->data;
1046 return 0;
962} 1047}
963 1048
964static const struct file_operations xt_target_ops = { 1049static const struct file_operations xt_target_ops = {
@@ -966,7 +1051,7 @@ static const struct file_operations xt_target_ops = {
966 .open = xt_target_open, 1051 .open = xt_target_open,
967 .read = seq_read, 1052 .read = seq_read,
968 .llseek = seq_lseek, 1053 .llseek = seq_lseek,
969 .release = seq_release, 1054 .release = seq_release_private,
970}; 1055};
971 1056
972#define FORMAT_TABLES "_tables_names" 1057#define FORMAT_TABLES "_tables_names"
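
With the seq_file rework above, each read of the proc listing first walks the NFPROTO_UNSPEC list and then the family-specific one, taking the mutex for whichever list it is currently in, so the family-independent extensions show up alongside the per-family ones. From userspace the consumer is unchanged; a minimal reader looks like this (the IPv4 path name is assumed from the FORMAT_* naming at the bottom of x_tables.c):

#include <stdio.h>

/* Dump every xtables match visible to IPv4, including the NFPROTO_UNSPEC
 * (family-independent) ones covered by the traversal above. */
int main(void)
{
	char name[64];
	FILE *f = fopen("/proc/net/ip_tables_matches", "r");

	if (!f)
		return 1;
	while (fgets(name, sizeof(name), f))
		fputs(name, stdout);
	fclose(f);
	return 0;
}
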
diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
index fe80b614a400..791e030ea903 100644
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -542,7 +542,7 @@ recent_mt_proc_write(struct file *file, const char __user *input,
542 struct recent_entry *e; 542 struct recent_entry *e;
543 char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; 543 char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
544 const char *c = buf; 544 const char *c = buf;
545 union nf_inet_addr addr; 545 union nf_inet_addr addr = {};
546 u_int16_t family; 546 u_int16_t family;
547 bool add, succ; 547 bool add, succ;
548 548
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 5b33879c6422..b73d4e61c5ac 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -85,6 +85,7 @@ struct netlink_sock {
85 85
86#define NETLINK_KERNEL_SOCKET 0x1 86#define NETLINK_KERNEL_SOCKET 0x1
87#define NETLINK_RECV_PKTINFO 0x2 87#define NETLINK_RECV_PKTINFO 0x2
88#define NETLINK_BROADCAST_SEND_ERROR 0x4
88 89
89static inline struct netlink_sock *nlk_sk(struct sock *sk) 90static inline struct netlink_sock *nlk_sk(struct sock *sk)
90{ 91{
@@ -995,12 +996,15 @@ static inline int do_one_broadcast(struct sock *sk,
995 netlink_overrun(sk); 996 netlink_overrun(sk);
996 /* Clone failed. Notify ALL listeners. */ 997 /* Clone failed. Notify ALL listeners. */
997 p->failure = 1; 998 p->failure = 1;
999 if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
1000 p->delivery_failure = 1;
998 } else if (sk_filter(sk, p->skb2)) { 1001 } else if (sk_filter(sk, p->skb2)) {
999 kfree_skb(p->skb2); 1002 kfree_skb(p->skb2);
1000 p->skb2 = NULL; 1003 p->skb2 = NULL;
1001 } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { 1004 } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
1002 netlink_overrun(sk); 1005 netlink_overrun(sk);
1003 p->delivery_failure = 1; 1006 if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
1007 p->delivery_failure = 1;
1004 } else { 1008 } else {
1005 p->congested |= val; 1009 p->congested |= val;
1006 p->delivered = 1; 1010 p->delivered = 1;
@@ -1045,10 +1049,9 @@ int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
1045 1049
1046 netlink_unlock_table(); 1050 netlink_unlock_table();
1047 1051
1048 if (info.skb2) 1052 kfree_skb(info.skb2);
1049 kfree_skb(info.skb2);
1050 1053
1051 if (info.delivery_failure || info.failure) 1054 if (info.delivery_failure)
1052 return -ENOBUFS; 1055 return -ENOBUFS;
1053 1056
1054 if (info.delivered) { 1057 if (info.delivered) {
@@ -1088,6 +1091,13 @@ out:
1088 return 0; 1091 return 0;
1089} 1092}
1090 1093
1094/**
1095 * netlink_set_err - report error to broadcast listeners
1096 * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
1097 * @pid: the PID of a process that we want to skip (if any)
1098 * @groups: the broadcast group that will notice the error
1099 * @code: error code, must be negative (as usual in kernelspace)
1100 */
1091void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) 1101void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
1092{ 1102{
1093 struct netlink_set_err_data info; 1103 struct netlink_set_err_data info;
@@ -1097,7 +1107,8 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
1097 info.exclude_sk = ssk; 1107 info.exclude_sk = ssk;
1098 info.pid = pid; 1108 info.pid = pid;
1099 info.group = group; 1109 info.group = group;
1100 info.code = code; 1110 /* sk->sk_err wants a positive error value */
1111 info.code = -code;
1101 1112
1102 read_lock(&nl_table_lock); 1113 read_lock(&nl_table_lock);
1103 1114
@@ -1164,6 +1175,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
1164 err = 0; 1175 err = 0;
1165 break; 1176 break;
1166 } 1177 }
1178 case NETLINK_BROADCAST_ERROR:
1179 if (val)
1180 nlk->flags |= NETLINK_BROADCAST_SEND_ERROR;
1181 else
1182 nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR;
1183 err = 0;
1184 break;
1167 default: 1185 default:
1168 err = -ENOPROTOOPT; 1186 err = -ENOPROTOOPT;
1169 } 1187 }
@@ -1196,6 +1214,16 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname,
1196 return -EFAULT; 1214 return -EFAULT;
1197 err = 0; 1215 err = 0;
1198 break; 1216 break;
1217 case NETLINK_BROADCAST_ERROR:
1218 if (len < sizeof(int))
1219 return -EINVAL;
1220 len = sizeof(int);
1221 val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0;
1222 if (put_user(len, optlen) ||
1223 put_user(val, optval))
1224 return -EFAULT;
1225 err = 0;
1226 break;
1199 default: 1227 default:
1200 err = -ENOPROTOOPT; 1228 err = -ENOPROTOOPT;
1201 } 1229 }
@@ -1522,8 +1550,7 @@ EXPORT_SYMBOL(netlink_set_nonroot);
1522 1550
1523static void netlink_destroy_callback(struct netlink_callback *cb) 1551static void netlink_destroy_callback(struct netlink_callback *cb)
1524{ 1552{
1525 if (cb->skb) 1553 kfree_skb(cb->skb);
1526 kfree_skb(cb->skb);
1527 kfree(cb); 1554 kfree(cb);
1528} 1555}
1529 1556
@@ -1740,12 +1767,18 @@ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 pid,
1740 exclude_pid = pid; 1767 exclude_pid = pid;
1741 } 1768 }
1742 1769
1743 /* errors reported via destination sk->sk_err */ 1770 /* errors reported via destination sk->sk_err, but propagate
1744 nlmsg_multicast(sk, skb, exclude_pid, group, flags); 1771 * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
1772 err = nlmsg_multicast(sk, skb, exclude_pid, group, flags);
1745 } 1773 }
1746 1774
1747 if (report) 1775 if (report) {
1748 err = nlmsg_unicast(sk, skb, pid); 1776 int err2;
1777
1778 err2 = nlmsg_unicast(sk, skb, pid);
1779 if (!err || err == -ESRCH)
1780 err = err2;
1781 }
1749 1782
1750 return err; 1783 return err;
1751} 1784}
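
The NETLINK_BROADCAST_ERROR pieces above let a listener ask that broadcast senders be told about delivery failures: when the flag is set on the receiving socket and the clone or delivery fails, delivery_failure is raised and netlink_broadcast()/nlmsg_notify() return -ENOBUFS instead of reporting success. A hedged userspace sketch of opting in (the SOL_NETLINK level follows the existing netlink sockopt convention, and the fallback option number is an assumption for headers that predate this patch; NETLINK_ROUTE is just an arbitrary example protocol):

#include <sys/socket.h>
#include <linux/netlink.h>
#include <stdio.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270			/* assumption if libc headers lack it */
#endif
#ifndef NETLINK_BROADCAST_ERROR
#define NETLINK_BROADCAST_ERROR 4	/* assumption: number from this patch series */
#endif

int main(void)
{
	int one = 1;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return 1;
	/* Ask the kernel to propagate broadcast delivery failures for this
	 * listener back to the broadcasting side as -ENOBUFS. */
	if (setsockopt(fd, SOL_NETLINK, NETLINK_BROADCAST_ERROR,
		       &one, sizeof(one)) < 0)
		perror("setsockopt");
	return 0;
}
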
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index cba7849de98e..6d9c58ec56ac 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1037,6 +1037,10 @@ static int nr_sendmsg(struct kiocb *iocb, struct socket *sock,
1037 unsigned char *asmptr; 1037 unsigned char *asmptr;
1038 int size; 1038 int size;
1039 1039
1040 /* Netrom empty data frame has no meaning : don't send */
1041 if (len == 0)
1042 return 0;
1043
1040 if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) 1044 if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT))
1041 return -EINVAL; 1045 return -EINVAL;
1042 1046
@@ -1167,6 +1171,11 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock,
1167 skb_reset_transport_header(skb); 1171 skb_reset_transport_header(skb);
1168 copied = skb->len; 1172 copied = skb->len;
1169 1173
1174 /* NetRom empty data frame has no meaning : ignore it */
1175 if (copied == 0) {
1176 goto out;
1177 }
1178
1170 if (copied > size) { 1179 if (copied > size) {
1171 copied = size; 1180 copied = size;
1172 msg->msg_flags |= MSG_TRUNC; 1181 msg->msg_flags |= MSG_TRUNC;
@@ -1182,7 +1191,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock,
1182 1191
1183 msg->msg_namelen = sizeof(*sax); 1192 msg->msg_namelen = sizeof(*sax);
1184 1193
1185 skb_free_datagram(sk, skb); 1194out: skb_free_datagram(sk, skb);
1186 1195
1187 release_sock(sk); 1196 release_sock(sk);
1188 return copied; 1197 return copied;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 1fc4a7885c41..74776de523ec 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -584,7 +584,7 @@ drop_n_restore:
584 skb->len = skb_len; 584 skb->len = skb_len;
585 } 585 }
586drop: 586drop:
587 kfree_skb(skb); 587 consume_skb(skb);
588 return 0; 588 return 0;
589} 589}
590 590
@@ -756,8 +756,7 @@ ring_is_full:
756 spin_unlock(&sk->sk_receive_queue.lock); 756 spin_unlock(&sk->sk_receive_queue.lock);
757 757
758 sk->sk_data_ready(sk, 0); 758 sk->sk_data_ready(sk, 0);
759 if (copy_skb) 759 kfree_skb(copy_skb);
760 kfree_skb(copy_skb);
761 goto drop_n_restore; 760 goto drop_n_restore;
762} 761}
763 762
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 81795ea87794..a662e62a99cf 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -382,9 +382,8 @@ out:
382 return NET_RX_DROP; 382 return NET_RX_DROP;
383} 383}
384 384
385static struct packet_type phonet_packet_type = { 385static struct packet_type phonet_packet_type __read_mostly = {
386 .type = cpu_to_be16(ETH_P_PHONET), 386 .type = cpu_to_be16(ETH_P_PHONET),
387 .dev = NULL,
388 .func = phonet_rcv, 387 .func = phonet_rcv,
389}; 388};
390 389
diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c
index 1ceea1f92413..cec4e5951681 100644
--- a/net/phonet/pn_netlink.c
+++ b/net/phonet/pn_netlink.c
@@ -47,8 +47,9 @@ static void rtmsg_notify(int event, struct net_device *dev, u8 addr)
47 kfree_skb(skb); 47 kfree_skb(skb);
48 goto errout; 48 goto errout;
49 } 49 }
50 err = rtnl_notify(skb, dev_net(dev), 0, 50 rtnl_notify(skb, dev_net(dev), 0,
51 RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL); 51 RTNLGRP_PHONET_IFADDR, NULL, GFP_KERNEL);
52 return;
52errout: 53errout:
53 if (err < 0) 54 if (err < 0)
54 rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err); 55 rtnl_set_sk_err(dev_net(dev), RTNLGRP_PHONET_IFADDR, err);
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
new file mode 100644
index 000000000000..796773b5df9b
--- /dev/null
+++ b/net/rds/Kconfig
@@ -0,0 +1,14 @@
1
2config RDS
3 tristate "Reliable Datagram Sockets (RDS) (EXPERIMENTAL)"
4 depends on INET && INFINIBAND_IPOIB && EXPERIMENTAL
5 depends on INFINIBAND && INFINIBAND_ADDR_TRANS
6 ---help---
7 RDS provides reliable, sequenced delivery of datagrams
8 over Infiniband.
9
10config RDS_DEBUG
11 bool "Debugging messages"
12 depends on RDS
13 default n
14
diff --git a/net/rds/Makefile b/net/rds/Makefile
new file mode 100644
index 000000000000..51f27585fa08
--- /dev/null
+++ b/net/rds/Makefile
@@ -0,0 +1,14 @@
1obj-$(CONFIG_RDS) += rds.o
2rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
3 recv.o send.o stats.o sysctl.o threads.o transport.o \
4 loop.o page.o rdma.o \
5 rdma_transport.o \
6 ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
7 ib_sysctl.o ib_rdma.o \
8 iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
9 iw_sysctl.o iw_rdma.o
10
11ifeq ($(CONFIG_RDS_DEBUG), y)
12EXTRA_CFLAGS += -DDEBUG
13endif
14
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
new file mode 100644
index 000000000000..20cf16fc572f
--- /dev/null
+++ b/net/rds/af_rds.c
@@ -0,0 +1,586 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/module.h>
34#include <linux/errno.h>
35#include <linux/kernel.h>
36#include <linux/in.h>
37#include <linux/poll.h>
38#include <linux/version.h>
39#include <net/sock.h>
40
41#include "rds.h"
42#include "rdma.h"
43#include "rdma_transport.h"
44
45/* this is just used for stats gathering :/ */
46static DEFINE_SPINLOCK(rds_sock_lock);
47static unsigned long rds_sock_count;
48static LIST_HEAD(rds_sock_list);
49DECLARE_WAIT_QUEUE_HEAD(rds_poll_waitq);
50
51/*
52 * This is called as the final descriptor referencing this socket is closed.
53 * We have to unbind the socket so that another socket can be bound to the
54 * address it was using.
55 *
56 * We have to be careful about racing with the incoming path. sock_orphan()
57 * sets SOCK_DEAD and we use that as an indicator to the rx path that new
58 * messages shouldn't be queued.
59 */
60static int rds_release(struct socket *sock)
61{
62 struct sock *sk = sock->sk;
63 struct rds_sock *rs;
64 unsigned long flags;
65
66 if (sk == NULL)
67 goto out;
68
69 rs = rds_sk_to_rs(sk);
70
71 sock_orphan(sk);
72 /* Note - rds_clear_recv_queue grabs rs_recv_lock, so
73 * that ensures the recv path has completed messing
74 * with the socket. */
75 rds_clear_recv_queue(rs);
76 rds_cong_remove_socket(rs);
77 rds_remove_bound(rs);
78 rds_send_drop_to(rs, NULL);
79 rds_rdma_drop_keys(rs);
80 rds_notify_queue_get(rs, NULL);
81
82 spin_lock_irqsave(&rds_sock_lock, flags);
83 list_del_init(&rs->rs_item);
84 rds_sock_count--;
85 spin_unlock_irqrestore(&rds_sock_lock, flags);
86
87 sock->sk = NULL;
88 sock_put(sk);
89out:
90 return 0;
91}
92
93/*
94 * Careful not to race with rds_release -> sock_orphan which clears sk_sleep.
95 * _bh() isn't OK here, we're called from interrupt handlers. It's probably OK
96 * to wake the waitqueue after sk_sleep is clear as we hold a sock ref, but
97 * this seems more conservative.
98 * NB - normally, one would use sk_callback_lock for this, but we can
99 * get here from interrupts, whereas the network code grabs sk_callback_lock
100 * with _lock_bh only - so relying on sk_callback_lock introduces livelocks.
101 */
102void rds_wake_sk_sleep(struct rds_sock *rs)
103{
104 unsigned long flags;
105
106 read_lock_irqsave(&rs->rs_recv_lock, flags);
107 __rds_wake_sk_sleep(rds_rs_to_sk(rs));
108 read_unlock_irqrestore(&rs->rs_recv_lock, flags);
109}
110
111static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
112 int *uaddr_len, int peer)
113{
114 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
115 struct rds_sock *rs = rds_sk_to_rs(sock->sk);
116
117 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
118
119 /* racey, don't care */
120 if (peer) {
121 if (!rs->rs_conn_addr)
122 return -ENOTCONN;
123
124 sin->sin_port = rs->rs_conn_port;
125 sin->sin_addr.s_addr = rs->rs_conn_addr;
126 } else {
127 sin->sin_port = rs->rs_bound_port;
128 sin->sin_addr.s_addr = rs->rs_bound_addr;
129 }
130
131 sin->sin_family = AF_INET;
132
133 *uaddr_len = sizeof(*sin);
134 return 0;
135}
136
137/*
138 * RDS' poll is without a doubt the least intuitive part of the interface,
139 * as POLLIN and POLLOUT do not behave entirely as you would expect from
140 * a network protocol.
141 *
142 * POLLIN is asserted if
143 * - there is data on the receive queue.
144 * - to signal that a previously congested destination may have become
145 * uncongested
146 * - A notification has been queued to the socket (this can be a congestion
147 * update, or a RDMA completion).
148 *
149 * POLLOUT is asserted if there is room on the send queue. This does not mean
150 * however, that the next sendmsg() call will succeed. If the application tries
151 * to send to a congested destination, the system call may still fail (and
152 * return ENOBUFS).
153 */
154static unsigned int rds_poll(struct file *file, struct socket *sock,
155 poll_table *wait)
156{
157 struct sock *sk = sock->sk;
158 struct rds_sock *rs = rds_sk_to_rs(sk);
159 unsigned int mask = 0;
160 unsigned long flags;
161
162 poll_wait(file, sk->sk_sleep, wait);
163
164 poll_wait(file, &rds_poll_waitq, wait);
165
166 read_lock_irqsave(&rs->rs_recv_lock, flags);
167 if (!rs->rs_cong_monitor) {
168 /* When a congestion map was updated, we signal POLLIN for
169 * "historical" reasons. Applications can also poll for
170 * WRBAND instead. */
171 if (rds_cong_updated_since(&rs->rs_cong_track))
172 mask |= (POLLIN | POLLRDNORM | POLLWRBAND);
173 } else {
174 spin_lock(&rs->rs_lock);
175 if (rs->rs_cong_notify)
176 mask |= (POLLIN | POLLRDNORM);
177 spin_unlock(&rs->rs_lock);
178 }
179 if (!list_empty(&rs->rs_recv_queue)
180 || !list_empty(&rs->rs_notify_queue))
181 mask |= (POLLIN | POLLRDNORM);
182 if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
183 mask |= (POLLOUT | POLLWRNORM);
184 read_unlock_irqrestore(&rs->rs_recv_lock, flags);
185
186 return mask;
187}
188
189static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
190{
191 return -ENOIOCTLCMD;
192}
193
194static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
195 int len)
196{
197 struct sockaddr_in sin;
198 int ret = 0;
199
200 /* racing with another thread binding seems ok here */
201 if (rs->rs_bound_addr == 0) {
202 ret = -ENOTCONN; /* XXX not a great errno */
203 goto out;
204 }
205
206 if (len < sizeof(struct sockaddr_in)) {
207 ret = -EINVAL;
208 goto out;
209 }
210
211 if (copy_from_user(&sin, optval, sizeof(sin))) {
212 ret = -EFAULT;
213 goto out;
214 }
215
216 rds_send_drop_to(rs, &sin);
217out:
218 return ret;
219}
220
221static int rds_set_bool_option(unsigned char *optvar, char __user *optval,
222 int optlen)
223{
224 int value;
225
226 if (optlen < sizeof(int))
227 return -EINVAL;
228 if (get_user(value, (int __user *) optval))
229 return -EFAULT;
230 *optvar = !!value;
231 return 0;
232}
233
234static int rds_cong_monitor(struct rds_sock *rs, char __user *optval,
235 int optlen)
236{
237 int ret;
238
239 ret = rds_set_bool_option(&rs->rs_cong_monitor, optval, optlen);
240 if (ret == 0) {
241 if (rs->rs_cong_monitor) {
242 rds_cong_add_socket(rs);
243 } else {
244 rds_cong_remove_socket(rs);
245 rs->rs_cong_mask = 0;
246 rs->rs_cong_notify = 0;
247 }
248 }
249 return ret;
250}
251
252static int rds_setsockopt(struct socket *sock, int level, int optname,
253 char __user *optval, int optlen)
254{
255 struct rds_sock *rs = rds_sk_to_rs(sock->sk);
256 int ret;
257
258 if (level != SOL_RDS) {
259 ret = -ENOPROTOOPT;
260 goto out;
261 }
262
263 switch (optname) {
264 case RDS_CANCEL_SENT_TO:
265 ret = rds_cancel_sent_to(rs, optval, optlen);
266 break;
267 case RDS_GET_MR:
268 ret = rds_get_mr(rs, optval, optlen);
269 break;
270 case RDS_FREE_MR:
271 ret = rds_free_mr(rs, optval, optlen);
272 break;
273 case RDS_RECVERR:
274 ret = rds_set_bool_option(&rs->rs_recverr, optval, optlen);
275 break;
276 case RDS_CONG_MONITOR:
277 ret = rds_cong_monitor(rs, optval, optlen);
278 break;
279 default:
280 ret = -ENOPROTOOPT;
281 }
282out:
283 return ret;
284}
285
286static int rds_getsockopt(struct socket *sock, int level, int optname,
287 char __user *optval, int __user *optlen)
288{
289 struct rds_sock *rs = rds_sk_to_rs(sock->sk);
290 int ret = -ENOPROTOOPT, len;
291
292 if (level != SOL_RDS)
293 goto out;
294
295 if (get_user(len, optlen)) {
296 ret = -EFAULT;
297 goto out;
298 }
299
300 switch (optname) {
301 case RDS_INFO_FIRST ... RDS_INFO_LAST:
302 ret = rds_info_getsockopt(sock, optname, optval,
303 optlen);
304 break;
305
306 case RDS_RECVERR:
307 if (len < sizeof(int))
308 ret = -EINVAL;
309 else
310 if (put_user(rs->rs_recverr, (int __user *) optval)
311 || put_user(sizeof(int), optlen))
312 ret = -EFAULT;
313 else
314 ret = 0;
315 break;
316 default:
317 break;
318 }
319
320out:
321 return ret;
322
323}
324
325static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
326 int addr_len, int flags)
327{
328 struct sock *sk = sock->sk;
329 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
330 struct rds_sock *rs = rds_sk_to_rs(sk);
331 int ret = 0;
332
333 lock_sock(sk);
334
335 if (addr_len != sizeof(struct sockaddr_in)) {
336 ret = -EINVAL;
337 goto out;
338 }
339
340 if (sin->sin_family != AF_INET) {
341 ret = -EAFNOSUPPORT;
342 goto out;
343 }
344
345 if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
346 ret = -EDESTADDRREQ;
347 goto out;
348 }
349
350 rs->rs_conn_addr = sin->sin_addr.s_addr;
351 rs->rs_conn_port = sin->sin_port;
352
353out:
354 release_sock(sk);
355 return ret;
356}
357
358static struct proto rds_proto = {
359 .name = "RDS",
360 .owner = THIS_MODULE,
361 .obj_size = sizeof(struct rds_sock),
362};
363
364static struct proto_ops rds_proto_ops = {
365 .family = AF_RDS,
366 .owner = THIS_MODULE,
367 .release = rds_release,
368 .bind = rds_bind,
369 .connect = rds_connect,
370 .socketpair = sock_no_socketpair,
371 .accept = sock_no_accept,
372 .getname = rds_getname,
373 .poll = rds_poll,
374 .ioctl = rds_ioctl,
375 .listen = sock_no_listen,
376 .shutdown = sock_no_shutdown,
377 .setsockopt = rds_setsockopt,
378 .getsockopt = rds_getsockopt,
379 .sendmsg = rds_sendmsg,
380 .recvmsg = rds_recvmsg,
381 .mmap = sock_no_mmap,
382 .sendpage = sock_no_sendpage,
383};
384
385static int __rds_create(struct socket *sock, struct sock *sk, int protocol)
386{
387 unsigned long flags;
388 struct rds_sock *rs;
389
390 sock_init_data(sock, sk);
391 sock->ops = &rds_proto_ops;
392 sk->sk_protocol = protocol;
393
394 rs = rds_sk_to_rs(sk);
395 spin_lock_init(&rs->rs_lock);
396 rwlock_init(&rs->rs_recv_lock);
397 INIT_LIST_HEAD(&rs->rs_send_queue);
398 INIT_LIST_HEAD(&rs->rs_recv_queue);
399 INIT_LIST_HEAD(&rs->rs_notify_queue);
400 INIT_LIST_HEAD(&rs->rs_cong_list);
401 spin_lock_init(&rs->rs_rdma_lock);
402 rs->rs_rdma_keys = RB_ROOT;
403
404 spin_lock_irqsave(&rds_sock_lock, flags);
405 list_add_tail(&rs->rs_item, &rds_sock_list);
406 rds_sock_count++;
407 spin_unlock_irqrestore(&rds_sock_lock, flags);
408
409 return 0;
410}
411
412static int rds_create(struct net *net, struct socket *sock, int protocol)
413{
414 struct sock *sk;
415
416 if (sock->type != SOCK_SEQPACKET || protocol)
417 return -ESOCKTNOSUPPORT;
418
419 sk = sk_alloc(net, AF_RDS, GFP_ATOMIC, &rds_proto);
420 if (!sk)
421 return -ENOMEM;
422
423 return __rds_create(sock, sk, protocol);
424}
425
426void rds_sock_addref(struct rds_sock *rs)
427{
428 sock_hold(rds_rs_to_sk(rs));
429}
430
431void rds_sock_put(struct rds_sock *rs)
432{
433 sock_put(rds_rs_to_sk(rs));
434}
435
436static struct net_proto_family rds_family_ops = {
437 .family = AF_RDS,
438 .create = rds_create,
439 .owner = THIS_MODULE,
440};
441
442static void rds_sock_inc_info(struct socket *sock, unsigned int len,
443 struct rds_info_iterator *iter,
444 struct rds_info_lengths *lens)
445{
446 struct rds_sock *rs;
447 struct sock *sk;
448 struct rds_incoming *inc;
449 unsigned long flags;
450 unsigned int total = 0;
451
452 len /= sizeof(struct rds_info_message);
453
454 spin_lock_irqsave(&rds_sock_lock, flags);
455
456 list_for_each_entry(rs, &rds_sock_list, rs_item) {
457 sk = rds_rs_to_sk(rs);
458 read_lock(&rs->rs_recv_lock);
459
460 /* XXX too lazy to maintain counts.. */
461 list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
462 total++;
463 if (total <= len)
464 rds_inc_info_copy(inc, iter, inc->i_saddr,
465 rs->rs_bound_addr, 1);
466 }
467
468 read_unlock(&rs->rs_recv_lock);
469 }
470
471 spin_unlock_irqrestore(&rds_sock_lock, flags);
472
473 lens->nr = total;
474 lens->each = sizeof(struct rds_info_message);
475}
476
477static void rds_sock_info(struct socket *sock, unsigned int len,
478 struct rds_info_iterator *iter,
479 struct rds_info_lengths *lens)
480{
481 struct rds_info_socket sinfo;
482 struct rds_sock *rs;
483 unsigned long flags;
484
485 len /= sizeof(struct rds_info_socket);
486
487 spin_lock_irqsave(&rds_sock_lock, flags);
488
489 if (len < rds_sock_count)
490 goto out;
491
492 list_for_each_entry(rs, &rds_sock_list, rs_item) {
493 sinfo.sndbuf = rds_sk_sndbuf(rs);
494 sinfo.rcvbuf = rds_sk_rcvbuf(rs);
495 sinfo.bound_addr = rs->rs_bound_addr;
496 sinfo.connected_addr = rs->rs_conn_addr;
497 sinfo.bound_port = rs->rs_bound_port;
498 sinfo.connected_port = rs->rs_conn_port;
499 sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
500
501 rds_info_copy(iter, &sinfo, sizeof(sinfo));
502 }
503
504out:
505 lens->nr = rds_sock_count;
506 lens->each = sizeof(struct rds_info_socket);
507
508 spin_unlock_irqrestore(&rds_sock_lock, flags);
509}
510
511static void __exit rds_exit(void)
512{
513 rds_rdma_exit();
514 sock_unregister(rds_family_ops.family);
515 proto_unregister(&rds_proto);
516 rds_conn_exit();
517 rds_cong_exit();
518 rds_sysctl_exit();
519 rds_threads_exit();
520 rds_stats_exit();
521 rds_page_exit();
522 rds_info_deregister_func(RDS_INFO_SOCKETS, rds_sock_info);
523 rds_info_deregister_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
524}
525module_exit(rds_exit);
526
527static int __init rds_init(void)
528{
529 int ret;
530
531 ret = rds_conn_init();
532 if (ret)
533 goto out;
534 ret = rds_threads_init();
535 if (ret)
536 goto out_conn;
537 ret = rds_sysctl_init();
538 if (ret)
539 goto out_threads;
540 ret = rds_stats_init();
541 if (ret)
542 goto out_sysctl;
543 ret = proto_register(&rds_proto, 1);
544 if (ret)
545 goto out_stats;
546 ret = sock_register(&rds_family_ops);
547 if (ret)
548 goto out_proto;
549
550 rds_info_register_func(RDS_INFO_SOCKETS, rds_sock_info);
551 rds_info_register_func(RDS_INFO_RECV_MESSAGES, rds_sock_inc_info);
552
553 /* ib/iwarp transports currently compiled-in */
554 ret = rds_rdma_init();
555 if (ret)
556 goto out_sock;
557 goto out;
558
559out_sock:
560 sock_unregister(rds_family_ops.family);
561out_proto:
562 proto_unregister(&rds_proto);
563out_stats:
564 rds_stats_exit();
565out_sysctl:
566 rds_sysctl_exit();
567out_threads:
568 rds_threads_exit();
569out_conn:
570 rds_conn_exit();
571 rds_cong_exit();
572 rds_page_exit();
573out:
574 return ret;
575}
576module_init(rds_init);
577
578#define DRV_VERSION "4.0"
579#define DRV_RELDATE "Feb 12, 2009"
580
581MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
582MODULE_DESCRIPTION("RDS: Reliable Datagram Sockets"
583 " v" DRV_VERSION " (" DRV_RELDATE ")");
584MODULE_VERSION(DRV_VERSION);
585MODULE_LICENSE("Dual BSD/GPL");
586MODULE_ALIAS_NETPROTO(PF_RDS);
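
Taken together, af_rds.c above registers AF_RDS as a connectionless SOCK_SEQPACKET family: bind() attaches the socket to a specific local IP (INADDR_ANY is rejected) and selects the transport for that address, connect() only records a default destination, and congestion is surfaced through poll() and ENOBUFS as the rds_poll() comment describes. A hedged userspace sketch of the basic setup (the AF_RDS fallback value and the addresses are assumptions for illustration only):

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>

#ifndef AF_RDS
#define AF_RDS 21	/* assumption: family number assigned by this patch set */
#endif

int main(void)
{
	struct sockaddr_in laddr, raddr;
	int fd = socket(AF_RDS, SOCK_SEQPACKET, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&laddr, 0, sizeof(laddr));
	laddr.sin_family = AF_INET;
	laddr.sin_addr.s_addr = inet_addr("192.0.2.1");	/* a local IB/iWARP address */
	laddr.sin_port = 0;	/* 0: let rds_bind() pick a free port */
	if (bind(fd, (struct sockaddr *)&laddr, sizeof(laddr)) < 0)
		perror("bind");

	memset(&raddr, 0, sizeof(raddr));
	raddr.sin_family = AF_INET;
	raddr.sin_addr.s_addr = inet_addr("192.0.2.2");
	raddr.sin_port = htons(4000);
	/* connect() only records the default destination; datagrams can
	 * also be sent with sendmsg() and an explicit address. */
	if (connect(fd, (struct sockaddr *)&raddr, sizeof(raddr)) < 0)
		perror("connect");
	return 0;
}
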
diff --git a/net/rds/bind.c b/net/rds/bind.c
new file mode 100644
index 000000000000..c17cc39160ce
--- /dev/null
+++ b/net/rds/bind.c
@@ -0,0 +1,199 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <net/sock.h>
35#include <linux/in.h>
36#include <linux/if_arp.h>
37#include "rds.h"
38
39/*
40 * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't
41 * particularly zippy.
42 *
43 * This is now called for every incoming frame so we arguably care much more
44 * about it than we used to.
45 */
46static DEFINE_SPINLOCK(rds_bind_lock);
47static struct rb_root rds_bind_tree = RB_ROOT;
48
49static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port,
50 struct rds_sock *insert)
51{
52 struct rb_node **p = &rds_bind_tree.rb_node;
53 struct rb_node *parent = NULL;
54 struct rds_sock *rs;
55 u64 cmp;
56 u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port);
57
58 while (*p) {
59 parent = *p;
60 rs = rb_entry(parent, struct rds_sock, rs_bound_node);
61
62 cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) |
63 be16_to_cpu(rs->rs_bound_port);
64
65 if (needle < cmp)
66 p = &(*p)->rb_left;
67 else if (needle > cmp)
68 p = &(*p)->rb_right;
69 else
70 return rs;
71 }
72
73 if (insert) {
74 rb_link_node(&insert->rs_bound_node, parent, p);
75 rb_insert_color(&insert->rs_bound_node, &rds_bind_tree);
76 }
77 return NULL;
78}
79
80/*
81 * Return the rds_sock bound at the given local address.
82 *
83 * The rx path can race with rds_release. We notice if rds_release() has
84 * marked this socket and don't return a rs ref to the rx path.
85 */
86struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
87{
88 struct rds_sock *rs;
89 unsigned long flags;
90
91 spin_lock_irqsave(&rds_bind_lock, flags);
92 rs = rds_bind_tree_walk(addr, port, NULL);
93 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
94 rds_sock_addref(rs);
95 else
96 rs = NULL;
97 spin_unlock_irqrestore(&rds_bind_lock, flags);
98
99 rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
100 ntohs(port));
101 return rs;
102}
103
104/* returns -ve errno or +ve port */
105static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
106{
107 unsigned long flags;
108 int ret = -EADDRINUSE;
109 u16 rover, last;
110
111 if (*port != 0) {
112 rover = be16_to_cpu(*port);
113 last = rover;
114 } else {
115 rover = max_t(u16, net_random(), 2);
116 last = rover - 1;
117 }
118
119 spin_lock_irqsave(&rds_bind_lock, flags);
120
121 do {
122 if (rover == 0)
123 rover++;
124 if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) {
125 *port = cpu_to_be16(rover);
126 ret = 0;
127 break;
128 }
129 } while (rover++ != last);
130
131 if (ret == 0) {
132 rs->rs_bound_addr = addr;
133 rs->rs_bound_port = *port;
134 rds_sock_addref(rs);
135
136 rdsdebug("rs %p binding to %pI4:%d\n",
137 rs, &addr, (int)ntohs(*port));
138 }
139
140 spin_unlock_irqrestore(&rds_bind_lock, flags);
141
142 return ret;
143}
144
145void rds_remove_bound(struct rds_sock *rs)
146{
147 unsigned long flags;
148
149 spin_lock_irqsave(&rds_bind_lock, flags);
150
151 if (rs->rs_bound_addr) {
152 rdsdebug("rs %p unbinding from %pI4:%d\n",
153 rs, &rs->rs_bound_addr,
154 ntohs(rs->rs_bound_port));
155
156 rb_erase(&rs->rs_bound_node, &rds_bind_tree);
157 rds_sock_put(rs);
158 rs->rs_bound_addr = 0;
159 }
160
161 spin_unlock_irqrestore(&rds_bind_lock, flags);
162}
163
164int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
165{
166 struct sock *sk = sock->sk;
167 struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
168 struct rds_sock *rs = rds_sk_to_rs(sk);
169 struct rds_transport *trans;
170 int ret = 0;
171
172 lock_sock(sk);
173
174 if (addr_len != sizeof(struct sockaddr_in) ||
175 sin->sin_family != AF_INET ||
176 rs->rs_bound_addr ||
177 sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
178 ret = -EINVAL;
179 goto out;
180 }
181
182 ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
183 if (ret)
184 goto out;
185
186 trans = rds_trans_get_preferred(sin->sin_addr.s_addr);
187 if (trans == NULL) {
188 ret = -EADDRNOTAVAIL;
189 rds_remove_bound(rs);
190 goto out;
191 }
192
193 rs->rs_transport = trans;
194 ret = 0;
195
196out:
197 release_sock(sk);
198 return ret;
199}
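
rds_bind_tree_walk() above keys the rbtree on a single u64 so that the per-frame lookup in rds_find_bound() is one ordered walk. A tiny worked sketch of that key (rds_bound_key() is a hypothetical helper mirroring the needle computation):

#include <stdint.h>
#include <arpa/inet.h>

/* High 32 bits: IPv4 address, low 16 bits: port, both in host byte order,
 * exactly as rds_bind_tree_walk() builds its needle/cmp values. */
static uint64_t rds_bound_key(uint32_t addr_be, uint16_t port_be)
{
	return ((uint64_t)ntohl(addr_be) << 32) | ntohs(port_be);
}
/* e.g. 192.0.2.1 port 4000 -> (0xC0000201ULL << 32) | 4000 */
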
diff --git a/net/rds/cong.c b/net/rds/cong.c
new file mode 100644
index 000000000000..710e4599d76c
--- /dev/null
+++ b/net/rds/cong.c
@@ -0,0 +1,404 @@
1/*
2 * Copyright (c) 2007 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/types.h>
34#include <linux/rbtree.h>
35
36#include <asm-generic/bitops/le.h>
37
38#include "rds.h"
39
40/*
41 * This file implements the receive side of the unconventional congestion
42 * management in RDS.
43 *
44 * Messages waiting in the receive queue on the receiving socket are accounted
45 * against the socket's SO_RCVBUF option value. Only the payload bytes in the
46 * message are accounted for. If the number of bytes queued equals or exceeds
47 * rcvbuf then the socket is congested. All sends attempted to this socket's
48 * address should block or return -EWOULDBLOCK.
49 *
50 * Applications are expected to be reasonably tuned such that this situation
51 * very rarely occurs. An application encountering this "back-pressure" is
52 * considered a bug.
53 *
54 * This is implemented by having each node maintain bitmaps which indicate
55 * which ports on bound addresses are congested. As the bitmap changes it is
56 * sent through all the connections which terminate in the local address of the
57 * bitmap which changed.
58 *
59 * The bitmaps are allocated as connections are brought up. This avoids
60 * allocation in the interrupt handling path which queues messages on sockets.
61 * The dense bitmaps let transports send the entire bitmap on any bitmap change
62 * reasonably efficiently. This is much easier to implement than some
63 * finer-grained communication of per-port congestion. The sender does a very
64 * inexpensive bit test to test if the port it's about to send to is congested
65 * or not.
66 */
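
As a rough illustration (a minimal sketch; it assumes 4 KB pages and the full 16-bit port space, and RDS_CONG_MAP_PAGES / RDS_CONG_MAP_PAGE_BITS come from rds.h, which is not shown here): one bit per port is 65536 bits = 8 KB = two pages per address, and a port indexes into the map like this:

/* Sketch of the page/bit arithmetic used by rds_cong_set_bit() and friends
 * below, assuming RDS_CONG_MAP_PAGE_BITS == PAGE_SIZE * 8. */
static void rds_cong_map_index_sketch(u16 port, unsigned long *page,
				      unsigned long *bit)
{
	*page = port / (PAGE_SIZE * 8);	/* which zeroed page holds the bit */
	*bit = port % (PAGE_SIZE * 8);	/* bit offset within that page */
}
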
67
68/*
69 * Interaction with poll is a tad tricky. We want all processes stuck in
70 * poll to wake up and check whether a congested destination became uncongested.
71 * The really sad thing is we have no idea which destinations the application
72 * wants to send to - we don't even know which rds_connections are involved.
73 * So until we implement a more flexible rds poll interface, we have to make
74 * do with this:
75 * We maintain a global counter that is incremented each time a congestion map
76 * update is received. Each rds socket tracks this value, and if rds_poll
77 * finds that the saved generation number is smaller than the global generation
78 * number, it wakes up the process.
79 */
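
A minimal sketch of how a poll implementation can consume this counter; rds_cong_updated_since() below is the real helper, while the function and the caller-held saved generation here are hypothetical:

/* Hypothetical poll-side check: report writability whenever any congestion
 * update has arrived since the caller last looked, so it rechecks its
 * destinations. */
static unsigned int rds_poll_cong_sketch(unsigned long *saved_gen)
{
	if (rds_cong_updated_since(saved_gen))
		return POLLOUT | POLLWRNORM;
	return 0;
}
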
80static atomic_t rds_cong_generation = ATOMIC_INIT(0);
81
82/*
83 * Congestion monitoring
84 */
85static LIST_HEAD(rds_cong_monitor);
86static DEFINE_RWLOCK(rds_cong_monitor_lock);
87
88/*
89 * Yes, a global lock. It's used so infrequently that it's worth keeping it
90 * global to simplify the locking. It's only used in the following
91 * circumstances:
92 *
93 * - on connection buildup to associate a conn with its maps
94 * - on map changes to inform conns of a new map to send
95 *
96 * It's sadly ordered under the socket callback lock and the connection lock.
97 * Receive paths can mark ports congested from interrupt context so the
98 * lock masks interrupts.
99 */
100static DEFINE_SPINLOCK(rds_cong_lock);
101static struct rb_root rds_cong_tree = RB_ROOT;
102
103static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
104 struct rds_cong_map *insert)
105{
106 struct rb_node **p = &rds_cong_tree.rb_node;
107 struct rb_node *parent = NULL;
108 struct rds_cong_map *map;
109
110 while (*p) {
111 parent = *p;
112 map = rb_entry(parent, struct rds_cong_map, m_rb_node);
113
114 if (addr < map->m_addr)
115 p = &(*p)->rb_left;
116 else if (addr > map->m_addr)
117 p = &(*p)->rb_right;
118 else
119 return map;
120 }
121
122 if (insert) {
123 rb_link_node(&insert->m_rb_node, parent, p);
124 rb_insert_color(&insert->m_rb_node, &rds_cong_tree);
125 }
126 return NULL;
127}
128
129/*
130 * There is only ever one bitmap for any address. Connections try to allocate
131 * these bitmaps in the process of getting pointers to them. The bitmaps are only
132 * ever freed as the module is removed after all connections have been freed.
133 */
134static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
135{
136 struct rds_cong_map *map;
137 struct rds_cong_map *ret = NULL;
138 unsigned long zp;
139 unsigned long i;
140 unsigned long flags;
141
142 map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL);
143 if (map == NULL)
144 return NULL;
145
146 map->m_addr = addr;
147 init_waitqueue_head(&map->m_waitq);
148 INIT_LIST_HEAD(&map->m_conn_list);
149
150 for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
151 zp = get_zeroed_page(GFP_KERNEL);
152 if (zp == 0)
153 goto out;
154 map->m_page_addrs[i] = zp;
155 }
156
157 spin_lock_irqsave(&rds_cong_lock, flags);
158 ret = rds_cong_tree_walk(addr, map);
159 spin_unlock_irqrestore(&rds_cong_lock, flags);
160
161 if (ret == NULL) {
162 ret = map;
163 map = NULL;
164 }
165
166out:
167 if (map) {
168 for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
169 free_page(map->m_page_addrs[i]);
170 kfree(map);
171 }
172
173 rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
174
175 return ret;
176}
177
178/*
179 * Put the conn on its local map's list. This is called when the conn is
180 * really added to the hash. It's nested under the rds_conn_lock, sadly.
181 */
182void rds_cong_add_conn(struct rds_connection *conn)
183{
184 unsigned long flags;
185
186 rdsdebug("conn %p now on map %p\n", conn, conn->c_lcong);
187 spin_lock_irqsave(&rds_cong_lock, flags);
188 list_add_tail(&conn->c_map_item, &conn->c_lcong->m_conn_list);
189 spin_unlock_irqrestore(&rds_cong_lock, flags);
190}
191
192void rds_cong_remove_conn(struct rds_connection *conn)
193{
194 unsigned long flags;
195
196 rdsdebug("removing conn %p from map %p\n", conn, conn->c_lcong);
197 spin_lock_irqsave(&rds_cong_lock, flags);
198 list_del_init(&conn->c_map_item);
199 spin_unlock_irqrestore(&rds_cong_lock, flags);
200}
201
202int rds_cong_get_maps(struct rds_connection *conn)
203{
204 conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
205 conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
206
207 if (conn->c_lcong == NULL || conn->c_fcong == NULL)
208 return -ENOMEM;
209
210 return 0;
211}
212
213void rds_cong_queue_updates(struct rds_cong_map *map)
214{
215 struct rds_connection *conn;
216 unsigned long flags;
217
218 spin_lock_irqsave(&rds_cong_lock, flags);
219
220 list_for_each_entry(conn, &map->m_conn_list, c_map_item) {
221 if (!test_and_set_bit(0, &conn->c_map_queued)) {
222 rds_stats_inc(s_cong_update_queued);
223 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
224 }
225 }
226
227 spin_unlock_irqrestore(&rds_cong_lock, flags);
228}
229
230void rds_cong_map_updated(struct rds_cong_map *map, uint64_t portmask)
231{
232 rdsdebug("waking map %p for %pI4\n",
233 map, &map->m_addr);
234 rds_stats_inc(s_cong_update_received);
235 atomic_inc(&rds_cong_generation);
236 if (waitqueue_active(&map->m_waitq))
237 wake_up(&map->m_waitq);
238 if (waitqueue_active(&rds_poll_waitq))
239 wake_up_all(&rds_poll_waitq);
240
241 if (portmask && !list_empty(&rds_cong_monitor)) {
242 unsigned long flags;
243 struct rds_sock *rs;
244
245 read_lock_irqsave(&rds_cong_monitor_lock, flags);
246 list_for_each_entry(rs, &rds_cong_monitor, rs_cong_list) {
247 spin_lock(&rs->rs_lock);
248 rs->rs_cong_notify |= (rs->rs_cong_mask & portmask);
249 rs->rs_cong_mask &= ~portmask;
250 spin_unlock(&rs->rs_lock);
251 if (rs->rs_cong_notify)
252 rds_wake_sk_sleep(rs);
253 }
254 read_unlock_irqrestore(&rds_cong_monitor_lock, flags);
255 }
256}
257
258int rds_cong_updated_since(unsigned long *recent)
259{
260 unsigned long gen = atomic_read(&rds_cong_generation);
261
262 if (likely(*recent == gen))
263 return 0;
264 *recent = gen;
265 return 1;
266}
267
268/*
269 * We're called under the locking that protects the sockets receive buffer
270 * consumption. This makes it a lot easier for the caller to only call us
271 * when it knows that an existing set bit needs to be cleared, and vice versa.
272 * We can't block and we need to deal with concurrent sockets working against
273 * the same per-address map.
274 */
275void rds_cong_set_bit(struct rds_cong_map *map, __be16 port)
276{
277 unsigned long i;
278 unsigned long off;
279
280 rdsdebug("setting congestion for %pI4:%u in map %p\n",
281 &map->m_addr, ntohs(port), map);
282
283 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
284 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
285
286 generic___set_le_bit(off, (void *)map->m_page_addrs[i]);
287}
288
289void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port)
290{
291 unsigned long i;
292 unsigned long off;
293
294 rdsdebug("clearing congestion for %pI4:%u in map %p\n",
295 &map->m_addr, ntohs(port), map);
296
297 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
298 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
299
300 generic___clear_le_bit(off, (void *)map->m_page_addrs[i]);
301}
302
303static int rds_cong_test_bit(struct rds_cong_map *map, __be16 port)
304{
305 unsigned long i;
306 unsigned long off;
307
308 i = be16_to_cpu(port) / RDS_CONG_MAP_PAGE_BITS;
309 off = be16_to_cpu(port) % RDS_CONG_MAP_PAGE_BITS;
310
311 return generic_test_le_bit(off, (void *)map->m_page_addrs[i]);
312}
313
314void rds_cong_add_socket(struct rds_sock *rs)
315{
316 unsigned long flags;
317
318 write_lock_irqsave(&rds_cong_monitor_lock, flags);
319 if (list_empty(&rs->rs_cong_list))
320 list_add(&rs->rs_cong_list, &rds_cong_monitor);
321 write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
322}
323
324void rds_cong_remove_socket(struct rds_sock *rs)
325{
326 unsigned long flags;
327 struct rds_cong_map *map;
328
329 write_lock_irqsave(&rds_cong_monitor_lock, flags);
330 list_del_init(&rs->rs_cong_list);
331 write_unlock_irqrestore(&rds_cong_monitor_lock, flags);
332
333 /* update congestion map for now-closed port */
334 spin_lock_irqsave(&rds_cong_lock, flags);
335 map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
336 spin_unlock_irqrestore(&rds_cong_lock, flags);
337
338 if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
339 rds_cong_clear_bit(map, rs->rs_bound_port);
340 rds_cong_queue_updates(map);
341 }
342}
343
344int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock,
345 struct rds_sock *rs)
346{
347 if (!rds_cong_test_bit(map, port))
348 return 0;
349 if (nonblock) {
350 if (rs && rs->rs_cong_monitor) {
351 unsigned long flags;
352
353 /* It would have been nice to have an atomic set_bit on
354 * a uint64_t. */
355 spin_lock_irqsave(&rs->rs_lock, flags);
356 rs->rs_cong_mask |= RDS_CONG_MONITOR_MASK(ntohs(port));
357 spin_unlock_irqrestore(&rs->rs_lock, flags);
358
359 /* Test again - a congestion update may have arrived in
360 * the meantime. */
361 if (!rds_cong_test_bit(map, port))
362 return 0;
363 }
364 rds_stats_inc(s_cong_send_error);
365 return -ENOBUFS;
366 }
367
368 rds_stats_inc(s_cong_send_blocked);
369 rdsdebug("waiting on map %p for port %u\n", map, be16_to_cpu(port));
370
371 return wait_event_interruptible(map->m_waitq,
372 !rds_cong_test_bit(map, port));
373}
374
375void rds_cong_exit(void)
376{
377 struct rb_node *node;
378 struct rds_cong_map *map;
379 unsigned long i;
380
381 while ((node = rb_first(&rds_cong_tree))) {
382 map = rb_entry(node, struct rds_cong_map, m_rb_node);
383 rdsdebug("freeing map %p\n", map);
384 rb_erase(&map->m_rb_node, &rds_cong_tree);
385 for (i = 0; i < RDS_CONG_MAP_PAGES && map->m_page_addrs[i]; i++)
386 free_page(map->m_page_addrs[i]);
387 kfree(map);
388 }
389}
390
391/*
392 * Allocate a RDS message containing a congestion update.
393 */
394struct rds_message *rds_cong_update_alloc(struct rds_connection *conn)
395{
396 struct rds_cong_map *map = conn->c_lcong;
397 struct rds_message *rm;
398
399 rm = rds_message_map_pages(map->m_page_addrs, RDS_CONG_MAP_BYTES);
400 if (!IS_ERR(rm))
401 rm->m_inc.i_hdr.h_flags = RDS_FLAG_CONG_BITMAP;
402
403 return rm;
404}
diff --git a/net/rds/connection.c b/net/rds/connection.c
new file mode 100644
index 000000000000..273f064930a8
--- /dev/null
+++ b/net/rds/connection.c
@@ -0,0 +1,487 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/list.h>
35#include <net/inet_hashtables.h>
36
37#include "rds.h"
38#include "loop.h"
39#include "rdma.h"
40
41#define RDS_CONNECTION_HASH_BITS 12
42#define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS)
43#define RDS_CONNECTION_HASH_MASK (RDS_CONNECTION_HASH_ENTRIES - 1)
44
45/* converting this to RCU is a chore for another day.. */
46static DEFINE_SPINLOCK(rds_conn_lock);
47static unsigned long rds_conn_count;
48static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
49static struct kmem_cache *rds_conn_slab;
50
51static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
52{
53 /* Pass NULL, don't need struct net for hash */
54 unsigned long hash = inet_ehashfn(NULL,
55 be32_to_cpu(laddr), 0,
56 be32_to_cpu(faddr), 0);
57 return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
58}
59
60#define rds_conn_info_set(var, test, suffix) do { \
61 if (test) \
62 var |= RDS_INFO_CONNECTION_FLAG_##suffix; \
63} while (0)
64
65static inline int rds_conn_is_sending(struct rds_connection *conn)
66{
67 int ret = 0;
68
69 if (!mutex_trylock(&conn->c_send_lock))
70 ret = 1;
71 else
72 mutex_unlock(&conn->c_send_lock);
73
74 return ret;
75}
76
77static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
78 __be32 laddr, __be32 faddr,
79 struct rds_transport *trans)
80{
81 struct rds_connection *conn, *ret = NULL;
82 struct hlist_node *pos;
83
84 hlist_for_each_entry(conn, pos, head, c_hash_node) {
85 if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
86 conn->c_trans == trans) {
87 ret = conn;
88 break;
89 }
90 }
91 rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
92 &laddr, &faddr);
93 return ret;
94}
95
96/*
97 * This is called by transports as they're bringing down a connection.
98 * It clears partial message state so that the transport can start sending
99 * and receiving over this connection again in the future. It is up to
100 * the transport to have serialized this call with its send and recv.
101 */
102void rds_conn_reset(struct rds_connection *conn)
103{
104 rdsdebug("connection %pI4 to %pI4 reset\n",
105 &conn->c_laddr, &conn->c_faddr);
106
107 rds_stats_inc(s_conn_reset);
108 rds_send_reset(conn);
109 conn->c_flags = 0;
110
111 /* Do not clear next_rx_seq here, else we cannot distinguish
112 * retransmitted packets from new packets, and will hand all
113 * of them to the application. That is not consistent with the
114 * reliability guarantees of RDS. */
115}
116
117/*
118 * There is only ever one 'conn' for a given pair of addresses in the
119 * system at a time. They contain messages to be retransmitted and so
120 * span the lifetime of the actual underlying transport connections.
121 *
122 * For now they are not garbage collected once they're created. They
123 * are torn down as the module is removed, if ever.
124 */
125static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
126 struct rds_transport *trans, gfp_t gfp,
127 int is_outgoing)
128{
129 struct rds_connection *conn, *tmp, *parent = NULL;
130 struct hlist_head *head = rds_conn_bucket(laddr, faddr);
131 unsigned long flags;
132 int ret;
133
134 spin_lock_irqsave(&rds_conn_lock, flags);
135 conn = rds_conn_lookup(head, laddr, faddr, trans);
136 if (conn
137 && conn->c_loopback
138 && conn->c_trans != &rds_loop_transport
139 && !is_outgoing) {
140 /* This is a looped back IB connection, and we're
141 * called by the code handling the incoming connect.
142 * We need a second connection object into which we
143 * can stick the other QP. */
144 parent = conn;
145 conn = parent->c_passive;
146 }
147 spin_unlock_irqrestore(&rds_conn_lock, flags);
148 if (conn)
149 goto out;
150
151 conn = kmem_cache_alloc(rds_conn_slab, gfp);
152 if (conn == NULL) {
153 conn = ERR_PTR(-ENOMEM);
154 goto out;
155 }
156
157 memset(conn, 0, sizeof(*conn));
158
159 INIT_HLIST_NODE(&conn->c_hash_node);
160 conn->c_version = RDS_PROTOCOL_3_0;
161 conn->c_laddr = laddr;
162 conn->c_faddr = faddr;
163 spin_lock_init(&conn->c_lock);
164 conn->c_next_tx_seq = 1;
165
166 mutex_init(&conn->c_send_lock);
167 INIT_LIST_HEAD(&conn->c_send_queue);
168 INIT_LIST_HEAD(&conn->c_retrans);
169
170 ret = rds_cong_get_maps(conn);
171 if (ret) {
172 kmem_cache_free(rds_conn_slab, conn);
173 conn = ERR_PTR(ret);
174 goto out;
175 }
176
177 /*
178 * This is where a connection becomes loopback. If *any* RDS sockets
179 * can bind to the destination address then we'd rather the messages
180 * flow through loopback rather than either transport.
181 */
182 if (rds_trans_get_preferred(faddr)) {
183 conn->c_loopback = 1;
184 if (is_outgoing && trans->t_prefer_loopback) {
185 /* "outgoing" connection - and the transport
186 * says it wants the connection handled by the
187 * loopback transport. This is what TCP does.
188 */
189 trans = &rds_loop_transport;
190 }
191 }
192
193 conn->c_trans = trans;
194
195 ret = trans->conn_alloc(conn, gfp);
196 if (ret) {
197 kmem_cache_free(rds_conn_slab, conn);
198 conn = ERR_PTR(ret);
199 goto out;
200 }
201
202 atomic_set(&conn->c_state, RDS_CONN_DOWN);
203 conn->c_reconnect_jiffies = 0;
204 INIT_DELAYED_WORK(&conn->c_send_w, rds_send_worker);
205 INIT_DELAYED_WORK(&conn->c_recv_w, rds_recv_worker);
206 INIT_DELAYED_WORK(&conn->c_conn_w, rds_connect_worker);
207 INIT_WORK(&conn->c_down_w, rds_shutdown_worker);
208 mutex_init(&conn->c_cm_lock);
209 conn->c_flags = 0;
210
211 rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
212 conn, &laddr, &faddr,
213 trans->t_name ? trans->t_name : "[unknown]",
214 is_outgoing ? "(outgoing)" : "");
215
216 spin_lock_irqsave(&rds_conn_lock, flags);
217 if (parent == NULL) {
218 tmp = rds_conn_lookup(head, laddr, faddr, trans);
219 if (tmp == NULL)
220 hlist_add_head(&conn->c_hash_node, head);
221 } else {
222 tmp = parent->c_passive;
223 if (!tmp)
224 parent->c_passive = conn;
225 }
226
227 if (tmp) {
228 trans->conn_free(conn->c_transport_data);
229 kmem_cache_free(rds_conn_slab, conn);
230 conn = tmp;
231 } else {
232 rds_cong_add_conn(conn);
233 rds_conn_count++;
234 }
235
236 spin_unlock_irqrestore(&rds_conn_lock, flags);
237
238out:
239 return conn;
240}
241
242struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
243 struct rds_transport *trans, gfp_t gfp)
244{
245 return __rds_conn_create(laddr, faddr, trans, gfp, 0);
246}
247
248struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
249 struct rds_transport *trans, gfp_t gfp)
250{
251 return __rds_conn_create(laddr, faddr, trans, gfp, 1);
252}
253
254void rds_conn_destroy(struct rds_connection *conn)
255{
256 struct rds_message *rm, *rtmp;
257
258 rdsdebug("freeing conn %p for %pI4 -> "
259 "%pI4\n", conn, &conn->c_laddr,
260 &conn->c_faddr);
261
262 hlist_del_init(&conn->c_hash_node);
263
264 /* wait for the rds thread to shut it down */
265 atomic_set(&conn->c_state, RDS_CONN_ERROR);
266 cancel_delayed_work(&conn->c_conn_w);
267 queue_work(rds_wq, &conn->c_down_w);
268 flush_workqueue(rds_wq);
269
270 /* tear down queued messages */
271 list_for_each_entry_safe(rm, rtmp,
272 &conn->c_send_queue,
273 m_conn_item) {
274 list_del_init(&rm->m_conn_item);
275 BUG_ON(!list_empty(&rm->m_sock_item));
276 rds_message_put(rm);
277 }
278 if (conn->c_xmit_rm)
279 rds_message_put(conn->c_xmit_rm);
280
281 conn->c_trans->conn_free(conn->c_transport_data);
282
283 /*
284 * The congestion maps aren't freed up here. They're
285 * freed by rds_cong_exit() after all the connections
286 * have been freed.
287 */
288 rds_cong_remove_conn(conn);
289
290 BUG_ON(!list_empty(&conn->c_retrans));
291 kmem_cache_free(rds_conn_slab, conn);
292
293 rds_conn_count--;
294}
295
296static void rds_conn_message_info(struct socket *sock, unsigned int len,
297 struct rds_info_iterator *iter,
298 struct rds_info_lengths *lens,
299 int want_send)
300{
301 struct hlist_head *head;
302 struct hlist_node *pos;
303 struct list_head *list;
304 struct rds_connection *conn;
305 struct rds_message *rm;
306 unsigned long flags;
307 unsigned int total = 0;
308 size_t i;
309
310 len /= sizeof(struct rds_info_message);
311
312 spin_lock_irqsave(&rds_conn_lock, flags);
313
314 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
315 i++, head++) {
316 hlist_for_each_entry(conn, pos, head, c_hash_node) {
317 if (want_send)
318 list = &conn->c_send_queue;
319 else
320 list = &conn->c_retrans;
321
322 spin_lock(&conn->c_lock);
323
324 /* XXX too lazy to maintain counts.. */
325 list_for_each_entry(rm, list, m_conn_item) {
326 total++;
327 if (total <= len)
328 rds_inc_info_copy(&rm->m_inc, iter,
329 conn->c_laddr,
330 conn->c_faddr, 0);
331 }
332
333 spin_unlock(&conn->c_lock);
334 }
335 }
336
337 spin_unlock_irqrestore(&rds_conn_lock, flags);
338
339 lens->nr = total;
340 lens->each = sizeof(struct rds_info_message);
341}
342
343static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
344 struct rds_info_iterator *iter,
345 struct rds_info_lengths *lens)
346{
347 rds_conn_message_info(sock, len, iter, lens, 1);
348}
349
350static void rds_conn_message_info_retrans(struct socket *sock,
351 unsigned int len,
352 struct rds_info_iterator *iter,
353 struct rds_info_lengths *lens)
354{
355 rds_conn_message_info(sock, len, iter, lens, 0);
356}
357
358void rds_for_each_conn_info(struct socket *sock, unsigned int len,
359 struct rds_info_iterator *iter,
360 struct rds_info_lengths *lens,
361 int (*visitor)(struct rds_connection *, void *),
362 size_t item_len)
363{
364 uint64_t buffer[(item_len + 7) / 8];
365 struct hlist_head *head;
366 struct hlist_node *pos;
367 struct hlist_node *tmp;
368 struct rds_connection *conn;
369 unsigned long flags;
370 size_t i;
371
372 spin_lock_irqsave(&rds_conn_lock, flags);
373
374 lens->nr = 0;
375 lens->each = item_len;
376
377 for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash);
378 i++, head++) {
379 hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) {
380
381 /* XXX no c_lock usage.. */
382 if (!visitor(conn, buffer))
383 continue;
384
385 /* We copy as much as we can fit in the buffer,
386 * but we count all items so that the caller
387 * can resize the buffer. */
388 if (len >= item_len) {
389 rds_info_copy(iter, buffer, item_len);
390 len -= item_len;
391 }
392 lens->nr++;
393 }
394 }
395
396 spin_unlock_irqrestore(&rds_conn_lock, flags);
397}
398
399static int rds_conn_info_visitor(struct rds_connection *conn,
400 void *buffer)
401{
402 struct rds_info_connection *cinfo = buffer;
403
404 cinfo->next_tx_seq = conn->c_next_tx_seq;
405 cinfo->next_rx_seq = conn->c_next_rx_seq;
406 cinfo->laddr = conn->c_laddr;
407 cinfo->faddr = conn->c_faddr;
408 strncpy(cinfo->transport, conn->c_trans->t_name,
409 sizeof(cinfo->transport));
410 cinfo->flags = 0;
411
412 rds_conn_info_set(cinfo->flags,
413 rds_conn_is_sending(conn), SENDING);
414 /* XXX Future: return the state rather than these funky bits */
415 rds_conn_info_set(cinfo->flags,
416 atomic_read(&conn->c_state) == RDS_CONN_CONNECTING,
417 CONNECTING);
418 rds_conn_info_set(cinfo->flags,
419 atomic_read(&conn->c_state) == RDS_CONN_UP,
420 CONNECTED);
421 return 1;
422}
423
424static void rds_conn_info(struct socket *sock, unsigned int len,
425 struct rds_info_iterator *iter,
426 struct rds_info_lengths *lens)
427{
428 rds_for_each_conn_info(sock, len, iter, lens,
429 rds_conn_info_visitor,
430 sizeof(struct rds_info_connection));
431}
432
433int __init rds_conn_init(void)
434{
435 rds_conn_slab = kmem_cache_create("rds_connection",
436 sizeof(struct rds_connection),
437 0, 0, NULL);
438 if (rds_conn_slab == NULL)
439 return -ENOMEM;
440
441 rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info);
442 rds_info_register_func(RDS_INFO_SEND_MESSAGES,
443 rds_conn_message_info_send);
444 rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
445 rds_conn_message_info_retrans);
446
447 return 0;
448}
449
450void rds_conn_exit(void)
451{
452 rds_loop_exit();
453
454 WARN_ON(!hlist_empty(rds_conn_hash));
455
456 kmem_cache_destroy(rds_conn_slab);
457
458 rds_info_deregister_func(RDS_INFO_CONNECTIONS, rds_conn_info);
459 rds_info_deregister_func(RDS_INFO_SEND_MESSAGES,
460 rds_conn_message_info_send);
461 rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
462 rds_conn_message_info_retrans);
463}
464
465/*
466 * Force a disconnect
467 */
468void rds_conn_drop(struct rds_connection *conn)
469{
470 atomic_set(&conn->c_state, RDS_CONN_ERROR);
471 queue_work(rds_wq, &conn->c_down_w);
472}
473
474/*
475 * An error occurred on the connection
476 */
477void
478__rds_conn_error(struct rds_connection *conn, const char *fmt, ...)
479{
480 va_list ap;
481
482 va_start(ap, fmt);
483 vprintk(fmt, ap);
484 va_end(ap);
485
486 rds_conn_drop(conn);
487}
diff --git a/net/rds/ib.c b/net/rds/ib.c
new file mode 100644
index 000000000000..06a7b798d9a7
--- /dev/null
+++ b/net/rds/ib.c
@@ -0,0 +1,323 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/if.h>
36#include <linux/netdevice.h>
37#include <linux/inetdevice.h>
38#include <linux/if_arp.h>
39#include <linux/delay.h>
40
41#include "rds.h"
42#include "ib.h"
43
44unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE;
45unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */
46
47module_param(fmr_pool_size, int, 0444);
48MODULE_PARM_DESC(fmr_pool_size, " Max number of fmr per HCA");
49module_param(fmr_message_size, int, 0444);
50MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer");
51
52struct list_head rds_ib_devices;
53
54DEFINE_SPINLOCK(ib_nodev_conns_lock);
55LIST_HEAD(ib_nodev_conns);
56
57void rds_ib_add_one(struct ib_device *device)
58{
59 struct rds_ib_device *rds_ibdev;
60 struct ib_device_attr *dev_attr;
61
62 /* Only handle IB (no iWARP) devices */
63 if (device->node_type != RDMA_NODE_IB_CA)
64 return;
65
66 dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
67 if (!dev_attr)
68 return;
69
70 if (ib_query_device(device, dev_attr)) {
71 rdsdebug("Query device failed for %s\n", device->name);
72 goto free_attr;
73 }
74
75 rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL);
76 if (!rds_ibdev)
77 goto free_attr;
78
79 spin_lock_init(&rds_ibdev->spinlock);
80
81 rds_ibdev->max_wrs = dev_attr->max_qp_wr;
82 rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
83
84 rds_ibdev->fmr_page_shift = max(9, ffs(dev_attr->page_size_cap) - 1);
85 rds_ibdev->fmr_page_size = 1 << rds_ibdev->fmr_page_shift;
86 rds_ibdev->fmr_page_mask = ~((u64) rds_ibdev->fmr_page_size - 1);
87 rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
88 rds_ibdev->max_fmrs = dev_attr->max_fmr ?
89 min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) :
90 fmr_pool_size;
91
92 rds_ibdev->dev = device;
93 rds_ibdev->pd = ib_alloc_pd(device);
94 if (IS_ERR(rds_ibdev->pd))
95 goto free_dev;
96
97 rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd,
98 IB_ACCESS_LOCAL_WRITE);
99 if (IS_ERR(rds_ibdev->mr))
100 goto err_pd;
101
102 rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev);
103 if (IS_ERR(rds_ibdev->mr_pool)) {
104 rds_ibdev->mr_pool = NULL;
105 goto err_mr;
106 }
107
108 INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
109 INIT_LIST_HEAD(&rds_ibdev->conn_list);
110 list_add_tail(&rds_ibdev->list, &rds_ib_devices);
111
112 ib_set_client_data(device, &rds_ib_client, rds_ibdev);
113
114 goto free_attr;
115
116err_mr:
117 ib_dereg_mr(rds_ibdev->mr);
118err_pd:
119 ib_dealloc_pd(rds_ibdev->pd);
120free_dev:
121 kfree(rds_ibdev);
122free_attr:
123 kfree(dev_attr);
124}
125
126void rds_ib_remove_one(struct ib_device *device)
127{
128 struct rds_ib_device *rds_ibdev;
129 struct rds_ib_ipaddr *i_ipaddr, *i_next;
130
131 rds_ibdev = ib_get_client_data(device, &rds_ib_client);
132 if (!rds_ibdev)
133 return;
134
135 list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) {
136 list_del(&i_ipaddr->list);
137 kfree(i_ipaddr);
138 }
139
140 rds_ib_remove_conns(rds_ibdev);
141
142 if (rds_ibdev->mr_pool)
143 rds_ib_destroy_mr_pool(rds_ibdev->mr_pool);
144
145 ib_dereg_mr(rds_ibdev->mr);
146
147 while (ib_dealloc_pd(rds_ibdev->pd)) {
148 rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd);
149 msleep(1);
150 }
151
152 list_del(&rds_ibdev->list);
153 kfree(rds_ibdev);
154}
155
156struct ib_client rds_ib_client = {
157 .name = "rds_ib",
158 .add = rds_ib_add_one,
159 .remove = rds_ib_remove_one
160};
161
162static int rds_ib_conn_info_visitor(struct rds_connection *conn,
163 void *buffer)
164{
165 struct rds_info_rdma_connection *iinfo = buffer;
166 struct rds_ib_connection *ic;
167
168 /* We will only ever look at IB transports */
169 if (conn->c_trans != &rds_ib_transport)
170 return 0;
171
172 iinfo->src_addr = conn->c_laddr;
173 iinfo->dst_addr = conn->c_faddr;
174
175 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
176 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
177 if (rds_conn_state(conn) == RDS_CONN_UP) {
178 struct rds_ib_device *rds_ibdev;
179 struct rdma_dev_addr *dev_addr;
180
181 ic = conn->c_transport_data;
182 dev_addr = &ic->i_cm_id->route.addr.dev_addr;
183
184 ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
185 ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
186
187 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
188 iinfo->max_send_wr = ic->i_send_ring.w_nr;
189 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
190 iinfo->max_send_sge = rds_ibdev->max_sge;
191 rds_ib_get_mr_info(rds_ibdev, iinfo);
192 }
193 return 1;
194}
195
196static void rds_ib_ic_info(struct socket *sock, unsigned int len,
197 struct rds_info_iterator *iter,
198 struct rds_info_lengths *lens)
199{
200 rds_for_each_conn_info(sock, len, iter, lens,
201 rds_ib_conn_info_visitor,
202 sizeof(struct rds_info_rdma_connection));
203}
204
205
206/*
207 * Early RDS/IB was built to only bind to an address if there is an IPoIB
208 * device with that address set.
209 *
210 * If it were me, I'd advocate for something more flexible. Sending and
211 * receiving should be device-agnostic. Transports would try and maintain
212 * connections between peers who have messages queued. Userspace would be
213 * allowed to influence which paths have priority. We could call userspace
214 * asserting this policy "routing".
215 */
216static int rds_ib_laddr_check(__be32 addr)
217{
218 int ret;
219 struct rdma_cm_id *cm_id;
220 struct sockaddr_in sin;
221
222 /* Create a CMA ID and try to bind it. This catches both
223 * IB and iWARP capable NICs.
224 */
225 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
226 if (!cm_id)
227 return -EADDRNOTAVAIL;
228
229 memset(&sin, 0, sizeof(sin));
230 sin.sin_family = AF_INET;
231 sin.sin_addr.s_addr = addr;
232
233 /* rdma_bind_addr will only succeed for IB & iWARP devices */
234 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
235 /* due to this, we will claim to support iWARP devices unless we
236 check node_type. */
237 if (ret || cm_id->device->node_type != RDMA_NODE_IB_CA)
238 ret = -EADDRNOTAVAIL;
239
240 rdsdebug("addr %pI4 ret %d node type %d\n",
241 &addr, ret,
242 cm_id->device ? cm_id->device->node_type : -1);
243
244 rdma_destroy_id(cm_id);
245
246 return ret;
247}
248
249void rds_ib_exit(void)
250{
251 rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
252 rds_ib_remove_nodev_conns();
253 ib_unregister_client(&rds_ib_client);
254 rds_ib_sysctl_exit();
255 rds_ib_recv_exit();
256 rds_trans_unregister(&rds_ib_transport);
257}
258
259struct rds_transport rds_ib_transport = {
260 .laddr_check = rds_ib_laddr_check,
261 .xmit_complete = rds_ib_xmit_complete,
262 .xmit = rds_ib_xmit,
263 .xmit_cong_map = NULL,
264 .xmit_rdma = rds_ib_xmit_rdma,
265 .recv = rds_ib_recv,
266 .conn_alloc = rds_ib_conn_alloc,
267 .conn_free = rds_ib_conn_free,
268 .conn_connect = rds_ib_conn_connect,
269 .conn_shutdown = rds_ib_conn_shutdown,
270 .inc_copy_to_user = rds_ib_inc_copy_to_user,
271 .inc_purge = rds_ib_inc_purge,
272 .inc_free = rds_ib_inc_free,
273 .cm_initiate_connect = rds_ib_cm_initiate_connect,
274 .cm_handle_connect = rds_ib_cm_handle_connect,
275 .cm_connect_complete = rds_ib_cm_connect_complete,
276 .stats_info_copy = rds_ib_stats_info_copy,
277 .exit = rds_ib_exit,
278 .get_mr = rds_ib_get_mr,
279 .sync_mr = rds_ib_sync_mr,
280 .free_mr = rds_ib_free_mr,
281 .flush_mrs = rds_ib_flush_mrs,
282 .t_owner = THIS_MODULE,
283 .t_name = "infiniband",
284};
285
286int __init rds_ib_init(void)
287{
288 int ret;
289
290 INIT_LIST_HEAD(&rds_ib_devices);
291
292 ret = ib_register_client(&rds_ib_client);
293 if (ret)
294 goto out;
295
296 ret = rds_ib_sysctl_init();
297 if (ret)
298 goto out_ibreg;
299
300 ret = rds_ib_recv_init();
301 if (ret)
302 goto out_sysctl;
303
304 ret = rds_trans_register(&rds_ib_transport);
305 if (ret)
306 goto out_recv;
307
308 rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
309
310 goto out;
311
312out_recv:
313 rds_ib_recv_exit();
314out_sysctl:
315 rds_ib_sysctl_exit();
316out_ibreg:
317 ib_unregister_client(&rds_ib_client);
318out:
319 return ret;
320}
321
322MODULE_LICENSE("GPL");
323
diff --git a/net/rds/ib.h b/net/rds/ib.h
new file mode 100644
index 000000000000..8be563a1363a
--- /dev/null
+++ b/net/rds/ib.h
@@ -0,0 +1,367 @@
1#ifndef _RDS_IB_H
2#define _RDS_IB_H
3
4#include <rdma/ib_verbs.h>
5#include <rdma/rdma_cm.h>
6#include "rds.h"
7#include "rdma_transport.h"
8
9#define RDS_FMR_SIZE 256
10#define RDS_FMR_POOL_SIZE 4096
11
12#define RDS_IB_MAX_SGE 8
13#define RDS_IB_RECV_SGE 2
14
15#define RDS_IB_DEFAULT_RECV_WR 1024
16#define RDS_IB_DEFAULT_SEND_WR 256
17
18#define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
19
20extern struct list_head rds_ib_devices;
21
22/*
23 * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
24 * try to minimize the amount of memory tied up in both the device and
25 * socket receive queues.
26 */
27/* page offset of the final full frag that fits in the page */
28#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
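
For example, if RDS_FRAG_SIZE were 2048 on a 4096-byte page (RDS_FRAG_SIZE itself comes from rds.h and is not shown here), this works out to (4096 / 2048 - 1) * 2048 = 2048, i.e. the last full fragment starts halfway into the page; with RDS_FRAG_SIZE equal to PAGE_SIZE it is simply 0.
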
29struct rds_page_frag {
30 struct list_head f_item;
31 struct page *f_page;
32 unsigned long f_offset;
33 dma_addr_t f_mapped;
34};
35
36struct rds_ib_incoming {
37 struct list_head ii_frags;
38 struct rds_incoming ii_inc;
39};
40
41struct rds_ib_connect_private {
42 /* Add new fields at the end, and don't permute existing fields. */
43 __be32 dp_saddr;
44 __be32 dp_daddr;
45 u8 dp_protocol_major;
46 u8 dp_protocol_minor;
47 __be16 dp_protocol_minor_mask; /* bitmask */
48 __be32 dp_reserved1;
49 __be64 dp_ack_seq;
50 __be32 dp_credit; /* non-zero enables flow ctl */
51};
52
53struct rds_ib_send_work {
54 struct rds_message *s_rm;
55 struct rds_rdma_op *s_op;
56 struct ib_send_wr s_wr;
57 struct ib_sge s_sge[RDS_IB_MAX_SGE];
58 unsigned long s_queued;
59};
60
61struct rds_ib_recv_work {
62 struct rds_ib_incoming *r_ibinc;
63 struct rds_page_frag *r_frag;
64 struct ib_recv_wr r_wr;
65 struct ib_sge r_sge[2];
66};
67
68struct rds_ib_work_ring {
69 u32 w_nr;
70 u32 w_alloc_ptr;
71 u32 w_alloc_ctr;
72 u32 w_free_ptr;
73 atomic_t w_free_ctr;
74};
75
76struct rds_ib_device;
77
78struct rds_ib_connection {
79
80 struct list_head ib_node;
81 struct rds_ib_device *rds_ibdev;
82 struct rds_connection *conn;
83
84 /* alphabet soup, IBTA style */
85 struct rdma_cm_id *i_cm_id;
86 struct ib_pd *i_pd;
87 struct ib_mr *i_mr;
88 struct ib_cq *i_send_cq;
89 struct ib_cq *i_recv_cq;
90
91 /* tx */
92 struct rds_ib_work_ring i_send_ring;
93 struct rds_message *i_rm;
94 struct rds_header *i_send_hdrs;
95 u64 i_send_hdrs_dma;
96 struct rds_ib_send_work *i_sends;
97
98 /* rx */
99 struct mutex i_recv_mutex;
100 struct rds_ib_work_ring i_recv_ring;
101 struct rds_ib_incoming *i_ibinc;
102 u32 i_recv_data_rem;
103 struct rds_header *i_recv_hdrs;
104 u64 i_recv_hdrs_dma;
105 struct rds_ib_recv_work *i_recvs;
106 struct rds_page_frag i_frag;
107 u64 i_ack_recv; /* last ACK received */
108
109 /* sending acks */
110 unsigned long i_ack_flags;
111 u64 i_ack_next; /* next ACK to send */
112 struct rds_header *i_ack;
113 struct ib_send_wr i_ack_wr;
114 struct ib_sge i_ack_sge;
115 u64 i_ack_dma;
116 unsigned long i_ack_queued;
117
118 /* Flow control related information
119 *
120 * Our algorithm uses a pair of variables that we need to access
121 * atomically - one for the send credits, and one for the posted
122 * recv credits we need to transfer to the remote.
123 * Rather than protect them using a slow spinlock, we put both into
124 * a single atomic_t and update it using cmpxchg
125 */
126 atomic_t i_credits;
127
128 /* Protocol version specific information */
129 unsigned int i_flowctl:1; /* enable/disable flow ctl */
130
131 /* Batched completions */
132 unsigned int i_unsignaled_wrs;
133 long i_unsignaled_bytes;
134};
135
136/* This assumes that atomic_t is at least 32 bits */
137#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
138#define IB_GET_POST_CREDITS(v) ((v) >> 16)
139#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
140#define IB_SET_POST_CREDITS(v) ((v) << 16)
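
For illustration, a minimal sketch of the lock-free update pattern these macros support; the real credit accounting lives in rds_ib_send_grab_credits() in ib_send.c (not part of this header), so the function below is only a hypothetical shape of it:

/* Sketch: atomically consume one send credit from i_credits, retrying if
 * another CPU changed the word between the read and the cmpxchg. */
static int rds_ib_take_send_credit_sketch(struct rds_ib_connection *ic)
{
	int oldval, newval;

	do {
		oldval = atomic_read(&ic->i_credits);
		if (IB_GET_SEND_CREDITS(oldval) == 0)
			return 0;	/* no credits - caller must back off */
		newval = oldval - 1;	/* send credits live in the low 16 bits */
	} while (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval);

	return 1;
}
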
141
142struct rds_ib_ipaddr {
143 struct list_head list;
144 __be32 ipaddr;
145};
146
147struct rds_ib_device {
148 struct list_head list;
149 struct list_head ipaddr_list;
150 struct list_head conn_list;
151 struct ib_device *dev;
152 struct ib_pd *pd;
153 struct ib_mr *mr;
154 struct rds_ib_mr_pool *mr_pool;
155 int fmr_page_shift;
156 int fmr_page_size;
157 u64 fmr_page_mask;
158 unsigned int fmr_max_remaps;
159 unsigned int max_fmrs;
160 int max_sge;
161 unsigned int max_wrs;
162 spinlock_t spinlock; /* protect the above */
163};
164
165/* bits for i_ack_flags */
166#define IB_ACK_IN_FLIGHT 0
167#define IB_ACK_REQUESTED 1
168
169/* Magic WR_ID for ACKs */
170#define RDS_IB_ACK_WR_ID (~(u64) 0)
171
172struct rds_ib_statistics {
173 uint64_t s_ib_connect_raced;
174 uint64_t s_ib_listen_closed_stale;
175 uint64_t s_ib_tx_cq_call;
176 uint64_t s_ib_tx_cq_event;
177 uint64_t s_ib_tx_ring_full;
178 uint64_t s_ib_tx_throttle;
179 uint64_t s_ib_tx_sg_mapping_failure;
180 uint64_t s_ib_tx_stalled;
181 uint64_t s_ib_tx_credit_updates;
182 uint64_t s_ib_rx_cq_call;
183 uint64_t s_ib_rx_cq_event;
184 uint64_t s_ib_rx_ring_empty;
185 uint64_t s_ib_rx_refill_from_cq;
186 uint64_t s_ib_rx_refill_from_thread;
187 uint64_t s_ib_rx_alloc_limit;
188 uint64_t s_ib_rx_credit_updates;
189 uint64_t s_ib_ack_sent;
190 uint64_t s_ib_ack_send_failure;
191 uint64_t s_ib_ack_send_delayed;
192 uint64_t s_ib_ack_send_piggybacked;
193 uint64_t s_ib_ack_received;
194 uint64_t s_ib_rdma_mr_alloc;
195 uint64_t s_ib_rdma_mr_free;
196 uint64_t s_ib_rdma_mr_used;
197 uint64_t s_ib_rdma_mr_pool_flush;
198 uint64_t s_ib_rdma_mr_pool_wait;
199 uint64_t s_ib_rdma_mr_pool_depleted;
200};
201
202extern struct workqueue_struct *rds_ib_wq;
203
204/*
205 * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
206 * doesn't define it.
207 */
208static inline void rds_ib_dma_sync_sg_for_cpu(struct ib_device *dev,
209 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
210{
211 unsigned int i;
212
213 for (i = 0; i < sg_dma_len; ++i) {
214 ib_dma_sync_single_for_cpu(dev,
215 ib_sg_dma_address(dev, &sg[i]),
216 ib_sg_dma_len(dev, &sg[i]),
217 direction);
218 }
219}
220#define ib_dma_sync_sg_for_cpu rds_ib_dma_sync_sg_for_cpu
221
222static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
223 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
224{
225 unsigned int i;
226
227 for (i = 0; i < sg_dma_len; ++i) {
228 ib_dma_sync_single_for_device(dev,
229 ib_sg_dma_address(dev, &sg[i]),
230 ib_sg_dma_len(dev, &sg[i]),
231 direction);
232 }
233}
234#define ib_dma_sync_sg_for_device rds_ib_dma_sync_sg_for_device
235
236
237/* ib.c */
238extern struct rds_transport rds_ib_transport;
239extern void rds_ib_add_one(struct ib_device *device);
240extern void rds_ib_remove_one(struct ib_device *device);
241extern struct ib_client rds_ib_client;
242
243extern unsigned int fmr_pool_size;
244extern unsigned int fmr_message_size;
245
246extern spinlock_t ib_nodev_conns_lock;
247extern struct list_head ib_nodev_conns;
248
249/* ib_cm.c */
250int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp);
251void rds_ib_conn_free(void *arg);
252int rds_ib_conn_connect(struct rds_connection *conn);
253void rds_ib_conn_shutdown(struct rds_connection *conn);
254void rds_ib_state_change(struct sock *sk);
255int __init rds_ib_listen_init(void);
256void rds_ib_listen_stop(void);
257void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
258int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
259 struct rdma_cm_event *event);
260int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
261void rds_ib_cm_connect_complete(struct rds_connection *conn,
262 struct rdma_cm_event *event);
263
264
265#define rds_ib_conn_error(conn, fmt...) \
266 __rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
267
268/* ib_rdma.c */
269int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
270int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
271void rds_ib_remove_nodev_conns(void);
272void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev);
273struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
274void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
275void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
276void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
277 struct rds_sock *rs, u32 *key_ret);
278void rds_ib_sync_mr(void *trans_private, int dir);
279void rds_ib_free_mr(void *trans_private, int invalidate);
280void rds_ib_flush_mrs(void);
281
282/* ib_recv.c */
283int __init rds_ib_recv_init(void);
284void rds_ib_recv_exit(void);
285int rds_ib_recv(struct rds_connection *conn);
286int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
287 gfp_t page_gfp, int prefill);
288void rds_ib_inc_purge(struct rds_incoming *inc);
289void rds_ib_inc_free(struct rds_incoming *inc);
290int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
291 size_t size);
292void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context);
293void rds_ib_recv_init_ring(struct rds_ib_connection *ic);
294void rds_ib_recv_clear_ring(struct rds_ib_connection *ic);
295void rds_ib_recv_init_ack(struct rds_ib_connection *ic);
296void rds_ib_attempt_ack(struct rds_ib_connection *ic);
297void rds_ib_ack_send_complete(struct rds_ib_connection *ic);
298u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic);
299
300/* ib_ring.c */
301void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr);
302void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr);
303u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos);
304void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val);
305void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val);
306int rds_ib_ring_empty(struct rds_ib_work_ring *ring);
307int rds_ib_ring_low(struct rds_ib_work_ring *ring);
308u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring);
309u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
310extern wait_queue_head_t rds_ib_ring_empty_wait;
311
312/* ib_send.c */
313void rds_ib_xmit_complete(struct rds_connection *conn);
314int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
315 unsigned int hdr_off, unsigned int sg, unsigned int off);
316void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
317void rds_ib_send_init_ring(struct rds_ib_connection *ic);
318void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
319int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
320void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
321void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
322int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
323 u32 *adv_credits, int need_posted);
324
325/* ib_stats.c */
326DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
327#define rds_ib_stats_inc(member) rds_stats_inc_which(rds_ib_stats, member)
328unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
329 unsigned int avail);
330
331/* ib_sysctl.c */
332int __init rds_ib_sysctl_init(void);
333void rds_ib_sysctl_exit(void);
334extern unsigned long rds_ib_sysctl_max_send_wr;
335extern unsigned long rds_ib_sysctl_max_recv_wr;
336extern unsigned long rds_ib_sysctl_max_unsig_wrs;
337extern unsigned long rds_ib_sysctl_max_unsig_bytes;
338extern unsigned long rds_ib_sysctl_max_recv_allocation;
339extern unsigned int rds_ib_sysctl_flow_control;
340extern ctl_table rds_ib_sysctl_table[];
341
342/*
343 * Helper functions for getting/setting the header and data SGEs in
344 * RDS packets (not RDMA)
345 */
346static inline struct ib_sge *
347rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
348{
349 return &sge[0];
350}
351
352static inline struct ib_sge *
353rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
354{
355 return &sge[1];
356}
357
358static inline void rds_ib_set_64bit(u64 *ptr, u64 val)
359{
360#if BITS_PER_LONG == 64
361 *ptr = val;
362#else
363 set_64bit(ptr, val);
364#endif
365}
366
367#endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
new file mode 100644
index 000000000000..0532237bd128
--- /dev/null
+++ b/net/rds/ib_cm.c
@@ -0,0 +1,726 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/vmalloc.h>
36
37#include "rds.h"
38#include "ib.h"
39
40/*
41 * Set the selected protocol version
42 */
43static void rds_ib_set_protocol(struct rds_connection *conn, unsigned int version)
44{
45 conn->c_version = version;
46}
47
48/*
49 * Set up flow control
50 */
51static void rds_ib_set_flow_control(struct rds_connection *conn, u32 credits)
52{
53 struct rds_ib_connection *ic = conn->c_transport_data;
54
55 if (rds_ib_sysctl_flow_control && credits != 0) {
56 /* We're doing flow control */
57 ic->i_flowctl = 1;
58 rds_ib_send_add_credits(conn, credits);
59 } else {
60 ic->i_flowctl = 0;
61 }
62}
63
64/*
65 * Tune RNR behavior. Without flow control, we use a rather
66 * low timeout, but not the absolute minimum - this should
67 * be tunable.
68 *
69 * We already set the RNR retry count to 7 (which is the
70 * smallest infinite number :-) above.
71 * If flow control is off, we want to change this back to 0
72 * so that we learn quickly when our credit accounting is
73 * buggy.
74 *
75 * Caller passes in a qp_attr pointer - don't waste stack space
76 * by allocating this twice.
77 */
78static void
79rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
80{
81 int ret;
82
83 attr->min_rnr_timer = IB_RNR_TIMER_000_32;
84 ret = ib_modify_qp(ic->i_cm_id->qp, attr, IB_QP_MIN_RNR_TIMER);
85 if (ret)
86 printk(KERN_NOTICE "ib_modify_qp(IB_QP_MIN_RNR_TIMER): err=%d\n", -ret);
87}
88
89/*
90 * Connection established.
91 * We get here for both outgoing and incoming connection.
92 */
93void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
94{
95 const struct rds_ib_connect_private *dp = NULL;
96 struct rds_ib_connection *ic = conn->c_transport_data;
97 struct rds_ib_device *rds_ibdev;
98 struct ib_qp_attr qp_attr;
99 int err;
100
101 if (event->param.conn.private_data_len) {
102 dp = event->param.conn.private_data;
103
104 rds_ib_set_protocol(conn,
105 RDS_PROTOCOL(dp->dp_protocol_major,
106 dp->dp_protocol_minor));
107 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
108 }
109
110 printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
111 &conn->c_laddr,
112 RDS_PROTOCOL_MAJOR(conn->c_version),
113 RDS_PROTOCOL_MINOR(conn->c_version),
114 ic->i_flowctl ? ", flow control" : "");
115
116 /* Tune RNR behavior */
117 rds_ib_tune_rnr(ic, &qp_attr);
118
119 qp_attr.qp_state = IB_QPS_RTS;
120 err = ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
121 if (err)
122 printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
123
124 /* update ib_device with this local ipaddr & conn */
125 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
126 err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
127 if (err)
128 printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
129 err = rds_ib_add_conn(rds_ibdev, conn);
130 if (err)
131 printk(KERN_ERR "rds_ib_add_conn failed (%d)\n", err);
132
133 /* If the peer gave us the last packet it saw, process this as if
134 * we had received a regular ACK. */
135 if (dp && dp->dp_ack_seq)
136 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
137
138 rds_connect_complete(conn);
139}
140
141static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
142 struct rdma_conn_param *conn_param,
143 struct rds_ib_connect_private *dp,
144 u32 protocol_version)
145{
146 memset(conn_param, 0, sizeof(struct rdma_conn_param));
147 /* XXX tune these? */
148 conn_param->responder_resources = 1;
149 conn_param->initiator_depth = 1;
150 conn_param->retry_count = 7;
151 conn_param->rnr_retry_count = 7;
152
153 if (dp) {
154 struct rds_ib_connection *ic = conn->c_transport_data;
155
156 memset(dp, 0, sizeof(*dp));
157 dp->dp_saddr = conn->c_laddr;
158 dp->dp_daddr = conn->c_faddr;
159 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
160 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
161 dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
162 dp->dp_ack_seq = rds_ib_piggyb_ack(ic);
163
164 /* Advertise flow control */
165 if (ic->i_flowctl) {
166 unsigned int credits;
167
168 credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
169 dp->dp_credit = cpu_to_be32(credits);
170 atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
171 }
172
173 conn_param->private_data = dp;
174 conn_param->private_data_len = sizeof(*dp);
175 }
176}
177
178static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
179{
180 rdsdebug("event %u data %p\n", event->event, data);
181}
182
183static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
184{
185 struct rds_connection *conn = data;
186 struct rds_ib_connection *ic = conn->c_transport_data;
187
188 rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
189
190 switch (event->event) {
191 case IB_EVENT_COMM_EST:
192 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
193 break;
194 default:
195 printk(KERN_WARNING "RDS/ib: unhandled QP event %u "
196 "on connection to %pI4\n", event->event,
197 &conn->c_faddr);
198 break;
199 }
200}
201
202/*
203 * This needs to be very careful to not leave IS_ERR pointers around for
204 * cleanup to trip over.
205 */
206static int rds_ib_setup_qp(struct rds_connection *conn)
207{
208 struct rds_ib_connection *ic = conn->c_transport_data;
209 struct ib_device *dev = ic->i_cm_id->device;
210 struct ib_qp_init_attr attr;
211 struct rds_ib_device *rds_ibdev;
212 int ret;
213
214 /* rds_ib_add_one creates a rds_ib_device object per IB device,
215 * and allocates a protection domain, memory range and FMR pool
216 * for each. If that fails for any reason, it will not register
217 * the rds_ibdev at all.
218 */
219 rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
220 if (rds_ibdev == NULL) {
221 if (printk_ratelimit())
222 printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
223 dev->name);
224 return -EOPNOTSUPP;
225 }
226
227 if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
228 rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
229 if (rds_ibdev->max_wrs < ic->i_recv_ring.w_nr + 1)
230 rds_ib_ring_resize(&ic->i_recv_ring, rds_ibdev->max_wrs - 1);
231
232 /* Protection domain and memory range */
233 ic->i_pd = rds_ibdev->pd;
234 ic->i_mr = rds_ibdev->mr;
235
236 ic->i_send_cq = ib_create_cq(dev, rds_ib_send_cq_comp_handler,
237 rds_ib_cq_event_handler, conn,
238 ic->i_send_ring.w_nr + 1, 0);
239 if (IS_ERR(ic->i_send_cq)) {
240 ret = PTR_ERR(ic->i_send_cq);
241 ic->i_send_cq = NULL;
242 rdsdebug("ib_create_cq send failed: %d\n", ret);
243 goto out;
244 }
245
246 ic->i_recv_cq = ib_create_cq(dev, rds_ib_recv_cq_comp_handler,
247 rds_ib_cq_event_handler, conn,
248 ic->i_recv_ring.w_nr, 0);
249 if (IS_ERR(ic->i_recv_cq)) {
250 ret = PTR_ERR(ic->i_recv_cq);
251 ic->i_recv_cq = NULL;
252 rdsdebug("ib_create_cq recv failed: %d\n", ret);
253 goto out;
254 }
255
256 ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
257 if (ret) {
258 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
259 goto out;
260 }
261
262 ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
263 if (ret) {
264 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
265 goto out;
266 }
267
268 /* XXX negotiate max send/recv with remote? */
269 memset(&attr, 0, sizeof(attr));
270 attr.event_handler = rds_ib_qp_event_handler;
271 attr.qp_context = conn;
272 /* + 1 to allow for the single ack message */
273 attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
274 attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
275 attr.cap.max_send_sge = rds_ibdev->max_sge;
276 attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
277 attr.sq_sig_type = IB_SIGNAL_REQ_WR;
278 attr.qp_type = IB_QPT_RC;
279 attr.send_cq = ic->i_send_cq;
280 attr.recv_cq = ic->i_recv_cq;
281
282 /*
283 * XXX this can fail if max_*_wr is too large? Are we supposed
284 * to back off until we get a value that the hardware can support?
285 */
286 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
287 if (ret) {
288 rdsdebug("rdma_create_qp failed: %d\n", ret);
289 goto out;
290 }
291
292 ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
293 ic->i_send_ring.w_nr *
294 sizeof(struct rds_header),
295 &ic->i_send_hdrs_dma, GFP_KERNEL);
296 if (ic->i_send_hdrs == NULL) {
297 ret = -ENOMEM;
298 rdsdebug("ib_dma_alloc_coherent send failed\n");
299 goto out;
300 }
301
302 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
303 ic->i_recv_ring.w_nr *
304 sizeof(struct rds_header),
305 &ic->i_recv_hdrs_dma, GFP_KERNEL);
306 if (ic->i_recv_hdrs == NULL) {
307 ret = -ENOMEM;
308 rdsdebug("ib_dma_alloc_coherent recv failed\n");
309 goto out;
310 }
311
312 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
313 &ic->i_ack_dma, GFP_KERNEL);
314 if (ic->i_ack == NULL) {
315 ret = -ENOMEM;
316 rdsdebug("ib_dma_alloc_coherent ack failed\n");
317 goto out;
318 }
319
320 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
321 if (ic->i_sends == NULL) {
322 ret = -ENOMEM;
323 rdsdebug("send allocation failed\n");
324 goto out;
325 }
326 rds_ib_send_init_ring(ic);
327
328 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
329 if (ic->i_recvs == NULL) {
330 ret = -ENOMEM;
331 rdsdebug("recv allocation failed\n");
332 goto out;
333 }
334
335 rds_ib_recv_init_ring(ic);
336 rds_ib_recv_init_ack(ic);
337
338 /* Post receive buffers - as a side effect, this will update
339 * the posted credit count. */
340 rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
341
342 rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
343 ic->i_send_cq, ic->i_recv_cq);
344
345out:
346 return ret;
347}
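/*
 * Note on the error path above: every resource pointer is either valid or
 * NULL by the time we jump to "out" (IS_ERR() values are converted to NULL
 * right away), so a failed rds_ib_setup_qp() leaves the ic in a state that
 * rds_ib_conn_shutdown() below can safely tear down, whichever allocation
 * failed first.
 */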
348
349static u32 rds_ib_protocol_compatible(const struct rds_ib_connect_private *dp)
350{
351 u16 common;
352 u32 version = 0;
353
354 /* rdma_cm private data is odd - when there is any private data in the
355 * request, we will be given a pretty large buffer without being told the
356 * original size. The only way to tell the difference is by looking at
357 * the contents, which are initialized to zero.
358 * If the protocol version fields aren't set, this is a connection attempt
359 * from an older version. This could be 3.0 or 2.0 - we can't tell.
360 * We really should have changed this for OFED 1.3 :-( */
361 if (dp->dp_protocol_major == 0)
362 return RDS_PROTOCOL_3_0;
363
364 common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
365 if (dp->dp_protocol_major == 3 && common) {
366 version = RDS_PROTOCOL_3_0;
367 while ((common >>= 1) != 0)
368 version++;
369 } else if (printk_ratelimit()) {
370 printk(KERN_NOTICE "RDS: Connection from %pI4 using "
371 "incompatible protocol version %u.%u\n",
372 &dp->dp_saddr,
373 dp->dp_protocol_major,
374 dp->dp_protocol_minor);
375 }
376 return version;
377}
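A minimal standalone sketch of the minor-mask arithmetic above, assuming (as the
loop implies) that the protocol constant keeps the minor number in its low bits so
that "version++" steps through minor versions; PROTO() below is a stand-in for the
real encoding macro:

#include <stdio.h>
#include <stdint.h>

/* Assumed encoding: major in the high byte, minor in the low byte. */
#define PROTO(maj, min) (((maj) << 8) | (min))

static uint32_t pick_version(uint16_t common_minor_mask)
{
        uint32_t version = PROTO(3, 0);

        /* One increment per remaining bit: the highest common minor wins. */
        while ((common_minor_mask >>= 1) != 0)
                version++;
        return version;
}

int main(void)
{
        /* Both ends support minors 0 and 1 (mask 0x0003) -> 3.1 is chosen. */
        printf("0x%04x\n", pick_version(0x0003));
        return 0;
}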
378
379int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
380 struct rdma_cm_event *event)
381{
382 __be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
383 __be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
384 const struct rds_ib_connect_private *dp = event->param.conn.private_data;
385 struct rds_ib_connect_private dp_rep;
386 struct rds_connection *conn = NULL;
387 struct rds_ib_connection *ic = NULL;
388 struct rdma_conn_param conn_param;
389 u32 version;
390 int err, destroy = 1;
391
392 /* Check whether the remote protocol version matches ours. */
393 version = rds_ib_protocol_compatible(dp);
394 if (!version)
395 goto out;
396
397 rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
398 "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
399 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
400 (unsigned long long)be64_to_cpu(lguid),
401 (unsigned long long)be64_to_cpu(fguid));
402
403 conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_ib_transport,
404 GFP_KERNEL);
405 if (IS_ERR(conn)) {
406 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
407 conn = NULL;
408 goto out;
409 }
410
411 /*
412 * The connection request may occur while the
413 * previous connection still exists, e.g. in case of failover.
414 * But as connections may be initiated simultaneously
415 * by both hosts, we have a random backoff mechanism -
416 * see the comment above rds_queue_reconnect()
417 */
418 mutex_lock(&conn->c_cm_lock);
419 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
420 if (rds_conn_state(conn) == RDS_CONN_UP) {
421 rdsdebug("incoming connect while connecting\n");
422 rds_conn_drop(conn);
423 rds_ib_stats_inc(s_ib_listen_closed_stale);
424 } else
425 if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
426 /* Wait and see - our connect may still be succeeding */
427 rds_ib_stats_inc(s_ib_connect_raced);
428 }
429 mutex_unlock(&conn->c_cm_lock);
430 goto out;
431 }
432
433 ic = conn->c_transport_data;
434
435 rds_ib_set_protocol(conn, version);
436 rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
437
438 /* If the peer gave us the last packet it saw, process this as if
439 * we had received a regular ACK. */
440 if (dp->dp_ack_seq)
441 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
442
443 BUG_ON(cm_id->context);
444 BUG_ON(ic->i_cm_id);
445
446 ic->i_cm_id = cm_id;
447 cm_id->context = conn;
448
449 /* We got halfway through setting up the ib_connection; if we
450 * fail now, we have to take the long route out of this mess. */
451 destroy = 0;
452
453 err = rds_ib_setup_qp(conn);
454 if (err) {
455 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err);
456 goto out;
457 }
458
459 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
460
461 /* rdma_accept() calls rdma_reject() internally if it fails */
462 err = rdma_accept(cm_id, &conn_param);
463 mutex_unlock(&conn->c_cm_lock);
464 if (err) {
465 rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
466 goto out;
467 }
468
469 return 0;
470
471out:
472 rdma_reject(cm_id, NULL, 0);
473 return destroy;
474}
475
476
477int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
478{
479 struct rds_connection *conn = cm_id->context;
480 struct rds_ib_connection *ic = conn->c_transport_data;
481 struct rdma_conn_param conn_param;
482 struct rds_ib_connect_private dp;
483 int ret;
484
485 /* If the peer doesn't do protocol negotiation, we must
486 * default to RDSv3.0 */
487 rds_ib_set_protocol(conn, RDS_PROTOCOL_3_0);
488 ic->i_flowctl = rds_ib_sysctl_flow_control; /* advertise flow control */
489
490 ret = rds_ib_setup_qp(conn);
491 if (ret) {
492 rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", ret);
493 goto out;
494 }
495
496 rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
497
498 ret = rdma_connect(cm_id, &conn_param);
499 if (ret)
500 rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
501
502out:
503 /* Beware - returning non-zero tells the rdma_cm to destroy
504 * the cm_id. We should certainly not do it as long as we still
505 * "own" the cm_id. */
506 if (ret) {
507 if (ic->i_cm_id == cm_id)
508 ret = 0;
509 }
510 return ret;
511}
512
513int rds_ib_conn_connect(struct rds_connection *conn)
514{
515 struct rds_ib_connection *ic = conn->c_transport_data;
516 struct sockaddr_in src, dest;
517 int ret;
518
519 /* XXX I wonder what effect the port space has */
520 /* delegate cm event handler to rdma_transport */
521 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
522 RDMA_PS_TCP);
523 if (IS_ERR(ic->i_cm_id)) {
524 ret = PTR_ERR(ic->i_cm_id);
525 ic->i_cm_id = NULL;
526 rdsdebug("rdma_create_id() failed: %d\n", ret);
527 goto out;
528 }
529
530 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
531
532 src.sin_family = AF_INET;
533 src.sin_addr.s_addr = (__force u32)conn->c_laddr;
534 src.sin_port = (__force u16)htons(0);
535
536 dest.sin_family = AF_INET;
537 dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
538 dest.sin_port = (__force u16)htons(RDS_PORT);
539
540 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
541 (struct sockaddr *)&dest,
542 RDS_RDMA_RESOLVE_TIMEOUT_MS);
543 if (ret) {
544 rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
545 ret);
546 rdma_destroy_id(ic->i_cm_id);
547 ic->i_cm_id = NULL;
548 }
549
550out:
551 return ret;
552}
553
554/*
555 * This is so careful about only cleaning up resources that were built up
556 * so that it can be called at any point during startup. In fact it
557 * can be called multiple times for a given connection.
558 */
559void rds_ib_conn_shutdown(struct rds_connection *conn)
560{
561 struct rds_ib_connection *ic = conn->c_transport_data;
562 int err = 0;
563
564 rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
565 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
566 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
567
568 if (ic->i_cm_id) {
569 struct ib_device *dev = ic->i_cm_id->device;
570
571 rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
572 err = rdma_disconnect(ic->i_cm_id);
573 if (err) {
574 /* Actually this may happen quite frequently, when
575 * an outgoing connect raced with an incoming connect.
576 */
577 rdsdebug("failed to disconnect, cm: %p err %d\n",
578 ic->i_cm_id, err);
579 }
580
581 wait_event(rds_ib_ring_empty_wait,
582 rds_ib_ring_empty(&ic->i_send_ring) &&
583 rds_ib_ring_empty(&ic->i_recv_ring));
584
585 if (ic->i_send_hdrs)
586 ib_dma_free_coherent(dev,
587 ic->i_send_ring.w_nr *
588 sizeof(struct rds_header),
589 ic->i_send_hdrs,
590 ic->i_send_hdrs_dma);
591
592 if (ic->i_recv_hdrs)
593 ib_dma_free_coherent(dev,
594 ic->i_recv_ring.w_nr *
595 sizeof(struct rds_header),
596 ic->i_recv_hdrs,
597 ic->i_recv_hdrs_dma);
598
599 if (ic->i_ack)
600 ib_dma_free_coherent(dev, sizeof(struct rds_header),
601 ic->i_ack, ic->i_ack_dma);
602
603 if (ic->i_sends)
604 rds_ib_send_clear_ring(ic);
605 if (ic->i_recvs)
606 rds_ib_recv_clear_ring(ic);
607
608 if (ic->i_cm_id->qp)
609 rdma_destroy_qp(ic->i_cm_id);
610 if (ic->i_send_cq)
611 ib_destroy_cq(ic->i_send_cq);
612 if (ic->i_recv_cq)
613 ib_destroy_cq(ic->i_recv_cq);
614 rdma_destroy_id(ic->i_cm_id);
615
616 /*
617 * Move connection back to the nodev list.
618 */
619 if (ic->rds_ibdev) {
620
621 spin_lock_irq(&ic->rds_ibdev->spinlock);
622 BUG_ON(list_empty(&ic->ib_node));
623 list_del(&ic->ib_node);
624 spin_unlock_irq(&ic->rds_ibdev->spinlock);
625
626 spin_lock_irq(&ib_nodev_conns_lock);
627 list_add_tail(&ic->ib_node, &ib_nodev_conns);
628 spin_unlock_irq(&ib_nodev_conns_lock);
629 ic->rds_ibdev = NULL;
630 }
631
632 ic->i_cm_id = NULL;
633 ic->i_pd = NULL;
634 ic->i_mr = NULL;
635 ic->i_send_cq = NULL;
636 ic->i_recv_cq = NULL;
637 ic->i_send_hdrs = NULL;
638 ic->i_recv_hdrs = NULL;
639 ic->i_ack = NULL;
640 }
641 BUG_ON(ic->rds_ibdev);
642
643 /* Clear pending transmit */
644 if (ic->i_rm) {
645 rds_message_put(ic->i_rm);
646 ic->i_rm = NULL;
647 }
648
649 /* Clear the ACK state */
650 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
651 rds_ib_set_64bit(&ic->i_ack_next, 0);
652 ic->i_ack_recv = 0;
653
654 /* Clear flow control state */
655 ic->i_flowctl = 0;
656 atomic_set(&ic->i_credits, 0);
657
658 rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
659 rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
660
661 if (ic->i_ibinc) {
662 rds_inc_put(&ic->i_ibinc->ii_inc);
663 ic->i_ibinc = NULL;
664 }
665
666 vfree(ic->i_sends);
667 ic->i_sends = NULL;
668 vfree(ic->i_recvs);
669 ic->i_recvs = NULL;
670}
671
672int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
673{
674 struct rds_ib_connection *ic;
675 unsigned long flags;
676
677 /* XXX too lazy? */
678 ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
679 if (ic == NULL)
680 return -ENOMEM;
681
682 INIT_LIST_HEAD(&ic->ib_node);
683 mutex_init(&ic->i_recv_mutex);
684
685 /*
686 * rds_ib_conn_shutdown() waits for these to be emptied so they
687 * must be initialized before it can be called.
688 */
689 rds_ib_ring_init(&ic->i_send_ring, rds_ib_sysctl_max_send_wr);
690 rds_ib_ring_init(&ic->i_recv_ring, rds_ib_sysctl_max_recv_wr);
691
692 ic->conn = conn;
693 conn->c_transport_data = ic;
694
695 spin_lock_irqsave(&ib_nodev_conns_lock, flags);
696 list_add_tail(&ic->ib_node, &ib_nodev_conns);
697 spin_unlock_irqrestore(&ib_nodev_conns_lock, flags);
698
699
700 rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
701 return 0;
702}
703
704void rds_ib_conn_free(void *arg)
705{
706 struct rds_ib_connection *ic = arg;
707 rdsdebug("ic %p\n", ic);
708 list_del(&ic->ib_node);
709 kfree(ic);
710}
711
712
713/*
714 * An error occurred on the connection
715 */
716void
717__rds_ib_conn_error(struct rds_connection *conn, const char *fmt, ...)
718{
719 va_list ap;
720
721 rds_conn_drop(conn);
722
723 va_start(ap, fmt);
724 vprintk(fmt, ap);
725 va_end(ap);
726}
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
new file mode 100644
index 000000000000..69a6289ed672
--- /dev/null
+++ b/net/rds/ib_rdma.c
@@ -0,0 +1,641 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "rdma.h"
37#include "ib.h"
38
39
40/*
41 * This is stored as mr->r_trans_private.
42 */
43struct rds_ib_mr {
44 struct rds_ib_device *device;
45 struct rds_ib_mr_pool *pool;
46 struct ib_fmr *fmr;
47 struct list_head list;
48 unsigned int remap_count;
49
50 struct scatterlist *sg;
51 unsigned int sg_len;
52 u64 *dma;
53 int sg_dma_len;
54};
55
56/*
57 * Our own little FMR pool
58 */
59struct rds_ib_mr_pool {
60 struct mutex flush_lock; /* serialize fmr invalidate */
61 struct work_struct flush_worker; /* flush worker */
62
63 spinlock_t list_lock; /* protect variables below */
64 atomic_t item_count; /* total # of MRs */
65 atomic_t dirty_count; /* # of dirty MRs */
66 struct list_head drop_list; /* MRs that have reached their max_maps limit */
67 struct list_head free_list; /* unused MRs */
68 struct list_head clean_list; /* unused & unmapped MRs */
69 atomic_t free_pinned; /* memory pinned by free MRs */
70 unsigned long max_items;
71 unsigned long max_items_soft;
72 unsigned long max_free_pinned;
73 struct ib_fmr_attr fmr_attr;
74};
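/*
 * Rough lifecycle of an MR through the lists above (as implemented further
 * down in this file): rds_ib_free_mr() parks a just-released MR on free_list,
 * or on drop_list once it has used up fmr_attr.max_maps remaps;
 * rds_ib_flush_mr_pool() unmaps everything on those lists, frees what it
 * must, and moves the survivors to clean_list; rds_ib_alloc_fmr() reuses MRs
 * from clean_list before allocating new ones.
 */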
75
76static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
77static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
78static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
79
80static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
81{
82 struct rds_ib_device *rds_ibdev;
83 struct rds_ib_ipaddr *i_ipaddr;
84
85 list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
86 spin_lock_irq(&rds_ibdev->spinlock);
87 list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
88 if (i_ipaddr->ipaddr == ipaddr) {
89 spin_unlock_irq(&rds_ibdev->spinlock);
90 return rds_ibdev;
91 }
92 }
93 spin_unlock_irq(&rds_ibdev->spinlock);
94 }
95
96 return NULL;
97}
98
99static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
100{
101 struct rds_ib_ipaddr *i_ipaddr;
102
103 i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL);
104 if (!i_ipaddr)
105 return -ENOMEM;
106
107 i_ipaddr->ipaddr = ipaddr;
108
109 spin_lock_irq(&rds_ibdev->spinlock);
110 list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
111 spin_unlock_irq(&rds_ibdev->spinlock);
112
113 return 0;
114}
115
116static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
117{
118 struct rds_ib_ipaddr *i_ipaddr, *next;
119
120 spin_lock_irq(&rds_ibdev->spinlock);
121 list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
122 if (i_ipaddr->ipaddr == ipaddr) {
123 list_del(&i_ipaddr->list);
124 kfree(i_ipaddr);
125 break;
126 }
127 }
128 spin_unlock_irq(&rds_ibdev->spinlock);
129}
130
131int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
132{
133 struct rds_ib_device *rds_ibdev_old;
134
135 rds_ibdev_old = rds_ib_get_device(ipaddr);
136 if (rds_ibdev_old)
137 rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
138
139 return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
140}
141
142int rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
143{
144 struct rds_ib_connection *ic = conn->c_transport_data;
145
146 /* conn was previously on the nodev_conns_list */
147 spin_lock_irq(&ib_nodev_conns_lock);
148 BUG_ON(list_empty(&ib_nodev_conns));
149 BUG_ON(list_empty(&ic->ib_node));
150 list_del(&ic->ib_node);
151 spin_unlock_irq(&ib_nodev_conns_lock);
152
153 spin_lock_irq(&rds_ibdev->spinlock);
154 list_add_tail(&ic->ib_node, &rds_ibdev->conn_list);
155 spin_unlock_irq(&rds_ibdev->spinlock);
156
157 ic->rds_ibdev = rds_ibdev;
158
159 return 0;
160}
161
162void rds_ib_remove_nodev_conns(void)
163{
164 struct rds_ib_connection *ic, *_ic;
165 LIST_HEAD(tmp_list);
166
167 /* avoid calling conn_destroy with irqs off */
168 spin_lock_irq(&ib_nodev_conns_lock);
169 list_splice(&ib_nodev_conns, &tmp_list);
170 INIT_LIST_HEAD(&ib_nodev_conns);
171 spin_unlock_irq(&ib_nodev_conns_lock);
172
173 list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
174 if (ic->conn->c_passive)
175 rds_conn_destroy(ic->conn->c_passive);
176 rds_conn_destroy(ic->conn);
177 }
178}
179
180void rds_ib_remove_conns(struct rds_ib_device *rds_ibdev)
181{
182 struct rds_ib_connection *ic, *_ic;
183 LIST_HEAD(tmp_list);
184
185 /* avoid calling conn_destroy with irqs off */
186 spin_lock_irq(&rds_ibdev->spinlock);
187 list_splice(&rds_ibdev->conn_list, &tmp_list);
188 INIT_LIST_HEAD(&rds_ibdev->conn_list);
189 spin_unlock_irq(&rds_ibdev->spinlock);
190
191 list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) {
192 if (ic->conn->c_passive)
193 rds_conn_destroy(ic->conn->c_passive);
194 rds_conn_destroy(ic->conn);
195 }
196}
197
198struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
199{
200 struct rds_ib_mr_pool *pool;
201
202 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
203 if (!pool)
204 return ERR_PTR(-ENOMEM);
205
206 INIT_LIST_HEAD(&pool->free_list);
207 INIT_LIST_HEAD(&pool->drop_list);
208 INIT_LIST_HEAD(&pool->clean_list);
209 mutex_init(&pool->flush_lock);
210 spin_lock_init(&pool->list_lock);
211 INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
212
213 pool->fmr_attr.max_pages = fmr_message_size;
214 pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
215 pool->fmr_attr.page_shift = rds_ibdev->fmr_page_shift;
216 pool->max_free_pinned = rds_ibdev->max_fmrs * fmr_message_size / 4;
217
218 /* We never allow more than max_items MRs to be allocated.
219 * Once we exceed max_items_soft, we start freeing
220 * items more aggressively.
221 * Make sure that max_items > max_items_soft > max_items / 2
222 */
223 pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
224 pool->max_items = rds_ibdev->max_fmrs;
225
226 return pool;
227}
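/*
 * Worked example of the sizing above: with a device limit of max_fmrs = 512,
 * max_items is 512, max_items_soft is 384 (3/4 of the limit) and
 * max_free_pinned is 512 * fmr_message_size / 4 pages - which satisfies the
 * "max_items > max_items_soft > max_items / 2" rule stated in the comment.
 */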
228
229void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
230{
231 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
232
233 iinfo->rdma_mr_max = pool->max_items;
234 iinfo->rdma_mr_size = pool->fmr_attr.max_pages;
235}
236
237void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
238{
239 flush_workqueue(rds_wq);
240 rds_ib_flush_mr_pool(pool, 1);
241 BUG_ON(atomic_read(&pool->item_count));
242 BUG_ON(atomic_read(&pool->free_pinned));
243 kfree(pool);
244}
245
246static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
247{
248 struct rds_ib_mr *ibmr = NULL;
249 unsigned long flags;
250
251 spin_lock_irqsave(&pool->list_lock, flags);
252 if (!list_empty(&pool->clean_list)) {
253 ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
254 list_del_init(&ibmr->list);
255 }
256 spin_unlock_irqrestore(&pool->list_lock, flags);
257
258 return ibmr;
259}
260
261static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
262{
263 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
264 struct rds_ib_mr *ibmr = NULL;
265 int err = 0, iter = 0;
266
267 while (1) {
268 ibmr = rds_ib_reuse_fmr(pool);
269 if (ibmr)
270 return ibmr;
271
272 /* No clean MRs - now we have the choice of either
273 * allocating a fresh MR up to the limit imposed by the
274 * driver, or flushing any dirty unused MRs.
275 * We try to avoid stalling in the send path if possible,
276 * so we allocate as long as we're allowed to.
277 *
278 * We're fussy with enforcing the FMR limit, though. If the driver
279 * tells us we can't use more than N fmrs, we shouldn't start
280 * arguing with it */
281 if (atomic_inc_return(&pool->item_count) <= pool->max_items)
282 break;
283
284 atomic_dec(&pool->item_count);
285
286 if (++iter > 2) {
287 rds_ib_stats_inc(s_ib_rdma_mr_pool_depleted);
288 return ERR_PTR(-EAGAIN);
289 }
290
291 /* We do have some empty MRs. Flush them out. */
292 rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
293 rds_ib_flush_mr_pool(pool, 0);
294 }
295
296 ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
297 if (!ibmr) {
298 err = -ENOMEM;
299 goto out_no_cigar;
300 }
301
302 ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
303 (IB_ACCESS_LOCAL_WRITE |
304 IB_ACCESS_REMOTE_READ |
305 IB_ACCESS_REMOTE_WRITE),
306 &pool->fmr_attr);
307 if (IS_ERR(ibmr->fmr)) {
308 err = PTR_ERR(ibmr->fmr);
309 ibmr->fmr = NULL;
310 printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
311 goto out_no_cigar;
312 }
313
314 rds_ib_stats_inc(s_ib_rdma_mr_alloc);
315 return ibmr;
316
317out_no_cigar:
318 if (ibmr) {
319 if (ibmr->fmr)
320 ib_dealloc_fmr(ibmr->fmr);
321 kfree(ibmr);
322 }
323 atomic_dec(&pool->item_count);
324 return ERR_PTR(err);
325}
326
327static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
328 struct scatterlist *sg, unsigned int nents)
329{
330 struct ib_device *dev = rds_ibdev->dev;
331 struct scatterlist *scat = sg;
332 u64 io_addr = 0;
333 u64 *dma_pages;
334 u32 len;
335 int page_cnt, sg_dma_len;
336 int i, j;
337 int ret;
338
339 sg_dma_len = ib_dma_map_sg(dev, sg, nents,
340 DMA_BIDIRECTIONAL);
341 if (unlikely(!sg_dma_len)) {
342 printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
343 return -EBUSY;
344 }
345
346 len = 0;
347 page_cnt = 0;
348
349 for (i = 0; i < sg_dma_len; ++i) {
350 unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
351 u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
352
353 if (dma_addr & ~rds_ibdev->fmr_page_mask) {
354 if (i > 0)
355 return -EINVAL;
356 else
357 ++page_cnt;
358 }
359 if ((dma_addr + dma_len) & ~rds_ibdev->fmr_page_mask) {
360 if (i < sg_dma_len - 1)
361 return -EINVAL;
362 else
363 ++page_cnt;
364 }
365
366 len += dma_len;
367 }
368
369 page_cnt += len >> rds_ibdev->fmr_page_shift;
370 if (page_cnt > fmr_message_size)
371 return -EINVAL;
372
373 dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
374 if (!dma_pages)
375 return -ENOMEM;
376
377 page_cnt = 0;
378 for (i = 0; i < sg_dma_len; ++i) {
379 unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
380 u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
381
382 for (j = 0; j < dma_len; j += rds_ibdev->fmr_page_size)
383 dma_pages[page_cnt++] =
384 (dma_addr & rds_ibdev->fmr_page_mask) + j;
385 }
386
387 ret = ib_map_phys_fmr(ibmr->fmr,
388 dma_pages, page_cnt, io_addr);
389 if (ret)
390 goto out;
391
392 /* Success - we successfully remapped the MR, so we can
393 * safely tear down the old mapping. */
394 rds_ib_teardown_mr(ibmr);
395
396 ibmr->sg = scat;
397 ibmr->sg_len = nents;
398 ibmr->sg_dma_len = sg_dma_len;
399 ibmr->remap_count++;
400
401 rds_ib_stats_inc(s_ib_rdma_mr_used);
402 ret = 0;
403
404out:
405 kfree(dma_pages);
406
407 return ret;
408}
409
410void rds_ib_sync_mr(void *trans_private, int direction)
411{
412 struct rds_ib_mr *ibmr = trans_private;
413 struct rds_ib_device *rds_ibdev = ibmr->device;
414
415 switch (direction) {
416 case DMA_FROM_DEVICE:
417 ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg,
418 ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
419 break;
420 case DMA_TO_DEVICE:
421 ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg,
422 ibmr->sg_dma_len, DMA_BIDIRECTIONAL);
423 break;
424 }
425}
426
427static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
428{
429 struct rds_ib_device *rds_ibdev = ibmr->device;
430
431 if (ibmr->sg_dma_len) {
432 ib_dma_unmap_sg(rds_ibdev->dev,
433 ibmr->sg, ibmr->sg_len,
434 DMA_BIDIRECTIONAL);
435 ibmr->sg_dma_len = 0;
436 }
437
438 /* Release the s/g list */
439 if (ibmr->sg_len) {
440 unsigned int i;
441
442 for (i = 0; i < ibmr->sg_len; ++i) {
443 struct page *page = sg_page(&ibmr->sg[i]);
444
445 /* FIXME we need a way to tell a r/w MR
446 * from a r/o MR */
447 set_page_dirty(page);
448 put_page(page);
449 }
450 kfree(ibmr->sg);
451
452 ibmr->sg = NULL;
453 ibmr->sg_len = 0;
454 }
455}
456
457static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
458{
459 unsigned int pinned = ibmr->sg_len;
460
461 __rds_ib_teardown_mr(ibmr);
462 if (pinned) {
463 struct rds_ib_device *rds_ibdev = ibmr->device;
464 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
465
466 atomic_sub(pinned, &pool->free_pinned);
467 }
468}
469
470static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all)
471{
472 unsigned int item_count;
473
474 item_count = atomic_read(&pool->item_count);
475 if (free_all)
476 return item_count;
477
478 return 0;
479}
480
481/*
482 * Flush our pool of MRs.
483 * At a minimum, all currently unused MRs are unmapped.
484 * If the number of MRs allocated exceeds the limit, we also try
485 * to free as many MRs as needed to get back to this limit.
486 */
487static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
488{
489 struct rds_ib_mr *ibmr, *next;
490 LIST_HEAD(unmap_list);
491 LIST_HEAD(fmr_list);
492 unsigned long unpinned = 0;
493 unsigned long flags;
494 unsigned int nfreed = 0, ncleaned = 0, free_goal;
495 int ret = 0;
496
497 rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
498
499 mutex_lock(&pool->flush_lock);
500
501 spin_lock_irqsave(&pool->list_lock, flags);
502 /* Get the list of all MRs to be dropped. Ordering matters -
503 * we want to put drop_list ahead of free_list. */
504 list_splice_init(&pool->free_list, &unmap_list);
505 list_splice_init(&pool->drop_list, &unmap_list);
506 if (free_all)
507 list_splice_init(&pool->clean_list, &unmap_list);
508 spin_unlock_irqrestore(&pool->list_lock, flags);
509
510 free_goal = rds_ib_flush_goal(pool, free_all);
511
512 if (list_empty(&unmap_list))
513 goto out;
514
515 /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
516 list_for_each_entry(ibmr, &unmap_list, list)
517 list_add(&ibmr->fmr->list, &fmr_list);
518 ret = ib_unmap_fmr(&fmr_list);
519 if (ret)
520 printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
521
522 /* Now we can destroy the DMA mapping and unpin any pages */
523 list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
524 unpinned += ibmr->sg_len;
525 __rds_ib_teardown_mr(ibmr);
526 if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
527 rds_ib_stats_inc(s_ib_rdma_mr_free);
528 list_del(&ibmr->list);
529 ib_dealloc_fmr(ibmr->fmr);
530 kfree(ibmr);
531 nfreed++;
532 }
533 ncleaned++;
534 }
535
536 spin_lock_irqsave(&pool->list_lock, flags);
537 list_splice(&unmap_list, &pool->clean_list);
538 spin_unlock_irqrestore(&pool->list_lock, flags);
539
540 atomic_sub(unpinned, &pool->free_pinned);
541 atomic_sub(ncleaned, &pool->dirty_count);
542 atomic_sub(nfreed, &pool->item_count);
543
544out:
545 mutex_unlock(&pool->flush_lock);
546 return ret;
547}
548
549static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
550{
551 struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
552
553 rds_ib_flush_mr_pool(pool, 0);
554}
555
556void rds_ib_free_mr(void *trans_private, int invalidate)
557{
558 struct rds_ib_mr *ibmr = trans_private;
559 struct rds_ib_device *rds_ibdev = ibmr->device;
560 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
561 unsigned long flags;
562
563 rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
564
565 /* Return it to the pool's free list */
566 spin_lock_irqsave(&pool->list_lock, flags);
567 if (ibmr->remap_count >= pool->fmr_attr.max_maps)
568 list_add(&ibmr->list, &pool->drop_list);
569 else
570 list_add(&ibmr->list, &pool->free_list);
571
572 atomic_add(ibmr->sg_len, &pool->free_pinned);
573 atomic_inc(&pool->dirty_count);
574 spin_unlock_irqrestore(&pool->list_lock, flags);
575
576 /* If we've pinned too many pages, request a flush */
577 if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
578 || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
579 queue_work(rds_wq, &pool->flush_worker);
580
581 if (invalidate) {
582 if (likely(!in_interrupt())) {
583 rds_ib_flush_mr_pool(pool, 0);
584 } else {
585 /* We get here if the user created a MR marked
586 * as use_once and invalidate at the same time. */
587 queue_work(rds_wq, &pool->flush_worker);
588 }
589 }
590}
591
592void rds_ib_flush_mrs(void)
593{
594 struct rds_ib_device *rds_ibdev;
595
596 list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
597 struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
598
599 if (pool)
600 rds_ib_flush_mr_pool(pool, 0);
601 }
602}
603
604void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
605 struct rds_sock *rs, u32 *key_ret)
606{
607 struct rds_ib_device *rds_ibdev;
608 struct rds_ib_mr *ibmr = NULL;
609 int ret;
610
611 rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
612 if (!rds_ibdev) {
613 ret = -ENODEV;
614 goto out;
615 }
616
617 if (!rds_ibdev->mr_pool) {
618 ret = -ENODEV;
619 goto out;
620 }
621
622 ibmr = rds_ib_alloc_fmr(rds_ibdev);
623 if (IS_ERR(ibmr))
624 return ibmr;
625
626 ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
627 if (ret == 0)
628 *key_ret = ibmr->fmr->rkey;
629 else
630 printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
631
632 ibmr->device = rds_ibdev;
633
634 out:
635 if (ret) {
636 if (ibmr)
637 rds_ib_free_mr(ibmr, 0);
638 ibmr = ERR_PTR(ret);
639 }
640 return ibmr;
641}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
new file mode 100644
index 000000000000..5061b5502162
--- /dev/null
+++ b/net/rds/ib_recv.c
@@ -0,0 +1,869 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/pci.h>
35#include <linux/dma-mapping.h>
36#include <rdma/rdma_cm.h>
37
38#include "rds.h"
39#include "ib.h"
40
41static struct kmem_cache *rds_ib_incoming_slab;
42static struct kmem_cache *rds_ib_frag_slab;
43static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
44
45static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
46{
47 rdsdebug("frag %p page %p\n", frag, frag->f_page);
48 __free_page(frag->f_page);
49 frag->f_page = NULL;
50}
51
52static void rds_ib_frag_free(struct rds_page_frag *frag)
53{
54 rdsdebug("frag %p page %p\n", frag, frag->f_page);
55 BUG_ON(frag->f_page != NULL);
56 kmem_cache_free(rds_ib_frag_slab, frag);
57}
58
59/*
60 * We map a page at a time. Its fragments are posted in order. This
61 * is called in fragment order as the fragments get receive completion events.
62 * Only the last frag in the page performs the unmapping.
63 *
64 * It's OK for ring cleanup to call this in whatever order it likes because
65 * DMA is not in flight and so we can unmap while other ring entries still
66 * hold page references in their frags.
67 */
68static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
69 struct rds_ib_recv_work *recv)
70{
71 struct rds_page_frag *frag = recv->r_frag;
72
73 rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
74 if (frag->f_mapped)
75 ib_dma_unmap_page(ic->i_cm_id->device,
76 frag->f_mapped,
77 RDS_FRAG_SIZE, DMA_FROM_DEVICE);
78 frag->f_mapped = 0;
79}
80
81void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
82{
83 struct rds_ib_recv_work *recv;
84 u32 i;
85
86 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
87 struct ib_sge *sge;
88
89 recv->r_ibinc = NULL;
90 recv->r_frag = NULL;
91
92 recv->r_wr.next = NULL;
93 recv->r_wr.wr_id = i;
94 recv->r_wr.sg_list = recv->r_sge;
95 recv->r_wr.num_sge = RDS_IB_RECV_SGE;
96
97 sge = rds_ib_data_sge(ic, recv->r_sge);
98 sge->addr = 0;
99 sge->length = RDS_FRAG_SIZE;
100 sge->lkey = ic->i_mr->lkey;
101
102 sge = rds_ib_header_sge(ic, recv->r_sge);
103 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
104 sge->length = sizeof(struct rds_header);
105 sge->lkey = ic->i_mr->lkey;
106 }
107}
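/*
 * Each receive work request above carries two scatter/gather entries: one
 * RDS_FRAG_SIZE data slot (its address is filled in later, when a page
 * fragment is mapped in rds_ib_recv_refill_one()), and one rds_header slot
 * that points straight into the coherent i_recv_hdrs array at the ring
 * position, so the header buffer never needs per-completion mapping.
 */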
108
109static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
110 struct rds_ib_recv_work *recv)
111{
112 if (recv->r_ibinc) {
113 rds_inc_put(&recv->r_ibinc->ii_inc);
114 recv->r_ibinc = NULL;
115 }
116 if (recv->r_frag) {
117 rds_ib_recv_unmap_page(ic, recv);
118 if (recv->r_frag->f_page)
119 rds_ib_frag_drop_page(recv->r_frag);
120 rds_ib_frag_free(recv->r_frag);
121 recv->r_frag = NULL;
122 }
123}
124
125void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
126{
127 u32 i;
128
129 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
130 rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
131
132 if (ic->i_frag.f_page)
133 rds_ib_frag_drop_page(&ic->i_frag);
134}
135
136static int rds_ib_recv_refill_one(struct rds_connection *conn,
137 struct rds_ib_recv_work *recv,
138 gfp_t kptr_gfp, gfp_t page_gfp)
139{
140 struct rds_ib_connection *ic = conn->c_transport_data;
141 dma_addr_t dma_addr;
142 struct ib_sge *sge;
143 int ret = -ENOMEM;
144
145 if (recv->r_ibinc == NULL) {
146 if (atomic_read(&rds_ib_allocation) >= rds_ib_sysctl_max_recv_allocation) {
147 rds_ib_stats_inc(s_ib_rx_alloc_limit);
148 goto out;
149 }
150 recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
151 kptr_gfp);
152 if (recv->r_ibinc == NULL)
153 goto out;
154 atomic_inc(&rds_ib_allocation);
155 INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
156 rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
157 }
158
159 if (recv->r_frag == NULL) {
160 recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
161 if (recv->r_frag == NULL)
162 goto out;
163 INIT_LIST_HEAD(&recv->r_frag->f_item);
164 recv->r_frag->f_page = NULL;
165 }
166
167 if (ic->i_frag.f_page == NULL) {
168 ic->i_frag.f_page = alloc_page(page_gfp);
169 if (ic->i_frag.f_page == NULL)
170 goto out;
171 ic->i_frag.f_offset = 0;
172 }
173
174 dma_addr = ib_dma_map_page(ic->i_cm_id->device,
175 ic->i_frag.f_page,
176 ic->i_frag.f_offset,
177 RDS_FRAG_SIZE,
178 DMA_FROM_DEVICE);
179 if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
180 goto out;
181
182 /*
183 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
184 * must be called on this recv. This happens as completions hit
185 * in order or on connection shutdown.
186 */
187 recv->r_frag->f_page = ic->i_frag.f_page;
188 recv->r_frag->f_offset = ic->i_frag.f_offset;
189 recv->r_frag->f_mapped = dma_addr;
190
191 sge = rds_ib_data_sge(ic, recv->r_sge);
192 sge->addr = dma_addr;
193 sge->length = RDS_FRAG_SIZE;
194
195 sge = rds_ib_header_sge(ic, recv->r_sge);
196 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
197 sge->length = sizeof(struct rds_header);
198
199 get_page(recv->r_frag->f_page);
200
201 if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
202 ic->i_frag.f_offset += RDS_FRAG_SIZE;
203 } else {
204 put_page(ic->i_frag.f_page);
205 ic->i_frag.f_page = NULL;
206 ic->i_frag.f_offset = 0;
207 }
208
209 ret = 0;
210out:
211 return ret;
212}
213
214/*
215 * This tries to allocate and post unused work requests after making sure that
216 * they have all the allocations they need to queue received fragments into
217 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
218 * pairs don't go unmatched.
219 *
220 * -1 is returned if posting fails due to temporary resource exhaustion.
221 */
222int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
223 gfp_t page_gfp, int prefill)
224{
225 struct rds_ib_connection *ic = conn->c_transport_data;
226 struct rds_ib_recv_work *recv;
227 struct ib_recv_wr *failed_wr;
228 unsigned int posted = 0;
229 int ret = 0;
230 u32 pos;
231
232 while ((prefill || rds_conn_up(conn))
233 && rds_ib_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
234 if (pos >= ic->i_recv_ring.w_nr) {
235 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
236 pos);
237 ret = -EINVAL;
238 break;
239 }
240
241 recv = &ic->i_recvs[pos];
242 ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
243 if (ret) {
244 ret = -1;
245 break;
246 }
247
248 /* XXX when can this fail? */
249 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
250 rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
251 recv->r_ibinc, recv->r_frag->f_page,
252 (long) recv->r_frag->f_mapped, ret);
253 if (ret) {
254 rds_ib_conn_error(conn, "recv post on "
255 "%pI4 returned %d, disconnecting and "
256 "reconnecting\n", &conn->c_faddr,
257 ret);
258 ret = -1;
259 break;
260 }
261
262 posted++;
263 }
264
265 /* We're doing flow control - update the window. */
266 if (ic->i_flowctl && posted)
267 rds_ib_advertise_credits(conn, posted);
268
269 if (ret)
270 rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
271 return ret;
272}
273
274void rds_ib_inc_purge(struct rds_incoming *inc)
275{
276 struct rds_ib_incoming *ibinc;
277 struct rds_page_frag *frag;
278 struct rds_page_frag *pos;
279
280 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
281 rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
282
283 list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
284 list_del_init(&frag->f_item);
285 rds_ib_frag_drop_page(frag);
286 rds_ib_frag_free(frag);
287 }
288}
289
290void rds_ib_inc_free(struct rds_incoming *inc)
291{
292 struct rds_ib_incoming *ibinc;
293
294 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
295
296 rds_ib_inc_purge(inc);
297 rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
298 BUG_ON(!list_empty(&ibinc->ii_frags));
299 kmem_cache_free(rds_ib_incoming_slab, ibinc);
300 atomic_dec(&rds_ib_allocation);
301 BUG_ON(atomic_read(&rds_ib_allocation) < 0);
302}
303
304int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
305 size_t size)
306{
307 struct rds_ib_incoming *ibinc;
308 struct rds_page_frag *frag;
309 struct iovec *iov = first_iov;
310 unsigned long to_copy;
311 unsigned long frag_off = 0;
312 unsigned long iov_off = 0;
313 int copied = 0;
314 int ret;
315 u32 len;
316
317 ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
318 frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
319 len = be32_to_cpu(inc->i_hdr.h_len);
320
321 while (copied < size && copied < len) {
322 if (frag_off == RDS_FRAG_SIZE) {
323 frag = list_entry(frag->f_item.next,
324 struct rds_page_frag, f_item);
325 frag_off = 0;
326 }
327 while (iov_off == iov->iov_len) {
328 iov_off = 0;
329 iov++;
330 }
331
332 to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
333 to_copy = min_t(size_t, to_copy, size - copied);
334 to_copy = min_t(unsigned long, to_copy, len - copied);
335
336 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
337 "[%p, %lu] + %lu\n",
338 to_copy, iov->iov_base, iov->iov_len, iov_off,
339 frag->f_page, frag->f_offset, frag_off);
340
341 /* XXX needs + offset for multiple recvs per page */
342 ret = rds_page_copy_to_user(frag->f_page,
343 frag->f_offset + frag_off,
344 iov->iov_base + iov_off,
345 to_copy);
346 if (ret) {
347 copied = ret;
348 break;
349 }
350
351 iov_off += to_copy;
352 frag_off += to_copy;
353 copied += to_copy;
354 }
355
356 return copied;
357}
358
359/* ic starts out kzalloc()ed */
360void rds_ib_recv_init_ack(struct rds_ib_connection *ic)
361{
362 struct ib_send_wr *wr = &ic->i_ack_wr;
363 struct ib_sge *sge = &ic->i_ack_sge;
364
365 sge->addr = ic->i_ack_dma;
366 sge->length = sizeof(struct rds_header);
367 sge->lkey = ic->i_mr->lkey;
368
369 wr->sg_list = sge;
370 wr->num_sge = 1;
371 wr->opcode = IB_WR_SEND;
372 wr->wr_id = RDS_IB_ACK_WR_ID;
373 wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
374}
375
376/*
377 * You'd think that with reliable IB connections you wouldn't need to ack
378 * messages that have been received. The problem is that IB hardware generates
379 * an ack message before it has DMAed the message into memory. This creates a
380 * potential message loss if the HCA is disabled for any reason between when it
381 * sends the ack and when the message is DMAed and processed. This is only a
382 * potential issue if another HCA is available for fail-over.
383 *
384 * When the remote host receives our ack they'll free the sent message from
385 * their send queue. To decrease the latency of this we always send an ack
386 * immediately after we've received messages.
387 *
388 * For simplicity, we only have one ack in flight at a time. This puts
389 * pressure on senders to have deep enough send queues to absorb the latency of
390 * a single ack frame being in flight. This might not be good enough.
391 *
392 * This is implemented by having a long-lived send_wr and sge which point to a
393 * statically allocated ack frame. This ack wr does not fall under the ring
394 * accounting that the tx and rx wrs do. The QP attribute specifically makes
395 * room for it beyond the ring size. Send completion notices its special
396 * wr_id and avoids working with the ring in that case.
397 */
398static void rds_ib_set_ack(struct rds_ib_connection *ic, u64 seq,
399 int ack_required)
400{
401 rds_ib_set_64bit(&ic->i_ack_next, seq);
402 if (ack_required) {
403 smp_mb__before_clear_bit();
404 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
405 }
406}
407
408static u64 rds_ib_get_ack(struct rds_ib_connection *ic)
409{
410 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
411 smp_mb__after_clear_bit();
412
413 return ic->i_ack_next;
414}
415
416static void rds_ib_send_ack(struct rds_ib_connection *ic, unsigned int adv_credits)
417{
418 struct rds_header *hdr = ic->i_ack;
419 struct ib_send_wr *failed_wr;
420 u64 seq;
421 int ret;
422
423 seq = rds_ib_get_ack(ic);
424
425 rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
426 rds_message_populate_header(hdr, 0, 0, 0);
427 hdr->h_ack = cpu_to_be64(seq);
428 hdr->h_credit = adv_credits;
429 rds_message_make_checksum(hdr);
430 ic->i_ack_queued = jiffies;
431
432 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
433 if (unlikely(ret)) {
434 /* Failed to send. Release the WR, and
435 * force another ACK.
436 */
437 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
438 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
439
440 rds_ib_stats_inc(s_ib_ack_send_failure);
441 /* Need to finesse this later. */
442 BUG();
443 } else
444 rds_ib_stats_inc(s_ib_ack_sent);
445}
446
447/*
448 * There are 3 ways of getting acknowledgements to the peer:
449 * 1. We call rds_ib_attempt_ack from the recv completion handler
450 * to send an ACK-only frame.
451 * However, there can be only one such frame in the send queue
452 * at any time, so we may have to postpone it.
453 * 2. When another (data) packet is transmitted while there's
454 * an ACK in the queue, we piggyback the ACK sequence number
455 * on the data packet.
456 * 3. If the ACK WR is done sending, we get called from the
457 * send queue completion handler, and check whether there's
458 * another ACK pending (postponed because the WR was on the
459 * queue). If so, we transmit it.
460 *
461 * We maintain 2 variables:
462 * - i_ack_flags, which keeps track of whether the ACK WR
463 * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
464 * - i_ack_next, which is the last sequence number we received
465 *
466 * Potentially, send queue and receive queue handlers can run concurrently.
467 *
468 * Reconnecting complicates this picture just slightly. When we
469 * reconnect, we may be seeing duplicate packets. The peer
470 * is retransmitting them, because it hasn't seen an ACK for
471 * them. It is important that we ACK these.
472 *
473 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
474 * this flag set *MUST* be acknowledged immediately.
475 */
476
477/*
478 * When we get here, we're called from the recv queue handler.
479 * Check whether we ought to transmit an ACK.
480 */
481void rds_ib_attempt_ack(struct rds_ib_connection *ic)
482{
483 unsigned int adv_credits;
484
485 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
486 return;
487
488 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
489 rds_ib_stats_inc(s_ib_ack_send_delayed);
490 return;
491 }
492
493 /* Can we get a send credit? */
494 if (!rds_ib_send_grab_credits(ic, 1, &adv_credits, 0)) {
495 rds_ib_stats_inc(s_ib_tx_throttle);
496 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
497 return;
498 }
499
500 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
501 rds_ib_send_ack(ic, adv_credits);
502}
503
504/*
505 * We get here from the send completion handler, when the
506 * adapter tells us the ACK frame was sent.
507 */
508void rds_ib_ack_send_complete(struct rds_ib_connection *ic)
509{
510 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
511 rds_ib_attempt_ack(ic);
512}
513
514/*
515 * This is called by the regular xmit code when it wants to piggyback
516 * an ACK on an outgoing frame.
517 */
518u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
519{
520 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
521 rds_ib_stats_inc(s_ib_ack_send_piggybacked);
522 return rds_ib_get_ack(ic);
523}
524
525/*
526 * It's kind of lame that we're copying from the posted receive pages into
527 * long-lived bitmaps. We could have posted the bitmaps and rdma written into
528 * them. But receiving new congestion bitmaps should be a *rare* event, so
529 * hopefully we won't need to invest that complexity in making it more
530 * efficient. By copying we can share a simpler core with TCP which has to
531 * copy.
532 */
533static void rds_ib_cong_recv(struct rds_connection *conn,
534 struct rds_ib_incoming *ibinc)
535{
536 struct rds_cong_map *map;
537 unsigned int map_off;
538 unsigned int map_page;
539 struct rds_page_frag *frag;
540 unsigned long frag_off;
541 unsigned long to_copy;
542 unsigned long copied;
543 uint64_t uncongested = 0;
544 void *addr;
545
546 /* catch completely corrupt packets */
547 if (be32_to_cpu(ibinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
548 return;
549
550 map = conn->c_fcong;
551 map_page = 0;
552 map_off = 0;
553
554 frag = list_entry(ibinc->ii_frags.next, struct rds_page_frag, f_item);
555 frag_off = 0;
556
557 copied = 0;
558
559 while (copied < RDS_CONG_MAP_BYTES) {
560 uint64_t *src, *dst;
561 unsigned int k;
562
563 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
564 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
565
566 addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
567
568 src = addr + frag_off;
569 dst = (void *)map->m_page_addrs[map_page] + map_off;
570 for (k = 0; k < to_copy; k += 8) {
571 /* Record ports that became uncongested, ie
572 * bits that changed from 0 to 1. */
573 uncongested |= ~(*src) & *dst;
574 *dst++ = *src++;
575 }
576 kunmap_atomic(addr, KM_SOFTIRQ0);
577
578 copied += to_copy;
579
580 map_off += to_copy;
581 if (map_off == PAGE_SIZE) {
582 map_off = 0;
583 map_page++;
584 }
585
586 frag_off += to_copy;
587 if (frag_off == RDS_FRAG_SIZE) {
588 frag = list_entry(frag->f_item.next,
589 struct rds_page_frag, f_item);
590 frag_off = 0;
591 }
592 }
593
594 /* the congestion map is in little endian order */
595 uncongested = le64_to_cpu(uncongested);
596
597 rds_cong_map_updated(map, uncongested);
598}
599
600/*
601 * Rings are posted with all the allocations they'll need to queue the
602 * incoming message to the receiving socket so this can't fail.
603 * All fragments start with a header, so we can make sure we're not receiving
604 * garbage, and we can tell a small 8 byte fragment from an ACK frame.
605 */
606struct rds_ib_ack_state {
607 u64 ack_next;
608 u64 ack_recv;
609 unsigned int ack_required:1;
610 unsigned int ack_next_valid:1;
611 unsigned int ack_recv_valid:1;
612};
613
614static void rds_ib_process_recv(struct rds_connection *conn,
615 struct rds_ib_recv_work *recv, u32 byte_len,
616 struct rds_ib_ack_state *state)
617{
618 struct rds_ib_connection *ic = conn->c_transport_data;
619 struct rds_ib_incoming *ibinc = ic->i_ibinc;
620 struct rds_header *ihdr, *hdr;
621
622 /* XXX shut down the connection if port 0,0 are seen? */
623
624 rdsdebug("ic %p ibinc %p recv %p byte len %u\n", ic, ibinc, recv,
625 byte_len);
626
627 if (byte_len < sizeof(struct rds_header)) {
628 rds_ib_conn_error(conn, "incoming message "
629 "from %pI4 didn't inclue a "
630 "header, disconnecting and "
631 "reconnecting\n",
632 &conn->c_faddr);
633 return;
634 }
635 byte_len -= sizeof(struct rds_header);
636
637 ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
638
639 /* Validate the checksum. */
640 if (!rds_message_verify_checksum(ihdr)) {
641 rds_ib_conn_error(conn, "incoming message "
642 "from %pI4 has corrupted header - "
643 "forcing a reconnect\n",
644 &conn->c_faddr);
645 rds_stats_inc(s_recv_drop_bad_checksum);
646 return;
647 }
648
649 /* Process the ACK sequence which comes with every packet */
650 state->ack_recv = be64_to_cpu(ihdr->h_ack);
651 state->ack_recv_valid = 1;
652
653 /* Process the credits update if there was one */
654 if (ihdr->h_credit)
655 rds_ib_send_add_credits(conn, ihdr->h_credit);
656
657 if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
658 /* This is an ACK-only packet. The reason it gets
659 * special treatment here is that historically, ACKs
660 * were rather special beasts.
661 */
662 rds_ib_stats_inc(s_ib_ack_received);
663
664 /*
665 * Usually the frags make their way on to incs and are then freed as
666 * the inc is freed. We don't go that route, so we have to drop the
667 * page ref ourselves. We can't just leave the page on the recv
668 * because that confuses the dma mapping of pages and each recv's use
669 * of a partial page. We can leave the frag, though; it will be
670 * reused.
671 *
672 * FIXME: Fold this into the code path below.
673 */
674 rds_ib_frag_drop_page(recv->r_frag);
675 return;
676 }
677
678 /*
679 * If we don't already have an inc on the connection then this
680 * fragment has a header and starts a message; copy its header
681 * into the inc and save the inc so we can hang upcoming fragments
682 * off its list.
683 */
684 if (ibinc == NULL) {
685 ibinc = recv->r_ibinc;
686 recv->r_ibinc = NULL;
687 ic->i_ibinc = ibinc;
688
689 hdr = &ibinc->ii_inc.i_hdr;
690 memcpy(hdr, ihdr, sizeof(*hdr));
691 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
692
693 rdsdebug("ic %p ibinc %p rem %u flag 0x%x\n", ic, ibinc,
694 ic->i_recv_data_rem, hdr->h_flags);
695 } else {
696 hdr = &ibinc->ii_inc.i_hdr;
697 /* We can't just use memcmp here; fragments of a
698 * single message may carry different ACKs */
699 if (hdr->h_sequence != ihdr->h_sequence
700 || hdr->h_len != ihdr->h_len
701 || hdr->h_sport != ihdr->h_sport
702 || hdr->h_dport != ihdr->h_dport) {
703 rds_ib_conn_error(conn,
704 "fragment header mismatch; forcing reconnect\n");
705 return;
706 }
707 }
708
709 list_add_tail(&recv->r_frag->f_item, &ibinc->ii_frags);
710 recv->r_frag = NULL;
711
712 if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
713 ic->i_recv_data_rem -= RDS_FRAG_SIZE;
714 else {
715 ic->i_recv_data_rem = 0;
716 ic->i_ibinc = NULL;
717
718 if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
719 rds_ib_cong_recv(conn, ibinc);
720 else {
721 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
722 &ibinc->ii_inc, GFP_ATOMIC,
723 KM_SOFTIRQ0);
724 state->ack_next = be64_to_cpu(hdr->h_sequence);
725 state->ack_next_valid = 1;
726 }
727
728 /* Evaluate the ACK_REQUIRED flag *after* we received
729 * the complete frame, and after bumping the next_rx
730 * sequence. */
731 if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
732 rds_stats_inc(s_recv_ack_required);
733 state->ack_required = 1;
734 }
735
736 rds_inc_put(&ibinc->ii_inc);
737 }
738}
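/*
 * Reassembly summary for the path above: the first fragment of a message
 * supplies the header that is copied into the inc and sets i_recv_data_rem to
 * the full payload length; each fragment then hangs its page frag off
 * ii_frags and subtracts RDS_FRAG_SIZE, and only when the remainder reaches
 * zero is the inc handed to rds_recv_incoming() (or rds_ib_cong_recv() for
 * congestion-map updates) and the connection's i_ibinc pointer cleared for
 * the next message.
 */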
739
740/*
741 * Plucking the oldest entry from the ring can be done concurrently with
742 * the thread refilling the ring. Each ring operation is protected by
743 * spinlocks and the transient state of refilling doesn't change the
744 * recording of which entry is oldest.
745 *
746 * This relies on IB only calling one cq comp_handler for each cq so that
747 * there will only be one caller of rds_recv_incoming() per RDS connection.
748 */
749void rds_ib_recv_cq_comp_handler(struct ib_cq *cq, void *context)
750{
751 struct rds_connection *conn = context;
752 struct rds_ib_connection *ic = conn->c_transport_data;
753 struct ib_wc wc;
754 struct rds_ib_ack_state state = { 0, };
755 struct rds_ib_recv_work *recv;
756
757 rdsdebug("conn %p cq %p\n", conn, cq);
758
759 rds_ib_stats_inc(s_ib_rx_cq_call);
760
761 ib_req_notify_cq(cq, IB_CQ_SOLICITED);
762
763 while (ib_poll_cq(cq, 1, &wc) > 0) {
764 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
765 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
766 be32_to_cpu(wc.ex.imm_data));
767 rds_ib_stats_inc(s_ib_rx_cq_event);
768
769 recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
770
771 rds_ib_recv_unmap_page(ic, recv);
772
773 /*
774 * Also process recvs in connecting state because it is possible
775 * to get a recv completion _before_ the rdmacm ESTABLISHED
776 * event is processed.
777 */
778 if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
779 /* We expect errors as the qp is drained during shutdown */
780 if (wc.status == IB_WC_SUCCESS) {
781 rds_ib_process_recv(conn, recv, wc.byte_len, &state);
782 } else {
783 rds_ib_conn_error(conn, "recv completion on "
784 "%pI4 had status %u, disconnecting and "
785 "reconnecting\n", &conn->c_faddr,
786 wc.status);
787 }
788 }
789
790 rds_ib_ring_free(&ic->i_recv_ring, 1);
791 }
792
793 if (state.ack_next_valid)
794 rds_ib_set_ack(ic, state.ack_next, state.ack_required);
795 if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
796 rds_send_drop_acked(conn, state.ack_recv, NULL);
797 ic->i_ack_recv = state.ack_recv;
798 }
799 if (rds_conn_up(conn))
800 rds_ib_attempt_ack(ic);
801
802 /* If we ever end up with a really empty receive ring, we're
803 * in deep trouble, as the sender will definitely see RNR
804 * timeouts. */
805 if (rds_ib_ring_empty(&ic->i_recv_ring))
806 rds_ib_stats_inc(s_ib_rx_ring_empty);
807
808 /*
809 * If the ring is running low, then schedule the thread to refill.
810 */
811 if (rds_ib_ring_low(&ic->i_recv_ring))
812 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
813}
814
815int rds_ib_recv(struct rds_connection *conn)
816{
817 struct rds_ib_connection *ic = conn->c_transport_data;
818 int ret = 0;
819
820 rdsdebug("conn %p\n", conn);
821
822 /*
823 * If we get a temporary posting failure in this context then
824 * we're really low and we want the caller to back off for a bit.
825 */
826 mutex_lock(&ic->i_recv_mutex);
827 if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
828 ret = -ENOMEM;
829 else
830 rds_ib_stats_inc(s_ib_rx_refill_from_thread);
831 mutex_unlock(&ic->i_recv_mutex);
832
833 if (rds_conn_up(conn))
834 rds_ib_attempt_ack(ic);
835
836 return ret;
837}
838
839int __init rds_ib_recv_init(void)
840{
841 struct sysinfo si;
842 int ret = -ENOMEM;
843
844	/* Default to roughly one third of all available RAM for recv memory */
845 si_meminfo(&si);
846 rds_ib_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
847
848 rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming",
849 sizeof(struct rds_ib_incoming),
850 0, 0, NULL);
851 if (rds_ib_incoming_slab == NULL)
852 goto out;
853
854 rds_ib_frag_slab = kmem_cache_create("rds_ib_frag",
855 sizeof(struct rds_page_frag),
856 0, 0, NULL);
857 if (rds_ib_frag_slab == NULL)
858 kmem_cache_destroy(rds_ib_incoming_slab);
859 else
860 ret = 0;
861out:
862 return ret;
863}
864
865void rds_ib_recv_exit(void)
866{
867 kmem_cache_destroy(rds_ib_incoming_slab);
868 kmem_cache_destroy(rds_ib_frag_slab);
869}
diff --git a/net/rds/ib_ring.c b/net/rds/ib_ring.c
new file mode 100644
index 000000000000..99a6ccae964c
--- /dev/null
+++ b/net/rds/ib_ring.c
@@ -0,0 +1,168 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "ib.h"
37
38/*
39 * Locking for IB rings.
40 * We assume that allocation is always protected by a mutex
41 * in the caller (this is a valid assumption for the current
42 * implementation).
43 *
44 * Freeing always happens in an interrupt, and hence only
45 * races with allocations, but not with other free()s.
46 *
47 * The interaction between allocation and freeing is that
48 * the alloc code has to determine the number of free entries.
49 * To this end, we maintain two counters; an allocation counter
50 * and a free counter. Both are allowed to run freely, and wrap
51 * around.
52 * The number of used entries is always (alloc_ctr - free_ctr) % NR.
53 *
54 * The current implementation makes free_ctr atomic. When the
55 * caller finds an allocation fails, it should set an "alloc fail"
56 * bit and retry the allocation. The "alloc fail" bit essentially tells
57 * the CQ completion handlers to wake it up after freeing some
58 * more entries.
59 */
60
61/*
62 * This only happens on shutdown.
63 */
64DECLARE_WAIT_QUEUE_HEAD(rds_ib_ring_empty_wait);
65
66void rds_ib_ring_init(struct rds_ib_work_ring *ring, u32 nr)
67{
68 memset(ring, 0, sizeof(*ring));
69 ring->w_nr = nr;
70 rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
71}
72
73static inline u32 __rds_ib_ring_used(struct rds_ib_work_ring *ring)
74{
75 u32 diff;
76
77 /* This assumes that atomic_t has at least as many bits as u32 */
78 diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
79 BUG_ON(diff > ring->w_nr);
80
81 return diff;
82}
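
The free-running alloc/free counters only work because unsigned subtraction is well defined across wrap-around. A minimal user-space sketch of the same accounting (all names here are illustrative, not part of the patch):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Mirror of the ring accounting: both counters run freely and wrap. */
	struct toy_ring {
		uint32_t nr;        /* ring size */
		uint32_t alloc_ctr; /* bumped on alloc */
		uint32_t free_ctr;  /* bumped on free */
	};

	static uint32_t toy_ring_used(const struct toy_ring *r)
	{
		/* Same idea as __rds_ib_ring_used(): modular difference of u32s. */
		return r->alloc_ctr - r->free_ctr;
	}

	int main(void)
	{
		/* Start both counters just below the 32-bit wrap point. */
		struct toy_ring r = { .nr = 256, .alloc_ctr = 0xfffffffe, .free_ctr = 0xfffffffe };

		r.alloc_ctr += 5;   /* alloc 5 entries; alloc_ctr wraps to 3 */
		r.free_ctr  += 2;   /* free 2 entries;  free_ctr wraps to 0 */

		printf("used = %u\n", toy_ring_used(&r)); /* prints 3 */
		assert(toy_ring_used(&r) == 3);
		return 0;
	}
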
83
84void rds_ib_ring_resize(struct rds_ib_work_ring *ring, u32 nr)
85{
86 /* We only ever get called from the connection setup code,
87 * prior to creating the QP. */
88 BUG_ON(__rds_ib_ring_used(ring));
89 ring->w_nr = nr;
90}
91
92static int __rds_ib_ring_empty(struct rds_ib_work_ring *ring)
93{
94 return __rds_ib_ring_used(ring) == 0;
95}
96
97u32 rds_ib_ring_alloc(struct rds_ib_work_ring *ring, u32 val, u32 *pos)
98{
99 u32 ret = 0, avail;
100
101 avail = ring->w_nr - __rds_ib_ring_used(ring);
102
103 rdsdebug("ring %p val %u next %u free %u\n", ring, val,
104 ring->w_alloc_ptr, avail);
105
106 if (val && avail) {
107 ret = min(val, avail);
108 *pos = ring->w_alloc_ptr;
109
110 ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
111 ring->w_alloc_ctr += ret;
112 }
113
114 return ret;
115}
116
117void rds_ib_ring_free(struct rds_ib_work_ring *ring, u32 val)
118{
119 ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
120 atomic_add(val, &ring->w_free_ctr);
121
122 if (__rds_ib_ring_empty(ring) &&
123 waitqueue_active(&rds_ib_ring_empty_wait))
124 wake_up(&rds_ib_ring_empty_wait);
125}
126
127void rds_ib_ring_unalloc(struct rds_ib_work_ring *ring, u32 val)
128{
129 ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
130 ring->w_alloc_ctr -= val;
131}
132
133int rds_ib_ring_empty(struct rds_ib_work_ring *ring)
134{
135 return __rds_ib_ring_empty(ring);
136}
137
138int rds_ib_ring_low(struct rds_ib_work_ring *ring)
139{
140 return __rds_ib_ring_used(ring) <= (ring->w_nr >> 2);
141}
142
143/*
144 * returns the oldest alloced ring entry. This will be the next one
145 * freed. This can't be called if there are none allocated.
146 */
147u32 rds_ib_ring_oldest(struct rds_ib_work_ring *ring)
148{
149 return ring->w_free_ptr;
150}
151
152/*
153 * returns the number of completed work requests.
154 */
155
156u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest)
157{
158 u32 ret;
159
160 if (oldest <= (unsigned long long)wr_id)
161 ret = (unsigned long long)wr_id - oldest + 1;
162 else
163 ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
164
165 rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
166 wr_id, oldest);
167 return ret;
168}
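
As a worked example of the wrap case handled by rds_ib_ring_completed() above, take a ring of 8 entries whose oldest outstanding entry is index 6 and whose completion arrives with wr_id 1: entries 6, 7, 0 and 1 have completed, so the count is 4. A small stand-alone sketch of the same arithmetic (illustrative only):

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Same arithmetic as rds_ib_ring_completed(), for a ring of w_nr entries. */
	static uint32_t toy_completed(uint32_t w_nr, uint32_t wr_id, uint32_t oldest)
	{
		if (oldest <= wr_id)
			return wr_id - oldest + 1;
		return w_nr - oldest + wr_id + 1;
	}

	int main(void)
	{
		/* No wrap: oldest 2, completion for wr_id 5 -> entries 2..5. */
		assert(toy_completed(8, 5, 2) == 4);

		/* Wrap: oldest 6, completion for wr_id 1 -> entries 6, 7, 0, 1. */
		printf("completed = %u\n", toy_completed(8, 1, 6)); /* prints 4 */
		assert(toy_completed(8, 1, 6) == 4);
		return 0;
	}
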
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
new file mode 100644
index 000000000000..cb6c52cb1c4c
--- /dev/null
+++ b/net/rds/ib_send.c
@@ -0,0 +1,874 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/device.h>
36#include <linux/dmapool.h>
37
38#include "rds.h"
39#include "rdma.h"
40#include "ib.h"
41
42static void rds_ib_send_rdma_complete(struct rds_message *rm,
43 int wc_status)
44{
45 int notify_status;
46
47 switch (wc_status) {
48 case IB_WC_WR_FLUSH_ERR:
49 return;
50
51 case IB_WC_SUCCESS:
52 notify_status = RDS_RDMA_SUCCESS;
53 break;
54
55 case IB_WC_REM_ACCESS_ERR:
56 notify_status = RDS_RDMA_REMOTE_ERROR;
57 break;
58
59 default:
60 notify_status = RDS_RDMA_OTHER_ERROR;
61 break;
62 }
63 rds_rdma_send_complete(rm, notify_status);
64}
65
66static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic,
67 struct rds_rdma_op *op)
68{
69 if (op->r_mapped) {
70 ib_dma_unmap_sg(ic->i_cm_id->device,
71 op->r_sg, op->r_nents,
72 op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
73 op->r_mapped = 0;
74 }
75}
76
77static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic,
78 struct rds_ib_send_work *send,
79 int wc_status)
80{
81 struct rds_message *rm = send->s_rm;
82
83 rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
84
85 ib_dma_unmap_sg(ic->i_cm_id->device,
86 rm->m_sg, rm->m_nents,
87 DMA_TO_DEVICE);
88
89 if (rm->m_rdma_op != NULL) {
90 rds_ib_send_unmap_rdma(ic, rm->m_rdma_op);
91
92 /* If the user asked for a completion notification on this
93 * message, we can implement three different semantics:
94 * 1. Notify when we received the ACK on the RDS message
95 * that was queued with the RDMA. This provides reliable
96 * notification of RDMA status at the expense of a one-way
97 * packet delay.
98 * 2. Notify when the IB stack gives us the completion event for
99 * the RDMA operation.
100 * 3. Notify when the IB stack gives us the completion event for
101 * the accompanying RDS messages.
102 * Here, we implement approach #3. To implement approach #2,
103 * call rds_rdma_send_complete from the cq_handler. To implement #1,
104 * don't call rds_rdma_send_complete at all, and fall back to the notify
105 * handling in the ACK processing code.
106 *
107 * Note: There's no need to explicitly sync any RDMA buffers using
108 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
109 * operation itself unmapped the RDMA buffers, which takes care
110 * of synching.
111 */
112 rds_ib_send_rdma_complete(rm, wc_status);
113
114 if (rm->m_rdma_op->r_write)
115 rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
116 else
117 rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
118 }
119
120 /* If anyone waited for this message to get flushed out, wake
121 * them up now */
122 rds_message_unmapped(rm);
123
124 rds_message_put(rm);
125 send->s_rm = NULL;
126}
127
128void rds_ib_send_init_ring(struct rds_ib_connection *ic)
129{
130 struct rds_ib_send_work *send;
131 u32 i;
132
133 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
134 struct ib_sge *sge;
135
136 send->s_rm = NULL;
137 send->s_op = NULL;
138
139 send->s_wr.wr_id = i;
140 send->s_wr.sg_list = send->s_sge;
141 send->s_wr.num_sge = 1;
142 send->s_wr.opcode = IB_WR_SEND;
143 send->s_wr.send_flags = 0;
144 send->s_wr.ex.imm_data = 0;
145
146 sge = rds_ib_data_sge(ic, send->s_sge);
147 sge->lkey = ic->i_mr->lkey;
148
149 sge = rds_ib_header_sge(ic, send->s_sge);
150 sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
151 sge->length = sizeof(struct rds_header);
152 sge->lkey = ic->i_mr->lkey;
153 }
154}
155
156void rds_ib_send_clear_ring(struct rds_ib_connection *ic)
157{
158 struct rds_ib_send_work *send;
159 u32 i;
160
161 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
162 if (send->s_wr.opcode == 0xdead)
163 continue;
164 if (send->s_rm)
165 rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
166 if (send->s_op)
167 rds_ib_send_unmap_rdma(ic, send->s_op);
168 }
169}
170
171/*
172 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
173 * operations performed in the send path. As the sender allocs and potentially
174 * unallocs the next free entry in the ring it doesn't alter which is
175 * the next to be freed, which is what this is concerned with.
176 */
177void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context)
178{
179 struct rds_connection *conn = context;
180 struct rds_ib_connection *ic = conn->c_transport_data;
181 struct ib_wc wc;
182 struct rds_ib_send_work *send;
183 u32 completed;
184 u32 oldest;
185 u32 i = 0;
186 int ret;
187
188 rdsdebug("cq %p conn %p\n", cq, conn);
189 rds_ib_stats_inc(s_ib_tx_cq_call);
190 ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
191 if (ret)
192 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
193
194 while (ib_poll_cq(cq, 1, &wc) > 0) {
195 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
196 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
197 be32_to_cpu(wc.ex.imm_data));
198 rds_ib_stats_inc(s_ib_tx_cq_event);
199
200 if (wc.wr_id == RDS_IB_ACK_WR_ID) {
201 if (ic->i_ack_queued + HZ/2 < jiffies)
202 rds_ib_stats_inc(s_ib_tx_stalled);
203 rds_ib_ack_send_complete(ic);
204 continue;
205 }
206
207 oldest = rds_ib_ring_oldest(&ic->i_send_ring);
208
209 completed = rds_ib_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
210
211 for (i = 0; i < completed; i++) {
212 send = &ic->i_sends[oldest];
213
214 /* In the error case, wc.opcode sometimes contains garbage */
215 switch (send->s_wr.opcode) {
216 case IB_WR_SEND:
217 if (send->s_rm)
218 rds_ib_send_unmap_rm(ic, send, wc.status);
219 break;
220 case IB_WR_RDMA_WRITE:
221 case IB_WR_RDMA_READ:
222 /* Nothing to be done - the SG list will be unmapped
223 * when the SEND completes. */
224 break;
225 default:
226 if (printk_ratelimit())
227 printk(KERN_NOTICE
228 "RDS/IB: %s: unexpected opcode 0x%x in WR!\n",
229 __func__, send->s_wr.opcode);
230 break;
231 }
232
233 send->s_wr.opcode = 0xdead;
234 send->s_wr.num_sge = 1;
235 if (send->s_queued + HZ/2 < jiffies)
236 rds_ib_stats_inc(s_ib_tx_stalled);
237
238			/* If an RDMA operation produced an error, signal this right
239			 * away. If we don't, the subsequent SEND that goes with this
240			 * RDMA will be canceled with ERR_WFLUSH, and the application
241			 * will never learn that the RDMA failed. */
242 if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
243 struct rds_message *rm;
244
245 rm = rds_send_get_message(conn, send->s_op);
246 if (rm)
247 rds_ib_send_rdma_complete(rm, wc.status);
248 }
249
250 oldest = (oldest + 1) % ic->i_send_ring.w_nr;
251 }
252
253 rds_ib_ring_free(&ic->i_send_ring, completed);
254
255 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
256 || test_bit(0, &conn->c_map_queued))
257 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
258
259 /* We expect errors as the qp is drained during shutdown */
260 if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
261 rds_ib_conn_error(conn,
262 "send completion on %pI4 "
263 "had status %u, disconnecting and reconnecting\n",
264 &conn->c_faddr, wc.status);
265 }
266 }
267}
268
269/*
270 * This is the main function for allocating credits when sending
271 * messages.
272 *
273 * Conceptually, we have two counters:
274 * - send credits: this tells us how many WRs we're allowed
275 * to submit without overrunning the receiver's queue. For
276 * each SEND WR we post, we decrement this by one.
277 *
278 * - posted credits: this tells us how many WRs we recently
279 * posted to the receive queue. This value is transferred
280 * to the peer as a "credit update" in a RDS header field.
281 * Every time we transmit credits to the peer, we subtract
282 * the amount of transferred credits from this counter.
283 *
284 * It is essential that we avoid situations where both sides have
285 * exhausted their send credits, and are unable to send new credits
286 * to the peer. We achieve this by requiring that we send at least
287 * one credit update to the peer before exhausting our credits.
288 * When new credits arrive, we subtract one credit that is withheld
289 * until we've posted new buffers and are ready to transmit these
290 * credits (see rds_ib_send_add_credits below).
291 *
292 * The RDS send code is essentially single-threaded; rds_send_xmit
293 * grabs c_send_lock to ensure exclusive access to the send ring.
294 * However, the ACK sending code is independent and can race with
295 * message SENDs.
296 *
297 * In the send path, we need to update the counters for send credits
298 * and the counter of posted buffers atomically - when we use the
299 * last available credit, we cannot allow another thread to race us
300 * and grab the posted credits counter. Hence, we have to use a
301 * spinlock to protect the credit counter, or use atomics.
302 *
303 * Spinlocks shared between the send and the receive path are bad,
304 * because they create unnecessary delays. An early implementation
305 * using a spinlock showed a 5% degradation in throughput at some
306 * loads.
307 *
308 * This implementation avoids spinlocks completely, putting both
309 * counters into a single atomic, and updating that atomic using
310 * atomic_add (in the receive path, when receiving fresh credits),
311 * and using atomic_cmpxchg when updating the two counters.
312 */
313int rds_ib_send_grab_credits(struct rds_ib_connection *ic,
314 u32 wanted, u32 *adv_credits, int need_posted)
315{
316 unsigned int avail, posted, got = 0, advertise;
317 long oldval, newval;
318
319 *adv_credits = 0;
320 if (!ic->i_flowctl)
321 return wanted;
322
323try_again:
324 advertise = 0;
325 oldval = newval = atomic_read(&ic->i_credits);
326 posted = IB_GET_POST_CREDITS(oldval);
327 avail = IB_GET_SEND_CREDITS(oldval);
328
329 rdsdebug("rds_ib_send_grab_credits(%u): credits=%u posted=%u\n",
330 wanted, avail, posted);
331
332 /* The last credit must be used to send a credit update. */
333 if (avail && !posted)
334 avail--;
335
336 if (avail < wanted) {
337 struct rds_connection *conn = ic->i_cm_id->context;
338
339 /* Oops, there aren't that many credits left! */
340 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
341 got = avail;
342 } else {
343 /* Sometimes you get what you want, lalala. */
344 got = wanted;
345 }
346 newval -= IB_SET_SEND_CREDITS(got);
347
348 /*
349	 * If need_posted is non-zero, then the caller wants the
350	 * posted credits advertised regardless of whether any send
351	 * credits are available.
352 */
353 if (posted && (got || need_posted)) {
354 advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
355 newval -= IB_SET_POST_CREDITS(advertise);
356 }
357
358 /* Finally bill everything */
359 if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
360 goto try_again;
361
362 *adv_credits = advertise;
363 return got;
364}
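
The IB_GET_*/IB_SET_* credit helpers used above are defined in ib.h, which is not part of this hunk. A plausible reading, and the one assumed in the sketch below, is that the send credits occupy the low 16 bits of ic->i_credits and the posted credits the high 16 bits. This user-space sketch uses C11 atomics to mimic the compare-and-swap loop of rds_ib_send_grab_credits(); all names and the bit layout are assumptions for illustration only:

	#include <stdatomic.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Assumed packing: low 16 bits = send credits, high 16 bits = posted credits. */
	#define GET_SEND_CREDITS(v)  ((uint32_t)(v) & 0xffff)
	#define GET_POST_CREDITS(v)  ((uint32_t)(v) >> 16)
	#define SET_SEND_CREDITS(v)  ((uint32_t)(v) & 0xffff)
	#define SET_POST_CREDITS(v)  ((uint32_t)(v) << 16)

	static _Atomic uint32_t credits;

	/* Claim up to 'wanted' send credits and drain posted credits to advertise. */
	static uint32_t grab_credits(uint32_t wanted, uint32_t *advertise)
	{
		uint32_t oldval, newval, avail, posted, got;

		oldval = atomic_load(&credits);
		do {
			avail  = GET_SEND_CREDITS(oldval);
			posted = GET_POST_CREDITS(oldval);

			/* Keep one credit back so a credit update can always be sent. */
			if (avail && !posted)
				avail--;

			got = wanted < avail ? wanted : avail;
			*advertise = posted;

			newval = oldval - SET_SEND_CREDITS(got) - SET_POST_CREDITS(posted);
		} while (!atomic_compare_exchange_weak(&credits, &oldval, newval));

		return got;
	}

	int main(void)
	{
		uint32_t adv;

		/* Peer granted 10 send credits; we have posted 3 receive buffers. */
		atomic_store(&credits, SET_SEND_CREDITS(10) | SET_POST_CREDITS(3));

		uint32_t got = grab_credits(4, &adv);
		printf("got %u send credits, advertising %u posted buffers\n",
		       (unsigned)got, (unsigned)adv);
		/* got = 4, adv = 3; the atomic now holds 6 send credits, 0 posted. */
		return 0;
	}

The real function additionally caps the advertisement at RDS_MAX_ADV_CREDIT and honours the need_posted flag; the sketch keeps only the cmpxchg retry structure.
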
365
366void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
367{
368 struct rds_ib_connection *ic = conn->c_transport_data;
369
370 if (credits == 0)
371 return;
372
373 rdsdebug("rds_ib_send_add_credits(%u): current=%u%s\n",
374 credits,
375 IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
376 test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
377
378 atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
379 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
380 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
381
382 WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
383
384 rds_ib_stats_inc(s_ib_rx_credit_updates);
385}
386
387void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted)
388{
389 struct rds_ib_connection *ic = conn->c_transport_data;
390
391 if (posted == 0)
392 return;
393
394 atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
395
396 /* Decide whether to send an update to the peer now.
397 * If we would send a credit update for every single buffer we
398 * post, we would end up with an ACK storm (ACK arrives,
399 * consumes buffer, we refill the ring, send ACK to remote
400 * advertising the newly posted buffer... ad inf)
401 *
402 * Performance pretty much depends on how often we send
403 * credit updates - too frequent updates mean lots of ACKs.
404 * Too infrequent updates, and the peer will run out of
405 * credits and have to throttle.
406 * For the time being, 16 seems to be a good compromise.
407 */
408 if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
409 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
410}
411
412static inline void
413rds_ib_xmit_populate_wr(struct rds_ib_connection *ic,
414 struct rds_ib_send_work *send, unsigned int pos,
415 unsigned long buffer, unsigned int length,
416 int send_flags)
417{
418 struct ib_sge *sge;
419
420 WARN_ON(pos != send - ic->i_sends);
421
422 send->s_wr.send_flags = send_flags;
423 send->s_wr.opcode = IB_WR_SEND;
424 send->s_wr.num_sge = 2;
425 send->s_wr.next = NULL;
426 send->s_queued = jiffies;
427 send->s_op = NULL;
428
429 if (length != 0) {
430 sge = rds_ib_data_sge(ic, send->s_sge);
431 sge->addr = buffer;
432 sge->length = length;
433 sge->lkey = ic->i_mr->lkey;
434
435 sge = rds_ib_header_sge(ic, send->s_sge);
436 } else {
437 /* We're sending a packet with no payload. There is only
438 * one SGE */
439 send->s_wr.num_sge = 1;
440 sge = &send->s_sge[0];
441 }
442
443 sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
444 sge->length = sizeof(struct rds_header);
445 sge->lkey = ic->i_mr->lkey;
446}
447
448/*
449 * This can be called multiple times for a given message. The first time
450 * we see a message we map its scatterlist into the IB device so that
451 * we can provide that mapped address to the IB scatter gather entries
452 * in the IB work requests. We translate the scatterlist into a series
453 * of work requests that fragment the message. These work requests complete
454 * in order so we pass ownership of the message to the completion handler
455 * once we send the final fragment.
456 *
457 * The RDS core uses the c_send_lock to only enter this function once
458 * per connection. This makes sure that the tx ring alloc/unalloc pairs
459 * don't get out of sync and confuse the ring.
460 */
461int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
462 unsigned int hdr_off, unsigned int sg, unsigned int off)
463{
464 struct rds_ib_connection *ic = conn->c_transport_data;
465 struct ib_device *dev = ic->i_cm_id->device;
466 struct rds_ib_send_work *send = NULL;
467 struct rds_ib_send_work *first;
468 struct rds_ib_send_work *prev;
469 struct ib_send_wr *failed_wr;
470 struct scatterlist *scat;
471 u32 pos;
472 u32 i;
473 u32 work_alloc;
474 u32 credit_alloc;
475 u32 posted;
476 u32 adv_credits = 0;
477 int send_flags = 0;
478 int sent;
479 int ret;
480 int flow_controlled = 0;
481
482 BUG_ON(off % RDS_FRAG_SIZE);
483 BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
484
485 /* FIXME we may overallocate here */
486 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
487 i = 1;
488 else
489 i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
490
491 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
492 if (work_alloc == 0) {
493 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
494 rds_ib_stats_inc(s_ib_tx_ring_full);
495 ret = -ENOMEM;
496 goto out;
497 }
498
499 credit_alloc = work_alloc;
500 if (ic->i_flowctl) {
501 credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0);
502 adv_credits += posted;
503 if (credit_alloc < work_alloc) {
504 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
505 work_alloc = credit_alloc;
506 flow_controlled++;
507 }
508 if (work_alloc == 0) {
509 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
510 rds_ib_stats_inc(s_ib_tx_throttle);
511 ret = -ENOMEM;
512 goto out;
513 }
514 }
515
516 /* map the message the first time we see it */
517 if (ic->i_rm == NULL) {
518 /*
519 printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n",
520 be16_to_cpu(rm->m_inc.i_hdr.h_dport),
521 rm->m_inc.i_hdr.h_flags,
522 be32_to_cpu(rm->m_inc.i_hdr.h_len));
523 */
524 if (rm->m_nents) {
525 rm->m_count = ib_dma_map_sg(dev,
526 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
527 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
528 if (rm->m_count == 0) {
529 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
530 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
531 ret = -ENOMEM; /* XXX ? */
532 goto out;
533 }
534 } else {
535 rm->m_count = 0;
536 }
537
538 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
539 ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
540 rds_message_addref(rm);
541 ic->i_rm = rm;
542
543 /* Finalize the header */
544 if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
545 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
546 if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
547 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
548
549		/* If it has an RDMA op, tell the peer we did it. This is
550 * used by the peer to release use-once RDMA MRs. */
551 if (rm->m_rdma_op) {
552 struct rds_ext_header_rdma ext_hdr;
553
554 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
555 rds_message_add_extension(&rm->m_inc.i_hdr,
556 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
557 }
558 if (rm->m_rdma_cookie) {
559 rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
560 rds_rdma_cookie_key(rm->m_rdma_cookie),
561 rds_rdma_cookie_offset(rm->m_rdma_cookie));
562 }
563
564 /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
565 * we should not do this unless we have a chance of at least
566 * sticking the header into the send ring. Which is why we
567 * should call rds_ib_ring_alloc first. */
568 rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_ib_piggyb_ack(ic));
569 rds_message_make_checksum(&rm->m_inc.i_hdr);
570
571 /*
572 * Update adv_credits since we reset the ACK_REQUIRED bit.
573 */
574 rds_ib_send_grab_credits(ic, 0, &posted, 1);
575 adv_credits += posted;
576 BUG_ON(adv_credits > 255);
577 } else if (ic->i_rm != rm)
578 BUG();
579
580 send = &ic->i_sends[pos];
581 first = send;
582 prev = NULL;
583 scat = &rm->m_sg[sg];
584 sent = 0;
585 i = 0;
586
587 /* Sometimes you want to put a fence between an RDMA
588 * READ and the following SEND.
589 * We could either do this all the time
590 * or when requested by the user. Right now, we let
591 * the application choose.
592 */
593 if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
594 send_flags = IB_SEND_FENCE;
595
596 /*
597 * We could be copying the header into the unused tail of the page.
598 * That would need to be changed in the future when those pages might
599 * be mapped userspace pages or page cache pages. So instead we always
600 * use a second sge and our long-lived ring of mapped headers. We send
601 * the header after the data so that the data payload can be aligned on
602 * the receiver.
603 */
604
605 /* handle a 0-len message */
606 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
607 rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
608 goto add_header;
609 }
610
611 /* if there's data reference it with a chain of work reqs */
612 for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
613 unsigned int len;
614
615 send = &ic->i_sends[pos];
616
617 len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
618 rds_ib_xmit_populate_wr(ic, send, pos,
619 ib_sg_dma_address(dev, scat) + off, len,
620 send_flags);
621
622 /*
623 * We want to delay signaling completions just enough to get
624 * the batching benefits but not so much that we create dead time
625 * on the wire.
626 */
627 if (ic->i_unsignaled_wrs-- == 0) {
628 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
629 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
630 }
631
632 ic->i_unsignaled_bytes -= len;
633 if (ic->i_unsignaled_bytes <= 0) {
634 ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes;
635 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
636 }
637
638 /*
639 * Always signal the last one if we're stopping due to flow control.
640 */
641 if (flow_controlled && i == (work_alloc-1))
642 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
643
644 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
645 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
646
647 sent += len;
648 off += len;
649 if (off == ib_sg_dma_len(dev, scat)) {
650 scat++;
651 off = 0;
652 }
653
654add_header:
655 /* Tack on the header after the data. The header SGE should already
656 * have been set up to point to the right header buffer. */
657 memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
658
659 if (0) {
660 struct rds_header *hdr = &ic->i_send_hdrs[pos];
661
662 printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
663 be16_to_cpu(hdr->h_dport),
664 hdr->h_flags,
665 be32_to_cpu(hdr->h_len));
666 }
667 if (adv_credits) {
668 struct rds_header *hdr = &ic->i_send_hdrs[pos];
669
670 /* add credit and redo the header checksum */
671 hdr->h_credit = adv_credits;
672 rds_message_make_checksum(hdr);
673 adv_credits = 0;
674 rds_ib_stats_inc(s_ib_tx_credit_updates);
675 }
676
677 if (prev)
678 prev->s_wr.next = &send->s_wr;
679 prev = send;
680
681 pos = (pos + 1) % ic->i_send_ring.w_nr;
682 }
683
684 /* Account the RDS header in the number of bytes we sent, but just once.
685 * The caller has no concept of fragmentation. */
686 if (hdr_off == 0)
687 sent += sizeof(struct rds_header);
688
689 /* if we finished the message then send completion owns it */
690 if (scat == &rm->m_sg[rm->m_count]) {
691 prev->s_rm = ic->i_rm;
692 prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
693 ic->i_rm = NULL;
694 }
695
696 if (i < work_alloc) {
697 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
698 work_alloc = i;
699 }
700 if (ic->i_flowctl && i < credit_alloc)
701 rds_ib_send_add_credits(conn, credit_alloc - i);
702
703 /* XXX need to worry about failed_wr and partial sends. */
704 failed_wr = &first->s_wr;
705 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
706 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
707 first, &first->s_wr, ret, failed_wr);
708 BUG_ON(failed_wr != &first->s_wr);
709 if (ret) {
710 printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
711 "returned %d\n", &conn->c_faddr, ret);
712 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
713 if (prev->s_rm) {
714 ic->i_rm = prev->s_rm;
715 prev->s_rm = NULL;
716 }
717 /* Finesse this later */
718 BUG();
719 goto out;
720 }
721
722 ret = sent;
723out:
724 BUG_ON(adv_credits);
725 return ret;
726}
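
For reference, the work-request estimate at the top of rds_ib_xmit() is simply a round-up division of the payload by the fragment size, with a minimum of one WR for zero-length messages; each WR then carries one data fragment plus the header in a second SGE. A tiny stand-alone sketch of that arithmetic, assuming RDS_FRAG_SIZE is 4096 (its actual value is set in ib.h, not in this hunk), and using the same round-up idiom as the RDS ceil() helper:

	#include <stdio.h>

	/* Round-up integer division, as used by rds_ib_xmit() via ceil(). */
	#define CEIL(x, y)  (((x) + (y) - 1) / (y))

	int main(void)
	{
		unsigned int frag_size = 4096;	/* assumed RDS_FRAG_SIZE */
		unsigned int lens[] = { 0, 1, 4096, 10000 };

		for (unsigned int i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
			unsigned int wrs = lens[i] ? CEIL(lens[i], frag_size) : 1;
			printf("payload %5u bytes -> %u work request(s)\n", lens[i], wrs);
		}
		return 0;
	}
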
727
728int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
729{
730 struct rds_ib_connection *ic = conn->c_transport_data;
731 struct rds_ib_send_work *send = NULL;
732 struct rds_ib_send_work *first;
733 struct rds_ib_send_work *prev;
734 struct ib_send_wr *failed_wr;
735 struct rds_ib_device *rds_ibdev;
736 struct scatterlist *scat;
737 unsigned long len;
738 u64 remote_addr = op->r_remote_addr;
739 u32 pos;
740 u32 work_alloc;
741 u32 i;
742 u32 j;
743 int sent;
744 int ret;
745 int num_sge;
746
747 rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
748
749 /* map the message the first time we see it */
750 if (!op->r_mapped) {
751 op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
752 op->r_sg, op->r_nents, (op->r_write) ?
753 DMA_TO_DEVICE : DMA_FROM_DEVICE);
754 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
755 if (op->r_count == 0) {
756 rds_ib_stats_inc(s_ib_tx_sg_mapping_failure);
757 ret = -ENOMEM; /* XXX ? */
758 goto out;
759 }
760
761 op->r_mapped = 1;
762 }
763
764 /*
765 * Instead of knowing how to return a partial rdma read/write we insist that there
766 * be enough work requests to send the entire message.
767 */
768 i = ceil(op->r_count, rds_ibdev->max_sge);
769
770 work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos);
771 if (work_alloc != i) {
772 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
773 rds_ib_stats_inc(s_ib_tx_ring_full);
774 ret = -ENOMEM;
775 goto out;
776 }
777
778 send = &ic->i_sends[pos];
779 first = send;
780 prev = NULL;
781 scat = &op->r_sg[0];
782 sent = 0;
783 num_sge = op->r_count;
784
785 for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
786 send->s_wr.send_flags = 0;
787 send->s_queued = jiffies;
788 /*
789 * We want to delay signaling completions just enough to get
790 * the batching benefits but not so much that we create dead time on the wire.
791 */
792 if (ic->i_unsignaled_wrs-- == 0) {
793 ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs;
794 send->s_wr.send_flags = IB_SEND_SIGNALED;
795 }
796
797 send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
798 send->s_wr.wr.rdma.remote_addr = remote_addr;
799 send->s_wr.wr.rdma.rkey = op->r_key;
800 send->s_op = op;
801
802 if (num_sge > rds_ibdev->max_sge) {
803 send->s_wr.num_sge = rds_ibdev->max_sge;
804 num_sge -= rds_ibdev->max_sge;
805 } else {
806 send->s_wr.num_sge = num_sge;
807 }
808
809 send->s_wr.next = NULL;
810
811 if (prev)
812 prev->s_wr.next = &send->s_wr;
813
814 for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
815 len = ib_sg_dma_len(ic->i_cm_id->device, scat);
816 send->s_sge[j].addr =
817 ib_sg_dma_address(ic->i_cm_id->device, scat);
818 send->s_sge[j].length = len;
819 send->s_sge[j].lkey = ic->i_mr->lkey;
820
821 sent += len;
822 rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
823
824 remote_addr += len;
825 scat++;
826 }
827
828 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
829 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
830
831 prev = send;
832 if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
833 send = ic->i_sends;
834 }
835
836 /* if we finished the message then send completion owns it */
837 if (scat == &op->r_sg[op->r_count])
838 prev->s_wr.send_flags = IB_SEND_SIGNALED;
839
840 if (i < work_alloc) {
841 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i);
842 work_alloc = i;
843 }
844
845 failed_wr = &first->s_wr;
846 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
847 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
848 first, &first->s_wr, ret, failed_wr);
849 BUG_ON(failed_wr != &first->s_wr);
850 if (ret) {
851 printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
852 "returned %d\n", &conn->c_faddr, ret);
853 rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
854 goto out;
855 }
856
857 if (unlikely(failed_wr != &first->s_wr)) {
858		printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wr updated!\n", ret);
859 BUG_ON(failed_wr != &first->s_wr);
860 }
861
862
863out:
864 return ret;
865}
866
867void rds_ib_xmit_complete(struct rds_connection *conn)
868{
869 struct rds_ib_connection *ic = conn->c_transport_data;
870
871 /* We may have a pending ACK or window update we were unable
872 * to send previously (due to flow control). Try again. */
873 rds_ib_attempt_ack(ic);
874}
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
new file mode 100644
index 000000000000..02e3e3d50d4a
--- /dev/null
+++ b/net/rds/ib_stats.c
@@ -0,0 +1,95 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38#include "ib.h"
39
40DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
41
42static char *rds_ib_stat_names[] = {
43 "ib_connect_raced",
44 "ib_listen_closed_stale",
45 "ib_tx_cq_call",
46 "ib_tx_cq_event",
47 "ib_tx_ring_full",
48 "ib_tx_throttle",
49 "ib_tx_sg_mapping_failure",
50 "ib_tx_stalled",
51 "ib_tx_credit_updates",
52 "ib_rx_cq_call",
53 "ib_rx_cq_event",
54 "ib_rx_ring_empty",
55 "ib_rx_refill_from_cq",
56 "ib_rx_refill_from_thread",
57 "ib_rx_alloc_limit",
58 "ib_rx_credit_updates",
59 "ib_ack_sent",
60 "ib_ack_send_failure",
61 "ib_ack_send_delayed",
62 "ib_ack_send_piggybacked",
63 "ib_ack_received",
64 "ib_rdma_mr_alloc",
65 "ib_rdma_mr_free",
66 "ib_rdma_mr_used",
67 "ib_rdma_mr_pool_flush",
68 "ib_rdma_mr_pool_wait",
69 "ib_rdma_mr_pool_depleted",
70};
71
72unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
73 unsigned int avail)
74{
75 struct rds_ib_statistics stats = {0, };
76 uint64_t *src;
77 uint64_t *sum;
78 size_t i;
79 int cpu;
80
81 if (avail < ARRAY_SIZE(rds_ib_stat_names))
82 goto out;
83
84 for_each_online_cpu(cpu) {
85 src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu));
86 sum = (uint64_t *)&stats;
87 for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
88 *(sum++) += *(src++);
89 }
90
91 rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names,
92 ARRAY_SIZE(rds_ib_stat_names));
93out:
94 return ARRAY_SIZE(rds_ib_stat_names);
95}
diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c
new file mode 100644
index 000000000000..d87830db93a0
--- /dev/null
+++ b/net/rds/ib_sysctl.c
@@ -0,0 +1,137 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/sysctl.h>
35#include <linux/proc_fs.h>
36
37#include "ib.h"
38
39static struct ctl_table_header *rds_ib_sysctl_hdr;
40
41unsigned long rds_ib_sysctl_max_send_wr = RDS_IB_DEFAULT_SEND_WR;
42unsigned long rds_ib_sysctl_max_recv_wr = RDS_IB_DEFAULT_RECV_WR;
43unsigned long rds_ib_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
44static unsigned long rds_ib_sysctl_max_wr_min = 1;
45/* hardware will fail CQ creation long before this */
46static unsigned long rds_ib_sysctl_max_wr_max = (u32)~0;
47
48unsigned long rds_ib_sysctl_max_unsig_wrs = 16;
49static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1;
50static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64;
51
52unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20);
53static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1;
54static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL;
55
56unsigned int rds_ib_sysctl_flow_control = 1;
57
58ctl_table rds_ib_sysctl_table[] = {
59 {
60 .ctl_name = CTL_UNNUMBERED,
61 .procname = "max_send_wr",
62 .data = &rds_ib_sysctl_max_send_wr,
63 .maxlen = sizeof(unsigned long),
64 .mode = 0644,
65 .proc_handler = &proc_doulongvec_minmax,
66 .extra1 = &rds_ib_sysctl_max_wr_min,
67 .extra2 = &rds_ib_sysctl_max_wr_max,
68 },
69 {
70 .ctl_name = CTL_UNNUMBERED,
71 .procname = "max_recv_wr",
72 .data = &rds_ib_sysctl_max_recv_wr,
73 .maxlen = sizeof(unsigned long),
74 .mode = 0644,
75 .proc_handler = &proc_doulongvec_minmax,
76 .extra1 = &rds_ib_sysctl_max_wr_min,
77 .extra2 = &rds_ib_sysctl_max_wr_max,
78 },
79 {
80 .ctl_name = CTL_UNNUMBERED,
81 .procname = "max_unsignaled_wr",
82 .data = &rds_ib_sysctl_max_unsig_wrs,
83 .maxlen = sizeof(unsigned long),
84 .mode = 0644,
85 .proc_handler = &proc_doulongvec_minmax,
86 .extra1 = &rds_ib_sysctl_max_unsig_wr_min,
87 .extra2 = &rds_ib_sysctl_max_unsig_wr_max,
88 },
89 {
90 .ctl_name = CTL_UNNUMBERED,
91 .procname = "max_unsignaled_bytes",
92 .data = &rds_ib_sysctl_max_unsig_bytes,
93 .maxlen = sizeof(unsigned long),
94 .mode = 0644,
95 .proc_handler = &proc_doulongvec_minmax,
96 .extra1 = &rds_ib_sysctl_max_unsig_bytes_min,
97 .extra2 = &rds_ib_sysctl_max_unsig_bytes_max,
98 },
99 {
100 .ctl_name = CTL_UNNUMBERED,
101 .procname = "max_recv_allocation",
102 .data = &rds_ib_sysctl_max_recv_allocation,
103 .maxlen = sizeof(unsigned long),
104 .mode = 0644,
105 .proc_handler = &proc_doulongvec_minmax,
106 },
107 {
108 .ctl_name = CTL_UNNUMBERED,
109 .procname = "flow_control",
110 .data = &rds_ib_sysctl_flow_control,
111 .maxlen = sizeof(rds_ib_sysctl_flow_control),
112 .mode = 0644,
113 .proc_handler = &proc_dointvec,
114 },
115 { .ctl_name = 0}
116};
117
118static struct ctl_path rds_ib_sysctl_path[] = {
119 { .procname = "net", .ctl_name = CTL_NET, },
120 { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
121 { .procname = "ib", .ctl_name = CTL_UNNUMBERED, },
122 { }
123};
124
125void rds_ib_sysctl_exit(void)
126{
127 if (rds_ib_sysctl_hdr)
128 unregister_sysctl_table(rds_ib_sysctl_hdr);
129}
130
131int __init rds_ib_sysctl_init(void)
132{
133 rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table);
134 if (rds_ib_sysctl_hdr == NULL)
135 return -ENOMEM;
136 return 0;
137}
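
Once the module is loaded, the table above appears under /proc/sys/net/rds/ib/, following the "net"/"rds"/"ib" path registered in rds_ib_sysctl_path. A small user-space sketch for reading one of the knobs, assuming procfs is mounted at /proc:

	#include <stdio.h>

	int main(void)
	{
		/* Path built from rds_ib_sysctl_path ("net"/"rds"/"ib") plus the
		 * "max_unsignaled_wr" procname from the table above. */
		const char *path = "/proc/sys/net/rds/ib/max_unsignaled_wr";
		char buf[64];
		FILE *f = fopen(path, "r");

		if (!f) {
			perror(path);
			return 1;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("max_unsignaled_wr = %s", buf);
		fclose(f);
		return 0;
	}
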
diff --git a/net/rds/info.c b/net/rds/info.c
new file mode 100644
index 000000000000..1d885535214d
--- /dev/null
+++ b/net/rds/info.c
@@ -0,0 +1,241 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38
39/*
40 * This file implements a getsockopt() call which copies a set of fixed
41 * sized structs into a user-specified buffer as a means of providing
42 * read-only information about RDS.
43 *
44 * For a given information source there are a given number of fixed sized
45 * structs at a given time. The structs are only copied if the user-specified
46 * buffer is big enough. The destination pages that make up the buffer
47 * are pinned for the duration of the copy.
48 *
49 * This gives us the following benefits:
50 *
51 * - simple implementation, no copy "position" across multiple calls
52 * - consistent snapshot of an info source
53 * - atomic copy works well with whatever locking info source has
54 * - one portable tool to get rds info across implementations
55 * - long-lived tool can get info without allocating
56 *
57 * at the following costs:
58 *
59 * - info source copy must be pinned, may be "large"
60 */
61
62struct rds_info_iterator {
63 struct page **pages;
64 void *addr;
65 unsigned long offset;
66};
67
68static DEFINE_SPINLOCK(rds_info_lock);
69static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];
70
71void rds_info_register_func(int optname, rds_info_func func)
72{
73 int offset = optname - RDS_INFO_FIRST;
74
75 BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
76
77 spin_lock(&rds_info_lock);
78 BUG_ON(rds_info_funcs[offset] != NULL);
79 rds_info_funcs[offset] = func;
80 spin_unlock(&rds_info_lock);
81}
82
83void rds_info_deregister_func(int optname, rds_info_func func)
84{
85 int offset = optname - RDS_INFO_FIRST;
86
87 BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);
88
89 spin_lock(&rds_info_lock);
90 BUG_ON(rds_info_funcs[offset] != func);
91 rds_info_funcs[offset] = NULL;
92 spin_unlock(&rds_info_lock);
93}
94
95/*
96 * Typically we hold an atomic kmap across multiple rds_info_copy() calls
97 * because the kmap is so expensive. This must be called before performing any
98 * blocking operation while the mapping is held, and when the iterator is torn down.
99 */
100void rds_info_iter_unmap(struct rds_info_iterator *iter)
101{
102 if (iter->addr != NULL) {
103 kunmap_atomic(iter->addr, KM_USER0);
104 iter->addr = NULL;
105 }
106}
107
108/*
109 * get_user_pages() called flush_dcache_page() on the pages for us.
110 */
111void rds_info_copy(struct rds_info_iterator *iter, void *data,
112 unsigned long bytes)
113{
114 unsigned long this;
115
116 while (bytes) {
117 if (iter->addr == NULL)
118 iter->addr = kmap_atomic(*iter->pages, KM_USER0);
119
120 this = min(bytes, PAGE_SIZE - iter->offset);
121
122 rdsdebug("page %p addr %p offset %lu this %lu data %p "
123 "bytes %lu\n", *iter->pages, iter->addr,
124 iter->offset, this, data, bytes);
125
126 memcpy(iter->addr + iter->offset, data, this);
127
128 data += this;
129 bytes -= this;
130 iter->offset += this;
131
132 if (iter->offset == PAGE_SIZE) {
133 kunmap_atomic(iter->addr, KM_USER0);
134 iter->addr = NULL;
135 iter->offset = 0;
136 iter->pages++;
137 }
138 }
139}
140
141/*
142 * @optval points to the userspace buffer that the information snapshot
143 * will be copied into.
144 *
145 * @optlen on input is the size of the buffer in userspace. @optlen
146 * on output is the size of the requested snapshot in bytes.
147 *
148 * This function returns -errno if there is a failure, particularly -ENOSPC
149 * if the given userspace buffer was not large enough to fit the snapshot.
150 * On success it returns the positive number of bytes of each array element
151 * in the snapshot.
152 */
153int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
154 int __user *optlen)
155{
156 struct rds_info_iterator iter;
157 struct rds_info_lengths lens;
158 unsigned long nr_pages = 0;
159 unsigned long start;
160 unsigned long i;
161 rds_info_func func;
162 struct page **pages = NULL;
163 int ret;
164 int len;
165 int total;
166
167 if (get_user(len, optlen)) {
168 ret = -EFAULT;
169 goto out;
170 }
171
172 /* check for all kinds of wrapping and the like */
173 start = (unsigned long)optval;
174 if (len < 0 || len + PAGE_SIZE - 1 < len || start + len < start) {
175 ret = -EINVAL;
176 goto out;
177 }
178
179 /* a 0 len call is just trying to probe its length */
180 if (len == 0)
181 goto call_func;
182
183 nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
184 >> PAGE_SHIFT;
185
186 pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
187 if (pages == NULL) {
188 ret = -ENOMEM;
189 goto out;
190 }
191 down_read(&current->mm->mmap_sem);
192 ret = get_user_pages(current, current->mm, start, nr_pages, 1, 0,
193 pages, NULL);
194 up_read(&current->mm->mmap_sem);
195 if (ret != nr_pages) {
196 if (ret > 0)
197 nr_pages = ret;
198 else
199 nr_pages = 0;
200 ret = -EAGAIN; /* XXX ? */
201 goto out;
202 }
203
204 rdsdebug("len %d nr_pages %lu\n", len, nr_pages);
205
206call_func:
207 func = rds_info_funcs[optname - RDS_INFO_FIRST];
208 if (func == NULL) {
209 ret = -ENOPROTOOPT;
210 goto out;
211 }
212
213 iter.pages = pages;
214 iter.addr = NULL;
215 iter.offset = start & (PAGE_SIZE - 1);
216
217 func(sock, len, &iter, &lens);
218 BUG_ON(lens.each == 0);
219
220 total = lens.nr * lens.each;
221
222 rds_info_iter_unmap(&iter);
223
224 if (total > len) {
225 len = total;
226 ret = -ENOSPC;
227 } else {
228 len = total;
229 ret = lens.each;
230 }
231
232 if (put_user(len, optlen))
233 ret = -EFAULT;
234
235out:
236 for (i = 0; pages != NULL && i < nr_pages; i++)
237 put_page(pages[i]);
238 kfree(pages);
239
240 return ret;
241}
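
A user-space caller typically probes the required size first (the zero-length call handled above), then retries with a large enough buffer; on success the return value is the per-element size and the updated optlen the total snapshot size. A hedged sketch of that flow follows; the PF_RDS, SOL_RDS and RDS_INFO_COUNTERS values are assumptions about the matching uapi constants, which would normally come from <linux/socket.h> and <linux/rds.h>:

	#include <errno.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/socket.h>
	#include <unistd.h>

	/* Assumed uapi values; in real code include the proper uapi headers. */
	#ifndef PF_RDS
	#define PF_RDS			21
	#endif
	#ifndef SOL_RDS
	#define SOL_RDS			276
	#endif
	#ifndef RDS_INFO_COUNTERS
	#define RDS_INFO_COUNTERS	10000
	#endif

	int main(void)
	{
		int fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
		socklen_t len = 0;
		void *buf = NULL;
		int each;

		if (fd < 0) {
			perror("socket");
			return 1;
		}

		/* First pass: zero-length probe; the kernel reports the snapshot size in len. */
		each = getsockopt(fd, SOL_RDS, RDS_INFO_COUNTERS, NULL, &len);
		if (each < 0 && errno == ENOSPC) {
			buf = malloc(len);
			/* Second pass: buffer is big enough; return value is bytes per element. */
			if (buf)
				each = getsockopt(fd, SOL_RDS, RDS_INFO_COUNTERS, buf, &len);
		}
		if (each > 0)
			printf("%u elements of %d bytes each\n", (unsigned)(len / each), each);

		free(buf);
		close(fd);
		return 0;
	}
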
diff --git a/net/rds/info.h b/net/rds/info.h
new file mode 100644
index 000000000000..b6c052ca7d22
--- /dev/null
+++ b/net/rds/info.h
@@ -0,0 +1,30 @@
1#ifndef _RDS_INFO_H
2#define _RDS_INFO_H
3
4struct rds_info_lengths {
5 unsigned int nr;
6 unsigned int each;
7};
8
9struct rds_info_iterator;
10
11/*
12 * These functions must fill in the fields of @lens to reflect the size
13 * of the available info source. If the snapshot fits in @len then it
14 * should be copied using @iter. The caller will deduce if it was copied
15 * or not by comparing the lengths.
16 */
17typedef void (*rds_info_func)(struct socket *sock, unsigned int len,
18 struct rds_info_iterator *iter,
19 struct rds_info_lengths *lens);
20
21void rds_info_register_func(int optname, rds_info_func func);
22void rds_info_deregister_func(int optname, rds_info_func func);
23int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
24 int __user *optlen);
25void rds_info_copy(struct rds_info_iterator *iter, void *data,
26 unsigned long bytes);
27void rds_info_iter_unmap(struct rds_info_iterator *iter);
28
29
30#endif
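
For a new info source, the contract above boils down to: always fill @lens, and only copy elements through @iter when the caller's buffer is large enough. A kernel-side sketch, meant to be read next to info.h rather than compiled on its own; the "widget" names and the RDS_INFO_WIDGETS optname are hypothetical, only rds_info_register_func() and rds_info_copy() come from this patch:

	/* Hypothetical info source reporting one fixed-size struct per widget. */
	struct widget_info {
		__be32	w_addr;
		__u32	w_count;
	};

	static struct widget_info widgets[4];

	static void widget_info_func(struct socket *sock, unsigned int len,
				     struct rds_info_iterator *iter,
				     struct rds_info_lengths *lens)
	{
		unsigned int i;

		lens->nr = ARRAY_SIZE(widgets);
		lens->each = sizeof(struct widget_info);

		/* Only copy if the whole snapshot fits in the user buffer. */
		if (len < lens->nr * lens->each)
			return;

		for (i = 0; i < ARRAY_SIZE(widgets); i++)
			rds_info_copy(iter, &widgets[i], sizeof(struct widget_info));
	}

	/* Registered/deregistered at module init/exit with a reserved optname:
	 *	rds_info_register_func(RDS_INFO_WIDGETS, widget_info_func);
	 *	rds_info_deregister_func(RDS_INFO_WIDGETS, widget_info_func);
	 */
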
diff --git a/net/rds/iw.c b/net/rds/iw.c
new file mode 100644
index 000000000000..1b56905c4c08
--- /dev/null
+++ b/net/rds/iw.c
@@ -0,0 +1,333 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/if.h>
36#include <linux/netdevice.h>
37#include <linux/inetdevice.h>
38#include <linux/if_arp.h>
39#include <linux/delay.h>
40
41#include "rds.h"
42#include "iw.h"
43
44unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
45unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
46
47module_param(fastreg_pool_size, int, 0444);
48MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
49module_param(fastreg_message_size, int, 0444);
50MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)");
51
52struct list_head rds_iw_devices;
53
54DEFINE_SPINLOCK(iw_nodev_conns_lock);
55LIST_HEAD(iw_nodev_conns);
56
57void rds_iw_add_one(struct ib_device *device)
58{
59 struct rds_iw_device *rds_iwdev;
60 struct ib_device_attr *dev_attr;
61
62 /* Only handle iwarp devices */
63 if (device->node_type != RDMA_NODE_RNIC)
64 return;
65
66 dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
67 if (!dev_attr)
68 return;
69
70 if (ib_query_device(device, dev_attr)) {
71 rdsdebug("Query device failed for %s\n", device->name);
72 goto free_attr;
73 }
74
75 rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
76 if (!rds_iwdev)
77 goto free_attr;
78
79 spin_lock_init(&rds_iwdev->spinlock);
80
81 rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
82 rds_iwdev->max_wrs = dev_attr->max_qp_wr;
83 rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
84
85 rds_iwdev->page_shift = max(PAGE_SHIFT, ffs(dev_attr->page_size_cap) - 1);
86
87 rds_iwdev->dev = device;
88 rds_iwdev->pd = ib_alloc_pd(device);
89 if (IS_ERR(rds_iwdev->pd))
90 goto free_dev;
91
92 if (!rds_iwdev->dma_local_lkey) {
93 if (device->node_type != RDMA_NODE_RNIC) {
94 rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
95 IB_ACCESS_LOCAL_WRITE);
96 } else {
97 rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
98 IB_ACCESS_REMOTE_READ |
99 IB_ACCESS_REMOTE_WRITE |
100 IB_ACCESS_LOCAL_WRITE);
101 }
102 if (IS_ERR(rds_iwdev->mr))
103 goto err_pd;
104 } else
105 rds_iwdev->mr = NULL;
106
107 rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
108 if (IS_ERR(rds_iwdev->mr_pool)) {
109 rds_iwdev->mr_pool = NULL;
110 goto err_mr;
111 }
112
113 INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
114 INIT_LIST_HEAD(&rds_iwdev->conn_list);
115 list_add_tail(&rds_iwdev->list, &rds_iw_devices);
116
117 ib_set_client_data(device, &rds_iw_client, rds_iwdev);
118
119 goto free_attr;
120
121err_mr:
122 if (rds_iwdev->mr)
123 ib_dereg_mr(rds_iwdev->mr);
124err_pd:
125 ib_dealloc_pd(rds_iwdev->pd);
126free_dev:
127 kfree(rds_iwdev);
128free_attr:
129 kfree(dev_attr);
130}
131
132void rds_iw_remove_one(struct ib_device *device)
133{
134 struct rds_iw_device *rds_iwdev;
135 struct rds_iw_cm_id *i_cm_id, *next;
136
137 rds_iwdev = ib_get_client_data(device, &rds_iw_client);
138 if (!rds_iwdev)
139 return;
140
141 spin_lock_irq(&rds_iwdev->spinlock);
142 list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) {
143 list_del(&i_cm_id->list);
144 kfree(i_cm_id);
145 }
146 spin_unlock_irq(&rds_iwdev->spinlock);
147
148 rds_iw_remove_conns(rds_iwdev);
149
150 if (rds_iwdev->mr_pool)
151 rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
152
153 if (rds_iwdev->mr)
154 ib_dereg_mr(rds_iwdev->mr);
155
156 while (ib_dealloc_pd(rds_iwdev->pd)) {
157 rdsdebug("Failed to dealloc pd %p\n", rds_iwdev->pd);
158 msleep(1);
159 }
160
161 list_del(&rds_iwdev->list);
162 kfree(rds_iwdev);
163}
164
165struct ib_client rds_iw_client = {
166 .name = "rds_iw",
167 .add = rds_iw_add_one,
168 .remove = rds_iw_remove_one
169};
170
171static int rds_iw_conn_info_visitor(struct rds_connection *conn,
172 void *buffer)
173{
174 struct rds_info_rdma_connection *iinfo = buffer;
175 struct rds_iw_connection *ic;
176
177 /* We will only ever look at the iWARP transport */
178 if (conn->c_trans != &rds_iw_transport)
179 return 0;
180
181 iinfo->src_addr = conn->c_laddr;
182 iinfo->dst_addr = conn->c_faddr;
183
184 memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
185 memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
186 if (rds_conn_state(conn) == RDS_CONN_UP) {
187 struct rds_iw_device *rds_iwdev;
188 struct rdma_dev_addr *dev_addr;
189
190 ic = conn->c_transport_data;
191 dev_addr = &ic->i_cm_id->route.addr.dev_addr;
192
193 ib_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
194 ib_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
195
196 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
197 iinfo->max_send_wr = ic->i_send_ring.w_nr;
198 iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
199 iinfo->max_send_sge = rds_iwdev->max_sge;
200 rds_iw_get_mr_info(rds_iwdev, iinfo);
201 }
202 return 1;
203}
204
205static void rds_iw_ic_info(struct socket *sock, unsigned int len,
206 struct rds_info_iterator *iter,
207 struct rds_info_lengths *lens)
208{
209 rds_for_each_conn_info(sock, len, iter, lens,
210 rds_iw_conn_info_visitor,
211 sizeof(struct rds_info_rdma_connection));
212}
213
214
215/*
216 * Early RDS/IB was built to only bind to an address if there is an IPoIB
217 * device with that address set.
218 *
219 * If it were me, I'd advocate for something more flexible. Sending and
220 * receiving should be device-agnostic. Transports would try and maintain
221 * connections between peers who have messages queued. Userspace would be
222 * allowed to influence which paths have priority. We could call userspace
223 * asserting this policy "routing".
224 */
225static int rds_iw_laddr_check(__be32 addr)
226{
227 int ret;
228 struct rdma_cm_id *cm_id;
229 struct sockaddr_in sin;
230
231 /* Create a CMA ID and try to bind it. This catches both
232 * IB and iWARP capable NICs.
233 */
234 cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP);
235 if (!cm_id)
236 return -EADDRNOTAVAIL;
237
238 memset(&sin, 0, sizeof(sin));
239 sin.sin_family = AF_INET;
240 sin.sin_addr.s_addr = addr;
241
242 /* rdma_bind_addr will only succeed for IB & iWARP devices */
243 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
244 /* Because of this, we would claim to support IB devices unless we
245 * also check node_type. */
246 if (ret || cm_id->device->node_type != RDMA_NODE_RNIC)
247 ret = -EADDRNOTAVAIL;
248
249 rdsdebug("addr %pI4 ret %d node type %d\n",
250 &addr, ret,
251 cm_id->device ? cm_id->device->node_type : -1);
252
253 rdma_destroy_id(cm_id);
254
255 return ret;
256}
257
258void rds_iw_exit(void)
259{
260 rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
261 rds_iw_remove_nodev_conns();
262 ib_unregister_client(&rds_iw_client);
263 rds_iw_sysctl_exit();
264 rds_iw_recv_exit();
265 rds_trans_unregister(&rds_iw_transport);
266}
267
268struct rds_transport rds_iw_transport = {
269 .laddr_check = rds_iw_laddr_check,
270 .xmit_complete = rds_iw_xmit_complete,
271 .xmit = rds_iw_xmit,
272 .xmit_cong_map = NULL,
273 .xmit_rdma = rds_iw_xmit_rdma,
274 .recv = rds_iw_recv,
275 .conn_alloc = rds_iw_conn_alloc,
276 .conn_free = rds_iw_conn_free,
277 .conn_connect = rds_iw_conn_connect,
278 .conn_shutdown = rds_iw_conn_shutdown,
279 .inc_copy_to_user = rds_iw_inc_copy_to_user,
280 .inc_purge = rds_iw_inc_purge,
281 .inc_free = rds_iw_inc_free,
282 .cm_initiate_connect = rds_iw_cm_initiate_connect,
283 .cm_handle_connect = rds_iw_cm_handle_connect,
284 .cm_connect_complete = rds_iw_cm_connect_complete,
285 .stats_info_copy = rds_iw_stats_info_copy,
286 .exit = rds_iw_exit,
287 .get_mr = rds_iw_get_mr,
288 .sync_mr = rds_iw_sync_mr,
289 .free_mr = rds_iw_free_mr,
290 .flush_mrs = rds_iw_flush_mrs,
291 .t_owner = THIS_MODULE,
292 .t_name = "iwarp",
293 .t_prefer_loopback = 1,
294};
295
296int __init rds_iw_init(void)
297{
298 int ret;
299
300 INIT_LIST_HEAD(&rds_iw_devices);
301
302 ret = ib_register_client(&rds_iw_client);
303 if (ret)
304 goto out;
305
306 ret = rds_iw_sysctl_init();
307 if (ret)
308 goto out_ibreg;
309
310 ret = rds_iw_recv_init();
311 if (ret)
312 goto out_sysctl;
313
314 ret = rds_trans_register(&rds_iw_transport);
315 if (ret)
316 goto out_recv;
317
318 rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
319
320 goto out;
321
322out_recv:
323 rds_iw_recv_exit();
324out_sysctl:
325 rds_iw_sysctl_exit();
326out_ibreg:
327 ib_unregister_client(&rds_iw_client);
328out:
329 return ret;
330}
331
332MODULE_LICENSE("GPL");
333
diff --git a/net/rds/iw.h b/net/rds/iw.h
new file mode 100644
index 000000000000..0ddda34f2a1c
--- /dev/null
+++ b/net/rds/iw.h
@@ -0,0 +1,395 @@
1#ifndef _RDS_IW_H
2#define _RDS_IW_H
3
4#include <rdma/ib_verbs.h>
5#include <rdma/rdma_cm.h>
6#include "rds.h"
7#include "rdma_transport.h"
8
9#define RDS_FASTREG_SIZE 20
10#define RDS_FASTREG_POOL_SIZE 2048
11
12#define RDS_IW_MAX_SGE 8
13#define RDS_IW_RECV_SGE 2
14
15#define RDS_IW_DEFAULT_RECV_WR 1024
16#define RDS_IW_DEFAULT_SEND_WR 256
17
18#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
19
20extern struct list_head rds_iw_devices;
21
22/*
23 * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
24 * try to minimize the amount of memory tied up in both the device and
25 * socket receive queues.
26 */
27/* page offset of the final full frag that fits in the page */
28#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
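/*
 * Worked example (values assumed for illustration only): with a 4096-byte
 * PAGE_SIZE and a hypothetical RDS_FRAG_SIZE of 2048, two full fragments
 * fit per page and RDS_PAGE_LAST_OFF evaluates to (2 - 1) * 2048 = 2048,
 * i.e. the page offset at which the last full fragment can begin.
 */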
29struct rds_page_frag {
30 struct list_head f_item;
31 struct page *f_page;
32 unsigned long f_offset;
33 dma_addr_t f_mapped;
34};
35
36struct rds_iw_incoming {
37 struct list_head ii_frags;
38 struct rds_incoming ii_inc;
39};
40
41struct rds_iw_connect_private {
42 /* Add new fields at the end, and don't permute existing fields. */
43 __be32 dp_saddr;
44 __be32 dp_daddr;
45 u8 dp_protocol_major;
46 u8 dp_protocol_minor;
47 __be16 dp_protocol_minor_mask; /* bitmask */
48 __be32 dp_reserved1;
49 __be64 dp_ack_seq;
50 __be32 dp_credit; /* non-zero enables flow ctl */
51};
52
53struct rds_iw_scatterlist {
54 struct scatterlist *list;
55 unsigned int len;
56 int dma_len;
57 unsigned int dma_npages;
58 unsigned int bytes;
59};
60
61struct rds_iw_mapping {
62 spinlock_t m_lock; /* protect the mapping struct */
63 struct list_head m_list;
64 struct rds_iw_mr *m_mr;
65 uint32_t m_rkey;
66 struct rds_iw_scatterlist m_sg;
67};
68
69struct rds_iw_send_work {
70 struct rds_message *s_rm;
71
72 /* We should really put these into a union: */
73 struct rds_rdma_op *s_op;
74 struct rds_iw_mapping *s_mapping;
75 struct ib_mr *s_mr;
76 struct ib_fast_reg_page_list *s_page_list;
77 unsigned char s_remap_count;
78
79 struct ib_send_wr s_wr;
80 struct ib_sge s_sge[RDS_IW_MAX_SGE];
81 unsigned long s_queued;
82};
83
84struct rds_iw_recv_work {
85 struct rds_iw_incoming *r_iwinc;
86 struct rds_page_frag *r_frag;
87 struct ib_recv_wr r_wr;
88 struct ib_sge r_sge[2];
89};
90
91struct rds_iw_work_ring {
92 u32 w_nr;
93 u32 w_alloc_ptr;
94 u32 w_alloc_ctr;
95 u32 w_free_ptr;
96 atomic_t w_free_ctr;
97};
98
99struct rds_iw_device;
100
101struct rds_iw_connection {
102
103 struct list_head iw_node;
104 struct rds_iw_device *rds_iwdev;
105 struct rds_connection *conn;
106
107 /* alphabet soup, IBTA style */
108 struct rdma_cm_id *i_cm_id;
109 struct ib_pd *i_pd;
110 struct ib_mr *i_mr;
111 struct ib_cq *i_send_cq;
112 struct ib_cq *i_recv_cq;
113
114 /* tx */
115 struct rds_iw_work_ring i_send_ring;
116 struct rds_message *i_rm;
117 struct rds_header *i_send_hdrs;
118 u64 i_send_hdrs_dma;
119 struct rds_iw_send_work *i_sends;
120
121 /* rx */
122 struct mutex i_recv_mutex;
123 struct rds_iw_work_ring i_recv_ring;
124 struct rds_iw_incoming *i_iwinc;
125 u32 i_recv_data_rem;
126 struct rds_header *i_recv_hdrs;
127 u64 i_recv_hdrs_dma;
128 struct rds_iw_recv_work *i_recvs;
129 struct rds_page_frag i_frag;
130 u64 i_ack_recv; /* last ACK received */
131
132 /* sending acks */
133 unsigned long i_ack_flags;
134 u64 i_ack_next; /* next ACK to send */
135 struct rds_header *i_ack;
136 struct ib_send_wr i_ack_wr;
137 struct ib_sge i_ack_sge;
138 u64 i_ack_dma;
139 unsigned long i_ack_queued;
140
141 /* Flow control related information
142 *
143 * Our algorithm uses a pair of variables that we need to access
144 * atomically - one for the send credits, and one for the posted
145 * recv credits we need to transfer to the remote.
146 * Rather than protect them using a slow spinlock, we put both into
147 * a single atomic_t and update it using cmpxchg
148 */
149 atomic_t i_credits;
150
151 /* Protocol version specific information */
152 unsigned int i_flowctl:1; /* enable/disable flow ctl */
153 unsigned int i_dma_local_lkey:1;
154 unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */
155 /* Batched completions */
156 unsigned int i_unsignaled_wrs;
157 long i_unsignaled_bytes;
158};
159
160/* This assumes that atomic_t is at least 32 bits */
161#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
162#define IB_GET_POST_CREDITS(v) ((v) >> 16)
163#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
164#define IB_SET_POST_CREDITS(v) ((v) << 16)
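/*
 * Illustrative sketch only - not part of this patch. It shows how a send
 * path could atomically take "wanted" send credits out of the packed
 * i_credits counter with cmpxchg, as described by the comment in
 * struct rds_iw_connection above; the function name is hypothetical and
 * the real logic lives in rds_iw_send_grab_credits().
 */
static inline int rds_iw_example_take_send_credits(atomic_t *credits, u32 wanted)
{
	unsigned int oldval, newval;

	do {
		oldval = atomic_read(credits);
		if (IB_GET_SEND_CREDITS(oldval) < wanted)
			return 0;	/* not enough send credits right now */
		/* Subtracting from the low 16 bits cannot borrow into the
		 * posted-credit half because of the check above. */
		newval = oldval - IB_SET_SEND_CREDITS(wanted);
	} while (atomic_cmpxchg(credits, oldval, newval) != oldval);

	return wanted;
}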
165
166struct rds_iw_cm_id {
167 struct list_head list;
168 struct rdma_cm_id *cm_id;
169};
170
171struct rds_iw_device {
172 struct list_head list;
173 struct list_head cm_id_list;
174 struct list_head conn_list;
175 struct ib_device *dev;
176 struct ib_pd *pd;
177 struct ib_mr *mr;
178 struct rds_iw_mr_pool *mr_pool;
179 int page_shift;
180 int max_sge;
181 unsigned int max_wrs;
182 unsigned int dma_local_lkey:1;
183 spinlock_t spinlock; /* protect the above */
184};
185
186/* bits for i_ack_flags */
187#define IB_ACK_IN_FLIGHT 0
188#define IB_ACK_REQUESTED 1
189
190/* Magic WR_ID for ACKs */
191#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL)
192#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL)
193#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL)
194
195struct rds_iw_statistics {
196 uint64_t s_iw_connect_raced;
197 uint64_t s_iw_listen_closed_stale;
198 uint64_t s_iw_tx_cq_call;
199 uint64_t s_iw_tx_cq_event;
200 uint64_t s_iw_tx_ring_full;
201 uint64_t s_iw_tx_throttle;
202 uint64_t s_iw_tx_sg_mapping_failure;
203 uint64_t s_iw_tx_stalled;
204 uint64_t s_iw_tx_credit_updates;
205 uint64_t s_iw_rx_cq_call;
206 uint64_t s_iw_rx_cq_event;
207 uint64_t s_iw_rx_ring_empty;
208 uint64_t s_iw_rx_refill_from_cq;
209 uint64_t s_iw_rx_refill_from_thread;
210 uint64_t s_iw_rx_alloc_limit;
211 uint64_t s_iw_rx_credit_updates;
212 uint64_t s_iw_ack_sent;
213 uint64_t s_iw_ack_send_failure;
214 uint64_t s_iw_ack_send_delayed;
215 uint64_t s_iw_ack_send_piggybacked;
216 uint64_t s_iw_ack_received;
217 uint64_t s_iw_rdma_mr_alloc;
218 uint64_t s_iw_rdma_mr_free;
219 uint64_t s_iw_rdma_mr_used;
220 uint64_t s_iw_rdma_mr_pool_flush;
221 uint64_t s_iw_rdma_mr_pool_wait;
222 uint64_t s_iw_rdma_mr_pool_depleted;
223};
224
225extern struct workqueue_struct *rds_iw_wq;
226
227/*
228 * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
229 * doesn't define them.
230 */
231static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
232 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
233{
234 unsigned int i;
235
236 for (i = 0; i < sg_dma_len; ++i) {
237 ib_dma_sync_single_for_cpu(dev,
238 ib_sg_dma_address(dev, &sg[i]),
239 ib_sg_dma_len(dev, &sg[i]),
240 direction);
241 }
242}
243#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu
244
245static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
246 struct scatterlist *sg, unsigned int sg_dma_len, int direction)
247{
248 unsigned int i;
249
250 for (i = 0; i < sg_dma_len; ++i) {
251 ib_dma_sync_single_for_device(dev,
252 ib_sg_dma_address(dev, &sg[i]),
253 ib_sg_dma_len(dev, &sg[i]),
254 direction);
255 }
256}
257#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device
258
259static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
260{
261 return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
262}
263
264/* iw.c */
265extern struct rds_transport rds_iw_transport;
266extern void rds_iw_add_one(struct ib_device *device);
267extern void rds_iw_remove_one(struct ib_device *device);
268extern struct ib_client rds_iw_client;
269
270extern unsigned int fastreg_pool_size;
271extern unsigned int fastreg_message_size;
272
273extern spinlock_t iw_nodev_conns_lock;
274extern struct list_head iw_nodev_conns;
275
276/* iw_cm.c */
277int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
278void rds_iw_conn_free(void *arg);
279int rds_iw_conn_connect(struct rds_connection *conn);
280void rds_iw_conn_shutdown(struct rds_connection *conn);
281void rds_iw_state_change(struct sock *sk);
282int __init rds_iw_listen_init(void);
283void rds_iw_listen_stop(void);
284void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
285int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
286 struct rdma_cm_event *event);
287int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
288void rds_iw_cm_connect_complete(struct rds_connection *conn,
289 struct rdma_cm_event *event);
290
291
292#define rds_iw_conn_error(conn, fmt...) \
293 __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
294
295/* iw_rdma.c */
296int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
297int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
298void rds_iw_remove_nodev_conns(void);
299void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev);
300struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
301void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
302void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
303void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
304 struct rds_sock *rs, u32 *key_ret);
305void rds_iw_sync_mr(void *trans_private, int dir);
306void rds_iw_free_mr(void *trans_private, int invalidate);
307void rds_iw_flush_mrs(void);
308void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
309
310/* iw_recv.c */
311int __init rds_iw_recv_init(void);
312void rds_iw_recv_exit(void);
313int rds_iw_recv(struct rds_connection *conn);
314int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
315 gfp_t page_gfp, int prefill);
316void rds_iw_inc_purge(struct rds_incoming *inc);
317void rds_iw_inc_free(struct rds_incoming *inc);
318int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
319 size_t size);
320void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
321void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
322void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
323void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
324void rds_iw_attempt_ack(struct rds_iw_connection *ic);
325void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
326u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
327
328/* iw_ring.c */
329void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
330void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
331u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
332void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
333void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
334int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
335int rds_iw_ring_low(struct rds_iw_work_ring *ring);
336u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
337u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
338extern wait_queue_head_t rds_iw_ring_empty_wait;
339
340/* iw_send.c */
341void rds_iw_xmit_complete(struct rds_connection *conn);
342int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
343 unsigned int hdr_off, unsigned int sg, unsigned int off);
344void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
345void rds_iw_send_init_ring(struct rds_iw_connection *ic);
346void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
347int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
348void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
349void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
350int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
351 u32 *adv_credits, int need_posted);
352
353/* iw_stats.c */
354DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
355#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
356unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
357 unsigned int avail);
358
359/* iw_sysctl.c */
360int __init rds_iw_sysctl_init(void);
361void rds_iw_sysctl_exit(void);
362extern unsigned long rds_iw_sysctl_max_send_wr;
363extern unsigned long rds_iw_sysctl_max_recv_wr;
364extern unsigned long rds_iw_sysctl_max_unsig_wrs;
365extern unsigned long rds_iw_sysctl_max_unsig_bytes;
366extern unsigned long rds_iw_sysctl_max_recv_allocation;
367extern unsigned int rds_iw_sysctl_flow_control;
368extern ctl_table rds_iw_sysctl_table[];
369
370/*
371 * Helper functions for getting/setting the header and data SGEs in
372 * RDS packets (not RDMA)
373 */
374static inline struct ib_sge *
375rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
376{
377 return &sge[0];
378}
379
380static inline struct ib_sge *
381rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
382{
383 return &sge[1];
384}
385
386static inline void rds_iw_set_64bit(u64 *ptr, u64 val)
387{
388#if BITS_PER_LONG == 64
389 *ptr = val;
390#else
391 set_64bit(ptr, val);
392#endif
393}
394
395#endif
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
new file mode 100644
index 000000000000..57ecb3d4b8a5
--- /dev/null
+++ b/net/rds/iw_cm.c
@@ -0,0 +1,750 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/vmalloc.h>
36
37#include "rds.h"
38#include "iw.h"
39
40/*
41 * Set the selected protocol version
42 */
43static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
44{
45 conn->c_version = version;
46}
47
48/*
49 * Set up flow control
50 */
51static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
52{
53 struct rds_iw_connection *ic = conn->c_transport_data;
54
55 if (rds_iw_sysctl_flow_control && credits != 0) {
56 /* We're doing flow control */
57 ic->i_flowctl = 1;
58 rds_iw_send_add_credits(conn, credits);
59 } else {
60 ic->i_flowctl = 0;
61 }
62}
63
64/*
65 * Connection established.
66 * We get here for both outgoing and incoming connection.
67 */
68void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
69{
70 const struct rds_iw_connect_private *dp = NULL;
71 struct rds_iw_connection *ic = conn->c_transport_data;
72 struct rds_iw_device *rds_iwdev;
73 int err;
74
75 if (event->param.conn.private_data_len) {
76 dp = event->param.conn.private_data;
77
78 rds_iw_set_protocol(conn,
79 RDS_PROTOCOL(dp->dp_protocol_major,
80 dp->dp_protocol_minor));
81 rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
82 }
83
84 /* update ib_device with this local ipaddr & conn */
85 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
86 err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
87 if (err)
88 printk(KERN_ERR "rds_iw_update_cm_id failed (%d)\n", err);
89 err = rds_iw_add_conn(rds_iwdev, conn);
90 if (err)
91 printk(KERN_ERR "rds_iw_add_conn failed (%d)\n", err);
92
93 /* If the peer gave us the last packet it saw, process this as if
94 * we had received a regular ACK. */
95 if (dp && dp->dp_ack_seq)
96 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
97
98 printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
99 &conn->c_laddr, &conn->c_faddr,
100 RDS_PROTOCOL_MAJOR(conn->c_version),
101 RDS_PROTOCOL_MINOR(conn->c_version),
102 ic->i_flowctl ? ", flow control" : "");
103
104 rds_connect_complete(conn);
105}
106
107static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
108 struct rdma_conn_param *conn_param,
109 struct rds_iw_connect_private *dp,
110 u32 protocol_version)
111{
112 struct rds_iw_connection *ic = conn->c_transport_data;
113
114 memset(conn_param, 0, sizeof(struct rdma_conn_param));
115 /* XXX tune these? */
116 conn_param->responder_resources = 1;
117 conn_param->initiator_depth = 1;
118
119 if (dp) {
120 memset(dp, 0, sizeof(*dp));
121 dp->dp_saddr = conn->c_laddr;
122 dp->dp_daddr = conn->c_faddr;
123 dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
124 dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
125 dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
126 dp->dp_ack_seq = rds_iw_piggyb_ack(ic);
127
128 /* Advertise flow control */
129 if (ic->i_flowctl) {
130 unsigned int credits;
131
132 credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
133 dp->dp_credit = cpu_to_be32(credits);
134 atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
135 }
136
137 conn_param->private_data = dp;
138 conn_param->private_data_len = sizeof(*dp);
139 }
140}
141
142static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
143{
144 rdsdebug("event %u data %p\n", event->event, data);
145}
146
147static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
148{
149 struct rds_connection *conn = data;
150 struct rds_iw_connection *ic = conn->c_transport_data;
151
152 rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
153
154 switch (event->event) {
155 case IB_EVENT_COMM_EST:
156 rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
157 break;
158 case IB_EVENT_QP_REQ_ERR:
159 case IB_EVENT_QP_FATAL:
160 default:
161 rds_iw_conn_error(conn, "RDS/IW: Fatal QP Event %u - connection %pI4->%pI4...reconnecting\n",
162 event->event, &conn->c_laddr,
163 &conn->c_faddr);
164 break;
165 }
166}
167
168/*
169 * Create a QP
170 */
171static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
172 struct rds_iw_device *rds_iwdev,
173 struct rds_iw_work_ring *send_ring,
174 void (*send_cq_handler)(struct ib_cq *, void *),
175 struct rds_iw_work_ring *recv_ring,
176 void (*recv_cq_handler)(struct ib_cq *, void *),
177 void *context)
178{
179 struct ib_device *dev = rds_iwdev->dev;
180 unsigned int send_size, recv_size;
181 int ret;
182
183 /* The offset of 1 is to accommodate the additional ACK WR. */
184 send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
185 recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
186 rds_iw_ring_resize(send_ring, send_size - 1);
187 rds_iw_ring_resize(recv_ring, recv_size - 1);
188
189 memset(attr, 0, sizeof(*attr));
190 attr->event_handler = rds_iw_qp_event_handler;
191 attr->qp_context = context;
192 attr->cap.max_send_wr = send_size;
193 attr->cap.max_recv_wr = recv_size;
194 attr->cap.max_send_sge = rds_iwdev->max_sge;
195 attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
196 attr->sq_sig_type = IB_SIGNAL_REQ_WR;
197 attr->qp_type = IB_QPT_RC;
198
199 attr->send_cq = ib_create_cq(dev, send_cq_handler,
200 rds_iw_cq_event_handler,
201 context, send_size, 0);
202 if (IS_ERR(attr->send_cq)) {
203 ret = PTR_ERR(attr->send_cq);
204 attr->send_cq = NULL;
205 rdsdebug("ib_create_cq send failed: %d\n", ret);
206 goto out;
207 }
208
209 attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
210 rds_iw_cq_event_handler,
211 context, recv_size, 0);
212 if (IS_ERR(attr->recv_cq)) {
213 ret = PTR_ERR(attr->recv_cq);
214 attr->recv_cq = NULL;
215 rdsdebug("ib_create_cq send failed: %d\n", ret);
216 goto out;
217 }
218
219 ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
220 if (ret) {
221 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
222 goto out;
223 }
224
225 ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
226 if (ret) {
227 rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
228 goto out;
229 }
230
231out:
232 if (ret) {
233 if (attr->send_cq)
234 ib_destroy_cq(attr->send_cq);
235 if (attr->recv_cq)
236 ib_destroy_cq(attr->recv_cq);
237 }
238 return ret;
239}
240
241/*
242 * This needs to be very careful to not leave IS_ERR pointers around for
243 * cleanup to trip over.
244 */
245static int rds_iw_setup_qp(struct rds_connection *conn)
246{
247 struct rds_iw_connection *ic = conn->c_transport_data;
248 struct ib_device *dev = ic->i_cm_id->device;
249 struct ib_qp_init_attr attr;
250 struct rds_iw_device *rds_iwdev;
251 int ret;
252
253 /* rds_iw_add_one creates a rds_iw_device object per IB device,
254 * and allocates a protection domain, memory range and MR pool
255 * for each. If that fails for any reason, it will not register
256 * the rds_iwdev at all.
257 */
258 rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
259 if (rds_iwdev == NULL) {
260 if (printk_ratelimit())
261 printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
262 dev->name);
263 return -EOPNOTSUPP;
264 }
265
266 /* Protection domain and memory range */
267 ic->i_pd = rds_iwdev->pd;
268 ic->i_mr = rds_iwdev->mr;
269
270 ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
271 &ic->i_send_ring, rds_iw_send_cq_comp_handler,
272 &ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
273 conn);
274 if (ret < 0)
275 goto out;
276
277 ic->i_send_cq = attr.send_cq;
278 ic->i_recv_cq = attr.recv_cq;
279
280 /*
281 * XXX this can fail if max_*_wr is too large? Are we supposed
282 * to back off until we get a value that the hardware can support?
283 */
284 ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
285 if (ret) {
286 rdsdebug("rdma_create_qp failed: %d\n", ret);
287 goto out;
288 }
289
290 ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
291 ic->i_send_ring.w_nr *
292 sizeof(struct rds_header),
293 &ic->i_send_hdrs_dma, GFP_KERNEL);
294 if (ic->i_send_hdrs == NULL) {
295 ret = -ENOMEM;
296 rdsdebug("ib_dma_alloc_coherent send failed\n");
297 goto out;
298 }
299
300 ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
301 ic->i_recv_ring.w_nr *
302 sizeof(struct rds_header),
303 &ic->i_recv_hdrs_dma, GFP_KERNEL);
304 if (ic->i_recv_hdrs == NULL) {
305 ret = -ENOMEM;
306 rdsdebug("ib_dma_alloc_coherent recv failed\n");
307 goto out;
308 }
309
310 ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
311 &ic->i_ack_dma, GFP_KERNEL);
312 if (ic->i_ack == NULL) {
313 ret = -ENOMEM;
314 rdsdebug("ib_dma_alloc_coherent ack failed\n");
315 goto out;
316 }
317
318 ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
319 if (ic->i_sends == NULL) {
320 ret = -ENOMEM;
321 rdsdebug("send allocation failed\n");
322 goto out;
323 }
324 rds_iw_send_init_ring(ic);
325
326 ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
327 if (ic->i_recvs == NULL) {
328 ret = -ENOMEM;
329 rdsdebug("recv allocation failed\n");
330 goto out;
331 }
332
333 rds_iw_recv_init_ring(ic);
334 rds_iw_recv_init_ack(ic);
335
336 /* Post receive buffers - as a side effect, this will update
337 * the posted credit count. */
338 rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
339
340 rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
341 ic->i_send_cq, ic->i_recv_cq);
342
343out:
344 return ret;
345}
346
347static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
348{
349 u16 common;
350 u32 version = 0;
351
352 /* rdma_cm private data is odd - when there is any private data in the
353 * request, we will be given a pretty large buffer without telling us the
354 * original size. The only way to tell the difference is by looking at
355 * the contents, which are initialized to zero.
356 * If the protocol version fields aren't set, this is a connection attempt
357 * from an older version. This could be 3.0 or 2.0 - we can't tell.
358 * We really should have changed this for OFED 1.3 :-( */
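	/*
	 * Worked example (assumed values): a peer advertising
	 * dp_protocol_major == 3 with dp_protocol_minor_mask == 0x0003 gives
	 * common == 0x0003 below, and the while loop advances the version to
	 * the highest set bit, i.e. minor 1 - so RDS 3.1 would be negotiated,
	 * assuming the minor number sits in the low bits of the version word.
	 */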
359 if (dp->dp_protocol_major == 0)
360 return RDS_PROTOCOL_3_0;
361
362 common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
363 if (dp->dp_protocol_major == 3 && common) {
364 version = RDS_PROTOCOL_3_0;
365 while ((common >>= 1) != 0)
366 version++;
367 } else if (printk_ratelimit()) {
368 printk(KERN_NOTICE "RDS: Connection from %pI4 using "
369 "incompatible protocol version %u.%u\n",
370 &dp->dp_saddr,
371 dp->dp_protocol_major,
372 dp->dp_protocol_minor);
373 }
374 return version;
375}
376
377int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
378 struct rdma_cm_event *event)
379{
380 const struct rds_iw_connect_private *dp = event->param.conn.private_data;
381 struct rds_iw_connect_private dp_rep;
382 struct rds_connection *conn = NULL;
383 struct rds_iw_connection *ic = NULL;
384 struct rdma_conn_param conn_param;
385 struct rds_iw_device *rds_iwdev;
386 u32 version;
387 int err, destroy = 1;
388
389 /* Check whether the remote protocol version matches ours. */
390 version = rds_iw_protocol_compatible(dp);
391 if (!version)
392 goto out;
393
394 rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
395 &dp->dp_saddr, &dp->dp_daddr,
396 RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
397
398 conn = rds_conn_create(dp->dp_daddr, dp->dp_saddr, &rds_iw_transport,
399 GFP_KERNEL);
400 if (IS_ERR(conn)) {
401 rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
402 conn = NULL;
403 goto out;
404 }
405
406 /*
407 * The connection request may occur while the
408 * previous connection exists, e.g. in case of failover.
409 * But as connections may be initiated simultaneously
410 * by both hosts, we have a random backoff mechanism -
411 * see the comment above rds_queue_reconnect()
412 */
413 mutex_lock(&conn->c_cm_lock);
414 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
415 if (rds_conn_state(conn) == RDS_CONN_UP) {
416 rdsdebug("incoming connect while connecting\n");
417 rds_conn_drop(conn);
418 rds_iw_stats_inc(s_iw_listen_closed_stale);
419 } else
420 if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
421 /* Wait and see - our connect may still be succeeding */
422 rds_iw_stats_inc(s_iw_connect_raced);
423 }
424 mutex_unlock(&conn->c_cm_lock);
425 goto out;
426 }
427
428 ic = conn->c_transport_data;
429
430 rds_iw_set_protocol(conn, version);
431 rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
432
433 /* If the peer gave us the last packet it saw, process this as if
434 * we had received a regular ACK. */
435 if (dp->dp_ack_seq)
436 rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
437
438 BUG_ON(cm_id->context);
439 BUG_ON(ic->i_cm_id);
440
441 ic->i_cm_id = cm_id;
442 cm_id->context = conn;
443
444 rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
445 ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
446
447 /* We got halfway through setting up the ib_connection; if we
448 * fail now, we have to take the long route out of this mess. */
449 destroy = 0;
450
451 err = rds_iw_setup_qp(conn);
452 if (err) {
453 rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
454 goto out;
455 }
456
457 rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
458
459 /* rdma_accept() calls rdma_reject() internally if it fails */
460 err = rdma_accept(cm_id, &conn_param);
461 mutex_unlock(&conn->c_cm_lock);
462 if (err) {
463 rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
464 goto out;
465 }
466
467 return 0;
468
469out:
470 rdma_reject(cm_id, NULL, 0);
471 return destroy;
472}
473
474
475int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
476{
477 struct rds_connection *conn = cm_id->context;
478 struct rds_iw_connection *ic = conn->c_transport_data;
479 struct rdma_conn_param conn_param;
480 struct rds_iw_connect_private dp;
481 int ret;
482
483 /* If the peer doesn't do protocol negotiation, we must
484 * default to RDSv3.0 */
485 rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
486 ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */
487
488 ret = rds_iw_setup_qp(conn);
489 if (ret) {
490 rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
491 goto out;
492 }
493
494 rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
495
496 ret = rdma_connect(cm_id, &conn_param);
497 if (ret)
498 rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
499
500out:
501 /* Beware - returning non-zero tells the rdma_cm to destroy
502 * the cm_id. We should certainly not do it as long as we still
503 * "own" the cm_id. */
504 if (ret) {
505 struct rds_iw_connection *ic = conn->c_transport_data;
506
507 if (ic->i_cm_id == cm_id)
508 ret = 0;
509 }
510 return ret;
511}
512
513int rds_iw_conn_connect(struct rds_connection *conn)
514{
515 struct rds_iw_connection *ic = conn->c_transport_data;
516 struct rds_iw_device *rds_iwdev;
517 struct sockaddr_in src, dest;
518 int ret;
519
520 /* XXX I wonder what effect the port space has */
521 /* delegate cm event handler to rdma_transport */
522 ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
523 RDMA_PS_TCP);
524 if (IS_ERR(ic->i_cm_id)) {
525 ret = PTR_ERR(ic->i_cm_id);
526 ic->i_cm_id = NULL;
527 rdsdebug("rdma_create_id() failed: %d\n", ret);
528 goto out;
529 }
530
531 rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
532
533 src.sin_family = AF_INET;
534 src.sin_addr.s_addr = (__force u32)conn->c_laddr;
535 src.sin_port = (__force u16)htons(0);
536
537 /* First, bind to the local address and device. */
538 ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
539 if (ret) {
540 rdsdebug("rdma_bind_addr(%pI4) failed: %d\n",
541 &conn->c_laddr, ret);
542 rdma_destroy_id(ic->i_cm_id);
543 ic->i_cm_id = NULL;
544 goto out;
545 }
546
547 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
548 ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
549
550 dest.sin_family = AF_INET;
551 dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
552 dest.sin_port = (__force u16)htons(RDS_PORT);
553
554 ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
555 (struct sockaddr *)&dest,
556 RDS_RDMA_RESOLVE_TIMEOUT_MS);
557 if (ret) {
558 rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
559 ret);
560 rdma_destroy_id(ic->i_cm_id);
561 ic->i_cm_id = NULL;
562 }
563
564out:
565 return ret;
566}
567
568/*
569 * This is so careful about only cleaning up resources that were built up
570 * so that it can be called at any point during startup. In fact it
571 * can be called multiple times for a given connection.
572 */
573void rds_iw_conn_shutdown(struct rds_connection *conn)
574{
575 struct rds_iw_connection *ic = conn->c_transport_data;
576 int err = 0;
577 struct ib_qp_attr qp_attr;
578
579 rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
580 ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
581 ic->i_cm_id ? ic->i_cm_id->qp : NULL);
582
583 if (ic->i_cm_id) {
584 struct ib_device *dev = ic->i_cm_id->device;
585
586 rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
587 err = rdma_disconnect(ic->i_cm_id);
588 if (err) {
589 /* Actually this may happen quite frequently, when
590 * an outgoing connect raced with an incoming connect.
591 */
592 rdsdebug("rds_iw_conn_shutdown: failed to disconnect,"
593 " cm: %p err %d\n", ic->i_cm_id, err);
594 }
595
596 if (ic->i_cm_id->qp) {
597 qp_attr.qp_state = IB_QPS_ERR;
598 ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
599 }
600
601 wait_event(rds_iw_ring_empty_wait,
602 rds_iw_ring_empty(&ic->i_send_ring) &&
603 rds_iw_ring_empty(&ic->i_recv_ring));
604
605 if (ic->i_send_hdrs)
606 ib_dma_free_coherent(dev,
607 ic->i_send_ring.w_nr *
608 sizeof(struct rds_header),
609 ic->i_send_hdrs,
610 ic->i_send_hdrs_dma);
611
612 if (ic->i_recv_hdrs)
613 ib_dma_free_coherent(dev,
614 ic->i_recv_ring.w_nr *
615 sizeof(struct rds_header),
616 ic->i_recv_hdrs,
617 ic->i_recv_hdrs_dma);
618
619 if (ic->i_ack)
620 ib_dma_free_coherent(dev, sizeof(struct rds_header),
621 ic->i_ack, ic->i_ack_dma);
622
623 if (ic->i_sends)
624 rds_iw_send_clear_ring(ic);
625 if (ic->i_recvs)
626 rds_iw_recv_clear_ring(ic);
627
628 if (ic->i_cm_id->qp)
629 rdma_destroy_qp(ic->i_cm_id);
630 if (ic->i_send_cq)
631 ib_destroy_cq(ic->i_send_cq);
632 if (ic->i_recv_cq)
633 ib_destroy_cq(ic->i_recv_cq);
634
635 /*
636 * If associated with an rds_iw_device:
637 * Move connection back to the nodev list.
638 * Remove cm_id from the device cm_id list.
639 */
640 if (ic->rds_iwdev) {
641
642 spin_lock_irq(&ic->rds_iwdev->spinlock);
643 BUG_ON(list_empty(&ic->iw_node));
644 list_del(&ic->iw_node);
645 spin_unlock_irq(&ic->rds_iwdev->spinlock);
646
647 spin_lock_irq(&iw_nodev_conns_lock);
648 list_add_tail(&ic->iw_node, &iw_nodev_conns);
649 spin_unlock_irq(&iw_nodev_conns_lock);
650 rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
651 ic->rds_iwdev = NULL;
652 }
653
654 rdma_destroy_id(ic->i_cm_id);
655
656 ic->i_cm_id = NULL;
657 ic->i_pd = NULL;
658 ic->i_mr = NULL;
659 ic->i_send_cq = NULL;
660 ic->i_recv_cq = NULL;
661 ic->i_send_hdrs = NULL;
662 ic->i_recv_hdrs = NULL;
663 ic->i_ack = NULL;
664 }
665 BUG_ON(ic->rds_iwdev);
666
667 /* Clear pending transmit */
668 if (ic->i_rm) {
669 rds_message_put(ic->i_rm);
670 ic->i_rm = NULL;
671 }
672
673 /* Clear the ACK state */
674 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
675 rds_iw_set_64bit(&ic->i_ack_next, 0);
676 ic->i_ack_recv = 0;
677
678 /* Clear flow control state */
679 ic->i_flowctl = 0;
680 atomic_set(&ic->i_credits, 0);
681
682 rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
683 rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
684
685 if (ic->i_iwinc) {
686 rds_inc_put(&ic->i_iwinc->ii_inc);
687 ic->i_iwinc = NULL;
688 }
689
690 vfree(ic->i_sends);
691 ic->i_sends = NULL;
692 vfree(ic->i_recvs);
693 ic->i_recvs = NULL;
694 rdsdebug("shutdown complete\n");
695}
696
697int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
698{
699 struct rds_iw_connection *ic;
700 unsigned long flags;
701
702 /* XXX too lazy? */
703 ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL);
704 if (ic == NULL)
705 return -ENOMEM;
706
707 INIT_LIST_HEAD(&ic->iw_node);
708 mutex_init(&ic->i_recv_mutex);
709
710 /*
711 * rds_iw_conn_shutdown() waits for these to be emptied so they
712 * must be initialized before it can be called.
713 */
714 rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
715 rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
716
717 ic->conn = conn;
718 conn->c_transport_data = ic;
719
720 spin_lock_irqsave(&iw_nodev_conns_lock, flags);
721 list_add_tail(&ic->iw_node, &iw_nodev_conns);
722 spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
723
724
725 rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
726 return 0;
727}
728
729void rds_iw_conn_free(void *arg)
730{
731 struct rds_iw_connection *ic = arg;
732 rdsdebug("ic %p\n", ic);
733 list_del(&ic->iw_node);
734 kfree(ic);
735}
736
737/*
738 * An error occurred on the connection
739 */
740void
741__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
742{
743 va_list ap;
744
745 rds_conn_drop(conn);
746
747 va_start(ap, fmt);
748 vprintk(fmt, ap);
749 va_end(ap);
750}
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
new file mode 100644
index 000000000000..1c02a8f952d0
--- /dev/null
+++ b/net/rds/iw_rdma.c
@@ -0,0 +1,888 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "rdma.h"
37#include "iw.h"
38
39
40/*
41 * This is stored as mr->r_trans_private.
42 */
43struct rds_iw_mr {
44 struct rds_iw_device *device;
45 struct rds_iw_mr_pool *pool;
46 struct rdma_cm_id *cm_id;
47
48 struct ib_mr *mr;
49 struct ib_fast_reg_page_list *page_list;
50
51 struct rds_iw_mapping mapping;
52 unsigned char remap_count;
53};
54
55/*
56 * Our own little MR pool
57 */
58struct rds_iw_mr_pool {
59 struct rds_iw_device *device; /* back ptr to the device that owns us */
60
61 struct mutex flush_lock; /* serialize fmr invalidate */
62 struct work_struct flush_worker; /* flush worker */
63
64 spinlock_t list_lock; /* protect variables below */
65 atomic_t item_count; /* total # of MRs */
66 atomic_t dirty_count; /* # of dirty MRs */
67 struct list_head dirty_list; /* dirty mappings */
68 struct list_head clean_list; /* unused & unmapped MRs */
69 atomic_t free_pinned; /* memory pinned by free MRs */
70 unsigned long max_message_size; /* in pages */
71 unsigned long max_items;
72 unsigned long max_items_soft;
73 unsigned long max_free_pinned;
74 int max_pages;
75};
76
77static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
78static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
79static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
80static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
81 struct rds_iw_mr *ibmr,
82 struct scatterlist *sg, unsigned int nents);
83static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
84static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
85 struct list_head *unmap_list,
86 struct list_head *kill_list);
87static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
88
89static int rds_iw_get_device(struct rds_sock *rs, struct rds_iw_device **rds_iwdev, struct rdma_cm_id **cm_id)
90{
91 struct rds_iw_device *iwdev;
92 struct rds_iw_cm_id *i_cm_id;
93
94 *rds_iwdev = NULL;
95 *cm_id = NULL;
96
97 list_for_each_entry(iwdev, &rds_iw_devices, list) {
98 spin_lock_irq(&iwdev->spinlock);
99 list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
100 struct sockaddr_in *src_addr, *dst_addr;
101
102 src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
103 dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
104
105 rdsdebug("local ipaddr = %x port %d, "
106 "remote ipaddr = %x port %d"
107 "..looking for %x port %d, "
108 "remote ipaddr = %x port %d\n",
109 src_addr->sin_addr.s_addr,
110 src_addr->sin_port,
111 dst_addr->sin_addr.s_addr,
112 dst_addr->sin_port,
113 rs->rs_bound_addr,
114 rs->rs_bound_port,
115 rs->rs_conn_addr,
116 rs->rs_conn_port);
117#ifdef WORKING_TUPLE_DETECTION
118 if (src_addr->sin_addr.s_addr == rs->rs_bound_addr &&
119 src_addr->sin_port == rs->rs_bound_port &&
120 dst_addr->sin_addr.s_addr == rs->rs_conn_addr &&
121 dst_addr->sin_port == rs->rs_conn_port) {
122#else
123 /* FIXME - needs to compare the local and remote
124 * ipaddr/port tuple, but the ipaddr is the only
125 * available information in the rds_sock (as the rest are
126 * zeroed). It doesn't appear to be properly populated
127 * during connection setup...
128 */
129 if (src_addr->sin_addr.s_addr == rs->rs_bound_addr) {
130#endif
131 spin_unlock_irq(&iwdev->spinlock);
132 *rds_iwdev = iwdev;
133 *cm_id = i_cm_id->cm_id;
134 return 0;
135 }
136 }
137 spin_unlock_irq(&iwdev->spinlock);
138 }
139
140 return 1;
141}
142
143static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
144{
145 struct rds_iw_cm_id *i_cm_id;
146
147 i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL);
148 if (!i_cm_id)
149 return -ENOMEM;
150
151 i_cm_id->cm_id = cm_id;
152
153 spin_lock_irq(&rds_iwdev->spinlock);
154 list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list);
155 spin_unlock_irq(&rds_iwdev->spinlock);
156
157 return 0;
158}
159
160void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
161{
162 struct rds_iw_cm_id *i_cm_id;
163
164 spin_lock_irq(&rds_iwdev->spinlock);
165 list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) {
166 if (i_cm_id->cm_id == cm_id) {
167 list_del(&i_cm_id->list);
168 kfree(i_cm_id);
169 break;
170 }
171 }
172 spin_unlock_irq(&rds_iwdev->spinlock);
173}
174
175
176int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
177{
178 struct sockaddr_in *src_addr, *dst_addr;
179 struct rds_iw_device *rds_iwdev_old;
180 struct rds_sock rs;
181 struct rdma_cm_id *pcm_id;
182 int rc;
183
184 src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
185 dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
186
187 rs.rs_bound_addr = src_addr->sin_addr.s_addr;
188 rs.rs_bound_port = src_addr->sin_port;
189 rs.rs_conn_addr = dst_addr->sin_addr.s_addr;
190 rs.rs_conn_port = dst_addr->sin_port;
191
192 rc = rds_iw_get_device(&rs, &rds_iwdev_old, &pcm_id);
193 if (rc)
194 rds_iw_remove_cm_id(rds_iwdev, cm_id);
195
196 return rds_iw_add_cm_id(rds_iwdev, cm_id);
197}
198
199int rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
200{
201 struct rds_iw_connection *ic = conn->c_transport_data;
202
203 /* conn was previously on the nodev_conns_list */
204 spin_lock_irq(&iw_nodev_conns_lock);
205 BUG_ON(list_empty(&iw_nodev_conns));
206 BUG_ON(list_empty(&ic->iw_node));
207 list_del(&ic->iw_node);
208 spin_unlock_irq(&iw_nodev_conns_lock);
209
210 spin_lock_irq(&rds_iwdev->spinlock);
211 list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
212 spin_unlock_irq(&rds_iwdev->spinlock);
213
214 ic->rds_iwdev = rds_iwdev;
215
216 return 0;
217}
218
219void rds_iw_remove_nodev_conns(void)
220{
221 struct rds_iw_connection *ic, *_ic;
222 LIST_HEAD(tmp_list);
223
224 /* avoid calling conn_destroy with irqs off */
225 spin_lock_irq(&iw_nodev_conns_lock);
226 list_splice(&iw_nodev_conns, &tmp_list);
227 INIT_LIST_HEAD(&iw_nodev_conns);
228 spin_unlock_irq(&iw_nodev_conns_lock);
229
230 list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
231 if (ic->conn->c_passive)
232 rds_conn_destroy(ic->conn->c_passive);
233 rds_conn_destroy(ic->conn);
234 }
235}
236
237void rds_iw_remove_conns(struct rds_iw_device *rds_iwdev)
238{
239 struct rds_iw_connection *ic, *_ic;
240 LIST_HEAD(tmp_list);
241
242 /* avoid calling conn_destroy with irqs off */
243 spin_lock_irq(&rds_iwdev->spinlock);
244 list_splice(&rds_iwdev->conn_list, &tmp_list);
245 INIT_LIST_HEAD(&rds_iwdev->conn_list);
246 spin_unlock_irq(&rds_iwdev->spinlock);
247
248 list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node) {
249 if (ic->conn->c_passive)
250 rds_conn_destroy(ic->conn->c_passive);
251 rds_conn_destroy(ic->conn);
252 }
253}
254
255static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
256 struct scatterlist *list, unsigned int sg_len)
257{
258 sg->list = list;
259 sg->len = sg_len;
260 sg->dma_len = 0;
261 sg->dma_npages = 0;
262 sg->bytes = 0;
263}
264
265static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
266 struct rds_iw_scatterlist *sg,
267 unsigned int dma_page_shift)
268{
269 struct ib_device *dev = rds_iwdev->dev;
270 u64 *dma_pages = NULL;
271 u64 dma_mask;
272 unsigned int dma_page_size;
273 int i, j, ret;
274
275 dma_page_size = 1 << dma_page_shift;
276 dma_mask = dma_page_size - 1;
277
278 WARN_ON(sg->dma_len);
279
280 sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
281 if (unlikely(!sg->dma_len)) {
282 printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
283 return ERR_PTR(-EBUSY);
284 }
285
286 sg->bytes = 0;
287 sg->dma_npages = 0;
288
289 ret = -EINVAL;
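	/*
	 * Walk the mapped scatterlist and count how many dma_page_size pages
	 * the fastreg page list will need. Only the first entry may start on
	 * an unaligned address and only the last may end on one; an interior
	 * misalignment would leave a hole in the contiguous mapping, so it is
	 * rejected with the -EINVAL set above.
	 */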
290 for (i = 0; i < sg->dma_len; ++i) {
291 unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
292 u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
293 u64 end_addr;
294
295 sg->bytes += dma_len;
296
297 end_addr = dma_addr + dma_len;
298 if (dma_addr & dma_mask) {
299 if (i > 0)
300 goto out_unmap;
301 dma_addr &= ~dma_mask;
302 }
303 if (end_addr & dma_mask) {
304 if (i < sg->dma_len - 1)
305 goto out_unmap;
306 end_addr = (end_addr + dma_mask) & ~dma_mask;
307 }
308
309 sg->dma_npages += (end_addr - dma_addr) >> dma_page_shift;
310 }
311
312 /* Now gather the dma addrs into one list */
313 if (sg->dma_npages > fastreg_message_size)
314 goto out_unmap;
315
316 dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC);
317 if (!dma_pages) {
318 ret = -ENOMEM;
319 goto out_unmap;
320 }
321
322 for (i = j = 0; i < sg->dma_len; ++i) {
323 unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
324 u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
325 u64 end_addr;
326
327 end_addr = dma_addr + dma_len;
328 dma_addr &= ~dma_mask;
329 for (; dma_addr < end_addr; dma_addr += dma_page_size)
330 dma_pages[j++] = dma_addr;
331 BUG_ON(j > sg->dma_npages);
332 }
333
334 return dma_pages;
335
336out_unmap:
337 ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
338 sg->dma_len = 0;
339 kfree(dma_pages);
340 return ERR_PTR(ret);
341}
342
343
344struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
345{
346 struct rds_iw_mr_pool *pool;
347
348 pool = kzalloc(sizeof(*pool), GFP_KERNEL);
349 if (!pool) {
350 printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
351 return ERR_PTR(-ENOMEM);
352 }
353
354 pool->device = rds_iwdev;
355 INIT_LIST_HEAD(&pool->dirty_list);
356 INIT_LIST_HEAD(&pool->clean_list);
357 mutex_init(&pool->flush_lock);
358 spin_lock_init(&pool->list_lock);
359 INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker);
360
361 pool->max_message_size = fastreg_message_size;
362 pool->max_items = fastreg_pool_size;
363 pool->max_free_pinned = pool->max_items * pool->max_message_size / 4;
364 pool->max_pages = fastreg_message_size;
365
366 /* We never allow more than max_items MRs to be allocated.
367 * When we exceed max_items_soft, we start freeing
368 * items more aggressively.
369 * Make sure that max_items > max_items_soft > max_items / 2
370 */
371 pool->max_items_soft = pool->max_items * 3 / 4;
372
373 return pool;
374}
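/*
 * Worked example with the module defaults visible in iw.c (assuming they
 * are not overridden at load time): fastreg_pool_size = 2048 and
 * fastreg_message_size = 21 give max_items = 2048, max_items_soft = 1536
 * and max_free_pinned = 2048 * 21 / 4 = 10752 pages.
 */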
375
376void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
377{
378 struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
379
380 iinfo->rdma_mr_max = pool->max_items;
381 iinfo->rdma_mr_size = pool->max_pages;
382}
383
384void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
385{
386 flush_workqueue(rds_wq);
387 rds_iw_flush_mr_pool(pool, 1);
388 BUG_ON(atomic_read(&pool->item_count));
389 BUG_ON(atomic_read(&pool->free_pinned));
390 kfree(pool);
391}
392
393static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
394{
395 struct rds_iw_mr *ibmr = NULL;
396 unsigned long flags;
397
398 spin_lock_irqsave(&pool->list_lock, flags);
399 if (!list_empty(&pool->clean_list)) {
400 ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list);
401 list_del_init(&ibmr->mapping.m_list);
402 }
403 spin_unlock_irqrestore(&pool->list_lock, flags);
404
405 return ibmr;
406}
407
408static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
409{
410 struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
411 struct rds_iw_mr *ibmr = NULL;
412 int err = 0, iter = 0;
413
414 while (1) {
415 ibmr = rds_iw_reuse_fmr(pool);
416 if (ibmr)
417 return ibmr;
418
419 /* No clean MRs - now we have the choice of either
420 * allocating a fresh MR up to the limit imposed by the
421 * driver, or flushing any dirty unused MRs.
422 * We try to avoid stalling in the send path if possible,
423 * so we allocate as long as we're allowed to.
424 *
425 * We're fussy with enforcing the FMR limit, though. If the driver
426 * tells us we can't use more than N fmrs, we shouldn't start
427 * arguing with it */
428 if (atomic_inc_return(&pool->item_count) <= pool->max_items)
429 break;
430
431 atomic_dec(&pool->item_count);
432
433 if (++iter > 2) {
434 rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
435 return ERR_PTR(-EAGAIN);
436 }
437
438 /* We do have some empty MRs. Flush them out. */
439 rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
440 rds_iw_flush_mr_pool(pool, 0);
441 }
442
443 ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
444 if (!ibmr) {
445 err = -ENOMEM;
446 goto out_no_cigar;
447 }
448
449 spin_lock_init(&ibmr->mapping.m_lock);
450 INIT_LIST_HEAD(&ibmr->mapping.m_list);
451 ibmr->mapping.m_mr = ibmr;
452
453 err = rds_iw_init_fastreg(pool, ibmr);
454 if (err)
455 goto out_no_cigar;
456
457 rds_iw_stats_inc(s_iw_rdma_mr_alloc);
458 return ibmr;
459
460out_no_cigar:
461 if (ibmr) {
462 rds_iw_destroy_fastreg(pool, ibmr);
463 kfree(ibmr);
464 }
465 atomic_dec(&pool->item_count);
466 return ERR_PTR(err);
467}
468
469void rds_iw_sync_mr(void *trans_private, int direction)
470{
471 struct rds_iw_mr *ibmr = trans_private;
472 struct rds_iw_device *rds_iwdev = ibmr->device;
473
474 switch (direction) {
475 case DMA_FROM_DEVICE:
476 ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
477 ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
478 break;
479 case DMA_TO_DEVICE:
480 ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
481 ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
482 break;
483 }
484}
485
486static inline unsigned int rds_iw_flush_goal(struct rds_iw_mr_pool *pool, int free_all)
487{
488 unsigned int item_count;
489
490 item_count = atomic_read(&pool->item_count);
491 if (free_all)
492 return item_count;
493
494 return 0;
495}
496
497/*
498 * Flush our pool of MRs.
499 * At a minimum, all currently unused MRs are unmapped.
500 * If the number of MRs allocated exceeds the limit, we also try
501 * to free as many MRs as needed to get back to this limit.
502 */
503static int rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
504{
505 struct rds_iw_mr *ibmr, *next;
506 LIST_HEAD(unmap_list);
507 LIST_HEAD(kill_list);
508 unsigned long flags;
509 unsigned int nfreed = 0, ncleaned = 0, free_goal;
510 int ret = 0;
511
512 rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
513
514 mutex_lock(&pool->flush_lock);
515
516 spin_lock_irqsave(&pool->list_lock, flags);
517 /* Get the list of all mappings to be destroyed */
518 list_splice_init(&pool->dirty_list, &unmap_list);
519 if (free_all)
520 list_splice_init(&pool->clean_list, &kill_list);
521 spin_unlock_irqrestore(&pool->list_lock, flags);
522
523 free_goal = rds_iw_flush_goal(pool, free_all);
524
525 /* Batched invalidate of dirty MRs.
526 * For FMR based MRs, the mappings on the unmap list are
527 * actually members of an ibmr (ibmr->mapping). They either
528 * migrate to the kill_list, or have been cleaned and should be
529 * moved to the clean_list.
530 * For fastregs, they will be dynamically allocated, and
531 * will be destroyed by the unmap function.
532 */
533 if (!list_empty(&unmap_list)) {
534 ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list, &kill_list);
535 /* If we've been asked to destroy all MRs, move those
536 * that were simply cleaned to the kill list */
537 if (free_all)
538 list_splice_init(&unmap_list, &kill_list);
539 }
540
541 /* Destroy any MRs that are past their best before date */
542 list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
543 rds_iw_stats_inc(s_iw_rdma_mr_free);
544 list_del(&ibmr->mapping.m_list);
545 rds_iw_destroy_fastreg(pool, ibmr);
546 kfree(ibmr);
547 nfreed++;
548 }
549
550 /* Any mappings that remain are laundered ibmrs, which we can add
551 * back to the clean list. */
552 if (!list_empty(&unmap_list)) {
553 spin_lock_irqsave(&pool->list_lock, flags);
554 list_splice(&unmap_list, &pool->clean_list);
555 spin_unlock_irqrestore(&pool->list_lock, flags);
556 }
557
558 atomic_sub(ncleaned, &pool->dirty_count);
559 atomic_sub(nfreed, &pool->item_count);
560
561 mutex_unlock(&pool->flush_lock);
562 return ret;
563}
564
565static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
566{
567 struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
568
569 rds_iw_flush_mr_pool(pool, 0);
570}
571
572void rds_iw_free_mr(void *trans_private, int invalidate)
573{
574 struct rds_iw_mr *ibmr = trans_private;
575 struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
576
577 rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
578 if (!pool)
579 return;
580
581 /* Return it to the pool's free list */
582 rds_iw_free_fastreg(pool, ibmr);
583
584 /* If we've pinned too many pages, request a flush */
585 if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned
586 || atomic_read(&pool->dirty_count) >= pool->max_items / 10)
587 queue_work(rds_wq, &pool->flush_worker);
588
589 if (invalidate) {
590 if (likely(!in_interrupt())) {
591 rds_iw_flush_mr_pool(pool, 0);
592 } else {
593 /* We get here if the user created an MR marked
594 * as use_once and invalidate at the same time. */
595 queue_work(rds_wq, &pool->flush_worker);
596 }
597 }
598}
599
600void rds_iw_flush_mrs(void)
601{
602 struct rds_iw_device *rds_iwdev;
603
604 list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
605 struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
606
607 if (pool)
608 rds_iw_flush_mr_pool(pool, 0);
609 }
610}
611
612void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
613 struct rds_sock *rs, u32 *key_ret)
614{
615 struct rds_iw_device *rds_iwdev;
616 struct rds_iw_mr *ibmr = NULL;
617 struct rdma_cm_id *cm_id;
618 int ret;
619
620 ret = rds_iw_get_device(rs, &rds_iwdev, &cm_id);
621 if (ret || !cm_id) {
622 ret = -ENODEV;
623 goto out;
624 }
625
626 if (!rds_iwdev->mr_pool) {
627 ret = -ENODEV;
628 goto out;
629 }
630
631 ibmr = rds_iw_alloc_mr(rds_iwdev);
632 if (IS_ERR(ibmr))
633 return ibmr;
634
635 ibmr->cm_id = cm_id;
636 ibmr->device = rds_iwdev;
637
638 ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents);
639 if (ret == 0)
640 *key_ret = ibmr->mr->rkey;
641 else
642 printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
643
644out:
645 if (ret) {
646 if (ibmr)
647 rds_iw_free_mr(ibmr, 0);
648 ibmr = ERR_PTR(ret);
649 }
650 return ibmr;
651}
652
653/*
654 * iWARP fastreg handling
655 *
656 * The life cycle of a fastreg registration is a bit different from that
657 * of FMRs.
658 * The idea behind fastreg is to have one MR, to which we bind different
659 * mappings over time. To avoid stalling on the expensive map and invalidate
660 * operations, these operations are pipelined on the same send queue on
661 * which we want to send the message containing the r_key.
662 *
663 * This creates a bit of a problem for us, as we do not have the destination
664 * IP in GET_MR, so the connection must be set up prior to the GET_MR call
665 * for RDMA to be set up correctly. If a fastreg request is present,
666 * rds_iw_xmit will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR
667 * work request before queuing the SEND. When completions for these arrive,
668 * a bit is set in the MR showing that RDMA can be performed.
669 *
670 * There is another interesting aspect that's related to invalidation.
671 * The application can request that a mapping is invalidated in FREE_MR.
672 * The expectation there is that this invalidation step includes ALL
673 * PREVIOUSLY FREED MRs.
674 */
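/*
 * For illustration only, here is the per-message pipelining described above,
 * expressed in terms of the helpers defined later in this file (the mapping
 * of steps to helpers is an approximation, not a literal call sequence):
 *
 *	LOCAL_INV(old rkey)	- rds_iw_rdma_fastreg_inv(), only if needed
 *	FAST_REG_MR(new rkey)	- rds_iw_rdma_build_fastreg()
 *	SEND(message with rkey)	- posted afterwards by rds_iw_xmit()
 *
 * All three are posted on the same send queue, so the invalidate and
 * re-registration hide behind the message send instead of stalling it.
 */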
675static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
676 struct rds_iw_mr *ibmr)
677{
678 struct rds_iw_device *rds_iwdev = pool->device;
679 struct ib_fast_reg_page_list *page_list = NULL;
680 struct ib_mr *mr;
681 int err;
682
683 mr = ib_alloc_fast_reg_mr(rds_iwdev->pd, pool->max_message_size);
684 if (IS_ERR(mr)) {
685 err = PTR_ERR(mr);
686
687 printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed (err=%d)\n", err);
688 return err;
689 }
690
691 /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages
692 * are not filled in yet.
693 */
694 page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size);
695 if (IS_ERR(page_list)) {
696 err = PTR_ERR(page_list);
697
698 printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err);
699 ib_dereg_mr(mr);
700 return err;
701 }
702
703 ibmr->page_list = page_list;
704 ibmr->mr = mr;
705 return 0;
706}
707
708static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
709{
710 struct rds_iw_mr *ibmr = mapping->m_mr;
711 struct ib_send_wr f_wr, *failed_wr;
712 int ret;
713
714 /*
715 * Perform a WR for the fast_reg_mr. Each individual page
716 * in the sg list is added to the fast reg page list and placed
717 * inside the fast_reg_mr WR. The key used is a rolling 8-bit
718 * counter, which should guarantee uniqueness.
719 */
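	/*
	 * A minimal sketch of the effect, assuming the usual
	 * ib_update_fast_reg_key() behaviour of replacing only the low
	 * eight bits of the key:
	 *
	 *	rkey = (rkey & 0xffffff00) | (remap_count & 0xff);
	 *
	 * so each remap of this MR hands out a distinct r_key, cycling
	 * every 256 remaps.
	 */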
720 ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
721 mapping->m_rkey = ibmr->mr->rkey;
722
723 memset(&f_wr, 0, sizeof(f_wr));
724 f_wr.wr_id = RDS_IW_FAST_REG_WR_ID;
725 f_wr.opcode = IB_WR_FAST_REG_MR;
726 f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
727 f_wr.wr.fast_reg.rkey = mapping->m_rkey;
728 f_wr.wr.fast_reg.page_list = ibmr->page_list;
729 f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
730 f_wr.wr.fast_reg.page_shift = ibmr->device->page_shift;
731 f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
732 IB_ACCESS_REMOTE_READ |
733 IB_ACCESS_REMOTE_WRITE;
734 f_wr.wr.fast_reg.iova_start = 0;
735 f_wr.send_flags = IB_SEND_SIGNALED;
736
737 failed_wr = &f_wr;
738 ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
739 BUG_ON(failed_wr != &f_wr);
740 if (ret && printk_ratelimit())
741 printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
742 __func__, __LINE__, ret);
743 return ret;
744}
745
746static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
747{
748 struct ib_send_wr s_wr, *failed_wr;
749 int ret = 0;
750
751 if (!ibmr->cm_id->qp || !ibmr->mr)
752 goto out;
753
754 memset(&s_wr, 0, sizeof(s_wr));
755 s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
756 s_wr.opcode = IB_WR_LOCAL_INV;
757 s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
758 s_wr.send_flags = IB_SEND_SIGNALED;
759
760 failed_wr = &s_wr;
761 ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
762 if (ret && printk_ratelimit()) {
763 printk(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
764 __func__, __LINE__, ret);
765 goto out;
766 }
767out:
768 return ret;
769}
770
771static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
772 struct rds_iw_mr *ibmr,
773 struct scatterlist *sg,
774 unsigned int sg_len)
775{
776 struct rds_iw_device *rds_iwdev = pool->device;
777 struct rds_iw_mapping *mapping = &ibmr->mapping;
778 u64 *dma_pages;
779 int i, ret = 0;
780
781 rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
782
783 dma_pages = rds_iw_map_scatterlist(rds_iwdev,
784 &mapping->m_sg,
785 rds_iwdev->page_shift);
786 if (IS_ERR(dma_pages)) {
787 ret = PTR_ERR(dma_pages);
788 dma_pages = NULL;
789 goto out;
790 }
791
792 if (mapping->m_sg.dma_len > pool->max_message_size) {
793 ret = -EMSGSIZE;
794 goto out;
795 }
796
797 for (i = 0; i < mapping->m_sg.dma_npages; ++i)
798 ibmr->page_list->page_list[i] = dma_pages[i];
799
800 ret = rds_iw_rdma_build_fastreg(mapping);
801 if (ret)
802 goto out;
803
804 rds_iw_stats_inc(s_iw_rdma_mr_used);
805
806out:
807 kfree(dma_pages);
808
809 return ret;
810}
811
812/*
813 * "Free" a fastreg MR.
814 */
815static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
816 struct rds_iw_mr *ibmr)
817{
818 unsigned long flags;
819 int ret;
820
821 if (!ibmr->mapping.m_sg.dma_len)
822 return;
823
824 ret = rds_iw_rdma_fastreg_inv(ibmr);
825 if (ret)
826 return;
827
828 /* Try to post the LOCAL_INV WR to the queue. */
829 spin_lock_irqsave(&pool->list_lock, flags);
830
831 list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
832 atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
833 atomic_inc(&pool->dirty_count);
834
835 spin_unlock_irqrestore(&pool->list_lock, flags);
836}
837
838static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
839 struct list_head *unmap_list,
840 struct list_head *kill_list)
841{
842 struct rds_iw_mapping *mapping, *next;
843 unsigned int ncleaned = 0;
844 LIST_HEAD(laundered);
845
846 /* Batched invalidation of fastreg MRs.
847 * Why do we do it this way, even though we could pipeline unmap
848 * and remap? The reason is the application semantics - when the
849 * application requests an invalidation of MRs, it expects all
850 * previously released R_Keys to become invalid.
851 *
852 * If we implement MR reuse naively, we risk memory corruption
853 * (this has actually been observed). So the default behavior
854 * requires that an MR go through an explicit unmap operation before
855 * we can reuse it.
856 *
857 * We could probably improve on this a little by allowing immediate
858 * reuse of an MR on the same socket (e.g. you could add a small
859 * cache of unused MRs to struct rds_sock - GET_MR could grab one
860 * of these without requiring an explicit invalidate).
861 */
862 while (!list_empty(unmap_list)) {
863 unsigned long flags;
864
865 spin_lock_irqsave(&pool->list_lock, flags);
866 list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
867 list_move(&mapping->m_list, &laundered);
868 ncleaned++;
869 }
870 spin_unlock_irqrestore(&pool->list_lock, flags);
871 }
872
873 /* Move all laundered mappings back to the unmap list.
874 * We do not kill any WRs right now - it doesn't seem the
875 * fastreg API has a max_remap limit. */
876 list_splice_init(&laundered, unmap_list);
877
878 return ncleaned;
879}
880
881static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
882 struct rds_iw_mr *ibmr)
883{
884 if (ibmr->page_list)
885 ib_free_fast_reg_page_list(ibmr->page_list);
886 if (ibmr->mr)
887 ib_dereg_mr(ibmr->mr);
888}
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
new file mode 100644
index 000000000000..a1931f0027a2
--- /dev/null
+++ b/net/rds/iw_recv.c
@@ -0,0 +1,869 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/pci.h>
35#include <linux/dma-mapping.h>
36#include <rdma/rdma_cm.h>
37
38#include "rds.h"
39#include "iw.h"
40
41static struct kmem_cache *rds_iw_incoming_slab;
42static struct kmem_cache *rds_iw_frag_slab;
43static atomic_t rds_iw_allocation = ATOMIC_INIT(0);
44
45static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
46{
47 rdsdebug("frag %p page %p\n", frag, frag->f_page);
48 __free_page(frag->f_page);
49 frag->f_page = NULL;
50}
51
52static void rds_iw_frag_free(struct rds_page_frag *frag)
53{
54 rdsdebug("frag %p page %p\n", frag, frag->f_page);
55 BUG_ON(frag->f_page != NULL);
56 kmem_cache_free(rds_iw_frag_slab, frag);
57}
58
59/*
60 * We map a page at a time. Its fragments are posted in order. This
61 * is called in fragment order as the fragments get completion events.
62 * Only the last frag in the page performs the unmapping.
63 *
64 * It's OK for ring cleanup to call this in whatever order it likes because
65 * DMA is not in flight and so we can unmap while other ring entries still
66 * hold page references in their frags.
67 */
68static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
69 struct rds_iw_recv_work *recv)
70{
71 struct rds_page_frag *frag = recv->r_frag;
72
73 rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
74 if (frag->f_mapped)
75 ib_dma_unmap_page(ic->i_cm_id->device,
76 frag->f_mapped,
77 RDS_FRAG_SIZE, DMA_FROM_DEVICE);
78 frag->f_mapped = 0;
79}
80
81void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
82{
83 struct rds_iw_recv_work *recv;
84 u32 i;
85
86 for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
87 struct ib_sge *sge;
88
89 recv->r_iwinc = NULL;
90 recv->r_frag = NULL;
91
92 recv->r_wr.next = NULL;
93 recv->r_wr.wr_id = i;
94 recv->r_wr.sg_list = recv->r_sge;
95 recv->r_wr.num_sge = RDS_IW_RECV_SGE;
96
97 sge = rds_iw_data_sge(ic, recv->r_sge);
98 sge->addr = 0;
99 sge->length = RDS_FRAG_SIZE;
100 sge->lkey = 0;
101
102 sge = rds_iw_header_sge(ic, recv->r_sge);
103 sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
104 sge->length = sizeof(struct rds_header);
105 sge->lkey = 0;
106 }
107}
108
109static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
110 struct rds_iw_recv_work *recv)
111{
112 if (recv->r_iwinc) {
113 rds_inc_put(&recv->r_iwinc->ii_inc);
114 recv->r_iwinc = NULL;
115 }
116 if (recv->r_frag) {
117 rds_iw_recv_unmap_page(ic, recv);
118 if (recv->r_frag->f_page)
119 rds_iw_frag_drop_page(recv->r_frag);
120 rds_iw_frag_free(recv->r_frag);
121 recv->r_frag = NULL;
122 }
123}
124
125void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
126{
127 u32 i;
128
129 for (i = 0; i < ic->i_recv_ring.w_nr; i++)
130 rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
131
132 if (ic->i_frag.f_page)
133 rds_iw_frag_drop_page(&ic->i_frag);
134}
135
136static int rds_iw_recv_refill_one(struct rds_connection *conn,
137 struct rds_iw_recv_work *recv,
138 gfp_t kptr_gfp, gfp_t page_gfp)
139{
140 struct rds_iw_connection *ic = conn->c_transport_data;
141 dma_addr_t dma_addr;
142 struct ib_sge *sge;
143 int ret = -ENOMEM;
144
145 if (recv->r_iwinc == NULL) {
146 if (atomic_read(&rds_iw_allocation) >= rds_iw_sysctl_max_recv_allocation) {
147 rds_iw_stats_inc(s_iw_rx_alloc_limit);
148 goto out;
149 }
150 recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
151 kptr_gfp);
152 if (recv->r_iwinc == NULL)
153 goto out;
154 atomic_inc(&rds_iw_allocation);
155 INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
156 rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
157 }
158
159 if (recv->r_frag == NULL) {
160 recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
161 if (recv->r_frag == NULL)
162 goto out;
163 INIT_LIST_HEAD(&recv->r_frag->f_item);
164 recv->r_frag->f_page = NULL;
165 }
166
167 if (ic->i_frag.f_page == NULL) {
168 ic->i_frag.f_page = alloc_page(page_gfp);
169 if (ic->i_frag.f_page == NULL)
170 goto out;
171 ic->i_frag.f_offset = 0;
172 }
173
174 dma_addr = ib_dma_map_page(ic->i_cm_id->device,
175 ic->i_frag.f_page,
176 ic->i_frag.f_offset,
177 RDS_FRAG_SIZE,
178 DMA_FROM_DEVICE);
179 if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
180 goto out;
181
182 /*
183 * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap()
184 * must be called on this recv. This happens as completions hit
185 * in order or on connection shutdown.
186 */
187 recv->r_frag->f_page = ic->i_frag.f_page;
188 recv->r_frag->f_offset = ic->i_frag.f_offset;
189 recv->r_frag->f_mapped = dma_addr;
190
191 sge = rds_iw_data_sge(ic, recv->r_sge);
192 sge->addr = dma_addr;
193 sge->length = RDS_FRAG_SIZE;
194
195 sge = rds_iw_header_sge(ic, recv->r_sge);
196 sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
197 sge->length = sizeof(struct rds_header);
198
199 get_page(recv->r_frag->f_page);
200
201 if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
202 ic->i_frag.f_offset += RDS_FRAG_SIZE;
203 } else {
204 put_page(ic->i_frag.f_page);
205 ic->i_frag.f_page = NULL;
206 ic->i_frag.f_offset = 0;
207 }
208
209 ret = 0;
210out:
211 return ret;
212}
213
214/*
215 * This tries to allocate and post unused work requests after making sure that
216 * they have all the allocations they need to queue received fragments into
217 * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
218 * pairs don't go unmatched.
219 *
220 * -1 is returned if posting fails due to temporary resource exhaustion.
221 */
222int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
223 gfp_t page_gfp, int prefill)
224{
225 struct rds_iw_connection *ic = conn->c_transport_data;
226 struct rds_iw_recv_work *recv;
227 struct ib_recv_wr *failed_wr;
228 unsigned int posted = 0;
229 int ret = 0;
230 u32 pos;
231
232 while ((prefill || rds_conn_up(conn))
233 && rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
234 if (pos >= ic->i_recv_ring.w_nr) {
235 printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
236 pos);
237 ret = -EINVAL;
238 break;
239 }
240
241 recv = &ic->i_recvs[pos];
242 ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
243 if (ret) {
244 ret = -1;
245 break;
246 }
247
248 /* XXX when can this fail? */
249 ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
250 rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
251 recv->r_iwinc, recv->r_frag->f_page,
252 (long) recv->r_frag->f_mapped, ret);
253 if (ret) {
254 rds_iw_conn_error(conn, "recv post on "
255 "%pI4 returned %d, disconnecting and "
256 "reconnecting\n", &conn->c_faddr,
257 ret);
258 ret = -1;
259 break;
260 }
261
262 posted++;
263 }
264
265 /* We're doing flow control - update the window. */
266 if (ic->i_flowctl && posted)
267 rds_iw_advertise_credits(conn, posted);
268
269 if (ret)
270 rds_iw_ring_unalloc(&ic->i_recv_ring, 1);
271 return ret;
272}
273
274void rds_iw_inc_purge(struct rds_incoming *inc)
275{
276 struct rds_iw_incoming *iwinc;
277 struct rds_page_frag *frag;
278 struct rds_page_frag *pos;
279
280 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
281 rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
282
283 list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) {
284 list_del_init(&frag->f_item);
285 rds_iw_frag_drop_page(frag);
286 rds_iw_frag_free(frag);
287 }
288}
289
290void rds_iw_inc_free(struct rds_incoming *inc)
291{
292 struct rds_iw_incoming *iwinc;
293
294 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
295
296 rds_iw_inc_purge(inc);
297 rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
298 BUG_ON(!list_empty(&iwinc->ii_frags));
299 kmem_cache_free(rds_iw_incoming_slab, iwinc);
300 atomic_dec(&rds_iw_allocation);
301 BUG_ON(atomic_read(&rds_iw_allocation) < 0);
302}
303
304int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
305 size_t size)
306{
307 struct rds_iw_incoming *iwinc;
308 struct rds_page_frag *frag;
309 struct iovec *iov = first_iov;
310 unsigned long to_copy;
311 unsigned long frag_off = 0;
312 unsigned long iov_off = 0;
313 int copied = 0;
314 int ret;
315 u32 len;
316
317 iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
318 frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
319 len = be32_to_cpu(inc->i_hdr.h_len);
320
321 while (copied < size && copied < len) {
322 if (frag_off == RDS_FRAG_SIZE) {
323 frag = list_entry(frag->f_item.next,
324 struct rds_page_frag, f_item);
325 frag_off = 0;
326 }
327 while (iov_off == iov->iov_len) {
328 iov_off = 0;
329 iov++;
330 }
331
332 to_copy = min(iov->iov_len - iov_off, RDS_FRAG_SIZE - frag_off);
333 to_copy = min_t(size_t, to_copy, size - copied);
334 to_copy = min_t(unsigned long, to_copy, len - copied);
335
336 rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
337 "[%p, %lu] + %lu\n",
338 to_copy, iov->iov_base, iov->iov_len, iov_off,
339 frag->f_page, frag->f_offset, frag_off);
340
341 /* XXX needs + offset for multiple recvs per page */
342 ret = rds_page_copy_to_user(frag->f_page,
343 frag->f_offset + frag_off,
344 iov->iov_base + iov_off,
345 to_copy);
346 if (ret) {
347 copied = ret;
348 break;
349 }
350
351 iov_off += to_copy;
352 frag_off += to_copy;
353 copied += to_copy;
354 }
355
356 return copied;
357}
358
359/* ic starts out kzalloc()ed */
360void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
361{
362 struct ib_send_wr *wr = &ic->i_ack_wr;
363 struct ib_sge *sge = &ic->i_ack_sge;
364
365 sge->addr = ic->i_ack_dma;
366 sge->length = sizeof(struct rds_header);
367 sge->lkey = rds_iw_local_dma_lkey(ic);
368
369 wr->sg_list = sge;
370 wr->num_sge = 1;
371 wr->opcode = IB_WR_SEND;
372 wr->wr_id = RDS_IW_ACK_WR_ID;
373 wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
374}
375
376/*
377 * You'd think that with reliable IB connections you wouldn't need to ack
378 * messages that have been received. The problem is that IB hardware generates
379 * an ack message before it has DMAed the message into memory. This creates a
380 * window for message loss if the HCA is disabled for any reason between when
381 * it sends the ack and when the message is DMAed and processed. This is only a
382 * potential issue if another HCA is available for fail-over.
383 *
384 * When the remote host receives our ack they'll free the sent message from
385 * their send queue. To decrease the latency of this we always send an ack
386 * immediately after we've received messages.
387 *
388 * For simplicity, we only have one ack in flight at a time. This puts
389 * pressure on senders to have deep enough send queues to absorb the latency of
390 * a single ack frame being in flight. This might not be good enough.
391 *
392 * This is implemented by having a long-lived send_wr and sge which point to a
393 * statically allocated ack frame. This ack wr does not fall under the ring
394 * accounting that the tx and rx wrs do. The QP attribute specifically makes
395 * room for it beyond the ring size. Send completion notices its special
396 * wr_id and avoids working with the ring in that case.
397 */
398static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
399 int ack_required)
400{
401 rds_iw_set_64bit(&ic->i_ack_next, seq);
402 if (ack_required) {
403 smp_mb__before_clear_bit();
404 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
405 }
406}
407
408static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
409{
410 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
411 smp_mb__after_clear_bit();
412
413 return ic->i_ack_next;
414}
415
416static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
417{
418 struct rds_header *hdr = ic->i_ack;
419 struct ib_send_wr *failed_wr;
420 u64 seq;
421 int ret;
422
423 seq = rds_iw_get_ack(ic);
424
425 rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
426 rds_message_populate_header(hdr, 0, 0, 0);
427 hdr->h_ack = cpu_to_be64(seq);
428 hdr->h_credit = adv_credits;
429 rds_message_make_checksum(hdr);
430 ic->i_ack_queued = jiffies;
431
432 ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
433 if (unlikely(ret)) {
434 /* Failed to send. Release the WR, and
435 * force another ACK.
436 */
437 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
438 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
439
440 rds_iw_stats_inc(s_iw_ack_send_failure);
441 /* Need to finesse this later. */
442 BUG();
443 } else
444 rds_iw_stats_inc(s_iw_ack_sent);
445}
446
447/*
448 * There are 3 ways of getting acknowledgements to the peer:
449 * 1. We call rds_iw_attempt_ack from the recv completion handler
450 * to send an ACK-only frame.
451 * However, there can be only one such frame in the send queue
452 * at any time, so we may have to postpone it.
453 * 2. When another (data) packet is transmitted while there's
454 * an ACK in the queue, we piggyback the ACK sequence number
455 * on the data packet.
456 * 3. If the ACK WR is done sending, we get called from the
457 * send queue completion handler, and check whether there's
458 * another ACK pending (postponed because the WR was on the
459 * queue). If so, we transmit it.
460 *
461 * We maintain 2 variables:
462 * - i_ack_flags, which keeps track of whether the ACK WR
463 * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
464 * - i_ack_next, which is the last sequence number we received
465 *
466 * Potentially, send queue and receive queue handlers can run concurrently.
467 *
468 * Reconnecting complicates this picture just slightly. When we
469 * reconnect, we may be seeing duplicate packets. The peer
470 * is retransmitting them, because it hasn't seen an ACK for
471 * them. It is important that we ACK these.
472 *
473 * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
474 * this flag set *MUST* be acknowledged immediately.
475 */
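/*
 * A compact sketch of the flag interplay described above, in terms of the
 * helpers defined below (an illustrative summary, not additional logic):
 *
 *	recv completion:  rds_iw_set_ack() sets IB_ACK_REQUESTED
 *	attempt:          rds_iw_attempt_ack() only sends if it can
 *	                  test_and_set IB_ACK_IN_FLIGHT and grab a credit
 *	send completion:  rds_iw_ack_send_complete() clears IB_ACK_IN_FLIGHT
 *	                  and retries rds_iw_attempt_ack()
 *	piggyback:        rds_iw_piggyb_ack() test_and_clears IB_ACK_REQUESTED
 */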
476
477/*
478 * When we get here, we're called from the recv queue handler.
479 * Check whether we ought to transmit an ACK.
480 */
481void rds_iw_attempt_ack(struct rds_iw_connection *ic)
482{
483 unsigned int adv_credits;
484
485 if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
486 return;
487
488 if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
489 rds_iw_stats_inc(s_iw_ack_send_delayed);
490 return;
491 }
492
493 /* Can we get a send credit? */
494 if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0)) {
495 rds_iw_stats_inc(s_iw_tx_throttle);
496 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
497 return;
498 }
499
500 clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
501 rds_iw_send_ack(ic, adv_credits);
502}
503
504/*
505 * We get here from the send completion handler, when the
506 * adapter tells us the ACK frame was sent.
507 */
508void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
509{
510 clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
511 rds_iw_attempt_ack(ic);
512}
513
514/*
515 * This is called by the regular xmit code when it wants to piggyback
516 * an ACK on an outgoing frame.
517 */
518u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
519{
520 if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
521 rds_iw_stats_inc(s_iw_ack_send_piggybacked);
522 return rds_iw_get_ack(ic);
523}
524
525/*
526 * It's kind of lame that we're copying from the posted receive pages into
527 * long-lived bitmaps. We could have posted the bitmaps and RDMA-written into
528 * them. But receiving new congestion bitmaps should be a *rare* event, so
529 * hopefully we won't need to invest that complexity in making it more
530 * efficient. By copying we can share a simpler core with TCP which has to
531 * copy.
532 */
533static void rds_iw_cong_recv(struct rds_connection *conn,
534 struct rds_iw_incoming *iwinc)
535{
536 struct rds_cong_map *map;
537 unsigned int map_off;
538 unsigned int map_page;
539 struct rds_page_frag *frag;
540 unsigned long frag_off;
541 unsigned long to_copy;
542 unsigned long copied;
543 uint64_t uncongested = 0;
544 void *addr;
545
546 /* catch completely corrupt packets */
547 if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
548 return;
549
550 map = conn->c_fcong;
551 map_page = 0;
552 map_off = 0;
553
554 frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
555 frag_off = 0;
556
557 copied = 0;
558
559 while (copied < RDS_CONG_MAP_BYTES) {
560 uint64_t *src, *dst;
561 unsigned int k;
562
563 to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
564 BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
565
566 addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
567
568 src = addr + frag_off;
569 dst = (void *)map->m_page_addrs[map_page] + map_off;
570 for (k = 0; k < to_copy; k += 8) {
571 /* Record ports that became uncongested, i.e.
572 * bits that changed from 1 to 0. */
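			/* Worked example with illustrative 4-bit words: if the
			 * stored word *dst is 0110 and the incoming word *src
			 * is 0100, then ~(*src) & *dst = 0010, so the one
			 * port whose bit dropped from 1 to 0 is recorded as
			 * newly uncongested. */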
573 uncongested |= ~(*src) & *dst;
574 *dst++ = *src++;
575 }
576 kunmap_atomic(addr, KM_SOFTIRQ0);
577
578 copied += to_copy;
579
580 map_off += to_copy;
581 if (map_off == PAGE_SIZE) {
582 map_off = 0;
583 map_page++;
584 }
585
586 frag_off += to_copy;
587 if (frag_off == RDS_FRAG_SIZE) {
588 frag = list_entry(frag->f_item.next,
589 struct rds_page_frag, f_item);
590 frag_off = 0;
591 }
592 }
593
594 /* the congestion map is in little endian order */
595 uncongested = le64_to_cpu(uncongested);
596
597 rds_cong_map_updated(map, uncongested);
598}
599
600/*
601 * Rings are posted with all the allocations they'll need to queue the
602 * incoming message to the receiving socket so this can't fail.
603 * All fragments start with a header, so we can make sure we're not receiving
604 * garbage, and we can tell a small 8 byte fragment from an ACK frame.
605 */
606struct rds_iw_ack_state {
607 u64 ack_next;
608 u64 ack_recv;
609 unsigned int ack_required:1;
610 unsigned int ack_next_valid:1;
611 unsigned int ack_recv_valid:1;
612};
613
614static void rds_iw_process_recv(struct rds_connection *conn,
615 struct rds_iw_recv_work *recv, u32 byte_len,
616 struct rds_iw_ack_state *state)
617{
618 struct rds_iw_connection *ic = conn->c_transport_data;
619 struct rds_iw_incoming *iwinc = ic->i_iwinc;
620 struct rds_header *ihdr, *hdr;
621
622 /* XXX shut down the connection if port 0,0 are seen? */
623
624 rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
625 byte_len);
626
627 if (byte_len < sizeof(struct rds_header)) {
628 rds_iw_conn_error(conn, "incoming message "
629 "from %pI4 didn't inclue a "
630 "header, disconnecting and "
631 "reconnecting\n",
632 &conn->c_faddr);
633 return;
634 }
635 byte_len -= sizeof(struct rds_header);
636
637 ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
638
639 /* Validate the checksum. */
640 if (!rds_message_verify_checksum(ihdr)) {
641 rds_iw_conn_error(conn, "incoming message "
642 "from %pI4 has corrupted header - "
643 "forcing a reconnect\n",
644 &conn->c_faddr);
645 rds_stats_inc(s_recv_drop_bad_checksum);
646 return;
647 }
648
649 /* Process the ACK sequence which comes with every packet */
650 state->ack_recv = be64_to_cpu(ihdr->h_ack);
651 state->ack_recv_valid = 1;
652
653 /* Process the credits update if there was one */
654 if (ihdr->h_credit)
655 rds_iw_send_add_credits(conn, ihdr->h_credit);
656
657 if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
658 /* This is an ACK-only packet. The reason it gets
659 * special treatment here is that historically, ACKs
660 * were rather special beasts.
661 */
662 rds_iw_stats_inc(s_iw_ack_received);
663
664 /*
665 * Usually the frags make their way on to incs and are then freed as
666 * the inc is freed. We don't go that route, so we have to drop the
667 * page ref ourselves. We can't just leave the page on the recv
668 * because that confuses the dma mapping of pages and each recv's use
669 * of a partial page. We can leave the frag, though, it will be
670 * reused.
671 *
672 * FIXME: Fold this into the code path below.
673 */
674 rds_iw_frag_drop_page(recv->r_frag);
675 return;
676 }
677
678 /*
679 * If we don't already have an inc on the connection then this
680 * fragment has a header and starts a message. Copy its header
681 * into the inc and save the inc so we can hang upcoming fragments
682 * off its list.
683 */
684 if (iwinc == NULL) {
685 iwinc = recv->r_iwinc;
686 recv->r_iwinc = NULL;
687 ic->i_iwinc = iwinc;
688
689 hdr = &iwinc->ii_inc.i_hdr;
690 memcpy(hdr, ihdr, sizeof(*hdr));
691 ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
692
693 rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
694 ic->i_recv_data_rem, hdr->h_flags);
695 } else {
696 hdr = &iwinc->ii_inc.i_hdr;
697 /* We can't just use memcmp here; fragments of a
698 * single message may carry different ACKs */
699 if (hdr->h_sequence != ihdr->h_sequence
700 || hdr->h_len != ihdr->h_len
701 || hdr->h_sport != ihdr->h_sport
702 || hdr->h_dport != ihdr->h_dport) {
703 rds_iw_conn_error(conn,
704 "fragment header mismatch; forcing reconnect\n");
705 return;
706 }
707 }
708
709 list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);
710 recv->r_frag = NULL;
711
712 if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
713 ic->i_recv_data_rem -= RDS_FRAG_SIZE;
714 else {
715 ic->i_recv_data_rem = 0;
716 ic->i_iwinc = NULL;
717
718 if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
719 rds_iw_cong_recv(conn, iwinc);
720 else {
721 rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
722 &iwinc->ii_inc, GFP_ATOMIC,
723 KM_SOFTIRQ0);
724 state->ack_next = be64_to_cpu(hdr->h_sequence);
725 state->ack_next_valid = 1;
726 }
727
728 /* Evaluate the ACK_REQUIRED flag *after* we received
729 * the complete frame, and after bumping the next_rx
730 * sequence. */
731 if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
732 rds_stats_inc(s_recv_ack_required);
733 state->ack_required = 1;
734 }
735
736 rds_inc_put(&iwinc->ii_inc);
737 }
738}
739
740/*
741 * Plucking the oldest entry from the ring can be done concurrently with
742 * the thread refilling the ring. Each ring operation is protected by
743 * spinlocks and the transient state of refilling doesn't change the
744 * recording of which entry is oldest.
745 *
746 * This relies on IB only calling one cq comp_handler for each cq so that
747 * there will only be one caller of rds_recv_incoming() per RDS connection.
748 */
749void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
750{
751 struct rds_connection *conn = context;
752 struct rds_iw_connection *ic = conn->c_transport_data;
753 struct ib_wc wc;
754 struct rds_iw_ack_state state = { 0, };
755 struct rds_iw_recv_work *recv;
756
757 rdsdebug("conn %p cq %p\n", conn, cq);
758
759 rds_iw_stats_inc(s_iw_rx_cq_call);
760
761 ib_req_notify_cq(cq, IB_CQ_SOLICITED);
762
763 while (ib_poll_cq(cq, 1, &wc) > 0) {
764 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
765 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
766 be32_to_cpu(wc.ex.imm_data));
767 rds_iw_stats_inc(s_iw_rx_cq_event);
768
769 recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
770
771 rds_iw_recv_unmap_page(ic, recv);
772
773 /*
774 * Also process recvs in connecting state because it is possible
775 * to get a recv completion _before_ the rdmacm ESTABLISHED
776 * event is processed.
777 */
778 if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
779 /* We expect errors as the qp is drained during shutdown */
780 if (wc.status == IB_WC_SUCCESS) {
781 rds_iw_process_recv(conn, recv, wc.byte_len, &state);
782 } else {
783 rds_iw_conn_error(conn, "recv completion on "
784 "%pI4 had status %u, disconnecting and "
785 "reconnecting\n", &conn->c_faddr,
786 wc.status);
787 }
788 }
789
790 rds_iw_ring_free(&ic->i_recv_ring, 1);
791 }
792
793 if (state.ack_next_valid)
794 rds_iw_set_ack(ic, state.ack_next, state.ack_required);
795 if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
796 rds_send_drop_acked(conn, state.ack_recv, NULL);
797 ic->i_ack_recv = state.ack_recv;
798 }
799 if (rds_conn_up(conn))
800 rds_iw_attempt_ack(ic);
801
802 /* If we ever end up with a really empty receive ring, we're
803 * in deep trouble, as the sender will definitely see RNR
804 * timeouts. */
805 if (rds_iw_ring_empty(&ic->i_recv_ring))
806 rds_iw_stats_inc(s_iw_rx_ring_empty);
807
808 /*
809 * If the ring is running low, then schedule the thread to refill.
810 */
811 if (rds_iw_ring_low(&ic->i_recv_ring))
812 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
813}
814
815int rds_iw_recv(struct rds_connection *conn)
816{
817 struct rds_iw_connection *ic = conn->c_transport_data;
818 int ret = 0;
819
820 rdsdebug("conn %p\n", conn);
821
822 /*
823 * If we get a temporary posting failure in this context then
824 * we're really low and we want the caller to back off for a bit.
825 */
826 mutex_lock(&ic->i_recv_mutex);
827 if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
828 ret = -ENOMEM;
829 else
830 rds_iw_stats_inc(s_iw_rx_refill_from_thread);
831 mutex_unlock(&ic->i_recv_mutex);
832
833 if (rds_conn_up(conn))
834 rds_iw_attempt_ack(ic);
835
836 return ret;
837}
838
839int __init rds_iw_recv_init(void)
840{
841 struct sysinfo si;
842 int ret = -ENOMEM;
843
844 /* Default to roughly one third of all available RAM for recv memory */
845 si_meminfo(&si);
846 rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
847
848 rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
849 sizeof(struct rds_iw_incoming),
850 0, 0, NULL);
851 if (rds_iw_incoming_slab == NULL)
852 goto out;
853
854 rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
855 sizeof(struct rds_page_frag),
856 0, 0, NULL);
857 if (rds_iw_frag_slab == NULL)
858 kmem_cache_destroy(rds_iw_incoming_slab);
859 else
860 ret = 0;
861out:
862 return ret;
863}
864
865void rds_iw_recv_exit(void)
866{
867 kmem_cache_destroy(rds_iw_incoming_slab);
868 kmem_cache_destroy(rds_iw_frag_slab);
869}
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
new file mode 100644
index 000000000000..d422d4b5deef
--- /dev/null
+++ b/net/rds/iw_ring.c
@@ -0,0 +1,169 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "iw.h"
37
38/*
39 * Locking for IB rings.
40 * We assume that allocation is always protected by a mutex
41 * in the caller (this is a valid assumption for the current
42 * implementation).
43 *
44 * Freeing always happens in an interrupt, and hence only
45 * races with allocations, but not with other free()s.
46 *
47 * The interaction between allocation and freeing is that
48 * the alloc code has to determine the number of free entries.
49 * To this end, we maintain two counters; an allocation counter
50 * and a free counter. Both are allowed to run freely, and wrap
51 * around.
52 * The number of used entries is always (alloc_ctr - free_ctr), evaluated in u32 arithmetic.
53 *
54 * The current implementation makes free_ctr atomic. When the
55 * caller finds an allocation fails, it should set an "alloc fail"
56 * bit and retry the allocation. The "alloc fail" bit essentially tells
57 * the CQ completion handlers to wake it up after freeing some
58 * more entries.
59 */
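/*
 * A worked example of the wraparound arithmetic (values are illustrative):
 * with w_nr = 32, w_alloc_ctr = 0x00000003 and w_free_ctr = 0xfffffffd,
 * the u32 subtraction 0x00000003 - 0xfffffffd yields 6, so six entries are
 * counted as used even though both counters have wrapped.
 */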
60
61/*
62 * This only happens on shutdown.
63 */
64DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
65
66void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
67{
68 memset(ring, 0, sizeof(*ring));
69 ring->w_nr = nr;
70 rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
71}
72
73static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
74{
75 u32 diff;
76
77 /* This assumes that atomic_t has at least as many bits as u32 */
78 diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
79 BUG_ON(diff > ring->w_nr);
80
81 return diff;
82}
83
84void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
85{
86 /* We only ever get called from the connection setup code,
87 * prior to creating the QP. */
88 BUG_ON(__rds_iw_ring_used(ring));
89 ring->w_nr = nr;
90}
91
92static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
93{
94 return __rds_iw_ring_used(ring) == 0;
95}
96
97u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
98{
99 u32 ret = 0, avail;
100
101 avail = ring->w_nr - __rds_iw_ring_used(ring);
102
103 rdsdebug("ring %p val %u next %u free %u\n", ring, val,
104 ring->w_alloc_ptr, avail);
105
106 if (val && avail) {
107 ret = min(val, avail);
108 *pos = ring->w_alloc_ptr;
109
110 ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
111 ring->w_alloc_ctr += ret;
112 }
113
114 return ret;
115}
116
117void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
118{
119 ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
120 atomic_add(val, &ring->w_free_ctr);
121
122 if (__rds_iw_ring_empty(ring) &&
123 waitqueue_active(&rds_iw_ring_empty_wait))
124 wake_up(&rds_iw_ring_empty_wait);
125}
126
127void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
128{
129 ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
130 ring->w_alloc_ctr -= val;
131}
132
133int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
134{
135 return __rds_iw_ring_empty(ring);
136}
137
138int rds_iw_ring_low(struct rds_iw_work_ring *ring)
139{
140 return __rds_iw_ring_used(ring) <= (ring->w_nr >> 2);
141}
142
143
144/*
145 * returns the oldest alloced ring entry. This will be the next one
146 * freed. This can't be called if there are none allocated.
147 */
148u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
149{
150 return ring->w_free_ptr;
151}
152
153/*
154 * returns the number of completed work requests.
155 */
156
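/*
 * Worked example with illustrative values: for ring->w_nr = 32, oldest = 30
 * and wr_id = 1, the wrapped branch below gives 32 - 30 + 1 + 1 = 4, i.e.
 * entries 30, 31, 0 and 1 have completed.
 */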
157u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
158{
159 u32 ret;
160
161 if (oldest <= (unsigned long long)wr_id)
162 ret = (unsigned long long)wr_id - oldest + 1;
163 else
164 ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
165
166 rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
167 wr_id, oldest);
168 return ret;
169}
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
new file mode 100644
index 000000000000..22dd38ffd608
--- /dev/null
+++ b/net/rds/iw_send.c
@@ -0,0 +1,975 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35#include <linux/device.h>
36#include <linux/dmapool.h>
37
38#include "rds.h"
39#include "rdma.h"
40#include "iw.h"
41
42static void rds_iw_send_rdma_complete(struct rds_message *rm,
43 int wc_status)
44{
45 int notify_status;
46
47 switch (wc_status) {
48 case IB_WC_WR_FLUSH_ERR:
49 return;
50
51 case IB_WC_SUCCESS:
52 notify_status = RDS_RDMA_SUCCESS;
53 break;
54
55 case IB_WC_REM_ACCESS_ERR:
56 notify_status = RDS_RDMA_REMOTE_ERROR;
57 break;
58
59 default:
60 notify_status = RDS_RDMA_OTHER_ERROR;
61 break;
62 }
63 rds_rdma_send_complete(rm, notify_status);
64}
65
66static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
67 struct rds_rdma_op *op)
68{
69 if (op->r_mapped) {
70 ib_dma_unmap_sg(ic->i_cm_id->device,
71 op->r_sg, op->r_nents,
72 op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
73 op->r_mapped = 0;
74 }
75}
76
77static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
78 struct rds_iw_send_work *send,
79 int wc_status)
80{
81 struct rds_message *rm = send->s_rm;
82
83 rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
84
85 ib_dma_unmap_sg(ic->i_cm_id->device,
86 rm->m_sg, rm->m_nents,
87 DMA_TO_DEVICE);
88
89 if (rm->m_rdma_op != NULL) {
90 rds_iw_send_unmap_rdma(ic, rm->m_rdma_op);
91
92 /* If the user asked for a completion notification on this
93 * message, we can implement three different semantics:
94 * 1. Notify when we receive the ACK on the RDS message
95 * that was queued with the RDMA. This provides reliable
96 * notification of RDMA status at the expense of a one-way
97 * packet delay.
98 * 2. Notify when the IB stack gives us the completion event for
99 * the RDMA operation.
100 * 3. Notify when the IB stack gives us the completion event for
101 * the accompanying RDS messages.
102 * Here, we implement approach #3. To implement approach #2,
103 * call rds_rdma_send_complete from the cq_handler. To implement #1,
104 * don't call rds_rdma_send_complete at all, and fall back to the notify
105 * handling in the ACK processing code.
106 *
107 * Note: There's no need to explicitly sync any RDMA buffers using
108 * ib_dma_sync_sg_for_cpu - the completion for the RDMA
109 * operation itself unmapped the RDMA buffers, which takes care
110 * of synching.
111 */
112 rds_iw_send_rdma_complete(rm, wc_status);
113
114 if (rm->m_rdma_op->r_write)
115 rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes);
116 else
117 rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes);
118 }
119
120 /* If anyone waited for this message to get flushed out, wake
121 * them up now */
122 rds_message_unmapped(rm);
123
124 rds_message_put(rm);
125 send->s_rm = NULL;
126}
127
128void rds_iw_send_init_ring(struct rds_iw_connection *ic)
129{
130 struct rds_iw_send_work *send;
131 u32 i;
132
133 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
134 struct ib_sge *sge;
135
136 send->s_rm = NULL;
137 send->s_op = NULL;
138 send->s_mapping = NULL;
139
140 send->s_wr.next = NULL;
141 send->s_wr.wr_id = i;
142 send->s_wr.sg_list = send->s_sge;
143 send->s_wr.num_sge = 1;
144 send->s_wr.opcode = IB_WR_SEND;
145 send->s_wr.send_flags = 0;
146 send->s_wr.ex.imm_data = 0;
147
148 sge = rds_iw_data_sge(ic, send->s_sge);
149 sge->lkey = 0;
150
151 sge = rds_iw_header_sge(ic, send->s_sge);
152 sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
153 sge->length = sizeof(struct rds_header);
154 sge->lkey = 0;
155
156 send->s_mr = ib_alloc_fast_reg_mr(ic->i_pd, fastreg_message_size);
157 if (IS_ERR(send->s_mr)) {
158 printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_mr failed\n");
159 break;
160 }
161
162 send->s_page_list = ib_alloc_fast_reg_page_list(
163 ic->i_cm_id->device, fastreg_message_size);
164 if (IS_ERR(send->s_page_list)) {
165 printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n");
166 break;
167 }
168 }
169}
170
171void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
172{
173 struct rds_iw_send_work *send;
174 u32 i;
175
176 for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
177 BUG_ON(!send->s_mr);
178 ib_dereg_mr(send->s_mr);
179 BUG_ON(!send->s_page_list);
180 ib_free_fast_reg_page_list(send->s_page_list);
181 if (send->s_wr.opcode == 0xdead)
182 continue;
183 if (send->s_rm)
184 rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
185 if (send->s_op)
186 rds_iw_send_unmap_rdma(ic, send->s_op);
187 }
188}
189
190/*
191 * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
192 * operations performed in the send path. As the sender allocs and potentially
193 * unallocs the next free entry in the ring, it doesn't alter which entry is
194 * next to be freed, which is what this code is concerned with.
195 */
196void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
197{
198 struct rds_connection *conn = context;
199 struct rds_iw_connection *ic = conn->c_transport_data;
200 struct ib_wc wc;
201 struct rds_iw_send_work *send;
202 u32 completed;
203 u32 oldest;
204 u32 i;
205 int ret;
206
207 rdsdebug("cq %p conn %p\n", cq, conn);
208 rds_iw_stats_inc(s_iw_tx_cq_call);
209 ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
210 if (ret)
211 rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
212
213 while (ib_poll_cq(cq, 1, &wc) > 0) {
214 rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
215 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
216 be32_to_cpu(wc.ex.imm_data));
217 rds_iw_stats_inc(s_iw_tx_cq_event);
218
219 if (wc.status != IB_WC_SUCCESS) {
220 printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode);
221 break;
222 }
223
224 if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) {
225 ic->i_fastreg_posted = 0;
226 continue;
227 }
228
229 if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) {
230 ic->i_fastreg_posted = 1;
231 continue;
232 }
233
234 if (wc.wr_id == RDS_IW_ACK_WR_ID) {
235 if (ic->i_ack_queued + HZ/2 < jiffies)
236 rds_iw_stats_inc(s_iw_tx_stalled);
237 rds_iw_ack_send_complete(ic);
238 continue;
239 }
240
241 oldest = rds_iw_ring_oldest(&ic->i_send_ring);
242
243 completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
244
245 for (i = 0; i < completed; i++) {
246 send = &ic->i_sends[oldest];
247
248 /* In the error case, wc.opcode sometimes contains garbage */
249 switch (send->s_wr.opcode) {
250 case IB_WR_SEND:
251 if (send->s_rm)
252 rds_iw_send_unmap_rm(ic, send, wc.status);
253 break;
254 case IB_WR_FAST_REG_MR:
255 case IB_WR_RDMA_WRITE:
256 case IB_WR_RDMA_READ:
257 case IB_WR_RDMA_READ_WITH_INV:
258 /* Nothing to be done - the SG list will be unmapped
259 * when the SEND completes. */
260 break;
261 default:
262 if (printk_ratelimit())
263 printk(KERN_NOTICE
264 "RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
265 __func__, send->s_wr.opcode);
266 break;
267 }
268
269 send->s_wr.opcode = 0xdead;
270 send->s_wr.num_sge = 1;
271 if (send->s_queued + HZ/2 < jiffies)
272 rds_iw_stats_inc(s_iw_tx_stalled);
273
274 /* If an RDMA operation produced an error, signal this right
275 * away. If we don't, the subsequent SEND that goes with this
276 * RDMA will be canceled with ERR_WFLUSH, and the application
277 * will never learn that the RDMA failed. */
278 if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
279 struct rds_message *rm;
280
281 rm = rds_send_get_message(conn, send->s_op);
282 if (rm)
283 rds_iw_send_rdma_complete(rm, wc.status);
284 }
285
286 oldest = (oldest + 1) % ic->i_send_ring.w_nr;
287 }
288
289 rds_iw_ring_free(&ic->i_send_ring, completed);
290
291 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)
292 || test_bit(0, &conn->c_map_queued))
293 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
294
295 /* We expect errors as the qp is drained during shutdown */
296 if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
297 rds_iw_conn_error(conn,
298 "send completion on %pI4 "
299 "had status %u, disconnecting and reconnecting\n",
300 &conn->c_faddr, wc.status);
301 }
302 }
303}
304
305/*
306 * This is the main function for allocating credits when sending
307 * messages.
308 *
309 * Conceptually, we have two counters:
310 * - send credits: this tells us how many WRs we're allowed
311 * to submit without overrunning the receiver's queue. For
312 * each SEND WR we post, we decrement this by one.
313 *
314 * - posted credits: this tells us how many WRs we recently
315 * posted to the receive queue. This value is transferred
316 * to the peer as a "credit update" in a RDS header field.
317 * Every time we transmit credits to the peer, we subtract
318 * the amount of transferred credits from this counter.
319 *
320 * It is essential that we avoid situations where both sides have
321 * exhausted their send credits, and are unable to send new credits
322 * to the peer. We achieve this by requiring that we send at least
323 * one credit update to the peer before exhausting our credits.
324 * When new credits arrive, we subtract one credit that is withheld
325 * until we've posted new buffers and are ready to transmit these
326 * credits (see rds_iw_send_add_credits below).
327 *
328 * The RDS send code is essentially single-threaded; rds_send_xmit
329 * grabs c_send_lock to ensure exclusive access to the send ring.
330 * However, the ACK sending code is independent and can race with
331 * message SENDs.
332 *
333 * In the send path, we need to update the counters for send credits
334 * and the counter of posted buffers atomically - when we use the
335 * last available credit, we cannot allow another thread to race us
336 * and grab the posted credits counter. Hence, we have to use a
337 * spinlock to protect the credit counter, or use atomics.
338 *
339 * Spinlocks shared between the send and the receive path are bad,
340 * because they create unnecessary delays. An early implementation
341 * using a spinlock showed a 5% degradation in throughput at some
342 * loads.
343 *
344 * This implementation avoids spinlocks completely, putting both
345 * counters into a single atomic, and updating that atomic using
346 * atomic_add (in the receive path, when receiving fresh credits),
347 * and using atomic_cmpxchg when updating the two counters.
348 */
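/*
 * A worked example, assuming the usual i_credits layout with send credits
 * in the low 16 bits and posted credits in the high 16 bits (as manipulated
 * by the IB_GET_SEND_CREDITS() and IB_SET_POST_CREDITS() helpers), and
 * assuming RDS_MAX_ADV_CREDIT allows advertising all of them: starting from
 * i_credits = 0x00050003 (5 posted, 3 send credits), a request for 2 send
 * credits that also advertises the 5 posted credits would cmpxchg
 * 0x00050003 -> 0x00000001, leaving 1 send credit and 0 posted credits.
 */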
349int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
350 u32 wanted, u32 *adv_credits, int need_posted)
351{
352 unsigned int avail, posted, got = 0, advertise;
353 long oldval, newval;
354
355 *adv_credits = 0;
356 if (!ic->i_flowctl)
357 return wanted;
358
359try_again:
360 advertise = 0;
361 oldval = newval = atomic_read(&ic->i_credits);
362 posted = IB_GET_POST_CREDITS(oldval);
363 avail = IB_GET_SEND_CREDITS(oldval);
364
365 rdsdebug("rds_iw_send_grab_credits(%u): credits=%u posted=%u\n",
366 wanted, avail, posted);
367
368 /* The last credit must be used to send a credit update. */
369 if (avail && !posted)
370 avail--;
371
372 if (avail < wanted) {
373 struct rds_connection *conn = ic->i_cm_id->context;
374
375 /* Oops, there aren't that many credits left! */
376 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
377 got = avail;
378 } else {
379 /* Sometimes you get what you want, lalala. */
380 got = wanted;
381 }
382 newval -= IB_SET_SEND_CREDITS(got);
383
384 /*
385 * If need_posted is non-zero, then the caller wants
386 * the posted credits advertised regardless of whether
387 * any send credits are available.
388 */
389 if (posted && (got || need_posted)) {
390 advertise = min_t(unsigned int, posted, RDS_MAX_ADV_CREDIT);
391 newval -= IB_SET_POST_CREDITS(advertise);
392 }
393
394 /* Finally bill everything */
395 if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
396 goto try_again;
397
398 *adv_credits = advertise;
399 return got;
400}
401
402void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
403{
404 struct rds_iw_connection *ic = conn->c_transport_data;
405
406 if (credits == 0)
407 return;
408
409 rdsdebug("rds_iw_send_add_credits(%u): current=%u%s\n",
410 credits,
411 IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
412 test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
413
414 atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
415 if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
416 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
417
418 WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
419
420 rds_iw_stats_inc(s_iw_rx_credit_updates);
421}
422
423void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted)
424{
425 struct rds_iw_connection *ic = conn->c_transport_data;
426
427 if (posted == 0)
428 return;
429
430 atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
431
432 /* Decide whether to send an update to the peer now.
433 * If we would send a credit update for every single buffer we
434 * post, we would end up with an ACK storm (ACK arrives,
435 * consumes buffer, we refill the ring, send ACK to remote
436 * advertising the newly posted buffer... ad inf)
437 *
438 * Performance pretty much depends on how often we send
439 * credit updates - too frequent updates mean lots of ACKs.
440 * Too infrequent updates, and the peer will run out of
441 * credits and have to throttle.
442 * For the time being, 16 seems to be a good compromise.
443 */
444 if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
445 set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
446}
447
448static inline void
449rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
450 struct rds_iw_send_work *send, unsigned int pos,
451 unsigned long buffer, unsigned int length,
452 int send_flags)
453{
454 struct ib_sge *sge;
455
456 WARN_ON(pos != send - ic->i_sends);
457
458 send->s_wr.send_flags = send_flags;
459 send->s_wr.opcode = IB_WR_SEND;
460 send->s_wr.num_sge = 2;
461 send->s_wr.next = NULL;
462 send->s_queued = jiffies;
463 send->s_op = NULL;
464
465 if (length != 0) {
466 sge = rds_iw_data_sge(ic, send->s_sge);
467 sge->addr = buffer;
468 sge->length = length;
469 sge->lkey = rds_iw_local_dma_lkey(ic);
470
471 sge = rds_iw_header_sge(ic, send->s_sge);
472 } else {
473 /* We're sending a packet with no payload. There is only
474 * one SGE */
475 send->s_wr.num_sge = 1;
476 sge = &send->s_sge[0];
477 }
478
479 sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
480 sge->length = sizeof(struct rds_header);
481 sge->lkey = rds_iw_local_dma_lkey(ic);
482}
483
484/*
485 * This can be called multiple times for a given message. The first time
486 * we see a message we map its scatterlist into the IB device so that
487 * we can provide that mapped address to the IB scatter gather entries
488 * in the IB work requests. We translate the scatterlist into a series
489 * of work requests that fragment the message. These work requests complete
490 * in order so we pass ownership of the message to the completion handler
491 * once we send the final fragment.
492 *
493 * The RDS core uses the c_send_lock to only enter this function once
494 * per connection. This makes sure that the tx ring alloc/unalloc pairs
495 * don't get out of sync and confuse the ring.
496 */
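/*
 * Illustrative sketch (not part of this patch): the number of work
 * requests the function below reserves for a message is just a
 * round-up division of the payload length by RDS_FRAG_SIZE, with a
 * zero-length message still costing one WR for the header.  The helper
 * name ex_frag_count() is made up; rds_iw_xmit() itself does this with
 * the ceil() helper.
 */
static unsigned int ex_frag_count(u32 payload_len)
{
	if (payload_len == 0)
		return 1;		/* header-only message */
	return (payload_len + RDS_FRAG_SIZE - 1) / RDS_FRAG_SIZE;
}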
497int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
498 unsigned int hdr_off, unsigned int sg, unsigned int off)
499{
500 struct rds_iw_connection *ic = conn->c_transport_data;
501 struct ib_device *dev = ic->i_cm_id->device;
502 struct rds_iw_send_work *send = NULL;
503 struct rds_iw_send_work *first;
504 struct rds_iw_send_work *prev;
505 struct ib_send_wr *failed_wr;
506 struct scatterlist *scat;
507 u32 pos;
508 u32 i;
509 u32 work_alloc;
510 u32 credit_alloc;
511 u32 posted;
512 u32 adv_credits = 0;
513 int send_flags = 0;
514 int sent;
515 int ret;
516 int flow_controlled = 0;
517
518 BUG_ON(off % RDS_FRAG_SIZE);
519 BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
520
521 /* Fastreg support */
522 if (rds_rdma_cookie_key(rm->m_rdma_cookie)
523 && !ic->i_fastreg_posted) {
524 ret = -EAGAIN;
525 goto out;
526 }
527
528 /* FIXME we may overallocate here */
529 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
530 i = 1;
531 else
532 i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
533
534 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
535 if (work_alloc == 0) {
536 set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
537 rds_iw_stats_inc(s_iw_tx_ring_full);
538 ret = -ENOMEM;
539 goto out;
540 }
541
542 credit_alloc = work_alloc;
543 if (ic->i_flowctl) {
544 credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0);
545 adv_credits += posted;
546 if (credit_alloc < work_alloc) {
547 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
548 work_alloc = credit_alloc;
549 flow_controlled++;
550 }
551 if (work_alloc == 0) {
552 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
553 rds_iw_stats_inc(s_iw_tx_throttle);
554 ret = -ENOMEM;
555 goto out;
556 }
557 }
558
559 /* map the message the first time we see it */
560 if (ic->i_rm == NULL) {
561 /*
562 printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
563 be16_to_cpu(rm->m_inc.i_hdr.h_dport),
564 rm->m_inc.i_hdr.h_flags,
565 be32_to_cpu(rm->m_inc.i_hdr.h_len));
566 */
567 if (rm->m_nents) {
568 rm->m_count = ib_dma_map_sg(dev,
569 rm->m_sg, rm->m_nents, DMA_TO_DEVICE);
570 rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count);
571 if (rm->m_count == 0) {
572 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
573 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
574 ret = -ENOMEM; /* XXX ? */
575 goto out;
576 }
577 } else {
578 rm->m_count = 0;
579 }
580
581 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
582 ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
583 rds_message_addref(rm);
584 ic->i_rm = rm;
585
586 /* Finalize the header */
587 if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
588 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
589 if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
590 rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
591
592 /* If it has an RDMA op, tell the peer we did it. This is
593 * used by the peer to release use-once RDMA MRs. */
594 if (rm->m_rdma_op) {
595 struct rds_ext_header_rdma ext_hdr;
596
597 ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key);
598 rds_message_add_extension(&rm->m_inc.i_hdr,
599 RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
600 }
601 if (rm->m_rdma_cookie) {
602 rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
603 rds_rdma_cookie_key(rm->m_rdma_cookie),
604 rds_rdma_cookie_offset(rm->m_rdma_cookie));
605 }
606
607 /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so
608 * we should not do this unless we have a chance of at least
609 * sticking the header into the send ring. Which is why we
610 * should call rds_iw_ring_alloc first. */
611 rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
612 rds_message_make_checksum(&rm->m_inc.i_hdr);
613
614 /*
615 * Update adv_credits since we reset the ACK_REQUIRED bit.
616 */
617 rds_iw_send_grab_credits(ic, 0, &posted, 1);
618 adv_credits += posted;
619 BUG_ON(adv_credits > 255);
620 } else if (ic->i_rm != rm)
621 BUG();
622
623 send = &ic->i_sends[pos];
624 first = send;
625 prev = NULL;
626 scat = &rm->m_sg[sg];
627 sent = 0;
628 i = 0;
629
630 /* Sometimes you want to put a fence between an RDMA
631 * READ and the following SEND.
632 * We could either do this all the time
633 * or when requested by the user. Right now, we let
634 * the application choose.
635 */
636 if (rm->m_rdma_op && rm->m_rdma_op->r_fence)
637 send_flags = IB_SEND_FENCE;
638
639 /*
640 * We could be copying the header into the unused tail of the page.
641 * That would need to be changed in the future when those pages might
642 * be mapped userspace pages or page cache pages. So instead we always
643 * use a second sge and our long-lived ring of mapped headers. We send
644 * the header after the data so that the data payload can be aligned on
645 * the receiver.
646 */
647
648 /* handle a 0-len message */
649 if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
650 rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
651 goto add_header;
652 }
653
654 /* if there's data reference it with a chain of work reqs */
655 for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) {
656 unsigned int len;
657
658 send = &ic->i_sends[pos];
659
660 len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off);
661 rds_iw_xmit_populate_wr(ic, send, pos,
662 ib_sg_dma_address(dev, scat) + off, len,
663 send_flags);
664
665 /*
666 * We want to delay signaling completions just enough to get
667 * the batching benefits but not so much that we create dead time
668 * on the wire.
669 */
670 if (ic->i_unsignaled_wrs-- == 0) {
671 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
672 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
673 }
674
675 ic->i_unsignaled_bytes -= len;
676 if (ic->i_unsignaled_bytes <= 0) {
677 ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
678 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
679 }
680
681 /*
682 * Always signal the last one if we're stopping due to flow control.
683 */
684 if (flow_controlled && i == (work_alloc-1))
685 send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
686
687 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
688 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
689
690 sent += len;
691 off += len;
692 if (off == ib_sg_dma_len(dev, scat)) {
693 scat++;
694 off = 0;
695 }
696
697add_header:
698 /* Tack on the header after the data. The header SGE should already
699 * have been set up to point to the right header buffer. */
700 memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
701
702 if (0) {
703 struct rds_header *hdr = &ic->i_send_hdrs[pos];
704
705 printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
706 be16_to_cpu(hdr->h_dport),
707 hdr->h_flags,
708 be32_to_cpu(hdr->h_len));
709 }
710 if (adv_credits) {
711 struct rds_header *hdr = &ic->i_send_hdrs[pos];
712
713 /* add credit and redo the header checksum */
714 hdr->h_credit = adv_credits;
715 rds_message_make_checksum(hdr);
716 adv_credits = 0;
717 rds_iw_stats_inc(s_iw_tx_credit_updates);
718 }
719
720 if (prev)
721 prev->s_wr.next = &send->s_wr;
722 prev = send;
723
724 pos = (pos + 1) % ic->i_send_ring.w_nr;
725 }
726
727 /* Account the RDS header in the number of bytes we sent, but just once.
728 * The caller has no concept of fragmentation. */
729 if (hdr_off == 0)
730 sent += sizeof(struct rds_header);
731
732 /* if we finished the message then send completion owns it */
733 if (scat == &rm->m_sg[rm->m_count]) {
734 prev->s_rm = ic->i_rm;
735 prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
736 ic->i_rm = NULL;
737 }
738
739 if (i < work_alloc) {
740 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
741 work_alloc = i;
742 }
743 if (ic->i_flowctl && i < credit_alloc)
744 rds_iw_send_add_credits(conn, credit_alloc - i);
745
746 /* XXX need to worry about failed_wr and partial sends. */
747 failed_wr = &first->s_wr;
748 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
749 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
750 first, &first->s_wr, ret, failed_wr);
751 BUG_ON(failed_wr != &first->s_wr);
752 if (ret) {
753 printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
754 "returned %d\n", &conn->c_faddr, ret);
755 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
756 if (prev->s_rm) {
757 ic->i_rm = prev->s_rm;
758 prev->s_rm = NULL;
759 }
760 goto out;
761 }
762
763 ret = sent;
764out:
765 BUG_ON(adv_credits);
766 return ret;
767}
768
769static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr)
770{
771 BUG_ON(nent > send->s_page_list->max_page_list_len);
772 /*
773 * Perform a WR for the fast_reg_mr. Each individual page
774 * in the sg list is added to the fast reg page list and placed
775 * inside the fast_reg_mr WR.
776 */
777 send->s_wr.opcode = IB_WR_FAST_REG_MR;
778 send->s_wr.wr.fast_reg.length = len;
779 send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
780 send->s_wr.wr.fast_reg.page_list = send->s_page_list;
781 send->s_wr.wr.fast_reg.page_list_len = nent;
782 send->s_wr.wr.fast_reg.page_shift = rds_iwdev->page_shift;
783 send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
784 send->s_wr.wr.fast_reg.iova_start = sg_addr;
785
786 ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
787}
788
789int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op)
790{
791 struct rds_iw_connection *ic = conn->c_transport_data;
792 struct rds_iw_send_work *send = NULL;
793 struct rds_iw_send_work *first;
794 struct rds_iw_send_work *prev;
795 struct ib_send_wr *failed_wr;
796 struct rds_iw_device *rds_iwdev;
797 struct scatterlist *scat;
798 unsigned long len;
799 u64 remote_addr = op->r_remote_addr;
800 u32 pos, fr_pos;
801 u32 work_alloc;
802 u32 i;
803 u32 j;
804 int sent;
805 int ret;
806 int num_sge;
807
808 rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
809
810 /* map the message the first time we see it */
811 if (!op->r_mapped) {
812 op->r_count = ib_dma_map_sg(ic->i_cm_id->device,
813 op->r_sg, op->r_nents, (op->r_write) ?
814 DMA_TO_DEVICE : DMA_FROM_DEVICE);
815 rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count);
816 if (op->r_count == 0) {
817 rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
818 ret = -ENOMEM; /* XXX ? */
819 goto out;
820 }
821
822 op->r_mapped = 1;
823 }
824
825 if (!op->r_write) {
826 /* Alloc space on the send queue for the fastreg */
827 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
828 if (work_alloc != 1) {
829 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
830 rds_iw_stats_inc(s_iw_tx_ring_full);
831 ret = -ENOMEM;
832 goto out;
833 }
834 }
835
836 /*
837 * Instead of knowing how to return a partial RDMA read/write, we insist that there
838 * be enough work requests to send the entire message.
839 */
840 i = ceil(op->r_count, rds_iwdev->max_sge);
841
842 work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
843 if (work_alloc != i) {
844 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
845 rds_iw_stats_inc(s_iw_tx_ring_full);
846 ret = -ENOMEM;
847 goto out;
848 }
849
850 send = &ic->i_sends[pos];
851 if (!op->r_write) {
852 first = prev = &ic->i_sends[fr_pos];
853 } else {
854 first = send;
855 prev = NULL;
856 }
857 scat = &op->r_sg[0];
858 sent = 0;
859 num_sge = op->r_count;
860
861 for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) {
862 send->s_wr.send_flags = 0;
863 send->s_queued = jiffies;
864
865 /*
866 * We want to delay signaling completions just enough to get
867 * the batching benefits but not so much that we create dead time on the wire.
868 */
869 if (ic->i_unsignaled_wrs-- == 0) {
870 ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
871 send->s_wr.send_flags = IB_SEND_SIGNALED;
872 }
873
874 /* To avoid needing the plumbing to invalidate the fastreg_mr used
875 * for local access after RDS is finished with it, we use
876 * IB_WR_RDMA_READ_WITH_INV, which invalidates it after the read has completed.
877 */
878 if (op->r_write)
879 send->s_wr.opcode = IB_WR_RDMA_WRITE;
880 else
881 send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
882
883 send->s_wr.wr.rdma.remote_addr = remote_addr;
884 send->s_wr.wr.rdma.rkey = op->r_key;
885 send->s_op = op;
886
887 if (num_sge > rds_iwdev->max_sge) {
888 send->s_wr.num_sge = rds_iwdev->max_sge;
889 num_sge -= rds_iwdev->max_sge;
890 } else
891 send->s_wr.num_sge = num_sge;
892
893 send->s_wr.next = NULL;
894
895 if (prev)
896 prev->s_wr.next = &send->s_wr;
897
898 for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) {
899 len = ib_sg_dma_len(ic->i_cm_id->device, scat);
900
901 if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
902 send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat);
903 else {
904 send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
905 send->s_sge[j].length = len;
906 send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic);
907 }
908
909 sent += len;
910 rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
911 remote_addr += len;
912
913 scat++;
914 }
915
916 if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
917 send->s_wr.num_sge = 1;
918 send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
919 send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
920 send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
921 }
922
923 rdsdebug("send %p wr %p num_sge %u next %p\n", send,
924 &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
925
926 prev = send;
927 if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
928 send = ic->i_sends;
929 }
930
931 /* if we finished the message then send completion owns it */
932 if (scat == &op->r_sg[op->r_count])
933 first->s_wr.send_flags = IB_SEND_SIGNALED;
934
935 if (i < work_alloc) {
936 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
937 work_alloc = i;
938 }
939
940 /* On iWARP, local memory access by a remote system (i.e. an RDMA Read) is not
941 * recommended. Putting the lkey on the wire is a security hole, as it can
942 * allow access to all memory on the remote system. Some
943 * adapters do not allow using the lkey for this at all. To bypass this, use a
944 * fastreg_mr (or possibly a dma_mr).
945 */
946 if (!op->r_write) {
947 rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
948 op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
949 work_alloc++;
950 }
951
952 failed_wr = &first->s_wr;
953 ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
954 rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
955 first, &first->s_wr, ret, failed_wr);
956 BUG_ON(failed_wr != &first->s_wr);
957 if (ret) {
958 printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
959 "returned %d\n", &conn->c_faddr, ret);
960 rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
961 goto out;
962 }
963
964out:
965 return ret;
966}
967
968void rds_iw_xmit_complete(struct rds_connection *conn)
969{
970 struct rds_iw_connection *ic = conn->c_transport_data;
971
972 /* We may have a pending ACK or window update we were unable
973 * to send previously (due to flow control). Try again. */
974 rds_iw_attempt_ack(ic);
975}
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
new file mode 100644
index 000000000000..ccc7e8f0bf0e
--- /dev/null
+++ b/net/rds/iw_stats.c
@@ -0,0 +1,95 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38#include "iw.h"
39
40DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned;
41
42static char *rds_iw_stat_names[] = {
43 "iw_connect_raced",
44 "iw_listen_closed_stale",
45 "iw_tx_cq_call",
46 "iw_tx_cq_event",
47 "iw_tx_ring_full",
48 "iw_tx_throttle",
49 "iw_tx_sg_mapping_failure",
50 "iw_tx_stalled",
51 "iw_tx_credit_updates",
52 "iw_rx_cq_call",
53 "iw_rx_cq_event",
54 "iw_rx_ring_empty",
55 "iw_rx_refill_from_cq",
56 "iw_rx_refill_from_thread",
57 "iw_rx_alloc_limit",
58 "iw_rx_credit_updates",
59 "iw_ack_sent",
60 "iw_ack_send_failure",
61 "iw_ack_send_delayed",
62 "iw_ack_send_piggybacked",
63 "iw_ack_received",
64 "iw_rdma_mr_alloc",
65 "iw_rdma_mr_free",
66 "iw_rdma_mr_used",
67 "iw_rdma_mr_pool_flush",
68 "iw_rdma_mr_pool_wait",
69 "iw_rdma_mr_pool_depleted",
70};
71
72unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
73 unsigned int avail)
74{
75 struct rds_iw_statistics stats = {0, };
76 uint64_t *src;
77 uint64_t *sum;
78 size_t i;
79 int cpu;
80
81 if (avail < ARRAY_SIZE(rds_iw_stat_names))
82 goto out;
83
84 for_each_online_cpu(cpu) {
85 src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
86 sum = (uint64_t *)&stats;
87 for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
88 *(sum++) += *(src++);
89 }
90
91 rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names,
92 ARRAY_SIZE(rds_iw_stat_names));
93out:
94 return ARRAY_SIZE(rds_iw_stat_names);
95}
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
new file mode 100644
index 000000000000..9590678cd616
--- /dev/null
+++ b/net/rds/iw_sysctl.c
@@ -0,0 +1,137 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/sysctl.h>
35#include <linux/proc_fs.h>
36
37#include "iw.h"
38
39static struct ctl_table_header *rds_iw_sysctl_hdr;
40
41unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
42unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
43unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
44static unsigned long rds_iw_sysctl_max_wr_min = 1;
45/* hardware will fail CQ creation long before this */
46static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
47
48unsigned long rds_iw_sysctl_max_unsig_wrs = 16;
49static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
50static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
51
52unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);
53static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
54static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
55
56unsigned int rds_iw_sysctl_flow_control = 1;
57
58ctl_table rds_iw_sysctl_table[] = {
59 {
60 .ctl_name = CTL_UNNUMBERED,
61 .procname = "max_send_wr",
62 .data = &rds_iw_sysctl_max_send_wr,
63 .maxlen = sizeof(unsigned long),
64 .mode = 0644,
65 .proc_handler = &proc_doulongvec_minmax,
66 .extra1 = &rds_iw_sysctl_max_wr_min,
67 .extra2 = &rds_iw_sysctl_max_wr_max,
68 },
69 {
70 .ctl_name = CTL_UNNUMBERED,
71 .procname = "max_recv_wr",
72 .data = &rds_iw_sysctl_max_recv_wr,
73 .maxlen = sizeof(unsigned long),
74 .mode = 0644,
75 .proc_handler = &proc_doulongvec_minmax,
76 .extra1 = &rds_iw_sysctl_max_wr_min,
77 .extra2 = &rds_iw_sysctl_max_wr_max,
78 },
79 {
80 .ctl_name = CTL_UNNUMBERED,
81 .procname = "max_unsignaled_wr",
82 .data = &rds_iw_sysctl_max_unsig_wrs,
83 .maxlen = sizeof(unsigned long),
84 .mode = 0644,
85 .proc_handler = &proc_doulongvec_minmax,
86 .extra1 = &rds_iw_sysctl_max_unsig_wr_min,
87 .extra2 = &rds_iw_sysctl_max_unsig_wr_max,
88 },
89 {
90 .ctl_name = CTL_UNNUMBERED,
91 .procname = "max_unsignaled_bytes",
92 .data = &rds_iw_sysctl_max_unsig_bytes,
93 .maxlen = sizeof(unsigned long),
94 .mode = 0644,
95 .proc_handler = &proc_doulongvec_minmax,
96 .extra1 = &rds_iw_sysctl_max_unsig_bytes_min,
97 .extra2 = &rds_iw_sysctl_max_unsig_bytes_max,
98 },
99 {
100 .ctl_name = CTL_UNNUMBERED,
101 .procname = "max_recv_allocation",
102 .data = &rds_iw_sysctl_max_recv_allocation,
103 .maxlen = sizeof(unsigned long),
104 .mode = 0644,
105 .proc_handler = &proc_doulongvec_minmax,
106 },
107 {
108 .ctl_name = CTL_UNNUMBERED,
109 .procname = "flow_control",
110 .data = &rds_iw_sysctl_flow_control,
111 .maxlen = sizeof(rds_iw_sysctl_flow_control),
112 .mode = 0644,
113 .proc_handler = &proc_dointvec,
114 },
115 { .ctl_name = 0}
116};
117
118static struct ctl_path rds_iw_sysctl_path[] = {
119 { .procname = "net", .ctl_name = CTL_NET, },
120 { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
121 { .procname = "iw", .ctl_name = CTL_UNNUMBERED, },
122 { }
123};
124
125void rds_iw_sysctl_exit(void)
126{
127 if (rds_iw_sysctl_hdr)
128 unregister_sysctl_table(rds_iw_sysctl_hdr);
129}
130
131int __init rds_iw_sysctl_init(void)
132{
133 rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table);
134 if (rds_iw_sysctl_hdr == NULL)
135 return -ENOMEM;
136 return 0;
137}
diff --git a/net/rds/loop.c b/net/rds/loop.c
new file mode 100644
index 000000000000..4a61997f554d
--- /dev/null
+++ b/net/rds/loop.c
@@ -0,0 +1,188 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/in.h>
35
36#include "rds.h"
37#include "loop.h"
38
39static DEFINE_SPINLOCK(loop_conns_lock);
40static LIST_HEAD(loop_conns);
41
42/*
43 * This 'loopback' transport is a special case for flows that originate
44 * and terminate on the same machine.
45 *
46 * Connection build-up notices if the destination address is thought of
47 * as a local address by a transport. At that time it decides to use the
48 * loopback transport instead of the bound transport of the sending socket.
49 *
50 * The loopback transport's sending path just hands the sent rds_message
51 * straight to the receiving path via an embedded rds_incoming.
52 */
53
54/*
55 * Usually a message transits both the sender and receiver's conns as it
56 * flows to the receiver. In the loopback case, though, the receive path
57 * is handed the sending conn so the sense of the addresses is reversed.
58 */
59static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
60 unsigned int hdr_off, unsigned int sg,
61 unsigned int off)
62{
63 BUG_ON(hdr_off || sg || off);
64
65 rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
66 rds_message_addref(rm); /* for the inc */
67
68 rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
69 GFP_KERNEL, KM_USER0);
70
71 rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
72 NULL);
73
74 rds_inc_put(&rm->m_inc);
75
76 return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len);
77}
78
79static int rds_loop_xmit_cong_map(struct rds_connection *conn,
80 struct rds_cong_map *map,
81 unsigned long offset)
82{
83 unsigned long i;
84
85 BUG_ON(offset);
86 BUG_ON(map != conn->c_lcong);
87
88 for (i = 0; i < RDS_CONG_MAP_PAGES; i++) {
89 memcpy((void *)conn->c_fcong->m_page_addrs[i],
90 (void *)map->m_page_addrs[i], PAGE_SIZE);
91 }
92
93 rds_cong_map_updated(conn->c_fcong, ~(u64) 0);
94
95 return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES;
96}
97
98/* we need to at least give the thread something to succeed */
99static int rds_loop_recv(struct rds_connection *conn)
100{
101 return 0;
102}
103
104struct rds_loop_connection {
105 struct list_head loop_node;
106 struct rds_connection *conn;
107};
108
109/*
110 * Even the loopback transport needs to keep track of its connections,
111 * so it can call rds_conn_destroy() on them on exit. N.B. there are
112 * 1+ loopback addresses (127.*.*.*) so it's not a bug to have
113 * multiple loopback conns allocated, although rather useless.
114 */
115static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp)
116{
117 struct rds_loop_connection *lc;
118 unsigned long flags;
119
120 lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL);
121 if (lc == NULL)
122 return -ENOMEM;
123
124 INIT_LIST_HEAD(&lc->loop_node);
125 lc->conn = conn;
126 conn->c_transport_data = lc;
127
128 spin_lock_irqsave(&loop_conns_lock, flags);
129 list_add_tail(&lc->loop_node, &loop_conns);
130 spin_unlock_irqrestore(&loop_conns_lock, flags);
131
132 return 0;
133}
134
135static void rds_loop_conn_free(void *arg)
136{
137 struct rds_loop_connection *lc = arg;
138 rdsdebug("lc %p\n", lc);
139 list_del(&lc->loop_node);
140 kfree(lc);
141}
142
143static int rds_loop_conn_connect(struct rds_connection *conn)
144{
145 rds_connect_complete(conn);
146 return 0;
147}
148
149static void rds_loop_conn_shutdown(struct rds_connection *conn)
150{
151}
152
153void rds_loop_exit(void)
154{
155 struct rds_loop_connection *lc, *_lc;
156 LIST_HEAD(tmp_list);
157
158 /* avoid calling conn_destroy with irqs off */
159 spin_lock_irq(&loop_conns_lock);
160 list_splice(&loop_conns, &tmp_list);
161 INIT_LIST_HEAD(&loop_conns);
162 spin_unlock_irq(&loop_conns_lock);
163
164 list_for_each_entry_safe(lc, _lc, &tmp_list, loop_node) {
165 WARN_ON(lc->conn->c_passive);
166 rds_conn_destroy(lc->conn);
167 }
168}
169
170/*
171 * This is missing .xmit_* because loop doesn't go through generic
172 * rds_send_xmit() and doesn't call rds_recv_incoming(). .listen_stop and
173 * .laddr_check are missing because transport.c doesn't iterate over
174 * rds_loop_transport.
175 */
176struct rds_transport rds_loop_transport = {
177 .xmit = rds_loop_xmit,
178 .xmit_cong_map = rds_loop_xmit_cong_map,
179 .recv = rds_loop_recv,
180 .conn_alloc = rds_loop_conn_alloc,
181 .conn_free = rds_loop_conn_free,
182 .conn_connect = rds_loop_conn_connect,
183 .conn_shutdown = rds_loop_conn_shutdown,
184 .inc_copy_to_user = rds_message_inc_copy_to_user,
185 .inc_purge = rds_message_inc_purge,
186 .inc_free = rds_message_inc_free,
187 .t_name = "loopback",
188};
diff --git a/net/rds/loop.h b/net/rds/loop.h
new file mode 100644
index 000000000000..f32b0939a04d
--- /dev/null
+++ b/net/rds/loop.h
@@ -0,0 +1,9 @@
1#ifndef _RDS_LOOP_H
2#define _RDS_LOOP_H
3
4/* loop.c */
5extern struct rds_transport rds_loop_transport;
6
7void rds_loop_exit(void);
8
9#endif
diff --git a/net/rds/message.c b/net/rds/message.c
new file mode 100644
index 000000000000..5a15dc8d0cd7
--- /dev/null
+++ b/net/rds/message.c
@@ -0,0 +1,402 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34
35#include "rds.h"
36#include "rdma.h"
37
38static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq);
39
40static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
41[RDS_EXTHDR_NONE] = 0,
42[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
43[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
44[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
45};
46
47
48void rds_message_addref(struct rds_message *rm)
49{
50 rdsdebug("addref rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
51 atomic_inc(&rm->m_refcount);
52}
53
54/*
55 * This relies on dma_map_sg() not touching sg[].page during merging.
56 */
57static void rds_message_purge(struct rds_message *rm)
58{
59 unsigned long i;
60
61 if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags)))
62 return;
63
64 for (i = 0; i < rm->m_nents; i++) {
65 rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i]));
66 /* XXX will have to put_page for page refs */
67 __free_page(sg_page(&rm->m_sg[i]));
68 }
69 rm->m_nents = 0;
70
71 if (rm->m_rdma_op)
72 rds_rdma_free_op(rm->m_rdma_op);
73 if (rm->m_rdma_mr)
74 rds_mr_put(rm->m_rdma_mr);
75}
76
77void rds_message_inc_purge(struct rds_incoming *inc)
78{
79 struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
80 rds_message_purge(rm);
81}
82
83void rds_message_put(struct rds_message *rm)
84{
85 rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount));
86
87 if (atomic_dec_and_test(&rm->m_refcount)) {
88 BUG_ON(!list_empty(&rm->m_sock_item));
89 BUG_ON(!list_empty(&rm->m_conn_item));
90 rds_message_purge(rm);
91
92 kfree(rm);
93 }
94}
95
96void rds_message_inc_free(struct rds_incoming *inc)
97{
98 struct rds_message *rm = container_of(inc, struct rds_message, m_inc);
99 rds_message_put(rm);
100}
101
102void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
103 __be16 dport, u64 seq)
104{
105 hdr->h_flags = 0;
106 hdr->h_sport = sport;
107 hdr->h_dport = dport;
108 hdr->h_sequence = cpu_to_be64(seq);
109 hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
110}
111
112int rds_message_add_extension(struct rds_header *hdr,
113 unsigned int type, const void *data, unsigned int len)
114{
115 unsigned int ext_len = sizeof(u8) + len;
116 unsigned char *dst;
117
118 /* For now, refuse to add more than one extension header */
119 if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
120 return 0;
121
122 if (type >= __RDS_EXTHDR_MAX
123 || len != rds_exthdr_size[type])
124 return 0;
125
126 if (ext_len >= RDS_HEADER_EXT_SPACE)
127 return 0;
128 dst = hdr->h_exthdr;
129
130 *dst++ = type;
131 memcpy(dst, data, len);
132
133 dst[len] = RDS_EXTHDR_NONE;
134 return 1;
135}
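/*
 * Illustrative layout (not part of this patch): after a successful call
 * to rds_message_add_extension() above, h_exthdr[] holds
 *
 *	[ type byte ][ payload, rds_exthdr_size[type] bytes ][ RDS_EXTHDR_NONE ]
 *
 * which is exactly the sequence rds_message_next_extension() below
 * walks when the receiver parses the header.
 */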
136
137/*
138 * If a message has extension headers, retrieve them here.
139 * Call like this:
140 *
141 * unsigned int pos = 0;
142 *
143 * while (1) {
144 * buflen = sizeof(buffer);
145 * type = rds_message_next_extension(hdr, &pos, buffer, &buflen);
146 * if (type == RDS_EXTHDR_NONE)
147 * break;
148 * ...
149 * }
150 */
151int rds_message_next_extension(struct rds_header *hdr,
152 unsigned int *pos, void *buf, unsigned int *buflen)
153{
154 unsigned int offset, ext_type, ext_len;
155 u8 *src = hdr->h_exthdr;
156
157 offset = *pos;
158 if (offset >= RDS_HEADER_EXT_SPACE)
159 goto none;
160
161 /* Get the extension type and length. For now, the
162 * length is implied by the extension type. */
163 ext_type = src[offset++];
164
165 if (ext_type == RDS_EXTHDR_NONE || ext_type >= __RDS_EXTHDR_MAX)
166 goto none;
167 ext_len = rds_exthdr_size[ext_type];
168 if (offset + ext_len > RDS_HEADER_EXT_SPACE)
169 goto none;
170
171 *pos = offset + ext_len;
172 if (ext_len < *buflen)
173 *buflen = ext_len;
174 memcpy(buf, src + offset, *buflen);
175 return ext_type;
176
177none:
178 *pos = RDS_HEADER_EXT_SPACE;
179 *buflen = 0;
180 return RDS_EXTHDR_NONE;
181}
182
183int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version)
184{
185 struct rds_ext_header_version ext_hdr;
186
187 ext_hdr.h_version = cpu_to_be32(version);
188 return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr));
189}
190
191int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version)
192{
193 struct rds_ext_header_version ext_hdr;
194 unsigned int pos = 0, len = sizeof(ext_hdr);
195
196 /* We assume the version extension is the only one present */
197 if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION)
198 return 0;
199 *version = be32_to_cpu(ext_hdr.h_version);
200 return 1;
201}
202
203int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset)
204{
205 struct rds_ext_header_rdma_dest ext_hdr;
206
207 ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
208 ext_hdr.h_rdma_offset = cpu_to_be32(offset);
209 return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
210}
211
212struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp)
213{
214 struct rds_message *rm;
215
216 rm = kzalloc(sizeof(struct rds_message) +
217 (nents * sizeof(struct scatterlist)), gfp);
218 if (!rm)
219 goto out;
220
221 if (nents)
222 sg_init_table(rm->m_sg, nents);
223 atomic_set(&rm->m_refcount, 1);
224 INIT_LIST_HEAD(&rm->m_sock_item);
225 INIT_LIST_HEAD(&rm->m_conn_item);
226 spin_lock_init(&rm->m_rs_lock);
227
228out:
229 return rm;
230}
231
232struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len)
233{
234 struct rds_message *rm;
235 unsigned int i;
236
237 rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
238 if (rm == NULL)
239 return ERR_PTR(-ENOMEM);
240
241 set_bit(RDS_MSG_PAGEVEC, &rm->m_flags);
242 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
243 rm->m_nents = ceil(total_len, PAGE_SIZE);
244
245 for (i = 0; i < rm->m_nents; ++i) {
246 sg_set_page(&rm->m_sg[i],
247 virt_to_page(page_addrs[i]),
248 PAGE_SIZE, 0);
249 }
250
251 return rm;
252}
253
254struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
255 size_t total_len)
256{
257 unsigned long to_copy;
258 unsigned long iov_off;
259 unsigned long sg_off;
260 struct rds_message *rm;
261 struct iovec *iov;
262 struct scatterlist *sg;
263 int ret;
264
265 rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL);
266 if (rm == NULL) {
267 ret = -ENOMEM;
268 goto out;
269 }
270
271 rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len);
272
273 /*
274 * now allocate and copy in the data payload.
275 */
276 sg = rm->m_sg;
277 iov = first_iov;
278 iov_off = 0;
279 sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */
280
281 while (total_len) {
282 if (sg_page(sg) == NULL) {
283 ret = rds_page_remainder_alloc(sg, total_len,
284 GFP_HIGHUSER);
285 if (ret)
286 goto out;
287 rm->m_nents++;
288 sg_off = 0;
289 }
290
291 while (iov_off == iov->iov_len) {
292 iov_off = 0;
293 iov++;
294 }
295
296 to_copy = min(iov->iov_len - iov_off, sg->length - sg_off);
297 to_copy = min_t(size_t, to_copy, total_len);
298
299 rdsdebug("copying %lu bytes from user iov [%p, %zu] + %lu to "
300 "sg [%p, %u, %u] + %lu\n",
301 to_copy, iov->iov_base, iov->iov_len, iov_off,
302 (void *)sg_page(sg), sg->offset, sg->length, sg_off);
303
304 ret = rds_page_copy_from_user(sg_page(sg), sg->offset + sg_off,
305 iov->iov_base + iov_off,
306 to_copy);
307 if (ret)
308 goto out;
309
310 iov_off += to_copy;
311 total_len -= to_copy;
312 sg_off += to_copy;
313
314 if (sg_off == sg->length)
315 sg++;
316 }
317
318 ret = 0;
319out:
320 if (ret) {
321 if (rm)
322 rds_message_put(rm);
323 rm = ERR_PTR(ret);
324 }
325 return rm;
326}
327
328int rds_message_inc_copy_to_user(struct rds_incoming *inc,
329 struct iovec *first_iov, size_t size)
330{
331 struct rds_message *rm;
332 struct iovec *iov;
333 struct scatterlist *sg;
334 unsigned long to_copy;
335 unsigned long iov_off;
336 unsigned long vec_off;
337 int copied;
338 int ret;
339 u32 len;
340
341 rm = container_of(inc, struct rds_message, m_inc);
342 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
343
344 iov = first_iov;
345 iov_off = 0;
346 sg = rm->m_sg;
347 vec_off = 0;
348 copied = 0;
349
350 while (copied < size && copied < len) {
351 while (iov_off == iov->iov_len) {
352 iov_off = 0;
353 iov++;
354 }
355
356 to_copy = min(iov->iov_len - iov_off, sg->length - vec_off);
357 to_copy = min_t(size_t, to_copy, size - copied);
358 to_copy = min_t(unsigned long, to_copy, len - copied);
359
360 rdsdebug("copying %lu bytes to user iov [%p, %zu] + %lu to "
361 "sg [%p, %u, %u] + %lu\n",
362 to_copy, iov->iov_base, iov->iov_len, iov_off,
363 sg_page(sg), sg->offset, sg->length, vec_off);
364
365 ret = rds_page_copy_to_user(sg_page(sg), sg->offset + vec_off,
366 iov->iov_base + iov_off,
367 to_copy);
368 if (ret) {
369 copied = ret;
370 break;
371 }
372
373 iov_off += to_copy;
374 vec_off += to_copy;
375 copied += to_copy;
376
377 if (vec_off == sg->length) {
378 vec_off = 0;
379 sg++;
380 }
381 }
382
383 return copied;
384}
385
386/*
387 * If the message is still on the send queue, wait until the transport
388 * is done with it. This is particularly important for RDMA operations.
389 */
390void rds_message_wait(struct rds_message *rm)
391{
392 wait_event(rds_message_flush_waitq,
393 !test_bit(RDS_MSG_MAPPED, &rm->m_flags));
394}
395
396void rds_message_unmapped(struct rds_message *rm)
397{
398 clear_bit(RDS_MSG_MAPPED, &rm->m_flags);
399 if (waitqueue_active(&rds_message_flush_waitq))
400 wake_up(&rds_message_flush_waitq);
401}
402
diff --git a/net/rds/page.c b/net/rds/page.c
new file mode 100644
index 000000000000..c460743a89ad
--- /dev/null
+++ b/net/rds/page.c
@@ -0,0 +1,221 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/highmem.h>
34
35#include "rds.h"
36
37struct rds_page_remainder {
38 struct page *r_page;
39 unsigned long r_offset;
40};
41
42DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned;
43
44/*
45 * returns 0 on success or -errno on failure.
46 *
47 * We don't have to worry about flush_dcache_page() as this only works
48 * with private pages. If, say, we were to do directed receive to pinned
49 * user pages we'd have to worry more about cache coherence. (Though
50 * the flush_dcache_page() in get_user_pages() would probably be enough).
51 */
52int rds_page_copy_user(struct page *page, unsigned long offset,
53 void __user *ptr, unsigned long bytes,
54 int to_user)
55{
56 unsigned long ret;
57 void *addr;
58
59 if (to_user)
60 rds_stats_add(s_copy_to_user, bytes);
61 else
62 rds_stats_add(s_copy_from_user, bytes);
63
64 addr = kmap_atomic(page, KM_USER0);
65 if (to_user)
66 ret = __copy_to_user_inatomic(ptr, addr + offset, bytes);
67 else
68 ret = __copy_from_user_inatomic(addr + offset, ptr, bytes);
69 kunmap_atomic(addr, KM_USER0);
70
71 if (ret) {
72 addr = kmap(page);
73 if (to_user)
74 ret = copy_to_user(ptr, addr + offset, bytes);
75 else
76 ret = copy_from_user(addr + offset, ptr, bytes);
77 kunmap(page);
78 if (ret)
79 return -EFAULT;
80 }
81
82 return 0;
83}
84
85/*
86 * Message allocation uses this to build up regions of a message.
87 *
88 * @bytes - the number of bytes needed.
89 * @gfp - the waiting behaviour of the allocation
90 *
91 * @gfp is always ORed with __GFP_HIGHMEM. Callers must be prepared to
92 * kmap the pages, etc.
93 *
94 * If @bytes is at least a full page then this just returns a page from
95 * alloc_page().
96 *
97 * If @bytes is a partial page then this stores the unused region of the
98 * page in a per-cpu structure. Future partial-page allocations may be
99 * satisfied from that cached region. This lets us waste less memory on
100 * small allocations with minimal complexity. It works because the transmit
101 * path passes read-only page regions down to devices. They hold a page
102 * reference until they are done with the region.
103 */
104int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
105 gfp_t gfp)
106{
107 struct rds_page_remainder *rem;
108 unsigned long flags;
109 struct page *page;
110 int ret;
111
112 gfp |= __GFP_HIGHMEM;
113
114 /* jump straight to allocation if we're trying for a huge page */
115 if (bytes >= PAGE_SIZE) {
116 page = alloc_page(gfp);
117 if (page == NULL) {
118 ret = -ENOMEM;
119 } else {
120 sg_set_page(scat, page, PAGE_SIZE, 0);
121 ret = 0;
122 }
123 goto out;
124 }
125
126 rem = &per_cpu(rds_page_remainders, get_cpu());
127 local_irq_save(flags);
128
129 while (1) {
130 /* avoid a tiny region getting stuck by tossing it */
131 if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) {
132 rds_stats_inc(s_page_remainder_miss);
133 __free_page(rem->r_page);
134 rem->r_page = NULL;
135 }
136
137 /* hand out a fragment from the cached page */
138 if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) {
139 sg_set_page(scat, rem->r_page, bytes, rem->r_offset);
140 get_page(sg_page(scat));
141
142 if (rem->r_offset != 0)
143 rds_stats_inc(s_page_remainder_hit);
144
145 rem->r_offset += bytes;
146 if (rem->r_offset == PAGE_SIZE) {
147 __free_page(rem->r_page);
148 rem->r_page = NULL;
149 }
150 ret = 0;
151 break;
152 }
153
154 /* alloc if there is nothing for us to use */
155 local_irq_restore(flags);
156 put_cpu();
157
158 page = alloc_page(gfp);
159
160 rem = &per_cpu(rds_page_remainders, get_cpu());
161 local_irq_save(flags);
162
163 if (page == NULL) {
164 ret = -ENOMEM;
165 break;
166 }
167
168 /* did someone race to fill the remainder before us? */
169 if (rem->r_page) {
170 __free_page(page);
171 continue;
172 }
173
174 /* otherwise install our page and loop around to alloc */
175 rem->r_page = page;
176 rem->r_offset = 0;
177 }
178
179 local_irq_restore(flags);
180 put_cpu();
181out:
182 rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret,
183 ret ? NULL : sg_page(scat), ret ? 0 : scat->offset,
184 ret ? 0 : scat->length);
185 return ret;
186}
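/*
 * Illustrative usage sketch (not part of this patch): how two sub-page
 * requests issued back to back on one CPU can end up sharing a page
 * thanks to the remainder cache above.  The sizes are made up, 4 KB
 * pages are assumed, and error handling is omitted for brevity.
 */
static void ex_remainder_demo(struct scatterlist sg[2])
{
	/* No cached remainder yet: allocates a fresh page and hands out
	 * bytes [0, 1024) of it. */
	rds_page_remainder_alloc(&sg[0], 1024, GFP_KERNEL);

	/* Satisfied from the remainder of the same page, bytes
	 * [1024, 3072), taking an extra page reference. */
	rds_page_remainder_alloc(&sg[1], 2048, GFP_KERNEL);
}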
187
188static int rds_page_remainder_cpu_notify(struct notifier_block *self,
189 unsigned long action, void *hcpu)
190{
191 struct rds_page_remainder *rem;
192 long cpu = (long)hcpu;
193
194 rem = &per_cpu(rds_page_remainders, cpu);
195
196 rdsdebug("cpu %ld action 0x%lx\n", cpu, action);
197
198 switch (action) {
199 case CPU_DEAD:
200 if (rem->r_page)
201 __free_page(rem->r_page);
202 rem->r_page = NULL;
203 break;
204 }
205
206 return 0;
207}
208
209static struct notifier_block rds_page_remainder_nb = {
210 .notifier_call = rds_page_remainder_cpu_notify,
211};
212
213void rds_page_exit(void)
214{
215 int i;
216
217 for_each_possible_cpu(i)
218 rds_page_remainder_cpu_notify(&rds_page_remainder_nb,
219 (unsigned long)CPU_DEAD,
220 (void *)(long)i);
221}
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
new file mode 100644
index 000000000000..eaeeb91e1119
--- /dev/null
+++ b/net/rds/rdma.c
@@ -0,0 +1,679 @@
1/*
2 * Copyright (c) 2007 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/pagemap.h>
34#include <linux/rbtree.h>
35#include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
36
37#include "rdma.h"
38
39/*
40 * XXX
41 * - build with sparse
42 * - should we limit the size of a mr region? let transport return failure?
43 * - should we detect duplicate keys on a socket? hmm.
44 * - an rdma is an mlock, apply rlimit?
45 */
46
47/*
48 * get the number of pages by looking at the page indices that the start and
49 * end addresses fall in.
50 *
51 * Returns 0 if the vec is invalid. It is invalid if the number of bytes
52 * causes the address to wrap or overflows an unsigned int. This comes
53 * from being stored in the 'length' member of 'struct scatterlist'.
54 */
55static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
56{
57 if ((vec->addr + vec->bytes <= vec->addr) ||
58 (vec->bytes > (u64)UINT_MAX))
59 return 0;
60
61 return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
62 (vec->addr >> PAGE_SHIFT);
63}
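/*
 * Worked example for the calculation above (illustrative, not part of
 * this patch), assuming 4 KB pages: with vec->addr sitting 4000 bytes
 * into a page and vec->bytes = 5000, the last byte lands at offset
 * 8999, so the vec touches three pages:
 *
 *	((4000 + 5000 + 4095) >> 12) - (4000 >> 12) = 3 - 0 = 3
 *
 * even though 5000 bytes is barely more than a single page.
 */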
64
65static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
66 struct rds_mr *insert)
67{
68 struct rb_node **p = &root->rb_node;
69 struct rb_node *parent = NULL;
70 struct rds_mr *mr;
71
72 while (*p) {
73 parent = *p;
74 mr = rb_entry(parent, struct rds_mr, r_rb_node);
75
76 if (key < mr->r_key)
77 p = &(*p)->rb_left;
78 else if (key > mr->r_key)
79 p = &(*p)->rb_right;
80 else
81 return mr;
82 }
83
84 if (insert) {
85 rb_link_node(&insert->r_rb_node, parent, p);
86 rb_insert_color(&insert->r_rb_node, root);
87 atomic_inc(&insert->r_refcount);
88 }
89 return NULL;
90}
91
92/*
93 * Destroy the transport-specific part of a MR.
94 */
95static void rds_destroy_mr(struct rds_mr *mr)
96{
97 struct rds_sock *rs = mr->r_sock;
98 void *trans_private = NULL;
99 unsigned long flags;
100
101 rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
102 mr->r_key, atomic_read(&mr->r_refcount));
103
104 if (test_and_set_bit(RDS_MR_DEAD, &mr->r_state))
105 return;
106
107 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
108 if (!RB_EMPTY_NODE(&mr->r_rb_node))
109 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
110 trans_private = mr->r_trans_private;
111 mr->r_trans_private = NULL;
112 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
113
114 if (trans_private)
115 mr->r_trans->free_mr(trans_private, mr->r_invalidate);
116}
117
118void __rds_put_mr_final(struct rds_mr *mr)
119{
120 rds_destroy_mr(mr);
121 kfree(mr);
122}
123
124/*
125 * By the time this is called we can't have any more ioctls called on
126 * the socket so we don't need to worry about racing with others.
127 */
128void rds_rdma_drop_keys(struct rds_sock *rs)
129{
130 struct rds_mr *mr;
131 struct rb_node *node;
132
133 /* Release any MRs associated with this socket */
134 while ((node = rb_first(&rs->rs_rdma_keys))) {
135 mr = container_of(node, struct rds_mr, r_rb_node);
136 if (mr->r_trans == rs->rs_transport)
137 mr->r_invalidate = 0;
138 rds_mr_put(mr);
139 }
140
141 if (rs->rs_transport && rs->rs_transport->flush_mrs)
142 rs->rs_transport->flush_mrs();
143}
144
145/*
146 * Helper function to pin user pages.
147 */
148static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
149 struct page **pages, int write)
150{
151 int ret;
152
153 down_read(&current->mm->mmap_sem);
154 ret = get_user_pages(current, current->mm, user_addr,
155 nr_pages, write, 0, pages, NULL);
156 up_read(&current->mm->mmap_sem);
157
158 if (0 <= ret && (unsigned) ret < nr_pages) {
159 while (ret--)
160 put_page(pages[ret]);
161 ret = -EFAULT;
162 }
163
164 return ret;
165}
166
167static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
168 u64 *cookie_ret, struct rds_mr **mr_ret)
169{
170 struct rds_mr *mr = NULL, *found;
171 unsigned int nr_pages;
172 struct page **pages = NULL;
173 struct scatterlist *sg;
174 void *trans_private;
175 unsigned long flags;
176 rds_rdma_cookie_t cookie;
177 unsigned int nents;
178 long i;
179 int ret;
180
181 if (rs->rs_bound_addr == 0) {
182 ret = -ENOTCONN; /* XXX not a great errno */
183 goto out;
184 }
185
186 if (rs->rs_transport->get_mr == NULL) {
187 ret = -EOPNOTSUPP;
188 goto out;
189 }
190
191 nr_pages = rds_pages_in_vec(&args->vec);
192 if (nr_pages == 0) {
193 ret = -EINVAL;
194 goto out;
195 }
196
197 rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
198 args->vec.addr, args->vec.bytes, nr_pages);
199
200 /* XXX clamp nr_pages to limit the size of this alloc? */
201 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
202 if (pages == NULL) {
203 ret = -ENOMEM;
204 goto out;
205 }
206
207 mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
208 if (mr == NULL) {
209 ret = -ENOMEM;
210 goto out;
211 }
212
213 atomic_set(&mr->r_refcount, 1);
214 RB_CLEAR_NODE(&mr->r_rb_node);
215 mr->r_trans = rs->rs_transport;
216 mr->r_sock = rs;
217
218 if (args->flags & RDS_RDMA_USE_ONCE)
219 mr->r_use_once = 1;
220 if (args->flags & RDS_RDMA_INVALIDATE)
221 mr->r_invalidate = 1;
222 if (args->flags & RDS_RDMA_READWRITE)
223 mr->r_write = 1;
224
225 /*
226 * Pin the pages that make up the user buffer and transfer the page
227 * pointers to the mr's sg array. We check to see if we've mapped
228 * the whole region after transferring the partial page references
229 * to the sg array so that we can have one page ref cleanup path.
230 *
231 * For now we have no flag that tells us whether the mapping is
232 * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
233 * the zero page.
234 */
235 ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1);
236 if (ret < 0)
237 goto out;
238
239 nents = ret;
240 sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL);
241 if (sg == NULL) {
242 ret = -ENOMEM;
243 goto out;
244 }
245 WARN_ON(!nents);
246 sg_init_table(sg, nents);
247
248 /* Stick all pages into the scatterlist */
249 for (i = 0 ; i < nents; i++)
250 sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
251
252 rdsdebug("RDS: trans_private nents is %u\n", nents);
253
254 /* Obtain a transport specific MR. If this succeeds, the
255 * s/g list is now owned by the MR.
256 * Note that dma_map() implies that pending writes are
257 * flushed to RAM, so no dma_sync is needed here. */
258 trans_private = rs->rs_transport->get_mr(sg, nents, rs,
259 &mr->r_key);
260
261 if (IS_ERR(trans_private)) {
262 for (i = 0 ; i < nents; i++)
263 put_page(sg_page(&sg[i]));
264 kfree(sg);
265 ret = PTR_ERR(trans_private);
266 goto out;
267 }
268
269 mr->r_trans_private = trans_private;
270
271 rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n",
272 mr->r_key, (void *)(unsigned long) args->cookie_addr);
273
274 /* The user may pass us an unaligned address, but we can only
275 * map page aligned regions. So we keep the offset, and build
276 * a 64bit cookie containing <R_Key, offset> and pass that
277 * around. */
278 cookie = rds_rdma_make_cookie(mr->r_key, args->vec.addr & ~PAGE_MASK);
279 if (cookie_ret)
280 *cookie_ret = cookie;
281
282 if (args->cookie_addr && put_user(cookie, (u64 __user *)(unsigned long) args->cookie_addr)) {
283 ret = -EFAULT;
284 goto out;
285 }
286
287 /* Inserting the new MR into the rbtree bumps its
288 * reference count. */
289 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
290 found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
291 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
292
293 BUG_ON(found && found != mr);
294
295 rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
296 if (mr_ret) {
297 atomic_inc(&mr->r_refcount);
298 *mr_ret = mr;
299 }
300
301 ret = 0;
302out:
303 kfree(pages);
304 if (mr)
305 rds_mr_put(mr);
306 return ret;
307}
308
309int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
310{
311 struct rds_get_mr_args args;
312
313 if (optlen != sizeof(struct rds_get_mr_args))
314 return -EINVAL;
315
316 if (copy_from_user(&args, (struct rds_get_mr_args __user *)optval,
317 sizeof(struct rds_get_mr_args)))
318 return -EFAULT;
319
320 return __rds_rdma_map(rs, &args, NULL, NULL);
321}
322
323/*
324 * Free the MR indicated by the given R_Key
325 */
326int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen)
327{
328 struct rds_free_mr_args args;
329 struct rds_mr *mr;
330 unsigned long flags;
331
332 if (optlen != sizeof(struct rds_free_mr_args))
333 return -EINVAL;
334
335 if (copy_from_user(&args, (struct rds_free_mr_args __user *)optval,
336 sizeof(struct rds_free_mr_args)))
337 return -EFAULT;
338
339 /* Special case - a null cookie means flush all unused MRs */
340 if (args.cookie == 0) {
341 if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
342 return -EINVAL;
343 rs->rs_transport->flush_mrs();
344 return 0;
345 }
346
347 /* Look up the MR given its R_key and remove it from the rbtree
348 * so nobody else finds it.
349 * This should also prevent races with rds_rdma_unuse.
350 */
351 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
352 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
353 if (mr) {
354 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
355 RB_CLEAR_NODE(&mr->r_rb_node);
356 if (args.flags & RDS_RDMA_INVALIDATE)
357 mr->r_invalidate = 1;
358 }
359 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
360
361 if (!mr)
362 return -EINVAL;
363
364 /*
365 * call rds_destroy_mr() ourselves so that we're sure it's done by the time
366 * we return. If we let rds_mr_put() do it, it might not happen until
367 * someone else drops their ref.
368 */
369 rds_destroy_mr(mr);
370 rds_mr_put(mr);
371 return 0;
372}
373
374/*
375 * This is called when we receive an extension header that
376 * tells us this MR was used. It allows us to implement
377 * use_once semantics
378 */
379void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
380{
381 struct rds_mr *mr;
382 unsigned long flags;
383 int zot_me = 0;
384
385 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
386 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
387 if (mr && (mr->r_use_once || force)) {
388 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
389 RB_CLEAR_NODE(&mr->r_rb_node);
390 zot_me = 1;
391 } else if (mr)
392 atomic_inc(&mr->r_refcount);
393 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
394
395 /* May have to issue a dma_sync on this memory region.
396 * Note we could avoid this if the operation was an RDMA READ,
397 * but at this point we can't tell. */
398 if (mr != NULL) {
399 if (mr->r_trans->sync_mr)
400 mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
401
402 /* If the MR was marked as invalidate, this will
403 * trigger an async flush. */
404 if (zot_me)
405 rds_destroy_mr(mr);
406 rds_mr_put(mr);
407 }
408}
409
410void rds_rdma_free_op(struct rds_rdma_op *ro)
411{
412 unsigned int i;
413
414 for (i = 0; i < ro->r_nents; i++) {
415 struct page *page = sg_page(&ro->r_sg[i]);
416
417 /* Mark page dirty if it was possibly modified, which
418 * is the case for an RDMA_READ, which copies from remote
419 * to local memory */
420 if (!ro->r_write)
421 set_page_dirty(page);
422 put_page(page);
423 }
424
425 kfree(ro->r_notifier);
426 kfree(ro);
427}
428
429/*
430 * args is a pointer to an in-kernel copy in the sendmsg cmsg.
431 */
432static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs,
433 struct rds_rdma_args *args)
434{
435 struct rds_iovec vec;
436 struct rds_rdma_op *op = NULL;
437 unsigned int nr_pages;
438 unsigned int max_pages;
439 unsigned int nr_bytes;
440 struct page **pages = NULL;
441 struct rds_iovec __user *local_vec;
442 struct scatterlist *sg;
443 unsigned int nr;
444 unsigned int i, j;
445 int ret;
446
447
448 if (rs->rs_bound_addr == 0) {
449 ret = -ENOTCONN; /* XXX not a great errno */
450 goto out;
451 }
452
453 if (args->nr_local > (u64)UINT_MAX) {
454 ret = -EMSGSIZE;
455 goto out;
456 }
457
458 nr_pages = 0;
459 max_pages = 0;
460
461 local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
462
463 /* figure out the number of pages in the vector */
464 for (i = 0; i < args->nr_local; i++) {
465 if (copy_from_user(&vec, &local_vec[i],
466 sizeof(struct rds_iovec))) {
467 ret = -EFAULT;
468 goto out;
469 }
470
471 nr = rds_pages_in_vec(&vec);
472 if (nr == 0) {
473 ret = -EINVAL;
474 goto out;
475 }
476
477 max_pages = max(nr, max_pages);
478 nr_pages += nr;
479 }
480
481 pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL);
482 if (pages == NULL) {
483 ret = -ENOMEM;
484 goto out;
485 }
486
487 op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL);
488 if (op == NULL) {
489 ret = -ENOMEM;
490 goto out;
491 }
492
493 op->r_write = !!(args->flags & RDS_RDMA_READWRITE);
494 op->r_fence = !!(args->flags & RDS_RDMA_FENCE);
495 op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
496 op->r_recverr = rs->rs_recverr;
497 WARN_ON(!nr_pages);
498 sg_init_table(op->r_sg, nr_pages);
499
500 if (op->r_notify || op->r_recverr) {
501 /* We allocate an uninitialized notifier here, because
502 * we don't want to do that in the completion handler. We
503 * would have to use GFP_ATOMIC there, and don't want to deal
504 * with failed allocations.
505 */
506 op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
507 if (!op->r_notifier) {
508 ret = -ENOMEM;
509 goto out;
510 }
511 op->r_notifier->n_user_token = args->user_token;
512 op->r_notifier->n_status = RDS_RDMA_SUCCESS;
513 }
514
515 /* The cookie contains the R_Key of the remote memory region, and
516 * optionally an offset into it. This is how we implement RDMA into
517 * unaligned memory.
518 * When setting up the RDMA, we need to add that offset to the
519 * destination address (which is really an offset into the MR)
520 * FIXME: We may want to move this into ib_rdma.c
521 */
522 op->r_key = rds_rdma_cookie_key(args->cookie);
523 op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
524
525 nr_bytes = 0;
526
527 rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
528 (unsigned long long)args->nr_local,
529 (unsigned long long)args->remote_vec.addr,
530 op->r_key);
531
532 for (i = 0; i < args->nr_local; i++) {
533 if (copy_from_user(&vec, &local_vec[i],
534 sizeof(struct rds_iovec))) {
535 ret = -EFAULT;
536 goto out;
537 }
538
539 nr = rds_pages_in_vec(&vec);
540 if (nr == 0) {
541 ret = -EINVAL;
542 goto out;
543 }
544
545 rs->rs_user_addr = vec.addr;
546 rs->rs_user_bytes = vec.bytes;
547
548 /* did the user change the vec under us? */
549 if (nr > max_pages || op->r_nents + nr > nr_pages) {
550 ret = -EINVAL;
551 goto out;
552 }
553 /* If it's a WRITE operation, we want to pin the pages for reading.
554 * If it's a READ operation, we need to pin the pages for writing.
555 */
556 ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write);
557 if (ret < 0)
558 goto out;
559
560 rdsdebug("RDS: nr_bytes %u nr %u vec.bytes %llu vec.addr %llx\n",
561 nr_bytes, nr, vec.bytes, vec.addr);
562
563 nr_bytes += vec.bytes;
564
565 for (j = 0; j < nr; j++) {
566 unsigned int offset = vec.addr & ~PAGE_MASK;
567
568 sg = &op->r_sg[op->r_nents + j];
569 sg_set_page(sg, pages[j],
570 min_t(unsigned int, vec.bytes, PAGE_SIZE - offset),
571 offset);
572
573 rdsdebug("RDS: sg->offset %x sg->len %x vec.addr %llx vec.bytes %llu\n",
574 sg->offset, sg->length, vec.addr, vec.bytes);
575
576 vec.addr += sg->length;
577 vec.bytes -= sg->length;
578 }
579
580 op->r_nents += nr;
581 }
582
583
584 if (nr_bytes > args->remote_vec.bytes) {
585 rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
586 nr_bytes,
587 (unsigned int) args->remote_vec.bytes);
588 ret = -EINVAL;
589 goto out;
590 }
591 op->r_bytes = nr_bytes;
592
593 ret = 0;
594out:
595 kfree(pages);
596 if (ret) {
597 if (op)
598 rds_rdma_free_op(op);
599 op = ERR_PTR(ret);
600 }
601 return op;
602}
603
604/*
605 * The application asks for an RDMA transfer.
606 * Extract all arguments and set up the rdma_op
607 */
608int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
609 struct cmsghdr *cmsg)
610{
611 struct rds_rdma_op *op;
612
613 if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
614 || rm->m_rdma_op != NULL)
615 return -EINVAL;
616
617 op = rds_rdma_prepare(rs, CMSG_DATA(cmsg));
618 if (IS_ERR(op))
619 return PTR_ERR(op);
620 rds_stats_inc(s_send_rdma);
621 rm->m_rdma_op = op;
622 return 0;
623}
624
625/*
626 * The application wants us to pass an RDMA destination (aka MR)
627 * to the remote
628 */
629int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
630 struct cmsghdr *cmsg)
631{
632 unsigned long flags;
633 struct rds_mr *mr;
634 u32 r_key;
635 int err = 0;
636
637 if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t))
638 || rm->m_rdma_cookie != 0)
639 return -EINVAL;
640
641 memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
642
643 /* We are reusing a previously mapped MR here. Most likely, the
644 * application has written to the buffer, so we need to explicitly
645 * flush those writes to RAM. Otherwise the HCA may not see them
646 * when doing a DMA from that buffer.
647 */
648 r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
649
650 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
651 mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
652 if (mr == NULL)
653 err = -EINVAL; /* invalid r_key */
654 else
655 atomic_inc(&mr->r_refcount);
656 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
657
658 if (mr) {
659 mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE);
660 rm->m_rdma_mr = mr;
661 }
662 return err;
663}
664
665/*
666 * The application passes us an address range it wants to enable RDMA
667 * to/from. We map the area, and save the <R_Key,offset> pair
668 * in rm->m_rdma_cookie. This causes it to be sent along to the peer
669 * in an extension header.
670 */
671int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
672 struct cmsghdr *cmsg)
673{
674 if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args))
675 || rm->m_rdma_cookie != 0)
676 return -EINVAL;
677
678 return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr);
679}
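
/*
 * For orientation, a rough user-space sketch of the path that ends in
 * rds_get_mr() above. The RDS_GET_MR socket-option name, SOL_RDS and the
 * struct rds_get_mr_args layout are assumed to come from the RDS uapi
 * header <linux/rds.h>; the function and variable names below are
 * illustrative only. The socket must already be bound, since
 * __rds_rdma_map() rejects unbound sockets with -ENOTCONN. Error handling
 * is elided.
 */
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/rds.h>

static uint64_t register_region(int rds_fd, void *buf, size_t len)
{
	struct rds_get_mr_args args;
	uint64_t cookie = 0;

	memset(&args, 0, sizeof(args));
	args.vec.addr = (uintptr_t)buf;		/* may be unaligned; the page offset ends up in the cookie */
	args.vec.bytes = len;
	args.cookie_addr = (uintptr_t)&cookie;	/* rds_get_mr() writes the <R_Key, offset> cookie here */
	args.flags = RDS_RDMA_USE_ONCE;		/* MR is torn down after its first use */

	if (setsockopt(rds_fd, SOL_RDS, RDS_GET_MR, &args, sizeof(args)) < 0)
		return 0;

	/* The cookie is what the peer needs to address this buffer. */
	return cookie;
}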
diff --git a/net/rds/rdma.h b/net/rds/rdma.h
new file mode 100644
index 000000000000..425512098b0b
--- /dev/null
+++ b/net/rds/rdma.h
@@ -0,0 +1,84 @@
1#ifndef _RDS_RDMA_H
2#define _RDS_RDMA_H
3
4#include <linux/rbtree.h>
5#include <linux/spinlock.h>
6#include <linux/scatterlist.h>
7
8#include "rds.h"
9
10struct rds_mr {
11 struct rb_node r_rb_node;
12 atomic_t r_refcount;
13 u32 r_key;
14
15 /* A copy of the creation flags */
16 unsigned int r_use_once:1;
17 unsigned int r_invalidate:1;
18 unsigned int r_write:1;
19
20 /* This is for RDS_MR_DEAD.
21 * It would be nice & consistent to make this part of the above
22 * bit field here, but we need to use test_and_set_bit.
23 */
24 unsigned long r_state;
25 struct rds_sock *r_sock; /* back pointer to the socket that owns us */
26 struct rds_transport *r_trans;
27 void *r_trans_private;
28};
29
30/* Flags for mr->r_state */
31#define RDS_MR_DEAD 0
32
33struct rds_rdma_op {
34 u32 r_key;
35 u64 r_remote_addr;
36 unsigned int r_write:1;
37 unsigned int r_fence:1;
38 unsigned int r_notify:1;
39 unsigned int r_recverr:1;
40 unsigned int r_mapped:1;
41 struct rds_notifier *r_notifier;
42 unsigned int r_bytes;
43 unsigned int r_nents;
44 unsigned int r_count;
45 struct scatterlist r_sg[0];
46};
47
48static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset)
49{
50 return r_key | (((u64) offset) << 32);
51}
52
53static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie)
54{
55 return cookie;
56}
57
58static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie)
59{
60 return cookie >> 32;
61}
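/*
 * So the cookie is a plain 64-bit value with the R_Key in the low 32 bits
 * and the byte offset into the page-aligned MR in the high 32 bits; e.g.
 * rds_rdma_make_cookie(0x1234, 0x10) yields 0x0000001000001234, which the
 * accessors above split back into key 0x1234 and offset 0x10.
 */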
62
63int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen);
64int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen);
65void rds_rdma_drop_keys(struct rds_sock *rs);
66int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
67 struct cmsghdr *cmsg);
68int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
69 struct cmsghdr *cmsg);
72int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
73 struct cmsghdr *cmsg);
74void rds_rdma_free_op(struct rds_rdma_op *ro);
75void rds_rdma_send_complete(struct rds_message *rm, int);
76
77extern void __rds_put_mr_final(struct rds_mr *mr);
78static inline void rds_mr_put(struct rds_mr *mr)
79{
80 if (atomic_dec_and_test(&mr->r_refcount))
81 __rds_put_mr_final(mr);
82}
83
84#endif
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
new file mode 100644
index 000000000000..7b19024f9706
--- /dev/null
+++ b/net/rds/rdma_transport.c
@@ -0,0 +1,214 @@
1/*
2 * Copyright (c) 2009 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <rdma/rdma_cm.h>
34
35#include "rdma_transport.h"
36
37static struct rdma_cm_id *rds_iw_listen_id;
38
39int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
40 struct rdma_cm_event *event)
41{
42 /* this can be null in the listening path */
43 struct rds_connection *conn = cm_id->context;
44 struct rds_transport *trans;
45 int ret = 0;
46
47 rdsdebug("conn %p id %p handling event %u\n", conn, cm_id,
48 event->event);
49
50 if (cm_id->device->node_type == RDMA_NODE_RNIC)
51 trans = &rds_iw_transport;
52 else
53 trans = &rds_ib_transport;
54
55 /* Prevent shutdown from tearing down the connection
56 * while we're executing. */
57 if (conn) {
58 mutex_lock(&conn->c_cm_lock);
59
60 /* If the connection is being shut down, bail out
61 * right away. We return 0 so cm_id doesn't get
62 * destroyed prematurely */
63 if (rds_conn_state(conn) == RDS_CONN_DISCONNECTING) {
64 /* Reject incoming connections while we're tearing
65 * down an existing one. */
66 if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST)
67 ret = 1;
68 goto out;
69 }
70 }
71
72 switch (event->event) {
73 case RDMA_CM_EVENT_CONNECT_REQUEST:
74 ret = trans->cm_handle_connect(cm_id, event);
75 break;
76
77 case RDMA_CM_EVENT_ADDR_RESOLVED:
78 /* XXX do we need to clean up if this fails? */
79 ret = rdma_resolve_route(cm_id,
80 RDS_RDMA_RESOLVE_TIMEOUT_MS);
81 break;
82
83 case RDMA_CM_EVENT_ROUTE_RESOLVED:
84 /* XXX worry about racing with listen acceptance */
85 ret = trans->cm_initiate_connect(cm_id);
86 break;
87
88 case RDMA_CM_EVENT_ESTABLISHED:
89 trans->cm_connect_complete(conn, event);
90 break;
91
92 case RDMA_CM_EVENT_ADDR_ERROR:
93 case RDMA_CM_EVENT_ROUTE_ERROR:
94 case RDMA_CM_EVENT_CONNECT_ERROR:
95 case RDMA_CM_EVENT_UNREACHABLE:
96 case RDMA_CM_EVENT_REJECTED:
97 case RDMA_CM_EVENT_DEVICE_REMOVAL:
98 case RDMA_CM_EVENT_ADDR_CHANGE:
99 if (conn)
100 rds_conn_drop(conn);
101 break;
102
103 case RDMA_CM_EVENT_DISCONNECTED:
104 printk(KERN_WARNING "RDS/IW: DISCONNECT event - dropping connection "
105 "%pI4->%pI4\n", &conn->c_laddr,
106 &conn->c_faddr);
107 rds_conn_drop(conn);
108 break;
109
110 default:
111 /* things like device disconnect? */
112 printk(KERN_ERR "unknown event %u\n", event->event);
113 BUG();
114 break;
115 }
116
117out:
118 if (conn)
119 mutex_unlock(&conn->c_cm_lock);
120
121 rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret);
122
123 return ret;
124}
125
126static int __init rds_rdma_listen_init(void)
127{
128 struct sockaddr_in sin;
129 struct rdma_cm_id *cm_id;
130 int ret;
131
132 cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP);
133 if (IS_ERR(cm_id)) {
134 ret = PTR_ERR(cm_id);
135 printk(KERN_ERR "RDS/IW: failed to setup listener, "
136 "rdma_create_id() returned %d\n", ret);
137 goto out;
138 }
139
140	sin.sin_family = AF_INET;
141 sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
142 sin.sin_port = (__force u16)htons(RDS_PORT);
143
144 /*
145 * XXX I bet this binds the cm_id to a device. If we want to support
146 * fail-over we'll have to take this into consideration.
147 */
148 ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
149 if (ret) {
150 printk(KERN_ERR "RDS/IW: failed to setup listener, "
151 "rdma_bind_addr() returned %d\n", ret);
152 goto out;
153 }
154
155 ret = rdma_listen(cm_id, 128);
156 if (ret) {
157 printk(KERN_ERR "RDS/IW: failed to setup listener, "
158 "rdma_listen() returned %d\n", ret);
159 goto out;
160 }
161
162 rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
163
164 rds_iw_listen_id = cm_id;
165 cm_id = NULL;
166out:
167 if (cm_id)
168 rdma_destroy_id(cm_id);
169 return ret;
170}
171
172static void rds_rdma_listen_stop(void)
173{
174 if (rds_iw_listen_id) {
175 rdsdebug("cm %p\n", rds_iw_listen_id);
176 rdma_destroy_id(rds_iw_listen_id);
177 rds_iw_listen_id = NULL;
178 }
179}
180
181int __init rds_rdma_init(void)
182{
183 int ret;
184
185 ret = rds_rdma_listen_init();
186 if (ret)
187 goto out;
188
189 ret = rds_iw_init();
190 if (ret)
191 goto err_iw_init;
192
193 ret = rds_ib_init();
194 if (ret)
195 goto err_ib_init;
196
197 goto out;
198
199err_ib_init:
200 rds_iw_exit();
201err_iw_init:
202 rds_rdma_listen_stop();
203out:
204 return ret;
205}
206
207void rds_rdma_exit(void)
208{
209 /* stop listening first to ensure no new connections are attempted */
210 rds_rdma_listen_stop();
211 rds_ib_exit();
212 rds_iw_exit();
213}
214
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
new file mode 100644
index 000000000000..2f2c7d976c21
--- /dev/null
+++ b/net/rds/rdma_transport.h
@@ -0,0 +1,28 @@
1#ifndef _RDMA_TRANSPORT_H
2#define _RDMA_TRANSPORT_H
3
4#include <rdma/ib_verbs.h>
5#include <rdma/rdma_cm.h>
6#include "rds.h"
7
8#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
9
10int rds_rdma_conn_connect(struct rds_connection *conn);
11int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
12 struct rdma_cm_event *event);
13
14/* from rdma_transport.c */
15int rds_rdma_init(void);
16void rds_rdma_exit(void);
17
18/* from ib.c */
19extern struct rds_transport rds_ib_transport;
20int rds_ib_init(void);
21void rds_ib_exit(void);
22
23/* from iw.c */
24extern struct rds_transport rds_iw_transport;
25int rds_iw_init(void);
26void rds_iw_exit(void);
27
28#endif
diff --git a/net/rds/rds.h b/net/rds/rds.h
new file mode 100644
index 000000000000..060400704979
--- /dev/null
+++ b/net/rds/rds.h
@@ -0,0 +1,686 @@
1#ifndef _RDS_RDS_H
2#define _RDS_RDS_H
3
4#include <net/sock.h>
5#include <linux/scatterlist.h>
6#include <linux/highmem.h>
7#include <rdma/rdma_cm.h>
8#include <linux/mutex.h>
9#include <linux/rds.h>
10
11#include "info.h"
12
13/*
14 * RDS Network protocol version
15 */
16#define RDS_PROTOCOL_3_0 0x0300
17#define RDS_PROTOCOL_3_1 0x0301
18#define RDS_PROTOCOL_VERSION RDS_PROTOCOL_3_1
19#define RDS_PROTOCOL_MAJOR(v) ((v) >> 8)
20#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
21#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
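/*
 * For example, RDS_PROTOCOL(3, 1) packs to 0x0301 and the accessors split
 * it back out: RDS_PROTOCOL_MAJOR(0x0301) == 3, RDS_PROTOCOL_MINOR(0x0301) == 1.
 */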
22
23/*
24 * XXX randomly chosen, but at least seems to be unused:
25 * # 18464-18768 Unassigned
26 * We should do better. We want a reserved port to discourage unpriv'ed
27 * userspace from listening.
28 */
29#define RDS_PORT 18634
30
31#ifdef DEBUG
32#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
33#else
34/* sigh, pr_debug() causes unused variable warnings */
35static inline void __attribute__ ((format (printf, 1, 2)))
36rdsdebug(char *fmt, ...)
37{
38}
39#endif
40
41/* XXX is there one of these somewhere? */
42#define ceil(x, y) \
43 ({ unsigned long __x = (x), __y = (y); (__x + __y - 1) / __y; })
44
45#define RDS_FRAG_SHIFT 12
46#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
47
48#define RDS_CONG_MAP_BYTES (65536 / 8)
49#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long))
50#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
51#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
52
53struct rds_cong_map {
54 struct rb_node m_rb_node;
55 __be32 m_addr;
56 wait_queue_head_t m_waitq;
57 struct list_head m_conn_list;
58 unsigned long m_page_addrs[RDS_CONG_MAP_PAGES];
59};
60
61
62/*
63 * This is how we will track the connection state:
64 * A connection is always in one of the following
65 * states. Updates to the state are atomic and imply
66 * a memory barrier.
67 */
68enum {
69 RDS_CONN_DOWN = 0,
70 RDS_CONN_CONNECTING,
71 RDS_CONN_DISCONNECTING,
72 RDS_CONN_UP,
73 RDS_CONN_ERROR,
74};
75
76/* Bits for c_flags */
77#define RDS_LL_SEND_FULL 0
78#define RDS_RECONNECT_PENDING 1
79
80struct rds_connection {
81 struct hlist_node c_hash_node;
82 __be32 c_laddr;
83 __be32 c_faddr;
84 unsigned int c_loopback:1;
85 struct rds_connection *c_passive;
86
87 struct rds_cong_map *c_lcong;
88 struct rds_cong_map *c_fcong;
89
90 struct mutex c_send_lock; /* protect send ring */
91 struct rds_message *c_xmit_rm;
92 unsigned long c_xmit_sg;
93 unsigned int c_xmit_hdr_off;
94 unsigned int c_xmit_data_off;
95 unsigned int c_xmit_rdma_sent;
96
97 spinlock_t c_lock; /* protect msg queues */
98 u64 c_next_tx_seq;
99 struct list_head c_send_queue;
100 struct list_head c_retrans;
101
102 u64 c_next_rx_seq;
103
104 struct rds_transport *c_trans;
105 void *c_transport_data;
106
107 atomic_t c_state;
108 unsigned long c_flags;
109 unsigned long c_reconnect_jiffies;
110 struct delayed_work c_send_w;
111 struct delayed_work c_recv_w;
112 struct delayed_work c_conn_w;
113 struct work_struct c_down_w;
114 struct mutex c_cm_lock; /* protect conn state & cm */
115
116 struct list_head c_map_item;
117 unsigned long c_map_queued;
118 unsigned long c_map_offset;
119 unsigned long c_map_bytes;
120
121 unsigned int c_unacked_packets;
122 unsigned int c_unacked_bytes;
123
124 /* Protocol version */
125 unsigned int c_version;
126};
127
128#define RDS_FLAG_CONG_BITMAP 0x01
129#define RDS_FLAG_ACK_REQUIRED 0x02
130#define RDS_FLAG_RETRANSMITTED 0x04
131#define RDS_MAX_ADV_CREDIT 127
132
133/*
134 * Maximum space available for extension headers.
135 */
136#define RDS_HEADER_EXT_SPACE 16
137
138struct rds_header {
139 __be64 h_sequence;
140 __be64 h_ack;
141 __be32 h_len;
142 __be16 h_sport;
143 __be16 h_dport;
144 u8 h_flags;
145 u8 h_credit;
146 u8 h_padding[4];
147 __sum16 h_csum;
148
149 u8 h_exthdr[RDS_HEADER_EXT_SPACE];
150};
151
152/*
153 * Reserved - indicates end of extensions
154 */
155#define RDS_EXTHDR_NONE 0
156
157/*
158 * This extension header is included in the very
159 * first message that is sent on a new connection,
160 * and identifies the protocol level. This will help
161 * rolling updates if a future change requires breaking
162 * the protocol.
163 * NB: This is no longer true for IB, where we do a version
164 * negotiation during the connection setup phase (protocol
165 * version information is included in the RDMA CM private data).
166 */
167#define RDS_EXTHDR_VERSION 1
168struct rds_ext_header_version {
169 __be32 h_version;
170};
171
172/*
173 * This extension header is included in the RDS message
174 * chasing an RDMA operation.
175 */
176#define RDS_EXTHDR_RDMA 2
177struct rds_ext_header_rdma {
178 __be32 h_rdma_rkey;
179};
180
181/*
182 * This extension header tells the peer about the
183 * destination <R_Key,offset> of the requested RDMA
184 * operation.
185 */
186#define RDS_EXTHDR_RDMA_DEST 3
187struct rds_ext_header_rdma_dest {
188 __be32 h_rdma_rkey;
189 __be32 h_rdma_offset;
190};
191
192#define __RDS_EXTHDR_MAX 16 /* for now */
193
194struct rds_incoming {
195 atomic_t i_refcount;
196 struct list_head i_item;
197 struct rds_connection *i_conn;
198 struct rds_header i_hdr;
199 unsigned long i_rx_jiffies;
200 __be32 i_saddr;
201
202 rds_rdma_cookie_t i_rdma_cookie;
203};
204
205/*
206 * m_sock_item and m_conn_item are on lists that are serialized under
207 * conn->c_lock. m_sock_item has additional meaning in that once it is empty
208 * the message will not be put back on the retransmit list after being sent.
209 * Messages that are canceled while being sent rely on this.
210 *
211 * m_inc is used by loopback so that it can pass an incoming message straight
212 * back up into the rx path. It embeds a wire header which is also used by
213 * the send path, which is kind of awkward.
214 *
215 * m_sock_item indicates the message's presence on a socket's send or receive
216 * queue. m_rs will point to that socket.
217 *
218 * m_daddr is used by cancellation to prune messages to a given destination.
219 *
220 * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
221 * nesting. As paths iterate over messages on a sock, or conn, they must
222 * also lock the conn, or sock, to remove the message from those lists too.
223 * Testing the flag to determine if the message is still on the lists lets
224 * us avoid testing the list_head directly. That means each path can use
225 * the message's list_head to keep it on a local list while juggling locks
226 * without confusing the other path.
227 *
228 * m_ack_seq is an optional field set by transports who need a different
229 * sequence number range to invalidate. They can use this in a callback
230 * that they pass to rds_send_drop_acked() to see if each message has been
231 * acked. The HAS_ACK_SEQ flag can be used to detect messages which haven't
232 * had ack_seq set yet.
233 */
234#define RDS_MSG_ON_SOCK 1
235#define RDS_MSG_ON_CONN 2
236#define RDS_MSG_HAS_ACK_SEQ 3
237#define RDS_MSG_ACK_REQUIRED 4
238#define RDS_MSG_RETRANSMITTED 5
239#define RDS_MSG_MAPPED 6
240#define RDS_MSG_PAGEVEC 7
241
242struct rds_message {
243 atomic_t m_refcount;
244 struct list_head m_sock_item;
245 struct list_head m_conn_item;
246 struct rds_incoming m_inc;
247 u64 m_ack_seq;
248 __be32 m_daddr;
249 unsigned long m_flags;
250
251 /* Never access m_rs without holding m_rs_lock.
252 * Lock nesting is
253 * rm->m_rs_lock
254 * -> rs->rs_lock
255 */
256 spinlock_t m_rs_lock;
257 struct rds_sock *m_rs;
258 struct rds_rdma_op *m_rdma_op;
259 rds_rdma_cookie_t m_rdma_cookie;
260 struct rds_mr *m_rdma_mr;
261 unsigned int m_nents;
262 unsigned int m_count;
263 struct scatterlist m_sg[0];
264};
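/*
 * As with rds_rdma_op's trailing r_sg[] (sized via offsetof() in
 * rds_rdma_prepare()), the m_sg[] array is sized when the message is
 * allocated; rds_message_alloc() below takes the number of entries.
 */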
265
266/*
267 * The RDS notifier is used (optionally) to tell the application about
268 * completed RDMA operations. Rather than keeping the whole rds message
269 * around on the queue, we allocate a small notifier that is put on the
270 * socket's notifier_list. Notifications are delivered to the application
271 * through control messages.
272 */
273struct rds_notifier {
274 struct list_head n_list;
275 uint64_t n_user_token;
276 int n_status;
277};
278
279/**
280 * struct rds_transport - transport specific behavioural hooks
281 *
282 * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
283 * part of a message. The caller serializes on the send_sem so this
284 * doesn't need to be reentrant for a given conn. The header must be
285 * sent before the data payload. .xmit must be prepared to send a
286 * message with no data payload. .xmit should return the number of
287 * bytes that were sent down the connection, including header bytes.
288 * Returning 0 tells the caller that it doesn't need to perform any
289 * additional work now. This is usually the case when the transport has
290 * filled the sending queue for its connection and will handle
291 * triggering the rds thread to continue the send when space becomes
292 * available. Returning -EAGAIN tells the caller to retry the send
293 * immediately. Returning -ENOMEM tells the caller to retry the send at
294 * some point in the future.
295 *
296 * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once
297 * it returns the connection cannot call rds_recv_incoming().
298 * This will only be called once after conn_connect returns
299 * non-zero success. The caller serializes this with
300 * the send and connecting paths (xmit_* and conn_*). The
301 * transport is responsible for other serialization, including
302 * rds_recv_incoming(). This is called in process context but
303 * should try hard not to block.
304 *
305 * @xmit_cong_map: This asks the transport to send the local bitmap down the
306 * given connection. XXX get a better story about the bitmap
307 * flag and header.
308 */
309
310struct rds_transport {
311 char t_name[TRANSNAMSIZ];
312 struct list_head t_item;
313 struct module *t_owner;
314 unsigned int t_prefer_loopback:1;
315
316 int (*laddr_check)(__be32 addr);
317 int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
318 void (*conn_free)(void *data);
319 int (*conn_connect)(struct rds_connection *conn);
320 void (*conn_shutdown)(struct rds_connection *conn);
321 void (*xmit_prepare)(struct rds_connection *conn);
322 void (*xmit_complete)(struct rds_connection *conn);
323 int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
324 unsigned int hdr_off, unsigned int sg, unsigned int off);
325 int (*xmit_cong_map)(struct rds_connection *conn,
326 struct rds_cong_map *map, unsigned long offset);
327 int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op);
328 int (*recv)(struct rds_connection *conn);
329 int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
330 size_t size);
331 void (*inc_purge)(struct rds_incoming *inc);
332 void (*inc_free)(struct rds_incoming *inc);
333
334 int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
335 struct rdma_cm_event *event);
336 int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
337 void (*cm_connect_complete)(struct rds_connection *conn,
338 struct rdma_cm_event *event);
339
340 unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
341 unsigned int avail);
342 void (*exit)(void);
343 void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
344 struct rds_sock *rs, u32 *key_ret);
345 void (*sync_mr)(void *trans_private, int direction);
346 void (*free_mr)(void *trans_private, int invalidate);
347 void (*flush_mrs)(void);
348};
349
350struct rds_sock {
351 struct sock rs_sk;
352
353 u64 rs_user_addr;
354 u64 rs_user_bytes;
355
356 /*
357 * bound_addr used for both incoming and outgoing, no INADDR_ANY
358 * support.
359 */
360 struct rb_node rs_bound_node;
361 __be32 rs_bound_addr;
362 __be32 rs_conn_addr;
363 __be16 rs_bound_port;
364 __be16 rs_conn_port;
365
366 /*
367 * This is only used to communicate the transport between bind and
368 * initiating connections. All other trans use is referenced through
369 * the connection.
370 */
371 struct rds_transport *rs_transport;
372
373 /*
374 * rds_sendmsg caches the conn it used the last time around.
375 * This helps avoid costly lookups.
376 */
377 struct rds_connection *rs_conn;
378
379 /* flag indicating we were congested or not */
380 int rs_congested;
381
382 /* rs_lock protects all these adjacent members before the newline */
383 spinlock_t rs_lock;
384 struct list_head rs_send_queue;
385 u32 rs_snd_bytes;
386 int rs_rcv_bytes;
387 struct list_head rs_notify_queue; /* currently used for failed RDMAs */
388
389 /* Congestion wake_up. If rs_cong_monitor is set, we use cong_mask
390 * to decide whether the application should be woken up.
391 * If not set, we use rs_cong_track to find out whether a cong map
392 * update arrived.
393 */
394 uint64_t rs_cong_mask;
395 uint64_t rs_cong_notify;
396 struct list_head rs_cong_list;
397 unsigned long rs_cong_track;
398
399 /*
400 * rs_recv_lock protects the receive queue, and is
401 * used to serialize with rds_release.
402 */
403 rwlock_t rs_recv_lock;
404 struct list_head rs_recv_queue;
405
406 /* just for stats reporting */
407 struct list_head rs_item;
408
409 /* these have their own lock */
410 spinlock_t rs_rdma_lock;
411 struct rb_root rs_rdma_keys;
412
413 /* Socket options - in case there will be more */
414 unsigned char rs_recverr,
415 rs_cong_monitor;
416};
417
418static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
419{
420 return container_of(sk, struct rds_sock, rs_sk);
421}
422static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
423{
424 return &rs->rs_sk;
425}
426
427/*
428 * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
429 * to account for overhead. We don't account for overhead; we just apply
430 * the number of payload bytes to the specified value.
431 */
432static inline int rds_sk_sndbuf(struct rds_sock *rs)
433{
434 return rds_rs_to_sk(rs)->sk_sndbuf / 2;
435}
436static inline int rds_sk_rcvbuf(struct rds_sock *rs)
437{
438 return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
439}
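/*
 * E.g. an application that sets SO_RCVBUF to 64KB ends up with sk_rcvbuf of
 * 128KB, so rds_sk_rcvbuf() reports 64KB: that is the amount of queued
 * payload allowed before rds_recv_rcvbuf_delta() marks the port congested.
 */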
440
441struct rds_statistics {
442 uint64_t s_conn_reset;
443 uint64_t s_recv_drop_bad_checksum;
444 uint64_t s_recv_drop_old_seq;
445 uint64_t s_recv_drop_no_sock;
446 uint64_t s_recv_drop_dead_sock;
447 uint64_t s_recv_deliver_raced;
448 uint64_t s_recv_delivered;
449 uint64_t s_recv_queued;
450 uint64_t s_recv_immediate_retry;
451 uint64_t s_recv_delayed_retry;
452 uint64_t s_recv_ack_required;
453 uint64_t s_recv_rdma_bytes;
454 uint64_t s_recv_ping;
455 uint64_t s_send_queue_empty;
456 uint64_t s_send_queue_full;
457 uint64_t s_send_sem_contention;
458 uint64_t s_send_sem_queue_raced;
459 uint64_t s_send_immediate_retry;
460 uint64_t s_send_delayed_retry;
461 uint64_t s_send_drop_acked;
462 uint64_t s_send_ack_required;
463 uint64_t s_send_queued;
464 uint64_t s_send_rdma;
465 uint64_t s_send_rdma_bytes;
466 uint64_t s_send_pong;
467 uint64_t s_page_remainder_hit;
468 uint64_t s_page_remainder_miss;
469 uint64_t s_copy_to_user;
470 uint64_t s_copy_from_user;
471 uint64_t s_cong_update_queued;
472 uint64_t s_cong_update_received;
473 uint64_t s_cong_send_error;
474 uint64_t s_cong_send_blocked;
475};
476
477/* af_rds.c */
478void rds_sock_addref(struct rds_sock *rs);
479void rds_sock_put(struct rds_sock *rs);
480void rds_wake_sk_sleep(struct rds_sock *rs);
481static inline void __rds_wake_sk_sleep(struct sock *sk)
482{
483 wait_queue_head_t *waitq = sk->sk_sleep;
484
485 if (!sock_flag(sk, SOCK_DEAD) && waitq)
486 wake_up(waitq);
487}
488extern wait_queue_head_t rds_poll_waitq;
489
490
491/* bind.c */
492int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
493void rds_remove_bound(struct rds_sock *rs);
494struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
495
496/* cong.c */
497int rds_cong_get_maps(struct rds_connection *conn);
498void rds_cong_add_conn(struct rds_connection *conn);
499void rds_cong_remove_conn(struct rds_connection *conn);
500void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
501void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
502int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock, struct rds_sock *rs);
503void rds_cong_queue_updates(struct rds_cong_map *map);
504void rds_cong_map_updated(struct rds_cong_map *map, uint64_t);
505int rds_cong_updated_since(unsigned long *recent);
506void rds_cong_add_socket(struct rds_sock *);
507void rds_cong_remove_socket(struct rds_sock *);
508void rds_cong_exit(void);
509struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
510
511/* conn.c */
512int __init rds_conn_init(void);
513void rds_conn_exit(void);
514struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
515 struct rds_transport *trans, gfp_t gfp);
516struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
517 struct rds_transport *trans, gfp_t gfp);
518void rds_conn_destroy(struct rds_connection *conn);
519void rds_conn_reset(struct rds_connection *conn);
520void rds_conn_drop(struct rds_connection *conn);
521void rds_for_each_conn_info(struct socket *sock, unsigned int len,
522 struct rds_info_iterator *iter,
523 struct rds_info_lengths *lens,
524 int (*visitor)(struct rds_connection *, void *),
525 size_t item_len);
526void __rds_conn_error(struct rds_connection *conn, const char *, ...)
527 __attribute__ ((format (printf, 2, 3)));
528#define rds_conn_error(conn, fmt...) \
529 __rds_conn_error(conn, KERN_WARNING "RDS: " fmt)
530
531static inline int
532rds_conn_transition(struct rds_connection *conn, int old, int new)
533{
534 return atomic_cmpxchg(&conn->c_state, old, new) == old;
535}
536
537static inline int
538rds_conn_state(struct rds_connection *conn)
539{
540 return atomic_read(&conn->c_state);
541}
542
543static inline int
544rds_conn_up(struct rds_connection *conn)
545{
546 return atomic_read(&conn->c_state) == RDS_CONN_UP;
547}
548
549static inline int
550rds_conn_connecting(struct rds_connection *conn)
551{
552 return atomic_read(&conn->c_state) == RDS_CONN_CONNECTING;
553}
554
555/* message.c */
556struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp);
557struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
558 size_t total_len);
559struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len);
560void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
561 __be16 dport, u64 seq);
562int rds_message_add_extension(struct rds_header *hdr,
563 unsigned int type, const void *data, unsigned int len);
564int rds_message_next_extension(struct rds_header *hdr,
565 unsigned int *pos, void *buf, unsigned int *buflen);
566int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version);
567int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version);
568int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
569int rds_message_inc_copy_to_user(struct rds_incoming *inc,
570 struct iovec *first_iov, size_t size);
571void rds_message_inc_purge(struct rds_incoming *inc);
572void rds_message_inc_free(struct rds_incoming *inc);
573void rds_message_addref(struct rds_message *rm);
574void rds_message_put(struct rds_message *rm);
575void rds_message_wait(struct rds_message *rm);
576void rds_message_unmapped(struct rds_message *rm);
577
578static inline void rds_message_make_checksum(struct rds_header *hdr)
579{
580 hdr->h_csum = 0;
581 hdr->h_csum = ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2);
582}
583
584static inline int rds_message_verify_checksum(const struct rds_header *hdr)
585{
586 return !hdr->h_csum || ip_fast_csum((void *) hdr, sizeof(*hdr) >> 2) == 0;
587}
588
589
590/* page.c */
591int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
592 gfp_t gfp);
593int rds_page_copy_user(struct page *page, unsigned long offset,
594 void __user *ptr, unsigned long bytes,
595 int to_user);
596#define rds_page_copy_to_user(page, offset, ptr, bytes) \
597 rds_page_copy_user(page, offset, ptr, bytes, 1)
598#define rds_page_copy_from_user(page, offset, ptr, bytes) \
599 rds_page_copy_user(page, offset, ptr, bytes, 0)
600void rds_page_exit(void);
601
602/* recv.c */
603void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
604 __be32 saddr);
605void rds_inc_addref(struct rds_incoming *inc);
606void rds_inc_put(struct rds_incoming *inc);
607void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
608 struct rds_incoming *inc, gfp_t gfp, enum km_type km);
609int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
610 size_t size, int msg_flags);
611void rds_clear_recv_queue(struct rds_sock *rs);
612int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
613void rds_inc_info_copy(struct rds_incoming *inc,
614 struct rds_info_iterator *iter,
615 __be32 saddr, __be32 daddr, int flip);
616
617/* send.c */
618int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
619 size_t payload_len);
620void rds_send_reset(struct rds_connection *conn);
621int rds_send_xmit(struct rds_connection *conn);
622struct sockaddr_in;
623void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
624typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
625void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
626 is_acked_func is_acked);
627int rds_send_acked_before(struct rds_connection *conn, u64 seq);
628void rds_send_remove_from_sock(struct list_head *messages, int status);
629int rds_send_pong(struct rds_connection *conn, __be16 dport);
630struct rds_message *rds_send_get_message(struct rds_connection *,
631 struct rds_rdma_op *);
632
633/* rdma.c */
634void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force);
635
636/* stats.c */
637DECLARE_PER_CPU(struct rds_statistics, rds_stats);
638#define rds_stats_inc_which(which, member) do { \
639 per_cpu(which, get_cpu()).member++; \
640 put_cpu(); \
641} while (0)
642#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
643#define rds_stats_add_which(which, member, count) do { \
644 per_cpu(which, get_cpu()).member += count; \
645 put_cpu(); \
646} while (0)
647#define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count)
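/*
 * E.g. rds_stats_inc(s_send_rdma) increments this CPU's rds_stats.s_send_rdma
 * counter, with get_cpu()/put_cpu() keeping the task on that CPU for the
 * duration of the increment.
 */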
648int __init rds_stats_init(void);
649void rds_stats_exit(void);
650void rds_stats_info_copy(struct rds_info_iterator *iter,
651 uint64_t *values, char **names, size_t nr);
652
653/* sysctl.c */
654int __init rds_sysctl_init(void);
655void rds_sysctl_exit(void);
656extern unsigned long rds_sysctl_sndbuf_min;
657extern unsigned long rds_sysctl_sndbuf_default;
658extern unsigned long rds_sysctl_sndbuf_max;
659extern unsigned long rds_sysctl_reconnect_min_jiffies;
660extern unsigned long rds_sysctl_reconnect_max_jiffies;
661extern unsigned int rds_sysctl_max_unacked_packets;
662extern unsigned int rds_sysctl_max_unacked_bytes;
663extern unsigned int rds_sysctl_ping_enable;
664extern unsigned long rds_sysctl_trace_flags;
665extern unsigned int rds_sysctl_trace_level;
666
667/* threads.c */
668int __init rds_threads_init(void);
669void rds_threads_exit(void);
670extern struct workqueue_struct *rds_wq;
671void rds_connect_worker(struct work_struct *);
672void rds_shutdown_worker(struct work_struct *);
673void rds_send_worker(struct work_struct *);
674void rds_recv_worker(struct work_struct *);
675void rds_connect_complete(struct rds_connection *conn);
676
677/* transport.c */
678int rds_trans_register(struct rds_transport *trans);
679void rds_trans_unregister(struct rds_transport *trans);
680struct rds_transport *rds_trans_get_preferred(__be32 addr);
681unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
682 unsigned int avail);
683int __init rds_trans_init(void);
684void rds_trans_exit(void);
685
686#endif
diff --git a/net/rds/recv.c b/net/rds/recv.c
new file mode 100644
index 000000000000..f2118c51cfa3
--- /dev/null
+++ b/net/rds/recv.c
@@ -0,0 +1,542 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <net/sock.h>
35#include <linux/in.h>
36
37#include "rds.h"
38#include "rdma.h"
39
40void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
41 __be32 saddr)
42{
43 atomic_set(&inc->i_refcount, 1);
44 INIT_LIST_HEAD(&inc->i_item);
45 inc->i_conn = conn;
46 inc->i_saddr = saddr;
47 inc->i_rdma_cookie = 0;
48}
49
50void rds_inc_addref(struct rds_incoming *inc)
51{
52 rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
53 atomic_inc(&inc->i_refcount);
54}
55
56void rds_inc_put(struct rds_incoming *inc)
57{
58 rdsdebug("put inc %p ref %d\n", inc, atomic_read(&inc->i_refcount));
59 if (atomic_dec_and_test(&inc->i_refcount)) {
60 BUG_ON(!list_empty(&inc->i_item));
61
62 inc->i_conn->c_trans->inc_free(inc);
63 }
64}
65
66static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
67 struct rds_cong_map *map,
68 int delta, __be16 port)
69{
70 int now_congested;
71
72 if (delta == 0)
73 return;
74
75 rs->rs_rcv_bytes += delta;
76 now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
77
78 rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
79 "now_cong %d delta %d\n",
80 rs, &rs->rs_bound_addr,
81 ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
82 rds_sk_rcvbuf(rs), now_congested, delta);
83
84 /* wasn't -> am congested */
85 if (!rs->rs_congested && now_congested) {
86 rs->rs_congested = 1;
87 rds_cong_set_bit(map, port);
88 rds_cong_queue_updates(map);
89 }
90 /* was -> aren't congested */
91 /* Require more free space before reporting uncongested to prevent
92 bouncing cong/uncong state too often */
93 else if (rs->rs_congested && (rs->rs_rcv_bytes < (rds_sk_rcvbuf(rs)/2))) {
94 rs->rs_congested = 0;
95 rds_cong_clear_bit(map, port);
96 rds_cong_queue_updates(map);
97 }
98
99 /* do nothing if no change in cong state */
100}
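/*
 * E.g. with rds_sk_rcvbuf() at 64KB, the port is marked congested once more
 * than 64KB of payload is queued, and is only cleared again after the queue
 * drains below 32KB, which keeps the congestion map from flapping.
 */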
101
102/*
103 * Process all extension headers that come with this message.
104 */
105static void rds_recv_incoming_exthdrs(struct rds_incoming *inc, struct rds_sock *rs)
106{
107 struct rds_header *hdr = &inc->i_hdr;
108 unsigned int pos = 0, type, len;
109 union {
110 struct rds_ext_header_version version;
111 struct rds_ext_header_rdma rdma;
112 struct rds_ext_header_rdma_dest rdma_dest;
113 } buffer;
114
115 while (1) {
116 len = sizeof(buffer);
117 type = rds_message_next_extension(hdr, &pos, &buffer, &len);
118 if (type == RDS_EXTHDR_NONE)
119 break;
120 /* Process extension header here */
121 switch (type) {
122 case RDS_EXTHDR_RDMA:
123 rds_rdma_unuse(rs, be32_to_cpu(buffer.rdma.h_rdma_rkey), 0);
124 break;
125
126 case RDS_EXTHDR_RDMA_DEST:
127 /* We ignore the size for now. We could stash it
128 * somewhere and use it for error checking. */
129 inc->i_rdma_cookie = rds_rdma_make_cookie(
130 be32_to_cpu(buffer.rdma_dest.h_rdma_rkey),
131 be32_to_cpu(buffer.rdma_dest.h_rdma_offset));
132
133 break;
134 }
135 }
136}
137
138/*
139 * The transport must make sure that this is serialized against other
140 * rx and conn reset on this specific conn.
141 *
142 * We currently assert that only one fragmented message will be sent
143 * down a connection at a time. This lets us reassemble in the conn
144 * instead of per-flow which means that we don't have to go digging through
145 * flows to tear down partial reassembly progress on conn failure and
146 * we save flow lookup and locking for each frag arrival. It does mean
147 * that small messages will wait behind large ones. Fragmenting at all
148 * is only to reduce the memory consumption of pre-posted buffers.
149 *
150 * The caller passes in saddr and daddr instead of us getting it from the
151 * conn. This lets loopback, which only has one conn for both directions,
152 * tell us which roles the addrs in the conn are playing for this message.
153 */
154void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
155 struct rds_incoming *inc, gfp_t gfp, enum km_type km)
156{
157 struct rds_sock *rs = NULL;
158 struct sock *sk;
159 unsigned long flags;
160
161 inc->i_conn = conn;
162 inc->i_rx_jiffies = jiffies;
163
164 rdsdebug("conn %p next %llu inc %p seq %llu len %u sport %u dport %u "
165 "flags 0x%x rx_jiffies %lu\n", conn,
166 (unsigned long long)conn->c_next_rx_seq,
167 inc,
168 (unsigned long long)be64_to_cpu(inc->i_hdr.h_sequence),
169 be32_to_cpu(inc->i_hdr.h_len),
170 be16_to_cpu(inc->i_hdr.h_sport),
171 be16_to_cpu(inc->i_hdr.h_dport),
172 inc->i_hdr.h_flags,
173 inc->i_rx_jiffies);
174
175 /*
176 * Sequence numbers should only increase. Messages get their
177 * sequence number as they're queued in a sending conn. They
178 * can be dropped, though, if the sending socket is closed before
179 * they hit the wire. So sequence numbers can skip forward
180 * under normal operation. They can also drop back in the conn
181 * failover case as previously sent messages are resent down the
182 * new instance of a conn. We drop those, otherwise we have
183 * to assume that the next valid seq does not come after a
184 * hole in the fragment stream.
185 *
186 * The headers don't give us a way to realize if fragments of
187 * a message have been dropped. We assume that frags that arrive
188 * to a flow are part of the current message on the flow that is
189 * being reassembled. This means that senders can't drop messages
190 * from the sending conn until all their frags are sent.
191 *
192 * XXX we could spend more on the wire to get more robust failure
193 * detection, arguably worth it to avoid data corruption.
194 */
195 if (be64_to_cpu(inc->i_hdr.h_sequence) < conn->c_next_rx_seq
196 && (inc->i_hdr.h_flags & RDS_FLAG_RETRANSMITTED)) {
197 rds_stats_inc(s_recv_drop_old_seq);
198 goto out;
199 }
200 conn->c_next_rx_seq = be64_to_cpu(inc->i_hdr.h_sequence) + 1;
201
202 if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
203 rds_stats_inc(s_recv_ping);
204 rds_send_pong(conn, inc->i_hdr.h_sport);
205 goto out;
206 }
207
208 rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
209 if (rs == NULL) {
210 rds_stats_inc(s_recv_drop_no_sock);
211 goto out;
212 }
213
214 /* Process extension headers */
215 rds_recv_incoming_exthdrs(inc, rs);
216
217 /* We can be racing with rds_release() which marks the socket dead. */
218 sk = rds_rs_to_sk(rs);
219
220 /* serialize with rds_release -> sock_orphan */
221 write_lock_irqsave(&rs->rs_recv_lock, flags);
222 if (!sock_flag(sk, SOCK_DEAD)) {
223 rdsdebug("adding inc %p to rs %p's recv queue\n", inc, rs);
224 rds_stats_inc(s_recv_queued);
225 rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
226 be32_to_cpu(inc->i_hdr.h_len),
227 inc->i_hdr.h_dport);
228 rds_inc_addref(inc);
229 list_add_tail(&inc->i_item, &rs->rs_recv_queue);
230 __rds_wake_sk_sleep(sk);
231 } else {
232 rds_stats_inc(s_recv_drop_dead_sock);
233 }
234 write_unlock_irqrestore(&rs->rs_recv_lock, flags);
235
236out:
237 if (rs)
238 rds_sock_put(rs);
239}
240
241/*
242 * Be very careful here. This is being called as the condition in
243 * wait_event_*(), so it needs to cope with being called many times.
244 */
245static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc)
246{
247 unsigned long flags;
248
249 if (*inc == NULL) {
250 read_lock_irqsave(&rs->rs_recv_lock, flags);
251 if (!list_empty(&rs->rs_recv_queue)) {
252 *inc = list_entry(rs->rs_recv_queue.next,
253 struct rds_incoming,
254 i_item);
255 rds_inc_addref(*inc);
256 }
257 read_unlock_irqrestore(&rs->rs_recv_lock, flags);
258 }
259
260 return *inc != NULL;
261}
262
263static int rds_still_queued(struct rds_sock *rs, struct rds_incoming *inc,
264 int drop)
265{
266 struct sock *sk = rds_rs_to_sk(rs);
267 int ret = 0;
268 unsigned long flags;
269
270 write_lock_irqsave(&rs->rs_recv_lock, flags);
271 if (!list_empty(&inc->i_item)) {
272 ret = 1;
273 if (drop) {
274 /* XXX make sure this i_conn is reliable */
275 rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
276 -be32_to_cpu(inc->i_hdr.h_len),
277 inc->i_hdr.h_dport);
278 list_del_init(&inc->i_item);
279 rds_inc_put(inc);
280 }
281 }
282 write_unlock_irqrestore(&rs->rs_recv_lock, flags);
283
284 rdsdebug("inc %p rs %p still %d dropped %d\n", inc, rs, ret, drop);
285 return ret;
286}
287
288/*
289 * Pull errors off the error queue.
290 * If msghdr is NULL, we will just purge the error queue.
291 */
292int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr)
293{
294 struct rds_notifier *notifier;
295 struct rds_rdma_notify cmsg;
296 unsigned int count = 0, max_messages = ~0U;
297 unsigned long flags;
298 LIST_HEAD(copy);
299 int err = 0;
300
301
302 /* put_cmsg copies to user space and thus may sleep. We can't do this
303 * with rs_lock held, so first grab as many notifications as we can stuff
304 * in the user provided cmsg buffer. We don't try to copy more, to avoid
305 * losing notifications - except when the buffer is so small that it wouldn't
306 * even hold a single notification. Then we give the caller as much of
307 * this single msg as we can squeeze in, and set MSG_CTRUNC.
308 */
309 if (msghdr) {
310 max_messages = msghdr->msg_controllen / CMSG_SPACE(sizeof(cmsg));
311 if (!max_messages)
312 max_messages = 1;
313 }
314
315 spin_lock_irqsave(&rs->rs_lock, flags);
316 while (!list_empty(&rs->rs_notify_queue) && count < max_messages) {
317 notifier = list_entry(rs->rs_notify_queue.next,
318 struct rds_notifier, n_list);
319 list_move(&notifier->n_list, &copy);
320 count++;
321 }
322 spin_unlock_irqrestore(&rs->rs_lock, flags);
323
324 if (!count)
325 return 0;
326
327 while (!list_empty(&copy)) {
328 notifier = list_entry(copy.next, struct rds_notifier, n_list);
329
330 if (msghdr) {
331 cmsg.user_token = notifier->n_user_token;
332 cmsg.status = notifier->n_status;
333
334 err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS,
335 sizeof(cmsg), &cmsg);
336 if (err)
337 break;
338 }
339
340 list_del_init(&notifier->n_list);
341 kfree(notifier);
342 }
343
344 /* If we bailed out because of an error in put_cmsg,
345 * we may be left with one or more notifications that we
346 * didn't process. Return them to the head of the list. */
347 if (!list_empty(&copy)) {
348 spin_lock_irqsave(&rs->rs_lock, flags);
349 list_splice(&copy, &rs->rs_notify_queue);
350 spin_unlock_irqrestore(&rs->rs_lock, flags);
351 }
352
353 return err;
354}
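
The cmsg packing above is what a userspace consumer unwinds on its side. A minimal, hypothetical sketch of draining these RDMA completion notifications from the control buffer of a recvmsg() call (assuming SOL_RDS, RDS_CMSG_RDMA_STATUS and struct rds_rdma_notify come from the RDS uapi header; error handling is omitted):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/rds.h>

static void drain_rdma_notifications(int fd)
{
        char ctl[1024];
        struct msghdr msg;
        struct cmsghdr *cmsg;
        struct rds_rdma_notify note;

        memset(&msg, 0, sizeof(msg));
        msg.msg_control = ctl;
        msg.msg_controllen = sizeof(ctl);

        /* rds_recvmsg() services pending notifications before any data */
        if (recvmsg(fd, &msg, MSG_DONTWAIT) < 0)
                return;

        for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
                if (cmsg->cmsg_level != SOL_RDS ||
                    cmsg->cmsg_type != RDS_CMSG_RDMA_STATUS)
                        continue;
                memcpy(&note, CMSG_DATA(cmsg), sizeof(note));
                printf("RDMA op %llu completed, status %d\n",
                       (unsigned long long)note.user_token, (int)note.status);
        }
}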
355
356/*
357 * Queue a congestion notification
358 */
359static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
360{
361 uint64_t notify = rs->rs_cong_notify;
362 unsigned long flags;
363 int err;
364
365 err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_CONG_UPDATE,
366 sizeof(notify), &notify);
367 if (err)
368 return err;
369
370 spin_lock_irqsave(&rs->rs_lock, flags);
371 rs->rs_cong_notify &= ~notify;
372 spin_unlock_irqrestore(&rs->rs_lock, flags);
373
374 return 0;
375}
376
377/*
378 * Receive any control messages.
379 */
380static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
381{
382 int ret = 0;
383
384 if (inc->i_rdma_cookie) {
385 ret = put_cmsg(msg, SOL_RDS, RDS_CMSG_RDMA_DEST,
386 sizeof(inc->i_rdma_cookie), &inc->i_rdma_cookie);
387 if (ret)
388 return ret;
389 }
390
391 return 0;
392}
393
394int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
395 size_t size, int msg_flags)
396{
397 struct sock *sk = sock->sk;
398 struct rds_sock *rs = rds_sk_to_rs(sk);
399 long timeo;
400 int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
401 struct sockaddr_in *sin;
402 struct rds_incoming *inc = NULL;
403
404 /* udp_recvmsg()->sock_recvtimeo() gets away without locking too.. */
405 timeo = sock_rcvtimeo(sk, nonblock);
406
407 rdsdebug("size %zu flags 0x%x timeo %ld\n", size, msg_flags, timeo);
408
409 if (msg_flags & MSG_OOB)
410 goto out;
411
412 /* If there are pending notifications, do those - and nothing else */
413 if (!list_empty(&rs->rs_notify_queue)) {
414 ret = rds_notify_queue_get(rs, msg);
415 goto out;
416 }
417
418 if (rs->rs_cong_notify) {
419 ret = rds_notify_cong(rs, msg);
420 goto out;
421 }
422
423 while (1) {
424 if (!rds_next_incoming(rs, &inc)) {
425 if (nonblock) {
426 ret = -EAGAIN;
427 break;
428 }
429
430 timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
431 rds_next_incoming(rs, &inc),
432 timeo);
433 rdsdebug("recvmsg woke inc %p timeo %ld\n", inc,
434 timeo);
435 if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
436 continue;
437
438 ret = timeo;
439 if (ret == 0)
440 ret = -ETIMEDOUT;
441 break;
442 }
443
444 rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
445 &inc->i_conn->c_faddr,
446 ntohs(inc->i_hdr.h_sport));
447 ret = inc->i_conn->c_trans->inc_copy_to_user(inc, msg->msg_iov,
448 size);
449 if (ret < 0)
450 break;
451
452 /*
453 * if the message we just copied isn't at the head of the
454 * recv queue then someone else raced us to return it, try
455 * to get the next message.
456 */
457 if (!rds_still_queued(rs, inc, !(msg_flags & MSG_PEEK))) {
458 rds_inc_put(inc);
459 inc = NULL;
460 rds_stats_inc(s_recv_deliver_raced);
461 continue;
462 }
463
464 if (ret < be32_to_cpu(inc->i_hdr.h_len)) {
465 if (msg_flags & MSG_TRUNC)
466 ret = be32_to_cpu(inc->i_hdr.h_len);
467 msg->msg_flags |= MSG_TRUNC;
468 }
469
470 if (rds_cmsg_recv(inc, msg)) {
471 ret = -EFAULT;
472 goto out;
473 }
474
475 rds_stats_inc(s_recv_delivered);
476
477 sin = (struct sockaddr_in *)msg->msg_name;
478 if (sin) {
479 sin->sin_family = AF_INET;
480 sin->sin_port = inc->i_hdr.h_sport;
481 sin->sin_addr.s_addr = inc->i_saddr;
482 memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
483 }
484 break;
485 }
486
487 if (inc)
488 rds_inc_put(inc);
489
490out:
491 return ret;
492}
493
494/*
495 * The socket is being shut down and we're asked to drop messages that were
496 * queued for recvmsg. The caller has unbound the socket so the receive path
497 * won't queue any more incoming fragments or messages on the socket.
498 */
499void rds_clear_recv_queue(struct rds_sock *rs)
500{
501 struct sock *sk = rds_rs_to_sk(rs);
502 struct rds_incoming *inc, *tmp;
503 unsigned long flags;
504
505 write_lock_irqsave(&rs->rs_recv_lock, flags);
506 list_for_each_entry_safe(inc, tmp, &rs->rs_recv_queue, i_item) {
507 rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
508 -be32_to_cpu(inc->i_hdr.h_len),
509 inc->i_hdr.h_dport);
510 list_del_init(&inc->i_item);
511 rds_inc_put(inc);
512 }
513 write_unlock_irqrestore(&rs->rs_recv_lock, flags);
514}
515
516/*
517 * inc->i_saddr isn't used here because it is only set in the receive
518 * path.
519 */
520void rds_inc_info_copy(struct rds_incoming *inc,
521 struct rds_info_iterator *iter,
522 __be32 saddr, __be32 daddr, int flip)
523{
524 struct rds_info_message minfo;
525
526 minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence);
527 minfo.len = be32_to_cpu(inc->i_hdr.h_len);
528
529 if (flip) {
530 minfo.laddr = daddr;
531 minfo.faddr = saddr;
532 minfo.lport = inc->i_hdr.h_dport;
533 minfo.fport = inc->i_hdr.h_sport;
534 } else {
535 minfo.laddr = saddr;
536 minfo.faddr = daddr;
537 minfo.lport = inc->i_hdr.h_sport;
538 minfo.fport = inc->i_hdr.h_dport;
539 }
540
541 rds_info_copy(iter, &minfo, sizeof(minfo));
542}
diff --git a/net/rds/send.c b/net/rds/send.c
new file mode 100644
index 000000000000..1b37364656f0
--- /dev/null
+++ b/net/rds/send.c
@@ -0,0 +1,1003 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <net/sock.h>
35#include <linux/in.h>
36#include <linux/list.h>
37
38#include "rds.h"
39#include "rdma.h"
40
41/* When transmitting messages in rds_send_xmit, we need to emerge from
42 * time to time and briefly release the CPU. Otherwise the soft lockup watchdog
43 * will kick our shin.
44 * Also, it seems fairer to not let one busy connection stall all the
45 * others.
46 *
47 * send_batch_count is the number of times we'll loop in send_xmit. Setting
48 * it to 0 will restore the old behavior (where we looped until we had
49 * drained the queue).
50 */
51static int send_batch_count = 64;
52module_param(send_batch_count, int, 0444);
53MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue");
54
55/*
56 * Reset the send state. Caller must hold c_send_lock when calling here.
57 */
58void rds_send_reset(struct rds_connection *conn)
59{
60 struct rds_message *rm, *tmp;
61 unsigned long flags;
62
63 if (conn->c_xmit_rm) {
64 /* Tell the user the RDMA op is no longer mapped by the
65 * transport. This isn't entirely true (it's flushed out
66 * independently) but as the connection is down, there's
67 * no ongoing RDMA to/from that memory */
68 rds_message_unmapped(conn->c_xmit_rm);
69 rds_message_put(conn->c_xmit_rm);
70 conn->c_xmit_rm = NULL;
71 }
72 conn->c_xmit_sg = 0;
73 conn->c_xmit_hdr_off = 0;
74 conn->c_xmit_data_off = 0;
75 conn->c_xmit_rdma_sent = 0;
76
77 conn->c_map_queued = 0;
78
79 conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
80 conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
81
82 /* Mark messages as retransmissions, and move them to the send q */
83 spin_lock_irqsave(&conn->c_lock, flags);
84 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
85 set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
86 set_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags);
87 }
88 list_splice_init(&conn->c_retrans, &conn->c_send_queue);
89 spin_unlock_irqrestore(&conn->c_lock, flags);
90}
91
92/*
93 * We're making the conscious trade-off here to only send one message
94 * down the connection at a time.
95 * Pro:
96 * - tx queueing is a simple fifo list
97 * - reassembly is optional and easily done by transports per conn
98 * - no per flow rx lookup at all, straight to the socket
99 * - less per-frag memory and wire overhead
100 * Con:
101 * - queued acks can be delayed behind large messages
102 * Depends:
103 * - small message latency is higher behind queued large messages
104 * - large message latency isn't starved by intervening small sends
105 */
106int rds_send_xmit(struct rds_connection *conn)
107{
108 struct rds_message *rm;
109 unsigned long flags;
110 unsigned int tmp;
111 unsigned int send_quota = send_batch_count;
112 struct scatterlist *sg;
113 int ret = 0;
114 int was_empty = 0;
115 LIST_HEAD(to_be_dropped);
116
117 /*
118 * sendmsg calls here after having queued its message on the send
119 * queue. We only have one task feeding the connection at a time. If
120 * another thread is already feeding the queue then we back off. This
121 * avoids blocking the caller and trading per-connection data between
122 * caches per message.
123 *
124 * The sem holder will issue a retry if they notice that someone queued
125 * a message after they stopped walking the send queue but before they
126 * dropped the sem.
127 */
128 if (!mutex_trylock(&conn->c_send_lock)) {
129 rds_stats_inc(s_send_sem_contention);
130 ret = -ENOMEM;
131 goto out;
132 }
133
134 if (conn->c_trans->xmit_prepare)
135 conn->c_trans->xmit_prepare(conn);
136
137 /*
138 * spin trying to push headers and data down the connection until
139 * the connection stops making forward progress.
140 */
141 while (--send_quota) {
142 /*
143 * See if we need to send a congestion map update if we're
144 * between sending messages. The send_sem protects our sole
145 * use of c_map_offset and _bytes.
146 * Note this is used only by transports that define a special
147 * xmit_cong_map function. For all others, we allocate
148 * a cong_map message and treat it just like any other send.
149 */
150 if (conn->c_map_bytes) {
151 ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong,
152 conn->c_map_offset);
153 if (ret <= 0)
154 break;
155
156 conn->c_map_offset += ret;
157 conn->c_map_bytes -= ret;
158 if (conn->c_map_bytes)
159 continue;
160 }
161
162 /* If we're done sending the current message, clear the
163 * offset and S/G temporaries.
164 */
165 rm = conn->c_xmit_rm;
166 if (rm != NULL &&
167 conn->c_xmit_hdr_off == sizeof(struct rds_header) &&
168 conn->c_xmit_sg == rm->m_nents) {
169 conn->c_xmit_rm = NULL;
170 conn->c_xmit_sg = 0;
171 conn->c_xmit_hdr_off = 0;
172 conn->c_xmit_data_off = 0;
173 conn->c_xmit_rdma_sent = 0;
174
175 /* Release the reference to the previous message. */
176 rds_message_put(rm);
177 rm = NULL;
178 }
179
180 /* If we're asked to send a cong map update, do so.
181 */
182 if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) {
183 if (conn->c_trans->xmit_cong_map != NULL) {
184 conn->c_map_offset = 0;
185 conn->c_map_bytes = sizeof(struct rds_header) +
186 RDS_CONG_MAP_BYTES;
187 continue;
188 }
189
190 rm = rds_cong_update_alloc(conn);
191 if (IS_ERR(rm)) {
192 ret = PTR_ERR(rm);
193 break;
194 }
195
196 conn->c_xmit_rm = rm;
197 }
198
199 /*
200 * Grab the next message from the send queue, if there is one.
201 *
202 * c_xmit_rm holds a ref while we're sending this message down
203 * the connection. We can use this ref while holding the
204 * send_sem; rds_send_reset() is serialized with it.
205 */
206 if (rm == NULL) {
207 unsigned int len;
208
209 spin_lock_irqsave(&conn->c_lock, flags);
210
211 if (!list_empty(&conn->c_send_queue)) {
212 rm = list_entry(conn->c_send_queue.next,
213 struct rds_message,
214 m_conn_item);
215 rds_message_addref(rm);
216
217 /*
218 * Move the message from the send queue to the retransmit
219 * list right away.
220 */
221 list_move_tail(&rm->m_conn_item, &conn->c_retrans);
222 }
223
224 spin_unlock_irqrestore(&conn->c_lock, flags);
225
226 if (rm == NULL) {
227 was_empty = 1;
228 break;
229 }
230
231 /* Unfortunately, the way Infiniband deals with
232 * RDMA to a bad MR key is by moving the entire
233 * queue pair to error state. We could possibly
234 * recover from that, but right now we drop the
235 * connection.
236 * Therefore, we never retransmit messages with RDMA ops.
237 */
238 if (rm->m_rdma_op
239 && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) {
240 spin_lock_irqsave(&conn->c_lock, flags);
241 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags))
242 list_move(&rm->m_conn_item, &to_be_dropped);
243 spin_unlock_irqrestore(&conn->c_lock, flags);
244 rds_message_put(rm);
245 continue;
246 }
247
248 /* Require an ACK every once in a while */
249 len = ntohl(rm->m_inc.i_hdr.h_len);
250 if (conn->c_unacked_packets == 0
251 || conn->c_unacked_bytes < len) {
252 __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
253
254 conn->c_unacked_packets = rds_sysctl_max_unacked_packets;
255 conn->c_unacked_bytes = rds_sysctl_max_unacked_bytes;
256 rds_stats_inc(s_send_ack_required);
257 } else {
258 conn->c_unacked_bytes -= len;
259 conn->c_unacked_packets--;
260 }
261
262 conn->c_xmit_rm = rm;
263 }
264
265 /*
266 * Try and send an rdma message. Let's see if we can
267 * keep this simple and require that the transport either
268 * send the whole rdma or none of it.
269 */
270 if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) {
271 ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op);
272 if (ret)
273 break;
274 conn->c_xmit_rdma_sent = 1;
275 /* The transport owns the mapped memory for now.
276 * You can't unmap it while it's on the send queue */
277 set_bit(RDS_MSG_MAPPED, &rm->m_flags);
278 }
279
280 if (conn->c_xmit_hdr_off < sizeof(struct rds_header) ||
281 conn->c_xmit_sg < rm->m_nents) {
282 ret = conn->c_trans->xmit(conn, rm,
283 conn->c_xmit_hdr_off,
284 conn->c_xmit_sg,
285 conn->c_xmit_data_off);
286 if (ret <= 0)
287 break;
288
289 if (conn->c_xmit_hdr_off < sizeof(struct rds_header)) {
290 tmp = min_t(int, ret,
291 sizeof(struct rds_header) -
292 conn->c_xmit_hdr_off);
293 conn->c_xmit_hdr_off += tmp;
294 ret -= tmp;
295 }
296
297 sg = &rm->m_sg[conn->c_xmit_sg];
298 while (ret) {
299 tmp = min_t(int, ret, sg->length -
300 conn->c_xmit_data_off);
301 conn->c_xmit_data_off += tmp;
302 ret -= tmp;
303 if (conn->c_xmit_data_off == sg->length) {
304 conn->c_xmit_data_off = 0;
305 sg++;
306 conn->c_xmit_sg++;
307 BUG_ON(ret != 0 &&
308 conn->c_xmit_sg == rm->m_nents);
309 }
310 }
311 }
312 }
313
314 /* Nuke any messages we decided not to retransmit. */
315 if (!list_empty(&to_be_dropped))
316 rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED);
317
318 if (conn->c_trans->xmit_complete)
319 conn->c_trans->xmit_complete(conn);
320
321 /*
322 * We might be racing with another sender who queued a message but
323 * backed off on noticing that we held the c_send_lock. If we check
324 * for queued messages after dropping the sem then either we'll
325 * see the queued message or the queuer will get the sem. If we
326 * notice the queued message then we trigger an immediate retry.
327 *
328 * We need to be careful only to do this when we stopped processing
329 * the send queue because it was empty. It's the only way we
330 * stop processing the loop when the transport hasn't taken
331 * responsibility for forward progress.
332 */
333 mutex_unlock(&conn->c_send_lock);
334
335 if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) {
336 /* We exhausted the send quota, but there's work left to
337 * do. Return and (re-)schedule the send worker.
338 */
339 ret = -EAGAIN;
340 }
341
342 if (ret == 0 && was_empty) {
343 /* A simple bit test would be way faster than taking the
344 * spin lock */
345 spin_lock_irqsave(&conn->c_lock, flags);
346 if (!list_empty(&conn->c_send_queue)) {
347 rds_stats_inc(s_send_sem_queue_raced);
348 ret = -EAGAIN;
349 }
350 spin_unlock_irqrestore(&conn->c_lock, flags);
351 }
352out:
353 return ret;
354}
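
The trylock-and-recheck pattern rds_send_xmit() builds on is easy to get wrong, so here is the same idea restated as a small standalone sketch (pthread flavour, with a made-up toy_conn type; illustrative only, not the kernel code):

#include <pthread.h>

/* Toy stand-in for the connection: a count of queued messages guarded
 * by q_lock, and a send_lock that serializes the single drainer. */
struct toy_conn {
        pthread_mutex_t q_lock;
        pthread_mutex_t send_lock;      /* plays the role of c_send_lock */
        int queued;
        int need_retry;                 /* analogous to returning -EAGAIN */
};

void toy_xmit(struct toy_conn *c)
{
        int budget = 64;                /* analogous to send_batch_count */

        if (pthread_mutex_trylock(&c->send_lock))
                return;                 /* someone else is already draining */

        while (--budget) {
                pthread_mutex_lock(&c->q_lock);
                if (c->queued == 0) {
                        pthread_mutex_unlock(&c->q_lock);
                        break;
                }
                c->queued--;            /* "transmit" one message */
                pthread_mutex_unlock(&c->q_lock);
        }

        pthread_mutex_unlock(&c->send_lock);

        /* A producer may have queued a message and backed off while we
         * held send_lock; recheck after dropping it and ask for a retry
         * instead of losing the wakeup. */
        pthread_mutex_lock(&c->q_lock);
        if (c->queued)
                c->need_retry = 1;
        pthread_mutex_unlock(&c->q_lock);
}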
355
356static void rds_send_sndbuf_remove(struct rds_sock *rs, struct rds_message *rm)
357{
358 u32 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
359
360 assert_spin_locked(&rs->rs_lock);
361
362 BUG_ON(rs->rs_snd_bytes < len);
363 rs->rs_snd_bytes -= len;
364
365 if (rs->rs_snd_bytes == 0)
366 rds_stats_inc(s_send_queue_empty);
367}
368
369static inline int rds_send_is_acked(struct rds_message *rm, u64 ack,
370 is_acked_func is_acked)
371{
372 if (is_acked)
373 return is_acked(rm, ack);
374 return be64_to_cpu(rm->m_inc.i_hdr.h_sequence) <= ack;
375}
376
377/*
378 * Returns true if there are no messages on the send and retransmit queues
379 * which have a sequence number greater than or equal to the given sequence
380 * number.
381 */
382int rds_send_acked_before(struct rds_connection *conn, u64 seq)
383{
384 struct rds_message *rm, *tmp;
385 int ret = 1;
386
387 spin_lock(&conn->c_lock);
388
389 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
390 if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
391 ret = 0;
392 break;
393 }
394
395 list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
396 if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq)
397 ret = 0;
398 break;
399 }
400
401 spin_unlock(&conn->c_lock);
402
403 return ret;
404}
405
406/*
407 * This is pretty similar to what happens below in the ACK
408 * handling code - except that we call here as soon as we get
409 * the IB send completion on the RDMA op and the accompanying
410 * message.
411 */
412void rds_rdma_send_complete(struct rds_message *rm, int status)
413{
414 struct rds_sock *rs = NULL;
415 struct rds_rdma_op *ro;
416 struct rds_notifier *notifier;
417
418 spin_lock(&rm->m_rs_lock);
419
420 ro = rm->m_rdma_op;
421 if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags)
422 && ro && ro->r_notify && ro->r_notifier) {
423 notifier = ro->r_notifier;
424 rs = rm->m_rs;
425 sock_hold(rds_rs_to_sk(rs));
426
427 notifier->n_status = status;
428 spin_lock(&rs->rs_lock);
429 list_add_tail(&notifier->n_list, &rs->rs_notify_queue);
430 spin_unlock(&rs->rs_lock);
431
432 ro->r_notifier = NULL;
433 }
434
435 spin_unlock(&rm->m_rs_lock);
436
437 if (rs) {
438 rds_wake_sk_sleep(rs);
439 sock_put(rds_rs_to_sk(rs));
440 }
441}
442
443/*
444 * This is the same as rds_rdma_send_complete except we
445 * don't do any locking - we have all the ingredients (message,
446 * socket, socket lock) and can just move the notifier.
447 */
448static inline void
449__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status)
450{
451 struct rds_rdma_op *ro;
452
453 ro = rm->m_rdma_op;
454 if (ro && ro->r_notify && ro->r_notifier) {
455 ro->r_notifier->n_status = status;
456 list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue);
457 ro->r_notifier = NULL;
458 }
459
460 /* No need to wake the app - caller does this */
461}
462
463/*
464 * This is called from the IB send completion when we detect
465 * a RDMA operation that failed with remote access error.
466 * So speed is not an issue here.
467 */
468struct rds_message *rds_send_get_message(struct rds_connection *conn,
469 struct rds_rdma_op *op)
470{
471 struct rds_message *rm, *tmp, *found = NULL;
472 unsigned long flags;
473
474 spin_lock_irqsave(&conn->c_lock, flags);
475
476 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
477 if (rm->m_rdma_op == op) {
478 atomic_inc(&rm->m_refcount);
479 found = rm;
480 goto out;
481 }
482 }
483
484 list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) {
485 if (rm->m_rdma_op == op) {
486 atomic_inc(&rm->m_refcount);
487 found = rm;
488 break;
489 }
490 }
491
492out:
493 spin_unlock_irqrestore(&conn->c_lock, flags);
494
495 return found;
496}
497
498/*
499 * This removes messages from the socket's list if they're on it. The list
500 * argument must be private to the caller, we must be able to modify it
501 * without locks. The messages must have a reference held for their
502 * position on the list. This function will drop that reference after
503 * removing the messages from the 'messages' list regardless of if it found
504 * the messages on the socket list or not.
505 */
506void rds_send_remove_from_sock(struct list_head *messages, int status)
507{
508 unsigned long flags = 0; /* silence gcc :P */
509 struct rds_sock *rs = NULL;
510 struct rds_message *rm;
511
512 local_irq_save(flags);
513 while (!list_empty(messages)) {
514 rm = list_entry(messages->next, struct rds_message,
515 m_conn_item);
516 list_del_init(&rm->m_conn_item);
517
518 /*
519 * If we see this flag cleared then we're *sure* that someone
520 * else beat us to removing it from the sock. If we race
521 * with their flag update we'll get the lock and then really
522 * see that the flag has been cleared.
523 *
524 * The message spinlock makes sure nobody clears rm->m_rs
525 * while we're messing with it. It does not prevent the
526 * message from being removed from the socket, though.
527 */
528 spin_lock(&rm->m_rs_lock);
529 if (!test_bit(RDS_MSG_ON_SOCK, &rm->m_flags))
530 goto unlock_and_drop;
531
532 if (rs != rm->m_rs) {
533 if (rs) {
534 spin_unlock(&rs->rs_lock);
535 rds_wake_sk_sleep(rs);
536 sock_put(rds_rs_to_sk(rs));
537 }
538 rs = rm->m_rs;
539 spin_lock(&rs->rs_lock);
540 sock_hold(rds_rs_to_sk(rs));
541 }
542
543 if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) {
544 struct rds_rdma_op *ro = rm->m_rdma_op;
545 struct rds_notifier *notifier;
546
547 list_del_init(&rm->m_sock_item);
548 rds_send_sndbuf_remove(rs, rm);
549
550 if (ro && ro->r_notifier
551 && (status || ro->r_notify)) {
552 notifier = ro->r_notifier;
553 list_add_tail(&notifier->n_list,
554 &rs->rs_notify_queue);
555 if (!notifier->n_status)
556 notifier->n_status = status;
557 rm->m_rdma_op->r_notifier = NULL;
558 }
559 rds_message_put(rm);
560 rm->m_rs = NULL;
561 }
562
563unlock_and_drop:
564 spin_unlock(&rm->m_rs_lock);
565 rds_message_put(rm);
566 }
567
568 if (rs) {
569 spin_unlock(&rs->rs_lock);
570 rds_wake_sk_sleep(rs);
571 sock_put(rds_rs_to_sk(rs));
572 }
573 local_irq_restore(flags);
574}
575
576/*
577 * Transports call here when they've determined that the receiver queued
578 * messages up to, and including, the given sequence number. Messages are
579 * moved to the retrans queue when rds_send_xmit picks them off the send
580 * queue. This means that in the TCP case, the message may not have been
581 * assigned the m_ack_seq yet - but that's fine as long as tcp_is_acked
582 * checks the RDS_MSG_HAS_ACK_SEQ bit.
583 *
584 * XXX It's not clear to me how this is safely serialized with socket
585 * destruction. Maybe it should bail if it sees SOCK_DEAD.
586 */
587void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
588 is_acked_func is_acked)
589{
590 struct rds_message *rm, *tmp;
591 unsigned long flags;
592 LIST_HEAD(list);
593
594 spin_lock_irqsave(&conn->c_lock, flags);
595
596 list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) {
597 if (!rds_send_is_acked(rm, ack, is_acked))
598 break;
599
600 list_move(&rm->m_conn_item, &list);
601 clear_bit(RDS_MSG_ON_CONN, &rm->m_flags);
602 }
603
604 /* order flag updates with spin locks */
605 if (!list_empty(&list))
606 smp_mb__after_clear_bit();
607
608 spin_unlock_irqrestore(&conn->c_lock, flags);
609
610 /* now remove the messages from the sock list as needed */
611 rds_send_remove_from_sock(&list, RDS_RDMA_SUCCESS);
612}
613
614void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
615{
616 struct rds_message *rm, *tmp;
617 struct rds_connection *conn;
618 unsigned long flags;
619 LIST_HEAD(list);
620 int wake = 0;
621
622 /* get all the messages we're dropping under the rs lock */
623 spin_lock_irqsave(&rs->rs_lock, flags);
624
625 list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
626 if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
627 dest->sin_port != rm->m_inc.i_hdr.h_dport))
628 continue;
629
630 wake = 1;
631 list_move(&rm->m_sock_item, &list);
632 rds_send_sndbuf_remove(rs, rm);
633 clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
634
635 /* If this is a RDMA operation, notify the app. */
636 __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED);
637 }
638
639 /* order flag updates with the rs lock */
640 if (wake)
641 smp_mb__after_clear_bit();
642
643 spin_unlock_irqrestore(&rs->rs_lock, flags);
644
645 if (wake)
646 rds_wake_sk_sleep(rs);
647
648 conn = NULL;
649
650 /* now remove the messages from the conn list as needed */
651 list_for_each_entry(rm, &list, m_sock_item) {
652 /* We do this here rather than in the loop above, so that
653 * we don't have to nest m_rs_lock under rs->rs_lock */
654 spin_lock(&rm->m_rs_lock);
655 rm->m_rs = NULL;
656 spin_unlock(&rm->m_rs_lock);
657
658 /*
659 * If we see this flag cleared then we're *sure* that someone
660 * else beat us to removing it from the conn. If we race
661 * with their flag update we'll get the lock and then really
662 * see that the flag has been cleared.
663 */
664 if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags))
665 continue;
666
667 if (conn != rm->m_inc.i_conn) {
668 if (conn)
669 spin_unlock_irqrestore(&conn->c_lock, flags);
670 conn = rm->m_inc.i_conn;
671 spin_lock_irqsave(&conn->c_lock, flags);
672 }
673
674 if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) {
675 list_del_init(&rm->m_conn_item);
676 rds_message_put(rm);
677 }
678 }
679
680 if (conn)
681 spin_unlock_irqrestore(&conn->c_lock, flags);
682
683 while (!list_empty(&list)) {
684 rm = list_entry(list.next, struct rds_message, m_sock_item);
685 list_del_init(&rm->m_sock_item);
686
687 rds_message_wait(rm);
688 rds_message_put(rm);
689 }
690}
691
692/*
693 * We only want this to fire once so we use the caller's 'queued'. It's
694 * possible that another thread can race with us and remove the
695 * message from the flow with RDS_CANCEL_SENT_TO.
696 */
697static int rds_send_queue_rm(struct rds_sock *rs, struct rds_connection *conn,
698 struct rds_message *rm, __be16 sport,
699 __be16 dport, int *queued)
700{
701 unsigned long flags;
702 u32 len;
703
704 if (*queued)
705 goto out;
706
707 len = be32_to_cpu(rm->m_inc.i_hdr.h_len);
708
709 /* this is the only place which holds both the socket's rs_lock
710 * and the connection's c_lock */
711 spin_lock_irqsave(&rs->rs_lock, flags);
712
713 /*
714 * If there is a little space in sndbuf, we don't queue anything,
715 * and userspace gets -EAGAIN. But poll() indicates there's send
716 * room. This can lead to bad behavior (spinning) if snd_bytes isn't
717 * freed up by incoming acks. So we check the *old* value of
718 * rs_snd_bytes here to allow the last msg to exceed the buffer,
719 * and poll() now knows no more data can be sent.
720 */
721 if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) {
722 rs->rs_snd_bytes += len;
723
724 /* let recv side know we are close to send space exhaustion.
725 * This is probably not the optimal way to do it, as this
726 * means we set the flag on *all* messages as soon as our
727 * throughput hits a certain threshold.
728 */
729 if (rs->rs_snd_bytes >= rds_sk_sndbuf(rs) / 2)
730 __set_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags);
731
732 list_add_tail(&rm->m_sock_item, &rs->rs_send_queue);
733 set_bit(RDS_MSG_ON_SOCK, &rm->m_flags);
734 rds_message_addref(rm);
735 rm->m_rs = rs;
736
737 /* The code ordering is a little weird, but we're
738 trying to minimize the time we hold c_lock */
739 rds_message_populate_header(&rm->m_inc.i_hdr, sport, dport, 0);
740 rm->m_inc.i_conn = conn;
741 rds_message_addref(rm);
742
743 spin_lock(&conn->c_lock);
744 rm->m_inc.i_hdr.h_sequence = cpu_to_be64(conn->c_next_tx_seq++);
745 list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
746 set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
747 spin_unlock(&conn->c_lock);
748
749 rdsdebug("queued msg %p len %d, rs %p bytes %d seq %llu\n",
750 rm, len, rs, rs->rs_snd_bytes,
751 (unsigned long long)be64_to_cpu(rm->m_inc.i_hdr.h_sequence));
752
753 *queued = 1;
754 }
755
756 spin_unlock_irqrestore(&rs->rs_lock, flags);
757out:
758 return *queued;
759}
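
The key subtlety in the check above is that the *old* value of rs_snd_bytes is compared against the limit, so the final message may overshoot the buffer, after which poll() stops reporting send room. A stripped-down illustration of just that admission rule (plain C, not kernel code):

/* Returns 1 and accounts the message if it may be queued, 0 otherwise.
 * The comparison happens before the addition: a message is admitted
 * whenever any room is left, even if it overshoots sndbuf. */
int sndbuf_admit(unsigned int *snd_bytes, unsigned int sndbuf,
                 unsigned int len)
{
        if (*snd_bytes >= sndbuf)
                return 0;       /* caller sees -EAGAIN or waits for acks */
        *snd_bytes += len;      /* may now exceed sndbuf by up to len */
        return 1;
}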
760
761static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
762 struct msghdr *msg, int *allocated_mr)
763{
764 struct cmsghdr *cmsg;
765 int ret = 0;
766
767 for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) {
768 if (!CMSG_OK(msg, cmsg))
769 return -EINVAL;
770
771 if (cmsg->cmsg_level != SOL_RDS)
772 continue;
773
774 /* As a side effect, RDMA_DEST and RDMA_MAP will set
775 * rm->m_rdma_cookie and rm->m_rdma_mr.
776 */
777 switch (cmsg->cmsg_type) {
778 case RDS_CMSG_RDMA_ARGS:
779 ret = rds_cmsg_rdma_args(rs, rm, cmsg);
780 break;
781
782 case RDS_CMSG_RDMA_DEST:
783 ret = rds_cmsg_rdma_dest(rs, rm, cmsg);
784 break;
785
786 case RDS_CMSG_RDMA_MAP:
787 ret = rds_cmsg_rdma_map(rs, rm, cmsg);
788 if (!ret)
789 *allocated_mr = 1;
790 break;
791
792 default:
793 return -EINVAL;
794 }
795
796 if (ret)
797 break;
798 }
799
800 return ret;
801}
802
803int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
804 size_t payload_len)
805{
806 struct sock *sk = sock->sk;
807 struct rds_sock *rs = rds_sk_to_rs(sk);
808 struct sockaddr_in *usin = (struct sockaddr_in *)msg->msg_name;
809 __be32 daddr;
810 __be16 dport;
811 struct rds_message *rm = NULL;
812 struct rds_connection *conn;
813 int ret = 0;
814 int queued = 0, allocated_mr = 0;
815 int nonblock = msg->msg_flags & MSG_DONTWAIT;
816 long timeo = sock_sndtimeo(sk, nonblock);
817
818 /* Mirror Linux UDP's mirroring of BSD error message compatibility */
819 /* XXX: Perhaps MSG_MORE someday */
820 if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_CMSG_COMPAT)) {
821 printk(KERN_INFO "msg_flags 0x%08X\n", msg->msg_flags);
822 ret = -EOPNOTSUPP;
823 goto out;
824 }
825
826 if (msg->msg_namelen) {
827 /* XXX fail non-unicast destination IPs? */
828 if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
829 ret = -EINVAL;
830 goto out;
831 }
832 daddr = usin->sin_addr.s_addr;
833 dport = usin->sin_port;
834 } else {
835 /* We only care about consistency with ->connect() */
836 lock_sock(sk);
837 daddr = rs->rs_conn_addr;
838 dport = rs->rs_conn_port;
839 release_sock(sk);
840 }
841
842 /* racing with another thread binding seems ok here */
843 if (daddr == 0 || rs->rs_bound_addr == 0) {
844 ret = -ENOTCONN; /* XXX not a great errno */
845 goto out;
846 }
847
848 rm = rds_message_copy_from_user(msg->msg_iov, payload_len);
849 if (IS_ERR(rm)) {
850 ret = PTR_ERR(rm);
851 rm = NULL;
852 goto out;
853 }
854
855 rm->m_daddr = daddr;
856
857 /* Parse any control messages the user may have included. */
858 ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
859 if (ret)
860 goto out;
861
862 /* rds_conn_create has a spinlock that runs with IRQ off.
863 * Caching the conn in the socket helps a lot. */
864 if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
865 conn = rs->rs_conn;
866 else {
867 conn = rds_conn_create_outgoing(rs->rs_bound_addr, daddr,
868 rs->rs_transport,
869 sock->sk->sk_allocation);
870 if (IS_ERR(conn)) {
871 ret = PTR_ERR(conn);
872 goto out;
873 }
874 rs->rs_conn = conn;
875 }
876
877 if ((rm->m_rdma_cookie || rm->m_rdma_op)
878 && conn->c_trans->xmit_rdma == NULL) {
879 if (printk_ratelimit())
880 printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n",
881 rm->m_rdma_op, conn->c_trans->xmit_rdma);
882 ret = -EOPNOTSUPP;
883 goto out;
884 }
885
886 /* If the connection is down, trigger a connect. We may
887 * have scheduled a delayed reconnect however - in this case
888 * we should not interfere.
889 */
890 if (rds_conn_state(conn) == RDS_CONN_DOWN
891 && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
892 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
893
894 ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs);
895 if (ret)
896 goto out;
897
898 while (!rds_send_queue_rm(rs, conn, rm, rs->rs_bound_port,
899 dport, &queued)) {
900 rds_stats_inc(s_send_queue_full);
901 /* XXX make sure this is reasonable */
902 if (payload_len > rds_sk_sndbuf(rs)) {
903 ret = -EMSGSIZE;
904 goto out;
905 }
906 if (nonblock) {
907 ret = -EAGAIN;
908 goto out;
909 }
910
911 timeo = wait_event_interruptible_timeout(*sk->sk_sleep,
912 rds_send_queue_rm(rs, conn, rm,
913 rs->rs_bound_port,
914 dport,
915 &queued),
916 timeo);
917 rdsdebug("sendmsg woke queued %d timeo %ld\n", queued, timeo);
918 if (timeo > 0 || timeo == MAX_SCHEDULE_TIMEOUT)
919 continue;
920
921 ret = timeo;
922 if (ret == 0)
923 ret = -ETIMEDOUT;
924 goto out;
925 }
926
927 /*
928 * By now we've committed to the send. We reuse rds_send_worker()
929 * to retry sends in the rds thread if the transport asks us to.
930 */
931 rds_stats_inc(s_send_queued);
932
933 if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags))
934 rds_send_worker(&conn->c_send_w.work);
935
936 rds_message_put(rm);
937 return payload_len;
938
939out:
940 /* If the user included a RDMA_MAP cmsg, we allocated an MR on the fly.
941 * If the sendmsg goes through, we keep the MR. If it fails with EAGAIN
942 * or in any other way, we need to destroy the MR again */
943 if (allocated_mr)
944 rds_rdma_unuse(rs, rds_rdma_cookie_key(rm->m_rdma_cookie), 1);
945
946 if (rm)
947 rds_message_put(rm);
948 return ret;
949}
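
For reference, a hypothetical userspace counterpart of the path rds_sendmsg() services: an RDS socket bound to a local IPv4 address and one datagram sent to a peer. It assumes AF_RDS is defined by the system headers; the addresses and port numbers are arbitrary illustration values.

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

int rds_send_example(const char *laddr, const char *faddr)
{
        struct sockaddr_in sin;
        char payload[] = "hello";
        int fd;

        fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
        if (fd < 0)
                return -1;

        /* RDS requires a bound local address before sending */
        memset(&sin, 0, sizeof(sin));
        sin.sin_family = AF_INET;
        sin.sin_addr.s_addr = inet_addr(laddr);
        sin.sin_port = htons(4000);
        if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) < 0)
                return -1;

        /* One sendto() maps to one rds_sendmsg() call in the kernel */
        sin.sin_addr.s_addr = inet_addr(faddr);
        sin.sin_port = htons(4001);
        return (int)sendto(fd, payload, sizeof(payload), 0,
                           (struct sockaddr *)&sin, sizeof(sin));
}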
950
951/*
952 * Reply to a ping packet.
953 */
954int
955rds_send_pong(struct rds_connection *conn, __be16 dport)
956{
957 struct rds_message *rm;
958 unsigned long flags;
959 int ret = 0;
960
961 rm = rds_message_alloc(0, GFP_ATOMIC);
962 if (rm == NULL) {
963 ret = -ENOMEM;
964 goto out;
965 }
966
967 rm->m_daddr = conn->c_faddr;
968
969 /* If the connection is down, trigger a connect. We may
970 * have scheduled a delayed reconnect however - in this case
971 * we should not interfere.
972 */
973 if (rds_conn_state(conn) == RDS_CONN_DOWN
974 && !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
975 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
976
977 ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL);
978 if (ret)
979 goto out;
980
981 spin_lock_irqsave(&conn->c_lock, flags);
982 list_add_tail(&rm->m_conn_item, &conn->c_send_queue);
983 set_bit(RDS_MSG_ON_CONN, &rm->m_flags);
984 rds_message_addref(rm);
985 rm->m_inc.i_conn = conn;
986
987 rds_message_populate_header(&rm->m_inc.i_hdr, 0, dport,
988 conn->c_next_tx_seq);
989 conn->c_next_tx_seq++;
990 spin_unlock_irqrestore(&conn->c_lock, flags);
991
992 rds_stats_inc(s_send_queued);
993 rds_stats_inc(s_send_pong);
994
995 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
996 rds_message_put(rm);
997 return 0;
998
999out:
1000 if (rm)
1001 rds_message_put(rm);
1002 return ret;
1003}
diff --git a/net/rds/stats.c b/net/rds/stats.c
new file mode 100644
index 000000000000..637146893cf3
--- /dev/null
+++ b/net/rds/stats.c
@@ -0,0 +1,148 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/percpu.h>
34#include <linux/seq_file.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38
39DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats);
40
41/* :.,$s/unsigned long\>.*\<s_\(.*\);/"\1",/g */
42
43static char *rds_stat_names[] = {
44 "conn_reset",
45 "recv_drop_bad_checksum",
46 "recv_drop_old_seq",
47 "recv_drop_no_sock",
48 "recv_drop_dead_sock",
49 "recv_deliver_raced",
50 "recv_delivered",
51 "recv_queued",
52 "recv_immediate_retry",
53 "recv_delayed_retry",
54 "recv_ack_required",
55 "recv_rdma_bytes",
56 "recv_ping",
57 "send_queue_empty",
58 "send_queue_full",
59 "send_sem_contention",
60 "send_sem_queue_raced",
61 "send_immediate_retry",
62 "send_delayed_retry",
63 "send_drop_acked",
64 "send_ack_required",
65 "send_queued",
66 "send_rdma",
67 "send_rdma_bytes",
68 "send_pong",
69 "page_remainder_hit",
70 "page_remainder_miss",
71 "copy_to_user",
72 "copy_from_user",
73 "cong_update_queued",
74 "cong_update_received",
75 "cong_send_error",
76 "cong_send_blocked",
77};
78
79void rds_stats_info_copy(struct rds_info_iterator *iter,
80 uint64_t *values, char **names, size_t nr)
81{
82 struct rds_info_counter ctr;
83 size_t i;
84
85 for (i = 0; i < nr; i++) {
86 BUG_ON(strlen(names[i]) >= sizeof(ctr.name));
87 strncpy(ctr.name, names[i], sizeof(ctr.name) - 1);
88 ctr.value = values[i];
89
90 rds_info_copy(iter, &ctr, sizeof(ctr));
91 }
92}
93
94/*
95 * This gives global counters across all the transports. The strings
96 * are copied in so that the tool doesn't need knowledge of the specific
97 * stats that we're exporting. Some are pretty implementation dependent
98 * and may change over time. That doesn't stop them from being useful.
99 *
100 * This is the only function in the chain that knows about the byte-granular
101 * length in userspace. It converts that length to the number of stat entries
102 * that the rest of the functions operate on.
103 */
104static void rds_stats_info(struct socket *sock, unsigned int len,
105 struct rds_info_iterator *iter,
106 struct rds_info_lengths *lens)
107{
108 struct rds_statistics stats = {0, };
109 uint64_t *src;
110 uint64_t *sum;
111 size_t i;
112 int cpu;
113 unsigned int avail;
114
115 avail = len / sizeof(struct rds_info_counter);
116
117 if (avail < ARRAY_SIZE(rds_stat_names)) {
118 avail = 0;
119 goto trans;
120 }
121
122 for_each_online_cpu(cpu) {
123 src = (uint64_t *)&(per_cpu(rds_stats, cpu));
124 sum = (uint64_t *)&stats;
125 for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
126 *(sum++) += *(src++);
127 }
128
129 rds_stats_info_copy(iter, (uint64_t *)&stats, rds_stat_names,
130 ARRAY_SIZE(rds_stat_names));
131 avail -= ARRAY_SIZE(rds_stat_names);
132
133trans:
134 lens->each = sizeof(struct rds_info_counter);
135 lens->nr = rds_trans_stats_info_copy(iter, avail) +
136 ARRAY_SIZE(rds_stat_names);
137}
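
The loop above leans on rds_statistics being nothing but u64 counters, so each per-CPU copy can be folded in as a flat array. The same trick in standalone form, with a hypothetical counter struct rather than the RDS one:

#include <stdint.h>
#include <stddef.h>

struct counters {
        uint64_t rx_packets;
        uint64_t tx_packets;
        uint64_t drops;
};

/* Sum ncpus per-CPU copies into *total by walking the structs as flat
 * arrays of uint64_t, without naming each field. */
void fold_counters(struct counters *total,
                   const struct counters *percpu, size_t ncpus)
{
        uint64_t *sum;
        const uint64_t *src;
        size_t cpu, i;

        for (cpu = 0; cpu < ncpus; cpu++) {
                sum = (uint64_t *)total;
                src = (const uint64_t *)&percpu[cpu];
                for (i = 0; i < sizeof(*total) / sizeof(uint64_t); i++)
                        *sum++ += *src++;
        }
}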
138
139void rds_stats_exit(void)
140{
141 rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info);
142}
143
144int __init rds_stats_init(void)
145{
146 rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info);
147 return 0;
148}
diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c
new file mode 100644
index 000000000000..307dc5c1be15
--- /dev/null
+++ b/net/rds/sysctl.c
@@ -0,0 +1,122 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/sysctl.h>
35#include <linux/proc_fs.h>
36
37#include "rds.h"
38
39static struct ctl_table_header *rds_sysctl_reg_table;
40
41static unsigned long rds_sysctl_reconnect_min = 1;
42static unsigned long rds_sysctl_reconnect_max = ~0UL;
43
44unsigned long rds_sysctl_reconnect_min_jiffies;
45unsigned long rds_sysctl_reconnect_max_jiffies = HZ;
46
47unsigned int rds_sysctl_max_unacked_packets = 8;
48unsigned int rds_sysctl_max_unacked_bytes = (16 << 20);
49
50unsigned int rds_sysctl_ping_enable = 1;
51
52static ctl_table rds_sysctl_rds_table[] = {
53 {
54 .ctl_name = CTL_UNNUMBERED,
55 .procname = "reconnect_min_delay_ms",
56 .data = &rds_sysctl_reconnect_min_jiffies,
57 .maxlen = sizeof(unsigned long),
58 .mode = 0644,
59 .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
60 .extra1 = &rds_sysctl_reconnect_min,
61 .extra2 = &rds_sysctl_reconnect_max_jiffies,
62 },
63 {
64 .ctl_name = CTL_UNNUMBERED,
65 .procname = "reconnect_max_delay_ms",
66 .data = &rds_sysctl_reconnect_max_jiffies,
67 .maxlen = sizeof(unsigned long),
68 .mode = 0644,
69 .proc_handler = &proc_doulongvec_ms_jiffies_minmax,
70 .extra1 = &rds_sysctl_reconnect_min_jiffies,
71 .extra2 = &rds_sysctl_reconnect_max,
72 },
73 {
74 .ctl_name = CTL_UNNUMBERED,
75 .procname = "max_unacked_packets",
76 .data = &rds_sysctl_max_unacked_packets,
77 .maxlen = sizeof(unsigned long),
78 .mode = 0644,
79 .proc_handler = &proc_dointvec,
80 },
81 {
82 .ctl_name = CTL_UNNUMBERED,
83 .procname = "max_unacked_bytes",
84 .data = &rds_sysctl_max_unacked_bytes,
85 .maxlen = sizeof(unsigned long),
86 .mode = 0644,
87 .proc_handler = &proc_dointvec,
88 },
89 {
90 .ctl_name = CTL_UNNUMBERED,
91 .procname = "ping_enable",
92 .data = &rds_sysctl_ping_enable,
93 .maxlen = sizeof(int),
94 .mode = 0644,
95 .proc_handler = &proc_dointvec,
96 },
97 { .ctl_name = 0}
98};
99
100static struct ctl_path rds_sysctl_path[] = {
101 { .procname = "net", .ctl_name = CTL_NET, },
102 { .procname = "rds", .ctl_name = CTL_UNNUMBERED, },
103 { }
104};
105
106
107void rds_sysctl_exit(void)
108{
109 if (rds_sysctl_reg_table)
110 unregister_sysctl_table(rds_sysctl_reg_table);
111}
112
113int __init rds_sysctl_init(void)
114{
115 rds_sysctl_reconnect_min = msecs_to_jiffies(1);
116 rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min;
117
118 rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table);
119 if (rds_sysctl_reg_table == NULL)
120 return -ENOMEM;
121 return 0;
122}
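
Once registered, these knobs surface under /proc/sys/net/rds/ (matching the ctl_path above). A small sketch of reading one back from userspace; the helper name is made up and error handling is minimal:

#include <stdio.h>

long read_rds_tunable(const char *name)
{
        char path[128];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/proc/sys/net/rds/%s", name);
        f = fopen(path, "r");
        if (!f)
                return -1;
        if (fscanf(f, "%ld", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}

/* e.g. read_rds_tunable("reconnect_max_delay_ms") */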
diff --git a/net/rds/threads.c b/net/rds/threads.c
new file mode 100644
index 000000000000..828a1bf9ea92
--- /dev/null
+++ b/net/rds/threads.c
@@ -0,0 +1,265 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/random.h>
35
36#include "rds.h"
37
38/*
39 * All of connection management is simplified by serializing it through
40 * work queues that execute in a connection managing thread.
41 *
42 * TCP wants to send acks through sendpage() in response to data_ready(),
43 * but it needs a process context to do so.
44 *
45 * The receive paths need to allocate but can't drop packets (!) so we have
46 * a thread around to block allocating if the receive fast path sees an
47 * allocation failure.
48 */
49
50/* Grand Unified Theory of connection life cycle:
51 * At any point in time, the connection can be in one of these states:
52 * DOWN, CONNECTING, UP, DISCONNECTING, ERROR
53 *
54 * The following transitions are possible:
55 * ANY -> ERROR
56 * UP -> DISCONNECTING
57 * ERROR -> DISCONNECTING
58 * DISCONNECTING -> DOWN
59 * DOWN -> CONNECTING
60 * CONNECTING -> UP
61 *
62 * Transition to state DISCONNECTING/DOWN:
63 * - Inside the shutdown worker; synchronizes with xmit path
64 * through c_send_lock, and with connection management callbacks
65 * via c_cm_lock.
66 *
67 * For receive callbacks, we rely on the underlying transport
68 * (TCP, IB/RDMA) to provide the necessary synchronisation.
69 */
70struct workqueue_struct *rds_wq;
71
72void rds_connect_complete(struct rds_connection *conn)
73{
74 if (!rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_UP)) {
75 printk(KERN_WARNING "%s: Cannot transition to state UP, "
76 "current state is %d\n",
77 __func__,
78 atomic_read(&conn->c_state));
79 atomic_set(&conn->c_state, RDS_CONN_ERROR);
80 queue_work(rds_wq, &conn->c_down_w);
81 return;
82 }
83
84 rdsdebug("conn %p for %pI4 to %pI4 complete\n",
85 conn, &conn->c_laddr, &conn->c_faddr);
86
87 conn->c_reconnect_jiffies = 0;
88 set_bit(0, &conn->c_map_queued);
89 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
90 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
91}
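
The transition rules spelled out in the comment at the top of this file can be written down as a table; an illustrative, non-kernel encoding:

/* Permitted connection state transitions per the comment above:
 * ANY -> ERROR, UP -> DISCONNECTING, ERROR -> DISCONNECTING,
 * DISCONNECTING -> DOWN, DOWN -> CONNECTING, CONNECTING -> UP. */
enum conn_state { DOWN, CONNECTING, UP, DISCONNECTING, ERROR_ST };

int transition_allowed(enum conn_state from, enum conn_state to)
{
        if (to == ERROR_ST)
                return 1;               /* ANY -> ERROR */
        switch (from) {
        case UP:            return to == DISCONNECTING;
        case ERROR_ST:      return to == DISCONNECTING;
        case DISCONNECTING: return to == DOWN;
        case DOWN:          return to == CONNECTING;
        case CONNECTING:    return to == UP;
        default:            return 0;
        }
}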
92
93/*
94 * This random exponential backoff is relied on to eventually resolve racing
95 * connects.
96 *
97 * If connect attempts race then both parties drop both connections and come
98 * here to wait for a random amount of time before trying again. Eventually
99 * the backoff range will be so much greater than the time it takes to
100 * establish a connection that one of the pair will establish the connection
101 * before the other's random delay fires.
102 *
103 * Connection attempts that arrive while a connection is already established
104 * are also considered to be racing connects. This lets a connection from
105 * a rebooted machine replace an existing stale connection before the transport
106 * notices that the connection has failed.
107 *
108 * We should *always* start with a random backoff; otherwise a broken connection
109 * will always take several iterations to be re-established.
110 */
111static void rds_queue_reconnect(struct rds_connection *conn)
112{
113 unsigned long rand;
114
115 rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
116 conn, &conn->c_laddr, &conn->c_faddr,
117 conn->c_reconnect_jiffies);
118
119 set_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
120 if (conn->c_reconnect_jiffies == 0) {
121 conn->c_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
122 queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
123 return;
124 }
125
126 get_random_bytes(&rand, sizeof(rand));
127 rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
128 rand % conn->c_reconnect_jiffies, conn->c_reconnect_jiffies,
129 conn, &conn->c_laddr, &conn->c_faddr);
130 queue_delayed_work(rds_wq, &conn->c_conn_w,
131 rand % conn->c_reconnect_jiffies);
132
133 conn->c_reconnect_jiffies = min(conn->c_reconnect_jiffies * 2,
134 rds_sysctl_reconnect_max_jiffies);
135}
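
A compact standalone restatement of the delay policy implemented above: the first retry is immediate, later retries wait a random delay below the current ceiling, and the ceiling doubles up to a maximum. The helper is hypothetical and uses rand() where the kernel uses get_random_bytes().

#include <stdlib.h>

unsigned long next_reconnect_delay(unsigned long *cur,
                                   unsigned long min_delay,
                                   unsigned long max_delay)
{
        unsigned long delay;

        if (*cur == 0) {                /* first attempt: no extra delay */
                *cur = min_delay;
                return 0;
        }
        delay = (unsigned long)rand() % *cur;   /* random jitter in [0, cur) */
        *cur = (*cur * 2 > max_delay) ? max_delay : *cur * 2;
        return delay;
}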
136
137void rds_connect_worker(struct work_struct *work)
138{
139 struct rds_connection *conn = container_of(work, struct rds_connection, c_conn_w.work);
140 int ret;
141
142 clear_bit(RDS_RECONNECT_PENDING, &conn->c_flags);
143 if (rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
144 ret = conn->c_trans->conn_connect(conn);
145 rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
146 conn, &conn->c_laddr, &conn->c_faddr, ret);
147
148 if (ret) {
149 if (rds_conn_transition(conn, RDS_CONN_CONNECTING, RDS_CONN_DOWN))
150 rds_queue_reconnect(conn);
151 else
152 rds_conn_error(conn, "RDS: connect failed\n");
153 }
154 }
155}
156
157void rds_shutdown_worker(struct work_struct *work)
158{
159 struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w);
160
161 /* shut it down unless it's down already */
162 if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) {
163 /*
164 * Quiesce the connection mgmt handlers before we start tearing
165 * things down. We don't hold the mutex for the entire
166 * duration of the shutdown operation, else we may be
167 * deadlocking with the CM handler. Instead, the CM event
168 * handler is supposed to check for state DISCONNECTING
169 */
170 mutex_lock(&conn->c_cm_lock);
171 if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING)
172 && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) {
173 rds_conn_error(conn, "shutdown called in state %d\n",
174 atomic_read(&conn->c_state));
175 mutex_unlock(&conn->c_cm_lock);
176 return;
177 }
178 mutex_unlock(&conn->c_cm_lock);
179
180 mutex_lock(&conn->c_send_lock);
181 conn->c_trans->conn_shutdown(conn);
182 rds_conn_reset(conn);
183 mutex_unlock(&conn->c_send_lock);
184
185 if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) {
186 /* This can happen - e.g. when we're in the middle of tearing
187 * down the connection, and someone unloads the rds module.
188 * Quite reproducible with loopback connections.
189 * Mostly harmless.
190 */
191 rds_conn_error(conn,
192 "%s: failed to transition to state DOWN, "
193 "current state is %d\n",
194 __func__,
195 atomic_read(&conn->c_state));
196 return;
197 }
198 }
199
200 /* Then reconnect if it's still live.
201 * The passive side of an IB loopback connection is never added
202 * to the conn hash, so we never trigger a reconnect on this
203 * conn - the reconnect is always triggered by the active peer. */
204 cancel_delayed_work(&conn->c_conn_w);
205 if (!hlist_unhashed(&conn->c_hash_node))
206 rds_queue_reconnect(conn);
207}
208
209void rds_send_worker(struct work_struct *work)
210{
211 struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work);
212 int ret;
213
214 if (rds_conn_state(conn) == RDS_CONN_UP) {
215 ret = rds_send_xmit(conn);
216 rdsdebug("conn %p ret %d\n", conn, ret);
217 switch (ret) {
218 case -EAGAIN:
219 rds_stats_inc(s_send_immediate_retry);
220 queue_delayed_work(rds_wq, &conn->c_send_w, 0);
221 break;
222 case -ENOMEM:
223 rds_stats_inc(s_send_delayed_retry);
224 queue_delayed_work(rds_wq, &conn->c_send_w, 2);
225 default:
226 break;
227 }
228 }
229}
230
231void rds_recv_worker(struct work_struct *work)
232{
233 struct rds_connection *conn = container_of(work, struct rds_connection, c_recv_w.work);
234 int ret;
235
236 if (rds_conn_state(conn) == RDS_CONN_UP) {
237 ret = conn->c_trans->recv(conn);
238 rdsdebug("conn %p ret %d\n", conn, ret);
239 switch (ret) {
240 case -EAGAIN:
241 rds_stats_inc(s_recv_immediate_retry);
242 queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
243 break;
244 case -ENOMEM:
245 rds_stats_inc(s_recv_delayed_retry);
246 queue_delayed_work(rds_wq, &conn->c_recv_w, 2);
247 default:
248 break;
249 }
250 }
251}
252
253void rds_threads_exit(void)
254{
255 destroy_workqueue(rds_wq);
256}
257
258int __init rds_threads_init(void)
259{
260 rds_wq = create_singlethread_workqueue("krdsd");
261 if (rds_wq == NULL)
262 return -ENOMEM;
263
264 return 0;
265}
diff --git a/net/rds/transport.c b/net/rds/transport.c
new file mode 100644
index 000000000000..767da61ad2f3
--- /dev/null
+++ b/net/rds/transport.c
@@ -0,0 +1,117 @@
1/*
2 * Copyright (c) 2006 Oracle. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 *
32 */
33#include <linux/kernel.h>
34#include <linux/module.h>
35#include <linux/in.h>
36
37#include "rds.h"
38#include "loop.h"
39
40static LIST_HEAD(rds_transports);
41static DECLARE_RWSEM(rds_trans_sem);
42
43int rds_trans_register(struct rds_transport *trans)
44{
45 BUG_ON(strlen(trans->t_name) + 1 > TRANSNAMSIZ);
46
47 down_write(&rds_trans_sem);
48
49 list_add_tail(&trans->t_item, &rds_transports);
50 printk(KERN_INFO "Registered RDS/%s transport\n", trans->t_name);
51
52 up_write(&rds_trans_sem);
53
54 return 0;
55}
56
57void rds_trans_unregister(struct rds_transport *trans)
58{
59 down_write(&rds_trans_sem);
60
61 list_del_init(&trans->t_item);
62 printk(KERN_INFO "Unregistered RDS/%s transport\n", trans->t_name);
63
64 up_write(&rds_trans_sem);
65}
66
67struct rds_transport *rds_trans_get_preferred(__be32 addr)
68{
69 struct rds_transport *trans;
70 struct rds_transport *ret = NULL;
71
72 if (IN_LOOPBACK(ntohl(addr)))
73 return &rds_loop_transport;
74
75 down_read(&rds_trans_sem);
76 list_for_each_entry(trans, &rds_transports, t_item) {
77 if (trans->laddr_check(addr) == 0) {
78 ret = trans;
79 break;
80 }
81 }
82 up_read(&rds_trans_sem);
83
84 return ret;
85}
86
87/*
88 * This returns the number of stats entries in the snapshot and only
89 * copies them using the iter if there is enough space for them. The
90 * caller passes in the global stats so that we can size and copy while
91 * holding the lock.
92 */
93unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
94 unsigned int avail)
95
96{
97 struct rds_transport *trans;
98 unsigned int total = 0;
99 unsigned int part;
100
101 rds_info_iter_unmap(iter);
102 down_read(&rds_trans_sem);
103
104 list_for_each_entry(trans, &rds_transports, t_item) {
105 if (trans->stats_info_copy == NULL)
106 continue;
107
108 part = trans->stats_info_copy(iter, avail);
109 avail -= min(avail, part);
110 total += part;
111 }
112
113 up_read(&rds_trans_sem);
114
115 return total;
116}
117
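
To show how the registration interface above is meant to be consumed, here is a hypothetical transport module; the example_* names and the subset of struct rds_transport fields shown are illustrative only, and "rds.h" is assumed to be included for the structure definition.

#include <linux/module.h>
/* plus "rds.h" for struct rds_transport */

static int example_laddr_check(__be32 addr)
{
	return 0;	/* claim every local address for the sake of the example */
}

static struct rds_transport example_transport = {
	.t_name      = "example",
	.laddr_check = example_laddr_check,
	/* a real transport also fills in .recv, .stats_info_copy, ... */
};

static int __init example_transport_init(void)
{
	return rds_trans_register(&example_transport);
}
module_init(example_transport_init);

static void __exit example_transport_exit(void)
{
	rds_trans_unregister(&example_transport);
}
module_exit(example_transport_exit);
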
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 5c72a116b1a4..f8f047b61245 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -183,13 +183,6 @@ override:
183 if (R_tab == NULL) 183 if (R_tab == NULL)
184 goto failure; 184 goto failure;
185 185
186 if (!est && (ret == ACT_P_CREATED ||
187 !gen_estimator_active(&police->tcf_bstats,
188 &police->tcf_rate_est))) {
189 err = -EINVAL;
190 goto failure;
191 }
192
193 if (parm->peakrate.rate) { 186 if (parm->peakrate.rate) {
194 P_tab = qdisc_get_rtab(&parm->peakrate, 187 P_tab = qdisc_get_rtab(&parm->peakrate,
195 tb[TCA_POLICE_PEAKRATE]); 188 tb[TCA_POLICE_PEAKRATE]);
@@ -205,6 +198,12 @@ override:
205 &police->tcf_lock, est); 198 &police->tcf_lock, est);
206 if (err) 199 if (err)
207 goto failure_unlock; 200 goto failure_unlock;
201 } else if (tb[TCA_POLICE_AVRATE] &&
202 (ret == ACT_P_CREATED ||
203 !gen_estimator_active(&police->tcf_bstats,
204 &police->tcf_rate_est))) {
205 err = -EINVAL;
206 goto failure_unlock;
208 } 207 }
209 208
210 /* No failure allowed after this point */ 209 /* No failure allowed after this point */
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 9e43ed949167..d728d8111732 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1960,8 +1960,11 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1960 cbq_rmprio(q, cl); 1960 cbq_rmprio(q, cl);
1961 sch_tree_unlock(sch); 1961 sch_tree_unlock(sch);
1962 1962
1963 if (--cl->refcnt == 0) 1963 BUG_ON(--cl->refcnt == 0);
1964 cbq_destroy_class(sch, cl); 1964 /*
1965 * This shouldn't happen: we "hold" one cops->get() when called
1966 * from tc_ctl_tclass; the destroy method is done from cops->put().
1967 */
1965 1968
1966 return 0; 1969 return 0;
1967} 1970}
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index f6b4fa97df70..7597fe146866 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -66,11 +66,15 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
66{ 66{
67 struct drr_sched *q = qdisc_priv(sch); 67 struct drr_sched *q = qdisc_priv(sch);
68 struct drr_class *cl = (struct drr_class *)*arg; 68 struct drr_class *cl = (struct drr_class *)*arg;
69 struct nlattr *opt = tca[TCA_OPTIONS];
69 struct nlattr *tb[TCA_DRR_MAX + 1]; 70 struct nlattr *tb[TCA_DRR_MAX + 1];
70 u32 quantum; 71 u32 quantum;
71 int err; 72 int err;
72 73
73 err = nla_parse_nested(tb, TCA_DRR_MAX, tca[TCA_OPTIONS], drr_policy); 74 if (!opt)
75 return -EINVAL;
76
77 err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy);
74 if (err < 0) 78 if (err < 0)
75 return err; 79 return err;
76 80
@@ -151,8 +155,11 @@ static int drr_delete_class(struct Qdisc *sch, unsigned long arg)
151 drr_purge_queue(cl); 155 drr_purge_queue(cl);
152 qdisc_class_hash_remove(&q->clhash, &cl->common); 156 qdisc_class_hash_remove(&q->clhash, &cl->common);
153 157
154 if (--cl->refcnt == 0) 158 BUG_ON(--cl->refcnt == 0);
155 drr_destroy_class(sch, cl); 159 /*
160 * This shouldn't happen: we "hold" one cops->get() when called
161 * from tc_ctl_tclass; the destroy method is done from cops->put().
162 */
156 163
157 sch_tree_unlock(sch); 164 sch_tree_unlock(sch);
158 return 0; 165 return 0;
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 74226b265528..5022f9c1f34b 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1139,8 +1139,11 @@ hfsc_delete_class(struct Qdisc *sch, unsigned long arg)
1139 hfsc_purge_queue(sch, cl); 1139 hfsc_purge_queue(sch, cl);
1140 qdisc_class_hash_remove(&q->clhash, &cl->cl_common); 1140 qdisc_class_hash_remove(&q->clhash, &cl->cl_common);
1141 1141
1142 if (--cl->refcnt == 0) 1142 BUG_ON(--cl->refcnt == 0);
1143 hfsc_destroy_class(sch, cl); 1143 /*
1144 * This shouldn't happen: we "hold" one cops->get() when called
1145 * from tc_ctl_tclass; the destroy method is done from cops->put().
1146 */
1144 1147
1145 sch_tree_unlock(sch); 1148 sch_tree_unlock(sch);
1146 return 0; 1149 return 0;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 355974f610c5..88cd02626621 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1275,8 +1275,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
1275 if (last_child) 1275 if (last_child)
1276 htb_parent_to_leaf(q, cl, new_q); 1276 htb_parent_to_leaf(q, cl, new_q);
1277 1277
1278 if (--cl->refcnt == 0) 1278 BUG_ON(--cl->refcnt == 0);
1279 htb_destroy_class(sch, cl); 1279 /*
1280 * This shouldn't happen: we "hold" one cops->get() when called
1281 * from tc_ctl_tclass; the destroy method is done from cops->put().
1282 */
1280 1283
1281 sch_tree_unlock(sch); 1284 sch_tree_unlock(sch);
1282 return 0; 1285 return 0;
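
The cbq, drr, hfsc and htb delete hunks above all encode the same convention: tc_ctl_tclass() holds a reference obtained through cops->get() while it calls ->delete(), and the class is actually destroyed from cops->put() when that reference is dropped, so ->delete() must never be the call that sees the refcount reach zero; the new BUG_ON asserts exactly that. A schematic of the get/put/delete trio follows; struct my_class and the my_find/my_unlink_class/my_destroy_class helpers are assumed, not taken from any of these schedulers.

struct my_class {
	unsigned int refcnt;
	/* ... scheduler-specific state ... */
};

static unsigned long my_get(struct Qdisc *sch, u32 classid)
{
	struct my_class *cl = my_find(sch, classid);	/* assumed lookup */

	if (cl)
		cl->refcnt++;
	return (unsigned long)cl;
}

static void my_put(struct Qdisc *sch, unsigned long arg)
{
	struct my_class *cl = (struct my_class *)arg;

	if (--cl->refcnt == 0)
		my_destroy_class(sch, cl);	/* the only place the class dies */
}

static int my_delete(struct Qdisc *sch, unsigned long arg)
{
	struct my_class *cl = (struct my_class *)arg;

	sch_tree_lock(sch);
	my_unlink_class(sch, cl);	/* assumed: drop from hash/active lists */
	BUG_ON(--cl->refcnt == 0);	/* caller still holds its ->get() reference */
	sch_tree_unlock(sch);
	return 0;
}
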
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index a2f93c09f3cc..e22dfe85e43e 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -236,7 +236,6 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
236 struct tc_tbf_qopt *qopt; 236 struct tc_tbf_qopt *qopt;
237 struct qdisc_rate_table *rtab = NULL; 237 struct qdisc_rate_table *rtab = NULL;
238 struct qdisc_rate_table *ptab = NULL; 238 struct qdisc_rate_table *ptab = NULL;
239 struct qdisc_rate_table *tmp;
240 struct Qdisc *child = NULL; 239 struct Qdisc *child = NULL;
241 int max_size,n; 240 int max_size,n;
242 241
@@ -295,13 +294,9 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
295 q->tokens = q->buffer; 294 q->tokens = q->buffer;
296 q->ptokens = q->mtu; 295 q->ptokens = q->mtu;
297 296
298 tmp = q->R_tab; 297 swap(q->R_tab, rtab);
299 q->R_tab = rtab; 298 swap(q->P_tab, ptab);
300 rtab = tmp;
301 299
302 tmp = q->P_tab;
303 q->P_tab = ptab;
304 ptab = tmp;
305 sch_tree_unlock(sch); 300 sch_tree_unlock(sch);
306 err = 0; 301 err = 0;
307done: 302done:
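
The tbf hunk replaces an open-coded three-assignment exchange with swap() from <linux/kernel.h>, which exchanges two lvalues of the same type. A sketch of the resulting pattern, assuming the tbf private structure and a freshly looked-up rate table:

static void example_install_rtab(struct Qdisc *sch, struct tbf_sched_data *q,
				 struct qdisc_rate_table *new_rtab)
{
	sch_tree_lock(sch);
	swap(q->R_tab, new_rtab);	/* q->R_tab now points at the new table */
	sch_tree_unlock(sch);

	/* new_rtab now holds whatever was installed before; drop it outside
	 * the tree lock */
	qdisc_put_rtab(new_rtab);
}
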
diff --git a/net/sctp/debug.c b/net/sctp/debug.c
index 67715f4eb849..7ff548a30cfb 100644
--- a/net/sctp/debug.c
+++ b/net/sctp/debug.c
@@ -86,6 +86,9 @@ const char *sctp_cname(const sctp_subtype_t cid)
86 case SCTP_CID_FWD_TSN: 86 case SCTP_CID_FWD_TSN:
87 return "FWD_TSN"; 87 return "FWD_TSN";
88 88
89 case SCTP_CID_AUTH:
90 return "AUTH";
91
89 default: 92 default:
90 break; 93 break;
91 } 94 }
@@ -135,6 +138,7 @@ static const char *sctp_primitive_tbl[SCTP_NUM_PRIMITIVE_TYPES] = {
135 "PRIMITIVE_ABORT", 138 "PRIMITIVE_ABORT",
136 "PRIMITIVE_SEND", 139 "PRIMITIVE_SEND",
137 "PRIMITIVE_REQUESTHEARTBEAT", 140 "PRIMITIVE_REQUESTHEARTBEAT",
141 "PRIMITIVE_ASCONF",
138}; 142};
139 143
140/* Lookup primitive debug name. */ 144/* Lookup primitive debug name. */
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 4c8d9f45ce09..905fda582b92 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -111,7 +111,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
111 if (sctp_addip_enable) { 111 if (sctp_addip_enable) {
112 auth_chunks->chunks[0] = SCTP_CID_ASCONF; 112 auth_chunks->chunks[0] = SCTP_CID_ASCONF;
113 auth_chunks->chunks[1] = SCTP_CID_ASCONF_ACK; 113 auth_chunks->chunks[1] = SCTP_CID_ASCONF_ACK;
114 auth_chunks->param_hdr.length += htons(2); 114 auth_chunks->param_hdr.length =
115 htons(sizeof(sctp_paramhdr_t) + 2);
115 } 116 }
116 } 117 }
117 118
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 07d58903a746..7d08f522ec84 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -49,13 +49,10 @@
49#include <linux/ipv6.h> 49#include <linux/ipv6.h>
50#include <linux/init.h> 50#include <linux/init.h>
51#include <net/inet_ecn.h> 51#include <net/inet_ecn.h>
52#include <net/ip.h>
52#include <net/icmp.h> 53#include <net/icmp.h>
53#include <net/net_namespace.h> 54#include <net/net_namespace.h>
54 55
55#ifndef TEST_FRAME
56#include <net/tcp.h>
57#endif /* TEST_FRAME (not defined) */
58
59#include <linux/socket.h> /* for sa_family_t */ 56#include <linux/socket.h> /* for sa_family_t */
60#include <net/sock.h> 57#include <net/sock.h>
61 58
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index bc411c896216..d765fc53e74d 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -428,7 +428,8 @@ void sctp_retransmit_mark(struct sctp_outq *q,
428 * retransmitting due to T3 timeout. 428 * retransmitting due to T3 timeout.
429 */ 429 */
430 if (reason == SCTP_RTXR_T3_RTX && 430 if (reason == SCTP_RTXR_T3_RTX &&
431 (jiffies - chunk->sent_at) < transport->last_rto) 431 time_before(jiffies, chunk->sent_at +
432 transport->last_rto))
432 continue; 433 continue;
433 434
434 /* RFC 2960 6.2.1 Processing a Received SACK 435 /* RFC 2960 6.2.1 Processing a Received SACK
@@ -1757,6 +1758,9 @@ static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn)
1757 struct sctp_chunk *chunk; 1758 struct sctp_chunk *chunk;
1758 struct list_head *lchunk, *temp; 1759 struct list_head *lchunk, *temp;
1759 1760
1761 if (!asoc->peer.prsctp_capable)
1762 return;
1763
1760 /* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the 1764 /* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the
1761 * received SACK. 1765 * received SACK.
1762 * 1766 *
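
The open-coded "(jiffies - chunk->sent_at) < transport->last_rto" comparison misbehaves once jiffies wraps; the time_before()/time_after() macros from <linux/jiffies.h> perform the same comparison with wraparound-safe signed arithmetic. Minimal illustration with a hypothetical helper, not SCTP code:

#include <linux/types.h>
#include <linux/jiffies.h>

/* true while fewer than "timeout" jiffies have passed since "stamp",
 * even across a jiffies wraparound */
static inline bool example_still_within(unsigned long stamp, unsigned long timeout)
{
	return time_before(jiffies, stamp + timeout);
}
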
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index c1e316ee7155..cb198af8887c 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -692,15 +692,20 @@ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev,
692static int sctp_ctl_sock_init(void) 692static int sctp_ctl_sock_init(void)
693{ 693{
694 int err; 694 int err;
695 sa_family_t family; 695 sa_family_t family = PF_INET;
696 696
697 if (sctp_get_pf_specific(PF_INET6)) 697 if (sctp_get_pf_specific(PF_INET6))
698 family = PF_INET6; 698 family = PF_INET6;
699 else
700 family = PF_INET;
701 699
702 err = inet_ctl_sock_create(&sctp_ctl_sock, family, 700 err = inet_ctl_sock_create(&sctp_ctl_sock, family,
703 SOCK_SEQPACKET, IPPROTO_SCTP, &init_net); 701 SOCK_SEQPACKET, IPPROTO_SCTP, &init_net);
702
703 /* If IPv6 socket could not be created, try the IPv4 socket */
704 if (err < 0 && family == PF_INET6)
705 err = inet_ctl_sock_create(&sctp_ctl_sock, AF_INET,
706 SOCK_SEQPACKET, IPPROTO_SCTP,
707 &init_net);
708
704 if (err < 0) { 709 if (err < 0) {
705 printk(KERN_ERR 710 printk(KERN_ERR
706 "SCTP: Failed to create the SCTP control socket.\n"); 711 "SCTP: Failed to create the SCTP control socket.\n");
@@ -1297,9 +1302,8 @@ SCTP_STATIC __init int sctp_init(void)
1297out: 1302out:
1298 return status; 1303 return status;
1299err_v6_add_protocol: 1304err_v6_add_protocol:
1300 sctp_v6_del_protocol();
1301err_add_protocol:
1302 sctp_v4_del_protocol(); 1305 sctp_v4_del_protocol();
1306err_add_protocol:
1303 inet_ctl_sock_destroy(sctp_ctl_sock); 1307 inet_ctl_sock_destroy(sctp_ctl_sock);
1304err_ctl_sock_init: 1308err_ctl_sock_init:
1305 sctp_v6_protosw_exit(); 1309 sctp_v6_protosw_exit();
@@ -1310,7 +1314,6 @@ err_protosw_init:
1310 sctp_v4_pf_exit(); 1314 sctp_v4_pf_exit();
1311 sctp_v6_pf_exit(); 1315 sctp_v6_pf_exit();
1312 sctp_sysctl_unregister(); 1316 sctp_sysctl_unregister();
1313 list_del(&sctp_af_inet.list);
1314 free_pages((unsigned long)sctp_port_hashtable, 1317 free_pages((unsigned long)sctp_port_hashtable,
1315 get_order(sctp_port_hashsize * 1318 get_order(sctp_port_hashsize *
1316 sizeof(struct sctp_bind_hashbucket))); 1319 sizeof(struct sctp_bind_hashbucket)));
@@ -1358,7 +1361,6 @@ SCTP_STATIC __exit void sctp_exit(void)
1358 sctp_v4_pf_exit(); 1361 sctp_v4_pf_exit();
1359 1362
1360 sctp_sysctl_unregister(); 1363 sctp_sysctl_unregister();
1361 list_del(&sctp_af_inet.list);
1362 1364
1363 free_pages((unsigned long)sctp_assoc_hashtable, 1365 free_pages((unsigned long)sctp_assoc_hashtable,
1364 get_order(sctp_assoc_hashsize * 1366 get_order(sctp_assoc_hashsize *
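
The control-socket change above tries a PF_INET6 socket first and, if that fails (for instance when IPv6 is not available), retries with IPv4. A simplified sketch of the fallback, with example_ctl_sock as a placeholder and the PF_INET6 availability check omitted:

static struct sock *example_ctl_sock;

static int example_ctl_sock_init(void)
{
	int err;

	err = inet_ctl_sock_create(&example_ctl_sock, PF_INET6,
				   SOCK_SEQPACKET, IPPROTO_SCTP, &init_net);
	if (err < 0)	/* no IPv6 socket available: fall back to IPv4 */
		err = inet_ctl_sock_create(&example_ctl_sock, AF_INET,
					   SOCK_SEQPACKET, IPPROTO_SCTP,
					   &init_net);
	return err;
}
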
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index b40e95f9851b..6851ee94e974 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -224,7 +224,9 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
224 num_ext += 2; 224 num_ext += 2;
225 } 225 }
226 226
227 chunksize += sizeof(aiparam); 227 if (sp->adaptation_ind)
228 chunksize += sizeof(aiparam);
229
228 chunksize += vparam_len; 230 chunksize += vparam_len;
229 231
230 /* Account for AUTH related parameters */ 232 /* Account for AUTH related parameters */
@@ -304,10 +306,12 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
304 if (sctp_prsctp_enable) 306 if (sctp_prsctp_enable)
305 sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); 307 sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
306 308
307 aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; 309 if (sp->adaptation_ind) {
308 aiparam.param_hdr.length = htons(sizeof(aiparam)); 310 aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
309 aiparam.adaptation_ind = htonl(sp->adaptation_ind); 311 aiparam.param_hdr.length = htons(sizeof(aiparam));
310 sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); 312 aiparam.adaptation_ind = htonl(sp->adaptation_ind);
313 sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
314 }
311 315
312 /* Add SCTP-AUTH chunks to the parameter list */ 316 /* Add SCTP-AUTH chunks to the parameter list */
313 if (sctp_auth_enable) { 317 if (sctp_auth_enable) {
@@ -332,6 +336,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
332 sctp_inithdr_t initack; 336 sctp_inithdr_t initack;
333 struct sctp_chunk *retval; 337 struct sctp_chunk *retval;
334 union sctp_params addrs; 338 union sctp_params addrs;
339 struct sctp_sock *sp;
335 int addrs_len; 340 int addrs_len;
336 sctp_cookie_param_t *cookie; 341 sctp_cookie_param_t *cookie;
337 int cookie_len; 342 int cookie_len;
@@ -366,22 +371,24 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
366 /* Calculate the total size of allocation, include the reserved 371 /* Calculate the total size of allocation, include the reserved
367 * space for reporting unknown parameters if it is specified. 372 * space for reporting unknown parameters if it is specified.
368 */ 373 */
374 sp = sctp_sk(asoc->base.sk);
369 chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len; 375 chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len;
370 376
371 /* Tell peer that we'll do ECN only if peer advertised such cap. */ 377 /* Tell peer that we'll do ECN only if peer advertised such cap. */
372 if (asoc->peer.ecn_capable) 378 if (asoc->peer.ecn_capable)
373 chunksize += sizeof(ecap_param); 379 chunksize += sizeof(ecap_param);
374 380
375 if (sctp_prsctp_enable) 381 if (asoc->peer.prsctp_capable)
376 chunksize += sizeof(prsctp_param); 382 chunksize += sizeof(prsctp_param);
377 383
378 if (sctp_addip_enable) { 384 if (asoc->peer.asconf_capable) {
379 extensions[num_ext] = SCTP_CID_ASCONF; 385 extensions[num_ext] = SCTP_CID_ASCONF;
380 extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; 386 extensions[num_ext+1] = SCTP_CID_ASCONF_ACK;
381 num_ext += 2; 387 num_ext += 2;
382 } 388 }
383 389
384 chunksize += sizeof(aiparam); 390 if (sp->adaptation_ind)
391 chunksize += sizeof(aiparam);
385 392
386 if (asoc->peer.auth_capable) { 393 if (asoc->peer.auth_capable) {
387 auth_random = (sctp_paramhdr_t *)asoc->c.auth_random; 394 auth_random = (sctp_paramhdr_t *)asoc->c.auth_random;
@@ -432,10 +439,12 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
432 if (asoc->peer.prsctp_capable) 439 if (asoc->peer.prsctp_capable)
433 sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); 440 sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param);
434 441
435 aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; 442 if (sp->adaptation_ind) {
436 aiparam.param_hdr.length = htons(sizeof(aiparam)); 443 aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND;
437 aiparam.adaptation_ind = htonl(sctp_sk(asoc->base.sk)->adaptation_ind); 444 aiparam.param_hdr.length = htons(sizeof(aiparam));
438 sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); 445 aiparam.adaptation_ind = htonl(sp->adaptation_ind);
446 sctp_addto_chunk(retval, sizeof(aiparam), &aiparam);
447 }
439 448
440 if (asoc->peer.auth_capable) { 449 if (asoc->peer.auth_capable) {
441 sctp_addto_chunk(retval, ntohs(auth_random->length), 450 sctp_addto_chunk(retval, ntohs(auth_random->length),
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 0146cfb1f182..e2020eb2c8ca 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -434,7 +434,8 @@ sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = {
434 * 434 *
435 */ 435 */
436static void sctp_do_8_2_transport_strike(struct sctp_association *asoc, 436static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
437 struct sctp_transport *transport) 437 struct sctp_transport *transport,
438 int is_hb)
438{ 439{
439 /* The check for association's overall error counter exceeding the 440 /* The check for association's overall error counter exceeding the
440 * threshold is done in the state function. 441 * threshold is done in the state function.
@@ -466,7 +467,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc,
466 * The first unacknowleged HB triggers it. We do this with a flag 467 * The first unacknowleged HB triggers it. We do this with a flag
467 * that indicates that we have an outstanding HB. 468 * that indicates that we have an outstanding HB.
468 */ 469 */
469 if (transport->hb_sent) { 470 if (!is_hb || transport->hb_sent) {
470 transport->last_rto = transport->rto; 471 transport->last_rto = transport->rto;
471 transport->rto = min((transport->rto * 2), transport->asoc->rto_max); 472 transport->rto = min((transport->rto * 2), transport->asoc->rto_max);
472 } 473 }
@@ -657,20 +658,6 @@ static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds,
657 sctp_transport_hold(t); 658 sctp_transport_hold(t);
658} 659}
659 660
660/* Helper function to do a transport reset at the expiry of the hearbeat
661 * timer.
662 */
663static void sctp_cmd_transport_reset(sctp_cmd_seq_t *cmds,
664 struct sctp_association *asoc,
665 struct sctp_transport *t)
666{
667 sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE);
668
669 /* Mark one strike against a transport. */
670 sctp_do_8_2_transport_strike(asoc, t);
671
672 t->hb_sent = 1;
673}
674 661
675/* Helper function to process the process SACK command. */ 662/* Helper function to process the process SACK command. */
676static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds, 663static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds,
@@ -800,36 +787,48 @@ static void sctp_cmd_process_operr(sctp_cmd_seq_t *cmds,
800 struct sctp_association *asoc, 787 struct sctp_association *asoc,
801 struct sctp_chunk *chunk) 788 struct sctp_chunk *chunk)
802{ 789{
803 struct sctp_operr_chunk *operr_chunk;
804 struct sctp_errhdr *err_hdr; 790 struct sctp_errhdr *err_hdr;
791 struct sctp_ulpevent *ev;
805 792
806 operr_chunk = (struct sctp_operr_chunk *)chunk->chunk_hdr; 793 while (chunk->chunk_end > chunk->skb->data) {
807 err_hdr = &operr_chunk->err_hdr; 794 err_hdr = (struct sctp_errhdr *)(chunk->skb->data);
808 795
809 switch (err_hdr->cause) { 796 ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0,
810 case SCTP_ERROR_UNKNOWN_CHUNK: 797 GFP_ATOMIC);
811 { 798 if (!ev)
812 struct sctp_chunkhdr *unk_chunk_hdr; 799 return;
813 800
814 unk_chunk_hdr = (struct sctp_chunkhdr *)err_hdr->variable; 801 sctp_ulpq_tail_event(&asoc->ulpq, ev);
815 switch (unk_chunk_hdr->type) { 802
816 /* ADDIP 4.1 A9) If the peer responds to an ASCONF with an 803 switch (err_hdr->cause) {
817 * ERROR chunk reporting that it did not recognized the ASCONF 804 case SCTP_ERROR_UNKNOWN_CHUNK:
818 * chunk type, the sender of the ASCONF MUST NOT send any 805 {
819 * further ASCONF chunks and MUST stop its T-4 timer. 806 sctp_chunkhdr_t *unk_chunk_hdr;
820 */ 807
821 case SCTP_CID_ASCONF: 808 unk_chunk_hdr = (sctp_chunkhdr_t *)err_hdr->variable;
822 asoc->peer.asconf_capable = 0; 809 switch (unk_chunk_hdr->type) {
823 sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP, 810 /* ADDIP 4.1 A9) If the peer responds to an ASCONF with
811 * an ERROR chunk reporting that it did not recognized
812 * the ASCONF chunk type, the sender of the ASCONF MUST
813 * NOT send any further ASCONF chunks and MUST stop its
814 * T-4 timer.
815 */
816 case SCTP_CID_ASCONF:
817 if (asoc->peer.asconf_capable == 0)
818 break;
819
820 asoc->peer.asconf_capable = 0;
821 sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP,
824 SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); 822 SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO));
823 break;
824 default:
825 break;
826 }
825 break; 827 break;
828 }
826 default: 829 default:
827 break; 830 break;
828 } 831 }
829 break;
830 }
831 default:
832 break;
833 } 832 }
834} 833}
835 834
@@ -1459,12 +1458,19 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
1459 1458
1460 case SCTP_CMD_STRIKE: 1459 case SCTP_CMD_STRIKE:
1461 /* Mark one strike against a transport. */ 1460 /* Mark one strike against a transport. */
1462 sctp_do_8_2_transport_strike(asoc, cmd->obj.transport); 1461 sctp_do_8_2_transport_strike(asoc, cmd->obj.transport,
1462 0);
1463 break;
1464
1465 case SCTP_CMD_TRANSPORT_IDLE:
1466 t = cmd->obj.transport;
1467 sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE);
1463 break; 1468 break;
1464 1469
1465 case SCTP_CMD_TRANSPORT_RESET: 1470 case SCTP_CMD_TRANSPORT_HB_SENT:
1466 t = cmd->obj.transport; 1471 t = cmd->obj.transport;
1467 sctp_cmd_transport_reset(commands, asoc, t); 1472 sctp_do_8_2_transport_strike(asoc, t, 1);
1473 t->hb_sent = 1;
1468 break; 1474 break;
1469 1475
1470 case SCTP_CMD_TRANSPORT_ON: 1476 case SCTP_CMD_TRANSPORT_ON:
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 3a0cd075914f..55a61aa69662 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -988,7 +988,9 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep,
988 /* Set transport error counter and association error counter 988 /* Set transport error counter and association error counter
989 * when sending heartbeat. 989 * when sending heartbeat.
990 */ 990 */
991 sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_RESET, 991 sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_IDLE,
992 SCTP_TRANSPORT(transport));
993 sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT,
992 SCTP_TRANSPORT(transport)); 994 SCTP_TRANSPORT(transport));
993 } 995 }
994 sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMER_UPDATE, 996 sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMER_UPDATE,
@@ -3163,7 +3165,6 @@ sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep,
3163 sctp_cmd_seq_t *commands) 3165 sctp_cmd_seq_t *commands)
3164{ 3166{
3165 struct sctp_chunk *chunk = arg; 3167 struct sctp_chunk *chunk = arg;
3166 struct sctp_ulpevent *ev;
3167 3168
3168 if (!sctp_vtag_verify(chunk, asoc)) 3169 if (!sctp_vtag_verify(chunk, asoc))
3169 return sctp_sf_pdiscard(ep, asoc, type, arg, commands); 3170 return sctp_sf_pdiscard(ep, asoc, type, arg, commands);
@@ -3173,21 +3174,10 @@ sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep,
3173 return sctp_sf_violation_chunklen(ep, asoc, type, arg, 3174 return sctp_sf_violation_chunklen(ep, asoc, type, arg,
3174 commands); 3175 commands);
3175 3176
3176 while (chunk->chunk_end > chunk->skb->data) { 3177 sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR,
3177 ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0, 3178 SCTP_CHUNK(chunk));
3178 GFP_ATOMIC);
3179 if (!ev)
3180 goto nomem;
3181 3179
3182 sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP,
3183 SCTP_ULPEVENT(ev));
3184 sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR,
3185 SCTP_CHUNK(chunk));
3186 }
3187 return SCTP_DISPOSITION_CONSUME; 3180 return SCTP_DISPOSITION_CONSUME;
3188
3189nomem:
3190 return SCTP_DISPOSITION_NOMEM;
3191} 3181}
3192 3182
3193/* 3183/*
@@ -4967,7 +4957,7 @@ sctp_disposition_t sctp_sf_do_prm_requestheartbeat(
4967 * to that address and not acknowledged within one RTO. 4957 * to that address and not acknowledged within one RTO.
4968 * 4958 *
4969 */ 4959 */
4970 sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_RESET, 4960 sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_HB_SENT,
4971 SCTP_TRANSPORT(arg)); 4961 SCTP_TRANSPORT(arg));
4972 return SCTP_DISPOSITION_CONSUME; 4962 return SCTP_DISPOSITION_CONSUME;
4973} 4963}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index dea864f5de54..5fb3a8c9792e 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3069,9 +3069,6 @@ static int sctp_setsockopt_maxburst(struct sock *sk,
3069 int val; 3069 int val;
3070 int assoc_id = 0; 3070 int assoc_id = 0;
3071 3071
3072 if (optlen < sizeof(int))
3073 return -EINVAL;
3074
3075 if (optlen == sizeof(int)) { 3072 if (optlen == sizeof(int)) {
3076 printk(KERN_WARNING 3073 printk(KERN_WARNING
3077 "SCTP: Use of int in max_burst socket option deprecated\n"); 3074 "SCTP: Use of int in max_burst socket option deprecated\n");
@@ -5283,16 +5280,14 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len,
5283 struct sctp_sock *sp; 5280 struct sctp_sock *sp;
5284 struct sctp_association *asoc; 5281 struct sctp_association *asoc;
5285 5282
5286 if (len < sizeof(int))
5287 return -EINVAL;
5288
5289 if (len == sizeof(int)) { 5283 if (len == sizeof(int)) {
5290 printk(KERN_WARNING 5284 printk(KERN_WARNING
5291 "SCTP: Use of int in max_burst socket option deprecated\n"); 5285 "SCTP: Use of int in max_burst socket option deprecated\n");
5292 printk(KERN_WARNING 5286 printk(KERN_WARNING
5293 "SCTP: Use struct sctp_assoc_value instead\n"); 5287 "SCTP: Use struct sctp_assoc_value instead\n");
5294 params.assoc_id = 0; 5288 params.assoc_id = 0;
5295 } else if (len == sizeof (struct sctp_assoc_value)) { 5289 } else if (len >= sizeof(struct sctp_assoc_value)) {
5290 len = sizeof(struct sctp_assoc_value);
5296 if (copy_from_user(&params, optval, len)) 5291 if (copy_from_user(&params, optval, len))
5297 return -EFAULT; 5292 return -EFAULT;
5298 } else 5293 } else
@@ -5848,37 +5843,28 @@ static int sctp_get_port(struct sock *sk, unsigned short snum)
5848} 5843}
5849 5844
5850/* 5845/*
5851 * 3.1.3 listen() - UDP Style Syntax 5846 * Move a socket to LISTENING state.
5852 *
5853 * By default, new associations are not accepted for UDP style sockets.
5854 * An application uses listen() to mark a socket as being able to
5855 * accept new associations.
5856 */ 5847 */
5857SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog) 5848SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog)
5858{ 5849{
5859 struct sctp_sock *sp = sctp_sk(sk); 5850 struct sctp_sock *sp = sctp_sk(sk);
5860 struct sctp_endpoint *ep = sp->ep; 5851 struct sctp_endpoint *ep = sp->ep;
5852 struct crypto_hash *tfm = NULL;
5861 5853
5862 /* Only UDP style sockets that are not peeled off are allowed to 5854 /* Allocate HMAC for generating cookie. */
5863 * listen(). 5855 if (!sctp_sk(sk)->hmac && sctp_hmac_alg) {
5864 */ 5856 tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC);
5865 if (!sctp_style(sk, UDP)) 5857 if (IS_ERR(tfm)) {
5866 return -EINVAL; 5858 if (net_ratelimit()) {
5867 5859 printk(KERN_INFO
5868 /* If backlog is zero, disable listening. */ 5860 "SCTP: failed to load transform for %s: %ld\n",
5869 if (!backlog) { 5861 sctp_hmac_alg, PTR_ERR(tfm));
5870 if (sctp_sstate(sk, CLOSED)) 5862 }
5871 return 0; 5863 return -ENOSYS;
5872 5864 }
5873 sctp_unhash_endpoint(ep); 5865 sctp_sk(sk)->hmac = tfm;
5874 sk->sk_state = SCTP_SS_CLOSED;
5875 return 0;
5876 } 5866 }
5877 5867
5878 /* Return if we are already listening. */
5879 if (sctp_sstate(sk, LISTENING))
5880 return 0;
5881
5882 /* 5868 /*
5883 * If a bind() or sctp_bindx() is not called prior to a listen() 5869 * If a bind() or sctp_bindx() is not called prior to a listen()
5884 * call that allows new associations to be accepted, the system 5870 * call that allows new associations to be accepted, the system
@@ -5889,7 +5875,6 @@ SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog)
5889 * extensions draft, but follows the practice as seen in TCP 5875 * extensions draft, but follows the practice as seen in TCP
5890 * sockets. 5876 * sockets.
5891 * 5877 *
5892 * Additionally, turn off fastreuse flag since we are not listening
5893 */ 5878 */
5894 sk->sk_state = SCTP_SS_LISTENING; 5879 sk->sk_state = SCTP_SS_LISTENING;
5895 if (!ep->base.bind_addr.port) { 5880 if (!ep->base.bind_addr.port) {
@@ -5900,113 +5885,71 @@ SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog)
5900 sk->sk_state = SCTP_SS_CLOSED; 5885 sk->sk_state = SCTP_SS_CLOSED;
5901 return -EADDRINUSE; 5886 return -EADDRINUSE;
5902 } 5887 }
5903 sctp_sk(sk)->bind_hash->fastreuse = 0;
5904 } 5888 }
5905 5889
5906 sctp_hash_endpoint(ep);
5907 return 0;
5908}
5909
5910/*
5911 * 4.1.3 listen() - TCP Style Syntax
5912 *
5913 * Applications uses listen() to ready the SCTP endpoint for accepting
5914 * inbound associations.
5915 */
5916SCTP_STATIC int sctp_stream_listen(struct sock *sk, int backlog)
5917{
5918 struct sctp_sock *sp = sctp_sk(sk);
5919 struct sctp_endpoint *ep = sp->ep;
5920
5921 /* If backlog is zero, disable listening. */
5922 if (!backlog) {
5923 if (sctp_sstate(sk, CLOSED))
5924 return 0;
5925
5926 sctp_unhash_endpoint(ep);
5927 sk->sk_state = SCTP_SS_CLOSED;
5928 return 0;
5929 }
5930
5931 if (sctp_sstate(sk, LISTENING))
5932 return 0;
5933
5934 /*
5935 * If a bind() or sctp_bindx() is not called prior to a listen()
5936 * call that allows new associations to be accepted, the system
5937 * picks an ephemeral port and will choose an address set equivalent
5938 * to binding with a wildcard address.
5939 *
5940 * This is not currently spelled out in the SCTP sockets
5941 * extensions draft, but follows the practice as seen in TCP
5942 * sockets.
5943 */
5944 sk->sk_state = SCTP_SS_LISTENING;
5945 if (!ep->base.bind_addr.port) {
5946 if (sctp_autobind(sk))
5947 return -EAGAIN;
5948 } else
5949 sctp_sk(sk)->bind_hash->fastreuse = 0;
5950
5951 sk->sk_max_ack_backlog = backlog; 5890 sk->sk_max_ack_backlog = backlog;
5952 sctp_hash_endpoint(ep); 5891 sctp_hash_endpoint(ep);
5953 return 0; 5892 return 0;
5954} 5893}
5955 5894
5956/* 5895/*
5896 * 4.1.3 / 5.1.3 listen()
5897 *
5898 * By default, new associations are not accepted for UDP style sockets.
5899 * An application uses listen() to mark a socket as being able to
5900 * accept new associations.
5901 *
5902 * On TCP style sockets, applications use listen() to ready the SCTP
5903 * endpoint for accepting inbound associations.
5904 *
5905 * On both types of endpoints a backlog of '0' disables listening.
5906 *
5957 * Move a socket to LISTENING state. 5907 * Move a socket to LISTENING state.
5958 */ 5908 */
5959int sctp_inet_listen(struct socket *sock, int backlog) 5909int sctp_inet_listen(struct socket *sock, int backlog)
5960{ 5910{
5961 struct sock *sk = sock->sk; 5911 struct sock *sk = sock->sk;
5962 struct crypto_hash *tfm = NULL; 5912 struct sctp_endpoint *ep = sctp_sk(sk)->ep;
5963 int err = -EINVAL; 5913 int err = -EINVAL;
5964 5914
5965 if (unlikely(backlog < 0)) 5915 if (unlikely(backlog < 0))
5966 goto out; 5916 return err;
5967 5917
5968 sctp_lock_sock(sk); 5918 sctp_lock_sock(sk);
5969 5919
5920 /* Peeled-off sockets are not allowed to listen(). */
5921 if (sctp_style(sk, UDP_HIGH_BANDWIDTH))
5922 goto out;
5923
5970 if (sock->state != SS_UNCONNECTED) 5924 if (sock->state != SS_UNCONNECTED)
5971 goto out; 5925 goto out;
5972 5926
5973 /* Allocate HMAC for generating cookie. */ 5927 /* If backlog is zero, disable listening. */
5974 if (!sctp_sk(sk)->hmac && sctp_hmac_alg) { 5928 if (!backlog) {
5975 tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC); 5929 if (sctp_sstate(sk, CLOSED))
5976 if (IS_ERR(tfm)) {
5977 if (net_ratelimit()) {
5978 printk(KERN_INFO
5979 "SCTP: failed to load transform for %s: %ld\n",
5980 sctp_hmac_alg, PTR_ERR(tfm));
5981 }
5982 err = -ENOSYS;
5983 goto out; 5930 goto out;
5984 }
5985 }
5986 5931
5987 switch (sock->type) { 5932 err = 0;
5988 case SOCK_SEQPACKET: 5933 sctp_unhash_endpoint(ep);
5989 err = sctp_seqpacket_listen(sk, backlog); 5934 sk->sk_state = SCTP_SS_CLOSED;
5990 break; 5935 if (sk->sk_reuse)
5991 case SOCK_STREAM: 5936 sctp_sk(sk)->bind_hash->fastreuse = 1;
5992 err = sctp_stream_listen(sk, backlog); 5937 goto out;
5993 break;
5994 default:
5995 break;
5996 } 5938 }
5997 5939
5998 if (err) 5940 /* If we are already listening, just update the backlog */
5999 goto cleanup; 5941 if (sctp_sstate(sk, LISTENING))
5942 sk->sk_max_ack_backlog = backlog;
5943 else {
5944 err = sctp_listen_start(sk, backlog);
5945 if (err)
5946 goto out;
5947 }
6000 5948
6001 /* Store away the transform reference. */ 5949 err = 0;
6002 if (!sctp_sk(sk)->hmac)
6003 sctp_sk(sk)->hmac = tfm;
6004out: 5950out:
6005 sctp_release_sock(sk); 5951 sctp_release_sock(sk);
6006 return err; 5952 return err;
6007cleanup:
6008 crypto_free_hash(tfm);
6009 goto out;
6010} 5953}
6011 5954
6012/* 5955/*
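
From user space the consolidated listen() behaviour reads as: a positive backlog enables listening (or just updates the backlog if the endpoint already listens), and a backlog of 0 disables it again, on both one-to-one and one-to-many style sockets. A hypothetical user-space sketch, error handling trimmed:

#include <sys/socket.h>
#include <netinet/in.h>

int example_sctp_listener(void)
{
	int fd = socket(AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP);
	struct sockaddr_in sin = {
		.sin_family = AF_INET,
		.sin_port   = htons(5000),	/* arbitrary example port */
	};

	bind(fd, (struct sockaddr *)&sin, sizeof(sin));
	listen(fd, 8);		/* start accepting new associations */
	/* ... */
	listen(fd, 0);		/* backlog of 0 disables listening again */
	return fd;
}
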
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 5c29b14ee9af..e5dde45c79d3 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -543,8 +543,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
543 * congestion indications more than once every window of 543 * congestion indications more than once every window of
544 * data (or more loosely more than once every round-trip time). 544 * data (or more loosely more than once every round-trip time).
545 */ 545 */
546 if ((jiffies - transport->last_time_ecne_reduced) > 546 if (time_after(jiffies, transport->last_time_ecne_reduced +
547 transport->rtt) { 547 transport->rtt)) {
548 transport->ssthresh = max(transport->cwnd/2, 548 transport->ssthresh = max(transport->cwnd/2,
549 4*transport->asoc->pathmtu); 549 4*transport->asoc->pathmtu);
550 transport->cwnd = transport->ssthresh; 550 transport->cwnd = transport->ssthresh;
@@ -561,7 +561,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport,
561 * to be done every RTO interval, we do it every hearbeat 561 * to be done every RTO interval, we do it every hearbeat
562 * interval. 562 * interval.
563 */ 563 */
564 if ((jiffies - transport->last_time_used) > transport->rto) 564 if (time_after(jiffies, transport->last_time_used +
565 transport->rto))
565 transport->cwnd = max(transport->cwnd/2, 566 transport->cwnd = max(transport->cwnd/2,
566 4*transport->asoc->pathmtu); 567 4*transport->asoc->pathmtu);
567 break; 568 break;
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 3ddaff42d1bb..a3bfd4064912 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -119,7 +119,7 @@ static struct bclink *bclink = NULL;
119static struct link *bcl = NULL; 119static struct link *bcl = NULL;
120static DEFINE_SPINLOCK(bc_lock); 120static DEFINE_SPINLOCK(bc_lock);
121 121
122char tipc_bclink_name[] = "multicast-link"; 122const char tipc_bclink_name[] = "multicast-link";
123 123
124 124
125static u32 buf_seqno(struct sk_buff *buf) 125static u32 buf_seqno(struct sk_buff *buf)
@@ -800,7 +800,7 @@ int tipc_bclink_init(void)
800 tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT); 800 tipc_link_set_queue_limits(bcl, BCLINK_WIN_DEFAULT);
801 bcl->b_ptr = &bcbearer->bearer; 801 bcl->b_ptr = &bcbearer->bearer;
802 bcl->state = WORKING_WORKING; 802 bcl->state = WORKING_WORKING;
803 sprintf(bcl->name, tipc_bclink_name); 803 strlcpy(bcl->name, tipc_bclink_name, TIPC_MAX_LINK_NAME);
804 804
805 if (BCLINK_LOG_BUF_SIZE) { 805 if (BCLINK_LOG_BUF_SIZE) {
806 char *pb = kmalloc(BCLINK_LOG_BUF_SIZE, GFP_ATOMIC); 806 char *pb = kmalloc(BCLINK_LOG_BUF_SIZE, GFP_ATOMIC);
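
The bcast.c change replaces sprintf(bcl->name, tipc_bclink_name) with strlcpy(), which bounds the copy to the destination size, guarantees NUL termination, and avoids treating the (now const) link name as a format string. Illustration with assumed names and buffer sizes:

#include <linux/string.h>

static const char example_name[] = "multicast-link";

static void example_set_name(char *dst, size_t dst_len)
{
	/* bounded copy, always NUL-terminated, no format-string parsing */
	strlcpy(dst, example_name, dst_len);
}
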
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 2f2d731bc1c2..4c1771e95c99 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -70,7 +70,7 @@ struct port_list {
70 70
71struct tipc_node; 71struct tipc_node;
72 72
73extern char tipc_bclink_name[]; 73extern const char tipc_bclink_name[];
74 74
75 75
76/** 76/**
diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c
index 29ecae851668..1885a7edb0c8 100644
--- a/net/tipc/dbg.c
+++ b/net/tipc/dbg.c
@@ -258,7 +258,7 @@ void tipc_printf(struct print_buf *pb, const char *fmt, ...)
258 } 258 }
259 259
260 if (pb->echo) 260 if (pb->echo)
261 printk(print_string); 261 printk("%s", print_string);
262 262
263 spin_unlock_bh(&print_lock); 263 spin_unlock_bh(&print_lock);
264} 264}
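
printk(print_string) would interpret any '%' inside the buffered text as a conversion specification; passing the buffer as an argument to a fixed "%s" format avoids that. Generic illustration (example_emit is hypothetical):

static void example_emit(const char *text)
{
	/* wrong: the text itself is parsed as a format string
	 *	printk(text);
	 * right: the text is only ever substituted for %s */
	printk(KERN_DEBUG "%s", text);
}
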
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 20d98c56e152..2c24e7d6d950 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -703,7 +703,7 @@ struct sk_buff *tipc_node_get_links(const void *req_tlv_area, int req_tlv_space)
703 703
704 link_info.dest = htonl(tipc_own_addr & 0xfffff00); 704 link_info.dest = htonl(tipc_own_addr & 0xfffff00);
705 link_info.up = htonl(1); 705 link_info.up = htonl(1);
706 sprintf(link_info.str, tipc_bclink_name); 706 strlcpy(link_info.str, tipc_bclink_name, TIPC_MAX_LINK_NAME);
707 tipc_cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info)); 707 tipc_cfg_append_tlv(buf, TIPC_TLV_LINK_INFO, &link_info, sizeof(link_info));
708 708
709 /* Add TLVs for any other links in scope */ 709 /* Add TLVs for any other links in scope */
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d1b89820ab4f..baac91049b0e 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1178,8 +1178,7 @@ out_unlock:
1178 unix_state_unlock(other); 1178 unix_state_unlock(other);
1179 1179
1180out: 1180out:
1181 if (skb) 1181 kfree_skb(skb);
1182 kfree_skb(skb);
1183 if (newsk) 1182 if (newsk)
1184 unix_release_sock(newsk, 0); 1183 unix_release_sock(newsk, 0);
1185 if (other) 1184 if (other)
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index 39701dec1dba..466e2d22d256 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -86,8 +86,10 @@ static int wanrouter_device_del_if(struct wan_device *wandev,
86 86
87static struct wan_device *wanrouter_find_device(char *name); 87static struct wan_device *wanrouter_find_device(char *name);
88static int wanrouter_delete_interface(struct wan_device *wandev, char *name); 88static int wanrouter_delete_interface(struct wan_device *wandev, char *name);
89static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags); 89static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
90static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags); 90 __acquires(lock);
91static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
92 __releases(lock);
91 93
92 94
93 95
@@ -763,12 +765,14 @@ static int wanrouter_delete_interface(struct wan_device *wandev, char *name)
763} 765}
764 766
765static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) 767static void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
768 __acquires(lock)
766{ 769{
767 spin_lock_irqsave(lock, *smp_flags); 770 spin_lock_irqsave(lock, *smp_flags);
768} 771}
769 772
770 773
771static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) 774static void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags)
775 __releases(lock)
772{ 776{
773 spin_unlock_irqrestore(lock, *smp_flags); 777 spin_unlock_irqrestore(lock, *smp_flags);
774} 778}
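
__acquires()/__releases() are sparse annotations (no-ops for the compiler) that tell the static checker a helper intentionally leaves a lock held, or drops one it did not take, so lock-context balance can still be verified across function boundaries. A sketch of an annotated pair, mirroring the wanrouter helpers but with example placeholders:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);

static void example_lock_irq(unsigned long *flags)
	__acquires(example_lock)
{
	spin_lock_irqsave(&example_lock, *flags);
}

static void example_unlock_irq(unsigned long *flags)
	__releases(example_lock)
{
	spin_unlock_irqrestore(&example_lock, *flags);
}
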
diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c
index 267f7ff49827..c44d96b3a437 100644
--- a/net/wanrouter/wanproc.c
+++ b/net/wanrouter/wanproc.c
@@ -80,6 +80,7 @@ static struct proc_dir_entry *proc_router;
80 * Iterator 80 * Iterator
81 */ 81 */
82static void *r_start(struct seq_file *m, loff_t *pos) 82static void *r_start(struct seq_file *m, loff_t *pos)
83 __acquires(kernel_lock)
83{ 84{
84 struct wan_device *wandev; 85 struct wan_device *wandev;
85 loff_t l = *pos; 86 loff_t l = *pos;
@@ -101,6 +102,7 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
101} 102}
102 103
103static void r_stop(struct seq_file *m, void *v) 104static void r_stop(struct seq_file *m, void *v)
105 __releases(kernel_lock)
104{ 106{
105 unlock_kernel(); 107 unlock_kernel();
106} 108}
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index e28e2b8fa436..092ae6faccca 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -102,3 +102,13 @@ config LIB80211_CRYPT_CCMP
102 102
103config LIB80211_CRYPT_TKIP 103config LIB80211_CRYPT_TKIP
104 tristate 104 tristate
105
106config LIB80211_DEBUG
107 bool "lib80211 debugging messages"
108 depends on LIB80211
109 default n
110 ---help---
111 You can enable this if you want verbose debugging messages
112 from lib80211.
113
114 If unsure, say N.
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 0668b2bfc1da..17fe39049740 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -7,7 +7,6 @@
7#include <linux/if.h> 7#include <linux/if.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/err.h> 9#include <linux/err.h>
10#include <linux/mutex.h>
11#include <linux/list.h> 10#include <linux/list.h>
12#include <linux/nl80211.h> 11#include <linux/nl80211.h>
13#include <linux/debugfs.h> 12#include <linux/debugfs.h>
@@ -31,18 +30,29 @@ MODULE_DESCRIPTION("wireless configuration support");
31 * only read the list, and that can happen quite 30 * only read the list, and that can happen quite
32 * often because we need to do it for each command */ 31 * often because we need to do it for each command */
33LIST_HEAD(cfg80211_drv_list); 32LIST_HEAD(cfg80211_drv_list);
34DEFINE_MUTEX(cfg80211_drv_mutex); 33
34/*
35 * This is used to protect the cfg80211_drv_list, cfg80211_regdomain,
36 * country_ie_regdomain, the reg_beacon_list and the the last regulatory
37 * request receipt (last_request).
38 */
39DEFINE_MUTEX(cfg80211_mutex);
35 40
36/* for debugfs */ 41/* for debugfs */
37static struct dentry *ieee80211_debugfs_dir; 42static struct dentry *ieee80211_debugfs_dir;
38 43
39/* requires cfg80211_drv_mutex to be held! */ 44/* requires cfg80211_mutex to be held! */
40static struct cfg80211_registered_device *cfg80211_drv_by_wiphy(int wiphy) 45struct cfg80211_registered_device *cfg80211_drv_by_wiphy_idx(int wiphy_idx)
41{ 46{
42 struct cfg80211_registered_device *result = NULL, *drv; 47 struct cfg80211_registered_device *result = NULL, *drv;
43 48
49 if (!wiphy_idx_valid(wiphy_idx))
50 return NULL;
51
52 assert_cfg80211_lock();
53
44 list_for_each_entry(drv, &cfg80211_drv_list, list) { 54 list_for_each_entry(drv, &cfg80211_drv_list, list) {
45 if (drv->idx == wiphy) { 55 if (drv->wiphy_idx == wiphy_idx) {
46 result = drv; 56 result = drv;
47 break; 57 break;
48 } 58 }
@@ -51,17 +61,44 @@ static struct cfg80211_registered_device *cfg80211_drv_by_wiphy(int wiphy)
51 return result; 61 return result;
52} 62}
53 63
64int get_wiphy_idx(struct wiphy *wiphy)
65{
66 struct cfg80211_registered_device *drv;
67 if (!wiphy)
68 return WIPHY_IDX_STALE;
69 drv = wiphy_to_dev(wiphy);
70 return drv->wiphy_idx;
71}
72
54/* requires cfg80211_drv_mutex to be held! */ 73/* requires cfg80211_drv_mutex to be held! */
74struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx)
75{
76 struct cfg80211_registered_device *drv;
77
78 if (!wiphy_idx_valid(wiphy_idx))
79 return NULL;
80
81 assert_cfg80211_lock();
82
83 drv = cfg80211_drv_by_wiphy_idx(wiphy_idx);
84 if (!drv)
85 return NULL;
86 return &drv->wiphy;
87}
88
89/* requires cfg80211_mutex to be held! */
55static struct cfg80211_registered_device * 90static struct cfg80211_registered_device *
56__cfg80211_drv_from_info(struct genl_info *info) 91__cfg80211_drv_from_info(struct genl_info *info)
57{ 92{
58 int ifindex; 93 int ifindex;
59 struct cfg80211_registered_device *bywiphy = NULL, *byifidx = NULL; 94 struct cfg80211_registered_device *bywiphyidx = NULL, *byifidx = NULL;
60 struct net_device *dev; 95 struct net_device *dev;
61 int err = -EINVAL; 96 int err = -EINVAL;
62 97
98 assert_cfg80211_lock();
99
63 if (info->attrs[NL80211_ATTR_WIPHY]) { 100 if (info->attrs[NL80211_ATTR_WIPHY]) {
64 bywiphy = cfg80211_drv_by_wiphy( 101 bywiphyidx = cfg80211_drv_by_wiphy_idx(
65 nla_get_u32(info->attrs[NL80211_ATTR_WIPHY])); 102 nla_get_u32(info->attrs[NL80211_ATTR_WIPHY]));
66 err = -ENODEV; 103 err = -ENODEV;
67 } 104 }
@@ -78,14 +115,14 @@ __cfg80211_drv_from_info(struct genl_info *info)
78 err = -ENODEV; 115 err = -ENODEV;
79 } 116 }
80 117
81 if (bywiphy && byifidx) { 118 if (bywiphyidx && byifidx) {
82 if (bywiphy != byifidx) 119 if (bywiphyidx != byifidx)
83 return ERR_PTR(-EINVAL); 120 return ERR_PTR(-EINVAL);
84 else 121 else
85 return bywiphy; /* == byifidx */ 122 return bywiphyidx; /* == byifidx */
86 } 123 }
87 if (bywiphy) 124 if (bywiphyidx)
88 return bywiphy; 125 return bywiphyidx;
89 126
90 if (byifidx) 127 if (byifidx)
91 return byifidx; 128 return byifidx;
@@ -98,7 +135,7 @@ cfg80211_get_dev_from_info(struct genl_info *info)
98{ 135{
99 struct cfg80211_registered_device *drv; 136 struct cfg80211_registered_device *drv;
100 137
101 mutex_lock(&cfg80211_drv_mutex); 138 mutex_lock(&cfg80211_mutex);
102 drv = __cfg80211_drv_from_info(info); 139 drv = __cfg80211_drv_from_info(info);
103 140
104 /* if it is not an error we grab the lock on 141 /* if it is not an error we grab the lock on
@@ -107,7 +144,7 @@ cfg80211_get_dev_from_info(struct genl_info *info)
107 if (!IS_ERR(drv)) 144 if (!IS_ERR(drv))
108 mutex_lock(&drv->mtx); 145 mutex_lock(&drv->mtx);
109 146
110 mutex_unlock(&cfg80211_drv_mutex); 147 mutex_unlock(&cfg80211_mutex);
111 148
112 return drv; 149 return drv;
113} 150}
@@ -118,7 +155,7 @@ cfg80211_get_dev_from_ifindex(int ifindex)
118 struct cfg80211_registered_device *drv = ERR_PTR(-ENODEV); 155 struct cfg80211_registered_device *drv = ERR_PTR(-ENODEV);
119 struct net_device *dev; 156 struct net_device *dev;
120 157
121 mutex_lock(&cfg80211_drv_mutex); 158 mutex_lock(&cfg80211_mutex);
122 dev = dev_get_by_index(&init_net, ifindex); 159 dev = dev_get_by_index(&init_net, ifindex);
123 if (!dev) 160 if (!dev)
124 goto out; 161 goto out;
@@ -129,7 +166,7 @@ cfg80211_get_dev_from_ifindex(int ifindex)
129 drv = ERR_PTR(-ENODEV); 166 drv = ERR_PTR(-ENODEV);
130 dev_put(dev); 167 dev_put(dev);
131 out: 168 out:
132 mutex_unlock(&cfg80211_drv_mutex); 169 mutex_unlock(&cfg80211_mutex);
133 return drv; 170 return drv;
134} 171}
135 172
@@ -143,16 +180,16 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
143 char *newname) 180 char *newname)
144{ 181{
145 struct cfg80211_registered_device *drv; 182 struct cfg80211_registered_device *drv;
146 int idx, taken = -1, result, digits; 183 int wiphy_idx, taken = -1, result, digits;
147 184
148 mutex_lock(&cfg80211_drv_mutex); 185 mutex_lock(&cfg80211_mutex);
149 186
150 /* prohibit calling the thing phy%d when %d is not its number */ 187 /* prohibit calling the thing phy%d when %d is not its number */
151 sscanf(newname, PHY_NAME "%d%n", &idx, &taken); 188 sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken);
152 if (taken == strlen(newname) && idx != rdev->idx) { 189 if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) {
153 /* count number of places needed to print idx */ 190 /* count number of places needed to print wiphy_idx */
154 digits = 1; 191 digits = 1;
155 while (idx /= 10) 192 while (wiphy_idx /= 10)
156 digits++; 193 digits++;
157 /* 194 /*
158 * deny the name if it is phy<idx> where <idx> is printed 195 * deny the name if it is phy<idx> where <idx> is printed
@@ -193,7 +230,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
193 230
194 result = 0; 231 result = 0;
195out_unlock: 232out_unlock:
196 mutex_unlock(&cfg80211_drv_mutex); 233 mutex_unlock(&cfg80211_mutex);
197 if (result == 0) 234 if (result == 0)
198 nl80211_notify_dev_rename(rdev); 235 nl80211_notify_dev_rename(rdev);
199 236
@@ -220,22 +257,22 @@ struct wiphy *wiphy_new(struct cfg80211_ops *ops, int sizeof_priv)
220 257
221 drv->ops = ops; 258 drv->ops = ops;
222 259
223 mutex_lock(&cfg80211_drv_mutex); 260 mutex_lock(&cfg80211_mutex);
224 261
225 drv->idx = wiphy_counter++; 262 drv->wiphy_idx = wiphy_counter++;
226 263
227 if (unlikely(drv->idx < 0)) { 264 if (unlikely(!wiphy_idx_valid(drv->wiphy_idx))) {
228 wiphy_counter--; 265 wiphy_counter--;
229 mutex_unlock(&cfg80211_drv_mutex); 266 mutex_unlock(&cfg80211_mutex);
230 /* ugh, wrapped! */ 267 /* ugh, wrapped! */
231 kfree(drv); 268 kfree(drv);
232 return NULL; 269 return NULL;
233 } 270 }
234 271
235 mutex_unlock(&cfg80211_drv_mutex); 272 mutex_unlock(&cfg80211_mutex);
236 273
237 /* give it a proper name */ 274 /* give it a proper name */
238 dev_set_name(&drv->wiphy.dev, PHY_NAME "%d", drv->idx); 275 dev_set_name(&drv->wiphy.dev, PHY_NAME "%d", drv->wiphy_idx);
239 276
240 mutex_init(&drv->mtx); 277 mutex_init(&drv->mtx);
241 mutex_init(&drv->devlist_mtx); 278 mutex_init(&drv->devlist_mtx);
@@ -310,10 +347,10 @@ int wiphy_register(struct wiphy *wiphy)
310 /* check and set up bitrates */ 347 /* check and set up bitrates */
311 ieee80211_set_bitrate_flags(wiphy); 348 ieee80211_set_bitrate_flags(wiphy);
312 349
313 mutex_lock(&cfg80211_drv_mutex); 350 mutex_lock(&cfg80211_mutex);
314 351
315 /* set up regulatory info */ 352 /* set up regulatory info */
316 wiphy_update_regulatory(wiphy, REGDOM_SET_BY_CORE); 353 wiphy_update_regulatory(wiphy, NL80211_REGDOM_SET_BY_CORE);
317 354
318 res = device_add(&drv->wiphy.dev); 355 res = device_add(&drv->wiphy.dev);
319 if (res) 356 if (res)
@@ -328,9 +365,20 @@ int wiphy_register(struct wiphy *wiphy)
328 if (IS_ERR(drv->wiphy.debugfsdir)) 365 if (IS_ERR(drv->wiphy.debugfsdir))
329 drv->wiphy.debugfsdir = NULL; 366 drv->wiphy.debugfsdir = NULL;
330 367
368 if (wiphy->custom_regulatory) {
369 struct regulatory_request request;
370
371 request.wiphy_idx = get_wiphy_idx(wiphy);
372 request.initiator = NL80211_REGDOM_SET_BY_DRIVER;
373 request.alpha2[0] = '9';
374 request.alpha2[1] = '9';
375
376 nl80211_send_reg_change_event(&request);
377 }
378
331 res = 0; 379 res = 0;
332out_unlock: 380out_unlock:
333 mutex_unlock(&cfg80211_drv_mutex); 381 mutex_unlock(&cfg80211_mutex);
334 return res; 382 return res;
335} 383}
336EXPORT_SYMBOL(wiphy_register); 384EXPORT_SYMBOL(wiphy_register);
@@ -340,7 +388,7 @@ void wiphy_unregister(struct wiphy *wiphy)
340 struct cfg80211_registered_device *drv = wiphy_to_dev(wiphy); 388 struct cfg80211_registered_device *drv = wiphy_to_dev(wiphy);
341 389
342 /* protect the device list */ 390 /* protect the device list */
343 mutex_lock(&cfg80211_drv_mutex); 391 mutex_lock(&cfg80211_mutex);
344 392
345 BUG_ON(!list_empty(&drv->netdev_list)); 393 BUG_ON(!list_empty(&drv->netdev_list));
346 394
@@ -366,7 +414,7 @@ void wiphy_unregister(struct wiphy *wiphy)
366 device_del(&drv->wiphy.dev); 414 device_del(&drv->wiphy.dev);
367 debugfs_remove(drv->wiphy.debugfsdir); 415 debugfs_remove(drv->wiphy.debugfsdir);
368 416
369 mutex_unlock(&cfg80211_drv_mutex); 417 mutex_unlock(&cfg80211_mutex);
370} 418}
371EXPORT_SYMBOL(wiphy_unregister); 419EXPORT_SYMBOL(wiphy_unregister);
372 420
diff --git a/net/wireless/core.h b/net/wireless/core.h
index e29ad4cd464f..6acd483a61f8 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -10,6 +10,7 @@
10#include <linux/netdevice.h> 10#include <linux/netdevice.h>
11#include <linux/kref.h> 11#include <linux/kref.h>
12#include <linux/rbtree.h> 12#include <linux/rbtree.h>
13#include <linux/mutex.h>
13#include <net/genetlink.h> 14#include <net/genetlink.h>
14#include <net/wireless.h> 15#include <net/wireless.h>
15#include <net/cfg80211.h> 16#include <net/cfg80211.h>
@@ -37,7 +38,7 @@ struct cfg80211_registered_device {
37 enum environment_cap env; 38 enum environment_cap env;
38 39
39 /* wiphy index, internal only */ 40 /* wiphy index, internal only */
40 int idx; 41 int wiphy_idx;
41 42
42 /* associate netdev list */ 43 /* associate netdev list */
43 struct mutex devlist_mtx; 44 struct mutex devlist_mtx;
@@ -49,6 +50,7 @@ struct cfg80211_registered_device {
49 struct rb_root bss_tree; 50 struct rb_root bss_tree;
50 u32 bss_generation; 51 u32 bss_generation;
51 struct cfg80211_scan_request *scan_req; /* protected by RTNL */ 52 struct cfg80211_scan_request *scan_req; /* protected by RTNL */
53 unsigned long suspend_at;
52 54
53 /* must be last because of the way we do wiphy_priv(), 55 /* must be last because of the way we do wiphy_priv(),
54 * and it should at least be aligned to NETDEV_ALIGN */ 56 * and it should at least be aligned to NETDEV_ALIGN */
@@ -62,9 +64,27 @@ struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy)
62 return container_of(wiphy, struct cfg80211_registered_device, wiphy); 64 return container_of(wiphy, struct cfg80211_registered_device, wiphy);
63} 65}
64 66
65extern struct mutex cfg80211_drv_mutex; 67/* Note 0 is valid, hence phy0 */
68static inline
69bool wiphy_idx_valid(int wiphy_idx)
70{
71 return (wiphy_idx >= 0);
72}
73
74extern struct mutex cfg80211_mutex;
66extern struct list_head cfg80211_drv_list; 75extern struct list_head cfg80211_drv_list;
67 76
77static inline void assert_cfg80211_lock(void)
78{
79 WARN_ON(!mutex_is_locked(&cfg80211_mutex));
80}
81
82/*
83 * You can use this to mark a wiphy_idx as not having an associated wiphy.
84 * It guarantees cfg80211_drv_by_wiphy_idx(wiphy_idx) will return NULL
85 */
86#define WIPHY_IDX_STALE -1
87
68struct cfg80211_internal_bss { 88struct cfg80211_internal_bss {
69 struct list_head list; 89 struct list_head list;
70 struct rb_node rbn; 90 struct rb_node rbn;
@@ -74,6 +94,9 @@ struct cfg80211_internal_bss {
74 struct cfg80211_bss pub; 94 struct cfg80211_bss pub;
75}; 95};
76 96
97struct cfg80211_registered_device *cfg80211_drv_by_wiphy_idx(int wiphy_idx);
98int get_wiphy_idx(struct wiphy *wiphy);
99
77/* 100/*
78 * This function returns a pointer to the driver 101 * This function returns a pointer to the driver
79 * that the genl_info item that is passed refers to. 102 * that the genl_info item that is passed refers to.
@@ -81,13 +104,13 @@ struct cfg80211_internal_bss {
81 * the driver's mutex! 104 * the driver's mutex!
82 * 105 *
83 * This means that you need to call cfg80211_put_dev() 106 * This means that you need to call cfg80211_put_dev()
84 * before being allowed to acquire &cfg80211_drv_mutex! 107 * before being allowed to acquire &cfg80211_mutex!
85 * 108 *
86 * This is necessary because we need to lock the global 109 * This is necessary because we need to lock the global
87 * mutex to get an item off the list safely, and then 110 * mutex to get an item off the list safely, and then
88 * we lock the drv mutex so it doesn't go away under us. 111 * we lock the drv mutex so it doesn't go away under us.
89 * 112 *
90 * We don't want to keep cfg80211_drv_mutex locked 113 * We don't want to keep cfg80211_mutex locked
91 * for all the time in order to allow requests on 114 * for all the time in order to allow requests on
92 * other interfaces to go through at the same time. 115 * other interfaces to go through at the same time.
93 * 116 *
@@ -97,6 +120,9 @@ struct cfg80211_internal_bss {
97extern struct cfg80211_registered_device * 120extern struct cfg80211_registered_device *
98cfg80211_get_dev_from_info(struct genl_info *info); 121cfg80211_get_dev_from_info(struct genl_info *info);
99 122
123/* requires cfg80211_drv_mutex to be held! */
124struct wiphy *wiphy_idx_to_wiphy(int wiphy_idx);
125
100/* identical to cfg80211_get_dev_from_info but only operate on ifindex */ 126/* identical to cfg80211_get_dev_from_info but only operate on ifindex */
101extern struct cfg80211_registered_device * 127extern struct cfg80211_registered_device *
102cfg80211_get_dev_from_ifindex(int ifindex); 128cfg80211_get_dev_from_ifindex(int ifindex);
@@ -110,8 +136,11 @@ extern int cfg80211_dev_rename(struct cfg80211_registered_device *drv,
110 char *newname); 136 char *newname);
111 137
112void ieee80211_set_bitrate_flags(struct wiphy *wiphy); 138void ieee80211_set_bitrate_flags(struct wiphy *wiphy);
113void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby); 139void wiphy_update_regulatory(struct wiphy *wiphy,
140 enum nl80211_reg_initiator setby);
114 141
115void cfg80211_bss_expire(struct cfg80211_registered_device *dev); 142void cfg80211_bss_expire(struct cfg80211_registered_device *dev);
143void cfg80211_bss_age(struct cfg80211_registered_device *dev,
144 unsigned long age_secs);
116 145
117#endif /* __NET_WIRELESS_CORE_H */ 146#endif /* __NET_WIRELESS_CORE_H */
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
index db428194c16a..2301dc1edc4c 100644
--- a/net/wireless/lib80211_crypt_ccmp.c
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -337,6 +337,7 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
337 pos += 8; 337 pos += 8;
338 338
339 if (ccmp_replay_check(pn, key->rx_pn)) { 339 if (ccmp_replay_check(pn, key->rx_pn)) {
340#ifdef CONFIG_LIB80211_DEBUG
340 if (net_ratelimit()) { 341 if (net_ratelimit()) {
341 printk(KERN_DEBUG "CCMP: replay detected: STA=%pM " 342 printk(KERN_DEBUG "CCMP: replay detected: STA=%pM "
342 "previous PN %02x%02x%02x%02x%02x%02x " 343 "previous PN %02x%02x%02x%02x%02x%02x "
@@ -346,6 +347,7 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
346 key->rx_pn[3], key->rx_pn[4], key->rx_pn[5], 347 key->rx_pn[3], key->rx_pn[4], key->rx_pn[5],
347 pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]); 348 pn[0], pn[1], pn[2], pn[3], pn[4], pn[5]);
348 } 349 }
350#endif
349 key->dot11RSNAStatsCCMPReplays++; 351 key->dot11RSNAStatsCCMPReplays++;
350 return -4; 352 return -4;
351 } 353 }
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index 7e8e22bfed90..c36287399d7e 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -465,12 +465,14 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
465 pos += 8; 465 pos += 8;
466 466
467 if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) { 467 if (tkip_replay_check(iv32, iv16, tkey->rx_iv32, tkey->rx_iv16)) {
468#ifdef CONFIG_LIB80211_DEBUG
468 if (net_ratelimit()) { 469 if (net_ratelimit()) {
469 printk(KERN_DEBUG "TKIP: replay detected: STA=%pM" 470 printk(KERN_DEBUG "TKIP: replay detected: STA=%pM"
470 " previous TSC %08x%04x received TSC " 471 " previous TSC %08x%04x received TSC "
471 "%08x%04x\n", hdr->addr2, 472 "%08x%04x\n", hdr->addr2,
472 tkey->rx_iv32, tkey->rx_iv16, iv32, iv16); 473 tkey->rx_iv32, tkey->rx_iv16, iv32, iv16);
473 } 474 }
475#endif
474 tkey->dot11RSNAStatsTKIPReplays++; 476 tkey->dot11RSNAStatsTKIPReplays++;
475 return -4; 477 return -4;
476 } 478 }
@@ -505,10 +507,12 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
505 * it needs to be recalculated for the next packet. */ 507 * it needs to be recalculated for the next packet. */
506 tkey->rx_phase1_done = 0; 508 tkey->rx_phase1_done = 0;
507 } 509 }
510#ifdef CONFIG_LIB80211_DEBUG
508 if (net_ratelimit()) { 511 if (net_ratelimit()) {
509 printk(KERN_DEBUG "TKIP: ICV error detected: STA=" 512 printk(KERN_DEBUG "TKIP: ICV error detected: STA="
510 "%pM\n", hdr->addr2); 513 "%pM\n", hdr->addr2);
511 } 514 }
515#endif
512 tkey->dot11RSNAStatsTKIPICVErrors++; 516 tkey->dot11RSNAStatsTKIPICVErrors++;
513 return -5; 517 return -5;
514 } 518 }
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 298a4de59948..ab9d8f14e151 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7,7 +7,6 @@
7#include <linux/if.h> 7#include <linux/if.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/err.h> 9#include <linux/err.h>
10#include <linux/mutex.h>
11#include <linux/list.h> 10#include <linux/list.h>
12#include <linux/if_ether.h> 11#include <linux/if_ether.h>
13#include <linux/ieee80211.h> 12#include <linux/ieee80211.h>
@@ -142,7 +141,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags,
142 if (!hdr) 141 if (!hdr)
143 return -1; 142 return -1;
144 143
145 NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->idx); 144 NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, dev->wiphy_idx);
146 NLA_PUT_STRING(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy)); 145 NLA_PUT_STRING(msg, NL80211_ATTR_WIPHY_NAME, wiphy_name(&dev->wiphy));
147 NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS, 146 NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_SCAN_SSIDS,
148 dev->wiphy.max_scan_ssids); 147 dev->wiphy.max_scan_ssids);
@@ -256,7 +255,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
256 int start = cb->args[0]; 255 int start = cb->args[0];
257 struct cfg80211_registered_device *dev; 256 struct cfg80211_registered_device *dev;
258 257
259 mutex_lock(&cfg80211_drv_mutex); 258 mutex_lock(&cfg80211_mutex);
260 list_for_each_entry(dev, &cfg80211_drv_list, list) { 259 list_for_each_entry(dev, &cfg80211_drv_list, list) {
261 if (++idx <= start) 260 if (++idx <= start)
262 continue; 261 continue;
@@ -267,7 +266,7 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb)
267 break; 266 break;
268 } 267 }
269 } 268 }
270 mutex_unlock(&cfg80211_drv_mutex); 269 mutex_unlock(&cfg80211_mutex);
271 270
272 cb->args[0] = idx; 271 cb->args[0] = idx;
273 272
@@ -470,7 +469,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
470 struct cfg80211_registered_device *dev; 469 struct cfg80211_registered_device *dev;
471 struct wireless_dev *wdev; 470 struct wireless_dev *wdev;
472 471
473 mutex_lock(&cfg80211_drv_mutex); 472 mutex_lock(&cfg80211_mutex);
474 list_for_each_entry(dev, &cfg80211_drv_list, list) { 473 list_for_each_entry(dev, &cfg80211_drv_list, list) {
475 if (wp_idx < wp_start) { 474 if (wp_idx < wp_start) {
476 wp_idx++; 475 wp_idx++;
@@ -497,7 +496,7 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
497 wp_idx++; 496 wp_idx++;
498 } 497 }
499 out: 498 out:
500 mutex_unlock(&cfg80211_drv_mutex); 499 mutex_unlock(&cfg80211_mutex);
501 500
502 cb->args[0] = wp_idx; 501 cb->args[0] = wp_idx;
503 cb->args[1] = if_idx; 502 cb->args[1] = if_idx;
@@ -1206,6 +1205,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq,
1206 1205
1207 nla_nest_end(msg, txrate); 1206 nla_nest_end(msg, txrate);
1208 } 1207 }
1208 if (sinfo->filled & STATION_INFO_RX_PACKETS)
1209 NLA_PUT_U32(msg, NL80211_STA_INFO_RX_PACKETS,
1210 sinfo->rx_packets);
1211 if (sinfo->filled & STATION_INFO_TX_PACKETS)
1212 NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS,
1213 sinfo->tx_packets);
1209 nla_nest_end(msg, sinfoattr); 1214 nla_nest_end(msg, sinfoattr);
1210 1215
1211 return genlmsg_end(msg, hdr); 1216 return genlmsg_end(msg, hdr);
@@ -1900,6 +1905,19 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
1900 int r; 1905 int r;
1901 char *data = NULL; 1906 char *data = NULL;
1902 1907
1908 /*
1909 * You should only get this when cfg80211 hasn't yet initialized
 1910 * completely when built into the kernel, in the time window
 1911 * between nl80211_init() and regulatory_init(), if that is
1912 * even possible.
1913 */
1914 mutex_lock(&cfg80211_mutex);
1915 if (unlikely(!cfg80211_regdomain)) {
1916 mutex_unlock(&cfg80211_mutex);
1917 return -EINPROGRESS;
1918 }
1919 mutex_unlock(&cfg80211_mutex);
1920
1903 if (!info->attrs[NL80211_ATTR_REG_ALPHA2]) 1921 if (!info->attrs[NL80211_ATTR_REG_ALPHA2])
1904 return -EINVAL; 1922 return -EINVAL;
1905 1923
@@ -1910,14 +1928,9 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info)
1910 if (is_world_regdom(data)) 1928 if (is_world_regdom(data))
1911 return -EINVAL; 1929 return -EINVAL;
1912#endif 1930#endif
1913 mutex_lock(&cfg80211_drv_mutex); 1931
1914 r = __regulatory_hint(NULL, REGDOM_SET_BY_USER, data, 0, ENVIRON_ANY); 1932 r = regulatory_hint_user(data);
1915 mutex_unlock(&cfg80211_drv_mutex); 1933
1916 /* This means the regulatory domain was already set, however
1917 * we don't want to confuse userspace with a "successful error"
1918 * message so lets just treat it as a success */
1919 if (r == -EALREADY)
1920 r = 0;
1921 return r; 1934 return r;
1922} 1935}
1923 1936
@@ -1937,6 +1950,11 @@ static int nl80211_get_mesh_params(struct sk_buff *skb,
1937 if (err) 1950 if (err)
1938 return err; 1951 return err;
1939 1952
1953 if (!drv->ops->get_mesh_params) {
1954 err = -EOPNOTSUPP;
1955 goto out;
1956 }
1957
1940 /* Get the mesh params */ 1958 /* Get the mesh params */
1941 rtnl_lock(); 1959 rtnl_lock();
1942 err = drv->ops->get_mesh_params(&drv->wiphy, dev, &cur_params); 1960 err = drv->ops->get_mesh_params(&drv->wiphy, dev, &cur_params);
@@ -2046,6 +2064,11 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
2046 if (err) 2064 if (err)
2047 return err; 2065 return err;
2048 2066
2067 if (!drv->ops->set_mesh_params) {
2068 err = -EOPNOTSUPP;
2069 goto out;
2070 }
2071
2049 /* This makes sure that there aren't more than 32 mesh config 2072 /* This makes sure that there aren't more than 32 mesh config
2050 * parameters (otherwise our bitfield scheme would not work.) */ 2073 * parameters (otherwise our bitfield scheme would not work.) */
2051 BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32); 2074 BUILD_BUG_ON(NL80211_MESHCONF_ATTR_MAX > 32);
@@ -2090,6 +2113,7 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info)
2090 err = drv->ops->set_mesh_params(&drv->wiphy, dev, &cfg, mask); 2113 err = drv->ops->set_mesh_params(&drv->wiphy, dev, &cfg, mask);
2091 rtnl_unlock(); 2114 rtnl_unlock();
2092 2115
2116 out:
2093 /* cleanup */ 2117 /* cleanup */
2094 cfg80211_put_dev(drv); 2118 cfg80211_put_dev(drv);
2095 dev_put(dev); 2119 dev_put(dev);
@@ -2106,7 +2130,7 @@ static int nl80211_get_reg(struct sk_buff *skb, struct genl_info *info)
2106 unsigned int i; 2130 unsigned int i;
2107 int err = -EINVAL; 2131 int err = -EINVAL;
2108 2132
2109 mutex_lock(&cfg80211_drv_mutex); 2133 mutex_lock(&cfg80211_mutex);
2110 2134
2111 if (!cfg80211_regdomain) 2135 if (!cfg80211_regdomain)
2112 goto out; 2136 goto out;
@@ -2169,7 +2193,7 @@ nla_put_failure:
2169 genlmsg_cancel(msg, hdr); 2193 genlmsg_cancel(msg, hdr);
2170 err = -EMSGSIZE; 2194 err = -EMSGSIZE;
2171out: 2195out:
2172 mutex_unlock(&cfg80211_drv_mutex); 2196 mutex_unlock(&cfg80211_mutex);
2173 return err; 2197 return err;
2174} 2198}
2175 2199
@@ -2228,9 +2252,9 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info)
2228 2252
2229 BUG_ON(rule_idx != num_rules); 2253 BUG_ON(rule_idx != num_rules);
2230 2254
2231 mutex_lock(&cfg80211_drv_mutex); 2255 mutex_lock(&cfg80211_mutex);
2232 r = set_regdom(rd); 2256 r = set_regdom(rd);
2233 mutex_unlock(&cfg80211_drv_mutex); 2257 mutex_unlock(&cfg80211_mutex);
2234 return r; 2258 return r;
2235 2259
2236 bad_reg: 2260 bad_reg:
@@ -2286,6 +2310,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
2286 struct wiphy *wiphy; 2310 struct wiphy *wiphy;
2287 int err, tmp, n_ssids = 0, n_channels = 0, i; 2311 int err, tmp, n_ssids = 0, n_channels = 0, i;
2288 enum ieee80211_band band; 2312 enum ieee80211_band band;
2313 size_t ie_len;
2289 2314
2290 err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev); 2315 err = get_drv_dev_by_info_ifindex(info->attrs, &drv, &dev);
2291 if (err) 2316 if (err)
@@ -2327,9 +2352,15 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
2327 goto out_unlock; 2352 goto out_unlock;
2328 } 2353 }
2329 2354
2355 if (info->attrs[NL80211_ATTR_IE])
2356 ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
2357 else
2358 ie_len = 0;
2359
2330 request = kzalloc(sizeof(*request) 2360 request = kzalloc(sizeof(*request)
2331 + sizeof(*ssid) * n_ssids 2361 + sizeof(*ssid) * n_ssids
2332 + sizeof(channel) * n_channels, GFP_KERNEL); 2362 + sizeof(channel) * n_channels
2363 + ie_len, GFP_KERNEL);
2333 if (!request) { 2364 if (!request) {
2334 err = -ENOMEM; 2365 err = -ENOMEM;
2335 goto out_unlock; 2366 goto out_unlock;
@@ -2340,6 +2371,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
2340 if (n_ssids) 2371 if (n_ssids)
2341 request->ssids = (void *)(request->channels + n_channels); 2372 request->ssids = (void *)(request->channels + n_channels);
2342 request->n_ssids = n_ssids; 2373 request->n_ssids = n_ssids;
2374 if (ie_len) {
2375 if (request->ssids)
2376 request->ie = (void *)(request->ssids + n_ssids);
2377 else
2378 request->ie = (void *)(request->channels + n_channels);
2379 }
2343 2380
2344 if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { 2381 if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) {
2345 /* user specified, bail out if channel not found */ 2382 /* user specified, bail out if channel not found */
@@ -2380,6 +2417,12 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
2380 } 2417 }
2381 } 2418 }
2382 2419
2420 if (info->attrs[NL80211_ATTR_IE]) {
2421 request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
2422 memcpy(request->ie, nla_data(info->attrs[NL80211_ATTR_IE]),
2423 request->ie_len);
2424 }
2425
2383 request->ifidx = dev->ifindex; 2426 request->ifidx = dev->ifindex;
2384 request->wiphy = &drv->wiphy; 2427 request->wiphy = &drv->wiphy;
2385 2428
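The scan hunks above rely on a single-allocation layout: one kzalloc() covers the request header, the channel pointers, the SSID entries and the raw IE bytes, and request->ssids / request->ie simply point into that tail, so one kfree() later releases everything. A self-contained userspace sketch of the same pattern -- the types here are simplified stand-ins, not the real cfg80211 structures:

#include <stdlib.h>

struct fake_ssid { unsigned char ssid[32]; size_t ssid_len; };

struct fake_scan_request {
	struct fake_ssid *ssids;	/* aimed into the trailing storage */
	unsigned char *ie;
	size_t n_channels, n_ssids, ie_len;
	void *channels[];		/* flexible array: first trailing block */
};

static struct fake_scan_request *
alloc_scan_request(size_t n_channels, size_t n_ssids, size_t ie_len)
{
	struct fake_scan_request *req;

	/* one allocation for the header plus all variable-length tails */
	req = calloc(1, sizeof(*req)
			+ sizeof(void *) * n_channels
			+ sizeof(struct fake_ssid) * n_ssids
			+ ie_len);
	if (!req)
		return NULL;

	req->n_channels = n_channels;
	req->n_ssids = n_ssids;
	req->ie_len = ie_len;

	if (n_ssids)
		req->ssids = (struct fake_ssid *)(req->channels + n_channels);
	if (ie_len)
		req->ie = n_ssids ?
			(unsigned char *)(req->ssids + n_ssids) :
			(unsigned char *)(req->channels + n_channels);

	return req;	/* a single free(req) releases the whole thing */
}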
@@ -2432,7 +2475,7 @@ static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags,
2432 NLA_PUT_U16(msg, NL80211_BSS_CAPABILITY, res->capability); 2475 NLA_PUT_U16(msg, NL80211_BSS_CAPABILITY, res->capability);
2433 NLA_PUT_U32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq); 2476 NLA_PUT_U32(msg, NL80211_BSS_FREQUENCY, res->channel->center_freq);
2434 2477
2435 switch (res->signal_type) { 2478 switch (rdev->wiphy.signal_type) {
2436 case CFG80211_SIGNAL_TYPE_MBM: 2479 case CFG80211_SIGNAL_TYPE_MBM:
2437 NLA_PUT_U32(msg, NL80211_BSS_SIGNAL_MBM, res->signal); 2480 NLA_PUT_U32(msg, NL80211_BSS_SIGNAL_MBM, res->signal);
2438 break; 2481 break;
@@ -2601,7 +2644,6 @@ static struct genl_ops nl80211_ops[] = {
2601 .doit = nl80211_get_station, 2644 .doit = nl80211_get_station,
2602 .dumpit = nl80211_dump_station, 2645 .dumpit = nl80211_dump_station,
2603 .policy = nl80211_policy, 2646 .policy = nl80211_policy,
2604 .flags = GENL_ADMIN_PERM,
2605 }, 2647 },
2606 { 2648 {
2607 .cmd = NL80211_CMD_SET_STATION, 2649 .cmd = NL80211_CMD_SET_STATION,
@@ -2708,6 +2750,9 @@ static struct genl_multicast_group nl80211_config_mcgrp = {
2708static struct genl_multicast_group nl80211_scan_mcgrp = { 2750static struct genl_multicast_group nl80211_scan_mcgrp = {
2709 .name = "scan", 2751 .name = "scan",
2710}; 2752};
2753static struct genl_multicast_group nl80211_regulatory_mcgrp = {
2754 .name = "regulatory",
2755};
2711 2756
2712/* notification functions */ 2757/* notification functions */
2713 2758
@@ -2739,7 +2784,7 @@ static int nl80211_send_scan_donemsg(struct sk_buff *msg,
2739 if (!hdr) 2784 if (!hdr)
2740 return -1; 2785 return -1;
2741 2786
2742 NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->idx); 2787 NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx);
2743 NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex); 2788 NLA_PUT_U32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex);
2744 2789
2745 /* XXX: we should probably bounce back the request? */ 2790 /* XXX: we should probably bounce back the request? */
@@ -2787,6 +2832,61 @@ void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev,
2787 genlmsg_multicast(msg, 0, nl80211_scan_mcgrp.id, GFP_KERNEL); 2832 genlmsg_multicast(msg, 0, nl80211_scan_mcgrp.id, GFP_KERNEL);
2788} 2833}
2789 2834
2835/*
2836 * This can happen on global regulatory changes or device specific settings
2837 * based on custom world regulatory domains.
2838 */
2839void nl80211_send_reg_change_event(struct regulatory_request *request)
2840{
2841 struct sk_buff *msg;
2842 void *hdr;
2843
2844 msg = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
2845 if (!msg)
2846 return;
2847
2848 hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_REG_CHANGE);
2849 if (!hdr) {
2850 nlmsg_free(msg);
2851 return;
2852 }
2853
 2854 /* Userspace can always count on this attribute being set */
2855 NLA_PUT_U8(msg, NL80211_ATTR_REG_INITIATOR, request->initiator);
2856
2857 if (request->alpha2[0] == '0' && request->alpha2[1] == '0')
2858 NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE,
2859 NL80211_REGDOM_TYPE_WORLD);
2860 else if (request->alpha2[0] == '9' && request->alpha2[1] == '9')
2861 NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE,
2862 NL80211_REGDOM_TYPE_CUSTOM_WORLD);
2863 else if ((request->alpha2[0] == '9' && request->alpha2[1] == '8') ||
2864 request->intersect)
2865 NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE,
2866 NL80211_REGDOM_TYPE_INTERSECTION);
2867 else {
2868 NLA_PUT_U8(msg, NL80211_ATTR_REG_TYPE,
2869 NL80211_REGDOM_TYPE_COUNTRY);
2870 NLA_PUT_STRING(msg, NL80211_ATTR_REG_ALPHA2, request->alpha2);
2871 }
2872
2873 if (wiphy_idx_valid(request->wiphy_idx))
2874 NLA_PUT_U32(msg, NL80211_ATTR_WIPHY, request->wiphy_idx);
2875
2876 if (genlmsg_end(msg, hdr) < 0) {
2877 nlmsg_free(msg);
2878 return;
2879 }
2880
2881 genlmsg_multicast(msg, 0, nl80211_regulatory_mcgrp.id, GFP_KERNEL);
2882
2883 return;
2884
2885nla_put_failure:
2886 genlmsg_cancel(msg, hdr);
2887 nlmsg_free(msg);
2888}
2889
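nl80211_send_reg_change_event() above leans on the pseudo country codes reg.c uses internally: "00" is the world regulatory domain, "99" a driver-built domain with no known alpha2, and "98" the result of an intersection. A small summary helper, written here only to make that mapping explicit -- it is not part of the patch:

static int reg_type_from_request(const struct regulatory_request *request)
{
	if (request->alpha2[0] == '0' && request->alpha2[1] == '0')
		return NL80211_REGDOM_TYPE_WORLD;	  /* world regdomain */
	if (request->alpha2[0] == '9' && request->alpha2[1] == '9')
		return NL80211_REGDOM_TYPE_CUSTOM_WORLD;  /* driver-built, unknown alpha2 */
	if ((request->alpha2[0] == '9' && request->alpha2[1] == '8') ||
	    request->intersect)
		return NL80211_REGDOM_TYPE_INTERSECTION;  /* intersected domains */
	return NL80211_REGDOM_TYPE_COUNTRY;		  /* a real ISO alpha2 */
}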
2790/* initialisation/exit functions */ 2890/* initialisation/exit functions */
2791 2891
2792int nl80211_init(void) 2892int nl80211_init(void)
@@ -2811,6 +2911,10 @@ int nl80211_init(void)
2811 if (err) 2911 if (err)
2812 goto err_out; 2912 goto err_out;
2813 2913
2914 err = genl_register_mc_group(&nl80211_fam, &nl80211_regulatory_mcgrp);
2915 if (err)
2916 goto err_out;
2917
2814 return 0; 2918 return 0;
2815 err_out: 2919 err_out:
2816 genl_unregister_family(&nl80211_fam); 2920 genl_unregister_family(&nl80211_fam);
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index b565a5f84e97..e65a3c38c52f 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -11,6 +11,7 @@ extern void nl80211_send_scan_done(struct cfg80211_registered_device *rdev,
11 struct net_device *netdev); 11 struct net_device *netdev);
12extern void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev, 12extern void nl80211_send_scan_aborted(struct cfg80211_registered_device *rdev,
13 struct net_device *netdev); 13 struct net_device *netdev);
14extern void nl80211_send_reg_change_event(struct regulatory_request *request);
14#else 15#else
15static inline int nl80211_init(void) 16static inline int nl80211_init(void)
16{ 17{
@@ -27,6 +28,14 @@ static inline void
27nl80211_send_scan_done(struct cfg80211_registered_device *rdev, 28nl80211_send_scan_done(struct cfg80211_registered_device *rdev,
28 struct net_device *netdev) 29 struct net_device *netdev)
29{} 30{}
31static inline void nl80211_send_scan_aborted(
32 struct cfg80211_registered_device *rdev,
33 struct net_device *netdev)
34{}
35static inline void
36nl80211_send_reg_change_event(struct regulatory_request *request)
37{
38}
30#endif /* CONFIG_NL80211 */ 39#endif /* CONFIG_NL80211 */
31 40
32#endif /* __NET_WIRELESS_NL80211_H */ 41#endif /* __NET_WIRELESS_NL80211_H */
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 2323644330cd..eb8b8ed16155 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -41,6 +41,7 @@
41#include <net/cfg80211.h> 41#include <net/cfg80211.h>
42#include "core.h" 42#include "core.h"
43#include "reg.h" 43#include "reg.h"
44#include "nl80211.h"
44 45
45/* Receipt of information from last regulatory request */ 46/* Receipt of information from last regulatory request */
46static struct regulatory_request *last_request; 47static struct regulatory_request *last_request;
@@ -54,22 +55,63 @@ static u32 supported_bandwidths[] = {
54 MHZ_TO_KHZ(20), 55 MHZ_TO_KHZ(20),
55}; 56};
56 57
57/* Central wireless core regulatory domains, we only need two, 58/*
59 * Central wireless core regulatory domains, we only need two,
58 * the current one and a world regulatory domain in case we have no 60 * the current one and a world regulatory domain in case we have no
59 * information to give us an alpha2 */ 61 * information to give us an alpha2
62 */
60const struct ieee80211_regdomain *cfg80211_regdomain; 63const struct ieee80211_regdomain *cfg80211_regdomain;
61 64
62/* We use this as a place for the rd structure built from the 65/*
66 * We use this as a place for the rd structure built from the
63 * last parsed country IE to rest until CRDA gets back to us with 67 * last parsed country IE to rest until CRDA gets back to us with
64 * what it thinks should apply for the same country */ 68 * what it thinks should apply for the same country
69 */
65static const struct ieee80211_regdomain *country_ie_regdomain; 70static const struct ieee80211_regdomain *country_ie_regdomain;
66 71
72/* Used to queue up regulatory hints */
73static LIST_HEAD(reg_requests_list);
74static spinlock_t reg_requests_lock;
75
76/* Used to queue up beacon hints for review */
77static LIST_HEAD(reg_pending_beacons);
78static spinlock_t reg_pending_beacons_lock;
79
80/* Used to keep track of processed beacon hints */
81static LIST_HEAD(reg_beacon_list);
82
83struct reg_beacon {
84 struct list_head list;
85 struct ieee80211_channel chan;
86};
87
67/* We keep a static world regulatory domain in case of the absence of CRDA */ 88/* We keep a static world regulatory domain in case of the absence of CRDA */
68static const struct ieee80211_regdomain world_regdom = { 89static const struct ieee80211_regdomain world_regdom = {
69 .n_reg_rules = 1, 90 .n_reg_rules = 5,
70 .alpha2 = "00", 91 .alpha2 = "00",
71 .reg_rules = { 92 .reg_rules = {
72 REG_RULE(2412-10, 2462+10, 40, 6, 20, 93 /* IEEE 802.11b/g, channels 1..11 */
94 REG_RULE(2412-10, 2462+10, 40, 6, 20, 0),
95 /* IEEE 802.11b/g, channels 12..13. No HT40
96 * channel fits here. */
97 REG_RULE(2467-10, 2472+10, 20, 6, 20,
98 NL80211_RRF_PASSIVE_SCAN |
99 NL80211_RRF_NO_IBSS),
100 /* IEEE 802.11 channel 14 - Only JP enables
101 * this and for 802.11b only */
102 REG_RULE(2484-10, 2484+10, 20, 6, 20,
103 NL80211_RRF_PASSIVE_SCAN |
104 NL80211_RRF_NO_IBSS |
105 NL80211_RRF_NO_OFDM),
106 /* IEEE 802.11a, channel 36..48 */
107 REG_RULE(5180-10, 5240+10, 40, 6, 20,
108 NL80211_RRF_PASSIVE_SCAN |
109 NL80211_RRF_NO_IBSS),
110
 111 /* NB: 5260 MHz - 5700 MHz requires DFS */
112
113 /* IEEE 802.11a, channel 149..165 */
114 REG_RULE(5745-10, 5825+10, 40, 6, 20,
73 NL80211_RRF_PASSIVE_SCAN | 115 NL80211_RRF_PASSIVE_SCAN |
74 NL80211_RRF_NO_IBSS), 116 NL80211_RRF_NO_IBSS),
75 } 117 }
@@ -83,9 +125,11 @@ static char *ieee80211_regdom = "US";
83module_param(ieee80211_regdom, charp, 0444); 125module_param(ieee80211_regdom, charp, 0444);
84MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code"); 126MODULE_PARM_DESC(ieee80211_regdom, "IEEE 802.11 regulatory domain code");
85 127
86/* We assume 40 MHz bandwidth for the old regulatory work. 128/*
129 * We assume 40 MHz bandwidth for the old regulatory work.
87 * We make emphasis we are using the exact same frequencies 130 * We make emphasis we are using the exact same frequencies
88 * as before */ 131 * as before
132 */
89 133
90static const struct ieee80211_regdomain us_regdom = { 134static const struct ieee80211_regdomain us_regdom = {
91 .n_reg_rules = 6, 135 .n_reg_rules = 6,
@@ -124,8 +168,10 @@ static const struct ieee80211_regdomain jp_regdom = {
124 168
125static const struct ieee80211_regdomain eu_regdom = { 169static const struct ieee80211_regdomain eu_regdom = {
126 .n_reg_rules = 6, 170 .n_reg_rules = 6,
127 /* This alpha2 is bogus, we leave it here just for stupid 171 /*
128 * backward compatibility */ 172 * This alpha2 is bogus, we leave it here just for stupid
173 * backward compatibility
174 */
129 .alpha2 = "EU", 175 .alpha2 = "EU",
130 .reg_rules = { 176 .reg_rules = {
131 /* IEEE 802.11b/g, channels 1..13 */ 177 /* IEEE 802.11b/g, channels 1..13 */
@@ -194,8 +240,10 @@ static void reset_regdomains(void)
194 cfg80211_regdomain = NULL; 240 cfg80211_regdomain = NULL;
195} 241}
196 242
197/* Dynamic world regulatory domain requested by the wireless 243/*
198 * core upon initialization */ 244 * Dynamic world regulatory domain requested by the wireless
245 * core upon initialization
246 */
199static void update_world_regdomain(const struct ieee80211_regdomain *rd) 247static void update_world_regdomain(const struct ieee80211_regdomain *rd)
200{ 248{
201 BUG_ON(!last_request); 249 BUG_ON(!last_request);
@@ -236,8 +284,10 @@ static bool is_unknown_alpha2(const char *alpha2)
236{ 284{
237 if (!alpha2) 285 if (!alpha2)
238 return false; 286 return false;
239 /* Special case where regulatory domain was built by driver 287 /*
240 * but a specific alpha2 cannot be determined */ 288 * Special case where regulatory domain was built by driver
289 * but a specific alpha2 cannot be determined
290 */
241 if (alpha2[0] == '9' && alpha2[1] == '9') 291 if (alpha2[0] == '9' && alpha2[1] == '9')
242 return true; 292 return true;
243 return false; 293 return false;
@@ -247,9 +297,11 @@ static bool is_intersected_alpha2(const char *alpha2)
247{ 297{
248 if (!alpha2) 298 if (!alpha2)
249 return false; 299 return false;
250 /* Special case where regulatory domain is the 300 /*
301 * Special case where regulatory domain is the
251 * result of an intersection between two regulatory domain 302 * result of an intersection between two regulatory domain
252 * structures */ 303 * structures
304 */
253 if (alpha2[0] == '9' && alpha2[1] == '8') 305 if (alpha2[0] == '9' && alpha2[1] == '8')
254 return true; 306 return true;
255 return false; 307 return false;
@@ -274,8 +326,10 @@ static bool alpha2_equal(const char *alpha2_x, const char *alpha2_y)
274 return false; 326 return false;
275} 327}
276 328
277static bool regdom_changed(const char *alpha2) 329static bool regdom_changes(const char *alpha2)
278{ 330{
331 assert_cfg80211_lock();
332
279 if (!cfg80211_regdomain) 333 if (!cfg80211_regdomain)
280 return true; 334 return true;
281 if (alpha2_equal(cfg80211_regdomain->alpha2, alpha2)) 335 if (alpha2_equal(cfg80211_regdomain->alpha2, alpha2))
@@ -302,8 +356,10 @@ static bool country_ie_integrity_changes(u32 checksum)
302 return false; 356 return false;
303} 357}
304 358
305/* This lets us keep regulatory code which is updated on a regulatory 359/*
306 * basis in userspace. */ 360 * This lets us keep regulatory code which is updated on a regulatory
361 * basis in userspace.
362 */
307static int call_crda(const char *alpha2) 363static int call_crda(const char *alpha2)
308{ 364{
309 char country_env[9 + 2] = "COUNTRY="; 365 char country_env[9 + 2] = "COUNTRY=";
@@ -348,7 +404,8 @@ static bool is_valid_reg_rule(const struct ieee80211_reg_rule *rule)
348 404
349 freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz; 405 freq_diff = freq_range->end_freq_khz - freq_range->start_freq_khz;
350 406
351 if (freq_diff <= 0 || freq_range->max_bandwidth_khz > freq_diff) 407 if (freq_range->end_freq_khz <= freq_range->start_freq_khz ||
408 freq_range->max_bandwidth_khz > freq_diff)
352 return false; 409 return false;
353 410
354 return true; 411 return true;
@@ -414,10 +471,12 @@ static bool freq_in_rule_band(const struct ieee80211_freq_range *freq_range,
414#undef ONE_GHZ_IN_KHZ 471#undef ONE_GHZ_IN_KHZ
415} 472}
416 473
417/* Converts a country IE to a regulatory domain. A regulatory domain 474/*
475 * Converts a country IE to a regulatory domain. A regulatory domain
418 * structure has a lot of information which the IE doesn't yet have, 476 * structure has a lot of information which the IE doesn't yet have,
419 * so for the other values we use upper max values as we will intersect 477 * so for the other values we use upper max values as we will intersect
420 * with our userspace regulatory agent to get lower bounds. */ 478 * with our userspace regulatory agent to get lower bounds.
479 */
421static struct ieee80211_regdomain *country_ie_2_rd( 480static struct ieee80211_regdomain *country_ie_2_rd(
422 u8 *country_ie, 481 u8 *country_ie,
423 u8 country_ie_len, 482 u8 country_ie_len,
@@ -462,9 +521,11 @@ static struct ieee80211_regdomain *country_ie_2_rd(
462 521
463 *checksum ^= ((flags ^ alpha2[0] ^ alpha2[1]) << 8); 522 *checksum ^= ((flags ^ alpha2[0] ^ alpha2[1]) << 8);
464 523
465 /* We need to build a reg rule for each triplet, but first we must 524 /*
525 * We need to build a reg rule for each triplet, but first we must
466 * calculate the number of reg rules we will need. We will need one 526 * calculate the number of reg rules we will need. We will need one
467 * for each channel subband */ 527 * for each channel subband
528 */
468 while (country_ie_len >= 3) { 529 while (country_ie_len >= 3) {
469 int end_channel = 0; 530 int end_channel = 0;
470 struct ieee80211_country_ie_triplet *triplet = 531 struct ieee80211_country_ie_triplet *triplet =
@@ -502,9 +563,11 @@ static struct ieee80211_regdomain *country_ie_2_rd(
502 if (cur_sub_max_channel < cur_channel) 563 if (cur_sub_max_channel < cur_channel)
503 return NULL; 564 return NULL;
504 565
505 /* Do not allow overlapping channels. Also channels 566 /*
567 * Do not allow overlapping channels. Also channels
506 * passed in each subband must be monotonically 568 * passed in each subband must be monotonically
507 * increasing */ 569 * increasing
570 */
508 if (last_sub_max_channel) { 571 if (last_sub_max_channel) {
509 if (cur_channel <= last_sub_max_channel) 572 if (cur_channel <= last_sub_max_channel)
510 return NULL; 573 return NULL;
@@ -512,10 +575,12 @@ static struct ieee80211_regdomain *country_ie_2_rd(
512 return NULL; 575 return NULL;
513 } 576 }
514 577
515 /* When dot11RegulatoryClassesRequired is supported 578 /*
579 * When dot11RegulatoryClassesRequired is supported
516 * we can throw ext triplets as part of this soup, 580 * we can throw ext triplets as part of this soup,
517 * for now we don't care when those change as we 581 * for now we don't care when those change as we
518 * don't support them */ 582 * don't support them
583 */
519 *checksum ^= ((cur_channel ^ cur_sub_max_channel) << 8) | 584 *checksum ^= ((cur_channel ^ cur_sub_max_channel) << 8) |
520 ((cur_sub_max_channel ^ cur_sub_max_channel) << 16) | 585 ((cur_sub_max_channel ^ cur_sub_max_channel) << 16) |
521 ((triplet->chans.max_power ^ cur_sub_max_channel) << 24); 586 ((triplet->chans.max_power ^ cur_sub_max_channel) << 24);
@@ -526,8 +591,10 @@ static struct ieee80211_regdomain *country_ie_2_rd(
526 country_ie_len -= 3; 591 country_ie_len -= 3;
527 num_rules++; 592 num_rules++;
528 593
529 /* Note: this is not a IEEE requirement but 594 /*
530 * simply a memory requirement */ 595 * Note: this is not a IEEE requirement but
596 * simply a memory requirement
597 */
531 if (num_rules > NL80211_MAX_SUPP_REG_RULES) 598 if (num_rules > NL80211_MAX_SUPP_REG_RULES)
532 return NULL; 599 return NULL;
533 } 600 }
@@ -555,8 +622,10 @@ static struct ieee80211_regdomain *country_ie_2_rd(
555 struct ieee80211_freq_range *freq_range = NULL; 622 struct ieee80211_freq_range *freq_range = NULL;
556 struct ieee80211_power_rule *power_rule = NULL; 623 struct ieee80211_power_rule *power_rule = NULL;
557 624
558 /* Must parse if dot11RegulatoryClassesRequired is true, 625 /*
559 * we don't support this yet */ 626 * Must parse if dot11RegulatoryClassesRequired is true,
627 * we don't support this yet
628 */
560 if (triplet->ext.reg_extension_id >= 629 if (triplet->ext.reg_extension_id >=
561 IEEE80211_COUNTRY_EXTENSION_ID) { 630 IEEE80211_COUNTRY_EXTENSION_ID) {
562 country_ie += 3; 631 country_ie += 3;
@@ -578,10 +647,12 @@ static struct ieee80211_regdomain *country_ie_2_rd(
578 end_channel = triplet->chans.first_channel + 647 end_channel = triplet->chans.first_channel +
579 (4 * (triplet->chans.num_channels - 1)); 648 (4 * (triplet->chans.num_channels - 1));
580 649
581 /* The +10 is since the regulatory domain expects 650 /*
651 * The +10 is since the regulatory domain expects
582 * the actual band edge, not the center of freq for 652 * the actual band edge, not the center of freq for
583 * its start and end freqs, assuming 20 MHz bandwidth on 653 * its start and end freqs, assuming 20 MHz bandwidth on
584 * the channels passed */ 654 * the channels passed
655 */
585 freq_range->start_freq_khz = 656 freq_range->start_freq_khz =
586 MHZ_TO_KHZ(ieee80211_channel_to_frequency( 657 MHZ_TO_KHZ(ieee80211_channel_to_frequency(
587 triplet->chans.first_channel) - 10); 658 triplet->chans.first_channel) - 10);
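The -10/+10 above turns channel centre frequencies into band edges: a 2.4 GHz triplet starting at channel 1 with 11 channels runs from 2412 MHz to 2462 MHz, so the resulting rule spans 2402000-2472000 kHz. A standalone sketch of that arithmetic, using the 2.4 GHz-only relation freq = 2407 + 5 * chan (the real ieee80211_channel_to_frequency() also handles channel 14 and the 5 GHz band):

#include <stdio.h>

#define MHZ_TO_KHZ(freq) ((freq) * 1000)

/* channels 1..13 only; channel 14 (2484 MHz) and 5 GHz are special cases */
static int chan_to_freq_mhz(int chan)
{
	return 2407 + 5 * chan;
}

int main(void)
{
	int first_channel = 1, num_channels = 11;
	int end_channel = first_channel + (num_channels - 1);

	/* band edge = centre frequency -/+ half of a 20 MHz channel */
	int start_khz = MHZ_TO_KHZ(chan_to_freq_mhz(first_channel) - 10);
	int end_khz   = MHZ_TO_KHZ(chan_to_freq_mhz(end_channel) + 10);

	printf("rule: %d kHz - %d kHz\n", start_khz, end_khz);	/* 2402000 - 2472000 */
	return 0;
}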
@@ -589,9 +660,11 @@ static struct ieee80211_regdomain *country_ie_2_rd(
589 MHZ_TO_KHZ(ieee80211_channel_to_frequency( 660 MHZ_TO_KHZ(ieee80211_channel_to_frequency(
590 end_channel) + 10); 661 end_channel) + 10);
591 662
592 /* Large arbitrary values, we intersect later */ 663 /*
593 /* Increment this if we ever support >= 40 MHz channels 664 * These are large arbitrary values we use to intersect later.
594 * in IEEE 802.11 */ 665 * Increment this if we ever support >= 40 MHz channels
666 * in IEEE 802.11
667 */
595 freq_range->max_bandwidth_khz = MHZ_TO_KHZ(40); 668 freq_range->max_bandwidth_khz = MHZ_TO_KHZ(40);
596 power_rule->max_antenna_gain = DBI_TO_MBI(100); 669 power_rule->max_antenna_gain = DBI_TO_MBI(100);
597 power_rule->max_eirp = DBM_TO_MBM(100); 670 power_rule->max_eirp = DBM_TO_MBM(100);
@@ -607,8 +680,10 @@ static struct ieee80211_regdomain *country_ie_2_rd(
607} 680}
608 681
609 682
610/* Helper for regdom_intersect(), this does the real 683/*
611 * mathematical intersection fun */ 684 * Helper for regdom_intersect(), this does the real
685 * mathematical intersection fun
686 */
612static int reg_rules_intersect( 687static int reg_rules_intersect(
613 const struct ieee80211_reg_rule *rule1, 688 const struct ieee80211_reg_rule *rule1,
614 const struct ieee80211_reg_rule *rule2, 689 const struct ieee80211_reg_rule *rule2,
@@ -686,11 +761,13 @@ static struct ieee80211_regdomain *regdom_intersect(
686 if (!rd1 || !rd2) 761 if (!rd1 || !rd2)
687 return NULL; 762 return NULL;
688 763
689 /* First we get a count of the rules we'll need, then we actually 764 /*
765 * First we get a count of the rules we'll need, then we actually
 690 * build them. This is so we can malloc() and free() a 766 * build them. This is so we can malloc() and free() a
691 * regdomain once. The reason we use reg_rules_intersect() here 767 * regdomain once. The reason we use reg_rules_intersect() here
692 * is it will return -EINVAL if the rule computed makes no sense. 768 * is it will return -EINVAL if the rule computed makes no sense.
693 * All rules that do check out OK are valid. */ 769 * All rules that do check out OK are valid.
770 */
694 771
695 for (x = 0; x < rd1->n_reg_rules; x++) { 772 for (x = 0; x < rd1->n_reg_rules; x++) {
696 rule1 = &rd1->reg_rules[x]; 773 rule1 = &rd1->reg_rules[x];
@@ -718,14 +795,18 @@ static struct ieee80211_regdomain *regdom_intersect(
718 rule1 = &rd1->reg_rules[x]; 795 rule1 = &rd1->reg_rules[x];
719 for (y = 0; y < rd2->n_reg_rules; y++) { 796 for (y = 0; y < rd2->n_reg_rules; y++) {
720 rule2 = &rd2->reg_rules[y]; 797 rule2 = &rd2->reg_rules[y];
721 /* This time around instead of using the stack lets 798 /*
799 * This time around instead of using the stack lets
722 * write to the target rule directly saving ourselves 800 * write to the target rule directly saving ourselves
723 * a memcpy() */ 801 * a memcpy()
802 */
724 intersected_rule = &rd->reg_rules[rule_idx]; 803 intersected_rule = &rd->reg_rules[rule_idx];
725 r = reg_rules_intersect(rule1, rule2, 804 r = reg_rules_intersect(rule1, rule2,
726 intersected_rule); 805 intersected_rule);
 727 /* No need to memset the intersected rule here as 806 /*
 728 * we're not using the stack anymore */ 807 * No need to memset the intersected rule here as
808 * we're not using the stack anymore
809 */
729 if (r) 810 if (r)
730 continue; 811 continue;
731 rule_idx++; 812 rule_idx++;
@@ -744,8 +825,10 @@ static struct ieee80211_regdomain *regdom_intersect(
744 return rd; 825 return rd;
745} 826}
746 827
747/* XXX: add support for the rest of enum nl80211_reg_rule_flags, we may 828/*
748 * want to just have the channel structure use these */ 829 * XXX: add support for the rest of enum nl80211_reg_rule_flags, we may
830 * want to just have the channel structure use these
831 */
749static u32 map_regdom_flags(u32 rd_flags) 832static u32 map_regdom_flags(u32 rd_flags)
750{ 833{
751 u32 channel_flags = 0; 834 u32 channel_flags = 0;
@@ -771,10 +854,12 @@ static int freq_reg_info_regd(struct wiphy *wiphy,
771 854
772 regd = custom_regd ? custom_regd : cfg80211_regdomain; 855 regd = custom_regd ? custom_regd : cfg80211_regdomain;
773 856
774 /* Follow the driver's regulatory domain, if present, unless a country 857 /*
 775 * IE has been processed or a user wants to help compliance further */ 858 * Follow the driver's regulatory domain, if present, unless a country
 776 if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE && 859 * IE has been processed or a user wants to help compliance further
777 last_request->initiator != REGDOM_SET_BY_USER && 860 */
861 if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
862 last_request->initiator != NL80211_REGDOM_SET_BY_USER &&
778 wiphy->regd) 863 wiphy->regd)
779 regd = wiphy->regd; 864 regd = wiphy->regd;
780 865
@@ -790,9 +875,11 @@ static int freq_reg_info_regd(struct wiphy *wiphy,
790 fr = &rr->freq_range; 875 fr = &rr->freq_range;
791 pr = &rr->power_rule; 876 pr = &rr->power_rule;
792 877
793 /* We only need to know if one frequency rule was 878 /*
879 * We only need to know if one frequency rule was
 794 * in center_freq's band, that's enough, so let's 880 * in center_freq's band, that's enough, so let's
795 * not overwrite it once found */ 881 * not overwrite it once found
882 */
796 if (!band_rule_found) 883 if (!band_rule_found)
797 band_rule_found = freq_in_rule_band(fr, center_freq); 884 band_rule_found = freq_in_rule_band(fr, center_freq);
798 885
@@ -829,6 +916,11 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
829 const struct ieee80211_power_rule *power_rule = NULL; 916 const struct ieee80211_power_rule *power_rule = NULL;
830 struct ieee80211_supported_band *sband; 917 struct ieee80211_supported_band *sband;
831 struct ieee80211_channel *chan; 918 struct ieee80211_channel *chan;
919 struct wiphy *request_wiphy = NULL;
920
921 assert_cfg80211_lock();
922
923 request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
832 924
833 sband = wiphy->bands[band]; 925 sband = wiphy->bands[band];
834 BUG_ON(chan_idx >= sband->n_channels); 926 BUG_ON(chan_idx >= sband->n_channels);
@@ -840,7 +932,8 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
840 &max_bandwidth, &reg_rule); 932 &max_bandwidth, &reg_rule);
841 933
842 if (r) { 934 if (r) {
843 /* This means no regulatory rule was found in the country IE 935 /*
936 * This means no regulatory rule was found in the country IE
844 * with a frequency range on the center_freq's band, since 937 * with a frequency range on the center_freq's band, since
845 * IEEE-802.11 allows for a country IE to have a subset of the 938 * IEEE-802.11 allows for a country IE to have a subset of the
846 * regulatory information provided in a country we ignore 939 * regulatory information provided in a country we ignore
@@ -851,7 +944,8 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
851 * http://tinyurl.com/11d-clarification 944 * http://tinyurl.com/11d-clarification
852 */ 945 */
853 if (r == -ERANGE && 946 if (r == -ERANGE &&
854 last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) { 947 last_request->initiator ==
948 NL80211_REGDOM_SET_BY_COUNTRY_IE) {
855#ifdef CONFIG_CFG80211_REG_DEBUG 949#ifdef CONFIG_CFG80211_REG_DEBUG
856 printk(KERN_DEBUG "cfg80211: Leaving channel %d MHz " 950 printk(KERN_DEBUG "cfg80211: Leaving channel %d MHz "
857 "intact on %s - no rule found in band on " 951 "intact on %s - no rule found in band on "
@@ -859,10 +953,13 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
859 chan->center_freq, wiphy_name(wiphy)); 953 chan->center_freq, wiphy_name(wiphy));
860#endif 954#endif
861 } else { 955 } else {
862 /* In this case we know the country IE has at least one reg rule 956 /*
863 * for the band so we respect its band definitions */ 957 * In this case we know the country IE has at least one reg rule
958 * for the band so we respect its band definitions
959 */
864#ifdef CONFIG_CFG80211_REG_DEBUG 960#ifdef CONFIG_CFG80211_REG_DEBUG
865 if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) 961 if (last_request->initiator ==
962 NL80211_REGDOM_SET_BY_COUNTRY_IE)
866 printk(KERN_DEBUG "cfg80211: Disabling " 963 printk(KERN_DEBUG "cfg80211: Disabling "
867 "channel %d MHz on %s due to " 964 "channel %d MHz on %s due to "
868 "Country IE\n", 965 "Country IE\n",
@@ -876,12 +973,14 @@ static void handle_channel(struct wiphy *wiphy, enum ieee80211_band band,
876 973
877 power_rule = &reg_rule->power_rule; 974 power_rule = &reg_rule->power_rule;
878 975
879 if (last_request->initiator == REGDOM_SET_BY_DRIVER && 976 if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
880 last_request->wiphy && last_request->wiphy == wiphy && 977 request_wiphy && request_wiphy == wiphy &&
881 last_request->wiphy->strict_regulatory) { 978 request_wiphy->strict_regulatory) {
 882 /* This guarantees the driver's requested regulatory domain 979 /*
 883 * will always be used as a base for further regulatory 980 * This guarantees the driver's requested regulatory domain
883 * will always be used as a base for further regulatory 981 * will always be used as a base for further regulatory
884 * settings */ 982 * settings
983 */
885 chan->flags = chan->orig_flags = 984 chan->flags = chan->orig_flags =
886 map_regdom_flags(reg_rule->flags); 985 map_regdom_flags(reg_rule->flags);
887 chan->max_antenna_gain = chan->orig_mag = 986 chan->max_antenna_gain = chan->orig_mag =
@@ -915,39 +1014,147 @@ static void handle_band(struct wiphy *wiphy, enum ieee80211_band band)
915 handle_channel(wiphy, band, i); 1014 handle_channel(wiphy, band, i);
916} 1015}
917 1016
918static bool ignore_reg_update(struct wiphy *wiphy, enum reg_set_by setby) 1017static bool ignore_reg_update(struct wiphy *wiphy,
1018 enum nl80211_reg_initiator initiator)
919{ 1019{
920 if (!last_request) 1020 if (!last_request)
921 return true; 1021 return true;
922 if (setby == REGDOM_SET_BY_CORE && 1022 if (initiator == NL80211_REGDOM_SET_BY_CORE &&
923 wiphy->custom_regulatory) 1023 wiphy->custom_regulatory)
924 return true; 1024 return true;
925 /* wiphy->regd will be set once the device has its own 1025 /*
926 * desired regulatory domain set */ 1026 * wiphy->regd will be set once the device has its own
1027 * desired regulatory domain set
1028 */
927 if (wiphy->strict_regulatory && !wiphy->regd && 1029 if (wiphy->strict_regulatory && !wiphy->regd &&
928 !is_world_regdom(last_request->alpha2)) 1030 !is_world_regdom(last_request->alpha2))
929 return true; 1031 return true;
930 return false; 1032 return false;
931} 1033}
932 1034
933static void update_all_wiphy_regulatory(enum reg_set_by setby) 1035static void update_all_wiphy_regulatory(enum nl80211_reg_initiator initiator)
934{ 1036{
935 struct cfg80211_registered_device *drv; 1037 struct cfg80211_registered_device *drv;
936 1038
937 list_for_each_entry(drv, &cfg80211_drv_list, list) 1039 list_for_each_entry(drv, &cfg80211_drv_list, list)
938 wiphy_update_regulatory(&drv->wiphy, setby); 1040 wiphy_update_regulatory(&drv->wiphy, initiator);
939} 1041}
940 1042
941void wiphy_update_regulatory(struct wiphy *wiphy, enum reg_set_by setby) 1043static void handle_reg_beacon(struct wiphy *wiphy,
1044 unsigned int chan_idx,
1045 struct reg_beacon *reg_beacon)
942{ 1046{
943 enum ieee80211_band band; 1047#ifdef CONFIG_CFG80211_REG_DEBUG
1048#define REG_DEBUG_BEACON_FLAG(desc) \
1049 printk(KERN_DEBUG "cfg80211: Enabling " desc " on " \
1050 "frequency: %d MHz (Ch %d) on %s\n", \
1051 reg_beacon->chan.center_freq, \
1052 ieee80211_frequency_to_channel(reg_beacon->chan.center_freq), \
1053 wiphy_name(wiphy));
1054#else
1055#define REG_DEBUG_BEACON_FLAG(desc) do {} while (0)
1056#endif
1057 struct ieee80211_supported_band *sband;
1058 struct ieee80211_channel *chan;
1059
1060 assert_cfg80211_lock();
1061
1062 sband = wiphy->bands[reg_beacon->chan.band];
1063 chan = &sband->channels[chan_idx];
1064
1065 if (likely(chan->center_freq != reg_beacon->chan.center_freq))
1066 return;
1067
1068 if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) {
1069 chan->flags &= ~IEEE80211_CHAN_PASSIVE_SCAN;
1070 REG_DEBUG_BEACON_FLAG("active scanning");
1071 }
1072
1073 if (chan->flags & IEEE80211_CHAN_NO_IBSS) {
1074 chan->flags &= ~IEEE80211_CHAN_NO_IBSS;
1075 REG_DEBUG_BEACON_FLAG("beaconing");
1076 }
1077
1078 chan->beacon_found = true;
1079#undef REG_DEBUG_BEACON_FLAG
1080}
1081
1082/*
1083 * Called when a scan on a wiphy finds a beacon on
1084 * new channel
1085 */
1086static void wiphy_update_new_beacon(struct wiphy *wiphy,
1087 struct reg_beacon *reg_beacon)
1088{
1089 unsigned int i;
1090 struct ieee80211_supported_band *sband;
1091
1092 assert_cfg80211_lock();
944 1093
945 if (ignore_reg_update(wiphy, setby)) 1094 if (!wiphy->bands[reg_beacon->chan.band])
946 return; 1095 return;
1096
1097 sband = wiphy->bands[reg_beacon->chan.band];
1098
1099 for (i = 0; i < sband->n_channels; i++)
1100 handle_reg_beacon(wiphy, i, reg_beacon);
1101}
1102
1103/*
1104 * Called upon reg changes or a new wiphy is added
1105 */
1106static void wiphy_update_beacon_reg(struct wiphy *wiphy)
1107{
1108 unsigned int i;
1109 struct ieee80211_supported_band *sband;
1110 struct reg_beacon *reg_beacon;
1111
1112 assert_cfg80211_lock();
1113
1114 if (list_empty(&reg_beacon_list))
1115 return;
1116
1117 list_for_each_entry(reg_beacon, &reg_beacon_list, list) {
1118 if (!wiphy->bands[reg_beacon->chan.band])
1119 continue;
1120 sband = wiphy->bands[reg_beacon->chan.band];
1121 for (i = 0; i < sband->n_channels; i++)
1122 handle_reg_beacon(wiphy, i, reg_beacon);
1123 }
1124}
1125
1126static bool reg_is_world_roaming(struct wiphy *wiphy)
1127{
1128 if (is_world_regdom(cfg80211_regdomain->alpha2) ||
1129 (wiphy->regd && is_world_regdom(wiphy->regd->alpha2)))
1130 return true;
1131 if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
1132 wiphy->custom_regulatory)
1133 return true;
1134 return false;
1135}
1136
1137/* Reap the advantages of previously found beacons */
1138static void reg_process_beacons(struct wiphy *wiphy)
1139{
1140 if (!reg_is_world_roaming(wiphy))
1141 return;
1142 wiphy_update_beacon_reg(wiphy);
1143}
1144
1145void wiphy_update_regulatory(struct wiphy *wiphy,
1146 enum nl80211_reg_initiator initiator)
1147{
1148 enum ieee80211_band band;
1149
1150 if (ignore_reg_update(wiphy, initiator))
1151 goto out;
947 for (band = 0; band < IEEE80211_NUM_BANDS; band++) { 1152 for (band = 0; band < IEEE80211_NUM_BANDS; band++) {
948 if (wiphy->bands[band]) 1153 if (wiphy->bands[band])
949 handle_band(wiphy, band); 1154 handle_band(wiphy, band);
950 } 1155 }
1156out:
1157 reg_process_beacons(wiphy);
951 if (wiphy->reg_notifier) 1158 if (wiphy->reg_notifier)
952 wiphy->reg_notifier(wiphy, last_request); 1159 wiphy->reg_notifier(wiphy, last_request);
953} 1160}
@@ -1033,81 +1240,98 @@ static int reg_copy_regd(const struct ieee80211_regdomain **dst_regd,
1033 return 0; 1240 return 0;
1034} 1241}
1035 1242
1036/* Return value which can be used by ignore_request() to indicate 1243/*
1037 * it has been determined we should intersect two regulatory domains */ 1244 * Return value which can be used by ignore_request() to indicate
1245 * it has been determined we should intersect two regulatory domains
1246 */
1038#define REG_INTERSECT 1 1247#define REG_INTERSECT 1
1039 1248
1040/* This has the logic which determines when a new request 1249/* This has the logic which determines when a new request
1041 * should be ignored. */ 1250 * should be ignored. */
1042static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by, 1251static int ignore_request(struct wiphy *wiphy,
1043 const char *alpha2) 1252 struct regulatory_request *pending_request)
1044{ 1253{
1254 struct wiphy *last_wiphy = NULL;
1255
1256 assert_cfg80211_lock();
1257
1045 /* All initial requests are respected */ 1258 /* All initial requests are respected */
1046 if (!last_request) 1259 if (!last_request)
1047 return 0; 1260 return 0;
1048 1261
1049 switch (set_by) { 1262 switch (pending_request->initiator) {
1050 case REGDOM_SET_BY_INIT: 1263 case NL80211_REGDOM_SET_BY_CORE:
1051 return -EINVAL; 1264 return -EINVAL;
1052 case REGDOM_SET_BY_CORE: 1265 case NL80211_REGDOM_SET_BY_COUNTRY_IE:
1053 /* 1266
1054 * Always respect new wireless core hints, should only happen 1267 last_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
1055 * when updating the world regulatory domain at init. 1268
1056 */ 1269 if (unlikely(!is_an_alpha2(pending_request->alpha2)))
1057 return 0;
1058 case REGDOM_SET_BY_COUNTRY_IE:
1059 if (unlikely(!is_an_alpha2(alpha2)))
1060 return -EINVAL; 1270 return -EINVAL;
1061 if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) { 1271 if (last_request->initiator ==
1062 if (last_request->wiphy != wiphy) { 1272 NL80211_REGDOM_SET_BY_COUNTRY_IE) {
1273 if (last_wiphy != wiphy) {
1063 /* 1274 /*
1064 * Two cards with two APs claiming different 1275 * Two cards with two APs claiming different
1065 * different Country IE alpha2s. We could 1276 * different Country IE alpha2s. We could
1066 * intersect them, but that seems unlikely 1277 * intersect them, but that seems unlikely
1067 * to be correct. Reject second one for now. 1278 * to be correct. Reject second one for now.
1068 */ 1279 */
1069 if (!alpha2_equal(alpha2, 1280 if (regdom_changes(pending_request->alpha2))
1070 cfg80211_regdomain->alpha2))
1071 return -EOPNOTSUPP; 1281 return -EOPNOTSUPP;
1072 return -EALREADY; 1282 return -EALREADY;
1073 } 1283 }
1074 /* Two consecutive Country IE hints on the same wiphy. 1284 /*
1075 * This should be picked up early by the driver/stack */ 1285 * Two consecutive Country IE hints on the same wiphy.
1076 if (WARN_ON(!alpha2_equal(cfg80211_regdomain->alpha2, 1286 * This should be picked up early by the driver/stack
1077 alpha2))) 1287 */
1288 if (WARN_ON(regdom_changes(pending_request->alpha2)))
1078 return 0; 1289 return 0;
1079 return -EALREADY; 1290 return -EALREADY;
1080 } 1291 }
1081 return REG_INTERSECT; 1292 return REG_INTERSECT;
1082 case REGDOM_SET_BY_DRIVER: 1293 case NL80211_REGDOM_SET_BY_DRIVER:
1083 if (last_request->initiator == REGDOM_SET_BY_CORE) { 1294 if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE) {
1084 if (is_old_static_regdom(cfg80211_regdomain)) 1295 if (is_old_static_regdom(cfg80211_regdomain))
1085 return 0; 1296 return 0;
1086 if (!alpha2_equal(cfg80211_regdomain->alpha2, alpha2)) 1297 if (regdom_changes(pending_request->alpha2))
1087 return 0; 1298 return 0;
1088 return -EALREADY; 1299 return -EALREADY;
1089 } 1300 }
1301
1302 /*
1303 * This would happen if you unplug and plug your card
1304 * back in or if you add a new device for which the previously
1305 * loaded card also agrees on the regulatory domain.
1306 */
1307 if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
1308 !regdom_changes(pending_request->alpha2))
1309 return -EALREADY;
1310
1090 return REG_INTERSECT; 1311 return REG_INTERSECT;
1091 case REGDOM_SET_BY_USER: 1312 case NL80211_REGDOM_SET_BY_USER:
1092 if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) 1313 if (last_request->initiator == NL80211_REGDOM_SET_BY_COUNTRY_IE)
1093 return REG_INTERSECT; 1314 return REG_INTERSECT;
1094 /* If the user knows better the user should set the regdom 1315 /*
1095 * to their country before the IE is picked up */ 1316 * If the user knows better the user should set the regdom
1096 if (last_request->initiator == REGDOM_SET_BY_USER && 1317 * to their country before the IE is picked up
1318 */
1319 if (last_request->initiator == NL80211_REGDOM_SET_BY_USER &&
1097 last_request->intersect) 1320 last_request->intersect)
1098 return -EOPNOTSUPP; 1321 return -EOPNOTSUPP;
1099 /* Process user requests only after previous user/driver/core 1322 /*
1100 * requests have been processed */ 1323 * Process user requests only after previous user/driver/core
1101 if (last_request->initiator == REGDOM_SET_BY_CORE || 1324 * requests have been processed
1102 last_request->initiator == REGDOM_SET_BY_DRIVER || 1325 */
1103 last_request->initiator == REGDOM_SET_BY_USER) { 1326 if (last_request->initiator == NL80211_REGDOM_SET_BY_CORE ||
1104 if (!alpha2_equal(last_request->alpha2, 1327 last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER ||
1105 cfg80211_regdomain->alpha2)) 1328 last_request->initiator == NL80211_REGDOM_SET_BY_USER) {
1329 if (regdom_changes(last_request->alpha2))
1106 return -EAGAIN; 1330 return -EAGAIN;
1107 } 1331 }
1108 1332
1109 if (!is_old_static_regdom(cfg80211_regdomain) && 1333 if (!is_old_static_regdom(cfg80211_regdomain) &&
1110 alpha2_equal(cfg80211_regdomain->alpha2, alpha2)) 1334 !regdom_changes(pending_request->alpha2))
1111 return -EALREADY; 1335 return -EALREADY;
1112 1336
1113 return 0; 1337 return 0;
@@ -1116,59 +1340,80 @@ static int ignore_request(struct wiphy *wiphy, enum reg_set_by set_by,
1116 return -EINVAL; 1340 return -EINVAL;
1117} 1341}
1118 1342
1119/* Caller must hold &cfg80211_drv_mutex */ 1343/**
1120int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, 1344 * __regulatory_hint - hint to the wireless core a regulatory domain
1121 const char *alpha2, 1345 * @wiphy: if the hint comes from country information from an AP, this
1122 u32 country_ie_checksum, 1346 * is required to be set to the wiphy that received the information
1123 enum environment_cap env) 1347 * @pending_request: the regulatory request currently being processed
1348 *
1349 * The Wireless subsystem can use this function to hint to the wireless core
1350 * what it believes should be the current regulatory domain.
1351 *
1352 * Returns zero if all went fine, %-EALREADY if a regulatory domain had
1353 * already been set or other standard error codes.
1354 *
1355 * Caller must hold &cfg80211_mutex
1356 */
1357static int __regulatory_hint(struct wiphy *wiphy,
1358 struct regulatory_request *pending_request)
1124{ 1359{
1125 struct regulatory_request *request;
1126 bool intersect = false; 1360 bool intersect = false;
1127 int r = 0; 1361 int r = 0;
1128 1362
1129 r = ignore_request(wiphy, set_by, alpha2); 1363 assert_cfg80211_lock();
1364
1365 r = ignore_request(wiphy, pending_request);
1130 1366
1131 if (r == REG_INTERSECT) { 1367 if (r == REG_INTERSECT) {
1132 if (set_by == REGDOM_SET_BY_DRIVER) { 1368 if (pending_request->initiator ==
1369 NL80211_REGDOM_SET_BY_DRIVER) {
1133 r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain); 1370 r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain);
1134 if (r) 1371 if (r) {
1372 kfree(pending_request);
1135 return r; 1373 return r;
1374 }
1136 } 1375 }
1137 intersect = true; 1376 intersect = true;
1138 } else if (r) { 1377 } else if (r) {
1139 /* If the regulatory domain being requested by the 1378 /*
1379 * If the regulatory domain being requested by the
1140 * driver has already been set just copy it to the 1380 * driver has already been set just copy it to the
1141 * wiphy */ 1381 * wiphy
1142 if (r == -EALREADY && set_by == REGDOM_SET_BY_DRIVER) { 1382 */
1383 if (r == -EALREADY &&
1384 pending_request->initiator ==
1385 NL80211_REGDOM_SET_BY_DRIVER) {
1143 r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain); 1386 r = reg_copy_regd(&wiphy->regd, cfg80211_regdomain);
1144 if (r) 1387 if (r) {
1388 kfree(pending_request);
1145 return r; 1389 return r;
1390 }
1146 r = -EALREADY; 1391 r = -EALREADY;
1147 goto new_request; 1392 goto new_request;
1148 } 1393 }
1394 kfree(pending_request);
1149 return r; 1395 return r;
1150 } 1396 }
1151 1397
1152new_request: 1398new_request:
1153 request = kzalloc(sizeof(struct regulatory_request), 1399 kfree(last_request);
1154 GFP_KERNEL);
1155 if (!request)
1156 return -ENOMEM;
1157 1400
1158 request->alpha2[0] = alpha2[0]; 1401 last_request = pending_request;
1159 request->alpha2[1] = alpha2[1]; 1402 last_request->intersect = intersect;
1160 request->initiator = set_by;
1161 request->wiphy = wiphy;
1162 request->intersect = intersect;
1163 request->country_ie_checksum = country_ie_checksum;
1164 request->country_ie_env = env;
1165 1403
1166 kfree(last_request); 1404 pending_request = NULL;
1167 last_request = request;
1168 1405
1169 /* When r == REG_INTERSECT we do need to call CRDA */ 1406 /* When r == REG_INTERSECT we do need to call CRDA */
1170 if (r < 0) 1407 if (r < 0) {
1408 /*
1409 * Since CRDA will not be called in this case as we already
 1410 * have applied the requested regulatory domain before, we just
1411 * inform userspace we have processed the request
1412 */
1413 if (r == -EALREADY)
1414 nl80211_send_reg_change_event(last_request);
1171 return r; 1415 return r;
1416 }
1172 1417
1173 /* 1418 /*
1174 * Note: When CONFIG_WIRELESS_OLD_REGULATORY is enabled 1419 * Note: When CONFIG_WIRELESS_OLD_REGULATORY is enabled
@@ -1180,34 +1425,194 @@ new_request:
1180 * 1425 *
1181 * to intersect with the static rd 1426 * to intersect with the static rd
1182 */ 1427 */
1183 return call_crda(alpha2); 1428 return call_crda(last_request->alpha2);
1184} 1429}
1185 1430
1186void regulatory_hint(struct wiphy *wiphy, const char *alpha2) 1431/* This currently only processes user and driver regulatory hints */
1432static void reg_process_hint(struct regulatory_request *reg_request)
1187{ 1433{
1188 int r; 1434 int r = 0;
1189 BUG_ON(!alpha2); 1435 struct wiphy *wiphy = NULL;
1436
1437 BUG_ON(!reg_request->alpha2);
1438
1439 mutex_lock(&cfg80211_mutex);
1440
1441 if (wiphy_idx_valid(reg_request->wiphy_idx))
1442 wiphy = wiphy_idx_to_wiphy(reg_request->wiphy_idx);
1443
1444 if (reg_request->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
1445 !wiphy) {
1446 kfree(reg_request);
1447 goto out;
1448 }
1190 1449
1191 mutex_lock(&cfg80211_drv_mutex); 1450 r = __regulatory_hint(wiphy, reg_request);
1192 r = __regulatory_hint(wiphy, REGDOM_SET_BY_DRIVER,
1193 alpha2, 0, ENVIRON_ANY);
1194 /* This is required so that the orig_* parameters are saved */ 1451 /* This is required so that the orig_* parameters are saved */
1195 if (r == -EALREADY && wiphy->strict_regulatory) 1452 if (r == -EALREADY && wiphy && wiphy->strict_regulatory)
1196 wiphy_update_regulatory(wiphy, REGDOM_SET_BY_DRIVER); 1453 wiphy_update_regulatory(wiphy, reg_request->initiator);
1197 mutex_unlock(&cfg80211_drv_mutex); 1454out:
1455 mutex_unlock(&cfg80211_mutex);
1456}
1457
1458/* Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_* */
1459static void reg_process_pending_hints(void)
1460 {
1461 struct regulatory_request *reg_request;
1462
1463 spin_lock(&reg_requests_lock);
1464 while (!list_empty(&reg_requests_list)) {
1465 reg_request = list_first_entry(&reg_requests_list,
1466 struct regulatory_request,
1467 list);
1468 list_del_init(&reg_request->list);
1469
1470 spin_unlock(&reg_requests_lock);
1471 reg_process_hint(reg_request);
1472 spin_lock(&reg_requests_lock);
1473 }
1474 spin_unlock(&reg_requests_lock);
1475}
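The loop above releases reg_requests_lock around each reg_process_hint() call because that handler takes cfg80211_mutex and may sleep, which is not allowed while a spinlock is held. A minimal kernel-style sketch of the same drain pattern, with illustrative names (pending_lock, pending_list, struct item, handle_item) that are not part of cfg80211:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
	struct list_head list;
};

static LIST_HEAD(pending_list);
static DEFINE_SPINLOCK(pending_lock);

/* Stand-in for reg_process_hint(): takes a mutex, may sleep */
static void handle_item(struct item *it)
{
	kfree(it);
}

static void drain_pending(void)
{
	struct item *it;

	spin_lock(&pending_lock);
	while (!list_empty(&pending_list)) {
		it = list_first_entry(&pending_list, struct item, list);
		list_del_init(&it->list);

		/* Drop the spinlock before the sleeping handler runs */
		spin_unlock(&pending_lock);
		handle_item(it);
		spin_lock(&pending_lock);
	}
	spin_unlock(&pending_lock);
}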
1476
1477/* Processes beacon hints -- this has nothing to do with country IEs */
1478static void reg_process_pending_beacon_hints(void)
1479{
1480 struct cfg80211_registered_device *drv;
1481 struct reg_beacon *pending_beacon, *tmp;
1482
1483 mutex_lock(&cfg80211_mutex);
1484
1485 /* This goes through the _pending_ beacon list */
1486 spin_lock_bh(&reg_pending_beacons_lock);
1487
1488 if (list_empty(&reg_pending_beacons)) {
1489 spin_unlock_bh(&reg_pending_beacons_lock);
1490 goto out;
1491 }
1492
1493 list_for_each_entry_safe(pending_beacon, tmp,
1494 &reg_pending_beacons, list) {
1495
1496 list_del_init(&pending_beacon->list);
1497
1498 /* Applies the beacon hint to current wiphys */
1499 list_for_each_entry(drv, &cfg80211_drv_list, list)
1500 wiphy_update_new_beacon(&drv->wiphy, pending_beacon);
1501
1502 /* Remembers the beacon hint for new wiphys or reg changes */
1503 list_add_tail(&pending_beacon->list, &reg_beacon_list);
1504 }
1505
1506 spin_unlock_bh(&reg_pending_beacons_lock);
1507out:
1508 mutex_unlock(&cfg80211_mutex);
1509}
1510
1511static void reg_todo(struct work_struct *work)
1512{
1513 reg_process_pending_hints();
1514 reg_process_pending_beacon_hints();
1515}
1516
1517static DECLARE_WORK(reg_work, reg_todo);
1518
1519static void queue_regulatory_request(struct regulatory_request *request)
1520{
1521 spin_lock(&reg_requests_lock);
1522 list_add_tail(&request->list, &reg_requests_list);
1523 spin_unlock(&reg_requests_lock);
1524
1525 schedule_work(&reg_work);
1526}
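queue_regulatory_request() is the producer half of the scheme: the hint is appended under reg_requests_lock and reg_work is scheduled, so the actual processing (and the CRDA uevent) happens later in process context. A rough sketch of that producer/worker wiring, reusing the illustrative names from the sketch above (my_work and queue_item are likewise not cfg80211 symbols):

#include <linux/workqueue.h>

static void my_work_fn(struct work_struct *work)
{
	drain_pending();
}
static DECLARE_WORK(my_work, my_work_fn);

static int queue_item(void)
{
	struct item *it = kzalloc(sizeof(*it), GFP_KERNEL);

	if (!it)
		return -ENOMEM;

	spin_lock(&pending_lock);
	list_add_tail(&it->list, &pending_list);
	spin_unlock(&pending_lock);

	schedule_work(&my_work);	/* worker runs later, can sleep */
	return 0;
}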
1527
1528/* Core regulatory hint -- happens once during cfg80211_init() */
1529static int regulatory_hint_core(const char *alpha2)
1530{
1531 struct regulatory_request *request;
1532
1533 BUG_ON(last_request);
1534
1535 request = kzalloc(sizeof(struct regulatory_request),
1536 GFP_KERNEL);
1537 if (!request)
1538 return -ENOMEM;
1539
1540 request->alpha2[0] = alpha2[0];
1541 request->alpha2[1] = alpha2[1];
1542 request->initiator = NL80211_REGDOM_SET_BY_CORE;
1543
1544 queue_regulatory_request(request);
1545
1546 return 0;
1547}
1548
1549/* User hints */
1550int regulatory_hint_user(const char *alpha2)
1551{
1552 struct regulatory_request *request;
1553
1554 BUG_ON(!alpha2);
1555
1556 request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
1557 if (!request)
1558 return -ENOMEM;
1559
1560 request->wiphy_idx = WIPHY_IDX_STALE;
1561 request->alpha2[0] = alpha2[0];
1562 request->alpha2[1] = alpha2[1];
 1563 request->initiator = NL80211_REGDOM_SET_BY_USER;
1564
1565 queue_regulatory_request(request);
1566
1567 return 0;
1568}
1569
1570/* Driver hints */
1571int regulatory_hint(struct wiphy *wiphy, const char *alpha2)
1572{
1573 struct regulatory_request *request;
1574
1575 BUG_ON(!alpha2);
1576 BUG_ON(!wiphy);
1577
1578 request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
1579 if (!request)
1580 return -ENOMEM;
1581
1582 request->wiphy_idx = get_wiphy_idx(wiphy);
1583
1584 /* Must have registered wiphy first */
1585 BUG_ON(!wiphy_idx_valid(request->wiphy_idx));
1586
1587 request->alpha2[0] = alpha2[0];
1588 request->alpha2[1] = alpha2[1];
1589 request->initiator = NL80211_REGDOM_SET_BY_DRIVER;
1590
1591 queue_regulatory_request(request);
1592
1593 return 0;
1198} 1594}
1199EXPORT_SYMBOL(regulatory_hint); 1595EXPORT_SYMBOL(regulatory_hint);
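For context, a driver would typically call regulatory_hint() once its wiphy is registered, passing the alpha2 it read from EEPROM/OTP; in this tree the call only allocates and queues the request and returns, and the regulatory change is applied asynchronously by reg_work. An illustrative fragment (mydrv_apply_eeprom_country() and eeprom_read_country() are hypothetical, not from any real driver):

/* Illustrative only: hint the EEPROM-programmed country after
 * wiphy_register(). eeprom_read_country() is a made-up helper.
 */
static int mydrv_apply_eeprom_country(struct wiphy *wiphy)
{
	char alpha2[2];

	eeprom_read_country(alpha2);		/* e.g. "US" */
	return regulatory_hint(wiphy, alpha2);	/* queued, applied later */
}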
1200 1596
1201static bool reg_same_country_ie_hint(struct wiphy *wiphy, 1597static bool reg_same_country_ie_hint(struct wiphy *wiphy,
1202 u32 country_ie_checksum) 1598 u32 country_ie_checksum)
1203{ 1599{
1204 if (!last_request->wiphy) 1600 struct wiphy *request_wiphy;
1601
1602 assert_cfg80211_lock();
1603
1604 request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
1605
1606 if (!request_wiphy)
1205 return false; 1607 return false;
1206 if (likely(last_request->wiphy != wiphy)) 1608
1609 if (likely(request_wiphy != wiphy))
1207 return !country_ie_integrity_changes(country_ie_checksum); 1610 return !country_ie_integrity_changes(country_ie_checksum);
1208 /* We should not have let these through at this point, they 1611 /*
1612 * We should not have let these through at this point, they
1209 * should have been picked up earlier by the first alpha2 check 1613 * should have been picked up earlier by the first alpha2 check
1210 * on the device */ 1614 * on the device
1615 */
1211 if (WARN_ON(!country_ie_integrity_changes(country_ie_checksum))) 1616 if (WARN_ON(!country_ie_integrity_changes(country_ie_checksum)))
1212 return true; 1617 return true;
1213 return false; 1618 return false;
@@ -1221,11 +1626,14 @@ void regulatory_hint_11d(struct wiphy *wiphy,
1221 char alpha2[2]; 1626 char alpha2[2];
1222 u32 checksum = 0; 1627 u32 checksum = 0;
1223 enum environment_cap env = ENVIRON_ANY; 1628 enum environment_cap env = ENVIRON_ANY;
1629 struct regulatory_request *request;
1224 1630
1225 if (!last_request) 1631 mutex_lock(&cfg80211_mutex);
1226 return;
1227 1632
1228 mutex_lock(&cfg80211_drv_mutex); 1633 if (unlikely(!last_request)) {
1634 mutex_unlock(&cfg80211_mutex);
1635 return;
1636 }
1229 1637
1230 /* IE len must be evenly divisible by 2 */ 1638 /* IE len must be evenly divisible by 2 */
1231 if (country_ie_len & 0x01) 1639 if (country_ie_len & 0x01)
@@ -1234,9 +1642,11 @@ void regulatory_hint_11d(struct wiphy *wiphy,
1234 if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN) 1642 if (country_ie_len < IEEE80211_COUNTRY_IE_MIN_LEN)
1235 goto out; 1643 goto out;
1236 1644
1237 /* Pending country IE processing, this can happen after we 1645 /*
1646 * Pending country IE processing, this can happen after we
1238 * call CRDA and wait for a response if a beacon was received before 1647 * call CRDA and wait for a response if a beacon was received before
1239 * we were able to process the last regulatory_hint_11d() call */ 1648 * we were able to process the last regulatory_hint_11d() call
1649 */
1240 if (country_ie_regdomain) 1650 if (country_ie_regdomain)
1241 goto out; 1651 goto out;
1242 1652
@@ -1248,33 +1658,44 @@ void regulatory_hint_11d(struct wiphy *wiphy,
1248 else if (country_ie[2] == 'O') 1658 else if (country_ie[2] == 'O')
1249 env = ENVIRON_OUTDOOR; 1659 env = ENVIRON_OUTDOOR;
1250 1660
1251 /* We will run this for *every* beacon processed for the BSSID, so 1661 /*
1662 * We will run this for *every* beacon processed for the BSSID, so
1252 * we optimize an early check to exit out early if we don't have to 1663 * we optimize an early check to exit out early if we don't have to
1253 * do anything */ 1664 * do anything
1254 if (likely(last_request->wiphy)) { 1665 */
1666 if (likely(wiphy_idx_valid(last_request->wiphy_idx))) {
1255 struct cfg80211_registered_device *drv_last_ie; 1667 struct cfg80211_registered_device *drv_last_ie;
1256 1668
1257 drv_last_ie = wiphy_to_dev(last_request->wiphy); 1669 drv_last_ie =
1670 cfg80211_drv_by_wiphy_idx(last_request->wiphy_idx);
1258 1671
1259 /* Lets keep this simple -- we trust the first AP 1672 /*
1260 * after we intersect with CRDA */ 1673 * Lets keep this simple -- we trust the first AP
1261 if (likely(last_request->wiphy == wiphy)) { 1674 * after we intersect with CRDA
1262 /* Ignore IEs coming in on this wiphy with 1675 */
1263 * the same alpha2 and environment cap */ 1676 if (likely(&drv_last_ie->wiphy == wiphy)) {
1677 /*
1678 * Ignore IEs coming in on this wiphy with
1679 * the same alpha2 and environment cap
1680 */
1264 if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2, 1681 if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2,
1265 alpha2) && 1682 alpha2) &&
1266 env == drv_last_ie->env)) { 1683 env == drv_last_ie->env)) {
1267 goto out; 1684 goto out;
1268 } 1685 }
1269 /* the wiphy moved on to another BSSID or the AP 1686 /*
1687 * the wiphy moved on to another BSSID or the AP
1270 * was reconfigured. XXX: We need to deal with the 1688 * was reconfigured. XXX: We need to deal with the
1271 * case where the user suspends and goes to goes 1689 * case where the user suspends and goes to goes
1272 * to another country, and then gets IEs from an 1690 * to another country, and then gets IEs from an
1273 * AP with different settings */ 1691 * AP with different settings
1692 */
1274 goto out; 1693 goto out;
1275 } else { 1694 } else {
1276 /* Ignore IEs coming in on two separate wiphys with 1695 /*
1277 * the same alpha2 and environment cap */ 1696 * Ignore IEs coming in on two separate wiphys with
1697 * the same alpha2 and environment cap
1698 */
1278 if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2, 1699 if (likely(alpha2_equal(drv_last_ie->country_ie_alpha2,
1279 alpha2) && 1700 alpha2) &&
1280 env == drv_last_ie->env)) { 1701 env == drv_last_ie->env)) {
@@ -1289,28 +1710,97 @@ void regulatory_hint_11d(struct wiphy *wiphy,
1289 if (!rd) 1710 if (!rd)
1290 goto out; 1711 goto out;
1291 1712
1292 /* This will not happen right now but we leave it here for the 1713 /*
 1714 * This will not happen right now but we leave it here for
1293 * the future when we want to add suspend/resume support and having 1715 * the future when we want to add suspend/resume support and having
1294 * the user move to another country after doing so, or having the user 1716 * the user move to another country after doing so, or having the user
1295 * move to another AP. Right now we just trust the first AP. This is why 1717 * move to another AP. Right now we just trust the first AP.
1296 * this is marked as likley(). If we hit this before we add this support 1718 *
1297 * we want to be informed of it as it would indicate a mistake in the 1719 * If we hit this before we add this support we want to be informed of
1298 * current design */ 1720 * it as it would indicate a mistake in the current design
1299 if (likely(WARN_ON(reg_same_country_ie_hint(wiphy, checksum)))) 1721 */
1300 goto out; 1722 if (WARN_ON(reg_same_country_ie_hint(wiphy, checksum)))
1723 goto free_rd_out;
1724
1725 request = kzalloc(sizeof(struct regulatory_request), GFP_KERNEL);
1726 if (!request)
1727 goto free_rd_out;
1301 1728
1302 /* We keep this around for when CRDA comes back with a response so 1729 /*
1303 * we can intersect with that */ 1730 * We keep this around for when CRDA comes back with a response so
1731 * we can intersect with that
1732 */
1304 country_ie_regdomain = rd; 1733 country_ie_regdomain = rd;
1305 1734
1306 __regulatory_hint(wiphy, REGDOM_SET_BY_COUNTRY_IE, 1735 request->wiphy_idx = get_wiphy_idx(wiphy);
1307 country_ie_regdomain->alpha2, checksum, env); 1736 request->alpha2[0] = rd->alpha2[0];
1737 request->alpha2[1] = rd->alpha2[1];
1738 request->initiator = NL80211_REGDOM_SET_BY_COUNTRY_IE;
1739 request->country_ie_checksum = checksum;
1740 request->country_ie_env = env;
1308 1741
1742 mutex_unlock(&cfg80211_mutex);
1743
1744 queue_regulatory_request(request);
1745
1746 return;
1747
1748free_rd_out:
1749 kfree(rd);
1309out: 1750out:
1310 mutex_unlock(&cfg80211_drv_mutex); 1751 mutex_unlock(&cfg80211_mutex);
1311} 1752}
1312EXPORT_SYMBOL(regulatory_hint_11d); 1753EXPORT_SYMBOL(regulatory_hint_11d);
1313 1754
1755static bool freq_is_chan_12_13_14(u16 freq)
1756{
1757 if (freq == ieee80211_channel_to_frequency(12) ||
1758 freq == ieee80211_channel_to_frequency(13) ||
1759 freq == ieee80211_channel_to_frequency(14))
1760 return true;
1761 return false;
1762}
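freq_is_chan_12_13_14() compares center frequencies rather than channel numbers; in the 2.4 GHz band the usual mapping is 2407 + 5 * chan MHz for channels 1-13, with channel 14 as the special case at 2484 MHz, so the three frequencies matched here are 2467, 2472 and 2484 MHz. A small standalone check of that arithmetic (plain C, not kernel code):

#include <stdio.h>

/* 2.4 GHz 802.11 mapping: channels 1-13 step by 5 MHz from 2412 MHz,
 * channel 14 is the Japan-only special case at 2484 MHz.
 */
static int chan_to_freq_2ghz(int chan)
{
	if (chan == 14)
		return 2484;
	return 2407 + chan * 5;
}

int main(void)
{
	int chan;

	for (chan = 12; chan <= 14; chan++)
		printf("channel %d -> %d MHz\n", chan, chan_to_freq_2ghz(chan));
	return 0;	/* prints 2467, 2472 and 2484 */
}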
1763
1764int regulatory_hint_found_beacon(struct wiphy *wiphy,
1765 struct ieee80211_channel *beacon_chan,
1766 gfp_t gfp)
1767{
1768 struct reg_beacon *reg_beacon;
1769
1770 if (likely((beacon_chan->beacon_found ||
1771 (beacon_chan->flags & IEEE80211_CHAN_RADAR) ||
1772 (beacon_chan->band == IEEE80211_BAND_2GHZ &&
1773 !freq_is_chan_12_13_14(beacon_chan->center_freq)))))
1774 return 0;
1775
1776 reg_beacon = kzalloc(sizeof(struct reg_beacon), gfp);
1777 if (!reg_beacon)
1778 return -ENOMEM;
1779
1780#ifdef CONFIG_CFG80211_REG_DEBUG
1781 printk(KERN_DEBUG "cfg80211: Found new beacon on "
1782 "frequency: %d MHz (Ch %d) on %s\n",
1783 beacon_chan->center_freq,
1784 ieee80211_frequency_to_channel(beacon_chan->center_freq),
1785 wiphy_name(wiphy));
1786#endif
1787 memcpy(&reg_beacon->chan, beacon_chan,
1788 sizeof(struct ieee80211_channel));
1789
1790
1791 /*
 1792 * Since we can be called from BH or non-BH context
1793 * we must use spin_lock_bh()
1794 */
1795 spin_lock_bh(&reg_pending_beacons_lock);
1796 list_add_tail(&reg_beacon->list, &reg_pending_beacons);
1797 spin_unlock_bh(&reg_pending_beacons_lock);
1798
1799 schedule_work(&reg_work);
1800
1801 return 0;
1802}
1803
1314static void print_rd_rules(const struct ieee80211_regdomain *rd) 1804static void print_rd_rules(const struct ieee80211_regdomain *rd)
1315{ 1805{
1316 unsigned int i; 1806 unsigned int i;
@@ -1326,8 +1816,10 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd)
1326 freq_range = &reg_rule->freq_range; 1816 freq_range = &reg_rule->freq_range;
1327 power_rule = &reg_rule->power_rule; 1817 power_rule = &reg_rule->power_rule;
1328 1818
1329 /* There may not be documentation for max antenna gain 1819 /*
1330 * in certain regions */ 1820 * There may not be documentation for max antenna gain
1821 * in certain regions
1822 */
1331 if (power_rule->max_antenna_gain) 1823 if (power_rule->max_antenna_gain)
1332 printk(KERN_INFO "\t(%d KHz - %d KHz @ %d KHz), " 1824 printk(KERN_INFO "\t(%d KHz - %d KHz @ %d KHz), "
1333 "(%d mBi, %d mBm)\n", 1825 "(%d mBi, %d mBm)\n",
@@ -1350,13 +1842,13 @@ static void print_regdomain(const struct ieee80211_regdomain *rd)
1350{ 1842{
1351 1843
1352 if (is_intersected_alpha2(rd->alpha2)) { 1844 if (is_intersected_alpha2(rd->alpha2)) {
1353 struct wiphy *wiphy = NULL;
1354 struct cfg80211_registered_device *drv;
1355 1845
1356 if (last_request->initiator == REGDOM_SET_BY_COUNTRY_IE) { 1846 if (last_request->initiator ==
1357 if (last_request->wiphy) { 1847 NL80211_REGDOM_SET_BY_COUNTRY_IE) {
1358 wiphy = last_request->wiphy; 1848 struct cfg80211_registered_device *drv;
1359 drv = wiphy_to_dev(wiphy); 1849 drv = cfg80211_drv_by_wiphy_idx(
1850 last_request->wiphy_idx);
1851 if (drv) {
1360 printk(KERN_INFO "cfg80211: Current regulatory " 1852 printk(KERN_INFO "cfg80211: Current regulatory "
1361 "domain updated by AP to: %c%c\n", 1853 "domain updated by AP to: %c%c\n",
1362 drv->country_ie_alpha2[0], 1854 drv->country_ie_alpha2[0],
@@ -1422,7 +1914,7 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
1422{ 1914{
1423 const struct ieee80211_regdomain *intersected_rd = NULL; 1915 const struct ieee80211_regdomain *intersected_rd = NULL;
1424 struct cfg80211_registered_device *drv = NULL; 1916 struct cfg80211_registered_device *drv = NULL;
1425 struct wiphy *wiphy = NULL; 1917 struct wiphy *request_wiphy;
1426 /* Some basic sanity checks first */ 1918 /* Some basic sanity checks first */
1427 1919
1428 if (is_world_regdom(rd->alpha2)) { 1920 if (is_world_regdom(rd->alpha2)) {
@@ -1439,23 +1931,27 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
1439 if (!last_request) 1931 if (!last_request)
1440 return -EINVAL; 1932 return -EINVAL;
1441 1933
1442 /* Lets only bother proceeding on the same alpha2 if the current 1934 /*
 1935 * Let's only bother proceeding on the same alpha2 if the current
1443 * rd is non static (it means CRDA was present and was used last) 1936 * rd is non static (it means CRDA was present and was used last)
1444 * and the pending request came in from a country IE */ 1937 * and the pending request came in from a country IE
1445 if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) { 1938 */
1446 /* If someone else asked us to change the rd lets only bother 1939 if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) {
1447 * checking if the alpha2 changes if CRDA was already called */ 1940 /*
 1941 * If someone else asked us to change the rd, let's only bother
1942 * checking if the alpha2 changes if CRDA was already called
1943 */
1448 if (!is_old_static_regdom(cfg80211_regdomain) && 1944 if (!is_old_static_regdom(cfg80211_regdomain) &&
1449 !regdom_changed(rd->alpha2)) 1945 !regdom_changes(rd->alpha2))
1450 return -EINVAL; 1946 return -EINVAL;
1451 } 1947 }
1452 1948
1453 wiphy = last_request->wiphy; 1949 /*
 1454 1950 * Now let's set the regulatory domain, update all driver channels
1455 /* Now lets set the regulatory domain, update all driver channels
1456 * and finally inform them of what we have done, in case they want 1951 * and finally inform them of what we have done, in case they want
1457 * to review or adjust their own settings based on their own 1952 * to review or adjust their own settings based on their own
1458 * internal EEPROM data */ 1953 * internal EEPROM data
1954 */
1459 1955
1460 if (WARN_ON(!reg_is_valid_request(rd->alpha2))) 1956 if (WARN_ON(!reg_is_valid_request(rd->alpha2)))
1461 return -EINVAL; 1957 return -EINVAL;
@@ -1467,21 +1963,25 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
1467 return -EINVAL; 1963 return -EINVAL;
1468 } 1964 }
1469 1965
1966 request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
1967
1470 if (!last_request->intersect) { 1968 if (!last_request->intersect) {
1471 int r; 1969 int r;
1472 1970
1473 if (last_request->initiator != REGDOM_SET_BY_DRIVER) { 1971 if (last_request->initiator != NL80211_REGDOM_SET_BY_DRIVER) {
1474 reset_regdomains(); 1972 reset_regdomains();
1475 cfg80211_regdomain = rd; 1973 cfg80211_regdomain = rd;
1476 return 0; 1974 return 0;
1477 } 1975 }
1478 1976
1479 /* For a driver hint, lets copy the regulatory domain the 1977 /*
1480 * driver wanted to the wiphy to deal with conflicts */ 1978 * For a driver hint, lets copy the regulatory domain the
1979 * driver wanted to the wiphy to deal with conflicts
1980 */
1481 1981
1482 BUG_ON(last_request->wiphy->regd); 1982 BUG_ON(request_wiphy->regd);
1483 1983
1484 r = reg_copy_regd(&last_request->wiphy->regd, rd); 1984 r = reg_copy_regd(&request_wiphy->regd, rd);
1485 if (r) 1985 if (r)
1486 return r; 1986 return r;
1487 1987
@@ -1492,17 +1992,19 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
1492 1992
1493 /* Intersection requires a bit more work */ 1993 /* Intersection requires a bit more work */
1494 1994
1495 if (last_request->initiator != REGDOM_SET_BY_COUNTRY_IE) { 1995 if (last_request->initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE) {
1496 1996
1497 intersected_rd = regdom_intersect(rd, cfg80211_regdomain); 1997 intersected_rd = regdom_intersect(rd, cfg80211_regdomain);
1498 if (!intersected_rd) 1998 if (!intersected_rd)
1499 return -EINVAL; 1999 return -EINVAL;
1500 2000
1501 /* We can trash what CRDA provided now. 2001 /*
2002 * We can trash what CRDA provided now.
1502 * However if a driver requested this specific regulatory 2003 * However if a driver requested this specific regulatory
1503 * domain we keep it for its private use */ 2004 * domain we keep it for its private use
1504 if (last_request->initiator == REGDOM_SET_BY_DRIVER) 2005 */
1505 last_request->wiphy->regd = rd; 2006 if (last_request->initiator == NL80211_REGDOM_SET_BY_DRIVER)
2007 request_wiphy->regd = rd;
1506 else 2008 else
1507 kfree(rd); 2009 kfree(rd);
1508 2010
@@ -1522,8 +2024,10 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
1522 BUG_ON(!country_ie_regdomain); 2024 BUG_ON(!country_ie_regdomain);
1523 2025
1524 if (rd != country_ie_regdomain) { 2026 if (rd != country_ie_regdomain) {
1525 /* Intersect what CRDA returned and our what we 2027 /*
 1526 * had built from the Country IE received */ 2028 * Intersect what CRDA returned and what we
2029 * had built from the Country IE received
2030 */
1527 2031
1528 intersected_rd = regdom_intersect(rd, country_ie_regdomain); 2032 intersected_rd = regdom_intersect(rd, country_ie_regdomain);
1529 2033
@@ -1533,16 +2037,18 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
1533 kfree(country_ie_regdomain); 2037 kfree(country_ie_regdomain);
1534 country_ie_regdomain = NULL; 2038 country_ie_regdomain = NULL;
1535 } else { 2039 } else {
1536 /* This would happen when CRDA was not present and 2040 /*
2041 * This would happen when CRDA was not present and
1537 * OLD_REGULATORY was enabled. We intersect our Country 2042 * OLD_REGULATORY was enabled. We intersect our Country
1538 * IE rd and what was set on cfg80211 originally */ 2043 * IE rd and what was set on cfg80211 originally
2044 */
1539 intersected_rd = regdom_intersect(rd, cfg80211_regdomain); 2045 intersected_rd = regdom_intersect(rd, cfg80211_regdomain);
1540 } 2046 }
1541 2047
1542 if (!intersected_rd) 2048 if (!intersected_rd)
1543 return -EINVAL; 2049 return -EINVAL;
1544 2050
1545 drv = wiphy_to_dev(wiphy); 2051 drv = wiphy_to_dev(request_wiphy);
1546 2052
1547 drv->country_ie_alpha2[0] = rd->alpha2[0]; 2053 drv->country_ie_alpha2[0] = rd->alpha2[0];
1548 drv->country_ie_alpha2[1] = rd->alpha2[1]; 2054 drv->country_ie_alpha2[1] = rd->alpha2[1];
@@ -1560,13 +2066,17 @@ static int __set_regdom(const struct ieee80211_regdomain *rd)
1560} 2066}
1561 2067
1562 2068
1563/* Use this call to set the current regulatory domain. Conflicts with 2069/*
2070 * Use this call to set the current regulatory domain. Conflicts with
1564 * multiple drivers can be ironed out later. Caller must've already 2071 * multiple drivers can be ironed out later. Caller must've already
1565 * kmalloc'd the rd structure. Caller must hold cfg80211_drv_mutex */ 2072 * kmalloc'd the rd structure. Caller must hold cfg80211_mutex
2073 */
1566int set_regdom(const struct ieee80211_regdomain *rd) 2074int set_regdom(const struct ieee80211_regdomain *rd)
1567{ 2075{
1568 int r; 2076 int r;
1569 2077
2078 assert_cfg80211_lock();
2079
1570 /* Note that this doesn't update the wiphys, this is done below */ 2080 /* Note that this doesn't update the wiphys, this is done below */
1571 r = __set_regdom(rd); 2081 r = __set_regdom(rd);
1572 if (r) { 2082 if (r) {
@@ -1583,57 +2093,87 @@ int set_regdom(const struct ieee80211_regdomain *rd)
1583 2093
1584 print_regdomain(cfg80211_regdomain); 2094 print_regdomain(cfg80211_regdomain);
1585 2095
2096 nl80211_send_reg_change_event(last_request);
2097
1586 return r; 2098 return r;
1587} 2099}
1588 2100
1589/* Caller must hold cfg80211_drv_mutex */ 2101/* Caller must hold cfg80211_mutex */
1590void reg_device_remove(struct wiphy *wiphy) 2102void reg_device_remove(struct wiphy *wiphy)
1591{ 2103{
2104 struct wiphy *request_wiphy;
2105
2106 assert_cfg80211_lock();
2107
2108 request_wiphy = wiphy_idx_to_wiphy(last_request->wiphy_idx);
2109
1592 kfree(wiphy->regd); 2110 kfree(wiphy->regd);
1593 if (!last_request || !last_request->wiphy) 2111 if (!last_request || !request_wiphy)
1594 return; 2112 return;
1595 if (last_request->wiphy != wiphy) 2113 if (request_wiphy != wiphy)
1596 return; 2114 return;
1597 last_request->wiphy = NULL; 2115 last_request->wiphy_idx = WIPHY_IDX_STALE;
1598 last_request->country_ie_env = ENVIRON_ANY; 2116 last_request->country_ie_env = ENVIRON_ANY;
1599} 2117}
1600 2118
1601int regulatory_init(void) 2119int regulatory_init(void)
1602{ 2120{
1603 int err; 2121 int err = 0;
1604 2122
1605 reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0); 2123 reg_pdev = platform_device_register_simple("regulatory", 0, NULL, 0);
1606 if (IS_ERR(reg_pdev)) 2124 if (IS_ERR(reg_pdev))
1607 return PTR_ERR(reg_pdev); 2125 return PTR_ERR(reg_pdev);
1608 2126
2127 spin_lock_init(&reg_requests_lock);
2128 spin_lock_init(&reg_pending_beacons_lock);
2129
1609#ifdef CONFIG_WIRELESS_OLD_REGULATORY 2130#ifdef CONFIG_WIRELESS_OLD_REGULATORY
1610 cfg80211_regdomain = static_regdom(ieee80211_regdom); 2131 cfg80211_regdomain = static_regdom(ieee80211_regdom);
1611 2132
1612 printk(KERN_INFO "cfg80211: Using static regulatory domain info\n"); 2133 printk(KERN_INFO "cfg80211: Using static regulatory domain info\n");
1613 print_regdomain_info(cfg80211_regdomain); 2134 print_regdomain_info(cfg80211_regdomain);
1614 /* The old code still requests for a new regdomain and if 2135 /*
2136 * The old code still requests for a new regdomain and if
1615 * you have CRDA you get it updated, otherwise you get 2137 * you have CRDA you get it updated, otherwise you get
1616 * stuck with the static values. We ignore "EU" code as 2138 * stuck with the static values. We ignore "EU" code as
1617 * that is not a valid ISO / IEC 3166 alpha2 */ 2139 * that is not a valid ISO / IEC 3166 alpha2
2140 */
1618 if (ieee80211_regdom[0] != 'E' || ieee80211_regdom[1] != 'U') 2141 if (ieee80211_regdom[0] != 'E' || ieee80211_regdom[1] != 'U')
1619 err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, 2142 err = regulatory_hint_core(ieee80211_regdom);
1620 ieee80211_regdom, 0, ENVIRON_ANY);
1621#else 2143#else
1622 cfg80211_regdomain = cfg80211_world_regdom; 2144 cfg80211_regdomain = cfg80211_world_regdom;
1623 2145
1624 err = __regulatory_hint(NULL, REGDOM_SET_BY_CORE, "00", 0, ENVIRON_ANY); 2146 err = regulatory_hint_core("00");
1625 if (err)
1626 printk(KERN_ERR "cfg80211: calling CRDA failed - "
1627 "unable to update world regulatory domain, "
1628 "using static definition\n");
1629#endif 2147#endif
2148 if (err) {
2149 if (err == -ENOMEM)
2150 return err;
2151 /*
 2152 * N.B. kobject_uevent_env() can fail mainly when we're out of
 2153 * memory, which is handled and propagated appropriately above,
2154 * but it can also fail during a netlink_broadcast() or during
2155 * early boot for call_usermodehelper(). For now treat these
2156 * errors as non-fatal.
2157 */
2158 printk(KERN_ERR "cfg80211: kobject_uevent_env() was unable "
 2159 "to call CRDA during init\n");
2160#ifdef CONFIG_CFG80211_REG_DEBUG
2161 /* We want to find out exactly why when debugging */
2162 WARN_ON(err);
2163#endif
2164 }
1630 2165
1631 return 0; 2166 return 0;
1632} 2167}
1633 2168
1634void regulatory_exit(void) 2169void regulatory_exit(void)
1635{ 2170{
1636 mutex_lock(&cfg80211_drv_mutex); 2171 struct regulatory_request *reg_request, *tmp;
2172 struct reg_beacon *reg_beacon, *btmp;
2173
2174 cancel_work_sync(&reg_work);
2175
2176 mutex_lock(&cfg80211_mutex);
1637 2177
1638 reset_regdomains(); 2178 reset_regdomains();
1639 2179
@@ -1644,5 +2184,33 @@ void regulatory_exit(void)
1644 2184
1645 platform_device_unregister(reg_pdev); 2185 platform_device_unregister(reg_pdev);
1646 2186
1647 mutex_unlock(&cfg80211_drv_mutex); 2187 spin_lock_bh(&reg_pending_beacons_lock);
2188 if (!list_empty(&reg_pending_beacons)) {
2189 list_for_each_entry_safe(reg_beacon, btmp,
2190 &reg_pending_beacons, list) {
2191 list_del(&reg_beacon->list);
2192 kfree(reg_beacon);
2193 }
2194 }
2195 spin_unlock_bh(&reg_pending_beacons_lock);
2196
2197 if (!list_empty(&reg_beacon_list)) {
2198 list_for_each_entry_safe(reg_beacon, btmp,
2199 &reg_beacon_list, list) {
2200 list_del(&reg_beacon->list);
2201 kfree(reg_beacon);
2202 }
2203 }
2204
2205 spin_lock(&reg_requests_lock);
2206 if (!list_empty(&reg_requests_list)) {
2207 list_for_each_entry_safe(reg_request, tmp,
2208 &reg_requests_list, list) {
2209 list_del(&reg_request->list);
2210 kfree(reg_request);
2211 }
2212 }
2213 spin_unlock(&reg_requests_lock);
2214
2215 mutex_unlock(&cfg80211_mutex);
1648} 2216}
diff --git a/net/wireless/reg.h b/net/wireless/reg.h
index fe8c83f34fb7..e37829a49dc4 100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -6,6 +6,8 @@ extern const struct ieee80211_regdomain *cfg80211_regdomain;
6bool is_world_regdom(const char *alpha2); 6bool is_world_regdom(const char *alpha2);
7bool reg_is_valid_request(const char *alpha2); 7bool reg_is_valid_request(const char *alpha2);
8 8
9int regulatory_hint_user(const char *alpha2);
10
9void reg_device_remove(struct wiphy *wiphy); 11void reg_device_remove(struct wiphy *wiphy);
10 12
11int regulatory_init(void); 13int regulatory_init(void);
@@ -14,26 +16,24 @@ void regulatory_exit(void);
14int set_regdom(const struct ieee80211_regdomain *rd); 16int set_regdom(const struct ieee80211_regdomain *rd);
15 17
16/** 18/**
17 * __regulatory_hint - hint to the wireless core a regulatory domain 19 * regulatory_hint_found_beacon - hints a beacon was found on a channel
 18 * @wiphy: if the hint comes from country information from an AP, this 20 * @wiphy: the wireless device on which the beacon was found
 19 * is required to be set to the wiphy that received the information 21 * @beacon_chan: the channel on which the beacon was found
20 * @alpha2: the ISO/IEC 3166 alpha2 being claimed the regulatory domain 22 * @gfp: context flags
21 * should be in.
22 * @country_ie_checksum: checksum of processed country IE, set this to 0
23 * if the hint did not come from a country IE
24 * @country_ie_env: the environment the IE told us we are in, %ENVIRON_*
25 *
26 * The Wireless subsystem can use this function to hint to the wireless core
27 * what it believes should be the current regulatory domain by giving it an
28 * ISO/IEC 3166 alpha2 country code it knows its regulatory domain should be
29 * in.
30 * 23 *
31 * Returns zero if all went fine, %-EALREADY if a regulatory domain had 24 * This informs the wireless core that a beacon from an AP was found on
32 * already been set or other standard error codes. 25 * the channel provided. This allows the wireless core to make educated
26 * guesses on regulatory to help with world roaming. This is only used for
27 * world roaming -- when we do not know our current location. This is
28 * only useful on channels 12, 13 and 14 on the 2 GHz band as channels
29 * 1-11 are already enabled by the world regulatory domain; and on
30 * non-radar 5 GHz channels.
33 * 31 *
 32 * Drivers do not need to call this, cfg80211 will do it after a scan
33 * on a newly found BSS.
34 */ 34 */
35extern int __regulatory_hint(struct wiphy *wiphy, enum reg_set_by set_by, 35int regulatory_hint_found_beacon(struct wiphy *wiphy,
36 const char *alpha2, u32 country_ie_checksum, 36 struct ieee80211_channel *beacon_chan,
37 enum environment_cap country_ie_env); 37 gfp_t gfp);
38 38
39#endif /* __NET_WIRELESS_REG_H */ 39#endif /* __NET_WIRELESS_REG_H */
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index b1893c863b97..280dbcd02c15 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -62,6 +62,18 @@ static void bss_release(struct kref *ref)
62} 62}
63 63
64/* must hold dev->bss_lock! */ 64/* must hold dev->bss_lock! */
65void cfg80211_bss_age(struct cfg80211_registered_device *dev,
66 unsigned long age_secs)
67{
68 struct cfg80211_internal_bss *bss;
69 unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC);
70
71 list_for_each_entry(bss, &dev->bss_list, list) {
72 bss->ts -= age_jiffies;
73 }
74}
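cfg80211_bss_age() backdates every cached BSS timestamp by age_secs so that, after the resume path below has measured the time spent suspended (get_seconds() - rdev->suspend_at in sysfs.c), entries look correspondingly older and cfg80211_bss_expire() can drop stale ones. A rough numeric illustration of the shift (userspace C; HZ is assumed to be 1000 purely for readability):

#include <stdio.h>

#define HZ 1000UL	/* assumed tick rate, for illustration only */

int main(void)
{
	unsigned long now = 500000, ts = 498000;	/* hypothetical jiffies */
	unsigned long age_secs = 30;			/* time spent suspended */

	printf("before: seen %lu ms ago\n", (now - ts) * 1000 / HZ);
	ts -= age_secs * HZ;	/* what cfg80211_bss_age() does per entry */
	printf("after:  seen %lu ms ago\n", (now - ts) * 1000 / HZ);
	return 0;	/* 2000 ms ago vs 32000 ms ago */
}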
75
76/* must hold dev->bss_lock! */
65void cfg80211_bss_expire(struct cfg80211_registered_device *dev) 77void cfg80211_bss_expire(struct cfg80211_registered_device *dev)
66{ 78{
67 struct cfg80211_internal_bss *bss, *tmp; 79 struct cfg80211_internal_bss *bss, *tmp;
@@ -358,7 +370,6 @@ cfg80211_bss_update(struct cfg80211_registered_device *dev,
358 found->pub.beacon_interval = res->pub.beacon_interval; 370 found->pub.beacon_interval = res->pub.beacon_interval;
359 found->pub.tsf = res->pub.tsf; 371 found->pub.tsf = res->pub.tsf;
360 found->pub.signal = res->pub.signal; 372 found->pub.signal = res->pub.signal;
361 found->pub.signal_type = res->pub.signal_type;
362 found->pub.capability = res->pub.capability; 373 found->pub.capability = res->pub.capability;
363 found->ts = res->ts; 374 found->ts = res->ts;
364 kref_put(&res->ref, bss_release); 375 kref_put(&res->ref, bss_release);
@@ -380,8 +391,7 @@ struct cfg80211_bss *
380cfg80211_inform_bss_frame(struct wiphy *wiphy, 391cfg80211_inform_bss_frame(struct wiphy *wiphy,
381 struct ieee80211_channel *channel, 392 struct ieee80211_channel *channel,
382 struct ieee80211_mgmt *mgmt, size_t len, 393 struct ieee80211_mgmt *mgmt, size_t len,
383 s32 signal, enum cfg80211_signal_type sigtype, 394 s32 signal, gfp_t gfp)
384 gfp_t gfp)
385{ 395{
386 struct cfg80211_internal_bss *res; 396 struct cfg80211_internal_bss *res;
387 size_t ielen = len - offsetof(struct ieee80211_mgmt, 397 size_t ielen = len - offsetof(struct ieee80211_mgmt,
@@ -389,7 +399,7 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
389 bool overwrite; 399 bool overwrite;
390 size_t privsz = wiphy->bss_priv_size; 400 size_t privsz = wiphy->bss_priv_size;
391 401
392 if (WARN_ON(sigtype == NL80211_BSS_SIGNAL_UNSPEC && 402 if (WARN_ON(wiphy->signal_type == NL80211_BSS_SIGNAL_UNSPEC &&
393 (signal < 0 || signal > 100))) 403 (signal < 0 || signal > 100)))
394 return NULL; 404 return NULL;
395 405
@@ -403,7 +413,6 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
403 413
404 memcpy(res->pub.bssid, mgmt->bssid, ETH_ALEN); 414 memcpy(res->pub.bssid, mgmt->bssid, ETH_ALEN);
405 res->pub.channel = channel; 415 res->pub.channel = channel;
406 res->pub.signal_type = sigtype;
407 res->pub.signal = signal; 416 res->pub.signal = signal;
408 res->pub.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); 417 res->pub.tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp);
409 res->pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int); 418 res->pub.beacon_interval = le16_to_cpu(mgmt->u.probe_resp.beacon_int);
@@ -421,6 +430,9 @@ cfg80211_inform_bss_frame(struct wiphy *wiphy,
421 if (!res) 430 if (!res)
422 return NULL; 431 return NULL;
423 432
433 if (res->pub.capability & WLAN_CAPABILITY_ESS)
434 regulatory_hint_found_beacon(wiphy, channel, gfp);
435
424 /* cfg80211_bss_update gives us a referenced result */ 436 /* cfg80211_bss_update gives us a referenced result */
425 return &res->pub; 437 return &res->pub;
426} 438}
@@ -584,16 +596,25 @@ static void ieee80211_scan_add_ies(struct iw_request_info *info,
584 } 596 }
585} 597}
586 598
599static inline unsigned int elapsed_jiffies_msecs(unsigned long start)
600{
601 unsigned long end = jiffies;
602
603 if (end >= start)
604 return jiffies_to_msecs(end - start);
605
606 return jiffies_to_msecs(end + (MAX_JIFFY_OFFSET - start) + 1);
607}
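elapsed_jiffies_msecs() exists because bss->ts can end up logically ahead of the current jiffies value, either after the counter wraps or after cfg80211_bss_age() has shifted timestamps backwards; when end < start the helper measures the distance up to MAX_JIFFY_OFFSET and around again instead of producing a huge bogus delta. A tiny userspace analogue of the two branches (MAX_OFFSET is a deliberately small stand-in for MAX_JIFFY_OFFSET, and ticks equal milliseconds here):

#include <stdio.h>

#define MAX_OFFSET 1000UL	/* stand-in for MAX_JIFFY_OFFSET */

static unsigned long elapsed(unsigned long start, unsigned long end)
{
	if (end >= start)
		return end - start;
	return end + (MAX_OFFSET - start) + 1;	/* counter wrapped */
}

int main(void)
{
	printf("%lu\n", elapsed(100, 250));	/* normal case: 150 */
	printf("%lu\n", elapsed(990, 40));	/* wrapped: 40 + (1000 - 990) + 1 = 51 */
	return 0;
}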
587 608
588static char * 609static char *
589ieee80211_bss(struct iw_request_info *info, 610ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info,
590 struct cfg80211_internal_bss *bss, 611 struct cfg80211_internal_bss *bss, char *current_ev,
591 char *current_ev, char *end_buf) 612 char *end_buf)
592{ 613{
593 struct iw_event iwe; 614 struct iw_event iwe;
594 u8 *buf, *cfg, *p; 615 u8 *buf, *cfg, *p;
595 u8 *ie = bss->pub.information_elements; 616 u8 *ie = bss->pub.information_elements;
596 int rem = bss->pub.len_information_elements, i; 617 int rem = bss->pub.len_information_elements, i, sig;
597 bool ismesh = false; 618 bool ismesh = false;
598 619
599 memset(&iwe, 0, sizeof(iwe)); 620 memset(&iwe, 0, sizeof(iwe));
@@ -617,19 +638,28 @@ ieee80211_bss(struct iw_request_info *info,
617 current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe, 638 current_ev = iwe_stream_add_event(info, current_ev, end_buf, &iwe,
618 IW_EV_FREQ_LEN); 639 IW_EV_FREQ_LEN);
619 640
620 if (bss->pub.signal_type != CFG80211_SIGNAL_TYPE_NONE) { 641 if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) {
621 memset(&iwe, 0, sizeof(iwe)); 642 memset(&iwe, 0, sizeof(iwe));
622 iwe.cmd = IWEVQUAL; 643 iwe.cmd = IWEVQUAL;
623 iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED | 644 iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED |
624 IW_QUAL_NOISE_INVALID | 645 IW_QUAL_NOISE_INVALID |
625 IW_QUAL_QUAL_INVALID; 646 IW_QUAL_QUAL_UPDATED;
626 switch (bss->pub.signal_type) { 647 switch (wiphy->signal_type) {
627 case CFG80211_SIGNAL_TYPE_MBM: 648 case CFG80211_SIGNAL_TYPE_MBM:
628 iwe.u.qual.level = bss->pub.signal / 100; 649 sig = bss->pub.signal / 100;
650 iwe.u.qual.level = sig;
629 iwe.u.qual.updated |= IW_QUAL_DBM; 651 iwe.u.qual.updated |= IW_QUAL_DBM;
652 if (sig < -110) /* rather bad */
653 sig = -110;
654 else if (sig > -40) /* perfect */
655 sig = -40;
656 /* will give a range of 0 .. 70 */
657 iwe.u.qual.qual = sig + 110;
630 break; 658 break;
631 case CFG80211_SIGNAL_TYPE_UNSPEC: 659 case CFG80211_SIGNAL_TYPE_UNSPEC:
632 iwe.u.qual.level = bss->pub.signal; 660 iwe.u.qual.level = bss->pub.signal;
661 /* will give range 0 .. 100 */
662 iwe.u.qual.qual = bss->pub.signal;
633 break; 663 break;
634 default: 664 default:
635 /* not reached */ 665 /* not reached */
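The MBM case above turns the driver-reported signal into the rough 0..70 Wireless Extensions quality value: signal / 100 converts mBm to dBm, the result is clamped to [-110, -40] and then offset by 110, so for example -6500 mBm (-65 dBm) becomes quality 45. A standalone version of just that mapping (illustrative, not the wext API itself):

#include <stdio.h>

/* Map an mBm signal to the 0..70 "quality" scale used above */
static int mbm_to_qual(int signal_mbm)
{
	int sig = signal_mbm / 100;	/* mBm -> dBm */

	if (sig < -110)			/* rather bad */
		sig = -110;
	else if (sig > -40)		/* perfect */
		sig = -40;
	return sig + 110;		/* 0 .. 70 */
}

int main(void)
{
	printf("%d\n", mbm_to_qual(-6500));	/* -65 dBm -> 45 */
	printf("%d\n", mbm_to_qual(-3000));	/* clamped to -40 dBm -> 70 */
	return 0;
}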
@@ -763,8 +793,8 @@ ieee80211_bss(struct iw_request_info *info,
763 &iwe, buf); 793 &iwe, buf);
764 memset(&iwe, 0, sizeof(iwe)); 794 memset(&iwe, 0, sizeof(iwe));
765 iwe.cmd = IWEVCUSTOM; 795 iwe.cmd = IWEVCUSTOM;
766 sprintf(buf, " Last beacon: %dms ago", 796 sprintf(buf, " Last beacon: %ums ago",
767 jiffies_to_msecs(jiffies - bss->ts)); 797 elapsed_jiffies_msecs(bss->ts));
768 iwe.u.data.length = strlen(buf); 798 iwe.u.data.length = strlen(buf);
769 current_ev = iwe_stream_add_point(info, current_ev, 799 current_ev = iwe_stream_add_point(info, current_ev,
770 end_buf, &iwe, buf); 800 end_buf, &iwe, buf);
@@ -793,8 +823,8 @@ static int ieee80211_scan_results(struct cfg80211_registered_device *dev,
793 spin_unlock_bh(&dev->bss_lock); 823 spin_unlock_bh(&dev->bss_lock);
794 return -E2BIG; 824 return -E2BIG;
795 } 825 }
796 current_ev = ieee80211_bss(info, bss, 826 current_ev = ieee80211_bss(&dev->wiphy, info, bss,
797 current_ev, end_buf); 827 current_ev, end_buf);
798 } 828 }
799 spin_unlock_bh(&dev->bss_lock); 829 spin_unlock_bh(&dev->bss_lock);
800 return current_ev - buf; 830 return current_ev - buf;
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 26a72b0797a0..efe3c5c92b2d 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -31,7 +31,7 @@ static ssize_t name ## _show(struct device *dev, \
31 return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \ 31 return sprintf(buf, fmt "\n", dev_to_rdev(dev)->member); \
32} 32}
33 33
34SHOW_FMT(index, "%d", idx); 34SHOW_FMT(index, "%d", wiphy_idx);
35SHOW_FMT(macaddress, "%pM", wiphy.perm_addr); 35SHOW_FMT(macaddress, "%pM", wiphy.perm_addr);
36 36
37static struct device_attribute ieee80211_dev_attrs[] = { 37static struct device_attribute ieee80211_dev_attrs[] = {
@@ -60,6 +60,8 @@ static int wiphy_suspend(struct device *dev, pm_message_t state)
60 struct cfg80211_registered_device *rdev = dev_to_rdev(dev); 60 struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
61 int ret = 0; 61 int ret = 0;
62 62
63 rdev->suspend_at = get_seconds();
64
63 if (rdev->ops->suspend) { 65 if (rdev->ops->suspend) {
64 rtnl_lock(); 66 rtnl_lock();
65 ret = rdev->ops->suspend(&rdev->wiphy); 67 ret = rdev->ops->suspend(&rdev->wiphy);
@@ -74,6 +76,11 @@ static int wiphy_resume(struct device *dev)
74 struct cfg80211_registered_device *rdev = dev_to_rdev(dev); 76 struct cfg80211_registered_device *rdev = dev_to_rdev(dev);
75 int ret = 0; 77 int ret = 0;
76 78
79 /* Age scan results with time spent in suspend */
80 spin_lock_bh(&rdev->bss_lock);
81 cfg80211_bss_age(rdev, get_seconds() - rdev->suspend_at);
82 spin_unlock_bh(&rdev->bss_lock);
83
77 if (rdev->ops->resume) { 84 if (rdev->ops->resume) {
78 rtnl_lock(); 85 rtnl_lock();
79 ret = rdev->ops->resume(&rdev->wiphy); 86 ret = rdev->ops->resume(&rdev->wiphy);
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 58e489fd4aed..b84a9b4fe96a 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -137,3 +137,100 @@ int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info,
137 return 0; 137 return 0;
138} 138}
139EXPORT_SYMBOL(cfg80211_wext_giwmode); 139EXPORT_SYMBOL(cfg80211_wext_giwmode);
140
141
142int cfg80211_wext_giwrange(struct net_device *dev,
143 struct iw_request_info *info,
144 struct iw_point *data, char *extra)
145{
146 struct wireless_dev *wdev = dev->ieee80211_ptr;
147 struct iw_range *range = (struct iw_range *) extra;
148 enum ieee80211_band band;
149 int c = 0;
150
151 if (!wdev)
152 return -EOPNOTSUPP;
153
154 data->length = sizeof(struct iw_range);
155 memset(range, 0, sizeof(struct iw_range));
156
157 range->we_version_compiled = WIRELESS_EXT;
158 range->we_version_source = 21;
159 range->retry_capa = IW_RETRY_LIMIT;
160 range->retry_flags = IW_RETRY_LIMIT;
161 range->min_retry = 0;
162 range->max_retry = 255;
163 range->min_rts = 0;
164 range->max_rts = 2347;
165 range->min_frag = 256;
166 range->max_frag = 2346;
167
168 range->encoding_size[0] = 5;
169 range->encoding_size[1] = 13;
170 range->num_encoding_sizes = 2;
171 range->max_encoding_tokens = 4;
172
173 range->max_qual.updated = IW_QUAL_NOISE_INVALID;
174
175 switch (wdev->wiphy->signal_type) {
176 case CFG80211_SIGNAL_TYPE_NONE:
177 break;
178 case CFG80211_SIGNAL_TYPE_MBM:
179 range->max_qual.level = -110;
180 range->max_qual.qual = 70;
181 range->avg_qual.qual = 35;
182 range->max_qual.updated |= IW_QUAL_DBM;
183 range->max_qual.updated |= IW_QUAL_QUAL_UPDATED;
184 range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED;
185 break;
186 case CFG80211_SIGNAL_TYPE_UNSPEC:
187 range->max_qual.level = 100;
188 range->max_qual.qual = 100;
189 range->avg_qual.qual = 50;
190 range->max_qual.updated |= IW_QUAL_QUAL_UPDATED;
191 range->max_qual.updated |= IW_QUAL_LEVEL_UPDATED;
192 break;
193 }
194
195 range->avg_qual.level = range->max_qual.level / 2;
196 range->avg_qual.noise = range->max_qual.noise / 2;
197 range->avg_qual.updated = range->max_qual.updated;
198
199 range->enc_capa = IW_ENC_CAPA_WPA | IW_ENC_CAPA_WPA2 |
200 IW_ENC_CAPA_CIPHER_TKIP | IW_ENC_CAPA_CIPHER_CCMP;
201
202
203 for (band = 0; band < IEEE80211_NUM_BANDS; band ++) {
204 int i;
205 struct ieee80211_supported_band *sband;
206
207 sband = wdev->wiphy->bands[band];
208
209 if (!sband)
210 continue;
211
212 for (i = 0; i < sband->n_channels && c < IW_MAX_FREQUENCIES; i++) {
213 struct ieee80211_channel *chan = &sband->channels[i];
214
215 if (!(chan->flags & IEEE80211_CHAN_DISABLED)) {
216 range->freq[c].i =
217 ieee80211_frequency_to_channel(
218 chan->center_freq);
219 range->freq[c].m = chan->center_freq;
220 range->freq[c].e = 6;
221 c++;
222 }
223 }
224 }
225 range->num_channels = c;
226 range->num_frequency = c;
227
228 IW_EVENT_CAPA_SET_KERNEL(range->event_capa);
229 IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWAP);
230 IW_EVENT_CAPA_SET(range->event_capa, SIOCGIWSCAN);
231
232 range->scan_capa |= IW_SCAN_CAPA_ESSID;
233
234 return 0;
235}
236EXPORT_SYMBOL(cfg80211_wext_giwrange);
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 8f76f4009c24..9ca17b1ce52e 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -951,10 +951,8 @@ int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb,
951 /* 951 /*
952 * Incoming Call User Data. 952 * Incoming Call User Data.
953 */ 953 */
954 if (skb->len >= 0) { 954 skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len);
955 skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); 955 makex25->calluserdata.cudlength = skb->len;
956 makex25->calluserdata.cudlength = skb->len;
957 }
958 956
959 sk->sk_ack_backlog++; 957 sk->sk_ack_backlog++;
960 958
@@ -1122,8 +1120,9 @@ static int x25_sendmsg(struct kiocb *iocb, struct socket *sock,
1122 if (msg->msg_flags & MSG_OOB) 1120 if (msg->msg_flags & MSG_OOB)
1123 skb_queue_tail(&x25->interrupt_out_queue, skb); 1121 skb_queue_tail(&x25->interrupt_out_queue, skb);
1124 else { 1122 else {
1125 len = x25_output(sk, skb); 1123 rc = x25_output(sk, skb);
1126 if (len < 0) 1124 len = rc;
1125 if (rc < 0)
1127 kfree_skb(skb); 1126 kfree_skb(skb);
1128 else if (x25->qbitincl) 1127 else if (x25->qbitincl)
1129 len++; 1128 len++;
@@ -1608,7 +1607,7 @@ static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = {
1608 1607
1609SOCKOPS_WRAP(x25_proto, AF_X25); 1608SOCKOPS_WRAP(x25_proto, AF_X25);
1610 1609
1611static struct packet_type x25_packet_type = { 1610static struct packet_type x25_packet_type __read_mostly = {
1612 .type = cpu_to_be16(ETH_P_X25), 1611 .type = cpu_to_be16(ETH_P_X25),
1613 .func = x25_lapb_receive_frame, 1612 .func = x25_lapb_receive_frame,
1614}; 1613};
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index e25ff62ab2a6..62a5425cc6aa 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -748,12 +748,51 @@ static void xfrm_hash_grow_check(struct net *net, int have_hash_collision)
748 schedule_work(&net->xfrm.state_hash_work); 748 schedule_work(&net->xfrm.state_hash_work);
749} 749}
750 750
751static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x,
752 struct flowi *fl, unsigned short family,
753 xfrm_address_t *daddr, xfrm_address_t *saddr,
754 struct xfrm_state **best, int *acq_in_progress,
755 int *error)
756{
757 /* Resolution logic:
758 * 1. There is a valid state with matching selector. Done.
759 * 2. Valid state with inappropriate selector. Skip.
760 *
761 * Entering area of "sysdeps".
762 *
763 * 3. If state is not valid, selector is temporary, it selects
764 * only session which triggered previous resolution. Key
765 * manager will do something to install a state with proper
766 * selector.
767 */
768 if (x->km.state == XFRM_STATE_VALID) {
769 if ((x->sel.family &&
770 !xfrm_selector_match(&x->sel, fl, x->sel.family)) ||
771 !security_xfrm_state_pol_flow_match(x, pol, fl))
772 return;
773
774 if (!*best ||
775 (*best)->km.dying > x->km.dying ||
776 ((*best)->km.dying == x->km.dying &&
777 (*best)->curlft.add_time < x->curlft.add_time))
778 *best = x;
779 } else if (x->km.state == XFRM_STATE_ACQ) {
780 *acq_in_progress = 1;
781 } else if (x->km.state == XFRM_STATE_ERROR ||
782 x->km.state == XFRM_STATE_EXPIRED) {
783 if (xfrm_selector_match(&x->sel, fl, x->sel.family) &&
784 security_xfrm_state_pol_flow_match(x, pol, fl))
785 *error = -ESRCH;
786 }
787}
788
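The selection rule in xfrm_state_look_at() prefers a VALID state that is not dying over one that is, and among equally (non-)dying candidates it keeps the one with the larger curlft.add_time, i.e. the most recently added; ACQ states merely set the acquire-in-progress flag, and matching ERROR/EXPIRED states turn into -ESRCH. A compact illustration of that ordering (field names mirror km.dying and curlft.add_time, but this is not kernel code):

#include <stdio.h>

struct cand { int dying; unsigned long add_time; };

/* Same ordering as the "replace *best with x?" test above */
static int better_than(const struct cand *best, const struct cand *x)
{
	if (!best)
		return 1;
	if (best->dying != x->dying)
		return best->dying > x->dying;	/* prefer states not dying */
	return best->add_time < x->add_time;	/* then the newest state */
}

int main(void)
{
	struct cand a = { 1, 100 }, b = { 0, 50 }, c = { 0, 80 };

	printf("%d %d\n", better_than(&a, &b), better_than(&b, &c));
	return 0;	/* prints "1 1": b beats a (not dying), c beats b (newer) */
}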
751struct xfrm_state * 789struct xfrm_state *
752xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, 790xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
753 struct flowi *fl, struct xfrm_tmpl *tmpl, 791 struct flowi *fl, struct xfrm_tmpl *tmpl,
754 struct xfrm_policy *pol, int *err, 792 struct xfrm_policy *pol, int *err,
755 unsigned short family) 793 unsigned short family)
756{ 794{
795 static xfrm_address_t saddr_wildcard = { };
757 struct net *net = xp_net(pol); 796 struct net *net = xp_net(pol);
758 unsigned int h; 797 unsigned int h;
759 struct hlist_node *entry; 798 struct hlist_node *entry;
@@ -773,40 +812,27 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
773 xfrm_state_addr_check(x, daddr, saddr, family) && 812 xfrm_state_addr_check(x, daddr, saddr, family) &&
774 tmpl->mode == x->props.mode && 813 tmpl->mode == x->props.mode &&
775 tmpl->id.proto == x->id.proto && 814 tmpl->id.proto == x->id.proto &&
776 (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) { 815 (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
777 /* Resolution logic: 816 xfrm_state_look_at(pol, x, fl, family, daddr, saddr,
778 1. There is a valid state with matching selector. 817 &best, &acquire_in_progress, &error);
779 Done. 818 }
780 2. Valid state with inappropriate selector. Skip. 819 if (best)
781 820 goto found;
782 Entering area of "sysdeps". 821
783 822 h = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, family);
784 3. If state is not valid, selector is temporary, 823 hlist_for_each_entry(x, entry, net->xfrm.state_bydst+h, bydst) {
785 it selects only session which triggered 824 if (x->props.family == family &&
786 previous resolution. Key manager will do 825 x->props.reqid == tmpl->reqid &&
787 something to install a state with proper 826 !(x->props.flags & XFRM_STATE_WILDRECV) &&
788 selector. 827 xfrm_state_addr_check(x, daddr, saddr, family) &&
789 */ 828 tmpl->mode == x->props.mode &&
790 if (x->km.state == XFRM_STATE_VALID) { 829 tmpl->id.proto == x->id.proto &&
791 if ((x->sel.family && !xfrm_selector_match(&x->sel, fl, x->sel.family)) || 830 (tmpl->id.spi == x->id.spi || !tmpl->id.spi))
792 !security_xfrm_state_pol_flow_match(x, pol, fl)) 831 xfrm_state_look_at(pol, x, fl, family, daddr, saddr,
793 continue; 832 &best, &acquire_in_progress, &error);
794 if (!best ||
795 best->km.dying > x->km.dying ||
796 (best->km.dying == x->km.dying &&
797 best->curlft.add_time < x->curlft.add_time))
798 best = x;
799 } else if (x->km.state == XFRM_STATE_ACQ) {
800 acquire_in_progress = 1;
801 } else if (x->km.state == XFRM_STATE_ERROR ||
802 x->km.state == XFRM_STATE_EXPIRED) {
803 if (xfrm_selector_match(&x->sel, fl, x->sel.family) &&
804 security_xfrm_state_pol_flow_match(x, pol, fl))
805 error = -ESRCH;
806 }
807 }
808 } 833 }
809 834
835found:
810 x = best; 836 x = best;
811 if (!x && !error && !acquire_in_progress) { 837 if (!x && !error && !acquire_in_progress) {
812 if (tmpl->id.spi && 838 if (tmpl->id.spi &&