Diffstat (limited to 'net')
-rw-r--r--  net/8021q/vlan.c | 2
-rw-r--r--  net/9p/trans_rdma.c | 1
-rw-r--r--  net/Kconfig | 6
-rw-r--r--  net/bridge/br_if.c | 4
-rw-r--r--  net/bridge/br_private.h | 2
-rw-r--r--  net/bridge/netfilter/ebt_ip6.c | 46
-rw-r--r--  net/bridge/netfilter/ebtables.c | 1
-rw-r--r--  net/caif/cfcnfg.c | 2
-rw-r--r--  net/caif/cfdgml.c | 1
-rw-r--r--  net/caif/cfserl.c | 1
-rw-r--r--  net/caif/cfutill.c | 2
-rw-r--r--  net/caif/cfveil.c | 2
-rw-r--r--  net/core/dev.c | 240
-rw-r--r--  net/core/ethtool.c | 2
-rw-r--r--  net/core/filter.c | 6
-rw-r--r--  net/core/neighbour.c | 13
-rw-r--r--  net/core/net-sysfs.c | 17
-rw-r--r--  net/core/pktgen.c | 234
-rw-r--r--  net/core/rtnetlink.c | 43
-rw-r--r--  net/core/skbuff.c | 4
-rw-r--r--  net/decnet/dn_table.c | 1
-rw-r--r--  net/ipv4/Kconfig | 4
-rw-r--r--  net/ipv4/af_inet.c | 2
-rw-r--r--  net/ipv4/fib_rules.c | 10
-rw-r--r--  net/ipv4/fib_semantics.c | 14
-rw-r--r--  net/ipv4/ip_input.c | 2
-rw-r--r--  net/ipv4/netfilter/Kconfig | 3
-rw-r--r--  net/ipv4/netfilter/arp_tables.c | 2
-rw-r--r--  net/ipv4/netfilter/ip_tables.c | 2
-rw-r--r--  net/ipv4/netfilter/ipt_CLUSTERIP.c | 7
-rw-r--r--  net/ipv4/netfilter/ipt_LOG.c | 3
-rw-r--r--  net/ipv4/netfilter/iptable_mangle.c | 2
-rw-r--r--  net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c | 17
-rw-r--r--  net/ipv4/netfilter/nf_nat_amanda.c | 8
-rw-r--r--  net/ipv4/netfilter/nf_nat_core.c | 33
-rw-r--r--  net/ipv4/netfilter/nf_nat_snmp_basic.c | 9
-rw-r--r--  net/ipv4/route.c | 26
-rw-r--r--  net/ipv4/tcp.c | 2
-rw-r--r--  net/ipv4/udp.c | 2
-rw-r--r--  net/ipv6/af_inet6.c | 2
-rw-r--r--  net/ipv6/netfilter/ip6_tables.c | 2
-rw-r--r--  net/ipv6/netfilter/ip6t_LOG.c | 3
-rw-r--r--  net/ipv6/netfilter/nf_conntrack_reasm.c | 3
-rw-r--r--  net/ipv6/raw.c | 14
-rw-r--r--  net/ipv6/sit.c | 23
-rw-r--r--  net/ipv6/udp.c | 2
-rw-r--r--  net/netfilter/Kconfig | 43
-rw-r--r--  net/netfilter/Makefile | 4
-rw-r--r--  net/netfilter/core.c | 20
-rw-r--r--  net/netfilter/ipvs/ip_vs_app.c | 98
-rw-r--r--  net/netfilter/ipvs/ip_vs_conn.c | 195
-rw-r--r--  net/netfilter/ipvs/ip_vs_core.c | 378
-rw-r--r--  net/netfilter/ipvs/ip_vs_ctl.c | 887
-rw-r--r--  net/netfilter/ipvs/ip_vs_est.c | 134
-rw-r--r--  net/netfilter/ipvs/ip_vs_ftp.c | 61
-rw-r--r--  net/netfilter/ipvs/ip_vs_lblc.c | 67
-rw-r--r--  net/netfilter/ipvs/ip_vs_lblcr.c | 72
-rw-r--r--  net/netfilter/ipvs/ip_vs_nfct.c | 6
-rw-r--r--  net/netfilter/ipvs/ip_vs_pe.c | 17
-rw-r--r--  net/netfilter/ipvs/ip_vs_pe_sip.c | 3
-rw-r--r--  net/netfilter/ipvs/ip_vs_proto.c | 125
-rw-r--r--  net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 45
-rw-r--r--  net/netfilter/ipvs/ip_vs_proto_sctp.c | 153
-rw-r--r--  net/netfilter/ipvs/ip_vs_proto_tcp.c | 142
-rw-r--r--  net/netfilter/ipvs/ip_vs_proto_udp.c | 110
-rw-r--r--  net/netfilter/ipvs/ip_vs_sync.c | 1235
-rw-r--r--  net/netfilter/ipvs/ip_vs_xmit.c | 26
-rw-r--r--  net/netfilter/nf_conntrack_broadcast.c | 82
-rw-r--r--  net/netfilter/nf_conntrack_core.c | 57
-rw-r--r--  net/netfilter/nf_conntrack_expect.c | 34
-rw-r--r--  net/netfilter/nf_conntrack_extend.c | 11
-rw-r--r--  net/netfilter/nf_conntrack_helper.c | 20
-rw-r--r--  net/netfilter/nf_conntrack_netbios_ns.c | 74
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c | 47
-rw-r--r--  net/netfilter/nf_conntrack_proto.c | 24
-rw-r--r--  net/netfilter/nf_conntrack_proto_dccp.c | 3
-rw-r--r--  net/netfilter/nf_conntrack_proto_sctp.c | 1
-rw-r--r--  net/netfilter/nf_conntrack_proto_tcp.c | 14
-rw-r--r--  net/netfilter/nf_conntrack_snmp.c | 77
-rw-r--r--  net/netfilter/nf_conntrack_standalone.c | 45
-rw-r--r--  net/netfilter/nf_conntrack_timestamp.c | 120
-rw-r--r--  net/netfilter/nf_log.c | 6
-rw-r--r--  net/netfilter/nf_queue.c | 82
-rw-r--r--  net/netfilter/nfnetlink_log.c | 6
-rw-r--r--  net/netfilter/nfnetlink_queue.c | 22
-rw-r--r--  net/netfilter/x_tables.c | 98
-rw-r--r--  net/netfilter/xt_AUDIT.c | 204
-rw-r--r--  net/netfilter/xt_CLASSIFY.c | 36
-rw-r--r--  net/netfilter/xt_IDLETIMER.c | 2
-rw-r--r--  net/netfilter/xt_LED.c | 2
-rw-r--r--  net/netfilter/xt_NFQUEUE.c | 34
-rw-r--r--  net/netfilter/xt_connlimit.c | 50
-rw-r--r--  net/netfilter/xt_conntrack.c | 75
-rw-r--r--  net/netfilter/xt_cpu.c | 2
-rw-r--r--  net/netfilter/xt_ipvs.c | 2
-rw-r--r--  net/packet/af_packet.c | 7
-rw-r--r--  net/rds/rds.h | 1
-rw-r--r--  net/sched/Kconfig | 17
-rw-r--r--  net/sched/Makefile | 1
-rw-r--r--  net/sched/act_api.c | 46
-rw-r--r--  net/sched/act_csum.c | 2
-rw-r--r--  net/sched/act_gact.c | 8
-rw-r--r--  net/sched/act_ipt.c | 16
-rw-r--r--  net/sched/act_mirred.c | 4
-rw-r--r--  net/sched/act_nat.c | 2
-rw-r--r--  net/sched/act_pedit.c | 10
-rw-r--r--  net/sched/act_police.c | 9
-rw-r--r--  net/sched/act_simple.c | 10
-rw-r--r--  net/sched/act_skbedit.c | 8
-rw-r--r--  net/sched/cls_api.c | 33
-rw-r--r--  net/sched/cls_basic.c | 17
-rw-r--r--  net/sched/cls_cgroup.c | 8
-rw-r--r--  net/sched/cls_flow.c | 6
-rw-r--r--  net/sched/cls_fw.c | 38
-rw-r--r--  net/sched/cls_route.c | 126
-rw-r--r--  net/sched/cls_rsvp.h | 95
-rw-r--r--  net/sched/cls_tcindex.c | 2
-rw-r--r--  net/sched/cls_u32.c | 77
-rw-r--r--  net/sched/em_cmp.c | 47
-rw-r--r--  net/sched/em_meta.c | 44
-rw-r--r--  net/sched/em_nbyte.c | 3
-rw-r--r--  net/sched/em_text.c | 3
-rw-r--r--  net/sched/em_u32.c | 2
-rw-r--r--  net/sched/ematch.c | 37
-rw-r--r--  net/sched/sch_api.c | 169
-rw-r--r--  net/sched/sch_atm.c | 16
-rw-r--r--  net/sched/sch_cbq.c | 362
-rw-r--r--  net/sched/sch_dsmark.c | 21
-rw-r--r--  net/sched/sch_fifo.c | 22
-rw-r--r--  net/sched/sch_generic.c | 40
-rw-r--r--  net/sched/sch_gred.c | 85
-rw-r--r--  net/sched/sch_hfsc.c | 37
-rw-r--r--  net/sched/sch_htb.c | 106
-rw-r--r--  net/sched/sch_mq.c | 1
-rw-r--r--  net/sched/sch_mqprio.c | 416
-rw-r--r--  net/sched/sch_multiq.c | 8
-rw-r--r--  net/sched/sch_netem.c | 8
-rw-r--r--  net/sched/sch_prio.c | 34
-rw-r--r--  net/sched/sch_red.c | 61
-rw-r--r--  net/sched/sch_sfq.c | 66
-rw-r--r--  net/sched/sch_tbf.c | 39
-rw-r--r--  net/sched/sch_teql.c | 36
-rw-r--r--  net/unix/af_unix.c | 66
-rw-r--r--  net/wanrouter/wanmain.c | 2
144 files changed, 5551 insertions, 2601 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 6e64f7c6a2e9..7850412f52b7 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -327,7 +327,7 @@ static void vlan_sync_address(struct net_device *dev,
 static void vlan_transfer_features(struct net_device *dev,
 				   struct net_device *vlandev)
 {
-	unsigned long old_features = vlandev->features;
+	u32 old_features = vlandev->features;
 
 	vlandev->features &= ~dev->vlan_features;
 	vlandev->features |= dev->features & dev->vlan_features;
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 17c5ba7551a5..29a54ccd213d 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -59,7 +59,6 @@
  * safely advertise a maxsize
  * of 64k */
 
-#define P9_RDMA_MAX_SGE (P9_RDMA_MAXSIZE >> PAGE_SHIFT)
 /**
  * struct p9_trans_rdma - RDMA transport instance
  *
diff --git a/net/Kconfig b/net/Kconfig
index 72840626284b..79cabf1ee68b 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -221,6 +221,12 @@ config RPS
 	depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
 	default y
 
+config RFS_ACCEL
+	boolean
+	depends on RPS && GENERIC_HARDIRQS
+	select CPU_RMAP
+	default y
+
 config XPS
 	boolean
 	depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index d9d1e2bac1d6..2a6801d8b728 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -365,7 +365,7 @@ int br_min_mtu(const struct net_bridge *br)
 void br_features_recompute(struct net_bridge *br)
 {
 	struct net_bridge_port *p;
-	unsigned long features, mask;
+	u32 features, mask;
 
 	features = mask = br->feature_mask;
 	if (list_empty(&br->port_list))
@@ -379,7 +379,7 @@ void br_features_recompute(struct net_bridge *br)
 	}
 
 done:
-	br->dev->features = netdev_fix_features(features, NULL);
+	br->dev->features = netdev_fix_features(br->dev, features);
 }
 
 /* called with RTNL */
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 84aac7734bfc..9f22898c5359 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -182,7 +182,7 @@ struct net_bridge
 	struct br_cpu_netstats __percpu *stats;
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
-	unsigned long			feature_mask;
+	u32				feature_mask;
 #ifdef CONFIG_BRIDGE_NETFILTER
 	struct rtable			fake_rtable;
 	bool				nf_call_iptables;
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 50a46afc2bcc..2ed0056a39a8 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -22,9 +22,15 @@
 #include <linux/netfilter_bridge/ebtables.h>
 #include <linux/netfilter_bridge/ebt_ip6.h>
 
-struct tcpudphdr {
-	__be16 src;
-	__be16 dst;
+union pkthdr {
+	struct {
+		__be16 src;
+		__be16 dst;
+	} tcpudphdr;
+	struct {
+		u8 type;
+		u8 code;
+	} icmphdr;
 };
 
 static bool
@@ -33,8 +39,8 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	const struct ebt_ip6_info *info = par->matchinfo;
 	const struct ipv6hdr *ih6;
 	struct ipv6hdr _ip6h;
-	const struct tcpudphdr *pptr;
-	struct tcpudphdr _ports;
+	const union pkthdr *pptr;
+	union pkthdr _pkthdr;
 
 	ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h);
 	if (ih6 == NULL)
@@ -56,26 +62,34 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
 			return false;
 		if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO))
 			return false;
-		if (!(info->bitmask & EBT_IP6_DPORT) &&
-		    !(info->bitmask & EBT_IP6_SPORT))
+		if (!(info->bitmask & ( EBT_IP6_DPORT |
+					EBT_IP6_SPORT | EBT_IP6_ICMP6)))
 			return true;
-		pptr = skb_header_pointer(skb, offset_ph, sizeof(_ports),
-					  &_ports);
+
+		/* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */
+		pptr = skb_header_pointer(skb, offset_ph, sizeof(_pkthdr),
+					  &_pkthdr);
 		if (pptr == NULL)
 			return false;
 		if (info->bitmask & EBT_IP6_DPORT) {
-			u32 dst = ntohs(pptr->dst);
+			u16 dst = ntohs(pptr->tcpudphdr.dst);
 			if (FWINV(dst < info->dport[0] ||
 				  dst > info->dport[1], EBT_IP6_DPORT))
 				return false;
 		}
 		if (info->bitmask & EBT_IP6_SPORT) {
-			u32 src = ntohs(pptr->src);
+			u16 src = ntohs(pptr->tcpudphdr.src);
 			if (FWINV(src < info->sport[0] ||
 				  src > info->sport[1], EBT_IP6_SPORT))
 				return false;
 		}
-		return true;
+		if ((info->bitmask & EBT_IP6_ICMP6) &&
+		    FWINV(pptr->icmphdr.type < info->icmpv6_type[0] ||
+			  pptr->icmphdr.type > info->icmpv6_type[1] ||
+			  pptr->icmphdr.code < info->icmpv6_code[0] ||
+			  pptr->icmphdr.code > info->icmpv6_code[1],
+			  EBT_IP6_ICMP6))
+			return false;
 	}
 	return true;
 }
@@ -103,6 +117,14 @@ static int ebt_ip6_mt_check(const struct xt_mtchk_param *par)
 		return -EINVAL;
 	if (info->bitmask & EBT_IP6_SPORT && info->sport[0] > info->sport[1])
 		return -EINVAL;
+	if (info->bitmask & EBT_IP6_ICMP6) {
+		if ((info->invflags & EBT_IP6_PROTO) ||
+		    info->protocol != IPPROTO_ICMPV6)
+			return -EINVAL;
+		if (info->icmpv6_type[0] > info->icmpv6_type[1] ||
+		    info->icmpv6_code[0] > info->icmpv6_code[1])
+			return -EINVAL;
+	}
 	return 0;
 }
 
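The ebt_ip6 change above folds the TCP/UDP port pair and the ICMPv6 type/code pair into one union so a single bounded skb_header_pointer() fetch serves both match kinds. Below is a standalone userspace sketch of the same overlay idea; the sample bytes, names, and main() harness are illustrative assumptions, not kernel code.

/* Userspace sketch only: view the first 4 bytes of an L4 header either
 * as TCP/UDP ports or as an ICMPv6 type/code pair, as the union above does. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

union pkthdr {
	struct { uint16_t src, dst; } tcpudphdr;
	struct { uint8_t type, code; } icmphdr;
};

int main(void)
{
	/* hypothetical first four bytes of a transport header */
	const uint8_t l4[4] = { 0x80, 0x00, 0x01, 0xbb };
	union pkthdr hdr;

	memcpy(&hdr, l4, sizeof(hdr));	/* one bounded copy covers both views */

	printf("as ports:  src=%u dst=%u\n",
	       ntohs(hdr.tcpudphdr.src), ntohs(hdr.tcpudphdr.dst));
	printf("as icmpv6: type=%u code=%u\n",
	       hdr.icmphdr.type, hdr.icmphdr.code);
	return 0;
}

Because both interpretations fit in four bytes, sizeof(_pkthdr) is a safe length for the single header fetch, which is exactly the point made by the comment in the patch.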
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 16df0532d4b9..5f1825df9dca 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1764,6 +1764,7 @@ static int compat_table_info(const struct ebt_table_info *info,
 
 	newinfo->entries_size = size;
 
+	xt_compat_init_offsets(AF_INET, info->nentries);
 	return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info,
 				 entries, newinfo);
 }
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index c665de778b60..f1f98d967d8a 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -23,10 +23,8 @@
 #include <asm/atomic.h>
 
 #define MAX_PHY_LAYERS 7
-#define PHY_NAME_LEN 20
 
 #define container_obj(layr) container_of(layr, struct cfcnfg, layer)
-#define RFM_FRAGMENT_SIZE 4030
 
 /* Information about CAIF physical interfaces held by Config Module in order
  * to manage physical interfaces
diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c
index d3ed264ad6c4..27dab26ad3b8 100644
--- a/net/caif/cfdgml.c
+++ b/net/caif/cfdgml.c
@@ -18,7 +18,6 @@
 #define DGM_CMD_BIT  0x80
 #define DGM_FLOW_OFF 0x81
 #define DGM_FLOW_ON  0x80
-#define DGM_CTRL_PKT_SIZE 1
 #define DGM_MTU 1500
 
 static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt);
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
index 9297f7dea9d8..8303fe3ebf89 100644
--- a/net/caif/cfserl.c
+++ b/net/caif/cfserl.c
@@ -25,7 +25,6 @@ struct cfserl {
 	spinlock_t sync;
 	bool usestx;
 };
-#define STXLEN(layr) (layr->usestx ? 1 : 0)
 
 static int cfserl_receive(struct cflayer *layr, struct cfpkt *pkt);
 static int cfserl_transmit(struct cflayer *layr, struct cfpkt *pkt);
diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c
index efad410e4c82..315c0d601368 100644
--- a/net/caif/cfutill.c
+++ b/net/caif/cfutill.c
@@ -20,7 +20,7 @@
 #define UTIL_REMOTE_SHUTDOWN 0x82
 #define UTIL_FLOW_OFF 0x81
 #define UTIL_FLOW_ON  0x80
-#define UTIL_CTRL_PKT_SIZE 1
+
 static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt);
 static int cfutill_transmit(struct cflayer *layr, struct cfpkt *pkt);
 
diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c
index 3b425b189a99..c3b1dec4acf6 100644
--- a/net/caif/cfveil.c
+++ b/net/caif/cfveil.c
@@ -17,7 +17,7 @@
 #define VEI_FLOW_OFF 0x81
 #define VEI_FLOW_ON  0x80
 #define VEI_SET_PIN  0x82
-#define VEI_CTRL_PKT_SIZE 1
+
 #define container_obj(layr) container_of(layr, struct cfsrvl, layer)
 
 static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt);
diff --git a/net/core/dev.c b/net/core/dev.c
index 24ea2d71e7ea..1b4c07fe295f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -132,6 +132,7 @@
 #include <trace/events/skb.h>
 #include <linux/pci.h>
 #include <linux/inetdevice.h>
+#include <linux/cpu_rmap.h>
 
 #include "net-sysfs.h"
 
@@ -1286,7 +1287,7 @@ static int __dev_close(struct net_device *dev)
 	return __dev_close_many(&single);
 }
 
-int dev_close_many(struct list_head *head)
+static int dev_close_many(struct list_head *head)
 {
 	struct net_device *dev, *tmp;
 	LIST_HEAD(tmp_list);
@@ -1594,6 +1595,48 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 	rcu_read_unlock();
 }
 
+/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
+ * @dev: Network device
+ * @txq: number of queues available
+ *
+ * If real_num_tx_queues is changed the tc mappings may no longer be
+ * valid. To resolve this verify the tc mapping remains valid and if
+ * not NULL the mapping. With no priorities mapping to this
+ * offset/count pair it will no longer be used. In the worst case TC0
+ * is invalid nothing can be done so disable priority mappings. If is
+ * expected that drivers will fix this mapping if they can before
+ * calling netif_set_real_num_tx_queues.
+ */
+static void netif_setup_tc(struct net_device *dev, unsigned int txq)
+{
+	int i;
+	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
+
+	/* If TC0 is invalidated disable TC mapping */
+	if (tc->offset + tc->count > txq) {
+		pr_warning("Number of in use tx queues changed "
+			   "invalidating tc mappings. Priority "
+			   "traffic classification disabled!\n");
+		dev->num_tc = 0;
+		return;
+	}
+
+	/* Invalidated prio to tc mappings set to TC0 */
+	for (i = 1; i < TC_BITMASK + 1; i++) {
+		int q = netdev_get_prio_tc_map(dev, i);
+
+		tc = &dev->tc_to_txq[q];
+		if (tc->offset + tc->count > txq) {
+			pr_warning("Number of in use tx queues "
+				   "changed. Priority %i to tc "
+				   "mapping %i is no longer valid "
+				   "setting map to 0\n",
+				   i, q);
+			netdev_set_prio_tc_map(dev, i, 0);
+		}
+	}
+}
+
 /*
  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
@@ -1613,6 +1656,9 @@ int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
 		if (rc)
 			return rc;
 
+		if (dev->num_tc)
+			netif_setup_tc(dev, txq);
+
 		if (txq < dev->real_num_tx_queues)
 			qdisc_reset_all_tx_gt(dev, txq);
 	}
@@ -1812,7 +1858,7 @@ EXPORT_SYMBOL(skb_checksum_help);
  * It may return NULL if the skb requires no segmentation. This is
  * only possible when GSO is used for verifying header integrity.
  */
-struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
+struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
 {
 	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
 	struct packet_type *ptype;
@@ -2000,7 +2046,7 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)
 		 protocol == htons(ETH_P_FCOE)));
 }
 
-static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features)
+static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
 {
 	if (!can_checksum_protocol(features, protocol)) {
 		features &= ~NETIF_F_ALL_CSUM;
@@ -2012,10 +2058,10 @@ static int harmonize_features(struct sk_buff *skb, __be16 protocol, int features
 	return features;
 }
 
-int netif_skb_features(struct sk_buff *skb)
+u32 netif_skb_features(struct sk_buff *skb)
 {
 	__be16 protocol = skb->protocol;
-	int features = skb->dev->features;
+	u32 features = skb->dev->features;
 
 	if (protocol == htons(ETH_P_8021Q)) {
 		struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
@@ -2060,7 +2106,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 	int rc = NETDEV_TX_OK;
 
 	if (likely(!skb->next)) {
-		int features;
+		u32 features;
 
 		/*
 		 * If device doesnt need skb->dst, release it right now while
@@ -2162,6 +2208,8 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
 		  unsigned int num_tx_queues)
 {
 	u32 hash;
+	u16 qoffset = 0;
+	u16 qcount = num_tx_queues;
 
 	if (skb_rx_queue_recorded(skb)) {
 		hash = skb_get_rx_queue(skb);
@@ -2170,13 +2218,19 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
 		return hash;
 	}
 
+	if (dev->num_tc) {
+		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
+		qoffset = dev->tc_to_txq[tc].offset;
+		qcount = dev->tc_to_txq[tc].count;
+	}
+
 	if (skb->sk && skb->sk->sk_hash)
 		hash = skb->sk->sk_hash;
 	else
 		hash = (__force u16) skb->protocol ^ skb->rxhash;
 	hash = jhash_1word(hash, hashrnd);
 
-	return (u16) (((u64) hash * num_tx_queues) >> 32);
+	return (u16) (((u64) hash * qcount) >> 32) + qoffset;
 }
 EXPORT_SYMBOL(__skb_tx_hash);
 
@@ -2273,15 +2327,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 				 struct netdev_queue *txq)
 {
 	spinlock_t *root_lock = qdisc_lock(q);
-	bool contended = qdisc_is_running(q);
+	bool contended;
 	int rc;
 
+	qdisc_skb_cb(skb)->pkt_len = skb->len;
+	qdisc_calculate_pkt_len(skb, q);
 	/*
 	 * Heuristic to force contended enqueues to serialize on a
 	 * separate lock before trying to get qdisc main lock.
 	 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
 	 * and dequeue packets faster.
 	 */
+	contended = qdisc_is_running(q);
 	if (unlikely(contended))
 		spin_lock(&q->busylock);
 
@@ -2299,7 +2356,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
 			skb_dst_force(skb);
 
-		qdisc_skb_cb(skb)->pkt_len = skb->len;
 		qdisc_bstats_update(q, skb);
 
 		if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
@@ -2314,7 +2370,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 		rc = NET_XMIT_SUCCESS;
 	} else {
 		skb_dst_force(skb);
-		rc = qdisc_enqueue_root(skb, q);
+		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
 		if (qdisc_run_begin(q)) {
 			if (unlikely(contended)) {
 				spin_unlock(&q->busylock);
@@ -2533,6 +2589,53 @@ EXPORT_SYMBOL(__skb_get_rxhash);
 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
 EXPORT_SYMBOL(rps_sock_flow_table);
 
+static struct rps_dev_flow *
+set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+	    struct rps_dev_flow *rflow, u16 next_cpu)
+{
+	u16 tcpu;
+
+	tcpu = rflow->cpu = next_cpu;
+	if (tcpu != RPS_NO_CPU) {
+#ifdef CONFIG_RFS_ACCEL
+		struct netdev_rx_queue *rxqueue;
+		struct rps_dev_flow_table *flow_table;
+		struct rps_dev_flow *old_rflow;
+		u32 flow_id;
+		u16 rxq_index;
+		int rc;
+
+		/* Should we steer this flow to a different hardware queue? */
+		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap)
+			goto out;
+		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
+		if (rxq_index == skb_get_rx_queue(skb))
+			goto out;
+
+		rxqueue = dev->_rx + rxq_index;
+		flow_table = rcu_dereference(rxqueue->rps_flow_table);
+		if (!flow_table)
+			goto out;
+		flow_id = skb->rxhash & flow_table->mask;
+		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
+							rxq_index, flow_id);
+		if (rc < 0)
+			goto out;
+		old_rflow = rflow;
+		rflow = &flow_table->flows[flow_id];
+		rflow->cpu = next_cpu;
+		rflow->filter = rc;
+		if (old_rflow->filter == rflow->filter)
+			old_rflow->filter = RPS_NO_FILTER;
+	out:
+#endif
+		rflow->last_qtail =
+			per_cpu(softnet_data, tcpu).input_queue_head;
+	}
+
+	return rflow;
+}
+
 /*
  * get_rps_cpu is called from netif_receive_skb and returns the target
  * CPU from the RPS map of the receiving queue for a given skb.
@@ -2603,12 +2706,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 		if (unlikely(tcpu != next_cpu) &&
 		    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
 		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
-		      rflow->last_qtail)) >= 0)) {
-			tcpu = rflow->cpu = next_cpu;
-			if (tcpu != RPS_NO_CPU)
-				rflow->last_qtail = per_cpu(softnet_data,
-				    tcpu).input_queue_head;
-		}
+		      rflow->last_qtail)) >= 0))
+			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+
 		if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
 			*rflowp = rflow;
 			cpu = tcpu;
@@ -2629,6 +2729,46 @@ done:
 	return cpu;
 }
 
+#ifdef CONFIG_RFS_ACCEL
+
+/**
+ * rps_may_expire_flow - check whether an RFS hardware filter may be removed
+ * @dev: Device on which the filter was set
+ * @rxq_index: RX queue index
+ * @flow_id: Flow ID passed to ndo_rx_flow_steer()
+ * @filter_id: Filter ID returned by ndo_rx_flow_steer()
+ *
+ * Drivers that implement ndo_rx_flow_steer() should periodically call
+ * this function for each installed filter and remove the filters for
+ * which it returns %true.
+ */
+bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
+			 u32 flow_id, u16 filter_id)
+{
+	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
+	struct rps_dev_flow_table *flow_table;
+	struct rps_dev_flow *rflow;
+	bool expire = true;
+	int cpu;
+
+	rcu_read_lock();
+	flow_table = rcu_dereference(rxqueue->rps_flow_table);
+	if (flow_table && flow_id <= flow_table->mask) {
+		rflow = &flow_table->flows[flow_id];
+		cpu = ACCESS_ONCE(rflow->cpu);
+		if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
+		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
+			   rflow->last_qtail) <
+		     (int)(10 * flow_table->mask)))
+			expire = false;
+	}
+	rcu_read_unlock();
+	return expire;
+}
+EXPORT_SYMBOL(rps_may_expire_flow);
+
+#endif /* CONFIG_RFS_ACCEL */
+
 /* Called from hardirq (IPI) context */
 static void rps_trigger_softirq(void *data)
 {
@@ -4573,6 +4713,17 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
 EXPORT_SYMBOL(dev_set_mtu);
 
 /**
+ *	dev_set_group - Change group this device belongs to
+ *	@dev: device
+ *	@new_group: group this device should belong to
+ */
+void dev_set_group(struct net_device *dev, int new_group)
+{
+	dev->group = new_group;
+}
+EXPORT_SYMBOL(dev_set_group);
+
+/**
  *	dev_set_mac_address - Change Media Access Control Address
  *	@dev: device
  *	@sa: new address
@@ -5062,41 +5213,49 @@ static void rollback_registered(struct net_device *dev)
 	rollback_registered_many(&single);
 }
 
-unsigned long netdev_fix_features(unsigned long features, const char *name)
+u32 netdev_fix_features(struct net_device *dev, u32 features)
 {
+	/* Fix illegal checksum combinations */
+	if ((features & NETIF_F_HW_CSUM) &&
+	    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+		netdev_info(dev, "mixed HW and IP checksum settings.\n");
+		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
+	}
+
+	if ((features & NETIF_F_NO_CSUM) &&
+	    (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+		netdev_info(dev, "mixed no checksumming and other settings.\n");
+		features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
+	}
+
 	/* Fix illegal SG+CSUM combinations. */
 	if ((features & NETIF_F_SG) &&
 	    !(features & NETIF_F_ALL_CSUM)) {
-		if (name)
-			printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
-			       "checksum feature.\n", name);
+		netdev_info(dev,
+			    "Dropping NETIF_F_SG since no checksum feature.\n");
 		features &= ~NETIF_F_SG;
 	}
 
 	/* TSO requires that SG is present as well. */
 	if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
-		if (name)
-			printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
-			       "SG feature.\n", name);
+		netdev_info(dev, "Dropping NETIF_F_TSO since no SG feature.\n");
 		features &= ~NETIF_F_TSO;
 	}
 
+	/* UFO needs SG and checksumming */
 	if (features & NETIF_F_UFO) {
 		/* maybe split UFO into V4 and V6? */
 		if (!((features & NETIF_F_GEN_CSUM) ||
 		    (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
 			    == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
-			if (name)
-				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
-				       "since no checksum offload features.\n",
-				       name);
+			netdev_info(dev,
+				    "Dropping NETIF_F_UFO since no checksum offload features.\n");
 			features &= ~NETIF_F_UFO;
 		}
 
 		if (!(features & NETIF_F_SG)) {
-			if (name)
-				printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
-				       "since no NETIF_F_SG feature.\n", name);
+			netdev_info(dev,
+				    "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
 			features &= ~NETIF_F_UFO;
 		}
 	}
@@ -5239,22 +5398,7 @@ int register_netdevice(struct net_device *dev)
 	if (dev->iflink == -1)
 		dev->iflink = dev->ifindex;
 
-	/* Fix illegal checksum combinations */
-	if ((dev->features & NETIF_F_HW_CSUM) &&
-	    (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
-		printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
-		       dev->name);
-		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
-	}
-
-	if ((dev->features & NETIF_F_NO_CSUM) &&
-	    (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
-		printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
-		       dev->name);
-		dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
-	}
-
-	dev->features = netdev_fix_features(dev->features, dev->name);
+	dev->features = netdev_fix_features(dev, dev->features);
 
 	/* Enable software GSO if SG is supported. */
 	if (dev->features & NETIF_F_SG)
@@ -5679,6 +5823,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	dev->priv_flags = IFF_XMIT_DST_RELEASE;
 	setup(dev);
 	strcpy(dev->name, name);
+	dev->group = INIT_NETDEV_GROUP;
 	return dev;
 
 free_pcpu:
@@ -5989,8 +6134,7 @@ static int dev_cpu_callback(struct notifier_block *nfb,
  *	@one to the master device with current feature set @all. Will not
  *	enable anything that is off in @mask. Returns the new feature set.
  */
-unsigned long netdev_increment_features(unsigned long all, unsigned long one,
-					unsigned long mask)
+u32 netdev_increment_features(u32 all, u32 one, u32 mask)
 {
 	/* If device needs checksumming, downgrade to it. */
 	if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
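One change worth illustrating from the net/core/dev.c diff above: __skb_tx_hash() now confines the selected tx queue to the skb's traffic class by scaling the flow hash into that class's offset/count window instead of across all queues. A minimal standalone sketch of that arithmetic follows; the two-class queue layout and hash values are made-up assumptions for demonstration only.

/* Sketch of the qoffset/qcount queue selection used by the patched
 * __skb_tx_hash(): multiply-and-shift keeps the result inside
 * [offset, offset + count) without a modulo. */
#include <stdint.h>
#include <stdio.h>

struct tc_txq { uint16_t count, offset; };

static uint16_t pick_queue(uint32_t hash, const struct tc_txq *tc)
{
	return (uint16_t)(((uint64_t)hash * tc->count) >> 32) + tc->offset;
}

int main(void)
{
	/* hypothetical device: TC0 -> queues 0-3, TC1 -> queues 4-7 */
	struct tc_txq tcs[2] = { { 4, 0 }, { 4, 4 } };
	uint32_t hashes[3] = { 0x12345678u, 0xdeadbeefu, 0xffffffffu };
	int tc;
	unsigned int i;

	for (tc = 0; tc < 2; tc++)
		for (i = 0; i < 3; i++)
			printf("tc%d hash %08x -> txq %u\n",
			       tc, hashes[i], pick_queue(hashes[i], &tcs[tc]));
	return 0;
}

The same scaled-multiply trick was already used for the whole queue range; the patch only narrows it to the per-class window looked up via netdev_get_prio_tc_map().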
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index ff2302910b5e..5984ee0c7136 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -1458,7 +1458,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	void __user *useraddr = ifr->ifr_data;
 	u32 ethcmd;
 	int rc;
-	unsigned long old_features;
+	u32 old_features;
 
 	if (!dev || !netif_device_present(dev))
 		return -ENODEV;
diff --git a/net/core/filter.c b/net/core/filter.c
index afc58374ca96..232b1873bb28 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -142,14 +142,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
 	if (err)
 		return err;
 
-	rcu_read_lock_bh();
-	filter = rcu_dereference_bh(sk->sk_filter);
+	rcu_read_lock();
+	filter = rcu_dereference(sk->sk_filter);
 	if (filter) {
 		unsigned int pkt_len = sk_run_filter(skb, filter->insns);
 
 		err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
 	}
-	rcu_read_unlock_bh();
+	rcu_read_unlock();
 
 	return err;
 }
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 60a902913429..799f06e03a22 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -316,7 +316,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
 {
 	size_t size = entries * sizeof(struct neighbour *);
 	struct neigh_hash_table *ret;
-	struct neighbour **buckets;
+	struct neighbour __rcu **buckets;
 
 	ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
 	if (!ret)
@@ -324,14 +324,14 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
 	if (size <= PAGE_SIZE)
 		buckets = kzalloc(size, GFP_ATOMIC);
 	else
-		buckets = (struct neighbour **)
+		buckets = (struct neighbour __rcu **)
 			__get_free_pages(GFP_ATOMIC | __GFP_ZERO,
 					 get_order(size));
 	if (!buckets) {
 		kfree(ret);
 		return NULL;
 	}
-	rcu_assign_pointer(ret->hash_buckets, buckets);
+	ret->hash_buckets = buckets;
 	ret->hash_mask = entries - 1;
 	get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
 	return ret;
@@ -343,7 +343,7 @@ static void neigh_hash_free_rcu(struct rcu_head *head)
 					    struct neigh_hash_table,
 					    rcu);
 	size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *);
-	struct neighbour **buckets = nht->hash_buckets;
+	struct neighbour __rcu **buckets = nht->hash_buckets;
 
 	if (size <= PAGE_SIZE)
 		kfree(buckets);
@@ -1540,7 +1540,7 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
 		panic("cannot create neighbour proc dir entry");
 #endif
 
-	tbl->nht = neigh_hash_alloc(8);
+	RCU_INIT_POINTER(tbl->nht, neigh_hash_alloc(8));
 
 	phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
 	tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
@@ -1602,7 +1602,8 @@ int neigh_table_clear(struct neigh_table *tbl)
 	}
 	write_unlock(&neigh_tbl_lock);
 
-	call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu);
+	call_rcu(&rcu_dereference_protected(tbl->nht, 1)->rcu,
+		 neigh_hash_free_rcu);
 	tbl->nht = NULL;
 
 	kfree(tbl->phash_buckets);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e23c01be5a5b..2e4a393dfc3b 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -99,7 +99,7 @@ NETDEVICE_SHOW(addr_assign_type, fmt_dec);
 NETDEVICE_SHOW(addr_len, fmt_dec);
 NETDEVICE_SHOW(iflink, fmt_dec);
 NETDEVICE_SHOW(ifindex, fmt_dec);
-NETDEVICE_SHOW(features, fmt_long_hex);
+NETDEVICE_SHOW(features, fmt_hex);
 NETDEVICE_SHOW(type, fmt_dec);
 NETDEVICE_SHOW(link_mode, fmt_dec);
 
@@ -295,6 +295,20 @@ static ssize_t show_ifalias(struct device *dev,
 	return ret;
 }
 
+NETDEVICE_SHOW(group, fmt_dec);
+
+static int change_group(struct net_device *net, unsigned long new_group)
+{
+	dev_set_group(net, (int) new_group);
+	return 0;
+}
+
+static ssize_t store_group(struct device *dev, struct device_attribute *attr,
+			   const char *buf, size_t len)
+{
+	return netdev_store(dev, attr, buf, len, change_group);
+}
+
 static struct device_attribute net_class_attributes[] = {
 	__ATTR(addr_assign_type, S_IRUGO, show_addr_assign_type, NULL),
 	__ATTR(addr_len, S_IRUGO, show_addr_len, NULL),
@@ -316,6 +330,7 @@ static struct device_attribute net_class_attributes[] = {
 	__ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags),
 	__ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len,
 	       store_tx_queue_len),
+	__ATTR(group, S_IRUGO | S_IWUSR, show_group, store_group),
 	{}
 };
 
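The net-sysfs.c hunks above expose the new netdev group as a read/write sysfs attribute next to the existing per-device files. A hedged userspace sketch of driving it from C follows; the interface name and group number are assumptions, and the path simply follows the usual /sys/class/net/<ifname>/<attribute> layout implied by the patch.

/* Userspace sketch: move a hypothetical eth0 into device group 5 via the
 * "group" attribute added above. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/class/net/eth0/group"; /* assumed ifname */
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "5\n");
	fclose(f);
	return 0;
}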
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index a9e7fc4c461f..d73b77adb676 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -251,6 +251,7 @@ struct pktgen_dev {
 	int max_pkt_size;	/* = ETH_ZLEN; */
 	int pkt_overhead;	/* overhead for MPLS, VLANs, IPSEC etc */
 	int nfrags;
+	struct page *page;
 	u64 delay;		/* nano-seconds */
 
 	__u64 count;		/* Default No packets to send */
@@ -1134,6 +1135,10 @@ static ssize_t pktgen_if_write(struct file *file,
 			if (node_possible(value)) {
 				pkt_dev->node = value;
 				sprintf(pg_result, "OK: node=%d", pkt_dev->node);
+				if (pkt_dev->page) {
+					put_page(pkt_dev->page);
+					pkt_dev->page = NULL;
+				}
 			}
 			else
 				sprintf(pg_result, "ERROR: node not possible");
@@ -2605,6 +2610,90 @@ static inline __be16 build_tci(unsigned int id, unsigned int cfi,
 	return htons(id | (cfi << 12) | (prio << 13));
 }
 
+static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
+				int datalen)
+{
+	struct timeval timestamp;
+	struct pktgen_hdr *pgh;
+
+	pgh = (struct pktgen_hdr *)skb_put(skb, sizeof(*pgh));
+	datalen -= sizeof(*pgh);
+
+	if (pkt_dev->nfrags <= 0) {
+		pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
+		memset(pgh + 1, 0, datalen);
+	} else {
+		int frags = pkt_dev->nfrags;
+		int i, len;
+
+
+		if (frags > MAX_SKB_FRAGS)
+			frags = MAX_SKB_FRAGS;
+		len = datalen - frags * PAGE_SIZE;
+		if (len > 0) {
+			memset(skb_put(skb, len), 0, len);
+			datalen = frags * PAGE_SIZE;
+		}
+
+		i = 0;
+		while (datalen > 0) {
+			if (unlikely(!pkt_dev->page)) {
+				int node = numa_node_id();
+
+				if (pkt_dev->node >= 0 && (pkt_dev->flags & F_NODE))
+					node = pkt_dev->node;
+				pkt_dev->page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
+				if (!pkt_dev->page)
+					break;
+			}
+			skb_shinfo(skb)->frags[i].page = pkt_dev->page;
+			get_page(pkt_dev->page);
+			skb_shinfo(skb)->frags[i].page_offset = 0;
+			skb_shinfo(skb)->frags[i].size =
+			    (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
+			datalen -= skb_shinfo(skb)->frags[i].size;
+			skb->len += skb_shinfo(skb)->frags[i].size;
+			skb->data_len += skb_shinfo(skb)->frags[i].size;
+			i++;
+			skb_shinfo(skb)->nr_frags = i;
+		}
+
+		while (i < frags) {
+			int rem;
+
+			if (i == 0)
+				break;
+
+			rem = skb_shinfo(skb)->frags[i - 1].size / 2;
+			if (rem == 0)
+				break;
+
+			skb_shinfo(skb)->frags[i - 1].size -= rem;
+
+			skb_shinfo(skb)->frags[i] =
+			    skb_shinfo(skb)->frags[i - 1];
+			get_page(skb_shinfo(skb)->frags[i].page);
+			skb_shinfo(skb)->frags[i].page =
+			    skb_shinfo(skb)->frags[i - 1].page;
+			skb_shinfo(skb)->frags[i].page_offset +=
+			    skb_shinfo(skb)->frags[i - 1].size;
+			skb_shinfo(skb)->frags[i].size = rem;
+			i++;
+			skb_shinfo(skb)->nr_frags = i;
+		}
+	}
+
+	/* Stamp the time, and sequence number,
+	 * convert them to network byte order
+	 */
+	pgh->pgh_magic = htonl(PKTGEN_MAGIC);
+	pgh->seq_num = htonl(pkt_dev->seq_num);
+
+	do_gettimeofday(&timestamp);
+	pgh->tv_sec = htonl(timestamp.tv_sec);
+	pgh->tv_usec = htonl(timestamp.tv_usec);
+}
+
 static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 					struct pktgen_dev *pkt_dev)
 {
@@ -2613,7 +2702,6 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 	struct udphdr *udph;
 	int datalen, iplen;
 	struct iphdr *iph;
-	struct pktgen_hdr *pgh = NULL;
 	__be16 protocol = htons(ETH_P_IP);
 	__be32 *mpls;
 	__be16 *vlan_tci = NULL;                 /* Encapsulates priority and VLAN ID */
@@ -2729,76 +2817,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
 			     pkt_dev->pkt_overhead);
 	skb->dev = odev;
 	skb->pkt_type = PACKET_HOST;
-
-	if (pkt_dev->nfrags <= 0) {
-		pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
-		memset(pgh + 1, 0, datalen - sizeof(struct pktgen_hdr));
-	} else {
-		int frags = pkt_dev->nfrags;
-		int i, len;
-
-		pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
-
-		if (frags > MAX_SKB_FRAGS)
-			frags = MAX_SKB_FRAGS;
-		if (datalen > frags * PAGE_SIZE) {
-			len = datalen - frags * PAGE_SIZE;
-			memset(skb_put(skb, len), 0, len);
-			datalen = frags * PAGE_SIZE;
-		}
-
-		i = 0;
-		while (datalen > 0) {
-			struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
-			skb_shinfo(skb)->frags[i].page = page;
-			skb_shinfo(skb)->frags[i].page_offset = 0;
-			skb_shinfo(skb)->frags[i].size =
-			    (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
-			datalen -= skb_shinfo(skb)->frags[i].size;
-			skb->len += skb_shinfo(skb)->frags[i].size;
-			skb->data_len += skb_shinfo(skb)->frags[i].size;
-			i++;
-			skb_shinfo(skb)->nr_frags = i;
-		}
-
-		while (i < frags) {
-			int rem;
-
-			if (i == 0)
-				break;
-
-			rem = skb_shinfo(skb)->frags[i - 1].size / 2;
-			if (rem == 0)
-				break;
-
-			skb_shinfo(skb)->frags[i - 1].size -= rem;
-
-			skb_shinfo(skb)->frags[i] =
-			    skb_shinfo(skb)->frags[i - 1];
-			get_page(skb_shinfo(skb)->frags[i].page);
-			skb_shinfo(skb)->frags[i].page =
-			    skb_shinfo(skb)->frags[i - 1].page;
-			skb_shinfo(skb)->frags[i].page_offset +=
-			    skb_shinfo(skb)->frags[i - 1].size;
-			skb_shinfo(skb)->frags[i].size = rem;
-			i++;
-			skb_shinfo(skb)->nr_frags = i;
-		}
-	}
-
-	/* Stamp the time, and sequence number,
-	 * convert them to network byte order
-	 */
-	if (pgh) {
-		struct timeval timestamp;
-
-		pgh->pgh_magic = htonl(PKTGEN_MAGIC);
-		pgh->seq_num = htonl(pkt_dev->seq_num);
-
-		do_gettimeofday(&timestamp);
-		pgh->tv_sec = htonl(timestamp.tv_sec);
-		pgh->tv_usec = htonl(timestamp.tv_usec);
-	}
+	pktgen_finalize_skb(pkt_dev, skb, datalen);
 
 #ifdef CONFIG_XFRM
 	if (!process_ipsec(pkt_dev, skb, protocol))
@@ -2980,7 +2999,6 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	struct udphdr *udph;
 	int datalen;
 	struct ipv6hdr *iph;
-	struct pktgen_hdr *pgh = NULL;
 	__be16 protocol = htons(ETH_P_IPV6);
 	__be32 *mpls;
 	__be16 *vlan_tci = NULL;                 /* Encapsulates priority and VLAN ID */
@@ -3083,75 +3101,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
 	skb->dev = odev;
 	skb->pkt_type = PACKET_HOST;
 
-	if (pkt_dev->nfrags <= 0)
-		pgh = (struct pktgen_hdr *)skb_put(skb, datalen);
-	else {
-		int frags = pkt_dev->nfrags;
-		int i;
-
-		pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8);
-
-		if (frags > MAX_SKB_FRAGS)
-			frags = MAX_SKB_FRAGS;
-		if (datalen > frags * PAGE_SIZE) {
-			skb_put(skb, datalen - frags * PAGE_SIZE);
-			datalen = frags * PAGE_SIZE;
-		}
-
-		i = 0;
-		while (datalen > 0) {
-			struct page *page = alloc_pages(GFP_KERNEL, 0);
-			skb_shinfo(skb)->frags[i].page = page;
-			skb_shinfo(skb)->frags[i].page_offset = 0;
-			skb_shinfo(skb)->frags[i].size =
-			    (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
-			datalen -= skb_shinfo(skb)->frags[i].size;
-			skb->len += skb_shinfo(skb)->frags[i].size;
-			skb->data_len += skb_shinfo(skb)->frags[i].size;
-			i++;
-			skb_shinfo(skb)->nr_frags = i;
-		}
-
-		while (i < frags) {
-			int rem;
-
-			if (i == 0)
-				break;
-
-			rem = skb_shinfo(skb)->frags[i - 1].size / 2;
-			if (rem == 0)
-				break;
-
-			skb_shinfo(skb)->frags[i - 1].size -= rem;
-
-			skb_shinfo(skb)->frags[i] =
-			    skb_shinfo(skb)->frags[i - 1];
-			get_page(skb_shinfo(skb)->frags[i].page);
-			skb_shinfo(skb)->frags[i].page =
-			    skb_shinfo(skb)->frags[i - 1].page;
-			skb_shinfo(skb)->frags[i].page_offset +=
-			    skb_shinfo(skb)->frags[i - 1].size;
-			skb_shinfo(skb)->frags[i].size = rem;
-			i++;
-			skb_shinfo(skb)->nr_frags = i;
-		}
-	}
-
-	/* Stamp the time, and sequence number,
-	 * convert them to network byte order
-	 * should we update cloned packets too ?
-	 */
-	if (pgh) {
-		struct timeval timestamp;
-
-		pgh->pgh_magic = htonl(PKTGEN_MAGIC);
-		pgh->seq_num = htonl(pkt_dev->seq_num);
-
-		do_gettimeofday(&timestamp);
-		pgh->tv_sec = htonl(timestamp.tv_sec);
-		pgh->tv_usec = htonl(timestamp.tv_usec);
-	}
-	/* pkt_dev->seq_num++; FF: you really mean this? */
+	pktgen_finalize_skb(pkt_dev, skb, datalen);
 
 	return skb;
 }
@@ -3884,6 +3834,8 @@ static int pktgen_remove_device(struct pktgen_thread *t,
 	free_SAs(pkt_dev);
 #endif
 	vfree(pkt_dev->flows);
+	if (pkt_dev->page)
+		put_page(pkt_dev->page);
 	kfree(pkt_dev);
 	return 0;
 }
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 750db57f3bb3..c668f8c371b2 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -868,6 +868,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 		   netif_running(dev) ? dev->operstate : IF_OPER_DOWN);
 	NLA_PUT_U8(skb, IFLA_LINKMODE, dev->link_mode);
 	NLA_PUT_U32(skb, IFLA_MTU, dev->mtu);
+	NLA_PUT_U32(skb, IFLA_GROUP, dev->group);
 
 	if (dev->ifindex != dev->iflink)
 		NLA_PUT_U32(skb, IFLA_LINK, dev->iflink);
@@ -1265,6 +1266,11 @@ static int do_setlink(struct net_device *dev, struct ifinfomsg *ifm,
 		modified = 1;
 	}
 
+	if (tb[IFLA_GROUP]) {
+		dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
+		modified = 1;
+	}
+
 	/*
 	 * Interface selected by interface index but interface
 	 * name provided implies that a name change has been
@@ -1542,6 +1548,8 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
 		set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE]));
 	if (tb[IFLA_LINKMODE])
 		dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]);
+	if (tb[IFLA_GROUP])
+		dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP]));
 
 	return dev;
 
@@ -1552,6 +1560,24 @@ err:
 }
 EXPORT_SYMBOL(rtnl_create_link);
 
+static int rtnl_group_changelink(struct net *net, int group,
+		struct ifinfomsg *ifm,
+		struct nlattr **tb)
+{
+	struct net_device *dev;
+	int err;
+
+	for_each_netdev(net, dev) {
+		if (dev->group == group) {
+			err = do_setlink(dev, ifm, tb, NULL, 0);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	return 0;
+}
+
 static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
 {
 	struct net *net = sock_net(skb->sk);
@@ -1579,10 +1605,12 @@ replay:
 	ifm = nlmsg_data(nlh);
 	if (ifm->ifi_index > 0)
 		dev = __dev_get_by_index(net, ifm->ifi_index);
-	else if (ifname[0])
-		dev = __dev_get_by_name(net, ifname);
-	else
-		dev = NULL;
+	else {
+		if (ifname[0])
+			dev = __dev_get_by_name(net, ifname);
+		else
+			dev = NULL;
+	}
 
 	err = validate_linkmsg(dev, tb);
 	if (err < 0)
@@ -1646,8 +1674,13 @@ replay:
 		return do_setlink(dev, ifm, tb, ifname, modified);
 	}
 
-	if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+	if (!(nlh->nlmsg_flags & NLM_F_CREATE)) {
+		if (ifm->ifi_index == 0 && tb[IFLA_GROUP])
+			return rtnl_group_changelink(net,
+					nla_get_u32(tb[IFLA_GROUP]),
+					ifm, tb);
 		return -ENODEV;
+	}
 
 	if (ifm->ifi_index)
 		return -EOPNOTSUPP;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7cd1bc86d591..a8b1e3c70d23 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2497,7 +2497,7 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);
  *	a pointer to the first in a list of new skbs for the segments.
  *	In case of error it returns ERR_PTR(err).
  */
-struct sk_buff *skb_segment(struct sk_buff *skb, int features)
+struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
 {
 	struct sk_buff *segs = NULL;
 	struct sk_buff *tail = NULL;
@@ -2507,7 +2507,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, int features)
 	unsigned int offset = doffset;
 	unsigned int headroom;
 	unsigned int len;
-	int sg = features & NETIF_F_SG;
+	int sg = !!(features & NETIF_F_SG);
 	int nfrags = skb_shinfo(skb)->nr_frags;
 	int err = -ENOMEM;
 	int i = 0;
diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c
index f2abd3755690..b66600b3f4b5 100644
--- a/net/decnet/dn_table.c
+++ b/net/decnet/dn_table.c
@@ -59,7 +59,6 @@ struct dn_hash
59}; 59};
60 60
61#define dz_key_0(key) ((key).datum = 0) 61#define dz_key_0(key) ((key).datum = 0)
62#define dz_prefix(key,dz) ((key).datum)
63 62
64#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\ 63#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\
65 for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) 64 for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index a5a1050595d1..8949a05ac307 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -140,6 +140,9 @@ config IP_ROUTE_VERBOSE
140 handled by the klogd daemon which is responsible for kernel messages 140 handled by the klogd daemon which is responsible for kernel messages
141 ("man klogd"). 141 ("man klogd").
142 142
143config IP_ROUTE_CLASSID
144 bool
145
143config IP_PNP 146config IP_PNP
144 bool "IP: kernel level autoconfiguration" 147 bool "IP: kernel level autoconfiguration"
145 help 148 help
@@ -657,4 +660,3 @@ config TCP_MD5SIG
657 on the Internet. 660 on the Internet.
658 661
659 If unsure, say N. 662 If unsure, say N.
660
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index f2b61107df6c..e5e2d9d64abb 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1215,7 +1215,7 @@ out:
1215 return err; 1215 return err;
1216} 1216}
1217 1217
1218static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int features) 1218static struct sk_buff *inet_gso_segment(struct sk_buff *skb, u32 features)
1219{ 1219{
1220 struct sk_buff *segs = ERR_PTR(-EINVAL); 1220 struct sk_buff *segs = ERR_PTR(-EINVAL);
1221 struct iphdr *iph; 1221 struct iphdr *iph;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 7981a24f5c7b..9cefe72029cf 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -41,12 +41,12 @@ struct fib4_rule {
41 __be32 srcmask; 41 __be32 srcmask;
42 __be32 dst; 42 __be32 dst;
43 __be32 dstmask; 43 __be32 dstmask;
44#ifdef CONFIG_NET_CLS_ROUTE 44#ifdef CONFIG_IP_ROUTE_CLASSID
45 u32 tclassid; 45 u32 tclassid;
46#endif 46#endif
47}; 47};
48 48
49#ifdef CONFIG_NET_CLS_ROUTE 49#ifdef CONFIG_IP_ROUTE_CLASSID
50u32 fib_rules_tclass(struct fib_result *res) 50u32 fib_rules_tclass(struct fib_result *res)
51{ 51{
52 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0; 52 return res->r ? ((struct fib4_rule *) res->r)->tclassid : 0;
@@ -165,7 +165,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
165 if (frh->dst_len) 165 if (frh->dst_len)
166 rule4->dst = nla_get_be32(tb[FRA_DST]); 166 rule4->dst = nla_get_be32(tb[FRA_DST]);
167 167
168#ifdef CONFIG_NET_CLS_ROUTE 168#ifdef CONFIG_IP_ROUTE_CLASSID
169 if (tb[FRA_FLOW]) 169 if (tb[FRA_FLOW])
170 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]); 170 rule4->tclassid = nla_get_u32(tb[FRA_FLOW]);
171#endif 171#endif
@@ -195,7 +195,7 @@ static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
195 if (frh->tos && (rule4->tos != frh->tos)) 195 if (frh->tos && (rule4->tos != frh->tos))
196 return 0; 196 return 0;
197 197
198#ifdef CONFIG_NET_CLS_ROUTE 198#ifdef CONFIG_IP_ROUTE_CLASSID
199 if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW]))) 199 if (tb[FRA_FLOW] && (rule4->tclassid != nla_get_u32(tb[FRA_FLOW])))
200 return 0; 200 return 0;
201#endif 201#endif
@@ -224,7 +224,7 @@ static int fib4_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
224 if (rule4->src_len) 224 if (rule4->src_len)
225 NLA_PUT_BE32(skb, FRA_SRC, rule4->src); 225 NLA_PUT_BE32(skb, FRA_SRC, rule4->src);
226 226
227#ifdef CONFIG_NET_CLS_ROUTE 227#ifdef CONFIG_IP_ROUTE_CLASSID
228 if (rule4->tclassid) 228 if (rule4->tclassid)
229 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid); 229 NLA_PUT_U32(skb, FRA_FLOW, rule4->tclassid);
230#endif 230#endif
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 12d3dc3df1b7..9aff11d7278f 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -200,7 +200,7 @@ static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
200#ifdef CONFIG_IP_ROUTE_MULTIPATH 200#ifdef CONFIG_IP_ROUTE_MULTIPATH
201 nh->nh_weight != onh->nh_weight || 201 nh->nh_weight != onh->nh_weight ||
202#endif 202#endif
203#ifdef CONFIG_NET_CLS_ROUTE 203#ifdef CONFIG_IP_ROUTE_CLASSID
204 nh->nh_tclassid != onh->nh_tclassid || 204 nh->nh_tclassid != onh->nh_tclassid ||
205#endif 205#endif
206 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) 206 ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
@@ -422,7 +422,7 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
422 422
423 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 423 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
424 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0; 424 nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
425#ifdef CONFIG_NET_CLS_ROUTE 425#ifdef CONFIG_IP_ROUTE_CLASSID
426 nla = nla_find(attrs, attrlen, RTA_FLOW); 426 nla = nla_find(attrs, attrlen, RTA_FLOW);
427 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0; 427 nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
428#endif 428#endif
@@ -476,7 +476,7 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
476 nla = nla_find(attrs, attrlen, RTA_GATEWAY); 476 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
477 if (nla && nla_get_be32(nla) != nh->nh_gw) 477 if (nla && nla_get_be32(nla) != nh->nh_gw)
478 return 1; 478 return 1;
479#ifdef CONFIG_NET_CLS_ROUTE 479#ifdef CONFIG_IP_ROUTE_CLASSID
480 nla = nla_find(attrs, attrlen, RTA_FLOW); 480 nla = nla_find(attrs, attrlen, RTA_FLOW);
481 if (nla && nla_get_u32(nla) != nh->nh_tclassid) 481 if (nla && nla_get_u32(nla) != nh->nh_tclassid)
482 return 1; 482 return 1;
@@ -779,7 +779,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
779 goto err_inval; 779 goto err_inval;
780 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw) 780 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
781 goto err_inval; 781 goto err_inval;
782#ifdef CONFIG_NET_CLS_ROUTE 782#ifdef CONFIG_IP_ROUTE_CLASSID
783 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow) 783 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
784 goto err_inval; 784 goto err_inval;
785#endif 785#endif
@@ -792,7 +792,7 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
792 nh->nh_oif = cfg->fc_oif; 792 nh->nh_oif = cfg->fc_oif;
793 nh->nh_gw = cfg->fc_gw; 793 nh->nh_gw = cfg->fc_gw;
794 nh->nh_flags = cfg->fc_flags; 794 nh->nh_flags = cfg->fc_flags;
795#ifdef CONFIG_NET_CLS_ROUTE 795#ifdef CONFIG_IP_ROUTE_CLASSID
796 nh->nh_tclassid = cfg->fc_flow; 796 nh->nh_tclassid = cfg->fc_flow;
797#endif 797#endif
798#ifdef CONFIG_IP_ROUTE_MULTIPATH 798#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1002,7 +1002,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
1002 1002
1003 if (fi->fib_nh->nh_oif) 1003 if (fi->fib_nh->nh_oif)
1004 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif); 1004 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
1005#ifdef CONFIG_NET_CLS_ROUTE 1005#ifdef CONFIG_IP_ROUTE_CLASSID
1006 if (fi->fib_nh[0].nh_tclassid) 1006 if (fi->fib_nh[0].nh_tclassid)
1007 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid); 1007 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
1008#endif 1008#endif
@@ -1027,7 +1027,7 @@ int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
1027 1027
1028 if (nh->nh_gw) 1028 if (nh->nh_gw)
1029 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw); 1029 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1030#ifdef CONFIG_NET_CLS_ROUTE 1030#ifdef CONFIG_IP_ROUTE_CLASSID
1031 if (nh->nh_tclassid) 1031 if (nh->nh_tclassid)
1032 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid); 1032 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1033#endif 1033#endif
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index d859bcc26cb7..d7b2b0987a3b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -340,7 +340,7 @@ static int ip_rcv_finish(struct sk_buff *skb)
340 } 340 }
341 } 341 }
342 342
343#ifdef CONFIG_NET_CLS_ROUTE 343#ifdef CONFIG_IP_ROUTE_CLASSID
344 if (unlikely(skb_dst(skb)->tclassid)) { 344 if (unlikely(skb_dst(skb)->tclassid)) {
345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct); 345 struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
346 u32 idx = skb_dst(skb)->tclassid; 346 u32 idx = skb_dst(skb)->tclassid;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index babd1a2bae5f..f926a310075d 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -206,8 +206,9 @@ config IP_NF_TARGET_REDIRECT
206 206
207config NF_NAT_SNMP_BASIC 207config NF_NAT_SNMP_BASIC
208 tristate "Basic SNMP-ALG support" 208 tristate "Basic SNMP-ALG support"
209 depends on NF_NAT 209 depends on NF_CONNTRACK_SNMP && NF_NAT
210 depends on NETFILTER_ADVANCED 210 depends on NETFILTER_ADVANCED
211 default NF_NAT && NF_CONNTRACK_SNMP
211 ---help--- 212 ---help---
212 213
213 This module implements an Application Layer Gateway (ALG) for 214 This module implements an Application Layer Gateway (ALG) for
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e855fffaed95..e95054c690c6 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -866,6 +866,7 @@ static int compat_table_info(const struct xt_table_info *info,
866 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 866 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
867 newinfo->initial_entries = 0; 867 newinfo->initial_entries = 0;
868 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 868 loc_cpu_entry = info->entries[raw_smp_processor_id()];
869 xt_compat_init_offsets(NFPROTO_ARP, info->number);
869 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 870 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
870 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 871 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
871 if (ret != 0) 872 if (ret != 0)
@@ -1333,6 +1334,7 @@ static int translate_compat_table(const char *name,
1333 duprintf("translate_compat_table: size %u\n", info->size); 1334 duprintf("translate_compat_table: size %u\n", info->size);
1334 j = 0; 1335 j = 0;
1335 xt_compat_lock(NFPROTO_ARP); 1336 xt_compat_lock(NFPROTO_ARP);
1337 xt_compat_init_offsets(NFPROTO_ARP, number);
1336 /* Walk through entries, checking offsets. */ 1338 /* Walk through entries, checking offsets. */
1337 xt_entry_foreach(iter0, entry0, total_size) { 1339 xt_entry_foreach(iter0, entry0, total_size) {
1338 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1340 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 652efea013dc..ef7d7b9680ea 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1063,6 +1063,7 @@ static int compat_table_info(const struct xt_table_info *info,
1063 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 1063 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
1064 newinfo->initial_entries = 0; 1064 newinfo->initial_entries = 0;
1065 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 1065 loc_cpu_entry = info->entries[raw_smp_processor_id()];
1066 xt_compat_init_offsets(AF_INET, info->number);
1066 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 1067 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
1067 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 1068 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
1068 if (ret != 0) 1069 if (ret != 0)
@@ -1664,6 +1665,7 @@ translate_compat_table(struct net *net,
1664 duprintf("translate_compat_table: size %u\n", info->size); 1665 duprintf("translate_compat_table: size %u\n", info->size);
1665 j = 0; 1666 j = 0;
1666 xt_compat_lock(AF_INET); 1667 xt_compat_lock(AF_INET);
1668 xt_compat_init_offsets(AF_INET, number);
1667 /* Walk through entries, checking offsets. */ 1669 /* Walk through entries, checking offsets. */
1668 xt_entry_foreach(iter0, entry0, total_size) { 1670 xt_entry_foreach(iter0, entry0, total_size) {
1669 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1671 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 1e26a4897655..403ca57f6011 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -300,13 +300,8 @@ clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par)
300 * that the ->target() function isn't called after ->destroy() */ 300 * that the ->target() function isn't called after ->destroy() */
301 301
302 ct = nf_ct_get(skb, &ctinfo); 302 ct = nf_ct_get(skb, &ctinfo);
303 if (ct == NULL) { 303 if (ct == NULL)
304 pr_info("no conntrack!\n");
305 /* FIXME: need to drop invalid ones, since replies
306 * to outgoing connections of other nodes will be
307 * marked as INVALID */
308 return NF_DROP; 304 return NF_DROP;
309 }
310 305
311 /* special case: ICMP error handling. conntrack distinguishes between 306 /* special case: ICMP error handling. conntrack distinguishes between
312 * error messages (RELATED) and information requests (see below) */ 307 * error messages (RELATED) and information requests (see below) */
diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 72ffc8fda2e9..d76d6c9ed946 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -442,8 +442,7 @@ ipt_log_packet(u_int8_t pf,
442 } 442 }
443#endif 443#endif
444 444
445 /* MAC logging for input path only. */ 445 if (in != NULL)
446 if (in && !out)
447 dump_mac_header(m, loginfo, skb); 446 dump_mac_header(m, loginfo, skb);
448 447
449 dump_packet(m, loginfo, skb, 0); 448 dump_packet(m, loginfo, skb, 0);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index 294a2a32f293..aef5d1fbe77d 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -60,7 +60,7 @@ ipt_mangle_out(struct sk_buff *skb, const struct net_device *out)
60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out, 60 ret = ipt_do_table(skb, NF_INET_LOCAL_OUT, NULL, out,
61 dev_net(out)->ipv4.iptable_mangle); 61 dev_net(out)->ipv4.iptable_mangle);
62 /* Reroute for ANY change. */ 62 /* Reroute for ANY change. */
63 if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE) { 63 if (ret != NF_DROP && ret != NF_STOLEN) {
64 iph = ip_hdr(skb); 64 iph = ip_hdr(skb);
65 65
66 if (iph->saddr != saddr || 66 if (iph->saddr != saddr ||
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 63f60fc5d26a..5585980fce2e 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -20,6 +20,7 @@
20#include <net/netfilter/nf_conntrack_l4proto.h> 20#include <net/netfilter/nf_conntrack_l4proto.h>
21#include <net/netfilter/nf_conntrack_expect.h> 21#include <net/netfilter/nf_conntrack_expect.h>
22#include <net/netfilter/nf_conntrack_acct.h> 22#include <net/netfilter/nf_conntrack_acct.h>
23#include <linux/rculist_nulls.h>
23 24
24struct ct_iter_state { 25struct ct_iter_state {
25 struct seq_net_private p; 26 struct seq_net_private p;
@@ -35,7 +36,8 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
35 for (st->bucket = 0; 36 for (st->bucket = 0;
36 st->bucket < net->ct.htable_size; 37 st->bucket < net->ct.htable_size;
37 st->bucket++) { 38 st->bucket++) {
38 n = rcu_dereference(net->ct.hash[st->bucket].first); 39 n = rcu_dereference(
40 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
39 if (!is_a_nulls(n)) 41 if (!is_a_nulls(n))
40 return n; 42 return n;
41 } 43 }
@@ -48,13 +50,14 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
48 struct net *net = seq_file_net(seq); 50 struct net *net = seq_file_net(seq);
49 struct ct_iter_state *st = seq->private; 51 struct ct_iter_state *st = seq->private;
50 52
51 head = rcu_dereference(head->next); 53 head = rcu_dereference(hlist_nulls_next_rcu(head));
52 while (is_a_nulls(head)) { 54 while (is_a_nulls(head)) {
53 if (likely(get_nulls_value(head) == st->bucket)) { 55 if (likely(get_nulls_value(head) == st->bucket)) {
54 if (++st->bucket >= net->ct.htable_size) 56 if (++st->bucket >= net->ct.htable_size)
55 return NULL; 57 return NULL;
56 } 58 }
57 head = rcu_dereference(net->ct.hash[st->bucket].first); 59 head = rcu_dereference(
60 hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
58 } 61 }
59 return head; 62 return head;
60} 63}
@@ -217,7 +220,8 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
217 struct hlist_node *n; 220 struct hlist_node *n;
218 221
219 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { 222 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
220 n = rcu_dereference(net->ct.expect_hash[st->bucket].first); 223 n = rcu_dereference(
224 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
221 if (n) 225 if (n)
222 return n; 226 return n;
223 } 227 }
@@ -230,11 +234,12 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
230 struct net *net = seq_file_net(seq); 234 struct net *net = seq_file_net(seq);
231 struct ct_expect_iter_state *st = seq->private; 235 struct ct_expect_iter_state *st = seq->private;
232 236
233 head = rcu_dereference(head->next); 237 head = rcu_dereference(hlist_next_rcu(head));
234 while (head == NULL) { 238 while (head == NULL) {
235 if (++st->bucket >= nf_ct_expect_hsize) 239 if (++st->bucket >= nf_ct_expect_hsize)
236 return NULL; 240 return NULL;
237 head = rcu_dereference(net->ct.expect_hash[st->bucket].first); 241 head = rcu_dereference(
242 hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
238 } 243 }
239 return head; 244 return head;
240} 245}
diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index 0f23b3f06df0..703f366fd235 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,13 +44,13 @@ static unsigned int help(struct sk_buff *skb,
44 44
45 /* Try to get same port: if not, try to change it. */ 45 /* Try to get same port: if not, try to change it. */
46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { 46 for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
47 int ret; 47 int res;
48 48
49 exp->tuple.dst.u.tcp.port = htons(port); 49 exp->tuple.dst.u.tcp.port = htons(port);
50 ret = nf_ct_expect_related(exp); 50 res = nf_ct_expect_related(exp);
51 if (ret == 0) 51 if (res == 0)
52 break; 52 break;
53 else if (ret != -EBUSY) { 53 else if (res != -EBUSY) {
54 port = 0; 54 port = 0;
55 break; 55 break;
56 } 56 }
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index c04787ce1a71..21bcf471b25a 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -221,7 +221,14 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
221 manips not an issue. */ 221 manips not an issue. */
222 if (maniptype == IP_NAT_MANIP_SRC && 222 if (maniptype == IP_NAT_MANIP_SRC &&
223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { 223 !(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
224 if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { 224 /* try the original tuple first */
225 if (in_range(orig_tuple, range)) {
226 if (!nf_nat_used_tuple(orig_tuple, ct)) {
227 *tuple = *orig_tuple;
228 return;
229 }
230 } else if (find_appropriate_src(net, zone, orig_tuple, tuple,
231 range)) {
225 pr_debug("get_unique_tuple: Found current src map\n"); 232 pr_debug("get_unique_tuple: Found current src map\n");
226 if (!nf_nat_used_tuple(tuple, ct)) 233 if (!nf_nat_used_tuple(tuple, ct))
227 return; 234 return;
@@ -266,7 +273,6 @@ nf_nat_setup_info(struct nf_conn *ct,
266 struct net *net = nf_ct_net(ct); 273 struct net *net = nf_ct_net(ct);
267 struct nf_conntrack_tuple curr_tuple, new_tuple; 274 struct nf_conntrack_tuple curr_tuple, new_tuple;
268 struct nf_conn_nat *nat; 275 struct nf_conn_nat *nat;
269 int have_to_hash = !(ct->status & IPS_NAT_DONE_MASK);
270 276
271 /* nat helper or nfctnetlink also setup binding */ 277 /* nat helper or nfctnetlink also setup binding */
272 nat = nfct_nat(ct); 278 nat = nfct_nat(ct);
@@ -306,8 +312,7 @@ nf_nat_setup_info(struct nf_conn *ct,
306 ct->status |= IPS_DST_NAT; 312 ct->status |= IPS_DST_NAT;
307 } 313 }
308 314
309 /* Place in source hash if this is the first time. */ 315 if (maniptype == IP_NAT_MANIP_SRC) {
310 if (have_to_hash) {
311 unsigned int srchash; 316 unsigned int srchash;
312 317
313 srchash = hash_by_src(net, nf_ct_zone(ct), 318 srchash = hash_by_src(net, nf_ct_zone(ct),
@@ -323,9 +328,9 @@ nf_nat_setup_info(struct nf_conn *ct,
323 328
324 /* It's done. */ 329 /* It's done. */
325 if (maniptype == IP_NAT_MANIP_DST) 330 if (maniptype == IP_NAT_MANIP_DST)
326 set_bit(IPS_DST_NAT_DONE_BIT, &ct->status); 331 ct->status |= IPS_DST_NAT_DONE;
327 else 332 else
328 set_bit(IPS_SRC_NAT_DONE_BIT, &ct->status); 333 ct->status |= IPS_SRC_NAT_DONE;
329 334
330 return NF_ACCEPT; 335 return NF_ACCEPT;
331} 336}
@@ -502,7 +507,10 @@ int nf_nat_protocol_register(const struct nf_nat_protocol *proto)
502 int ret = 0; 507 int ret = 0;
503 508
504 spin_lock_bh(&nf_nat_lock); 509 spin_lock_bh(&nf_nat_lock);
505 if (nf_nat_protos[proto->protonum] != &nf_nat_unknown_protocol) { 510 if (rcu_dereference_protected(
511 nf_nat_protos[proto->protonum],
512 lockdep_is_held(&nf_nat_lock)
513 ) != &nf_nat_unknown_protocol) {
506 ret = -EBUSY; 514 ret = -EBUSY;
507 goto out; 515 goto out;
508 } 516 }
@@ -532,7 +540,7 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
532 if (nat == NULL || nat->ct == NULL) 540 if (nat == NULL || nat->ct == NULL)
533 return; 541 return;
534 542
535 NF_CT_ASSERT(nat->ct->status & IPS_NAT_DONE_MASK); 543 NF_CT_ASSERT(nat->ct->status & IPS_SRC_NAT_DONE);
536 544
537 spin_lock_bh(&nf_nat_lock); 545 spin_lock_bh(&nf_nat_lock);
538 hlist_del_rcu(&nat->bysource); 546 hlist_del_rcu(&nat->bysource);
@@ -545,11 +553,10 @@ static void nf_nat_move_storage(void *new, void *old)
545 struct nf_conn_nat *old_nat = old; 553 struct nf_conn_nat *old_nat = old;
546 struct nf_conn *ct = old_nat->ct; 554 struct nf_conn *ct = old_nat->ct;
547 555
548 if (!ct || !(ct->status & IPS_NAT_DONE_MASK)) 556 if (!ct || !(ct->status & IPS_SRC_NAT_DONE))
549 return; 557 return;
550 558
551 spin_lock_bh(&nf_nat_lock); 559 spin_lock_bh(&nf_nat_lock);
552 new_nat->ct = ct;
553 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource); 560 hlist_replace_rcu(&old_nat->bysource, &new_nat->bysource);
554 spin_unlock_bh(&nf_nat_lock); 561 spin_unlock_bh(&nf_nat_lock);
555} 562}
@@ -679,8 +686,7 @@ static int __net_init nf_nat_net_init(struct net *net)
679{ 686{
680 /* Leave them the same for the moment. */ 687 /* Leave them the same for the moment. */
681 net->ipv4.nat_htable_size = net->ct.htable_size; 688 net->ipv4.nat_htable_size = net->ct.htable_size;
682 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 689 net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, 0);
683 &net->ipv4.nat_vmalloced, 0);
684 if (!net->ipv4.nat_bysource) 690 if (!net->ipv4.nat_bysource)
685 return -ENOMEM; 691 return -ENOMEM;
686 return 0; 692 return 0;
@@ -702,8 +708,7 @@ static void __net_exit nf_nat_net_exit(struct net *net)
702{ 708{
703 nf_ct_iterate_cleanup(net, &clean_nat, NULL); 709 nf_ct_iterate_cleanup(net, &clean_nat, NULL);
704 synchronize_rcu(); 710 synchronize_rcu();
705 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, 711 nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_htable_size);
706 net->ipv4.nat_htable_size);
707} 712}
708 713
709static struct pernet_operations nf_nat_net_ops = { 714static struct pernet_operations nf_nat_net_ops = {
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ee5f419d0a56..8812a02078ab 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -54,6 +54,7 @@
54#include <net/netfilter/nf_conntrack_expect.h> 54#include <net/netfilter/nf_conntrack_expect.h>
55#include <net/netfilter/nf_conntrack_helper.h> 55#include <net/netfilter/nf_conntrack_helper.h>
56#include <net/netfilter/nf_nat_helper.h> 56#include <net/netfilter/nf_nat_helper.h>
57#include <linux/netfilter/nf_conntrack_snmp.h>
57 58
58MODULE_LICENSE("GPL"); 59MODULE_LICENSE("GPL");
59MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>"); 60MODULE_AUTHOR("James Morris <jmorris@intercode.com.au>");
@@ -1310,9 +1311,9 @@ static int __init nf_nat_snmp_basic_init(void)
1310{ 1311{
1311 int ret = 0; 1312 int ret = 0;
1312 1313
1313 ret = nf_conntrack_helper_register(&snmp_helper); 1314 BUG_ON(nf_nat_snmp_hook != NULL);
1314 if (ret < 0) 1315 rcu_assign_pointer(nf_nat_snmp_hook, help);
1315 return ret; 1316
1316 ret = nf_conntrack_helper_register(&snmp_trap_helper); 1317 ret = nf_conntrack_helper_register(&snmp_trap_helper);
1317 if (ret < 0) { 1318 if (ret < 0) {
1318 nf_conntrack_helper_unregister(&snmp_helper); 1319 nf_conntrack_helper_unregister(&snmp_helper);
@@ -1323,7 +1324,7 @@ static int __init nf_nat_snmp_basic_init(void)
1323 1324
1324static void __exit nf_nat_snmp_basic_fini(void) 1325static void __exit nf_nat_snmp_basic_fini(void)
1325{ 1326{
1326 nf_conntrack_helper_unregister(&snmp_helper); 1327 rcu_assign_pointer(nf_nat_snmp_hook, NULL);
1327 nf_conntrack_helper_unregister(&snmp_trap_helper); 1328 nf_conntrack_helper_unregister(&snmp_trap_helper);
1328} 1329}
1329 1330
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 351dc4e85242..3e5b7cc2db4f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -514,7 +514,7 @@ static const struct file_operations rt_cpu_seq_fops = {
514 .release = seq_release, 514 .release = seq_release,
515}; 515};
516 516
517#ifdef CONFIG_NET_CLS_ROUTE 517#ifdef CONFIG_IP_ROUTE_CLASSID
518static int rt_acct_proc_show(struct seq_file *m, void *v) 518static int rt_acct_proc_show(struct seq_file *m, void *v)
519{ 519{
520 struct ip_rt_acct *dst, *src; 520 struct ip_rt_acct *dst, *src;
@@ -567,14 +567,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
567 if (!pde) 567 if (!pde)
568 goto err2; 568 goto err2;
569 569
570#ifdef CONFIG_NET_CLS_ROUTE 570#ifdef CONFIG_IP_ROUTE_CLASSID
571 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); 571 pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
572 if (!pde) 572 if (!pde)
573 goto err3; 573 goto err3;
574#endif 574#endif
575 return 0; 575 return 0;
576 576
577#ifdef CONFIG_NET_CLS_ROUTE 577#ifdef CONFIG_IP_ROUTE_CLASSID
578err3: 578err3:
579 remove_proc_entry("rt_cache", net->proc_net_stat); 579 remove_proc_entry("rt_cache", net->proc_net_stat);
580#endif 580#endif
@@ -588,7 +588,7 @@ static void __net_exit ip_rt_do_proc_exit(struct net *net)
588{ 588{
589 remove_proc_entry("rt_cache", net->proc_net_stat); 589 remove_proc_entry("rt_cache", net->proc_net_stat);
590 remove_proc_entry("rt_cache", net->proc_net); 590 remove_proc_entry("rt_cache", net->proc_net);
591#ifdef CONFIG_NET_CLS_ROUTE 591#ifdef CONFIG_IP_ROUTE_CLASSID
592 remove_proc_entry("rt_acct", net->proc_net); 592 remove_proc_entry("rt_acct", net->proc_net);
593#endif 593#endif
594} 594}
@@ -1775,7 +1775,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
1775 memcpy(addr, &src, 4); 1775 memcpy(addr, &src, 4);
1776} 1776}
1777 1777
1778#ifdef CONFIG_NET_CLS_ROUTE 1778#ifdef CONFIG_IP_ROUTE_CLASSID
1779static void set_class_tag(struct rtable *rt, u32 tag) 1779static void set_class_tag(struct rtable *rt, u32 tag)
1780{ 1780{
1781 if (!(rt->dst.tclassid & 0xFFFF)) 1781 if (!(rt->dst.tclassid & 0xFFFF))
@@ -1825,7 +1825,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1825 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1825 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1826 rt->rt_gateway = FIB_RES_GW(*res); 1826 rt->rt_gateway = FIB_RES_GW(*res);
1827 dst_import_metrics(dst, fi->fib_metrics); 1827 dst_import_metrics(dst, fi->fib_metrics);
1828#ifdef CONFIG_NET_CLS_ROUTE 1828#ifdef CONFIG_IP_ROUTE_CLASSID
1829 dst->tclassid = FIB_RES_NH(*res).nh_tclassid; 1829 dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
1830#endif 1830#endif
1831 } 1831 }
@@ -1835,7 +1835,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1835 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40) 1835 if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
1836 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40); 1836 dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);
1837 1837
1838#ifdef CONFIG_NET_CLS_ROUTE 1838#ifdef CONFIG_IP_ROUTE_CLASSID
1839#ifdef CONFIG_IP_MULTIPLE_TABLES 1839#ifdef CONFIG_IP_MULTIPLE_TABLES
1840 set_class_tag(rt, fib_rules_tclass(res)); 1840 set_class_tag(rt, fib_rules_tclass(res));
1841#endif 1841#endif
@@ -1891,7 +1891,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1891 rth->fl.mark = skb->mark; 1891 rth->fl.mark = skb->mark;
1892 rth->fl.fl4_src = saddr; 1892 rth->fl.fl4_src = saddr;
1893 rth->rt_src = saddr; 1893 rth->rt_src = saddr;
1894#ifdef CONFIG_NET_CLS_ROUTE 1894#ifdef CONFIG_IP_ROUTE_CLASSID
1895 rth->dst.tclassid = itag; 1895 rth->dst.tclassid = itag;
1896#endif 1896#endif
1897 rth->rt_iif = 1897 rth->rt_iif =
@@ -2208,7 +2208,7 @@ local_input:
2208 rth->fl.mark = skb->mark; 2208 rth->fl.mark = skb->mark;
2209 rth->fl.fl4_src = saddr; 2209 rth->fl.fl4_src = saddr;
2210 rth->rt_src = saddr; 2210 rth->rt_src = saddr;
2211#ifdef CONFIG_NET_CLS_ROUTE 2211#ifdef CONFIG_IP_ROUTE_CLASSID
2212 rth->dst.tclassid = itag; 2212 rth->dst.tclassid = itag;
2213#endif 2213#endif
2214 rth->rt_iif = 2214 rth->rt_iif =
@@ -2828,7 +2828,7 @@ static int rt_fill_info(struct net *net,
2828 } 2828 }
2829 if (rt->dst.dev) 2829 if (rt->dst.dev)
2830 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); 2830 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
2831#ifdef CONFIG_NET_CLS_ROUTE 2831#ifdef CONFIG_IP_ROUTE_CLASSID
2832 if (rt->dst.tclassid) 2832 if (rt->dst.tclassid)
2833 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 2833 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
2834#endif 2834#endif
@@ -3249,9 +3249,9 @@ static __net_initdata struct pernet_operations rt_genid_ops = {
3249}; 3249};
3250 3250
3251 3251
3252#ifdef CONFIG_NET_CLS_ROUTE 3252#ifdef CONFIG_IP_ROUTE_CLASSID
3253struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3253struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3254#endif /* CONFIG_NET_CLS_ROUTE */ 3254#endif /* CONFIG_IP_ROUTE_CLASSID */
3255 3255
3256static __initdata unsigned long rhash_entries; 3256static __initdata unsigned long rhash_entries;
3257static int __init set_rhash_entries(char *str) 3257static int __init set_rhash_entries(char *str)
@@ -3267,7 +3267,7 @@ int __init ip_rt_init(void)
3267{ 3267{
3268 int rc = 0; 3268 int rc = 0;
3269 3269
3270#ifdef CONFIG_NET_CLS_ROUTE 3270#ifdef CONFIG_IP_ROUTE_CLASSID
3271 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3271 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3272 if (!ip_rt_acct) 3272 if (!ip_rt_acct)
3273 panic("IP: failed to allocate ip_rt_acct\n"); 3273 panic("IP: failed to allocate ip_rt_acct\n");
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 6c11eece262c..f9867d2dbef4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2653,7 +2653,7 @@ int compat_tcp_getsockopt(struct sock *sk, int level, int optname,
2653EXPORT_SYMBOL(compat_tcp_getsockopt); 2653EXPORT_SYMBOL(compat_tcp_getsockopt);
2654#endif 2654#endif
2655 2655
2656struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) 2656struct sk_buff *tcp_tso_segment(struct sk_buff *skb, u32 features)
2657{ 2657{
2658 struct sk_buff *segs = ERR_PTR(-EINVAL); 2658 struct sk_buff *segs = ERR_PTR(-EINVAL);
2659 struct tcphdr *th; 2659 struct tcphdr *th;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 8157b17959ee..d37baaa1dbe3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2199,7 +2199,7 @@ int udp4_ufo_send_check(struct sk_buff *skb)
2199 return 0; 2199 return 0;
2200} 2200}
2201 2201
2202struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int features) 2202struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, u32 features)
2203{ 2203{
2204 struct sk_buff *segs = ERR_PTR(-EINVAL); 2204 struct sk_buff *segs = ERR_PTR(-EINVAL);
2205 unsigned int mss; 2205 unsigned int mss;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 978e80e2c4a8..3194aa909872 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -772,7 +772,7 @@ out:
772 return err; 772 return err;
773} 773}
774 774
775static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, int features) 775static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, u32 features)
776{ 776{
777 struct sk_buff *segs = ERR_PTR(-EINVAL); 777 struct sk_buff *segs = ERR_PTR(-EINVAL);
778 struct ipv6hdr *ipv6h; 778 struct ipv6hdr *ipv6h;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7d227c644f72..47b7b8df7fac 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1076,6 +1076,7 @@ static int compat_table_info(const struct xt_table_info *info,
1076 memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); 1076 memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
1077 newinfo->initial_entries = 0; 1077 newinfo->initial_entries = 0;
1078 loc_cpu_entry = info->entries[raw_smp_processor_id()]; 1078 loc_cpu_entry = info->entries[raw_smp_processor_id()];
1079 xt_compat_init_offsets(AF_INET6, info->number);
1079 xt_entry_foreach(iter, loc_cpu_entry, info->size) { 1080 xt_entry_foreach(iter, loc_cpu_entry, info->size) {
1080 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo); 1081 ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
1081 if (ret != 0) 1082 if (ret != 0)
@@ -1679,6 +1680,7 @@ translate_compat_table(struct net *net,
1679 duprintf("translate_compat_table: size %u\n", info->size); 1680 duprintf("translate_compat_table: size %u\n", info->size);
1680 j = 0; 1681 j = 0;
1681 xt_compat_lock(AF_INET6); 1682 xt_compat_lock(AF_INET6);
1683 xt_compat_init_offsets(AF_INET6, number);
1682 /* Walk through entries, checking offsets. */ 1684 /* Walk through entries, checking offsets. */
1683 xt_entry_foreach(iter0, entry0, total_size) { 1685 xt_entry_foreach(iter0, entry0, total_size) {
1684 ret = check_compat_entry_size_and_hooks(iter0, info, &size, 1686 ret = check_compat_entry_size_and_hooks(iter0, info, &size,
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 09c88891a753..05027b753721 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -452,8 +452,7 @@ ip6t_log_packet(u_int8_t pf,
452 in ? in->name : "", 452 in ? in->name : "",
453 out ? out->name : ""); 453 out ? out->name : "");
454 454
455 /* MAC logging for input path only. */ 455 if (in != NULL)
456 if (in && !out)
457 dump_mac_header(m, loginfo, skb); 456 dump_mac_header(m, loginfo, skb);
458 457
459 dump_packet(m, loginfo, skb, skb_network_offset(skb), 1); 458 dump_packet(m, loginfo, skb, skb_network_offset(skb), 1);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 79d43aa8fa8d..085727263812 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -45,6 +45,7 @@
45#include <linux/netfilter_ipv6.h> 45#include <linux/netfilter_ipv6.h>
46#include <linux/kernel.h> 46#include <linux/kernel.h>
47#include <linux/module.h> 47#include <linux/module.h>
48#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
48 49
49 50
50struct nf_ct_frag6_skb_cb 51struct nf_ct_frag6_skb_cb
@@ -73,7 +74,7 @@ static struct inet_frags nf_frags;
73static struct netns_frags nf_init_frags; 74static struct netns_frags nf_init_frags;
74 75
75#ifdef CONFIG_SYSCTL 76#ifdef CONFIG_SYSCTL
76struct ctl_table nf_ct_frag6_sysctl_table[] = { 77static struct ctl_table nf_ct_frag6_sysctl_table[] = {
77 { 78 {
78 .procname = "nf_conntrack_frag6_timeout", 79 .procname = "nf_conntrack_frag6_timeout",
79 .data = &nf_init_frags.timeout, 80 .data = &nf_init_frags.timeout,
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 86c39526ba5e..2bc6cd7bb8ec 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -123,18 +123,18 @@ static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb)
123} 123}
124 124
125#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) 125#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
126static int (*mh_filter)(struct sock *sock, struct sk_buff *skb); 126typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb);
127 127
128int rawv6_mh_filter_register(int (*filter)(struct sock *sock, 128static mh_filter_t __rcu *mh_filter __read_mostly;
129 struct sk_buff *skb)) 129
130int rawv6_mh_filter_register(mh_filter_t filter)
130{ 131{
131 rcu_assign_pointer(mh_filter, filter); 132 rcu_assign_pointer(mh_filter, filter);
132 return 0; 133 return 0;
133} 134}
134EXPORT_SYMBOL(rawv6_mh_filter_register); 135EXPORT_SYMBOL(rawv6_mh_filter_register);
135 136
136int rawv6_mh_filter_unregister(int (*filter)(struct sock *sock, 137int rawv6_mh_filter_unregister(mh_filter_t filter)
137 struct sk_buff *skb))
138{ 138{
139 rcu_assign_pointer(mh_filter, NULL); 139 rcu_assign_pointer(mh_filter, NULL);
140 synchronize_rcu(); 140 synchronize_rcu();
@@ -192,10 +192,10 @@ static int ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
192 * policy is placed in rawv6_rcv() because it is 192 * policy is placed in rawv6_rcv() because it is
193 * required for each socket. 193 * required for each socket.
194 */ 194 */
195 int (*filter)(struct sock *sock, struct sk_buff *skb); 195 mh_filter_t *filter;
196 196
197 filter = rcu_dereference(mh_filter); 197 filter = rcu_dereference(mh_filter);
198 filtered = filter ? filter(sk, skb) : 0; 198 filtered = filter ? (*filter)(sk, skb) : 0;
199 break; 199 break;
200 } 200 }
201#endif 201#endif
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 8ce38f10a547..b1599a345c10 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -412,7 +412,7 @@ static void prl_list_destroy_rcu(struct rcu_head *head)
412 412
413 p = container_of(head, struct ip_tunnel_prl_entry, rcu_head); 413 p = container_of(head, struct ip_tunnel_prl_entry, rcu_head);
414 do { 414 do {
415 n = p->next; 415 n = rcu_dereference_protected(p->next, 1);
416 kfree(p); 416 kfree(p);
417 p = n; 417 p = n;
418 } while (p); 418 } while (p);
@@ -421,15 +421,17 @@ static void prl_list_destroy_rcu(struct rcu_head *head)
421static int 421static int
422ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) 422ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
423{ 423{
424 struct ip_tunnel_prl_entry *x, **p; 424 struct ip_tunnel_prl_entry *x;
425 struct ip_tunnel_prl_entry __rcu **p;
425 int err = 0; 426 int err = 0;
426 427
427 ASSERT_RTNL(); 428 ASSERT_RTNL();
428 429
429 if (a && a->addr != htonl(INADDR_ANY)) { 430 if (a && a->addr != htonl(INADDR_ANY)) {
430 for (p = &t->prl; *p; p = &(*p)->next) { 431 for (p = &t->prl;
431 if ((*p)->addr == a->addr) { 432 (x = rtnl_dereference(*p)) != NULL;
432 x = *p; 433 p = &x->next) {
434 if (x->addr == a->addr) {
433 *p = x->next; 435 *p = x->next;
434 call_rcu(&x->rcu_head, prl_entry_destroy_rcu); 436 call_rcu(&x->rcu_head, prl_entry_destroy_rcu);
435 t->prl_count--; 437 t->prl_count--;
@@ -438,9 +440,9 @@ ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
438 } 440 }
439 err = -ENXIO; 441 err = -ENXIO;
440 } else { 442 } else {
441 if (t->prl) { 443 x = rtnl_dereference(t->prl);
444 if (x) {
442 t->prl_count = 0; 445 t->prl_count = 0;
443 x = t->prl;
444 call_rcu(&x->rcu_head, prl_list_destroy_rcu); 446 call_rcu(&x->rcu_head, prl_list_destroy_rcu);
445 t->prl = NULL; 447 t->prl = NULL;
446 } 448 }
@@ -1179,7 +1181,7 @@ static int __net_init ipip6_fb_tunnel_init(struct net_device *dev)
1179 if (!dev->tstats) 1181 if (!dev->tstats)
1180 return -ENOMEM; 1182 return -ENOMEM;
1181 dev_hold(dev); 1183 dev_hold(dev);
1182 sitn->tunnels_wc[0] = tunnel; 1184 rcu_assign_pointer(sitn->tunnels_wc[0], tunnel);
1183 return 0; 1185 return 0;
1184} 1186}
1185 1187
@@ -1196,11 +1198,12 @@ static void __net_exit sit_destroy_tunnels(struct sit_net *sitn, struct list_hea
1196 for (prio = 1; prio < 4; prio++) { 1198 for (prio = 1; prio < 4; prio++) {
1197 int h; 1199 int h;
1198 for (h = 0; h < HASH_SIZE; h++) { 1200 for (h = 0; h < HASH_SIZE; h++) {
1199 struct ip_tunnel *t = sitn->tunnels[prio][h]; 1201 struct ip_tunnel *t;
1200 1202
1203 t = rtnl_dereference(sitn->tunnels[prio][h]);
1201 while (t != NULL) { 1204 while (t != NULL) {
1202 unregister_netdevice_queue(t->dev, head); 1205 unregister_netdevice_queue(t->dev, head);
1203 t = t->next; 1206 t = rtnl_dereference(t->next);
1204 } 1207 }
1205 } 1208 }
1206 } 1209 }
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 9a009c66c8a3..a419a787eb69 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1299,7 +1299,7 @@ static int udp6_ufo_send_check(struct sk_buff *skb)
1299 return 0; 1299 return 0;
1300} 1300}
1301 1301
1302static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, int features) 1302static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, u32 features)
1303{ 1303{
1304 struct sk_buff *segs = ERR_PTR(-EINVAL); 1304 struct sk_buff *segs = ERR_PTR(-EINVAL);
1305 unsigned int mss; 1305 unsigned int mss;
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 1534f2b44caf..faf7412ea453 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -85,6 +85,17 @@ config NF_CONNTRACK_EVENTS
85 85
86 If unsure, say `N'. 86 If unsure, say `N'.
87 87
88config NF_CONNTRACK_TIMESTAMP
89 bool 'Connection tracking timestamping'
90 depends on NETFILTER_ADVANCED
91 help
92 This option enables support for connection tracking timestamping.
 93	  This allows you to store the flow start time and to obtain
 94	  the flow stop time (once the flow has been destroyed) via
 95	  connection tracking events.
96
97 If unsure, say `N'.
98
88config NF_CT_PROTO_DCCP 99config NF_CT_PROTO_DCCP
89 tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)' 100 tristate 'DCCP protocol connection tracking support (EXPERIMENTAL)'
90 depends on EXPERIMENTAL 101 depends on EXPERIMENTAL
@@ -185,9 +196,13 @@ config NF_CONNTRACK_IRC
185 196
186 To compile it as a module, choose M here. If unsure, say N. 197 To compile it as a module, choose M here. If unsure, say N.
187 198
199config NF_CONNTRACK_BROADCAST
200 tristate
201
188config NF_CONNTRACK_NETBIOS_NS 202config NF_CONNTRACK_NETBIOS_NS
189 tristate "NetBIOS name service protocol support" 203 tristate "NetBIOS name service protocol support"
190 depends on NETFILTER_ADVANCED 204 depends on NETFILTER_ADVANCED
205 select NF_CONNTRACK_BROADCAST
191 help 206 help
192 NetBIOS name service requests are sent as broadcast messages from an 207 NetBIOS name service requests are sent as broadcast messages from an
193 unprivileged port and responded to with unicast messages to the 208 unprivileged port and responded to with unicast messages to the
@@ -204,6 +219,21 @@ config NF_CONNTRACK_NETBIOS_NS
204 219
205 To compile it as a module, choose M here. If unsure, say N. 220 To compile it as a module, choose M here. If unsure, say N.
206 221
222config NF_CONNTRACK_SNMP
223 tristate "SNMP service protocol support"
224 depends on NETFILTER_ADVANCED
225 select NF_CONNTRACK_BROADCAST
226 help
227 SNMP service requests are sent as broadcast messages from an
228 unprivileged port and responded to with unicast messages to the
 229	  same port. This makes them hard to firewall properly because connection
230 tracking doesn't deal with broadcasts. This helper tracks locally
231 originating SNMP service requests and the corresponding
232 responses. It relies on correct IP address configuration, specifically
233 netmask and broadcast address.
234
235 To compile it as a module, choose M here. If unsure, say N.
236
207config NF_CONNTRACK_PPTP 237config NF_CONNTRACK_PPTP
208 tristate "PPtP protocol support" 238 tristate "PPtP protocol support"
209 depends on NETFILTER_ADVANCED 239 depends on NETFILTER_ADVANCED
@@ -326,6 +356,16 @@ config NETFILTER_XT_CONNMARK
326 356
327comment "Xtables targets" 357comment "Xtables targets"
328 358
359config NETFILTER_XT_TARGET_AUDIT
360 tristate "AUDIT target support"
361 depends on AUDIT
362 depends on NETFILTER_ADVANCED
363 ---help---
 364	  This option adds an 'AUDIT' target, which can be used to create
365 audit records for packets dropped/accepted.
366
 367	  To compile it as a module, choose M here. If unsure, say N.
368
329config NETFILTER_XT_TARGET_CHECKSUM 369config NETFILTER_XT_TARGET_CHECKSUM
330 tristate "CHECKSUM target support" 370 tristate "CHECKSUM target support"
331 depends on IP_NF_MANGLE || IP6_NF_MANGLE 371 depends on IP_NF_MANGLE || IP6_NF_MANGLE
@@ -477,6 +517,7 @@ config NETFILTER_XT_TARGET_NFLOG
477config NETFILTER_XT_TARGET_NFQUEUE 517config NETFILTER_XT_TARGET_NFQUEUE
478 tristate '"NFQUEUE" target Support' 518 tristate '"NFQUEUE" target Support'
479 depends on NETFILTER_ADVANCED 519 depends on NETFILTER_ADVANCED
520 select NETFILTER_NETLINK_QUEUE
480 help 521 help
481 This target replaced the old obsolete QUEUE target. 522 This target replaced the old obsolete QUEUE target.
482 523
@@ -886,7 +927,7 @@ config NETFILTER_XT_MATCH_RATEEST
886config NETFILTER_XT_MATCH_REALM 927config NETFILTER_XT_MATCH_REALM
887 tristate '"realm" match support' 928 tristate '"realm" match support'
888 depends on NETFILTER_ADVANCED 929 depends on NETFILTER_ADVANCED
889 select NET_CLS_ROUTE 930 select IP_ROUTE_CLASSID
890 help 931 help
891 This option adds a `realm' match, which allows you to use the realm 932 This option adds a `realm' match, which allows you to use the realm
892 key from the routing subsystem inside iptables. 933 key from the routing subsystem inside iptables.
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 441050f31111..9ae6878a85b1 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,6 +1,7 @@
1netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o 1netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o
2 2
3nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o 3nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o
4nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
4nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o 5nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
5 6
6obj-$(CONFIG_NETFILTER) = netfilter.o 7obj-$(CONFIG_NETFILTER) = netfilter.o
@@ -28,7 +29,9 @@ obj-$(CONFIG_NF_CONNTRACK_AMANDA) += nf_conntrack_amanda.o
28obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o 29obj-$(CONFIG_NF_CONNTRACK_FTP) += nf_conntrack_ftp.o
29obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o 30obj-$(CONFIG_NF_CONNTRACK_H323) += nf_conntrack_h323.o
30obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o 31obj-$(CONFIG_NF_CONNTRACK_IRC) += nf_conntrack_irc.o
32obj-$(CONFIG_NF_CONNTRACK_BROADCAST) += nf_conntrack_broadcast.o
31obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o 33obj-$(CONFIG_NF_CONNTRACK_NETBIOS_NS) += nf_conntrack_netbios_ns.o
34obj-$(CONFIG_NF_CONNTRACK_SNMP) += nf_conntrack_snmp.o
32obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o 35obj-$(CONFIG_NF_CONNTRACK_PPTP) += nf_conntrack_pptp.o
33obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o 36obj-$(CONFIG_NF_CONNTRACK_SANE) += nf_conntrack_sane.o
34obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o 37obj-$(CONFIG_NF_CONNTRACK_SIP) += nf_conntrack_sip.o
@@ -45,6 +48,7 @@ obj-$(CONFIG_NETFILTER_XT_MARK) += xt_mark.o
45obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o 48obj-$(CONFIG_NETFILTER_XT_CONNMARK) += xt_connmark.o
46 49
47# targets 50# targets
51obj-$(CONFIG_NETFILTER_XT_TARGET_AUDIT) += xt_AUDIT.o
48obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o 52obj-$(CONFIG_NETFILTER_XT_TARGET_CHECKSUM) += xt_CHECKSUM.o
49obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o 53obj-$(CONFIG_NETFILTER_XT_TARGET_CLASSIFY) += xt_CLASSIFY.o
50obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o 54obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 32fcbe290c04..1e00bf7d27c5 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -175,13 +175,21 @@ next_hook:
175 ret = 1; 175 ret = 1;
176 } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { 176 } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
177 kfree_skb(skb); 177 kfree_skb(skb);
178 ret = -(verdict >> NF_VERDICT_BITS); 178 ret = NF_DROP_GETERR(verdict);
179 if (ret == 0) 179 if (ret == 0)
180 ret = -EPERM; 180 ret = -EPERM;
181 } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { 181 } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
182 if (!nf_queue(skb, elem, pf, hook, indev, outdev, okfn, 182 ret = nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
183 verdict >> NF_VERDICT_BITS)) 183 verdict >> NF_VERDICT_QBITS);
184 goto next_hook; 184 if (ret < 0) {
185 if (ret == -ECANCELED)
186 goto next_hook;
187 if (ret == -ESRCH &&
188 (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
189 goto next_hook;
190 kfree_skb(skb);
191 }
192 ret = 0;
185 } 193 }
186 rcu_read_unlock(); 194 rcu_read_unlock();
187 return ret; 195 return ret;
@@ -214,7 +222,7 @@ EXPORT_SYMBOL(skb_make_writable);
214/* This does not belong here, but locally generated errors need it if connection 222/* This does not belong here, but locally generated errors need it if connection
215 tracking in use: without this, connection may not be in hash table, and hence 223 tracking in use: without this, connection may not be in hash table, and hence
216 manufactured ICMP or RST packets will not be associated with it. */ 224 manufactured ICMP or RST packets will not be associated with it. */
217void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); 225void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *) __rcu __read_mostly;
218EXPORT_SYMBOL(ip_ct_attach); 226EXPORT_SYMBOL(ip_ct_attach);
219 227
220void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) 228void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
@@ -231,7 +239,7 @@ void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
231} 239}
232EXPORT_SYMBOL(nf_ct_attach); 240EXPORT_SYMBOL(nf_ct_attach);
233 241
234void (*nf_ct_destroy)(struct nf_conntrack *); 242void (*nf_ct_destroy)(struct nf_conntrack *) __rcu __read_mostly;
235EXPORT_SYMBOL(nf_ct_destroy); 243EXPORT_SYMBOL(nf_ct_destroy);
236 244
237void nf_conntrack_destroy(struct nf_conntrack *nfct) 245void nf_conntrack_destroy(struct nf_conntrack *nfct)
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index a475edee0912..5c48ffb60c28 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -43,11 +43,6 @@ EXPORT_SYMBOL(register_ip_vs_app);
43EXPORT_SYMBOL(unregister_ip_vs_app); 43EXPORT_SYMBOL(unregister_ip_vs_app);
44EXPORT_SYMBOL(register_ip_vs_app_inc); 44EXPORT_SYMBOL(register_ip_vs_app_inc);
45 45
46/* ipvs application list head */
47static LIST_HEAD(ip_vs_app_list);
48static DEFINE_MUTEX(__ip_vs_app_mutex);
49
50
51/* 46/*
52 * Get an ip_vs_app object 47 * Get an ip_vs_app object
53 */ 48 */
@@ -67,7 +62,8 @@ static inline void ip_vs_app_put(struct ip_vs_app *app)
67 * Allocate/initialize app incarnation and register it in proto apps. 62 * Allocate/initialize app incarnation and register it in proto apps.
68 */ 63 */
69static int 64static int
70ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) 65ip_vs_app_inc_new(struct net *net, struct ip_vs_app *app, __u16 proto,
66 __u16 port)
71{ 67{
72 struct ip_vs_protocol *pp; 68 struct ip_vs_protocol *pp;
73 struct ip_vs_app *inc; 69 struct ip_vs_app *inc;
@@ -98,7 +94,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
98 } 94 }
99 } 95 }
100 96
101 ret = pp->register_app(inc); 97 ret = pp->register_app(net, inc);
102 if (ret) 98 if (ret)
103 goto out; 99 goto out;
104 100
@@ -119,7 +115,7 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port)
119 * Release app incarnation 115 * Release app incarnation
120 */ 116 */
121static void 117static void
122ip_vs_app_inc_release(struct ip_vs_app *inc) 118ip_vs_app_inc_release(struct net *net, struct ip_vs_app *inc)
123{ 119{
124 struct ip_vs_protocol *pp; 120 struct ip_vs_protocol *pp;
125 121
@@ -127,7 +123,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc)
127 return; 123 return;
128 124
129 if (pp->unregister_app) 125 if (pp->unregister_app)
130 pp->unregister_app(inc); 126 pp->unregister_app(net, inc);
131 127
132 IP_VS_DBG(9, "%s App %s:%u unregistered\n", 128 IP_VS_DBG(9, "%s App %s:%u unregistered\n",
133 pp->name, inc->name, ntohs(inc->port)); 129 pp->name, inc->name, ntohs(inc->port));
@@ -168,15 +164,17 @@ void ip_vs_app_inc_put(struct ip_vs_app *inc)
168 * Register an application incarnation in protocol applications 164 * Register an application incarnation in protocol applications
169 */ 165 */
170int 166int
171register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) 167register_ip_vs_app_inc(struct net *net, struct ip_vs_app *app, __u16 proto,
168 __u16 port)
172{ 169{
170 struct netns_ipvs *ipvs = net_ipvs(net);
173 int result; 171 int result;
174 172
175 mutex_lock(&__ip_vs_app_mutex); 173 mutex_lock(&ipvs->app_mutex);
176 174
177 result = ip_vs_app_inc_new(app, proto, port); 175 result = ip_vs_app_inc_new(net, app, proto, port);
178 176
179 mutex_unlock(&__ip_vs_app_mutex); 177 mutex_unlock(&ipvs->app_mutex);
180 178
181 return result; 179 return result;
182} 180}
@@ -185,16 +183,17 @@ register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port)
185/* 183/*
186 * ip_vs_app registration routine 184 * ip_vs_app registration routine
187 */ 185 */
188int register_ip_vs_app(struct ip_vs_app *app) 186int register_ip_vs_app(struct net *net, struct ip_vs_app *app)
189{ 187{
188 struct netns_ipvs *ipvs = net_ipvs(net);
190 /* increase the module use count */ 189 /* increase the module use count */
191 ip_vs_use_count_inc(); 190 ip_vs_use_count_inc();
192 191
193 mutex_lock(&__ip_vs_app_mutex); 192 mutex_lock(&ipvs->app_mutex);
194 193
195 list_add(&app->a_list, &ip_vs_app_list); 194 list_add(&app->a_list, &ipvs->app_list);
196 195
197 mutex_unlock(&__ip_vs_app_mutex); 196 mutex_unlock(&ipvs->app_mutex);
198 197
199 return 0; 198 return 0;
200} 199}
@@ -204,19 +203,20 @@ int register_ip_vs_app(struct ip_vs_app *app)
204 * ip_vs_app unregistration routine 203 * ip_vs_app unregistration routine
205 * We are sure there are no app incarnations attached to services 204 * We are sure there are no app incarnations attached to services
206 */ 205 */
207void unregister_ip_vs_app(struct ip_vs_app *app) 206void unregister_ip_vs_app(struct net *net, struct ip_vs_app *app)
208{ 207{
208 struct netns_ipvs *ipvs = net_ipvs(net);
209 struct ip_vs_app *inc, *nxt; 209 struct ip_vs_app *inc, *nxt;
210 210
211 mutex_lock(&__ip_vs_app_mutex); 211 mutex_lock(&ipvs->app_mutex);
212 212
213 list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { 213 list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) {
214 ip_vs_app_inc_release(inc); 214 ip_vs_app_inc_release(net, inc);
215 } 215 }
216 216
217 list_del(&app->a_list); 217 list_del(&app->a_list);
218 218
219 mutex_unlock(&__ip_vs_app_mutex); 219 mutex_unlock(&ipvs->app_mutex);
220 220
221 /* decrease the module use count */ 221 /* decrease the module use count */
222 ip_vs_use_count_dec(); 222 ip_vs_use_count_dec();
@@ -226,7 +226,8 @@ void unregister_ip_vs_app(struct ip_vs_app *app)
226/* 226/*
227 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) 227 * Bind ip_vs_conn to its ip_vs_app (called by cp constructor)
228 */ 228 */
229int ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) 229int ip_vs_bind_app(struct ip_vs_conn *cp,
230 struct ip_vs_protocol *pp)
230{ 231{
231 return pp->app_conn_bind(cp); 232 return pp->app_conn_bind(cp);
232} 233}
@@ -481,11 +482,11 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
481 * /proc/net/ip_vs_app entry function 482 * /proc/net/ip_vs_app entry function
482 */ 483 */
483 484
484static struct ip_vs_app *ip_vs_app_idx(loff_t pos) 485static struct ip_vs_app *ip_vs_app_idx(struct netns_ipvs *ipvs, loff_t pos)
485{ 486{
486 struct ip_vs_app *app, *inc; 487 struct ip_vs_app *app, *inc;
487 488
488 list_for_each_entry(app, &ip_vs_app_list, a_list) { 489 list_for_each_entry(app, &ipvs->app_list, a_list) {
489 list_for_each_entry(inc, &app->incs_list, a_list) { 490 list_for_each_entry(inc, &app->incs_list, a_list) {
490 if (pos-- == 0) 491 if (pos-- == 0)
491 return inc; 492 return inc;
@@ -497,19 +498,24 @@ static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
497 498
498static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) 499static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
499{ 500{
500 mutex_lock(&__ip_vs_app_mutex); 501 struct net *net = seq_file_net(seq);
502 struct netns_ipvs *ipvs = net_ipvs(net);
501 503
502 return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; 504 mutex_lock(&ipvs->app_mutex);
505
506 return *pos ? ip_vs_app_idx(ipvs, *pos - 1) : SEQ_START_TOKEN;
503} 507}
504 508
505static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) 509static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
506{ 510{
507 struct ip_vs_app *inc, *app; 511 struct ip_vs_app *inc, *app;
508 struct list_head *e; 512 struct list_head *e;
513 struct net *net = seq_file_net(seq);
514 struct netns_ipvs *ipvs = net_ipvs(net);
509 515
510 ++*pos; 516 ++*pos;
511 if (v == SEQ_START_TOKEN) 517 if (v == SEQ_START_TOKEN)
512 return ip_vs_app_idx(0); 518 return ip_vs_app_idx(ipvs, 0);
513 519
514 inc = v; 520 inc = v;
515 app = inc->app; 521 app = inc->app;
@@ -518,7 +524,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
518 return list_entry(e, struct ip_vs_app, a_list); 524 return list_entry(e, struct ip_vs_app, a_list);
519 525
520 /* go on to next application */ 526 /* go on to next application */
521 for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { 527 for (e = app->a_list.next; e != &ipvs->app_list; e = e->next) {
522 app = list_entry(e, struct ip_vs_app, a_list); 528 app = list_entry(e, struct ip_vs_app, a_list);
523 list_for_each_entry(inc, &app->incs_list, a_list) { 529 list_for_each_entry(inc, &app->incs_list, a_list) {
524 return inc; 530 return inc;
@@ -529,7 +535,9 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
529 535
530static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) 536static void ip_vs_app_seq_stop(struct seq_file *seq, void *v)
531{ 537{
532 mutex_unlock(&__ip_vs_app_mutex); 538 struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq));
539
540 mutex_unlock(&ipvs->app_mutex);
533} 541}
534 542
535static int ip_vs_app_seq_show(struct seq_file *seq, void *v) 543static int ip_vs_app_seq_show(struct seq_file *seq, void *v)
@@ -557,7 +565,8 @@ static const struct seq_operations ip_vs_app_seq_ops = {
557 565
558static int ip_vs_app_open(struct inode *inode, struct file *file) 566static int ip_vs_app_open(struct inode *inode, struct file *file)
559{ 567{
560 return seq_open(file, &ip_vs_app_seq_ops); 568 return seq_open_net(inode, file, &ip_vs_app_seq_ops,
569 sizeof(struct seq_net_private));
561} 570}
562 571
563static const struct file_operations ip_vs_app_fops = { 572static const struct file_operations ip_vs_app_fops = {
@@ -569,15 +578,36 @@ static const struct file_operations ip_vs_app_fops = {
569}; 578};
570#endif 579#endif
571 580
572int __init ip_vs_app_init(void) 581static int __net_init __ip_vs_app_init(struct net *net)
573{ 582{
574 /* we will replace it with proc_net_ipvs_create() soon */ 583 struct netns_ipvs *ipvs = net_ipvs(net);
575 proc_net_fops_create(&init_net, "ip_vs_app", 0, &ip_vs_app_fops); 584
585 INIT_LIST_HEAD(&ipvs->app_list);
586 __mutex_init(&ipvs->app_mutex, "ipvs->app_mutex", &ipvs->app_key);
587 proc_net_fops_create(net, "ip_vs_app", 0, &ip_vs_app_fops);
576 return 0; 588 return 0;
577} 589}
578 590
591static void __net_exit __ip_vs_app_cleanup(struct net *net)
592{
593 proc_net_remove(net, "ip_vs_app");
594}
595
596static struct pernet_operations ip_vs_app_ops = {
597 .init = __ip_vs_app_init,
598 .exit = __ip_vs_app_cleanup,
599};
600
601int __init ip_vs_app_init(void)
602{
603 int rv;
604
605 rv = register_pernet_subsys(&ip_vs_app_ops);
606 return rv;
607}
608
579 609
580void ip_vs_app_cleanup(void) 610void ip_vs_app_cleanup(void)
581{ 611{
582 proc_net_remove(&init_net, "ip_vs_app"); 612 unregister_pernet_subsys(&ip_vs_app_ops);
583} 613}
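Annotation: the ip_vs_app.c changes above replace the single global app list, mutex and /proc entry with per-namespace copies set up through register_pernet_subsys(). Below is a minimal sketch of that pernet_operations pattern; it uses net_generic() and hypothetical example_* names rather than the IPVS net_ipvs() accessor, so it illustrates the technique only, not the patch itself.

#include <linux/module.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/* Hypothetical per-namespace state; IPVS hangs its own state off
 * net_ipvs(net), but net_generic() is the generic form of the pattern. */
struct example_net {
	struct list_head app_list;
	struct mutex app_mutex;
};

static int example_net_id __read_mostly;

static int __net_init example_net_init(struct net *net)
{
	struct example_net *en = net_generic(net, example_net_id);

	INIT_LIST_HEAD(&en->app_list);
	mutex_init(&en->app_mutex);
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* per-net proc entries and lists would be torn down here */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
	.id   = &example_net_id,
	.size = sizeof(struct example_net),
};

static int __init example_init(void)
{
	/* .init runs for the initial namespace and every one created later */
	return register_pernet_subsys(&example_net_ops);
}

static void __exit example_exit(void)
{
	unregister_pernet_subsys(&example_net_ops);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");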
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index e9adecdc8ca4..83233fe24a08 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -48,35 +48,32 @@
48/* 48/*
49 * Connection hash size. Default is what was selected at compile time. 49 * Connection hash size. Default is what was selected at compile time.
50*/ 50*/
51int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS; 51static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
52module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444); 52module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
53MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size"); 53MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
54 54
55/* size and mask values */ 55/* size and mask values */
56int ip_vs_conn_tab_size; 56int ip_vs_conn_tab_size __read_mostly;
57int ip_vs_conn_tab_mask; 57static int ip_vs_conn_tab_mask __read_mostly;
58 58
59/* 59/*
60 * Connection hash table: for input and output packets lookups of IPVS 60 * Connection hash table: for input and output packets lookups of IPVS
61 */ 61 */
62static struct list_head *ip_vs_conn_tab; 62static struct list_head *ip_vs_conn_tab __read_mostly;
63 63
64/* SLAB cache for IPVS connections */ 64/* SLAB cache for IPVS connections */
65static struct kmem_cache *ip_vs_conn_cachep __read_mostly; 65static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
66 66
67/* counter for current IPVS connections */
68static atomic_t ip_vs_conn_count = ATOMIC_INIT(0);
69
70/* counter for no client port connections */ 67/* counter for no client port connections */
71static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); 68static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0);
72 69
73/* random value for IPVS connection hash */ 70/* random value for IPVS connection hash */
74static unsigned int ip_vs_conn_rnd; 71static unsigned int ip_vs_conn_rnd __read_mostly;
75 72
76/* 73/*
77 * Fine locking granularity for big connection hash table 74 * Fine locking granularity for big connection hash table
78 */ 75 */
79#define CT_LOCKARRAY_BITS 4 76#define CT_LOCKARRAY_BITS 5
80#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS) 77#define CT_LOCKARRAY_SIZE (1<<CT_LOCKARRAY_BITS)
81#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1) 78#define CT_LOCKARRAY_MASK (CT_LOCKARRAY_SIZE-1)
82 79
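Annotation: the hunk above also widens the connection-table lock striping from 16 to 32 rwlocks (CT_LOCKARRAY_BITS 4 -> 5), so more hash buckets can be locked independently. A small sketch of that striping idea, with hypothetical example_* names:

#include <linux/spinlock.h>

#define EXAMPLE_LOCKARRAY_BITS	5
#define EXAMPLE_LOCKARRAY_SIZE	(1 << EXAMPLE_LOCKARRAY_BITS)
#define EXAMPLE_LOCKARRAY_MASK	(EXAMPLE_LOCKARRAY_SIZE - 1)

/* One stripe of rwlocks shared by many hash buckets; each element is
 * rwlock_init()'d once at start-up, much as ip_vs_conn_init() does. */
static rwlock_t example_lock_array[EXAMPLE_LOCKARRAY_SIZE];

static inline void example_read_lock(unsigned int hash)
{
	read_lock(&example_lock_array[hash & EXAMPLE_LOCKARRAY_MASK]);
}

static inline void example_read_unlock(unsigned int hash)
{
	read_unlock(&example_lock_array[hash & EXAMPLE_LOCKARRAY_MASK]);
}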
@@ -133,19 +130,19 @@ static inline void ct_write_unlock_bh(unsigned key)
133/* 130/*
134 * Returns hash value for IPVS connection entry 131 * Returns hash value for IPVS connection entry
135 */ 132 */
136static unsigned int ip_vs_conn_hashkey(int af, unsigned proto, 133static unsigned int ip_vs_conn_hashkey(struct net *net, int af, unsigned proto,
137 const union nf_inet_addr *addr, 134 const union nf_inet_addr *addr,
138 __be16 port) 135 __be16 port)
139{ 136{
140#ifdef CONFIG_IP_VS_IPV6 137#ifdef CONFIG_IP_VS_IPV6
141 if (af == AF_INET6) 138 if (af == AF_INET6)
142 return jhash_3words(jhash(addr, 16, ip_vs_conn_rnd), 139 return (jhash_3words(jhash(addr, 16, ip_vs_conn_rnd),
143 (__force u32)port, proto, ip_vs_conn_rnd) 140 (__force u32)port, proto, ip_vs_conn_rnd) ^
144 & ip_vs_conn_tab_mask; 141 ((size_t)net>>8)) & ip_vs_conn_tab_mask;
145#endif 142#endif
146 return jhash_3words((__force u32)addr->ip, (__force u32)port, proto, 143 return (jhash_3words((__force u32)addr->ip, (__force u32)port, proto,
147 ip_vs_conn_rnd) 144 ip_vs_conn_rnd) ^
148 & ip_vs_conn_tab_mask; 145 ((size_t)net>>8)) & ip_vs_conn_tab_mask;
149} 146}
150 147
151static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, 148static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
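Annotation: the hashkey hunk above mixes the owning struct net pointer into the jhash result, so identical flows in different namespaces land in different buckets of the shared table. Roughly, and assuming illustrative example_* names and an arbitrary table size:

#include <linux/jhash.h>
#include <net/net_namespace.h>

#define EXAMPLE_TAB_BITS 12
#define EXAMPLE_TAB_MASK ((1 << EXAMPLE_TAB_BITS) - 1)

static unsigned int example_rnd;	/* seeded once via get_random_bytes() */

/* Fold the owning netns pointer into the key; ">> 8" drops the low bits
 * that are identical for all struct net allocations. */
static unsigned int example_hashkey(struct net *net, unsigned int proto,
				    __be32 addr, __be16 port)
{
	return (jhash_3words((__force u32)addr, (__force u32)port, proto,
			     example_rnd) ^
		((size_t)net >> 8)) & EXAMPLE_TAB_MASK;
}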
@@ -166,18 +163,18 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
166 port = p->vport; 163 port = p->vport;
167 } 164 }
168 165
169 return ip_vs_conn_hashkey(p->af, p->protocol, addr, port); 166 return ip_vs_conn_hashkey(p->net, p->af, p->protocol, addr, port);
170} 167}
171 168
172static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) 169static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
173{ 170{
174 struct ip_vs_conn_param p; 171 struct ip_vs_conn_param p;
175 172
176 ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport, 173 ip_vs_conn_fill_param(ip_vs_conn_net(cp), cp->af, cp->protocol,
177 NULL, 0, &p); 174 &cp->caddr, cp->cport, NULL, 0, &p);
178 175
179 if (cp->dest && cp->dest->svc->pe) { 176 if (cp->pe) {
180 p.pe = cp->dest->svc->pe; 177 p.pe = cp->pe;
181 p.pe_data = cp->pe_data; 178 p.pe_data = cp->pe_data;
182 p.pe_data_len = cp->pe_data_len; 179 p.pe_data_len = cp->pe_data_len;
183 } 180 }
@@ -186,7 +183,7 @@ static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp)
186} 183}
187 184
188/* 185/*
189 * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. 186 * Hashes ip_vs_conn in ip_vs_conn_tab by netns,proto,addr,port.
190 * returns bool success. 187 * returns bool success.
191 */ 188 */
192static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) 189static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
@@ -269,11 +266,12 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
269 266
270 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 267 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
271 if (cp->af == p->af && 268 if (cp->af == p->af &&
269 p->cport == cp->cport && p->vport == cp->vport &&
272 ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && 270 ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) &&
273 ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && 271 ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) &&
274 p->cport == cp->cport && p->vport == cp->vport &&
275 ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && 272 ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
276 p->protocol == cp->protocol) { 273 p->protocol == cp->protocol &&
274 ip_vs_conn_net_eq(cp, p->net)) {
277 /* HIT */ 275 /* HIT */
278 atomic_inc(&cp->refcnt); 276 atomic_inc(&cp->refcnt);
279 ct_read_unlock(hash); 277 ct_read_unlock(hash);
@@ -313,23 +311,23 @@ ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb,
313 struct ip_vs_conn_param *p) 311 struct ip_vs_conn_param *p)
314{ 312{
315 __be16 _ports[2], *pptr; 313 __be16 _ports[2], *pptr;
314 struct net *net = skb_net(skb);
316 315
317 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); 316 pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports);
318 if (pptr == NULL) 317 if (pptr == NULL)
319 return 1; 318 return 1;
320 319
321 if (likely(!inverse)) 320 if (likely(!inverse))
322 ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0], 321 ip_vs_conn_fill_param(net, af, iph->protocol, &iph->saddr,
323 &iph->daddr, pptr[1], p); 322 pptr[0], &iph->daddr, pptr[1], p);
324 else 323 else
325 ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1], 324 ip_vs_conn_fill_param(net, af, iph->protocol, &iph->daddr,
326 &iph->saddr, pptr[0], p); 325 pptr[1], &iph->saddr, pptr[0], p);
327 return 0; 326 return 0;
328} 327}
329 328
330struct ip_vs_conn * 329struct ip_vs_conn *
331ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, 330ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb,
332 struct ip_vs_protocol *pp,
333 const struct ip_vs_iphdr *iph, 331 const struct ip_vs_iphdr *iph,
334 unsigned int proto_off, int inverse) 332 unsigned int proto_off, int inverse)
335{ 333{
@@ -353,8 +351,10 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
353 ct_read_lock(hash); 351 ct_read_lock(hash);
354 352
355 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 353 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
354 if (!ip_vs_conn_net_eq(cp, p->net))
355 continue;
356 if (p->pe_data && p->pe->ct_match) { 356 if (p->pe_data && p->pe->ct_match) {
357 if (p->pe->ct_match(p, cp)) 357 if (p->pe == cp->pe && p->pe->ct_match(p, cp))
358 goto out; 358 goto out;
359 continue; 359 continue;
360 } 360 }
@@ -404,10 +404,11 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
404 404
405 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { 405 list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) {
406 if (cp->af == p->af && 406 if (cp->af == p->af &&
407 p->vport == cp->cport && p->cport == cp->dport &&
407 ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && 408 ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) &&
408 ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && 409 ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) &&
409 p->vport == cp->cport && p->cport == cp->dport && 410 p->protocol == cp->protocol &&
410 p->protocol == cp->protocol) { 411 ip_vs_conn_net_eq(cp, p->net)) {
411 /* HIT */ 412 /* HIT */
412 atomic_inc(&cp->refcnt); 413 atomic_inc(&cp->refcnt);
413 ret = cp; 414 ret = cp;
@@ -428,7 +429,6 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
428 429
429struct ip_vs_conn * 430struct ip_vs_conn *
430ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, 431ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb,
431 struct ip_vs_protocol *pp,
432 const struct ip_vs_iphdr *iph, 432 const struct ip_vs_iphdr *iph,
433 unsigned int proto_off, int inverse) 433 unsigned int proto_off, int inverse)
434{ 434{
@@ -611,9 +611,9 @@ struct ip_vs_dest *ip_vs_try_bind_dest(struct ip_vs_conn *cp)
611 struct ip_vs_dest *dest; 611 struct ip_vs_dest *dest;
612 612
613 if ((cp) && (!cp->dest)) { 613 if ((cp) && (!cp->dest)) {
614 dest = ip_vs_find_dest(cp->af, &cp->daddr, cp->dport, 614 dest = ip_vs_find_dest(ip_vs_conn_net(cp), cp->af, &cp->daddr,
615 &cp->vaddr, cp->vport, 615 cp->dport, &cp->vaddr, cp->vport,
616 cp->protocol); 616 cp->protocol, cp->fwmark);
617 ip_vs_bind_dest(cp, dest); 617 ip_vs_bind_dest(cp, dest);
618 return dest; 618 return dest;
619 } else 619 } else
@@ -686,13 +686,14 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
686int ip_vs_check_template(struct ip_vs_conn *ct) 686int ip_vs_check_template(struct ip_vs_conn *ct)
687{ 687{
688 struct ip_vs_dest *dest = ct->dest; 688 struct ip_vs_dest *dest = ct->dest;
689 struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(ct));
689 690
690 /* 691 /*
691 * Checking the dest server status. 692 * Checking the dest server status.
692 */ 693 */
693 if ((dest == NULL) || 694 if ((dest == NULL) ||
694 !(dest->flags & IP_VS_DEST_F_AVAILABLE) || 695 !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
695 (sysctl_ip_vs_expire_quiescent_template && 696 (ipvs->sysctl_expire_quiescent_template &&
696 (atomic_read(&dest->weight) == 0))) { 697 (atomic_read(&dest->weight) == 0))) {
697 IP_VS_DBG_BUF(9, "check_template: dest not available for " 698 IP_VS_DBG_BUF(9, "check_template: dest not available for "
698 "protocol %s s:%s:%d v:%s:%d " 699 "protocol %s s:%s:%d v:%s:%d "
@@ -730,6 +731,7 @@ int ip_vs_check_template(struct ip_vs_conn *ct)
730static void ip_vs_conn_expire(unsigned long data) 731static void ip_vs_conn_expire(unsigned long data)
731{ 732{
732 struct ip_vs_conn *cp = (struct ip_vs_conn *)data; 733 struct ip_vs_conn *cp = (struct ip_vs_conn *)data;
734 struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
733 735
734 cp->timeout = 60*HZ; 736 cp->timeout = 60*HZ;
735 737
@@ -765,13 +767,14 @@ static void ip_vs_conn_expire(unsigned long data)
765 if (cp->flags & IP_VS_CONN_F_NFCT) 767 if (cp->flags & IP_VS_CONN_F_NFCT)
766 ip_vs_conn_drop_conntrack(cp); 768 ip_vs_conn_drop_conntrack(cp);
767 769
770 ip_vs_pe_put(cp->pe);
768 kfree(cp->pe_data); 771 kfree(cp->pe_data);
769 if (unlikely(cp->app != NULL)) 772 if (unlikely(cp->app != NULL))
770 ip_vs_unbind_app(cp); 773 ip_vs_unbind_app(cp);
771 ip_vs_unbind_dest(cp); 774 ip_vs_unbind_dest(cp);
772 if (cp->flags & IP_VS_CONN_F_NO_CPORT) 775 if (cp->flags & IP_VS_CONN_F_NO_CPORT)
773 atomic_dec(&ip_vs_conn_no_cport_cnt); 776 atomic_dec(&ip_vs_conn_no_cport_cnt);
774 atomic_dec(&ip_vs_conn_count); 777 atomic_dec(&ipvs->conn_count);
775 778
776 kmem_cache_free(ip_vs_conn_cachep, cp); 779 kmem_cache_free(ip_vs_conn_cachep, cp);
777 return; 780 return;
@@ -802,10 +805,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
802struct ip_vs_conn * 805struct ip_vs_conn *
803ip_vs_conn_new(const struct ip_vs_conn_param *p, 806ip_vs_conn_new(const struct ip_vs_conn_param *p,
804 const union nf_inet_addr *daddr, __be16 dport, unsigned flags, 807 const union nf_inet_addr *daddr, __be16 dport, unsigned flags,
805 struct ip_vs_dest *dest) 808 struct ip_vs_dest *dest, __u32 fwmark)
806{ 809{
807 struct ip_vs_conn *cp; 810 struct ip_vs_conn *cp;
808 struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol); 811 struct netns_ipvs *ipvs = net_ipvs(p->net);
812 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->net,
813 p->protocol);
809 814
810 cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); 815 cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC);
811 if (cp == NULL) { 816 if (cp == NULL) {
@@ -815,6 +820,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
815 820
816 INIT_LIST_HEAD(&cp->c_list); 821 INIT_LIST_HEAD(&cp->c_list);
817 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); 822 setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp);
823 ip_vs_conn_net_set(cp, p->net);
818 cp->af = p->af; 824 cp->af = p->af;
819 cp->protocol = p->protocol; 825 cp->protocol = p->protocol;
820 ip_vs_addr_copy(p->af, &cp->caddr, p->caddr); 826 ip_vs_addr_copy(p->af, &cp->caddr, p->caddr);
@@ -826,7 +832,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
826 &cp->daddr, daddr); 832 &cp->daddr, daddr);
827 cp->dport = dport; 833 cp->dport = dport;
828 cp->flags = flags; 834 cp->flags = flags;
829 if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) { 835 cp->fwmark = fwmark;
836 if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
837 ip_vs_pe_get(p->pe);
838 cp->pe = p->pe;
830 cp->pe_data = p->pe_data; 839 cp->pe_data = p->pe_data;
831 cp->pe_data_len = p->pe_data_len; 840 cp->pe_data_len = p->pe_data_len;
832 } 841 }
@@ -842,7 +851,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
842 atomic_set(&cp->n_control, 0); 851 atomic_set(&cp->n_control, 0);
843 atomic_set(&cp->in_pkts, 0); 852 atomic_set(&cp->in_pkts, 0);
844 853
845 atomic_inc(&ip_vs_conn_count); 854 atomic_inc(&ipvs->conn_count);
846 if (flags & IP_VS_CONN_F_NO_CPORT) 855 if (flags & IP_VS_CONN_F_NO_CPORT)
847 atomic_inc(&ip_vs_conn_no_cport_cnt); 856 atomic_inc(&ip_vs_conn_no_cport_cnt);
848 857
@@ -861,8 +870,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
861#endif 870#endif
862 ip_vs_bind_xmit(cp); 871 ip_vs_bind_xmit(cp);
863 872
864 if (unlikely(pp && atomic_read(&pp->appcnt))) 873 if (unlikely(pd && atomic_read(&pd->appcnt)))
865 ip_vs_bind_app(cp, pp); 874 ip_vs_bind_app(cp, pd->pp);
866 875
867 /* 876 /*
868 * Allow conntrack to be preserved. By default, conntrack 877 * Allow conntrack to be preserved. By default, conntrack
@@ -871,7 +880,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
871 * IP_VS_CONN_F_ONE_PACKET too. 880 * IP_VS_CONN_F_ONE_PACKET too.
872 */ 881 */
873 882
874 if (ip_vs_conntrack_enabled()) 883 if (ip_vs_conntrack_enabled(ipvs))
875 cp->flags |= IP_VS_CONN_F_NFCT; 884 cp->flags |= IP_VS_CONN_F_NFCT;
876 885
877 /* Hash it in the ip_vs_conn_tab finally */ 886 /* Hash it in the ip_vs_conn_tab finally */
@@ -884,17 +893,22 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p,
884 * /proc/net/ip_vs_conn entries 893 * /proc/net/ip_vs_conn entries
885 */ 894 */
886#ifdef CONFIG_PROC_FS 895#ifdef CONFIG_PROC_FS
896struct ip_vs_iter_state {
897 struct seq_net_private p;
898 struct list_head *l;
899};
887 900
888static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) 901static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
889{ 902{
890 int idx; 903 int idx;
891 struct ip_vs_conn *cp; 904 struct ip_vs_conn *cp;
905 struct ip_vs_iter_state *iter = seq->private;
892 906
893 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 907 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
894 ct_read_lock_bh(idx); 908 ct_read_lock_bh(idx);
895 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 909 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
896 if (pos-- == 0) { 910 if (pos-- == 0) {
897 seq->private = &ip_vs_conn_tab[idx]; 911 iter->l = &ip_vs_conn_tab[idx];
898 return cp; 912 return cp;
899 } 913 }
900 } 914 }
@@ -906,14 +920,17 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
906 920
907static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) 921static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
908{ 922{
909 seq->private = NULL; 923 struct ip_vs_iter_state *iter = seq->private;
924
925 iter->l = NULL;
910 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; 926 return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
911} 927}
912 928
913static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) 929static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
914{ 930{
915 struct ip_vs_conn *cp = v; 931 struct ip_vs_conn *cp = v;
916 struct list_head *e, *l = seq->private; 932 struct ip_vs_iter_state *iter = seq->private;
933 struct list_head *e, *l = iter->l;
917 int idx; 934 int idx;
918 935
919 ++*pos; 936 ++*pos;
@@ -930,18 +947,19 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
930 while (++idx < ip_vs_conn_tab_size) { 947 while (++idx < ip_vs_conn_tab_size) {
931 ct_read_lock_bh(idx); 948 ct_read_lock_bh(idx);
932 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 949 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
933 seq->private = &ip_vs_conn_tab[idx]; 950 iter->l = &ip_vs_conn_tab[idx];
934 return cp; 951 return cp;
935 } 952 }
936 ct_read_unlock_bh(idx); 953 ct_read_unlock_bh(idx);
937 } 954 }
938 seq->private = NULL; 955 iter->l = NULL;
939 return NULL; 956 return NULL;
940} 957}
941 958
942static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) 959static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
943{ 960{
944 struct list_head *l = seq->private; 961 struct ip_vs_iter_state *iter = seq->private;
962 struct list_head *l = iter->l;
945 963
946 if (l) 964 if (l)
947 ct_read_unlock_bh(l - ip_vs_conn_tab); 965 ct_read_unlock_bh(l - ip_vs_conn_tab);
@@ -955,18 +973,19 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
955 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); 973 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n");
956 else { 974 else {
957 const struct ip_vs_conn *cp = v; 975 const struct ip_vs_conn *cp = v;
976 struct net *net = seq_file_net(seq);
958 char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; 977 char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
959 size_t len = 0; 978 size_t len = 0;
960 979
961 if (cp->dest && cp->pe_data && 980 if (!ip_vs_conn_net_eq(cp, net))
962 cp->dest->svc->pe->show_pe_data) { 981 return 0;
982 if (cp->pe_data) {
963 pe_data[0] = ' '; 983 pe_data[0] = ' ';
964 len = strlen(cp->dest->svc->pe->name); 984 len = strlen(cp->pe->name);
965 memcpy(pe_data + 1, cp->dest->svc->pe->name, len); 985 memcpy(pe_data + 1, cp->pe->name, len);
966 pe_data[len + 1] = ' '; 986 pe_data[len + 1] = ' ';
967 len += 2; 987 len += 2;
968 len += cp->dest->svc->pe->show_pe_data(cp, 988 len += cp->pe->show_pe_data(cp, pe_data + len);
969 pe_data + len);
970 } 989 }
971 pe_data[len] = '\0'; 990 pe_data[len] = '\0';
972 991
@@ -1004,7 +1023,8 @@ static const struct seq_operations ip_vs_conn_seq_ops = {
1004 1023
1005static int ip_vs_conn_open(struct inode *inode, struct file *file) 1024static int ip_vs_conn_open(struct inode *inode, struct file *file)
1006{ 1025{
1007 return seq_open(file, &ip_vs_conn_seq_ops); 1026 return seq_open_net(inode, file, &ip_vs_conn_seq_ops,
1027 sizeof(struct ip_vs_iter_state));
1008} 1028}
1009 1029
1010static const struct file_operations ip_vs_conn_fops = { 1030static const struct file_operations ip_vs_conn_fops = {
@@ -1031,6 +1051,10 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
1031 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n"); 1051 "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Origin Expires\n");
1032 else { 1052 else {
1033 const struct ip_vs_conn *cp = v; 1053 const struct ip_vs_conn *cp = v;
1054 struct net *net = seq_file_net(seq);
1055
1056 if (!ip_vs_conn_net_eq(cp, net))
1057 return 0;
1034 1058
1035#ifdef CONFIG_IP_VS_IPV6 1059#ifdef CONFIG_IP_VS_IPV6
1036 if (cp->af == AF_INET6) 1060 if (cp->af == AF_INET6)
@@ -1067,7 +1091,8 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
1067 1091
1068static int ip_vs_conn_sync_open(struct inode *inode, struct file *file) 1092static int ip_vs_conn_sync_open(struct inode *inode, struct file *file)
1069{ 1093{
1070 return seq_open(file, &ip_vs_conn_sync_seq_ops); 1094 return seq_open_net(inode, file, &ip_vs_conn_sync_seq_ops,
1095 sizeof(struct ip_vs_iter_state));
1071} 1096}
1072 1097
1073static const struct file_operations ip_vs_conn_sync_fops = { 1098static const struct file_operations ip_vs_conn_sync_fops = {
@@ -1113,7 +1138,7 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
1113} 1138}
1114 1139
1115/* Called from keventd and must protect itself from softirqs */ 1140/* Called from keventd and must protect itself from softirqs */
1116void ip_vs_random_dropentry(void) 1141void ip_vs_random_dropentry(struct net *net)
1117{ 1142{
1118 int idx; 1143 int idx;
1119 struct ip_vs_conn *cp; 1144 struct ip_vs_conn *cp;
@@ -1133,7 +1158,8 @@ void ip_vs_random_dropentry(void)
1133 if (cp->flags & IP_VS_CONN_F_TEMPLATE) 1158 if (cp->flags & IP_VS_CONN_F_TEMPLATE)
1134 /* connection template */ 1159 /* connection template */
1135 continue; 1160 continue;
1136 1161 if (!ip_vs_conn_net_eq(cp, net))
1162 continue;
1137 if (cp->protocol == IPPROTO_TCP) { 1163 if (cp->protocol == IPPROTO_TCP) {
1138 switch(cp->state) { 1164 switch(cp->state) {
1139 case IP_VS_TCP_S_SYN_RECV: 1165 case IP_VS_TCP_S_SYN_RECV:
@@ -1168,12 +1194,13 @@ void ip_vs_random_dropentry(void)
1168/* 1194/*
1169 * Flush all the connection entries in the ip_vs_conn_tab 1195 * Flush all the connection entries in the ip_vs_conn_tab
1170 */ 1196 */
1171static void ip_vs_conn_flush(void) 1197static void ip_vs_conn_flush(struct net *net)
1172{ 1198{
1173 int idx; 1199 int idx;
1174 struct ip_vs_conn *cp; 1200 struct ip_vs_conn *cp;
1201 struct netns_ipvs *ipvs = net_ipvs(net);
1175 1202
1176 flush_again: 1203flush_again:
1177 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { 1204 for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
1178 /* 1205 /*
1179 * Lock is actually needed in this loop. 1206 * Lock is actually needed in this loop.
@@ -1181,7 +1208,8 @@ static void ip_vs_conn_flush(void)
1181 ct_write_lock_bh(idx); 1208 ct_write_lock_bh(idx);
1182 1209
1183 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { 1210 list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) {
1184 1211 if (!ip_vs_conn_net_eq(cp, net))
1212 continue;
1185 IP_VS_DBG(4, "del connection\n"); 1213 IP_VS_DBG(4, "del connection\n");
1186 ip_vs_conn_expire_now(cp); 1214 ip_vs_conn_expire_now(cp);
1187 if (cp->control) { 1215 if (cp->control) {
@@ -1194,16 +1222,41 @@ static void ip_vs_conn_flush(void)
1194 1222
1195 /* the counter may be not NULL, because maybe some conn entries 1223 /* the counter may be not NULL, because maybe some conn entries
1196 are run by slow timer handler or unhashed but still referred */ 1224 are run by slow timer handler or unhashed but still referred */
1197 if (atomic_read(&ip_vs_conn_count) != 0) { 1225 if (atomic_read(&ipvs->conn_count) != 0) {
1198 schedule(); 1226 schedule();
1199 goto flush_again; 1227 goto flush_again;
1200 } 1228 }
1201} 1229}
1230/*
1231 * per netns init and exit
1232 */
1233int __net_init __ip_vs_conn_init(struct net *net)
1234{
1235 struct netns_ipvs *ipvs = net_ipvs(net);
1236
1237 atomic_set(&ipvs->conn_count, 0);
1238
1239 proc_net_fops_create(net, "ip_vs_conn", 0, &ip_vs_conn_fops);
1240 proc_net_fops_create(net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
1241 return 0;
1242}
1202 1243
1244static void __net_exit __ip_vs_conn_cleanup(struct net *net)
1245{
1246 /* flush all the connection entries first */
1247 ip_vs_conn_flush(net);
1248 proc_net_remove(net, "ip_vs_conn");
1249 proc_net_remove(net, "ip_vs_conn_sync");
1250}
1251static struct pernet_operations ipvs_conn_ops = {
1252 .init = __ip_vs_conn_init,
1253 .exit = __ip_vs_conn_cleanup,
1254};
1203 1255
1204int __init ip_vs_conn_init(void) 1256int __init ip_vs_conn_init(void)
1205{ 1257{
1206 int idx; 1258 int idx;
1259 int retc;
1207 1260
1208 /* Compute size and mask */ 1261 /* Compute size and mask */
1209 ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; 1262 ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
@@ -1241,24 +1294,18 @@ int __init ip_vs_conn_init(void)
1241 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); 1294 rwlock_init(&__ip_vs_conntbl_lock_array[idx].l);
1242 } 1295 }
1243 1296
1244 proc_net_fops_create(&init_net, "ip_vs_conn", 0, &ip_vs_conn_fops); 1297 retc = register_pernet_subsys(&ipvs_conn_ops);
1245 proc_net_fops_create(&init_net, "ip_vs_conn_sync", 0, &ip_vs_conn_sync_fops);
1246 1298
1247 /* calculate the random value for connection hash */ 1299 /* calculate the random value for connection hash */
1248 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); 1300 get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd));
1249 1301
1250 return 0; 1302 return retc;
1251} 1303}
1252 1304
1253
1254void ip_vs_conn_cleanup(void) 1305void ip_vs_conn_cleanup(void)
1255{ 1306{
1256 /* flush all the connection entries first */ 1307 unregister_pernet_subsys(&ipvs_conn_ops);
1257 ip_vs_conn_flush();
1258
1259 /* Release the empty cache */ 1308 /* Release the empty cache */
1260 kmem_cache_destroy(ip_vs_conn_cachep); 1309 kmem_cache_destroy(ip_vs_conn_cachep);
1261 proc_net_remove(&init_net, "ip_vs_conn");
1262 proc_net_remove(&init_net, "ip_vs_conn_sync");
1263 vfree(ip_vs_conn_tab); 1310 vfree(ip_vs_conn_tab);
1264} 1311}
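Annotation: several /proc handlers in this file switch from seq_open() to seq_open_net() so that each reader only sees entries belonging to its own network namespace. A compact sketch of that pattern follows, with hypothetical example_* names; only the netns plumbing is meant to mirror the patch, the iteration itself is deliberately trivial.

#include <linux/module.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/seq_file_net.h>
#include <net/net_namespace.h>

/* Per-open iterator state: the first member must be seq_net_private so
 * that seq_file_net() can find the opener's network namespace. */
struct example_iter_state {
	struct seq_net_private p;
	struct list_head *bucket;	/* cursor, as ip_vs_conn.c keeps iter->l */
};

static void *example_seq_start(struct seq_file *seq, loff_t *pos)
{
	return *pos ? NULL : SEQ_START_TOKEN;	/* single header line only */
}

static void *example_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void example_seq_stop(struct seq_file *seq, void *v)
{
}

static int example_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = seq_file_net(seq);	/* namespace of the opener */

	/* a real implementation would skip entries not belonging to 'net' */
	seq_printf(seq, "entries for netns %p\n", net);
	return 0;
}

static const struct seq_operations example_seq_ops = {
	.start = example_seq_start,
	.next  = example_seq_next,
	.stop  = example_seq_stop,
	.show  = example_seq_show,
};

static int example_open(struct inode *inode, struct file *file)
{
	/* seq_open_net() sizes the private area and records the netns */
	return seq_open_net(inode, file, &example_seq_ops,
			    sizeof(struct example_iter_state));
}

/* registered per namespace, e.g. via proc_net_fops_create(net, ...) */
static const struct file_operations example_fops = {
	.owner   = THIS_MODULE,
	.open    = example_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};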
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index b4e51e9c5a04..f36a84f33efb 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -41,6 +41,7 @@
41#include <net/icmp.h> /* for icmp_send */ 41#include <net/icmp.h> /* for icmp_send */
42#include <net/route.h> 42#include <net/route.h>
43#include <net/ip6_checksum.h> 43#include <net/ip6_checksum.h>
44#include <net/netns/generic.h> /* net_generic() */
44 45
45#include <linux/netfilter.h> 46#include <linux/netfilter.h>
46#include <linux/netfilter_ipv4.h> 47#include <linux/netfilter_ipv4.h>
@@ -68,6 +69,12 @@ EXPORT_SYMBOL(ip_vs_conn_put);
68EXPORT_SYMBOL(ip_vs_get_debug_level); 69EXPORT_SYMBOL(ip_vs_get_debug_level);
69#endif 70#endif
70 71
72int ip_vs_net_id __read_mostly;
73#ifdef IP_VS_GENERIC_NETNS
74EXPORT_SYMBOL(ip_vs_net_id);
75#endif
76/* netns cnt used for uniqueness */
77static atomic_t ipvs_netns_cnt = ATOMIC_INIT(0);
71 78
72/* ID used in ICMP lookups */ 79/* ID used in ICMP lookups */
73#define icmp_id(icmph) (((icmph)->un).echo.id) 80#define icmp_id(icmph) (((icmph)->un).echo.id)
@@ -108,21 +115,28 @@ static inline void
108ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 115ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
109{ 116{
110 struct ip_vs_dest *dest = cp->dest; 117 struct ip_vs_dest *dest = cp->dest;
118 struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
119
111 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 120 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
112 spin_lock(&dest->stats.lock); 121 struct ip_vs_cpu_stats *s;
113 dest->stats.ustats.inpkts++; 122
114 dest->stats.ustats.inbytes += skb->len; 123 s = this_cpu_ptr(dest->stats.cpustats);
115 spin_unlock(&dest->stats.lock); 124 s->ustats.inpkts++;
116 125 u64_stats_update_begin(&s->syncp);
117 spin_lock(&dest->svc->stats.lock); 126 s->ustats.inbytes += skb->len;
118 dest->svc->stats.ustats.inpkts++; 127 u64_stats_update_end(&s->syncp);
119 dest->svc->stats.ustats.inbytes += skb->len; 128
120 spin_unlock(&dest->svc->stats.lock); 129 s = this_cpu_ptr(dest->svc->stats.cpustats);
121 130 s->ustats.inpkts++;
122 spin_lock(&ip_vs_stats.lock); 131 u64_stats_update_begin(&s->syncp);
123 ip_vs_stats.ustats.inpkts++; 132 s->ustats.inbytes += skb->len;
124 ip_vs_stats.ustats.inbytes += skb->len; 133 u64_stats_update_end(&s->syncp);
125 spin_unlock(&ip_vs_stats.lock); 134
135 s = this_cpu_ptr(ipvs->cpustats);
136 s->ustats.inpkts++;
137 u64_stats_update_begin(&s->syncp);
138 s->ustats.inbytes += skb->len;
139 u64_stats_update_end(&s->syncp);
126 } 140 }
127} 141}
128 142
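Annotation: the stats hunks replace the spinlock-protected global counters with per-CPU counters guarded by u64_stats_sync, so the packet path only writes CPU-local data and readers sum the per-CPU values. A sketch of that pattern under assumed example_* names (not the IPVS ip_vs_cpu_stats layout):

#include <linux/percpu.h>
#include <linux/u64_stats_sync.h>

struct example_cpu_stats {
	u64 inpkts;
	u64 inbytes;
	struct u64_stats_sync syncp;
};

/* allocated once with alloc_percpu(struct example_cpu_stats) */
static struct example_cpu_stats __percpu *example_stats;

/* hot path: touch only this CPU's counters, no shared lock */
static void example_count(unsigned int len)
{
	struct example_cpu_stats *s = this_cpu_ptr(example_stats);

	u64_stats_update_begin(&s->syncp);
	s->inpkts++;
	s->inbytes += len;
	u64_stats_update_end(&s->syncp);
}

/* reader: sum all CPUs, retrying a CPU whose writer was mid-update */
static void example_read(u64 *pkts, u64 *bytes)
{
	int cpu;

	*pkts = 0;
	*bytes = 0;
	for_each_possible_cpu(cpu) {
		struct example_cpu_stats *s = per_cpu_ptr(example_stats, cpu);
		unsigned int start;
		u64 p, b;

		do {
			start = u64_stats_fetch_begin(&s->syncp);
			p = s->inpkts;
			b = s->inbytes;
		} while (u64_stats_fetch_retry(&s->syncp, start));

		*pkts += p;
		*bytes += b;
	}
}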
@@ -131,21 +145,28 @@ static inline void
131ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) 145ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
132{ 146{
133 struct ip_vs_dest *dest = cp->dest; 147 struct ip_vs_dest *dest = cp->dest;
148 struct netns_ipvs *ipvs = net_ipvs(skb_net(skb));
149
134 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { 150 if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) {
135 spin_lock(&dest->stats.lock); 151 struct ip_vs_cpu_stats *s;
136 dest->stats.ustats.outpkts++; 152
137 dest->stats.ustats.outbytes += skb->len; 153 s = this_cpu_ptr(dest->stats.cpustats);
138 spin_unlock(&dest->stats.lock); 154 s->ustats.outpkts++;
139 155 u64_stats_update_begin(&s->syncp);
140 spin_lock(&dest->svc->stats.lock); 156 s->ustats.outbytes += skb->len;
141 dest->svc->stats.ustats.outpkts++; 157 u64_stats_update_end(&s->syncp);
142 dest->svc->stats.ustats.outbytes += skb->len; 158
143 spin_unlock(&dest->svc->stats.lock); 159 s = this_cpu_ptr(dest->svc->stats.cpustats);
144 160 s->ustats.outpkts++;
145 spin_lock(&ip_vs_stats.lock); 161 u64_stats_update_begin(&s->syncp);
146 ip_vs_stats.ustats.outpkts++; 162 s->ustats.outbytes += skb->len;
147 ip_vs_stats.ustats.outbytes += skb->len; 163 u64_stats_update_end(&s->syncp);
148 spin_unlock(&ip_vs_stats.lock); 164
165 s = this_cpu_ptr(ipvs->cpustats);
166 s->ustats.outpkts++;
167 u64_stats_update_begin(&s->syncp);
168 s->ustats.outbytes += skb->len;
169 u64_stats_update_end(&s->syncp);
149 } 170 }
150} 171}
151 172
@@ -153,41 +174,44 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb)
153static inline void 174static inline void
154ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) 175ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
155{ 176{
156 spin_lock(&cp->dest->stats.lock); 177 struct netns_ipvs *ipvs = net_ipvs(svc->net);
157 cp->dest->stats.ustats.conns++; 178 struct ip_vs_cpu_stats *s;
158 spin_unlock(&cp->dest->stats.lock); 179
180 s = this_cpu_ptr(cp->dest->stats.cpustats);
181 s->ustats.conns++;
159 182
160 spin_lock(&svc->stats.lock); 183 s = this_cpu_ptr(svc->stats.cpustats);
161 svc->stats.ustats.conns++; 184 s->ustats.conns++;
162 spin_unlock(&svc->stats.lock);
163 185
164 spin_lock(&ip_vs_stats.lock); 186 s = this_cpu_ptr(ipvs->cpustats);
165 ip_vs_stats.ustats.conns++; 187 s->ustats.conns++;
166 spin_unlock(&ip_vs_stats.lock);
167} 188}
168 189
169 190
170static inline int 191static inline int
171ip_vs_set_state(struct ip_vs_conn *cp, int direction, 192ip_vs_set_state(struct ip_vs_conn *cp, int direction,
172 const struct sk_buff *skb, 193 const struct sk_buff *skb,
173 struct ip_vs_protocol *pp) 194 struct ip_vs_proto_data *pd)
174{ 195{
175 if (unlikely(!pp->state_transition)) 196 if (unlikely(!pd->pp->state_transition))
176 return 0; 197 return 0;
177 return pp->state_transition(cp, direction, skb, pp); 198 return pd->pp->state_transition(cp, direction, skb, pd);
178} 199}
179 200
180static inline void 201static inline int
181ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, 202ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
182 struct sk_buff *skb, int protocol, 203 struct sk_buff *skb, int protocol,
183 const union nf_inet_addr *caddr, __be16 cport, 204 const union nf_inet_addr *caddr, __be16 cport,
184 const union nf_inet_addr *vaddr, __be16 vport, 205 const union nf_inet_addr *vaddr, __be16 vport,
185 struct ip_vs_conn_param *p) 206 struct ip_vs_conn_param *p)
186{ 207{
187 ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p); 208 ip_vs_conn_fill_param(svc->net, svc->af, protocol, caddr, cport, vaddr,
209 vport, p);
188 p->pe = svc->pe; 210 p->pe = svc->pe;
189 if (p->pe && p->pe->fill_param) 211 if (p->pe && p->pe->fill_param)
190 p->pe->fill_param(p, skb); 212 return p->pe->fill_param(p, skb);
213
214 return 0;
191} 215}
192 216
193/* 217/*
@@ -200,7 +224,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc,
200static struct ip_vs_conn * 224static struct ip_vs_conn *
201ip_vs_sched_persist(struct ip_vs_service *svc, 225ip_vs_sched_persist(struct ip_vs_service *svc,
202 struct sk_buff *skb, 226 struct sk_buff *skb,
203 __be16 ports[2]) 227 __be16 src_port, __be16 dst_port, int *ignored)
204{ 228{
205 struct ip_vs_conn *cp = NULL; 229 struct ip_vs_conn *cp = NULL;
206 struct ip_vs_iphdr iph; 230 struct ip_vs_iphdr iph;
@@ -224,8 +248,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
224 248
225 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u " 249 IP_VS_DBG_BUF(6, "p-schedule: src %s:%u dest %s:%u "
226 "mnet %s\n", 250 "mnet %s\n",
227 IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(ports[0]), 251 IP_VS_DBG_ADDR(svc->af, &iph.saddr), ntohs(src_port),
228 IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(ports[1]), 252 IP_VS_DBG_ADDR(svc->af, &iph.daddr), ntohs(dst_port),
229 IP_VS_DBG_ADDR(svc->af, &snet)); 253 IP_VS_DBG_ADDR(svc->af, &snet));
230 254
231 /* 255 /*
@@ -247,14 +271,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
247 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; 271 const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) };
248 __be16 vport = 0; 272 __be16 vport = 0;
249 273
250 if (ports[1] == svc->port) { 274 if (dst_port == svc->port) {
251 /* non-FTP template: 275 /* non-FTP template:
252 * <protocol, caddr, 0, vaddr, vport, daddr, dport> 276 * <protocol, caddr, 0, vaddr, vport, daddr, dport>
253 * FTP template: 277 * FTP template:
254 * <protocol, caddr, 0, vaddr, 0, daddr, 0> 278 * <protocol, caddr, 0, vaddr, 0, daddr, 0>
255 */ 279 */
256 if (svc->port != FTPPORT) 280 if (svc->port != FTPPORT)
257 vport = ports[1]; 281 vport = dst_port;
258 } else { 282 } else {
259 /* Note: persistent fwmark-based services and 283 /* Note: persistent fwmark-based services and
260 * persistent port zero service are handled here. 284 * persistent port zero service are handled here.
@@ -268,24 +292,31 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
268 vaddr = &fwmark; 292 vaddr = &fwmark;
269 } 293 }
270 } 294 }
271 ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, 295 /* return *ignored = -1 so NF_DROP can be used */
272 vaddr, vport, &param); 296 if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0,
297 vaddr, vport, &param) < 0) {
298 *ignored = -1;
299 return NULL;
300 }
273 } 301 }
274 302
275 /* Check if a template already exists */ 303 /* Check if a template already exists */
276 ct = ip_vs_ct_in_get(&param); 304 ct = ip_vs_ct_in_get(&param);
277 if (!ct || !ip_vs_check_template(ct)) { 305 if (!ct || !ip_vs_check_template(ct)) {
278 /* No template found or the dest of the connection 306 /*
307 * No template found or the dest of the connection
279 * template is not available. 308 * template is not available.
309 * return *ignored=0 i.e. ICMP and NF_DROP
280 */ 310 */
281 dest = svc->scheduler->schedule(svc, skb); 311 dest = svc->scheduler->schedule(svc, skb);
282 if (!dest) { 312 if (!dest) {
283 IP_VS_DBG(1, "p-schedule: no dest found.\n"); 313 IP_VS_DBG(1, "p-schedule: no dest found.\n");
284 kfree(param.pe_data); 314 kfree(param.pe_data);
315 *ignored = 0;
285 return NULL; 316 return NULL;
286 } 317 }
287 318
288 if (ports[1] == svc->port && svc->port != FTPPORT) 319 if (dst_port == svc->port && svc->port != FTPPORT)
289 dport = dest->port; 320 dport = dest->port;
290 321
291 /* Create a template 322 /* Create a template
@@ -293,9 +324,10 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
293 * and thus param.pe_data will be destroyed 324 * and thus param.pe_data will be destroyed
294 * when the template expires */ 325 * when the template expires */
295 ct = ip_vs_conn_new(&param, &dest->addr, dport, 326 ct = ip_vs_conn_new(&param, &dest->addr, dport,
296 IP_VS_CONN_F_TEMPLATE, dest); 327 IP_VS_CONN_F_TEMPLATE, dest, skb->mark);
297 if (ct == NULL) { 328 if (ct == NULL) {
298 kfree(param.pe_data); 329 kfree(param.pe_data);
330 *ignored = -1;
299 return NULL; 331 return NULL;
300 } 332 }
301 333
@@ -306,7 +338,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
306 kfree(param.pe_data); 338 kfree(param.pe_data);
307 } 339 }
308 340
309 dport = ports[1]; 341 dport = dst_port;
310 if (dport == svc->port && dest->port) 342 if (dport == svc->port && dest->port)
311 dport = dest->port; 343 dport = dest->port;
312 344
@@ -317,11 +349,13 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
317 /* 349 /*
318 * Create a new connection according to the template 350 * Create a new connection according to the template
319 */ 351 */
320 ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0], 352 ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol, &iph.saddr,
321 &iph.daddr, ports[1], &param); 353 src_port, &iph.daddr, dst_port, &param);
322 cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest); 354
355 cp = ip_vs_conn_new(&param, &dest->addr, dport, flags, dest, skb->mark);
323 if (cp == NULL) { 356 if (cp == NULL) {
324 ip_vs_conn_put(ct); 357 ip_vs_conn_put(ct);
358 *ignored = -1;
325 return NULL; 359 return NULL;
326 } 360 }
327 361
@@ -341,11 +375,27 @@ ip_vs_sched_persist(struct ip_vs_service *svc,
341 * It selects a server according to the virtual service, and 375 * It selects a server according to the virtual service, and
342 * creates a connection entry. 376 * creates a connection entry.
343 * Protocols supported: TCP, UDP 377 * Protocols supported: TCP, UDP
378 *
379 * Usage of *ignored
380 *
381 * 1 : protocol tried to schedule (eg. on SYN), found svc but the
382 * svc/scheduler decides that this packet should be accepted with
383 * NF_ACCEPT because it must not be scheduled.
384 *
385 * 0 : scheduler can not find destination, so try bypass or
386 * return ICMP and then NF_DROP (ip_vs_leave).
387 *
388 * -1 : scheduler tried to schedule but fatal error occurred, eg.
389 * ip_vs_conn_new failure (ENOMEM) or ip_vs_sip_fill_param
390 * failure such as missing Call-ID, ENOMEM on skb_linearize
391 * or pe_data. In this case we should return NF_DROP without
392 * any attempts to send ICMP with ip_vs_leave.
344 */ 393 */
345struct ip_vs_conn * 394struct ip_vs_conn *
346ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, 395ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
347 struct ip_vs_protocol *pp, int *ignored) 396 struct ip_vs_proto_data *pd, int *ignored)
348{ 397{
398 struct ip_vs_protocol *pp = pd->pp;
349 struct ip_vs_conn *cp = NULL; 399 struct ip_vs_conn *cp = NULL;
350 struct ip_vs_iphdr iph; 400 struct ip_vs_iphdr iph;
351 struct ip_vs_dest *dest; 401 struct ip_vs_dest *dest;
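Annotation: the comment block above defines the new tri-state meaning of *ignored. The following is only a hedged sketch of how a caller might act on it, not the actual ip_vs_in() code from the patch; ip_vs_schedule(), ip_vs_leave() and the netfilter verdicts are the real names, the wrapper is illustrative.

#include <linux/netfilter.h>
#include <net/ip_vs.h>

static unsigned int example_try_schedule(struct ip_vs_service *svc,
					 struct sk_buff *skb,
					 struct ip_vs_proto_data *pd)
{
	int ignored;
	struct ip_vs_conn *cp = ip_vs_schedule(svc, skb, pd, &ignored);

	if (cp)
		return NF_ACCEPT;	/* scheduled; the real caller also
					 * updates stats/state and transmits */
	if (ignored == 1)
		return NF_ACCEPT;	/* svc decided not to schedule this */
	if (ignored == 0)
		return ip_vs_leave(svc, skb, pd);	/* bypass or ICMP, then drop */

	return NF_DROP;			/* ignored < 0: fatal error, drop quietly */
}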
@@ -371,12 +421,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
371 } 421 }
372 422
373 /* 423 /*
374 * Do not schedule replies from local real server. It is risky 424 * Do not schedule replies from local real server.
375 * for fwmark services but mostly for persistent services.
376 */ 425 */
377 if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && 426 if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) &&
378 (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) && 427 (cp = pp->conn_in_get(svc->af, skb, &iph, iph.len, 1))) {
379 (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) {
380 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, 428 IP_VS_DBG_PKT(12, svc->af, pp, skb, 0,
381 "Not scheduling reply for existing connection"); 429 "Not scheduling reply for existing connection");
382 __ip_vs_conn_put(cp); 430 __ip_vs_conn_put(cp);
@@ -386,10 +434,10 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
386 /* 434 /*
387 * Persistent service 435 * Persistent service
388 */ 436 */
389 if (svc->flags & IP_VS_SVC_F_PERSISTENT) { 437 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
390 *ignored = 0; 438 return ip_vs_sched_persist(svc, skb, pptr[0], pptr[1], ignored);
391 return ip_vs_sched_persist(svc, skb, pptr); 439
392 } 440 *ignored = 0;
393 441
394 /* 442 /*
395 * Non-persistent service 443 * Non-persistent service
@@ -402,8 +450,6 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
402 return NULL; 450 return NULL;
403 } 451 }
404 452
405 *ignored = 0;
406
407 dest = svc->scheduler->schedule(svc, skb); 453 dest = svc->scheduler->schedule(svc, skb);
408 if (dest == NULL) { 454 if (dest == NULL) {
409 IP_VS_DBG(1, "Schedule: no dest found.\n"); 455 IP_VS_DBG(1, "Schedule: no dest found.\n");
@@ -419,13 +465,17 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
419 */ 465 */
420 { 466 {
421 struct ip_vs_conn_param p; 467 struct ip_vs_conn_param p;
422 ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, 468
423 pptr[0], &iph.daddr, pptr[1], &p); 469 ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
470 &iph.saddr, pptr[0], &iph.daddr, pptr[1],
471 &p);
424 cp = ip_vs_conn_new(&p, &dest->addr, 472 cp = ip_vs_conn_new(&p, &dest->addr,
425 dest->port ? dest->port : pptr[1], 473 dest->port ? dest->port : pptr[1],
426 flags, dest); 474 flags, dest, skb->mark);
427 if (!cp) 475 if (!cp) {
476 *ignored = -1;
428 return NULL; 477 return NULL;
478 }
429 } 479 }
430 480
431 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " 481 IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u "
@@ -447,11 +497,14 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb,
447 * no destination is available for a new connection. 497 * no destination is available for a new connection.
448 */ 498 */
449int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, 499int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
450 struct ip_vs_protocol *pp) 500 struct ip_vs_proto_data *pd)
451{ 501{
502 struct net *net;
503 struct netns_ipvs *ipvs;
452 __be16 _ports[2], *pptr; 504 __be16 _ports[2], *pptr;
453 struct ip_vs_iphdr iph; 505 struct ip_vs_iphdr iph;
454 int unicast; 506 int unicast;
507
455 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); 508 ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph);
456 509
457 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); 510 pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports);
@@ -459,18 +512,20 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
459 ip_vs_service_put(svc); 512 ip_vs_service_put(svc);
460 return NF_DROP; 513 return NF_DROP;
461 } 514 }
515 net = skb_net(skb);
462 516
463#ifdef CONFIG_IP_VS_IPV6 517#ifdef CONFIG_IP_VS_IPV6
464 if (svc->af == AF_INET6) 518 if (svc->af == AF_INET6)
465 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST; 519 unicast = ipv6_addr_type(&iph.daddr.in6) & IPV6_ADDR_UNICAST;
466 else 520 else
467#endif 521#endif
468 unicast = (inet_addr_type(&init_net, iph.daddr.ip) == RTN_UNICAST); 522 unicast = (inet_addr_type(net, iph.daddr.ip) == RTN_UNICAST);
469 523
470 /* if it is fwmark-based service, the cache_bypass sysctl is up 524 /* if it is fwmark-based service, the cache_bypass sysctl is up
471 and the destination is a non-local unicast, then create 525 and the destination is a non-local unicast, then create
472 a cache_bypass connection entry */ 526 a cache_bypass connection entry */
473 if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { 527 ipvs = net_ipvs(net);
528 if (ipvs->sysctl_cache_bypass && svc->fwmark && unicast) {
474 int ret, cs; 529 int ret, cs;
475 struct ip_vs_conn *cp; 530 struct ip_vs_conn *cp;
476 unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && 531 unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET &&
@@ -484,12 +539,12 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
484 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); 539 IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__);
485 { 540 {
486 struct ip_vs_conn_param p; 541 struct ip_vs_conn_param p;
487 ip_vs_conn_fill_param(svc->af, iph.protocol, 542 ip_vs_conn_fill_param(svc->net, svc->af, iph.protocol,
488 &iph.saddr, pptr[0], 543 &iph.saddr, pptr[0],
489 &iph.daddr, pptr[1], &p); 544 &iph.daddr, pptr[1], &p);
490 cp = ip_vs_conn_new(&p, &daddr, 0, 545 cp = ip_vs_conn_new(&p, &daddr, 0,
491 IP_VS_CONN_F_BYPASS | flags, 546 IP_VS_CONN_F_BYPASS | flags,
492 NULL); 547 NULL, skb->mark);
493 if (!cp) 548 if (!cp)
494 return NF_DROP; 549 return NF_DROP;
495 } 550 }
@@ -498,10 +553,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
498 ip_vs_in_stats(cp, skb); 553 ip_vs_in_stats(cp, skb);
499 554
500 /* set state */ 555 /* set state */
501 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); 556 cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
502 557
503 /* transmit the first SYN packet */ 558 /* transmit the first SYN packet */
504 ret = cp->packet_xmit(skb, cp, pp); 559 ret = cp->packet_xmit(skb, cp, pd->pp);
505 /* do not touch skb anymore */ 560 /* do not touch skb anymore */
506 561
507 atomic_inc(&cp->in_pkts); 562 atomic_inc(&cp->in_pkts);
@@ -682,6 +737,7 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
682 struct ip_vs_protocol *pp, 737 struct ip_vs_protocol *pp,
683 unsigned int offset, unsigned int ihl) 738 unsigned int offset, unsigned int ihl)
684{ 739{
740 struct netns_ipvs *ipvs;
685 unsigned int verdict = NF_DROP; 741 unsigned int verdict = NF_DROP;
686 742
687 if (IP_VS_FWD_METHOD(cp) != 0) { 743 if (IP_VS_FWD_METHOD(cp) != 0) {
@@ -703,6 +759,8 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
703 if (!skb_make_writable(skb, offset)) 759 if (!skb_make_writable(skb, offset))
704 goto out; 760 goto out;
705 761
762 ipvs = net_ipvs(skb_net(skb));
763
706#ifdef CONFIG_IP_VS_IPV6 764#ifdef CONFIG_IP_VS_IPV6
707 if (af == AF_INET6) 765 if (af == AF_INET6)
708 ip_vs_nat_icmp_v6(skb, pp, cp, 1); 766 ip_vs_nat_icmp_v6(skb, pp, cp, 1);
@@ -712,11 +770,11 @@ static int handle_response_icmp(int af, struct sk_buff *skb,
712 770
713#ifdef CONFIG_IP_VS_IPV6 771#ifdef CONFIG_IP_VS_IPV6
714 if (af == AF_INET6) { 772 if (af == AF_INET6) {
715 if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) 773 if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0)
716 goto out; 774 goto out;
717 } else 775 } else
718#endif 776#endif
719 if ((sysctl_ip_vs_snat_reroute || 777 if ((ipvs->sysctl_snat_reroute ||
720 skb_rtable(skb)->rt_flags & RTCF_LOCAL) && 778 skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
721 ip_route_me_harder(skb, RTN_LOCAL) != 0) 779 ip_route_me_harder(skb, RTN_LOCAL) != 0)
722 goto out; 780 goto out;
@@ -808,7 +866,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related,
808 866
809 ip_vs_fill_iphdr(AF_INET, cih, &ciph); 867 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
810 /* The embedded headers contain source and dest in reverse order */ 868 /* The embedded headers contain source and dest in reverse order */
811 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); 869 cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
812 if (!cp) 870 if (!cp)
813 return NF_ACCEPT; 871 return NF_ACCEPT;
814 872
@@ -885,7 +943,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related,
885 943
886 ip_vs_fill_iphdr(AF_INET6, cih, &ciph); 944 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
887 /* The embedded headers contain source and dest in reverse order */ 945 /* The embedded headers contain source and dest in reverse order */
888 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); 946 cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
889 if (!cp) 947 if (!cp)
890 return NF_ACCEPT; 948 return NF_ACCEPT;
891 949
@@ -924,9 +982,12 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len)
924 * Used for NAT and local client. 982 * Used for NAT and local client.
925 */ 983 */
926static unsigned int 984static unsigned int
927handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, 985handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
928 struct ip_vs_conn *cp, int ihl) 986 struct ip_vs_conn *cp, int ihl)
929{ 987{
988 struct ip_vs_protocol *pp = pd->pp;
989 struct netns_ipvs *ipvs;
990
930 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); 991 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet");
931 992
932 if (!skb_make_writable(skb, ihl)) 993 if (!skb_make_writable(skb, ihl))
@@ -961,13 +1022,15 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
961 * if it came from this machine itself. So re-compute 1022 * if it came from this machine itself. So re-compute
962 * the routing information. 1023 * the routing information.
963 */ 1024 */
1025 ipvs = net_ipvs(skb_net(skb));
1026
964#ifdef CONFIG_IP_VS_IPV6 1027#ifdef CONFIG_IP_VS_IPV6
965 if (af == AF_INET6) { 1028 if (af == AF_INET6) {
966 if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) 1029 if (ipvs->sysctl_snat_reroute && ip6_route_me_harder(skb) != 0)
967 goto drop; 1030 goto drop;
968 } else 1031 } else
969#endif 1032#endif
970 if ((sysctl_ip_vs_snat_reroute || 1033 if ((ipvs->sysctl_snat_reroute ||
971 skb_rtable(skb)->rt_flags & RTCF_LOCAL) && 1034 skb_rtable(skb)->rt_flags & RTCF_LOCAL) &&
972 ip_route_me_harder(skb, RTN_LOCAL) != 0) 1035 ip_route_me_harder(skb, RTN_LOCAL) != 0)
973 goto drop; 1036 goto drop;
@@ -975,7 +1038,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
975 IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); 1038 IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT");
976 1039
977 ip_vs_out_stats(cp, skb); 1040 ip_vs_out_stats(cp, skb);
978 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); 1041 ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pd);
979 skb->ipvs_property = 1; 1042 skb->ipvs_property = 1;
980 if (!(cp->flags & IP_VS_CONN_F_NFCT)) 1043 if (!(cp->flags & IP_VS_CONN_F_NFCT))
981 ip_vs_notrack(skb); 1044 ip_vs_notrack(skb);
@@ -999,9 +1062,12 @@ drop:
999static unsigned int 1062static unsigned int
1000ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) 1063ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
1001{ 1064{
1065 struct net *net = NULL;
1002 struct ip_vs_iphdr iph; 1066 struct ip_vs_iphdr iph;
1003 struct ip_vs_protocol *pp; 1067 struct ip_vs_protocol *pp;
1068 struct ip_vs_proto_data *pd;
1004 struct ip_vs_conn *cp; 1069 struct ip_vs_conn *cp;
1070 struct netns_ipvs *ipvs;
1005 1071
1006 EnterFunction(11); 1072 EnterFunction(11);
1007 1073
@@ -1022,6 +1088,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
1022 if (unlikely(!skb_dst(skb))) 1088 if (unlikely(!skb_dst(skb)))
1023 return NF_ACCEPT; 1089 return NF_ACCEPT;
1024 1090
1091 net = skb_net(skb);
1025 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); 1092 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1026#ifdef CONFIG_IP_VS_IPV6 1093#ifdef CONFIG_IP_VS_IPV6
1027 if (af == AF_INET6) { 1094 if (af == AF_INET6) {
@@ -1045,9 +1112,10 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
1045 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); 1112 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1046 } 1113 }
1047 1114
1048 pp = ip_vs_proto_get(iph.protocol); 1115 pd = ip_vs_proto_data_get(net, iph.protocol);
1049 if (unlikely(!pp)) 1116 if (unlikely(!pd))
1050 return NF_ACCEPT; 1117 return NF_ACCEPT;
1118 pp = pd->pp;
1051 1119
1052 /* reassemble IP fragments */ 1120 /* reassemble IP fragments */
1053#ifdef CONFIG_IP_VS_IPV6 1121#ifdef CONFIG_IP_VS_IPV6
@@ -1073,11 +1141,12 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
1073 /* 1141 /*
1074 * Check if the packet belongs to an existing entry 1142 * Check if the packet belongs to an existing entry
1075 */ 1143 */
1076 cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); 1144 cp = pp->conn_out_get(af, skb, &iph, iph.len, 0);
1145 ipvs = net_ipvs(net);
1077 1146
1078 if (likely(cp)) 1147 if (likely(cp))
1079 return handle_response(af, skb, pp, cp, iph.len); 1148 return handle_response(af, skb, pd, cp, iph.len);
1080 if (sysctl_ip_vs_nat_icmp_send && 1149 if (ipvs->sysctl_nat_icmp_send &&
1081 (pp->protocol == IPPROTO_TCP || 1150 (pp->protocol == IPPROTO_TCP ||
1082 pp->protocol == IPPROTO_UDP || 1151 pp->protocol == IPPROTO_UDP ||
1083 pp->protocol == IPPROTO_SCTP)) { 1152 pp->protocol == IPPROTO_SCTP)) {
@@ -1087,7 +1156,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af)
1087 sizeof(_ports), _ports); 1156 sizeof(_ports), _ports);
1088 if (pptr == NULL) 1157 if (pptr == NULL)
1089 return NF_ACCEPT; /* Not for me */ 1158 return NF_ACCEPT; /* Not for me */
1090 if (ip_vs_lookup_real_service(af, iph.protocol, 1159 if (ip_vs_lookup_real_service(net, af, iph.protocol,
1091 &iph.saddr, 1160 &iph.saddr,
1092 pptr[0])) { 1161 pptr[0])) {
1093 /* 1162 /*
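
Taken together, the ip_vs_out() hunks above make the lookup order namespace aware. A condensed sketch of the new sequence, using only calls visible in this diff (a fragment of the hook body, not a standalone function):

	struct net *net = skb_net(skb);
	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, iph.protocol);
	struct ip_vs_protocol *pp;
	struct ip_vs_conn *cp;

	if (unlikely(!pd))
		return NF_ACCEPT;	/* protocol not handled in this netns */
	pp = pd->pp;			/* conn_out_get() no longer takes pp as an argument */
	cp = pp->conn_out_get(af, skb, &iph, iph.len, 0);
	if (likely(cp))
		return handle_response(af, skb, pd, cp, iph.len);
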
@@ -1202,12 +1271,14 @@ ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb,
1202static int 1271static int
1203ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) 1272ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1204{ 1273{
1274 struct net *net = NULL;
1205 struct iphdr *iph; 1275 struct iphdr *iph;
1206 struct icmphdr _icmph, *ic; 1276 struct icmphdr _icmph, *ic;
1207 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ 1277 struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
1208 struct ip_vs_iphdr ciph; 1278 struct ip_vs_iphdr ciph;
1209 struct ip_vs_conn *cp; 1279 struct ip_vs_conn *cp;
1210 struct ip_vs_protocol *pp; 1280 struct ip_vs_protocol *pp;
1281 struct ip_vs_proto_data *pd;
1211 unsigned int offset, ihl, verdict; 1282 unsigned int offset, ihl, verdict;
1212 union nf_inet_addr snet; 1283 union nf_inet_addr snet;
1213 1284
@@ -1249,9 +1320,11 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1249 if (cih == NULL) 1320 if (cih == NULL)
1250 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1321 return NF_ACCEPT; /* The packet looks wrong, ignore */
1251 1322
1252 pp = ip_vs_proto_get(cih->protocol); 1323 net = skb_net(skb);
1253 if (!pp) 1324 pd = ip_vs_proto_data_get(net, cih->protocol);
1325 if (!pd)
1254 return NF_ACCEPT; 1326 return NF_ACCEPT;
1327 pp = pd->pp;
1255 1328
1256 /* Is the embedded protocol header present? */ 1329 /* Is the embedded protocol header present? */
1257 if (unlikely(cih->frag_off & htons(IP_OFFSET) && 1330 if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
@@ -1265,10 +1338,10 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1265 1338
1266 ip_vs_fill_iphdr(AF_INET, cih, &ciph); 1339 ip_vs_fill_iphdr(AF_INET, cih, &ciph);
1267 /* The embedded headers contain source and dest in reverse order */ 1340 /* The embedded headers contain source and dest in reverse order */
1268 cp = pp->conn_in_get(AF_INET, skb, pp, &ciph, offset, 1); 1341 cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1);
1269 if (!cp) { 1342 if (!cp) {
1270 /* The packet could also belong to a local client */ 1343 /* The packet could also belong to a local client */
1271 cp = pp->conn_out_get(AF_INET, skb, pp, &ciph, offset, 1); 1344 cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1);
1272 if (cp) { 1345 if (cp) {
1273 snet.ip = iph->saddr; 1346 snet.ip = iph->saddr;
1274 return handle_response_icmp(AF_INET, skb, &snet, 1347 return handle_response_icmp(AF_INET, skb, &snet,
@@ -1312,6 +1385,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
1312static int 1385static int
1313ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) 1386ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1314{ 1387{
1388 struct net *net = NULL;
1315 struct ipv6hdr *iph; 1389 struct ipv6hdr *iph;
1316 struct icmp6hdr _icmph, *ic; 1390 struct icmp6hdr _icmph, *ic;
1317 struct ipv6hdr _ciph, *cih; /* The ip header contained 1391 struct ipv6hdr _ciph, *cih; /* The ip header contained
@@ -1319,6 +1393,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1319 struct ip_vs_iphdr ciph; 1393 struct ip_vs_iphdr ciph;
1320 struct ip_vs_conn *cp; 1394 struct ip_vs_conn *cp;
1321 struct ip_vs_protocol *pp; 1395 struct ip_vs_protocol *pp;
1396 struct ip_vs_proto_data *pd;
1322 unsigned int offset, verdict; 1397 unsigned int offset, verdict;
1323 union nf_inet_addr snet; 1398 union nf_inet_addr snet;
1324 struct rt6_info *rt; 1399 struct rt6_info *rt;
@@ -1361,9 +1436,11 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1361 if (cih == NULL) 1436 if (cih == NULL)
1362 return NF_ACCEPT; /* The packet looks wrong, ignore */ 1437 return NF_ACCEPT; /* The packet looks wrong, ignore */
1363 1438
1364 pp = ip_vs_proto_get(cih->nexthdr); 1439 net = skb_net(skb);
1365 if (!pp) 1440 pd = ip_vs_proto_data_get(net, cih->nexthdr);
1441 if (!pd)
1366 return NF_ACCEPT; 1442 return NF_ACCEPT;
1443 pp = pd->pp;
1367 1444
1368 /* Is the embedded protocol header present? */ 1445 /* Is the embedded protocol header present? */
1369 /* TODO: we don't support fragmentation at the moment anyway */ 1446 /* TODO: we don't support fragmentation at the moment anyway */
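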
@@ -1377,10 +1454,10 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1377 1454
1378 ip_vs_fill_iphdr(AF_INET6, cih, &ciph); 1455 ip_vs_fill_iphdr(AF_INET6, cih, &ciph);
1379 /* The embedded headers contain source and dest in reverse order */ 1456 /* The embedded headers contain source and dest in reverse order */
1380 cp = pp->conn_in_get(AF_INET6, skb, pp, &ciph, offset, 1); 1457 cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1);
1381 if (!cp) { 1458 if (!cp) {
1382 /* The packet could also belong to a local client */ 1459 /* The packet could also belong to a local client */
1383 cp = pp->conn_out_get(AF_INET6, skb, pp, &ciph, offset, 1); 1460 cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1);
1384 if (cp) { 1461 if (cp) {
1385 ipv6_addr_copy(&snet.in6, &iph->saddr); 1462 ipv6_addr_copy(&snet.in6, &iph->saddr);
1386 return handle_response_icmp(AF_INET6, skb, &snet, 1463 return handle_response_icmp(AF_INET6, skb, &snet,
@@ -1423,10 +1500,13 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum)
1423static unsigned int 1500static unsigned int
1424ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) 1501ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1425{ 1502{
1503 struct net *net;
1426 struct ip_vs_iphdr iph; 1504 struct ip_vs_iphdr iph;
1427 struct ip_vs_protocol *pp; 1505 struct ip_vs_protocol *pp;
1506 struct ip_vs_proto_data *pd;
1428 struct ip_vs_conn *cp; 1507 struct ip_vs_conn *cp;
1429 int ret, restart, pkts; 1508 int ret, restart, pkts;
1509 struct netns_ipvs *ipvs;
1430 1510
1431 /* Already marked as IPVS request or reply? */ 1511 /* Already marked as IPVS request or reply? */
1432 if (skb->ipvs_property) 1512 if (skb->ipvs_property)
@@ -1480,20 +1560,21 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1480 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); 1560 ip_vs_fill_iphdr(af, skb_network_header(skb), &iph);
1481 } 1561 }
1482 1562
1563 net = skb_net(skb);
1483 /* Protocol supported? */ 1564 /* Protocol supported? */
1484 pp = ip_vs_proto_get(iph.protocol); 1565 pd = ip_vs_proto_data_get(net, iph.protocol);
1485 if (unlikely(!pp)) 1566 if (unlikely(!pd))
1486 return NF_ACCEPT; 1567 return NF_ACCEPT;
1487 1568 pp = pd->pp;
1488 /* 1569 /*
1489 * Check if the packet belongs to an existing connection entry 1570 * Check if the packet belongs to an existing connection entry
1490 */ 1571 */
1491 cp = pp->conn_in_get(af, skb, pp, &iph, iph.len, 0); 1572 cp = pp->conn_in_get(af, skb, &iph, iph.len, 0);
1492 1573
1493 if (unlikely(!cp)) { 1574 if (unlikely(!cp)) {
1494 int v; 1575 int v;
1495 1576
1496 if (!pp->conn_schedule(af, skb, pp, &v, &cp)) 1577 if (!pp->conn_schedule(af, skb, pd, &v, &cp))
1497 return v; 1578 return v;
1498 } 1579 }
1499 1580
@@ -1505,12 +1586,13 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1505 } 1586 }
1506 1587
1507 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); 1588 IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet");
1508 1589 net = skb_net(skb);
1590 ipvs = net_ipvs(net);
1509 /* Check the server status */ 1591 /* Check the server status */
1510 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { 1592 if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
1511 /* the destination server is not available */ 1593 /* the destination server is not available */
1512 1594
1513 if (sysctl_ip_vs_expire_nodest_conn) { 1595 if (ipvs->sysctl_expire_nodest_conn) {
1514 /* try to expire the connection immediately */ 1596 /* try to expire the connection immediately */
1515 ip_vs_conn_expire_now(cp); 1597 ip_vs_conn_expire_now(cp);
1516 } 1598 }
@@ -1521,7 +1603,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1521 } 1603 }
1522 1604
1523 ip_vs_in_stats(cp, skb); 1605 ip_vs_in_stats(cp, skb);
1524 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); 1606 restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pd);
1525 if (cp->packet_xmit) 1607 if (cp->packet_xmit)
1526 ret = cp->packet_xmit(skb, cp, pp); 1608 ret = cp->packet_xmit(skb, cp, pp);
1527 /* do not touch skb anymore */ 1609 /* do not touch skb anymore */
@@ -1535,35 +1617,41 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af)
1535 * 1617 *
1536 * Sync connection if it is about to close, to 1618 * Sync connection if it is about to close, to
1537 * encourage the standby servers to update the connection timeouts 1619 * encourage the standby servers to update the connection timeouts
1620 *
1621 * For ONE_PKT let ip_vs_sync_conn() do the filter work.
1538 */ 1622 */
1539 pkts = atomic_add_return(1, &cp->in_pkts); 1623
1540 if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) && 1624 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
1625 pkts = ipvs->sysctl_sync_threshold[0];
1626 else
1627 pkts = atomic_add_return(1, &cp->in_pkts);
1628
1629 if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
1541 cp->protocol == IPPROTO_SCTP) { 1630 cp->protocol == IPPROTO_SCTP) {
1542 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && 1631 if ((cp->state == IP_VS_SCTP_S_ESTABLISHED &&
1543 (pkts % sysctl_ip_vs_sync_threshold[1] 1632 (pkts % ipvs->sysctl_sync_threshold[1]
1544 == sysctl_ip_vs_sync_threshold[0])) || 1633 == ipvs->sysctl_sync_threshold[0])) ||
1545 (cp->old_state != cp->state && 1634 (cp->old_state != cp->state &&
1546 ((cp->state == IP_VS_SCTP_S_CLOSED) || 1635 ((cp->state == IP_VS_SCTP_S_CLOSED) ||
1547 (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) || 1636 (cp->state == IP_VS_SCTP_S_SHUT_ACK_CLI) ||
1548 (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) { 1637 (cp->state == IP_VS_SCTP_S_SHUT_ACK_SER)))) {
1549 ip_vs_sync_conn(cp); 1638 ip_vs_sync_conn(net, cp);
1550 goto out; 1639 goto out;
1551 } 1640 }
1552 } 1641 }
1553 1642
1554 /* Keep this block last: TCP and others with pp->num_states <= 1 */ 1643 /* Keep this block last: TCP and others with pp->num_states <= 1 */
1555 else if (af == AF_INET && 1644 else if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
1556 (ip_vs_sync_state & IP_VS_STATE_MASTER) &&
1557 (((cp->protocol != IPPROTO_TCP || 1645 (((cp->protocol != IPPROTO_TCP ||
1558 cp->state == IP_VS_TCP_S_ESTABLISHED) && 1646 cp->state == IP_VS_TCP_S_ESTABLISHED) &&
1559 (pkts % sysctl_ip_vs_sync_threshold[1] 1647 (pkts % ipvs->sysctl_sync_threshold[1]
1560 == sysctl_ip_vs_sync_threshold[0])) || 1648 == ipvs->sysctl_sync_threshold[0])) ||
1561 ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) && 1649 ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
1562 ((cp->state == IP_VS_TCP_S_FIN_WAIT) || 1650 ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
1563 (cp->state == IP_VS_TCP_S_CLOSE) || 1651 (cp->state == IP_VS_TCP_S_CLOSE) ||
1564 (cp->state == IP_VS_TCP_S_CLOSE_WAIT) || 1652 (cp->state == IP_VS_TCP_S_CLOSE_WAIT) ||
1565 (cp->state == IP_VS_TCP_S_TIME_WAIT))))) 1653 (cp->state == IP_VS_TCP_S_TIME_WAIT)))))
1566 ip_vs_sync_conn(cp); 1654 ip_vs_sync_conn(net, cp);
1567out: 1655out:
1568 cp->old_state = cp->state; 1656 cp->old_state = cp->state;
1569 1657
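
A simplified view of the reworked master-sync decision above, assuming (as with the old global array) that sysctl_sync_threshold[0] is the start offset and [1] the period. For one-packet scheduling the counter is pinned to the threshold so ip_vs_sync_conn() can do the filtering itself; the TCP/SCTP state checks are omitted here for brevity.

	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		pkts = ipvs->sysctl_sync_threshold[0];
	else
		pkts = atomic_add_return(1, &cp->in_pkts);

	if ((ipvs->sync_state & IP_VS_STATE_MASTER) &&
	    pkts % ipvs->sysctl_sync_threshold[1] == ipvs->sysctl_sync_threshold[0])
		ip_vs_sync_conn(net, cp);	/* sync state to the backup in this netns */
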
@@ -1782,7 +1870,41 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
1782 }, 1870 },
1783#endif 1871#endif
1784}; 1872};
1873/*
1874 * Initialize IP Virtual Server netns mem.
1875 */
1876static int __net_init __ip_vs_init(struct net *net)
1877{
1878 struct netns_ipvs *ipvs;
1879
1880 ipvs = net_generic(net, ip_vs_net_id);
1881 if (ipvs == NULL) {
1882 pr_err("%s(): no memory.\n", __func__);
1883 return -ENOMEM;
1884 }
1885 ipvs->net = net;
1886 /* Counters used for creating unique names */
1887 ipvs->gen = atomic_read(&ipvs_netns_cnt);
1888 atomic_inc(&ipvs_netns_cnt);
1889 net->ipvs = ipvs;
1890 printk(KERN_INFO "IPVS: Creating netns size=%lu id=%d\n",
1891 sizeof(struct netns_ipvs), ipvs->gen);
1892 return 0;
1893}
1894
1895static void __net_exit __ip_vs_cleanup(struct net *net)
1896{
1897 struct netns_ipvs *ipvs = net_ipvs(net);
1785 1898
1899 IP_VS_DBG(10, "ipvs netns %d released\n", ipvs->gen);
1900}
1901
1902static struct pernet_operations ipvs_core_ops = {
1903 .init = __ip_vs_init,
1904 .exit = __ip_vs_cleanup,
1905 .id = &ip_vs_net_id,
1906 .size = sizeof(struct netns_ipvs),
1907};
1786 1908
1787/* 1909/*
1788 * Initialize IP Virtual Server 1910 * Initialize IP Virtual Server
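
The pernet_operations block above follows the standard net_generic pattern: because .id and .size are set, the core allocates sizeof(struct netns_ipvs) per namespace, and net_generic(net, ip_vs_net_id) returns that area inside .init/.exit. A minimal sketch with hypothetical example_* names (the patch itself uses __ip_vs_init and ipvs_core_ops):

static int example_net_id __read_mostly;	/* assigned by register_pernet_subsys() */

static int __net_init example_init(struct net *net)
{
	struct netns_ipvs *ipvs = net_generic(net, example_net_id);

	ipvs->net = net;	/* back-pointer used later by per-netns timers and work */
	return 0;
}

static struct pernet_operations example_ops = {
	.init = example_init,
	.id   = &example_net_id,
	.size = sizeof(struct netns_ipvs),
};
/* registered once at module init: register_pernet_subsys(&example_ops); */
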
@@ -1791,8 +1913,11 @@ static int __init ip_vs_init(void)
1791{ 1913{
1792 int ret; 1914 int ret;
1793 1915
1794 ip_vs_estimator_init(); 1916 ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */
1917 if (ret < 0)
1918 return ret;
1795 1919
1920 ip_vs_estimator_init();
1796 ret = ip_vs_control_init(); 1921 ret = ip_vs_control_init();
1797 if (ret < 0) { 1922 if (ret < 0) {
1798 pr_err("can't setup control.\n"); 1923 pr_err("can't setup control.\n");
@@ -1813,15 +1938,23 @@ static int __init ip_vs_init(void)
1813 goto cleanup_app; 1938 goto cleanup_app;
1814 } 1939 }
1815 1940
1941 ret = ip_vs_sync_init();
1942 if (ret < 0) {
1943 pr_err("can't setup sync data.\n");
1944 goto cleanup_conn;
1945 }
1946
1816 ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); 1947 ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1817 if (ret < 0) { 1948 if (ret < 0) {
1818 pr_err("can't register hooks.\n"); 1949 pr_err("can't register hooks.\n");
1819 goto cleanup_conn; 1950 goto cleanup_sync;
1820 } 1951 }
1821 1952
1822 pr_info("ipvs loaded.\n"); 1953 pr_info("ipvs loaded.\n");
1823 return ret; 1954 return ret;
1824 1955
1956cleanup_sync:
1957 ip_vs_sync_cleanup();
1825 cleanup_conn: 1958 cleanup_conn:
1826 ip_vs_conn_cleanup(); 1959 ip_vs_conn_cleanup();
1827 cleanup_app: 1960 cleanup_app:
@@ -1831,17 +1964,20 @@ static int __init ip_vs_init(void)
1831 ip_vs_control_cleanup(); 1964 ip_vs_control_cleanup();
1832 cleanup_estimator: 1965 cleanup_estimator:
1833 ip_vs_estimator_cleanup(); 1966 ip_vs_estimator_cleanup();
1967 unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
1834 return ret; 1968 return ret;
1835} 1969}
1836 1970
1837static void __exit ip_vs_cleanup(void) 1971static void __exit ip_vs_cleanup(void)
1838{ 1972{
1839 nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); 1973 nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
1974 ip_vs_sync_cleanup();
1840 ip_vs_conn_cleanup(); 1975 ip_vs_conn_cleanup();
1841 ip_vs_app_cleanup(); 1976 ip_vs_app_cleanup();
1842 ip_vs_protocol_cleanup(); 1977 ip_vs_protocol_cleanup();
1843 ip_vs_control_cleanup(); 1978 ip_vs_control_cleanup();
1844 ip_vs_estimator_cleanup(); 1979 ip_vs_estimator_cleanup();
1980 unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */
1845 pr_info("ipvs unloaded.\n"); 1981 pr_info("ipvs unloaded.\n");
1846} 1982}
1847 1983
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 22f7ad5101ab..09ca2ce2f2b7 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -38,6 +38,7 @@
38#include <linux/mutex.h> 38#include <linux/mutex.h>
39 39
40#include <net/net_namespace.h> 40#include <net/net_namespace.h>
41#include <linux/nsproxy.h>
41#include <net/ip.h> 42#include <net/ip.h>
42#ifdef CONFIG_IP_VS_IPV6 43#ifdef CONFIG_IP_VS_IPV6
43#include <net/ipv6.h> 44#include <net/ipv6.h>
@@ -57,42 +58,7 @@ static DEFINE_MUTEX(__ip_vs_mutex);
57/* lock for service table */ 58/* lock for service table */
58static DEFINE_RWLOCK(__ip_vs_svc_lock); 59static DEFINE_RWLOCK(__ip_vs_svc_lock);
59 60
60/* lock for table with the real services */
61static DEFINE_RWLOCK(__ip_vs_rs_lock);
62
63/* lock for state and timeout tables */
64static DEFINE_SPINLOCK(ip_vs_securetcp_lock);
65
66/* lock for drop entry handling */
67static DEFINE_SPINLOCK(__ip_vs_dropentry_lock);
68
69/* lock for drop packet handling */
70static DEFINE_SPINLOCK(__ip_vs_droppacket_lock);
71
72/* 1/rate drop and drop-entry variables */
73int ip_vs_drop_rate = 0;
74int ip_vs_drop_counter = 0;
75static atomic_t ip_vs_dropentry = ATOMIC_INIT(0);
76
77/* number of virtual services */
78static int ip_vs_num_services = 0;
79
80/* sysctl variables */ 61/* sysctl variables */
81static int sysctl_ip_vs_drop_entry = 0;
82static int sysctl_ip_vs_drop_packet = 0;
83static int sysctl_ip_vs_secure_tcp = 0;
84static int sysctl_ip_vs_amemthresh = 1024;
85static int sysctl_ip_vs_am_droprate = 10;
86int sysctl_ip_vs_cache_bypass = 0;
87int sysctl_ip_vs_expire_nodest_conn = 0;
88int sysctl_ip_vs_expire_quiescent_template = 0;
89int sysctl_ip_vs_sync_threshold[2] = { 3, 50 };
90int sysctl_ip_vs_nat_icmp_send = 0;
91#ifdef CONFIG_IP_VS_NFCT
92int sysctl_ip_vs_conntrack;
93#endif
94int sysctl_ip_vs_snat_reroute = 1;
95
96 62
97#ifdef CONFIG_IP_VS_DEBUG 63#ifdef CONFIG_IP_VS_DEBUG
98static int sysctl_ip_vs_debug_level = 0; 64static int sysctl_ip_vs_debug_level = 0;
@@ -105,7 +71,8 @@ int ip_vs_get_debug_level(void)
105 71
106#ifdef CONFIG_IP_VS_IPV6 72#ifdef CONFIG_IP_VS_IPV6
107/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ 73/* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
108static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr) 74static int __ip_vs_addr_is_local_v6(struct net *net,
75 const struct in6_addr *addr)
109{ 76{
110 struct rt6_info *rt; 77 struct rt6_info *rt;
111 struct flowi fl = { 78 struct flowi fl = {
@@ -114,7 +81,7 @@ static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
114 .fl6_src = { .s6_addr32 = {0, 0, 0, 0} }, 81 .fl6_src = { .s6_addr32 = {0, 0, 0, 0} },
115 }; 82 };
116 83
117 rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); 84 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl);
118 if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK)) 85 if (rt && rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
119 return 1; 86 return 1;
120 87
@@ -125,7 +92,7 @@ static int __ip_vs_addr_is_local_v6(const struct in6_addr *addr)
125 * update_defense_level is called from keventd and from sysctl, 92 * update_defense_level is called from keventd and from sysctl,
126 * so it needs to protect itself from softirqs 93 * so it needs to protect itself from softirqs
127 */ 94 */
128static void update_defense_level(void) 95static void update_defense_level(struct netns_ipvs *ipvs)
129{ 96{
130 struct sysinfo i; 97 struct sysinfo i;
131 static int old_secure_tcp = 0; 98 static int old_secure_tcp = 0;
@@ -141,73 +108,73 @@ static void update_defense_level(void)
141 /* si_swapinfo(&i); */ 108 /* si_swapinfo(&i); */
142 /* availmem = availmem - (i.totalswap - i.freeswap); */ 109 /* availmem = availmem - (i.totalswap - i.freeswap); */
143 110
144 nomem = (availmem < sysctl_ip_vs_amemthresh); 111 nomem = (availmem < ipvs->sysctl_amemthresh);
145 112
146 local_bh_disable(); 113 local_bh_disable();
147 114
148 /* drop_entry */ 115 /* drop_entry */
149 spin_lock(&__ip_vs_dropentry_lock); 116 spin_lock(&ipvs->dropentry_lock);
150 switch (sysctl_ip_vs_drop_entry) { 117 switch (ipvs->sysctl_drop_entry) {
151 case 0: 118 case 0:
152 atomic_set(&ip_vs_dropentry, 0); 119 atomic_set(&ipvs->dropentry, 0);
153 break; 120 break;
154 case 1: 121 case 1:
155 if (nomem) { 122 if (nomem) {
156 atomic_set(&ip_vs_dropentry, 1); 123 atomic_set(&ipvs->dropentry, 1);
157 sysctl_ip_vs_drop_entry = 2; 124 ipvs->sysctl_drop_entry = 2;
158 } else { 125 } else {
159 atomic_set(&ip_vs_dropentry, 0); 126 atomic_set(&ipvs->dropentry, 0);
160 } 127 }
161 break; 128 break;
162 case 2: 129 case 2:
163 if (nomem) { 130 if (nomem) {
164 atomic_set(&ip_vs_dropentry, 1); 131 atomic_set(&ipvs->dropentry, 1);
165 } else { 132 } else {
166 atomic_set(&ip_vs_dropentry, 0); 133 atomic_set(&ipvs->dropentry, 0);
167 sysctl_ip_vs_drop_entry = 1; 134 ipvs->sysctl_drop_entry = 1;
168 }; 135 };
169 break; 136 break;
170 case 3: 137 case 3:
171 atomic_set(&ip_vs_dropentry, 1); 138 atomic_set(&ipvs->dropentry, 1);
172 break; 139 break;
173 } 140 }
174 spin_unlock(&__ip_vs_dropentry_lock); 141 spin_unlock(&ipvs->dropentry_lock);
175 142
176 /* drop_packet */ 143 /* drop_packet */
177 spin_lock(&__ip_vs_droppacket_lock); 144 spin_lock(&ipvs->droppacket_lock);
178 switch (sysctl_ip_vs_drop_packet) { 145 switch (ipvs->sysctl_drop_packet) {
179 case 0: 146 case 0:
180 ip_vs_drop_rate = 0; 147 ipvs->drop_rate = 0;
181 break; 148 break;
182 case 1: 149 case 1:
183 if (nomem) { 150 if (nomem) {
184 ip_vs_drop_rate = ip_vs_drop_counter 151 ipvs->drop_rate = ipvs->drop_counter
185 = sysctl_ip_vs_amemthresh / 152 = ipvs->sysctl_amemthresh /
186 (sysctl_ip_vs_amemthresh-availmem); 153 (ipvs->sysctl_amemthresh-availmem);
187 sysctl_ip_vs_drop_packet = 2; 154 ipvs->sysctl_drop_packet = 2;
188 } else { 155 } else {
189 ip_vs_drop_rate = 0; 156 ipvs->drop_rate = 0;
190 } 157 }
191 break; 158 break;
192 case 2: 159 case 2:
193 if (nomem) { 160 if (nomem) {
194 ip_vs_drop_rate = ip_vs_drop_counter 161 ipvs->drop_rate = ipvs->drop_counter
195 = sysctl_ip_vs_amemthresh / 162 = ipvs->sysctl_amemthresh /
196 (sysctl_ip_vs_amemthresh-availmem); 163 (ipvs->sysctl_amemthresh-availmem);
197 } else { 164 } else {
198 ip_vs_drop_rate = 0; 165 ipvs->drop_rate = 0;
199 sysctl_ip_vs_drop_packet = 1; 166 ipvs->sysctl_drop_packet = 1;
200 } 167 }
201 break; 168 break;
202 case 3: 169 case 3:
203 ip_vs_drop_rate = sysctl_ip_vs_am_droprate; 170 ipvs->drop_rate = ipvs->sysctl_am_droprate;
204 break; 171 break;
205 } 172 }
206 spin_unlock(&__ip_vs_droppacket_lock); 173 spin_unlock(&ipvs->droppacket_lock);
207 174
208 /* secure_tcp */ 175 /* secure_tcp */
209 spin_lock(&ip_vs_securetcp_lock); 176 spin_lock(&ipvs->securetcp_lock);
210 switch (sysctl_ip_vs_secure_tcp) { 177 switch (ipvs->sysctl_secure_tcp) {
211 case 0: 178 case 0:
212 if (old_secure_tcp >= 2) 179 if (old_secure_tcp >= 2)
213 to_change = 0; 180 to_change = 0;
@@ -216,7 +183,7 @@ static void update_defense_level(void)
216 if (nomem) { 183 if (nomem) {
217 if (old_secure_tcp < 2) 184 if (old_secure_tcp < 2)
218 to_change = 1; 185 to_change = 1;
219 sysctl_ip_vs_secure_tcp = 2; 186 ipvs->sysctl_secure_tcp = 2;
220 } else { 187 } else {
221 if (old_secure_tcp >= 2) 188 if (old_secure_tcp >= 2)
222 to_change = 0; 189 to_change = 0;
@@ -229,7 +196,7 @@ static void update_defense_level(void)
229 } else { 196 } else {
230 if (old_secure_tcp >= 2) 197 if (old_secure_tcp >= 2)
231 to_change = 0; 198 to_change = 0;
232 sysctl_ip_vs_secure_tcp = 1; 199 ipvs->sysctl_secure_tcp = 1;
233 } 200 }
234 break; 201 break;
235 case 3: 202 case 3:
@@ -237,10 +204,11 @@ static void update_defense_level(void)
237 to_change = 1; 204 to_change = 1;
238 break; 205 break;
239 } 206 }
240 old_secure_tcp = sysctl_ip_vs_secure_tcp; 207 old_secure_tcp = ipvs->sysctl_secure_tcp;
241 if (to_change >= 0) 208 if (to_change >= 0)
242 ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); 209 ip_vs_protocol_timeout_change(ipvs,
243 spin_unlock(&ip_vs_securetcp_lock); 210 ipvs->sysctl_secure_tcp > 1);
211 spin_unlock(&ipvs->securetcp_lock);
244 212
245 local_bh_enable(); 213 local_bh_enable();
246} 214}
@@ -250,16 +218,16 @@ static void update_defense_level(void)
250 * Timer for checking the defense 218 * Timer for checking the defense
251 */ 219 */
252#define DEFENSE_TIMER_PERIOD 1*HZ 220#define DEFENSE_TIMER_PERIOD 1*HZ
253static void defense_work_handler(struct work_struct *work);
254static DECLARE_DELAYED_WORK(defense_work, defense_work_handler);
255 221
256static void defense_work_handler(struct work_struct *work) 222static void defense_work_handler(struct work_struct *work)
257{ 223{
258 update_defense_level(); 224 struct netns_ipvs *ipvs =
259 if (atomic_read(&ip_vs_dropentry)) 225 container_of(work, struct netns_ipvs, defense_work.work);
260 ip_vs_random_dropentry();
261 226
262 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); 227 update_defense_level(ipvs);
228 if (atomic_read(&ipvs->dropentry))
229 ip_vs_random_dropentry(ipvs->net);
230 schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
263} 231}
264 232
265int 233int
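
Since the defense timer state now lives inside struct netns_ipvs, the handler above recovers its namespace with container_of() rather than touching file-scope data, and re-arms the work on that same per-netns instance. A sketch, assuming the delayed work is initialised during per-netns control setup (not shown in this hunk):

static void example_handler(struct work_struct *work)
{
	struct netns_ipvs *ipvs =
		container_of(work, struct netns_ipvs, defense_work.work);

	update_defense_level(ipvs);
	schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
}
/* presumably paired elsewhere with:
 *	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
 */
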
@@ -287,33 +255,13 @@ static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE];
287/* the service table hashed by fwmark */ 255/* the service table hashed by fwmark */
288static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; 256static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE];
289 257
290/*
291 * Hash table: for real service lookups
292 */
293#define IP_VS_RTAB_BITS 4
294#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS)
295#define IP_VS_RTAB_MASK (IP_VS_RTAB_SIZE - 1)
296
297static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE];
298
299/*
300 * Trash for destinations
301 */
302static LIST_HEAD(ip_vs_dest_trash);
303
304/*
305 * FTP & NULL virtual service counters
306 */
307static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0);
308static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0);
309
310 258
311/* 259/*
312 * Returns hash value for virtual service 260 * Returns hash value for virtual service
313 */ 261 */
314static __inline__ unsigned 262static inline unsigned
315ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr, 263ip_vs_svc_hashkey(struct net *net, int af, unsigned proto,
316 __be16 port) 264 const union nf_inet_addr *addr, __be16 port)
317{ 265{
318 register unsigned porth = ntohs(port); 266 register unsigned porth = ntohs(port);
319 __be32 addr_fold = addr->ip; 267 __be32 addr_fold = addr->ip;
@@ -323,6 +271,7 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
323 addr_fold = addr->ip6[0]^addr->ip6[1]^ 271 addr_fold = addr->ip6[0]^addr->ip6[1]^
324 addr->ip6[2]^addr->ip6[3]; 272 addr->ip6[2]^addr->ip6[3];
325#endif 273#endif
274 addr_fold ^= ((size_t)net>>8);
326 275
327 return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth) 276 return (proto^ntohl(addr_fold)^(porth>>IP_VS_SVC_TAB_BITS)^porth)
328 & IP_VS_SVC_TAB_MASK; 277 & IP_VS_SVC_TAB_MASK;
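
Both service hash keys now mix the owning namespace into the bucket index so that identical tuples from different namespaces spread across the shared tables; the >>8 presumably discards low pointer bits that are constant for slab-allocated structures. A condensed, illustrative copy of the fwmark variant that follows in the next hunk:

static inline unsigned example_fwm_hashkey(struct net *net, __u32 fwmark)
{
	return (((size_t)net >> 8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
}
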
@@ -331,13 +280,13 @@ ip_vs_svc_hashkey(int af, unsigned proto, const union nf_inet_addr *addr,
331/* 280/*
332 * Returns hash value of fwmark for virtual service lookup 281 * Returns hash value of fwmark for virtual service lookup
333 */ 282 */
334static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) 283static inline unsigned ip_vs_svc_fwm_hashkey(struct net *net, __u32 fwmark)
335{ 284{
336 return fwmark & IP_VS_SVC_TAB_MASK; 285 return (((size_t)net>>8) ^ fwmark) & IP_VS_SVC_TAB_MASK;
337} 286}
338 287
339/* 288/*
340 * Hashes a service in the ip_vs_svc_table by <proto,addr,port> 289 * Hashes a service in the ip_vs_svc_table by <netns,proto,addr,port>
341 * or in the ip_vs_svc_fwm_table by fwmark. 290 * or in the ip_vs_svc_fwm_table by fwmark.
342 * Should be called with locked tables. 291 * Should be called with locked tables.
343 */ 292 */
@@ -353,16 +302,16 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
353 302
354 if (svc->fwmark == 0) { 303 if (svc->fwmark == 0) {
355 /* 304 /*
356 * Hash it by <protocol,addr,port> in ip_vs_svc_table 305 * Hash it by <netns,protocol,addr,port> in ip_vs_svc_table
357 */ 306 */
358 hash = ip_vs_svc_hashkey(svc->af, svc->protocol, &svc->addr, 307 hash = ip_vs_svc_hashkey(svc->net, svc->af, svc->protocol,
359 svc->port); 308 &svc->addr, svc->port);
360 list_add(&svc->s_list, &ip_vs_svc_table[hash]); 309 list_add(&svc->s_list, &ip_vs_svc_table[hash]);
361 } else { 310 } else {
362 /* 311 /*
363 * Hash it by fwmark in ip_vs_svc_fwm_table 312 * Hash it by fwmark in svc_fwm_table
364 */ 313 */
365 hash = ip_vs_svc_fwm_hashkey(svc->fwmark); 314 hash = ip_vs_svc_fwm_hashkey(svc->net, svc->fwmark);
366 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); 315 list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]);
367 } 316 }
368 317
@@ -374,7 +323,7 @@ static int ip_vs_svc_hash(struct ip_vs_service *svc)
374 323
375 324
376/* 325/*
377 * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. 326 * Unhashes a service from svc_table / svc_fwm_table.
378 * Should be called with locked tables. 327 * Should be called with locked tables.
379 */ 328 */
380static int ip_vs_svc_unhash(struct ip_vs_service *svc) 329static int ip_vs_svc_unhash(struct ip_vs_service *svc)
@@ -386,10 +335,10 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
386 } 335 }
387 336
388 if (svc->fwmark == 0) { 337 if (svc->fwmark == 0) {
389 /* Remove it from the ip_vs_svc_table table */ 338 /* Remove it from the svc_table table */
390 list_del(&svc->s_list); 339 list_del(&svc->s_list);
391 } else { 340 } else {
392 /* Remove it from the ip_vs_svc_fwm_table table */ 341 /* Remove it from the svc_fwm_table table */
393 list_del(&svc->f_list); 342 list_del(&svc->f_list);
394 } 343 }
395 344
@@ -400,23 +349,24 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc)
400 349
401 350
402/* 351/*
403 * Get service by {proto,addr,port} in the service table. 352 * Get service by {netns, proto,addr,port} in the service table.
404 */ 353 */
405static inline struct ip_vs_service * 354static inline struct ip_vs_service *
406__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr, 355__ip_vs_service_find(struct net *net, int af, __u16 protocol,
407 __be16 vport) 356 const union nf_inet_addr *vaddr, __be16 vport)
408{ 357{
409 unsigned hash; 358 unsigned hash;
410 struct ip_vs_service *svc; 359 struct ip_vs_service *svc;
411 360
412 /* Check for "full" addressed entries */ 361 /* Check for "full" addressed entries */
413 hash = ip_vs_svc_hashkey(af, protocol, vaddr, vport); 362 hash = ip_vs_svc_hashkey(net, af, protocol, vaddr, vport);
414 363
415 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ 364 list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){
416 if ((svc->af == af) 365 if ((svc->af == af)
417 && ip_vs_addr_equal(af, &svc->addr, vaddr) 366 && ip_vs_addr_equal(af, &svc->addr, vaddr)
418 && (svc->port == vport) 367 && (svc->port == vport)
419 && (svc->protocol == protocol)) { 368 && (svc->protocol == protocol)
369 && net_eq(svc->net, net)) {
420 /* HIT */ 370 /* HIT */
421 return svc; 371 return svc;
422 } 372 }
@@ -430,16 +380,17 @@ __ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr,
430 * Get service by {fwmark} in the service table. 380 * Get service by {fwmark} in the service table.
431 */ 381 */
432static inline struct ip_vs_service * 382static inline struct ip_vs_service *
433__ip_vs_svc_fwm_find(int af, __u32 fwmark) 383__ip_vs_svc_fwm_find(struct net *net, int af, __u32 fwmark)
434{ 384{
435 unsigned hash; 385 unsigned hash;
436 struct ip_vs_service *svc; 386 struct ip_vs_service *svc;
437 387
438 /* Check for fwmark addressed entries */ 388 /* Check for fwmark addressed entries */
439 hash = ip_vs_svc_fwm_hashkey(fwmark); 389 hash = ip_vs_svc_fwm_hashkey(net, fwmark);
440 390
441 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { 391 list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) {
442 if (svc->fwmark == fwmark && svc->af == af) { 392 if (svc->fwmark == fwmark && svc->af == af
393 && net_eq(svc->net, net)) {
443 /* HIT */ 394 /* HIT */
444 return svc; 395 return svc;
445 } 396 }
@@ -449,42 +400,44 @@ __ip_vs_svc_fwm_find(int af, __u32 fwmark)
449} 400}
450 401
451struct ip_vs_service * 402struct ip_vs_service *
452ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, 403ip_vs_service_get(struct net *net, int af, __u32 fwmark, __u16 protocol,
453 const union nf_inet_addr *vaddr, __be16 vport) 404 const union nf_inet_addr *vaddr, __be16 vport)
454{ 405{
455 struct ip_vs_service *svc; 406 struct ip_vs_service *svc;
407 struct netns_ipvs *ipvs = net_ipvs(net);
456 408
457 read_lock(&__ip_vs_svc_lock); 409 read_lock(&__ip_vs_svc_lock);
458 410
459 /* 411 /*
460 * Check the table hashed by fwmark first 412 * Check the table hashed by fwmark first
461 */ 413 */
462 if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark))) 414 svc = __ip_vs_svc_fwm_find(net, af, fwmark);
415 if (fwmark && svc)
463 goto out; 416 goto out;
464 417
465 /* 418 /*
466 * Check the table hashed by <protocol,addr,port> 419 * Check the table hashed by <protocol,addr,port>
467 * for "full" addressed entries 420 * for "full" addressed entries
468 */ 421 */
469 svc = __ip_vs_service_find(af, protocol, vaddr, vport); 422 svc = __ip_vs_service_find(net, af, protocol, vaddr, vport);
470 423
471 if (svc == NULL 424 if (svc == NULL
472 && protocol == IPPROTO_TCP 425 && protocol == IPPROTO_TCP
473 && atomic_read(&ip_vs_ftpsvc_counter) 426 && atomic_read(&ipvs->ftpsvc_counter)
474 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { 427 && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) {
475 /* 428 /*
476 * Check if ftp service entry exists, the packet 429 * Check if ftp service entry exists, the packet
477 * might belong to FTP data connections. 430 * might belong to FTP data connections.
478 */ 431 */
479 svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT); 432 svc = __ip_vs_service_find(net, af, protocol, vaddr, FTPPORT);
480 } 433 }
481 434
482 if (svc == NULL 435 if (svc == NULL
483 && atomic_read(&ip_vs_nullsvc_counter)) { 436 && atomic_read(&ipvs->nullsvc_counter)) {
484 /* 437 /*
485 * Check if the catch-all port (port zero) exists 438 * Check if the catch-all port (port zero) exists
486 */ 439 */
487 svc = __ip_vs_service_find(af, protocol, vaddr, 0); 440 svc = __ip_vs_service_find(net, af, protocol, vaddr, 0);
488 } 441 }
489 442
490 out: 443 out:
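
Because the namespace is only folded into the hash, every bucket walk above still has to confirm ownership explicitly before declaring a hit. The essential shape, using only helpers that appear in this diff (a fragment of __ip_vs_service_find()):

	list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list) {
		if (svc->af == af &&
		    ip_vs_addr_equal(af, &svc->addr, vaddr) &&
		    svc->port == vport &&
		    svc->protocol == protocol &&
		    net_eq(svc->net, net))
			return svc;	/* HIT, and in the right namespace */
	}
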
@@ -519,6 +472,7 @@ __ip_vs_unbind_svc(struct ip_vs_dest *dest)
519 svc->fwmark, 472 svc->fwmark,
520 IP_VS_DBG_ADDR(svc->af, &svc->addr), 473 IP_VS_DBG_ADDR(svc->af, &svc->addr),
521 ntohs(svc->port), atomic_read(&svc->usecnt)); 474 ntohs(svc->port), atomic_read(&svc->usecnt));
475 free_percpu(svc->stats.cpustats);
522 kfree(svc); 476 kfree(svc);
523 } 477 }
524} 478}
@@ -545,10 +499,10 @@ static inline unsigned ip_vs_rs_hashkey(int af,
545} 499}
546 500
547/* 501/*
548 * Hashes ip_vs_dest in ip_vs_rtable by <proto,addr,port>. 502 * Hashes ip_vs_dest in rs_table by <proto,addr,port>.
549 * should be called with locked tables. 503 * should be called with locked tables.
550 */ 504 */
551static int ip_vs_rs_hash(struct ip_vs_dest *dest) 505static int ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
552{ 506{
553 unsigned hash; 507 unsigned hash;
554 508
@@ -562,19 +516,19 @@ static int ip_vs_rs_hash(struct ip_vs_dest *dest)
562 */ 516 */
563 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port); 517 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, dest->port);
564 518
565 list_add(&dest->d_list, &ip_vs_rtable[hash]); 519 list_add(&dest->d_list, &ipvs->rs_table[hash]);
566 520
567 return 1; 521 return 1;
568} 522}
569 523
570/* 524/*
571 * UNhashes ip_vs_dest from ip_vs_rtable. 525 * UNhashes ip_vs_dest from rs_table.
572 * should be called with locked tables. 526 * should be called with locked tables.
573 */ 527 */
574static int ip_vs_rs_unhash(struct ip_vs_dest *dest) 528static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
575{ 529{
576 /* 530 /*
577 * Remove it from the ip_vs_rtable table. 531 * Remove it from the rs_table table.
578 */ 532 */
579 if (!list_empty(&dest->d_list)) { 533 if (!list_empty(&dest->d_list)) {
580 list_del(&dest->d_list); 534 list_del(&dest->d_list);
@@ -588,10 +542,11 @@ static int ip_vs_rs_unhash(struct ip_vs_dest *dest)
588 * Lookup real service by <proto,addr,port> in the real service table. 542 * Lookup real service by <proto,addr,port> in the real service table.
589 */ 543 */
590struct ip_vs_dest * 544struct ip_vs_dest *
591ip_vs_lookup_real_service(int af, __u16 protocol, 545ip_vs_lookup_real_service(struct net *net, int af, __u16 protocol,
592 const union nf_inet_addr *daddr, 546 const union nf_inet_addr *daddr,
593 __be16 dport) 547 __be16 dport)
594{ 548{
549 struct netns_ipvs *ipvs = net_ipvs(net);
595 unsigned hash; 550 unsigned hash;
596 struct ip_vs_dest *dest; 551 struct ip_vs_dest *dest;
597 552
@@ -601,19 +556,19 @@ ip_vs_lookup_real_service(int af, __u16 protocol,
601 */ 556 */
602 hash = ip_vs_rs_hashkey(af, daddr, dport); 557 hash = ip_vs_rs_hashkey(af, daddr, dport);
603 558
604 read_lock(&__ip_vs_rs_lock); 559 read_lock(&ipvs->rs_lock);
605 list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { 560 list_for_each_entry(dest, &ipvs->rs_table[hash], d_list) {
606 if ((dest->af == af) 561 if ((dest->af == af)
607 && ip_vs_addr_equal(af, &dest->addr, daddr) 562 && ip_vs_addr_equal(af, &dest->addr, daddr)
608 && (dest->port == dport) 563 && (dest->port == dport)
609 && ((dest->protocol == protocol) || 564 && ((dest->protocol == protocol) ||
610 dest->vfwmark)) { 565 dest->vfwmark)) {
611 /* HIT */ 566 /* HIT */
612 read_unlock(&__ip_vs_rs_lock); 567 read_unlock(&ipvs->rs_lock);
613 return dest; 568 return dest;
614 } 569 }
615 } 570 }
616 read_unlock(&__ip_vs_rs_lock); 571 read_unlock(&ipvs->rs_lock);
617 572
618 return NULL; 573 return NULL;
619} 574}
@@ -652,15 +607,16 @@ ip_vs_lookup_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
652 * ip_vs_lookup_real_service() looked promising, but 607 * ip_vs_lookup_real_service() looked promising, but
653 * does not seem to work as expected. 608 * does not seem to work as expected.
654 */ 609 */
655struct ip_vs_dest *ip_vs_find_dest(int af, const union nf_inet_addr *daddr, 610struct ip_vs_dest *ip_vs_find_dest(struct net *net, int af,
611 const union nf_inet_addr *daddr,
656 __be16 dport, 612 __be16 dport,
657 const union nf_inet_addr *vaddr, 613 const union nf_inet_addr *vaddr,
658 __be16 vport, __u16 protocol) 614 __be16 vport, __u16 protocol, __u32 fwmark)
659{ 615{
660 struct ip_vs_dest *dest; 616 struct ip_vs_dest *dest;
661 struct ip_vs_service *svc; 617 struct ip_vs_service *svc;
662 618
663 svc = ip_vs_service_get(af, 0, protocol, vaddr, vport); 619 svc = ip_vs_service_get(net, af, fwmark, protocol, vaddr, vport);
664 if (!svc) 620 if (!svc)
665 return NULL; 621 return NULL;
666 dest = ip_vs_lookup_dest(svc, daddr, dport); 622 dest = ip_vs_lookup_dest(svc, daddr, dport);
@@ -685,11 +641,12 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
685 __be16 dport) 641 __be16 dport)
686{ 642{
687 struct ip_vs_dest *dest, *nxt; 643 struct ip_vs_dest *dest, *nxt;
644 struct netns_ipvs *ipvs = net_ipvs(svc->net);
688 645
689 /* 646 /*
690 * Find the destination in trash 647 * Find the destination in trash
691 */ 648 */
692 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { 649 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
693 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, " 650 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
694 "dest->refcnt=%d\n", 651 "dest->refcnt=%d\n",
695 dest->vfwmark, 652 dest->vfwmark,
@@ -720,6 +677,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
720 list_del(&dest->n_list); 677 list_del(&dest->n_list);
721 ip_vs_dst_reset(dest); 678 ip_vs_dst_reset(dest);
722 __ip_vs_unbind_svc(dest); 679 __ip_vs_unbind_svc(dest);
680 free_percpu(dest->stats.cpustats);
723 kfree(dest); 681 kfree(dest);
724 } 682 }
725 } 683 }
@@ -737,14 +695,16 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, const union nf_inet_addr *daddr,
737 * are expired, and the refcnt of each destination in the trash must 695 * are expired, and the refcnt of each destination in the trash must
738 * be 1, so we simply release them here. 696 * be 1, so we simply release them here.
739 */ 697 */
740static void ip_vs_trash_cleanup(void) 698static void ip_vs_trash_cleanup(struct net *net)
741{ 699{
742 struct ip_vs_dest *dest, *nxt; 700 struct ip_vs_dest *dest, *nxt;
701 struct netns_ipvs *ipvs = net_ipvs(net);
743 702
744 list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { 703 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, n_list) {
745 list_del(&dest->n_list); 704 list_del(&dest->n_list);
746 ip_vs_dst_reset(dest); 705 ip_vs_dst_reset(dest);
747 __ip_vs_unbind_svc(dest); 706 __ip_vs_unbind_svc(dest);
707 free_percpu(dest->stats.cpustats);
748 kfree(dest); 708 kfree(dest);
749 } 709 }
750} 710}
@@ -768,6 +728,7 @@ static void
768__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, 728__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
769 struct ip_vs_dest_user_kern *udest, int add) 729 struct ip_vs_dest_user_kern *udest, int add)
770{ 730{
731 struct netns_ipvs *ipvs = net_ipvs(svc->net);
771 int conn_flags; 732 int conn_flags;
772 733
773 /* set the weight and the flags */ 734 /* set the weight and the flags */
@@ -780,12 +741,12 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
780 conn_flags |= IP_VS_CONN_F_NOOUTPUT; 741 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
781 } else { 742 } else {
782 /* 743 /*
783 * Put the real service in ip_vs_rtable if not present. 744 * Put the real service in rs_table if not present.
784 * For now only for NAT! 745 * For now only for NAT!
785 */ 746 */
786 write_lock_bh(&__ip_vs_rs_lock); 747 write_lock_bh(&ipvs->rs_lock);
787 ip_vs_rs_hash(dest); 748 ip_vs_rs_hash(ipvs, dest);
788 write_unlock_bh(&__ip_vs_rs_lock); 749 write_unlock_bh(&ipvs->rs_lock);
789 } 750 }
790 atomic_set(&dest->conn_flags, conn_flags); 751 atomic_set(&dest->conn_flags, conn_flags);
791 752
@@ -813,7 +774,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
813 spin_unlock(&dest->dst_lock); 774 spin_unlock(&dest->dst_lock);
814 775
815 if (add) 776 if (add)
816 ip_vs_new_estimator(&dest->stats); 777 ip_vs_new_estimator(svc->net, &dest->stats);
817 778
818 write_lock_bh(&__ip_vs_svc_lock); 779 write_lock_bh(&__ip_vs_svc_lock);
819 780
@@ -850,12 +811,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
850 atype = ipv6_addr_type(&udest->addr.in6); 811 atype = ipv6_addr_type(&udest->addr.in6);
851 if ((!(atype & IPV6_ADDR_UNICAST) || 812 if ((!(atype & IPV6_ADDR_UNICAST) ||
852 atype & IPV6_ADDR_LINKLOCAL) && 813 atype & IPV6_ADDR_LINKLOCAL) &&
853 !__ip_vs_addr_is_local_v6(&udest->addr.in6)) 814 !__ip_vs_addr_is_local_v6(svc->net, &udest->addr.in6))
854 return -EINVAL; 815 return -EINVAL;
855 } else 816 } else
856#endif 817#endif
857 { 818 {
858 atype = inet_addr_type(&init_net, udest->addr.ip); 819 atype = inet_addr_type(svc->net, udest->addr.ip);
859 if (atype != RTN_LOCAL && atype != RTN_UNICAST) 820 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
860 return -EINVAL; 821 return -EINVAL;
861 } 822 }
@@ -865,6 +826,11 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
865 pr_err("%s(): no memory.\n", __func__); 826 pr_err("%s(): no memory.\n", __func__);
866 return -ENOMEM; 827 return -ENOMEM;
867 } 828 }
829 dest->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
830 if (!dest->stats.cpustats) {
831 pr_err("%s() alloc_percpu failed\n", __func__);
832 goto err_alloc;
833 }
868 834
869 dest->af = svc->af; 835 dest->af = svc->af;
870 dest->protocol = svc->protocol; 836 dest->protocol = svc->protocol;
@@ -888,6 +854,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest,
888 854
889 LeaveFunction(2); 855 LeaveFunction(2);
890 return 0; 856 return 0;
857
858err_alloc:
859 kfree(dest);
860 return -ENOMEM;
891} 861}
892 862
893 863
@@ -1006,16 +976,18 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1006/* 976/*
1007 * Delete a destination (must be already unlinked from the service) 977 * Delete a destination (must be already unlinked from the service)
1008 */ 978 */
1009static void __ip_vs_del_dest(struct ip_vs_dest *dest) 979static void __ip_vs_del_dest(struct net *net, struct ip_vs_dest *dest)
1010{ 980{
1011 ip_vs_kill_estimator(&dest->stats); 981 struct netns_ipvs *ipvs = net_ipvs(net);
982
983 ip_vs_kill_estimator(net, &dest->stats);
1012 984
1013 /* 985 /*
1014 * Remove it from the d-linked list with the real services. 986 * Remove it from the d-linked list with the real services.
1015 */ 987 */
1016 write_lock_bh(&__ip_vs_rs_lock); 988 write_lock_bh(&ipvs->rs_lock);
1017 ip_vs_rs_unhash(dest); 989 ip_vs_rs_unhash(dest);
1018 write_unlock_bh(&__ip_vs_rs_lock); 990 write_unlock_bh(&ipvs->rs_lock);
1019 991
1020 /* 992 /*
1021 * Decrease the refcnt of the dest, and free the dest 993 * Decrease the refcnt of the dest, and free the dest
@@ -1034,6 +1006,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
1034 and only one user context can update virtual service at a 1006 and only one user context can update virtual service at a
1035 time, so the operation here is OK */ 1007 time, so the operation here is OK */
1036 atomic_dec(&dest->svc->refcnt); 1008 atomic_dec(&dest->svc->refcnt);
1009 free_percpu(dest->stats.cpustats);
1037 kfree(dest); 1010 kfree(dest);
1038 } else { 1011 } else {
1039 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, " 1012 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, "
@@ -1041,7 +1014,7 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest)
1041 IP_VS_DBG_ADDR(dest->af, &dest->addr), 1014 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1042 ntohs(dest->port), 1015 ntohs(dest->port),
1043 atomic_read(&dest->refcnt)); 1016 atomic_read(&dest->refcnt));
1044 list_add(&dest->n_list, &ip_vs_dest_trash); 1017 list_add(&dest->n_list, &ipvs->dest_trash);
1045 atomic_inc(&dest->refcnt); 1018 atomic_inc(&dest->refcnt);
1046 } 1019 }
1047} 1020}
@@ -1105,7 +1078,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1105 /* 1078 /*
1106 * Delete the destination 1079 * Delete the destination
1107 */ 1080 */
1108 __ip_vs_del_dest(dest); 1081 __ip_vs_del_dest(svc->net, dest);
1109 1082
1110 LeaveFunction(2); 1083 LeaveFunction(2);
1111 1084
@@ -1117,13 +1090,14 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1117 * Add a service into the service hash table 1090 * Add a service into the service hash table
1118 */ 1091 */
1119static int 1092static int
1120ip_vs_add_service(struct ip_vs_service_user_kern *u, 1093ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u,
1121 struct ip_vs_service **svc_p) 1094 struct ip_vs_service **svc_p)
1122{ 1095{
1123 int ret = 0; 1096 int ret = 0;
1124 struct ip_vs_scheduler *sched = NULL; 1097 struct ip_vs_scheduler *sched = NULL;
1125 struct ip_vs_pe *pe = NULL; 1098 struct ip_vs_pe *pe = NULL;
1126 struct ip_vs_service *svc = NULL; 1099 struct ip_vs_service *svc = NULL;
1100 struct netns_ipvs *ipvs = net_ipvs(net);
1127 1101
1128 /* increase the module use count */ 1102 /* increase the module use count */
1129 ip_vs_use_count_inc(); 1103 ip_vs_use_count_inc();
@@ -1137,7 +1111,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1137 } 1111 }
1138 1112
1139 if (u->pe_name && *u->pe_name) { 1113 if (u->pe_name && *u->pe_name) {
1140 pe = ip_vs_pe_get(u->pe_name); 1114 pe = ip_vs_pe_getbyname(u->pe_name);
1141 if (pe == NULL) { 1115 if (pe == NULL) {
1142 pr_info("persistence engine module ip_vs_pe_%s " 1116 pr_info("persistence engine module ip_vs_pe_%s "
1143 "not found\n", u->pe_name); 1117 "not found\n", u->pe_name);
@@ -1159,6 +1133,11 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1159 ret = -ENOMEM; 1133 ret = -ENOMEM;
1160 goto out_err; 1134 goto out_err;
1161 } 1135 }
1136 svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1137 if (!svc->stats.cpustats) {
1138 pr_err("%s() alloc_percpu failed\n", __func__);
1139 goto out_err;
1140 }
1162 1141
1163 /* I'm the first user of the service */ 1142 /* I'm the first user of the service */
1164 atomic_set(&svc->usecnt, 0); 1143 atomic_set(&svc->usecnt, 0);
@@ -1172,6 +1151,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1172 svc->flags = u->flags; 1151 svc->flags = u->flags;
1173 svc->timeout = u->timeout * HZ; 1152 svc->timeout = u->timeout * HZ;
1174 svc->netmask = u->netmask; 1153 svc->netmask = u->netmask;
1154 svc->net = net;
1175 1155
1176 INIT_LIST_HEAD(&svc->destinations); 1156 INIT_LIST_HEAD(&svc->destinations);
1177 rwlock_init(&svc->sched_lock); 1157 rwlock_init(&svc->sched_lock);
@@ -1189,15 +1169,15 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1189 1169
1190 /* Update the virtual service counters */ 1170 /* Update the virtual service counters */
1191 if (svc->port == FTPPORT) 1171 if (svc->port == FTPPORT)
1192 atomic_inc(&ip_vs_ftpsvc_counter); 1172 atomic_inc(&ipvs->ftpsvc_counter);
1193 else if (svc->port == 0) 1173 else if (svc->port == 0)
1194 atomic_inc(&ip_vs_nullsvc_counter); 1174 atomic_inc(&ipvs->nullsvc_counter);
1195 1175
1196 ip_vs_new_estimator(&svc->stats); 1176 ip_vs_new_estimator(net, &svc->stats);
1197 1177
1198 /* Count only IPv4 services for old get/setsockopt interface */ 1178 /* Count only IPv4 services for old get/setsockopt interface */
1199 if (svc->af == AF_INET) 1179 if (svc->af == AF_INET)
1200 ip_vs_num_services++; 1180 ipvs->num_services++;
1201 1181
1202 /* Hash the service into the service table */ 1182 /* Hash the service into the service table */
1203 write_lock_bh(&__ip_vs_svc_lock); 1183 write_lock_bh(&__ip_vs_svc_lock);
@@ -1207,6 +1187,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1207 *svc_p = svc; 1187 *svc_p = svc;
1208 return 0; 1188 return 0;
1209 1189
1190
1210 out_err: 1191 out_err:
1211 if (svc != NULL) { 1192 if (svc != NULL) {
1212 ip_vs_unbind_scheduler(svc); 1193 ip_vs_unbind_scheduler(svc);
@@ -1215,6 +1196,8 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u,
1215 ip_vs_app_inc_put(svc->inc); 1196 ip_vs_app_inc_put(svc->inc);
1216 local_bh_enable(); 1197 local_bh_enable();
1217 } 1198 }
1199 if (svc->stats.cpustats)
1200 free_percpu(svc->stats.cpustats);
1218 kfree(svc); 1201 kfree(svc);
1219 } 1202 }
1220 ip_vs_scheduler_put(sched); 1203 ip_vs_scheduler_put(sched);
@@ -1248,7 +1231,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1248 old_sched = sched; 1231 old_sched = sched;
1249 1232
1250 if (u->pe_name && *u->pe_name) { 1233 if (u->pe_name && *u->pe_name) {
1251 pe = ip_vs_pe_get(u->pe_name); 1234 pe = ip_vs_pe_getbyname(u->pe_name);
1252 if (pe == NULL) { 1235 if (pe == NULL) {
1253 pr_info("persistence engine module ip_vs_pe_%s " 1236 pr_info("persistence engine module ip_vs_pe_%s "
1254 "not found\n", u->pe_name); 1237 "not found\n", u->pe_name);
@@ -1334,14 +1317,15 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
1334 struct ip_vs_dest *dest, *nxt; 1317 struct ip_vs_dest *dest, *nxt;
1335 struct ip_vs_scheduler *old_sched; 1318 struct ip_vs_scheduler *old_sched;
1336 struct ip_vs_pe *old_pe; 1319 struct ip_vs_pe *old_pe;
1320 struct netns_ipvs *ipvs = net_ipvs(svc->net);
1337 1321
1338 pr_info("%s: enter\n", __func__); 1322 pr_info("%s: enter\n", __func__);
1339 1323
1340 /* Count only IPv4 services for old get/setsockopt interface */ 1324 /* Count only IPv4 services for old get/setsockopt interface */
1341 if (svc->af == AF_INET) 1325 if (svc->af == AF_INET)
1342 ip_vs_num_services--; 1326 ipvs->num_services--;
1343 1327
1344 ip_vs_kill_estimator(&svc->stats); 1328 ip_vs_kill_estimator(svc->net, &svc->stats);
1345 1329
1346 /* Unbind scheduler */ 1330 /* Unbind scheduler */
1347 old_sched = svc->scheduler; 1331 old_sched = svc->scheduler;
@@ -1364,16 +1348,16 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
1364 */ 1348 */
1365 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { 1349 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1366 __ip_vs_unlink_dest(svc, dest, 0); 1350 __ip_vs_unlink_dest(svc, dest, 0);
1367 __ip_vs_del_dest(dest); 1351 __ip_vs_del_dest(svc->net, dest);
1368 } 1352 }
1369 1353
1370 /* 1354 /*
1371 * Update the virtual service counters 1355 * Update the virtual service counters
1372 */ 1356 */
1373 if (svc->port == FTPPORT) 1357 if (svc->port == FTPPORT)
1374 atomic_dec(&ip_vs_ftpsvc_counter); 1358 atomic_dec(&ipvs->ftpsvc_counter);
1375 else if (svc->port == 0) 1359 else if (svc->port == 0)
1376 atomic_dec(&ip_vs_nullsvc_counter); 1360 atomic_dec(&ipvs->nullsvc_counter);
1377 1361
1378 /* 1362 /*
1379 * Free the service if nobody refers to it 1363 * Free the service if nobody refers to it
@@ -1383,6 +1367,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc)
1383 svc->fwmark, 1367 svc->fwmark,
1384 IP_VS_DBG_ADDR(svc->af, &svc->addr), 1368 IP_VS_DBG_ADDR(svc->af, &svc->addr),
1385 ntohs(svc->port), atomic_read(&svc->usecnt)); 1369 ntohs(svc->port), atomic_read(&svc->usecnt));
1370 free_percpu(svc->stats.cpustats);
1386 kfree(svc); 1371 kfree(svc);
1387 } 1372 }
1388 1373
@@ -1428,17 +1413,19 @@ static int ip_vs_del_service(struct ip_vs_service *svc)
1428/* 1413/*
1429 * Flush all the virtual services 1414 * Flush all the virtual services
1430 */ 1415 */
1431static int ip_vs_flush(void) 1416static int ip_vs_flush(struct net *net)
1432{ 1417{
1433 int idx; 1418 int idx;
1434 struct ip_vs_service *svc, *nxt; 1419 struct ip_vs_service *svc, *nxt;
1435 1420
1436 /* 1421 /*
1437 * Flush the service table hashed by <protocol,addr,port> 1422 * Flush the service table hashed by <netns,protocol,addr,port>
1438 */ 1423 */
1439 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1424 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1440 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { 1425 list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx],
1441 ip_vs_unlink_service(svc); 1426 s_list) {
1427 if (net_eq(svc->net, net))
1428 ip_vs_unlink_service(svc);
1442 } 1429 }
1443 } 1430 }
1444 1431
@@ -1448,7 +1435,8 @@ static int ip_vs_flush(void)
1448 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1435 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1449 list_for_each_entry_safe(svc, nxt, 1436 list_for_each_entry_safe(svc, nxt,
1450 &ip_vs_svc_fwm_table[idx], f_list) { 1437 &ip_vs_svc_fwm_table[idx], f_list) {
1451 ip_vs_unlink_service(svc); 1438 if (net_eq(svc->net, net))
1439 ip_vs_unlink_service(svc);
1452 } 1440 }
1453 } 1441 }
1454 1442
@@ -1472,24 +1460,26 @@ static int ip_vs_zero_service(struct ip_vs_service *svc)
1472 return 0; 1460 return 0;
1473} 1461}
1474 1462
1475static int ip_vs_zero_all(void) 1463static int ip_vs_zero_all(struct net *net)
1476{ 1464{
1477 int idx; 1465 int idx;
1478 struct ip_vs_service *svc; 1466 struct ip_vs_service *svc;
1479 1467
1480 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1468 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1481 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 1469 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1482 ip_vs_zero_service(svc); 1470 if (net_eq(svc->net, net))
1471 ip_vs_zero_service(svc);
1483 } 1472 }
1484 } 1473 }
1485 1474
1486 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1475 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1487 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 1476 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1488 ip_vs_zero_service(svc); 1477 if (net_eq(svc->net, net))
1478 ip_vs_zero_service(svc);
1489 } 1479 }
1490 } 1480 }
1491 1481
1492 ip_vs_zero_stats(&ip_vs_stats); 1482 ip_vs_zero_stats(net_ipvs(net)->tot_stats);
1493 return 0; 1483 return 0;
1494} 1484}
1495 1485
@@ -1498,6 +1488,7 @@ static int
1498proc_do_defense_mode(ctl_table *table, int write, 1488proc_do_defense_mode(ctl_table *table, int write,
1499 void __user *buffer, size_t *lenp, loff_t *ppos) 1489 void __user *buffer, size_t *lenp, loff_t *ppos)
1500{ 1490{
1491 struct net *net = current->nsproxy->net_ns;
1501 int *valp = table->data; 1492 int *valp = table->data;
1502 int val = *valp; 1493 int val = *valp;
1503 int rc; 1494 int rc;
@@ -1508,7 +1499,7 @@ proc_do_defense_mode(ctl_table *table, int write,
1508 /* Restore the correct value */ 1499 /* Restore the correct value */
1509 *valp = val; 1500 *valp = val;
1510 } else { 1501 } else {
1511 update_defense_level(); 1502 update_defense_level(net_ipvs(net));
1512 } 1503 }
1513 } 1504 }
1514 return rc; 1505 return rc;
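proc_do_defense_mode() used to poke file-scope globals; once the defense state is per namespace, the handler has to know whose state to update. It resolves the namespace from the writing task (current->nsproxy->net_ns), which is safe because sysctl handlers only run in process context. A hedged skeleton of the pattern; the 0..3 clamp is illustrative and net_ipvs()/update_defense_level() are the IPVS helpers used above:

#include <linux/sysctl.h>
#include <linux/nsproxy.h>
#include <net/net_namespace.h>

/* Sketch: a netns-aware sysctl handler.  Validate the new value and,
 * only if it changed and is in range, update the caller's per-net state.
 */
static int my_do_defense_mode(struct ctl_table *table, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = current->nsproxy->net_ns;
	int *valp = table->data;
	int val = *valp;
	int rc;

	rc = proc_dointvec(table, write, buffer, lenp, ppos);
	if (write && (*valp != val)) {
		if ((*valp < 0) || (*valp > 3))
			*valp = val;	/* out of range: restore old value */
		else
			update_defense_level(net_ipvs(net));
	}
	return rc;
}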
@@ -1534,45 +1525,54 @@ proc_do_sync_threshold(ctl_table *table, int write,
1534 return rc; 1525 return rc;
1535} 1526}
1536 1527
1528static int
1529proc_do_sync_mode(ctl_table *table, int write,
1530 void __user *buffer, size_t *lenp, loff_t *ppos)
1531{
1532 int *valp = table->data;
1533 int val = *valp;
1534 int rc;
1535
1536 rc = proc_dointvec(table, write, buffer, lenp, ppos);
1537 if (write && (*valp != val)) {
1538 if ((*valp < 0) || (*valp > 1)) {
1539 /* Restore the correct value */
1540 *valp = val;
1541 } else {
1542 struct net *net = current->nsproxy->net_ns;
1543 ip_vs_sync_switch_mode(net, val);
1544 }
1545 }
1546 return rc;
1547}
1537 1548
1538/* 1549/*
1539 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) 1550 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
1551 * Do not change order or insert new entries without
1552 * align with netns init in __ip_vs_control_init()
1540 */ 1553 */
1541 1554
1542static struct ctl_table vs_vars[] = { 1555static struct ctl_table vs_vars[] = {
1543 { 1556 {
1544 .procname = "amemthresh", 1557 .procname = "amemthresh",
1545 .data = &sysctl_ip_vs_amemthresh,
1546 .maxlen = sizeof(int), 1558 .maxlen = sizeof(int),
1547 .mode = 0644, 1559 .mode = 0644,
1548 .proc_handler = proc_dointvec, 1560 .proc_handler = proc_dointvec,
1549 }, 1561 },
1550#ifdef CONFIG_IP_VS_DEBUG
1551 {
1552 .procname = "debug_level",
1553 .data = &sysctl_ip_vs_debug_level,
1554 .maxlen = sizeof(int),
1555 .mode = 0644,
1556 .proc_handler = proc_dointvec,
1557 },
1558#endif
1559 { 1562 {
1560 .procname = "am_droprate", 1563 .procname = "am_droprate",
1561 .data = &sysctl_ip_vs_am_droprate,
1562 .maxlen = sizeof(int), 1564 .maxlen = sizeof(int),
1563 .mode = 0644, 1565 .mode = 0644,
1564 .proc_handler = proc_dointvec, 1566 .proc_handler = proc_dointvec,
1565 }, 1567 },
1566 { 1568 {
1567 .procname = "drop_entry", 1569 .procname = "drop_entry",
1568 .data = &sysctl_ip_vs_drop_entry,
1569 .maxlen = sizeof(int), 1570 .maxlen = sizeof(int),
1570 .mode = 0644, 1571 .mode = 0644,
1571 .proc_handler = proc_do_defense_mode, 1572 .proc_handler = proc_do_defense_mode,
1572 }, 1573 },
1573 { 1574 {
1574 .procname = "drop_packet", 1575 .procname = "drop_packet",
1575 .data = &sysctl_ip_vs_drop_packet,
1576 .maxlen = sizeof(int), 1576 .maxlen = sizeof(int),
1577 .mode = 0644, 1577 .mode = 0644,
1578 .proc_handler = proc_do_defense_mode, 1578 .proc_handler = proc_do_defense_mode,
@@ -1580,7 +1580,6 @@ static struct ctl_table vs_vars[] = {
1580#ifdef CONFIG_IP_VS_NFCT 1580#ifdef CONFIG_IP_VS_NFCT
1581 { 1581 {
1582 .procname = "conntrack", 1582 .procname = "conntrack",
1583 .data = &sysctl_ip_vs_conntrack,
1584 .maxlen = sizeof(int), 1583 .maxlen = sizeof(int),
1585 .mode = 0644, 1584 .mode = 0644,
1586 .proc_handler = &proc_dointvec, 1585 .proc_handler = &proc_dointvec,
@@ -1588,18 +1587,62 @@ static struct ctl_table vs_vars[] = {
1588#endif 1587#endif
1589 { 1588 {
1590 .procname = "secure_tcp", 1589 .procname = "secure_tcp",
1591 .data = &sysctl_ip_vs_secure_tcp,
1592 .maxlen = sizeof(int), 1590 .maxlen = sizeof(int),
1593 .mode = 0644, 1591 .mode = 0644,
1594 .proc_handler = proc_do_defense_mode, 1592 .proc_handler = proc_do_defense_mode,
1595 }, 1593 },
1596 { 1594 {
1597 .procname = "snat_reroute", 1595 .procname = "snat_reroute",
1598 .data = &sysctl_ip_vs_snat_reroute,
1599 .maxlen = sizeof(int), 1596 .maxlen = sizeof(int),
1600 .mode = 0644, 1597 .mode = 0644,
1601 .proc_handler = &proc_dointvec, 1598 .proc_handler = &proc_dointvec,
1602 }, 1599 },
1600 {
1601 .procname = "sync_version",
1602 .maxlen = sizeof(int),
1603 .mode = 0644,
1604 .proc_handler = &proc_do_sync_mode,
1605 },
1606 {
1607 .procname = "cache_bypass",
1608 .maxlen = sizeof(int),
1609 .mode = 0644,
1610 .proc_handler = proc_dointvec,
1611 },
1612 {
1613 .procname = "expire_nodest_conn",
1614 .maxlen = sizeof(int),
1615 .mode = 0644,
1616 .proc_handler = proc_dointvec,
1617 },
1618 {
1619 .procname = "expire_quiescent_template",
1620 .maxlen = sizeof(int),
1621 .mode = 0644,
1622 .proc_handler = proc_dointvec,
1623 },
1624 {
1625 .procname = "sync_threshold",
1626 .maxlen =
1627 sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
1628 .mode = 0644,
1629 .proc_handler = proc_do_sync_threshold,
1630 },
1631 {
1632 .procname = "nat_icmp_send",
1633 .maxlen = sizeof(int),
1634 .mode = 0644,
1635 .proc_handler = proc_dointvec,
1636 },
1637#ifdef CONFIG_IP_VS_DEBUG
1638 {
1639 .procname = "debug_level",
1640 .data = &sysctl_ip_vs_debug_level,
1641 .maxlen = sizeof(int),
1642 .mode = 0644,
1643 .proc_handler = proc_dointvec,
1644 },
1645#endif
1603#if 0 1646#if 0
1604 { 1647 {
1605 .procname = "timeout_established", 1648 .procname = "timeout_established",
@@ -1686,41 +1729,6 @@ static struct ctl_table vs_vars[] = {
1686 .proc_handler = proc_dointvec_jiffies, 1729 .proc_handler = proc_dointvec_jiffies,
1687 }, 1730 },
1688#endif 1731#endif
1689 {
1690 .procname = "cache_bypass",
1691 .data = &sysctl_ip_vs_cache_bypass,
1692 .maxlen = sizeof(int),
1693 .mode = 0644,
1694 .proc_handler = proc_dointvec,
1695 },
1696 {
1697 .procname = "expire_nodest_conn",
1698 .data = &sysctl_ip_vs_expire_nodest_conn,
1699 .maxlen = sizeof(int),
1700 .mode = 0644,
1701 .proc_handler = proc_dointvec,
1702 },
1703 {
1704 .procname = "expire_quiescent_template",
1705 .data = &sysctl_ip_vs_expire_quiescent_template,
1706 .maxlen = sizeof(int),
1707 .mode = 0644,
1708 .proc_handler = proc_dointvec,
1709 },
1710 {
1711 .procname = "sync_threshold",
1712 .data = &sysctl_ip_vs_sync_threshold,
1713 .maxlen = sizeof(sysctl_ip_vs_sync_threshold),
1714 .mode = 0644,
1715 .proc_handler = proc_do_sync_threshold,
1716 },
1717 {
1718 .procname = "nat_icmp_send",
1719 .data = &sysctl_ip_vs_nat_icmp_send,
1720 .maxlen = sizeof(int),
1721 .mode = 0644,
1722 .proc_handler = proc_dointvec,
1723 },
1724 { } 1732 { }
1725}; 1733};
1726 1734
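Note that the entries above no longer carry .data pointers: vs_vars becomes a template, and the "do not change order" comment added at its top exists because __ip_vs_control_init(), later in this diff, duplicates the template for each namespace and fills in .data positionally. A hedged sketch of that idiom (my_vars / my_net_data are illustrative; net_vs_ctl_path is the exported path from this file):

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/sysctl.h>
#include <net/net_namespace.h>

struct my_net_data {
	int amemthresh;
	int am_droprate;
	struct ctl_table *tbl;
	struct ctl_table_header *hdr;
};

/* Template: no .data, order must match the assignments below. */
static struct ctl_table my_vars[] = {
	{ .procname = "amemthresh",  .maxlen = sizeof(int), .mode = 0644,
	  .proc_handler = proc_dointvec, },
	{ .procname = "am_droprate", .maxlen = sizeof(int), .mode = 0644,
	  .proc_handler = proc_dointvec, },
	{ }
};

static int my_sysctl_register(struct net *net, struct my_net_data *d)
{
	struct ctl_table *tbl;
	int idx = 0;

	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(my_vars, sizeof(my_vars), GFP_KERNEL);
		if (!tbl)
			return -ENOMEM;
	} else
		tbl = my_vars;

	tbl[idx++].data = &d->amemthresh;	/* same order as my_vars */
	tbl[idx++].data = &d->am_droprate;

	d->hdr = register_net_sysctl_table(net, net_vs_ctl_path, tbl);
	if (d->hdr == NULL) {
		if (!net_eq(net, &init_net))
			kfree(tbl);
		return -ENOMEM;
	}
	d->tbl = tbl;
	return 0;
}

Whichever table is passed to register_net_sysctl_table() is the one whose .data pointers the handlers dereference, so the sketch registers the duplicated tbl rather than the shared template.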
@@ -1732,11 +1740,10 @@ const struct ctl_path net_vs_ctl_path[] = {
1732}; 1740};
1733EXPORT_SYMBOL_GPL(net_vs_ctl_path); 1741EXPORT_SYMBOL_GPL(net_vs_ctl_path);
1734 1742
1735static struct ctl_table_header * sysctl_header;
1736
1737#ifdef CONFIG_PROC_FS 1743#ifdef CONFIG_PROC_FS
1738 1744
1739struct ip_vs_iter { 1745struct ip_vs_iter {
1746 struct seq_net_private p; /* Do not move this, netns depends upon it*/
1740 struct list_head *table; 1747 struct list_head *table;
1741 int bucket; 1748 int bucket;
1742}; 1749};
@@ -1763,6 +1770,7 @@ static inline const char *ip_vs_fwd_name(unsigned flags)
1763/* Get the Nth entry in the two lists */ 1770/* Get the Nth entry in the two lists */
1764static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) 1771static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1765{ 1772{
1773 struct net *net = seq_file_net(seq);
1766 struct ip_vs_iter *iter = seq->private; 1774 struct ip_vs_iter *iter = seq->private;
1767 int idx; 1775 int idx;
1768 struct ip_vs_service *svc; 1776 struct ip_vs_service *svc;
@@ -1770,7 +1778,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1770 /* look in hash by protocol */ 1778 /* look in hash by protocol */
1771 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1779 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1772 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 1780 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
1773 if (pos-- == 0){ 1781 if (net_eq(svc->net, net) && pos-- == 0) {
1774 iter->table = ip_vs_svc_table; 1782 iter->table = ip_vs_svc_table;
1775 iter->bucket = idx; 1783 iter->bucket = idx;
1776 return svc; 1784 return svc;
@@ -1781,7 +1789,7 @@ static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
1781 /* keep looking in fwmark */ 1789 /* keep looking in fwmark */
1782 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 1790 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
1783 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 1791 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
1784 if (pos-- == 0) { 1792 if (net_eq(svc->net, net) && pos-- == 0) {
1785 iter->table = ip_vs_svc_fwm_table; 1793 iter->table = ip_vs_svc_fwm_table;
1786 iter->bucket = idx; 1794 iter->bucket = idx;
1787 return svc; 1795 return svc;
@@ -1935,7 +1943,7 @@ static const struct seq_operations ip_vs_info_seq_ops = {
1935 1943
1936static int ip_vs_info_open(struct inode *inode, struct file *file) 1944static int ip_vs_info_open(struct inode *inode, struct file *file)
1937{ 1945{
1938 return seq_open_private(file, &ip_vs_info_seq_ops, 1946 return seq_open_net(inode, file, &ip_vs_info_seq_ops,
1939 sizeof(struct ip_vs_iter)); 1947 sizeof(struct ip_vs_iter));
1940} 1948}
1941 1949
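Switching from seq_open_private() to seq_open_net() is what makes the seq_file_net() calls in the iterator above work, and it is why ip_vs_iter gained a struct seq_net_private as its first member with the "do not move" comment. A minimal sketch of the pairing, with hypothetical names:

#include <linux/seq_file.h>
#include <linux/seq_file_net.h>
#include <net/net_namespace.h>

/* Sketch: a netns-aware seq_file.  seq_net_private must stay first so
 * seq_file_net() can recover the namespace behind seq->private.
 */
struct my_iter {
	struct seq_net_private p;	/* must stay first */
	int bucket;
};

static void *my_start(struct seq_file *seq, loff_t *pos)
{
	struct net *net = seq_file_net(seq);	/* works because of 'p' */

	return *pos ? NULL : net;		/* single dummy record */
}

static void *my_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void my_stop(struct seq_file *seq, void *v)
{
}

static int my_show(struct seq_file *seq, void *v)
{
	seq_puts(seq, "entries owned by this netns would go here\n");
	return 0;
}

static const struct seq_operations my_ops = {
	.start = my_start,
	.next  = my_next,
	.stop  = my_stop,
	.show  = my_show,
};

static int my_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &my_ops, sizeof(struct my_iter));
}

/* The matching file_operations .release should then be seq_release_net. */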
@@ -1949,13 +1957,11 @@ static const struct file_operations ip_vs_info_fops = {
1949 1957
1950#endif 1958#endif
1951 1959
1952struct ip_vs_stats ip_vs_stats = {
1953 .lock = __SPIN_LOCK_UNLOCKED(ip_vs_stats.lock),
1954};
1955
1956#ifdef CONFIG_PROC_FS 1960#ifdef CONFIG_PROC_FS
1957static int ip_vs_stats_show(struct seq_file *seq, void *v) 1961static int ip_vs_stats_show(struct seq_file *seq, void *v)
1958{ 1962{
1963 struct net *net = seq_file_single_net(seq);
1964 struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats;
1959 1965
1960/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 1966/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1961 seq_puts(seq, 1967 seq_puts(seq,
@@ -1963,29 +1969,29 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v)
1963 seq_printf(seq, 1969 seq_printf(seq,
1964 " Conns Packets Packets Bytes Bytes\n"); 1970 " Conns Packets Packets Bytes Bytes\n");
1965 1971
1966 spin_lock_bh(&ip_vs_stats.lock); 1972 spin_lock_bh(&tot_stats->lock);
1967 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.ustats.conns, 1973 seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", tot_stats->ustats.conns,
1968 ip_vs_stats.ustats.inpkts, ip_vs_stats.ustats.outpkts, 1974 tot_stats->ustats.inpkts, tot_stats->ustats.outpkts,
1969 (unsigned long long) ip_vs_stats.ustats.inbytes, 1975 (unsigned long long) tot_stats->ustats.inbytes,
1970 (unsigned long long) ip_vs_stats.ustats.outbytes); 1976 (unsigned long long) tot_stats->ustats.outbytes);
1971 1977
1972/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ 1978/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
1973 seq_puts(seq, 1979 seq_puts(seq,
1974 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); 1980 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
1975 seq_printf(seq,"%8X %8X %8X %16X %16X\n", 1981 seq_printf(seq,"%8X %8X %8X %16X %16X\n",
1976 ip_vs_stats.ustats.cps, 1982 tot_stats->ustats.cps,
1977 ip_vs_stats.ustats.inpps, 1983 tot_stats->ustats.inpps,
1978 ip_vs_stats.ustats.outpps, 1984 tot_stats->ustats.outpps,
1979 ip_vs_stats.ustats.inbps, 1985 tot_stats->ustats.inbps,
1980 ip_vs_stats.ustats.outbps); 1986 tot_stats->ustats.outbps);
1981 spin_unlock_bh(&ip_vs_stats.lock); 1987 spin_unlock_bh(&tot_stats->lock);
1982 1988
1983 return 0; 1989 return 0;
1984} 1990}
1985 1991
1986static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) 1992static int ip_vs_stats_seq_open(struct inode *inode, struct file *file)
1987{ 1993{
1988 return single_open(file, ip_vs_stats_show, NULL); 1994 return single_open_net(inode, file, ip_vs_stats_show);
1989} 1995}
1990 1996
1991static const struct file_operations ip_vs_stats_fops = { 1997static const struct file_operations ip_vs_stats_fops = {
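For single-record files the analogous helper is single_open_net(), which the stats file switches to above; the show routine then recovers the namespace (this series adds a seq_file_single_net() helper for that) and reads the per-net tot_stats under its own lock instead of the old global ip_vs_stats. A hedged sketch; my_totals/my_totals_of() are illustrative:

/* Sketch: a netns-aware single_open proc file. */
static int my_stats_show(struct seq_file *seq, void *v)
{
	/* single_open_net() hands the namespace to the show routine via
	 * seq->private; IPVS wraps the cast in seq_file_single_net(). */
	struct net *net = seq_file_single_net(seq);
	struct my_totals *t = my_totals_of(net);	/* illustrative lookup */

	spin_lock_bh(&t->lock);
	seq_printf(seq, "%8X %8X\n", t->conns, t->inpkts);
	spin_unlock_bh(&t->lock);
	return 0;
}

static int my_stats_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, my_stats_show);
}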
@@ -1996,13 +2002,68 @@ static const struct file_operations ip_vs_stats_fops = {
1996 .release = single_release, 2002 .release = single_release,
1997}; 2003};
1998 2004
2005static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2006{
2007 struct net *net = seq_file_single_net(seq);
2008 struct ip_vs_stats *tot_stats = net_ipvs(net)->tot_stats;
2009 int i;
2010
2011/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2012 seq_puts(seq,
2013 " Total Incoming Outgoing Incoming Outgoing\n");
2014 seq_printf(seq,
2015 "CPU Conns Packets Packets Bytes Bytes\n");
2016
2017 for_each_possible_cpu(i) {
2018 struct ip_vs_cpu_stats *u = per_cpu_ptr(net->ipvs->cpustats, i);
2019 seq_printf(seq, "%3X %8X %8X %8X %16LX %16LX\n",
2020 i, u->ustats.conns, u->ustats.inpkts,
2021 u->ustats.outpkts, (__u64)u->ustats.inbytes,
2022 (__u64)u->ustats.outbytes);
2023 }
2024
2025 spin_lock_bh(&tot_stats->lock);
2026 seq_printf(seq, " ~ %8X %8X %8X %16LX %16LX\n\n",
2027 tot_stats->ustats.conns, tot_stats->ustats.inpkts,
2028 tot_stats->ustats.outpkts,
2029 (unsigned long long) tot_stats->ustats.inbytes,
2030 (unsigned long long) tot_stats->ustats.outbytes);
2031
2032/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2033 seq_puts(seq,
2034 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
2035 seq_printf(seq, " %8X %8X %8X %16X %16X\n",
2036 tot_stats->ustats.cps,
2037 tot_stats->ustats.inpps,
2038 tot_stats->ustats.outpps,
2039 tot_stats->ustats.inbps,
2040 tot_stats->ustats.outbps);
2041 spin_unlock_bh(&tot_stats->lock);
2042
2043 return 0;
2044}
2045
2046static int ip_vs_stats_percpu_seq_open(struct inode *inode, struct file *file)
2047{
2048 return single_open_net(inode, file, ip_vs_stats_percpu_show);
2049}
2050
2051static const struct file_operations ip_vs_stats_percpu_fops = {
2052 .owner = THIS_MODULE,
2053 .open = ip_vs_stats_percpu_seq_open,
2054 .read = seq_read,
2055 .llseek = seq_lseek,
2056 .release = single_release,
2057};
1999#endif 2058#endif
2000 2059
2001/* 2060/*
2002 * Set timeout values for tcp tcpfin udp in the timeout_table. 2061 * Set timeout values for tcp tcpfin udp in the timeout_table.
2003 */ 2062 */
2004static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) 2063static int ip_vs_set_timeout(struct net *net, struct ip_vs_timeout_user *u)
2005{ 2064{
2065 struct ip_vs_proto_data *pd;
2066
2006 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", 2067 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
2007 u->tcp_timeout, 2068 u->tcp_timeout,
2008 u->tcp_fin_timeout, 2069 u->tcp_fin_timeout,
@@ -2010,19 +2071,22 @@ static int ip_vs_set_timeout(struct ip_vs_timeout_user *u)
2010 2071
2011#ifdef CONFIG_IP_VS_PROTO_TCP 2072#ifdef CONFIG_IP_VS_PROTO_TCP
2012 if (u->tcp_timeout) { 2073 if (u->tcp_timeout) {
2013 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] 2074 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2075 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
2014 = u->tcp_timeout * HZ; 2076 = u->tcp_timeout * HZ;
2015 } 2077 }
2016 2078
2017 if (u->tcp_fin_timeout) { 2079 if (u->tcp_fin_timeout) {
2018 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] 2080 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2081 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
2019 = u->tcp_fin_timeout * HZ; 2082 = u->tcp_fin_timeout * HZ;
2020 } 2083 }
2021#endif 2084#endif
2022 2085
2023#ifdef CONFIG_IP_VS_PROTO_UDP 2086#ifdef CONFIG_IP_VS_PROTO_UDP
2024 if (u->udp_timeout) { 2087 if (u->udp_timeout) {
2025 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] 2088 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2089 pd->timeout_table[IP_VS_UDP_S_NORMAL]
2026 = u->udp_timeout * HZ; 2090 = u->udp_timeout * HZ;
2027 } 2091 }
2028#endif 2092#endif
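The timeout tables move out of the global ip_vs_protocol_tcp/udp structures into per-netns protocol data, fetched with ip_vs_proto_data_get(net, IPPROTO_*). A short sketch of a per-net timeout adjuster; the NULL check is a defensive addition not present in the hunk above, and the -ENOENT choice is illustrative:

/* Sketch: set the ESTABLISHED timeout for one namespace's TCP table. */
static int my_set_tcp_timeout(struct net *net, unsigned int seconds)
{
	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);

	if (!pd)
		return -ENOENT;	/* protocol not registered for this net */
	pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] = seconds * HZ;
	return 0;
}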
@@ -2087,6 +2151,7 @@ static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
2087static int 2151static int
2088do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) 2152do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2089{ 2153{
2154 struct net *net = sock_net(sk);
2090 int ret; 2155 int ret;
2091 unsigned char arg[MAX_ARG_LEN]; 2156 unsigned char arg[MAX_ARG_LEN];
2092 struct ip_vs_service_user *usvc_compat; 2157 struct ip_vs_service_user *usvc_compat;
@@ -2121,19 +2186,20 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2121 2186
2122 if (cmd == IP_VS_SO_SET_FLUSH) { 2187 if (cmd == IP_VS_SO_SET_FLUSH) {
2123 /* Flush the virtual service */ 2188 /* Flush the virtual service */
2124 ret = ip_vs_flush(); 2189 ret = ip_vs_flush(net);
2125 goto out_unlock; 2190 goto out_unlock;
2126 } else if (cmd == IP_VS_SO_SET_TIMEOUT) { 2191 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
2127 /* Set timeout values for (tcp tcpfin udp) */ 2192 /* Set timeout values for (tcp tcpfin udp) */
2128 ret = ip_vs_set_timeout((struct ip_vs_timeout_user *)arg); 2193 ret = ip_vs_set_timeout(net, (struct ip_vs_timeout_user *)arg);
2129 goto out_unlock; 2194 goto out_unlock;
2130 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { 2195 } else if (cmd == IP_VS_SO_SET_STARTDAEMON) {
2131 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; 2196 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2132 ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); 2197 ret = start_sync_thread(net, dm->state, dm->mcast_ifn,
2198 dm->syncid);
2133 goto out_unlock; 2199 goto out_unlock;
2134 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { 2200 } else if (cmd == IP_VS_SO_SET_STOPDAEMON) {
2135 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; 2201 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
2136 ret = stop_sync_thread(dm->state); 2202 ret = stop_sync_thread(net, dm->state);
2137 goto out_unlock; 2203 goto out_unlock;
2138 } 2204 }
2139 2205
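do_ip_vs_set_ctl() now derives the namespace once from the socket that issued the setsockopt call (sock_net(sk)) and threads it through every command, so flushing, timeouts and the sync daemons all act on the caller's namespace rather than on globals. A compressed sketch of that dispatch shape; my_flush()/my_set_timeout() stand in for the per-net handlers called above:

#include <net/sock.h>

/* Sketch: namespace-aware sockopt dispatch. */
static int my_set_ctl(struct sock *sk, int cmd, void __user *user,
		      unsigned int len)
{
	struct net *net = sock_net(sk);	/* namespace of the caller */

	switch (cmd) {
	case IP_VS_SO_SET_FLUSH:
		return my_flush(net);
	case IP_VS_SO_SET_TIMEOUT:
		return my_set_timeout(net, user, len);
	default:
		return -EINVAL;
	}
}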
@@ -2148,7 +2214,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2148 if (cmd == IP_VS_SO_SET_ZERO) { 2214 if (cmd == IP_VS_SO_SET_ZERO) {
2149 /* if no service address is set, zero counters in all */ 2215 /* if no service address is set, zero counters in all */
2150 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) { 2216 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
2151 ret = ip_vs_zero_all(); 2217 ret = ip_vs_zero_all(net);
2152 goto out_unlock; 2218 goto out_unlock;
2153 } 2219 }
2154 } 2220 }
@@ -2165,10 +2231,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2165 2231
2166 /* Lookup the exact service by <protocol, addr, port> or fwmark */ 2232 /* Lookup the exact service by <protocol, addr, port> or fwmark */
2167 if (usvc.fwmark == 0) 2233 if (usvc.fwmark == 0)
2168 svc = __ip_vs_service_find(usvc.af, usvc.protocol, 2234 svc = __ip_vs_service_find(net, usvc.af, usvc.protocol,
2169 &usvc.addr, usvc.port); 2235 &usvc.addr, usvc.port);
2170 else 2236 else
2171 svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark); 2237 svc = __ip_vs_svc_fwm_find(net, usvc.af, usvc.fwmark);
2172 2238
2173 if (cmd != IP_VS_SO_SET_ADD 2239 if (cmd != IP_VS_SO_SET_ADD
2174 && (svc == NULL || svc->protocol != usvc.protocol)) { 2240 && (svc == NULL || svc->protocol != usvc.protocol)) {
@@ -2181,7 +2247,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
2181 if (svc != NULL) 2247 if (svc != NULL)
2182 ret = -EEXIST; 2248 ret = -EEXIST;
2183 else 2249 else
2184 ret = ip_vs_add_service(&usvc, &svc); 2250 ret = ip_vs_add_service(net, &usvc, &svc);
2185 break; 2251 break;
2186 case IP_VS_SO_SET_EDIT: 2252 case IP_VS_SO_SET_EDIT:
2187 ret = ip_vs_edit_service(svc, &usvc); 2253 ret = ip_vs_edit_service(svc, &usvc);
@@ -2241,7 +2307,8 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
2241} 2307}
2242 2308
2243static inline int 2309static inline int
2244__ip_vs_get_service_entries(const struct ip_vs_get_services *get, 2310__ip_vs_get_service_entries(struct net *net,
2311 const struct ip_vs_get_services *get,
2245 struct ip_vs_get_services __user *uptr) 2312 struct ip_vs_get_services __user *uptr)
2246{ 2313{
2247 int idx, count=0; 2314 int idx, count=0;
@@ -2252,7 +2319,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2252 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 2319 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2253 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { 2320 list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) {
2254 /* Only expose IPv4 entries to old interface */ 2321 /* Only expose IPv4 entries to old interface */
2255 if (svc->af != AF_INET) 2322 if (svc->af != AF_INET || !net_eq(svc->net, net))
2256 continue; 2323 continue;
2257 2324
2258 if (count >= get->num_services) 2325 if (count >= get->num_services)
@@ -2271,7 +2338,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2271 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 2338 for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
2272 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { 2339 list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) {
2273 /* Only expose IPv4 entries to old interface */ 2340 /* Only expose IPv4 entries to old interface */
2274 if (svc->af != AF_INET) 2341 if (svc->af != AF_INET || !net_eq(svc->net, net))
2275 continue; 2342 continue;
2276 2343
2277 if (count >= get->num_services) 2344 if (count >= get->num_services)
@@ -2291,7 +2358,7 @@ __ip_vs_get_service_entries(const struct ip_vs_get_services *get,
2291} 2358}
2292 2359
2293static inline int 2360static inline int
2294__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, 2361__ip_vs_get_dest_entries(struct net *net, const struct ip_vs_get_dests *get,
2295 struct ip_vs_get_dests __user *uptr) 2362 struct ip_vs_get_dests __user *uptr)
2296{ 2363{
2297 struct ip_vs_service *svc; 2364 struct ip_vs_service *svc;
@@ -2299,9 +2366,9 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2299 int ret = 0; 2366 int ret = 0;
2300 2367
2301 if (get->fwmark) 2368 if (get->fwmark)
2302 svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark); 2369 svc = __ip_vs_svc_fwm_find(net, AF_INET, get->fwmark);
2303 else 2370 else
2304 svc = __ip_vs_service_find(AF_INET, get->protocol, &addr, 2371 svc = __ip_vs_service_find(net, AF_INET, get->protocol, &addr,
2305 get->port); 2372 get->port);
2306 2373
2307 if (svc) { 2374 if (svc) {
@@ -2336,17 +2403,19 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get,
2336} 2403}
2337 2404
2338static inline void 2405static inline void
2339__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) 2406__ip_vs_get_timeouts(struct net *net, struct ip_vs_timeout_user *u)
2340{ 2407{
2408 struct ip_vs_proto_data *pd;
2409
2341#ifdef CONFIG_IP_VS_PROTO_TCP 2410#ifdef CONFIG_IP_VS_PROTO_TCP
2342 u->tcp_timeout = 2411 pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
2343 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; 2412 u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
2344 u->tcp_fin_timeout = 2413 u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2345 ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
2346#endif 2414#endif
2347#ifdef CONFIG_IP_VS_PROTO_UDP 2415#ifdef CONFIG_IP_VS_PROTO_UDP
2416 pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
2348 u->udp_timeout = 2417 u->udp_timeout =
2349 ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; 2418 pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
2350#endif 2419#endif
2351} 2420}
2352 2421
@@ -2375,7 +2444,10 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2375 unsigned char arg[128]; 2444 unsigned char arg[128];
2376 int ret = 0; 2445 int ret = 0;
2377 unsigned int copylen; 2446 unsigned int copylen;
2447 struct net *net = sock_net(sk);
2448 struct netns_ipvs *ipvs = net_ipvs(net);
2378 2449
2450 BUG_ON(!net);
2379 if (!capable(CAP_NET_ADMIN)) 2451 if (!capable(CAP_NET_ADMIN))
2380 return -EPERM; 2452 return -EPERM;
2381 2453
@@ -2418,7 +2490,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2418 struct ip_vs_getinfo info; 2490 struct ip_vs_getinfo info;
2419 info.version = IP_VS_VERSION_CODE; 2491 info.version = IP_VS_VERSION_CODE;
2420 info.size = ip_vs_conn_tab_size; 2492 info.size = ip_vs_conn_tab_size;
2421 info.num_services = ip_vs_num_services; 2493 info.num_services = ipvs->num_services;
2422 if (copy_to_user(user, &info, sizeof(info)) != 0) 2494 if (copy_to_user(user, &info, sizeof(info)) != 0)
2423 ret = -EFAULT; 2495 ret = -EFAULT;
2424 } 2496 }
@@ -2437,7 +2509,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2437 ret = -EINVAL; 2509 ret = -EINVAL;
2438 goto out; 2510 goto out;
2439 } 2511 }
2440 ret = __ip_vs_get_service_entries(get, user); 2512 ret = __ip_vs_get_service_entries(net, get, user);
2441 } 2513 }
2442 break; 2514 break;
2443 2515
@@ -2450,10 +2522,11 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2450 entry = (struct ip_vs_service_entry *)arg; 2522 entry = (struct ip_vs_service_entry *)arg;
2451 addr.ip = entry->addr; 2523 addr.ip = entry->addr;
2452 if (entry->fwmark) 2524 if (entry->fwmark)
2453 svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark); 2525 svc = __ip_vs_svc_fwm_find(net, AF_INET, entry->fwmark);
2454 else 2526 else
2455 svc = __ip_vs_service_find(AF_INET, entry->protocol, 2527 svc = __ip_vs_service_find(net, AF_INET,
2456 &addr, entry->port); 2528 entry->protocol, &addr,
2529 entry->port);
2457 if (svc) { 2530 if (svc) {
2458 ip_vs_copy_service(entry, svc); 2531 ip_vs_copy_service(entry, svc);
2459 if (copy_to_user(user, entry, sizeof(*entry)) != 0) 2532 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
@@ -2476,7 +2549,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2476 ret = -EINVAL; 2549 ret = -EINVAL;
2477 goto out; 2550 goto out;
2478 } 2551 }
2479 ret = __ip_vs_get_dest_entries(get, user); 2552 ret = __ip_vs_get_dest_entries(net, get, user);
2480 } 2553 }
2481 break; 2554 break;
2482 2555
@@ -2484,7 +2557,7 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2484 { 2557 {
2485 struct ip_vs_timeout_user t; 2558 struct ip_vs_timeout_user t;
2486 2559
2487 __ip_vs_get_timeouts(&t); 2560 __ip_vs_get_timeouts(net, &t);
2488 if (copy_to_user(user, &t, sizeof(t)) != 0) 2561 if (copy_to_user(user, &t, sizeof(t)) != 0)
2489 ret = -EFAULT; 2562 ret = -EFAULT;
2490 } 2563 }
@@ -2495,15 +2568,17 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
2495 struct ip_vs_daemon_user d[2]; 2568 struct ip_vs_daemon_user d[2];
2496 2569
2497 memset(&d, 0, sizeof(d)); 2570 memset(&d, 0, sizeof(d));
2498 if (ip_vs_sync_state & IP_VS_STATE_MASTER) { 2571 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
2499 d[0].state = IP_VS_STATE_MASTER; 2572 d[0].state = IP_VS_STATE_MASTER;
2500 strlcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn, sizeof(d[0].mcast_ifn)); 2573 strlcpy(d[0].mcast_ifn, ipvs->master_mcast_ifn,
2501 d[0].syncid = ip_vs_master_syncid; 2574 sizeof(d[0].mcast_ifn));
2575 d[0].syncid = ipvs->master_syncid;
2502 } 2576 }
2503 if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { 2577 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
2504 d[1].state = IP_VS_STATE_BACKUP; 2578 d[1].state = IP_VS_STATE_BACKUP;
2505 strlcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn, sizeof(d[1].mcast_ifn)); 2579 strlcpy(d[1].mcast_ifn, ipvs->backup_mcast_ifn,
2506 d[1].syncid = ip_vs_backup_syncid; 2580 sizeof(d[1].mcast_ifn));
2581 d[1].syncid = ipvs->backup_syncid;
2507 } 2582 }
2508 if (copy_to_user(user, &d, sizeof(d)) != 0) 2583 if (copy_to_user(user, &d, sizeof(d)) != 0)
2509 ret = -EFAULT; 2584 ret = -EFAULT;
@@ -2542,6 +2617,7 @@ static struct genl_family ip_vs_genl_family = {
2542 .name = IPVS_GENL_NAME, 2617 .name = IPVS_GENL_NAME,
2543 .version = IPVS_GENL_VERSION, 2618 .version = IPVS_GENL_VERSION,
2544 .maxattr = IPVS_CMD_MAX, 2619 .maxattr = IPVS_CMD_MAX,
2620 .netnsok = true, /* Make ipvsadm to work on netns */
2545}; 2621};
2546 2622
2547/* Policy used for first-level command attributes */ 2623/* Policy used for first-level command attributes */
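Setting .netnsok = true on the genl family is what allows these commands to be issued from a non-init namespace at all; without it, generic netlink rejects such requests. The handlers must then recover the namespace from the request: this series uses an skb_sknet() helper, while genl_info_net(info) is the stock accessor inside a doit() handler. A minimal hedged skeleton (my_zero_all() is illustrative):

#include <net/genetlink.h>

/* Sketch: a netns-aware generic netlink doit handler. */
static int my_genl_zero(struct sk_buff *skb, struct genl_info *info)
{
	struct net *net = genl_info_net(info);

	return my_zero_all(net);
}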
@@ -2696,11 +2772,12 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
2696 int idx = 0, i; 2772 int idx = 0, i;
2697 int start = cb->args[0]; 2773 int start = cb->args[0];
2698 struct ip_vs_service *svc; 2774 struct ip_vs_service *svc;
2775 struct net *net = skb_sknet(skb);
2699 2776
2700 mutex_lock(&__ip_vs_mutex); 2777 mutex_lock(&__ip_vs_mutex);
2701 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { 2778 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2702 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) { 2779 list_for_each_entry(svc, &ip_vs_svc_table[i], s_list) {
2703 if (++idx <= start) 2780 if (++idx <= start || !net_eq(svc->net, net))
2704 continue; 2781 continue;
2705 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { 2782 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2706 idx--; 2783 idx--;
@@ -2711,7 +2788,7 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb,
2711 2788
2712 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) { 2789 for (i = 0; i < IP_VS_SVC_TAB_SIZE; i++) {
2713 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) { 2790 list_for_each_entry(svc, &ip_vs_svc_fwm_table[i], f_list) {
2714 if (++idx <= start) 2791 if (++idx <= start || !net_eq(svc->net, net))
2715 continue; 2792 continue;
2716 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) { 2793 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
2717 idx--; 2794 idx--;
@@ -2727,7 +2804,8 @@ nla_put_failure:
2727 return skb->len; 2804 return skb->len;
2728} 2805}
2729 2806
2730static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, 2807static int ip_vs_genl_parse_service(struct net *net,
2808 struct ip_vs_service_user_kern *usvc,
2731 struct nlattr *nla, int full_entry, 2809 struct nlattr *nla, int full_entry,
2732 struct ip_vs_service **ret_svc) 2810 struct ip_vs_service **ret_svc)
2733{ 2811{
@@ -2770,9 +2848,9 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
2770 } 2848 }
2771 2849
2772 if (usvc->fwmark) 2850 if (usvc->fwmark)
2773 svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark); 2851 svc = __ip_vs_svc_fwm_find(net, usvc->af, usvc->fwmark);
2774 else 2852 else
2775 svc = __ip_vs_service_find(usvc->af, usvc->protocol, 2853 svc = __ip_vs_service_find(net, usvc->af, usvc->protocol,
2776 &usvc->addr, usvc->port); 2854 &usvc->addr, usvc->port);
2777 *ret_svc = svc; 2855 *ret_svc = svc;
2778 2856
@@ -2809,13 +2887,14 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc,
2809 return 0; 2887 return 0;
2810} 2888}
2811 2889
2812static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) 2890static struct ip_vs_service *ip_vs_genl_find_service(struct net *net,
2891 struct nlattr *nla)
2813{ 2892{
2814 struct ip_vs_service_user_kern usvc; 2893 struct ip_vs_service_user_kern usvc;
2815 struct ip_vs_service *svc; 2894 struct ip_vs_service *svc;
2816 int ret; 2895 int ret;
2817 2896
2818 ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc); 2897 ret = ip_vs_genl_parse_service(net, &usvc, nla, 0, &svc);
2819 return ret ? ERR_PTR(ret) : svc; 2898 return ret ? ERR_PTR(ret) : svc;
2820} 2899}
2821 2900
@@ -2883,6 +2962,7 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
2883 struct ip_vs_service *svc; 2962 struct ip_vs_service *svc;
2884 struct ip_vs_dest *dest; 2963 struct ip_vs_dest *dest;
2885 struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1]; 2964 struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
2965 struct net *net = skb_sknet(skb);
2886 2966
2887 mutex_lock(&__ip_vs_mutex); 2967 mutex_lock(&__ip_vs_mutex);
2888 2968
@@ -2891,7 +2971,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb,
2891 IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) 2971 IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy))
2892 goto out_err; 2972 goto out_err;
2893 2973
2894 svc = ip_vs_genl_find_service(attrs[IPVS_CMD_ATTR_SERVICE]); 2974
2975 svc = ip_vs_genl_find_service(net, attrs[IPVS_CMD_ATTR_SERVICE]);
2895 if (IS_ERR(svc) || svc == NULL) 2976 if (IS_ERR(svc) || svc == NULL)
2896 goto out_err; 2977 goto out_err;
2897 2978
@@ -3005,20 +3086,23 @@ nla_put_failure:
3005static int ip_vs_genl_dump_daemons(struct sk_buff *skb, 3086static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
3006 struct netlink_callback *cb) 3087 struct netlink_callback *cb)
3007{ 3088{
3089 struct net *net = skb_net(skb);
3090 struct netns_ipvs *ipvs = net_ipvs(net);
3091
3008 mutex_lock(&__ip_vs_mutex); 3092 mutex_lock(&__ip_vs_mutex);
3009 if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) { 3093 if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
3010 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER, 3094 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
3011 ip_vs_master_mcast_ifn, 3095 ipvs->master_mcast_ifn,
3012 ip_vs_master_syncid, cb) < 0) 3096 ipvs->master_syncid, cb) < 0)
3013 goto nla_put_failure; 3097 goto nla_put_failure;
3014 3098
3015 cb->args[0] = 1; 3099 cb->args[0] = 1;
3016 } 3100 }
3017 3101
3018 if ((ip_vs_sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) { 3102 if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
3019 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP, 3103 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
3020 ip_vs_backup_mcast_ifn, 3104 ipvs->backup_mcast_ifn,
3021 ip_vs_backup_syncid, cb) < 0) 3105 ipvs->backup_syncid, cb) < 0)
3022 goto nla_put_failure; 3106 goto nla_put_failure;
3023 3107
3024 cb->args[1] = 1; 3108 cb->args[1] = 1;
@@ -3030,31 +3114,33 @@ nla_put_failure:
3030 return skb->len; 3114 return skb->len;
3031} 3115}
3032 3116
3033static int ip_vs_genl_new_daemon(struct nlattr **attrs) 3117static int ip_vs_genl_new_daemon(struct net *net, struct nlattr **attrs)
3034{ 3118{
3035 if (!(attrs[IPVS_DAEMON_ATTR_STATE] && 3119 if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
3036 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && 3120 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
3037 attrs[IPVS_DAEMON_ATTR_SYNC_ID])) 3121 attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
3038 return -EINVAL; 3122 return -EINVAL;
3039 3123
3040 return start_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]), 3124 return start_sync_thread(net,
3125 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]),
3041 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), 3126 nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
3042 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID])); 3127 nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]));
3043} 3128}
3044 3129
3045static int ip_vs_genl_del_daemon(struct nlattr **attrs) 3130static int ip_vs_genl_del_daemon(struct net *net, struct nlattr **attrs)
3046{ 3131{
3047 if (!attrs[IPVS_DAEMON_ATTR_STATE]) 3132 if (!attrs[IPVS_DAEMON_ATTR_STATE])
3048 return -EINVAL; 3133 return -EINVAL;
3049 3134
3050 return stop_sync_thread(nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE])); 3135 return stop_sync_thread(net,
3136 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
3051} 3137}
3052 3138
3053static int ip_vs_genl_set_config(struct nlattr **attrs) 3139static int ip_vs_genl_set_config(struct net *net, struct nlattr **attrs)
3054{ 3140{
3055 struct ip_vs_timeout_user t; 3141 struct ip_vs_timeout_user t;
3056 3142
3057 __ip_vs_get_timeouts(&t); 3143 __ip_vs_get_timeouts(net, &t);
3058 3144
3059 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]) 3145 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
3060 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]); 3146 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
@@ -3066,7 +3152,7 @@ static int ip_vs_genl_set_config(struct nlattr **attrs)
3066 if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]) 3152 if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
3067 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]); 3153 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
3068 3154
3069 return ip_vs_set_timeout(&t); 3155 return ip_vs_set_timeout(net, &t);
3070} 3156}
3071 3157
3072static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) 3158static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
@@ -3076,16 +3162,20 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3076 struct ip_vs_dest_user_kern udest; 3162 struct ip_vs_dest_user_kern udest;
3077 int ret = 0, cmd; 3163 int ret = 0, cmd;
3078 int need_full_svc = 0, need_full_dest = 0; 3164 int need_full_svc = 0, need_full_dest = 0;
3165 struct net *net;
3166 struct netns_ipvs *ipvs;
3079 3167
3168 net = skb_sknet(skb);
3169 ipvs = net_ipvs(net);
3080 cmd = info->genlhdr->cmd; 3170 cmd = info->genlhdr->cmd;
3081 3171
3082 mutex_lock(&__ip_vs_mutex); 3172 mutex_lock(&__ip_vs_mutex);
3083 3173
3084 if (cmd == IPVS_CMD_FLUSH) { 3174 if (cmd == IPVS_CMD_FLUSH) {
3085 ret = ip_vs_flush(); 3175 ret = ip_vs_flush(net);
3086 goto out; 3176 goto out;
3087 } else if (cmd == IPVS_CMD_SET_CONFIG) { 3177 } else if (cmd == IPVS_CMD_SET_CONFIG) {
3088 ret = ip_vs_genl_set_config(info->attrs); 3178 ret = ip_vs_genl_set_config(net, info->attrs);
3089 goto out; 3179 goto out;
3090 } else if (cmd == IPVS_CMD_NEW_DAEMON || 3180 } else if (cmd == IPVS_CMD_NEW_DAEMON ||
3091 cmd == IPVS_CMD_DEL_DAEMON) { 3181 cmd == IPVS_CMD_DEL_DAEMON) {
@@ -3101,13 +3191,13 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3101 } 3191 }
3102 3192
3103 if (cmd == IPVS_CMD_NEW_DAEMON) 3193 if (cmd == IPVS_CMD_NEW_DAEMON)
3104 ret = ip_vs_genl_new_daemon(daemon_attrs); 3194 ret = ip_vs_genl_new_daemon(net, daemon_attrs);
3105 else 3195 else
3106 ret = ip_vs_genl_del_daemon(daemon_attrs); 3196 ret = ip_vs_genl_del_daemon(net, daemon_attrs);
3107 goto out; 3197 goto out;
3108 } else if (cmd == IPVS_CMD_ZERO && 3198 } else if (cmd == IPVS_CMD_ZERO &&
3109 !info->attrs[IPVS_CMD_ATTR_SERVICE]) { 3199 !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
3110 ret = ip_vs_zero_all(); 3200 ret = ip_vs_zero_all(net);
3111 goto out; 3201 goto out;
3112 } 3202 }
3113 3203
@@ -3117,7 +3207,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3117 if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE) 3207 if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
3118 need_full_svc = 1; 3208 need_full_svc = 1;
3119 3209
3120 ret = ip_vs_genl_parse_service(&usvc, 3210 ret = ip_vs_genl_parse_service(net, &usvc,
3121 info->attrs[IPVS_CMD_ATTR_SERVICE], 3211 info->attrs[IPVS_CMD_ATTR_SERVICE],
3122 need_full_svc, &svc); 3212 need_full_svc, &svc);
3123 if (ret) 3213 if (ret)
@@ -3147,7 +3237,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
3147 switch (cmd) { 3237 switch (cmd) {
3148 case IPVS_CMD_NEW_SERVICE: 3238 case IPVS_CMD_NEW_SERVICE:
3149 if (svc == NULL) 3239 if (svc == NULL)
3150 ret = ip_vs_add_service(&usvc, &svc); 3240 ret = ip_vs_add_service(net, &usvc, &svc);
3151 else 3241 else
3152 ret = -EEXIST; 3242 ret = -EEXIST;
3153 break; 3243 break;
@@ -3185,7 +3275,11 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3185 struct sk_buff *msg; 3275 struct sk_buff *msg;
3186 void *reply; 3276 void *reply;
3187 int ret, cmd, reply_cmd; 3277 int ret, cmd, reply_cmd;
3278 struct net *net;
3279 struct netns_ipvs *ipvs;
3188 3280
3281 net = skb_sknet(skb);
3282 ipvs = net_ipvs(net);
3189 cmd = info->genlhdr->cmd; 3283 cmd = info->genlhdr->cmd;
3190 3284
3191 if (cmd == IPVS_CMD_GET_SERVICE) 3285 if (cmd == IPVS_CMD_GET_SERVICE)
@@ -3214,7 +3308,8 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3214 { 3308 {
3215 struct ip_vs_service *svc; 3309 struct ip_vs_service *svc;
3216 3310
3217 svc = ip_vs_genl_find_service(info->attrs[IPVS_CMD_ATTR_SERVICE]); 3311 svc = ip_vs_genl_find_service(net,
3312 info->attrs[IPVS_CMD_ATTR_SERVICE]);
3218 if (IS_ERR(svc)) { 3313 if (IS_ERR(svc)) {
3219 ret = PTR_ERR(svc); 3314 ret = PTR_ERR(svc);
3220 goto out_err; 3315 goto out_err;
@@ -3234,7 +3329,7 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
3234 { 3329 {
3235 struct ip_vs_timeout_user t; 3330 struct ip_vs_timeout_user t;
3236 3331
3237 __ip_vs_get_timeouts(&t); 3332 __ip_vs_get_timeouts(net, &t);
3238#ifdef CONFIG_IP_VS_PROTO_TCP 3333#ifdef CONFIG_IP_VS_PROTO_TCP
3239 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout); 3334 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP, t.tcp_timeout);
3240 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN, 3335 NLA_PUT_U32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
@@ -3380,62 +3475,172 @@ static void ip_vs_genl_unregister(void)
3380 3475
3381/* End of Generic Netlink interface definitions */ 3476/* End of Generic Netlink interface definitions */
3382 3477
3478/*
 3479 * per netns init/exit func.
3480 */
3481int __net_init __ip_vs_control_init(struct net *net)
3482{
3483 int idx;
3484 struct netns_ipvs *ipvs = net_ipvs(net);
3485 struct ctl_table *tbl;
3486
3487 atomic_set(&ipvs->dropentry, 0);
3488 spin_lock_init(&ipvs->dropentry_lock);
3489 spin_lock_init(&ipvs->droppacket_lock);
3490 spin_lock_init(&ipvs->securetcp_lock);
3491 ipvs->rs_lock = __RW_LOCK_UNLOCKED(ipvs->rs_lock);
3492
3493 /* Initialize rs_table */
3494 for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3495 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3496
3497 INIT_LIST_HEAD(&ipvs->dest_trash);
3498 atomic_set(&ipvs->ftpsvc_counter, 0);
3499 atomic_set(&ipvs->nullsvc_counter, 0);
3500
3501 /* procfs stats */
3502 ipvs->tot_stats = kzalloc(sizeof(struct ip_vs_stats), GFP_KERNEL);
3503 if (ipvs->tot_stats == NULL) {
3504 pr_err("%s(): no memory.\n", __func__);
3505 return -ENOMEM;
3506 }
3507 ipvs->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
3508 if (!ipvs->cpustats) {
3509 pr_err("%s() alloc_percpu failed\n", __func__);
3510 goto err_alloc;
3511 }
3512 spin_lock_init(&ipvs->tot_stats->lock);
3513
3514 for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
3515 INIT_LIST_HEAD(&ipvs->rs_table[idx]);
3516
3517 proc_net_fops_create(net, "ip_vs", 0, &ip_vs_info_fops);
3518 proc_net_fops_create(net, "ip_vs_stats", 0, &ip_vs_stats_fops);
3519 proc_net_fops_create(net, "ip_vs_stats_percpu", 0,
3520 &ip_vs_stats_percpu_fops);
3521
3522 if (!net_eq(net, &init_net)) {
3523 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
3524 if (tbl == NULL)
3525 goto err_dup;
3526 } else
3527 tbl = vs_vars;
3528 /* Initialize sysctl defaults */
3529 idx = 0;
3530 ipvs->sysctl_amemthresh = 1024;
3531 tbl[idx++].data = &ipvs->sysctl_amemthresh;
3532 ipvs->sysctl_am_droprate = 10;
3533 tbl[idx++].data = &ipvs->sysctl_am_droprate;
3534 tbl[idx++].data = &ipvs->sysctl_drop_entry;
3535 tbl[idx++].data = &ipvs->sysctl_drop_packet;
3536#ifdef CONFIG_IP_VS_NFCT
3537 tbl[idx++].data = &ipvs->sysctl_conntrack;
3538#endif
3539 tbl[idx++].data = &ipvs->sysctl_secure_tcp;
3540 ipvs->sysctl_snat_reroute = 1;
3541 tbl[idx++].data = &ipvs->sysctl_snat_reroute;
3542 ipvs->sysctl_sync_ver = 1;
3543 tbl[idx++].data = &ipvs->sysctl_sync_ver;
3544 tbl[idx++].data = &ipvs->sysctl_cache_bypass;
3545 tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
3546 tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
3547 ipvs->sysctl_sync_threshold[0] = 3;
3548 ipvs->sysctl_sync_threshold[1] = 50;
3549 tbl[idx].data = &ipvs->sysctl_sync_threshold;
3550 tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
3551 tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
3552
3553
3554 ipvs->sysctl_hdr = register_net_sysctl_table(net, net_vs_ctl_path,
3555 vs_vars);
3556 if (ipvs->sysctl_hdr == NULL)
3557 goto err_reg;
3558 ip_vs_new_estimator(net, ipvs->tot_stats);
3559 ipvs->sysctl_tbl = tbl;
3560 /* Schedule defense work */
3561 INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
3562 schedule_delayed_work(&ipvs->defense_work, DEFENSE_TIMER_PERIOD);
3563 return 0;
3564
3565err_reg:
3566 if (!net_eq(net, &init_net))
3567 kfree(tbl);
3568err_dup:
3569 free_percpu(ipvs->cpustats);
3570err_alloc:
3571 kfree(ipvs->tot_stats);
3572 return -ENOMEM;
3573}
3574
3575static void __net_exit __ip_vs_control_cleanup(struct net *net)
3576{
3577 struct netns_ipvs *ipvs = net_ipvs(net);
3578
3579 ip_vs_trash_cleanup(net);
3580 ip_vs_kill_estimator(net, ipvs->tot_stats);
3581 cancel_delayed_work_sync(&ipvs->defense_work);
3582 cancel_work_sync(&ipvs->defense_work.work);
3583 unregister_net_sysctl_table(ipvs->sysctl_hdr);
3584 proc_net_remove(net, "ip_vs_stats_percpu");
3585 proc_net_remove(net, "ip_vs_stats");
3586 proc_net_remove(net, "ip_vs");
3587 free_percpu(ipvs->cpustats);
3588 kfree(ipvs->tot_stats);
3589}
3590
3591static struct pernet_operations ipvs_control_ops = {
3592 .init = __ip_vs_control_init,
3593 .exit = __ip_vs_control_cleanup,
3594};
3383 3595
3384int __init ip_vs_control_init(void) 3596int __init ip_vs_control_init(void)
3385{ 3597{
3386 int ret;
3387 int idx; 3598 int idx;
3599 int ret;
3388 3600
3389 EnterFunction(2); 3601 EnterFunction(2);
3390 3602
3391 /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ 3603 /* Initialize svc_table, ip_vs_svc_fwm_table, rs_table */
3392 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { 3604 for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) {
3393 INIT_LIST_HEAD(&ip_vs_svc_table[idx]); 3605 INIT_LIST_HEAD(&ip_vs_svc_table[idx]);
3394 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); 3606 INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]);
3395 } 3607 }
3396 for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { 3608
3397 INIT_LIST_HEAD(&ip_vs_rtable[idx]); 3609 ret = register_pernet_subsys(&ipvs_control_ops);
3610 if (ret) {
3611 pr_err("cannot register namespace.\n");
3612 goto err;
3398 } 3613 }
3399 smp_wmb(); 3614
3615 smp_wmb(); /* Do we really need it now ? */
3400 3616
3401 ret = nf_register_sockopt(&ip_vs_sockopts); 3617 ret = nf_register_sockopt(&ip_vs_sockopts);
3402 if (ret) { 3618 if (ret) {
3403 pr_err("cannot register sockopt.\n"); 3619 pr_err("cannot register sockopt.\n");
3404 return ret; 3620 goto err_net;
3405 } 3621 }
3406 3622
3407 ret = ip_vs_genl_register(); 3623 ret = ip_vs_genl_register();
3408 if (ret) { 3624 if (ret) {
3409 pr_err("cannot register Generic Netlink interface.\n"); 3625 pr_err("cannot register Generic Netlink interface.\n");
3410 nf_unregister_sockopt(&ip_vs_sockopts); 3626 nf_unregister_sockopt(&ip_vs_sockopts);
3411 return ret; 3627 goto err_net;
3412 } 3628 }
3413 3629
3414 proc_net_fops_create(&init_net, "ip_vs", 0, &ip_vs_info_fops);
3415 proc_net_fops_create(&init_net, "ip_vs_stats",0, &ip_vs_stats_fops);
3416
3417 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars);
3418
3419 ip_vs_new_estimator(&ip_vs_stats);
3420
3421 /* Hook the defense timer */
3422 schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD);
3423
3424 LeaveFunction(2); 3630 LeaveFunction(2);
3425 return 0; 3631 return 0;
3632
3633err_net:
3634 unregister_pernet_subsys(&ipvs_control_ops);
3635err:
3636 return ret;
3426} 3637}
3427 3638
3428 3639
3429void ip_vs_control_cleanup(void) 3640void ip_vs_control_cleanup(void)
3430{ 3641{
3431 EnterFunction(2); 3642 EnterFunction(2);
3432 ip_vs_trash_cleanup(); 3643 unregister_pernet_subsys(&ipvs_control_ops);
3433 cancel_delayed_work_sync(&defense_work);
3434 cancel_work_sync(&defense_work.work);
3435 ip_vs_kill_estimator(&ip_vs_stats);
3436 unregister_sysctl_table(sysctl_header);
3437 proc_net_remove(&init_net, "ip_vs_stats");
3438 proc_net_remove(&init_net, "ip_vs");
3439 ip_vs_genl_unregister(); 3644 ip_vs_genl_unregister();
3440 nf_unregister_sockopt(&ip_vs_sockopts); 3645 nf_unregister_sockopt(&ip_vs_sockopts);
3441 LeaveFunction(2); 3646 LeaveFunction(2);
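The hunk above is the heart of the conversion: everything that used to be set up once in ip_vs_control_init() (proc entries, sysctl registration, the estimator, the defense timer) now lives in a __net_init/__net_exit pair hung off pernet_operations, and the module init merely registers that subsystem and unwinds through err_net if later registrations fail. IPVS hangs its state off a dedicated net_ipvs(net) pointer; the sketch below shows the same shape using the generic net_generic() mechanism instead, with all my_* names illustrative:

#include <linux/module.h>
#include <linux/types.h>
#include <linux/percpu.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct my_cpu_stats {
	u64 conns;
};

static int my_net_id __read_mostly;

struct my_pernet {
	struct my_cpu_stats __percpu *cpustats;
};

/* Per-namespace setup: runs for init_net at registration time and for
 * every namespace created afterwards. */
static int __net_init my_net_init(struct net *net)
{
	struct my_pernet *pn = net_generic(net, my_net_id);

	pn->cpustats = alloc_percpu(struct my_cpu_stats);
	if (!pn->cpustats)
		return -ENOMEM;	/* namespace creation fails cleanly */
	return 0;
}

/* Per-namespace teardown: must undo everything init did for this net. */
static void __net_exit my_net_exit(struct net *net)
{
	struct my_pernet *pn = net_generic(net, my_net_id);

	free_percpu(pn->cpustats);
}

static struct pernet_operations my_net_ops = {
	.init = my_net_init,
	.exit = my_net_exit,
	.id   = &my_net_id,
	.size = sizeof(struct my_pernet),
};

static int __init my_init(void)
{
	return register_pernet_subsys(&my_net_ops);
}

static void __exit my_fini(void)
{
	unregister_pernet_subsys(&my_net_ops);
}

module_init(my_init);
module_exit(my_fini);
MODULE_LICENSE("GPL");

The same discipline shows in the rewritten ip_vs_control_cleanup() above: instead of tearing down globals one by one, it simply unregisters the pernet subsystem, which runs the per-net exit handler for every remaining namespace.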
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index ff28801962e0..f560a05c965a 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -8,8 +8,12 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version. 9 * 2 of the License, or (at your option) any later version.
10 * 10 *
11 * Changes: 11 * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
12 * 12 * Network name space (netns) aware.
13 * Global data moved to netns i.e struct netns_ipvs
14 * Affected data: est_list and est_lock.
15 * estimation_timer() runs with timer per netns.
16 * get_stats()) do the per cpu summing.
13 */ 17 */
14 18
15#define KMSG_COMPONENT "IPVS" 19#define KMSG_COMPONENT "IPVS"
@@ -48,11 +52,42 @@
48 */ 52 */
49 53
50 54
51static void estimation_timer(unsigned long arg); 55/*
56 * Make a summary from each cpu
57 */
58static void ip_vs_read_cpu_stats(struct ip_vs_stats_user *sum,
59 struct ip_vs_cpu_stats *stats)
60{
61 int i;
62
63 for_each_possible_cpu(i) {
64 struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i);
65 unsigned int start;
66 __u64 inbytes, outbytes;
67 if (i) {
68 sum->conns += s->ustats.conns;
69 sum->inpkts += s->ustats.inpkts;
70 sum->outpkts += s->ustats.outpkts;
71 do {
72 start = u64_stats_fetch_begin_bh(&s->syncp);
73 inbytes = s->ustats.inbytes;
74 outbytes = s->ustats.outbytes;
75 } while (u64_stats_fetch_retry_bh(&s->syncp, start));
76 sum->inbytes += inbytes;
77 sum->outbytes += outbytes;
78 } else {
79 sum->conns = s->ustats.conns;
80 sum->inpkts = s->ustats.inpkts;
81 sum->outpkts = s->ustats.outpkts;
82 do {
83 start = u64_stats_fetch_begin_bh(&s->syncp);
84 sum->inbytes = s->ustats.inbytes;
85 sum->outbytes = s->ustats.outbytes;
86 } while (u64_stats_fetch_retry_bh(&s->syncp, start));
87 }
88 }
89}
52 90
53static LIST_HEAD(est_list);
54static DEFINE_SPINLOCK(est_lock);
55static DEFINE_TIMER(est_timer, estimation_timer, 0, 0);
56 91
57static void estimation_timer(unsigned long arg) 92static void estimation_timer(unsigned long arg)
58{ 93{
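ip_vs_read_cpu_stats() above folds the per-cpu counters into a single ip_vs_stats_user, using u64_stats_fetch_begin_bh()/u64_stats_fetch_retry_bh() so the 64-bit byte counters are read consistently on 32-bit SMP. A condensed sketch of the same reader over a hypothetical counter block (my_counters and its fields are illustrative):

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/u64_stats_sync.h>

struct my_counters {
	u64 pkts;
	u64 bytes;
	struct u64_stats_sync syncp;
};

/* Sketch: sum 64-bit per-cpu counters safely across all possible CPUs. */
static void my_sum(u64 *pkts, u64 *bytes,
		   struct my_counters __percpu *stats)
{
	int cpu;

	*pkts = 0;
	*bytes = 0;
	for_each_possible_cpu(cpu) {
		struct my_counters *c = per_cpu_ptr(stats, cpu);
		unsigned int start;
		u64 p, b;

		do {
			start = u64_stats_fetch_begin_bh(&c->syncp);
			p = c->pkts;
			b = c->bytes;
		} while (u64_stats_fetch_retry_bh(&c->syncp, start));
		*pkts += p;
		*bytes += b;
	}
}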
@@ -62,11 +97,16 @@ static void estimation_timer(unsigned long arg)
62 u32 n_inpkts, n_outpkts; 97 u32 n_inpkts, n_outpkts;
63 u64 n_inbytes, n_outbytes; 98 u64 n_inbytes, n_outbytes;
64 u32 rate; 99 u32 rate;
100 struct net *net = (struct net *)arg;
101 struct netns_ipvs *ipvs;
65 102
66 spin_lock(&est_lock); 103 ipvs = net_ipvs(net);
67 list_for_each_entry(e, &est_list, list) { 104 ip_vs_read_cpu_stats(&ipvs->tot_stats->ustats, ipvs->cpustats);
105 spin_lock(&ipvs->est_lock);
106 list_for_each_entry(e, &ipvs->est_list, list) {
68 s = container_of(e, struct ip_vs_stats, est); 107 s = container_of(e, struct ip_vs_stats, est);
69 108
109 ip_vs_read_cpu_stats(&s->ustats, s->cpustats);
70 spin_lock(&s->lock); 110 spin_lock(&s->lock);
71 n_conns = s->ustats.conns; 111 n_conns = s->ustats.conns;
72 n_inpkts = s->ustats.inpkts; 112 n_inpkts = s->ustats.inpkts;
@@ -75,38 +115,39 @@ static void estimation_timer(unsigned long arg)
75 n_outbytes = s->ustats.outbytes; 115 n_outbytes = s->ustats.outbytes;
76 116
77 /* scaled by 2^10, but divided 2 seconds */ 117 /* scaled by 2^10, but divided 2 seconds */
78 rate = (n_conns - e->last_conns)<<9; 118 rate = (n_conns - e->last_conns) << 9;
79 e->last_conns = n_conns; 119 e->last_conns = n_conns;
80 e->cps += ((long)rate - (long)e->cps)>>2; 120 e->cps += ((long)rate - (long)e->cps) >> 2;
81 s->ustats.cps = (e->cps+0x1FF)>>10; 121 s->ustats.cps = (e->cps + 0x1FF) >> 10;
82 122
83 rate = (n_inpkts - e->last_inpkts)<<9; 123 rate = (n_inpkts - e->last_inpkts) << 9;
84 e->last_inpkts = n_inpkts; 124 e->last_inpkts = n_inpkts;
85 e->inpps += ((long)rate - (long)e->inpps)>>2; 125 e->inpps += ((long)rate - (long)e->inpps) >> 2;
86 s->ustats.inpps = (e->inpps+0x1FF)>>10; 126 s->ustats.inpps = (e->inpps + 0x1FF) >> 10;
87 127
88 rate = (n_outpkts - e->last_outpkts)<<9; 128 rate = (n_outpkts - e->last_outpkts) << 9;
89 e->last_outpkts = n_outpkts; 129 e->last_outpkts = n_outpkts;
90 e->outpps += ((long)rate - (long)e->outpps)>>2; 130 e->outpps += ((long)rate - (long)e->outpps) >> 2;
91 s->ustats.outpps = (e->outpps+0x1FF)>>10; 131 s->ustats.outpps = (e->outpps + 0x1FF) >> 10;
92 132
93 rate = (n_inbytes - e->last_inbytes)<<4; 133 rate = (n_inbytes - e->last_inbytes) << 4;
94 e->last_inbytes = n_inbytes; 134 e->last_inbytes = n_inbytes;
95 e->inbps += ((long)rate - (long)e->inbps)>>2; 135 e->inbps += ((long)rate - (long)e->inbps) >> 2;
96 s->ustats.inbps = (e->inbps+0xF)>>5; 136 s->ustats.inbps = (e->inbps + 0xF) >> 5;
97 137
98 rate = (n_outbytes - e->last_outbytes)<<4; 138 rate = (n_outbytes - e->last_outbytes) << 4;
99 e->last_outbytes = n_outbytes; 139 e->last_outbytes = n_outbytes;
100 e->outbps += ((long)rate - (long)e->outbps)>>2; 140 e->outbps += ((long)rate - (long)e->outbps) >> 2;
101 s->ustats.outbps = (e->outbps+0xF)>>5; 141 s->ustats.outbps = (e->outbps + 0xF) >> 5;
102 spin_unlock(&s->lock); 142 spin_unlock(&s->lock);
103 } 143 }
104 spin_unlock(&est_lock); 144 spin_unlock(&ipvs->est_lock);
105 mod_timer(&est_timer, jiffies + 2*HZ); 145 mod_timer(&ipvs->est_timer, jiffies + 2*HZ);
106} 146}
107 147
108void ip_vs_new_estimator(struct ip_vs_stats *stats) 148void ip_vs_new_estimator(struct net *net, struct ip_vs_stats *stats)
109{ 149{
150 struct netns_ipvs *ipvs = net_ipvs(net);
110 struct ip_vs_estimator *est = &stats->est; 151 struct ip_vs_estimator *est = &stats->est;
111 152
112 INIT_LIST_HEAD(&est->list); 153 INIT_LIST_HEAD(&est->list);
@@ -126,18 +167,19 @@ void ip_vs_new_estimator(struct ip_vs_stats *stats)
126 est->last_outbytes = stats->ustats.outbytes; 167 est->last_outbytes = stats->ustats.outbytes;
127 est->outbps = stats->ustats.outbps<<5; 168 est->outbps = stats->ustats.outbps<<5;
128 169
129 spin_lock_bh(&est_lock); 170 spin_lock_bh(&ipvs->est_lock);
130 list_add(&est->list, &est_list); 171 list_add(&est->list, &ipvs->est_list);
131 spin_unlock_bh(&est_lock); 172 spin_unlock_bh(&ipvs->est_lock);
132} 173}
133 174
134void ip_vs_kill_estimator(struct ip_vs_stats *stats) 175void ip_vs_kill_estimator(struct net *net, struct ip_vs_stats *stats)
135{ 176{
177 struct netns_ipvs *ipvs = net_ipvs(net);
136 struct ip_vs_estimator *est = &stats->est; 178 struct ip_vs_estimator *est = &stats->est;
137 179
138 spin_lock_bh(&est_lock); 180 spin_lock_bh(&ipvs->est_lock);
139 list_del(&est->list); 181 list_del(&est->list);
140 spin_unlock_bh(&est_lock); 182 spin_unlock_bh(&ipvs->est_lock);
141} 183}
142 184
143void ip_vs_zero_estimator(struct ip_vs_stats *stats) 185void ip_vs_zero_estimator(struct ip_vs_stats *stats)
@@ -157,13 +199,35 @@ void ip_vs_zero_estimator(struct ip_vs_stats *stats)
157 est->outbps = 0; 199 est->outbps = 0;
158} 200}
159 201
160int __init ip_vs_estimator_init(void) 202static int __net_init __ip_vs_estimator_init(struct net *net)
161{ 203{
162 mod_timer(&est_timer, jiffies + 2 * HZ); 204 struct netns_ipvs *ipvs = net_ipvs(net);
205
206 INIT_LIST_HEAD(&ipvs->est_list);
207 spin_lock_init(&ipvs->est_lock);
208 setup_timer(&ipvs->est_timer, estimation_timer, (unsigned long)net);
209 mod_timer(&ipvs->est_timer, jiffies + 2 * HZ);
163 return 0; 210 return 0;
164} 211}
165 212
213static void __net_exit __ip_vs_estimator_exit(struct net *net)
214{
215 del_timer_sync(&net_ipvs(net)->est_timer);
216}
217static struct pernet_operations ip_vs_app_ops = {
218 .init = __ip_vs_estimator_init,
219 .exit = __ip_vs_estimator_exit,
220};
221
222int __init ip_vs_estimator_init(void)
223{
224 int rv;
225
226 rv = register_pernet_subsys(&ip_vs_app_ops);
227 return rv;
228}
229
166void ip_vs_estimator_cleanup(void) 230void ip_vs_estimator_cleanup(void)
167{ 231{
168 del_timer_sync(&est_timer); 232 unregister_pernet_subsys(&ip_vs_app_ops);
169} 233}
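The global 2-second estimation timer becomes one timer per namespace: setup_timer() stores the struct net pointer as the timer cookie, and the pernet exit handler calls del_timer_sync() before the namespace disappears so the callback can never run against freed per-net data. A hedged sketch of that shape, again using net_generic() where the patch uses net_ipvs() (my_* names are illustrative, and the .id/.size registration follows the pernet sketch earlier in this diff):

#include <linux/timer.h>
#include <linux/jiffies.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

static int my_net_id __read_mostly;

struct my_pernet {
	struct timer_list timer;
};

static void my_timer_fn(unsigned long arg)
{
	struct net *net = (struct net *)arg;
	struct my_pernet *pn = net_generic(net, my_net_id);

	/* ... per-net periodic work ... */
	mod_timer(&pn->timer, jiffies + 2 * HZ);	/* re-arm every 2s */
}

static int __net_init my_est_init(struct net *net)
{
	struct my_pernet *pn = net_generic(net, my_net_id);

	setup_timer(&pn->timer, my_timer_fn, (unsigned long)net);
	mod_timer(&pn->timer, jiffies + 2 * HZ);
	return 0;
}

static void __net_exit my_est_exit(struct net *net)
{
	del_timer_sync(&net_generic(net, my_net_id)->timer);
}

static struct pernet_operations my_est_ops = {
	.init = my_est_init,
	.exit = my_est_exit,
	.id   = &my_net_id,
	.size = sizeof(struct my_pernet),
};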
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 75455000ad1c..6b5dd6ddaae9 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -157,6 +157,7 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
157 int ret = 0; 157 int ret = 0;
158 enum ip_conntrack_info ctinfo; 158 enum ip_conntrack_info ctinfo;
159 struct nf_conn *ct; 159 struct nf_conn *ct;
160 struct net *net;
160 161
161#ifdef CONFIG_IP_VS_IPV6 162#ifdef CONFIG_IP_VS_IPV6
162 /* This application helper doesn't work with IPv6 yet, 163 /* This application helper doesn't work with IPv6 yet,
@@ -197,18 +198,20 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
197 */ 198 */
198 { 199 {
199 struct ip_vs_conn_param p; 200 struct ip_vs_conn_param p;
200 ip_vs_conn_fill_param(AF_INET, iph->protocol, 201 ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
201 &from, port, &cp->caddr, 0, &p); 202 iph->protocol, &from, port,
203 &cp->caddr, 0, &p);
202 n_cp = ip_vs_conn_out_get(&p); 204 n_cp = ip_vs_conn_out_get(&p);
203 } 205 }
204 if (!n_cp) { 206 if (!n_cp) {
205 struct ip_vs_conn_param p; 207 struct ip_vs_conn_param p;
206 ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr, 208 ip_vs_conn_fill_param(ip_vs_conn_net(cp),
209 AF_INET, IPPROTO_TCP, &cp->caddr,
207 0, &cp->vaddr, port, &p); 210 0, &cp->vaddr, port, &p);
208 n_cp = ip_vs_conn_new(&p, &from, port, 211 n_cp = ip_vs_conn_new(&p, &from, port,
209 IP_VS_CONN_F_NO_CPORT | 212 IP_VS_CONN_F_NO_CPORT |
210 IP_VS_CONN_F_NFCT, 213 IP_VS_CONN_F_NFCT,
211 cp->dest); 214 cp->dest, skb->mark);
212 if (!n_cp) 215 if (!n_cp)
213 return 0; 216 return 0;
214 217
@@ -257,8 +260,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
257 * would be adjusted twice. 260 * would be adjusted twice.
258 */ 261 */
259 262
263 net = skb_net(skb);
260 cp->app_data = NULL; 264 cp->app_data = NULL;
261 ip_vs_tcp_conn_listen(n_cp); 265 ip_vs_tcp_conn_listen(net, n_cp);
262 ip_vs_conn_put(n_cp); 266 ip_vs_conn_put(n_cp);
263 return ret; 267 return ret;
264 } 268 }
@@ -287,6 +291,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
287 union nf_inet_addr to; 291 union nf_inet_addr to;
288 __be16 port; 292 __be16 port;
289 struct ip_vs_conn *n_cp; 293 struct ip_vs_conn *n_cp;
294 struct net *net;
290 295
291#ifdef CONFIG_IP_VS_IPV6 296#ifdef CONFIG_IP_VS_IPV6
292 /* This application helper doesn't work with IPv6 yet, 297 /* This application helper doesn't work with IPv6 yet,
@@ -358,14 +363,15 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
358 363
359 { 364 {
360 struct ip_vs_conn_param p; 365 struct ip_vs_conn_param p;
361 ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port, 366 ip_vs_conn_fill_param(ip_vs_conn_net(cp), AF_INET,
362 &cp->vaddr, htons(ntohs(cp->vport)-1), 367 iph->protocol, &to, port, &cp->vaddr,
363 &p); 368 htons(ntohs(cp->vport)-1), &p);
364 n_cp = ip_vs_conn_in_get(&p); 369 n_cp = ip_vs_conn_in_get(&p);
365 if (!n_cp) { 370 if (!n_cp) {
366 n_cp = ip_vs_conn_new(&p, &cp->daddr, 371 n_cp = ip_vs_conn_new(&p, &cp->daddr,
367 htons(ntohs(cp->dport)-1), 372 htons(ntohs(cp->dport)-1),
368 IP_VS_CONN_F_NFCT, cp->dest); 373 IP_VS_CONN_F_NFCT, cp->dest,
374 skb->mark);
369 if (!n_cp) 375 if (!n_cp)
370 return 0; 376 return 0;
371 377
@@ -377,7 +383,8 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
377 /* 383 /*
378 * Move tunnel to listen state 384 * Move tunnel to listen state
379 */ 385 */
380 ip_vs_tcp_conn_listen(n_cp); 386 net = skb_net(skb);
387 ip_vs_tcp_conn_listen(net, n_cp);
381 ip_vs_conn_put(n_cp); 388 ip_vs_conn_put(n_cp);
382 389
383 return 1; 390 return 1;
@@ -398,23 +405,22 @@ static struct ip_vs_app ip_vs_ftp = {
398 .pkt_in = ip_vs_ftp_in, 405 .pkt_in = ip_vs_ftp_in,
399}; 406};
400 407
401
402/* 408/*
403 * ip_vs_ftp initialization 409 * per netns ip_vs_ftp initialization
404 */ 410 */
405static int __init ip_vs_ftp_init(void) 411static int __net_init __ip_vs_ftp_init(struct net *net)
406{ 412{
407 int i, ret; 413 int i, ret;
408 struct ip_vs_app *app = &ip_vs_ftp; 414 struct ip_vs_app *app = &ip_vs_ftp;
409 415
410 ret = register_ip_vs_app(app); 416 ret = register_ip_vs_app(net, app);
411 if (ret) 417 if (ret)
412 return ret; 418 return ret;
413 419
414 for (i=0; i<IP_VS_APP_MAX_PORTS; i++) { 420 for (i=0; i<IP_VS_APP_MAX_PORTS; i++) {
415 if (!ports[i]) 421 if (!ports[i])
416 continue; 422 continue;
417 ret = register_ip_vs_app_inc(app, app->protocol, ports[i]); 423 ret = register_ip_vs_app_inc(net, app, app->protocol, ports[i]);
418 if (ret) 424 if (ret)
419 break; 425 break;
420 pr_info("%s: loaded support on port[%d] = %d\n", 426 pr_info("%s: loaded support on port[%d] = %d\n",
@@ -422,18 +428,39 @@ static int __init ip_vs_ftp_init(void)
422 } 428 }
423 429
424 if (ret) 430 if (ret)
425 unregister_ip_vs_app(app); 431 unregister_ip_vs_app(net, app);
426 432
427 return ret; 433 return ret;
428} 434}
435/*
436 * netns exit
437 */
438static void __ip_vs_ftp_exit(struct net *net)
439{
440 struct ip_vs_app *app = &ip_vs_ftp;
441
442 unregister_ip_vs_app(net, app);
443}
444
445static struct pernet_operations ip_vs_ftp_ops = {
446 .init = __ip_vs_ftp_init,
447 .exit = __ip_vs_ftp_exit,
448};
429 449
450int __init ip_vs_ftp_init(void)
451{
452 int rv;
453
454 rv = register_pernet_subsys(&ip_vs_ftp_ops);
455 return rv;
456}
430 457
431/* 458/*
432 * ip_vs_ftp finish. 459 * ip_vs_ftp finish.
433 */ 460 */
434static void __exit ip_vs_ftp_exit(void) 461static void __exit ip_vs_ftp_exit(void)
435{ 462{
436 unregister_ip_vs_app(&ip_vs_ftp); 463 unregister_pernet_subsys(&ip_vs_ftp_ops);
437} 464}
438 465
439 466
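Beyond the pernet conversion, the FTP helper now threads a namespace pointer into every connection lookup (taken from ip_vs_conn_net(cp) or skb_net(skb)) and passes skb->mark to ip_vs_conn_new(). The point of carrying net in the lookup key is that otherwise identical client/virtual tuples in different namespaces must not match each other. A toy model of that key change, with invented names and field layout, follows.

/* Toy model: once the namespace pointer is part of the lookup key,
 * identical address/port tuples in different namespaces no longer match.
 * Types and field names here are illustrative, not the kernel's. */
#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct toy_net { int id; };

struct toy_conn_param {
	const struct toy_net *net;	/* new: namespace is part of the key */
	uint16_t protocol;
	uint32_t caddr, vaddr;
	uint16_t cport, vport;
};

static bool toy_param_equal(const struct toy_conn_param *a,
			    const struct toy_conn_param *b)
{
	return a->net == b->net && a->protocol == b->protocol &&
	       a->caddr == b->caddr && a->cport == b->cport &&
	       a->vaddr == b->vaddr && a->vport == b->vport;
}

int main(void)
{
	struct toy_net ns0 = { 0 }, ns1 = { 1 };
	struct toy_conn_param a = { &ns0, 6, 0x0a000001, 0x0a000002, 1024, 21 };
	struct toy_conn_param b = a;

	b.net = &ns1;			/* same tuple, different namespace */
	printf("same netns match: %d\n", toy_param_equal(&a, &a));
	printf("cross netns match: %d\n", toy_param_equal(&a, &b));
	return 0;
}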
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 9323f8944199..d5bec3371871 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -70,7 +70,6 @@
70 * entries that haven't been touched for a day. 70 * entries that haven't been touched for a day.
71 */ 71 */
72#define COUNT_FOR_FULL_EXPIRATION 30 72#define COUNT_FOR_FULL_EXPIRATION 30
73static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ;
74 73
75 74
76/* 75/*
@@ -117,7 +116,7 @@ struct ip_vs_lblc_table {
117static ctl_table vs_vars_table[] = { 116static ctl_table vs_vars_table[] = {
118 { 117 {
119 .procname = "lblc_expiration", 118 .procname = "lblc_expiration",
120 .data = &sysctl_ip_vs_lblc_expiration, 119 .data = NULL,
121 .maxlen = sizeof(int), 120 .maxlen = sizeof(int),
122 .mode = 0644, 121 .mode = 0644,
123 .proc_handler = proc_dointvec_jiffies, 122 .proc_handler = proc_dointvec_jiffies,
@@ -125,8 +124,6 @@ static ctl_table vs_vars_table[] = {
125 { } 124 { }
126}; 125};
127 126
128static struct ctl_table_header * sysctl_header;
129
130static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) 127static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en)
131{ 128{
132 list_del(&en->list); 129 list_del(&en->list);
@@ -248,6 +245,7 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
248 struct ip_vs_lblc_entry *en, *nxt; 245 struct ip_vs_lblc_entry *en, *nxt;
249 unsigned long now = jiffies; 246 unsigned long now = jiffies;
250 int i, j; 247 int i, j;
248 struct netns_ipvs *ipvs = net_ipvs(svc->net);
251 249
252 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) { 250 for (i=0, j=tbl->rover; i<IP_VS_LBLC_TAB_SIZE; i++) {
253 j = (j + 1) & IP_VS_LBLC_TAB_MASK; 251 j = (j + 1) & IP_VS_LBLC_TAB_MASK;
@@ -255,7 +253,8 @@ static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
255 write_lock(&svc->sched_lock); 253 write_lock(&svc->sched_lock);
256 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 254 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
257 if (time_before(now, 255 if (time_before(now,
258 en->lastuse + sysctl_ip_vs_lblc_expiration)) 256 en->lastuse +
257 ipvs->sysctl_lblc_expiration))
259 continue; 258 continue;
260 259
261 ip_vs_lblc_free(en); 260 ip_vs_lblc_free(en);
@@ -543,23 +542,73 @@ static struct ip_vs_scheduler ip_vs_lblc_scheduler =
543 .schedule = ip_vs_lblc_schedule, 542 .schedule = ip_vs_lblc_schedule,
544}; 543};
545 544
545/*
546 * per netns init.
547 */
548static int __net_init __ip_vs_lblc_init(struct net *net)
549{
550 struct netns_ipvs *ipvs = net_ipvs(net);
551
552 if (!net_eq(net, &init_net)) {
553 ipvs->lblc_ctl_table = kmemdup(vs_vars_table,
554 sizeof(vs_vars_table),
555 GFP_KERNEL);
556 if (ipvs->lblc_ctl_table == NULL)
557 goto err_dup;
558 } else
559 ipvs->lblc_ctl_table = vs_vars_table;
560 ipvs->sysctl_lblc_expiration = 24*60*60*HZ;
561 ipvs->lblc_ctl_table[0].data = &ipvs->sysctl_lblc_expiration;
562
563 ipvs->lblc_ctl_header =
564 register_net_sysctl_table(net, net_vs_ctl_path,
565 ipvs->lblc_ctl_table);
566 if (!ipvs->lblc_ctl_header)
567 goto err_reg;
568
569 return 0;
570
571err_reg:
572 if (!net_eq(net, &init_net))
573 kfree(ipvs->lblc_ctl_table);
574
575err_dup:
576 return -ENOMEM;
577}
578
579static void __net_exit __ip_vs_lblc_exit(struct net *net)
580{
581 struct netns_ipvs *ipvs = net_ipvs(net);
582
583 unregister_net_sysctl_table(ipvs->lblc_ctl_header);
584
585 if (!net_eq(net, &init_net))
586 kfree(ipvs->lblc_ctl_table);
587}
588
589static struct pernet_operations ip_vs_lblc_ops = {
590 .init = __ip_vs_lblc_init,
591 .exit = __ip_vs_lblc_exit,
592};
546 593
547static int __init ip_vs_lblc_init(void) 594static int __init ip_vs_lblc_init(void)
548{ 595{
549 int ret; 596 int ret;
550 597
551 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); 598 ret = register_pernet_subsys(&ip_vs_lblc_ops);
599 if (ret)
600 return ret;
601
552 ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler); 602 ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
553 if (ret) 603 if (ret)
554 unregister_sysctl_table(sysctl_header); 604 unregister_pernet_subsys(&ip_vs_lblc_ops);
555 return ret; 605 return ret;
556} 606}
557 607
558
559static void __exit ip_vs_lblc_cleanup(void) 608static void __exit ip_vs_lblc_cleanup(void)
560{ 609{
561 unregister_sysctl_table(sysctl_header);
562 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); 610 unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler);
611 unregister_pernet_subsys(&ip_vs_lblc_ops);
563} 612}
564 613
565 614
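The lblc changes above illustrate the per-netns sysctl pattern: the global sysctl_ip_vs_lblc_expiration disappears, each child namespace gets a kmemdup'ed copy of vs_vars_table with .data repointed at its own netns_ipvs field, and init_net keeps the static table. The standalone sketch below models only that duplicate-and-repoint step; the struct and helper names are made up, and the value is kept in plain seconds rather than jiffies.

/* Userspace sketch of the duplicate-and-repoint sysctl pattern; the
 * ctl layout and helpers are stand-ins, not kernel API. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_ctl {
	const char *procname;
	int *data;			/* points at the variable behind the knob */
};

struct toy_ipvs {
	int lblc_expiration;		/* per-namespace value, was a global */
	struct toy_ctl *tbl;
};

static struct toy_ctl template_tbl[] = {
	{ "lblc_expiration", NULL },	/* .data filled in per namespace */
	{ NULL, NULL },
};

static int toy_lblc_init(struct toy_ipvs *ipvs, int is_init_ns)
{
	if (!is_init_ns) {
		/* kmemdup() equivalent: child namespaces get their own copy */
		ipvs->tbl = malloc(sizeof(template_tbl));
		if (!ipvs->tbl)
			return -1;
		memcpy(ipvs->tbl, template_tbl, sizeof(template_tbl));
	} else {
		ipvs->tbl = template_tbl;	/* init_net keeps the static table */
	}
	ipvs->lblc_expiration = 24 * 60 * 60;	/* default, in seconds here */
	ipvs->tbl[0].data = &ipvs->lblc_expiration;
	return 0;
}

int main(void)
{
	struct toy_ipvs a, b;

	if (toy_lblc_init(&a, 1) || toy_lblc_init(&b, 0))
		return 1;
	*b.tbl[0].data = 60;		/* writing one knob leaves the other alone */
	printf("init_net=%d child=%d\n", a.lblc_expiration, b.lblc_expiration);
	if (b.tbl != template_tbl)
		free(b.tbl);
	return 0;
}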
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index dbeed8ea421a..61ae8cfcf0b4 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -70,8 +70,6 @@
70 * entries that haven't been touched for a day. 70 * entries that haven't been touched for a day.
71 */ 71 */
72#define COUNT_FOR_FULL_EXPIRATION 30 72#define COUNT_FOR_FULL_EXPIRATION 30
73static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ;
74
75 73
76/* 74/*
77 * for IPVS lblcr entry hash table 75 * for IPVS lblcr entry hash table
@@ -296,7 +294,7 @@ struct ip_vs_lblcr_table {
296static ctl_table vs_vars_table[] = { 294static ctl_table vs_vars_table[] = {
297 { 295 {
298 .procname = "lblcr_expiration", 296 .procname = "lblcr_expiration",
299 .data = &sysctl_ip_vs_lblcr_expiration, 297 .data = NULL,
300 .maxlen = sizeof(int), 298 .maxlen = sizeof(int),
301 .mode = 0644, 299 .mode = 0644,
302 .proc_handler = proc_dointvec_jiffies, 300 .proc_handler = proc_dointvec_jiffies,
@@ -304,8 +302,6 @@ static ctl_table vs_vars_table[] = {
304 { } 302 { }
305}; 303};
306 304
307static struct ctl_table_header * sysctl_header;
308
309static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) 305static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en)
310{ 306{
311 list_del(&en->list); 307 list_del(&en->list);
@@ -425,14 +421,15 @@ static inline void ip_vs_lblcr_full_check(struct ip_vs_service *svc)
425 unsigned long now = jiffies; 421 unsigned long now = jiffies;
426 int i, j; 422 int i, j;
427 struct ip_vs_lblcr_entry *en, *nxt; 423 struct ip_vs_lblcr_entry *en, *nxt;
424 struct netns_ipvs *ipvs = net_ipvs(svc->net);
428 425
429 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) { 426 for (i=0, j=tbl->rover; i<IP_VS_LBLCR_TAB_SIZE; i++) {
430 j = (j + 1) & IP_VS_LBLCR_TAB_MASK; 427 j = (j + 1) & IP_VS_LBLCR_TAB_MASK;
431 428
432 write_lock(&svc->sched_lock); 429 write_lock(&svc->sched_lock);
433 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { 430 list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) {
434 if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, 431 if (time_after(en->lastuse
435 now)) 432 + ipvs->sysctl_lblcr_expiration, now))
436 continue; 433 continue;
437 434
438 ip_vs_lblcr_free(en); 435 ip_vs_lblcr_free(en);
@@ -664,6 +661,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
664 read_lock(&svc->sched_lock); 661 read_lock(&svc->sched_lock);
665 en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr); 662 en = ip_vs_lblcr_get(svc->af, tbl, &iph.daddr);
666 if (en) { 663 if (en) {
664 struct netns_ipvs *ipvs = net_ipvs(svc->net);
667 /* We only hold a read lock, but this is atomic */ 665 /* We only hold a read lock, but this is atomic */
668 en->lastuse = jiffies; 666 en->lastuse = jiffies;
669 667
@@ -675,7 +673,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
675 /* More than one destination + enough time passed by, cleanup */ 673 /* More than one destination + enough time passed by, cleanup */
676 if (atomic_read(&en->set.size) > 1 && 674 if (atomic_read(&en->set.size) > 1 &&
677 time_after(jiffies, en->set.lastmod + 675 time_after(jiffies, en->set.lastmod +
678 sysctl_ip_vs_lblcr_expiration)) { 676 ipvs->sysctl_lblcr_expiration)) {
679 struct ip_vs_dest *m; 677 struct ip_vs_dest *m;
680 678
681 write_lock(&en->set.lock); 679 write_lock(&en->set.lock);
@@ -744,23 +742,73 @@ static struct ip_vs_scheduler ip_vs_lblcr_scheduler =
744 .schedule = ip_vs_lblcr_schedule, 742 .schedule = ip_vs_lblcr_schedule,
745}; 743};
746 744
745/*
746 * per netns init.
747 */
748static int __net_init __ip_vs_lblcr_init(struct net *net)
749{
750 struct netns_ipvs *ipvs = net_ipvs(net);
751
752 if (!net_eq(net, &init_net)) {
753 ipvs->lblcr_ctl_table = kmemdup(vs_vars_table,
754 sizeof(vs_vars_table),
755 GFP_KERNEL);
756 if (ipvs->lblcr_ctl_table == NULL)
757 goto err_dup;
758 } else
759 ipvs->lblcr_ctl_table = vs_vars_table;
760 ipvs->sysctl_lblcr_expiration = 24*60*60*HZ;
761 ipvs->lblcr_ctl_table[0].data = &ipvs->sysctl_lblcr_expiration;
762
763 ipvs->lblcr_ctl_header =
764 register_net_sysctl_table(net, net_vs_ctl_path,
765 ipvs->lblcr_ctl_table);
766 if (!ipvs->lblcr_ctl_header)
767 goto err_reg;
768
769 return 0;
770
771err_reg:
772 if (!net_eq(net, &init_net))
773 kfree(ipvs->lblcr_ctl_table);
774
775err_dup:
776 return -ENOMEM;
777}
778
779static void __net_exit __ip_vs_lblcr_exit(struct net *net)
780{
781 struct netns_ipvs *ipvs = net_ipvs(net);
782
783 unregister_net_sysctl_table(ipvs->lblcr_ctl_header);
784
785 if (!net_eq(net, &init_net))
786 kfree(ipvs->lblcr_ctl_table);
787}
788
789static struct pernet_operations ip_vs_lblcr_ops = {
790 .init = __ip_vs_lblcr_init,
791 .exit = __ip_vs_lblcr_exit,
792};
747 793
748static int __init ip_vs_lblcr_init(void) 794static int __init ip_vs_lblcr_init(void)
749{ 795{
750 int ret; 796 int ret;
751 797
752 sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars_table); 798 ret = register_pernet_subsys(&ip_vs_lblcr_ops);
799 if (ret)
800 return ret;
801
753 ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); 802 ret = register_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
754 if (ret) 803 if (ret)
755 unregister_sysctl_table(sysctl_header); 804 unregister_pernet_subsys(&ip_vs_lblcr_ops);
756 return ret; 805 return ret;
757} 806}
758 807
759
760static void __exit ip_vs_lblcr_cleanup(void) 808static void __exit ip_vs_lblcr_cleanup(void)
761{ 809{
762 unregister_sysctl_table(sysctl_header);
763 unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); 810 unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler);
811 unregister_pernet_subsys(&ip_vs_lblcr_ops);
764} 812}
765 813
766 814
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 4680647cd450..f454c80df0a7 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -141,6 +141,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
141 struct nf_conntrack_tuple *orig, new_reply; 141 struct nf_conntrack_tuple *orig, new_reply;
142 struct ip_vs_conn *cp; 142 struct ip_vs_conn *cp;
143 struct ip_vs_conn_param p; 143 struct ip_vs_conn_param p;
144 struct net *net = nf_ct_net(ct);
144 145
145 if (exp->tuple.src.l3num != PF_INET) 146 if (exp->tuple.src.l3num != PF_INET)
146 return; 147 return;
@@ -155,7 +156,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
155 156
156 /* RS->CLIENT */ 157 /* RS->CLIENT */
157 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; 158 orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
158 ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum, 159 ip_vs_conn_fill_param(net, exp->tuple.src.l3num, orig->dst.protonum,
159 &orig->src.u3, orig->src.u.tcp.port, 160 &orig->src.u3, orig->src.u.tcp.port,
160 &orig->dst.u3, orig->dst.u.tcp.port, &p); 161 &orig->dst.u3, orig->dst.u.tcp.port, &p);
161 cp = ip_vs_conn_out_get(&p); 162 cp = ip_vs_conn_out_get(&p);
@@ -268,7 +269,8 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
268 " for conn " FMT_CONN "\n", 269 " for conn " FMT_CONN "\n",
269 __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); 270 __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
270 271
271 h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple); 272 h = nf_conntrack_find_get(ip_vs_conn_net(cp), NF_CT_DEFAULT_ZONE,
273 &tuple);
272 if (h) { 274 if (h) {
273 ct = nf_ct_tuplehash_to_ctrack(h); 275 ct = nf_ct_tuplehash_to_ctrack(h);
274 /* Show what happens instead of calling nf_ct_kill() */ 276 /* Show what happens instead of calling nf_ct_kill() */
diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c
index 3414af70ee12..5cf859ccb31b 100644
--- a/net/netfilter/ipvs/ip_vs_pe.c
+++ b/net/netfilter/ipvs/ip_vs_pe.c
@@ -29,12 +29,11 @@ void ip_vs_unbind_pe(struct ip_vs_service *svc)
29} 29}
30 30
31/* Get pe in the pe list by name */ 31/* Get pe in the pe list by name */
32static struct ip_vs_pe * 32struct ip_vs_pe *__ip_vs_pe_getbyname(const char *pe_name)
33ip_vs_pe_getbyname(const char *pe_name)
34{ 33{
35 struct ip_vs_pe *pe; 34 struct ip_vs_pe *pe;
36 35
37 IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__, 36 IP_VS_DBG(10, "%s(): pe_name \"%s\"\n", __func__,
38 pe_name); 37 pe_name);
39 38
40 spin_lock_bh(&ip_vs_pe_lock); 39 spin_lock_bh(&ip_vs_pe_lock);
@@ -60,28 +59,22 @@ ip_vs_pe_getbyname(const char *pe_name)
60} 59}
61 60
62/* Lookup pe and try to load it if it doesn't exist */ 61/* Lookup pe and try to load it if it doesn't exist */
63struct ip_vs_pe *ip_vs_pe_get(const char *name) 62struct ip_vs_pe *ip_vs_pe_getbyname(const char *name)
64{ 63{
65 struct ip_vs_pe *pe; 64 struct ip_vs_pe *pe;
66 65
67 /* Search for the pe by name */ 66 /* Search for the pe by name */
68 pe = ip_vs_pe_getbyname(name); 67 pe = __ip_vs_pe_getbyname(name);
69 68
70 /* If pe not found, load the module and search again */ 69 /* If pe not found, load the module and search again */
71 if (!pe) { 70 if (!pe) {
72 request_module("ip_vs_pe_%s", name); 71 request_module("ip_vs_pe_%s", name);
73 pe = ip_vs_pe_getbyname(name); 72 pe = __ip_vs_pe_getbyname(name);
74 } 73 }
75 74
76 return pe; 75 return pe;
77} 76}
78 77
79void ip_vs_pe_put(struct ip_vs_pe *pe)
80{
81 if (pe && pe->module)
82 module_put(pe->module);
83}
84
85/* Register a pe in the pe list */ 78/* Register a pe in the pe list */
86int register_ip_vs_pe(struct ip_vs_pe *pe) 79int register_ip_vs_pe(struct ip_vs_pe *pe)
87{ 80{
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index b8b4e9620f3e..0d83bc01fed4 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -71,6 +71,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
71 struct ip_vs_iphdr iph; 71 struct ip_vs_iphdr iph;
72 unsigned int dataoff, datalen, matchoff, matchlen; 72 unsigned int dataoff, datalen, matchoff, matchlen;
73 const char *dptr; 73 const char *dptr;
74 int retc;
74 75
75 ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph); 76 ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph);
76 77
@@ -83,6 +84,8 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
83 if (dataoff >= skb->len) 84 if (dataoff >= skb->len)
84 return -EINVAL; 85 return -EINVAL;
85 86
87 if ((retc = skb_linearize(skb)) < 0)

88 return retc;
86 dptr = skb->data + dataoff; 89 dptr = skb->data + dataoff;
87 datalen = skb->len - dataoff; 90 datalen = skb->len - dataoff;
88 91
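The two added lines in ip_vs_sip_fill_param() linearize the skb before taking a flat pointer at skb->data + dataoff, since the SIP payload may sit in paged fragments, and bail out if linearization fails. The userspace analogy below shows why a flat parser needs that coalescing step first; it uses an invented frag structure, not the kernel's skb API.

/* Why linearize first: a flat parser (here strstr) needs the payload in
 * one contiguous buffer, but packet data may be spread over fragments. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct frag { const char *data; size_t len; };

/* crude model of skb_linearize(): coalesce fragments into one buffer */
static char *linearize(const struct frag *frags, int n, size_t *out_len)
{
	size_t total = 0, off = 0;
	int i;
	char *buf;

	for (i = 0; i < n; i++)
		total += frags[i].len;
	buf = malloc(total + 1);
	if (!buf)
		return NULL;		/* caller must bail out, as the diff does */
	for (i = 0; i < n; i++) {
		memcpy(buf + off, frags[i].data, frags[i].len);
		off += frags[i].len;
	}
	buf[total] = '\0';
	*out_len = total;
	return buf;
}

int main(void)
{
	struct frag sip[] = { { "Call-ID: abc", 12 }, { "123@host\r\n", 10 } };
	size_t len;
	char *flat = linearize(sip, 2, &len);

	if (!flat)
		return 1;
	printf("found Call-ID at offset %td\n", strstr(flat, "Call-ID:") - flat);
	free(flat);
	return 0;
}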
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index c53998390877..6ac986cdcff3 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -60,6 +60,31 @@ static int __used __init register_ip_vs_protocol(struct ip_vs_protocol *pp)
60 return 0; 60 return 0;
61} 61}
62 62
63/*
64 * register an ipvs protocol's netns related data
65 */
66static int
67register_ip_vs_proto_netns(struct net *net, struct ip_vs_protocol *pp)
68{
69 struct netns_ipvs *ipvs = net_ipvs(net);
70 unsigned hash = IP_VS_PROTO_HASH(pp->protocol);
71 struct ip_vs_proto_data *pd =
72 kzalloc(sizeof(struct ip_vs_proto_data), GFP_ATOMIC);
73
74 if (!pd) {
75 pr_err("%s(): no memory.\n", __func__);
76 return -ENOMEM;
77 }
78 pd->pp = pp; /* For speed issues */
79 pd->next = ipvs->proto_data_table[hash];
80 ipvs->proto_data_table[hash] = pd;
81 atomic_set(&pd->appcnt, 0); /* Init app counter */
82
83 if (pp->init_netns != NULL)
84 pp->init_netns(net, pd);
85
86 return 0;
87}
63 88
64/* 89/*
65 * unregister an ipvs protocol 90 * unregister an ipvs protocol
@@ -82,6 +107,29 @@ static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp)
82 return -ESRCH; 107 return -ESRCH;
83} 108}
84 109
110/*
111 * unregister an ipvs protocol's netns data
112 */
113static int
114unregister_ip_vs_proto_netns(struct net *net, struct ip_vs_proto_data *pd)
115{
116 struct netns_ipvs *ipvs = net_ipvs(net);
117 struct ip_vs_proto_data **pd_p;
118 unsigned hash = IP_VS_PROTO_HASH(pd->pp->protocol);
119
120 pd_p = &ipvs->proto_data_table[hash];
121 for (; *pd_p; pd_p = &(*pd_p)->next) {
122 if (*pd_p == pd) {
123 *pd_p = pd->next;
124 if (pd->pp->exit_netns != NULL)
125 pd->pp->exit_netns(net, pd);
126 kfree(pd);
127 return 0;
128 }
129 }
130
131 return -ESRCH;
132}
85 133
86/* 134/*
87 * get ip_vs_protocol object by its proto. 135 * get ip_vs_protocol object by its proto.
@@ -100,19 +148,44 @@ struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto)
100} 148}
101EXPORT_SYMBOL(ip_vs_proto_get); 149EXPORT_SYMBOL(ip_vs_proto_get);
102 150
151/*
152 * get ip_vs_protocol object data by netns and proto
153 */
154struct ip_vs_proto_data *
155__ipvs_proto_data_get(struct netns_ipvs *ipvs, unsigned short proto)
156{
157 struct ip_vs_proto_data *pd;
158 unsigned hash = IP_VS_PROTO_HASH(proto);
159
160 for (pd = ipvs->proto_data_table[hash]; pd; pd = pd->next) {
161 if (pd->pp->protocol == proto)
162 return pd;
163 }
164
165 return NULL;
166}
167
168struct ip_vs_proto_data *
169ip_vs_proto_data_get(struct net *net, unsigned short proto)
170{
171 struct netns_ipvs *ipvs = net_ipvs(net);
172
173 return __ipvs_proto_data_get(ipvs, proto);
174}
175EXPORT_SYMBOL(ip_vs_proto_data_get);
103 176
104/* 177/*
105 * Propagate event for state change to all protocols 178 * Propagate event for state change to all protocols
106 */ 179 */
107void ip_vs_protocol_timeout_change(int flags) 180void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags)
108{ 181{
109 struct ip_vs_protocol *pp; 182 struct ip_vs_proto_data *pd;
110 int i; 183 int i;
111 184
112 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { 185 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
113 for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { 186 for (pd = ipvs->proto_data_table[i]; pd; pd = pd->next) {
114 if (pp->timeout_change) 187 if (pd->pp->timeout_change)
115 pp->timeout_change(pp, flags); 188 pd->pp->timeout_change(pd, flags);
116 } 189 }
117 } 190 }
118} 191}
@@ -236,6 +309,46 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp,
236 ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg); 309 ip_vs_tcpudp_debug_packet_v4(pp, skb, offset, msg);
237} 310}
238 311
312/*
313 * per network name-space init
314 */
315static int __net_init __ip_vs_protocol_init(struct net *net)
316{
317#ifdef CONFIG_IP_VS_PROTO_TCP
318 register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp);
319#endif
320#ifdef CONFIG_IP_VS_PROTO_UDP
321 register_ip_vs_proto_netns(net, &ip_vs_protocol_udp);
322#endif
323#ifdef CONFIG_IP_VS_PROTO_SCTP
324 register_ip_vs_proto_netns(net, &ip_vs_protocol_sctp);
325#endif
326#ifdef CONFIG_IP_VS_PROTO_AH
327 register_ip_vs_proto_netns(net, &ip_vs_protocol_ah);
328#endif
329#ifdef CONFIG_IP_VS_PROTO_ESP
330 register_ip_vs_proto_netns(net, &ip_vs_protocol_esp);
331#endif
332 return 0;
333}
334
335static void __net_exit __ip_vs_protocol_cleanup(struct net *net)
336{
337 struct netns_ipvs *ipvs = net_ipvs(net);
338 struct ip_vs_proto_data *pd;
339 int i;
340
341 /* unregister all the ipvs proto data for this netns */
342 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
343 while ((pd = ipvs->proto_data_table[i]) != NULL)
344 unregister_ip_vs_proto_netns(net, pd);
345 }
346}
347
348static struct pernet_operations ipvs_proto_ops = {
349 .init = __ip_vs_protocol_init,
350 .exit = __ip_vs_protocol_cleanup,
351};
239 352
240int __init ip_vs_protocol_init(void) 353int __init ip_vs_protocol_init(void)
241{ 354{
@@ -265,6 +378,7 @@ int __init ip_vs_protocol_init(void)
265 REGISTER_PROTOCOL(&ip_vs_protocol_esp); 378 REGISTER_PROTOCOL(&ip_vs_protocol_esp);
266#endif 379#endif
267 pr_info("Registered protocols (%s)\n", &protocols[2]); 380 pr_info("Registered protocols (%s)\n", &protocols[2]);
381 return register_pernet_subsys(&ipvs_proto_ops);
268 382
269 return 0; 383 return 0;
270} 384}
@@ -275,6 +389,7 @@ void ip_vs_protocol_cleanup(void)
275 struct ip_vs_protocol *pp; 389 struct ip_vs_protocol *pp;
276 int i; 390 int i;
277 391
392 unregister_pernet_subsys(&ipvs_proto_ops);
278 /* unregister all the ipvs protocols */ 393 /* unregister all the ipvs protocols */
279 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { 394 for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) {
280 while ((pp = ip_vs_proto_table[i]) != NULL) 395 while ((pp = ip_vs_proto_table[i]) != NULL)
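The ip_vs_proto.c hunks keep the protocol definitions (ip_vs_protocol) global but hang a per-namespace ip_vs_proto_data entry, holding the pp pointer, the app counter and (elsewhere in the series) the timeout table, off a small hash in netns_ipvs indexed by protocol number. A minimal model of that lookup path, with invented toy_* names, follows.

/* Minimal model of the per-netns proto_data lookup: the protocol object
 * is shared, the per-namespace data hangs off a small hash keyed by the
 * protocol number. All names here are illustrative. */
#include <stdio.h>

#define TOY_PROTO_TAB_SIZE 32
#define toy_proto_hash(p) ((p) & (TOY_PROTO_TAB_SIZE - 1))

struct toy_protocol { const char *name; unsigned short protocol; };

struct toy_proto_data {
	struct toy_protocol *pp;	/* shared protocol definition */
	int appcnt;			/* per-namespace counter */
	struct toy_proto_data *next;	/* hash chain */
};

struct toy_ipvs { struct toy_proto_data *tab[TOY_PROTO_TAB_SIZE]; };

static struct toy_proto_data *
toy_proto_data_get(struct toy_ipvs *ipvs, unsigned short proto)
{
	struct toy_proto_data *pd;

	for (pd = ipvs->tab[toy_proto_hash(proto)]; pd; pd = pd->next)
		if (pd->pp->protocol == proto)
			return pd;
	return NULL;
}

int main(void)
{
	struct toy_protocol tcp = { "TCP", 6 };
	struct toy_proto_data pd = { &tcp, 0, NULL };
	static struct toy_ipvs ipvs;	/* zero-initialized hash */
	struct toy_proto_data *hit;

	ipvs.tab[toy_proto_hash(6)] = &pd;	/* register for this netns */
	hit = toy_proto_data_get(&ipvs, 6);
	printf("%s appcnt=%d\n", hit ? hit->pp->name : "none",
	       hit ? hit->appcnt : -1);
	return 0;
}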
diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
index 3a0461117d3f..5b8eb8b12c3e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c
@@ -41,28 +41,30 @@ struct isakmp_hdr {
41#define PORT_ISAKMP 500 41#define PORT_ISAKMP 500
42 42
43static void 43static void
44ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph, 44ah_esp_conn_fill_param_proto(struct net *net, int af,
45 int inverse, struct ip_vs_conn_param *p) 45 const struct ip_vs_iphdr *iph, int inverse,
46 struct ip_vs_conn_param *p)
46{ 47{
47 if (likely(!inverse)) 48 if (likely(!inverse))
48 ip_vs_conn_fill_param(af, IPPROTO_UDP, 49 ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
49 &iph->saddr, htons(PORT_ISAKMP), 50 &iph->saddr, htons(PORT_ISAKMP),
50 &iph->daddr, htons(PORT_ISAKMP), p); 51 &iph->daddr, htons(PORT_ISAKMP), p);
51 else 52 else
52 ip_vs_conn_fill_param(af, IPPROTO_UDP, 53 ip_vs_conn_fill_param(net, af, IPPROTO_UDP,
53 &iph->daddr, htons(PORT_ISAKMP), 54 &iph->daddr, htons(PORT_ISAKMP),
54 &iph->saddr, htons(PORT_ISAKMP), p); 55 &iph->saddr, htons(PORT_ISAKMP), p);
55} 56}
56 57
57static struct ip_vs_conn * 58static struct ip_vs_conn *
58ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, 59ah_esp_conn_in_get(int af, const struct sk_buff *skb,
59 const struct ip_vs_iphdr *iph, unsigned int proto_off, 60 const struct ip_vs_iphdr *iph, unsigned int proto_off,
60 int inverse) 61 int inverse)
61{ 62{
62 struct ip_vs_conn *cp; 63 struct ip_vs_conn *cp;
63 struct ip_vs_conn_param p; 64 struct ip_vs_conn_param p;
65 struct net *net = skb_net(skb);
64 66
65 ah_esp_conn_fill_param_proto(af, iph, inverse, &p); 67 ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
66 cp = ip_vs_conn_in_get(&p); 68 cp = ip_vs_conn_in_get(&p);
67 if (!cp) { 69 if (!cp) {
68 /* 70 /*
@@ -72,7 +74,7 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
72 IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet " 74 IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for outin packet "
73 "%s%s %s->%s\n", 75 "%s%s %s->%s\n",
74 inverse ? "ICMP+" : "", 76 inverse ? "ICMP+" : "",
75 pp->name, 77 ip_vs_proto_get(iph->protocol)->name,
76 IP_VS_DBG_ADDR(af, &iph->saddr), 78 IP_VS_DBG_ADDR(af, &iph->saddr),
77 IP_VS_DBG_ADDR(af, &iph->daddr)); 79 IP_VS_DBG_ADDR(af, &iph->daddr));
78 } 80 }
@@ -83,21 +85,21 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp,
83 85
84static struct ip_vs_conn * 86static struct ip_vs_conn *
85ah_esp_conn_out_get(int af, const struct sk_buff *skb, 87ah_esp_conn_out_get(int af, const struct sk_buff *skb,
86 struct ip_vs_protocol *pp,
87 const struct ip_vs_iphdr *iph, 88 const struct ip_vs_iphdr *iph,
88 unsigned int proto_off, 89 unsigned int proto_off,
89 int inverse) 90 int inverse)
90{ 91{
91 struct ip_vs_conn *cp; 92 struct ip_vs_conn *cp;
92 struct ip_vs_conn_param p; 93 struct ip_vs_conn_param p;
94 struct net *net = skb_net(skb);
93 95
94 ah_esp_conn_fill_param_proto(af, iph, inverse, &p); 96 ah_esp_conn_fill_param_proto(net, af, iph, inverse, &p);
95 cp = ip_vs_conn_out_get(&p); 97 cp = ip_vs_conn_out_get(&p);
96 if (!cp) { 98 if (!cp) {
97 IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " 99 IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet "
98 "%s%s %s->%s\n", 100 "%s%s %s->%s\n",
99 inverse ? "ICMP+" : "", 101 inverse ? "ICMP+" : "",
100 pp->name, 102 ip_vs_proto_get(iph->protocol)->name,
101 IP_VS_DBG_ADDR(af, &iph->saddr), 103 IP_VS_DBG_ADDR(af, &iph->saddr),
102 IP_VS_DBG_ADDR(af, &iph->daddr)); 104 IP_VS_DBG_ADDR(af, &iph->daddr));
103 } 105 }
@@ -107,7 +109,7 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb,
107 109
108 110
109static int 111static int
110ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, 112ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
111 int *verdict, struct ip_vs_conn **cpp) 113 int *verdict, struct ip_vs_conn **cpp)
112{ 114{
113 /* 115 /*
@@ -117,26 +119,14 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
117 return 0; 119 return 0;
118} 120}
119 121
120static void ah_esp_init(struct ip_vs_protocol *pp)
121{
122 /* nothing to do now */
123}
124
125
126static void ah_esp_exit(struct ip_vs_protocol *pp)
127{
128 /* nothing to do now */
129}
130
131
132#ifdef CONFIG_IP_VS_PROTO_AH 122#ifdef CONFIG_IP_VS_PROTO_AH
133struct ip_vs_protocol ip_vs_protocol_ah = { 123struct ip_vs_protocol ip_vs_protocol_ah = {
134 .name = "AH", 124 .name = "AH",
135 .protocol = IPPROTO_AH, 125 .protocol = IPPROTO_AH,
136 .num_states = 1, 126 .num_states = 1,
137 .dont_defrag = 1, 127 .dont_defrag = 1,
138 .init = ah_esp_init, 128 .init = NULL,
139 .exit = ah_esp_exit, 129 .exit = NULL,
140 .conn_schedule = ah_esp_conn_schedule, 130 .conn_schedule = ah_esp_conn_schedule,
141 .conn_in_get = ah_esp_conn_in_get, 131 .conn_in_get = ah_esp_conn_in_get,
142 .conn_out_get = ah_esp_conn_out_get, 132 .conn_out_get = ah_esp_conn_out_get,
@@ -149,7 +139,6 @@ struct ip_vs_protocol ip_vs_protocol_ah = {
149 .app_conn_bind = NULL, 139 .app_conn_bind = NULL,
150 .debug_packet = ip_vs_tcpudp_debug_packet, 140 .debug_packet = ip_vs_tcpudp_debug_packet,
151 .timeout_change = NULL, /* ISAKMP */ 141 .timeout_change = NULL, /* ISAKMP */
152 .set_state_timeout = NULL,
153}; 142};
154#endif 143#endif
155 144
@@ -159,8 +148,8 @@ struct ip_vs_protocol ip_vs_protocol_esp = {
159 .protocol = IPPROTO_ESP, 148 .protocol = IPPROTO_ESP,
160 .num_states = 1, 149 .num_states = 1,
161 .dont_defrag = 1, 150 .dont_defrag = 1,
162 .init = ah_esp_init, 151 .init = NULL,
163 .exit = ah_esp_exit, 152 .exit = NULL,
164 .conn_schedule = ah_esp_conn_schedule, 153 .conn_schedule = ah_esp_conn_schedule,
165 .conn_in_get = ah_esp_conn_in_get, 154 .conn_in_get = ah_esp_conn_in_get,
166 .conn_out_get = ah_esp_conn_out_get, 155 .conn_out_get = ah_esp_conn_out_get,
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 1ea96bcd342b..fb2d04ac5d4e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -9,9 +9,10 @@
9#include <net/ip_vs.h> 9#include <net/ip_vs.h>
10 10
11static int 11static int
12sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, 12sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
13 int *verdict, struct ip_vs_conn **cpp) 13 int *verdict, struct ip_vs_conn **cpp)
14{ 14{
15 struct net *net;
15 struct ip_vs_service *svc; 16 struct ip_vs_service *svc;
16 sctp_chunkhdr_t _schunkh, *sch; 17 sctp_chunkhdr_t _schunkh, *sch;
17 sctp_sctphdr_t *sh, _sctph; 18 sctp_sctphdr_t *sh, _sctph;
@@ -27,13 +28,13 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
27 sizeof(_schunkh), &_schunkh); 28 sizeof(_schunkh), &_schunkh);
28 if (sch == NULL) 29 if (sch == NULL)
29 return 0; 30 return 0;
30 31 net = skb_net(skb);
31 if ((sch->type == SCTP_CID_INIT) && 32 if ((sch->type == SCTP_CID_INIT) &&
32 (svc = ip_vs_service_get(af, skb->mark, iph.protocol, 33 (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
33 &iph.daddr, sh->dest))) { 34 &iph.daddr, sh->dest))) {
34 int ignored; 35 int ignored;
35 36
36 if (ip_vs_todrop()) { 37 if (ip_vs_todrop(net_ipvs(net))) {
37 /* 38 /*
38 * It seems that we are very loaded. 39 * It seems that we are very loaded.
39 * We have to drop this packet :( 40 * We have to drop this packet :(
@@ -46,14 +47,19 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
46 * Let the virtual server select a real server for the 47 * Let the virtual server select a real server for the
47 * incoming connection, and create a connection entry. 48 * incoming connection, and create a connection entry.
48 */ 49 */
49 *cpp = ip_vs_schedule(svc, skb, pp, &ignored); 50 *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
50 if (!*cpp && !ignored) { 51 if (!*cpp && ignored <= 0) {
51 *verdict = ip_vs_leave(svc, skb, pp); 52 if (!ignored)
53 *verdict = ip_vs_leave(svc, skb, pd);
54 else {
55 ip_vs_service_put(svc);
56 *verdict = NF_DROP;
57 }
52 return 0; 58 return 0;
53 } 59 }
54 ip_vs_service_put(svc); 60 ip_vs_service_put(svc);
55 } 61 }
56 62 /* NF_ACCEPT */
57 return 1; 63 return 1;
58} 64}
59 65
@@ -856,7 +862,7 @@ static struct ipvs_sctp_nextstate
856/* 862/*
857 * Timeout table[state] 863 * Timeout table[state]
858 */ 864 */
859static int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = { 865static const int sctp_timeouts[IP_VS_SCTP_S_LAST + 1] = {
860 [IP_VS_SCTP_S_NONE] = 2 * HZ, 866 [IP_VS_SCTP_S_NONE] = 2 * HZ,
861 [IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ, 867 [IP_VS_SCTP_S_INIT_CLI] = 1 * 60 * HZ,
862 [IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ, 868 [IP_VS_SCTP_S_INIT_SER] = 1 * 60 * HZ,
@@ -900,20 +906,8 @@ static const char *sctp_state_name(int state)
900 return "?"; 906 return "?";
901} 907}
902 908
903static void sctp_timeout_change(struct ip_vs_protocol *pp, int flags)
904{
905}
906
907static int
908sctp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
909{
910
911return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_SCTP_S_LAST,
912 sctp_state_name_table, sname, to);
913}
914
915static inline int 909static inline int
916set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, 910set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
917 int direction, const struct sk_buff *skb) 911 int direction, const struct sk_buff *skb)
918{ 912{
919 sctp_chunkhdr_t _sctpch, *sch; 913 sctp_chunkhdr_t _sctpch, *sch;
@@ -971,7 +965,7 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
971 965
972 IP_VS_DBG_BUF(8, "%s %s %s:%d->" 966 IP_VS_DBG_BUF(8, "%s %s %s:%d->"
973 "%s:%d state: %s->%s conn->refcnt:%d\n", 967 "%s:%d state: %s->%s conn->refcnt:%d\n",
974 pp->name, 968 pd->pp->name,
975 ((direction == IP_VS_DIR_OUTPUT) ? 969 ((direction == IP_VS_DIR_OUTPUT) ?
976 "output " : "input "), 970 "output " : "input "),
977 IP_VS_DBG_ADDR(cp->af, &cp->daddr), 971 IP_VS_DBG_ADDR(cp->af, &cp->daddr),
@@ -995,75 +989,73 @@ set_sctp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
995 } 989 }
996 } 990 }
997 } 991 }
992 if (likely(pd))
993 cp->timeout = pd->timeout_table[cp->state = next_state];
994 else /* What to do ? */
995 cp->timeout = sctp_timeouts[cp->state = next_state];
998 996
999 cp->timeout = pp->timeout_table[cp->state = next_state]; 997 return 1;
1000
1001 return 1;
1002} 998}
1003 999
1004static int 1000static int
1005sctp_state_transition(struct ip_vs_conn *cp, int direction, 1001sctp_state_transition(struct ip_vs_conn *cp, int direction,
1006 const struct sk_buff *skb, struct ip_vs_protocol *pp) 1002 const struct sk_buff *skb, struct ip_vs_proto_data *pd)
1007{ 1003{
1008 int ret = 0; 1004 int ret = 0;
1009 1005
1010 spin_lock(&cp->lock); 1006 spin_lock(&cp->lock);
1011 ret = set_sctp_state(pp, cp, direction, skb); 1007 ret = set_sctp_state(pd, cp, direction, skb);
1012 spin_unlock(&cp->lock); 1008 spin_unlock(&cp->lock);
1013 1009
1014 return ret; 1010 return ret;
1015} 1011}
1016 1012
1017/*
1018 * Hash table for SCTP application incarnations
1019 */
1020#define SCTP_APP_TAB_BITS 4
1021#define SCTP_APP_TAB_SIZE (1 << SCTP_APP_TAB_BITS)
1022#define SCTP_APP_TAB_MASK (SCTP_APP_TAB_SIZE - 1)
1023
1024static struct list_head sctp_apps[SCTP_APP_TAB_SIZE];
1025static DEFINE_SPINLOCK(sctp_app_lock);
1026
1027static inline __u16 sctp_app_hashkey(__be16 port) 1013static inline __u16 sctp_app_hashkey(__be16 port)
1028{ 1014{
1029 return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port) 1015 return (((__force u16)port >> SCTP_APP_TAB_BITS) ^ (__force u16)port)
1030 & SCTP_APP_TAB_MASK; 1016 & SCTP_APP_TAB_MASK;
1031} 1017}
1032 1018
1033static int sctp_register_app(struct ip_vs_app *inc) 1019static int sctp_register_app(struct net *net, struct ip_vs_app *inc)
1034{ 1020{
1035 struct ip_vs_app *i; 1021 struct ip_vs_app *i;
1036 __u16 hash; 1022 __u16 hash;
1037 __be16 port = inc->port; 1023 __be16 port = inc->port;
1038 int ret = 0; 1024 int ret = 0;
1025 struct netns_ipvs *ipvs = net_ipvs(net);
1026 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
1039 1027
1040 hash = sctp_app_hashkey(port); 1028 hash = sctp_app_hashkey(port);
1041 1029
1042 spin_lock_bh(&sctp_app_lock); 1030 spin_lock_bh(&ipvs->sctp_app_lock);
1043 list_for_each_entry(i, &sctp_apps[hash], p_list) { 1031 list_for_each_entry(i, &ipvs->sctp_apps[hash], p_list) {
1044 if (i->port == port) { 1032 if (i->port == port) {
1045 ret = -EEXIST; 1033 ret = -EEXIST;
1046 goto out; 1034 goto out;
1047 } 1035 }
1048 } 1036 }
1049 list_add(&inc->p_list, &sctp_apps[hash]); 1037 list_add(&inc->p_list, &ipvs->sctp_apps[hash]);
1050 atomic_inc(&ip_vs_protocol_sctp.appcnt); 1038 atomic_inc(&pd->appcnt);
1051out: 1039out:
1052 spin_unlock_bh(&sctp_app_lock); 1040 spin_unlock_bh(&ipvs->sctp_app_lock);
1053 1041
1054 return ret; 1042 return ret;
1055} 1043}
1056 1044
1057static void sctp_unregister_app(struct ip_vs_app *inc) 1045static void sctp_unregister_app(struct net *net, struct ip_vs_app *inc)
1058{ 1046{
1059 spin_lock_bh(&sctp_app_lock); 1047 struct netns_ipvs *ipvs = net_ipvs(net);
1060 atomic_dec(&ip_vs_protocol_sctp.appcnt); 1048 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_SCTP);
1049
1050 spin_lock_bh(&ipvs->sctp_app_lock);
1051 atomic_dec(&pd->appcnt);
1061 list_del(&inc->p_list); 1052 list_del(&inc->p_list);
1062 spin_unlock_bh(&sctp_app_lock); 1053 spin_unlock_bh(&ipvs->sctp_app_lock);
1063} 1054}
1064 1055
1065static int sctp_app_conn_bind(struct ip_vs_conn *cp) 1056static int sctp_app_conn_bind(struct ip_vs_conn *cp)
1066{ 1057{
1058 struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
1067 int hash; 1059 int hash;
1068 struct ip_vs_app *inc; 1060 struct ip_vs_app *inc;
1069 int result = 0; 1061 int result = 0;
@@ -1074,12 +1066,12 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
1074 /* Lookup application incarnations and bind the right one */ 1066 /* Lookup application incarnations and bind the right one */
1075 hash = sctp_app_hashkey(cp->vport); 1067 hash = sctp_app_hashkey(cp->vport);
1076 1068
1077 spin_lock(&sctp_app_lock); 1069 spin_lock(&ipvs->sctp_app_lock);
1078 list_for_each_entry(inc, &sctp_apps[hash], p_list) { 1070 list_for_each_entry(inc, &ipvs->sctp_apps[hash], p_list) {
1079 if (inc->port == cp->vport) { 1071 if (inc->port == cp->vport) {
1080 if (unlikely(!ip_vs_app_inc_get(inc))) 1072 if (unlikely(!ip_vs_app_inc_get(inc)))
1081 break; 1073 break;
1082 spin_unlock(&sctp_app_lock); 1074 spin_unlock(&ipvs->sctp_app_lock);
1083 1075
1084 IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->" 1076 IP_VS_DBG_BUF(9, "%s: Binding conn %s:%u->"
1085 "%s:%u to app %s on port %u\n", 1077 "%s:%u to app %s on port %u\n",
@@ -1095,43 +1087,50 @@ static int sctp_app_conn_bind(struct ip_vs_conn *cp)
1095 goto out; 1087 goto out;
1096 } 1088 }
1097 } 1089 }
1098 spin_unlock(&sctp_app_lock); 1090 spin_unlock(&ipvs->sctp_app_lock);
1099out: 1091out:
1100 return result; 1092 return result;
1101} 1093}
1102 1094
1103static void ip_vs_sctp_init(struct ip_vs_protocol *pp) 1095/* ---------------------------------------------
1096 * timeouts are netns related now.
1097 * ---------------------------------------------
1098 */
1099static void __ip_vs_sctp_init(struct net *net, struct ip_vs_proto_data *pd)
1104{ 1100{
1105 IP_VS_INIT_HASH_TABLE(sctp_apps); 1101 struct netns_ipvs *ipvs = net_ipvs(net);
1106 pp->timeout_table = sctp_timeouts;
1107}
1108 1102
1103 ip_vs_init_hash_table(ipvs->sctp_apps, SCTP_APP_TAB_SIZE);
1104 spin_lock_init(&ipvs->sctp_app_lock);
1105 pd->timeout_table = ip_vs_create_timeout_table((int *)sctp_timeouts,
1106 sizeof(sctp_timeouts));
1107}
1109 1108
1110static void ip_vs_sctp_exit(struct ip_vs_protocol *pp) 1109static void __ip_vs_sctp_exit(struct net *net, struct ip_vs_proto_data *pd)
1111{ 1110{
1112 1111 kfree(pd->timeout_table);
1113} 1112}
1114 1113
1115struct ip_vs_protocol ip_vs_protocol_sctp = { 1114struct ip_vs_protocol ip_vs_protocol_sctp = {
1116 .name = "SCTP", 1115 .name = "SCTP",
1117 .protocol = IPPROTO_SCTP, 1116 .protocol = IPPROTO_SCTP,
1118 .num_states = IP_VS_SCTP_S_LAST, 1117 .num_states = IP_VS_SCTP_S_LAST,
1119 .dont_defrag = 0, 1118 .dont_defrag = 0,
1120 .appcnt = ATOMIC_INIT(0), 1119 .init = NULL,
1121 .init = ip_vs_sctp_init, 1120 .exit = NULL,
1122 .exit = ip_vs_sctp_exit, 1121 .init_netns = __ip_vs_sctp_init,
1123 .register_app = sctp_register_app, 1122 .exit_netns = __ip_vs_sctp_exit,
1123 .register_app = sctp_register_app,
1124 .unregister_app = sctp_unregister_app, 1124 .unregister_app = sctp_unregister_app,
1125 .conn_schedule = sctp_conn_schedule, 1125 .conn_schedule = sctp_conn_schedule,
1126 .conn_in_get = ip_vs_conn_in_get_proto, 1126 .conn_in_get = ip_vs_conn_in_get_proto,
1127 .conn_out_get = ip_vs_conn_out_get_proto, 1127 .conn_out_get = ip_vs_conn_out_get_proto,
1128 .snat_handler = sctp_snat_handler, 1128 .snat_handler = sctp_snat_handler,
1129 .dnat_handler = sctp_dnat_handler, 1129 .dnat_handler = sctp_dnat_handler,
1130 .csum_check = sctp_csum_check, 1130 .csum_check = sctp_csum_check,
1131 .state_name = sctp_state_name, 1131 .state_name = sctp_state_name,
1132 .state_transition = sctp_state_transition, 1132 .state_transition = sctp_state_transition,
1133 .app_conn_bind = sctp_app_conn_bind, 1133 .app_conn_bind = sctp_app_conn_bind,
1134 .debug_packet = ip_vs_tcpudp_debug_packet, 1134 .debug_packet = ip_vs_tcpudp_debug_packet,
1135 .timeout_change = sctp_timeout_change, 1135 .timeout_change = NULL,
1136 .set_state_timeout = sctp_set_state_timeout,
1137}; 1136};
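__ip_vs_sctp_init() gives every namespace its own writable timeout table built from the now-const sctp_timeouts defaults, so tuning one namespace cannot leak into another, and set_sctp_state() falls back to the const defaults when no proto data is present. Assuming ip_vs_create_timeout_table() is essentially a kmemdup-style copy of the const array (its body is not part of this diff), the idea reduces to the sketch below.

/* Copy-the-defaults sketch: every namespace edits its own copy of the
 * timeout table, the shared defaults stay read-only. Names and units
 * are illustrative. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum { S_NONE, S_ESTABLISHED, S_LAST };

static const int default_timeouts[S_LAST + 1] = {
	[S_NONE]	= 2,
	[S_ESTABLISHED]	= 15 * 60,
};

/* stand-in for an ip_vs_create_timeout_table()-style helper */
static int *create_timeout_table(const int *table, size_t size)
{
	int *copy = malloc(size);

	if (copy)
		memcpy(copy, table, size);
	return copy;
}

int main(void)
{
	int *ns_timeouts = create_timeout_table(default_timeouts,
						sizeof(default_timeouts));

	if (!ns_timeouts)
		return 1;
	ns_timeouts[S_ESTABLISHED] = 30;	/* tune one namespace only */
	printf("default=%d netns=%d\n",
	       default_timeouts[S_ESTABLISHED], ns_timeouts[S_ESTABLISHED]);
	free(ns_timeouts);
	return 0;
}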
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index f6c5200e2146..c0cc341b840d 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -9,8 +9,12 @@
9 * as published by the Free Software Foundation; either version 9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version. 10 * 2 of the License, or (at your option) any later version.
11 * 11 *
12 * Changes: 12 * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
13 * 13 *
14 * Network name space (netns) aware.
15 * Global data moved to netns i.e. struct netns_ipvs
16 * The tcp_timeouts table now has a per-netns copy, kept in the
17 * per-protocol ip_vs_proto_data, and is handled per netns
14 */ 18 */
15 19
16#define KMSG_COMPONENT "IPVS" 20#define KMSG_COMPONENT "IPVS"
@@ -28,9 +32,10 @@
28#include <net/ip_vs.h> 32#include <net/ip_vs.h>
29 33
30static int 34static int
31tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, 35tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
32 int *verdict, struct ip_vs_conn **cpp) 36 int *verdict, struct ip_vs_conn **cpp)
33{ 37{
38 struct net *net;
34 struct ip_vs_service *svc; 39 struct ip_vs_service *svc;
35 struct tcphdr _tcph, *th; 40 struct tcphdr _tcph, *th;
36 struct ip_vs_iphdr iph; 41 struct ip_vs_iphdr iph;
@@ -42,14 +47,14 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
42 *verdict = NF_DROP; 47 *verdict = NF_DROP;
43 return 0; 48 return 0;
44 } 49 }
45 50 net = skb_net(skb);
46 /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ 51 /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */
47 if (th->syn && 52 if (th->syn &&
48 (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, 53 (svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
49 th->dest))) { 54 &iph.daddr, th->dest))) {
50 int ignored; 55 int ignored;
51 56
52 if (ip_vs_todrop()) { 57 if (ip_vs_todrop(net_ipvs(net))) {
53 /* 58 /*
54 * It seems that we are very loaded. 59 * It seems that we are very loaded.
55 * We have to drop this packet :( 60 * We have to drop this packet :(
@@ -63,13 +68,19 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
63 * Let the virtual server select a real server for the 68 * Let the virtual server select a real server for the
64 * incoming connection, and create a connection entry. 69 * incoming connection, and create a connection entry.
65 */ 70 */
66 *cpp = ip_vs_schedule(svc, skb, pp, &ignored); 71 *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
67 if (!*cpp && !ignored) { 72 if (!*cpp && ignored <= 0) {
68 *verdict = ip_vs_leave(svc, skb, pp); 73 if (!ignored)
74 *verdict = ip_vs_leave(svc, skb, pd);
75 else {
76 ip_vs_service_put(svc);
77 *verdict = NF_DROP;
78 }
69 return 0; 79 return 0;
70 } 80 }
71 ip_vs_service_put(svc); 81 ip_vs_service_put(svc);
72 } 82 }
83 /* NF_ACCEPT */
73 return 1; 84 return 1;
74} 85}
75 86
@@ -338,7 +349,7 @@ static const int tcp_state_off[IP_VS_DIR_LAST] = {
338/* 349/*
339 * Timeout table[state] 350 * Timeout table[state]
340 */ 351 */
341static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { 352static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
342 [IP_VS_TCP_S_NONE] = 2*HZ, 353 [IP_VS_TCP_S_NONE] = 2*HZ,
343 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, 354 [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ,
344 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, 355 [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ,
@@ -437,10 +448,7 @@ static struct tcp_states_t tcp_states_dos [] = {
437/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, 448/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
438}; 449};
439 450
440static struct tcp_states_t *tcp_state_table = tcp_states; 451static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
441
442
443static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
444{ 452{
445 int on = (flags & 1); /* secure_tcp */ 453 int on = (flags & 1); /* secure_tcp */
446 454
@@ -450,14 +458,7 @@ static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags)
450 ** for most if not for all of the applications. Something 458 ** for most if not for all of the applications. Something
451 ** like "capabilities" (flags) for each object. 459 ** like "capabilities" (flags) for each object.
452 */ 460 */
453 tcp_state_table = (on? tcp_states_dos : tcp_states); 461 pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
454}
455
456static int
457tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
458{
459 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST,
460 tcp_state_name_table, sname, to);
461} 462}
462 463
463static inline int tcp_state_idx(struct tcphdr *th) 464static inline int tcp_state_idx(struct tcphdr *th)
@@ -474,7 +475,7 @@ static inline int tcp_state_idx(struct tcphdr *th)
474} 475}
475 476
476static inline void 477static inline void
477set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, 478set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
478 int direction, struct tcphdr *th) 479 int direction, struct tcphdr *th)
479{ 480{
480 int state_idx; 481 int state_idx;
@@ -497,7 +498,8 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
497 goto tcp_state_out; 498 goto tcp_state_out;
498 } 499 }
499 500
500 new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; 501 new_state =
502 pd->tcp_state_table[state_off+state_idx].next_state[cp->state];
501 503
502 tcp_state_out: 504 tcp_state_out:
503 if (new_state != cp->state) { 505 if (new_state != cp->state) {
@@ -505,7 +507,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
505 507
506 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->" 508 IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] %s:%d->"
507 "%s:%d state: %s->%s conn->refcnt:%d\n", 509 "%s:%d state: %s->%s conn->refcnt:%d\n",
508 pp->name, 510 pd->pp->name,
509 ((state_off == TCP_DIR_OUTPUT) ? 511 ((state_off == TCP_DIR_OUTPUT) ?
510 "output " : "input "), 512 "output " : "input "),
511 th->syn ? 'S' : '.', 513 th->syn ? 'S' : '.',
@@ -535,17 +537,19 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
535 } 537 }
536 } 538 }
537 539
538 cp->timeout = pp->timeout_table[cp->state = new_state]; 540 if (likely(pd))
541 cp->timeout = pd->timeout_table[cp->state = new_state];
542 else /* What to do ? */
543 cp->timeout = tcp_timeouts[cp->state = new_state];
539} 544}
540 545
541
542/* 546/*
543 * Handle state transitions 547 * Handle state transitions
544 */ 548 */
545static int 549static int
546tcp_state_transition(struct ip_vs_conn *cp, int direction, 550tcp_state_transition(struct ip_vs_conn *cp, int direction,
547 const struct sk_buff *skb, 551 const struct sk_buff *skb,
548 struct ip_vs_protocol *pp) 552 struct ip_vs_proto_data *pd)
549{ 553{
550 struct tcphdr _tcph, *th; 554 struct tcphdr _tcph, *th;
551 555
@@ -560,23 +564,12 @@ tcp_state_transition(struct ip_vs_conn *cp, int direction,
560 return 0; 564 return 0;
561 565
562 spin_lock(&cp->lock); 566 spin_lock(&cp->lock);
563 set_tcp_state(pp, cp, direction, th); 567 set_tcp_state(pd, cp, direction, th);
564 spin_unlock(&cp->lock); 568 spin_unlock(&cp->lock);
565 569
566 return 1; 570 return 1;
567} 571}
568 572
569
570/*
571 * Hash table for TCP application incarnations
572 */
573#define TCP_APP_TAB_BITS 4
574#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS)
575#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1)
576
577static struct list_head tcp_apps[TCP_APP_TAB_SIZE];
578static DEFINE_SPINLOCK(tcp_app_lock);
579
580static inline __u16 tcp_app_hashkey(__be16 port) 573static inline __u16 tcp_app_hashkey(__be16 port)
581{ 574{
582 return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port) 575 return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
@@ -584,44 +577,50 @@ static inline __u16 tcp_app_hashkey(__be16 port)
584} 577}
585 578
586 579
587static int tcp_register_app(struct ip_vs_app *inc) 580static int tcp_register_app(struct net *net, struct ip_vs_app *inc)
588{ 581{
589 struct ip_vs_app *i; 582 struct ip_vs_app *i;
590 __u16 hash; 583 __u16 hash;
591 __be16 port = inc->port; 584 __be16 port = inc->port;
592 int ret = 0; 585 int ret = 0;
586 struct netns_ipvs *ipvs = net_ipvs(net);
587 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
593 588
594 hash = tcp_app_hashkey(port); 589 hash = tcp_app_hashkey(port);
595 590
596 spin_lock_bh(&tcp_app_lock); 591 spin_lock_bh(&ipvs->tcp_app_lock);
597 list_for_each_entry(i, &tcp_apps[hash], p_list) { 592 list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
598 if (i->port == port) { 593 if (i->port == port) {
599 ret = -EEXIST; 594 ret = -EEXIST;
600 goto out; 595 goto out;
601 } 596 }
602 } 597 }
603 list_add(&inc->p_list, &tcp_apps[hash]); 598 list_add(&inc->p_list, &ipvs->tcp_apps[hash]);
604 atomic_inc(&ip_vs_protocol_tcp.appcnt); 599 atomic_inc(&pd->appcnt);
605 600
606 out: 601 out:
607 spin_unlock_bh(&tcp_app_lock); 602 spin_unlock_bh(&ipvs->tcp_app_lock);
608 return ret; 603 return ret;
609} 604}
610 605
611 606
612static void 607static void
613tcp_unregister_app(struct ip_vs_app *inc) 608tcp_unregister_app(struct net *net, struct ip_vs_app *inc)
614{ 609{
615 spin_lock_bh(&tcp_app_lock); 610 struct netns_ipvs *ipvs = net_ipvs(net);
616 atomic_dec(&ip_vs_protocol_tcp.appcnt); 611 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
612
613 spin_lock_bh(&ipvs->tcp_app_lock);
614 atomic_dec(&pd->appcnt);
617 list_del(&inc->p_list); 615 list_del(&inc->p_list);
618 spin_unlock_bh(&tcp_app_lock); 616 spin_unlock_bh(&ipvs->tcp_app_lock);
619} 617}
620 618
621 619
622static int 620static int
623tcp_app_conn_bind(struct ip_vs_conn *cp) 621tcp_app_conn_bind(struct ip_vs_conn *cp)
624{ 622{
623 struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
625 int hash; 624 int hash;
626 struct ip_vs_app *inc; 625 struct ip_vs_app *inc;
627 int result = 0; 626 int result = 0;
@@ -633,12 +632,12 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
633 /* Lookup application incarnations and bind the right one */ 632 /* Lookup application incarnations and bind the right one */
634 hash = tcp_app_hashkey(cp->vport); 633 hash = tcp_app_hashkey(cp->vport);
635 634
636 spin_lock(&tcp_app_lock); 635 spin_lock(&ipvs->tcp_app_lock);
637 list_for_each_entry(inc, &tcp_apps[hash], p_list) { 636 list_for_each_entry(inc, &ipvs->tcp_apps[hash], p_list) {
638 if (inc->port == cp->vport) { 637 if (inc->port == cp->vport) {
639 if (unlikely(!ip_vs_app_inc_get(inc))) 638 if (unlikely(!ip_vs_app_inc_get(inc)))
640 break; 639 break;
641 spin_unlock(&tcp_app_lock); 640 spin_unlock(&ipvs->tcp_app_lock);
642 641
643 IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" 642 IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
644 "%s:%u to app %s on port %u\n", 643 "%s:%u to app %s on port %u\n",
@@ -655,7 +654,7 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
655 goto out; 654 goto out;
656 } 655 }
657 } 656 }
658 spin_unlock(&tcp_app_lock); 657 spin_unlock(&ipvs->tcp_app_lock);
659 658
660 out: 659 out:
661 return result; 660 return result;
@@ -665,24 +664,35 @@ tcp_app_conn_bind(struct ip_vs_conn *cp)
665/* 664/*
666 * Set LISTEN timeout. (ip_vs_conn_put will setup timer) 665 * Set LISTEN timeout. (ip_vs_conn_put will setup timer)
667 */ 666 */
668void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) 667void ip_vs_tcp_conn_listen(struct net *net, struct ip_vs_conn *cp)
669{ 668{
669 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_TCP);
670
670 spin_lock(&cp->lock); 671 spin_lock(&cp->lock);
671 cp->state = IP_VS_TCP_S_LISTEN; 672 cp->state = IP_VS_TCP_S_LISTEN;
672 cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; 673 cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
674 : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
673 spin_unlock(&cp->lock); 675 spin_unlock(&cp->lock);
674} 676}
675 677
676 678/* ---------------------------------------------
677static void ip_vs_tcp_init(struct ip_vs_protocol *pp) 679 * timeouts are netns related now.
680 * ---------------------------------------------
681 */
682static void __ip_vs_tcp_init(struct net *net, struct ip_vs_proto_data *pd)
678{ 683{
679 IP_VS_INIT_HASH_TABLE(tcp_apps); 684 struct netns_ipvs *ipvs = net_ipvs(net);
680 pp->timeout_table = tcp_timeouts;
681}
682 685
686 ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
687 spin_lock_init(&ipvs->tcp_app_lock);
688 pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
689 sizeof(tcp_timeouts));
690 pd->tcp_state_table = tcp_states;
691}
683 692
684static void ip_vs_tcp_exit(struct ip_vs_protocol *pp) 693static void __ip_vs_tcp_exit(struct net *net, struct ip_vs_proto_data *pd)
685{ 694{
695 kfree(pd->timeout_table);
686} 696}
687 697
688 698
@@ -691,9 +701,10 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
691 .protocol = IPPROTO_TCP, 701 .protocol = IPPROTO_TCP,
692 .num_states = IP_VS_TCP_S_LAST, 702 .num_states = IP_VS_TCP_S_LAST,
693 .dont_defrag = 0, 703 .dont_defrag = 0,
694 .appcnt = ATOMIC_INIT(0), 704 .init = NULL,
695 .init = ip_vs_tcp_init, 705 .exit = NULL,
696 .exit = ip_vs_tcp_exit, 706 .init_netns = __ip_vs_tcp_init,
707 .exit_netns = __ip_vs_tcp_exit,
697 .register_app = tcp_register_app, 708 .register_app = tcp_register_app,
698 .unregister_app = tcp_unregister_app, 709 .unregister_app = tcp_unregister_app,
699 .conn_schedule = tcp_conn_schedule, 710 .conn_schedule = tcp_conn_schedule,
@@ -707,5 +718,4 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
707 .app_conn_bind = tcp_app_conn_bind, 718 .app_conn_bind = tcp_app_conn_bind,
708 .debug_packet = ip_vs_tcpudp_debug_packet, 719 .debug_packet = ip_vs_tcpudp_debug_packet,
709 .timeout_change = tcp_timeout_change, 720 .timeout_change = tcp_timeout_change,
710 .set_state_timeout = tcp_set_state_timeout,
711}; 721};
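The conn_schedule changes repeated across the TCP, UDP and SCTP handlers treat ip_vs_schedule()'s ignored flag as tri-state: no connection with ignored == 0 still goes through ip_vs_leave(), a negative value now drops the packet after releasing the service reference, and anything else is accepted. One reading of that decision logic, reduced to stub values, is sketched below.

/* Sketch of the scheduling outcome handling the TCP/UDP/SCTP hunks now
 * share; the verdict strings are placeholders, not kernel values. */
#include <stdio.h>

static const char *handle_schedule_result(int have_cp, int ignored)
{
	if (!have_cp && ignored <= 0) {
		if (!ignored)
			return "ip_vs_leave() verdict";	/* no destination case */
		return "NF_DROP (service ref released)";	/* ignored < 0 */
	}
	return "NF_ACCEPT";		/* conn created, or intentionally ignored */
}

int main(void)
{
	printf("cp created        -> %s\n", handle_schedule_result(1, 0));
	printf("no cp, ignored=0  -> %s\n", handle_schedule_result(0, 0));
	printf("no cp, ignored<0  -> %s\n", handle_schedule_result(0, -1));
	printf("no cp, ignored>0  -> %s\n", handle_schedule_result(0, 1));
	return 0;
}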
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index 9d106a06bb0a..f1282cbe6fe3 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -9,7 +9,8 @@
9 * as published by the Free Software Foundation; either version 9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version. 10 * 2 of the License, or (at your option) any later version.
11 * 11 *
12 * Changes: 12 * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com>
13 * Network name space (netns) aware.
13 * 14 *
14 */ 15 */
15 16
@@ -28,9 +29,10 @@
28#include <net/ip6_checksum.h> 29#include <net/ip6_checksum.h>
29 30
30static int 31static int
31udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, 32udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd,
32 int *verdict, struct ip_vs_conn **cpp) 33 int *verdict, struct ip_vs_conn **cpp)
33{ 34{
35 struct net *net;
34 struct ip_vs_service *svc; 36 struct ip_vs_service *svc;
35 struct udphdr _udph, *uh; 37 struct udphdr _udph, *uh;
36 struct ip_vs_iphdr iph; 38 struct ip_vs_iphdr iph;
@@ -42,13 +44,13 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
42 *verdict = NF_DROP; 44 *verdict = NF_DROP;
43 return 0; 45 return 0;
44 } 46 }
45 47 net = skb_net(skb);
46 svc = ip_vs_service_get(af, skb->mark, iph.protocol, 48 svc = ip_vs_service_get(net, af, skb->mark, iph.protocol,
47 &iph.daddr, uh->dest); 49 &iph.daddr, uh->dest);
48 if (svc) { 50 if (svc) {
49 int ignored; 51 int ignored;
50 52
51 if (ip_vs_todrop()) { 53 if (ip_vs_todrop(net_ipvs(net))) {
52 /* 54 /*
53 * It seems that we are very loaded. 55 * It seems that we are very loaded.
54 * We have to drop this packet :( 56 * We have to drop this packet :(
@@ -62,13 +64,19 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
62 * Let the virtual server select a real server for the 64 * Let the virtual server select a real server for the
63 * incoming connection, and create a connection entry. 65 * incoming connection, and create a connection entry.
64 */ 66 */
65 *cpp = ip_vs_schedule(svc, skb, pp, &ignored); 67 *cpp = ip_vs_schedule(svc, skb, pd, &ignored);
66 if (!*cpp && !ignored) { 68 if (!*cpp && ignored <= 0) {
67 *verdict = ip_vs_leave(svc, skb, pp); 69 if (!ignored)
70 *verdict = ip_vs_leave(svc, skb, pd);
71 else {
72 ip_vs_service_put(svc);
73 *verdict = NF_DROP;
74 }
68 return 0; 75 return 0;
69 } 76 }
70 ip_vs_service_put(svc); 77 ip_vs_service_put(svc);
71 } 78 }
79 /* NF_ACCEPT */
72 return 1; 80 return 1;
73} 81}
74 82
@@ -338,19 +346,6 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
338 return 1; 346 return 1;
339} 347}
340 348
341
342/*
343 * Note: the caller guarantees that only one of register_app,
344 * unregister_app or app_conn_bind is called each time.
345 */
346
347#define UDP_APP_TAB_BITS 4
348#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS)
349#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1)
350
351static struct list_head udp_apps[UDP_APP_TAB_SIZE];
352static DEFINE_SPINLOCK(udp_app_lock);
353
354static inline __u16 udp_app_hashkey(__be16 port) 349static inline __u16 udp_app_hashkey(__be16 port)
355{ 350{
356 return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port) 351 return (((__force u16)port >> UDP_APP_TAB_BITS) ^ (__force u16)port)
@@ -358,44 +353,50 @@ static inline __u16 udp_app_hashkey(__be16 port)
358} 353}
359 354
360 355
361static int udp_register_app(struct ip_vs_app *inc) 356static int udp_register_app(struct net *net, struct ip_vs_app *inc)
362{ 357{
363 struct ip_vs_app *i; 358 struct ip_vs_app *i;
364 __u16 hash; 359 __u16 hash;
365 __be16 port = inc->port; 360 __be16 port = inc->port;
366 int ret = 0; 361 int ret = 0;
362 struct netns_ipvs *ipvs = net_ipvs(net);
363 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
367 364
368 hash = udp_app_hashkey(port); 365 hash = udp_app_hashkey(port);
369 366
370 367
371 spin_lock_bh(&udp_app_lock); 368 spin_lock_bh(&ipvs->udp_app_lock);
372 list_for_each_entry(i, &udp_apps[hash], p_list) { 369 list_for_each_entry(i, &ipvs->udp_apps[hash], p_list) {
373 if (i->port == port) { 370 if (i->port == port) {
374 ret = -EEXIST; 371 ret = -EEXIST;
375 goto out; 372 goto out;
376 } 373 }
377 } 374 }
378 list_add(&inc->p_list, &udp_apps[hash]); 375 list_add(&inc->p_list, &ipvs->udp_apps[hash]);
379 atomic_inc(&ip_vs_protocol_udp.appcnt); 376 atomic_inc(&pd->appcnt);
380 377
381 out: 378 out:
382 spin_unlock_bh(&udp_app_lock); 379 spin_unlock_bh(&ipvs->udp_app_lock);
383 return ret; 380 return ret;
384} 381}
385 382
386 383
387static void 384static void
388udp_unregister_app(struct ip_vs_app *inc) 385udp_unregister_app(struct net *net, struct ip_vs_app *inc)
389{ 386{
390 spin_lock_bh(&udp_app_lock); 387 struct ip_vs_proto_data *pd = ip_vs_proto_data_get(net, IPPROTO_UDP);
391 atomic_dec(&ip_vs_protocol_udp.appcnt); 388 struct netns_ipvs *ipvs = net_ipvs(net);
389
390 spin_lock_bh(&ipvs->udp_app_lock);
391 atomic_dec(&pd->appcnt);
392 list_del(&inc->p_list); 392 list_del(&inc->p_list);
393 spin_unlock_bh(&udp_app_lock); 393 spin_unlock_bh(&ipvs->udp_app_lock);
394} 394}
395 395
396 396
397static int udp_app_conn_bind(struct ip_vs_conn *cp) 397static int udp_app_conn_bind(struct ip_vs_conn *cp)
398{ 398{
399 struct netns_ipvs *ipvs = net_ipvs(ip_vs_conn_net(cp));
399 int hash; 400 int hash;
400 struct ip_vs_app *inc; 401 struct ip_vs_app *inc;
401 int result = 0; 402 int result = 0;
@@ -407,12 +408,12 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
407 /* Lookup application incarnations and bind the right one */ 408 /* Lookup application incarnations and bind the right one */
408 hash = udp_app_hashkey(cp->vport); 409 hash = udp_app_hashkey(cp->vport);
409 410
410 spin_lock(&udp_app_lock); 411 spin_lock(&ipvs->udp_app_lock);
411 list_for_each_entry(inc, &udp_apps[hash], p_list) { 412 list_for_each_entry(inc, &ipvs->udp_apps[hash], p_list) {
412 if (inc->port == cp->vport) { 413 if (inc->port == cp->vport) {
413 if (unlikely(!ip_vs_app_inc_get(inc))) 414 if (unlikely(!ip_vs_app_inc_get(inc)))
414 break; 415 break;
415 spin_unlock(&udp_app_lock); 416 spin_unlock(&ipvs->udp_app_lock);
416 417
417 IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->" 418 IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
418 "%s:%u to app %s on port %u\n", 419 "%s:%u to app %s on port %u\n",
@@ -429,14 +430,14 @@ static int udp_app_conn_bind(struct ip_vs_conn *cp)
429 goto out; 430 goto out;
430 } 431 }
431 } 432 }
432 spin_unlock(&udp_app_lock); 433 spin_unlock(&ipvs->udp_app_lock);
433 434
434 out: 435 out:
435 return result; 436 return result;
436} 437}
437 438
438 439
439static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { 440static const int udp_timeouts[IP_VS_UDP_S_LAST+1] = {
440 [IP_VS_UDP_S_NORMAL] = 5*60*HZ, 441 [IP_VS_UDP_S_NORMAL] = 5*60*HZ,
441 [IP_VS_UDP_S_LAST] = 2*HZ, 442 [IP_VS_UDP_S_LAST] = 2*HZ,
442}; 443};
@@ -446,14 +447,6 @@ static const char *const udp_state_name_table[IP_VS_UDP_S_LAST+1] = {
446 [IP_VS_UDP_S_LAST] = "BUG!", 447 [IP_VS_UDP_S_LAST] = "BUG!",
447}; 448};
448 449
449
450static int
451udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to)
452{
453 return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST,
454 udp_state_name_table, sname, to);
455}
456
457static const char * udp_state_name(int state) 450static const char * udp_state_name(int state)
458{ 451{
459 if (state >= IP_VS_UDP_S_LAST) 452 if (state >= IP_VS_UDP_S_LAST)
@@ -464,20 +457,30 @@ static const char * udp_state_name(int state)
464static int 457static int
465udp_state_transition(struct ip_vs_conn *cp, int direction, 458udp_state_transition(struct ip_vs_conn *cp, int direction,
466 const struct sk_buff *skb, 459 const struct sk_buff *skb,
467 struct ip_vs_protocol *pp) 460 struct ip_vs_proto_data *pd)
468{ 461{
469 cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; 462 if (unlikely(!pd)) {
463 pr_err("UDP no ns data\n");
464 return 0;
465 }
466
467 cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
470 return 1; 468 return 1;
471} 469}
472 470
473static void udp_init(struct ip_vs_protocol *pp) 471static void __udp_init(struct net *net, struct ip_vs_proto_data *pd)
474{ 472{
475 IP_VS_INIT_HASH_TABLE(udp_apps); 473 struct netns_ipvs *ipvs = net_ipvs(net);
476 pp->timeout_table = udp_timeouts; 474
475 ip_vs_init_hash_table(ipvs->udp_apps, UDP_APP_TAB_SIZE);
476 spin_lock_init(&ipvs->udp_app_lock);
477 pd->timeout_table = ip_vs_create_timeout_table((int *)udp_timeouts,
478 sizeof(udp_timeouts));
477} 479}
478 480
479static void udp_exit(struct ip_vs_protocol *pp) 481static void __udp_exit(struct net *net, struct ip_vs_proto_data *pd)
480{ 482{
483 kfree(pd->timeout_table);
481} 484}
482 485
483 486
@@ -486,8 +489,10 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
486 .protocol = IPPROTO_UDP, 489 .protocol = IPPROTO_UDP,
487 .num_states = IP_VS_UDP_S_LAST, 490 .num_states = IP_VS_UDP_S_LAST,
488 .dont_defrag = 0, 491 .dont_defrag = 0,
489 .init = udp_init, 492 .init = NULL,
490 .exit = udp_exit, 493 .exit = NULL,
494 .init_netns = __udp_init,
495 .exit_netns = __udp_exit,
491 .conn_schedule = udp_conn_schedule, 496 .conn_schedule = udp_conn_schedule,
492 .conn_in_get = ip_vs_conn_in_get_proto, 497 .conn_in_get = ip_vs_conn_in_get_proto,
493 .conn_out_get = ip_vs_conn_out_get_proto, 498 .conn_out_get = ip_vs_conn_out_get_proto,
@@ -501,5 +506,4 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
501 .app_conn_bind = udp_app_conn_bind, 506 .app_conn_bind = udp_app_conn_bind,
502 .debug_packet = ip_vs_tcpudp_debug_packet, 507 .debug_packet = ip_vs_tcpudp_debug_packet,
503 .timeout_change = NULL, 508 .timeout_change = NULL,
504 .set_state_timeout = udp_set_state_timeout,
505}; 509};
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index ab85aedea17e..d1adf988eb08 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -5,6 +5,18 @@
5 * high-performance and highly available server based on a 5 * high-performance and highly available server based on a
6 * cluster of servers. 6 * cluster of servers.
7 * 7 *
8 * Version 1 is capable of handling both version 0 and 1 messages.
9 * Version 0 is the plain old format.
10 * Note: Version 0 receivers will just drop Ver 1 messages.
11 * Version 1 is capable of handling IPv6, Persistence data,
12 * time-outs, and firewall marks.
13 * In ver.1 "ip_vs_sync_conn_options" will be sent in netw. order.
14 * Ver. 0 can be turned on by sysctl -w net.ipv4.vs.sync_version=0
15 *
16 * Definitions: Message: a complete datagram
17 * Sync_conn: a part of a Message
18 * Param Data: an option to a Sync_conn.
19 *
8 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> 20 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
9 * 21 *
10 * ip_vs_sync: sync connection info from master load balancer to backups 22 * ip_vs_sync: sync connection info from master load balancer to backups
@@ -15,6 +27,8 @@
15 * Alexandre Cassen : Added SyncID support for incoming sync 27 * Alexandre Cassen : Added SyncID support for incoming sync
16 * messages filtering. 28 * messages filtering.
17 * Justin Ossevoort : Fix endian problem on sync message size. 29 * Justin Ossevoort : Fix endian problem on sync message size.
30 * Hans Schillstrom : Added Version 1: i.e. IPv6,
31 * Persistence support, fwmark and time-out.
18 */ 32 */
19 33
20#define KMSG_COMPONENT "IPVS" 34#define KMSG_COMPONENT "IPVS"
@@ -35,6 +49,8 @@
35#include <linux/wait.h> 49#include <linux/wait.h>
36#include <linux/kernel.h> 50#include <linux/kernel.h>
37 51
52#include <asm/unaligned.h> /* Used for ntoh_seq and hton_seq */
53
38#include <net/ip.h> 54#include <net/ip.h>
39#include <net/sock.h> 55#include <net/sock.h>
40 56
@@ -43,11 +59,13 @@
43#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ 59#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */
44#define IP_VS_SYNC_PORT 8848 /* multicast port */ 60#define IP_VS_SYNC_PORT 8848 /* multicast port */
45 61
62#define SYNC_PROTO_VER 1 /* Protocol version in header */
46 63
47/* 64/*
48 * IPVS sync connection entry 65 * IPVS sync connection entry
66 * Version 0, i.e. original version.
49 */ 67 */
50struct ip_vs_sync_conn { 68struct ip_vs_sync_conn_v0 {
51 __u8 reserved; 69 __u8 reserved;
52 70
53 /* Protocol, addresses and port numbers */ 71 /* Protocol, addresses and port numbers */
@@ -71,41 +89,159 @@ struct ip_vs_sync_conn_options {
71 struct ip_vs_seq out_seq; /* outgoing seq. struct */ 89 struct ip_vs_seq out_seq; /* outgoing seq. struct */
72}; 90};
73 91
92/*
93 Sync Connection format (sync_conn)
94
95 0 1 2 3
96 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
97 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
98 | Type | Protocol | Ver. | Size |
99 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 | Flags |
101 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
102 | State | cport |
103 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104 | vport | dport |
105 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106 | fwmark |
107 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
108 | timeout (in sec.) |
109 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
110 | ... |
111 | IP-Addresses (v4 or v6) |
112 | ... |
113 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
114 Optional Parameters.
115 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
116 | Param. Type | Param. Length | Param. data |
117 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ |
118 | ... |
119 | +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
120 | | Param Type | Param. Length |
121 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
122 | Param data |
123 | Last Param data should be padded for 32 bit alignment |
124 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
125*/
126
127/*
128 * Type 0, IPv4 sync connection format
129 */
130struct ip_vs_sync_v4 {
131 __u8 type;
132 __u8 protocol; /* Which protocol (TCP/UDP) */
133 __be16 ver_size; /* Version msb 4 bits */
134 /* Flags and state transition */
135 __be32 flags; /* status flags */
136 __be16 state; /* state info */
137 /* Protocol, addresses and port numbers */
138 __be16 cport;
139 __be16 vport;
140 __be16 dport;
141 __be32 fwmark; /* Firewall mark from skb */
142 __be32 timeout; /* cp timeout */
143 __be32 caddr; /* client address */
144 __be32 vaddr; /* virtual address */
145 __be32 daddr; /* destination address */
146 /* The sequence options start here */
147 /* PE data padded to 32bit alignment after seq. options */
148};
149/*
150 * Type 2 messages IPv6
151 */
152struct ip_vs_sync_v6 {
153 __u8 type;
154 __u8 protocol; /* Which protocol (TCP/UDP) */
155 __be16 ver_size; /* Version msb 4 bits */
156 /* Flags and state transition */
157 __be32 flags; /* status flags */
158 __be16 state; /* state info */
159 /* Protocol, addresses and port numbers */
160 __be16 cport;
161 __be16 vport;
162 __be16 dport;
163 __be32 fwmark; /* Firewall mark from skb */
164 __be32 timeout; /* cp timeout */
165 struct in6_addr caddr; /* client address */
166 struct in6_addr vaddr; /* virtual address */
167 struct in6_addr daddr; /* destination address */
168 /* The sequence options start here */
169 /* PE data padded to 32bit alignment after seq. options */
170};
171
172union ip_vs_sync_conn {
173 struct ip_vs_sync_v4 v4;
174 struct ip_vs_sync_v6 v6;
175};
176
177/* Bits in Type field in above */
178#define STYPE_INET6 0
179#define STYPE_F_INET6 (1 << STYPE_INET6)
180
181#define SVER_SHIFT 12 /* Shift to get version */
182#define SVER_MASK 0x0fff /* Mask to strip version */
183
184#define IPVS_OPT_SEQ_DATA 1
185#define IPVS_OPT_PE_DATA 2
186#define IPVS_OPT_PE_NAME 3
187#define IPVS_OPT_PARAM 7
188
189#define IPVS_OPT_F_SEQ_DATA (1 << (IPVS_OPT_SEQ_DATA-1))
190#define IPVS_OPT_F_PE_DATA (1 << (IPVS_OPT_PE_DATA-1))
191#define IPVS_OPT_F_PE_NAME (1 << (IPVS_OPT_PE_NAME-1))
192#define IPVS_OPT_F_PARAM (1 << (IPVS_OPT_PARAM-1))
193
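The ver_size field defined above packs two values into 16 bits: the top four bits (SVER_SHIFT) carry the per-connection message version, the low twelve bits (SVER_MASK) its size. A standalone sketch of the packing and unpacking, in userspace C with made-up sample values rather than real sync data:

    #include <stdio.h>
    #include <stdint.h>
    #include <arpa/inet.h>              /* htons/ntohs */

    #define SVER_SHIFT 12               /* Shift to get version  */
    #define SVER_MASK  0x0fff           /* Mask to strip version */

    int main(void)
    {
            unsigned int len = 40;                          /* pretend sync_conn length in bytes */
            uint16_t ver_size = htons(len & SVER_MASK);     /* version bits stay 0, as in the patch */

            /* Receiver side: split the field back into size and version */
            unsigned int size    = ntohs(ver_size) & SVER_MASK;
            unsigned int version = ntohs(ver_size) >> SVER_SHIFT;

            printf("version=%u size=%u\n", version, size);
            return 0;
    }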
74struct ip_vs_sync_thread_data { 194struct ip_vs_sync_thread_data {
195 struct net *net;
75 struct socket *sock; 196 struct socket *sock;
76 char *buf; 197 char *buf;
77}; 198};
78 199
79#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) 200/* Version 0 definition of packet sizes */
201#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn_v0))
80#define FULL_CONN_SIZE \ 202#define FULL_CONN_SIZE \
81(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) 203(sizeof(struct ip_vs_sync_conn_v0) + sizeof(struct ip_vs_sync_conn_options))
82 204
83 205
84/* 206/*
85 The master multicasts messages to the backup load balancers in the 207 The master multicasts messages (Datagrams) to the backup load balancers
86 following format. 208 in the following format.
209
210 Version 1:
211 Note, first byte should be Zero, so ver 0 receivers will drop the packet.
87 212
88 0 1 2 3 213 0 1 2 3
89 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 214 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
90 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 215 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
91 | Count Conns | SyncID | Size | 216 | 0 | SyncID | Size |
217 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
218 | Count Conns | Version | Reserved, set to Zero |
92 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 219 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
93 | | 220 | |
94 | IPVS Sync Connection (1) | 221 | IPVS Sync Connection (1) |
95 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 222 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
96 | . | 223 | . |
97 | . | 224 ~ . ~
98 | . | 225 | . |
99 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 226 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
100 | | 227 | |
101 | IPVS Sync Connection (n) | 228 | IPVS Sync Connection (n) |
102 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 229 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
230
231 Version 0 Header
232 0 1 2 3
233 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
234 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
235 | Count Conns | SyncID | Size |
236 +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
237 | IPVS Sync Connection (1) |
103*/ 238*/
104 239
105#define SYNC_MESG_HEADER_LEN 4 240#define SYNC_MESG_HEADER_LEN 4
106#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */ 241#define MAX_CONNS_PER_SYNCBUFF 255 /* nr_conns in ip_vs_sync_mesg is 8 bit */
107 242
108struct ip_vs_sync_mesg { 243/* Version 0 header */
244struct ip_vs_sync_mesg_v0 {
109 __u8 nr_conns; 245 __u8 nr_conns;
110 __u8 syncid; 246 __u8 syncid;
111 __u16 size; 247 __u16 size;
@@ -113,9 +249,16 @@ struct ip_vs_sync_mesg {
113 /* ip_vs_sync_conn entries start here */ 249 /* ip_vs_sync_conn entries start here */
114}; 250};
115 251
116/* the maximum length of sync (sending/receiving) message */ 252/* Version 1 header */
117static int sync_send_mesg_maxlen; 253struct ip_vs_sync_mesg {
118static int sync_recv_mesg_maxlen; 254 __u8 reserved; /* must be zero */
255 __u8 syncid;
256 __u16 size;
257 __u8 nr_conns;
258 __s8 version; /* SYNC_PROTO_VER */
259 __u16 spare;
260 /* ip_vs_sync_conn entries start here */
261};
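The reserved byte of the version 1 header overlays nr_conns of the version 0 header, so a backup can tell the two formats apart with no extra framing: a version 1 sender always puts zero there (a version 0 sender does not emit messages with zero connections), and version/spare confirm the guess. A minimal userspace sketch of that check, mirroring the test done in ip_vs_process_message() further down (the struct layout is copied from above, the sample message is hypothetical):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    #define SYNC_PROTO_VER 1

    struct sync_mesg_v1 {               /* mirrors struct ip_vs_sync_mesg */
            uint8_t  reserved;          /* must be zero in version 1 */
            uint8_t  syncid;
            uint16_t size;
            uint8_t  nr_conns;
            int8_t   version;           /* SYNC_PROTO_VER */
            uint16_t spare;
    };

    static int is_v1_header(const struct sync_mesg_v1 *m)
    {
            return m->reserved == 0 && m->version == SYNC_PROTO_VER && m->spare == 0;
    }

    int main(void)
    {
            struct sync_mesg_v1 m;

            memset(&m, 0, sizeof(m));   /* reserved = 0, spare = 0 */
            m.version = SYNC_PROTO_VER;
            printf("version 1 header? %s\n", is_v1_header(&m) ? "yes" : "no");
            return 0;
    }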
119 262
120struct ip_vs_sync_buff { 263struct ip_vs_sync_buff {
121 struct list_head list; 264 struct list_head list;
@@ -127,28 +270,6 @@ struct ip_vs_sync_buff {
127 unsigned char *end; 270 unsigned char *end;
128}; 271};
129 272
130
131/* the sync_buff list head and the lock */
132static LIST_HEAD(ip_vs_sync_queue);
133static DEFINE_SPINLOCK(ip_vs_sync_lock);
134
135/* current sync_buff for accepting new conn entries */
136static struct ip_vs_sync_buff *curr_sb = NULL;
137static DEFINE_SPINLOCK(curr_sb_lock);
138
139/* ipvs sync daemon state */
140volatile int ip_vs_sync_state = IP_VS_STATE_NONE;
141volatile int ip_vs_master_syncid = 0;
142volatile int ip_vs_backup_syncid = 0;
143
144/* multicast interface name */
145char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN];
146char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN];
147
148/* sync daemon tasks */
149static struct task_struct *sync_master_thread;
150static struct task_struct *sync_backup_thread;
151
152/* multicast addr */ 273/* multicast addr */
153static struct sockaddr_in mcast_addr = { 274static struct sockaddr_in mcast_addr = {
154 .sin_family = AF_INET, 275 .sin_family = AF_INET,
@@ -156,41 +277,71 @@ static struct sockaddr_in mcast_addr = {
156 .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP), 277 .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
157}; 278};
158 279
280/*
281 * Copy of struct ip_vs_seq
282 * From unaligned network order to aligned host order
283 */
284static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho)
285{
286 ho->init_seq = get_unaligned_be32(&no->init_seq);
287 ho->delta = get_unaligned_be32(&no->delta);
288 ho->previous_delta = get_unaligned_be32(&no->previous_delta);
289}
290
291/*
292 * Copy of struct ip_vs_seq
293 * From Aligned host order to unaligned network order
294 */
295static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
296{
297 put_unaligned_be32(ho->init_seq, &no->init_seq);
298 put_unaligned_be32(ho->delta, &no->delta);
299 put_unaligned_be32(ho->previous_delta, &no->previous_delta);
300}
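Since a version 1 sync_conn is only padded to 32-bit boundaries, the sequence-number block inside it may sit at an unaligned address; that is why the two helpers above go through the asm/unaligned.h accessors instead of plain assignment. A rough userspace equivalent of ntoh_seq(), using nothing but memcpy and ntohl (the struct and the sample buffer are illustrative only):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <arpa/inet.h>

    struct seq_host { uint32_t init_seq, delta, previous_delta; };

    static uint32_t get_be32(const unsigned char *p)
    {
            uint32_t v;

            memcpy(&v, p, sizeof(v));   /* legal at any alignment */
            return ntohl(v);
    }

    static void ntoh_seq_sketch(const unsigned char *wire, struct seq_host *ho)
    {
            ho->init_seq       = get_be32(wire);
            ho->delta          = get_be32(wire + 4);
            ho->previous_delta = get_be32(wire + 8);
    }

    int main(void)
    {
            /* 12 bytes of network-order data at an odd offset to force misalignment */
            unsigned char buf[13] = { 0,  0,0,0,1,  0,0,0,2,  0,0,0,3 };
            struct seq_host ho;

            ntoh_seq_sketch(buf + 1, &ho);
            printf("%u %u %u\n", ho.init_seq, ho.delta, ho.previous_delta);
            return 0;
    }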
159 301
160static inline struct ip_vs_sync_buff *sb_dequeue(void) 302static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs)
161{ 303{
162 struct ip_vs_sync_buff *sb; 304 struct ip_vs_sync_buff *sb;
163 305
164 spin_lock_bh(&ip_vs_sync_lock); 306 spin_lock_bh(&ipvs->sync_lock);
165 if (list_empty(&ip_vs_sync_queue)) { 307 if (list_empty(&ipvs->sync_queue)) {
166 sb = NULL; 308 sb = NULL;
167 } else { 309 } else {
168 sb = list_entry(ip_vs_sync_queue.next, 310 sb = list_entry(ipvs->sync_queue.next,
169 struct ip_vs_sync_buff, 311 struct ip_vs_sync_buff,
170 list); 312 list);
171 list_del(&sb->list); 313 list_del(&sb->list);
172 } 314 }
173 spin_unlock_bh(&ip_vs_sync_lock); 315 spin_unlock_bh(&ipvs->sync_lock);
174 316
175 return sb; 317 return sb;
176} 318}
177 319
178static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) 320/*
321 * Create a new sync buffer for Version 1 proto.
322 */
323static inline struct ip_vs_sync_buff *
324ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
179{ 325{
180 struct ip_vs_sync_buff *sb; 326 struct ip_vs_sync_buff *sb;
181 327
182 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) 328 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
183 return NULL; 329 return NULL;
184 330
185 if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { 331 sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
332 if (!sb->mesg) {
186 kfree(sb); 333 kfree(sb);
187 return NULL; 334 return NULL;
188 } 335 }
336 sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */
337 sb->mesg->version = SYNC_PROTO_VER;
338 sb->mesg->syncid = ipvs->master_syncid;
339 sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
189 sb->mesg->nr_conns = 0; 340 sb->mesg->nr_conns = 0;
190 sb->mesg->syncid = ip_vs_master_syncid; 341 sb->mesg->spare = 0;
191 sb->mesg->size = 4; 342 sb->head = (unsigned char *)sb->mesg + sizeof(struct ip_vs_sync_mesg);
192 sb->head = (unsigned char *)sb->mesg + 4; 343 sb->end = (unsigned char *)sb->mesg + ipvs->send_mesg_maxlen;
193 sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; 344
194 sb->firstuse = jiffies; 345 sb->firstuse = jiffies;
195 return sb; 346 return sb;
196} 347}
@@ -201,14 +352,16 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
201 kfree(sb); 352 kfree(sb);
202} 353}
203 354
204static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) 355static inline void sb_queue_tail(struct netns_ipvs *ipvs)
205{ 356{
206 spin_lock(&ip_vs_sync_lock); 357 struct ip_vs_sync_buff *sb = ipvs->sync_buff;
207 if (ip_vs_sync_state & IP_VS_STATE_MASTER) 358
208 list_add_tail(&sb->list, &ip_vs_sync_queue); 359 spin_lock(&ipvs->sync_lock);
360 if (ipvs->sync_state & IP_VS_STATE_MASTER)
361 list_add_tail(&sb->list, &ipvs->sync_queue);
209 else 362 else
210 ip_vs_sync_buff_release(sb); 363 ip_vs_sync_buff_release(sb);
211 spin_unlock(&ip_vs_sync_lock); 364 spin_unlock(&ipvs->sync_lock);
212} 365}
213 366
214/* 367/*
@@ -216,36 +369,101 @@ static inline void sb_queue_tail(struct ip_vs_sync_buff *sb)
216 * than the specified time or the specified time is zero. 369 * than the specified time or the specified time is zero.
217 */ 370 */
218static inline struct ip_vs_sync_buff * 371static inline struct ip_vs_sync_buff *
219get_curr_sync_buff(unsigned long time) 372get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
220{ 373{
221 struct ip_vs_sync_buff *sb; 374 struct ip_vs_sync_buff *sb;
222 375
223 spin_lock_bh(&curr_sb_lock); 376 spin_lock_bh(&ipvs->sync_buff_lock);
224 if (curr_sb && (time == 0 || 377 if (ipvs->sync_buff && (time == 0 ||
225 time_before(jiffies - curr_sb->firstuse, time))) { 378 time_before(jiffies - ipvs->sync_buff->firstuse, time))) {
226 sb = curr_sb; 379 sb = ipvs->sync_buff;
227 curr_sb = NULL; 380 ipvs->sync_buff = NULL;
228 } else 381 } else
229 sb = NULL; 382 sb = NULL;
230 spin_unlock_bh(&curr_sb_lock); 383 spin_unlock_bh(&ipvs->sync_buff_lock);
231 return sb; 384 return sb;
232} 385}
233 386
387/*
388 * Switch mode from sending version 0 or 1
389 * - must handle sync_buf
390 */
391void ip_vs_sync_switch_mode(struct net *net, int mode)
392{
393 struct netns_ipvs *ipvs = net_ipvs(net);
394
395 if (!(ipvs->sync_state & IP_VS_STATE_MASTER))
396 return;
397 if (mode == ipvs->sysctl_sync_ver || !ipvs->sync_buff)
398 return;
399
400 spin_lock_bh(&ipvs->sync_buff_lock);
401 /* Buffer empty? Then let buf_create do the job */
402 if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) {
403 kfree(ipvs->sync_buff);
404 ipvs->sync_buff = NULL;
405 } else {
406 spin_lock_bh(&ipvs->sync_lock);
407 if (ipvs->sync_state & IP_VS_STATE_MASTER)
408 list_add_tail(&ipvs->sync_buff->list,
409 &ipvs->sync_queue);
410 else
411 ip_vs_sync_buff_release(ipvs->sync_buff);
412 spin_unlock_bh(&ipvs->sync_lock);
413 }
414 spin_unlock_bh(&ipvs->sync_buff_lock);
415}
234 416
235/* 417/*
418 * Create a new sync buffer for Version 0 proto.
419 */
420static inline struct ip_vs_sync_buff *
421ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
422{
423 struct ip_vs_sync_buff *sb;
424 struct ip_vs_sync_mesg_v0 *mesg;
425
426 if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
427 return NULL;
428
429 sb->mesg = kmalloc(ipvs->send_mesg_maxlen, GFP_ATOMIC);
430 if (!sb->mesg) {
431 kfree(sb);
432 return NULL;
433 }
434 mesg = (struct ip_vs_sync_mesg_v0 *)sb->mesg;
435 mesg->nr_conns = 0;
436 mesg->syncid = ipvs->master_syncid;
437 mesg->size = sizeof(struct ip_vs_sync_mesg_v0);
438 sb->head = (unsigned char *)mesg + sizeof(struct ip_vs_sync_mesg_v0);
439 sb->end = (unsigned char *)mesg + ipvs->send_mesg_maxlen;
440 sb->firstuse = jiffies;
441 return sb;
442}
443
444/*
445 * Version 0, can be selected via the sync_version sysctl.
236 * Add an ip_vs_conn information into the current sync_buff. 446 * Add an ip_vs_conn information into the current sync_buff.
237 * Called by ip_vs_in.
238 */ 447 */
239void ip_vs_sync_conn(struct ip_vs_conn *cp) 448void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
240{ 449{
241 struct ip_vs_sync_mesg *m; 450 struct netns_ipvs *ipvs = net_ipvs(net);
242 struct ip_vs_sync_conn *s; 451 struct ip_vs_sync_mesg_v0 *m;
452 struct ip_vs_sync_conn_v0 *s;
243 int len; 453 int len;
244 454
245 spin_lock(&curr_sb_lock); 455 if (unlikely(cp->af != AF_INET))
246 if (!curr_sb) { 456 return;
247 if (!(curr_sb=ip_vs_sync_buff_create())) { 457 /* Do not sync ONE PACKET */
248 spin_unlock(&curr_sb_lock); 458 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
459 return;
460
461 spin_lock(&ipvs->sync_buff_lock);
462 if (!ipvs->sync_buff) {
463 ipvs->sync_buff =
464 ip_vs_sync_buff_create_v0(ipvs);
465 if (!ipvs->sync_buff) {
466 spin_unlock(&ipvs->sync_buff_lock);
249 pr_err("ip_vs_sync_buff_create failed.\n"); 467 pr_err("ip_vs_sync_buff_create failed.\n");
250 return; 468 return;
251 } 469 }
@@ -253,10 +471,11 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
253 471
254 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : 472 len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
255 SIMPLE_CONN_SIZE; 473 SIMPLE_CONN_SIZE;
256 m = curr_sb->mesg; 474 m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg;
257 s = (struct ip_vs_sync_conn *)curr_sb->head; 475 s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head;
258 476
259 /* copy members */ 477 /* copy members */
478 s->reserved = 0;
260 s->protocol = cp->protocol; 479 s->protocol = cp->protocol;
261 s->cport = cp->cport; 480 s->cport = cp->cport;
262 s->vport = cp->vport; 481 s->vport = cp->vport;
@@ -274,83 +493,366 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp)
274 493
275 m->nr_conns++; 494 m->nr_conns++;
276 m->size += len; 495 m->size += len;
277 curr_sb->head += len; 496 ipvs->sync_buff->head += len;
278 497
279 /* check if there is a space for next one */ 498 /* check if there is a space for next one */
280 if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { 499 if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) {
281 sb_queue_tail(curr_sb); 500 sb_queue_tail(ipvs);
282 curr_sb = NULL; 501 ipvs->sync_buff = NULL;
283 } 502 }
284 spin_unlock(&curr_sb_lock); 503 spin_unlock(&ipvs->sync_buff_lock);
285 504
286 /* synchronize its controller if it has */ 505 /* synchronize its controller if it has */
287 if (cp->control) 506 if (cp->control)
288 ip_vs_sync_conn(cp->control); 507 ip_vs_sync_conn(net, cp->control);
508}
509
510/*
511 * Add an ip_vs_conn information into the current sync_buff.
512 * Called by ip_vs_in.
513 * Sending Version 1 messages
514 */
515void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
516{
517 struct netns_ipvs *ipvs = net_ipvs(net);
518 struct ip_vs_sync_mesg *m;
519 union ip_vs_sync_conn *s;
520 __u8 *p;
521 unsigned int len, pe_name_len, pad;
522
523 /* Handle old version of the protocol */
524 if (ipvs->sysctl_sync_ver == 0) {
525 ip_vs_sync_conn_v0(net, cp);
526 return;
527 }
528 /* Do not sync ONE PACKET */
529 if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
530 goto control;
531sloop:
532 /* Sanity checks */
533 pe_name_len = 0;
534 if (cp->pe_data_len) {
535 if (!cp->pe_data || !cp->dest) {
536 IP_VS_ERR_RL("SYNC, connection pe_data invalid\n");
537 return;
538 }
539 pe_name_len = strnlen(cp->pe->name, IP_VS_PENAME_MAXLEN);
540 }
541
542 spin_lock(&ipvs->sync_buff_lock);
543
544#ifdef CONFIG_IP_VS_IPV6
545 if (cp->af == AF_INET6)
546 len = sizeof(struct ip_vs_sync_v6);
547 else
548#endif
549 len = sizeof(struct ip_vs_sync_v4);
550
551 if (cp->flags & IP_VS_CONN_F_SEQ_MASK)
552 len += sizeof(struct ip_vs_sync_conn_options) + 2;
553
554 if (cp->pe_data_len)
555 len += cp->pe_data_len + 2; /* + Param hdr field */
556 if (pe_name_len)
557 len += pe_name_len + 2;
558
559 /* check if there is a space for this one */
560 pad = 0;
561 if (ipvs->sync_buff) {
562 pad = (4 - (size_t)ipvs->sync_buff->head) & 3;
563 if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) {
564 sb_queue_tail(ipvs);
565 ipvs->sync_buff = NULL;
566 pad = 0;
567 }
568 }
569
570 if (!ipvs->sync_buff) {
571 ipvs->sync_buff = ip_vs_sync_buff_create(ipvs);
572 if (!ipvs->sync_buff) {
573 spin_unlock(&ipvs->sync_buff_lock);
574 pr_err("ip_vs_sync_buff_create failed.\n");
575 return;
576 }
577 }
578
579 m = ipvs->sync_buff->mesg;
580 p = ipvs->sync_buff->head;
581 ipvs->sync_buff->head += pad + len;
582 m->size += pad + len;
583 /* Add any padding carried over from the previous sync_conn */
584 while (pad--)
585 *(p++) = 0;
586
587 s = (union ip_vs_sync_conn *)p;
588
589 /* Set message type & copy members */
590 s->v4.type = (cp->af == AF_INET6 ? STYPE_F_INET6 : 0);
591 s->v4.ver_size = htons(len & SVER_MASK); /* Version 0 */
592 s->v4.flags = htonl(cp->flags & ~IP_VS_CONN_F_HASHED);
593 s->v4.state = htons(cp->state);
594 s->v4.protocol = cp->protocol;
595 s->v4.cport = cp->cport;
596 s->v4.vport = cp->vport;
597 s->v4.dport = cp->dport;
598 s->v4.fwmark = htonl(cp->fwmark);
599 s->v4.timeout = htonl(cp->timeout / HZ);
600 m->nr_conns++;
601
602#ifdef CONFIG_IP_VS_IPV6
603 if (cp->af == AF_INET6) {
604 p += sizeof(struct ip_vs_sync_v6);
605 ipv6_addr_copy(&s->v6.caddr, &cp->caddr.in6);
606 ipv6_addr_copy(&s->v6.vaddr, &cp->vaddr.in6);
607 ipv6_addr_copy(&s->v6.daddr, &cp->daddr.in6);
608 } else
609#endif
610 {
611 p += sizeof(struct ip_vs_sync_v4); /* options ptr */
612 s->v4.caddr = cp->caddr.ip;
613 s->v4.vaddr = cp->vaddr.ip;
614 s->v4.daddr = cp->daddr.ip;
615 }
616 if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {
617 *(p++) = IPVS_OPT_SEQ_DATA;
618 *(p++) = sizeof(struct ip_vs_sync_conn_options);
619 hton_seq((struct ip_vs_seq *)p, &cp->in_seq);
620 p += sizeof(struct ip_vs_seq);
621 hton_seq((struct ip_vs_seq *)p, &cp->out_seq);
622 p += sizeof(struct ip_vs_seq);
623 }
624 /* Handle pe data */
625 if (cp->pe_data_len && cp->pe_data) {
626 *(p++) = IPVS_OPT_PE_DATA;
627 *(p++) = cp->pe_data_len;
628 memcpy(p, cp->pe_data, cp->pe_data_len);
629 p += cp->pe_data_len;
630 if (pe_name_len) {
631 /* Add PE_NAME */
632 *(p++) = IPVS_OPT_PE_NAME;
633 *(p++) = pe_name_len;
634 memcpy(p, cp->pe->name, pe_name_len);
635 p += pe_name_len;
636 }
637 }
638
639 spin_unlock(&ipvs->sync_buff_lock);
640
641control:
642 /* synchronize its controller if it has */
643 cp = cp->control;
644 if (!cp)
645 return;
646 /*
647 * Reduce sync rate for templates
648 * i.e. only increment in_pkts for Templates.
649 */
650 if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
651 int pkts = atomic_add_return(1, &cp->in_pkts);
652
653 if (pkts % ipvs->sysctl_sync_threshold[1] != 1)
654 return;
655 }
656 goto sloop;
289} 657}
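The pad computation above is what keeps every version 1 sync_conn 32-bit aligned within the buffer: (4 - head) & 3 yields zero to three filler bytes depending on where the previous entry ended. A tiny standalone check of the formula (offsets stand in for real buffer addresses):

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
            size_t off;

            for (off = 0; off < 8; off++) {
                    unsigned int pad = (4 - off) & 3;   /* bytes needed to reach the next 32-bit boundary */

                    printf("offset %zu -> pad %u -> next entry at %zu\n",
                           off, pad, off + pad);
            }
            return 0;
    }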
290 658
659/*
660 * fill_param used by version 1
661 */
291static inline int 662static inline int
292ip_vs_conn_fill_param_sync(int af, int protocol, 663ip_vs_conn_fill_param_sync(struct net *net, int af, union ip_vs_sync_conn *sc,
293 const union nf_inet_addr *caddr, __be16 cport, 664 struct ip_vs_conn_param *p,
294 const union nf_inet_addr *vaddr, __be16 vport, 665 __u8 *pe_data, unsigned int pe_data_len,
295 struct ip_vs_conn_param *p) 666 __u8 *pe_name, unsigned int pe_name_len)
296{ 667{
297 /* XXX: Need to take into account persistence engine */ 668#ifdef CONFIG_IP_VS_IPV6
298 ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p); 669 if (af == AF_INET6)
670 ip_vs_conn_fill_param(net, af, sc->v6.protocol,
671 (const union nf_inet_addr *)&sc->v6.caddr,
672 sc->v6.cport,
673 (const union nf_inet_addr *)&sc->v6.vaddr,
674 sc->v6.vport, p);
675 else
676#endif
677 ip_vs_conn_fill_param(net, af, sc->v4.protocol,
678 (const union nf_inet_addr *)&sc->v4.caddr,
679 sc->v4.cport,
680 (const union nf_inet_addr *)&sc->v4.vaddr,
681 sc->v4.vport, p);
682 /* Handle pe data */
683 if (pe_data_len) {
684 if (pe_name_len) {
685 char buff[IP_VS_PENAME_MAXLEN+1];
686
687 memcpy(buff, pe_name, pe_name_len);
688 buff[pe_name_len]=0;
689 p->pe = __ip_vs_pe_getbyname(buff);
690 if (!p->pe) {
691 IP_VS_DBG(3, "BACKUP, no %s engine found/loaded\n",
692 buff);
693 return 1;
694 }
695 } else {
696 IP_VS_ERR_RL("BACKUP, Invalid PE parameters\n");
697 return 1;
698 }
699
700 p->pe_data = kmalloc(pe_data_len, GFP_ATOMIC);
701 if (!p->pe_data) {
702 if (p->pe->module)
703 module_put(p->pe->module);
704 return -ENOMEM;
705 }
706 memcpy(p->pe_data, pe_data, pe_data_len);
707 p->pe_data_len = pe_data_len;
708 }
299 return 0; 709 return 0;
300} 710}
301 711
302/* 712/*
303 * Process received multicast message and create the corresponding 713 * Connection Add / Update.
304 * ip_vs_conn entries. 714 * Common for version 0 and 1 reception of backup sync_conns.
715 * Param: ...
716 * timeout is in sec.
305 */ 717 */
306static void ip_vs_process_message(const char *buffer, const size_t buflen) 718static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
719 unsigned int flags, unsigned int state,
720 unsigned int protocol, unsigned int type,
721 const union nf_inet_addr *daddr, __be16 dport,
722 unsigned long timeout, __u32 fwmark,
723 struct ip_vs_sync_conn_options *opt)
307{ 724{
308 struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
309 struct ip_vs_sync_conn *s;
310 struct ip_vs_sync_conn_options *opt;
311 struct ip_vs_conn *cp;
312 struct ip_vs_protocol *pp;
313 struct ip_vs_dest *dest; 725 struct ip_vs_dest *dest;
314 struct ip_vs_conn_param param; 726 struct ip_vs_conn *cp;
315 char *p; 727 struct netns_ipvs *ipvs = net_ipvs(net);
316 int i;
317 728
318 if (buflen < sizeof(struct ip_vs_sync_mesg)) { 729 if (!(flags & IP_VS_CONN_F_TEMPLATE))
319 IP_VS_ERR_RL("sync message header too short\n"); 730 cp = ip_vs_conn_in_get(param);
320 return; 731 else
321 } 732 cp = ip_vs_ct_in_get(param);
322 733
323 /* Convert size back to host byte order */ 734 if (cp && param->pe_data) /* Free pe_data */
324 m->size = ntohs(m->size); 735 kfree(param->pe_data);
736 if (!cp) {
737 /*
738 * Find the appropriate destination for the connection.
739 * If it is not found the connection will remain unbound
740 * but still handled.
741 */
742 dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
743 param->vport, protocol, fwmark);
325 744
326 if (buflen != m->size) { 745 /* Set the appropriate activity flag */
327 IP_VS_ERR_RL("bogus sync message size\n"); 746 if (protocol == IPPROTO_TCP) {
328 return; 747 if (state != IP_VS_TCP_S_ESTABLISHED)
748 flags |= IP_VS_CONN_F_INACTIVE;
749 else
750 flags &= ~IP_VS_CONN_F_INACTIVE;
751 } else if (protocol == IPPROTO_SCTP) {
752 if (state != IP_VS_SCTP_S_ESTABLISHED)
753 flags |= IP_VS_CONN_F_INACTIVE;
754 else
755 flags &= ~IP_VS_CONN_F_INACTIVE;
756 }
757 cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
758 if (dest)
759 atomic_dec(&dest->refcnt);
760 if (!cp) {
761 if (param->pe_data)
762 kfree(param->pe_data);
763 IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
764 return;
765 }
766 } else if (!cp->dest) {
767 dest = ip_vs_try_bind_dest(cp);
768 if (dest)
769 atomic_dec(&dest->refcnt);
770 } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
771 (cp->state != state)) {
772 /* update active/inactive flag for the connection */
773 dest = cp->dest;
774 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
775 (state != IP_VS_TCP_S_ESTABLISHED)) {
776 atomic_dec(&dest->activeconns);
777 atomic_inc(&dest->inactconns);
778 cp->flags |= IP_VS_CONN_F_INACTIVE;
779 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
780 (state == IP_VS_TCP_S_ESTABLISHED)) {
781 atomic_inc(&dest->activeconns);
782 atomic_dec(&dest->inactconns);
783 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
784 }
785 } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
786 (cp->state != state)) {
787 dest = cp->dest;
788 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
789 (state != IP_VS_SCTP_S_ESTABLISHED)) {
790 atomic_dec(&dest->activeconns);
791 atomic_inc(&dest->inactconns);
792 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
793 }
329 } 794 }
330 795
331 /* SyncID sanity check */ 796 if (opt)
332 if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { 797 memcpy(&cp->in_seq, opt, sizeof(*opt));
333 IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", 798 atomic_set(&cp->in_pkts, ipvs->sysctl_sync_threshold[0]);
334 m->syncid); 799 cp->state = state;
335 return; 800 cp->old_state = cp->state;
801 /*
802 * For Ver 0 messages style
803 * - Not possible to recover the right timeout for templates
804 * - can not find the right fwmark
805 * virtual service. If needed, we can do it for
806 * non-fwmark persistent services.
807 * Ver 1 messages style.
808 * - No problem.
809 */
810 if (timeout) {
811 if (timeout > MAX_SCHEDULE_TIMEOUT / HZ)
812 timeout = MAX_SCHEDULE_TIMEOUT / HZ;
813 cp->timeout = timeout*HZ;
814 } else {
815 struct ip_vs_proto_data *pd;
816
817 pd = ip_vs_proto_data_get(net, protocol);
818 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pd && pd->timeout_table)
819 cp->timeout = pd->timeout_table[state];
820 else
821 cp->timeout = (3*60*HZ);
336 } 822 }
823 ip_vs_conn_put(cp);
824}
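To summarize the timeout handling just above: the backup prefers the timeout carried in the message (version 1 sends it in seconds, clamped to MAX_SCHEDULE_TIMEOUT), and otherwise falls back to the per-protocol timeout table or a three minute default. A condensed userspace sketch of that decision, with HZ and the table as stand-ins for the kernel values:

    #include <stdio.h>

    #define HZ                      1000            /* stand-in for the kernel tick rate */
    #define MAX_SCHEDULE_TIMEOUT    (~0UL >> 1)

    static unsigned long pick_timeout(unsigned long msg_timeout_sec,
                                      const unsigned long *proto_table, int state)
    {
            if (msg_timeout_sec) {                  /* version 1: timeout came with the message */
                    if (msg_timeout_sec > MAX_SCHEDULE_TIMEOUT / HZ)
                            msg_timeout_sec = MAX_SCHEDULE_TIMEOUT / HZ;
                    return msg_timeout_sec * HZ;
            }
            if (proto_table)                        /* version 0: fall back to the protocol's table */
                    return proto_table[state];
            return 3 * 60 * HZ;                     /* last resort: three minutes */
    }

    int main(void)
    {
            unsigned long tcp_timeouts[1] = { 15 * 60 * HZ };       /* hypothetical table */

            printf("%lu\n", pick_timeout(0, tcp_timeouts, 0));      /* table lookup      */
            printf("%lu\n", pick_timeout(30, NULL, 0));             /* 30 s from message */
            return 0;
    }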
337 825
338 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); 826/*
827 * Process received multicast message for Version 0
828 */
829static void ip_vs_process_message_v0(struct net *net, const char *buffer,
830 const size_t buflen)
831{
832 struct ip_vs_sync_mesg_v0 *m = (struct ip_vs_sync_mesg_v0 *)buffer;
833 struct ip_vs_sync_conn_v0 *s;
834 struct ip_vs_sync_conn_options *opt;
835 struct ip_vs_protocol *pp;
836 struct ip_vs_conn_param param;
837 char *p;
838 int i;
839
840 p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
339 for (i=0; i<m->nr_conns; i++) { 841 for (i=0; i<m->nr_conns; i++) {
340 unsigned flags, state; 842 unsigned flags, state;
341 843
342 if (p + SIMPLE_CONN_SIZE > buffer+buflen) { 844 if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
343 IP_VS_ERR_RL("bogus conn in sync message\n"); 845 IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
344 return; 846 return;
345 } 847 }
346 s = (struct ip_vs_sync_conn *) p; 848 s = (struct ip_vs_sync_conn_v0 *) p;
347 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC; 849 flags = ntohs(s->flags) | IP_VS_CONN_F_SYNC;
348 flags &= ~IP_VS_CONN_F_HASHED; 850 flags &= ~IP_VS_CONN_F_HASHED;
349 if (flags & IP_VS_CONN_F_SEQ_MASK) { 851 if (flags & IP_VS_CONN_F_SEQ_MASK) {
350 opt = (struct ip_vs_sync_conn_options *)&s[1]; 852 opt = (struct ip_vs_sync_conn_options *)&s[1];
351 p += FULL_CONN_SIZE; 853 p += FULL_CONN_SIZE;
352 if (p > buffer+buflen) { 854 if (p > buffer+buflen) {
353 IP_VS_ERR_RL("bogus conn options in sync message\n"); 855 IP_VS_ERR_RL("BACKUP v0, Dropping buffer, bogus conn options\n");
354 return; 856 return;
355 } 857 }
356 } else { 858 } else {
@@ -362,118 +864,286 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen)
362 if (!(flags & IP_VS_CONN_F_TEMPLATE)) { 864 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
363 pp = ip_vs_proto_get(s->protocol); 865 pp = ip_vs_proto_get(s->protocol);
364 if (!pp) { 866 if (!pp) {
365 IP_VS_ERR_RL("Unsupported protocol %u in sync msg\n", 867 IP_VS_DBG(2, "BACKUP v0, Unsupported protocol %u\n",
366 s->protocol); 868 s->protocol);
367 continue; 869 continue;
368 } 870 }
369 if (state >= pp->num_states) { 871 if (state >= pp->num_states) {
370 IP_VS_DBG(2, "Invalid %s state %u in sync msg\n", 872 IP_VS_DBG(2, "BACKUP v0, Invalid %s state %u\n",
371 pp->name, state); 873 pp->name, state);
372 continue; 874 continue;
373 } 875 }
374 } else { 876 } else {
375 /* protocol in templates is not used for state/timeout */ 877 /* protocol in templates is not used for state/timeout */
376 pp = NULL;
377 if (state > 0) { 878 if (state > 0) {
378 IP_VS_DBG(2, "Invalid template state %u in sync msg\n", 879 IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
379 state); 880 state);
380 state = 0; 881 state = 0;
381 } 882 }
382 } 883 }
383 884
384 { 885 ip_vs_conn_fill_param(net, AF_INET, s->protocol,
385 if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol, 886 (const union nf_inet_addr *)&s->caddr,
386 (union nf_inet_addr *)&s->caddr, 887 s->cport,
387 s->cport, 888 (const union nf_inet_addr *)&s->vaddr,
388 (union nf_inet_addr *)&s->vaddr, 889 s->vport, &param);
389 s->vport, &param)) { 890
390 pr_err("ip_vs_conn_fill_param_sync failed"); 891 /* Send timeout as Zero */
391 return; 892 ip_vs_proc_conn(net, &param, flags, state, s->protocol, AF_INET,
893 (union nf_inet_addr *)&s->daddr, s->dport,
894 0, 0, opt);
895 }
896}
897
898/*
899 * Handle options
900 */
901static inline int ip_vs_proc_seqopt(__u8 *p, unsigned int plen,
902 __u32 *opt_flags,
903 struct ip_vs_sync_conn_options *opt)
904{
905 struct ip_vs_sync_conn_options *topt;
906
907 topt = (struct ip_vs_sync_conn_options *)p;
908
909 if (plen != sizeof(struct ip_vs_sync_conn_options)) {
910 IP_VS_DBG(2, "BACKUP, bogus conn options length\n");
911 return -EINVAL;
912 }
913 if (*opt_flags & IPVS_OPT_F_SEQ_DATA) {
914 IP_VS_DBG(2, "BACKUP, conn options found twice\n");
915 return -EINVAL;
916 }
917 ntoh_seq(&topt->in_seq, &opt->in_seq);
918 ntoh_seq(&topt->out_seq, &opt->out_seq);
919 *opt_flags |= IPVS_OPT_F_SEQ_DATA;
920 return 0;
921}
922
923static int ip_vs_proc_str(__u8 *p, unsigned int plen, unsigned int *data_len,
924 __u8 **data, unsigned int maxlen,
925 __u32 *opt_flags, __u32 flag)
926{
927 if (plen > maxlen) {
928 IP_VS_DBG(2, "BACKUP, bogus par.data len > %d\n", maxlen);
929 return -EINVAL;
930 }
931 if (*opt_flags & flag) {
932 IP_VS_DBG(2, "BACKUP, Par.data found twice 0x%x\n", flag);
933 return -EINVAL;
934 }
935 *data_len = plen;
936 *data = p;
937 *opt_flags |= flag;
938 return 0;
939}
940/*
941 * Process a Version 1 sync. connection
942 */
943static inline int ip_vs_proc_sync_conn(struct net *net, __u8 *p, __u8 *msg_end)
944{
945 struct ip_vs_sync_conn_options opt;
946 union ip_vs_sync_conn *s;
947 struct ip_vs_protocol *pp;
948 struct ip_vs_conn_param param;
949 __u32 flags;
950 unsigned int af, state, pe_data_len=0, pe_name_len=0;
951 __u8 *pe_data=NULL, *pe_name=NULL;
952 __u32 opt_flags=0;
953 int retc=0;
954
955 s = (union ip_vs_sync_conn *) p;
956
957 if (s->v6.type & STYPE_F_INET6) {
958#ifdef CONFIG_IP_VS_IPV6
959 af = AF_INET6;
960 p += sizeof(struct ip_vs_sync_v6);
961#else
962 IP_VS_DBG(3,"BACKUP, IPv6 msg received, and IPVS is not compiled for IPv6\n");
963 retc = 10;
964 goto out;
965#endif
966 } else if (!s->v4.type) {
967 af = AF_INET;
968 p += sizeof(struct ip_vs_sync_v4);
969 } else {
970 return -10;
971 }
972 if (p > msg_end)
973 return -20;
974
975 /* Process optional params check Type & Len. */
976 while (p < msg_end) {
977 int ptype;
978 int plen;
979
980 if (p+2 > msg_end)
981 return -30;
982 ptype = *(p++);
983 plen = *(p++);
984
985 if (!plen || ((p + plen) > msg_end))
986 return -40;
987 /* Handle seq option p = param data */
988 switch (ptype & ~IPVS_OPT_F_PARAM) {
989 case IPVS_OPT_SEQ_DATA:
990 if (ip_vs_proc_seqopt(p, plen, &opt_flags, &opt))
991 return -50;
992 break;
993
994 case IPVS_OPT_PE_DATA:
995 if (ip_vs_proc_str(p, plen, &pe_data_len, &pe_data,
996 IP_VS_PEDATA_MAXLEN, &opt_flags,
997 IPVS_OPT_F_PE_DATA))
998 return -60;
999 break;
1000
1001 case IPVS_OPT_PE_NAME:
1002 if (ip_vs_proc_str(p, plen,&pe_name_len, &pe_name,
1003 IP_VS_PENAME_MAXLEN, &opt_flags,
1004 IPVS_OPT_F_PE_NAME))
1005 return -70;
1006 break;
1007
1008 default:
1009 /* Param data mandatory ? */
1010 if (!(ptype & IPVS_OPT_F_PARAM)) {
1011 IP_VS_DBG(3, "BACKUP, Unknown mandatory param %d found\n",
1012 ptype & ~IPVS_OPT_F_PARAM);
1013 retc = 20;
1014 goto out;
392 } 1015 }
393 if (!(flags & IP_VS_CONN_F_TEMPLATE))
394 cp = ip_vs_conn_in_get(&param);
395 else
396 cp = ip_vs_ct_in_get(&param);
397 } 1016 }
398 if (!cp) { 1017 p += plen; /* Next option */
399 /* 1018 }
400 * Find the appropriate destination for the connection. 1019
401 * If it is not found the connection will remain unbound 1020 /* Get flags and Mask off unsupported */
402 * but still handled. 1021 flags = ntohl(s->v4.flags) & IP_VS_CONN_F_BACKUP_MASK;
403 */ 1022 flags |= IP_VS_CONN_F_SYNC;
404 dest = ip_vs_find_dest(AF_INET, 1023 state = ntohs(s->v4.state);
405 (union nf_inet_addr *)&s->daddr, 1024
406 s->dport, 1025 if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
407 (union nf_inet_addr *)&s->vaddr, 1026 pp = ip_vs_proto_get(s->v4.protocol);
408 s->vport, 1027 if (!pp) {
409 s->protocol); 1028 IP_VS_DBG(3,"BACKUP, Unsupported protocol %u\n",
410 /* Set the approprite ativity flag */ 1029 s->v4.protocol);
411 if (s->protocol == IPPROTO_TCP) { 1030 retc = 30;
412 if (state != IP_VS_TCP_S_ESTABLISHED) 1031 goto out;
413 flags |= IP_VS_CONN_F_INACTIVE; 1032 }
414 else 1033 if (state >= pp->num_states) {
415 flags &= ~IP_VS_CONN_F_INACTIVE; 1034 IP_VS_DBG(3, "BACKUP, Invalid %s state %u\n",
416 } else if (s->protocol == IPPROTO_SCTP) { 1035 pp->name, state);
417 if (state != IP_VS_SCTP_S_ESTABLISHED) 1036 retc = 40;
418 flags |= IP_VS_CONN_F_INACTIVE; 1037 goto out;
419 else 1038 }
420 flags &= ~IP_VS_CONN_F_INACTIVE; 1039 } else {
1040 /* protocol in templates is not used for state/timeout */
1041 if (state > 0) {
1042 IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
1043 state);
1044 state = 0;
1045 }
1046 }
1047 if (ip_vs_conn_fill_param_sync(net, af, s, &param, pe_data,
1048 pe_data_len, pe_name, pe_name_len)) {
1049 retc = 50;
1050 goto out;
1051 }
1052 /* If only IPv4 is supported, just silently skip IPv6 entries */
1053 if (af == AF_INET)
1054 ip_vs_proc_conn(net, &param, flags, state, s->v4.protocol, af,
1055 (union nf_inet_addr *)&s->v4.daddr, s->v4.dport,
1056 ntohl(s->v4.timeout), ntohl(s->v4.fwmark),
1057 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
1058 );
1059#ifdef CONFIG_IP_VS_IPV6
1060 else
1061 ip_vs_proc_conn(net, &param, flags, state, s->v6.protocol, af,
1062 (union nf_inet_addr *)&s->v6.daddr, s->v6.dport,
1063 ntohl(s->v6.timeout), ntohl(s->v6.fwmark),
1064 (opt_flags & IPVS_OPT_F_SEQ_DATA ? &opt : NULL)
1065 );
1066#endif
1067 return 0;
1068 /* Error exit */
1069out:
1070 IP_VS_DBG(2, "BACKUP, Single msg dropped err:%d\n", retc);
1071 return retc;
1072
1073}
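The option loop above is a plain type/length/value walk: one byte of type, one byte of length, then the payload, with unknown types skipped only when their IPVS_OPT_F_PARAM bit marks them as optional. A compact userspace sketch of the same walk (the type codes come from the defines earlier in this file; the sample buffer is invented):

    #include <stdio.h>
    #include <stdint.h>

    #define IPVS_OPT_SEQ_DATA       1
    #define IPVS_OPT_PE_DATA        2
    #define IPVS_OPT_PE_NAME        3
    #define IPVS_OPT_PARAM          7
    #define IPVS_OPT_F_PARAM        (1 << (IPVS_OPT_PARAM-1))   /* set: skippable parameter */

    static int walk_options(const uint8_t *p, const uint8_t *end)
    {
            while (p < end) {
                    uint8_t type, len;

                    if (end - p < 2)
                            return -1;              /* truncated option header */
                    type = *p++;
                    len  = *p++;
                    if (!len || end - p < len)
                            return -1;              /* zero length or truncated payload */

                    switch (type & ~IPVS_OPT_F_PARAM) {
                    case IPVS_OPT_SEQ_DATA:
                    case IPVS_OPT_PE_DATA:
                    case IPVS_OPT_PE_NAME:
                            printf("option %u, %u bytes\n", (unsigned)type, (unsigned)len);
                            break;
                    default:
                            if (!(type & IPVS_OPT_F_PARAM))
                                    return -1;      /* unknown and mandatory: drop */
                            break;                  /* unknown optional parameter: skip */
                    }
                    p += len;                       /* next option */
            }
            return 0;
    }

    int main(void)
    {
            /* PE_NAME "sip" followed by an unknown optional parameter */
            const uint8_t buf[] = { IPVS_OPT_PE_NAME, 3, 's', 'i', 'p',
                                    IPVS_OPT_F_PARAM | 5, 1, 0x00 };

            return walk_options(buf, buf + sizeof(buf)) ? 1 : 0;
    }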
1074/*
1075 * Process received multicast message and create the corresponding
1076 * ip_vs_conn entries.
1077 * Handles Version 0 & 1
1078 */
1079static void ip_vs_process_message(struct net *net, __u8 *buffer,
1080 const size_t buflen)
1081{
1082 struct netns_ipvs *ipvs = net_ipvs(net);
1083 struct ip_vs_sync_mesg *m2 = (struct ip_vs_sync_mesg *)buffer;
1084 __u8 *p, *msg_end;
1085 int i, nr_conns;
1086
1087 if (buflen < sizeof(struct ip_vs_sync_mesg_v0)) {
1088 IP_VS_DBG(2, "BACKUP, message header too short\n");
1089 return;
1090 }
1091 /* Convert size back to host byte order */
1092 m2->size = ntohs(m2->size);
1093
1094 if (buflen != m2->size) {
1095 IP_VS_DBG(2, "BACKUP, bogus message size\n");
1096 return;
1097 }
1098 /* SyncID sanity check */
1099 if (ipvs->backup_syncid != 0 && m2->syncid != ipvs->backup_syncid) {
1100 IP_VS_DBG(7, "BACKUP, Ignoring syncid = %d\n", m2->syncid);
1101 return;
1102 }
1103 /* Handle version 1 message */
1104 if ((m2->version == SYNC_PROTO_VER) && (m2->reserved == 0)
1105 && (m2->spare == 0)) {
1106
1107 msg_end = buffer + sizeof(struct ip_vs_sync_mesg);
1108 nr_conns = m2->nr_conns;
1109
1110 for (i=0; i<nr_conns; i++) {
1111 union ip_vs_sync_conn *s;
1112 unsigned size;
1113 int retc;
1114
1115 p = msg_end;
1116 if (p + sizeof(s->v4) > buffer+buflen) {
1117 IP_VS_ERR_RL("BACKUP, Dropping buffer, too small\n");
1118 return;
421 } 1119 }
422 cp = ip_vs_conn_new(&param, 1120 s = (union ip_vs_sync_conn *)p;
423 (union nf_inet_addr *)&s->daddr, 1121 size = ntohs(s->v4.ver_size) & SVER_MASK;
424 s->dport, flags, dest); 1122 msg_end = p + size;
425 if (dest) 1123 /* Basic sanity checks */
426 atomic_dec(&dest->refcnt); 1124 if (msg_end > buffer+buflen) {
427 if (!cp) { 1125 IP_VS_ERR_RL("BACKUP, Dropping buffer, msg > buffer\n");
428 pr_err("ip_vs_conn_new failed\n");
429 return; 1126 return;
430 } 1127 }
431 } else if (!cp->dest) { 1128 if (ntohs(s->v4.ver_size) >> SVER_SHIFT) {
432 dest = ip_vs_try_bind_dest(cp); 1129 IP_VS_ERR_RL("BACKUP, Dropping buffer, Unknown version %d\n",
433 if (dest) 1130 ntohs(s->v4.ver_size) >> SVER_SHIFT);
434 atomic_dec(&dest->refcnt); 1131 return;
435 } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
436 (cp->state != state)) {
437 /* update active/inactive flag for the connection */
438 dest = cp->dest;
439 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
440 (state != IP_VS_TCP_S_ESTABLISHED)) {
441 atomic_dec(&dest->activeconns);
442 atomic_inc(&dest->inactconns);
443 cp->flags |= IP_VS_CONN_F_INACTIVE;
444 } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
445 (state == IP_VS_TCP_S_ESTABLISHED)) {
446 atomic_inc(&dest->activeconns);
447 atomic_dec(&dest->inactconns);
448 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
449 } 1132 }
450 } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) && 1133 /* Process a single sync_conn */
451 (cp->state != state)) { 1134 retc = ip_vs_proc_sync_conn(net, p, msg_end);
452 dest = cp->dest; 1135 if (retc < 0) {
453 if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && 1136 IP_VS_ERR_RL("BACKUP, Dropping buffer, Err: %d in decoding\n",
454 (state != IP_VS_SCTP_S_ESTABLISHED)) { 1137 retc);
455 atomic_dec(&dest->activeconns); 1138 return;
456 atomic_inc(&dest->inactconns);
457 cp->flags &= ~IP_VS_CONN_F_INACTIVE;
458 } 1139 }
1140 /* Make sure we have 32 bit alignment */
1141 msg_end = p + ((size + 3) & ~3);
459 } 1142 }
460 1143 } else {
461 if (opt) 1144 /* Old type of message */
462 memcpy(&cp->in_seq, opt, sizeof(*opt)); 1145 ip_vs_process_message_v0(net, buffer, buflen);
463 atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); 1146 return;
464 cp->state = state;
465 cp->old_state = cp->state;
466 /*
467 * We can not recover the right timeout for templates
468 * in all cases, we can not find the right fwmark
469 * virtual service. If needed, we can do it for
470 * non-fwmark persistent services.
471 */
472 if (!(flags & IP_VS_CONN_F_TEMPLATE) && pp->timeout_table)
473 cp->timeout = pp->timeout_table[state];
474 else
475 cp->timeout = (3*60*HZ);
476 ip_vs_conn_put(cp);
477 } 1147 }
478} 1148}
479 1149
@@ -511,8 +1181,10 @@ static int set_mcast_if(struct sock *sk, char *ifname)
511{ 1181{
512 struct net_device *dev; 1182 struct net_device *dev;
513 struct inet_sock *inet = inet_sk(sk); 1183 struct inet_sock *inet = inet_sk(sk);
1184 struct net *net = sock_net(sk);
514 1185
515 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) 1186 dev = __dev_get_by_name(net, ifname);
1187 if (!dev)
516 return -ENODEV; 1188 return -ENODEV;
517 1189
518 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1190 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
@@ -531,30 +1203,33 @@ static int set_mcast_if(struct sock *sk, char *ifname)
531 * Set the maximum length of sync message according to the 1203 * Set the maximum length of sync message according to the
532 * specified interface's MTU. 1204 * specified interface's MTU.
533 */ 1205 */
534static int set_sync_mesg_maxlen(int sync_state) 1206static int set_sync_mesg_maxlen(struct net *net, int sync_state)
535{ 1207{
1208 struct netns_ipvs *ipvs = net_ipvs(net);
536 struct net_device *dev; 1209 struct net_device *dev;
537 int num; 1210 int num;
538 1211
539 if (sync_state == IP_VS_STATE_MASTER) { 1212 if (sync_state == IP_VS_STATE_MASTER) {
540 if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL) 1213 dev = __dev_get_by_name(net, ipvs->master_mcast_ifn);
1214 if (!dev)
541 return -ENODEV; 1215 return -ENODEV;
542 1216
543 num = (dev->mtu - sizeof(struct iphdr) - 1217 num = (dev->mtu - sizeof(struct iphdr) -
544 sizeof(struct udphdr) - 1218 sizeof(struct udphdr) -
545 SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; 1219 SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
546 sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN + 1220 ipvs->send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
547 SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF); 1221 SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
548 IP_VS_DBG(7, "setting the maximum length of sync sending " 1222 IP_VS_DBG(7, "setting the maximum length of sync sending "
549 "message %d.\n", sync_send_mesg_maxlen); 1223 "message %d.\n", ipvs->send_mesg_maxlen);
550 } else if (sync_state == IP_VS_STATE_BACKUP) { 1224 } else if (sync_state == IP_VS_STATE_BACKUP) {
551 if ((dev = __dev_get_by_name(&init_net, ip_vs_backup_mcast_ifn)) == NULL) 1225 dev = __dev_get_by_name(net, ipvs->backup_mcast_ifn);
1226 if (!dev)
552 return -ENODEV; 1227 return -ENODEV;
553 1228
554 sync_recv_mesg_maxlen = dev->mtu - 1229 ipvs->recv_mesg_maxlen = dev->mtu -
555 sizeof(struct iphdr) - sizeof(struct udphdr); 1230 sizeof(struct iphdr) - sizeof(struct udphdr);
556 IP_VS_DBG(7, "setting the maximum length of sync receiving " 1231 IP_VS_DBG(7, "setting the maximum length of sync receiving "
557 "message %d.\n", sync_recv_mesg_maxlen); 1232 "message %d.\n", ipvs->recv_mesg_maxlen);
558 } 1233 }
559 1234
560 return 0; 1235 return 0;
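For the master side, the arithmetic above simply asks how many version 0 connection entries fit into one multicast datagram once the IP and UDP headers, the 4-byte sync header, and a small slack are subtracted from the interface MTU. A standalone version of that computation (the MTU and the sizeof value are example inputs, not taken from a live device):

    #include <stdio.h>

    #define SYNC_MESG_HEADER_LEN    4
    #define MAX_CONNS_PER_SYNCBUFF  255
    #define SIMPLE_CONN_SIZE        24      /* assumed sizeof(struct ip_vs_sync_conn_v0) */
    #define IP_HDR_LEN              20
    #define UDP_HDR_LEN             8

    int main(void)
    {
            int mtu = 1500;                 /* example interface MTU */
            int num = (mtu - IP_HDR_LEN - UDP_HDR_LEN -
                       SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
            int conns = num < MAX_CONNS_PER_SYNCBUFF ? num : MAX_CONNS_PER_SYNCBUFF;
            int maxlen = SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * conns;

            printf("MTU %d -> %d conns per datagram, send_mesg_maxlen = %d\n",
                   mtu, conns, maxlen);
            return 0;
    }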
@@ -569,6 +1244,7 @@ static int set_sync_mesg_maxlen(int sync_state)
569static int 1244static int
570join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) 1245join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
571{ 1246{
1247 struct net *net = sock_net(sk);
572 struct ip_mreqn mreq; 1248 struct ip_mreqn mreq;
573 struct net_device *dev; 1249 struct net_device *dev;
574 int ret; 1250 int ret;
@@ -576,7 +1252,8 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
576 memset(&mreq, 0, sizeof(mreq)); 1252 memset(&mreq, 0, sizeof(mreq));
577 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); 1253 memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));
578 1254
579 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) 1255 dev = __dev_get_by_name(net, ifname);
1256 if (!dev)
580 return -ENODEV; 1257 return -ENODEV;
581 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) 1258 if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if)
582 return -EINVAL; 1259 return -EINVAL;
@@ -593,11 +1270,13 @@ join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
593 1270
594static int bind_mcastif_addr(struct socket *sock, char *ifname) 1271static int bind_mcastif_addr(struct socket *sock, char *ifname)
595{ 1272{
1273 struct net *net = sock_net(sock->sk);
596 struct net_device *dev; 1274 struct net_device *dev;
597 __be32 addr; 1275 __be32 addr;
598 struct sockaddr_in sin; 1276 struct sockaddr_in sin;
599 1277
600 if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL) 1278 dev = __dev_get_by_name(net, ifname);
1279 if (!dev)
601 return -ENODEV; 1280 return -ENODEV;
602 1281
603 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 1282 addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
@@ -619,8 +1298,9 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
619/* 1298/*
620 * Set up sending multicast socket over UDP 1299 * Set up sending multicast socket over UDP
621 */ 1300 */
622static struct socket * make_send_sock(void) 1301static struct socket *make_send_sock(struct net *net)
623{ 1302{
1303 struct netns_ipvs *ipvs = net_ipvs(net);
624 struct socket *sock; 1304 struct socket *sock;
625 int result; 1305 int result;
626 1306
@@ -631,7 +1311,7 @@ static struct socket * make_send_sock(void)
631 return ERR_PTR(result); 1311 return ERR_PTR(result);
632 } 1312 }
633 1313
634 result = set_mcast_if(sock->sk, ip_vs_master_mcast_ifn); 1314 result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn);
635 if (result < 0) { 1315 if (result < 0) {
636 pr_err("Error setting outbound mcast interface\n"); 1316 pr_err("Error setting outbound mcast interface\n");
637 goto error; 1317 goto error;
@@ -640,7 +1320,7 @@ static struct socket * make_send_sock(void)
640 set_mcast_loop(sock->sk, 0); 1320 set_mcast_loop(sock->sk, 0);
641 set_mcast_ttl(sock->sk, 1); 1321 set_mcast_ttl(sock->sk, 1);
642 1322
643 result = bind_mcastif_addr(sock, ip_vs_master_mcast_ifn); 1323 result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
644 if (result < 0) { 1324 if (result < 0) {
645 pr_err("Error binding address of the mcast interface\n"); 1325 pr_err("Error binding address of the mcast interface\n");
646 goto error; 1326 goto error;
@@ -664,8 +1344,9 @@ static struct socket * make_send_sock(void)
664/* 1344/*
665 * Set up receiving multicast socket over UDP 1345 * Set up receiving multicast socket over UDP
666 */ 1346 */
667static struct socket * make_receive_sock(void) 1347static struct socket *make_receive_sock(struct net *net)
668{ 1348{
1349 struct netns_ipvs *ipvs = net_ipvs(net);
669 struct socket *sock; 1350 struct socket *sock;
670 int result; 1351 int result;
671 1352
@@ -689,7 +1370,7 @@ static struct socket * make_receive_sock(void)
689 /* join the multicast group */ 1370 /* join the multicast group */
690 result = join_mcast_group(sock->sk, 1371 result = join_mcast_group(sock->sk,
691 (struct in_addr *) &mcast_addr.sin_addr, 1372 (struct in_addr *) &mcast_addr.sin_addr,
692 ip_vs_backup_mcast_ifn); 1373 ipvs->backup_mcast_ifn);
693 if (result < 0) { 1374 if (result < 0) {
694 pr_err("Error joining to the multicast group\n"); 1375 pr_err("Error joining to the multicast group\n");
695 goto error; 1376 goto error;
@@ -760,20 +1441,21 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
760static int sync_thread_master(void *data) 1441static int sync_thread_master(void *data)
761{ 1442{
762 struct ip_vs_sync_thread_data *tinfo = data; 1443 struct ip_vs_sync_thread_data *tinfo = data;
1444 struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
763 struct ip_vs_sync_buff *sb; 1445 struct ip_vs_sync_buff *sb;
764 1446
765 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, " 1447 pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
766 "syncid = %d\n", 1448 "syncid = %d\n",
767 ip_vs_master_mcast_ifn, ip_vs_master_syncid); 1449 ipvs->master_mcast_ifn, ipvs->master_syncid);
768 1450
769 while (!kthread_should_stop()) { 1451 while (!kthread_should_stop()) {
770 while ((sb = sb_dequeue())) { 1452 while ((sb = sb_dequeue(ipvs))) {
771 ip_vs_send_sync_msg(tinfo->sock, sb->mesg); 1453 ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
772 ip_vs_sync_buff_release(sb); 1454 ip_vs_sync_buff_release(sb);
773 } 1455 }
774 1456
775 /* check if entries stay in curr_sb for 2 seconds */ 1457 /* check if entries stay in ipvs->sync_buff for 2 seconds */
776 sb = get_curr_sync_buff(2 * HZ); 1458 sb = get_curr_sync_buff(ipvs, 2 * HZ);
777 if (sb) { 1459 if (sb) {
778 ip_vs_send_sync_msg(tinfo->sock, sb->mesg); 1460 ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
779 ip_vs_sync_buff_release(sb); 1461 ip_vs_sync_buff_release(sb);
@@ -783,14 +1465,13 @@ static int sync_thread_master(void *data)
783 } 1465 }
784 1466
785 /* clean up the sync_buff queue */ 1467 /* clean up the sync_buff queue */
786 while ((sb=sb_dequeue())) { 1468 while ((sb = sb_dequeue(ipvs)))
787 ip_vs_sync_buff_release(sb); 1469 ip_vs_sync_buff_release(sb);
788 }
789 1470
790 /* clean up the current sync_buff */ 1471 /* clean up the current sync_buff */
791 if ((sb = get_curr_sync_buff(0))) { 1472 sb = get_curr_sync_buff(ipvs, 0);
1473 if (sb)
792 ip_vs_sync_buff_release(sb); 1474 ip_vs_sync_buff_release(sb);
793 }
794 1475
795 /* release the sending multicast socket */ 1476 /* release the sending multicast socket */
796 sock_release(tinfo->sock); 1477 sock_release(tinfo->sock);
@@ -803,11 +1484,12 @@ static int sync_thread_master(void *data)
803static int sync_thread_backup(void *data) 1484static int sync_thread_backup(void *data)
804{ 1485{
805 struct ip_vs_sync_thread_data *tinfo = data; 1486 struct ip_vs_sync_thread_data *tinfo = data;
1487 struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
806 int len; 1488 int len;
807 1489
808 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, " 1490 pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
809 "syncid = %d\n", 1491 "syncid = %d\n",
810 ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); 1492 ipvs->backup_mcast_ifn, ipvs->backup_syncid);
811 1493
812 while (!kthread_should_stop()) { 1494 while (!kthread_should_stop()) {
813 wait_event_interruptible(*sk_sleep(tinfo->sock->sk), 1495 wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -817,7 +1499,7 @@ static int sync_thread_backup(void *data)
817 /* do we have data now? */ 1499 /* do we have data now? */
818 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) { 1500 while (!skb_queue_empty(&(tinfo->sock->sk->sk_receive_queue))) {
819 len = ip_vs_receive(tinfo->sock, tinfo->buf, 1501 len = ip_vs_receive(tinfo->sock, tinfo->buf,
820 sync_recv_mesg_maxlen); 1502 ipvs->recv_mesg_maxlen);
821 if (len <= 0) { 1503 if (len <= 0) {
822 pr_err("receiving message error\n"); 1504 pr_err("receiving message error\n");
823 break; 1505 break;
@@ -826,7 +1508,7 @@ static int sync_thread_backup(void *data)
826 /* disable bottom half, because it accesses the data 1508 /* disable bottom half, because it accesses the data
827 shared by softirq while getting/creating conns */ 1509 shared by softirq while getting/creating conns */
828 local_bh_disable(); 1510 local_bh_disable();
829 ip_vs_process_message(tinfo->buf, len); 1511 ip_vs_process_message(tinfo->net, tinfo->buf, len);
830 local_bh_enable(); 1512 local_bh_enable();
831 } 1513 }
832 } 1514 }
@@ -840,41 +1522,42 @@ static int sync_thread_backup(void *data)
840} 1522}
841 1523
842 1524
843int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) 1525int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
844{ 1526{
845 struct ip_vs_sync_thread_data *tinfo; 1527 struct ip_vs_sync_thread_data *tinfo;
846 struct task_struct **realtask, *task; 1528 struct task_struct **realtask, *task;
847 struct socket *sock; 1529 struct socket *sock;
1530 struct netns_ipvs *ipvs = net_ipvs(net);
848 char *name, *buf = NULL; 1531 char *name, *buf = NULL;
849 int (*threadfn)(void *data); 1532 int (*threadfn)(void *data);
850 int result = -ENOMEM; 1533 int result = -ENOMEM;
851 1534
852 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1535 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
853 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n", 1536 IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
854 sizeof(struct ip_vs_sync_conn)); 1537 sizeof(struct ip_vs_sync_conn_v0));
855 1538
856 if (state == IP_VS_STATE_MASTER) { 1539 if (state == IP_VS_STATE_MASTER) {
857 if (sync_master_thread) 1540 if (ipvs->master_thread)
858 return -EEXIST; 1541 return -EEXIST;
859 1542
860 strlcpy(ip_vs_master_mcast_ifn, mcast_ifn, 1543 strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
861 sizeof(ip_vs_master_mcast_ifn)); 1544 sizeof(ipvs->master_mcast_ifn));
862 ip_vs_master_syncid = syncid; 1545 ipvs->master_syncid = syncid;
863 realtask = &sync_master_thread; 1546 realtask = &ipvs->master_thread;
864 name = "ipvs_syncmaster"; 1547 name = "ipvs_master:%d";
865 threadfn = sync_thread_master; 1548 threadfn = sync_thread_master;
866 sock = make_send_sock(); 1549 sock = make_send_sock(net);
867 } else if (state == IP_VS_STATE_BACKUP) { 1550 } else if (state == IP_VS_STATE_BACKUP) {
868 if (sync_backup_thread) 1551 if (ipvs->backup_thread)
869 return -EEXIST; 1552 return -EEXIST;
870 1553
871 strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn, 1554 strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
872 sizeof(ip_vs_backup_mcast_ifn)); 1555 sizeof(ipvs->backup_mcast_ifn));
873 ip_vs_backup_syncid = syncid; 1556 ipvs->backup_syncid = syncid;
874 realtask = &sync_backup_thread; 1557 realtask = &ipvs->backup_thread;
875 name = "ipvs_syncbackup"; 1558 name = "ipvs_backup:%d";
876 threadfn = sync_thread_backup; 1559 threadfn = sync_thread_backup;
877 sock = make_receive_sock(); 1560 sock = make_receive_sock(net);
878 } else { 1561 } else {
879 return -EINVAL; 1562 return -EINVAL;
880 } 1563 }
@@ -884,9 +1567,9 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
884 goto out; 1567 goto out;
885 } 1568 }
886 1569
887 set_sync_mesg_maxlen(state); 1570 set_sync_mesg_maxlen(net, state);
888 if (state == IP_VS_STATE_BACKUP) { 1571 if (state == IP_VS_STATE_BACKUP) {
889 buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL); 1572 buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL);
890 if (!buf) 1573 if (!buf)
891 goto outsocket; 1574 goto outsocket;
892 } 1575 }
@@ -895,10 +1578,11 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
895 if (!tinfo) 1578 if (!tinfo)
896 goto outbuf; 1579 goto outbuf;
897 1580
1581 tinfo->net = net;
898 tinfo->sock = sock; 1582 tinfo->sock = sock;
899 tinfo->buf = buf; 1583 tinfo->buf = buf;
900 1584
901 task = kthread_run(threadfn, tinfo, name); 1585 task = kthread_run(threadfn, tinfo, name, ipvs->gen);
902 if (IS_ERR(task)) { 1586 if (IS_ERR(task)) {
903 result = PTR_ERR(task); 1587 result = PTR_ERR(task);
904 goto outtinfo; 1588 goto outtinfo;
@@ -906,7 +1590,7 @@ int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
906 1590
907 /* mark as active */ 1591 /* mark as active */
908 *realtask = task; 1592 *realtask = task;
909 ip_vs_sync_state |= state; 1593 ipvs->sync_state |= state;
910 1594
911 /* increase the module use count */ 1595 /* increase the module use count */
912 ip_vs_use_count_inc(); 1596 ip_vs_use_count_inc();
@@ -924,16 +1608,18 @@ out:
924} 1608}
925 1609
926 1610
927int stop_sync_thread(int state) 1611int stop_sync_thread(struct net *net, int state)
928{ 1612{
1613 struct netns_ipvs *ipvs = net_ipvs(net);
1614
929 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); 1615 IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
930 1616
931 if (state == IP_VS_STATE_MASTER) { 1617 if (state == IP_VS_STATE_MASTER) {
932 if (!sync_master_thread) 1618 if (!ipvs->master_thread)
933 return -ESRCH; 1619 return -ESRCH;
934 1620
935 pr_info("stopping master sync thread %d ...\n", 1621 pr_info("stopping master sync thread %d ...\n",
936 task_pid_nr(sync_master_thread)); 1622 task_pid_nr(ipvs->master_thread));
937 1623
938 /* 1624 /*
939 * The lock synchronizes with sb_queue_tail(), so that we don't 1625 * The lock synchronizes with sb_queue_tail(), so that we don't
@@ -941,21 +1627,21 @@ int stop_sync_thread(int state)
941 * progress of stopping the master sync daemon. 1627 * progress of stopping the master sync daemon.
942 */ 1628 */
943 1629
944 spin_lock_bh(&ip_vs_sync_lock); 1630 spin_lock_bh(&ipvs->sync_lock);
945 ip_vs_sync_state &= ~IP_VS_STATE_MASTER; 1631 ipvs->sync_state &= ~IP_VS_STATE_MASTER;
946 spin_unlock_bh(&ip_vs_sync_lock); 1632 spin_unlock_bh(&ipvs->sync_lock);
947 kthread_stop(sync_master_thread); 1633 kthread_stop(ipvs->master_thread);
948 sync_master_thread = NULL; 1634 ipvs->master_thread = NULL;
949 } else if (state == IP_VS_STATE_BACKUP) { 1635 } else if (state == IP_VS_STATE_BACKUP) {
950 if (!sync_backup_thread) 1636 if (!ipvs->backup_thread)
951 return -ESRCH; 1637 return -ESRCH;
952 1638
953 pr_info("stopping backup sync thread %d ...\n", 1639 pr_info("stopping backup sync thread %d ...\n",
954 task_pid_nr(sync_backup_thread)); 1640 task_pid_nr(ipvs->backup_thread));
955 1641
956 ip_vs_sync_state &= ~IP_VS_STATE_BACKUP; 1642 ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
957 kthread_stop(sync_backup_thread); 1643 kthread_stop(ipvs->backup_thread);
958 sync_backup_thread = NULL; 1644 ipvs->backup_thread = NULL;
959 } else { 1645 } else {
960 return -EINVAL; 1646 return -EINVAL;
961 } 1647 }
@@ -965,3 +1651,42 @@ int stop_sync_thread(int state)
965 1651
966 return 0; 1652 return 0;
967} 1653}
1654
1655/*
1656 * Initialize data struct for each netns
1657 */
1658static int __net_init __ip_vs_sync_init(struct net *net)
1659{
1660 struct netns_ipvs *ipvs = net_ipvs(net);
1661
1662 INIT_LIST_HEAD(&ipvs->sync_queue);
1663 spin_lock_init(&ipvs->sync_lock);
1664 spin_lock_init(&ipvs->sync_buff_lock);
1665
1666 ipvs->sync_mcast_addr.sin_family = AF_INET;
1667 ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT);
1668 ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP);
1669 return 0;
1670}
1671
1672static void __ip_vs_sync_cleanup(struct net *net)
1673{
1674 stop_sync_thread(net, IP_VS_STATE_MASTER);
1675 stop_sync_thread(net, IP_VS_STATE_BACKUP);
1676}
1677
1678static struct pernet_operations ipvs_sync_ops = {
1679 .init = __ip_vs_sync_init,
1680 .exit = __ip_vs_sync_cleanup,
1681};
1682
1683
1684int __init ip_vs_sync_init(void)
1685{
1686 return register_pernet_subsys(&ipvs_sync_ops);
1687}
1688
1689void __exit ip_vs_sync_cleanup(void)
1690{
1691 unregister_pernet_subsys(&ipvs_sync_ops);
1692}
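The ip_vs_sync.c portion above is a straight netns conversion: the old file-scope globals (mcast interface names, sync ids, buffers, thread pointers and locks) move into struct netns_ipvs, every helper gains a struct net argument, kthread_run() now builds per-namespace thread names from "ipvs_master:%d"/"ipvs_backup:%d" plus ipvs->gen, and a pernet_operations pair is registered so each namespace gets its own sync state and has both sync threads stopped on teardown. As a rough sketch of that registration pattern only (struct foo_net, foo_net_ops and the counter field are hypothetical, not part of this patch), a minimal per-net subsystem looks like:

#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/* hypothetical per-namespace state */
struct foo_net {
        int counter;
};

static int foo_net_id __read_mostly;

static int __net_init foo_net_init(struct net *net)
{
        struct foo_net *fn = net_generic(net, foo_net_id);

        fn->counter = 0;        /* initialise this namespace's copy */
        return 0;
}

static void __net_exit foo_net_exit(struct net *net)
{
        /* undo whatever foo_net_init() set up for this namespace */
}

static struct pernet_operations foo_net_ops = {
        .init = foo_net_init,
        .exit = foo_net_exit,
        .id   = &foo_net_id,
        .size = sizeof(struct foo_net),
};

static int __init foo_module_init(void)
{
        return register_pernet_subsys(&foo_net_ops);
}

static void __exit foo_module_exit(void)
{
        unregister_pernet_subsys(&foo_net_ops);
}

module_init(foo_module_init);
module_exit(foo_module_exit);
MODULE_LICENSE("GPL");

IPVS itself keeps its state reachable through net_ipvs(net) rather than the net_generic() id/size mechanism shown here, but the init/exit lifecycle is the same.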
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 5325a3fbe4ac..1f2a4e35fb11 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -175,7 +175,6 @@ __ip_vs_reroute_locally(struct sk_buff *skb)
175 .fl4_tos = RT_TOS(iph->tos), 175 .fl4_tos = RT_TOS(iph->tos),
176 .mark = skb->mark, 176 .mark = skb->mark,
177 }; 177 };
178 struct rtable *rt;
179 178
180 if (ip_route_output_key(net, &rt, &fl)) 179 if (ip_route_output_key(net, &rt, &fl))
181 return 0; 180 return 0;
@@ -390,7 +389,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
390 389
391 /* MTU checking */ 390 /* MTU checking */
392 mtu = dst_mtu(&rt->dst); 391 mtu = dst_mtu(&rt->dst);
393 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { 392 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
393 !skb_is_gso(skb)) {
394 ip_rt_put(rt); 394 ip_rt_put(rt);
395 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 395 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
396 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 396 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -443,7 +443,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
443 443
444 /* MTU checking */ 444 /* MTU checking */
445 mtu = dst_mtu(&rt->dst); 445 mtu = dst_mtu(&rt->dst);
446 if (skb->len > mtu) { 446 if (skb->len > mtu && !skb_is_gso(skb)) {
447 if (!skb->dev) { 447 if (!skb->dev) {
448 struct net *net = dev_net(skb_dst(skb)->dev); 448 struct net *net = dev_net(skb_dst(skb)->dev);
449 449
@@ -543,7 +543,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
543 543
544 /* MTU checking */ 544 /* MTU checking */
545 mtu = dst_mtu(&rt->dst); 545 mtu = dst_mtu(&rt->dst);
546 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { 546 if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF)) &&
547 !skb_is_gso(skb)) {
547 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 548 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
548 IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, 549 IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0,
549 "ip_vs_nat_xmit(): frag needed for"); 550 "ip_vs_nat_xmit(): frag needed for");
@@ -658,7 +659,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
658 659
659 /* MTU checking */ 660 /* MTU checking */
660 mtu = dst_mtu(&rt->dst); 661 mtu = dst_mtu(&rt->dst);
661 if (skb->len > mtu) { 662 if (skb->len > mtu && !skb_is_gso(skb)) {
662 if (!skb->dev) { 663 if (!skb->dev) {
663 struct net *net = dev_net(skb_dst(skb)->dev); 664 struct net *net = dev_net(skb_dst(skb)->dev);
664 665
@@ -773,8 +774,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
773 774
774 df |= (old_iph->frag_off & htons(IP_DF)); 775 df |= (old_iph->frag_off & htons(IP_DF));
775 776
776 if ((old_iph->frag_off & htons(IP_DF)) 777 if ((old_iph->frag_off & htons(IP_DF) &&
777 && mtu < ntohs(old_iph->tot_len)) { 778 mtu < ntohs(old_iph->tot_len) && !skb_is_gso(skb))) {
778 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 779 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
779 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 780 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
780 goto tx_error_put; 781 goto tx_error_put;
@@ -886,7 +887,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
886 if (skb_dst(skb)) 887 if (skb_dst(skb))
887 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); 888 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
888 889
889 if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { 890 if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) &&
891 !skb_is_gso(skb)) {
890 if (!skb->dev) { 892 if (!skb->dev) {
891 struct net *net = dev_net(skb_dst(skb)->dev); 893 struct net *net = dev_net(skb_dst(skb)->dev);
892 894
@@ -991,7 +993,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
991 993
992 /* MTU checking */ 994 /* MTU checking */
993 mtu = dst_mtu(&rt->dst); 995 mtu = dst_mtu(&rt->dst);
994 if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) { 996 if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu &&
997 !skb_is_gso(skb)) {
995 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); 998 icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
996 ip_rt_put(rt); 999 ip_rt_put(rt);
997 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 1000 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
@@ -1158,7 +1161,8 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
1158 1161
1159 /* MTU checking */ 1162 /* MTU checking */
1160 mtu = dst_mtu(&rt->dst); 1163 mtu = dst_mtu(&rt->dst);
1161 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { 1164 if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF)) &&
1165 !skb_is_gso(skb)) {
1162 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); 1166 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
1163 IP_VS_DBG_RL("%s(): frag needed\n", __func__); 1167 IP_VS_DBG_RL("%s(): frag needed\n", __func__);
1164 goto tx_error_put; 1168 goto tx_error_put;
@@ -1272,7 +1276,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
1272 1276
1273 /* MTU checking */ 1277 /* MTU checking */
1274 mtu = dst_mtu(&rt->dst); 1278 mtu = dst_mtu(&rt->dst);
1275 if (skb->len > mtu) { 1279 if (skb->len > mtu && !skb_is_gso(skb)) {
1276 if (!skb->dev) { 1280 if (!skb->dev) {
1277 struct net *net = dev_net(skb_dst(skb)->dev); 1281 struct net *net = dev_net(skb_dst(skb)->dev);
1278 1282
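Every MTU check in ip_vs_xmit.c now also requires !skb_is_gso(skb) before bouncing the packet with ICMP_FRAG_NEEDED (or the ICMPv6 equivalent): a GSO skb is allowed to be larger than the route MTU because it is segmented further down the stack, so it must be forwarded rather than rejected. Restated as a stand-alone IPv4 helper, purely to spell the condition out (this function does not exist in the patch):

#include <linux/skbuff.h>
#include <linux/ip.h>
#include <net/dst.h>
#include <net/ip.h>

/* True when the packet genuinely needs fragmentation and may be rejected:
 * longer than the path MTU, DF set, and not a GSO skb (GSO frames are
 * segmented later and must be let through). */
static bool ipvs_frag_needed(const struct sk_buff *skb,
                             const struct dst_entry *dst)
{
        const struct iphdr *iph = ip_hdr(skb);

        return skb->len > dst_mtu(dst) &&
               (iph->frag_off & htons(IP_DF)) &&
               !skb_is_gso(skb);
}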
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
new file mode 100644
index 000000000000..4e99cca61612
--- /dev/null
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -0,0 +1,82 @@
1/*
2 * broadcast connection tracking helper
3 *
4 * (c) 2005 Patrick McHardy <kaber@trash.net>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/module.h>
13#include <linux/ip.h>
14#include <net/route.h>
15#include <linux/inetdevice.h>
16#include <linux/skbuff.h>
17
18#include <net/netfilter/nf_conntrack.h>
19#include <net/netfilter/nf_conntrack_helper.h>
20#include <net/netfilter/nf_conntrack_expect.h>
21
22int nf_conntrack_broadcast_help(struct sk_buff *skb,
23 unsigned int protoff,
24 struct nf_conn *ct,
25 enum ip_conntrack_info ctinfo,
26 unsigned int timeout)
27{
28 struct nf_conntrack_expect *exp;
29 struct iphdr *iph = ip_hdr(skb);
30 struct rtable *rt = skb_rtable(skb);
31 struct in_device *in_dev;
32 struct nf_conn_help *help = nfct_help(ct);
33 __be32 mask = 0;
34
35 /* we're only interested in locally generated packets */
36 if (skb->sk == NULL)
37 goto out;
38 if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
39 goto out;
40 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
41 goto out;
42
43 rcu_read_lock();
44 in_dev = __in_dev_get_rcu(rt->dst.dev);
45 if (in_dev != NULL) {
46 for_primary_ifa(in_dev) {
47 if (ifa->ifa_broadcast == iph->daddr) {
48 mask = ifa->ifa_mask;
49 break;
50 }
51 } endfor_ifa(in_dev);
52 }
53 rcu_read_unlock();
54
55 if (mask == 0)
56 goto out;
57
58 exp = nf_ct_expect_alloc(ct);
59 if (exp == NULL)
60 goto out;
61
62 exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
63 exp->tuple.src.u.udp.port = help->helper->tuple.src.u.udp.port;
64
65 exp->mask.src.u3.ip = mask;
66 exp->mask.src.u.udp.port = htons(0xFFFF);
67
68 exp->expectfn = NULL;
69 exp->flags = NF_CT_EXPECT_PERMANENT;
70 exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
71 exp->helper = NULL;
72
73 nf_ct_expect_related(exp);
74 nf_ct_expect_put(exp);
75
76 nf_ct_refresh(ct, skb, timeout * HZ);
77out:
78 return NF_ACCEPT;
79}
80EXPORT_SYMBOL_GPL(nf_conntrack_broadcast_help);
81
82MODULE_LICENSE("GPL");
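nf_conntrack_broadcast_help() factors out the expectation setup that previously lived inside the netbios-ns helper: for a locally generated broadcast it looks up the primary address whose broadcast matches the destination, then installs a permanent expectation covering that subnet on the registering helper's own UDP port. The netbios-ns conversion further down in this diff is the in-tree user; a hypothetical additional user would only need a port, a timeout and the usual helper registration, roughly as below (the foo names and port number are made up, and the prototype is assumed to be exported through the conntrack helper headers):

#include <linux/module.h>
#include <linux/in.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_expect.h>

#define FOO_PORT 12345                  /* hypothetical UDP service port */

static struct nf_conntrack_expect_policy foo_exp_policy = {
        .max_expected = 1,
};

static int foo_help(struct sk_buff *skb, unsigned int protoff,
                    struct nf_conn *ct, enum ip_conntrack_info ctinfo)
{
        /* expect broadcast replies for 3 seconds, as netbios-ns does */
        return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, 3);
}

static struct nf_conntrack_helper foo_helper __read_mostly = {
        .name                 = "foo-bcast",
        .tuple.src.l3num      = NFPROTO_IPV4,
        .tuple.src.u.udp.port = cpu_to_be16(FOO_PORT),
        .tuple.dst.protonum   = IPPROTO_UDP,
        .me                   = THIS_MODULE,
        .help                 = foo_help,
        .expect_policy        = &foo_exp_policy,
};

static int __init foo_init(void)
{
        return nf_conntrack_helper_register(&foo_helper);
}

static void __exit foo_exit(void)
{
        nf_conntrack_helper_unregister(&foo_helper);
}

module_init(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");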
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index e61511929c66..1909311c392a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -43,6 +43,7 @@
43#include <net/netfilter/nf_conntrack_acct.h> 43#include <net/netfilter/nf_conntrack_acct.h>
44#include <net/netfilter/nf_conntrack_ecache.h> 44#include <net/netfilter/nf_conntrack_ecache.h>
45#include <net/netfilter/nf_conntrack_zones.h> 45#include <net/netfilter/nf_conntrack_zones.h>
46#include <net/netfilter/nf_conntrack_timestamp.h>
46#include <net/netfilter/nf_nat.h> 47#include <net/netfilter/nf_nat.h>
47#include <net/netfilter/nf_nat_core.h> 48#include <net/netfilter/nf_nat_core.h>
48 49
@@ -282,6 +283,11 @@ EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
282static void death_by_timeout(unsigned long ul_conntrack) 283static void death_by_timeout(unsigned long ul_conntrack)
283{ 284{
284 struct nf_conn *ct = (void *)ul_conntrack; 285 struct nf_conn *ct = (void *)ul_conntrack;
286 struct nf_conn_tstamp *tstamp;
287
288 tstamp = nf_conn_tstamp_find(ct);
289 if (tstamp && tstamp->stop == 0)
290 tstamp->stop = ktime_to_ns(ktime_get_real());
285 291
286 if (!test_bit(IPS_DYING_BIT, &ct->status) && 292 if (!test_bit(IPS_DYING_BIT, &ct->status) &&
287 unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) { 293 unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
@@ -419,6 +425,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
419 struct nf_conntrack_tuple_hash *h; 425 struct nf_conntrack_tuple_hash *h;
420 struct nf_conn *ct; 426 struct nf_conn *ct;
421 struct nf_conn_help *help; 427 struct nf_conn_help *help;
428 struct nf_conn_tstamp *tstamp;
422 struct hlist_nulls_node *n; 429 struct hlist_nulls_node *n;
423 enum ip_conntrack_info ctinfo; 430 enum ip_conntrack_info ctinfo;
424 struct net *net; 431 struct net *net;
@@ -486,8 +493,16 @@ __nf_conntrack_confirm(struct sk_buff *skb)
486 ct->timeout.expires += jiffies; 493 ct->timeout.expires += jiffies;
487 add_timer(&ct->timeout); 494 add_timer(&ct->timeout);
488 atomic_inc(&ct->ct_general.use); 495 atomic_inc(&ct->ct_general.use);
489 set_bit(IPS_CONFIRMED_BIT, &ct->status); 496 ct->status |= IPS_CONFIRMED;
497
498 /* set conntrack timestamp, if enabled. */
499 tstamp = nf_conn_tstamp_find(ct);
500 if (tstamp) {
501 if (skb->tstamp.tv64 == 0)
502 __net_timestamp((struct sk_buff *)skb);
490 503
504 tstamp->start = ktime_to_ns(skb->tstamp);
505 }
491 /* Since the lookup is lockless, hash insertion must be done after 506 /* Since the lookup is lockless, hash insertion must be done after
492 * starting the timer and setting the CONFIRMED bit. The RCU barriers 507 * starting the timer and setting the CONFIRMED bit. The RCU barriers
493 * guarantee that no other CPU can find the conntrack before the above 508 * guarantee that no other CPU can find the conntrack before the above
@@ -655,7 +670,8 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
655 * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged. 670 * and ct->tuplehash[IP_CT_DIR_REPLY].hnnode.next unchanged.
656 */ 671 */
657 memset(&ct->tuplehash[IP_CT_DIR_MAX], 0, 672 memset(&ct->tuplehash[IP_CT_DIR_MAX], 0,
658 sizeof(*ct) - offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX])); 673 offsetof(struct nf_conn, proto) -
674 offsetof(struct nf_conn, tuplehash[IP_CT_DIR_MAX]));
659 spin_lock_init(&ct->lock); 675 spin_lock_init(&ct->lock);
660 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; 676 ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
661 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; 677 ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL;
@@ -745,6 +761,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
745 } 761 }
746 762
747 nf_ct_acct_ext_add(ct, GFP_ATOMIC); 763 nf_ct_acct_ext_add(ct, GFP_ATOMIC);
764 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
748 765
749 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL; 766 ecache = tmpl ? nf_ct_ecache_find(tmpl) : NULL;
750 nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0, 767 nf_ct_ecache_ext_add(ct, ecache ? ecache->ctmask : 0,
@@ -1185,6 +1202,11 @@ struct __nf_ct_flush_report {
1185static int kill_report(struct nf_conn *i, void *data) 1202static int kill_report(struct nf_conn *i, void *data)
1186{ 1203{
1187 struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data; 1204 struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
1205 struct nf_conn_tstamp *tstamp;
1206
1207 tstamp = nf_conn_tstamp_find(i);
1208 if (tstamp && tstamp->stop == 0)
1209 tstamp->stop = ktime_to_ns(ktime_get_real());
1188 1210
1189 /* If we fail to deliver the event, death_by_timeout() will retry */ 1211 /* If we fail to deliver the event, death_by_timeout() will retry */
1190 if (nf_conntrack_event_report(IPCT_DESTROY, i, 1212 if (nf_conntrack_event_report(IPCT_DESTROY, i,
@@ -1201,9 +1223,9 @@ static int kill_all(struct nf_conn *i, void *data)
1201 return 1; 1223 return 1;
1202} 1224}
1203 1225
1204void nf_ct_free_hashtable(void *hash, int vmalloced, unsigned int size) 1226void nf_ct_free_hashtable(void *hash, unsigned int size)
1205{ 1227{
1206 if (vmalloced) 1228 if (is_vmalloc_addr(hash))
1207 vfree(hash); 1229 vfree(hash);
1208 else 1230 else
1209 free_pages((unsigned long)hash, 1231 free_pages((unsigned long)hash,
@@ -1270,8 +1292,7 @@ static void nf_conntrack_cleanup_net(struct net *net)
1270 goto i_see_dead_people; 1292 goto i_see_dead_people;
1271 } 1293 }
1272 1294
1273 nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, 1295 nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
1274 net->ct.htable_size);
1275 nf_conntrack_ecache_fini(net); 1296 nf_conntrack_ecache_fini(net);
1276 nf_conntrack_acct_fini(net); 1297 nf_conntrack_acct_fini(net);
1277 nf_conntrack_expect_fini(net); 1298 nf_conntrack_expect_fini(net);
@@ -1300,21 +1321,18 @@ void nf_conntrack_cleanup(struct net *net)
1300 } 1321 }
1301} 1322}
1302 1323
1303void *nf_ct_alloc_hashtable(unsigned int *sizep, int *vmalloced, int nulls) 1324void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
1304{ 1325{
1305 struct hlist_nulls_head *hash; 1326 struct hlist_nulls_head *hash;
1306 unsigned int nr_slots, i; 1327 unsigned int nr_slots, i;
1307 size_t sz; 1328 size_t sz;
1308 1329
1309 *vmalloced = 0;
1310
1311 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); 1330 BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
1312 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); 1331 nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
1313 sz = nr_slots * sizeof(struct hlist_nulls_head); 1332 sz = nr_slots * sizeof(struct hlist_nulls_head);
1314 hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, 1333 hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
1315 get_order(sz)); 1334 get_order(sz));
1316 if (!hash) { 1335 if (!hash) {
1317 *vmalloced = 1;
1318 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n"); 1336 printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
1319 hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 1337 hash = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1320 PAGE_KERNEL); 1338 PAGE_KERNEL);
@@ -1330,7 +1348,7 @@ EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
1330 1348
1331int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) 1349int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1332{ 1350{
1333 int i, bucket, vmalloced, old_vmalloced; 1351 int i, bucket;
1334 unsigned int hashsize, old_size; 1352 unsigned int hashsize, old_size;
1335 struct hlist_nulls_head *hash, *old_hash; 1353 struct hlist_nulls_head *hash, *old_hash;
1336 struct nf_conntrack_tuple_hash *h; 1354 struct nf_conntrack_tuple_hash *h;
@@ -1347,7 +1365,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1347 if (!hashsize) 1365 if (!hashsize)
1348 return -EINVAL; 1366 return -EINVAL;
1349 1367
1350 hash = nf_ct_alloc_hashtable(&hashsize, &vmalloced, 1); 1368 hash = nf_ct_alloc_hashtable(&hashsize, 1);
1351 if (!hash) 1369 if (!hash)
1352 return -ENOMEM; 1370 return -ENOMEM;
1353 1371
@@ -1369,15 +1387,13 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
1369 } 1387 }
1370 } 1388 }
1371 old_size = init_net.ct.htable_size; 1389 old_size = init_net.ct.htable_size;
1372 old_vmalloced = init_net.ct.hash_vmalloc;
1373 old_hash = init_net.ct.hash; 1390 old_hash = init_net.ct.hash;
1374 1391
1375 init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; 1392 init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
1376 init_net.ct.hash_vmalloc = vmalloced;
1377 init_net.ct.hash = hash; 1393 init_net.ct.hash = hash;
1378 spin_unlock_bh(&nf_conntrack_lock); 1394 spin_unlock_bh(&nf_conntrack_lock);
1379 1395
1380 nf_ct_free_hashtable(old_hash, old_vmalloced, old_size); 1396 nf_ct_free_hashtable(old_hash, old_size);
1381 return 0; 1397 return 0;
1382} 1398}
1383EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); 1399EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
@@ -1490,8 +1506,7 @@ static int nf_conntrack_init_net(struct net *net)
1490 } 1506 }
1491 1507
1492 net->ct.htable_size = nf_conntrack_htable_size; 1508 net->ct.htable_size = nf_conntrack_htable_size;
1493 net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1509 net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, 1);
1494 &net->ct.hash_vmalloc, 1);
1495 if (!net->ct.hash) { 1510 if (!net->ct.hash) {
1496 ret = -ENOMEM; 1511 ret = -ENOMEM;
1497 printk(KERN_ERR "Unable to create nf_conntrack_hash\n"); 1512 printk(KERN_ERR "Unable to create nf_conntrack_hash\n");
@@ -1503,6 +1518,9 @@ static int nf_conntrack_init_net(struct net *net)
1503 ret = nf_conntrack_acct_init(net); 1518 ret = nf_conntrack_acct_init(net);
1504 if (ret < 0) 1519 if (ret < 0)
1505 goto err_acct; 1520 goto err_acct;
1521 ret = nf_conntrack_tstamp_init(net);
1522 if (ret < 0)
1523 goto err_tstamp;
1506 ret = nf_conntrack_ecache_init(net); 1524 ret = nf_conntrack_ecache_init(net);
1507 if (ret < 0) 1525 if (ret < 0)
1508 goto err_ecache; 1526 goto err_ecache;
@@ -1510,12 +1528,13 @@ static int nf_conntrack_init_net(struct net *net)
1510 return 0; 1528 return 0;
1511 1529
1512err_ecache: 1530err_ecache:
1531 nf_conntrack_tstamp_fini(net);
1532err_tstamp:
1513 nf_conntrack_acct_fini(net); 1533 nf_conntrack_acct_fini(net);
1514err_acct: 1534err_acct:
1515 nf_conntrack_expect_fini(net); 1535 nf_conntrack_expect_fini(net);
1516err_expect: 1536err_expect:
1517 nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, 1537 nf_ct_free_hashtable(net->ct.hash, net->ct.htable_size);
1518 net->ct.htable_size);
1519err_hash: 1538err_hash:
1520 kmem_cache_destroy(net->ct.nf_conntrack_cachep); 1539 kmem_cache_destroy(net->ct.nf_conntrack_cachep);
1521err_cache: 1540err_cache:
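Two separate things happen in nf_conntrack_core.c: the new timestamp extension records a start time when a conntrack is confirmed and a stop time when it dies or is flushed, and nf_ct_alloc_hashtable()/nf_ct_free_hashtable() drop their vmalloced flag because is_vmalloc_addr() can classify the pointer at free time. Note also that __nf_conntrack_alloc() now clears only up to offsetof(struct nf_conn, proto), which is why the protocol trackers further down initialise ct->proto themselves. A simplified restatement of the allocation pattern, not the kernel's exact code:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Try the page allocator first, fall back to vmalloc; at free time
 * is_vmalloc_addr() tells the two cases apart, so no flag is carried. */
static void *table_alloc(size_t sz)
{
        void *p;

        p = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
                                     get_order(sz));
        if (!p)
                p = __vmalloc(sz, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
                              PAGE_KERNEL);
        return p;
}

static void table_free(void *p, size_t sz)
{
        if (is_vmalloc_addr(p))
                vfree(p);
        else
                free_pages((unsigned long)p, get_order(sz));
}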
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index a20fb0bd1efe..cd1e8e0970f2 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -319,7 +319,8 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
319 const struct nf_conntrack_expect_policy *p; 319 const struct nf_conntrack_expect_policy *p;
320 unsigned int h = nf_ct_expect_dst_hash(&exp->tuple); 320 unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
321 321
322 atomic_inc(&exp->use); 322 /* two references : one for hash insert, one for the timer */
323 atomic_add(2, &exp->use);
323 324
324 if (master_help) { 325 if (master_help) {
325 hlist_add_head(&exp->lnode, &master_help->expectations); 326 hlist_add_head(&exp->lnode, &master_help->expectations);
@@ -333,12 +334,14 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp)
333 setup_timer(&exp->timeout, nf_ct_expectation_timed_out, 334 setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
334 (unsigned long)exp); 335 (unsigned long)exp);
335 if (master_help) { 336 if (master_help) {
336 p = &master_help->helper->expect_policy[exp->class]; 337 p = &rcu_dereference_protected(
338 master_help->helper,
339 lockdep_is_held(&nf_conntrack_lock)
340 )->expect_policy[exp->class];
337 exp->timeout.expires = jiffies + p->timeout * HZ; 341 exp->timeout.expires = jiffies + p->timeout * HZ;
338 } 342 }
339 add_timer(&exp->timeout); 343 add_timer(&exp->timeout);
340 344
341 atomic_inc(&exp->use);
342 NF_CT_STAT_INC(net, expect_create); 345 NF_CT_STAT_INC(net, expect_create);
343} 346}
344 347
@@ -369,7 +372,10 @@ static inline int refresh_timer(struct nf_conntrack_expect *i)
369 if (!del_timer(&i->timeout)) 372 if (!del_timer(&i->timeout))
370 return 0; 373 return 0;
371 374
372 p = &master_help->helper->expect_policy[i->class]; 375 p = &rcu_dereference_protected(
376 master_help->helper,
377 lockdep_is_held(&nf_conntrack_lock)
378 )->expect_policy[i->class];
373 i->timeout.expires = jiffies + p->timeout * HZ; 379 i->timeout.expires = jiffies + p->timeout * HZ;
374 add_timer(&i->timeout); 380 add_timer(&i->timeout);
375 return 1; 381 return 1;
@@ -407,7 +413,10 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
407 } 413 }
408 /* Will be over limit? */ 414 /* Will be over limit? */
409 if (master_help) { 415 if (master_help) {
410 p = &master_help->helper->expect_policy[expect->class]; 416 p = &rcu_dereference_protected(
417 master_help->helper,
418 lockdep_is_held(&nf_conntrack_lock)
419 )->expect_policy[expect->class];
411 if (p->max_expected && 420 if (p->max_expected &&
412 master_help->expecting[expect->class] >= p->max_expected) { 421 master_help->expecting[expect->class] >= p->max_expected) {
413 evict_oldest_expect(master, expect); 422 evict_oldest_expect(master, expect);
@@ -478,7 +487,7 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
478 struct hlist_node *n; 487 struct hlist_node *n;
479 488
480 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { 489 for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
481 n = rcu_dereference(net->ct.expect_hash[st->bucket].first); 490 n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
482 if (n) 491 if (n)
483 return n; 492 return n;
484 } 493 }
@@ -491,11 +500,11 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
491 struct net *net = seq_file_net(seq); 500 struct net *net = seq_file_net(seq);
492 struct ct_expect_iter_state *st = seq->private; 501 struct ct_expect_iter_state *st = seq->private;
493 502
494 head = rcu_dereference(head->next); 503 head = rcu_dereference(hlist_next_rcu(head));
495 while (head == NULL) { 504 while (head == NULL) {
496 if (++st->bucket >= nf_ct_expect_hsize) 505 if (++st->bucket >= nf_ct_expect_hsize)
497 return NULL; 506 return NULL;
498 head = rcu_dereference(net->ct.expect_hash[st->bucket].first); 507 head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
499 } 508 }
500 return head; 509 return head;
501} 510}
@@ -630,8 +639,7 @@ int nf_conntrack_expect_init(struct net *net)
630 } 639 }
631 640
632 net->ct.expect_count = 0; 641 net->ct.expect_count = 0;
633 net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 642 net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
634 &net->ct.expect_vmalloc, 0);
635 if (net->ct.expect_hash == NULL) 643 if (net->ct.expect_hash == NULL)
636 goto err1; 644 goto err1;
637 645
@@ -653,8 +661,7 @@ err3:
653 if (net_eq(net, &init_net)) 661 if (net_eq(net, &init_net))
654 kmem_cache_destroy(nf_ct_expect_cachep); 662 kmem_cache_destroy(nf_ct_expect_cachep);
655err2: 663err2:
656 nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, 664 nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
657 nf_ct_expect_hsize);
658err1: 665err1:
659 return err; 666 return err;
660} 667}
@@ -666,6 +673,5 @@ void nf_conntrack_expect_fini(struct net *net)
666 rcu_barrier(); /* Wait for call_rcu() before destroy */ 673 rcu_barrier(); /* Wait for call_rcu() before destroy */
667 kmem_cache_destroy(nf_ct_expect_cachep); 674 kmem_cache_destroy(nf_ct_expect_cachep);
668 } 675 }
669 nf_ct_free_hashtable(net->ct.expect_hash, net->ct.expect_vmalloc, 676 nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
670 nf_ct_expect_hsize);
671} 677}
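The expectation changes take both references (hash insert plus timer) in a single atomic_add(), and every read of master_help->helper made under nf_conntrack_lock is now wrapped in rcu_dereference_protected() with lockdep_is_held(). That idiom recurs in the extend, helper and proto files below, so here it is in generic form (the item/value names are hypothetical):

#include <linux/lockdep.h>
#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct item {
        int value;
};

static DEFINE_SPINLOCK(item_lock);
static struct item __rcu *current_item;

/* Caller holds item_lock: rcu_dereference_protected() documents that for
 * sparse and lockdep instead of requiring rcu_read_lock() on a path that
 * is already serialised by the writer-side lock. */
static int read_value_locked(void)
{
        struct item *it;

        it = rcu_dereference_protected(current_item,
                                       lockdep_is_held(&item_lock));
        return it ? it->value : 0;
}

static void publish_item(struct item *new_item)
{
        spin_lock_bh(&item_lock);
        rcu_assign_pointer(current_item, new_item);
        spin_unlock_bh(&item_lock);
}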
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index bd82450c193f..80a23ed62bb0 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -140,15 +140,16 @@ static void update_alloc_size(struct nf_ct_ext_type *type)
140 /* This assumes that extended areas in conntrack for the types 140 /* This assumes that extended areas in conntrack for the types
141 whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */ 141 whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */
142 for (i = min; i <= max; i++) { 142 for (i = min; i <= max; i++) {
143 t1 = nf_ct_ext_types[i]; 143 t1 = rcu_dereference_protected(nf_ct_ext_types[i],
144 lockdep_is_held(&nf_ct_ext_type_mutex));
144 if (!t1) 145 if (!t1)
145 continue; 146 continue;
146 147
147 t1->alloc_size = sizeof(struct nf_ct_ext) 148 t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) +
148 + ALIGN(sizeof(struct nf_ct_ext), t1->align) 149 t1->len;
149 + t1->len;
150 for (j = 0; j < NF_CT_EXT_NUM; j++) { 150 for (j = 0; j < NF_CT_EXT_NUM; j++) {
151 t2 = nf_ct_ext_types[j]; 151 t2 = rcu_dereference_protected(nf_ct_ext_types[j],
152 lockdep_is_held(&nf_ct_ext_type_mutex));
152 if (t2 == NULL || t2 == t1 || 153 if (t2 == NULL || t2 == t1 ||
153 (t2->flags & NF_CT_EXT_F_PREALLOC) == 0) 154 (t2->flags & NF_CT_EXT_F_PREALLOC) == 0)
154 continue; 155 continue;
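The nf_conntrack_extend.c hunk also corrects the per-type allocation estimate: the old expression added sizeof(struct nf_ct_ext) on top of ALIGN(sizeof(struct nf_ct_ext), t1->align), counting the extension header twice. With hypothetical numbers, if sizeof(struct nf_ct_ext) were 24 and a type had align 8 and len 16, the old formula reserved 24 + ALIGN(24, 8) + 16 = 64 bytes, while the corrected ALIGN(24, 8) + 16 = 40 bytes is all that is needed.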
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 59e1a4cd4e8b..1bdfea357955 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -33,7 +33,6 @@ static DEFINE_MUTEX(nf_ct_helper_mutex);
33static struct hlist_head *nf_ct_helper_hash __read_mostly; 33static struct hlist_head *nf_ct_helper_hash __read_mostly;
34static unsigned int nf_ct_helper_hsize __read_mostly; 34static unsigned int nf_ct_helper_hsize __read_mostly;
35static unsigned int nf_ct_helper_count __read_mostly; 35static unsigned int nf_ct_helper_count __read_mostly;
36static int nf_ct_helper_vmalloc;
37 36
38 37
39/* Stupid hash, but collision free for the default registrations of the 38/* Stupid hash, but collision free for the default registrations of the
@@ -158,7 +157,10 @@ static inline int unhelp(struct nf_conntrack_tuple_hash *i,
158 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i); 157 struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(i);
159 struct nf_conn_help *help = nfct_help(ct); 158 struct nf_conn_help *help = nfct_help(ct);
160 159
161 if (help && help->helper == me) { 160 if (help && rcu_dereference_protected(
161 help->helper,
162 lockdep_is_held(&nf_conntrack_lock)
163 ) == me) {
162 nf_conntrack_event(IPCT_HELPER, ct); 164 nf_conntrack_event(IPCT_HELPER, ct);
163 rcu_assign_pointer(help->helper, NULL); 165 rcu_assign_pointer(help->helper, NULL);
164 } 166 }
@@ -210,7 +212,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
210 hlist_for_each_entry_safe(exp, n, next, 212 hlist_for_each_entry_safe(exp, n, next,
211 &net->ct.expect_hash[i], hnode) { 213 &net->ct.expect_hash[i], hnode) {
212 struct nf_conn_help *help = nfct_help(exp->master); 214 struct nf_conn_help *help = nfct_help(exp->master);
213 if ((help->helper == me || exp->helper == me) && 215 if ((rcu_dereference_protected(
216 help->helper,
217 lockdep_is_held(&nf_conntrack_lock)
218 ) == me || exp->helper == me) &&
214 del_timer(&exp->timeout)) { 219 del_timer(&exp->timeout)) {
215 nf_ct_unlink_expect(exp); 220 nf_ct_unlink_expect(exp);
216 nf_ct_expect_put(exp); 221 nf_ct_expect_put(exp);
@@ -261,8 +266,7 @@ int nf_conntrack_helper_init(void)
261 int err; 266 int err;
262 267
263 nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ 268 nf_ct_helper_hsize = 1; /* gets rounded up to use one page */
264 nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 269 nf_ct_helper_hash = nf_ct_alloc_hashtable(&nf_ct_helper_hsize, 0);
265 &nf_ct_helper_vmalloc, 0);
266 if (!nf_ct_helper_hash) 270 if (!nf_ct_helper_hash)
267 return -ENOMEM; 271 return -ENOMEM;
268 272
@@ -273,14 +277,12 @@ int nf_conntrack_helper_init(void)
273 return 0; 277 return 0;
274 278
275err1: 279err1:
276 nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, 280 nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
277 nf_ct_helper_hsize);
278 return err; 281 return err;
279} 282}
280 283
281void nf_conntrack_helper_fini(void) 284void nf_conntrack_helper_fini(void)
282{ 285{
283 nf_ct_extend_unregister(&helper_extend); 286 nf_ct_extend_unregister(&helper_extend);
284 nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_vmalloc, 287 nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
285 nf_ct_helper_hsize);
286} 288}
diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c
index aadde018a072..4c8f30a3d6d2 100644
--- a/net/netfilter/nf_conntrack_netbios_ns.c
+++ b/net/netfilter/nf_conntrack_netbios_ns.c
@@ -18,14 +18,7 @@
18#include <linux/kernel.h> 18#include <linux/kernel.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/skbuff.h>
22#include <linux/netdevice.h>
23#include <linux/inetdevice.h>
24#include <linux/if_addr.h>
25#include <linux/in.h> 21#include <linux/in.h>
26#include <linux/ip.h>
27#include <linux/netfilter.h>
28#include <net/route.h>
29 22
30#include <net/netfilter/nf_conntrack.h> 23#include <net/netfilter/nf_conntrack.h>
31#include <net/netfilter/nf_conntrack_helper.h> 24#include <net/netfilter/nf_conntrack_helper.h>
@@ -40,75 +33,26 @@ MODULE_ALIAS("ip_conntrack_netbios_ns");
40MODULE_ALIAS_NFCT_HELPER("netbios_ns"); 33MODULE_ALIAS_NFCT_HELPER("netbios_ns");
41 34
42static unsigned int timeout __read_mostly = 3; 35static unsigned int timeout __read_mostly = 3;
43module_param(timeout, uint, 0400); 36module_param(timeout, uint, S_IRUSR);
44MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds"); 37MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
45 38
46static int help(struct sk_buff *skb, unsigned int protoff,
47 struct nf_conn *ct, enum ip_conntrack_info ctinfo)
48{
49 struct nf_conntrack_expect *exp;
50 struct iphdr *iph = ip_hdr(skb);
51 struct rtable *rt = skb_rtable(skb);
52 struct in_device *in_dev;
53 __be32 mask = 0;
54
55 /* we're only interested in locally generated packets */
56 if (skb->sk == NULL)
57 goto out;
58 if (rt == NULL || !(rt->rt_flags & RTCF_BROADCAST))
59 goto out;
60 if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
61 goto out;
62
63 rcu_read_lock();
64 in_dev = __in_dev_get_rcu(rt->dst.dev);
65 if (in_dev != NULL) {
66 for_primary_ifa(in_dev) {
67 if (ifa->ifa_broadcast == iph->daddr) {
68 mask = ifa->ifa_mask;
69 break;
70 }
71 } endfor_ifa(in_dev);
72 }
73 rcu_read_unlock();
74
75 if (mask == 0)
76 goto out;
77
78 exp = nf_ct_expect_alloc(ct);
79 if (exp == NULL)
80 goto out;
81
82 exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
83 exp->tuple.src.u.udp.port = htons(NMBD_PORT);
84
85 exp->mask.src.u3.ip = mask;
86 exp->mask.src.u.udp.port = htons(0xFFFF);
87
88 exp->expectfn = NULL;
89 exp->flags = NF_CT_EXPECT_PERMANENT;
90 exp->class = NF_CT_EXPECT_CLASS_DEFAULT;
91 exp->helper = NULL;
92
93 nf_ct_expect_related(exp);
94 nf_ct_expect_put(exp);
95
96 nf_ct_refresh(ct, skb, timeout * HZ);
97out:
98 return NF_ACCEPT;
99}
100
101static struct nf_conntrack_expect_policy exp_policy = { 39static struct nf_conntrack_expect_policy exp_policy = {
102 .max_expected = 1, 40 .max_expected = 1,
103}; 41};
104 42
43static int netbios_ns_help(struct sk_buff *skb, unsigned int protoff,
44 struct nf_conn *ct, enum ip_conntrack_info ctinfo)
45{
46 return nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
47}
48
105static struct nf_conntrack_helper helper __read_mostly = { 49static struct nf_conntrack_helper helper __read_mostly = {
106 .name = "netbios-ns", 50 .name = "netbios-ns",
107 .tuple.src.l3num = AF_INET, 51 .tuple.src.l3num = NFPROTO_IPV4,
108 .tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT), 52 .tuple.src.u.udp.port = cpu_to_be16(NMBD_PORT),
109 .tuple.dst.protonum = IPPROTO_UDP, 53 .tuple.dst.protonum = IPPROTO_UDP,
110 .me = THIS_MODULE, 54 .me = THIS_MODULE,
111 .help = help, 55 .help = netbios_ns_help,
112 .expect_policy = &exp_policy, 56 .expect_policy = &exp_policy,
113}; 57};
114 58
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 93297aaceb2b..3fec12c570a8 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -42,6 +42,7 @@
42#include <net/netfilter/nf_conntrack_tuple.h> 42#include <net/netfilter/nf_conntrack_tuple.h>
43#include <net/netfilter/nf_conntrack_acct.h> 43#include <net/netfilter/nf_conntrack_acct.h>
44#include <net/netfilter/nf_conntrack_zones.h> 44#include <net/netfilter/nf_conntrack_zones.h>
45#include <net/netfilter/nf_conntrack_timestamp.h>
45#ifdef CONFIG_NF_NAT_NEEDED 46#ifdef CONFIG_NF_NAT_NEEDED
46#include <net/netfilter/nf_nat_core.h> 47#include <net/netfilter/nf_nat_core.h>
47#include <net/netfilter/nf_nat_protocol.h> 48#include <net/netfilter/nf_nat_protocol.h>
@@ -230,6 +231,33 @@ nla_put_failure:
230 return -1; 231 return -1;
231} 232}
232 233
234static int
235ctnetlink_dump_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
236{
237 struct nlattr *nest_count;
238 const struct nf_conn_tstamp *tstamp;
239
240 tstamp = nf_conn_tstamp_find(ct);
241 if (!tstamp)
242 return 0;
243
244 nest_count = nla_nest_start(skb, CTA_TIMESTAMP | NLA_F_NESTED);
245 if (!nest_count)
246 goto nla_put_failure;
247
248 NLA_PUT_BE64(skb, CTA_TIMESTAMP_START, cpu_to_be64(tstamp->start));
249 if (tstamp->stop != 0) {
250 NLA_PUT_BE64(skb, CTA_TIMESTAMP_STOP,
251 cpu_to_be64(tstamp->stop));
252 }
253 nla_nest_end(skb, nest_count);
254
255 return 0;
256
257nla_put_failure:
258 return -1;
259}
260
233#ifdef CONFIG_NF_CONNTRACK_MARK 261#ifdef CONFIG_NF_CONNTRACK_MARK
234static inline int 262static inline int
235ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct) 263ctnetlink_dump_mark(struct sk_buff *skb, const struct nf_conn *ct)
@@ -404,6 +432,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
404 ctnetlink_dump_timeout(skb, ct) < 0 || 432 ctnetlink_dump_timeout(skb, ct) < 0 ||
405 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || 433 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
406 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 || 434 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
435 ctnetlink_dump_timestamp(skb, ct) < 0 ||
407 ctnetlink_dump_protoinfo(skb, ct) < 0 || 436 ctnetlink_dump_protoinfo(skb, ct) < 0 ||
408 ctnetlink_dump_helpinfo(skb, ct) < 0 || 437 ctnetlink_dump_helpinfo(skb, ct) < 0 ||
409 ctnetlink_dump_mark(skb, ct) < 0 || 438 ctnetlink_dump_mark(skb, ct) < 0 ||
@@ -471,6 +500,18 @@ ctnetlink_secctx_size(const struct nf_conn *ct)
471} 500}
472 501
473static inline size_t 502static inline size_t
503ctnetlink_timestamp_size(const struct nf_conn *ct)
504{
505#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
506 if (!nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
507 return 0;
508 return nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t));
509#else
510 return 0;
511#endif
512}
513
514static inline size_t
474ctnetlink_nlmsg_size(const struct nf_conn *ct) 515ctnetlink_nlmsg_size(const struct nf_conn *ct)
475{ 516{
476 return NLMSG_ALIGN(sizeof(struct nfgenmsg)) 517 return NLMSG_ALIGN(sizeof(struct nfgenmsg))
@@ -481,6 +522,7 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
481 + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */ 522 + nla_total_size(sizeof(u_int32_t)) /* CTA_ID */
482 + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */ 523 + nla_total_size(sizeof(u_int32_t)) /* CTA_STATUS */
483 + ctnetlink_counters_size(ct) 524 + ctnetlink_counters_size(ct)
525 + ctnetlink_timestamp_size(ct)
484 + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */ 526 + nla_total_size(sizeof(u_int32_t)) /* CTA_TIMEOUT */
485 + nla_total_size(0) /* CTA_PROTOINFO */ 527 + nla_total_size(0) /* CTA_PROTOINFO */
486 + nla_total_size(0) /* CTA_HELP */ 528 + nla_total_size(0) /* CTA_HELP */
@@ -571,7 +613,8 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
571 613
572 if (events & (1 << IPCT_DESTROY)) { 614 if (events & (1 << IPCT_DESTROY)) {
573 if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 || 615 if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
574 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0) 616 ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0 ||
617 ctnetlink_dump_timestamp(skb, ct) < 0)
575 goto nla_put_failure; 618 goto nla_put_failure;
576 } else { 619 } else {
577 if (ctnetlink_dump_timeout(skb, ct) < 0) 620 if (ctnetlink_dump_timeout(skb, ct) < 0)
@@ -1357,6 +1400,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
1357 } 1400 }
1358 1401
1359 nf_ct_acct_ext_add(ct, GFP_ATOMIC); 1402 nf_ct_acct_ext_add(ct, GFP_ATOMIC);
1403 nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
1360 nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC); 1404 nf_ct_ecache_ext_add(ct, 0, 0, GFP_ATOMIC);
1361 /* we must add conntrack extensions before confirmation. */ 1405 /* we must add conntrack extensions before confirmation. */
1362 ct->status |= IPS_CONFIRMED; 1406 ct->status |= IPS_CONFIRMED;
@@ -1375,6 +1419,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
1375 } 1419 }
1376#endif 1420#endif
1377 1421
1422 memset(&ct->proto, 0, sizeof(ct->proto));
1378 if (cda[CTA_PROTOINFO]) { 1423 if (cda[CTA_PROTOINFO]) {
1379 err = ctnetlink_change_protoinfo(ct, cda); 1424 err = ctnetlink_change_protoinfo(ct, cda);
1380 if (err < 0) 1425 if (err < 0)
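For the new CTA_TIMESTAMP nest, ctnetlink_timestamp_size() reserves nla_total_size(0) + 2 * nla_total_size(sizeof(uint64_t)). With the standard 4-byte netlink attribute header and 4-byte alignment that works out to 4 + 2 * (4 + 8) = 28 bytes per conntrack in a dump or event message, and only when the connection actually carries the NF_CT_EXT_TSTAMP extension; ctnetlink_dump_timestamp() likewise returns 0 without emitting anything when the extension is absent, so connections created before timestamping was enabled are simply skipped.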
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index dc7bb74110df..5701c8dd783c 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -166,6 +166,7 @@ static void nf_ct_l3proto_unregister_sysctl(struct nf_conntrack_l3proto *l3proto
166int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto) 166int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
167{ 167{
168 int ret = 0; 168 int ret = 0;
169 struct nf_conntrack_l3proto *old;
169 170
170 if (proto->l3proto >= AF_MAX) 171 if (proto->l3proto >= AF_MAX)
171 return -EBUSY; 172 return -EBUSY;
@@ -174,7 +175,9 @@ int nf_conntrack_l3proto_register(struct nf_conntrack_l3proto *proto)
174 return -EINVAL; 175 return -EINVAL;
175 176
176 mutex_lock(&nf_ct_proto_mutex); 177 mutex_lock(&nf_ct_proto_mutex);
177 if (nf_ct_l3protos[proto->l3proto] != &nf_conntrack_l3proto_generic) { 178 old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
179 lockdep_is_held(&nf_ct_proto_mutex));
180 if (old != &nf_conntrack_l3proto_generic) {
178 ret = -EBUSY; 181 ret = -EBUSY;
179 goto out_unlock; 182 goto out_unlock;
180 } 183 }
@@ -201,7 +204,9 @@ void nf_conntrack_l3proto_unregister(struct nf_conntrack_l3proto *proto)
201 BUG_ON(proto->l3proto >= AF_MAX); 204 BUG_ON(proto->l3proto >= AF_MAX);
202 205
203 mutex_lock(&nf_ct_proto_mutex); 206 mutex_lock(&nf_ct_proto_mutex);
204 BUG_ON(nf_ct_l3protos[proto->l3proto] != proto); 207 BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
208 lockdep_is_held(&nf_ct_proto_mutex)
209 ) != proto);
205 rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], 210 rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
206 &nf_conntrack_l3proto_generic); 211 &nf_conntrack_l3proto_generic);
207 nf_ct_l3proto_unregister_sysctl(proto); 212 nf_ct_l3proto_unregister_sysctl(proto);
@@ -279,7 +284,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
279 mutex_lock(&nf_ct_proto_mutex); 284 mutex_lock(&nf_ct_proto_mutex);
280 if (!nf_ct_protos[l4proto->l3proto]) { 285 if (!nf_ct_protos[l4proto->l3proto]) {
281 /* l3proto may be loaded latter. */ 286 /* l3proto may be loaded latter. */
282 struct nf_conntrack_l4proto **proto_array; 287 struct nf_conntrack_l4proto __rcu **proto_array;
283 int i; 288 int i;
284 289
285 proto_array = kmalloc(MAX_NF_CT_PROTO * 290 proto_array = kmalloc(MAX_NF_CT_PROTO *
@@ -291,7 +296,7 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
291 } 296 }
292 297
293 for (i = 0; i < MAX_NF_CT_PROTO; i++) 298 for (i = 0; i < MAX_NF_CT_PROTO; i++)
294 proto_array[i] = &nf_conntrack_l4proto_generic; 299 RCU_INIT_POINTER(proto_array[i], &nf_conntrack_l4proto_generic);
295 300
296 /* Before making proto_array visible to lockless readers, 301 /* Before making proto_array visible to lockless readers,
297 * we must make sure its content is committed to memory. 302 * we must make sure its content is committed to memory.
@@ -299,8 +304,10 @@ int nf_conntrack_l4proto_register(struct nf_conntrack_l4proto *l4proto)
299 smp_wmb(); 304 smp_wmb();
300 305
301 nf_ct_protos[l4proto->l3proto] = proto_array; 306 nf_ct_protos[l4proto->l3proto] = proto_array;
302 } else if (nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != 307 } else if (rcu_dereference_protected(
303 &nf_conntrack_l4proto_generic) { 308 nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
309 lockdep_is_held(&nf_ct_proto_mutex)
310 ) != &nf_conntrack_l4proto_generic) {
304 ret = -EBUSY; 311 ret = -EBUSY;
305 goto out_unlock; 312 goto out_unlock;
306 } 313 }
@@ -331,7 +338,10 @@ void nf_conntrack_l4proto_unregister(struct nf_conntrack_l4proto *l4proto)
331 BUG_ON(l4proto->l3proto >= PF_MAX); 338 BUG_ON(l4proto->l3proto >= PF_MAX);
332 339
333 mutex_lock(&nf_ct_proto_mutex); 340 mutex_lock(&nf_ct_proto_mutex);
334 BUG_ON(nf_ct_protos[l4proto->l3proto][l4proto->l4proto] != l4proto); 341 BUG_ON(rcu_dereference_protected(
342 nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
343 lockdep_is_held(&nf_ct_proto_mutex)
344 ) != l4proto);
335 rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto], 345 rcu_assign_pointer(nf_ct_protos[l4proto->l3proto][l4proto->l4proto],
336 &nf_conntrack_l4proto_generic); 346 &nf_conntrack_l4proto_generic);
337 nf_ct_l4proto_unregister_sysctl(l4proto); 347 nf_ct_l4proto_unregister_sysctl(l4proto);
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index 5292560d6d4a..9ae57c57c50e 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -452,6 +452,9 @@ static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
452 ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; 452 ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT;
453 ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; 453 ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER;
454 ct->proto.dccp.state = CT_DCCP_NONE; 454 ct->proto.dccp.state = CT_DCCP_NONE;
455 ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST;
456 ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL;
457 ct->proto.dccp.handshake_seq = 0;
455 return true; 458 return true;
456 459
457out_invalid: 460out_invalid:
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index c6049c2d5ea8..6f4ee70f460b 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -413,6 +413,7 @@ static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
413 test_bit(SCTP_CID_COOKIE_ACK, map)) 413 test_bit(SCTP_CID_COOKIE_ACK, map))
414 return false; 414 return false;
415 415
416 memset(&ct->proto.sctp, 0, sizeof(ct->proto.sctp));
416 new_state = SCTP_CONNTRACK_MAX; 417 new_state = SCTP_CONNTRACK_MAX;
417 for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) { 418 for_each_sctp_chunk (skb, sch, _sch, offset, dataoff, count) {
418 /* Don't need lock here: this conntrack not in circulation yet */ 419 /* Don't need lock here: this conntrack not in circulation yet */
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 3fb2b73b24dc..6f38d0e2ea4a 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -1066,9 +1066,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
1066 BUG_ON(th == NULL); 1066 BUG_ON(th == NULL);
1067 1067
1068 /* Don't need lock here: this conntrack not in circulation yet */ 1068 /* Don't need lock here: this conntrack not in circulation yet */
1069 new_state 1069 new_state = tcp_conntracks[0][get_conntrack_index(th)][TCP_CONNTRACK_NONE];
1070 = tcp_conntracks[0][get_conntrack_index(th)]
1071 [TCP_CONNTRACK_NONE];
1072 1070
1073 /* Invalid: delete conntrack */ 1071 /* Invalid: delete conntrack */
1074 if (new_state >= TCP_CONNTRACK_MAX) { 1072 if (new_state >= TCP_CONNTRACK_MAX) {
@@ -1077,6 +1075,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
1077 } 1075 }
1078 1076
1079 if (new_state == TCP_CONNTRACK_SYN_SENT) { 1077 if (new_state == TCP_CONNTRACK_SYN_SENT) {
1078 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
1080 /* SYN packet */ 1079 /* SYN packet */
1081 ct->proto.tcp.seen[0].td_end = 1080 ct->proto.tcp.seen[0].td_end =
1082 segment_seq_plus_len(ntohl(th->seq), skb->len, 1081 segment_seq_plus_len(ntohl(th->seq), skb->len,
@@ -1088,11 +1087,11 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
1088 ct->proto.tcp.seen[0].td_end; 1087 ct->proto.tcp.seen[0].td_end;
1089 1088
1090 tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); 1089 tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
1091 ct->proto.tcp.seen[1].flags = 0;
1092 } else if (nf_ct_tcp_loose == 0) { 1090 } else if (nf_ct_tcp_loose == 0) {
1093 /* Don't try to pick up connections. */ 1091 /* Don't try to pick up connections. */
1094 return false; 1092 return false;
1095 } else { 1093 } else {
1094 memset(&ct->proto.tcp, 0, sizeof(ct->proto.tcp));
1096 /* 1095 /*
1097 * We are in the middle of a connection, 1096 * We are in the middle of a connection,
1098 * its history is lost for us. 1097 * its history is lost for us.
@@ -1107,7 +1106,6 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
1107 ct->proto.tcp.seen[0].td_maxend = 1106 ct->proto.tcp.seen[0].td_maxend =
1108 ct->proto.tcp.seen[0].td_end + 1107 ct->proto.tcp.seen[0].td_end +
1109 ct->proto.tcp.seen[0].td_maxwin; 1108 ct->proto.tcp.seen[0].td_maxwin;
1110 ct->proto.tcp.seen[0].td_scale = 0;
1111 1109
1112 /* We assume SACK and liberal window checking to handle 1110 /* We assume SACK and liberal window checking to handle
1113 * window scaling */ 1111 * window scaling */
@@ -1116,13 +1114,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
1116 IP_CT_TCP_FLAG_BE_LIBERAL; 1114 IP_CT_TCP_FLAG_BE_LIBERAL;
1117 } 1115 }
1118 1116
1119 ct->proto.tcp.seen[1].td_end = 0;
1120 ct->proto.tcp.seen[1].td_maxend = 0;
1121 ct->proto.tcp.seen[1].td_maxwin = 0;
1122 ct->proto.tcp.seen[1].td_scale = 0;
1123
1124 /* tcp_packet will set them */ 1117 /* tcp_packet will set them */
1125 ct->proto.tcp.state = TCP_CONNTRACK_NONE;
1126 ct->proto.tcp.last_index = TCP_NONE_SET; 1118 ct->proto.tcp.last_index = TCP_NONE_SET;
1127 1119
1128 pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " 1120 pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
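The DCCP, SCTP and TCP hunks above share one theme: rather than zeroing protocol-private fields one by one in the *_new() handlers (and risking a missed field whenever the struct grows), the whole per-protocol area is cleared once and only the non-zero defaults are set afterwards. A generic sketch of the pattern, using a hypothetical struct rather than the real ct->proto layout:

#include <linux/string.h>
#include <linux/types.h>

struct proto_state {
        u32 end[2];
        u8  state;
        u8  flags;
};

static bool proto_new(struct proto_state *ps)
{
        /* One memset covers every field, including ones added later. */
        memset(ps, 0, sizeof(*ps));
        ps->state = 1;          /* only non-zero defaults are set explicitly */
        return true;
}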
diff --git a/net/netfilter/nf_conntrack_snmp.c b/net/netfilter/nf_conntrack_snmp.c
new file mode 100644
index 000000000000..6e545e26289e
--- /dev/null
+++ b/net/netfilter/nf_conntrack_snmp.c
@@ -0,0 +1,77 @@
1/*
2 * SNMP service broadcast connection tracking helper
3 *
4 * (c) 2011 Jiri Olsa <jolsa@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/in.h>
15
16#include <net/netfilter/nf_conntrack.h>
17#include <net/netfilter/nf_conntrack_helper.h>
18#include <net/netfilter/nf_conntrack_expect.h>
19
20#define SNMP_PORT 161
21
22MODULE_AUTHOR("Jiri Olsa <jolsa@redhat.com>");
23MODULE_DESCRIPTION("SNMP service broadcast connection tracking helper");
24MODULE_LICENSE("GPL");
25MODULE_ALIAS_NFCT_HELPER("snmp");
26
27static unsigned int timeout __read_mostly = 30;
28module_param(timeout, uint, S_IRUSR);
29MODULE_PARM_DESC(timeout, "timeout for master connection/replies in seconds");
30
31int (*nf_nat_snmp_hook)(struct sk_buff *skb,
32 unsigned int protoff,
33 struct nf_conn *ct,
34 enum ip_conntrack_info ctinfo);
35EXPORT_SYMBOL_GPL(nf_nat_snmp_hook);
36
37static int snmp_conntrack_help(struct sk_buff *skb, unsigned int protoff,
38 struct nf_conn *ct, enum ip_conntrack_info ctinfo)
39{
40 typeof(nf_nat_snmp_hook) nf_nat_snmp;
41
42 nf_conntrack_broadcast_help(skb, protoff, ct, ctinfo, timeout);
43
44 nf_nat_snmp = rcu_dereference(nf_nat_snmp_hook);
45 if (nf_nat_snmp && ct->status & IPS_NAT_MASK)
46 return nf_nat_snmp(skb, protoff, ct, ctinfo);
47
48 return NF_ACCEPT;
49}
50
51static struct nf_conntrack_expect_policy exp_policy = {
52 .max_expected = 1,
53};
54
55static struct nf_conntrack_helper helper __read_mostly = {
56 .name = "snmp",
57 .tuple.src.l3num = NFPROTO_IPV4,
58 .tuple.src.u.udp.port = cpu_to_be16(SNMP_PORT),
59 .tuple.dst.protonum = IPPROTO_UDP,
60 .me = THIS_MODULE,
61 .help = snmp_conntrack_help,
62 .expect_policy = &exp_policy,
63};
64
65static int __init nf_conntrack_snmp_init(void)
66{
67 exp_policy.timeout = timeout;
68 return nf_conntrack_helper_register(&helper);
69}
70
71static void __exit nf_conntrack_snmp_fini(void)
72{
73 nf_conntrack_helper_unregister(&helper);
74}
75
76module_init(nf_conntrack_snmp_init);
77module_exit(nf_conntrack_snmp_fini);
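NAT mangling is left to a separate module via the nf_nat_snmp_hook pointer exported above; such a module would publish its handler with rcu_assign_pointer() on load and clear it again on unload. A hedged sketch with a made-up handler name (the real NAT-side registration lives in the SNMP NAT helper, not here):

#include <linux/module.h>
#include <net/netfilter/nf_conntrack.h>

static int my_snmp_nat(struct sk_buff *skb, unsigned int protoff,
                       struct nf_conn *ct, enum ip_conntrack_info ctinfo)
{
        /* rewrite IP addresses embedded in the SNMP payload here */
        return NF_ACCEPT;
}

static int __init my_snmp_nat_init(void)
{
        BUG_ON(nf_nat_snmp_hook != NULL);
        rcu_assign_pointer(nf_nat_snmp_hook, my_snmp_nat);
        return 0;
}

static void __exit my_snmp_nat_exit(void)
{
        rcu_assign_pointer(nf_nat_snmp_hook, NULL);
        synchronize_rcu();
}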
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index b4d7f0f24b27..0ae142825881 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -29,6 +29,8 @@
29#include <net/netfilter/nf_conntrack_helper.h> 29#include <net/netfilter/nf_conntrack_helper.h>
30#include <net/netfilter/nf_conntrack_acct.h> 30#include <net/netfilter/nf_conntrack_acct.h>
31#include <net/netfilter/nf_conntrack_zones.h> 31#include <net/netfilter/nf_conntrack_zones.h>
32#include <net/netfilter/nf_conntrack_timestamp.h>
33#include <linux/rculist_nulls.h>
32 34
33MODULE_LICENSE("GPL"); 35MODULE_LICENSE("GPL");
34 36
@@ -45,6 +47,7 @@ EXPORT_SYMBOL_GPL(print_tuple);
45struct ct_iter_state { 47struct ct_iter_state {
46 struct seq_net_private p; 48 struct seq_net_private p;
47 unsigned int bucket; 49 unsigned int bucket;
50 u_int64_t time_now;
48}; 51};
49 52
50static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) 53static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
@@ -56,7 +59,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
56 for (st->bucket = 0; 59 for (st->bucket = 0;
57 st->bucket < net->ct.htable_size; 60 st->bucket < net->ct.htable_size;
58 st->bucket++) { 61 st->bucket++) {
59 n = rcu_dereference(net->ct.hash[st->bucket].first); 62 n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
60 if (!is_a_nulls(n)) 63 if (!is_a_nulls(n))
61 return n; 64 return n;
62 } 65 }
@@ -69,13 +72,15 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
69 struct net *net = seq_file_net(seq); 72 struct net *net = seq_file_net(seq);
70 struct ct_iter_state *st = seq->private; 73 struct ct_iter_state *st = seq->private;
71 74
72 head = rcu_dereference(head->next); 75 head = rcu_dereference(hlist_nulls_next_rcu(head));
73 while (is_a_nulls(head)) { 76 while (is_a_nulls(head)) {
74 if (likely(get_nulls_value(head) == st->bucket)) { 77 if (likely(get_nulls_value(head) == st->bucket)) {
75 if (++st->bucket >= net->ct.htable_size) 78 if (++st->bucket >= net->ct.htable_size)
76 return NULL; 79 return NULL;
77 } 80 }
78 head = rcu_dereference(net->ct.hash[st->bucket].first); 81 head = rcu_dereference(
82 hlist_nulls_first_rcu(
83 &net->ct.hash[st->bucket]));
79 } 84 }
80 return head; 85 return head;
81} 86}
@@ -93,6 +98,9 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
93static void *ct_seq_start(struct seq_file *seq, loff_t *pos) 98static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
94 __acquires(RCU) 99 __acquires(RCU)
95{ 100{
101 struct ct_iter_state *st = seq->private;
102
103 st->time_now = ktime_to_ns(ktime_get_real());
96 rcu_read_lock(); 104 rcu_read_lock();
97 return ct_get_idx(seq, *pos); 105 return ct_get_idx(seq, *pos);
98} 106}
@@ -132,6 +140,34 @@ static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
132} 140}
133#endif 141#endif
134 142
143#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
144static int ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
145{
146 struct ct_iter_state *st = s->private;
147 struct nf_conn_tstamp *tstamp;
148 s64 delta_time;
149
150 tstamp = nf_conn_tstamp_find(ct);
151 if (tstamp) {
152 delta_time = st->time_now - tstamp->start;
153 if (delta_time > 0)
154 delta_time = div_s64(delta_time, NSEC_PER_SEC);
155 else
156 delta_time = 0;
157
158 return seq_printf(s, "delta-time=%llu ",
159 (unsigned long long)delta_time);
160 }
161 return 0;
162}
163#else
164static inline int
165ct_show_delta_time(struct seq_file *s, const struct nf_conn *ct)
166{
167 return 0;
168}
169#endif
170
135/* return 0 on success, 1 in case of error */ 171/* return 0 on success, 1 in case of error */
136static int ct_seq_show(struct seq_file *s, void *v) 172static int ct_seq_show(struct seq_file *s, void *v)
137{ 173{
@@ -200,6 +236,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
200 goto release; 236 goto release;
201#endif 237#endif
202 238
239 if (ct_show_delta_time(s, ct))
240 goto release;
241
203 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) 242 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
204 goto release; 243 goto release;
205 244
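With CONFIG_NF_CONNTRACK_TIMESTAMP enabled, each /proc/net/nf_conntrack line now carries a field of the form delta-time=<seconds since the flow's first packet>, emitted just before the trailing use= counter. The conversion is the obvious nanosecond-to-second division, sketched here outside the seq_file plumbing:

#include <linux/ktime.h>
#include <linux/math64.h>
#include <linux/time.h>

/* Illustrative only: age of a flow in whole seconds, given its recorded
 * start timestamp in nanoseconds; negative clock skew is clamped to 0. */
static u64 flow_age_seconds(u64 start_ns)
{
        s64 delta = ktime_to_ns(ktime_get_real()) - (s64)start_ns;

        return delta > 0 ? div_s64(delta, NSEC_PER_SEC) : 0;
}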
diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c
new file mode 100644
index 000000000000..af7dd31af0a1
--- /dev/null
+++ b/net/netfilter/nf_conntrack_timestamp.c
@@ -0,0 +1,120 @@
1/*
2 * (C) 2010 Pablo Neira Ayuso <pablo@netfilter.org>
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation (or any later at your option).
7 */
8
9#include <linux/netfilter.h>
10#include <linux/slab.h>
11#include <linux/kernel.h>
12#include <linux/moduleparam.h>
13
14#include <net/netfilter/nf_conntrack.h>
15#include <net/netfilter/nf_conntrack_extend.h>
16#include <net/netfilter/nf_conntrack_timestamp.h>
17
18static int nf_ct_tstamp __read_mostly;
19
20module_param_named(tstamp, nf_ct_tstamp, bool, 0644);
21MODULE_PARM_DESC(tstamp, "Enable connection tracking flow timestamping.");
22
23#ifdef CONFIG_SYSCTL
24static struct ctl_table tstamp_sysctl_table[] = {
25 {
26 .procname = "nf_conntrack_timestamp",
27 .data = &init_net.ct.sysctl_tstamp,
28 .maxlen = sizeof(unsigned int),
29 .mode = 0644,
30 .proc_handler = proc_dointvec,
31 },
32 {}
33};
34#endif /* CONFIG_SYSCTL */
35
36static struct nf_ct_ext_type tstamp_extend __read_mostly = {
37 .len = sizeof(struct nf_conn_tstamp),
38 .align = __alignof__(struct nf_conn_tstamp),
39 .id = NF_CT_EXT_TSTAMP,
40};
41
42#ifdef CONFIG_SYSCTL
43static int nf_conntrack_tstamp_init_sysctl(struct net *net)
44{
45 struct ctl_table *table;
46
47 table = kmemdup(tstamp_sysctl_table, sizeof(tstamp_sysctl_table),
48 GFP_KERNEL);
49 if (!table)
50 goto out;
51
52 table[0].data = &net->ct.sysctl_tstamp;
53
54 net->ct.tstamp_sysctl_header = register_net_sysctl_table(net,
55 nf_net_netfilter_sysctl_path, table);
56 if (!net->ct.tstamp_sysctl_header) {
57 printk(KERN_ERR "nf_ct_tstamp: can't register to sysctl.\n");
58 goto out_register;
59 }
60 return 0;
61
62out_register:
63 kfree(table);
64out:
65 return -ENOMEM;
66}
67
68static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
69{
70 struct ctl_table *table;
71
72 table = net->ct.tstamp_sysctl_header->ctl_table_arg;
73 unregister_net_sysctl_table(net->ct.tstamp_sysctl_header);
74 kfree(table);
75}
76#else
77static int nf_conntrack_tstamp_init_sysctl(struct net *net)
78{
79 return 0;
80}
81
82static void nf_conntrack_tstamp_fini_sysctl(struct net *net)
83{
84}
85#endif
86
87int nf_conntrack_tstamp_init(struct net *net)
88{
89 int ret;
90
91 net->ct.sysctl_tstamp = nf_ct_tstamp;
92
93 if (net_eq(net, &init_net)) {
94 ret = nf_ct_extend_register(&tstamp_extend);
95 if (ret < 0) {
96 printk(KERN_ERR "nf_ct_tstamp: Unable to register "
97 "extension\n");
98 goto out_extend_register;
99 }
100 }
101
102 ret = nf_conntrack_tstamp_init_sysctl(net);
103 if (ret < 0)
104 goto out_sysctl;
105
106 return 0;
107
108out_sysctl:
109 if (net_eq(net, &init_net))
110 nf_ct_extend_unregister(&tstamp_extend);
111out_extend_register:
112 return ret;
113}
114
115void nf_conntrack_tstamp_fini(struct net *net)
116{
117 nf_conntrack_tstamp_fini_sysctl(net);
118 if (net_eq(net, &init_net))
119 nf_ct_extend_unregister(&tstamp_extend);
120}
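The file above only reserves the extension space and wires up the sysctl; the conntrack core is expected to attach the extension when a conntrack is allocated and to stamp the start time when the connection is confirmed (the companion nf_conntrack_core.c change in this series). Roughly, and assuming the accessors declared in the new nf_conntrack_timestamp.h header:

#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_timestamp.h>

static void tstamp_usage_sketch(struct nf_conn *ct)
{
        struct nf_conn_tstamp *tstamp;

        /* at allocation, when net->ct.sysctl_tstamp is enabled: */
        nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);

        /* at confirmation time: */
        tstamp = nf_conn_tstamp_find(ct);
        if (tstamp)
                tstamp->start = ktime_to_ns(ktime_get_real());
}

At runtime the feature is toggled per namespace through the sysctl registered above (writing 1 to /proc/sys/net/netfilter/nf_conntrack_timestamp) or globally via the module's tstamp parameter.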
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index b07393eab88e..20c775cff2a8 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -161,7 +161,8 @@ static int seq_show(struct seq_file *s, void *v)
161 struct nf_logger *t; 161 struct nf_logger *t;
162 int ret; 162 int ret;
163 163
164 logger = nf_loggers[*pos]; 164 logger = rcu_dereference_protected(nf_loggers[*pos],
165 lockdep_is_held(&nf_log_mutex));
165 166
166 if (!logger) 167 if (!logger)
167 ret = seq_printf(s, "%2lld NONE (", *pos); 168 ret = seq_printf(s, "%2lld NONE (", *pos);
@@ -249,7 +250,8 @@ static int nf_log_proc_dostring(ctl_table *table, int write,
249 mutex_unlock(&nf_log_mutex); 250 mutex_unlock(&nf_log_mutex);
250 } else { 251 } else {
251 mutex_lock(&nf_log_mutex); 252 mutex_lock(&nf_log_mutex);
252 logger = nf_loggers[tindex]; 253 logger = rcu_dereference_protected(nf_loggers[tindex],
254 lockdep_is_held(&nf_log_mutex));
253 if (!logger) 255 if (!logger)
254 table->data = "NONE"; 256 table->data = "NONE";
255 else 257 else
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 74aebed5bd28..5ab22e2bbd7d 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -27,14 +27,17 @@ static DEFINE_MUTEX(queue_handler_mutex);
27int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) 27int nf_register_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
28{ 28{
29 int ret; 29 int ret;
30 const struct nf_queue_handler *old;
30 31
31 if (pf >= ARRAY_SIZE(queue_handler)) 32 if (pf >= ARRAY_SIZE(queue_handler))
32 return -EINVAL; 33 return -EINVAL;
33 34
34 mutex_lock(&queue_handler_mutex); 35 mutex_lock(&queue_handler_mutex);
35 if (queue_handler[pf] == qh) 36 old = rcu_dereference_protected(queue_handler[pf],
37 lockdep_is_held(&queue_handler_mutex));
38 if (old == qh)
36 ret = -EEXIST; 39 ret = -EEXIST;
37 else if (queue_handler[pf]) 40 else if (old)
38 ret = -EBUSY; 41 ret = -EBUSY;
39 else { 42 else {
40 rcu_assign_pointer(queue_handler[pf], qh); 43 rcu_assign_pointer(queue_handler[pf], qh);
@@ -49,11 +52,15 @@ EXPORT_SYMBOL(nf_register_queue_handler);
49/* The caller must flush their queue before this */ 52/* The caller must flush their queue before this */
50int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh) 53int nf_unregister_queue_handler(u_int8_t pf, const struct nf_queue_handler *qh)
51{ 54{
55 const struct nf_queue_handler *old;
56
52 if (pf >= ARRAY_SIZE(queue_handler)) 57 if (pf >= ARRAY_SIZE(queue_handler))
53 return -EINVAL; 58 return -EINVAL;
54 59
55 mutex_lock(&queue_handler_mutex); 60 mutex_lock(&queue_handler_mutex);
56 if (queue_handler[pf] && queue_handler[pf] != qh) { 61 old = rcu_dereference_protected(queue_handler[pf],
62 lockdep_is_held(&queue_handler_mutex));
63 if (old && old != qh) {
57 mutex_unlock(&queue_handler_mutex); 64 mutex_unlock(&queue_handler_mutex);
58 return -EINVAL; 65 return -EINVAL;
59 } 66 }
@@ -73,7 +80,10 @@ void nf_unregister_queue_handlers(const struct nf_queue_handler *qh)
73 80
74 mutex_lock(&queue_handler_mutex); 81 mutex_lock(&queue_handler_mutex);
75 for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++) { 82 for (pf = 0; pf < ARRAY_SIZE(queue_handler); pf++) {
76 if (queue_handler[pf] == qh) 83 if (rcu_dereference_protected(
84 queue_handler[pf],
85 lockdep_is_held(&queue_handler_mutex)
86 ) == qh)
77 rcu_assign_pointer(queue_handler[pf], NULL); 87 rcu_assign_pointer(queue_handler[pf], NULL);
78 } 88 }
79 mutex_unlock(&queue_handler_mutex); 89 mutex_unlock(&queue_handler_mutex);
@@ -115,7 +125,7 @@ static int __nf_queue(struct sk_buff *skb,
115 int (*okfn)(struct sk_buff *), 125 int (*okfn)(struct sk_buff *),
116 unsigned int queuenum) 126 unsigned int queuenum)
117{ 127{
118 int status; 128 int status = -ENOENT;
119 struct nf_queue_entry *entry = NULL; 129 struct nf_queue_entry *entry = NULL;
120#ifdef CONFIG_BRIDGE_NETFILTER 130#ifdef CONFIG_BRIDGE_NETFILTER
121 struct net_device *physindev; 131 struct net_device *physindev;
@@ -128,16 +138,20 @@ static int __nf_queue(struct sk_buff *skb,
128 rcu_read_lock(); 138 rcu_read_lock();
129 139
130 qh = rcu_dereference(queue_handler[pf]); 140 qh = rcu_dereference(queue_handler[pf]);
131 if (!qh) 141 if (!qh) {
142 status = -ESRCH;
132 goto err_unlock; 143 goto err_unlock;
144 }
133 145
134 afinfo = nf_get_afinfo(pf); 146 afinfo = nf_get_afinfo(pf);
135 if (!afinfo) 147 if (!afinfo)
136 goto err_unlock; 148 goto err_unlock;
137 149
138 entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC); 150 entry = kmalloc(sizeof(*entry) + afinfo->route_key_size, GFP_ATOMIC);
139 if (!entry) 151 if (!entry) {
152 status = -ENOMEM;
140 goto err_unlock; 153 goto err_unlock;
154 }
141 155
142 *entry = (struct nf_queue_entry) { 156 *entry = (struct nf_queue_entry) {
143 .skb = skb, 157 .skb = skb,
@@ -151,11 +165,9 @@ static int __nf_queue(struct sk_buff *skb,
151 165
152 /* If it's going away, ignore hook. */ 166 /* If it's going away, ignore hook. */
153 if (!try_module_get(entry->elem->owner)) { 167 if (!try_module_get(entry->elem->owner)) {
154 rcu_read_unlock(); 168 status = -ECANCELED;
155 kfree(entry); 169 goto err_unlock;
156 return 0;
157 } 170 }
158
159 /* Bump dev refs so they don't vanish while packet is out */ 171 /* Bump dev refs so they don't vanish while packet is out */
160 if (indev) 172 if (indev)
161 dev_hold(indev); 173 dev_hold(indev);
@@ -182,14 +194,13 @@ static int __nf_queue(struct sk_buff *skb,
182 goto err; 194 goto err;
183 } 195 }
184 196
185 return 1; 197 return 0;
186 198
187err_unlock: 199err_unlock:
188 rcu_read_unlock(); 200 rcu_read_unlock();
189err: 201err:
190 kfree_skb(skb);
191 kfree(entry); 202 kfree(entry);
192 return 1; 203 return status;
193} 204}
194 205
195int nf_queue(struct sk_buff *skb, 206int nf_queue(struct sk_buff *skb,
@@ -201,6 +212,8 @@ int nf_queue(struct sk_buff *skb,
201 unsigned int queuenum) 212 unsigned int queuenum)
202{ 213{
203 struct sk_buff *segs; 214 struct sk_buff *segs;
215 int err;
216 unsigned int queued;
204 217
205 if (!skb_is_gso(skb)) 218 if (!skb_is_gso(skb))
206 return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn, 219 return __nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
@@ -216,20 +229,35 @@ int nf_queue(struct sk_buff *skb,
216 } 229 }
217 230
218 segs = skb_gso_segment(skb, 0); 231 segs = skb_gso_segment(skb, 0);
219 kfree_skb(skb); 232 /* Does not use PTR_ERR to limit the number of error codes that can be
233 * returned by nf_queue. For instance, callers rely on -ECANCELED to mean
234 * 'ignore this hook'.
235 */
220 if (IS_ERR(segs)) 236 if (IS_ERR(segs))
221 return 1; 237 return -EINVAL;
222 238
239 queued = 0;
240 err = 0;
223 do { 241 do {
224 struct sk_buff *nskb = segs->next; 242 struct sk_buff *nskb = segs->next;
225 243
226 segs->next = NULL; 244 segs->next = NULL;
227 if (!__nf_queue(segs, elem, pf, hook, indev, outdev, okfn, 245 if (err == 0)
228 queuenum)) 246 err = __nf_queue(segs, elem, pf, hook, indev,
247 outdev, okfn, queuenum);
248 if (err == 0)
249 queued++;
250 else
229 kfree_skb(segs); 251 kfree_skb(segs);
230 segs = nskb; 252 segs = nskb;
231 } while (segs); 253 } while (segs);
232 return 1; 254
255 /* also free orig skb if only some segments were queued */
256 if (unlikely(err && queued))
257 err = 0;
258 if (err == 0)
259 kfree_skb(skb);
260 return err;
233} 261}
234 262
235void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) 263void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
@@ -237,6 +265,7 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
237 struct sk_buff *skb = entry->skb; 265 struct sk_buff *skb = entry->skb;
238 struct list_head *elem = &entry->elem->list; 266 struct list_head *elem = &entry->elem->list;
239 const struct nf_afinfo *afinfo; 267 const struct nf_afinfo *afinfo;
268 int err;
240 269
241 rcu_read_lock(); 270 rcu_read_lock();
242 271
@@ -270,10 +299,17 @@ void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict)
270 local_bh_enable(); 299 local_bh_enable();
271 break; 300 break;
272 case NF_QUEUE: 301 case NF_QUEUE:
273 if (!__nf_queue(skb, elem, entry->pf, entry->hook, 302 err = __nf_queue(skb, elem, entry->pf, entry->hook,
274 entry->indev, entry->outdev, entry->okfn, 303 entry->indev, entry->outdev, entry->okfn,
275 verdict >> NF_VERDICT_BITS)) 304 verdict >> NF_VERDICT_QBITS);
276 goto next_hook; 305 if (err < 0) {
306 if (err == -ECANCELED)
307 goto next_hook;
308 if (err == -ESRCH &&
309 (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
310 goto next_hook;
311 kfree_skb(skb);
312 }
277 break; 313 break;
278 case NF_STOLEN: 314 case NF_STOLEN:
279 default: 315 default:
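A queueing verdict is a single unsigned int with the queue number and flags packed alongside NF_QUEUE, so a hook or target that wants the new bypass behaviour simply ORs the flag into its return value; __nf_queue()'s -ESRCH (no userspace listener) is then treated as "continue traversal" instead of a drop. A sketch, assuming the macros from linux/netfilter.h:

#include <linux/netfilter.h>

/* Ask for queue 3, but let the packet pass if nothing is listening. */
static unsigned int queue_or_bypass(void)
{
        return NF_QUEUE_NR(3) | NF_VERDICT_FLAG_QUEUE_BYPASS;
}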
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 6a1572b0ab41..91592da504b9 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -874,19 +874,19 @@ static struct hlist_node *get_first(struct iter_state *st)
874 874
875 for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { 875 for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) {
876 if (!hlist_empty(&instance_table[st->bucket])) 876 if (!hlist_empty(&instance_table[st->bucket]))
877 return rcu_dereference_bh(instance_table[st->bucket].first); 877 return rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
878 } 878 }
879 return NULL; 879 return NULL;
880} 880}
881 881
882static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h) 882static struct hlist_node *get_next(struct iter_state *st, struct hlist_node *h)
883{ 883{
884 h = rcu_dereference_bh(h->next); 884 h = rcu_dereference_bh(hlist_next_rcu(h));
885 while (!h) { 885 while (!h) {
886 if (++st->bucket >= INSTANCE_BUCKETS) 886 if (++st->bucket >= INSTANCE_BUCKETS)
887 return NULL; 887 return NULL;
888 888
889 h = rcu_dereference_bh(instance_table[st->bucket].first); 889 h = rcu_dereference_bh(hlist_first_rcu(&instance_table[st->bucket]));
890 } 890 }
891 return h; 891 return h;
892} 892}
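hlist_first_rcu() and hlist_next_rcu() exist so the ->first and ->next slots are read through an __rcu-annotated lvalue, which keeps sparse quiet about the rcu_dereference_bh() calls without changing the generated code. Paraphrasing their definitions in <linux/rculist.h> (check the header for the authoritative form):

#define hlist_first_rcu(head)   (*((struct hlist_node __rcu **)(&(head)->first)))
#define hlist_next_rcu(node)    (*((struct hlist_node __rcu **)(&(node)->next)))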
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 68e67d19724d..b83123f12b42 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -387,25 +387,31 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
387{ 387{
388 struct sk_buff *nskb; 388 struct sk_buff *nskb;
389 struct nfqnl_instance *queue; 389 struct nfqnl_instance *queue;
390 int err; 390 int err = -ENOBUFS;
391 391
392 /* rcu_read_lock()ed by nf_hook_slow() */ 392 /* rcu_read_lock()ed by nf_hook_slow() */
393 queue = instance_lookup(queuenum); 393 queue = instance_lookup(queuenum);
394 if (!queue) 394 if (!queue) {
395 err = -ESRCH;
395 goto err_out; 396 goto err_out;
397 }
396 398
397 if (queue->copy_mode == NFQNL_COPY_NONE) 399 if (queue->copy_mode == NFQNL_COPY_NONE) {
400 err = -EINVAL;
398 goto err_out; 401 goto err_out;
402 }
399 403
400 nskb = nfqnl_build_packet_message(queue, entry); 404 nskb = nfqnl_build_packet_message(queue, entry);
401 if (nskb == NULL) 405 if (nskb == NULL) {
406 err = -ENOMEM;
402 goto err_out; 407 goto err_out;
403 408 }
404 spin_lock_bh(&queue->lock); 409 spin_lock_bh(&queue->lock);
405 410
406 if (!queue->peer_pid) 411 if (!queue->peer_pid) {
412 err = -EINVAL;
407 goto err_out_free_nskb; 413 goto err_out_free_nskb;
408 414 }
409 if (queue->queue_total >= queue->queue_maxlen) { 415 if (queue->queue_total >= queue->queue_maxlen) {
410 queue->queue_dropped++; 416 queue->queue_dropped++;
411 if (net_ratelimit()) 417 if (net_ratelimit())
@@ -432,7 +438,7 @@ err_out_free_nskb:
432err_out_unlock: 438err_out_unlock:
433 spin_unlock_bh(&queue->lock); 439 spin_unlock_bh(&queue->lock);
434err_out: 440err_out:
435 return -1; 441 return err;
436} 442}
437 443
438static int 444static int
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index c94237631077..0a77d2ff2154 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -23,6 +23,7 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/mm.h> 24#include <linux/mm.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/audit.h>
26#include <net/net_namespace.h> 27#include <net/net_namespace.h>
27 28
28#include <linux/netfilter/x_tables.h> 29#include <linux/netfilter/x_tables.h>
@@ -38,9 +39,8 @@ MODULE_DESCRIPTION("{ip,ip6,arp,eb}_tables backend module");
38#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) 39#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
39 40
40struct compat_delta { 41struct compat_delta {
41 struct compat_delta *next; 42 unsigned int offset; /* offset in kernel */
42 unsigned int offset; 43 int delta; /* delta in 32bit user land */
43 int delta;
44}; 44};
45 45
46struct xt_af { 46struct xt_af {
@@ -49,7 +49,9 @@ struct xt_af {
49 struct list_head target; 49 struct list_head target;
50#ifdef CONFIG_COMPAT 50#ifdef CONFIG_COMPAT
51 struct mutex compat_mutex; 51 struct mutex compat_mutex;
52 struct compat_delta *compat_offsets; 52 struct compat_delta *compat_tab;
53 unsigned int number; /* number of slots in compat_tab[] */
54 unsigned int cur; /* number of used slots in compat_tab[] */
53#endif 55#endif
54}; 56};
55 57
@@ -414,54 +416,67 @@ int xt_check_match(struct xt_mtchk_param *par,
414EXPORT_SYMBOL_GPL(xt_check_match); 416EXPORT_SYMBOL_GPL(xt_check_match);
415 417
416#ifdef CONFIG_COMPAT 418#ifdef CONFIG_COMPAT
417int xt_compat_add_offset(u_int8_t af, unsigned int offset, short delta) 419int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta)
418{ 420{
419 struct compat_delta *tmp; 421 struct xt_af *xp = &xt[af];
420 422
421 tmp = kmalloc(sizeof(struct compat_delta), GFP_KERNEL); 423 if (!xp->compat_tab) {
422 if (!tmp) 424 if (!xp->number)
423 return -ENOMEM; 425 return -EINVAL;
426 xp->compat_tab = vmalloc(sizeof(struct compat_delta) * xp->number);
427 if (!xp->compat_tab)
428 return -ENOMEM;
429 xp->cur = 0;
430 }
424 431
425 tmp->offset = offset; 432 if (xp->cur >= xp->number)
426 tmp->delta = delta; 433 return -EINVAL;
427 434
428 if (xt[af].compat_offsets) { 435 if (xp->cur)
429 tmp->next = xt[af].compat_offsets->next; 436 delta += xp->compat_tab[xp->cur - 1].delta;
430 xt[af].compat_offsets->next = tmp; 437 xp->compat_tab[xp->cur].offset = offset;
431 } else { 438 xp->compat_tab[xp->cur].delta = delta;
432 xt[af].compat_offsets = tmp; 439 xp->cur++;
433 tmp->next = NULL;
434 }
435 return 0; 440 return 0;
436} 441}
437EXPORT_SYMBOL_GPL(xt_compat_add_offset); 442EXPORT_SYMBOL_GPL(xt_compat_add_offset);
438 443
439void xt_compat_flush_offsets(u_int8_t af) 444void xt_compat_flush_offsets(u_int8_t af)
440{ 445{
441 struct compat_delta *tmp, *next; 446 if (xt[af].compat_tab) {
442 447 vfree(xt[af].compat_tab);
443 if (xt[af].compat_offsets) { 448 xt[af].compat_tab = NULL;
444 for (tmp = xt[af].compat_offsets; tmp; tmp = next) { 449 xt[af].number = 0;
445 next = tmp->next;
446 kfree(tmp);
447 }
448 xt[af].compat_offsets = NULL;
449 } 450 }
450} 451}
451EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); 452EXPORT_SYMBOL_GPL(xt_compat_flush_offsets);
452 453
453int xt_compat_calc_jump(u_int8_t af, unsigned int offset) 454int xt_compat_calc_jump(u_int8_t af, unsigned int offset)
454{ 455{
455 struct compat_delta *tmp; 456 struct compat_delta *tmp = xt[af].compat_tab;
456 int delta; 457 int mid, left = 0, right = xt[af].cur - 1;
457 458
458 for (tmp = xt[af].compat_offsets, delta = 0; tmp; tmp = tmp->next) 459 while (left <= right) {
459 if (tmp->offset < offset) 460 mid = (left + right) >> 1;
460 delta += tmp->delta; 461 if (offset > tmp[mid].offset)
461 return delta; 462 left = mid + 1;
463 else if (offset < tmp[mid].offset)
464 right = mid - 1;
465 else
466 return mid ? tmp[mid - 1].delta : 0;
467 }
468 WARN_ON_ONCE(1);
469 return 0;
462} 470}
463EXPORT_SYMBOL_GPL(xt_compat_calc_jump); 471EXPORT_SYMBOL_GPL(xt_compat_calc_jump);
464 472
473void xt_compat_init_offsets(u_int8_t af, unsigned int number)
474{
475 xt[af].number = number;
476 xt[af].cur = 0;
477}
478EXPORT_SYMBOL(xt_compat_init_offsets);
479
465int xt_compat_match_offset(const struct xt_match *match) 480int xt_compat_match_offset(const struct xt_match *match)
466{ 481{
467 u_int16_t csize = match->compatsize ? : match->matchsize; 482 u_int16_t csize = match->compatsize ? : match->matchsize;
@@ -820,6 +835,21 @@ xt_replace_table(struct xt_table *table,
820 */ 835 */
821 local_bh_enable(); 836 local_bh_enable();
822 837
838#ifdef CONFIG_AUDIT
839 if (audit_enabled) {
840 struct audit_buffer *ab;
841
842 ab = audit_log_start(current->audit_context, GFP_KERNEL,
843 AUDIT_NETFILTER_CFG);
844 if (ab) {
845 audit_log_format(ab, "table=%s family=%u entries=%u",
846 table->name, table->af,
847 private->number);
848 audit_log_end(ab);
849 }
850 }
851#endif
852
823 return private; 853 return private;
824} 854}
825EXPORT_SYMBOL_GPL(xt_replace_table); 855EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -1338,7 +1368,7 @@ static int __init xt_init(void)
1338 mutex_init(&xt[i].mutex); 1368 mutex_init(&xt[i].mutex);
1339#ifdef CONFIG_COMPAT 1369#ifdef CONFIG_COMPAT
1340 mutex_init(&xt[i].compat_mutex); 1370 mutex_init(&xt[i].compat_mutex);
1341 xt[i].compat_offsets = NULL; 1371 xt[i].compat_tab = NULL;
1342#endif 1372#endif
1343 INIT_LIST_HEAD(&xt[i].target); 1373 INIT_LIST_HEAD(&xt[i].target);
1344 INIT_LIST_HEAD(&xt[i].match); 1374 INIT_LIST_HEAD(&xt[i].match);
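The compat bookkeeping above trades the old linked list for a flat array sized up front by xt_compat_init_offsets(): each entry records a rule offset plus the cumulative size delta of all rules before it, so translating a compat jump target becomes a binary search instead of a list walk. Stripped of the xt_af plumbing, the lookup works like this:

struct delta_entry {
        unsigned int offset;    /* rule offset in the kernel layout */
        int delta;              /* cumulative delta accumulated up to this rule */
};

/* Delta to apply to a jump that targets 'offset' (0 for the first rule). */
static int calc_jump(const struct delta_entry *tab, unsigned int n,
                     unsigned int offset)
{
        unsigned int left = 0, right = n;

        while (left < right) {
                unsigned int mid = left + (right - left) / 2;

                if (tab[mid].offset < offset)
                        left = mid + 1;
                else if (tab[mid].offset > offset)
                        right = mid;
                else
                        return mid ? tab[mid - 1].delta : 0;
        }
        return 0;       /* not found; the kernel version WARNs here */
}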
diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c
new file mode 100644
index 000000000000..81802d27346e
--- /dev/null
+++ b/net/netfilter/xt_AUDIT.c
@@ -0,0 +1,204 @@
1/*
2 * Creates audit record for dropped/accepted packets
3 *
4 * (C) 2010-2011 Thomas Graf <tgraf@redhat.com>
5 * (C) 2010-2011 Red Hat, Inc.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10*/
11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
14#include <linux/audit.h>
15#include <linux/module.h>
16#include <linux/skbuff.h>
17#include <linux/tcp.h>
18#include <linux/udp.h>
19#include <linux/if_arp.h>
20#include <linux/netfilter/x_tables.h>
21#include <linux/netfilter/xt_AUDIT.h>
22#include <net/ipv6.h>
23#include <net/ip.h>
24
25MODULE_LICENSE("GPL");
26MODULE_AUTHOR("Thomas Graf <tgraf@redhat.com>");
27MODULE_DESCRIPTION("Xtables: creates audit records for dropped/accepted packets");
28MODULE_ALIAS("ipt_AUDIT");
29MODULE_ALIAS("ip6t_AUDIT");
30MODULE_ALIAS("ebt_AUDIT");
31MODULE_ALIAS("arpt_AUDIT");
32
33static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb,
34 unsigned int proto, unsigned int offset)
35{
36 switch (proto) {
37 case IPPROTO_TCP:
38 case IPPROTO_UDP:
39 case IPPROTO_UDPLITE: {
40 const __be16 *pptr;
41 __be16 _ports[2];
42
43 pptr = skb_header_pointer(skb, offset, sizeof(_ports), _ports);
44 if (pptr == NULL) {
45 audit_log_format(ab, " truncated=1");
46 return;
47 }
48
49 audit_log_format(ab, " sport=%hu dport=%hu",
50 ntohs(pptr[0]), ntohs(pptr[1]));
51 }
52 break;
53
54 case IPPROTO_ICMP:
55 case IPPROTO_ICMPV6: {
56 const u8 *iptr;
57 u8 _ih[2];
58
59 iptr = skb_header_pointer(skb, offset, sizeof(_ih), &_ih);
60 if (iptr == NULL) {
61 audit_log_format(ab, " truncated=1");
62 return;
63 }
64
65 audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu",
66 iptr[0], iptr[1]);
67
68 }
69 break;
70 }
71}
72
73static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb)
74{
75 struct iphdr _iph;
76 const struct iphdr *ih;
77
78 ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
79 if (!ih) {
80 audit_log_format(ab, " truncated=1");
81 return;
82 }
83
84 audit_log_format(ab, " saddr=%pI4 daddr=%pI4 ipid=%hu proto=%hhu",
85 &ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol);
86
87 if (ntohs(ih->frag_off) & IP_OFFSET) {
88 audit_log_format(ab, " frag=1");
89 return;
90 }
91
92 audit_proto(ab, skb, ih->protocol, ih->ihl * 4);
93}
94
95static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb)
96{
97 struct ipv6hdr _ip6h;
98 const struct ipv6hdr *ih;
99 u8 nexthdr;
100 int offset;
101
102 ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h);
103 if (!ih) {
104 audit_log_format(ab, " truncated=1");
105 return;
106 }
107
108 nexthdr = ih->nexthdr;
109 offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h),
110 &nexthdr);
111
112 audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu",
113 &ih->saddr, &ih->daddr, nexthdr);
114
115 if (offset)
116 audit_proto(ab, skb, nexthdr, offset);
117}
118
119static unsigned int
120audit_tg(struct sk_buff *skb, const struct xt_action_param *par)
121{
122 const struct xt_audit_info *info = par->targinfo;
123 struct audit_buffer *ab;
124
125 ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT);
126 if (ab == NULL)
127 goto errout;
128
129 audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s",
130 info->type, par->hooknum, skb->len,
131 par->in ? par->in->name : "?",
132 par->out ? par->out->name : "?");
133
134 if (skb->mark)
135 audit_log_format(ab, " mark=%#x", skb->mark);
136
137 if (skb->dev && skb->dev->type == ARPHRD_ETHER) {
138 audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x",
139 eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
140 ntohs(eth_hdr(skb)->h_proto));
141
142 if (par->family == NFPROTO_BRIDGE) {
143 switch (eth_hdr(skb)->h_proto) {
144 case __constant_htons(ETH_P_IP):
145 audit_ip4(ab, skb);
146 break;
147
148 case __constant_htons(ETH_P_IPV6):
149 audit_ip6(ab, skb);
150 break;
151 }
152 }
153 }
154
155 switch (par->family) {
156 case NFPROTO_IPV4:
157 audit_ip4(ab, skb);
158 break;
159
160 case NFPROTO_IPV6:
161 audit_ip6(ab, skb);
162 break;
163 }
164
165 audit_log_end(ab);
166
167errout:
168 return XT_CONTINUE;
169}
170
171static int audit_tg_check(const struct xt_tgchk_param *par)
172{
173 const struct xt_audit_info *info = par->targinfo;
174
175 if (info->type > XT_AUDIT_TYPE_MAX) {
176 pr_info("Audit type out of range (valid range: 0..%hhu)\n",
177 XT_AUDIT_TYPE_MAX);
178 return -ERANGE;
179 }
180
181 return 0;
182}
183
184static struct xt_target audit_tg_reg __read_mostly = {
185 .name = "AUDIT",
186 .family = NFPROTO_UNSPEC,
187 .target = audit_tg,
188 .targetsize = sizeof(struct xt_audit_info),
189 .checkentry = audit_tg_check,
190 .me = THIS_MODULE,
191};
192
193static int __init audit_tg_init(void)
194{
195 return xt_register_target(&audit_tg_reg);
196}
197
198static void __exit audit_tg_exit(void)
199{
200 xt_unregister_target(&audit_tg_reg);
201}
202
203module_init(audit_tg_init);
204module_exit(audit_tg_exit);
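With the matching userspace extension the target can be attached from any table; it only emits an AUDIT_NETFILTER_PKT record and always returns XT_CONTINUE, so rule traversal is unaffected. A typical rule would look something like (option names assumed from the iptables AUDIT extension, not defined by this kernel module):

        iptables -A INPUT -m state --state NEW -j AUDIT --type accept

where --type selects whether the record is tagged as an accept, drop or reject event.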
diff --git a/net/netfilter/xt_CLASSIFY.c b/net/netfilter/xt_CLASSIFY.c
index c2c0e4abeb99..af9c4dadf816 100644
--- a/net/netfilter/xt_CLASSIFY.c
+++ b/net/netfilter/xt_CLASSIFY.c
@@ -19,12 +19,14 @@
19#include <linux/netfilter_ipv6.h> 19#include <linux/netfilter_ipv6.h>
20#include <linux/netfilter/x_tables.h> 20#include <linux/netfilter/x_tables.h>
21#include <linux/netfilter/xt_CLASSIFY.h> 21#include <linux/netfilter/xt_CLASSIFY.h>
22#include <linux/netfilter_arp.h>
22 23
23MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); 24MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
24MODULE_LICENSE("GPL"); 25MODULE_LICENSE("GPL");
25MODULE_DESCRIPTION("Xtables: Qdisc classification"); 26MODULE_DESCRIPTION("Xtables: Qdisc classification");
26MODULE_ALIAS("ipt_CLASSIFY"); 27MODULE_ALIAS("ipt_CLASSIFY");
27MODULE_ALIAS("ip6t_CLASSIFY"); 28MODULE_ALIAS("ip6t_CLASSIFY");
29MODULE_ALIAS("arpt_CLASSIFY");
28 30
29static unsigned int 31static unsigned int
30classify_tg(struct sk_buff *skb, const struct xt_action_param *par) 32classify_tg(struct sk_buff *skb, const struct xt_action_param *par)
@@ -35,26 +37,36 @@ classify_tg(struct sk_buff *skb, const struct xt_action_param *par)
35 return XT_CONTINUE; 37 return XT_CONTINUE;
36} 38}
37 39
38static struct xt_target classify_tg_reg __read_mostly = { 40static struct xt_target classify_tg_reg[] __read_mostly = {
39 .name = "CLASSIFY", 41 {
40 .revision = 0, 42 .name = "CLASSIFY",
41 .family = NFPROTO_UNSPEC, 43 .revision = 0,
42 .table = "mangle", 44 .family = NFPROTO_UNSPEC,
43 .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | 45 .hooks = (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) |
44 (1 << NF_INET_POST_ROUTING), 46 (1 << NF_INET_POST_ROUTING),
45 .target = classify_tg, 47 .target = classify_tg,
46 .targetsize = sizeof(struct xt_classify_target_info), 48 .targetsize = sizeof(struct xt_classify_target_info),
47 .me = THIS_MODULE, 49 .me = THIS_MODULE,
50 },
51 {
52 .name = "CLASSIFY",
53 .revision = 0,
54 .family = NFPROTO_ARP,
55 .hooks = (1 << NF_ARP_OUT) | (1 << NF_ARP_FORWARD),
56 .target = classify_tg,
57 .targetsize = sizeof(struct xt_classify_target_info),
58 .me = THIS_MODULE,
59 },
48}; 60};
49 61
50static int __init classify_tg_init(void) 62static int __init classify_tg_init(void)
51{ 63{
52 return xt_register_target(&classify_tg_reg); 64 return xt_register_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
53} 65}
54 66
55static void __exit classify_tg_exit(void) 67static void __exit classify_tg_exit(void)
56{ 68{
57 xt_unregister_target(&classify_tg_reg); 69 xt_unregister_targets(classify_tg_reg, ARRAY_SIZE(classify_tg_reg));
58} 70}
59 71
60module_init(classify_tg_init); 72module_init(classify_tg_init);
diff --git a/net/netfilter/xt_IDLETIMER.c b/net/netfilter/xt_IDLETIMER.c
index be1f22e13545..3bdd443aaf15 100644
--- a/net/netfilter/xt_IDLETIMER.c
+++ b/net/netfilter/xt_IDLETIMER.c
@@ -313,3 +313,5 @@ MODULE_AUTHOR("Timo Teras <ext-timo.teras@nokia.com>");
313MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>"); 313MODULE_AUTHOR("Luciano Coelho <luciano.coelho@nokia.com>");
314MODULE_DESCRIPTION("Xtables: idle time monitor"); 314MODULE_DESCRIPTION("Xtables: idle time monitor");
315MODULE_LICENSE("GPL v2"); 315MODULE_LICENSE("GPL v2");
316MODULE_ALIAS("ipt_IDLETIMER");
317MODULE_ALIAS("ip6t_IDLETIMER");
diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c
index a4140509eea1..993de2ba89d3 100644
--- a/net/netfilter/xt_LED.c
+++ b/net/netfilter/xt_LED.c
@@ -31,6 +31,8 @@
31MODULE_LICENSE("GPL"); 31MODULE_LICENSE("GPL");
32MODULE_AUTHOR("Adam Nielsen <a.nielsen@shikadi.net>"); 32MODULE_AUTHOR("Adam Nielsen <a.nielsen@shikadi.net>");
33MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match"); 33MODULE_DESCRIPTION("Xtables: trigger LED devices on packet match");
34MODULE_ALIAS("ipt_LED");
35MODULE_ALIAS("ip6t_LED");
34 36
35static LIST_HEAD(xt_led_triggers); 37static LIST_HEAD(xt_led_triggers);
36static DEFINE_MUTEX(xt_led_mutex); 38static DEFINE_MUTEX(xt_led_mutex);
diff --git a/net/netfilter/xt_NFQUEUE.c b/net/netfilter/xt_NFQUEUE.c
index 039cce1bde3d..d4f4b5d66b20 100644
--- a/net/netfilter/xt_NFQUEUE.c
+++ b/net/netfilter/xt_NFQUEUE.c
@@ -72,18 +72,31 @@ nfqueue_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
72 72
73 if (info->queues_total > 1) { 73 if (info->queues_total > 1) {
74 if (par->family == NFPROTO_IPV4) 74 if (par->family == NFPROTO_IPV4)
75 queue = hash_v4(skb) % info->queues_total + queue; 75 queue = (((u64) hash_v4(skb) * info->queues_total) >>
76 32) + queue;
76#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE) 77#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
77 else if (par->family == NFPROTO_IPV6) 78 else if (par->family == NFPROTO_IPV6)
78 queue = hash_v6(skb) % info->queues_total + queue; 79 queue = (((u64) hash_v6(skb) * info->queues_total) >>
80 32) + queue;
79#endif 81#endif
80 } 82 }
81 return NF_QUEUE_NR(queue); 83 return NF_QUEUE_NR(queue);
82} 84}
83 85
84static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par) 86static unsigned int
87nfqueue_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
85{ 88{
86 const struct xt_NFQ_info_v1 *info = par->targinfo; 89 const struct xt_NFQ_info_v2 *info = par->targinfo;
90 unsigned int ret = nfqueue_tg_v1(skb, par);
91
92 if (info->bypass)
93 ret |= NF_VERDICT_FLAG_QUEUE_BYPASS;
94 return ret;
95}
96
97static int nfqueue_tg_check(const struct xt_tgchk_param *par)
98{
99 const struct xt_NFQ_info_v2 *info = par->targinfo;
87 u32 maxid; 100 u32 maxid;
88 101
89 if (unlikely(!rnd_inited)) { 102 if (unlikely(!rnd_inited)) {
@@ -100,6 +113,8 @@ static int nfqueue_tg_v1_check(const struct xt_tgchk_param *par)
100 info->queues_total, maxid); 113 info->queues_total, maxid);
101 return -ERANGE; 114 return -ERANGE;
102 } 115 }
116 if (par->target->revision == 2 && info->bypass > 1)
117 return -EINVAL;
103 return 0; 118 return 0;
104} 119}
105 120
@@ -115,11 +130,20 @@ static struct xt_target nfqueue_tg_reg[] __read_mostly = {
115 .name = "NFQUEUE", 130 .name = "NFQUEUE",
116 .revision = 1, 131 .revision = 1,
117 .family = NFPROTO_UNSPEC, 132 .family = NFPROTO_UNSPEC,
118 .checkentry = nfqueue_tg_v1_check, 133 .checkentry = nfqueue_tg_check,
119 .target = nfqueue_tg_v1, 134 .target = nfqueue_tg_v1,
120 .targetsize = sizeof(struct xt_NFQ_info_v1), 135 .targetsize = sizeof(struct xt_NFQ_info_v1),
121 .me = THIS_MODULE, 136 .me = THIS_MODULE,
122 }, 137 },
138 {
139 .name = "NFQUEUE",
140 .revision = 2,
141 .family = NFPROTO_UNSPEC,
142 .checkentry = nfqueue_tg_check,
143 .target = nfqueue_tg_v2,
144 .targetsize = sizeof(struct xt_NFQ_info_v2),
145 .me = THIS_MODULE,
146 },
123}; 147};
124 148
125static int __init nfqueue_tg_init(void) 149static int __init nfqueue_tg_init(void)
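Two things change here: queue balancing now maps the 32-bit jhash onto [0, queues_total) with a multiply-and-shift instead of a modulo (no division, and unbiased for a uniform hash), and revision 2 adds a bypass bit that is ORed into the verdict. The scaling trick in isolation:

#include <linux/types.h>

/* Map a uniform 32-bit hash onto [0, n) without a division. */
static inline u32 scale_hash(u32 hash, u32 n)
{
        return ((u64)hash * n) >> 32;
}

A revision-2 rule would then be written with the iptables options --queue-balance 0:3 --queue-bypass (option names assumed from the userspace extension).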
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 5c5b6b921b84..7fd3fd51f274 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -193,10 +193,12 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
193 193
194 if (par->family == NFPROTO_IPV6) { 194 if (par->family == NFPROTO_IPV6) {
195 const struct ipv6hdr *iph = ipv6_hdr(skb); 195 const struct ipv6hdr *iph = ipv6_hdr(skb);
196 memcpy(&addr.ip6, &iph->saddr, sizeof(iph->saddr)); 196 memcpy(&addr.ip6, (info->flags & XT_CONNLIMIT_DADDR) ?
197 &iph->daddr : &iph->saddr, sizeof(addr.ip6));
197 } else { 198 } else {
198 const struct iphdr *iph = ip_hdr(skb); 199 const struct iphdr *iph = ip_hdr(skb);
199 addr.ip = iph->saddr; 200 addr.ip = (info->flags & XT_CONNLIMIT_DADDR) ?
201 iph->daddr : iph->saddr;
200 } 202 }
201 203
202 spin_lock_bh(&info->data->lock); 204 spin_lock_bh(&info->data->lock);
@@ -204,13 +206,12 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
204 &info->mask, par->family); 206 &info->mask, par->family);
205 spin_unlock_bh(&info->data->lock); 207 spin_unlock_bh(&info->data->lock);
206 208
207 if (connections < 0) { 209 if (connections < 0)
208 /* kmalloc failed, drop it entirely */ 210 /* kmalloc failed, drop it entirely */
209 par->hotdrop = true; 211 goto hotdrop;
210 return false;
211 }
212 212
213 return (connections > info->limit) ^ info->inverse; 213 return (connections > info->limit) ^
214 !!(info->flags & XT_CONNLIMIT_INVERT);
214 215
215 hotdrop: 216 hotdrop:
216 par->hotdrop = true; 217 par->hotdrop = true;
@@ -268,25 +269,38 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
268 kfree(info->data); 269 kfree(info->data);
269} 270}
270 271
271static struct xt_match connlimit_mt_reg __read_mostly = { 272static struct xt_match connlimit_mt_reg[] __read_mostly = {
272 .name = "connlimit", 273 {
273 .revision = 0, 274 .name = "connlimit",
274 .family = NFPROTO_UNSPEC, 275 .revision = 0,
275 .checkentry = connlimit_mt_check, 276 .family = NFPROTO_UNSPEC,
276 .match = connlimit_mt, 277 .checkentry = connlimit_mt_check,
277 .matchsize = sizeof(struct xt_connlimit_info), 278 .match = connlimit_mt,
278 .destroy = connlimit_mt_destroy, 279 .matchsize = sizeof(struct xt_connlimit_info),
279 .me = THIS_MODULE, 280 .destroy = connlimit_mt_destroy,
281 .me = THIS_MODULE,
282 },
283 {
284 .name = "connlimit",
285 .revision = 1,
286 .family = NFPROTO_UNSPEC,
287 .checkentry = connlimit_mt_check,
288 .match = connlimit_mt,
289 .matchsize = sizeof(struct xt_connlimit_info),
290 .destroy = connlimit_mt_destroy,
291 .me = THIS_MODULE,
292 },
280}; 293};
281 294
282static int __init connlimit_mt_init(void) 295static int __init connlimit_mt_init(void)
283{ 296{
284 return xt_register_match(&connlimit_mt_reg); 297 return xt_register_matches(connlimit_mt_reg,
298 ARRAY_SIZE(connlimit_mt_reg));
285} 299}
286 300
287static void __exit connlimit_mt_exit(void) 301static void __exit connlimit_mt_exit(void)
288{ 302{
289 xt_unregister_match(&connlimit_mt_reg); 303 xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
290} 304}
291 305
292module_init(connlimit_mt_init); 306module_init(connlimit_mt_init);
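Revision 1 reuses the same match function but honours two new bits in ->flags: XT_CONNLIMIT_DADDR keys the per-host count on the destination address instead of the source, and XT_CONNLIMIT_INVERT replaces the old ->inverse field. With the corresponding iptables extension (option name assumed), a rule limiting connections per destination might read:

        iptables -A FORWARD -p tcp --syn --dport 80 -m connlimit --connlimit-above 16 --connlimit-daddr -j DROP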
diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c
index e536710ad916..4ef1b63ad73f 100644
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -112,6 +112,54 @@ ct_proto_port_check(const struct xt_conntrack_mtinfo2 *info,
112 return true; 112 return true;
113} 113}
114 114
115static inline bool
116port_match(u16 min, u16 max, u16 port, bool invert)
117{
118 return (port >= min && port <= max) ^ invert;
119}
120
121static inline bool
122ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info,
123 const struct nf_conn *ct)
124{
125 const struct nf_conntrack_tuple *tuple;
126
127 tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
128 if ((info->match_flags & XT_CONNTRACK_PROTO) &&
129 (nf_ct_protonum(ct) == info->l4proto) ^
130 !(info->invert_flags & XT_CONNTRACK_PROTO))
131 return false;
132
133 /* Shortcut to match all recognized protocols by using ->src.all. */
134 if ((info->match_flags & XT_CONNTRACK_ORIGSRC_PORT) &&
135 !port_match(info->origsrc_port, info->origsrc_port_high,
136 ntohs(tuple->src.u.all),
137 info->invert_flags & XT_CONNTRACK_ORIGSRC_PORT))
138 return false;
139
140 if ((info->match_flags & XT_CONNTRACK_ORIGDST_PORT) &&
141 !port_match(info->origdst_port, info->origdst_port_high,
142 ntohs(tuple->dst.u.all),
143 info->invert_flags & XT_CONNTRACK_ORIGDST_PORT))
144 return false;
145
146 tuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple;
147
148 if ((info->match_flags & XT_CONNTRACK_REPLSRC_PORT) &&
149 !port_match(info->replsrc_port, info->replsrc_port_high,
150 ntohs(tuple->src.u.all),
151 info->invert_flags & XT_CONNTRACK_REPLSRC_PORT))
152 return false;
153
154 if ((info->match_flags & XT_CONNTRACK_REPLDST_PORT) &&
155 !port_match(info->repldst_port, info->repldst_port_high,
156 ntohs(tuple->dst.u.all),
157 info->invert_flags & XT_CONNTRACK_REPLDST_PORT))
158 return false;
159
160 return true;
161}
162
115static bool 163static bool
116conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, 164conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
117 u16 state_mask, u16 status_mask) 165 u16 state_mask, u16 status_mask)
@@ -170,8 +218,13 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
170 !(info->invert_flags & XT_CONNTRACK_REPLDST)) 218 !(info->invert_flags & XT_CONNTRACK_REPLDST))
171 return false; 219 return false;
172 220
173 if (!ct_proto_port_check(info, ct)) 221 if (par->match->revision != 3) {
174 return false; 222 if (!ct_proto_port_check(info, ct))
223 return false;
224 } else {
225 if (!ct_proto_port_check_v3(par->matchinfo, ct))
226 return false;
227 }
175 228
176 if ((info->match_flags & XT_CONNTRACK_STATUS) && 229 if ((info->match_flags & XT_CONNTRACK_STATUS) &&
177 (!!(status_mask & ct->status) ^ 230 (!!(status_mask & ct->status) ^
@@ -207,6 +260,14 @@ conntrack_mt_v2(const struct sk_buff *skb, struct xt_action_param *par)
207 return conntrack_mt(skb, par, info->state_mask, info->status_mask); 260 return conntrack_mt(skb, par, info->state_mask, info->status_mask);
208} 261}
209 262
263static bool
264conntrack_mt_v3(const struct sk_buff *skb, struct xt_action_param *par)
265{
266 const struct xt_conntrack_mtinfo3 *info = par->matchinfo;
267
268 return conntrack_mt(skb, par, info->state_mask, info->status_mask);
269}
270
210static int conntrack_mt_check(const struct xt_mtchk_param *par) 271static int conntrack_mt_check(const struct xt_mtchk_param *par)
211{ 272{
212 int ret; 273 int ret;
@@ -244,6 +305,16 @@ static struct xt_match conntrack_mt_reg[] __read_mostly = {
244 .destroy = conntrack_mt_destroy, 305 .destroy = conntrack_mt_destroy,
245 .me = THIS_MODULE, 306 .me = THIS_MODULE,
246 }, 307 },
308 {
309 .name = "conntrack",
310 .revision = 3,
311 .family = NFPROTO_UNSPEC,
312 .matchsize = sizeof(struct xt_conntrack_mtinfo3),
313 .match = conntrack_mt_v3,
314 .checkentry = conntrack_mt_check,
315 .destroy = conntrack_mt_destroy,
316 .me = THIS_MODULE,
317 },
247}; 318};
248 319
249static int __init conntrack_mt_init(void) 320static int __init conntrack_mt_init(void)
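Revision 3 keeps the address and state logic of the earlier revisions but lets each of the four tuple ports be matched against a range (origsrc_port..origsrc_port_high and so on) through port_match(). Assuming the usual userspace option names gain range syntax to go with the new kernel fields, a rule could look like:

        iptables -A INPUT -m conntrack --ctstate NEW --ctorigdstport 8000:8100 -j ACCEPT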
diff --git a/net/netfilter/xt_cpu.c b/net/netfilter/xt_cpu.c
index b39db8a5cbae..c7a2e5466bc4 100644
--- a/net/netfilter/xt_cpu.c
+++ b/net/netfilter/xt_cpu.c
@@ -22,6 +22,8 @@
22MODULE_LICENSE("GPL"); 22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>"); 23MODULE_AUTHOR("Eric Dumazet <eric.dumazet@gmail.com>");
24MODULE_DESCRIPTION("Xtables: CPU match"); 24MODULE_DESCRIPTION("Xtables: CPU match");
25MODULE_ALIAS("ipt_cpu");
26MODULE_ALIAS("ip6t_cpu");
25 27
26static int cpu_mt_check(const struct xt_mtchk_param *par) 28static int cpu_mt_check(const struct xt_mtchk_param *par)
27{ 29{
diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c
index 9127a3d8aa35..bb10b0717f1b 100644
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -85,7 +85,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par)
85 /* 85 /*
86 * Check if the packet belongs to an existing entry 86 * Check if the packet belongs to an existing entry
87 */ 87 */
88 cp = pp->conn_out_get(family, skb, pp, &iph, iph.len, 1 /* inverse */); 88 cp = pp->conn_out_get(family, skb, &iph, iph.len, 1 /* inverse */);
89 if (unlikely(cp == NULL)) { 89 if (unlikely(cp == NULL)) {
90 match = false; 90 match = false;
91 goto out; 91 goto out;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 91cb1d71f018..c60649ec1193 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -164,7 +164,6 @@ struct packet_mreq_max {
164static int packet_set_ring(struct sock *sk, struct tpacket_req *req, 164static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
165 int closing, int tx_ring); 165 int closing, int tx_ring);
166 166
167#define PGV_FROM_VMALLOC 1
168struct pgv { 167struct pgv {
169 char *buffer; 168 char *buffer;
170}; 169};
@@ -523,11 +522,11 @@ static inline unsigned int run_filter(const struct sk_buff *skb,
523{ 522{
524 struct sk_filter *filter; 523 struct sk_filter *filter;
525 524
526 rcu_read_lock_bh(); 525 rcu_read_lock();
527 filter = rcu_dereference_bh(sk->sk_filter); 526 filter = rcu_dereference(sk->sk_filter);
528 if (filter != NULL) 527 if (filter != NULL)
529 res = sk_run_filter(skb, filter->insns); 528 res = sk_run_filter(skb, filter->insns);
530 rcu_read_unlock_bh(); 529 rcu_read_unlock();
531 530
532 return res; 531 return res;
533} 532}
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 9542449c0720..da8adac2bf06 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -50,7 +50,6 @@ rdsdebug(char *fmt, ...)
50#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT)) 50#define RDS_FRAG_SIZE ((unsigned int)(1 << RDS_FRAG_SHIFT))
51 51
52#define RDS_CONG_MAP_BYTES (65536 / 8) 52#define RDS_CONG_MAP_BYTES (65536 / 8)
53#define RDS_CONG_MAP_LONGS (RDS_CONG_MAP_BYTES / sizeof(unsigned long))
54#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE) 53#define RDS_CONG_MAP_PAGES (PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
55#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8) 54#define RDS_CONG_MAP_PAGE_BITS (PAGE_SIZE * 8)
56 55
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index f04d4a484d53..e318f458713e 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -205,6 +205,18 @@ config NET_SCH_DRR
205 205
206 If unsure, say N. 206 If unsure, say N.
207 207
208config NET_SCH_MQPRIO
209 tristate "Multi-queue priority scheduler (MQPRIO)"
210 help
211 Say Y here if you want to use the Multi-queue Priority scheduler.
212 This scheduler allows QOS to be offloaded on NICs that have support
213 for offloading QOS schedulers.
214
215 To compile this driver as a module, choose M here: the module will
216 be called sch_mqprio.
217
218 If unsure, say N.
219
208config NET_SCH_INGRESS 220config NET_SCH_INGRESS
209 tristate "Ingress Qdisc" 221 tristate "Ingress Qdisc"
210 depends on NET_CLS_ACT 222 depends on NET_CLS_ACT
@@ -243,7 +255,7 @@ config NET_CLS_TCINDEX
243 255
244config NET_CLS_ROUTE4 256config NET_CLS_ROUTE4
245 tristate "Routing decision (ROUTE)" 257 tristate "Routing decision (ROUTE)"
246 select NET_CLS_ROUTE 258 select IP_ROUTE_CLASSID
247 select NET_CLS 259 select NET_CLS
248 ---help--- 260 ---help---
249 If you say Y here, you will be able to classify packets 261 If you say Y here, you will be able to classify packets
@@ -252,9 +264,6 @@ config NET_CLS_ROUTE4
252 To compile this code as a module, choose M here: the 264 To compile this code as a module, choose M here: the
253 module will be called cls_route. 265 module will be called cls_route.
254 266
255config NET_CLS_ROUTE
256 bool
257
258config NET_CLS_FW 267config NET_CLS_FW
259 tristate "Netfilter mark (FW)" 268 tristate "Netfilter mark (FW)"
260 select NET_CLS 269 select NET_CLS
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 960f5dba6304..26ce681a2c60 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_NET_SCH_MULTIQ) += sch_multiq.o
32obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o 32obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o
33obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o 33obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
34obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o 34obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
35obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
35obj-$(CONFIG_NET_CLS_U32) += cls_u32.o 36obj-$(CONFIG_NET_CLS_U32) += cls_u32.o
36obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o 37obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o
37obj-$(CONFIG_NET_CLS_FW) += cls_fw.o 38obj-$(CONFIG_NET_CLS_FW) += cls_fw.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 23b25f89e7e0..15873e14cb54 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -78,7 +78,7 @@ static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
78 struct tc_action *a, struct tcf_hashinfo *hinfo) 78 struct tc_action *a, struct tcf_hashinfo *hinfo)
79{ 79{
80 struct tcf_common *p; 80 struct tcf_common *p;
81 int err = 0, index = -1,i = 0, s_i = 0, n_i = 0; 81 int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
82 struct nlattr *nest; 82 struct nlattr *nest;
83 83
84 read_lock_bh(hinfo->lock); 84 read_lock_bh(hinfo->lock);
@@ -126,7 +126,7 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a,
126{ 126{
127 struct tcf_common *p, *s_p; 127 struct tcf_common *p, *s_p;
128 struct nlattr *nest; 128 struct nlattr *nest;
129 int i= 0, n_i = 0; 129 int i = 0, n_i = 0;
130 130
131 nest = nla_nest_start(skb, a->order); 131 nest = nla_nest_start(skb, a->order);
132 if (nest == NULL) 132 if (nest == NULL)
@@ -138,7 +138,7 @@ static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a,
138 while (p != NULL) { 138 while (p != NULL) {
139 s_p = p->tcfc_next; 139 s_p = p->tcfc_next;
140 if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo)) 140 if (ACT_P_DELETED == tcf_hash_release(p, 0, hinfo))
141 module_put(a->ops->owner); 141 module_put(a->ops->owner);
142 n_i++; 142 n_i++;
143 p = s_p; 143 p = s_p;
144 } 144 }
@@ -447,7 +447,8 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
447 nest = nla_nest_start(skb, TCA_OPTIONS); 447 nest = nla_nest_start(skb, TCA_OPTIONS);
448 if (nest == NULL) 448 if (nest == NULL)
449 goto nla_put_failure; 449 goto nla_put_failure;
450 if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) { 450 err = tcf_action_dump_old(skb, a, bind, ref);
451 if (err > 0) {
451 nla_nest_end(skb, nest); 452 nla_nest_end(skb, nest);
452 return err; 453 return err;
453 } 454 }
@@ -491,7 +492,7 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
491 struct tc_action *a; 492 struct tc_action *a;
492 struct tc_action_ops *a_o; 493 struct tc_action_ops *a_o;
493 char act_name[IFNAMSIZ]; 494 char act_name[IFNAMSIZ];
494 struct nlattr *tb[TCA_ACT_MAX+1]; 495 struct nlattr *tb[TCA_ACT_MAX + 1];
495 struct nlattr *kind; 496 struct nlattr *kind;
496 int err; 497 int err;
497 498
@@ -549,9 +550,9 @@ struct tc_action *tcf_action_init_1(struct nlattr *nla, struct nlattr *est,
549 goto err_free; 550 goto err_free;
550 551
551 /* module count goes up only when brand new policy is created 552 /* module count goes up only when brand new policy is created
552 if it exists and is only bound to in a_o->init() then 553 * if it exists and is only bound to in a_o->init() then
553 ACT_P_CREATED is not returned (a zero is). 554 * ACT_P_CREATED is not returned (a zero is).
554 */ 555 */
555 if (err != ACT_P_CREATED) 556 if (err != ACT_P_CREATED)
556 module_put(a_o->owner); 557 module_put(a_o->owner);
557 a->ops = a_o; 558 a->ops = a_o;
@@ -569,7 +570,7 @@ err_out:
569struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est, 570struct tc_action *tcf_action_init(struct nlattr *nla, struct nlattr *est,
570 char *name, int ovr, int bind) 571 char *name, int ovr, int bind)
571{ 572{
572 struct nlattr *tb[TCA_ACT_MAX_PRIO+1]; 573 struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
573 struct tc_action *head = NULL, *act, *act_prev = NULL; 574 struct tc_action *head = NULL, *act, *act_prev = NULL;
574 int err; 575 int err;
575 int i; 576 int i;
@@ -697,7 +698,7 @@ act_get_notify(struct net *net, u32 pid, struct nlmsghdr *n,
697static struct tc_action * 698static struct tc_action *
698tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid) 699tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 pid)
699{ 700{
700 struct nlattr *tb[TCA_ACT_MAX+1]; 701 struct nlattr *tb[TCA_ACT_MAX + 1];
701 struct tc_action *a; 702 struct tc_action *a;
702 int index; 703 int index;
703 int err; 704 int err;
@@ -770,7 +771,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
770 struct tcamsg *t; 771 struct tcamsg *t;
771 struct netlink_callback dcb; 772 struct netlink_callback dcb;
772 struct nlattr *nest; 773 struct nlattr *nest;
773 struct nlattr *tb[TCA_ACT_MAX+1]; 774 struct nlattr *tb[TCA_ACT_MAX + 1];
774 struct nlattr *kind; 775 struct nlattr *kind;
775 struct tc_action *a = create_a(0); 776 struct tc_action *a = create_a(0);
776 int err = -ENOMEM; 777 int err = -ENOMEM;
@@ -821,7 +822,8 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
821 nlh->nlmsg_flags |= NLM_F_ROOT; 822 nlh->nlmsg_flags |= NLM_F_ROOT;
822 module_put(a->ops->owner); 823 module_put(a->ops->owner);
823 kfree(a); 824 kfree(a);
824 err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); 825 err = rtnetlink_send(skb, net, pid, RTNLGRP_TC,
826 n->nlmsg_flags & NLM_F_ECHO);
825 if (err > 0) 827 if (err > 0)
826 return 0; 828 return 0;
827 829
@@ -842,14 +844,14 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
842 u32 pid, int event) 844 u32 pid, int event)
843{ 845{
844 int i, ret; 846 int i, ret;
845 struct nlattr *tb[TCA_ACT_MAX_PRIO+1]; 847 struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
846 struct tc_action *head = NULL, *act, *act_prev = NULL; 848 struct tc_action *head = NULL, *act, *act_prev = NULL;
847 849
848 ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); 850 ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL);
849 if (ret < 0) 851 if (ret < 0)
850 return ret; 852 return ret;
851 853
852 if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) { 854 if (event == RTM_DELACTION && n->nlmsg_flags & NLM_F_ROOT) {
853 if (tb[1] != NULL) 855 if (tb[1] != NULL)
854 return tca_action_flush(net, tb[1], n, pid); 856 return tca_action_flush(net, tb[1], n, pid);
855 else 857 else
@@ -892,7 +894,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
892 /* now do the delete */ 894 /* now do the delete */
893 tcf_action_destroy(head, 0); 895 tcf_action_destroy(head, 0);
894 ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC, 896 ret = rtnetlink_send(skb, net, pid, RTNLGRP_TC,
895 n->nlmsg_flags&NLM_F_ECHO); 897 n->nlmsg_flags & NLM_F_ECHO);
896 if (ret > 0) 898 if (ret > 0)
897 return 0; 899 return 0;
898 return ret; 900 return ret;
@@ -936,7 +938,7 @@ static int tcf_add_notify(struct net *net, struct tc_action *a,
936 nlh->nlmsg_len = skb_tail_pointer(skb) - b; 938 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
937 NETLINK_CB(skb).dst_group = RTNLGRP_TC; 939 NETLINK_CB(skb).dst_group = RTNLGRP_TC;
938 940
939 err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags&NLM_F_ECHO); 941 err = rtnetlink_send(skb, net, pid, RTNLGRP_TC, flags & NLM_F_ECHO);
940 if (err > 0) 942 if (err > 0)
941 err = 0; 943 err = 0;
942 return err; 944 return err;
@@ -967,7 +969,7 @@ tcf_action_add(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
967 969
968 /* dump then free all the actions after update; inserted policy 970 /* dump then free all the actions after update; inserted policy
969 * stays intact 971 * stays intact
970 * */ 972 */
971 ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); 973 ret = tcf_add_notify(net, act, pid, seq, RTM_NEWACTION, n->nlmsg_flags);
972 for (a = act; a; a = act) { 974 for (a = act; a; a = act) {
973 act = a->next; 975 act = a->next;
@@ -993,8 +995,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
993 return -EINVAL; 995 return -EINVAL;
994 } 996 }
995 997
996 /* n->nlmsg_flags&NLM_F_CREATE 998 /* n->nlmsg_flags & NLM_F_CREATE */
997 * */
998 switch (n->nlmsg_type) { 999 switch (n->nlmsg_type) {
999 case RTM_NEWACTION: 1000 case RTM_NEWACTION:
1000 /* we are going to assume all other flags 1001 /* we are going to assume all other flags
@@ -1003,7 +1004,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1003 * but since we want avoid ambiguity (eg when flags 1004 * but since we want avoid ambiguity (eg when flags
1004 * is zero) then just set this 1005 * is zero) then just set this
1005 */ 1006 */
1006 if (n->nlmsg_flags&NLM_F_REPLACE) 1007 if (n->nlmsg_flags & NLM_F_REPLACE)
1007 ovr = 1; 1008 ovr = 1;
1008replay: 1009replay:
1009 ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr); 1010 ret = tcf_action_add(net, tca[TCA_ACT_TAB], n, pid, ovr);
@@ -1028,7 +1029,7 @@ replay:
1028static struct nlattr * 1029static struct nlattr *
1029find_dump_kind(const struct nlmsghdr *n) 1030find_dump_kind(const struct nlmsghdr *n)
1030{ 1031{
1031 struct nlattr *tb1, *tb2[TCA_ACT_MAX+1]; 1032 struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1];
1032 struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; 1033 struct nlattr *tb[TCA_ACT_MAX_PRIO + 1];
1033 struct nlattr *nla[TCAA_MAX + 1]; 1034 struct nlattr *nla[TCAA_MAX + 1];
1034 struct nlattr *kind; 1035 struct nlattr *kind;
@@ -1071,9 +1072,8 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
1071 } 1072 }
1072 1073
1073 a_o = tc_lookup_action(kind); 1074 a_o = tc_lookup_action(kind);
1074 if (a_o == NULL) { 1075 if (a_o == NULL)
1075 return 0; 1076 return 0;
1076 }
1077 1077
1078 memset(&a, 0, sizeof(struct tc_action)); 1078 memset(&a, 0, sizeof(struct tc_action));
1079 a.ops = a_o; 1079 a.ops = a_o;
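Most of the act_api.c hunks are mechanical style cleanups: assignments move out of if-conditions (err = tcf_action_dump_old(...); if (err > 0) instead of if ((err = ...) > 0)), binary operators such as & gain surrounding spaces, block comments get leading asterisks, and braces around single-statement branches are dropped; behaviour is unchanged. A standalone before/after illustration of the assignment-in-condition pattern, with do_work() as a dummy stand-in:

	#include <stdio.h>

	static int do_work(void)
	{
		return 1;
	}

	/* old style: assignment buried inside the condition */
	static int old_style(void)
	{
		int err;

		if ((err = do_work()) > 0)
			printf("old: err=%d\n", err);
		return err;
	}

	/* new style: assignment first, then a plain test */
	static int new_style(void)
	{
		int err;

		err = do_work();
		if (err > 0)
			printf("new: err=%d\n", err);
		return err;
	}

	int main(void)
	{
		return old_style() == new_style() ? 0 : 1;	/* identical results */
	}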
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 83ddfc07e45d..6cdf9abe475f 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -63,7 +63,7 @@ static int tcf_csum_init(struct nlattr *nla, struct nlattr *est,
63 if (nla == NULL) 63 if (nla == NULL)
64 return -EINVAL; 64 return -EINVAL;
65 65
66 err = nla_parse_nested(tb, TCA_CSUM_MAX, nla,csum_policy); 66 err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy);
67 if (err < 0) 67 if (err < 0)
68 return err; 68 return err;
69 69
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index c2ed90a4c0b4..2b4ab4b05ce8 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -50,7 +50,7 @@ static int gact_determ(struct tcf_gact *gact)
50} 50}
51 51
52typedef int (*g_rand)(struct tcf_gact *gact); 52typedef int (*g_rand)(struct tcf_gact *gact);
53static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ }; 53static g_rand gact_rand[MAX_RAND] = { NULL, gact_net_rand, gact_determ };
54#endif /* CONFIG_GACT_PROB */ 54#endif /* CONFIG_GACT_PROB */
55 55
56static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = { 56static const struct nla_policy gact_policy[TCA_GACT_MAX + 1] = {
@@ -89,7 +89,7 @@ static int tcf_gact_init(struct nlattr *nla, struct nlattr *est,
89 pc = tcf_hash_create(parm->index, est, a, sizeof(*gact), 89 pc = tcf_hash_create(parm->index, est, a, sizeof(*gact),
90 bind, &gact_idx_gen, &gact_hash_info); 90 bind, &gact_idx_gen, &gact_hash_info);
91 if (IS_ERR(pc)) 91 if (IS_ERR(pc))
92 return PTR_ERR(pc); 92 return PTR_ERR(pc);
93 ret = ACT_P_CREATED; 93 ret = ACT_P_CREATED;
94 } else { 94 } else {
95 if (!ovr) { 95 if (!ovr) {
@@ -205,9 +205,9 @@ MODULE_LICENSE("GPL");
205static int __init gact_init_module(void) 205static int __init gact_init_module(void)
206{ 206{
207#ifdef CONFIG_GACT_PROB 207#ifdef CONFIG_GACT_PROB
208 printk(KERN_INFO "GACT probability on\n"); 208 pr_info("GACT probability on\n");
209#else 209#else
210 printk(KERN_INFO "GACT probability NOT on\n"); 210 pr_info("GACT probability NOT on\n");
211#endif 211#endif
212 return tcf_register_action(&act_gact_ops); 212 return tcf_register_action(&act_gact_ops);
213} 213}
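The act_gact.c hunk replaces printk(KERN_INFO ...) with pr_info(...). Both log at the same level; pr_info() is shorter and also picks up a per-file pr_fmt() prefix when one is defined. A kernel-context sketch of the common idiom, assuming the usual KBUILD_MODNAME prefix; whether act_gact.c itself defines pr_fmt is not visible in the hunk.

	/* assumption: the common pr_fmt() prefix idiom, defined before includes */
	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	#include <linux/kernel.h>
	#include <linux/module.h>

	static int __init banner_init(void)
	{
		printk(KERN_INFO "GACT probability on\n");	/* old form */
		pr_info("GACT probability on\n");		/* new form, same level */
		return 0;
	}
	module_init(banner_init);
	MODULE_LICENSE("GPL");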
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index c2a7c20e81c1..9fc211a1b20e 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -138,7 +138,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
138 pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind, 138 pc = tcf_hash_create(index, est, a, sizeof(*ipt), bind,
139 &ipt_idx_gen, &ipt_hash_info); 139 &ipt_idx_gen, &ipt_hash_info);
140 if (IS_ERR(pc)) 140 if (IS_ERR(pc))
141 return PTR_ERR(pc); 141 return PTR_ERR(pc);
142 ret = ACT_P_CREATED; 142 ret = ACT_P_CREATED;
143 } else { 143 } else {
144 if (!ovr) { 144 if (!ovr) {
@@ -162,7 +162,8 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
162 if (unlikely(!t)) 162 if (unlikely(!t))
163 goto err2; 163 goto err2;
164 164
165 if ((err = ipt_init_target(t, tname, hook)) < 0) 165 err = ipt_init_target(t, tname, hook);
166 if (err < 0)
166 goto err3; 167 goto err3;
167 168
168 spin_lock_bh(&ipt->tcf_lock); 169 spin_lock_bh(&ipt->tcf_lock);
@@ -212,8 +213,9 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
212 bstats_update(&ipt->tcf_bstats, skb); 213 bstats_update(&ipt->tcf_bstats, skb);
213 214
214 /* yes, we have to worry about both in and out dev 215 /* yes, we have to worry about both in and out dev
215 worry later - danger - this API seems to have changed 216 * worry later - danger - this API seems to have changed
216 from earlier kernels */ 217 * from earlier kernels
218 */
217 par.in = skb->dev; 219 par.in = skb->dev;
218 par.out = NULL; 220 par.out = NULL;
219 par.hooknum = ipt->tcfi_hook; 221 par.hooknum = ipt->tcfi_hook;
@@ -253,9 +255,9 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
253 struct tc_cnt c; 255 struct tc_cnt c;
254 256
255 /* for simple targets kernel size == user size 257 /* for simple targets kernel size == user size
256 ** user name = target name 258 * user name = target name
257 ** for foolproof you need to not assume this 259 * for foolproof you need to not assume this
258 */ 260 */
259 261
260 t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC); 262 t = kmemdup(ipt->tcfi_t, ipt->tcfi_t->u.user.target_size, GFP_ATOMIC);
261 if (unlikely(!t)) 263 if (unlikely(!t))
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index d765067e99db..961386e2f2c0 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -41,13 +41,13 @@ static struct tcf_hashinfo mirred_hash_info = {
41 .lock = &mirred_lock, 41 .lock = &mirred_lock,
42}; 42};
43 43
44static inline int tcf_mirred_release(struct tcf_mirred *m, int bind) 44static int tcf_mirred_release(struct tcf_mirred *m, int bind)
45{ 45{
46 if (m) { 46 if (m) {
47 if (bind) 47 if (bind)
48 m->tcf_bindcnt--; 48 m->tcf_bindcnt--;
49 m->tcf_refcnt--; 49 m->tcf_refcnt--;
50 if(!m->tcf_bindcnt && m->tcf_refcnt <= 0) { 50 if (!m->tcf_bindcnt && m->tcf_refcnt <= 0) {
51 list_del(&m->tcfm_list); 51 list_del(&m->tcfm_list);
52 if (m->tcfm_dev) 52 if (m->tcfm_dev)
53 dev_put(m->tcfm_dev); 53 dev_put(m->tcfm_dev);
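Here and in several files further down (act_simple.c, act_skbedit.c, cls_basic.c, cls_fw.c, cls_route.c, cls_rsvp.h) the inline keyword is dropped from static functions defined in .c files; for functions with internal linkage the compiler already decides about inlining, so only the declaration changes. Dummy bodies, standalone, purely to show the shape of the change:

	#include <stdio.h>

	static inline int release_old(int bind)	/* before */
	{
		return bind ? 1 : 0;
	}

	static int release_new(int bind)	/* after: same linkage, same code */
	{
		return bind ? 1 : 0;
	}

	int main(void)
	{
		printf("%d %d\n", release_old(1), release_new(1));
		return 0;
	}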
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index 178a4bd7b7cb..762b027650a9 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -69,7 +69,7 @@ static int tcf_nat_init(struct nlattr *nla, struct nlattr *est,
69 pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, 69 pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
70 &nat_idx_gen, &nat_hash_info); 70 &nat_idx_gen, &nat_hash_info);
71 if (IS_ERR(pc)) 71 if (IS_ERR(pc))
72 return PTR_ERR(pc); 72 return PTR_ERR(pc);
73 p = to_tcf_nat(pc); 73 p = to_tcf_nat(pc);
74 ret = ACT_P_CREATED; 74 ret = ACT_P_CREATED;
75 } else { 75 } else {
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index 445bef716f77..50c7c06c019d 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -70,7 +70,7 @@ static int tcf_pedit_init(struct nlattr *nla, struct nlattr *est,
70 pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, 70 pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind,
71 &pedit_idx_gen, &pedit_hash_info); 71 &pedit_idx_gen, &pedit_hash_info);
72 if (IS_ERR(pc)) 72 if (IS_ERR(pc))
73 return PTR_ERR(pc); 73 return PTR_ERR(pc);
74 p = to_pedit(pc); 74 p = to_pedit(pc);
75 keys = kmalloc(ksize, GFP_KERNEL); 75 keys = kmalloc(ksize, GFP_KERNEL);
76 if (keys == NULL) { 76 if (keys == NULL) {
@@ -127,11 +127,9 @@ static int tcf_pedit(struct sk_buff *skb, struct tc_action *a,
127 int i, munged = 0; 127 int i, munged = 0;
128 unsigned int off; 128 unsigned int off;
129 129
130 if (skb_cloned(skb)) { 130 if (skb_cloned(skb) &&
131 if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { 131 pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
132 return p->tcf_action; 132 return p->tcf_action;
133 }
134 }
135 133
136 off = skb_network_offset(skb); 134 off = skb_network_offset(skb);
137 135
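The act_pedit.c hunk folds the nested if (skb_cloned(skb)) { if (pskb_expand_head(...)) return ...; } into a single && condition. Because && short-circuits, pskb_expand_head() still runs only when skb_cloned() returns true, so the early return is taken in exactly the same cases as before. A standalone demonstration with stubs standing in for the two calls:

	#include <stdio.h>

	static int calls;

	static int stub_cloned(int cloned)
	{
		return cloned;
	}

	static int stub_expand(void)
	{
		calls++;
		return 0;	/* 0 == success, as for pskb_expand_head() */
	}

	int main(void)
	{
		calls = 0;
		if (stub_cloned(0) && stub_expand())
			return 1;				/* not reached */
		printf("not cloned: expand called %d times\n", calls);	/* 0 */

		calls = 0;
		if (stub_cloned(1) && stub_expand())
			return 1;		/* expand succeeded, no bail-out */
		printf("cloned:     expand called %d times\n", calls);	/* 1 */
		return 0;
	}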
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index e2f08b1e2e58..8a1630774fd6 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -22,8 +22,8 @@
22#include <net/act_api.h> 22#include <net/act_api.h>
23#include <net/netlink.h> 23#include <net/netlink.h>
24 24
25#define L2T(p,L) qdisc_l2t((p)->tcfp_R_tab, L) 25#define L2T(p, L) qdisc_l2t((p)->tcfp_R_tab, L)
26#define L2T_P(p,L) qdisc_l2t((p)->tcfp_P_tab, L) 26#define L2T_P(p, L) qdisc_l2t((p)->tcfp_P_tab, L)
27 27
28#define POL_TAB_MASK 15 28#define POL_TAB_MASK 15
29static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1]; 29static struct tcf_common *tcf_police_ht[POL_TAB_MASK + 1];
@@ -37,8 +37,7 @@ static struct tcf_hashinfo police_hash_info = {
37}; 37};
38 38
39/* old policer structure from before tc actions */ 39/* old policer structure from before tc actions */
40struct tc_police_compat 40struct tc_police_compat {
41{
42 u32 index; 41 u32 index;
43 int action; 42 int action;
44 u32 limit; 43 u32 limit;
@@ -139,7 +138,7 @@ static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = {
139static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est, 138static int tcf_act_police_locate(struct nlattr *nla, struct nlattr *est,
140 struct tc_action *a, int ovr, int bind) 139 struct tc_action *a, int ovr, int bind)
141{ 140{
142 unsigned h; 141 unsigned int h;
143 int ret = 0, err; 142 int ret = 0, err;
144 struct nlattr *tb[TCA_POLICE_MAX + 1]; 143 struct nlattr *tb[TCA_POLICE_MAX + 1];
145 struct tc_police *parm; 144 struct tc_police *parm;
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index 7287cff7af3e..a34a22de60b3 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -47,7 +47,7 @@ static int tcf_simp(struct sk_buff *skb, struct tc_action *a, struct tcf_result
47 /* print policy string followed by _ then packet count 47 /* print policy string followed by _ then packet count
48 * Example if this was the 3rd packet and the string was "hello" 48 * Example if this was the 3rd packet and the string was "hello"
49 * then it would look like "hello_3" (without quotes) 49 * then it would look like "hello_3" (without quotes)
50 **/ 50 */
51 pr_info("simple: %s_%d\n", 51 pr_info("simple: %s_%d\n",
52 (char *)d->tcfd_defdata, d->tcf_bstats.packets); 52 (char *)d->tcfd_defdata, d->tcf_bstats.packets);
53 spin_unlock(&d->tcf_lock); 53 spin_unlock(&d->tcf_lock);
@@ -125,7 +125,7 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
125 pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, 125 pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
126 &simp_idx_gen, &simp_hash_info); 126 &simp_idx_gen, &simp_hash_info);
127 if (IS_ERR(pc)) 127 if (IS_ERR(pc))
128 return PTR_ERR(pc); 128 return PTR_ERR(pc);
129 129
130 d = to_defact(pc); 130 d = to_defact(pc);
131 ret = alloc_defdata(d, defdata); 131 ret = alloc_defdata(d, defdata);
@@ -149,7 +149,7 @@ static int tcf_simp_init(struct nlattr *nla, struct nlattr *est,
149 return ret; 149 return ret;
150} 150}
151 151
152static inline int tcf_simp_cleanup(struct tc_action *a, int bind) 152static int tcf_simp_cleanup(struct tc_action *a, int bind)
153{ 153{
154 struct tcf_defact *d = a->priv; 154 struct tcf_defact *d = a->priv;
155 155
@@ -158,8 +158,8 @@ static inline int tcf_simp_cleanup(struct tc_action *a, int bind)
158 return 0; 158 return 0;
159} 159}
160 160
161static inline int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a, 161static int tcf_simp_dump(struct sk_buff *skb, struct tc_action *a,
162 int bind, int ref) 162 int bind, int ref)
163{ 163{
164 unsigned char *b = skb_tail_pointer(skb); 164 unsigned char *b = skb_tail_pointer(skb);
165 struct tcf_defact *d = a->priv; 165 struct tcf_defact *d = a->priv;
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 836f5fee9e58..5f6f0c7c3905 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -113,7 +113,7 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
113 pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind, 113 pc = tcf_hash_create(parm->index, est, a, sizeof(*d), bind,
114 &skbedit_idx_gen, &skbedit_hash_info); 114 &skbedit_idx_gen, &skbedit_hash_info);
115 if (IS_ERR(pc)) 115 if (IS_ERR(pc))
116 return PTR_ERR(pc); 116 return PTR_ERR(pc);
117 117
118 d = to_skbedit(pc); 118 d = to_skbedit(pc);
119 ret = ACT_P_CREATED; 119 ret = ACT_P_CREATED;
@@ -144,7 +144,7 @@ static int tcf_skbedit_init(struct nlattr *nla, struct nlattr *est,
144 return ret; 144 return ret;
145} 145}
146 146
147static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind) 147static int tcf_skbedit_cleanup(struct tc_action *a, int bind)
148{ 148{
149 struct tcf_skbedit *d = a->priv; 149 struct tcf_skbedit *d = a->priv;
150 150
@@ -153,8 +153,8 @@ static inline int tcf_skbedit_cleanup(struct tc_action *a, int bind)
153 return 0; 153 return 0;
154} 154}
155 155
156static inline int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, 156static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a,
157 int bind, int ref) 157 int bind, int ref)
158{ 158{
159 unsigned char *b = skb_tail_pointer(skb); 159 unsigned char *b = skb_tail_pointer(skb);
160 struct tcf_skbedit *d = a->priv; 160 struct tcf_skbedit *d = a->priv;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 5fd0c28ef79a..bb2c523f8158 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -85,7 +85,7 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
85 int rc = -ENOENT; 85 int rc = -ENOENT;
86 86
87 write_lock(&cls_mod_lock); 87 write_lock(&cls_mod_lock);
88 for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) 88 for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next)
89 if (t == ops) 89 if (t == ops)
90 break; 90 break;
91 91
@@ -111,7 +111,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp)
111 u32 first = TC_H_MAKE(0xC0000000U, 0U); 111 u32 first = TC_H_MAKE(0xC0000000U, 0U);
112 112
113 if (tp) 113 if (tp)
114 first = tp->prio-1; 114 first = tp->prio - 1;
115 115
116 return first; 116 return first;
117} 117}
@@ -149,7 +149,8 @@ replay:
149 149
150 if (prio == 0) { 150 if (prio == 0) {
151 /* If no priority is given, user wants we allocated it. */ 151 /* If no priority is given, user wants we allocated it. */
152 if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) 152 if (n->nlmsg_type != RTM_NEWTFILTER ||
153 !(n->nlmsg_flags & NLM_F_CREATE))
153 return -ENOENT; 154 return -ENOENT;
154 prio = TC_H_MAKE(0x80000000U, 0U); 155 prio = TC_H_MAKE(0x80000000U, 0U);
155 } 156 }
@@ -176,7 +177,8 @@ replay:
176 } 177 }
177 178
178 /* Is it classful? */ 179 /* Is it classful? */
179 if ((cops = q->ops->cl_ops) == NULL) 180 cops = q->ops->cl_ops;
181 if (!cops)
180 return -EINVAL; 182 return -EINVAL;
181 183
182 if (cops->tcf_chain == NULL) 184 if (cops->tcf_chain == NULL)
@@ -196,10 +198,11 @@ replay:
196 goto errout; 198 goto errout;
197 199
198 /* Check the chain for existence of proto-tcf with this priority */ 200 /* Check the chain for existence of proto-tcf with this priority */
199 for (back = chain; (tp=*back) != NULL; back = &tp->next) { 201 for (back = chain; (tp = *back) != NULL; back = &tp->next) {
200 if (tp->prio >= prio) { 202 if (tp->prio >= prio) {
201 if (tp->prio == prio) { 203 if (tp->prio == prio) {
202 if (!nprio || (tp->protocol != protocol && protocol)) 204 if (!nprio ||
205 (tp->protocol != protocol && protocol))
203 goto errout; 206 goto errout;
204 } else 207 } else
205 tp = NULL; 208 tp = NULL;
@@ -216,7 +219,8 @@ replay:
216 goto errout; 219 goto errout;
217 220
218 err = -ENOENT; 221 err = -ENOENT;
219 if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) 222 if (n->nlmsg_type != RTM_NEWTFILTER ||
223 !(n->nlmsg_flags & NLM_F_CREATE))
220 goto errout; 224 goto errout;
221 225
222 226
@@ -420,7 +424,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
420 424
421 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) 425 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
422 return skb->len; 426 return skb->len;
423 if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) 427 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
428 if (!dev)
424 return skb->len; 429 return skb->len;
425 430
426 if (!tcm->tcm_parent) 431 if (!tcm->tcm_parent)
@@ -429,7 +434,8 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
429 q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); 434 q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
430 if (!q) 435 if (!q)
431 goto out; 436 goto out;
432 if ((cops = q->ops->cl_ops) == NULL) 437 cops = q->ops->cl_ops;
438 if (!cops)
433 goto errout; 439 goto errout;
434 if (cops->tcf_chain == NULL) 440 if (cops->tcf_chain == NULL)
435 goto errout; 441 goto errout;
@@ -444,8 +450,9 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
444 450
445 s_t = cb->args[0]; 451 s_t = cb->args[0];
446 452
447 for (tp=*chain, t=0; tp; tp = tp->next, t++) { 453 for (tp = *chain, t = 0; tp; tp = tp->next, t++) {
448 if (t < s_t) continue; 454 if (t < s_t)
455 continue;
449 if (TC_H_MAJ(tcm->tcm_info) && 456 if (TC_H_MAJ(tcm->tcm_info) &&
450 TC_H_MAJ(tcm->tcm_info) != tp->prio) 457 TC_H_MAJ(tcm->tcm_info) != tp->prio)
451 continue; 458 continue;
@@ -468,10 +475,10 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
468 arg.skb = skb; 475 arg.skb = skb;
469 arg.cb = cb; 476 arg.cb = cb;
470 arg.w.stop = 0; 477 arg.w.stop = 0;
471 arg.w.skip = cb->args[1]-1; 478 arg.w.skip = cb->args[1] - 1;
472 arg.w.count = 0; 479 arg.w.count = 0;
473 tp->ops->walk(tp, &arg.w); 480 tp->ops->walk(tp, &arg.w);
474 cb->args[1] = arg.w.count+1; 481 cb->args[1] = arg.w.count + 1;
475 if (arg.w.stop) 482 if (arg.w.stop)
476 break; 483 break;
477 } 484 }
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index f23d9155b1ef..8be8872dd571 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -21,14 +21,12 @@
21#include <net/act_api.h> 21#include <net/act_api.h>
22#include <net/pkt_cls.h> 22#include <net/pkt_cls.h>
23 23
24struct basic_head 24struct basic_head {
25{
26 u32 hgenerator; 25 u32 hgenerator;
27 struct list_head flist; 26 struct list_head flist;
28}; 27};
29 28
30struct basic_filter 29struct basic_filter {
31{
32 u32 handle; 30 u32 handle;
33 struct tcf_exts exts; 31 struct tcf_exts exts;
34 struct tcf_ematch_tree ematches; 32 struct tcf_ematch_tree ematches;
@@ -92,8 +90,7 @@ static int basic_init(struct tcf_proto *tp)
92 return 0; 90 return 0;
93} 91}
94 92
95static inline void basic_delete_filter(struct tcf_proto *tp, 93static void basic_delete_filter(struct tcf_proto *tp, struct basic_filter *f)
96 struct basic_filter *f)
97{ 94{
98 tcf_unbind_filter(tp, &f->res); 95 tcf_unbind_filter(tp, &f->res);
99 tcf_exts_destroy(tp, &f->exts); 96 tcf_exts_destroy(tp, &f->exts);
@@ -135,9 +132,9 @@ static const struct nla_policy basic_policy[TCA_BASIC_MAX + 1] = {
135 [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED }, 132 [TCA_BASIC_EMATCHES] = { .type = NLA_NESTED },
136}; 133};
137 134
138static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, 135static int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f,
139 unsigned long base, struct nlattr **tb, 136 unsigned long base, struct nlattr **tb,
140 struct nlattr *est) 137 struct nlattr *est)
141{ 138{
142 int err = -EINVAL; 139 int err = -EINVAL;
143 struct tcf_exts e; 140 struct tcf_exts e;
@@ -203,7 +200,7 @@ static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle,
203 } while (--i > 0 && basic_get(tp, head->hgenerator)); 200 } while (--i > 0 && basic_get(tp, head->hgenerator));
204 201
205 if (i <= 0) { 202 if (i <= 0) {
206 printk(KERN_ERR "Insufficient number of handles\n"); 203 pr_err("Insufficient number of handles\n");
207 goto errout; 204 goto errout;
208 } 205 }
209 206
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index d49c40fb7e09..32a335194ca5 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -56,7 +56,8 @@ static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
56{ 56{
57 struct cgroup_cls_state *cs; 57 struct cgroup_cls_state *cs;
58 58
59 if (!(cs = kzalloc(sizeof(*cs), GFP_KERNEL))) 59 cs = kzalloc(sizeof(*cs), GFP_KERNEL);
60 if (!cs)
60 return ERR_PTR(-ENOMEM); 61 return ERR_PTR(-ENOMEM);
61 62
62 if (cgrp->parent) 63 if (cgrp->parent)
@@ -94,8 +95,7 @@ static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
94 return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files)); 95 return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
95} 96}
96 97
97struct cls_cgroup_head 98struct cls_cgroup_head {
98{
99 u32 handle; 99 u32 handle;
100 struct tcf_exts exts; 100 struct tcf_exts exts;
101 struct tcf_ematch_tree ematches; 101 struct tcf_ematch_tree ematches;
@@ -166,7 +166,7 @@ static int cls_cgroup_change(struct tcf_proto *tp, unsigned long base,
166 u32 handle, struct nlattr **tca, 166 u32 handle, struct nlattr **tca,
167 unsigned long *arg) 167 unsigned long *arg)
168{ 168{
169 struct nlattr *tb[TCA_CGROUP_MAX+1]; 169 struct nlattr *tb[TCA_CGROUP_MAX + 1];
170 struct cls_cgroup_head *head = tp->root; 170 struct cls_cgroup_head *head = tp->root;
171 struct tcf_ematch_tree t; 171 struct tcf_ematch_tree t;
172 struct tcf_exts e; 172 struct tcf_exts e;
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 5b271a18bc3a..8ec01391d988 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -121,7 +121,7 @@ static u32 flow_get_proto_src(struct sk_buff *skb)
121 if (!pskb_network_may_pull(skb, sizeof(*iph))) 121 if (!pskb_network_may_pull(skb, sizeof(*iph)))
122 break; 122 break;
123 iph = ip_hdr(skb); 123 iph = ip_hdr(skb);
124 if (iph->frag_off & htons(IP_MF|IP_OFFSET)) 124 if (iph->frag_off & htons(IP_MF | IP_OFFSET))
125 break; 125 break;
126 poff = proto_ports_offset(iph->protocol); 126 poff = proto_ports_offset(iph->protocol);
127 if (poff >= 0 && 127 if (poff >= 0 &&
@@ -163,7 +163,7 @@ static u32 flow_get_proto_dst(struct sk_buff *skb)
163 if (!pskb_network_may_pull(skb, sizeof(*iph))) 163 if (!pskb_network_may_pull(skb, sizeof(*iph)))
164 break; 164 break;
165 iph = ip_hdr(skb); 165 iph = ip_hdr(skb);
166 if (iph->frag_off & htons(IP_MF|IP_OFFSET)) 166 if (iph->frag_off & htons(IP_MF | IP_OFFSET))
167 break; 167 break;
168 poff = proto_ports_offset(iph->protocol); 168 poff = proto_ports_offset(iph->protocol);
169 if (poff >= 0 && 169 if (poff >= 0 &&
@@ -276,7 +276,7 @@ fallback:
276 276
277static u32 flow_get_rtclassid(const struct sk_buff *skb) 277static u32 flow_get_rtclassid(const struct sk_buff *skb)
278{ 278{
279#ifdef CONFIG_NET_CLS_ROUTE 279#ifdef CONFIG_IP_ROUTE_CLASSID
280 if (skb_dst(skb)) 280 if (skb_dst(skb))
281 return skb_dst(skb)->tclassid; 281 return skb_dst(skb)->tclassid;
282#endif 282#endif
diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c
index 93b0a7b6f9b4..26e7bc4ffb79 100644
--- a/net/sched/cls_fw.c
+++ b/net/sched/cls_fw.c
@@ -31,14 +31,12 @@
31 31
32#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *)) 32#define HTSIZE (PAGE_SIZE/sizeof(struct fw_filter *))
33 33
34struct fw_head 34struct fw_head {
35{
36 struct fw_filter *ht[HTSIZE]; 35 struct fw_filter *ht[HTSIZE];
37 u32 mask; 36 u32 mask;
38}; 37};
39 38
40struct fw_filter 39struct fw_filter {
41{
42 struct fw_filter *next; 40 struct fw_filter *next;
43 u32 id; 41 u32 id;
44 struct tcf_result res; 42 struct tcf_result res;
@@ -53,7 +51,7 @@ static const struct tcf_ext_map fw_ext_map = {
53 .police = TCA_FW_POLICE 51 .police = TCA_FW_POLICE
54}; 52};
55 53
56static __inline__ int fw_hash(u32 handle) 54static inline int fw_hash(u32 handle)
57{ 55{
58 if (HTSIZE == 4096) 56 if (HTSIZE == 4096)
59 return ((handle >> 24) & 0xFFF) ^ 57 return ((handle >> 24) & 0xFFF) ^
@@ -82,14 +80,14 @@ static __inline__ int fw_hash(u32 handle)
82static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, 80static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
83 struct tcf_result *res) 81 struct tcf_result *res)
84{ 82{
85 struct fw_head *head = (struct fw_head*)tp->root; 83 struct fw_head *head = (struct fw_head *)tp->root;
86 struct fw_filter *f; 84 struct fw_filter *f;
87 int r; 85 int r;
88 u32 id = skb->mark; 86 u32 id = skb->mark;
89 87
90 if (head != NULL) { 88 if (head != NULL) {
91 id &= head->mask; 89 id &= head->mask;
92 for (f=head->ht[fw_hash(id)]; f; f=f->next) { 90 for (f = head->ht[fw_hash(id)]; f; f = f->next) {
93 if (f->id == id) { 91 if (f->id == id) {
94 *res = f->res; 92 *res = f->res;
95#ifdef CONFIG_NET_CLS_IND 93#ifdef CONFIG_NET_CLS_IND
@@ -105,7 +103,8 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
105 } 103 }
106 } else { 104 } else {
107 /* old method */ 105 /* old method */
108 if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) { 106 if (id && (TC_H_MAJ(id) == 0 ||
107 !(TC_H_MAJ(id ^ tp->q->handle)))) {
109 res->classid = id; 108 res->classid = id;
110 res->class = 0; 109 res->class = 0;
111 return 0; 110 return 0;
@@ -117,13 +116,13 @@ static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp,
117 116
118static unsigned long fw_get(struct tcf_proto *tp, u32 handle) 117static unsigned long fw_get(struct tcf_proto *tp, u32 handle)
119{ 118{
120 struct fw_head *head = (struct fw_head*)tp->root; 119 struct fw_head *head = (struct fw_head *)tp->root;
121 struct fw_filter *f; 120 struct fw_filter *f;
122 121
123 if (head == NULL) 122 if (head == NULL)
124 return 0; 123 return 0;
125 124
126 for (f=head->ht[fw_hash(handle)]; f; f=f->next) { 125 for (f = head->ht[fw_hash(handle)]; f; f = f->next) {
127 if (f->id == handle) 126 if (f->id == handle)
128 return (unsigned long)f; 127 return (unsigned long)f;
129 } 128 }
@@ -139,8 +138,7 @@ static int fw_init(struct tcf_proto *tp)
139 return 0; 138 return 0;
140} 139}
141 140
142static inline void 141static void fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
143fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f)
144{ 142{
145 tcf_unbind_filter(tp, &f->res); 143 tcf_unbind_filter(tp, &f->res);
146 tcf_exts_destroy(tp, &f->exts); 144 tcf_exts_destroy(tp, &f->exts);
@@ -156,8 +154,8 @@ static void fw_destroy(struct tcf_proto *tp)
156 if (head == NULL) 154 if (head == NULL)
157 return; 155 return;
158 156
159 for (h=0; h<HTSIZE; h++) { 157 for (h = 0; h < HTSIZE; h++) {
160 while ((f=head->ht[h]) != NULL) { 158 while ((f = head->ht[h]) != NULL) {
161 head->ht[h] = f->next; 159 head->ht[h] = f->next;
162 fw_delete_filter(tp, f); 160 fw_delete_filter(tp, f);
163 } 161 }
@@ -167,14 +165,14 @@ static void fw_destroy(struct tcf_proto *tp)
167 165
168static int fw_delete(struct tcf_proto *tp, unsigned long arg) 166static int fw_delete(struct tcf_proto *tp, unsigned long arg)
169{ 167{
170 struct fw_head *head = (struct fw_head*)tp->root; 168 struct fw_head *head = (struct fw_head *)tp->root;
171 struct fw_filter *f = (struct fw_filter*)arg; 169 struct fw_filter *f = (struct fw_filter *)arg;
172 struct fw_filter **fp; 170 struct fw_filter **fp;
173 171
174 if (head == NULL || f == NULL) 172 if (head == NULL || f == NULL)
175 goto out; 173 goto out;
176 174
177 for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { 175 for (fp = &head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) {
178 if (*fp == f) { 176 if (*fp == f) {
179 tcf_tree_lock(tp); 177 tcf_tree_lock(tp);
180 *fp = f->next; 178 *fp = f->next;
@@ -240,7 +238,7 @@ static int fw_change(struct tcf_proto *tp, unsigned long base,
240 struct nlattr **tca, 238 struct nlattr **tca,
241 unsigned long *arg) 239 unsigned long *arg)
242{ 240{
243 struct fw_head *head = (struct fw_head*)tp->root; 241 struct fw_head *head = (struct fw_head *)tp->root;
244 struct fw_filter *f = (struct fw_filter *) *arg; 242 struct fw_filter *f = (struct fw_filter *) *arg;
245 struct nlattr *opt = tca[TCA_OPTIONS]; 243 struct nlattr *opt = tca[TCA_OPTIONS];
246 struct nlattr *tb[TCA_FW_MAX + 1]; 244 struct nlattr *tb[TCA_FW_MAX + 1];
@@ -302,7 +300,7 @@ errout:
302 300
303static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) 301static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg)
304{ 302{
305 struct fw_head *head = (struct fw_head*)tp->root; 303 struct fw_head *head = (struct fw_head *)tp->root;
306 int h; 304 int h;
307 305
308 if (head == NULL) 306 if (head == NULL)
@@ -332,7 +330,7 @@ static int fw_dump(struct tcf_proto *tp, unsigned long fh,
332 struct sk_buff *skb, struct tcmsg *t) 330 struct sk_buff *skb, struct tcmsg *t)
333{ 331{
334 struct fw_head *head = (struct fw_head *)tp->root; 332 struct fw_head *head = (struct fw_head *)tp->root;
335 struct fw_filter *f = (struct fw_filter*)fh; 333 struct fw_filter *f = (struct fw_filter *)fh;
336 unsigned char *b = skb_tail_pointer(skb); 334 unsigned char *b = skb_tail_pointer(skb);
337 struct nlattr *nest; 335 struct nlattr *nest;
338 336
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c
index 694dcd85dec8..d580cdfca093 100644
--- a/net/sched/cls_route.c
+++ b/net/sched/cls_route.c
@@ -23,34 +23,30 @@
23#include <net/pkt_cls.h> 23#include <net/pkt_cls.h>
24 24
25/* 25/*
26 1. For now we assume that route tags < 256. 26 * 1. For now we assume that route tags < 256.
27 It allows to use direct table lookups, instead of hash tables. 27 * It allows to use direct table lookups, instead of hash tables.
28 2. For now we assume that "from TAG" and "fromdev DEV" statements 28 * 2. For now we assume that "from TAG" and "fromdev DEV" statements
29 are mutually exclusive. 29 * are mutually exclusive.
30 3. "to TAG from ANY" has higher priority, than "to ANY from XXX" 30 * 3. "to TAG from ANY" has higher priority, than "to ANY from XXX"
31 */ 31 */
32 32
33struct route4_fastmap 33struct route4_fastmap {
34{
35 struct route4_filter *filter; 34 struct route4_filter *filter;
36 u32 id; 35 u32 id;
37 int iif; 36 int iif;
38}; 37};
39 38
40struct route4_head 39struct route4_head {
41{
42 struct route4_fastmap fastmap[16]; 40 struct route4_fastmap fastmap[16];
43 struct route4_bucket *table[256+1]; 41 struct route4_bucket *table[256 + 1];
44}; 42};
45 43
46struct route4_bucket 44struct route4_bucket {
47{
48 /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ 45 /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */
49 struct route4_filter *ht[16+16+1]; 46 struct route4_filter *ht[16 + 16 + 1];
50}; 47};
51 48
52struct route4_filter 49struct route4_filter {
53{
54 struct route4_filter *next; 50 struct route4_filter *next;
55 u32 id; 51 u32 id;
56 int iif; 52 int iif;
@@ -61,20 +57,20 @@ struct route4_filter
61 struct route4_bucket *bkt; 57 struct route4_bucket *bkt;
62}; 58};
63 59
64#define ROUTE4_FAILURE ((struct route4_filter*)(-1L)) 60#define ROUTE4_FAILURE ((struct route4_filter *)(-1L))
65 61
66static const struct tcf_ext_map route_ext_map = { 62static const struct tcf_ext_map route_ext_map = {
67 .police = TCA_ROUTE4_POLICE, 63 .police = TCA_ROUTE4_POLICE,
68 .action = TCA_ROUTE4_ACT 64 .action = TCA_ROUTE4_ACT
69}; 65};
70 66
71static __inline__ int route4_fastmap_hash(u32 id, int iif) 67static inline int route4_fastmap_hash(u32 id, int iif)
72{ 68{
73 return id&0xF; 69 return id & 0xF;
74} 70}
75 71
76static inline 72static void
77void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id) 73route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
78{ 74{
79 spinlock_t *root_lock = qdisc_root_sleeping_lock(q); 75 spinlock_t *root_lock = qdisc_root_sleeping_lock(q);
80 76
@@ -83,32 +79,33 @@ void route4_reset_fastmap(struct Qdisc *q, struct route4_head *head, u32 id)
83 spin_unlock_bh(root_lock); 79 spin_unlock_bh(root_lock);
84} 80}
85 81
86static inline void 82static void
87route4_set_fastmap(struct route4_head *head, u32 id, int iif, 83route4_set_fastmap(struct route4_head *head, u32 id, int iif,
88 struct route4_filter *f) 84 struct route4_filter *f)
89{ 85{
90 int h = route4_fastmap_hash(id, iif); 86 int h = route4_fastmap_hash(id, iif);
87
91 head->fastmap[h].id = id; 88 head->fastmap[h].id = id;
92 head->fastmap[h].iif = iif; 89 head->fastmap[h].iif = iif;
93 head->fastmap[h].filter = f; 90 head->fastmap[h].filter = f;
94} 91}
95 92
96static __inline__ int route4_hash_to(u32 id) 93static inline int route4_hash_to(u32 id)
97{ 94{
98 return id&0xFF; 95 return id & 0xFF;
99} 96}
100 97
101static __inline__ int route4_hash_from(u32 id) 98static inline int route4_hash_from(u32 id)
102{ 99{
103 return (id>>16)&0xF; 100 return (id >> 16) & 0xF;
104} 101}
105 102
106static __inline__ int route4_hash_iif(int iif) 103static inline int route4_hash_iif(int iif)
107{ 104{
108 return 16 + ((iif>>16)&0xF); 105 return 16 + ((iif >> 16) & 0xF);
109} 106}
110 107
111static __inline__ int route4_hash_wild(void) 108static inline int route4_hash_wild(void)
112{ 109{
113 return 32; 110 return 32;
114} 111}
@@ -131,21 +128,22 @@ static __inline__ int route4_hash_wild(void)
131static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, 128static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
132 struct tcf_result *res) 129 struct tcf_result *res)
133{ 130{
134 struct route4_head *head = (struct route4_head*)tp->root; 131 struct route4_head *head = (struct route4_head *)tp->root;
135 struct dst_entry *dst; 132 struct dst_entry *dst;
136 struct route4_bucket *b; 133 struct route4_bucket *b;
137 struct route4_filter *f; 134 struct route4_filter *f;
138 u32 id, h; 135 u32 id, h;
139 int iif, dont_cache = 0; 136 int iif, dont_cache = 0;
140 137
141 if ((dst = skb_dst(skb)) == NULL) 138 dst = skb_dst(skb);
139 if (!dst)
142 goto failure; 140 goto failure;
143 141
144 id = dst->tclassid; 142 id = dst->tclassid;
145 if (head == NULL) 143 if (head == NULL)
146 goto old_method; 144 goto old_method;
147 145
148 iif = ((struct rtable*)dst)->fl.iif; 146 iif = ((struct rtable *)dst)->fl.iif;
149 147
150 h = route4_fastmap_hash(id, iif); 148 h = route4_fastmap_hash(id, iif);
151 if (id == head->fastmap[h].id && 149 if (id == head->fastmap[h].id &&
@@ -161,7 +159,8 @@ static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp,
161 h = route4_hash_to(id); 159 h = route4_hash_to(id);
162 160
163restart: 161restart:
164 if ((b = head->table[h]) != NULL) { 162 b = head->table[h];
163 if (b) {
165 for (f = b->ht[route4_hash_from(id)]; f; f = f->next) 164 for (f = b->ht[route4_hash_from(id)]; f; f = f->next)
166 if (f->id == id) 165 if (f->id == id)
167 ROUTE4_APPLY_RESULT(); 166 ROUTE4_APPLY_RESULT();
@@ -197,8 +196,9 @@ old_method:
197 196
198static inline u32 to_hash(u32 id) 197static inline u32 to_hash(u32 id)
199{ 198{
200 u32 h = id&0xFF; 199 u32 h = id & 0xFF;
201 if (id&0x8000) 200
201 if (id & 0x8000)
202 h += 256; 202 h += 256;
203 return h; 203 return h;
204} 204}
@@ -211,17 +211,17 @@ static inline u32 from_hash(u32 id)
211 if (!(id & 0x8000)) { 211 if (!(id & 0x8000)) {
212 if (id > 255) 212 if (id > 255)
213 return 256; 213 return 256;
214 return id&0xF; 214 return id & 0xF;
215 } 215 }
216 return 16 + (id&0xF); 216 return 16 + (id & 0xF);
217} 217}
218 218
219static unsigned long route4_get(struct tcf_proto *tp, u32 handle) 219static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
220{ 220{
221 struct route4_head *head = (struct route4_head*)tp->root; 221 struct route4_head *head = (struct route4_head *)tp->root;
222 struct route4_bucket *b; 222 struct route4_bucket *b;
223 struct route4_filter *f; 223 struct route4_filter *f;
224 unsigned h1, h2; 224 unsigned int h1, h2;
225 225
226 if (!head) 226 if (!head)
227 return 0; 227 return 0;
@@ -230,11 +230,12 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle)
230 if (h1 > 256) 230 if (h1 > 256)
231 return 0; 231 return 0;
232 232
233 h2 = from_hash(handle>>16); 233 h2 = from_hash(handle >> 16);
234 if (h2 > 32) 234 if (h2 > 32)
235 return 0; 235 return 0;
236 236
237 if ((b = head->table[h1]) != NULL) { 237 b = head->table[h1];
238 if (b) {
238 for (f = b->ht[h2]; f; f = f->next) 239 for (f = b->ht[h2]; f; f = f->next)
239 if (f->handle == handle) 240 if (f->handle == handle)
240 return (unsigned long)f; 241 return (unsigned long)f;
@@ -251,7 +252,7 @@ static int route4_init(struct tcf_proto *tp)
251 return 0; 252 return 0;
252} 253}
253 254
254static inline void 255static void
255route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) 256route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f)
256{ 257{
257 tcf_unbind_filter(tp, &f->res); 258 tcf_unbind_filter(tp, &f->res);
@@ -267,11 +268,12 @@ static void route4_destroy(struct tcf_proto *tp)
267 if (head == NULL) 268 if (head == NULL)
268 return; 269 return;
269 270
270 for (h1=0; h1<=256; h1++) { 271 for (h1 = 0; h1 <= 256; h1++) {
271 struct route4_bucket *b; 272 struct route4_bucket *b;
272 273
273 if ((b = head->table[h1]) != NULL) { 274 b = head->table[h1];
274 for (h2=0; h2<=32; h2++) { 275 if (b) {
276 for (h2 = 0; h2 <= 32; h2++) {
275 struct route4_filter *f; 277 struct route4_filter *f;
276 278
277 while ((f = b->ht[h2]) != NULL) { 279 while ((f = b->ht[h2]) != NULL) {
@@ -287,9 +289,9 @@ static void route4_destroy(struct tcf_proto *tp)
287 289
288static int route4_delete(struct tcf_proto *tp, unsigned long arg) 290static int route4_delete(struct tcf_proto *tp, unsigned long arg)
289{ 291{
290 struct route4_head *head = (struct route4_head*)tp->root; 292 struct route4_head *head = (struct route4_head *)tp->root;
291 struct route4_filter **fp, *f = (struct route4_filter*)arg; 293 struct route4_filter **fp, *f = (struct route4_filter *)arg;
292 unsigned h = 0; 294 unsigned int h = 0;
293 struct route4_bucket *b; 295 struct route4_bucket *b;
294 int i; 296 int i;
295 297
@@ -299,7 +301,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
299 h = f->handle; 301 h = f->handle;
300 b = f->bkt; 302 b = f->bkt;
301 303
302 for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) { 304 for (fp = &b->ht[from_hash(h >> 16)]; *fp; fp = &(*fp)->next) {
303 if (*fp == f) { 305 if (*fp == f) {
304 tcf_tree_lock(tp); 306 tcf_tree_lock(tp);
305 *fp = f->next; 307 *fp = f->next;
@@ -310,7 +312,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg)
310 312
311 /* Strip tree */ 313 /* Strip tree */
312 314
313 for (i=0; i<=32; i++) 315 for (i = 0; i <= 32; i++)
314 if (b->ht[i]) 316 if (b->ht[i])
315 return 0; 317 return 0;
316 318
@@ -380,7 +382,8 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
380 } 382 }
381 383
382 h1 = to_hash(nhandle); 384 h1 = to_hash(nhandle);
383 if ((b = head->table[h1]) == NULL) { 385 b = head->table[h1];
386 if (!b) {
384 err = -ENOBUFS; 387 err = -ENOBUFS;
385 b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL); 388 b = kzalloc(sizeof(struct route4_bucket), GFP_KERNEL);
386 if (b == NULL) 389 if (b == NULL)
@@ -391,6 +394,7 @@ static int route4_set_parms(struct tcf_proto *tp, unsigned long base,
391 tcf_tree_unlock(tp); 394 tcf_tree_unlock(tp);
392 } else { 395 } else {
393 unsigned int h2 = from_hash(nhandle >> 16); 396 unsigned int h2 = from_hash(nhandle >> 16);
397
394 err = -EEXIST; 398 err = -EEXIST;
395 for (fp = b->ht[h2]; fp; fp = fp->next) 399 for (fp = b->ht[h2]; fp; fp = fp->next)
396 if (fp->handle == f->handle) 400 if (fp->handle == f->handle)
@@ -444,7 +448,8 @@ static int route4_change(struct tcf_proto *tp, unsigned long base,
444 if (err < 0) 448 if (err < 0)
445 return err; 449 return err;
446 450
447 if ((f = (struct route4_filter*)*arg) != NULL) { 451 f = (struct route4_filter *)*arg;
452 if (f) {
448 if (f->handle != handle && handle) 453 if (f->handle != handle && handle)
449 return -EINVAL; 454 return -EINVAL;
450 455
@@ -481,7 +486,7 @@ static int route4_change(struct tcf_proto *tp, unsigned long base,
481 486
482reinsert: 487reinsert:
483 h = from_hash(f->handle >> 16); 488 h = from_hash(f->handle >> 16);
484 for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next) 489 for (fp = &f->bkt->ht[h]; (f1 = *fp) != NULL; fp = &f1->next)
485 if (f->handle < f1->handle) 490 if (f->handle < f1->handle)
486 break; 491 break;
487 492
@@ -492,7 +497,8 @@ reinsert:
492 if (old_handle && f->handle != old_handle) { 497 if (old_handle && f->handle != old_handle) {
493 th = to_hash(old_handle); 498 th = to_hash(old_handle);
494 h = from_hash(old_handle >> 16); 499 h = from_hash(old_handle >> 16);
495 if ((b = head->table[th]) != NULL) { 500 b = head->table[th];
501 if (b) {
496 for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) { 502 for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) {
497 if (*fp == f) { 503 if (*fp == f) {
498 *fp = f->next; 504 *fp = f->next;
@@ -515,7 +521,7 @@ errout:
515static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) 521static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
516{ 522{
517 struct route4_head *head = tp->root; 523 struct route4_head *head = tp->root;
518 unsigned h, h1; 524 unsigned int h, h1;
519 525
520 if (head == NULL) 526 if (head == NULL)
521 arg->stop = 1; 527 arg->stop = 1;
@@ -549,7 +555,7 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg)
549static int route4_dump(struct tcf_proto *tp, unsigned long fh, 555static int route4_dump(struct tcf_proto *tp, unsigned long fh,
550 struct sk_buff *skb, struct tcmsg *t) 556 struct sk_buff *skb, struct tcmsg *t)
551{ 557{
552 struct route4_filter *f = (struct route4_filter*)fh; 558 struct route4_filter *f = (struct route4_filter *)fh;
553 unsigned char *b = skb_tail_pointer(skb); 559 unsigned char *b = skb_tail_pointer(skb);
554 struct nlattr *nest; 560 struct nlattr *nest;
555 u32 id; 561 u32 id;
@@ -563,15 +569,15 @@ static int route4_dump(struct tcf_proto *tp, unsigned long fh,
563 if (nest == NULL) 569 if (nest == NULL)
564 goto nla_put_failure; 570 goto nla_put_failure;
565 571
566 if (!(f->handle&0x8000)) { 572 if (!(f->handle & 0x8000)) {
567 id = f->id&0xFF; 573 id = f->id & 0xFF;
568 NLA_PUT_U32(skb, TCA_ROUTE4_TO, id); 574 NLA_PUT_U32(skb, TCA_ROUTE4_TO, id);
569 } 575 }
570 if (f->handle&0x80000000) { 576 if (f->handle & 0x80000000) {
571 if ((f->handle>>16) != 0xFFFF) 577 if ((f->handle >> 16) != 0xFFFF)
572 NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif); 578 NLA_PUT_U32(skb, TCA_ROUTE4_IIF, f->iif);
573 } else { 579 } else {
574 id = f->id>>16; 580 id = f->id >> 16;
575 NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id); 581 NLA_PUT_U32(skb, TCA_ROUTE4_FROM, id);
576 } 582 }
577 if (f->res.classid) 583 if (f->res.classid)
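cls_route.c classifies on dst->tclassid: the low 8 bits select one of 256 "to" buckets, bits 16..19 select one of 16 "from" slots inside the bucket, incoming-interface matches use the next 16 slots, slot 32 is the wildcard, and a 16-entry fastmap caches recent lookups; the hunks above only reformat these helpers. A runnable sketch that copies the cleaned-up hash helpers and pushes one made-up tclassid through them:

	#include <stdio.h>

	typedef unsigned int u32;

	static inline int route4_fastmap_hash(u32 id, int iif)
	{
		return id & 0xF;			/* 16-entry fastmap */
	}

	static inline int route4_hash_to(u32 id)
	{
		return id & 0xFF;			/* "to" tag: low 8 bits */
	}

	static inline int route4_hash_from(u32 id)
	{
		return (id >> 16) & 0xF;		/* "from" tag: bits 16..19 */
	}

	static inline int route4_hash_iif(int iif)
	{
		return 16 + ((iif >> 16) & 0xF);	/* iif slots follow the 16 "from" slots */
	}

	static inline int route4_hash_wild(void)
	{
		return 32;				/* single wildcard slot */
	}

	int main(void)
	{
		u32 id = 0x00030047;	/* invented tclassid: from = 3, to = 0x47 */
		int iif = 2;		/* invented incoming ifindex */

		printf("fastmap %d, bucket %d, from %d, iif slot %d, wild %d\n",
		       route4_fastmap_hash(id, iif), route4_hash_to(id),
		       route4_hash_from(id), route4_hash_iif(iif),
		       route4_hash_wild());
		return 0;
	}

With the example inputs it prints fastmap 7, bucket 71, from 3, iif slot 16, wild 32.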
diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h
index 425a1790b048..402c44b241a3 100644
--- a/net/sched/cls_rsvp.h
+++ b/net/sched/cls_rsvp.h
@@ -66,28 +66,25 @@
66 powerful classification engine. */ 66 powerful classification engine. */
67 67
68 68
69struct rsvp_head 69struct rsvp_head {
70{
71 u32 tmap[256/32]; 70 u32 tmap[256/32];
72 u32 hgenerator; 71 u32 hgenerator;
73 u8 tgenerator; 72 u8 tgenerator;
74 struct rsvp_session *ht[256]; 73 struct rsvp_session *ht[256];
75}; 74};
76 75
77struct rsvp_session 76struct rsvp_session {
78{
79 struct rsvp_session *next; 77 struct rsvp_session *next;
80 __be32 dst[RSVP_DST_LEN]; 78 __be32 dst[RSVP_DST_LEN];
81 struct tc_rsvp_gpi dpi; 79 struct tc_rsvp_gpi dpi;
82 u8 protocol; 80 u8 protocol;
83 u8 tunnelid; 81 u8 tunnelid;
84 /* 16 (src,sport) hash slots, and one wildcard source slot */ 82 /* 16 (src,sport) hash slots, and one wildcard source slot */
85 struct rsvp_filter *ht[16+1]; 83 struct rsvp_filter *ht[16 + 1];
86}; 84};
87 85
88 86
89struct rsvp_filter 87struct rsvp_filter {
90{
91 struct rsvp_filter *next; 88 struct rsvp_filter *next;
92 __be32 src[RSVP_DST_LEN]; 89 __be32 src[RSVP_DST_LEN];
93 struct tc_rsvp_gpi spi; 90 struct tc_rsvp_gpi spi;
@@ -100,17 +97,19 @@ struct rsvp_filter
100 struct rsvp_session *sess; 97 struct rsvp_session *sess;
101}; 98};
102 99
103static __inline__ unsigned hash_dst(__be32 *dst, u8 protocol, u8 tunnelid) 100static inline unsigned int hash_dst(__be32 *dst, u8 protocol, u8 tunnelid)
104{ 101{
105 unsigned h = (__force __u32)dst[RSVP_DST_LEN-1]; 102 unsigned int h = (__force __u32)dst[RSVP_DST_LEN - 1];
103
106 h ^= h>>16; 104 h ^= h>>16;
107 h ^= h>>8; 105 h ^= h>>8;
108 return (h ^ protocol ^ tunnelid) & 0xFF; 106 return (h ^ protocol ^ tunnelid) & 0xFF;
109} 107}
110 108
111static __inline__ unsigned hash_src(__be32 *src) 109static inline unsigned int hash_src(__be32 *src)
112{ 110{
113 unsigned h = (__force __u32)src[RSVP_DST_LEN-1]; 111 unsigned int h = (__force __u32)src[RSVP_DST_LEN-1];
112
114 h ^= h>>16; 113 h ^= h>>16;
115 h ^= h>>8; 114 h ^= h>>8;
116 h ^= h>>4; 115 h ^= h>>4;
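hash_dst() folds the last 32-bit word of the destination address down to a byte with repeated xor-shifts and mixes in the protocol and tunnel id to pick one of 256 session buckets; the hunk above only modernises its spelling (inline, unsigned int, spacing). A runnable copy of that fold, fed made-up inputs: the address word below is arbitrary, 46 is the RSVP protocol number, and tunnelid 0 means no tunnel.

	#include <stdio.h>

	typedef unsigned int u32;
	typedef unsigned char u8;

	static inline unsigned int hash_dst_example(u32 dst_last_word,
						    u8 protocol, u8 tunnelid)
	{
		unsigned int h = dst_last_word;

		h ^= h >> 16;				/* fold high half down  */
		h ^= h >> 8;				/* fold again to a byte */
		return (h ^ protocol ^ tunnelid) & 0xFF;	/* 256 buckets */
	}

	int main(void)
	{
		printf("bucket %u\n", hash_dst_example(0xC0A80001u, 46, 0));
		return 0;
	}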
@@ -134,10 +133,10 @@ static struct tcf_ext_map rsvp_ext_map = {
134static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, 133static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp,
135 struct tcf_result *res) 134 struct tcf_result *res)
136{ 135{
137 struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; 136 struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;
138 struct rsvp_session *s; 137 struct rsvp_session *s;
139 struct rsvp_filter *f; 138 struct rsvp_filter *f;
140 unsigned h1, h2; 139 unsigned int h1, h2;
141 __be32 *dst, *src; 140 __be32 *dst, *src;
142 u8 protocol; 141 u8 protocol;
143 u8 tunnelid = 0; 142 u8 tunnelid = 0;
@@ -162,13 +161,13 @@ restart:
162 src = &nhptr->saddr.s6_addr32[0]; 161 src = &nhptr->saddr.s6_addr32[0];
163 dst = &nhptr->daddr.s6_addr32[0]; 162 dst = &nhptr->daddr.s6_addr32[0];
164 protocol = nhptr->nexthdr; 163 protocol = nhptr->nexthdr;
165 xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr); 164 xprt = ((u8 *)nhptr) + sizeof(struct ipv6hdr);
166#else 165#else
167 src = &nhptr->saddr; 166 src = &nhptr->saddr;
168 dst = &nhptr->daddr; 167 dst = &nhptr->daddr;
169 protocol = nhptr->protocol; 168 protocol = nhptr->protocol;
170 xprt = ((u8*)nhptr) + (nhptr->ihl<<2); 169 xprt = ((u8 *)nhptr) + (nhptr->ihl<<2);
171 if (nhptr->frag_off & htons(IP_MF|IP_OFFSET)) 170 if (nhptr->frag_off & htons(IP_MF | IP_OFFSET))
172 return -1; 171 return -1;
173#endif 172#endif
174 173
@@ -176,10 +175,10 @@ restart:
176 h2 = hash_src(src); 175 h2 = hash_src(src);
177 176
178 for (s = sht[h1]; s; s = s->next) { 177 for (s = sht[h1]; s; s = s->next) {
179 if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && 178 if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN - 1] &&
180 protocol == s->protocol && 179 protocol == s->protocol &&
181 !(s->dpi.mask & 180 !(s->dpi.mask &
182 (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) && 181 (*(u32 *)(xprt + s->dpi.offset) ^ s->dpi.key)) &&
183#if RSVP_DST_LEN == 4 182#if RSVP_DST_LEN == 4
184 dst[0] == s->dst[0] && 183 dst[0] == s->dst[0] &&
185 dst[1] == s->dst[1] && 184 dst[1] == s->dst[1] &&
@@ -188,8 +187,8 @@ restart:
188 tunnelid == s->tunnelid) { 187 tunnelid == s->tunnelid) {
189 188
190 for (f = s->ht[h2]; f; f = f->next) { 189 for (f = s->ht[h2]; f; f = f->next) {
191 if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] && 190 if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN - 1] &&
192 !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key)) 191 !(f->spi.mask & (*(u32 *)(xprt + f->spi.offset) ^ f->spi.key))
193#if RSVP_DST_LEN == 4 192#if RSVP_DST_LEN == 4
194 && 193 &&
195 src[0] == f->src[0] && 194 src[0] == f->src[0] &&
@@ -205,7 +204,7 @@ matched:
205 return 0; 204 return 0;
206 205
207 tunnelid = f->res.classid; 206 tunnelid = f->res.classid;
208 nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr)); 207 nhptr = (void *)(xprt + f->tunnelhdr - sizeof(*nhptr));
209 goto restart; 208 goto restart;
210 } 209 }
211 } 210 }
@@ -224,11 +223,11 @@ matched:
224 223
225static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) 224static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle)
226{ 225{
227 struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; 226 struct rsvp_session **sht = ((struct rsvp_head *)tp->root)->ht;
228 struct rsvp_session *s; 227 struct rsvp_session *s;
229 struct rsvp_filter *f; 228 struct rsvp_filter *f;
230 unsigned h1 = handle&0xFF; 229 unsigned int h1 = handle & 0xFF;
231 unsigned h2 = (handle>>8)&0xFF; 230 unsigned int h2 = (handle >> 8) & 0xFF;
232 231
233 if (h2 > 16) 232 if (h2 > 16)
234 return 0; 233 return 0;
@@ -258,7 +257,7 @@ static int rsvp_init(struct tcf_proto *tp)
258 return -ENOBUFS; 257 return -ENOBUFS;
259} 258}
260 259
261static inline void 260static void
262rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) 261rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f)
263{ 262{
264 tcf_unbind_filter(tp, &f->res); 263 tcf_unbind_filter(tp, &f->res);
@@ -277,13 +276,13 @@ static void rsvp_destroy(struct tcf_proto *tp)
277 276
278 sht = data->ht; 277 sht = data->ht;
279 278
280 for (h1=0; h1<256; h1++) { 279 for (h1 = 0; h1 < 256; h1++) {
281 struct rsvp_session *s; 280 struct rsvp_session *s;
282 281
283 while ((s = sht[h1]) != NULL) { 282 while ((s = sht[h1]) != NULL) {
284 sht[h1] = s->next; 283 sht[h1] = s->next;
285 284
286 for (h2=0; h2<=16; h2++) { 285 for (h2 = 0; h2 <= 16; h2++) {
287 struct rsvp_filter *f; 286 struct rsvp_filter *f;
288 287
289 while ((f = s->ht[h2]) != NULL) { 288 while ((f = s->ht[h2]) != NULL) {
@@ -299,13 +298,13 @@ static void rsvp_destroy(struct tcf_proto *tp)
299 298
300static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) 299static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
301{ 300{
302 struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg; 301 struct rsvp_filter **fp, *f = (struct rsvp_filter *)arg;
303 unsigned h = f->handle; 302 unsigned int h = f->handle;
304 struct rsvp_session **sp; 303 struct rsvp_session **sp;
305 struct rsvp_session *s = f->sess; 304 struct rsvp_session *s = f->sess;
306 int i; 305 int i;
307 306
308 for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) { 307 for (fp = &s->ht[(h >> 8) & 0xFF]; *fp; fp = &(*fp)->next) {
309 if (*fp == f) { 308 if (*fp == f) {
310 tcf_tree_lock(tp); 309 tcf_tree_lock(tp);
311 *fp = f->next; 310 *fp = f->next;
@@ -314,12 +313,12 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
314 313
315 /* Strip tree */ 314 /* Strip tree */
316 315
317 for (i=0; i<=16; i++) 316 for (i = 0; i <= 16; i++)
318 if (s->ht[i]) 317 if (s->ht[i])
319 return 0; 318 return 0;
320 319
321 /* OK, session has no flows */ 320 /* OK, session has no flows */
322 for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; 321 for (sp = &((struct rsvp_head *)tp->root)->ht[h & 0xFF];
323 *sp; sp = &(*sp)->next) { 322 *sp; sp = &(*sp)->next) {
324 if (*sp == s) { 323 if (*sp == s) {
325 tcf_tree_lock(tp); 324 tcf_tree_lock(tp);
@@ -337,13 +336,14 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg)
337 return 0; 336 return 0;
338} 337}
339 338
340static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) 339static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt)
341{ 340{
342 struct rsvp_head *data = tp->root; 341 struct rsvp_head *data = tp->root;
343 int i = 0xFFFF; 342 int i = 0xFFFF;
344 343
345 while (i-- > 0) { 344 while (i-- > 0) {
346 u32 h; 345 u32 h;
346
347 if ((data->hgenerator += 0x10000) == 0) 347 if ((data->hgenerator += 0x10000) == 0)
348 data->hgenerator = 0x10000; 348 data->hgenerator = 0x10000;
349 h = data->hgenerator|salt; 349 h = data->hgenerator|salt;
@@ -355,10 +355,10 @@ static unsigned gen_handle(struct tcf_proto *tp, unsigned salt)
355 355
356static int tunnel_bts(struct rsvp_head *data) 356static int tunnel_bts(struct rsvp_head *data)
357{ 357{
358 int n = data->tgenerator>>5; 358 int n = data->tgenerator >> 5;
359 u32 b = 1<<(data->tgenerator&0x1F); 359 u32 b = 1 << (data->tgenerator & 0x1F);
360 360
361 if (data->tmap[n]&b) 361 if (data->tmap[n] & b)
362 return 0; 362 return 0;
363 data->tmap[n] |= b; 363 data->tmap[n] |= b;
364 return 1; 364 return 1;
@@ -372,10 +372,10 @@ static void tunnel_recycle(struct rsvp_head *data)
372 372
373 memset(tmap, 0, sizeof(tmap)); 373 memset(tmap, 0, sizeof(tmap));
374 374
375 for (h1=0; h1<256; h1++) { 375 for (h1 = 0; h1 < 256; h1++) {
376 struct rsvp_session *s; 376 struct rsvp_session *s;
377 for (s = sht[h1]; s; s = s->next) { 377 for (s = sht[h1]; s; s = s->next) {
378 for (h2=0; h2<=16; h2++) { 378 for (h2 = 0; h2 <= 16; h2++) {
379 struct rsvp_filter *f; 379 struct rsvp_filter *f;
380 380
381 for (f = s->ht[h2]; f; f = f->next) { 381 for (f = s->ht[h2]; f; f = f->next) {
@@ -395,8 +395,8 @@ static u32 gen_tunnel(struct rsvp_head *data)
395{ 395{
396 int i, k; 396 int i, k;
397 397
398 for (k=0; k<2; k++) { 398 for (k = 0; k < 2; k++) {
399 for (i=255; i>0; i--) { 399 for (i = 255; i > 0; i--) {
400 if (++data->tgenerator == 0) 400 if (++data->tgenerator == 0)
401 data->tgenerator = 1; 401 data->tgenerator = 1;
402 if (tunnel_bts(data)) 402 if (tunnel_bts(data))
@@ -428,7 +428,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
428 struct nlattr *opt = tca[TCA_OPTIONS-1]; 428 struct nlattr *opt = tca[TCA_OPTIONS-1];
429 struct nlattr *tb[TCA_RSVP_MAX + 1]; 429 struct nlattr *tb[TCA_RSVP_MAX + 1];
430 struct tcf_exts e; 430 struct tcf_exts e;
431 unsigned h1, h2; 431 unsigned int h1, h2;
432 __be32 *dst; 432 __be32 *dst;
433 int err; 433 int err;
434 434
@@ -443,7 +443,8 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
443 if (err < 0) 443 if (err < 0)
444 return err; 444 return err;
445 445
446 if ((f = (struct rsvp_filter*)*arg) != NULL) { 446 f = (struct rsvp_filter *)*arg;
447 if (f) {
447 /* Node exists: adjust only classid */ 448 /* Node exists: adjust only classid */
448 449
449 if (f->handle != handle && handle) 450 if (f->handle != handle && handle)
@@ -500,7 +501,7 @@ static int rsvp_change(struct tcf_proto *tp, unsigned long base,
500 goto errout; 501 goto errout;
501 } 502 }
502 503
503 for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) { 504 for (sp = &data->ht[h1]; (s = *sp) != NULL; sp = &s->next) {
504 if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && 505 if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] &&
505 pinfo && pinfo->protocol == s->protocol && 506 pinfo && pinfo->protocol == s->protocol &&
506 memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 && 507 memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 &&
@@ -523,7 +524,7 @@ insert:
523 tcf_exts_change(tp, &f->exts, &e); 524 tcf_exts_change(tp, &f->exts, &e);
524 525
525 for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) 526 for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next)
526 if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask) 527 if (((*fp)->spi.mask & f->spi.mask) != f->spi.mask)
527 break; 528 break;
528 f->next = *fp; 529 f->next = *fp;
529 wmb(); 530 wmb();
@@ -567,7 +568,7 @@ errout2:
567static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) 568static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
568{ 569{
569 struct rsvp_head *head = tp->root; 570 struct rsvp_head *head = tp->root;
570 unsigned h, h1; 571 unsigned int h, h1;
571 572
572 if (arg->stop) 573 if (arg->stop)
573 return; 574 return;
@@ -598,7 +599,7 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg)
598static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, 599static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
599 struct sk_buff *skb, struct tcmsg *t) 600 struct sk_buff *skb, struct tcmsg *t)
600{ 601{
601 struct rsvp_filter *f = (struct rsvp_filter*)fh; 602 struct rsvp_filter *f = (struct rsvp_filter *)fh;
602 struct rsvp_session *s; 603 struct rsvp_session *s;
603 unsigned char *b = skb_tail_pointer(skb); 604 unsigned char *b = skb_tail_pointer(skb);
604 struct nlattr *nest; 605 struct nlattr *nest;
@@ -624,7 +625,7 @@ static int rsvp_dump(struct tcf_proto *tp, unsigned long fh,
624 NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); 625 NLA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo);
625 if (f->res.classid) 626 if (f->res.classid)
626 NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid); 627 NLA_PUT_U32(skb, TCA_RSVP_CLASSID, f->res.classid);
627 if (((f->handle>>8)&0xFF) != 16) 628 if (((f->handle >> 8) & 0xFF) != 16)
628 NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); 629 NLA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src);
629 630
630 if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) 631 if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0)
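The rsvp_get() hunk above treats a filter handle as two packed hash indices: the low byte selects one of the 256 session buckets keyed on destination, the next byte selects one of the 17 per-session filter buckets (slot 16 holds wildcard-source filters, hence the h2 > 16 check). A minimal standalone sketch of that decoding follows; the field layout beyond what the hunk shows is an assumption.

#include <stdio.h>

static void decode_handle(unsigned int handle)
{
	unsigned int h1 = handle & 0xFF;        /* session bucket (dst hash) */
	unsigned int h2 = (handle >> 8) & 0xFF; /* filter bucket (src hash)  */

	if (h2 > 16)                            /* same bound rsvp_get() checks */
		printf("0x%08x: invalid filter bucket %u\n", handle, h2);
	else
		printf("0x%08x: session bucket %u, filter bucket %u\n",
		       handle, h1, h2);
}

int main(void)
{
	decode_handle(0x00011042);	/* bucket 16 is the wildcard slot */
	decode_handle(0x0001ff42);	/* rejected, filter bucket > 16   */
	return 0;
}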
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 20ef330bb918..36667fa64237 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -249,7 +249,7 @@ tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle,
249 * of the hashing index is below the threshold. 249 * of the hashing index is below the threshold.
250 */ 250 */
251 if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) 251 if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD)
252 cp.hash = (cp.mask >> cp.shift)+1; 252 cp.hash = (cp.mask >> cp.shift) + 1;
253 else 253 else
254 cp.hash = DEFAULT_HASH_SIZE; 254 cp.hash = DEFAULT_HASH_SIZE;
255 } 255 }
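The tcindex hunk above only reindents the perfect-hash sizing, but the arithmetic is worth spelling out: tcindex looks filters up by (tc_index & mask) >> shift (that expression is not part of this hunk), so the largest possible key is mask >> shift and a perfect table needs exactly (mask >> shift) + 1 slots. A standalone sketch with illustrative values; the real PERFECT_HASH_THRESHOLD and DEFAULT_HASH_SIZE are not shown here.

#include <stdio.h>

int main(void)
{
	unsigned int mask = 0x00f0, shift = 4;
	unsigned int max_key = mask >> shift;	/* largest possible key value */
	unsigned int slots = max_key + 1;	/* perfect table size          */
	unsigned int tc_index;

	for (tc_index = 0; tc_index <= 0xff; tc_index++) {
		unsigned int key = (tc_index & mask) >> shift;

		if (key >= slots)
			return 1;		/* can never happen */
	}
	printf("mask=0x%x shift=%u -> %u perfect-hash slots\n",
	       mask, shift, slots);
	return 0;
}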
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index b0c2a82178af..966920c14e7a 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -42,8 +42,7 @@
42#include <net/act_api.h> 42#include <net/act_api.h>
43#include <net/pkt_cls.h> 43#include <net/pkt_cls.h>
44 44
45struct tc_u_knode 45struct tc_u_knode {
46{
47 struct tc_u_knode *next; 46 struct tc_u_knode *next;
48 u32 handle; 47 u32 handle;
49 struct tc_u_hnode *ht_up; 48 struct tc_u_hnode *ht_up;
@@ -63,19 +62,17 @@ struct tc_u_knode
63 struct tc_u32_sel sel; 62 struct tc_u32_sel sel;
64}; 63};
65 64
66struct tc_u_hnode 65struct tc_u_hnode {
67{
68 struct tc_u_hnode *next; 66 struct tc_u_hnode *next;
69 u32 handle; 67 u32 handle;
70 u32 prio; 68 u32 prio;
71 struct tc_u_common *tp_c; 69 struct tc_u_common *tp_c;
72 int refcnt; 70 int refcnt;
73 unsigned divisor; 71 unsigned int divisor;
74 struct tc_u_knode *ht[1]; 72 struct tc_u_knode *ht[1];
75}; 73};
76 74
77struct tc_u_common 75struct tc_u_common {
78{
79 struct tc_u_hnode *hlist; 76 struct tc_u_hnode *hlist;
80 struct Qdisc *q; 77 struct Qdisc *q;
81 int refcnt; 78 int refcnt;
@@ -87,9 +84,11 @@ static const struct tcf_ext_map u32_ext_map = {
87 .police = TCA_U32_POLICE 84 .police = TCA_U32_POLICE
88}; 85};
89 86
90static __inline__ unsigned u32_hash_fold(__be32 key, struct tc_u32_sel *sel, u8 fshift) 87static inline unsigned int u32_hash_fold(__be32 key,
88 const struct tc_u32_sel *sel,
89 u8 fshift)
91{ 90{
92 unsigned h = ntohl(key & sel->hmask)>>fshift; 91 unsigned int h = ntohl(key & sel->hmask) >> fshift;
93 92
94 return h; 93 return h;
95} 94}
@@ -101,7 +100,7 @@ static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_re
101 unsigned int off; 100 unsigned int off;
102 } stack[TC_U32_MAXDEPTH]; 101 } stack[TC_U32_MAXDEPTH];
103 102
104 struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; 103 struct tc_u_hnode *ht = (struct tc_u_hnode *)tp->root;
105 unsigned int off = skb_network_offset(skb); 104 unsigned int off = skb_network_offset(skb);
106 struct tc_u_knode *n; 105 struct tc_u_knode *n;
107 int sdepth = 0; 106 int sdepth = 0;
@@ -120,7 +119,7 @@ next_knode:
120 struct tc_u32_key *key = n->sel.keys; 119 struct tc_u32_key *key = n->sel.keys;
121 120
122#ifdef CONFIG_CLS_U32_PERF 121#ifdef CONFIG_CLS_U32_PERF
123 n->pf->rcnt +=1; 122 n->pf->rcnt += 1;
124 j = 0; 123 j = 0;
125#endif 124#endif
126 125
@@ -133,7 +132,7 @@ next_knode:
133 } 132 }
134#endif 133#endif
135 134
136 for (i = n->sel.nkeys; i>0; i--, key++) { 135 for (i = n->sel.nkeys; i > 0; i--, key++) {
137 int toff = off + key->off + (off2 & key->offmask); 136 int toff = off + key->off + (off2 & key->offmask);
138 __be32 *data, _data; 137 __be32 *data, _data;
139 138
@@ -148,13 +147,13 @@ next_knode:
148 goto next_knode; 147 goto next_knode;
149 } 148 }
150#ifdef CONFIG_CLS_U32_PERF 149#ifdef CONFIG_CLS_U32_PERF
151 n->pf->kcnts[j] +=1; 150 n->pf->kcnts[j] += 1;
152 j++; 151 j++;
153#endif 152#endif
154 } 153 }
155 if (n->ht_down == NULL) { 154 if (n->ht_down == NULL) {
156check_terminal: 155check_terminal:
157 if (n->sel.flags&TC_U32_TERMINAL) { 156 if (n->sel.flags & TC_U32_TERMINAL) {
158 157
159 *res = n->res; 158 *res = n->res;
160#ifdef CONFIG_NET_CLS_IND 159#ifdef CONFIG_NET_CLS_IND
@@ -164,7 +163,7 @@ check_terminal:
164 } 163 }
165#endif 164#endif
166#ifdef CONFIG_CLS_U32_PERF 165#ifdef CONFIG_CLS_U32_PERF
167 n->pf->rhit +=1; 166 n->pf->rhit += 1;
168#endif 167#endif
169 r = tcf_exts_exec(skb, &n->exts, res); 168 r = tcf_exts_exec(skb, &n->exts, res);
170 if (r < 0) { 169 if (r < 0) {
@@ -197,10 +196,10 @@ check_terminal:
197 sel = ht->divisor & u32_hash_fold(*data, &n->sel, 196 sel = ht->divisor & u32_hash_fold(*data, &n->sel,
198 n->fshift); 197 n->fshift);
199 } 198 }
200 if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) 199 if (!(n->sel.flags & (TC_U32_VAROFFSET | TC_U32_OFFSET | TC_U32_EAT)))
201 goto next_ht; 200 goto next_ht;
202 201
203 if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) { 202 if (n->sel.flags & (TC_U32_OFFSET | TC_U32_VAROFFSET)) {
204 off2 = n->sel.off + 3; 203 off2 = n->sel.off + 3;
205 if (n->sel.flags & TC_U32_VAROFFSET) { 204 if (n->sel.flags & TC_U32_VAROFFSET) {
206 __be16 *data, _data; 205 __be16 *data, _data;
@@ -215,7 +214,7 @@ check_terminal:
215 } 214 }
216 off2 &= ~3; 215 off2 &= ~3;
217 } 216 }
218 if (n->sel.flags&TC_U32_EAT) { 217 if (n->sel.flags & TC_U32_EAT) {
219 off += off2; 218 off += off2;
220 off2 = 0; 219 off2 = 0;
221 } 220 }
@@ -236,11 +235,11 @@ out:
236 235
237deadloop: 236deadloop:
238 if (net_ratelimit()) 237 if (net_ratelimit())
239 printk(KERN_WARNING "cls_u32: dead loop\n"); 238 pr_warning("cls_u32: dead loop\n");
240 return -1; 239 return -1;
241} 240}
242 241
243static __inline__ struct tc_u_hnode * 242static struct tc_u_hnode *
244u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) 243u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
245{ 244{
246 struct tc_u_hnode *ht; 245 struct tc_u_hnode *ht;
@@ -252,10 +251,10 @@ u32_lookup_ht(struct tc_u_common *tp_c, u32 handle)
252 return ht; 251 return ht;
253} 252}
254 253
255static __inline__ struct tc_u_knode * 254static struct tc_u_knode *
256u32_lookup_key(struct tc_u_hnode *ht, u32 handle) 255u32_lookup_key(struct tc_u_hnode *ht, u32 handle)
257{ 256{
258 unsigned sel; 257 unsigned int sel;
259 struct tc_u_knode *n = NULL; 258 struct tc_u_knode *n = NULL;
260 259
261 sel = TC_U32_HASH(handle); 260 sel = TC_U32_HASH(handle);
@@ -300,7 +299,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c)
300 do { 299 do {
301 if (++tp_c->hgenerator == 0x7FF) 300 if (++tp_c->hgenerator == 0x7FF)
302 tp_c->hgenerator = 1; 301 tp_c->hgenerator = 1;
303 } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); 302 } while (--i > 0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20));
304 303
305 return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; 304 return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0;
306} 305}
@@ -378,9 +377,9 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key)
378static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) 377static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
379{ 378{
380 struct tc_u_knode *n; 379 struct tc_u_knode *n;
381 unsigned h; 380 unsigned int h;
382 381
383 for (h=0; h<=ht->divisor; h++) { 382 for (h = 0; h <= ht->divisor; h++) {
384 while ((n = ht->ht[h]) != NULL) { 383 while ((n = ht->ht[h]) != NULL) {
385 ht->ht[h] = n->next; 384 ht->ht[h] = n->next;
386 385
@@ -446,13 +445,13 @@ static void u32_destroy(struct tcf_proto *tp)
446 445
447static int u32_delete(struct tcf_proto *tp, unsigned long arg) 446static int u32_delete(struct tcf_proto *tp, unsigned long arg)
448{ 447{
449 struct tc_u_hnode *ht = (struct tc_u_hnode*)arg; 448 struct tc_u_hnode *ht = (struct tc_u_hnode *)arg;
450 449
451 if (ht == NULL) 450 if (ht == NULL)
452 return 0; 451 return 0;
453 452
454 if (TC_U32_KEY(ht->handle)) 453 if (TC_U32_KEY(ht->handle))
455 return u32_delete_key(tp, (struct tc_u_knode*)ht); 454 return u32_delete_key(tp, (struct tc_u_knode *)ht);
456 455
457 if (tp->root == ht) 456 if (tp->root == ht)
458 return -EINVAL; 457 return -EINVAL;
@@ -470,14 +469,14 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)
470static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) 469static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle)
471{ 470{
472 struct tc_u_knode *n; 471 struct tc_u_knode *n;
473 unsigned i = 0x7FF; 472 unsigned int i = 0x7FF;
474 473
475 for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next) 474 for (n = ht->ht[TC_U32_HASH(handle)]; n; n = n->next)
476 if (i < TC_U32_NODE(n->handle)) 475 if (i < TC_U32_NODE(n->handle))
477 i = TC_U32_NODE(n->handle); 476 i = TC_U32_NODE(n->handle);
478 i++; 477 i++;
479 478
480 return handle|(i>0xFFF ? 0xFFF : i); 479 return handle | (i > 0xFFF ? 0xFFF : i);
481} 480}
482 481
483static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { 482static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
@@ -566,7 +565,8 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
566 if (err < 0) 565 if (err < 0)
567 return err; 566 return err;
568 567
569 if ((n = (struct tc_u_knode*)*arg) != NULL) { 568 n = (struct tc_u_knode *)*arg;
569 if (n) {
570 if (TC_U32_KEY(n->handle) == 0) 570 if (TC_U32_KEY(n->handle) == 0)
571 return -EINVAL; 571 return -EINVAL;
572 572
@@ -574,7 +574,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
574 } 574 }
575 575
576 if (tb[TCA_U32_DIVISOR]) { 576 if (tb[TCA_U32_DIVISOR]) {
577 unsigned divisor = nla_get_u32(tb[TCA_U32_DIVISOR]); 577 unsigned int divisor = nla_get_u32(tb[TCA_U32_DIVISOR]);
578 578
579 if (--divisor > 0x100) 579 if (--divisor > 0x100)
580 return -EINVAL; 580 return -EINVAL;
@@ -585,7 +585,7 @@ static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle,
585 if (handle == 0) 585 if (handle == 0)
586 return -ENOMEM; 586 return -ENOMEM;
587 } 587 }
588 ht = kzalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL); 588 ht = kzalloc(sizeof(*ht) + divisor*sizeof(void *), GFP_KERNEL);
589 if (ht == NULL) 589 if (ht == NULL)
590 return -ENOBUFS; 590 return -ENOBUFS;
591 ht->tp_c = tp_c; 591 ht->tp_c = tp_c;
@@ -683,7 +683,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
683 struct tc_u_common *tp_c = tp->data; 683 struct tc_u_common *tp_c = tp->data;
684 struct tc_u_hnode *ht; 684 struct tc_u_hnode *ht;
685 struct tc_u_knode *n; 685 struct tc_u_knode *n;
686 unsigned h; 686 unsigned int h;
687 687
688 if (arg->stop) 688 if (arg->stop)
689 return; 689 return;
@@ -717,7 +717,7 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg)
717static int u32_dump(struct tcf_proto *tp, unsigned long fh, 717static int u32_dump(struct tcf_proto *tp, unsigned long fh,
718 struct sk_buff *skb, struct tcmsg *t) 718 struct sk_buff *skb, struct tcmsg *t)
719{ 719{
720 struct tc_u_knode *n = (struct tc_u_knode*)fh; 720 struct tc_u_knode *n = (struct tc_u_knode *)fh;
721 struct nlattr *nest; 721 struct nlattr *nest;
722 722
723 if (n == NULL) 723 if (n == NULL)
@@ -730,8 +730,9 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
730 goto nla_put_failure; 730 goto nla_put_failure;
731 731
732 if (TC_U32_KEY(n->handle) == 0) { 732 if (TC_U32_KEY(n->handle) == 0) {
733 struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; 733 struct tc_u_hnode *ht = (struct tc_u_hnode *)fh;
734 u32 divisor = ht->divisor+1; 734 u32 divisor = ht->divisor + 1;
735
735 NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor); 736 NLA_PUT_U32(skb, TCA_U32_DIVISOR, divisor);
736 } else { 737 } else {
737 NLA_PUT(skb, TCA_U32_SEL, 738 NLA_PUT(skb, TCA_U32_SEL,
@@ -755,7 +756,7 @@ static int u32_dump(struct tcf_proto *tp, unsigned long fh,
755 goto nla_put_failure; 756 goto nla_put_failure;
756 757
757#ifdef CONFIG_NET_CLS_IND 758#ifdef CONFIG_NET_CLS_IND
758 if(strlen(n->indev)) 759 if (strlen(n->indev))
759 NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev); 760 NLA_PUT_STRING(skb, TCA_U32_INDEV, n->indev);
760#endif 761#endif
761#ifdef CONFIG_CLS_U32_PERF 762#ifdef CONFIG_CLS_U32_PERF
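u32_hash_fold(), reformatted above, turns one 32-bit word of the packet into a small hash index: mask the word while still in network byte order, convert to host order, shift right by fshift, and the caller then ANDs the result with the hash table divisor. A standalone userspace sketch of the same computation; the hmask, fshift and divisor values are made up for illustration.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

static unsigned int hash_fold(uint32_t key_be, uint32_t hmask_be,
			      unsigned int fshift)
{
	return ntohl(key_be & hmask_be) >> fshift;
}

int main(void)
{
	uint32_t word = htonl(0x0a0100ff);	/* e.g. an IPv4 address      */
	uint32_t hmask = htonl(0x000000f0);	/* hash on bits 4..7 of word */
	unsigned int fshift = 4;
	unsigned int divisor = 0x0f;		/* 16-bucket table           */
	unsigned int bucket = divisor & hash_fold(word, hmask, fshift);

	printf("bucket = %u\n", bucket);	/* 15 for this sample word   */
	return 0;
}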
diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c
index bc450397487a..1c8360a2752a 100644
--- a/net/sched/em_cmp.c
+++ b/net/sched/em_cmp.c
@@ -33,40 +33,41 @@ static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em,
33 return 0; 33 return 0;
34 34
35 switch (cmp->align) { 35 switch (cmp->align) {
36 case TCF_EM_ALIGN_U8: 36 case TCF_EM_ALIGN_U8:
37 val = *ptr; 37 val = *ptr;
38 break; 38 break;
39 39
40 case TCF_EM_ALIGN_U16: 40 case TCF_EM_ALIGN_U16:
41 val = get_unaligned_be16(ptr); 41 val = get_unaligned_be16(ptr);
42 42
43 if (cmp_needs_transformation(cmp)) 43 if (cmp_needs_transformation(cmp))
44 val = be16_to_cpu(val); 44 val = be16_to_cpu(val);
45 break; 45 break;
46 46
47 case TCF_EM_ALIGN_U32: 47 case TCF_EM_ALIGN_U32:
48 /* Worth checking boundries? The branching seems 48 /* Worth checking boundries? The branching seems
49 * to get worse. Visit again. */ 49 * to get worse. Visit again.
50 val = get_unaligned_be32(ptr); 50 */
51 val = get_unaligned_be32(ptr);
51 52
52 if (cmp_needs_transformation(cmp)) 53 if (cmp_needs_transformation(cmp))
53 val = be32_to_cpu(val); 54 val = be32_to_cpu(val);
54 break; 55 break;
55 56
56 default: 57 default:
57 return 0; 58 return 0;
58 } 59 }
59 60
60 if (cmp->mask) 61 if (cmp->mask)
61 val &= cmp->mask; 62 val &= cmp->mask;
62 63
63 switch (cmp->opnd) { 64 switch (cmp->opnd) {
64 case TCF_EM_OPND_EQ: 65 case TCF_EM_OPND_EQ:
65 return val == cmp->val; 66 return val == cmp->val;
66 case TCF_EM_OPND_LT: 67 case TCF_EM_OPND_LT:
67 return val < cmp->val; 68 return val < cmp->val;
68 case TCF_EM_OPND_GT: 69 case TCF_EM_OPND_GT:
69 return val > cmp->val; 70 return val > cmp->val;
70 } 71 }
71 72
72 return 0; 73 return 0;
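em_cmp_match() above reads u16/u32 operands from arbitrary packet offsets with get_unaligned_be16()/get_unaligned_be32(). A portable userspace analogue of those helpers is a memcpy into a local variable followed by a byte-order conversion; the sketch below only mirrors that idea and is not the kernel implementation.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

static uint16_t load_be16(const unsigned char *p)
{
	uint16_t v;

	memcpy(&v, p, sizeof(v));	/* safe for any alignment */
	return ntohs(v);
}

static uint32_t load_be32(const unsigned char *p)
{
	uint32_t v;

	memcpy(&v, p, sizeof(v));
	return ntohl(v);
}

int main(void)
{
	unsigned char pkt[] = { 0x00, 0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc };

	printf("be16 at +1: 0x%04x\n", load_be16(pkt + 1));	/* 0x1234     */
	printf("be32 at +3: 0x%08x\n", load_be32(pkt + 3));	/* 0x56789abc */
	return 0;
}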
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index 34da5e29ea1a..a889d099320f 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -73,21 +73,18 @@
73#include <net/pkt_cls.h> 73#include <net/pkt_cls.h>
74#include <net/sock.h> 74#include <net/sock.h>
75 75
76struct meta_obj 76struct meta_obj {
77{
78 unsigned long value; 77 unsigned long value;
79 unsigned int len; 78 unsigned int len;
80}; 79};
81 80
82struct meta_value 81struct meta_value {
83{
84 struct tcf_meta_val hdr; 82 struct tcf_meta_val hdr;
85 unsigned long val; 83 unsigned long val;
86 unsigned int len; 84 unsigned int len;
87}; 85};
88 86
89struct meta_match 87struct meta_match {
90{
91 struct meta_value lvalue; 88 struct meta_value lvalue;
92 struct meta_value rvalue; 89 struct meta_value rvalue;
93}; 90};
@@ -255,7 +252,7 @@ META_COLLECTOR(int_rtclassid)
255 if (unlikely(skb_dst(skb) == NULL)) 252 if (unlikely(skb_dst(skb) == NULL))
256 *err = -1; 253 *err = -1;
257 else 254 else
258#ifdef CONFIG_NET_CLS_ROUTE 255#ifdef CONFIG_IP_ROUTE_CLASSID
259 dst->value = skb_dst(skb)->tclassid; 256 dst->value = skb_dst(skb)->tclassid;
260#else 257#else
261 dst->value = 0; 258 dst->value = 0;
@@ -483,8 +480,7 @@ META_COLLECTOR(int_sk_write_pend)
483 * Meta value collectors assignment table 480 * Meta value collectors assignment table
484 **************************************************************************/ 481 **************************************************************************/
485 482
486struct meta_ops 483struct meta_ops {
487{
488 void (*get)(struct sk_buff *, struct tcf_pkt_info *, 484 void (*get)(struct sk_buff *, struct tcf_pkt_info *,
489 struct meta_value *, struct meta_obj *, int *); 485 struct meta_value *, struct meta_obj *, int *);
490}; 486};
@@ -494,7 +490,7 @@ struct meta_ops
494 490
495/* Meta value operations table listing all meta value collectors and 491/* Meta value operations table listing all meta value collectors and
496 * assigns them to a type and meta id. */ 492 * assigns them to a type and meta id. */
497static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { 493static struct meta_ops __meta_ops[TCF_META_TYPE_MAX + 1][TCF_META_ID_MAX + 1] = {
498 [TCF_META_TYPE_VAR] = { 494 [TCF_META_TYPE_VAR] = {
499 [META_ID(DEV)] = META_FUNC(var_dev), 495 [META_ID(DEV)] = META_FUNC(var_dev),
500 [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if), 496 [META_ID(SK_BOUND_IF)] = META_FUNC(var_sk_bound_if),
@@ -550,7 +546,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = {
550 } 546 }
551}; 547};
552 548
553static inline struct meta_ops * meta_ops(struct meta_value *val) 549static inline struct meta_ops *meta_ops(struct meta_value *val)
554{ 550{
555 return &__meta_ops[meta_type(val)][meta_id(val)]; 551 return &__meta_ops[meta_type(val)][meta_id(val)];
556} 552}
@@ -649,9 +645,8 @@ static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv)
649{ 645{
650 if (v->len == sizeof(unsigned long)) 646 if (v->len == sizeof(unsigned long))
651 NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val); 647 NLA_PUT(skb, tlv, sizeof(unsigned long), &v->val);
652 else if (v->len == sizeof(u32)) { 648 else if (v->len == sizeof(u32))
653 NLA_PUT_U32(skb, tlv, v->val); 649 NLA_PUT_U32(skb, tlv, v->val);
654 }
655 650
656 return 0; 651 return 0;
657 652
@@ -663,8 +658,7 @@ nla_put_failure:
663 * Type specific operations table 658 * Type specific operations table
664 **************************************************************************/ 659 **************************************************************************/
665 660
666struct meta_type_ops 661struct meta_type_ops {
667{
668 void (*destroy)(struct meta_value *); 662 void (*destroy)(struct meta_value *);
669 int (*compare)(struct meta_obj *, struct meta_obj *); 663 int (*compare)(struct meta_obj *, struct meta_obj *);
670 int (*change)(struct meta_value *, struct nlattr *); 664 int (*change)(struct meta_value *, struct nlattr *);
@@ -672,7 +666,7 @@ struct meta_type_ops
672 int (*dump)(struct sk_buff *, struct meta_value *, int); 666 int (*dump)(struct sk_buff *, struct meta_value *, int);
673}; 667};
674 668
675static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = { 669static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX + 1] = {
676 [TCF_META_TYPE_VAR] = { 670 [TCF_META_TYPE_VAR] = {
677 .destroy = meta_var_destroy, 671 .destroy = meta_var_destroy,
678 .compare = meta_var_compare, 672 .compare = meta_var_compare,
@@ -688,7 +682,7 @@ static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = {
688 } 682 }
689}; 683};
690 684
691static inline struct meta_type_ops * meta_type_ops(struct meta_value *v) 685static inline struct meta_type_ops *meta_type_ops(struct meta_value *v)
692{ 686{
693 return &__meta_type_ops[meta_type(v)]; 687 return &__meta_type_ops[meta_type(v)];
694} 688}
@@ -713,7 +707,7 @@ static int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info,
713 return err; 707 return err;
714 708
715 if (meta_type_ops(v)->apply_extras) 709 if (meta_type_ops(v)->apply_extras)
716 meta_type_ops(v)->apply_extras(v, dst); 710 meta_type_ops(v)->apply_extras(v, dst);
717 711
718 return 0; 712 return 0;
719} 713}
@@ -732,12 +726,12 @@ static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m,
732 r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value); 726 r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
733 727
734 switch (meta->lvalue.hdr.op) { 728 switch (meta->lvalue.hdr.op) {
735 case TCF_EM_OPND_EQ: 729 case TCF_EM_OPND_EQ:
736 return !r; 730 return !r;
737 case TCF_EM_OPND_LT: 731 case TCF_EM_OPND_LT:
738 return r < 0; 732 return r < 0;
739 case TCF_EM_OPND_GT: 733 case TCF_EM_OPND_GT:
740 return r > 0; 734 return r > 0;
741 } 735 }
742 736
743 return 0; 737 return 0;
@@ -771,7 +765,7 @@ static inline int meta_change_data(struct meta_value *dst, struct nlattr *nla)
771 765
772static inline int meta_is_supported(struct meta_value *val) 766static inline int meta_is_supported(struct meta_value *val)
773{ 767{
774 return (!meta_id(val) || meta_ops(val)->get); 768 return !meta_id(val) || meta_ops(val)->get;
775} 769}
776 770
777static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = { 771static const struct nla_policy meta_policy[TCA_EM_META_MAX + 1] = {
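__meta_ops above is a function-pointer table indexed by [meta type][meta id] and filled with designated initializers, which is why meta_is_supported() can reduce to a NULL check on the get callback. A small standalone sketch of the same dispatch pattern; the names, sizes and callbacks are invented for illustration.

#include <stdio.h>

#define TYPE_MAX 1
#define ID_MAX   2

struct demo_ops {
	int (*get)(void);	/* NULL means "not implemented" */
};

static int get_var_dev(void)  { return 1; }
static int get_int_mark(void) { return 2; }

static const struct demo_ops demo_ops_table[TYPE_MAX + 1][ID_MAX + 1] = {
	[0] = { [0] = { .get = get_var_dev  } },
	[1] = { [1] = { .get = get_int_mark } },
};

static int supported(unsigned int type, unsigned int id)
{
	return type <= TYPE_MAX && id <= ID_MAX &&
	       demo_ops_table[type][id].get != NULL;
}

int main(void)
{
	printf("(0,0) supported: %d\n", supported(0, 0));	/* 1 */
	printf("(1,0) supported: %d\n", supported(1, 0));	/* 0 */
	return 0;
}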
diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c
index 1a4176aee6e5..a3bed07a008b 100644
--- a/net/sched/em_nbyte.c
+++ b/net/sched/em_nbyte.c
@@ -18,8 +18,7 @@
18#include <linux/tc_ematch/tc_em_nbyte.h> 18#include <linux/tc_ematch/tc_em_nbyte.h>
19#include <net/pkt_cls.h> 19#include <net/pkt_cls.h>
20 20
21struct nbyte_data 21struct nbyte_data {
22{
23 struct tcf_em_nbyte hdr; 22 struct tcf_em_nbyte hdr;
24 char pattern[0]; 23 char pattern[0];
25}; 24};
diff --git a/net/sched/em_text.c b/net/sched/em_text.c
index ea8f566e720c..15d353d2e4be 100644
--- a/net/sched/em_text.c
+++ b/net/sched/em_text.c
@@ -19,8 +19,7 @@
19#include <linux/tc_ematch/tc_em_text.h> 19#include <linux/tc_ematch/tc_em_text.h>
20#include <net/pkt_cls.h> 20#include <net/pkt_cls.h>
21 21
22struct text_match 22struct text_match {
23{
24 u16 from_offset; 23 u16 from_offset;
25 u16 to_offset; 24 u16 to_offset;
26 u8 from_layer; 25 u8 from_layer;
diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c
index 953f1479f7da..797bdb88c010 100644
--- a/net/sched/em_u32.c
+++ b/net/sched/em_u32.c
@@ -35,7 +35,7 @@ static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em,
35 if (!tcf_valid_offset(skb, ptr, sizeof(u32))) 35 if (!tcf_valid_offset(skb, ptr, sizeof(u32)))
36 return 0; 36 return 0;
37 37
38 return !(((*(__be32*) ptr) ^ key->val) & key->mask); 38 return !(((*(__be32 *) ptr) ^ key->val) & key->mask);
39} 39}
40 40
41static struct tcf_ematch_ops em_u32_ops = { 41static struct tcf_ematch_ops em_u32_ops = {
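The em_u32 match above is a plain value/mask comparison: XOR the packet word with the configured value and require that no bit under the mask differs. Standalone sketch below; the sample addresses and mask are invented, and everything stays in network byte order as in the original expression.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

static int u32_value_mask_match(uint32_t word_be, uint32_t val_be,
				uint32_t mask_be)
{
	return !((word_be ^ val_be) & mask_be);
}

int main(void)
{
	uint32_t val  = htonl(0x0a000000);	/* match 10.0.0.0/24 */
	uint32_t mask = htonl(0xffffff00);

	printf("10.0.0.1 in 10.0.0.0/24: %d\n",
	       u32_value_mask_match(htonl(0x0a000001), val, mask));	/* 1 */
	printf("10.0.1.1 in 10.0.0.0/24: %d\n",
	       u32_value_mask_match(htonl(0x0a000101), val, mask));	/* 0 */
	return 0;
}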
diff --git a/net/sched/ematch.c b/net/sched/ematch.c
index 5e37da961f80..88d93eb92507 100644
--- a/net/sched/ematch.c
+++ b/net/sched/ematch.c
@@ -93,7 +93,7 @@
93static LIST_HEAD(ematch_ops); 93static LIST_HEAD(ematch_ops);
94static DEFINE_RWLOCK(ematch_mod_lock); 94static DEFINE_RWLOCK(ematch_mod_lock);
95 95
96static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind) 96static struct tcf_ematch_ops *tcf_em_lookup(u16 kind)
97{ 97{
98 struct tcf_ematch_ops *e = NULL; 98 struct tcf_ematch_ops *e = NULL;
99 99
@@ -163,8 +163,8 @@ void tcf_em_unregister(struct tcf_ematch_ops *ops)
163} 163}
164EXPORT_SYMBOL(tcf_em_unregister); 164EXPORT_SYMBOL(tcf_em_unregister);
165 165
166static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree, 166static inline struct tcf_ematch *tcf_em_get_match(struct tcf_ematch_tree *tree,
167 int index) 167 int index)
168{ 168{
169 return &tree->matches[index]; 169 return &tree->matches[index];
170} 170}
@@ -184,7 +184,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
184 184
185 if (em_hdr->kind == TCF_EM_CONTAINER) { 185 if (em_hdr->kind == TCF_EM_CONTAINER) {
186 /* Special ematch called "container", carries an index 186 /* Special ematch called "container", carries an index
187 * referencing an external ematch sequence. */ 187 * referencing an external ematch sequence.
188 */
188 u32 ref; 189 u32 ref;
189 190
190 if (data_len < sizeof(ref)) 191 if (data_len < sizeof(ref))
@@ -195,7 +196,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
195 goto errout; 196 goto errout;
196 197
197 /* We do not allow backward jumps to avoid loops and jumps 198 /* We do not allow backward jumps to avoid loops and jumps
198 * to our own position are of course illegal. */ 199 * to our own position are of course illegal.
200 */
199 if (ref <= idx) 201 if (ref <= idx)
200 goto errout; 202 goto errout;
201 203
@@ -208,7 +210,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
208 * which automatically releases the reference again, therefore 210 * which automatically releases the reference again, therefore
209 * the module MUST not be given back under any circumstances 211 * the module MUST not be given back under any circumstances
210 * here. Be aware, the destroy function assumes that the 212 * here. Be aware, the destroy function assumes that the
211 * module is held if the ops field is non zero. */ 213 * module is held if the ops field is non zero.
214 */
212 em->ops = tcf_em_lookup(em_hdr->kind); 215 em->ops = tcf_em_lookup(em_hdr->kind);
213 216
214 if (em->ops == NULL) { 217 if (em->ops == NULL) {
@@ -221,7 +224,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
221 if (em->ops) { 224 if (em->ops) {
222 /* We dropped the RTNL mutex in order to 225 /* We dropped the RTNL mutex in order to
223 * perform the module load. Tell the caller 226 * perform the module load. Tell the caller
224 * to replay the request. */ 227 * to replay the request.
228 */
225 module_put(em->ops->owner); 229 module_put(em->ops->owner);
226 err = -EAGAIN; 230 err = -EAGAIN;
227 } 231 }
@@ -230,7 +234,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
230 } 234 }
231 235
232 /* ematch module provides expected length of data, so we 236 /* ematch module provides expected length of data, so we
233 * can do a basic sanity check. */ 237 * can do a basic sanity check.
238 */
234 if (em->ops->datalen && data_len < em->ops->datalen) 239 if (em->ops->datalen && data_len < em->ops->datalen)
235 goto errout; 240 goto errout;
236 241
@@ -246,7 +251,8 @@ static int tcf_em_validate(struct tcf_proto *tp,
246 * TCF_EM_SIMPLE may be specified stating that the 251 * TCF_EM_SIMPLE may be specified stating that the
247 * data only consists of a u32 integer and the module 252 * data only consists of a u32 integer and the module
248 * does not expected a memory reference but rather 253 * does not expected a memory reference but rather
249 * the value carried. */ 254 * the value carried.
255 */
250 if (em_hdr->flags & TCF_EM_SIMPLE) { 256 if (em_hdr->flags & TCF_EM_SIMPLE) {
251 if (data_len < sizeof(u32)) 257 if (data_len < sizeof(u32))
252 goto errout; 258 goto errout;
@@ -334,7 +340,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
334 * The array of rt attributes is parsed in the order as they are 340 * The array of rt attributes is parsed in the order as they are
335 * provided, their type must be incremental from 1 to n. Even 341 * provided, their type must be incremental from 1 to n. Even
336 * if it does not serve any real purpose, a failure of sticking 342 * if it does not serve any real purpose, a failure of sticking
337 * to this policy will result in parsing failure. */ 343 * to this policy will result in parsing failure.
344 */
338 for (idx = 0; nla_ok(rt_match, list_len); idx++) { 345 for (idx = 0; nla_ok(rt_match, list_len); idx++) {
339 err = -EINVAL; 346 err = -EINVAL;
340 347
@@ -359,7 +366,8 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla,
359 /* Check if the number of matches provided by userspace actually 366 /* Check if the number of matches provided by userspace actually
360 * complies with the array of matches. The number was used for 367 * complies with the array of matches. The number was used for
361 * the validation of references and a mismatch could lead to 368 * the validation of references and a mismatch could lead to
362 * undefined references during the matching process. */ 369 * undefined references during the matching process.
370 */
363 if (idx != tree_hdr->nmatches) { 371 if (idx != tree_hdr->nmatches) {
364 err = -EINVAL; 372 err = -EINVAL;
365 goto errout_abort; 373 goto errout_abort;
@@ -449,7 +457,7 @@ int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv)
449 .flags = em->flags 457 .flags = em->flags
450 }; 458 };
451 459
452 NLA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr); 460 NLA_PUT(skb, i + 1, sizeof(em_hdr), &em_hdr);
453 461
454 if (em->ops && em->ops->dump) { 462 if (em->ops && em->ops->dump) {
455 if (em->ops->dump(skb, em) < 0) 463 if (em->ops->dump(skb, em) < 0)
@@ -478,6 +486,7 @@ static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em,
478 struct tcf_pkt_info *info) 486 struct tcf_pkt_info *info)
479{ 487{
480 int r = em->ops->match(skb, em, info); 488 int r = em->ops->match(skb, em, info);
489
481 return tcf_em_is_inverted(em) ? !r : r; 490 return tcf_em_is_inverted(em) ? !r : r;
482} 491}
483 492
@@ -527,8 +536,8 @@ pop_stack:
527 536
528stack_overflow: 537stack_overflow:
529 if (net_ratelimit()) 538 if (net_ratelimit())
530 printk(KERN_WARNING "tc ematch: local stack overflow," 539 pr_warning("tc ematch: local stack overflow,"
531 " increase NET_EMATCH_STACK\n"); 540 " increase NET_EMATCH_STACK\n");
532 return -1; 541 return -1;
533} 542}
534EXPORT_SYMBOL(__tcf_em_tree_match); 543EXPORT_SYMBOL(__tcf_em_tree_match);
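tcf_em_match(), touched above, runs one ematch and, when that match was configured as inverted, negates the result before the tree logic combines it with the others. A standalone sketch of that wrapper; the flag name and the callbacks are placeholders rather than the real tcf_ematch API.

#include <stdio.h>

struct demo_match {
	int invert;			/* stand-in for an "inverted" flag */
	int (*match)(int pkt_value);
};

static int value_is_even(int pkt_value)
{
	return (pkt_value & 1) == 0;
}

static int run_match(const struct demo_match *em, int pkt_value)
{
	int r = em->match(pkt_value);

	return em->invert ? !r : r;	/* same shape as tcf_em_match() */
}

int main(void)
{
	struct demo_match even     = { 0, value_is_even };
	struct demo_match not_even = { 1, value_is_even };

	printf("even(4)=%d  not_even(4)=%d\n",
	       run_match(&even, 4), run_match(&not_even, 4));
	return 0;
}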
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index b22ca2d1cebc..150741579408 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -187,7 +187,7 @@ int unregister_qdisc(struct Qdisc_ops *qops)
187 int err = -ENOENT; 187 int err = -ENOENT;
188 188
189 write_lock(&qdisc_mod_lock); 189 write_lock(&qdisc_mod_lock);
190 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) 190 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191 if (q == qops) 191 if (q == qops)
192 break; 192 break;
193 if (q) { 193 if (q) {
@@ -321,7 +321,9 @@ void qdisc_put_rtab(struct qdisc_rate_table *tab)
321 if (!tab || --tab->refcnt) 321 if (!tab || --tab->refcnt)
322 return; 322 return;
323 323
324 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { 324 for (rtabp = &qdisc_rtab_list;
325 (rtab = *rtabp) != NULL;
326 rtabp = &rtab->next) {
325 if (rtab == tab) { 327 if (rtab == tab) {
326 *rtabp = rtab->next; 328 *rtabp = rtab->next;
327 kfree(rtab); 329 kfree(rtab);
@@ -396,6 +398,11 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
396 return stab; 398 return stab;
397} 399}
398 400
401static void stab_kfree_rcu(struct rcu_head *head)
402{
403 kfree(container_of(head, struct qdisc_size_table, rcu));
404}
405
399void qdisc_put_stab(struct qdisc_size_table *tab) 406void qdisc_put_stab(struct qdisc_size_table *tab)
400{ 407{
401 if (!tab) 408 if (!tab)
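stab_kfree_rcu(), added above, recovers the size table from its embedded rcu_head with container_of() once the grace period has elapsed, and the later hunks publish a new table with rcu_assign_pointer() before handing the old one to call_rcu_bh(). The sketch below shows only the container_of() step with a stand-in struct; the RCU machinery itself is left out.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_head { int pending; };

struct demo_size_table {
	unsigned int refcnt;
	struct demo_head rcu;	/* embedded callback head */
};

static void demo_free_cb(struct demo_head *head)
{
	struct demo_size_table *stab =
		container_of(head, struct demo_size_table, rcu);

	printf("freeing table with refcnt=%u\n", stab->refcnt);
	free(stab);
}

int main(void)
{
	struct demo_size_table *stab = calloc(1, sizeof(*stab));

	if (!stab)
		return 1;
	demo_free_cb(&stab->rcu);	/* normally invoked after a grace period */
	return 0;
}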
@@ -405,7 +412,7 @@ void qdisc_put_stab(struct qdisc_size_table *tab)
405 412
406 if (--tab->refcnt == 0) { 413 if (--tab->refcnt == 0) {
407 list_del(&tab->list); 414 list_del(&tab->list);
408 kfree(tab); 415 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
409 } 416 }
410 417
411 spin_unlock(&qdisc_stab_lock); 418 spin_unlock(&qdisc_stab_lock);
@@ -428,7 +435,7 @@ nla_put_failure:
428 return -1; 435 return -1;
429} 436}
430 437
431void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab) 438void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
432{ 439{
433 int pkt_len, slot; 440 int pkt_len, slot;
434 441
@@ -454,14 +461,13 @@ out:
454 pkt_len = 1; 461 pkt_len = 1;
455 qdisc_skb_cb(skb)->pkt_len = pkt_len; 462 qdisc_skb_cb(skb)->pkt_len = pkt_len;
456} 463}
457EXPORT_SYMBOL(qdisc_calculate_pkt_len); 464EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
458 465
459void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc) 466void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
460{ 467{
461 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { 468 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
462 printk(KERN_WARNING 469 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
463 "%s: %s qdisc %X: is non-work-conserving?\n", 470 txt, qdisc->ops->id, qdisc->handle >> 16);
464 txt, qdisc->ops->id, qdisc->handle >> 16);
465 qdisc->flags |= TCQ_F_WARN_NONWC; 471 qdisc->flags |= TCQ_F_WARN_NONWC;
466 } 472 }
467} 473}
@@ -472,7 +478,7 @@ static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
472 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, 478 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
473 timer); 479 timer);
474 480
475 wd->qdisc->flags &= ~TCQ_F_THROTTLED; 481 qdisc_unthrottled(wd->qdisc);
476 __netif_schedule(qdisc_root(wd->qdisc)); 482 __netif_schedule(qdisc_root(wd->qdisc));
477 483
478 return HRTIMER_NORESTART; 484 return HRTIMER_NORESTART;
@@ -494,7 +500,7 @@ void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
494 &qdisc_root_sleeping(wd->qdisc)->state)) 500 &qdisc_root_sleeping(wd->qdisc)->state))
495 return; 501 return;
496 502
497 wd->qdisc->flags |= TCQ_F_THROTTLED; 503 qdisc_throttled(wd->qdisc);
498 time = ktime_set(0, 0); 504 time = ktime_set(0, 0);
499 time = ktime_add_ns(time, PSCHED_TICKS2NS(expires)); 505 time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
500 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS); 506 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
@@ -504,7 +510,7 @@ EXPORT_SYMBOL(qdisc_watchdog_schedule);
504void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) 510void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
505{ 511{
506 hrtimer_cancel(&wd->timer); 512 hrtimer_cancel(&wd->timer);
507 wd->qdisc->flags &= ~TCQ_F_THROTTLED; 513 qdisc_unthrottled(wd->qdisc);
508} 514}
509EXPORT_SYMBOL(qdisc_watchdog_cancel); 515EXPORT_SYMBOL(qdisc_watchdog_cancel);
510 516
@@ -625,7 +631,7 @@ static u32 qdisc_alloc_handle(struct net_device *dev)
625 autohandle = TC_H_MAKE(0x80000000U, 0); 631 autohandle = TC_H_MAKE(0x80000000U, 0);
626 } while (qdisc_lookup(dev, autohandle) && --i > 0); 632 } while (qdisc_lookup(dev, autohandle) && --i > 0);
627 633
628 return i>0 ? autohandle : 0; 634 return i > 0 ? autohandle : 0;
629} 635}
630 636
631void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n) 637void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
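qdisc_alloc_handle(), touched above, keeps proposing handles in the 0x8000xxxx major range until qdisc_lookup() finds one unused or the retry budget runs out, and returns 0 on failure. The sketch restates the surrounding loop and the TC_H_MAKE()/mask definitions from the uapi header from memory, with a stubbed lookup, so treat it as an approximation rather than the function as committed.

#include <stdio.h>

#define TC_H_MAJ_MASK 0xFFFF0000U
#define TC_H_MIN_MASK 0x0000FFFFU
#define TC_H_MAKE(maj, min) (((maj) & TC_H_MAJ_MASK) | ((min) & TC_H_MIN_MASK))

/* stand-in for qdisc_lookup(): pretend the first couple of majors are taken */
static int handle_in_use(unsigned int handle)
{
	return handle < TC_H_MAKE(0x80030000U, 0);
}

int main(void)
{
	unsigned int autohandle = TC_H_MAKE(0x80000000U, 0);
	int i = 0x10000;		/* retry budget, as in the hunk */

	do {
		autohandle += 0x10000;	/* propose the next major number */
	} while (handle_in_use(autohandle) && --i > 0);

	printf("allocated handle: 0x%08x\n", i > 0 ? autohandle : 0U);
	return 0;
}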
@@ -834,7 +840,7 @@ qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
834 err = PTR_ERR(stab); 840 err = PTR_ERR(stab);
835 goto err_out4; 841 goto err_out4;
836 } 842 }
837 sch->stab = stab; 843 rcu_assign_pointer(sch->stab, stab);
838 } 844 }
839 if (tca[TCA_RATE]) { 845 if (tca[TCA_RATE]) {
840 spinlock_t *root_lock; 846 spinlock_t *root_lock;
@@ -874,7 +880,7 @@ err_out4:
874 * Any broken qdiscs that would require a ops->reset() here? 880 * Any broken qdiscs that would require a ops->reset() here?
875 * The qdisc was never in action so it shouldn't be necessary. 881 * The qdisc was never in action so it shouldn't be necessary.
876 */ 882 */
877 qdisc_put_stab(sch->stab); 883 qdisc_put_stab(rtnl_dereference(sch->stab));
878 if (ops->destroy) 884 if (ops->destroy)
879 ops->destroy(sch); 885 ops->destroy(sch);
880 goto err_out3; 886 goto err_out3;
@@ -882,7 +888,7 @@ err_out4:
882 888
883static int qdisc_change(struct Qdisc *sch, struct nlattr **tca) 889static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
884{ 890{
885 struct qdisc_size_table *stab = NULL; 891 struct qdisc_size_table *ostab, *stab = NULL;
886 int err = 0; 892 int err = 0;
887 893
888 if (tca[TCA_OPTIONS]) { 894 if (tca[TCA_OPTIONS]) {
@@ -899,8 +905,9 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
899 return PTR_ERR(stab); 905 return PTR_ERR(stab);
900 } 906 }
901 907
902 qdisc_put_stab(sch->stab); 908 ostab = rtnl_dereference(sch->stab);
903 sch->stab = stab; 909 rcu_assign_pointer(sch->stab, stab);
910 qdisc_put_stab(ostab);
904 911
905 if (tca[TCA_RATE]) { 912 if (tca[TCA_RATE]) {
906 /* NB: ignores errors from replace_estimator 913 /* NB: ignores errors from replace_estimator
@@ -915,9 +922,8 @@ out:
915 return 0; 922 return 0;
916} 923}
917 924
918struct check_loop_arg 925struct check_loop_arg {
919{ 926 struct qdisc_walker w;
920 struct qdisc_walker w;
921 struct Qdisc *p; 927 struct Qdisc *p;
922 int depth; 928 int depth;
923}; 929};
@@ -970,7 +976,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
970 struct Qdisc *p = NULL; 976 struct Qdisc *p = NULL;
971 int err; 977 int err;
972 978
973 if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) 979 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
980 if (!dev)
974 return -ENODEV; 981 return -ENODEV;
975 982
976 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); 983 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
@@ -980,12 +987,12 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
980 if (clid) { 987 if (clid) {
981 if (clid != TC_H_ROOT) { 988 if (clid != TC_H_ROOT) {
982 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { 989 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
983 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) 990 p = qdisc_lookup(dev, TC_H_MAJ(clid));
991 if (!p)
984 return -ENOENT; 992 return -ENOENT;
985 q = qdisc_leaf(p, clid); 993 q = qdisc_leaf(p, clid);
986 } else { /* ingress */ 994 } else if (dev_ingress_queue(dev)) {
987 if (dev_ingress_queue(dev)) 995 q = dev_ingress_queue(dev)->qdisc_sleeping;
988 q = dev_ingress_queue(dev)->qdisc_sleeping;
989 } 996 }
990 } else { 997 } else {
991 q = dev->qdisc; 998 q = dev->qdisc;
@@ -996,7 +1003,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
996 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) 1003 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
997 return -EINVAL; 1004 return -EINVAL;
998 } else { 1005 } else {
999 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) 1006 q = qdisc_lookup(dev, tcm->tcm_handle);
1007 if (!q)
1000 return -ENOENT; 1008 return -ENOENT;
1001 } 1009 }
1002 1010
@@ -1008,7 +1016,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1008 return -EINVAL; 1016 return -EINVAL;
1009 if (q->handle == 0) 1017 if (q->handle == 0)
1010 return -ENOENT; 1018 return -ENOENT;
1011 if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0) 1019 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1020 if (err != 0)
1012 return err; 1021 return err;
1013 } else { 1022 } else {
1014 qdisc_notify(net, skb, n, clid, NULL, q); 1023 qdisc_notify(net, skb, n, clid, NULL, q);
@@ -1017,7 +1026,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1017} 1026}
1018 1027
1019/* 1028/*
1020 Create/change qdisc. 1029 * Create/change qdisc.
1021 */ 1030 */
1022 1031
1023static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) 1032static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
@@ -1036,7 +1045,8 @@ replay:
1036 clid = tcm->tcm_parent; 1045 clid = tcm->tcm_parent;
1037 q = p = NULL; 1046 q = p = NULL;
1038 1047
1039 if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) 1048 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1049 if (!dev)
1040 return -ENODEV; 1050 return -ENODEV;
1041 1051
1042 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); 1052 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
@@ -1046,12 +1056,12 @@ replay:
1046 if (clid) { 1056 if (clid) {
1047 if (clid != TC_H_ROOT) { 1057 if (clid != TC_H_ROOT) {
1048 if (clid != TC_H_INGRESS) { 1058 if (clid != TC_H_INGRESS) {
1049 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) 1059 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1060 if (!p)
1050 return -ENOENT; 1061 return -ENOENT;
1051 q = qdisc_leaf(p, clid); 1062 q = qdisc_leaf(p, clid);
1052 } else { /* ingress */ 1063 } else if (dev_ingress_queue_create(dev)) {
1053 if (dev_ingress_queue_create(dev)) 1064 q = dev_ingress_queue(dev)->qdisc_sleeping;
1054 q = dev_ingress_queue(dev)->qdisc_sleeping;
1055 } 1065 }
1056 } else { 1066 } else {
1057 q = dev->qdisc; 1067 q = dev->qdisc;
@@ -1063,13 +1073,14 @@ replay:
1063 1073
1064 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { 1074 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1065 if (tcm->tcm_handle) { 1075 if (tcm->tcm_handle) {
1066 if (q && !(n->nlmsg_flags&NLM_F_REPLACE)) 1076 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1067 return -EEXIST; 1077 return -EEXIST;
1068 if (TC_H_MIN(tcm->tcm_handle)) 1078 if (TC_H_MIN(tcm->tcm_handle))
1069 return -EINVAL; 1079 return -EINVAL;
1070 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) 1080 q = qdisc_lookup(dev, tcm->tcm_handle);
1081 if (!q)
1071 goto create_n_graft; 1082 goto create_n_graft;
1072 if (n->nlmsg_flags&NLM_F_EXCL) 1083 if (n->nlmsg_flags & NLM_F_EXCL)
1073 return -EEXIST; 1084 return -EEXIST;
1074 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) 1085 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1075 return -EINVAL; 1086 return -EINVAL;
@@ -1079,7 +1090,7 @@ replay:
1079 atomic_inc(&q->refcnt); 1090 atomic_inc(&q->refcnt);
1080 goto graft; 1091 goto graft;
1081 } else { 1092 } else {
1082 if (q == NULL) 1093 if (!q)
1083 goto create_n_graft; 1094 goto create_n_graft;
1084 1095
1085 /* This magic test requires explanation. 1096 /* This magic test requires explanation.
@@ -1101,9 +1112,9 @@ replay:
1101 * For now we select create/graft, if 1112 * For now we select create/graft, if
1102 * user gave KIND, which does not match existing. 1113 * user gave KIND, which does not match existing.
1103 */ 1114 */
1104 if ((n->nlmsg_flags&NLM_F_CREATE) && 1115 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1105 (n->nlmsg_flags&NLM_F_REPLACE) && 1116 (n->nlmsg_flags & NLM_F_REPLACE) &&
1106 ((n->nlmsg_flags&NLM_F_EXCL) || 1117 ((n->nlmsg_flags & NLM_F_EXCL) ||
1107 (tca[TCA_KIND] && 1118 (tca[TCA_KIND] &&
1108 nla_strcmp(tca[TCA_KIND], q->ops->id)))) 1119 nla_strcmp(tca[TCA_KIND], q->ops->id))))
1109 goto create_n_graft; 1120 goto create_n_graft;
@@ -1118,7 +1129,7 @@ replay:
1118 /* Change qdisc parameters */ 1129 /* Change qdisc parameters */
1119 if (q == NULL) 1130 if (q == NULL)
1120 return -ENOENT; 1131 return -ENOENT;
1121 if (n->nlmsg_flags&NLM_F_EXCL) 1132 if (n->nlmsg_flags & NLM_F_EXCL)
1122 return -EEXIST; 1133 return -EEXIST;
1123 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) 1134 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1124 return -EINVAL; 1135 return -EINVAL;
@@ -1128,7 +1139,7 @@ replay:
1128 return err; 1139 return err;
1129 1140
1130create_n_graft: 1141create_n_graft:
1131 if (!(n->nlmsg_flags&NLM_F_CREATE)) 1142 if (!(n->nlmsg_flags & NLM_F_CREATE))
1132 return -ENOENT; 1143 return -ENOENT;
1133 if (clid == TC_H_INGRESS) { 1144 if (clid == TC_H_INGRESS) {
1134 if (dev_ingress_queue(dev)) 1145 if (dev_ingress_queue(dev))
@@ -1175,6 +1186,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1175 struct nlmsghdr *nlh; 1186 struct nlmsghdr *nlh;
1176 unsigned char *b = skb_tail_pointer(skb); 1187 unsigned char *b = skb_tail_pointer(skb);
1177 struct gnet_dump d; 1188 struct gnet_dump d;
1189 struct qdisc_size_table *stab;
1178 1190
1179 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags); 1191 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1180 tcm = NLMSG_DATA(nlh); 1192 tcm = NLMSG_DATA(nlh);
@@ -1190,7 +1202,8 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1190 goto nla_put_failure; 1202 goto nla_put_failure;
1191 q->qstats.qlen = q->q.qlen; 1203 q->qstats.qlen = q->q.qlen;
1192 1204
1193 if (q->stab && qdisc_dump_stab(skb, q->stab) < 0) 1205 stab = rtnl_dereference(q->stab);
1206 if (stab && qdisc_dump_stab(skb, stab) < 0)
1194 goto nla_put_failure; 1207 goto nla_put_failure;
1195 1208
1196 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, 1209 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
@@ -1234,16 +1247,19 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1234 return -ENOBUFS; 1247 return -ENOBUFS;
1235 1248
1236 if (old && !tc_qdisc_dump_ignore(old)) { 1249 if (old && !tc_qdisc_dump_ignore(old)) {
1237 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) 1250 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq,
1251 0, RTM_DELQDISC) < 0)
1238 goto err_out; 1252 goto err_out;
1239 } 1253 }
1240 if (new && !tc_qdisc_dump_ignore(new)) { 1254 if (new && !tc_qdisc_dump_ignore(new)) {
1241 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) 1255 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
1256 old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1242 goto err_out; 1257 goto err_out;
1243 } 1258 }
1244 1259
1245 if (skb->len) 1260 if (skb->len)
1246 return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); 1261 return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1262 n->nlmsg_flags & NLM_F_ECHO);
1247 1263
1248err_out: 1264err_out:
1249 kfree_skb(skb); 1265 kfree_skb(skb);
@@ -1275,7 +1291,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1275 q_idx++; 1291 q_idx++;
1276 continue; 1292 continue;
1277 } 1293 }
1278 if (!tc_qdisc_dump_ignore(q) && 1294 if (!tc_qdisc_dump_ignore(q) &&
1279 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, 1295 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1280 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) 1296 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1281 goto done; 1297 goto done;
@@ -1356,7 +1372,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1356 u32 qid = TC_H_MAJ(clid); 1372 u32 qid = TC_H_MAJ(clid);
1357 int err; 1373 int err;
1358 1374
1359 if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) 1375 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1376 if (!dev)
1360 return -ENODEV; 1377 return -ENODEV;
1361 1378
1362 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); 1379 err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
@@ -1391,9 +1408,9 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1391 qid = dev->qdisc->handle; 1408 qid = dev->qdisc->handle;
1392 1409
1393 /* Now qid is genuine qdisc handle consistent 1410 /* Now qid is genuine qdisc handle consistent
1394 both with parent and child. 1411 * both with parent and child.
1395 1412 *
1396 TC_H_MAJ(pid) still may be unspecified, complete it now. 1413 * TC_H_MAJ(pid) still may be unspecified, complete it now.
1397 */ 1414 */
1398 if (pid) 1415 if (pid)
1399 pid = TC_H_MAKE(qid, pid); 1416 pid = TC_H_MAKE(qid, pid);
@@ -1403,7 +1420,8 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1403 } 1420 }
1404 1421
1405 /* OK. Locate qdisc */ 1422 /* OK. Locate qdisc */
1406 if ((q = qdisc_lookup(dev, qid)) == NULL) 1423 q = qdisc_lookup(dev, qid);
1424 if (!q)
1407 return -ENOENT; 1425 return -ENOENT;
1408 1426
1409 /* An check that it supports classes */ 1427 /* An check that it supports classes */
@@ -1423,13 +1441,14 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1423 1441
1424 if (cl == 0) { 1442 if (cl == 0) {
1425 err = -ENOENT; 1443 err = -ENOENT;
1426 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) 1444 if (n->nlmsg_type != RTM_NEWTCLASS ||
1445 !(n->nlmsg_flags & NLM_F_CREATE))
1427 goto out; 1446 goto out;
1428 } else { 1447 } else {
1429 switch (n->nlmsg_type) { 1448 switch (n->nlmsg_type) {
1430 case RTM_NEWTCLASS: 1449 case RTM_NEWTCLASS:
1431 err = -EEXIST; 1450 err = -EEXIST;
1432 if (n->nlmsg_flags&NLM_F_EXCL) 1451 if (n->nlmsg_flags & NLM_F_EXCL)
1433 goto out; 1452 goto out;
1434 break; 1453 break;
1435 case RTM_DELTCLASS: 1454 case RTM_DELTCLASS:
@@ -1521,14 +1540,14 @@ static int tclass_notify(struct net *net, struct sk_buff *oskb,
1521 return -EINVAL; 1540 return -EINVAL;
1522 } 1541 }
1523 1542
1524 return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO); 1543 return rtnetlink_send(skb, net, pid, RTNLGRP_TC,
1544 n->nlmsg_flags & NLM_F_ECHO);
1525} 1545}
1526 1546
1527struct qdisc_dump_args 1547struct qdisc_dump_args {
1528{ 1548 struct qdisc_walker w;
1529 struct qdisc_walker w; 1549 struct sk_buff *skb;
1530 struct sk_buff *skb; 1550 struct netlink_callback *cb;
1531 struct netlink_callback *cb;
1532}; 1551};
1533 1552
1534static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) 1553static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
@@ -1590,7 +1609,7 @@ static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1590 1609
1591static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) 1610static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1592{ 1611{
1593 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); 1612 struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
1594 struct net *net = sock_net(skb->sk); 1613 struct net *net = sock_net(skb->sk);
1595 struct netdev_queue *dev_queue; 1614 struct netdev_queue *dev_queue;
1596 struct net_device *dev; 1615 struct net_device *dev;
@@ -1598,7 +1617,8 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1598 1617
1599 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) 1618 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1600 return 0; 1619 return 0;
1601 if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL) 1620 dev = dev_get_by_index(net, tcm->tcm_ifindex);
1621 if (!dev)
1602 return 0; 1622 return 0;
1603 1623
1604 s_t = cb->args[0]; 1624 s_t = cb->args[0];
@@ -1621,19 +1641,22 @@ done:
1621} 1641}
1622 1642
1623/* Main classifier routine: scans classifier chain attached 1643/* Main classifier routine: scans classifier chain attached
1624 to this qdisc, (optionally) tests for protocol and asks 1644 * to this qdisc, (optionally) tests for protocol and asks
1625 specific classifiers. 1645 * specific classifiers.
1626 */ 1646 */
1627int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp, 1647int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1628 struct tcf_result *res) 1648 struct tcf_result *res)
1629{ 1649{
1630 __be16 protocol = skb->protocol; 1650 __be16 protocol = skb->protocol;
1631 int err = 0; 1651 int err;
1632 1652
1633 for (; tp; tp = tp->next) { 1653 for (; tp; tp = tp->next) {
1634 if ((tp->protocol == protocol || 1654 if (tp->protocol != protocol &&
1635 tp->protocol == htons(ETH_P_ALL)) && 1655 tp->protocol != htons(ETH_P_ALL))
1636 (err = tp->classify(skb, tp, res)) >= 0) { 1656 continue;
1657 err = tp->classify(skb, tp, res);
1658
1659 if (err >= 0) {
1637#ifdef CONFIG_NET_CLS_ACT 1660#ifdef CONFIG_NET_CLS_ACT
1638 if (err != TC_ACT_RECLASSIFY && skb->tc_verd) 1661 if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1639 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0); 1662 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
@@ -1664,11 +1687,11 @@ reclassify:
1664 1687
1665 if (verd++ >= MAX_REC_LOOP) { 1688 if (verd++ >= MAX_REC_LOOP) {
1666 if (net_ratelimit()) 1689 if (net_ratelimit())
1667 printk(KERN_NOTICE 1690 pr_notice("%s: packet reclassify loop"
1668 "%s: packet reclassify loop"
1669 " rule prio %u protocol %02x\n", 1691 " rule prio %u protocol %02x\n",
1670 tp->q->ops->id, 1692 tp->q->ops->id,
1671 tp->prio & 0xffff, ntohs(tp->protocol)); 1693 tp->prio & 0xffff,
1694 ntohs(tp->protocol));
1672 return TC_ACT_SHOT; 1695 return TC_ACT_SHOT;
1673 } 1696 }
1674 skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd); 1697 skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
@@ -1761,7 +1784,7 @@ static int __init pktsched_init(void)
1761 1784
1762 err = register_pernet_subsys(&psched_net_ops); 1785 err = register_pernet_subsys(&psched_net_ops);
1763 if (err) { 1786 if (err) {
1764 printk(KERN_ERR "pktsched_init: " 1787 pr_err("pktsched_init: "
1765 "cannot initialize per netns operations\n"); 1788 "cannot initialize per netns operations\n");
1766 return err; 1789 return err;
1767 } 1790 }
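The sch_api.c hunks above repeatedly apply one cleanup: an assignment buried inside an if () condition is hoisted onto its own line and the test becomes a plain NULL check. A minimal standalone sketch of that before/after shape, using a toy lookup() rather than the kernel's qdisc_lookup():

#include <stddef.h>
#include <stdio.h>

struct item { int id; };

/* toy stand-in for a lookup that returns NULL on a miss */
static struct item *lookup(int id)
{
	static struct item only = { .id = 42 };
	return id == only.id ? &only : NULL;
}

/* old shape: assignment and test fused into one expression */
static int find_old(int id)
{
	struct item *it;

	if ((it = lookup(id)) == NULL)
		return -1;
	return it->id;
}

/* new shape: assignment first, then a simple !ptr test */
static int find_new(int id)
{
	struct item *it;

	it = lookup(id);
	if (!it)
		return -1;
	return it->id;
}

int main(void)
{
	printf("%d %d\n", find_old(42), find_new(7));	/* prints: 42 -1 */
	return 0;
}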
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 943d733409d0..3f08158b8688 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -319,7 +319,7 @@ static int atm_tc_delete(struct Qdisc *sch, unsigned long arg)
319 * creation), and one for the reference held when calling delete. 319 * creation), and one for the reference held when calling delete.
320 */ 320 */
321 if (flow->ref < 2) { 321 if (flow->ref < 2) {
322 printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n", flow->ref); 322 pr_err("atm_tc_delete: flow->ref == %d\n", flow->ref);
323 return -EINVAL; 323 return -EINVAL;
324 } 324 }
325 if (flow->ref > 2) 325 if (flow->ref > 2)
@@ -384,12 +384,12 @@ static int atm_tc_enqueue(struct sk_buff *skb, struct Qdisc *sch)
384 } 384 }
385 } 385 }
386 flow = NULL; 386 flow = NULL;
387 done: 387done:
388 ; 388 ;
389 } 389 }
390 if (!flow) 390 if (!flow) {
391 flow = &p->link; 391 flow = &p->link;
392 else { 392 } else {
393 if (flow->vcc) 393 if (flow->vcc)
394 ATM_SKB(skb)->atm_options = flow->vcc->atm_options; 394 ATM_SKB(skb)->atm_options = flow->vcc->atm_options;
395 /*@@@ looks good ... but it's not supposed to work :-) */ 395 /*@@@ looks good ... but it's not supposed to work :-) */
@@ -576,8 +576,7 @@ static void atm_tc_destroy(struct Qdisc *sch)
576 576
577 list_for_each_entry_safe(flow, tmp, &p->flows, list) { 577 list_for_each_entry_safe(flow, tmp, &p->flows, list) {
578 if (flow->ref > 1) 578 if (flow->ref > 1)
579 printk(KERN_ERR "atm_destroy: %p->ref = %d\n", flow, 579 pr_err("atm_destroy: %p->ref = %d\n", flow, flow->ref);
580 flow->ref);
581 atm_tc_put(sch, (unsigned long)flow); 580 atm_tc_put(sch, (unsigned long)flow);
582 } 581 }
583 tasklet_kill(&p->task); 582 tasklet_kill(&p->task);
@@ -616,9 +615,8 @@ static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl,
616 } 615 }
617 if (flow->excess) 616 if (flow->excess)
618 NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid); 617 NLA_PUT_U32(skb, TCA_ATM_EXCESS, flow->classid);
619 else { 618 else
620 NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0); 619 NLA_PUT_U32(skb, TCA_ATM_EXCESS, 0);
621 }
622 620
623 nla_nest_end(skb, nest); 621 nla_nest_end(skb, nest);
624 return skb->len; 622 return skb->len;
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 5f63ec58942c..24d94c097b35 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -72,8 +72,7 @@
72struct cbq_sched_data; 72struct cbq_sched_data;
73 73
74 74
75struct cbq_class 75struct cbq_class {
76{
77 struct Qdisc_class_common common; 76 struct Qdisc_class_common common;
78 struct cbq_class *next_alive; /* next class with backlog in this priority band */ 77 struct cbq_class *next_alive; /* next class with backlog in this priority band */
79 78
@@ -139,19 +138,18 @@ struct cbq_class
139 int refcnt; 138 int refcnt;
140 int filters; 139 int filters;
141 140
142 struct cbq_class *defaults[TC_PRIO_MAX+1]; 141 struct cbq_class *defaults[TC_PRIO_MAX + 1];
143}; 142};
144 143
145struct cbq_sched_data 144struct cbq_sched_data {
146{
147 struct Qdisc_class_hash clhash; /* Hash table of all classes */ 145 struct Qdisc_class_hash clhash; /* Hash table of all classes */
148 int nclasses[TC_CBQ_MAXPRIO+1]; 146 int nclasses[TC_CBQ_MAXPRIO + 1];
149 unsigned quanta[TC_CBQ_MAXPRIO+1]; 147 unsigned int quanta[TC_CBQ_MAXPRIO + 1];
150 148
151 struct cbq_class link; 149 struct cbq_class link;
152 150
153 unsigned activemask; 151 unsigned int activemask;
154 struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes 152 struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes
155 with backlog */ 153 with backlog */
156 154
157#ifdef CONFIG_NET_CLS_ACT 155#ifdef CONFIG_NET_CLS_ACT
@@ -162,7 +160,7 @@ struct cbq_sched_data
162 int tx_len; 160 int tx_len;
163 psched_time_t now; /* Cached timestamp */ 161 psched_time_t now; /* Cached timestamp */
164 psched_time_t now_rt; /* Cached real time */ 162 psched_time_t now_rt; /* Cached real time */
165 unsigned pmask; 163 unsigned int pmask;
166 164
167 struct hrtimer delay_timer; 165 struct hrtimer delay_timer;
168 struct qdisc_watchdog watchdog; /* Watchdog timer, 166 struct qdisc_watchdog watchdog; /* Watchdog timer,
@@ -175,9 +173,9 @@ struct cbq_sched_data
175}; 173};
176 174
177 175
178#define L2T(cl,len) qdisc_l2t((cl)->R_tab,len) 176#define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
179 177
180static __inline__ struct cbq_class * 178static inline struct cbq_class *
181cbq_class_lookup(struct cbq_sched_data *q, u32 classid) 179cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
182{ 180{
183 struct Qdisc_class_common *clc; 181 struct Qdisc_class_common *clc;
@@ -193,25 +191,27 @@ cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
193static struct cbq_class * 191static struct cbq_class *
194cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) 192cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
195{ 193{
196 struct cbq_class *cl, *new; 194 struct cbq_class *cl;
197 195
198 for (cl = this->tparent; cl; cl = cl->tparent) 196 for (cl = this->tparent; cl; cl = cl->tparent) {
199 if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) 197 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
200 return new;
201 198
199 if (new != NULL && new != this)
200 return new;
201 }
202 return NULL; 202 return NULL;
203} 203}
204 204
205#endif 205#endif
206 206
207/* Classify packet. The procedure is pretty complicated, but 207/* Classify packet. The procedure is pretty complicated, but
208 it allows us to combine link sharing and priority scheduling 208 * it allows us to combine link sharing and priority scheduling
209 transparently. 209 * transparently.
210 210 *
211 Namely, you can put link sharing rules (f.e. route based) at root of CBQ, 211 * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
212 so that it resolves to split nodes. Then packets are classified 212 * so that it resolves to split nodes. Then packets are classified
213 by logical priority, or a more specific classifier may be attached 213 * by logical priority, or a more specific classifier may be attached
214 to the split node. 214 * to the split node.
215 */ 215 */
216 216
217static struct cbq_class * 217static struct cbq_class *
@@ -227,7 +227,7 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
227 /* 227 /*
228 * Step 1. If skb->priority points to one of our classes, use it. 228 * Step 1. If skb->priority points to one of our classes, use it.
229 */ 229 */
230 if (TC_H_MAJ(prio^sch->handle) == 0 && 230 if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
231 (cl = cbq_class_lookup(q, prio)) != NULL) 231 (cl = cbq_class_lookup(q, prio)) != NULL)
232 return cl; 232 return cl;
233 233
@@ -243,10 +243,11 @@ cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
243 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0) 243 (result = tc_classify_compat(skb, head->filter_list, &res)) < 0)
244 goto fallback; 244 goto fallback;
245 245
246 if ((cl = (void*)res.class) == NULL) { 246 cl = (void *)res.class;
247 if (!cl) {
247 if (TC_H_MAJ(res.classid)) 248 if (TC_H_MAJ(res.classid))
248 cl = cbq_class_lookup(q, res.classid); 249 cl = cbq_class_lookup(q, res.classid);
249 else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) 250 else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
250 cl = defmap[TC_PRIO_BESTEFFORT]; 251 cl = defmap[TC_PRIO_BESTEFFORT];
251 252
252 if (cl == NULL || cl->level >= head->level) 253 if (cl == NULL || cl->level >= head->level)
@@ -282,7 +283,7 @@ fallback:
282 * Step 4. No success... 283 * Step 4. No success...
283 */ 284 */
284 if (TC_H_MAJ(prio) == 0 && 285 if (TC_H_MAJ(prio) == 0 &&
285 !(cl = head->defaults[prio&TC_PRIO_MAX]) && 286 !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
286 !(cl = head->defaults[TC_PRIO_BESTEFFORT])) 287 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
287 return head; 288 return head;
288 289
@@ -290,12 +291,12 @@ fallback:
290} 291}
291 292
292/* 293/*
293 A packet has just been enqueued on the empty class. 294 * A packet has just been enqueued on the empty class.
294 cbq_activate_class adds it to the tail of active class list 295 * cbq_activate_class adds it to the tail of active class list
295 of its priority band. 296 * of its priority band.
296 */ 297 */
297 298
298static __inline__ void cbq_activate_class(struct cbq_class *cl) 299static inline void cbq_activate_class(struct cbq_class *cl)
299{ 300{
300 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 301 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
301 int prio = cl->cpriority; 302 int prio = cl->cpriority;
@@ -314,9 +315,9 @@ static __inline__ void cbq_activate_class(struct cbq_class *cl)
314} 315}
315 316
316/* 317/*
317 Unlink class from active chain. 318 * Unlink class from active chain.
318 Note that this same procedure is done directly in cbq_dequeue* 319 * Note that this same procedure is done directly in cbq_dequeue*
319 during round-robin procedure. 320 * during round-robin procedure.
320 */ 321 */
321 322
322static void cbq_deactivate_class(struct cbq_class *this) 323static void cbq_deactivate_class(struct cbq_class *this)
@@ -350,7 +351,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
350{ 351{
351 int toplevel = q->toplevel; 352 int toplevel = q->toplevel;
352 353
353 if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { 354 if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
354 psched_time_t now; 355 psched_time_t now;
355 psched_tdiff_t incr; 356 psched_tdiff_t incr;
356 357
@@ -363,7 +364,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
363 q->toplevel = cl->level; 364 q->toplevel = cl->level;
364 return; 365 return;
365 } 366 }
366 } while ((cl=cl->borrow) != NULL && toplevel > cl->level); 367 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
367 } 368 }
368} 369}
369 370
@@ -417,11 +418,11 @@ static void cbq_ovl_classic(struct cbq_class *cl)
417 delay += cl->offtime; 418 delay += cl->offtime;
418 419
419 /* 420 /*
420 Class goes to sleep, so that it will have no 421 * Class goes to sleep, so that it will have no
421 chance to work avgidle. Let's forgive it 8) 422 * chance to work avgidle. Let's forgive it 8)
422 423 *
423 BTW cbq-2.0 has a crap in this 424 * BTW cbq-2.0 has a crap in this
424 place, apparently they forgot to shift it by cl->ewma_log. 425 * place, apparently they forgot to shift it by cl->ewma_log.
425 */ 426 */
426 if (cl->avgidle < 0) 427 if (cl->avgidle < 0)
427 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); 428 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
@@ -438,8 +439,8 @@ static void cbq_ovl_classic(struct cbq_class *cl)
438 q->wd_expires = delay; 439 q->wd_expires = delay;
439 440
440 /* Dirty work! We must schedule wakeups based on 441 /* Dirty work! We must schedule wakeups based on
441 real available rate, rather than leaf rate, 442 * real available rate, rather than leaf rate,
442 which may be tiny (even zero). 443 * which may be tiny (even zero).
443 */ 444 */
444 if (q->toplevel == TC_CBQ_MAXLEVEL) { 445 if (q->toplevel == TC_CBQ_MAXLEVEL) {
445 struct cbq_class *b; 446 struct cbq_class *b;
@@ -459,7 +460,7 @@ static void cbq_ovl_classic(struct cbq_class *cl)
459} 460}
460 461
461/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when 462/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when
462 they go overlimit 463 * they go overlimit
463 */ 464 */
464 465
465static void cbq_ovl_rclassic(struct cbq_class *cl) 466static void cbq_ovl_rclassic(struct cbq_class *cl)
@@ -594,7 +595,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
594 struct Qdisc *sch = q->watchdog.qdisc; 595 struct Qdisc *sch = q->watchdog.qdisc;
595 psched_time_t now; 596 psched_time_t now;
596 psched_tdiff_t delay = 0; 597 psched_tdiff_t delay = 0;
597 unsigned pmask; 598 unsigned int pmask;
598 599
599 now = psched_get_time(); 600 now = psched_get_time();
600 601
@@ -623,7 +624,7 @@ static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
623 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS); 624 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
624 } 625 }
625 626
626 sch->flags &= ~TCQ_F_THROTTLED; 627 qdisc_unthrottled(sch);
627 __netif_schedule(qdisc_root(sch)); 628 __netif_schedule(qdisc_root(sch));
628 return HRTIMER_NORESTART; 629 return HRTIMER_NORESTART;
629} 630}
@@ -663,15 +664,15 @@ static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
663#endif 664#endif
664 665
665/* 666/*
666 It is mission critical procedure. 667 * It is mission critical procedure.
667 668 *
668 We "regenerate" toplevel cutoff, if transmitting class 669 * We "regenerate" toplevel cutoff, if transmitting class
669 has backlog and it is not regulated. It is not part of 670 * has backlog and it is not regulated. It is not part of
670 original CBQ description, but looks more reasonable. 671 * original CBQ description, but looks more reasonable.
671 Probably, it is wrong. This question needs further investigation. 672 * Probably, it is wrong. This question needs further investigation.
672*/ 673 */
673 674
674static __inline__ void 675static inline void
675cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, 676cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
676 struct cbq_class *borrowed) 677 struct cbq_class *borrowed)
677{ 678{
@@ -682,7 +683,7 @@ cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
682 q->toplevel = borrowed->level; 683 q->toplevel = borrowed->level;
683 return; 684 return;
684 } 685 }
685 } while ((borrowed=borrowed->borrow) != NULL); 686 } while ((borrowed = borrowed->borrow) != NULL);
686 } 687 }
687#if 0 688#if 0
688 /* It is not necessary now. Uncommenting it 689 /* It is not necessary now. Uncommenting it
@@ -710,10 +711,10 @@ cbq_update(struct cbq_sched_data *q)
710 cl->bstats.bytes += len; 711 cl->bstats.bytes += len;
711 712
712 /* 713 /*
713 (now - last) is total time between packet right edges. 714 * (now - last) is total time between packet right edges.
714 (last_pktlen/rate) is "virtual" busy time, so that 715 * (last_pktlen/rate) is "virtual" busy time, so that
715 716 *
716 idle = (now - last) - last_pktlen/rate 717 * idle = (now - last) - last_pktlen/rate
717 */ 718 */
718 719
719 idle = q->now - cl->last; 720 idle = q->now - cl->last;
@@ -723,9 +724,9 @@ cbq_update(struct cbq_sched_data *q)
723 idle -= L2T(cl, len); 724 idle -= L2T(cl, len);
724 725
725 /* true_avgidle := (1-W)*true_avgidle + W*idle, 726 /* true_avgidle := (1-W)*true_avgidle + W*idle,
726 where W=2^{-ewma_log}. But cl->avgidle is scaled: 727 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
727 cl->avgidle == true_avgidle/W, 728 * cl->avgidle == true_avgidle/W,
728 hence: 729 * hence:
729 */ 730 */
730 avgidle += idle - (avgidle>>cl->ewma_log); 731 avgidle += idle - (avgidle>>cl->ewma_log);
731 } 732 }
@@ -739,22 +740,22 @@ cbq_update(struct cbq_sched_data *q)
739 cl->avgidle = avgidle; 740 cl->avgidle = avgidle;
740 741
741 /* Calculate expected time, when this class 742 /* Calculate expected time, when this class
742 will be allowed to send. 743 * will be allowed to send.
743 It will occur, when: 744 * It will occur, when:
744 (1-W)*true_avgidle + W*delay = 0, i.e. 745 * (1-W)*true_avgidle + W*delay = 0, i.e.
745 idle = (1/W - 1)*(-true_avgidle) 746 * idle = (1/W - 1)*(-true_avgidle)
746 or 747 * or
747 idle = (1 - W)*(-cl->avgidle); 748 * idle = (1 - W)*(-cl->avgidle);
748 */ 749 */
749 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); 750 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
750 751
751 /* 752 /*
752 That is not all. 753 * That is not all.
753 To maintain the rate allocated to the class, 754 * To maintain the rate allocated to the class,
754 we add to undertime virtual clock, 755 * we add to undertime virtual clock,
755 necessary to complete transmitted packet. 756 * necessary to complete transmitted packet.
756 (len/phys_bandwidth has been already passed 757 * (len/phys_bandwidth has been already passed
757 to the moment of cbq_update) 758 * to the moment of cbq_update)
758 */ 759 */
759 760
760 idle -= L2T(&q->link, len); 761 idle -= L2T(&q->link, len);
@@ -776,7 +777,7 @@ cbq_update(struct cbq_sched_data *q)
776 cbq_update_toplevel(q, this, q->tx_borrowed); 777 cbq_update_toplevel(q, this, q->tx_borrowed);
777} 778}
778 779
779static __inline__ struct cbq_class * 780static inline struct cbq_class *
780cbq_under_limit(struct cbq_class *cl) 781cbq_under_limit(struct cbq_class *cl)
781{ 782{
782 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 783 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
@@ -792,16 +793,17 @@ cbq_under_limit(struct cbq_class *cl)
792 793
793 do { 794 do {
794 /* It is very suspicious place. Now overlimit 795 /* It is very suspicious place. Now overlimit
795 action is generated for not bounded classes 796 * action is generated for not bounded classes
796 only if link is completely congested. 797 * only if link is completely congested.
797 Though it is in agree with ancestor-only paradigm, 798 * Though it is in agree with ancestor-only paradigm,
798 it looks very stupid. Particularly, 799 * it looks very stupid. Particularly,
799 it means that this chunk of code will either 800 * it means that this chunk of code will either
800 never be called or result in strong amplification 801 * never be called or result in strong amplification
801 of burstiness. Dangerous, silly, and, however, 802 * of burstiness. Dangerous, silly, and, however,
802 no another solution exists. 803 * no another solution exists.
803 */ 804 */
804 if ((cl = cl->borrow) == NULL) { 805 cl = cl->borrow;
806 if (!cl) {
805 this_cl->qstats.overlimits++; 807 this_cl->qstats.overlimits++;
806 this_cl->overlimit(this_cl); 808 this_cl->overlimit(this_cl);
807 return NULL; 809 return NULL;
@@ -814,7 +816,7 @@ cbq_under_limit(struct cbq_class *cl)
814 return cl; 816 return cl;
815} 817}
816 818
817static __inline__ struct sk_buff * 819static inline struct sk_buff *
818cbq_dequeue_prio(struct Qdisc *sch, int prio) 820cbq_dequeue_prio(struct Qdisc *sch, int prio)
819{ 821{
820 struct cbq_sched_data *q = qdisc_priv(sch); 822 struct cbq_sched_data *q = qdisc_priv(sch);
@@ -838,7 +840,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
838 840
839 if (cl->deficit <= 0) { 841 if (cl->deficit <= 0) {
840 /* Class exhausted its allotment per 842 /* Class exhausted its allotment per
841 this round. Switch to the next one. 843 * this round. Switch to the next one.
842 */ 844 */
843 deficit = 1; 845 deficit = 1;
844 cl->deficit += cl->quantum; 846 cl->deficit += cl->quantum;
@@ -848,8 +850,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
848 skb = cl->q->dequeue(cl->q); 850 skb = cl->q->dequeue(cl->q);
849 851
850 /* Class did not give us any skb :-( 852 /* Class did not give us any skb :-(
851 It could occur even if cl->q->q.qlen != 0 853 * It could occur even if cl->q->q.qlen != 0
852 f.e. if cl->q == "tbf" 854 * f.e. if cl->q == "tbf"
853 */ 855 */
854 if (skb == NULL) 856 if (skb == NULL)
855 goto skip_class; 857 goto skip_class;
@@ -878,7 +880,7 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
878skip_class: 880skip_class:
879 if (cl->q->q.qlen == 0 || prio != cl->cpriority) { 881 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
880 /* Class is empty or penalized. 882 /* Class is empty or penalized.
881 Unlink it from active chain. 883 * Unlink it from active chain.
882 */ 884 */
883 cl_prev->next_alive = cl->next_alive; 885 cl_prev->next_alive = cl->next_alive;
884 cl->next_alive = NULL; 886 cl->next_alive = NULL;
@@ -917,14 +919,14 @@ next_class:
917 return NULL; 919 return NULL;
918} 920}
919 921
920static __inline__ struct sk_buff * 922static inline struct sk_buff *
921cbq_dequeue_1(struct Qdisc *sch) 923cbq_dequeue_1(struct Qdisc *sch)
922{ 924{
923 struct cbq_sched_data *q = qdisc_priv(sch); 925 struct cbq_sched_data *q = qdisc_priv(sch);
924 struct sk_buff *skb; 926 struct sk_buff *skb;
925 unsigned activemask; 927 unsigned int activemask;
926 928
927 activemask = q->activemask&0xFF; 929 activemask = q->activemask & 0xFF;
928 while (activemask) { 930 while (activemask) {
929 int prio = ffz(~activemask); 931 int prio = ffz(~activemask);
930 activemask &= ~(1<<prio); 932 activemask &= ~(1<<prio);
@@ -949,11 +951,11 @@ cbq_dequeue(struct Qdisc *sch)
949 if (q->tx_class) { 951 if (q->tx_class) {
950 psched_tdiff_t incr2; 952 psched_tdiff_t incr2;
951 /* Time integrator. We calculate EOS time 953 /* Time integrator. We calculate EOS time
952 by adding expected packet transmission time. 954 * by adding expected packet transmission time.
953 If real time is greater, we warp artificial clock, 955 * If real time is greater, we warp artificial clock,
954 so that: 956 * so that:
955 957 *
956 cbq_time = max(real_time, work); 958 * cbq_time = max(real_time, work);
957 */ 959 */
958 incr2 = L2T(&q->link, q->tx_len); 960 incr2 = L2T(&q->link, q->tx_len);
959 q->now += incr2; 961 q->now += incr2;
@@ -971,27 +973,27 @@ cbq_dequeue(struct Qdisc *sch)
971 if (skb) { 973 if (skb) {
972 qdisc_bstats_update(sch, skb); 974 qdisc_bstats_update(sch, skb);
973 sch->q.qlen--; 975 sch->q.qlen--;
974 sch->flags &= ~TCQ_F_THROTTLED; 976 qdisc_unthrottled(sch);
975 return skb; 977 return skb;
976 } 978 }
977 979
978 /* All the classes are overlimit. 980 /* All the classes are overlimit.
979 981 *
980 It is possible, if: 982 * It is possible, if:
981 983 *
982 1. Scheduler is empty. 984 * 1. Scheduler is empty.
983 2. Toplevel cutoff inhibited borrowing. 985 * 2. Toplevel cutoff inhibited borrowing.
984 3. Root class is overlimit. 986 * 3. Root class is overlimit.
985 987 *
986 Reset 2d and 3d conditions and retry. 988 * Reset 2d and 3d conditions and retry.
987 989 *
988 Note, that NS and cbq-2.0 are buggy, peeking 990 * Note, that NS and cbq-2.0 are buggy, peeking
989 an arbitrary class is appropriate for ancestor-only 991 * an arbitrary class is appropriate for ancestor-only
990 sharing, but not for toplevel algorithm. 992 * sharing, but not for toplevel algorithm.
991 993 *
992 Our version is better, but slower, because it requires 994 * Our version is better, but slower, because it requires
993 two passes, but it is unavoidable with top-level sharing. 995 * two passes, but it is unavoidable with top-level sharing.
994 */ 996 */
995 997
996 if (q->toplevel == TC_CBQ_MAXLEVEL && 998 if (q->toplevel == TC_CBQ_MAXLEVEL &&
997 q->link.undertime == PSCHED_PASTPERFECT) 999 q->link.undertime == PSCHED_PASTPERFECT)
@@ -1002,7 +1004,8 @@ cbq_dequeue(struct Qdisc *sch)
1002 } 1004 }
1003 1005
1004 /* No packets in scheduler or nobody wants to give them to us :-( 1006 /* No packets in scheduler or nobody wants to give them to us :-(
1005 Sigh... start watchdog timer in the last case. */ 1007 * Sigh... start watchdog timer in the last case.
1008 */
1006 1009
1007 if (sch->q.qlen) { 1010 if (sch->q.qlen) {
1008 sch->qstats.overlimits++; 1011 sch->qstats.overlimits++;
@@ -1024,13 +1027,14 @@ static void cbq_adjust_levels(struct cbq_class *this)
1024 int level = 0; 1027 int level = 0;
1025 struct cbq_class *cl; 1028 struct cbq_class *cl;
1026 1029
1027 if ((cl = this->children) != NULL) { 1030 cl = this->children;
1031 if (cl) {
1028 do { 1032 do {
1029 if (cl->level > level) 1033 if (cl->level > level)
1030 level = cl->level; 1034 level = cl->level;
1031 } while ((cl = cl->sibling) != this->children); 1035 } while ((cl = cl->sibling) != this->children);
1032 } 1036 }
1033 this->level = level+1; 1037 this->level = level + 1;
1034 } while ((this = this->tparent) != NULL); 1038 } while ((this = this->tparent) != NULL);
1035} 1039}
1036 1040
@@ -1046,14 +1050,15 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1046 for (h = 0; h < q->clhash.hashsize; h++) { 1050 for (h = 0; h < q->clhash.hashsize; h++) {
1047 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) { 1051 hlist_for_each_entry(cl, n, &q->clhash.hash[h], common.hnode) {
1048 /* BUGGGG... Beware! This expression suffer of 1052 /* BUGGGG... Beware! This expression suffer of
1049 arithmetic overflows! 1053 * arithmetic overflows!
1050 */ 1054 */
1051 if (cl->priority == prio) { 1055 if (cl->priority == prio) {
1052 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ 1056 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1053 q->quanta[prio]; 1057 q->quanta[prio];
1054 } 1058 }
1055 if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) { 1059 if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
1056 printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->common.classid, cl->quantum); 1060 pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n",
1061 cl->common.classid, cl->quantum);
1057 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1; 1062 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
1058 } 1063 }
1059 } 1064 }
@@ -1064,18 +1069,18 @@ static void cbq_sync_defmap(struct cbq_class *cl)
1064{ 1069{
1065 struct cbq_sched_data *q = qdisc_priv(cl->qdisc); 1070 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1066 struct cbq_class *split = cl->split; 1071 struct cbq_class *split = cl->split;
1067 unsigned h; 1072 unsigned int h;
1068 int i; 1073 int i;
1069 1074
1070 if (split == NULL) 1075 if (split == NULL)
1071 return; 1076 return;
1072 1077
1073 for (i=0; i<=TC_PRIO_MAX; i++) { 1078 for (i = 0; i <= TC_PRIO_MAX; i++) {
1074 if (split->defaults[i] == cl && !(cl->defmap&(1<<i))) 1079 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
1075 split->defaults[i] = NULL; 1080 split->defaults[i] = NULL;
1076 } 1081 }
1077 1082
1078 for (i=0; i<=TC_PRIO_MAX; i++) { 1083 for (i = 0; i <= TC_PRIO_MAX; i++) {
1079 int level = split->level; 1084 int level = split->level;
1080 1085
1081 if (split->defaults[i]) 1086 if (split->defaults[i])
@@ -1088,7 +1093,7 @@ static void cbq_sync_defmap(struct cbq_class *cl)
1088 hlist_for_each_entry(c, n, &q->clhash.hash[h], 1093 hlist_for_each_entry(c, n, &q->clhash.hash[h],
1089 common.hnode) { 1094 common.hnode) {
1090 if (c->split == split && c->level < level && 1095 if (c->split == split && c->level < level &&
1091 c->defmap&(1<<i)) { 1096 c->defmap & (1<<i)) {
1092 split->defaults[i] = c; 1097 split->defaults[i] = c;
1093 level = c->level; 1098 level = c->level;
1094 } 1099 }
@@ -1102,7 +1107,8 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma
1102 struct cbq_class *split = NULL; 1107 struct cbq_class *split = NULL;
1103 1108
1104 if (splitid == 0) { 1109 if (splitid == 0) {
1105 if ((split = cl->split) == NULL) 1110 split = cl->split;
1111 if (!split)
1106 return; 1112 return;
1107 splitid = split->common.classid; 1113 splitid = split->common.classid;
1108 } 1114 }
@@ -1120,9 +1126,9 @@ static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 ma
1120 cl->defmap = 0; 1126 cl->defmap = 0;
1121 cbq_sync_defmap(cl); 1127 cbq_sync_defmap(cl);
1122 cl->split = split; 1128 cl->split = split;
1123 cl->defmap = def&mask; 1129 cl->defmap = def & mask;
1124 } else 1130 } else
1125 cl->defmap = (cl->defmap&~mask)|(def&mask); 1131 cl->defmap = (cl->defmap & ~mask) | (def & mask);
1126 1132
1127 cbq_sync_defmap(cl); 1133 cbq_sync_defmap(cl);
1128} 1134}
@@ -1135,7 +1141,7 @@ static void cbq_unlink_class(struct cbq_class *this)
1135 qdisc_class_hash_remove(&q->clhash, &this->common); 1141 qdisc_class_hash_remove(&q->clhash, &this->common);
1136 1142
1137 if (this->tparent) { 1143 if (this->tparent) {
1138 clp=&this->sibling; 1144 clp = &this->sibling;
1139 cl = *clp; 1145 cl = *clp;
1140 do { 1146 do {
1141 if (cl == this) { 1147 if (cl == this) {
@@ -1174,7 +1180,7 @@ static void cbq_link_class(struct cbq_class *this)
1174 } 1180 }
1175} 1181}
1176 1182
1177static unsigned int cbq_drop(struct Qdisc* sch) 1183static unsigned int cbq_drop(struct Qdisc *sch)
1178{ 1184{
1179 struct cbq_sched_data *q = qdisc_priv(sch); 1185 struct cbq_sched_data *q = qdisc_priv(sch);
1180 struct cbq_class *cl, *cl_head; 1186 struct cbq_class *cl, *cl_head;
@@ -1182,7 +1188,8 @@ static unsigned int cbq_drop(struct Qdisc* sch)
1182 unsigned int len; 1188 unsigned int len;
1183 1189
1184 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { 1190 for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1185 if ((cl_head = q->active[prio]) == NULL) 1191 cl_head = q->active[prio];
1192 if (!cl_head)
1186 continue; 1193 continue;
1187 1194
1188 cl = cl_head; 1195 cl = cl_head;
@@ -1199,13 +1206,13 @@ static unsigned int cbq_drop(struct Qdisc* sch)
1199} 1206}
1200 1207
1201static void 1208static void
1202cbq_reset(struct Qdisc* sch) 1209cbq_reset(struct Qdisc *sch)
1203{ 1210{
1204 struct cbq_sched_data *q = qdisc_priv(sch); 1211 struct cbq_sched_data *q = qdisc_priv(sch);
1205 struct cbq_class *cl; 1212 struct cbq_class *cl;
1206 struct hlist_node *n; 1213 struct hlist_node *n;
1207 int prio; 1214 int prio;
1208 unsigned h; 1215 unsigned int h;
1209 1216
1210 q->activemask = 0; 1217 q->activemask = 0;
1211 q->pmask = 0; 1218 q->pmask = 0;
@@ -1237,21 +1244,21 @@ cbq_reset(struct Qdisc* sch)
1237 1244
1238static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) 1245static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1239{ 1246{
1240 if (lss->change&TCF_CBQ_LSS_FLAGS) { 1247 if (lss->change & TCF_CBQ_LSS_FLAGS) {
1241 cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; 1248 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1242 cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; 1249 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1243 } 1250 }
1244 if (lss->change&TCF_CBQ_LSS_EWMA) 1251 if (lss->change & TCF_CBQ_LSS_EWMA)
1245 cl->ewma_log = lss->ewma_log; 1252 cl->ewma_log = lss->ewma_log;
1246 if (lss->change&TCF_CBQ_LSS_AVPKT) 1253 if (lss->change & TCF_CBQ_LSS_AVPKT)
1247 cl->avpkt = lss->avpkt; 1254 cl->avpkt = lss->avpkt;
1248 if (lss->change&TCF_CBQ_LSS_MINIDLE) 1255 if (lss->change & TCF_CBQ_LSS_MINIDLE)
1249 cl->minidle = -(long)lss->minidle; 1256 cl->minidle = -(long)lss->minidle;
1250 if (lss->change&TCF_CBQ_LSS_MAXIDLE) { 1257 if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1251 cl->maxidle = lss->maxidle; 1258 cl->maxidle = lss->maxidle;
1252 cl->avgidle = lss->maxidle; 1259 cl->avgidle = lss->maxidle;
1253 } 1260 }
1254 if (lss->change&TCF_CBQ_LSS_OFFTIME) 1261 if (lss->change & TCF_CBQ_LSS_OFFTIME)
1255 cl->offtime = lss->offtime; 1262 cl->offtime = lss->offtime;
1256 return 0; 1263 return 0;
1257} 1264}
@@ -1279,10 +1286,10 @@ static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1279 if (wrr->weight) 1286 if (wrr->weight)
1280 cl->weight = wrr->weight; 1287 cl->weight = wrr->weight;
1281 if (wrr->priority) { 1288 if (wrr->priority) {
1282 cl->priority = wrr->priority-1; 1289 cl->priority = wrr->priority - 1;
1283 cl->cpriority = cl->priority; 1290 cl->cpriority = cl->priority;
1284 if (cl->priority >= cl->priority2) 1291 if (cl->priority >= cl->priority2)
1285 cl->priority2 = TC_CBQ_MAXPRIO-1; 1292 cl->priority2 = TC_CBQ_MAXPRIO - 1;
1286 } 1293 }
1287 1294
1288 cbq_addprio(q, cl); 1295 cbq_addprio(q, cl);
@@ -1299,10 +1306,10 @@ static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1299 cl->overlimit = cbq_ovl_delay; 1306 cl->overlimit = cbq_ovl_delay;
1300 break; 1307 break;
1301 case TC_CBQ_OVL_LOWPRIO: 1308 case TC_CBQ_OVL_LOWPRIO:
1302 if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || 1309 if (ovl->priority2 - 1 >= TC_CBQ_MAXPRIO ||
1303 ovl->priority2-1 <= cl->priority) 1310 ovl->priority2 - 1 <= cl->priority)
1304 return -EINVAL; 1311 return -EINVAL;
1305 cl->priority2 = ovl->priority2-1; 1312 cl->priority2 = ovl->priority2 - 1;
1306 cl->overlimit = cbq_ovl_lowprio; 1313 cl->overlimit = cbq_ovl_lowprio;
1307 break; 1314 break;
1308 case TC_CBQ_OVL_DROP: 1315 case TC_CBQ_OVL_DROP:
@@ -1381,9 +1388,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1381 if (!q->link.q) 1388 if (!q->link.q)
1382 q->link.q = &noop_qdisc; 1389 q->link.q = &noop_qdisc;
1383 1390
1384 q->link.priority = TC_CBQ_MAXPRIO-1; 1391 q->link.priority = TC_CBQ_MAXPRIO - 1;
1385 q->link.priority2 = TC_CBQ_MAXPRIO-1; 1392 q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1386 q->link.cpriority = TC_CBQ_MAXPRIO-1; 1393 q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1387 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; 1394 q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1388 q->link.overlimit = cbq_ovl_classic; 1395 q->link.overlimit = cbq_ovl_classic;
1389 q->link.allot = psched_mtu(qdisc_dev(sch)); 1396 q->link.allot = psched_mtu(qdisc_dev(sch));
@@ -1414,7 +1421,7 @@ put_rtab:
1414 return err; 1421 return err;
1415} 1422}
1416 1423
1417static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) 1424static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1418{ 1425{
1419 unsigned char *b = skb_tail_pointer(skb); 1426 unsigned char *b = skb_tail_pointer(skb);
1420 1427
@@ -1426,7 +1433,7 @@ nla_put_failure:
1426 return -1; 1433 return -1;
1427} 1434}
1428 1435
1429static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) 1436static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1430{ 1437{
1431 unsigned char *b = skb_tail_pointer(skb); 1438 unsigned char *b = skb_tail_pointer(skb);
1432 struct tc_cbq_lssopt opt; 1439 struct tc_cbq_lssopt opt;
@@ -1451,15 +1458,15 @@ nla_put_failure:
1451 return -1; 1458 return -1;
1452} 1459}
1453 1460
1454static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) 1461static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1455{ 1462{
1456 unsigned char *b = skb_tail_pointer(skb); 1463 unsigned char *b = skb_tail_pointer(skb);
1457 struct tc_cbq_wrropt opt; 1464 struct tc_cbq_wrropt opt;
1458 1465
1459 opt.flags = 0; 1466 opt.flags = 0;
1460 opt.allot = cl->allot; 1467 opt.allot = cl->allot;
1461 opt.priority = cl->priority+1; 1468 opt.priority = cl->priority + 1;
1462 opt.cpriority = cl->cpriority+1; 1469 opt.cpriority = cl->cpriority + 1;
1463 opt.weight = cl->weight; 1470 opt.weight = cl->weight;
1464 NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); 1471 NLA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1465 return skb->len; 1472 return skb->len;
@@ -1469,13 +1476,13 @@ nla_put_failure:
1469 return -1; 1476 return -1;
1470} 1477}
1471 1478
1472static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) 1479static int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1473{ 1480{
1474 unsigned char *b = skb_tail_pointer(skb); 1481 unsigned char *b = skb_tail_pointer(skb);
1475 struct tc_cbq_ovl opt; 1482 struct tc_cbq_ovl opt;
1476 1483
1477 opt.strategy = cl->ovl_strategy; 1484 opt.strategy = cl->ovl_strategy;
1478 opt.priority2 = cl->priority2+1; 1485 opt.priority2 = cl->priority2 + 1;
1479 opt.pad = 0; 1486 opt.pad = 0;
1480 opt.penalty = cl->penalty; 1487 opt.penalty = cl->penalty;
1481 NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); 1488 NLA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
@@ -1486,7 +1493,7 @@ nla_put_failure:
1486 return -1; 1493 return -1;
1487} 1494}
1488 1495
1489static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) 1496static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1490{ 1497{
1491 unsigned char *b = skb_tail_pointer(skb); 1498 unsigned char *b = skb_tail_pointer(skb);
1492 struct tc_cbq_fopt opt; 1499 struct tc_cbq_fopt opt;
@@ -1505,7 +1512,7 @@ nla_put_failure:
1505} 1512}
1506 1513
1507#ifdef CONFIG_NET_CLS_ACT 1514#ifdef CONFIG_NET_CLS_ACT
1508static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) 1515static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1509{ 1516{
1510 unsigned char *b = skb_tail_pointer(skb); 1517 unsigned char *b = skb_tail_pointer(skb);
1511 struct tc_cbq_police opt; 1518 struct tc_cbq_police opt;
@@ -1569,7 +1576,7 @@ static int
1569cbq_dump_class(struct Qdisc *sch, unsigned long arg, 1576cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1570 struct sk_buff *skb, struct tcmsg *tcm) 1577 struct sk_buff *skb, struct tcmsg *tcm)
1571{ 1578{
1572 struct cbq_class *cl = (struct cbq_class*)arg; 1579 struct cbq_class *cl = (struct cbq_class *)arg;
1573 struct nlattr *nest; 1580 struct nlattr *nest;
1574 1581
1575 if (cl->tparent) 1582 if (cl->tparent)
@@ -1597,7 +1604,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1597 struct gnet_dump *d) 1604 struct gnet_dump *d)
1598{ 1605{
1599 struct cbq_sched_data *q = qdisc_priv(sch); 1606 struct cbq_sched_data *q = qdisc_priv(sch);
1600 struct cbq_class *cl = (struct cbq_class*)arg; 1607 struct cbq_class *cl = (struct cbq_class *)arg;
1601 1608
1602 cl->qstats.qlen = cl->q->q.qlen; 1609 cl->qstats.qlen = cl->q->q.qlen;
1603 cl->xstats.avgidle = cl->avgidle; 1610 cl->xstats.avgidle = cl->avgidle;
@@ -1617,7 +1624,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1617static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, 1624static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1618 struct Qdisc **old) 1625 struct Qdisc **old)
1619{ 1626{
1620 struct cbq_class *cl = (struct cbq_class*)arg; 1627 struct cbq_class *cl = (struct cbq_class *)arg;
1621 1628
1622 if (new == NULL) { 1629 if (new == NULL) {
1623 new = qdisc_create_dflt(sch->dev_queue, 1630 new = qdisc_create_dflt(sch->dev_queue,
@@ -1640,10 +1647,9 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1640 return 0; 1647 return 0;
1641} 1648}
1642 1649
1643static struct Qdisc * 1650static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1644cbq_leaf(struct Qdisc *sch, unsigned long arg)
1645{ 1651{
1646 struct cbq_class *cl = (struct cbq_class*)arg; 1652 struct cbq_class *cl = (struct cbq_class *)arg;
1647 1653
1648 return cl->q; 1654 return cl->q;
1649} 1655}
@@ -1682,13 +1688,12 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1682 kfree(cl); 1688 kfree(cl);
1683} 1689}
1684 1690
1685static void 1691static void cbq_destroy(struct Qdisc *sch)
1686cbq_destroy(struct Qdisc* sch)
1687{ 1692{
1688 struct cbq_sched_data *q = qdisc_priv(sch); 1693 struct cbq_sched_data *q = qdisc_priv(sch);
1689 struct hlist_node *n, *next; 1694 struct hlist_node *n, *next;
1690 struct cbq_class *cl; 1695 struct cbq_class *cl;
1691 unsigned h; 1696 unsigned int h;
1692 1697
1693#ifdef CONFIG_NET_CLS_ACT 1698#ifdef CONFIG_NET_CLS_ACT
1694 q->rx_class = NULL; 1699 q->rx_class = NULL;
@@ -1712,7 +1717,7 @@ cbq_destroy(struct Qdisc* sch)
1712 1717
1713static void cbq_put(struct Qdisc *sch, unsigned long arg) 1718static void cbq_put(struct Qdisc *sch, unsigned long arg)
1714{ 1719{
1715 struct cbq_class *cl = (struct cbq_class*)arg; 1720 struct cbq_class *cl = (struct cbq_class *)arg;
1716 1721
1717 if (--cl->refcnt == 0) { 1722 if (--cl->refcnt == 0) {
1718#ifdef CONFIG_NET_CLS_ACT 1723#ifdef CONFIG_NET_CLS_ACT
@@ -1735,7 +1740,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
1735{ 1740{
1736 int err; 1741 int err;
1737 struct cbq_sched_data *q = qdisc_priv(sch); 1742 struct cbq_sched_data *q = qdisc_priv(sch);
1738 struct cbq_class *cl = (struct cbq_class*)*arg; 1743 struct cbq_class *cl = (struct cbq_class *)*arg;
1739 struct nlattr *opt = tca[TCA_OPTIONS]; 1744 struct nlattr *opt = tca[TCA_OPTIONS];
1740 struct nlattr *tb[TCA_CBQ_MAX + 1]; 1745 struct nlattr *tb[TCA_CBQ_MAX + 1];
1741 struct cbq_class *parent; 1746 struct cbq_class *parent;
@@ -1827,13 +1832,14 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
1827 1832
1828 if (classid) { 1833 if (classid) {
1829 err = -EINVAL; 1834 err = -EINVAL;
1830 if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) 1835 if (TC_H_MAJ(classid ^ sch->handle) ||
1836 cbq_class_lookup(q, classid))
1831 goto failure; 1837 goto failure;
1832 } else { 1838 } else {
1833 int i; 1839 int i;
1834 classid = TC_H_MAKE(sch->handle,0x8000); 1840 classid = TC_H_MAKE(sch->handle, 0x8000);
1835 1841
1836 for (i=0; i<0x8000; i++) { 1842 for (i = 0; i < 0x8000; i++) {
1837 if (++q->hgenerator >= 0x8000) 1843 if (++q->hgenerator >= 0x8000)
1838 q->hgenerator = 1; 1844 q->hgenerator = 1;
1839 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) 1845 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
@@ -1890,11 +1896,11 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t
1890 cl->minidle = -0x7FFFFFFF; 1896 cl->minidle = -0x7FFFFFFF;
1891 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT])); 1897 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1892 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT])); 1898 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1893 if (cl->ewma_log==0) 1899 if (cl->ewma_log == 0)
1894 cl->ewma_log = q->link.ewma_log; 1900 cl->ewma_log = q->link.ewma_log;
1895 if (cl->maxidle==0) 1901 if (cl->maxidle == 0)
1896 cl->maxidle = q->link.maxidle; 1902 cl->maxidle = q->link.maxidle;
1897 if (cl->avpkt==0) 1903 if (cl->avpkt == 0)
1898 cl->avpkt = q->link.avpkt; 1904 cl->avpkt = q->link.avpkt;
1899 cl->overlimit = cbq_ovl_classic; 1905 cl->overlimit = cbq_ovl_classic;
1900 if (tb[TCA_CBQ_OVL_STRATEGY]) 1906 if (tb[TCA_CBQ_OVL_STRATEGY])
@@ -1920,7 +1926,7 @@ failure:
1920static int cbq_delete(struct Qdisc *sch, unsigned long arg) 1926static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1921{ 1927{
1922 struct cbq_sched_data *q = qdisc_priv(sch); 1928 struct cbq_sched_data *q = qdisc_priv(sch);
1923 struct cbq_class *cl = (struct cbq_class*)arg; 1929 struct cbq_class *cl = (struct cbq_class *)arg;
1924 unsigned int qlen; 1930 unsigned int qlen;
1925 1931
1926 if (cl->filters || cl->children || cl == &q->link) 1932 if (cl->filters || cl->children || cl == &q->link)
@@ -1978,7 +1984,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1978 u32 classid) 1984 u32 classid)
1979{ 1985{
1980 struct cbq_sched_data *q = qdisc_priv(sch); 1986 struct cbq_sched_data *q = qdisc_priv(sch);
1981 struct cbq_class *p = (struct cbq_class*)parent; 1987 struct cbq_class *p = (struct cbq_class *)parent;
1982 struct cbq_class *cl = cbq_class_lookup(q, classid); 1988 struct cbq_class *cl = cbq_class_lookup(q, classid);
1983 1989
1984 if (cl) { 1990 if (cl) {
@@ -1992,7 +1998,7 @@ static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1992 1998
1993static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) 1999static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1994{ 2000{
1995 struct cbq_class *cl = (struct cbq_class*)arg; 2001 struct cbq_class *cl = (struct cbq_class *)arg;
1996 2002
1997 cl->filters--; 2003 cl->filters--;
1998} 2004}
@@ -2002,7 +2008,7 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2002 struct cbq_sched_data *q = qdisc_priv(sch); 2008 struct cbq_sched_data *q = qdisc_priv(sch);
2003 struct cbq_class *cl; 2009 struct cbq_class *cl;
2004 struct hlist_node *n; 2010 struct hlist_node *n;
2005 unsigned h; 2011 unsigned int h;
2006 2012
2007 if (arg->stop) 2013 if (arg->stop)
2008 return; 2014 return;
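The cbq_update() comments reflowed above describe a scaled EWMA: true_avgidle is updated as (1-W)*true_avgidle + W*idle with W = 2^-ewma_log, but cl->avgidle stores true_avgidle/W, which collapses the update to avgidle += idle - (avgidle >> ewma_log). A small standalone sketch checking that equivalence on toy positive values (the kernel code additionally clamps and handles negative avgidle):

#include <stdio.h>

/* scaled update as used in cbq_update(): avgidle holds true_avgidle / W */
static long scaled_update(long avgidle, long idle, unsigned int ewma_log)
{
	return avgidle + idle - (avgidle >> ewma_log);
}

int main(void)
{
	unsigned int ewma_log = 4;		/* W = 2^-4 = 1/16 */
	double W = 1.0 / 16.0;
	long avgidle = 1024;			/* scaled: true_avgidle / W */
	long idle = 64;

	long scaled = scaled_update(avgidle, idle, ewma_log);

	/* textbook form on the unscaled value, rescaled for comparison */
	double true_avgidle = avgidle * W;
	double unscaled = ((1.0 - W) * true_avgidle + W * idle) / W;

	printf("scaled: %ld, unscaled check: %.1f\n", scaled, unscaled);
	return 0;	/* both come out to 1024 for these inputs */
}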
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index 0f7bf3fdfea5..2c790204d042 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -137,10 +137,10 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent,
137 mask = nla_get_u8(tb[TCA_DSMARK_MASK]); 137 mask = nla_get_u8(tb[TCA_DSMARK_MASK]);
138 138
139 if (tb[TCA_DSMARK_VALUE]) 139 if (tb[TCA_DSMARK_VALUE])
140 p->value[*arg-1] = nla_get_u8(tb[TCA_DSMARK_VALUE]); 140 p->value[*arg - 1] = nla_get_u8(tb[TCA_DSMARK_VALUE]);
141 141
142 if (tb[TCA_DSMARK_MASK]) 142 if (tb[TCA_DSMARK_MASK])
143 p->mask[*arg-1] = mask; 143 p->mask[*arg - 1] = mask;
144 144
145 err = 0; 145 err = 0;
146 146
@@ -155,8 +155,8 @@ static int dsmark_delete(struct Qdisc *sch, unsigned long arg)
155 if (!dsmark_valid_index(p, arg)) 155 if (!dsmark_valid_index(p, arg))
156 return -EINVAL; 156 return -EINVAL;
157 157
158 p->mask[arg-1] = 0xff; 158 p->mask[arg - 1] = 0xff;
159 p->value[arg-1] = 0; 159 p->value[arg - 1] = 0;
160 160
161 return 0; 161 return 0;
162} 162}
@@ -175,7 +175,7 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker)
175 if (p->mask[i] == 0xff && !p->value[i]) 175 if (p->mask[i] == 0xff && !p->value[i])
176 goto ignore; 176 goto ignore;
177 if (walker->count >= walker->skip) { 177 if (walker->count >= walker->skip) {
178 if (walker->fn(sch, i+1, walker) < 0) { 178 if (walker->fn(sch, i + 1, walker) < 0) {
179 walker->stop = 1; 179 walker->stop = 1;
180 break; 180 break;
181 } 181 }
@@ -304,9 +304,8 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
304 * and don't need yet another qdisc as a bypass. 304 * and don't need yet another qdisc as a bypass.
305 */ 305 */
306 if (p->mask[index] != 0xff || p->value[index]) 306 if (p->mask[index] != 0xff || p->value[index])
307 printk(KERN_WARNING 307 pr_warning("dsmark_dequeue: unsupported protocol %d\n",
308 "dsmark_dequeue: unsupported protocol %d\n", 308 ntohs(skb->protocol));
309 ntohs(skb->protocol));
310 break; 309 break;
311 } 310 }
312 311
@@ -424,14 +423,14 @@ static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl,
424 if (!dsmark_valid_index(p, cl)) 423 if (!dsmark_valid_index(p, cl))
425 return -EINVAL; 424 return -EINVAL;
426 425
427 tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl-1); 426 tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle), cl - 1);
428 tcm->tcm_info = p->q->handle; 427 tcm->tcm_info = p->q->handle;
429 428
430 opts = nla_nest_start(skb, TCA_OPTIONS); 429 opts = nla_nest_start(skb, TCA_OPTIONS);
431 if (opts == NULL) 430 if (opts == NULL)
432 goto nla_put_failure; 431 goto nla_put_failure;
433 NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl-1]); 432 NLA_PUT_U8(skb, TCA_DSMARK_MASK, p->mask[cl - 1]);
434 NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl-1]); 433 NLA_PUT_U8(skb, TCA_DSMARK_VALUE, p->value[cl - 1]);
435 434
436 return nla_nest_end(skb, opts); 435 return nla_nest_end(skb, opts);
437 436
diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c
index d468b479aa93..be33f9ddf9dd 100644
--- a/net/sched/sch_fifo.c
+++ b/net/sched/sch_fifo.c
@@ -19,12 +19,11 @@
19 19
20/* 1 band FIFO pseudo-"scheduler" */ 20/* 1 band FIFO pseudo-"scheduler" */
21 21
22struct fifo_sched_data 22struct fifo_sched_data {
23{
24 u32 limit; 23 u32 limit;
25}; 24};
26 25
27static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) 26static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
28{ 27{
29 struct fifo_sched_data *q = qdisc_priv(sch); 28 struct fifo_sched_data *q = qdisc_priv(sch);
30 29
@@ -34,7 +33,7 @@ static int bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
34 return qdisc_reshape_fail(skb, sch); 33 return qdisc_reshape_fail(skb, sch);
35} 34}
36 35
37static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) 36static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
38{ 37{
39 struct fifo_sched_data *q = qdisc_priv(sch); 38 struct fifo_sched_data *q = qdisc_priv(sch);
40 39
@@ -44,7 +43,7 @@ static int pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch)
44 return qdisc_reshape_fail(skb, sch); 43 return qdisc_reshape_fail(skb, sch);
45} 44}
46 45
47static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch) 46static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc *sch)
48{ 47{
49 struct fifo_sched_data *q = qdisc_priv(sch); 48 struct fifo_sched_data *q = qdisc_priv(sch);
50 49
@@ -62,11 +61,13 @@ static int pfifo_tail_enqueue(struct sk_buff *skb, struct Qdisc* sch)
62static int fifo_init(struct Qdisc *sch, struct nlattr *opt) 61static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
63{ 62{
64 struct fifo_sched_data *q = qdisc_priv(sch); 63 struct fifo_sched_data *q = qdisc_priv(sch);
64 bool bypass;
65 bool is_bfifo = sch->ops == &bfifo_qdisc_ops;
65 66
66 if (opt == NULL) { 67 if (opt == NULL) {
67 u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1; 68 u32 limit = qdisc_dev(sch)->tx_queue_len ? : 1;
68 69
69 if (sch->ops == &bfifo_qdisc_ops) 70 if (is_bfifo)
70 limit *= psched_mtu(qdisc_dev(sch)); 71 limit *= psched_mtu(qdisc_dev(sch));
71 72
72 q->limit = limit; 73 q->limit = limit;
@@ -79,6 +80,15 @@ static int fifo_init(struct Qdisc *sch, struct nlattr *opt)
79 q->limit = ctl->limit; 80 q->limit = ctl->limit;
80 } 81 }
81 82
83 if (is_bfifo)
84 bypass = q->limit >= psched_mtu(qdisc_dev(sch));
85 else
86 bypass = q->limit >= 1;
87
88 if (bypass)
89 sch->flags |= TCQ_F_CAN_BYPASS;
90 else
91 sch->flags &= ~TCQ_F_CAN_BYPASS;
82 return 0; 92 return 0;
83} 93}
84 94
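The fifo_init() hunk above derives the new TCQ_F_CAN_BYPASS decision from the configured limit: bfifo limits are in bytes, so bypass requires room for at least one MTU-sized packet, while pfifo limits count packets, so any limit of one or more qualifies. A standalone sketch of that decision with toy values (psched_mtu() replaced by a plain parameter):

#include <stdbool.h>
#include <stdio.h>

/* bfifo limits count bytes, pfifo limits count packets */
static bool fifo_can_bypass(bool is_bfifo, unsigned int limit, unsigned int mtu)
{
	if (is_bfifo)
		return limit >= mtu;	/* must fit one full-sized packet */
	return limit >= 1;		/* must hold at least one packet */
}

int main(void)
{
	printf("bfifo, limit 1000B, mtu 1500 -> %d\n",
	       fifo_can_bypass(true, 1000, 1500));	/* 0: no bypass */
	printf("pfifo, limit 1 packet       -> %d\n",
	       fifo_can_bypass(false, 1, 1500));	/* 1: bypass */
	return 0;
}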
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 34dc598440a2..0da09d508737 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -87,8 +87,8 @@ static inline int handle_dev_cpu_collision(struct sk_buff *skb,
87 */ 87 */
88 kfree_skb(skb); 88 kfree_skb(skb);
89 if (net_ratelimit()) 89 if (net_ratelimit())
90 printk(KERN_WARNING "Dead loop on netdevice %s, " 90 pr_warning("Dead loop on netdevice %s, fix it urgently!\n",
91 "fix it urgently!\n", dev_queue->dev->name); 91 dev_queue->dev->name);
92 ret = qdisc_qlen(q); 92 ret = qdisc_qlen(q);
93 } else { 93 } else {
94 /* 94 /*
@@ -137,8 +137,8 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
137 } else { 137 } else {
138 /* Driver returned NETDEV_TX_BUSY - requeue skb */ 138 /* Driver returned NETDEV_TX_BUSY - requeue skb */
139 if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit())) 139 if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
140 printk(KERN_WARNING "BUG %s code %d qlen %d\n", 140 pr_warning("BUG %s code %d qlen %d\n",
141 dev->name, ret, q->q.qlen); 141 dev->name, ret, q->q.qlen);
142 142
143 ret = dev_requeue_skb(skb, q); 143 ret = dev_requeue_skb(skb, q);
144 } 144 }
@@ -412,8 +412,9 @@ static struct Qdisc noqueue_qdisc = {
412}; 412};
413 413
414 414
415static const u8 prio2band[TC_PRIO_MAX+1] = 415static const u8 prio2band[TC_PRIO_MAX + 1] = {
416 { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; 416 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
417};
417 418
418/* 3-band FIFO queue: old style, but should be a bit faster than 419/* 3-band FIFO queue: old style, but should be a bit faster than
419 generic prio+fifo combination. 420 generic prio+fifo combination.
@@ -445,7 +446,7 @@ static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
445 return priv->q + band; 446 return priv->q + band;
446} 447}
447 448
448static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) 449static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
449{ 450{
450 if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) { 451 if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
451 int band = prio2band[skb->priority & TC_PRIO_MAX]; 452 int band = prio2band[skb->priority & TC_PRIO_MAX];
@@ -460,7 +461,7 @@ static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
460 return qdisc_drop(skb, qdisc); 461 return qdisc_drop(skb, qdisc);
461} 462}
462 463
463static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc) 464static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
464{ 465{
465 struct pfifo_fast_priv *priv = qdisc_priv(qdisc); 466 struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
466 int band = bitmap2band[priv->bitmap]; 467 int band = bitmap2band[priv->bitmap];
@@ -479,7 +480,7 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
479 return NULL; 480 return NULL;
480} 481}
481 482
482static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc) 483static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
483{ 484{
484 struct pfifo_fast_priv *priv = qdisc_priv(qdisc); 485 struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
485 int band = bitmap2band[priv->bitmap]; 486 int band = bitmap2band[priv->bitmap];
@@ -493,7 +494,7 @@ static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc)
493 return NULL; 494 return NULL;
494} 495}
495 496
496static void pfifo_fast_reset(struct Qdisc* qdisc) 497static void pfifo_fast_reset(struct Qdisc *qdisc)
497{ 498{
498 int prio; 499 int prio;
499 struct pfifo_fast_priv *priv = qdisc_priv(qdisc); 500 struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
@@ -510,7 +511,7 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
510{ 511{
511 struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS }; 512 struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
512 513
513 memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); 514 memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
514 NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); 515 NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
515 return skb->len; 516 return skb->len;
516 517
@@ -526,6 +527,8 @@ static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
526 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) 527 for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
527 skb_queue_head_init(band2list(priv, prio)); 528 skb_queue_head_init(band2list(priv, prio));
528 529
530 /* Can by-pass the queue discipline */
531 qdisc->flags |= TCQ_F_CAN_BYPASS;
529 return 0; 532 return 0;
530} 533}
531 534
@@ -540,6 +543,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
540 .dump = pfifo_fast_dump, 543 .dump = pfifo_fast_dump,
541 .owner = THIS_MODULE, 544 .owner = THIS_MODULE,
542}; 545};
546EXPORT_SYMBOL(pfifo_fast_ops);
543 547
544struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, 548struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
545 struct Qdisc_ops *ops) 549 struct Qdisc_ops *ops)
@@ -630,7 +634,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
630#ifdef CONFIG_NET_SCHED 634#ifdef CONFIG_NET_SCHED
631 qdisc_list_del(qdisc); 635 qdisc_list_del(qdisc);
632 636
633 qdisc_put_stab(qdisc->stab); 637 qdisc_put_stab(rtnl_dereference(qdisc->stab));
634#endif 638#endif
635 gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); 639 gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
636 if (ops->reset) 640 if (ops->reset)
@@ -674,25 +678,21 @@ struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
674 678
675 return oqdisc; 679 return oqdisc;
676} 680}
681EXPORT_SYMBOL(dev_graft_qdisc);
677 682
678static void attach_one_default_qdisc(struct net_device *dev, 683static void attach_one_default_qdisc(struct net_device *dev,
679 struct netdev_queue *dev_queue, 684 struct netdev_queue *dev_queue,
680 void *_unused) 685 void *_unused)
681{ 686{
682 struct Qdisc *qdisc; 687 struct Qdisc *qdisc = &noqueue_qdisc;
683 688
684 if (dev->tx_queue_len) { 689 if (dev->tx_queue_len) {
685 qdisc = qdisc_create_dflt(dev_queue, 690 qdisc = qdisc_create_dflt(dev_queue,
686 &pfifo_fast_ops, TC_H_ROOT); 691 &pfifo_fast_ops, TC_H_ROOT);
687 if (!qdisc) { 692 if (!qdisc) {
688 printk(KERN_INFO "%s: activation failed\n", dev->name); 693 netdev_info(dev, "activation failed\n");
689 return; 694 return;
690 } 695 }
691
692 /* Can by-pass the queue discipline for default qdisc */
693 qdisc->flags |= TCQ_F_CAN_BYPASS;
694 } else {
695 qdisc = &noqueue_qdisc;
696 } 696 }
697 dev_queue->qdisc_sleeping = qdisc; 697 dev_queue->qdisc_sleeping = qdisc;
698} 698}
@@ -761,6 +761,7 @@ void dev_activate(struct net_device *dev)
761 dev_watchdog_up(dev); 761 dev_watchdog_up(dev);
762 } 762 }
763} 763}
764EXPORT_SYMBOL(dev_activate);
764 765
765static void dev_deactivate_queue(struct net_device *dev, 766static void dev_deactivate_queue(struct net_device *dev,
766 struct netdev_queue *dev_queue, 767 struct netdev_queue *dev_queue,
@@ -840,6 +841,7 @@ void dev_deactivate(struct net_device *dev)
840 list_add(&dev->unreg_list, &single); 841 list_add(&dev->unreg_list, &single);
841 dev_deactivate_many(&single); 842 dev_deactivate_many(&single);
842} 843}
844EXPORT_SYMBOL(dev_deactivate);
843 845
844static void dev_init_scheduler_queue(struct net_device *dev, 846static void dev_init_scheduler_queue(struct net_device *dev,
845 struct netdev_queue *dev_queue, 847 struct netdev_queue *dev_queue,
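The sch_generic.c hunks above move the TCQ_F_CAN_BYPASS flag into pfifo_fast_init(), so every pfifo_fast instance advertises bypass no matter who created it; attach_one_default_qdisc() loses its private copy of the flag, and sch_mq.c below drops its own for the same reason. They also export dev_graft_qdisc(), dev_activate(), dev_deactivate() and pfifo_fast_ops, which the new modular mqprio qdisc further down relies on to graft per-queue children, and they wrap the stab pointer in rtnl_dereference(), suggesting it is now rtnl/RCU-annotated. A minimal sketch of what the bypass flag buys on the transmit path, using only the flags and qlen fields visible in these hunks (the helper name is hypothetical, not from the patch):

    /* Hypothetical helper, for illustration only: an empty qdisc that
     * advertises TCQ_F_CAN_BYPASS can be skipped entirely on xmit.
     */
    static inline bool qdisc_may_bypass(const struct Qdisc *q)
    {
            return (q->flags & TCQ_F_CAN_BYPASS) && q->q.qlen == 0;
    }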
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 51dcc2aa5c92..b9493a09a870 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -32,8 +32,7 @@
32struct gred_sched_data; 32struct gred_sched_data;
33struct gred_sched; 33struct gred_sched;
34 34
35struct gred_sched_data 35struct gred_sched_data {
36{
37 u32 limit; /* HARD maximal queue length */ 36 u32 limit; /* HARD maximal queue length */
38 u32 DP; /* the drop pramaters */ 37 u32 DP; /* the drop pramaters */
39 u32 bytesin; /* bytes seen on virtualQ so far*/ 38 u32 bytesin; /* bytes seen on virtualQ so far*/
@@ -50,8 +49,7 @@ enum {
50 GRED_RIO_MODE, 49 GRED_RIO_MODE,
51}; 50};
52 51
53struct gred_sched 52struct gred_sched {
54{
55 struct gred_sched_data *tab[MAX_DPs]; 53 struct gred_sched_data *tab[MAX_DPs];
56 unsigned long flags; 54 unsigned long flags;
57 u32 red_flags; 55 u32 red_flags;
@@ -150,17 +148,18 @@ static inline int gred_use_harddrop(struct gred_sched *t)
150 return t->red_flags & TC_RED_HARDDROP; 148 return t->red_flags & TC_RED_HARDDROP;
151} 149}
152 150
153static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) 151static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch)
154{ 152{
155 struct gred_sched_data *q=NULL; 153 struct gred_sched_data *q = NULL;
156 struct gred_sched *t= qdisc_priv(sch); 154 struct gred_sched *t = qdisc_priv(sch);
157 unsigned long qavg = 0; 155 unsigned long qavg = 0;
158 u16 dp = tc_index_to_dp(skb); 156 u16 dp = tc_index_to_dp(skb);
159 157
160 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { 158 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
161 dp = t->def; 159 dp = t->def;
162 160
163 if ((q = t->tab[dp]) == NULL) { 161 q = t->tab[dp];
162 if (!q) {
164 /* Pass through packets not assigned to a DP 163 /* Pass through packets not assigned to a DP
165 * if no default DP has been configured. This 164 * if no default DP has been configured. This
166 * allows for DP flows to be left untouched. 165 * allows for DP flows to be left untouched.
@@ -183,7 +182,7 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
183 for (i = 0; i < t->DPs; i++) { 182 for (i = 0; i < t->DPs; i++) {
184 if (t->tab[i] && t->tab[i]->prio < q->prio && 183 if (t->tab[i] && t->tab[i]->prio < q->prio &&
185 !red_is_idling(&t->tab[i]->parms)) 184 !red_is_idling(&t->tab[i]->parms))
186 qavg +=t->tab[i]->parms.qavg; 185 qavg += t->tab[i]->parms.qavg;
187 } 186 }
188 187
189 } 188 }
@@ -203,28 +202,28 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc* sch)
203 gred_store_wred_set(t, q); 202 gred_store_wred_set(t, q);
204 203
205 switch (red_action(&q->parms, q->parms.qavg + qavg)) { 204 switch (red_action(&q->parms, q->parms.qavg + qavg)) {
206 case RED_DONT_MARK: 205 case RED_DONT_MARK:
207 break; 206 break;
208 207
209 case RED_PROB_MARK: 208 case RED_PROB_MARK:
210 sch->qstats.overlimits++; 209 sch->qstats.overlimits++;
211 if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) { 210 if (!gred_use_ecn(t) || !INET_ECN_set_ce(skb)) {
212 q->stats.prob_drop++; 211 q->stats.prob_drop++;
213 goto congestion_drop; 212 goto congestion_drop;
214 } 213 }
215 214
216 q->stats.prob_mark++; 215 q->stats.prob_mark++;
217 break; 216 break;
218 217
219 case RED_HARD_MARK: 218 case RED_HARD_MARK:
220 sch->qstats.overlimits++; 219 sch->qstats.overlimits++;
221 if (gred_use_harddrop(t) || !gred_use_ecn(t) || 220 if (gred_use_harddrop(t) || !gred_use_ecn(t) ||
222 !INET_ECN_set_ce(skb)) { 221 !INET_ECN_set_ce(skb)) {
223 q->stats.forced_drop++; 222 q->stats.forced_drop++;
224 goto congestion_drop; 223 goto congestion_drop;
225 } 224 }
226 q->stats.forced_mark++; 225 q->stats.forced_mark++;
227 break; 226 break;
228 } 227 }
229 228
230 if (q->backlog + qdisc_pkt_len(skb) <= q->limit) { 229 if (q->backlog + qdisc_pkt_len(skb) <= q->limit) {
@@ -241,7 +240,7 @@ congestion_drop:
241 return NET_XMIT_CN; 240 return NET_XMIT_CN;
242} 241}
243 242
244static struct sk_buff *gred_dequeue(struct Qdisc* sch) 243static struct sk_buff *gred_dequeue(struct Qdisc *sch)
245{ 244{
246 struct sk_buff *skb; 245 struct sk_buff *skb;
247 struct gred_sched *t = qdisc_priv(sch); 246 struct gred_sched *t = qdisc_priv(sch);
@@ -254,9 +253,9 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch)
254 253
255 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { 254 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
256 if (net_ratelimit()) 255 if (net_ratelimit())
257 printk(KERN_WARNING "GRED: Unable to relocate " 256 pr_warning("GRED: Unable to relocate VQ 0x%x "
258 "VQ 0x%x after dequeue, screwing up " 257 "after dequeue, screwing up "
259 "backlog.\n", tc_index_to_dp(skb)); 258 "backlog.\n", tc_index_to_dp(skb));
260 } else { 259 } else {
261 q->backlog -= qdisc_pkt_len(skb); 260 q->backlog -= qdisc_pkt_len(skb);
262 261
@@ -273,7 +272,7 @@ static struct sk_buff *gred_dequeue(struct Qdisc* sch)
273 return NULL; 272 return NULL;
274} 273}
275 274
276static unsigned int gred_drop(struct Qdisc* sch) 275static unsigned int gred_drop(struct Qdisc *sch)
277{ 276{
278 struct sk_buff *skb; 277 struct sk_buff *skb;
279 struct gred_sched *t = qdisc_priv(sch); 278 struct gred_sched *t = qdisc_priv(sch);
@@ -286,9 +285,9 @@ static unsigned int gred_drop(struct Qdisc* sch)
286 285
287 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) { 286 if (dp >= t->DPs || (q = t->tab[dp]) == NULL) {
288 if (net_ratelimit()) 287 if (net_ratelimit())
289 printk(KERN_WARNING "GRED: Unable to relocate " 288 pr_warning("GRED: Unable to relocate VQ 0x%x "
290 "VQ 0x%x while dropping, screwing up " 289 "while dropping, screwing up "
291 "backlog.\n", tc_index_to_dp(skb)); 290 "backlog.\n", tc_index_to_dp(skb));
292 } else { 291 } else {
293 q->backlog -= len; 292 q->backlog -= len;
294 q->stats.other++; 293 q->stats.other++;
@@ -308,7 +307,7 @@ static unsigned int gred_drop(struct Qdisc* sch)
308 307
309} 308}
310 309
311static void gred_reset(struct Qdisc* sch) 310static void gred_reset(struct Qdisc *sch)
312{ 311{
313 int i; 312 int i;
314 struct gred_sched *t = qdisc_priv(sch); 313 struct gred_sched *t = qdisc_priv(sch);
@@ -369,8 +368,8 @@ static inline int gred_change_table_def(struct Qdisc *sch, struct nlattr *dps)
369 368
370 for (i = table->DPs; i < MAX_DPs; i++) { 369 for (i = table->DPs; i < MAX_DPs; i++) {
371 if (table->tab[i]) { 370 if (table->tab[i]) {
372 printk(KERN_WARNING "GRED: Warning: Destroying " 371 pr_warning("GRED: Warning: Destroying "
373 "shadowed VQ 0x%x\n", i); 372 "shadowed VQ 0x%x\n", i);
374 gred_destroy_vq(table->tab[i]); 373 gred_destroy_vq(table->tab[i]);
375 table->tab[i] = NULL; 374 table->tab[i] = NULL;
376 } 375 }
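The sch_gred.c changes are style-only: brace placement, operator spacing, pr_warning() instead of raw printk(), and assignments pulled out of conditionals; the RED marking logic is untouched. Condensing the virtual-queue selection visible in gred_enqueue() above, with no additions beyond what the hunk shows:

    u16 dp = tc_index_to_dp(skb);        /* DP index carried in skb->tc_index */
    struct gred_sched_data *q = (dp < t->DPs) ? t->tab[dp] : NULL;

    if (!q) {
            dp = t->def;                 /* fall back to the default DP */
            q = t->tab[dp];              /* still NULL: pass packet through untouched */
    }

When the default DP is also unset, the packet bypasses GRED entirely, as the comment in the hunk notes.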
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 14a799de1c35..6488e6425652 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -81,8 +81,7 @@
81 * that are expensive on 32-bit architectures. 81 * that are expensive on 32-bit architectures.
82 */ 82 */
83 83
84struct internal_sc 84struct internal_sc {
85{
86 u64 sm1; /* scaled slope of the 1st segment */ 85 u64 sm1; /* scaled slope of the 1st segment */
87 u64 ism1; /* scaled inverse-slope of the 1st segment */ 86 u64 ism1; /* scaled inverse-slope of the 1st segment */
88 u64 dx; /* the x-projection of the 1st segment */ 87 u64 dx; /* the x-projection of the 1st segment */
@@ -92,8 +91,7 @@ struct internal_sc
92}; 91};
93 92
94/* runtime service curve */ 93/* runtime service curve */
95struct runtime_sc 94struct runtime_sc {
96{
97 u64 x; /* current starting position on x-axis */ 95 u64 x; /* current starting position on x-axis */
98 u64 y; /* current starting position on y-axis */ 96 u64 y; /* current starting position on y-axis */
99 u64 sm1; /* scaled slope of the 1st segment */ 97 u64 sm1; /* scaled slope of the 1st segment */
@@ -104,15 +102,13 @@ struct runtime_sc
104 u64 ism2; /* scaled inverse-slope of the 2nd segment */ 102 u64 ism2; /* scaled inverse-slope of the 2nd segment */
105}; 103};
106 104
107enum hfsc_class_flags 105enum hfsc_class_flags {
108{
109 HFSC_RSC = 0x1, 106 HFSC_RSC = 0x1,
110 HFSC_FSC = 0x2, 107 HFSC_FSC = 0x2,
111 HFSC_USC = 0x4 108 HFSC_USC = 0x4
112}; 109};
113 110
114struct hfsc_class 111struct hfsc_class {
115{
116 struct Qdisc_class_common cl_common; 112 struct Qdisc_class_common cl_common;
117 unsigned int refcnt; /* usage count */ 113 unsigned int refcnt; /* usage count */
118 114
@@ -140,8 +136,8 @@ struct hfsc_class
140 u64 cl_cumul; /* cumulative work in bytes done by 136 u64 cl_cumul; /* cumulative work in bytes done by
141 real-time criteria */ 137 real-time criteria */
142 138
143 u64 cl_d; /* deadline*/ 139 u64 cl_d; /* deadline*/
144 u64 cl_e; /* eligible time */ 140 u64 cl_e; /* eligible time */
145 u64 cl_vt; /* virtual time */ 141 u64 cl_vt; /* virtual time */
146 u64 cl_f; /* time when this class will fit for 142 u64 cl_f; /* time when this class will fit for
147 link-sharing, max(myf, cfmin) */ 143 link-sharing, max(myf, cfmin) */
@@ -176,8 +172,7 @@ struct hfsc_class
176 unsigned long cl_nactive; /* number of active children */ 172 unsigned long cl_nactive; /* number of active children */
177}; 173};
178 174
179struct hfsc_sched 175struct hfsc_sched {
180{
181 u16 defcls; /* default class id */ 176 u16 defcls; /* default class id */
182 struct hfsc_class root; /* root class */ 177 struct hfsc_class root; /* root class */
183 struct Qdisc_class_hash clhash; /* class hash */ 178 struct Qdisc_class_hash clhash; /* class hash */
@@ -693,7 +688,7 @@ init_vf(struct hfsc_class *cl, unsigned int len)
693 if (go_active) { 688 if (go_active) {
694 n = rb_last(&cl->cl_parent->vt_tree); 689 n = rb_last(&cl->cl_parent->vt_tree);
695 if (n != NULL) { 690 if (n != NULL) {
696 max_cl = rb_entry(n, struct hfsc_class,vt_node); 691 max_cl = rb_entry(n, struct hfsc_class, vt_node);
697 /* 692 /*
698 * set vt to the average of the min and max 693 * set vt to the average of the min and max
699 * classes. if the parent's period didn't 694 * classes. if the parent's period didn't
@@ -1177,8 +1172,10 @@ hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
1177 return NULL; 1172 return NULL;
1178 } 1173 }
1179#endif 1174#endif
1180 if ((cl = (struct hfsc_class *)res.class) == NULL) { 1175 cl = (struct hfsc_class *)res.class;
1181 if ((cl = hfsc_find_class(res.classid, sch)) == NULL) 1176 if (!cl) {
1177 cl = hfsc_find_class(res.classid, sch);
1178 if (!cl)
1182 break; /* filter selected invalid classid */ 1179 break; /* filter selected invalid classid */
1183 if (cl->level >= head->level) 1180 if (cl->level >= head->level)
1184 break; /* filter may only point downwards */ 1181 break; /* filter may only point downwards */
@@ -1316,7 +1313,7 @@ hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc)
1316 return -1; 1313 return -1;
1317} 1314}
1318 1315
1319static inline int 1316static int
1320hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl) 1317hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl)
1321{ 1318{
1322 if ((cl->cl_flags & HFSC_RSC) && 1319 if ((cl->cl_flags & HFSC_RSC) &&
@@ -1420,7 +1417,8 @@ hfsc_schedule_watchdog(struct Qdisc *sch)
1420 struct hfsc_class *cl; 1417 struct hfsc_class *cl;
1421 u64 next_time = 0; 1418 u64 next_time = 0;
1422 1419
1423 if ((cl = eltree_get_minel(q)) != NULL) 1420 cl = eltree_get_minel(q);
1421 if (cl)
1424 next_time = cl->cl_e; 1422 next_time = cl->cl_e;
1425 if (q->root.cl_cfmin != 0) { 1423 if (q->root.cl_cfmin != 0) {
1426 if (next_time == 0 || next_time > q->root.cl_cfmin) 1424 if (next_time == 0 || next_time > q->root.cl_cfmin)
@@ -1625,7 +1623,8 @@ hfsc_dequeue(struct Qdisc *sch)
1625 * find the class with the minimum deadline among 1623 * find the class with the minimum deadline among
1626 * the eligible classes. 1624 * the eligible classes.
1627 */ 1625 */
1628 if ((cl = eltree_get_mindl(q, cur_time)) != NULL) { 1626 cl = eltree_get_mindl(q, cur_time);
1627 if (cl) {
1629 realtime = 1; 1628 realtime = 1;
1630 } else { 1629 } else {
1631 /* 1630 /*
@@ -1664,7 +1663,7 @@ hfsc_dequeue(struct Qdisc *sch)
1664 set_passive(cl); 1663 set_passive(cl);
1665 } 1664 }
1666 1665
1667 sch->flags &= ~TCQ_F_THROTTLED; 1666 qdisc_unthrottled(sch);
1668 qdisc_bstats_update(sch, skb); 1667 qdisc_bstats_update(sch, skb);
1669 sch->q.qlen--; 1668 sch->q.qlen--;
1670 1669
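Besides the struct-brace cleanups, the sch_hfsc.c hunks split every if ((x = f()) == NULL) style assignment-in-conditional into an assignment followed by a plain test, and replace the open-coded sch->flags &= ~TCQ_F_THROTTLED with qdisc_unthrottled(); sch_htb.c and sch_netem.c below get the same treatment. The classification fallback above, condensed to show the new shape:

    cl = (struct hfsc_class *)res.class;
    if (!cl) {
            cl = hfsc_find_class(res.classid, sch);
            if (!cl)
                    break;          /* filter selected invalid classid */
    }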
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index fc12fe6f5597..e1429a85091f 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -99,9 +99,10 @@ struct htb_class {
99 struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */ 99 struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */
100 struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ 100 struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */
101 /* When class changes from state 1->2 and disconnects from 101 /* When class changes from state 1->2 and disconnects from
102 parent's feed then we lost ptr value and start from the 102 * parent's feed then we lost ptr value and start from the
103 first child again. Here we store classid of the 103 * first child again. Here we store classid of the
104 last valid ptr (used when ptr is NULL). */ 104 * last valid ptr (used when ptr is NULL).
105 */
105 u32 last_ptr_id[TC_HTB_NUMPRIO]; 106 u32 last_ptr_id[TC_HTB_NUMPRIO];
106 } inner; 107 } inner;
107 } un; 108 } un;
@@ -185,7 +186,7 @@ static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
185 * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull 186 * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull
186 * then finish and return direct queue. 187 * then finish and return direct queue.
187 */ 188 */
188#define HTB_DIRECT (struct htb_class*)-1 189#define HTB_DIRECT ((struct htb_class *)-1L)
189 190
190static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, 191static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
191 int *qerr) 192 int *qerr)
@@ -197,11 +198,13 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
197 int result; 198 int result;
198 199
199 /* allow to select class by setting skb->priority to valid classid; 200 /* allow to select class by setting skb->priority to valid classid;
200 note that nfmark can be used too by attaching filter fw with no 201 * note that nfmark can be used too by attaching filter fw with no
201 rules in it */ 202 * rules in it
203 */
202 if (skb->priority == sch->handle) 204 if (skb->priority == sch->handle)
203 return HTB_DIRECT; /* X:0 (direct flow) selected */ 205 return HTB_DIRECT; /* X:0 (direct flow) selected */
204 if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0) 206 cl = htb_find(skb->priority, sch);
207 if (cl && cl->level == 0)
205 return cl; 208 return cl;
206 209
207 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; 210 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
@@ -216,10 +219,12 @@ static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
216 return NULL; 219 return NULL;
217 } 220 }
218#endif 221#endif
219 if ((cl = (void *)res.class) == NULL) { 222 cl = (void *)res.class;
223 if (!cl) {
220 if (res.classid == sch->handle) 224 if (res.classid == sch->handle)
221 return HTB_DIRECT; /* X:0 (direct flow) */ 225 return HTB_DIRECT; /* X:0 (direct flow) */
222 if ((cl = htb_find(res.classid, sch)) == NULL) 226 cl = htb_find(res.classid, sch);
227 if (!cl)
223 break; /* filter selected invalid classid */ 228 break; /* filter selected invalid classid */
224 } 229 }
225 if (!cl->level) 230 if (!cl->level)
@@ -378,7 +383,8 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
378 383
379 if (p->un.inner.feed[prio].rb_node) 384 if (p->un.inner.feed[prio].rb_node)
380 /* parent already has its feed in use so that 385 /* parent already has its feed in use so that
381 reset bit in mask as parent is already ok */ 386 * reset bit in mask as parent is already ok
387 */
382 mask &= ~(1 << prio); 388 mask &= ~(1 << prio);
383 389
384 htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio); 390 htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
@@ -413,8 +419,9 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
413 419
414 if (p->un.inner.ptr[prio] == cl->node + prio) { 420 if (p->un.inner.ptr[prio] == cl->node + prio) {
415 /* we are removing child which is pointed to from 421 /* we are removing child which is pointed to from
416 parent feed - forget the pointer but remember 422 * parent feed - forget the pointer but remember
417 classid */ 423 * classid
424 */
418 p->un.inner.last_ptr_id[prio] = cl->common.classid; 425 p->un.inner.last_ptr_id[prio] = cl->common.classid;
419 p->un.inner.ptr[prio] = NULL; 426 p->un.inner.ptr[prio] = NULL;
420 } 427 }
@@ -663,8 +670,9 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
663 unsigned long start) 670 unsigned long start)
664{ 671{
665 /* don't run for longer than 2 jiffies; 2 is used instead of 672 /* don't run for longer than 2 jiffies; 2 is used instead of
666 1 to simplify things when jiffy is going to be incremented 673 * 1 to simplify things when jiffy is going to be incremented
667 too soon */ 674 * too soon
675 */
668 unsigned long stop_at = start + 2; 676 unsigned long stop_at = start + 2;
669 while (time_before(jiffies, stop_at)) { 677 while (time_before(jiffies, stop_at)) {
670 struct htb_class *cl; 678 struct htb_class *cl;
@@ -687,7 +695,7 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
687 695
688 /* too much load - let's continue after a break for scheduling */ 696 /* too much load - let's continue after a break for scheduling */
689 if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) { 697 if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
690 printk(KERN_WARNING "htb: too many events!\n"); 698 pr_warning("htb: too many events!\n");
691 q->warned |= HTB_WARN_TOOMANYEVENTS; 699 q->warned |= HTB_WARN_TOOMANYEVENTS;
692 } 700 }
693 701
@@ -695,7 +703,8 @@ static psched_time_t htb_do_events(struct htb_sched *q, int level,
695} 703}
696 704
697/* Returns class->node+prio from id-tree where classe's id is >= id. NULL 705/* Returns class->node+prio from id-tree where classe's id is >= id. NULL
698 is no such one exists. */ 706 * is no such one exists.
707 */
699static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n, 708static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
700 u32 id) 709 u32 id)
701{ 710{
@@ -739,12 +748,14 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
739 for (i = 0; i < 65535; i++) { 748 for (i = 0; i < 65535; i++) {
740 if (!*sp->pptr && *sp->pid) { 749 if (!*sp->pptr && *sp->pid) {
741 /* ptr was invalidated but id is valid - try to recover 750 /* ptr was invalidated but id is valid - try to recover
742 the original or next ptr */ 751 * the original or next ptr
752 */
743 *sp->pptr = 753 *sp->pptr =
744 htb_id_find_next_upper(prio, sp->root, *sp->pid); 754 htb_id_find_next_upper(prio, sp->root, *sp->pid);
745 } 755 }
746 *sp->pid = 0; /* ptr is valid now so that remove this hint as it 756 *sp->pid = 0; /* ptr is valid now so that remove this hint as it
747 can become out of date quickly */ 757 * can become out of date quickly
758 */
748 if (!*sp->pptr) { /* we are at right end; rewind & go up */ 759 if (!*sp->pptr) { /* we are at right end; rewind & go up */
749 *sp->pptr = sp->root; 760 *sp->pptr = sp->root;
750 while ((*sp->pptr)->rb_left) 761 while ((*sp->pptr)->rb_left)
@@ -772,7 +783,8 @@ static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
772} 783}
773 784
774/* dequeues packet at given priority and level; call only if 785/* dequeues packet at given priority and level; call only if
775 you are sure that there is active class at prio/level */ 786 * you are sure that there is active class at prio/level
787 */
776static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio, 788static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
777 int level) 789 int level)
778{ 790{
@@ -789,9 +801,10 @@ next:
789 return NULL; 801 return NULL;
790 802
791 /* class can be empty - it is unlikely but can be true if leaf 803 /* class can be empty - it is unlikely but can be true if leaf
792 qdisc drops packets in enqueue routine or if someone used 804 * qdisc drops packets in enqueue routine or if someone used
793 graft operation on the leaf since last dequeue; 805 * graft operation on the leaf since last dequeue;
794 simply deactivate and skip such class */ 806 * simply deactivate and skip such class
807 */
795 if (unlikely(cl->un.leaf.q->q.qlen == 0)) { 808 if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
796 struct htb_class *next; 809 struct htb_class *next;
797 htb_deactivate(q, cl); 810 htb_deactivate(q, cl);
@@ -831,7 +844,8 @@ next:
831 ptr[0]) + prio); 844 ptr[0]) + prio);
832 } 845 }
833 /* this used to be after charge_class but this constelation 846 /* this used to be after charge_class but this constelation
834 gives us slightly better performance */ 847 * gives us slightly better performance
848 */
835 if (!cl->un.leaf.q->q.qlen) 849 if (!cl->un.leaf.q->q.qlen)
836 htb_deactivate(q, cl); 850 htb_deactivate(q, cl);
837 htb_charge_class(q, cl, level, skb); 851 htb_charge_class(q, cl, level, skb);
@@ -852,7 +866,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
852 if (skb != NULL) { 866 if (skb != NULL) {
853ok: 867ok:
854 qdisc_bstats_update(sch, skb); 868 qdisc_bstats_update(sch, skb);
855 sch->flags &= ~TCQ_F_THROTTLED; 869 qdisc_unthrottled(sch);
856 sch->q.qlen--; 870 sch->q.qlen--;
857 return skb; 871 return skb;
858 } 872 }
@@ -883,6 +897,7 @@ ok:
883 m = ~q->row_mask[level]; 897 m = ~q->row_mask[level];
884 while (m != (int)(-1)) { 898 while (m != (int)(-1)) {
885 int prio = ffz(m); 899 int prio = ffz(m);
900
886 m |= 1 << prio; 901 m |= 1 << prio;
887 skb = htb_dequeue_tree(q, prio, level); 902 skb = htb_dequeue_tree(q, prio, level);
888 if (likely(skb != NULL)) 903 if (likely(skb != NULL))
@@ -987,13 +1002,12 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt)
987 return err; 1002 return err;
988 1003
989 if (tb[TCA_HTB_INIT] == NULL) { 1004 if (tb[TCA_HTB_INIT] == NULL) {
990 printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n"); 1005 pr_err("HTB: hey probably you have bad tc tool ?\n");
991 return -EINVAL; 1006 return -EINVAL;
992 } 1007 }
993 gopt = nla_data(tb[TCA_HTB_INIT]); 1008 gopt = nla_data(tb[TCA_HTB_INIT]);
994 if (gopt->version != HTB_VER >> 16) { 1009 if (gopt->version != HTB_VER >> 16) {
995 printk(KERN_ERR 1010 pr_err("HTB: need tc/htb version %d (minor is %d), you have %d\n",
996 "HTB: need tc/htb version %d (minor is %d), you have %d\n",
997 HTB_VER >> 16, HTB_VER & 0xffff, gopt->version); 1011 HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
998 return -EINVAL; 1012 return -EINVAL;
999 } 1013 }
@@ -1206,9 +1220,10 @@ static void htb_destroy(struct Qdisc *sch)
1206 cancel_work_sync(&q->work); 1220 cancel_work_sync(&q->work);
1207 qdisc_watchdog_cancel(&q->watchdog); 1221 qdisc_watchdog_cancel(&q->watchdog);
1208 /* This line used to be after htb_destroy_class call below 1222 /* This line used to be after htb_destroy_class call below
1209 and surprisingly it worked in 2.4. But it must precede it 1223 * and surprisingly it worked in 2.4. But it must precede it
1210 because filter need its target class alive to be able to call 1224 * because filter need its target class alive to be able to call
1211 unbind_filter on it (without Oops). */ 1225 * unbind_filter on it (without Oops).
1226 */
1212 tcf_destroy_chain(&q->filter_list); 1227 tcf_destroy_chain(&q->filter_list);
1213 1228
1214 for (i = 0; i < q->clhash.hashsize; i++) { 1229 for (i = 0; i < q->clhash.hashsize; i++) {
@@ -1342,11 +1357,12 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
1342 1357
1343 /* check maximal depth */ 1358 /* check maximal depth */
1344 if (parent && parent->parent && parent->parent->level < 2) { 1359 if (parent && parent->parent && parent->parent->level < 2) {
1345 printk(KERN_ERR "htb: tree is too deep\n"); 1360 pr_err("htb: tree is too deep\n");
1346 goto failure; 1361 goto failure;
1347 } 1362 }
1348 err = -ENOBUFS; 1363 err = -ENOBUFS;
1349 if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL) 1364 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1365 if (!cl)
1350 goto failure; 1366 goto failure;
1351 1367
1352 err = gen_new_estimator(&cl->bstats, &cl->rate_est, 1368 err = gen_new_estimator(&cl->bstats, &cl->rate_est,
@@ -1366,8 +1382,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
1366 RB_CLEAR_NODE(&cl->node[prio]); 1382 RB_CLEAR_NODE(&cl->node[prio]);
1367 1383
1368 /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) 1384 /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1369 so that can't be used inside of sch_tree_lock 1385 * so that can't be used inside of sch_tree_lock
1370 -- thanks to Karlis Peisenieks */ 1386 * -- thanks to Karlis Peisenieks
1387 */
1371 new_q = qdisc_create_dflt(sch->dev_queue, 1388 new_q = qdisc_create_dflt(sch->dev_queue,
1372 &pfifo_qdisc_ops, classid); 1389 &pfifo_qdisc_ops, classid);
1373 sch_tree_lock(sch); 1390 sch_tree_lock(sch);
@@ -1419,17 +1436,18 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
1419 } 1436 }
1420 1437
1421 /* it used to be a nasty bug here, we have to check that node 1438 /* it used to be a nasty bug here, we have to check that node
1422 is really leaf before changing cl->un.leaf ! */ 1439 * is really leaf before changing cl->un.leaf !
1440 */
1423 if (!cl->level) { 1441 if (!cl->level) {
1424 cl->quantum = rtab->rate.rate / q->rate2quantum; 1442 cl->quantum = rtab->rate.rate / q->rate2quantum;
1425 if (!hopt->quantum && cl->quantum < 1000) { 1443 if (!hopt->quantum && cl->quantum < 1000) {
1426 printk(KERN_WARNING 1444 pr_warning(
1427 "HTB: quantum of class %X is small. Consider r2q change.\n", 1445 "HTB: quantum of class %X is small. Consider r2q change.\n",
1428 cl->common.classid); 1446 cl->common.classid);
1429 cl->quantum = 1000; 1447 cl->quantum = 1000;
1430 } 1448 }
1431 if (!hopt->quantum && cl->quantum > 200000) { 1449 if (!hopt->quantum && cl->quantum > 200000) {
1432 printk(KERN_WARNING 1450 pr_warning(
1433 "HTB: quantum of class %X is big. Consider r2q change.\n", 1451 "HTB: quantum of class %X is big. Consider r2q change.\n",
1434 cl->common.classid); 1452 cl->common.classid);
1435 cl->quantum = 200000; 1453 cl->quantum = 200000;
@@ -1478,13 +1496,13 @@ static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1478 struct htb_class *cl = htb_find(classid, sch); 1496 struct htb_class *cl = htb_find(classid, sch);
1479 1497
1480 /*if (cl && !cl->level) return 0; 1498 /*if (cl && !cl->level) return 0;
1481 The line above used to be there to prevent attaching filters to 1499 * The line above used to be there to prevent attaching filters to
1482 leaves. But at least tc_index filter uses this just to get class 1500 * leaves. But at least tc_index filter uses this just to get class
1483 for other reasons so that we have to allow for it. 1501 * for other reasons so that we have to allow for it.
1484 ---- 1502 * ----
1485 19.6.2002 As Werner explained it is ok - bind filter is just 1503 * 19.6.2002 As Werner explained it is ok - bind filter is just
1486 another way to "lock" the class - unlike "get" this lock can 1504 * another way to "lock" the class - unlike "get" this lock can
1487 be broken by class during destroy IIUC. 1505 * be broken by class during destroy IIUC.
1488 */ 1506 */
1489 if (cl) 1507 if (cl)
1490 cl->filter_cnt++; 1508 cl->filter_cnt++;
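The sch_htb.c changes reflow the old comments into kernel block-comment style, switch printk(KERN_ERR/KERN_WARNING ...) to pr_err()/pr_warning(), and tighten HTB_DIRECT from (struct htb_class*)-1 to ((struct htb_class *)-1L). The outer parentheses make the macro safe to drop into any surrounding expression and the L suffix matches pointer width on 64-bit builds; that is a plausible motivation, the patch itself does not state one. How the sentinel is consumed, condensed from htb_classify() and its enqueue-side caller (the caller is not in these hunks, so the branch bodies are schematic):

    cl = htb_classify(skb, sch, &ret);
    if (cl == HTB_DIRECT) {
            /* unclassified traffic: goes to the qdisc's direct queue */
    } else if (!cl) {
            /* filter error or invalid classid: drop, returning ret */
    }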
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index ecc302f4d2a1..ec5cbc848963 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -61,7 +61,6 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
61 TC_H_MIN(ntx + 1))); 61 TC_H_MIN(ntx + 1)));
62 if (qdisc == NULL) 62 if (qdisc == NULL)
63 goto err; 63 goto err;
64 qdisc->flags |= TCQ_F_CAN_BYPASS;
65 priv->qdiscs[ntx] = qdisc; 64 priv->qdiscs[ntx] = qdisc;
66 } 65 }
67 66
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
new file mode 100644
index 000000000000..effd4ee0e880
--- /dev/null
+++ b/net/sched/sch_mqprio.c
@@ -0,0 +1,416 @@
1/*
2 * net/sched/sch_mqprio.c
3 *
4 * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * version 2 as published by the Free Software Foundation.
9 */
10
11#include <linux/types.h>
12#include <linux/slab.h>
13#include <linux/kernel.h>
14#include <linux/string.h>
15#include <linux/errno.h>
16#include <linux/skbuff.h>
17#include <net/netlink.h>
18#include <net/pkt_sched.h>
19#include <net/sch_generic.h>
20
21struct mqprio_sched {
22 struct Qdisc **qdiscs;
23 int hw_owned;
24};
25
26static void mqprio_destroy(struct Qdisc *sch)
27{
28 struct net_device *dev = qdisc_dev(sch);
29 struct mqprio_sched *priv = qdisc_priv(sch);
30 unsigned int ntx;
31
32 if (!priv->qdiscs)
33 return;
34
35 for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
36 qdisc_destroy(priv->qdiscs[ntx]);
37
38 if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
39 dev->netdev_ops->ndo_setup_tc(dev, 0);
40 else
41 netdev_set_num_tc(dev, 0);
42
43 kfree(priv->qdiscs);
44}
45
46static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
47{
48 int i, j;
49
50 /* Verify num_tc is not out of max range */
51 if (qopt->num_tc > TC_MAX_QUEUE)
52 return -EINVAL;
53
54 /* Verify priority mapping uses valid tcs */
55 for (i = 0; i < TC_BITMASK + 1; i++) {
56 if (qopt->prio_tc_map[i] >= qopt->num_tc)
57 return -EINVAL;
58 }
59
60 /* net_device does not support requested operation */
61 if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
62 return -EINVAL;
63
64 /* if hw owned qcount and qoffset are taken from LLD so
65 * no reason to verify them here
66 */
67 if (qopt->hw)
68 return 0;
69
70 for (i = 0; i < qopt->num_tc; i++) {
71 unsigned int last = qopt->offset[i] + qopt->count[i];
72
73 /* Verify the queue count is in tx range being equal to the
74 * real_num_tx_queues indicates the last queue is in use.
75 */
76 if (qopt->offset[i] >= dev->real_num_tx_queues ||
77 !qopt->count[i] ||
78 last > dev->real_num_tx_queues)
79 return -EINVAL;
80
81 /* Verify that the offset and counts do not overlap */
82 for (j = i + 1; j < qopt->num_tc; j++) {
83 if (last > qopt->offset[j])
84 return -EINVAL;
85 }
86 }
87
88 return 0;
89}
90
91static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
92{
93 struct net_device *dev = qdisc_dev(sch);
94 struct mqprio_sched *priv = qdisc_priv(sch);
95 struct netdev_queue *dev_queue;
96 struct Qdisc *qdisc;
97 int i, err = -EOPNOTSUPP;
98 struct tc_mqprio_qopt *qopt = NULL;
99
100 BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
101 BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);
102
103 if (sch->parent != TC_H_ROOT)
104 return -EOPNOTSUPP;
105
106 if (!netif_is_multiqueue(dev))
107 return -EOPNOTSUPP;
108
109 if (nla_len(opt) < sizeof(*qopt))
110 return -EINVAL;
111
112 qopt = nla_data(opt);
113 if (mqprio_parse_opt(dev, qopt))
114 return -EINVAL;
115
116 /* pre-allocate qdisc, attachment can't fail */
117 priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
118 GFP_KERNEL);
119 if (priv->qdiscs == NULL) {
120 err = -ENOMEM;
121 goto err;
122 }
123
124 for (i = 0; i < dev->num_tx_queues; i++) {
125 dev_queue = netdev_get_tx_queue(dev, i);
126 qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops,
127 TC_H_MAKE(TC_H_MAJ(sch->handle),
128 TC_H_MIN(i + 1)));
129 if (qdisc == NULL) {
130 err = -ENOMEM;
131 goto err;
132 }
133 priv->qdiscs[i] = qdisc;
134 }
135
136 /* If the mqprio options indicate that hardware should own
137 * the queue mapping then run ndo_setup_tc otherwise use the
138 * supplied and verified mapping
139 */
140 if (qopt->hw) {
141 priv->hw_owned = 1;
142 err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
143 if (err)
144 goto err;
145 } else {
146 netdev_set_num_tc(dev, qopt->num_tc);
147 for (i = 0; i < qopt->num_tc; i++)
148 netdev_set_tc_queue(dev, i,
149 qopt->count[i], qopt->offset[i]);
150 }
151
152 /* Always use supplied priority mappings */
153 for (i = 0; i < TC_BITMASK + 1; i++)
154 netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);
155
156 sch->flags |= TCQ_F_MQROOT;
157 return 0;
158
159err:
160 mqprio_destroy(sch);
161 return err;
162}
163
164static void mqprio_attach(struct Qdisc *sch)
165{
166 struct net_device *dev = qdisc_dev(sch);
167 struct mqprio_sched *priv = qdisc_priv(sch);
168 struct Qdisc *qdisc;
169 unsigned int ntx;
170
171 /* Attach underlying qdisc */
172 for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
173 qdisc = priv->qdiscs[ntx];
174 qdisc = dev_graft_qdisc(qdisc->dev_queue, qdisc);
175 if (qdisc)
176 qdisc_destroy(qdisc);
177 }
178 kfree(priv->qdiscs);
179 priv->qdiscs = NULL;
180}
181
182static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
183 unsigned long cl)
184{
185 struct net_device *dev = qdisc_dev(sch);
186 unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);
187
188 if (ntx >= dev->num_tx_queues)
189 return NULL;
190 return netdev_get_tx_queue(dev, ntx);
191}
192
193static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
194 struct Qdisc **old)
195{
196 struct net_device *dev = qdisc_dev(sch);
197 struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
198
199 if (!dev_queue)
200 return -EINVAL;
201
202 if (dev->flags & IFF_UP)
203 dev_deactivate(dev);
204
205 *old = dev_graft_qdisc(dev_queue, new);
206
207 if (dev->flags & IFF_UP)
208 dev_activate(dev);
209
210 return 0;
211}
212
213static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
214{
215 struct net_device *dev = qdisc_dev(sch);
216 struct mqprio_sched *priv = qdisc_priv(sch);
217 unsigned char *b = skb_tail_pointer(skb);
218 struct tc_mqprio_qopt opt = { 0 };
219 struct Qdisc *qdisc;
220 unsigned int i;
221
222 sch->q.qlen = 0;
223 memset(&sch->bstats, 0, sizeof(sch->bstats));
224 memset(&sch->qstats, 0, sizeof(sch->qstats));
225
226 for (i = 0; i < dev->num_tx_queues; i++) {
227 qdisc = netdev_get_tx_queue(dev, i)->qdisc;
228 spin_lock_bh(qdisc_lock(qdisc));
229 sch->q.qlen += qdisc->q.qlen;
230 sch->bstats.bytes += qdisc->bstats.bytes;
231 sch->bstats.packets += qdisc->bstats.packets;
232 sch->qstats.qlen += qdisc->qstats.qlen;
233 sch->qstats.backlog += qdisc->qstats.backlog;
234 sch->qstats.drops += qdisc->qstats.drops;
235 sch->qstats.requeues += qdisc->qstats.requeues;
236 sch->qstats.overlimits += qdisc->qstats.overlimits;
237 spin_unlock_bh(qdisc_lock(qdisc));
238 }
239
240 opt.num_tc = netdev_get_num_tc(dev);
241 memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
242 opt.hw = priv->hw_owned;
243
244 for (i = 0; i < netdev_get_num_tc(dev); i++) {
245 opt.count[i] = dev->tc_to_txq[i].count;
246 opt.offset[i] = dev->tc_to_txq[i].offset;
247 }
248
249 NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
250
251 return skb->len;
252nla_put_failure:
253 nlmsg_trim(skb, b);
254 return -1;
255}
256
257static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
258{
259 struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
260
261 if (!dev_queue)
262 return NULL;
263
264 return dev_queue->qdisc_sleeping;
265}
266
267static unsigned long mqprio_get(struct Qdisc *sch, u32 classid)
268{
269 struct net_device *dev = qdisc_dev(sch);
270 unsigned int ntx = TC_H_MIN(classid);
271
272 if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev))
273 return 0;
274 return ntx;
275}
276
277static void mqprio_put(struct Qdisc *sch, unsigned long cl)
278{
279}
280
281static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
282 struct sk_buff *skb, struct tcmsg *tcm)
283{
284 struct net_device *dev = qdisc_dev(sch);
285
286 if (cl <= netdev_get_num_tc(dev)) {
287 tcm->tcm_parent = TC_H_ROOT;
288 tcm->tcm_info = 0;
289 } else {
290 int i;
291 struct netdev_queue *dev_queue;
292
293 dev_queue = mqprio_queue_get(sch, cl);
294 tcm->tcm_parent = 0;
295 for (i = 0; i < netdev_get_num_tc(dev); i++) {
296 struct netdev_tc_txq tc = dev->tc_to_txq[i];
297 int q_idx = cl - netdev_get_num_tc(dev);
298
299 if (q_idx > tc.offset &&
300 q_idx <= tc.offset + tc.count) {
301 tcm->tcm_parent =
302 TC_H_MAKE(TC_H_MAJ(sch->handle),
303 TC_H_MIN(i + 1));
304 break;
305 }
306 }
307 tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
308 }
309 tcm->tcm_handle |= TC_H_MIN(cl);
310 return 0;
311}
312
313static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
314 struct gnet_dump *d)
315{
316 struct net_device *dev = qdisc_dev(sch);
317
318 if (cl <= netdev_get_num_tc(dev)) {
319 int i;
320 struct Qdisc *qdisc;
321 struct gnet_stats_queue qstats = {0};
322 struct gnet_stats_basic_packed bstats = {0};
323 struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];
324
325 /* Drop lock here it will be reclaimed before touching
326 * statistics this is required because the d->lock we
327 * hold here is the look on dev_queue->qdisc_sleeping
328 * also acquired below.
329 */
330 spin_unlock_bh(d->lock);
331
332 for (i = tc.offset; i < tc.offset + tc.count; i++) {
333 qdisc = netdev_get_tx_queue(dev, i)->qdisc;
334 spin_lock_bh(qdisc_lock(qdisc));
335 bstats.bytes += qdisc->bstats.bytes;
336 bstats.packets += qdisc->bstats.packets;
337 qstats.qlen += qdisc->qstats.qlen;
338 qstats.backlog += qdisc->qstats.backlog;
339 qstats.drops += qdisc->qstats.drops;
340 qstats.requeues += qdisc->qstats.requeues;
341 qstats.overlimits += qdisc->qstats.overlimits;
342 spin_unlock_bh(qdisc_lock(qdisc));
343 }
344 /* Reclaim root sleeping lock before completing stats */
345 spin_lock_bh(d->lock);
346 if (gnet_stats_copy_basic(d, &bstats) < 0 ||
347 gnet_stats_copy_queue(d, &qstats) < 0)
348 return -1;
349 } else {
350 struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);
351
352 sch = dev_queue->qdisc_sleeping;
353 sch->qstats.qlen = sch->q.qlen;
354 if (gnet_stats_copy_basic(d, &sch->bstats) < 0 ||
355 gnet_stats_copy_queue(d, &sch->qstats) < 0)
356 return -1;
357 }
358 return 0;
359}
360
361static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
362{
363 struct net_device *dev = qdisc_dev(sch);
364 unsigned long ntx;
365
366 if (arg->stop)
367 return;
368
369 /* Walk hierarchy with a virtual class per tc */
370 arg->count = arg->skip;
371 for (ntx = arg->skip;
372 ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
373 ntx++) {
374 if (arg->fn(sch, ntx + 1, arg) < 0) {
375 arg->stop = 1;
376 break;
377 }
378 arg->count++;
379 }
380}
381
382static const struct Qdisc_class_ops mqprio_class_ops = {
383 .graft = mqprio_graft,
384 .leaf = mqprio_leaf,
385 .get = mqprio_get,
386 .put = mqprio_put,
387 .walk = mqprio_walk,
388 .dump = mqprio_dump_class,
389 .dump_stats = mqprio_dump_class_stats,
390};
391
392struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
393 .cl_ops = &mqprio_class_ops,
394 .id = "mqprio",
395 .priv_size = sizeof(struct mqprio_sched),
396 .init = mqprio_init,
397 .destroy = mqprio_destroy,
398 .attach = mqprio_attach,
399 .dump = mqprio_dump,
400 .owner = THIS_MODULE,
401};
402
403static int __init mqprio_module_init(void)
404{
405 return register_qdisc(&mqprio_qdisc_ops);
406}
407
408static void __exit mqprio_module_exit(void)
409{
410 unregister_qdisc(&mqprio_qdisc_ops);
411}
412
413module_init(mqprio_module_init);
414module_exit(mqprio_module_exit);
415
416MODULE_LICENSE("GPL");
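sch_mqprio.c is the one genuinely new piece in this batch: a classful root qdisc for multiqueue devices that splits the TX queues into up to TC_MAX_QUEUE traffic classes, maps the 16 skb priority values onto those classes with netdev_set_prio_tc_map(), and either pushes the class-to-queue layout into the driver via ndo_setup_tc() (hw owned) or installs it in software with netdev_set_num_tc()/netdev_set_tc_queue(). Each TX queue keeps a pfifo_fast child created in mqprio_init() and grafted in mqprio_attach(). A simplified sketch of how the stack later consumes that mapping, using the fields this file itself touches plus netdev_get_prio_tc_map(); the flow-hash step is schematic, not the exact queue-selection code:

    /* skb priority -> traffic class -> contiguous queue range (sketch) */
    int tc = netdev_get_prio_tc_map(dev, skb->priority & TC_BITMASK);
    struct netdev_tc_txq txq = dev->tc_to_txq[tc];
    u16 queue = txq.offset + (flow_hash % txq.count);   /* flow_hash: per-flow hash */

From userspace this would be driven by something like "tc qdisc add dev eth0 root handle 1: mqprio num_tc 3 map 0 0 0 0 1 1 1 1 2 2 2 2 2 2 2 2 queues 2@0 2@2 4@4 hw 0", assuming an iproute2 build that already knows the mqprio options and a NIC exposing at least eight TX queues.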
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 436a2e75b322..edc1950e0e77 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -156,7 +156,7 @@ static unsigned int multiq_drop(struct Qdisc *sch)
156 unsigned int len; 156 unsigned int len;
157 struct Qdisc *qdisc; 157 struct Qdisc *qdisc;
158 158
159 for (band = q->bands-1; band >= 0; band--) { 159 for (band = q->bands - 1; band >= 0; band--) {
160 qdisc = q->queues[band]; 160 qdisc = q->queues[band];
161 if (qdisc->ops->drop) { 161 if (qdisc->ops->drop) {
162 len = qdisc->ops->drop(qdisc); 162 len = qdisc->ops->drop(qdisc);
@@ -265,7 +265,7 @@ static int multiq_init(struct Qdisc *sch, struct nlattr *opt)
265 for (i = 0; i < q->max_bands; i++) 265 for (i = 0; i < q->max_bands; i++)
266 q->queues[i] = &noop_qdisc; 266 q->queues[i] = &noop_qdisc;
267 267
268 err = multiq_tune(sch,opt); 268 err = multiq_tune(sch, opt);
269 269
270 if (err) 270 if (err)
271 kfree(q->queues); 271 kfree(q->queues);
@@ -346,7 +346,7 @@ static int multiq_dump_class(struct Qdisc *sch, unsigned long cl,
346 struct multiq_sched_data *q = qdisc_priv(sch); 346 struct multiq_sched_data *q = qdisc_priv(sch);
347 347
348 tcm->tcm_handle |= TC_H_MIN(cl); 348 tcm->tcm_handle |= TC_H_MIN(cl);
349 tcm->tcm_info = q->queues[cl-1]->handle; 349 tcm->tcm_info = q->queues[cl - 1]->handle;
350 return 0; 350 return 0;
351} 351}
352 352
@@ -378,7 +378,7 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
378 arg->count++; 378 arg->count++;
379 continue; 379 continue;
380 } 380 }
381 if (arg->fn(sch, band+1, arg) < 0) { 381 if (arg->fn(sch, band + 1, arg) < 0) {
382 arg->stop = 1; 382 arg->stop = 1;
383 break; 383 break;
384 } 384 }
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 6a3006b38dc5..64f0d3293b49 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -211,8 +211,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
211 } 211 }
212 212
213 cb = netem_skb_cb(skb); 213 cb = netem_skb_cb(skb);
214 if (q->gap == 0 || /* not doing reordering */ 214 if (q->gap == 0 || /* not doing reordering */
215 q->counter < q->gap || /* inside last reordering gap */ 215 q->counter < q->gap || /* inside last reordering gap */
216 q->reorder < get_crandom(&q->reorder_cor)) { 216 q->reorder < get_crandom(&q->reorder_cor)) {
217 psched_time_t now; 217 psched_time_t now;
218 psched_tdiff_t delay; 218 psched_tdiff_t delay;
@@ -248,7 +248,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch)
248 return ret; 248 return ret;
249} 249}
250 250
251static unsigned int netem_drop(struct Qdisc* sch) 251static unsigned int netem_drop(struct Qdisc *sch)
252{ 252{
253 struct netem_sched_data *q = qdisc_priv(sch); 253 struct netem_sched_data *q = qdisc_priv(sch);
254 unsigned int len = 0; 254 unsigned int len = 0;
@@ -265,7 +265,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
265 struct netem_sched_data *q = qdisc_priv(sch); 265 struct netem_sched_data *q = qdisc_priv(sch);
266 struct sk_buff *skb; 266 struct sk_buff *skb;
267 267
268 if (sch->flags & TCQ_F_THROTTLED) 268 if (qdisc_is_throttled(sch))
269 return NULL; 269 return NULL;
270 270
271 skb = q->qdisc->ops->peek(q->qdisc); 271 skb = q->qdisc->ops->peek(q->qdisc);
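sch_netem.c picks up the same accessor conversion seen in hfsc and htb: qdisc_is_throttled(sch) instead of testing TCQ_F_THROTTLED by hand. The helpers themselves are not part of the hunks shown; as an assumption about their shape only, they would be thin wrappers along these lines (reconstruction, not the patch's code):

    static inline bool qdisc_is_throttled(const struct Qdisc *qdisc)
    {
            return qdisc->flags & TCQ_F_THROTTLED;
    }

    static inline void qdisc_unthrottled(struct Qdisc *qdisc)
    {
            qdisc->flags &= ~TCQ_F_THROTTLED;
    }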
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index fbd710d619bf..2a318f2dc3e5 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -22,8 +22,7 @@
22#include <net/pkt_sched.h> 22#include <net/pkt_sched.h>
23 23
24 24
25struct prio_sched_data 25struct prio_sched_data {
26{
27 int bands; 26 int bands;
28 struct tcf_proto *filter_list; 27 struct tcf_proto *filter_list;
29 u8 prio2band[TC_PRIO_MAX+1]; 28 u8 prio2band[TC_PRIO_MAX+1];
@@ -54,7 +53,7 @@ prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
54 if (!q->filter_list || err < 0) { 53 if (!q->filter_list || err < 0) {
55 if (TC_H_MAJ(band)) 54 if (TC_H_MAJ(band))
56 band = 0; 55 band = 0;
57 return q->queues[q->prio2band[band&TC_PRIO_MAX]]; 56 return q->queues[q->prio2band[band & TC_PRIO_MAX]];
58 } 57 }
59 band = res.classid; 58 band = res.classid;
60 } 59 }
@@ -106,7 +105,7 @@ static struct sk_buff *prio_peek(struct Qdisc *sch)
106 return NULL; 105 return NULL;
107} 106}
108 107
109static struct sk_buff *prio_dequeue(struct Qdisc* sch) 108static struct sk_buff *prio_dequeue(struct Qdisc *sch)
110{ 109{
111 struct prio_sched_data *q = qdisc_priv(sch); 110 struct prio_sched_data *q = qdisc_priv(sch);
112 int prio; 111 int prio;
@@ -124,7 +123,7 @@ static struct sk_buff *prio_dequeue(struct Qdisc* sch)
124 123
125} 124}
126 125
127static unsigned int prio_drop(struct Qdisc* sch) 126static unsigned int prio_drop(struct Qdisc *sch)
128{ 127{
129 struct prio_sched_data *q = qdisc_priv(sch); 128 struct prio_sched_data *q = qdisc_priv(sch);
130 int prio; 129 int prio;
@@ -143,24 +142,24 @@ static unsigned int prio_drop(struct Qdisc* sch)
143 142
144 143
145static void 144static void
146prio_reset(struct Qdisc* sch) 145prio_reset(struct Qdisc *sch)
147{ 146{
148 int prio; 147 int prio;
149 struct prio_sched_data *q = qdisc_priv(sch); 148 struct prio_sched_data *q = qdisc_priv(sch);
150 149
151 for (prio=0; prio<q->bands; prio++) 150 for (prio = 0; prio < q->bands; prio++)
152 qdisc_reset(q->queues[prio]); 151 qdisc_reset(q->queues[prio]);
153 sch->q.qlen = 0; 152 sch->q.qlen = 0;
154} 153}
155 154
156static void 155static void
157prio_destroy(struct Qdisc* sch) 156prio_destroy(struct Qdisc *sch)
158{ 157{
159 int prio; 158 int prio;
160 struct prio_sched_data *q = qdisc_priv(sch); 159 struct prio_sched_data *q = qdisc_priv(sch);
161 160
162 tcf_destroy_chain(&q->filter_list); 161 tcf_destroy_chain(&q->filter_list);
163 for (prio=0; prio<q->bands; prio++) 162 for (prio = 0; prio < q->bands; prio++)
164 qdisc_destroy(q->queues[prio]); 163 qdisc_destroy(q->queues[prio]);
165} 164}
166 165
@@ -177,7 +176,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
177 if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2) 176 if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2)
178 return -EINVAL; 177 return -EINVAL;
179 178
180 for (i=0; i<=TC_PRIO_MAX; i++) { 179 for (i = 0; i <= TC_PRIO_MAX; i++) {
181 if (qopt->priomap[i] >= qopt->bands) 180 if (qopt->priomap[i] >= qopt->bands)
182 return -EINVAL; 181 return -EINVAL;
183 } 182 }
@@ -186,7 +185,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
186 q->bands = qopt->bands; 185 q->bands = qopt->bands;
187 memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); 186 memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1);
188 187
189 for (i=q->bands; i<TCQ_PRIO_BANDS; i++) { 188 for (i = q->bands; i < TCQ_PRIO_BANDS; i++) {
190 struct Qdisc *child = q->queues[i]; 189 struct Qdisc *child = q->queues[i];
191 q->queues[i] = &noop_qdisc; 190 q->queues[i] = &noop_qdisc;
192 if (child != &noop_qdisc) { 191 if (child != &noop_qdisc) {
@@ -196,9 +195,10 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
196 } 195 }
197 sch_tree_unlock(sch); 196 sch_tree_unlock(sch);
198 197
199 for (i=0; i<q->bands; i++) { 198 for (i = 0; i < q->bands; i++) {
200 if (q->queues[i] == &noop_qdisc) { 199 if (q->queues[i] == &noop_qdisc) {
201 struct Qdisc *child, *old; 200 struct Qdisc *child, *old;
201
202 child = qdisc_create_dflt(sch->dev_queue, 202 child = qdisc_create_dflt(sch->dev_queue,
203 &pfifo_qdisc_ops, 203 &pfifo_qdisc_ops,
204 TC_H_MAKE(sch->handle, i + 1)); 204 TC_H_MAKE(sch->handle, i + 1));
@@ -224,7 +224,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)
224 struct prio_sched_data *q = qdisc_priv(sch); 224 struct prio_sched_data *q = qdisc_priv(sch);
225 int i; 225 int i;
226 226
227 for (i=0; i<TCQ_PRIO_BANDS; i++) 227 for (i = 0; i < TCQ_PRIO_BANDS; i++)
228 q->queues[i] = &noop_qdisc; 228 q->queues[i] = &noop_qdisc;
229 229
230 if (opt == NULL) { 230 if (opt == NULL) {
@@ -232,7 +232,7 @@ static int prio_init(struct Qdisc *sch, struct nlattr *opt)
232 } else { 232 } else {
233 int err; 233 int err;
234 234
235 if ((err= prio_tune(sch, opt)) != 0) 235 if ((err = prio_tune(sch, opt)) != 0)
236 return err; 236 return err;
237 } 237 }
238 return 0; 238 return 0;
@@ -245,7 +245,7 @@ static int prio_dump(struct Qdisc *sch, struct sk_buff *skb)
245 struct tc_prio_qopt opt; 245 struct tc_prio_qopt opt;
246 246
247 opt.bands = q->bands; 247 opt.bands = q->bands;
248 memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); 248 memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX + 1);
249 249
250 NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); 250 NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
251 251
@@ -342,7 +342,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
342 arg->count++; 342 arg->count++;
343 continue; 343 continue;
344 } 344 }
345 if (arg->fn(sch, prio+1, arg) < 0) { 345 if (arg->fn(sch, prio + 1, arg) < 0) {
346 arg->stop = 1; 346 arg->stop = 1;
347 break; 347 break;
348 } 348 }
@@ -350,7 +350,7 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
350 } 350 }
351} 351}
352 352
353static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) 353static struct tcf_proto **prio_find_tcf(struct Qdisc *sch, unsigned long cl)
354{ 354{
355 struct prio_sched_data *q = qdisc_priv(sch); 355 struct prio_sched_data *q = qdisc_priv(sch);
356 356
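The sch_prio.c hunks are whitespace and loop-style cleanups (spaces around operators, TC_PRIO_MAX + 1); behaviour is unchanged. For orientation, the band variable entering prio_classify() above is initialised from skb->priority earlier in the function, outside these hunks, so with no filters attached and ignoring the TC_H_MAJ() special case visible in the hunk, the band is simply the priomap entry for the packet's priority:

    band = q->prio2band[skb->priority & TC_PRIO_MAX];
    return q->queues[band];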
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 9f98dbd32d4c..6649463da1b6 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -36,8 +36,7 @@
36 if RED works correctly. 36 if RED works correctly.
37 */ 37 */
38 38
39struct red_sched_data 39struct red_sched_data {
40{
41 u32 limit; /* HARD maximal queue length */ 40 u32 limit; /* HARD maximal queue length */
42 unsigned char flags; 41 unsigned char flags;
43 struct red_parms parms; 42 struct red_parms parms;
@@ -55,7 +54,7 @@ static inline int red_use_harddrop(struct red_sched_data *q)
55 return q->flags & TC_RED_HARDDROP; 54 return q->flags & TC_RED_HARDDROP;
56} 55}
57 56
58static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch) 57static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch)
59{ 58{
60 struct red_sched_data *q = qdisc_priv(sch); 59 struct red_sched_data *q = qdisc_priv(sch);
61 struct Qdisc *child = q->qdisc; 60 struct Qdisc *child = q->qdisc;
@@ -67,29 +66,29 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc* sch)
67 red_end_of_idle_period(&q->parms); 66 red_end_of_idle_period(&q->parms);
68 67
69 switch (red_action(&q->parms, q->parms.qavg)) { 68 switch (red_action(&q->parms, q->parms.qavg)) {
70 case RED_DONT_MARK: 69 case RED_DONT_MARK:
71 break; 70 break;
72 71
73 case RED_PROB_MARK: 72 case RED_PROB_MARK:
74 sch->qstats.overlimits++; 73 sch->qstats.overlimits++;
75 if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) { 74 if (!red_use_ecn(q) || !INET_ECN_set_ce(skb)) {
76 q->stats.prob_drop++; 75 q->stats.prob_drop++;
77 goto congestion_drop; 76 goto congestion_drop;
78 } 77 }
79 78
80 q->stats.prob_mark++; 79 q->stats.prob_mark++;
81 break; 80 break;
82 81
83 case RED_HARD_MARK: 82 case RED_HARD_MARK:
84 sch->qstats.overlimits++; 83 sch->qstats.overlimits++;
85 if (red_use_harddrop(q) || !red_use_ecn(q) || 84 if (red_use_harddrop(q) || !red_use_ecn(q) ||
86 !INET_ECN_set_ce(skb)) { 85 !INET_ECN_set_ce(skb)) {
87 q->stats.forced_drop++; 86 q->stats.forced_drop++;
88 goto congestion_drop; 87 goto congestion_drop;
89 } 88 }
90 89
91 q->stats.forced_mark++; 90 q->stats.forced_mark++;
92 break; 91 break;
93 } 92 }
94 93
95 ret = qdisc_enqueue(skb, child); 94 ret = qdisc_enqueue(skb, child);
@@ -106,7 +105,7 @@ congestion_drop:
106 return NET_XMIT_CN; 105 return NET_XMIT_CN;
107} 106}
108 107
109static struct sk_buff * red_dequeue(struct Qdisc* sch) 108static struct sk_buff *red_dequeue(struct Qdisc *sch)
110{ 109{
111 struct sk_buff *skb; 110 struct sk_buff *skb;
112 struct red_sched_data *q = qdisc_priv(sch); 111 struct red_sched_data *q = qdisc_priv(sch);
@@ -123,7 +122,7 @@ static struct sk_buff * red_dequeue(struct Qdisc* sch)
123 return skb; 122 return skb;
124} 123}
125 124
126static struct sk_buff * red_peek(struct Qdisc* sch) 125static struct sk_buff *red_peek(struct Qdisc *sch)
127{ 126{
128 struct red_sched_data *q = qdisc_priv(sch); 127 struct red_sched_data *q = qdisc_priv(sch);
129 struct Qdisc *child = q->qdisc; 128 struct Qdisc *child = q->qdisc;
@@ -131,7 +130,7 @@ static struct sk_buff * red_peek(struct Qdisc* sch)
131 return child->ops->peek(child); 130 return child->ops->peek(child);
132} 131}
133 132
134static unsigned int red_drop(struct Qdisc* sch) 133static unsigned int red_drop(struct Qdisc *sch)
135{ 134{
136 struct red_sched_data *q = qdisc_priv(sch); 135 struct red_sched_data *q = qdisc_priv(sch);
137 struct Qdisc *child = q->qdisc; 136 struct Qdisc *child = q->qdisc;
@@ -150,7 +149,7 @@ static unsigned int red_drop(struct Qdisc* sch)
150 return 0; 149 return 0;
151} 150}
152 151
153static void red_reset(struct Qdisc* sch) 152static void red_reset(struct Qdisc *sch)
154{ 153{
155 struct red_sched_data *q = qdisc_priv(sch); 154 struct red_sched_data *q = qdisc_priv(sch);
156 155
@@ -217,7 +216,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
217 return 0; 216 return 0;
218} 217}
219 218
220static int red_init(struct Qdisc* sch, struct nlattr *opt) 219static int red_init(struct Qdisc *sch, struct nlattr *opt)
221{ 220{
222 struct red_sched_data *q = qdisc_priv(sch); 221 struct red_sched_data *q = qdisc_priv(sch);
223 222
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index edea8cefec6c..4cff44235773 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -21,6 +21,7 @@
21#include <linux/skbuff.h> 21#include <linux/skbuff.h>
22#include <linux/jhash.h> 22#include <linux/jhash.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/vmalloc.h>
24#include <net/ip.h> 25#include <net/ip.h>
25#include <net/netlink.h> 26#include <net/netlink.h>
26#include <net/pkt_sched.h> 27#include <net/pkt_sched.h>
@@ -76,7 +77,8 @@
76#define SFQ_DEPTH 128 /* max number of packets per flow */ 77#define SFQ_DEPTH 128 /* max number of packets per flow */
77#define SFQ_SLOTS 128 /* max number of flows */ 78#define SFQ_SLOTS 128 /* max number of flows */
78#define SFQ_EMPTY_SLOT 255 79#define SFQ_EMPTY_SLOT 255
79#define SFQ_HASH_DIVISOR 1024 80#define SFQ_DEFAULT_HASH_DIVISOR 1024
81
80/* We use 16 bits to store allot, and want to handle packets up to 64K 82/* We use 16 bits to store allot, and want to handle packets up to 64K
81 * Scale allot by 8 (1<<3) so that no overflow occurs. 83 * Scale allot by 8 (1<<3) so that no overflow occurs.
82 */ 84 */
@@ -92,8 +94,7 @@ typedef unsigned char sfq_index;
92 * while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1] 94 * while following values [SFQ_SLOTS ... SFQ_SLOTS + SFQ_DEPTH - 1]
93 * are 'pointers' to dep[] array 95 * are 'pointers' to dep[] array
94 */ 96 */
95struct sfq_head 97struct sfq_head {
96{
97 sfq_index next; 98 sfq_index next;
98 sfq_index prev; 99 sfq_index prev;
99}; 100};
@@ -108,13 +109,12 @@ struct sfq_slot {
 	short		allot; /* credit for this slot */
 };
 
-struct sfq_sched_data
-{
+struct sfq_sched_data {
 /* Parameters */
 	int		perturb_period;
-	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
+	unsigned int	quantum;	/* Allotment per round: MUST BE >= MTU */
 	int		limit;
-
+	unsigned int	divisor;	/* number of slots in hash table */
 /* Variables */
 	struct tcf_proto *filter_list;
 	struct timer_list perturb_timer;
@@ -122,7 +122,7 @@ struct sfq_sched_data
 	sfq_index	cur_depth;	/* depth of longest slot */
 	unsigned short	scaled_quantum; /* SFQ_ALLOT_SIZE(quantum) */
 	struct sfq_slot *tail;		/* current slot in round */
-	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
+	sfq_index	*ht;		/* Hash table (divisor slots) */
 	struct sfq_slot	slots[SFQ_SLOTS];
 	struct sfq_head	dep[SFQ_DEPTH];	/* Linked list of slots, indexed by depth */
 };
@@ -137,12 +137,12 @@ static inline struct sfq_head *sfq_dep_head(struct sfq_sched_data *q, sfq_index
 	return &q->dep[val - SFQ_SLOTS];
 }
 
-static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
+static unsigned int sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
 {
-	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
+	return jhash_2words(h, h1, q->perturbation) & (q->divisor - 1);
 }
 
-static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
+static unsigned int sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 {
 	u32 h, h2;
 
@@ -157,13 +157,13 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 		iph = ip_hdr(skb);
 		h = (__force u32)iph->daddr;
 		h2 = (__force u32)iph->saddr ^ iph->protocol;
-		if (iph->frag_off & htons(IP_MF|IP_OFFSET))
+		if (iph->frag_off & htons(IP_MF | IP_OFFSET))
 			break;
 		poff = proto_ports_offset(iph->protocol);
 		if (poff >= 0 &&
 		    pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
 			iph = ip_hdr(skb);
-			h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff);
+			h2 ^= *(u32 *)((void *)iph + iph->ihl * 4 + poff);
 		}
 		break;
 	}
@@ -181,7 +181,7 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
 		if (poff >= 0 &&
 		    pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
 			iph = ipv6_hdr(skb);
-			h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff);
+			h2 ^= *(u32 *)((void *)iph + sizeof(*iph) + poff);
 		}
 		break;
 	}
@@ -203,7 +203,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 
 	if (TC_H_MAJ(skb->priority) == sch->handle &&
 	    TC_H_MIN(skb->priority) > 0 &&
-	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
+	    TC_H_MIN(skb->priority) <= q->divisor)
 		return TC_H_MIN(skb->priority);
 
 	if (!q->filter_list)
@@ -221,7 +221,7 @@ static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
 			return 0;
 		}
 #endif
-		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
+		if (TC_H_MIN(res.classid) <= q->divisor)
 			return TC_H_MIN(res.classid);
 	}
 	return 0;
@@ -497,7 +497,11 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 	q->perturb_period = ctl->perturb_period * HZ;
 	if (ctl->limit)
 		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
-
+	if (ctl->divisor) {
+		if (!is_power_of_2(ctl->divisor) || ctl->divisor > 65536)
+			return -EINVAL;
+		q->divisor = ctl->divisor;
+	}
 	qlen = sch->q.qlen;
 	while (sch->q.qlen > q->limit)
 		sfq_drop(sch);
@@ -515,15 +519,13 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
 static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct sfq_sched_data *q = qdisc_priv(sch);
+	size_t sz;
 	int i;
 
 	q->perturb_timer.function = sfq_perturbation;
 	q->perturb_timer.data = (unsigned long)sch;
 	init_timer_deferrable(&q->perturb_timer);
 
-	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
-		q->ht[i] = SFQ_EMPTY_SLOT;
-
 	for (i = 0; i < SFQ_DEPTH; i++) {
 		q->dep[i].next = i + SFQ_SLOTS;
 		q->dep[i].prev = i + SFQ_SLOTS;
@@ -532,6 +534,7 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->limit = SFQ_DEPTH - 1;
 	q->cur_depth = 0;
 	q->tail = NULL;
+	q->divisor = SFQ_DEFAULT_HASH_DIVISOR;
 	if (opt == NULL) {
 		q->quantum = psched_mtu(qdisc_dev(sch));
 		q->scaled_quantum = SFQ_ALLOT_SIZE(q->quantum);
@@ -543,10 +546,23 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
 			return err;
 	}
 
+	sz = sizeof(q->ht[0]) * q->divisor;
+	q->ht = kmalloc(sz, GFP_KERNEL);
+	if (!q->ht && sz > PAGE_SIZE)
+		q->ht = vmalloc(sz);
+	if (!q->ht)
+		return -ENOMEM;
+	for (i = 0; i < q->divisor; i++)
+		q->ht[i] = SFQ_EMPTY_SLOT;
+
 	for (i = 0; i < SFQ_SLOTS; i++) {
 		slot_queue_init(&q->slots[i]);
 		sfq_link(q, i);
 	}
+	if (q->limit >= 1)
+		sch->flags |= TCQ_F_CAN_BYPASS;
+	else
+		sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
 }
 
@@ -557,6 +573,10 @@ static void sfq_destroy(struct Qdisc *sch)
 	tcf_destroy_chain(&q->filter_list);
 	q->perturb_period = 0;
 	del_timer_sync(&q->perturb_timer);
+	if (is_vmalloc_addr(q->ht))
+		vfree(q->ht);
+	else
+		kfree(q->ht);
 }
 
 static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -569,7 +589,7 @@ static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	opt.perturb_period = q->perturb_period / HZ;
 
 	opt.limit = q->limit;
-	opt.divisor = SFQ_HASH_DIVISOR;
+	opt.divisor = q->divisor;
 	opt.flows = q->limit;
 
 	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
@@ -594,6 +614,8 @@ static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
 static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
 			      u32 classid)
 {
+	/* we cannot bypass queue discipline anymore */
+	sch->flags &= ~TCQ_F_CAN_BYPASS;
 	return 0;
 }
 
@@ -647,7 +669,7 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 	if (arg->stop)
 		return;
 
-	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
+	for (i = 0; i < q->divisor; i++) {
 		if (q->ht[i] == SFQ_EMPTY_SLOT ||
 		    arg->count < arg->skip) {
 			arg->count++;
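
The sch_sfq hunks above replace the compile-time SFQ_HASH_DIVISOR table with one sized from q->divisor at init time, which for large divisors may be too big for a reliable kmalloc(). A minimal sketch of the allocation pattern the patch adopts (the helper names below are illustrative only; sch_sfq.c open-codes this in sfq_init() and sfq_destroy()): try kmalloc() first, fall back to vmalloc() for sizes above a page, and free with whichever routine matches how the memory was obtained.

#include <linux/mm.h>		/* is_vmalloc_addr(), PAGE_SIZE */
#include <linux/slab.h>		/* kmalloc(), kfree() */
#include <linux/vmalloc.h>	/* vmalloc(), vfree() */

/* Illustrative helpers, not part of the patch. */
static void *sfq_table_alloc(size_t sz)
{
	void *p = kmalloc(sz, GFP_KERNEL);

	/* A large physically contiguous allocation may fail; retry with
	 * virtually contiguous memory instead.
	 */
	if (!p && sz > PAGE_SIZE)
		p = vmalloc(sz);
	return p;
}

static void sfq_table_free(void *p)
{
	if (is_vmalloc_addr(p))
		vfree(p);
	else
		kfree(p);
}

With the divisor no longer baked in, a configuration such as "tc qdisc add dev eth0 root sfq divisor 2048" should size the hash table accordingly; per sfq_change() above, the value must be a power of two no larger than 65536 or the change is rejected with -EINVAL.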
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index e93165820c3f..1dcfb5223a86 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -97,8 +97,7 @@
 	changed the limit is not effective anymore.
 */
 
-struct tbf_sched_data
-{
+struct tbf_sched_data {
 /* Parameters */
 	u32		limit;		/* Maximal length of backlog: bytes */
 	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
@@ -115,10 +114,10 @@ struct tbf_sched_data
 	struct qdisc_watchdog watchdog;	/* Watchdog timer */
 };
 
-#define L2T(q,L)   qdisc_l2t((q)->R_tab,L)
-#define L2T_P(q,L) qdisc_l2t((q)->P_tab,L)
+#define L2T(q, L)   qdisc_l2t((q)->R_tab, L)
+#define L2T_P(q, L) qdisc_l2t((q)->P_tab, L)
 
-static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct tbf_sched_data *q = qdisc_priv(sch);
 	int ret;
@@ -137,7 +136,7 @@ static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 	return NET_XMIT_SUCCESS;
 }
 
-static unsigned int tbf_drop(struct Qdisc* sch)
+static unsigned int tbf_drop(struct Qdisc *sch)
 {
 	struct tbf_sched_data *q = qdisc_priv(sch);
 	unsigned int len = 0;
@@ -149,7 +148,7 @@ static unsigned int tbf_drop(struct Qdisc* sch)
 	return len;
 }
 
-static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
+static struct sk_buff *tbf_dequeue(struct Qdisc *sch)
 {
 	struct tbf_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
@@ -185,7 +184,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
 			q->tokens = toks;
 			q->ptokens = ptoks;
 			sch->q.qlen--;
-			sch->flags &= ~TCQ_F_THROTTLED;
+			qdisc_unthrottled(sch);
 			qdisc_bstats_update(sch, skb);
 			return skb;
 		}
@@ -209,7 +208,7 @@ static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
 	return NULL;
 }
 
-static void tbf_reset(struct Qdisc* sch)
+static void tbf_reset(struct Qdisc *sch)
 {
 	struct tbf_sched_data *q = qdisc_priv(sch);
 
@@ -227,7 +226,7 @@ static const struct nla_policy tbf_policy[TCA_TBF_MAX + 1] = {
 	[TCA_TBF_PTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
 };
 
-static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
+static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
 {
 	int err;
 	struct tbf_sched_data *q = qdisc_priv(sch);
@@ -236,7 +235,7 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
 	struct qdisc_rate_table *rtab = NULL;
 	struct qdisc_rate_table *ptab = NULL;
 	struct Qdisc *child = NULL;
-	int max_size,n;
+	int max_size, n;
 
 	err = nla_parse_nested(tb, TCA_TBF_PTAB, opt, tbf_policy);
 	if (err < 0)
@@ -259,15 +258,18 @@ static int tbf_change(struct Qdisc* sch, struct nlattr *opt)
 	}
 
 	for (n = 0; n < 256; n++)
-		if (rtab->data[n] > qopt->buffer) break;
-	max_size = (n << qopt->rate.cell_log)-1;
+		if (rtab->data[n] > qopt->buffer)
+			break;
+	max_size = (n << qopt->rate.cell_log) - 1;
 	if (ptab) {
 		int size;
 
 		for (n = 0; n < 256; n++)
-			if (ptab->data[n] > qopt->mtu) break;
-		size = (n << qopt->peakrate.cell_log)-1;
-		if (size < max_size) max_size = size;
+			if (ptab->data[n] > qopt->mtu)
+				break;
+		size = (n << qopt->peakrate.cell_log) - 1;
+		if (size < max_size)
+			max_size = size;
 	}
 	if (max_size < 0)
 		goto done;
@@ -310,7 +312,7 @@ done:
 	return err;
 }
 
-static int tbf_init(struct Qdisc* sch, struct nlattr *opt)
+static int tbf_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct tbf_sched_data *q = qdisc_priv(sch);
 
@@ -422,8 +424,7 @@ static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
 	}
 }
 
-static const struct Qdisc_class_ops tbf_class_ops =
-{
+static const struct Qdisc_class_ops tbf_class_ops = {
 	.graft		=	tbf_graft,
 	.leaf		=	tbf_leaf,
 	.get		=	tbf_get,
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index d84e7329660f..45cd30098e34 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -53,8 +53,7 @@
    which will not break load balancing, though native slave
    traffic will have the highest priority.  */
 
-struct teql_master
-{
+struct teql_master {
 	struct Qdisc_ops qops;
 	struct net_device *dev;
 	struct Qdisc *slaves;
@@ -65,22 +64,21 @@ struct teql_master
 	unsigned long	tx_dropped;
 };
 
-struct teql_sched_data
-{
+struct teql_sched_data {
 	struct Qdisc *next;
 	struct teql_master *m;
 	struct neighbour *ncache;
 	struct sk_buff_head q;
 };
 
-#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next)
+#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next)
 
-#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT)
+#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)
 
 /* "teql*" qdisc routines */
 
 static int
-teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
+teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 {
 	struct net_device *dev = qdisc_dev(sch);
 	struct teql_sched_data *q = qdisc_priv(sch);
@@ -96,7 +94,7 @@ teql_enqueue(struct sk_buff *skb, struct Qdisc* sch)
 }
 
 static struct sk_buff *
-teql_dequeue(struct Qdisc* sch)
+teql_dequeue(struct Qdisc *sch)
 {
 	struct teql_sched_data *dat = qdisc_priv(sch);
 	struct netdev_queue *dat_queue;
@@ -118,13 +116,13 @@ teql_dequeue(struct Qdisc* sch)
 }
 
 static struct sk_buff *
-teql_peek(struct Qdisc* sch)
+teql_peek(struct Qdisc *sch)
 {
 	/* teql is meant to be used as root qdisc */
 	return NULL;
 }
 
-static __inline__ void
+static inline void
 teql_neigh_release(struct neighbour *n)
 {
 	if (n)
@@ -132,7 +130,7 @@ teql_neigh_release(struct neighbour *n)
 }
 
 static void
-teql_reset(struct Qdisc* sch)
+teql_reset(struct Qdisc *sch)
 {
 	struct teql_sched_data *dat = qdisc_priv(sch);
 
@@ -142,13 +140,14 @@ teql_reset(struct Qdisc* sch)
 }
 
 static void
-teql_destroy(struct Qdisc* sch)
+teql_destroy(struct Qdisc *sch)
 {
 	struct Qdisc *q, *prev;
 	struct teql_sched_data *dat = qdisc_priv(sch);
 	struct teql_master *master = dat->m;
 
-	if ((prev = master->slaves) != NULL) {
+	prev = master->slaves;
+	if (prev) {
 		do {
 			q = NEXT_SLAVE(prev);
 			if (q == sch) {
@@ -180,7 +179,7 @@ teql_destroy(struct Qdisc* sch)
 static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
 {
 	struct net_device *dev = qdisc_dev(sch);
-	struct teql_master *m = (struct teql_master*)sch->ops;
+	struct teql_master *m = (struct teql_master *)sch->ops;
 	struct teql_sched_data *q = qdisc_priv(sch);
 
 	if (dev->hard_header_len > m->dev->hard_header_len)
@@ -291,7 +290,8 @@ restart:
 	nores = 0;
 	busy = 0;
 
-	if ((q = start) == NULL)
+	q = start;
+	if (!q)
 		goto drop;
 
 	do {
@@ -356,10 +356,10 @@ drop:
 
 static int teql_master_open(struct net_device *dev)
 {
-	struct Qdisc * q;
+	struct Qdisc *q;
 	struct teql_master *m = netdev_priv(dev);
 	int mtu = 0xFFFE;
-	unsigned flags = IFF_NOARP|IFF_MULTICAST;
+	unsigned int flags = IFF_NOARP | IFF_MULTICAST;
 
 	if (m->slaves == NULL)
 		return -EUNATCH;
@@ -427,7 +427,7 @@ static int teql_master_mtu(struct net_device *dev, int new_mtu)
 		do {
 			if (new_mtu > qdisc_dev(q)->mtu)
 				return -EINVAL;
-		} while ((q=NEXT_SLAVE(q)) != m->slaves);
+		} while ((q = NEXT_SLAVE(q)) != m->slaves);
 	}
 
 	dev->mtu = new_mtu;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index dd419d286204..d8d98d5b508c 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1475,6 +1475,12 @@ restart:
 		goto out_free;
 	}
 
+	if (sk_filter(other, skb) < 0) {
+		/* Toss the packet but do not return any error to the sender */
+		err = len;
+		goto out_free;
+	}
+
 	unix_state_lock(other);
 	err = -EPERM;
 	if (!unix_may_send(sk, other))
@@ -1978,36 +1984,38 @@ static int unix_shutdown(struct socket *sock, int mode)
 
 	mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
 
-	if (mode) {
-		unix_state_lock(sk);
-		sk->sk_shutdown |= mode;
-		other = unix_peer(sk);
-		if (other)
-			sock_hold(other);
-		unix_state_unlock(sk);
-		sk->sk_state_change(sk);
-
-		if (other &&
-			(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
-
-			int peer_mode = 0;
-
-			if (mode&RCV_SHUTDOWN)
-				peer_mode |= SEND_SHUTDOWN;
-			if (mode&SEND_SHUTDOWN)
-				peer_mode |= RCV_SHUTDOWN;
-			unix_state_lock(other);
-			other->sk_shutdown |= peer_mode;
-			unix_state_unlock(other);
-			other->sk_state_change(other);
-			if (peer_mode == SHUTDOWN_MASK)
-				sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
-			else if (peer_mode & RCV_SHUTDOWN)
-				sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
-		}
-		if (other)
-			sock_put(other);
-	}
+	if (!mode)
+		return 0;
+
+	unix_state_lock(sk);
+	sk->sk_shutdown |= mode;
+	other = unix_peer(sk);
+	if (other)
+		sock_hold(other);
+	unix_state_unlock(sk);
+	sk->sk_state_change(sk);
+
+	if (other &&
+		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
+
+		int peer_mode = 0;
+
+		if (mode&RCV_SHUTDOWN)
+			peer_mode |= SEND_SHUTDOWN;
+		if (mode&SEND_SHUTDOWN)
+			peer_mode |= RCV_SHUTDOWN;
+		unix_state_lock(other);
+		other->sk_shutdown |= peer_mode;
+		unix_state_unlock(other);
+		other->sk_state_change(other);
+		if (peer_mode == SHUTDOWN_MASK)
+			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
+		else if (peer_mode & RCV_SHUTDOWN)
+			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
+	}
+	if (other)
+		sock_put(other);
+
 	return 0;
 }
 
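
The first af_unix hunk above runs each datagram through sk_filter() against the receiving socket, so an attached socket filter can veto delivery; the sender still sees success (err = len). A small userspace sketch of what this enables (illustrative only, not part of the patch): attach a classic BPF program with SO_ATTACH_FILTER to an AF_UNIX datagram socket; the single-instruction program below returns 0 accepted bytes, so every incoming datagram is silently dropped.

#include <stdio.h>
#include <linux/filter.h>	/* struct sock_filter, struct sock_fprog, BPF_* */
#include <sys/socket.h>
#include <sys/un.h>

int main(void)
{
	/* BPF_RET | BPF_K with k = 0: accept 0 bytes, i.e. drop the packet. */
	struct sock_filter insns[] = {
		{ BPF_RET | BPF_K, 0, 0, 0 },
	};
	struct sock_fprog prog = {
		.len	= sizeof(insns) / sizeof(insns[0]),
		.filter	= insns,
	};
	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);

	if (fd < 0)
		return 1;
	if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog)) < 0) {
		perror("SO_ATTACH_FILTER");
		return 1;
	}
	printf("filter attached: datagrams sent to this socket are dropped by sk_filter()\n");
	return 0;
}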
diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c
index 74944a2dd436..788a12c1eb5d 100644
--- a/net/wanrouter/wanmain.c
+++ b/net/wanrouter/wanmain.c
@@ -59,8 +59,6 @@
 #include <asm/uaccess.h>	/* copy_to/from_user */
 #include <linux/init.h>		/* __initfunc et al. */
 
-#define KMEM_SAFETYZONE 8
-
 #define DEV_TO_SLAVE(dev)	(*((struct net_device **)netdev_priv(dev)))
 
 /*