diff options
Diffstat (limited to 'net')
132 files changed, 5268 insertions, 1846 deletions
diff --git a/net/Kconfig b/net/Kconfig index 4193cdcd3ae7..c6cec5aa5486 100644 --- a/net/Kconfig +++ b/net/Kconfig | |||
@@ -66,6 +66,13 @@ source "net/ipv6/Kconfig" | |||
66 | 66 | ||
67 | endif # if INET | 67 | endif # if INET |
68 | 68 | ||
69 | config NETWORK_SECMARK | ||
70 | bool "Security Marking" | ||
71 | help | ||
72 | This enables security marking of network packets, similar | ||
73 | to nfmark, but designated for security purposes. | ||
74 | If you are unsure how to answer this question, answer N. | ||
75 | |||
69 | menuconfig NETFILTER | 76 | menuconfig NETFILTER |
70 | bool "Network packet filtering (replaces ipchains)" | 77 | bool "Network packet filtering (replaces ipchains)" |
71 | ---help--- | 78 | ---help--- |
@@ -215,6 +222,21 @@ config NET_PKTGEN | |||
215 | To compile this code as a module, choose M here: the | 222 | To compile this code as a module, choose M here: the |
216 | module will be called pktgen. | 223 | module will be called pktgen. |
217 | 224 | ||
225 | config NET_TCPPROBE | ||
226 | tristate "TCP connection probing" | ||
227 | depends on INET && EXPERIMENTAL && PROC_FS && KPROBES | ||
228 | ---help--- | ||
229 | This module allows for capturing the changes to TCP connection | ||
230 | state in response to incoming packets. It is used for debugging | ||
231 | TCP congestion avoidance modules. If you don't understand | ||
232 | what was just said, you don't need it: say N. | ||
233 | |||
234 | Documentation on how to use TCP connection probing can be found | ||
235 | at http://linux-net.osdl.org/index.php/TcpProbe | ||
236 | |||
237 | To compile this code as a module, choose M here: the | ||
238 | module will be called tcp_probe. | ||
239 | |||
218 | endmenu | 240 | endmenu |
219 | 241 | ||
220 | endmenu | 242 | endmenu |
diff --git a/net/atm/clip.c b/net/atm/clip.c index 72d852982664..f92f9c94d2c7 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c | |||
@@ -98,7 +98,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc) | |||
98 | printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); | 98 | printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); |
99 | return; | 99 | return; |
100 | } | 100 | } |
101 | spin_lock_bh(&entry->neigh->dev->xmit_lock); /* block clip_start_xmit() */ | 101 | netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */ |
102 | entry->neigh->used = jiffies; | 102 | entry->neigh->used = jiffies; |
103 | for (walk = &entry->vccs; *walk; walk = &(*walk)->next) | 103 | for (walk = &entry->vccs; *walk; walk = &(*walk)->next) |
104 | if (*walk == clip_vcc) { | 104 | if (*walk == clip_vcc) { |
@@ -122,7 +122,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc) | |||
122 | printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc " | 122 | printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc " |
123 | "0x%p)\n", entry, clip_vcc); | 123 | "0x%p)\n", entry, clip_vcc); |
124 | out: | 124 | out: |
125 | spin_unlock_bh(&entry->neigh->dev->xmit_lock); | 125 | netif_tx_unlock_bh(entry->neigh->dev); |
126 | } | 126 | } |
127 | 127 | ||
128 | /* The neighbour entry n->lock is held. */ | 128 | /* The neighbour entry n->lock is held. */ |
diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 59556e40e143..f444c12cde5a 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile | |||
@@ -6,7 +6,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o | |||
6 | 6 | ||
7 | bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ | 7 | bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ |
8 | br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \ | 8 | br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \ |
9 | br_stp_if.o br_stp_timer.o | 9 | br_stp_if.o br_stp_timer.o br_netlink.o |
10 | 10 | ||
11 | bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o | 11 | bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o |
12 | 12 | ||
diff --git a/net/bridge/br.c b/net/bridge/br.c index 12da21afb9ca..654401ceb2db 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c | |||
@@ -30,36 +30,46 @@ static struct llc_sap *br_stp_sap; | |||
30 | 30 | ||
31 | static int __init br_init(void) | 31 | static int __init br_init(void) |
32 | { | 32 | { |
33 | int err; | ||
34 | |||
33 | br_stp_sap = llc_sap_open(LLC_SAP_BSPAN, br_stp_rcv); | 35 | br_stp_sap = llc_sap_open(LLC_SAP_BSPAN, br_stp_rcv); |
34 | if (!br_stp_sap) { | 36 | if (!br_stp_sap) { |
35 | printk(KERN_ERR "bridge: can't register sap for STP\n"); | 37 | printk(KERN_ERR "bridge: can't register sap for STP\n"); |
36 | return -EBUSY; | 38 | return -EADDRINUSE; |
37 | } | 39 | } |
38 | 40 | ||
39 | br_fdb_init(); | 41 | br_fdb_init(); |
40 | 42 | ||
41 | #ifdef CONFIG_BRIDGE_NETFILTER | 43 | err = br_netfilter_init(); |
42 | if (br_netfilter_init()) | 44 | if (err) |
43 | return 1; | 45 | goto err_out1; |
44 | #endif | 46 | |
47 | err = register_netdevice_notifier(&br_device_notifier); | ||
48 | if (err) | ||
49 | goto err_out2; | ||
50 | |||
51 | br_netlink_init(); | ||
45 | brioctl_set(br_ioctl_deviceless_stub); | 52 | brioctl_set(br_ioctl_deviceless_stub); |
46 | br_handle_frame_hook = br_handle_frame; | 53 | br_handle_frame_hook = br_handle_frame; |
47 | 54 | ||
48 | br_fdb_get_hook = br_fdb_get; | 55 | br_fdb_get_hook = br_fdb_get; |
49 | br_fdb_put_hook = br_fdb_put; | 56 | br_fdb_put_hook = br_fdb_put; |
50 | 57 | ||
51 | register_netdevice_notifier(&br_device_notifier); | ||
52 | |||
53 | return 0; | 58 | return 0; |
59 | |||
60 | err_out2: | ||
61 | br_netfilter_fini(); | ||
62 | err_out1: | ||
63 | llc_sap_put(br_stp_sap); | ||
64 | return err; | ||
54 | } | 65 | } |
55 | 66 | ||
56 | static void __exit br_deinit(void) | 67 | static void __exit br_deinit(void) |
57 | { | 68 | { |
58 | rcu_assign_pointer(br_stp_sap->rcv_func, NULL); | 69 | rcu_assign_pointer(br_stp_sap->rcv_func, NULL); |
59 | 70 | ||
60 | #ifdef CONFIG_BRIDGE_NETFILTER | 71 | br_netlink_fini(); |
61 | br_netfilter_fini(); | 72 | br_netfilter_fini(); |
62 | #endif | ||
63 | unregister_netdevice_notifier(&br_device_notifier); | 73 | unregister_netdevice_notifier(&br_device_notifier); |
64 | brioctl_set(NULL); | 74 | brioctl_set(NULL); |
65 | 75 | ||
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0c88a2ac32c1..2afdc7c0736c 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c | |||
@@ -145,9 +145,9 @@ static int br_set_tx_csum(struct net_device *dev, u32 data) | |||
145 | struct net_bridge *br = netdev_priv(dev); | 145 | struct net_bridge *br = netdev_priv(dev); |
146 | 146 | ||
147 | if (data) | 147 | if (data) |
148 | br->feature_mask |= NETIF_F_IP_CSUM; | 148 | br->feature_mask |= NETIF_F_NO_CSUM; |
149 | else | 149 | else |
150 | br->feature_mask &= ~NETIF_F_IP_CSUM; | 150 | br->feature_mask &= ~NETIF_F_ALL_CSUM; |
151 | 151 | ||
152 | br_features_recompute(br); | 152 | br_features_recompute(br); |
153 | return 0; | 153 | return 0; |
@@ -185,5 +185,5 @@ void br_dev_setup(struct net_device *dev) | |||
185 | dev->priv_flags = IFF_EBRIDGE; | 185 | dev->priv_flags = IFF_EBRIDGE; |
186 | 186 | ||
187 | dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | 187 | dev->features = NETIF_F_SG | NETIF_F_FRAGLIST |
188 | | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM; | 188 | | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_NO_CSUM; |
189 | } | 189 | } |
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 56f3aa47e758..0dca027ceb80 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c | |||
@@ -20,14 +20,11 @@ | |||
20 | #include <linux/netfilter_bridge.h> | 20 | #include <linux/netfilter_bridge.h> |
21 | #include "br_private.h" | 21 | #include "br_private.h" |
22 | 22 | ||
23 | /* Don't forward packets to the originating port or if forwarding is disabled */ | ||
23 | static inline int should_deliver(const struct net_bridge_port *p, | 24 | static inline int should_deliver(const struct net_bridge_port *p, |
24 | const struct sk_buff *skb) | 25 | const struct sk_buff *skb) |
25 | { | 26 | { |
26 | if (skb->dev == p->dev || | 27 | return (skb->dev != p->dev && p->state == BR_STATE_FORWARDING); |
27 | p->state != BR_STATE_FORWARDING) | ||
28 | return 0; | ||
29 | |||
30 | return 1; | ||
31 | } | 28 | } |
32 | 29 | ||
33 | static inline unsigned packet_length(const struct sk_buff *skb) | 30 | static inline unsigned packet_length(const struct sk_buff *skb) |
@@ -55,10 +52,9 @@ int br_dev_queue_push_xmit(struct sk_buff *skb) | |||
55 | 52 | ||
56 | int br_forward_finish(struct sk_buff *skb) | 53 | int br_forward_finish(struct sk_buff *skb) |
57 | { | 54 | { |
58 | NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, | 55 | return NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, |
59 | br_dev_queue_push_xmit); | 56 | br_dev_queue_push_xmit); |
60 | 57 | ||
61 | return 0; | ||
62 | } | 58 | } |
63 | 59 | ||
64 | static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) | 60 | static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) |
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f5d47bf4f967..fdec773f5b52 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c | |||
@@ -372,12 +372,17 @@ void br_features_recompute(struct net_bridge *br) | |||
372 | struct net_bridge_port *p; | 372 | struct net_bridge_port *p; |
373 | unsigned long features, checksum; | 373 | unsigned long features, checksum; |
374 | 374 | ||
375 | features = br->feature_mask &~ NETIF_F_IP_CSUM; | 375 | checksum = br->feature_mask & NETIF_F_ALL_CSUM ? NETIF_F_NO_CSUM : 0; |
376 | checksum = br->feature_mask & NETIF_F_IP_CSUM; | 376 | features = br->feature_mask & ~NETIF_F_ALL_CSUM; |
377 | 377 | ||
378 | list_for_each_entry(p, &br->port_list, list) { | 378 | list_for_each_entry(p, &br->port_list, list) { |
379 | if (!(p->dev->features | 379 | if (checksum & NETIF_F_NO_CSUM && |
380 | & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM))) | 380 | !(p->dev->features & NETIF_F_NO_CSUM)) |
381 | checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM; | ||
382 | if (checksum & NETIF_F_HW_CSUM && | ||
383 | !(p->dev->features & NETIF_F_HW_CSUM)) | ||
384 | checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM; | ||
385 | if (!(p->dev->features & NETIF_F_IP_CSUM)) | ||
381 | checksum = 0; | 386 | checksum = 0; |
382 | features &= p->dev->features; | 387 | features &= p->dev->features; |
383 | } | 388 | } |
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 3da9264449f7..3e41f9d6d51c 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c | |||
@@ -407,12 +407,8 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook, | |||
407 | if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { | 407 | if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { |
408 | if (pkt_len + sizeof(struct ipv6hdr) > skb->len) | 408 | if (pkt_len + sizeof(struct ipv6hdr) > skb->len) |
409 | goto inhdr_error; | 409 | goto inhdr_error; |
410 | if (pkt_len + sizeof(struct ipv6hdr) < skb->len) { | 410 | if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) |
411 | if (__pskb_trim(skb, pkt_len + sizeof(struct ipv6hdr))) | 411 | goto inhdr_error; |
412 | goto inhdr_error; | ||
413 | if (skb->ip_summed == CHECKSUM_HW) | ||
414 | skb->ip_summed = CHECKSUM_NONE; | ||
415 | } | ||
416 | } | 412 | } |
417 | if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) | 413 | if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) |
418 | goto inhdr_error; | 414 | goto inhdr_error; |
@@ -495,11 +491,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, | |||
495 | if (skb->len < len || len < 4 * iph->ihl) | 491 | if (skb->len < len || len < 4 * iph->ihl) |
496 | goto inhdr_error; | 492 | goto inhdr_error; |
497 | 493 | ||
498 | if (skb->len > len) { | 494 | pskb_trim_rcsum(skb, len); |
499 | __pskb_trim(skb, len); | ||
500 | if (skb->ip_summed == CHECKSUM_HW) | ||
501 | skb->ip_summed = CHECKSUM_NONE; | ||
502 | } | ||
503 | 495 | ||
504 | nf_bridge_put(skb->nf_bridge); | 496 | nf_bridge_put(skb->nf_bridge); |
505 | if (!nf_bridge_alloc(skb)) | 497 | if (!nf_bridge_alloc(skb)) |
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c new file mode 100644 index 000000000000..881d7d1a732a --- /dev/null +++ b/net/bridge/br_netlink.c | |||
@@ -0,0 +1,199 @@ | |||
1 | /* | ||
2 | * Bridge netlink control interface | ||
3 | * | ||
4 | * Authors: | ||
5 | * Stephen Hemminger <shemminger@osdl.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; either version | ||
10 | * 2 of the License, or (at your option) any later version. | ||
11 | */ | ||
12 | |||
13 | #include <linux/kernel.h> | ||
14 | #include <linux/rtnetlink.h> | ||
15 | #include "br_private.h" | ||
16 | |||
17 | /* | ||
18 | * Create one netlink message for one interface | ||
19 | * Contains port and master info as well as carrier and bridge state. | ||
20 | */ | ||
21 | static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port, | ||
22 | u32 pid, u32 seq, int event, unsigned int flags) | ||
23 | { | ||
24 | const struct net_bridge *br = port->br; | ||
25 | const struct net_device *dev = port->dev; | ||
26 | struct ifinfomsg *r; | ||
27 | struct nlmsghdr *nlh; | ||
28 | unsigned char *b = skb->tail; | ||
29 | u32 mtu = dev->mtu; | ||
30 | u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; | ||
31 | u8 portstate = port->state; | ||
32 | |||
33 | pr_debug("br_fill_info event %d port %s master %s\n", | ||
34 | event, dev->name, br->dev->name); | ||
35 | |||
36 | nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); | ||
37 | r = NLMSG_DATA(nlh); | ||
38 | r->ifi_family = AF_BRIDGE; | ||
39 | r->__ifi_pad = 0; | ||
40 | r->ifi_type = dev->type; | ||
41 | r->ifi_index = dev->ifindex; | ||
42 | r->ifi_flags = dev_get_flags(dev); | ||
43 | r->ifi_change = 0; | ||
44 | |||
45 | RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); | ||
46 | |||
47 | RTA_PUT(skb, IFLA_MASTER, sizeof(int), &br->dev->ifindex); | ||
48 | |||
49 | if (dev->addr_len) | ||
50 | RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); | ||
51 | |||
52 | RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu); | ||
53 | if (dev->ifindex != dev->iflink) | ||
54 | RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink); | ||
55 | |||
56 | |||
57 | RTA_PUT(skb, IFLA_OPERSTATE, sizeof(operstate), &operstate); | ||
58 | |||
59 | if (event == RTM_NEWLINK) | ||
60 | RTA_PUT(skb, IFLA_PROTINFO, sizeof(portstate), &portstate); | ||
61 | |||
62 | nlh->nlmsg_len = skb->tail - b; | ||
63 | |||
64 | return skb->len; | ||
65 | |||
66 | nlmsg_failure: | ||
67 | rtattr_failure: | ||
68 | |||
69 | skb_trim(skb, b - skb->data); | ||
70 | return -EINVAL; | ||
71 | } | ||
72 | |||
73 | /* | ||
74 | * Notify listeners of a change in port information | ||
75 | */ | ||
76 | void br_ifinfo_notify(int event, struct net_bridge_port *port) | ||
77 | { | ||
78 | struct sk_buff *skb; | ||
79 | int err = -ENOMEM; | ||
80 | |||
81 | pr_debug("bridge notify event=%d\n", event); | ||
82 | skb = alloc_skb(NLMSG_SPACE(sizeof(struct ifinfomsg) + 128), | ||
83 | GFP_ATOMIC); | ||
84 | if (!skb) | ||
85 | goto err_out; | ||
86 | |||
87 | err = br_fill_ifinfo(skb, port, current->pid, 0, event, 0); | ||
88 | if (err) | ||
89 | goto err_kfree; | ||
90 | |||
91 | NETLINK_CB(skb).dst_group = RTNLGRP_LINK; | ||
92 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC); | ||
93 | return; | ||
94 | |||
95 | err_kfree: | ||
96 | kfree_skb(skb); | ||
97 | err_out: | ||
98 | netlink_set_err(rtnl, 0, RTNLGRP_LINK, err); | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * Dump information about all ports, in response to GETLINK | ||
103 | */ | ||
104 | static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) | ||
105 | { | ||
106 | struct net_device *dev; | ||
107 | int idx; | ||
108 | int s_idx = cb->args[0]; | ||
109 | int err = 0; | ||
110 | |||
111 | read_lock(&dev_base_lock); | ||
112 | for (dev = dev_base, idx = 0; dev; dev = dev->next) { | ||
113 | struct net_bridge_port *p = dev->br_port; | ||
114 | |||
115 | /* not a bridge port */ | ||
116 | if (!p) | ||
117 | continue; | ||
118 | |||
119 | if (idx < s_idx) | ||
120 | continue; | ||
121 | |||
122 | err = br_fill_ifinfo(skb, p, NETLINK_CB(cb->skb).pid, | ||
123 | cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI); | ||
124 | if (err <= 0) | ||
125 | break; | ||
126 | ++idx; | ||
127 | } | ||
128 | read_unlock(&dev_base_lock); | ||
129 | |||
130 | cb->args[0] = idx; | ||
131 | |||
132 | return skb->len; | ||
133 | } | ||
134 | |||
135 | /* | ||
136 | * Change state of port (i.e. from forwarding to blocking, etc.) | ||
137 | * Used by spanning tree in user space. | ||
138 | */ | ||
139 | static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) | ||
140 | { | ||
141 | struct rtattr **rta = arg; | ||
142 | struct ifinfomsg *ifm = NLMSG_DATA(nlh); | ||
143 | struct net_device *dev; | ||
144 | struct net_bridge_port *p; | ||
145 | u8 new_state; | ||
146 | |||
147 | if (ifm->ifi_family != AF_BRIDGE) | ||
148 | return -EPFNOSUPPORT; | ||
149 | |||
150 | /* Must pass valid state as PROTINFO */ | ||
151 | if (rta[IFLA_PROTINFO-1]) { | ||
152 | u8 *pstate = RTA_DATA(rta[IFLA_PROTINFO-1]); | ||
153 | new_state = *pstate; | ||
154 | } else | ||
155 | return -EINVAL; | ||
156 | |||
157 | if (new_state > BR_STATE_BLOCKING) | ||
158 | return -EINVAL; | ||
159 | |||
160 | /* Find bridge port */ | ||
161 | dev = __dev_get_by_index(ifm->ifi_index); | ||
162 | if (!dev) | ||
163 | return -ENODEV; | ||
164 | |||
165 | p = dev->br_port; | ||
166 | if (!p) | ||
167 | return -EINVAL; | ||
168 | |||
169 | /* if kernel STP is running, don't allow changes */ | ||
170 | if (p->br->stp_enabled) | ||
171 | return -EBUSY; | ||
172 | |||
173 | if (!netif_running(dev)) | ||
174 | return -ENETDOWN; | ||
175 | |||
176 | if (!netif_carrier_ok(dev) && new_state != BR_STATE_DISABLED) | ||
177 | return -ENETDOWN; | ||
178 | |||
179 | p->state = new_state; | ||
180 | br_log_state(p); | ||
181 | return 0; | ||
182 | } | ||
183 | |||
184 | |||
185 | static struct rtnetlink_link bridge_rtnetlink_table[RTM_NR_MSGTYPES] = { | ||
186 | [RTM_GETLINK - RTM_BASE] = { .dumpit = br_dump_ifinfo, }, | ||
187 | [RTM_SETLINK - RTM_BASE] = { .doit = br_rtm_setlink, }, | ||
188 | }; | ||
189 | |||
190 | void __init br_netlink_init(void) | ||
191 | { | ||
192 | rtnetlink_links[PF_BRIDGE] = bridge_rtnetlink_table; | ||
193 | } | ||
194 | |||
195 | void __exit br_netlink_fini(void) | ||
196 | { | ||
197 | rtnetlink_links[PF_BRIDGE] = NULL; | ||
198 | } | ||
199 | |||
diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index a43a9c1d50d7..20278494e4da 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c | |||
@@ -14,6 +14,7 @@ | |||
14 | */ | 14 | */ |
15 | 15 | ||
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/rtnetlink.h> | ||
17 | 18 | ||
18 | #include "br_private.h" | 19 | #include "br_private.h" |
19 | 20 | ||
@@ -49,6 +50,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v | |||
49 | 50 | ||
50 | case NETDEV_CHANGEADDR: | 51 | case NETDEV_CHANGEADDR: |
51 | br_fdb_changeaddr(p, dev->dev_addr); | 52 | br_fdb_changeaddr(p, dev->dev_addr); |
53 | br_ifinfo_notify(RTM_NEWLINK, p); | ||
52 | br_stp_recalculate_bridge_id(br); | 54 | br_stp_recalculate_bridge_id(br); |
53 | break; | 55 | break; |
54 | 56 | ||
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 86ecea7ed372..c491fb2f280e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h | |||
@@ -29,7 +29,7 @@ | |||
29 | 29 | ||
30 | #define BR_PORT_DEBOUNCE (HZ/10) | 30 | #define BR_PORT_DEBOUNCE (HZ/10) |
31 | 31 | ||
32 | #define BR_VERSION "2.1" | 32 | #define BR_VERSION "2.2" |
33 | 33 | ||
34 | typedef struct bridge_id bridge_id; | 34 | typedef struct bridge_id bridge_id; |
35 | typedef struct mac_addr mac_addr; | 35 | typedef struct mac_addr mac_addr; |
@@ -192,8 +192,13 @@ extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); | |||
192 | extern int br_ioctl_deviceless_stub(unsigned int cmd, void __user *arg); | 192 | extern int br_ioctl_deviceless_stub(unsigned int cmd, void __user *arg); |
193 | 193 | ||
194 | /* br_netfilter.c */ | 194 | /* br_netfilter.c */ |
195 | #ifdef CONFIG_BRIDGE_NETFILTER | ||
195 | extern int br_netfilter_init(void); | 196 | extern int br_netfilter_init(void); |
196 | extern void br_netfilter_fini(void); | 197 | extern void br_netfilter_fini(void); |
198 | #else | ||
199 | #define br_netfilter_init() (0) | ||
200 | #define br_netfilter_fini() do { } while(0) | ||
201 | #endif | ||
197 | 202 | ||
198 | /* br_stp.c */ | 203 | /* br_stp.c */ |
199 | extern void br_log_state(const struct net_bridge_port *p); | 204 | extern void br_log_state(const struct net_bridge_port *p); |
@@ -232,6 +237,11 @@ extern struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, | |||
232 | extern void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); | 237 | extern void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); |
233 | 238 | ||
234 | 239 | ||
240 | /* br_netlink.c */ | ||
241 | extern void br_netlink_init(void); | ||
242 | extern void br_netlink_fini(void); | ||
243 | extern void br_ifinfo_notify(int event, struct net_bridge_port *port); | ||
244 | |||
235 | #ifdef CONFIG_SYSFS | 245 | #ifdef CONFIG_SYSFS |
236 | /* br_sysfs_if.c */ | 246 | /* br_sysfs_if.c */ |
237 | extern struct sysfs_ops brport_sysfs_ops; | 247 | extern struct sysfs_ops brport_sysfs_ops; |
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 23dea1422c9a..14cd025079af 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/kernel.h> | 16 | #include <linux/kernel.h> |
17 | #include <linux/smp_lock.h> | 17 | #include <linux/smp_lock.h> |
18 | #include <linux/etherdevice.h> | 18 | #include <linux/etherdevice.h> |
19 | #include <linux/rtnetlink.h> | ||
19 | 20 | ||
20 | #include "br_private.h" | 21 | #include "br_private.h" |
21 | #include "br_private_stp.h" | 22 | #include "br_private_stp.h" |
@@ -86,6 +87,7 @@ void br_stp_disable_bridge(struct net_bridge *br) | |||
86 | void br_stp_enable_port(struct net_bridge_port *p) | 87 | void br_stp_enable_port(struct net_bridge_port *p) |
87 | { | 88 | { |
88 | br_init_port(p); | 89 | br_init_port(p); |
90 | br_ifinfo_notify(RTM_NEWLINK, p); | ||
89 | br_port_state_selection(p->br); | 91 | br_port_state_selection(p->br); |
90 | } | 92 | } |
91 | 93 | ||
@@ -99,6 +101,8 @@ void br_stp_disable_port(struct net_bridge_port *p) | |||
99 | printk(KERN_INFO "%s: port %i(%s) entering %s state\n", | 101 | printk(KERN_INFO "%s: port %i(%s) entering %s state\n", |
100 | br->dev->name, p->port_no, p->dev->name, "disabled"); | 102 | br->dev->name, p->port_no, p->dev->name, "disabled"); |
101 | 103 | ||
104 | br_ifinfo_notify(RTM_DELLINK, p); | ||
105 | |||
102 | wasroot = br_is_root_bridge(br); | 106 | wasroot = br_is_root_bridge(br); |
103 | br_become_designated_port(p); | 107 | br_become_designated_port(p); |
104 | p->state = BR_STATE_DISABLED; | 108 | p->state = BR_STATE_DISABLED; |
diff --git a/net/core/Makefile b/net/core/Makefile index 79fe12cced27..e9bd2467d5a9 100644 --- a/net/core/Makefile +++ b/net/core/Makefile | |||
@@ -16,3 +16,4 @@ obj-$(CONFIG_NET_DIVERT) += dv.o | |||
16 | obj-$(CONFIG_NET_PKTGEN) += pktgen.o | 16 | obj-$(CONFIG_NET_PKTGEN) += pktgen.o |
17 | obj-$(CONFIG_WIRELESS_EXT) += wireless.o | 17 | obj-$(CONFIG_WIRELESS_EXT) += wireless.o |
18 | obj-$(CONFIG_NETPOLL) += netpoll.o | 18 | obj-$(CONFIG_NETPOLL) += netpoll.o |
19 | obj-$(CONFIG_NET_DMA) += user_dma.o | ||
diff --git a/net/core/dev.c b/net/core/dev.c index 4fba549caf29..ab39fe17cb58 100644 --- a/net/core/dev.c +++ b/net/core/dev.c | |||
@@ -115,6 +115,7 @@ | |||
115 | #include <net/iw_handler.h> | 115 | #include <net/iw_handler.h> |
116 | #include <asm/current.h> | 116 | #include <asm/current.h> |
117 | #include <linux/audit.h> | 117 | #include <linux/audit.h> |
118 | #include <linux/dmaengine.h> | ||
118 | 119 | ||
119 | /* | 120 | /* |
120 | * The list of packet types we will receive (as opposed to discard) | 121 | * The list of packet types we will receive (as opposed to discard) |
@@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock); | |||
148 | static struct list_head ptype_base[16]; /* 16 way hashed list */ | 149 | static struct list_head ptype_base[16]; /* 16 way hashed list */ |
149 | static struct list_head ptype_all; /* Taps */ | 150 | static struct list_head ptype_all; /* Taps */ |
150 | 151 | ||
152 | #ifdef CONFIG_NET_DMA | ||
153 | static struct dma_client *net_dma_client; | ||
154 | static unsigned int net_dma_count; | ||
155 | static spinlock_t net_dma_event_lock; | ||
156 | #endif | ||
157 | |||
151 | /* | 158 | /* |
152 | * The @dev_base list is protected by @dev_base_lock and the rtnl | 159 | * The @dev_base list is protected by @dev_base_lock and the rtnl |
153 | * semaphore. | 160 | * semaphore. |
@@ -1215,75 +1222,15 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) | |||
1215 | #define illegal_highdma(dev, skb) (0) | 1222 | #define illegal_highdma(dev, skb) (0) |
1216 | #endif | 1223 | #endif |
1217 | 1224 | ||
1218 | /* Keep head the same: replace data */ | ||
1219 | int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask) | ||
1220 | { | ||
1221 | unsigned int size; | ||
1222 | u8 *data; | ||
1223 | long offset; | ||
1224 | struct skb_shared_info *ninfo; | ||
1225 | int headerlen = skb->data - skb->head; | ||
1226 | int expand = (skb->tail + skb->data_len) - skb->end; | ||
1227 | |||
1228 | if (skb_shared(skb)) | ||
1229 | BUG(); | ||
1230 | |||
1231 | if (expand <= 0) | ||
1232 | expand = 0; | ||
1233 | |||
1234 | size = skb->end - skb->head + expand; | ||
1235 | size = SKB_DATA_ALIGN(size); | ||
1236 | data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); | ||
1237 | if (!data) | ||
1238 | return -ENOMEM; | ||
1239 | |||
1240 | /* Copy entire thing */ | ||
1241 | if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len)) | ||
1242 | BUG(); | ||
1243 | |||
1244 | /* Set up shinfo */ | ||
1245 | ninfo = (struct skb_shared_info*)(data + size); | ||
1246 | atomic_set(&ninfo->dataref, 1); | ||
1247 | ninfo->tso_size = skb_shinfo(skb)->tso_size; | ||
1248 | ninfo->tso_segs = skb_shinfo(skb)->tso_segs; | ||
1249 | ninfo->nr_frags = 0; | ||
1250 | ninfo->frag_list = NULL; | ||
1251 | |||
1252 | /* Offset between the two in bytes */ | ||
1253 | offset = data - skb->head; | ||
1254 | |||
1255 | /* Free old data. */ | ||
1256 | skb_release_data(skb); | ||
1257 | |||
1258 | skb->head = data; | ||
1259 | skb->end = data + size; | ||
1260 | |||
1261 | /* Set up new pointers */ | ||
1262 | skb->h.raw += offset; | ||
1263 | skb->nh.raw += offset; | ||
1264 | skb->mac.raw += offset; | ||
1265 | skb->tail += offset; | ||
1266 | skb->data += offset; | ||
1267 | |||
1268 | /* We are no longer a clone, even if we were. */ | ||
1269 | skb->cloned = 0; | ||
1270 | |||
1271 | skb->tail += skb->data_len; | ||
1272 | skb->data_len = 0; | ||
1273 | return 0; | ||
1274 | } | ||
1275 | |||
1276 | #define HARD_TX_LOCK(dev, cpu) { \ | 1225 | #define HARD_TX_LOCK(dev, cpu) { \ |
1277 | if ((dev->features & NETIF_F_LLTX) == 0) { \ | 1226 | if ((dev->features & NETIF_F_LLTX) == 0) { \ |
1278 | spin_lock(&dev->xmit_lock); \ | 1227 | netif_tx_lock(dev); \ |
1279 | dev->xmit_lock_owner = cpu; \ | ||
1280 | } \ | 1228 | } \ |
1281 | } | 1229 | } |
1282 | 1230 | ||
1283 | #define HARD_TX_UNLOCK(dev) { \ | 1231 | #define HARD_TX_UNLOCK(dev) { \ |
1284 | if ((dev->features & NETIF_F_LLTX) == 0) { \ | 1232 | if ((dev->features & NETIF_F_LLTX) == 0) { \ |
1285 | dev->xmit_lock_owner = -1; \ | 1233 | netif_tx_unlock(dev); \ |
1286 | spin_unlock(&dev->xmit_lock); \ | ||
1287 | } \ | 1234 | } \ |
1288 | } | 1235 | } |
1289 | 1236 | ||
@@ -1321,7 +1268,7 @@ int dev_queue_xmit(struct sk_buff *skb) | |||
1321 | 1268 | ||
1322 | if (skb_shinfo(skb)->frag_list && | 1269 | if (skb_shinfo(skb)->frag_list && |
1323 | !(dev->features & NETIF_F_FRAGLIST) && | 1270 | !(dev->features & NETIF_F_FRAGLIST) && |
1324 | __skb_linearize(skb, GFP_ATOMIC)) | 1271 | __skb_linearize(skb)) |
1325 | goto out_kfree_skb; | 1272 | goto out_kfree_skb; |
1326 | 1273 | ||
1327 | /* Fragmented skb is linearized if device does not support SG, | 1274 | /* Fragmented skb is linearized if device does not support SG, |
@@ -1330,14 +1277,14 @@ int dev_queue_xmit(struct sk_buff *skb) | |||
1330 | */ | 1277 | */ |
1331 | if (skb_shinfo(skb)->nr_frags && | 1278 | if (skb_shinfo(skb)->nr_frags && |
1332 | (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && | 1279 | (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && |
1333 | __skb_linearize(skb, GFP_ATOMIC)) | 1280 | __skb_linearize(skb)) |
1334 | goto out_kfree_skb; | 1281 | goto out_kfree_skb; |
1335 | 1282 | ||
1336 | /* If packet is not checksummed and device does not support | 1283 | /* If packet is not checksummed and device does not support |
1337 | * checksumming for this protocol, complete checksumming here. | 1284 | * checksumming for this protocol, complete checksumming here. |
1338 | */ | 1285 | */ |
1339 | if (skb->ip_summed == CHECKSUM_HW && | 1286 | if (skb->ip_summed == CHECKSUM_HW && |
1340 | (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) && | 1287 | (!(dev->features & NETIF_F_GEN_CSUM) && |
1341 | (!(dev->features & NETIF_F_IP_CSUM) || | 1288 | (!(dev->features & NETIF_F_IP_CSUM) || |
1342 | skb->protocol != htons(ETH_P_IP)))) | 1289 | skb->protocol != htons(ETH_P_IP)))) |
1343 | if (skb_checksum_help(skb, 0)) | 1290 | if (skb_checksum_help(skb, 0)) |
@@ -1382,8 +1329,8 @@ int dev_queue_xmit(struct sk_buff *skb) | |||
1382 | /* The device has no queue. Common case for software devices: | 1329 | /* The device has no queue. Common case for software devices: |
1383 | loopback, all the sorts of tunnels... | 1330 | loopback, all the sorts of tunnels... |
1384 | 1331 | ||
1385 | Really, it is unlikely that xmit_lock protection is necessary here. | 1332 | Really, it is unlikely that netif_tx_lock protection is necessary |
1386 | (f.e. loopback and IP tunnels are clean ignoring statistics | 1333 | here. (f.e. loopback and IP tunnels are clean ignoring statistics |
1387 | counters.) | 1334 | counters.) |
1388 | However, it is possible, that they rely on protection | 1335 | However, it is possible, that they rely on protection |
1389 | made by us here. | 1336 | made by us here. |
@@ -1846,6 +1793,19 @@ static void net_rx_action(struct softirq_action *h) | |||
1846 | } | 1793 | } |
1847 | } | 1794 | } |
1848 | out: | 1795 | out: |
1796 | #ifdef CONFIG_NET_DMA | ||
1797 | /* | ||
1798 | * There may not be any more sk_buffs coming right now, so push | ||
1799 | * any pending DMA copies to hardware | ||
1800 | */ | ||
1801 | if (net_dma_client) { | ||
1802 | struct dma_chan *chan; | ||
1803 | rcu_read_lock(); | ||
1804 | list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node) | ||
1805 | dma_async_memcpy_issue_pending(chan); | ||
1806 | rcu_read_unlock(); | ||
1807 | } | ||
1808 | #endif | ||
1849 | local_irq_enable(); | 1809 | local_irq_enable(); |
1850 | return; | 1810 | return; |
1851 | 1811 | ||
@@ -2785,7 +2745,7 @@ int register_netdevice(struct net_device *dev) | |||
2785 | BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); | 2745 | BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); |
2786 | 2746 | ||
2787 | spin_lock_init(&dev->queue_lock); | 2747 | spin_lock_init(&dev->queue_lock); |
2788 | spin_lock_init(&dev->xmit_lock); | 2748 | spin_lock_init(&dev->_xmit_lock); |
2789 | dev->xmit_lock_owner = -1; | 2749 | dev->xmit_lock_owner = -1; |
2790 | #ifdef CONFIG_NET_CLS_ACT | 2750 | #ifdef CONFIG_NET_CLS_ACT |
2791 | spin_lock_init(&dev->ingress_lock); | 2751 | spin_lock_init(&dev->ingress_lock); |
@@ -2829,9 +2789,7 @@ int register_netdevice(struct net_device *dev) | |||
2829 | 2789 | ||
2830 | /* Fix illegal SG+CSUM combinations. */ | 2790 | /* Fix illegal SG+CSUM combinations. */ |
2831 | if ((dev->features & NETIF_F_SG) && | 2791 | if ((dev->features & NETIF_F_SG) && |
2832 | !(dev->features & (NETIF_F_IP_CSUM | | 2792 | !(dev->features & NETIF_F_ALL_CSUM)) { |
2833 | NETIF_F_NO_CSUM | | ||
2834 | NETIF_F_HW_CSUM))) { | ||
2835 | printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", | 2793 | printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", |
2836 | dev->name); | 2794 | dev->name); |
2837 | dev->features &= ~NETIF_F_SG; | 2795 | dev->features &= ~NETIF_F_SG; |
@@ -3300,6 +3258,88 @@ static int dev_cpu_callback(struct notifier_block *nfb, | |||
3300 | } | 3258 | } |
3301 | #endif /* CONFIG_HOTPLUG_CPU */ | 3259 | #endif /* CONFIG_HOTPLUG_CPU */ |
3302 | 3260 | ||
3261 | #ifdef CONFIG_NET_DMA | ||
3262 | /** | ||
3263 | * net_dma_rebalance - | ||
3264 | * This is called when the number of channels allocated to the net_dma_client | ||
3265 | * changes. The net_dma_client tries to have one DMA channel per CPU. | ||
3266 | */ | ||
3267 | static void net_dma_rebalance(void) | ||
3268 | { | ||
3269 | unsigned int cpu, i, n; | ||
3270 | struct dma_chan *chan; | ||
3271 | |||
3272 | lock_cpu_hotplug(); | ||
3273 | |||
3274 | if (net_dma_count == 0) { | ||
3275 | for_each_online_cpu(cpu) | ||
3276 | rcu_assign_pointer(per_cpu(softnet_data.net_dma, cpu), NULL); | ||
3277 | unlock_cpu_hotplug(); | ||
3278 | return; | ||
3279 | } | ||
3280 | |||
3281 | i = 0; | ||
3282 | cpu = first_cpu(cpu_online_map); | ||
3283 | |||
3284 | rcu_read_lock(); | ||
3285 | list_for_each_entry(chan, &net_dma_client->channels, client_node) { | ||
3286 | n = ((num_online_cpus() / net_dma_count) | ||
3287 | + (i < (num_online_cpus() % net_dma_count) ? 1 : 0)); | ||
3288 | |||
3289 | while(n) { | ||
3290 | per_cpu(softnet_data.net_dma, cpu) = chan; | ||
3291 | cpu = next_cpu(cpu, cpu_online_map); | ||
3292 | n--; | ||
3293 | } | ||
3294 | i++; | ||
3295 | } | ||
3296 | rcu_read_unlock(); | ||
3297 | |||
3298 | unlock_cpu_hotplug(); | ||
3299 | } | ||
3300 | |||
3301 | /** | ||
3302 | * netdev_dma_event - event callback for the net_dma_client | ||
3303 | * @client: should always be net_dma_client | ||
3304 | * @chan: | ||
3305 | * @event: | ||
3306 | */ | ||
3307 | static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan, | ||
3308 | enum dma_event event) | ||
3309 | { | ||
3310 | spin_lock(&net_dma_event_lock); | ||
3311 | switch (event) { | ||
3312 | case DMA_RESOURCE_ADDED: | ||
3313 | net_dma_count++; | ||
3314 | net_dma_rebalance(); | ||
3315 | break; | ||
3316 | case DMA_RESOURCE_REMOVED: | ||
3317 | net_dma_count--; | ||
3318 | net_dma_rebalance(); | ||
3319 | break; | ||
3320 | default: | ||
3321 | break; | ||
3322 | } | ||
3323 | spin_unlock(&net_dma_event_lock); | ||
3324 | } | ||
3325 | |||
3326 | /** | ||
3327 | * netdev_dma_register - register the networking subsystem as a DMA client | ||
3328 | */ | ||
3329 | static int __init netdev_dma_register(void) | ||
3330 | { | ||
3331 | spin_lock_init(&net_dma_event_lock); | ||
3332 | net_dma_client = dma_async_client_register(netdev_dma_event); | ||
3333 | if (net_dma_client == NULL) | ||
3334 | return -ENOMEM; | ||
3335 | |||
3336 | dma_async_client_chan_request(net_dma_client, num_online_cpus()); | ||
3337 | return 0; | ||
3338 | } | ||
3339 | |||
3340 | #else | ||
3341 | static int __init netdev_dma_register(void) { return -ENODEV; } | ||
3342 | #endif /* CONFIG_NET_DMA */ | ||
3303 | 3343 | ||
3304 | /* | 3344 | /* |
3305 | * Initialize the DEV module. At boot time this walks the device list and | 3345 | * Initialize the DEV module. At boot time this walks the device list and |
@@ -3353,6 +3393,8 @@ static int __init net_dev_init(void) | |||
3353 | atomic_set(&queue->backlog_dev.refcnt, 1); | 3393 | atomic_set(&queue->backlog_dev.refcnt, 1); |
3354 | } | 3394 | } |
3355 | 3395 | ||
3396 | netdev_dma_register(); | ||
3397 | |||
3356 | dev_boot_phase = 0; | 3398 | dev_boot_phase = 0; |
3357 | 3399 | ||
3358 | open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); | 3400 | open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); |
@@ -3371,7 +3413,6 @@ subsys_initcall(net_dev_init); | |||
3371 | EXPORT_SYMBOL(__dev_get_by_index); | 3413 | EXPORT_SYMBOL(__dev_get_by_index); |
3372 | EXPORT_SYMBOL(__dev_get_by_name); | 3414 | EXPORT_SYMBOL(__dev_get_by_name); |
3373 | EXPORT_SYMBOL(__dev_remove_pack); | 3415 | EXPORT_SYMBOL(__dev_remove_pack); |
3374 | EXPORT_SYMBOL(__skb_linearize); | ||
3375 | EXPORT_SYMBOL(dev_valid_name); | 3416 | EXPORT_SYMBOL(dev_valid_name); |
3376 | EXPORT_SYMBOL(dev_add_pack); | 3417 | EXPORT_SYMBOL(dev_add_pack); |
3377 | EXPORT_SYMBOL(dev_alloc_name); | 3418 | EXPORT_SYMBOL(dev_alloc_name); |
diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index 05d60850840e..c57d887da2ef 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c | |||
@@ -62,7 +62,7 @@ | |||
62 | * Device mc lists are changed by bh at least if IPv6 is enabled, | 62 | * Device mc lists are changed by bh at least if IPv6 is enabled, |
63 | * so that it must be bh protected. | 63 | * so that it must be bh protected. |
64 | * | 64 | * |
65 | * We block accesses to device mc filters with dev->xmit_lock. | 65 | * We block accesses to device mc filters with netif_tx_lock. |
66 | */ | 66 | */ |
67 | 67 | ||
68 | /* | 68 | /* |
@@ -93,9 +93,9 @@ static void __dev_mc_upload(struct net_device *dev) | |||
93 | 93 | ||
94 | void dev_mc_upload(struct net_device *dev) | 94 | void dev_mc_upload(struct net_device *dev) |
95 | { | 95 | { |
96 | spin_lock_bh(&dev->xmit_lock); | 96 | netif_tx_lock_bh(dev); |
97 | __dev_mc_upload(dev); | 97 | __dev_mc_upload(dev); |
98 | spin_unlock_bh(&dev->xmit_lock); | 98 | netif_tx_unlock_bh(dev); |
99 | } | 99 | } |
100 | 100 | ||
101 | /* | 101 | /* |
@@ -107,7 +107,7 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) | |||
107 | int err = 0; | 107 | int err = 0; |
108 | struct dev_mc_list *dmi, **dmip; | 108 | struct dev_mc_list *dmi, **dmip; |
109 | 109 | ||
110 | spin_lock_bh(&dev->xmit_lock); | 110 | netif_tx_lock_bh(dev); |
111 | 111 | ||
112 | for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) { | 112 | for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) { |
113 | /* | 113 | /* |
@@ -139,13 +139,13 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) | |||
139 | */ | 139 | */ |
140 | __dev_mc_upload(dev); | 140 | __dev_mc_upload(dev); |
141 | 141 | ||
142 | spin_unlock_bh(&dev->xmit_lock); | 142 | netif_tx_unlock_bh(dev); |
143 | return 0; | 143 | return 0; |
144 | } | 144 | } |
145 | } | 145 | } |
146 | err = -ENOENT; | 146 | err = -ENOENT; |
147 | done: | 147 | done: |
148 | spin_unlock_bh(&dev->xmit_lock); | 148 | netif_tx_unlock_bh(dev); |
149 | return err; | 149 | return err; |
150 | } | 150 | } |
151 | 151 | ||
@@ -160,7 +160,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) | |||
160 | 160 | ||
161 | dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC); | 161 | dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC); |
162 | 162 | ||
163 | spin_lock_bh(&dev->xmit_lock); | 163 | netif_tx_lock_bh(dev); |
164 | for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) { | 164 | for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) { |
165 | if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 && | 165 | if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 && |
166 | dmi->dmi_addrlen == alen) { | 166 | dmi->dmi_addrlen == alen) { |
@@ -176,7 +176,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) | |||
176 | } | 176 | } |
177 | 177 | ||
178 | if ((dmi = dmi1) == NULL) { | 178 | if ((dmi = dmi1) == NULL) { |
179 | spin_unlock_bh(&dev->xmit_lock); | 179 | netif_tx_unlock_bh(dev); |
180 | return -ENOMEM; | 180 | return -ENOMEM; |
181 | } | 181 | } |
182 | memcpy(dmi->dmi_addr, addr, alen); | 182 | memcpy(dmi->dmi_addr, addr, alen); |
@@ -189,11 +189,11 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) | |||
189 | 189 | ||
190 | __dev_mc_upload(dev); | 190 | __dev_mc_upload(dev); |
191 | 191 | ||
192 | spin_unlock_bh(&dev->xmit_lock); | 192 | netif_tx_unlock_bh(dev); |
193 | return 0; | 193 | return 0; |
194 | 194 | ||
195 | done: | 195 | done: |
196 | spin_unlock_bh(&dev->xmit_lock); | 196 | netif_tx_unlock_bh(dev); |
197 | kfree(dmi1); | 197 | kfree(dmi1); |
198 | return err; | 198 | return err; |
199 | } | 199 | } |
@@ -204,7 +204,7 @@ done: | |||
204 | 204 | ||
205 | void dev_mc_discard(struct net_device *dev) | 205 | void dev_mc_discard(struct net_device *dev) |
206 | { | 206 | { |
207 | spin_lock_bh(&dev->xmit_lock); | 207 | netif_tx_lock_bh(dev); |
208 | 208 | ||
209 | while (dev->mc_list != NULL) { | 209 | while (dev->mc_list != NULL) { |
210 | struct dev_mc_list *tmp = dev->mc_list; | 210 | struct dev_mc_list *tmp = dev->mc_list; |
@@ -215,7 +215,7 @@ void dev_mc_discard(struct net_device *dev) | |||
215 | } | 215 | } |
216 | dev->mc_count = 0; | 216 | dev->mc_count = 0; |
217 | 217 | ||
218 | spin_unlock_bh(&dev->xmit_lock); | 218 | netif_tx_unlock_bh(dev); |
219 | } | 219 | } |
220 | 220 | ||
221 | #ifdef CONFIG_PROC_FS | 221 | #ifdef CONFIG_PROC_FS |
@@ -250,7 +250,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) | |||
250 | struct dev_mc_list *m; | 250 | struct dev_mc_list *m; |
251 | struct net_device *dev = v; | 251 | struct net_device *dev = v; |
252 | 252 | ||
253 | spin_lock_bh(&dev->xmit_lock); | 253 | netif_tx_lock_bh(dev); |
254 | for (m = dev->mc_list; m; m = m->next) { | 254 | for (m = dev->mc_list; m; m = m->next) { |
255 | int i; | 255 | int i; |
256 | 256 | ||
@@ -262,7 +262,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) | |||
262 | 262 | ||
263 | seq_putc(seq, '\n'); | 263 | seq_putc(seq, '\n'); |
264 | } | 264 | } |
265 | spin_unlock_bh(&dev->xmit_lock); | 265 | netif_tx_unlock_bh(dev); |
266 | return 0; | 266 | return 0; |
267 | } | 267 | } |
268 | 268 | ||
diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e6f76106a99b..33ce7ed6afc6 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c | |||
@@ -30,7 +30,7 @@ u32 ethtool_op_get_link(struct net_device *dev) | |||
30 | 30 | ||
31 | u32 ethtool_op_get_tx_csum(struct net_device *dev) | 31 | u32 ethtool_op_get_tx_csum(struct net_device *dev) |
32 | { | 32 | { |
33 | return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0; | 33 | return (dev->features & NETIF_F_ALL_CSUM) != 0; |
34 | } | 34 | } |
35 | 35 | ||
36 | int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) | 36 | int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) |
@@ -551,9 +551,7 @@ static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) | |||
551 | return -EFAULT; | 551 | return -EFAULT; |
552 | 552 | ||
553 | if (edata.data && | 553 | if (edata.data && |
554 | !(dev->features & (NETIF_F_IP_CSUM | | 554 | !(dev->features & NETIF_F_ALL_CSUM)) |
555 | NETIF_F_NO_CSUM | | ||
556 | NETIF_F_HW_CSUM))) | ||
557 | return -EINVAL; | 555 | return -EINVAL; |
558 | 556 | ||
559 | return __ethtool_set_sg(dev, edata.data); | 557 | return __ethtool_set_sg(dev, edata.data); |
@@ -591,7 +589,7 @@ static int ethtool_set_tso(struct net_device *dev, char __user *useraddr) | |||
591 | 589 | ||
592 | static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr) | 590 | static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr) |
593 | { | 591 | { |
594 | struct ethtool_value edata = { ETHTOOL_GTSO }; | 592 | struct ethtool_value edata = { ETHTOOL_GUFO }; |
595 | 593 | ||
596 | if (!dev->ethtool_ops->get_ufo) | 594 | if (!dev->ethtool_ops->get_ufo) |
597 | return -EOPNOTSUPP; | 595 | return -EOPNOTSUPP; |
@@ -600,6 +598,7 @@ static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr) | |||
600 | return -EFAULT; | 598 | return -EFAULT; |
601 | return 0; | 599 | return 0; |
602 | } | 600 | } |
601 | |||
603 | static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) | 602 | static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) |
604 | { | 603 | { |
605 | struct ethtool_value edata; | 604 | struct ethtool_value edata; |
diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e8e05cebd95a..9cb781830380 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c | |||
@@ -273,24 +273,21 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) | |||
273 | 273 | ||
274 | do { | 274 | do { |
275 | npinfo->tries--; | 275 | npinfo->tries--; |
276 | spin_lock(&np->dev->xmit_lock); | 276 | netif_tx_lock(np->dev); |
277 | np->dev->xmit_lock_owner = smp_processor_id(); | ||
278 | 277 | ||
279 | /* | 278 | /* |
280 | * network drivers do not expect to be called if the queue is | 279 | * network drivers do not expect to be called if the queue is |
281 | * stopped. | 280 | * stopped. |
282 | */ | 281 | */ |
283 | if (netif_queue_stopped(np->dev)) { | 282 | if (netif_queue_stopped(np->dev)) { |
284 | np->dev->xmit_lock_owner = -1; | 283 | netif_tx_unlock(np->dev); |
285 | spin_unlock(&np->dev->xmit_lock); | ||
286 | netpoll_poll(np); | 284 | netpoll_poll(np); |
287 | udelay(50); | 285 | udelay(50); |
288 | continue; | 286 | continue; |
289 | } | 287 | } |
290 | 288 | ||
291 | status = np->dev->hard_start_xmit(skb, np->dev); | 289 | status = np->dev->hard_start_xmit(skb, np->dev); |
292 | np->dev->xmit_lock_owner = -1; | 290 | netif_tx_unlock(np->dev); |
293 | spin_unlock(&np->dev->xmit_lock); | ||
294 | 291 | ||
295 | /* success */ | 292 | /* success */ |
296 | if(!status) { | 293 | if(!status) { |
diff --git a/net/core/pktgen.c b/net/core/pktgen.c index c23e9c06ee23..67ed14ddabd2 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c | |||
@@ -2897,7 +2897,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) | |||
2897 | } | 2897 | } |
2898 | } | 2898 | } |
2899 | 2899 | ||
2900 | spin_lock_bh(&odev->xmit_lock); | 2900 | netif_tx_lock_bh(odev); |
2901 | if (!netif_queue_stopped(odev)) { | 2901 | if (!netif_queue_stopped(odev)) { |
2902 | 2902 | ||
2903 | atomic_inc(&(pkt_dev->skb->users)); | 2903 | atomic_inc(&(pkt_dev->skb->users)); |
@@ -2942,7 +2942,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) | |||
2942 | pkt_dev->next_tx_ns = 0; | 2942 | pkt_dev->next_tx_ns = 0; |
2943 | } | 2943 | } |
2944 | 2944 | ||
2945 | spin_unlock_bh(&odev->xmit_lock); | 2945 | netif_tx_unlock_bh(odev); |
2946 | 2946 | ||
2947 | /* If pkt_dev->count is zero, then run forever */ | 2947 | /* If pkt_dev->count is zero, then run forever */ |
2948 | if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { | 2948 | if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { |
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fb3770f9c094..bb7210f4005e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c | |||
@@ -464,7 +464,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) | |||
464 | n->tc_verd = CLR_TC_MUNGED(n->tc_verd); | 464 | n->tc_verd = CLR_TC_MUNGED(n->tc_verd); |
465 | C(input_dev); | 465 | C(input_dev); |
466 | #endif | 466 | #endif |
467 | 467 | skb_copy_secmark(n, skb); | |
468 | #endif | 468 | #endif |
469 | C(truesize); | 469 | C(truesize); |
470 | atomic_set(&n->users, 1); | 470 | atomic_set(&n->users, 1); |
@@ -526,6 +526,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) | |||
526 | #endif | 526 | #endif |
527 | new->tc_index = old->tc_index; | 527 | new->tc_index = old->tc_index; |
528 | #endif | 528 | #endif |
529 | skb_copy_secmark(new, old); | ||
529 | atomic_set(&new->users, 1); | 530 | atomic_set(&new->users, 1); |
530 | skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; | 531 | skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; |
531 | skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; | 532 | skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; |
@@ -800,12 +801,10 @@ struct sk_buff *skb_pad(struct sk_buff *skb, int pad) | |||
800 | return nskb; | 801 | return nskb; |
801 | } | 802 | } |
802 | 803 | ||
803 | /* Trims skb to length len. It can change skb pointers, if "realloc" is 1. | 804 | /* Trims skb to length len. It can change skb pointers. |
804 | * If realloc==0 and trimming is impossible without change of data, | ||
805 | * it is BUG(). | ||
806 | */ | 805 | */ |
807 | 806 | ||
808 | int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) | 807 | int ___pskb_trim(struct sk_buff *skb, unsigned int len) |
809 | { | 808 | { |
810 | int offset = skb_headlen(skb); | 809 | int offset = skb_headlen(skb); |
811 | int nfrags = skb_shinfo(skb)->nr_frags; | 810 | int nfrags = skb_shinfo(skb)->nr_frags; |
@@ -815,7 +814,6 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) | |||
815 | int end = offset + skb_shinfo(skb)->frags[i].size; | 814 | int end = offset + skb_shinfo(skb)->frags[i].size; |
816 | if (end > len) { | 815 | if (end > len) { |
817 | if (skb_cloned(skb)) { | 816 | if (skb_cloned(skb)) { |
818 | BUG_ON(!realloc); | ||
819 | if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) | 817 | if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) |
820 | return -ENOMEM; | 818 | return -ENOMEM; |
821 | } | 819 | } |
diff --git a/net/core/sock.c b/net/core/sock.c index ed2afdb9ea2d..5d820c376653 100644 --- a/net/core/sock.c +++ b/net/core/sock.c | |||
@@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) | |||
832 | atomic_set(&newsk->sk_omem_alloc, 0); | 832 | atomic_set(&newsk->sk_omem_alloc, 0); |
833 | skb_queue_head_init(&newsk->sk_receive_queue); | 833 | skb_queue_head_init(&newsk->sk_receive_queue); |
834 | skb_queue_head_init(&newsk->sk_write_queue); | 834 | skb_queue_head_init(&newsk->sk_write_queue); |
835 | #ifdef CONFIG_NET_DMA | ||
836 | skb_queue_head_init(&newsk->sk_async_wait_queue); | ||
837 | #endif | ||
835 | 838 | ||
836 | rwlock_init(&newsk->sk_dst_lock); | 839 | rwlock_init(&newsk->sk_dst_lock); |
837 | rwlock_init(&newsk->sk_callback_lock); | 840 | rwlock_init(&newsk->sk_callback_lock); |
@@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock, struct sock *sk) | |||
1383 | skb_queue_head_init(&sk->sk_receive_queue); | 1386 | skb_queue_head_init(&sk->sk_receive_queue); |
1384 | skb_queue_head_init(&sk->sk_write_queue); | 1387 | skb_queue_head_init(&sk->sk_write_queue); |
1385 | skb_queue_head_init(&sk->sk_error_queue); | 1388 | skb_queue_head_init(&sk->sk_error_queue); |
1389 | #ifdef CONFIG_NET_DMA | ||
1390 | skb_queue_head_init(&sk->sk_async_wait_queue); | ||
1391 | #endif | ||
1386 | 1392 | ||
1387 | sk->sk_send_head = NULL; | 1393 | sk->sk_send_head = NULL; |
1388 | 1394 | ||
diff --git a/net/core/user_dma.c b/net/core/user_dma.c new file mode 100644 index 000000000000..b7c98dbcdb81 --- /dev/null +++ b/net/core/user_dma.c | |||
@@ -0,0 +1,131 @@ | |||
1 | /* | ||
2 | * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. | ||
3 | * Portions based on net/core/datagram.c and copyrighted by their authors. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms of the GNU General Public License as published by the Free | ||
7 | * Software Foundation; either version 2 of the License, or (at your option) | ||
8 | * any later version. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, but WITHOUT | ||
11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
13 | * more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License along with | ||
16 | * this program; if not, write to the Free Software Foundation, Inc., 59 | ||
17 | * Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
18 | * | ||
19 | * The full GNU General Public License is included in this distribution in the | ||
20 | * file called COPYING. | ||
21 | */ | ||
22 | |||
23 | /* | ||
24 | * This code allows the net stack to make use of a DMA engine for | ||
25 | * skb to iovec copies. | ||
26 | */ | ||
27 | |||
28 | #include <linux/dmaengine.h> | ||
29 | #include <linux/socket.h> | ||
30 | #include <linux/rtnetlink.h> /* for BUG_TRAP */ | ||
31 | #include <net/tcp.h> | ||
32 | |||
33 | #define NET_DMA_DEFAULT_COPYBREAK 4096 | ||
34 | |||
35 | int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; | ||
36 | |||
37 | /** | ||
38 | * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. | ||
39 | * @skb - buffer to copy | ||
40 | * @offset - offset in the buffer to start copying from | ||
41 | * @to - io vector to copy to | ||
42 | * @len - amount of data to copy from buffer to iovec | ||
43 | * @pinned_list - locked iovec buffer data | ||
44 | * | ||
45 | * Note: the iovec is modified during the copy. | ||
46 | */ | ||
47 | int dma_skb_copy_datagram_iovec(struct dma_chan *chan, | ||
48 | struct sk_buff *skb, int offset, struct iovec *to, | ||
49 | size_t len, struct dma_pinned_list *pinned_list) | ||
50 | { | ||
51 | int start = skb_headlen(skb); | ||
52 | int i, copy = start - offset; | ||
53 | dma_cookie_t cookie = 0; | ||
54 | |||
55 | /* Copy header. */ | ||
56 | if (copy > 0) { | ||
57 | if (copy > len) | ||
58 | copy = len; | ||
59 | cookie = dma_memcpy_to_iovec(chan, to, pinned_list, | ||
60 | skb->data + offset, copy); | ||
61 | if (cookie < 0) | ||
62 | goto fault; | ||
63 | len -= copy; | ||
64 | if (len == 0) | ||
65 | goto end; | ||
66 | offset += copy; | ||
67 | } | ||
68 | |||
69 | /* Copy paged appendix. Hmm... why does this look so complicated? */ | ||
70 | for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { | ||
71 | int end; | ||
72 | |||
73 | BUG_TRAP(start <= offset + len); | ||
74 | |||
75 | end = start + skb_shinfo(skb)->frags[i].size; | ||
76 | copy = end - offset; | ||
77 | if ((copy = end - offset) > 0) { | ||
78 | skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; | ||
79 | struct page *page = frag->page; | ||
80 | |||
81 | if (copy > len) | ||
82 | copy = len; | ||
83 | |||
84 | cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, | ||
85 | frag->page_offset + offset - start, copy); | ||
86 | if (cookie < 0) | ||
87 | goto fault; | ||
88 | len -= copy; | ||
89 | if (len == 0) | ||
90 | goto end; | ||
91 | offset += copy; | ||
92 | } | ||
93 | start = end; | ||
94 | } | ||
95 | |||
96 | if (skb_shinfo(skb)->frag_list) { | ||
97 | struct sk_buff *list = skb_shinfo(skb)->frag_list; | ||
98 | |||
99 | for (; list; list = list->next) { | ||
100 | int end; | ||
101 | |||
102 | BUG_TRAP(start <= offset + len); | ||
103 | |||
104 | end = start + list->len; | ||
105 | copy = end - offset; | ||
106 | if (copy > 0) { | ||
107 | if (copy > len) | ||
108 | copy = len; | ||
109 | cookie = dma_skb_copy_datagram_iovec(chan, list, | ||
110 | offset - start, to, copy, | ||
111 | pinned_list); | ||
112 | if (cookie < 0) | ||
113 | goto fault; | ||
114 | len -= copy; | ||
115 | if (len == 0) | ||
116 | goto end; | ||
117 | offset += copy; | ||
118 | } | ||
119 | start = end; | ||
120 | } | ||
121 | } | ||
122 | |||
123 | end: | ||
124 | if (!len) { | ||
125 | skb->dma_cookie = cookie; | ||
126 | return cookie; | ||
127 | } | ||
128 | |||
129 | fault: | ||
130 | return -EFAULT; | ||
131 | } | ||
diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2e0ee8355c41..5317fd3e6691 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c | |||
@@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
719 | } | 719 | } |
720 | dccp_pr_debug("packet_type=%s\n", | 720 | dccp_pr_debug("packet_type=%s\n", |
721 | dccp_packet_name(dh->dccph_type)); | 721 | dccp_packet_name(dh->dccph_type)); |
722 | sk_eat_skb(sk, skb); | 722 | sk_eat_skb(sk, skb, 0); |
723 | verify_sock_status: | 723 | verify_sock_status: |
724 | if (sock_flag(sk, SOCK_DONE)) { | 724 | if (sock_flag(sk, SOCK_DONE)) { |
725 | len = 0; | 725 | len = 0; |
@@ -773,7 +773,7 @@ verify_sock_status: | |||
773 | } | 773 | } |
774 | found_fin_ok: | 774 | found_fin_ok: |
775 | if (!(flags & MSG_PEEK)) | 775 | if (!(flags & MSG_PEEK)) |
776 | sk_eat_skb(sk, skb); | 776 | sk_eat_skb(sk, skb, 0); |
777 | break; | 777 | break; |
778 | } while (1); | 778 | } while (1); |
779 | out: | 779 | out: |
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 547523b41c81..a2ba9db1c376 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c | |||
@@ -801,8 +801,7 @@ got_it: | |||
801 | * We linearize everything except data segments here. | 801 | * We linearize everything except data segments here. |
802 | */ | 802 | */ |
803 | if (cb->nsp_flags & ~0x60) { | 803 | if (cb->nsp_flags & ~0x60) { |
804 | if (unlikely(skb_is_nonlinear(skb)) && | 804 | if (unlikely(skb_linearize(skb))) |
805 | skb_linearize(skb, GFP_ATOMIC) != 0) | ||
806 | goto free_out; | 805 | goto free_out; |
807 | } | 806 | } |
808 | 807 | ||
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index e172cf98d7fc..5abf7057af00 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c | |||
@@ -629,8 +629,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type | |||
629 | padlen); | 629 | padlen); |
630 | 630 | ||
631 | if (flags & DN_RT_PKT_CNTL) { | 631 | if (flags & DN_RT_PKT_CNTL) { |
632 | if (unlikely(skb_is_nonlinear(skb)) && | 632 | if (unlikely(skb_linearize(skb))) |
633 | skb_linearize(skb, GFP_ATOMIC) != 0) | ||
634 | goto dump_it; | 633 | goto dump_it; |
635 | 634 | ||
636 | switch(flags & DN_RT_CNTL_MSK) { | 635 | switch(flags & DN_RT_CNTL_MSK) { |
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e40f75322377..da33393be45f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig | |||
@@ -414,6 +414,24 @@ config INET_TUNNEL | |||
414 | tristate | 414 | tristate |
415 | default n | 415 | default n |
416 | 416 | ||
417 | config INET_XFRM_MODE_TRANSPORT | ||
418 | tristate "IP: IPsec transport mode" | ||
419 | default y | ||
420 | select XFRM | ||
421 | ---help--- | ||
422 | Support for IPsec transport mode. | ||
423 | |||
424 | If unsure, say Y. | ||
425 | |||
426 | config INET_XFRM_MODE_TUNNEL | ||
427 | tristate "IP: IPsec tunnel mode" | ||
428 | default y | ||
429 | select XFRM | ||
430 | ---help--- | ||
431 | Support for IPsec tunnel mode. | ||
432 | |||
433 | If unsure, say Y. | ||
434 | |||
417 | config INET_DIAG | 435 | config INET_DIAG |
418 | tristate "INET: socket monitoring interface" | 436 | tristate "INET: socket monitoring interface" |
419 | default y | 437 | default y |
@@ -532,6 +550,38 @@ config TCP_CONG_SCALABLE | |||
532 | properties, though is known to have fairness issues. | 550 | properties, though is known to have fairness issues. |
533 | See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ | 551 | See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ |
534 | 552 | ||
553 | config TCP_CONG_LP | ||
554 | tristate "TCP Low Priority" | ||
555 | depends on EXPERIMENTAL | ||
556 | default n | ||
557 | ---help--- | ||
558 | TCP Low Priority (TCP-LP), a distributed algorithm whose goal is | ||
559 | to utilize only the excess network bandwidth as compared to the | ||
560 | ``fair share`` of bandwidth as targeted by TCP. | ||
561 | See http://www-ece.rice.edu/networks/TCP-LP/ | ||
562 | |||
563 | config TCP_CONG_VENO | ||
564 | tristate "TCP Veno" | ||
565 | depends on EXPERIMENTAL | ||
566 | default n | ||
567 | ---help--- | ||
568 | TCP Veno is a sender-side only enhancement of TCP to obtain better | ||
569 | throughput over wireless networks. TCP Veno makes use of state | ||
570 | distinguishing to circumvent the difficult judgment of the packet loss | ||
571 | type. TCP Veno cuts down less congestion window in response to random | ||
572 | loss packets. | ||
573 | See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf | ||
574 | |||
575 | config TCP_CONG_COMPOUND | ||
576 | tristate "TCP Compound" | ||
577 | depends on EXPERIMENTAL | ||
578 | default n | ||
579 | ---help--- | ||
580 | TCP Compound is a sender-side only change to TCP that uses | ||
581 | a mixed Reno/Vegas approach to calculate the cwnd. | ||
582 | For further details look here: | ||
583 | ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf | ||
584 | |||
535 | endmenu | 585 | endmenu |
536 | 586 | ||
537 | config TCP_CONG_BIC | 587 | config TCP_CONG_BIC |
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9ef50a0b9d2c..38b8039bdd55 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile | |||
@@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o | |||
24 | obj-$(CONFIG_INET_IPCOMP) += ipcomp.o | 24 | obj-$(CONFIG_INET_IPCOMP) += ipcomp.o |
25 | obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o | 25 | obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o |
26 | obj-$(CONFIG_INET_TUNNEL) += tunnel4.o | 26 | obj-$(CONFIG_INET_TUNNEL) += tunnel4.o |
27 | obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o | ||
28 | obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o | ||
27 | obj-$(CONFIG_IP_PNP) += ipconfig.o | 29 | obj-$(CONFIG_IP_PNP) += ipconfig.o |
28 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o | 30 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o |
29 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o | 31 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o |
@@ -34,6 +36,7 @@ obj-$(CONFIG_IP_VS) += ipvs/ | |||
34 | obj-$(CONFIG_INET_DIAG) += inet_diag.o | 36 | obj-$(CONFIG_INET_DIAG) += inet_diag.o |
35 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o | 37 | obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o |
36 | obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o | 38 | obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o |
39 | obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o | ||
37 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o | 40 | obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o |
38 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o | 41 | obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o |
39 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o | 42 | obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o |
@@ -41,7 +44,10 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o | |||
41 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o | 44 | obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o |
42 | obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o | 45 | obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o |
43 | obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o | 46 | obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o |
47 | obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o | ||
44 | obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o | 48 | obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o |
49 | obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o | ||
50 | obj-$(CONFIG_TCP_CONG_COMPOUND) += tcp_compound.o | ||
45 | 51 | ||
46 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ | 52 | obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ |
47 | xfrm4_output.o | 53 | xfrm4_output.o |
diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index e2e4771fa4c6..c7782230080d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c | |||
@@ -119,6 +119,7 @@ error: | |||
119 | static int ah_input(struct xfrm_state *x, struct sk_buff *skb) | 119 | static int ah_input(struct xfrm_state *x, struct sk_buff *skb) |
120 | { | 120 | { |
121 | int ah_hlen; | 121 | int ah_hlen; |
122 | int ihl; | ||
122 | struct iphdr *iph; | 123 | struct iphdr *iph; |
123 | struct ip_auth_hdr *ah; | 124 | struct ip_auth_hdr *ah; |
124 | struct ah_data *ahp; | 125 | struct ah_data *ahp; |
@@ -149,13 +150,14 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) | |||
149 | ah = (struct ip_auth_hdr*)skb->data; | 150 | ah = (struct ip_auth_hdr*)skb->data; |
150 | iph = skb->nh.iph; | 151 | iph = skb->nh.iph; |
151 | 152 | ||
152 | memcpy(work_buf, iph, iph->ihl*4); | 153 | ihl = skb->data - skb->nh.raw; |
154 | memcpy(work_buf, iph, ihl); | ||
153 | 155 | ||
154 | iph->ttl = 0; | 156 | iph->ttl = 0; |
155 | iph->tos = 0; | 157 | iph->tos = 0; |
156 | iph->frag_off = 0; | 158 | iph->frag_off = 0; |
157 | iph->check = 0; | 159 | iph->check = 0; |
158 | if (iph->ihl != 5) { | 160 | if (ihl > sizeof(*iph)) { |
159 | u32 dummy; | 161 | u32 dummy; |
160 | if (ip_clear_mutable_options(iph, &dummy)) | 162 | if (ip_clear_mutable_options(iph, &dummy)) |
161 | goto out; | 163 | goto out; |
@@ -164,7 +166,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) | |||
164 | u8 auth_data[MAX_AH_AUTH_LEN]; | 166 | u8 auth_data[MAX_AH_AUTH_LEN]; |
165 | 167 | ||
166 | memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); | 168 | memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); |
167 | skb_push(skb, skb->data - skb->nh.raw); | 169 | skb_push(skb, ihl); |
168 | ahp->icv(ahp, skb, ah->auth_data); | 170 | ahp->icv(ahp, skb, ah->auth_data); |
169 | if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { | 171 | if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { |
170 | x->stats.integrity_failed++; | 172 | x->stats.integrity_failed++; |
@@ -172,11 +174,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) | |||
172 | } | 174 | } |
173 | } | 175 | } |
174 | ((struct iphdr*)work_buf)->protocol = ah->nexthdr; | 176 | ((struct iphdr*)work_buf)->protocol = ah->nexthdr; |
175 | skb->nh.raw = skb_pull(skb, ah_hlen); | 177 | skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl); |
176 | memcpy(skb->nh.raw, work_buf, iph->ihl*4); | 178 | __skb_pull(skb, ah_hlen + ihl); |
177 | skb->nh.iph->tot_len = htons(skb->len); | ||
178 | skb_pull(skb, skb->nh.iph->ihl*4); | ||
179 | skb->h.raw = skb->data; | ||
180 | 179 | ||
181 | return 0; | 180 | return 0; |
182 | 181 | ||
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9d1881c07a32..9bbdd4494551 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c | |||
@@ -143,10 +143,9 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) | |||
143 | int alen = esp->auth.icv_trunc_len; | 143 | int alen = esp->auth.icv_trunc_len; |
144 | int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; | 144 | int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; |
145 | int nfrags; | 145 | int nfrags; |
146 | int encap_len = 0; | 146 | int ihl; |
147 | u8 nexthdr[2]; | 147 | u8 nexthdr[2]; |
148 | struct scatterlist *sg; | 148 | struct scatterlist *sg; |
149 | u8 workbuf[60]; | ||
150 | int padlen; | 149 | int padlen; |
151 | 150 | ||
152 | if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) | 151 | if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) |
@@ -177,7 +176,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) | |||
177 | skb->ip_summed = CHECKSUM_NONE; | 176 | skb->ip_summed = CHECKSUM_NONE; |
178 | 177 | ||
179 | esph = (struct ip_esp_hdr*)skb->data; | 178 | esph = (struct ip_esp_hdr*)skb->data; |
180 | iph = skb->nh.iph; | ||
181 | 179 | ||
182 | /* Get ivec. This can be wrong, check against another impls. */ | 180 | /* Get ivec. This can be wrong, check against another impls. */ |
183 | if (esp->conf.ivlen) | 181 | if (esp->conf.ivlen) |
@@ -204,12 +202,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) | |||
204 | 202 | ||
205 | /* ... check padding bits here. Silly. :-) */ | 203 | /* ... check padding bits here. Silly. :-) */ |
206 | 204 | ||
205 | iph = skb->nh.iph; | ||
206 | ihl = iph->ihl * 4; | ||
207 | |||
207 | if (x->encap) { | 208 | if (x->encap) { |
208 | struct xfrm_encap_tmpl *encap = x->encap; | 209 | struct xfrm_encap_tmpl *encap = x->encap; |
209 | struct udphdr *uh; | 210 | struct udphdr *uh = (void *)(skb->nh.raw + ihl); |
210 | |||
211 | uh = (struct udphdr *)(iph + 1); | ||
212 | encap_len = (void*)esph - (void*)uh; | ||
213 | 211 | ||
214 | /* | 212 | /* |
215 | * 1) if the NAT-T peer's IP or port changed then | 213 | * 1) if the NAT-T peer's IP or port changed then |
@@ -246,11 +244,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) | |||
246 | 244 | ||
247 | iph->protocol = nexthdr[1]; | 245 | iph->protocol = nexthdr[1]; |
248 | pskb_trim(skb, skb->len - alen - padlen - 2); | 246 | pskb_trim(skb, skb->len - alen - padlen - 2); |
249 | memcpy(workbuf, skb->nh.raw, iph->ihl*4); | 247 | skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl; |
250 | skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen); | ||
251 | skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen; | ||
252 | memcpy(skb->nh.raw, workbuf, iph->ihl*4); | ||
253 | skb->nh.iph->tot_len = htons(skb->len); | ||
254 | 248 | ||
255 | return 0; | 249 | return 0; |
256 | 250 | ||
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 2a0455911ee0..017900172f7d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c | |||
@@ -730,7 +730,6 @@ out_err: | |||
730 | static void icmp_redirect(struct sk_buff *skb) | 730 | static void icmp_redirect(struct sk_buff *skb) |
731 | { | 731 | { |
732 | struct iphdr *iph; | 732 | struct iphdr *iph; |
733 | unsigned long ip; | ||
734 | 733 | ||
735 | if (skb->len < sizeof(struct iphdr)) | 734 | if (skb->len < sizeof(struct iphdr)) |
736 | goto out_err; | 735 | goto out_err; |
@@ -742,7 +741,6 @@ static void icmp_redirect(struct sk_buff *skb) | |||
742 | goto out; | 741 | goto out; |
743 | 742 | ||
744 | iph = (struct iphdr *)skb->data; | 743 | iph = (struct iphdr *)skb->data; |
745 | ip = iph->daddr; | ||
746 | 744 | ||
747 | switch (skb->h.icmph->code & 7) { | 745 | switch (skb->h.icmph->code & 7) { |
748 | case ICMP_REDIR_NET: | 746 | case ICMP_REDIR_NET: |
@@ -752,7 +750,8 @@ static void icmp_redirect(struct sk_buff *skb) | |||
752 | */ | 750 | */ |
753 | case ICMP_REDIR_HOST: | 751 | case ICMP_REDIR_HOST: |
754 | case ICMP_REDIR_HOSTTOS: | 752 | case ICMP_REDIR_HOSTTOS: |
755 | ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway, | 753 | ip_rt_redirect(skb->nh.iph->saddr, iph->daddr, |
754 | skb->h.icmph->un.gateway, | ||
756 | iph->saddr, skb->dev); | 755 | iph->saddr, skb->dev); |
757 | break; | 756 | break; |
758 | } | 757 | } |
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d512239a1473..ab680c851aa2 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c | |||
@@ -2361,7 +2361,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) | |||
2361 | } | 2361 | } |
2362 | 2362 | ||
2363 | seq_printf(seq, | 2363 | seq_printf(seq, |
2364 | "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", | 2364 | "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", |
2365 | im->multiaddr, im->users, | 2365 | im->multiaddr, im->users, |
2366 | im->tm_running, im->tm_running ? | 2366 | im->tm_running, im->tm_running ? |
2367 | jiffies_to_clock_t(im->timer.expires-jiffies) : 0, | 2367 | jiffies_to_clock_t(im->timer.expires-jiffies) : 0, |
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cff9c3a72daf..8538aac3d148 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c | |||
@@ -410,6 +410,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) | |||
410 | nf_bridge_get(to->nf_bridge); | 410 | nf_bridge_get(to->nf_bridge); |
411 | #endif | 411 | #endif |
412 | #endif | 412 | #endif |
413 | skb_copy_secmark(to, from); | ||
413 | } | 414 | } |
414 | 415 | ||
415 | /* | 416 | /* |
@@ -839,7 +840,7 @@ int ip_append_data(struct sock *sk, | |||
839 | */ | 840 | */ |
840 | if (transhdrlen && | 841 | if (transhdrlen && |
841 | length + fragheaderlen <= mtu && | 842 | length + fragheaderlen <= mtu && |
842 | rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && | 843 | rt->u.dst.dev->features & NETIF_F_ALL_CSUM && |
843 | !exthdrlen) | 844 | !exthdrlen) |
844 | csummode = CHECKSUM_HW; | 845 | csummode = CHECKSUM_HW; |
845 | 846 | ||
diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 95278b22b669..3ed8b57a1002 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c | |||
@@ -45,7 +45,6 @@ static LIST_HEAD(ipcomp_tfms_list); | |||
45 | static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) | 45 | static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) |
46 | { | 46 | { |
47 | int err, plen, dlen; | 47 | int err, plen, dlen; |
48 | struct iphdr *iph; | ||
49 | struct ipcomp_data *ipcd = x->data; | 48 | struct ipcomp_data *ipcd = x->data; |
50 | u8 *start, *scratch; | 49 | u8 *start, *scratch; |
51 | struct crypto_tfm *tfm; | 50 | struct crypto_tfm *tfm; |
@@ -74,8 +73,6 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) | |||
74 | 73 | ||
75 | skb_put(skb, dlen - plen); | 74 | skb_put(skb, dlen - plen); |
76 | memcpy(skb->data, scratch, dlen); | 75 | memcpy(skb->data, scratch, dlen); |
77 | iph = skb->nh.iph; | ||
78 | iph->tot_len = htons(dlen + iph->ihl * 4); | ||
79 | out: | 76 | out: |
80 | put_cpu(); | 77 | put_cpu(); |
81 | return err; | 78 | return err; |
@@ -83,34 +80,21 @@ out: | |||
83 | 80 | ||
84 | static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) | 81 | static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) |
85 | { | 82 | { |
86 | u8 nexthdr; | 83 | int err = -ENOMEM; |
87 | int err = 0; | ||
88 | struct iphdr *iph; | 84 | struct iphdr *iph; |
89 | union { | 85 | struct ip_comp_hdr *ipch; |
90 | struct iphdr iph; | ||
91 | char buf[60]; | ||
92 | } tmp_iph; | ||
93 | |||
94 | 86 | ||
95 | if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && | 87 | if (skb_linearize_cow(skb)) |
96 | skb_linearize(skb, GFP_ATOMIC) != 0) { | ||
97 | err = -ENOMEM; | ||
98 | goto out; | 88 | goto out; |
99 | } | ||
100 | 89 | ||
101 | skb->ip_summed = CHECKSUM_NONE; | 90 | skb->ip_summed = CHECKSUM_NONE; |
102 | 91 | ||
103 | /* Remove ipcomp header and decompress original payload */ | 92 | /* Remove ipcomp header and decompress original payload */ |
104 | iph = skb->nh.iph; | 93 | iph = skb->nh.iph; |
105 | memcpy(&tmp_iph, iph, iph->ihl * 4); | 94 | ipch = (void *)skb->data; |
106 | nexthdr = *(u8 *)skb->data; | 95 | iph->protocol = ipch->nexthdr; |
107 | skb_pull(skb, sizeof(struct ip_comp_hdr)); | 96 | skb->h.raw = skb->nh.raw + sizeof(*ipch); |
108 | skb->nh.raw += sizeof(struct ip_comp_hdr); | 97 | __skb_pull(skb, sizeof(*ipch)); |
109 | memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4); | ||
110 | iph = skb->nh.iph; | ||
111 | iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr)); | ||
112 | iph->protocol = nexthdr; | ||
113 | skb->h.raw = skb->data; | ||
114 | err = ipcomp_decompress(x, skb); | 98 | err = ipcomp_decompress(x, skb); |
115 | 99 | ||
116 | out: | 100 | out: |
@@ -171,10 +155,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) | |||
171 | goto out_ok; | 155 | goto out_ok; |
172 | } | 156 | } |
173 | 157 | ||
174 | if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && | 158 | if (skb_linearize_cow(skb)) |
175 | skb_linearize(skb, GFP_ATOMIC) != 0) { | ||
176 | goto out_ok; | 159 | goto out_ok; |
177 | } | ||
178 | 160 | ||
179 | err = ipcomp_compress(x, skb); | 161 | err = ipcomp_compress(x, skb); |
180 | iph = skb->nh.iph; | 162 | iph = skb->nh.iph; |
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index d4072533da21..e1d7f5fbc526 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig | |||
@@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK | |||
55 | of packets, but this mark value is kept in the conntrack session | 55 | of packets, but this mark value is kept in the conntrack session |
56 | instead of the individual packets. | 56 | instead of the individual packets. |
57 | 57 | ||
58 | config IP_NF_CONNTRACK_SECMARK | ||
59 | bool 'Connection tracking security mark support' | ||
60 | depends on IP_NF_CONNTRACK && NETWORK_SECMARK | ||
61 | help | ||
62 | This option enables security markings to be applied to | ||
63 | connections. Typically they are copied to connections from | ||
64 | packets using the CONNSECMARK target and copied back from | ||
65 | connections to packets with the same target, with the packets | ||
66 | being originally labeled via SECMARK. | ||
67 | |||
68 | If unsure, say 'N'. | ||
69 | |||
58 | config IP_NF_CONNTRACK_EVENTS | 70 | config IP_NF_CONNTRACK_EVENTS |
59 | bool "Connection tracking events (EXPERIMENTAL)" | 71 | bool "Connection tracking events (EXPERIMENTAL)" |
60 | depends on EXPERIMENTAL && IP_NF_CONNTRACK | 72 | depends on EXPERIMENTAL && IP_NF_CONNTRACK |
@@ -142,6 +154,8 @@ config IP_NF_TFTP | |||
142 | config IP_NF_AMANDA | 154 | config IP_NF_AMANDA |
143 | tristate "Amanda backup protocol support" | 155 | tristate "Amanda backup protocol support" |
144 | depends on IP_NF_CONNTRACK | 156 | depends on IP_NF_CONNTRACK |
157 | select TEXTSEARCH | ||
158 | select TEXTSEARCH_KMP | ||
145 | help | 159 | help |
146 | If you are running the Amanda backup package <http://www.amanda.org/> | 160 | If you are running the Amanda backup package <http://www.amanda.org/> |
147 | on this machine or machines that will be MASQUERADED through this | 161 | on this machine or machines that will be MASQUERADED through this |
@@ -181,14 +195,26 @@ config IP_NF_H323 | |||
181 | With this module you can support H.323 on a connection tracking/NAT | 195 | With this module you can support H.323 on a connection tracking/NAT |
182 | firewall. | 196 | firewall. |
183 | 197 | ||
184 | This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP | 198 | This module supports RAS, Fast Start, H.245 Tunnelling, Call |
185 | and T.120 based data and applications including audio, video, FAX, | 199 | Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat, |
186 | chat, whiteboard, file transfer, etc. For more information, please | 200 | whiteboard, file transfer, etc. For more information, please |
187 | see http://nath323.sourceforge.net/. | 201 | visit http://nath323.sourceforge.net/. |
188 | 202 | ||
189 | If you want to compile it as a module, say 'M' here and read | 203 | If you want to compile it as a module, say 'M' here and read |
190 | Documentation/modules.txt. If unsure, say 'N'. | 204 | Documentation/modules.txt. If unsure, say 'N'. |
191 | 205 | ||
206 | config IP_NF_SIP | ||
207 | tristate "SIP protocol support (EXPERIMENTAL)" | ||
208 | depends on IP_NF_CONNTRACK && EXPERIMENTAL | ||
209 | help | ||
210 | SIP is an application-layer control protocol that can establish, | ||
211 | modify, and terminate multimedia sessions (conferences) such as | ||
212 | Internet telephony calls. With the ip_conntrack_sip and | ||
213 | the ip_nat_sip modules you can support the protocol on a connection | ||
214 | tracking/NATing firewall. | ||
215 | |||
216 | To compile it as a module, choose M here. If unsure, say Y. | ||
217 | |||
192 | config IP_NF_QUEUE | 218 | config IP_NF_QUEUE |
193 | tristate "IP Userspace queueing via NETLINK (OBSOLETE)" | 219 | tristate "IP Userspace queueing via NETLINK (OBSOLETE)" |
194 | help | 220 | help |
@@ -501,6 +527,12 @@ config IP_NF_NAT_H323 | |||
501 | default IP_NF_NAT if IP_NF_H323=y | 527 | default IP_NF_NAT if IP_NF_H323=y |
502 | default m if IP_NF_H323=m | 528 | default m if IP_NF_H323=m |
503 | 529 | ||
530 | config IP_NF_NAT_SIP | ||
531 | tristate | ||
532 | depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n | ||
533 | default IP_NF_NAT if IP_NF_SIP=y | ||
534 | default m if IP_NF_SIP=m | ||
535 | |||
504 | # mangle + specific targets | 536 | # mangle + specific targets |
505 | config IP_NF_MANGLE | 537 | config IP_NF_MANGLE |
506 | tristate "Packet mangling" | 538 | tristate "Packet mangling" |
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 461cb1eb5de7..3ded4a3af59c 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile | |||
@@ -31,6 +31,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o | |||
31 | obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o | 31 | obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o |
32 | obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o | 32 | obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o |
33 | obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o | 33 | obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o |
34 | obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o | ||
34 | obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o | 35 | obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o |
35 | 36 | ||
36 | # NAT helpers | 37 | # NAT helpers |
@@ -40,6 +41,7 @@ obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o | |||
40 | obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o | 41 | obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o |
41 | obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o | 42 | obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o |
42 | obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o | 43 | obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o |
44 | obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o | ||
43 | 45 | ||
44 | # generic IP tables | 46 | # generic IP tables |
45 | obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o | 47 | obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o |
diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index a604b1ccfdaa..0a7bd7f04061 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c | |||
@@ -17,33 +17,29 @@ | |||
17 | * this value. | 17 | * this value. |
18 | * | 18 | * |
19 | */ | 19 | */ |
20 | |||
21 | #include <linux/in.h> | ||
22 | #include <linux/kernel.h> | 20 | #include <linux/kernel.h> |
23 | #include <linux/module.h> | 21 | #include <linux/module.h> |
24 | #include <linux/netfilter.h> | ||
25 | #include <linux/ip.h> | ||
26 | #include <linux/moduleparam.h> | 22 | #include <linux/moduleparam.h> |
23 | #include <linux/textsearch.h> | ||
24 | #include <linux/skbuff.h> | ||
25 | #include <linux/in.h> | ||
26 | #include <linux/ip.h> | ||
27 | #include <linux/udp.h> | 27 | #include <linux/udp.h> |
28 | #include <net/checksum.h> | ||
29 | #include <net/udp.h> | ||
30 | 28 | ||
29 | #include <linux/netfilter.h> | ||
31 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> | 30 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> |
32 | #include <linux/netfilter_ipv4/ip_conntrack_amanda.h> | 31 | #include <linux/netfilter_ipv4/ip_conntrack_amanda.h> |
33 | 32 | ||
34 | static unsigned int master_timeout = 300; | 33 | static unsigned int master_timeout = 300; |
34 | static char *ts_algo = "kmp"; | ||
35 | 35 | ||
36 | MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); | 36 | MODULE_AUTHOR("Brian J. Murrell <netfilter@interlinx.bc.ca>"); |
37 | MODULE_DESCRIPTION("Amanda connection tracking module"); | 37 | MODULE_DESCRIPTION("Amanda connection tracking module"); |
38 | MODULE_LICENSE("GPL"); | 38 | MODULE_LICENSE("GPL"); |
39 | module_param(master_timeout, uint, 0600); | 39 | module_param(master_timeout, uint, 0600); |
40 | MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); | 40 | MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); |
41 | 41 | module_param(ts_algo, charp, 0400); | |
42 | static const char *conns[] = { "DATA ", "MESG ", "INDEX " }; | 42 | MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); |
43 | |||
44 | /* This is slow, but it's simple. --RR */ | ||
45 | static char *amanda_buffer; | ||
46 | static DEFINE_SPINLOCK(amanda_buffer_lock); | ||
47 | 43 | ||
48 | unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, | 44 | unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, |
49 | enum ip_conntrack_info ctinfo, | 45 | enum ip_conntrack_info ctinfo, |
@@ -52,12 +48,48 @@ unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, | |||
52 | struct ip_conntrack_expect *exp); | 48 | struct ip_conntrack_expect *exp); |
53 | EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); | 49 | EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); |
54 | 50 | ||
51 | enum amanda_strings { | ||
52 | SEARCH_CONNECT, | ||
53 | SEARCH_NEWLINE, | ||
54 | SEARCH_DATA, | ||
55 | SEARCH_MESG, | ||
56 | SEARCH_INDEX, | ||
57 | }; | ||
58 | |||
59 | static struct { | ||
60 | char *string; | ||
61 | size_t len; | ||
62 | struct ts_config *ts; | ||
63 | } search[] = { | ||
64 | [SEARCH_CONNECT] = { | ||
65 | .string = "CONNECT ", | ||
66 | .len = 8, | ||
67 | }, | ||
68 | [SEARCH_NEWLINE] = { | ||
69 | .string = "\n", | ||
70 | .len = 1, | ||
71 | }, | ||
72 | [SEARCH_DATA] = { | ||
73 | .string = "DATA ", | ||
74 | .len = 5, | ||
75 | }, | ||
76 | [SEARCH_MESG] = { | ||
77 | .string = "MESG ", | ||
78 | .len = 5, | ||
79 | }, | ||
80 | [SEARCH_INDEX] = { | ||
81 | .string = "INDEX ", | ||
82 | .len = 6, | ||
83 | }, | ||
84 | }; | ||
85 | |||
55 | static int help(struct sk_buff **pskb, | 86 | static int help(struct sk_buff **pskb, |
56 | struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) | 87 | struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) |
57 | { | 88 | { |
89 | struct ts_state ts; | ||
58 | struct ip_conntrack_expect *exp; | 90 | struct ip_conntrack_expect *exp; |
59 | char *data, *data_limit, *tmp; | 91 | unsigned int dataoff, start, stop, off, i; |
60 | unsigned int dataoff, i; | 92 | char pbuf[sizeof("65535")], *tmp; |
61 | u_int16_t port, len; | 93 | u_int16_t port, len; |
62 | int ret = NF_ACCEPT; | 94 | int ret = NF_ACCEPT; |
63 | 95 | ||
@@ -77,29 +109,34 @@ static int help(struct sk_buff **pskb, | |||
77 | return NF_ACCEPT; | 109 | return NF_ACCEPT; |
78 | } | 110 | } |
79 | 111 | ||
80 | spin_lock_bh(&amanda_buffer_lock); | 112 | memset(&ts, 0, sizeof(ts)); |
81 | skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); | 113 | start = skb_find_text(*pskb, dataoff, (*pskb)->len, |
82 | data = amanda_buffer; | 114 | search[SEARCH_CONNECT].ts, &ts); |
83 | data_limit = amanda_buffer + (*pskb)->len - dataoff; | 115 | if (start == UINT_MAX) |
84 | *data_limit = '\0'; | ||
85 | |||
86 | /* Search for the CONNECT string */ | ||
87 | data = strstr(data, "CONNECT "); | ||
88 | if (!data) | ||
89 | goto out; | 116 | goto out; |
90 | data += strlen("CONNECT "); | 117 | start += dataoff + search[SEARCH_CONNECT].len; |
91 | 118 | ||
92 | /* Only search first line. */ | 119 | memset(&ts, 0, sizeof(ts)); |
93 | if ((tmp = strchr(data, '\n'))) | 120 | stop = skb_find_text(*pskb, start, (*pskb)->len, |
94 | *tmp = '\0'; | 121 | search[SEARCH_NEWLINE].ts, &ts); |
122 | if (stop == UINT_MAX) | ||
123 | goto out; | ||
124 | stop += start; | ||
95 | 125 | ||
96 | for (i = 0; i < ARRAY_SIZE(conns); i++) { | 126 | for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) { |
97 | char *match = strstr(data, conns[i]); | 127 | memset(&ts, 0, sizeof(ts)); |
98 | if (!match) | 128 | off = skb_find_text(*pskb, start, stop, search[i].ts, &ts); |
129 | if (off == UINT_MAX) | ||
99 | continue; | 130 | continue; |
100 | tmp = data = match + strlen(conns[i]); | 131 | off += start + search[i].len; |
101 | port = simple_strtoul(data, &data, 10); | 132 | |
102 | len = data - tmp; | 133 | len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off); |
134 | if (skb_copy_bits(*pskb, off, pbuf, len)) | ||
135 | break; | ||
136 | pbuf[len] = '\0'; | ||
137 | |||
138 | port = simple_strtoul(pbuf, &tmp, 10); | ||
139 | len = tmp - pbuf; | ||
103 | if (port == 0 || len > 5) | 140 | if (port == 0 || len > 5) |
104 | break; | 141 | break; |
105 | 142 | ||
@@ -125,8 +162,7 @@ static int help(struct sk_buff **pskb, | |||
125 | exp->mask.dst.u.tcp.port = 0xFFFF; | 162 | exp->mask.dst.u.tcp.port = 0xFFFF; |
126 | 163 | ||
127 | if (ip_nat_amanda_hook) | 164 | if (ip_nat_amanda_hook) |
128 | ret = ip_nat_amanda_hook(pskb, ctinfo, | 165 | ret = ip_nat_amanda_hook(pskb, ctinfo, off - dataoff, |
129 | tmp - amanda_buffer, | ||
130 | len, exp); | 166 | len, exp); |
131 | else if (ip_conntrack_expect_related(exp) != 0) | 167 | else if (ip_conntrack_expect_related(exp) != 0) |
132 | ret = NF_DROP; | 168 | ret = NF_DROP; |
@@ -134,12 +170,11 @@ static int help(struct sk_buff **pskb, | |||
134 | } | 170 | } |
135 | 171 | ||
136 | out: | 172 | out: |
137 | spin_unlock_bh(&amanda_buffer_lock); | ||
138 | return ret; | 173 | return ret; |
139 | } | 174 | } |
140 | 175 | ||
141 | static struct ip_conntrack_helper amanda_helper = { | 176 | static struct ip_conntrack_helper amanda_helper = { |
142 | .max_expected = ARRAY_SIZE(conns), | 177 | .max_expected = 3, |
143 | .timeout = 180, | 178 | .timeout = 180, |
144 | .me = THIS_MODULE, | 179 | .me = THIS_MODULE, |
145 | .help = help, | 180 | .help = help, |
@@ -155,26 +190,36 @@ static struct ip_conntrack_helper amanda_helper = { | |||
155 | 190 | ||
156 | static void __exit ip_conntrack_amanda_fini(void) | 191 | static void __exit ip_conntrack_amanda_fini(void) |
157 | { | 192 | { |
193 | int i; | ||
194 | |||
158 | ip_conntrack_helper_unregister(&amanda_helper); | 195 | ip_conntrack_helper_unregister(&amanda_helper); |
159 | kfree(amanda_buffer); | 196 | for (i = 0; i < ARRAY_SIZE(search); i++) |
197 | textsearch_destroy(search[i].ts); | ||
160 | } | 198 | } |
161 | 199 | ||
162 | static int __init ip_conntrack_amanda_init(void) | 200 | static int __init ip_conntrack_amanda_init(void) |
163 | { | 201 | { |
164 | int ret; | 202 | int ret, i; |
165 | 203 | ||
166 | amanda_buffer = kmalloc(65536, GFP_KERNEL); | 204 | ret = -ENOMEM; |
167 | if (!amanda_buffer) | 205 | for (i = 0; i < ARRAY_SIZE(search); i++) { |
168 | return -ENOMEM; | 206 | search[i].ts = textsearch_prepare(ts_algo, search[i].string, |
169 | 207 | search[i].len, | |
170 | ret = ip_conntrack_helper_register(&amanda_helper); | 208 | GFP_KERNEL, TS_AUTOLOAD); |
171 | if (ret < 0) { | 209 | if (search[i].ts == NULL) |
172 | kfree(amanda_buffer); | 210 | goto err; |
173 | return ret; | ||
174 | } | 211 | } |
212 | ret = ip_conntrack_helper_register(&amanda_helper); | ||
213 | if (ret < 0) | ||
214 | goto err; | ||
175 | return 0; | 215 | return 0; |
176 | 216 | ||
177 | 217 | err: | |
218 | for (; i >= 0; i--) { | ||
219 | if (search[i].ts) | ||
220 | textsearch_destroy(search[i].ts); | ||
221 | } | ||
222 | return ret; | ||
178 | } | 223 | } |
179 | 224 | ||
180 | module_init(ip_conntrack_amanda_init); | 225 | module_init(ip_conntrack_amanda_init); |
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index a297da7bbef5..7e4cf9a4d15f 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c | |||
@@ -724,6 +724,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple, | |||
724 | /* this is ugly, but there is no other place where to put it */ | 724 | /* this is ugly, but there is no other place where to put it */ |
725 | conntrack->nat.masq_index = exp->master->nat.masq_index; | 725 | conntrack->nat.masq_index = exp->master->nat.masq_index; |
726 | #endif | 726 | #endif |
727 | #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK | ||
728 | conntrack->secmark = exp->master->secmark; | ||
729 | #endif | ||
727 | nf_conntrack_get(&conntrack->master->ct_general); | 730 | nf_conntrack_get(&conntrack->master->ct_general); |
728 | CONNTRACK_STAT_INC(expect_new); | 731 | CONNTRACK_STAT_INC(expect_new); |
729 | } else { | 732 | } else { |
@@ -1130,6 +1133,12 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct, | |||
1130 | 1133 | ||
1131 | write_lock_bh(&ip_conntrack_lock); | 1134 | write_lock_bh(&ip_conntrack_lock); |
1132 | 1135 | ||
1136 | /* Only update if this is not a fixed timeout */ | ||
1137 | if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { | ||
1138 | write_unlock_bh(&ip_conntrack_lock); | ||
1139 | return; | ||
1140 | } | ||
1141 | |||
1133 | /* If not in hash table, timer will not be active yet */ | 1142 | /* If not in hash table, timer will not be active yet */ |
1134 | if (!is_confirmed(ct)) { | 1143 | if (!is_confirmed(ct)) { |
1135 | ct->timeout.expires = extra_jiffies; | 1144 | ct->timeout.expires = extra_jiffies; |
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 3e542bf28a9d..4dcf526c3944 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c | |||
@@ -56,37 +56,48 @@ static int try_eprt(const char *, size_t, u_int32_t [], char); | |||
56 | static int try_epsv_response(const char *, size_t, u_int32_t [], char); | 56 | static int try_epsv_response(const char *, size_t, u_int32_t [], char); |
57 | 57 | ||
58 | static const struct ftp_search { | 58 | static const struct ftp_search { |
59 | enum ip_conntrack_dir dir; | ||
60 | const char *pattern; | 59 | const char *pattern; |
61 | size_t plen; | 60 | size_t plen; |
62 | char skip; | 61 | char skip; |
63 | char term; | 62 | char term; |
64 | enum ip_ct_ftp_type ftptype; | 63 | enum ip_ct_ftp_type ftptype; |
65 | int (*getnum)(const char *, size_t, u_int32_t[], char); | 64 | int (*getnum)(const char *, size_t, u_int32_t[], char); |
66 | } search[] = { | 65 | } search[IP_CT_DIR_MAX][2] = { |
67 | { | 66 | [IP_CT_DIR_ORIGINAL] = { |
68 | IP_CT_DIR_ORIGINAL, | 67 | { |
69 | "PORT", sizeof("PORT") - 1, ' ', '\r', | 68 | .pattern = "PORT", |
70 | IP_CT_FTP_PORT, | 69 | .plen = sizeof("PORT") - 1, |
71 | try_rfc959, | 70 | .skip = ' ', |
71 | .term = '\r', | ||
72 | .ftptype = IP_CT_FTP_PORT, | ||
73 | .getnum = try_rfc959, | ||
74 | }, | ||
75 | { | ||
76 | .pattern = "EPRT", | ||
77 | .plen = sizeof("EPRT") - 1, | ||
78 | .skip = ' ', | ||
79 | .term = '\r', | ||
80 | .ftptype = IP_CT_FTP_EPRT, | ||
81 | .getnum = try_eprt, | ||
82 | }, | ||
72 | }, | 83 | }, |
73 | { | 84 | [IP_CT_DIR_REPLY] = { |
74 | IP_CT_DIR_REPLY, | 85 | { |
75 | "227 ", sizeof("227 ") - 1, '(', ')', | 86 | .pattern = "227 ", |
76 | IP_CT_FTP_PASV, | 87 | .plen = sizeof("227 ") - 1, |
77 | try_rfc959, | 88 | .skip = '(', |
78 | }, | 89 | .term = ')', |
79 | { | 90 | .ftptype = IP_CT_FTP_PASV, |
80 | IP_CT_DIR_ORIGINAL, | 91 | .getnum = try_rfc959, |
81 | "EPRT", sizeof("EPRT") - 1, ' ', '\r', | 92 | }, |
82 | IP_CT_FTP_EPRT, | 93 | { |
83 | try_eprt, | 94 | .pattern = "229 ", |
84 | }, | 95 | .plen = sizeof("229 ") - 1, |
85 | { | 96 | .skip = '(', |
86 | IP_CT_DIR_REPLY, | 97 | .term = ')', |
87 | "229 ", sizeof("229 ") - 1, '(', ')', | 98 | .ftptype = IP_CT_FTP_EPSV, |
88 | IP_CT_FTP_EPSV, | 99 | .getnum = try_epsv_response, |
89 | try_epsv_response, | 100 | }, |
90 | }, | 101 | }, |
91 | }; | 102 | }; |
92 | 103 | ||
@@ -346,17 +357,15 @@ static int help(struct sk_buff **pskb, | |||
346 | array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; | 357 | array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; |
347 | array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; | 358 | array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; |
348 | 359 | ||
349 | for (i = 0; i < ARRAY_SIZE(search); i++) { | 360 | for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { |
350 | if (search[i].dir != dir) continue; | ||
351 | |||
352 | found = find_pattern(fb_ptr, (*pskb)->len - dataoff, | 361 | found = find_pattern(fb_ptr, (*pskb)->len - dataoff, |
353 | search[i].pattern, | 362 | search[dir][i].pattern, |
354 | search[i].plen, | 363 | search[dir][i].plen, |
355 | search[i].skip, | 364 | search[dir][i].skip, |
356 | search[i].term, | 365 | search[dir][i].term, |
357 | &matchoff, &matchlen, | 366 | &matchoff, &matchlen, |
358 | array, | 367 | array, |
359 | search[i].getnum); | 368 | search[dir][i].getnum); |
360 | if (found) break; | 369 | if (found) break; |
361 | } | 370 | } |
362 | if (found == -1) { | 371 | if (found == -1) { |
@@ -366,7 +375,7 @@ static int help(struct sk_buff **pskb, | |||
366 | this case. */ | 375 | this case. */ |
367 | if (net_ratelimit()) | 376 | if (net_ratelimit()) |
368 | printk("conntrack_ftp: partial %s %u+%u\n", | 377 | printk("conntrack_ftp: partial %s %u+%u\n", |
369 | search[i].pattern, | 378 | search[dir][i].pattern, |
370 | ntohl(th->seq), datalen); | 379 | ntohl(th->seq), datalen); |
371 | ret = NF_DROP; | 380 | ret = NF_DROP; |
372 | goto out; | 381 | goto out; |
@@ -426,7 +435,7 @@ static int help(struct sk_buff **pskb, | |||
426 | /* Now, NAT might want to mangle the packet, and register the | 435 | /* Now, NAT might want to mangle the packet, and register the |
427 | * (possibly changed) expectation itself. */ | 436 | * (possibly changed) expectation itself. */ |
428 | if (ip_nat_ftp_hook) | 437 | if (ip_nat_ftp_hook) |
429 | ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, | 438 | ret = ip_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype, |
430 | matchoff, matchlen, exp, &seq); | 439 | matchoff, matchlen, exp, &seq); |
431 | else { | 440 | else { |
432 | /* Can't expect this? Best to drop packet now. */ | 441 | /* Can't expect this? Best to drop packet now. */ |
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c index 518f581d39ec..0665674218c6 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c | |||
@@ -22,6 +22,8 @@ | |||
22 | #include <linux/netfilter_ipv4/ip_conntrack_tuple.h> | 22 | #include <linux/netfilter_ipv4/ip_conntrack_tuple.h> |
23 | #include <linux/netfilter_ipv4/ip_conntrack_h323.h> | 23 | #include <linux/netfilter_ipv4/ip_conntrack_h323.h> |
24 | #include <linux/moduleparam.h> | 24 | #include <linux/moduleparam.h> |
25 | #include <linux/ctype.h> | ||
26 | #include <linux/inet.h> | ||
25 | 27 | ||
26 | #if 0 | 28 | #if 0 |
27 | #define DEBUGP printk | 29 | #define DEBUGP printk |
@@ -38,6 +40,12 @@ static int gkrouted_only = 1; | |||
38 | module_param(gkrouted_only, int, 0600); | 40 | module_param(gkrouted_only, int, 0600); |
39 | MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); | 41 | MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); |
40 | 42 | ||
43 | static int callforward_filter = 1; | ||
44 | module_param(callforward_filter, bool, 0600); | ||
45 | MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " | ||
46 | "if both endpoints are on different sides " | ||
47 | "(determined by routing information)"); | ||
48 | |||
41 | /* Hooks for NAT */ | 49 | /* Hooks for NAT */ |
42 | int (*set_h245_addr_hook) (struct sk_buff ** pskb, | 50 | int (*set_h245_addr_hook) (struct sk_buff ** pskb, |
43 | unsigned char **data, int dataoff, | 51 | unsigned char **data, int dataoff, |
@@ -77,6 +85,12 @@ int (*nat_h245_hook) (struct sk_buff ** pskb, | |||
77 | unsigned char **data, int dataoff, | 85 | unsigned char **data, int dataoff, |
78 | TransportAddress * addr, u_int16_t port, | 86 | TransportAddress * addr, u_int16_t port, |
79 | struct ip_conntrack_expect * exp); | 87 | struct ip_conntrack_expect * exp); |
88 | int (*nat_callforwarding_hook) (struct sk_buff ** pskb, | ||
89 | struct ip_conntrack * ct, | ||
90 | enum ip_conntrack_info ctinfo, | ||
91 | unsigned char **data, int dataoff, | ||
92 | TransportAddress * addr, u_int16_t port, | ||
93 | struct ip_conntrack_expect * exp); | ||
80 | int (*nat_q931_hook) (struct sk_buff ** pskb, | 94 | int (*nat_q931_hook) (struct sk_buff ** pskb, |
81 | struct ip_conntrack * ct, | 95 | struct ip_conntrack * ct, |
82 | enum ip_conntrack_info ctinfo, | 96 | enum ip_conntrack_info ctinfo, |
@@ -683,6 +697,92 @@ static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct, | |||
683 | return ret; | 697 | return ret; |
684 | } | 698 | } |
685 | 699 | ||
700 | /* Forwarding declaration */ | ||
701 | void ip_conntrack_q931_expect(struct ip_conntrack *new, | ||
702 | struct ip_conntrack_expect *this); | ||
703 | |||
704 | /****************************************************************************/ | ||
705 | static int expect_callforwarding(struct sk_buff **pskb, | ||
706 | struct ip_conntrack *ct, | ||
707 | enum ip_conntrack_info ctinfo, | ||
708 | unsigned char **data, int dataoff, | ||
709 | TransportAddress * addr) | ||
710 | { | ||
711 | int dir = CTINFO2DIR(ctinfo); | ||
712 | int ret = 0; | ||
713 | u_int32_t ip; | ||
714 | u_int16_t port; | ||
715 | struct ip_conntrack_expect *exp = NULL; | ||
716 | |||
717 | /* Read alternativeAddress */ | ||
718 | if (!get_h225_addr(*data, addr, &ip, &port) || port == 0) | ||
719 | return 0; | ||
720 | |||
721 | /* If the calling party is on the same side of the forward-to party, | ||
722 | * we don't need to track the second call */ | ||
723 | if (callforward_filter) { | ||
724 | struct rtable *rt1, *rt2; | ||
725 | struct flowi fl1 = { | ||
726 | .fl4_dst = ip, | ||
727 | }; | ||
728 | struct flowi fl2 = { | ||
729 | .fl4_dst = ct->tuplehash[!dir].tuple.src.ip, | ||
730 | }; | ||
731 | |||
732 | if (ip_route_output_key(&rt1, &fl1) == 0) { | ||
733 | if (ip_route_output_key(&rt2, &fl2) == 0) { | ||
734 | if (rt1->rt_gateway == rt2->rt_gateway && | ||
735 | rt1->u.dst.dev == rt2->u.dst.dev) | ||
736 | ret = 1; | ||
737 | dst_release(&rt2->u.dst); | ||
738 | } | ||
739 | dst_release(&rt1->u.dst); | ||
740 | } | ||
741 | if (ret) { | ||
742 | DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); | ||
743 | return 0; | ||
744 | } | ||
745 | } | ||
746 | |||
747 | /* Create expect for the second call leg */ | ||
748 | if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) | ||
749 | return -1; | ||
750 | exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; | ||
751 | exp->tuple.src.u.tcp.port = 0; | ||
752 | exp->tuple.dst.ip = ip; | ||
753 | exp->tuple.dst.u.tcp.port = htons(port); | ||
754 | exp->tuple.dst.protonum = IPPROTO_TCP; | ||
755 | exp->mask.src.ip = 0xFFFFFFFF; | ||
756 | exp->mask.src.u.tcp.port = 0; | ||
757 | exp->mask.dst.ip = 0xFFFFFFFF; | ||
758 | exp->mask.dst.u.tcp.port = 0xFFFF; | ||
759 | exp->mask.dst.protonum = 0xFF; | ||
760 | exp->flags = 0; | ||
761 | |||
762 | if (ct->tuplehash[dir].tuple.src.ip != | ||
763 | ct->tuplehash[!dir].tuple.dst.ip && nat_callforwarding_hook) { | ||
764 | /* Need NAT */ | ||
765 | ret = nat_callforwarding_hook(pskb, ct, ctinfo, data, dataoff, | ||
766 | addr, port, exp); | ||
767 | } else { /* Conntrack only */ | ||
768 | exp->expectfn = ip_conntrack_q931_expect; | ||
769 | |||
770 | if (ip_conntrack_expect_related(exp) == 0) { | ||
771 | DEBUGP("ip_ct_q931: expect Call Forwarding " | ||
772 | "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", | ||
773 | NIPQUAD(exp->tuple.src.ip), | ||
774 | ntohs(exp->tuple.src.u.tcp.port), | ||
775 | NIPQUAD(exp->tuple.dst.ip), | ||
776 | ntohs(exp->tuple.dst.u.tcp.port)); | ||
777 | } else | ||
778 | ret = -1; | ||
779 | } | ||
780 | |||
781 | ip_conntrack_expect_put(exp); | ||
782 | |||
783 | return ret; | ||
784 | } | ||
785 | |||
686 | /****************************************************************************/ | 786 | /****************************************************************************/ |
687 | static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct, | 787 | static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct, |
688 | enum ip_conntrack_info ctinfo, | 788 | enum ip_conntrack_info ctinfo, |
@@ -878,6 +978,15 @@ static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct, | |||
878 | 978 | ||
879 | DEBUGP("ip_ct_q931: Facility\n"); | 979 | DEBUGP("ip_ct_q931: Facility\n"); |
880 | 980 | ||
981 | if (facility->reason.choice == eFacilityReason_callForwarded) { | ||
982 | if (facility->options & eFacility_UUIE_alternativeAddress) | ||
983 | return expect_callforwarding(pskb, ct, ctinfo, data, | ||
984 | dataoff, | ||
985 | &facility-> | ||
986 | alternativeAddress); | ||
987 | return 0; | ||
988 | } | ||
989 | |||
881 | if (facility->options & eFacility_UUIE_h245Address) { | 990 | if (facility->options & eFacility_UUIE_h245Address) { |
882 | ret = expect_h245(pskb, ct, ctinfo, data, dataoff, | 991 | ret = expect_h245(pskb, ct, ctinfo, data, dataoff, |
883 | &facility->h245Address); | 992 | &facility->h245Address); |
@@ -1677,7 +1786,6 @@ static int __init init(void) | |||
1677 | fini(); | 1786 | fini(); |
1678 | return ret; | 1787 | return ret; |
1679 | } | 1788 | } |
1680 | |||
1681 | DEBUGP("ip_ct_h323: init success\n"); | 1789 | DEBUGP("ip_ct_h323: init success\n"); |
1682 | return 0; | 1790 | return 0; |
1683 | } | 1791 | } |
@@ -1696,6 +1804,7 @@ EXPORT_SYMBOL_GPL(set_ras_addr_hook); | |||
1696 | EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); | 1804 | EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); |
1697 | EXPORT_SYMBOL_GPL(nat_t120_hook); | 1805 | EXPORT_SYMBOL_GPL(nat_t120_hook); |
1698 | EXPORT_SYMBOL_GPL(nat_h245_hook); | 1806 | EXPORT_SYMBOL_GPL(nat_h245_hook); |
1807 | EXPORT_SYMBOL_GPL(nat_callforwarding_hook); | ||
1699 | EXPORT_SYMBOL_GPL(nat_q931_hook); | 1808 | EXPORT_SYMBOL_GPL(nat_q931_hook); |
1700 | 1809 | ||
1701 | MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); | 1810 | MODULE_AUTHOR("Jing Min Zhao <zhaojingmin@users.sourceforge.net>"); |
diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c index 022c47b9f6c9..4b359618bedd 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006 | 1 | /* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006 |
2 | * | 2 | * |
3 | * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> | 3 | * Copyright (c) 2006 Jing Min Zhao <zhaojingmin@users.sourceforge.net> |
4 | * | 4 | * |
@@ -1069,8 +1069,8 @@ static field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */ | |||
1069 | 1069 | ||
1070 | static field_t _Facility_UUIE[] = { /* SEQUENCE */ | 1070 | static field_t _Facility_UUIE[] = { /* SEQUENCE */ |
1071 | {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, | 1071 | {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, |
1072 | {FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0, | 1072 | {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, |
1073 | _TransportAddress}, | 1073 | offsetof(Facility_UUIE, alternativeAddress), _TransportAddress}, |
1074 | {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, | 1074 | {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, |
1075 | _Facility_UUIE_alternativeAliasAddress}, | 1075 | _Facility_UUIE_alternativeAliasAddress}, |
1076 | {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, | 1076 | {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, |
diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 01bd7cab9367..33891bb1fde4 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c | |||
@@ -399,38 +399,54 @@ nfattr_failure: | |||
399 | static int ctnetlink_done(struct netlink_callback *cb) | 399 | static int ctnetlink_done(struct netlink_callback *cb) |
400 | { | 400 | { |
401 | DEBUGP("entered %s\n", __FUNCTION__); | 401 | DEBUGP("entered %s\n", __FUNCTION__); |
402 | if (cb->args[1]) | ||
403 | ip_conntrack_put((struct ip_conntrack *)cb->args[1]); | ||
402 | return 0; | 404 | return 0; |
403 | } | 405 | } |
404 | 406 | ||
405 | static int | 407 | static int |
406 | ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) | 408 | ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) |
407 | { | 409 | { |
408 | struct ip_conntrack *ct = NULL; | 410 | struct ip_conntrack *ct, *last; |
409 | struct ip_conntrack_tuple_hash *h; | 411 | struct ip_conntrack_tuple_hash *h; |
410 | struct list_head *i; | 412 | struct list_head *i; |
411 | u_int32_t *id = (u_int32_t *) &cb->args[1]; | ||
412 | 413 | ||
413 | DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, | 414 | DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, |
414 | cb->args[0], *id); | 415 | cb->args[0], *id); |
415 | 416 | ||
416 | read_lock_bh(&ip_conntrack_lock); | 417 | read_lock_bh(&ip_conntrack_lock); |
417 | for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { | 418 | for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) { |
419 | restart: | ||
420 | last = (struct ip_conntrack *)cb->args[1]; | ||
418 | list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { | 421 | list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { |
419 | h = (struct ip_conntrack_tuple_hash *) i; | 422 | h = (struct ip_conntrack_tuple_hash *) i; |
420 | if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) | 423 | if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) |
421 | continue; | 424 | continue; |
422 | ct = tuplehash_to_ctrack(h); | 425 | ct = tuplehash_to_ctrack(h); |
423 | if (ct->id <= *id) | 426 | if (last != NULL) { |
424 | continue; | 427 | if (ct == last) { |
428 | ip_conntrack_put(last); | ||
429 | cb->args[1] = 0; | ||
430 | last = NULL; | ||
431 | } else | ||
432 | continue; | ||
433 | } | ||
425 | if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, | 434 | if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, |
426 | cb->nlh->nlmsg_seq, | 435 | cb->nlh->nlmsg_seq, |
427 | IPCTNL_MSG_CT_NEW, | 436 | IPCTNL_MSG_CT_NEW, |
428 | 1, ct) < 0) | 437 | 1, ct) < 0) { |
438 | nf_conntrack_get(&ct->ct_general); | ||
439 | cb->args[1] = (unsigned long)ct; | ||
429 | goto out; | 440 | goto out; |
430 | *id = ct->id; | 441 | } |
442 | } | ||
443 | if (last != NULL) { | ||
444 | ip_conntrack_put(last); | ||
445 | cb->args[1] = 0; | ||
446 | goto restart; | ||
431 | } | 447 | } |
432 | } | 448 | } |
433 | out: | 449 | out: |
434 | read_unlock_bh(&ip_conntrack_lock); | 450 | read_unlock_bh(&ip_conntrack_lock); |
435 | 451 | ||
436 | DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); | 452 | DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); |
@@ -629,7 +645,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = { | |||
629 | }; | 645 | }; |
630 | 646 | ||
631 | static inline int | 647 | static inline int |
632 | ctnetlink_parse_nat(struct nfattr *cda[], | 648 | ctnetlink_parse_nat(struct nfattr *nat, |
633 | const struct ip_conntrack *ct, struct ip_nat_range *range) | 649 | const struct ip_conntrack *ct, struct ip_nat_range *range) |
634 | { | 650 | { |
635 | struct nfattr *tb[CTA_NAT_MAX]; | 651 | struct nfattr *tb[CTA_NAT_MAX]; |
@@ -639,7 +655,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], | |||
639 | 655 | ||
640 | memset(range, 0, sizeof(*range)); | 656 | memset(range, 0, sizeof(*range)); |
641 | 657 | ||
642 | nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); | 658 | nfattr_parse_nested(tb, CTA_NAT_MAX, nat); |
643 | 659 | ||
644 | if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) | 660 | if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) |
645 | return -EINVAL; | 661 | return -EINVAL; |
@@ -854,39 +870,30 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) | |||
854 | /* ASSURED bit can only be set */ | 870 | /* ASSURED bit can only be set */ |
855 | return -EINVAL; | 871 | return -EINVAL; |
856 | 872 | ||
857 | if (cda[CTA_NAT-1]) { | 873 | if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { |
858 | #ifndef CONFIG_IP_NF_NAT_NEEDED | 874 | #ifndef CONFIG_IP_NF_NAT_NEEDED |
859 | return -EINVAL; | 875 | return -EINVAL; |
860 | #else | 876 | #else |
861 | unsigned int hooknum; | ||
862 | struct ip_nat_range range; | 877 | struct ip_nat_range range; |
863 | 878 | ||
864 | if (ctnetlink_parse_nat(cda, ct, &range) < 0) | 879 | if (cda[CTA_NAT_DST-1]) { |
865 | return -EINVAL; | 880 | if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, |
866 | 881 | &range) < 0) | |
867 | DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", | 882 | return -EINVAL; |
868 | NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), | 883 | if (ip_nat_initialized(ct, |
869 | htons(range.min.all), htons(range.max.all)); | 884 | HOOK2MANIP(NF_IP_PRE_ROUTING))) |
870 | 885 | return -EEXIST; | |
871 | /* This is tricky but it works. ip_nat_setup_info needs the | 886 | ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); |
872 | * hook number as parameter, so let's do the correct | 887 | } |
873 | * conversion and run away */ | 888 | if (cda[CTA_NAT_SRC-1]) { |
874 | if (status & IPS_SRC_NAT_DONE) | 889 | if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, |
875 | hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ | 890 | &range) < 0) |
876 | else if (status & IPS_DST_NAT_DONE) | 891 | return -EINVAL; |
877 | hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ | 892 | if (ip_nat_initialized(ct, |
878 | else | 893 | HOOK2MANIP(NF_IP_POST_ROUTING))) |
879 | return -EINVAL; /* Missing NAT flags */ | 894 | return -EEXIST; |
880 | 895 | ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); | |
881 | DEBUGP("NAT status: %lu\n", | 896 | } |
882 | status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); | ||
883 | |||
884 | if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) | ||
885 | return -EEXIST; | ||
886 | ip_nat_setup_info(ct, &range, hooknum); | ||
887 | |||
888 | DEBUGP("NAT status after setup_info: %lu\n", | ||
889 | ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); | ||
890 | #endif | 897 | #endif |
891 | } | 898 | } |
892 | 899 | ||
@@ -1106,7 +1113,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, | |||
1106 | /* implicit 'else' */ | 1113 | /* implicit 'else' */ |
1107 | 1114 | ||
1108 | /* we only allow nat config for new conntracks */ | 1115 | /* we only allow nat config for new conntracks */ |
1109 | if (cda[CTA_NAT-1]) { | 1116 | if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { |
1110 | err = -EINVAL; | 1117 | err = -EINVAL; |
1111 | goto out_unlock; | 1118 | goto out_unlock; |
1112 | } | 1119 | } |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 56794797d55b..21ee124c0463 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c | |||
@@ -77,10 +77,10 @@ static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km, | |||
77 | } | 77 | } |
78 | 78 | ||
79 | /* look up the source key for a given tuple */ | 79 | /* look up the source key for a given tuple */ |
80 | static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t) | 80 | static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t) |
81 | { | 81 | { |
82 | struct ip_ct_gre_keymap *km; | 82 | struct ip_ct_gre_keymap *km; |
83 | u_int32_t key = 0; | 83 | __be16 key = 0; |
84 | 84 | ||
85 | read_lock_bh(&ip_ct_gre_lock); | 85 | read_lock_bh(&ip_ct_gre_lock); |
86 | km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn, | 86 | km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn, |
@@ -190,7 +190,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb, | |||
190 | struct ip_conntrack_tuple *tuple) | 190 | struct ip_conntrack_tuple *tuple) |
191 | { | 191 | { |
192 | struct gre_hdr_pptp _pgrehdr, *pgrehdr; | 192 | struct gre_hdr_pptp _pgrehdr, *pgrehdr; |
193 | u_int32_t srckey; | 193 | __be16 srckey; |
194 | struct gre_hdr _grehdr, *grehdr; | 194 | struct gre_hdr _grehdr, *grehdr; |
195 | 195 | ||
196 | /* first only delinearize old RFC1701 GRE header */ | 196 | /* first only delinearize old RFC1701 GRE header */ |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index d8b14a9010a6..23f1c504586d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c | |||
@@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
224 | } | 224 | } |
225 | 225 | ||
226 | /* See ip_conntrack_proto_tcp.c */ | 226 | /* See ip_conntrack_proto_tcp.c */ |
227 | if (hooknum == NF_IP_PRE_ROUTING && | 227 | if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && |
228 | nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) { | 228 | nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) { |
229 | if (LOG_INVALID(IPPROTO_ICMP)) | 229 | if (LOG_INVALID(IPPROTO_ICMP)) |
230 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | 230 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 062b252b58ad..c5c2ce5cdeb8 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c | |||
@@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb, | |||
870 | * and moreover root might send raw packets. | 870 | * and moreover root might send raw packets. |
871 | */ | 871 | */ |
872 | /* FIXME: Source route IP option packets --RR */ | 872 | /* FIXME: Source route IP option packets --RR */ |
873 | if (hooknum == NF_IP_PRE_ROUTING && | 873 | if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && |
874 | nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) { | 874 | nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) { |
875 | if (LOG_INVALID(IPPROTO_TCP)) | 875 | if (LOG_INVALID(IPPROTO_TCP)) |
876 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | 876 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 70899868783b..9b2c16b4d2ff 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c | |||
@@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, | |||
120 | * because the semantic of CHECKSUM_HW is different there | 120 | * because the semantic of CHECKSUM_HW is different there |
121 | * and moreover root might send raw packets. | 121 | * and moreover root might send raw packets. |
122 | * FIXME: Source route IP option packets --RR */ | 122 | * FIXME: Source route IP option packets --RR */ |
123 | if (hooknum == NF_IP_PRE_ROUTING && | 123 | if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && |
124 | nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) { | 124 | nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) { |
125 | if (LOG_INVALID(IPPROTO_UDP)) | 125 | if (LOG_INVALID(IPPROTO_UDP)) |
126 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | 126 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c new file mode 100644 index 000000000000..fc87ce0da40d --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_sip.c | |||
@@ -0,0 +1,471 @@ | |||
1 | /* SIP extension for IP connection tracking. | ||
2 | * | ||
3 | * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> | ||
4 | * based on RR's ip_conntrack_ftp.c and other modules. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/config.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/ctype.h> | ||
14 | #include <linux/skbuff.h> | ||
15 | #include <linux/in.h> | ||
16 | #include <linux/ip.h> | ||
17 | #include <linux/udp.h> | ||
18 | |||
19 | #include <linux/netfilter.h> | ||
20 | #include <linux/netfilter_ipv4.h> | ||
21 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> | ||
22 | #include <linux/netfilter_ipv4/ip_conntrack_sip.h> | ||
23 | |||
24 | #if 0 | ||
25 | #define DEBUGP printk | ||
26 | #else | ||
27 | #define DEBUGP(format, args...) | ||
28 | #endif | ||
29 | |||
30 | MODULE_LICENSE("GPL"); | ||
31 | MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); | ||
32 | MODULE_DESCRIPTION("SIP connection tracking helper"); | ||
33 | |||
34 | #define MAX_PORTS 8 | ||
35 | static unsigned short ports[MAX_PORTS]; | ||
36 | static int ports_c; | ||
37 | module_param_array(ports, ushort, &ports_c, 0400); | ||
38 | MODULE_PARM_DESC(ports, "port numbers of sip servers"); | ||
39 | |||
40 | static unsigned int sip_timeout = SIP_TIMEOUT; | ||
41 | module_param(sip_timeout, uint, 0600); | ||
42 | MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session"); | ||
43 | |||
44 | unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb, | ||
45 | enum ip_conntrack_info ctinfo, | ||
46 | struct ip_conntrack *ct, | ||
47 | const char **dptr); | ||
48 | EXPORT_SYMBOL_GPL(ip_nat_sip_hook); | ||
49 | |||
50 | unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb, | ||
51 | enum ip_conntrack_info ctinfo, | ||
52 | struct ip_conntrack_expect *exp, | ||
53 | const char *dptr); | ||
54 | EXPORT_SYMBOL_GPL(ip_nat_sdp_hook); | ||
55 | |||
56 | int ct_sip_get_info(const char *dptr, size_t dlen, | ||
57 | unsigned int *matchoff, | ||
58 | unsigned int *matchlen, | ||
59 | struct sip_header_nfo *hnfo); | ||
60 | EXPORT_SYMBOL_GPL(ct_sip_get_info); | ||
61 | |||
62 | |||
63 | static int digits_len(const char *dptr, const char *limit, int *shift); | ||
64 | static int epaddr_len(const char *dptr, const char *limit, int *shift); | ||
65 | static int skp_digits_len(const char *dptr, const char *limit, int *shift); | ||
66 | static int skp_epaddr_len(const char *dptr, const char *limit, int *shift); | ||
67 | |||
68 | struct sip_header_nfo ct_sip_hdrs[] = { | ||
69 | { /* Via header */ | ||
70 | .lname = "Via:", | ||
71 | .lnlen = sizeof("Via:") - 1, | ||
72 | .sname = "\r\nv:", | ||
73 | .snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */ | ||
74 | .ln_str = "UDP ", | ||
75 | .ln_strlen = sizeof("UDP ") - 1, | ||
76 | .match_len = epaddr_len, | ||
77 | }, | ||
78 | { /* Contact header */ | ||
79 | .lname = "Contact:", | ||
80 | .lnlen = sizeof("Contact:") - 1, | ||
81 | .sname = "\r\nm:", | ||
82 | .snlen = sizeof("\r\nm:") - 1, | ||
83 | .ln_str = "sip:", | ||
84 | .ln_strlen = sizeof("sip:") - 1, | ||
85 | .match_len = skp_epaddr_len | ||
86 | }, | ||
87 | { /* Content length header */ | ||
88 | .lname = "Content-Length:", | ||
89 | .lnlen = sizeof("Content-Length:") - 1, | ||
90 | .sname = "\r\nl:", | ||
91 | .snlen = sizeof("\r\nl:") - 1, | ||
92 | .ln_str = ":", | ||
93 | .ln_strlen = sizeof(":") - 1, | ||
94 | .match_len = skp_digits_len | ||
95 | }, | ||
96 | { /* SDP media info */ | ||
97 | .lname = "\nm=", | ||
98 | .lnlen = sizeof("\nm=") - 1, | ||
99 | .sname = "\rm=", | ||
100 | .snlen = sizeof("\rm=") - 1, | ||
101 | .ln_str = "audio ", | ||
102 | .ln_strlen = sizeof("audio ") - 1, | ||
103 | .match_len = digits_len | ||
104 | }, | ||
105 | { /* SDP owner address*/ | ||
106 | .lname = "\no=", | ||
107 | .lnlen = sizeof("\no=") - 1, | ||
108 | .sname = "\ro=", | ||
109 | .snlen = sizeof("\ro=") - 1, | ||
110 | .ln_str = "IN IP4 ", | ||
111 | .ln_strlen = sizeof("IN IP4 ") - 1, | ||
112 | .match_len = epaddr_len | ||
113 | }, | ||
114 | { /* SDP connection info */ | ||
115 | .lname = "\nc=", | ||
116 | .lnlen = sizeof("\nc=") - 1, | ||
117 | .sname = "\rc=", | ||
118 | .snlen = sizeof("\rc=") - 1, | ||
119 | .ln_str = "IN IP4 ", | ||
120 | .ln_strlen = sizeof("IN IP4 ") - 1, | ||
121 | .match_len = epaddr_len | ||
122 | }, | ||
123 | { /* Requests headers */ | ||
124 | .lname = "sip:", | ||
125 | .lnlen = sizeof("sip:") - 1, | ||
126 | .sname = "sip:", | ||
127 | .snlen = sizeof("sip:") - 1, /* yes, i know.. ;) */ | ||
128 | .ln_str = "@", | ||
129 | .ln_strlen = sizeof("@") - 1, | ||
130 | .match_len = epaddr_len | ||
131 | }, | ||
132 | { /* SDP version header */ | ||
133 | .lname = "\nv=", | ||
134 | .lnlen = sizeof("\nv=") - 1, | ||
135 | .sname = "\rv=", | ||
136 | .snlen = sizeof("\rv=") - 1, | ||
137 | .ln_str = "=", | ||
138 | .ln_strlen = sizeof("=") - 1, | ||
139 | .match_len = digits_len | ||
140 | } | ||
141 | }; | ||
142 | EXPORT_SYMBOL_GPL(ct_sip_hdrs); | ||
143 | |||
144 | /* get line lenght until first CR or LF seen. */ | ||
145 | int ct_sip_lnlen(const char *line, const char *limit) | ||
146 | { | ||
147 | const char *k = line; | ||
148 | |||
149 | while ((line <= limit) && (*line == '\r' || *line == '\n')) | ||
150 | line++; | ||
151 | |||
152 | while (line <= limit) { | ||
153 | if (*line == '\r' || *line == '\n') | ||
154 | break; | ||
155 | line++; | ||
156 | } | ||
157 | return line - k; | ||
158 | } | ||
159 | EXPORT_SYMBOL_GPL(ct_sip_lnlen); | ||
160 | |||
161 | /* Linear string search, case sensitive. */ | ||
162 | const char *ct_sip_search(const char *needle, const char *haystack, | ||
163 | size_t needle_len, size_t haystack_len) | ||
164 | { | ||
165 | const char *limit = haystack + (haystack_len - needle_len); | ||
166 | |||
167 | while (haystack <= limit) { | ||
168 | if (memcmp(haystack, needle, needle_len) == 0) | ||
169 | return haystack; | ||
170 | haystack++; | ||
171 | } | ||
172 | return NULL; | ||
173 | } | ||
174 | EXPORT_SYMBOL_GPL(ct_sip_search); | ||
175 | |||
176 | static int digits_len(const char *dptr, const char *limit, int *shift) | ||
177 | { | ||
178 | int len = 0; | ||
179 | while (dptr <= limit && isdigit(*dptr)) { | ||
180 | dptr++; | ||
181 | len++; | ||
182 | } | ||
183 | return len; | ||
184 | } | ||
185 | |||
186 | /* get digits lenght, skiping blank spaces. */ | ||
187 | static int skp_digits_len(const char *dptr, const char *limit, int *shift) | ||
188 | { | ||
189 | for (; dptr <= limit && *dptr == ' '; dptr++) | ||
190 | (*shift)++; | ||
191 | |||
192 | return digits_len(dptr, limit, shift); | ||
193 | } | ||
194 | |||
195 | /* Simple ipaddr parser.. */ | ||
196 | static int parse_ipaddr(const char *cp, const char **endp, | ||
197 | u_int32_t *ipaddr, const char *limit) | ||
198 | { | ||
199 | unsigned long int val; | ||
200 | int i, digit = 0; | ||
201 | |||
202 | for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) { | ||
203 | digit = 0; | ||
204 | if (!isdigit(*cp)) | ||
205 | break; | ||
206 | |||
207 | val = simple_strtoul(cp, (char **)&cp, 10); | ||
208 | if (val > 0xFF) | ||
209 | return -1; | ||
210 | |||
211 | ((u_int8_t *)ipaddr)[i] = val; | ||
212 | digit = 1; | ||
213 | |||
214 | if (*cp != '.') | ||
215 | break; | ||
216 | cp++; | ||
217 | } | ||
218 | if (!digit) | ||
219 | return -1; | ||
220 | |||
221 | if (endp) | ||
222 | *endp = cp; | ||
223 | |||
224 | return 0; | ||
225 | } | ||
226 | |||
227 | /* skip ip address. returns it lenght. */ | ||
228 | static int epaddr_len(const char *dptr, const char *limit, int *shift) | ||
229 | { | ||
230 | const char *aux = dptr; | ||
231 | u_int32_t ip; | ||
232 | |||
233 | if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) { | ||
234 | DEBUGP("ip: %s parse failed.!\n", dptr); | ||
235 | return 0; | ||
236 | } | ||
237 | |||
238 | /* Port number */ | ||
239 | if (*dptr == ':') { | ||
240 | dptr++; | ||
241 | dptr += digits_len(dptr, limit, shift); | ||
242 | } | ||
243 | return dptr - aux; | ||
244 | } | ||
245 | |||
246 | /* get address length, skiping user info. */ | ||
247 | static int skp_epaddr_len(const char *dptr, const char *limit, int *shift) | ||
248 | { | ||
249 | int s = *shift; | ||
250 | |||
251 | for (; dptr <= limit && *dptr != '@'; dptr++) | ||
252 | (*shift)++; | ||
253 | |||
254 | if (*dptr == '@') { | ||
255 | dptr++; | ||
256 | (*shift)++; | ||
257 | } else | ||
258 | *shift = s; | ||
259 | |||
260 | return epaddr_len(dptr, limit, shift); | ||
261 | } | ||
262 | |||
263 | /* Returns 0 if not found, -1 error parsing. */ | ||
264 | int ct_sip_get_info(const char *dptr, size_t dlen, | ||
265 | unsigned int *matchoff, | ||
266 | unsigned int *matchlen, | ||
267 | struct sip_header_nfo *hnfo) | ||
268 | { | ||
269 | const char *limit, *aux, *k = dptr; | ||
270 | int shift = 0; | ||
271 | |||
272 | limit = dptr + (dlen - hnfo->lnlen); | ||
273 | |||
274 | while (dptr <= limit) { | ||
275 | if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) && | ||
276 | (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) { | ||
277 | dptr++; | ||
278 | continue; | ||
279 | } | ||
280 | aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen, | ||
281 | ct_sip_lnlen(dptr, limit)); | ||
282 | if (!aux) { | ||
283 | DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str, | ||
284 | hnfo->lname); | ||
285 | return -1; | ||
286 | } | ||
287 | aux += hnfo->ln_strlen; | ||
288 | |||
289 | *matchlen = hnfo->match_len(aux, limit, &shift); | ||
290 | if (!*matchlen) | ||
291 | return -1; | ||
292 | |||
293 | *matchoff = (aux - k) + shift; | ||
294 | |||
295 | DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname, | ||
296 | *matchlen); | ||
297 | return 1; | ||
298 | } | ||
299 | DEBUGP("%s header not found.\n", hnfo->lname); | ||
300 | return 0; | ||
301 | } | ||
302 | |||
303 | static int set_expected_rtp(struct sk_buff **pskb, | ||
304 | struct ip_conntrack *ct, | ||
305 | enum ip_conntrack_info ctinfo, | ||
306 | u_int32_t ipaddr, u_int16_t port, | ||
307 | const char *dptr) | ||
308 | { | ||
309 | struct ip_conntrack_expect *exp; | ||
310 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
311 | int ret; | ||
312 | |||
313 | exp = ip_conntrack_expect_alloc(ct); | ||
314 | if (exp == NULL) | ||
315 | return NF_DROP; | ||
316 | |||
317 | exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; | ||
318 | exp->tuple.src.u.udp.port = 0; | ||
319 | exp->tuple.dst.ip = ipaddr; | ||
320 | exp->tuple.dst.u.udp.port = htons(port); | ||
321 | exp->tuple.dst.protonum = IPPROTO_UDP; | ||
322 | |||
323 | exp->mask.src.ip = 0xFFFFFFFF; | ||
324 | exp->mask.src.u.udp.port = 0; | ||
325 | exp->mask.dst.ip = 0xFFFFFFFF; | ||
326 | exp->mask.dst.u.udp.port = 0xFFFF; | ||
327 | exp->mask.dst.protonum = 0xFF; | ||
328 | |||
329 | exp->expectfn = NULL; | ||
330 | exp->flags = 0; | ||
331 | |||
332 | if (ip_nat_sdp_hook) | ||
333 | ret = ip_nat_sdp_hook(pskb, ctinfo, exp, dptr); | ||
334 | else { | ||
335 | if (ip_conntrack_expect_related(exp) != 0) | ||
336 | ret = NF_DROP; | ||
337 | else | ||
338 | ret = NF_ACCEPT; | ||
339 | } | ||
340 | ip_conntrack_expect_put(exp); | ||
341 | |||
342 | return ret; | ||
343 | } | ||
344 | |||
345 | static int sip_help(struct sk_buff **pskb, | ||
346 | struct ip_conntrack *ct, | ||
347 | enum ip_conntrack_info ctinfo) | ||
348 | { | ||
349 | unsigned int dataoff, datalen; | ||
350 | const char *dptr; | ||
351 | int ret = NF_ACCEPT; | ||
352 | int matchoff, matchlen; | ||
353 | u_int32_t ipaddr; | ||
354 | u_int16_t port; | ||
355 | |||
356 | /* No Data ? */ | ||
357 | dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); | ||
358 | if (dataoff >= (*pskb)->len) { | ||
359 | DEBUGP("skb->len = %u\n", (*pskb)->len); | ||
360 | return NF_ACCEPT; | ||
361 | } | ||
362 | |||
363 | ip_ct_refresh(ct, *pskb, sip_timeout * HZ); | ||
364 | |||
365 | if (!skb_is_nonlinear(*pskb)) | ||
366 | dptr = (*pskb)->data + dataoff; | ||
367 | else { | ||
368 | DEBUGP("Copy of skbuff not supported yet.\n"); | ||
369 | goto out; | ||
370 | } | ||
371 | |||
372 | if (ip_nat_sip_hook) { | ||
373 | if (!ip_nat_sip_hook(pskb, ctinfo, ct, &dptr)) { | ||
374 | ret = NF_DROP; | ||
375 | goto out; | ||
376 | } | ||
377 | } | ||
378 | |||
379 | /* After this point NAT, could have mangled skb, so | ||
380 | we need to recalculate payload lenght. */ | ||
381 | datalen = (*pskb)->len - dataoff; | ||
382 | |||
383 | if (datalen < (sizeof("SIP/2.0 200") - 1)) | ||
384 | goto out; | ||
385 | |||
386 | /* RTP info only in some SDP pkts */ | ||
387 | if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 && | ||
388 | memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) { | ||
389 | goto out; | ||
390 | } | ||
391 | /* Get ip and port address from SDP packet. */ | ||
392 | if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, | ||
393 | &ct_sip_hdrs[POS_CONNECTION]) > 0) { | ||
394 | |||
395 | /* We'll drop only if there are parse problems. */ | ||
396 | if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr, | ||
397 | dptr + datalen) < 0) { | ||
398 | ret = NF_DROP; | ||
399 | goto out; | ||
400 | } | ||
401 | if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, | ||
402 | &ct_sip_hdrs[POS_MEDIA]) > 0) { | ||
403 | |||
404 | port = simple_strtoul(dptr + matchoff, NULL, 10); | ||
405 | if (port < 1024) { | ||
406 | ret = NF_DROP; | ||
407 | goto out; | ||
408 | } | ||
409 | ret = set_expected_rtp(pskb, ct, ctinfo, | ||
410 | ipaddr, port, dptr); | ||
411 | } | ||
412 | } | ||
413 | out: | ||
414 | return ret; | ||
415 | } | ||
416 | |||
417 | static struct ip_conntrack_helper sip[MAX_PORTS]; | ||
418 | static char sip_names[MAX_PORTS][10]; | ||
419 | |||
420 | static void fini(void) | ||
421 | { | ||
422 | int i; | ||
423 | for (i = 0; i < ports_c; i++) { | ||
424 | DEBUGP("unregistering helper for port %d\n", ports[i]); | ||
425 | ip_conntrack_helper_unregister(&sip[i]); | ||
426 | } | ||
427 | } | ||
428 | |||
429 | static int __init init(void) | ||
430 | { | ||
431 | int i, ret; | ||
432 | char *tmpname; | ||
433 | |||
434 | if (ports_c == 0) | ||
435 | ports[ports_c++] = SIP_PORT; | ||
436 | |||
437 | for (i = 0; i < ports_c; i++) { | ||
438 | /* Create helper structure */ | ||
439 | memset(&sip[i], 0, sizeof(struct ip_conntrack_helper)); | ||
440 | |||
441 | sip[i].tuple.dst.protonum = IPPROTO_UDP; | ||
442 | sip[i].tuple.src.u.udp.port = htons(ports[i]); | ||
443 | sip[i].mask.src.u.udp.port = 0xFFFF; | ||
444 | sip[i].mask.dst.protonum = 0xFF; | ||
445 | sip[i].max_expected = 1; | ||
446 | sip[i].timeout = 3 * 60; /* 3 minutes */ | ||
447 | sip[i].me = THIS_MODULE; | ||
448 | sip[i].help = sip_help; | ||
449 | |||
450 | tmpname = &sip_names[i][0]; | ||
451 | if (ports[i] == SIP_PORT) | ||
452 | sprintf(tmpname, "sip"); | ||
453 | else | ||
454 | sprintf(tmpname, "sip-%d", i); | ||
455 | sip[i].name = tmpname; | ||
456 | |||
457 | DEBUGP("port #%d: %d\n", i, ports[i]); | ||
458 | |||
459 | ret = ip_conntrack_helper_register(&sip[i]); | ||
460 | if (ret) { | ||
461 | printk("ERROR registering helper for port %d\n", | ||
462 | ports[i]); | ||
463 | fini(); | ||
464 | return ret; | ||
465 | } | ||
466 | } | ||
467 | return 0; | ||
468 | } | ||
469 | |||
470 | module_init(init); | ||
471 | module_exit(fini); | ||
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 929d61f7be91..88445aac3f28 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c | |||
@@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v) | |||
189 | return -ENOSPC; | 189 | return -ENOSPC; |
190 | #endif | 190 | #endif |
191 | 191 | ||
192 | #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK | ||
193 | if (seq_printf(s, "secmark=%u ", conntrack->secmark)) | ||
194 | return -ENOSPC; | ||
195 | #endif | ||
196 | |||
192 | if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) | 197 | if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) |
193 | return -ENOSPC; | 198 | return -ENOSPC; |
194 | 199 | ||
@@ -417,7 +422,7 @@ static unsigned int ip_conntrack_help(unsigned int hooknum, | |||
417 | 422 | ||
418 | /* This is where we call the helper: as the packet goes out. */ | 423 | /* This is where we call the helper: as the packet goes out. */ |
419 | ct = ip_conntrack_get(*pskb, &ctinfo); | 424 | ct = ip_conntrack_get(*pskb, &ctinfo); |
420 | if (ct && ct->helper) { | 425 | if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) { |
421 | unsigned int ret; | 426 | unsigned int ret; |
422 | ret = ct->helper->help(pskb, ct, ctinfo); | 427 | ret = ct->helper->help(pskb, ct, ctinfo); |
423 | if (ret != NF_ACCEPT) | 428 | if (ret != NF_ACCEPT) |
@@ -564,6 +569,8 @@ extern unsigned int ip_ct_generic_timeout; | |||
564 | static int log_invalid_proto_min = 0; | 569 | static int log_invalid_proto_min = 0; |
565 | static int log_invalid_proto_max = 255; | 570 | static int log_invalid_proto_max = 255; |
566 | 571 | ||
572 | int ip_conntrack_checksum = 1; | ||
573 | |||
567 | static struct ctl_table_header *ip_ct_sysctl_header; | 574 | static struct ctl_table_header *ip_ct_sysctl_header; |
568 | 575 | ||
569 | static ctl_table ip_ct_sysctl_table[] = { | 576 | static ctl_table ip_ct_sysctl_table[] = { |
@@ -592,6 +599,14 @@ static ctl_table ip_ct_sysctl_table[] = { | |||
592 | .proc_handler = &proc_dointvec, | 599 | .proc_handler = &proc_dointvec, |
593 | }, | 600 | }, |
594 | { | 601 | { |
602 | .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, | ||
603 | .procname = "ip_conntrack_checksum", | ||
604 | .data = &ip_conntrack_checksum, | ||
605 | .maxlen = sizeof(int), | ||
606 | .mode = 0644, | ||
607 | .proc_handler = &proc_dointvec, | ||
608 | }, | ||
609 | { | ||
595 | .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, | 610 | .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, |
596 | .procname = "ip_conntrack_tcp_timeout_syn_sent", | 611 | .procname = "ip_conntrack_tcp_timeout_syn_sent", |
597 | .data = &ip_ct_tcp_timeout_syn_sent, | 612 | .data = &ip_ct_tcp_timeout_syn_sent, |
@@ -946,6 +961,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); | |||
946 | EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); | 961 | EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); |
947 | EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); | 962 | EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); |
948 | EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); | 963 | EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); |
964 | EXPORT_SYMBOL_GPL(ip_conntrack_checksum); | ||
949 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ | 965 | #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ |
950 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) | 966 | defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) |
951 | EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); | 967 | EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); |
diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c index d45663d137a7..419b878fb467 100644 --- a/net/ipv4/netfilter/ip_nat_helper_h323.c +++ b/net/ipv4/netfilter/ip_nat_helper_h323.c | |||
@@ -487,6 +487,80 @@ static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct, | |||
487 | } | 487 | } |
488 | 488 | ||
489 | /****************************************************************************/ | 489 | /****************************************************************************/ |
490 | static void ip_nat_callforwarding_expect(struct ip_conntrack *new, | ||
491 | struct ip_conntrack_expect *this) | ||
492 | { | ||
493 | struct ip_nat_range range; | ||
494 | |||
495 | /* This must be a fresh one. */ | ||
496 | BUG_ON(new->status & IPS_NAT_DONE_MASK); | ||
497 | |||
498 | /* Change src to where master sends to */ | ||
499 | range.flags = IP_NAT_RANGE_MAP_IPS; | ||
500 | range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip; | ||
501 | |||
502 | /* hook doesn't matter, but it has to do source manip */ | ||
503 | ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING); | ||
504 | |||
505 | /* For DST manip, map port here to where it's expected. */ | ||
506 | range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); | ||
507 | range.min = range.max = this->saved_proto; | ||
508 | range.min_ip = range.max_ip = this->saved_ip; | ||
509 | |||
510 | /* hook doesn't matter, but it has to do destination manip */ | ||
511 | ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); | ||
512 | |||
513 | ip_conntrack_q931_expect(new, this); | ||
514 | } | ||
515 | |||
516 | /****************************************************************************/ | ||
517 | static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct, | ||
518 | enum ip_conntrack_info ctinfo, | ||
519 | unsigned char **data, int dataoff, | ||
520 | TransportAddress * addr, u_int16_t port, | ||
521 | struct ip_conntrack_expect *exp) | ||
522 | { | ||
523 | int dir = CTINFO2DIR(ctinfo); | ||
524 | u_int16_t nated_port; | ||
525 | |||
526 | /* Set expectations for NAT */ | ||
527 | exp->saved_ip = exp->tuple.dst.ip; | ||
528 | exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; | ||
529 | exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; | ||
530 | exp->expectfn = ip_nat_callforwarding_expect; | ||
531 | exp->dir = !dir; | ||
532 | |||
533 | /* Try to get same port: if not, try to change it. */ | ||
534 | for (nated_port = port; nated_port != 0; nated_port++) { | ||
535 | exp->tuple.dst.u.tcp.port = htons(nated_port); | ||
536 | if (ip_conntrack_expect_related(exp) == 0) | ||
537 | break; | ||
538 | } | ||
539 | |||
540 | if (nated_port == 0) { /* No port available */ | ||
541 | if (net_ratelimit()) | ||
542 | printk("ip_nat_q931: out of TCP ports\n"); | ||
543 | return 0; | ||
544 | } | ||
545 | |||
546 | /* Modify signal */ | ||
547 | if (!set_h225_addr(pskb, data, dataoff, addr, | ||
548 | ct->tuplehash[!dir].tuple.dst.ip, | ||
549 | nated_port) == 0) { | ||
550 | ip_conntrack_unexpect_related(exp); | ||
551 | return -1; | ||
552 | } | ||
553 | |||
554 | /* Success */ | ||
555 | DEBUGP("ip_nat_q931: expect Call Forwarding " | ||
556 | "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", | ||
557 | NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), | ||
558 | NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); | ||
559 | |||
560 | return 0; | ||
561 | } | ||
562 | |||
563 | /****************************************************************************/ | ||
490 | static int __init init(void) | 564 | static int __init init(void) |
491 | { | 565 | { |
492 | BUG_ON(set_h245_addr_hook != NULL); | 566 | BUG_ON(set_h245_addr_hook != NULL); |
@@ -496,6 +570,7 @@ static int __init init(void) | |||
496 | BUG_ON(nat_rtp_rtcp_hook != NULL); | 570 | BUG_ON(nat_rtp_rtcp_hook != NULL); |
497 | BUG_ON(nat_t120_hook != NULL); | 571 | BUG_ON(nat_t120_hook != NULL); |
498 | BUG_ON(nat_h245_hook != NULL); | 572 | BUG_ON(nat_h245_hook != NULL); |
573 | BUG_ON(nat_callforwarding_hook != NULL); | ||
499 | BUG_ON(nat_q931_hook != NULL); | 574 | BUG_ON(nat_q931_hook != NULL); |
500 | 575 | ||
501 | set_h245_addr_hook = set_h245_addr; | 576 | set_h245_addr_hook = set_h245_addr; |
@@ -505,6 +580,7 @@ static int __init init(void) | |||
505 | nat_rtp_rtcp_hook = nat_rtp_rtcp; | 580 | nat_rtp_rtcp_hook = nat_rtp_rtcp; |
506 | nat_t120_hook = nat_t120; | 581 | nat_t120_hook = nat_t120; |
507 | nat_h245_hook = nat_h245; | 582 | nat_h245_hook = nat_h245; |
583 | nat_callforwarding_hook = nat_callforwarding; | ||
508 | nat_q931_hook = nat_q931; | 584 | nat_q931_hook = nat_q931; |
509 | 585 | ||
510 | DEBUGP("ip_nat_h323: init success\n"); | 586 | DEBUGP("ip_nat_h323: init success\n"); |
@@ -521,6 +597,7 @@ static void __exit fini(void) | |||
521 | nat_rtp_rtcp_hook = NULL; | 597 | nat_rtp_rtcp_hook = NULL; |
522 | nat_t120_hook = NULL; | 598 | nat_t120_hook = NULL; |
523 | nat_h245_hook = NULL; | 599 | nat_h245_hook = NULL; |
600 | nat_callforwarding_hook = NULL; | ||
524 | nat_q931_hook = NULL; | 601 | nat_q931_hook = NULL; |
525 | synchronize_net(); | 602 | synchronize_net(); |
526 | } | 603 | } |
diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c new file mode 100644 index 000000000000..6ffba63adca2 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_sip.c | |||
@@ -0,0 +1,249 @@ | |||
1 | /* SIP extension for UDP NAT alteration. | ||
2 | * | ||
3 | * (C) 2005 by Christian Hentschel <chentschel@arnet.com.ar> | ||
4 | * based on RR's ip_nat_ftp.c and other modules. | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | */ | ||
10 | |||
11 | #include <linux/module.h> | ||
12 | #include <linux/skbuff.h> | ||
13 | #include <linux/ip.h> | ||
14 | #include <linux/udp.h> | ||
15 | |||
16 | #include <linux/netfilter_ipv4.h> | ||
17 | #include <linux/netfilter_ipv4/ip_nat.h> | ||
18 | #include <linux/netfilter_ipv4/ip_nat_helper.h> | ||
19 | #include <linux/netfilter_ipv4/ip_conntrack_helper.h> | ||
20 | #include <linux/netfilter_ipv4/ip_conntrack_sip.h> | ||
21 | |||
22 | MODULE_LICENSE("GPL"); | ||
23 | MODULE_AUTHOR("Christian Hentschel <chentschel@arnet.com.ar>"); | ||
24 | MODULE_DESCRIPTION("SIP NAT helper"); | ||
25 | |||
26 | #if 0 | ||
27 | #define DEBUGP printk | ||
28 | #else | ||
29 | #define DEBUGP(format, args...) | ||
30 | #endif | ||
31 | |||
32 | extern struct sip_header_nfo ct_sip_hdrs[]; | ||
33 | |||
34 | static unsigned int mangle_sip_packet(struct sk_buff **pskb, | ||
35 | enum ip_conntrack_info ctinfo, | ||
36 | struct ip_conntrack *ct, | ||
37 | const char **dptr, size_t dlen, | ||
38 | char *buffer, int bufflen, | ||
39 | struct sip_header_nfo *hnfo) | ||
40 | { | ||
41 | unsigned int matchlen, matchoff; | ||
42 | |||
43 | if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, hnfo) <= 0) | ||
44 | return 0; | ||
45 | |||
46 | if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo, | ||
47 | matchoff, matchlen, buffer, bufflen)) | ||
48 | return 0; | ||
49 | |||
50 | /* We need to reload this. Thanks Patrick. */ | ||
51 | *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); | ||
52 | return 1; | ||
53 | } | ||
54 | |||
55 | static unsigned int ip_nat_sip(struct sk_buff **pskb, | ||
56 | enum ip_conntrack_info ctinfo, | ||
57 | struct ip_conntrack *ct, | ||
58 | const char **dptr) | ||
59 | { | ||
60 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
61 | char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; | ||
62 | unsigned int bufflen, dataoff; | ||
63 | u_int32_t ip; | ||
64 | u_int16_t port; | ||
65 | |||
66 | dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); | ||
67 | |||
68 | ip = ct->tuplehash[!dir].tuple.dst.ip; | ||
69 | port = ct->tuplehash[!dir].tuple.dst.u.udp.port; | ||
70 | bufflen = sprintf(buffer, "%u.%u.%u.%u:%u", NIPQUAD(ip), ntohs(port)); | ||
71 | |||
72 | /* short packet ? */ | ||
73 | if (((*pskb)->len - dataoff) < (sizeof("SIP/2.0") - 1)) | ||
74 | return 0; | ||
75 | |||
76 | /* Basic rules: requests and responses. */ | ||
77 | if (memcmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) == 0) { | ||
78 | const char *aux; | ||
79 | |||
80 | if ((ctinfo) < IP_CT_IS_REPLY) { | ||
81 | mangle_sip_packet(pskb, ctinfo, ct, dptr, | ||
82 | (*pskb)->len - dataoff, | ||
83 | buffer, bufflen, | ||
84 | &ct_sip_hdrs[POS_CONTACT]); | ||
85 | return 1; | ||
86 | } | ||
87 | |||
88 | if (!mangle_sip_packet(pskb, ctinfo, ct, dptr, | ||
89 | (*pskb)->len - dataoff, | ||
90 | buffer, bufflen, &ct_sip_hdrs[POS_VIA])) | ||
91 | return 0; | ||
92 | |||
93 | /* This search should ignore case, but later.. */ | ||
94 | aux = ct_sip_search("CSeq:", *dptr, sizeof("CSeq:") - 1, | ||
95 | (*pskb)->len - dataoff); | ||
96 | if (!aux) | ||
97 | return 0; | ||
98 | |||
99 | if (!ct_sip_search("REGISTER", aux, sizeof("REGISTER"), | ||
100 | ct_sip_lnlen(aux, *dptr + (*pskb)->len - dataoff))) | ||
101 | return 1; | ||
102 | |||
103 | return mangle_sip_packet(pskb, ctinfo, ct, dptr, | ||
104 | (*pskb)->len - dataoff, | ||
105 | buffer, bufflen, | ||
106 | &ct_sip_hdrs[POS_CONTACT]); | ||
107 | } | ||
108 | if ((ctinfo) < IP_CT_IS_REPLY) { | ||
109 | if (!mangle_sip_packet(pskb, ctinfo, ct, dptr, | ||
110 | (*pskb)->len - dataoff, | ||
111 | buffer, bufflen, &ct_sip_hdrs[POS_VIA])) | ||
112 | return 0; | ||
113 | |||
114 | /* Mangle Contact if exists only. - watch udp_nat_mangle()! */ | ||
115 | mangle_sip_packet(pskb, ctinfo, ct, dptr, (*pskb)->len - dataoff, | ||
116 | buffer, bufflen, &ct_sip_hdrs[POS_CONTACT]); | ||
117 | return 1; | ||
118 | } | ||
119 | /* This mangle requests headers. */ | ||
120 | return mangle_sip_packet(pskb, ctinfo, ct, dptr, | ||
121 | ct_sip_lnlen(*dptr, | ||
122 | *dptr + (*pskb)->len - dataoff), | ||
123 | buffer, bufflen, &ct_sip_hdrs[POS_REQ_HEADER]); | ||
124 | } | ||
125 | |||
126 | static int mangle_content_len(struct sk_buff **pskb, | ||
127 | enum ip_conntrack_info ctinfo, | ||
128 | struct ip_conntrack *ct, | ||
129 | const char *dptr) | ||
130 | { | ||
131 | unsigned int dataoff, matchoff, matchlen; | ||
132 | char buffer[sizeof("65536")]; | ||
133 | int bufflen; | ||
134 | |||
135 | dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); | ||
136 | |||
137 | /* Get actual SDP lenght */ | ||
138 | if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, | ||
139 | &matchlen, &ct_sip_hdrs[POS_SDP_HEADER]) > 0) { | ||
140 | |||
141 | /* since ct_sip_get_info() give us a pointer passing 'v=' | ||
142 | we need to add 2 bytes in this count. */ | ||
143 | int c_len = (*pskb)->len - dataoff - matchoff + 2; | ||
144 | |||
145 | /* Now, update SDP lenght */ | ||
146 | if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, | ||
147 | &matchlen, &ct_sip_hdrs[POS_CONTENT]) > 0) { | ||
148 | |||
149 | bufflen = sprintf(buffer, "%u", c_len); | ||
150 | |||
151 | return ip_nat_mangle_udp_packet(pskb, ct, ctinfo, | ||
152 | matchoff, matchlen, | ||
153 | buffer, bufflen); | ||
154 | } | ||
155 | } | ||
156 | return 0; | ||
157 | } | ||
158 | |||
159 | static unsigned int mangle_sdp(struct sk_buff **pskb, | ||
160 | enum ip_conntrack_info ctinfo, | ||
161 | struct ip_conntrack *ct, | ||
162 | u_int32_t newip, u_int16_t port, | ||
163 | const char *dptr) | ||
164 | { | ||
165 | char buffer[sizeof("nnn.nnn.nnn.nnn")]; | ||
166 | unsigned int dataoff, bufflen; | ||
167 | |||
168 | dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); | ||
169 | |||
170 | /* Mangle owner and contact info. */ | ||
171 | bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip)); | ||
172 | if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, | ||
173 | buffer, bufflen, &ct_sip_hdrs[POS_OWNER])) | ||
174 | return 0; | ||
175 | |||
176 | if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, | ||
177 | buffer, bufflen, &ct_sip_hdrs[POS_CONNECTION])) | ||
178 | return 0; | ||
179 | |||
180 | /* Mangle media port. */ | ||
181 | bufflen = sprintf(buffer, "%u", port); | ||
182 | if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, | ||
183 | buffer, bufflen, &ct_sip_hdrs[POS_MEDIA])) | ||
184 | return 0; | ||
185 | |||
186 | return mangle_content_len(pskb, ctinfo, ct, dptr); | ||
187 | } | ||
188 | |||
189 | /* So, this packet has hit the connection tracking matching code. | ||
190 | Mangle it, and change the expectation to match the new version. */ | ||
191 | static unsigned int ip_nat_sdp(struct sk_buff **pskb, | ||
192 | enum ip_conntrack_info ctinfo, | ||
193 | struct ip_conntrack_expect *exp, | ||
194 | const char *dptr) | ||
195 | { | ||
196 | struct ip_conntrack *ct = exp->master; | ||
197 | enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); | ||
198 | u_int32_t newip; | ||
199 | u_int16_t port; | ||
200 | |||
201 | DEBUGP("ip_nat_sdp():\n"); | ||
202 | |||
203 | /* Connection will come from reply */ | ||
204 | newip = ct->tuplehash[!dir].tuple.dst.ip; | ||
205 | |||
206 | exp->tuple.dst.ip = newip; | ||
207 | exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; | ||
208 | exp->dir = !dir; | ||
209 | |||
210 | /* When you see the packet, we need to NAT it the same as the | ||
211 | this one. */ | ||
212 | exp->expectfn = ip_nat_follow_master; | ||
213 | |||
214 | /* Try to get same port: if not, try to change it. */ | ||
215 | for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) { | ||
216 | exp->tuple.dst.u.udp.port = htons(port); | ||
217 | if (ip_conntrack_expect_related(exp) == 0) | ||
218 | break; | ||
219 | } | ||
220 | |||
221 | if (port == 0) | ||
222 | return NF_DROP; | ||
223 | |||
224 | if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) { | ||
225 | ip_conntrack_unexpect_related(exp); | ||
226 | return NF_DROP; | ||
227 | } | ||
228 | return NF_ACCEPT; | ||
229 | } | ||
230 | |||
231 | static void __exit fini(void) | ||
232 | { | ||
233 | ip_nat_sip_hook = NULL; | ||
234 | ip_nat_sdp_hook = NULL; | ||
235 | /* Make sure noone calls it, meanwhile. */ | ||
236 | synchronize_net(); | ||
237 | } | ||
238 | |||
239 | static int __init init(void) | ||
240 | { | ||
241 | BUG_ON(ip_nat_sip_hook); | ||
242 | BUG_ON(ip_nat_sdp_hook); | ||
243 | ip_nat_sip_hook = ip_nat_sip; | ||
244 | ip_nat_sdp_hook = ip_nat_sdp; | ||
245 | return 0; | ||
246 | } | ||
247 | |||
248 | module_init(init); | ||
249 | module_exit(fini); | ||
diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index c33244263b90..d20d557f915a 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c | |||
@@ -1348,4 +1348,4 @@ static void __exit ip_nat_snmp_basic_fini(void) | |||
1348 | module_init(ip_nat_snmp_basic_init); | 1348 | module_init(ip_nat_snmp_basic_init); |
1349 | module_exit(ip_nat_snmp_basic_fini); | 1349 | module_exit(ip_nat_snmp_basic_fini); |
1350 | 1350 | ||
1351 | module_param(debug, bool, 0600); | 1351 | module_param(debug, int, 0600); |
diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index aad9d28c8d71..dbc83c5d7aa6 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c | |||
@@ -241,25 +241,17 @@ clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config) | |||
241 | struct iphdr *iph = skb->nh.iph; | 241 | struct iphdr *iph = skb->nh.iph; |
242 | unsigned long hashval; | 242 | unsigned long hashval; |
243 | u_int16_t sport, dport; | 243 | u_int16_t sport, dport; |
244 | struct tcphdr *th; | 244 | u_int16_t *ports; |
245 | struct udphdr *uh; | ||
246 | struct icmphdr *ih; | ||
247 | 245 | ||
248 | switch (iph->protocol) { | 246 | switch (iph->protocol) { |
249 | case IPPROTO_TCP: | 247 | case IPPROTO_TCP: |
250 | th = (void *)iph+iph->ihl*4; | ||
251 | sport = ntohs(th->source); | ||
252 | dport = ntohs(th->dest); | ||
253 | break; | ||
254 | case IPPROTO_UDP: | 248 | case IPPROTO_UDP: |
255 | uh = (void *)iph+iph->ihl*4; | 249 | case IPPROTO_SCTP: |
256 | sport = ntohs(uh->source); | 250 | case IPPROTO_DCCP: |
257 | dport = ntohs(uh->dest); | ||
258 | break; | ||
259 | case IPPROTO_ICMP: | 251 | case IPPROTO_ICMP: |
260 | ih = (void *)iph+iph->ihl*4; | 252 | ports = (void *)iph+iph->ihl*4; |
261 | sport = ntohs(ih->un.echo.id); | 253 | sport = ports[0]; |
262 | dport = (ih->type<<8)|ih->code; | 254 | dport = ports[1]; |
263 | break; | 255 | break; |
264 | default: | 256 | default: |
265 | if (net_ratelimit()) { | 257 | if (net_ratelimit()) { |
diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 0bba3c2bb786..431a3ce6f7b7 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c | |||
@@ -147,6 +147,7 @@ static void send_reset(struct sk_buff *oldskb, int hook) | |||
147 | /* This packet will not be the same as the other: clear nf fields */ | 147 | /* This packet will not be the same as the other: clear nf fields */ |
148 | nf_reset(nskb); | 148 | nf_reset(nskb); |
149 | nskb->nfmark = 0; | 149 | nskb->nfmark = 0; |
150 | skb_init_secmark(nskb); | ||
150 | 151 | ||
151 | tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); | 152 | tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); |
152 | 153 | ||
diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 7c6836c4646e..92980ab8ce48 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c | |||
@@ -28,9 +28,6 @@ | |||
28 | #include <linux/jhash.h> | 28 | #include <linux/jhash.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/vmalloc.h> | 30 | #include <linux/vmalloc.h> |
31 | #include <linux/tcp.h> | ||
32 | #include <linux/udp.h> | ||
33 | #include <linux/sctp.h> | ||
34 | #include <linux/proc_fs.h> | 31 | #include <linux/proc_fs.h> |
35 | #include <linux/seq_file.h> | 32 | #include <linux/seq_file.h> |
36 | #include <linux/list.h> | 33 | #include <linux/list.h> |
@@ -83,6 +80,7 @@ struct ipt_hashlimit_htable { | |||
83 | /* used internally */ | 80 | /* used internally */ |
84 | spinlock_t lock; /* lock for list_head */ | 81 | spinlock_t lock; /* lock for list_head */ |
85 | u_int32_t rnd; /* random seed for hash */ | 82 | u_int32_t rnd; /* random seed for hash */ |
83 | int rnd_initialized; | ||
86 | struct timer_list timer; /* timer for gc */ | 84 | struct timer_list timer; /* timer for gc */ |
87 | atomic_t count; /* number entries in table */ | 85 | atomic_t count; /* number entries in table */ |
88 | 86 | ||
@@ -137,8 +135,10 @@ __dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst) | |||
137 | 135 | ||
138 | /* initialize hash with random val at the time we allocate | 136 | /* initialize hash with random val at the time we allocate |
139 | * the first hashtable entry */ | 137 | * the first hashtable entry */ |
140 | if (!ht->rnd) | 138 | if (!ht->rnd_initialized) { |
141 | get_random_bytes(&ht->rnd, 4); | 139 | get_random_bytes(&ht->rnd, 4); |
140 | ht->rnd_initialized = 1; | ||
141 | } | ||
142 | 142 | ||
143 | if (ht->cfg.max && | 143 | if (ht->cfg.max && |
144 | atomic_read(&ht->count) >= ht->cfg.max) { | 144 | atomic_read(&ht->count) >= ht->cfg.max) { |
@@ -217,7 +217,7 @@ static int htable_create(struct ipt_hashlimit_info *minfo) | |||
217 | 217 | ||
218 | atomic_set(&hinfo->count, 0); | 218 | atomic_set(&hinfo->count, 0); |
219 | atomic_set(&hinfo->use, 1); | 219 | atomic_set(&hinfo->use, 1); |
220 | hinfo->rnd = 0; | 220 | hinfo->rnd_initialized = 0; |
221 | spin_lock_init(&hinfo->lock); | 221 | spin_lock_init(&hinfo->lock); |
222 | hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); | 222 | hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); |
223 | if (!hinfo->pde) { | 223 | if (!hinfo->pde) { |
@@ -381,49 +381,6 @@ static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) | |||
381 | dh->rateinfo.credit = dh->rateinfo.credit_cap; | 381 | dh->rateinfo.credit = dh->rateinfo.credit_cap; |
382 | } | 382 | } |
383 | 383 | ||
384 | static inline int get_ports(const struct sk_buff *skb, int offset, | ||
385 | u16 ports[2]) | ||
386 | { | ||
387 | union { | ||
388 | struct tcphdr th; | ||
389 | struct udphdr uh; | ||
390 | sctp_sctphdr_t sctph; | ||
391 | } hdr_u, *ptr_u; | ||
392 | |||
393 | /* Must not be a fragment. */ | ||
394 | if (offset) | ||
395 | return 1; | ||
396 | |||
397 | /* Must be big enough to read ports (both UDP and TCP have | ||
398 | them at the start). */ | ||
399 | ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u); | ||
400 | if (!ptr_u) | ||
401 | return 1; | ||
402 | |||
403 | switch (skb->nh.iph->protocol) { | ||
404 | case IPPROTO_TCP: | ||
405 | ports[0] = ptr_u->th.source; | ||
406 | ports[1] = ptr_u->th.dest; | ||
407 | break; | ||
408 | case IPPROTO_UDP: | ||
409 | ports[0] = ptr_u->uh.source; | ||
410 | ports[1] = ptr_u->uh.dest; | ||
411 | break; | ||
412 | case IPPROTO_SCTP: | ||
413 | ports[0] = ptr_u->sctph.source; | ||
414 | ports[1] = ptr_u->sctph.dest; | ||
415 | break; | ||
416 | default: | ||
417 | /* all other protocols don't supprot per-port hash | ||
418 | * buckets */ | ||
419 | ports[0] = ports[1] = 0; | ||
420 | break; | ||
421 | } | ||
422 | |||
423 | return 0; | ||
424 | } | ||
425 | |||
426 | |||
427 | static int | 384 | static int |
428 | hashlimit_match(const struct sk_buff *skb, | 385 | hashlimit_match(const struct sk_buff *skb, |
429 | const struct net_device *in, | 386 | const struct net_device *in, |
@@ -449,8 +406,22 @@ hashlimit_match(const struct sk_buff *skb, | |||
449 | dst.src_ip = skb->nh.iph->saddr; | 406 | dst.src_ip = skb->nh.iph->saddr; |
450 | if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT | 407 | if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT |
451 | ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { | 408 | ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { |
452 | u_int16_t ports[2]; | 409 | u_int16_t _ports[2], *ports; |
453 | if (get_ports(skb, offset, ports)) { | 410 | |
411 | switch (skb->nh.iph->protocol) { | ||
412 | case IPPROTO_TCP: | ||
413 | case IPPROTO_UDP: | ||
414 | case IPPROTO_SCTP: | ||
415 | case IPPROTO_DCCP: | ||
416 | ports = skb_header_pointer(skb, skb->nh.iph->ihl*4, | ||
417 | sizeof(_ports), &_ports); | ||
418 | break; | ||
419 | default: | ||
420 | _ports[0] = _ports[1] = 0; | ||
421 | ports = _ports; | ||
422 | break; | ||
423 | } | ||
424 | if (!ports) { | ||
454 | /* We've been asked to examine this packet, and we | 425 | /* We've been asked to examine this packet, and we |
455 | can't. Hence, no choice but to drop. */ | 426 | can't. Hence, no choice but to drop. */ |
456 | *hotdrop = 1; | 427 | *hotdrop = 1; |
@@ -561,7 +532,7 @@ static void | |||
561 | hashlimit_destroy(const struct xt_match *match, void *matchinfo, | 532 | hashlimit_destroy(const struct xt_match *match, void *matchinfo, |
562 | unsigned int matchsize) | 533 | unsigned int matchsize) |
563 | { | 534 | { |
564 | struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo; | 535 | struct ipt_hashlimit_info *r = matchinfo; |
565 | 536 | ||
566 | htable_put(r->hinfo); | 537 | htable_put(r->hinfo); |
567 | } | 538 | } |
diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index b847ee409efb..61a2139f9cfd 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c | |||
@@ -1,1007 +1,499 @@ | |||
1 | /* Kernel module to check if the source address has been seen recently. */ | 1 | /* |
2 | /* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */ | 2 | * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> |
3 | /* Author: Stephen Frost <sfrost@snowman.net> */ | 3 | * |
4 | /* Project Page: http://snowman.net/projects/ipt_recent/ */ | 4 | * This program is free software; you can redistribute it and/or modify |
5 | /* This software is distributed under the terms of the GPL, Version 2 */ | 5 | * it under the terms of the GNU General Public License version 2 as |
6 | /* This copyright does not cover user programs that use kernel services | 6 | * published by the Free Software Foundation. |
7 | * by normal system calls. */ | 7 | * |
8 | 8 | * This is a replacement of the old ipt_recent module, which carried the | |
9 | #include <linux/module.h> | 9 | * following copyright notice: |
10 | #include <linux/skbuff.h> | 10 | * |
11 | * Author: Stephen Frost <sfrost@snowman.net> | ||
12 | * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org | ||
13 | */ | ||
14 | #include <linux/init.h> | ||
15 | #include <linux/moduleparam.h> | ||
11 | #include <linux/proc_fs.h> | 16 | #include <linux/proc_fs.h> |
12 | #include <linux/spinlock.h> | 17 | #include <linux/seq_file.h> |
13 | #include <linux/interrupt.h> | 18 | #include <linux/string.h> |
14 | #include <asm/uaccess.h> | ||
15 | #include <linux/ctype.h> | 19 | #include <linux/ctype.h> |
16 | #include <linux/ip.h> | 20 | #include <linux/list.h> |
17 | #include <linux/vmalloc.h> | 21 | #include <linux/random.h> |
18 | #include <linux/moduleparam.h> | 22 | #include <linux/jhash.h> |
23 | #include <linux/bitops.h> | ||
24 | #include <linux/skbuff.h> | ||
25 | #include <linux/inet.h> | ||
19 | 26 | ||
20 | #include <linux/netfilter_ipv4/ip_tables.h> | 27 | #include <linux/netfilter_ipv4/ip_tables.h> |
21 | #include <linux/netfilter_ipv4/ipt_recent.h> | 28 | #include <linux/netfilter_ipv4/ipt_recent.h> |
22 | 29 | ||
23 | #undef DEBUG | 30 | MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); |
24 | #define HASH_LOG 9 | 31 | MODULE_DESCRIPTION("IP tables recently seen matching module"); |
32 | MODULE_LICENSE("GPL"); | ||
25 | 33 | ||
26 | /* Defaults, these can be overridden on the module command-line. */ | ||
27 | static unsigned int ip_list_tot = 100; | 34 | static unsigned int ip_list_tot = 100; |
28 | static unsigned int ip_pkt_list_tot = 20; | 35 | static unsigned int ip_pkt_list_tot = 20; |
29 | static unsigned int ip_list_hash_size = 0; | 36 | static unsigned int ip_list_hash_size = 0; |
30 | static unsigned int ip_list_perms = 0644; | 37 | static unsigned int ip_list_perms = 0644; |
31 | #ifdef DEBUG | ||
32 | static int debug = 1; | ||
33 | #endif | ||
34 | |||
35 | static char version[] = | ||
36 | KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost <sfrost@snowman.net>. http://snowman.net/projects/ipt_recent/\n"; | ||
37 | |||
38 | MODULE_AUTHOR("Stephen Frost <sfrost@snowman.net>"); | ||
39 | MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER); | ||
40 | MODULE_LICENSE("GPL"); | ||
41 | module_param(ip_list_tot, uint, 0400); | 38 | module_param(ip_list_tot, uint, 0400); |
42 | module_param(ip_pkt_list_tot, uint, 0400); | 39 | module_param(ip_pkt_list_tot, uint, 0400); |
43 | module_param(ip_list_hash_size, uint, 0400); | 40 | module_param(ip_list_hash_size, uint, 0400); |
44 | module_param(ip_list_perms, uint, 0400); | 41 | module_param(ip_list_perms, uint, 0400); |
45 | #ifdef DEBUG | 42 | MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); |
46 | module_param(debug, bool, 0600); | 43 | MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)"); |
47 | MODULE_PARM_DESC(debug,"enable debugging output"); | 44 | MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); |
48 | #endif | 45 | MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); |
49 | MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); | 46 | |
50 | MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember"); | 47 | |
51 | MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs"); | 48 | struct recent_entry { |
52 | MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files"); | 49 | struct list_head list; |
53 | 50 | struct list_head lru_list; | |
54 | /* Structure of our list of recently seen addresses. */ | 51 | u_int32_t addr; |
55 | struct recent_ip_list { | 52 | u_int8_t ttl; |
56 | u_int32_t addr; | 53 | u_int8_t index; |
57 | u_int8_t ttl; | 54 | u_int16_t nstamps; |
58 | unsigned long last_seen; | 55 | unsigned long stamps[0]; |
59 | unsigned long *last_pkts; | ||
60 | u_int32_t oldest_pkt; | ||
61 | u_int32_t hash_entry; | ||
62 | u_int32_t time_pos; | ||
63 | }; | ||
64 | |||
65 | struct time_info_list { | ||
66 | u_int32_t position; | ||
67 | u_int32_t time; | ||
68 | }; | 56 | }; |
69 | 57 | ||
70 | /* Structure of our linked list of tables of recent lists. */ | 58 | struct recent_table { |
71 | struct recent_ip_tables { | 59 | struct list_head list; |
72 | char name[IPT_RECENT_NAME_LEN]; | 60 | char name[IPT_RECENT_NAME_LEN]; |
73 | int count; | ||
74 | int time_pos; | ||
75 | struct recent_ip_list *table; | ||
76 | struct recent_ip_tables *next; | ||
77 | spinlock_t list_lock; | ||
78 | int *hash_table; | ||
79 | struct time_info_list *time_info; | ||
80 | #ifdef CONFIG_PROC_FS | 61 | #ifdef CONFIG_PROC_FS |
81 | struct proc_dir_entry *status_proc; | 62 | struct proc_dir_entry *proc; |
82 | #endif /* CONFIG_PROC_FS */ | 63 | #endif |
64 | unsigned int refcnt; | ||
65 | unsigned int entries; | ||
66 | struct list_head lru_list; | ||
67 | struct list_head iphash[0]; | ||
83 | }; | 68 | }; |
84 | 69 | ||
85 | /* Our current list of addresses we have recently seen. | 70 | static LIST_HEAD(tables); |
86 | * Only added to on a --set, and only updated on --set || --update | ||
87 | */ | ||
88 | static struct recent_ip_tables *r_tables = NULL; | ||
89 | |||
90 | /* We protect r_list with this spinlock so two processors are not modifying | ||
91 | * the list at the same time. | ||
92 | */ | ||
93 | static DEFINE_SPINLOCK(recent_lock); | 71 | static DEFINE_SPINLOCK(recent_lock); |
72 | static DEFINE_MUTEX(recent_mutex); | ||
94 | 73 | ||
95 | #ifdef CONFIG_PROC_FS | 74 | #ifdef CONFIG_PROC_FS |
96 | /* Our /proc/net/ipt_recent entry */ | 75 | static struct proc_dir_entry *proc_dir; |
97 | static struct proc_dir_entry *proc_net_ipt_recent = NULL; | 76 | static struct file_operations recent_fops; |
98 | #endif | ||
99 | |||
100 | /* Function declaration for later. */ | ||
101 | static int | ||
102 | match(const struct sk_buff *skb, | ||
103 | const struct net_device *in, | ||
104 | const struct net_device *out, | ||
105 | const struct xt_match *match, | ||
106 | const void *matchinfo, | ||
107 | int offset, | ||
108 | unsigned int protoff, | ||
109 | int *hotdrop); | ||
110 | |||
111 | /* Function to hash a given address into the hash table of table_size size */ | ||
112 | static int hash_func(unsigned int addr, int table_size) | ||
113 | { | ||
114 | int result = 0; | ||
115 | unsigned int value = addr; | ||
116 | do { result ^= value; } while((value >>= HASH_LOG)); | ||
117 | |||
118 | #ifdef DEBUG | ||
119 | if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n", | ||
120 | result & (table_size - 1), | ||
121 | addr, | ||
122 | table_size); | ||
123 | #endif | 77 | #endif |
124 | 78 | ||
125 | return(result & (table_size - 1)); | 79 | static u_int32_t hash_rnd; |
126 | } | 80 | static int hash_rnd_initted; |
127 | 81 | ||
128 | #ifdef CONFIG_PROC_FS | 82 | static unsigned int recent_entry_hash(u_int32_t addr) |
129 | /* This is the function which produces the output for our /proc output | ||
130 | * interface which lists each IP address, the last seen time and the | ||
131 | * other recent times the address was seen. | ||
132 | */ | ||
133 | |||
134 | static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data) | ||
135 | { | 83 | { |
136 | int len = 0, count, last_len = 0, pkt_count; | 84 | if (!hash_rnd_initted) { |
137 | off_t pos = 0; | 85 | get_random_bytes(&hash_rnd, 4); |
138 | off_t begin = 0; | 86 | hash_rnd_initted = 1; |
139 | struct recent_ip_tables *curr_table; | ||
140 | |||
141 | curr_table = (struct recent_ip_tables*) data; | ||
142 | |||
143 | spin_lock_bh(&curr_table->list_lock); | ||
144 | for(count = 0; count < ip_list_tot; count++) { | ||
145 | if(!curr_table->table[count].addr) continue; | ||
146 | last_len = len; | ||
147 | len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr)); | ||
148 | len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl); | ||
149 | len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen); | ||
150 | len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt); | ||
151 | len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]); | ||
152 | for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) { | ||
153 | if(!curr_table->table[count].last_pkts[pkt_count]) break; | ||
154 | len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]); | ||
155 | } | ||
156 | len += sprintf(buffer+len,"\n"); | ||
157 | pos = begin + len; | ||
158 | if(pos < offset) { len = 0; begin = pos; } | ||
159 | if(pos > offset + length) { len = last_len; break; } | ||
160 | } | 87 | } |
161 | 88 | return jhash_1word(addr, hash_rnd) & (ip_list_hash_size - 1); | |
162 | *start = buffer + (offset - begin); | ||
163 | len -= (offset - begin); | ||
164 | if(len > length) len = length; | ||
165 | |||
166 | spin_unlock_bh(&curr_table->list_lock); | ||
167 | return len; | ||
168 | } | 89 | } |
169 | 90 | ||
170 | /* ip_recent_ctrl provides an interface for users to modify the table | 91 | static struct recent_entry * |
171 | * directly. This allows adding entries, removing entries, and | 92 | recent_entry_lookup(const struct recent_table *table, u_int32_t addr, u_int8_t ttl) |
172 | * flushing the entire table. | ||
173 | * This is done by opening up the appropriate table for writing and | ||
174 | * sending one of: | ||
175 | * xx.xx.xx.xx -- Add entry to table with current time | ||
176 | * +xx.xx.xx.xx -- Add entry to table with current time | ||
177 | * -xx.xx.xx.xx -- Remove entry from table | ||
178 | * clear -- Flush table, remove all entries | ||
179 | */ | ||
180 | |||
181 | static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data) | ||
182 | { | 93 | { |
183 | static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff }; | 94 | struct recent_entry *e; |
184 | u_int32_t val; | 95 | unsigned int h; |
185 | int base, used = 0; | 96 | |
186 | char c, *cp; | 97 | h = recent_entry_hash(addr); |
187 | union iaddr { | 98 | list_for_each_entry(e, &table->iphash[h], list) |
188 | uint8_t bytes[4]; | 99 | if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) |
189 | uint32_t word; | 100 | return e; |
190 | } res; | 101 | return NULL; |
191 | uint8_t *pp = res.bytes; | 102 | } |
192 | int digit; | ||
193 | |||
194 | char buffer[20]; | ||
195 | int len, check_set = 0, count; | ||
196 | u_int32_t addr = 0; | ||
197 | struct sk_buff *skb; | ||
198 | struct ipt_recent_info *info; | ||
199 | struct recent_ip_tables *curr_table; | ||
200 | |||
201 | curr_table = (struct recent_ip_tables*) data; | ||
202 | |||
203 | if(size > 20) len = 20; else len = size; | ||
204 | |||
205 | if(copy_from_user(buffer,input,len)) return -EFAULT; | ||
206 | |||
207 | if(len < 20) buffer[len] = '\0'; | ||
208 | |||
209 | #ifdef DEBUG | ||
210 | if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer); | ||
211 | #endif | ||
212 | 103 | ||
213 | cp = buffer; | 104 | static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) |
214 | while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; } | 105 | { |
106 | list_del(&e->list); | ||
107 | list_del(&e->lru_list); | ||
108 | kfree(e); | ||
109 | t->entries--; | ||
110 | } | ||
215 | 111 | ||
216 | /* Check if we are asked to flush the entire table */ | 112 | static struct recent_entry * |
217 | if(!memcmp(cp,"clear",5)) { | 113 | recent_entry_init(struct recent_table *t, u_int32_t addr, u_int8_t ttl) |
218 | used += 5; | 114 | { |
219 | spin_lock_bh(&curr_table->list_lock); | 115 | struct recent_entry *e; |
220 | curr_table->time_pos = 0; | ||
221 | for(count = 0; count < ip_list_hash_size; count++) { | ||
222 | curr_table->hash_table[count] = -1; | ||
223 | } | ||
224 | for(count = 0; count < ip_list_tot; count++) { | ||
225 | curr_table->table[count].last_seen = 0; | ||
226 | curr_table->table[count].addr = 0; | ||
227 | curr_table->table[count].ttl = 0; | ||
228 | memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); | ||
229 | curr_table->table[count].oldest_pkt = 0; | ||
230 | curr_table->table[count].time_pos = 0; | ||
231 | curr_table->time_info[count].position = count; | ||
232 | curr_table->time_info[count].time = 0; | ||
233 | } | ||
234 | spin_unlock_bh(&curr_table->list_lock); | ||
235 | return used; | ||
236 | } | ||
237 | 116 | ||
238 | check_set = IPT_RECENT_SET; | 117 | if (t->entries >= ip_list_tot) { |
239 | switch(*cp) { | 118 | e = list_entry(t->lru_list.next, struct recent_entry, lru_list); |
240 | case '+': check_set = IPT_RECENT_SET; cp++; used++; break; | 119 | recent_entry_remove(t, e); |
241 | case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break; | ||
242 | default: if(!isdigit(*cp)) return (used+1); break; | ||
243 | } | 120 | } |
121 | e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, | ||
122 | GFP_ATOMIC); | ||
123 | if (e == NULL) | ||
124 | return NULL; | ||
125 | e->addr = addr; | ||
126 | e->ttl = ttl; | ||
127 | e->stamps[0] = jiffies; | ||
128 | e->nstamps = 1; | ||
129 | e->index = 1; | ||
130 | list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); | ||
131 | list_add_tail(&e->lru_list, &t->lru_list); | ||
132 | t->entries++; | ||
133 | return e; | ||
134 | } | ||
244 | 135 | ||
245 | #ifdef DEBUG | 136 | static void recent_entry_update(struct recent_table *t, struct recent_entry *e) |
246 | if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set); | 137 | { |
247 | #endif | 138 | e->stamps[e->index++] = jiffies; |
248 | /* Get addr (effectively inet_aton()) */ | 139 | if (e->index > e->nstamps) |
249 | /* Shamelessly stolen from libc, a function in the kernel for doing | 140 | e->nstamps = e->index; |
250 | * this would, of course, be greatly preferred, but our options appear | 141 | e->index %= ip_pkt_list_tot; |
251 | * to be rather limited, so we will just do it ourselves here. | 142 | list_move_tail(&e->lru_list, &t->lru_list); |
252 | */ | 143 | } |
253 | res.word = 0; | ||
254 | |||
255 | c = *cp; | ||
256 | for(;;) { | ||
257 | if(!isdigit(c)) return used; | ||
258 | val = 0; base = 10; digit = 0; | ||
259 | if(c == '0') { | ||
260 | c = *++cp; | ||
261 | if(c == 'x' || c == 'X') base = 16, c = *++cp; | ||
262 | else { base = 8; digit = 1; } | ||
263 | } | ||
264 | for(;;) { | ||
265 | if(isascii(c) && isdigit(c)) { | ||
266 | if(base == 8 && (c == '8' || c == '0')) return used; | ||
267 | val = (val * base) + (c - '0'); | ||
268 | c = *++cp; | ||
269 | digit = 1; | ||
270 | } else if(base == 16 && isascii(c) && isxdigit(c)) { | ||
271 | val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A')); | ||
272 | c = *++cp; | ||
273 | digit = 1; | ||
274 | } else break; | ||
275 | } | ||
276 | if(c == '.') { | ||
277 | if(pp > res.bytes + 2 || val > 0xff) return used; | ||
278 | *pp++ = val; | ||
279 | c = *++cp; | ||
280 | } else break; | ||
281 | } | ||
282 | used = cp - buffer; | ||
283 | if(c != '\0' && (!isascii(c) || !isspace(c))) return used; | ||
284 | if(c == '\n') used++; | ||
285 | if(!digit) return used; | ||
286 | 144 | ||
287 | if(val > max[pp - res.bytes]) return used; | 145 | static struct recent_table *recent_table_lookup(const char *name) |
288 | addr = res.word | htonl(val); | 146 | { |
147 | struct recent_table *t; | ||
289 | 148 | ||
290 | if(!addr && check_set == IPT_RECENT_SET) return used; | 149 | list_for_each_entry(t, &tables, list) |
150 | if (!strcmp(t->name, name)) | ||
151 | return t; | ||
152 | return NULL; | ||
153 | } | ||
291 | 154 | ||
292 | #ifdef DEBUG | 155 | static void recent_table_flush(struct recent_table *t) |
293 | if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used); | 156 | { |
294 | #endif | 157 | struct recent_entry *e, *next; |
158 | unsigned int i; | ||
295 | 159 | ||
296 | /* Set up and just call match */ | 160 | for (i = 0; i < ip_list_hash_size; i++) { |
297 | info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL); | 161 | list_for_each_entry_safe(e, next, &t->iphash[i], list) |
298 | if(!info) { return -ENOMEM; } | 162 | recent_entry_remove(t, e); |
299 | info->seconds = 0; | ||
300 | info->hit_count = 0; | ||
301 | info->check_set = check_set; | ||
302 | info->invert = 0; | ||
303 | info->side = IPT_RECENT_SOURCE; | ||
304 | strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN); | ||
305 | info->name[IPT_RECENT_NAME_LEN-1] = '\0'; | ||
306 | |||
307 | skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL); | ||
308 | if (!skb) { | ||
309 | used = -ENOMEM; | ||
310 | goto out_free_info; | ||
311 | } | ||
312 | skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL); | ||
313 | if (!skb->nh.iph) { | ||
314 | used = -ENOMEM; | ||
315 | goto out_free_skb; | ||
316 | } | 163 | } |
317 | |||
318 | skb->nh.iph->saddr = addr; | ||
319 | skb->nh.iph->daddr = 0; | ||
320 | /* Clear ttl since we have no way of knowing it */ | ||
321 | skb->nh.iph->ttl = 0; | ||
322 | match(skb,NULL,NULL,NULL,info,0,0,NULL); | ||
323 | |||
324 | kfree(skb->nh.iph); | ||
325 | out_free_skb: | ||
326 | kfree(skb); | ||
327 | out_free_info: | ||
328 | kfree(info); | ||
329 | |||
330 | #ifdef DEBUG | ||
331 | if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used); | ||
332 | #endif | ||
333 | return used; | ||
334 | } | 164 | } |
335 | 165 | ||
336 | #endif /* CONFIG_PROC_FS */ | ||
337 | |||
338 | /* 'match' is our primary function, called by the kernel whenever a rule is | ||
339 | * hit with our module as an option to it. | ||
340 | * What this function does depends on what was specifically asked of it by | ||
341 | * the user: | ||
342 | * --set -- Add or update last seen time of the source address of the packet | ||
343 | * -- matchinfo->check_set == IPT_RECENT_SET | ||
344 | * --rcheck -- Just check if the source address is in the list | ||
345 | * -- matchinfo->check_set == IPT_RECENT_CHECK | ||
346 | * --update -- If the source address is in the list, update last_seen | ||
347 | * -- matchinfo->check_set == IPT_RECENT_UPDATE | ||
348 | * --remove -- If the source address is in the list, remove it | ||
349 | * -- matchinfo->check_set == IPT_RECENT_REMOVE | ||
350 | * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds | ||
351 | * -- matchinfo->seconds | ||
352 | * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times | ||
353 | * -- matchinfo->hit_count | ||
354 | * --seconds and --hitcount can be combined | ||
355 | */ | ||
356 | static int | 166 | static int |
357 | match(const struct sk_buff *skb, | 167 | ipt_recent_match(const struct sk_buff *skb, |
358 | const struct net_device *in, | 168 | const struct net_device *in, const struct net_device *out, |
359 | const struct net_device *out, | 169 | const struct xt_match *match, const void *matchinfo, |
360 | const struct xt_match *match, | 170 | int offset, unsigned int protoff, int *hotdrop) |
361 | const void *matchinfo, | ||
362 | int offset, | ||
363 | unsigned int protoff, | ||
364 | int *hotdrop) | ||
365 | { | 171 | { |
366 | int pkt_count, hits_found, ans; | ||
367 | unsigned long now; | ||
368 | const struct ipt_recent_info *info = matchinfo; | 172 | const struct ipt_recent_info *info = matchinfo; |
369 | u_int32_t addr = 0, time_temp; | 173 | struct recent_table *t; |
370 | u_int8_t ttl = skb->nh.iph->ttl; | 174 | struct recent_entry *e; |
371 | int *hash_table; | 175 | u_int32_t addr; |
372 | int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1; | 176 | u_int8_t ttl; |
373 | struct time_info_list *time_info; | 177 | int ret = info->invert; |
374 | struct recent_ip_tables *curr_table; | ||
375 | struct recent_ip_tables *last_table; | ||
376 | struct recent_ip_list *r_list; | ||
377 | |||
378 | #ifdef DEBUG | ||
379 | if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n"); | ||
380 | #endif | ||
381 | |||
382 | /* Default is false ^ info->invert */ | ||
383 | ans = info->invert; | ||
384 | 178 | ||
385 | #ifdef DEBUG | 179 | if (info->side == IPT_RECENT_DEST) |
386 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name); | 180 | addr = skb->nh.iph->daddr; |
387 | #endif | 181 | else |
182 | addr = skb->nh.iph->saddr; | ||
388 | 183 | ||
389 | /* if out != NULL then routing has been done and TTL changed. | 184 | ttl = skb->nh.iph->ttl; |
390 | * We change it back here internally for match what came in before routing. */ | 185 | /* use TTL as seen before forwarding */ |
391 | if(out) ttl++; | 186 | if (out && !skb->sk) |
187 | ttl++; | ||
392 | 188 | ||
393 | /* Find the right table */ | ||
394 | spin_lock_bh(&recent_lock); | 189 | spin_lock_bh(&recent_lock); |
395 | curr_table = r_tables; | 190 | t = recent_table_lookup(info->name); |
396 | while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) ); | 191 | e = recent_entry_lookup(t, addr, |
397 | 192 | info->check_set & IPT_RECENT_TTL ? ttl : 0); | |
398 | #ifdef DEBUG | 193 | if (e == NULL) { |
399 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name); | 194 | if (!(info->check_set & IPT_RECENT_SET)) |
400 | #endif | 195 | goto out; |
401 | 196 | e = recent_entry_init(t, addr, ttl); | |
402 | spin_unlock_bh(&recent_lock); | 197 | if (e == NULL) |
403 | 198 | *hotdrop = 1; | |
404 | /* Table with this name not found, match impossible */ | 199 | ret ^= 1; |
405 | if(!curr_table) { return ans; } | 200 | goto out; |
406 | |||
407 | /* Make sure no one is changing the list while we work with it */ | ||
408 | spin_lock_bh(&curr_table->list_lock); | ||
409 | |||
410 | r_list = curr_table->table; | ||
411 | if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr; | ||
412 | |||
413 | if(!addr) { | ||
414 | #ifdef DEBUG | ||
415 | if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr); | ||
416 | #endif | ||
417 | spin_unlock_bh(&curr_table->list_lock); | ||
418 | return ans; | ||
419 | } | ||
420 | |||
421 | #ifdef DEBUG | ||
422 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl); | ||
423 | #endif | ||
424 | |||
425 | /* Get jiffies now in case they changed while we were waiting for a lock */ | ||
426 | now = jiffies; | ||
427 | hash_table = curr_table->hash_table; | ||
428 | time_info = curr_table->time_info; | ||
429 | |||
430 | orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size); | ||
431 | /* Hash entry at this result used */ | ||
432 | /* Check for TTL match if requested. If TTL is zero then a match would never | ||
433 | * happen, so match regardless of existing TTL in that case. Zero means the | ||
434 | * entry was added via the /proc interface anyway, so we will just use the | ||
435 | * first TTL we get for that IP address. */ | ||
436 | if(info->check_set & IPT_RECENT_TTL) { | ||
437 | while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr && | ||
438 | (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) { | ||
439 | /* Collision in hash table */ | ||
440 | hash_result = (hash_result + 1) % ip_list_hash_size; | ||
441 | } | ||
442 | } else { | ||
443 | while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) { | ||
444 | /* Collision in hash table */ | ||
445 | hash_result = (hash_result + 1) % ip_list_hash_size; | ||
446 | } | ||
447 | } | ||
448 | |||
449 | if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) { | ||
450 | /* IP not in list and not asked to SET */ | ||
451 | spin_unlock_bh(&curr_table->list_lock); | ||
452 | return ans; | ||
453 | } | ||
454 | |||
455 | /* Check if we need to handle the collision, do not need to on REMOVE */ | ||
456 | if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) { | ||
457 | #ifdef DEBUG | ||
458 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n", | ||
459 | orig_hash_result, | ||
460 | hash_result, | ||
461 | r_list[hash_table[orig_hash_result]].addr, | ||
462 | addr); | ||
463 | #endif | ||
464 | |||
465 | /* We had a collision. | ||
466 | * orig_hash_result is where we started, hash_result is where we ended up. | ||
467 | * So, swap them because we are likely to see the same guy again sooner */ | ||
468 | #ifdef DEBUG | ||
469 | if(debug) { | ||
470 | printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]); | ||
471 | printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n", | ||
472 | r_list[hash_table[orig_hash_result]].hash_entry); | ||
473 | } | ||
474 | #endif | ||
475 | |||
476 | r_list[hash_table[orig_hash_result]].hash_entry = hash_result; | ||
477 | |||
478 | |||
479 | temp = hash_table[orig_hash_result]; | ||
480 | #ifdef DEBUG | ||
481 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]); | ||
482 | #endif | ||
483 | hash_table[orig_hash_result] = hash_table[hash_result]; | ||
484 | hash_table[hash_result] = temp; | ||
485 | temp = hash_result; | ||
486 | hash_result = orig_hash_result; | ||
487 | orig_hash_result = temp; | ||
488 | time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result]; | ||
489 | if(hash_table[hash_result] != -1) { | ||
490 | r_list[hash_table[hash_result]].hash_entry = hash_result; | ||
491 | time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; | ||
492 | } | ||
493 | |||
494 | #ifdef DEBUG | ||
495 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n"); | ||
496 | #endif | ||
497 | } | 201 | } |
498 | 202 | ||
499 | if(hash_table[hash_result] == -1) { | 203 | if (info->check_set & IPT_RECENT_SET) |
500 | #ifdef DEBUG | 204 | ret ^= 1; |
501 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n", | 205 | else if (info->check_set & IPT_RECENT_REMOVE) { |
502 | hash_result, addr); | 206 | recent_entry_remove(t, e); |
503 | #endif | 207 | ret ^= 1; |
504 | 208 | } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) { | |
505 | /* New item found and IPT_RECENT_SET, so we need to add it */ | 209 | unsigned long t = jiffies - info->seconds * HZ; |
506 | location = time_info[curr_table->time_pos].position; | 210 | unsigned int i, hits = 0; |
507 | hash_table[r_list[location].hash_entry] = -1; | 211 | |
508 | hash_table[hash_result] = location; | 212 | for (i = 0; i < e->nstamps; i++) { |
509 | memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); | 213 | if (info->seconds && time_after(t, e->stamps[i])) |
510 | r_list[location].time_pos = curr_table->time_pos; | 214 | continue; |
511 | r_list[location].addr = addr; | 215 | if (++hits >= info->hit_count) { |
512 | r_list[location].ttl = ttl; | 216 | ret ^= 1; |
513 | r_list[location].last_seen = now; | 217 | break; |
514 | r_list[location].oldest_pkt = 1; | ||
515 | r_list[location].last_pkts[0] = now; | ||
516 | r_list[location].hash_entry = hash_result; | ||
517 | time_info[curr_table->time_pos].time = r_list[location].last_seen; | ||
518 | curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot; | ||
519 | |||
520 | ans = !info->invert; | ||
521 | } else { | ||
522 | #ifdef DEBUG | ||
523 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n", | ||
524 | hash_result, | ||
525 | addr); | ||
526 | #endif | ||
527 | |||
528 | /* Existing item found */ | ||
529 | location = hash_table[hash_result]; | ||
530 | /* We have a match on address, now to make sure it meets all requirements for a | ||
531 | * full match. */ | ||
532 | if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) { | ||
533 | if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert; | ||
534 | if(info->seconds && !info->hit_count) { | ||
535 | if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert; | ||
536 | } | ||
537 | if(info->seconds && info->hit_count) { | ||
538 | for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { | ||
539 | if(r_list[location].last_pkts[pkt_count] == 0) break; | ||
540 | if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++; | ||
541 | } | ||
542 | if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; | ||
543 | } | ||
544 | if(info->hit_count && !info->seconds) { | ||
545 | for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { | ||
546 | if(r_list[location].last_pkts[pkt_count] == 0) break; | ||
547 | hits_found++; | ||
548 | } | ||
549 | if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; | ||
550 | } | 218 | } |
551 | } | 219 | } |
552 | #ifdef DEBUG | ||
553 | if(debug) { | ||
554 | if(ans) | ||
555 | printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr); | ||
556 | else | ||
557 | printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr); | ||
558 | } | ||
559 | #endif | ||
560 | |||
561 | /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the | ||
562 | * current timestamp to the last_seen. */ | ||
563 | if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) { | ||
564 | #ifdef DEBUG | ||
565 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n"); | ||
566 | #endif | ||
567 | /* Have to update our time info */ | ||
568 | time_loc = r_list[location].time_pos; | ||
569 | time_info[time_loc].time = now; | ||
570 | time_info[time_loc].position = location; | ||
571 | while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { | ||
572 | time_temp = time_info[time_loc].time; | ||
573 | time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; | ||
574 | time_info[(time_loc+1)%ip_list_tot].time = time_temp; | ||
575 | time_temp = time_info[time_loc].position; | ||
576 | time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; | ||
577 | time_info[(time_loc+1)%ip_list_tot].position = time_temp; | ||
578 | r_list[time_info[time_loc].position].time_pos = time_loc; | ||
579 | r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; | ||
580 | time_loc = (time_loc+1) % ip_list_tot; | ||
581 | } | ||
582 | r_list[location].time_pos = time_loc; | ||
583 | r_list[location].ttl = ttl; | ||
584 | r_list[location].last_pkts[r_list[location].oldest_pkt] = now; | ||
585 | r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot; | ||
586 | r_list[location].last_seen = now; | ||
587 | } | ||
588 | /* If we have been asked to remove the entry from the list, just set it to 0 */ | ||
589 | if(info->check_set & IPT_RECENT_REMOVE) { | ||
590 | #ifdef DEBUG | ||
591 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result); | ||
592 | #endif | ||
593 | /* Check if this is part of a collision chain */ | ||
594 | while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) { | ||
595 | orig_hash_result++; | ||
596 | if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) { | ||
597 | /* Found collision chain, how deep does this rabbit hole go? */ | ||
598 | #ifdef DEBUG | ||
599 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n"); | ||
600 | #endif | ||
601 | end_collision_chain = orig_hash_result; | ||
602 | } | ||
603 | } | ||
604 | if(end_collision_chain != -1) { | ||
605 | #ifdef DEBUG | ||
606 | if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n"); | ||
607 | #endif | ||
608 | /* Part of a collision chain, swap it with the end of the chain | ||
609 | * before removing. */ | ||
610 | r_list[hash_table[end_collision_chain]].hash_entry = hash_result; | ||
611 | temp = hash_table[end_collision_chain]; | ||
612 | hash_table[end_collision_chain] = hash_table[hash_result]; | ||
613 | hash_table[hash_result] = temp; | ||
614 | time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; | ||
615 | hash_result = end_collision_chain; | ||
616 | r_list[hash_table[hash_result]].hash_entry = hash_result; | ||
617 | time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; | ||
618 | } | ||
619 | location = hash_table[hash_result]; | ||
620 | hash_table[r_list[location].hash_entry] = -1; | ||
621 | time_loc = r_list[location].time_pos; | ||
622 | time_info[time_loc].time = 0; | ||
623 | time_info[time_loc].position = location; | ||
624 | while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { | ||
625 | time_temp = time_info[time_loc].time; | ||
626 | time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; | ||
627 | time_info[(time_loc+1)%ip_list_tot].time = time_temp; | ||
628 | time_temp = time_info[time_loc].position; | ||
629 | time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; | ||
630 | time_info[(time_loc+1)%ip_list_tot].position = time_temp; | ||
631 | r_list[time_info[time_loc].position].time_pos = time_loc; | ||
632 | r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; | ||
633 | time_loc = (time_loc+1) % ip_list_tot; | ||
634 | } | ||
635 | r_list[location].time_pos = time_loc; | ||
636 | r_list[location].last_seen = 0; | ||
637 | r_list[location].addr = 0; | ||
638 | r_list[location].ttl = 0; | ||
639 | memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); | ||
640 | r_list[location].oldest_pkt = 0; | ||
641 | ans = !info->invert; | ||
642 | } | ||
643 | spin_unlock_bh(&curr_table->list_lock); | ||
644 | return ans; | ||
645 | } | 220 | } |
646 | 221 | ||
647 | spin_unlock_bh(&curr_table->list_lock); | 222 | if (info->check_set & IPT_RECENT_SET || |
648 | #ifdef DEBUG | 223 | (info->check_set & IPT_RECENT_UPDATE && ret)) { |
649 | if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n"); | 224 | recent_entry_update(t, e); |
650 | #endif | 225 | e->ttl = ttl; |
651 | return ans; | 226 | } |
227 | out: | ||
228 | spin_unlock_bh(&recent_lock); | ||
229 | return ret; | ||
652 | } | 230 | } |
653 | 231 | ||
654 | /* This function is to verify that the rule given during the userspace iptables | ||
655 | * command is correct. | ||
656 | * If the command is valid then we check if the table name referred to by the | ||
657 | * rule exists, if not it is created. | ||
658 | */ | ||
659 | static int | 232 | static int |
660 | checkentry(const char *tablename, | 233 | ipt_recent_checkentry(const char *tablename, const void *ip, |
661 | const void *ip, | 234 | const struct xt_match *match, void *matchinfo, |
662 | const struct xt_match *match, | 235 | unsigned int matchsize, unsigned int hook_mask) |
663 | void *matchinfo, | ||
664 | unsigned int matchsize, | ||
665 | unsigned int hook_mask) | ||
666 | { | 236 | { |
667 | int flag = 0, c; | ||
668 | unsigned long *hold; | ||
669 | const struct ipt_recent_info *info = matchinfo; | 237 | const struct ipt_recent_info *info = matchinfo; |
670 | struct recent_ip_tables *curr_table, *find_table, *last_table; | 238 | struct recent_table *t; |
671 | 239 | unsigned i; | |
672 | #ifdef DEBUG | 240 | int ret = 0; |
673 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n"); | ||
674 | #endif | ||
675 | |||
676 | /* seconds and hit_count only valid for CHECK/UPDATE */ | ||
677 | if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; } | ||
678 | if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; } | ||
679 | if(info->check_set & IPT_RECENT_CHECK) flag++; | ||
680 | if(info->check_set & IPT_RECENT_UPDATE) flag++; | ||
681 | |||
682 | /* One and only one of these should ever be set */ | ||
683 | if(flag != 1) return 0; | ||
684 | |||
685 | /* Name must be set to something */ | ||
686 | if(!info->name || !info->name[0]) return 0; | ||
687 | 241 | ||
688 | /* Things look good, create a list for this if it does not exist */ | 242 | if (hweight8(info->check_set & |
689 | /* Lock the linked list while we play with it */ | 243 | (IPT_RECENT_SET | IPT_RECENT_REMOVE | |
690 | spin_lock_bh(&recent_lock); | 244 | IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1) |
691 | 245 | return 0; | |
692 | /* Look for an entry with this name already created */ | 246 | if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) && |
693 | /* Finds the end of the list and the entry before the end if current name does not exist */ | 247 | (info->seconds || info->hit_count)) |
694 | find_table = r_tables; | 248 | return 0; |
695 | while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); | 249 | if (info->name[0] == '\0' || |
250 | strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) | ||
251 | return 0; | ||
696 | 252 | ||
697 | /* If a table already exists just increment the count on that table and return */ | 253 | mutex_lock(&recent_mutex); |
698 | if(find_table) { | 254 | t = recent_table_lookup(info->name); |
699 | #ifdef DEBUG | 255 | if (t != NULL) { |
700 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name); | 256 | t->refcnt++; |
701 | #endif | 257 | ret = 1; |
702 | find_table->count++; | 258 | goto out; |
703 | spin_unlock_bh(&recent_lock); | ||
704 | return 1; | ||
705 | } | 259 | } |
706 | 260 | ||
707 | spin_unlock_bh(&recent_lock); | 261 | t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, |
708 | 262 | GFP_KERNEL); | |
709 | /* Table with this name not found */ | 263 | if (t == NULL) |
710 | /* Allocate memory for new linked list item */ | 264 | goto out; |
711 | 265 | t->refcnt = 1; | |
712 | #ifdef DEBUG | 266 | strcpy(t->name, info->name); |
713 | if(debug) { | 267 | INIT_LIST_HEAD(&t->lru_list); |
714 | printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name); | 268 | for (i = 0; i < ip_list_hash_size; i++) |
715 | printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables)); | 269 | INIT_LIST_HEAD(&t->iphash[i]); |
270 | #ifdef CONFIG_PROC_FS | ||
271 | t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir); | ||
272 | if (t->proc == NULL) { | ||
273 | kfree(t); | ||
274 | goto out; | ||
716 | } | 275 | } |
276 | t->proc->proc_fops = &recent_fops; | ||
277 | t->proc->data = t; | ||
717 | #endif | 278 | #endif |
279 | spin_lock_bh(&recent_lock); | ||
280 | list_add_tail(&t->list, &tables); | ||
281 | spin_unlock_bh(&recent_lock); | ||
282 | ret = 1; | ||
283 | out: | ||
284 | mutex_unlock(&recent_mutex); | ||
285 | return ret; | ||
286 | } | ||
718 | 287 | ||
719 | curr_table = vmalloc(sizeof(struct recent_ip_tables)); | 288 | static void |
720 | if(curr_table == NULL) return 0; | 289 | ipt_recent_destroy(const struct xt_match *match, void *matchinfo, |
721 | 290 | unsigned int matchsize) | |
722 | spin_lock_init(&curr_table->list_lock); | 291 | { |
723 | curr_table->next = NULL; | 292 | const struct ipt_recent_info *info = matchinfo; |
724 | curr_table->count = 1; | 293 | struct recent_table *t; |
725 | curr_table->time_pos = 0; | ||
726 | strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN); | ||
727 | curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0'; | ||
728 | |||
729 | /* Allocate memory for this table and the list of packets in each entry. */ | ||
730 | #ifdef DEBUG | ||
731 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n", | ||
732 | sizeof(struct recent_ip_list)*ip_list_tot, | ||
733 | info->name); | ||
734 | #endif | ||
735 | |||
736 | curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot); | ||
737 | if(curr_table->table == NULL) { vfree(curr_table); return 0; } | ||
738 | memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot); | ||
739 | #ifdef DEBUG | ||
740 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n", | ||
741 | sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot); | ||
742 | #endif | ||
743 | |||
744 | hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot); | ||
745 | #ifdef DEBUG | ||
746 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n"); | ||
747 | #endif | ||
748 | if(hold == NULL) { | ||
749 | printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n"); | ||
750 | vfree(curr_table->table); | ||
751 | vfree(curr_table); | ||
752 | return 0; | ||
753 | } | ||
754 | for(c = 0; c < ip_list_tot; c++) { | ||
755 | curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot; | ||
756 | } | ||
757 | 294 | ||
758 | /* Allocate memory for the hash table */ | 295 | mutex_lock(&recent_mutex); |
759 | #ifdef DEBUG | 296 | t = recent_table_lookup(info->name); |
760 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n", | 297 | if (--t->refcnt == 0) { |
761 | sizeof(int)*ip_list_hash_size); | 298 | spin_lock_bh(&recent_lock); |
299 | list_del(&t->list); | ||
300 | spin_unlock_bh(&recent_lock); | ||
301 | recent_table_flush(t); | ||
302 | #ifdef CONFIG_PROC_FS | ||
303 | remove_proc_entry(t->name, proc_dir); | ||
762 | #endif | 304 | #endif |
763 | 305 | kfree(t); | |
764 | curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size); | ||
765 | if(!curr_table->hash_table) { | ||
766 | printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n"); | ||
767 | vfree(hold); | ||
768 | vfree(curr_table->table); | ||
769 | vfree(curr_table); | ||
770 | return 0; | ||
771 | } | ||
772 | |||
773 | for(c = 0; c < ip_list_hash_size; c++) { | ||
774 | curr_table->hash_table[c] = -1; | ||
775 | } | 306 | } |
307 | mutex_unlock(&recent_mutex); | ||
308 | } | ||
776 | 309 | ||
777 | /* Allocate memory for the time info */ | 310 | #ifdef CONFIG_PROC_FS |
778 | #ifdef DEBUG | 311 | struct recent_iter_state { |
779 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n", | 312 | struct recent_table *table; |
780 | sizeof(struct time_info_list)*ip_list_tot); | 313 | unsigned int bucket; |
781 | #endif | 314 | }; |
782 | 315 | ||
783 | curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot); | 316 | static void *recent_seq_start(struct seq_file *seq, loff_t *pos) |
784 | if(!curr_table->time_info) { | 317 | { |
785 | printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n"); | 318 | struct recent_iter_state *st = seq->private; |
786 | vfree(curr_table->hash_table); | 319 | struct recent_table *t = st->table; |
787 | vfree(hold); | 320 | struct recent_entry *e; |
788 | vfree(curr_table->table); | 321 | loff_t p = *pos; |
789 | vfree(curr_table); | ||
790 | return 0; | ||
791 | } | ||
792 | for(c = 0; c < ip_list_tot; c++) { | ||
793 | curr_table->time_info[c].position = c; | ||
794 | curr_table->time_info[c].time = 0; | ||
795 | } | ||
796 | 322 | ||
797 | /* Put the new table in place */ | ||
798 | spin_lock_bh(&recent_lock); | 323 | spin_lock_bh(&recent_lock); |
799 | find_table = r_tables; | ||
800 | while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); | ||
801 | |||
802 | /* If a table already exists just increment the count on that table and return */ | ||
803 | if(find_table) { | ||
804 | find_table->count++; | ||
805 | spin_unlock_bh(&recent_lock); | ||
806 | #ifdef DEBUG | ||
807 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name); | ||
808 | #endif | ||
809 | vfree(curr_table->time_info); | ||
810 | vfree(curr_table->hash_table); | ||
811 | vfree(hold); | ||
812 | vfree(curr_table->table); | ||
813 | vfree(curr_table); | ||
814 | return 1; | ||
815 | } | ||
816 | if(!last_table) r_tables = curr_table; else last_table->next = curr_table; | ||
817 | |||
818 | spin_unlock_bh(&recent_lock); | ||
819 | 324 | ||
820 | #ifdef CONFIG_PROC_FS | 325 | for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) { |
821 | /* Create our proc 'status' entry. */ | 326 | list_for_each_entry(e, &t->iphash[st->bucket], list) { |
822 | curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent); | 327 | if (p-- == 0) |
823 | if (!curr_table->status_proc) { | 328 | return e; |
824 | vfree(hold); | ||
825 | printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n"); | ||
826 | /* Destroy the created table */ | ||
827 | spin_lock_bh(&recent_lock); | ||
828 | last_table = NULL; | ||
829 | curr_table = r_tables; | ||
830 | if(!curr_table) { | ||
831 | #ifdef DEBUG | ||
832 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n"); | ||
833 | #endif | ||
834 | spin_unlock_bh(&recent_lock); | ||
835 | return 0; | ||
836 | } | ||
837 | while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); | ||
838 | if(!curr_table) { | ||
839 | #ifdef DEBUG | ||
840 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n"); | ||
841 | #endif | ||
842 | spin_unlock_bh(&recent_lock); | ||
843 | return 0; | ||
844 | } | 329 | } |
845 | if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; | ||
846 | spin_unlock_bh(&recent_lock); | ||
847 | vfree(curr_table->time_info); | ||
848 | vfree(curr_table->hash_table); | ||
849 | vfree(curr_table->table); | ||
850 | vfree(curr_table); | ||
851 | return 0; | ||
852 | } | 330 | } |
853 | 331 | return NULL; | |
854 | curr_table->status_proc->owner = THIS_MODULE; | 332 | } |
855 | curr_table->status_proc->data = curr_table; | ||
856 | wmb(); | ||
857 | curr_table->status_proc->read_proc = ip_recent_get_info; | ||
858 | curr_table->status_proc->write_proc = ip_recent_ctrl; | ||
859 | #endif /* CONFIG_PROC_FS */ | ||
860 | |||
861 | #ifdef DEBUG | ||
862 | if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n"); | ||
863 | #endif | ||
864 | 333 | ||
865 | return 1; | 334 | static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) |
335 | { | ||
336 | struct recent_iter_state *st = seq->private; | ||
337 | struct recent_table *t = st->table; | ||
338 | struct recent_entry *e = v; | ||
339 | struct list_head *head = e->list.next; | ||
340 | |||
341 | while (head == &t->iphash[st->bucket]) { | ||
342 | if (++st->bucket >= ip_list_hash_size) | ||
343 | return NULL; | ||
344 | head = t->iphash[st->bucket].next; | ||
345 | } | ||
346 | (*pos)++; | ||
347 | return list_entry(head, struct recent_entry, list); | ||
866 | } | 348 | } |
867 | 349 | ||
868 | /* This function is called in the event that a rule matching this module is | 350 | static void recent_seq_stop(struct seq_file *s, void *v) |
869 | * removed. | ||
870 | * When this happens we need to check if there are no other rules matching | ||
871 | * the table given. If that is the case then we remove the table and clean | ||
872 | * up its memory. | ||
873 | */ | ||
874 | static void | ||
875 | destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize) | ||
876 | { | 351 | { |
877 | const struct ipt_recent_info *info = matchinfo; | 352 | spin_unlock_bh(&recent_lock); |
878 | struct recent_ip_tables *curr_table, *last_table; | 353 | } |
879 | 354 | ||
880 | #ifdef DEBUG | 355 | static int recent_seq_show(struct seq_file *seq, void *v) |
881 | if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n"); | 356 | { |
882 | #endif | 357 | struct recent_entry *e = v; |
358 | unsigned int i; | ||
359 | |||
360 | i = (e->index - 1) % ip_pkt_list_tot; | ||
361 | seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", | ||
362 | NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); | ||
363 | for (i = 0; i < e->nstamps; i++) | ||
364 | seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); | ||
365 | seq_printf(seq, "\n"); | ||
366 | return 0; | ||
367 | } | ||
883 | 368 | ||
884 | if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return; | 369 | static struct seq_operations recent_seq_ops = { |
370 | .start = recent_seq_start, | ||
371 | .next = recent_seq_next, | ||
372 | .stop = recent_seq_stop, | ||
373 | .show = recent_seq_show, | ||
374 | }; | ||
885 | 375 | ||
886 | /* Lock the linked list while we play with it */ | 376 | static int recent_seq_open(struct inode *inode, struct file *file) |
887 | spin_lock_bh(&recent_lock); | 377 | { |
378 | struct proc_dir_entry *pde = PDE(inode); | ||
379 | struct seq_file *seq; | ||
380 | struct recent_iter_state *st; | ||
381 | int ret; | ||
382 | |||
383 | st = kzalloc(sizeof(*st), GFP_KERNEL); | ||
384 | if (st == NULL) | ||
385 | return -ENOMEM; | ||
386 | ret = seq_open(file, &recent_seq_ops); | ||
387 | if (ret) | ||
388 | kfree(st); | ||
389 | st->table = pde->data; | ||
390 | seq = file->private_data; | ||
391 | seq->private = st; | ||
392 | return ret; | ||
393 | } | ||
888 | 394 | ||
889 | /* Look for an entry with this name already created */ | 395 | static ssize_t recent_proc_write(struct file *file, const char __user *input, |
890 | /* Finds the end of the list and the entry before the end if current name does not exist */ | 396 | size_t size, loff_t *loff) |
891 | last_table = NULL; | 397 | { |
892 | curr_table = r_tables; | 398 | struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode); |
893 | if(!curr_table) { | 399 | struct recent_table *t = pde->data; |
894 | #ifdef DEBUG | 400 | struct recent_entry *e; |
895 | if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n"); | 401 | char buf[sizeof("+255.255.255.255")], *c = buf; |
896 | #endif | 402 | u_int32_t addr; |
403 | int add; | ||
404 | |||
405 | if (size > sizeof(buf)) | ||
406 | size = sizeof(buf); | ||
407 | if (copy_from_user(buf, input, size)) | ||
408 | return -EFAULT; | ||
409 | while (isspace(*c)) | ||
410 | c++; | ||
411 | |||
412 | if (size - (c - buf) < 5) | ||
413 | return c - buf; | ||
414 | if (!strncmp(c, "clear", 5)) { | ||
415 | c += 5; | ||
416 | spin_lock_bh(&recent_lock); | ||
417 | recent_table_flush(t); | ||
897 | spin_unlock_bh(&recent_lock); | 418 | spin_unlock_bh(&recent_lock); |
898 | return; | 419 | return c - buf; |
899 | } | 420 | } |
900 | while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); | ||
901 | 421 | ||
902 | /* If a table does not exist then do nothing and return */ | 422 | switch (*c) { |
903 | if(!curr_table) { | 423 | case '-': |
904 | #ifdef DEBUG | 424 | add = 0; |
905 | if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n"); | 425 | c++; |
906 | #endif | 426 | break; |
907 | spin_unlock_bh(&recent_lock); | 427 | case '+': |
908 | return; | 428 | c++; |
429 | default: | ||
430 | add = 1; | ||
431 | break; | ||
909 | } | 432 | } |
433 | addr = in_aton(c); | ||
910 | 434 | ||
911 | curr_table->count--; | 435 | spin_lock_bh(&recent_lock); |
912 | 436 | e = recent_entry_lookup(t, addr, 0); | |
913 | /* If count is still non-zero then there are still rules referenceing it so we do nothing */ | 437 | if (e == NULL) { |
914 | if(curr_table->count) { | 438 | if (add) |
915 | #ifdef DEBUG | 439 | recent_entry_init(t, addr, 0); |
916 | if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n"); | 440 | } else { |
917 | #endif | 441 | if (add) |
918 | spin_unlock_bh(&recent_lock); | 442 | recent_entry_update(t, e); |
919 | return; | 443 | else |
444 | recent_entry_remove(t, e); | ||
920 | } | 445 | } |
921 | |||
922 | #ifdef DEBUG | ||
923 | if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n"); | ||
924 | #endif | ||
925 | |||
926 | /* Count must be zero so we remove this table from the list */ | ||
927 | if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; | ||
928 | |||
929 | spin_unlock_bh(&recent_lock); | 446 | spin_unlock_bh(&recent_lock); |
447 | return size; | ||
448 | } | ||
930 | 449 | ||
931 | /* lock to make sure any late-runners still using this after we removed it from | 450 | static struct file_operations recent_fops = { |
932 | * the list finish up then remove everything */ | 451 | .open = recent_seq_open, |
933 | spin_lock_bh(&curr_table->list_lock); | 452 | .read = seq_read, |
934 | spin_unlock_bh(&curr_table->list_lock); | 453 | .write = recent_proc_write, |
935 | 454 | .release = seq_release_private, | |
936 | #ifdef CONFIG_PROC_FS | 455 | .owner = THIS_MODULE, |
937 | if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent); | 456 | }; |
938 | #endif /* CONFIG_PROC_FS */ | 457 | #endif /* CONFIG_PROC_FS */ |
939 | vfree(curr_table->table[0].last_pkts); | ||
940 | vfree(curr_table->table); | ||
941 | vfree(curr_table->hash_table); | ||
942 | vfree(curr_table->time_info); | ||
943 | vfree(curr_table); | ||
944 | |||
945 | #ifdef DEBUG | ||
946 | if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n"); | ||
947 | #endif | ||
948 | 458 | ||
949 | return; | ||
950 | } | ||
951 | |||
952 | /* This is the structure we pass to ipt_register to register our | ||
953 | * module with iptables. | ||
954 | */ | ||
955 | static struct ipt_match recent_match = { | 459 | static struct ipt_match recent_match = { |
956 | .name = "recent", | 460 | .name = "recent", |
957 | .match = match, | 461 | .match = ipt_recent_match, |
958 | .matchsize = sizeof(struct ipt_recent_info), | 462 | .matchsize = sizeof(struct ipt_recent_info), |
959 | .checkentry = checkentry, | 463 | .checkentry = ipt_recent_checkentry, |
960 | .destroy = destroy, | 464 | .destroy = ipt_recent_destroy, |
961 | .me = THIS_MODULE | 465 | .me = THIS_MODULE, |
962 | }; | 466 | }; |
963 | 467 | ||
964 | /* Kernel module initialization. */ | ||
965 | static int __init ipt_recent_init(void) | 468 | static int __init ipt_recent_init(void) |
966 | { | 469 | { |
967 | int err, count; | 470 | int err; |
968 | 471 | ||
969 | printk(version); | 472 | if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) |
970 | #ifdef CONFIG_PROC_FS | 473 | return -EINVAL; |
971 | proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net); | 474 | ip_list_hash_size = 1 << fls(ip_list_tot); |
972 | if(!proc_net_ipt_recent) return -ENOMEM; | ||
973 | #endif | ||
974 | |||
975 | if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) { | ||
976 | printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n"); | ||
977 | ip_list_hash_size = 0; | ||
978 | } | ||
979 | |||
980 | if(!ip_list_hash_size) { | ||
981 | ip_list_hash_size = ip_list_tot*3; | ||
982 | count = 2*2; | ||
983 | while(ip_list_hash_size > count) count = count*2; | ||
984 | ip_list_hash_size = count; | ||
985 | } | ||
986 | |||
987 | #ifdef DEBUG | ||
988 | if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size); | ||
989 | #endif | ||
990 | 475 | ||
991 | err = ipt_register_match(&recent_match); | 476 | err = ipt_register_match(&recent_match); |
477 | #ifdef CONFIG_PROC_FS | ||
992 | if (err) | 478 | if (err) |
993 | remove_proc_entry("ipt_recent", proc_net); | 479 | return err; |
480 | proc_dir = proc_mkdir("ipt_recent", proc_net); | ||
481 | if (proc_dir == NULL) { | ||
482 | ipt_unregister_match(&recent_match); | ||
483 | err = -ENOMEM; | ||
484 | } | ||
485 | #endif | ||
994 | return err; | 486 | return err; |
995 | } | 487 | } |
996 | 488 | ||
997 | /* Kernel module destruction. */ | 489 | static void __exit ipt_recent_exit(void) |
998 | static void __exit ipt_recent_fini(void) | ||
999 | { | 490 | { |
491 | BUG_ON(!list_empty(&tables)); | ||
1000 | ipt_unregister_match(&recent_match); | 492 | ipt_unregister_match(&recent_match); |
1001 | 493 | #ifdef CONFIG_PROC_FS | |
1002 | remove_proc_entry("ipt_recent",proc_net); | 494 | remove_proc_entry("ipt_recent", proc_net); |
495 | #endif | ||
1003 | } | 496 | } |
1004 | 497 | ||
1005 | /* Register our module with the kernel. */ | ||
1006 | module_init(ipt_recent_init); | 498 | module_init(ipt_recent_init); |
1007 | module_exit(ipt_recent_fini); | 499 | module_exit(ipt_recent_exit); |
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 77d974443c7b..8cc8e1b36778 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | |||
@@ -145,7 +145,7 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum, | |||
145 | 145 | ||
146 | /* This is where we call the helper: as the packet goes out. */ | 146 | /* This is where we call the helper: as the packet goes out. */ |
147 | ct = nf_ct_get(*pskb, &ctinfo); | 147 | ct = nf_ct_get(*pskb, &ctinfo); |
148 | if (!ct) | 148 | if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) |
149 | return NF_ACCEPT; | 149 | return NF_ACCEPT; |
150 | 150 | ||
151 | help = nfct_help(ct); | 151 | help = nfct_help(ct); |
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 4b0d361cc6e6..663a73ee3f2f 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c | |||
@@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, | |||
235 | } | 235 | } |
236 | 236 | ||
237 | /* See ip_conntrack_proto_tcp.c */ | 237 | /* See ip_conntrack_proto_tcp.c */ |
238 | if (hooknum == NF_IP_PRE_ROUTING && | 238 | if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && |
239 | nf_ip_checksum(skb, hooknum, dataoff, 0)) { | 239 | nf_ip_checksum(skb, hooknum, dataoff, 0)) { |
240 | if (LOG_INVALID(IPPROTO_ICMP)) | 240 | if (LOG_INVALID(IPPROTO_ICMP)) |
241 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, | 241 | nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, |
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index fc2562415555..bd221ec3f81e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c | |||
@@ -103,7 +103,7 @@ static void raw_v4_unhash(struct sock *sk) | |||
103 | } | 103 | } |
104 | 104 | ||
105 | struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, | 105 | struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, |
106 | unsigned long raddr, unsigned long laddr, | 106 | __be32 raddr, __be32 laddr, |
107 | int dif) | 107 | int dif) |
108 | { | 108 | { |
109 | struct hlist_node *node; | 109 | struct hlist_node *node; |
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6b6c3adfcf00..ce4cd5f35511 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c | |||
@@ -182,14 +182,6 @@ ctl_table ipv4_table[] = { | |||
182 | .strategy = &ipv4_doint_and_flush_strategy, | 182 | .strategy = &ipv4_doint_and_flush_strategy, |
183 | }, | 183 | }, |
184 | { | 184 | { |
185 | .ctl_name = NET_IPV4_AUTOCONFIG, | ||
186 | .procname = "ip_autoconfig", | ||
187 | .data = &ipv4_config.autoconfig, | ||
188 | .maxlen = sizeof(int), | ||
189 | .mode = 0644, | ||
190 | .proc_handler = &proc_dointvec | ||
191 | }, | ||
192 | { | ||
193 | .ctl_name = NET_IPV4_NO_PMTU_DISC, | 185 | .ctl_name = NET_IPV4_NO_PMTU_DISC, |
194 | .procname = "ip_no_pmtu_disc", | 186 | .procname = "ip_no_pmtu_disc", |
195 | .data = &ipv4_config.no_pmtu_disc, | 187 | .data = &ipv4_config.no_pmtu_disc, |
@@ -688,6 +680,24 @@ ctl_table ipv4_table[] = { | |||
688 | .mode = 0644, | 680 | .mode = 0644, |
689 | .proc_handler = &proc_dointvec | 681 | .proc_handler = &proc_dointvec |
690 | }, | 682 | }, |
683 | #ifdef CONFIG_NET_DMA | ||
684 | { | ||
685 | .ctl_name = NET_TCP_DMA_COPYBREAK, | ||
686 | .procname = "tcp_dma_copybreak", | ||
687 | .data = &sysctl_tcp_dma_copybreak, | ||
688 | .maxlen = sizeof(int), | ||
689 | .mode = 0644, | ||
690 | .proc_handler = &proc_dointvec | ||
691 | }, | ||
692 | #endif | ||
693 | { | ||
694 | .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE, | ||
695 | .procname = "tcp_slow_start_after_idle", | ||
696 | .data = &sysctl_tcp_slow_start_after_idle, | ||
697 | .maxlen = sizeof(int), | ||
698 | .mode = 0644, | ||
699 | .proc_handler = &proc_dointvec | ||
700 | }, | ||
691 | { .ctl_name = 0 } | 701 | { .ctl_name = 0 } |
692 | }; | 702 | }; |
693 | 703 | ||
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e2b7b8055037..74998f250071 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -263,7 +263,7 @@ | |||
263 | #include <net/tcp.h> | 263 | #include <net/tcp.h> |
264 | #include <net/xfrm.h> | 264 | #include <net/xfrm.h> |
265 | #include <net/ip.h> | 265 | #include <net/ip.h> |
266 | 266 | #include <net/netdma.h> | |
267 | 267 | ||
268 | #include <asm/uaccess.h> | 268 | #include <asm/uaccess.h> |
269 | #include <asm/ioctls.h> | 269 | #include <asm/ioctls.h> |
@@ -622,14 +622,10 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, | |||
622 | ssize_t res; | 622 | ssize_t res; |
623 | struct sock *sk = sock->sk; | 623 | struct sock *sk = sock->sk; |
624 | 624 | ||
625 | #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) | ||
626 | |||
627 | if (!(sk->sk_route_caps & NETIF_F_SG) || | 625 | if (!(sk->sk_route_caps & NETIF_F_SG) || |
628 | !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) | 626 | !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) |
629 | return sock_no_sendpage(sock, page, offset, size, flags); | 627 | return sock_no_sendpage(sock, page, offset, size, flags); |
630 | 628 | ||
631 | #undef TCP_ZC_CSUM_FLAGS | ||
632 | |||
633 | lock_sock(sk); | 629 | lock_sock(sk); |
634 | TCP_CHECK_TIMER(sk); | 630 | TCP_CHECK_TIMER(sk); |
635 | res = do_tcp_sendpages(sk, &page, offset, size, flags); | 631 | res = do_tcp_sendpages(sk, &page, offset, size, flags); |
@@ -726,9 +722,7 @@ new_segment: | |||
726 | /* | 722 | /* |
727 | * Check whether we can use HW checksum. | 723 | * Check whether we can use HW checksum. |
728 | */ | 724 | */ |
729 | if (sk->sk_route_caps & | 725 | if (sk->sk_route_caps & NETIF_F_ALL_CSUM) |
730 | (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | | ||
731 | NETIF_F_HW_CSUM)) | ||
732 | skb->ip_summed = CHECKSUM_HW; | 726 | skb->ip_summed = CHECKSUM_HW; |
733 | 727 | ||
734 | skb_entail(sk, tp, skb); | 728 | skb_entail(sk, tp, skb); |
@@ -937,7 +931,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo, | |||
937 | * calculation of whether or not we must ACK for the sake of | 931 | * calculation of whether or not we must ACK for the sake of |
938 | * a window update. | 932 | * a window update. |
939 | */ | 933 | */ |
940 | static void cleanup_rbuf(struct sock *sk, int copied) | 934 | void tcp_cleanup_rbuf(struct sock *sk, int copied) |
941 | { | 935 | { |
942 | struct tcp_sock *tp = tcp_sk(sk); | 936 | struct tcp_sock *tp = tcp_sk(sk); |
943 | int time_to_ack = 0; | 937 | int time_to_ack = 0; |
@@ -1072,11 +1066,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, | |||
1072 | break; | 1066 | break; |
1073 | } | 1067 | } |
1074 | if (skb->h.th->fin) { | 1068 | if (skb->h.th->fin) { |
1075 | sk_eat_skb(sk, skb); | 1069 | sk_eat_skb(sk, skb, 0); |
1076 | ++seq; | 1070 | ++seq; |
1077 | break; | 1071 | break; |
1078 | } | 1072 | } |
1079 | sk_eat_skb(sk, skb); | 1073 | sk_eat_skb(sk, skb, 0); |
1080 | if (!desc->count) | 1074 | if (!desc->count) |
1081 | break; | 1075 | break; |
1082 | } | 1076 | } |
@@ -1086,7 +1080,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, | |||
1086 | 1080 | ||
1087 | /* Clean up data we have read: This will do ACK frames. */ | 1081 | /* Clean up data we have read: This will do ACK frames. */ |
1088 | if (copied) | 1082 | if (copied) |
1089 | cleanup_rbuf(sk, copied); | 1083 | tcp_cleanup_rbuf(sk, copied); |
1090 | return copied; | 1084 | return copied; |
1091 | } | 1085 | } |
1092 | 1086 | ||
@@ -1110,6 +1104,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1110 | int target; /* Read at least this many bytes */ | 1104 | int target; /* Read at least this many bytes */ |
1111 | long timeo; | 1105 | long timeo; |
1112 | struct task_struct *user_recv = NULL; | 1106 | struct task_struct *user_recv = NULL; |
1107 | int copied_early = 0; | ||
1113 | 1108 | ||
1114 | lock_sock(sk); | 1109 | lock_sock(sk); |
1115 | 1110 | ||
@@ -1133,6 +1128,17 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1133 | 1128 | ||
1134 | target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); | 1129 | target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); |
1135 | 1130 | ||
1131 | #ifdef CONFIG_NET_DMA | ||
1132 | tp->ucopy.dma_chan = NULL; | ||
1133 | preempt_disable(); | ||
1134 | if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && | ||
1135 | !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) { | ||
1136 | preempt_enable_no_resched(); | ||
1137 | tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); | ||
1138 | } else | ||
1139 | preempt_enable_no_resched(); | ||
1140 | #endif | ||
1141 | |||
1136 | do { | 1142 | do { |
1137 | struct sk_buff *skb; | 1143 | struct sk_buff *skb; |
1138 | u32 offset; | 1144 | u32 offset; |
@@ -1220,7 +1226,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1220 | } | 1226 | } |
1221 | } | 1227 | } |
1222 | 1228 | ||
1223 | cleanup_rbuf(sk, copied); | 1229 | tcp_cleanup_rbuf(sk, copied); |
1224 | 1230 | ||
1225 | if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { | 1231 | if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { |
1226 | /* Install new reader */ | 1232 | /* Install new reader */ |
@@ -1274,6 +1280,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, | |||
1274 | } else | 1280 | } else |
1275 | sk_wait_data(sk, &timeo); | 1281 | sk_wait_data(sk, &timeo); |
1276 | 1282 | ||
1283 | #ifdef CONFIG_NET_DMA | ||
1284 | tp->ucopy.wakeup = 0; | ||
1285 | #endif | ||
1286 | |||
1277 | if (user_recv) { | 1287 | if (user_recv) { |
1278 | int chunk; | 1288 | int chunk; |
1279 | 1289 | ||
@@ -1329,13 +1339,39 @@ do_prequeue: | |||
1329 | } | 1339 | } |
1330 | 1340 | ||
1331 | if (!(flags & MSG_TRUNC)) { | 1341 | if (!(flags & MSG_TRUNC)) { |
1332 | err = skb_copy_datagram_iovec(skb, offset, | 1342 | #ifdef CONFIG_NET_DMA |
1333 | msg->msg_iov, used); | 1343 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) |
1334 | if (err) { | 1344 | tp->ucopy.dma_chan = get_softnet_dma(); |
1335 | /* Exception. Bailout! */ | 1345 | |
1336 | if (!copied) | 1346 | if (tp->ucopy.dma_chan) { |
1337 | copied = -EFAULT; | 1347 | tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( |
1338 | break; | 1348 | tp->ucopy.dma_chan, skb, offset, |
1349 | msg->msg_iov, used, | ||
1350 | tp->ucopy.pinned_list); | ||
1351 | |||
1352 | if (tp->ucopy.dma_cookie < 0) { | ||
1353 | |||
1354 | printk(KERN_ALERT "dma_cookie < 0\n"); | ||
1355 | |||
1356 | /* Exception. Bailout! */ | ||
1357 | if (!copied) | ||
1358 | copied = -EFAULT; | ||
1359 | break; | ||
1360 | } | ||
1361 | if ((offset + used) == skb->len) | ||
1362 | copied_early = 1; | ||
1363 | |||
1364 | } else | ||
1365 | #endif | ||
1366 | { | ||
1367 | err = skb_copy_datagram_iovec(skb, offset, | ||
1368 | msg->msg_iov, used); | ||
1369 | if (err) { | ||
1370 | /* Exception. Bailout! */ | ||
1371 | if (!copied) | ||
1372 | copied = -EFAULT; | ||
1373 | break; | ||
1374 | } | ||
1339 | } | 1375 | } |
1340 | } | 1376 | } |
1341 | 1377 | ||
@@ -1355,15 +1391,19 @@ skip_copy: | |||
1355 | 1391 | ||
1356 | if (skb->h.th->fin) | 1392 | if (skb->h.th->fin) |
1357 | goto found_fin_ok; | 1393 | goto found_fin_ok; |
1358 | if (!(flags & MSG_PEEK)) | 1394 | if (!(flags & MSG_PEEK)) { |
1359 | sk_eat_skb(sk, skb); | 1395 | sk_eat_skb(sk, skb, copied_early); |
1396 | copied_early = 0; | ||
1397 | } | ||
1360 | continue; | 1398 | continue; |
1361 | 1399 | ||
1362 | found_fin_ok: | 1400 | found_fin_ok: |
1363 | /* Process the FIN. */ | 1401 | /* Process the FIN. */ |
1364 | ++*seq; | 1402 | ++*seq; |
1365 | if (!(flags & MSG_PEEK)) | 1403 | if (!(flags & MSG_PEEK)) { |
1366 | sk_eat_skb(sk, skb); | 1404 | sk_eat_skb(sk, skb, copied_early); |
1405 | copied_early = 0; | ||
1406 | } | ||
1367 | break; | 1407 | break; |
1368 | } while (len > 0); | 1408 | } while (len > 0); |
1369 | 1409 | ||
@@ -1386,12 +1426,42 @@ skip_copy: | |||
1386 | tp->ucopy.len = 0; | 1426 | tp->ucopy.len = 0; |
1387 | } | 1427 | } |
1388 | 1428 | ||
1429 | #ifdef CONFIG_NET_DMA | ||
1430 | if (tp->ucopy.dma_chan) { | ||
1431 | struct sk_buff *skb; | ||
1432 | dma_cookie_t done, used; | ||
1433 | |||
1434 | dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); | ||
1435 | |||
1436 | while (dma_async_memcpy_complete(tp->ucopy.dma_chan, | ||
1437 | tp->ucopy.dma_cookie, &done, | ||
1438 | &used) == DMA_IN_PROGRESS) { | ||
1439 | /* do partial cleanup of sk_async_wait_queue */ | ||
1440 | while ((skb = skb_peek(&sk->sk_async_wait_queue)) && | ||
1441 | (dma_async_is_complete(skb->dma_cookie, done, | ||
1442 | used) == DMA_SUCCESS)) { | ||
1443 | __skb_dequeue(&sk->sk_async_wait_queue); | ||
1444 | kfree_skb(skb); | ||
1445 | } | ||
1446 | } | ||
1447 | |||
1448 | /* Safe to free early-copied skbs now */ | ||
1449 | __skb_queue_purge(&sk->sk_async_wait_queue); | ||
1450 | dma_chan_put(tp->ucopy.dma_chan); | ||
1451 | tp->ucopy.dma_chan = NULL; | ||
1452 | } | ||
1453 | if (tp->ucopy.pinned_list) { | ||
1454 | dma_unpin_iovec_pages(tp->ucopy.pinned_list); | ||
1455 | tp->ucopy.pinned_list = NULL; | ||
1456 | } | ||
1457 | #endif | ||
1458 | |||
1389 | /* According to UNIX98, msg_name/msg_namelen are ignored | 1459 | /* According to UNIX98, msg_name/msg_namelen are ignored |
1390 | * on connected socket. I was just happy when found this 8) --ANK | 1460 | * on connected socket. I was just happy when found this 8) --ANK |
1391 | */ | 1461 | */ |
1392 | 1462 | ||
1393 | /* Clean up data we have read: This will do ACK frames. */ | 1463 | /* Clean up data we have read: This will do ACK frames. */ |
1394 | cleanup_rbuf(sk, copied); | 1464 | tcp_cleanup_rbuf(sk, copied); |
1395 | 1465 | ||
1396 | TCP_CHECK_TIMER(sk); | 1466 | TCP_CHECK_TIMER(sk); |
1397 | release_sock(sk); | 1467 | release_sock(sk); |
@@ -1658,6 +1728,9 @@ int tcp_disconnect(struct sock *sk, int flags) | |||
1658 | __skb_queue_purge(&sk->sk_receive_queue); | 1728 | __skb_queue_purge(&sk->sk_receive_queue); |
1659 | sk_stream_writequeue_purge(sk); | 1729 | sk_stream_writequeue_purge(sk); |
1660 | __skb_queue_purge(&tp->out_of_order_queue); | 1730 | __skb_queue_purge(&tp->out_of_order_queue); |
1731 | #ifdef CONFIG_NET_DMA | ||
1732 | __skb_queue_purge(&sk->sk_async_wait_queue); | ||
1733 | #endif | ||
1661 | 1734 | ||
1662 | inet->dport = 0; | 1735 | inet->dport = 0; |
1663 | 1736 | ||
@@ -1858,7 +1931,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, | |||
1858 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && | 1931 | (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && |
1859 | inet_csk_ack_scheduled(sk)) { | 1932 | inet_csk_ack_scheduled(sk)) { |
1860 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; | 1933 | icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; |
1861 | cleanup_rbuf(sk, 1); | 1934 | tcp_cleanup_rbuf(sk, 1); |
1862 | if (!(val & 1)) | 1935 | if (!(val & 1)) |
1863 | icsk->icsk_ack.pingpong = 1; | 1936 | icsk->icsk_ack.pingpong = 1; |
1864 | } | 1937 | } |
diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 035f2092d73a..b2d9021ad22b 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c | |||
@@ -198,12 +198,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk) | |||
198 | return max(tp->snd_cwnd, ca->last_max_cwnd); | 198 | return max(tp->snd_cwnd, ca->last_max_cwnd); |
199 | } | 199 | } |
200 | 200 | ||
201 | static u32 bictcp_min_cwnd(struct sock *sk) | ||
202 | { | ||
203 | const struct tcp_sock *tp = tcp_sk(sk); | ||
204 | return tp->snd_ssthresh; | ||
205 | } | ||
206 | |||
207 | static void bictcp_state(struct sock *sk, u8 new_state) | 201 | static void bictcp_state(struct sock *sk, u8 new_state) |
208 | { | 202 | { |
209 | if (new_state == TCP_CA_Loss) | 203 | if (new_state == TCP_CA_Loss) |
@@ -231,7 +225,6 @@ static struct tcp_congestion_ops bictcp = { | |||
231 | .cong_avoid = bictcp_cong_avoid, | 225 | .cong_avoid = bictcp_cong_avoid, |
232 | .set_state = bictcp_state, | 226 | .set_state = bictcp_state, |
233 | .undo_cwnd = bictcp_undo_cwnd, | 227 | .undo_cwnd = bictcp_undo_cwnd, |
234 | .min_cwnd = bictcp_min_cwnd, | ||
235 | .pkts_acked = bictcp_acked, | 228 | .pkts_acked = bictcp_acked, |
236 | .owner = THIS_MODULE, | 229 | .owner = THIS_MODULE, |
237 | .name = "bic", | 230 | .name = "bic", |
diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c new file mode 100644 index 000000000000..bc54f7e9aea9 --- /dev/null +++ b/net/ipv4/tcp_compound.c | |||
@@ -0,0 +1,448 @@ | |||
1 | /* | ||
2 | * TCP Vegas congestion control | ||
3 | * | ||
4 | * This is based on the congestion detection/avoidance scheme described in | ||
5 | * Lawrence S. Brakmo and Larry L. Peterson. | ||
6 | * "TCP Vegas: End to end congestion avoidance on a global internet." | ||
7 | * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, | ||
8 | * October 1995. Available from: | ||
9 | * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps | ||
10 | * | ||
11 | * See http://www.cs.arizona.edu/xkernel/ for their implementation. | ||
12 | * The main aspects that distinguish this implementation from the | ||
13 | * Arizona Vegas implementation are: | ||
14 | * o We do not change the loss detection or recovery mechanisms of | ||
15 | * Linux in any way. Linux already recovers from losses quite well, | ||
16 | * using fine-grained timers, NewReno, and FACK. | ||
17 | * o To avoid the performance penalty imposed by increasing cwnd | ||
18 | * only every-other RTT during slow start, we increase during | ||
19 | * every RTT during slow start, just like Reno. | ||
20 | * o Largely to allow continuous cwnd growth during slow start, | ||
21 | * we use the rate at which ACKs come back as the "actual" | ||
22 | * rate, rather than the rate at which data is sent. | ||
23 | * o To speed convergence to the right rate, we set the cwnd | ||
24 | * to achieve the right ("actual") rate when we exit slow start. | ||
25 | * o To filter out the noise caused by delayed ACKs, we use the | ||
26 | * minimum RTT sample observed during the last RTT to calculate | ||
27 | * the actual rate. | ||
28 | * o When the sender re-starts from idle, it waits until it has | ||
29 | * received ACKs for an entire flight of new data before making | ||
30 | * a cwnd adjustment decision. The original Vegas implementation | ||
31 | * assumed senders never went idle. | ||
32 | * | ||
33 | * | ||
34 | * TCP Compound based on TCP Vegas | ||
35 | * | ||
36 | * further details can be found here: | ||
37 | * ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf | ||
38 | */ | ||
39 | |||
40 | #include <linux/config.h> | ||
41 | #include <linux/mm.h> | ||
42 | #include <linux/module.h> | ||
43 | #include <linux/skbuff.h> | ||
44 | #include <linux/inet_diag.h> | ||
45 | |||
46 | #include <net/tcp.h> | ||
47 | |||
48 | /* Default values of the Vegas variables, in fixed-point representation | ||
49 | * with V_PARAM_SHIFT bits to the right of the binary point. | ||
50 | */ | ||
51 | #define V_PARAM_SHIFT 1 | ||
52 | |||
53 | #define TCP_COMPOUND_ALPHA 3U | ||
54 | #define TCP_COMPOUND_BETA 1U | ||
55 | #define TCP_COMPOUND_GAMMA 30 | ||
56 | #define TCP_COMPOUND_ZETA 1 | ||
57 | |||
58 | /* TCP compound variables */ | ||
59 | struct compound { | ||
60 | u32 beg_snd_nxt; /* right edge during last RTT */ | ||
61 | u32 beg_snd_una; /* left edge during last RTT */ | ||
62 | u32 beg_snd_cwnd; /* saves the size of the cwnd */ | ||
63 | u8 doing_vegas_now; /* if true, do vegas for this RTT */ | ||
64 | u16 cntRTT; /* # of RTTs measured within last RTT */ | ||
65 | u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ | ||
66 | u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ | ||
67 | |||
68 | u32 cwnd; | ||
69 | u32 dwnd; | ||
70 | }; | ||
71 | |||
72 | /* There are several situations when we must "re-start" Vegas: | ||
73 | * | ||
74 | * o when a connection is established | ||
75 | * o after an RTO | ||
76 | * o after fast recovery | ||
77 | * o when we send a packet and there is no outstanding | ||
78 | * unacknowledged data (restarting an idle connection) | ||
79 | * | ||
80 | * In these circumstances we cannot do a Vegas calculation at the | ||
81 | * end of the first RTT, because any calculation we do is using | ||
82 | * stale info -- both the saved cwnd and congestion feedback are | ||
83 | * stale. | ||
84 | * | ||
85 | * Instead we must wait until the completion of an RTT during | ||
86 | * which we actually receive ACKs. | ||
87 | */ | ||
88 | static inline void vegas_enable(struct sock *sk) | ||
89 | { | ||
90 | const struct tcp_sock *tp = tcp_sk(sk); | ||
91 | struct compound *vegas = inet_csk_ca(sk); | ||
92 | |||
93 | /* Begin taking Vegas samples next time we send something. */ | ||
94 | vegas->doing_vegas_now = 1; | ||
95 | |||
96 | /* Set the beginning of the next send window. */ | ||
97 | vegas->beg_snd_nxt = tp->snd_nxt; | ||
98 | |||
99 | vegas->cntRTT = 0; | ||
100 | vegas->minRTT = 0x7fffffff; | ||
101 | } | ||
102 | |||
103 | /* Stop taking Vegas samples for now. */ | ||
104 | static inline void vegas_disable(struct sock *sk) | ||
105 | { | ||
106 | struct compound *vegas = inet_csk_ca(sk); | ||
107 | |||
108 | vegas->doing_vegas_now = 0; | ||
109 | } | ||
110 | |||
111 | static void tcp_compound_init(struct sock *sk) | ||
112 | { | ||
113 | struct compound *vegas = inet_csk_ca(sk); | ||
114 | const struct tcp_sock *tp = tcp_sk(sk); | ||
115 | |||
116 | vegas->baseRTT = 0x7fffffff; | ||
117 | vegas_enable(sk); | ||
118 | |||
119 | vegas->dwnd = 0; | ||
120 | vegas->cwnd = tp->snd_cwnd; | ||
121 | } | ||
122 | |||
123 | /* Do RTT sampling needed for Vegas. | ||
124 | * Basically we: | ||
125 | * o min-filter RTT samples from within an RTT to get the current | ||
126 | * propagation delay + queuing delay (we are min-filtering to try to | ||
127 | * avoid the effects of delayed ACKs) | ||
128 | * o min-filter RTT samples from a much longer window (forever for now) | ||
129 | * to find the propagation delay (baseRTT) | ||
130 | */ | ||
131 | static void tcp_compound_rtt_calc(struct sock *sk, u32 usrtt) | ||
132 | { | ||
133 | struct compound *vegas = inet_csk_ca(sk); | ||
134 | u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ | ||
135 | |||
136 | /* Filter to find propagation delay: */ | ||
137 | if (vrtt < vegas->baseRTT) | ||
138 | vegas->baseRTT = vrtt; | ||
139 | |||
140 | /* Find the min RTT during the last RTT to find | ||
141 | * the current prop. delay + queuing delay: | ||
142 | */ | ||
143 | |||
144 | vegas->minRTT = min(vegas->minRTT, vrtt); | ||
145 | vegas->cntRTT++; | ||
146 | } | ||
147 | |||
148 | static void tcp_compound_state(struct sock *sk, u8 ca_state) | ||
149 | { | ||
150 | |||
151 | if (ca_state == TCP_CA_Open) | ||
152 | vegas_enable(sk); | ||
153 | else | ||
154 | vegas_disable(sk); | ||
155 | } | ||
156 | |||
157 | |||
158 | /* 64bit divisor, dividend and result. dynamic precision */ | ||
159 | static inline u64 div64_64(u64 dividend, u64 divisor) | ||
160 | { | ||
161 | u32 d = divisor; | ||
162 | |||
163 | if (divisor > 0xffffffffULL) { | ||
164 | unsigned int shift = fls(divisor >> 32); | ||
165 | |||
166 | d = divisor >> shift; | ||
167 | dividend >>= shift; | ||
168 | } | ||
169 | |||
170 | /* avoid 64 bit division if possible */ | ||
171 | if (dividend >> 32) | ||
172 | do_div(dividend, d); | ||
173 | else | ||
174 | dividend = (u32) dividend / d; | ||
175 | |||
176 | return dividend; | ||
177 | } | ||
178 | |||
179 | /* calculate the quartic root of "a" using Newton-Raphson */ | ||
180 | static u32 qroot(u64 a) | ||
181 | { | ||
182 | u32 x, x1; | ||
183 | |||
184 | /* Initial estimate is based on: | ||
185 | * qrt(x) = exp(log(x) / 4) | ||
186 | */ | ||
187 | x = 1u << (fls64(a) >> 2); | ||
188 | |||
189 | /* | ||
190 | * Iteration based on: | ||
191 | * 3 | ||
192 | * x = ( 3 * x + a / x ) / 4 | ||
193 | * k+1 k k | ||
194 | */ | ||
195 | do { | ||
196 | u64 x3 = x; | ||
197 | |||
198 | x1 = x; | ||
199 | x3 *= x; | ||
200 | x3 *= x; | ||
201 | |||
202 | x = (3 * x + (u32) div64_64(a, x3)) / 4; | ||
203 | } while (abs(x1 - x) > 1); | ||
204 | |||
205 | return x; | ||
206 | } | ||
207 | |||
208 | |||
209 | /* | ||
210 | * If the connection is idle and we are restarting, | ||
211 | * then we don't want to do any Vegas calculations | ||
212 | * until we get fresh RTT samples. So when we | ||
213 | * restart, we reset our Vegas state to a clean | ||
214 | * slate. After we get acks for this flight of | ||
215 | * packets, _then_ we can make Vegas calculations | ||
216 | * again. | ||
217 | */ | ||
218 | static void tcp_compound_cwnd_event(struct sock *sk, enum tcp_ca_event event) | ||
219 | { | ||
220 | if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) | ||
221 | tcp_compound_init(sk); | ||
222 | } | ||
223 | |||
224 | static void tcp_compound_cong_avoid(struct sock *sk, u32 ack, | ||
225 | u32 seq_rtt, u32 in_flight, int flag) | ||
226 | { | ||
227 | struct tcp_sock *tp = tcp_sk(sk); | ||
228 | struct compound *vegas = inet_csk_ca(sk); | ||
229 | u8 inc = 0; | ||
230 | |||
231 | if (vegas->cwnd + vegas->dwnd > tp->snd_cwnd) { | ||
232 | if (vegas->cwnd > tp->snd_cwnd || vegas->dwnd > tp->snd_cwnd) { | ||
233 | vegas->cwnd = tp->snd_cwnd; | ||
234 | vegas->dwnd = 0; | ||
235 | } else | ||
236 | vegas->cwnd = tp->snd_cwnd - vegas->dwnd; | ||
237 | |||
238 | } | ||
239 | |||
240 | if (!tcp_is_cwnd_limited(sk, in_flight)) | ||
241 | return; | ||
242 | |||
243 | if (vegas->cwnd <= tp->snd_ssthresh) | ||
244 | inc = 1; | ||
245 | else if (tp->snd_cwnd_cnt < tp->snd_cwnd) | ||
246 | tp->snd_cwnd_cnt++; | ||
247 | |||
248 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | ||
249 | inc = 1; | ||
250 | tp->snd_cwnd_cnt = 0; | ||
251 | } | ||
252 | |||
253 | if (inc && tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
254 | vegas->cwnd++; | ||
255 | |||
256 | /* The key players are v_beg_snd_una and v_beg_snd_nxt. | ||
257 | * | ||
258 | * These are so named because they represent the approximate values | ||
259 | * of snd_una and snd_nxt at the beginning of the current RTT. More | ||
260 | * precisely, they represent the amount of data sent during the RTT. | ||
261 | * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, | ||
262 | * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding | ||
263 | * bytes of data have been ACKed during the course of the RTT, giving | ||
264 | * an "actual" rate of: | ||
265 | * | ||
266 | * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) | ||
267 | * | ||
268 | * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, | ||
269 | * because delayed ACKs can cover more than one segment, so they | ||
270 | * don't line up nicely with the boundaries of RTTs. | ||
271 | * | ||
272 | * Another unfortunate fact of life is that delayed ACKs delay the | ||
273 | * advance of the left edge of our send window, so that the number | ||
274 | * of bytes we send in an RTT is often less than our cwnd will allow. | ||
275 | * So we keep track of our cwnd separately, in v_beg_snd_cwnd. | ||
276 | */ | ||
277 | |||
278 | if (after(ack, vegas->beg_snd_nxt)) { | ||
279 | /* Do the Vegas once-per-RTT cwnd adjustment. */ | ||
280 | u32 old_wnd, old_snd_cwnd; | ||
281 | |||
282 | /* Here old_wnd is essentially the window of data that was | ||
283 | * sent during the previous RTT, and has all | ||
284 | * been acknowledged in the course of the RTT that ended | ||
285 | * with the ACK we just received. Likewise, old_snd_cwnd | ||
286 | * is the cwnd during the previous RTT. | ||
287 | */ | ||
288 | if (!tp->mss_cache) | ||
289 | return; | ||
290 | |||
291 | old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / | ||
292 | tp->mss_cache; | ||
293 | old_snd_cwnd = vegas->beg_snd_cwnd; | ||
294 | |||
295 | /* Save the extent of the current window so we can use this | ||
296 | * at the end of the next RTT. | ||
297 | */ | ||
298 | vegas->beg_snd_una = vegas->beg_snd_nxt; | ||
299 | vegas->beg_snd_nxt = tp->snd_nxt; | ||
300 | vegas->beg_snd_cwnd = tp->snd_cwnd; | ||
301 | |||
302 | /* We do the Vegas calculations only if we got enough RTT | ||
303 | * samples that we can be reasonably sure that we got | ||
304 | * at least one RTT sample that wasn't from a delayed ACK. | ||
305 | * If we only had 2 samples total, | ||
306 | * then that means we're getting only 1 ACK per RTT, which | ||
307 | * means they're almost certainly delayed ACKs. | ||
308 | * If we have 3 samples, we should be OK. | ||
309 | */ | ||
310 | |||
311 | if (vegas->cntRTT > 2) { | ||
312 | u32 rtt, target_cwnd, diff; | ||
313 | u32 brtt, dwnd; | ||
314 | |||
315 | /* We have enough RTT samples, so, using the Vegas | ||
316 | * algorithm, we determine if we should increase or | ||
317 | * decrease cwnd, and by how much. | ||
318 | */ | ||
319 | |||
320 | /* Pluck out the RTT we are using for the Vegas | ||
321 | * calculations. This is the min RTT seen during the | ||
322 | * last RTT. Taking the min filters out the effects | ||
323 | * of delayed ACKs, at the cost of noticing congestion | ||
324 | * a bit later. | ||
325 | */ | ||
326 | rtt = vegas->minRTT; | ||
327 | |||
328 | /* Calculate the cwnd we should have, if we weren't | ||
329 | * going too fast. | ||
330 | * | ||
331 | * This is: | ||
332 | * (actual rate in segments) * baseRTT | ||
333 | * We keep it as a fixed point number with | ||
334 | * V_PARAM_SHIFT bits to the right of the binary point. | ||
335 | */ | ||
336 | if (!rtt) | ||
337 | return; | ||
338 | |||
339 | brtt = vegas->baseRTT; | ||
340 | target_cwnd = ((old_wnd * brtt) | ||
341 | << V_PARAM_SHIFT) / rtt; | ||
342 | |||
343 | /* Calculate the difference between the window we had, | ||
344 | * and the window we would like to have. This quantity | ||
345 | * is the "Diff" from the Arizona Vegas papers. | ||
346 | * | ||
347 | * Again, this is a fixed point number with | ||
348 | * V_PARAM_SHIFT bits to the right of the binary | ||
349 | * point. | ||
350 | */ | ||
351 | |||
352 | diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; | ||
353 | |||
354 | dwnd = vegas->dwnd; | ||
355 | |||
356 | if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) { | ||
357 | u64 v; | ||
358 | u32 x; | ||
359 | |||
360 | /* | ||
361 | * The TCP Compound paper describes how the choice | ||
362 | * of "k" determines the aggressiveness, | ||
363 | * i.e. the slope of the response function. | ||
364 | * | ||
365 | * To match HSTCP the value would be 0.8, | ||
366 | * but for computational reasons, both the | ||
367 | * original authors and this implementation | ||
368 | * use 0.75. | ||
369 | */ | ||
370 | v = old_wnd; | ||
371 | x = qroot(v * v * v) >> TCP_COMPOUND_ALPHA; | ||
372 | if (x > 1) | ||
373 | dwnd = x - 1; | ||
374 | else | ||
375 | dwnd = 0; | ||
376 | |||
377 | dwnd += vegas->dwnd; | ||
378 | |||
379 | } else if ((dwnd << V_PARAM_SHIFT) < | ||
380 | (diff * TCP_COMPOUND_BETA)) | ||
381 | dwnd = 0; | ||
382 | else | ||
383 | dwnd = | ||
384 | ((dwnd << V_PARAM_SHIFT) - | ||
385 | (diff * | ||
386 | TCP_COMPOUND_BETA)) >> V_PARAM_SHIFT; | ||
387 | |||
388 | vegas->dwnd = dwnd; | ||
389 | |||
390 | } | ||
391 | |||
392 | /* Wipe the slate clean for the next RTT. */ | ||
393 | vegas->cntRTT = 0; | ||
394 | vegas->minRTT = 0x7fffffff; | ||
395 | } | ||
396 | |||
397 | tp->snd_cwnd = vegas->cwnd + vegas->dwnd; | ||
398 | } | ||
399 | |||
400 | /* Extract info for Tcp socket info provided via netlink. */ | ||
401 | static void tcp_compound_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) | ||
402 | { | ||
403 | const struct compound *ca = inet_csk_ca(sk); | ||
404 | if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { | ||
405 | struct tcpvegas_info *info; | ||
406 | |||
407 | info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, | ||
408 | sizeof(*info))); | ||
409 | |||
410 | info->tcpv_enabled = ca->doing_vegas_now; | ||
411 | info->tcpv_rttcnt = ca->cntRTT; | ||
412 | info->tcpv_rtt = ca->baseRTT; | ||
413 | info->tcpv_minrtt = ca->minRTT; | ||
414 | rtattr_failure:; | ||
415 | } | ||
416 | } | ||
417 | |||
418 | static struct tcp_congestion_ops tcp_compound = { | ||
419 | .init = tcp_compound_init, | ||
420 | .ssthresh = tcp_reno_ssthresh, | ||
421 | .cong_avoid = tcp_compound_cong_avoid, | ||
422 | .rtt_sample = tcp_compound_rtt_calc, | ||
423 | .set_state = tcp_compound_state, | ||
424 | .cwnd_event = tcp_compound_cwnd_event, | ||
425 | .get_info = tcp_compound_get_info, | ||
426 | |||
427 | .owner = THIS_MODULE, | ||
428 | .name = "compound", | ||
429 | }; | ||
430 | |||
431 | static int __init tcp_compound_register(void) | ||
432 | { | ||
433 | BUG_ON(sizeof(struct compound) > ICSK_CA_PRIV_SIZE); | ||
434 | tcp_register_congestion_control(&tcp_compound); | ||
435 | return 0; | ||
436 | } | ||
437 | |||
438 | static void __exit tcp_compound_unregister(void) | ||
439 | { | ||
440 | tcp_unregister_congestion_control(&tcp_compound); | ||
441 | } | ||
442 | |||
443 | module_init(tcp_compound_register); | ||
444 | module_exit(tcp_compound_unregister); | ||
445 | |||
446 | MODULE_AUTHOR("Angelo P. Castellani, Stephen Hemminger"); | ||
447 | MODULE_LICENSE("GPL"); | ||
448 | MODULE_DESCRIPTION("TCP Compound"); | ||
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 91c2f41c7f58..857eefc52aab 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c | |||
@@ -38,7 +38,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) | |||
38 | int ret = 0; | 38 | int ret = 0; |
39 | 39 | ||
40 | /* all algorithms must implement ssthresh and cong_avoid ops */ | 40 | /* all algorithms must implement ssthresh and cong_avoid ops */ |
41 | if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { | 41 | if (!ca->ssthresh || !ca->cong_avoid) { |
42 | printk(KERN_ERR "TCP %s does not implement required ops\n", | 42 | printk(KERN_ERR "TCP %s does not implement required ops\n", |
43 | ca->name); | 43 | ca->name); |
44 | return -EINVAL; | 44 | return -EINVAL; |
@@ -251,8 +251,8 @@ u32 tcp_reno_ssthresh(struct sock *sk) | |||
251 | } | 251 | } |
252 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); | 252 | EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); |
253 | 253 | ||
254 | /* Lower bound on congestion window. */ | 254 | /* Lower bound on congestion window with halving. */ |
255 | u32 tcp_reno_min_cwnd(struct sock *sk) | 255 | u32 tcp_reno_min_cwnd(const struct sock *sk) |
256 | { | 256 | { |
257 | const struct tcp_sock *tp = tcp_sk(sk); | 257 | const struct tcp_sock *tp = tcp_sk(sk); |
258 | return tp->snd_ssthresh/2; | 258 | return tp->snd_ssthresh/2; |
diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 31a4986dfbf7..78b7a6b9e4de 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c | |||
@@ -325,11 +325,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk) | |||
325 | return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); | 325 | return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); |
326 | } | 326 | } |
327 | 327 | ||
328 | static u32 bictcp_min_cwnd(struct sock *sk) | ||
329 | { | ||
330 | return tcp_sk(sk)->snd_ssthresh; | ||
331 | } | ||
332 | |||
333 | static void bictcp_state(struct sock *sk, u8 new_state) | 328 | static void bictcp_state(struct sock *sk, u8 new_state) |
334 | { | 329 | { |
335 | if (new_state == TCP_CA_Loss) | 330 | if (new_state == TCP_CA_Loss) |
@@ -357,7 +352,6 @@ static struct tcp_congestion_ops cubictcp = { | |||
357 | .cong_avoid = bictcp_cong_avoid, | 352 | .cong_avoid = bictcp_cong_avoid, |
358 | .set_state = bictcp_state, | 353 | .set_state = bictcp_state, |
359 | .undo_cwnd = bictcp_undo_cwnd, | 354 | .undo_cwnd = bictcp_undo_cwnd, |
360 | .min_cwnd = bictcp_min_cwnd, | ||
361 | .pkts_acked = bictcp_acked, | 355 | .pkts_acked = bictcp_acked, |
362 | .owner = THIS_MODULE, | 356 | .owner = THIS_MODULE, |
363 | .name = "cubic", | 357 | .name = "cubic", |
diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index ba7c63ca5bb1..1120245b2373 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c | |||
@@ -98,6 +98,10 @@ struct hstcp { | |||
98 | u32 ai; | 98 | u32 ai; |
99 | }; | 99 | }; |
100 | 100 | ||
101 | static int max_ssthresh = 100; | ||
102 | module_param(max_ssthresh, int, 0644); | ||
103 | MODULE_PARM_DESC(max_ssthresh, "limited slow start threshold (RFC3742)"); | ||
104 | |||
101 | static void hstcp_init(struct sock *sk) | 105 | static void hstcp_init(struct sock *sk) |
102 | { | 106 | { |
103 | struct tcp_sock *tp = tcp_sk(sk); | 107 | struct tcp_sock *tp = tcp_sk(sk); |
@@ -119,9 +123,23 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, | |||
119 | if (!tcp_is_cwnd_limited(sk, in_flight)) | 123 | if (!tcp_is_cwnd_limited(sk, in_flight)) |
120 | return; | 124 | return; |
121 | 125 | ||
122 | if (tp->snd_cwnd <= tp->snd_ssthresh) | 126 | if (tp->snd_cwnd <= tp->snd_ssthresh) { |
123 | tcp_slow_start(tp); | 127 | /* RFC3742: limited slow start |
124 | else { | 128 | * the window is increased by 1/K MSS for each arriving ACK, |
129 | * for K = int(cwnd/(0.5 max_ssthresh)) | ||
130 | */ | ||
131 | if (max_ssthresh > 0 && tp->snd_cwnd > max_ssthresh) { | ||
132 | u32 k = max(tp->snd_cwnd / (max_ssthresh >> 1), 1U); | ||
133 | if (++tp->snd_cwnd_cnt >= k) { | ||
134 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
135 | tp->snd_cwnd++; | ||
136 | tp->snd_cwnd_cnt = 0; | ||
137 | } | ||
138 | } else { | ||
139 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
140 | tp->snd_cwnd++; | ||
141 | } | ||
142 | } else { | ||
125 | /* Update AIMD parameters */ | 143 | /* Update AIMD parameters */ |
126 | if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { | 144 | if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { |
127 | while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && | 145 | while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && |
diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 1b2ff53f98ed..3d92c1859267 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c | |||
@@ -246,14 +246,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, | |||
246 | } | 246 | } |
247 | } | 247 | } |
248 | 248 | ||
249 | /* Lower bound on congestion window. */ | ||
250 | static u32 htcp_min_cwnd(struct sock *sk) | ||
251 | { | ||
252 | const struct tcp_sock *tp = tcp_sk(sk); | ||
253 | return tp->snd_ssthresh; | ||
254 | } | ||
255 | |||
256 | |||
257 | static void htcp_init(struct sock *sk) | 249 | static void htcp_init(struct sock *sk) |
258 | { | 250 | { |
259 | struct htcp *ca = inet_csk_ca(sk); | 251 | struct htcp *ca = inet_csk_ca(sk); |
@@ -285,7 +277,6 @@ static void htcp_state(struct sock *sk, u8 new_state) | |||
285 | static struct tcp_congestion_ops htcp = { | 277 | static struct tcp_congestion_ops htcp = { |
286 | .init = htcp_init, | 278 | .init = htcp_init, |
287 | .ssthresh = htcp_recalc_ssthresh, | 279 | .ssthresh = htcp_recalc_ssthresh, |
288 | .min_cwnd = htcp_min_cwnd, | ||
289 | .cong_avoid = htcp_cong_avoid, | 280 | .cong_avoid = htcp_cong_avoid, |
290 | .set_state = htcp_state, | 281 | .set_state = htcp_state, |
291 | .undo_cwnd = htcp_cwnd_undo, | 282 | .undo_cwnd = htcp_cwnd_undo, |
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b5521a9d3dc1..e08245bdda3a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c | |||
@@ -71,6 +71,7 @@ | |||
71 | #include <net/inet_common.h> | 71 | #include <net/inet_common.h> |
72 | #include <linux/ipsec.h> | 72 | #include <linux/ipsec.h> |
73 | #include <asm/unaligned.h> | 73 | #include <asm/unaligned.h> |
74 | #include <net/netdma.h> | ||
74 | 75 | ||
75 | int sysctl_tcp_timestamps = 1; | 76 | int sysctl_tcp_timestamps = 1; |
76 | int sysctl_tcp_window_scaling = 1; | 77 | int sysctl_tcp_window_scaling = 1; |
@@ -1688,17 +1689,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) | |||
1688 | tp->snd_cwnd_stamp = tcp_time_stamp; | 1689 | tp->snd_cwnd_stamp = tcp_time_stamp; |
1689 | } | 1690 | } |
1690 | 1691 | ||
1692 | /* Lower bound on congestion window is slow start threshold | ||
1693 | * unless congestion avoidance choice decides to override it. | ||
1694 | */ | ||
1695 | static inline u32 tcp_cwnd_min(const struct sock *sk) | ||
1696 | { | ||
1697 | const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; | ||
1698 | |||
1699 | return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; | ||
1700 | } | ||
1701 | |||
1691 | /* Decrease cwnd each second ack. */ | 1702 | /* Decrease cwnd each second ack. */ |
1692 | static void tcp_cwnd_down(struct sock *sk) | 1703 | static void tcp_cwnd_down(struct sock *sk) |
1693 | { | 1704 | { |
1694 | const struct inet_connection_sock *icsk = inet_csk(sk); | ||
1695 | struct tcp_sock *tp = tcp_sk(sk); | 1705 | struct tcp_sock *tp = tcp_sk(sk); |
1696 | int decr = tp->snd_cwnd_cnt + 1; | 1706 | int decr = tp->snd_cwnd_cnt + 1; |
1697 | 1707 | ||
1698 | tp->snd_cwnd_cnt = decr&1; | 1708 | tp->snd_cwnd_cnt = decr&1; |
1699 | decr >>= 1; | 1709 | decr >>= 1; |
1700 | 1710 | ||
1701 | if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) | 1711 | if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) |
1702 | tp->snd_cwnd -= decr; | 1712 | tp->snd_cwnd -= decr; |
1703 | 1713 | ||
1704 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); | 1714 | tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); |
@@ -3785,6 +3795,50 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk | |||
3785 | __tcp_checksum_complete_user(sk, skb); | 3795 | __tcp_checksum_complete_user(sk, skb); |
3786 | } | 3796 | } |
3787 | 3797 | ||
3798 | #ifdef CONFIG_NET_DMA | ||
3799 | static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) | ||
3800 | { | ||
3801 | struct tcp_sock *tp = tcp_sk(sk); | ||
3802 | int chunk = skb->len - hlen; | ||
3803 | int dma_cookie; | ||
3804 | int copied_early = 0; | ||
3805 | |||
3806 | if (tp->ucopy.wakeup) | ||
3807 | return 0; | ||
3808 | |||
3809 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) | ||
3810 | tp->ucopy.dma_chan = get_softnet_dma(); | ||
3811 | |||
3812 | if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { | ||
3813 | |||
3814 | dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, | ||
3815 | skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); | ||
3816 | |||
3817 | if (dma_cookie < 0) | ||
3818 | goto out; | ||
3819 | |||
3820 | tp->ucopy.dma_cookie = dma_cookie; | ||
3821 | copied_early = 1; | ||
3822 | |||
3823 | tp->ucopy.len -= chunk; | ||
3824 | tp->copied_seq += chunk; | ||
3825 | tcp_rcv_space_adjust(sk); | ||
3826 | |||
3827 | if ((tp->ucopy.len == 0) || | ||
3828 | (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || | ||
3829 | (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { | ||
3830 | tp->ucopy.wakeup = 1; | ||
3831 | sk->sk_data_ready(sk, 0); | ||
3832 | } | ||
3833 | } else if (chunk > 0) { | ||
3834 | tp->ucopy.wakeup = 1; | ||
3835 | sk->sk_data_ready(sk, 0); | ||
3836 | } | ||
3837 | out: | ||
3838 | return copied_early; | ||
3839 | } | ||
3840 | #endif /* CONFIG_NET_DMA */ | ||
3841 | |||
3788 | /* | 3842 | /* |
3789 | * TCP receive function for the ESTABLISHED state. | 3843 | * TCP receive function for the ESTABLISHED state. |
3790 | * | 3844 | * |
@@ -3886,8 +3940,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3886 | tp->rcv_nxt == tp->rcv_wup) | 3940 | tp->rcv_nxt == tp->rcv_wup) |
3887 | tcp_store_ts_recent(tp); | 3941 | tcp_store_ts_recent(tp); |
3888 | 3942 | ||
3889 | tcp_rcv_rtt_measure_ts(sk, skb); | ||
3890 | |||
3891 | /* We know that such packets are checksummed | 3943 | /* We know that such packets are checksummed |
3892 | * on entry. | 3944 | * on entry. |
3893 | */ | 3945 | */ |
@@ -3901,14 +3953,23 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3901 | } | 3953 | } |
3902 | } else { | 3954 | } else { |
3903 | int eaten = 0; | 3955 | int eaten = 0; |
3956 | int copied_early = 0; | ||
3904 | 3957 | ||
3905 | if (tp->ucopy.task == current && | 3958 | if (tp->copied_seq == tp->rcv_nxt && |
3906 | tp->copied_seq == tp->rcv_nxt && | 3959 | len - tcp_header_len <= tp->ucopy.len) { |
3907 | len - tcp_header_len <= tp->ucopy.len && | 3960 | #ifdef CONFIG_NET_DMA |
3908 | sock_owned_by_user(sk)) { | 3961 | if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { |
3909 | __set_current_state(TASK_RUNNING); | 3962 | copied_early = 1; |
3963 | eaten = 1; | ||
3964 | } | ||
3965 | #endif | ||
3966 | if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { | ||
3967 | __set_current_state(TASK_RUNNING); | ||
3910 | 3968 | ||
3911 | if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { | 3969 | if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) |
3970 | eaten = 1; | ||
3971 | } | ||
3972 | if (eaten) { | ||
3912 | /* Predicted packet is in window by definition. | 3973 | /* Predicted packet is in window by definition. |
3913 | * seq == rcv_nxt and rcv_wup <= rcv_nxt. | 3974 | * seq == rcv_nxt and rcv_wup <= rcv_nxt. |
3914 | * Hence, check seq<=rcv_wup reduces to: | 3975 | * Hence, check seq<=rcv_wup reduces to: |
@@ -3924,8 +3985,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3924 | __skb_pull(skb, tcp_header_len); | 3985 | __skb_pull(skb, tcp_header_len); |
3925 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; | 3986 | tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; |
3926 | NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); | 3987 | NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); |
3927 | eaten = 1; | ||
3928 | } | 3988 | } |
3989 | if (copied_early) | ||
3990 | tcp_cleanup_rbuf(sk, skb->len); | ||
3929 | } | 3991 | } |
3930 | if (!eaten) { | 3992 | if (!eaten) { |
3931 | if (tcp_checksum_complete_user(sk, skb)) | 3993 | if (tcp_checksum_complete_user(sk, skb)) |
@@ -3966,6 +4028,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, | |||
3966 | 4028 | ||
3967 | __tcp_ack_snd_check(sk, 0); | 4029 | __tcp_ack_snd_check(sk, 0); |
3968 | no_ack: | 4030 | no_ack: |
4031 | #ifdef CONFIG_NET_DMA | ||
4032 | if (copied_early) | ||
4033 | __skb_queue_tail(&sk->sk_async_wait_queue, skb); | ||
4034 | else | ||
4035 | #endif | ||
3969 | if (eaten) | 4036 | if (eaten) |
3970 | __kfree_skb(skb); | 4037 | __kfree_skb(skb); |
3971 | else | 4038 | else |
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 672950e54c49..25ecc6e2478b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c | |||
@@ -71,6 +71,7 @@ | |||
71 | #include <net/inet_common.h> | 71 | #include <net/inet_common.h> |
72 | #include <net/timewait_sock.h> | 72 | #include <net/timewait_sock.h> |
73 | #include <net/xfrm.h> | 73 | #include <net/xfrm.h> |
74 | #include <net/netdma.h> | ||
74 | 75 | ||
75 | #include <linux/inet.h> | 76 | #include <linux/inet.h> |
76 | #include <linux/ipv6.h> | 77 | #include <linux/ipv6.h> |
@@ -1091,8 +1092,18 @@ process: | |||
1091 | bh_lock_sock(sk); | 1092 | bh_lock_sock(sk); |
1092 | ret = 0; | 1093 | ret = 0; |
1093 | if (!sock_owned_by_user(sk)) { | 1094 | if (!sock_owned_by_user(sk)) { |
1094 | if (!tcp_prequeue(sk, skb)) | 1095 | #ifdef CONFIG_NET_DMA |
1096 | struct tcp_sock *tp = tcp_sk(sk); | ||
1097 | if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) | ||
1098 | tp->ucopy.dma_chan = get_softnet_dma(); | ||
1099 | if (tp->ucopy.dma_chan) | ||
1095 | ret = tcp_v4_do_rcv(sk, skb); | 1100 | ret = tcp_v4_do_rcv(sk, skb); |
1101 | else | ||
1102 | #endif | ||
1103 | { | ||
1104 | if (!tcp_prequeue(sk, skb)) | ||
1105 | ret = tcp_v4_do_rcv(sk, skb); | ||
1106 | } | ||
1096 | } else | 1107 | } else |
1097 | sk_add_backlog(sk, skb); | 1108 | sk_add_backlog(sk, skb); |
1098 | bh_unlock_sock(sk); | 1109 | bh_unlock_sock(sk); |
@@ -1296,6 +1307,11 @@ int tcp_v4_destroy_sock(struct sock *sk) | |||
1296 | /* Cleans up our, hopefully empty, out_of_order_queue. */ | 1307 | /* Cleans up our, hopefully empty, out_of_order_queue. */ |
1297 | __skb_queue_purge(&tp->out_of_order_queue); | 1308 | __skb_queue_purge(&tp->out_of_order_queue); |
1298 | 1309 | ||
1310 | #ifdef CONFIG_NET_DMA | ||
1311 | /* Cleans up our sk_async_wait_queue */ | ||
1312 | __skb_queue_purge(&sk->sk_async_wait_queue); | ||
1313 | #endif | ||
1314 | |||
1299 | /* Clean prequeue, it must be empty really */ | 1315 | /* Clean prequeue, it must be empty really */ |
1300 | __skb_queue_purge(&tp->ucopy.prequeue); | 1316 | __skb_queue_purge(&tp->ucopy.prequeue); |
1301 | 1317 | ||
diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c new file mode 100644 index 000000000000..1f977b6ee9a1 --- /dev/null +++ b/net/ipv4/tcp_lp.c | |||
@@ -0,0 +1,338 @@ | |||
1 | /* | ||
2 | * TCP Low Priority (TCP-LP) | ||
3 | * | ||
4 | * TCP Low Priority is a distributed algorithm whose goal is to utilize only | ||
5 | * the excess network bandwidth as compared to the ``fair share`` of | ||
6 | * bandwidth as targeted by TCP. Available from: | ||
7 | * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf | ||
8 | * | ||
9 | * Original Author: | ||
10 | * Aleksandar Kuzmanovic <akuzma@northwestern.edu> | ||
11 | * | ||
12 | * See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation. | ||
13 | * As of 2.6.13, Linux supports pluggable congestion control algorithms. | ||
14 | * Due to the limitation of the API, we take the following changes from | ||
15 | * the original TCP-LP implementation: | ||
16 | * o We use newReno in most core CA handling. Only add some checking | ||
17 | * within cong_avoid. | ||
18 | * o Error correction of remote HZ: the remote HZ estimate is continually | ||
19 | * checked and updated. | ||
20 | * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, since | ||
21 | * OWD has a similar meaning to RTT. Also correct the buggy formula. | ||
22 | * o Handle reaction for Early Congestion Indication (ECI) within | ||
23 | * pkts_acked, as mentioned within pseudo code. | ||
24 | * o OWD is handled in relative format, where local time stamp will be in | ||
25 | * tcp_time_stamp format. | ||
26 | * | ||
27 | * Port from 2.4.19 to 2.6.16 as module by: | ||
28 | * Wong Hoi Sing Edison <hswong3i@gmail.com> | ||
29 | * Hung Hing Lun <hlhung3i@gmail.com> | ||
30 | * | ||
31 | * Version: $Id: tcp_lp.c,v 1.22 2006-05-02 18:18:19 hswong3i Exp $ | ||
32 | */ | ||
33 | |||
34 | #include <linux/config.h> | ||
35 | #include <linux/module.h> | ||
36 | #include <net/tcp.h> | ||
37 | |||
38 | /* resolution of owd */ | ||
39 | #define LP_RESOL 1000 | ||
40 | |||
41 | /** | ||
42 | * enum tcp_lp_state | ||
43 | * @LP_VALID_RHZ: is remote HZ valid? | ||
44 | * @LP_VALID_OWD: is OWD valid? | ||
45 | * @LP_WITHIN_THR: are we within threshold? | ||
46 | * @LP_WITHIN_INF: are we within inference? | ||
47 | * | ||
48 | * TCP-LP's state flags. | ||
49 | * We create this set of state flags mainly for debugging. | ||
50 | */ | ||
51 | enum tcp_lp_state { | ||
52 | LP_VALID_RHZ = (1 << 0), | ||
53 | LP_VALID_OWD = (1 << 1), | ||
54 | LP_WITHIN_THR = (1 << 3), | ||
55 | LP_WITHIN_INF = (1 << 4), | ||
56 | }; | ||
57 | |||
58 | /** | ||
59 | * struct lp | ||
60 | * @flag: TCP-LP state flag | ||
61 | * @sowd: smoothed OWD << 3 | ||
62 | * @owd_min: min OWD | ||
63 | * @owd_max: max OWD | ||
64 | * @owd_max_rsv: reserved max OWD | ||
65 | * @remote_hz: estimated remote HZ | ||
66 | * @remote_ref_time: remote reference time | ||
67 | * @local_ref_time: local reference time | ||
68 | * @last_drop: time for last active drop | ||
69 | * @inference: current inference | ||
70 | * | ||
71 | * TCP-LP's private struct. | ||
72 | * The idea comes from the original TCP-LP implementation; only the fields we | ||
73 | * found really useful are kept. | ||
74 | */ | ||
75 | struct lp { | ||
76 | u32 flag; | ||
77 | u32 sowd; | ||
78 | u32 owd_min; | ||
79 | u32 owd_max; | ||
80 | u32 owd_max_rsv; | ||
81 | u32 remote_hz; | ||
82 | u32 remote_ref_time; | ||
83 | u32 local_ref_time; | ||
84 | u32 last_drop; | ||
85 | u32 inference; | ||
86 | }; | ||
87 | |||
88 | /** | ||
89 | * tcp_lp_init | ||
90 | * | ||
91 | * Init all required variables. | ||
92 | * Clone the handling from Vegas module implementation. | ||
93 | */ | ||
94 | static void tcp_lp_init(struct sock *sk) | ||
95 | { | ||
96 | struct lp *lp = inet_csk_ca(sk); | ||
97 | |||
98 | lp->flag = 0; | ||
99 | lp->sowd = 0; | ||
100 | lp->owd_min = 0xffffffff; | ||
101 | lp->owd_max = 0; | ||
102 | lp->owd_max_rsv = 0; | ||
103 | lp->remote_hz = 0; | ||
104 | lp->remote_ref_time = 0; | ||
105 | lp->local_ref_time = 0; | ||
106 | lp->last_drop = 0; | ||
107 | lp->inference = 0; | ||
108 | } | ||
109 | |||
110 | /** | ||
111 | * tcp_lp_cong_avoid | ||
112 | * | ||
113 | * Implementation of cong_avoid. | ||
114 | * Will only call newReno CA when away from inference. | ||
115 | * From TCP-LP's paper, this will be handled in additive increase. | ||
116 | */ | ||
117 | static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, | ||
118 | int flag) | ||
119 | { | ||
120 | struct lp *lp = inet_csk_ca(sk); | ||
121 | |||
122 | if (!(lp->flag & LP_WITHIN_INF)) | ||
123 | tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); | ||
124 | } | ||
125 | |||
126 | /** | ||
127 | * tcp_lp_remote_hz_estimator | ||
128 | * | ||
129 | * Estimate remote HZ. | ||
130 | * We keep on updating the estimated value, whereas the original TCP-LP | ||
131 | * implementation only guesses it once and uses that value forever. | ||
132 | */ | ||
133 | static u32 tcp_lp_remote_hz_estimator(struct sock *sk) | ||
134 | { | ||
135 | struct tcp_sock *tp = tcp_sk(sk); | ||
136 | struct lp *lp = inet_csk_ca(sk); | ||
137 | s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */ | ||
138 | s64 m = 0; | ||
139 | |||
140 | /* not yet record reference time | ||
141 | * go away!! record it before come back!! */ | ||
142 | if (lp->remote_ref_time == 0 || lp->local_ref_time == 0) | ||
143 | goto out; | ||
144 | |||
145 | /* we can't calc remote HZ with no difference!! */ | ||
146 | if (tp->rx_opt.rcv_tsval == lp->remote_ref_time | ||
147 | || tp->rx_opt.rcv_tsecr == lp->local_ref_time) | ||
148 | goto out; | ||
149 | |||
150 | m = HZ * (tp->rx_opt.rcv_tsval - | ||
151 | lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - | ||
152 | lp->local_ref_time); | ||
153 | if (m < 0) | ||
154 | m = -m; | ||
155 | |||
156 | if (rhz != 0) { | ||
157 | m -= rhz >> 6; /* m is now error in remote HZ est */ | ||
158 | rhz += m; /* 63/64 old + 1/64 new */ | ||
159 | } else | ||
160 | rhz = m << 6; | ||
161 | |||
162 | /* record time for successful remote HZ calc */ | ||
163 | lp->flag |= LP_VALID_RHZ; | ||
164 | |||
165 | out: | ||
166 | /* record reference time stamp */ | ||
167 | lp->remote_ref_time = tp->rx_opt.rcv_tsval; | ||
168 | lp->local_ref_time = tp->rx_opt.rcv_tsecr; | ||
169 | |||
170 | return rhz >> 6; | ||
171 | } | ||
172 | |||
173 | /** | ||
174 | * tcp_lp_owd_calculator | ||
175 | * | ||
176 | * Calculate one way delay (in relative format). | ||
177 | * Original implement OWD as minus of remote time difference to local time | ||
178 | * difference directly. As this time difference just simply equal to RTT, when | ||
179 | * the network status is stable, remote RTT will equal to local RTT, and result | ||
180 | * OWD into zero. | ||
181 | * It seems to be a bug and so we fixed it. | ||
182 | */ | ||
183 | static u32 tcp_lp_owd_calculator(struct sock *sk) | ||
184 | { | ||
185 | struct tcp_sock *tp = tcp_sk(sk); | ||
186 | struct lp *lp = inet_csk_ca(sk); | ||
187 | s64 owd = 0; | ||
188 | |||
189 | lp->remote_hz = tcp_lp_remote_hz_estimator(sk); | ||
190 | |||
191 | if (lp->flag & LP_VALID_RHZ) { | ||
192 | owd = | ||
193 | tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - | ||
194 | tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); | ||
195 | if (owd < 0) | ||
196 | owd = -owd; | ||
197 | } | ||
198 | |||
199 | if (owd > 0) | ||
200 | lp->flag |= LP_VALID_OWD; | ||
201 | else | ||
202 | lp->flag &= ~LP_VALID_OWD; | ||
203 | |||
204 | return owd; | ||
205 | } | ||
206 | |||
207 | /** | ||
208 | * tcp_lp_rtt_sample | ||
209 | * | ||
210 | * Implementation of rtt_sample. | ||
211 | * Will take the following action, | ||
212 | * 1. calc OWD, | ||
213 | * 2. record the min/max OWD, | ||
214 | * 3. calc smoothed OWD (SOWD). | ||
215 | * Most ideas come from the original TCP-LP implementation. | ||
216 | */ | ||
217 | static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) | ||
218 | { | ||
219 | struct lp *lp = inet_csk_ca(sk); | ||
220 | s64 mowd = tcp_lp_owd_calculator(sk); | ||
221 | |||
222 | /* sorry that we don't have valid data */ | ||
223 | if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD)) | ||
224 | return; | ||
225 | |||
226 | /* record the next min owd */ | ||
227 | if (mowd < lp->owd_min) | ||
228 | lp->owd_min = mowd; | ||
229 | |||
230 | /* always forget the max of the max | ||
231 | * we just set owd_max as one below it */ | ||
232 | if (mowd > lp->owd_max) { | ||
233 | if (mowd > lp->owd_max_rsv) { | ||
234 | if (lp->owd_max_rsv == 0) | ||
235 | lp->owd_max = mowd; | ||
236 | else | ||
237 | lp->owd_max = lp->owd_max_rsv; | ||
238 | lp->owd_max_rsv = mowd; | ||
239 | } else | ||
240 | lp->owd_max = mowd; | ||
241 | } | ||
242 | |||
243 | /* calc for smoothed owd */ | ||
244 | if (lp->sowd != 0) { | ||
245 | mowd -= lp->sowd >> 3; /* m is now error in owd est */ | ||
246 | lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */ | ||
247 | } else | ||
248 | lp->sowd = mowd << 3; /* take the measured time be owd */ | ||
249 | } | ||
250 | |||
251 | /** | ||
252 | * tcp_lp_pkts_acked | ||
253 | * | ||
254 | * Implementation of pkts_acked. | ||
255 | * Deal with active drop under Early Congestion Indication. | ||
256 | * Only the drops to half and to 1 are handled, because we want to fall back | ||
257 | * on newReno for the increase case. | ||
258 | * We work it out by following the idea from TCP-LP's paper directly | ||
259 | */ | ||
260 | static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) | ||
261 | { | ||
262 | struct tcp_sock *tp = tcp_sk(sk); | ||
263 | struct lp *lp = inet_csk_ca(sk); | ||
264 | |||
265 | /* calc inference */ | ||
266 | if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) | ||
267 | lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); | ||
268 | |||
269 | /* test if within inference */ | ||
270 | if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) | ||
271 | lp->flag |= LP_WITHIN_INF; | ||
272 | else | ||
273 | lp->flag &= ~LP_WITHIN_INF; | ||
274 | |||
275 | /* test if within threshold */ | ||
276 | if (lp->sowd >> 3 < | ||
277 | lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100) | ||
278 | lp->flag |= LP_WITHIN_THR; | ||
279 | else | ||
280 | lp->flag &= ~LP_WITHIN_THR; | ||
281 | |||
282 | pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag, | ||
283 | tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max, | ||
284 | lp->sowd >> 3); | ||
285 | |||
286 | if (lp->flag & LP_WITHIN_THR) | ||
287 | return; | ||
288 | |||
289 | /* FIXME: try to reset owd_min and owd_max here | ||
290 | * so decrease the chance the min/max is no longer suitable | ||
291 | * and will usually be within threshold when within inference */ | ||
292 | lp->owd_min = lp->sowd >> 3; | ||
293 | lp->owd_max = lp->sowd >> 2; | ||
294 | lp->owd_max_rsv = lp->sowd >> 2; | ||
295 | |||
296 | /* happened within inference | ||
297 | * drop snd_cwnd into 1 */ | ||
298 | if (lp->flag & LP_WITHIN_INF) | ||
299 | tp->snd_cwnd = 1U; | ||
300 | |||
301 | /* happened after inference | ||
302 | * cut snd_cwnd into half */ | ||
303 | else | ||
304 | tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); | ||
305 | |||
306 | /* record this drop time */ | ||
307 | lp->last_drop = tcp_time_stamp; | ||
308 | } | ||
309 | |||
310 | static struct tcp_congestion_ops tcp_lp = { | ||
311 | .init = tcp_lp_init, | ||
312 | .ssthresh = tcp_reno_ssthresh, | ||
313 | .cong_avoid = tcp_lp_cong_avoid, | ||
314 | .min_cwnd = tcp_reno_min_cwnd, | ||
315 | .rtt_sample = tcp_lp_rtt_sample, | ||
316 | .pkts_acked = tcp_lp_pkts_acked, | ||
317 | |||
318 | .owner = THIS_MODULE, | ||
319 | .name = "lp" | ||
320 | }; | ||
321 | |||
322 | static int __init tcp_lp_register(void) | ||
323 | { | ||
324 | BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE); | ||
325 | return tcp_register_congestion_control(&tcp_lp); | ||
326 | } | ||
327 | |||
328 | static void __exit tcp_lp_unregister(void) | ||
329 | { | ||
330 | tcp_unregister_congestion_control(&tcp_lp); | ||
331 | } | ||
332 | |||
333 | module_init(tcp_lp_register); | ||
334 | module_exit(tcp_lp_unregister); | ||
335 | |||
336 | MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun"); | ||
337 | MODULE_LICENSE("GPL"); | ||
338 | MODULE_DESCRIPTION("TCP Low Priority"); | ||
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f33c9dddaa12..07bb5a2b375e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c | |||
@@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3; | |||
59 | int sysctl_tcp_mtu_probing = 0; | 59 | int sysctl_tcp_mtu_probing = 0; |
60 | int sysctl_tcp_base_mss = 512; | 60 | int sysctl_tcp_base_mss = 512; |
61 | 61 | ||
62 | /* By default, RFC2861 behavior. */ | ||
63 | int sysctl_tcp_slow_start_after_idle = 1; | ||
64 | |||
62 | static void update_send_head(struct sock *sk, struct tcp_sock *tp, | 65 | static void update_send_head(struct sock *sk, struct tcp_sock *tp, |
63 | struct sk_buff *skb) | 66 | struct sk_buff *skb) |
64 | { | 67 | { |
@@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp, | |||
138 | struct inet_connection_sock *icsk = inet_csk(sk); | 141 | struct inet_connection_sock *icsk = inet_csk(sk); |
139 | const u32 now = tcp_time_stamp; | 142 | const u32 now = tcp_time_stamp; |
140 | 143 | ||
141 | if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) | 144 | if (sysctl_tcp_slow_start_after_idle && |
145 | (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) | ||
142 | tcp_cwnd_restart(sk, __sk_dst_get(sk)); | 146 | tcp_cwnd_restart(sk, __sk_dst_get(sk)); |
143 | 147 | ||
144 | tp->lsndtime = now; | 148 | tp->lsndtime = now; |
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c new file mode 100644 index 000000000000..d7d517a3a238 --- /dev/null +++ b/net/ipv4/tcp_probe.c | |||
@@ -0,0 +1,181 @@ | |||
1 | /* | ||
2 | * tcpprobe - Observe the TCP flow with kprobes. | ||
3 | * | ||
4 | * The idea for this came from Werner Almesberger's umlsim | ||
5 | * Copyright (C) 2004, Stephen Hemminger <shemminger@osdl.org> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or modify | ||
8 | * it under the terms of the GNU General Public License as published by | ||
9 | * the Free Software Foundation; either version 2 of the License, or | ||
10 | * (at your option) any later version. | ||
11 | * | ||
12 | * This program is distributed in the hope that it will be useful, | ||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | * GNU General Public License for more details. | ||
16 | * | ||
17 | * You should have received a copy of the GNU General Public License | ||
18 | * along with this program; if not, write to the Free Software | ||
19 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
20 | */ | ||
21 | |||
22 | #include <linux/kernel.h> | ||
23 | #include <linux/kprobes.h> | ||
24 | #include <linux/socket.h> | ||
25 | #include <linux/tcp.h> | ||
26 | #include <linux/proc_fs.h> | ||
27 | #include <linux/module.h> | ||
28 | #include <linux/kfifo.h> | ||
29 | #include <linux/vmalloc.h> | ||
30 | |||
31 | #include <net/tcp.h> | ||
32 | |||
33 | MODULE_AUTHOR("Stephen Hemminger <shemminger@osdl.org>"); | ||
34 | MODULE_DESCRIPTION("TCP cwnd snooper"); | ||
35 | MODULE_LICENSE("GPL"); | ||
36 | |||
37 | static int port = 0; | ||
38 | MODULE_PARM_DESC(port, "Port to match (0=all)"); | ||
39 | module_param(port, int, 0); | ||
40 | |||
41 | static int bufsize = 64*1024; | ||
42 | MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); | ||
43 | module_param(bufsize, int, 0); | ||
44 | |||
45 | static const char procname[] = "tcpprobe"; | ||
46 | |||
47 | struct { | ||
48 | struct kfifo *fifo; | ||
49 | spinlock_t lock; | ||
50 | wait_queue_head_t wait; | ||
51 | struct timeval tstart; | ||
52 | } tcpw; | ||
53 | |||
54 | static void printl(const char *fmt, ...) | ||
55 | { | ||
56 | va_list args; | ||
57 | int len; | ||
58 | struct timeval now; | ||
59 | char tbuf[256]; | ||
60 | |||
61 | va_start(args, fmt); | ||
62 | do_gettimeofday(&now); | ||
63 | |||
64 | now.tv_sec -= tcpw.tstart.tv_sec; | ||
65 | now.tv_usec -= tcpw.tstart.tv_usec; | ||
66 | if (now.tv_usec < 0) { | ||
67 | --now.tv_sec; | ||
68 | now.tv_usec += 1000000; | ||
69 | } | ||
70 | |||
71 | len = sprintf(tbuf, "%lu.%06lu ", | ||
72 | (unsigned long) now.tv_sec, | ||
73 | (unsigned long) now.tv_usec); | ||
74 | len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); | ||
75 | va_end(args); | ||
76 | |||
77 | kfifo_put(tcpw.fifo, tbuf, len); | ||
78 | wake_up(&tcpw.wait); | ||
79 | } | ||
80 | |||
81 | static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk, | ||
82 | struct msghdr *msg, size_t size) | ||
83 | { | ||
84 | const struct tcp_sock *tp = tcp_sk(sk); | ||
85 | const struct inet_sock *inet = inet_sk(sk); | ||
86 | |||
87 | if (port == 0 || ntohs(inet->dport) == port || | ||
88 | ntohs(inet->sport) == port) { | ||
89 | printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n", | ||
90 | NIPQUAD(inet->saddr), ntohs(inet->sport), | ||
91 | NIPQUAD(inet->daddr), ntohs(inet->dport), | ||
92 | size, tp->snd_nxt, tp->snd_una, | ||
93 | tp->snd_cwnd, tcp_current_ssthresh(sk), | ||
94 | tp->snd_wnd); | ||
95 | } | ||
96 | |||
97 | jprobe_return(); | ||
98 | return 0; | ||
99 | } | ||
100 | |||
101 | static struct jprobe tcp_send_probe = { | ||
102 | .kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, }, | ||
103 | .entry = (kprobe_opcode_t *) &jtcp_sendmsg, | ||
104 | }; | ||
105 | |||
106 | |||
107 | static int tcpprobe_open(struct inode * inode, struct file * file) | ||
108 | { | ||
109 | kfifo_reset(tcpw.fifo); | ||
110 | do_gettimeofday(&tcpw.tstart); | ||
111 | return 0; | ||
112 | } | ||
113 | |||
114 | static ssize_t tcpprobe_read(struct file *file, char __user *buf, | ||
115 | size_t len, loff_t *ppos) | ||
116 | { | ||
117 | int error = 0, cnt; | ||
118 | unsigned char *tbuf; | ||
119 | |||
120 | if (!buf || len < 0) | ||
121 | return -EINVAL; | ||
122 | |||
123 | if (len == 0) | ||
124 | return 0; | ||
125 | |||
126 | tbuf = vmalloc(len); | ||
127 | if (!tbuf) | ||
128 | return -ENOMEM; | ||
129 | |||
130 | error = wait_event_interruptible(tcpw.wait, | ||
131 | __kfifo_len(tcpw.fifo) != 0); | ||
132 | if (error) | ||
133 | return error; | ||
134 | |||
135 | cnt = kfifo_get(tcpw.fifo, tbuf, len); | ||
136 | error = copy_to_user(buf, tbuf, cnt); | ||
137 | |||
138 | vfree(tbuf); | ||
139 | |||
140 | return error ? error : cnt; | ||
141 | } | ||
142 | |||
143 | static struct file_operations tcpprobe_fops = { | ||
144 | .owner = THIS_MODULE, | ||
145 | .open = tcpprobe_open, | ||
146 | .read = tcpprobe_read, | ||
147 | }; | ||
148 | |||
149 | static __init int tcpprobe_init(void) | ||
150 | { | ||
151 | int ret = -ENOMEM; | ||
152 | |||
153 | init_waitqueue_head(&tcpw.wait); | ||
154 | spin_lock_init(&tcpw.lock); | ||
155 | tcpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &tcpw.lock); | ||
156 | |||
157 | if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops)) | ||
158 | goto err0; | ||
159 | |||
160 | ret = register_jprobe(&tcp_send_probe); | ||
161 | if (ret) | ||
162 | goto err1; | ||
163 | |||
164 | pr_info("TCP watch registered (port=%d)\n", port); | ||
165 | return 0; | ||
166 | err1: | ||
167 | proc_net_remove(procname); | ||
168 | err0: | ||
169 | kfifo_free(tcpw.fifo); | ||
170 | return ret; | ||
171 | } | ||
172 | module_init(tcpprobe_init); | ||
173 | |||
174 | static __exit void tcpprobe_exit(void) | ||
175 | { | ||
176 | kfifo_free(tcpw.fifo); | ||
177 | proc_net_remove(procname); | ||
178 | unregister_jprobe(&tcp_send_probe); | ||
179 | |||
180 | } | ||
181 | module_exit(tcpprobe_exit); | ||
diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c new file mode 100644 index 000000000000..11b42a7135c1 --- /dev/null +++ b/net/ipv4/tcp_veno.c | |||
@@ -0,0 +1,231 @@ | |||
1 | /* | ||
2 | * TCP Veno congestion control | ||
3 | * | ||
4 | * This is based on the congestion detection/avoidance scheme described in | ||
5 | * C. P. Fu, S. C. Liew. | ||
6 | * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." | ||
7 | * IEEE Journal on Selected Areas in Communication, | ||
8 | * Feb. 2003. | ||
9 | * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf | ||
10 | */ | ||
11 | |||
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/inet_diag.h>

#include <net/tcp.h>

#include <asm/div64.h>
19 | |||
20 | /* Default values of the Veno variables, in fixed-point representation | ||
21 | * with V_PARAM_SHIFT bits to the right of the binary point. | ||
22 | */ | ||
23 | #define V_PARAM_SHIFT 1 | ||
24 | static const int beta = 3 << V_PARAM_SHIFT; | ||
25 | |||
26 | /* Veno variables */ | ||
27 | struct veno { | ||
28 | u8 doing_veno_now; /* if true, do veno for this rtt */ | ||
29 | u16 cntrtt; /* # of rtts measured within last rtt */ | ||
30 | u32 minrtt; /* min of rtts measured within last rtt (in usec) */ | ||
31 | u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ | ||
32 | u32 inc; /* decide whether to increase cwnd */ | ||
33 | u32 diff; /* calculate the diff rate */ | ||
34 | }; | ||
35 | |||
36 | /* There are several situations when we must "re-start" Veno: | ||
37 | * | ||
38 | * o when a connection is established | ||
39 | * o after an RTO | ||
40 | * o after fast recovery | ||
41 | * o when we send a packet and there is no outstanding | ||
42 | * unacknowledged data (restarting an idle connection) | ||
43 | * | ||
44 | */ | ||
45 | static inline void veno_enable(struct sock *sk) | ||
46 | { | ||
47 | struct veno *veno = inet_csk_ca(sk); | ||
48 | |||
49 | /* turn on Veno */ | ||
50 | veno->doing_veno_now = 1; | ||
51 | |||
52 | veno->minrtt = 0x7fffffff; | ||
53 | } | ||
54 | |||
55 | static inline void veno_disable(struct sock *sk) | ||
56 | { | ||
57 | struct veno *veno = inet_csk_ca(sk); | ||
58 | |||
59 | /* turn off Veno */ | ||
60 | veno->doing_veno_now = 0; | ||
61 | } | ||
62 | |||
63 | static void tcp_veno_init(struct sock *sk) | ||
64 | { | ||
65 | struct veno *veno = inet_csk_ca(sk); | ||
66 | |||
67 | veno->basertt = 0x7fffffff; | ||
68 | veno->inc = 1; | ||
69 | veno_enable(sk); | ||
70 | } | ||
71 | |||
72 | /* Do rtt sampling needed for Veno. */ | ||
73 | static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt) | ||
74 | { | ||
75 | struct veno *veno = inet_csk_ca(sk); | ||
76 | u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */ | ||
77 | |||
78 | /* Filter to find propagation delay: */ | ||
79 | if (vrtt < veno->basertt) | ||
80 | veno->basertt = vrtt; | ||
81 | |||
82 | /* Find the min rtt during the last rtt to find | ||
83 | * the current prop. delay + queuing delay: | ||
84 | */ | ||
85 | veno->minrtt = min(veno->minrtt, vrtt); | ||
86 | veno->cntrtt++; | ||
87 | } | ||
88 | |||
89 | static void tcp_veno_state(struct sock *sk, u8 ca_state) | ||
90 | { | ||
91 | if (ca_state == TCP_CA_Open) | ||
92 | veno_enable(sk); | ||
93 | else | ||
94 | veno_disable(sk); | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * If the connection is idle and we are restarting, | ||
99 | * then we don't want to do any Veno calculations | ||
100 | * until we get fresh rtt samples. So when we | ||
101 | * restart, we reset our Veno state to a clean | ||
102 | * state. After we get acks for this flight of | ||
103 | * packets, _then_ we can make Veno calculations | ||
104 | * again. | ||
105 | */ | ||
106 | static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) | ||
107 | { | ||
108 | if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) | ||
109 | tcp_veno_init(sk); | ||
110 | } | ||
111 | |||
112 | static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, | ||
113 | u32 seq_rtt, u32 in_flight, int flag) | ||
114 | { | ||
115 | struct tcp_sock *tp = tcp_sk(sk); | ||
116 | struct veno *veno = inet_csk_ca(sk); | ||
117 | |||
118 | if (!veno->doing_veno_now) | ||
119 | return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); | ||
120 | |||
121 | /* limited by applications */ | ||
122 | if (!tcp_is_cwnd_limited(sk, in_flight)) | ||
123 | return; | ||
124 | |||
125 | /* We do the Veno calculations only if we got enough rtt samples */ | ||
126 | if (veno->cntrtt <= 2) { | ||
127 | /* We don't have enough rtt samples to do the Veno | ||
128 | * calculation, so we'll behave like Reno. | ||
129 | */ | ||
130 | tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); | ||
131 | } else { | ||
132 | u32 rtt, target_cwnd; | ||
133 | |||
134 | /* We have enough rtt samples, so, using the Veno | ||
135 | * algorithm, we determine the state of the network. | ||
136 | */ | ||
137 | |||
138 | rtt = veno->minrtt; | ||
139 | |||
140 | target_cwnd = ((tp->snd_cwnd * veno->basertt) | ||
141 | << V_PARAM_SHIFT) / rtt; | ||
142 | |||
143 | veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; | ||
144 | |||
145 | if (tp->snd_cwnd <= tp->snd_ssthresh) { | ||
146 | /* Slow start. */ | ||
147 | tcp_slow_start(tp); | ||
148 | } else { | ||
149 | /* Congestion avoidance. */ | ||
150 | if (veno->diff < beta) { | ||
151 | /* In the "non-congestive state", increase cwnd | ||
152 | * every rtt. | ||
153 | */ | ||
154 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | ||
155 | if (tp->snd_cwnd < tp->snd_cwnd_clamp) | ||
156 | tp->snd_cwnd++; | ||
157 | tp->snd_cwnd_cnt = 0; | ||
158 | } else | ||
159 | tp->snd_cwnd_cnt++; | ||
160 | } else { | ||
161 | /* In the "congestive state", increase cwnd | ||
162 | * every other rtt. | ||
163 | */ | ||
164 | if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { | ||
165 | if (veno->inc | ||
166 | && tp->snd_cwnd < | ||
167 | tp->snd_cwnd_clamp) { | ||
168 | tp->snd_cwnd++; | ||
169 | veno->inc = 0; | ||
170 | } else | ||
171 | veno->inc = 1; | ||
172 | tp->snd_cwnd_cnt = 0; | ||
173 | } else | ||
174 | tp->snd_cwnd_cnt++; | ||
175 | } | ||
176 | |||
177 | } | ||
178 | if (tp->snd_cwnd < 2) | ||
179 | tp->snd_cwnd = 2; | ||
180 | else if (tp->snd_cwnd > tp->snd_cwnd_clamp) | ||
181 | tp->snd_cwnd = tp->snd_cwnd_clamp; | ||
182 | } | ||
183 | /* Wipe the slate clean for the next rtt. */ | ||
184 | /* veno->cntrtt = 0; */ | ||
185 | veno->minrtt = 0x7fffffff; | ||
186 | } | ||
187 | |||
188 | /* Veno MD phase */ | ||
189 | static u32 tcp_veno_ssthresh(struct sock *sk) | ||
190 | { | ||
191 | const struct tcp_sock *tp = tcp_sk(sk); | ||
192 | struct veno *veno = inet_csk_ca(sk); | ||
193 | |||
194 | if (veno->diff < beta) | ||
195 | /* in "non-congestive state", cut cwnd by 1/5 */ | ||
196 | return max(tp->snd_cwnd * 4 / 5, 2U); | ||
197 | else | ||
198 | /* in "congestive state", cut cwnd by 1/2 */ | ||
199 | return max(tp->snd_cwnd >> 1U, 2U); | ||
200 | } | ||
201 | |||
202 | static struct tcp_congestion_ops tcp_veno = { | ||
203 | .init = tcp_veno_init, | ||
204 | .ssthresh = tcp_veno_ssthresh, | ||
205 | .cong_avoid = tcp_veno_cong_avoid, | ||
206 | .rtt_sample = tcp_veno_rtt_calc, | ||
207 | .set_state = tcp_veno_state, | ||
208 | .cwnd_event = tcp_veno_cwnd_event, | ||
209 | |||
210 | .owner = THIS_MODULE, | ||
211 | .name = "veno", | ||
212 | }; | ||
213 | |||
214 | static int __init tcp_veno_register(void) | ||
215 | { | ||
216 | BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE); | ||
217 | tcp_register_congestion_control(&tcp_veno); | ||
218 | return 0; | ||
219 | } | ||
220 | |||
221 | static void __exit tcp_veno_unregister(void) | ||
222 | { | ||
223 | tcp_unregister_congestion_control(&tcp_veno); | ||
224 | } | ||
225 | |||
226 | module_init(tcp_veno_register); | ||
227 | module_exit(tcp_veno_unregister); | ||
228 | |||
229 | MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu"); | ||
230 | MODULE_LICENSE("GPL"); | ||
231 | MODULE_DESCRIPTION("TCP Veno"); | ||
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 0c340c3756c2..4247da1384bf 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c | |||
@@ -1,7 +1,24 @@ | |||
1 | /* | 1 | /* |
2 | * TCP Westwood+ | 2 | * TCP Westwood+: end-to-end bandwidth estimation for TCP |
3 | * | 3 | * |
4 | * Angelo Dell'Aera: TCP Westwood+ support | 4 | * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4 |
5 | * | ||
6 | * Support at http://c3lab.poliba.it/index.php/Westwood | ||
7 | * Main references in literature: | ||
8 | * | ||
9 | * - Mascolo S, Casetti, M. Gerla et al. | ||
10 | * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001 | ||
11 | * | ||
12 | * - A. Grieco, S. Mascolo | ||
13 | * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer | ||
14 | * Comm. Review, 2004 | ||
15 | * | ||
16 | * - A. Dell'Aera, L. Grieco, S. Mascolo. | ||
17 | * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving : | ||
18 | * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004 | ||
19 | * | ||
20 | * Westwood+ employs end-to-end bandwidth measurement to set cwnd and | ||
21 | * ssthresh after packet loss. The probing phase is as the original Reno. | ||
5 | */ | 22 | */ |
6 | 23 | ||
7 | #include <linux/config.h> | 24 | #include <linux/config.h> |
@@ -22,6 +39,8 @@ struct westwood { | |||
22 | u32 accounted; | 39 | u32 accounted; |
23 | u32 rtt; | 40 | u32 rtt; |
24 | u32 rtt_min; /* minimum observed RTT */ | 41 | u32 rtt_min; /* minimum observed RTT */ |
42 | u8 first_ack; /* flag which infers that this is the first ack */ | ||
43 | u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ | ||
25 | }; | 44 | }; |
26 | 45 | ||
27 | 46 | ||
@@ -49,9 +68,11 @@ static void tcp_westwood_init(struct sock *sk) | |||
49 | w->bw_est = 0; | 68 | w->bw_est = 0; |
50 | w->accounted = 0; | 69 | w->accounted = 0; |
51 | w->cumul_ack = 0; | 70 | w->cumul_ack = 0; |
71 | w->reset_rtt_min = 1; | ||
52 | w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; | 72 | w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; |
53 | w->rtt_win_sx = tcp_time_stamp; | 73 | w->rtt_win_sx = tcp_time_stamp; |
54 | w->snd_una = tcp_sk(sk)->snd_una; | 74 | w->snd_una = tcp_sk(sk)->snd_una; |
75 | w->first_ack = 1; | ||
55 | } | 76 | } |
56 | 77 | ||
57 | /* | 78 | /* |
@@ -63,10 +84,16 @@ static inline u32 westwood_do_filter(u32 a, u32 b) | |||
63 | return (((7 * a) + b) >> 3); | 84 | return (((7 * a) + b) >> 3); |
64 | } | 85 | } |
65 | 86 | ||
66 | static inline void westwood_filter(struct westwood *w, u32 delta) | 87 | static void westwood_filter(struct westwood *w, u32 delta) |
67 | { | 88 | { |
68 | w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); | 89 | /* If the filter is empty fill it with the first sample of bandwidth */ |
69 | w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); | 90 | if (w->bw_ns_est == 0 && w->bw_est == 0) { |
91 | w->bw_ns_est = w->bk / delta; | ||
92 | w->bw_est = w->bw_ns_est; | ||
93 | } else { | ||
94 | w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); | ||
95 | w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); | ||
96 | } | ||
70 | } | 97 | } |
71 | 98 | ||
72 | /* | 99 | /* |
@@ -91,6 +118,15 @@ static void westwood_update_window(struct sock *sk) | |||
91 | struct westwood *w = inet_csk_ca(sk); | 118 | struct westwood *w = inet_csk_ca(sk); |
92 | s32 delta = tcp_time_stamp - w->rtt_win_sx; | 119 | s32 delta = tcp_time_stamp - w->rtt_win_sx; |
93 | 120 | ||
121 | /* Initialize w->snd_una with the first acked sequence number in order | ||
122 | * to fix mismatch between tp->snd_una and w->snd_una for the first | ||
123 | * bandwidth sample | ||
124 | */ | ||
125 | if (w->first_ack) { | ||
126 | w->snd_una = tcp_sk(sk)->snd_una; | ||
127 | w->first_ack = 0; | ||
128 | } | ||
129 | |||
94 | /* | 130 | /* |
95 | * See if a RTT-window has passed. | 131 | * See if a RTT-window has passed. |
96 | * Be careful since if RTT is less than | 132 | * Be careful since if RTT is less than |
@@ -108,6 +144,16 @@ static void westwood_update_window(struct sock *sk) | |||
108 | } | 144 | } |
109 | } | 145 | } |
110 | 146 | ||
147 | static inline void update_rtt_min(struct westwood *w) | ||
148 | { | ||
149 | if (w->reset_rtt_min) { | ||
150 | w->rtt_min = w->rtt; | ||
151 | w->reset_rtt_min = 0; | ||
152 | } else | ||
153 | w->rtt_min = min(w->rtt, w->rtt_min); | ||
154 | } | ||
155 | |||
156 | |||
111 | /* | 157 | /* |
112 | * @westwood_fast_bw | 158 | * @westwood_fast_bw |
113 | * It is called when we are in fast path. In particular it is called when | 159 | * It is called when we are in fast path. In particular it is called when |
@@ -123,7 +169,7 @@ static inline void westwood_fast_bw(struct sock *sk) | |||
123 | 169 | ||
124 | w->bk += tp->snd_una - w->snd_una; | 170 | w->bk += tp->snd_una - w->snd_una; |
125 | w->snd_una = tp->snd_una; | 171 | w->snd_una = tp->snd_una; |
126 | w->rtt_min = min(w->rtt, w->rtt_min); | 172 | update_rtt_min(w); |
127 | } | 173 | } |
128 | 174 | ||
129 | /* | 175 | /* |
@@ -162,12 +208,6 @@ static inline u32 westwood_acked_count(struct sock *sk) | |||
162 | return w->cumul_ack; | 208 | return w->cumul_ack; |
163 | } | 209 | } |
164 | 210 | ||
165 | static inline u32 westwood_bw_rttmin(const struct sock *sk) | ||
166 | { | ||
167 | const struct tcp_sock *tp = tcp_sk(sk); | ||
168 | const struct westwood *w = inet_csk_ca(sk); | ||
169 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); | ||
170 | } | ||
171 | 211 | ||
172 | /* | 212 | /* |
173 | * TCP Westwood | 213 | * TCP Westwood |
@@ -175,9 +215,11 @@ static inline u32 westwood_bw_rttmin(const struct sock *sk) | |||
175 | * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 | 215 | * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 |
176 | * so avoids ever returning 0. | 216 | * so avoids ever returning 0. |
177 | */ | 217 | */ |
178 | static u32 tcp_westwood_cwnd_min(struct sock *sk) | 218 | static u32 tcp_westwood_bw_rttmin(const struct sock *sk) |
179 | { | 219 | { |
180 | return westwood_bw_rttmin(sk); | 220 | const struct tcp_sock *tp = tcp_sk(sk); |
221 | const struct westwood *w = inet_csk_ca(sk); | ||
222 | return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); | ||
181 | } | 223 | } |
182 | 224 | ||
183 | static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) | 225 | static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) |
@@ -191,17 +233,19 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) | |||
191 | break; | 233 | break; |
192 | 234 | ||
193 | case CA_EVENT_COMPLETE_CWR: | 235 | case CA_EVENT_COMPLETE_CWR: |
194 | tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); | 236 | tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); |
195 | break; | 237 | break; |
196 | 238 | ||
197 | case CA_EVENT_FRTO: | 239 | case CA_EVENT_FRTO: |
198 | tp->snd_ssthresh = westwood_bw_rttmin(sk); | 240 | tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); |
241 | /* Update RTT_min when next ack arrives */ | ||
242 | w->reset_rtt_min = 1; | ||
199 | break; | 243 | break; |
200 | 244 | ||
201 | case CA_EVENT_SLOW_ACK: | 245 | case CA_EVENT_SLOW_ACK: |
202 | westwood_update_window(sk); | 246 | westwood_update_window(sk); |
203 | w->bk += westwood_acked_count(sk); | 247 | w->bk += westwood_acked_count(sk); |
204 | w->rtt_min = min(w->rtt, w->rtt_min); | 248 | update_rtt_min(w); |
205 | break; | 249 | break; |
206 | 250 | ||
207 | default: | 251 | default: |
@@ -235,7 +279,7 @@ static struct tcp_congestion_ops tcp_westwood = { | |||
235 | .init = tcp_westwood_init, | 279 | .init = tcp_westwood_init, |
236 | .ssthresh = tcp_reno_ssthresh, | 280 | .ssthresh = tcp_reno_ssthresh, |
237 | .cong_avoid = tcp_reno_cong_avoid, | 281 | .cong_avoid = tcp_reno_cong_avoid, |
238 | .min_cwnd = tcp_westwood_cwnd_min, | 282 | .min_cwnd = tcp_westwood_bw_rttmin, |
239 | .cwnd_event = tcp_westwood_event, | 283 | .cwnd_event = tcp_westwood_event, |
240 | .get_info = tcp_westwood_info, | 284 | .get_info = tcp_westwood_info, |
241 | .pkts_acked = tcp_westwood_pkts_acked, | 285 | .pkts_acked = tcp_westwood_pkts_acked, |
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 3e174c83bfe7..817ed84511a6 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c | |||
@@ -13,7 +13,6 @@ | |||
13 | #include <linux/string.h> | 13 | #include <linux/string.h> |
14 | #include <linux/netfilter.h> | 14 | #include <linux/netfilter.h> |
15 | #include <linux/netfilter_ipv4.h> | 15 | #include <linux/netfilter_ipv4.h> |
16 | #include <net/inet_ecn.h> | ||
17 | #include <net/ip.h> | 16 | #include <net/ip.h> |
18 | #include <net/xfrm.h> | 17 | #include <net/xfrm.h> |
19 | 18 | ||
@@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb) | |||
24 | 23 | ||
25 | EXPORT_SYMBOL(xfrm4_rcv); | 24 | EXPORT_SYMBOL(xfrm4_rcv); |
26 | 25 | ||
27 | static inline void ipip_ecn_decapsulate(struct sk_buff *skb) | ||
28 | { | ||
29 | struct iphdr *outer_iph = skb->nh.iph; | ||
30 | struct iphdr *inner_iph = skb->h.ipiph; | ||
31 | |||
32 | if (INET_ECN_is_ce(outer_iph->tos)) | ||
33 | IP_ECN_set_ce(inner_iph); | ||
34 | } | ||
35 | |||
36 | static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) | 26 | static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) |
37 | { | 27 | { |
38 | switch (nexthdr) { | 28 | switch (nexthdr) { |
@@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) | |||
113 | 103 | ||
114 | xfrm_vec[xfrm_nr++] = x; | 104 | xfrm_vec[xfrm_nr++] = x; |
115 | 105 | ||
116 | iph = skb->nh.iph; | 106 | if (x->mode->input(x, skb)) |
107 | goto drop; | ||
117 | 108 | ||
118 | if (x->props.mode) { | 109 | if (x->props.mode) { |
119 | if (iph->protocol != IPPROTO_IPIP) | ||
120 | goto drop; | ||
121 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | ||
122 | goto drop; | ||
123 | if (skb_cloned(skb) && | ||
124 | pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) | ||
125 | goto drop; | ||
126 | if (x->props.flags & XFRM_STATE_DECAP_DSCP) | ||
127 | ipv4_copy_dscp(iph, skb->h.ipiph); | ||
128 | if (!(x->props.flags & XFRM_STATE_NOECN)) | ||
129 | ipip_ecn_decapsulate(skb); | ||
130 | skb->mac.raw = memmove(skb->data - skb->mac_len, | ||
131 | skb->mac.raw, skb->mac_len); | ||
132 | skb->nh.raw = skb->data; | ||
133 | memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); | ||
134 | decaps = 1; | 110 | decaps = 1; |
135 | break; | 111 | break; |
136 | } | 112 | } |
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c new file mode 100644 index 000000000000..a9e6b3dd19c9 --- /dev/null +++ b/net/ipv4/xfrm4_mode_transport.c | |||
@@ -0,0 +1,83 @@ | |||
1 | /* | ||
2 | * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4. | ||
3 | * | ||
4 | * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> | ||
5 | */ | ||
6 | |||
7 | #include <linux/init.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/skbuff.h> | ||
11 | #include <linux/stringify.h> | ||
12 | #include <net/dst.h> | ||
13 | #include <net/ip.h> | ||
14 | #include <net/xfrm.h> | ||
15 | |||
16 | /* Add encapsulation header. | ||
17 | * | ||
18 | * The IP header will be moved forward to make space for the encapsulation | ||
19 | * header. | ||
20 | * | ||
21 | * On exit, skb->h will be set to the start of the payload to be processed | ||
22 | * by x->type->output and skb->nh will be set to the top IP header. | ||
23 | */ | ||
24 | static int xfrm4_transport_output(struct sk_buff *skb) | ||
25 | { | ||
26 | struct xfrm_state *x; | ||
27 | struct iphdr *iph; | ||
28 | int ihl; | ||
29 | |||
30 | iph = skb->nh.iph; | ||
31 | skb->h.ipiph = iph; | ||
32 | |||
33 | ihl = iph->ihl * 4; | ||
34 | skb->h.raw += ihl; | ||
35 | |||
36 | x = skb->dst->xfrm; | ||
37 | skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl); | ||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | /* Remove encapsulation header. | ||
42 | * | ||
43 | * The IP header will be moved over the top of the encapsulation header. | ||
44 | * | ||
45 | * On entry, skb->h shall point to where the IP header should be and skb->nh | ||
46 | * shall be set to where the IP header currently is. skb->data shall point | ||
47 | * to the start of the payload. | ||
48 | */ | ||
49 | static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) | ||
50 | { | ||
51 | int ihl = skb->data - skb->h.raw; | ||
52 | |||
53 | if (skb->h.raw != skb->nh.raw) | ||
54 | skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); | ||
55 | skb->nh.iph->tot_len = htons(skb->len + ihl); | ||
56 | skb->h.raw = skb->data; | ||
57 | return 0; | ||
58 | } | ||
59 | |||
60 | static struct xfrm_mode xfrm4_transport_mode = { | ||
61 | .input = xfrm4_transport_input, | ||
62 | .output = xfrm4_transport_output, | ||
63 | .owner = THIS_MODULE, | ||
64 | .encap = XFRM_MODE_TRANSPORT, | ||
65 | }; | ||
66 | |||
67 | static int __init xfrm4_transport_init(void) | ||
68 | { | ||
69 | return xfrm_register_mode(&xfrm4_transport_mode, AF_INET); | ||
70 | } | ||
71 | |||
72 | static void __exit xfrm4_transport_exit(void) | ||
73 | { | ||
74 | int err; | ||
75 | |||
76 | err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET); | ||
77 | BUG_ON(err); | ||
78 | } | ||
79 | |||
80 | module_init(xfrm4_transport_init); | ||
81 | module_exit(xfrm4_transport_exit); | ||
82 | MODULE_LICENSE("GPL"); | ||
83 | MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT); | ||
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c new file mode 100644 index 000000000000..f8d880beb12f --- /dev/null +++ b/net/ipv4/xfrm4_mode_tunnel.c | |||
@@ -0,0 +1,125 @@ | |||
1 | /* | ||
2 | * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4. | ||
3 | * | ||
4 | * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> | ||
5 | */ | ||
6 | |||
7 | #include <linux/init.h> | ||
8 | #include <linux/kernel.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/skbuff.h> | ||
11 | #include <linux/stringify.h> | ||
12 | #include <net/dst.h> | ||
13 | #include <net/inet_ecn.h> | ||
14 | #include <net/ip.h> | ||
15 | #include <net/xfrm.h> | ||
16 | |||
17 | static inline void ipip_ecn_decapsulate(struct sk_buff *skb) | ||
18 | { | ||
19 | struct iphdr *outer_iph = skb->nh.iph; | ||
20 | struct iphdr *inner_iph = skb->h.ipiph; | ||
21 | |||
22 | if (INET_ECN_is_ce(outer_iph->tos)) | ||
23 | IP_ECN_set_ce(inner_iph); | ||
24 | } | ||
25 | |||
26 | /* Add encapsulation header. | ||
27 | * | ||
28 | * The top IP header will be constructed per RFC 2401. The following fields | ||
29 | * in it shall be filled in by x->type->output: | ||
30 | * tot_len | ||
31 | * check | ||
32 | * | ||
33 | * On exit, skb->h will be set to the start of the payload to be processed | ||
34 | * by x->type->output and skb->nh will be set to the top IP header. | ||
35 | */ | ||
36 | static int xfrm4_tunnel_output(struct sk_buff *skb) | ||
37 | { | ||
38 | struct dst_entry *dst = skb->dst; | ||
39 | struct xfrm_state *x = dst->xfrm; | ||
40 | struct iphdr *iph, *top_iph; | ||
41 | int flags; | ||
42 | |||
43 | iph = skb->nh.iph; | ||
44 | skb->h.ipiph = iph; | ||
45 | |||
46 | skb->nh.raw = skb_push(skb, x->props.header_len); | ||
47 | top_iph = skb->nh.iph; | ||
48 | |||
49 | top_iph->ihl = 5; | ||
50 | top_iph->version = 4; | ||
51 | |||
52 | /* DS disclosed */ | ||
53 | top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); | ||
54 | |||
55 | flags = x->props.flags; | ||
56 | if (flags & XFRM_STATE_NOECN) | ||
57 | IP_ECN_clear(top_iph); | ||
58 | |||
59 | top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? | ||
60 | 0 : (iph->frag_off & htons(IP_DF)); | ||
61 | if (!top_iph->frag_off) | ||
62 | __ip_select_ident(top_iph, dst->child, 0); | ||
63 | |||
64 | top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); | ||
65 | |||
66 | top_iph->saddr = x->props.saddr.a4; | ||
67 | top_iph->daddr = x->id.daddr.a4; | ||
68 | top_iph->protocol = IPPROTO_IPIP; | ||
69 | |||
70 | memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); | ||
71 | return 0; | ||
72 | } | ||
73 | |||
74 | static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) | ||
75 | { | ||
76 | struct iphdr *iph = skb->nh.iph; | ||
77 | int err = -EINVAL; | ||
78 | |||
79 | if (iph->protocol != IPPROTO_IPIP) | ||
80 | goto out; | ||
81 | if (!pskb_may_pull(skb, sizeof(struct iphdr))) | ||
82 | goto out; | ||
83 | |||
84 | if (skb_cloned(skb) && | ||
85 | (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) | ||
86 | goto out; | ||
87 | |||
88 | if (x->props.flags & XFRM_STATE_DECAP_DSCP) | ||
89 | ipv4_copy_dscp(iph, skb->h.ipiph); | ||
90 | if (!(x->props.flags & XFRM_STATE_NOECN)) | ||
91 | ipip_ecn_decapsulate(skb); | ||
92 | skb->mac.raw = memmove(skb->data - skb->mac_len, | ||
93 | skb->mac.raw, skb->mac_len); | ||
94 | skb->nh.raw = skb->data; | ||
95 | memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); | ||
96 | err = 0; | ||
97 | |||
98 | out: | ||
99 | return err; | ||
100 | } | ||
101 | |||
102 | static struct xfrm_mode xfrm4_tunnel_mode = { | ||
103 | .input = xfrm4_tunnel_input, | ||
104 | .output = xfrm4_tunnel_output, | ||
105 | .owner = THIS_MODULE, | ||
106 | .encap = XFRM_MODE_TUNNEL, | ||
107 | }; | ||
108 | |||
109 | static int __init xfrm4_tunnel_init(void) | ||
110 | { | ||
111 | return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET); | ||
112 | } | ||
113 | |||
114 | static void __exit xfrm4_tunnel_exit(void) | ||
115 | { | ||
116 | int err; | ||
117 | |||
118 | err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET); | ||
119 | BUG_ON(err); | ||
120 | } | ||
121 | |||
122 | module_init(xfrm4_tunnel_init); | ||
123 | module_exit(xfrm4_tunnel_exit); | ||
124 | MODULE_LICENSE("GPL"); | ||
125 | MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL); | ||
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 4ef8efaf6a67..ac9d91d4bb05 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c | |||
@@ -12,67 +12,10 @@ | |||
12 | #include <linux/skbuff.h> | 12 | #include <linux/skbuff.h> |
13 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
14 | #include <linux/netfilter_ipv4.h> | 14 | #include <linux/netfilter_ipv4.h> |
15 | #include <net/inet_ecn.h> | ||
16 | #include <net/ip.h> | 15 | #include <net/ip.h> |
17 | #include <net/xfrm.h> | 16 | #include <net/xfrm.h> |
18 | #include <net/icmp.h> | 17 | #include <net/icmp.h> |
19 | 18 | ||
20 | /* Add encapsulation header. | ||
21 | * | ||
22 | * In transport mode, the IP header will be moved forward to make space | ||
23 | * for the encapsulation header. | ||
24 | * | ||
25 | * In tunnel mode, the top IP header will be constructed per RFC 2401. | ||
26 | * The following fields in it shall be filled in by x->type->output: | ||
27 | * tot_len | ||
28 | * check | ||
29 | * | ||
30 | * On exit, skb->h will be set to the start of the payload to be processed | ||
31 | * by x->type->output and skb->nh will be set to the top IP header. | ||
32 | */ | ||
33 | static void xfrm4_encap(struct sk_buff *skb) | ||
34 | { | ||
35 | struct dst_entry *dst = skb->dst; | ||
36 | struct xfrm_state *x = dst->xfrm; | ||
37 | struct iphdr *iph, *top_iph; | ||
38 | int flags; | ||
39 | |||
40 | iph = skb->nh.iph; | ||
41 | skb->h.ipiph = iph; | ||
42 | |||
43 | skb->nh.raw = skb_push(skb, x->props.header_len); | ||
44 | top_iph = skb->nh.iph; | ||
45 | |||
46 | if (!x->props.mode) { | ||
47 | skb->h.raw += iph->ihl*4; | ||
48 | memmove(top_iph, iph, iph->ihl*4); | ||
49 | return; | ||
50 | } | ||
51 | |||
52 | top_iph->ihl = 5; | ||
53 | top_iph->version = 4; | ||
54 | |||
55 | /* DS disclosed */ | ||
56 | top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); | ||
57 | |||
58 | flags = x->props.flags; | ||
59 | if (flags & XFRM_STATE_NOECN) | ||
60 | IP_ECN_clear(top_iph); | ||
61 | |||
62 | top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? | ||
63 | 0 : (iph->frag_off & htons(IP_DF)); | ||
64 | if (!top_iph->frag_off) | ||
65 | __ip_select_ident(top_iph, dst->child, 0); | ||
66 | |||
67 | top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); | ||
68 | |||
69 | top_iph->saddr = x->props.saddr.a4; | ||
70 | top_iph->daddr = x->id.daddr.a4; | ||
71 | top_iph->protocol = IPPROTO_IPIP; | ||
72 | |||
73 | memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); | ||
74 | } | ||
75 | |||
76 | static int xfrm4_tunnel_check_size(struct sk_buff *skb) | 19 | static int xfrm4_tunnel_check_size(struct sk_buff *skb) |
77 | { | 20 | { |
78 | int mtu, ret = 0; | 21 | int mtu, ret = 0; |
@@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_buff *skb) | |||
121 | if (err) | 64 | if (err) |
122 | goto error; | 65 | goto error; |
123 | 66 | ||
124 | xfrm4_encap(skb); | 67 | err = x->mode->output(skb); |
68 | if (err) | ||
69 | goto error; | ||
125 | 70 | ||
126 | err = x->type->output(x, skb); | 71 | err = x->type->output(x, skb); |
127 | if (err) | 72 | if (err) |
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 8604c747bca5..c0465284dfac 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c | |||
@@ -17,8 +17,6 @@ | |||
17 | static struct dst_ops xfrm4_dst_ops; | 17 | static struct dst_ops xfrm4_dst_ops; |
18 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo; | 18 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo; |
19 | 19 | ||
20 | static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED }; | ||
21 | |||
22 | static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) | 20 | static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) |
23 | { | 21 | { |
24 | return __ip_route_output_key((struct rtable**)dst, fl); | 22 | return __ip_route_output_key((struct rtable**)dst, fl); |
@@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) | |||
237 | 235 | ||
238 | static inline int xfrm4_garbage_collect(void) | 236 | static inline int xfrm4_garbage_collect(void) |
239 | { | 237 | { |
240 | read_lock(&xfrm4_policy_afinfo.lock); | ||
241 | xfrm4_policy_afinfo.garbage_collect(); | 238 | xfrm4_policy_afinfo.garbage_collect(); |
242 | read_unlock(&xfrm4_policy_afinfo.lock); | ||
243 | return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); | 239 | return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); |
244 | } | 240 | } |
245 | 241 | ||
@@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = { | |||
299 | 295 | ||
300 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { | 296 | static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { |
301 | .family = AF_INET, | 297 | .family = AF_INET, |
302 | .lock = RW_LOCK_UNLOCKED, | ||
303 | .type_map = &xfrm4_type_map, | ||
304 | .dst_ops = &xfrm4_dst_ops, | 298 | .dst_ops = &xfrm4_dst_ops, |
305 | .dst_lookup = xfrm4_dst_lookup, | 299 | .dst_lookup = xfrm4_dst_lookup, |
306 | .find_bundle = __xfrm4_find_bundle, | 300 | .find_bundle = __xfrm4_find_bundle, |
diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index dbabf81a9b7b..81e1751c966e 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c | |||
@@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto, | |||
131 | 131 | ||
132 | static struct xfrm_state_afinfo xfrm4_state_afinfo = { | 132 | static struct xfrm_state_afinfo xfrm4_state_afinfo = { |
133 | .family = AF_INET, | 133 | .family = AF_INET, |
134 | .lock = RW_LOCK_UNLOCKED, | ||
135 | .init_flags = xfrm4_init_flags, | 134 | .init_flags = xfrm4_init_flags, |
136 | .init_tempsel = __xfrm4_init_tempsel, | 135 | .init_tempsel = __xfrm4_init_tempsel, |
137 | .state_lookup = __xfrm4_state_lookup, | 136 | .state_lookup = __xfrm4_state_lookup, |
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index f8a107ab5592..e923d4dea418 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig | |||
@@ -106,6 +106,26 @@ config INET6_TUNNEL | |||
106 | tristate | 106 | tristate |
107 | default n | 107 | default n |
108 | 108 | ||
109 | config INET6_XFRM_MODE_TRANSPORT | ||
110 | tristate "IPv6: IPsec transport mode" | ||
111 | depends on IPV6 | ||
112 | default IPV6 | ||
113 | select XFRM | ||
114 | ---help--- | ||
115 | Support for IPsec transport mode. | ||
116 | |||
117 | If unsure, say Y. | ||
118 | |||
119 | config INET6_XFRM_MODE_TUNNEL | ||
120 | tristate "IPv6: IPsec tunnel mode" | ||
121 | depends on IPV6 | ||
122 | default IPV6 | ||
123 | select XFRM | ||
124 | ---help--- | ||
125 | Support for IPsec tunnel mode. | ||
126 | |||
127 | If unsure, say Y. | ||
128 | |||
109 | config IPV6_TUNNEL | 129 | config IPV6_TUNNEL |
110 | tristate "IPv6: IPv6-in-IPv6 tunnel" | 130 | tristate "IPv6: IPv6-in-IPv6 tunnel" |
111 | select INET6_TUNNEL | 131 | select INET6_TUNNEL |
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index a760b0988fbb..386e0a626948 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile | |||
@@ -20,6 +20,8 @@ obj-$(CONFIG_INET6_ESP) += esp6.o | |||
20 | obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o | 20 | obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o |
21 | obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o | 21 | obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o |
22 | obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o | 22 | obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o |
23 | obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o | ||
24 | obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o | ||
23 | obj-$(CONFIG_NETFILTER) += netfilter/ | 25 | obj-$(CONFIG_NETFILTER) += netfilter/ |
24 | 26 | ||
25 | obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o | 27 | obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o |
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 445006ee4522..c2c26fa0943d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c | |||
@@ -2860,6 +2860,11 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) | |||
2860 | return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen); | 2860 | return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen); |
2861 | } | 2861 | } |
2862 | 2862 | ||
2863 | /* Maximum length of ifa_cacheinfo attributes */ | ||
2864 | #define INET6_IFADDR_RTA_SPACE \ | ||
2865 | RTA_SPACE(16) /* IFA_ADDRESS */ + \ | ||
2866 | RTA_SPACE(sizeof(struct ifa_cacheinfo)) /* CACHEINFO */ | ||
2867 | |||
2863 | static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, | 2868 | static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, |
2864 | u32 pid, u32 seq, int event, unsigned int flags) | 2869 | u32 pid, u32 seq, int event, unsigned int flags) |
2865 | { | 2870 | { |
@@ -3092,7 +3097,7 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) | |||
3092 | static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) | 3097 | static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) |
3093 | { | 3098 | { |
3094 | struct sk_buff *skb; | 3099 | struct sk_buff *skb; |
3095 | int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128); | 3100 | int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + INET6_IFADDR_RTA_SPACE); |
3096 | 3101 | ||
3097 | skb = alloc_skb(size, GFP_ATOMIC); | 3102 | skb = alloc_skb(size, GFP_ATOMIC); |
3098 | if (!skb) { | 3103 | if (!skb) { |
@@ -3142,6 +3147,17 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, | |||
3142 | #endif | 3147 | #endif |
3143 | } | 3148 | } |
3144 | 3149 | ||
3150 | /* Maximum length of ifinfomsg attributes */ | ||
3151 | #define INET6_IFINFO_RTA_SPACE \ | ||
3152 | RTA_SPACE(IFNAMSIZ) /* IFNAME */ + \ | ||
3153 | RTA_SPACE(MAX_ADDR_LEN) /* ADDRESS */ + \ | ||
3154 | RTA_SPACE(sizeof(u32)) /* MTU */ + \ | ||
3155 | RTA_SPACE(sizeof(int)) /* LINK */ + \ | ||
3156 | RTA_SPACE(0) /* PROTINFO */ + \ | ||
3157 | RTA_SPACE(sizeof(u32)) /* FLAGS */ + \ | ||
3158 | RTA_SPACE(sizeof(struct ifla_cacheinfo)) /* CACHEINFO */ + \ | ||
3159 | RTA_SPACE(sizeof(__s32[DEVCONF_MAX])) /* CONF */ | ||
3160 | |||
3145 | static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, | 3161 | static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, |
3146 | u32 pid, u32 seq, int event, unsigned int flags) | 3162 | u32 pid, u32 seq, int event, unsigned int flags) |
3147 | { | 3163 | { |
@@ -3235,8 +3251,7 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) | |||
3235 | void inet6_ifinfo_notify(int event, struct inet6_dev *idev) | 3251 | void inet6_ifinfo_notify(int event, struct inet6_dev *idev) |
3236 | { | 3252 | { |
3237 | struct sk_buff *skb; | 3253 | struct sk_buff *skb; |
3238 | /* 128 bytes ?? */ | 3254 | int size = NLMSG_SPACE(sizeof(struct ifinfomsg) + INET6_IFINFO_RTA_SPACE); |
3239 | int size = NLMSG_SPACE(sizeof(struct ifinfomsg)+128); | ||
3240 | 3255 | ||
3241 | skb = alloc_skb(size, GFP_ATOMIC); | 3256 | skb = alloc_skb(size, GFP_ATOMIC); |
3242 | if (!skb) { | 3257 | if (!skb) { |
@@ -3252,6 +3267,11 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) | |||
3252 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC); | 3267 | netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC); |
3253 | } | 3268 | } |
3254 | 3269 | ||
3270 | /* Maximum length of prefix_cacheinfo attributes */ | ||
3271 | #define INET6_PREFIX_RTA_SPACE \ | ||
3272 | RTA_SPACE(sizeof(((struct prefix_info *)NULL)->prefix)) /* ADDRESS */ + \ | ||
3273 | RTA_SPACE(sizeof(struct prefix_cacheinfo)) /* CACHEINFO */ | ||
3274 | |||
3255 | static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, | 3275 | static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, |
3256 | struct prefix_info *pinfo, u32 pid, u32 seq, | 3276 | struct prefix_info *pinfo, u32 pid, u32 seq, |
3257 | int event, unsigned int flags) | 3277 | int event, unsigned int flags) |
@@ -3296,7 +3316,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, | |||
3296 | struct prefix_info *pinfo) | 3316 | struct prefix_info *pinfo) |
3297 | { | 3317 | { |
3298 | struct sk_buff *skb; | 3318 | struct sk_buff *skb; |
3299 | int size = NLMSG_SPACE(sizeof(struct prefixmsg)+128); | 3319 | int size = NLMSG_SPACE(sizeof(struct prefixmsg) + INET6_PREFIX_RTA_SPACE); |
3300 | 3320 | ||
3301 | skb = alloc_skb(size, GFP_ATOMIC); | 3321 | skb = alloc_skb(size, GFP_ATOMIC); |
3302 | if (!skb) { | 3322 | if (!skb) { |
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 6778173a3dda..d31c0d6c0448 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c | |||
@@ -292,7 +292,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) | |||
292 | 292 | ||
293 | memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); | 293 | memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); |
294 | memset(ah->auth_data, 0, ahp->icv_trunc_len); | 294 | memset(ah->auth_data, 0, ahp->icv_trunc_len); |
295 | skb_push(skb, skb->data - skb->nh.raw); | 295 | skb_push(skb, hdr_len); |
296 | ahp->icv(ahp, skb, ah->auth_data); | 296 | ahp->icv(ahp, skb, ah->auth_data); |
297 | if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { | 297 | if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { |
298 | LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n"); | 298 | LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n"); |
@@ -301,12 +301,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) | |||
301 | } | 301 | } |
302 | } | 302 | } |
303 | 303 | ||
304 | skb->nh.raw = skb_pull(skb, ah_hlen); | 304 | skb->h.raw = memcpy(skb->nh.raw += ah_hlen, tmp_hdr, hdr_len); |
305 | memcpy(skb->nh.raw, tmp_hdr, hdr_len); | 305 | __skb_pull(skb, ah_hlen + hdr_len); |
306 | skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); | ||
307 | skb_pull(skb, hdr_len); | ||
308 | skb->h.raw = skb->data; | ||
309 | |||
310 | 306 | ||
311 | kfree(tmp_hdr); | 307 | kfree(tmp_hdr); |
312 | 308 | ||
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 22f046079037..a15a6f320f70 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c | |||
@@ -142,25 +142,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) | |||
142 | 142 | ||
143 | int hdr_len = skb->h.raw - skb->nh.raw; | 143 | int hdr_len = skb->h.raw - skb->nh.raw; |
144 | int nfrags; | 144 | int nfrags; |
145 | unsigned char *tmp_hdr = NULL; | ||
146 | int ret = 0; | 145 | int ret = 0; |
147 | 146 | ||
148 | if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) { | 147 | if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) { |
149 | ret = -EINVAL; | 148 | ret = -EINVAL; |
150 | goto out_nofree; | 149 | goto out; |
151 | } | 150 | } |
152 | 151 | ||
153 | if (elen <= 0 || (elen & (blksize-1))) { | 152 | if (elen <= 0 || (elen & (blksize-1))) { |
154 | ret = -EINVAL; | 153 | ret = -EINVAL; |
155 | goto out_nofree; | 154 | goto out; |
156 | } | ||
157 | |||
158 | tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); | ||
159 | if (!tmp_hdr) { | ||
160 | ret = -ENOMEM; | ||
161 | goto out_nofree; | ||
162 | } | 155 | } |
163 | memcpy(tmp_hdr, skb->nh.raw, hdr_len); | ||
164 | 156 | ||
165 | /* If integrity check is required, do this. */ | 157 | /* If integrity check is required, do this. */ |
166 | if (esp->auth.icv_full_len) { | 158 | if (esp->auth.icv_full_len) { |
@@ -222,16 +214,12 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) | |||
222 | /* ... check padding bits here. Silly. :-) */ | 214 | /* ... check padding bits here. Silly. :-) */ |
223 | 215 | ||
224 | pskb_trim(skb, skb->len - alen - padlen - 2); | 216 | pskb_trim(skb, skb->len - alen - padlen - 2); |
225 | skb->h.raw = skb_pull(skb, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen); | ||
226 | skb->nh.raw += sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen; | ||
227 | memcpy(skb->nh.raw, tmp_hdr, hdr_len); | ||
228 | skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); | ||
229 | ret = nexthdr[1]; | 217 | ret = nexthdr[1]; |
230 | } | 218 | } |
231 | 219 | ||
220 | skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - hdr_len; | ||
221 | |||
232 | out: | 222 | out: |
233 | kfree(tmp_hdr); | ||
234 | out_nofree: | ||
235 | return ret; | 223 | return ret; |
236 | } | 224 | } |
237 | 225 | ||
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e46048974f37..d29620f4910e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/in6.h> | 39 | #include <linux/in6.h> |
40 | #include <linux/tcp.h> | 40 | #include <linux/tcp.h> |
41 | #include <linux/route.h> | 41 | #include <linux/route.h> |
42 | #include <linux/module.h> | ||
42 | 43 | ||
43 | #include <linux/netfilter.h> | 44 | #include <linux/netfilter.h> |
44 | #include <linux/netfilter_ipv6.h> | 45 | #include <linux/netfilter_ipv6.h> |
@@ -458,6 +459,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) | |||
458 | nf_bridge_get(to->nf_bridge); | 459 | nf_bridge_get(to->nf_bridge); |
459 | #endif | 460 | #endif |
460 | #endif | 461 | #endif |
462 | skb_copy_secmark(to, from); | ||
461 | } | 463 | } |
462 | 464 | ||
463 | int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) | 465 | int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) |
@@ -488,6 +490,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) | |||
488 | 490 | ||
489 | return offset; | 491 | return offset; |
490 | } | 492 | } |
493 | EXPORT_SYMBOL_GPL(ip6_find_1stfragopt); | ||
491 | 494 | ||
492 | static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) | 495 | static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) |
493 | { | 496 | { |
diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 48636436028a..f28cd37feed3 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c | |||
@@ -65,38 +65,25 @@ static LIST_HEAD(ipcomp6_tfms_list); | |||
65 | 65 | ||
66 | static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) | 66 | static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) |
67 | { | 67 | { |
68 | int err = 0; | 68 | int err = -ENOMEM; |
69 | u8 nexthdr = 0; | ||
70 | int hdr_len = skb->h.raw - skb->nh.raw; | ||
71 | unsigned char *tmp_hdr = NULL; | ||
72 | struct ipv6hdr *iph; | 69 | struct ipv6hdr *iph; |
70 | struct ipv6_comp_hdr *ipch; | ||
73 | int plen, dlen; | 71 | int plen, dlen; |
74 | struct ipcomp_data *ipcd = x->data; | 72 | struct ipcomp_data *ipcd = x->data; |
75 | u8 *start, *scratch; | 73 | u8 *start, *scratch; |
76 | struct crypto_tfm *tfm; | 74 | struct crypto_tfm *tfm; |
77 | int cpu; | 75 | int cpu; |
78 | 76 | ||
79 | if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && | 77 | if (skb_linearize_cow(skb)) |
80 | skb_linearize(skb, GFP_ATOMIC) != 0) { | ||
81 | err = -ENOMEM; | ||
82 | goto out; | 78 | goto out; |
83 | } | ||
84 | 79 | ||
85 | skb->ip_summed = CHECKSUM_NONE; | 80 | skb->ip_summed = CHECKSUM_NONE; |
86 | 81 | ||
87 | /* Remove ipcomp header and decompress original payload */ | 82 | /* Remove ipcomp header and decompress original payload */ |
88 | iph = skb->nh.ipv6h; | 83 | iph = skb->nh.ipv6h; |
89 | tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); | 84 | ipch = (void *)skb->data; |
90 | if (!tmp_hdr) | 85 | skb->h.raw = skb->nh.raw + sizeof(*ipch); |
91 | goto out; | 86 | __skb_pull(skb, sizeof(*ipch)); |
92 | memcpy(tmp_hdr, iph, hdr_len); | ||
93 | nexthdr = *(u8 *)skb->data; | ||
94 | skb_pull(skb, sizeof(struct ipv6_comp_hdr)); | ||
95 | skb->nh.raw += sizeof(struct ipv6_comp_hdr); | ||
96 | memcpy(skb->nh.raw, tmp_hdr, hdr_len); | ||
97 | iph = skb->nh.ipv6h; | ||
98 | iph->payload_len = htons(ntohs(iph->payload_len) - sizeof(struct ipv6_comp_hdr)); | ||
99 | skb->h.raw = skb->data; | ||
100 | 87 | ||
101 | /* decompression */ | 88 | /* decompression */ |
102 | plen = skb->len; | 89 | plen = skb->len; |
@@ -125,18 +112,11 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) | |||
125 | 112 | ||
126 | skb_put(skb, dlen - plen); | 113 | skb_put(skb, dlen - plen); |
127 | memcpy(skb->data, scratch, dlen); | 114 | memcpy(skb->data, scratch, dlen); |
115 | err = ipch->nexthdr; | ||
128 | 116 | ||
129 | iph = skb->nh.ipv6h; | ||
130 | iph->payload_len = htons(skb->len); | ||
131 | |||
132 | out_put_cpu: | 117 | out_put_cpu: |
133 | put_cpu(); | 118 | put_cpu(); |
134 | out: | 119 | out: |
135 | kfree(tmp_hdr); | ||
136 | if (err) | ||
137 | goto error_out; | ||
138 | return nexthdr; | ||
139 | error_out: | ||
140 | return err; | 120 | return err; |
141 | } | 121 | } |
142 | 122 | ||
@@ -159,10 +139,8 @@ static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb) | |||
159 | goto out_ok; | 139 | goto out_ok; |
160 | } | 140 | } |
161 | 141 | ||
162 | if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && | 142 | if (skb_linearize_cow(skb)) |
163 | skb_linearize(skb, GFP_ATOMIC) != 0) { | ||
164 | goto out_ok; | 143 | goto out_ok; |
165 | } | ||
166 | 144 | ||
167 | /* compression */ | 145 | /* compression */ |
168 | plen = skb->len - hdr_len; | 146 | plen = skb->len - hdr_len; |
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 93bae36f2663..2a71c3b669f1 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | |||
@@ -189,7 +189,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum, | |||
189 | 189 | ||
190 | /* This is where we call the helper: as the packet goes out. */ | 190 | /* This is where we call the helper: as the packet goes out. */ |
191 | ct = nf_ct_get(*pskb, &ctinfo); | 191 | ct = nf_ct_get(*pskb, &ctinfo); |
192 | if (!ct) | 192 | if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) |
193 | goto out; | 193 | goto out; |
194 | 194 | ||
195 | help = nfct_help(ct); | 195 | help = nfct_help(ct); |
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 86c6703265d0..ef18a7b7014b 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | |||
@@ -233,7 +233,7 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff, | |||
233 | return -NF_ACCEPT; | 233 | return -NF_ACCEPT; |
234 | } | 234 | } |
235 | 235 | ||
236 | if (hooknum == NF_IP6_PRE_ROUTING && | 236 | if (nf_conntrack_checksum && hooknum == NF_IP6_PRE_ROUTING && |
237 | nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { | 237 | nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { |
238 | nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, | 238 | nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, |
239 | "nf_ct_icmpv6: ICMPv6 checksum failed\n"); | 239 | "nf_ct_icmpv6: ICMPv6 checksum failed\n"); |
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 3e319035f82d..c32a029e43f0 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c | |||
@@ -456,13 +456,9 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, | |||
456 | DEBUGP("queue: message is too short.\n"); | 456 | DEBUGP("queue: message is too short.\n"); |
457 | goto err; | 457 | goto err; |
458 | } | 458 | } |
459 | if (end-offset < skb->len) { | 459 | if (pskb_trim_rcsum(skb, end - offset)) { |
460 | if (pskb_trim(skb, end - offset)) { | 460 | DEBUGP("Can't trim\n"); |
461 | DEBUGP("Can't trim\n"); | 461 | goto err; |
462 | goto err; | ||
463 | } | ||
464 | if (skb->ip_summed != CHECKSUM_UNNECESSARY) | ||
465 | skb->ip_summed = CHECKSUM_NONE; | ||
466 | } | 462 | } |
467 | 463 | ||
468 | /* Find out which fragments are in front and at the back of us | 464 | /* Find out which fragments are in front and at the back of us |
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 301eee726b0f..a50eb306e9e2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c | |||
@@ -1218,8 +1218,16 @@ process: | |||
1218 | bh_lock_sock(sk); | 1218 | bh_lock_sock(sk); |
1219 | ret = 0; | 1219 | ret = 0; |
1220 | if (!sock_owned_by_user(sk)) { | 1220 | if (!sock_owned_by_user(sk)) { |
1221 | if (!tcp_prequeue(sk, skb)) | 1221 | #ifdef CONFIG_NET_DMA |
1222 | ret = tcp_v6_do_rcv(sk, skb); | 1222 | struct tcp_sock *tp = tcp_sk(sk); |
1223 | if (tp->ucopy.dma_chan) | ||
1224 | ret = tcp_v6_do_rcv(sk, skb); | ||
1225 | else | ||
1226 | #endif | ||
1227 | { | ||
1228 | if (!tcp_prequeue(sk, skb)) | ||
1229 | ret = tcp_v6_do_rcv(sk, skb); | ||
1230 | } | ||
1223 | } else | 1231 | } else |
1224 | sk_add_backlog(sk, skb); | 1232 | sk_add_backlog(sk, skb); |
1225 | bh_unlock_sock(sk); | 1233 | bh_unlock_sock(sk); |
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 00cfdee18dca..0405d74ff910 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c | |||
@@ -13,21 +13,9 @@ | |||
13 | #include <linux/string.h> | 13 | #include <linux/string.h> |
14 | #include <linux/netfilter.h> | 14 | #include <linux/netfilter.h> |
15 | #include <linux/netfilter_ipv6.h> | 15 | #include <linux/netfilter_ipv6.h> |
16 | #include <net/dsfield.h> | ||
17 | #include <net/inet_ecn.h> | ||
18 | #include <net/ip.h> | ||
19 | #include <net/ipv6.h> | 16 | #include <net/ipv6.h> |
20 | #include <net/xfrm.h> | 17 | #include <net/xfrm.h> |
21 | 18 | ||
22 | static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) | ||
23 | { | ||
24 | struct ipv6hdr *outer_iph = skb->nh.ipv6h; | ||
25 | struct ipv6hdr *inner_iph = skb->h.ipv6h; | ||
26 | |||
27 | if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) | ||
28 | IP6_ECN_set_ce(inner_iph); | ||
29 | } | ||
30 | |||
31 | int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi) | 19 | int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi) |
32 | { | 20 | { |
33 | int err; | 21 | int err; |
@@ -81,21 +69,10 @@ int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi) | |||
81 | 69 | ||
82 | xfrm_vec[xfrm_nr++] = x; | 70 | xfrm_vec[xfrm_nr++] = x; |
83 | 71 | ||
72 | if (x->mode->input(x, skb)) | ||
73 | goto drop; | ||
74 | |||
84 | if (x->props.mode) { /* XXX */ | 75 | if (x->props.mode) { /* XXX */ |
85 | if (nexthdr != IPPROTO_IPV6) | ||
86 | goto drop; | ||
87 | if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) | ||
88 | goto drop; | ||
89 | if (skb_cloned(skb) && | ||
90 | pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) | ||
91 | goto drop; | ||
92 | if (x->props.flags & XFRM_STATE_DECAP_DSCP) | ||
93 | ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); | ||
94 | if (!(x->props.flags & XFRM_STATE_NOECN)) | ||
95 | ipip6_ecn_decapsulate(skb); | ||
96 | skb->mac.raw = memmove(skb->data - skb->mac_len, | ||
97 | skb->mac.raw, skb->mac_len); | ||
98 | skb->nh.raw = skb->data; | ||
99 | decaps = 1; | 76 | decaps = 1; |
100 | break; | 77 | break; |
101 | } | 78 | } |
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c new file mode 100644 index 000000000000..711d713e36d8 --- /dev/null +++ b/net/ipv6/xfrm6_mode_transport.c | |||
@@ -0,0 +1,88 @@ | |||
1 | /* | ||
2 | * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6. | ||
3 | * | ||
4 | * Copyright (C) 2002 USAGI/WIDE Project | ||
5 | * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/skbuff.h> | ||
12 | #include <linux/stringify.h> | ||
13 | #include <net/dst.h> | ||
14 | #include <net/ipv6.h> | ||
15 | #include <net/xfrm.h> | ||
16 | |||
17 | /* Add encapsulation header. | ||
18 | * | ||
19 | * The IP header and mutable extension headers will be moved forward to make | ||
20 | * space for the encapsulation header. | ||
21 | * | ||
22 | * On exit, skb->h will be set to the start of the encapsulation header to be | ||
23 | * filled in by x->type->output and skb->nh will be set to the nextheader field | ||
24 | * of the extension header directly preceding the encapsulation header, or in | ||
25 | * its absence, that of the top IP header. The value of skb->data will always | ||
26 | * point to the top IP header. | ||
27 | */ | ||
28 | static int xfrm6_transport_output(struct sk_buff *skb) | ||
29 | { | ||
30 | struct xfrm_state *x = skb->dst->xfrm; | ||
31 | struct ipv6hdr *iph; | ||
32 | u8 *prevhdr; | ||
33 | int hdr_len; | ||
34 | |||
35 | skb_push(skb, x->props.header_len); | ||
36 | iph = skb->nh.ipv6h; | ||
37 | |||
38 | hdr_len = ip6_find_1stfragopt(skb, &prevhdr); | ||
39 | skb->nh.raw = prevhdr - x->props.header_len; | ||
40 | skb->h.raw = skb->data + hdr_len; | ||
41 | memmove(skb->data, iph, hdr_len); | ||
42 | return 0; | ||
43 | } | ||
44 | |||
45 | /* Remove encapsulation header. | ||
46 | * | ||
47 | * The IP header will be moved over the top of the encapsulation header. | ||
48 | * | ||
49 | * On entry, skb->h shall point to where the IP header should be and skb->nh | ||
50 | * shall be set to where the IP header currently is. skb->data shall point | ||
51 | * to the start of the payload. | ||
52 | */ | ||
53 | static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) | ||
54 | { | ||
55 | int ihl = skb->data - skb->h.raw; | ||
56 | |||
57 | if (skb->h.raw != skb->nh.raw) | ||
58 | skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); | ||
59 | skb->nh.ipv6h->payload_len = htons(skb->len + ihl - | ||
60 | sizeof(struct ipv6hdr)); | ||
61 | skb->h.raw = skb->data; | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | static struct xfrm_mode xfrm6_transport_mode = { | ||
66 | .input = xfrm6_transport_input, | ||
67 | .output = xfrm6_transport_output, | ||
68 | .owner = THIS_MODULE, | ||
69 | .encap = XFRM_MODE_TRANSPORT, | ||
70 | }; | ||
71 | |||
72 | static int __init xfrm6_transport_init(void) | ||
73 | { | ||
74 | return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6); | ||
75 | } | ||
76 | |||
77 | static void __exit xfrm6_transport_exit(void) | ||
78 | { | ||
79 | int err; | ||
80 | |||
81 | err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6); | ||
82 | BUG_ON(err); | ||
83 | } | ||
84 | |||
85 | module_init(xfrm6_transport_init); | ||
86 | module_exit(xfrm6_transport_exit); | ||
87 | MODULE_LICENSE("GPL"); | ||
88 | MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT); | ||
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c new file mode 100644 index 000000000000..8af79be2edca --- /dev/null +++ b/net/ipv6/xfrm6_mode_tunnel.c | |||
@@ -0,0 +1,121 @@ | |||
1 | /* | ||
2 | * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6. | ||
3 | * | ||
4 | * Copyright (C) 2002 USAGI/WIDE Project | ||
5 | * Copyright (c) 2004-2006 Herbert Xu <herbert@gondor.apana.org.au> | ||
6 | */ | ||
7 | |||
8 | #include <linux/init.h> | ||
9 | #include <linux/kernel.h> | ||
10 | #include <linux/module.h> | ||
11 | #include <linux/skbuff.h> | ||
12 | #include <linux/stringify.h> | ||
13 | #include <net/dsfield.h> | ||
14 | #include <net/dst.h> | ||
15 | #include <net/inet_ecn.h> | ||
16 | #include <net/ipv6.h> | ||
17 | #include <net/xfrm.h> | ||
18 | |||
19 | static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) | ||
20 | { | ||
21 | struct ipv6hdr *outer_iph = skb->nh.ipv6h; | ||
22 | struct ipv6hdr *inner_iph = skb->h.ipv6h; | ||
23 | |||
24 | if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) | ||
25 | IP6_ECN_set_ce(inner_iph); | ||
26 | } | ||
27 | |||
28 | /* Add encapsulation header. | ||
29 | * | ||
30 | * The top IP header will be constructed per RFC 2401. The following fields | ||
31 | * in it shall be filled in by x->type->output: | ||
32 | * payload_len | ||
33 | * | ||
34 | * On exit, skb->h will be set to the start of the encapsulation header to be | ||
35 | * filled in by x->type->output and skb->nh will be set to the nextheader field | ||
36 | * of the extension header directly preceding the encapsulation header, or in | ||
37 | * its absence, that of the top IP header. The value of skb->data will always | ||
38 | * point to the top IP header. | ||
39 | */ | ||
40 | static int xfrm6_tunnel_output(struct sk_buff *skb) | ||
41 | { | ||
42 | struct dst_entry *dst = skb->dst; | ||
43 | struct xfrm_state *x = dst->xfrm; | ||
44 | struct ipv6hdr *iph, *top_iph; | ||
45 | int dsfield; | ||
46 | |||
47 | skb_push(skb, x->props.header_len); | ||
48 | iph = skb->nh.ipv6h; | ||
49 | |||
50 | skb->nh.raw = skb->data; | ||
51 | top_iph = skb->nh.ipv6h; | ||
52 | skb->nh.raw = &top_iph->nexthdr; | ||
53 | skb->h.ipv6h = top_iph + 1; | ||
54 | |||
55 | top_iph->version = 6; | ||
56 | top_iph->priority = iph->priority; | ||
57 | top_iph->flow_lbl[0] = iph->flow_lbl[0]; | ||
58 | top_iph->flow_lbl[1] = iph->flow_lbl[1]; | ||
59 | top_iph->flow_lbl[2] = iph->flow_lbl[2]; | ||
60 | dsfield = ipv6_get_dsfield(top_iph); | ||
61 | dsfield = INET_ECN_encapsulate(dsfield, dsfield); | ||
62 | if (x->props.flags & XFRM_STATE_NOECN) | ||
63 | dsfield &= ~INET_ECN_MASK; | ||
64 | ipv6_change_dsfield(top_iph, 0, dsfield); | ||
65 | top_iph->nexthdr = IPPROTO_IPV6; | ||
66 | top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); | ||
67 | ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); | ||
68 | ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); | ||
69 | return 0; | ||
70 | } | ||
71 | |||
72 | static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) | ||
73 | { | ||
74 | int err = -EINVAL; | ||
75 | |||
76 | if (skb->nh.raw[IP6CB(skb)->nhoff] != IPPROTO_IPV6) | ||
77 | goto out; | ||
78 | if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) | ||
79 | goto out; | ||
80 | |||
81 | if (skb_cloned(skb) && | ||
82 | (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) | ||
83 | goto out; | ||
84 | |||
85 | if (x->props.flags & XFRM_STATE_DECAP_DSCP) | ||
86 | ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); | ||
87 | if (!(x->props.flags & XFRM_STATE_NOECN)) | ||
88 | ipip6_ecn_decapsulate(skb); | ||
89 | skb->mac.raw = memmove(skb->data - skb->mac_len, | ||
90 | skb->mac.raw, skb->mac_len); | ||
91 | skb->nh.raw = skb->data; | ||
92 | err = 0; | ||
93 | |||
94 | out: | ||
95 | return err; | ||
96 | } | ||
97 | |||
98 | static struct xfrm_mode xfrm6_tunnel_mode = { | ||
99 | .input = xfrm6_tunnel_input, | ||
100 | .output = xfrm6_tunnel_output, | ||
101 | .owner = THIS_MODULE, | ||
102 | .encap = XFRM_MODE_TUNNEL, | ||
103 | }; | ||
104 | |||
105 | static int __init xfrm6_tunnel_init(void) | ||
106 | { | ||
107 | return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6); | ||
108 | } | ||
109 | |||
110 | static void __exit xfrm6_tunnel_exit(void) | ||
111 | { | ||
112 | int err; | ||
113 | |||
114 | err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6); | ||
115 | BUG_ON(err); | ||
116 | } | ||
117 | |||
118 | module_init(xfrm6_tunnel_init); | ||
119 | module_exit(xfrm6_tunnel_exit); | ||
120 | MODULE_LICENSE("GPL"); | ||
121 | MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL); | ||
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 80242172a5df..16e84254a252 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c | |||
@@ -14,68 +14,9 @@ | |||
14 | #include <linux/spinlock.h> | 14 | #include <linux/spinlock.h> |
15 | #include <linux/icmpv6.h> | 15 | #include <linux/icmpv6.h> |
16 | #include <linux/netfilter_ipv6.h> | 16 | #include <linux/netfilter_ipv6.h> |
17 | #include <net/dsfield.h> | ||
18 | #include <net/inet_ecn.h> | ||
19 | #include <net/ipv6.h> | 17 | #include <net/ipv6.h> |
20 | #include <net/xfrm.h> | 18 | #include <net/xfrm.h> |
21 | 19 | ||
22 | /* Add encapsulation header. | ||
23 | * | ||
24 | * In transport mode, the IP header and mutable extension headers will be moved | ||
25 | * forward to make space for the encapsulation header. | ||
26 | * | ||
27 | * In tunnel mode, the top IP header will be constructed per RFC 2401. | ||
28 | * The following fields in it shall be filled in by x->type->output: | ||
29 | * payload_len | ||
30 | * | ||
31 | * On exit, skb->h will be set to the start of the encapsulation header to be | ||
32 | * filled in by x->type->output and skb->nh will be set to the nextheader field | ||
33 | * of the extension header directly preceding the encapsulation header, or in | ||
34 | * its absence, that of the top IP header. The value of skb->data will always | ||
35 | * point to the top IP header. | ||
36 | */ | ||
37 | static void xfrm6_encap(struct sk_buff *skb) | ||
38 | { | ||
39 | struct dst_entry *dst = skb->dst; | ||
40 | struct xfrm_state *x = dst->xfrm; | ||
41 | struct ipv6hdr *iph, *top_iph; | ||
42 | int dsfield; | ||
43 | |||
44 | skb_push(skb, x->props.header_len); | ||
45 | iph = skb->nh.ipv6h; | ||
46 | |||
47 | if (!x->props.mode) { | ||
48 | u8 *prevhdr; | ||
49 | int hdr_len; | ||
50 | |||
51 | hdr_len = ip6_find_1stfragopt(skb, &prevhdr); | ||
52 | skb->nh.raw = prevhdr - x->props.header_len; | ||
53 | skb->h.raw = skb->data + hdr_len; | ||
54 | memmove(skb->data, iph, hdr_len); | ||
55 | return; | ||
56 | } | ||
57 | |||
58 | skb->nh.raw = skb->data; | ||
59 | top_iph = skb->nh.ipv6h; | ||
60 | skb->nh.raw = &top_iph->nexthdr; | ||
61 | skb->h.ipv6h = top_iph + 1; | ||
62 | |||
63 | top_iph->version = 6; | ||
64 | top_iph->priority = iph->priority; | ||
65 | top_iph->flow_lbl[0] = iph->flow_lbl[0]; | ||
66 | top_iph->flow_lbl[1] = iph->flow_lbl[1]; | ||
67 | top_iph->flow_lbl[2] = iph->flow_lbl[2]; | ||
68 | dsfield = ipv6_get_dsfield(top_iph); | ||
69 | dsfield = INET_ECN_encapsulate(dsfield, dsfield); | ||
70 | if (x->props.flags & XFRM_STATE_NOECN) | ||
71 | dsfield &= ~INET_ECN_MASK; | ||
72 | ipv6_change_dsfield(top_iph, 0, dsfield); | ||
73 | top_iph->nexthdr = IPPROTO_IPV6; | ||
74 | top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); | ||
75 | ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); | ||
76 | ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); | ||
77 | } | ||
78 | |||
79 | static int xfrm6_tunnel_check_size(struct sk_buff *skb) | 20 | static int xfrm6_tunnel_check_size(struct sk_buff *skb) |
80 | { | 21 | { |
81 | int mtu, ret = 0; | 22 | int mtu, ret = 0; |
@@ -118,7 +59,9 @@ static int xfrm6_output_one(struct sk_buff *skb) | |||
118 | if (err) | 59 | if (err) |
119 | goto error; | 60 | goto error; |
120 | 61 | ||
121 | xfrm6_encap(skb); | 62 | err = x->mode->output(skb); |
63 | if (err) | ||
64 | goto error; | ||
122 | 65 | ||
123 | err = x->type->output(x, skb); | 66 | err = x->type->output(x, skb); |
124 | if (err) | 67 | if (err) |
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 88c840f1beb6..ee715f2691e9 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c | |||
@@ -23,8 +23,6 @@ | |||
23 | static struct dst_ops xfrm6_dst_ops; | 23 | static struct dst_ops xfrm6_dst_ops; |
24 | static struct xfrm_policy_afinfo xfrm6_policy_afinfo; | 24 | static struct xfrm_policy_afinfo xfrm6_policy_afinfo; |
25 | 25 | ||
26 | static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED }; | ||
27 | |||
28 | static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) | 26 | static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) |
29 | { | 27 | { |
30 | int err = 0; | 28 | int err = 0; |
@@ -249,9 +247,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) | |||
249 | 247 | ||
250 | static inline int xfrm6_garbage_collect(void) | 248 | static inline int xfrm6_garbage_collect(void) |
251 | { | 249 | { |
252 | read_lock(&xfrm6_policy_afinfo.lock); | ||
253 | xfrm6_policy_afinfo.garbage_collect(); | 250 | xfrm6_policy_afinfo.garbage_collect(); |
254 | read_unlock(&xfrm6_policy_afinfo.lock); | ||
255 | return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2); | 251 | return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2); |
256 | } | 252 | } |
257 | 253 | ||
@@ -311,8 +307,6 @@ static struct dst_ops xfrm6_dst_ops = { | |||
311 | 307 | ||
312 | static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { | 308 | static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { |
313 | .family = AF_INET6, | 309 | .family = AF_INET6, |
314 | .lock = RW_LOCK_UNLOCKED, | ||
315 | .type_map = &xfrm6_type_map, | ||
316 | .dst_ops = &xfrm6_dst_ops, | 310 | .dst_ops = &xfrm6_dst_ops, |
317 | .dst_lookup = xfrm6_dst_lookup, | 311 | .dst_lookup = xfrm6_dst_lookup, |
318 | .find_bundle = __xfrm6_find_bundle, | 312 | .find_bundle = __xfrm6_find_bundle, |
diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index a5723024d3b3..b33296b3f6de 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c | |||
@@ -135,7 +135,6 @@ __xfrm6_find_acq(u8 mode, u32 reqid, u8 proto, | |||
135 | 135 | ||
136 | static struct xfrm_state_afinfo xfrm6_state_afinfo = { | 136 | static struct xfrm_state_afinfo xfrm6_state_afinfo = { |
137 | .family = AF_INET6, | 137 | .family = AF_INET6, |
138 | .lock = RW_LOCK_UNLOCKED, | ||
139 | .init_tempsel = __xfrm6_init_tempsel, | 138 | .init_tempsel = __xfrm6_init_tempsel, |
140 | .state_lookup = __xfrm6_state_lookup, | 139 | .state_lookup = __xfrm6_state_lookup, |
141 | .find_acq = __xfrm6_find_acq, | 140 | .find_acq = __xfrm6_find_acq, |
diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c index a394c6fe19a2..bba3431cd9a5 100644 --- a/net/ipx/ipx_route.c +++ b/net/ipx/ipx_route.c | |||
@@ -238,7 +238,7 @@ int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, | |||
238 | } | 238 | } |
239 | 239 | ||
240 | /* Apply checksum. Not allowed on 802.3 links. */ | 240 | /* Apply checksum. Not allowed on 802.3 links. */ |
241 | if (sk->sk_no_check || intrfc->if_dlink_type == IPX_FRAME_8023) | 241 | if (sk->sk_no_check || intrfc->if_dlink_type == htons(IPX_FRAME_8023)) |
242 | ipx->ipx_checksum = 0xFFFF; | 242 | ipx->ipx_checksum = 0xFFFF; |
243 | else | 243 | else |
244 | ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr)); | 244 | ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr)); |
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index c19e9ce05a3a..57ea160f470b 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c | |||
@@ -44,6 +44,8 @@ | |||
44 | #include <net/irda/irlmp.h> | 44 | #include <net/irda/irlmp.h> |
45 | #include <net/irda/irlmp_frame.h> | 45 | #include <net/irda/irlmp_frame.h> |
46 | 46 | ||
47 | #include <asm/unaligned.h> | ||
48 | |||
47 | static __u8 irlmp_find_free_slsap(void); | 49 | static __u8 irlmp_find_free_slsap(void); |
48 | static int irlmp_slsap_inuse(__u8 slsap_sel); | 50 | static int irlmp_slsap_inuse(__u8 slsap_sel); |
49 | 51 | ||
@@ -840,6 +842,7 @@ void irlmp_do_expiry(void) | |||
840 | void irlmp_do_discovery(int nslots) | 842 | void irlmp_do_discovery(int nslots) |
841 | { | 843 | { |
842 | struct lap_cb *lap; | 844 | struct lap_cb *lap; |
845 | __u16 *data_hintsp; | ||
843 | 846 | ||
844 | /* Make sure the value is sane */ | 847 | /* Make sure the value is sane */ |
845 | if ((nslots != 1) && (nslots != 6) && (nslots != 8) && (nslots != 16)){ | 848 | if ((nslots != 1) && (nslots != 6) && (nslots != 8) && (nslots != 16)){ |
@@ -849,7 +852,8 @@ void irlmp_do_discovery(int nslots) | |||
849 | } | 852 | } |
850 | 853 | ||
851 | /* Construct new discovery info to be used by IrLAP, */ | 854 | /* Construct new discovery info to be used by IrLAP, */ |
852 | u16ho(irlmp->discovery_cmd.data.hints) = irlmp->hints.word; | 855 | data_hintsp = (__u16 *) irlmp->discovery_cmd.data.hints; |
856 | put_unaligned(irlmp->hints.word, data_hintsp); | ||
853 | 857 | ||
854 | /* | 858 | /* |
855 | * Set character set for device name (we use ASCII), and | 859 | * Set character set for device name (we use ASCII), and |
diff --git a/net/key/af_key.c b/net/key/af_key.c index 859582275cab..d5e2121ea207 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c | |||
@@ -1454,21 +1454,23 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h | |||
1454 | if (x == NULL) | 1454 | if (x == NULL) |
1455 | return -ESRCH; | 1455 | return -ESRCH; |
1456 | 1456 | ||
1457 | if ((err = security_xfrm_state_delete(x))) | ||
1458 | goto out; | ||
1459 | |||
1457 | if (xfrm_state_kern(x)) { | 1460 | if (xfrm_state_kern(x)) { |
1458 | xfrm_state_put(x); | 1461 | err = -EPERM; |
1459 | return -EPERM; | 1462 | goto out; |
1460 | } | 1463 | } |
1461 | 1464 | ||
1462 | err = xfrm_state_delete(x); | 1465 | err = xfrm_state_delete(x); |
1463 | if (err < 0) { | 1466 | if (err < 0) |
1464 | xfrm_state_put(x); | 1467 | goto out; |
1465 | return err; | ||
1466 | } | ||
1467 | 1468 | ||
1468 | c.seq = hdr->sadb_msg_seq; | 1469 | c.seq = hdr->sadb_msg_seq; |
1469 | c.pid = hdr->sadb_msg_pid; | 1470 | c.pid = hdr->sadb_msg_pid; |
1470 | c.event = XFRM_MSG_DELSA; | 1471 | c.event = XFRM_MSG_DELSA; |
1471 | km_state_notify(x, &c); | 1472 | km_state_notify(x, &c); |
1473 | out: | ||
1472 | xfrm_state_put(x); | 1474 | xfrm_state_put(x); |
1473 | 1475 | ||
1474 | return err; | 1476 | return err; |
@@ -2274,11 +2276,14 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg | |||
2274 | 2276 | ||
2275 | err = 0; | 2277 | err = 0; |
2276 | 2278 | ||
2279 | if ((err = security_xfrm_policy_delete(xp))) | ||
2280 | goto out; | ||
2277 | c.seq = hdr->sadb_msg_seq; | 2281 | c.seq = hdr->sadb_msg_seq; |
2278 | c.pid = hdr->sadb_msg_pid; | 2282 | c.pid = hdr->sadb_msg_pid; |
2279 | c.event = XFRM_MSG_DELPOLICY; | 2283 | c.event = XFRM_MSG_DELPOLICY; |
2280 | km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); | 2284 | km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); |
2281 | 2285 | ||
2286 | out: | ||
2282 | xfrm_pol_put(xp); | 2287 | xfrm_pol_put(xp); |
2283 | return err; | 2288 | return err; |
2284 | } | 2289 | } |
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 5a04db745c8d..75c9b1480801 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c | |||
@@ -674,7 +674,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
674 | 674 | ||
675 | lock_sock(sk); | 675 | lock_sock(sk); |
676 | copied = -ENOTCONN; | 676 | copied = -ENOTCONN; |
677 | if (sk->sk_state == TCP_LISTEN) | 677 | if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN)) |
678 | goto out; | 678 | goto out; |
679 | 679 | ||
680 | timeo = sock_rcvtimeo(sk, nonblock); | 680 | timeo = sock_rcvtimeo(sk, nonblock); |
@@ -733,7 +733,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
733 | if (sk->sk_shutdown & RCV_SHUTDOWN) | 733 | if (sk->sk_shutdown & RCV_SHUTDOWN) |
734 | break; | 734 | break; |
735 | 735 | ||
736 | if (sk->sk_state == TCP_CLOSE) { | 736 | if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) { |
737 | if (!sock_flag(sk, SOCK_DONE)) { | 737 | if (!sock_flag(sk, SOCK_DONE)) { |
738 | /* | 738 | /* |
739 | * This occurs when user tries to read | 739 | * This occurs when user tries to read |
@@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, | |||
789 | continue; | 789 | continue; |
790 | 790 | ||
791 | if (!(flags & MSG_PEEK)) { | 791 | if (!(flags & MSG_PEEK)) { |
792 | sk_eat_skb(sk, skb); | 792 | sk_eat_skb(sk, skb, 0); |
793 | *seq = 0; | 793 | *seq = 0; |
794 | } | 794 | } |
795 | } while (len > 0); | 795 | } while (len > 0); |
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index ba90f7f0801a..5ae47be7dde0 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c | |||
@@ -26,8 +26,6 @@ | |||
26 | #include <net/llc_c_st.h> | 26 | #include <net/llc_c_st.h> |
27 | #include <net/tcp_states.h> | 27 | #include <net/tcp_states.h> |
28 | 28 | ||
29 | u8 llc_mac_null_var[IFHWADDRLEN]; | ||
30 | |||
31 | /** | 29 | /** |
32 | * llc_build_and_send_pkt - Connection data sending for upper layers. | 30 | * llc_build_and_send_pkt - Connection data sending for upper layers. |
33 | * @sk: connection | 31 | * @sk: connection |
diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index d62e0f9b9da3..94d2368ade92 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c | |||
@@ -142,6 +142,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, | |||
142 | struct llc_sap *sap; | 142 | struct llc_sap *sap; |
143 | struct llc_pdu_sn *pdu; | 143 | struct llc_pdu_sn *pdu; |
144 | int dest; | 144 | int dest; |
145 | int (*rcv)(struct sk_buff *, struct net_device *, | ||
146 | struct packet_type *, struct net_device *); | ||
145 | 147 | ||
146 | /* | 148 | /* |
147 | * When the interface is in promisc. mode, drop all the crap that it | 149 | * When the interface is in promisc. mode, drop all the crap that it |
@@ -169,9 +171,11 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, | |||
169 | * First the upper layer protocols that don't need the full | 171 | * First the upper layer protocols that don't need the full |
170 | * LLC functionality | 172 | * LLC functionality |
171 | */ | 173 | */ |
172 | if (sap->rcv_func) { | 174 | rcv = rcu_dereference(sap->rcv_func); |
173 | sap->rcv_func(skb, dev, pt, orig_dev); | 175 | if (rcv) { |
174 | goto out_put; | 176 | struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); |
177 | if (cskb) | ||
178 | rcv(cskb, dev, pt, orig_dev); | ||
175 | } | 179 | } |
176 | dest = llc_pdu_type(skb); | 180 | dest = llc_pdu_type(skb); |
177 | if (unlikely(!dest || !llc_type_handlers[dest - 1])) | 181 | if (unlikely(!dest || !llc_type_handlers[dest - 1])) |
diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 4029ceee9b91..20c4eb5c1ac6 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c | |||
@@ -282,7 +282,7 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb) | |||
282 | * mac, and local sap. Returns pointer for socket found, %NULL otherwise. | 282 | * mac, and local sap. Returns pointer for socket found, %NULL otherwise. |
283 | */ | 283 | */ |
284 | static struct sock *llc_lookup_dgram(struct llc_sap *sap, | 284 | static struct sock *llc_lookup_dgram(struct llc_sap *sap, |
285 | struct llc_addr *laddr) | 285 | const struct llc_addr *laddr) |
286 | { | 286 | { |
287 | struct sock *rc; | 287 | struct sock *rc; |
288 | struct hlist_node *node; | 288 | struct hlist_node *node; |
@@ -304,19 +304,62 @@ found: | |||
304 | return rc; | 304 | return rc; |
305 | } | 305 | } |
306 | 306 | ||
307 | /** | ||
308 | * llc_sap_mcast - Deliver multicast PDUs to all matching datagram sockets. | ||
309 | * @sap: SAP | ||
310 | * @laddr: address of local LLC (MAC + SAP) | ||
311 | * @skb: received PDU; the caller keeps ownership of it | ||
312 | * | ||
313 | * Clones @skb to each datagram socket of @sap whose lsap matches @laddr. | ||
314 | */ | ||
315 | static void llc_sap_mcast(struct llc_sap *sap, | ||
316 | const struct llc_addr *laddr, | ||
317 | struct sk_buff *skb) | ||
318 | { | ||
319 | struct sock *sk; | ||
320 | struct hlist_node *node; | ||
321 | |||
322 | read_lock_bh(&sap->sk_list.lock); | ||
323 | sk_for_each(sk, node, &sap->sk_list.list) { | ||
324 | struct llc_sock *llc = llc_sk(sk); | ||
325 | struct sk_buff *skb1; | ||
326 | |||
327 | if (sk->sk_type != SOCK_DGRAM) | ||
328 | continue; | ||
329 | |||
330 | if (llc->laddr.lsap != laddr->lsap) | ||
331 | continue; | ||
332 | |||
333 | skb1 = skb_clone(skb, GFP_ATOMIC); | ||
334 | if (!skb1) | ||
335 | break; | ||
336 | |||
337 | sock_hold(sk); | ||
338 | skb_set_owner_r(skb1, sk); | ||
339 | llc_sap_rcv(sap, skb1); | ||
340 | sock_put(sk); | ||
341 | } | ||
342 | read_unlock_bh(&sap->sk_list.lock); | ||
343 | } | ||
344 | |||
345 | |||
307 | void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) | 346 | void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) |
308 | { | 347 | { |
309 | struct llc_addr laddr; | 348 | struct llc_addr laddr; |
310 | struct sock *sk; | ||
311 | 349 | ||
312 | llc_pdu_decode_da(skb, laddr.mac); | 350 | llc_pdu_decode_da(skb, laddr.mac); |
313 | llc_pdu_decode_dsap(skb, &laddr.lsap); | 351 | llc_pdu_decode_dsap(skb, &laddr.lsap); |
314 | 352 | ||
315 | sk = llc_lookup_dgram(sap, &laddr); | 353 | if (llc_mac_multicast(laddr.mac)) { |
316 | if (sk) { | 354 | llc_sap_mcast(sap, &laddr, skb); |
317 | skb_set_owner_r(skb, sk); | ||
318 | llc_sap_rcv(sap, skb); | ||
319 | sock_put(sk); | ||
320 | } else | ||
321 | kfree_skb(skb); | 355 | kfree_skb(skb); |
356 | } else { | ||
357 | struct sock *sk = llc_lookup_dgram(sap, &laddr); | ||
358 | if (sk) { | ||
359 | skb_set_owner_r(skb, sk); | ||
360 | llc_sap_rcv(sap, skb); | ||
361 | sock_put(sk); | ||
362 | } else | ||
363 | kfree_skb(skb); | ||
364 | } | ||
322 | } | 365 | } |
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e2893effdfaa..b1622b7de1cf 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig | |||
@@ -60,6 +60,18 @@ config NF_CONNTRACK_MARK | |||
60 | of packets, but this mark value is kept in the conntrack session | 60 | of packets, but this mark value is kept in the conntrack session |
61 | instead of the individual packets. | 61 | instead of the individual packets. |
62 | 62 | ||
63 | config NF_CONNTRACK_SECMARK | ||
64 | bool 'Connection tracking security mark support' | ||
65 | depends on NF_CONNTRACK && NETWORK_SECMARK | ||
66 | help | ||
67 | This option enables security markings to be applied to | ||
68 | connections. Typically they are copied to connections from | ||
69 | packets using the CONNSECMARK target and copied back from | ||
70 | connections to packets with the same target, with the packets | ||
71 | being originally labeled via SECMARK. | ||
72 | |||
73 | If unsure, say 'N'. | ||
74 | |||
63 | config NF_CONNTRACK_EVENTS | 75 | config NF_CONNTRACK_EVENTS |
64 | bool "Connection tracking events (EXPERIMENTAL)" | 76 | bool "Connection tracking events (EXPERIMENTAL)" |
65 | depends on EXPERIMENTAL && NF_CONNTRACK | 77 | depends on EXPERIMENTAL && NF_CONNTRACK |
@@ -174,6 +186,26 @@ config NETFILTER_XT_TARGET_NOTRACK | |||
174 | If you want to compile it as a module, say M here and read | 186 | If you want to compile it as a module, say M here and read |
175 | <file:Documentation/modules.txt>. If unsure, say `N'. | 187 | <file:Documentation/modules.txt>. If unsure, say `N'. |
176 | 188 | ||
189 | config NETFILTER_XT_TARGET_SECMARK | ||
190 | tristate '"SECMARK" target support' | ||
191 | depends on NETFILTER_XTABLES && NETWORK_SECMARK | ||
192 | help | ||
193 | The SECMARK target allows security marking of network | ||
194 | packets, for use with security subsystems. | ||
195 | |||
196 | To compile it as a module, choose M here. If unsure, say N. | ||
197 | |||
198 | config NETFILTER_XT_TARGET_CONNSECMARK | ||
199 | tristate '"CONNSECMARK" target support' | ||
200 | depends on NETFILTER_XTABLES && (NF_CONNTRACK_SECMARK || IP_NF_CONNTRACK_SECMARK) | ||
201 | help | ||
202 | The CONNSECMARK target copies security markings from packets | ||
203 | to connections, and restores security markings from connections | ||
204 | to packets (if the packets are not already marked). This would | ||
205 | normally be used in conjunction with the SECMARK target. | ||
206 | |||
207 | To compile it as a module, choose M here. If unsure, say N. | ||
208 | |||
177 | config NETFILTER_XT_MATCH_COMMENT | 209 | config NETFILTER_XT_MATCH_COMMENT |
178 | tristate '"comment" match support' | 210 | tristate '"comment" match support' |
179 | depends on NETFILTER_XTABLES | 211 | depends on NETFILTER_XTABLES |
@@ -329,6 +361,16 @@ config NETFILTER_XT_MATCH_PKTTYPE | |||
329 | 361 | ||
330 | To compile it as a module, choose M here. If unsure, say N. | 362 | To compile it as a module, choose M here. If unsure, say N. |
331 | 363 | ||
364 | config NETFILTER_XT_MATCH_QUOTA | ||
365 | tristate '"quota" match support' | ||
366 | depends on NETFILTER_XTABLES | ||
367 | help | ||
368 | This option adds a `quota' match, which allows you to match on a | ||
369 | byte counter. | ||
370 | |||
371 | If you want to compile it as a module, say M here and read | ||
372 | <file:Documentation/modules.txt>. If unsure, say `N'. | ||
373 | |||
332 | config NETFILTER_XT_MATCH_REALM | 374 | config NETFILTER_XT_MATCH_REALM |
333 | tristate '"realm" match support' | 375 | tristate '"realm" match support' |
334 | depends on NETFILTER_XTABLES | 376 | depends on NETFILTER_XTABLES |
@@ -365,6 +407,12 @@ config NETFILTER_XT_MATCH_STATE | |||
365 | 407 | ||
366 | To compile it as a module, choose M here. If unsure, say N. | 408 | To compile it as a module, choose M here. If unsure, say N. |
367 | 409 | ||
410 | config NETFILTER_XT_MATCH_STATISTIC | ||
411 | tristate '"statistic" match support' | ||
412 | depends on NETFILTER_XTABLES | ||
413 | help | ||
414 | Adds a `statistic' match, which matches packets randomly or every nth packet. | ||
415 | |||
368 | config NETFILTER_XT_MATCH_STRING | 416 | config NETFILTER_XT_MATCH_STRING |
369 | tristate '"string" match support' | 417 | tristate '"string" match support' |
370 | depends on NETFILTER_XTABLES | 418 | depends on NETFILTER_XTABLES |
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 95b7e416512d..6fa4b7580458 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile | |||
@@ -28,6 +28,8 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o | |||
28 | obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o | 28 | obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o |
29 | obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o | 29 | obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o |
30 | obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o | 30 | obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o |
31 | obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o | ||
32 | obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o | ||
31 | 33 | ||
32 | # matches | 34 | # matches |
33 | obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o | 35 | obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o |
@@ -44,9 +46,11 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o | |||
44 | obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o | 46 | obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o |
45 | obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o | 47 | obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o |
46 | obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o | 48 | obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o |
49 | obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o | ||
47 | obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o | 50 | obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o |
48 | obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o | 51 | obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o |
49 | obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o | 52 | obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o |
53 | obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o | ||
50 | obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o | 54 | obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o |
51 | obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o | 55 | obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o |
52 | obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o | 56 | obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o |
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f9b83f91371a..cd299f4b7db1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c | |||
@@ -990,6 +990,9 @@ init_conntrack(const struct nf_conntrack_tuple *tuple, | |||
990 | #ifdef CONFIG_NF_CONNTRACK_MARK | 990 | #ifdef CONFIG_NF_CONNTRACK_MARK |
991 | conntrack->mark = exp->master->mark; | 991 | conntrack->mark = exp->master->mark; |
992 | #endif | 992 | #endif |
993 | #ifdef CONFIG_NF_CONNTRACK_SECMARK | ||
994 | conntrack->secmark = exp->master->secmark; | ||
995 | #endif | ||
993 | nf_conntrack_get(&conntrack->master->ct_general); | 996 | nf_conntrack_get(&conntrack->master->ct_general); |
994 | NF_CT_STAT_INC(expect_new); | 997 | NF_CT_STAT_INC(expect_new); |
995 | } else | 998 | } else |
@@ -1396,6 +1399,12 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, | |||
1396 | 1399 | ||
1397 | write_lock_bh(&nf_conntrack_lock); | 1400 | write_lock_bh(&nf_conntrack_lock); |
1398 | 1401 | ||
1402 | /* Only update if this is not a fixed timeout */ | ||
1403 | if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { | ||
1404 | write_unlock_bh(&nf_conntrack_lock); | ||
1405 | return; | ||
1406 | } | ||
1407 | |||
1399 | /* If not in hash table, timer will not be active yet */ | 1408 | /* If not in hash table, timer will not be active yet */ |
1400 | if (!nf_ct_is_confirmed(ct)) { | 1409 | if (!nf_ct_is_confirmed(ct)) { |
1401 | ct->timeout.expires = extra_jiffies; | 1410 | ct->timeout.expires = extra_jiffies; |
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index e38a4b5a3089..11d3be243536 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c | |||
@@ -67,37 +67,48 @@ static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, | |||
67 | char); | 67 | char); |
68 | 68 | ||
69 | static struct ftp_search { | 69 | static struct ftp_search { |
70 | enum ip_conntrack_dir dir; | ||
71 | const char *pattern; | 70 | const char *pattern; |
72 | size_t plen; | 71 | size_t plen; |
73 | char skip; | 72 | char skip; |
74 | char term; | 73 | char term; |
75 | enum ip_ct_ftp_type ftptype; | 74 | enum ip_ct_ftp_type ftptype; |
76 | int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); | 75 | int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); |
77 | } search[] = { | 76 | } search[IP_CT_DIR_MAX][2] = { |
78 | { | 77 | [IP_CT_DIR_ORIGINAL] = { |
79 | IP_CT_DIR_ORIGINAL, | 78 | { |
80 | "PORT", sizeof("PORT") - 1, ' ', '\r', | 79 | .pattern = "PORT", |
81 | IP_CT_FTP_PORT, | 80 | .plen = sizeof("PORT") - 1, |
82 | try_rfc959, | 81 | .skip = ' ', |
82 | .term = '\r', | ||
83 | .ftptype = IP_CT_FTP_PORT, | ||
84 | .getnum = try_rfc959, | ||
85 | }, | ||
86 | { | ||
87 | .pattern = "EPRT", | ||
88 | .plen = sizeof("EPRT") - 1, | ||
89 | .skip = ' ', | ||
90 | .term = '\r', | ||
91 | .ftptype = IP_CT_FTP_EPRT, | ||
92 | .getnum = try_eprt, | ||
93 | }, | ||
83 | }, | 94 | }, |
84 | { | 95 | [IP_CT_DIR_REPLY] = { |
85 | IP_CT_DIR_REPLY, | 96 | { |
86 | "227 ", sizeof("227 ") - 1, '(', ')', | 97 | .pattern = "227 ", |
87 | IP_CT_FTP_PASV, | 98 | .plen = sizeof("227 ") - 1, |
88 | try_rfc959, | 99 | .skip = '(', |
89 | }, | 100 | .term = ')', |
90 | { | 101 | .ftptype = IP_CT_FTP_PASV, |
91 | IP_CT_DIR_ORIGINAL, | 102 | .getnum = try_rfc959, |
92 | "EPRT", sizeof("EPRT") - 1, ' ', '\r', | 103 | }, |
93 | IP_CT_FTP_EPRT, | 104 | { |
94 | try_eprt, | 105 | .pattern = "229 ", |
95 | }, | 106 | .plen = sizeof("229 ") - 1, |
96 | { | 107 | .skip = '(', |
97 | IP_CT_DIR_REPLY, | 108 | .term = ')', |
98 | "229 ", sizeof("229 ") - 1, '(', ')', | 109 | .ftptype = IP_CT_FTP_EPSV, |
99 | IP_CT_FTP_EPSV, | 110 | .getnum = try_epsv_response, |
100 | try_epsv_response, | 111 | }, |
101 | }, | 112 | }, |
102 | }; | 113 | }; |
103 | 114 | ||
@@ -492,17 +503,15 @@ static int help(struct sk_buff **pskb, | |||
492 | memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, | 503 | memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, |
493 | sizeof(cmd.u3.all)); | 504 | sizeof(cmd.u3.all)); |
494 | 505 | ||
495 | for (i = 0; i < ARRAY_SIZE(search); i++) { | 506 | for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { |
496 | if (search[i].dir != dir) continue; | ||
497 | |||
498 | found = find_pattern(fb_ptr, datalen, | 507 | found = find_pattern(fb_ptr, datalen, |
499 | search[i].pattern, | 508 | search[dir][i].pattern, |
500 | search[i].plen, | 509 | search[dir][i].plen, |
501 | search[i].skip, | 510 | search[dir][i].skip, |
502 | search[i].term, | 511 | search[dir][i].term, |
503 | &matchoff, &matchlen, | 512 | &matchoff, &matchlen, |
504 | &cmd, | 513 | &cmd, |
505 | search[i].getnum); | 514 | search[dir][i].getnum); |
506 | if (found) break; | 515 | if (found) break; |
507 | } | 516 | } |
508 | if (found == -1) { | 517 | if (found == -1) { |
@@ -512,7 +521,7 @@ static int help(struct sk_buff **pskb, | |||
512 | this case. */ | 521 | this case. */ |
513 | if (net_ratelimit()) | 522 | if (net_ratelimit()) |
514 | printk("conntrack_ftp: partial %s %u+%u\n", | 523 | printk("conntrack_ftp: partial %s %u+%u\n", |
515 | search[i].pattern, | 524 | search[dir][i].pattern, |
516 | ntohl(th->seq), datalen); | 525 | ntohl(th->seq), datalen); |
517 | ret = NF_DROP; | 526 | ret = NF_DROP; |
518 | goto out; | 527 | goto out; |
@@ -597,7 +606,7 @@ static int help(struct sk_buff **pskb, | |||
597 | /* Now, NAT might want to mangle the packet, and register the | 606 | /* Now, NAT might want to mangle the packet, and register the |
598 | * (possibly changed) expectation itself. */ | 607 | * (possibly changed) expectation itself. */ |
599 | if (nf_nat_ftp_hook) | 608 | if (nf_nat_ftp_hook) |
600 | ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, | 609 | ret = nf_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype, |
601 | matchoff, matchlen, exp, &seq); | 610 | matchoff, matchlen, exp, &seq); |
602 | else { | 611 | else { |
603 | /* Can't expect this? Best to drop packet now. */ | 612 | /* Can't expect this? Best to drop packet now. */ |
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index bd10eb944b65..b8c7c567c9df 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c | |||
@@ -407,6 +407,8 @@ nfattr_failure: | |||
407 | 407 | ||
408 | static int ctnetlink_done(struct netlink_callback *cb) | 408 | static int ctnetlink_done(struct netlink_callback *cb) |
409 | { | 409 | { |
410 | if (cb->args[1]) | ||
411 | nf_ct_put((struct nf_conn *)cb->args[1]); | ||
410 | DEBUGP("entered %s\n", __FUNCTION__); | 412 | DEBUGP("entered %s\n", __FUNCTION__); |
411 | return 0; | 413 | return 0; |
412 | } | 414 | } |
@@ -416,10 +418,9 @@ static int ctnetlink_done(struct netlink_callback *cb) | |||
416 | static int | 418 | static int |
417 | ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) | 419 | ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) |
418 | { | 420 | { |
419 | struct nf_conn *ct = NULL; | 421 | struct nf_conn *ct, *last; |
420 | struct nf_conntrack_tuple_hash *h; | 422 | struct nf_conntrack_tuple_hash *h; |
421 | struct list_head *i; | 423 | struct list_head *i; |
422 | u_int32_t *id = (u_int32_t *) &cb->args[1]; | ||
423 | struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); | 424 | struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); |
424 | u_int8_t l3proto = nfmsg->nfgen_family; | 425 | u_int8_t l3proto = nfmsg->nfgen_family; |
425 | 426 | ||
@@ -427,7 +428,9 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) | |||
427 | cb->args[0], *id); | 428 | cb->args[0], *id); |
428 | 429 | ||
429 | read_lock_bh(&nf_conntrack_lock); | 430 | read_lock_bh(&nf_conntrack_lock); |
430 | for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) { | 431 | for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { |
432 | restart: | ||
433 | last = (struct nf_conn *)cb->args[1]; | ||
431 | list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) { | 434 | list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) { |
432 | h = (struct nf_conntrack_tuple_hash *) i; | 435 | h = (struct nf_conntrack_tuple_hash *) i; |
433 | if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) | 436 | if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) |
@@ -438,17 +441,30 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) | |||
438 | * then dump everything. */ | 441 | * then dump everything. */ |
439 | if (l3proto && L3PROTO(ct) != l3proto) | 442 | if (l3proto && L3PROTO(ct) != l3proto) |
440 | continue; | 443 | continue; |
441 | if (ct->id <= *id) | 444 | if (last != NULL) { |
442 | continue; | 445 | if (ct == last) { |
446 | nf_ct_put(last); | ||
447 | cb->args[1] = 0; | ||
448 | last = NULL; | ||
449 | } else | ||
450 | continue; | ||
451 | } | ||
443 | if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, | 452 | if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, |
444 | cb->nlh->nlmsg_seq, | 453 | cb->nlh->nlmsg_seq, |
445 | IPCTNL_MSG_CT_NEW, | 454 | IPCTNL_MSG_CT_NEW, |
446 | 1, ct) < 0) | 455 | 1, ct) < 0) { |
456 | nf_conntrack_get(&ct->ct_general); | ||
457 | cb->args[1] = (unsigned long)ct; | ||
447 | goto out; | 458 | goto out; |
448 | *id = ct->id; | 459 | } |
460 | } | ||
461 | if (last != NULL) { | ||
462 | nf_ct_put(last); | ||
463 | cb->args[1] = 0; | ||
464 | goto restart; | ||
449 | } | 465 | } |
450 | } | 466 | } |
451 | out: | 467 | out: |
452 | read_unlock_bh(&nf_conntrack_lock); | 468 | read_unlock_bh(&nf_conntrack_lock); |
453 | 469 | ||
454 | DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); | 470 | DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); |
@@ -641,7 +657,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = { | |||
641 | }; | 657 | }; |
642 | 658 | ||
643 | static inline int | 659 | static inline int |
644 | ctnetlink_parse_nat(struct nfattr *cda[], | 660 | ctnetlink_parse_nat(struct nfattr *nat, |
645 | const struct nf_conn *ct, struct ip_nat_range *range) | 661 | const struct nf_conn *ct, struct ip_nat_range *range) |
646 | { | 662 | { |
647 | struct nfattr *tb[CTA_NAT_MAX]; | 663 | struct nfattr *tb[CTA_NAT_MAX]; |
@@ -651,7 +667,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], | |||
651 | 667 | ||
652 | memset(range, 0, sizeof(*range)); | 668 | memset(range, 0, sizeof(*range)); |
653 | 669 | ||
654 | nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); | 670 | nfattr_parse_nested(tb, CTA_NAT_MAX, nat); |
655 | 671 | ||
656 | if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) | 672 | if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) |
657 | return -EINVAL; | 673 | return -EINVAL; |
@@ -866,39 +882,30 @@ ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[]) | |||
866 | /* ASSURED bit can only be set */ | 882 | /* ASSURED bit can only be set */ |
867 | return -EINVAL; | 883 | return -EINVAL; |
868 | 884 | ||
869 | if (cda[CTA_NAT-1]) { | 885 | if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { |
870 | #ifndef CONFIG_IP_NF_NAT_NEEDED | 886 | #ifndef CONFIG_IP_NF_NAT_NEEDED |
871 | return -EINVAL; | 887 | return -EINVAL; |
872 | #else | 888 | #else |
873 | unsigned int hooknum; | ||
874 | struct ip_nat_range range; | 889 | struct ip_nat_range range; |
875 | 890 | ||
876 | if (ctnetlink_parse_nat(cda, ct, &range) < 0) | 891 | if (cda[CTA_NAT_DST-1]) { |
877 | return -EINVAL; | 892 | if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, |
878 | 893 | &range) < 0) | |
879 | DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", | 894 | return -EINVAL; |
880 | NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), | 895 | if (ip_nat_initialized(ct, |
881 | htons(range.min.all), htons(range.max.all)); | 896 | HOOK2MANIP(NF_IP_PRE_ROUTING))) |
882 | 897 | return -EEXIST; | |
883 | /* This is tricky but it works. ip_nat_setup_info needs the | 898 | ip_nat_setup_info(ct, &range, hooknum); |
884 | * hook number as parameter, so let's do the correct | 899 | } |
885 | * conversion and run away */ | 900 | if (cda[CTA_NAT_SRC-1]) { |
886 | if (status & IPS_SRC_NAT_DONE) | 901 | if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, |
887 | hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ | 902 | &range) < 0) |
888 | else if (status & IPS_DST_NAT_DONE) | 903 | return -EINVAL; |
889 | hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ | 904 | if (ip_nat_initialized(ct, |
890 | else | 905 | HOOK2MANIP(NF_IP_POST_ROUTING))) |
891 | return -EINVAL; /* Missing NAT flags */ | 906 | return -EEXIST; |
892 | 907 | ip_nat_setup_info(ct, &range, hooknum); | |
893 | DEBUGP("NAT status: %lu\n", | 908 | } |
894 | status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); | ||
895 | |||
896 | if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) | ||
897 | return -EEXIST; | ||
898 | ip_nat_setup_info(ct, &range, hooknum); | ||
899 | |||
900 | DEBUGP("NAT status after setup_info: %lu\n", | ||
901 | ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); | ||
902 | #endif | 909 | #endif |
903 | } | 910 | } |
904 | 911 | ||
@@ -1122,7 +1129,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, | |||
1122 | /* implicit 'else' */ | 1129 | /* implicit 'else' */ |
1123 | 1130 | ||
1124 | /* we only allow nat config for new conntracks */ | 1131 | /* we only allow nat config for new conntracks */ |
1125 | if (cda[CTA_NAT-1]) { | 1132 | if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { |
1126 | err = -EINVAL; | 1133 | err = -EINVAL; |
1127 | goto out_unlock; | 1134 | goto out_unlock; |
1128 | } | 1135 | } |
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 69899f27d26a..12fb7c0a1509 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c | |||
@@ -828,8 +828,9 @@ static int tcp_error(struct sk_buff *skb, | |||
828 | * and moreover root might send raw packets. | 828 | * and moreover root might send raw packets. |
829 | */ | 829 | */ |
830 | /* FIXME: Source route IP option packets --RR */ | 830 | /* FIXME: Source route IP option packets --RR */ |
831 | if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || | 831 | if (nf_conntrack_checksum && |
832 | (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && | 832 | ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || |
833 | (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && | ||
833 | nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { | 834 | nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { |
834 | if (LOG_INVALID(IPPROTO_TCP)) | 835 | if (LOG_INVALID(IPPROTO_TCP)) |
835 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, | 836 | nf_log_packet(pf, 0, skb, NULL, NULL, NULL, |
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index d93edbfde9e3..ae07ebe3ab37 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c | |||
@@ -134,7 +134,8 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff, | |||
134 | * because the semantic of CHECKSUM_HW is different there | 134 | * because the semantic of CHECKSUM_HW is different there |
135 | * and moreover root might send raw packets. | 135 | * and moreover root might send raw packets. |
136 | * FIXME: Source route IP option packets --RR */ | 136 | * FIXME: Source route IP option packets --RR */ |
137 | if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || | 137 | if (nf_conntrack_checksum && |
138 | ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || | ||
138 | (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && | 139 | (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && |
139 | nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { | 140 | nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { |
140 | if (LOG_INVALID(IPPROTO_UDP)) | 141 | if (LOG_INVALID(IPPROTO_UDP)) |
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 408960c6a544..e34c574f0351 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c | |||
@@ -213,6 +213,11 @@ static int ct_seq_show(struct seq_file *s, void *v) | |||
213 | return -ENOSPC; | 213 | return -ENOSPC; |
214 | #endif | 214 | #endif |
215 | 215 | ||
216 | #ifdef CONFIG_NF_CONNTRACK_SECMARK | ||
217 | if (seq_printf(s, "secmark=%u ", conntrack->secmark)) | ||
218 | return -ENOSPC; | ||
219 | #endif | ||
220 | |||
216 | if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) | 221 | if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) |
217 | return -ENOSPC; | 222 | return -ENOSPC; |
218 | 223 | ||
@@ -455,6 +460,8 @@ extern unsigned int nf_ct_generic_timeout; | |||
455 | static int log_invalid_proto_min = 0; | 460 | static int log_invalid_proto_min = 0; |
456 | static int log_invalid_proto_max = 255; | 461 | static int log_invalid_proto_max = 255; |
457 | 462 | ||
463 | int nf_conntrack_checksum = 1; | ||
464 | |||
458 | static struct ctl_table_header *nf_ct_sysctl_header; | 465 | static struct ctl_table_header *nf_ct_sysctl_header; |
459 | 466 | ||
460 | static ctl_table nf_ct_sysctl_table[] = { | 467 | static ctl_table nf_ct_sysctl_table[] = { |
@@ -483,6 +490,14 @@ static ctl_table nf_ct_sysctl_table[] = { | |||
483 | .proc_handler = &proc_dointvec, | 490 | .proc_handler = &proc_dointvec, |
484 | }, | 491 | }, |
485 | { | 492 | { |
493 | .ctl_name = NET_NF_CONNTRACK_CHECKSUM, | ||
494 | .procname = "nf_conntrack_checksum", | ||
495 | .data = &nf_conntrack_checksum, | ||
496 | .maxlen = sizeof(unsigned int), | ||
497 | .mode = 0644, | ||
498 | .proc_handler = &proc_dointvec, | ||
499 | }, | ||
500 | { | ||
486 | .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, | 501 | .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, |
487 | .procname = "nf_conntrack_tcp_timeout_syn_sent", | 502 | .procname = "nf_conntrack_tcp_timeout_syn_sent", |
488 | .data = &nf_ct_tcp_timeout_syn_sent, | 503 | .data = &nf_ct_tcp_timeout_syn_sent, |
@@ -851,6 +866,7 @@ EXPORT_SYMBOL(nf_ct_proto_put); | |||
851 | EXPORT_SYMBOL(nf_ct_l3proto_find_get); | 866 | EXPORT_SYMBOL(nf_ct_l3proto_find_get); |
852 | EXPORT_SYMBOL(nf_ct_l3proto_put); | 867 | EXPORT_SYMBOL(nf_ct_l3proto_put); |
853 | EXPORT_SYMBOL(nf_ct_l3protos); | 868 | EXPORT_SYMBOL(nf_ct_l3protos); |
869 | EXPORT_SYMBOL_GPL(nf_conntrack_checksum); | ||
854 | EXPORT_SYMBOL(nf_conntrack_expect_alloc); | 870 | EXPORT_SYMBOL(nf_conntrack_expect_alloc); |
855 | EXPORT_SYMBOL(nf_conntrack_expect_put); | 871 | EXPORT_SYMBOL(nf_conntrack_expect_put); |
856 | EXPORT_SYMBOL(nf_conntrack_expect_related); | 872 | EXPORT_SYMBOL(nf_conntrack_expect_related); |
diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c new file mode 100644 index 000000000000..8c011e020769 --- /dev/null +++ b/net/netfilter/xt_CONNSECMARK.c | |||
@@ -0,0 +1,155 @@ | |||
1 | /* | ||
2 | * This module is used to copy security markings from packets | ||
3 | * to connections, and restore security markings from connections | ||
4 | * back to packets. This would normally be performed in conjunction | ||
5 | * with the SECMARK target and state match. | ||
6 | * | ||
7 | * Based somewhat on CONNMARK: | ||
8 | * Copyright (C) 2002,2004 MARA Systems AB <http://www.marasystems.com> | ||
9 | * by Henrik Nordstrom <hno@marasystems.com> | ||
10 | * | ||
11 | * (C) 2006 Red Hat, Inc., James Morris <jmorris@redhat.com> | ||
12 | * | ||
13 | * This program is free software; you can redistribute it and/or modify | ||
14 | * it under the terms of the GNU General Public License version 2 as | ||
15 | * published by the Free Software Foundation. | ||
16 | * | ||
17 | */ | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/skbuff.h> | ||
20 | #include <linux/netfilter/x_tables.h> | ||
21 | #include <linux/netfilter/xt_CONNSECMARK.h> | ||
22 | #include <net/netfilter/nf_conntrack_compat.h> | ||
23 | |||
24 | #define PFX "CONNSECMARK: " | ||
25 | |||
26 | MODULE_LICENSE("GPL"); | ||
27 | MODULE_AUTHOR("James Morris <jmorris@redhat.com>"); | ||
28 | MODULE_DESCRIPTION("ip[6]tables CONNSECMARK module"); | ||
29 | MODULE_ALIAS("ipt_CONNSECMARK"); | ||
30 | MODULE_ALIAS("ip6t_CONNSECMARK"); | ||
31 | |||
32 | /* | ||
33 | * If the packet has a security mark and the connection does not, copy | ||
34 | * the security mark from the packet to the connection. | ||
35 | */ | ||
36 | static void secmark_save(struct sk_buff *skb) | ||
37 | { | ||
38 | if (skb->secmark) { | ||
39 | u32 *connsecmark; | ||
40 | enum ip_conntrack_info ctinfo; | ||
41 | |||
42 | connsecmark = nf_ct_get_secmark(skb, &ctinfo); | ||
43 | if (connsecmark && !*connsecmark) | ||
44 | if (*connsecmark != skb->secmark) | ||
45 | *connsecmark = skb->secmark; | ||
46 | } | ||
47 | } | ||
48 | |||
49 | /* | ||
50 | * If packet has no security mark, and the connection does, restore the | ||
51 | * security mark from the connection to the packet. | ||
52 | */ | ||
53 | static void secmark_restore(struct sk_buff *skb) | ||
54 | { | ||
55 | if (!skb->secmark) { | ||
56 | u32 *connsecmark; | ||
57 | enum ip_conntrack_info ctinfo; | ||
58 | |||
59 | connsecmark = nf_ct_get_secmark(skb, &ctinfo); | ||
60 | if (connsecmark && *connsecmark) | ||
61 | if (skb->secmark != *connsecmark) | ||
62 | skb->secmark = *connsecmark; | ||
63 | } | ||
64 | } | ||
65 | |||
66 | static unsigned int target(struct sk_buff **pskb, const struct net_device *in, | ||
67 | const struct net_device *out, unsigned int hooknum, | ||
68 | const struct xt_target *target, | ||
69 | const void *targinfo, void *userinfo) | ||
70 | { | ||
71 | struct sk_buff *skb = *pskb; | ||
72 | const struct xt_connsecmark_target_info *info = targinfo; | ||
73 | |||
74 | switch (info->mode) { | ||
75 | case CONNSECMARK_SAVE: | ||
76 | secmark_save(skb); | ||
77 | break; | ||
78 | |||
79 | case CONNSECMARK_RESTORE: | ||
80 | secmark_restore(skb); | ||
81 | break; | ||
82 | |||
83 | default: | ||
84 | BUG(); | ||
85 | } | ||
86 | |||
87 | return XT_CONTINUE; | ||
88 | } | ||
89 | |||
90 | static int checkentry(const char *tablename, const void *entry, | ||
91 | const struct xt_target *target, void *targinfo, | ||
92 | unsigned int targinfosize, unsigned int hook_mask) | ||
93 | { | ||
94 | struct xt_connsecmark_target_info *info = targinfo; | ||
95 | |||
96 | switch (info->mode) { | ||
97 | case CONNSECMARK_SAVE: | ||
98 | case CONNSECMARK_RESTORE: | ||
99 | break; | ||
100 | |||
101 | default: | ||
102 | printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode); | ||
103 | return 0; | ||
104 | } | ||
105 | |||
106 | return 1; | ||
107 | } | ||
108 | |||
109 | static struct xt_target ipt_connsecmark_reg = { | ||
110 | .name = "CONNSECMARK", | ||
111 | .target = target, | ||
112 | .targetsize = sizeof(struct xt_connsecmark_target_info), | ||
113 | .table = "mangle", | ||
114 | .checkentry = checkentry, | ||
115 | .me = THIS_MODULE, | ||
116 | .family = AF_INET, | ||
117 | .revision = 0, | ||
118 | }; | ||
119 | |||
120 | static struct xt_target ip6t_connsecmark_reg = { | ||
121 | .name = "CONNSECMARK", | ||
122 | .target = target, | ||
123 | .targetsize = sizeof(struct xt_connsecmark_target_info), | ||
124 | .table = "mangle", | ||
125 | .checkentry = checkentry, | ||
126 | .me = THIS_MODULE, | ||
127 | .family = AF_INET6, | ||
128 | .revision = 0, | ||
129 | }; | ||
130 | |||
131 | static int __init xt_connsecmark_init(void) | ||
132 | { | ||
133 | int err; | ||
134 | |||
135 | need_conntrack(); | ||
136 | |||
137 | err = xt_register_target(&ipt_connsecmark_reg); | ||
138 | if (err) | ||
139 | return err; | ||
140 | |||
141 | err = xt_register_target(&ip6t_connsecmark_reg); | ||
142 | if (err) | ||
143 | xt_unregister_target(&ipt_connsecmark_reg); | ||
144 | |||
145 | return err; | ||
146 | } | ||
147 | |||
148 | static void __exit xt_connsecmark_fini(void) | ||
149 | { | ||
150 | xt_unregister_target(&ip6t_connsecmark_reg); | ||
151 | xt_unregister_target(&ipt_connsecmark_reg); | ||
152 | } | ||
153 | |||
154 | module_init(xt_connsecmark_init); | ||
155 | module_exit(xt_connsecmark_fini); | ||
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c new file mode 100644 index 000000000000..c2ce9c4011cc --- /dev/null +++ b/net/netfilter/xt_SECMARK.c | |||
@@ -0,0 +1,156 @@ | |||
1 | /* | ||
2 | * Module for modifying the secmark field of the skb, for use by | ||
3 | * security subsystems. | ||
4 | * | ||
5 | * Based on the nfmark match by: | ||
6 | * (C) 1999-2001 Marc Boucher <marc@mbsi.ca> | ||
7 | * | ||
8 | * (C) 2006 Red Hat, Inc., James Morris <jmorris@redhat.com> | ||
9 | * | ||
10 | * This program is free software; you can redistribute it and/or modify | ||
11 | * it under the terms of the GNU General Public License version 2 as | ||
12 | * published by the Free Software Foundation. | ||
13 | * | ||
14 | */ | ||
15 | #include <linux/module.h> | ||
16 | #include <linux/skbuff.h> | ||
17 | #include <linux/selinux.h> | ||
18 | #include <linux/netfilter/x_tables.h> | ||
19 | #include <linux/netfilter/xt_SECMARK.h> | ||
20 | |||
21 | MODULE_LICENSE("GPL"); | ||
22 | MODULE_AUTHOR("James Morris <jmorris@redhat.com>"); | ||
23 | MODULE_DESCRIPTION("ip[6]tables SECMARK modification module"); | ||
24 | MODULE_ALIAS("ipt_SECMARK"); | ||
25 | MODULE_ALIAS("ip6t_SECMARK"); | ||
26 | |||
27 | #define PFX "SECMARK: " | ||
28 | |||
29 | static u8 mode; | ||
30 | |||
31 | static unsigned int target(struct sk_buff **pskb, const struct net_device *in, | ||
32 | const struct net_device *out, unsigned int hooknum, | ||
33 | const struct xt_target *target, | ||
34 | const void *targinfo, void *userinfo) | ||
35 | { | ||
36 | u32 secmark = 0; | ||
37 | const struct xt_secmark_target_info *info = targinfo; | ||
38 | |||
39 | BUG_ON(info->mode != mode); | ||
40 | |||
41 | switch (mode) { | ||
42 | case SECMARK_MODE_SEL: | ||
43 | secmark = info->u.sel.selsid; | ||
44 | break; | ||
45 | |||
46 | default: | ||
47 | BUG(); | ||
48 | } | ||
49 | |||
50 | if ((*pskb)->secmark != secmark) | ||
51 | (*pskb)->secmark = secmark; | ||
52 | |||
53 | return XT_CONTINUE; | ||
54 | } | ||
55 | |||
56 | static int checkentry_selinux(struct xt_secmark_target_info *info) | ||
57 | { | ||
58 | int err; | ||
59 | struct xt_secmark_target_selinux_info *sel = &info->u.sel; | ||
60 | |||
61 | err = selinux_string_to_sid(sel->selctx, &sel->selsid); | ||
62 | if (err) { | ||
63 | if (err == -EINVAL) | ||
64 | printk(KERN_INFO PFX "invalid SELinux context \'%s\'\n", | ||
65 | sel->selctx); | ||
66 | return 0; | ||
67 | } | ||
68 | |||
69 | if (!sel->selsid) { | ||
70 | printk(KERN_INFO PFX "unable to map SELinux context \'%s\'\n", | ||
71 | sel->selctx); | ||
72 | return 0; | ||
73 | } | ||
74 | |||
75 | err = selinux_relabel_packet_permission(sel->selsid); | ||
76 | if (err) { | ||
77 | printk(KERN_INFO PFX "unable to obtain relabeling permission\n"); | ||
78 | return 0; | ||
79 | } | ||
80 | |||
81 | return 1; | ||
82 | } | ||
83 | |||
84 | static int checkentry(const char *tablename, const void *entry, | ||
85 | const struct xt_target *target, void *targinfo, | ||
86 | unsigned int targinfosize, unsigned int hook_mask) | ||
87 | { | ||
88 | struct xt_secmark_target_info *info = targinfo; | ||
89 | |||
90 | if (mode && mode != info->mode) { | ||
91 | printk(KERN_INFO PFX "mode already set to %hu cannot mix with " | ||
92 | "rules for mode %hu\n", mode, info->mode); | ||
93 | return 0; | ||
94 | } | ||
95 | |||
96 | switch (info->mode) { | ||
97 | case SECMARK_MODE_SEL: | ||
98 | if (!checkentry_selinux(info)) | ||
99 | return 0; | ||
100 | break; | ||
101 | |||
102 | default: | ||
103 | printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode); | ||
104 | return 0; | ||
105 | } | ||
106 | |||
107 | if (!mode) | ||
108 | mode = info->mode; | ||
109 | return 1; | ||
110 | } | ||
111 | |||
112 | static struct xt_target ipt_secmark_reg = { | ||
113 | .name = "SECMARK", | ||
114 | .target = target, | ||
115 | .targetsize = sizeof(struct xt_secmark_target_info), | ||
116 | .table = "mangle", | ||
117 | .checkentry = checkentry, | ||
118 | .me = THIS_MODULE, | ||
119 | .family = AF_INET, | ||
120 | .revision = 0, | ||
121 | }; | ||
122 | |||
123 | static struct xt_target ip6t_secmark_reg = { | ||
124 | .name = "SECMARK", | ||
125 | .target = target, | ||
126 | .targetsize = sizeof(struct xt_secmark_target_info), | ||
127 | .table = "mangle", | ||
128 | .checkentry = checkentry, | ||
129 | .me = THIS_MODULE, | ||
130 | .family = AF_INET6, | ||
131 | .revision = 0, | ||
132 | }; | ||
133 | |||
134 | static int __init xt_secmark_init(void) | ||
135 | { | ||
136 | int err; | ||
137 | |||
138 | err = xt_register_target(&ipt_secmark_reg); | ||
139 | if (err) | ||
140 | return err; | ||
141 | |||
142 | err = xt_register_target(&ip6t_secmark_reg); | ||
143 | if (err) | ||
144 | xt_unregister_target(&ipt_secmark_reg); | ||
145 | |||
146 | return err; | ||
147 | } | ||
148 | |||
149 | static void __exit xt_secmark_fini(void) | ||
150 | { | ||
151 | xt_unregister_target(&ip6t_secmark_reg); | ||
152 | xt_unregister_target(&ipt_secmark_reg); | ||
153 | } | ||
154 | |||
155 | module_init(xt_secmark_init); | ||
156 | module_exit(xt_secmark_fini); | ||
diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index dc26a27cbcaf..56324c8aff0a 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c | |||
@@ -58,7 +58,7 @@ checkentry(const char *tablename, | |||
58 | unsigned int matchsize, | 58 | unsigned int matchsize, |
59 | unsigned int hook_mask) | 59 | unsigned int hook_mask) |
60 | { | 60 | { |
61 | struct xt_connmark_info *cm = (struct xt_connmark_info *)matchinfo; | 61 | struct xt_connmark_info *cm = matchinfo; |
62 | 62 | ||
63 | if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { | 63 | if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { |
64 | printk(KERN_WARNING "connmark: only support 32bit mark\n"); | 64 | printk(KERN_WARNING "connmark: only support 32bit mark\n"); |
diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index dfb10b648e57..2e2f825dad4c 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c | |||
@@ -101,8 +101,7 @@ match(const struct sk_buff *skb, | |||
101 | unsigned int protoff, | 101 | unsigned int protoff, |
102 | int *hotdrop) | 102 | int *hotdrop) |
103 | { | 103 | { |
104 | const struct xt_dccp_info *info = | 104 | const struct xt_dccp_info *info = matchinfo; |
105 | (const struct xt_dccp_info *)matchinfo; | ||
106 | struct dccp_hdr _dh, *dh; | 105 | struct dccp_hdr _dh, *dh; |
107 | 106 | ||
108 | if (offset) | 107 | if (offset) |
diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 8b385a34886d..876bc5797738 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c | |||
@@ -42,7 +42,7 @@ checkentry(const char *tablename, | |||
42 | unsigned int matchsize, | 42 | unsigned int matchsize, |
43 | unsigned int hook_mask) | 43 | unsigned int hook_mask) |
44 | { | 44 | { |
45 | struct xt_mark_info *minfo = (struct xt_mark_info *) matchinfo; | 45 | const struct xt_mark_info *minfo = matchinfo; |
46 | 46 | ||
47 | if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { | 47 | if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { |
48 | printk(KERN_WARNING "mark: only supports 32bit mark\n"); | 48 | printk(KERN_WARNING "mark: only supports 32bit mark\n"); |
diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index b56cd2baaac2..1ff0a25396e7 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c | |||
@@ -1,4 +1,4 @@ | |||
1 | /* Kernel module to match one of a list of TCP/UDP ports: ports are in | 1 | /* Kernel module to match one of a list of TCP/UDP/SCTP/DCCP ports: ports are in |
2 | the same place so we can treat them as equal. */ | 2 | the same place so we can treat them as equal. */ |
3 | 3 | ||
4 | /* (C) 1999-2001 Paul `Rusty' Russell | 4 | /* (C) 1999-2001 Paul `Rusty' Russell |
@@ -160,8 +160,9 @@ check(u_int16_t proto, | |||
160 | u_int8_t match_flags, | 160 | u_int8_t match_flags, |
161 | u_int8_t count) | 161 | u_int8_t count) |
162 | { | 162 | { |
163 | /* Must specify proto == TCP/UDP, no unknown flags or bad count */ | 163 | /* Must specify supported protocol, no unknown flags or bad count */ |
164 | return (proto == IPPROTO_TCP || proto == IPPROTO_UDP) | 164 | return (proto == IPPROTO_TCP || proto == IPPROTO_UDP |
165 | || proto == IPPROTO_SCTP || proto == IPPROTO_DCCP) | ||
165 | && !(ip_invflags & XT_INV_PROTO) | 166 | && !(ip_invflags & XT_INV_PROTO) |
166 | && (match_flags == XT_MULTIPORT_SOURCE | 167 | && (match_flags == XT_MULTIPORT_SOURCE |
167 | || match_flags == XT_MULTIPORT_DESTINATION | 168 | || match_flags == XT_MULTIPORT_DESTINATION |
diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c new file mode 100644 index 000000000000..4cdba7469dc4 --- /dev/null +++ b/net/netfilter/xt_quota.c | |||
@@ -0,0 +1,96 @@ | |||
1 | /* | ||
2 | * netfilter module to enforce network quotas | ||
3 | * | ||
4 | * Sam Johnston <samj@samj.net> | ||
5 | */ | ||
6 | #include <linux/skbuff.h> | ||
7 | #include <linux/spinlock.h> | ||
8 | |||
9 | #include <linux/netfilter/x_tables.h> | ||
10 | #include <linux/netfilter/xt_quota.h> | ||
11 | |||
12 | MODULE_LICENSE("GPL"); | ||
13 | MODULE_AUTHOR("Sam Johnston <samj@samj.net>"); | ||
14 | |||
15 | static DEFINE_SPINLOCK(quota_lock); | ||
16 | |||
17 | static int | ||
18 | match(const struct sk_buff *skb, | ||
19 | const struct net_device *in, const struct net_device *out, | ||
20 | const struct xt_match *match, const void *matchinfo, | ||
21 | int offset, unsigned int protoff, int *hotdrop) | ||
22 | { | ||
23 | struct xt_quota_info *q = ((struct xt_quota_info *)matchinfo)->master; | ||
24 | int ret = q->flags & XT_QUOTA_INVERT ? 1 : 0; | ||
25 | |||
26 | spin_lock_bh(&quota_lock); | ||
27 | if (q->quota >= skb->len) { | ||
28 | q->quota -= skb->len; | ||
29 | ret ^= 1; | ||
30 | } else { | ||
31 | /* we do not allow even small packets from now on */ | ||
32 | q->quota = 0; | ||
33 | } | ||
34 | spin_unlock_bh(&quota_lock); | ||
35 | |||
36 | return ret; | ||
37 | } | ||
38 | |||
39 | static int | ||
40 | checkentry(const char *tablename, const void *entry, | ||
41 | const struct xt_match *match, void *matchinfo, | ||
42 | unsigned int matchsize, unsigned int hook_mask) | ||
43 | { | ||
44 | struct xt_quota_info *q = (struct xt_quota_info *)matchinfo; | ||
45 | |||
46 | if (q->flags & ~XT_QUOTA_MASK) | ||
47 | return 0; | ||
48 | /* For SMP, we only want to use one set of counters. */ | ||
49 | q->master = q; | ||
50 | return 1; | ||
51 | } | ||
52 | |||
53 | static struct xt_match quota_match = { | ||
54 | .name = "quota", | ||
55 | .family = AF_INET, | ||
56 | .match = match, | ||
57 | .matchsize = sizeof(struct xt_quota_info), | ||
58 | .checkentry = checkentry, | ||
59 | .me = THIS_MODULE | ||
60 | }; | ||
61 | |||
62 | static struct xt_match quota_match6 = { | ||
63 | .name = "quota", | ||
64 | .family = AF_INET6, | ||
65 | .match = match, | ||
66 | .matchsize = sizeof(struct xt_quota_info), | ||
67 | .checkentry = checkentry, | ||
68 | .me = THIS_MODULE | ||
69 | }; | ||
70 | |||
71 | static int __init xt_quota_init(void) | ||
72 | { | ||
73 | int ret; | ||
74 | |||
75 | ret = xt_register_match(&quota_match); | ||
76 | if (ret) | ||
77 | goto err1; | ||
78 | ret = xt_register_match(&quota_match6); | ||
79 | if (ret) | ||
80 | goto err2; | ||
81 | return ret; | ||
82 | |||
83 | err2: | ||
84 | xt_unregister_match(&quota_match); | ||
85 | err1: | ||
86 | return ret; | ||
87 | } | ||
88 | |||
89 | static void __exit xt_quota_fini(void) | ||
90 | { | ||
91 | xt_unregister_match(&quota_match6); | ||
92 | xt_unregister_match(&quota_match); | ||
93 | } | ||
94 | |||
95 | module_init(xt_quota_init); | ||
96 | module_exit(xt_quota_fini); | ||
diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index 34bd87259a09..b5110e5b54b0 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c | |||
@@ -129,11 +129,9 @@ match(const struct sk_buff *skb, | |||
129 | unsigned int protoff, | 129 | unsigned int protoff, |
130 | int *hotdrop) | 130 | int *hotdrop) |
131 | { | 131 | { |
132 | const struct xt_sctp_info *info; | 132 | const struct xt_sctp_info *info = matchinfo; |
133 | sctp_sctphdr_t _sh, *sh; | 133 | sctp_sctphdr_t _sh, *sh; |
134 | 134 | ||
135 | info = (const struct xt_sctp_info *)matchinfo; | ||
136 | |||
137 | if (offset) { | 135 | if (offset) { |
138 | duprintf("Dropping non-first fragment.. FIXME\n"); | 136 | duprintf("Dropping non-first fragment.. FIXME\n"); |
139 | return 0; | 137 | return 0; |
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c new file mode 100644 index 000000000000..de1037f58596 --- /dev/null +++ b/net/netfilter/xt_statistic.c | |||
@@ -0,0 +1,112 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2006 Patrick McHardy <kaber@trash.net> | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License version 2 as | ||
6 | * published by the Free Software Foundation. | ||
7 | * | ||
8 | * Based on ipt_random and ipt_nth by Fabrice MARIE <fabrice@netfilter.org>. | ||
9 | */ | ||
10 | |||
11 | #include <linux/init.h> | ||
12 | #include <linux/spinlock.h> | ||
13 | #include <linux/skbuff.h> | ||
14 | #include <linux/net.h> | ||
15 | |||
16 | #include <linux/netfilter/xt_statistic.h> | ||
17 | #include <linux/netfilter/x_tables.h> | ||
18 | |||
19 | MODULE_LICENSE("GPL"); | ||
20 | MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); | ||
21 | MODULE_DESCRIPTION("xtables statistical match module"); | ||
22 | MODULE_ALIAS("ipt_statistic"); | ||
23 | MODULE_ALIAS("ip6t_statistic"); | ||
24 | |||
25 | static DEFINE_SPINLOCK(nth_lock); | ||
26 | |||
27 | static int | ||
28 | match(const struct sk_buff *skb, | ||
29 | const struct net_device *in, const struct net_device *out, | ||
30 | const struct xt_match *match, const void *matchinfo, | ||
31 | int offset, unsigned int protoff, int *hotdrop) | ||
32 | { | ||
33 | struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo; | ||
34 | int ret = info->flags & XT_STATISTIC_INVERT ? 1 : 0; | ||
35 | |||
36 | switch (info->mode) { | ||
37 | case XT_STATISTIC_MODE_RANDOM: | ||
38 | if ((net_random() & 0x7FFFFFFF) < info->u.random.probability) | ||
39 | ret ^= 1; | ||
40 | break; | ||
41 | case XT_STATISTIC_MODE_NTH: | ||
42 | info = info->master; | ||
43 | spin_lock_bh(&nth_lock); | ||
44 | if (info->u.nth.count++ == info->u.nth.every) { | ||
45 | info->u.nth.count = 0; | ||
46 | ret ^= 1; | ||
47 | } | ||
48 | spin_unlock_bh(&nth_lock); | ||
49 | break; | ||
50 | } | ||
51 | |||
52 | return ret; | ||
53 | } | ||
54 | |||
55 | static int | ||
56 | checkentry(const char *tablename, const void *entry, | ||
57 | const struct xt_match *match, void *matchinfo, | ||
58 | unsigned int matchsize, unsigned int hook_mask) | ||
59 | { | ||
60 | struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo; | ||
61 | |||
62 | if (info->mode > XT_STATISTIC_MODE_MAX || | ||
63 | info->flags & ~XT_STATISTIC_MASK) | ||
64 | return 0; | ||
65 | info->master = info; | ||
66 | return 1; | ||
67 | } | ||
68 | |||
69 | static struct xt_match statistic_match = { | ||
70 | .name = "statistic", | ||
71 | .match = match, | ||
72 | .matchsize = sizeof(struct xt_statistic_info), | ||
73 | .checkentry = checkentry, | ||
74 | .family = AF_INET, | ||
75 | .me = THIS_MODULE, | ||
76 | }; | ||
77 | |||
78 | static struct xt_match statistic_match6 = { | ||
79 | .name = "statistic", | ||
80 | .match = match, | ||
81 | .matchsize = sizeof(struct xt_statistic_info), | ||
82 | .checkentry = checkentry, | ||
83 | .family = AF_INET6, | ||
84 | .me = THIS_MODULE, | ||
85 | }; | ||
86 | |||
87 | static int __init xt_statistic_init(void) | ||
88 | { | ||
89 | int ret; | ||
90 | |||
91 | ret = xt_register_match(&statistic_match); | ||
92 | if (ret) | ||
93 | goto err1; | ||
94 | |||
95 | ret = xt_register_match(&statistic_match6); | ||
96 | if (ret) | ||
97 | goto err2; | ||
98 | return ret; | ||
99 | err2: | ||
100 | xt_unregister_match(&statistic_match); | ||
101 | err1: | ||
102 | return ret; | ||
103 | } | ||
104 | |||
105 | static void __exit xt_statistic_fini(void) | ||
106 | { | ||
107 | xt_unregister_match(&statistic_match6); | ||
108 | xt_unregister_match(&statistic_match); | ||
109 | } | ||
110 | |||
111 | module_init(xt_statistic_init); | ||
112 | module_exit(xt_statistic_fini); | ||
diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 79d9ea6964ba..0ebb6ac2c8c7 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c | |||
@@ -30,8 +30,8 @@ static int match(const struct sk_buff *skb, | |||
30 | unsigned int protoff, | 30 | unsigned int protoff, |
31 | int *hotdrop) | 31 | int *hotdrop) |
32 | { | 32 | { |
33 | const struct xt_string_info *conf = matchinfo; | ||
33 | struct ts_state state; | 34 | struct ts_state state; |
34 | struct xt_string_info *conf = (struct xt_string_info *) matchinfo; | ||
35 | 35 | ||
36 | memset(&state, 0, sizeof(struct ts_state)); | 36 | memset(&state, 0, sizeof(struct ts_state)); |
37 | 37 | ||
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 138ea92ed268..b1e4c5e20ac7 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c | |||
@@ -72,9 +72,9 @@ void qdisc_unlock_tree(struct net_device *dev) | |||
72 | dev->queue_lock serializes queue accesses for this device | 72 | dev->queue_lock serializes queue accesses for this device |
73 | AND dev->qdisc pointer itself. | 73 | AND dev->qdisc pointer itself. |
74 | 74 | ||
75 | dev->xmit_lock serializes accesses to device driver. | 75 | netif_tx_lock serializes accesses to device driver. |
76 | 76 | ||
77 | dev->queue_lock and dev->xmit_lock are mutually exclusive, | 77 | dev->queue_lock and netif_tx_lock are mutually exclusive, |
78 | if one is grabbed, another must be free. | 78 | if one is grabbed, another must be free. |
79 | */ | 79 | */ |
80 | 80 | ||
@@ -108,7 +108,7 @@ int qdisc_restart(struct net_device *dev) | |||
108 | * will be requeued. | 108 | * will be requeued. |
109 | */ | 109 | */ |
110 | if (!nolock) { | 110 | if (!nolock) { |
111 | if (!spin_trylock(&dev->xmit_lock)) { | 111 | if (!netif_tx_trylock(dev)) { |
112 | collision: | 112 | collision: |
113 | /* So, someone grabbed the driver. */ | 113 | /* So, someone grabbed the driver. */ |
114 | 114 | ||
@@ -126,8 +126,6 @@ int qdisc_restart(struct net_device *dev) | |||
126 | __get_cpu_var(netdev_rx_stat).cpu_collision++; | 126 | __get_cpu_var(netdev_rx_stat).cpu_collision++; |
127 | goto requeue; | 127 | goto requeue; |
128 | } | 128 | } |
129 | /* Remember that the driver is grabbed by us. */ | ||
130 | dev->xmit_lock_owner = smp_processor_id(); | ||
131 | } | 129 | } |
132 | 130 | ||
133 | { | 131 | { |
@@ -142,8 +140,7 @@ int qdisc_restart(struct net_device *dev) | |||
142 | ret = dev->hard_start_xmit(skb, dev); | 140 | ret = dev->hard_start_xmit(skb, dev); |
143 | if (ret == NETDEV_TX_OK) { | 141 | if (ret == NETDEV_TX_OK) { |
144 | if (!nolock) { | 142 | if (!nolock) { |
145 | dev->xmit_lock_owner = -1; | 143 | netif_tx_unlock(dev); |
146 | spin_unlock(&dev->xmit_lock); | ||
147 | } | 144 | } |
148 | spin_lock(&dev->queue_lock); | 145 | spin_lock(&dev->queue_lock); |
149 | return -1; | 146 | return -1; |
@@ -157,8 +154,7 @@ int qdisc_restart(struct net_device *dev) | |||
157 | /* NETDEV_TX_BUSY - we need to requeue */ | 154 | /* NETDEV_TX_BUSY - we need to requeue */ |
158 | /* Release the driver */ | 155 | /* Release the driver */ |
159 | if (!nolock) { | 156 | if (!nolock) { |
160 | dev->xmit_lock_owner = -1; | 157 | netif_tx_unlock(dev); |
161 | spin_unlock(&dev->xmit_lock); | ||
162 | } | 158 | } |
163 | spin_lock(&dev->queue_lock); | 159 | spin_lock(&dev->queue_lock); |
164 | q = dev->qdisc; | 160 | q = dev->qdisc; |
@@ -187,7 +183,7 @@ static void dev_watchdog(unsigned long arg) | |||
187 | { | 183 | { |
188 | struct net_device *dev = (struct net_device *)arg; | 184 | struct net_device *dev = (struct net_device *)arg; |
189 | 185 | ||
190 | spin_lock(&dev->xmit_lock); | 186 | netif_tx_lock(dev); |
191 | if (dev->qdisc != &noop_qdisc) { | 187 | if (dev->qdisc != &noop_qdisc) { |
192 | if (netif_device_present(dev) && | 188 | if (netif_device_present(dev) && |
193 | netif_running(dev) && | 189 | netif_running(dev) && |
@@ -203,7 +199,7 @@ static void dev_watchdog(unsigned long arg) | |||
203 | dev_hold(dev); | 199 | dev_hold(dev); |
204 | } | 200 | } |
205 | } | 201 | } |
206 | spin_unlock(&dev->xmit_lock); | 202 | netif_tx_unlock(dev); |
207 | 203 | ||
208 | dev_put(dev); | 204 | dev_put(dev); |
209 | } | 205 | } |
@@ -227,17 +223,17 @@ void __netdev_watchdog_up(struct net_device *dev) | |||
227 | 223 | ||
228 | static void dev_watchdog_up(struct net_device *dev) | 224 | static void dev_watchdog_up(struct net_device *dev) |
229 | { | 225 | { |
230 | spin_lock_bh(&dev->xmit_lock); | 226 | netif_tx_lock_bh(dev); |
231 | __netdev_watchdog_up(dev); | 227 | __netdev_watchdog_up(dev); |
232 | spin_unlock_bh(&dev->xmit_lock); | 228 | netif_tx_unlock_bh(dev); |
233 | } | 229 | } |
234 | 230 | ||
235 | static void dev_watchdog_down(struct net_device *dev) | 231 | static void dev_watchdog_down(struct net_device *dev) |
236 | { | 232 | { |
237 | spin_lock_bh(&dev->xmit_lock); | 233 | netif_tx_lock_bh(dev); |
238 | if (del_timer(&dev->watchdog_timer)) | 234 | if (del_timer(&dev->watchdog_timer)) |
239 | dev_put(dev); | 235 | dev_put(dev); |
240 | spin_unlock_bh(&dev->xmit_lock); | 236 | netif_tx_unlock_bh(dev); |
241 | } | 237 | } |
242 | 238 | ||
243 | void netif_carrier_on(struct net_device *dev) | 239 | void netif_carrier_on(struct net_device *dev) |
@@ -582,7 +578,7 @@ void dev_deactivate(struct net_device *dev) | |||
582 | while (test_bit(__LINK_STATE_SCHED, &dev->state)) | 578 | while (test_bit(__LINK_STATE_SCHED, &dev->state)) |
583 | yield(); | 579 | yield(); |
584 | 580 | ||
585 | spin_unlock_wait(&dev->xmit_lock); | 581 | spin_unlock_wait(&dev->_xmit_lock); |
586 | } | 582 | } |
587 | 583 | ||
588 | void dev_init_scheduler(struct net_device *dev) | 584 | void dev_init_scheduler(struct net_device *dev) |
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 79b8ef34c6e4..4c16ad57a3e4 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c | |||
@@ -302,20 +302,17 @@ restart: | |||
302 | 302 | ||
303 | switch (teql_resolve(skb, skb_res, slave)) { | 303 | switch (teql_resolve(skb, skb_res, slave)) { |
304 | case 0: | 304 | case 0: |
305 | if (spin_trylock(&slave->xmit_lock)) { | 305 | if (netif_tx_trylock(slave)) { |
306 | slave->xmit_lock_owner = smp_processor_id(); | ||
307 | if (!netif_queue_stopped(slave) && | 306 | if (!netif_queue_stopped(slave) && |
308 | slave->hard_start_xmit(skb, slave) == 0) { | 307 | slave->hard_start_xmit(skb, slave) == 0) { |
309 | slave->xmit_lock_owner = -1; | 308 | netif_tx_unlock(slave); |
310 | spin_unlock(&slave->xmit_lock); | ||
311 | master->slaves = NEXT_SLAVE(q); | 309 | master->slaves = NEXT_SLAVE(q); |
312 | netif_wake_queue(dev); | 310 | netif_wake_queue(dev); |
313 | master->stats.tx_packets++; | 311 | master->stats.tx_packets++; |
314 | master->stats.tx_bytes += len; | 312 | master->stats.tx_bytes += len; |
315 | return 0; | 313 | return 0; |
316 | } | 314 | } |
317 | slave->xmit_lock_owner = -1; | 315 | netif_tx_unlock(slave); |
318 | spin_unlock(&slave->xmit_lock); | ||
319 | } | 316 | } |
320 | if (netif_queue_stopped(dev)) | 317 | if (netif_queue_stopped(dev)) |
321 | busy = 1; | 318 | busy = 1; |
diff --git a/net/sctp/input.c b/net/sctp/input.c index 1662f9cc869e..42b66e74bbb5 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c | |||
@@ -141,7 +141,8 @@ int sctp_rcv(struct sk_buff *skb) | |||
141 | __skb_pull(skb, skb->h.raw - skb->data); | 141 | __skb_pull(skb, skb->h.raw - skb->data); |
142 | if (skb->len < sizeof(struct sctphdr)) | 142 | if (skb->len < sizeof(struct sctphdr)) |
143 | goto discard_it; | 143 | goto discard_it; |
144 | if (sctp_rcv_checksum(skb) < 0) | 144 | if ((skb->ip_summed != CHECKSUM_UNNECESSARY) && |
145 | (sctp_rcv_checksum(skb) < 0)) | ||
145 | goto discard_it; | 146 | goto discard_it; |
146 | 147 | ||
147 | skb_pull(skb, sizeof(struct sctphdr)); | 148 | skb_pull(skb, sizeof(struct sctphdr)); |
@@ -170,7 +171,8 @@ int sctp_rcv(struct sk_buff *skb) | |||
170 | * IP broadcast addresses cannot be used in an SCTP transport | 171 | * IP broadcast addresses cannot be used in an SCTP transport |
171 | * address." | 172 | * address." |
172 | */ | 173 | */ |
173 | if (!af->addr_valid(&src, NULL) || !af->addr_valid(&dest, NULL)) | 174 | if (!af->addr_valid(&src, NULL, skb) || |
175 | !af->addr_valid(&dest, NULL, skb)) | ||
174 | goto discard_it; | 176 | goto discard_it; |
175 | 177 | ||
176 | asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport); | 178 | asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport); |
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index c20d282fac06..8ef08070c8b6 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c | |||
@@ -523,7 +523,9 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) | |||
523 | * Return 0 - If the address is a non-unicast or an illegal address. | 523 | * Return 0 - If the address is a non-unicast or an illegal address. |
524 | * Return 1 - If the address is a unicast. | 524 | * Return 1 - If the address is a unicast. |
525 | */ | 525 | */ |
526 | static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) | 526 | static int sctp_v6_addr_valid(union sctp_addr *addr, |
527 | struct sctp_sock *sp, | ||
528 | const struct sk_buff *skb) | ||
527 | { | 529 | { |
528 | int ret = ipv6_addr_type(&addr->v6.sin6_addr); | 530 | int ret = ipv6_addr_type(&addr->v6.sin6_addr); |
529 | 531 | ||
@@ -537,7 +539,7 @@ static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) | |||
537 | if (sp && ipv6_only_sock(sctp_opt2sk(sp))) | 539 | if (sp && ipv6_only_sock(sctp_opt2sk(sp))) |
538 | return 0; | 540 | return 0; |
539 | sctp_v6_map_v4(addr); | 541 | sctp_v6_map_v4(addr); |
540 | return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp); | 542 | return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb); |
541 | } | 543 | } |
542 | 544 | ||
543 | /* Is this a non-unicast address */ | 545 | /* Is this a non-unicast address */ |
diff --git a/net/sctp/output.c b/net/sctp/output.c index 437cba7260a4..cdc5a3936766 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c | |||
@@ -295,14 +295,14 @@ int sctp_packet_transmit(struct sctp_packet *packet) | |||
295 | struct sctp_transport *tp = packet->transport; | 295 | struct sctp_transport *tp = packet->transport; |
296 | struct sctp_association *asoc = tp->asoc; | 296 | struct sctp_association *asoc = tp->asoc; |
297 | struct sctphdr *sh; | 297 | struct sctphdr *sh; |
298 | __u32 crc32; | 298 | __u32 crc32 = 0; |
299 | struct sk_buff *nskb; | 299 | struct sk_buff *nskb; |
300 | struct sctp_chunk *chunk, *tmp; | 300 | struct sctp_chunk *chunk, *tmp; |
301 | struct sock *sk; | 301 | struct sock *sk; |
302 | int err = 0; | 302 | int err = 0; |
303 | int padding; /* How much padding do we need? */ | 303 | int padding; /* How much padding do we need? */ |
304 | __u8 has_data = 0; | 304 | __u8 has_data = 0; |
305 | struct dst_entry *dst; | 305 | struct dst_entry *dst = tp->dst; |
306 | 306 | ||
307 | SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); | 307 | SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); |
308 | 308 | ||
@@ -327,6 +327,19 @@ int sctp_packet_transmit(struct sctp_packet *packet) | |||
327 | */ | 327 | */ |
328 | skb_set_owner_w(nskb, sk); | 328 | skb_set_owner_w(nskb, sk); |
329 | 329 | ||
330 | /* The 'obsolete' field of dst is set to 2 when a dst is freed. */ | ||
331 | if (!dst || (dst->obsolete > 1)) { | ||
332 | dst_release(dst); | ||
333 | sctp_transport_route(tp, NULL, sctp_sk(sk)); | ||
334 | if (asoc && (asoc->param_flags & SPP_PMTUD_ENABLE)) { | ||
335 | sctp_assoc_sync_pmtu(asoc); | ||
336 | } | ||
337 | } | ||
338 | nskb->dst = dst_clone(tp->dst); | ||
339 | if (!nskb->dst) | ||
340 | goto no_route; | ||
341 | dst = nskb->dst; | ||
342 | |||
330 | /* Build the SCTP header. */ | 343 | /* Build the SCTP header. */ |
331 | sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); | 344 | sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); |
332 | sh->source = htons(packet->source_port); | 345 | sh->source = htons(packet->source_port); |
@@ -350,7 +363,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) | |||
350 | * Note: Adler-32 is no longer applicable, as has been replaced | 363 | * Note: Adler-32 is no longer applicable, as has been replaced |
351 | * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>. | 364 | * by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>. |
352 | */ | 365 | */ |
353 | crc32 = sctp_start_cksum((__u8 *)sh, sizeof(struct sctphdr)); | 366 | if (!(dst->dev->features & NETIF_F_NO_CSUM)) |
367 | crc32 = sctp_start_cksum((__u8 *)sh, sizeof(struct sctphdr)); | ||
354 | 368 | ||
355 | /** | 369 | /** |
356 | * 6.10 Bundling | 370 | * 6.10 Bundling |
@@ -402,9 +416,14 @@ int sctp_packet_transmit(struct sctp_packet *packet) | |||
402 | if (padding) | 416 | if (padding) |
403 | memset(skb_put(chunk->skb, padding), 0, padding); | 417 | memset(skb_put(chunk->skb, padding), 0, padding); |
404 | 418 | ||
405 | crc32 = sctp_update_copy_cksum(skb_put(nskb, chunk->skb->len), | 419 | if (dst->dev->features & NETIF_F_NO_CSUM) |
406 | chunk->skb->data, | 420 | memcpy(skb_put(nskb, chunk->skb->len), |
407 | chunk->skb->len, crc32); | 421 | chunk->skb->data, chunk->skb->len); |
422 | else | ||
423 | crc32 = sctp_update_copy_cksum(skb_put(nskb, | ||
424 | chunk->skb->len), | ||
425 | chunk->skb->data, | ||
426 | chunk->skb->len, crc32); | ||
408 | 427 | ||
409 | SCTP_DEBUG_PRINTK("%s %p[%s] %s 0x%x, %s %d, %s %d, %s %d\n", | 428 | SCTP_DEBUG_PRINTK("%s %p[%s] %s 0x%x, %s %d, %s %d, %s %d\n", |
410 | "*** Chunk", chunk, | 429 | "*** Chunk", chunk, |
@@ -427,7 +446,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) | |||
427 | } | 446 | } |
428 | 447 | ||
429 | /* Perform final transformation on checksum. */ | 448 | /* Perform final transformation on checksum. */ |
430 | crc32 = sctp_end_cksum(crc32); | 449 | if (!(dst->dev->features & NETIF_F_NO_CSUM)) |
450 | crc32 = sctp_end_cksum(crc32); | ||
431 | 451 | ||
432 | /* 3) Put the resultant value into the checksum field in the | 452 | /* 3) Put the resultant value into the checksum field in the |
433 | * common header, and leave the rest of the bits unchanged. | 453 | * common header, and leave the rest of the bits unchanged. |
@@ -477,20 +497,6 @@ int sctp_packet_transmit(struct sctp_packet *packet) | |||
477 | } | 497 | } |
478 | } | 498 | } |
479 | 499 | ||
480 | dst = tp->dst; | ||
481 | /* The 'obsolete' field of dst is set to 2 when a dst is freed. */ | ||
482 | if (!dst || (dst->obsolete > 1)) { | ||
483 | dst_release(dst); | ||
484 | sctp_transport_route(tp, NULL, sctp_sk(sk)); | ||
485 | if (asoc->param_flags & SPP_PMTUD_ENABLE) { | ||
486 | sctp_assoc_sync_pmtu(asoc); | ||
487 | } | ||
488 | } | ||
489 | |||
490 | nskb->dst = dst_clone(tp->dst); | ||
491 | if (!nskb->dst) | ||
492 | goto no_route; | ||
493 | |||
494 | SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", | 500 | SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", |
495 | nskb->len); | 501 | nskb->len); |
496 | 502 | ||
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index f148f9576dd2..e5faa351aaad 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c | |||
@@ -1262,6 +1262,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, | |||
1262 | if (!tchunk->tsn_gap_acked && | 1262 | if (!tchunk->tsn_gap_acked && |
1263 | !tchunk->resent && | 1263 | !tchunk->resent && |
1264 | tchunk->rtt_in_progress) { | 1264 | tchunk->rtt_in_progress) { |
1265 | tchunk->rtt_in_progress = 0; | ||
1265 | rtt = jiffies - tchunk->sent_at; | 1266 | rtt = jiffies - tchunk->sent_at; |
1266 | sctp_transport_update_rto(transport, | 1267 | sctp_transport_update_rto(transport, |
1267 | rtt); | 1268 | rtt); |
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 2088aa992b7a..816c033d7886 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c | |||
@@ -365,12 +365,18 @@ static int sctp_v4_is_any(const union sctp_addr *addr) | |||
365 | * Return 0 - If the address is a non-unicast or an illegal address. | 365 | * Return 0 - If the address is a non-unicast or an illegal address. |
366 | * Return 1 - If the address is a unicast. | 366 | * Return 1 - If the address is a unicast. |
367 | */ | 367 | */ |
368 | static int sctp_v4_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) | 368 | static int sctp_v4_addr_valid(union sctp_addr *addr, |
369 | struct sctp_sock *sp, | ||
370 | const struct sk_buff *skb) | ||
369 | { | 371 | { |
370 | /* Is this a non-unicast address or a unusable SCTP address? */ | 372 | /* Is this a non-unicast address or a unusable SCTP address? */ |
371 | if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr)) | 373 | if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr)) |
372 | return 0; | 374 | return 0; |
373 | 375 | ||
376 | /* Is this a broadcast address? */ | ||
377 | if (skb && ((struct rtable *)skb->dst)->rt_flags & RTCF_BROADCAST) | ||
378 | return 0; | ||
379 | |||
374 | return 1; | 380 | return 1; |
375 | } | 381 | } |
376 | 382 | ||
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 8bc279219a72..9e58144f4851 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c | |||
@@ -5293,10 +5293,18 @@ static int sctp_eat_data(const struct sctp_association *asoc, | |||
5293 | * seems a bit troublesome in that frag_point varies based on | 5293 | * seems a bit troublesome in that frag_point varies based on |
5294 | * PMTU. In cases, such as loopback, this might be a rather | 5294 | * PMTU. In cases, such as loopback, this might be a rather |
5295 | * large spill over. | 5295 | * large spill over. |
5296 | * NOTE: If we have a full receive buffer here, we only renege if | ||
5297 | * our receiver can still make progress without the tsn being | ||
5298 | received. We do this because in the event that the association's | ||
5299 | * receive queue is empty we are filling a leading gap, and since | ||
5300 | * reneging moves the gap to the end of the tsn stream, we are likely | ||
5301 | * to stall again very shortly. Avoiding the renege when we fill a | ||
5302 | * leading gap is a good heuristic for avoiding such steady state | ||
5303 | * stalls. | ||
5296 | */ | 5304 | */ |
5297 | if (!asoc->rwnd || asoc->rwnd_over || | 5305 | if (!asoc->rwnd || asoc->rwnd_over || |
5298 | (datalen > asoc->rwnd + asoc->frag_point) || | 5306 | (datalen > asoc->rwnd + asoc->frag_point) || |
5299 | rcvbuf_over) { | 5307 | (rcvbuf_over && (!skb_queue_len(&sk->sk_receive_queue)))) { |
5300 | 5308 | ||
5301 | /* If this is the next TSN, consider reneging to make | 5309 | /* If this is the next TSN, consider reneging to make |
5302 | * room. Note: Playing nice with a confused sender. A | 5310 | * room. Note: Playing nice with a confused sender. A |
diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 174d4d35e951..b811691c35bf 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c | |||
@@ -172,7 +172,7 @@ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr, | |||
172 | return -EINVAL; | 172 | return -EINVAL; |
173 | 173 | ||
174 | /* Is this a valid SCTP address? */ | 174 | /* Is this a valid SCTP address? */ |
175 | if (!af->addr_valid(addr, sctp_sk(sk))) | 175 | if (!af->addr_valid(addr, sctp_sk(sk), NULL)) |
176 | return -EINVAL; | 176 | return -EINVAL; |
177 | 177 | ||
178 | if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr))) | 178 | if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr))) |
@@ -2530,8 +2530,32 @@ static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, int o | |||
2530 | 2530 | ||
2531 | /* Set the values to the specific association */ | 2531 | /* Set the values to the specific association */ |
2532 | if (asoc) { | 2532 | if (asoc) { |
2533 | if (assocparams.sasoc_asocmaxrxt != 0) | 2533 | if (assocparams.sasoc_asocmaxrxt != 0) { |
2534 | __u32 path_sum = 0; | ||
2535 | int paths = 0; | ||
2536 | struct list_head *pos; | ||
2537 | struct sctp_transport *peer_addr; | ||
2538 | |||
2539 | list_for_each(pos, &asoc->peer.transport_addr_list) { | ||
2540 | peer_addr = list_entry(pos, | ||
2541 | struct sctp_transport, | ||
2542 | transports); | ||
2543 | path_sum += peer_addr->pathmaxrxt; | ||
2544 | paths++; | ||
2545 | } | ||
2546 | |||
2547 | /* Only validate asocmaxrxt if we have more than | ||
2548 | * one path/transport. We do this because path | ||
2549 | * retransmissions are only counted when we have more | ||
2550 | * than one path. | ||
2551 | */ | ||
2552 | if (paths > 1 && | ||
2553 | assocparams.sasoc_asocmaxrxt > path_sum) | ||
2554 | return -EINVAL; | ||
2555 | |||
2534 | asoc->max_retrans = assocparams.sasoc_asocmaxrxt; | 2556 | asoc->max_retrans = assocparams.sasoc_asocmaxrxt; |
2557 | } | ||
2558 | |||
2535 | if (assocparams.sasoc_cookie_life != 0) { | 2559 | if (assocparams.sasoc_cookie_life != 0) { |
2536 | asoc->cookie_life.tv_sec = | 2560 | asoc->cookie_life.tv_sec = |
2537 | assocparams.sasoc_cookie_life / 1000; | 2561 | assocparams.sasoc_cookie_life / 1000; |
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index ba97f974f57c..ee236784a6bb 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c | |||
@@ -51,6 +51,8 @@ | |||
51 | static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, | 51 | static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, |
52 | struct sctp_association *asoc); | 52 | struct sctp_association *asoc); |
53 | static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); | 53 | static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); |
54 | static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event); | ||
55 | |||
54 | 56 | ||
55 | /* Initialize an ULP event from an given skb. */ | 57 | /* Initialize an ULP event from an given skb. */ |
56 | SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) | 58 | SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) |
@@ -883,6 +885,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, | |||
883 | static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) | 885 | static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) |
884 | { | 886 | { |
885 | struct sk_buff *skb, *frag; | 887 | struct sk_buff *skb, *frag; |
888 | unsigned int len; | ||
886 | 889 | ||
887 | /* Current stack structures assume that the rcv buffer is | 890 | /* Current stack structures assume that the rcv buffer is |
888 | * per socket. For UDP style sockets this is not true as | 891 | * per socket. For UDP style sockets this is not true as |
@@ -892,7 +895,30 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) | |||
892 | */ | 895 | */ |
893 | 896 | ||
894 | skb = sctp_event2skb(event); | 897 | skb = sctp_event2skb(event); |
895 | sctp_assoc_rwnd_increase(event->asoc, skb_headlen(skb)); | 898 | len = skb->len; |
899 | |||
900 | if (!skb->data_len) | ||
901 | goto done; | ||
902 | |||
903 | /* Don't forget the fragments. */ | ||
904 | for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { | ||
905 | /* NOTE: skb_shinfos are recursive. Although IP returns | ||
906 | * skb's with only 1 level of fragments, SCTP reassembly can | ||
907 | * increase the levels. | ||
908 | */ | ||
909 | sctp_ulpevent_release_frag_data(sctp_skb2event(frag)); | ||
910 | } | ||
911 | |||
912 | done: | ||
913 | sctp_assoc_rwnd_increase(event->asoc, len); | ||
914 | sctp_ulpevent_release_owner(event); | ||
915 | } | ||
916 | |||
917 | static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) | ||
918 | { | ||
919 | struct sk_buff *skb, *frag; | ||
920 | |||
921 | skb = sctp_event2skb(event); | ||
896 | 922 | ||
897 | if (!skb->data_len) | 923 | if (!skb->data_len) |
898 | goto done; | 924 | goto done; |
@@ -903,7 +929,7 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) | |||
903 | * skb's with only 1 level of fragments, SCTP reassembly can | 929 | * skb's with only 1 level of fragments, SCTP reassembly can |
904 | * increase the levels. | 930 | * increase the levels. |
905 | */ | 931 | */ |
906 | sctp_ulpevent_release_data(sctp_skb2event(frag)); | 932 | sctp_ulpevent_release_frag_data(sctp_skb2event(frag)); |
907 | } | 933 | } |
908 | 934 | ||
909 | done: | 935 | done: |
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b469c8b54613..b8936926c24b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c | |||
@@ -46,45 +46,43 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock); | |||
46 | 46 | ||
47 | static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); | 47 | static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); |
48 | static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); | 48 | static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); |
49 | static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family); | ||
50 | static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo); | ||
49 | 51 | ||
50 | int xfrm_register_type(struct xfrm_type *type, unsigned short family) | 52 | int xfrm_register_type(struct xfrm_type *type, unsigned short family) |
51 | { | 53 | { |
52 | struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); | 54 | struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family); |
53 | struct xfrm_type_map *typemap; | 55 | struct xfrm_type **typemap; |
54 | int err = 0; | 56 | int err = 0; |
55 | 57 | ||
56 | if (unlikely(afinfo == NULL)) | 58 | if (unlikely(afinfo == NULL)) |
57 | return -EAFNOSUPPORT; | 59 | return -EAFNOSUPPORT; |
58 | typemap = afinfo->type_map; | 60 | typemap = afinfo->type_map; |
59 | 61 | ||
60 | write_lock_bh(&typemap->lock); | 62 | if (likely(typemap[type->proto] == NULL)) |
61 | if (likely(typemap->map[type->proto] == NULL)) | 63 | typemap[type->proto] = type; |
62 | typemap->map[type->proto] = type; | ||
63 | else | 64 | else |
64 | err = -EEXIST; | 65 | err = -EEXIST; |
65 | write_unlock_bh(&typemap->lock); | 66 | xfrm_policy_unlock_afinfo(afinfo); |
66 | xfrm_policy_put_afinfo(afinfo); | ||
67 | return err; | 67 | return err; |
68 | } | 68 | } |
69 | EXPORT_SYMBOL(xfrm_register_type); | 69 | EXPORT_SYMBOL(xfrm_register_type); |
70 | 70 | ||
71 | int xfrm_unregister_type(struct xfrm_type *type, unsigned short family) | 71 | int xfrm_unregister_type(struct xfrm_type *type, unsigned short family) |
72 | { | 72 | { |
73 | struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); | 73 | struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family); |
74 | struct xfrm_type_map *typemap; | 74 | struct xfrm_type **typemap; |
75 | int err = 0; | 75 | int err = 0; |
76 | 76 | ||
77 | if (unlikely(afinfo == NULL)) | 77 | if (unlikely(afinfo == NULL)) |
78 | return -EAFNOSUPPORT; | 78 | return -EAFNOSUPPORT; |
79 | typemap = afinfo->type_map; | 79 | typemap = afinfo->type_map; |
80 | 80 | ||
81 | write_lock_bh(&typemap->lock); | 81 | if (unlikely(typemap[type->proto] != type)) |
82 | if (unlikely(typemap->map[type->proto] != type)) | ||
83 | err = -ENOENT; | 82 | err = -ENOENT; |
84 | else | 83 | else |
85 | typemap->map[type->proto] = NULL; | 84 | typemap[type->proto] = NULL; |
86 | write_unlock_bh(&typemap->lock); | 85 | xfrm_policy_unlock_afinfo(afinfo); |
87 | xfrm_policy_put_afinfo(afinfo); | ||
88 | return err; | 86 | return err; |
89 | } | 87 | } |
90 | EXPORT_SYMBOL(xfrm_unregister_type); | 88 | EXPORT_SYMBOL(xfrm_unregister_type); |
@@ -92,7 +90,7 @@ EXPORT_SYMBOL(xfrm_unregister_type); | |||
92 | struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) | 90 | struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) |
93 | { | 91 | { |
94 | struct xfrm_policy_afinfo *afinfo; | 92 | struct xfrm_policy_afinfo *afinfo; |
95 | struct xfrm_type_map *typemap; | 93 | struct xfrm_type **typemap; |
96 | struct xfrm_type *type; | 94 | struct xfrm_type *type; |
97 | int modload_attempted = 0; | 95 | int modload_attempted = 0; |
98 | 96 | ||
@@ -102,11 +100,9 @@ retry: | |||
102 | return NULL; | 100 | return NULL; |
103 | typemap = afinfo->type_map; | 101 | typemap = afinfo->type_map; |
104 | 102 | ||
105 | read_lock(&typemap->lock); | 103 | type = typemap[proto]; |
106 | type = typemap->map[proto]; | ||
107 | if (unlikely(type && !try_module_get(type->owner))) | 104 | if (unlikely(type && !try_module_get(type->owner))) |
108 | type = NULL; | 105 | type = NULL; |
109 | read_unlock(&typemap->lock); | ||
110 | if (!type && !modload_attempted) { | 106 | if (!type && !modload_attempted) { |
111 | xfrm_policy_put_afinfo(afinfo); | 107 | xfrm_policy_put_afinfo(afinfo); |
112 | request_module("xfrm-type-%d-%d", | 108 | request_module("xfrm-type-%d-%d", |
@@ -142,6 +138,89 @@ void xfrm_put_type(struct xfrm_type *type) | |||
142 | module_put(type->owner); | 138 | module_put(type->owner); |
143 | } | 139 | } |
144 | 140 | ||
141 | int xfrm_register_mode(struct xfrm_mode *mode, int family) | ||
142 | { | ||
143 | struct xfrm_policy_afinfo *afinfo; | ||
144 | struct xfrm_mode **modemap; | ||
145 | int err; | ||
146 | |||
147 | if (unlikely(mode->encap >= XFRM_MODE_MAX)) | ||
148 | return -EINVAL; | ||
149 | |||
150 | afinfo = xfrm_policy_lock_afinfo(family); | ||
151 | if (unlikely(afinfo == NULL)) | ||
152 | return -EAFNOSUPPORT; | ||
153 | |||
154 | err = -EEXIST; | ||
155 | modemap = afinfo->mode_map; | ||
156 | if (likely(modemap[mode->encap] == NULL)) { | ||
157 | modemap[mode->encap] = mode; | ||
158 | err = 0; | ||
159 | } | ||
160 | |||
161 | xfrm_policy_unlock_afinfo(afinfo); | ||
162 | return err; | ||
163 | } | ||
164 | EXPORT_SYMBOL(xfrm_register_mode); | ||
165 | |||
166 | int xfrm_unregister_mode(struct xfrm_mode *mode, int family) | ||
167 | { | ||
168 | struct xfrm_policy_afinfo *afinfo; | ||
169 | struct xfrm_mode **modemap; | ||
170 | int err; | ||
171 | |||
172 | if (unlikely(mode->encap >= XFRM_MODE_MAX)) | ||
173 | return -EINVAL; | ||
174 | |||
175 | afinfo = xfrm_policy_lock_afinfo(family); | ||
176 | if (unlikely(afinfo == NULL)) | ||
177 | return -EAFNOSUPPORT; | ||
178 | |||
179 | err = -ENOENT; | ||
180 | modemap = afinfo->mode_map; | ||
181 | if (likely(modemap[mode->encap] == mode)) { | ||
182 | modemap[mode->encap] = NULL; | ||
183 | err = 0; | ||
184 | } | ||
185 | |||
186 | xfrm_policy_unlock_afinfo(afinfo); | ||
187 | return err; | ||
188 | } | ||
189 | EXPORT_SYMBOL(xfrm_unregister_mode); | ||
190 | |||
191 | struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) | ||
192 | { | ||
193 | struct xfrm_policy_afinfo *afinfo; | ||
194 | struct xfrm_mode *mode; | ||
195 | int modload_attempted = 0; | ||
196 | |||
197 | if (unlikely(encap >= XFRM_MODE_MAX)) | ||
198 | return NULL; | ||
199 | |||
200 | retry: | ||
201 | afinfo = xfrm_policy_get_afinfo(family); | ||
202 | if (unlikely(afinfo == NULL)) | ||
203 | return NULL; | ||
204 | |||
205 | mode = afinfo->mode_map[encap]; | ||
206 | if (unlikely(mode && !try_module_get(mode->owner))) | ||
207 | mode = NULL; | ||
208 | if (!mode && !modload_attempted) { | ||
209 | xfrm_policy_put_afinfo(afinfo); | ||
210 | request_module("xfrm-mode-%d-%d", family, encap); | ||
211 | modload_attempted = 1; | ||
212 | goto retry; | ||
213 | } | ||
214 | |||
215 | xfrm_policy_put_afinfo(afinfo); | ||
216 | return mode; | ||
217 | } | ||
218 | |||
219 | void xfrm_put_mode(struct xfrm_mode *mode) | ||
220 | { | ||
221 | module_put(mode->owner); | ||
222 | } | ||
223 | |||
145 | static inline unsigned long make_jiffies(long secs) | 224 | static inline unsigned long make_jiffies(long secs) |
146 | { | 225 | { |
147 | if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) | 226 | if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) |
@@ -1306,17 +1385,31 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) | |||
1306 | return NULL; | 1385 | return NULL; |
1307 | read_lock(&xfrm_policy_afinfo_lock); | 1386 | read_lock(&xfrm_policy_afinfo_lock); |
1308 | afinfo = xfrm_policy_afinfo[family]; | 1387 | afinfo = xfrm_policy_afinfo[family]; |
1309 | if (likely(afinfo != NULL)) | 1388 | if (unlikely(!afinfo)) |
1310 | read_lock(&afinfo->lock); | 1389 | read_unlock(&xfrm_policy_afinfo_lock); |
1311 | read_unlock(&xfrm_policy_afinfo_lock); | ||
1312 | return afinfo; | 1390 | return afinfo; |
1313 | } | 1391 | } |
1314 | 1392 | ||
1315 | static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) | 1393 | static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) |
1316 | { | 1394 | { |
1317 | if (unlikely(afinfo == NULL)) | 1395 | read_unlock(&xfrm_policy_afinfo_lock); |
1318 | return; | 1396 | } |
1319 | read_unlock(&afinfo->lock); | 1397 | |
1398 | static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family) | ||
1399 | { | ||
1400 | struct xfrm_policy_afinfo *afinfo; | ||
1401 | if (unlikely(family >= NPROTO)) | ||
1402 | return NULL; | ||
1403 | write_lock_bh(&xfrm_policy_afinfo_lock); | ||
1404 | afinfo = xfrm_policy_afinfo[family]; | ||
1405 | if (unlikely(!afinfo)) | ||
1406 | write_unlock_bh(&xfrm_policy_afinfo_lock); | ||
1407 | return afinfo; | ||
1408 | } | ||
1409 | |||
1410 | static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo) | ||
1411 | { | ||
1412 | write_unlock_bh(&xfrm_policy_afinfo_lock); | ||
1320 | } | 1413 | } |
1321 | 1414 | ||
1322 | static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) | 1415 | static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) |
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 93a2f36ad3db..17b29ec3c417 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c | |||
@@ -77,6 +77,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) | |||
77 | kfree(x->ealg); | 77 | kfree(x->ealg); |
78 | kfree(x->calg); | 78 | kfree(x->calg); |
79 | kfree(x->encap); | 79 | kfree(x->encap); |
80 | if (x->mode) | ||
81 | xfrm_put_mode(x->mode); | ||
80 | if (x->type) { | 82 | if (x->type) { |
81 | x->type->destructor(x); | 83 | x->type->destructor(x); |
82 | xfrm_put_type(x->type); | 84 | xfrm_put_type(x->type); |
@@ -1103,17 +1105,14 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family) | |||
1103 | return NULL; | 1105 | return NULL; |
1104 | read_lock(&xfrm_state_afinfo_lock); | 1106 | read_lock(&xfrm_state_afinfo_lock); |
1105 | afinfo = xfrm_state_afinfo[family]; | 1107 | afinfo = xfrm_state_afinfo[family]; |
1106 | if (likely(afinfo != NULL)) | 1108 | if (unlikely(!afinfo)) |
1107 | read_lock(&afinfo->lock); | 1109 | read_unlock(&xfrm_state_afinfo_lock); |
1108 | read_unlock(&xfrm_state_afinfo_lock); | ||
1109 | return afinfo; | 1110 | return afinfo; |
1110 | } | 1111 | } |
1111 | 1112 | ||
1112 | static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) | 1113 | static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) |
1113 | { | 1114 | { |
1114 | if (unlikely(afinfo == NULL)) | 1115 | read_unlock(&xfrm_state_afinfo_lock); |
1115 | return; | ||
1116 | read_unlock(&afinfo->lock); | ||
1117 | } | 1116 | } |
1118 | 1117 | ||
1119 | /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ | 1118 | /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ |
@@ -1196,6 +1195,10 @@ int xfrm_init_state(struct xfrm_state *x) | |||
1196 | if (err) | 1195 | if (err) |
1197 | goto error; | 1196 | goto error; |
1198 | 1197 | ||
1198 | x->mode = xfrm_get_mode(x->props.mode, family); | ||
1199 | if (x->mode == NULL) | ||
1200 | goto error; | ||
1201 | |||
1199 | x->km.state = XFRM_STATE_VALID; | 1202 | x->km.state = XFRM_STATE_VALID; |
1200 | 1203 | ||
1201 | error: | 1204 | error: |
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 81d1005830f4..c21dc26141ea 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c | |||
@@ -427,23 +427,25 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) | |||
427 | if (x == NULL) | 427 | if (x == NULL) |
428 | return -ESRCH; | 428 | return -ESRCH; |
429 | 429 | ||
430 | if ((err = security_xfrm_state_delete(x)) != 0) | ||
431 | goto out; | ||
432 | |||
430 | if (xfrm_state_kern(x)) { | 433 | if (xfrm_state_kern(x)) { |
431 | xfrm_state_put(x); | 434 | err = -EPERM; |
432 | return -EPERM; | 435 | goto out; |
433 | } | 436 | } |
434 | 437 | ||
435 | err = xfrm_state_delete(x); | 438 | err = xfrm_state_delete(x); |
436 | if (err < 0) { | 439 | if (err < 0) |
437 | xfrm_state_put(x); | 440 | goto out; |
438 | return err; | ||
439 | } | ||
440 | 441 | ||
441 | c.seq = nlh->nlmsg_seq; | 442 | c.seq = nlh->nlmsg_seq; |
442 | c.pid = nlh->nlmsg_pid; | 443 | c.pid = nlh->nlmsg_pid; |
443 | c.event = nlh->nlmsg_type; | 444 | c.event = nlh->nlmsg_type; |
444 | km_state_notify(x, &c); | 445 | km_state_notify(x, &c); |
445 | xfrm_state_put(x); | ||
446 | 446 | ||
447 | out: | ||
448 | xfrm_state_put(x); | ||
447 | return err; | 449 | return err; |
448 | } | 450 | } |
449 | 451 | ||
@@ -1055,6 +1057,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr | |||
1055 | MSG_DONTWAIT); | 1057 | MSG_DONTWAIT); |
1056 | } | 1058 | } |
1057 | } else { | 1059 | } else { |
1060 | if ((err = security_xfrm_policy_delete(xp)) != 0) | ||
1061 | goto out; | ||
1058 | c.data.byid = p->index; | 1062 | c.data.byid = p->index; |
1059 | c.event = nlh->nlmsg_type; | 1063 | c.event = nlh->nlmsg_type; |
1060 | c.seq = nlh->nlmsg_seq; | 1064 | c.seq = nlh->nlmsg_seq; |
@@ -1064,6 +1068,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr | |||
1064 | 1068 | ||
1065 | xfrm_pol_put(xp); | 1069 | xfrm_pol_put(xp); |
1066 | 1070 | ||
1071 | out: | ||
1067 | return err; | 1072 | return err; |
1068 | } | 1073 | } |
1069 | 1074 | ||