aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorMax Krasnyansky <maxk@qualcomm.com>2008-07-15 01:18:19 -0400
committerDavid S. Miller <davem@davemloft.net>2008-07-15 01:18:19 -0400
commitf271b2cc78f09c93ccd00a2056d3237134bf994c (patch)
tree25b379fa23a84627992c197d2e18793fd90c2a7e /drivers
parent89146504cbfeb120dd08ec7f9f8314c4986189b8 (diff)
tun: Fix/rewrite packet filtering logic
Please see the following thread to get some context on this: http://marc.info/?l=linux-netdev&m=121564433018903&w=2 Basically the issue is that the current multi-cast filtering stuff in the TUN/TAP driver is seriously broken. The original patch went in without proper review and ACK. It was broken and confusing to start with and subsequent patches broke it completely. To give you an idea of what's broken, here are some of the issues: - Very confusing comments throughout the code that imply that the character device is a network interface in its own right, and that packets are passed between the two nics. Which is completely wrong. - Wrong set of ioctls is used for setting up filters. They look like shortcuts for manipulating state of the tun/tap network interface but in reality manipulate the state of the TX filter. - ioctls that were originally used for setting the address of the TX filter got "fixed" and now set the address of the network interface itself. Which made the filter totally useless. - Filtering is done too late. Instead of filtering early on, to avoid unnecessary wakeups, filtering is done in the read() call. The list goes on and on :) So the patch cleans all that up. It introduces a simple and clean interface for setting up TX filters (TUNSETTXFILTER + tun_filter spec) and does filtering before enqueuing the packets. TX filtering is useful in the scenarios where TAP is part of a bridge, in which case it gets all broadcast, multicast and potentially other packets when the bridge is learning. So, for example, an Ethernet tunnelling app may want to set up TX filters to avoid tunnelling multicast traffic. QEMU and other hypervisors can push RX filtering that is currently done in the guest into the host context, therefore saving wakeups and unnecessary data transfer. Signed-off-by: Max Krasnyansky <maxk@qualcomm.com> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/net/tun.c316
1 files changed, 153 insertions, 163 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 2693f883ecd..901551c8ca0 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -18,15 +18,11 @@
18/* 18/*
19 * Changes: 19 * Changes:
20 * 20 *
21 * Brian Braunstein <linuxkernel@bristyle.com> 2007/03/23
22 * Fixed hw address handling. Now net_device.dev_addr is kept consistent
23 * with tun.dev_addr when the address is set by this module.
24 *
25 * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14 21 * Mike Kershaw <dragorn@kismetwireless.net> 2005/08/14
26 * Add TUNSETLINK ioctl to set the link encapsulation 22 * Add TUNSETLINK ioctl to set the link encapsulation
27 * 23 *
28 * Mark Smith <markzzzsmith@yahoo.com.au> 24 * Mark Smith <markzzzsmith@yahoo.com.au>
29 * Use random_ether_addr() for tap MAC address. 25 * Use random_ether_addr() for tap MAC address.
30 * 26 *
31 * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20 27 * Harald Roelle <harald.roelle@ifi.lmu.de> 2004/04/20
32 * Fixes in packet dropping, queue length setting and queue wakeup. 28 * Fixes in packet dropping, queue length setting and queue wakeup.
@@ -83,9 +79,16 @@ static int debug;
83#define DBG1( a... ) 79#define DBG1( a... )
84#endif 80#endif
85 81
82#define FLT_EXACT_COUNT 8
83struct tap_filter {
84 unsigned int count; /* Number of addrs. Zero means disabled */
85 u32 mask[2]; /* Mask of the hashed addrs */
86 unsigned char addr[FLT_EXACT_COUNT][ETH_ALEN];
87};
88
86struct tun_struct { 89struct tun_struct {
87 struct list_head list; 90 struct list_head list;
88 unsigned long flags; 91 unsigned int flags;
89 int attached; 92 int attached;
90 uid_t owner; 93 uid_t owner;
91 gid_t group; 94 gid_t group;
@@ -94,19 +97,119 @@ struct tun_struct {
94 struct sk_buff_head readq; 97 struct sk_buff_head readq;
95 98
96 struct net_device *dev; 99 struct net_device *dev;
100 struct fasync_struct *fasync;
97 101
98 struct fasync_struct *fasync; 102 struct tap_filter txflt;
99
100 unsigned long if_flags;
101 u8 dev_addr[ETH_ALEN];
102 u32 chr_filter[2];
103 u32 net_filter[2];
104 103
105#ifdef TUN_DEBUG 104#ifdef TUN_DEBUG
106 int debug; 105 int debug;
107#endif 106#endif
108}; 107};
109 108
109/* TAP filtering */
110static void addr_hash_set(u32 *mask, const u8 *addr)
111{
112 int n = ether_crc(ETH_ALEN, addr) >> 26;
113 mask[n >> 5] |= (1 << (n & 31));
114}
115
116static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
117{
118 int n = ether_crc(ETH_ALEN, addr) >> 26;
119 return mask[n >> 5] & (1 << (n & 31));
120}
121
122static int update_filter(struct tap_filter *filter, void __user *arg)
123{
124 struct { u8 u[ETH_ALEN]; } *addr;
125 struct tun_filter uf;
126 int err, alen, n, nexact;
127
128 if (copy_from_user(&uf, arg, sizeof(uf)))
129 return -EFAULT;
130
131 if (!uf.count) {
132 /* Disabled */
133 filter->count = 0;
134 return 0;
135 }
136
137 alen = ETH_ALEN * uf.count;
138 addr = kmalloc(alen, GFP_KERNEL);
139 if (!addr)
140 return -ENOMEM;
141
142 if (copy_from_user(addr, arg + sizeof(uf), alen)) {
143 err = -EFAULT;
144 goto done;
145 }
146
147 /* The filter is updated without holding any locks. Which is
148 * perfectly safe. We disable it first and in the worst
149 * case we'll accept a few undesired packets. */
150 filter->count = 0;
151 wmb();
152
153 /* Use first set of addresses as an exact filter */
154 for (n = 0; n < uf.count && n < FLT_EXACT_COUNT; n++)
155 memcpy(filter->addr[n], addr[n].u, ETH_ALEN);
156
157 nexact = n;
158
159 /* The rest is hashed */
160 memset(filter->mask, 0, sizeof(filter->mask));
161 for (; n < uf.count; n++)
162 addr_hash_set(filter->mask, addr[n].u);
163
164 /* For ALLMULTI just set the mask to all ones.
165 * This overrides the mask populated above. */
166 if ((uf.flags & TUN_FLT_ALLMULTI))
167 memset(filter->mask, ~0, sizeof(filter->mask));
168
169 /* Now enable the filter */
170 wmb();
171 filter->count = nexact;
172
173 /* Return the number of exact filters */
174 err = nexact;
175
176done:
177 kfree(addr);
178 return err;
179}
180
181/* Returns: 0 - drop, !=0 - accept */
182static int run_filter(struct tap_filter *filter, const struct sk_buff *skb)
183{
184 /* Cannot use eth_hdr(skb) here because skb_mac_hdr() is incorrect
185 * at this point. */
186 struct ethhdr *eh = (struct ethhdr *) skb->data;
187 int i;
188
189 /* Exact match */
190 for (i = 0; i < filter->count; i++)
191 if (!compare_ether_addr(eh->h_dest, filter->addr[i]))
192 return 1;
193
194 /* Inexact match (multicast only) */
195 if (is_multicast_ether_addr(eh->h_dest))
196 return addr_hash_test(filter->mask, eh->h_dest);
197
198 return 0;
199}
200
201/*
202 * Checks whether the packet is accepted or not.
203 * Returns: 0 - drop, !=0 - accept
204 */
205static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
206{
207 if (!filter->count)
208 return 1;
209
210 return run_filter(filter, skb);
211}
212
110/* Network device part of the driver */ 213/* Network device part of the driver */
111 214
112static unsigned int tun_net_id; 215static unsigned int tun_net_id;
@@ -141,7 +244,12 @@ static int tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
141 if (!tun->attached) 244 if (!tun->attached)
142 goto drop; 245 goto drop;
143 246
144 /* Packet dropping */ 247 /* Drop if the filter does not like it.
248 * This is a noop if the filter is disabled.
249 * Filter can be enabled only for the TAP devices. */
250 if (!check_filter(&tun->txflt, skb))
251 goto drop;
252
145 if (skb_queue_len(&tun->readq) >= dev->tx_queue_len) { 253 if (skb_queue_len(&tun->readq) >= dev->tx_queue_len) {
146 if (!(tun->flags & TUN_ONE_QUEUE)) { 254 if (!(tun->flags & TUN_ONE_QUEUE)) {
147 /* Normal queueing mode. */ 255 /* Normal queueing mode. */
@@ -158,7 +266,7 @@ static int tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
158 } 266 }
159 } 267 }
160 268
161 /* Queue packet */ 269 /* Enqueue packet */
162 skb_queue_tail(&tun->readq, skb); 270 skb_queue_tail(&tun->readq, skb);
163 dev->trans_start = jiffies; 271 dev->trans_start = jiffies;
164 272
@@ -174,41 +282,14 @@ drop:
174 return 0; 282 return 0;
175} 283}
176 284
177/** Add the specified Ethernet address to this multicast filter. */ 285static void tun_net_mclist(struct net_device *dev)
178static void
179add_multi(u32* filter, const u8* addr)
180{
181 int bit_nr = ether_crc(ETH_ALEN, addr) >> 26;
182 filter[bit_nr >> 5] |= 1 << (bit_nr & 31);
183}
184
185/** Remove the specified Ethernet addres from this multicast filter. */
186static void
187del_multi(u32* filter, const u8* addr)
188{ 286{
189 int bit_nr = ether_crc(ETH_ALEN, addr) >> 26; 287 /*
190 filter[bit_nr >> 5] &= ~(1 << (bit_nr & 31)); 288 * This callback is supposed to deal with mc filter in
191} 289 * _rx_ path and has nothing to do with the _tx_ path.
192 290 * In rx path we always accept everything userspace gives us.
193/** Update the list of multicast groups to which the network device belongs. 291 */
194 * This list is used to filter packets being sent from the character device to 292 return;
195 * the network device. */
196static void
197tun_net_mclist(struct net_device *dev)
198{
199 struct tun_struct *tun = netdev_priv(dev);
200 const struct dev_mc_list *mclist;
201 int i;
202 DECLARE_MAC_BUF(mac);
203 DBG(KERN_DEBUG "%s: tun_net_mclist: mc_count %d\n",
204 dev->name, dev->mc_count);
205 memset(tun->chr_filter, 0, sizeof tun->chr_filter);
206 for (i = 0, mclist = dev->mc_list; i < dev->mc_count && mclist != NULL;
207 i++, mclist = mclist->next) {
208 add_multi(tun->net_filter, mclist->dmi_addr);
209 DBG(KERN_DEBUG "%s: tun_net_mclist: %s\n",
210 dev->name, print_mac(mac, mclist->dmi_addr));
211 }
212} 293}
213 294
214#define MIN_MTU 68 295#define MIN_MTU 68
@@ -244,13 +325,11 @@ static void tun_net_init(struct net_device *dev)
244 325
245 case TUN_TAP_DEV: 326 case TUN_TAP_DEV:
246 /* Ethernet TAP Device */ 327 /* Ethernet TAP Device */
247 dev->set_multicast_list = tun_net_mclist;
248
249 ether_setup(dev); 328 ether_setup(dev);
250 dev->change_mtu = tun_net_change_mtu; 329 dev->change_mtu = tun_net_change_mtu;
330 dev->set_multicast_list = tun_net_mclist;
251 331
252 /* random address already created for us by tun_set_iff, use it */ 332 random_ether_addr(dev->dev_addr);
253 memcpy(dev->dev_addr, tun->dev_addr, min(sizeof(tun->dev_addr), sizeof(dev->dev_addr)) );
254 333
255 dev->tx_queue_len = TUN_READQ_SIZE; /* We prefer our own queue length */ 334 dev->tx_queue_len = TUN_READQ_SIZE; /* We prefer our own queue length */
256 break; 335 break;
@@ -486,7 +565,6 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
486 DECLARE_WAITQUEUE(wait, current); 565 DECLARE_WAITQUEUE(wait, current);
487 struct sk_buff *skb; 566 struct sk_buff *skb;
488 ssize_t len, ret = 0; 567 ssize_t len, ret = 0;
489 DECLARE_MAC_BUF(mac);
490 568
491 if (!tun) 569 if (!tun)
492 return -EBADFD; 570 return -EBADFD;
@@ -499,10 +577,6 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
499 577
500 add_wait_queue(&tun->read_wait, &wait); 578 add_wait_queue(&tun->read_wait, &wait);
501 while (len) { 579 while (len) {
502 const u8 ones[ ETH_ALEN] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
503 u8 addr[ ETH_ALEN];
504 int bit_nr;
505
506 current->state = TASK_INTERRUPTIBLE; 580 current->state = TASK_INTERRUPTIBLE;
507 581
508 /* Read frames from the queue */ 582 /* Read frames from the queue */
@@ -522,36 +596,9 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
522 } 596 }
523 netif_wake_queue(tun->dev); 597 netif_wake_queue(tun->dev);
524 598
525 /** Decide whether to accept this packet. This code is designed to 599 ret = tun_put_user(tun, skb, (struct iovec *) iv, len);
526 * behave identically to an Ethernet interface. Accept the packet if 600 kfree_skb(skb);
527 * - we are promiscuous. 601 break;
528 * - the packet is addressed to us.
529 * - the packet is broadcast.
530 * - the packet is multicast and
531 * - we are multicast promiscous.
532 * - we belong to the multicast group.
533 */
534 skb_copy_from_linear_data(skb, addr, min_t(size_t, sizeof addr,
535 skb->len));
536 bit_nr = ether_crc(sizeof addr, addr) >> 26;
537 if ((tun->if_flags & IFF_PROMISC) ||
538 memcmp(addr, tun->dev_addr, sizeof addr) == 0 ||
539 memcmp(addr, ones, sizeof addr) == 0 ||
540 (((addr[0] == 1 && addr[1] == 0 && addr[2] == 0x5e) ||
541 (addr[0] == 0x33 && addr[1] == 0x33)) &&
542 ((tun->if_flags & IFF_ALLMULTI) ||
543 (tun->chr_filter[bit_nr >> 5] & (1 << (bit_nr & 31)))))) {
544 DBG(KERN_DEBUG "%s: tun_chr_readv: accepted: %s\n",
545 tun->dev->name, print_mac(mac, addr));
546 ret = tun_put_user(tun, skb, (struct iovec *) iv, len);
547 kfree_skb(skb);
548 break;
549 } else {
550 DBG(KERN_DEBUG "%s: tun_chr_readv: rejected: %s\n",
551 tun->dev->name, print_mac(mac, addr));
552 kfree_skb(skb);
553 continue;
554 }
555 } 602 }
556 603
557 current->state = TASK_RUNNING; 604 current->state = TASK_RUNNING;
@@ -647,12 +694,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
647 tun = netdev_priv(dev); 694 tun = netdev_priv(dev);
648 tun->dev = dev; 695 tun->dev = dev;
649 tun->flags = flags; 696 tun->flags = flags;
650 /* Be promiscuous by default to maintain previous behaviour. */ 697 tun->txflt.count = 0;
651 tun->if_flags = IFF_PROMISC;
652 /* Generate random Ethernet address. */
653 *(__be16 *)tun->dev_addr = htons(0x00FF);
654 get_random_bytes(tun->dev_addr + sizeof(u16), 4);
655 memset(tun->chr_filter, 0, sizeof tun->chr_filter);
656 698
657 tun_net_init(dev); 699 tun_net_init(dev);
658 700
@@ -751,6 +793,7 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file,
751 struct tun_struct *tun = file->private_data; 793 struct tun_struct *tun = file->private_data;
752 void __user* argp = (void __user*)arg; 794 void __user* argp = (void __user*)arg;
753 struct ifreq ifr; 795 struct ifreq ifr;
796 int ret;
754 DECLARE_MAC_BUF(mac); 797 DECLARE_MAC_BUF(mac);
755 798
756 if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) 799 if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89)
@@ -826,9 +869,6 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file,
826 break; 869 break;
827 870
828 case TUNSETLINK: 871 case TUNSETLINK:
829 {
830 int ret;
831
832 /* Only allow setting the type when the interface is down */ 872 /* Only allow setting the type when the interface is down */
833 rtnl_lock(); 873 rtnl_lock();
834 if (tun->dev->flags & IFF_UP) { 874 if (tun->dev->flags & IFF_UP) {
@@ -842,94 +882,44 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file,
842 } 882 }
843 rtnl_unlock(); 883 rtnl_unlock();
844 return ret; 884 return ret;
845 }
846 885
847#ifdef TUN_DEBUG 886#ifdef TUN_DEBUG
848 case TUNSETDEBUG: 887 case TUNSETDEBUG:
849 tun->debug = arg; 888 tun->debug = arg;
850 break; 889 break;
851#endif 890#endif
852
853 case TUNSETOFFLOAD: 891 case TUNSETOFFLOAD:
854 {
855 int ret;
856 rtnl_lock(); 892 rtnl_lock();
857 ret = set_offload(tun->dev, arg); 893 ret = set_offload(tun->dev, arg);
858 rtnl_unlock(); 894 rtnl_unlock();
859 return ret; 895 return ret;
860 }
861 896
862 case SIOCGIFFLAGS: 897 case TUNSETTXFILTER:
863 ifr.ifr_flags = tun->if_flags; 898 /* Can be set only for TAPs */
864 if (copy_to_user( argp, &ifr, sizeof ifr)) 899 if ((tun->flags & TUN_TYPE_MASK) != TUN_TAP_DEV)
865 return -EFAULT; 900 return -EINVAL;
866 return 0; 901 rtnl_lock();
867 902 ret = update_filter(&tun->txflt, (void *) __user arg);
868 case SIOCSIFFLAGS: 903 rtnl_unlock();
869 /** Set the character device's interface flags. Currently only 904 return ret;
870 * IFF_PROMISC and IFF_ALLMULTI are used. */
871 tun->if_flags = ifr.ifr_flags;
872 DBG(KERN_INFO "%s: interface flags 0x%lx\n",
873 tun->dev->name, tun->if_flags);
874 return 0;
875 905
876 case SIOCGIFHWADDR: 906 case SIOCGIFHWADDR:
877 /* Note: the actual net device's address may be different */ 907 /* Get hw addres */
878 memcpy(ifr.ifr_hwaddr.sa_data, tun->dev_addr, 908 memcpy(ifr.ifr_hwaddr.sa_data, tun->dev->dev_addr, ETH_ALEN);
879 min(sizeof ifr.ifr_hwaddr.sa_data, sizeof tun->dev_addr)); 909 ifr.ifr_hwaddr.sa_family = tun->dev->type;
880 if (copy_to_user( argp, &ifr, sizeof ifr)) 910 if (copy_to_user(argp, &ifr, sizeof ifr))
881 return -EFAULT; 911 return -EFAULT;
882 return 0; 912 return 0;
883 913
884 case SIOCSIFHWADDR: 914 case SIOCSIFHWADDR:
885 { 915 /* Set hw address */
886 /* try to set the actual net device's hw address */ 916 DBG(KERN_DEBUG "%s: set hw address: %s\n",
887 int ret; 917 tun->dev->name, print_mac(mac, ifr.ifr_hwaddr.sa_data));
888 918
889 rtnl_lock(); 919 rtnl_lock();
890 ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr); 920 ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
891 rtnl_unlock(); 921 rtnl_unlock();
892 922 return ret;
893 if (ret == 0) {
894 /** Set the character device's hardware address. This is used when
895 * filtering packets being sent from the network device to the character
896 * device. */
897 memcpy(tun->dev_addr, ifr.ifr_hwaddr.sa_data,
898 min(sizeof ifr.ifr_hwaddr.sa_data, sizeof tun->dev_addr));
899 DBG(KERN_DEBUG "%s: set hardware address: %x:%x:%x:%x:%x:%x\n",
900 tun->dev->name,
901 tun->dev_addr[0], tun->dev_addr[1], tun->dev_addr[2],
902 tun->dev_addr[3], tun->dev_addr[4], tun->dev_addr[5]);
903 }
904
905 return ret;
906 }
907
908 case SIOCADDMULTI:
909 /** Add the specified group to the character device's multicast filter
910 * list. */
911 rtnl_lock();
912 netif_tx_lock_bh(tun->dev);
913 add_multi(tun->chr_filter, ifr.ifr_hwaddr.sa_data);
914 netif_tx_unlock_bh(tun->dev);
915 rtnl_unlock();
916
917 DBG(KERN_DEBUG "%s: add multi: %s\n",
918 tun->dev->name, print_mac(mac, ifr.ifr_hwaddr.sa_data));
919 return 0;
920
921 case SIOCDELMULTI:
922 /** Remove the specified group from the character device's multicast
923 * filter list. */
924 rtnl_lock();
925 netif_tx_lock_bh(tun->dev);
926 del_multi(tun->chr_filter, ifr.ifr_hwaddr.sa_data);
927 netif_tx_unlock_bh(tun->dev);
928 rtnl_unlock();
929
930 DBG(KERN_DEBUG "%s: del multi: %s\n",
931 tun->dev->name, print_mac(mac, ifr.ifr_hwaddr.sa_data));
932 return 0;
933 923
934 default: 924 default:
935 return -EINVAL; 925 return -EINVAL;