aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/net/tun.c
diff options
context:
space:
mode:
author: Herbert Xu <herbert@gondor.apana.org.au> 2009-02-06 00:25:32 -0500
committer: David S. Miller <davem@davemloft.net> 2009-02-06 00:25:32 -0500
commit: 33dccbb050bbe35b88ca8cf1228dcf3e4d4b3554 (patch)
tree: 19435e330ac81b77c59a56ceea6c66d7efc0bc97 /drivers/net/tun.c
parent: 4cc7f68d65558f683c702d4fe3a5aac4c5227b97 (diff)
tun: Limit amount of queued packets per device
Unlike a normal socket path, the tuntap device send path does not have any accounting. This means that the user-space sender may be able to pin down arbitrary amounts of kernel memory by continuing to send data to an end-point that is congested. Even when this isn't an issue because of limited queueing at most end points, this can also be a problem because its only response to congestion is packet loss. That is, when those local queues at the end-point fill up, the tuntap device will start wasting system time because it will continue to send data there which simply gets dropped straight away. Of course one could argue that everybody should do congestion control end-to-end; unfortunately, there are people in this world still hooked on UDP, and they don't appear to be going away anywhere fast. In fact, we've always helped them by performing accounting in our UDP code, the sole purpose of which is to provide congestion feedback other than through packet loss. This patch attempts to apply the same bandaid to the tuntap device. It creates a pseudo-socket object which is used to account our packets just as a normal socket does for UDP. Of course things are a little complex because we're actually reinjecting traffic back into the stack rather than out of the stack. The stack complexities however should have been resolved by preceding patches. So this one can simply start using skb_set_owner_w. For now the accounting is essentially disabled by default for backwards compatibility. In particular, we set the cap to INT_MAX. This is so that existing applications don't get confused by the sudden arrival of EAGAIN errors. In future we may wish (or be forced) to do this by default. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'drivers/net/tun.c')
-rw-r--r-- drivers/net/tun.c 167
1 files changed, 114 insertions, 53 deletions
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 15d67635bb10..0476549841ac 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -64,6 +64,7 @@
64#include <net/net_namespace.h> 64#include <net/net_namespace.h>
65#include <net/netns/generic.h> 65#include <net/netns/generic.h>
66#include <net/rtnetlink.h> 66#include <net/rtnetlink.h>
67#include <net/sock.h>
67 68
68#include <asm/system.h> 69#include <asm/system.h>
69#include <asm/uaccess.h> 70#include <asm/uaccess.h>
@@ -95,6 +96,8 @@ struct tun_file {
95 wait_queue_head_t read_wait; 96 wait_queue_head_t read_wait;
96}; 97};
97 98
99struct tun_sock;
100
98struct tun_struct { 101struct tun_struct {
99 struct tun_file *tfile; 102 struct tun_file *tfile;
100 unsigned int flags; 103 unsigned int flags;
@@ -107,12 +110,24 @@ struct tun_struct {
107 struct fasync_struct *fasync; 110 struct fasync_struct *fasync;
108 111
109 struct tap_filter txflt; 112 struct tap_filter txflt;
113 struct sock *sk;
114 struct socket socket;
110 115
111#ifdef TUN_DEBUG 116#ifdef TUN_DEBUG
112 int debug; 117 int debug;
113#endif 118#endif
114}; 119};
115 120
121struct tun_sock {
122 struct sock sk;
123 struct tun_struct *tun;
124};
125
126static inline struct tun_sock *tun_sk(struct sock *sk)
127{
128 return container_of(sk, struct tun_sock, sk);
129}
130
116static int tun_attach(struct tun_struct *tun, struct file *file) 131static int tun_attach(struct tun_struct *tun, struct file *file)
117{ 132{
118 struct tun_file *tfile = file->private_data; 133 struct tun_file *tfile = file->private_data;
@@ -461,7 +476,8 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
461{ 476{
462 struct tun_file *tfile = file->private_data; 477 struct tun_file *tfile = file->private_data;
463 struct tun_struct *tun = __tun_get(tfile); 478 struct tun_struct *tun = __tun_get(tfile);
464 unsigned int mask = POLLOUT | POLLWRNORM; 479 struct sock *sk = tun->sk;
480 unsigned int mask = 0;
465 481
466 if (!tun) 482 if (!tun)
467 return POLLERR; 483 return POLLERR;
@@ -473,6 +489,11 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
473 if (!skb_queue_empty(&tun->readq)) 489 if (!skb_queue_empty(&tun->readq))
474 mask |= POLLIN | POLLRDNORM; 490 mask |= POLLIN | POLLRDNORM;
475 491
492 if (sock_writeable(sk) ||
493 (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
494 sock_writeable(sk)))
495 mask |= POLLOUT | POLLWRNORM;
496
476 if (tun->dev->reg_state != NETREG_REGISTERED) 497 if (tun->dev->reg_state != NETREG_REGISTERED)
477 mask = POLLERR; 498 mask = POLLERR;
478 499
@@ -482,66 +503,35 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
482 503
483/* prepad is the amount to reserve at front. len is length after that. 504/* prepad is the amount to reserve at front. len is length after that.
484 * linear is a hint as to how much to copy (usually headers). */ 505 * linear is a hint as to how much to copy (usually headers). */
485static struct sk_buff *tun_alloc_skb(size_t prepad, size_t len, size_t linear, 506static inline struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
486 gfp_t gfp) 507 size_t prepad, size_t len,
508 size_t linear, int noblock)
487{ 509{
510 struct sock *sk = tun->sk;
488 struct sk_buff *skb; 511 struct sk_buff *skb;
489 unsigned int i; 512 int err;
490
491 skb = alloc_skb(prepad + len, gfp|__GFP_NOWARN);
492 if (skb) {
493 skb_reserve(skb, prepad);
494 skb_put(skb, len);
495 return skb;
496 }
497 513
498 /* Under a page? Don't bother with paged skb. */ 514 /* Under a page? Don't bother with paged skb. */
499 if (prepad + len < PAGE_SIZE) 515 if (prepad + len < PAGE_SIZE)
500 return NULL; 516 linear = len;
501 517
502 /* Start with a normal skb, and add pages. */ 518 skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
503 skb = alloc_skb(prepad + linear, gfp); 519 &err);
504 if (!skb) 520 if (!skb)
505 return NULL; 521 return ERR_PTR(err);
506 522
507 skb_reserve(skb, prepad); 523 skb_reserve(skb, prepad);
508 skb_put(skb, linear); 524 skb_put(skb, linear);
509 525 skb->data_len = len - linear;
510 len -= linear; 526 skb->len += len - linear;
511
512 for (i = 0; i < MAX_SKB_FRAGS; i++) {
513 skb_frag_t *f = &skb_shinfo(skb)->frags[i];
514
515 f->page = alloc_page(gfp|__GFP_ZERO);
516 if (!f->page)
517 break;
518
519 f->page_offset = 0;
520 f->size = PAGE_SIZE;
521
522 skb->data_len += PAGE_SIZE;
523 skb->len += PAGE_SIZE;
524 skb->truesize += PAGE_SIZE;
525 skb_shinfo(skb)->nr_frags++;
526
527 if (len < PAGE_SIZE) {
528 len = 0;
529 break;
530 }
531 len -= PAGE_SIZE;
532 }
533
534 /* Too large, or alloc fail? */
535 if (unlikely(len)) {
536 kfree_skb(skb);
537 skb = NULL;
538 }
539 527
540 return skb; 528 return skb;
541} 529}
542 530
543/* Get packet from user space buffer */ 531/* Get packet from user space buffer */
544static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count) 532static __inline__ ssize_t tun_get_user(struct tun_struct *tun,
533 struct iovec *iv, size_t count,
534 int noblock)
545{ 535{
546 struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) }; 536 struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
547 struct sk_buff *skb; 537 struct sk_buff *skb;
@@ -573,9 +563,11 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv,
573 return -EINVAL; 563 return -EINVAL;
574 } 564 }
575 565
576 if (!(skb = tun_alloc_skb(align, len, gso.hdr_len, GFP_KERNEL))) { 566 skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock);
577 tun->dev->stats.rx_dropped++; 567 if (IS_ERR(skb)) {
578 return -ENOMEM; 568 if (PTR_ERR(skb) != -EAGAIN)
569 tun->dev->stats.rx_dropped++;
570 return PTR_ERR(skb);
579 } 571 }
580 572
581 if (skb_copy_datagram_from_iovec(skb, 0, iv, len)) { 573 if (skb_copy_datagram_from_iovec(skb, 0, iv, len)) {
@@ -661,7 +653,8 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv,
661static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, 653static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
662 unsigned long count, loff_t pos) 654 unsigned long count, loff_t pos)
663{ 655{
664 struct tun_struct *tun = tun_get(iocb->ki_filp); 656 struct file *file = iocb->ki_filp;
657 struct tun_struct *tun = file->private_data;
665 ssize_t result; 658 ssize_t result;
666 659
667 if (!tun) 660 if (!tun)
@@ -669,7 +662,8 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
669 662
670 DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count); 663 DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
671 664
672 result = tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count)); 665 result = tun_get_user(tun, (struct iovec *)iv, iov_length(iv, count),
666 file->f_flags & O_NONBLOCK);
673 667
674 tun_put(tun); 668 tun_put(tun);
675 return result; 669 return result;
@@ -828,11 +822,40 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = {
828 .validate = tun_validate, 822 .validate = tun_validate,
829}; 823};
830 824
825static void tun_sock_write_space(struct sock *sk)
826{
827 struct tun_struct *tun;
828
829 if (!sock_writeable(sk))
830 return;
831
832 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
833 wake_up_interruptible_sync(sk->sk_sleep);
834
835 if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
836 return;
837
838 tun = container_of(sk, struct tun_sock, sk)->tun;
839 kill_fasync(&tun->fasync, SIGIO, POLL_OUT);
840}
841
842static void tun_sock_destruct(struct sock *sk)
843{
844 dev_put(container_of(sk, struct tun_sock, sk)->tun->dev);
845}
846
847static struct proto tun_proto = {
848 .name = "tun",
849 .owner = THIS_MODULE,
850 .obj_size = sizeof(struct tun_sock),
851};
831 852
832static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) 853static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
833{ 854{
855 struct sock *sk;
834 struct tun_struct *tun; 856 struct tun_struct *tun;
835 struct net_device *dev; 857 struct net_device *dev;
858 struct tun_file *tfile = file->private_data;
836 int err; 859 int err;
837 860
838 dev = __dev_get_by_name(net, ifr->ifr_name); 861 dev = __dev_get_by_name(net, ifr->ifr_name);
@@ -885,14 +908,31 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
885 tun->flags = flags; 908 tun->flags = flags;
886 tun->txflt.count = 0; 909 tun->txflt.count = 0;
887 910
911 err = -ENOMEM;
912 sk = sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto);
913 if (!sk)
914 goto err_free_dev;
915
916 /* This ref count is for tun->sk. */
917 dev_hold(dev);
918 sock_init_data(&tun->socket, sk);
919 sk->sk_write_space = tun_sock_write_space;
920 sk->sk_destruct = tun_sock_destruct;
921 sk->sk_sndbuf = INT_MAX;
922 sk->sk_sleep = &tfile->read_wait;
923
924 tun->sk = sk;
925 container_of(sk, struct tun_sock, sk)->tun = tun;
926
888 tun_net_init(dev); 927 tun_net_init(dev);
889 928
890 if (strchr(dev->name, '%')) { 929 if (strchr(dev->name, '%')) {
891 err = dev_alloc_name(dev, dev->name); 930 err = dev_alloc_name(dev, dev->name);
892 if (err < 0) 931 if (err < 0)
893 goto err_free_dev; 932 goto err_free_sk;
894 } 933 }
895 934
935 err = -EINVAL;
896 err = register_netdevice(tun->dev); 936 err = register_netdevice(tun->dev);
897 if (err < 0) 937 if (err < 0)
898 goto err_free_dev; 938 goto err_free_dev;
@@ -928,6 +968,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
928 strcpy(ifr->ifr_name, tun->dev->name); 968 strcpy(ifr->ifr_name, tun->dev->name);
929 return 0; 969 return 0;
930 970
971 err_free_sk:
972 sock_put(sk);
931 err_free_dev: 973 err_free_dev:
932 free_netdev(dev); 974 free_netdev(dev);
933 failed: 975 failed:
@@ -1012,6 +1054,7 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file,
1012 struct tun_struct *tun; 1054 struct tun_struct *tun;
1013 void __user* argp = (void __user*)arg; 1055 void __user* argp = (void __user*)arg;
1014 struct ifreq ifr; 1056 struct ifreq ifr;
1057 int sndbuf;
1015 int ret; 1058 int ret;
1016 1059
1017 if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89) 1060 if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89)
@@ -1151,6 +1194,22 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file,
1151 ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr); 1194 ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
1152 rtnl_unlock(); 1195 rtnl_unlock();
1153 break; 1196 break;
1197
1198 case TUNGETSNDBUF:
1199 sndbuf = tun->sk->sk_sndbuf;
1200 if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
1201 ret = -EFAULT;
1202 break;
1203
1204 case TUNSETSNDBUF:
1205 if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
1206 ret = -EFAULT;
1207 break;
1208 }
1209
1210 tun->sk->sk_sndbuf = sndbuf;
1211 break;
1212
1154 default: 1213 default:
1155 ret = -EINVAL; 1214 ret = -EINVAL;
1156 break; 1215 break;
@@ -1218,8 +1277,10 @@ static int tun_chr_close(struct inode *inode, struct file *file)
1218 __tun_detach(tun); 1277 __tun_detach(tun);
1219 1278
1220 /* If desireable, unregister the netdevice. */ 1279 /* If desireable, unregister the netdevice. */
1221 if (!(tun->flags & TUN_PERSIST)) 1280 if (!(tun->flags & TUN_PERSIST)) {
1281 sock_put(tun->sk);
1222 unregister_netdevice(tun->dev); 1282 unregister_netdevice(tun->dev);
1283 }
1223 1284
1224 rtnl_unlock(); 1285 rtnl_unlock();
1225 } 1286 }