From 77238f2b942b38ab4e7f3aced44084493e4a8675 Mon Sep 17 00:00:00 2001 From: Tomoki Sekiyama Date: Sun, 18 Oct 2009 23:17:37 -0700 Subject: AF_UNIX: Fix deadlock on connecting to shutdown socket I found a deadlock bug in UNIX domain socket, which makes able to DoS attack against the local machine by non-root users. How to reproduce: 1. Make a listening AF_UNIX/SOCK_STREAM socket with an abstruct namespace(*), and shutdown(2) it. 2. Repeat connect(2)ing to the listening socket from the other sockets until the connection backlog is full-filled. 3. connect(2) takes the CPU forever. If every core is taken, the system hangs. PoC code: (Run as many times as cores on SMP machines.) int main(void) { int ret; int csd; int lsd; struct sockaddr_un sun; /* make an abstruct name address (*) */ memset(&sun, 0, sizeof(sun)); sun.sun_family = PF_UNIX; sprintf(&sun.sun_path[1], "%d", getpid()); /* create the listening socket and shutdown */ lsd = socket(AF_UNIX, SOCK_STREAM, 0); bind(lsd, (struct sockaddr *)&sun, sizeof(sun)); listen(lsd, 1); shutdown(lsd, SHUT_RDWR); /* connect loop */ alarm(15); /* forcely exit the loop after 15 sec */ for (;;) { csd = socket(AF_UNIX, SOCK_STREAM, 0); ret = connect(csd, (struct sockaddr *)&sun, sizeof(sun)); if (-1 == ret) { perror("connect()"); break; } puts("Connection OK"); } return 0; } (*) Make sun_path[0] = 0 to use the abstruct namespace. If a file-based socket is used, the system doesn't deadlock because of context switches in the file system layer. Why this happens: Error checks between unix_socket_connect() and unix_wait_for_peer() are inconsistent. The former calls the latter to wait until the backlog is processed. Despite the latter returns without doing anything when the socket is shutdown, the former doesn't check the shutdown state and just retries calling the latter forever. Patch: The patch below adds shutdown check into unix_socket_connect(), so connect(2) to the shutdown socket will return -ECONREFUSED. Signed-off-by: Tomoki Sekiyama Signed-off-by: Masanori Yoshida Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 51ab497115eb..fc820cd75453 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1074,6 +1074,8 @@ restart: err = -ECONNREFUSED; if (other->sk_state != TCP_LISTEN) goto out_unlock; + if (other->sk_shutdown & RCV_SHUTDOWN) + goto out_unlock; if (unix_recvq_full(other)) { err = -EAGAIN; -- cgit v1.2.2 From a1a2ad9151c26d92e5c733a33d52108f5d3a5b57 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 19 Oct 2009 19:12:36 -0700 Subject: Revert "tcp: fix tcp_defer_accept to consider the timeout" This reverts commit 6d01a026b7d3009a418326bdcf313503a314f1ea. Julian Anastasov, Willy Tarreau and Eric Dumazet have come up with a more correct way to deal with this. Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index e320afea07fc..624c3c9b3c2b 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -644,7 +644,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { - inet_csk(sk)->icsk_accept_queue.rskq_defer_accept--; inet_rsk(req)->acked = 1; return NULL; } -- cgit v1.2.2 From d1b99ba41d6c5aa1ed2fc634323449dd656899e9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 19 Oct 2009 10:01:56 +0000 Subject: tcp: accept socket after TCP_DEFER_ACCEPT period Willy Tarreau and many other folks in recent years were concerned what happens when the TCP_DEFER_ACCEPT period expires for clients which sent ACK packet. They prefer clients that actively resend ACK on our SYN-ACK retransmissions to be converted from open requests to sockets and queued to the listener for accepting after the deferring period is finished. Then application server can decide to wait longer for data or to properly terminate the connection with FIN if read() returns EAGAIN which is an indication for accepting after the deferring period. This change still can have side effects for applications that expect always to see data on the accepted socket. Others can be prepared to work in both modes (with or without TCP_DEFER_ACCEPT period) and their data processing can ignore the read=EAGAIN notification and to allocate resources for clients which proved to have no data to send during the deferring period. OTOH, servers that use TCP_DEFER_ACCEPT=1 as flag (not as a timeout) to wait for data will notice clients that didn't send data for 3 seconds but that still resend ACKs. Thanks to Willy Tarreau for the initial idea and to Eric Dumazet for the review and testing the change. Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_minisocks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 624c3c9b3c2b..4c03598ed924 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -641,8 +641,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; - /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ - if (inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && + /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ + if (req->retrans < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; return NULL; -- cgit v1.2.2 From 0c3d79bce48034018e840468ac5a642894a521a3 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 19 Oct 2009 10:03:58 +0000 Subject: tcp: reduce SYN-ACK retrans for TCP_DEFER_ACCEPT Change SYN-ACK retransmitting code for the TCP_DEFER_ACCEPT users to not retransmit SYN-ACKs during the deferring period if ACK from client was received. The goal is to reduce traffic during the deferring period. When the period is finished we continue with sending SYN-ACKs (at least one) but this time any traffic from client will change the request to established socket allowing application to terminate it properly. Also, do not drop acked request if sending of SYN-ACK fails. Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/inet_connection_sock.c | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4351ca2cf0b8..537731b3bcb3 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -446,6 +446,28 @@ extern int sysctl_tcp_synack_retries; EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); +/* Decide when to expire the request and when to resend SYN-ACK */ +static inline void syn_ack_recalc(struct request_sock *req, const int thresh, + const int max_retries, + const u8 rskq_defer_accept, + int *expire, int *resend) +{ + if (!rskq_defer_accept) { + *expire = req->retrans >= thresh; + *resend = 1; + return; + } + *expire = req->retrans >= thresh && + (!inet_rsk(req)->acked || req->retrans >= max_retries); + /* + * Do not resend while waiting for data after ACK, + * start to resend on end of deferring period to give + * last chance for data or ACK to create established socket. + */ + *resend = !inet_rsk(req)->acked || + req->retrans >= rskq_defer_accept - 1; +} + void inet_csk_reqsk_queue_prune(struct sock *parent, const unsigned long interval, const unsigned long timeout, @@ -501,9 +523,15 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, reqp=&lopt->syn_table[i]; while ((req = *reqp) != NULL) { if (time_after_eq(now, req->expires)) { - if ((req->retrans < thresh || - (inet_rsk(req)->acked && req->retrans < max_retries)) - && !req->rsk_ops->rtx_syn_ack(parent, req)) { + int expire = 0, resend = 0; + + syn_ack_recalc(req, thresh, max_retries, + queue->rskq_defer_accept, + &expire, &resend); + if (!expire && + (!resend || + !req->rsk_ops->rtx_syn_ack(parent, req) || + inet_rsk(req)->acked)) { unsigned long timeo; if (req->retrans++ == 0) -- cgit v1.2.2 From b103cf34382f26ff48a87931b83f13b177b47c1a Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 19 Oct 2009 10:10:40 +0000 Subject: tcp: fix TCP_DEFER_ACCEPT retrans calculation Fix TCP_DEFER_ACCEPT conversion between seconds and retransmission to match the TCP SYN-ACK retransmission periods because the time is converted to such retransmissions. The old algorithm selects one more retransmission in some cases. Allow up to 255 retransmissions. Signed-off-by: Julian Anastasov Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 55 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 64d0af675823..9b2756fbdf9b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -326,6 +326,43 @@ void tcp_enter_memory_pressure(struct sock *sk) EXPORT_SYMBOL(tcp_enter_memory_pressure); +/* Convert seconds to retransmits based on initial and max timeout */ +static u8 secs_to_retrans(int seconds, int timeout, int rto_max) +{ + u8 res = 0; + + if (seconds > 0) { + int period = timeout; + + res = 1; + while (seconds > period && res < 255) { + res++; + timeout <<= 1; + if (timeout > rto_max) + timeout = rto_max; + period += timeout; + } + } + return res; +} + +/* Convert retransmits to seconds based on initial and max timeout */ +static int retrans_to_secs(u8 retrans, int timeout, int rto_max) +{ + int period = 0; + + if (retrans > 0) { + period = timeout; + while (--retrans) { + timeout <<= 1; + if (timeout > rto_max) + timeout = rto_max; + period += timeout; + } + } + return period; +} + /* * Wait for a TCP event. * @@ -2163,16 +2200,10 @@ static int do_tcp_setsockopt(struct sock *sk, int level, break; case TCP_DEFER_ACCEPT: - icsk->icsk_accept_queue.rskq_defer_accept = 0; - if (val > 0) { - /* Translate value in seconds to number of - * retransmits */ - while (icsk->icsk_accept_queue.rskq_defer_accept < 32 && - val > ((TCP_TIMEOUT_INIT / HZ) << - icsk->icsk_accept_queue.rskq_defer_accept)) - icsk->icsk_accept_queue.rskq_defer_accept++; - icsk->icsk_accept_queue.rskq_defer_accept++; - } + /* Translate value in seconds to number of retransmits */ + icsk->icsk_accept_queue.rskq_defer_accept = + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: @@ -2353,8 +2384,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = (val ? : sysctl_tcp_fin_timeout) / HZ; break; case TCP_DEFER_ACCEPT: - val = !icsk->icsk_accept_queue.rskq_defer_accept ? 0 : - ((TCP_TIMEOUT_INIT / HZ) << (icsk->icsk_accept_queue.rskq_defer_accept - 1)); + val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, + TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; -- cgit v1.2.2 From f74c77cb1124a11acf69c98d10c0fdc22f322664 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Sun, 18 Oct 2009 20:24:41 +0000 Subject: bluetooth: scheduling while atomic bug fix Due to driver core changes dev_set_drvdata will call kzalloc which should be in might_sleep context, but hci_conn_add will be called in atomic context Like dev_set_name move dev_set_drvdata to work queue function. oops as following: Oct 2 17:41:59 darkstar kernel: [ 438.001341] BUG: sleeping function called from invalid context at mm/slqb.c:1546 Oct 2 17:41:59 darkstar kernel: [ 438.001345] in_atomic(): 1, irqs_disabled(): 0, pid: 2133, name: sdptool Oct 2 17:41:59 darkstar kernel: [ 438.001348] 2 locks held by sdptool/2133: Oct 2 17:41:59 darkstar kernel: [ 438.001350] #0: (sk_lock-AF_BLUETOOTH-BTPROTO_L2CAP){+.+.+.}, at: [] lock_sock+0xa/0xc [l2cap] Oct 2 17:41:59 darkstar kernel: [ 438.001360] #1: (&hdev->lock){+.-.+.}, at: [] l2cap_sock_connect+0x103/0x26b [l2cap] Oct 2 17:41:59 darkstar kernel: [ 438.001371] Pid: 2133, comm: sdptool Not tainted 2.6.31-mm1 #2 Oct 2 17:41:59 darkstar kernel: [ 438.001373] Call Trace: Oct 2 17:41:59 darkstar kernel: [ 438.001381] [] __might_sleep+0xde/0xe5 Oct 2 17:41:59 darkstar kernel: [ 438.001386] [] __kmalloc+0x4a/0x15a Oct 2 17:41:59 darkstar kernel: [ 438.001392] [] ? kzalloc+0xb/0xd Oct 2 17:41:59 darkstar kernel: [ 438.001396] [] kzalloc+0xb/0xd Oct 2 17:41:59 darkstar kernel: [ 438.001400] [] device_private_init+0x15/0x3d Oct 2 17:41:59 darkstar kernel: [ 438.001405] [] dev_set_drvdata+0x18/0x26 Oct 2 17:41:59 darkstar kernel: [ 438.001414] [] hci_conn_init_sysfs+0x40/0xd9 [bluetooth] Oct 2 17:41:59 darkstar kernel: [ 438.001422] [] ? hci_conn_add+0x128/0x186 [bluetooth] Oct 2 17:41:59 darkstar kernel: [ 438.001429] [] hci_conn_add+0x177/0x186 [bluetooth] Oct 2 17:41:59 darkstar kernel: [ 438.001437] [] hci_connect+0x3c/0xfb [bluetooth] Oct 2 17:41:59 darkstar kernel: [ 438.001442] [] l2cap_sock_connect+0x174/0x26b [l2cap] Oct 2 17:41:59 darkstar kernel: [ 438.001448] [] sys_connect+0x60/0x7a Oct 2 17:41:59 darkstar kernel: [ 438.001453] [] ? lock_release_non_nested+0x84/0x1de Oct 2 17:41:59 darkstar kernel: [ 438.001458] [] ? might_fault+0x47/0x81 Oct 2 17:41:59 darkstar kernel: [ 438.001462] [] ? might_fault+0x47/0x81 Oct 2 17:41:59 darkstar kernel: [ 438.001468] [] ? __copy_from_user_ll+0x11/0xce Oct 2 17:41:59 darkstar kernel: [ 438.001472] [] sys_socketcall+0x82/0x17b Oct 2 17:41:59 darkstar kernel: [ 438.001477] [] syscall_call+0x7/0xb Signed-off-by: Dave Young Signed-off-by: David S. Miller --- net/bluetooth/hci_sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 7f939ce29801..2bc6f6a8de68 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -92,6 +92,8 @@ static void add_conn(struct work_struct *work) dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle); + dev_set_drvdata(&conn->dev, conn); + if (device_add(&conn->dev) < 0) { BT_ERR("Failed to register connection device"); return; @@ -144,8 +146,6 @@ void hci_conn_init_sysfs(struct hci_conn *conn) conn->dev.class = bt_class; conn->dev.parent = &hdev->dev; - dev_set_drvdata(&conn->dev, conn); - device_initialize(&conn->dev); INIT_WORK(&conn->work_add, add_conn); -- cgit v1.2.2 From 45054dc1bf2367ccb0e7c0486037907cd9395f8b Mon Sep 17 00:00:00 2001 From: Dave Young Date: Sun, 18 Oct 2009 20:28:30 +0000 Subject: bluetooth: static lock key fix When shutdown ppp connection, lockdep waring about non-static key will happen, it is caused by the lock is not initialized properly at that time. Fix with tuning the lock/skb_queue_head init order [ 94.339261] INFO: trying to register non-static key. [ 94.342509] the code is fine but needs lockdep annotation. [ 94.342509] turning off the locking correctness validator. [ 94.342509] Pid: 0, comm: swapper Not tainted 2.6.31-mm1 #2 [ 94.342509] Call Trace: [ 94.342509] [] register_lock_class+0x58/0x241 [ 94.342509] [] ? __lock_acquire+0xb57/0xb73 [ 94.342509] [] __lock_acquire+0xac/0xb73 [ 94.342509] [] ? lock_release_non_nested+0x17b/0x1de [ 94.342509] [] lock_acquire+0x67/0x84 [ 94.342509] [] ? skb_dequeue+0x15/0x41 [ 94.342509] [] _spin_lock_irqsave+0x2f/0x3f [ 94.342509] [] ? skb_dequeue+0x15/0x41 [ 94.342509] [] skb_dequeue+0x15/0x41 [ 94.342509] [] ? _read_unlock+0x1d/0x20 [ 94.342509] [] skb_queue_purge+0x14/0x1b [ 94.342509] [] l2cap_recv_frame+0xea1/0x115a [l2cap] [ 94.342509] [] ? __lock_acquire+0xb57/0xb73 [ 94.342509] [] ? mark_lock+0x1e/0x1c7 [ 94.342509] [] ? hci_rx_task+0xd2/0x1bc [bluetooth] [ 94.342509] [] l2cap_recv_acldata+0xb1/0x1c6 [l2cap] [ 94.342509] [] hci_rx_task+0x106/0x1bc [bluetooth] [ 94.342509] [] ? l2cap_recv_acldata+0x0/0x1c6 [l2cap] [ 94.342509] [] tasklet_action+0x69/0xc1 [ 94.342509] [] __do_softirq+0x94/0x11e [ 94.342509] [] do_softirq+0x36/0x5a [ 94.342509] [] irq_exit+0x35/0x68 [ 94.342509] [] do_IRQ+0x72/0x89 [ 94.342509] [] common_interrupt+0x2e/0x34 [ 94.342509] [] ? pm_qos_add_requirement+0x63/0x9d [ 94.342509] [] ? acpi_idle_enter_bm+0x209/0x238 [ 94.342509] [] cpuidle_idle_call+0x5c/0x94 [ 94.342509] [] cpu_idle+0x4e/0x6f [ 94.342509] [] rest_init+0x53/0x55 [ 94.342509] [] start_kernel+0x2f0/0x2f5 [ 94.342509] [] i386_start_kernel+0x91/0x96 Reported-by: Oliver Hartkopp Signed-off-by: Dave Young Tested-by: Oliver Hartkopp Signed-off-by: David S. Miller --- net/bluetooth/l2cap.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 555d9da1869b..77e9fb130adb 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -555,12 +555,12 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status) conn->feat_mask = 0; - setup_timer(&conn->info_timer, l2cap_info_timeout, - (unsigned long) conn); - spin_lock_init(&conn->lock); rwlock_init(&conn->chan_list.lock); + setup_timer(&conn->info_timer, l2cap_info_timeout, + (unsigned long) conn); + conn->disc_reason = 0x13; return conn; @@ -783,6 +783,9 @@ static void l2cap_sock_init(struct sock *sk, struct sock *parent) /* Default config options */ pi->conf_len = 0; pi->flush_to = L2CAP_DEFAULT_FLUSH_TO; + skb_queue_head_init(TX_QUEUE(sk)); + skb_queue_head_init(SREJ_QUEUE(sk)); + INIT_LIST_HEAD(SREJ_LIST(sk)); } static struct proto l2cap_proto = { -- cgit v1.2.2 From 55b8050353c4a212c94d7156e2bd5885225b869b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 19 Oct 2009 06:41:58 +0000 Subject: net: Fix IP_MULTICAST_IF ipv4/ipv6 setsockopt(IP_MULTICAST_IF) have dubious __dev_get_by_index() calls. This function should be called only with RTNL or dev_base_lock held, or reader could see a corrupt hash chain and eventually enter an endless loop. Fix is to call dev_get_by_index()/dev_put(). If this happens to be performance critical, we could define a new dev_exist_by_index() function to avoid touching dev refcount. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_sockglue.c | 7 +++---- net/ipv6/ipv6_sockglue.c | 6 +++++- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 0c0b6e363a20..e982b5c1ee17 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -634,17 +634,16 @@ static int do_ip_setsockopt(struct sock *sk, int level, break; } dev = ip_dev_find(sock_net(sk), mreq.imr_address.s_addr); - if (dev) { + if (dev) mreq.imr_ifindex = dev->ifindex; - dev_put(dev); - } } else - dev = __dev_get_by_index(sock_net(sk), mreq.imr_ifindex); + dev = dev_get_by_index(sock_net(sk), mreq.imr_ifindex); err = -EADDRNOTAVAIL; if (!dev) break; + dev_put(dev); err = -EINVAL; if (sk->sk_bound_dev_if && diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 14f54eb5a7fc..4f7aaf6996a3 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -496,13 +496,17 @@ done: goto e_inval; if (val) { + struct net_device *dev; + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val) goto e_inval; - if (__dev_get_by_index(net, val) == NULL) { + dev = dev_get_by_index(net, val); + if (!dev) { retv = -ENODEV; break; } + dev_put(dev); } np->mcast_oif = val; retv = 0; -- cgit v1.2.2 From b6b39e8f3fbbb31001b836afec87bcaf4811a7bf Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 19 Oct 2009 19:41:06 +0000 Subject: tcp: Try to catch MSG_PEEK bug This patch tries to print out more information when we hit the MSG_PEEK bug in tcp_recvmsg. It's been around since at least 2005 and it's about time that we finally fix it. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9b2756fbdf9b..90b2e0649bfb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1442,7 +1442,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto found_ok_skb; if (tcp_hdr(skb)->fin) goto found_fin_ok; - WARN_ON(!(flags & MSG_PEEK)); + if (WARN_ON(!(flags & MSG_PEEK))) + printk(KERN_INFO "recvmsg bug 2: copied %X " + "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); } /* Well, if we have backlog, try to process it now yet. */ -- cgit v1.2.2 From e95646c3ec33c8ec0693992da4332a6b32eb7e31 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Wed, 30 Sep 2009 11:17:21 +0200 Subject: virtio: let header files include virtio_ids.h Rusty, commit 3ca4f5ca73057a617f9444a91022d7127041970a virtio: add virtio IDs file moved all device IDs into a single file. While the change itself is a very good one, it can break userspace applications. For example if a userspace tool wanted to get the ID of virtio_net it used to include virtio_net.h. This does no longer work, since virtio_net.h does not include virtio_ids.h. This patch moves all "#include " from the C files into the header files, making the header files compatible with the old ones. In addition, this patch exports virtio_ids.h to userspace. CC: Fernando Luis Vazquez Cao Signed-off-by: Christian Borntraeger Signed-off-by: Rusty Russell --- net/9p/trans_virtio.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index b2e07f0dd298..ea1e3daabefe 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #define VIRTQUEUE_NUM 128 -- cgit v1.2.2 From c62f4c453ab4b0240ab857bfd089da2c01ad91e7 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 22 Oct 2009 21:37:56 -0700 Subject: net: use WARN() for the WARN_ON in commit b6b39e8f3fbbb Commit b6b39e8f3fbbb (tcp: Try to catch MSG_PEEK bug) added a printk() to the WARN_ON() that's in tcp.c. This patch changes this combination to WARN(); the advantage of WARN() is that the printk message shows up inside the message, so that kerneloops.org will collect the message. In addition, this gets rid of an extra if() statement. Signed-off-by: Arjan van de Ven Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 90b2e0649bfb..98440ad82558 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1442,9 +1442,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto found_ok_skb; if (tcp_hdr(skb)->fin) goto found_fin_ok; - if (WARN_ON(!(flags & MSG_PEEK))) - printk(KERN_INFO "recvmsg bug 2: copied %X " - "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); + WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: " + "copied %X seq %X\n", *seq, + TCP_SKB_CB(skb)->seq); } /* Well, if we have backlog, try to process it now yet. */ -- cgit v1.2.2 From 66ed1e5ec1d979e572554643063734a7664261bb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 24 Oct 2009 06:55:20 -0700 Subject: pktgen: Dont leak kernel memory While playing with pktgen, I realized IP ID was not filled and a random value was taken, possibly leaking 2 bytes of kernel memory. We can use an increasing ID, this can help diagnostics anyway. Also clear packet payload, instead of leaking kernel memory. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/pktgen.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 86acdba0a97d..6eb8d47cbf3a 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -335,6 +335,7 @@ struct pktgen_dev { __u32 cur_src_mac_offset; __be32 cur_saddr; __be32 cur_daddr; + __u16 ip_id; __u16 cur_udp_dst; __u16 cur_udp_src; __u16 cur_queue_map; @@ -2630,6 +2631,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, iph->protocol = IPPROTO_UDP; /* UDP */ iph->saddr = pkt_dev->cur_saddr; iph->daddr = pkt_dev->cur_daddr; + iph->id = htons(pkt_dev->ip_id); + pkt_dev->ip_id++; iph->frag_off = 0; iplen = 20 + 8 + datalen; iph->tot_len = htons(iplen); @@ -2641,24 +2644,26 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->dev = odev; skb->pkt_type = PACKET_HOST; - if (pkt_dev->nfrags <= 0) + if (pkt_dev->nfrags <= 0) { pgh = (struct pktgen_hdr *)skb_put(skb, datalen); - else { + memset(pgh + 1, 0, datalen - sizeof(struct pktgen_hdr)); + } else { int frags = pkt_dev->nfrags; - int i; + int i, len; pgh = (struct pktgen_hdr *)(((char *)(udph)) + 8); if (frags > MAX_SKB_FRAGS) frags = MAX_SKB_FRAGS; if (datalen > frags * PAGE_SIZE) { - skb_put(skb, datalen - frags * PAGE_SIZE); + len = datalen - frags * PAGE_SIZE; + memset(skb_put(skb, len), 0, len); datalen = frags * PAGE_SIZE; } i = 0; while (datalen > 0) { - struct page *page = alloc_pages(GFP_KERNEL, 0); + struct page *page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0); skb_shinfo(skb)->frags[i].page = page; skb_shinfo(skb)->frags[i].page_offset = 0; skb_shinfo(skb)->frags[i].size = -- cgit v1.2.2 From d419b9f0fa69e79ccba3e5e79a58a52ae0c2ed6a Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 19 Oct 2009 14:55:37 -0700 Subject: mac80211: fix ibss joining Recent commit "mac80211: fix logic error ibss merge bssid check" fixed joining of ibss cell when static bssid is provided. In this case ifibss->bssid is set before the cell is joined and comparing that address to a bss should thus always succeed. Unfortunately this change broke the other case of joining a ibss cell without providing a static bssid where the value of ifibss->bssid is not set before the cell is joined. Since ifibss->bssid may be set before or after joining the cell we do not learn anything by comparing it to a known bss. Remove this check. Signed-off-by: Reinette Chatre Signed-off-by: John W. Linville --- net/mac80211/ibss.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 6eaf69823439..ca8ecce31d34 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -538,13 +538,12 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) WLAN_CAPABILITY_PRIVACY, capability); + if (bss) { #ifdef CONFIG_MAC80211_IBSS_DEBUG - if (bss) printk(KERN_DEBUG " sta_find_ibss: selected %pM current " "%pM\n", bss->cbss.bssid, ifibss->bssid); #endif /* CONFIG_MAC80211_IBSS_DEBUG */ - if (bss && !memcmp(ifibss->bssid, bss->cbss.bssid, ETH_ALEN)) { printk(KERN_DEBUG "%s: Selected IBSS BSSID %pM" " based on configured SSID\n", sdata->dev->name, bss->cbss.bssid); @@ -552,8 +551,7 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata) ieee80211_sta_join_ibss(sdata, bss); ieee80211_rx_bss_put(local, bss); return; - } else if (bss) - ieee80211_rx_bss_put(local, bss); + } #ifdef CONFIG_MAC80211_IBSS_DEBUG printk(KERN_DEBUG " did not try to join ibss\n"); -- cgit v1.2.2 From 2ef6e4440926668cfa9eac4b79e63528ebcbe0c1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 20 Oct 2009 15:08:12 +0900 Subject: mac80211: keep auth state when assoc fails When association fails, we should stay authenticated, which in mac80211 is represented by the existence of the mlme work struct, so we cannot free that, instead we need to just set it to idle. (Brought to you by the hacking session at Kernel Summit 2009 in Tokyo, Japan. -- JWL) Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 8d26e9bf8964..dc5049d58c51 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1457,8 +1457,7 @@ ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (status_code != WLAN_STATUS_SUCCESS) { printk(KERN_DEBUG "%s: AP denied association (code=%d)\n", sdata->dev->name, status_code); - list_del(&wk->list); - kfree(wk); + wk->state = IEEE80211_MGD_STATE_IDLE; return RX_MGMT_CFG80211_ASSOC; } -- cgit v1.2.2 From 7d930bc33653d5592dc386a76a38f39c2e962344 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 20 Oct 2009 15:08:53 +0900 Subject: cfg80211: sme: deauthenticate on assoc failure When the in-kernel SME gets an association failure from the AP we don't deauthenticate, and thus get into a very confused state which will lead to warnings later on. Fix this by actually deauthenticating when the AP indicates an association failure. (Brought to you by the hacking session at Kernel Summit 2009 in Tokyo, Japan. -- JWL) Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/core.h | 1 + net/wireless/mlme.c | 9 +++++++++ net/wireless/sme.c | 21 +++++++++++++++++++-- 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/core.h b/net/wireless/core.h index 2a33d8bc886b..68b321997d4c 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -358,6 +358,7 @@ int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_conn_work(struct work_struct *work); +void cfg80211_sme_failed_assoc(struct wireless_dev *wdev); bool cfg80211_sme_failed_reassoc(struct wireless_dev *wdev); /* internal helpers */ diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 79d2eec54cec..0a6b7a0eca6b 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -62,6 +62,7 @@ void cfg80211_send_rx_assoc(struct net_device *dev, const u8 *buf, size_t len) u8 *ie = mgmt->u.assoc_resp.variable; int i, ieoffs = offsetof(struct ieee80211_mgmt, u.assoc_resp.variable); struct cfg80211_internal_bss *bss = NULL; + bool need_connect_result = true; wdev_lock(wdev); @@ -94,6 +95,14 @@ void cfg80211_send_rx_assoc(struct net_device *dev, const u8 *buf, size_t len) } WARN_ON(!bss); + } else if (wdev->conn) { + cfg80211_sme_failed_assoc(wdev); + need_connect_result = false; + /* + * do not call connect_result() now because the + * sme will schedule work that does it later. + */ + goto out; } if (!wdev->conn && wdev->sme_state == CFG80211_SME_IDLE) { diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 93c3ed329204..ece378d531ef 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -26,6 +26,7 @@ struct cfg80211_conn { CFG80211_CONN_AUTHENTICATING, CFG80211_CONN_ASSOCIATE_NEXT, CFG80211_CONN_ASSOCIATING, + CFG80211_CONN_DEAUTH_ASSOC_FAIL, } state; u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN]; u8 *ie; @@ -148,6 +149,12 @@ static int cfg80211_conn_do_work(struct wireless_dev *wdev) NULL, 0, WLAN_REASON_DEAUTH_LEAVING); return err; + case CFG80211_CONN_DEAUTH_ASSOC_FAIL: + __cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, + NULL, 0, + WLAN_REASON_DEAUTH_LEAVING); + /* return an error so that we call __cfg80211_connect_result() */ + return -EINVAL; default: return 0; } @@ -158,6 +165,7 @@ void cfg80211_conn_work(struct work_struct *work) struct cfg80211_registered_device *rdev = container_of(work, struct cfg80211_registered_device, conn_work); struct wireless_dev *wdev; + u8 bssid[ETH_ALEN]; rtnl_lock(); cfg80211_lock_rdev(rdev); @@ -173,10 +181,10 @@ void cfg80211_conn_work(struct work_struct *work) wdev_unlock(wdev); continue; } + memcpy(bssid, wdev->conn->params.bssid, ETH_ALEN); if (cfg80211_conn_do_work(wdev)) __cfg80211_connect_result( - wdev->netdev, - wdev->conn->params.bssid, + wdev->netdev, bssid, NULL, 0, NULL, 0, WLAN_STATUS_UNSPECIFIED_FAILURE, false, NULL); @@ -337,6 +345,15 @@ bool cfg80211_sme_failed_reassoc(struct wireless_dev *wdev) return true; } +void cfg80211_sme_failed_assoc(struct wireless_dev *wdev) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); + + wdev->conn->state = CFG80211_CONN_DEAUTH_ASSOC_FAIL; + schedule_work(&rdev->conn_work); +} + void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *req_ie, size_t req_ie_len, const u8 *resp_ie, size_t resp_ie_len, -- cgit v1.2.2 From f99288d1761fb3b0decb0fdc4d746406addd29d5 Mon Sep 17 00:00:00 2001 From: Andrey Yurovsky Date: Tue, 20 Oct 2009 12:17:34 -0700 Subject: mac80211: trivial: fix spelling in mesh_hwmp Fix a typo in the description of hwmp_route_info_get(), no function changes. Signed-off-by: Andrey Yurovsky Signed-off-by: John W. Linville --- net/mac80211/mesh_hwmp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index e12a786e26b8..29b82e98effa 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -259,7 +259,7 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local, * @hwmp_ie: hwmp information element (PREP or PREQ) * * This function updates the path routing information to the originator and the - * transmitter of a HWMP PREQ or PREP fram. + * transmitter of a HWMP PREQ or PREP frame. * * Returns: metric to frame originator or 0 if the frame should not be further * processed -- cgit v1.2.2 From 9b1ce526eb917c8b5c8497c327768130ee683392 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Smedman?= Date: Sat, 24 Oct 2009 20:55:09 +0200 Subject: mac80211: fix for incorrect sequence number on hostapd injected frames MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When hostapd injects a frame, e.g. an authentication or association response, mac80211 looks for a suitable access point virtual interface to associate the frame with based on its source address. This makes it possible e.g. to correctly assign sequence numbers to the frames. A small typo in the ethernet address comparison statement caused a failure to find a suitable ap interface. Sequence numbers on such frames where therefore left unassigned causing some clients (especially windows-based 11b/g clients) to reject them and fail to authenticate or associate with the access point. This patch fixes the typo in the address comparison statement. Signed-off-by: Björn Smedman Reviewed-by: Johannes Berg Cc: stable@kernel.org Signed-off-by: John W. Linville --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index db4bda681ec9..eaa4118de988 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1445,7 +1445,7 @@ static void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, if (tmp_sdata->vif.type != NL80211_IFTYPE_AP) continue; if (compare_ether_addr(tmp_sdata->dev->dev_addr, - hdr->addr2)) { + hdr->addr2) == 0) { dev_hold(tmp_sdata->dev); dev_put(sdata->dev); sdata = tmp_sdata; -- cgit v1.2.2 From 55888dfb6ba7e318bb3d6a44d25009906206bf6a Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 28 Oct 2009 08:59:47 +0000 Subject: AF_RAW: Augment raw_send_hdrinc to expand skb to fit iphdr->ihl (v2) Augment raw_send_hdrinc to correct for incorrect ip header length values A series of oopses was reported to me recently. Apparently when using AF_RAW sockets to send data to peers that were reachable via ipsec encapsulation, people could panic or BUG halt their systems. I've tracked the problem down to user space sending an invalid ip header over an AF_RAW socket with IP_HDRINCL set to 1. Basically what happens is that userspace sends down an ip frame that includes only the header (no data), but sets the ip header ihl value to a large number, one that is larger than the total amount of data passed to the sendmsg call. In raw_send_hdrincl, we allocate an skb based on the size of the data in the msghdr that was passed in, but assume the data is all valid. Later during ipsec encapsulation, xfrm4_tranport_output moves the entire frame back in the skbuff to provide headroom for the ipsec headers. During this operation, the skb->transport_header is repointed to a spot computed by skb->network_header + the ip header length (ihl). Since so little data was passed in relative to the value of ihl provided by the raw socket, we point transport header to an unknown location, resulting in various crashes. This fix for this is pretty straightforward, simply validate the value of of iph->ihl when sending over a raw socket. If (iph->ihl*4U) > user data buffer size, drop the frame and return -EINVAL. I just confirmed this fixes the reported crashes. Signed-off-by: Neil Horman Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/raw.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 757c9171e7c2..ab996f9c0fe0 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -352,13 +352,24 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, skb->ip_summed = CHECKSUM_NONE; skb->transport_header = skb->network_header; - err = memcpy_fromiovecend((void *)iph, from, 0, length); - if (err) - goto error_fault; + err = -EFAULT; + if (memcpy_fromiovecend((void *)iph, from, 0, length)) + goto error_free; - /* We don't modify invalid header */ iphlen = iph->ihl * 4; - if (iphlen >= sizeof(*iph) && iphlen <= length) { + + /* + * We don't want to modify the ip header, but we do need to + * be sure that it won't cause problems later along the network + * stack. Specifically we want to make sure that iph->ihl is a + * sane value. If ihl points beyond the length of the buffer passed + * in, reject the frame as invalid + */ + err = -EINVAL; + if (iphlen > length) + goto error_free; + + if (iphlen >= sizeof(*iph)) { if (!iph->saddr) iph->saddr = rt->rt_src; iph->check = 0; @@ -381,8 +392,7 @@ static int raw_send_hdrinc(struct sock *sk, void *from, size_t length, out: return 0; -error_fault: - err = -EFAULT; +error_free: kfree_skb(skb); error: IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS); -- cgit v1.2.2 From b5dd884e682cae6b8c037f9d11f3b623b4cf2011 Mon Sep 17 00:00:00 2001 From: Gabor Gombas Date: Thu, 29 Oct 2009 03:19:11 -0700 Subject: net: Fix 'Re: PACKET_TX_RING: packet size is too long' Currently PACKET_TX_RING forces certain amount of every frame to remain unused. This probably originates from an early version of the PACKET_TX_RING patch that in fact used the extra space when the (since removed) CONFIG_PACKET_MMAP_ZERO_COPY option was enabled. The current code does not make any use of this extra space. This patch removes the extra space reservation and lets userspace make use of the full frame size. Signed-off-by: Gabor Gombas Signed-off-by: David S. Miller --- net/packet/af_packet.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 1238949e66a9..48b18dad6763 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -982,10 +982,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) goto out_put; size_max = po->tx_ring.frame_size - - sizeof(struct skb_shared_info) - - po->tp_hdrlen - - LL_ALLOCATED_SPACE(dev) - - sizeof(struct sockaddr_ll); + - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); if (size_max > dev->mtu + reserve) size_max = dev->mtu + reserve; -- cgit v1.2.2 From b0c110ca8e89f2c9cd52ec7fb1b98c5b7aa78496 Mon Sep 17 00:00:00 2001 From: jamal Date: Sun, 18 Oct 2009 02:12:33 +0000 Subject: net: Fix RPF to work with policy routing Policy routing is not looked up by mark on reverse path filtering. This fixes it. Signed-off-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 5 ++++- net/ipv4/route.c | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index e2f950592566..aa00398be80e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -229,14 +229,17 @@ unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, */ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, - struct net_device *dev, __be32 *spec_dst, u32 *itag) + struct net_device *dev, __be32 *spec_dst, + u32 *itag, u32 mark) { struct in_device *in_dev; struct flowi fl = { .nl_u = { .ip4_u = { .daddr = src, .saddr = dst, .tos = tos } }, + .mark = mark, .iif = oif }; + struct fib_result res; int no_addr, rpf; int ret; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bb4199252026..5b1050a5d874 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1854,7 +1854,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else if (fib_validate_source(saddr, 0, tos, 0, - dev, &spec_dst, &itag) < 0) + dev, &spec_dst, &itag, 0) < 0) goto e_inval; rth = dst_alloc(&ipv4_dst_ops); @@ -1967,7 +1967,7 @@ static int __mkroute_input(struct sk_buff *skb, err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag); + in_dev->dev, &spec_dst, &itag, skb->mark); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -2141,7 +2141,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, int result; result = fib_validate_source(saddr, daddr, tos, net->loopback_dev->ifindex, - dev, &spec_dst, &itag); + dev, &spec_dst, &itag, skb->mark); if (result < 0) goto martian_source; if (result) @@ -2170,7 +2170,7 @@ brd_input: spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag); + &itag, skb->mark); if (err < 0) goto martian_source; if (err) -- cgit v1.2.2 From 9d410c796067686b1e032d54ce475b7055537138 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 30 Oct 2009 05:03:53 +0000 Subject: net: fix sk_forward_alloc corruption On UDP sockets, we must call skb_free_datagram() with socket locked, or risk sk_forward_alloc corruption. This requirement is not respected in SUNRPC. Add a convenient helper, skb_free_datagram_locked() and use it in SUNRPC Reported-by: Francis Moreau Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/datagram.c | 10 +++++++++- net/ipv4/udp.c | 4 +--- net/ipv6/udp.c | 4 +--- net/sunrpc/svcsock.c | 10 +++++----- 4 files changed, 16 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/datagram.c b/net/core/datagram.c index 1c6cf3a1a4f6..4ade3011bb3c 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -224,6 +224,15 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) consume_skb(skb); sk_mem_reclaim_partial(sk); } +EXPORT_SYMBOL(skb_free_datagram); + +void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) +{ + lock_sock(sk); + skb_free_datagram(sk, skb); + release_sock(sk); +} +EXPORT_SYMBOL(skb_free_datagram_locked); /** * skb_kill_datagram - Free a datagram skbuff forcibly @@ -752,5 +761,4 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, EXPORT_SYMBOL(datagram_poll); EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); EXPORT_SYMBOL(skb_copy_datagram_iovec); -EXPORT_SYMBOL(skb_free_datagram); EXPORT_SYMBOL(skb_recv_datagram); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d0d436d6216c..0fa9f70e4b19 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -999,9 +999,7 @@ try_again: err = ulen; out_free: - lock_sock(sk); - skb_free_datagram(sk, skb); - release_sock(sk); + skb_free_datagram_locked(sk, skb); out: return err; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 3a60f12b34ed..cf538ed5ef6a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -288,9 +288,7 @@ try_again: err = ulen; out_free: - lock_sock(sk); - skb_free_datagram(sk, skb); - release_sock(sk); + skb_free_datagram_locked(sk, skb); out: return err; diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index ccc5e83cae5d..1c246a4f491e 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -111,7 +111,7 @@ static void svc_release_skb(struct svc_rqst *rqstp) rqstp->rq_xprt_ctxt = NULL; dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); - skb_free_datagram(svsk->sk_sk, skb); + skb_free_datagram_locked(svsk->sk_sk, skb); } } @@ -578,7 +578,7 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) "svc: received unknown control message %d/%d; " "dropping RPC reply datagram\n", cmh->cmsg_level, cmh->cmsg_type); - skb_free_datagram(svsk->sk_sk, skb); + skb_free_datagram_locked(svsk->sk_sk, skb); return 0; } @@ -588,18 +588,18 @@ static int svc_udp_recvfrom(struct svc_rqst *rqstp) if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { local_bh_enable(); /* checksum error */ - skb_free_datagram(svsk->sk_sk, skb); + skb_free_datagram_locked(svsk->sk_sk, skb); return 0; } local_bh_enable(); - skb_free_datagram(svsk->sk_sk, skb); + skb_free_datagram_locked(svsk->sk_sk, skb); } else { /* we can use it in-place */ rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); rqstp->rq_arg.head[0].iov_len = len; if (skb_checksum_complete(skb)) { - skb_free_datagram(svsk->sk_sk, skb); + skb_free_datagram_locked(svsk->sk_sk, skb); return 0; } rqstp->rq_xprt_ctxt = skb; -- cgit v1.2.2 From 2e9526b352061ee0fd2a1580a2e3a5af960dabc4 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 30 Oct 2009 05:51:48 +0000 Subject: gre: Fix dev_addr clobbering for gretap Nathan Neulinger noticed that gretap devices get their MAC address from the local IP address, which results in invalid MAC addresses half of the time. This is because gretap is still using the tunnel netdev ops rather than the correct tap netdev ops struct. This patch also fixes changelink to not clobber the MAC address for the gretap case. Signed-off-by: Herbert Xu Acked-by: Stephen Hemminger Tested-by: Nathan Neulinger Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 41ada9904d31..143333852624 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1464,7 +1464,7 @@ static void ipgre_tap_setup(struct net_device *dev) ether_setup(dev); - dev->netdev_ops = &ipgre_netdev_ops; + dev->netdev_ops = &ipgre_tap_netdev_ops; dev->destructor = free_netdev; dev->iflink = 0; @@ -1525,25 +1525,29 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], if (t->dev != dev) return -EEXIST; } else { - unsigned nflags = 0; - t = nt; - if (ipv4_is_multicast(p.iph.daddr)) - nflags = IFF_BROADCAST; - else if (p.iph.daddr) - nflags = IFF_POINTOPOINT; + if (dev->type != ARPHRD_ETHER) { + unsigned nflags = 0; - if ((dev->flags ^ nflags) & - (IFF_POINTOPOINT | IFF_BROADCAST)) - return -EINVAL; + if (ipv4_is_multicast(p.iph.daddr)) + nflags = IFF_BROADCAST; + else if (p.iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags ^ nflags) & + (IFF_POINTOPOINT | IFF_BROADCAST)) + return -EINVAL; + } ipgre_tunnel_unlink(ign, t); t->parms.iph.saddr = p.iph.saddr; t->parms.iph.daddr = p.iph.daddr; t->parms.i_key = p.i_key; - memcpy(dev->dev_addr, &p.iph.saddr, 4); - memcpy(dev->broadcast, &p.iph.daddr, 4); + if (dev->type != ARPHRD_ETHER) { + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + } ipgre_tunnel_link(ign, t); netdev_state_change(dev); } -- cgit v1.2.2 From f446d10f214091408b7300f15c9adf60569edf28 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 28 Oct 2009 15:12:32 +0100 Subject: mac80211: fix BSS leak The IBSS code leaks a BSS struct after telling cfg80211 about a given BSS by passing a frame. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/ibss.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index ca8ecce31d34..f1362f32c17d 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -73,6 +73,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt; u8 *pos; struct ieee80211_supported_band *sband; + struct cfg80211_bss *bss; u32 bss_change; u8 supp_rates[IEEE80211_MAX_SUPP_RATES]; @@ -177,8 +178,9 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, mod_timer(&ifibss->timer, round_jiffies(jiffies + IEEE80211_IBSS_MERGE_INTERVAL)); - cfg80211_inform_bss_frame(local->hw.wiphy, local->hw.conf.channel, - mgmt, skb->len, 0, GFP_KERNEL); + bss = cfg80211_inform_bss_frame(local->hw.wiphy, local->hw.conf.channel, + mgmt, skb->len, 0, GFP_KERNEL); + cfg80211_put_bss(bss); cfg80211_ibss_joined(sdata->dev, ifibss->bssid, GFP_KERNEL); } -- cgit v1.2.2 From 2171abc58644e09dbba546d91366b12743115396 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 29 Oct 2009 08:34:00 +0100 Subject: mac80211: fix addba timer The addba timer function acquires the sta spinlock, but at the same time we try to del_timer_sync() it under the spinlock which can produce deadlocks. To fix this, always del_timer_sync() the timer in ieee80211_process_addba_resp() and add it again after checking the conditions, if necessary. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/agg-tx.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index bd765f30dba2..b09948ceec4a 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -666,26 +666,25 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, state = &sta->ampdu_mlme.tid_state_tx[tid]; + del_timer_sync(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer); + spin_lock_bh(&sta->lock); - if (!(*state & HT_ADDBA_REQUESTED_MSK)) { - spin_unlock_bh(&sta->lock); - return; - } + if (!(*state & HT_ADDBA_REQUESTED_MSK)) + goto timer_still_needed; if (mgmt->u.action.u.addba_resp.dialog_token != sta->ampdu_mlme.tid_tx[tid]->dialog_token) { - spin_unlock_bh(&sta->lock); #ifdef CONFIG_MAC80211_HT_DEBUG printk(KERN_DEBUG "wrong addBA response token, tid %d\n", tid); #endif /* CONFIG_MAC80211_HT_DEBUG */ - return; + goto timer_still_needed; } - del_timer_sync(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer); #ifdef CONFIG_MAC80211_HT_DEBUG printk(KERN_DEBUG "switched off addBA timer for tid %d \n", tid); #endif /* CONFIG_MAC80211_HT_DEBUG */ + if (le16_to_cpu(mgmt->u.action.u.addba_resp.status) == WLAN_STATUS_SUCCESS) { u8 curstate = *state; @@ -699,5 +698,11 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, } else { ___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR); } + + goto out; + + timer_still_needed: + add_timer(&sta->ampdu_mlme.tid_tx[tid]->addba_resp_timer); + out: spin_unlock_bh(&sta->lock); } -- cgit v1.2.2 From 372362ade2fe5c33d749e017f1c5bc8140769a3e Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 29 Oct 2009 10:09:28 +0100 Subject: mac80211: fix reason code output endianness When HT debugging is enabled and we receive a DelBA frame we print out the reason code in the wrong byte order. Fix that so we don't get weird values printed. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/ht.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 0891bfb06996..48ef1a282b91 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -153,7 +153,7 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, if (net_ratelimit()) printk(KERN_DEBUG "delba from %pM (%s) tid %d reason code %d\n", mgmt->sa, initiator ? "initiator" : "recipient", tid, - mgmt->u.action.u.delba.reason_code); + le16_to_cpu(mgmt->u.action.u.delba.reason_code)); #endif /* CONFIG_MAC80211_HT_DEBUG */ if (initiator == WLAN_BACK_INITIATOR) -- cgit v1.2.2 From 3e2796a90cf349527e50b3bc4d0b2f4019b1ce7a Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Mon, 2 Nov 2009 08:39:28 -0600 Subject: 9p: fix readdir corner cases The patch below also addresses a couple of other corner cases in readdir seen with a large (e.g. 64k) msize. I'm not sure what people think of my co-opting of fid->aux here. I'd be happy to rework if there's a better way. When the size of the user supplied buffer passed to readdir is smaller than the data returned in one go by the 9P read request, v9fs_dir_readdir() currently discards extra data so that, on the next call, a 9P read request will be issued with offset < previous offset + bytes returned, which voilates the constraint described in paragraph 3 of read(5) description. This patch preseves the leftover data in fid->aux for use in the next call. Signed-off-by: Jim Garlick Signed-off-by: Eric Van Hensbergen --- net/9p/client.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index 5bf5f227dbe0..8af95b2dddd6 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -582,11 +582,9 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) memset(&fid->qid, 0, sizeof(struct p9_qid)); fid->mode = -1; - fid->rdir_fpos = 0; fid->uid = current_fsuid(); fid->clnt = clnt; - fid->aux = NULL; - + fid->rdir = NULL; spin_lock_irqsave(&clnt->lock, flags); list_add(&fid->flist, &clnt->fidlist); spin_unlock_irqrestore(&clnt->lock, flags); @@ -609,6 +607,7 @@ static void p9_fid_destroy(struct p9_fid *fid) spin_lock_irqsave(&clnt->lock, flags); list_del(&fid->flist); spin_unlock_irqrestore(&clnt->lock, flags); + kfree(fid->rdir); kfree(fid); } -- cgit v1.2.2 From 7400f42e9d765fa0656b432f3ab1245f9710f190 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sat, 31 Oct 2009 07:40:37 +0100 Subject: cfg80211: fix NULL ptr deref commit 211a4d12abf86fe0df4cd68fc6327cbb58f56f81 Author: Johannes Berg Date: Tue Oct 20 15:08:53 2009 +0900 cfg80211: sme: deauthenticate on assoc failure introduced a potential NULL pointer dereference that some people have been hitting for some reason -- the params.bssid pointer is not guaranteed to be non-NULL for what seems to be a race between various ways of reaching the same thing. While I'm trying to analyse the problem more let's first fix the crash. I think the real fix may be to avoid doing _anything_ if it ended up being NULL, but right now I'm not sure yet. I think http://bugzilla.kernel.org/show_bug.cgi?id=14342 might also be this issue. Reported-by: Parag Warudkar Tested-by: Parag Warudkar Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/sme.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/sme.c b/net/wireless/sme.c index ece378d531ef..9f0b2800a9d7 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -165,7 +165,7 @@ void cfg80211_conn_work(struct work_struct *work) struct cfg80211_registered_device *rdev = container_of(work, struct cfg80211_registered_device, conn_work); struct wireless_dev *wdev; - u8 bssid[ETH_ALEN]; + u8 bssid_buf[ETH_ALEN], *bssid = NULL; rtnl_lock(); cfg80211_lock_rdev(rdev); @@ -181,7 +181,10 @@ void cfg80211_conn_work(struct work_struct *work) wdev_unlock(wdev); continue; } - memcpy(bssid, wdev->conn->params.bssid, ETH_ALEN); + if (wdev->conn->params.bssid) { + memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN); + bssid = bssid_buf; + } if (cfg80211_conn_do_work(wdev)) __cfg80211_connect_result( wdev->netdev, bssid, -- cgit v1.2.2 From c1f9a764cf47686b1f5a0cf87ada68d90056136a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 1 Nov 2009 19:25:40 +0100 Subject: mac80211: check interface is down before type change For some strange reason the netif_running() check ended up after the actual type change instead of before, potentially causing all kinds of problems if the interface is up while changing the type; one of the problems manifests itself as a warning: WARNING: at net/mac80211/iface.c:651 ieee80211_teardown_sdata+0xda/0x1a0 [mac80211]() Hardware name: Aspire one Pid: 2596, comm: wpa_supplicant Tainted: G W 2.6.31-10-generic #32-Ubuntu Call Trace: [] warn_slowpath_common+0x6d/0xa0 [] warn_slowpath_null+0x15/0x20 [] ieee80211_teardown_sdata+0xda/0x1a0 [mac80211] [] ieee80211_if_change_type+0x4a/0xc0 [mac80211] [] ieee80211_change_iface+0x61/0xa0 [mac80211] [] cfg80211_wext_siwmode+0xc7/0x120 [cfg80211] [] ioctl_standard_call+0x58/0xf0 (http://www.kerneloops.org/searchweek.php?search=ieee80211_teardown_sdata) Cc: Arjan van de Ven Cc: stable@kernel.org Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 5608f6c68413..7b5131bd6fa1 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -72,6 +72,9 @@ static int ieee80211_change_iface(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata; int ret; + if (netif_running(dev)) + return -EBUSY; + if (!nl80211_type_check(type)) return -EINVAL; @@ -81,9 +84,6 @@ static int ieee80211_change_iface(struct wiphy *wiphy, if (ret) return ret; - if (netif_running(sdata->dev)) - return -EBUSY; - if (ieee80211_vif_is_mesh(&sdata->vif) && params->mesh_id_len) ieee80211_sdata_set_mesh_id(sdata, params->mesh_id_len, -- cgit v1.2.2 From 1056bd51674e529813213186471bb4ac6689a755 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 5 Nov 2009 20:46:52 -0800 Subject: bridge: prevent bridging wrong device The bridge code assumes ethernet addressing, so be more strict in the what is allowed. This showed up when GRE had a bug and was not using correct address format. Add some more comments for increased clarity. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_if.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index b1b3b0fbf41c..4a9f52732655 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -377,12 +377,16 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) struct net_bridge_port *p; int err = 0; - if (dev->flags & IFF_LOOPBACK || dev->type != ARPHRD_ETHER) + /* Don't allow bridging non-ethernet like devices */ + if ((dev->flags & IFF_LOOPBACK) || + dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN) return -EINVAL; + /* No bridging of bridges */ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) return -ELOOP; + /* Device is already being bridged */ if (dev->br_port != NULL) return -EBUSY; -- cgit v1.2.2 From b4ec824021493ba6cb7eeb61572f4d2f8a80a52e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 5 Nov 2009 20:56:07 -0800 Subject: rose: device refcount leak While hunting dev_put() for net-next-2.6, I found a device refcount leak in ROSE, ioctl(SIOCADDRT) error path. Fix is to not touch device refcount, as we hold RTNL Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/rose/rose_route.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index 9478d9b3d977..f3e21989b88c 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -578,18 +578,18 @@ static int rose_clear_routes(void) /* * Check that the device given is a valid AX.25 interface that is "up". + * called whith RTNL */ -static struct net_device *rose_ax25_dev_get(char *devname) +static struct net_device *rose_ax25_dev_find(char *devname) { struct net_device *dev; - if ((dev = dev_get_by_name(&init_net, devname)) == NULL) + if ((dev = __dev_get_by_name(&init_net, devname)) == NULL) return NULL; if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25) return dev; - dev_put(dev); return NULL; } @@ -720,27 +720,23 @@ int rose_rt_ioctl(unsigned int cmd, void __user *arg) case SIOCADDRT: if (copy_from_user(&rose_route, arg, sizeof(struct rose_route_struct))) return -EFAULT; - if ((dev = rose_ax25_dev_get(rose_route.device)) == NULL) + if ((dev = rose_ax25_dev_find(rose_route.device)) == NULL) return -EINVAL; - if (rose_dev_exists(&rose_route.address)) { /* Can't add routes to ourself */ - dev_put(dev); + if (rose_dev_exists(&rose_route.address)) /* Can't add routes to ourself */ return -EINVAL; - } if (rose_route.mask > 10) /* Mask can't be more than 10 digits */ return -EINVAL; if (rose_route.ndigis > AX25_MAX_DIGIS) return -EINVAL; err = rose_add_node(&rose_route, dev); - dev_put(dev); return err; case SIOCDELRT: if (copy_from_user(&rose_route, arg, sizeof(struct rose_route_struct))) return -EFAULT; - if ((dev = rose_ax25_dev_get(rose_route.device)) == NULL) + if ((dev = rose_ax25_dev_find(rose_route.device)) == NULL) return -EINVAL; err = rose_del_node(&rose_route, dev); - dev_put(dev); return err; case SIOCRSCLRRT: -- cgit v1.2.2 From f9dd09c7f7199685601d75882447a6598be8a3e0 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Fri, 6 Nov 2009 00:43:42 -0800 Subject: netfilter: nf_nat: fix NAT issue in 2.6.30.4+ Vitezslav Samel discovered that since 2.6.30.4+ active FTP can not work over NAT. The "cause" of the problem was a fix of unacknowledged data detection with NAT (commit a3a9f79e361e864f0e9d75ebe2a0cb43d17c4272). However, actually, that fix uncovered a long standing bug in TCP conntrack: when NAT was enabled, we simply updated the max of the right edge of the segments we have seen (td_end), by the offset NAT produced with changing IP/port in the data. However, we did not update the other parameter (td_maxend) which is affected by the NAT offset. Thus that could drift away from the correct value and thus resulted breaking active FTP. The patch below fixes the issue by *not* updating the conntrack parameters from NAT, but instead taking into account the NAT offsets in conntrack in a consistent way. (Updating from NAT would be more harder and expensive because it'd need to re-calculate parameters we already calculated in conntrack.) Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/nf_nat_core.c | 3 ++ net/ipv4/netfilter/nf_nat_helper.c | 34 ++++++++++++------ net/netfilter/nf_conntrack_core.c | 8 +++++ net/netfilter/nf_conntrack_proto_tcp.c | 64 ++++++++++++++-------------------- 4 files changed, 60 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 68afc6ecd343..fe1a64479dd0 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -750,6 +750,8 @@ static int __init nf_nat_init(void) BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, nfnetlink_parse_nat_setup); + BUG_ON(nf_ct_nat_offset != NULL); + rcu_assign_pointer(nf_ct_nat_offset, nf_nat_get_offset); return 0; cleanup_extend: @@ -764,6 +766,7 @@ static void __exit nf_nat_cleanup(void) nf_ct_extend_unregister(&nat_extend); rcu_assign_pointer(nf_nat_seq_adjust_hook, NULL); rcu_assign_pointer(nfnetlink_parse_nat_setup_hook, NULL); + rcu_assign_pointer(nf_ct_nat_offset, NULL); synchronize_net(); } diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 09172a65d9b6..f9520fa3aba9 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -73,6 +73,28 @@ adjust_tcp_sequence(u32 seq, DUMP_OFFSET(this_way); } +/* Get the offset value, for conntrack */ +s16 nf_nat_get_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + struct nf_conn_nat *nat = nfct_nat(ct); + struct nf_nat_seq *this_way; + s16 offset; + + if (!nat) + return 0; + + this_way = &nat->seq[dir]; + spin_lock_bh(&nf_nat_seqofs_lock); + offset = after(seq, this_way->correction_pos) + ? this_way->offset_after : this_way->offset_before; + spin_unlock_bh(&nf_nat_seqofs_lock); + + return offset; +} +EXPORT_SYMBOL_GPL(nf_nat_get_offset); + /* Frobs data inside this packet, which is linear. */ static void mangle_contents(struct sk_buff *skb, unsigned int dataoff, @@ -189,11 +211,6 @@ nf_nat_mangle_tcp_packet(struct sk_buff *skb, adjust_tcp_sequence(ntohl(tcph->seq), (int)rep_len - (int)match_len, ct, ctinfo); - /* Tell TCP window tracking about seq change */ - nf_conntrack_tcp_update(skb, ip_hdrlen(skb), - ct, CTINFO2DIR(ctinfo), - (int)rep_len - (int)match_len); - nf_conntrack_event_cache(IPCT_NATSEQADJ, ct); } return 1; @@ -415,12 +432,7 @@ nf_nat_seq_adjust(struct sk_buff *skb, tcph->seq = newseq; tcph->ack_seq = newack; - if (!nf_nat_sack_adjust(skb, tcph, ct, ctinfo)) - return 0; - - nf_conntrack_tcp_update(skb, ip_hdrlen(skb), ct, dir, seqoff); - - return 1; + return nf_nat_sack_adjust(skb, tcph, ct, ctinfo); } /* Setup NAT on this expected conntrack so it follows master. */ diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 7c9ec3dee96e..0cdfb388a191 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1350,6 +1350,11 @@ err_stat: return ret; } +s16 (*nf_ct_nat_offset)(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq); +EXPORT_SYMBOL_GPL(nf_ct_nat_offset); + int nf_conntrack_init(struct net *net) { int ret; @@ -1367,6 +1372,9 @@ int nf_conntrack_init(struct net *net) /* For use by REJECT target */ rcu_assign_pointer(ip_ct_attach, nf_conntrack_attach); rcu_assign_pointer(nf_ct_destroy, destroy_conntrack); + + /* Howto get NAT offsets */ + rcu_assign_pointer(nf_ct_nat_offset, NULL); } return 0; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 97a82ba75376..ba2b76937283 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -492,6 +492,21 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } +#ifdef CONFIG_NF_NAT_NEEDED +static inline s16 nat_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset); + + return get_offset != NULL ? get_offset(ct, dir, seq) : 0; +} +#define NAT_OFFSET(pf, ct, dir, seq) \ + (pf == NFPROTO_IPV4 ? nat_offset(ct, dir, seq) : 0) +#else +#define NAT_OFFSET(pf, ct, dir, seq) 0 +#endif + static bool tcp_in_window(const struct nf_conn *ct, struct ip_ct_tcp *state, enum ip_conntrack_dir dir, @@ -506,6 +521,7 @@ static bool tcp_in_window(const struct nf_conn *ct, struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; + s16 receiver_offset; bool res; /* @@ -519,11 +535,16 @@ static bool tcp_in_window(const struct nf_conn *ct, if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) tcp_sack(skb, dataoff, tcph, &sack); + /* Take into account NAT sequence number mangling */ + receiver_offset = NAT_OFFSET(pf, ct, !dir, ack - 1); + ack -= receiver_offset; + sack -= receiver_offset; + pr_debug("tcp_in_window: START\n"); pr_debug("tcp_in_window: "); nf_ct_dump_tuple(tuple); - pr_debug("seq=%u ack=%u sack=%u win=%u end=%u\n", - seq, ack, sack, win, end); + pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", + seq, ack, receiver_offset, sack, receiver_offset, win, end); pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, @@ -613,8 +634,8 @@ static bool tcp_in_window(const struct nf_conn *ct, pr_debug("tcp_in_window: "); nf_ct_dump_tuple(tuple); - pr_debug("seq=%u ack=%u sack =%u win=%u end=%u\n", - seq, ack, sack, win, end); + pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", + seq, ack, receiver_offset, sack, receiver_offset, win, end); pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, @@ -700,7 +721,7 @@ static bool tcp_in_window(const struct nf_conn *ct, before(seq, sender->td_maxend + 1) ? after(end, sender->td_end - receiver->td_maxwin - 1) ? before(sack, receiver->td_end + 1) ? - after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG" + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" : "ACK is under the lower bound (possible overly delayed ACK)" : "ACK is over the upper bound (ACKed data not seen yet)" : "SEQ is under the lower bound (already ACKed data retransmitted)" @@ -715,39 +736,6 @@ static bool tcp_in_window(const struct nf_conn *ct, return res; } -#ifdef CONFIG_NF_NAT_NEEDED -/* Update sender->td_end after NAT successfully mangled the packet */ -/* Caller must linearize skb at tcp header. */ -void nf_conntrack_tcp_update(const struct sk_buff *skb, - unsigned int dataoff, - struct nf_conn *ct, int dir, - s16 offset) -{ - const struct tcphdr *tcph = (const void *)skb->data + dataoff; - const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[dir]; - const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[!dir]; - __u32 end; - - end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph); - - spin_lock_bh(&ct->lock); - /* - * We have to worry for the ack in the reply packet only... - */ - if (ct->proto.tcp.seen[dir].td_end + offset == end) - ct->proto.tcp.seen[dir].td_end = end; - ct->proto.tcp.last_end = end; - spin_unlock_bh(&ct->lock); - pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); -} -EXPORT_SYMBOL_GPL(nf_conntrack_tcp_update); -#endif - #define TH_FIN 0x01 #define TH_SYN 0x02 #define TH_RST 0x04 -- cgit v1.2.2 From 887e671f324d9898aaedb29a6ece6c853c394067 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Nov 2009 00:50:39 -0800 Subject: decnet: netdevice refcount leak While working on device refcount stuff, I found a device refcount leak through DECNET. This nasty bug can be used to hold refcounts on any !DECNET netdevice. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/decnet/sysctl_net_decnet.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 26b0ab1e9f56..2036568beea9 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -263,11 +263,10 @@ static int dn_def_dev_strategy(ctl_table *table, return -ENODEV; rv = -ENODEV; - if (dev->dn_ptr != NULL) { + if (dev->dn_ptr != NULL) rv = dn_dev_set_default(dev, 1); - if (rv) - dev_put(dev); - } + if (rv) + dev_put(dev); } return rv; -- cgit v1.2.2 From 539054a8fa5141c9a4e9ac6a86d249e3f2bdef45 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Fri, 6 Nov 2009 18:08:32 -0800 Subject: netfilter: xt_connlimit: fix regression caused by zero family value Commit v2.6.28-rc1~717^2~109^2~2 was slightly incomplete; not all instances of par->match->family were changed to par->family. References: http://bugzilla.netfilter.org/show_bug.cgi?id=610 Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/xt_connlimit.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 680980954395..38f03f75a636 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -103,7 +103,7 @@ static int count_them(struct xt_connlimit_data *data, const struct nf_conntrack_tuple *tuple, const union nf_inet_addr *addr, const union nf_inet_addr *mask, - const struct xt_match *match) + u_int8_t family) { const struct nf_conntrack_tuple_hash *found; struct xt_connlimit_conn *conn; @@ -113,8 +113,7 @@ static int count_them(struct xt_connlimit_data *data, bool addit = true; int matches = 0; - - if (match->family == NFPROTO_IPV6) + if (family == NFPROTO_IPV6) hash = &data->iphash[connlimit_iphash6(addr, mask)]; else hash = &data->iphash[connlimit_iphash(addr->ip & mask->ip)]; @@ -157,8 +156,7 @@ static int count_them(struct xt_connlimit_data *data, continue; } - if (same_source_net(addr, mask, &conn->tuple.src.u3, - match->family)) + if (same_source_net(addr, mask, &conn->tuple.src.u3, family)) /* same source network -> be counted! */ ++matches; nf_ct_put(found_ct); @@ -207,7 +205,7 @@ connlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par) spin_lock_bh(&info->data->lock); connections = count_them(info->data, tuple_ptr, &addr, - &info->mask, par->match); + &info->mask, par->family); spin_unlock_bh(&info->data->lock); if (connections < 0) { -- cgit v1.2.2 From 23ca0c989e46924393f1d54bec84801d035dd28e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 6 Nov 2009 10:37:41 +0000 Subject: ipip: Fix handling of DF packets when pmtudisc is OFF RFC 2003 requires the outer header to have DF set if DF is set on the inner header, even when PMTU discovery is off for the tunnel. Our implementation does exactly that. For this to work properly the IPIP gateway also needs to engate in PMTU when the inner DF bit is set. As otherwise the original host would not be able to carry out its PMTU successfully since part of the path is only visible to the gateway. Unfortunately when the tunnel PMTU discovery setting is off, we do not collect the necessary soft state, resulting in blackholes when the original host tries to perform PMTU discovery. This problem is not reproducible on the IPIP gateway itself as the inner packet usually has skb->local_df set. This is not correctly cleared (an unrelated bug) when the packet passes through the tunnel, which allows fragmentation to occur. For hosts behind the IPIP gateway it is readily visible with a simple ping. This patch fixes the problem by performing PMTU discovery for all packets with the inner DF bit set, regardless of the PMTU discovery setting on the tunnel itself. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 08ccd344de7a..ae40ed1ba560 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -438,25 +438,27 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) goto tx_error; } - if (tiph->frag_off) + df |= old_iph->frag_off & htons(IP_DF); + + if (df) { mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); - else - mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu; - if (mtu < 68) { - stats->collisions++; - ip_rt_put(rt); - goto tx_error; - } - if (skb_dst(skb)) - skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); + if (mtu < 68) { + stats->collisions++; + ip_rt_put(rt); + goto tx_error; + } - df |= (old_iph->frag_off&htons(IP_DF)); + if (skb_dst(skb)) + skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); - if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); - goto tx_error; + if ((old_iph->frag_off & htons(IP_DF)) && + mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } } if (tunnel->err_count > 0) { -- cgit v1.2.2 From 6755aebaaf9fc5416acfd4578ab7a1e122ecbc74 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 6 Nov 2009 00:23:01 +0000 Subject: can: should not use __dev_get_by_index() without locks bcm_proc_getifname() is called with RTNL and dev_base_lock not held. It calls __dev_get_by_index() without locks, and this is illegal (might crash) Close the race by holding dev_base_lock and copying dev->name in the protected section. Signed-off-by: Eric Dumazet Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- net/can/bcm.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/can/bcm.c b/net/can/bcm.c index 597da4f8f888..e8d58f33fe09 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -132,23 +132,27 @@ static inline struct bcm_sock *bcm_sk(const struct sock *sk) /* * procfs functions */ -static char *bcm_proc_getifname(int ifindex) +static char *bcm_proc_getifname(char *result, int ifindex) { struct net_device *dev; if (!ifindex) return "any"; - /* no usage counting */ + read_lock(&dev_base_lock); dev = __dev_get_by_index(&init_net, ifindex); if (dev) - return dev->name; + strcpy(result, dev->name); + else + strcpy(result, "???"); + read_unlock(&dev_base_lock); - return "???"; + return result; } static int bcm_proc_show(struct seq_file *m, void *v) { + char ifname[IFNAMSIZ]; struct sock *sk = (struct sock *)m->private; struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; @@ -157,7 +161,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) seq_printf(m, " / sk %p", sk); seq_printf(m, " / bo %p", bo); seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs); - seq_printf(m, " / bound %s", bcm_proc_getifname(bo->ifindex)); + seq_printf(m, " / bound %s", bcm_proc_getifname(ifname, bo->ifindex)); seq_printf(m, " <<<\n"); list_for_each_entry(op, &bo->rx_ops, list) { @@ -169,7 +173,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) continue; seq_printf(m, "rx_op: %03X %-5s ", - op->can_id, bcm_proc_getifname(op->ifindex)); + op->can_id, bcm_proc_getifname(ifname, op->ifindex)); seq_printf(m, "[%d]%c ", op->nframes, (op->flags & RX_CHECK_DLC)?'d':' '); if (op->kt_ival1.tv64) @@ -194,7 +198,8 @@ static int bcm_proc_show(struct seq_file *m, void *v) list_for_each_entry(op, &bo->tx_ops, list) { seq_printf(m, "tx_op: %03X %s [%d] ", - op->can_id, bcm_proc_getifname(op->ifindex), + op->can_id, + bcm_proc_getifname(ifname, op->ifindex), op->nframes); if (op->kt_ival1.tv64) -- cgit v1.2.2 From d792c1006fe92448217b71513d3955868358271d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Fri, 13 Nov 2009 13:56:33 -0800 Subject: tcp: provide more information on the tcp receive_queue bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The addition of rcv_nxt allows to discern whether the skb was out of place or tp->copied. Also catch fancy combination of flags if necessary (sadly we might miss the actual causer flags as it might have already returned). Btw, we perhaps would want to forward copied_seq in somewhere or otherwise we might have some nice loop with WARN stuff within but where to do that safely I don't know at this stage until more is known (but it is not made significantly worse by this patch). Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 98440ad82558..f1813bc71088 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1183,7 +1183,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) #if TCP_DEBUG struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); + WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), + KERN_INFO "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", + tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); #endif if (inet_csk_ack_scheduled(sk)) { @@ -1430,11 +1432,13 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* Now that we have two receive queues this * shouldn't happen. */ - if (before(*seq, TCP_SKB_CB(skb)->seq)) { - printk(KERN_INFO "recvmsg bug: copied %X " - "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); + if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), + KERN_INFO "recvmsg bug: copied %X " + "seq %X rcvnxt %X fl %X\n", *seq, + TCP_SKB_CB(skb)->seq, tp->rcv_nxt, + flags)) break; - } + offset = *seq - TCP_SKB_CB(skb)->seq; if (tcp_hdr(skb)->syn) offset--; @@ -1443,8 +1447,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if (tcp_hdr(skb)->fin) goto found_fin_ok; WARN(!(flags & MSG_PEEK), KERN_INFO "recvmsg bug 2: " - "copied %X seq %X\n", *seq, - TCP_SKB_CB(skb)->seq); + "copied %X seq %X rcvnxt %X fl %X\n", + *seq, TCP_SKB_CB(skb)->seq, + tp->rcv_nxt, flags); } /* Well, if we have backlog, try to process it now yet. */ -- cgit v1.2.2 From 1e360a60b24ad8f8685af66fa6de10ce46693a4b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 13 Nov 2009 10:52:55 -0500 Subject: SUNRPC: Address buffer overrun in rpc_uaddr2sockaddr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The size of buf[] must account for the string termination needed for the first strict_strtoul() call. Introduced in commit a02d6926. Fábio Olivé Leite points out that strict_strtoul() requires _either_ '\n\0' _or_ '\0' termination, so use the simpler '\0' here instead. See http://bugzilla.kernel.org/show_bug.cgi?id=14546 . Reported-by: argp@census-labs.com Signed-off-by: Chuck Lever Signed-off-by: Fábio Olivé Leite Signed-off-by: Trond Myklebust --- net/sunrpc/addr.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index 22e8fd89477f..c7450c8f0a7c 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -306,24 +306,25 @@ EXPORT_SYMBOL_GPL(rpc_sockaddr2uaddr); * @sap: buffer into which to plant socket address * @salen: size of buffer * + * @uaddr does not have to be '\0'-terminated, but strict_strtoul() and + * rpc_pton() require proper string termination to be successful. + * * Returns the size of the socket address if successful; otherwise * zero is returned. */ size_t rpc_uaddr2sockaddr(const char *uaddr, const size_t uaddr_len, struct sockaddr *sap, const size_t salen) { - char *c, buf[RPCBIND_MAXUADDRLEN]; + char *c, buf[RPCBIND_MAXUADDRLEN + sizeof('\0')]; unsigned long portlo, porthi; unsigned short port; - if (uaddr_len > sizeof(buf)) + if (uaddr_len > RPCBIND_MAXUADDRLEN) return 0; memcpy(buf, uaddr, uaddr_len); - buf[uaddr_len] = '\n'; - buf[uaddr_len + 1] = '\0'; - + buf[uaddr_len] = '\0'; c = strrchr(buf, '.'); if (unlikely(c == NULL)) return 0; @@ -332,9 +333,7 @@ size_t rpc_uaddr2sockaddr(const char *uaddr, const size_t uaddr_len, if (unlikely(portlo > 255)) return 0; - c[0] = '\n'; - c[1] = '\0'; - + *c = '\0'; c = strrchr(buf, '.'); if (unlikely(c == NULL)) return 0; @@ -345,8 +344,7 @@ size_t rpc_uaddr2sockaddr(const char *uaddr, const size_t uaddr_len, port = (unsigned short)((porthi << 8) | portlo); - c[0] = '\0'; - + *c = '\0'; if (rpc_pton(buf, strlen(buf), sap, salen) == 0) return 0; -- cgit v1.2.2 From 409b95aff3583c05ac7a9247fa3d8c9aa7f9cae3 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 10 Nov 2009 08:57:34 +0000 Subject: sctp: Set source addresses on the association before adding transports Recent commit 8da645e101a8c20c6073efda3c7cc74eec01b87f sctp: Get rid of an extra routing lookup when adding a transport introduced a regression in the connection setup. The behavior was different between IPv4 and IPv6. IPv4 case ended up working because the route lookup routing returned a NULL route, which triggered another route lookup later in the output patch that succeeded. In the IPv6 case, a valid route was returned for first call, but we could not find a valid source address at the time since the source addresses were not set on the association yet. Thus resulted in a hung connection. The solution is to set the source addresses on the association prior to adding peers. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/associola.c | 4 +--- net/sctp/sm_statefuns.c | 15 +++++++++------ net/sctp/socket.c | 22 ++++++++++++---------- 3 files changed, 22 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 8450960df24f..7eed77a39d0d 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1485,15 +1485,13 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len) * local endpoint and the remote peer. */ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, - gfp_t gfp) + sctp_scope_t scope, gfp_t gfp) { - sctp_scope_t scope; int flags; /* Use scoping rules to determine the subset of addresses from * the endpoint. */ - scope = sctp_scope(&asoc->peer.active_path->ipaddr); flags = (PF_INET6 == asoc->base.sk->sk_family) ? SCTP_ADDR6_ALLOWED : 0; if (asoc->peer.ipv4_address) flags |= SCTP_ADDR4_PEERSUPP; diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index c8fae1983dd1..d4df45022ffa 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -384,6 +384,11 @@ sctp_disposition_t sctp_sf_do_5_1B_init(const struct sctp_endpoint *ep, if (!new_asoc) goto nomem; + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, + sctp_scope(sctp_source(chunk)), + GFP_ATOMIC) < 0) + goto nomem_init; + /* The call, sctp_process_init(), can fail on memory allocation. */ if (!sctp_process_init(new_asoc, chunk->chunk_hdr->type, sctp_source(chunk), @@ -401,9 +406,6 @@ sctp_disposition_t sctp_sf_do_5_1B_init(const struct sctp_endpoint *ep, len = ntohs(err_chunk->chunk_hdr->length) - sizeof(sctp_chunkhdr_t); - if (sctp_assoc_set_bind_addr_from_ep(new_asoc, GFP_ATOMIC) < 0) - goto nomem_init; - repl = sctp_make_init_ack(new_asoc, chunk, GFP_ATOMIC, len); if (!repl) goto nomem_init; @@ -1452,6 +1454,10 @@ static sctp_disposition_t sctp_sf_do_unexpected_init( if (!new_asoc) goto nomem; + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, + sctp_scope(sctp_source(chunk)), GFP_ATOMIC) < 0) + goto nomem; + /* In the outbound INIT ACK the endpoint MUST copy its current * Verification Tag and Peers Verification tag into a reserved * place (local tie-tag and per tie-tag) within the state cookie. @@ -1488,9 +1494,6 @@ static sctp_disposition_t sctp_sf_do_unexpected_init( sizeof(sctp_chunkhdr_t); } - if (sctp_assoc_set_bind_addr_from_ep(new_asoc, GFP_ATOMIC) < 0) - goto nomem; - repl = sctp_make_init_ack(new_asoc, chunk, GFP_ATOMIC, len); if (!repl) goto nomem; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index c8d05758661d..bf705ba97231 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1080,6 +1080,13 @@ static int __sctp_connect(struct sock* sk, err = -ENOMEM; goto out_free; } + + err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, + GFP_KERNEL); + if (err < 0) { + goto out_free; + } + } /* Prime the peer's transport structures. */ @@ -1095,11 +1102,6 @@ static int __sctp_connect(struct sock* sk, walk_size += af->sockaddr_len; } - err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL); - if (err < 0) { - goto out_free; - } - /* In case the user of sctp_connectx() wants an association * id back, assign one now. */ @@ -1689,6 +1691,11 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, goto out_unlock; } asoc = new_asoc; + err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL); + if (err < 0) { + err = -ENOMEM; + goto out_free; + } /* If the SCTP_INIT ancillary data is specified, set all * the association init values accordingly. @@ -1718,11 +1725,6 @@ SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, err = -ENOMEM; goto out_free; } - err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL); - if (err < 0) { - err = -ENOMEM; - goto out_free; - } } /* ASSERT: we have a valid association at this point. */ -- cgit v1.2.2 From f9c67811ebc00a42f62f5d542d3abd36bd49ae35 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 11 Nov 2009 08:19:24 +0000 Subject: sctp: Fix regression introduced by new sctp_connectx api A new (unrealeased to the user) sctp_connectx api c6ba68a26645dbc5029a9faa5687ebe6fcfc53e4 sctp: support non-blocking version of the new sctp_connectx() API introduced a regression cought by the user regression test suite. In particular, the API requires the user library to re-allocate the buffer and could potentially trigger a SIGFAULT. This change corrects that regression by passing the original address buffer to the kernel unmodified, but still allows for a returned association id. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/socket.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index bf705ba97231..3a95fcb17a9e 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1276,22 +1276,30 @@ SCTP_STATIC int sctp_setsockopt_connectx(struct sock* sk, } /* - * New (hopefully final) interface for the API. The option buffer is used - * both for the returned association id and the addresses. + * New (hopefully final) interface for the API. + * We use the sctp_getaddrs_old structure so that use-space library + * can avoid any unnecessary allocations. The only defferent part + * is that we store the actual length of the address buffer into the + * addrs_num structure member. That way we can re-use the existing + * code. */ SCTP_STATIC int sctp_getsockopt_connectx3(struct sock* sk, int len, char __user *optval, int __user *optlen) { + struct sctp_getaddrs_old param; sctp_assoc_t assoc_id = 0; int err = 0; - if (len < sizeof(assoc_id)) + if (len < sizeof(param)) return -EINVAL; + if (copy_from_user(¶m, optval, sizeof(param))) + return -EFAULT; + err = __sctp_setsockopt_connectx(sk, - (struct sockaddr __user *)(optval + sizeof(assoc_id)), - len - sizeof(assoc_id), &assoc_id); + (struct sockaddr __user *)param.addrs, + param.addr_num, &assoc_id); if (err == 0 || err == -EINPROGRESS) { if (copy_to_user(optval, &assoc_id, sizeof(assoc_id))) -- cgit v1.2.2 From a78102e74e782914039cd8a6939532649825a2e3 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 11 Nov 2009 11:54:37 +0000 Subject: sctp: Set socket source address when additing first transport Recent commits sctp: Get rid of an extra routing lookup when adding a transport and sctp: Set source addresses on the association before adding transports changed when routes are added to the sctp transports. As such, we didn't set the socket source address correctly when adding the first transport. The first transport is always the primary/active one, so when adding it, set the socket source address. This was causing regression failures in SCTP tests. Signed-off-by: Vlad Yasevich Signed-off-by: David S. Miller --- net/sctp/transport.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/transport.c b/net/sctp/transport.c index c256e4839316..3b141bb32faf 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -308,7 +308,8 @@ void sctp_transport_route(struct sctp_transport *transport, /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). */ - if (asoc && (transport == asoc->peer.active_path)) + if (asoc && (!asoc->peer.primary_path || + (transport == asoc->peer.active_path))) opt->pf->af->to_sk_saddr(&transport->saddr, asoc->base.sk); } else -- cgit v1.2.2 From d0490cfdf440fded2c292cfb8bb9272fc9ef6943 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 11 Nov 2009 02:03:54 +0000 Subject: ipmr: missing dev_put() on error path in vif_add() The other error paths in front of this one have a dev_put() but this one got missed. Found by smatch static checker. Signed-off-by: Dan Carpenter Acked-by: Wang Chen Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 630a56df7b47..99508d66a642 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -483,8 +483,10 @@ static int vif_add(struct net *net, struct vifctl *vifc, int mrtsock) return -EINVAL; } - if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) + if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) { + dev_put(dev); return -EADDRNOTAVAIL; + } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; ip_rt_multicast_event(in_dev); -- cgit v1.2.2 From 93f19c9fc8c98bb6d2e9825115989603ffd5cd1f Mon Sep 17 00:00:00 2001 From: Andrei Emeltchenko Date: Thu, 3 Sep 2009 12:34:19 +0300 Subject: Bluetooth: Set general bonding security for ACL by default This patch fixes double pairing issues with Secure Simple Paring support. It was observed that when pairing with SSP enabled, that the confirmation will be asked twice. http://www.spinics.net/lists/linux-bluetooth/msg02473.html This also causes bug when initiating SSP connection from Windows Vista. The reason is because bluetoothd does not store link keys since HCIGETAUTHINFO returns 0. Setting default to general bonding fixes these issues. Signed-off-by: Andrei Emeltchenko Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_conn.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index a9750984f772..b7c4224f4e7d 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -211,6 +211,7 @@ struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) conn->type = type; conn->mode = HCI_CM_ACTIVE; conn->state = BT_OPEN; + conn->auth_type = HCI_AT_GENERAL_BONDING; conn->power_save = 1; conn->disc_timeout = HCI_DISCONN_TIMEOUT; -- cgit v1.2.2 From a0e55a32afc2130e5ae506755e4b8eb42a23b568 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Tue, 29 Sep 2009 01:42:23 -0300 Subject: Bluetooth: Select Basic Mode as default for SOCK_SEQPACKET The default mode for SOCK_SEQPACKET is Basic Mode. So when no mode has been specified, Basic Mode shall be used. This is important for current application to keep working as expected and not cause a regression. Signed-off-by: Gustavo F. Padovan Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 77e9fb130adb..076caa10d2f6 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -2205,7 +2205,7 @@ static int l2cap_build_conf_req(struct sock *sk, void *data) { struct l2cap_pinfo *pi = l2cap_pi(sk); struct l2cap_conf_req *req = data; - struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_ERTM }; + struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC }; void *ptr = req->data; BT_DBG("sk %p", sk); -- cgit v1.2.2 From 68ae6639b6dff117ba37f648f1611a4629abadf0 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Sat, 17 Oct 2009 21:41:01 -0300 Subject: Bluetooth: Fix regression with L2CAP configuration in Basic Mode Basic Mode is the default mode of operation of a L2CAP entity. In this case the RFC (Retransmission and Flow Control) configuration option should not be used at all. Normally remote L2CAP implementation should just ignore this option, but it can cause various side effects with other Bluetooth stacks that are not capable of handling unknown options. Signed-off-by: Gustavo F. Padovan Signed-off-by: Marcel Holtmann --- net/bluetooth/l2cap.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 076caa10d2f6..947f8bbb4bb3 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -2394,6 +2394,10 @@ done: rfc.monitor_timeout = L2CAP_DEFAULT_MONITOR_TO; pi->conf_state |= L2CAP_CONF_MODE_DONE; + + l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, + sizeof(rfc), (unsigned long) &rfc); + break; case L2CAP_MODE_STREAMING: @@ -2401,6 +2405,10 @@ done: pi->max_pdu_size = rfc.max_pdu_size; pi->conf_state |= L2CAP_CONF_MODE_DONE; + + l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, + sizeof(rfc), (unsigned long) &rfc); + break; default: @@ -2410,9 +2418,6 @@ done: rfc.mode = pi->mode; } - l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, - sizeof(rfc), (unsigned long) &rfc); - if (result == L2CAP_CONF_SUCCESS) pi->conf_state |= L2CAP_CONF_OUTPUT_DONE; } -- cgit v1.2.2 From 91e9c07bd635353d1a278bdb38dbb56ac371bcb8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 15 Nov 2009 23:30:24 +0000 Subject: net: Fix the rollback test in dev_change_name() net: Fix the rollback test in dev_change_name() In dev_change_name() an err variable is used for storing the original call_netdevice_notifiers() errno (negative) and testing for a rollback error later, but the test for non-zero is wrong, because the err might have positive value as well - from dev_alloc_name(). It means the rollback for a netdevice with a number > 0 will never happen. (The err test is reordered btw. to make it more readable.) Signed-off-by: Jarek Poplawski Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index b8f74cfb1bfd..fe10551d3671 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -942,14 +942,15 @@ rollback: ret = notifier_to_errno(ret); if (ret) { - if (err) { - printk(KERN_ERR - "%s: name change rollback failed: %d.\n", - dev->name, ret); - } else { + /* err >= 0 after dev_alloc_name() or stores the first errno */ + if (err >= 0) { err = ret; memcpy(dev->name, oldname, IFNAMSIZ); goto rollback; + } else { + printk(KERN_ERR + "%s: name change rollback failed: %d.\n", + dev->name, ret); } } -- cgit v1.2.2 From 69c0cab120a85471054614418b447349caba22d7 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 17 Nov 2009 05:18:18 -0800 Subject: gro: Fix illegal merging of trailer trash When we've merged skb's with page frags, and subsequently receive a trailer skb (< MSS) that is not completely non-linear (this can occur on Intel NICs if the packet size falls below the threshold), GRO ends up producing an illegal GSO skb with a frag_list. This is harmless unless the skb is then forwarded through an interface that requires software GSO, whereupon the GSO code will BUG. This patch detects this case in GRO and avoids merging the trailer skb. Reported-by: Mark Wagner Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 80a96166df39..ec85681a7dd8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2701,7 +2701,8 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb) NAPI_GRO_CB(skb)->free = 1; goto done; - } + } else if (skb_gro_len(p) != pinfo->gso_size) + return -E2BIG; headroom = skb_headroom(p); nskb = netdev_alloc_skb(p->dev, headroom + skb_gro_offset(p)); -- cgit v1.2.2 From 6b863d1d3239eff0f45c2e6e672f5b56db828db0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 17 Nov 2009 06:45:04 -0800 Subject: vlan: Fix register_vlan_dev() error path In case register_netdevice() returns an error, and a new vlan_group was allocated and inserted in vlan_group_hash[] we call vlan_group_free() without deleting group from hash table. Future lookups can give infinite loops or crashes. We must delete the vlan_group using RCU safe procedure. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 8836575f9d79..a29c5ab5815c 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -281,8 +281,11 @@ out_uninit_applicant: if (ngrp) vlan_gvrp_uninit_applicant(real_dev); out_free_group: - if (ngrp) - vlan_group_free(ngrp); + if (ngrp) { + hlist_del_rcu(&ngrp->hlist); + /* Free the group, after all cpu's are done. */ + call_rcu(&ngrp->rcu, vlan_rcu_free); + } return err; } -- cgit v1.2.2