aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/sysctl/net.txt7
-rw-r--r--include/linux/netdevice.h3
-rw-r--r--include/linux/skbuff.h8
-rw-r--r--include/net/ll_poll.h148
-rw-r--r--include/net/sock.h4
-rw-r--r--include/uapi/linux/snmp.h1
-rw-r--r--net/Kconfig12
-rw-r--r--net/core/skbuff.c4
-rw-r--r--net/core/sock.c6
-rw-r--r--net/core/sysctl_net_core.c10
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/socket.c6
12 files changed, 208 insertions, 2 deletions
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index c1f8640c2fc8..85ab72dcdc3c 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -50,6 +50,13 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
50it's a Per-CPU variable. 50it's a Per-CPU variable.
51Default: 64 51Default: 64
52 52
53low_latency_poll
54----------------
55Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL)
56Approximate time in us to spin waiting for packets on the device queue.
57Recommended value is 50. May increase power usage.
58Default: 0 (off)
59
53rmem_default 60rmem_default
54------------ 61------------
55 62
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 39bbd462d68e..2ecb96d9a1e5 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -972,6 +972,9 @@ struct net_device_ops {
972 gfp_t gfp); 972 gfp_t gfp);
973 void (*ndo_netpoll_cleanup)(struct net_device *dev); 973 void (*ndo_netpoll_cleanup)(struct net_device *dev);
974#endif 974#endif
975#ifdef CONFIG_NET_LL_RX_POLL
976 int (*ndo_ll_poll)(struct napi_struct *dev);
977#endif
975 int (*ndo_set_vf_mac)(struct net_device *dev, 978 int (*ndo_set_vf_mac)(struct net_device *dev,
976 int queue, u8 *mac); 979 int queue, u8 *mac);
977 int (*ndo_set_vf_vlan)(struct net_device *dev, 980 int (*ndo_set_vf_vlan)(struct net_device *dev,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9995834d2cb6..400d82ae2b03 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -386,6 +386,7 @@ typedef unsigned char *sk_buff_data_t;
386 * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS 386 * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
387 * @dma_cookie: a cookie to one of several possible DMA operations 387 * @dma_cookie: a cookie to one of several possible DMA operations
388 * done by skb DMA functions 388 * done by skb DMA functions
389 * @napi_id: id of the NAPI struct this skb came from
389 * @secmark: security marking 390 * @secmark: security marking
390 * @mark: Generic packet mark 391 * @mark: Generic packet mark
391 * @dropcount: total number of sk_receive_queue overflows 392 * @dropcount: total number of sk_receive_queue overflows
@@ -500,8 +501,11 @@ struct sk_buff {
500 /* 7/9 bit hole (depending on ndisc_nodetype presence) */ 501 /* 7/9 bit hole (depending on ndisc_nodetype presence) */
501 kmemcheck_bitfield_end(flags2); 502 kmemcheck_bitfield_end(flags2);
502 503
503#ifdef CONFIG_NET_DMA 504#if defined CONFIG_NET_DMA || defined CONFIG_NET_LL_RX_POLL
504 dma_cookie_t dma_cookie; 505 union {
506 unsigned int napi_id;
507 dma_cookie_t dma_cookie;
508 };
505#endif 509#endif
506#ifdef CONFIG_NETWORK_SECMARK 510#ifdef CONFIG_NETWORK_SECMARK
507 __u32 secmark; 511 __u32 secmark;
diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h
new file mode 100644
index 000000000000..bc262f88173f
--- /dev/null
+++ b/include/net/ll_poll.h
@@ -0,0 +1,148 @@
1/*
2 * Low Latency Sockets
3 * Copyright(c) 2013 Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; if not, write to the Free Software Foundation, Inc.,
16 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
17 *
18 * Author: Eliezer Tamir
19 *
20 * Contact Information:
21 * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
22 */
23
24/*
25 * For now this depends on CONFIG_X86_TSC
26 */
27
28#ifndef _LINUX_NET_LL_POLL_H
29#define _LINUX_NET_LL_POLL_H
30
31#include <linux/netdevice.h>
32#include <net/ip.h>
33
34#ifdef CONFIG_NET_LL_RX_POLL
35
36struct napi_struct;
37extern unsigned long sysctl_net_ll_poll __read_mostly;
38
39/* return values from ndo_ll_poll */
40#define LL_FLUSH_FAILED -1
41#define LL_FLUSH_BUSY -2
42
43/* we don't mind a ~2.5% imprecision */
44#define TSC_MHZ (tsc_khz >> 10)
45
46static inline cycles_t ll_end_time(void)
47{
48 return TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll) + get_cycles();
49}
50
51static inline bool sk_valid_ll(struct sock *sk)
52{
53 return sysctl_net_ll_poll && sk->sk_napi_id &&
54 !need_resched() && !signal_pending(current);
55}
56
57static inline bool can_poll_ll(cycles_t end_time)
58{
59 return !time_after((unsigned long)get_cycles(),
60 (unsigned long)end_time);
61}
62
63static inline bool sk_poll_ll(struct sock *sk, int nonblock)
64{
65 cycles_t end_time = ll_end_time();
66 const struct net_device_ops *ops;
67 struct napi_struct *napi;
68 int rc = false;
69
70 /*
71 * rcu read lock for napi hash
72 * bh so we don't race with net_rx_action
73 */
74 rcu_read_lock_bh();
75
76 napi = napi_by_id(sk->sk_napi_id);
77 if (!napi)
78 goto out;
79
80 ops = napi->dev->netdev_ops;
81 if (!ops->ndo_ll_poll)
82 goto out;
83
84 do {
85
86 rc = ops->ndo_ll_poll(napi);
87
88 if (rc == LL_FLUSH_FAILED)
89 break; /* permanent failure */
90
91 if (rc > 0)
92 /* local bh are disabled so it is ok to use _BH */
93 NET_ADD_STATS_BH(sock_net(sk),
94 LINUX_MIB_LOWLATENCYRXPACKETS, rc);
95
96 } while (skb_queue_empty(&sk->sk_receive_queue)
97 && can_poll_ll(end_time) && !nonblock);
98
99 rc = !skb_queue_empty(&sk->sk_receive_queue);
100out:
101 rcu_read_unlock_bh();
102 return rc;
103}
104
105/* used in the NIC receive handler to mark the skb */
106static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
107{
108 skb->napi_id = napi->napi_id;
109}
110
111/* used in the protocol handler to propagate the napi_id to the socket */
112static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
113{
114 sk->sk_napi_id = skb->napi_id;
115}
116
117#else /* CONFIG_NET_LL_RX_POLL */
118
119static inline cycles_t ll_end_time(void)
120{
121 return 0;
122}
123
124static inline bool sk_valid_ll(struct sock *sk)
125{
126 return false;
127}
128
129static inline bool sk_poll_ll(struct sock *sk, int nonblock)
130{
131 return false;
132}
133
134static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
135{
136}
137
138static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
139{
140}
141
142static inline bool can_poll_ll(cycles_t end_time)
143{
144 return false;
145}
146
147#endif /* CONFIG_NET_LL_RX_POLL */
148#endif /* _LINUX_NET_LL_POLL_H */
diff --git a/include/net/sock.h b/include/net/sock.h
index 66772cf8c3c5..ac8e1818380c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -229,6 +229,7 @@ struct cg_proto;
229 * @sk_omem_alloc: "o" is "option" or "other" 229 * @sk_omem_alloc: "o" is "option" or "other"
230 * @sk_wmem_queued: persistent queue size 230 * @sk_wmem_queued: persistent queue size
231 * @sk_forward_alloc: space allocated forward 231 * @sk_forward_alloc: space allocated forward
232 * @sk_napi_id: id of the last napi context to receive data for sk
232 * @sk_allocation: allocation mode 233 * @sk_allocation: allocation mode
233 * @sk_sndbuf: size of send buffer in bytes 234 * @sk_sndbuf: size of send buffer in bytes
234 * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, 235 * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
@@ -325,6 +326,9 @@ struct sock {
325#ifdef CONFIG_RPS 326#ifdef CONFIG_RPS
326 __u32 sk_rxhash; 327 __u32 sk_rxhash;
327#endif 328#endif
329#ifdef CONFIG_NET_LL_RX_POLL
330 unsigned int sk_napi_id;
331#endif
328 atomic_t sk_drops; 332 atomic_t sk_drops;
329 int sk_rcvbuf; 333 int sk_rcvbuf;
330 334
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h
index df2e8b4f9c03..26cbf76f8058 100644
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -253,6 +253,7 @@ enum
253 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ 253 LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */
254 LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ 254 LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */
255 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ 255 LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */
256 LINUX_MIB_LOWLATENCYRXPACKETS, /* LowLatencyRxPackets */
256 __LINUX_MIB_MAX 257 __LINUX_MIB_MAX
257}; 258};
258 259
diff --git a/net/Kconfig b/net/Kconfig
index 523e43e6da1b..d6a9ce6e1800 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -243,6 +243,18 @@ config NETPRIO_CGROUP
243 Cgroup subsystem for use in assigning processes to network priorities on 243 Cgroup subsystem for use in assigning processes to network priorities on
244 a per-interface basis 244 a per-interface basis
245 245
246config NET_LL_RX_POLL
247 bool "Low Latency Receive Poll"
248 depends on X86_TSC
249 default n
250 ---help---
251 Support Low Latency Receive Queue Poll.
252 (For network card drivers which support this option.)
253 When waiting for data in read or poll call directly into the device driver
254 to flush packets which may be pending on the device queues into the stack.
255
256 If unsure, say N.
257
246config BQL 258config BQL
247 boolean 259 boolean
248 depends on SYSFS 260 depends on SYSFS
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 73f57a0e1523..4a4181e16c1a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -733,6 +733,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
733 new->vlan_tci = old->vlan_tci; 733 new->vlan_tci = old->vlan_tci;
734 734
735 skb_copy_secmark(new, old); 735 skb_copy_secmark(new, old);
736
737#ifdef CONFIG_NET_LL_RX_POLL
738 new->napi_id = old->napi_id;
739#endif
736} 740}
737 741
738/* 742/*
diff --git a/net/core/sock.c b/net/core/sock.c
index 88868a9d21da..788c0da5eed1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -139,6 +139,8 @@
139#include <net/tcp.h> 139#include <net/tcp.h>
140#endif 140#endif
141 141
142#include <net/ll_poll.h>
143
142static DEFINE_MUTEX(proto_list_mutex); 144static DEFINE_MUTEX(proto_list_mutex);
143static LIST_HEAD(proto_list); 145static LIST_HEAD(proto_list);
144 146
@@ -2284,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
2284 2286
2285 sk->sk_stamp = ktime_set(-1L, 0); 2287 sk->sk_stamp = ktime_set(-1L, 0);
2286 2288
2289#ifdef CONFIG_NET_LL_RX_POLL
2290 sk->sk_napi_id = 0;
2291#endif
2292
2287 /* 2293 /*
2288 * Before updating sk_refcnt, we must commit prior changes to memory 2294 * Before updating sk_refcnt, we must commit prior changes to memory
2289 * (Documentation/RCU/rculist_nulls.txt for details) 2295 * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 741db5fc7806..4b48f39582b0 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -19,6 +19,7 @@
19#include <net/ip.h> 19#include <net/ip.h>
20#include <net/sock.h> 20#include <net/sock.h>
21#include <net/net_ratelimit.h> 21#include <net/net_ratelimit.h>
22#include <net/ll_poll.h>
22 23
23static int one = 1; 24static int one = 1;
24 25
@@ -284,6 +285,15 @@ static struct ctl_table net_core_table[] = {
284 .proc_handler = flow_limit_table_len_sysctl 285 .proc_handler = flow_limit_table_len_sysctl
285 }, 286 },
286#endif /* CONFIG_NET_FLOW_LIMIT */ 287#endif /* CONFIG_NET_FLOW_LIMIT */
288#ifdef CONFIG_NET_LL_RX_POLL
289 {
290 .procname = "low_latency_poll",
291 .data = &sysctl_net_ll_poll,
292 .maxlen = sizeof(unsigned long),
293 .mode = 0644,
294 .proc_handler = proc_doulongvec_minmax
295 },
296#endif
287#endif /* CONFIG_NET */ 297#endif /* CONFIG_NET */
288 { 298 {
289 .procname = "netdev_budget", 299 .procname = "netdev_budget",
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 2a5bf86d2415..6577a1149a47 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = {
273 SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), 273 SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
274 SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), 274 SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
275 SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), 275 SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
276 SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS),
276 SNMP_MIB_SENTINEL 277 SNMP_MIB_SENTINEL
277}; 278};
278 279
diff --git a/net/socket.c b/net/socket.c
index 3ebdcb805c51..21fd29f63ed2 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -104,6 +104,12 @@
104#include <linux/route.h> 104#include <linux/route.h>
105#include <linux/sockios.h> 105#include <linux/sockios.h>
106#include <linux/atalk.h> 106#include <linux/atalk.h>
107#include <net/ll_poll.h>
108
109#ifdef CONFIG_NET_LL_RX_POLL
110unsigned long sysctl_net_ll_poll __read_mostly;
111EXPORT_SYMBOL_GPL(sysctl_net_ll_poll);
112#endif
107 113
108static int sock_no_open(struct inode *irrelevant, struct file *dontcare); 114static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
109static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, 115static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,