diff options
-rw-r--r-- | Documentation/sysctl/net.txt | 7 | ||||
-rw-r--r-- | include/linux/netdevice.h | 3 | ||||
-rw-r--r-- | include/linux/skbuff.h | 8 | ||||
-rw-r--r-- | include/net/ll_poll.h | 148 | ||||
-rw-r--r-- | include/net/sock.h | 4 | ||||
-rw-r--r-- | include/uapi/linux/snmp.h | 1 | ||||
-rw-r--r-- | net/Kconfig | 12 | ||||
-rw-r--r-- | net/core/skbuff.c | 4 | ||||
-rw-r--r-- | net/core/sock.c | 6 | ||||
-rw-r--r-- | net/core/sysctl_net_core.c | 10 | ||||
-rw-r--r-- | net/ipv4/proc.c | 1 | ||||
-rw-r--r-- | net/socket.c | 6 |
12 files changed, 208 insertions, 2 deletions
diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index c1f8640c2fc8..85ab72dcdc3c 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt | |||
@@ -50,6 +50,13 @@ The maximum number of packets that kernel can handle on a NAPI interrupt, | |||
50 | it's a Per-CPU variable. | 50 | it's a Per-CPU variable. |
51 | Default: 64 | 51 | Default: 64 |
52 | 52 | ||
53 | low_latency_poll | ||
54 | ---------------- | ||
55 | Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL) | ||
56 | Approximate time in microseconds (us) to spin waiting for packets on the device queue. | |
57 | Recommended value is 50. May increase power usage. | ||
58 | Default: 0 (off) | ||
59 | |||
53 | rmem_default | 60 | rmem_default |
54 | ------------ | 61 | ------------ |
55 | 62 | ||
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 39bbd462d68e..2ecb96d9a1e5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h | |||
@@ -972,6 +972,9 @@ struct net_device_ops { | |||
972 | gfp_t gfp); | 972 | gfp_t gfp); |
973 | void (*ndo_netpoll_cleanup)(struct net_device *dev); | 973 | void (*ndo_netpoll_cleanup)(struct net_device *dev); |
974 | #endif | 974 | #endif |
975 | #ifdef CONFIG_NET_LL_RX_POLL | ||
976 | int (*ndo_ll_poll)(struct napi_struct *dev); | ||
977 | #endif | ||
975 | int (*ndo_set_vf_mac)(struct net_device *dev, | 978 | int (*ndo_set_vf_mac)(struct net_device *dev, |
976 | int queue, u8 *mac); | 979 | int queue, u8 *mac); |
977 | int (*ndo_set_vf_vlan)(struct net_device *dev, | 980 | int (*ndo_set_vf_vlan)(struct net_device *dev, |
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9995834d2cb6..400d82ae2b03 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h | |||
@@ -386,6 +386,7 @@ typedef unsigned char *sk_buff_data_t; | |||
386 | * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS | 386 | * @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS |
387 | * @dma_cookie: a cookie to one of several possible DMA operations | 387 | * @dma_cookie: a cookie to one of several possible DMA operations |
388 | * done by skb DMA functions | 388 | * done by skb DMA functions |
389 | * @napi_id: id of the NAPI struct this skb came from | ||
389 | * @secmark: security marking | 390 | * @secmark: security marking |
390 | * @mark: Generic packet mark | 391 | * @mark: Generic packet mark |
391 | * @dropcount: total number of sk_receive_queue overflows | 392 | * @dropcount: total number of sk_receive_queue overflows |
@@ -500,8 +501,11 @@ struct sk_buff { | |||
500 | /* 7/9 bit hole (depending on ndisc_nodetype presence) */ | 501 | /* 7/9 bit hole (depending on ndisc_nodetype presence) */ |
501 | kmemcheck_bitfield_end(flags2); | 502 | kmemcheck_bitfield_end(flags2); |
502 | 503 | ||
503 | #ifdef CONFIG_NET_DMA | 504 | #if defined CONFIG_NET_DMA || defined CONFIG_NET_LL_RX_POLL |
504 | dma_cookie_t dma_cookie; | 505 | union { |
506 | unsigned int napi_id; | ||
507 | dma_cookie_t dma_cookie; | ||
508 | }; | ||
505 | #endif | 509 | #endif |
506 | #ifdef CONFIG_NETWORK_SECMARK | 510 | #ifdef CONFIG_NETWORK_SECMARK |
507 | __u32 secmark; | 511 | __u32 secmark; |
diff --git a/include/net/ll_poll.h b/include/net/ll_poll.h new file mode 100644 index 000000000000..bc262f88173f --- /dev/null +++ b/include/net/ll_poll.h | |||
@@ -0,0 +1,148 @@ | |||
1 | /* | ||
2 | * Low Latency Sockets | ||
3 | * Copyright(c) 2013 Intel Corporation. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or modify it | ||
6 | * under the terms and conditions of the GNU General Public License, | ||
7 | * version 2, as published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope it will be useful, but WITHOUT | ||
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | ||
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for | ||
12 | * more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License along with | ||
15 | * this program; if not, write to the Free Software Foundation, Inc., | ||
16 | * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. | ||
17 | * | ||
18 | * Author: Eliezer Tamir | ||
19 | * | ||
20 | * Contact Information: | ||
21 | * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net> | ||
22 | */ | ||
23 | |||
24 | /* | ||
25 | * For now this depends on CONFIG_X86_TSC | ||
26 | */ | ||
27 | |||
28 | #ifndef _LINUX_NET_LL_POLL_H | ||
29 | #define _LINUX_NET_LL_POLL_H | ||
30 | |||
31 | #include <linux/netdevice.h> | ||
32 | #include <net/ip.h> | ||
33 | |||
34 | #ifdef CONFIG_NET_LL_RX_POLL | ||
35 | |||
36 | struct napi_struct; | ||
37 | extern unsigned long sysctl_net_ll_poll __read_mostly; | ||
38 | |||
39 | /* return values from ndo_ll_poll */ | ||
40 | #define LL_FLUSH_FAILED -1 | ||
41 | #define LL_FLUSH_BUSY -2 | ||
42 | |||
43 | /* the shift divides by 1024 rather than 1000; we don't mind a ~2.5% imprecision */ | |
44 | #define TSC_MHZ (tsc_khz >> 10) | ||
45 | |||
46 | static inline cycles_t ll_end_time(void) | ||
47 | { | ||
48 | return TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll) + get_cycles(); | ||
49 | } | ||
50 | |||
51 | static inline bool sk_valid_ll(struct sock *sk) | ||
52 | { | ||
53 | return sysctl_net_ll_poll && sk->sk_napi_id && | ||
54 | !need_resched() && !signal_pending(current); | ||
55 | } | ||
56 | |||
57 | static inline bool can_poll_ll(cycles_t end_time) | ||
58 | { | ||
59 | return !time_after((unsigned long)get_cycles(), | ||
60 | (unsigned long)end_time); | ||
61 | } | ||
62 | |||
63 | static inline bool sk_poll_ll(struct sock *sk, int nonblock) | ||
64 | { | ||
65 | cycles_t end_time = ll_end_time(); | ||
66 | const struct net_device_ops *ops; | ||
67 | struct napi_struct *napi; | ||
68 | int rc = false; | ||
69 | |||
70 | /* | ||
71 | * rcu read lock for napi hash | ||
72 | * bh so we don't race with net_rx_action | ||
73 | */ | ||
74 | rcu_read_lock_bh(); | ||
75 | |||
76 | napi = napi_by_id(sk->sk_napi_id); | ||
77 | if (!napi) | ||
78 | goto out; | ||
79 | |||
80 | ops = napi->dev->netdev_ops; | ||
81 | if (!ops->ndo_ll_poll) | ||
82 | goto out; | ||
83 | |||
84 | do { | ||
85 | |||
86 | rc = ops->ndo_ll_poll(napi); | ||
87 | |||
88 | if (rc == LL_FLUSH_FAILED) | ||
89 | break; /* permanent failure */ | ||
90 | |||
91 | if (rc > 0) | ||
92 | /* local bh are disabled so it is ok to use _BH */ | ||
93 | NET_ADD_STATS_BH(sock_net(sk), | ||
94 | LINUX_MIB_LOWLATENCYRXPACKETS, rc); | ||
95 | |||
96 | } while (skb_queue_empty(&sk->sk_receive_queue) | ||
97 | && can_poll_ll(end_time) && !nonblock); | ||
98 | |||
99 | rc = !skb_queue_empty(&sk->sk_receive_queue); | ||
100 | out: | ||
101 | rcu_read_unlock_bh(); | ||
102 | return rc; | ||
103 | } | ||
104 | |||
105 | /* used in the NIC receive handler to mark the skb */ | ||
106 | static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi) | ||
107 | { | ||
108 | skb->napi_id = napi->napi_id; | ||
109 | } | ||
110 | |||
111 | /* used in the protocol handler to propagate the napi_id to the socket */ | |
112 | static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) | ||
113 | { | ||
114 | sk->sk_napi_id = skb->napi_id; | ||
115 | } | ||
116 | |||
117 | #else /* CONFIG_NET_LL_RX_POLL */ | ||
118 | |||
119 | static inline cycles_t ll_end_time(void) | ||
120 | { | ||
121 | return 0; | ||
122 | } | ||
123 | |||
124 | static inline bool sk_valid_ll(struct sock *sk) | ||
125 | { | ||
126 | return false; | ||
127 | } | ||
128 | |||
129 | static inline bool sk_poll_ll(struct sock *sk, int nonblock) | ||
130 | { | ||
131 | return false; | ||
132 | } | ||
133 | |||
134 | static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi) | ||
135 | { | ||
136 | } | ||
137 | |||
138 | static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb) | ||
139 | { | ||
140 | } | ||
141 | |||
142 | static inline bool can_poll_ll(cycles_t end_time) | ||
143 | { | ||
144 | return false; | ||
145 | } | ||
146 | |||
147 | #endif /* CONFIG_NET_LL_RX_POLL */ | ||
148 | #endif /* _LINUX_NET_LL_POLL_H */ | ||
diff --git a/include/net/sock.h b/include/net/sock.h index 66772cf8c3c5..ac8e1818380c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h | |||
@@ -229,6 +229,7 @@ struct cg_proto; | |||
229 | * @sk_omem_alloc: "o" is "option" or "other" | 229 | * @sk_omem_alloc: "o" is "option" or "other" |
230 | * @sk_wmem_queued: persistent queue size | 230 | * @sk_wmem_queued: persistent queue size |
231 | * @sk_forward_alloc: space allocated forward | 231 | * @sk_forward_alloc: space allocated forward |
232 | * @sk_napi_id: id of the last napi context to receive data for sk | ||
232 | * @sk_allocation: allocation mode | 233 | * @sk_allocation: allocation mode |
233 | * @sk_sndbuf: size of send buffer in bytes | 234 | * @sk_sndbuf: size of send buffer in bytes |
234 | * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, | 235 | * @sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, |
@@ -325,6 +326,9 @@ struct sock { | |||
325 | #ifdef CONFIG_RPS | 326 | #ifdef CONFIG_RPS |
326 | __u32 sk_rxhash; | 327 | __u32 sk_rxhash; |
327 | #endif | 328 | #endif |
329 | #ifdef CONFIG_NET_LL_RX_POLL | ||
330 | unsigned int sk_napi_id; | ||
331 | #endif | ||
328 | atomic_t sk_drops; | 332 | atomic_t sk_drops; |
329 | int sk_rcvbuf; | 333 | int sk_rcvbuf; |
330 | 334 | ||
diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index df2e8b4f9c03..26cbf76f8058 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h | |||
@@ -253,6 +253,7 @@ enum | |||
253 | LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ | 253 | LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ |
254 | LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ | 254 | LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ |
255 | LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ | 255 | LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ |
256 | LINUX_MIB_LOWLATENCYRXPACKETS, /* LowLatencyRxPackets */ | ||
256 | __LINUX_MIB_MAX | 257 | __LINUX_MIB_MAX |
257 | }; | 258 | }; |
258 | 259 | ||
diff --git a/net/Kconfig b/net/Kconfig index 523e43e6da1b..d6a9ce6e1800 100644 --- a/net/Kconfig +++ b/net/Kconfig | |||
@@ -243,6 +243,18 @@ config NETPRIO_CGROUP | |||
243 | Cgroup subsystem for use in assigning processes to network priorities on | 243 | Cgroup subsystem for use in assigning processes to network priorities on |
244 | a per-interface basis | 244 | a per-interface basis |
245 | 245 | ||
246 | config NET_LL_RX_POLL | ||
247 | bool "Low Latency Receive Poll" | ||
248 | depends on X86_TSC | ||
249 | default n | ||
250 | ---help--- | ||
251 | Support Low Latency Receive Queue Poll. | ||
252 | (For network card drivers which support this option.) | ||
253 | When waiting for data in read or poll, call directly into the device driver | |
254 | to flush packets which may be pending on the device queues into the stack. | ||
255 | |||
256 | If unsure, say N. | ||
257 | |||
246 | config BQL | 258 | config BQL |
247 | boolean | 259 | boolean |
248 | depends on SYSFS | 260 | depends on SYSFS |
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 73f57a0e1523..4a4181e16c1a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c | |||
@@ -733,6 +733,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) | |||
733 | new->vlan_tci = old->vlan_tci; | 733 | new->vlan_tci = old->vlan_tci; |
734 | 734 | ||
735 | skb_copy_secmark(new, old); | 735 | skb_copy_secmark(new, old); |
736 | |||
737 | #ifdef CONFIG_NET_LL_RX_POLL | ||
738 | new->napi_id = old->napi_id; | ||
739 | #endif | ||
736 | } | 740 | } |
737 | 741 | ||
738 | /* | 742 | /* |
diff --git a/net/core/sock.c b/net/core/sock.c index 88868a9d21da..788c0da5eed1 100644 --- a/net/core/sock.c +++ b/net/core/sock.c | |||
@@ -139,6 +139,8 @@ | |||
139 | #include <net/tcp.h> | 139 | #include <net/tcp.h> |
140 | #endif | 140 | #endif |
141 | 141 | ||
142 | #include <net/ll_poll.h> | ||
143 | |||
142 | static DEFINE_MUTEX(proto_list_mutex); | 144 | static DEFINE_MUTEX(proto_list_mutex); |
143 | static LIST_HEAD(proto_list); | 145 | static LIST_HEAD(proto_list); |
144 | 146 | ||
@@ -2284,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk) | |||
2284 | 2286 | ||
2285 | sk->sk_stamp = ktime_set(-1L, 0); | 2287 | sk->sk_stamp = ktime_set(-1L, 0); |
2286 | 2288 | ||
2289 | #ifdef CONFIG_NET_LL_RX_POLL | ||
2290 | sk->sk_napi_id = 0; | ||
2291 | #endif | ||
2292 | |||
2287 | /* | 2293 | /* |
2288 | * Before updating sk_refcnt, we must commit prior changes to memory | 2294 | * Before updating sk_refcnt, we must commit prior changes to memory |
2289 | * (Documentation/RCU/rculist_nulls.txt for details) | 2295 | * (Documentation/RCU/rculist_nulls.txt for details) |
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 741db5fc7806..4b48f39582b0 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include <net/ip.h> | 19 | #include <net/ip.h> |
20 | #include <net/sock.h> | 20 | #include <net/sock.h> |
21 | #include <net/net_ratelimit.h> | 21 | #include <net/net_ratelimit.h> |
22 | #include <net/ll_poll.h> | ||
22 | 23 | ||
23 | static int one = 1; | 24 | static int one = 1; |
24 | 25 | ||
@@ -284,6 +285,15 @@ static struct ctl_table net_core_table[] = { | |||
284 | .proc_handler = flow_limit_table_len_sysctl | 285 | .proc_handler = flow_limit_table_len_sysctl |
285 | }, | 286 | }, |
286 | #endif /* CONFIG_NET_FLOW_LIMIT */ | 287 | #endif /* CONFIG_NET_FLOW_LIMIT */ |
288 | #ifdef CONFIG_NET_LL_RX_POLL | ||
289 | { | ||
290 | .procname = "low_latency_poll", | ||
291 | .data = &sysctl_net_ll_poll, | ||
292 | .maxlen = sizeof(unsigned long), | ||
293 | .mode = 0644, | ||
294 | .proc_handler = proc_doulongvec_minmax | ||
295 | }, | ||
296 | #endif | ||
287 | #endif /* CONFIG_NET */ | 297 | #endif /* CONFIG_NET */ |
288 | { | 298 | { |
289 | .procname = "netdev_budget", | 299 | .procname = "netdev_budget", |
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 2a5bf86d2415..6577a1149a47 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c | |||
@@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = { | |||
273 | SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), | 273 | SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), |
274 | SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), | 274 | SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), |
275 | SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), | 275 | SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), |
276 | SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS), | ||
276 | SNMP_MIB_SENTINEL | 277 | SNMP_MIB_SENTINEL |
277 | }; | 278 | }; |
278 | 279 | ||
diff --git a/net/socket.c b/net/socket.c index 3ebdcb805c51..21fd29f63ed2 100644 --- a/net/socket.c +++ b/net/socket.c | |||
@@ -104,6 +104,12 @@ | |||
104 | #include <linux/route.h> | 104 | #include <linux/route.h> |
105 | #include <linux/sockios.h> | 105 | #include <linux/sockios.h> |
106 | #include <linux/atalk.h> | 106 | #include <linux/atalk.h> |
107 | #include <net/ll_poll.h> | ||
108 | |||
109 | #ifdef CONFIG_NET_LL_RX_POLL | ||
110 | unsigned long sysctl_net_ll_poll __read_mostly; | ||
111 | EXPORT_SYMBOL_GPL(sysctl_net_ll_poll); | ||
112 | #endif | ||
107 | 113 | ||
108 | static int sock_no_open(struct inode *irrelevant, struct file *dontcare); | 114 | static int sock_no_open(struct inode *irrelevant, struct file *dontcare); |
109 | static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, | 115 | static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, |