aboutsummaryrefslogtreecommitdiffstats
path: root/include/net
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2012-07-22 20:04:15 -0400
committerDavid S. Miller <davem@davemloft.net>2012-07-22 20:04:15 -0400
commit5e9965c15ba88319500284e590733f4a4629a288 (patch)
treeab76263b9f43fb75048a50141d199f445f5fdd2d /include/net
parent3ba97381343b271296487bf073eb670d5465a8b8 (diff)
parent2860583fe840d972573363dfa190b2149a604534 (diff)
Merge branch 'kill_rtcache'
The ipv4 routing cache is non-deterministic, performance-wise, and is subject to reasonably easy to launch denial of service attacks. The routing cache works great for well behaved traffic, and the world was a much friendlier place when the tradeoffs that led to the routing cache's design were considered. What it boils down to is that the performance of the routing cache is a product of the traffic patterns seen by a system rather than being a product of the contents of the routing tables. The former is controllable by external entities. Even for "well behaved" legitimate traffic, high volume sites can see hit rates in the routing cache of only ~10%. The general flow of this patch series is that first the routing cache is removed. We build a completely new rtable entry every lookup request. Next we make some simplifications due to the fact that removing the routing cache causes several members of struct rtable to become no longer necessary. Then we need to make some amends such that we can legally cache pre-constructed routes in the FIB nexthops. Firstly, we need to invalidate routes which are hit with nexthop exceptions. Secondly we have to change the semantics of rt->rt_gateway such that zero means that the destination is on-link and non-zero otherwise. Now that the preparations are ready, we start caching precomputed routes in the FIB nexthops. Output and input routes need different kinds of care when determining if we can legally do such caching or not. The details are in the commit log messages for those changes. The patch series then winds down with some more struct rtable simplifications and other tidy ups that remove unnecessary overhead. On a SPARC-T3 output route lookups are ~876 cycles. Input route lookups are ~1169 cycles with rpfilter disabled, and ~1468 cycles with rpfilter enabled. 
These measurements were taken with the kbench_mod test module in the net_test_tools GIT tree: git://git.kernel.org/pub/scm/linux/kernel/git/davem/net_test_tools.git That GIT tree also includes a udpflood tester tool and stresses route lookups on packet output. For example, on the same SPARC-T3 system we can run: time ./udpflood -l 10000000 10.2.2.11 with routing cache: real 1m21.955s user 0m6.530s sys 1m15.390s without routing cache: real 1m31.678s user 0m6.520s sys 1m25.140s Performance undoubtedly can easily be improved further. For example, fib_table_lookup() performs a lot of excessive computations with all the masking and shifting, some of it conditionalized to deal with edge cases. Also, Eric's no-ref optimization for input route lookups can be re-instated for the FIB nexthop caching code path. I would be really pleased if someone would work on that. In fact anyone suitably motivated can just fire up perf on the loading of the test net_test_tools benchmark kernel module. I spend much of my time going: bash# perf record insmod ./kbench_mod.ko dst=172.30.42.22 src=74.128.0.1 iif=2 bash# perf report Thanks to helpful feedback from Joe Perches, Eric Dumazet, Ben Hutchings, and others. Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'include/net')
-rw-r--r--include/net/dst.h15
-rw-r--r--include/net/flow.h1
-rw-r--r--include/net/inet_connection_sock.h3
-rw-r--r--include/net/ip_fib.h3
-rw-r--r--include/net/route.h40
5 files changed, 30 insertions, 32 deletions
diff --git a/include/net/dst.h b/include/net/dst.h
index 51610468c63d..baf597890064 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -65,7 +65,20 @@ struct dst_entry {
65 unsigned short pending_confirm; 65 unsigned short pending_confirm;
66 66
67 short error; 67 short error;
68
69 /* A non-zero value of dst->obsolete forces by-hand validation
70 * of the route entry. Positive values are set by the generic
71 * dst layer to indicate that the entry has been forcefully
72 * destroyed.
73 *
74 * Negative values are used by the implementation layer code to
75 * force invocation of the dst_ops->check() method.
76 */
68 short obsolete; 77 short obsolete;
78#define DST_OBSOLETE_NONE 0
79#define DST_OBSOLETE_DEAD 2
80#define DST_OBSOLETE_FORCE_CHK -1
81#define DST_OBSOLETE_KILL -2
69 unsigned short header_len; /* more space at head required */ 82 unsigned short header_len; /* more space at head required */
70 unsigned short trailer_len; /* space to reserve at tail */ 83 unsigned short trailer_len; /* space to reserve at tail */
71#ifdef CONFIG_IP_ROUTE_CLASSID 84#ifdef CONFIG_IP_ROUTE_CLASSID
@@ -359,7 +372,7 @@ extern struct dst_entry *dst_destroy(struct dst_entry *dst);
359 372
360static inline void dst_free(struct dst_entry *dst) 373static inline void dst_free(struct dst_entry *dst)
361{ 374{
362 if (dst->obsolete > 1) 375 if (dst->obsolete > 0)
363 return; 376 return;
364 if (!atomic_read(&dst->__refcnt)) { 377 if (!atomic_read(&dst->__refcnt)) {
365 dst = dst_destroy(dst); 378 dst = dst_destroy(dst);
diff --git a/include/net/flow.h b/include/net/flow.h
index ce9cb7656b47..e1dd5082ec7e 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -21,7 +21,6 @@ struct flowi_common {
21 __u8 flowic_flags; 21 __u8 flowic_flags;
22#define FLOWI_FLAG_ANYSRC 0x01 22#define FLOWI_FLAG_ANYSRC 0x01
23#define FLOWI_FLAG_CAN_SLEEP 0x02 23#define FLOWI_FLAG_CAN_SLEEP 0x02
24#define FLOWI_FLAG_RT_NOCACHE 0x04
25 __u32 flowic_secid; 24 __u32 flowic_secid;
26}; 25};
27 26
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 2cf44b4ed2e6..5ee66f517b4f 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -250,8 +250,7 @@ extern int inet_csk_get_port(struct sock *sk, unsigned short snum);
250 250
251extern struct dst_entry* inet_csk_route_req(struct sock *sk, 251extern struct dst_entry* inet_csk_route_req(struct sock *sk,
252 struct flowi4 *fl4, 252 struct flowi4 *fl4,
253 const struct request_sock *req, 253 const struct request_sock *req);
254 bool nocache);
255extern struct dst_entry* inet_csk_route_child_sock(struct sock *sk, 254extern struct dst_entry* inet_csk_route_child_sock(struct sock *sk,
256 struct sock *newsk, 255 struct sock *newsk,
257 const struct request_sock *req); 256 const struct request_sock *req);
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 2daf096dfc60..e69c3a47153d 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -46,6 +46,7 @@ struct fib_config {
46 }; 46 };
47 47
48struct fib_info; 48struct fib_info;
49struct rtable;
49 50
50struct fib_nh_exception { 51struct fib_nh_exception {
51 struct fib_nh_exception __rcu *fnhe_next; 52 struct fib_nh_exception __rcu *fnhe_next;
@@ -80,6 +81,8 @@ struct fib_nh {
80 __be32 nh_gw; 81 __be32 nh_gw;
81 __be32 nh_saddr; 82 __be32 nh_saddr;
82 int nh_saddr_genid; 83 int nh_saddr_genid;
84 struct rtable *nh_rth_output;
85 struct rtable *nh_rth_input;
83 struct fnhe_hash_bucket *nh_exceptions; 86 struct fnhe_hash_bucket *nh_exceptions;
84}; 87};
85 88
diff --git a/include/net/route.h b/include/net/route.h
index ace3cb442519..60d611dc5cee 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -44,38 +44,35 @@ struct fib_info;
44struct rtable { 44struct rtable {
45 struct dst_entry dst; 45 struct dst_entry dst;
46 46
47 /* Lookup key. */
48 __be32 rt_key_dst;
49 __be32 rt_key_src;
50
51 int rt_genid; 47 int rt_genid;
52 unsigned int rt_flags; 48 unsigned int rt_flags;
53 __u16 rt_type; 49 __u16 rt_type;
54 __u8 rt_key_tos; 50 __u16 rt_is_input;
55 51
56 __be32 rt_dst; /* Path destination */
57 __be32 rt_src; /* Path source */
58 int rt_route_iif;
59 int rt_iif; 52 int rt_iif;
60 int rt_oif;
61 __u32 rt_mark;
62 53
63 /* Info on neighbour */ 54 /* Info on neighbour */
64 __be32 rt_gateway; 55 __be32 rt_gateway;
65 56
66 /* Miscellaneous cached information */ 57 /* Miscellaneous cached information */
67 u32 rt_pmtu; 58 u32 rt_pmtu;
68 struct fib_info *fi; /* for client ref to shared metrics */
69}; 59};
70 60
71static inline bool rt_is_input_route(const struct rtable *rt) 61static inline bool rt_is_input_route(const struct rtable *rt)
72{ 62{
73 return rt->rt_route_iif != 0; 63 return rt->rt_is_input != 0;
74} 64}
75 65
76static inline bool rt_is_output_route(const struct rtable *rt) 66static inline bool rt_is_output_route(const struct rtable *rt)
77{ 67{
78 return rt->rt_route_iif == 0; 68 return rt->rt_is_input == 0;
69}
70
71static inline __be32 rt_nexthop(const struct rtable *rt, __be32 daddr)
72{
73 if (rt->rt_gateway)
74 return rt->rt_gateway;
75 return daddr;
79} 76}
80 77
81struct ip_rt_acct { 78struct ip_rt_acct {
@@ -109,7 +106,6 @@ extern struct ip_rt_acct __percpu *ip_rt_acct;
109struct in_device; 106struct in_device;
110extern int ip_rt_init(void); 107extern int ip_rt_init(void);
111extern void rt_cache_flush(struct net *net, int how); 108extern void rt_cache_flush(struct net *net, int how);
112extern void rt_cache_flush_batch(struct net *net);
113extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); 109extern struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
114extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, 110extern struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
115 struct sock *sk); 111 struct sock *sk);
@@ -161,20 +157,8 @@ static inline struct rtable *ip_route_output_gre(struct net *net, struct flowi4
161 return ip_route_output_key(net, fl4); 157 return ip_route_output_key(net, fl4);
162} 158}
163 159
164extern int ip_route_input_common(struct sk_buff *skb, __be32 dst, __be32 src, 160extern int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
165 u8 tos, struct net_device *devin, bool noref); 161 u8 tos, struct net_device *devin);
166
167static inline int ip_route_input(struct sk_buff *skb, __be32 dst, __be32 src,
168 u8 tos, struct net_device *devin)
169{
170 return ip_route_input_common(skb, dst, src, tos, devin, false);
171}
172
173static inline int ip_route_input_noref(struct sk_buff *skb, __be32 dst, __be32 src,
174 u8 tos, struct net_device *devin)
175{
176 return ip_route_input_common(skb, dst, src, tos, devin, true);
177}
178 162
179extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu, 163extern void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
180 int oif, u32 mark, u8 protocol, int flow_flags); 164 int oif, u32 mark, u8 protocol, int flow_flags);