aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorEric W. Biederman <ebiederm@xmission.com>2015-03-03 20:10:47 -0500
committerDavid S. Miller <davem@davemloft.net>2015-03-04 00:26:06 -0500
commit0189197f441602acdca3f97750d392a895b778fd (patch)
tree42ea0d4f05baaaabe7e15b153ce002d0aaaac1a4
parentcec9166ca4e586de389b0f3c43a8103e728d92ec (diff)
mpls: Basic routing support
This change adds a new Kconfig option MPLS_ROUTING. The core of this change is the code to look at an mpls packet received from another machine, look that packet up in a routing table, and forward the packet on. Support of MPLS over ATM is not considered or attempted here. This implementation follows RFC3032 and implements the MPLS shim header that can pass over essentially any network. What RFC3031 refers to as the Incoming Label Map (ILM) I call net->mpls.platform_label[]. What RFC3031 refers to as the Next Hop Label Forwarding Entry (NHLFE) I call mpls_route. Though calling it the label forwarding information base (lfib) might also be valid. Further the implementation forwards packets as described in RFC3032. There is no need and given the original motivation for MPLS a strong disincentive to have a flexible label forwarding path. In essence the logic is the topmost label is read, looked up, removed, and replaced by 0 or more new labels and then sent out the specified interface to its next hop. Quite a few optional features are not implemented here. Among them are generation of ICMP errors when the TTL is exceeded or the packet is larger than the next hop MTU (those conditions are detected and the packets are dropped instead of generating an icmp error). The traffic class field is always set to 0. The implementation focuses on IP over MPLS and does not handle egress of other kinds of protocols. Instead of implementing coordination with the neighbour table and sorting out how to input next hops in a different address family (for which there is value), I was lazy and implemented a next hop mac address instead. The code is simpler and there are flavors of MPLS such as MPLS-TP where neither an IPv4 nor an IPv6 next hop is appropriate so a next hop by mac address would need to be implemented at some point. Two new definitions AF_MPLS and PF_MPLS are exposed to userspace. 
Decoding the mpls header must be done by first byteswapping a 32bit big endian word into the local cpu endian and then bit shifting to extract the pieces. There is no C bit-field that can represent a wire format mpls header on a little endian machine as the low bits of the 20bit label wind up in the wrong half of the third byte. Therefore internally everything is dealt with in cpu native byte order except when writing to and reading from a packet. For management simplicity if a label is configured to forward out an interface that is down the packet is dropped early. Similarly if a network interface is removed rt_dev is updated to NULL (so no reference is preserved) and any packets for that label are dropped. Keeping the label entries in the kernel allows the kernel label table to function as the definitive source of which labels are allocated and which are not. Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com> Signed-off-by: David S. Miller <davem@davemloft.net>
-rw-r--r--include/linux/socket.h2
-rw-r--r--include/net/net_namespace.h4
-rw-r--r--include/net/netns/mpls.h15
-rw-r--r--net/mpls/Kconfig5
-rw-r--r--net/mpls/Makefile1
-rw-r--r--net/mpls/af_mpls.c349
-rw-r--r--net/mpls/internal.h56
7 files changed, 432 insertions, 0 deletions
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 5c19cba34dce..fab4d0ddf4ed 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -181,6 +181,7 @@ struct ucred {
181#define AF_WANPIPE 25 /* Wanpipe API Sockets */ 181#define AF_WANPIPE 25 /* Wanpipe API Sockets */
182#define AF_LLC 26 /* Linux LLC */ 182#define AF_LLC 26 /* Linux LLC */
183#define AF_IB 27 /* Native InfiniBand address */ 183#define AF_IB 27 /* Native InfiniBand address */
184#define AF_MPLS 28 /* MPLS */
184#define AF_CAN 29 /* Controller Area Network */ 185#define AF_CAN 29 /* Controller Area Network */
185#define AF_TIPC 30 /* TIPC sockets */ 186#define AF_TIPC 30 /* TIPC sockets */
186#define AF_BLUETOOTH 31 /* Bluetooth sockets */ 187#define AF_BLUETOOTH 31 /* Bluetooth sockets */
@@ -226,6 +227,7 @@ struct ucred {
226#define PF_WANPIPE AF_WANPIPE 227#define PF_WANPIPE AF_WANPIPE
227#define PF_LLC AF_LLC 228#define PF_LLC AF_LLC
228#define PF_IB AF_IB 229#define PF_IB AF_IB
230#define PF_MPLS AF_MPLS
229#define PF_CAN AF_CAN 231#define PF_CAN AF_CAN
230#define PF_TIPC AF_TIPC 232#define PF_TIPC AF_TIPC
231#define PF_BLUETOOTH AF_BLUETOOTH 233#define PF_BLUETOOTH AF_BLUETOOTH
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 36faf4990c4b..2cb9acb618e9 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -26,6 +26,7 @@
26#endif 26#endif
27#include <net/netns/nftables.h> 27#include <net/netns/nftables.h>
28#include <net/netns/xfrm.h> 28#include <net/netns/xfrm.h>
29#include <net/netns/mpls.h>
29#include <linux/ns_common.h> 30#include <linux/ns_common.h>
30 31
31struct user_namespace; 32struct user_namespace;
@@ -130,6 +131,9 @@ struct net {
130#if IS_ENABLED(CONFIG_IP_VS) 131#if IS_ENABLED(CONFIG_IP_VS)
131 struct netns_ipvs *ipvs; 132 struct netns_ipvs *ipvs;
132#endif 133#endif
134#if IS_ENABLED(CONFIG_MPLS)
135 struct netns_mpls mpls;
136#endif
133 struct sock *diag_nlsk; 137 struct sock *diag_nlsk;
134 atomic_t fnhe_genid; 138 atomic_t fnhe_genid;
135}; 139};
diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h
new file mode 100644
index 000000000000..f90aaf8d4f89
--- /dev/null
+++ b/include/net/netns/mpls.h
@@ -0,0 +1,15 @@
1/*
2 * mpls in net namespaces
3 */
4
5#ifndef __NETNS_MPLS_H__
6#define __NETNS_MPLS_H__
7
8struct mpls_route;
9
10struct netns_mpls {
11 size_t platform_labels;
12 struct mpls_route __rcu * __rcu *platform_label;
13};
14
15#endif /* __NETNS_MPLS_H__ */
diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig
index a77fbcdd04ee..f4286ee7e2b0 100644
--- a/net/mpls/Kconfig
+++ b/net/mpls/Kconfig
@@ -22,4 +22,9 @@ config NET_MPLS_GSO
22 that have had MPLS stack entries pushed onto them and thus 22 that have had MPLS stack entries pushed onto them and thus
23 become MPLS GSO packets. 23 become MPLS GSO packets.
24 24
25config MPLS_ROUTING
26 bool "MPLS: routing support"
27 help
28 Add support for forwarding of mpls packets.
29
25endif # MPLS 30endif # MPLS
diff --git a/net/mpls/Makefile b/net/mpls/Makefile
index 6dec088c2d0f..60af15f1960e 100644
--- a/net/mpls/Makefile
+++ b/net/mpls/Makefile
@@ -2,3 +2,4 @@
2# Makefile for MPLS. 2# Makefile for MPLS.
3# 3#
4obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o 4obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o
5obj-$(CONFIG_MPLS_ROUTING) += af_mpls.o
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
new file mode 100644
index 000000000000..924377736b2a
--- /dev/null
+++ b/net/mpls/af_mpls.c
@@ -0,0 +1,349 @@
1#include <linux/types.h>
2#include <linux/skbuff.h>
3#include <linux/socket.h>
4#include <linux/net.h>
5#include <linux/module.h>
6#include <linux/if_arp.h>
7#include <linux/ipv6.h>
8#include <linux/mpls.h>
9#include <net/ip.h>
10#include <net/dst.h>
11#include <net/sock.h>
12#include <net/arp.h>
13#include <net/ip_fib.h>
14#include <net/netevent.h>
15#include <net/netns/generic.h>
16#include "internal.h"
17
18#define MAX_NEW_LABELS 2
19
20/* This maximum ha length copied from the definition of struct neighbour */
21#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
22
23struct mpls_route { /* next hop label forwarding entry */
24 struct net_device *rt_dev;
25 struct rcu_head rt_rcu;
26 u32 rt_label[MAX_NEW_LABELS];
27 u8 rt_protocol; /* routing protocol that set this entry */
28 u8 rt_labels:2,
29 rt_via_alen:6;
30 unsigned short rt_via_family;
31 u8 rt_via[0];
32};
33
34static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index)
35{
36 struct mpls_route *rt = NULL;
37
38 if (index < net->mpls.platform_labels) {
39 struct mpls_route __rcu **platform_label =
40 rcu_dereference(net->mpls.platform_label);
41 rt = rcu_dereference(platform_label[index]);
42 }
43 return rt;
44}
45
46static bool mpls_output_possible(const struct net_device *dev)
47{
48 return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
49}
50
51static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
52{
53 /* The size of the layer 2.5 labels to be added for this route */
54 return rt->rt_labels * sizeof(struct mpls_shim_hdr);
55}
56
57static unsigned int mpls_dev_mtu(const struct net_device *dev)
58{
59 /* The amount of data the layer 2 frame can hold */
60 return dev->mtu;
61}
62
63static bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
64{
65 if (skb->len <= mtu)
66 return false;
67
68 if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
69 return false;
70
71 return true;
72}
73
74static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
75 struct mpls_entry_decoded dec)
76{
77 /* RFC4385 and RFC5586 encode other packets in mpls such that
78 * they don't conflict with the ip version number, making
79 * decoding by examining the ip version correct in everything
80 * except for the strangest cases.
81 *
82 * The strange cases if we choose to support them will require
83 * manual configuration.
84 */
85 struct iphdr *hdr4 = ip_hdr(skb);
86 bool success = true;
87
88 if (hdr4->version == 4) {
89 skb->protocol = htons(ETH_P_IP);
90 csum_replace2(&hdr4->check,
91 htons(hdr4->ttl << 8),
92 htons(dec.ttl << 8));
93 hdr4->ttl = dec.ttl;
94 }
95 else if (hdr4->version == 6) {
96 struct ipv6hdr *hdr6 = ipv6_hdr(skb);
97 skb->protocol = htons(ETH_P_IPV6);
98 hdr6->hop_limit = dec.ttl;
99 }
100 else
101 /* version 0 and version 1 are used by pseudo wires */
102 success = false;
103 return success;
104}
105
106static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
107 struct packet_type *pt, struct net_device *orig_dev)
108{
109 struct net *net = dev_net(dev);
110 struct mpls_shim_hdr *hdr;
111 struct mpls_route *rt;
112 struct mpls_entry_decoded dec;
113 struct net_device *out_dev;
114 unsigned int hh_len;
115 unsigned int new_header_size;
116 unsigned int mtu;
117 int err;
118
119 /* Careful this entire function runs inside of an rcu critical section */
120
121 if (skb->pkt_type != PACKET_HOST)
122 goto drop;
123
124 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
125 goto drop;
126
127 if (!pskb_may_pull(skb, sizeof(*hdr)))
128 goto drop;
129
130 /* Read and decode the label */
131 hdr = mpls_hdr(skb);
132 dec = mpls_entry_decode(hdr);
133
134 /* Pop the label */
135 skb_pull(skb, sizeof(*hdr));
136 skb_reset_network_header(skb);
137
138 skb_orphan(skb);
139
140 rt = mpls_route_input_rcu(net, dec.label);
141 if (!rt)
142 goto drop;
143
144 /* Find the output device */
145 out_dev = rt->rt_dev;
146 if (!mpls_output_possible(out_dev))
147 goto drop;
148
149 if (skb_warn_if_lro(skb))
150 goto drop;
151
152 skb_forward_csum(skb);
153
154 /* Verify ttl is valid */
155 if (dec.ttl <= 2)
156 goto drop;
157 dec.ttl -= 1;
158
159 /* Verify the destination can hold the packet */
160 new_header_size = mpls_rt_header_size(rt);
161 mtu = mpls_dev_mtu(out_dev);
162 if (mpls_pkt_too_big(skb, mtu - new_header_size))
163 goto drop;
164
165 hh_len = LL_RESERVED_SPACE(out_dev);
166 if (!out_dev->header_ops)
167 hh_len = 0;
168
169 /* Ensure there is enough space for the headers in the skb */
170 if (skb_cow(skb, hh_len + new_header_size))
171 goto drop;
172
173 skb->dev = out_dev;
174 skb->protocol = htons(ETH_P_MPLS_UC);
175
176 if (unlikely(!new_header_size && dec.bos)) {
177 /* Penultimate hop popping */
178 if (!mpls_egress(rt, skb, dec))
179 goto drop;
180 } else {
181 bool bos;
182 int i;
183 skb_push(skb, new_header_size);
184 skb_reset_network_header(skb);
185 /* Push the new labels */
186 hdr = mpls_hdr(skb);
187 bos = dec.bos;
188 for (i = rt->rt_labels - 1; i >= 0; i--) {
189 hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, bos);
190 bos = false;
191 }
192 }
193
194 err = neigh_xmit(rt->rt_via_family, out_dev, rt->rt_via, skb);
195 if (err)
196 net_dbg_ratelimited("%s: packet transmission failed: %d\n",
197 __func__, err);
198 return 0;
199
200drop:
201 kfree_skb(skb);
202 return NET_RX_DROP;
203}
204
205static struct packet_type mpls_packet_type __read_mostly = {
206 .type = cpu_to_be16(ETH_P_MPLS_UC),
207 .func = mpls_forward,
208};
209
210static struct mpls_route *mpls_rt_alloc(size_t alen)
211{
212 struct mpls_route *rt;
213
214 rt = kzalloc(GFP_KERNEL, sizeof(*rt) + alen);
215 if (rt)
216 rt->rt_via_alen = alen;
217 return rt;
218}
219
220static void mpls_rt_free(struct mpls_route *rt)
221{
222 if (rt)
223 kfree_rcu(rt, rt_rcu);
224}
225
226static void mpls_route_update(struct net *net, unsigned index,
227 struct net_device *dev, struct mpls_route *new,
228 const struct nl_info *info)
229{
230 struct mpls_route *rt, *old = NULL;
231
232 ASSERT_RTNL();
233
234 rt = net->mpls.platform_label[index];
235 if (!dev || (rt && (rt->rt_dev == dev))) {
236 rcu_assign_pointer(net->mpls.platform_label[index], new);
237 old = rt;
238 }
239
240 /* If we removed a route free it now */
241 mpls_rt_free(old);
242}
243
244static void mpls_ifdown(struct net_device *dev)
245{
246 struct net *net = dev_net(dev);
247 unsigned index;
248
249 for (index = 0; index < net->mpls.platform_labels; index++) {
250 struct mpls_route *rt = net->mpls.platform_label[index];
251 if (!rt)
252 continue;
253 if (rt->rt_dev != dev)
254 continue;
255 rt->rt_dev = NULL;
256 }
257}
258
259static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
260 void *ptr)
261{
262 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
263
264 switch(event) {
265 case NETDEV_UNREGISTER:
266 mpls_ifdown(dev);
267 break;
268 }
269 return NOTIFY_OK;
270}
271
272static struct notifier_block mpls_dev_notifier = {
273 .notifier_call = mpls_dev_notify,
274};
275
276static int mpls_net_init(struct net *net)
277{
278 net->mpls.platform_labels = 0;
279 net->mpls.platform_label = NULL;
280
281 return 0;
282}
283
284static void mpls_net_exit(struct net *net)
285{
286 unsigned int index;
287
288 /* An rcu grace period haselapsed since there was a device in
289 * the network namespace (and thus the last in fqlight packet)
290 * left this network namespace. This is because
291 * unregister_netdevice_many and netdev_run_todo has completed
292 * for each network device that was in this network namespace.
293 *
294 * As such no additional rcu synchronization is necessary when
295 * freeing the platform_label table.
296 */
297 rtnl_lock();
298 for (index = 0; index < net->mpls.platform_labels; index++) {
299 struct mpls_route *rt = net->mpls.platform_label[index];
300 rcu_assign_pointer(net->mpls.platform_label[index], NULL);
301 mpls_rt_free(rt);
302 }
303 rtnl_unlock();
304
305 kvfree(net->mpls.platform_label);
306}
307
308static struct pernet_operations mpls_net_ops = {
309 .init = mpls_net_init,
310 .exit = mpls_net_exit,
311};
312
313static int __init mpls_init(void)
314{
315 int err;
316
317 BUILD_BUG_ON(sizeof(struct mpls_shim_hdr) != 4);
318
319 err = register_pernet_subsys(&mpls_net_ops);
320 if (err)
321 goto out;
322
323 err = register_netdevice_notifier(&mpls_dev_notifier);
324 if (err)
325 goto out_unregister_pernet;
326
327 dev_add_pack(&mpls_packet_type);
328
329 err = 0;
330out:
331 return err;
332
333out_unregister_pernet:
334 unregister_pernet_subsys(&mpls_net_ops);
335 goto out;
336}
337module_init(mpls_init);
338
339static void __exit mpls_exit(void)
340{
341 dev_remove_pack(&mpls_packet_type);
342 unregister_netdevice_notifier(&mpls_dev_notifier);
343 unregister_pernet_subsys(&mpls_net_ops);
344}
345module_exit(mpls_exit);
346
347MODULE_DESCRIPTION("MultiProtocol Label Switching");
348MODULE_LICENSE("GPL v2");
349MODULE_ALIAS_NETPROTO(PF_MPLS);
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
new file mode 100644
index 000000000000..c2944cb84d48
--- /dev/null
+++ b/net/mpls/internal.h
@@ -0,0 +1,56 @@
1#ifndef MPLS_INTERNAL_H
2#define MPLS_INTERNAL_H
3
/* Reserved label values, with the RFC that assigns each one */
#define LABEL_IPV4_EXPLICIT_NULL	 0	/* RFC3032 */
#define LABEL_ROUTER_ALERT_LABEL	 1	/* RFC3032 */
#define LABEL_IPV6_EXPLICIT_NULL	 2	/* RFC3032 */
#define LABEL_IMPLICIT_NULL		 3	/* RFC3032 */
#define LABEL_ENTROPY_INDICATOR		 7	/* RFC6790 */
#define LABEL_GAL			13	/* RFC5586 */
#define LABEL_OAM_ALERT			14	/* RFC3429 */
#define LABEL_EXTENSION			15	/* RFC7274 */
12
13
14struct mpls_shim_hdr {
15 __be32 label_stack_entry;
16};
17
18struct mpls_entry_decoded {
19 u32 label;
20 u8 ttl;
21 u8 tc;
22 u8 bos;
23};
24
25struct sk_buff;
26
/* View @skb's network header as an MPLS label stack entry. */
static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
{
	unsigned char *nh = skb_network_header(skb);

	return (struct mpls_shim_hdr *)nh;
}
31
32static inline struct mpls_shim_hdr mpls_entry_encode(u32 label, unsigned ttl, unsigned tc, bool bos)
33{
34 struct mpls_shim_hdr result;
35 result.label_stack_entry =
36 cpu_to_be32((label << MPLS_LS_LABEL_SHIFT) |
37 (tc << MPLS_LS_TC_SHIFT) |
38 (bos ? (1 << MPLS_LS_S_SHIFT) : 0) |
39 (ttl << MPLS_LS_TTL_SHIFT));
40 return result;
41}
42
43static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *hdr)
44{
45 struct mpls_entry_decoded result;
46 unsigned entry = be32_to_cpu(hdr->label_stack_entry);
47
48 result.label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
49 result.ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
50 result.tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT;
51 result.bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT;
52
53 return result;
54}
55
56#endif /* MPLS_INTERNAL_H */