path: root/kernel
author    Linus Torvalds <torvalds@linux-foundation.org>  2014-12-11 17:27:06 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2014-12-11 17:27:06 -0500
commit    70e71ca0af244f48a5dcf56dc435243792e3a495 (patch)
tree      f7d9c4c4d9a857a00043e9bf6aa2d6f533a34778 /kernel
parent    bae41e45b7400496b9bf0c70c6004419d9987819 (diff)
parent    00c83b01d58068dfeb2e1351cca6fccf2a83fa8f (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller:

 1) New offloading infrastructure and example 'rocker' driver for offloading of switching and routing to hardware. This work was done by a large group of dedicated individuals, not limited to: Scott Feldman, Jiri Pirko, Thomas Graf, John Fastabend, Jamal Hadi Salim, Andy Gospodarek, Florian Fainelli, Roopa Prabhu

 2) Start making the networking operate on IOV iterators instead of modifying iov objects in-situ during transfers. Thanks to Al Viro and Herbert Xu.

 3) A set of new netlink interfaces for the TIPC stack, from Richard Alpe.

 4) Remove unnecessary looping during ipv6 routing lookups, from Martin KaFai Lau.

 5) Add PAUSE frame generation support to gianfar driver, from Matei Pavaluca.

 6) Allow for larger reordering levels in TCP, which are easily achievable in the real world right now, from Eric Dumazet.

 7) Add a variant of napi_schedule() that doesn't need to disable cpu interrupts, from Eric Dumazet.

 8) Use a doubly linked list to optimize neigh_parms_release(), from Nicolas Dichtel.

 9) Various enhancements to the kernel BPF verifier, and allow eBPF programs to actually be attached to sockets. From Alexei Starovoitov.

10) Support TSO/LSO in sunvnet driver, from David L Stevens.

11) Allow controlling ECN usage via routing metrics, from Florian Westphal.

12) Remote checksum offload, from Tom Herbert.

13) Add split-header receive, BQL, and xmit_more support to amd-xgbe driver, from Thomas Lendacky.

14) Add MPLS support to openvswitch, from Simon Horman.

15) Support wildcard tunnel endpoints in ipv6 tunnels, from Steffen Klassert.

16) Do gro flushes on a per-device basis using a timer, from Eric Dumazet. This tries to resolve the conflicting goals between the desired handling of bulk vs. RPC-like traffic.

17) Allow userspace to ask for the CPU upon which a packet was received/steered, via SO_INCOMING_CPU. From Eric Dumazet.

18) Limit GSO packets to half the current congestion window, from Eric Dumazet.

19) Add a generic helper so that all drivers set their RSS keys in a consistent way, from Eric Dumazet.

20) Add xmit_more support to enic driver, from Govindarajulu Varadarajan.

21) Add VLAN packet scheduler action, from Jiri Pirko.

22) Support configurable RSS hash functions via ethtool, from Eyal Perry.

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1820 commits)
  Fix race condition between vxlan_sock_add and vxlan_sock_release
  net/macb: fix compilation warning for print_hex_dump() called with skb->mac_header
  net/mlx4: Add support for A0 steering
  net/mlx4: Refactor QUERY_PORT
  net/mlx4_core: Add explicit error message when rule doesn't meet configuration
  net/mlx4: Add A0 hybrid steering
  net/mlx4: Add mlx4_bitmap zone allocator
  net/mlx4: Add a check if there are too many reserved QPs
  net/mlx4: Change QP allocation scheme
  net/mlx4_core: Use tasklet for user-space CQ completion events
  net/mlx4_core: Mask out host side virtualization features for guests
  net/mlx4_en: Set csum level for encapsulated packets
  be2net: Export tunnel offloads only when a VxLAN tunnel is created
  gianfar: Fix dma check map error when DMA_API_DEBUG is enabled
  cxgb4/csiostor: Don't use MASTER_MUST for fw_hello call
  net: fec: only enable mdio interrupt before phy device link up
  net: fec: clear all interrupt events to support i.MX6SX
  net: fec: reset fep link status in suspend function
  net: sock: fix access via invalid file descriptor
  net: introduce helper macro for_each_cmsghdr
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/Makefile      2
-rw-r--r--  kernel/bpf/arraymap.c  156
-rw-r--r--  kernel/bpf/hashtab.c   367
-rw-r--r--  kernel/bpf/helpers.c    89
-rw-r--r--  kernel/bpf/syscall.c     6
-rw-r--r--  kernel/bpf/test_stub.c  56
-rw-r--r--  kernel/bpf/verifier.c  171
7 files changed, 750 insertions(+), 97 deletions(-)
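
Taken together, the kernel/bpf changes below give user space usable map objects. As a minimal user-space sketch (not code from this tree) of what that enables, the program below creates one of the new array maps and round-trips a value through the bpf(2) syscall; it assumes the uapi definitions that land with this series (BPF_MAP_TYPE_ARRAY, the BPF_MAP_CREATE/BPF_MAP_UPDATE_ELEM/BPF_MAP_LOOKUP_ELEM commands, the new BPF_ANY update flag) and headers that expose __NR_bpf.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* thin wrapper over the raw syscall; glibc has no bpf() wrapper at this point */
static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	union bpf_attr attr;
	__u32 key = 3;
	__u64 value = 42, out = 0;
	int map_fd;

	/* BPF_MAP_TYPE_ARRAY requires key_size == 4; all slots are preallocated */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_ARRAY;
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(value);
	attr.max_entries = 8;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return 1;

	/* write slot 3; BPF_ANY creates or overwrites */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (unsigned long) &key;
	attr.value = (unsigned long) &value;
	attr.flags = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) != 0)
		return 1;

	/* read it back into 'out' */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (unsigned long) &key;
	attr.value = (unsigned long) &out;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) != 0)
		return 1;

	printf("slot 3 = %llu\n", (unsigned long long) out);
	close(map_fd);
	return 0;
}

Note that an array map rejects BPF_NOEXIST (every slot always exists), and that a lookup of a missing hash-map key now returns -ENOENT rather than -ESRCH, matching the syscall.c change further down.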
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0daf7f6ae7df..a5ae60f0b0a2 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
 obj-y := core.o
-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o
 ifdef CONFIG_TEST_BPF
 obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
 endif
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
new file mode 100644
index 000000000000..9eb4d8a7cd87
--- /dev/null
+++ b/kernel/bpf/arraymap.c
@@ -0,0 +1,156 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+
+struct bpf_array {
+	struct bpf_map map;
+	u32 elem_size;
+	char value[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *array_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_array *array;
+	u32 elem_size, array_size;
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 || attr->key_size != 4 ||
+	    attr->value_size == 0)
+		return ERR_PTR(-EINVAL);
+
+	elem_size = round_up(attr->value_size, 8);
+
+	/* check round_up into zero and u32 overflow */
+	if (elem_size == 0 ||
+	    attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size)
+		return ERR_PTR(-ENOMEM);
+
+	array_size = sizeof(*array) + attr->max_entries * elem_size;
+
+	/* allocate all map elements and zero-initialize them */
+	array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
+	if (!array) {
+		array = vzalloc(array_size);
+		if (!array)
+			return ERR_PTR(-ENOMEM);
+	}
+
+	/* copy mandatory map attributes */
+	array->map.key_size = attr->key_size;
+	array->map.value_size = attr->value_size;
+	array->map.max_entries = attr->max_entries;
+
+	array->elem_size = elem_size;
+
+	return &array->map;
+}
+
+/* Called from syscall or from eBPF program */
+static void *array_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+
+	if (index >= array->map.max_entries)
+		return NULL;
+
+	return array->value + array->elem_size * index;
+}
+
+/* Called from syscall */
+static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+	u32 *next = (u32 *)next_key;
+
+	if (index >= array->map.max_entries) {
+		*next = 0;
+		return 0;
+	}
+
+	if (index == array->map.max_entries - 1)
+		return -ENOENT;
+
+	*next = index + 1;
+	return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
+				 u64 map_flags)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	u32 index = *(u32 *)key;
+
+	if (map_flags > BPF_EXIST)
+		/* unknown flags */
+		return -EINVAL;
+
+	if (index >= array->map.max_entries)
+		/* all elements were pre-allocated, cannot insert a new one */
+		return -E2BIG;
+
+	if (map_flags == BPF_NOEXIST)
+		/* all elements already exist */
+		return -EEXIST;
+
+	memcpy(array->value + array->elem_size * index, value, array->elem_size);
+	return 0;
+}
+
+/* Called from syscall or from eBPF program */
+static int array_map_delete_elem(struct bpf_map *map, void *key)
+{
+	return -EINVAL;
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void array_map_free(struct bpf_map *map)
+{
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding programs to complete
+	 * and free the array
+	 */
+	synchronize_rcu();
+
+	kvfree(array);
+}
+
+static struct bpf_map_ops array_ops = {
+	.map_alloc = array_map_alloc,
+	.map_free = array_map_free,
+	.map_get_next_key = array_map_get_next_key,
+	.map_lookup_elem = array_map_lookup_elem,
+	.map_update_elem = array_map_update_elem,
+	.map_delete_elem = array_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+	.ops = &array_ops,
+	.type = BPF_MAP_TYPE_ARRAY,
+};
+
+static int __init register_array_map(void)
+{
+	bpf_register_map_type(&tl);
+	return 0;
+}
+late_initcall(register_array_map);
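
A quick worked example of the sizing logic in array_map_alloc() above (the numbers are hypothetical, not taken from the patch):

	/* attr->value_size = 12, attr->max_entries = 4 */
	elem_size  = round_up(12, 8);		/* 16: keeps every element 8-byte aligned */
	array_size = sizeof(*array) + 4 * 16;	/* header plus all elements, allocated up front */
	/* element i lives at array->value + 16 * i; array_map_lookup_elem()
	 * returns that address directly, so callers read and write the value
	 * in place with no per-element allocation or locking
	 */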
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
new file mode 100644
index 000000000000..b3ba43674310
--- /dev/null
+++ b/kernel/bpf/hashtab.c
@@ -0,0 +1,367 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/jhash.h>
+#include <linux/filter.h>
+#include <linux/vmalloc.h>
+
+struct bpf_htab {
+	struct bpf_map map;
+	struct hlist_head *buckets;
+	spinlock_t lock;
+	u32 count;	/* number of elements in this hashtable */
+	u32 n_buckets;	/* number of hash buckets */
+	u32 elem_size;	/* size of each element in bytes */
+};
+
+/* each htab element is struct htab_elem + key + value */
+struct htab_elem {
+	struct hlist_node hash_node;
+	struct rcu_head rcu;
+	u32 hash;
+	char key[0] __aligned(8);
+};
+
+/* Called from syscall */
+static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
+{
+	struct bpf_htab *htab;
+	int err, i;
+
+	htab = kzalloc(sizeof(*htab), GFP_USER);
+	if (!htab)
+		return ERR_PTR(-ENOMEM);
+
+	/* mandatory map attributes */
+	htab->map.key_size = attr->key_size;
+	htab->map.value_size = attr->value_size;
+	htab->map.max_entries = attr->max_entries;
+
+	/* check sanity of attributes.
+	 * value_size == 0 may be allowed in the future to use map as a set
+	 */
+	err = -EINVAL;
+	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
+	    htab->map.value_size == 0)
+		goto free_htab;
+
+	/* hash table size must be power of 2 */
+	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
+
+	err = -E2BIG;
+	if (htab->map.key_size > MAX_BPF_STACK)
+		/* eBPF programs initialize keys on stack, so they cannot be
+		 * larger than max stack size
+		 */
+		goto free_htab;
+
+	err = -ENOMEM;
+	/* prevent zero size kmalloc and check for u32 overflow */
+	if (htab->n_buckets == 0 ||
+	    htab->n_buckets > U32_MAX / sizeof(struct hlist_head))
+		goto free_htab;
+
+	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+				      GFP_USER | __GFP_NOWARN);
+
+	if (!htab->buckets) {
+		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+		if (!htab->buckets)
+			goto free_htab;
+	}
+
+	for (i = 0; i < htab->n_buckets; i++)
+		INIT_HLIST_HEAD(&htab->buckets[i]);
+
+	spin_lock_init(&htab->lock);
+	htab->count = 0;
+
+	htab->elem_size = sizeof(struct htab_elem) +
+			  round_up(htab->map.key_size, 8) +
+			  htab->map.value_size;
+	return &htab->map;
+
+free_htab:
+	kfree(htab);
+	return ERR_PTR(err);
+}
+
+static inline u32 htab_map_hash(const void *key, u32 key_len)
+{
+	return jhash(key, key_len, 0);
+}
+
+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+	return &htab->buckets[hash & (htab->n_buckets - 1)];
+}
+
+static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
+					 void *key, u32 key_size)
+{
+	struct htab_elem *l;
+
+	hlist_for_each_entry_rcu(l, head, hash_node)
+		if (l->hash == hash && !memcmp(&l->key, key, key_size))
+			return l;
+
+	return NULL;
+}
+
+/* Called from syscall or from eBPF program */
+static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	u32 hash, key_size;
+
+	/* Must be called with rcu_read_lock. */
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	head = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (l)
+		return l->key + round_up(map->key_size, 8);
+
+	return NULL;
+}
+
+/* Called from syscall */
+static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l, *next_l;
+	u32 hash, key_size;
+	int i;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	head = select_bucket(htab, hash);
+
+	/* lookup the key */
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (!l) {
+		i = 0;
+		goto find_first_elem;
+	}
+
+	/* key was found, get next key in the same bucket */
+	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
+				  struct htab_elem, hash_node);
+
+	if (next_l) {
+		/* if next elem in this hash list is non-zero, just return it */
+		memcpy(next_key, next_l->key, key_size);
+		return 0;
+	}
+
+	/* no more elements in this hash list, go to the next bucket */
+	i = hash & (htab->n_buckets - 1);
+	i++;
+
+find_first_elem:
+	/* iterate over buckets */
+	for (; i < htab->n_buckets; i++) {
+		head = select_bucket(htab, i);
+
+		/* pick first element in the bucket */
+		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+					  struct htab_elem, hash_node);
+		if (next_l) {
+			/* if it's not empty, just return it */
+			memcpy(next_key, next_l->key, key_size);
+			return 0;
+		}
+	}
+
+	/* iterated over all buckets and all elements */
+	return -ENOENT;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
+				u64 map_flags)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct htab_elem *l_new, *l_old;
+	struct hlist_head *head;
+	unsigned long flags;
+	u32 key_size;
+	int ret;
+
+	if (map_flags > BPF_EXIST)
+		/* unknown flags */
+		return -EINVAL;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* allocate new element outside of lock */
+	l_new = kmalloc(htab->elem_size, GFP_ATOMIC);
+	if (!l_new)
+		return -ENOMEM;
+
+	key_size = map->key_size;
+
+	memcpy(l_new->key, key, key_size);
+	memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);
+
+	l_new->hash = htab_map_hash(l_new->key, key_size);
+
+	/* bpf_map_update_elem() can be called in_irq() */
+	spin_lock_irqsave(&htab->lock, flags);
+
+	head = select_bucket(htab, l_new->hash);
+
+	l_old = lookup_elem_raw(head, l_new->hash, key, key_size);
+
+	if (!l_old && unlikely(htab->count >= map->max_entries)) {
+		/* if elem with this 'key' doesn't exist and we've reached
+		 * max_entries limit, fail insertion of new elem
+		 */
+		ret = -E2BIG;
+		goto err;
+	}
+
+	if (l_old && map_flags == BPF_NOEXIST) {
+		/* elem already exists */
+		ret = -EEXIST;
+		goto err;
+	}
+
+	if (!l_old && map_flags == BPF_EXIST) {
+		/* elem doesn't exist, cannot update it */
+		ret = -ENOENT;
+		goto err;
+	}
+
+	/* add new element to the head of the list, so that concurrent
+	 * search will find it before old elem
+	 */
+	hlist_add_head_rcu(&l_new->hash_node, head);
+	if (l_old) {
+		hlist_del_rcu(&l_old->hash_node);
+		kfree_rcu(l_old, rcu);
+	} else {
+		htab->count++;
+	}
+	spin_unlock_irqrestore(&htab->lock, flags);
+
+	return 0;
+err:
+	spin_unlock_irqrestore(&htab->lock, flags);
+	kfree(l_new);
+	return ret;
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_map_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+	struct hlist_head *head;
+	struct htab_elem *l;
+	unsigned long flags;
+	u32 hash, key_size;
+	int ret = -ENOENT;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	key_size = map->key_size;
+
+	hash = htab_map_hash(key, key_size);
+
+	spin_lock_irqsave(&htab->lock, flags);
+
+	head = select_bucket(htab, hash);
+
+	l = lookup_elem_raw(head, hash, key, key_size);
+
+	if (l) {
+		hlist_del_rcu(&l->hash_node);
+		htab->count--;
+		kfree_rcu(l, rcu);
+		ret = 0;
+	}
+
+	spin_unlock_irqrestore(&htab->lock, flags);
+	return ret;
+}
+
+static void delete_all_elements(struct bpf_htab *htab)
+{
+	int i;
+
+	for (i = 0; i < htab->n_buckets; i++) {
+		struct hlist_head *head = select_bucket(htab, i);
+		struct hlist_node *n;
+		struct htab_elem *l;
+
+		hlist_for_each_entry_safe(l, n, head, hash_node) {
+			hlist_del_rcu(&l->hash_node);
+			htab->count--;
+			kfree(l);
+		}
+	}
+}
+
+/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
+static void htab_map_free(struct bpf_map *map)
+{
+	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+
+	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
+	 * so the programs (can be more than one that used this map) were
+	 * disconnected from events. Wait for outstanding critical sections in
+	 * these programs to complete
+	 */
+	synchronize_rcu();
+
+	/* some of kfree_rcu() callbacks for elements of this map may not have
+	 * executed. It's ok. Proceed to free residual elements and map itself
+	 */
+	delete_all_elements(htab);
+	kvfree(htab->buckets);
+	kfree(htab);
+}
+
+static struct bpf_map_ops htab_ops = {
+	.map_alloc = htab_map_alloc,
+	.map_free = htab_map_free,
+	.map_get_next_key = htab_map_get_next_key,
+	.map_lookup_elem = htab_map_lookup_elem,
+	.map_update_elem = htab_map_update_elem,
+	.map_delete_elem = htab_map_delete_elem,
+};
+
+static struct bpf_map_type_list tl = {
+	.ops = &htab_ops,
+	.type = BPF_MAP_TYPE_HASH,
+};
+
+static int __init register_htab_map(void)
+{
+	bpf_register_map_type(&tl);
+	return 0;
+}
+late_initcall(register_htab_map);
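
The BPF_NOEXIST/BPF_EXIST/BPF_ANY handling in htab_map_update_elem() is easiest to see from the syscall side. Below is a hedged sketch: the bpf_update() wrapper is hypothetical, written against the bpf(2) uapi introduced by this series, and the errno values mirror the error codes returned above.

#include <assert.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* minimal BPF_MAP_UPDATE_ELEM wrapper over the raw syscall */
static int bpf_update(int fd, const void *key, const void *value, __u64 flags)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = fd;
	attr.key = (unsigned long) key;
	attr.value = (unsigned long) value;
	attr.flags = flags;
	return syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}

/* map_fd: a freshly created BPF_MAP_TYPE_HASH with 8-byte keys and values */
static void demo_update_flags(int map_fd)
{
	unsigned long long key = 1, val = 100;

	/* key absent: BPF_EXIST fails with ENOENT, BPF_NOEXIST inserts */
	assert(bpf_update(map_fd, &key, &val, BPF_EXIST) < 0 && errno == ENOENT);
	assert(bpf_update(map_fd, &key, &val, BPF_NOEXIST) == 0);

	/* key present: BPF_NOEXIST fails with EEXIST, BPF_EXIST and BPF_ANY replace */
	assert(bpf_update(map_fd, &key, &val, BPF_NOEXIST) < 0 && errno == EEXIST);
	assert(bpf_update(map_fd, &key, &val, BPF_EXIST) == 0);
	assert(bpf_update(map_fd, &key, &val, BPF_ANY) == 0);
}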
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
new file mode 100644
index 000000000000..9e3414d85459
--- /dev/null
+++ b/kernel/bpf/helpers.c
@@ -0,0 +1,89 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+
+/* If kernel subsystem is allowing eBPF programs to call this function,
+ * inside its own verifier_ops->get_func_proto() callback it should return
+ * bpf_map_lookup_elem_proto, so that verifier can properly check the arguments
+ *
+ * Different map implementations will rely on rcu in map methods
+ * lookup/update/delete, therefore eBPF programs must run under rcu lock
+ * if program is allowed to access maps, so check rcu_read_lock_held in
+ * all three functions.
+ */
+static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	/* verifier checked that R1 contains a valid pointer to bpf_map
+	 * and R2 points to a program stack and map->key_size bytes were
+	 * initialized
+	 */
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+	void *value;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	value = map->ops->map_lookup_elem(map, key);
+
+	/* lookup() returns either pointer to element value or NULL
+	 * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type
+	 */
+	return (unsigned long) value;
+}
+
+struct bpf_func_proto bpf_map_lookup_elem_proto = {
+	.func = bpf_map_lookup_elem,
+	.gpl_only = false,
+	.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+};
+
+static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+	void *value = (void *) (unsigned long) r3;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	return map->ops->map_update_elem(map, key, value, r4);
+}
+
+struct bpf_func_proto bpf_map_update_elem_proto = {
+	.func = bpf_map_update_elem,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+	.arg3_type = ARG_PTR_TO_MAP_VALUE,
+	.arg4_type = ARG_ANYTHING,
+};
+
+static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+	struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+	void *key = (void *) (unsigned long) r2;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	return map->ops->map_delete_elem(map, key);
+}
+
+struct bpf_func_proto bpf_map_delete_elem_proto = {
+	.func = bpf_map_delete_elem,
+	.gpl_only = false,
+	.ret_type = RET_INTEGER,
+	.arg1_type = ARG_CONST_MAP_PTR,
+	.arg2_type = ARG_PTR_TO_MAP_KEY,
+};
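
To see how these protos are consumed, here is a hedged sketch of the instruction sequence a socket-filter program would use to call BPF_FUNC_map_lookup_elem, written with the BPF_*() macros from include/linux/filter.h. BPF_LD_MAP_FD() is the two-instruction pseudo ld_imm64 convenience used by the kernel samples; map_fd and a map with 4-byte keys and 4-byte values are assumed, and program loading via BPF_PROG_LOAD is omitted.

	struct bpf_insn prog[] = {
		/* *(u32 *)(fp - 4) = 0; the key must be initialized on the stack */
		BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),
		/* r2 = fp - 4 (ARG_PTR_TO_MAP_KEY) */
		BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
		BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
		/* r1 = map (ARG_CONST_MAP_PTR; the fd is rewritten at load time) */
		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
		/* r0 = bpf_map_lookup_elem(r1, r2): RET_PTR_TO_MAP_VALUE_OR_NULL */
		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
		/* the verifier only lets the program dereference r0 after this check */
		BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
		BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0),	/* r0 = *(u32 *)value */
		BPF_EXIT_INSN(),
		BPF_MOV64_IMM(BPF_REG_0, 0),			/* miss: return 0 */
		BPF_EXIT_INSN(),
	};

The WARN_ON_ONCE(!rcu_read_lock_held()) checks above hold on this path because socket-filter programs are run under rcu_read_lock() in the socket receive path.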
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba61c8c16032..088ac0b1b106 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -169,7 +169,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (copy_from_user(key, ukey, map->key_size) != 0)
 		goto free_key;
 
-	err = -ESRCH;
+	err = -ENOENT;
 	rcu_read_lock();
 	value = map->ops->map_lookup_elem(map, key);
 	if (!value)
@@ -190,7 +190,7 @@ err_put:
 	return err;
 }
 
-#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
 
 static int map_update_elem(union bpf_attr *attr)
 {
@@ -231,7 +231,7 @@ static int map_update_elem(union bpf_attr *attr)
 	 * therefore all map accessors rely on this fact, so do the same here
 	 */
 	rcu_read_lock();
-	err = map->ops->map_update_elem(map, key, value);
+	err = map->ops->map_update_elem(map, key, value, attr->flags);
 	rcu_read_unlock();
 
 free_value:
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
index fcaddff4003e..0ceae1e6e8b5 100644
--- a/kernel/bpf/test_stub.c
+++ b/kernel/bpf/test_stub.c
@@ -18,26 +18,18 @@ struct bpf_context {
 	u64 arg2;
 };
 
-static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
-{
-	return 0;
-}
-
-static struct bpf_func_proto test_funcs[] = {
-	[BPF_FUNC_unspec] = {
-		.func = test_func,
-		.gpl_only = true,
-		.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
-		.arg1_type = ARG_CONST_MAP_PTR,
-		.arg2_type = ARG_PTR_TO_MAP_KEY,
-	},
-};
-
 static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
 {
-	if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
+	switch (func_id) {
+	case BPF_FUNC_map_lookup_elem:
+		return &bpf_map_lookup_elem_proto;
+	case BPF_FUNC_map_update_elem:
+		return &bpf_map_update_elem_proto;
+	case BPF_FUNC_map_delete_elem:
+		return &bpf_map_delete_elem_proto;
+	default:
 		return NULL;
-	return &test_funcs[func_id];
+	}
 }
 
 static const struct bpf_context_access {
@@ -78,38 +70,8 @@ static struct bpf_prog_type_list tl_prog = {
 	.type = BPF_PROG_TYPE_UNSPEC,
 };
 
-static struct bpf_map *test_map_alloc(union bpf_attr *attr)
-{
-	struct bpf_map *map;
-
-	map = kzalloc(sizeof(*map), GFP_USER);
-	if (!map)
-		return ERR_PTR(-ENOMEM);
-
-	map->key_size = attr->key_size;
-	map->value_size = attr->value_size;
-	map->max_entries = attr->max_entries;
-	return map;
-}
-
-static void test_map_free(struct bpf_map *map)
-{
-	kfree(map);
-}
-
-static struct bpf_map_ops test_map_ops = {
-	.map_alloc = test_map_alloc,
-	.map_free = test_map_free,
-};
-
-static struct bpf_map_type_list tl_map = {
-	.ops = &test_map_ops,
-	.type = BPF_MAP_TYPE_UNSPEC,
-};
-
 static int __init register_test_ops(void)
 {
-	bpf_register_map_type(&tl_map);
 	bpf_register_prog_type(&tl_prog);
 	return 0;
 }
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9f81818f2941..a28e09c7825d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -153,22 +153,19 @@ struct reg_state {
 
 enum bpf_stack_slot_type {
 	STACK_INVALID,    /* nothing was stored in this stack slot */
-	STACK_SPILL,      /* 1st byte of register spilled into stack */
-	STACK_SPILL_PART, /* other 7 bytes of register spill */
+	STACK_SPILL,      /* register spilled into stack */
 	STACK_MISC	  /* BPF program wrote some data into this slot */
 };
 
-struct bpf_stack_slot {
-	enum bpf_stack_slot_type stype;
-	struct reg_state reg_st;
-};
+#define BPF_REG_SIZE 8	/* size of eBPF register in bytes */
 
 /* state of the program:
  * type of all registers and stack info
  */
 struct verifier_state {
 	struct reg_state regs[MAX_BPF_REG];
-	struct bpf_stack_slot stack[MAX_BPF_STACK];
+	u8 stack_slot_type[MAX_BPF_STACK];
+	struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
 };
 
 /* linked list of verifier states used to prune search */
@@ -259,10 +256,10 @@ static void print_verifier_state(struct verifier_env *env)
 				env->cur_state.regs[i].map_ptr->key_size,
 				env->cur_state.regs[i].map_ptr->value_size);
 	}
-	for (i = 0; i < MAX_BPF_STACK; i++) {
-		if (env->cur_state.stack[i].stype == STACK_SPILL)
+	for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+		if (env->cur_state.stack_slot_type[i] == STACK_SPILL)
 			verbose(" fp%d=%s", -MAX_BPF_STACK + i,
-				reg_type_str[env->cur_state.stack[i].reg_st.type]);
+				reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]);
 	}
 	verbose("\n");
 }
@@ -539,8 +536,10 @@ static int bpf_size_to_bytes(int bpf_size)
 static int check_stack_write(struct verifier_state *state, int off, int size,
 			     int value_regno)
 {
-	struct bpf_stack_slot *slot;
 	int i;
+	/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
+	 * so it's aligned access and [off, off + size) are within stack limits
+	 */
 
 	if (value_regno >= 0 &&
 	    (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
@@ -548,30 +547,24 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
 	     state->regs[value_regno].type == PTR_TO_CTX)) {
 
 		/* register containing pointer is being spilled into stack */
-		if (size != 8) {
+		if (size != BPF_REG_SIZE) {
 			verbose("invalid size of register spill\n");
 			return -EACCES;
 		}
 
-		slot = &state->stack[MAX_BPF_STACK + off];
-		slot->stype = STACK_SPILL;
 		/* save register state */
-		slot->reg_st = state->regs[value_regno];
-		for (i = 1; i < 8; i++) {
-			slot = &state->stack[MAX_BPF_STACK + off + i];
-			slot->stype = STACK_SPILL_PART;
-			slot->reg_st.type = UNKNOWN_VALUE;
-			slot->reg_st.map_ptr = NULL;
-		}
-	} else {
+		state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+			state->regs[value_regno];
 
+		for (i = 0; i < BPF_REG_SIZE; i++)
+			state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+	} else {
 		/* regular write of data into stack */
-		for (i = 0; i < size; i++) {
-			slot = &state->stack[MAX_BPF_STACK + off + i];
-			slot->stype = STACK_MISC;
-			slot->reg_st.type = UNKNOWN_VALUE;
-			slot->reg_st.map_ptr = NULL;
-		}
+		state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] =
+			(struct reg_state) {};
+
+		for (i = 0; i < size; i++)
+			state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
 	}
 	return 0;
 }
@@ -579,19 +572,18 @@ static int check_stack_write(struct verifier_state *state, int off, int size,
 static int check_stack_read(struct verifier_state *state, int off, int size,
 			    int value_regno)
 {
+	u8 *slot_type;
 	int i;
-	struct bpf_stack_slot *slot;
 
-	slot = &state->stack[MAX_BPF_STACK + off];
+	slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
 
-	if (slot->stype == STACK_SPILL) {
-		if (size != 8) {
+	if (slot_type[0] == STACK_SPILL) {
+		if (size != BPF_REG_SIZE) {
 			verbose("invalid size of register spill\n");
 			return -EACCES;
 		}
-		for (i = 1; i < 8; i++) {
-			if (state->stack[MAX_BPF_STACK + off + i].stype !=
-			    STACK_SPILL_PART) {
+		for (i = 1; i < BPF_REG_SIZE; i++) {
+			if (slot_type[i] != STACK_SPILL) {
 				verbose("corrupted spill memory\n");
 				return -EACCES;
 			}
@@ -599,12 +591,12 @@ static int check_stack_read(struct verifier_state *state, int off, int size,
 
 		if (value_regno >= 0)
 			/* restore register state from stack */
-			state->regs[value_regno] = slot->reg_st;
+			state->regs[value_regno] =
+				state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE];
 		return 0;
 	} else {
 		for (i = 0; i < size; i++) {
-			if (state->stack[MAX_BPF_STACK + off + i].stype !=
-			    STACK_MISC) {
+			if (slot_type[i] != STACK_MISC) {
 				verbose("invalid read from stack off %d+%d size %d\n",
 					off, i, size);
 				return -EACCES;
@@ -747,7 +739,7 @@ static int check_stack_boundary(struct verifier_env *env,
 	}
 
 	for (i = 0; i < access_size; i++) {
-		if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) {
+		if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
 			verbose("invalid indirect read from stack off %d+%d size %d\n",
 				off, i, access_size);
 			return -EACCES;
@@ -1180,6 +1172,70 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
 	return 0;
 }
 
+/* verify safety of LD_ABS|LD_IND instructions:
+ * - they can only appear in the programs where ctx == skb
+ * - since they are wrappers of function calls, they scratch R1-R5 registers,
+ *   preserve R6-R9, and store return value into R0
+ *
+ * Implicit input:
+ *   ctx == skb == R6 == CTX
+ *
+ * Explicit input:
+ *   SRC == any register
+ *   IMM == 32-bit immediate
+ *
+ * Output:
+ *   R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
+{
+	struct reg_state *regs = env->cur_state.regs;
+	u8 mode = BPF_MODE(insn->code);
+	struct reg_state *reg;
+	int i, err;
+
+	if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) {
+		verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n");
+		return -EINVAL;
+	}
+
+	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
+		verbose("BPF_LD_ABS uses reserved fields\n");
+		return -EINVAL;
+	}
+
+	/* check whether implicit source operand (register R6) is readable */
+	err = check_reg_arg(regs, BPF_REG_6, SRC_OP);
+	if (err)
+		return err;
+
+	if (regs[BPF_REG_6].type != PTR_TO_CTX) {
+		verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
+		return -EINVAL;
+	}
+
+	if (mode == BPF_IND) {
+		/* check explicit source operand */
+		err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+		if (err)
+			return err;
+	}
+
+	/* reset caller saved regs to unreadable */
+	for (i = 0; i < CALLER_SAVED_REGS; i++) {
+		reg = regs + caller_saved[i];
+		reg->type = NOT_INIT;
+		reg->imm = 0;
+	}
+
+	/* mark destination R0 register as readable, since it contains
+	 * the value fetched from the packet
+	 */
+	regs[BPF_REG_0].type = UNKNOWN_VALUE;
+	return 0;
+}
+
 /* non-recursive DFS pseudo code
  * 1  procedure DFS-iterative(G,v):
  * 2      label v as discovered
@@ -1417,12 +1473,33 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
 	}
 
 	for (i = 0; i < MAX_BPF_STACK; i++) {
-		if (memcmp(&old->stack[i], &cur->stack[i],
-			   sizeof(old->stack[0])) != 0) {
-			if (old->stack[i].stype == STACK_INVALID)
-				continue;
+		if (old->stack_slot_type[i] == STACK_INVALID)
+			continue;
+		if (old->stack_slot_type[i] != cur->stack_slot_type[i])
+			/* Ex: old explored (safe) state has STACK_SPILL in
+			 * this stack slot, but current has STACK_MISC ->
+			 * these verifier states are not equivalent,
+			 * return false to continue verification of this path
+			 */
 			return false;
-		}
+		if (i % BPF_REG_SIZE)
+			continue;
+		if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
+			   &cur->spilled_regs[i / BPF_REG_SIZE],
+			   sizeof(old->spilled_regs[0])))
+			/* when explored and current stack slot types are
+			 * the same, check that stored pointers types
+			 * are the same as well.
+			 * Ex: explored safe path could have stored
+			 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8}
+			 * but current path has stored:
+			 * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16}
+			 * such verifier states are not equivalent.
+			 * return false to continue verification of this path
+			 */
+			return false;
+		else
+			continue;
 	}
 	return true;
 }
@@ -1664,8 +1741,10 @@ process_bpf_exit:
 			u8 mode = BPF_MODE(insn->code);
 
 			if (mode == BPF_ABS || mode == BPF_IND) {
-				verbose("LD_ABS is not supported yet\n");
-				return -EINVAL;
+				err = check_ld_abs(env, insn);
+				if (err)
+					return err;
+
 			} else if (mode == BPF_IMM) {
 				err = check_ld_imm(env, insn);
 				if (err)
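
The spill/fill tracking introduced above can be pictured with a short, hypothetical fragment in the same BPF_*() macro style (not part of this patch):

	/* spill the ctx pointer (r1) to fp-8, then fill it back into r3 */
	BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1, -8),	/* 8 stack bytes become STACK_SPILL,
							 * spilled_regs[] records PTR_TO_CTX
							 */
	BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_10, -8),	/* r3 is restored as PTR_TO_CTX */

Only full BPF_REG_SIZE (8-byte) spills of pointer registers are tracked this way; a narrower read of the same slot, e.g. BPF_LDX_MEM(BPF_W, ...), is rejected with "invalid size of register spill", and writes of non-pointer data simply mark those bytes STACK_MISC.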