Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/Makefile          |    2
-rw-r--r--  kernel/bpf/bpf_lru_list.c    |   20
-rw-r--r--  kernel/bpf/core.c            |    9
-rw-r--r--  kernel/bpf/helpers.c         |    4
-rw-r--r--  kernel/bpf/inode.c           |   17
-rw-r--r--  kernel/bpf/lpm_trie.c        |  521
-rw-r--r--  kernel/bpf/syscall.c         |   19
-rw-r--r--  kernel/bpf/verifier.c        |  294
-rw-r--r--  kernel/trace/bpf_trace.c     |   76
-rw-r--r--  kernel/trace/trace_output.c  |   18
10 files changed, 855 insertions(+), 125 deletions(-)
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1276474ac3cd..e1ce4f4fd7fd 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
 obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 89b7ef41c86b..f62d1d56f41d 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -213,11 +213,10 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
 				       enum bpf_lru_list_type tgt_free_type)
 {
 	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
-	struct bpf_lru_node *node, *tmp_node, *first_node;
+	struct bpf_lru_node *node, *tmp_node;
 	unsigned int nshrinked = 0;
 	unsigned int i = 0;
 
-	first_node = list_first_entry(inactive, struct bpf_lru_node, list);
 	list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
 		if (bpf_lru_node_is_ref(node)) {
 			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
@@ -361,7 +360,8 @@ static void __local_list_add_pending(struct bpf_lru *lru,
 	list_add(&node->list, local_pending_list(loc_l));
 }
 
-struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l)
+static struct bpf_lru_node *
+__local_list_pop_free(struct bpf_lru_locallist *loc_l)
 {
 	struct bpf_lru_node *node;
 
@@ -374,8 +374,8 @@ struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l)
 	return node;
 }
 
-struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru,
-					      struct bpf_lru_locallist *loc_l)
+static struct bpf_lru_node *
+__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
 {
 	struct bpf_lru_node *node;
 	bool force = false;
@@ -558,8 +558,9 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
 	bpf_common_lru_push_free(lru, node);
 }
 
-void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
-			     u32 elem_size, u32 nr_elems)
+static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
+				    u32 node_offset, u32 elem_size,
+				    u32 nr_elems)
 {
 	struct bpf_lru_list *l = &lru->common_lru.lru_list;
 	u32 i;
@@ -575,8 +576,9 @@ void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
 	}
 }
 
-void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
-			     u32 elem_size, u32 nr_elems)
+static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
+				    u32 node_offset, u32 elem_size,
+				    u32 nr_elems)
 {
 	u32 i, pcpu_entries;
 	int cpu;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 503d4211988a..fddd76b1b627 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1173,3 +1173,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 {
 	return -EFAULT;
 }
+
+/* All definitions of tracepoints related to BPF. */
+#define CREATE_TRACE_POINTS
+#include <linux/bpf_trace.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 045cbe673356..3d24e238221e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -176,6 +176,6 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.func		= bpf_get_current_comm,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_RAW_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
 };
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 0b030c9126d3..fddcae801724 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -21,6 +21,7 @@
 #include <linux/parser.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 
 enum bpf_type {
 	BPF_TYPE_UNSPEC	= 0,
@@ -281,6 +282,13 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
 	ret = bpf_obj_do_pin(pname, raw, type);
 	if (ret != 0)
 		bpf_any_put(raw, type);
+	if ((trace_bpf_obj_pin_prog_enabled() ||
+	     trace_bpf_obj_pin_map_enabled()) && !ret) {
+		if (type == BPF_TYPE_PROG)
+			trace_bpf_obj_pin_prog(raw, ufd, pname);
+		if (type == BPF_TYPE_MAP)
+			trace_bpf_obj_pin_map(raw, ufd, pname);
+	}
 out:
 	putname(pname);
 	return ret;
@@ -342,8 +350,15 @@ int bpf_obj_get_user(const char __user *pathname)
 	else
 		goto out;
 
-	if (ret < 0)
+	if (ret < 0) {
 		bpf_any_put(raw, type);
+	} else if (trace_bpf_obj_get_prog_enabled() ||
+		   trace_bpf_obj_get_map_enabled()) {
+		if (type == BPF_TYPE_PROG)
+			trace_bpf_obj_get_prog(raw, ret, pname);
+		if (type == BPF_TYPE_MAP)
+			trace_bpf_obj_get_map(raw, ret, pname);
+	}
 out:
 	putname(pname);
 	return ret;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
new file mode 100644
index 000000000000..e0f6a0bd279b
--- /dev/null
+++ b/kernel/bpf/lpm_trie.c
@@ -0,0 +1,521 @@
+/*
+ * Longest prefix match list implementation
+ *
+ * Copyright (c) 2016,2017 Daniel Mack
+ * Copyright (c) 2016 David Herrmann
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <net/ipv6.h>
+
+/* Intermediate node */
+#define LPM_TREE_NODE_FLAG_IM BIT(0)
+
+struct lpm_trie_node;
+
+struct lpm_trie_node {
+	struct rcu_head rcu;
+	struct lpm_trie_node __rcu *child[2];
+	u32 prefixlen;
+	u32 flags;
+	u8 data[0];
+};
+
+struct lpm_trie {
+	struct bpf_map map;
+	struct lpm_trie_node __rcu *root;
+	size_t n_entries;
+	size_t max_prefixlen;
+	size_t data_size;
+	raw_spinlock_t lock;
+};
+
+/* This trie implements a longest prefix match algorithm that can be used to
+ * match IP addresses to a stored set of ranges.
+ *
+ * Data stored in @data of struct bpf_lpm_trie_key and struct lpm_trie_node is
+ * interpreted as big endian, so data[0] stores the most significant byte.
+ *
+ * Match ranges are internally stored in instances of struct lpm_trie_node
+ * which each contain their prefix length as well as two pointers that may
+ * lead to more nodes containing more specific matches. Each node also stores
+ * a value that is defined by and returned to userspace via the update_elem
+ * and lookup functions.
+ *
+ * For instance, let's start with a trie that was created with a prefix length
+ * of 32, so it can be used for IPv4 addresses, and one single element that
+ * matches 192.168.0.0/16. The data array would hence contain
+ * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
+ * stick to IP-address notation for readability though.
+ *
+ * As the trie is empty initially, the new node (1) will be placed as root
+ * node, denoted as (R) in the example below. As there are no other nodes,
+ * both child pointers are %NULL.
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *
+ * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
+ * a node with the same data and a smaller prefix (ie, a less specific one),
+ * node (2) will become a child of (1). The child index depends on the next
+ * bit that is outside of what (1) matches, and that bit is 0, so (2) will be
+ * child[0] of (1):
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *                   |
+ *    +----------------+
+ *    |       (2)      |
+ *    | 192.168.0.0/24 |
+ *    |    value: 2    |
+ *    |   [0]    [1]   |
+ *    +----------------+
+ *
+ * The child[1] slot of (1) could be filled with another node which has bit #17
+ * (the next bit after the ones that (1) matches on) set to 1. For instance,
+ * 192.168.128.0/24:
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *                   |      |
+ *    +----------------+  +------------------+
+ *    |       (2)      |  |        (3)       |
+ *    | 192.168.0.0/24 |  | 192.168.128.0/24 |
+ *    |    value: 2    |  |     value: 3     |
+ *    |   [0]    [1]   |  |   [0]     [1]    |
+ *    +----------------+  +------------------+
+ *
+ * Let's add another node (4) to the game for 192.168.1.0/24. In order to place
+ * it, node (1) is looked at first, and because of the semantics laid out
+ * above (bit #17 is 0), (4) would normally be attached to (1) as child[0].
+ * However, that slot is already allocated, so a new node is needed in between.
+ * That node does not have a value attached to it and it will never be
+ * returned to users as result of a lookup. It is only there to differentiate
+ * the traversal further. It will get a prefix as wide as necessary to
+ * distinguish its two children:
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *                   |      |
+ *    +----------------+  +------------------+
+ *    |       (4)  (I) |  |        (3)       |
+ *    | 192.168.0.0/23 |  | 192.168.128.0/24 |
+ *    |   value: ---   |  |     value: 3     |
+ *    |   [0]    [1]   |  |   [0]     [1]    |
+ *    +----------------+  +------------------+
+ *         |      |
+ *  +----------------+  +----------------+
+ *  |       (2)      |  |       (5)      |
+ *  | 192.168.0.0/24 |  | 192.168.1.0/24 |
+ *  |    value: 2    |  |    value: 5    |
+ *  |   [0]    [1]   |  |   [0]    [1]   |
+ *  +----------------+  +----------------+
+ *
+ * 192.168.1.1/32 would be a child of (5) etc.
+ *
+ * An intermediate node will be turned into a 'real' node on demand. In the
+ * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
+ *
+ * A fully populated trie would have a height of 32 nodes, as the trie was
+ * created with a prefix length of 32.
+ *
+ * The lookup starts at the root node. If the current node matches and if there
+ * is a child that can be used to become more specific, the trie is traversed
+ * downwards. The last node in the traversal that is a non-intermediate one is
+ * returned.
+ */
+
+static inline int extract_bit(const u8 *data, size_t index)
+{
+	return !!(data[index / 8] & (1 << (7 - (index % 8))));
+}
+
+/**
+ * longest_prefix_match() - determine the longest prefix
+ * @trie:	The trie to get internal sizes from
+ * @node:	The node to operate on
+ * @key:	The key to compare to @node
+ *
+ * Determine the longest prefix of @node that matches the bits in @key.
+ */
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+				   const struct lpm_trie_node *node,
+				   const struct bpf_lpm_trie_key *key)
+{
+	size_t prefixlen = 0;
+	size_t i;
+
+	for (i = 0; i < trie->data_size; i++) {
+		size_t b;
+
+		b = 8 - fls(node->data[i] ^ key->data[i]);
+		prefixlen += b;
+
+		if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
+			return min(node->prefixlen, key->prefixlen);
+
+		if (b < 8)
+			break;
+	}
+
+	return prefixlen;
+}
+
+/* Called from syscall or from eBPF program */
+static void *trie_lookup_elem(struct bpf_map *map, void *_key)
+{
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct lpm_trie_node *node, *found = NULL;
+	struct bpf_lpm_trie_key *key = _key;
+
+	/* Start walking the trie from the root node ... */
+
+	for (node = rcu_dereference(trie->root); node;) {
+		unsigned int next_bit;
+		size_t matchlen;
+
+		/* Determine the longest prefix of @node that matches @key.
+		 * If it's the maximum possible prefix for this trie, we have
+		 * an exact match and can return it directly.
+		 */
+		matchlen = longest_prefix_match(trie, node, key);
+		if (matchlen == trie->max_prefixlen) {
+			found = node;
+			break;
+		}
+
+		/* If the number of bits that match is smaller than the prefix
+		 * length of @node, bail out and return the node we have seen
+		 * last in the traversal (ie, the parent).
+		 */
+		if (matchlen < node->prefixlen)
+			break;
+
+		/* Consider this node as return candidate unless it is an
+		 * artificially added intermediate one.
+		 */
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+			found = node;
+
+		/* If the node match is fully satisfied, let's see if we can
+		 * become more specific. Determine the next bit in the key and
+		 * traverse down.
+		 */
+		next_bit = extract_bit(key->data, node->prefixlen);
+		node = rcu_dereference(node->child[next_bit]);
+	}
+
+	if (!found)
+		return NULL;
+
+	return found->data + trie->data_size;
+}
+
+static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
+						 const void *value)
+{
+	struct lpm_trie_node *node;
+	size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
+
+	if (value)
+		size += trie->map.value_size;
+
+	node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
+	if (!node)
+		return NULL;
+
+	node->flags = 0;
+
+	if (value)
+		memcpy(node->data + trie->data_size, value,
+		       trie->map.value_size);
+
+	return node;
+}
+
+/* Called from syscall or from eBPF program */
+static int trie_update_elem(struct bpf_map *map,
+			    void *_key, void *value, u64 flags)
+{
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
+	struct lpm_trie_node __rcu **slot;
+	struct bpf_lpm_trie_key *key = _key;
+	unsigned long irq_flags;
+	unsigned int next_bit;
+	size_t matchlen = 0;
+	int ret = 0;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	if (key->prefixlen > trie->max_prefixlen)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+	/* Allocate and fill a new node */
+
+	if (trie->n_entries == trie->map.max_entries) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	new_node = lpm_trie_node_alloc(trie, value);
+	if (!new_node) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	trie->n_entries++;
+
+	new_node->prefixlen = key->prefixlen;
+	RCU_INIT_POINTER(new_node->child[0], NULL);
+	RCU_INIT_POINTER(new_node->child[1], NULL);
+	memcpy(new_node->data, key->data, trie->data_size);
+
+	/* Now find a slot to attach the new node. To do that, walk the tree
+	 * from the root and match as many bits as possible for each node until
+	 * we either find an empty slot or a slot that needs to be replaced by
+	 * an intermediate node.
+	 */
+	slot = &trie->root;
+
+	while ((node = rcu_dereference_protected(*slot,
+					lockdep_is_held(&trie->lock)))) {
+		matchlen = longest_prefix_match(trie, node, key);
+
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen ||
+		    node->prefixlen == trie->max_prefixlen)
+			break;
+
+		next_bit = extract_bit(key->data, node->prefixlen);
+		slot = &node->child[next_bit];
+	}
+
+	/* If the slot is empty (a free child pointer or an empty root),
+	 * simply assign the @new_node to that slot and be done.
+	 */
+	if (!node) {
+		rcu_assign_pointer(*slot, new_node);
+		goto out;
+	}
+
+	/* If the slot we picked already exists, replace it with @new_node
+	 * which already has the correct data array set.
+	 */
+	if (node->prefixlen == matchlen) {
+		new_node->child[0] = node->child[0];
+		new_node->child[1] = node->child[1];
+
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+			trie->n_entries--;
+
+		rcu_assign_pointer(*slot, new_node);
+		kfree_rcu(node, rcu);
+
+		goto out;
+	}
+
+	/* If the new node matches the prefix completely, it must be inserted
+	 * as an ancestor. Simply insert it between @node and *@slot.
+	 */
+	if (matchlen == key->prefixlen) {
+		next_bit = extract_bit(node->data, matchlen);
+		rcu_assign_pointer(new_node->child[next_bit], node);
+		rcu_assign_pointer(*slot, new_node);
+		goto out;
+	}
+
+	im_node = lpm_trie_node_alloc(trie, NULL);
+	if (!im_node) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	im_node->prefixlen = matchlen;
+	im_node->flags |= LPM_TREE_NODE_FLAG_IM;
+	memcpy(im_node->data, node->data, trie->data_size);
+
+	/* Now determine which child to install in which slot */
+	if (extract_bit(key->data, matchlen)) {
+		rcu_assign_pointer(im_node->child[0], node);
+		rcu_assign_pointer(im_node->child[1], new_node);
+	} else {
+		rcu_assign_pointer(im_node->child[0], new_node);
+		rcu_assign_pointer(im_node->child[1], node);
+	}
+
+	/* Finally, assign the intermediate node to the determined spot */
+	rcu_assign_pointer(*slot, im_node);
+
+out:
+	if (ret) {
+		if (new_node)
+			trie->n_entries--;
+
+		kfree(new_node);
+		kfree(im_node);
+	}
+
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+	return ret;
+}
+
+static int trie_delete_elem(struct bpf_map *map, void *key)
+{
+	/* TODO */
+	return -ENOSYS;
+}
+
+#define LPM_DATA_SIZE_MAX	256
+#define LPM_DATA_SIZE_MIN	1
+
+#define LPM_VAL_SIZE_MAX	(KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \
+				 sizeof(struct lpm_trie_node))
+#define LPM_VAL_SIZE_MIN	1
+
+#define LPM_KEY_SIZE(X)		(sizeof(struct bpf_lpm_trie_key) + (X))
+#define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
+#define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
+
+static struct bpf_map *trie_alloc(union bpf_attr *attr)
+{
+	struct lpm_trie *trie;
+	u64 cost = sizeof(*trie), cost_per_node;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 ||
+	    attr->map_flags != BPF_F_NO_PREALLOC ||
+	    attr->key_size < LPM_KEY_SIZE_MIN ||
+	    attr->key_size > LPM_KEY_SIZE_MAX ||
+	    attr->value_size < LPM_VAL_SIZE_MIN ||
+	    attr->value_size > LPM_VAL_SIZE_MAX)
+		return ERR_PTR(-EINVAL);
+
+	trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+	if (!trie)
+		return ERR_PTR(-ENOMEM);
+
+	/* copy mandatory map attributes */
+	trie->map.map_type = attr->map_type;
+	trie->map.key_size = attr->key_size;
+	trie->map.value_size = attr->value_size;
+	trie->map.max_entries = attr->max_entries;
+	trie->data_size = attr->key_size -
+			  offsetof(struct bpf_lpm_trie_key, data);
+	trie->max_prefixlen = trie->data_size * 8;
+
+	cost_per_node = sizeof(struct lpm_trie_node) +
+			attr->value_size + trie->data_size;
+	cost += (u64) attr->max_entries * cost_per_node;
+	if (cost >= U32_MAX - PAGE_SIZE) {
+		ret = -E2BIG;
+		goto out_err;
+	}
+
+	trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	ret = bpf_map_precharge_memlock(trie->map.pages);
+	if (ret)
+		goto out_err;
+
+	raw_spin_lock_init(&trie->lock);
+
+	return &trie->map;
+out_err:
+	kfree(trie);
+	return ERR_PTR(ret);
+}
+
+static void trie_free(struct bpf_map *map)
+{
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct lpm_trie_node __rcu **slot;
+	struct lpm_trie_node *node;
+
+	raw_spin_lock(&trie->lock);
+
+	/* Always start at the root and walk down to a node that has no
+	 * children. Then free that node, nullify its reference in the parent
+	 * and start over.
+	 */
+
+	for (;;) {
+		slot = &trie->root;
+
+		for (;;) {
+			node = rcu_dereference_protected(*slot,
+					lockdep_is_held(&trie->lock));
+			if (!node)
+				goto unlock;
+
+			if (rcu_access_pointer(node->child[0])) {
+				slot = &node->child[0];
+				continue;
+			}
+
+			if (rcu_access_pointer(node->child[1])) {
+				slot = &node->child[1];
+				continue;
+			}
+
+			kfree(node);
+			RCU_INIT_POINTER(*slot, NULL);
+			break;
+		}
+	}
+
+unlock:
+	raw_spin_unlock(&trie->lock);
+}
+
+static const struct bpf_map_ops trie_ops = {
+	.map_alloc = trie_alloc,
+	.map_free = trie_free,
+	.map_lookup_elem = trie_lookup_elem,
+	.map_update_elem = trie_update_elem,
+	.map_delete_elem = trie_delete_elem,
+};
+
+static struct bpf_map_type_list trie_type __read_mostly = {
+	.ops = &trie_ops,
+	.type = BPF_MAP_TYPE_LPM_TRIE,
+};
+
+static int __init register_trie_map(void)
+{
+	bpf_register_map_type(&trie_type);
+	return 0;
+}
+late_initcall(register_trie_map);
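As a quick orientation for readers, here is a minimal userspace sketch (illustrative only, not part of the patch) that exercises the new map type through the raw bpf(2) syscall. It assumes the companion uapi change that adds BPF_MAP_TYPE_LPM_TRIE and struct bpf_lpm_trie_key to <linux/bpf.h>, and it mirrors the 192.168.0.0/16 example from the comment block above: a full-length lookup of 192.168.1.1 returns the value stored for the /16 prefix. Note that trie_alloc() insists on map_flags == BPF_F_NO_PREALLOC, so that flag is not optional here.

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

/* mirrors struct bpf_lpm_trie_key with 4 data bytes for IPv4 */
struct lpm_key_v4 {
	__u32 prefixlen;
	__u8  data[4];
};

int main(void)
{
	union bpf_attr attr;
	struct lpm_key_v4 key;
	__u32 value;
	int map_fd;

	/* create the trie: key = 4-byte prefixlen + 4 address bytes */
	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_LPM_TRIE;
	attr.key_size    = sizeof(key);
	attr.value_size  = sizeof(value);
	attr.max_entries = 16;
	attr.map_flags   = BPF_F_NO_PREALLOC;	/* required by trie_alloc() */
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return 1;

	/* insert 192.168.0.0/16 -> value 1 */
	key.prefixlen = 16;
	key.data[0] = 192; key.data[1] = 168; key.data[2] = 0; key.data[3] = 0;
	value = 1;
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)&key;
	attr.value  = (__u64)(unsigned long)&value;
	attr.flags  = BPF_ANY;
	sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));

	/* longest-prefix lookup of 192.168.1.1 hits the /16 entry */
	key.prefixlen = 32;
	key.data[0] = 192; key.data[1] = 168; key.data[2] = 1; key.data[3] = 1;
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)&key;
	attr.value  = (__u64)(unsigned long)&value;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) == 0)
		printf("matched, value %u\n", value);

	return 0;
}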
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bbb016adbaeb..f74ca17af64a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -10,6 +10,7 @@
  * General Public License for more details.
  */
 #include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
@@ -241,6 +242,7 @@ static int map_create(union bpf_attr *attr)
 		/* failed to allocate fd */
 		goto free_map;
 
+	trace_bpf_map_create(map, err);
 	return err;
 
 free_map:
@@ -365,6 +367,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (copy_to_user(uvalue, value, value_size) != 0)
 		goto free_value;
 
+	trace_bpf_map_lookup_elem(map, ufd, key, value);
 	err = 0;
 
 free_value:
@@ -447,6 +450,8 @@ static int map_update_elem(union bpf_attr *attr)
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
 
+	if (!err)
+		trace_bpf_map_update_elem(map, ufd, key, value);
 free_value:
 	kfree(value);
 free_key:
@@ -492,6 +497,8 @@ static int map_delete_elem(union bpf_attr *attr)
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
 
+	if (!err)
+		trace_bpf_map_delete_elem(map, ufd, key);
 free_key:
 	kfree(key);
 err_put:
@@ -544,6 +551,7 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
 		goto free_next_key;
 
+	trace_bpf_map_next_key(map, ufd, key, next_key);
 	err = 0;
 
 free_next_key:
@@ -697,8 +705,10 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 
 void bpf_prog_put(struct bpf_prog *prog)
 {
-	if (atomic_dec_and_test(&prog->aux->refcnt))
+	if (atomic_dec_and_test(&prog->aux->refcnt)) {
+		trace_bpf_prog_put_rcu(prog);
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+	}
 }
 EXPORT_SYMBOL_GPL(bpf_prog_put);
 
@@ -807,7 +817,11 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 {
-	return __bpf_prog_get(ufd, &type);
+	struct bpf_prog *prog = __bpf_prog_get(ufd, &type);
+
+	if (!IS_ERR(prog))
+		trace_bpf_prog_get_type(prog);
+	return prog;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
@@ -889,6 +903,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 		/* failed to allocate fd */
 		goto free_used_maps;
 
+	trace_bpf_prog_load(prog, err);
 	return err;
 
 free_used_maps:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cdc43b899f28..d2bded2b250c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -481,6 +481,13 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
 	regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
 }
 
+static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
+					     u32 regno)
+{
+	mark_reg_unknown_value(regs, regno);
+	reset_reg_range_values(regs, regno);
+}
+
 enum reg_arg_type {
 	SRC_OP, /* register is used as source operand */
 	DST_OP, /* register is used as destination operand */
@@ -532,6 +539,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	switch (type) {
 	case PTR_TO_MAP_VALUE:
 	case PTR_TO_MAP_VALUE_OR_NULL:
+	case PTR_TO_MAP_VALUE_ADJ:
 	case PTR_TO_STACK:
 	case PTR_TO_CTX:
 	case PTR_TO_PACKET:
@@ -616,7 +624,8 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
 	}
 	if (value_regno >= 0)
 		/* have read misc data from the stack */
-		mark_reg_unknown_value(state->regs, value_regno);
+		mark_reg_unknown_value_and_range(state->regs,
+						 value_regno);
 	return 0;
 	}
 }
@@ -627,7 +636,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 {
 	struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
 
-	if (off < 0 || off + size > map->value_size) {
+	if (off < 0 || size <= 0 || off + size > map->value_size) {
 		verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
 			map->value_size, off, size);
 		return -EACCES;
@@ -635,6 +644,51 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 	return 0;
 }
 
+/* check read/write into an adjusted map element */
+static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno,
+				int off, int size)
+{
+	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_reg_state *reg = &state->regs[regno];
+	int err;
+
+	/* We adjusted the register to this map value, so we
+	 * need to change off and size to min_value and max_value
+	 * respectively to make sure our theoretical access will be
+	 * safe.
+	 */
+	if (log_level)
+		print_verifier_state(state);
+	env->varlen_map_value_access = true;
+	/* The minimum value is only important with signed
+	 * comparisons where we can't assume the floor of a
+	 * value is 0.  If we are using signed variables for our
+	 * index'es we need to make sure that whatever we use
+	 * will have a set floor within our range.
+	 */
+	if (reg->min_value < 0) {
+		verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+			regno);
+		return -EACCES;
+	}
+	err = check_map_access(env, regno, reg->min_value + off, size);
+	if (err) {
+		verbose("R%d min value is outside of the array range\n",
+			regno);
+		return err;
+	}
+
+	/* If we haven't set a max value then we need to bail
+	 * since we can't be sure we won't do bad things.
+	 */
+	if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+		verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+			regno);
+		return -EACCES;
+	}
+	return check_map_access(env, regno, reg->max_value + off, size);
+}
+
 #define MAX_PACKET_OFF 0xffff
 
 static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
@@ -647,6 +701,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 		/* dst_input() and dst_output() can't write for now */
 		if (t == BPF_WRITE)
 			return false;
+		/* fallthrough */
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
 	case BPF_PROG_TYPE_XDP:
@@ -775,47 +830,13 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			return -EACCES;
 		}
 
-		/* If we adjusted the register to this map value at all then we
-		 * need to change off and size to min_value and max_value
-		 * respectively to make sure our theoretical access will be
-		 * safe.
-		 */
-		if (reg->type == PTR_TO_MAP_VALUE_ADJ) {
-			if (log_level)
-				print_verifier_state(state);
-			env->varlen_map_value_access = true;
-			/* The minimum value is only important with signed
-			 * comparisons where we can't assume the floor of a
-			 * value is 0.  If we are using signed variables for our
-			 * index'es we need to make sure that whatever we use
-			 * will have a set floor within our range.
-			 */
-			if (reg->min_value < 0) {
-				verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
-					regno);
-				return -EACCES;
-			}
-			err = check_map_access(env, regno, reg->min_value + off,
-					       size);
-			if (err) {
-				verbose("R%d min value is outside of the array range\n",
-					regno);
-				return err;
-			}
-
-			/* If we haven't set a max value then we need to bail
-			 * since we can't be sure we won't do bad things.
-			 */
-			if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
-				verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
-					regno);
-				return -EACCES;
-			}
-			off += reg->max_value;
-		}
-		err = check_map_access(env, regno, off, size);
+		if (reg->type == PTR_TO_MAP_VALUE_ADJ)
+			err = check_map_access_adj(env, regno, off, size);
+		else
+			err = check_map_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown_value(state->regs, value_regno);
+			mark_reg_unknown_value_and_range(state->regs,
+							 value_regno);
 
 	} else if (reg->type == PTR_TO_CTX) {
 		enum bpf_reg_type reg_type = UNKNOWN_VALUE;
@@ -827,7 +848,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 		}
 		err = check_ctx_access(env, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
-			mark_reg_unknown_value(state->regs, value_regno);
+			mark_reg_unknown_value_and_range(state->regs,
+							 value_regno);
 			/* note that reg.[id|off|range] == 0 */
 			state->regs[value_regno].type = reg_type;
 		}
@@ -860,7 +882,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 		}
 		err = check_packet_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown_value(state->regs, value_regno);
+			mark_reg_unknown_value_and_range(state->regs,
+							 value_regno);
 	} else {
 		verbose("R%d invalid mem access '%s'\n",
 			regno, reg_type_str[reg->type]);
@@ -958,6 +981,25 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 	return 0;
 }
 
+static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
+				   int access_size, bool zero_size_allowed,
+				   struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = env->cur_state.regs;
+
+	switch (regs[regno].type) {
+	case PTR_TO_PACKET:
+		return check_packet_access(env, regno, 0, access_size);
+	case PTR_TO_MAP_VALUE:
+		return check_map_access(env, regno, 0, access_size);
+	case PTR_TO_MAP_VALUE_ADJ:
+		return check_map_access_adj(env, regno, 0, access_size);
+	default: /* const_imm|ptr_to_stack or invalid ptr */
+		return check_stack_boundary(env, regno, access_size,
+					    zero_size_allowed, meta);
+	}
+}
+
 static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			  enum bpf_arg_type arg_type,
 			  struct bpf_call_arg_meta *meta)
@@ -993,10 +1035,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_STACK;
 		if (type != PTR_TO_PACKET && type != expected_type)
 			goto err_type;
-	} else if (arg_type == ARG_CONST_STACK_SIZE ||
-		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
+	} else if (arg_type == ARG_CONST_SIZE ||
+		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
 		expected_type = CONST_IMM;
-		if (type != expected_type)
+		/* One exception. Allow UNKNOWN_VALUE registers when the
+		 * boundaries are known and don't cause unsafe memory accesses
+		 */
+		if (type != UNKNOWN_VALUE && type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_CONST_MAP_PTR) {
 		expected_type = CONST_PTR_TO_MAP;
@@ -1006,8 +1051,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_CTX;
 		if (type != expected_type)
 			goto err_type;
-	} else if (arg_type == ARG_PTR_TO_STACK ||
-		   arg_type == ARG_PTR_TO_RAW_STACK) {
+	} else if (arg_type == ARG_PTR_TO_MEM ||
+		   arg_type == ARG_PTR_TO_UNINIT_MEM) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a CONST_IMM type. Final test
@@ -1015,9 +1060,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (type == CONST_IMM && reg->imm == 0)
 			/* final test in check_stack_boundary() */;
-		else if (type != PTR_TO_PACKET && type != expected_type)
+		else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+			 type != PTR_TO_MAP_VALUE_ADJ && type != expected_type)
 			goto err_type;
-		meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
+		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
 	} else {
 		verbose("unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
@@ -1063,9 +1109,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		err = check_stack_boundary(env, regno,
 					   meta->map_ptr->value_size,
 					   false, NULL);
-	} else if (arg_type == ARG_CONST_STACK_SIZE ||
-		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
-		bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
+	} else if (arg_type == ARG_CONST_SIZE ||
+		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
+		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
 		/* bpf_xxx(..., buf, len) call will access 'len' bytes
 		 * from stack pointer 'buf'. Check it
@@ -1073,14 +1119,50 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (regno == 0) {
 			/* kernel subsystem misconfigured verifier */
-			verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+			verbose("ARG_CONST_SIZE cannot be first argument\n");
 			return -EACCES;
 		}
-		if (regs[regno - 1].type == PTR_TO_PACKET)
-			err = check_packet_access(env, regno - 1, 0, reg->imm);
-		else
-			err = check_stack_boundary(env, regno - 1, reg->imm,
-						   zero_size_allowed, meta);
+
+		/* If the register is UNKNOWN_VALUE, the access check happens
+		 * using its boundaries. Otherwise, just use its imm
+		 */
+		if (type == UNKNOWN_VALUE) {
+			/* For unprivileged variable accesses, disable raw
+			 * mode so that the program is required to
+			 * initialize all the memory that the helper could
+			 * just partially fill up.
+			 */
+			meta = NULL;
+
+			if (reg->min_value < 0) {
+				verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+					regno);
+				return -EACCES;
+			}
+
+			if (reg->min_value == 0) {
+				err = check_helper_mem_access(env, regno - 1, 0,
+							      zero_size_allowed,
+							      meta);
+				if (err)
+					return err;
+			}
+
+			if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+				verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+					regno);
+				return -EACCES;
+			}
+			err = check_helper_mem_access(env, regno - 1,
+						      reg->max_value,
+						      zero_size_allowed, meta);
+			if (err)
+				return err;
+		} else {
+			/* register is CONST_IMM */
+			err = check_helper_mem_access(env, regno - 1, reg->imm,
+						      zero_size_allowed, meta);
+		}
 	}
 
 	return err;
@@ -1154,15 +1236,15 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 {
 	int count = 0;
 
-	if (fn->arg1_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg2_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg3_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg4_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg5_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
 
 	return count > 1 ? -EINVAL : 0;
@@ -1316,7 +1398,7 @@ static int check_packet_ptr_add(struct bpf_verifier_env *env,
 		imm = insn->imm;
 
 add_imm:
-		if (imm <= 0) {
+		if (imm < 0) {
 			verbose("addition of negative constant to packet pointer is not allowed\n");
 			return -EACCES;
 		}
@@ -1485,22 +1567,54 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
 	struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
 	struct bpf_reg_state *src_reg = &regs[insn->src_reg];
 	u8 opcode = BPF_OP(insn->code);
+	u64 dst_imm = dst_reg->imm;
 
-	/* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or'
-	 * insn. Don't care about overflow or negative values, just add them
+	/* dst_reg->type == CONST_IMM here. Simulate execution of insns
+	 * containing ALU ops. Don't care about overflow or negative
+	 * values, just add/sub/... them; registers are in u64.
 	 */
-	if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K)
-		dst_reg->imm += insn->imm;
-	else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
-		 src_reg->type == CONST_IMM)
-		dst_reg->imm += src_reg->imm;
-	else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K)
-		dst_reg->imm |= insn->imm;
-	else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
-		 src_reg->type == CONST_IMM)
-		dst_reg->imm |= src_reg->imm;
-	else
+	if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm += insn->imm;
+	} else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm += src_reg->imm;
+	} else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm -= insn->imm;
+	} else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm -= src_reg->imm;
+	} else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm *= insn->imm;
+	} else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm *= src_reg->imm;
+	} else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm |= insn->imm;
+	} else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm |= src_reg->imm;
+	} else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm &= insn->imm;
+	} else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm &= src_reg->imm;
+	} else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm >>= insn->imm;
+	} else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm >>= src_reg->imm;
+	} else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm <<= insn->imm;
+	} else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm <<= src_reg->imm;
+	} else {
 		mark_reg_unknown_value(regs, insn->dst_reg);
+		goto out;
+	}
+
+	dst_reg->imm = dst_imm;
+out:
 	return 0;
 }
 
@@ -1894,6 +2008,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 	case BPF_JGT:
 		/* Unsigned comparison, the minimum value is 0. */
 		false_reg->min_value = 0;
+		/* fallthrough */
 	case BPF_JSGT:
 		/* If this is false then we know the maximum val is val,
 		 * otherwise we know the min val is val+1.
@@ -1904,6 +2019,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 	case BPF_JGE:
 		/* Unsigned comparison, the minimum value is 0. */
 		false_reg->min_value = 0;
+		/* fallthrough */
 	case BPF_JSGE:
 		/* If this is false then we know the maximum value is val - 1,
 		 * otherwise we know the mimimum value is val.
@@ -1942,6 +2058,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 	case BPF_JGT:
 		/* Unsigned comparison, the minimum value is 0. */
 		true_reg->min_value = 0;
+		/* fallthrough */
 	case BPF_JSGT:
 		/*
 		 * If this is false, then the val is <= the register, if it is
@@ -1953,6 +2070,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
 	case BPF_JGE:
 		/* Unsigned comparison, the minimum value is 0. */
 		true_reg->min_value = 0;
+		/* fallthrough */
 	case BPF_JSGE:
 		/* If this is false then constant < register, if it is true then
 		 * the register < constant.
@@ -2144,14 +2262,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
 		return err;
 
 	if (insn->src_reg == 0) {
-		/* generic move 64-bit immediate into a register,
-		 * only analyzer needs to collect the ld_imm value.
-		 */
 		u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
 
-		if (!env->analyzer_ops)
-			return 0;
-
 		regs[insn->dst_reg].type = CONST_IMM;
 		regs[insn->dst_reg].imm = imm;
 		return 0;
@@ -2729,7 +2841,6 @@ static int do_check(struct bpf_verifier_env *env)
 			if (err)
 				return err;
 
-			reset_reg_range_values(regs, insn->dst_reg);
 			if (BPF_SIZE(insn->code) != BPF_W &&
 			    BPF_SIZE(insn->code) != BPF_DW) {
 				insn_idx++;
@@ -3085,10 +3196,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 	insn = env->prog->insnsi + delta;
 
 	for (i = 0; i < insn_cnt; i++, insn++) {
-		if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
+		if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) ||
+		    insn->code == (BPF_LDX | BPF_MEM | BPF_H) ||
+		    insn->code == (BPF_LDX | BPF_MEM | BPF_W) ||
 		    insn->code == (BPF_LDX | BPF_MEM | BPF_DW))
 			type = BPF_READ;
-		else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
+		else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) ||
+			 insn->code == (BPF_STX | BPF_MEM | BPF_H) ||
+			 insn->code == (BPF_STX | BPF_MEM | BPF_W) ||
 			 insn->code == (BPF_STX | BPF_MEM | BPF_DW))
 			type = BPF_WRITE;
 		else
@@ -3097,8 +3212,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
 		if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
 			continue;
 
-		cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg,
-					      insn->off, insn_buf, env->prog);
+		cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
 		if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
 			verbose("bpf verifier is misconfigured\n");
 			return -EINVAL;
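For readers unfamiliar with the verifier change above: the point of check_helper_mem_access()/check_map_access_adj() and the relaxed ARG_CONST_SIZE handling is that a helper's size argument no longer has to be a compile-time constant, as long as the program has bounded it. Below is an illustrative sketch of the pattern this accepts, written in the samples/bpf style (the helper stubs, section names and license tag are the usual loader conventions, not part of this patch). Note two verifier-imposed details reflected in the code: the bounded size must not include zero for plain ARG_CONST_SIZE, and the destination memory has to be initialized because raw mode is disabled for variable sizes.

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <linux/types.h>

/* samples/bpf-style helper stubs, resolved by the loader at runtime */
static __u32 (*bpf_get_prandom_u32)(void) =
	(void *) BPF_FUNC_get_prandom_u32;
static int (*bpf_probe_read)(void *dst, __u32 size, const void *src) =
	(void *) BPF_FUNC_probe_read;

__attribute__((section("kprobe/example"), used))
int read_variable_len(struct pt_regs *ctx)
{
	char buf[64];
	__u32 len = bpf_get_prandom_u32();	/* UNKNOWN_VALUE register */

	__builtin_memset(buf, 0, sizeof(buf));	/* raw mode is off, so init */
	len &= sizeof(buf) - 1;			/* bounds now known: 0..63 */
	len += 1;				/* exclude zero size: 1..64 */
	bpf_probe_read(buf, len, ctx);		/* variable size accepted */

	return 0;
}

char _license[] __attribute__((section("license"), used)) = "GPL";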
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index fa77311dadb2..424daa4586d1 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -76,8 +76,8 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
 	.func		= bpf_probe_read,
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_RAW_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
 	.arg3_type	= ARG_ANYTHING,
 };
 
@@ -109,8 +109,8 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
 	.arg1_type	= ARG_ANYTHING,
-	.arg2_type	= ARG_PTR_TO_STACK,
-	.arg3_type	= ARG_CONST_STACK_SIZE,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
 };
 
 static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
@@ -213,8 +213,8 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
 	.func		= bpf_trace_printk,
 	.gpl_only	= true,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg1_type	= ARG_PTR_TO_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
 };
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
@@ -329,8 +329,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
 };
 
 static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
@@ -395,6 +395,36 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size,
+	   const void *, unsafe_ptr)
+{
+	int ret;
+
+	/*
+	 * The strncpy_from_unsafe() call will likely not fill the entire
+	 * buffer, but that's okay in this circumstance as we're probing
+	 * arbitrary memory anyway similar to bpf_probe_read() and might
+	 * as well probe the stack. Thus, memory is explicitly cleared
+	 * only in error case, so that improper users ignoring return
+	 * code altogether don't copy garbage; otherwise length of string
+	 * is returned that can be used for bpf_perf_event_output() et al.
+	 */
+	ret = strncpy_from_unsafe(dst, unsafe_ptr, size);
+	if (unlikely(ret < 0))
+		memset(dst, 0, size);
+
+	return ret;
+}
+
+static const struct bpf_func_proto bpf_probe_read_str_proto = {
+	.func		= bpf_probe_read_str,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
+	.arg3_type	= ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -432,6 +462,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id)
 		return &bpf_current_task_under_cgroup_proto;
 	case BPF_FUNC_get_prandom_u32:
 		return &bpf_get_prandom_u32_proto;
+	case BPF_FUNC_probe_read_str:
+		return &bpf_probe_read_str_proto;
 	default:
 		return NULL;
 	}
@@ -459,6 +491,13 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type
 		return false;
 	if (off % size != 0)
 		return false;
+	/*
+	 * Assertion for 32 bit to make sure last 8 byte access
+	 * (BPF_DW) to the last 4 byte member is disallowed.
+	 */
+	if (off + size > sizeof(struct pt_regs))
+		return false;
+
 	return true;
 }
 
@@ -492,8 +531,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
 	.arg1_type	= ARG_PTR_TO_CTX,
 	.arg2_type	= ARG_CONST_MAP_PTR,
 	.arg3_type	= ARG_ANYTHING,
-	.arg4_type	= ARG_PTR_TO_STACK,
-	.arg5_type	= ARG_CONST_STACK_SIZE,
+	.arg4_type	= ARG_PTR_TO_MEM,
+	.arg5_type	= ARG_CONST_SIZE,
 };
 
 BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map,
@@ -540,6 +579,8 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type
 		return false;
 	if (off % size != 0)
 		return false;
+
+	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64));
 	return true;
 }
 
@@ -572,28 +613,29 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type
 	return true;
 }
 
-static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg,
-				      int src_reg, int ctx_off,
+static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
+				      const struct bpf_insn *si,
 				      struct bpf_insn *insn_buf,
 				      struct bpf_prog *prog)
 {
 	struct bpf_insn *insn = insn_buf;
 
-	switch (ctx_off) {
+	switch (si->off) {
 	case offsetof(struct bpf_perf_event_data, sample_period):
 		BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
 
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
-						      data), dst_reg, src_reg,
+						      data), si->dst_reg, si->src_reg,
 				      offsetof(struct bpf_perf_event_data_kern, data));
-		*insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg,
+		*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
 				      offsetof(struct perf_sample_data, period));
 		break;
 	default:
 		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
-						      regs), dst_reg, src_reg,
+						      regs), si->dst_reg, si->src_reg,
 				      offsetof(struct bpf_perf_event_data_kern, regs));
-		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off);
+		*insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg,
+				      si->off);
 		break;
 	}
 
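A short illustrative sketch of the new bpf_probe_read_str() helper from the hunks above, again in samples/bpf style and not part of the patch (BPF_FUNC_probe_read_str itself is added by the companion uapi change). The source pointer would normally come from a traced function's argument registers; the ctx pointer merely keeps the sketch self-contained:

#include <linux/bpf.h>
#include <linux/ptrace.h>
#include <linux/types.h>

static int (*bpf_probe_read_str)(void *dst, __u32 size,
				 const void *unsafe_ptr) =
	(void *) BPF_FUNC_probe_read_str;

__attribute__((section("kprobe/example"), used))
int copy_str(struct pt_regs *ctx)
{
	char str[64];
	int len;

	/* NUL-terminated copy from unsafe kernel memory; returns the string
	 * length including the trailing NUL on success, a negative error
	 * otherwise (the buffer is zeroed on failure).
	 */
	len = bpf_probe_read_str(str, sizeof(str), ctx);
	if (len < 0)
		return 0;

	/* 'len' can now bound a bpf_perf_event_output() so that only the
	 * string bytes are copied out, as the comment in the patch notes.
	 */
	return 0;
}

char _license[] __attribute__((section("license"), used)) = "GPL";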
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 5d33a7352919..aea6a1218c7d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -162,15 +162,27 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
 }
 EXPORT_SYMBOL_GPL(trace_print_bitmask_seq);
 
+/**
+ * trace_print_hex_seq - print buffer as hex sequence
+ * @p: trace seq struct to write to
+ * @buf: The buffer to print
+ * @buf_len: Length of @buf in bytes
+ * @concatenate: Print @buf as single hex string or with spacing
+ *
+ * Prints the passed buffer as a hex sequence either as a whole,
+ * single hex string if @concatenate is true or with spacing after
+ * each byte in case @concatenate is false.
+ */
 const char *
-trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
+trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len,
+		    bool concatenate)
 {
 	int i;
 	const char *ret = trace_seq_buffer_ptr(p);
 
 	for (i = 0; i < buf_len; i++)
-		trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
-
+		trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ",
+				 buf[i]);
 	trace_seq_putc(p, 0);
 
 	return ret;