Diffstat (limited to 'kernel')
-rw-r--r--  kernel/bpf/Makefile          |   2
-rw-r--r--  kernel/bpf/bpf_lru_list.c    |  20
-rw-r--r--  kernel/bpf/core.c            |   9
-rw-r--r--  kernel/bpf/helpers.c         |   4
-rw-r--r--  kernel/bpf/inode.c           |  17
-rw-r--r--  kernel/bpf/lpm_trie.c        | 521
-rw-r--r--  kernel/bpf/syscall.c         |  19
-rw-r--r--  kernel/bpf/verifier.c        | 294
-rw-r--r--  kernel/trace/bpf_trace.c     |  76
-rw-r--r--  kernel/trace/trace_output.c  |  18
10 files changed, 855 insertions, 125 deletions
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1276474ac3cd..e1ce4f4fd7fd 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,7 +1,7 @@
 obj-y := core.o
 
 obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
+obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index 89b7ef41c86b..f62d1d56f41d 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -213,11 +213,10 @@ __bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
 			       enum bpf_lru_list_type tgt_free_type)
 {
 	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
-	struct bpf_lru_node *node, *tmp_node, *first_node;
+	struct bpf_lru_node *node, *tmp_node;
 	unsigned int nshrinked = 0;
 	unsigned int i = 0;
 
-	first_node = list_first_entry(inactive, struct bpf_lru_node, list);
 	list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
 		if (bpf_lru_node_is_ref(node)) {
 			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
@@ -361,7 +360,8 @@ static void __local_list_add_pending(struct bpf_lru *lru,
 	list_add(&node->list, local_pending_list(loc_l));
 }
 
-struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l)
+static struct bpf_lru_node *
+__local_list_pop_free(struct bpf_lru_locallist *loc_l)
 {
 	struct bpf_lru_node *node;
 
@@ -374,8 +374,8 @@ struct bpf_lru_node *__local_list_pop_free(struct bpf_lru_locallist *loc_l)
 	return node;
 }
 
-struct bpf_lru_node *__local_list_pop_pending(struct bpf_lru *lru,
-					      struct bpf_lru_locallist *loc_l)
+static struct bpf_lru_node *
+__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l)
 {
 	struct bpf_lru_node *node;
 	bool force = false;
@@ -558,8 +558,9 @@ void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
 		bpf_common_lru_push_free(lru, node);
 }
 
-void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
-			     u32 elem_size, u32 nr_elems)
+static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
+				    u32 node_offset, u32 elem_size,
+				    u32 nr_elems)
 {
 	struct bpf_lru_list *l = &lru->common_lru.lru_list;
 	u32 i;
@@ -575,8 +576,9 @@ void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
 	}
 }
 
-void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
-			     u32 elem_size, u32 nr_elems)
+static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
+				    u32 node_offset, u32 elem_size,
+				    u32 nr_elems)
 {
 	u32 i, pcpu_entries;
 	int cpu;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 503d4211988a..fddd76b1b627 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -1173,3 +1173,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 {
 	return -EFAULT;
 }
+
+/* All definitions of tracepoints related to BPF. */
+#define CREATE_TRACE_POINTS
+#include <linux/bpf_trace.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_get_type);
+EXPORT_TRACEPOINT_SYMBOL_GPL(bpf_prog_put_rcu);
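Note on the core.c hunk above: CREATE_TRACE_POINTS must be defined in exactly one translation unit before including the trace header; that include then expands each TRACE_EVENT() into the real tracepoint definition rather than just the inline trace_*() stubs, and EXPORT_TRACEPOINT_SYMBOL_GPL() exposes the tracepoint to GPL modules. A minimal sketch of the usual header shape follows; my_subsys and my_event are invented names, not the contents of <linux/bpf_trace.h>:

/* include/trace/events/my_subsys.h -- hypothetical example header */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM my_subsys

#if !defined(_TRACE_MY_SUBSYS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MY_SUBSYS_H

#include <linux/tracepoint.h>

TRACE_EVENT(my_event,
	TP_PROTO(int val),
	TP_ARGS(val),
	TP_STRUCT__entry(
		__field(int, val)	/* copied into the ring buffer */
	),
	TP_fast_assign(
		__entry->val = val;
	),
	TP_printk("val=%d", __entry->val)
);

#endif /* _TRACE_MY_SUBSYS_H */
#include <trace/define_trace.h>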
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 045cbe673356..3d24e238221e 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -176,6 +176,6 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.func		= bpf_get_current_comm,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,
-	.arg1_type	= ARG_PTR_TO_RAW_STACK,
-	.arg2_type	= ARG_CONST_STACK_SIZE,
+	.arg1_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg2_type	= ARG_CONST_SIZE,
 };
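The renamed argument types describe the same contract more precisely: the pair is a pointer to writable memory plus a size, and the buffer need not be initialized beforehand because the helper overwrites it. A fragment of how a BPF program might call this helper (assuming the usual helper wrappers from the samples; not part of this patch):

	char comm[16];	/* TASK_COMM_LEN-sized scratch buffer */

	/* comm may remain uninitialized: ARG_PTR_TO_UNINIT_MEM only requires
	 * that all sizeof(comm) bytes be writable; the helper fills them in.
	 */
	if (bpf_get_current_comm(&comm, sizeof(comm)) == 0)
		bpf_trace_printk("comm=%s\n", sizeof("comm=%s\n"), comm);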
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 0b030c9126d3..fddcae801724 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -21,6 +21,7 @@
 #include <linux/parser.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 
 enum bpf_type {
 	BPF_TYPE_UNSPEC	= 0,
@@ -281,6 +282,13 @@ int bpf_obj_pin_user(u32 ufd, const char __user *pathname)
 	ret = bpf_obj_do_pin(pname, raw, type);
 	if (ret != 0)
 		bpf_any_put(raw, type);
+	if ((trace_bpf_obj_pin_prog_enabled() ||
+	     trace_bpf_obj_pin_map_enabled()) && !ret) {
+		if (type == BPF_TYPE_PROG)
+			trace_bpf_obj_pin_prog(raw, ufd, pname);
+		if (type == BPF_TYPE_MAP)
+			trace_bpf_obj_pin_map(raw, ufd, pname);
+	}
 out:
 	putname(pname);
 	return ret;
@@ -342,8 +350,15 @@ int bpf_obj_get_user(const char __user *pathname)
 	else
 		goto out;
 
-	if (ret < 0)
+	if (ret < 0) {
 		bpf_any_put(raw, type);
+	} else if (trace_bpf_obj_get_prog_enabled() ||
+		   trace_bpf_obj_get_map_enabled()) {
+		if (type == BPF_TYPE_PROG)
+			trace_bpf_obj_get_prog(raw, ret, pname);
+		if (type == BPF_TYPE_MAP)
+			trace_bpf_obj_get_map(raw, ret, pname);
+	}
 out:
 	putname(pname);
 	return ret;
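The trace_<name>_enabled() tests used above come for free with every tracepoint: they compile down to a static-key branch that is patched in only while a probe is attached, so any argument preparation is skipped on the fast path. A generic sketch of the pattern (my_event and expensive_prep() are placeholders):

	if (trace_my_event_enabled()) {
		int cooked = expensive_prep();	/* runs only while tracing */

		trace_my_event(cooked);
	}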
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
new file mode 100644
index 000000000000..e0f6a0bd279b
--- /dev/null
+++ b/kernel/bpf/lpm_trie.c
@@ -0,0 +1,521 @@
+/*
+ * Longest prefix match list implementation
+ *
+ * Copyright (c) 2016,2017 Daniel Mack
+ * Copyright (c) 2016 David Herrmann
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License. See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/bpf.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <net/ipv6.h>
+
+/* Intermediate node */
+#define LPM_TREE_NODE_FLAG_IM BIT(0)
+
+struct lpm_trie_node;
+
+struct lpm_trie_node {
+	struct rcu_head rcu;
+	struct lpm_trie_node __rcu	*child[2];
+	u32				prefixlen;
+	u32				flags;
+	u8				data[0];
+};
+
+struct lpm_trie {
+	struct bpf_map			map;
+	struct lpm_trie_node __rcu	*root;
+	size_t				n_entries;
+	size_t				max_prefixlen;
+	size_t				data_size;
+	raw_spinlock_t			lock;
+};
+
+/* This trie implements a longest prefix match algorithm that can be used to
+ * match IP addresses to a stored set of ranges.
+ *
+ * Data stored in @data of struct bpf_lpm_trie_key and struct lpm_trie_node
+ * is interpreted as big endian, so data[0] stores the most significant byte.
+ *
+ * Match ranges are internally stored in instances of struct lpm_trie_node
+ * which each contain their prefix length as well as two pointers that may
+ * lead to more nodes containing more specific matches. Each node also stores
+ * a value that is defined by and returned to userspace via the update_elem
+ * and lookup functions.
+ *
+ * For instance, let's start with a trie that was created with a prefix length
+ * of 32, so it can be used for IPv4 addresses, and one single element that
+ * matches 192.168.0.0/16. The data array would hence contain
+ * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will
+ * stick to IP-address notation for readability though.
+ *
+ * As the trie is empty initially, the new node (1) will be placed as the
+ * root node, denoted as (R) in the example below. As there are no other
+ * nodes, both child pointers are %NULL.
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *
+ * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already
+ * a node with the same data and a smaller prefix (ie, a less specific one),
+ * node (2) will become a child of (1). Which child index it gets depends on
+ * the next bit that is outside of what (1) matches, and that bit is 0, so
+ * (2) will be child[0] of (1):
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *                   |
+ *      +----------------+
+ *      |       (2)      |
+ *      | 192.168.0.0/24 |
+ *      |    value: 2    |
+ *      |   [0]    [1]   |
+ *      +----------------+
+ *
+ * The child[1] slot of (1) could be filled with another node which has bit #17
+ * (the next bit after the ones that (1) matches on) set to 1. For instance,
+ * 192.168.128.0/24:
+ *
+ *              +----------------+
+ *              |       (1)  (R) |
+ *              | 192.168.0.0/16 |
+ *              |    value: 1    |
+ *              |   [0]    [1]   |
+ *              +----------------+
+ *                   |      |
+ *      +----------------+  +------------------+
+ *      |       (2)      |  |        (3)       |
+ *      | 192.168.0.0/24 |  | 192.168.128.0/24 |
+ *      |    value: 2    |  |     value: 3     |
+ *      |   [0]    [1]   |  |    [0]    [1]    |
+ *      +----------------+  +------------------+
+ *
+ * Let's add another node for 192.168.1.0/24 to the game. In order to place
+ * it, node (1) is looked at first, and because of the semantics laid out
+ * above (bit #17 is 0), the new node would normally be attached to (1) as
+ * child[0]. However, that slot is already allocated, so a new node (4) is
+ * put in between. That node does not have a value attached to it and it will
+ * never be returned to users as result of a lookup. It is only there to
+ * differentiate the traversal further. It will get a prefix as wide as
+ * necessary to distinguish its two children:
+ *
+ *                      +----------------+
+ *                      |       (1)  (R) |
+ *                      | 192.168.0.0/16 |
+ *                      |    value: 1    |
+ *                      |   [0]    [1]   |
+ *                      +----------------+
+ *                           |      |
+ *      +----------------+  +------------------+
+ *      |       (4)  (I) |  |        (3)       |
+ *      | 192.168.0.0/23 |  | 192.168.128.0/24 |
+ *      |   value: ---   |  |     value: 3     |
+ *      |   [0]    [1]   |  |    [0]    [1]    |
+ *      +----------------+  +------------------+
+ *           |      |
+ *  +----------------+  +----------------+
+ *  |       (2)      |  |       (5)      |
+ *  | 192.168.0.0/24 |  | 192.168.1.0/24 |
+ *  |    value: 2    |  |    value: 5    |
+ *  |   [0]    [1]   |  |   [0]    [1]   |
+ *  +----------------+  +----------------+
+ *
+ * 192.168.1.1/32 would be a child of (5) etc.
+ *
+ * An intermediate node will be turned into a 'real' node on demand. In the
+ * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie.
+ *
+ * A fully populated trie would have a height of 32 nodes, as the trie was
+ * created with a prefix length of 32.
+ *
+ * The lookup starts at the root node. If the current node matches and if
+ * there is a child that can be used to become more specific, the trie is
+ * traversed downwards. The last node in the traversal that is a
+ * non-intermediate one is returned.
+ */
+
+static inline int extract_bit(const u8 *data, size_t index)
+{
+	return !!(data[index / 8] & (1 << (7 - (index % 8))));
+}
+
+/**
+ * longest_prefix_match() - determine the longest prefix
+ * @trie:	The trie to get internal sizes from
+ * @node:	The node to operate on
+ * @key:	The key to compare to @node
+ *
+ * Determine the longest prefix of @node that matches the bits in @key.
+ */
+static size_t longest_prefix_match(const struct lpm_trie *trie,
+				   const struct lpm_trie_node *node,
+				   const struct bpf_lpm_trie_key *key)
+{
+	size_t prefixlen = 0;
+	size_t i;
+
+	for (i = 0; i < trie->data_size; i++) {
+		size_t b;
+
+		b = 8 - fls(node->data[i] ^ key->data[i]);
+		prefixlen += b;
+
+		if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen)
+			return min(node->prefixlen, key->prefixlen);
+
+		if (b < 8)
+			break;
+	}
+
+	return prefixlen;
+}
+
+/* Called from syscall or from eBPF program */
+static void *trie_lookup_elem(struct bpf_map *map, void *_key)
+{
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct lpm_trie_node *node, *found = NULL;
+	struct bpf_lpm_trie_key *key = _key;
+
+	/* Start walking the trie from the root node ... */
+
+	for (node = rcu_dereference(trie->root); node;) {
+		unsigned int next_bit;
+		size_t matchlen;
+
+		/* Determine the longest prefix of @node that matches @key.
+		 * If it's the maximum possible prefix for this trie, we have
+		 * an exact match and can return it directly.
+		 */
+		matchlen = longest_prefix_match(trie, node, key);
+		if (matchlen == trie->max_prefixlen) {
+			found = node;
+			break;
+		}
+
+		/* If the number of bits that match is smaller than the prefix
+		 * length of @node, bail out and return the node we have seen
+		 * last in the traversal (ie, the parent).
+		 */
+		if (matchlen < node->prefixlen)
+			break;
+
+		/* Consider this node as return candidate unless it is an
+		 * artificially added intermediate one.
+		 */
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+			found = node;
+
+		/* If the node match is fully satisfied, let's see if we can
+		 * become more specific. Determine the next bit in the key and
+		 * traverse down.
+		 */
+		next_bit = extract_bit(key->data, node->prefixlen);
+		node = rcu_dereference(node->child[next_bit]);
+	}
+
+	if (!found)
+		return NULL;
+
+	return found->data + trie->data_size;
+}
+
+static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
+						 const void *value)
+{
+	struct lpm_trie_node *node;
+	size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
+
+	if (value)
+		size += trie->map.value_size;
+
+	node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
+	if (!node)
+		return NULL;
+
+	node->flags = 0;
+
+	if (value)
+		memcpy(node->data + trie->data_size, value,
+		       trie->map.value_size);
+
+	return node;
+}
+
+/* Called from syscall or from eBPF program */
+static int trie_update_elem(struct bpf_map *map,
+			    void *_key, void *value, u64 flags)
+{
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL;
+	struct lpm_trie_node __rcu **slot;
+	struct bpf_lpm_trie_key *key = _key;
+	unsigned long irq_flags;
+	unsigned int next_bit;
+	size_t matchlen = 0;
+	int ret = 0;
+
+	if (unlikely(flags > BPF_EXIST))
+		return -EINVAL;
+
+	if (key->prefixlen > trie->max_prefixlen)
+		return -EINVAL;
+
+	raw_spin_lock_irqsave(&trie->lock, irq_flags);
+
+	/* Allocate and fill a new node */
+
+	if (trie->n_entries == trie->map.max_entries) {
+		ret = -ENOSPC;
+		goto out;
+	}
+
+	new_node = lpm_trie_node_alloc(trie, value);
+	if (!new_node) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	trie->n_entries++;
+
+	new_node->prefixlen = key->prefixlen;
+	RCU_INIT_POINTER(new_node->child[0], NULL);
+	RCU_INIT_POINTER(new_node->child[1], NULL);
+	memcpy(new_node->data, key->data, trie->data_size);
+
+	/* Now find a slot to attach the new node. To do that, walk the tree
+	 * from the root and match as many bits as possible for each node until
+	 * we either find an empty slot or a slot that needs to be replaced by
+	 * an intermediate node.
+	 */
+	slot = &trie->root;
+
+	while ((node = rcu_dereference_protected(*slot,
+					lockdep_is_held(&trie->lock)))) {
+		matchlen = longest_prefix_match(trie, node, key);
+
+		if (node->prefixlen != matchlen ||
+		    node->prefixlen == key->prefixlen ||
+		    node->prefixlen == trie->max_prefixlen)
+			break;
+
+		next_bit = extract_bit(key->data, node->prefixlen);
+		slot = &node->child[next_bit];
+	}
+
+	/* If the slot is empty (a free child pointer or an empty root),
+	 * simply assign the @new_node to that slot and be done.
+	 */
+	if (!node) {
+		rcu_assign_pointer(*slot, new_node);
+		goto out;
+	}
+
+	/* If the slot we picked already exists, replace it with @new_node
+	 * which already has the correct data array set.
+	 */
+	if (node->prefixlen == matchlen) {
+		new_node->child[0] = node->child[0];
+		new_node->child[1] = node->child[1];
+
+		if (!(node->flags & LPM_TREE_NODE_FLAG_IM))
+			trie->n_entries--;
+
+		rcu_assign_pointer(*slot, new_node);
+		kfree_rcu(node, rcu);
+
+		goto out;
+	}
+
+	/* If the new node matches the prefix completely, it must be inserted
+	 * as an ancestor. Simply insert it between @node and *@slot.
+	 */
+	if (matchlen == key->prefixlen) {
+		next_bit = extract_bit(node->data, matchlen);
+		rcu_assign_pointer(new_node->child[next_bit], node);
+		rcu_assign_pointer(*slot, new_node);
+		goto out;
+	}
+
+	im_node = lpm_trie_node_alloc(trie, NULL);
+	if (!im_node) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	im_node->prefixlen = matchlen;
+	im_node->flags |= LPM_TREE_NODE_FLAG_IM;
+	memcpy(im_node->data, node->data, trie->data_size);
+
+	/* Now determine which child to install in which slot */
+	if (extract_bit(key->data, matchlen)) {
+		rcu_assign_pointer(im_node->child[0], node);
+		rcu_assign_pointer(im_node->child[1], new_node);
+	} else {
+		rcu_assign_pointer(im_node->child[0], new_node);
+		rcu_assign_pointer(im_node->child[1], node);
+	}
+
+	/* Finally, assign the intermediate node to the determined spot */
+	rcu_assign_pointer(*slot, im_node);
+
+out:
+	if (ret) {
+		if (new_node)
+			trie->n_entries--;
+
+		kfree(new_node);
+		kfree(im_node);
+	}
+
+	raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
+
+	return ret;
+}
+
+static int trie_delete_elem(struct bpf_map *map, void *key)
+{
+	/* TODO */
+	return -ENOSYS;
+}
+
+#define LPM_DATA_SIZE_MAX	256
+#define LPM_DATA_SIZE_MIN	1
+
+#define LPM_VAL_SIZE_MAX	(KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \
+				 sizeof(struct lpm_trie_node))
+#define LPM_VAL_SIZE_MIN	1
+
+#define LPM_KEY_SIZE(X)		(sizeof(struct bpf_lpm_trie_key) + (X))
+#define LPM_KEY_SIZE_MAX	LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
+#define LPM_KEY_SIZE_MIN	LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
+
+static struct bpf_map *trie_alloc(union bpf_attr *attr)
+{
+	struct lpm_trie *trie;
+	u64 cost = sizeof(*trie), cost_per_node;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	/* check sanity of attributes */
+	if (attr->max_entries == 0 ||
+	    attr->map_flags != BPF_F_NO_PREALLOC ||
+	    attr->key_size < LPM_KEY_SIZE_MIN ||
+	    attr->key_size > LPM_KEY_SIZE_MAX ||
+	    attr->value_size < LPM_VAL_SIZE_MIN ||
+	    attr->value_size > LPM_VAL_SIZE_MAX)
+		return ERR_PTR(-EINVAL);
+
+	trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+	if (!trie)
+		return ERR_PTR(-ENOMEM);
+
+	/* copy mandatory map attributes */
+	trie->map.map_type = attr->map_type;
+	trie->map.key_size = attr->key_size;
+	trie->map.value_size = attr->value_size;
+	trie->map.max_entries = attr->max_entries;
+	trie->data_size = attr->key_size -
+			  offsetof(struct bpf_lpm_trie_key, data);
+	trie->max_prefixlen = trie->data_size * 8;
+
+	cost_per_node = sizeof(struct lpm_trie_node) +
+			attr->value_size + trie->data_size;
+	cost += (u64) attr->max_entries * cost_per_node;
+	if (cost >= U32_MAX - PAGE_SIZE) {
+		ret = -E2BIG;
+		goto out_err;
+	}
+
+	trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
+
+	ret = bpf_map_precharge_memlock(trie->map.pages);
+	if (ret)
+		goto out_err;
+
+	raw_spin_lock_init(&trie->lock);
+
+	return &trie->map;
+out_err:
+	kfree(trie);
+	return ERR_PTR(ret);
+}
+
+static void trie_free(struct bpf_map *map)
+{
+	struct lpm_trie *trie = container_of(map, struct lpm_trie, map);
+	struct lpm_trie_node __rcu **slot;
+	struct lpm_trie_node *node;
+
+	raw_spin_lock(&trie->lock);
+
+	/* Always start at the root and walk down to a node that has no
+	 * children. Then free that node, nullify its reference in the parent
+	 * and start over.
+	 */
+
+	for (;;) {
+		slot = &trie->root;
+
+		for (;;) {
+			node = rcu_dereference_protected(*slot,
+					lockdep_is_held(&trie->lock));
+			if (!node)
+				goto unlock;
+
+			if (rcu_access_pointer(node->child[0])) {
+				slot = &node->child[0];
+				continue;
+			}
+
+			if (rcu_access_pointer(node->child[1])) {
+				slot = &node->child[1];
+				continue;
+			}
+
+			kfree(node);
+			RCU_INIT_POINTER(*slot, NULL);
+			break;
+		}
+	}
+
+unlock:
+	raw_spin_unlock(&trie->lock);
+}
+
+static const struct bpf_map_ops trie_ops = {
+	.map_alloc = trie_alloc,
+	.map_free = trie_free,
+	.map_lookup_elem = trie_lookup_elem,
+	.map_update_elem = trie_update_elem,
+	.map_delete_elem = trie_delete_elem,
+};
+
+static struct bpf_map_type_list trie_type __read_mostly = {
+	.ops = &trie_ops,
+	.type = BPF_MAP_TYPE_LPM_TRIE,
+};
+
+static int __init register_trie_map(void)
+{
+	bpf_register_map_type(&trie_type);
+	return 0;
+}
+late_initcall(register_trie_map);
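For a sense of how the new map type is driven from user space, here is a hypothetical sketch against the raw bpf(2) syscall (error handling omitted; as trie_alloc() above shows, CAP_SYS_ADMIN and BPF_F_NO_PREALLOC are mandatory). The local struct mirrors the uapi key layout, a u32 prefix length followed by big-endian address bytes, and lookups pass a full-length prefix:

/* Store a value for 192.168.0.0/16, then look up 192.168.1.99,
 * which should longest-prefix-match that entry.
 */
#include <arpa/inet.h>
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

struct lpm_key {
	__u32 prefixlen;
	__u8  data[4];			/* one IPv4 address, big endian */
};

static int sys_bpf(int cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

int main(void)
{
	union bpf_attr attr;
	struct lpm_key key;
	__u64 value = 1, out;
	int map_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_LPM_TRIE;
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(value);
	attr.max_entries = 16;
	attr.map_flags = BPF_F_NO_PREALLOC;	/* required by trie_alloc() */
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr);

	key.prefixlen = 16;			/* the stored range */
	inet_pton(AF_INET, "192.168.0.0", key.data);
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&value;
	sys_bpf(BPF_MAP_UPDATE_ELEM, &attr);

	key.prefixlen = 32;			/* full address for lookups */
	inet_pton(AF_INET, "192.168.1.99", key.data);
	attr.value = (__u64)(unsigned long)&out;
	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr);	/* 0 on a match */
}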
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bbb016adbaeb..f74ca17af64a 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -10,6 +10,7 @@
  * General Public License for more details.
  */
 #include <linux/bpf.h>
+#include <linux/bpf_trace.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
@@ -241,6 +242,7 @@ static int map_create(union bpf_attr *attr)
 		/* failed to allocate fd */
 		goto free_map;
 
+	trace_bpf_map_create(map, err);
 	return err;
 
 free_map:
@@ -365,6 +367,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 	if (copy_to_user(uvalue, value, value_size) != 0)
 		goto free_value;
 
+	trace_bpf_map_lookup_elem(map, ufd, key, value);
 	err = 0;
 
 free_value:
@@ -447,6 +450,8 @@ static int map_update_elem(union bpf_attr *attr)
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
 
+	if (!err)
+		trace_bpf_map_update_elem(map, ufd, key, value);
 free_value:
 	kfree(value);
 free_key:
@@ -492,6 +497,8 @@ static int map_delete_elem(union bpf_attr *attr)
 	__this_cpu_dec(bpf_prog_active);
 	preempt_enable();
 
+	if (!err)
+		trace_bpf_map_delete_elem(map, ufd, key);
 free_key:
 	kfree(key);
 err_put:
@@ -544,6 +551,7 @@ static int map_get_next_key(union bpf_attr *attr)
 	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
 		goto free_next_key;
 
+	trace_bpf_map_next_key(map, ufd, key, next_key);
 	err = 0;
 
 free_next_key:
@@ -697,8 +705,10 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 
 void bpf_prog_put(struct bpf_prog *prog)
 {
-	if (atomic_dec_and_test(&prog->aux->refcnt))
+	if (atomic_dec_and_test(&prog->aux->refcnt)) {
+		trace_bpf_prog_put_rcu(prog);
 		call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
+	}
 }
 EXPORT_SYMBOL_GPL(bpf_prog_put);
 
@@ -807,7 +817,11 @@ struct bpf_prog *bpf_prog_get(u32 ufd)
 
 struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type)
 {
-	return __bpf_prog_get(ufd, &type);
+	struct bpf_prog *prog = __bpf_prog_get(ufd, &type);
+
+	if (!IS_ERR(prog))
+		trace_bpf_prog_get_type(prog);
+	return prog;
 }
 EXPORT_SYMBOL_GPL(bpf_prog_get_type);
 
@@ -889,6 +903,7 @@ static int bpf_prog_load(union bpf_attr *attr)
 		/* failed to allocate fd */
 		goto free_used_maps;
 
+	trace_bpf_prog_load(prog, err);
 	return err;
 
 free_used_maps:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cdc43b899f28..d2bded2b250c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -481,6 +481,13 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
 	regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
 }
 
+static void mark_reg_unknown_value_and_range(struct bpf_reg_state *regs,
+					     u32 regno)
+{
+	mark_reg_unknown_value(regs, regno);
+	reset_reg_range_values(regs, regno);
+}
+
 enum reg_arg_type {
 	SRC_OP,		/* register is used as source operand */
 	DST_OP,		/* register is used as destination operand */
@@ -532,6 +539,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
 	switch (type) {
 	case PTR_TO_MAP_VALUE:
 	case PTR_TO_MAP_VALUE_OR_NULL:
+	case PTR_TO_MAP_VALUE_ADJ:
 	case PTR_TO_STACK:
 	case PTR_TO_CTX:
 	case PTR_TO_PACKET:
@@ -616,7 +624,8 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
 		}
 		if (value_regno >= 0)
 			/* have read misc data from the stack */
-			mark_reg_unknown_value(state->regs, value_regno);
+			mark_reg_unknown_value_and_range(state->regs,
+							 value_regno);
 		return 0;
 	}
 }
@@ -627,7 +636,7 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 {
 	struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
 
-	if (off < 0 || off + size > map->value_size) {
+	if (off < 0 || size <= 0 || off + size > map->value_size) {
 		verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
 			map->value_size, off, size);
 		return -EACCES;
@@ -635,6 +644,51 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
 	return 0;
 }
 
+/* check read/write into an adjusted map element */
+static int check_map_access_adj(struct bpf_verifier_env *env, u32 regno,
+				int off, int size)
+{
+	struct bpf_verifier_state *state = &env->cur_state;
+	struct bpf_reg_state *reg = &state->regs[regno];
+	int err;
+
+	/* We adjusted the register to this map value, so we
+	 * need to change off and size to min_value and max_value
+	 * respectively to make sure our theoretical access will be
+	 * safe.
+	 */
+	if (log_level)
+		print_verifier_state(state);
+	env->varlen_map_value_access = true;
+	/* The minimum value is only important with signed
+	 * comparisons where we can't assume the floor of a
+	 * value is 0. If we are using signed variables for our
+	 * index'es we need to make sure that whatever we use
+	 * will have a set floor within our range.
+	 */
+	if (reg->min_value < 0) {
+		verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
+			regno);
+		return -EACCES;
+	}
+	err = check_map_access(env, regno, reg->min_value + off, size);
+	if (err) {
+		verbose("R%d min value is outside of the array range\n",
+			regno);
+		return err;
+	}
+
+	/* If we haven't set a max value then we need to bail
+	 * since we can't be sure we won't do bad things.
+	 */
+	if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+		verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
+			regno);
+		return -EACCES;
+	}
+	return check_map_access(env, regno, reg->max_value + off, size);
+}
+
 #define MAX_PACKET_OFF 0xffff
 
 static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
@@ -647,6 +701,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
 		/* dst_input() and dst_output() can't write for now */
 		if (t == BPF_WRITE)
 			return false;
+		/* fallthrough */
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
 	case BPF_PROG_TYPE_XDP:
@@ -775,47 +830,13 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 			return -EACCES;
 		}
 
-		/* If we adjusted the register to this map value at all then we
-		 * need to change off and size to min_value and max_value
-		 * respectively to make sure our theoretical access will be
-		 * safe.
-		 */
-		if (reg->type == PTR_TO_MAP_VALUE_ADJ) {
-			if (log_level)
-				print_verifier_state(state);
-			env->varlen_map_value_access = true;
-			/* The minimum value is only important with signed
-			 * comparisons where we can't assume the floor of a
-			 * value is 0. If we are using signed variables for our
-			 * index'es we need to make sure that whatever we use
-			 * will have a set floor within our range.
-			 */
-			if (reg->min_value < 0) {
-				verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
-					regno);
-				return -EACCES;
-			}
-			err = check_map_access(env, regno, reg->min_value + off,
-					       size);
-			if (err) {
-				verbose("R%d min value is outside of the array range\n",
-					regno);
-				return err;
-			}
-
-			/* If we haven't set a max value then we need to bail
-			 * since we can't be sure we won't do bad things.
-			 */
-			if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
-				verbose("R%d unbounded memory access, make sure to bounds check any array access into a map\n",
-					regno);
-				return -EACCES;
-			}
-			off += reg->max_value;
-		}
-		err = check_map_access(env, regno, off, size);
+		if (reg->type == PTR_TO_MAP_VALUE_ADJ)
+			err = check_map_access_adj(env, regno, off, size);
+		else
+			err = check_map_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown_value(state->regs, value_regno);
+			mark_reg_unknown_value_and_range(state->regs,
+							 value_regno);
 
 	} else if (reg->type == PTR_TO_CTX) {
 		enum bpf_reg_type reg_type = UNKNOWN_VALUE;
@@ -827,7 +848,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 		}
 		err = check_ctx_access(env, off, size, t, &reg_type);
 		if (!err && t == BPF_READ && value_regno >= 0) {
-			mark_reg_unknown_value(state->regs, value_regno);
+			mark_reg_unknown_value_and_range(state->regs,
+							 value_regno);
 			/* note that reg.[id|off|range] == 0 */
 			state->regs[value_regno].type = reg_type;
 		}
@@ -860,7 +882,8 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
 		}
 		err = check_packet_access(env, regno, off, size);
 		if (!err && t == BPF_READ && value_regno >= 0)
-			mark_reg_unknown_value(state->regs, value_regno);
+			mark_reg_unknown_value_and_range(state->regs,
+							 value_regno);
 	} else {
 		verbose("R%d invalid mem access '%s'\n",
 			regno, reg_type_str[reg->type]);
@@ -958,6 +981,25 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
 	return 0;
 }
 
+static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
+				   int access_size, bool zero_size_allowed,
+				   struct bpf_call_arg_meta *meta)
+{
+	struct bpf_reg_state *regs = env->cur_state.regs;
+
+	switch (regs[regno].type) {
+	case PTR_TO_PACKET:
+		return check_packet_access(env, regno, 0, access_size);
+	case PTR_TO_MAP_VALUE:
+		return check_map_access(env, regno, 0, access_size);
+	case PTR_TO_MAP_VALUE_ADJ:
+		return check_map_access_adj(env, regno, 0, access_size);
+	default: /* const_imm|ptr_to_stack or invalid ptr */
+		return check_stack_boundary(env, regno, access_size,
+					    zero_size_allowed, meta);
+	}
+}
+
 static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			  enum bpf_arg_type arg_type,
 			  struct bpf_call_arg_meta *meta)
@@ -993,10 +1035,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_STACK;
 		if (type != PTR_TO_PACKET && type != expected_type)
 			goto err_type;
-	} else if (arg_type == ARG_CONST_STACK_SIZE ||
-		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
+	} else if (arg_type == ARG_CONST_SIZE ||
+		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
 		expected_type = CONST_IMM;
-		if (type != expected_type)
+		/* One exception. Allow UNKNOWN_VALUE registers when the
+		 * boundaries are known and don't cause unsafe memory accesses
+		 */
+		if (type != UNKNOWN_VALUE && type != expected_type)
 			goto err_type;
 	} else if (arg_type == ARG_CONST_MAP_PTR) {
 		expected_type = CONST_PTR_TO_MAP;
@@ -1006,8 +1051,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		expected_type = PTR_TO_CTX;
 		if (type != expected_type)
 			goto err_type;
-	} else if (arg_type == ARG_PTR_TO_STACK ||
-		   arg_type == ARG_PTR_TO_RAW_STACK) {
+	} else if (arg_type == ARG_PTR_TO_MEM ||
+		   arg_type == ARG_PTR_TO_UNINIT_MEM) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
 		 * passed in as argument, it's a CONST_IMM type. Final test
@@ -1015,9 +1060,10 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (type == CONST_IMM && reg->imm == 0)
 			/* final test in check_stack_boundary() */;
-		else if (type != PTR_TO_PACKET && type != expected_type)
+		else if (type != PTR_TO_PACKET && type != PTR_TO_MAP_VALUE &&
+			 type != PTR_TO_MAP_VALUE_ADJ && type != expected_type)
 			goto err_type;
-		meta->raw_mode = arg_type == ARG_PTR_TO_RAW_STACK;
+		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
 	} else {
 		verbose("unsupported arg_type %d\n", arg_type);
 		return -EFAULT;
@@ -1063,9 +1109,9 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		err = check_stack_boundary(env, regno,
 					   meta->map_ptr->value_size,
 					   false, NULL);
-	} else if (arg_type == ARG_CONST_STACK_SIZE ||
-		   arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) {
-		bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO);
+	} else if (arg_type == ARG_CONST_SIZE ||
+		   arg_type == ARG_CONST_SIZE_OR_ZERO) {
+		bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
 
 		/* bpf_xxx(..., buf, len) call will access 'len' bytes
 		 * from stack pointer 'buf'. Check it
@@ -1073,14 +1119,50 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 		 */
 		if (regno == 0) {
 			/* kernel subsystem misconfigured verifier */
-			verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+			verbose("ARG_CONST_SIZE cannot be first argument\n");
 			return -EACCES;
 		}
-		if (regs[regno - 1].type == PTR_TO_PACKET)
-			err = check_packet_access(env, regno - 1, 0, reg->imm);
-		else
-			err = check_stack_boundary(env, regno - 1, reg->imm,
-						   zero_size_allowed, meta);
+
+		/* If the register is UNKNOWN_VALUE, the access check happens
+		 * using its boundaries. Otherwise, just use its imm
+		 */
+		if (type == UNKNOWN_VALUE) {
+			/* For unprivileged variable accesses, disable raw
+			 * mode so that the program is required to
+			 * initialize all the memory that the helper could
+			 * just partially fill up.
+			 */
+			meta = NULL;
+
+			if (reg->min_value < 0) {
+				verbose("R%d min value is negative, either use unsigned or 'var &= const'\n",
+					regno);
+				return -EACCES;
+			}
+
+			if (reg->min_value == 0) {
+				err = check_helper_mem_access(env, regno - 1, 0,
+							      zero_size_allowed,
+							      meta);
+				if (err)
+					return err;
+			}
+
+			if (reg->max_value == BPF_REGISTER_MAX_RANGE) {
+				verbose("R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n",
+					regno);
+				return -EACCES;
+			}
+			err = check_helper_mem_access(env, regno - 1,
+						      reg->max_value,
+						      zero_size_allowed, meta);
+			if (err)
+				return err;
+		} else {
+			/* register is CONST_IMM */
+			err = check_helper_mem_access(env, regno - 1, reg->imm,
+						      zero_size_allowed, meta);
+		}
 	}
 
 	return err;
@@ -1154,15 +1236,15 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
 {
 	int count = 0;
 
-	if (fn->arg1_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg2_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg3_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg4_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
-	if (fn->arg5_type == ARG_PTR_TO_RAW_STACK)
+	if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM)
 		count++;
 
 	return count > 1 ? -EINVAL : 0;
@@ -1316,7 +1398,7 @@ static int check_packet_ptr_add(struct bpf_verifier_env *env,
 		imm = insn->imm;
 
 add_imm:
-		if (imm <= 0) {
+		if (imm < 0) {
 			verbose("addition of negative constant to packet pointer is not allowed\n");
 			return -EACCES;
 		}
@@ -1485,22 +1567,54 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
 	struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
 	struct bpf_reg_state *src_reg = &regs[insn->src_reg];
 	u8 opcode = BPF_OP(insn->code);
+	u64 dst_imm = dst_reg->imm;
 
-	/* dst_reg->type == CONST_IMM here, simulate execution of 'add'/'or'
-	 * insn. Don't care about overflow or negative values, just add them
+	/* dst_reg->type == CONST_IMM here. Simulate execution of insns
+	 * containing ALU ops. Don't care about overflow or negative
+	 * values, just add/sub/... them; registers are in u64.
 	 */
-	if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K)
-		dst_reg->imm += insn->imm;
-	else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
-		 src_reg->type == CONST_IMM)
-		dst_reg->imm += src_reg->imm;
-	else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K)
-		dst_reg->imm |= insn->imm;
-	else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
-		 src_reg->type == CONST_IMM)
-		dst_reg->imm |= src_reg->imm;
-	else
+	if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm += insn->imm;
+	} else if (opcode == BPF_ADD && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm += src_reg->imm;
+	} else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm -= insn->imm;
+	} else if (opcode == BPF_SUB && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm -= src_reg->imm;
+	} else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm *= insn->imm;
+	} else if (opcode == BPF_MUL && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm *= src_reg->imm;
+	} else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm |= insn->imm;
+	} else if (opcode == BPF_OR && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm |= src_reg->imm;
+	} else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm &= insn->imm;
+	} else if (opcode == BPF_AND && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm &= src_reg->imm;
+	} else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm >>= insn->imm;
+	} else if (opcode == BPF_RSH && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm >>= src_reg->imm;
+	} else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_K) {
+		dst_imm <<= insn->imm;
+	} else if (opcode == BPF_LSH && BPF_SRC(insn->code) == BPF_X &&
+		   src_reg->type == CONST_IMM) {
+		dst_imm <<= src_reg->imm;
+	} else {
 		mark_reg_unknown_value(regs, insn->dst_reg);
+		goto out;
+	}
+
+	dst_reg->imm = dst_imm;
+out:
 	return 0;
 }
 
@@ -1894,6 +2008,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 	case BPF_JGT:
 		/* Unsigned comparison, the minimum value is 0. */
 		false_reg->min_value = 0;
+		/* fallthrough */
 	case BPF_JSGT:
 		/* If this is false then we know the maximum val is val,
 		 * otherwise we know the min val is val+1.
@@ -1904,6 +2019,7 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
 	case BPF_JGE:
 		/* Unsigned comparison, the minimum value is 0. */
 		false_reg->min_value = 0;
+		/* fallthrough */
 	case BPF_JSGE:
 		/* If this is false then we know the maximum value is val - 1,
 		 * otherwise we know the minimum value is val.
@@ -1942,6 +2058,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, | |||
1942 | case BPF_JGT: | 2058 | case BPF_JGT: |
1943 | /* Unsigned comparison, the minimum value is 0. */ | 2059 | /* Unsigned comparison, the minimum value is 0. */ |
1944 | true_reg->min_value = 0; | 2060 | true_reg->min_value = 0; |
2061 | /* fallthrough */ | ||
1945 | case BPF_JSGT: | 2062 | case BPF_JSGT: |
1946 | /* | 2063 | /* |
1947 | * If this is false, then the val is <= the register, if it is | 2064 | * If this is false, then the val is <= the register, if it is |
@@ -1953,6 +2070,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, | |||
1953 | case BPF_JGE: | 2070 | case BPF_JGE: |
1954 | /* Unsigned comparison, the minimum value is 0. */ | 2071 | /* Unsigned comparison, the minimum value is 0. */ |
1955 | true_reg->min_value = 0; | 2072 | true_reg->min_value = 0; |
2073 | /* fallthrough */ | ||
1956 | case BPF_JSGE: | 2074 | case BPF_JSGE: |
1957 | /* If this is false then constant < register, if it is true then | 2075 | /* If this is false then constant < register, if it is true then |
1958 | * the register < constant. | 2076 | * the register < constant. |
@@ -2144,14 +2262,8 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
2144 | return err; | 2262 | return err; |
2145 | 2263 | ||
2146 | if (insn->src_reg == 0) { | 2264 | if (insn->src_reg == 0) { |
2147 | /* generic move 64-bit immediate into a register, | ||
2148 | * only analyzer needs to collect the ld_imm value. | ||
2149 | */ | ||
2150 | u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; | 2265 | u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; |
2151 | 2266 | ||
2152 | if (!env->analyzer_ops) | ||
2153 | return 0; | ||
2154 | |||
2155 | regs[insn->dst_reg].type = CONST_IMM; | 2267 | regs[insn->dst_reg].type = CONST_IMM; |
2156 | regs[insn->dst_reg].imm = imm; | 2268 | regs[insn->dst_reg].imm = imm; |
2157 | return 0; | 2269 | return 0; |
@@ -2729,7 +2841,6 @@ static int do_check(struct bpf_verifier_env *env) | |||
2729 | if (err) | 2841 | if (err) |
2730 | return err; | 2842 | return err; |
2731 | 2843 | ||
2732 | reset_reg_range_values(regs, insn->dst_reg); | ||
2733 | if (BPF_SIZE(insn->code) != BPF_W && | 2844 | if (BPF_SIZE(insn->code) != BPF_W && |
2734 | BPF_SIZE(insn->code) != BPF_DW) { | 2845 | BPF_SIZE(insn->code) != BPF_DW) { |
2735 | insn_idx++; | 2846 | insn_idx++; |
@@ -3085,10 +3196,14 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) | |||
3085 | insn = env->prog->insnsi + delta; | 3196 | insn = env->prog->insnsi + delta; |
3086 | 3197 | ||
3087 | for (i = 0; i < insn_cnt; i++, insn++) { | 3198 | for (i = 0; i < insn_cnt; i++, insn++) { |
3088 | if (insn->code == (BPF_LDX | BPF_MEM | BPF_W) || | 3199 | if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || |
3200 | insn->code == (BPF_LDX | BPF_MEM | BPF_H) || | ||
3201 | insn->code == (BPF_LDX | BPF_MEM | BPF_W) || | ||
3089 | insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) | 3202 | insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) |
3090 | type = BPF_READ; | 3203 | type = BPF_READ; |
3091 | else if (insn->code == (BPF_STX | BPF_MEM | BPF_W) || | 3204 | else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || |
3205 | insn->code == (BPF_STX | BPF_MEM | BPF_H) || | ||
3206 | insn->code == (BPF_STX | BPF_MEM | BPF_W) || | ||
3092 | insn->code == (BPF_STX | BPF_MEM | BPF_DW)) | 3207 | insn->code == (BPF_STX | BPF_MEM | BPF_DW)) |
3093 | type = BPF_WRITE; | 3208 | type = BPF_WRITE; |
3094 | else | 3209 | else |
@@ -3097,8 +3212,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) | |||
3097 | if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) | 3212 | if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) |
3098 | continue; | 3213 | continue; |
3099 | 3214 | ||
3100 | cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg, | 3215 | cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); |
3101 | insn->off, insn_buf, env->prog); | ||
3102 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { | 3216 | if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { |
3103 | verbose("bpf verifier is misconfigured\n"); | 3217 | verbose("bpf verifier is misconfigured\n"); |
3104 | return -EINVAL; | 3218 | return -EINVAL; |
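
Handing the callback the whole struct bpf_insn means dst_reg, src_reg and off travel together instead of as loose parameters. A compilable sketch of a converter in the new style; demo_ctx and demo_ctx_kern are illustrative layouts, not kernel types, and the BPF_LDX_MEM() macro is copied locally since include/linux/filter.h is not exported to userspace:

    #include <stddef.h>
    #include <stdint.h>
    #include <linux/bpf.h>  /* struct bpf_insn, BPF_LDX, BPF_MEM, BPF_W */

    struct bpf_prog;                        /* opaque in this sketch */
    enum bpf_access_type { BPF_READ, BPF_WRITE };

    /* Hypothetical program-visible and kernel-side context layouts. */
    struct demo_ctx      { uint32_t len; };
    struct demo_ctx_kern { uint64_t cookie; uint32_t real_len; };

    #define BPF_LDX_MEM(SIZE, DST, SRC, OFF)                \
        ((struct bpf_insn) {                                \
            .code  = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM,    \
            .dst_reg = DST,                                 \
            .src_reg = SRC,                                 \
            .off   = OFF,                                   \
            .imm   = 0 })

    static uint32_t demo_convert_ctx_access(enum bpf_access_type type,
                                            const struct bpf_insn *si,
                                            struct bpf_insn *insn_buf,
                                            struct bpf_prog *prog)
    {
        struct bpf_insn *insn = insn_buf;

        switch (si->off) {
        case offsetof(struct demo_ctx, len):
            /* Rewrite the program's load of ctx->len into a load from
             * the kernel-side offset; every operand now comes from the
             * instruction itself.
             */
            *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
                                  offsetof(struct demo_ctx_kern, real_len));
            break;
        }
        return insn - insn_buf;
    }
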
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index fa77311dadb2..424daa4586d1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
@@ -76,8 +76,8 @@ static const struct bpf_func_proto bpf_probe_read_proto = { | |||
76 | .func = bpf_probe_read, | 76 | .func = bpf_probe_read, |
77 | .gpl_only = true, | 77 | .gpl_only = true, |
78 | .ret_type = RET_INTEGER, | 78 | .ret_type = RET_INTEGER, |
79 | .arg1_type = ARG_PTR_TO_RAW_STACK, | 79 | .arg1_type = ARG_PTR_TO_UNINIT_MEM, |
80 | .arg2_type = ARG_CONST_STACK_SIZE, | 80 | .arg2_type = ARG_CONST_SIZE, |
81 | .arg3_type = ARG_ANYTHING, | 81 | .arg3_type = ARG_ANYTHING, |
82 | }; | 82 | }; |
83 | 83 | ||
@@ -109,8 +109,8 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { | |||
109 | .gpl_only = true, | 109 | .gpl_only = true, |
110 | .ret_type = RET_INTEGER, | 110 | .ret_type = RET_INTEGER, |
111 | .arg1_type = ARG_ANYTHING, | 111 | .arg1_type = ARG_ANYTHING, |
112 | .arg2_type = ARG_PTR_TO_STACK, | 112 | .arg2_type = ARG_PTR_TO_MEM, |
113 | .arg3_type = ARG_CONST_STACK_SIZE, | 113 | .arg3_type = ARG_CONST_SIZE, |
114 | }; | 114 | }; |
115 | 115 | ||
116 | static const struct bpf_func_proto *bpf_get_probe_write_proto(void) | 116 | static const struct bpf_func_proto *bpf_get_probe_write_proto(void) |
@@ -213,8 +213,8 @@ static const struct bpf_func_proto bpf_trace_printk_proto = { | |||
213 | .func = bpf_trace_printk, | 213 | .func = bpf_trace_printk, |
214 | .gpl_only = true, | 214 | .gpl_only = true, |
215 | .ret_type = RET_INTEGER, | 215 | .ret_type = RET_INTEGER, |
216 | .arg1_type = ARG_PTR_TO_STACK, | 216 | .arg1_type = ARG_PTR_TO_MEM, |
217 | .arg2_type = ARG_CONST_STACK_SIZE, | 217 | .arg2_type = ARG_CONST_SIZE, |
218 | }; | 218 | }; |
219 | 219 | ||
220 | const struct bpf_func_proto *bpf_get_trace_printk_proto(void) | 220 | const struct bpf_func_proto *bpf_get_trace_printk_proto(void) |
@@ -329,8 +329,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { | |||
329 | .arg1_type = ARG_PTR_TO_CTX, | 329 | .arg1_type = ARG_PTR_TO_CTX, |
330 | .arg2_type = ARG_CONST_MAP_PTR, | 330 | .arg2_type = ARG_CONST_MAP_PTR, |
331 | .arg3_type = ARG_ANYTHING, | 331 | .arg3_type = ARG_ANYTHING, |
332 | .arg4_type = ARG_PTR_TO_STACK, | 332 | .arg4_type = ARG_PTR_TO_MEM, |
333 | .arg5_type = ARG_CONST_STACK_SIZE, | 333 | .arg5_type = ARG_CONST_SIZE, |
334 | }; | 334 | }; |
335 | 335 | ||
336 | static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); | 336 | static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); |
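
Throughout this file, ARG_PTR_TO_STACK/ARG_CONST_STACK_SIZE become ARG_PTR_TO_MEM/ARG_CONST_SIZE: the helpers now take any verifier-known memory, not just BPF stack slots. One practical payoff is feeding bpf_perf_event_output() straight from a map value, which sidesteps the 512 byte stack limit. A sketch in samples/bpf style; map names and the attach point are illustrative, and the helper declarations are assumed to come from bpf_helpers.h:

    #include <uapi/linux/bpf.h>
    #include <uapi/linux/ptrace.h>
    #include "bpf_helpers.h"    /* samples/bpf declarations, assumed */

    struct bpf_map_def SEC("maps") events = {
        .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
        .key_size = sizeof(int),
        .value_size = sizeof(__u32),
        .max_entries = 64,
    };

    struct bpf_map_def SEC("maps") scratch = {
        .type = BPF_MAP_TYPE_PERCPU_ARRAY,
        .key_size = sizeof(__u32),
        .value_size = 1024,
        .max_entries = 1,
    };

    SEC("kprobe/some_traced_func")  /* hypothetical attach point */
    int probe(struct pt_regs *ctx)
    {
        __u32 key = 0;
        void *buf = bpf_map_lookup_elem(&scratch, &key);

        if (!buf)
            return 0;
        /* arg4 previously had to be ARG_PTR_TO_STACK; as
         * ARG_PTR_TO_MEM a map value is accepted too, so payloads
         * larger than the BPF stack can be emitted.
         */
        bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, buf, 1024);
        return 0;
    }

    char _license[] SEC("license") = "GPL";
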
@@ -395,6 +395,36 @@ static const struct bpf_func_proto bpf_current_task_under_cgroup_proto = { | |||
395 | .arg2_type = ARG_ANYTHING, | 395 | .arg2_type = ARG_ANYTHING, |
396 | }; | 396 | }; |
397 | 397 | ||
398 | BPF_CALL_3(bpf_probe_read_str, void *, dst, u32, size, | ||
399 | const void *, unsafe_ptr) | ||
400 | { | ||
401 | int ret; | ||
402 | |||
403 | /* | ||
404 | * The strncpy_from_unsafe() call will likely not fill the entire | ||
405 | * buffer, but that's okay in this circumstance: we're probing | ||
406 | * arbitrary memory anyway, much like bpf_probe_read(), and might | ||
407 | * as well probe the stack. Thus, memory is explicitly cleared | ||
408 | * only in the error case, so that improper users who ignore the | ||
409 | * return code don't copy garbage; otherwise the string length is | ||
410 | * returned, which can be used for bpf_perf_event_output() et al. | ||
411 | */ | ||
412 | ret = strncpy_from_unsafe(dst, unsafe_ptr, size); | ||
413 | if (unlikely(ret < 0)) | ||
414 | memset(dst, 0, size); | ||
415 | |||
416 | return ret; | ||
417 | } | ||
418 | |||
419 | static const struct bpf_func_proto bpf_probe_read_str_proto = { | ||
420 | .func = bpf_probe_read_str, | ||
421 | .gpl_only = true, | ||
422 | .ret_type = RET_INTEGER, | ||
423 | .arg1_type = ARG_PTR_TO_UNINIT_MEM, | ||
424 | .arg2_type = ARG_CONST_SIZE, | ||
425 | .arg3_type = ARG_ANYTHING, | ||
426 | }; | ||
427 | |||
398 | static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | 428 | static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) |
399 | { | 429 | { |
400 | switch (func_id) { | 430 | switch (func_id) { |
@@ -432,6 +462,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) | |||
432 | return &bpf_current_task_under_cgroup_proto; | 462 | return &bpf_current_task_under_cgroup_proto; |
433 | case BPF_FUNC_get_prandom_u32: | 463 | case BPF_FUNC_get_prandom_u32: |
434 | return &bpf_get_prandom_u32_proto; | 464 | return &bpf_get_prandom_u32_proto; |
465 | case BPF_FUNC_probe_read_str: | ||
466 | return &bpf_probe_read_str_proto; | ||
435 | default: | 467 | default: |
436 | return NULL; | 468 | return NULL; |
437 | } | 469 | } |
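
Because the helper returns the string length including the trailing NUL on success (and zeroes the buffer on fault), callers can both test for errors and size follow-up output precisely. A usage sketch in samples/bpf style; the kprobe target and format string are illustrative, and bpf_probe_read_str() is assumed declared in bpf_helpers.h:

    #include <uapi/linux/bpf.h>
    #include <uapi/linux/ptrace.h>
    #include "bpf_helpers.h"    /* samples/bpf declarations, assumed */

    SEC("kprobe/do_sys_open")
    int trace_open(struct pt_regs *ctx)
    {
        char fname[64];
        char fmt[] = "open: %s\n";
        /* Strictly positive length including the trailing NUL on
         * success; negative error (with a zeroed buffer) on fault.
         */
        int len = bpf_probe_read_str(fname, sizeof(fname),
                                     (void *)PT_REGS_PARM2(ctx));

        if (len > 0)
            bpf_trace_printk(fmt, sizeof(fmt), fname);
        return 0;
    }

    char _license[] SEC("license") = "GPL";
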
@@ -459,6 +491,13 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type | |||
459 | return false; | 491 | return false; |
460 | if (off % size != 0) | 492 | if (off % size != 0) |
461 | return false; | 493 | return false; |
494 | /* | ||
495 | * Assertion for 32-bit architectures to make sure an 8-byte | ||
496 | * (BPF_DW) access to the last 4-byte member is disallowed. | ||
497 | */ | ||
498 | if (off + size > sizeof(struct pt_regs)) | ||
499 | return false; | ||
500 | |||
462 | return true; | 501 | return true; |
463 | } | 502 | } |
464 | 503 | ||
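
The point of the extra bound: on a 32-bit architecture sizeof(struct pt_regs) need not be a multiple of 8, so a BPF_DW load aimed at the last 4-byte member would read past the context. A tiny sketch of the arithmetic; the 68-byte size is illustrative:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Mirrors the new bound in kprobe_prog_is_valid_access();
     * ctx_size stands in for sizeof(struct pt_regs).
     */
    static bool in_bounds(size_t off, size_t size, size_t ctx_size)
    {
        return off + size <= ctx_size;
    }

    int main(void)
    {
        size_t ctx_size = 68;   /* hypothetical 32-bit pt_regs */

        printf("DW at 64: %s\n", in_bounds(64, 8, ctx_size) ? "ok" : "rejected");
        printf(" W at 64: %s\n", in_bounds(64, 4, ctx_size) ? "ok" : "rejected");
        return 0;
    }
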
@@ -492,8 +531,8 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = { | |||
492 | .arg1_type = ARG_PTR_TO_CTX, | 531 | .arg1_type = ARG_PTR_TO_CTX, |
493 | .arg2_type = ARG_CONST_MAP_PTR, | 532 | .arg2_type = ARG_CONST_MAP_PTR, |
494 | .arg3_type = ARG_ANYTHING, | 533 | .arg3_type = ARG_ANYTHING, |
495 | .arg4_type = ARG_PTR_TO_STACK, | 534 | .arg4_type = ARG_PTR_TO_MEM, |
496 | .arg5_type = ARG_CONST_STACK_SIZE, | 535 | .arg5_type = ARG_CONST_SIZE, |
497 | }; | 536 | }; |
498 | 537 | ||
499 | BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, | 538 | BPF_CALL_3(bpf_get_stackid_tp, void *, tp_buff, struct bpf_map *, map, |
@@ -540,6 +579,8 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type | |||
540 | return false; | 579 | return false; |
541 | if (off % size != 0) | 580 | if (off % size != 0) |
542 | return false; | 581 | return false; |
582 | |||
583 | BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(__u64)); | ||
543 | return true; | 584 | return true; |
544 | } | 585 | } |
545 | 586 | ||
@@ -572,28 +613,29 @@ static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type | |||
572 | return true; | 613 | return true; |
573 | } | 614 | } |
574 | 615 | ||
575 | static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, int dst_reg, | 616 | static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, |
576 | int src_reg, int ctx_off, | 617 | const struct bpf_insn *si, |
577 | struct bpf_insn *insn_buf, | 618 | struct bpf_insn *insn_buf, |
578 | struct bpf_prog *prog) | 619 | struct bpf_prog *prog) |
579 | { | 620 | { |
580 | struct bpf_insn *insn = insn_buf; | 621 | struct bpf_insn *insn = insn_buf; |
581 | 622 | ||
582 | switch (ctx_off) { | 623 | switch (si->off) { |
583 | case offsetof(struct bpf_perf_event_data, sample_period): | 624 | case offsetof(struct bpf_perf_event_data, sample_period): |
584 | BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); | 625 | BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64)); |
585 | 626 | ||
586 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, | 627 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, |
587 | data), dst_reg, src_reg, | 628 | data), si->dst_reg, si->src_reg, |
588 | offsetof(struct bpf_perf_event_data_kern, data)); | 629 | offsetof(struct bpf_perf_event_data_kern, data)); |
589 | *insn++ = BPF_LDX_MEM(BPF_DW, dst_reg, dst_reg, | 630 | *insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg, |
590 | offsetof(struct perf_sample_data, period)); | 631 | offsetof(struct perf_sample_data, period)); |
591 | break; | 632 | break; |
592 | default: | 633 | default: |
593 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, | 634 | *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern, |
594 | regs), dst_reg, src_reg, | 635 | regs), si->dst_reg, si->src_reg, |
595 | offsetof(struct bpf_perf_event_data_kern, regs)); | 636 | offsetof(struct bpf_perf_event_data_kern, regs)); |
596 | *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), dst_reg, dst_reg, ctx_off); | 637 | *insn++ = BPF_LDX_MEM(BPF_SIZEOF(long), si->dst_reg, si->dst_reg, |
638 | si->off); | ||
597 | break; | 639 | break; |
598 | } | 640 | } |
599 | 641 | ||
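
From the program's side nothing changes: a plain field access is what gets rewritten, now driven entirely by si->off, si->dst_reg and si->src_reg. A minimal perf_event program touching the remapped field; samples/bpf style with an assumed SEC() macro:

    #include <uapi/linux/bpf.h>
    #include <uapi/linux/bpf_perf_event.h>
    #include "bpf_helpers.h"    /* samples/bpf SEC() macro, assumed */

    SEC("perf_event")
    int on_sample(struct bpf_perf_event_data *ctx)
    {
        /* This single context load is expanded by the callback above
         * into two kernel loads: ctx->data, then data->period (BPF_DW).
         */
        __u64 period = ctx->sample_period;

        return period != 0;
    }

    char _license[] SEC("license") = "GPL";
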
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5d33a7352919..aea6a1218c7d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -162,15 +162,27 @@ trace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr, | |||
162 | } | 162 | } |
163 | EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); | 163 | EXPORT_SYMBOL_GPL(trace_print_bitmask_seq); |
164 | 164 | ||
165 | /** | ||
166 | * trace_print_hex_seq - print buffer as hex sequence | ||
167 | * @p: trace seq struct to write to | ||
168 | * @buf: The buffer to print | ||
169 | * @buf_len: Length of @buf in bytes | ||
170 | * @concatenate: Print @buf as a single hex string or with spacing | ||
171 | * | ||
172 | * Prints the passed buffer as a hex sequence, either as a single, | ||
173 | * contiguous hex string if @concatenate is true, or with a space | ||
174 | * after each byte if @concatenate is false. | ||
175 | */ | ||
165 | const char * | 176 | const char * |
166 | trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) | 177 | trace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len, |
178 | bool concatenate) | ||
167 | { | 179 | { |
168 | int i; | 180 | int i; |
169 | const char *ret = trace_seq_buffer_ptr(p); | 181 | const char *ret = trace_seq_buffer_ptr(p); |
170 | 182 | ||
171 | for (i = 0; i < buf_len; i++) | 183 | for (i = 0; i < buf_len; i++) |
172 | trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]); | 184 | trace_seq_printf(p, "%s%2.2x", concatenate || i == 0 ? "" : " ", |
173 | 185 | buf[i]); | |
174 | trace_seq_putc(p, 0); | 186 | trace_seq_putc(p, 0); |
175 | 187 | ||
176 | return ret; | 188 | return ret; |
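
A userspace mirror of the loop above makes the flag's effect concrete; only the trace_seq plumbing is swapped for stdout:

    #include <stdbool.h>
    #include <stdio.h>

    static void print_hex_seq(const unsigned char *buf, int buf_len,
                              bool concatenate)
    {
        int i;

        for (i = 0; i < buf_len; i++)
            printf("%s%2.2x", concatenate || i == 0 ? "" : " ", buf[i]);
        putchar('\n');
    }

    int main(void)
    {
        const unsigned char buf[] = { 0xde, 0xad, 0xbe, 0xef };

        print_hex_seq(buf, sizeof(buf), false);  /* "de ad be ef" */
        print_hex_seq(buf, sizeof(buf), true);   /* "deadbeef" */
        return 0;
    }
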