aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2018-01-16 22:42:14 -0500
committerDavid S. Miller <davem@davemloft.net>2018-01-16 22:42:14 -0500
commit7018d1b3f20fb4308ed9bc577160cb8ffb79b62a (patch)
treeb61a17c694d3cdc3490b190c35104b936bcc6638 /kernel
parente7e70fa6784b48a811fdd4253c41fc7195300570 (diff)
parente8a9d9683c8a62f917c19e57f1618363fb9ed04e (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Daniel Borkmann says: ==================== pull-request: bpf-next 2018-01-17 The following pull-request contains BPF updates for your *net-next* tree. The main changes are: 1) Add initial BPF map offloading for nfp driver. So far only programs were supported, without being able to access maps. Offloaded programs are right now only allowed to perform map lookups, and control path is responsible for populating the maps. BPF core infrastructure along with nfp implementation is provided, from Jakub. 2) Various follow-ups to Josef's BPF error injections. More specifically that includes: properly check whether the error injectable event is on function entry or not, remove the percpu bpf_kprobe_override and rather compare instruction pointer with original one, separate error-injection from kprobes since it's not limited to it, add injectable error types in order to specify what is the expected type of failure, and last but not least also support the kernel's fault injection framework, all from Masami. 3) Various misc improvements and cleanups to the libbpf Makefile. That is, fix permissions when installing BPF header files, remove unused variables and functions, and also install the libbpf.h header, from Jesper. 4) When offloading to nfp JIT and the BPF insn is unsupported in the JIT, then reject right at verification time. Also fix libbpf with regards to ELF section name matching by properly treating the program type as prefix. Both from Quentin. 5) Add -DPACKAGE to bpftool when including bfd.h for the disassembler. This is needed, for example, when building libbfd from source as bpftool doesn't supply a config.h for bfd.h. Fix from Jiong. 6) xdp_convert_ctx_access() is simplified since it doesn't need to set target size during verification, from Jesper. 7) Let bpftool properly recognize BPF_PROG_TYPE_CGROUP_DEVICE program types, from Roman. 8) Various functions in BPF cpumap were not declared static, from Wei. 9) Fix a double semicolon in BPF samples, from Luis. 
==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/bpf/cpumap.c31
-rw-r--r--kernel/bpf/devmap.c8
-rw-r--r--kernel/bpf/disasm.h4
-rw-r--r--kernel/bpf/hashtab.c103
-rw-r--r--kernel/bpf/lpm_trie.c7
-rw-r--r--kernel/bpf/offload.c222
-rw-r--r--kernel/bpf/sockmap.c8
-rw-r--r--kernel/bpf/stackmap.c6
-rw-r--r--kernel/bpf/syscall.c71
-rw-r--r--kernel/bpf/verifier.c7
-rw-r--r--kernel/fail_function.c349
-rw-r--r--kernel/kprobes.c163
-rw-r--r--kernel/module.c8
-rw-r--r--kernel/trace/Kconfig4
-rw-r--r--kernel/trace/bpf_trace.c11
-rw-r--r--kernel/trace/trace_kprobe.c33
-rw-r--r--kernel/trace/trace_probe.h12
18 files changed, 730 insertions, 318 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 172d151d429c..f85ae5dfa474 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
81obj-$(CONFIG_GCOV_KERNEL) += gcov/ 81obj-$(CONFIG_GCOV_KERNEL) += gcov/
82obj-$(CONFIG_KCOV) += kcov.o 82obj-$(CONFIG_KCOV) += kcov.o
83obj-$(CONFIG_KPROBES) += kprobes.o 83obj-$(CONFIG_KPROBES) += kprobes.o
84obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o
84obj-$(CONFIG_KGDB) += debug/ 85obj-$(CONFIG_KGDB) += debug/
85obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 86obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
86obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o 87obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ce5b669003b2..fbfdada6caee 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
94 if (!cmap) 94 if (!cmap)
95 return ERR_PTR(-ENOMEM); 95 return ERR_PTR(-ENOMEM);
96 96
97 /* mandatory map attributes */ 97 bpf_map_init_from_attr(&cmap->map, attr);
98 cmap->map.map_type = attr->map_type;
99 cmap->map.key_size = attr->key_size;
100 cmap->map.value_size = attr->value_size;
101 cmap->map.max_entries = attr->max_entries;
102 cmap->map.map_flags = attr->map_flags;
103 cmap->map.numa_node = bpf_map_attr_numa_node(attr);
104 98
105 /* Pre-limit array size based on NR_CPUS, not final CPU check */ 99 /* Pre-limit array size based on NR_CPUS, not final CPU check */
106 if (cmap->map.max_entries > NR_CPUS) { 100 if (cmap->map.max_entries > NR_CPUS) {
@@ -143,7 +137,7 @@ free_cmap:
143 return ERR_PTR(err); 137 return ERR_PTR(err);
144} 138}
145 139
146void __cpu_map_queue_destructor(void *ptr) 140static void __cpu_map_queue_destructor(void *ptr)
147{ 141{
148 /* The tear-down procedure should have made sure that queue is 142 /* The tear-down procedure should have made sure that queue is
149 * empty. See __cpu_map_entry_replace() and work-queue 143 * empty. See __cpu_map_entry_replace() and work-queue
@@ -222,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
222 return xdp_pkt; 216 return xdp_pkt;
223} 217}
224 218
225struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, 219static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
226 struct xdp_pkt *xdp_pkt) 220 struct xdp_pkt *xdp_pkt)
227{ 221{
228 unsigned int frame_size; 222 unsigned int frame_size;
229 void *pkt_data_start; 223 void *pkt_data_start;
@@ -337,7 +331,8 @@ static int cpu_map_kthread_run(void *data)
337 return 0; 331 return 0;
338} 332}
339 333
340struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) 334static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu,
335 int map_id)
341{ 336{
342 gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; 337 gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN;
343 struct bpf_cpu_map_entry *rcpu; 338 struct bpf_cpu_map_entry *rcpu;
@@ -395,7 +390,7 @@ free_rcu:
395 return NULL; 390 return NULL;
396} 391}
397 392
398void __cpu_map_entry_free(struct rcu_head *rcu) 393static void __cpu_map_entry_free(struct rcu_head *rcu)
399{ 394{
400 struct bpf_cpu_map_entry *rcpu; 395 struct bpf_cpu_map_entry *rcpu;
401 int cpu; 396 int cpu;
@@ -438,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu)
438 * cpu_map_kthread_stop, which waits for an RCU grace period before 433 * cpu_map_kthread_stop, which waits for an RCU grace period before
439 * stopping kthread, emptying the queue. 434 * stopping kthread, emptying the queue.
440 */ 435 */
441void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, 436static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
442 u32 key_cpu, struct bpf_cpu_map_entry *rcpu) 437 u32 key_cpu, struct bpf_cpu_map_entry *rcpu)
443{ 438{
444 struct bpf_cpu_map_entry *old_rcpu; 439 struct bpf_cpu_map_entry *old_rcpu;
445 440
@@ -451,7 +446,7 @@ void __cpu_map_entry_replace(struct bpf_cpu_map *cmap,
451 } 446 }
452} 447}
453 448
454int cpu_map_delete_elem(struct bpf_map *map, void *key) 449static int cpu_map_delete_elem(struct bpf_map *map, void *key)
455{ 450{
456 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); 451 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
457 u32 key_cpu = *(u32 *)key; 452 u32 key_cpu = *(u32 *)key;
@@ -464,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key)
464 return 0; 459 return 0;
465} 460}
466 461
467int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, 462static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
468 u64 map_flags) 463 u64 map_flags)
469{ 464{
470 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); 465 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
471 struct bpf_cpu_map_entry *rcpu; 466 struct bpf_cpu_map_entry *rcpu;
@@ -502,7 +497,7 @@ int cpu_map_update_elem(struct bpf_map *map, void *key, void *value,
502 return 0; 497 return 0;
503} 498}
504 499
505void cpu_map_free(struct bpf_map *map) 500static void cpu_map_free(struct bpf_map *map)
506{ 501{
507 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); 502 struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map);
508 int cpu; 503 int cpu;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index ebdef54bf7df..565f9ece9115 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
93 if (!dtab) 93 if (!dtab)
94 return ERR_PTR(-ENOMEM); 94 return ERR_PTR(-ENOMEM);
95 95
96 /* mandatory map attributes */ 96 bpf_map_init_from_attr(&dtab->map, attr);
97 dtab->map.map_type = attr->map_type;
98 dtab->map.key_size = attr->key_size;
99 dtab->map.value_size = attr->value_size;
100 dtab->map.max_entries = attr->max_entries;
101 dtab->map.map_flags = attr->map_flags;
102 dtab->map.numa_node = bpf_map_attr_numa_node(attr);
103 97
104 /* make sure page count doesn't overflow */ 98 /* make sure page count doesn't overflow */
105 cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); 99 cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h
index e0857d016f89..266fe8ee542b 100644
--- a/kernel/bpf/disasm.h
+++ b/kernel/bpf/disasm.h
@@ -29,8 +29,8 @@ extern const char *const bpf_class_string[8];
29 29
30const char *func_id_name(int id); 30const char *func_id_name(int id);
31 31
32typedef void (*bpf_insn_print_t)(struct bpf_verifier_env *env, 32typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env,
33 const char *, ...); 33 const char *, ...);
34typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, 34typedef const char *(*bpf_insn_revmap_call_t)(void *private_data,
35 const struct bpf_insn *insn); 35 const struct bpf_insn *insn);
36typedef const char *(*bpf_insn_print_imm_t)(void *private_data, 36typedef const char *(*bpf_insn_print_imm_t)(void *private_data,
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3905d4bc5b80..b76828f23b49 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -227,7 +227,7 @@ static int alloc_extra_elems(struct bpf_htab *htab)
227} 227}
228 228
229/* Called from syscall */ 229/* Called from syscall */
230static struct bpf_map *htab_map_alloc(union bpf_attr *attr) 230static int htab_map_alloc_check(union bpf_attr *attr)
231{ 231{
232 bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || 232 bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
233 attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); 233 attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
@@ -241,9 +241,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
241 bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); 241 bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
242 bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); 242 bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
243 int numa_node = bpf_map_attr_numa_node(attr); 243 int numa_node = bpf_map_attr_numa_node(attr);
244 struct bpf_htab *htab;
245 int err, i;
246 u64 cost;
247 244
248 BUILD_BUG_ON(offsetof(struct htab_elem, htab) != 245 BUILD_BUG_ON(offsetof(struct htab_elem, htab) !=
249 offsetof(struct htab_elem, hash_node.pprev)); 246 offsetof(struct htab_elem, hash_node.pprev));
@@ -254,40 +251,68 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
254 /* LRU implementation is much more complicated than other 251 /* LRU implementation is much more complicated than other
255 * maps. Hence, limit to CAP_SYS_ADMIN for now. 252 * maps. Hence, limit to CAP_SYS_ADMIN for now.
256 */ 253 */
257 return ERR_PTR(-EPERM); 254 return -EPERM;
258 255
259 if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) 256 if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK)
260 /* reserved bits should not be used */ 257 /* reserved bits should not be used */
261 return ERR_PTR(-EINVAL); 258 return -EINVAL;
262 259
263 if (!lru && percpu_lru) 260 if (!lru && percpu_lru)
264 return ERR_PTR(-EINVAL); 261 return -EINVAL;
265 262
266 if (lru && !prealloc) 263 if (lru && !prealloc)
267 return ERR_PTR(-ENOTSUPP); 264 return -ENOTSUPP;
268 265
269 if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) 266 if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru))
270 return ERR_PTR(-EINVAL); 267 return -EINVAL;
268
269 /* check sanity of attributes.
270 * value_size == 0 may be allowed in the future to use map as a set
271 */
272 if (attr->max_entries == 0 || attr->key_size == 0 ||
273 attr->value_size == 0)
274 return -EINVAL;
275
276 if (attr->key_size > MAX_BPF_STACK)
277 /* eBPF programs initialize keys on stack, so they cannot be
278 * larger than max stack size
279 */
280 return -E2BIG;
281
282 if (attr->value_size >= KMALLOC_MAX_SIZE -
283 MAX_BPF_STACK - sizeof(struct htab_elem))
284 /* if value_size is bigger, the user space won't be able to
285 * access the elements via bpf syscall. This check also makes
286 * sure that the elem_size doesn't overflow and it's
287 * kmalloc-able later in htab_map_update_elem()
288 */
289 return -E2BIG;
290
291 return 0;
292}
293
294static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
295{
296 bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
297 attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
298 bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
299 attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
300 /* percpu_lru means each cpu has its own LRU list.
301 * it is different from BPF_MAP_TYPE_PERCPU_HASH where
302 * the map's value itself is percpu. percpu_lru has
303 * nothing to do with the map's value.
304 */
305 bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
306 bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
307 struct bpf_htab *htab;
308 int err, i;
309 u64 cost;
271 310
272 htab = kzalloc(sizeof(*htab), GFP_USER); 311 htab = kzalloc(sizeof(*htab), GFP_USER);
273 if (!htab) 312 if (!htab)
274 return ERR_PTR(-ENOMEM); 313 return ERR_PTR(-ENOMEM);
275 314
276 /* mandatory map attributes */ 315 bpf_map_init_from_attr(&htab->map, attr);
277 htab->map.map_type = attr->map_type;
278 htab->map.key_size = attr->key_size;
279 htab->map.value_size = attr->value_size;
280 htab->map.max_entries = attr->max_entries;
281 htab->map.map_flags = attr->map_flags;
282 htab->map.numa_node = numa_node;
283
284 /* check sanity of attributes.
285 * value_size == 0 may be allowed in the future to use map as a set
286 */
287 err = -EINVAL;
288 if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
289 htab->map.value_size == 0)
290 goto free_htab;
291 316
292 if (percpu_lru) { 317 if (percpu_lru) {
293 /* ensure each CPU's lru list has >=1 elements. 318 /* ensure each CPU's lru list has >=1 elements.
@@ -304,22 +329,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
304 /* hash table size must be power of 2 */ 329 /* hash table size must be power of 2 */
305 htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); 330 htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
306 331
307 err = -E2BIG;
308 if (htab->map.key_size > MAX_BPF_STACK)
309 /* eBPF programs initialize keys on stack, so they cannot be
310 * larger than max stack size
311 */
312 goto free_htab;
313
314 if (htab->map.value_size >= KMALLOC_MAX_SIZE -
315 MAX_BPF_STACK - sizeof(struct htab_elem))
316 /* if value_size is bigger, the user space won't be able to
317 * access the elements via bpf syscall. This check also makes
318 * sure that the elem_size doesn't overflow and it's
319 * kmalloc-able later in htab_map_update_elem()
320 */
321 goto free_htab;
322
323 htab->elem_size = sizeof(struct htab_elem) + 332 htab->elem_size = sizeof(struct htab_elem) +
324 round_up(htab->map.key_size, 8); 333 round_up(htab->map.key_size, 8);
325 if (percpu) 334 if (percpu)
@@ -327,6 +336,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
327 else 336 else
328 htab->elem_size += round_up(htab->map.value_size, 8); 337 htab->elem_size += round_up(htab->map.value_size, 8);
329 338
339 err = -E2BIG;
330 /* prevent zero size kmalloc and check for u32 overflow */ 340 /* prevent zero size kmalloc and check for u32 overflow */
331 if (htab->n_buckets == 0 || 341 if (htab->n_buckets == 0 ||
332 htab->n_buckets > U32_MAX / sizeof(struct bucket)) 342 htab->n_buckets > U32_MAX / sizeof(struct bucket))
@@ -1143,6 +1153,7 @@ static void htab_map_free(struct bpf_map *map)
1143} 1153}
1144 1154
1145const struct bpf_map_ops htab_map_ops = { 1155const struct bpf_map_ops htab_map_ops = {
1156 .map_alloc_check = htab_map_alloc_check,
1146 .map_alloc = htab_map_alloc, 1157 .map_alloc = htab_map_alloc,
1147 .map_free = htab_map_free, 1158 .map_free = htab_map_free,
1148 .map_get_next_key = htab_map_get_next_key, 1159 .map_get_next_key = htab_map_get_next_key,
@@ -1153,6 +1164,7 @@ const struct bpf_map_ops htab_map_ops = {
1153}; 1164};
1154 1165
1155const struct bpf_map_ops htab_lru_map_ops = { 1166const struct bpf_map_ops htab_lru_map_ops = {
1167 .map_alloc_check = htab_map_alloc_check,
1156 .map_alloc = htab_map_alloc, 1168 .map_alloc = htab_map_alloc,
1157 .map_free = htab_map_free, 1169 .map_free = htab_map_free,
1158 .map_get_next_key = htab_map_get_next_key, 1170 .map_get_next_key = htab_map_get_next_key,
@@ -1236,6 +1248,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
1236} 1248}
1237 1249
1238const struct bpf_map_ops htab_percpu_map_ops = { 1250const struct bpf_map_ops htab_percpu_map_ops = {
1251 .map_alloc_check = htab_map_alloc_check,
1239 .map_alloc = htab_map_alloc, 1252 .map_alloc = htab_map_alloc,
1240 .map_free = htab_map_free, 1253 .map_free = htab_map_free,
1241 .map_get_next_key = htab_map_get_next_key, 1254 .map_get_next_key = htab_map_get_next_key,
@@ -1245,6 +1258,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
1245}; 1258};
1246 1259
1247const struct bpf_map_ops htab_lru_percpu_map_ops = { 1260const struct bpf_map_ops htab_lru_percpu_map_ops = {
1261 .map_alloc_check = htab_map_alloc_check,
1248 .map_alloc = htab_map_alloc, 1262 .map_alloc = htab_map_alloc,
1249 .map_free = htab_map_free, 1263 .map_free = htab_map_free,
1250 .map_get_next_key = htab_map_get_next_key, 1264 .map_get_next_key = htab_map_get_next_key,
@@ -1253,11 +1267,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
1253 .map_delete_elem = htab_lru_map_delete_elem, 1267 .map_delete_elem = htab_lru_map_delete_elem,
1254}; 1268};
1255 1269
1256static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) 1270static int fd_htab_map_alloc_check(union bpf_attr *attr)
1257{ 1271{
1258 if (attr->value_size != sizeof(u32)) 1272 if (attr->value_size != sizeof(u32))
1259 return ERR_PTR(-EINVAL); 1273 return -EINVAL;
1260 return htab_map_alloc(attr); 1274 return htab_map_alloc_check(attr);
1261} 1275}
1262 1276
1263static void fd_htab_map_free(struct bpf_map *map) 1277static void fd_htab_map_free(struct bpf_map *map)
@@ -1328,7 +1342,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr)
1328 if (IS_ERR(inner_map_meta)) 1342 if (IS_ERR(inner_map_meta))
1329 return inner_map_meta; 1343 return inner_map_meta;
1330 1344
1331 map = fd_htab_map_alloc(attr); 1345 map = htab_map_alloc(attr);
1332 if (IS_ERR(map)) { 1346 if (IS_ERR(map)) {
1333 bpf_map_meta_free(inner_map_meta); 1347 bpf_map_meta_free(inner_map_meta);
1334 return map; 1348 return map;
@@ -1372,6 +1386,7 @@ static void htab_of_map_free(struct bpf_map *map)
1372} 1386}
1373 1387
1374const struct bpf_map_ops htab_of_maps_map_ops = { 1388const struct bpf_map_ops htab_of_maps_map_ops = {
1389 .map_alloc_check = fd_htab_map_alloc_check,
1375 .map_alloc = htab_of_map_alloc, 1390 .map_alloc = htab_of_map_alloc,
1376 .map_free = htab_of_map_free, 1391 .map_free = htab_of_map_free,
1377 .map_get_next_key = htab_map_get_next_key, 1392 .map_get_next_key = htab_map_get_next_key,
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 885e45479680..584e02227671 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
522 return ERR_PTR(-ENOMEM); 522 return ERR_PTR(-ENOMEM);
523 523
524 /* copy mandatory map attributes */ 524 /* copy mandatory map attributes */
525 trie->map.map_type = attr->map_type; 525 bpf_map_init_from_attr(&trie->map, attr);
526 trie->map.key_size = attr->key_size;
527 trie->map.value_size = attr->value_size;
528 trie->map.max_entries = attr->max_entries;
529 trie->map.map_flags = attr->map_flags;
530 trie->map.numa_node = bpf_map_attr_numa_node(attr);
531 trie->data_size = attr->key_size - 526 trie->data_size = attr->key_size -
532 offsetof(struct bpf_lpm_trie_key, data); 527 offsetof(struct bpf_lpm_trie_key, data);
533 trie->max_prefixlen = trie->data_size * 8; 528 trie->max_prefixlen = trie->data_size * 8;
diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c
index 040d4e0edf3f..a88cebf368bf 100644
--- a/kernel/bpf/offload.c
+++ b/kernel/bpf/offload.c
@@ -24,15 +24,27 @@
24#include <linux/rtnetlink.h> 24#include <linux/rtnetlink.h>
25#include <linux/rwsem.h> 25#include <linux/rwsem.h>
26 26
27/* Protects bpf_prog_offload_devs and offload members of all progs. 27/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members
28 * of all progs.
28 * RTNL lock cannot be taken when holding this lock. 29 * RTNL lock cannot be taken when holding this lock.
29 */ 30 */
30static DECLARE_RWSEM(bpf_devs_lock); 31static DECLARE_RWSEM(bpf_devs_lock);
31static LIST_HEAD(bpf_prog_offload_devs); 32static LIST_HEAD(bpf_prog_offload_devs);
33static LIST_HEAD(bpf_map_offload_devs);
34
35static int bpf_dev_offload_check(struct net_device *netdev)
36{
37 if (!netdev)
38 return -EINVAL;
39 if (!netdev->netdev_ops->ndo_bpf)
40 return -EOPNOTSUPP;
41 return 0;
42}
32 43
33int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) 44int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
34{ 45{
35 struct bpf_dev_offload *offload; 46 struct bpf_prog_offload *offload;
47 int err;
36 48
37 if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && 49 if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS &&
38 attr->prog_type != BPF_PROG_TYPE_XDP) 50 attr->prog_type != BPF_PROG_TYPE_XDP)
@@ -49,12 +61,15 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
49 61
50 offload->netdev = dev_get_by_index(current->nsproxy->net_ns, 62 offload->netdev = dev_get_by_index(current->nsproxy->net_ns,
51 attr->prog_ifindex); 63 attr->prog_ifindex);
52 if (!offload->netdev) 64 err = bpf_dev_offload_check(offload->netdev);
53 goto err_free; 65 if (err)
66 goto err_maybe_put;
54 67
55 down_write(&bpf_devs_lock); 68 down_write(&bpf_devs_lock);
56 if (offload->netdev->reg_state != NETREG_REGISTERED) 69 if (offload->netdev->reg_state != NETREG_REGISTERED) {
70 err = -EINVAL;
57 goto err_unlock; 71 goto err_unlock;
72 }
58 prog->aux->offload = offload; 73 prog->aux->offload = offload;
59 list_add_tail(&offload->offloads, &bpf_prog_offload_devs); 74 list_add_tail(&offload->offloads, &bpf_prog_offload_devs);
60 dev_put(offload->netdev); 75 dev_put(offload->netdev);
@@ -63,16 +78,17 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr)
63 return 0; 78 return 0;
64err_unlock: 79err_unlock:
65 up_write(&bpf_devs_lock); 80 up_write(&bpf_devs_lock);
66 dev_put(offload->netdev); 81err_maybe_put:
67err_free: 82 if (offload->netdev)
83 dev_put(offload->netdev);
68 kfree(offload); 84 kfree(offload);
69 return -EINVAL; 85 return err;
70} 86}
71 87
72static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, 88static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
73 struct netdev_bpf *data) 89 struct netdev_bpf *data)
74{ 90{
75 struct bpf_dev_offload *offload = prog->aux->offload; 91 struct bpf_prog_offload *offload = prog->aux->offload;
76 struct net_device *netdev; 92 struct net_device *netdev;
77 93
78 ASSERT_RTNL(); 94 ASSERT_RTNL();
@@ -80,8 +96,6 @@ static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd,
80 if (!offload) 96 if (!offload)
81 return -ENODEV; 97 return -ENODEV;
82 netdev = offload->netdev; 98 netdev = offload->netdev;
83 if (!netdev->netdev_ops->ndo_bpf)
84 return -EOPNOTSUPP;
85 99
86 data->command = cmd; 100 data->command = cmd;
87 101
@@ -110,7 +124,7 @@ exit_unlock:
110int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, 124int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
111 int insn_idx, int prev_insn_idx) 125 int insn_idx, int prev_insn_idx)
112{ 126{
113 struct bpf_dev_offload *offload; 127 struct bpf_prog_offload *offload;
114 int ret = -ENODEV; 128 int ret = -ENODEV;
115 129
116 down_read(&bpf_devs_lock); 130 down_read(&bpf_devs_lock);
@@ -124,7 +138,7 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
124 138
125static void __bpf_prog_offload_destroy(struct bpf_prog *prog) 139static void __bpf_prog_offload_destroy(struct bpf_prog *prog)
126{ 140{
127 struct bpf_dev_offload *offload = prog->aux->offload; 141 struct bpf_prog_offload *offload = prog->aux->offload;
128 struct netdev_bpf data = {}; 142 struct netdev_bpf data = {};
129 143
130 data.offload.prog = prog; 144 data.offload.prog = prog;
@@ -238,11 +252,184 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
238const struct bpf_prog_ops bpf_offload_prog_ops = { 252const struct bpf_prog_ops bpf_offload_prog_ops = {
239}; 253};
240 254
255static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap,
256 enum bpf_netdev_command cmd)
257{
258 struct netdev_bpf data = {};
259 struct net_device *netdev;
260
261 ASSERT_RTNL();
262
263 data.command = cmd;
264 data.offmap = offmap;
265 /* Caller must make sure netdev is valid */
266 netdev = offmap->netdev;
267
268 return netdev->netdev_ops->ndo_bpf(netdev, &data);
269}
270
271struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr)
272{
273 struct net *net = current->nsproxy->net_ns;
274 struct bpf_offloaded_map *offmap;
275 int err;
276
277 if (!capable(CAP_SYS_ADMIN))
278 return ERR_PTR(-EPERM);
279 if (attr->map_type != BPF_MAP_TYPE_HASH)
280 return ERR_PTR(-EINVAL);
281
282 offmap = kzalloc(sizeof(*offmap), GFP_USER);
283 if (!offmap)
284 return ERR_PTR(-ENOMEM);
285
286 bpf_map_init_from_attr(&offmap->map, attr);
287
288 rtnl_lock();
289 down_write(&bpf_devs_lock);
290 offmap->netdev = __dev_get_by_index(net, attr->map_ifindex);
291 err = bpf_dev_offload_check(offmap->netdev);
292 if (err)
293 goto err_unlock;
294
295 err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC);
296 if (err)
297 goto err_unlock;
298
299 list_add_tail(&offmap->offloads, &bpf_map_offload_devs);
300 up_write(&bpf_devs_lock);
301 rtnl_unlock();
302
303 return &offmap->map;
304
305err_unlock:
306 up_write(&bpf_devs_lock);
307 rtnl_unlock();
308 kfree(offmap);
309 return ERR_PTR(err);
310}
311
312static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap)
313{
314 WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE));
315 /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */
316 bpf_map_free_id(&offmap->map, true);
317 list_del_init(&offmap->offloads);
318 offmap->netdev = NULL;
319}
320
321void bpf_map_offload_map_free(struct bpf_map *map)
322{
323 struct bpf_offloaded_map *offmap = map_to_offmap(map);
324
325 rtnl_lock();
326 down_write(&bpf_devs_lock);
327 if (offmap->netdev)
328 __bpf_map_offload_destroy(offmap);
329 up_write(&bpf_devs_lock);
330 rtnl_unlock();
331
332 kfree(offmap);
333}
334
335int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value)
336{
337 struct bpf_offloaded_map *offmap = map_to_offmap(map);
338 int ret = -ENODEV;
339
340 down_read(&bpf_devs_lock);
341 if (offmap->netdev)
342 ret = offmap->dev_ops->map_lookup_elem(offmap, key, value);
343 up_read(&bpf_devs_lock);
344
345 return ret;
346}
347
348int bpf_map_offload_update_elem(struct bpf_map *map,
349 void *key, void *value, u64 flags)
350{
351 struct bpf_offloaded_map *offmap = map_to_offmap(map);
352 int ret = -ENODEV;
353
354 if (unlikely(flags > BPF_EXIST))
355 return -EINVAL;
356
357 down_read(&bpf_devs_lock);
358 if (offmap->netdev)
359 ret = offmap->dev_ops->map_update_elem(offmap, key, value,
360 flags);
361 up_read(&bpf_devs_lock);
362
363 return ret;
364}
365
366int bpf_map_offload_delete_elem(struct bpf_map *map, void *key)
367{
368 struct bpf_offloaded_map *offmap = map_to_offmap(map);
369 int ret = -ENODEV;
370
371 down_read(&bpf_devs_lock);
372 if (offmap->netdev)
373 ret = offmap->dev_ops->map_delete_elem(offmap, key);
374 up_read(&bpf_devs_lock);
375
376 return ret;
377}
378
379int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key)
380{
381 struct bpf_offloaded_map *offmap = map_to_offmap(map);
382 int ret = -ENODEV;
383
384 down_read(&bpf_devs_lock);
385 if (offmap->netdev)
386 ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key);
387 up_read(&bpf_devs_lock);
388
389 return ret;
390}
391
392bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map)
393{
394 struct bpf_offloaded_map *offmap;
395 struct bpf_prog_offload *offload;
396 bool ret;
397
398 if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map))
399 return false;
400
401 down_read(&bpf_devs_lock);
402 offload = prog->aux->offload;
403 offmap = map_to_offmap(map);
404
405 ret = offload && offload->netdev == offmap->netdev;
406 up_read(&bpf_devs_lock);
407
408 return ret;
409}
410
411static void bpf_offload_orphan_all_progs(struct net_device *netdev)
412{
413 struct bpf_prog_offload *offload, *tmp;
414
415 list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads)
416 if (offload->netdev == netdev)
417 __bpf_prog_offload_destroy(offload->prog);
418}
419
420static void bpf_offload_orphan_all_maps(struct net_device *netdev)
421{
422 struct bpf_offloaded_map *offmap, *tmp;
423
424 list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads)
425 if (offmap->netdev == netdev)
426 __bpf_map_offload_destroy(offmap);
427}
428
241static int bpf_offload_notification(struct notifier_block *notifier, 429static int bpf_offload_notification(struct notifier_block *notifier,
242 ulong event, void *ptr) 430 ulong event, void *ptr)
243{ 431{
244 struct net_device *netdev = netdev_notifier_info_to_dev(ptr); 432 struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
245 struct bpf_dev_offload *offload, *tmp;
246 433
247 ASSERT_RTNL(); 434 ASSERT_RTNL();
248 435
@@ -253,11 +440,8 @@ static int bpf_offload_notification(struct notifier_block *notifier,
253 break; 440 break;
254 441
255 down_write(&bpf_devs_lock); 442 down_write(&bpf_devs_lock);
256 list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, 443 bpf_offload_orphan_all_progs(netdev);
257 offloads) { 444 bpf_offload_orphan_all_maps(netdev);
258 if (offload->netdev == netdev)
259 __bpf_prog_offload_destroy(offload->prog);
260 }
261 up_write(&bpf_devs_lock); 445 up_write(&bpf_devs_lock);
262 break; 446 break;
263 default: 447 default:
diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c
index 079968680bc3..0314d1783d77 100644
--- a/kernel/bpf/sockmap.c
+++ b/kernel/bpf/sockmap.c
@@ -513,13 +513,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
513 if (!stab) 513 if (!stab)
514 return ERR_PTR(-ENOMEM); 514 return ERR_PTR(-ENOMEM);
515 515
516 /* mandatory map attributes */ 516 bpf_map_init_from_attr(&stab->map, attr);
517 stab->map.map_type = attr->map_type;
518 stab->map.key_size = attr->key_size;
519 stab->map.value_size = attr->value_size;
520 stab->map.max_entries = attr->max_entries;
521 stab->map.map_flags = attr->map_flags;
522 stab->map.numa_node = bpf_map_attr_numa_node(attr);
523 517
524 /* make sure page count doesn't overflow */ 518 /* make sure page count doesn't overflow */
525 cost = (u64) stab->map.max_entries * sizeof(struct sock *); 519 cost = (u64) stab->map.max_entries * sizeof(struct sock *);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 6c63c2222ea8..b0ecf43f5894 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -88,14 +88,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
88 if (cost >= U32_MAX - PAGE_SIZE) 88 if (cost >= U32_MAX - PAGE_SIZE)
89 goto free_smap; 89 goto free_smap;
90 90
91 smap->map.map_type = attr->map_type; 91 bpf_map_init_from_attr(&smap->map, attr);
92 smap->map.key_size = attr->key_size;
93 smap->map.value_size = value_size; 92 smap->map.value_size = value_size;
94 smap->map.max_entries = attr->max_entries;
95 smap->map.map_flags = attr->map_flags;
96 smap->n_buckets = n_buckets; 93 smap->n_buckets = n_buckets;
97 smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; 94 smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
98 smap->map.numa_node = bpf_map_attr_numa_node(attr);
99 95
100 err = bpf_map_precharge_memlock(smap->map.pages); 96 err = bpf_map_precharge_memlock(smap->map.pages);
101 if (err) 97 if (err)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2bac0dc8baba..c691b9e972e3 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -94,18 +94,34 @@ static int check_uarg_tail_zero(void __user *uaddr,
94 return 0; 94 return 0;
95} 95}
96 96
97const struct bpf_map_ops bpf_map_offload_ops = {
98 .map_alloc = bpf_map_offload_map_alloc,
99 .map_free = bpf_map_offload_map_free,
100};
101
97static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) 102static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
98{ 103{
104 const struct bpf_map_ops *ops;
99 struct bpf_map *map; 105 struct bpf_map *map;
106 int err;
100 107
101 if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || 108 if (attr->map_type >= ARRAY_SIZE(bpf_map_types))
102 !bpf_map_types[attr->map_type]) 109 return ERR_PTR(-EINVAL);
110 ops = bpf_map_types[attr->map_type];
111 if (!ops)
103 return ERR_PTR(-EINVAL); 112 return ERR_PTR(-EINVAL);
104 113
105 map = bpf_map_types[attr->map_type]->map_alloc(attr); 114 if (ops->map_alloc_check) {
115 err = ops->map_alloc_check(attr);
116 if (err)
117 return ERR_PTR(err);
118 }
119 if (attr->map_ifindex)
120 ops = &bpf_map_offload_ops;
121 map = ops->map_alloc(attr);
106 if (IS_ERR(map)) 122 if (IS_ERR(map))
107 return map; 123 return map;
108 map->ops = bpf_map_types[attr->map_type]; 124 map->ops = ops;
109 map->map_type = attr->map_type; 125 map->map_type = attr->map_type;
110 return map; 126 return map;
111} 127}
@@ -134,6 +150,16 @@ void bpf_map_area_free(void *area)
134 kvfree(area); 150 kvfree(area);
135} 151}
136 152
153void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
154{
155 map->map_type = attr->map_type;
156 map->key_size = attr->key_size;
157 map->value_size = attr->value_size;
158 map->max_entries = attr->max_entries;
159 map->map_flags = attr->map_flags;
160 map->numa_node = bpf_map_attr_numa_node(attr);
161}
162
137int bpf_map_precharge_memlock(u32 pages) 163int bpf_map_precharge_memlock(u32 pages)
138{ 164{
139 struct user_struct *user = get_current_user(); 165 struct user_struct *user = get_current_user();
@@ -189,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map)
189 return id > 0 ? 0 : id; 215 return id > 0 ? 0 : id;
190} 216}
191 217
192static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) 218void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
193{ 219{
194 unsigned long flags; 220 unsigned long flags;
195 221
222 /* Offloaded maps are removed from the IDR store when their device
223 * disappears - even if someone holds an fd to them they are unusable,
224 * the memory is gone, all ops will fail; they are simply waiting for
225 * refcnt to drop to be freed.
226 */
227 if (!map->id)
228 return;
229
196 if (do_idr_lock) 230 if (do_idr_lock)
197 spin_lock_irqsave(&map_idr_lock, flags); 231 spin_lock_irqsave(&map_idr_lock, flags);
198 else 232 else
199 __acquire(&map_idr_lock); 233 __acquire(&map_idr_lock);
200 234
201 idr_remove(&map_idr, map->id); 235 idr_remove(&map_idr, map->id);
236 map->id = 0;
202 237
203 if (do_idr_lock) 238 if (do_idr_lock)
204 spin_unlock_irqrestore(&map_idr_lock, flags); 239 spin_unlock_irqrestore(&map_idr_lock, flags);
@@ -378,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src)
378 return 0; 413 return 0;
379} 414}
380 415
381#define BPF_MAP_CREATE_LAST_FIELD map_name 416#define BPF_MAP_CREATE_LAST_FIELD map_ifindex
382/* called via syscall */ 417/* called via syscall */
383static int map_create(union bpf_attr *attr) 418static int map_create(union bpf_attr *attr)
384{ 419{
@@ -566,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr)
566 if (!value) 601 if (!value)
567 goto free_key; 602 goto free_key;
568 603
569 if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || 604 if (bpf_map_is_dev_bound(map)) {
570 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { 605 err = bpf_map_offload_lookup_elem(map, key, value);
606 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
607 map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
571 err = bpf_percpu_hash_copy(map, key, value); 608 err = bpf_percpu_hash_copy(map, key, value);
572 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { 609 } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
573 err = bpf_percpu_array_copy(map, key, value); 610 err = bpf_percpu_array_copy(map, key, value);
@@ -654,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr)
654 goto free_value; 691 goto free_value;
655 692
656 /* Need to create a kthread, thus must support schedule */ 693 /* Need to create a kthread, thus must support schedule */
657 if (map->map_type == BPF_MAP_TYPE_CPUMAP) { 694 if (bpf_map_is_dev_bound(map)) {
695 err = bpf_map_offload_update_elem(map, key, value, attr->flags);
696 goto out;
697 } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) {
658 err = map->ops->map_update_elem(map, key, value, attr->flags); 698 err = map->ops->map_update_elem(map, key, value, attr->flags);
659 goto out; 699 goto out;
660 } 700 }
@@ -731,6 +771,11 @@ static int map_delete_elem(union bpf_attr *attr)
731 goto err_put; 771 goto err_put;
732 } 772 }
733 773
774 if (bpf_map_is_dev_bound(map)) {
775 err = bpf_map_offload_delete_elem(map, key);
776 goto out;
777 }
778
734 preempt_disable(); 779 preempt_disable();
735 __this_cpu_inc(bpf_prog_active); 780 __this_cpu_inc(bpf_prog_active);
736 rcu_read_lock(); 781 rcu_read_lock();
@@ -738,7 +783,7 @@ static int map_delete_elem(union bpf_attr *attr)
738 rcu_read_unlock(); 783 rcu_read_unlock();
739 __this_cpu_dec(bpf_prog_active); 784 __this_cpu_dec(bpf_prog_active);
740 preempt_enable(); 785 preempt_enable();
741 786out:
742 if (!err) 787 if (!err)
743 trace_bpf_map_delete_elem(map, ufd, key); 788 trace_bpf_map_delete_elem(map, ufd, key);
744 kfree(key); 789 kfree(key);
@@ -788,9 +833,15 @@ static int map_get_next_key(union bpf_attr *attr)
788 if (!next_key) 833 if (!next_key)
789 goto free_key; 834 goto free_key;
790 835
836 if (bpf_map_is_dev_bound(map)) {
837 err = bpf_map_offload_get_next_key(map, key, next_key);
838 goto out;
839 }
840
791 rcu_read_lock(); 841 rcu_read_lock();
792 err = map->ops->map_get_next_key(map, key, next_key); 842 err = map->ops->map_get_next_key(map, key, next_key);
793 rcu_read_unlock(); 843 rcu_read_unlock();
844out:
794 if (err) 845 if (err)
795 goto free_next_key; 846 goto free_next_key;
796 847
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 48b61caa94cb..ceabb394d2dc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4816,6 +4816,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
4816 return -EINVAL; 4816 return -EINVAL;
4817 } 4817 }
4818 } 4818 }
4819
4820 if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
4821 !bpf_offload_dev_match(prog, map)) {
4822 verbose(env, "offload device mismatch between prog and map\n");
4823 return -EINVAL;
4824 }
4825
4819 return 0; 4826 return 0;
4820} 4827}
4821 4828
diff --git a/kernel/fail_function.c b/kernel/fail_function.c
new file mode 100644
index 000000000000..21b0122cb39c
--- /dev/null
+++ b/kernel/fail_function.c
@@ -0,0 +1,349 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * fail_function.c: Function-based error injection
4 */
5#include <linux/error-injection.h>
6#include <linux/debugfs.h>
7#include <linux/fault-inject.h>
8#include <linux/kallsyms.h>
9#include <linux/kprobes.h>
10#include <linux/module.h>
11#include <linux/mutex.h>
12#include <linux/slab.h>
13#include <linux/uaccess.h>
14
15static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs);
16
17struct fei_attr {
18 struct list_head list;
19 struct kprobe kp;
20 unsigned long retval;
21};
22static DEFINE_MUTEX(fei_lock);
23static LIST_HEAD(fei_attr_list);
24static DECLARE_FAULT_ATTR(fei_fault_attr);
25static struct dentry *fei_debugfs_dir;
26
27static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv)
28{
29 switch (get_injectable_error_type(addr)) {
30 case EI_ETYPE_NULL:
31 if (retv != 0)
32 return 0;
33 break;
34 case EI_ETYPE_ERRNO:
35 if (retv < (unsigned long)-MAX_ERRNO)
36 return (unsigned long)-EINVAL;
37 break;
38 case EI_ETYPE_ERRNO_NULL:
39 if (retv != 0 && retv < (unsigned long)-MAX_ERRNO)
40 return (unsigned long)-EINVAL;
41 break;
42 }
43
44 return retv;
45}
46
47static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr)
48{
49 struct fei_attr *attr;
50
51 attr = kzalloc(sizeof(*attr), GFP_KERNEL);
52 if (attr) {
53 attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL);
54 if (!attr->kp.symbol_name) {
55 kfree(attr);
56 return NULL;
57 }
58 attr->kp.pre_handler = fei_kprobe_handler;
59 attr->retval = adjust_error_retval(addr, 0);
60 INIT_LIST_HEAD(&attr->list);
61 }
62 return attr;
63}
64
65static void fei_attr_free(struct fei_attr *attr)
66{
67 if (attr) {
68 kfree(attr->kp.symbol_name);
69 kfree(attr);
70 }
71}
72
73static struct fei_attr *fei_attr_lookup(const char *sym)
74{
75 struct fei_attr *attr;
76
77 list_for_each_entry(attr, &fei_attr_list, list) {
78 if (!strcmp(attr->kp.symbol_name, sym))
79 return attr;
80 }
81
82 return NULL;
83}
84
85static bool fei_attr_is_valid(struct fei_attr *_attr)
86{
87 struct fei_attr *attr;
88
89 list_for_each_entry(attr, &fei_attr_list, list) {
90 if (attr == _attr)
91 return true;
92 }
93
94 return false;
95}
96
97static int fei_retval_set(void *data, u64 val)
98{
99 struct fei_attr *attr = data;
100 unsigned long retv = (unsigned long)val;
101 int err = 0;
102
103 mutex_lock(&fei_lock);
104 /*
105 * Since this operation can be done after retval file is removed,
106 * It is safer to check the attr is still valid before accessing
107 * its member.
108 */
109 if (!fei_attr_is_valid(attr)) {
110 err = -ENOENT;
111 goto out;
112 }
113
114 if (attr->kp.addr) {
115 if (adjust_error_retval((unsigned long)attr->kp.addr,
116 val) != retv)
117 err = -EINVAL;
118 }
119 if (!err)
120 attr->retval = val;
121out:
122 mutex_unlock(&fei_lock);
123
124 return err;
125}
126
127static int fei_retval_get(void *data, u64 *val)
128{
129 struct fei_attr *attr = data;
130 int err = 0;
131
132 mutex_lock(&fei_lock);
133 /* Here we also validate @attr to ensure it still exists. */
134 if (!fei_attr_is_valid(attr))
135 err = -ENOENT;
136 else
137 *val = attr->retval;
138 mutex_unlock(&fei_lock);
139
140 return err;
141}
142DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set,
143 "%llx\n");
144
145static int fei_debugfs_add_attr(struct fei_attr *attr)
146{
147 struct dentry *dir;
148
149 dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir);
150 if (!dir)
151 return -ENOMEM;
152
153 if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) {
154 debugfs_remove_recursive(dir);
155 return -ENOMEM;
156 }
157
158 return 0;
159}
160
161static void fei_debugfs_remove_attr(struct fei_attr *attr)
162{
163 struct dentry *dir;
164
165 dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir);
166 if (dir)
167 debugfs_remove_recursive(dir);
168}
169
170static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs)
171{
172 struct fei_attr *attr = container_of(kp, struct fei_attr, kp);
173
174 if (should_fail(&fei_fault_attr, 1)) {
175 regs_set_return_value(regs, attr->retval);
176 override_function_with_return(regs);
177 /* Kprobe specific fixup */
178 reset_current_kprobe();
179 preempt_enable_no_resched();
180 return 1;
181 }
182
183 return 0;
184}
185NOKPROBE_SYMBOL(fei_kprobe_handler)
186
187static void *fei_seq_start(struct seq_file *m, loff_t *pos)
188{
189 mutex_lock(&fei_lock);
190 return seq_list_start(&fei_attr_list, *pos);
191}
192
193static void fei_seq_stop(struct seq_file *m, void *v)
194{
195 mutex_unlock(&fei_lock);
196}
197
198static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos)
199{
200 return seq_list_next(v, &fei_attr_list, pos);
201}
202
203static int fei_seq_show(struct seq_file *m, void *v)
204{
205 struct fei_attr *attr = list_entry(v, struct fei_attr, list);
206
207 seq_printf(m, "%pf\n", attr->kp.addr);
208 return 0;
209}
210
211static const struct seq_operations fei_seq_ops = {
212 .start = fei_seq_start,
213 .next = fei_seq_next,
214 .stop = fei_seq_stop,
215 .show = fei_seq_show,
216};
217
218static int fei_open(struct inode *inode, struct file *file)
219{
220 return seq_open(file, &fei_seq_ops);
221}
222
223static void fei_attr_remove(struct fei_attr *attr)
224{
225 fei_debugfs_remove_attr(attr);
226 unregister_kprobe(&attr->kp);
227 list_del(&attr->list);
228 fei_attr_free(attr);
229}
230
231static void fei_attr_remove_all(void)
232{
233 struct fei_attr *attr, *n;
234
235 list_for_each_entry_safe(attr, n, &fei_attr_list, list) {
236 fei_attr_remove(attr);
237 }
238}
239
240static ssize_t fei_write(struct file *file, const char __user *buffer,
241 size_t count, loff_t *ppos)
242{
243 struct fei_attr *attr;
244 unsigned long addr;
245 char *buf, *sym;
246 int ret;
247
248 /* cut off if it is too long */
249 if (count > KSYM_NAME_LEN)
250 count = KSYM_NAME_LEN;
251 buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL);
252 if (!buf)
253 return -ENOMEM;
254
255 if (copy_from_user(buf, buffer, count)) {
256 ret = -EFAULT;
257 goto out;
258 }
259 buf[count] = '\0';
260 sym = strstrip(buf);
261
262 mutex_lock(&fei_lock);
263
264 /* Writing just spaces will remove all injection points */
265 if (sym[0] == '\0') {
266 fei_attr_remove_all();
267 ret = count;
268 goto out;
269 }
270 /* Writing !function will remove one injection point */
271 if (sym[0] == '!') {
272 attr = fei_attr_lookup(sym + 1);
273 if (!attr) {
274 ret = -ENOENT;
275 goto out;
276 }
277 fei_attr_remove(attr);
278 ret = count;
279 goto out;
280 }
281
282 addr = kallsyms_lookup_name(sym);
283 if (!addr) {
284 ret = -EINVAL;
285 goto out;
286 }
287 if (!within_error_injection_list(addr)) {
288 ret = -ERANGE;
289 goto out;
290 }
291 if (fei_attr_lookup(sym)) {
292 ret = -EBUSY;
293 goto out;
294 }
295 attr = fei_attr_new(sym, addr);
296 if (!attr) {
297 ret = -ENOMEM;
298 goto out;
299 }
300
301 ret = register_kprobe(&attr->kp);
302 if (!ret)
303 ret = fei_debugfs_add_attr(attr);
304 if (ret < 0)
305 fei_attr_remove(attr);
306 else {
307 list_add_tail(&attr->list, &fei_attr_list);
308 ret = count;
309 }
310out:
311 kfree(buf);
312 mutex_unlock(&fei_lock);
313 return ret;
314}
315
316static const struct file_operations fei_ops = {
317 .open = fei_open,
318 .read = seq_read,
319 .write = fei_write,
320 .llseek = seq_lseek,
321 .release = seq_release,
322};
323
324static int __init fei_debugfs_init(void)
325{
326 struct dentry *dir;
327
328 dir = fault_create_debugfs_attr("fail_function", NULL,
329 &fei_fault_attr);
330 if (IS_ERR(dir))
331 return PTR_ERR(dir);
332
333 /* injectable attribute is just a symlink of error_inject/list */
334 if (!debugfs_create_symlink("injectable", dir,
335 "../error_injection/list"))
336 goto error;
337
338 if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops))
339 goto error;
340
341 fei_debugfs_dir = dir;
342
343 return 0;
344error:
345 debugfs_remove_recursive(dir);
346 return -ENOMEM;
347}
348
349late_initcall(fei_debugfs_init);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b4aab48ad258..da2ccf142358 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -83,16 +83,6 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
83 return &(kretprobe_table_locks[hash].lock); 83 return &(kretprobe_table_locks[hash].lock);
84} 84}
85 85
86/* List of symbols that can be overriden for error injection. */
87static LIST_HEAD(kprobe_error_injection_list);
88static DEFINE_MUTEX(kprobe_ei_mutex);
89struct kprobe_ei_entry {
90 struct list_head list;
91 unsigned long start_addr;
92 unsigned long end_addr;
93 void *priv;
94};
95
96/* Blacklist -- list of struct kprobe_blacklist_entry */ 86/* Blacklist -- list of struct kprobe_blacklist_entry */
97static LIST_HEAD(kprobe_blacklist); 87static LIST_HEAD(kprobe_blacklist);
98 88
@@ -1404,17 +1394,6 @@ bool within_kprobe_blacklist(unsigned long addr)
1404 return false; 1394 return false;
1405} 1395}
1406 1396
1407bool within_kprobe_error_injection_list(unsigned long addr)
1408{
1409 struct kprobe_ei_entry *ent;
1410
1411 list_for_each_entry(ent, &kprobe_error_injection_list, list) {
1412 if (addr >= ent->start_addr && addr < ent->end_addr)
1413 return true;
1414 }
1415 return false;
1416}
1417
1418/* 1397/*
1419 * If we have a symbol_name argument, look it up and add the offset field 1398 * If we have a symbol_name argument, look it up and add the offset field
1420 * to it. This way, we can specify a relative address to a symbol. 1399 * to it. This way, we can specify a relative address to a symbol.
@@ -2189,86 +2168,6 @@ static int __init populate_kprobe_blacklist(unsigned long *start,
2189 return 0; 2168 return 0;
2190} 2169}
2191 2170
2192#ifdef CONFIG_BPF_KPROBE_OVERRIDE
2193/* Markers of the _kprobe_error_inject_list section */
2194extern unsigned long __start_kprobe_error_inject_list[];
2195extern unsigned long __stop_kprobe_error_inject_list[];
2196
2197/*
2198 * Lookup and populate the kprobe_error_injection_list.
2199 *
2200 * For safety reasons we only allow certain functions to be overriden with
2201 * bpf_error_injection, so we need to populate the list of the symbols that have
2202 * been marked as safe for overriding.
2203 */
2204static void populate_kprobe_error_injection_list(unsigned long *start,
2205 unsigned long *end,
2206 void *priv)
2207{
2208 unsigned long *iter;
2209 struct kprobe_ei_entry *ent;
2210 unsigned long entry, offset = 0, size = 0;
2211
2212 mutex_lock(&kprobe_ei_mutex);
2213 for (iter = start; iter < end; iter++) {
2214 entry = arch_deref_entry_point((void *)*iter);
2215
2216 if (!kernel_text_address(entry) ||
2217 !kallsyms_lookup_size_offset(entry, &size, &offset)) {
2218 pr_err("Failed to find error inject entry at %p\n",
2219 (void *)entry);
2220 continue;
2221 }
2222
2223 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
2224 if (!ent)
2225 break;
2226 ent->start_addr = entry;
2227 ent->end_addr = entry + size;
2228 ent->priv = priv;
2229 INIT_LIST_HEAD(&ent->list);
2230 list_add_tail(&ent->list, &kprobe_error_injection_list);
2231 }
2232 mutex_unlock(&kprobe_ei_mutex);
2233}
2234
2235static void __init populate_kernel_kprobe_ei_list(void)
2236{
2237 populate_kprobe_error_injection_list(__start_kprobe_error_inject_list,
2238 __stop_kprobe_error_inject_list,
2239 NULL);
2240}
2241
2242static void module_load_kprobe_ei_list(struct module *mod)
2243{
2244 if (!mod->num_kprobe_ei_funcs)
2245 return;
2246 populate_kprobe_error_injection_list(mod->kprobe_ei_funcs,
2247 mod->kprobe_ei_funcs +
2248 mod->num_kprobe_ei_funcs, mod);
2249}
2250
2251static void module_unload_kprobe_ei_list(struct module *mod)
2252{
2253 struct kprobe_ei_entry *ent, *n;
2254 if (!mod->num_kprobe_ei_funcs)
2255 return;
2256
2257 mutex_lock(&kprobe_ei_mutex);
2258 list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) {
2259 if (ent->priv == mod) {
2260 list_del_init(&ent->list);
2261 kfree(ent);
2262 }
2263 }
2264 mutex_unlock(&kprobe_ei_mutex);
2265}
2266#else
2267static inline void __init populate_kernel_kprobe_ei_list(void) {}
2268static inline void module_load_kprobe_ei_list(struct module *m) {}
2269static inline void module_unload_kprobe_ei_list(struct module *m) {}
2270#endif
2271
2272/* Module notifier call back, checking kprobes on the module */ 2171/* Module notifier call back, checking kprobes on the module */
2273static int kprobes_module_callback(struct notifier_block *nb, 2172static int kprobes_module_callback(struct notifier_block *nb,
2274 unsigned long val, void *data) 2173 unsigned long val, void *data)
@@ -2279,11 +2178,6 @@ static int kprobes_module_callback(struct notifier_block *nb,
2279 unsigned int i; 2178 unsigned int i;
2280 int checkcore = (val == MODULE_STATE_GOING); 2179 int checkcore = (val == MODULE_STATE_GOING);
2281 2180
2282 if (val == MODULE_STATE_COMING)
2283 module_load_kprobe_ei_list(mod);
2284 else if (val == MODULE_STATE_GOING)
2285 module_unload_kprobe_ei_list(mod);
2286
2287 if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) 2181 if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
2288 return NOTIFY_DONE; 2182 return NOTIFY_DONE;
2289 2183
@@ -2346,8 +2240,6 @@ static int __init init_kprobes(void)
2346 pr_err("Please take care of using kprobes.\n"); 2240 pr_err("Please take care of using kprobes.\n");
2347 } 2241 }
2348 2242
2349 populate_kernel_kprobe_ei_list();
2350
2351 if (kretprobe_blacklist_size) { 2243 if (kretprobe_blacklist_size) {
2352 /* lookup the function address from its name */ 2244 /* lookup the function address from its name */
2353 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { 2245 for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
@@ -2515,56 +2407,6 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = {
2515 .release = seq_release, 2407 .release = seq_release,
2516}; 2408};
2517 2409
2518/*
2519 * kprobes/error_injection_list -- shows which functions can be overriden for
2520 * error injection.
2521 * */
2522static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos)
2523{
2524 mutex_lock(&kprobe_ei_mutex);
2525 return seq_list_start(&kprobe_error_injection_list, *pos);
2526}
2527
2528static void kprobe_ei_seq_stop(struct seq_file *m, void *v)
2529{
2530 mutex_unlock(&kprobe_ei_mutex);
2531}
2532
2533static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos)
2534{
2535 return seq_list_next(v, &kprobe_error_injection_list, pos);
2536}
2537
2538static int kprobe_ei_seq_show(struct seq_file *m, void *v)
2539{
2540 char buffer[KSYM_SYMBOL_LEN];
2541 struct kprobe_ei_entry *ent =
2542 list_entry(v, struct kprobe_ei_entry, list);
2543
2544 sprint_symbol(buffer, ent->start_addr);
2545 seq_printf(m, "%s\n", buffer);
2546 return 0;
2547}
2548
2549static const struct seq_operations kprobe_ei_seq_ops = {
2550 .start = kprobe_ei_seq_start,
2551 .next = kprobe_ei_seq_next,
2552 .stop = kprobe_ei_seq_stop,
2553 .show = kprobe_ei_seq_show,
2554};
2555
2556static int kprobe_ei_open(struct inode *inode, struct file *filp)
2557{
2558 return seq_open(filp, &kprobe_ei_seq_ops);
2559}
2560
2561static const struct file_operations debugfs_kprobe_ei_ops = {
2562 .open = kprobe_ei_open,
2563 .read = seq_read,
2564 .llseek = seq_lseek,
2565 .release = seq_release,
2566};
2567
2568static void arm_all_kprobes(void) 2410static void arm_all_kprobes(void)
2569{ 2411{
2570 struct hlist_head *head; 2412 struct hlist_head *head;
@@ -2706,11 +2548,6 @@ static int __init debugfs_kprobe_init(void)
2706 if (!file) 2548 if (!file)
2707 goto error; 2549 goto error;
2708 2550
2709 file = debugfs_create_file("error_injection_list", 0444, dir, NULL,
2710 &debugfs_kprobe_ei_ops);
2711 if (!file)
2712 goto error;
2713
2714 return 0; 2551 return 0;
2715 2552
2716error: 2553error:
diff --git a/kernel/module.c b/kernel/module.c
index bd695bfdc5c4..601494d4b7ea 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3118,10 +3118,10 @@ static int find_module_sections(struct module *mod, struct load_info *info)
3118 sizeof(*mod->ftrace_callsites), 3118 sizeof(*mod->ftrace_callsites),
3119 &mod->num_ftrace_callsites); 3119 &mod->num_ftrace_callsites);
3120#endif 3120#endif
3121#ifdef CONFIG_BPF_KPROBE_OVERRIDE 3121#ifdef CONFIG_FUNCTION_ERROR_INJECTION
3122 mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list", 3122 mod->ei_funcs = section_objs(info, "_error_injection_whitelist",
3123 sizeof(*mod->kprobe_ei_funcs), 3123 sizeof(*mod->ei_funcs),
3124 &mod->num_kprobe_ei_funcs); 3124 &mod->num_ei_funcs);
3125#endif 3125#endif
3126 mod->extable = section_objs(info, "__ex_table", 3126 mod->extable = section_objs(info, "__ex_table",
3127 sizeof(*mod->extable), &mod->num_exentries); 3127 sizeof(*mod->extable), &mod->num_exentries);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ae3a2d519e50..7114c885a78a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -533,9 +533,7 @@ config FUNCTION_PROFILER
533config BPF_KPROBE_OVERRIDE 533config BPF_KPROBE_OVERRIDE
534 bool "Enable BPF programs to override a kprobed function" 534 bool "Enable BPF programs to override a kprobed function"
535 depends on BPF_EVENTS 535 depends on BPF_EVENTS
536 depends on KPROBES_ON_FTRACE 536 depends on FUNCTION_ERROR_INJECTION
537 depends on HAVE_KPROBE_OVERRIDE
538 depends on DYNAMIC_FTRACE_WITH_REGS
539 default n 537 default n
540 help 538 help
541 Allows BPF to override the execution of a probed function and 539 Allows BPF to override the execution of a probed function and
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index f6d2327ecb59..f274468cbc45 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,7 +14,7 @@
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15#include <linux/ctype.h> 15#include <linux/ctype.h>
16#include <linux/kprobes.h> 16#include <linux/kprobes.h>
17#include <asm/kprobes.h> 17#include <linux/error-injection.h>
18 18
19#include "trace_probe.h" 19#include "trace_probe.h"
20#include "trace.h" 20#include "trace.h"
@@ -83,9 +83,8 @@ EXPORT_SYMBOL_GPL(trace_call_bpf);
83#ifdef CONFIG_BPF_KPROBE_OVERRIDE 83#ifdef CONFIG_BPF_KPROBE_OVERRIDE
84BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) 84BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
85{ 85{
86 __this_cpu_write(bpf_kprobe_override, 1);
87 regs_set_return_value(regs, rc); 86 regs_set_return_value(regs, rc);
88 arch_ftrace_kprobe_override_function(regs); 87 override_function_with_return(regs);
89 return 0; 88 return 0;
90} 89}
91 90
@@ -800,11 +799,11 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
800 int ret = -EEXIST; 799 int ret = -EEXIST;
801 800
802 /* 801 /*
803 * Kprobe override only works for ftrace based kprobes, and only if they 802 * Kprobe override only works if they are on the function entry,
804 * are on the opt-in list. 803 * and only if they are on the opt-in list.
805 */ 804 */
806 if (prog->kprobe_override && 805 if (prog->kprobe_override &&
807 (!trace_kprobe_ftrace(event->tp_event) || 806 (!trace_kprobe_on_func_entry(event->tp_event) ||
808 !trace_kprobe_error_injectable(event->tp_event))) 807 !trace_kprobe_error_injectable(event->tp_event)))
809 return -EINVAL; 808 return -EINVAL;
810 809
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 91f4b57dab82..1fad24acd444 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -21,6 +21,7 @@
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/rculist.h> 23#include <linux/rculist.h>
24#include <linux/error-injection.h>
24 25
25#include "trace_probe.h" 26#include "trace_probe.h"
26 27
@@ -42,8 +43,6 @@ struct trace_kprobe {
42 (offsetof(struct trace_kprobe, tp.args) + \ 43 (offsetof(struct trace_kprobe, tp.args) + \
43 (sizeof(struct probe_arg) * (n))) 44 (sizeof(struct probe_arg) * (n)))
44 45
45DEFINE_PER_CPU(int, bpf_kprobe_override);
46
47static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) 46static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
48{ 47{
49 return tk->rp.handler != NULL; 48 return tk->rp.handler != NULL;
@@ -88,13 +87,16 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk)
88 return nhit; 87 return nhit;
89} 88}
90 89
91int trace_kprobe_ftrace(struct trace_event_call *call) 90bool trace_kprobe_on_func_entry(struct trace_event_call *call)
92{ 91{
93 struct trace_kprobe *tk = (struct trace_kprobe *)call->data; 92 struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
94 return kprobe_ftrace(&tk->rp.kp); 93
94 return kprobe_on_func_entry(tk->rp.kp.addr,
95 tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name,
96 tk->rp.kp.addr ? 0 : tk->rp.kp.offset);
95} 97}
96 98
97int trace_kprobe_error_injectable(struct trace_event_call *call) 99bool trace_kprobe_error_injectable(struct trace_event_call *call)
98{ 100{
99 struct trace_kprobe *tk = (struct trace_kprobe *)call->data; 101 struct trace_kprobe *tk = (struct trace_kprobe *)call->data;
100 unsigned long addr; 102 unsigned long addr;
@@ -106,7 +108,7 @@ int trace_kprobe_error_injectable(struct trace_event_call *call)
106 } else { 108 } else {
107 addr = (unsigned long)tk->rp.kp.addr; 109 addr = (unsigned long)tk->rp.kp.addr;
108 } 110 }
109 return within_kprobe_error_injection_list(addr); 111 return within_error_injection_list(addr);
110} 112}
111 113
112static int register_kprobe_event(struct trace_kprobe *tk); 114static int register_kprobe_event(struct trace_kprobe *tk);
@@ -1202,6 +1204,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1202 int rctx; 1204 int rctx;
1203 1205
1204 if (bpf_prog_array_valid(call)) { 1206 if (bpf_prog_array_valid(call)) {
1207 unsigned long orig_ip = instruction_pointer(regs);
1205 int ret; 1208 int ret;
1206 1209
1207 ret = trace_call_bpf(call, regs); 1210 ret = trace_call_bpf(call, regs);
@@ -1209,12 +1212,13 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
1209 /* 1212 /*
1210 * We need to check and see if we modified the pc of the 1213 * We need to check and see if we modified the pc of the
1211 * pt_regs, and if so clear the kprobe and return 1 so that we 1214 * pt_regs, and if so clear the kprobe and return 1 so that we
1212 * don't do the instruction skipping. Also reset our state so 1215 * don't do the single stepping.
1213 * we are clean the next pass through. 1216 * The ftrace kprobe handler leaves it up to us to re-enable
1217 * preemption here before returning if we've modified the ip.
1214 */ 1218 */
1215 if (__this_cpu_read(bpf_kprobe_override)) { 1219 if (orig_ip != instruction_pointer(regs)) {
1216 __this_cpu_write(bpf_kprobe_override, 0);
1217 reset_current_kprobe(); 1220 reset_current_kprobe();
1221 preempt_enable_no_resched();
1218 return 1; 1222 return 1;
1219 } 1223 }
1220 if (!ret) 1224 if (!ret)
@@ -1322,15 +1326,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1322 if (tk->tp.flags & TP_FLAG_TRACE) 1326 if (tk->tp.flags & TP_FLAG_TRACE)
1323 kprobe_trace_func(tk, regs); 1327 kprobe_trace_func(tk, regs);
1324#ifdef CONFIG_PERF_EVENTS 1328#ifdef CONFIG_PERF_EVENTS
1325 if (tk->tp.flags & TP_FLAG_PROFILE) { 1329 if (tk->tp.flags & TP_FLAG_PROFILE)
1326 ret = kprobe_perf_func(tk, regs); 1330 ret = kprobe_perf_func(tk, regs);
1327 /*
1328 * The ftrace kprobe handler leaves it up to us to re-enable
1329 * preemption here before returning if we've modified the ip.
1330 */
1331 if (ret)
1332 preempt_enable_no_resched();
1333 }
1334#endif 1331#endif
1335 return ret; 1332 return ret;
1336} 1333}
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 5e54d748c84c..e101c5bb9eda 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -252,8 +252,8 @@ struct symbol_cache;
252unsigned long update_symbol_cache(struct symbol_cache *sc); 252unsigned long update_symbol_cache(struct symbol_cache *sc);
253void free_symbol_cache(struct symbol_cache *sc); 253void free_symbol_cache(struct symbol_cache *sc);
254struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); 254struct symbol_cache *alloc_symbol_cache(const char *sym, long offset);
255int trace_kprobe_ftrace(struct trace_event_call *call); 255bool trace_kprobe_on_func_entry(struct trace_event_call *call);
256int trace_kprobe_error_injectable(struct trace_event_call *call); 256bool trace_kprobe_error_injectable(struct trace_event_call *call);
257#else 257#else
258/* uprobes do not support symbol fetch methods */ 258/* uprobes do not support symbol fetch methods */
259#define fetch_symbol_u8 NULL 259#define fetch_symbol_u8 NULL
@@ -280,14 +280,14 @@ alloc_symbol_cache(const char *sym, long offset)
280 return NULL; 280 return NULL;
281} 281}
282 282
283static inline int trace_kprobe_ftrace(struct trace_event_call *call) 283static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call)
284{ 284{
285 return 0; 285 return false;
286} 286}
287 287
288static inline int trace_kprobe_error_injectable(struct trace_event_call *call) 288static inline bool trace_kprobe_error_injectable(struct trace_event_call *call)
289{ 289{
290 return 0; 290 return false;
291} 291}
292#endif /* CONFIG_KPROBE_EVENTS */ 292#endif /* CONFIG_KPROBE_EVENTS */
293 293