Diffstat (limited to 'kernel')
257 files changed, 10205 insertions, 4708 deletions
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
index a3bb4cb52539..68646feefb3d 100644
--- a/kernel/Kconfig.freezer
+++ b/kernel/Kconfig.freezer
| @@ -1,2 +1,3 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | config FREEZER | 2 | config FREEZER |
| 2 | def_bool PM_SLEEP || CGROUP_FREEZER | 3 | def_bool PM_SLEEP || CGROUP_FREEZER |
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a846757..38ef6d06888e 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | # | 2 | # |
| 2 | # Timer Interrupt Frequency Configuration | 3 | # Timer Interrupt Frequency Configuration |
| 3 | # | 4 | # |
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index bf770d7556f7..e0852dc333ac 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | # | 2 | # |
| 2 | # The ARCH_INLINE foo is necessary because select ignores "depends on" | 3 | # The ARCH_INLINE foo is necessary because select ignores "depends on" |
| 3 | # | 4 | # |
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0fee5fe6c899..dc0b682ec2d9 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | 2 | ||
| 2 | choice | 3 | choice |
| 3 | prompt "Preemption Model" | 4 | prompt "Preemption Model" |
diff --git a/kernel/Makefile b/kernel/Makefile
index 298437bb2c6a..a8d923b5481b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -71,7 +71,7 @@ obj-$(CONFIG_UTS_NS) += utsname.o | |||
| 71 | obj-$(CONFIG_USER_NS) += user_namespace.o | 71 | obj-$(CONFIG_USER_NS) += user_namespace.o |
| 72 | obj-$(CONFIG_PID_NS) += pid_namespace.o | 72 | obj-$(CONFIG_PID_NS) += pid_namespace.o |
| 73 | obj-$(CONFIG_IKCONFIG) += configs.o | 73 | obj-$(CONFIG_IKCONFIG) += configs.o |
| 74 | obj-$(CONFIG_IKHEADERS_PROC) += kheaders.o | 74 | obj-$(CONFIG_IKHEADERS) += kheaders.o |
| 75 | obj-$(CONFIG_SMP) += stop_machine.o | 75 | obj-$(CONFIG_SMP) += stop_machine.o |
| 76 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o | 76 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o |
| 77 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o | 77 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o |
| @@ -127,7 +127,7 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE | |||
| 127 | $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz | 127 | $(obj)/kheaders.o: $(obj)/kheaders_data.tar.xz |
| 128 | 128 | ||
| 129 | quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz | 129 | quiet_cmd_genikh = CHK $(obj)/kheaders_data.tar.xz |
| 130 | cmd_genikh = $(srctree)/kernel/gen_ikh_data.sh $@ | 130 | cmd_genikh = $(CONFIG_SHELL) $(srctree)/kernel/gen_kheaders.sh $@ |
| 131 | $(obj)/kheaders_data.tar.xz: FORCE | 131 | $(obj)/kheaders_data.tar.xz: FORCE |
| 132 | $(call cmd,genikh) | 132 | $(call cmd,genikh) |
| 133 | 133 | ||
diff --git a/kernel/async.c b/kernel/async.c
index 12c332e4e13e..4f9c1d614016 100644
--- a/kernel/async.c
+++ b/kernel/async.c
| @@ -1,13 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * async.c: Asynchronous function calls for boot performance | 3 | * async.c: Asynchronous function calls for boot performance |
| 3 | * | 4 | * |
| 4 | * (C) Copyright 2009 Intel Corporation | 5 | * (C) Copyright 2009 Intel Corporation |
| 5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | 6 | * Author: Arjan van de Ven <arjan@linux.intel.com> |
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or | ||
| 8 | * modify it under the terms of the GNU General Public License | ||
| 9 | * as published by the Free Software Foundation; version 2 | ||
| 10 | * of the License. | ||
| 11 | */ | 7 | */ |
| 12 | 8 | ||
| 13 | 9 | ||
diff --git a/kernel/audit.c b/kernel/audit.c
index b96bf69183f4..da8dc0db5bd3 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* audit.c -- Auditing support | 2 | /* audit.c -- Auditing support |
| 2 | * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. | 3 | * Gateway between the kernel (e.g., selinux) and the user-space audit daemon. |
| 3 | * System-call specific features have moved to auditsc.c | 4 | * System-call specific features have moved to auditsc.c |
| @@ -5,20 +6,6 @@ | |||
| 5 | * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. | 6 | * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina. |
| 6 | * All Rights Reserved. | 7 | * All Rights Reserved. |
| 7 | * | 8 | * |
| 8 | * This program is free software; you can redistribute it and/or modify | ||
| 9 | * it under the terms of the GNU General Public License as published by | ||
| 10 | * the Free Software Foundation; either version 2 of the License, or | ||
| 11 | * (at your option) any later version. | ||
| 12 | * | ||
| 13 | * This program is distributed in the hope that it will be useful, | ||
| 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 16 | * GNU General Public License for more details. | ||
| 17 | * | ||
| 18 | * You should have received a copy of the GNU General Public License | ||
| 19 | * along with this program; if not, write to the Free Software | ||
| 20 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 21 | * | ||
| 22 | * Written by Rickard E. (Rik) Faith <faith@redhat.com> | 9 | * Written by Rickard E. (Rik) Faith <faith@redhat.com> |
| 23 | * | 10 | * |
| 24 | * Goals: 1) Integrate fully with Security Modules. | 11 | * Goals: 1) Integrate fully with Security Modules. |
| @@ -2274,6 +2261,33 @@ out: | |||
| 2274 | } | 2261 | } |
| 2275 | 2262 | ||
| 2276 | /** | 2263 | /** |
| 2264 | * audit_signal_info - record signal info for shutting down audit subsystem | ||
| 2265 | * @sig: signal value | ||
| 2266 | * @t: task being signaled | ||
| 2267 | * | ||
| 2268 | * If the audit subsystem is being terminated, record the task (pid) | ||
| 2269 | * and uid that is doing that. | ||
| 2270 | */ | ||
| 2271 | int audit_signal_info(int sig, struct task_struct *t) | ||
| 2272 | { | ||
| 2273 | kuid_t uid = current_uid(), auid; | ||
| 2274 | |||
| 2275 | if (auditd_test_task(t) && | ||
| 2276 | (sig == SIGTERM || sig == SIGHUP || | ||
| 2277 | sig == SIGUSR1 || sig == SIGUSR2)) { | ||
| 2278 | audit_sig_pid = task_tgid_nr(current); | ||
| 2279 | auid = audit_get_loginuid(current); | ||
| 2280 | if (uid_valid(auid)) | ||
| 2281 | audit_sig_uid = auid; | ||
| 2282 | else | ||
| 2283 | audit_sig_uid = uid; | ||
| 2284 | security_task_getsecid(current, &audit_sig_sid); | ||
| 2285 | } | ||
| 2286 | |||
| 2287 | return audit_signal_info_syscall(t); | ||
| 2288 | } | ||
| 2289 | |||
| 2290 | /** | ||
| 2277 | * audit_log_end - end one audit record | 2291 | * audit_log_end - end one audit record |
| 2278 | * @ab: the audit_buffer | 2292 | * @ab: the audit_buffer |
| 2279 | * | 2293 | * |
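The audit_signal_info() added above keeps only the auditd-shutdown bookkeeping in audit.c and delegates the per-syscall part to audit_signal_info_syscall(), which auditsc.c provides and which audit.h (below) stubs out to 0 when CONFIG_AUDITSYSCALL is off. For orientation only, a hedged sketch of the resulting call chain; the caller is quoted from memory and is not part of this patch:

/* Sketch, not patch content: the signal path keeps using the same entry point. */
static int check_kill_permission(int sig, struct kernel_siginfo *info,
				 struct task_struct *t)
{
	/* ... existing permission checks ... */
	return audit_signal_info(sig, t);	/* audit.c records an auditd shutdown,        */
						/* then calls audit_signal_info_syscall(t),   */
						/* which is a 0-returning stub without        */
						/* CONFIG_AUDITSYSCALL                        */
}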
diff --git a/kernel/audit.h b/kernel/audit.h
index 2071725a999f..6fb7160412d4 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
| @@ -1,22 +1,9 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ | ||
| 1 | /* audit -- definition of audit_context structure and supporting types | 2 | /* audit -- definition of audit_context structure and supporting types |
| 2 | * | 3 | * |
| 3 | * Copyright 2003-2004 Red Hat, Inc. | 4 | * Copyright 2003-2004 Red Hat, Inc. |
| 4 | * Copyright 2005 Hewlett-Packard Development Company, L.P. | 5 | * Copyright 2005 Hewlett-Packard Development Company, L.P. |
| 5 | * Copyright 2005 IBM Corporation | 6 | * Copyright 2005 IBM Corporation |
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License as published by | ||
| 9 | * the Free Software Foundation; either version 2 of the License, or | ||
| 10 | * (at your option) any later version. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 15 | * GNU General Public License for more details. | ||
| 16 | * | ||
| 17 | * You should have received a copy of the GNU General Public License | ||
| 18 | * along with this program; if not, write to the Free Software | ||
| 19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 20 | */ | 7 | */ |
| 21 | 8 | ||
| 22 | #include <linux/fs.h> | 9 | #include <linux/fs.h> |
| @@ -299,7 +286,7 @@ extern const char *audit_tree_path(struct audit_tree *tree); | |||
| 299 | extern void audit_put_tree(struct audit_tree *tree); | 286 | extern void audit_put_tree(struct audit_tree *tree); |
| 300 | extern void audit_kill_trees(struct audit_context *context); | 287 | extern void audit_kill_trees(struct audit_context *context); |
| 301 | 288 | ||
| 302 | extern int audit_signal_info(int sig, struct task_struct *t); | 289 | extern int audit_signal_info_syscall(struct task_struct *t); |
| 303 | extern void audit_filter_inodes(struct task_struct *tsk, | 290 | extern void audit_filter_inodes(struct task_struct *tsk, |
| 304 | struct audit_context *ctx); | 291 | struct audit_context *ctx); |
| 305 | extern struct list_head *audit_killed_trees(void); | 292 | extern struct list_head *audit_killed_trees(void); |
| @@ -330,7 +317,11 @@ extern struct list_head *audit_killed_trees(void); | |||
| 330 | #define audit_tree_path(rule) "" /* never called */ | 317 | #define audit_tree_path(rule) "" /* never called */ |
| 331 | #define audit_kill_trees(context) BUG() | 318 | #define audit_kill_trees(context) BUG() |
| 332 | 319 | ||
| 333 | #define audit_signal_info(s, t) AUDIT_DISABLED | 320 | static inline int audit_signal_info_syscall(struct task_struct *t) |
| 321 | { | ||
| 322 | return 0; | ||
| 323 | } | ||
| 324 | |||
| 334 | #define audit_filter_inodes(t, c) AUDIT_DISABLED | 325 | #define audit_filter_inodes(t, c) AUDIT_DISABLED |
| 335 | #endif /* CONFIG_AUDITSYSCALL */ | 326 | #endif /* CONFIG_AUDITSYSCALL */ |
| 336 | 327 | ||
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
index b5737b826951..f0d243318452 100644
--- a/kernel/audit_fsnotify.c
+++ b/kernel/audit_fsnotify.c
| @@ -1,18 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* audit_fsnotify.c -- tracking inodes | 2 | /* audit_fsnotify.c -- tracking inodes |
| 2 | * | 3 | * |
| 3 | * Copyright 2003-2009,2014-2015 Red Hat, Inc. | 4 | * Copyright 2003-2009,2014-2015 Red Hat, Inc. |
| 4 | * Copyright 2005 Hewlett-Packard Development Company, L.P. | 5 | * Copyright 2005 Hewlett-Packard Development Company, L.P. |
| 5 | * Copyright 2005 IBM Corporation | 6 | * Copyright 2005 IBM Corporation |
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License as published by | ||
| 9 | * the Free Software Foundation; either version 2 of the License, or | ||
| 10 | * (at your option) any later version. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 15 | * GNU General Public License for more details. | ||
| 16 | */ | 7 | */ |
| 17 | 8 | ||
| 18 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index b50c574223fa..1f31c2f1e6fc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
| @@ -1,22 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* audit_watch.c -- watching inodes | 2 | /* audit_watch.c -- watching inodes |
| 2 | * | 3 | * |
| 3 | * Copyright 2003-2009 Red Hat, Inc. | 4 | * Copyright 2003-2009 Red Hat, Inc. |
| 4 | * Copyright 2005 Hewlett-Packard Development Company, L.P. | 5 | * Copyright 2005 Hewlett-Packard Development Company, L.P. |
| 5 | * Copyright 2005 IBM Corporation | 6 | * Copyright 2005 IBM Corporation |
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License as published by | ||
| 9 | * the Free Software Foundation; either version 2 of the License, or | ||
| 10 | * (at your option) any later version. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 15 | * GNU General Public License for more details. | ||
| 16 | * | ||
| 17 | * You should have received a copy of the GNU General Public License | ||
| 18 | * along with this program; if not, write to the Free Software | ||
| 19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 20 | */ | 7 | */ |
| 21 | 8 | ||
| 22 | #include <linux/file.h> | 9 | #include <linux/file.h> |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 303fb04770ce..b0126e9c0743 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
| @@ -1,22 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* auditfilter.c -- filtering of audit events | 2 | /* auditfilter.c -- filtering of audit events |
| 2 | * | 3 | * |
| 3 | * Copyright 2003-2004 Red Hat, Inc. | 4 | * Copyright 2003-2004 Red Hat, Inc. |
| 4 | * Copyright 2005 Hewlett-Packard Development Company, L.P. | 5 | * Copyright 2005 Hewlett-Packard Development Company, L.P. |
| 5 | * Copyright 2005 IBM Corporation | 6 | * Copyright 2005 IBM Corporation |
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License as published by | ||
| 9 | * the Free Software Foundation; either version 2 of the License, or | ||
| 10 | * (at your option) any later version. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 15 | * GNU General Public License for more details. | ||
| 16 | * | ||
| 17 | * You should have received a copy of the GNU General Public License | ||
| 18 | * along with this program; if not, write to the Free Software | ||
| 19 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 20 | */ | 7 | */ |
| 21 | 8 | ||
| 22 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 9 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| @@ -335,7 +322,7 @@ static u32 audit_to_op(u32 op) | |||
| 335 | /* check if an audit field is valid */ | 322 | /* check if an audit field is valid */ |
| 336 | static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) | 323 | static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) |
| 337 | { | 324 | { |
| 338 | switch(f->type) { | 325 | switch (f->type) { |
| 339 | case AUDIT_MSGTYPE: | 326 | case AUDIT_MSGTYPE: |
| 340 | if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE && | 327 | if (entry->rule.listnr != AUDIT_FILTER_EXCLUDE && |
| 341 | entry->rule.listnr != AUDIT_FILTER_USER) | 328 | entry->rule.listnr != AUDIT_FILTER_USER) |
| @@ -347,7 +334,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) | |||
| 347 | break; | 334 | break; |
| 348 | } | 335 | } |
| 349 | 336 | ||
| 350 | switch(entry->rule.listnr) { | 337 | switch (entry->rule.listnr) { |
| 351 | case AUDIT_FILTER_FS: | 338 | case AUDIT_FILTER_FS: |
| 352 | switch(f->type) { | 339 | switch(f->type) { |
| 353 | case AUDIT_FSTYPE: | 340 | case AUDIT_FSTYPE: |
| @@ -358,9 +345,16 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) | |||
| 358 | } | 345 | } |
| 359 | } | 346 | } |
| 360 | 347 | ||
| 361 | switch(f->type) { | 348 | /* Check for valid field type and op */ |
| 362 | default: | 349 | switch (f->type) { |
| 363 | return -EINVAL; | 350 | case AUDIT_ARG0: |
| 351 | case AUDIT_ARG1: | ||
| 352 | case AUDIT_ARG2: | ||
| 353 | case AUDIT_ARG3: | ||
| 354 | case AUDIT_PERS: /* <uapi/linux/personality.h> */ | ||
| 355 | case AUDIT_DEVMINOR: | ||
| 356 | /* all ops are valid */ | ||
| 357 | break; | ||
| 364 | case AUDIT_UID: | 358 | case AUDIT_UID: |
| 365 | case AUDIT_EUID: | 359 | case AUDIT_EUID: |
| 366 | case AUDIT_SUID: | 360 | case AUDIT_SUID: |
| @@ -373,46 +367,53 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) | |||
| 373 | case AUDIT_FSGID: | 367 | case AUDIT_FSGID: |
| 374 | case AUDIT_OBJ_GID: | 368 | case AUDIT_OBJ_GID: |
| 375 | case AUDIT_PID: | 369 | case AUDIT_PID: |
| 376 | case AUDIT_PERS: | ||
| 377 | case AUDIT_MSGTYPE: | 370 | case AUDIT_MSGTYPE: |
| 378 | case AUDIT_PPID: | 371 | case AUDIT_PPID: |
| 379 | case AUDIT_DEVMAJOR: | 372 | case AUDIT_DEVMAJOR: |
| 380 | case AUDIT_DEVMINOR: | ||
| 381 | case AUDIT_EXIT: | 373 | case AUDIT_EXIT: |
| 382 | case AUDIT_SUCCESS: | 374 | case AUDIT_SUCCESS: |
| 383 | case AUDIT_INODE: | 375 | case AUDIT_INODE: |
| 384 | case AUDIT_SESSIONID: | 376 | case AUDIT_SESSIONID: |
| 377 | case AUDIT_SUBJ_SEN: | ||
| 378 | case AUDIT_SUBJ_CLR: | ||
| 379 | case AUDIT_OBJ_LEV_LOW: | ||
| 380 | case AUDIT_OBJ_LEV_HIGH: | ||
| 381 | case AUDIT_SADDR_FAM: | ||
| 385 | /* bit ops are only useful on syscall args */ | 382 | /* bit ops are only useful on syscall args */ |
| 386 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | 383 | if (f->op == Audit_bitmask || f->op == Audit_bittest) |
| 387 | return -EINVAL; | 384 | return -EINVAL; |
| 388 | break; | 385 | break; |
| 389 | case AUDIT_ARG0: | ||
| 390 | case AUDIT_ARG1: | ||
| 391 | case AUDIT_ARG2: | ||
| 392 | case AUDIT_ARG3: | ||
| 393 | case AUDIT_SUBJ_USER: | 386 | case AUDIT_SUBJ_USER: |
| 394 | case AUDIT_SUBJ_ROLE: | 387 | case AUDIT_SUBJ_ROLE: |
| 395 | case AUDIT_SUBJ_TYPE: | 388 | case AUDIT_SUBJ_TYPE: |
| 396 | case AUDIT_SUBJ_SEN: | ||
| 397 | case AUDIT_SUBJ_CLR: | ||
| 398 | case AUDIT_OBJ_USER: | 389 | case AUDIT_OBJ_USER: |
| 399 | case AUDIT_OBJ_ROLE: | 390 | case AUDIT_OBJ_ROLE: |
| 400 | case AUDIT_OBJ_TYPE: | 391 | case AUDIT_OBJ_TYPE: |
| 401 | case AUDIT_OBJ_LEV_LOW: | ||
| 402 | case AUDIT_OBJ_LEV_HIGH: | ||
| 403 | case AUDIT_WATCH: | 392 | case AUDIT_WATCH: |
| 404 | case AUDIT_DIR: | 393 | case AUDIT_DIR: |
| 405 | case AUDIT_FILTERKEY: | 394 | case AUDIT_FILTERKEY: |
| 406 | break; | ||
| 407 | case AUDIT_LOGINUID_SET: | 395 | case AUDIT_LOGINUID_SET: |
| 408 | if ((f->val != 0) && (f->val != 1)) | ||
| 409 | return -EINVAL; | ||
| 410 | /* FALL THROUGH */ | ||
| 411 | case AUDIT_ARCH: | 396 | case AUDIT_ARCH: |
| 412 | case AUDIT_FSTYPE: | 397 | case AUDIT_FSTYPE: |
| 398 | case AUDIT_PERM: | ||
| 399 | case AUDIT_FILETYPE: | ||
| 400 | case AUDIT_FIELD_COMPARE: | ||
| 401 | case AUDIT_EXE: | ||
| 402 | /* only equal and not equal valid ops */ | ||
| 413 | if (f->op != Audit_not_equal && f->op != Audit_equal) | 403 | if (f->op != Audit_not_equal && f->op != Audit_equal) |
| 414 | return -EINVAL; | 404 | return -EINVAL; |
| 415 | break; | 405 | break; |
| 406 | default: | ||
| 407 | /* field not recognized */ | ||
| 408 | return -EINVAL; | ||
| 409 | } | ||
| 410 | |||
| 411 | /* Check for select valid field values */ | ||
| 412 | switch (f->type) { | ||
| 413 | case AUDIT_LOGINUID_SET: | ||
| 414 | if ((f->val != 0) && (f->val != 1)) | ||
| 415 | return -EINVAL; | ||
| 416 | break; | ||
| 416 | case AUDIT_PERM: | 417 | case AUDIT_PERM: |
| 417 | if (f->val & ~15) | 418 | if (f->val & ~15) |
| 418 | return -EINVAL; | 419 | return -EINVAL; |
| @@ -425,11 +426,14 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) | |||
| 425 | if (f->val > AUDIT_MAX_FIELD_COMPARE) | 426 | if (f->val > AUDIT_MAX_FIELD_COMPARE) |
| 426 | return -EINVAL; | 427 | return -EINVAL; |
| 427 | break; | 428 | break; |
| 428 | case AUDIT_EXE: | 429 | case AUDIT_SADDR_FAM: |
| 429 | if (f->op != Audit_not_equal && f->op != Audit_equal) | 430 | if (f->val >= AF_MAX) |
| 430 | return -EINVAL; | 431 | return -EINVAL; |
| 431 | break; | 432 | break; |
| 433 | default: | ||
| 434 | break; | ||
| 432 | } | 435 | } |
| 436 | |||
| 433 | return 0; | 437 | return 0; |
| 434 | } | 438 | } |
| 435 | 439 | ||
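The restructure above turns audit_field_valid() into two passes: the first switch decides which comparison operators a field may use (the new AUDIT_SADDR_FAM, for example, accepts any of them), and the second constrains the value for the few fields that need it (AUDIT_SADDR_FAM must stay below AF_MAX). A compact sketch of that shape with an invented field, only to illustrate the pattern being followed:

/* Illustrative sketch; AUDIT_EXAMPLE and EXAMPLE_MAX are invented names. */
static int example_field_valid(u32 type, u32 op, u32 val)
{
	/* pass 1: which operators are legal for this field? */
	switch (type) {
	case AUDIT_EXAMPLE:
		if (op != Audit_equal && op != Audit_not_equal)
			return -EINVAL;
		break;
	default:
		return -EINVAL;		/* unrecognized field */
	}

	/* pass 2: which values are legal for this field? */
	switch (type) {
	case AUDIT_EXAMPLE:
		if (val >= EXAMPLE_MAX)
			return -EINVAL;
		break;
	default:
		break;
	}

	return 0;
}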
| @@ -1203,7 +1207,6 @@ int audit_comparator(u32 left, u32 op, u32 right) | |||
| 1203 | case Audit_bittest: | 1207 | case Audit_bittest: |
| 1204 | return ((left & right) == right); | 1208 | return ((left & right) == right); |
| 1205 | default: | 1209 | default: |
| 1206 | BUG(); | ||
| 1207 | return 0; | 1210 | return 0; |
| 1208 | } | 1211 | } |
| 1209 | } | 1212 | } |
| @@ -1226,7 +1229,6 @@ int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) | |||
| 1226 | case Audit_bitmask: | 1229 | case Audit_bitmask: |
| 1227 | case Audit_bittest: | 1230 | case Audit_bittest: |
| 1228 | default: | 1231 | default: |
| 1229 | BUG(); | ||
| 1230 | return 0; | 1232 | return 0; |
| 1231 | } | 1233 | } |
| 1232 | } | 1234 | } |
| @@ -1249,7 +1251,6 @@ int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) | |||
| 1249 | case Audit_bitmask: | 1251 | case Audit_bitmask: |
| 1250 | case Audit_bittest: | 1252 | case Audit_bittest: |
| 1251 | default: | 1253 | default: |
| 1252 | BUG(); | ||
| 1253 | return 0; | 1254 | return 0; |
| 1254 | } | 1255 | } |
| 1255 | } | 1256 | } |
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 95ae27edd417..4effe01ebbe2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
| @@ -601,12 +601,20 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 601 | } | 601 | } |
| 602 | break; | 602 | break; |
| 603 | case AUDIT_WATCH: | 603 | case AUDIT_WATCH: |
| 604 | if (name) | 604 | if (name) { |
| 605 | result = audit_watch_compare(rule->watch, name->ino, name->dev); | 605 | result = audit_watch_compare(rule->watch, |
| 606 | name->ino, | ||
| 607 | name->dev); | ||
| 608 | if (f->op == Audit_not_equal) | ||
| 609 | result = !result; | ||
| 610 | } | ||
| 606 | break; | 611 | break; |
| 607 | case AUDIT_DIR: | 612 | case AUDIT_DIR: |
| 608 | if (ctx) | 613 | if (ctx) { |
| 609 | result = match_tree_refs(ctx, rule->tree); | 614 | result = match_tree_refs(ctx, rule->tree); |
| 615 | if (f->op == Audit_not_equal) | ||
| 616 | result = !result; | ||
| 617 | } | ||
| 610 | break; | 618 | break; |
| 611 | case AUDIT_LOGINUID: | 619 | case AUDIT_LOGINUID: |
| 612 | result = audit_uid_comparator(audit_get_loginuid(tsk), | 620 | result = audit_uid_comparator(audit_get_loginuid(tsk), |
| @@ -615,6 +623,11 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 615 | case AUDIT_LOGINUID_SET: | 623 | case AUDIT_LOGINUID_SET: |
| 616 | result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); | 624 | result = audit_comparator(audit_loginuid_set(tsk), f->op, f->val); |
| 617 | break; | 625 | break; |
| 626 | case AUDIT_SADDR_FAM: | ||
| 627 | if (ctx->sockaddr) | ||
| 628 | result = audit_comparator(ctx->sockaddr->ss_family, | ||
| 629 | f->op, f->val); | ||
| 630 | break; | ||
| 618 | case AUDIT_SUBJ_USER: | 631 | case AUDIT_SUBJ_USER: |
| 619 | case AUDIT_SUBJ_ROLE: | 632 | case AUDIT_SUBJ_ROLE: |
| 620 | case AUDIT_SUBJ_TYPE: | 633 | case AUDIT_SUBJ_TYPE: |
| @@ -684,9 +697,13 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 684 | break; | 697 | break; |
| 685 | case AUDIT_PERM: | 698 | case AUDIT_PERM: |
| 686 | result = audit_match_perm(ctx, f->val); | 699 | result = audit_match_perm(ctx, f->val); |
| 700 | if (f->op == Audit_not_equal) | ||
| 701 | result = !result; | ||
| 687 | break; | 702 | break; |
| 688 | case AUDIT_FILETYPE: | 703 | case AUDIT_FILETYPE: |
| 689 | result = audit_match_filetype(ctx, f->val); | 704 | result = audit_match_filetype(ctx, f->val); |
| 705 | if (f->op == Audit_not_equal) | ||
| 706 | result = !result; | ||
| 690 | break; | 707 | break; |
| 691 | case AUDIT_FIELD_COMPARE: | 708 | case AUDIT_FIELD_COMPARE: |
| 692 | result = audit_field_compare(tsk, cred, f, ctx, name); | 709 | result = audit_field_compare(tsk, cred, f, ctx, name); |
| @@ -2360,30 +2377,17 @@ void __audit_ptrace(struct task_struct *t) | |||
| 2360 | } | 2377 | } |
| 2361 | 2378 | ||
| 2362 | /** | 2379 | /** |
| 2363 | * audit_signal_info - record signal info for shutting down audit subsystem | 2380 | * audit_signal_info_syscall - record signal info for syscalls |
| 2364 | * @sig: signal value | ||
| 2365 | * @t: task being signaled | 2381 | * @t: task being signaled |
| 2366 | * | 2382 | * |
| 2367 | * If the audit subsystem is being terminated, record the task (pid) | 2383 | * If the audit subsystem is being terminated, record the task (pid) |
| 2368 | * and uid that is doing that. | 2384 | * and uid that is doing that. |
| 2369 | */ | 2385 | */ |
| 2370 | int audit_signal_info(int sig, struct task_struct *t) | 2386 | int audit_signal_info_syscall(struct task_struct *t) |
| 2371 | { | 2387 | { |
| 2372 | struct audit_aux_data_pids *axp; | 2388 | struct audit_aux_data_pids *axp; |
| 2373 | struct audit_context *ctx = audit_context(); | 2389 | struct audit_context *ctx = audit_context(); |
| 2374 | kuid_t uid = current_uid(), auid, t_uid = task_uid(t); | 2390 | kuid_t t_uid = task_uid(t); |
| 2375 | |||
| 2376 | if (auditd_test_task(t) && | ||
| 2377 | (sig == SIGTERM || sig == SIGHUP || | ||
| 2378 | sig == SIGUSR1 || sig == SIGUSR2)) { | ||
| 2379 | audit_sig_pid = task_tgid_nr(current); | ||
| 2380 | auid = audit_get_loginuid(current); | ||
| 2381 | if (uid_valid(auid)) | ||
| 2382 | audit_sig_uid = auid; | ||
| 2383 | else | ||
| 2384 | audit_sig_uid = uid; | ||
| 2385 | security_task_getsecid(current, &audit_sig_sid); | ||
| 2386 | } | ||
| 2387 | 2391 | ||
| 2388 | if (!audit_signals || audit_dummy_context()) | 2392 | if (!audit_signals || audit_dummy_context()) |
| 2389 | return 0; | 2393 | return 0; |
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index a563c8fdad0d..a2a97fa3071b 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
| @@ -1,13 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Simple stack backtrace regression test module | 3 | * Simple stack backtrace regression test module |
| 3 | * | 4 | * |
| 4 | * (C) Copyright 2008 Intel Corporation | 5 | * (C) Copyright 2008 Intel Corporation |
| 5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | 6 | * Author: Arjan van de Ven <arjan@linux.intel.com> |
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or | ||
| 8 | * modify it under the terms of the GNU General Public License | ||
| 9 | * as published by the Free Software Foundation; version 2 | ||
| 10 | * of the License. | ||
| 11 | */ | 7 | */ |
| 12 | 8 | ||
| 13 | #include <linux/completion.h> | 9 | #include <linux/completion.h> |
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 4c2fa3ac56f6..29d781061cd5 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
| @@ -1,5 +1,6 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0 | 1 | # SPDX-License-Identifier: GPL-2.0 |
| 2 | obj-y := core.o | 2 | obj-y := core.o |
| 3 | CFLAGS_core.o += $(call cc-disable-warning, override-init) | ||
| 3 | 4 | ||
| 4 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o | 5 | obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o |
| 5 | obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o | 6 | obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o |
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 584636c9e2eb..1c65ce0098a9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
| @@ -1,14 +1,6 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | 2 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com |
| 2 | * Copyright (c) 2016,2017 Facebook | 3 | * Copyright (c) 2016,2017 Facebook |
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of version 2 of the GNU General Public | ||
| 6 | * License as published by the Free Software Foundation. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, but | ||
| 9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 11 | * General Public License for more details. | ||
| 12 | */ | 4 | */ |
| 13 | #include <linux/bpf.h> | 5 | #include <linux/bpf.h> |
| 14 | #include <linux/btf.h> | 6 | #include <linux/btf.h> |
| @@ -83,6 +75,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) | |||
| 83 | u32 elem_size, index_mask, max_entries; | 75 | u32 elem_size, index_mask, max_entries; |
| 84 | bool unpriv = !capable(CAP_SYS_ADMIN); | 76 | bool unpriv = !capable(CAP_SYS_ADMIN); |
| 85 | u64 cost, array_size, mask64; | 77 | u64 cost, array_size, mask64; |
| 78 | struct bpf_map_memory mem; | ||
| 86 | struct bpf_array *array; | 79 | struct bpf_array *array; |
| 87 | 80 | ||
| 88 | elem_size = round_up(attr->value_size, 8); | 81 | elem_size = round_up(attr->value_size, 8); |
| @@ -116,32 +109,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) | |||
| 116 | 109 | ||
| 117 | /* make sure there is no u32 overflow later in round_up() */ | 110 | /* make sure there is no u32 overflow later in round_up() */ |
| 118 | cost = array_size; | 111 | cost = array_size; |
| 119 | if (cost >= U32_MAX - PAGE_SIZE) | 112 | if (percpu) |
| 120 | return ERR_PTR(-ENOMEM); | ||
| 121 | if (percpu) { | ||
| 122 | cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); | 113 | cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); |
| 123 | if (cost >= U32_MAX - PAGE_SIZE) | ||
| 124 | return ERR_PTR(-ENOMEM); | ||
| 125 | } | ||
| 126 | cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | ||
| 127 | 114 | ||
| 128 | ret = bpf_map_precharge_memlock(cost); | 115 | ret = bpf_map_charge_init(&mem, cost); |
| 129 | if (ret < 0) | 116 | if (ret < 0) |
| 130 | return ERR_PTR(ret); | 117 | return ERR_PTR(ret); |
| 131 | 118 | ||
| 132 | /* allocate all map elements and zero-initialize them */ | 119 | /* allocate all map elements and zero-initialize them */ |
| 133 | array = bpf_map_area_alloc(array_size, numa_node); | 120 | array = bpf_map_area_alloc(array_size, numa_node); |
| 134 | if (!array) | 121 | if (!array) { |
| 122 | bpf_map_charge_finish(&mem); | ||
| 135 | return ERR_PTR(-ENOMEM); | 123 | return ERR_PTR(-ENOMEM); |
| 124 | } | ||
| 136 | array->index_mask = index_mask; | 125 | array->index_mask = index_mask; |
| 137 | array->map.unpriv_array = unpriv; | 126 | array->map.unpriv_array = unpriv; |
| 138 | 127 | ||
| 139 | /* copy mandatory map attributes */ | 128 | /* copy mandatory map attributes */ |
| 140 | bpf_map_init_from_attr(&array->map, attr); | 129 | bpf_map_init_from_attr(&array->map, attr); |
| 141 | array->map.pages = cost; | 130 | bpf_map_charge_move(&array->map.memory, &mem); |
| 142 | array->elem_size = elem_size; | 131 | array->elem_size = elem_size; |
| 143 | 132 | ||
| 144 | if (percpu && bpf_array_alloc_percpu(array)) { | 133 | if (percpu && bpf_array_alloc_percpu(array)) { |
| 134 | bpf_map_charge_finish(&array->map.memory); | ||
| 145 | bpf_map_area_free(array); | 135 | bpf_map_area_free(array); |
| 146 | return ERR_PTR(-ENOMEM); | 136 | return ERR_PTR(-ENOMEM); |
| 147 | } | 137 | } |
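The arraymap.c hunks above replace the open-coded page rounding and bpf_map_precharge_memlock() call with the bpf_map_memory helpers introduced by this series. Sketched for a hypothetical map type (struct foo_map, assumed to embed struct bpf_map as its first member), the resulting charge/uncharge pattern looks roughly like this:

/* Sketch of the charging pattern used above; foo_map is hypothetical. */
static struct bpf_map *foo_map_alloc(union bpf_attr *attr)
{
	u64 cost = sizeof(struct foo_map) +
		   (u64)attr->max_entries * attr->value_size;
	struct bpf_map_memory mem;
	struct foo_map *map;
	int ret;

	ret = bpf_map_charge_init(&mem, cost);	/* charge in bytes; limit checks inside */
	if (ret < 0)
		return ERR_PTR(ret);

	map = bpf_map_area_alloc(sizeof(*map), NUMA_NO_NODE);
	if (!map) {
		bpf_map_charge_finish(&mem);	/* undo the charge on failure */
		return ERR_PTR(-ENOMEM);
	}

	bpf_map_init_from_attr(&map->map, attr);
	bpf_map_charge_move(&map->map.memory, &mem);	/* hand the charge to the map */
	return &map->map;
}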
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e6ef4401a138..1b6b9349cb85 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
| @@ -1,8 +1,5 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* Copyright (c) 2016 Facebook | 2 | /* Copyright (c) 2016 Facebook |
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | 3 | */ |
| 7 | #include <linux/cpumask.h> | 4 | #include <linux/cpumask.h> |
| 8 | #include <linux/spinlock.h> | 5 | #include <linux/spinlock.h> |
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 7d4f89b7cb84..f02504640e18 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
| @@ -1,8 +1,5 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0-only */ | ||
| 1 | /* Copyright (c) 2016 Facebook | 2 | /* Copyright (c) 2016 Facebook |
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | 3 | */ |
| 7 | #ifndef __BPF_LRU_LIST_H_ | 4 | #ifndef __BPF_LRU_LIST_H_ |
| 8 | #define __BPF_LRU_LIST_H_ | 5 | #define __BPF_LRU_LIST_H_ |
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index cad09858a5f2..546ebee39e2a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
| @@ -1928,8 +1928,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, | |||
| 1928 | /* Check array->index_type */ | 1928 | /* Check array->index_type */ |
| 1929 | index_type_id = array->index_type; | 1929 | index_type_id = array->index_type; |
| 1930 | index_type = btf_type_by_id(btf, index_type_id); | 1930 | index_type = btf_type_by_id(btf, index_type_id); |
| 1931 | if (btf_type_is_resolve_source_only(index_type) || | 1931 | if (btf_type_nosize_or_null(index_type) || |
| 1932 | btf_type_nosize_or_null(index_type)) { | 1932 | btf_type_is_resolve_source_only(index_type)) { |
| 1933 | btf_verifier_log_type(env, v->t, "Invalid index"); | 1933 | btf_verifier_log_type(env, v->t, "Invalid index"); |
| 1934 | return -EINVAL; | 1934 | return -EINVAL; |
| 1935 | } | 1935 | } |
| @@ -1948,8 +1948,8 @@ static int btf_array_resolve(struct btf_verifier_env *env, | |||
| 1948 | /* Check array->type */ | 1948 | /* Check array->type */ |
| 1949 | elem_type_id = array->type; | 1949 | elem_type_id = array->type; |
| 1950 | elem_type = btf_type_by_id(btf, elem_type_id); | 1950 | elem_type = btf_type_by_id(btf, elem_type_id); |
| 1951 | if (btf_type_is_resolve_source_only(elem_type) || | 1951 | if (btf_type_nosize_or_null(elem_type) || |
| 1952 | btf_type_nosize_or_null(elem_type)) { | 1952 | btf_type_is_resolve_source_only(elem_type)) { |
| 1953 | btf_verifier_log_type(env, v->t, | 1953 | btf_verifier_log_type(env, v->t, |
| 1954 | "Invalid elem"); | 1954 | "Invalid elem"); |
| 1955 | return -EINVAL; | 1955 | return -EINVAL; |
| @@ -2170,8 +2170,8 @@ static int btf_struct_resolve(struct btf_verifier_env *env, | |||
| 2170 | const struct btf_type *member_type = btf_type_by_id(env->btf, | 2170 | const struct btf_type *member_type = btf_type_by_id(env->btf, |
| 2171 | member_type_id); | 2171 | member_type_id); |
| 2172 | 2172 | ||
| 2173 | if (btf_type_is_resolve_source_only(member_type) || | 2173 | if (btf_type_nosize_or_null(member_type) || |
| 2174 | btf_type_nosize_or_null(member_type)) { | 2174 | btf_type_is_resolve_source_only(member_type)) { |
| 2175 | btf_verifier_log_member(env, v->t, member, | 2175 | btf_verifier_log_member(env, v->t, member, |
| 2176 | "Invalid member"); | 2176 | "Invalid member"); |
| 2177 | return -EINVAL; | 2177 | return -EINVAL; |
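The three btf.c hunks only swap the order of the two predicates; presumably the point is that btf_type_nosize_or_null() is the check that tolerates a NULL type, so it has to run before btf_type_is_resolve_source_only() dereferences the pointer. The shape of the fix in isolation, with invented helper names:

/* Generic illustration of the reordering above (helper names invented). */
if (type_is_null_or_sizeless(t) ||	/* safe when t == NULL, so it must come first */
    type_needs_source_resolve(t)) {	/* reads t->info, assumes t != NULL */
	return -EINVAL;			/* reject the index/element/member type */
}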
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index fcde0f7b2585..0a00eaca6fae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
| @@ -1,11 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Functions to manage eBPF programs attached to cgroups | 3 | * Functions to manage eBPF programs attached to cgroups |
| 3 | * | 4 | * |
| 4 | * Copyright (c) 2016 Daniel Mack | 5 | * Copyright (c) 2016 Daniel Mack |
| 5 | * | ||
| 6 | * This file is subject to the terms and conditions of version 2 of the GNU | ||
| 7 | * General Public License. See the file COPYING in the main directory of the | ||
| 8 | * Linux distribution for more details. | ||
| 9 | */ | 6 | */ |
| 10 | 7 | ||
| 11 | #include <linux/kernel.h> | 8 | #include <linux/kernel.h> |
| @@ -18,19 +15,34 @@ | |||
| 18 | #include <linux/bpf.h> | 15 | #include <linux/bpf.h> |
| 19 | #include <linux/bpf-cgroup.h> | 16 | #include <linux/bpf-cgroup.h> |
| 20 | #include <net/sock.h> | 17 | #include <net/sock.h> |
| 18 | #include <net/bpf_sk_storage.h> | ||
| 19 | |||
| 20 | #include "../cgroup/cgroup-internal.h" | ||
| 21 | 21 | ||
| 22 | DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); | 22 | DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); |
| 23 | EXPORT_SYMBOL(cgroup_bpf_enabled_key); | 23 | EXPORT_SYMBOL(cgroup_bpf_enabled_key); |
| 24 | 24 | ||
| 25 | void cgroup_bpf_offline(struct cgroup *cgrp) | ||
| 26 | { | ||
| 27 | cgroup_get(cgrp); | ||
| 28 | percpu_ref_kill(&cgrp->bpf.refcnt); | ||
| 29 | } | ||
| 30 | |||
| 25 | /** | 31 | /** |
| 26 | * cgroup_bpf_put() - put references of all bpf programs | 32 | * cgroup_bpf_release() - put references of all bpf programs and |
| 27 | * @cgrp: the cgroup to modify | 33 | * release all cgroup bpf data |
| 34 | * @work: work structure embedded into the cgroup to modify | ||
| 28 | */ | 35 | */ |
| 29 | void cgroup_bpf_put(struct cgroup *cgrp) | 36 | static void cgroup_bpf_release(struct work_struct *work) |
| 30 | { | 37 | { |
| 38 | struct cgroup *cgrp = container_of(work, struct cgroup, | ||
| 39 | bpf.release_work); | ||
| 31 | enum bpf_cgroup_storage_type stype; | 40 | enum bpf_cgroup_storage_type stype; |
| 41 | struct bpf_prog_array *old_array; | ||
| 32 | unsigned int type; | 42 | unsigned int type; |
| 33 | 43 | ||
| 44 | mutex_lock(&cgroup_mutex); | ||
| 45 | |||
| 34 | for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { | 46 | for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { |
| 35 | struct list_head *progs = &cgrp->bpf.progs[type]; | 47 | struct list_head *progs = &cgrp->bpf.progs[type]; |
| 36 | struct bpf_prog_list *pl, *tmp; | 48 | struct bpf_prog_list *pl, *tmp; |
| @@ -45,8 +57,29 @@ void cgroup_bpf_put(struct cgroup *cgrp) | |||
| 45 | kfree(pl); | 57 | kfree(pl); |
| 46 | static_branch_dec(&cgroup_bpf_enabled_key); | 58 | static_branch_dec(&cgroup_bpf_enabled_key); |
| 47 | } | 59 | } |
| 48 | bpf_prog_array_free(cgrp->bpf.effective[type]); | 60 | old_array = rcu_dereference_protected( |
| 61 | cgrp->bpf.effective[type], | ||
| 62 | lockdep_is_held(&cgroup_mutex)); | ||
| 63 | bpf_prog_array_free(old_array); | ||
| 49 | } | 64 | } |
| 65 | |||
| 66 | mutex_unlock(&cgroup_mutex); | ||
| 67 | |||
| 68 | percpu_ref_exit(&cgrp->bpf.refcnt); | ||
| 69 | cgroup_put(cgrp); | ||
| 70 | } | ||
| 71 | |||
| 72 | /** | ||
| 73 | * cgroup_bpf_release_fn() - callback used to schedule releasing | ||
| 74 | * of bpf cgroup data | ||
| 75 | * @ref: percpu ref counter structure | ||
| 76 | */ | ||
| 77 | static void cgroup_bpf_release_fn(struct percpu_ref *ref) | ||
| 78 | { | ||
| 79 | struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); | ||
| 80 | |||
| 81 | INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); | ||
| 82 | queue_work(system_wq, &cgrp->bpf.release_work); | ||
| 50 | } | 83 | } |
| 51 | 84 | ||
| 52 | /* count number of elements in the list. | 85 | /* count number of elements in the list. |
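Taken together, cgroup_bpf_offline(), cgroup_bpf_release() and cgroup_bpf_release_fn() above move cgroup-bpf data to percpu_ref-based lifetime management: the ref is killed when the cgroup goes offline, and the final drop defers the real teardown to a workqueue so it can take cgroup_mutex and sleep. The same pattern in isolation, as a hedged, self-contained sketch for a generic object:

/* Minimal sketch of the percpu_ref + deferred-release pattern used above. */
#include <linux/kernel.h>
#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct obj {
	struct percpu_ref refcnt;
	struct work_struct release_work;
};

static void obj_release_workfn(struct work_struct *work)
{
	struct obj *o = container_of(work, struct obj, release_work);

	/* teardown that may sleep or take mutexes goes here */
	percpu_ref_exit(&o->refcnt);
	kfree(o);
}

static void obj_release_fn(struct percpu_ref *ref)
{
	struct obj *o = container_of(ref, struct obj, refcnt);

	/* may run in a context that cannot sleep: defer the heavy lifting */
	INIT_WORK(&o->release_work, obj_release_workfn);
	queue_work(system_wq, &o->release_work);
}

static int obj_init(struct obj *o)
{
	return percpu_ref_init(&o->refcnt, obj_release_fn, 0, GFP_KERNEL);
}

static void obj_offline(struct obj *o)
{
	percpu_ref_kill(&o->refcnt);	/* drops the initial reference */
}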
| @@ -101,7 +134,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp, | |||
| 101 | */ | 134 | */ |
| 102 | static int compute_effective_progs(struct cgroup *cgrp, | 135 | static int compute_effective_progs(struct cgroup *cgrp, |
| 103 | enum bpf_attach_type type, | 136 | enum bpf_attach_type type, |
| 104 | struct bpf_prog_array __rcu **array) | 137 | struct bpf_prog_array **array) |
| 105 | { | 138 | { |
| 106 | enum bpf_cgroup_storage_type stype; | 139 | enum bpf_cgroup_storage_type stype; |
| 107 | struct bpf_prog_array *progs; | 140 | struct bpf_prog_array *progs; |
| @@ -139,17 +172,16 @@ static int compute_effective_progs(struct cgroup *cgrp, | |||
| 139 | } | 172 | } |
| 140 | } while ((p = cgroup_parent(p))); | 173 | } while ((p = cgroup_parent(p))); |
| 141 | 174 | ||
| 142 | rcu_assign_pointer(*array, progs); | 175 | *array = progs; |
| 143 | return 0; | 176 | return 0; |
| 144 | } | 177 | } |
| 145 | 178 | ||
| 146 | static void activate_effective_progs(struct cgroup *cgrp, | 179 | static void activate_effective_progs(struct cgroup *cgrp, |
| 147 | enum bpf_attach_type type, | 180 | enum bpf_attach_type type, |
| 148 | struct bpf_prog_array __rcu *array) | 181 | struct bpf_prog_array *old_array) |
| 149 | { | 182 | { |
| 150 | struct bpf_prog_array __rcu *old_array; | 183 | rcu_swap_protected(cgrp->bpf.effective[type], old_array, |
| 151 | 184 | lockdep_is_held(&cgroup_mutex)); | |
| 152 | old_array = xchg(&cgrp->bpf.effective[type], array); | ||
| 153 | /* free prog array after grace period, since __cgroup_bpf_run_*() | 185 | /* free prog array after grace period, since __cgroup_bpf_run_*() |
| 154 | * might be still walking the array | 186 | * might be still walking the array |
| 155 | */ | 187 | */ |
| @@ -166,8 +198,13 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) | |||
| 166 | * that array below is variable length | 198 | * that array below is variable length |
| 167 | */ | 199 | */ |
| 168 | #define NR ARRAY_SIZE(cgrp->bpf.effective) | 200 | #define NR ARRAY_SIZE(cgrp->bpf.effective) |
| 169 | struct bpf_prog_array __rcu *arrays[NR] = {}; | 201 | struct bpf_prog_array *arrays[NR] = {}; |
| 170 | int i; | 202 | int ret, i; |
| 203 | |||
| 204 | ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, | ||
| 205 | GFP_KERNEL); | ||
| 206 | if (ret) | ||
| 207 | return ret; | ||
| 171 | 208 | ||
| 172 | for (i = 0; i < NR; i++) | 209 | for (i = 0; i < NR; i++) |
| 173 | INIT_LIST_HEAD(&cgrp->bpf.progs[i]); | 210 | INIT_LIST_HEAD(&cgrp->bpf.progs[i]); |
| @@ -183,6 +220,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) | |||
| 183 | cleanup: | 220 | cleanup: |
| 184 | for (i = 0; i < NR; i++) | 221 | for (i = 0; i < NR; i++) |
| 185 | bpf_prog_array_free(arrays[i]); | 222 | bpf_prog_array_free(arrays[i]); |
| 223 | |||
| 224 | percpu_ref_exit(&cgrp->bpf.refcnt); | ||
| 225 | |||
| 186 | return -ENOMEM; | 226 | return -ENOMEM; |
| 187 | } | 227 | } |
| 188 | 228 | ||
| @@ -196,6 +236,9 @@ static int update_effective_progs(struct cgroup *cgrp, | |||
| 196 | css_for_each_descendant_pre(css, &cgrp->self) { | 236 | css_for_each_descendant_pre(css, &cgrp->self) { |
| 197 | struct cgroup *desc = container_of(css, struct cgroup, self); | 237 | struct cgroup *desc = container_of(css, struct cgroup, self); |
| 198 | 238 | ||
| 239 | if (percpu_ref_is_zero(&desc->bpf.refcnt)) | ||
| 240 | continue; | ||
| 241 | |||
| 199 | err = compute_effective_progs(desc, type, &desc->bpf.inactive); | 242 | err = compute_effective_progs(desc, type, &desc->bpf.inactive); |
| 200 | if (err) | 243 | if (err) |
| 201 | goto cleanup; | 244 | goto cleanup; |
| @@ -205,6 +248,14 @@ static int update_effective_progs(struct cgroup *cgrp, | |||
| 205 | css_for_each_descendant_pre(css, &cgrp->self) { | 248 | css_for_each_descendant_pre(css, &cgrp->self) { |
| 206 | struct cgroup *desc = container_of(css, struct cgroup, self); | 249 | struct cgroup *desc = container_of(css, struct cgroup, self); |
| 207 | 250 | ||
| 251 | if (percpu_ref_is_zero(&desc->bpf.refcnt)) { | ||
| 252 | if (unlikely(desc->bpf.inactive)) { | ||
| 253 | bpf_prog_array_free(desc->bpf.inactive); | ||
| 254 | desc->bpf.inactive = NULL; | ||
| 255 | } | ||
| 256 | continue; | ||
| 257 | } | ||
| 258 | |||
| 208 | activate_effective_progs(desc, type, desc->bpf.inactive); | 259 | activate_effective_progs(desc, type, desc->bpf.inactive); |
| 209 | desc->bpf.inactive = NULL; | 260 | desc->bpf.inactive = NULL; |
| 210 | } | 261 | } |
| @@ -444,10 +495,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, | |||
| 444 | enum bpf_attach_type type = attr->query.attach_type; | 495 | enum bpf_attach_type type = attr->query.attach_type; |
| 445 | struct list_head *progs = &cgrp->bpf.progs[type]; | 496 | struct list_head *progs = &cgrp->bpf.progs[type]; |
| 446 | u32 flags = cgrp->bpf.flags[type]; | 497 | u32 flags = cgrp->bpf.flags[type]; |
| 498 | struct bpf_prog_array *effective; | ||
| 447 | int cnt, ret = 0, i; | 499 | int cnt, ret = 0, i; |
| 448 | 500 | ||
| 501 | effective = rcu_dereference_protected(cgrp->bpf.effective[type], | ||
| 502 | lockdep_is_held(&cgroup_mutex)); | ||
| 503 | |||
| 449 | if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) | 504 | if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) |
| 450 | cnt = bpf_prog_array_length(cgrp->bpf.effective[type]); | 505 | cnt = bpf_prog_array_length(effective); |
| 451 | else | 506 | else |
| 452 | cnt = prog_list_length(progs); | 507 | cnt = prog_list_length(progs); |
| 453 | 508 | ||
| @@ -464,8 +519,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, | |||
| 464 | } | 519 | } |
| 465 | 520 | ||
| 466 | if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { | 521 | if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { |
| 467 | return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type], | 522 | return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); |
| 468 | prog_ids, cnt); | ||
| 469 | } else { | 523 | } else { |
| 470 | struct bpf_prog_list *pl; | 524 | struct bpf_prog_list *pl; |
| 471 | u32 id; | 525 | u32 id; |
| @@ -548,8 +602,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr, | |||
| 548 | * The program type passed in via @type must be suitable for network | 602 | * The program type passed in via @type must be suitable for network |
| 549 | * filtering. No further check is performed to assert that. | 603 | * filtering. No further check is performed to assert that. |
| 550 | * | 604 | * |
| 551 | * This function will return %-EPERM if any if an attached program was found | 605 | * For egress packets, this function can return: |
| 552 | * and if it returned != 1 during execution. In all other cases, 0 is returned. | 606 | * NET_XMIT_SUCCESS (0) - continue with packet output |
| 607 | * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr | ||
| 608 | * NET_XMIT_CN (2) - continue with packet output and notify TCP | ||
| 609 | * to call cwr | ||
| 610 | * -EPERM - drop packet | ||
| 611 | * | ||
| 612 | * For ingress packets, this function will return -EPERM if any | ||
| 613 | * attached program was found and if it returned != 1 during execution. | ||
| 614 | * Otherwise 0 is returned. | ||
| 553 | */ | 615 | */ |
| 554 | int __cgroup_bpf_run_filter_skb(struct sock *sk, | 616 | int __cgroup_bpf_run_filter_skb(struct sock *sk, |
| 555 | struct sk_buff *skb, | 617 | struct sk_buff *skb, |
| @@ -575,12 +637,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, | |||
| 575 | /* compute pointers for the bpf prog */ | 637 | /* compute pointers for the bpf prog */ |
| 576 | bpf_compute_and_save_data_end(skb, &saved_data_end); | 638 | bpf_compute_and_save_data_end(skb, &saved_data_end); |
| 577 | 639 | ||
| 578 | ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, | 640 | if (type == BPF_CGROUP_INET_EGRESS) { |
| 579 | __bpf_prog_run_save_cb); | 641 | ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( |
| 642 | cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); | ||
| 643 | } else { | ||
| 644 | ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, | ||
| 645 | __bpf_prog_run_save_cb); | ||
| 646 | ret = (ret == 1 ? 0 : -EPERM); | ||
| 647 | } | ||
| 580 | bpf_restore_data_end(skb, saved_data_end); | 648 | bpf_restore_data_end(skb, saved_data_end); |
| 581 | __skb_pull(skb, offset); | 649 | __skb_pull(skb, offset); |
| 582 | skb->sk = save_sk; | 650 | skb->sk = save_sk; |
| 583 | return ret == 1 ? 0 : -EPERM; | 651 | |
| 652 | return ret; | ||
| 584 | } | 653 | } |
| 585 | EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); | 654 | EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); |
| 586 | 655 | ||
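The rewritten comment above spells out the return values __cgroup_bpf_run_filter_skb() can now hand back for egress traffic (NET_XMIT_SUCCESS, NET_XMIT_DROP, NET_XMIT_CN or -EPERM). From the program side, a cgroup_skb/egress program still allows a packet by returning 1 and drops it by returning 0; the BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY path added here appears to be what lets programs additionally signal congestion (NET_XMIT_CN). A minimal, hypothetical example, assuming libbpf's bpf_helpers.h and clang -target bpf; it is not part of this patch:

/* Hypothetical cgroup_skb egress program, shown only for illustration. */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup_skb/egress")
int egress_len_limit(struct __sk_buff *skb)
{
	/* 0: drop (the hook surfaces this as -EPERM),
	 * 1: allow (NET_XMIT_SUCCESS for egress).
	 */
	return skb->len > 1400 ? 0 : 1;
}

char _license[] SEC("license") = "GPL";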
| @@ -870,6 +939,190 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, | |||
| 870 | } | 939 | } |
| 871 | EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); | 940 | EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); |
| 872 | 941 | ||
| 942 | #ifdef CONFIG_NET | ||
| 943 | static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, | ||
| 944 | enum bpf_attach_type attach_type) | ||
| 945 | { | ||
| 946 | struct bpf_prog_array *prog_array; | ||
| 947 | bool empty; | ||
| 948 | |||
| 949 | rcu_read_lock(); | ||
| 950 | prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); | ||
| 951 | empty = bpf_prog_array_is_empty(prog_array); | ||
| 952 | rcu_read_unlock(); | ||
| 953 | |||
| 954 | return empty; | ||
| 955 | } | ||
| 956 | |||
| 957 | static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) | ||
| 958 | { | ||
| 959 | if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0) | ||
| 960 | return -EINVAL; | ||
| 961 | |||
| 962 | ctx->optval = kzalloc(max_optlen, GFP_USER); | ||
| 963 | if (!ctx->optval) | ||
| 964 | return -ENOMEM; | ||
| 965 | |||
| 966 | ctx->optval_end = ctx->optval + max_optlen; | ||
| 967 | ctx->optlen = max_optlen; | ||
| 968 | |||
| 969 | return 0; | ||
| 970 | } | ||
| 971 | |||
| 972 | static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) | ||
| 973 | { | ||
| 974 | kfree(ctx->optval); | ||
| 975 | } | ||
| 976 | |||
| 977 | int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, | ||
| 978 | int *optname, char __user *optval, | ||
| 979 | int *optlen, char **kernel_optval) | ||
| 980 | { | ||
| 981 | struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); | ||
| 982 | struct bpf_sockopt_kern ctx = { | ||
| 983 | .sk = sk, | ||
| 984 | .level = *level, | ||
| 985 | .optname = *optname, | ||
| 986 | }; | ||
| 987 | int ret; | ||
| 988 | |||
| 989 | /* Opportunistic check to see whether we have any BPF program | ||
| 990 | * attached to the hook so we don't waste time allocating | ||
| 991 | * memory and locking the socket. | ||
| 992 | */ | ||
| 993 | if (!cgroup_bpf_enabled || | ||
| 994 | __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) | ||
| 995 | return 0; | ||
| 996 | |||
| 997 | ret = sockopt_alloc_buf(&ctx, *optlen); | ||
| 998 | if (ret) | ||
| 999 | return ret; | ||
| 1000 | |||
| 1001 | if (copy_from_user(ctx.optval, optval, *optlen) != 0) { | ||
| 1002 | ret = -EFAULT; | ||
| 1003 | goto out; | ||
| 1004 | } | ||
| 1005 | |||
| 1006 | lock_sock(sk); | ||
| 1007 | ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], | ||
| 1008 | &ctx, BPF_PROG_RUN); | ||
| 1009 | release_sock(sk); | ||
| 1010 | |||
| 1011 | if (!ret) { | ||
| 1012 | ret = -EPERM; | ||
| 1013 | goto out; | ||
| 1014 | } | ||
| 1015 | |||
| 1016 | if (ctx.optlen == -1) { | ||
| 1017 | /* optlen set to -1, bypass kernel */ | ||
| 1018 | ret = 1; | ||
| 1019 | } else if (ctx.optlen > *optlen || ctx.optlen < -1) { | ||
| 1020 | /* optlen is out of bounds */ | ||
| 1021 | ret = -EFAULT; | ||
| 1022 | } else { | ||
| 1023 | /* optlen within bounds, run kernel handler */ | ||
| 1024 | ret = 0; | ||
| 1025 | |||
| 1026 | /* export any potential modifications */ | ||
| 1027 | *level = ctx.level; | ||
| 1028 | *optname = ctx.optname; | ||
| 1029 | *optlen = ctx.optlen; | ||
| 1030 | *kernel_optval = ctx.optval; | ||
| 1031 | } | ||
| 1032 | |||
| 1033 | out: | ||
| 1034 | if (ret) | ||
| 1035 | sockopt_free_buf(&ctx); | ||
| 1036 | return ret; | ||
| 1037 | } | ||
| 1038 | EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); | ||
| 1039 | |||
| 1040 | int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, | ||
| 1041 | int optname, char __user *optval, | ||
| 1042 | int __user *optlen, int max_optlen, | ||
| 1043 | int retval) | ||
| 1044 | { | ||
| 1045 | struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); | ||
| 1046 | struct bpf_sockopt_kern ctx = { | ||
| 1047 | .sk = sk, | ||
| 1048 | .level = level, | ||
| 1049 | .optname = optname, | ||
| 1050 | .retval = retval, | ||
| 1051 | }; | ||
| 1052 | int ret; | ||
| 1053 | |||
| 1054 | /* Opportunistic check to see whether we have any BPF program | ||
| 1055 | * attached to the hook so we don't waste time allocating | ||
| 1056 | * memory and locking the socket. | ||
| 1057 | */ | ||
| 1058 | if (!cgroup_bpf_enabled || | ||
| 1059 | __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) | ||
| 1060 | return retval; | ||
| 1061 | |||
| 1062 | ret = sockopt_alloc_buf(&ctx, max_optlen); | ||
| 1063 | if (ret) | ||
| 1064 | return ret; | ||
| 1065 | |||
| 1066 | if (!retval) { | ||
| 1067 | /* If kernel getsockopt finished successfully, | ||
| 1068 | * copy whatever was returned to the user back | ||
| 1069 | * into our temporary buffer. Set optlen to the | ||
| 1070 | * one that kernel returned as well to let | ||
| 1071 | * BPF programs inspect the value. | ||
| 1072 | */ | ||
| 1073 | |||
| 1074 | if (get_user(ctx.optlen, optlen)) { | ||
| 1075 | ret = -EFAULT; | ||
| 1076 | goto out; | ||
| 1077 | } | ||
| 1078 | |||
| 1079 | if (ctx.optlen > max_optlen) | ||
| 1080 | ctx.optlen = max_optlen; | ||
| 1081 | |||
| 1082 | if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) { | ||
| 1083 | ret = -EFAULT; | ||
| 1084 | goto out; | ||
| 1085 | } | ||
| 1086 | } | ||
| 1087 | |||
| 1088 | lock_sock(sk); | ||
| 1089 | ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], | ||
| 1090 | &ctx, BPF_PROG_RUN); | ||
| 1091 | release_sock(sk); | ||
| 1092 | |||
| 1093 | if (!ret) { | ||
| 1094 | ret = -EPERM; | ||
| 1095 | goto out; | ||
| 1096 | } | ||
| 1097 | |||
| 1098 | if (ctx.optlen > max_optlen) { | ||
| 1099 | ret = -EFAULT; | ||
| 1100 | goto out; | ||
| 1101 | } | ||
| 1102 | |||
| 1103 | /* BPF programs only allowed to set retval to 0, not some | ||
| 1104 | * arbitrary value. | ||
| 1105 | */ | ||
| 1106 | if (ctx.retval != 0 && ctx.retval != retval) { | ||
| 1107 | ret = -EFAULT; | ||
| 1108 | goto out; | ||
| 1109 | } | ||
| 1110 | |||
| 1111 | if (copy_to_user(optval, ctx.optval, ctx.optlen) || | ||
| 1112 | put_user(ctx.optlen, optlen)) { | ||
| 1113 | ret = -EFAULT; | ||
| 1114 | goto out; | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | ret = ctx.retval; | ||
| 1118 | |||
| 1119 | out: | ||
| 1120 | sockopt_free_buf(&ctx); | ||
| 1121 | return ret; | ||
| 1122 | } | ||
| 1123 | EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); | ||
| 1124 | #endif | ||
| 1125 | |||
| 873 | static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, | 1126 | static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, |
| 874 | size_t *lenp) | 1127 | size_t *lenp) |
| 875 | { | 1128 | { |
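The two hooks above, __cgroup_bpf_run_filter_setsockopt() and __cgroup_bpf_run_filter_getsockopt(), let BPF_PROG_TYPE_CGROUP_SOCKOPT programs inspect or rewrite socket options around the kernel handler: returning 0 rejects the call with -EPERM, returning 1 lets it proceed, a setsockopt program can set optlen to -1 to bypass the kernel handler entirely, and a getsockopt program may only leave retval alone or clear it to 0. A minimal, hypothetical setsockopt program against the struct bpf_sockopt context added by this series (again assuming libbpf's bpf_helpers.h; not part of this patch):

/* Hypothetical cgroup setsockopt program, shown only for illustration. */
#include <netinet/in.h>		/* SOL_IP, IP_TOS */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/setsockopt")
int deny_ip_tos(struct bpf_sockopt *ctx)
{
	if (ctx->level == SOL_IP && ctx->optname == IP_TOS)
		return 0;	/* reject: setsockopt() fails with -EPERM */

	/* setting ctx->optlen = -1 here would skip the kernel handler instead */
	return 1;		/* let the kernel handler run */
}

char _license[] SEC("license") = "GPL";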
| @@ -1130,3 +1383,155 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = { | |||
| 1130 | 1383 | ||
| 1131 | const struct bpf_prog_ops cg_sysctl_prog_ops = { | 1384 | const struct bpf_prog_ops cg_sysctl_prog_ops = { |
| 1132 | }; | 1385 | }; |
| 1386 | |||
| 1387 | static const struct bpf_func_proto * | ||
| 1388 | cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) | ||
| 1389 | { | ||
| 1390 | switch (func_id) { | ||
| 1391 | #ifdef CONFIG_NET | ||
| 1392 | case BPF_FUNC_sk_storage_get: | ||
| 1393 | return &bpf_sk_storage_get_proto; | ||
| 1394 | case BPF_FUNC_sk_storage_delete: | ||
| 1395 | return &bpf_sk_storage_delete_proto; | ||
| 1396 | #endif | ||
| 1397 | #ifdef CONFIG_INET | ||
| 1398 | case BPF_FUNC_tcp_sock: | ||
| 1399 | return &bpf_tcp_sock_proto; | ||
| 1400 | #endif | ||
| 1401 | default: | ||
| 1402 | return cgroup_base_func_proto(func_id, prog); | ||
| 1403 | } | ||
| 1404 | } | ||
| 1405 | |||
| 1406 | static bool cg_sockopt_is_valid_access(int off, int size, | ||
| 1407 | enum bpf_access_type type, | ||
| 1408 | const struct bpf_prog *prog, | ||
| 1409 | struct bpf_insn_access_aux *info) | ||
| 1410 | { | ||
| 1411 | const int size_default = sizeof(__u32); | ||
| 1412 | |||
| 1413 | if (off < 0 || off >= sizeof(struct bpf_sockopt)) | ||
| 1414 | return false; | ||
| 1415 | |||
| 1416 | if (off % size != 0) | ||
| 1417 | return false; | ||
| 1418 | |||
| 1419 | if (type == BPF_WRITE) { | ||
| 1420 | switch (off) { | ||
| 1421 | case offsetof(struct bpf_sockopt, retval): | ||
| 1422 | if (size != size_default) | ||
| 1423 | return false; | ||
| 1424 | return prog->expected_attach_type == | ||
| 1425 | BPF_CGROUP_GETSOCKOPT; | ||
| 1426 | case offsetof(struct bpf_sockopt, optname): | ||
| 1427 | /* fallthrough */ | ||
| 1428 | case offsetof(struct bpf_sockopt, level): | ||
| 1429 | if (size != size_default) | ||
| 1430 | return false; | ||
| 1431 | return prog->expected_attach_type == | ||
| 1432 | BPF_CGROUP_SETSOCKOPT; | ||
| 1433 | case offsetof(struct bpf_sockopt, optlen): | ||
| 1434 | return size == size_default; | ||
| 1435 | default: | ||
| 1436 | return false; | ||
| 1437 | } | ||
| 1438 | } | ||
| 1439 | |||
| 1440 | switch (off) { | ||
| 1441 | case offsetof(struct bpf_sockopt, sk): | ||
| 1442 | if (size != sizeof(__u64)) | ||
| 1443 | return false; | ||
| 1444 | info->reg_type = PTR_TO_SOCKET; | ||
| 1445 | break; | ||
| 1446 | case offsetof(struct bpf_sockopt, optval): | ||
| 1447 | if (size != sizeof(__u64)) | ||
| 1448 | return false; | ||
| 1449 | info->reg_type = PTR_TO_PACKET; | ||
| 1450 | break; | ||
| 1451 | case offsetof(struct bpf_sockopt, optval_end): | ||
| 1452 | if (size != sizeof(__u64)) | ||
| 1453 | return false; | ||
| 1454 | info->reg_type = PTR_TO_PACKET_END; | ||
| 1455 | break; | ||
| 1456 | case offsetof(struct bpf_sockopt, retval): | ||
| 1457 | if (size != size_default) | ||
| 1458 | return false; | ||
| 1459 | return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; | ||
| 1460 | default: | ||
| 1461 | if (size != size_default) | ||
| 1462 | return false; | ||
| 1463 | break; | ||
| 1464 | } | ||
| 1465 | return true; | ||
| 1466 | } | ||
| 1467 | |||
| 1468 | #define CG_SOCKOPT_ACCESS_FIELD(T, F) \ | ||
| 1469 | T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ | ||
| 1470 | si->dst_reg, si->src_reg, \ | ||
| 1471 | offsetof(struct bpf_sockopt_kern, F)) | ||
| 1472 | |||
| 1473 | static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, | ||
| 1474 | const struct bpf_insn *si, | ||
| 1475 | struct bpf_insn *insn_buf, | ||
| 1476 | struct bpf_prog *prog, | ||
| 1477 | u32 *target_size) | ||
| 1478 | { | ||
| 1479 | struct bpf_insn *insn = insn_buf; | ||
| 1480 | |||
| 1481 | switch (si->off) { | ||
| 1482 | case offsetof(struct bpf_sockopt, sk): | ||
| 1483 | *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); | ||
| 1484 | break; | ||
| 1485 | case offsetof(struct bpf_sockopt, level): | ||
| 1486 | if (type == BPF_WRITE) | ||
| 1487 | *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); | ||
| 1488 | else | ||
| 1489 | *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); | ||
| 1490 | break; | ||
| 1491 | case offsetof(struct bpf_sockopt, optname): | ||
| 1492 | if (type == BPF_WRITE) | ||
| 1493 | *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); | ||
| 1494 | else | ||
| 1495 | *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); | ||
| 1496 | break; | ||
| 1497 | case offsetof(struct bpf_sockopt, optlen): | ||
| 1498 | if (type == BPF_WRITE) | ||
| 1499 | *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); | ||
| 1500 | else | ||
| 1501 | *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); | ||
| 1502 | break; | ||
| 1503 | case offsetof(struct bpf_sockopt, retval): | ||
| 1504 | if (type == BPF_WRITE) | ||
| 1505 | *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); | ||
| 1506 | else | ||
| 1507 | *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); | ||
| 1508 | break; | ||
| 1509 | case offsetof(struct bpf_sockopt, optval): | ||
| 1510 | *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); | ||
| 1511 | break; | ||
| 1512 | case offsetof(struct bpf_sockopt, optval_end): | ||
| 1513 | *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); | ||
| 1514 | break; | ||
| 1515 | } | ||
| 1516 | |||
| 1517 | return insn - insn_buf; | ||
| 1518 | } | ||
| 1519 | |||
| 1520 | static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, | ||
| 1521 | bool direct_write, | ||
| 1522 | const struct bpf_prog *prog) | ||
| 1523 | { | ||
| 1524 | /* Nothing to do for the sockopt argument. The data is kzalloc'ed. | ||
| 1525 | */ | ||
| 1526 | return 0; | ||
| 1527 | } | ||
| 1528 | |||
| 1529 | const struct bpf_verifier_ops cg_sockopt_verifier_ops = { | ||
| 1530 | .get_func_proto = cg_sockopt_func_proto, | ||
| 1531 | .is_valid_access = cg_sockopt_is_valid_access, | ||
| 1532 | .convert_ctx_access = cg_sockopt_convert_ctx_access, | ||
| 1533 | .gen_prologue = cg_sockopt_get_prologue, | ||
| 1534 | }; | ||
| 1535 | |||
| 1536 | const struct bpf_prog_ops cg_sockopt_prog_ops = { | ||
| 1537 | }; | ||
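The access rules encoded in cg_sockopt_is_valid_access() map directly onto what a program may touch: level and optname are writable only at BPF_CGROUP_SETSOCKOPT, retval only at BPF_CGROUP_GETSOCKOPT, and optval/optval_end behave like packet pointers that must be range-checked. A hedged setsockopt-side sketch (constants restated locally to stay self-contained; section and function names illustrative):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define SOL_IP	0	/* matches the uapi socket/in definitions */
#define IP_TOS	1

SEC("cgroup/setsockopt")
int clamp_ip_tos(struct bpf_sockopt *ctx)
{
	__u8 *optval = ctx->optval;
	__u8 *optval_end = ctx->optval_end;

	if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
		return 1;			/* not ours, let it through */

	if (optval + 1 > optval_end)
		return 1;

	*optval = 0;		/* rewrite the value the kernel's setsockopt will see */
	return 1;
}

char _license[] SEC("license") = "GPL";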
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 242a643af82f..16079550db6d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * Linux Socket Filter - Kernel level socket filtering | 3 | * Linux Socket Filter - Kernel level socket filtering |
| 3 | * | 4 | * |
| @@ -12,11 +13,6 @@ | |||
| 12 | * Alexei Starovoitov <ast@plumgrid.com> | 13 | * Alexei Starovoitov <ast@plumgrid.com> |
| 13 | * Daniel Borkmann <dborkman@redhat.com> | 14 | * Daniel Borkmann <dborkman@redhat.com> |
| 14 | * | 15 | * |
| 15 | * This program is free software; you can redistribute it and/or | ||
| 16 | * modify it under the terms of the GNU General Public License | ||
| 17 | * as published by the Free Software Foundation; either version | ||
| 18 | * 2 of the License, or (at your option) any later version. | ||
| 19 | * | ||
| 20 | * Andi Kleen - Fix a few bad bugs and races. | 16 | * Andi Kleen - Fix a few bad bugs and races. |
| 21 | * Kris Katterjohn - Added many additional checks in bpf_check_classic() | 17 | * Kris Katterjohn - Added many additional checks in bpf_check_classic() |
| 22 | */ | 18 | */ |
| @@ -1368,10 +1364,10 @@ select_insn: | |||
| 1368 | insn++; | 1364 | insn++; |
| 1369 | CONT; | 1365 | CONT; |
| 1370 | ALU_ARSH_X: | 1366 | ALU_ARSH_X: |
| 1371 | DST = (u64) (u32) ((*(s32 *) &DST) >> SRC); | 1367 | DST = (u64) (u32) (((s32) DST) >> SRC); |
| 1372 | CONT; | 1368 | CONT; |
| 1373 | ALU_ARSH_K: | 1369 | ALU_ARSH_K: |
| 1374 | DST = (u64) (u32) ((*(s32 *) &DST) >> IMM); | 1370 | DST = (u64) (u32) (((s32) DST) >> IMM); |
| 1375 | CONT; | 1371 | CONT; |
| 1376 | ALU64_ARSH_X: | 1372 | ALU64_ARSH_X: |
| 1377 | (*(s64 *) &DST) >>= SRC; | 1373 | (*(s64 *) &DST) >>= SRC; |
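The ALU_ARSH change swaps a type-punning load for a plain truncating cast: *(s32 *)&DST reads whichever half of the 64-bit register sits first in memory (the upper half on a big-endian host), while (s32)DST always takes the low 32 bits. A userspace sketch of the intended 32-bit arithmetic-shift semantics, assuming shift < 32:

#include <stdint.h>
#include <stdio.h>

/* Reference semantics for 32-bit BPF_ARSH: shift the low 32 bits of dst as
 * a signed value, then store the result zero-extended to 64 bits. */
static uint64_t alu32_arsh(uint64_t dst, uint32_t shift)
{
	return (uint64_t)(uint32_t)(((int32_t)dst) >> shift);
}

int main(void)
{
	/* low word is INT32_MIN; the sign is replicated within 32 bits only */
	printf("%#llx\n",
	       (unsigned long long)alu32_arsh(0xffffffff80000000ull, 4));
	/* prints 0xf8000000 */
	return 0;
}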
| @@ -1795,38 +1791,42 @@ struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) | |||
| 1795 | return &empty_prog_array.hdr; | 1791 | return &empty_prog_array.hdr; |
| 1796 | } | 1792 | } |
| 1797 | 1793 | ||
| 1798 | void bpf_prog_array_free(struct bpf_prog_array __rcu *progs) | 1794 | void bpf_prog_array_free(struct bpf_prog_array *progs) |
| 1799 | { | 1795 | { |
| 1800 | if (!progs || | 1796 | if (!progs || progs == &empty_prog_array.hdr) |
| 1801 | progs == (struct bpf_prog_array __rcu *)&empty_prog_array.hdr) | ||
| 1802 | return; | 1797 | return; |
| 1803 | kfree_rcu(progs, rcu); | 1798 | kfree_rcu(progs, rcu); |
| 1804 | } | 1799 | } |
| 1805 | 1800 | ||
| 1806 | int bpf_prog_array_length(struct bpf_prog_array __rcu *array) | 1801 | int bpf_prog_array_length(struct bpf_prog_array *array) |
| 1807 | { | 1802 | { |
| 1808 | struct bpf_prog_array_item *item; | 1803 | struct bpf_prog_array_item *item; |
| 1809 | u32 cnt = 0; | 1804 | u32 cnt = 0; |
| 1810 | 1805 | ||
| 1811 | rcu_read_lock(); | 1806 | for (item = array->items; item->prog; item++) |
| 1812 | item = rcu_dereference(array)->items; | ||
| 1813 | for (; item->prog; item++) | ||
| 1814 | if (item->prog != &dummy_bpf_prog.prog) | 1807 | if (item->prog != &dummy_bpf_prog.prog) |
| 1815 | cnt++; | 1808 | cnt++; |
| 1816 | rcu_read_unlock(); | ||
| 1817 | return cnt; | 1809 | return cnt; |
| 1818 | } | 1810 | } |
| 1819 | 1811 | ||
| 1812 | bool bpf_prog_array_is_empty(struct bpf_prog_array *array) | ||
| 1813 | { | ||
| 1814 | struct bpf_prog_array_item *item; | ||
| 1815 | |||
| 1816 | for (item = array->items; item->prog; item++) | ||
| 1817 | if (item->prog != &dummy_bpf_prog.prog) | ||
| 1818 | return false; | ||
| 1819 | return true; | ||
| 1820 | } | ||
| 1820 | 1821 | ||
| 1821 | static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, | 1822 | static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, |
| 1822 | u32 *prog_ids, | 1823 | u32 *prog_ids, |
| 1823 | u32 request_cnt) | 1824 | u32 request_cnt) |
| 1824 | { | 1825 | { |
| 1825 | struct bpf_prog_array_item *item; | 1826 | struct bpf_prog_array_item *item; |
| 1826 | int i = 0; | 1827 | int i = 0; |
| 1827 | 1828 | ||
| 1828 | item = rcu_dereference_check(array, 1)->items; | 1829 | for (item = array->items; item->prog; item++) { |
| 1829 | for (; item->prog; item++) { | ||
| 1830 | if (item->prog == &dummy_bpf_prog.prog) | 1830 | if (item->prog == &dummy_bpf_prog.prog) |
| 1831 | continue; | 1831 | continue; |
| 1832 | prog_ids[i] = item->prog->aux->id; | 1832 | prog_ids[i] = item->prog->aux->id; |
| @@ -1839,7 +1839,7 @@ static bool bpf_prog_array_copy_core(struct bpf_prog_array __rcu *array, | |||
| 1839 | return !!(item->prog); | 1839 | return !!(item->prog); |
| 1840 | } | 1840 | } |
| 1841 | 1841 | ||
| 1842 | int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, | 1842 | int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, |
| 1843 | __u32 __user *prog_ids, u32 cnt) | 1843 | __u32 __user *prog_ids, u32 cnt) |
| 1844 | { | 1844 | { |
| 1845 | unsigned long err = 0; | 1845 | unsigned long err = 0; |
| @@ -1850,18 +1850,12 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, | |||
| 1850 | * cnt = bpf_prog_array_length(); | 1850 | * cnt = bpf_prog_array_length(); |
| 1851 | * if (cnt > 0) | 1851 | * if (cnt > 0) |
| 1852 | * bpf_prog_array_copy_to_user(..., cnt); | 1852 | * bpf_prog_array_copy_to_user(..., cnt); |
| 1853 | * so below kcalloc doesn't need extra cnt > 0 check, but | 1853 | * so below kcalloc doesn't need extra cnt > 0 check. |
| 1854 | * bpf_prog_array_length() releases rcu lock and | ||
| 1855 | * prog array could have been swapped with empty or larger array, | ||
| 1856 | * so always copy 'cnt' prog_ids to the user. | ||
| 1857 | * In a rare race the user will see zero prog_ids | ||
| 1858 | */ | 1854 | */ |
| 1859 | ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); | 1855 | ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); |
| 1860 | if (!ids) | 1856 | if (!ids) |
| 1861 | return -ENOMEM; | 1857 | return -ENOMEM; |
| 1862 | rcu_read_lock(); | ||
| 1863 | nospc = bpf_prog_array_copy_core(array, ids, cnt); | 1858 | nospc = bpf_prog_array_copy_core(array, ids, cnt); |
| 1864 | rcu_read_unlock(); | ||
| 1865 | err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); | 1859 | err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); |
| 1866 | kfree(ids); | 1860 | kfree(ids); |
| 1867 | if (err) | 1861 | if (err) |
| @@ -1871,19 +1865,19 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *array, | |||
| 1871 | return 0; | 1865 | return 0; |
| 1872 | } | 1866 | } |
| 1873 | 1867 | ||
| 1874 | void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *array, | 1868 | void bpf_prog_array_delete_safe(struct bpf_prog_array *array, |
| 1875 | struct bpf_prog *old_prog) | 1869 | struct bpf_prog *old_prog) |
| 1876 | { | 1870 | { |
| 1877 | struct bpf_prog_array_item *item = array->items; | 1871 | struct bpf_prog_array_item *item; |
| 1878 | 1872 | ||
| 1879 | for (; item->prog; item++) | 1873 | for (item = array->items; item->prog; item++) |
| 1880 | if (item->prog == old_prog) { | 1874 | if (item->prog == old_prog) { |
| 1881 | WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); | 1875 | WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); |
| 1882 | break; | 1876 | break; |
| 1883 | } | 1877 | } |
| 1884 | } | 1878 | } |
| 1885 | 1879 | ||
| 1886 | int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, | 1880 | int bpf_prog_array_copy(struct bpf_prog_array *old_array, |
| 1887 | struct bpf_prog *exclude_prog, | 1881 | struct bpf_prog *exclude_prog, |
| 1888 | struct bpf_prog *include_prog, | 1882 | struct bpf_prog *include_prog, |
| 1889 | struct bpf_prog_array **new_array) | 1883 | struct bpf_prog_array **new_array) |
| @@ -1947,7 +1941,7 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, | |||
| 1947 | return 0; | 1941 | return 0; |
| 1948 | } | 1942 | } |
| 1949 | 1943 | ||
| 1950 | int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, | 1944 | int bpf_prog_array_copy_info(struct bpf_prog_array *array, |
| 1951 | u32 *prog_ids, u32 request_cnt, | 1945 | u32 *prog_ids, u32 request_cnt, |
| 1952 | u32 *prog_cnt) | 1946 | u32 *prog_cnt) |
| 1953 | { | 1947 | { |
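Dropping the __rcu annotation from these helpers moves RCU handling to the callers: they resolve the protected pointer themselves, under rcu_read_lock() or the update-side lock, and pass a plain bpf_prog_array in. A hedged, non-compilable fragment of the implied calling convention, with names borrowed from the cgroup-bpf code:

struct bpf_prog_array *effective;
u32 cnt;

/* Update-side caller: dereference under the lock that serializes updates,
 * then hand the plain pointer to the helpers above. */
effective = rcu_dereference_protected(cgrp->bpf.effective[type],
				      lockdep_is_held(&cgroup_mutex));
cnt = bpf_prog_array_length(effective);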
| @@ -2090,6 +2084,15 @@ bool __weak bpf_helper_changes_pkt_data(void *func) | |||
| 2090 | return false; | 2084 | return false; |
| 2091 | } | 2085 | } |
| 2092 | 2086 | ||
| 2087 | /* Return TRUE if the JIT backend wants verifier to enable sub-register usage | ||
| 2088 | * analysis code and wants explicit zero extension inserted by verifier. | ||
| 2089 | * Otherwise, return FALSE. | ||
| 2090 | */ | ||
| 2091 | bool __weak bpf_jit_needs_zext(void) | ||
| 2092 | { | ||
| 2093 | return false; | ||
| 2094 | } | ||
| 2095 | |||
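bpf_jit_needs_zext() is only a weak default; an architecture JIT that works on 32-bit sub-registers natively overrides it so the verifier inserts explicit zero-extensions for it. The override is a one-liner in the arch's bpf_jit_comp.c (sketch, arch unspecified):

#include <linux/filter.h>

/* Opt in to verifier-inserted zero extension of 32-bit sub-register defs. */
bool bpf_jit_needs_zext(void)
{
	return true;
}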
| 2093 | /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call | 2096 | /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call |
| 2094 | * skb_copy_bits(), so provide a weak definition of it for NET-less config. | 2097 | * skb_copy_bits(), so provide a weak definition of it for NET-less config. |
| 2095 | */ | 2098 | */ |
| @@ -2101,10 +2104,10 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to, | |||
| 2101 | 2104 | ||
| 2102 | DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); | 2105 | DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); |
| 2103 | EXPORT_SYMBOL(bpf_stats_enabled_key); | 2106 | EXPORT_SYMBOL(bpf_stats_enabled_key); |
| 2104 | int sysctl_bpf_stats_enabled __read_mostly; | ||
| 2105 | 2107 | ||
| 2106 | /* All definitions of tracepoints related to BPF. */ | 2108 | /* All definitions of tracepoints related to BPF. */ |
| 2107 | #define CREATE_TRACE_POINTS | 2109 | #define CREATE_TRACE_POINTS |
| 2108 | #include <linux/bpf_trace.h> | 2110 | #include <linux/bpf_trace.h> |
| 2109 | 2111 | ||
| 2110 | EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); | 2112 | EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); |
| 2113 | EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); | ||
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index cf727d77c6c6..ef49e17ae47c 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* bpf/cpumap.c | 2 | /* bpf/cpumap.c |
| 2 | * | 3 | * |
| 3 | * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. | 4 | * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. |
| 4 | * Released under terms in GPL version 2. See COPYING. | ||
| 5 | */ | 5 | */ |
| 6 | 6 | ||
| 7 | /* The 'cpumap' is primarily used as a backend map for XDP BPF helper | 7 | /* The 'cpumap' is primarily used as a backend map for XDP BPF helper |
| @@ -32,14 +32,19 @@ | |||
| 32 | 32 | ||
| 33 | /* General idea: XDP packets getting XDP redirected to another CPU, | 33 | /* General idea: XDP packets getting XDP redirected to another CPU, |
| 34 | * will at most be stored/queued for one driver ->poll() call. It is | 34 | * will at most be stored/queued for one driver ->poll() call. It is |
| 35 | * guaranteed that setting flush bit and flush operation happen on | 35 | * guaranteed that queueing the frame and the flush operation happen on |
| 36 | * same CPU. Thus, cpu_map_flush operation can deduce via this_cpu_ptr() | 36 | * same CPU. Thus, cpu_map_flush operation can deduce via this_cpu_ptr() |
| 37 | * which queue in bpf_cpu_map_entry contains packets. | 37 | * which queue in bpf_cpu_map_entry contains packets. |
| 38 | */ | 38 | */ |
| 39 | 39 | ||
| 40 | #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ | 40 | #define CPU_MAP_BULK_SIZE 8 /* 8 == one cacheline on 64-bit archs */ |
| 41 | struct bpf_cpu_map_entry; | ||
| 42 | struct bpf_cpu_map; | ||
| 43 | |||
| 41 | struct xdp_bulk_queue { | 44 | struct xdp_bulk_queue { |
| 42 | void *q[CPU_MAP_BULK_SIZE]; | 45 | void *q[CPU_MAP_BULK_SIZE]; |
| 46 | struct list_head flush_node; | ||
| 47 | struct bpf_cpu_map_entry *obj; | ||
| 43 | unsigned int count; | 48 | unsigned int count; |
| 44 | }; | 49 | }; |
| 45 | 50 | ||
| @@ -52,6 +57,8 @@ struct bpf_cpu_map_entry { | |||
| 52 | /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ | 57 | /* XDP can run multiple RX-ring queues, need __percpu enqueue store */ |
| 53 | struct xdp_bulk_queue __percpu *bulkq; | 58 | struct xdp_bulk_queue __percpu *bulkq; |
| 54 | 59 | ||
| 60 | struct bpf_cpu_map *cmap; | ||
| 61 | |||
| 55 | /* Queue with potential multi-producers, and single-consumer kthread */ | 62 | /* Queue with potential multi-producers, and single-consumer kthread */ |
| 56 | struct ptr_ring *queue; | 63 | struct ptr_ring *queue; |
| 57 | struct task_struct *kthread; | 64 | struct task_struct *kthread; |
| @@ -65,23 +72,17 @@ struct bpf_cpu_map { | |||
| 65 | struct bpf_map map; | 72 | struct bpf_map map; |
| 66 | /* Below members specific for map type */ | 73 | /* Below members specific for map type */ |
| 67 | struct bpf_cpu_map_entry **cpu_map; | 74 | struct bpf_cpu_map_entry **cpu_map; |
| 68 | unsigned long __percpu *flush_needed; | 75 | struct list_head __percpu *flush_list; |
| 69 | }; | 76 | }; |
| 70 | 77 | ||
| 71 | static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, | 78 | static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx); |
| 72 | struct xdp_bulk_queue *bq, bool in_napi_ctx); | ||
| 73 | |||
| 74 | static u64 cpu_map_bitmap_size(const union bpf_attr *attr) | ||
| 75 | { | ||
| 76 | return BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long); | ||
| 77 | } | ||
| 78 | 79 | ||
| 79 | static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) | 80 | static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) |
| 80 | { | 81 | { |
| 81 | struct bpf_cpu_map *cmap; | 82 | struct bpf_cpu_map *cmap; |
| 82 | int err = -ENOMEM; | 83 | int err = -ENOMEM; |
| 84 | int ret, cpu; | ||
| 83 | u64 cost; | 85 | u64 cost; |
| 84 | int ret; | ||
| 85 | 86 | ||
| 86 | if (!capable(CAP_SYS_ADMIN)) | 87 | if (!capable(CAP_SYS_ADMIN)) |
| 87 | return ERR_PTR(-EPERM); | 88 | return ERR_PTR(-EPERM); |
| @@ -105,23 +106,21 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) | |||
| 105 | 106 | ||
| 106 | /* make sure page count doesn't overflow */ | 107 | /* make sure page count doesn't overflow */ |
| 107 | cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); | 108 | cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); |
| 108 | cost += cpu_map_bitmap_size(attr) * num_possible_cpus(); | 109 | cost += sizeof(struct list_head) * num_possible_cpus(); |
| 109 | if (cost >= U32_MAX - PAGE_SIZE) | ||
| 110 | goto free_cmap; | ||
| 111 | cmap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | ||
| 112 | 110 | ||
| 113 | /* Notice returns -EPERM if map size is larger than memlock limit */ | 111 | /* Notice returns -EPERM if map size is larger than memlock limit */ |
| 114 | ret = bpf_map_precharge_memlock(cmap->map.pages); | 112 | ret = bpf_map_charge_init(&cmap->map.memory, cost); |
| 115 | if (ret) { | 113 | if (ret) { |
| 116 | err = ret; | 114 | err = ret; |
| 117 | goto free_cmap; | 115 | goto free_cmap; |
| 118 | } | 116 | } |
| 119 | 117 | ||
| 120 | /* A per cpu bitfield with a bit per possible CPU in map */ | 118 | cmap->flush_list = alloc_percpu(struct list_head); |
| 121 | cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr), | 119 | if (!cmap->flush_list) |
| 122 | __alignof__(unsigned long)); | 120 | goto free_charge; |
| 123 | if (!cmap->flush_needed) | 121 | |
| 124 | goto free_cmap; | 122 | for_each_possible_cpu(cpu) |
| 123 | INIT_LIST_HEAD(per_cpu_ptr(cmap->flush_list, cpu)); | ||
| 125 | 124 | ||
| 126 | /* Alloc array for possible remote "destination" CPUs */ | 125 | /* Alloc array for possible remote "destination" CPUs */ |
| 127 | cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * | 126 | cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * |
| @@ -132,7 +131,9 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) | |||
| 132 | 131 | ||
| 133 | return &cmap->map; | 132 | return &cmap->map; |
| 134 | free_percpu: | 133 | free_percpu: |
| 135 | free_percpu(cmap->flush_needed); | 134 | free_percpu(cmap->flush_list); |
| 135 | free_charge: | ||
| 136 | bpf_map_charge_finish(&cmap->map.memory); | ||
| 136 | free_cmap: | 137 | free_cmap: |
| 137 | kfree(cmap); | 138 | kfree(cmap); |
| 138 | return ERR_PTR(err); | 139 | return ERR_PTR(err); |
| @@ -209,6 +210,9 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, | |||
| 209 | * - RX ring dev queue index (skb_record_rx_queue) | 210 | * - RX ring dev queue index (skb_record_rx_queue) |
| 210 | */ | 211 | */ |
| 211 | 212 | ||
| 213 | /* Until page_pool get SKB return path, release DMA here */ | ||
| 214 | xdp_release_frame(xdpf); | ||
| 215 | |||
| 212 | /* Allow SKB to reuse area used by xdp_frame */ | 216 | /* Allow SKB to reuse area used by xdp_frame */ |
| 213 | xdp_scrub_frame(xdpf); | 217 | xdp_scrub_frame(xdpf); |
| 214 | 218 | ||
| @@ -332,7 +336,8 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, | |||
| 332 | { | 336 | { |
| 333 | gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; | 337 | gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; |
| 334 | struct bpf_cpu_map_entry *rcpu; | 338 | struct bpf_cpu_map_entry *rcpu; |
| 335 | int numa, err; | 339 | struct xdp_bulk_queue *bq; |
| 340 | int numa, err, i; | ||
| 336 | 341 | ||
| 337 | /* Have map->numa_node, but choose node of redirect target CPU */ | 342 | /* Have map->numa_node, but choose node of redirect target CPU */ |
| 338 | numa = cpu_to_node(cpu); | 343 | numa = cpu_to_node(cpu); |
| @@ -347,6 +352,11 @@ static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, | |||
| 347 | if (!rcpu->bulkq) | 352 | if (!rcpu->bulkq) |
| 348 | goto free_rcu; | 353 | goto free_rcu; |
| 349 | 354 | ||
| 355 | for_each_possible_cpu(i) { | ||
| 356 | bq = per_cpu_ptr(rcpu->bulkq, i); | ||
| 357 | bq->obj = rcpu; | ||
| 358 | } | ||
| 359 | |||
| 350 | /* Alloc queue */ | 360 | /* Alloc queue */ |
| 351 | rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); | 361 | rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); |
| 352 | if (!rcpu->queue) | 362 | if (!rcpu->queue) |
| @@ -403,7 +413,7 @@ static void __cpu_map_entry_free(struct rcu_head *rcu) | |||
| 403 | struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); | 413 | struct xdp_bulk_queue *bq = per_cpu_ptr(rcpu->bulkq, cpu); |
| 404 | 414 | ||
| 405 | /* No concurrent bq_enqueue can run at this point */ | 415 | /* No concurrent bq_enqueue can run at this point */ |
| 406 | bq_flush_to_queue(rcpu, bq, false); | 416 | bq_flush_to_queue(bq, false); |
| 407 | } | 417 | } |
| 408 | free_percpu(rcpu->bulkq); | 418 | free_percpu(rcpu->bulkq); |
| 409 | /* Cannot kthread_stop() here, last put free rcpu resources */ | 419 | /* Cannot kthread_stop() here, last put free rcpu resources */ |
| @@ -486,6 +496,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, | |||
| 486 | rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); | 496 | rcpu = __cpu_map_entry_alloc(qsize, key_cpu, map->id); |
| 487 | if (!rcpu) | 497 | if (!rcpu) |
| 488 | return -ENOMEM; | 498 | return -ENOMEM; |
| 499 | rcpu->cmap = cmap; | ||
| 489 | } | 500 | } |
| 490 | rcu_read_lock(); | 501 | rcu_read_lock(); |
| 491 | __cpu_map_entry_replace(cmap, key_cpu, rcpu); | 502 | __cpu_map_entry_replace(cmap, key_cpu, rcpu); |
| @@ -512,14 +523,14 @@ static void cpu_map_free(struct bpf_map *map) | |||
| 512 | synchronize_rcu(); | 523 | synchronize_rcu(); |
| 513 | 524 | ||
| 514 | /* To ensure all pending flush operations have completed wait for flush | 525 | /* To ensure all pending flush operations have completed wait for flush |
| 515 | * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. | 526 | * list to be empty on _all_ cpus. Because the above synchronize_rcu() |
| 516 | * Because the above synchronize_rcu() ensures the map is disconnected | 527 | * ensures the map is disconnected from the program we can assume no new |
| 517 | * from the program we can assume no new bits will be set. | 528 | * items will be added to the list. |
| 518 | */ | 529 | */ |
| 519 | for_each_online_cpu(cpu) { | 530 | for_each_online_cpu(cpu) { |
| 520 | unsigned long *bitmap = per_cpu_ptr(cmap->flush_needed, cpu); | 531 | struct list_head *flush_list = per_cpu_ptr(cmap->flush_list, cpu); |
| 521 | 532 | ||
| 522 | while (!bitmap_empty(bitmap, cmap->map.max_entries)) | 533 | while (!list_empty(flush_list)) |
| 523 | cond_resched(); | 534 | cond_resched(); |
| 524 | } | 535 | } |
| 525 | 536 | ||
| @@ -536,7 +547,7 @@ static void cpu_map_free(struct bpf_map *map) | |||
| 536 | /* bq flush and cleanup happen after an RCU grace period */ | 547 | /* bq flush and cleanup happen after an RCU grace period */ |
| 537 | __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ | 548 | __cpu_map_entry_replace(cmap, i, NULL); /* call_rcu */ |
| 538 | } | 549 | } |
| 539 | free_percpu(cmap->flush_needed); | 550 | free_percpu(cmap->flush_list); |
| 540 | bpf_map_area_free(cmap->cpu_map); | 551 | bpf_map_area_free(cmap->cpu_map); |
| 541 | kfree(cmap); | 552 | kfree(cmap); |
| 542 | } | 553 | } |
| @@ -588,9 +599,9 @@ const struct bpf_map_ops cpu_map_ops = { | |||
| 588 | .map_check_btf = map_check_no_btf, | 599 | .map_check_btf = map_check_no_btf, |
| 589 | }; | 600 | }; |
| 590 | 601 | ||
| 591 | static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, | 602 | static int bq_flush_to_queue(struct xdp_bulk_queue *bq, bool in_napi_ctx) |
| 592 | struct xdp_bulk_queue *bq, bool in_napi_ctx) | ||
| 593 | { | 603 | { |
| 604 | struct bpf_cpu_map_entry *rcpu = bq->obj; | ||
| 594 | unsigned int processed = 0, drops = 0; | 605 | unsigned int processed = 0, drops = 0; |
| 595 | const int to_cpu = rcpu->cpu; | 606 | const int to_cpu = rcpu->cpu; |
| 596 | struct ptr_ring *q; | 607 | struct ptr_ring *q; |
| @@ -619,6 +630,8 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, | |||
| 619 | bq->count = 0; | 630 | bq->count = 0; |
| 620 | spin_unlock(&q->producer_lock); | 631 | spin_unlock(&q->producer_lock); |
| 621 | 632 | ||
| 633 | __list_del_clearprev(&bq->flush_node); | ||
| 634 | |||
| 622 | /* Feedback loop via tracepoints */ | 635 | /* Feedback loop via tracepoints */ |
| 623 | trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); | 636 | trace_xdp_cpumap_enqueue(rcpu->map_id, processed, drops, to_cpu); |
| 624 | return 0; | 637 | return 0; |
| @@ -629,10 +642,11 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, | |||
| 629 | */ | 642 | */ |
| 630 | static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) | 643 | static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) |
| 631 | { | 644 | { |
| 645 | struct list_head *flush_list = this_cpu_ptr(rcpu->cmap->flush_list); | ||
| 632 | struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); | 646 | struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); |
| 633 | 647 | ||
| 634 | if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) | 648 | if (unlikely(bq->count == CPU_MAP_BULK_SIZE)) |
| 635 | bq_flush_to_queue(rcpu, bq, true); | 649 | bq_flush_to_queue(bq, true); |
| 636 | 650 | ||
| 637 | /* Notice, xdp_buff/page MUST be queued here, long enough for | 651 | /* Notice, xdp_buff/page MUST be queued here, long enough for |
| 638 | * driver to code invoking us to finished, due to driver | 652 | * driver to code invoking us to finished, due to driver |
| @@ -644,6 +658,10 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) | |||
| 644 | * operation, when completing napi->poll call. | 658 | * operation, when completing napi->poll call. |
| 645 | */ | 659 | */ |
| 646 | bq->q[bq->count++] = xdpf; | 660 | bq->q[bq->count++] = xdpf; |
| 661 | |||
| 662 | if (!bq->flush_node.prev) | ||
| 663 | list_add(&bq->flush_node, flush_list); | ||
| 664 | |||
| 647 | return 0; | 665 | return 0; |
| 648 | } | 666 | } |
| 649 | 667 | ||
| @@ -663,41 +681,16 @@ int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, | |||
| 663 | return 0; | 681 | return 0; |
| 664 | } | 682 | } |
| 665 | 683 | ||
| 666 | void __cpu_map_insert_ctx(struct bpf_map *map, u32 bit) | ||
| 667 | { | ||
| 668 | struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); | ||
| 669 | unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); | ||
| 670 | |||
| 671 | __set_bit(bit, bitmap); | ||
| 672 | } | ||
| 673 | |||
| 674 | void __cpu_map_flush(struct bpf_map *map) | 684 | void __cpu_map_flush(struct bpf_map *map) |
| 675 | { | 685 | { |
| 676 | struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); | 686 | struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); |
| 677 | unsigned long *bitmap = this_cpu_ptr(cmap->flush_needed); | 687 | struct list_head *flush_list = this_cpu_ptr(cmap->flush_list); |
| 678 | u32 bit; | 688 | struct xdp_bulk_queue *bq, *tmp; |
| 679 | |||
| 680 | /* The napi->poll softirq makes sure __cpu_map_insert_ctx() | ||
| 681 | * and __cpu_map_flush() happen on same CPU. Thus, the percpu | ||
| 682 | * bitmap indicate which percpu bulkq have packets. | ||
| 683 | */ | ||
| 684 | for_each_set_bit(bit, bitmap, map->max_entries) { | ||
| 685 | struct bpf_cpu_map_entry *rcpu = READ_ONCE(cmap->cpu_map[bit]); | ||
| 686 | struct xdp_bulk_queue *bq; | ||
| 687 | |||
| 688 | /* This is possible if entry is removed by user space | ||
| 689 | * between xdp redirect and flush op. | ||
| 690 | */ | ||
| 691 | if (unlikely(!rcpu)) | ||
| 692 | continue; | ||
| 693 | |||
| 694 | __clear_bit(bit, bitmap); | ||
| 695 | 689 | ||
| 696 | /* Flush all frames in bulkq to real queue */ | 690 | list_for_each_entry_safe(bq, tmp, flush_list, flush_node) { |
| 697 | bq = this_cpu_ptr(rcpu->bulkq); | 691 | bq_flush_to_queue(bq, true); |
| 698 | bq_flush_to_queue(rcpu, bq, true); | ||
| 699 | 692 | ||
| 700 | /* If already running, costs spin_lock_irqsave + smp_mb */ | 693 | /* If already running, costs spin_lock_irqsave + smp_mb */ |
| 701 | wake_up_process(rcpu->kthread); | 694 | wake_up_process(bq->obj->kthread); |
| 702 | } | 695 | } |
| 703 | } | 696 | } |
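The shape the cpumap conversion lands on: a bulk queue links itself onto the per-CPU flush list the first time a frame is enqueued, reusing flush_node.prev == NULL as the "not yet listed" test, and bq_flush_to_queue() clears that marker again via __list_del_clearprev() so the queue can re-arm on the next enqueue. Condensed (kernel list.h idiom, fragment only):

/* enqueue side */
if (!bq->flush_node.prev)		/* not on this CPU's flush list yet */
	list_add(&bq->flush_node, flush_list);

/* flush side, at the end of the NAPI poll */
list_for_each_entry_safe(bq, tmp, flush_list, flush_node)
	bq_flush_to_queue(bq, true);	/* __list_del_clearprev()s the node */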
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 191b79948424..d83cf8ccc872 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c | |||
| @@ -1,13 +1,5 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io | 2 | /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io |
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | 3 | */ |
| 12 | 4 | ||
| 13 | /* Devmaps primary use is as a backend map for XDP BPF helper call | 5 | /* Devmaps primary use is as a backend map for XDP BPF helper call |
| @@ -25,9 +17,8 @@ | |||
| 25 | * datapath always has a valid copy. However, the datapath does a "flush" | 17 | * datapath always has a valid copy. However, the datapath does a "flush" |
| 26 | * operation that pushes any pending packets in the driver outside the RCU | 18 | * operation that pushes any pending packets in the driver outside the RCU |
| 27 | * critical section. Each bpf_dtab_netdev tracks these pending operations using | 19 | * critical section. Each bpf_dtab_netdev tracks these pending operations using |
| 28 | * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed | 20 | * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until |
| 29 | * until all bits are cleared indicating outstanding flush operations have | 21 | * this list is empty, indicating outstanding flush operations have completed. |
| 30 | * completed. | ||
| 31 | * | 22 | * |
| 32 | * BPF syscalls may race with BPF program calls on any of the update, delete | 23 | * BPF syscalls may race with BPF program calls on any of the update, delete |
| 33 | * or lookup operations. As noted above, the xchg() operation also keeps the | 24 | * or lookup operations. As noted above, the xchg() operation also keeps the |
| @@ -56,9 +47,13 @@ | |||
| 56 | (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) | 47 | (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) |
| 57 | 48 | ||
| 58 | #define DEV_MAP_BULK_SIZE 16 | 49 | #define DEV_MAP_BULK_SIZE 16 |
| 50 | struct bpf_dtab_netdev; | ||
| 51 | |||
| 59 | struct xdp_bulk_queue { | 52 | struct xdp_bulk_queue { |
| 60 | struct xdp_frame *q[DEV_MAP_BULK_SIZE]; | 53 | struct xdp_frame *q[DEV_MAP_BULK_SIZE]; |
| 54 | struct list_head flush_node; | ||
| 61 | struct net_device *dev_rx; | 55 | struct net_device *dev_rx; |
| 56 | struct bpf_dtab_netdev *obj; | ||
| 62 | unsigned int count; | 57 | unsigned int count; |
| 63 | }; | 58 | }; |
| 64 | 59 | ||
| @@ -73,22 +68,17 @@ struct bpf_dtab_netdev { | |||
| 73 | struct bpf_dtab { | 68 | struct bpf_dtab { |
| 74 | struct bpf_map map; | 69 | struct bpf_map map; |
| 75 | struct bpf_dtab_netdev **netdev_map; | 70 | struct bpf_dtab_netdev **netdev_map; |
| 76 | unsigned long __percpu *flush_needed; | 71 | struct list_head __percpu *flush_list; |
| 77 | struct list_head list; | 72 | struct list_head list; |
| 78 | }; | 73 | }; |
| 79 | 74 | ||
| 80 | static DEFINE_SPINLOCK(dev_map_lock); | 75 | static DEFINE_SPINLOCK(dev_map_lock); |
| 81 | static LIST_HEAD(dev_map_list); | 76 | static LIST_HEAD(dev_map_list); |
| 82 | 77 | ||
| 83 | static u64 dev_map_bitmap_size(const union bpf_attr *attr) | ||
| 84 | { | ||
| 85 | return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long); | ||
| 86 | } | ||
| 87 | |||
| 88 | static struct bpf_map *dev_map_alloc(union bpf_attr *attr) | 78 | static struct bpf_map *dev_map_alloc(union bpf_attr *attr) |
| 89 | { | 79 | { |
| 90 | struct bpf_dtab *dtab; | 80 | struct bpf_dtab *dtab; |
| 91 | int err = -EINVAL; | 81 | int err, cpu; |
| 92 | u64 cost; | 82 | u64 cost; |
| 93 | 83 | ||
| 94 | if (!capable(CAP_NET_ADMIN)) | 84 | if (!capable(CAP_NET_ADMIN)) |
| @@ -99,6 +89,11 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) | |||
| 99 | attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) | 89 | attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) |
| 100 | return ERR_PTR(-EINVAL); | 90 | return ERR_PTR(-EINVAL); |
| 101 | 91 | ||
| 92 | /* Lookup returns a pointer straight to dev->ifindex, so make sure the | ||
| 93 | * verifier prevents writes from the BPF side | ||
| 94 | */ | ||
| 95 | attr->map_flags |= BPF_F_RDONLY_PROG; | ||
| 96 | |||
| 102 | dtab = kzalloc(sizeof(*dtab), GFP_USER); | 97 | dtab = kzalloc(sizeof(*dtab), GFP_USER); |
| 103 | if (!dtab) | 98 | if (!dtab) |
| 104 | return ERR_PTR(-ENOMEM); | 99 | return ERR_PTR(-ENOMEM); |
| @@ -107,39 +102,39 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) | |||
| 107 | 102 | ||
| 108 | /* make sure page count doesn't overflow */ | 103 | /* make sure page count doesn't overflow */ |
| 109 | cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); | 104 | cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); |
| 110 | cost += dev_map_bitmap_size(attr) * num_possible_cpus(); | 105 | cost += sizeof(struct list_head) * num_possible_cpus(); |
| 111 | if (cost >= U32_MAX - PAGE_SIZE) | ||
| 112 | goto free_dtab; | ||
| 113 | 106 | ||
| 114 | dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | 107 | /* if map size is larger than memlock limit, reject it */ |
| 115 | 108 | err = bpf_map_charge_init(&dtab->map.memory, cost); | |
| 116 | /* if map size is larger than memlock limit, reject it early */ | ||
| 117 | err = bpf_map_precharge_memlock(dtab->map.pages); | ||
| 118 | if (err) | 109 | if (err) |
| 119 | goto free_dtab; | 110 | goto free_dtab; |
| 120 | 111 | ||
| 121 | err = -ENOMEM; | 112 | err = -ENOMEM; |
| 122 | 113 | ||
| 123 | /* A per cpu bitfield with a bit per possible net device */ | 114 | dtab->flush_list = alloc_percpu(struct list_head); |
| 124 | dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr), | 115 | if (!dtab->flush_list) |
| 125 | __alignof__(unsigned long), | 116 | goto free_charge; |
| 126 | GFP_KERNEL | __GFP_NOWARN); | 117 | |
| 127 | if (!dtab->flush_needed) | 118 | for_each_possible_cpu(cpu) |
| 128 | goto free_dtab; | 119 | INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); |
| 129 | 120 | ||
| 130 | dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * | 121 | dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * |
| 131 | sizeof(struct bpf_dtab_netdev *), | 122 | sizeof(struct bpf_dtab_netdev *), |
| 132 | dtab->map.numa_node); | 123 | dtab->map.numa_node); |
| 133 | if (!dtab->netdev_map) | 124 | if (!dtab->netdev_map) |
| 134 | goto free_dtab; | 125 | goto free_percpu; |
| 135 | 126 | ||
| 136 | spin_lock(&dev_map_lock); | 127 | spin_lock(&dev_map_lock); |
| 137 | list_add_tail_rcu(&dtab->list, &dev_map_list); | 128 | list_add_tail_rcu(&dtab->list, &dev_map_list); |
| 138 | spin_unlock(&dev_map_lock); | 129 | spin_unlock(&dev_map_lock); |
| 139 | 130 | ||
| 140 | return &dtab->map; | 131 | return &dtab->map; |
| 132 | |||
| 133 | free_percpu: | ||
| 134 | free_percpu(dtab->flush_list); | ||
| 135 | free_charge: | ||
| 136 | bpf_map_charge_finish(&dtab->map.memory); | ||
| 141 | free_dtab: | 137 | free_dtab: |
| 142 | free_percpu(dtab->flush_needed); | ||
| 143 | kfree(dtab); | 138 | kfree(dtab); |
| 144 | return ERR_PTR(err); | 139 | return ERR_PTR(err); |
| 145 | } | 140 | } |
| @@ -164,15 +159,18 @@ static void dev_map_free(struct bpf_map *map) | |||
| 164 | bpf_clear_redirect_map(map); | 159 | bpf_clear_redirect_map(map); |
| 165 | synchronize_rcu(); | 160 | synchronize_rcu(); |
| 166 | 161 | ||
| 162 | /* Make sure prior __dev_map_entry_free() have completed. */ | ||
| 163 | rcu_barrier(); | ||
| 164 | |||
| 167 | /* To ensure all pending flush operations have completed wait for flush | 165 | /* To ensure all pending flush operations have completed wait for flush |
| 168 | * bitmap to indicate all flush_needed bits to be zero on _all_ cpus. | 166 | * list to empty on _all_ cpus. |
| 169 | * Because the above synchronize_rcu() ensures the map is disconnected | 167 | * Because the above synchronize_rcu() ensures the map is disconnected |
| 170 | * from the program we can assume no new bits will be set. | 168 | * from the program we can assume no new items will be added. |
| 171 | */ | 169 | */ |
| 172 | for_each_online_cpu(cpu) { | 170 | for_each_online_cpu(cpu) { |
| 173 | unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu); | 171 | struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); |
| 174 | 172 | ||
| 175 | while (!bitmap_empty(bitmap, dtab->map.max_entries)) | 173 | while (!list_empty(flush_list)) |
| 176 | cond_resched(); | 174 | cond_resched(); |
| 177 | } | 175 | } |
| 178 | 176 | ||
| @@ -183,11 +181,12 @@ static void dev_map_free(struct bpf_map *map) | |||
| 183 | if (!dev) | 181 | if (!dev) |
| 184 | continue; | 182 | continue; |
| 185 | 183 | ||
| 184 | free_percpu(dev->bulkq); | ||
| 186 | dev_put(dev->dev); | 185 | dev_put(dev->dev); |
| 187 | kfree(dev); | 186 | kfree(dev); |
| 188 | } | 187 | } |
| 189 | 188 | ||
| 190 | free_percpu(dtab->flush_needed); | 189 | free_percpu(dtab->flush_list); |
| 191 | bpf_map_area_free(dtab->netdev_map); | 190 | bpf_map_area_free(dtab->netdev_map); |
| 192 | kfree(dtab); | 191 | kfree(dtab); |
| 193 | } | 192 | } |
| @@ -209,18 +208,10 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) | |||
| 209 | return 0; | 208 | return 0; |
| 210 | } | 209 | } |
| 211 | 210 | ||
| 212 | void __dev_map_insert_ctx(struct bpf_map *map, u32 bit) | 211 | static int bq_xmit_all(struct xdp_bulk_queue *bq, u32 flags, |
| 213 | { | ||
| 214 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); | ||
| 215 | unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); | ||
| 216 | |||
| 217 | __set_bit(bit, bitmap); | ||
| 218 | } | ||
| 219 | |||
| 220 | static int bq_xmit_all(struct bpf_dtab_netdev *obj, | ||
| 221 | struct xdp_bulk_queue *bq, u32 flags, | ||
| 222 | bool in_napi_ctx) | 212 | bool in_napi_ctx) |
| 223 | { | 213 | { |
| 214 | struct bpf_dtab_netdev *obj = bq->obj; | ||
| 224 | struct net_device *dev = obj->dev; | 215 | struct net_device *dev = obj->dev; |
| 225 | int sent = 0, drops = 0, err = 0; | 216 | int sent = 0, drops = 0, err = 0; |
| 226 | int i; | 217 | int i; |
| @@ -247,6 +238,7 @@ out: | |||
| 247 | trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, | 238 | trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit, |
| 248 | sent, drops, bq->dev_rx, dev, err); | 239 | sent, drops, bq->dev_rx, dev, err); |
| 249 | bq->dev_rx = NULL; | 240 | bq->dev_rx = NULL; |
| 241 | __list_del_clearprev(&bq->flush_node); | ||
| 250 | return 0; | 242 | return 0; |
| 251 | error: | 243 | error: |
| 252 | /* If ndo_xdp_xmit fails with an errno, no frames have been | 244 | /* If ndo_xdp_xmit fails with an errno, no frames have been |
| @@ -269,30 +261,19 @@ error: | |||
| 269 | * from the driver before returning from its napi->poll() routine. The poll() | 261 | * from the driver before returning from its napi->poll() routine. The poll() |
| 270 | * routine is called either from busy_poll context or net_rx_action signaled | 262 | * routine is called either from busy_poll context or net_rx_action signaled |
| 271 | * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the | 263 | * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the |
| 272 | * net device can be torn down. On devmap tear down we ensure the ctx bitmap | 264 | * net device can be torn down. On devmap tear down we ensure the flush list |
| 273 | * is zeroed before completing to ensure all flush operations have completed. | 265 | * is empty before completing to ensure all flush operations have completed. |
| 274 | */ | 266 | */ |
| 275 | void __dev_map_flush(struct bpf_map *map) | 267 | void __dev_map_flush(struct bpf_map *map) |
| 276 | { | 268 | { |
| 277 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); | 269 | struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); |
| 278 | unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed); | 270 | struct list_head *flush_list = this_cpu_ptr(dtab->flush_list); |
| 279 | u32 bit; | 271 | struct xdp_bulk_queue *bq, *tmp; |
| 280 | |||
| 281 | for_each_set_bit(bit, bitmap, map->max_entries) { | ||
| 282 | struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]); | ||
| 283 | struct xdp_bulk_queue *bq; | ||
| 284 | |||
| 285 | /* This is possible if the dev entry is removed by user space | ||
| 286 | * between xdp redirect and flush op. | ||
| 287 | */ | ||
| 288 | if (unlikely(!dev)) | ||
| 289 | continue; | ||
| 290 | |||
| 291 | __clear_bit(bit, bitmap); | ||
| 292 | 272 | ||
| 293 | bq = this_cpu_ptr(dev->bulkq); | 273 | rcu_read_lock(); |
| 294 | bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, true); | 274 | list_for_each_entry_safe(bq, tmp, flush_list, flush_node) |
| 295 | } | 275 | bq_xmit_all(bq, XDP_XMIT_FLUSH, true); |
| 276 | rcu_read_unlock(); | ||
| 296 | } | 277 | } |
| 297 | 278 | ||
| 298 | /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or | 279 | /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or |
| @@ -318,10 +299,11 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, | |||
| 318 | struct net_device *dev_rx) | 299 | struct net_device *dev_rx) |
| 319 | 300 | ||
| 320 | { | 301 | { |
| 302 | struct list_head *flush_list = this_cpu_ptr(obj->dtab->flush_list); | ||
| 321 | struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); | 303 | struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq); |
| 322 | 304 | ||
| 323 | if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) | 305 | if (unlikely(bq->count == DEV_MAP_BULK_SIZE)) |
| 324 | bq_xmit_all(obj, bq, 0, true); | 306 | bq_xmit_all(bq, 0, true); |
| 325 | 307 | ||
| 326 | /* Ingress dev_rx will be the same for all xdp_frame's in | 308 | /* Ingress dev_rx will be the same for all xdp_frame's in |
| 327 | * bulk_queue, because bq stored per-CPU and must be flushed | 309 | * bulk_queue, because bq stored per-CPU and must be flushed |
| @@ -331,6 +313,10 @@ static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf, | |||
| 331 | bq->dev_rx = dev_rx; | 313 | bq->dev_rx = dev_rx; |
| 332 | 314 | ||
| 333 | bq->q[bq->count++] = xdpf; | 315 | bq->q[bq->count++] = xdpf; |
| 316 | |||
| 317 | if (!bq->flush_node.prev) | ||
| 318 | list_add(&bq->flush_node, flush_list); | ||
| 319 | |||
| 334 | return 0; | 320 | return 0; |
| 335 | } | 321 | } |
| 336 | 322 | ||
| @@ -381,17 +367,14 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev) | |||
| 381 | { | 367 | { |
| 382 | if (dev->dev->netdev_ops->ndo_xdp_xmit) { | 368 | if (dev->dev->netdev_ops->ndo_xdp_xmit) { |
| 383 | struct xdp_bulk_queue *bq; | 369 | struct xdp_bulk_queue *bq; |
| 384 | unsigned long *bitmap; | ||
| 385 | |||
| 386 | int cpu; | 370 | int cpu; |
| 387 | 371 | ||
| 372 | rcu_read_lock(); | ||
| 388 | for_each_online_cpu(cpu) { | 373 | for_each_online_cpu(cpu) { |
| 389 | bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu); | ||
| 390 | __clear_bit(dev->bit, bitmap); | ||
| 391 | |||
| 392 | bq = per_cpu_ptr(dev->bulkq, cpu); | 374 | bq = per_cpu_ptr(dev->bulkq, cpu); |
| 393 | bq_xmit_all(dev, bq, XDP_XMIT_FLUSH, false); | 375 | bq_xmit_all(bq, XDP_XMIT_FLUSH, false); |
| 394 | } | 376 | } |
| 377 | rcu_read_unlock(); | ||
| 395 | } | 378 | } |
| 396 | } | 379 | } |
| 397 | 380 | ||
| @@ -436,8 +419,10 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, | |||
| 436 | struct net *net = current->nsproxy->net_ns; | 419 | struct net *net = current->nsproxy->net_ns; |
| 437 | gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; | 420 | gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; |
| 438 | struct bpf_dtab_netdev *dev, *old_dev; | 421 | struct bpf_dtab_netdev *dev, *old_dev; |
| 439 | u32 i = *(u32 *)key; | ||
| 440 | u32 ifindex = *(u32 *)value; | 422 | u32 ifindex = *(u32 *)value; |
| 423 | struct xdp_bulk_queue *bq; | ||
| 424 | u32 i = *(u32 *)key; | ||
| 425 | int cpu; | ||
| 441 | 426 | ||
| 442 | if (unlikely(map_flags > BPF_EXIST)) | 427 | if (unlikely(map_flags > BPF_EXIST)) |
| 443 | return -EINVAL; | 428 | return -EINVAL; |
| @@ -460,6 +445,11 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, | |||
| 460 | return -ENOMEM; | 445 | return -ENOMEM; |
| 461 | } | 446 | } |
| 462 | 447 | ||
| 448 | for_each_possible_cpu(cpu) { | ||
| 449 | bq = per_cpu_ptr(dev->bulkq, cpu); | ||
| 450 | bq->obj = dev; | ||
| 451 | } | ||
| 452 | |||
| 463 | dev->dev = dev_get_by_index(net, ifindex); | 453 | dev->dev = dev_get_by_index(net, ifindex); |
| 464 | if (!dev->dev) { | 454 | if (!dev->dev) { |
| 465 | free_percpu(dev->bulkq); | 455 | free_percpu(dev->bulkq); |
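As the header comment says, a devmap's primary use is as the backend for XDP redirects; the forced BPF_F_RDONLY_PROG flag only means the looked-up ifindex is read-only from the program side. A hedged XDP-side sketch (libbpf-style map definition; map name, key choice and sizes are illustrative):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(key_size, sizeof(__u32));	/* devmap requires 4-byte keys */
	__uint(value_size, sizeof(__u32));	/* value is the egress ifindex */
	__uint(max_entries, 64);
} tx_ports SEC(".maps");

SEC("xdp")
int xdp_redirect_ports(struct xdp_md *ctx)
{
	__u32 key = 0;	/* egress slot chosen by policy; fixed here */

	/* Returns XDP_REDIRECT on success, XDP_ABORTED otherwise. */
	return bpf_redirect_map(&tx_ports, key, 0);
}

char _license[] SEC("license") = "GPL";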
diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d9ce383c0f9c..b44d8c447afd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c | |||
| @@ -1,14 +1,6 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | 2 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com |
| 2 | * Copyright (c) 2016 Facebook | 3 | * Copyright (c) 2016 Facebook |
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of version 2 of the GNU General Public | ||
| 6 | * License as published by the Free Software Foundation. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, but | ||
| 9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 11 | * General Public License for more details. | ||
| 12 | */ | 4 | */ |
| 13 | 5 | ||
| 14 | #include <linux/bpf.h> | 6 | #include <linux/bpf.h> |
diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e1324a834a24..e546b18d27da 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h | |||
| @@ -1,14 +1,6 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0-only */ | ||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | 2 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com |
| 2 | * Copyright (c) 2016 Facebook | 3 | * Copyright (c) 2016 Facebook |
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of version 2 of the GNU General Public | ||
| 6 | * License as published by the Free Software Foundation. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, but | ||
| 9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 11 | * General Public License for more details. | ||
| 12 | */ | 4 | */ |
| 13 | 5 | ||
| 14 | #ifndef __BPF_DISASM_H__ | 6 | #ifndef __BPF_DISASM_H__ |
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 192d32e77db3..22066a62c8c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c | |||
| @@ -1,14 +1,6 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | 2 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com |
| 2 | * Copyright (c) 2016 Facebook | 3 | * Copyright (c) 2016 Facebook |
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of version 2 of the GNU General Public | ||
| 6 | * License as published by the Free Software Foundation. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, but | ||
| 9 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 11 | * General Public License for more details. | ||
| 12 | */ | 4 | */ |
| 13 | #include <linux/bpf.h> | 5 | #include <linux/bpf.h> |
| 14 | #include <linux/btf.h> | 6 | #include <linux/btf.h> |
| @@ -360,14 +352,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | |||
| 360 | else | 352 | else |
| 361 | cost += (u64) htab->elem_size * num_possible_cpus(); | 353 | cost += (u64) htab->elem_size * num_possible_cpus(); |
| 362 | 354 | ||
| 363 | if (cost >= U32_MAX - PAGE_SIZE) | 355 | /* if map size is larger than memlock limit, reject it */ |
| 364 | /* make sure page count doesn't overflow */ | 356 | err = bpf_map_charge_init(&htab->map.memory, cost); |
| 365 | goto free_htab; | ||
| 366 | |||
| 367 | htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | ||
| 368 | |||
| 369 | /* if map size is larger than memlock limit, reject it early */ | ||
| 370 | err = bpf_map_precharge_memlock(htab->map.pages); | ||
| 371 | if (err) | 357 | if (err) |
| 372 | goto free_htab; | 358 | goto free_htab; |
| 373 | 359 | ||
| @@ -376,7 +362,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) | |||
| 376 | sizeof(struct bucket), | 362 | sizeof(struct bucket), |
| 377 | htab->map.numa_node); | 363 | htab->map.numa_node); |
| 378 | if (!htab->buckets) | 364 | if (!htab->buckets) |
| 379 | goto free_htab; | 365 | goto free_charge; |
| 380 | 366 | ||
| 381 | if (htab->map.map_flags & BPF_F_ZERO_SEED) | 367 | if (htab->map.map_flags & BPF_F_ZERO_SEED) |
| 382 | htab->hashrnd = 0; | 368 | htab->hashrnd = 0; |
| @@ -409,6 +395,8 @@ free_prealloc: | |||
| 409 | prealloc_destroy(htab); | 395 | prealloc_destroy(htab); |
| 410 | free_buckets: | 396 | free_buckets: |
| 411 | bpf_map_area_free(htab->buckets); | 397 | bpf_map_area_free(htab->buckets); |
| 398 | free_charge: | ||
| 399 | bpf_map_charge_finish(&htab->map.memory); | ||
| 412 | free_htab: | 400 | free_htab: |
| 413 | kfree(htab); | 401 | kfree(htab); |
| 414 | return ERR_PTR(err); | 402 | return ERR_PTR(err); |
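Across the converted map allocators the pattern is the same: the old "compute pages, bpf_map_precharge_memlock()" pair becomes an up-front bpf_map_charge_init() with a matching bpf_map_charge_finish() on any later failure path. Schematically (fragment; "cost" is the byte estimate each allocator already computes):

err = bpf_map_charge_init(&map->memory, cost);
if (err)
	goto free_map;

ptr = bpf_map_area_alloc(size, numa_node);
if (!ptr) {
	bpf_map_charge_finish(&map->memory);	/* undo the charge */
	goto free_map;
}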
| @@ -527,18 +515,30 @@ static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) | |||
| 527 | return insn - insn_buf; | 515 | return insn - insn_buf; |
| 528 | } | 516 | } |
| 529 | 517 | ||
| 530 | static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) | 518 | static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, |
| 519 | void *key, const bool mark) | ||
| 531 | { | 520 | { |
| 532 | struct htab_elem *l = __htab_map_lookup_elem(map, key); | 521 | struct htab_elem *l = __htab_map_lookup_elem(map, key); |
| 533 | 522 | ||
| 534 | if (l) { | 523 | if (l) { |
| 535 | bpf_lru_node_set_ref(&l->lru_node); | 524 | if (mark) |
| 525 | bpf_lru_node_set_ref(&l->lru_node); | ||
| 536 | return l->key + round_up(map->key_size, 8); | 526 | return l->key + round_up(map->key_size, 8); |
| 537 | } | 527 | } |
| 538 | 528 | ||
| 539 | return NULL; | 529 | return NULL; |
| 540 | } | 530 | } |
| 541 | 531 | ||
| 532 | static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) | ||
| 533 | { | ||
| 534 | return __htab_lru_map_lookup_elem(map, key, true); | ||
| 535 | } | ||
| 536 | |||
| 537 | static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) | ||
| 538 | { | ||
| 539 | return __htab_lru_map_lookup_elem(map, key, false); | ||
| 540 | } | ||
| 541 | |||
| 542 | static u32 htab_lru_map_gen_lookup(struct bpf_map *map, | 542 | static u32 htab_lru_map_gen_lookup(struct bpf_map *map, |
| 543 | struct bpf_insn *insn_buf) | 543 | struct bpf_insn *insn_buf) |
| 544 | { | 544 | { |
| @@ -1250,6 +1250,7 @@ const struct bpf_map_ops htab_lru_map_ops = { | |||
| 1250 | .map_free = htab_map_free, | 1250 | .map_free = htab_map_free, |
| 1251 | .map_get_next_key = htab_map_get_next_key, | 1251 | .map_get_next_key = htab_map_get_next_key, |
| 1252 | .map_lookup_elem = htab_lru_map_lookup_elem, | 1252 | .map_lookup_elem = htab_lru_map_lookup_elem, |
| 1253 | .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, | ||
| 1253 | .map_update_elem = htab_lru_map_update_elem, | 1254 | .map_update_elem = htab_lru_map_update_elem, |
| 1254 | .map_delete_elem = htab_lru_map_delete_elem, | 1255 | .map_delete_elem = htab_lru_map_delete_elem, |
| 1255 | .map_gen_lookup = htab_lru_map_gen_lookup, | 1256 | .map_gen_lookup = htab_lru_map_gen_lookup, |
| @@ -1281,7 +1282,6 @@ static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) | |||
| 1281 | 1282 | ||
| 1282 | int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) | 1283 | int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) |
| 1283 | { | 1284 | { |
| 1284 | struct bpf_htab *htab = container_of(map, struct bpf_htab, map); | ||
| 1285 | struct htab_elem *l; | 1285 | struct htab_elem *l; |
| 1286 | void __percpu *pptr; | 1286 | void __percpu *pptr; |
| 1287 | int ret = -ENOENT; | 1287 | int ret = -ENOENT; |
| @@ -1297,8 +1297,9 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) | |||
| 1297 | l = __htab_map_lookup_elem(map, key); | 1297 | l = __htab_map_lookup_elem(map, key); |
| 1298 | if (!l) | 1298 | if (!l) |
| 1299 | goto out; | 1299 | goto out; |
| 1300 | if (htab_is_lru(htab)) | 1300 | /* We do not mark LRU map element here in order to not mess up |
| 1301 | bpf_lru_node_set_ref(&l->lru_node); | 1301 | * eviction heuristics when user space does a map walk. |
| 1302 | */ | ||
| 1302 | pptr = htab_elem_get_ptr(l, map->key_size); | 1303 | pptr = htab_elem_get_ptr(l, map->key_size); |
| 1303 | for_each_possible_cpu(cpu) { | 1304 | for_each_possible_cpu(cpu) { |
| 1304 | bpf_long_memcpy(value + off, | 1305 | bpf_long_memcpy(value + off, |
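The sys-only lookup exists for exactly the map walk described in the comment above: dumping an LRU hash from user space no longer marks every element referenced, so eviction order is preserved. A userspace sketch with libbpf, assuming a map with a __u32 key and __u64 value (paths and fd handling are illustrative):

#include <bpf/bpf.h>
#include <stdint.h>
#include <stdio.h>

static void dump_lru_map(int map_fd)
{
	uint32_t key, next_key;
	uint64_t value;
	int err;

	/* A NULL start key returns the first element. */
	err = bpf_map_get_next_key(map_fd, NULL, &next_key);
	while (!err) {
		key = next_key;
		if (!bpf_map_lookup_elem(map_fd, &key, &value))
			printf("key %u -> %llu\n", key,
			       (unsigned long long)value);
		err = bpf_map_get_next_key(map_fd, &key, &next_key);
	}
}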
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4266ffde07ca..5e28718928ca 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c | |||
| @@ -1,13 +1,5 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | 2 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com |
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | 3 | */ |
| 12 | #include <linux/bpf.h> | 4 | #include <linux/bpf.h> |
| 13 | #include <linux/rcupdate.h> | 5 | #include <linux/rcupdate.h> |
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index bc53e5b20ddc..cc0d0cf114e3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Minimal file system backend for holding eBPF maps and programs, | 3 | * Minimal file system backend for holding eBPF maps and programs, |
| 3 | * used by bpf(2) object pinning. | 4 | * used by bpf(2) object pinning. |
| @@ -5,10 +6,6 @@ | |||
| 5 | * Authors: | 6 | * Authors: |
| 6 | * | 7 | * |
| 7 | * Daniel Borkmann <daniel@iogearbox.net> | 8 | * Daniel Borkmann <daniel@iogearbox.net> |
| 8 | * | ||
| 9 | * This program is free software; you can redistribute it and/or | ||
| 10 | * modify it under the terms of the GNU General Public License | ||
| 11 | * version 2 as published by the Free Software Foundation. | ||
| 12 | */ | 9 | */ |
| 13 | 10 | ||
| 14 | #include <linux/init.h> | 11 | #include <linux/init.h> |
| @@ -518,7 +515,7 @@ out: | |||
| 518 | static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) | 515 | static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) |
| 519 | { | 516 | { |
| 520 | struct bpf_prog *prog; | 517 | struct bpf_prog *prog; |
| 521 | int ret = inode_permission(inode, MAY_READ | MAY_WRITE); | 518 | int ret = inode_permission(inode, MAY_READ); |
| 522 | if (ret) | 519 | if (ret) |
| 523 | return ERR_PTR(ret); | 520 | return ERR_PTR(ret); |
| 524 | 521 | ||
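
The inode.c change above relaxes __get_prog_inode() from MAY_READ | MAY_WRITE to MAY_READ, so fetching a pinned program only needs read permission on the pinned file. A small user-space sketch, with a hypothetical pin path:

    #include <bpf/bpf.h>        /* bpf_obj_get() */

    /* Hypothetical pin path; after the change above this succeeds even
     * when the pinned file is read-only for the calling user. */
    static int get_pinned_prog(void)
    {
            return bpf_obj_get("/sys/fs/bpf/my_prog");  /* prog fd or -1 */
    }
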
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 980e8f1f6cb5..addd6fdceec8 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c | |||
| @@ -272,6 +272,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) | |||
| 272 | { | 272 | { |
| 273 | int numa_node = bpf_map_attr_numa_node(attr); | 273 | int numa_node = bpf_map_attr_numa_node(attr); |
| 274 | struct bpf_cgroup_storage_map *map; | 274 | struct bpf_cgroup_storage_map *map; |
| 275 | struct bpf_map_memory mem; | ||
| 276 | int ret; | ||
| 275 | 277 | ||
| 276 | if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) | 278 | if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) |
| 277 | return ERR_PTR(-EINVAL); | 279 | return ERR_PTR(-EINVAL); |
| @@ -290,13 +292,18 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) | |||
| 290 | /* max_entries is not used and enforced to be 0 */ | 292 | /* max_entries is not used and enforced to be 0 */ |
| 291 | return ERR_PTR(-EINVAL); | 293 | return ERR_PTR(-EINVAL); |
| 292 | 294 | ||
| 295 | ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); | ||
| 296 | if (ret < 0) | ||
| 297 | return ERR_PTR(ret); | ||
| 298 | |||
| 293 | map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), | 299 | map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), |
| 294 | __GFP_ZERO | GFP_USER, numa_node); | 300 | __GFP_ZERO | GFP_USER, numa_node); |
| 295 | if (!map) | 301 | if (!map) { |
| 302 | bpf_map_charge_finish(&mem); | ||
| 296 | return ERR_PTR(-ENOMEM); | 303 | return ERR_PTR(-ENOMEM); |
| 304 | } | ||
| 297 | 305 | ||
| 298 | map->map.pages = round_up(sizeof(struct bpf_cgroup_storage_map), | 306 | bpf_map_charge_move(&map->map.memory, &mem); |
| 299 | PAGE_SIZE) >> PAGE_SHIFT; | ||
| 300 | 307 | ||
| 301 | /* copy mandatory map attributes */ | 308 | /* copy mandatory map attributes */ |
| 302 | bpf_map_init_from_attr(&map->map, attr); | 309 | bpf_map_init_from_attr(&map->map, attr); |
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index e61630c2e50b..56e6c75d354d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c | |||
| @@ -1,12 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Longest prefix match list implementation | 3 | * Longest prefix match list implementation |
| 3 | * | 4 | * |
| 4 | * Copyright (c) 2016,2017 Daniel Mack | 5 | * Copyright (c) 2016,2017 Daniel Mack |
| 5 | * Copyright (c) 2016 David Herrmann | 6 | * Copyright (c) 2016 David Herrmann |
| 6 | * | ||
| 7 | * This file is subject to the terms and conditions of version 2 of the GNU | ||
| 8 | * General Public License. See the file COPYING in the main directory of the | ||
| 9 | * Linux distribution for more details. | ||
| 10 | */ | 7 | */ |
| 11 | 8 | ||
| 12 | #include <linux/bpf.h> | 9 | #include <linux/bpf.h> |
| @@ -573,14 +570,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) | |||
| 573 | cost_per_node = sizeof(struct lpm_trie_node) + | 570 | cost_per_node = sizeof(struct lpm_trie_node) + |
| 574 | attr->value_size + trie->data_size; | 571 | attr->value_size + trie->data_size; |
| 575 | cost += (u64) attr->max_entries * cost_per_node; | 572 | cost += (u64) attr->max_entries * cost_per_node; |
| 576 | if (cost >= U32_MAX - PAGE_SIZE) { | ||
| 577 | ret = -E2BIG; | ||
| 578 | goto out_err; | ||
| 579 | } | ||
| 580 | |||
| 581 | trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | ||
| 582 | 573 | ||
| 583 | ret = bpf_map_precharge_memlock(trie->map.pages); | 574 | ret = bpf_map_charge_init(&trie->map.memory, cost); |
| 584 | if (ret) | 575 | if (ret) |
| 585 | goto out_err; | 576 | goto out_err; |
| 586 | 577 | ||
| @@ -716,9 +707,14 @@ find_leftmost: | |||
| 716 | * have exact two children, so this function will never return NULL. | 707 | * have exact two children, so this function will never return NULL. |
| 717 | */ | 708 | */ |
| 718 | for (node = search_root; node;) { | 709 | for (node = search_root; node;) { |
| 719 | if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) | 710 | if (node->flags & LPM_TREE_NODE_FLAG_IM) { |
| 711 | node = rcu_dereference(node->child[0]); | ||
| 712 | } else { | ||
| 720 | next_node = node; | 713 | next_node = node; |
| 721 | node = rcu_dereference(node->child[0]); | 714 | node = rcu_dereference(node->child[0]); |
| 715 | if (!node) | ||
| 716 | node = rcu_dereference(next_node->child[1]); | ||
| 717 | } | ||
| 722 | } | 718 | } |
| 723 | do_copy: | 719 | do_copy: |
| 724 | next_key->prefixlen = next_node->prefixlen; | 720 | next_key->prefixlen = next_node->prefixlen; |
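
In the lpm_trie.c get-next-key hunk above, intermediate nodes are now only descended through, real nodes are remembered as the candidate next key, and when a remembered node has no left child the walk falls back to its right child, so the loop really does end on a returnable node. A simplified stand-alone sketch of that traversal (toy types, no RCU, not the kernel code):

    #include <stdbool.h>
    #include <stddef.h>

    struct toy_node {
            struct toy_node *child[2];
            bool intermediate;      /* stands in for LPM_TREE_NODE_FLAG_IM */
    };

    /* Mirrors the fixed loop: skip intermediate nodes, remember real
     * ones, prefer the left subtree and fall back to the right one. */
    static struct toy_node *next_leftmost(struct toy_node *search_root)
    {
            struct toy_node *node, *next_node = NULL;

            for (node = search_root; node;) {
                    if (node->intermediate) {
                            node = node->child[0];
                    } else {
                            next_node = node;
                            node = node->child[0];
                            if (!node)
                                    node = next_node->child[1];
                    }
            }
            return next_node;
    }
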
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 3dff41403583..fab4fb134547 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c | |||
| @@ -1,8 +1,5 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* Copyright (c) 2017 Facebook | 2 | /* Copyright (c) 2017 Facebook |
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | 3 | */ |
| 7 | #include <linux/slab.h> | 4 | #include <linux/slab.h> |
| 8 | #include <linux/bpf.h> | 5 | #include <linux/bpf.h> |
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h index 6183db9ec08c..a507bf6ef8b9 100644 --- a/kernel/bpf/map_in_map.h +++ b/kernel/bpf/map_in_map.h | |||
| @@ -1,8 +1,5 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0-only */ | ||
| 1 | /* Copyright (c) 2017 Facebook | 2 | /* Copyright (c) 2017 Facebook |
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | 3 | */ |
| 7 | #ifndef __MAP_IN_MAP_H__ | 4 | #ifndef __MAP_IN_MAP_H__ |
| 8 | #define __MAP_IN_MAP_H__ | 5 | #define __MAP_IN_MAP_H__ |
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 0c1b4ba9e90e..6e090140b924 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c | |||
| @@ -1,8 +1,5 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* Copyright (c) 2016 Facebook | 2 | /* Copyright (c) 2016 Facebook |
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | 3 | */ |
| 7 | #include "percpu_freelist.h" | 4 | #include "percpu_freelist.h" |
| 8 | 5 | ||
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index c3960118e617..fbf8a8a28979 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h | |||
| @@ -1,8 +1,5 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0-only */ | ||
| 1 | /* Copyright (c) 2016 Facebook | 2 | /* Copyright (c) 2016 Facebook |
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | 3 | */ |
| 7 | #ifndef __PERCPU_FREELIST_H__ | 4 | #ifndef __PERCPU_FREELIST_H__ |
| 8 | #define __PERCPU_FREELIST_H__ | 5 | #define __PERCPU_FREELIST_H__ |
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 0b140d236889..f697647ceb54 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c | |||
| @@ -67,29 +67,28 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) | |||
| 67 | static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) | 67 | static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) |
| 68 | { | 68 | { |
| 69 | int ret, numa_node = bpf_map_attr_numa_node(attr); | 69 | int ret, numa_node = bpf_map_attr_numa_node(attr); |
| 70 | struct bpf_map_memory mem = {0}; | ||
| 70 | struct bpf_queue_stack *qs; | 71 | struct bpf_queue_stack *qs; |
| 71 | u64 size, queue_size, cost; | 72 | u64 size, queue_size, cost; |
| 72 | 73 | ||
| 73 | size = (u64) attr->max_entries + 1; | 74 | size = (u64) attr->max_entries + 1; |
| 74 | cost = queue_size = sizeof(*qs) + size * attr->value_size; | 75 | cost = queue_size = sizeof(*qs) + size * attr->value_size; |
| 75 | if (cost >= U32_MAX - PAGE_SIZE) | ||
| 76 | return ERR_PTR(-E2BIG); | ||
| 77 | 76 | ||
| 78 | cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | 77 | ret = bpf_map_charge_init(&mem, cost); |
| 79 | |||
| 80 | ret = bpf_map_precharge_memlock(cost); | ||
| 81 | if (ret < 0) | 78 | if (ret < 0) |
| 82 | return ERR_PTR(ret); | 79 | return ERR_PTR(ret); |
| 83 | 80 | ||
| 84 | qs = bpf_map_area_alloc(queue_size, numa_node); | 81 | qs = bpf_map_area_alloc(queue_size, numa_node); |
| 85 | if (!qs) | 82 | if (!qs) { |
| 83 | bpf_map_charge_finish(&mem); | ||
| 86 | return ERR_PTR(-ENOMEM); | 84 | return ERR_PTR(-ENOMEM); |
| 85 | } | ||
| 87 | 86 | ||
| 88 | memset(qs, 0, sizeof(*qs)); | 87 | memset(qs, 0, sizeof(*qs)); |
| 89 | 88 | ||
| 90 | bpf_map_init_from_attr(&qs->map, attr); | 89 | bpf_map_init_from_attr(&qs->map, attr); |
| 91 | 90 | ||
| 92 | qs->map.pages = cost; | 91 | bpf_map_charge_move(&qs->map.memory, &mem); |
| 93 | qs->size = size; | 92 | qs->size = size; |
| 94 | 93 | ||
| 95 | raw_spin_lock_init(&qs->lock); | 94 | raw_spin_lock_init(&qs->lock); |
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 18e225de80ff..50c083ba978c 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c | |||
| @@ -151,7 +151,8 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) | |||
| 151 | { | 151 | { |
| 152 | int err, numa_node = bpf_map_attr_numa_node(attr); | 152 | int err, numa_node = bpf_map_attr_numa_node(attr); |
| 153 | struct reuseport_array *array; | 153 | struct reuseport_array *array; |
| 154 | u64 cost, array_size; | 154 | struct bpf_map_memory mem; |
| 155 | u64 array_size; | ||
| 155 | 156 | ||
| 156 | if (!capable(CAP_SYS_ADMIN)) | 157 | if (!capable(CAP_SYS_ADMIN)) |
| 157 | return ERR_PTR(-EPERM); | 158 | return ERR_PTR(-EPERM); |
| @@ -159,24 +160,20 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) | |||
| 159 | array_size = sizeof(*array); | 160 | array_size = sizeof(*array); |
| 160 | array_size += (u64)attr->max_entries * sizeof(struct sock *); | 161 | array_size += (u64)attr->max_entries * sizeof(struct sock *); |
| 161 | 162 | ||
| 162 | /* make sure there is no u32 overflow later in round_up() */ | 163 | err = bpf_map_charge_init(&mem, array_size); |
| 163 | cost = array_size; | ||
| 164 | if (cost >= U32_MAX - PAGE_SIZE) | ||
| 165 | return ERR_PTR(-ENOMEM); | ||
| 166 | cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | ||
| 167 | |||
| 168 | err = bpf_map_precharge_memlock(cost); | ||
| 169 | if (err) | 164 | if (err) |
| 170 | return ERR_PTR(err); | 165 | return ERR_PTR(err); |
| 171 | 166 | ||
| 172 | /* allocate all map elements and zero-initialize them */ | 167 | /* allocate all map elements and zero-initialize them */ |
| 173 | array = bpf_map_area_alloc(array_size, numa_node); | 168 | array = bpf_map_area_alloc(array_size, numa_node); |
| 174 | if (!array) | 169 | if (!array) { |
| 170 | bpf_map_charge_finish(&mem); | ||
| 175 | return ERR_PTR(-ENOMEM); | 171 | return ERR_PTR(-ENOMEM); |
| 172 | } | ||
| 176 | 173 | ||
| 177 | /* copy mandatory map attributes */ | 174 | /* copy mandatory map attributes */ |
| 178 | bpf_map_init_from_attr(&array->map, attr); | 175 | bpf_map_init_from_attr(&array->map, attr); |
| 179 | array->map.pages = cost; | 176 | bpf_map_charge_move(&array->map.memory, &mem); |
| 180 | 177 | ||
| 181 | return &array->map; | 178 | return &array->map; |
| 182 | } | 179 | } |
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 950ab2f28922..052580c33d26 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c | |||
| @@ -1,8 +1,5 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* Copyright (c) 2016 Facebook | 2 | /* Copyright (c) 2016 Facebook |
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | */ | 3 | */ |
| 7 | #include <linux/bpf.h> | 4 | #include <linux/bpf.h> |
| 8 | #include <linux/jhash.h> | 5 | #include <linux/jhash.h> |
| @@ -89,6 +86,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) | |||
| 89 | { | 86 | { |
| 90 | u32 value_size = attr->value_size; | 87 | u32 value_size = attr->value_size; |
| 91 | struct bpf_stack_map *smap; | 88 | struct bpf_stack_map *smap; |
| 89 | struct bpf_map_memory mem; | ||
| 92 | u64 cost, n_buckets; | 90 | u64 cost, n_buckets; |
| 93 | int err; | 91 | int err; |
| 94 | 92 | ||
| @@ -116,40 +114,37 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) | |||
| 116 | n_buckets = roundup_pow_of_two(attr->max_entries); | 114 | n_buckets = roundup_pow_of_two(attr->max_entries); |
| 117 | 115 | ||
| 118 | cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); | 116 | cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); |
| 119 | if (cost >= U32_MAX - PAGE_SIZE) | 117 | cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); |
| 120 | return ERR_PTR(-E2BIG); | 118 | err = bpf_map_charge_init(&mem, cost); |
| 119 | if (err) | ||
| 120 | return ERR_PTR(err); | ||
| 121 | 121 | ||
| 122 | smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); | 122 | smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); |
| 123 | if (!smap) | 123 | if (!smap) { |
| 124 | bpf_map_charge_finish(&mem); | ||
| 124 | return ERR_PTR(-ENOMEM); | 125 | return ERR_PTR(-ENOMEM); |
| 125 | 126 | } | |
| 126 | err = -E2BIG; | ||
| 127 | cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); | ||
| 128 | if (cost >= U32_MAX - PAGE_SIZE) | ||
| 129 | goto free_smap; | ||
| 130 | 127 | ||
| 131 | bpf_map_init_from_attr(&smap->map, attr); | 128 | bpf_map_init_from_attr(&smap->map, attr); |
| 132 | smap->map.value_size = value_size; | 129 | smap->map.value_size = value_size; |
| 133 | smap->n_buckets = n_buckets; | 130 | smap->n_buckets = n_buckets; |
| 134 | smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | ||
| 135 | |||
| 136 | err = bpf_map_precharge_memlock(smap->map.pages); | ||
| 137 | if (err) | ||
| 138 | goto free_smap; | ||
| 139 | 131 | ||
| 140 | err = get_callchain_buffers(sysctl_perf_event_max_stack); | 132 | err = get_callchain_buffers(sysctl_perf_event_max_stack); |
| 141 | if (err) | 133 | if (err) |
| 142 | goto free_smap; | 134 | goto free_charge; |
| 143 | 135 | ||
| 144 | err = prealloc_elems_and_freelist(smap); | 136 | err = prealloc_elems_and_freelist(smap); |
| 145 | if (err) | 137 | if (err) |
| 146 | goto put_buffers; | 138 | goto put_buffers; |
| 147 | 139 | ||
| 140 | bpf_map_charge_move(&smap->map.memory, &mem); | ||
| 141 | |||
| 148 | return &smap->map; | 142 | return &smap->map; |
| 149 | 143 | ||
| 150 | put_buffers: | 144 | put_buffers: |
| 151 | put_callchain_buffers(); | 145 | put_callchain_buffers(); |
| 152 | free_smap: | 146 | free_charge: |
| 147 | bpf_map_charge_finish(&mem); | ||
| 153 | bpf_map_area_free(smap); | 148 | bpf_map_area_free(smap); |
| 154 | return ERR_PTR(err); | 149 | return ERR_PTR(err); |
| 155 | } | 150 | } |
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ad3ccf82f31d..5d141f16f6fa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c | |||
| @@ -1,13 +1,5 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | 2 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com |
| 2 | * | ||
| 3 | * This program is free software; you can redistribute it and/or | ||
| 4 | * modify it under the terms of version 2 of the GNU General Public | ||
| 5 | * License as published by the Free Software Foundation. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, but | ||
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 10 | * General Public License for more details. | ||
| 11 | */ | 3 | */ |
| 12 | #include <linux/bpf.h> | 4 | #include <linux/bpf.h> |
| 13 | #include <linux/bpf_trace.h> | 5 | #include <linux/bpf_trace.h> |
| @@ -188,19 +180,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) | |||
| 188 | map->numa_node = bpf_map_attr_numa_node(attr); | 180 | map->numa_node = bpf_map_attr_numa_node(attr); |
| 189 | } | 181 | } |
| 190 | 182 | ||
| 191 | int bpf_map_precharge_memlock(u32 pages) | ||
| 192 | { | ||
| 193 | struct user_struct *user = get_current_user(); | ||
| 194 | unsigned long memlock_limit, cur; | ||
| 195 | |||
| 196 | memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; | ||
| 197 | cur = atomic_long_read(&user->locked_vm); | ||
| 198 | free_uid(user); | ||
| 199 | if (cur + pages > memlock_limit) | ||
| 200 | return -EPERM; | ||
| 201 | return 0; | ||
| 202 | } | ||
| 203 | |||
| 204 | static int bpf_charge_memlock(struct user_struct *user, u32 pages) | 183 | static int bpf_charge_memlock(struct user_struct *user, u32 pages) |
| 205 | { | 184 | { |
| 206 | unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; | 185 | unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; |
| @@ -214,45 +193,62 @@ static int bpf_charge_memlock(struct user_struct *user, u32 pages) | |||
| 214 | 193 | ||
| 215 | static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) | 194 | static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) |
| 216 | { | 195 | { |
| 217 | atomic_long_sub(pages, &user->locked_vm); | 196 | if (user) |
| 197 | atomic_long_sub(pages, &user->locked_vm); | ||
| 218 | } | 198 | } |
| 219 | 199 | ||
| 220 | static int bpf_map_init_memlock(struct bpf_map *map) | 200 | int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size) |
| 221 | { | 201 | { |
| 222 | struct user_struct *user = get_current_user(); | 202 | u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; |
| 203 | struct user_struct *user; | ||
| 223 | int ret; | 204 | int ret; |
| 224 | 205 | ||
| 225 | ret = bpf_charge_memlock(user, map->pages); | 206 | if (size >= U32_MAX - PAGE_SIZE) |
| 207 | return -E2BIG; | ||
| 208 | |||
| 209 | user = get_current_user(); | ||
| 210 | ret = bpf_charge_memlock(user, pages); | ||
| 226 | if (ret) { | 211 | if (ret) { |
| 227 | free_uid(user); | 212 | free_uid(user); |
| 228 | return ret; | 213 | return ret; |
| 229 | } | 214 | } |
| 230 | map->user = user; | 215 | |
| 231 | return ret; | 216 | mem->pages = pages; |
| 217 | mem->user = user; | ||
| 218 | |||
| 219 | return 0; | ||
| 232 | } | 220 | } |
| 233 | 221 | ||
| 234 | static void bpf_map_release_memlock(struct bpf_map *map) | 222 | void bpf_map_charge_finish(struct bpf_map_memory *mem) |
| 235 | { | 223 | { |
| 236 | struct user_struct *user = map->user; | 224 | bpf_uncharge_memlock(mem->user, mem->pages); |
| 237 | bpf_uncharge_memlock(user, map->pages); | 225 | free_uid(mem->user); |
| 238 | free_uid(user); | 226 | } |
| 227 | |||
| 228 | void bpf_map_charge_move(struct bpf_map_memory *dst, | ||
| 229 | struct bpf_map_memory *src) | ||
| 230 | { | ||
| 231 | *dst = *src; | ||
| 232 | |||
| 233 | /* Make sure src will not be used for the redundant uncharging. */ | ||
| 234 | memset(src, 0, sizeof(struct bpf_map_memory)); | ||
| 239 | } | 235 | } |
| 240 | 236 | ||
| 241 | int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) | 237 | int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) |
| 242 | { | 238 | { |
| 243 | int ret; | 239 | int ret; |
| 244 | 240 | ||
| 245 | ret = bpf_charge_memlock(map->user, pages); | 241 | ret = bpf_charge_memlock(map->memory.user, pages); |
| 246 | if (ret) | 242 | if (ret) |
| 247 | return ret; | 243 | return ret; |
| 248 | map->pages += pages; | 244 | map->memory.pages += pages; |
| 249 | return ret; | 245 | return ret; |
| 250 | } | 246 | } |
| 251 | 247 | ||
| 252 | void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) | 248 | void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) |
| 253 | { | 249 | { |
| 254 | bpf_uncharge_memlock(map->user, pages); | 250 | bpf_uncharge_memlock(map->memory.user, pages); |
| 255 | map->pages -= pages; | 251 | map->memory.pages -= pages; |
| 256 | } | 252 | } |
| 257 | 253 | ||
| 258 | static int bpf_map_alloc_id(struct bpf_map *map) | 254 | static int bpf_map_alloc_id(struct bpf_map *map) |
| @@ -303,11 +299,13 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) | |||
| 303 | static void bpf_map_free_deferred(struct work_struct *work) | 299 | static void bpf_map_free_deferred(struct work_struct *work) |
| 304 | { | 300 | { |
| 305 | struct bpf_map *map = container_of(work, struct bpf_map, work); | 301 | struct bpf_map *map = container_of(work, struct bpf_map, work); |
| 302 | struct bpf_map_memory mem; | ||
| 306 | 303 | ||
| 307 | bpf_map_release_memlock(map); | 304 | bpf_map_charge_move(&mem, &map->memory); |
| 308 | security_bpf_map_free(map); | 305 | security_bpf_map_free(map); |
| 309 | /* implementation dependent freeing */ | 306 | /* implementation dependent freeing */ |
| 310 | map->ops->map_free(map); | 307 | map->ops->map_free(map); |
| 308 | bpf_map_charge_finish(&mem); | ||
| 311 | } | 309 | } |
| 312 | 310 | ||
| 313 | static void bpf_map_put_uref(struct bpf_map *map) | 311 | static void bpf_map_put_uref(struct bpf_map *map) |
| @@ -395,7 +393,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) | |||
| 395 | map->value_size, | 393 | map->value_size, |
| 396 | map->max_entries, | 394 | map->max_entries, |
| 397 | map->map_flags, | 395 | map->map_flags, |
| 398 | map->pages * 1ULL << PAGE_SHIFT, | 396 | map->memory.pages * 1ULL << PAGE_SHIFT, |
| 399 | map->id, | 397 | map->id, |
| 400 | READ_ONCE(map->frozen)); | 398 | READ_ONCE(map->frozen)); |
| 401 | 399 | ||
| @@ -549,6 +547,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, | |||
| 549 | static int map_create(union bpf_attr *attr) | 547 | static int map_create(union bpf_attr *attr) |
| 550 | { | 548 | { |
| 551 | int numa_node = bpf_map_attr_numa_node(attr); | 549 | int numa_node = bpf_map_attr_numa_node(attr); |
| 550 | struct bpf_map_memory mem; | ||
| 552 | struct bpf_map *map; | 551 | struct bpf_map *map; |
| 553 | int f_flags; | 552 | int f_flags; |
| 554 | int err; | 553 | int err; |
| @@ -573,7 +572,7 @@ static int map_create(union bpf_attr *attr) | |||
| 573 | 572 | ||
| 574 | err = bpf_obj_name_cpy(map->name, attr->map_name); | 573 | err = bpf_obj_name_cpy(map->name, attr->map_name); |
| 575 | if (err) | 574 | if (err) |
| 576 | goto free_map_nouncharge; | 575 | goto free_map; |
| 577 | 576 | ||
| 578 | atomic_set(&map->refcnt, 1); | 577 | atomic_set(&map->refcnt, 1); |
| 579 | atomic_set(&map->usercnt, 1); | 578 | atomic_set(&map->usercnt, 1); |
| @@ -583,20 +582,20 @@ static int map_create(union bpf_attr *attr) | |||
| 583 | 582 | ||
| 584 | if (!attr->btf_value_type_id) { | 583 | if (!attr->btf_value_type_id) { |
| 585 | err = -EINVAL; | 584 | err = -EINVAL; |
| 586 | goto free_map_nouncharge; | 585 | goto free_map; |
| 587 | } | 586 | } |
| 588 | 587 | ||
| 589 | btf = btf_get_by_fd(attr->btf_fd); | 588 | btf = btf_get_by_fd(attr->btf_fd); |
| 590 | if (IS_ERR(btf)) { | 589 | if (IS_ERR(btf)) { |
| 591 | err = PTR_ERR(btf); | 590 | err = PTR_ERR(btf); |
| 592 | goto free_map_nouncharge; | 591 | goto free_map; |
| 593 | } | 592 | } |
| 594 | 593 | ||
| 595 | err = map_check_btf(map, btf, attr->btf_key_type_id, | 594 | err = map_check_btf(map, btf, attr->btf_key_type_id, |
| 596 | attr->btf_value_type_id); | 595 | attr->btf_value_type_id); |
| 597 | if (err) { | 596 | if (err) { |
| 598 | btf_put(btf); | 597 | btf_put(btf); |
| 599 | goto free_map_nouncharge; | 598 | goto free_map; |
| 600 | } | 599 | } |
| 601 | 600 | ||
| 602 | map->btf = btf; | 601 | map->btf = btf; |
| @@ -608,15 +607,11 @@ static int map_create(union bpf_attr *attr) | |||
| 608 | 607 | ||
| 609 | err = security_bpf_map_alloc(map); | 608 | err = security_bpf_map_alloc(map); |
| 610 | if (err) | 609 | if (err) |
| 611 | goto free_map_nouncharge; | 610 | goto free_map; |
| 612 | |||
| 613 | err = bpf_map_init_memlock(map); | ||
| 614 | if (err) | ||
| 615 | goto free_map_sec; | ||
| 616 | 611 | ||
| 617 | err = bpf_map_alloc_id(map); | 612 | err = bpf_map_alloc_id(map); |
| 618 | if (err) | 613 | if (err) |
| 619 | goto free_map; | 614 | goto free_map_sec; |
| 620 | 615 | ||
| 621 | err = bpf_map_new_fd(map, f_flags); | 616 | err = bpf_map_new_fd(map, f_flags); |
| 622 | if (err < 0) { | 617 | if (err < 0) { |
| @@ -632,13 +627,13 @@ static int map_create(union bpf_attr *attr) | |||
| 632 | 627 | ||
| 633 | return err; | 628 | return err; |
| 634 | 629 | ||
| 635 | free_map: | ||
| 636 | bpf_map_release_memlock(map); | ||
| 637 | free_map_sec: | 630 | free_map_sec: |
| 638 | security_bpf_map_free(map); | 631 | security_bpf_map_free(map); |
| 639 | free_map_nouncharge: | 632 | free_map: |
| 640 | btf_put(map->btf); | 633 | btf_put(map->btf); |
| 634 | bpf_map_charge_move(&mem, &map->memory); | ||
| 641 | map->ops->map_free(map); | 635 | map->ops->map_free(map); |
| 636 | bpf_map_charge_finish(&mem); | ||
| 642 | return err; | 637 | return err; |
| 643 | } | 638 | } |
| 644 | 639 | ||
| @@ -808,7 +803,10 @@ static int map_lookup_elem(union bpf_attr *attr) | |||
| 808 | err = map->ops->map_peek_elem(map, value); | 803 | err = map->ops->map_peek_elem(map, value); |
| 809 | } else { | 804 | } else { |
| 810 | rcu_read_lock(); | 805 | rcu_read_lock(); |
| 811 | ptr = map->ops->map_lookup_elem(map, key); | 806 | if (map->ops->map_lookup_elem_sys_only) |
| 807 | ptr = map->ops->map_lookup_elem_sys_only(map, key); | ||
| 808 | else | ||
| 809 | ptr = map->ops->map_lookup_elem(map, key); | ||
| 812 | if (IS_ERR(ptr)) { | 810 | if (IS_ERR(ptr)) { |
| 813 | err = PTR_ERR(ptr); | 811 | err = PTR_ERR(ptr); |
| 814 | } else if (!ptr) { | 812 | } else if (!ptr) { |
| @@ -1578,6 +1576,24 @@ bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, | |||
| 1578 | case BPF_CGROUP_INET6_CONNECT: | 1576 | case BPF_CGROUP_INET6_CONNECT: |
| 1579 | case BPF_CGROUP_UDP4_SENDMSG: | 1577 | case BPF_CGROUP_UDP4_SENDMSG: |
| 1580 | case BPF_CGROUP_UDP6_SENDMSG: | 1578 | case BPF_CGROUP_UDP6_SENDMSG: |
| 1579 | case BPF_CGROUP_UDP4_RECVMSG: | ||
| 1580 | case BPF_CGROUP_UDP6_RECVMSG: | ||
| 1581 | return 0; | ||
| 1582 | default: | ||
| 1583 | return -EINVAL; | ||
| 1584 | } | ||
| 1585 | case BPF_PROG_TYPE_CGROUP_SKB: | ||
| 1586 | switch (expected_attach_type) { | ||
| 1587 | case BPF_CGROUP_INET_INGRESS: | ||
| 1588 | case BPF_CGROUP_INET_EGRESS: | ||
| 1589 | return 0; | ||
| 1590 | default: | ||
| 1591 | return -EINVAL; | ||
| 1592 | } | ||
| 1593 | case BPF_PROG_TYPE_CGROUP_SOCKOPT: | ||
| 1594 | switch (expected_attach_type) { | ||
| 1595 | case BPF_CGROUP_SETSOCKOPT: | ||
| 1596 | case BPF_CGROUP_GETSOCKOPT: | ||
| 1581 | return 0; | 1597 | return 0; |
| 1582 | default: | 1598 | default: |
| 1583 | return -EINVAL; | 1599 | return -EINVAL; |
| @@ -1601,7 +1617,9 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) | |||
| 1601 | if (CHECK_ATTR(BPF_PROG_LOAD)) | 1617 | if (CHECK_ATTR(BPF_PROG_LOAD)) |
| 1602 | return -EINVAL; | 1618 | return -EINVAL; |
| 1603 | 1619 | ||
| 1604 | if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | BPF_F_ANY_ALIGNMENT)) | 1620 | if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | |
| 1621 | BPF_F_ANY_ALIGNMENT | | ||
| 1622 | BPF_F_TEST_RND_HI32)) | ||
| 1605 | return -EINVAL; | 1623 | return -EINVAL; |
| 1606 | 1624 | ||
| 1607 | if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && | 1625 | if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && |
| @@ -1671,7 +1689,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) | |||
| 1671 | if (err < 0) | 1689 | if (err < 0) |
| 1672 | goto free_prog; | 1690 | goto free_prog; |
| 1673 | 1691 | ||
| 1674 | prog->aux->load_time = ktime_get_boot_ns(); | 1692 | prog->aux->load_time = ktime_get_boottime_ns(); |
| 1675 | err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); | 1693 | err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); |
| 1676 | if (err) | 1694 | if (err) |
| 1677 | goto free_prog; | 1695 | goto free_prog; |
| @@ -1830,7 +1848,12 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, | |||
| 1830 | switch (prog->type) { | 1848 | switch (prog->type) { |
| 1831 | case BPF_PROG_TYPE_CGROUP_SOCK: | 1849 | case BPF_PROG_TYPE_CGROUP_SOCK: |
| 1832 | case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: | 1850 | case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: |
| 1851 | case BPF_PROG_TYPE_CGROUP_SOCKOPT: | ||
| 1833 | return attach_type == prog->expected_attach_type ? 0 : -EINVAL; | 1852 | return attach_type == prog->expected_attach_type ? 0 : -EINVAL; |
| 1853 | case BPF_PROG_TYPE_CGROUP_SKB: | ||
| 1854 | return prog->enforce_expected_attach_type && | ||
| 1855 | prog->expected_attach_type != attach_type ? | ||
| 1856 | -EINVAL : 0; | ||
| 1834 | default: | 1857 | default: |
| 1835 | return 0; | 1858 | return 0; |
| 1836 | } | 1859 | } |
| @@ -1872,6 +1895,8 @@ static int bpf_prog_attach(const union bpf_attr *attr) | |||
| 1872 | case BPF_CGROUP_INET6_CONNECT: | 1895 | case BPF_CGROUP_INET6_CONNECT: |
| 1873 | case BPF_CGROUP_UDP4_SENDMSG: | 1896 | case BPF_CGROUP_UDP4_SENDMSG: |
| 1874 | case BPF_CGROUP_UDP6_SENDMSG: | 1897 | case BPF_CGROUP_UDP6_SENDMSG: |
| 1898 | case BPF_CGROUP_UDP4_RECVMSG: | ||
| 1899 | case BPF_CGROUP_UDP6_RECVMSG: | ||
| 1875 | ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; | 1900 | ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; |
| 1876 | break; | 1901 | break; |
| 1877 | case BPF_CGROUP_SOCK_OPS: | 1902 | case BPF_CGROUP_SOCK_OPS: |
| @@ -1896,6 +1921,10 @@ static int bpf_prog_attach(const union bpf_attr *attr) | |||
| 1896 | case BPF_CGROUP_SYSCTL: | 1921 | case BPF_CGROUP_SYSCTL: |
| 1897 | ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; | 1922 | ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; |
| 1898 | break; | 1923 | break; |
| 1924 | case BPF_CGROUP_GETSOCKOPT: | ||
| 1925 | case BPF_CGROUP_SETSOCKOPT: | ||
| 1926 | ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; | ||
| 1927 | break; | ||
| 1899 | default: | 1928 | default: |
| 1900 | return -EINVAL; | 1929 | return -EINVAL; |
| 1901 | } | 1930 | } |
| @@ -1957,6 +1986,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) | |||
| 1957 | case BPF_CGROUP_INET6_CONNECT: | 1986 | case BPF_CGROUP_INET6_CONNECT: |
| 1958 | case BPF_CGROUP_UDP4_SENDMSG: | 1987 | case BPF_CGROUP_UDP4_SENDMSG: |
| 1959 | case BPF_CGROUP_UDP6_SENDMSG: | 1988 | case BPF_CGROUP_UDP6_SENDMSG: |
| 1989 | case BPF_CGROUP_UDP4_RECVMSG: | ||
| 1990 | case BPF_CGROUP_UDP6_RECVMSG: | ||
| 1960 | ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; | 1991 | ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; |
| 1961 | break; | 1992 | break; |
| 1962 | case BPF_CGROUP_SOCK_OPS: | 1993 | case BPF_CGROUP_SOCK_OPS: |
| @@ -1977,6 +2008,10 @@ static int bpf_prog_detach(const union bpf_attr *attr) | |||
| 1977 | case BPF_CGROUP_SYSCTL: | 2008 | case BPF_CGROUP_SYSCTL: |
| 1978 | ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; | 2009 | ptype = BPF_PROG_TYPE_CGROUP_SYSCTL; |
| 1979 | break; | 2010 | break; |
| 2011 | case BPF_CGROUP_GETSOCKOPT: | ||
| 2012 | case BPF_CGROUP_SETSOCKOPT: | ||
| 2013 | ptype = BPF_PROG_TYPE_CGROUP_SOCKOPT; | ||
| 2014 | break; | ||
| 1980 | default: | 2015 | default: |
| 1981 | return -EINVAL; | 2016 | return -EINVAL; |
| 1982 | } | 2017 | } |
| @@ -2008,9 +2043,13 @@ static int bpf_prog_query(const union bpf_attr *attr, | |||
| 2008 | case BPF_CGROUP_INET6_CONNECT: | 2043 | case BPF_CGROUP_INET6_CONNECT: |
| 2009 | case BPF_CGROUP_UDP4_SENDMSG: | 2044 | case BPF_CGROUP_UDP4_SENDMSG: |
| 2010 | case BPF_CGROUP_UDP6_SENDMSG: | 2045 | case BPF_CGROUP_UDP6_SENDMSG: |
| 2046 | case BPF_CGROUP_UDP4_RECVMSG: | ||
| 2047 | case BPF_CGROUP_UDP6_RECVMSG: | ||
| 2011 | case BPF_CGROUP_SOCK_OPS: | 2048 | case BPF_CGROUP_SOCK_OPS: |
| 2012 | case BPF_CGROUP_DEVICE: | 2049 | case BPF_CGROUP_DEVICE: |
| 2013 | case BPF_CGROUP_SYSCTL: | 2050 | case BPF_CGROUP_SYSCTL: |
| 2051 | case BPF_CGROUP_GETSOCKOPT: | ||
| 2052 | case BPF_CGROUP_SETSOCKOPT: | ||
| 2014 | break; | 2053 | break; |
| 2015 | case BPF_LIRC_MODE2: | 2054 | case BPF_LIRC_MODE2: |
| 2016 | return lirc_prog_query(attr, uattr); | 2055 | return lirc_prog_query(attr, uattr); |
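
The attach-type hunks above wire the new BPF_CGROUP_UDP4_RECVMSG / BPF_CGROUP_UDP6_RECVMSG hooks and the BPF_PROG_TYPE_CGROUP_SOCKOPT program type (BPF_CGROUP_GETSOCKOPT / BPF_CGROUP_SETSOCKOPT) through the load-time expected_attach_type check, bpf_prog_attach(), bpf_prog_detach() and bpf_prog_query(). A user-space sketch of attaching a getsockopt program to a cgroup, assuming a libbpf and UAPI headers that already carry these enum values; the cgroup path is hypothetical:

    #include <fcntl.h>
    #include <bpf/bpf.h>        /* bpf_prog_attach() */

    static int attach_getsockopt(int prog_fd)
    {
            /* The program must have been loaded as
             * BPF_PROG_TYPE_CGROUP_SOCKOPT with expected_attach_type =
             * BPF_CGROUP_GETSOCKOPT, or the check added to
             * bpf_prog_attach_check_attach_type() above rejects it. */
            int cgroup_fd = open("/sys/fs/cgroup/unified/mygroup", O_RDONLY);

            if (cgroup_fd < 0)
                    return -1;
            return bpf_prog_attach(prog_fd, cgroup_fd,
                                   BPF_CGROUP_GETSOCKOPT, 0);
    }
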
diff --git a/kernel/bpf/tnum.c b/kernel/bpf/tnum.c index 938d41211be7..ca52b9642943 100644 --- a/kernel/bpf/tnum.c +++ b/kernel/bpf/tnum.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* tnum: tracked (or tristate) numbers | 2 | /* tnum: tracked (or tristate) numbers |
| 2 | * | 3 | * |
| 3 | * A tnum tracks knowledge about the bits of a value. Each bit can be either | 4 | * A tnum tracks knowledge about the bits of a value. Each bit can be either |
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 95f9354495ad..a2e763703c30 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c | |||
| @@ -1,15 +1,7 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com | 2 | /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com |
| 2 | * Copyright (c) 2016 Facebook | 3 | * Copyright (c) 2016 Facebook |
| 3 | * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io | 4 | * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io |
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of version 2 of the GNU General Public | ||
| 7 | * License as published by the Free Software Foundation. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, but | ||
| 10 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 12 | * General Public License for more details. | ||
| 13 | */ | 5 | */ |
| 14 | #include <uapi/linux/btf.h> | 6 | #include <uapi/linux/btf.h> |
| 15 | #include <linux/kernel.h> | 7 | #include <linux/kernel.h> |
| @@ -176,7 +168,7 @@ struct bpf_verifier_stack_elem { | |||
| 176 | struct bpf_verifier_stack_elem *next; | 168 | struct bpf_verifier_stack_elem *next; |
| 177 | }; | 169 | }; |
| 178 | 170 | ||
| 179 | #define BPF_COMPLEXITY_LIMIT_STACK 1024 | 171 | #define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 |
| 180 | #define BPF_COMPLEXITY_LIMIT_STATES 64 | 172 | #define BPF_COMPLEXITY_LIMIT_STATES 64 |
| 181 | 173 | ||
| 182 | #define BPF_MAP_PTR_UNPRIV 1UL | 174 | #define BPF_MAP_PTR_UNPRIV 1UL |
| @@ -334,7 +326,8 @@ static bool type_is_sk_pointer(enum bpf_reg_type type) | |||
| 334 | { | 326 | { |
| 335 | return type == PTR_TO_SOCKET || | 327 | return type == PTR_TO_SOCKET || |
| 336 | type == PTR_TO_SOCK_COMMON || | 328 | type == PTR_TO_SOCK_COMMON || |
| 337 | type == PTR_TO_TCP_SOCK; | 329 | type == PTR_TO_TCP_SOCK || |
| 330 | type == PTR_TO_XDP_SOCK; | ||
| 338 | } | 331 | } |
| 339 | 332 | ||
| 340 | static bool reg_type_may_be_null(enum bpf_reg_type type) | 333 | static bool reg_type_may_be_null(enum bpf_reg_type type) |
| @@ -406,6 +399,7 @@ static const char * const reg_type_str[] = { | |||
| 406 | [PTR_TO_TCP_SOCK] = "tcp_sock", | 399 | [PTR_TO_TCP_SOCK] = "tcp_sock", |
| 407 | [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", | 400 | [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", |
| 408 | [PTR_TO_TP_BUFFER] = "tp_buffer", | 401 | [PTR_TO_TP_BUFFER] = "tp_buffer", |
| 402 | [PTR_TO_XDP_SOCK] = "xdp_sock", | ||
| 409 | }; | 403 | }; |
| 410 | 404 | ||
| 411 | static char slot_type_char[] = { | 405 | static char slot_type_char[] = { |
| @@ -453,12 +447,12 @@ static void print_verifier_state(struct bpf_verifier_env *env, | |||
| 453 | verbose(env, " R%d", i); | 447 | verbose(env, " R%d", i); |
| 454 | print_liveness(env, reg->live); | 448 | print_liveness(env, reg->live); |
| 455 | verbose(env, "=%s", reg_type_str[t]); | 449 | verbose(env, "=%s", reg_type_str[t]); |
| 450 | if (t == SCALAR_VALUE && reg->precise) | ||
| 451 | verbose(env, "P"); | ||
| 456 | if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && | 452 | if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && |
| 457 | tnum_is_const(reg->var_off)) { | 453 | tnum_is_const(reg->var_off)) { |
| 458 | /* reg->off should be 0 for SCALAR_VALUE */ | 454 | /* reg->off should be 0 for SCALAR_VALUE */ |
| 459 | verbose(env, "%lld", reg->var_off.value + reg->off); | 455 | verbose(env, "%lld", reg->var_off.value + reg->off); |
| 460 | if (t == PTR_TO_STACK) | ||
| 461 | verbose(env, ",call_%d", func(env, reg)->callsite); | ||
| 462 | } else { | 456 | } else { |
| 463 | verbose(env, "(id=%d", reg->id); | 457 | verbose(env, "(id=%d", reg->id); |
| 464 | if (reg_type_may_be_refcounted_or_null(t)) | 458 | if (reg_type_may_be_refcounted_or_null(t)) |
| @@ -520,11 +514,17 @@ static void print_verifier_state(struct bpf_verifier_env *env, | |||
| 520 | continue; | 514 | continue; |
| 521 | verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); | 515 | verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); |
| 522 | print_liveness(env, state->stack[i].spilled_ptr.live); | 516 | print_liveness(env, state->stack[i].spilled_ptr.live); |
| 523 | if (state->stack[i].slot_type[0] == STACK_SPILL) | 517 | if (state->stack[i].slot_type[0] == STACK_SPILL) { |
| 524 | verbose(env, "=%s", | 518 | reg = &state->stack[i].spilled_ptr; |
| 525 | reg_type_str[state->stack[i].spilled_ptr.type]); | 519 | t = reg->type; |
| 526 | else | 520 | verbose(env, "=%s", reg_type_str[t]); |
| 521 | if (t == SCALAR_VALUE && reg->precise) | ||
| 522 | verbose(env, "P"); | ||
| 523 | if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) | ||
| 524 | verbose(env, "%lld", reg->var_off.value + reg->off); | ||
| 525 | } else { | ||
| 527 | verbose(env, "=%s", types_buf); | 526 | verbose(env, "=%s", types_buf); |
| 527 | } | ||
| 528 | } | 528 | } |
| 529 | if (state->acquired_refs && state->refs[0].id) { | 529 | if (state->acquired_refs && state->refs[0].id) { |
| 530 | verbose(env, " refs=%d", state->refs[0].id); | 530 | verbose(env, " refs=%d", state->refs[0].id); |
| @@ -673,6 +673,13 @@ static void free_func_state(struct bpf_func_state *state) | |||
| 673 | kfree(state); | 673 | kfree(state); |
| 674 | } | 674 | } |
| 675 | 675 | ||
| 676 | static void clear_jmp_history(struct bpf_verifier_state *state) | ||
| 677 | { | ||
| 678 | kfree(state->jmp_history); | ||
| 679 | state->jmp_history = NULL; | ||
| 680 | state->jmp_history_cnt = 0; | ||
| 681 | } | ||
| 682 | |||
| 676 | static void free_verifier_state(struct bpf_verifier_state *state, | 683 | static void free_verifier_state(struct bpf_verifier_state *state, |
| 677 | bool free_self) | 684 | bool free_self) |
| 678 | { | 685 | { |
| @@ -682,6 +689,7 @@ static void free_verifier_state(struct bpf_verifier_state *state, | |||
| 682 | free_func_state(state->frame[i]); | 689 | free_func_state(state->frame[i]); |
| 683 | state->frame[i] = NULL; | 690 | state->frame[i] = NULL; |
| 684 | } | 691 | } |
| 692 | clear_jmp_history(state); | ||
| 685 | if (free_self) | 693 | if (free_self) |
| 686 | kfree(state); | 694 | kfree(state); |
| 687 | } | 695 | } |
| @@ -709,8 +717,18 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, | |||
| 709 | const struct bpf_verifier_state *src) | 717 | const struct bpf_verifier_state *src) |
| 710 | { | 718 | { |
| 711 | struct bpf_func_state *dst; | 719 | struct bpf_func_state *dst; |
| 720 | u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt; | ||
| 712 | int i, err; | 721 | int i, err; |
| 713 | 722 | ||
| 723 | if (dst_state->jmp_history_cnt < src->jmp_history_cnt) { | ||
| 724 | kfree(dst_state->jmp_history); | ||
| 725 | dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER); | ||
| 726 | if (!dst_state->jmp_history) | ||
| 727 | return -ENOMEM; | ||
| 728 | } | ||
| 729 | memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz); | ||
| 730 | dst_state->jmp_history_cnt = src->jmp_history_cnt; | ||
| 731 | |||
| 714 | /* if dst has more stack frames then src frame, free them */ | 732 | /* if dst has more stack frames then src frame, free them */ |
| 715 | for (i = src->curframe + 1; i <= dst_state->curframe; i++) { | 733 | for (i = src->curframe + 1; i <= dst_state->curframe; i++) { |
| 716 | free_func_state(dst_state->frame[i]); | 734 | free_func_state(dst_state->frame[i]); |
| @@ -719,6 +737,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, | |||
| 719 | dst_state->speculative = src->speculative; | 737 | dst_state->speculative = src->speculative; |
| 720 | dst_state->curframe = src->curframe; | 738 | dst_state->curframe = src->curframe; |
| 721 | dst_state->active_spin_lock = src->active_spin_lock; | 739 | dst_state->active_spin_lock = src->active_spin_lock; |
| 740 | dst_state->branches = src->branches; | ||
| 741 | dst_state->parent = src->parent; | ||
| 742 | dst_state->first_insn_idx = src->first_insn_idx; | ||
| 743 | dst_state->last_insn_idx = src->last_insn_idx; | ||
| 722 | for (i = 0; i <= src->curframe; i++) { | 744 | for (i = 0; i <= src->curframe; i++) { |
| 723 | dst = dst_state->frame[i]; | 745 | dst = dst_state->frame[i]; |
| 724 | if (!dst) { | 746 | if (!dst) { |
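
copy_verifier_state() now also carries the per-state jump history (jmp_history) along with the pending-branch counter and the parent/first_insn_idx/last_insn_idx bookkeeping. The history is a list of (idx, prev_idx) pairs recorded at every branch, call and exit by push_jmp_history() further down; it lets the precision-backtracking code replay a state's instructions in reverse via get_prev_insn_idx(). A rough sketch of how such a replay consumes the history (illustrative only, not the kernel's backtracking loop):

    #include <linux/bpf_verifier.h>

    static void replay_backwards(struct bpf_verifier_state *st)
    {
            u32 cnt = st->jmp_history_cnt;
            int i = st->last_insn_idx;

            for (;;) {
                    /* index i identifies the instruction to inspect */
                    if (i == st->first_insn_idx)
                            break;
                    if (cnt && st->jmp_history[cnt - 1].idx == i) {
                            /* a recorded jump led here; follow it back */
                            i = st->jmp_history[cnt - 1].prev_idx;
                            cnt--;
                    } else {
                            /* straight-line execution */
                            i--;
                    }
            }
    }
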
| @@ -734,6 +756,23 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, | |||
| 734 | return 0; | 756 | return 0; |
| 735 | } | 757 | } |
| 736 | 758 | ||
| 759 | static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) | ||
| 760 | { | ||
| 761 | while (st) { | ||
| 762 | u32 br = --st->branches; | ||
| 763 | |||
| 764 | /* WARN_ON(br > 1) technically makes sense here, | ||
| 765 | * but see comment in push_stack(), hence: | ||
| 766 | */ | ||
| 767 | WARN_ONCE((int)br < 0, | ||
| 768 | "BUG update_branch_counts:branches_to_explore=%d\n", | ||
| 769 | br); | ||
| 770 | if (br) | ||
| 771 | break; | ||
| 772 | st = st->parent; | ||
| 773 | } | ||
| 774 | } | ||
| 775 | |||
| 737 | static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, | 776 | static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, |
| 738 | int *insn_idx) | 777 | int *insn_idx) |
| 739 | { | 778 | { |
| @@ -782,10 +821,23 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, | |||
| 782 | if (err) | 821 | if (err) |
| 783 | goto err; | 822 | goto err; |
| 784 | elem->st.speculative |= speculative; | 823 | elem->st.speculative |= speculative; |
| 785 | if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { | 824 | if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { |
| 786 | verbose(env, "BPF program is too complex\n"); | 825 | verbose(env, "The sequence of %d jumps is too complex.\n", |
| 826 | env->stack_size); | ||
| 787 | goto err; | 827 | goto err; |
| 788 | } | 828 | } |
| 829 | if (elem->st.parent) { | ||
| 830 | ++elem->st.parent->branches; | ||
| 831 | /* WARN_ON(branches > 2) technically makes sense here, | ||
| 832 | * but | ||
| 833 | * 1. speculative states will bump 'branches' for non-branch | ||
| 834 | * instructions | ||
| 835 | * 2. is_state_visited() heuristics may decide not to create | ||
| 836 | * a new state for a sequence of branches and all such current | ||
| 837 | * and cloned states will be pointing to a single parent state | ||
| 838 | * which might have large 'branches' count. | ||
| 839 | */ | ||
| 840 | } | ||
| 789 | return &elem->st; | 841 | return &elem->st; |
| 790 | err: | 842 | err: |
| 791 | free_verifier_state(env->cur_state, true); | 843 | free_verifier_state(env->cur_state, true); |
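
push_stack() above now expresses the exploration limit as a jump-sequence bound (BPF_COMPLEXITY_LIMIT_JMP_SEQ, 8192 pending jumps) and, together with update_branch_counts(), maintains a per-state count of still-unexplored branches: cloning a state for the other arm of a conditional bumps the parent's count, and finishing a path walks the parent chain decrementing it, so later pruning code can tell which states are still live. A toy model of that counter (user-space C, not the kernel code):

    #include <stdio.h>

    struct toy_state {
            struct toy_state *parent;
            int branches;
    };

    /* Same shape as update_branch_counts(): stop at the first ancestor
     * that still has other branches pending. */
    static void toy_update_branch_counts(struct toy_state *st)
    {
            while (st && --st->branches == 0)
                    st = st->parent;
    }

    int main(void)
    {
            struct toy_state parent = { .parent = NULL, .branches = 1 };

            parent.branches++;                      /* push_stack(): second arm queued */
            toy_update_branch_counts(&parent);      /* first arm fully explored */
            printf("branches left: %d\n", parent.branches);  /* prints 1 */
            toy_update_branch_counts(&parent);      /* second arm fully explored */
            printf("branches left: %d\n", parent.branches);  /* prints 0 */
            return 0;
    }
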
| @@ -933,6 +985,9 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg) | |||
| 933 | reg->smax_value = S64_MAX; | 985 | reg->smax_value = S64_MAX; |
| 934 | reg->umin_value = 0; | 986 | reg->umin_value = 0; |
| 935 | reg->umax_value = U64_MAX; | 987 | reg->umax_value = U64_MAX; |
| 988 | |||
| 989 | /* constant backtracking is enabled for root only for now */ | ||
| 990 | reg->precise = capable(CAP_SYS_ADMIN) ? false : true; | ||
| 936 | } | 991 | } |
| 937 | 992 | ||
| 938 | /* Mark a register as having a completely unknown (scalar) value. */ | 993 | /* Mark a register as having a completely unknown (scalar) value. */ |
| @@ -981,6 +1036,7 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, | |||
| 981 | __mark_reg_not_init(regs + regno); | 1036 | __mark_reg_not_init(regs + regno); |
| 982 | } | 1037 | } |
| 983 | 1038 | ||
| 1039 | #define DEF_NOT_SUBREG (0) | ||
| 984 | static void init_reg_state(struct bpf_verifier_env *env, | 1040 | static void init_reg_state(struct bpf_verifier_env *env, |
| 985 | struct bpf_func_state *state) | 1041 | struct bpf_func_state *state) |
| 986 | { | 1042 | { |
| @@ -991,6 +1047,7 @@ static void init_reg_state(struct bpf_verifier_env *env, | |||
| 991 | mark_reg_not_init(env, regs, i); | 1047 | mark_reg_not_init(env, regs, i); |
| 992 | regs[i].live = REG_LIVE_NONE; | 1048 | regs[i].live = REG_LIVE_NONE; |
| 993 | regs[i].parent = NULL; | 1049 | regs[i].parent = NULL; |
| 1050 | regs[i].subreg_def = DEF_NOT_SUBREG; | ||
| 994 | } | 1051 | } |
| 995 | 1052 | ||
| 996 | /* frame pointer */ | 1053 | /* frame pointer */ |
| @@ -1136,7 +1193,7 @@ next: | |||
| 1136 | */ | 1193 | */ |
| 1137 | static int mark_reg_read(struct bpf_verifier_env *env, | 1194 | static int mark_reg_read(struct bpf_verifier_env *env, |
| 1138 | const struct bpf_reg_state *state, | 1195 | const struct bpf_reg_state *state, |
| 1139 | struct bpf_reg_state *parent) | 1196 | struct bpf_reg_state *parent, u8 flag) |
| 1140 | { | 1197 | { |
| 1141 | bool writes = parent == state->parent; /* Observe write marks */ | 1198 | bool writes = parent == state->parent; /* Observe write marks */ |
| 1142 | int cnt = 0; | 1199 | int cnt = 0; |
| @@ -1151,17 +1208,26 @@ static int mark_reg_read(struct bpf_verifier_env *env, | |||
| 1151 | parent->var_off.value, parent->off); | 1208 | parent->var_off.value, parent->off); |
| 1152 | return -EFAULT; | 1209 | return -EFAULT; |
| 1153 | } | 1210 | } |
| 1154 | if (parent->live & REG_LIVE_READ) | 1211 | /* The first condition is more likely to be true than the |
| 1212 | * second, checked it first. | ||
| 1213 | */ | ||
| 1214 | if ((parent->live & REG_LIVE_READ) == flag || | ||
| 1215 | parent->live & REG_LIVE_READ64) | ||
| 1155 | /* The parentage chain never changes and | 1216 | /* The parentage chain never changes and |
| 1156 | * this parent was already marked as LIVE_READ. | 1217 | * this parent was already marked as LIVE_READ. |
| 1157 | * There is no need to keep walking the chain again and | 1218 | * There is no need to keep walking the chain again and |
| 1158 | * keep re-marking all parents as LIVE_READ. | 1219 | * keep re-marking all parents as LIVE_READ. |
| 1159 | * This case happens when the same register is read | 1220 | * This case happens when the same register is read |
| 1160 | * multiple times without writes into it in-between. | 1221 | * multiple times without writes into it in-between. |
| 1222 | * Also, if parent has the stronger REG_LIVE_READ64 set, | ||
| 1223 | * then no need to set the weak REG_LIVE_READ32. | ||
| 1161 | */ | 1224 | */ |
| 1162 | break; | 1225 | break; |
| 1163 | /* ... then we depend on parent's value */ | 1226 | /* ... then we depend on parent's value */ |
| 1164 | parent->live |= REG_LIVE_READ; | 1227 | parent->live |= flag; |
| 1228 | /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ | ||
| 1229 | if (flag == REG_LIVE_READ64) | ||
| 1230 | parent->live &= ~REG_LIVE_READ32; | ||
| 1165 | state = parent; | 1231 | state = parent; |
| 1166 | parent = state->parent; | 1232 | parent = state->parent; |
| 1167 | writes = true; | 1233 | writes = true; |
| @@ -1173,12 +1239,129 @@ static int mark_reg_read(struct bpf_verifier_env *env, | |||
| 1173 | return 0; | 1239 | return 0; |
| 1174 | } | 1240 | } |
| 1175 | 1241 | ||
| 1242 | /* This function is supposed to be used by the following 32-bit optimization | ||
| 1243 | * code only. It returns TRUE if the source or destination register operates | ||
| 1244 | * on 64-bit, otherwise return FALSE. | ||
| 1245 | */ | ||
| 1246 | static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, | ||
| 1247 | u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) | ||
| 1248 | { | ||
| 1249 | u8 code, class, op; | ||
| 1250 | |||
| 1251 | code = insn->code; | ||
| 1252 | class = BPF_CLASS(code); | ||
| 1253 | op = BPF_OP(code); | ||
| 1254 | if (class == BPF_JMP) { | ||
| 1255 | /* BPF_EXIT for "main" will reach here. Return TRUE | ||
| 1256 | * conservatively. | ||
| 1257 | */ | ||
| 1258 | if (op == BPF_EXIT) | ||
| 1259 | return true; | ||
| 1260 | if (op == BPF_CALL) { | ||
| 1261 | /* BPF to BPF call will reach here because of marking | ||
| 1262 | * caller saved clobber with DST_OP_NO_MARK for which we | ||
| 1263 | * don't care the register def because they are anyway | ||
| 1264 | * marked as NOT_INIT already. | ||
| 1265 | */ | ||
| 1266 | if (insn->src_reg == BPF_PSEUDO_CALL) | ||
| 1267 | return false; | ||
| 1268 | /* Helper call will reach here because of arg type | ||
| 1269 | * check, conservatively return TRUE. | ||
| 1270 | */ | ||
| 1271 | if (t == SRC_OP) | ||
| 1272 | return true; | ||
| 1273 | |||
| 1274 | return false; | ||
| 1275 | } | ||
| 1276 | } | ||
| 1277 | |||
| 1278 | if (class == BPF_ALU64 || class == BPF_JMP || | ||
| 1279 | /* BPF_END always use BPF_ALU class. */ | ||
| 1280 | (class == BPF_ALU && op == BPF_END && insn->imm == 64)) | ||
| 1281 | return true; | ||
| 1282 | |||
| 1283 | if (class == BPF_ALU || class == BPF_JMP32) | ||
| 1284 | return false; | ||
| 1285 | |||
| 1286 | if (class == BPF_LDX) { | ||
| 1287 | if (t != SRC_OP) | ||
| 1288 | return BPF_SIZE(code) == BPF_DW; | ||
| 1289 | /* LDX source must be ptr. */ | ||
| 1290 | return true; | ||
| 1291 | } | ||
| 1292 | |||
| 1293 | if (class == BPF_STX) { | ||
| 1294 | if (reg->type != SCALAR_VALUE) | ||
| 1295 | return true; | ||
| 1296 | return BPF_SIZE(code) == BPF_DW; | ||
| 1297 | } | ||
| 1298 | |||
| 1299 | if (class == BPF_LD) { | ||
| 1300 | u8 mode = BPF_MODE(code); | ||
| 1301 | |||
| 1302 | /* LD_IMM64 */ | ||
| 1303 | if (mode == BPF_IMM) | ||
| 1304 | return true; | ||
| 1305 | |||
| 1306 | /* Both LD_IND and LD_ABS return 32-bit data. */ | ||
| 1307 | if (t != SRC_OP) | ||
| 1308 | return false; | ||
| 1309 | |||
| 1310 | /* Implicit ctx ptr. */ | ||
| 1311 | if (regno == BPF_REG_6) | ||
| 1312 | return true; | ||
| 1313 | |||
| 1314 | /* Explicit source could be any width. */ | ||
| 1315 | return true; | ||
| 1316 | } | ||
| 1317 | |||
| 1318 | if (class == BPF_ST) | ||
| 1319 | /* The only source register for BPF_ST is a ptr. */ | ||
| 1320 | return true; | ||
| 1321 | |||
| 1322 | /* Conservatively return true at default. */ | ||
| 1323 | return true; | ||
| 1324 | } | ||
| 1325 | |||
| 1326 | /* Return TRUE if INSN doesn't have explicit value define. */ | ||
| 1327 | static bool insn_no_def(struct bpf_insn *insn) | ||
| 1328 | { | ||
| 1329 | u8 class = BPF_CLASS(insn->code); | ||
| 1330 | |||
| 1331 | return (class == BPF_JMP || class == BPF_JMP32 || | ||
| 1332 | class == BPF_STX || class == BPF_ST); | ||
| 1333 | } | ||
| 1334 | |||
| 1335 | /* Return TRUE if INSN has defined any 32-bit value explicitly. */ | ||
| 1336 | static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) | ||
| 1337 | { | ||
| 1338 | if (insn_no_def(insn)) | ||
| 1339 | return false; | ||
| 1340 | |||
| 1341 | return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP); | ||
| 1342 | } | ||
| 1343 | |||
| 1344 | static void mark_insn_zext(struct bpf_verifier_env *env, | ||
| 1345 | struct bpf_reg_state *reg) | ||
| 1346 | { | ||
| 1347 | s32 def_idx = reg->subreg_def; | ||
| 1348 | |||
| 1349 | if (def_idx == DEF_NOT_SUBREG) | ||
| 1350 | return; | ||
| 1351 | |||
| 1352 | env->insn_aux_data[def_idx - 1].zext_dst = true; | ||
| 1353 | /* The dst will be zero extended, so won't be sub-register anymore. */ | ||
| 1354 | reg->subreg_def = DEF_NOT_SUBREG; | ||
| 1355 | } | ||
| 1356 | |||
| 1176 | static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, | 1357 | static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, |
| 1177 | enum reg_arg_type t) | 1358 | enum reg_arg_type t) |
| 1178 | { | 1359 | { |
| 1179 | struct bpf_verifier_state *vstate = env->cur_state; | 1360 | struct bpf_verifier_state *vstate = env->cur_state; |
| 1180 | struct bpf_func_state *state = vstate->frame[vstate->curframe]; | 1361 | struct bpf_func_state *state = vstate->frame[vstate->curframe]; |
| 1362 | struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; | ||
| 1181 | struct bpf_reg_state *reg, *regs = state->regs; | 1363 | struct bpf_reg_state *reg, *regs = state->regs; |
| 1364 | bool rw64; | ||
| 1182 | 1365 | ||
| 1183 | if (regno >= MAX_BPF_REG) { | 1366 | if (regno >= MAX_BPF_REG) { |
| 1184 | verbose(env, "R%d is invalid\n", regno); | 1367 | verbose(env, "R%d is invalid\n", regno); |
| @@ -1186,6 +1369,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, | |||
| 1186 | } | 1369 | } |
| 1187 | 1370 | ||
| 1188 | reg = ®s[regno]; | 1371 | reg = ®s[regno]; |
| 1372 | rw64 = is_reg64(env, insn, regno, reg, t); | ||
| 1189 | if (t == SRC_OP) { | 1373 | if (t == SRC_OP) { |
| 1190 | /* check whether register used as source operand can be read */ | 1374 | /* check whether register used as source operand can be read */ |
| 1191 | if (reg->type == NOT_INIT) { | 1375 | if (reg->type == NOT_INIT) { |
| @@ -1196,7 +1380,11 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, | |||
| 1196 | if (regno == BPF_REG_FP) | 1380 | if (regno == BPF_REG_FP) |
| 1197 | return 0; | 1381 | return 0; |
| 1198 | 1382 | ||
| 1199 | return mark_reg_read(env, reg, reg->parent); | 1383 | if (rw64) |
| 1384 | mark_insn_zext(env, reg); | ||
| 1385 | |||
| 1386 | return mark_reg_read(env, reg, reg->parent, | ||
| 1387 | rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); | ||
| 1200 | } else { | 1388 | } else { |
| 1201 | /* check whether register used as dest operand can be written to */ | 1389 | /* check whether register used as dest operand can be written to */ |
| 1202 | if (regno == BPF_REG_FP) { | 1390 | if (regno == BPF_REG_FP) { |
| @@ -1204,12 +1392,441 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, | |||
| 1204 | return -EACCES; | 1392 | return -EACCES; |
| 1205 | } | 1393 | } |
| 1206 | reg->live |= REG_LIVE_WRITTEN; | 1394 | reg->live |= REG_LIVE_WRITTEN; |
| 1395 | reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; | ||
| 1207 | if (t == DST_OP) | 1396 | if (t == DST_OP) |
| 1208 | mark_reg_unknown(env, regs, regno); | 1397 | mark_reg_unknown(env, regs, regno); |
| 1209 | } | 1398 | } |
| 1210 | return 0; | 1399 | return 0; |
| 1211 | } | 1400 | } |
| 1212 | 1401 | ||
| 1402 | /* for any branch, call, exit record the history of jmps in the given state */ | ||
| 1403 | static int push_jmp_history(struct bpf_verifier_env *env, | ||
| 1404 | struct bpf_verifier_state *cur) | ||
| 1405 | { | ||
| 1406 | u32 cnt = cur->jmp_history_cnt; | ||
| 1407 | struct bpf_idx_pair *p; | ||
| 1408 | |||
| 1409 | cnt++; | ||
| 1410 | p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER); | ||
| 1411 | if (!p) | ||
| 1412 | return -ENOMEM; | ||
| 1413 | p[cnt - 1].idx = env->insn_idx; | ||
| 1414 | p[cnt - 1].prev_idx = env->prev_insn_idx; | ||
| 1415 | cur->jmp_history = p; | ||
| 1416 | cur->jmp_history_cnt = cnt; | ||
| 1417 | return 0; | ||
| 1418 | } | ||
| 1419 | |||
| 1420 | /* Backtrack one insn at a time. If idx is not at the top of recorded | ||
| 1421 | * history then previous instruction came from straight line execution. | ||
| 1422 | */ | ||
| 1423 | static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, | ||
| 1424 | u32 *history) | ||
| 1425 | { | ||
| 1426 | u32 cnt = *history; | ||
| 1427 | |||
| 1428 | if (cnt && st->jmp_history[cnt - 1].idx == i) { | ||
| 1429 | i = st->jmp_history[cnt - 1].prev_idx; | ||
| 1430 | (*history)--; | ||
| 1431 | } else { | ||
| 1432 | i--; | ||
| 1433 | } | ||
| 1434 | return i; | ||
| 1435 | } | ||
| 1436 | |||
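push_jmp_history() appends an (idx, prev_idx) pair whenever the walker leaves an instruction through a branch, call or exit, and get_prev_insn_idx() pops those pairs while backtracking, falling back to i - 1 for straight-line execution. A userspace sketch of the same append/pop discipline, using realloc in place of krealloc and plain ints instead of the verifier state:

#include <stdio.h>
#include <stdlib.h>

struct idx_pair { int idx, prev_idx; };

static struct idx_pair *hist;
static unsigned int hist_cnt;

/* Record that execution reached 'idx' via a jump from 'prev_idx'. */
static int push_jmp(int idx, int prev_idx)
{
        struct idx_pair *p = realloc(hist, (hist_cnt + 1) * sizeof(*p));

        if (!p)
                return -1;
        p[hist_cnt].idx = idx;
        p[hist_cnt].prev_idx = prev_idx;
        hist = p;
        hist_cnt++;
        return 0;
}

/* Walk one insn backwards: pop a history entry if it matches, else i - 1. */
static int prev_insn(int i, unsigned int *cnt)
{
        if (*cnt && hist[*cnt - 1].idx == i) {
                i = hist[*cnt - 1].prev_idx;
                (*cnt)--;
        } else {
                i--;
        }
        return i;
}

int main(void)
{
        unsigned int cnt;
        int i;

        /* Forward pass visited 0, 1, 2, then jumped 2 -> 7, then 7, 8. */
        push_jmp(7, 2);

        cnt = hist_cnt;
        for (i = 8; i > 0; i = prev_insn(i, &cnt))
                printf("%d ", i);
        printf("0\n");          /* prints: 8 7 2 1 0 */
        free(hist);
        return 0;
}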
| 1437 | /* For given verifier state backtrack_insn() is called from the last insn to | ||
| 1438 | * the first insn. Its purpose is to compute a bitmask of registers and | ||
| 1439 | * stack slots that need precision in the parent verifier state. | ||
| 1440 | */ | ||
| 1441 | static int backtrack_insn(struct bpf_verifier_env *env, int idx, | ||
| 1442 | u32 *reg_mask, u64 *stack_mask) | ||
| 1443 | { | ||
| 1444 | const struct bpf_insn_cbs cbs = { | ||
| 1445 | .cb_print = verbose, | ||
| 1446 | .private_data = env, | ||
| 1447 | }; | ||
| 1448 | struct bpf_insn *insn = env->prog->insnsi + idx; | ||
| 1449 | u8 class = BPF_CLASS(insn->code); | ||
| 1450 | u8 opcode = BPF_OP(insn->code); | ||
| 1451 | u8 mode = BPF_MODE(insn->code); | ||
| 1452 | u32 dreg = 1u << insn->dst_reg; | ||
| 1453 | u32 sreg = 1u << insn->src_reg; | ||
| 1454 | u32 spi; | ||
| 1455 | |||
| 1456 | if (insn->code == 0) | ||
| 1457 | return 0; | ||
| 1458 | if (env->log.level & BPF_LOG_LEVEL) { | ||
| 1459 | verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); | ||
| 1460 | verbose(env, "%d: ", idx); | ||
| 1461 | print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); | ||
| 1462 | } | ||
| 1463 | |||
| 1464 | if (class == BPF_ALU || class == BPF_ALU64) { | ||
| 1465 | if (!(*reg_mask & dreg)) | ||
| 1466 | return 0; | ||
| 1467 | if (opcode == BPF_MOV) { | ||
| 1468 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 1469 | /* dreg = sreg | ||
| 1470 | * dreg needs precision after this insn | ||
| 1471 | * sreg needs precision before this insn | ||
| 1472 | */ | ||
| 1473 | *reg_mask &= ~dreg; | ||
| 1474 | *reg_mask |= sreg; | ||
| 1475 | } else { | ||
| 1476 | /* dreg = K | ||
| 1477 | * dreg needs precision after this insn. | ||
| 1478 | * Corresponding register is already marked | ||
| 1479 | * as precise=true in this verifier state. | ||
| 1480 | * No further markings in parent are necessary | ||
| 1481 | */ | ||
| 1482 | *reg_mask &= ~dreg; | ||
| 1483 | } | ||
| 1484 | } else { | ||
| 1485 | if (BPF_SRC(insn->code) == BPF_X) { | ||
| 1486 | /* dreg += sreg | ||
| 1487 | * both dreg and sreg need precision | ||
| 1488 | * before this insn | ||
| 1489 | */ | ||
| 1490 | *reg_mask |= sreg; | ||
| 1491 | } /* else dreg += K | ||
| 1492 | * dreg still needs precision before this insn | ||
| 1493 | */ | ||
| 1494 | } | ||
| 1495 | } else if (class == BPF_LDX) { | ||
| 1496 | if (!(*reg_mask & dreg)) | ||
| 1497 | return 0; | ||
| 1498 | *reg_mask &= ~dreg; | ||
| 1499 | |||
| 1500 | /* scalars can only be spilled into stack w/o losing precision. | ||
| 1501 | * Load from any other memory can be zero extended. | ||
| 1502 | * The desire to keep that precision is already indicated | ||
| 1503 | * by 'precise' mark in corresponding register of this state. | ||
| 1504 | * No further tracking necessary. | ||
| 1505 | */ | ||
| 1506 | if (insn->src_reg != BPF_REG_FP) | ||
| 1507 | return 0; | ||
| 1508 | if (BPF_SIZE(insn->code) != BPF_DW) | ||
| 1509 | return 0; | ||
| 1510 | |||
| 1511 | /* dreg = *(u64 *)[fp - off] was a fill from the stack. | ||
| 1512 | * that [fp - off] slot contains scalar that needs to be | ||
| 1513 | * tracked with precision | ||
| 1514 | */ | ||
| 1515 | spi = (-insn->off - 1) / BPF_REG_SIZE; | ||
| 1516 | if (spi >= 64) { | ||
| 1517 | verbose(env, "BUG spi %d\n", spi); | ||
| 1518 | WARN_ONCE(1, "verifier backtracking bug"); | ||
| 1519 | return -EFAULT; | ||
| 1520 | } | ||
| 1521 | *stack_mask |= 1ull << spi; | ||
| 1522 | } else if (class == BPF_STX) { | ||
| 1523 | if (*reg_mask & dreg) | ||
| 1524 | /* stx shouldn't be using _scalar_ dst_reg | ||
| 1525 | * to access memory. It means backtracking | ||
| 1526 | * encountered a case of pointer subtraction. | ||
| 1527 | */ | ||
| 1528 | return -ENOTSUPP; | ||
| 1529 | /* scalars can only be spilled into stack */ | ||
| 1530 | if (insn->dst_reg != BPF_REG_FP) | ||
| 1531 | return 0; | ||
| 1532 | if (BPF_SIZE(insn->code) != BPF_DW) | ||
| 1533 | return 0; | ||
| 1534 | spi = (-insn->off - 1) / BPF_REG_SIZE; | ||
| 1535 | if (spi >= 64) { | ||
| 1536 | verbose(env, "BUG spi %d\n", spi); | ||
| 1537 | WARN_ONCE(1, "verifier backtracking bug"); | ||
| 1538 | return -EFAULT; | ||
| 1539 | } | ||
| 1540 | if (!(*stack_mask & (1ull << spi))) | ||
| 1541 | return 0; | ||
| 1542 | *stack_mask &= ~(1ull << spi); | ||
| 1543 | *reg_mask |= sreg; | ||
| 1544 | } else if (class == BPF_JMP || class == BPF_JMP32) { | ||
| 1545 | if (opcode == BPF_CALL) { | ||
| 1546 | if (insn->src_reg == BPF_PSEUDO_CALL) | ||
| 1547 | return -ENOTSUPP; | ||
| 1548 | /* regular helper call sets R0 */ | ||
| 1549 | *reg_mask &= ~1; | ||
| 1550 | if (*reg_mask & 0x3f) { | ||
| 1551 | /* if backtracing was looking for registers R1-R5 | ||
| 1552 | * they should have been found already. | ||
| 1553 | */ | ||
| 1554 | verbose(env, "BUG regs %x\n", *reg_mask); | ||
| 1555 | WARN_ONCE(1, "verifier backtracking bug"); | ||
| 1556 | return -EFAULT; | ||
| 1557 | } | ||
| 1558 | } else if (opcode == BPF_EXIT) { | ||
| 1559 | return -ENOTSUPP; | ||
| 1560 | } | ||
| 1561 | } else if (class == BPF_LD) { | ||
| 1562 | if (!(*reg_mask & dreg)) | ||
| 1563 | return 0; | ||
| 1564 | *reg_mask &= ~dreg; | ||
| 1565 | /* It's ld_imm64 or ld_abs or ld_ind. | ||
| 1566 | * For ld_imm64 no further tracking of precision | ||
| 1567 | * into parent is necessary | ||
| 1568 | */ | ||
| 1569 | if (mode == BPF_IND || mode == BPF_ABS) | ||
| 1570 | /* to be analyzed */ | ||
| 1571 | return -ENOTSUPP; | ||
| 1572 | } else if (class == BPF_ST) { | ||
| 1573 | if (*reg_mask & dreg) | ||
| 1574 | /* likely pointer subtraction */ | ||
| 1575 | return -ENOTSUPP; | ||
| 1576 | } | ||
| 1577 | return 0; | ||
| 1578 | } | ||
| 1579 | |||
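backtrack_insn() walks instructions in reverse and rewrites two masks: reg_mask, the registers that still need a precise value further back, and stack_mask, the spill slots that do. The core transitions are: dreg = sreg moves the request to sreg, dreg = K satisfies it, dreg += sreg adds sreg, a fill from the stack converts a register request into a slot request, and a spill does the opposite. The model below reproduces just those transitions with made-up opcode tags rather than the real BPF encoding:

#include <stdio.h>
#include <stdint.h>

/* Simplified instruction forms, not the real BPF opcode encoding. */
enum op { MOV_REG, MOV_IMM, ADD_REG, FILL, SPILL };

struct insn { enum op op; int dst, src, spi; };

static void backtrack(const struct insn *in, uint32_t *reg_mask,
                      uint64_t *stack_mask)
{
        uint32_t dreg = 1u << in->dst;
        uint32_t sreg = 1u << in->src;

        switch (in->op) {
        case MOV_REG:           /* dst = src: src now needs precision */
                if (*reg_mask & dreg) {
                        *reg_mask &= ~dreg;
                        *reg_mask |= sreg;
                }
                break;
        case MOV_IMM:           /* dst = K: request satisfied here */
                *reg_mask &= ~dreg;
                break;
        case ADD_REG:           /* dst += src: both needed earlier */
                if (*reg_mask & dreg)
                        *reg_mask |= sreg;
                break;
        case FILL:              /* dst = *(u64 *)(fp - 8 * (spi + 1)) */
                if (*reg_mask & dreg) {
                        *reg_mask &= ~dreg;
                        *stack_mask |= 1ull << in->spi;
                }
                break;
        case SPILL:             /* *(u64 *)(fp - 8 * (spi + 1)) = src */
                if (*stack_mask & (1ull << in->spi)) {
                        *stack_mask &= ~(1ull << in->spi);
                        *reg_mask |= sreg;
                }
                break;
        }
}

int main(void)
{
        /* Forward program: r6 = 16; spill r6; fill into r5; r5 += r7. */
        struct insn prog[] = {
                { MOV_IMM, 6, 0, 0 },
                { SPILL,   0, 6, 3 },
                { FILL,    5, 0, 3 },
                { ADD_REG, 5, 7, 0 },
        };
        uint32_t reg_mask = 1u << 5;    /* r5 was just used where precision matters */
        uint64_t stack_mask = 0;
        int i;

        for (i = 3; i >= 0; i--)
                backtrack(&prog[i], &reg_mask, &stack_mask);
        /* r5 traces back to the constant 16 and to r7: regs=80 stack=0 */
        printf("regs=%x stack=%llx\n", (unsigned)reg_mask,
               (unsigned long long)stack_mask);
        return 0;
}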
| 1580 | /* the scalar precision tracking algorithm: | ||
| 1581 | * . at the start all registers have precise=false. | ||
| 1582 | * . scalar ranges are tracked as normal through alu and jmp insns. | ||
| 1583 | * . once precise value of the scalar register is used in: | ||
| 1584 | * . ptr + scalar alu | ||
| 1585 | * . if (scalar cond K|scalar) | ||
| 1586 | * . helper_call(.., scalar, ...) where ARG_CONST is expected | ||
| 1587 | * backtrack through the verifier states and mark all registers and | ||
| 1588 | * stack slots with spilled constants that these scalar registers | ||
| 1589 | * should be precise. | ||
| 1590 | * . during state pruning two registers (or spilled stack slots) | ||
| 1591 | * are equivalent if both are not precise. | ||
| 1592 | * | ||
| 1593 | * Note the verifier cannot simply walk register parentage chain, | ||
| 1594 | * since many different registers and stack slots could have been | ||
| 1595 | * used to compute single precise scalar. | ||
| 1596 | * | ||
| 1597 | * The approach of starting with precise=true for all registers and then | ||
| 1598 | * backtrack to mark a register as not precise when the verifier detects | ||
| 1599 | * that the program doesn't care about a specific value (e.g., when a helper | ||
| 1600 | * takes a register as an ARG_ANYTHING parameter) is not safe. | ||
| 1601 | * | ||
| 1602 | * It's ok to walk single parentage chain of the verifier states. | ||
| 1603 | * It's possible that this backtracking will go all the way till 1st insn. | ||
| 1604 | * All other branches will be explored for needing precision later. | ||
| 1605 | * | ||
| 1606 | * The backtracking needs to deal with cases like: | ||
| 1607 | * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) | ||
| 1608 | * r9 -= r8 | ||
| 1609 | * r5 = r9 | ||
| 1610 | * if r5 > 0x79f goto pc+7 | ||
| 1611 | * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) | ||
| 1612 | * r5 += 1 | ||
| 1613 | * ... | ||
| 1614 | * call bpf_perf_event_output#25 | ||
| 1615 | * where .arg5_type = ARG_CONST_SIZE_OR_ZERO | ||
| 1616 | * | ||
| 1617 | * and this case: | ||
| 1618 | * r6 = 1 | ||
| 1619 | * call foo // uses callee's r6 inside to compute r0 | ||
| 1620 | * r0 += r6 | ||
| 1621 | * if r0 == 0 goto | ||
| 1622 | * | ||
| 1623 | * to track above reg_mask/stack_mask needs to be independent for each frame. | ||
| 1624 | * | ||
| 1625 | * Also if parent's curframe > frame where backtracking started, | ||
| 1626 | * the verifier needs to mark registers in both frames, otherwise callees | ||
| 1627 | * may incorrectly prune callers. This is similar to | ||
| 1628 | * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") | ||
| 1629 | * | ||
| 1630 | * For now backtracking falls back into conservative marking. | ||
| 1631 | */ | ||
| 1632 | static void mark_all_scalars_precise(struct bpf_verifier_env *env, | ||
| 1633 | struct bpf_verifier_state *st) | ||
| 1634 | { | ||
| 1635 | struct bpf_func_state *func; | ||
| 1636 | struct bpf_reg_state *reg; | ||
| 1637 | int i, j; | ||
| 1638 | |||
| 1639 | /* big hammer: mark all scalars precise in this path. | ||
| 1640 | * pop_stack may still get !precise scalars. | ||
| 1641 | */ | ||
| 1642 | for (; st; st = st->parent) | ||
| 1643 | for (i = 0; i <= st->curframe; i++) { | ||
| 1644 | func = st->frame[i]; | ||
| 1645 | for (j = 0; j < BPF_REG_FP; j++) { | ||
| 1646 | reg = &func->regs[j]; | ||
| 1647 | if (reg->type != SCALAR_VALUE) | ||
| 1648 | continue; | ||
| 1649 | reg->precise = true; | ||
| 1650 | } | ||
| 1651 | for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { | ||
| 1652 | if (func->stack[j].slot_type[0] != STACK_SPILL) | ||
| 1653 | continue; | ||
| 1654 | reg = &func->stack[j].spilled_ptr; | ||
| 1655 | if (reg->type != SCALAR_VALUE) | ||
| 1656 | continue; | ||
| 1657 | reg->precise = true; | ||
| 1658 | } | ||
| 1659 | } | ||
| 1660 | } | ||
| 1661 | |||
| 1662 | static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, | ||
| 1663 | int spi) | ||
| 1664 | { | ||
| 1665 | struct bpf_verifier_state *st = env->cur_state; | ||
| 1666 | int first_idx = st->first_insn_idx; | ||
| 1667 | int last_idx = env->insn_idx; | ||
| 1668 | struct bpf_func_state *func; | ||
| 1669 | struct bpf_reg_state *reg; | ||
| 1670 | u32 reg_mask = regno >= 0 ? 1u << regno : 0; | ||
| 1671 | u64 stack_mask = spi >= 0 ? 1ull << spi : 0; | ||
| 1672 | bool skip_first = true; | ||
| 1673 | bool new_marks = false; | ||
| 1674 | int i, err; | ||
| 1675 | |||
| 1676 | if (!env->allow_ptr_leaks) | ||
| 1677 | /* backtracking is root only for now */ | ||
| 1678 | return 0; | ||
| 1679 | |||
| 1680 | func = st->frame[st->curframe]; | ||
| 1681 | if (regno >= 0) { | ||
| 1682 | reg = &func->regs[regno]; | ||
| 1683 | if (reg->type != SCALAR_VALUE) { | ||
| 1684 | WARN_ONCE(1, "backtracing misuse"); | ||
| 1685 | return -EFAULT; | ||
| 1686 | } | ||
| 1687 | if (!reg->precise) | ||
| 1688 | new_marks = true; | ||
| 1689 | else | ||
| 1690 | reg_mask = 0; | ||
| 1691 | reg->precise = true; | ||
| 1692 | } | ||
| 1693 | |||
| 1694 | while (spi >= 0) { | ||
| 1695 | if (func->stack[spi].slot_type[0] != STACK_SPILL) { | ||
| 1696 | stack_mask = 0; | ||
| 1697 | break; | ||
| 1698 | } | ||
| 1699 | reg = &func->stack[spi].spilled_ptr; | ||
| 1700 | if (reg->type != SCALAR_VALUE) { | ||
| 1701 | stack_mask = 0; | ||
| 1702 | break; | ||
| 1703 | } | ||
| 1704 | if (!reg->precise) | ||
| 1705 | new_marks = true; | ||
| 1706 | else | ||
| 1707 | stack_mask = 0; | ||
| 1708 | reg->precise = true; | ||
| 1709 | break; | ||
| 1710 | } | ||
| 1711 | |||
| 1712 | if (!new_marks) | ||
| 1713 | return 0; | ||
| 1714 | if (!reg_mask && !stack_mask) | ||
| 1715 | return 0; | ||
| 1716 | for (;;) { | ||
| 1717 | DECLARE_BITMAP(mask, 64); | ||
| 1718 | u32 history = st->jmp_history_cnt; | ||
| 1719 | |||
| 1720 | if (env->log.level & BPF_LOG_LEVEL) | ||
| 1721 | verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); | ||
| 1722 | for (i = last_idx;;) { | ||
| 1723 | if (skip_first) { | ||
| 1724 | err = 0; | ||
| 1725 | skip_first = false; | ||
| 1726 | } else { | ||
| 1727 | err = backtrack_insn(env, i, ®_mask, &stack_mask); | ||
| 1728 | } | ||
| 1729 | if (err == -ENOTSUPP) { | ||
| 1730 | mark_all_scalars_precise(env, st); | ||
| 1731 | return 0; | ||
| 1732 | } else if (err) { | ||
| 1733 | return err; | ||
| 1734 | } | ||
| 1735 | if (!reg_mask && !stack_mask) | ||
| 1736 | /* Found assignment(s) into tracked register in this state. | ||
| 1737 | * Since this state is already marked, just return. | ||
| 1738 | * Nothing to be tracked further in the parent state. | ||
| 1739 | */ | ||
| 1740 | return 0; | ||
| 1741 | if (i == first_idx) | ||
| 1742 | break; | ||
| 1743 | i = get_prev_insn_idx(st, i, &history); | ||
| 1744 | if (i >= env->prog->len) { | ||
| 1745 | /* This can happen if backtracking reached insn 0 | ||
| 1746 | * and there are still reg_mask or stack_mask | ||
| 1747 | * to backtrack. | ||
| 1748 | * It means the backtracking missed the spot where | ||
| 1749 | * a particular register was initialized with a constant. | ||
| 1750 | */ | ||
| 1751 | verbose(env, "BUG backtracking idx %d\n", i); | ||
| 1752 | WARN_ONCE(1, "verifier backtracking bug"); | ||
| 1753 | return -EFAULT; | ||
| 1754 | } | ||
| 1755 | } | ||
| 1756 | st = st->parent; | ||
| 1757 | if (!st) | ||
| 1758 | break; | ||
| 1759 | |||
| 1760 | new_marks = false; | ||
| 1761 | func = st->frame[st->curframe]; | ||
| 1762 | bitmap_from_u64(mask, reg_mask); | ||
| 1763 | for_each_set_bit(i, mask, 32) { | ||
| 1764 | reg = &func->regs[i]; | ||
| 1765 | if (reg->type != SCALAR_VALUE) { | ||
| 1766 | reg_mask &= ~(1u << i); | ||
| 1767 | continue; | ||
| 1768 | } | ||
| 1769 | if (!reg->precise) | ||
| 1770 | new_marks = true; | ||
| 1771 | reg->precise = true; | ||
| 1772 | } | ||
| 1773 | |||
| 1774 | bitmap_from_u64(mask, stack_mask); | ||
| 1775 | for_each_set_bit(i, mask, 64) { | ||
| 1776 | if (i >= func->allocated_stack / BPF_REG_SIZE) { | ||
| 1777 | /* This can happen if backtracking | ||
| 1778 | * is propagating stack precision where | ||
| 1779 | * caller has larger stack frame | ||
| 1780 | * than callee, but backtrack_insn() should | ||
| 1781 | * have returned -ENOTSUPP. | ||
| 1782 | */ | ||
| 1783 | verbose(env, "BUG spi %d stack_size %d\n", | ||
| 1784 | i, func->allocated_stack); | ||
| 1785 | WARN_ONCE(1, "verifier backtracking bug"); | ||
| 1786 | return -EFAULT; | ||
| 1787 | } | ||
| 1788 | |||
| 1789 | if (func->stack[i].slot_type[0] != STACK_SPILL) { | ||
| 1790 | stack_mask &= ~(1ull << i); | ||
| 1791 | continue; | ||
| 1792 | } | ||
| 1793 | reg = &func->stack[i].spilled_ptr; | ||
| 1794 | if (reg->type != SCALAR_VALUE) { | ||
| 1795 | stack_mask &= ~(1ull << i); | ||
| 1796 | continue; | ||
| 1797 | } | ||
| 1798 | if (!reg->precise) | ||
| 1799 | new_marks = true; | ||
| 1800 | reg->precise = true; | ||
| 1801 | } | ||
| 1802 | if (env->log.level & BPF_LOG_LEVEL) { | ||
| 1803 | print_verifier_state(env, func); | ||
| 1804 | verbose(env, "parent %s regs=%x stack=%llx marks\n", | ||
| 1805 | new_marks ? "didn't have" : "already had", | ||
| 1806 | reg_mask, stack_mask); | ||
| 1807 | } | ||
| 1808 | |||
| 1809 | if (!reg_mask && !stack_mask) | ||
| 1810 | break; | ||
| 1811 | if (!new_marks) | ||
| 1812 | break; | ||
| 1813 | |||
| 1814 | last_idx = st->last_insn_idx; | ||
| 1815 | first_idx = st->first_insn_idx; | ||
| 1816 | } | ||
| 1817 | return 0; | ||
| 1818 | } | ||
| 1819 | |||
| 1820 | static int mark_chain_precision(struct bpf_verifier_env *env, int regno) | ||
| 1821 | { | ||
| 1822 | return __mark_chain_precision(env, regno, -1); | ||
| 1823 | } | ||
| 1824 | |||
| 1825 | static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) | ||
| 1826 | { | ||
| 1827 | return __mark_chain_precision(env, -1, spi); | ||
| 1828 | } | ||
| 1829 | |||
| 1213 | static bool is_spillable_regtype(enum bpf_reg_type type) | 1830 | static bool is_spillable_regtype(enum bpf_reg_type type) |
| 1214 | { | 1831 | { |
| 1215 | switch (type) { | 1832 | switch (type) { |
| @@ -1228,6 +1845,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) | |||
| 1228 | case PTR_TO_SOCK_COMMON_OR_NULL: | 1845 | case PTR_TO_SOCK_COMMON_OR_NULL: |
| 1229 | case PTR_TO_TCP_SOCK: | 1846 | case PTR_TO_TCP_SOCK: |
| 1230 | case PTR_TO_TCP_SOCK_OR_NULL: | 1847 | case PTR_TO_TCP_SOCK_OR_NULL: |
| 1848 | case PTR_TO_XDP_SOCK: | ||
| 1231 | return true; | 1849 | return true; |
| 1232 | default: | 1850 | default: |
| 1233 | return false; | 1851 | return false; |
| @@ -1240,6 +1858,23 @@ static bool register_is_null(struct bpf_reg_state *reg) | |||
| 1240 | return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); | 1858 | return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); |
| 1241 | } | 1859 | } |
| 1242 | 1860 | ||
| 1861 | static bool register_is_const(struct bpf_reg_state *reg) | ||
| 1862 | { | ||
| 1863 | return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); | ||
| 1864 | } | ||
| 1865 | |||
| 1866 | static void save_register_state(struct bpf_func_state *state, | ||
| 1867 | int spi, struct bpf_reg_state *reg) | ||
| 1868 | { | ||
| 1869 | int i; | ||
| 1870 | |||
| 1871 | state->stack[spi].spilled_ptr = *reg; | ||
| 1872 | state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; | ||
| 1873 | |||
| 1874 | for (i = 0; i < BPF_REG_SIZE; i++) | ||
| 1875 | state->stack[spi].slot_type[i] = STACK_SPILL; | ||
| 1876 | } | ||
| 1877 | |||
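With save_register_state() in place, check_stack_write() can spill a known non-zero constant as full register state instead of degrading it to STACK_MISC, provided the store is BPF_REG_SIZE wide and the program is privileged; when the address register is not the frame pointer, the sources of that constant are chased with mark_chain_precision() first, because backtracking only recognizes [fp - off] slots. A reduced model of that decision, with local enums standing in for the kernel types and a 'privileged' flag standing in for env->allow_ptr_leaks:

#include <stdio.h>
#include <stdbool.h>

#define BPF_REG_FP 10

enum reg_type { SCALAR_VALUE, PTR_TO_STACK, NOT_INIT };

struct reg {
        enum reg_type type;
        bool known;             /* tnum_is_const() stand-in */
        long long value;
};

enum spill_kind { SPILL_REG, SPILL_ZERO, SPILL_MISC };

/* Decide how an 8-byte store of 'reg' through address register 'dst_reg'
 * lands on the stack, and whether its sources must be chased for precision.
 */
static enum spill_kind classify_spill(const struct reg *reg, int dst_reg,
                                      bool privileged, bool *need_precision)
{
        *need_precision = false;

        if (reg->type == SCALAR_VALUE && reg->known && reg->value != 0 &&
            privileged) {
                /* Constant kept as real register state.  If the address
                 * register isn't FP, backtracking can't see this slot, so
                 * the constant's sources get marked precise up front.
                 */
                *need_precision = (dst_reg != BPF_REG_FP);
                return SPILL_REG;
        }
        if (reg->type == PTR_TO_STACK)  /* any spillable pointer type */
                return SPILL_REG;
        if (reg->type == SCALAR_VALUE && reg->known && reg->value == 0)
                return SPILL_ZERO;      /* STACK_ZERO, also precision-marked */
        return SPILL_MISC;
}

int main(void)
{
        struct reg c42 = { SCALAR_VALUE, true, 42 };
        bool prec;

        printf("via fp: kind %d, chase precision %d\n",
               classify_spill(&c42, BPF_REG_FP, true, &prec), prec);
        printf("via r1: kind %d, chase precision %d\n",
               classify_spill(&c42, 1, true, &prec), prec);
        return 0;
}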
| 1243 | /* check_stack_read/write functions track spill/fill of registers, | 1878 | /* check_stack_read/write functions track spill/fill of registers, |
| 1244 | * stack boundary and alignment are checked in check_mem_access() | 1879 | * stack boundary and alignment are checked in check_mem_access() |
| 1245 | */ | 1880 | */ |
| @@ -1249,7 +1884,8 @@ static int check_stack_write(struct bpf_verifier_env *env, | |||
| 1249 | { | 1884 | { |
| 1250 | struct bpf_func_state *cur; /* state of the current function */ | 1885 | struct bpf_func_state *cur; /* state of the current function */ |
| 1251 | int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; | 1886 | int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; |
| 1252 | enum bpf_reg_type type; | 1887 | u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; |
| 1888 | struct bpf_reg_state *reg = NULL; | ||
| 1253 | 1889 | ||
| 1254 | err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), | 1890 | err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), |
| 1255 | state->acquired_refs, true); | 1891 | state->acquired_refs, true); |
| @@ -1266,27 +1902,48 @@ static int check_stack_write(struct bpf_verifier_env *env, | |||
| 1266 | } | 1902 | } |
| 1267 | 1903 | ||
| 1268 | cur = env->cur_state->frame[env->cur_state->curframe]; | 1904 | cur = env->cur_state->frame[env->cur_state->curframe]; |
| 1269 | if (value_regno >= 0 && | 1905 | if (value_regno >= 0) |
| 1270 | is_spillable_regtype((type = cur->regs[value_regno].type))) { | 1906 | reg = &cur->regs[value_regno]; |
| 1271 | 1907 | ||
| 1908 | if (reg && size == BPF_REG_SIZE && register_is_const(reg) && | ||
| 1909 | !register_is_null(reg) && env->allow_ptr_leaks) { | ||
| 1910 | if (dst_reg != BPF_REG_FP) { | ||
| 1911 | /* The backtracking logic can only recognize explicit | ||
| 1912 | * stack slot address like [fp - 8]. Other spills of a | ||
| 1913 | * scalar via a different register have to be conservative. | ||
| 1914 | * Backtrack from here and mark all registers as precise | ||
| 1915 | * that contributed into 'reg' being a constant. | ||
| 1916 | */ | ||
| 1917 | err = mark_chain_precision(env, value_regno); | ||
| 1918 | if (err) | ||
| 1919 | return err; | ||
| 1920 | } | ||
| 1921 | save_register_state(state, spi, reg); | ||
| 1922 | } else if (reg && is_spillable_regtype(reg->type)) { | ||
| 1272 | /* register containing pointer is being spilled into stack */ | 1923 | /* register containing pointer is being spilled into stack */ |
| 1273 | if (size != BPF_REG_SIZE) { | 1924 | if (size != BPF_REG_SIZE) { |
| 1925 | verbose_linfo(env, insn_idx, "; "); | ||
| 1274 | verbose(env, "invalid size of register spill\n"); | 1926 | verbose(env, "invalid size of register spill\n"); |
| 1275 | return -EACCES; | 1927 | return -EACCES; |
| 1276 | } | 1928 | } |
| 1277 | 1929 | ||
| 1278 | if (state != cur && type == PTR_TO_STACK) { | 1930 | if (state != cur && reg->type == PTR_TO_STACK) { |
| 1279 | verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); | 1931 | verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); |
| 1280 | return -EINVAL; | 1932 | return -EINVAL; |
| 1281 | } | 1933 | } |
| 1282 | 1934 | ||
| 1283 | /* save register state */ | 1935 | if (!env->allow_ptr_leaks) { |
| 1284 | state->stack[spi].spilled_ptr = cur->regs[value_regno]; | 1936 | bool sanitize = false; |
| 1285 | state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; | ||
| 1286 | 1937 | ||
| 1287 | for (i = 0; i < BPF_REG_SIZE; i++) { | 1938 | if (state->stack[spi].slot_type[0] == STACK_SPILL && |
| 1288 | if (state->stack[spi].slot_type[i] == STACK_MISC && | 1939 | register_is_const(&state->stack[spi].spilled_ptr)) |
| 1289 | !env->allow_ptr_leaks) { | 1940 | sanitize = true; |
| 1941 | for (i = 0; i < BPF_REG_SIZE; i++) | ||
| 1942 | if (state->stack[spi].slot_type[i] == STACK_MISC) { | ||
| 1943 | sanitize = true; | ||
| 1944 | break; | ||
| 1945 | } | ||
| 1946 | if (sanitize) { | ||
| 1290 | int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; | 1947 | int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; |
| 1291 | int soff = (-spi - 1) * BPF_REG_SIZE; | 1948 | int soff = (-spi - 1) * BPF_REG_SIZE; |
| 1292 | 1949 | ||
| @@ -1309,8 +1966,8 @@ static int check_stack_write(struct bpf_verifier_env *env, | |||
| 1309 | } | 1966 | } |
| 1310 | *poff = soff; | 1967 | *poff = soff; |
| 1311 | } | 1968 | } |
| 1312 | state->stack[spi].slot_type[i] = STACK_SPILL; | ||
| 1313 | } | 1969 | } |
| 1970 | save_register_state(state, spi, reg); | ||
| 1314 | } else { | 1971 | } else { |
| 1315 | u8 type = STACK_MISC; | 1972 | u8 type = STACK_MISC; |
| 1316 | 1973 | ||
| @@ -1333,9 +1990,13 @@ static int check_stack_write(struct bpf_verifier_env *env, | |||
| 1333 | state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; | 1990 | state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; |
| 1334 | 1991 | ||
| 1335 | /* when we zero initialize stack slots mark them as such */ | 1992 | /* when we zero initialize stack slots mark them as such */ |
| 1336 | if (value_regno >= 0 && | 1993 | if (reg && register_is_null(reg)) { |
| 1337 | register_is_null(&cur->regs[value_regno])) | 1994 | /* backtracking doesn't work for STACK_ZERO yet. */ |
| 1995 | err = mark_chain_precision(env, value_regno); | ||
| 1996 | if (err) | ||
| 1997 | return err; | ||
| 1338 | type = STACK_ZERO; | 1998 | type = STACK_ZERO; |
| 1999 | } | ||
| 1339 | 2000 | ||
| 1340 | /* Mark slots affected by this stack write. */ | 2001 | /* Mark slots affected by this stack write. */ |
| 1341 | for (i = 0; i < size; i++) | 2002 | for (i = 0; i < size; i++) |
| @@ -1352,6 +2013,7 @@ static int check_stack_read(struct bpf_verifier_env *env, | |||
| 1352 | struct bpf_verifier_state *vstate = env->cur_state; | 2013 | struct bpf_verifier_state *vstate = env->cur_state; |
| 1353 | struct bpf_func_state *state = vstate->frame[vstate->curframe]; | 2014 | struct bpf_func_state *state = vstate->frame[vstate->curframe]; |
| 1354 | int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; | 2015 | int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; |
| 2016 | struct bpf_reg_state *reg; | ||
| 1355 | u8 *stype; | 2017 | u8 *stype; |
| 1356 | 2018 | ||
| 1357 | if (reg_state->allocated_stack <= slot) { | 2019 | if (reg_state->allocated_stack <= slot) { |
| @@ -1360,11 +2022,21 @@ static int check_stack_read(struct bpf_verifier_env *env, | |||
| 1360 | return -EACCES; | 2022 | return -EACCES; |
| 1361 | } | 2023 | } |
| 1362 | stype = reg_state->stack[spi].slot_type; | 2024 | stype = reg_state->stack[spi].slot_type; |
| 2025 | reg = ®_state->stack[spi].spilled_ptr; | ||
| 1363 | 2026 | ||
| 1364 | if (stype[0] == STACK_SPILL) { | 2027 | if (stype[0] == STACK_SPILL) { |
| 1365 | if (size != BPF_REG_SIZE) { | 2028 | if (size != BPF_REG_SIZE) { |
| 1366 | verbose(env, "invalid size of register spill\n"); | 2029 | if (reg->type != SCALAR_VALUE) { |
| 1367 | return -EACCES; | 2030 | verbose_linfo(env, env->insn_idx, "; "); |
| 2031 | verbose(env, "invalid size of register fill\n"); | ||
| 2032 | return -EACCES; | ||
| 2033 | } | ||
| 2034 | if (value_regno >= 0) { | ||
| 2035 | mark_reg_unknown(env, state->regs, value_regno); | ||
| 2036 | state->regs[value_regno].live |= REG_LIVE_WRITTEN; | ||
| 2037 | } | ||
| 2038 | mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); | ||
| 2039 | return 0; | ||
| 1368 | } | 2040 | } |
| 1369 | for (i = 1; i < BPF_REG_SIZE; i++) { | 2041 | for (i = 1; i < BPF_REG_SIZE; i++) { |
| 1370 | if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { | 2042 | if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { |
| @@ -1375,16 +2047,14 @@ static int check_stack_read(struct bpf_verifier_env *env, | |||
| 1375 | 2047 | ||
| 1376 | if (value_regno >= 0) { | 2048 | if (value_regno >= 0) { |
| 1377 | /* restore register state from stack */ | 2049 | /* restore register state from stack */ |
| 1378 | state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; | 2050 | state->regs[value_regno] = *reg; |
| 1379 | /* mark reg as written since spilled pointer state likely | 2051 | /* mark reg as written since spilled pointer state likely |
| 1380 | * has its liveness marks cleared by is_state_visited() | 2052 | * has its liveness marks cleared by is_state_visited() |
| 1381 | * which resets stack/reg liveness for state transitions | 2053 | * which resets stack/reg liveness for state transitions |
| 1382 | */ | 2054 | */ |
| 1383 | state->regs[value_regno].live |= REG_LIVE_WRITTEN; | 2055 | state->regs[value_regno].live |= REG_LIVE_WRITTEN; |
| 1384 | } | 2056 | } |
| 1385 | mark_reg_read(env, ®_state->stack[spi].spilled_ptr, | 2057 | mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); |
| 1386 | reg_state->stack[spi].spilled_ptr.parent); | ||
| 1387 | return 0; | ||
| 1388 | } else { | 2058 | } else { |
| 1389 | int zeros = 0; | 2059 | int zeros = 0; |
| 1390 | 2060 | ||
| @@ -1399,22 +2069,32 @@ static int check_stack_read(struct bpf_verifier_env *env, | |||
| 1399 | off, i, size); | 2069 | off, i, size); |
| 1400 | return -EACCES; | 2070 | return -EACCES; |
| 1401 | } | 2071 | } |
| 1402 | mark_reg_read(env, ®_state->stack[spi].spilled_ptr, | 2072 | mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); |
| 1403 | reg_state->stack[spi].spilled_ptr.parent); | ||
| 1404 | if (value_regno >= 0) { | 2073 | if (value_regno >= 0) { |
| 1405 | if (zeros == size) { | 2074 | if (zeros == size) { |
| 1406 | /* any size read into register is zero extended, | 2075 | /* any size read into register is zero extended, |
| 1407 | * so the whole register == const_zero | 2076 | * so the whole register == const_zero |
| 1408 | */ | 2077 | */ |
| 1409 | __mark_reg_const_zero(&state->regs[value_regno]); | 2078 | __mark_reg_const_zero(&state->regs[value_regno]); |
| 2079 | /* backtracking doesn't support STACK_ZERO yet, | ||
| 2080 | * so mark it precise here, so that later | ||
| 2081 | * backtracking can stop here. | ||
| 2082 | * Backtracking may not need this if this register | ||
| 2083 | * doesn't participate in pointer adjustment. | ||
| 2084 | * Forward propagation of precise flag is not | ||
| 2085 | * necessary either. This mark is only to stop | ||
| 2086 | * backtracking. Any register that contributed | ||
| 2087 | * to const 0 was marked precise before spill. | ||
| 2088 | */ | ||
| 2089 | state->regs[value_regno].precise = true; | ||
| 1410 | } else { | 2090 | } else { |
| 1411 | /* have read misc data from the stack */ | 2091 | /* have read misc data from the stack */ |
| 1412 | mark_reg_unknown(env, state->regs, value_regno); | 2092 | mark_reg_unknown(env, state->regs, value_regno); |
| 1413 | } | 2093 | } |
| 1414 | state->regs[value_regno].live |= REG_LIVE_WRITTEN; | 2094 | state->regs[value_regno].live |= REG_LIVE_WRITTEN; |
| 1415 | } | 2095 | } |
| 1416 | return 0; | ||
| 1417 | } | 2096 | } |
| 2097 | return 0; | ||
| 1418 | } | 2098 | } |
| 1419 | 2099 | ||
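The read side changes in the same spirit: a narrow (smaller than 8-byte) fill from a STACK_SPILL slot is no longer an automatic error; if the spilled value is a scalar, the destination simply becomes an unknown scalar, while pointers still require a full-width fill. A compact sketch of that rule, assuming a one-field slot description:

#include <stdio.h>

enum reg_type { SCALAR_VALUE, PTR_TO_STACK };

struct slot { enum reg_type type; };

/* Returns 0 and describes what the filled register becomes, or -1 (EACCES).
 * Mirrors the new rule: narrow fills are only legal from spilled scalars.
 */
static int fill(const struct slot *s, int size, const char **what)
{
        if (size != 8) {
                if (s->type != SCALAR_VALUE) {
                        *what = "invalid size of register fill";
                        return -1;
                }
                *what = "unknown scalar (narrow fill of a spilled scalar)";
                return 0;
        }
        *what = "exact copy of the spilled register state";
        return 0;
}

int main(void)
{
        struct slot scalar = { SCALAR_VALUE }, ptr = { PTR_TO_STACK };
        const char *what;

        fill(&scalar, 4, &what); printf("scalar, 4 bytes: %s\n", what);
        fill(&ptr, 4, &what);    printf("ptr, 4 bytes:    %s\n", what);
        fill(&ptr, 8, &what);    printf("ptr, 8 bytes:    %s\n", what);
        return 0;
}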
| 1420 | static int check_stack_access(struct bpf_verifier_env *env, | 2100 | static int check_stack_access(struct bpf_verifier_env *env, |
| @@ -1580,6 +2260,13 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, | |||
| 1580 | 2260 | ||
| 1581 | env->seen_direct_write = true; | 2261 | env->seen_direct_write = true; |
| 1582 | return true; | 2262 | return true; |
| 2263 | |||
| 2264 | case BPF_PROG_TYPE_CGROUP_SOCKOPT: | ||
| 2265 | if (t == BPF_WRITE) | ||
| 2266 | env->seen_direct_write = true; | ||
| 2267 | |||
| 2268 | return true; | ||
| 2269 | |||
| 1583 | default: | 2270 | default: |
| 1584 | return false; | 2271 | return false; |
| 1585 | } | 2272 | } |
| @@ -1706,6 +2393,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, | |||
| 1706 | case PTR_TO_TCP_SOCK: | 2393 | case PTR_TO_TCP_SOCK: |
| 1707 | valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); | 2394 | valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); |
| 1708 | break; | 2395 | break; |
| 2396 | case PTR_TO_XDP_SOCK: | ||
| 2397 | valid = bpf_xdp_sock_is_valid_access(off, size, t, &info); | ||
| 2398 | break; | ||
| 1709 | default: | 2399 | default: |
| 1710 | valid = false; | 2400 | valid = false; |
| 1711 | } | 2401 | } |
| @@ -1870,6 +2560,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, | |||
| 1870 | case PTR_TO_TCP_SOCK: | 2560 | case PTR_TO_TCP_SOCK: |
| 1871 | pointer_desc = "tcp_sock "; | 2561 | pointer_desc = "tcp_sock "; |
| 1872 | break; | 2562 | break; |
| 2563 | case PTR_TO_XDP_SOCK: | ||
| 2564 | pointer_desc = "xdp_sock "; | ||
| 2565 | break; | ||
| 1873 | default: | 2566 | default: |
| 1874 | break; | 2567 | break; |
| 1875 | } | 2568 | } |
| @@ -2109,6 +2802,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn | |||
| 2109 | value_regno); | 2802 | value_regno); |
| 2110 | if (reg_type_may_be_null(reg_type)) | 2803 | if (reg_type_may_be_null(reg_type)) |
| 2111 | regs[value_regno].id = ++env->id_gen; | 2804 | regs[value_regno].id = ++env->id_gen; |
| 2805 | /* A load of ctx field could have different | ||
| 2806 | * actual load size with the one encoded in the | ||
| 2807 | * insn. When the dst is PTR, it is for sure not | ||
| 2808 | * a sub-register. | ||
| 2809 | */ | ||
| 2810 | regs[value_regno].subreg_def = DEF_NOT_SUBREG; | ||
| 2112 | } | 2811 | } |
| 2113 | regs[value_regno].type = reg_type; | 2812 | regs[value_regno].type = reg_type; |
| 2114 | } | 2813 | } |
| @@ -2263,7 +2962,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, | |||
| 2263 | { | 2962 | { |
| 2264 | struct bpf_reg_state *reg = reg_state(env, regno); | 2963 | struct bpf_reg_state *reg = reg_state(env, regno); |
| 2265 | struct bpf_func_state *state = func(env, reg); | 2964 | struct bpf_func_state *state = func(env, reg); |
| 2266 | int err, min_off, max_off, i, slot, spi; | 2965 | int err, min_off, max_off, i, j, slot, spi; |
| 2267 | 2966 | ||
| 2268 | if (reg->type != PTR_TO_STACK) { | 2967 | if (reg->type != PTR_TO_STACK) { |
| 2269 | /* Allow zero-byte read from NULL, regardless of pointer type */ | 2968 | /* Allow zero-byte read from NULL, regardless of pointer type */ |
| @@ -2351,6 +3050,14 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, | |||
| 2351 | *stype = STACK_MISC; | 3050 | *stype = STACK_MISC; |
| 2352 | goto mark; | 3051 | goto mark; |
| 2353 | } | 3052 | } |
| 3053 | if (state->stack[spi].slot_type[0] == STACK_SPILL && | ||
| 3054 | state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { | ||
| 3055 | __mark_reg_unknown(&state->stack[spi].spilled_ptr); | ||
| 3056 | for (j = 0; j < BPF_REG_SIZE; j++) | ||
| 3057 | state->stack[spi].slot_type[j] = STACK_MISC; | ||
| 3058 | goto mark; | ||
| 3059 | } | ||
| 3060 | |||
| 2354 | err: | 3061 | err: |
| 2355 | if (tnum_is_const(reg->var_off)) { | 3062 | if (tnum_is_const(reg->var_off)) { |
| 2356 | verbose(env, "invalid indirect read from stack off %d+%d size %d\n", | 3063 | verbose(env, "invalid indirect read from stack off %d+%d size %d\n", |
| @@ -2368,7 +3075,8 @@ mark: | |||
| 2368 | * the whole slot to be marked as 'read' | 3075 | * the whole slot to be marked as 'read' |
| 2369 | */ | 3076 | */ |
| 2370 | mark_reg_read(env, &state->stack[spi].spilled_ptr, | 3077 | mark_reg_read(env, &state->stack[spi].spilled_ptr, |
| 2371 | state->stack[spi].spilled_ptr.parent); | 3078 | state->stack[spi].spilled_ptr.parent, |
| 3079 | REG_LIVE_READ64); | ||
| 2372 | } | 3080 | } |
| 2373 | return update_stack_depth(env, state, min_off); | 3081 | return update_stack_depth(env, state, min_off); |
| 2374 | } | 3082 | } |
| @@ -2701,6 +3409,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, | |||
| 2701 | err = check_helper_mem_access(env, regno - 1, | 3409 | err = check_helper_mem_access(env, regno - 1, |
| 2702 | reg->umax_value, | 3410 | reg->umax_value, |
| 2703 | zero_size_allowed, meta); | 3411 | zero_size_allowed, meta); |
| 3412 | if (!err) | ||
| 3413 | err = mark_chain_precision(env, regno); | ||
| 2704 | } else if (arg_type_is_int_ptr(arg_type)) { | 3414 | } else if (arg_type_is_int_ptr(arg_type)) { |
| 2705 | int size = int_ptr_type_to_size(arg_type); | 3415 | int size = int_ptr_type_to_size(arg_type); |
| 2706 | 3416 | ||
| @@ -2749,22 +3459,23 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, | |||
| 2749 | if (func_id != BPF_FUNC_get_local_storage) | 3459 | if (func_id != BPF_FUNC_get_local_storage) |
| 2750 | goto error; | 3460 | goto error; |
| 2751 | break; | 3461 | break; |
| 2752 | /* devmap returns a pointer to a live net_device ifindex that we cannot | ||
| 2753 | * allow to be modified from bpf side. So do not allow lookup elements | ||
| 2754 | * for now. | ||
| 2755 | */ | ||
| 2756 | case BPF_MAP_TYPE_DEVMAP: | 3462 | case BPF_MAP_TYPE_DEVMAP: |
| 2757 | if (func_id != BPF_FUNC_redirect_map) | 3463 | if (func_id != BPF_FUNC_redirect_map && |
| 3464 | func_id != BPF_FUNC_map_lookup_elem) | ||
| 2758 | goto error; | 3465 | goto error; |
| 2759 | break; | 3466 | break; |
| 2760 | /* Restrict bpf side of cpumap and xskmap, open when use-cases | 3467 | /* Restrict bpf side of cpumap and xskmap, open when use-cases |
| 2761 | * appear. | 3468 | * appear. |
| 2762 | */ | 3469 | */ |
| 2763 | case BPF_MAP_TYPE_CPUMAP: | 3470 | case BPF_MAP_TYPE_CPUMAP: |
| 2764 | case BPF_MAP_TYPE_XSKMAP: | ||
| 2765 | if (func_id != BPF_FUNC_redirect_map) | 3471 | if (func_id != BPF_FUNC_redirect_map) |
| 2766 | goto error; | 3472 | goto error; |
| 2767 | break; | 3473 | break; |
| 3474 | case BPF_MAP_TYPE_XSKMAP: | ||
| 3475 | if (func_id != BPF_FUNC_redirect_map && | ||
| 3476 | func_id != BPF_FUNC_map_lookup_elem) | ||
| 3477 | goto error; | ||
| 3478 | break; | ||
| 2768 | case BPF_MAP_TYPE_ARRAY_OF_MAPS: | 3479 | case BPF_MAP_TYPE_ARRAY_OF_MAPS: |
| 2769 | case BPF_MAP_TYPE_HASH_OF_MAPS: | 3480 | case BPF_MAP_TYPE_HASH_OF_MAPS: |
| 2770 | if (func_id != BPF_FUNC_map_lookup_elem) | 3481 | if (func_id != BPF_FUNC_map_lookup_elem) |
| @@ -3332,6 +4043,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn | |||
| 3332 | check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); | 4043 | check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); |
| 3333 | } | 4044 | } |
| 3334 | 4045 | ||
| 4046 | /* helper call returns 64-bit value. */ | ||
| 4047 | regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; | ||
| 4048 | |||
| 3335 | /* update return register (already marked as written above) */ | 4049 | /* update return register (already marked as written above) */ |
| 3336 | if (fn->ret_type == RET_INTEGER) { | 4050 | if (fn->ret_type == RET_INTEGER) { |
| 3337 | /* sets type to SCALAR_VALUE */ | 4051 | /* sets type to SCALAR_VALUE */ |
| @@ -3652,6 +4366,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, | |||
| 3652 | case PTR_TO_SOCK_COMMON_OR_NULL: | 4366 | case PTR_TO_SOCK_COMMON_OR_NULL: |
| 3653 | case PTR_TO_TCP_SOCK: | 4367 | case PTR_TO_TCP_SOCK: |
| 3654 | case PTR_TO_TCP_SOCK_OR_NULL: | 4368 | case PTR_TO_TCP_SOCK_OR_NULL: |
| 4369 | case PTR_TO_XDP_SOCK: | ||
| 3655 | verbose(env, "R%d pointer arithmetic on %s prohibited\n", | 4370 | verbose(env, "R%d pointer arithmetic on %s prohibited\n", |
| 3656 | dst, reg_type_str[ptr_reg->type]); | 4371 | dst, reg_type_str[ptr_reg->type]); |
| 3657 | return -EACCES; | 4372 | return -EACCES; |
| @@ -4129,6 +4844,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
| 4129 | struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; | 4844 | struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; |
| 4130 | struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; | 4845 | struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; |
| 4131 | u8 opcode = BPF_OP(insn->code); | 4846 | u8 opcode = BPF_OP(insn->code); |
| 4847 | int err; | ||
| 4132 | 4848 | ||
| 4133 | dst_reg = ®s[insn->dst_reg]; | 4849 | dst_reg = ®s[insn->dst_reg]; |
| 4134 | src_reg = NULL; | 4850 | src_reg = NULL; |
| @@ -4155,11 +4871,17 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, | |||
| 4155 | * This is legal, but we have to reverse our | 4871 | * This is legal, but we have to reverse our |
| 4156 | * src/dest handling in computing the range | 4872 | * src/dest handling in computing the range |
| 4157 | */ | 4873 | */ |
| 4874 | err = mark_chain_precision(env, insn->dst_reg); | ||
| 4875 | if (err) | ||
| 4876 | return err; | ||
| 4158 | return adjust_ptr_min_max_vals(env, insn, | 4877 | return adjust_ptr_min_max_vals(env, insn, |
| 4159 | src_reg, dst_reg); | 4878 | src_reg, dst_reg); |
| 4160 | } | 4879 | } |
| 4161 | } else if (ptr_reg) { | 4880 | } else if (ptr_reg) { |
| 4162 | /* pointer += scalar */ | 4881 | /* pointer += scalar */ |
| 4882 | err = mark_chain_precision(env, insn->src_reg); | ||
| 4883 | if (err) | ||
| 4884 | return err; | ||
| 4163 | return adjust_ptr_min_max_vals(env, insn, | 4885 | return adjust_ptr_min_max_vals(env, insn, |
| 4164 | dst_reg, src_reg); | 4886 | dst_reg, src_reg); |
| 4165 | } | 4887 | } |
| @@ -4263,6 +4985,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 4263 | */ | 4985 | */ |
| 4264 | *dst_reg = *src_reg; | 4986 | *dst_reg = *src_reg; |
| 4265 | dst_reg->live |= REG_LIVE_WRITTEN; | 4987 | dst_reg->live |= REG_LIVE_WRITTEN; |
| 4988 | dst_reg->subreg_def = DEF_NOT_SUBREG; | ||
| 4266 | } else { | 4989 | } else { |
| 4267 | /* R1 = (u32) R2 */ | 4990 | /* R1 = (u32) R2 */ |
| 4268 | if (is_pointer_value(env, insn->src_reg)) { | 4991 | if (is_pointer_value(env, insn->src_reg)) { |
| @@ -4273,6 +4996,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 4273 | } else if (src_reg->type == SCALAR_VALUE) { | 4996 | } else if (src_reg->type == SCALAR_VALUE) { |
| 4274 | *dst_reg = *src_reg; | 4997 | *dst_reg = *src_reg; |
| 4275 | dst_reg->live |= REG_LIVE_WRITTEN; | 4998 | dst_reg->live |= REG_LIVE_WRITTEN; |
| 4999 | dst_reg->subreg_def = env->insn_idx + 1; | ||
| 4276 | } else { | 5000 | } else { |
| 4277 | mark_reg_unknown(env, regs, | 5001 | mark_reg_unknown(env, regs, |
| 4278 | insn->dst_reg); | 5002 | insn->dst_reg); |
| @@ -4889,6 +5613,9 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, | |||
| 4889 | if (reg->map_ptr->inner_map_meta) { | 5613 | if (reg->map_ptr->inner_map_meta) { |
| 4890 | reg->type = CONST_PTR_TO_MAP; | 5614 | reg->type = CONST_PTR_TO_MAP; |
| 4891 | reg->map_ptr = reg->map_ptr->inner_map_meta; | 5615 | reg->map_ptr = reg->map_ptr->inner_map_meta; |
| 5616 | } else if (reg->map_ptr->map_type == | ||
| 5617 | BPF_MAP_TYPE_XSKMAP) { | ||
| 5618 | reg->type = PTR_TO_XDP_SOCK; | ||
| 4892 | } else { | 5619 | } else { |
| 4893 | reg->type = PTR_TO_MAP_VALUE; | 5620 | reg->type = PTR_TO_MAP_VALUE; |
| 4894 | } | 5621 | } |
| @@ -5060,9 +5787,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, | |||
| 5060 | struct bpf_verifier_state *this_branch = env->cur_state; | 5787 | struct bpf_verifier_state *this_branch = env->cur_state; |
| 5061 | struct bpf_verifier_state *other_branch; | 5788 | struct bpf_verifier_state *other_branch; |
| 5062 | struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; | 5789 | struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; |
| 5063 | struct bpf_reg_state *dst_reg, *other_branch_regs; | 5790 | struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; |
| 5064 | u8 opcode = BPF_OP(insn->code); | 5791 | u8 opcode = BPF_OP(insn->code); |
| 5065 | bool is_jmp32; | 5792 | bool is_jmp32; |
| 5793 | int pred = -1; | ||
| 5066 | int err; | 5794 | int err; |
| 5067 | 5795 | ||
| 5068 | /* Only conditional jumps are expected to reach here. */ | 5796 | /* Only conditional jumps are expected to reach here. */ |
| @@ -5087,6 +5815,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, | |||
| 5087 | insn->src_reg); | 5815 | insn->src_reg); |
| 5088 | return -EACCES; | 5816 | return -EACCES; |
| 5089 | } | 5817 | } |
| 5818 | src_reg = ®s[insn->src_reg]; | ||
| 5090 | } else { | 5819 | } else { |
| 5091 | if (insn->src_reg != BPF_REG_0) { | 5820 | if (insn->src_reg != BPF_REG_0) { |
| 5092 | verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); | 5821 | verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); |
| @@ -5102,20 +5831,29 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, | |||
| 5102 | dst_reg = ®s[insn->dst_reg]; | 5831 | dst_reg = ®s[insn->dst_reg]; |
| 5103 | is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; | 5832 | is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; |
| 5104 | 5833 | ||
| 5105 | if (BPF_SRC(insn->code) == BPF_K) { | 5834 | if (BPF_SRC(insn->code) == BPF_K) |
| 5106 | int pred = is_branch_taken(dst_reg, insn->imm, opcode, | 5835 | pred = is_branch_taken(dst_reg, insn->imm, |
| 5107 | is_jmp32); | 5836 | opcode, is_jmp32); |
| 5108 | 5837 | else if (src_reg->type == SCALAR_VALUE && | |
| 5109 | if (pred == 1) { | 5838 | tnum_is_const(src_reg->var_off)) |
| 5110 | /* only follow the goto, ignore fall-through */ | 5839 | pred = is_branch_taken(dst_reg, src_reg->var_off.value, |
| 5111 | *insn_idx += insn->off; | 5840 | opcode, is_jmp32); |
| 5112 | return 0; | 5841 | if (pred >= 0) { |
| 5113 | } else if (pred == 0) { | 5842 | err = mark_chain_precision(env, insn->dst_reg); |
| 5114 | /* only follow fall-through branch, since | 5843 | if (BPF_SRC(insn->code) == BPF_X && !err) |
| 5115 | * that's where the program will go | 5844 | err = mark_chain_precision(env, insn->src_reg); |
| 5116 | */ | 5845 | if (err) |
| 5117 | return 0; | 5846 | return err; |
| 5118 | } | 5847 | } |
| 5848 | if (pred == 1) { | ||
| 5849 | /* only follow the goto, ignore fall-through */ | ||
| 5850 | *insn_idx += insn->off; | ||
| 5851 | return 0; | ||
| 5852 | } else if (pred == 0) { | ||
| 5853 | /* only follow fall-through branch, since | ||
| 5854 | * that's where the program will go | ||
| 5855 | */ | ||
| 5856 | return 0; | ||
| 5119 | } | 5857 | } |
| 5120 | 5858 | ||
| 5121 | other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, | 5859 | other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, |
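check_cond_jmp_op() now also consults is_branch_taken() when the source operand is a register holding a known constant, not only for immediates, and whenever the branch outcome is decided it marks both operands precise before pruning the dead path. A minimal stand-alone version of that decision for two opcodes, with a register reduced to a known value or an unsigned range:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct val {
        bool known;
        uint64_t v;             /* only meaningful when known */
        uint64_t umin, umax;    /* unsigned range otherwise   */
};

/* Return 1 if the branch is always taken, 0 if never, -1 if unknown.
 * Only 64-bit JEQ and JGT against a constant are modelled here.
 */
static int branch_taken_eq(const struct val *dst, uint64_t cmp)
{
        if (dst->known)
                return dst->v == cmp;
        if (cmp < dst->umin || cmp > dst->umax)
                return 0;
        return -1;
}

static int branch_taken_gt(const struct val *dst, uint64_t cmp)
{
        if (dst->umin > cmp)
                return 1;
        if (dst->umax <= cmp)
                return 0;
        return -1;
}

int main(void)
{
        struct val r1 = { .known = false, .umin = 10, .umax = 20 };
        struct val r2 = { .known = true, .v = 5, .umin = 5, .umax = 5 };

        /* if r1 > r2: src is a known constant, so the check can run. */
        printf("r1 > 5  -> %d\n", branch_taken_gt(&r1, r2.v));  /* 1 */
        /* if r1 == r2: 5 is outside [10, 20], never taken. */
        printf("r1 == 5 -> %d\n", branch_taken_eq(&r1, r2.v));  /* 0 */
        return 0;
}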
| @@ -5352,21 +6090,32 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) | |||
| 5352 | * Already marked as written above. | 6090 | * Already marked as written above. |
| 5353 | */ | 6091 | */ |
| 5354 | mark_reg_unknown(env, regs, BPF_REG_0); | 6092 | mark_reg_unknown(env, regs, BPF_REG_0); |
| 6093 | /* ld_abs load up to 32-bit skb data. */ | ||
| 6094 | regs[BPF_REG_0].subreg_def = env->insn_idx + 1; | ||
| 5355 | return 0; | 6095 | return 0; |
| 5356 | } | 6096 | } |
| 5357 | 6097 | ||
| 5358 | static int check_return_code(struct bpf_verifier_env *env) | 6098 | static int check_return_code(struct bpf_verifier_env *env) |
| 5359 | { | 6099 | { |
| 6100 | struct tnum enforce_attach_type_range = tnum_unknown; | ||
| 5360 | struct bpf_reg_state *reg; | 6101 | struct bpf_reg_state *reg; |
| 5361 | struct tnum range = tnum_range(0, 1); | 6102 | struct tnum range = tnum_range(0, 1); |
| 5362 | 6103 | ||
| 5363 | switch (env->prog->type) { | 6104 | switch (env->prog->type) { |
| 6105 | case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: | ||
| 6106 | if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || | ||
| 6107 | env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) | ||
| 6108 | range = tnum_range(1, 1); | ||
| 5364 | case BPF_PROG_TYPE_CGROUP_SKB: | 6109 | case BPF_PROG_TYPE_CGROUP_SKB: |
| 6110 | if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { | ||
| 6111 | range = tnum_range(0, 3); | ||
| 6112 | enforce_attach_type_range = tnum_range(2, 3); | ||
| 6113 | } | ||
| 5365 | case BPF_PROG_TYPE_CGROUP_SOCK: | 6114 | case BPF_PROG_TYPE_CGROUP_SOCK: |
| 5366 | case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: | ||
| 5367 | case BPF_PROG_TYPE_SOCK_OPS: | 6115 | case BPF_PROG_TYPE_SOCK_OPS: |
| 5368 | case BPF_PROG_TYPE_CGROUP_DEVICE: | 6116 | case BPF_PROG_TYPE_CGROUP_DEVICE: |
| 5369 | case BPF_PROG_TYPE_CGROUP_SYSCTL: | 6117 | case BPF_PROG_TYPE_CGROUP_SYSCTL: |
| 6118 | case BPF_PROG_TYPE_CGROUP_SOCKOPT: | ||
| 5370 | break; | 6119 | break; |
| 5371 | default: | 6120 | default: |
| 5372 | return 0; | 6121 | return 0; |
| @@ -5380,18 +6129,23 @@ static int check_return_code(struct bpf_verifier_env *env) | |||
| 5380 | } | 6129 | } |
| 5381 | 6130 | ||
| 5382 | if (!tnum_in(range, reg->var_off)) { | 6131 | if (!tnum_in(range, reg->var_off)) { |
| 6132 | char tn_buf[48]; | ||
| 6133 | |||
| 5383 | verbose(env, "At program exit the register R0 "); | 6134 | verbose(env, "At program exit the register R0 "); |
| 5384 | if (!tnum_is_unknown(reg->var_off)) { | 6135 | if (!tnum_is_unknown(reg->var_off)) { |
| 5385 | char tn_buf[48]; | ||
| 5386 | |||
| 5387 | tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); | 6136 | tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); |
| 5388 | verbose(env, "has value %s", tn_buf); | 6137 | verbose(env, "has value %s", tn_buf); |
| 5389 | } else { | 6138 | } else { |
| 5390 | verbose(env, "has unknown scalar value"); | 6139 | verbose(env, "has unknown scalar value"); |
| 5391 | } | 6140 | } |
| 5392 | verbose(env, " should have been 0 or 1\n"); | 6141 | tnum_strn(tn_buf, sizeof(tn_buf), range); |
| 6142 | verbose(env, " should have been in %s\n", tn_buf); | ||
| 5393 | return -EINVAL; | 6143 | return -EINVAL; |
| 5394 | } | 6144 | } |
| 6145 | |||
| 6146 | if (!tnum_is_unknown(enforce_attach_type_range) && | ||
| 6147 | tnum_in(enforce_attach_type_range, reg->var_off)) | ||
| 6148 | env->prog->enforce_expected_attach_type = 1; | ||
| 5395 | return 0; | 6149 | return 0; |
| 5396 | } | 6150 | } |
| 5397 | 6151 | ||
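check_return_code() now expresses the allowed exit codes as a tnum range chosen per program and attach type: cgroup UDP recvmsg hooks must return exactly 1, cgroup skb egress programs may return 0..3 (with 2..3 additionally latching enforce_expected_attach_type), and the remaining cgroup types keep 0..1. The sketch below rebuilds just the containment test; tnum_range() and tnum_in() here are local reconstructions of the tnum helpers, not the kernel code:

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct tnum { uint64_t value, mask; };  /* local reconstruction */

static int fls64_(uint64_t x)
{
        int n = 0;

        while (x) { n++; x >>= 1; }
        return n;
}

/* Smallest tracked-bits number covering [min, max]. */
static struct tnum tnum_range(uint64_t min, uint64_t max)
{
        int bits = fls64_(min ^ max);
        uint64_t delta = bits == 64 ? ~0ULL : (1ULL << bits) - 1;

        return (struct tnum){ .value = min & ~delta, .mask = delta };
}

/* Is every value representable by b also representable by a? */
static bool tnum_in(struct tnum a, struct tnum b)
{
        if (b.mask & ~a.mask)
                return false;
        b.value &= ~a.mask;
        return a.value == b.value;
}

static struct tnum tnum_const(uint64_t v)
{
        return (struct tnum){ .value = v, .mask = 0 };
}

int main(void)
{
        struct tnum skb_egress = tnum_range(0, 3);
        struct tnum recvmsg    = tnum_range(1, 1);
        struct tnum dflt       = tnum_range(0, 1);

        printf("R0=2 ok for egress skb:   %d\n", tnum_in(skb_egress, tnum_const(2)));
        printf("R0=2 ok for default case: %d\n", tnum_in(dflt, tnum_const(2)));
        printf("R0=0 ok for udp recvmsg:  %d\n", tnum_in(recvmsg, tnum_const(0)));
        return 0;
}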
| @@ -5435,14 +6189,33 @@ enum { | |||
| 5435 | BRANCH = 2, | 6189 | BRANCH = 2, |
| 5436 | }; | 6190 | }; |
| 5437 | 6191 | ||
| 5438 | #define STATE_LIST_MARK ((struct bpf_verifier_state_list *) -1L) | 6192 | static u32 state_htab_size(struct bpf_verifier_env *env) |
| 6193 | { | ||
| 6194 | return env->prog->len; | ||
| 6195 | } | ||
| 6196 | |||
| 6197 | static struct bpf_verifier_state_list **explored_state( | ||
| 6198 | struct bpf_verifier_env *env, | ||
| 6199 | int idx) | ||
| 6200 | { | ||
| 6201 | struct bpf_verifier_state *cur = env->cur_state; | ||
| 6202 | struct bpf_func_state *state = cur->frame[cur->curframe]; | ||
| 6203 | |||
| 6204 | return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; | ||
| 6205 | } | ||
| 6206 | |||
| 6207 | static void init_explored_state(struct bpf_verifier_env *env, int idx) | ||
| 6208 | { | ||
| 6209 | env->insn_aux_data[idx].prune_point = true; | ||
| 6210 | } | ||
| 5439 | 6211 | ||
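explored_states used to be a flat array indexed by instruction with STATE_LIST_MARK as the prune-point sentinel; it now behaves as a hash table keyed by (insn_idx ^ callsite), so the same instruction reached from different call sites lands in different buckets, and prune points are recorded separately in insn_aux_data. A toy version of the bucket lookup, assuming a fixed program length and a singly linked list of states:

#include <stdio.h>
#include <stdlib.h>

struct state {
        int insn_idx;
        int callsite;
        struct state *next;
};

#define PROG_LEN 16                     /* state_htab_size() stand-in */

static struct state *buckets[PROG_LEN];

static struct state **explored_state(int insn_idx, int callsite)
{
        return &buckets[(insn_idx ^ callsite) % PROG_LEN];
}

static void remember(int insn_idx, int callsite)
{
        struct state **head = explored_state(insn_idx, callsite);
        struct state *st = malloc(sizeof(*st));

        st->insn_idx = insn_idx;
        st->callsite = callsite;
        st->next = *head;
        *head = st;
}

static int count_candidates(int insn_idx, int callsite)
{
        struct state *st;
        int n = 0;

        /* Different states can share a bucket, which is why the new kernel
         * code also compares sl->state.insn_idx and the callsites.
         */
        for (st = *explored_state(insn_idx, callsite); st; st = st->next)
                if (st->insn_idx == insn_idx && st->callsite == callsite)
                        n++;
        return n;
}

int main(void)
{
        remember(5, 0);         /* insn 5 reached from the main program */
        remember(5, 9);         /* insn 5 reached via a call at insn 9  */
        printf("states for (5, 0): %d\n", count_candidates(5, 0));      /* 1 */
        printf("states for (5, 9): %d\n", count_candidates(5, 9));      /* 1 */
        return 0;
}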
| 5440 | /* t, w, e - match pseudo-code above: | 6212 | /* t, w, e - match pseudo-code above: |
| 5441 | * t - index of current instruction | 6213 | * t - index of current instruction |
| 5442 | * w - next instruction | 6214 | * w - next instruction |
| 5443 | * e - edge | 6215 | * e - edge |
| 5444 | */ | 6216 | */ |
| 5445 | static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) | 6217 | static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, |
| 6218 | bool loop_ok) | ||
| 5446 | { | 6219 | { |
| 5447 | int *insn_stack = env->cfg.insn_stack; | 6220 | int *insn_stack = env->cfg.insn_stack; |
| 5448 | int *insn_state = env->cfg.insn_state; | 6221 | int *insn_state = env->cfg.insn_state; |
| @@ -5461,7 +6234,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) | |||
| 5461 | 6234 | ||
| 5462 | if (e == BRANCH) | 6235 | if (e == BRANCH) |
| 5463 | /* mark branch target for state pruning */ | 6236 | /* mark branch target for state pruning */ |
| 5464 | env->explored_states[w] = STATE_LIST_MARK; | 6237 | init_explored_state(env, w); |
| 5465 | 6238 | ||
| 5466 | if (insn_state[w] == 0) { | 6239 | if (insn_state[w] == 0) { |
| 5467 | /* tree-edge */ | 6240 | /* tree-edge */ |
| @@ -5472,6 +6245,8 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env) | |||
| 5472 | insn_stack[env->cfg.cur_stack++] = w; | 6245 | insn_stack[env->cfg.cur_stack++] = w; |
| 5473 | return 1; | 6246 | return 1; |
| 5474 | } else if ((insn_state[w] & 0xF0) == DISCOVERED) { | 6247 | } else if ((insn_state[w] & 0xF0) == DISCOVERED) { |
| 6248 | if (loop_ok && env->allow_ptr_leaks) | ||
| 6249 | return 0; | ||
| 5475 | verbose_linfo(env, t, "%d: ", t); | 6250 | verbose_linfo(env, t, "%d: ", t); |
| 5476 | verbose_linfo(env, w, "%d: ", w); | 6251 | verbose_linfo(env, w, "%d: ", w); |
| 5477 | verbose(env, "back-edge from insn %d to %d\n", t, w); | 6252 | verbose(env, "back-edge from insn %d to %d\n", t, w); |
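push_insn() keeps the usual DFS colouring, but the new loop_ok argument lets a back-edge that was reached through a jump survive for privileged (allow_ptr_leaks) programs instead of failing the load, while call and ordinary fall-through edges keep rejecting loops. A condensed model of that edge check, with the packed insn_state encoding simplified to an enum:

#include <stdio.h>
#include <stdbool.h>

enum color { WHITE, DISCOVERED, EXPLORED };

/* Outcome of visiting edge t -> w, mirroring push_insn()'s return values:
 * 1 = pushed a new node, 0 = nothing to do, -1 = rejected back-edge.
 */
static int visit_edge(enum color *state, int w, bool loop_ok, bool privileged)
{
        if (state[w] == WHITE) {
                state[w] = DISCOVERED;
                return 1;               /* tree edge: keep walking from w */
        }
        if (state[w] == DISCOVERED) {
                if (loop_ok && privileged)
                        return 0;       /* tolerated loop back-edge */
                return -1;              /* "back-edge from insn t to w" */
        }
        return 0;                       /* forward or cross edge */
}

int main(void)
{
        enum color state[4] = { DISCOVERED, DISCOVERED, WHITE, WHITE };

        /* insn 1 jumps back to insn 0, which is still on the DFS stack. */
        printf("unpriv jmp loop: %d\n", visit_edge(state, 0, true, false));
        printf("priv jmp loop:   %d\n", visit_edge(state, 0, true, true));
        printf("call edge loop:  %d\n", visit_edge(state, 0, false, true));
        return 0;
}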
| @@ -5523,16 +6298,17 @@ peek_stack: | |||
| 5523 | if (opcode == BPF_EXIT) { | 6298 | if (opcode == BPF_EXIT) { |
| 5524 | goto mark_explored; | 6299 | goto mark_explored; |
| 5525 | } else if (opcode == BPF_CALL) { | 6300 | } else if (opcode == BPF_CALL) { |
| 5526 | ret = push_insn(t, t + 1, FALLTHROUGH, env); | 6301 | ret = push_insn(t, t + 1, FALLTHROUGH, env, false); |
| 5527 | if (ret == 1) | 6302 | if (ret == 1) |
| 5528 | goto peek_stack; | 6303 | goto peek_stack; |
| 5529 | else if (ret < 0) | 6304 | else if (ret < 0) |
| 5530 | goto err_free; | 6305 | goto err_free; |
| 5531 | if (t + 1 < insn_cnt) | 6306 | if (t + 1 < insn_cnt) |
| 5532 | env->explored_states[t + 1] = STATE_LIST_MARK; | 6307 | init_explored_state(env, t + 1); |
| 5533 | if (insns[t].src_reg == BPF_PSEUDO_CALL) { | 6308 | if (insns[t].src_reg == BPF_PSEUDO_CALL) { |
| 5534 | env->explored_states[t] = STATE_LIST_MARK; | 6309 | init_explored_state(env, t); |
| 5535 | ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); | 6310 | ret = push_insn(t, t + insns[t].imm + 1, BRANCH, |
| 6311 | env, false); | ||
| 5536 | if (ret == 1) | 6312 | if (ret == 1) |
| 5537 | goto peek_stack; | 6313 | goto peek_stack; |
| 5538 | else if (ret < 0) | 6314 | else if (ret < 0) |
| @@ -5545,26 +6321,31 @@ peek_stack: | |||
| 5545 | } | 6321 | } |
| 5546 | /* unconditional jump with single edge */ | 6322 | /* unconditional jump with single edge */ |
| 5547 | ret = push_insn(t, t + insns[t].off + 1, | 6323 | ret = push_insn(t, t + insns[t].off + 1, |
| 5548 | FALLTHROUGH, env); | 6324 | FALLTHROUGH, env, true); |
| 5549 | if (ret == 1) | 6325 | if (ret == 1) |
| 5550 | goto peek_stack; | 6326 | goto peek_stack; |
| 5551 | else if (ret < 0) | 6327 | else if (ret < 0) |
| 5552 | goto err_free; | 6328 | goto err_free; |
| 6329 | /* unconditional jmp is not a good pruning point, | ||
| 6330 | * but it's marked, since backtracking needs | ||
| 6331 | * to record jmp history in is_state_visited(). | ||
| 6332 | */ | ||
| 6333 | init_explored_state(env, t + insns[t].off + 1); | ||
| 5553 | /* tell verifier to check for equivalent states | 6334 | /* tell verifier to check for equivalent states |
| 5554 | * after every call and jump | 6335 | * after every call and jump |
| 5555 | */ | 6336 | */ |
| 5556 | if (t + 1 < insn_cnt) | 6337 | if (t + 1 < insn_cnt) |
| 5557 | env->explored_states[t + 1] = STATE_LIST_MARK; | 6338 | init_explored_state(env, t + 1); |
| 5558 | } else { | 6339 | } else { |
| 5559 | /* conditional jump with two edges */ | 6340 | /* conditional jump with two edges */ |
| 5560 | env->explored_states[t] = STATE_LIST_MARK; | 6341 | init_explored_state(env, t); |
| 5561 | ret = push_insn(t, t + 1, FALLTHROUGH, env); | 6342 | ret = push_insn(t, t + 1, FALLTHROUGH, env, true); |
| 5562 | if (ret == 1) | 6343 | if (ret == 1) |
| 5563 | goto peek_stack; | 6344 | goto peek_stack; |
| 5564 | else if (ret < 0) | 6345 | else if (ret < 0) |
| 5565 | goto err_free; | 6346 | goto err_free; |
| 5566 | 6347 | ||
| 5567 | ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); | 6348 | ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); |
| 5568 | if (ret == 1) | 6349 | if (ret == 1) |
| 5569 | goto peek_stack; | 6350 | goto peek_stack; |
| 5570 | else if (ret < 0) | 6351 | else if (ret < 0) |
| @@ -5574,7 +6355,7 @@ peek_stack: | |||
| 5574 | /* all other non-branch instructions with single | 6355 | /* all other non-branch instructions with single |
| 5575 | * fall-through edge | 6356 | * fall-through edge |
| 5576 | */ | 6357 | */ |
| 5577 | ret = push_insn(t, t + 1, FALLTHROUGH, env); | 6358 | ret = push_insn(t, t + 1, FALLTHROUGH, env, false); |
| 5578 | if (ret == 1) | 6359 | if (ret == 1) |
| 5579 | goto peek_stack; | 6360 | goto peek_stack; |
| 5580 | else if (ret < 0) | 6361 | else if (ret < 0) |
| @@ -6005,12 +6786,12 @@ static void clean_live_states(struct bpf_verifier_env *env, int insn, | |||
| 6005 | struct bpf_verifier_state_list *sl; | 6786 | struct bpf_verifier_state_list *sl; |
| 6006 | int i; | 6787 | int i; |
| 6007 | 6788 | ||
| 6008 | sl = env->explored_states[insn]; | 6789 | sl = *explored_state(env, insn); |
| 6009 | if (!sl) | 6790 | while (sl) { |
| 6010 | return; | 6791 | if (sl->state.branches) |
| 6011 | 6792 | goto next; | |
| 6012 | while (sl != STATE_LIST_MARK) { | 6793 | if (sl->state.insn_idx != insn || |
| 6013 | if (sl->state.curframe != cur->curframe) | 6794 | sl->state.curframe != cur->curframe) |
| 6014 | goto next; | 6795 | goto next; |
| 6015 | for (i = 0; i <= cur->curframe; i++) | 6796 | for (i = 0; i <= cur->curframe; i++) |
| 6016 | if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) | 6797 | if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) |
| @@ -6050,6 +6831,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, | |||
| 6050 | switch (rold->type) { | 6831 | switch (rold->type) { |
| 6051 | case SCALAR_VALUE: | 6832 | case SCALAR_VALUE: |
| 6052 | if (rcur->type == SCALAR_VALUE) { | 6833 | if (rcur->type == SCALAR_VALUE) { |
| 6834 | if (!rold->precise && !rcur->precise) | ||
| 6835 | return true; | ||
| 6053 | /* new val must satisfy old val knowledge */ | 6836 | /* new val must satisfy old val knowledge */ |
| 6054 | return range_within(rold, rcur) && | 6837 | return range_within(rold, rcur) && |
| 6055 | tnum_in(rold->var_off, rcur->var_off); | 6838 | tnum_in(rold->var_off, rcur->var_off); |
| @@ -6122,6 +6905,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, | |||
| 6122 | case PTR_TO_SOCK_COMMON_OR_NULL: | 6905 | case PTR_TO_SOCK_COMMON_OR_NULL: |
| 6123 | case PTR_TO_TCP_SOCK: | 6906 | case PTR_TO_TCP_SOCK: |
| 6124 | case PTR_TO_TCP_SOCK_OR_NULL: | 6907 | case PTR_TO_TCP_SOCK_OR_NULL: |
| 6908 | case PTR_TO_XDP_SOCK: | ||
| 6125 | /* Only valid matches are exact, which memcmp() above | 6909 | /* Only valid matches are exact, which memcmp() above |
| 6126 | * would have accepted | 6910 | * would have accepted |
| 6127 | */ | 6911 | */ |
| @@ -6292,20 +7076,33 @@ static bool states_equal(struct bpf_verifier_env *env, | |||
| 6292 | return true; | 7076 | return true; |
| 6293 | } | 7077 | } |
| 6294 | 7078 | ||
| 7079 | /* Return 0 if no propagation happened. Return a negative error code if an | ||
| 7080 | * error happened. Otherwise, return the propagated bit. | ||
| 7081 | */ | ||
| 6295 | static int propagate_liveness_reg(struct bpf_verifier_env *env, | 7082 | static int propagate_liveness_reg(struct bpf_verifier_env *env, |
| 6296 | struct bpf_reg_state *reg, | 7083 | struct bpf_reg_state *reg, |
| 6297 | struct bpf_reg_state *parent_reg) | 7084 | struct bpf_reg_state *parent_reg) |
| 6298 | { | 7085 | { |
| 7086 | u8 parent_flag = parent_reg->live & REG_LIVE_READ; | ||
| 7087 | u8 flag = reg->live & REG_LIVE_READ; | ||
| 6299 | int err; | 7088 | int err; |
| 6300 | 7089 | ||
| 6301 | if (parent_reg->live & REG_LIVE_READ || !(reg->live & REG_LIVE_READ)) | 7090 | /* When we get here, the read flags of PARENT_REG or REG could be any of |
| 7091 | * REG_LIVE_READ64, REG_LIVE_READ32 or REG_LIVE_NONE. There is no need | ||
| 7092 | * for propagation if PARENT_REG already has the strongest REG_LIVE_READ64. | ||
| 7093 | */ | ||
| 7094 | if (parent_flag == REG_LIVE_READ64 || | ||
| 7095 | /* Or if there is no read flag from REG. */ | ||
| 7096 | !flag || | ||
| 7097 | /* Or if the read flag from REG is the same as PARENT_REG. */ | ||
| 7098 | parent_flag == flag) | ||
| 6302 | return 0; | 7099 | return 0; |
| 6303 | 7100 | ||
| 6304 | err = mark_reg_read(env, reg, parent_reg); | 7101 | err = mark_reg_read(env, reg, parent_reg, flag); |
| 6305 | if (err) | 7102 | if (err) |
| 6306 | return err; | 7103 | return err; |
| 6307 | 7104 | ||
| 6308 | return 0; | 7105 | return flag; |
| 6309 | } | 7106 | } |
| 6310 | 7107 | ||
| 6311 | /* A write screens off any subsequent reads; but write marks come from the | 7108 | /* A write screens off any subsequent reads; but write marks come from the |
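[editor's note] The propagation rule added in propagate_liveness_reg() above can be read as a small standalone predicate: propagate only when the child's read mark is strictly stronger than what the parent already carries. A minimal userspace sketch follows; the REG_LIVE_* values are assumptions chosen for illustration (the real definitions live in include/linux/bpf_verifier.h), only the ordering NONE < READ32 < READ64 matters here.

#include <stdio.h>

/* illustrative flag values; only the relative strength matters */
#define REG_LIVE_NONE   0x0
#define REG_LIVE_READ32 0x1
#define REG_LIVE_READ64 0x2
#define REG_LIVE_READ   (REG_LIVE_READ32 | REG_LIVE_READ64)

/* returns 0 when nothing needs to be propagated, otherwise the bit to propagate */
static int liveness_to_propagate(unsigned char parent_live, unsigned char live)
{
	unsigned char parent_flag = parent_live & REG_LIVE_READ;
	unsigned char flag = live & REG_LIVE_READ;

	if (parent_flag == REG_LIVE_READ64 ||	/* parent already has the strongest mark */
	    !flag ||				/* child carries no read mark at all */
	    parent_flag == flag)		/* nothing new to add */
		return 0;
	return flag;
}

int main(void)
{
	printf("%d\n", liveness_to_propagate(REG_LIVE_NONE, REG_LIVE_READ32));   /* 1: READ32 propagates */
	printf("%d\n", liveness_to_propagate(REG_LIVE_READ32, REG_LIVE_READ64)); /* 2: READ64 upgrades READ32 */
	printf("%d\n", liveness_to_propagate(REG_LIVE_READ64, REG_LIVE_READ32)); /* 0: parent already strongest */
	return 0;
}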
| @@ -6339,8 +7136,10 @@ static int propagate_liveness(struct bpf_verifier_env *env, | |||
| 6339 | for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { | 7136 | for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { |
| 6340 | err = propagate_liveness_reg(env, &state_reg[i], | 7137 | err = propagate_liveness_reg(env, &state_reg[i], |
| 6341 | &parent_reg[i]); | 7138 | &parent_reg[i]); |
| 6342 | if (err) | 7139 | if (err < 0) |
| 6343 | return err; | 7140 | return err; |
| 7141 | if (err == REG_LIVE_READ64) | ||
| 7142 | mark_insn_zext(env, &parent_reg[i]); | ||
| 6344 | } | 7143 | } |
| 6345 | 7144 | ||
| 6346 | /* Propagate stack slots. */ | 7145 | /* Propagate stack slots. */ |
| @@ -6350,32 +7149,132 @@ static int propagate_liveness(struct bpf_verifier_env *env, | |||
| 6350 | state_reg = &state->stack[i].spilled_ptr; | 7149 | state_reg = &state->stack[i].spilled_ptr; |
| 6351 | err = propagate_liveness_reg(env, state_reg, | 7150 | err = propagate_liveness_reg(env, state_reg, |
| 6352 | parent_reg); | 7151 | parent_reg); |
| 6353 | if (err) | 7152 | if (err < 0) |
| 6354 | return err; | 7153 | return err; |
| 6355 | } | 7154 | } |
| 6356 | } | 7155 | } |
| 6357 | return err; | 7156 | return 0; |
| 6358 | } | 7157 | } |
| 6359 | 7158 | ||
| 7159 | /* find precise scalars in the previous equivalent state and | ||
| 7160 | * propagate them into the current state | ||
| 7161 | */ | ||
| 7162 | static int propagate_precision(struct bpf_verifier_env *env, | ||
| 7163 | const struct bpf_verifier_state *old) | ||
| 7164 | { | ||
| 7165 | struct bpf_reg_state *state_reg; | ||
| 7166 | struct bpf_func_state *state; | ||
| 7167 | int i, err = 0; | ||
| 7168 | |||
| 7169 | state = old->frame[old->curframe]; | ||
| 7170 | state_reg = state->regs; | ||
| 7171 | for (i = 0; i < BPF_REG_FP; i++, state_reg++) { | ||
| 7172 | if (state_reg->type != SCALAR_VALUE || | ||
| 7173 | !state_reg->precise) | ||
| 7174 | continue; | ||
| 7175 | if (env->log.level & BPF_LOG_LEVEL2) | ||
| 7176 | verbose(env, "propagating r%d\n", i); | ||
| 7177 | err = mark_chain_precision(env, i); | ||
| 7178 | if (err < 0) | ||
| 7179 | return err; | ||
| 7180 | } | ||
| 7181 | |||
| 7182 | for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { | ||
| 7183 | if (state->stack[i].slot_type[0] != STACK_SPILL) | ||
| 7184 | continue; | ||
| 7185 | state_reg = &state->stack[i].spilled_ptr; | ||
| 7186 | if (state_reg->type != SCALAR_VALUE || | ||
| 7187 | !state_reg->precise) | ||
| 7188 | continue; | ||
| 7189 | if (env->log.level & BPF_LOG_LEVEL2) | ||
| 7190 | verbose(env, "propagating fp%d\n", | ||
| 7191 | (-i - 1) * BPF_REG_SIZE); | ||
| 7192 | err = mark_chain_precision_stack(env, i); | ||
| 7193 | if (err < 0) | ||
| 7194 | return err; | ||
| 7195 | } | ||
| 7196 | return 0; | ||
| 7197 | } | ||
| 7198 | |||
| 7199 | static bool states_maybe_looping(struct bpf_verifier_state *old, | ||
| 7200 | struct bpf_verifier_state *cur) | ||
| 7201 | { | ||
| 7202 | struct bpf_func_state *fold, *fcur; | ||
| 7203 | int i, fr = cur->curframe; | ||
| 7204 | |||
| 7205 | if (old->curframe != fr) | ||
| 7206 | return false; | ||
| 7207 | |||
| 7208 | fold = old->frame[fr]; | ||
| 7209 | fcur = cur->frame[fr]; | ||
| 7210 | for (i = 0; i < MAX_BPF_REG; i++) | ||
| 7211 | if (memcmp(&fold->regs[i], &fcur->regs[i], | ||
| 7212 | offsetof(struct bpf_reg_state, parent))) | ||
| 7213 | return false; | ||
| 7214 | return true; | ||
| 7215 | } | ||
| 7216 | |||
| 7217 | |||
| 6360 | static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) | 7218 | static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) |
| 6361 | { | 7219 | { |
| 6362 | struct bpf_verifier_state_list *new_sl; | 7220 | struct bpf_verifier_state_list *new_sl; |
| 6363 | struct bpf_verifier_state_list *sl, **pprev; | 7221 | struct bpf_verifier_state_list *sl, **pprev; |
| 6364 | struct bpf_verifier_state *cur = env->cur_state, *new; | 7222 | struct bpf_verifier_state *cur = env->cur_state, *new; |
| 6365 | int i, j, err, states_cnt = 0; | 7223 | int i, j, err, states_cnt = 0; |
| 7224 | bool add_new_state = false; | ||
| 6366 | 7225 | ||
| 6367 | pprev = &env->explored_states[insn_idx]; | 7226 | cur->last_insn_idx = env->prev_insn_idx; |
| 6368 | sl = *pprev; | 7227 | if (!env->insn_aux_data[insn_idx].prune_point) |
| 6369 | |||
| 6370 | if (!sl) | ||
| 6371 | /* this 'insn_idx' instruction wasn't marked, so we will not | 7228 | /* this 'insn_idx' instruction wasn't marked, so we will not |
| 6372 | * be doing state search here | 7229 | * be doing state search here |
| 6373 | */ | 7230 | */ |
| 6374 | return 0; | 7231 | return 0; |
| 6375 | 7232 | ||
| 7233 | /* bpf progs typically have a pruning point every 4 instructions | ||
| 7234 | * http://vger.kernel.org/bpfconf2019.html#session-1 | ||
| 7235 | * Do not add new state for future pruning if the verifier hasn't seen | ||
| 7236 | * at least 2 jumps and at least 8 instructions. | ||
| 7237 | * This heuristic helps decrease the 'total_states' and 'peak_states' metrics. | ||
| 7238 | * In tests that amounts to up to a 50% reduction in total verifier | ||
| 7239 | * memory consumption and a 20% verifier time speedup. | ||
| 7240 | */ | ||
| 7241 | if (env->jmps_processed - env->prev_jmps_processed >= 2 && | ||
| 7242 | env->insn_processed - env->prev_insn_processed >= 8) | ||
| 7243 | add_new_state = true; | ||
| 7244 | |||
| 7245 | pprev = explored_state(env, insn_idx); | ||
| 7246 | sl = *pprev; | ||
| 7247 | |||
| 6376 | clean_live_states(env, insn_idx, cur); | 7248 | clean_live_states(env, insn_idx, cur); |
| 6377 | 7249 | ||
| 6378 | while (sl != STATE_LIST_MARK) { | 7250 | while (sl) { |
| 7251 | states_cnt++; | ||
| 7252 | if (sl->state.insn_idx != insn_idx) | ||
| 7253 | goto next; | ||
| 7254 | if (sl->state.branches) { | ||
| 7255 | if (states_maybe_looping(&sl->state, cur) && | ||
| 7256 | states_equal(env, &sl->state, cur)) { | ||
| 7257 | verbose_linfo(env, insn_idx, "; "); | ||
| 7258 | verbose(env, "infinite loop detected at insn %d\n", insn_idx); | ||
| 7259 | return -EINVAL; | ||
| 7260 | } | ||
| 7261 | /* if the verifier is processing a loop, avoid adding new state | ||
| 7262 | * too often, since different loop iterations have distinct | ||
| 7263 | * states and may not help future pruning. | ||
| 7264 | * This threshold shouldn't be too low, to make sure that a | ||
| 7265 | * loop with a large bound is rejected quickly. | ||
| 7266 | * The most abusive loop will be: | ||
| 7267 | * r1 += 1 | ||
| 7268 | * if r1 < 1000000 goto pc-2 | ||
| 7269 | * 1M insn_processed limit / 100 == 10k peak states. | ||
| 7270 | * This threshold shouldn't be too high either, since states | ||
| 7271 | * at the end of the loop are likely to be useful in pruning. | ||
| 7272 | */ | ||
| 7273 | if (env->jmps_processed - env->prev_jmps_processed < 20 && | ||
| 7274 | env->insn_processed - env->prev_insn_processed < 100) | ||
| 7275 | add_new_state = false; | ||
| 7276 | goto miss; | ||
| 7277 | } | ||
| 6379 | if (states_equal(env, &sl->state, cur)) { | 7278 | if (states_equal(env, &sl->state, cur)) { |
| 6380 | sl->hit_cnt++; | 7279 | sl->hit_cnt++; |
| 6381 | /* reached equivalent register/stack state, | 7280 | /* reached equivalent register/stack state, |
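[editor's note] To make the loop-threshold comment above concrete, here is the "most abusive" loop it describes, written out with the kernel's insn macros (a sketch; the prologue, the exit sequence and the use of <linux/filter.h> are assumptions added for completeness). With a new pruning state accepted at most once per ~100 processed instructions, the 1,000,000 insn_processed budget bounds the peak state count at roughly 1,000,000 / 100 = 10,000.

#include <linux/filter.h>

/* r1 += 1; if r1 < 1000000 goto pc-2 -- the loop body from the comment */
static const struct bpf_insn abusive_loop[] = {
	BPF_MOV64_IMM(BPF_REG_1, 0),                  /* r1 = 0 (assumed prologue) */
	BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, 1),         /* r1 += 1 */
	BPF_JMP_IMM(BPF_JLT, BPF_REG_1, 1000000, -2), /* if r1 < 1000000 goto pc-2 */
	BPF_MOV64_IMM(BPF_REG_0, 0),                  /* r0 = 0 (assumed) */
	BPF_EXIT_INSN(),
};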
| @@ -6389,12 +7288,27 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) | |||
| 6389 | * this state and will pop a new one. | 7288 | * this state and will pop a new one. |
| 6390 | */ | 7289 | */ |
| 6391 | err = propagate_liveness(env, &sl->state, cur); | 7290 | err = propagate_liveness(env, &sl->state, cur); |
| 7291 | |||
| 7292 | /* if previous state reached the exit with precision and | ||
| 7293 | * current state is equivalent to it (except precsion marks) | ||
| 7294 | * the precision needs to be propagated back in | ||
| 7295 | * the current state. | ||
| 7296 | */ | ||
| 7297 | err = err ? : push_jmp_history(env, cur); | ||
| 7298 | err = err ? : propagate_precision(env, &sl->state); | ||
| 6392 | if (err) | 7299 | if (err) |
| 6393 | return err; | 7300 | return err; |
| 6394 | return 1; | 7301 | return 1; |
| 6395 | } | 7302 | } |
| 6396 | states_cnt++; | 7303 | miss: |
| 6397 | sl->miss_cnt++; | 7304 | /* When a new state is not going to be added, do not increase the miss count. |
| 7305 | * Otherwise several loop iterations will remove the state | ||
| 7306 | * recorded earlier. The goal of these heuristics is to have | ||
| 7307 | * states from some iterations of the loop (some in the beginning | ||
| 7308 | * and some at the end) to help pruning. | ||
| 7309 | */ | ||
| 7310 | if (add_new_state) | ||
| 7311 | sl->miss_cnt++; | ||
| 6398 | /* heuristic to determine whether this state is beneficial | 7312 | /* heuristic to determine whether this state is beneficial |
| 6399 | * to keep checking from state equivalence point of view. | 7313 | * to keep checking from state equivalence point of view. |
| 6400 | * Higher numbers increase max_states_per_insn and verification time, | 7314 | * Higher numbers increase max_states_per_insn and verification time, |
| @@ -6406,6 +7320,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) | |||
| 6406 | */ | 7320 | */ |
| 6407 | *pprev = sl->next; | 7321 | *pprev = sl->next; |
| 6408 | if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { | 7322 | if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { |
| 7323 | u32 br = sl->state.branches; | ||
| 7324 | |||
| 7325 | WARN_ONCE(br, | ||
| 7326 | "BUG live_done but branches_to_explore %d\n", | ||
| 7327 | br); | ||
| 6409 | free_verifier_state(&sl->state, false); | 7328 | free_verifier_state(&sl->state, false); |
| 6410 | kfree(sl); | 7329 | kfree(sl); |
| 6411 | env->peak_states--; | 7330 | env->peak_states--; |
| @@ -6420,6 +7339,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) | |||
| 6420 | sl = *pprev; | 7339 | sl = *pprev; |
| 6421 | continue; | 7340 | continue; |
| 6422 | } | 7341 | } |
| 7342 | next: | ||
| 6423 | pprev = &sl->next; | 7343 | pprev = &sl->next; |
| 6424 | sl = *pprev; | 7344 | sl = *pprev; |
| 6425 | } | 7345 | } |
| @@ -6428,20 +7348,27 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) | |||
| 6428 | env->max_states_per_insn = states_cnt; | 7348 | env->max_states_per_insn = states_cnt; |
| 6429 | 7349 | ||
| 6430 | if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) | 7350 | if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) |
| 6431 | return 0; | 7351 | return push_jmp_history(env, cur); |
| 6432 | 7352 | ||
| 6433 | /* there were no equivalent states, remember current one. | 7353 | if (!add_new_state) |
| 6434 | * technically the current state is not proven to be safe yet, | 7354 | return push_jmp_history(env, cur); |
| 7355 | |||
| 7356 | /* There were no equivalent states, remember the current one. | ||
| 7357 | * Technically the current state is not proven to be safe yet, | ||
| 6435 | * but it will either reach outermost bpf_exit (which means it's safe) | 7358 | * but it will either reach outermost bpf_exit (which means it's safe) |
| 6436 | * or it will be rejected. Since there are no loops, we won't be | 7359 | * or it will be rejected. When there are no loops the verifier won't be |
| 6437 | * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) | 7360 | * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) |
| 6438 | * again on the way to bpf_exit | 7361 | * again on the way to bpf_exit. |
| 7362 | * When looping the sl->state.branches will be > 0 and this state | ||
| 7363 | * will not be considered for equivalence until branches == 0. | ||
| 6439 | */ | 7364 | */ |
| 6440 | new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); | 7365 | new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); |
| 6441 | if (!new_sl) | 7366 | if (!new_sl) |
| 6442 | return -ENOMEM; | 7367 | return -ENOMEM; |
| 6443 | env->total_states++; | 7368 | env->total_states++; |
| 6444 | env->peak_states++; | 7369 | env->peak_states++; |
| 7370 | env->prev_jmps_processed = env->jmps_processed; | ||
| 7371 | env->prev_insn_processed = env->insn_processed; | ||
| 6445 | 7372 | ||
| 6446 | /* add new state to the head of linked list */ | 7373 | /* add new state to the head of linked list */ |
| 6447 | new = &new_sl->state; | 7374 | new = &new_sl->state; |
| @@ -6451,8 +7378,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) | |||
| 6451 | kfree(new_sl); | 7378 | kfree(new_sl); |
| 6452 | return err; | 7379 | return err; |
| 6453 | } | 7380 | } |
| 6454 | new_sl->next = env->explored_states[insn_idx]; | 7381 | new->insn_idx = insn_idx; |
| 6455 | env->explored_states[insn_idx] = new_sl; | 7382 | WARN_ONCE(new->branches != 1, |
| 7383 | "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); | ||
| 7384 | |||
| 7385 | cur->parent = new; | ||
| 7386 | cur->first_insn_idx = insn_idx; | ||
| 7387 | clear_jmp_history(cur); | ||
| 7388 | new_sl->next = *explored_state(env, insn_idx); | ||
| 7389 | *explored_state(env, insn_idx) = new_sl; | ||
| 6456 | /* connect new state to parentage chain. Current frame needs all | 7390 | /* connect new state to parentage chain. Current frame needs all |
| 6457 | * registers connected. Only r6 - r9 of the callers are alive (pushed | 7391 | * registers connected. Only r6 - r9 of the callers are alive (pushed |
| 6458 | * to the stack implicitly by JITs) so in callers' frames connect just | 7392 | * to the stack implicitly by JITs) so in callers' frames connect just |
| @@ -6460,17 +7394,18 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) | |||
| 6460 | * the state of the call instruction (with WRITTEN set), and r0 comes | 7394 | * the state of the call instruction (with WRITTEN set), and r0 comes |
| 6461 | * from callee with its full parentage chain, anyway. | 7395 | * from callee with its full parentage chain, anyway. |
| 6462 | */ | 7396 | */ |
| 6463 | for (j = 0; j <= cur->curframe; j++) | ||
| 6464 | for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) | ||
| 6465 | cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; | ||
| 6466 | /* clear write marks in current state: the writes we did are not writes | 7397 | /* clear write marks in current state: the writes we did are not writes |
| 6467 | * our child did, so they don't screen off its reads from us. | 7398 | * our child did, so they don't screen off its reads from us. |
| 6468 | * (There are no read marks in current state, because reads always mark | 7399 | * (There are no read marks in current state, because reads always mark |
| 6469 | * their parent and current state never has children yet. Only | 7400 | * their parent and current state never has children yet. Only |
| 6470 | * explored_states can get read marks.) | 7401 | * explored_states can get read marks.) |
| 6471 | */ | 7402 | */ |
| 6472 | for (i = 0; i < BPF_REG_FP; i++) | 7403 | for (j = 0; j <= cur->curframe; j++) { |
| 6473 | cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; | 7404 | for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) |
| 7405 | cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; | ||
| 7406 | for (i = 0; i < BPF_REG_FP; i++) | ||
| 7407 | cur->frame[j]->regs[i].live = REG_LIVE_NONE; | ||
| 7408 | } | ||
| 6474 | 7409 | ||
| 6475 | /* all stack frames are accessible from callee, clear them all */ | 7410 | /* all stack frames are accessible from callee, clear them all */ |
| 6476 | for (j = 0; j <= cur->curframe; j++) { | 7411 | for (j = 0; j <= cur->curframe; j++) { |
| @@ -6497,6 +7432,7 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) | |||
| 6497 | case PTR_TO_SOCK_COMMON_OR_NULL: | 7432 | case PTR_TO_SOCK_COMMON_OR_NULL: |
| 6498 | case PTR_TO_TCP_SOCK: | 7433 | case PTR_TO_TCP_SOCK: |
| 6499 | case PTR_TO_TCP_SOCK_OR_NULL: | 7434 | case PTR_TO_TCP_SOCK_OR_NULL: |
| 7435 | case PTR_TO_XDP_SOCK: | ||
| 6500 | return false; | 7436 | return false; |
| 6501 | default: | 7437 | default: |
| 6502 | return true; | 7438 | return true; |
| @@ -6528,6 +7464,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 6528 | struct bpf_reg_state *regs; | 7464 | struct bpf_reg_state *regs; |
| 6529 | int insn_cnt = env->prog->len; | 7465 | int insn_cnt = env->prog->len; |
| 6530 | bool do_print_state = false; | 7466 | bool do_print_state = false; |
| 7467 | int prev_insn_idx = -1; | ||
| 6531 | 7468 | ||
| 6532 | env->prev_linfo = NULL; | 7469 | env->prev_linfo = NULL; |
| 6533 | 7470 | ||
| @@ -6536,6 +7473,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 6536 | return -ENOMEM; | 7473 | return -ENOMEM; |
| 6537 | state->curframe = 0; | 7474 | state->curframe = 0; |
| 6538 | state->speculative = false; | 7475 | state->speculative = false; |
| 7476 | state->branches = 1; | ||
| 6539 | state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); | 7477 | state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); |
| 6540 | if (!state->frame[0]) { | 7478 | if (!state->frame[0]) { |
| 6541 | kfree(state); | 7479 | kfree(state); |
| @@ -6552,6 +7490,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 6552 | u8 class; | 7490 | u8 class; |
| 6553 | int err; | 7491 | int err; |
| 6554 | 7492 | ||
| 7493 | env->prev_insn_idx = prev_insn_idx; | ||
| 6555 | if (env->insn_idx >= insn_cnt) { | 7494 | if (env->insn_idx >= insn_cnt) { |
| 6556 | verbose(env, "invalid insn idx %d insn_cnt %d\n", | 7495 | verbose(env, "invalid insn idx %d insn_cnt %d\n", |
| 6557 | env->insn_idx, insn_cnt); | 7496 | env->insn_idx, insn_cnt); |
| @@ -6624,6 +7563,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 6624 | 7563 | ||
| 6625 | regs = cur_regs(env); | 7564 | regs = cur_regs(env); |
| 6626 | env->insn_aux_data[env->insn_idx].seen = true; | 7565 | env->insn_aux_data[env->insn_idx].seen = true; |
| 7566 | prev_insn_idx = env->insn_idx; | ||
| 6627 | 7567 | ||
| 6628 | if (class == BPF_ALU || class == BPF_ALU64) { | 7568 | if (class == BPF_ALU || class == BPF_ALU64) { |
| 6629 | err = check_alu_op(env, insn); | 7569 | err = check_alu_op(env, insn); |
| @@ -6742,6 +7682,7 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 6742 | } else if (class == BPF_JMP || class == BPF_JMP32) { | 7682 | } else if (class == BPF_JMP || class == BPF_JMP32) { |
| 6743 | u8 opcode = BPF_OP(insn->code); | 7683 | u8 opcode = BPF_OP(insn->code); |
| 6744 | 7684 | ||
| 7685 | env->jmps_processed++; | ||
| 6745 | if (opcode == BPF_CALL) { | 7686 | if (opcode == BPF_CALL) { |
| 6746 | if (BPF_SRC(insn->code) != BPF_K || | 7687 | if (BPF_SRC(insn->code) != BPF_K || |
| 6747 | insn->off != 0 || | 7688 | insn->off != 0 || |
| @@ -6796,7 +7737,6 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 6796 | 7737 | ||
| 6797 | if (state->curframe) { | 7738 | if (state->curframe) { |
| 6798 | /* exit from nested function */ | 7739 | /* exit from nested function */ |
| 6799 | env->prev_insn_idx = env->insn_idx; | ||
| 6800 | err = prepare_func_exit(env, &env->insn_idx); | 7740 | err = prepare_func_exit(env, &env->insn_idx); |
| 6801 | if (err) | 7741 | if (err) |
| 6802 | return err; | 7742 | return err; |
| @@ -6827,7 +7767,8 @@ static int do_check(struct bpf_verifier_env *env) | |||
| 6827 | if (err) | 7767 | if (err) |
| 6828 | return err; | 7768 | return err; |
| 6829 | process_bpf_exit: | 7769 | process_bpf_exit: |
| 6830 | err = pop_stack(env, &env->prev_insn_idx, | 7770 | update_branch_counts(env, env->cur_state); |
| 7771 | err = pop_stack(env, &prev_insn_idx, | ||
| 6831 | &env->insn_idx); | 7772 | &env->insn_idx); |
| 6832 | if (err < 0) { | 7773 | if (err < 0) { |
| 6833 | if (err != -ENOENT) | 7774 | if (err != -ENOENT) |
| @@ -7130,14 +8071,23 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) | |||
| 7130 | * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying | 8071 | * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying |
| 7131 | * [0, off) and [off, end) to new locations, so the patched range stays zero | 8072 | * [0, off) and [off, end) to new locations, so the patched range stays zero |
| 7132 | */ | 8073 | */ |
| 7133 | static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, | 8074 | static int adjust_insn_aux_data(struct bpf_verifier_env *env, |
| 7134 | u32 off, u32 cnt) | 8075 | struct bpf_prog *new_prog, u32 off, u32 cnt) |
| 7135 | { | 8076 | { |
| 7136 | struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; | 8077 | struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; |
| 8078 | struct bpf_insn *insn = new_prog->insnsi; | ||
| 8079 | u32 prog_len; | ||
| 7137 | int i; | 8080 | int i; |
| 7138 | 8081 | ||
| 8082 | /* aux info at OFF always needs adjustment, no matter whether the fast path | ||
| 8083 | * (cnt == 1) is taken or not. There is no guarantee that INSN at OFF is the | ||
| 8084 | * original insn of the old prog. | ||
| 8085 | */ | ||
| 8086 | old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); | ||
| 8087 | |||
| 7139 | if (cnt == 1) | 8088 | if (cnt == 1) |
| 7140 | return 0; | 8089 | return 0; |
| 8090 | prog_len = new_prog->len; | ||
| 7141 | new_data = vzalloc(array_size(prog_len, | 8091 | new_data = vzalloc(array_size(prog_len, |
| 7142 | sizeof(struct bpf_insn_aux_data))); | 8092 | sizeof(struct bpf_insn_aux_data))); |
| 7143 | if (!new_data) | 8093 | if (!new_data) |
| @@ -7145,8 +8095,10 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, | |||
| 7145 | memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); | 8095 | memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); |
| 7146 | memcpy(new_data + off + cnt - 1, old_data + off, | 8096 | memcpy(new_data + off + cnt - 1, old_data + off, |
| 7147 | sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); | 8097 | sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); |
| 7148 | for (i = off; i < off + cnt - 1; i++) | 8098 | for (i = off; i < off + cnt - 1; i++) { |
| 7149 | new_data[i].seen = true; | 8099 | new_data[i].seen = true; |
| 8100 | new_data[i].zext_dst = insn_has_def32(env, insn + i); | ||
| 8101 | } | ||
| 7150 | env->insn_aux_data = new_data; | 8102 | env->insn_aux_data = new_data; |
| 7151 | vfree(old_data); | 8103 | vfree(old_data); |
| 7152 | return 0; | 8104 | return 0; |
| @@ -7179,7 +8131,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of | |||
| 7179 | env->insn_aux_data[off].orig_idx); | 8131 | env->insn_aux_data[off].orig_idx); |
| 7180 | return NULL; | 8132 | return NULL; |
| 7181 | } | 8133 | } |
| 7182 | if (adjust_insn_aux_data(env, new_prog->len, off, len)) | 8134 | if (adjust_insn_aux_data(env, new_prog, off, len)) |
| 7183 | return NULL; | 8135 | return NULL; |
| 7184 | adjust_subprog_starts(env, off, len); | 8136 | adjust_subprog_starts(env, off, len); |
| 7185 | return new_prog; | 8137 | return new_prog; |
| @@ -7443,6 +8395,84 @@ static int opt_remove_nops(struct bpf_verifier_env *env) | |||
| 7443 | return 0; | 8395 | return 0; |
| 7444 | } | 8396 | } |
| 7445 | 8397 | ||
| 8398 | static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, | ||
| 8399 | const union bpf_attr *attr) | ||
| 8400 | { | ||
| 8401 | struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4]; | ||
| 8402 | struct bpf_insn_aux_data *aux = env->insn_aux_data; | ||
| 8403 | int i, patch_len, delta = 0, len = env->prog->len; | ||
| 8404 | struct bpf_insn *insns = env->prog->insnsi; | ||
| 8405 | struct bpf_prog *new_prog; | ||
| 8406 | bool rnd_hi32; | ||
| 8407 | |||
| 8408 | rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; | ||
| 8409 | zext_patch[1] = BPF_ZEXT_REG(0); | ||
| 8410 | rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0); | ||
| 8411 | rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); | ||
| 8412 | rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX); | ||
| 8413 | for (i = 0; i < len; i++) { | ||
| 8414 | int adj_idx = i + delta; | ||
| 8415 | struct bpf_insn insn; | ||
| 8416 | |||
| 8417 | insn = insns[adj_idx]; | ||
| 8418 | if (!aux[adj_idx].zext_dst) { | ||
| 8419 | u8 code, class; | ||
| 8420 | u32 imm_rnd; | ||
| 8421 | |||
| 8422 | if (!rnd_hi32) | ||
| 8423 | continue; | ||
| 8424 | |||
| 8425 | code = insn.code; | ||
| 8426 | class = BPF_CLASS(code); | ||
| 8427 | if (insn_no_def(&insn)) | ||
| 8428 | continue; | ||
| 8429 | |||
| 8430 | /* NOTE: arg "reg" (the fourth one) is only used for | ||
| 8431 | * BPF_STX, which has been ruled out in the above | ||
| 8432 | * check, so it is safe to pass NULL here. | ||
| 8433 | */ | ||
| 8434 | if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) { | ||
| 8435 | if (class == BPF_LD && | ||
| 8436 | BPF_MODE(code) == BPF_IMM) | ||
| 8437 | i++; | ||
| 8438 | continue; | ||
| 8439 | } | ||
| 8440 | |||
| 8441 | /* ctx load could be transformed into wider load. */ | ||
| 8442 | if (class == BPF_LDX && | ||
| 8443 | aux[adj_idx].ptr_type == PTR_TO_CTX) | ||
| 8444 | continue; | ||
| 8445 | |||
| 8446 | imm_rnd = get_random_int(); | ||
| 8447 | rnd_hi32_patch[0] = insn; | ||
| 8448 | rnd_hi32_patch[1].imm = imm_rnd; | ||
| 8449 | rnd_hi32_patch[3].dst_reg = insn.dst_reg; | ||
| 8450 | patch = rnd_hi32_patch; | ||
| 8451 | patch_len = 4; | ||
| 8452 | goto apply_patch_buffer; | ||
| 8453 | } | ||
| 8454 | |||
| 8455 | if (!bpf_jit_needs_zext()) | ||
| 8456 | continue; | ||
| 8457 | |||
| 8458 | zext_patch[0] = insn; | ||
| 8459 | zext_patch[1].dst_reg = insn.dst_reg; | ||
| 8460 | zext_patch[1].src_reg = insn.dst_reg; | ||
| 8461 | patch = zext_patch; | ||
| 8462 | patch_len = 2; | ||
| 8463 | apply_patch_buffer: | ||
| 8464 | new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len); | ||
| 8465 | if (!new_prog) | ||
| 8466 | return -ENOMEM; | ||
| 8467 | env->prog = new_prog; | ||
| 8468 | insns = new_prog->insnsi; | ||
| 8469 | aux = env->insn_aux_data; | ||
| 8470 | delta += patch_len - 1; | ||
| 8471 | } | ||
| 8472 | |||
| 8473 | return 0; | ||
| 8474 | } | ||
| 8475 | |||
| 7446 | /* convert load instructions that access fields of a context type into a | 8476 | /* convert load instructions that access fields of a context type into a |
| 7447 | * sequence of instructions that access fields of the underlying structure: | 8477 | * sequence of instructions that access fields of the underlying structure: |
| 7448 | * struct __sk_buff -> struct sk_buff | 8478 | * struct __sk_buff -> struct sk_buff |
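[editor's note] To show what the two patch buffers built in opt_subreg_zext_lo32_rnd_hi32() above expand to, here is the result for one 32-bit ALU instruction, written with the same insn macros the function uses. The specific source instruction, the destination register r3 and the 0x1234 constant are assumptions for the example; at run time imm_rnd comes from get_random_int().

#include <linux/filter.h>

/* zext_patch: original sub-register def followed by an explicit zero-extension */
static const struct bpf_insn zext_example[2] = {
	BPF_ALU32_IMM(BPF_ADD, BPF_REG_3, 1),         /* w3 += 1 (original insn, assumed) */
	BPF_ZEXT_REG(BPF_REG_3),                      /* zero-extend r3's low 32 bits */
};

/* rnd_hi32_patch: poison the high 32 bits instead, used with BPF_F_TEST_RND_HI32 */
static const struct bpf_insn rnd_hi32_example[4] = {
	BPF_ALU32_IMM(BPF_ADD, BPF_REG_3, 1),         /* original insn */
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0x1234),   /* AX = imm_rnd (random at run time) */
	BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32),       /* shift the junk into the high half */
	BPF_ALU64_REG(BPF_OR, BPF_REG_3, BPF_REG_AX), /* r3 |= AX: high 32 bits now random */
};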
| @@ -7541,6 +8571,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) | |||
| 7541 | case PTR_TO_TCP_SOCK: | 8571 | case PTR_TO_TCP_SOCK: |
| 7542 | convert_ctx_access = bpf_tcp_sock_convert_ctx_access; | 8572 | convert_ctx_access = bpf_tcp_sock_convert_ctx_access; |
| 7543 | break; | 8573 | break; |
| 8574 | case PTR_TO_XDP_SOCK: | ||
| 8575 | convert_ctx_access = bpf_xdp_sock_convert_ctx_access; | ||
| 8576 | break; | ||
| 7544 | default: | 8577 | default: |
| 7545 | continue; | 8578 | continue; |
| 7546 | } | 8579 | } |
| @@ -8130,16 +9163,15 @@ static void free_states(struct bpf_verifier_env *env) | |||
| 8130 | if (!env->explored_states) | 9163 | if (!env->explored_states) |
| 8131 | return; | 9164 | return; |
| 8132 | 9165 | ||
| 8133 | for (i = 0; i < env->prog->len; i++) { | 9166 | for (i = 0; i < state_htab_size(env); i++) { |
| 8134 | sl = env->explored_states[i]; | 9167 | sl = env->explored_states[i]; |
| 8135 | 9168 | ||
| 8136 | if (sl) | 9169 | while (sl) { |
| 8137 | while (sl != STATE_LIST_MARK) { | 9170 | sln = sl->next; |
| 8138 | sln = sl->next; | 9171 | free_verifier_state(&sl->state, false); |
| 8139 | free_verifier_state(&sl->state, false); | 9172 | kfree(sl); |
| 8140 | kfree(sl); | 9173 | sl = sln; |
| 8141 | sl = sln; | 9174 | } |
| 8142 | } | ||
| 8143 | } | 9175 | } |
| 8144 | 9176 | ||
| 8145 | kvfree(env->explored_states); | 9177 | kvfree(env->explored_states); |
| @@ -8239,7 +9271,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, | |||
| 8239 | goto skip_full_check; | 9271 | goto skip_full_check; |
| 8240 | } | 9272 | } |
| 8241 | 9273 | ||
| 8242 | env->explored_states = kvcalloc(env->prog->len, | 9274 | env->explored_states = kvcalloc(state_htab_size(env), |
| 8243 | sizeof(struct bpf_verifier_state_list *), | 9275 | sizeof(struct bpf_verifier_state_list *), |
| 8244 | GFP_USER); | 9276 | GFP_USER); |
| 8245 | ret = -ENOMEM; | 9277 | ret = -ENOMEM; |
| @@ -8294,6 +9326,15 @@ skip_full_check: | |||
| 8294 | if (ret == 0) | 9326 | if (ret == 0) |
| 8295 | ret = fixup_bpf_calls(env); | 9327 | ret = fixup_bpf_calls(env); |
| 8296 | 9328 | ||
| 9329 | /* do the 32-bit optimization after insn patching is done so that the patched | ||
| 9330 | * insns can be handled correctly. | ||
| 9331 | */ | ||
| 9332 | if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) { | ||
| 9333 | ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); | ||
| 9334 | env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret | ||
| 9335 | : false; | ||
| 9336 | } | ||
| 9337 | |||
| 8297 | if (ret == 0) | 9338 | if (ret == 0) |
| 8298 | ret = fixup_call_args(env); | 9339 | ret = fixup_call_args(env); |
| 8299 | 9340 | ||
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c index 686d244e798d..9bb96ace9fa1 100644 --- a/kernel/bpf/xskmap.c +++ b/kernel/bpf/xskmap.c | |||
| @@ -17,8 +17,8 @@ struct xsk_map { | |||
| 17 | 17 | ||
| 18 | static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) | 18 | static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) |
| 19 | { | 19 | { |
| 20 | int cpu, err = -EINVAL; | ||
| 21 | struct xsk_map *m; | 20 | struct xsk_map *m; |
| 21 | int cpu, err; | ||
| 22 | u64 cost; | 22 | u64 cost; |
| 23 | 23 | ||
| 24 | if (!capable(CAP_NET_ADMIN)) | 24 | if (!capable(CAP_NET_ADMIN)) |
| @@ -37,13 +37,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) | |||
| 37 | 37 | ||
| 38 | cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); | 38 | cost = (u64)m->map.max_entries * sizeof(struct xdp_sock *); |
| 39 | cost += sizeof(struct list_head) * num_possible_cpus(); | 39 | cost += sizeof(struct list_head) * num_possible_cpus(); |
| 40 | if (cost >= U32_MAX - PAGE_SIZE) | ||
| 41 | goto free_m; | ||
| 42 | |||
| 43 | m->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; | ||
| 44 | 40 | ||
| 45 | /* Notice that it returns -EPERM if the map size is larger than the memlock limit */ | 41 | /* Notice that it returns -EPERM if the map size is larger than the memlock limit */ |
| 46 | err = bpf_map_precharge_memlock(m->map.pages); | 42 | err = bpf_map_charge_init(&m->map.memory, cost); |
| 47 | if (err) | 43 | if (err) |
| 48 | goto free_m; | 44 | goto free_m; |
| 49 | 45 | ||
| @@ -51,7 +47,7 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) | |||
| 51 | 47 | ||
| 52 | m->flush_list = alloc_percpu(struct list_head); | 48 | m->flush_list = alloc_percpu(struct list_head); |
| 53 | if (!m->flush_list) | 49 | if (!m->flush_list) |
| 54 | goto free_m; | 50 | goto free_charge; |
| 55 | 51 | ||
| 56 | for_each_possible_cpu(cpu) | 52 | for_each_possible_cpu(cpu) |
| 57 | INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); | 53 | INIT_LIST_HEAD(per_cpu_ptr(m->flush_list, cpu)); |
| @@ -65,6 +61,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr) | |||
| 65 | 61 | ||
| 66 | free_percpu: | 62 | free_percpu: |
| 67 | free_percpu(m->flush_list); | 63 | free_percpu(m->flush_list); |
| 64 | free_charge: | ||
| 65 | bpf_map_charge_finish(&m->map.memory); | ||
| 68 | free_m: | 66 | free_m: |
| 69 | kfree(m); | 67 | kfree(m); |
| 70 | return ERR_PTR(err); | 68 | return ERR_PTR(err); |
| @@ -147,13 +145,18 @@ void __xsk_map_flush(struct bpf_map *map) | |||
| 147 | 145 | ||
| 148 | list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { | 146 | list_for_each_entry_safe(xs, tmp, flush_list, flush_node) { |
| 149 | xsk_flush(xs); | 147 | xsk_flush(xs); |
| 150 | __list_del(xs->flush_node.prev, xs->flush_node.next); | 148 | __list_del_clearprev(&xs->flush_node); |
| 151 | xs->flush_node.prev = NULL; | ||
| 152 | } | 149 | } |
| 153 | } | 150 | } |
| 154 | 151 | ||
| 155 | static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) | 152 | static void *xsk_map_lookup_elem(struct bpf_map *map, void *key) |
| 156 | { | 153 | { |
| 154 | WARN_ON_ONCE(!rcu_read_lock_held()); | ||
| 155 | return __xsk_map_lookup_elem(map, *(u32 *)key); | ||
| 156 | } | ||
| 157 | |||
| 158 | static void *xsk_map_lookup_elem_sys_only(struct bpf_map *map, void *key) | ||
| 159 | { | ||
| 157 | return ERR_PTR(-EOPNOTSUPP); | 160 | return ERR_PTR(-EOPNOTSUPP); |
| 158 | } | 161 | } |
| 159 | 162 | ||
| @@ -220,6 +223,7 @@ const struct bpf_map_ops xsk_map_ops = { | |||
| 220 | .map_free = xsk_map_free, | 223 | .map_free = xsk_map_free, |
| 221 | .map_get_next_key = xsk_map_get_next_key, | 224 | .map_get_next_key = xsk_map_get_next_key, |
| 222 | .map_lookup_elem = xsk_map_lookup_elem, | 225 | .map_lookup_elem = xsk_map_lookup_elem, |
| 226 | .map_lookup_elem_sys_only = xsk_map_lookup_elem_sys_only, | ||
| 223 | .map_update_elem = xsk_map_update_elem, | 227 | .map_update_elem = xsk_map_update_elem, |
| 224 | .map_delete_elem = xsk_map_delete_elem, | 228 | .map_delete_elem = xsk_map_delete_elem, |
| 225 | .map_check_btf = map_check_no_btf, | 229 | .map_check_btf = map_check_no_btf, |
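[editor's note] In the __xsk_map_flush() hunk above, the open-coded unlink-and-clear pair is replaced by __list_del_clearprev(). Judging from the two removed lines, the helper has the following shape (a sketch only; the real definition lives in include/linux/list.h and this assumes its __list_del()), which keeps ->prev == NULL usable as a "not queued on a flush list" marker:

static inline void __list_del_clearprev(struct list_head *entry)
{
	__list_del(entry->prev, entry->next);	/* unlink from the flush list */
	entry->prev = NULL;			/* mark the node as not queued */
}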
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 68ca5de7ec27..88006be40ea3 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | #include "cgroup-internal.h" | 2 | #include "cgroup-internal.h" |
| 2 | 3 | ||
| 3 | #include <linux/ctype.h> | 4 | #include <linux/ctype.h> |
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 327f37c9fdfa..300b0c416341 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c | |||
| @@ -101,7 +101,7 @@ static DEFINE_SPINLOCK(cgroup_idr_lock); | |||
| 101 | */ | 101 | */ |
| 102 | static DEFINE_SPINLOCK(cgroup_file_kn_lock); | 102 | static DEFINE_SPINLOCK(cgroup_file_kn_lock); |
| 103 | 103 | ||
| 104 | struct percpu_rw_semaphore cgroup_threadgroup_rwsem; | 104 | DEFINE_PERCPU_RWSEM(cgroup_threadgroup_rwsem); |
| 105 | 105 | ||
| 106 | #define cgroup_assert_mutex_or_rcu_locked() \ | 106 | #define cgroup_assert_mutex_or_rcu_locked() \ |
| 107 | RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ | 107 | RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ |
| @@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[]; | |||
| 215 | 215 | ||
| 216 | static int cgroup_apply_control(struct cgroup *cgrp); | 216 | static int cgroup_apply_control(struct cgroup *cgrp); |
| 217 | static void cgroup_finalize_control(struct cgroup *cgrp, int ret); | 217 | static void cgroup_finalize_control(struct cgroup *cgrp, int ret); |
| 218 | static void css_task_iter_advance(struct css_task_iter *it); | 218 | static void css_task_iter_skip(struct css_task_iter *it, |
| 219 | struct task_struct *task); | ||
| 219 | static int cgroup_destroy_locked(struct cgroup *cgrp); | 220 | static int cgroup_destroy_locked(struct cgroup *cgrp); |
| 220 | static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, | 221 | static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, |
| 221 | struct cgroup_subsys *ss); | 222 | struct cgroup_subsys *ss); |
| @@ -738,6 +739,7 @@ struct css_set init_css_set = { | |||
| 738 | .dom_cset = &init_css_set, | 739 | .dom_cset = &init_css_set, |
| 739 | .tasks = LIST_HEAD_INIT(init_css_set.tasks), | 740 | .tasks = LIST_HEAD_INIT(init_css_set.tasks), |
| 740 | .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), | 741 | .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), |
| 742 | .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), | ||
| 741 | .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), | 743 | .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), |
| 742 | .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), | 744 | .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), |
| 743 | .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), | 745 | .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), |
| @@ -843,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated) | |||
| 843 | cgroup_update_populated(link->cgrp, populated); | 845 | cgroup_update_populated(link->cgrp, populated); |
| 844 | } | 846 | } |
| 845 | 847 | ||
| 848 | /* | ||
| 849 | * @task is leaving, advance task iterators which are pointing to it so | ||
| 850 | * that they can resume at the next position. Advancing an iterator might | ||
| 851 | * remove it from the list, use safe walk. See css_task_iter_skip() for | ||
| 852 | * details. | ||
| 853 | */ | ||
| 854 | static void css_set_skip_task_iters(struct css_set *cset, | ||
| 855 | struct task_struct *task) | ||
| 856 | { | ||
| 857 | struct css_task_iter *it, *pos; | ||
| 858 | |||
| 859 | list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) | ||
| 860 | css_task_iter_skip(it, task); | ||
| 861 | } | ||
| 862 | |||
| 846 | /** | 863 | /** |
| 847 | * css_set_move_task - move a task from one css_set to another | 864 | * css_set_move_task - move a task from one css_set to another |
| 848 | * @task: task being moved | 865 | * @task: task being moved |
| @@ -868,22 +885,9 @@ static void css_set_move_task(struct task_struct *task, | |||
| 868 | css_set_update_populated(to_cset, true); | 885 | css_set_update_populated(to_cset, true); |
| 869 | 886 | ||
| 870 | if (from_cset) { | 887 | if (from_cset) { |
| 871 | struct css_task_iter *it, *pos; | ||
| 872 | |||
| 873 | WARN_ON_ONCE(list_empty(&task->cg_list)); | 888 | WARN_ON_ONCE(list_empty(&task->cg_list)); |
| 874 | 889 | ||
| 875 | /* | 890 | css_set_skip_task_iters(from_cset, task); |
| 876 | * @task is leaving, advance task iterators which are | ||
| 877 | * pointing to it so that they can resume at the next | ||
| 878 | * position. Advancing an iterator might remove it from | ||
| 879 | * the list, use safe walk. See css_task_iter_advance*() | ||
| 880 | * for details. | ||
| 881 | */ | ||
| 882 | list_for_each_entry_safe(it, pos, &from_cset->task_iters, | ||
| 883 | iters_node) | ||
| 884 | if (it->task_pos == &task->cg_list) | ||
| 885 | css_task_iter_advance(it); | ||
| 886 | |||
| 887 | list_del_init(&task->cg_list); | 891 | list_del_init(&task->cg_list); |
| 888 | if (!css_set_populated(from_cset)) | 892 | if (!css_set_populated(from_cset)) |
| 889 | css_set_update_populated(from_cset, false); | 893 | css_set_update_populated(from_cset, false); |
| @@ -1210,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, | |||
| 1210 | cset->dom_cset = cset; | 1214 | cset->dom_cset = cset; |
| 1211 | INIT_LIST_HEAD(&cset->tasks); | 1215 | INIT_LIST_HEAD(&cset->tasks); |
| 1212 | INIT_LIST_HEAD(&cset->mg_tasks); | 1216 | INIT_LIST_HEAD(&cset->mg_tasks); |
| 1217 | INIT_LIST_HEAD(&cset->dying_tasks); | ||
| 1213 | INIT_LIST_HEAD(&cset->task_iters); | 1218 | INIT_LIST_HEAD(&cset->task_iters); |
| 1214 | INIT_LIST_HEAD(&cset->threaded_csets); | 1219 | INIT_LIST_HEAD(&cset->threaded_csets); |
| 1215 | INIT_HLIST_NODE(&cset->hlist); | 1220 | INIT_HLIST_NODE(&cset->hlist); |
| @@ -1810,11 +1815,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, | |||
| 1810 | 1815 | ||
| 1811 | enum cgroup2_param { | 1816 | enum cgroup2_param { |
| 1812 | Opt_nsdelegate, | 1817 | Opt_nsdelegate, |
| 1818 | Opt_memory_localevents, | ||
| 1813 | nr__cgroup2_params | 1819 | nr__cgroup2_params |
| 1814 | }; | 1820 | }; |
| 1815 | 1821 | ||
| 1816 | static const struct fs_parameter_spec cgroup2_param_specs[] = { | 1822 | static const struct fs_parameter_spec cgroup2_param_specs[] = { |
| 1817 | fsparam_flag ("nsdelegate", Opt_nsdelegate), | 1823 | fsparam_flag("nsdelegate", Opt_nsdelegate), |
| 1824 | fsparam_flag("memory_localevents", Opt_memory_localevents), | ||
| 1818 | {} | 1825 | {} |
| 1819 | }; | 1826 | }; |
| 1820 | 1827 | ||
| @@ -1837,6 +1844,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param | |||
| 1837 | case Opt_nsdelegate: | 1844 | case Opt_nsdelegate: |
| 1838 | ctx->flags |= CGRP_ROOT_NS_DELEGATE; | 1845 | ctx->flags |= CGRP_ROOT_NS_DELEGATE; |
| 1839 | return 0; | 1846 | return 0; |
| 1847 | case Opt_memory_localevents: | ||
| 1848 | ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; | ||
| 1849 | return 0; | ||
| 1840 | } | 1850 | } |
| 1841 | return -EINVAL; | 1851 | return -EINVAL; |
| 1842 | } | 1852 | } |
| @@ -1848,6 +1858,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) | |||
| 1848 | cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; | 1858 | cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; |
| 1849 | else | 1859 | else |
| 1850 | cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; | 1860 | cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; |
| 1861 | |||
| 1862 | if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) | ||
| 1863 | cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; | ||
| 1864 | else | ||
| 1865 | cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; | ||
| 1851 | } | 1866 | } |
| 1852 | } | 1867 | } |
| 1853 | 1868 | ||
| @@ -1855,6 +1870,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root | |||
| 1855 | { | 1870 | { |
| 1856 | if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) | 1871 | if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) |
| 1857 | seq_puts(seq, ",nsdelegate"); | 1872 | seq_puts(seq, ",nsdelegate"); |
| 1873 | if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) | ||
| 1874 | seq_puts(seq, ",memory_localevents"); | ||
| 1858 | return 0; | 1875 | return 0; |
| 1859 | } | 1876 | } |
| 1860 | 1877 | ||
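[editor's note] The new memory_localevents option is selected at mount or remount time just like the existing nsdelegate flag. A minimal userspace sketch of passing it follows; the mount point path and the use of a plain remount are assumptions, and the call needs privilege over an already mounted cgroup2 hierarchy.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* remount the cgroup2 hierarchy with both flags enabled */
	if (mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT,
		  "nsdelegate,memory_localevents") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}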
| @@ -3540,17 +3557,84 @@ static int cpu_stat_show(struct seq_file *seq, void *v) | |||
| 3540 | #ifdef CONFIG_PSI | 3557 | #ifdef CONFIG_PSI |
| 3541 | static int cgroup_io_pressure_show(struct seq_file *seq, void *v) | 3558 | static int cgroup_io_pressure_show(struct seq_file *seq, void *v) |
| 3542 | { | 3559 | { |
| 3543 | return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); | 3560 | struct cgroup *cgroup = seq_css(seq)->cgroup; |
| 3561 | struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; | ||
| 3562 | |||
| 3563 | return psi_show(seq, psi, PSI_IO); | ||
| 3544 | } | 3564 | } |
| 3545 | static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) | 3565 | static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) |
| 3546 | { | 3566 | { |
| 3547 | return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); | 3567 | struct cgroup *cgroup = seq_css(seq)->cgroup; |
| 3568 | struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; | ||
| 3569 | |||
| 3570 | return psi_show(seq, psi, PSI_MEM); | ||
| 3548 | } | 3571 | } |
| 3549 | static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) | 3572 | static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) |
| 3550 | { | 3573 | { |
| 3551 | return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); | 3574 | struct cgroup *cgroup = seq_css(seq)->cgroup; |
| 3575 | struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; | ||
| 3576 | |||
| 3577 | return psi_show(seq, psi, PSI_CPU); | ||
| 3578 | } | ||
| 3579 | |||
| 3580 | static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, | ||
| 3581 | size_t nbytes, enum psi_res res) | ||
| 3582 | { | ||
| 3583 | struct psi_trigger *new; | ||
| 3584 | struct cgroup *cgrp; | ||
| 3585 | |||
| 3586 | cgrp = cgroup_kn_lock_live(of->kn, false); | ||
| 3587 | if (!cgrp) | ||
| 3588 | return -ENODEV; | ||
| 3589 | |||
| 3590 | cgroup_get(cgrp); | ||
| 3591 | cgroup_kn_unlock(of->kn); | ||
| 3592 | |||
| 3593 | new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); | ||
| 3594 | if (IS_ERR(new)) { | ||
| 3595 | cgroup_put(cgrp); | ||
| 3596 | return PTR_ERR(new); | ||
| 3597 | } | ||
| 3598 | |||
| 3599 | psi_trigger_replace(&of->priv, new); | ||
| 3600 | |||
| 3601 | cgroup_put(cgrp); | ||
| 3602 | |||
| 3603 | return nbytes; | ||
| 3604 | } | ||
| 3605 | |||
| 3606 | static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, | ||
| 3607 | char *buf, size_t nbytes, | ||
| 3608 | loff_t off) | ||
| 3609 | { | ||
| 3610 | return cgroup_pressure_write(of, buf, nbytes, PSI_IO); | ||
| 3552 | } | 3611 | } |
| 3553 | #endif | 3612 | |
| 3613 | static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, | ||
| 3614 | char *buf, size_t nbytes, | ||
| 3615 | loff_t off) | ||
| 3616 | { | ||
| 3617 | return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); | ||
| 3618 | } | ||
| 3619 | |||
| 3620 | static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, | ||
| 3621 | char *buf, size_t nbytes, | ||
| 3622 | loff_t off) | ||
| 3623 | { | ||
| 3624 | return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); | ||
| 3625 | } | ||
| 3626 | |||
| 3627 | static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, | ||
| 3628 | poll_table *pt) | ||
| 3629 | { | ||
| 3630 | return psi_trigger_poll(&of->priv, of->file, pt); | ||
| 3631 | } | ||
| 3632 | |||
| 3633 | static void cgroup_pressure_release(struct kernfs_open_file *of) | ||
| 3634 | { | ||
| 3635 | psi_trigger_replace(&of->priv, NULL); | ||
| 3636 | } | ||
| 3637 | #endif /* CONFIG_PSI */ | ||
| 3554 | 3638 | ||
| 3555 | static int cgroup_freeze_show(struct seq_file *seq, void *v) | 3639 | static int cgroup_freeze_show(struct seq_file *seq, void *v) |
| 3556 | { | 3640 | { |
| @@ -4142,6 +4226,7 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, | |||
| 4142 | 4226 | ||
| 4143 | return NULL; | 4227 | return NULL; |
| 4144 | } | 4228 | } |
| 4229 | EXPORT_SYMBOL_GPL(css_next_descendant_pre); | ||
| 4145 | 4230 | ||
| 4146 | /** | 4231 | /** |
| 4147 | * css_rightmost_descendant - return the rightmost descendant of a css | 4232 | * css_rightmost_descendant - return the rightmost descendant of a css |
| @@ -4329,15 +4414,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) | |||
| 4329 | it->task_pos = NULL; | 4414 | it->task_pos = NULL; |
| 4330 | return; | 4415 | return; |
| 4331 | } | 4416 | } |
| 4332 | } while (!css_set_populated(cset)); | 4417 | } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); |
| 4333 | 4418 | ||
| 4334 | if (!list_empty(&cset->tasks)) | 4419 | if (!list_empty(&cset->tasks)) |
| 4335 | it->task_pos = cset->tasks.next; | 4420 | it->task_pos = cset->tasks.next; |
| 4336 | else | 4421 | else if (!list_empty(&cset->mg_tasks)) |
| 4337 | it->task_pos = cset->mg_tasks.next; | 4422 | it->task_pos = cset->mg_tasks.next; |
| 4423 | else | ||
| 4424 | it->task_pos = cset->dying_tasks.next; | ||
| 4338 | 4425 | ||
| 4339 | it->tasks_head = &cset->tasks; | 4426 | it->tasks_head = &cset->tasks; |
| 4340 | it->mg_tasks_head = &cset->mg_tasks; | 4427 | it->mg_tasks_head = &cset->mg_tasks; |
| 4428 | it->dying_tasks_head = &cset->dying_tasks; | ||
| 4341 | 4429 | ||
| 4342 | /* | 4430 | /* |
| 4343 | * We don't keep css_sets locked across iteration steps and thus | 4431 | * We don't keep css_sets locked across iteration steps and thus |
| @@ -4363,9 +4451,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) | |||
| 4363 | list_add(&it->iters_node, &cset->task_iters); | 4451 | list_add(&it->iters_node, &cset->task_iters); |
| 4364 | } | 4452 | } |
| 4365 | 4453 | ||
| 4454 | static void css_task_iter_skip(struct css_task_iter *it, | ||
| 4455 | struct task_struct *task) | ||
| 4456 | { | ||
| 4457 | lockdep_assert_held(&css_set_lock); | ||
| 4458 | |||
| 4459 | if (it->task_pos == &task->cg_list) { | ||
| 4460 | it->task_pos = it->task_pos->next; | ||
| 4461 | it->flags |= CSS_TASK_ITER_SKIPPED; | ||
| 4462 | } | ||
| 4463 | } | ||
| 4464 | |||
| 4366 | static void css_task_iter_advance(struct css_task_iter *it) | 4465 | static void css_task_iter_advance(struct css_task_iter *it) |
| 4367 | { | 4466 | { |
| 4368 | struct list_head *next; | 4467 | struct task_struct *task; |
| 4369 | 4468 | ||
| 4370 | lockdep_assert_held(&css_set_lock); | 4469 | lockdep_assert_held(&css_set_lock); |
| 4371 | repeat: | 4470 | repeat: |
| @@ -4375,25 +4474,40 @@ repeat: | |||
| 4375 | * consumed first and then ->mg_tasks. After ->mg_tasks, | 4474 | * consumed first and then ->mg_tasks. After ->mg_tasks, |
| 4376 | * we move onto the next cset. | 4475 | * we move onto the next cset. |
| 4377 | */ | 4476 | */ |
| 4378 | next = it->task_pos->next; | 4477 | if (it->flags & CSS_TASK_ITER_SKIPPED) |
| 4379 | 4478 | it->flags &= ~CSS_TASK_ITER_SKIPPED; | |
| 4380 | if (next == it->tasks_head) | 4479 | else |
| 4381 | next = it->mg_tasks_head->next; | 4480 | it->task_pos = it->task_pos->next; |
| 4382 | 4481 | ||
| 4383 | if (next == it->mg_tasks_head) | 4482 | if (it->task_pos == it->tasks_head) |
| 4483 | it->task_pos = it->mg_tasks_head->next; | ||
| 4484 | if (it->task_pos == it->mg_tasks_head) | ||
| 4485 | it->task_pos = it->dying_tasks_head->next; | ||
| 4486 | if (it->task_pos == it->dying_tasks_head) | ||
| 4384 | css_task_iter_advance_css_set(it); | 4487 | css_task_iter_advance_css_set(it); |
| 4385 | else | ||
| 4386 | it->task_pos = next; | ||
| 4387 | } else { | 4488 | } else { |
| 4388 | /* called from start, proceed to the first cset */ | 4489 | /* called from start, proceed to the first cset */ |
| 4389 | css_task_iter_advance_css_set(it); | 4490 | css_task_iter_advance_css_set(it); |
| 4390 | } | 4491 | } |
| 4391 | 4492 | ||
| 4392 | /* if PROCS, skip over tasks which aren't group leaders */ | 4493 | if (!it->task_pos) |
| 4393 | if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && | 4494 | return; |
| 4394 | !thread_group_leader(list_entry(it->task_pos, struct task_struct, | 4495 | |
| 4395 | cg_list))) | 4496 | task = list_entry(it->task_pos, struct task_struct, cg_list); |
| 4396 | goto repeat; | 4497 | |
| 4498 | if (it->flags & CSS_TASK_ITER_PROCS) { | ||
| 4499 | /* if PROCS, skip over tasks which aren't group leaders */ | ||
| 4500 | if (!thread_group_leader(task)) | ||
| 4501 | goto repeat; | ||
| 4502 | |||
| 4503 | /* and dying leaders w/o live member threads */ | ||
| 4504 | if (!atomic_read(&task->signal->live)) | ||
| 4505 | goto repeat; | ||
| 4506 | } else { | ||
| 4507 | /* skip all dying ones */ | ||
| 4508 | if (task->flags & PF_EXITING) | ||
| 4509 | goto repeat; | ||
| 4510 | } | ||
| 4397 | } | 4511 | } |
| 4398 | 4512 | ||
| 4399 | /** | 4513 | /** |
| @@ -4449,6 +4563,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) | |||
| 4449 | 4563 | ||
| 4450 | spin_lock_irq(&css_set_lock); | 4564 | spin_lock_irq(&css_set_lock); |
| 4451 | 4565 | ||
| 4566 | /* @it may be half-advanced by skips, finish advancing */ | ||
| 4567 | if (it->flags & CSS_TASK_ITER_SKIPPED) | ||
| 4568 | css_task_iter_advance(it); | ||
| 4569 | |||
| 4452 | if (it->task_pos) { | 4570 | if (it->task_pos) { |
| 4453 | it->cur_task = list_entry(it->task_pos, struct task_struct, | 4571 | it->cur_task = list_entry(it->task_pos, struct task_struct, |
| 4454 | cg_list); | 4572 | cg_list); |
| @@ -4743,20 +4861,26 @@ static struct cftype cgroup_base_files[] = { | |||
| 4743 | #ifdef CONFIG_PSI | 4861 | #ifdef CONFIG_PSI |
| 4744 | { | 4862 | { |
| 4745 | .name = "io.pressure", | 4863 | .name = "io.pressure", |
| 4746 | .flags = CFTYPE_NOT_ON_ROOT, | ||
| 4747 | .seq_show = cgroup_io_pressure_show, | 4864 | .seq_show = cgroup_io_pressure_show, |
| 4865 | .write = cgroup_io_pressure_write, | ||
| 4866 | .poll = cgroup_pressure_poll, | ||
| 4867 | .release = cgroup_pressure_release, | ||
| 4748 | }, | 4868 | }, |
| 4749 | { | 4869 | { |
| 4750 | .name = "memory.pressure", | 4870 | .name = "memory.pressure", |
| 4751 | .flags = CFTYPE_NOT_ON_ROOT, | ||
| 4752 | .seq_show = cgroup_memory_pressure_show, | 4871 | .seq_show = cgroup_memory_pressure_show, |
| 4872 | .write = cgroup_memory_pressure_write, | ||
| 4873 | .poll = cgroup_pressure_poll, | ||
| 4874 | .release = cgroup_pressure_release, | ||
| 4753 | }, | 4875 | }, |
| 4754 | { | 4876 | { |
| 4755 | .name = "cpu.pressure", | 4877 | .name = "cpu.pressure", |
| 4756 | .flags = CFTYPE_NOT_ON_ROOT, | ||
| 4757 | .seq_show = cgroup_cpu_pressure_show, | 4878 | .seq_show = cgroup_cpu_pressure_show, |
| 4879 | .write = cgroup_cpu_pressure_write, | ||
| 4880 | .poll = cgroup_pressure_poll, | ||
| 4881 | .release = cgroup_pressure_release, | ||
| 4758 | }, | 4882 | }, |
| 4759 | #endif | 4883 | #endif /* CONFIG_PSI */ |
| 4760 | { } /* terminate */ | 4884 | { } /* terminate */ |
| 4761 | }; | 4885 | }; |
| 4762 | 4886 | ||
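[editor's note] With .write, .poll and .release wired up above, each per-cgroup pressure file accepts the same trigger format as the system-wide /proc/pressure/* interface: "<some|full> <stall threshold in us> <window in us>". A hedged userspace sketch that registers a trigger on an assumed cgroup path and waits for it:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* wake up when memory stalls exceed 150ms within any 1s window */
	const char trig[] = "some 150000 1000000";
	struct pollfd fds;

	fds.fd = open("/sys/fs/cgroup/test/memory.pressure", O_RDWR | O_NONBLOCK);
	if (fds.fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fds.fd, trig, strlen(trig) + 1) < 0) {
		perror("write trigger");
		return 1;
	}
	fds.events = POLLPRI;

	for (;;) {
		if (poll(&fds, 1, -1) < 0) {
			perror("poll");
			return 1;
		}
		if (fds.revents & POLLERR) {
			fprintf(stderr, "cgroup is gone\n");
			return 0;
		}
		if (fds.revents & POLLPRI)
			printf("memory pressure threshold crossed\n");
	}
}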
| @@ -4882,8 +5006,6 @@ static void css_release_work_fn(struct work_struct *work) | |||
| 4882 | if (cgrp->kn) | 5006 | if (cgrp->kn) |
| 4883 | RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, | 5007 | RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, |
| 4884 | NULL); | 5008 | NULL); |
| 4885 | |||
| 4886 | cgroup_bpf_put(cgrp); | ||
| 4887 | } | 5009 | } |
| 4888 | 5010 | ||
| 4889 | mutex_unlock(&cgroup_mutex); | 5011 | mutex_unlock(&cgroup_mutex); |
| @@ -5409,6 +5531,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) | |||
| 5409 | 5531 | ||
| 5410 | cgroup1_check_for_release(parent); | 5532 | cgroup1_check_for_release(parent); |
| 5411 | 5533 | ||
| 5534 | cgroup_bpf_offline(cgrp); | ||
| 5535 | |||
| 5412 | /* put the base reference */ | 5536 | /* put the base reference */ |
| 5413 | percpu_ref_kill(&cgrp->self.refcnt); | 5537 | percpu_ref_kill(&cgrp->self.refcnt); |
| 5414 | 5538 | ||
| @@ -5543,7 +5667,6 @@ int __init cgroup_init(void) | |||
| 5543 | int ssid; | 5667 | int ssid; |
| 5544 | 5668 | ||
| 5545 | BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); | 5669 | BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); |
| 5546 | BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); | ||
| 5547 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); | 5670 | BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); |
| 5548 | BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); | 5671 | BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); |
| 5549 | 5672 | ||
| @@ -5924,6 +6047,7 @@ void cgroup_exit(struct task_struct *tsk) | |||
| 5924 | if (!list_empty(&tsk->cg_list)) { | 6047 | if (!list_empty(&tsk->cg_list)) { |
| 5925 | spin_lock_irq(&css_set_lock); | 6048 | spin_lock_irq(&css_set_lock); |
| 5926 | css_set_move_task(tsk, cset, NULL, false); | 6049 | css_set_move_task(tsk, cset, NULL, false); |
| 6050 | list_add_tail(&tsk->cg_list, &cset->dying_tasks); | ||
| 5927 | cset->nr_tasks--; | 6051 | cset->nr_tasks--; |
| 5928 | 6052 | ||
| 5929 | WARN_ON_ONCE(cgroup_task_frozen(tsk)); | 6053 | WARN_ON_ONCE(cgroup_task_frozen(tsk)); |
| @@ -5949,6 +6073,13 @@ void cgroup_release(struct task_struct *task) | |||
| 5949 | do_each_subsys_mask(ss, ssid, have_release_callback) { | 6073 | do_each_subsys_mask(ss, ssid, have_release_callback) { |
| 5950 | ss->release(task); | 6074 | ss->release(task); |
| 5951 | } while_each_subsys_mask(); | 6075 | } while_each_subsys_mask(); |
| 6076 | |||
| 6077 | if (use_task_css_set_links) { | ||
| 6078 | spin_lock_irq(&css_set_lock); | ||
| 6079 | css_set_skip_task_iters(task_css_set(task), task); | ||
| 6080 | list_del_init(&task->cg_list); | ||
| 6081 | spin_unlock_irq(&css_set_lock); | ||
| 6082 | } | ||
| 5952 | } | 6083 | } |
| 5953 | 6084 | ||
| 5954 | void cgroup_free(struct task_struct *task) | 6085 | void cgroup_free(struct task_struct *task) |
| @@ -6110,6 +6241,48 @@ struct cgroup *cgroup_get_from_fd(int fd) | |||
| 6110 | } | 6241 | } |
| 6111 | EXPORT_SYMBOL_GPL(cgroup_get_from_fd); | 6242 | EXPORT_SYMBOL_GPL(cgroup_get_from_fd); |
| 6112 | 6243 | ||
| 6244 | static u64 power_of_ten(int power) | ||
| 6245 | { | ||
| 6246 | u64 v = 1; | ||
| 6247 | while (power--) | ||
| 6248 | v *= 10; | ||
| 6249 | return v; | ||
| 6250 | } | ||
| 6251 | |||
| 6252 | /** | ||
| 6253 | * cgroup_parse_float - parse a floating number | ||
| 6254 | * @input: input string | ||
| 6255 | * @dec_shift: number of decimal digits to shift | ||
| 6256 | * @v: output | ||
| 6257 | * | ||
| 6258 | * Parse a decimal floating point number in @input and store the result in | ||
| 6259 | * @v with decimal point right shifted @dec_shift times. For example, if | ||
| 6260 | * @input is "12.3456" and @dec_shift is 3, *@v will be set to 12345. | ||
| 6261 | * Returns 0 on success, -errno otherwise. | ||
| 6262 | * | ||
| 6263 | * There's nothing cgroup specific about this function except that it's | ||
| 6264 | * currently the only user. | ||
| 6265 | */ | ||
| 6266 | int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v) | ||
| 6267 | { | ||
| 6268 | s64 whole, frac = 0; | ||
| 6269 | int fstart = 0, fend = 0, flen; | ||
| 6270 | |||
| 6271 | if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend)) | ||
| 6272 | return -EINVAL; | ||
| 6273 | if (frac < 0) | ||
| 6274 | return -EINVAL; | ||
| 6275 | |||
| 6276 | flen = fend > fstart ? fend - fstart : 0; | ||
| 6277 | if (flen < dec_shift) | ||
| 6278 | frac *= power_of_ten(dec_shift - flen); | ||
| 6279 | else | ||
| 6280 | frac = DIV_ROUND_CLOSEST_ULL(frac, power_of_ten(flen - dec_shift)); | ||
| 6281 | |||
| 6282 | *v = whole * power_of_ten(dec_shift) + frac; | ||
| 6283 | return 0; | ||
| 6284 | } | ||
| 6285 | |||
| 6113 | /* | 6286 | /* |
| 6114 | * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data | 6287 | * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data |
| 6115 | * definition in cgroup-defs.h. | 6288 | * definition in cgroup-defs.h. |
| @@ -6148,6 +6321,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) | |||
| 6148 | * Don't use cgroup_get_live(). | 6321 | * Don't use cgroup_get_live(). |
| 6149 | */ | 6322 | */ |
| 6150 | cgroup_get(sock_cgroup_ptr(skcd)); | 6323 | cgroup_get(sock_cgroup_ptr(skcd)); |
| 6324 | cgroup_bpf_get(sock_cgroup_ptr(skcd)); | ||
| 6151 | return; | 6325 | return; |
| 6152 | } | 6326 | } |
| 6153 | 6327 | ||
| @@ -6159,6 +6333,7 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) | |||
| 6159 | cset = task_css_set(current); | 6333 | cset = task_css_set(current); |
| 6160 | if (likely(cgroup_tryget(cset->dfl_cgrp))) { | 6334 | if (likely(cgroup_tryget(cset->dfl_cgrp))) { |
| 6161 | skcd->val = (unsigned long)cset->dfl_cgrp; | 6335 | skcd->val = (unsigned long)cset->dfl_cgrp; |
| 6336 | cgroup_bpf_get(cset->dfl_cgrp); | ||
| 6162 | break; | 6337 | break; |
| 6163 | } | 6338 | } |
| 6164 | cpu_relax(); | 6339 | cpu_relax(); |
| @@ -6169,7 +6344,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) | |||
| 6169 | 6344 | ||
| 6170 | void cgroup_sk_free(struct sock_cgroup_data *skcd) | 6345 | void cgroup_sk_free(struct sock_cgroup_data *skcd) |
| 6171 | { | 6346 | { |
| 6172 | cgroup_put(sock_cgroup_ptr(skcd)); | 6347 | struct cgroup *cgrp = sock_cgroup_ptr(skcd); |
| 6348 | |||
| 6349 | cgroup_bpf_put(cgrp); | ||
| 6350 | cgroup_put(cgrp); | ||
| 6173 | } | 6351 | } |
| 6174 | 6352 | ||
| 6175 | #endif /* CONFIG_SOCK_CGROUP_DATA */ | 6353 | #endif /* CONFIG_SOCK_CGROUP_DATA */ |
| @@ -6252,7 +6430,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); | |||
| 6252 | static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, | 6430 | static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, |
| 6253 | char *buf) | 6431 | char *buf) |
| 6254 | { | 6432 | { |
| 6255 | return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); | 6433 | return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n"); |
| 6256 | } | 6434 | } |
| 6257 | static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); | 6435 | static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); |
| 6258 | 6436 | ||
| @@ -6272,4 +6450,5 @@ static int __init cgroup_sysfs_init(void) | |||
| 6272 | return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); | 6450 | return sysfs_create_group(kernel_kobj, &cgroup_sysfs_attr_group); |
| 6273 | } | 6451 | } |
| 6274 | subsys_initcall(cgroup_sysfs_init); | 6452 | subsys_initcall(cgroup_sysfs_init); |
| 6453 | |||
| 6275 | #endif /* CONFIG_SYSFS */ | 6454 | #endif /* CONFIG_SYSFS */ |
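
The cgroup_parse_float() kerneldoc above walks through parsing "12.3456" with a decimal shift of 3. Below is a minimal user-space sketch of the same fixed-point parse; parse_fixed() and pow10u() are illustrative names, not kernel API. Note that because the kernel version rounds the excess fractional digits with DIV_ROUND_CLOSEST_ULL(), the fraction .3456 rounds to .346, giving 12346 rather than the truncated 12345 quoted in the comment.

#include <stdio.h>
#include <stdint.h>

/* User-space sketch of the fixed-point parse shown above. */
static uint64_t pow10u(int p)
{
    uint64_t v = 1;

    while (p--)
        v *= 10;
    return v;
}

static int parse_fixed(const char *input, unsigned int dec_shift, long long *v)
{
    long long whole, frac = 0;
    int fstart = 0, fend = 0, flen;

    if (!sscanf(input, "%lld.%n%lld%n", &whole, &fstart, &frac, &fend))
        return -1;
    if (frac < 0)
        return -1;

    flen = fend > fstart ? fend - fstart : 0;
    if (flen < (int)dec_shift)
        frac *= pow10u(dec_shift - flen);
    else    /* round excess digits to closest, as DIV_ROUND_CLOSEST_ULL() does */
        frac = (frac + pow10u(flen - dec_shift) / 2) / pow10u(flen - dec_shift);

    *v = whole * pow10u(dec_shift) + frac;
    return 0;
}

int main(void)
{
    long long v;

    parse_fixed("12.3456", 3, &v);  /* v == 12346: .3456 rounds to .346 */
    printf("%lld\n", v);
    parse_fixed("12.3", 3, &v);     /* v == 12300: a short fraction is scaled up */
    printf("%lld\n", v);
    return 0;
}
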
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 6a1942ed781c..b3b02b9c4405 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c | |||
| @@ -729,7 +729,7 @@ static inline int nr_cpusets(void) | |||
| 729 | * load balancing domains (sched domains) as specified by that partial | 729 | * load balancing domains (sched domains) as specified by that partial |
| 730 | * partition. | 730 | * partition. |
| 731 | * | 731 | * |
| 732 | * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.txt | 732 | * See "What is sched_load_balance" in Documentation/cgroup-v1/cpusets.rst |
| 733 | * for a background explanation of this. | 733 | * for a background explanation of this. |
| 734 | * | 734 | * |
| 735 | * Does not return errors, on the theory that the callers of this | 735 | * Does not return errors, on the theory that the callers of this |
| @@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task) | |||
| 2829 | if (task_css_is_root(task, cpuset_cgrp_id)) | 2829 | if (task_css_is_root(task, cpuset_cgrp_id)) |
| 2830 | return; | 2830 | return; |
| 2831 | 2831 | ||
| 2832 | set_cpus_allowed_ptr(task, ¤t->cpus_allowed); | 2832 | set_cpus_allowed_ptr(task, current->cpus_ptr); |
| 2833 | task->mems_allowed = current->mems_allowed; | 2833 | task->mems_allowed = current->mems_allowed; |
| 2834 | } | 2834 | } |
| 2835 | 2835 | ||
| @@ -3254,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) | |||
| 3254 | spin_unlock_irqrestore(&callback_lock, flags); | 3254 | spin_unlock_irqrestore(&callback_lock, flags); |
| 3255 | } | 3255 | } |
| 3256 | 3256 | ||
| 3257 | /** | ||
| 3258 | * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. | ||
| 3259 | * @tsk: pointer to task_struct with which the scheduler is struggling | ||
| 3260 | * | ||
| 3261 | * Description: In the case that the scheduler cannot find an allowed cpu in | ||
| 3262 | * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy | ||
| 3263 | * mode however, this value is the same as task_cs(tsk)->effective_cpus, | ||
| 3264 | * which will not contain a sane cpumask during cases such as cpu hotplugging. | ||
| 3265 | * This is the absolute last resort for the scheduler and it is only used if | ||
| 3266 | * _every_ other avenue has been traveled. | ||
| 3267 | **/ | ||
| 3268 | |||
| 3257 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) | 3269 | void cpuset_cpus_allowed_fallback(struct task_struct *tsk) |
| 3258 | { | 3270 | { |
| 3259 | rcu_read_lock(); | 3271 | rcu_read_lock(); |
| 3260 | do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); | 3272 | do_set_cpus_allowed(tsk, is_in_v2_mode() ? |
| 3273 | task_cs(tsk)->cpus_allowed : cpu_possible_mask); | ||
| 3261 | rcu_read_unlock(); | 3274 | rcu_read_unlock(); |
| 3262 | 3275 | ||
| 3263 | /* | 3276 | /* |
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index c9960baaa14f..8e513a573fe9 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Process number limiting controller for cgroups. | 3 | * Process number limiting controller for cgroups. |
| 3 | * | 4 | * |
| @@ -25,10 +26,6 @@ | |||
| 25 | * a superset of parent/child/pids.current. | 26 | * a superset of parent/child/pids.current. |
| 26 | * | 27 | * |
| 27 | * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> | 28 | * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> |
| 28 | * | ||
| 29 | * This file is subject to the terms and conditions of version 2 of the GNU | ||
| 30 | * General Public License. See the file COPYING in the main directory of the | ||
| 31 | * Linux distribution for more details. | ||
| 32 | */ | 29 | */ |
| 33 | 30 | ||
| 34 | #include <linux/kernel.h> | 31 | #include <linux/kernel.h> |
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 1d75ae7f1cb7..ae042c347c64 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * RDMA resource limiting controller for cgroups. | 3 | * RDMA resource limiting controller for cgroups. |
| 3 | * | 4 | * |
| @@ -5,10 +6,6 @@ | |||
| 5 | * additional RDMA resources after a certain limit is reached. | 6 | * additional RDMA resources after a certain limit is reached. |
| 6 | * | 7 | * |
| 7 | * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> | 8 | * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> |
| 8 | * | ||
| 9 | * This file is subject to the terms and conditions of version 2 of the GNU | ||
| 10 | * General Public License. See the file COPYING in the main directory of the | ||
| 11 | * Linux distribution for more details. | ||
| 12 | */ | 9 | */ |
| 13 | 10 | ||
| 14 | #include <linux/bitops.h> | 11 | #include <linux/bitops.h> |
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index bb95a35e8c2d..ca19b4c8acf5 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | #include "cgroup-internal.h" | 2 | #include "cgroup-internal.h" |
| 2 | 3 | ||
| 3 | #include <linux/sched/cputime.h> | 4 | #include <linux/sched/cputime.h> |
diff --git a/kernel/compat.c b/kernel/compat.c index d8a36c6ad7c9..a2bc1d6ceb57 100644 --- a/kernel/compat.c +++ b/kernel/compat.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/compat.c | 3 | * linux/kernel/compat.c |
| 3 | * | 4 | * |
| @@ -5,10 +6,6 @@ | |||
| 5 | * on 64 bit kernels. | 6 | * on 64 bit kernels. |
| 6 | * | 7 | * |
| 7 | * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation | 8 | * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation |
| 8 | * | ||
| 9 | * This program is free software; you can redistribute it and/or modify | ||
| 10 | * it under the terms of the GNU General Public License version 2 as | ||
| 11 | * published by the Free Software Foundation. | ||
| 12 | */ | 9 | */ |
| 13 | 10 | ||
| 14 | #include <linux/linkage.h> | 11 | #include <linux/linkage.h> |
| @@ -346,8 +343,11 @@ get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat) | |||
| 346 | return -EFAULT; | 343 | return -EFAULT; |
| 347 | switch (_NSIG_WORDS) { | 344 | switch (_NSIG_WORDS) { |
| 348 | case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); | 345 | case 4: set->sig[3] = v.sig[6] | (((long)v.sig[7]) << 32 ); |
| 346 | /* fall through */ | ||
| 349 | case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); | 347 | case 3: set->sig[2] = v.sig[4] | (((long)v.sig[5]) << 32 ); |
| 348 | /* fall through */ | ||
| 350 | case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); | 349 | case 2: set->sig[1] = v.sig[2] | (((long)v.sig[3]) << 32 ); |
| 350 | /* fall through */ | ||
| 351 | case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); | 351 | case 1: set->sig[0] = v.sig[0] | (((long)v.sig[1]) << 32 ); |
| 352 | } | 352 | } |
| 353 | #else | 353 | #else |
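
The /* fall through */ markers added to get_compat_sigset() above are the comment form that GCC's -Wimplicit-fallthrough (as enabled for kernel builds) accepts on an intentionally unterminated case. A small stand-alone sketch of the pattern, with illustrative names, assembling 64-bit words from 32-bit halves in the same descending-case style:

#include <stdint.h>

/* Each case fills one more 64-bit word and deliberately falls into the
 * next; the comment keeps the fall-through warning quiet. */
static void assemble_words(uint64_t *dst, const uint32_t *src, int nwords)
{
    switch (nwords) {
    case 4: dst[3] = src[6] | ((uint64_t)src[7] << 32);
        /* fall through */
    case 3: dst[2] = src[4] | ((uint64_t)src[5] << 32);
        /* fall through */
    case 2: dst[1] = src[2] | ((uint64_t)src[3] << 32);
        /* fall through */
    case 1: dst[0] = src[0] | ((uint64_t)src[1] << 32);
    }
}
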
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 9ad37b9e44a7..be01a4d627c9 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Context tracking: Probe on high level context boundaries such as kernel | 3 | * Context tracking: Probe on high level context boundaries such as kernel |
| 3 | * and userspace. This includes syscalls and exceptions entry/exit. | 4 | * and userspace. This includes syscalls and exceptions entry/exit. |
diff --git a/kernel/cpu.c b/kernel/cpu.c index f2ef10460698..e84c0873559e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -522,7 +522,7 @@ static int bringup_wait_for_ap(unsigned int cpu) | |||
| 522 | /* | 522 | /* |
| 523 | * SMT soft disabling on X86 requires to bring the CPU out of the | 523 | * SMT soft disabling on X86 requires to bring the CPU out of the |
| 524 | * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The | 524 | * BIOS 'wait for SIPI' state in order to set the CR4.MCE bit. The |
| 525 | * CPU marked itself as booted_once in cpu_notify_starting() so the | 525 | * CPU marked itself as booted_once in notify_cpu_starting() so the |
| 526 | * cpu_smt_allowed() check will now return false if this is not the | 526 | * cpu_smt_allowed() check will now return false if this is not the |
| 527 | * primary sibling. | 527 | * primary sibling. |
| 528 | */ | 528 | */ |
| @@ -1221,6 +1221,13 @@ int freeze_secondary_cpus(int primary) | |||
| 1221 | for_each_online_cpu(cpu) { | 1221 | for_each_online_cpu(cpu) { |
| 1222 | if (cpu == primary) | 1222 | if (cpu == primary) |
| 1223 | continue; | 1223 | continue; |
| 1224 | |||
| 1225 | if (pm_wakeup_pending()) { | ||
| 1226 | pr_info("Wakeup pending. Abort CPU freeze\n"); | ||
| 1227 | error = -EBUSY; | ||
| 1228 | break; | ||
| 1229 | } | ||
| 1230 | |||
| 1224 | trace_suspend_resume(TPS("CPU_OFF"), cpu, true); | 1231 | trace_suspend_resume(TPS("CPU_OFF"), cpu, true); |
| 1225 | error = _cpu_down(cpu, 1, CPUHP_OFFLINE); | 1232 | error = _cpu_down(cpu, 1, CPUHP_OFFLINE); |
| 1226 | trace_suspend_resume(TPS("CPU_OFF"), cpu, false); | 1233 | trace_suspend_resume(TPS("CPU_OFF"), cpu, false); |
| @@ -1964,6 +1971,9 @@ static ssize_t write_cpuhp_fail(struct device *dev, | |||
| 1964 | if (ret) | 1971 | if (ret) |
| 1965 | return ret; | 1972 | return ret; |
| 1966 | 1973 | ||
| 1974 | if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE) | ||
| 1975 | return -EINVAL; | ||
| 1976 | |||
| 1967 | /* | 1977 | /* |
| 1968 | * Cannot fail STARTING/DYING callbacks. | 1978 | * Cannot fail STARTING/DYING callbacks. |
| 1969 | */ | 1979 | */ |
| @@ -2061,7 +2071,7 @@ static void cpuhp_online_cpu_device(unsigned int cpu) | |||
| 2061 | kobject_uevent(&dev->kobj, KOBJ_ONLINE); | 2071 | kobject_uevent(&dev->kobj, KOBJ_ONLINE); |
| 2062 | } | 2072 | } |
| 2063 | 2073 | ||
| 2064 | static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) | 2074 | int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) |
| 2065 | { | 2075 | { |
| 2066 | int cpu, ret = 0; | 2076 | int cpu, ret = 0; |
| 2067 | 2077 | ||
| @@ -2093,7 +2103,7 @@ static int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) | |||
| 2093 | return ret; | 2103 | return ret; |
| 2094 | } | 2104 | } |
| 2095 | 2105 | ||
| 2096 | static int cpuhp_smt_enable(void) | 2106 | int cpuhp_smt_enable(void) |
| 2097 | { | 2107 | { |
| 2098 | int cpu, ret = 0; | 2108 | int cpu, ret = 0; |
| 2099 | 2109 | ||
| @@ -2339,6 +2349,9 @@ static int __init mitigations_parse_cmdline(char *arg) | |||
| 2339 | cpu_mitigations = CPU_MITIGATIONS_AUTO; | 2349 | cpu_mitigations = CPU_MITIGATIONS_AUTO; |
| 2340 | else if (!strcmp(arg, "auto,nosmt")) | 2350 | else if (!strcmp(arg, "auto,nosmt")) |
| 2341 | cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; | 2351 | cpu_mitigations = CPU_MITIGATIONS_AUTO_NOSMT; |
| 2352 | else | ||
| 2353 | pr_crit("Unsupported mitigations=%s, system may still be vulnerable\n", | ||
| 2354 | arg); | ||
| 2342 | 2355 | ||
| 2343 | return 0; | 2356 | return 0; |
| 2344 | } | 2357 | } |
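
The mitigations= hunk above keeps the default mode when it sees an unknown keyword but now warns loudly instead of staying silent. A user-space sketch of that parse-and-warn pattern; the enum and function names are illustrative, not the kernel's:

#include <stdio.h>
#include <string.h>

enum mitig_mode { MITIG_OFF, MITIG_AUTO, MITIG_AUTO_NOSMT };

/* Recognize the known keywords; complain about anything else rather than
 * silently falling back to the default. */
static enum mitig_mode parse_mitigations(const char *arg, enum mitig_mode def)
{
    if (!strcmp(arg, "off"))
        return MITIG_OFF;
    if (!strcmp(arg, "auto"))
        return MITIG_AUTO;
    if (!strcmp(arg, "auto,nosmt"))
        return MITIG_AUTO_NOSMT;

    fprintf(stderr, "Unsupported mitigations=%s, keeping default\n", arg);
    return def;
}
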
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c index 67b02e138a47..cbca6879ab7d 100644 --- a/kernel/cpu_pm.c +++ b/kernel/cpu_pm.c | |||
| @@ -1,18 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Copyright (C) 2011 Google, Inc. | 3 | * Copyright (C) 2011 Google, Inc. |
| 3 | * | 4 | * |
| 4 | * Author: | 5 | * Author: |
| 5 | * Colin Cross <ccross@android.com> | 6 | * Colin Cross <ccross@android.com> |
| 6 | * | ||
| 7 | * This software is licensed under the terms of the GNU General Public | ||
| 8 | * License version 2, as published by the Free Software Foundation, and | ||
| 9 | * may be copied, distributed, and modified under those terms. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, | ||
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | * GNU General Public License for more details. | ||
| 15 | * | ||
| 16 | */ | 7 | */ |
| 17 | 8 | ||
| 18 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 093c9f917ed0..9f1557b98468 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c | |||
| @@ -1,9 +1,7 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * crash.c - kernel crash support code. | 3 | * crash.c - kernel crash support code. |
| 3 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> | 4 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> |
| 4 | * | ||
| 5 | * This source code is licensed under the GNU General Public License, | ||
| 6 | * Version 2. See the file COPYING for more details. | ||
| 7 | */ | 5 | */ |
| 8 | 6 | ||
| 9 | #include <linux/crash_core.h> | 7 | #include <linux/crash_core.h> |
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index b64e238b553b..9c23ae074b40 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | #include <linux/kernel.h> | 2 | #include <linux/kernel.h> |
| 2 | #include <linux/crash_dump.h> | 3 | #include <linux/crash_dump.h> |
| 3 | #include <linux/init.h> | 4 | #include <linux/init.h> |
diff --git a/kernel/cred.c b/kernel/cred.c index 45d77284aed0..f9a0ce66c9c3 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
| @@ -1,12 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* Task credentials management - see Documentation/security/credentials.rst | 2 | /* Task credentials management - see Documentation/security/credentials.rst |
| 2 | * | 3 | * |
| 3 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. | 4 | * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. |
| 4 | * Written by David Howells (dhowells@redhat.com) | 5 | * Written by David Howells (dhowells@redhat.com) |
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public Licence | ||
| 8 | * as published by the Free Software Foundation; either version | ||
| 9 | * 2 of the Licence, or (at your option) any later version. | ||
| 10 | */ | 6 | */ |
| 11 | #include <linux/export.h> | 7 | #include <linux/export.h> |
| 12 | #include <linux/cred.h> | 8 | #include <linux/cred.h> |
| @@ -174,6 +170,11 @@ void exit_creds(struct task_struct *tsk) | |||
| 174 | validate_creds(cred); | 170 | validate_creds(cred); |
| 175 | alter_cred_subscribers(cred, -1); | 171 | alter_cred_subscribers(cred, -1); |
| 176 | put_cred(cred); | 172 | put_cred(cred); |
| 173 | |||
| 174 | #ifdef CONFIG_KEYS_REQUEST_CACHE | ||
| 175 | key_put(current->cached_requested_key); | ||
| 176 | current->cached_requested_key = NULL; | ||
| 177 | #endif | ||
| 177 | } | 178 | } |
| 178 | 179 | ||
| 179 | /** | 180 | /** |
| @@ -327,6 +328,10 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) | |||
| 327 | struct cred *new; | 328 | struct cred *new; |
| 328 | int ret; | 329 | int ret; |
| 329 | 330 | ||
| 331 | #ifdef CONFIG_KEYS_REQUEST_CACHE | ||
| 332 | p->cached_requested_key = NULL; | ||
| 333 | #endif | ||
| 334 | |||
| 330 | if ( | 335 | if ( |
| 331 | #ifdef CONFIG_KEYS | 336 | #ifdef CONFIG_KEYS |
| 332 | !p->cred->thread_keyring && | 337 | !p->cred->thread_keyring && |
| @@ -450,14 +455,23 @@ int commit_creds(struct cred *new) | |||
| 450 | if (task->mm) | 455 | if (task->mm) |
| 451 | set_dumpable(task->mm, suid_dumpable); | 456 | set_dumpable(task->mm, suid_dumpable); |
| 452 | task->pdeath_signal = 0; | 457 | task->pdeath_signal = 0; |
| 458 | /* | ||
| 459 | * If a task drops privileges and becomes nondumpable, | ||
| 460 | * the dumpability change must become visible before | ||
| 461 | * the credential change; otherwise, a __ptrace_may_access() | ||
| 462 | * racing with this change may be able to attach to a task it | ||
| 463 | * shouldn't be able to attach to (as if the task had dropped | ||
| 464 | * privileges without becoming nondumpable). | ||
| 465 | * Pairs with a read barrier in __ptrace_may_access(). | ||
| 466 | */ | ||
| 453 | smp_wmb(); | 467 | smp_wmb(); |
| 454 | } | 468 | } |
| 455 | 469 | ||
| 456 | /* alter the thread keyring */ | 470 | /* alter the thread keyring */ |
| 457 | if (!uid_eq(new->fsuid, old->fsuid)) | 471 | if (!uid_eq(new->fsuid, old->fsuid)) |
| 458 | key_fsuid_changed(task); | 472 | key_fsuid_changed(new); |
| 459 | if (!gid_eq(new->fsgid, old->fsgid)) | 473 | if (!gid_eq(new->fsgid, old->fsgid)) |
| 460 | key_fsgid_changed(task); | 474 | key_fsgid_changed(new); |
| 461 | 475 | ||
| 462 | /* do it | 476 | /* do it |
| 463 | * RLIMIT_NPROC limits on user->processes have already been checked | 477 | * RLIMIT_NPROC limits on user->processes have already been checked |
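
The comment added to commit_creds() above documents a publication order: mark the task nondumpable first, issue smp_wmb(), then install the new credentials, so a racing __ptrace_may_access() that sees the new (less privileged) credentials is guaranteed to also see the task as nondumpable. A minimal user-space analogue of that barrier pairing using C11 fences; the struct and the cred_ok() permission check are illustrative stand-ins, not kernel API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

struct task_like {
    atomic_int dumpable;        /* 1 = dumpable, 0 = nondumpable */
    atomic_uintptr_t cred;      /* stand-in for the current credentials */
};

/* Writer: mark the task nondumpable *before* publishing the new creds. */
static void drop_privs(struct task_like *t, uintptr_t new_cred)
{
    atomic_store_explicit(&t->dumpable, 0, memory_order_relaxed);
    atomic_thread_fence(memory_order_release);   /* plays the role of smp_wmb() */
    atomic_store_explicit(&t->cred, new_cred, memory_order_relaxed);
}

/* Reader: load creds first, then dumpable, behind the pairing fence.
 * Observing the new creds therefore implies observing dumpable == 0,
 * which is exactly the window the comment above closes. */
static bool may_attach(struct task_like *t, bool (*cred_ok)(uintptr_t))
{
    uintptr_t cred = atomic_load_explicit(&t->cred, memory_order_relaxed);

    atomic_thread_fence(memory_order_acquire);   /* plays the role of smp_rmb() */
    return atomic_load_explicit(&t->dumpable, memory_order_relaxed) &&
           cred_ok(cred);
}
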
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile index a85edc339985..332ee6c6ec2c 100644 --- a/kernel/debug/Makefile +++ b/kernel/debug/Makefile | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | # | 2 | # |
| 2 | # Makefile for the linux kernel debugger | 3 | # Makefile for the linux kernel debugger |
| 3 | # | 4 | # |
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 7510dc687c0d..4b280fc7dd67 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c | |||
| @@ -1033,13 +1033,14 @@ int gdb_serial_stub(struct kgdb_state *ks) | |||
| 1033 | return DBG_PASS_EVENT; | 1033 | return DBG_PASS_EVENT; |
| 1034 | } | 1034 | } |
| 1035 | #endif | 1035 | #endif |
| 1036 | /* Fall through */ | ||
| 1036 | case 'C': /* Exception passing */ | 1037 | case 'C': /* Exception passing */ |
| 1037 | tmp = gdb_cmd_exception_pass(ks); | 1038 | tmp = gdb_cmd_exception_pass(ks); |
| 1038 | if (tmp > 0) | 1039 | if (tmp > 0) |
| 1039 | goto default_handle; | 1040 | goto default_handle; |
| 1040 | if (tmp == 0) | 1041 | if (tmp == 0) |
| 1041 | break; | 1042 | break; |
| 1042 | /* Fall through on tmp < 0 */ | 1043 | /* Fall through - on tmp < 0 */ |
| 1043 | case 'c': /* Continue packet */ | 1044 | case 'c': /* Continue packet */ |
| 1044 | case 's': /* Single step packet */ | 1045 | case 's': /* Single step packet */ |
| 1045 | if (kgdb_contthread && kgdb_contthread != current) { | 1046 | if (kgdb_contthread && kgdb_contthread != current) { |
| @@ -1048,7 +1049,7 @@ int gdb_serial_stub(struct kgdb_state *ks) | |||
| 1048 | break; | 1049 | break; |
| 1049 | } | 1050 | } |
| 1050 | dbg_activate_sw_breakpoints(); | 1051 | dbg_activate_sw_breakpoints(); |
| 1051 | /* Fall through to default processing */ | 1052 | /* Fall through - to default processing */ |
| 1052 | default: | 1053 | default: |
| 1053 | default_handle: | 1054 | default_handle: |
| 1054 | error = kgdb_arch_handle_exception(ks->ex_vector, | 1055 | error = kgdb_arch_handle_exception(ks->ex_vector, |
| @@ -1094,10 +1095,10 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd) | |||
| 1094 | return error; | 1095 | return error; |
| 1095 | case 's': | 1096 | case 's': |
| 1096 | case 'c': | 1097 | case 'c': |
| 1097 | strcpy(remcom_in_buffer, cmd); | 1098 | strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer)); |
| 1098 | return 0; | 1099 | return 0; |
| 1099 | case '$': | 1100 | case '$': |
| 1100 | strcpy(remcom_in_buffer, cmd); | 1101 | strscpy(remcom_in_buffer, cmd, sizeof(remcom_in_buffer)); |
| 1101 | gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); | 1102 | gdbstub_use_prev_in_buf = strlen(remcom_in_buffer); |
| 1102 | gdbstub_prev_in_buf_pos = 0; | 1103 | gdbstub_prev_in_buf_pos = 0; |
| 1103 | return 0; | 1104 | return 0; |
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile index d4fc58f4b88d..efac857c5511 100644 --- a/kernel/debug/kdb/Makefile +++ b/kernel/debug/kdb/Makefile | |||
| @@ -6,7 +6,6 @@ | |||
| 6 | # Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. | 6 | # Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. |
| 7 | # | 7 | # |
| 8 | 8 | ||
| 9 | CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p') | ||
| 10 | obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o | 9 | obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o |
| 11 | obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o | 10 | obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o |
| 12 | 11 | ||
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 6a4b41484afe..3a5184eb6977 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c | |||
| @@ -446,7 +446,7 @@ poll_again: | |||
| 446 | char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) | 446 | char *kdb_getstr(char *buffer, size_t bufsize, const char *prompt) |
| 447 | { | 447 | { |
| 448 | if (prompt && kdb_prompt_str != prompt) | 448 | if (prompt && kdb_prompt_str != prompt) |
| 449 | strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); | 449 | strscpy(kdb_prompt_str, prompt, CMD_BUFLEN); |
| 450 | kdb_printf(kdb_prompt_str); | 450 | kdb_printf(kdb_prompt_str); |
| 451 | kdb_nextline = 1; /* Prompt and input resets line number */ | 451 | kdb_nextline = 1; /* Prompt and input resets line number */ |
| 452 | return kdb_read(buffer, bufsize); | 452 | return kdb_read(buffer, bufsize); |
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 82a3b32a7cfc..9ecfa37c7fbf 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -2522,7 +2522,6 @@ static int kdb_summary(int argc, const char **argv) | |||
| 2522 | kdb_printf("machine %s\n", init_uts_ns.name.machine); | 2522 | kdb_printf("machine %s\n", init_uts_ns.name.machine); |
| 2523 | kdb_printf("nodename %s\n", init_uts_ns.name.nodename); | 2523 | kdb_printf("nodename %s\n", init_uts_ns.name.nodename); |
| 2524 | kdb_printf("domainname %s\n", init_uts_ns.name.domainname); | 2524 | kdb_printf("domainname %s\n", init_uts_ns.name.domainname); |
| 2525 | kdb_printf("ccversion %s\n", __stringify(CCVERSION)); | ||
| 2526 | 2525 | ||
| 2527 | now = __ktime_get_real_seconds(); | 2526 | now = __ktime_get_real_seconds(); |
| 2528 | time64_to_tm(now, 0, &tm); | 2527 | time64_to_tm(now, 0, &tm); |
| @@ -2584,7 +2583,7 @@ static int kdb_per_cpu(int argc, const char **argv) | |||
| 2584 | diag = kdbgetularg(argv[3], &whichcpu); | 2583 | diag = kdbgetularg(argv[3], &whichcpu); |
| 2585 | if (diag) | 2584 | if (diag) |
| 2586 | return diag; | 2585 | return diag; |
| 2587 | if (!cpu_online(whichcpu)) { | 2586 | if (whichcpu >= nr_cpu_ids || !cpu_online(whichcpu)) { |
| 2588 | kdb_printf("cpu %ld is not online\n", whichcpu); | 2587 | kdb_printf("cpu %ld is not online\n", whichcpu); |
| 2589 | return KDB_BADCPUNUM; | 2588 | return KDB_BADCPUNUM; |
| 2590 | } | 2589 | } |
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c index 50bf9b119bad..b8e6306e7e13 100644 --- a/kernel/debug/kdb/kdb_support.c +++ b/kernel/debug/kdb/kdb_support.c | |||
| @@ -192,7 +192,7 @@ int kallsyms_symbol_complete(char *prefix_name, int max_len) | |||
| 192 | 192 | ||
| 193 | while ((name = kdb_walk_kallsyms(&pos))) { | 193 | while ((name = kdb_walk_kallsyms(&pos))) { |
| 194 | if (strncmp(name, prefix_name, prefix_len) == 0) { | 194 | if (strncmp(name, prefix_name, prefix_len) == 0) { |
| 195 | strcpy(ks_namebuf, name); | 195 | strscpy(ks_namebuf, name, sizeof(ks_namebuf)); |
| 196 | /* Work out the longest name that matches the prefix */ | 196 | /* Work out the longest name that matches the prefix */ |
| 197 | if (++number == 1) { | 197 | if (++number == 1) { |
| 198 | prev_len = min_t(int, max_len-1, | 198 | prev_len = min_t(int, max_len-1, |
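
Several of the kdb/gdbstub hunks above convert strcpy()/strncpy() calls to strscpy(), which sizes the copy by the destination buffer, always NUL-terminates, and reports truncation. A short sketch of the calling convention, assuming <linux/string.h>, <linux/errno.h> and <linux/printk.h>; the set_prompt() helper is illustrative:

/* strscpy() copies at most dst_size - 1 bytes, always writes a terminating
 * NUL, and returns the number of bytes copied or -E2BIG when the source did
 * not fit, unlike strncpy() which may leave dst unterminated. */
static void set_prompt(char *dst, size_t dst_size, const char *src)
{
    if (strscpy(dst, src, dst_size) == -E2BIG)
        pr_warn("prompt \"%s\" truncated\n", src);
}
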
diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 2a12b988c717..27725754ac99 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c | |||
| @@ -1,16 +1,7 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* delayacct.c - per-task delay accounting | 2 | /* delayacct.c - per-task delay accounting |
| 2 | * | 3 | * |
| 3 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 | 4 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 |
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify | ||
| 6 | * it under the terms of the GNU General Public License as published by | ||
| 7 | * the Free Software Foundation; either version 2 of the License, or | ||
| 8 | * (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it would be useful, but | ||
| 11 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
| 13 | * the GNU General Public License for more details. | ||
| 14 | */ | 5 | */ |
| 15 | 6 | ||
| 16 | #include <linux/sched.h> | 7 | #include <linux/sched.h> |
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 83d711f8d665..70f8f8d9200e 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | 2 | ||
| 2 | config HAS_DMA | 3 | config HAS_DMA |
| 3 | bool | 4 | bool |
diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index b2a87905846d..bfc0c17f2a3d 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c | |||
| @@ -214,6 +214,62 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, | |||
| 214 | return cma_release(dev_get_cma_area(dev), pages, count); | 214 | return cma_release(dev_get_cma_area(dev), pages, count); |
| 215 | } | 215 | } |
| 216 | 216 | ||
| 217 | /** | ||
| 218 | * dma_alloc_contiguous() - allocate contiguous pages | ||
| 219 | * @dev: Pointer to device for which the allocation is performed. | ||
| 220 | * @size: Requested allocation size. | ||
| 221 | * @gfp: Allocation flags. | ||
| 222 | * | ||
| 223 | * This function allocates a contiguous memory buffer for the specified device. | ||
| 224 | * It first tries the device-specific contiguous memory area if available, then | ||
| 225 | * the default global one, and finally falls back to normal page allocation. | ||
| 226 | * | ||
| 227 | * Note that it bypasses single-page allocations from the global area, as the | ||
| 228 | * addresses within one page are always contiguous; there is no need to waste | ||
| 229 | * CMA pages on those, and it also helps reduce fragmentation. | ||
| 230 | */ | ||
| 231 | struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp) | ||
| 232 | { | ||
| 233 | int node = dev ? dev_to_node(dev) : NUMA_NO_NODE; | ||
| 234 | size_t count = PAGE_ALIGN(size) >> PAGE_SHIFT; | ||
| 235 | size_t align = get_order(PAGE_ALIGN(size)); | ||
| 236 | struct page *page = NULL; | ||
| 237 | struct cma *cma = NULL; | ||
| 238 | |||
| 239 | if (dev && dev->cma_area) | ||
| 240 | cma = dev->cma_area; | ||
| 241 | else if (count > 1) | ||
| 242 | cma = dma_contiguous_default_area; | ||
| 243 | |||
| 244 | /* CMA can be used only in the context which permits sleeping */ | ||
| 245 | if (cma && gfpflags_allow_blocking(gfp)) { | ||
| 246 | align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT); | ||
| 247 | page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN); | ||
| 248 | } | ||
| 249 | |||
| 250 | /* Fallback allocation of normal pages */ | ||
| 251 | if (!page) | ||
| 252 | page = alloc_pages_node(node, gfp, align); | ||
| 253 | return page; | ||
| 254 | } | ||
| 255 | |||
| 256 | /** | ||
| 257 | * dma_free_contiguous() - release allocated pages | ||
| 258 | * @dev: Pointer to device for which the pages were allocated. | ||
| 259 | * @page: Pointer to the allocated pages. | ||
| 260 | * @size: Size of allocated pages. | ||
| 261 | * | ||
| 262 | * This function releases memory allocated by dma_alloc_contiguous(). As the | ||
| 263 | * cma_release returns false when provided pages do not belong to contiguous | ||
| 264 | * area and true otherwise, this function then does a fallback __free_pages() | ||
| 265 | * upon a false-return. | ||
| 266 | */ | ||
| 267 | void dma_free_contiguous(struct device *dev, struct page *page, size_t size) | ||
| 268 | { | ||
| 269 | if (!cma_release(dev_get_cma_area(dev), page, size >> PAGE_SHIFT)) | ||
| 270 | __free_pages(page, get_order(size)); | ||
| 271 | } | ||
| 272 | |||
| 217 | /* | 273 | /* |
| 218 | * Support for reserved memory regions defined in device tree | 274 | * Support for reserved memory regions defined in device tree |
| 219 | */ | 275 | */ |
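
The new dma_alloc_contiguous()/dma_free_contiguous() helpers centralize the "per-device CMA, else default CMA for multi-page requests, else plain pages" policy that dma-direct previously open-coded (see the kernel/dma/direct.c hunks below). A hedged sketch of a caller, assuming <linux/dma-contiguous.h>; demo_alloc() is illustrative:

/* The two-page request may come from CMA when the GFP flags allow blocking;
 * the single-page request always takes the plain alloc_pages_node() path,
 * since one page is trivially contiguous. */
static int demo_alloc(struct device *dev)
{
    struct page *big = dma_alloc_contiguous(dev, 2 * PAGE_SIZE, GFP_KERNEL);
    struct page *one = dma_alloc_contiguous(dev, PAGE_SIZE, GFP_KERNEL);
    int ret = (big && one) ? 0 : -ENOMEM;

    /* ... a real caller would map and use the pages here ... */

    /* dma_free_contiguous() handles both cases: cma_release() returns false
     * for non-CMA pages and the helper then uses __free_pages(). */
    if (one)
        dma_free_contiguous(dev, one, PAGE_SIZE);
    if (big)
        dma_free_contiguous(dev, big, 2 * PAGE_SIZE);
    return ret;
}
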
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c index badd77670d00..099002d84f46 100644 --- a/kernel/dma/debug.c +++ b/kernel/dma/debug.c | |||
| @@ -1,20 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Copyright (C) 2008 Advanced Micro Devices, Inc. | 3 | * Copyright (C) 2008 Advanced Micro Devices, Inc. |
| 3 | * | 4 | * |
| 4 | * Author: Joerg Roedel <joerg.roedel@amd.com> | 5 | * Author: Joerg Roedel <joerg.roedel@amd.com> |
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify it | ||
| 7 | * under the terms of the GNU General Public License version 2 as published | ||
| 8 | * by the Free Software Foundation. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 13 | * GNU General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License | ||
| 16 | * along with this program; if not, write to the Free Software | ||
| 17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 18 | */ | 6 | */ |
| 19 | 7 | ||
| 20 | #define pr_fmt(fmt) "DMA-API: " fmt | 8 | #define pr_fmt(fmt) "DMA-API: " fmt |
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 2c2772e9702a..b90e1aede743 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c | |||
| @@ -96,8 +96,6 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size) | |||
| 96 | struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, | 96 | struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, |
| 97 | dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) | 97 | dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) |
| 98 | { | 98 | { |
| 99 | unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; | ||
| 100 | int page_order = get_order(size); | ||
| 101 | struct page *page = NULL; | 99 | struct page *page = NULL; |
| 102 | u64 phys_mask; | 100 | u64 phys_mask; |
| 103 | 101 | ||
| @@ -109,20 +107,9 @@ struct page *__dma_direct_alloc_pages(struct device *dev, size_t size, | |||
| 109 | gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, | 107 | gfp |= __dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask, |
| 110 | &phys_mask); | 108 | &phys_mask); |
| 111 | again: | 109 | again: |
| 112 | /* CMA can be used only in the context which permits sleeping */ | 110 | page = dma_alloc_contiguous(dev, size, gfp); |
| 113 | if (gfpflags_allow_blocking(gfp)) { | ||
| 114 | page = dma_alloc_from_contiguous(dev, count, page_order, | ||
| 115 | gfp & __GFP_NOWARN); | ||
| 116 | if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { | ||
| 117 | dma_release_from_contiguous(dev, page, count); | ||
| 118 | page = NULL; | ||
| 119 | } | ||
| 120 | } | ||
| 121 | if (!page) | ||
| 122 | page = alloc_pages_node(dev_to_node(dev), gfp, page_order); | ||
| 123 | |||
| 124 | if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { | 111 | if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) { |
| 125 | __free_pages(page, page_order); | 112 | dma_free_contiguous(dev, page, size); |
| 126 | page = NULL; | 113 | page = NULL; |
| 127 | 114 | ||
| 128 | if (IS_ENABLED(CONFIG_ZONE_DMA32) && | 115 | if (IS_ENABLED(CONFIG_ZONE_DMA32) && |
| @@ -151,10 +138,18 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, | |||
| 151 | if (!page) | 138 | if (!page) |
| 152 | return NULL; | 139 | return NULL; |
| 153 | 140 | ||
| 141 | if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { | ||
| 142 | /* remove any dirty cache lines on the kernel alias */ | ||
| 143 | if (!PageHighMem(page)) | ||
| 144 | arch_dma_prep_coherent(page, size); | ||
| 145 | /* return the page pointer as the opaque cookie */ | ||
| 146 | return page; | ||
| 147 | } | ||
| 148 | |||
| 154 | if (PageHighMem(page)) { | 149 | if (PageHighMem(page)) { |
| 155 | /* | 150 | /* |
| 156 | * Depending on the cma= arguments and per-arch setup | 151 | * Depending on the cma= arguments and per-arch setup |
| 157 | * dma_alloc_from_contiguous could return highmem pages. | 152 | * dma_alloc_contiguous could return highmem pages. |
| 158 | * Without remapping there is no way to return them here, | 153 | * Without remapping there is no way to return them here, |
| 159 | * so log an error and fail. | 154 | * so log an error and fail. |
| 160 | */ | 155 | */ |
| @@ -171,15 +166,19 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size, | |||
| 171 | *dma_handle = phys_to_dma(dev, page_to_phys(page)); | 166 | *dma_handle = phys_to_dma(dev, page_to_phys(page)); |
| 172 | } | 167 | } |
| 173 | memset(ret, 0, size); | 168 | memset(ret, 0, size); |
| 169 | |||
| 170 | if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && | ||
| 171 | dma_alloc_need_uncached(dev, attrs)) { | ||
| 172 | arch_dma_prep_coherent(page, size); | ||
| 173 | ret = uncached_kernel_address(ret); | ||
| 174 | } | ||
| 175 | |||
| 174 | return ret; | 176 | return ret; |
| 175 | } | 177 | } |
| 176 | 178 | ||
| 177 | void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) | 179 | void __dma_direct_free_pages(struct device *dev, size_t size, struct page *page) |
| 178 | { | 180 | { |
| 179 | unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; | 181 | dma_free_contiguous(dev, page, size); |
| 180 | |||
| 181 | if (!dma_release_from_contiguous(dev, page, count)) | ||
| 182 | __free_pages(page, get_order(size)); | ||
| 183 | } | 182 | } |
| 184 | 183 | ||
| 185 | void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, | 184 | void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, |
| @@ -187,15 +186,26 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr, | |||
| 187 | { | 186 | { |
| 188 | unsigned int page_order = get_order(size); | 187 | unsigned int page_order = get_order(size); |
| 189 | 188 | ||
| 189 | if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { | ||
| 190 | /* cpu_addr is a struct page cookie, not a kernel address */ | ||
| 191 | __dma_direct_free_pages(dev, size, cpu_addr); | ||
| 192 | return; | ||
| 193 | } | ||
| 194 | |||
| 190 | if (force_dma_unencrypted()) | 195 | if (force_dma_unencrypted()) |
| 191 | set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); | 196 | set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order); |
| 197 | |||
| 198 | if (IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && | ||
| 199 | dma_alloc_need_uncached(dev, attrs)) | ||
| 200 | cpu_addr = cached_kernel_address(cpu_addr); | ||
| 192 | __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); | 201 | __dma_direct_free_pages(dev, size, virt_to_page(cpu_addr)); |
| 193 | } | 202 | } |
| 194 | 203 | ||
| 195 | void *dma_direct_alloc(struct device *dev, size_t size, | 204 | void *dma_direct_alloc(struct device *dev, size_t size, |
| 196 | dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) | 205 | dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) |
| 197 | { | 206 | { |
| 198 | if (!dev_is_dma_coherent(dev)) | 207 | if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && |
| 208 | dma_alloc_need_uncached(dev, attrs)) | ||
| 199 | return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); | 209 | return arch_dma_alloc(dev, size, dma_handle, gfp, attrs); |
| 200 | return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); | 210 | return dma_direct_alloc_pages(dev, size, dma_handle, gfp, attrs); |
| 201 | } | 211 | } |
| @@ -203,7 +213,8 @@ void *dma_direct_alloc(struct device *dev, size_t size, | |||
| 203 | void dma_direct_free(struct device *dev, size_t size, | 213 | void dma_direct_free(struct device *dev, size_t size, |
| 204 | void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) | 214 | void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) |
| 205 | { | 215 | { |
| 206 | if (!dev_is_dma_coherent(dev)) | 216 | if (!IS_ENABLED(CONFIG_ARCH_HAS_UNCACHED_SEGMENT) && |
| 217 | dma_alloc_need_uncached(dev, attrs)) | ||
| 207 | arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); | 218 | arch_dma_free(dev, size, cpu_addr, dma_addr, attrs); |
| 208 | else | 219 | else |
| 209 | dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); | 220 | dma_direct_free_pages(dev, size, cpu_addr, dma_addr, attrs); |
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index f7afdadb6770..1f628e7ac709 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c | |||
| @@ -317,6 +317,12 @@ void arch_dma_set_mask(struct device *dev, u64 mask); | |||
| 317 | 317 | ||
| 318 | int dma_set_mask(struct device *dev, u64 mask) | 318 | int dma_set_mask(struct device *dev, u64 mask) |
| 319 | { | 319 | { |
| 320 | /* | ||
| 321 | * Truncate the mask to the actually supported dma_addr_t width to | ||
| 322 | * avoid generating unsupportable addresses. | ||
| 323 | */ | ||
| 324 | mask = (dma_addr_t)mask; | ||
| 325 | |||
| 320 | if (!dev->dma_mask || !dma_supported(dev, mask)) | 326 | if (!dev->dma_mask || !dma_supported(dev, mask)) |
| 321 | return -EIO; | 327 | return -EIO; |
| 322 | 328 | ||
| @@ -330,6 +336,12 @@ EXPORT_SYMBOL(dma_set_mask); | |||
| 330 | #ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK | 336 | #ifndef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK |
| 331 | int dma_set_coherent_mask(struct device *dev, u64 mask) | 337 | int dma_set_coherent_mask(struct device *dev, u64 mask) |
| 332 | { | 338 | { |
| 339 | /* | ||
| 340 | * Truncate the mask to the actually supported dma_addr_t width to | ||
| 341 | * avoid generating unsupportable addresses. | ||
| 342 | */ | ||
| 343 | mask = (dma_addr_t)mask; | ||
| 344 | |||
| 333 | if (!dma_supported(dev, mask)) | 345 | if (!dma_supported(dev, mask)) |
| 334 | return -EIO; | 346 | return -EIO; |
| 335 | 347 | ||
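
The dma_set_mask()/dma_set_coherent_mask() hunks above clip the caller's mask to the width of dma_addr_t before the dma_supported() check. A sketch of what that means for a driver, assuming <linux/dma-mapping.h> and a platform where dma_addr_t is 32 bits wide; mydev_init_dma() is illustrative:

/* The 64-bit request is truncated to (dma_addr_t)DMA_BIT_MASK(64), i.e.
 * 0xffffffff, inside the mask setters before dma_supported() runs, so the
 * mapping layer never generates addresses the bus type cannot represent. */
static int mydev_init_dma(struct device *dev)
{
    return dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
}
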
diff --git a/kernel/dma/remap.c b/kernel/dma/remap.c index 7a723194ecbe..a594aec07882 100644 --- a/kernel/dma/remap.c +++ b/kernel/dma/remap.c | |||
| @@ -158,6 +158,9 @@ out: | |||
| 158 | 158 | ||
| 159 | bool dma_in_atomic_pool(void *start, size_t size) | 159 | bool dma_in_atomic_pool(void *start, size_t size) |
| 160 | { | 160 | { |
| 161 | if (unlikely(!atomic_pool)) | ||
| 162 | return false; | ||
| 163 | |||
| 161 | return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); | 164 | return addr_in_gen_pool(atomic_pool, (unsigned long)start, size); |
| 162 | } | 165 | } |
| 163 | 166 | ||
| @@ -199,8 +202,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, | |||
| 199 | 202 | ||
| 200 | size = PAGE_ALIGN(size); | 203 | size = PAGE_ALIGN(size); |
| 201 | 204 | ||
| 202 | if (!gfpflags_allow_blocking(flags) && | 205 | if (!gfpflags_allow_blocking(flags)) { |
| 203 | !(attrs & DMA_ATTR_NO_KERNEL_MAPPING)) { | ||
| 204 | ret = dma_alloc_from_pool(size, &page, flags); | 206 | ret = dma_alloc_from_pool(size, &page, flags); |
| 205 | if (!ret) | 207 | if (!ret) |
| 206 | return NULL; | 208 | return NULL; |
| @@ -214,11 +216,6 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, | |||
| 214 | /* remove any dirty cache lines on the kernel alias */ | 216 | /* remove any dirty cache lines on the kernel alias */ |
| 215 | arch_dma_prep_coherent(page, size); | 217 | arch_dma_prep_coherent(page, size); |
| 216 | 218 | ||
| 217 | if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { | ||
| 218 | ret = page; /* opaque cookie */ | ||
| 219 | goto done; | ||
| 220 | } | ||
| 221 | |||
| 222 | /* create a coherent mapping */ | 219 | /* create a coherent mapping */ |
| 223 | ret = dma_common_contiguous_remap(page, size, VM_USERMAP, | 220 | ret = dma_common_contiguous_remap(page, size, VM_USERMAP, |
| 224 | arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), | 221 | arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs), |
| @@ -237,10 +234,7 @@ done: | |||
| 237 | void arch_dma_free(struct device *dev, size_t size, void *vaddr, | 234 | void arch_dma_free(struct device *dev, size_t size, void *vaddr, |
| 238 | dma_addr_t dma_handle, unsigned long attrs) | 235 | dma_addr_t dma_handle, unsigned long attrs) |
| 239 | { | 236 | { |
| 240 | if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) { | 237 | if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { |
| 241 | /* vaddr is a struct page cookie, not a kernel address */ | ||
| 242 | __dma_direct_free_pages(dev, size, vaddr); | ||
| 243 | } else if (!dma_free_from_pool(vaddr, PAGE_ALIGN(size))) { | ||
| 244 | phys_addr_t phys = dma_to_phys(dev, dma_handle); | 238 | phys_addr_t phys = dma_to_phys(dev, dma_handle); |
| 245 | struct page *page = pfn_to_page(__phys_to_pfn(phys)); | 239 | struct page *page = pfn_to_page(__phys_to_pfn(phys)); |
| 246 | 240 | ||
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 6f7619c1f877..62fa5a82a065 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Dynamic DMA mapping support. | 3 | * Dynamic DMA mapping support. |
| 3 | * | 4 | * |
| @@ -695,29 +696,12 @@ bool is_swiotlb_active(void) | |||
| 695 | 696 | ||
| 696 | static int __init swiotlb_create_debugfs(void) | 697 | static int __init swiotlb_create_debugfs(void) |
| 697 | { | 698 | { |
| 698 | struct dentry *d_swiotlb_usage; | 699 | struct dentry *root; |
| 699 | struct dentry *ent; | ||
| 700 | |||
| 701 | d_swiotlb_usage = debugfs_create_dir("swiotlb", NULL); | ||
| 702 | |||
| 703 | if (!d_swiotlb_usage) | ||
| 704 | return -ENOMEM; | ||
| 705 | |||
| 706 | ent = debugfs_create_ulong("io_tlb_nslabs", 0400, | ||
| 707 | d_swiotlb_usage, &io_tlb_nslabs); | ||
| 708 | if (!ent) | ||
| 709 | goto fail; | ||
| 710 | |||
| 711 | ent = debugfs_create_ulong("io_tlb_used", 0400, | ||
| 712 | d_swiotlb_usage, &io_tlb_used); | ||
| 713 | if (!ent) | ||
| 714 | goto fail; | ||
| 715 | 700 | ||
| 701 | root = debugfs_create_dir("swiotlb", NULL); | ||
| 702 | debugfs_create_ulong("io_tlb_nslabs", 0400, root, &io_tlb_nslabs); | ||
| 703 | debugfs_create_ulong("io_tlb_used", 0400, root, &io_tlb_used); | ||
| 716 | return 0; | 704 | return 0; |
| 717 | |||
| 718 | fail: | ||
| 719 | debugfs_remove_recursive(d_swiotlb_usage); | ||
| 720 | return -ENOMEM; | ||
| 721 | } | 705 | } |
| 722 | 706 | ||
| 723 | late_initcall(swiotlb_create_debugfs); | 707 | late_initcall(swiotlb_create_debugfs); |
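
The swiotlb hunk above drops all of the error handling around debugfs creation: the debugfs_create_*() helpers accept an error-pointer parent and simply do nothing on failure, so callers are expected to just create their entries. A sketch of the same minimal pattern for a hypothetical subsystem counter, assuming <linux/debugfs.h> and <linux/init.h>:

static unsigned long my_counter;

/* No NULL/IS_ERR checks needed; a failed debugfs_create_dir() yields an
 * error pointer that the file-creation helpers quietly tolerate. */
static int __init my_debugfs_init(void)
{
    struct dentry *root = debugfs_create_dir("my_subsys", NULL);

    debugfs_create_ulong("counter", 0400, root, &my_counter);
    return 0;
}
late_initcall(my_debugfs_init);
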
diff --git a/kernel/events/core.c b/kernel/events/core.c index abbd4b3b96c2..785d708f8553 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -2952,6 +2952,12 @@ static void ctx_sched_out(struct perf_event_context *ctx, | |||
| 2952 | if (!ctx->nr_active || !(is_active & EVENT_ALL)) | 2952 | if (!ctx->nr_active || !(is_active & EVENT_ALL)) |
| 2953 | return; | 2953 | return; |
| 2954 | 2954 | ||
| 2955 | /* | ||
| 2956 | * If we had been multiplexing, no rotations are necessary now that no | ||
| 2957 | * events are active. | ||
| 2958 | */ | ||
| 2959 | ctx->rotate_necessary = 0; | ||
| 2960 | |||
| 2955 | perf_pmu_disable(ctx->pmu); | 2961 | perf_pmu_disable(ctx->pmu); |
| 2956 | if (is_active & EVENT_PINNED) { | 2962 | if (is_active & EVENT_PINNED) { |
| 2957 | list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) | 2963 | list_for_each_entry_safe(event, tmp, &ctx->pinned_active, active_list) |
| @@ -3319,10 +3325,13 @@ static int flexible_sched_in(struct perf_event *event, void *data) | |||
| 3319 | return 0; | 3325 | return 0; |
| 3320 | 3326 | ||
| 3321 | if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { | 3327 | if (group_can_go_on(event, sid->cpuctx, sid->can_add_hw)) { |
| 3322 | if (!group_sched_in(event, sid->cpuctx, sid->ctx)) | 3328 | int ret = group_sched_in(event, sid->cpuctx, sid->ctx); |
| 3323 | list_add_tail(&event->active_list, &sid->ctx->flexible_active); | 3329 | if (ret) { |
| 3324 | else | ||
| 3325 | sid->can_add_hw = 0; | 3330 | sid->can_add_hw = 0; |
| 3331 | sid->ctx->rotate_necessary = 1; | ||
| 3332 | return 0; | ||
| 3333 | } | ||
| 3334 | list_add_tail(&event->active_list, &sid->ctx->flexible_active); | ||
| 3326 | } | 3335 | } |
| 3327 | 3336 | ||
| 3328 | return 0; | 3337 | return 0; |
| @@ -3690,24 +3699,17 @@ ctx_first_active(struct perf_event_context *ctx) | |||
| 3690 | static bool perf_rotate_context(struct perf_cpu_context *cpuctx) | 3699 | static bool perf_rotate_context(struct perf_cpu_context *cpuctx) |
| 3691 | { | 3700 | { |
| 3692 | struct perf_event *cpu_event = NULL, *task_event = NULL; | 3701 | struct perf_event *cpu_event = NULL, *task_event = NULL; |
| 3693 | bool cpu_rotate = false, task_rotate = false; | 3702 | struct perf_event_context *task_ctx = NULL; |
| 3694 | struct perf_event_context *ctx = NULL; | 3703 | int cpu_rotate, task_rotate; |
| 3695 | 3704 | ||
| 3696 | /* | 3705 | /* |
| 3697 | * Since we run this from IRQ context, nobody can install new | 3706 | * Since we run this from IRQ context, nobody can install new |
| 3698 | * events, thus the event count values are stable. | 3707 | * events, thus the event count values are stable. |
| 3699 | */ | 3708 | */ |
| 3700 | 3709 | ||
| 3701 | if (cpuctx->ctx.nr_events) { | 3710 | cpu_rotate = cpuctx->ctx.rotate_necessary; |
| 3702 | if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) | 3711 | task_ctx = cpuctx->task_ctx; |
| 3703 | cpu_rotate = true; | 3712 | task_rotate = task_ctx ? task_ctx->rotate_necessary : 0; |
| 3704 | } | ||
| 3705 | |||
| 3706 | ctx = cpuctx->task_ctx; | ||
| 3707 | if (ctx && ctx->nr_events) { | ||
| 3708 | if (ctx->nr_events != ctx->nr_active) | ||
| 3709 | task_rotate = true; | ||
| 3710 | } | ||
| 3711 | 3713 | ||
| 3712 | if (!(cpu_rotate || task_rotate)) | 3714 | if (!(cpu_rotate || task_rotate)) |
| 3713 | return false; | 3715 | return false; |
| @@ -3716,7 +3718,7 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
| 3716 | perf_pmu_disable(cpuctx->ctx.pmu); | 3718 | perf_pmu_disable(cpuctx->ctx.pmu); |
| 3717 | 3719 | ||
| 3718 | if (task_rotate) | 3720 | if (task_rotate) |
| 3719 | task_event = ctx_first_active(ctx); | 3721 | task_event = ctx_first_active(task_ctx); |
| 3720 | if (cpu_rotate) | 3722 | if (cpu_rotate) |
| 3721 | cpu_event = ctx_first_active(&cpuctx->ctx); | 3723 | cpu_event = ctx_first_active(&cpuctx->ctx); |
| 3722 | 3724 | ||
| @@ -3724,17 +3726,17 @@ static bool perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
| 3724 | * As per the order given at ctx_resched() first 'pop' task flexible | 3726 | * As per the order given at ctx_resched() first 'pop' task flexible |
| 3725 | * and then, if needed CPU flexible. | 3727 | * and then, if needed CPU flexible. |
| 3726 | */ | 3728 | */ |
| 3727 | if (task_event || (ctx && cpu_event)) | 3729 | if (task_event || (task_ctx && cpu_event)) |
| 3728 | ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); | 3730 | ctx_sched_out(task_ctx, cpuctx, EVENT_FLEXIBLE); |
| 3729 | if (cpu_event) | 3731 | if (cpu_event) |
| 3730 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); | 3732 | cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); |
| 3731 | 3733 | ||
| 3732 | if (task_event) | 3734 | if (task_event) |
| 3733 | rotate_ctx(ctx, task_event); | 3735 | rotate_ctx(task_ctx, task_event); |
| 3734 | if (cpu_event) | 3736 | if (cpu_event) |
| 3735 | rotate_ctx(&cpuctx->ctx, cpu_event); | 3737 | rotate_ctx(&cpuctx->ctx, cpu_event); |
| 3736 | 3738 | ||
| 3737 | perf_event_sched_in(cpuctx, ctx, current); | 3739 | perf_event_sched_in(cpuctx, task_ctx, current); |
| 3738 | 3740 | ||
| 3739 | perf_pmu_enable(cpuctx->ctx.pmu); | 3741 | perf_pmu_enable(cpuctx->ctx.pmu); |
| 3740 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); | 3742 | perf_ctx_unlock(cpuctx, cpuctx->task_ctx); |
| @@ -5005,6 +5007,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) | |||
| 5005 | if (perf_event_check_period(event, value)) | 5007 | if (perf_event_check_period(event, value)) |
| 5006 | return -EINVAL; | 5008 | return -EINVAL; |
| 5007 | 5009 | ||
| 5010 | if (!event->attr.freq && (value & (1ULL << 63))) | ||
| 5011 | return -EINVAL; | ||
| 5012 | |||
| 5008 | event_function_call(event, __perf_event_period, &value); | 5013 | event_function_call(event, __perf_event_period, &value); |
| 5009 | 5014 | ||
| 5010 | return 0; | 5015 | return 0; |
| @@ -5923,7 +5928,7 @@ static void perf_sample_regs_user(struct perf_regs *regs_user, | |||
| 5923 | if (user_mode(regs)) { | 5928 | if (user_mode(regs)) { |
| 5924 | regs_user->abi = perf_reg_abi(current); | 5929 | regs_user->abi = perf_reg_abi(current); |
| 5925 | regs_user->regs = regs; | 5930 | regs_user->regs = regs; |
| 5926 | } else if (current->mm) { | 5931 | } else if (!(current->flags & PF_KTHREAD)) { |
| 5927 | perf_get_regs_user(regs_user, regs, regs_user_copy); | 5932 | perf_get_regs_user(regs_user, regs, regs_user_copy); |
| 5928 | } else { | 5933 | } else { |
| 5929 | regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; | 5934 | regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; |
| @@ -8532,9 +8537,9 @@ static int perf_tp_event_match(struct perf_event *event, | |||
| 8532 | if (event->hw.state & PERF_HES_STOPPED) | 8537 | if (event->hw.state & PERF_HES_STOPPED) |
| 8533 | return 0; | 8538 | return 0; |
| 8534 | /* | 8539 | /* |
| 8535 | * All tracepoints are from kernel-space. | 8540 | * If exclude_kernel, only trace user-space tracepoints (uprobes) |
| 8536 | */ | 8541 | */ |
| 8537 | if (event->attr.exclude_kernel) | 8542 | if (event->attr.exclude_kernel && !user_mode(regs)) |
| 8538 | return 0; | 8543 | return 0; |
| 8539 | 8544 | ||
| 8540 | if (!perf_tp_filter_match(event, data)) | 8545 | if (!perf_tp_filter_match(event, data)) |
| @@ -9874,6 +9879,12 @@ static int pmu_dev_alloc(struct pmu *pmu) | |||
| 9874 | if (ret) | 9879 | if (ret) |
| 9875 | goto del_dev; | 9880 | goto del_dev; |
| 9876 | 9881 | ||
| 9882 | if (pmu->attr_update) | ||
| 9883 | ret = sysfs_update_groups(&pmu->dev->kobj, pmu->attr_update); | ||
| 9884 | |||
| 9885 | if (ret) | ||
| 9886 | goto del_dev; | ||
| 9887 | |||
| 9877 | out: | 9888 | out: |
| 9878 | return ret; | 9889 | return ret; |
| 9879 | 9890 | ||
| @@ -10033,6 +10044,12 @@ void perf_pmu_unregister(struct pmu *pmu) | |||
| 10033 | } | 10044 | } |
| 10034 | EXPORT_SYMBOL_GPL(perf_pmu_unregister); | 10045 | EXPORT_SYMBOL_GPL(perf_pmu_unregister); |
| 10035 | 10046 | ||
| 10047 | static inline bool has_extended_regs(struct perf_event *event) | ||
| 10048 | { | ||
| 10049 | return (event->attr.sample_regs_user & PERF_REG_EXTENDED_MASK) || | ||
| 10050 | (event->attr.sample_regs_intr & PERF_REG_EXTENDED_MASK); | ||
| 10051 | } | ||
| 10052 | |||
| 10036 | static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | 10053 | static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) |
| 10037 | { | 10054 | { |
| 10038 | struct perf_event_context *ctx = NULL; | 10055 | struct perf_event_context *ctx = NULL; |
| @@ -10064,12 +10081,16 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) | |||
| 10064 | perf_event_ctx_unlock(event->group_leader, ctx); | 10081 | perf_event_ctx_unlock(event->group_leader, ctx); |
| 10065 | 10082 | ||
| 10066 | if (!ret) { | 10083 | if (!ret) { |
| 10084 | if (!(pmu->capabilities & PERF_PMU_CAP_EXTENDED_REGS) && | ||
| 10085 | has_extended_regs(event)) | ||
| 10086 | ret = -EOPNOTSUPP; | ||
| 10087 | |||
| 10067 | if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && | 10088 | if (pmu->capabilities & PERF_PMU_CAP_NO_EXCLUDE && |
| 10068 | event_has_any_exclude_flag(event)) { | 10089 | event_has_any_exclude_flag(event)) |
| 10069 | if (event->destroy) | ||
| 10070 | event->destroy(event); | ||
| 10071 | ret = -EINVAL; | 10090 | ret = -EINVAL; |
| 10072 | } | 10091 | |
| 10092 | if (ret && event->destroy) | ||
| 10093 | event->destroy(event); | ||
| 10073 | } | 10094 | } |
| 10074 | 10095 | ||
| 10075 | if (ret) | 10096 | if (ret) |
| @@ -10680,11 +10701,11 @@ static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) | |||
| 10680 | break; | 10701 | break; |
| 10681 | 10702 | ||
| 10682 | case CLOCK_BOOTTIME: | 10703 | case CLOCK_BOOTTIME: |
| 10683 | event->clock = &ktime_get_boot_ns; | 10704 | event->clock = &ktime_get_boottime_ns; |
| 10684 | break; | 10705 | break; |
| 10685 | 10706 | ||
| 10686 | case CLOCK_TAI: | 10707 | case CLOCK_TAI: |
| 10687 | event->clock = &ktime_get_tai_ns; | 10708 | event->clock = &ktime_get_clocktai_ns; |
| 10688 | break; | 10709 | break; |
| 10689 | 10710 | ||
| 10690 | default: | 10711 | default: |
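
The perf_event_period() hunk above makes the PERF_EVENT_IOC_PERIOD path reject a period value with bit 63 set unless the event runs in freq mode. A minimal userspace sketch of the interface this guards, assuming an already-opened perf event fd; the helper name is illustrative, only the ioctl itself is the real ABI:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/perf_event.h>

    /* Update the sample period of an open perf event fd.  With the
     * check above, a non-freq event now gets -EINVAL for any value
     * that has bit 63 set. */
    static int set_sample_period(int perf_fd, uint64_t period)
    {
            return ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &period);
    }
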
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 79c47076700a..3aef4191798c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
| @@ -24,7 +24,7 @@ struct ring_buffer { | |||
| 24 | atomic_t poll; /* POLL_ for wakeups */ | 24 | atomic_t poll; /* POLL_ for wakeups */ |
| 25 | 25 | ||
| 26 | local_t head; /* write position */ | 26 | local_t head; /* write position */ |
| 27 | local_t nest; /* nested writers */ | 27 | unsigned int nest; /* nested writers */ |
| 28 | local_t events; /* event limit */ | 28 | local_t events; /* event limit */ |
| 29 | local_t wakeup; /* wakeup stamp */ | 29 | local_t wakeup; /* wakeup stamp */ |
| 30 | local_t lost; /* nr records lost */ | 30 | local_t lost; /* nr records lost */ |
| @@ -41,7 +41,7 @@ struct ring_buffer { | |||
| 41 | 41 | ||
| 42 | /* AUX area */ | 42 | /* AUX area */ |
| 43 | long aux_head; | 43 | long aux_head; |
| 44 | local_t aux_nest; | 44 | unsigned int aux_nest; |
| 45 | long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ | 45 | long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */ |
| 46 | unsigned long aux_pgoff; | 46 | unsigned long aux_pgoff; |
| 47 | int aux_nr_pages; | 47 | int aux_nr_pages; |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 674b35383491..ffb59a4ef4ff 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
| @@ -38,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle) | |||
| 38 | struct ring_buffer *rb = handle->rb; | 38 | struct ring_buffer *rb = handle->rb; |
| 39 | 39 | ||
| 40 | preempt_disable(); | 40 | preempt_disable(); |
| 41 | local_inc(&rb->nest); | 41 | |
| 42 | /* | ||
| 43 | * Avoid an explicit LOAD/STORE such that architectures with memops | ||
| 44 | * can use them. | ||
| 45 | */ | ||
| 46 | (*(volatile unsigned int *)&rb->nest)++; | ||
| 42 | handle->wakeup = local_read(&rb->wakeup); | 47 | handle->wakeup = local_read(&rb->wakeup); |
| 43 | } | 48 | } |
| 44 | 49 | ||
| @@ -46,17 +51,35 @@ static void perf_output_put_handle(struct perf_output_handle *handle) | |||
| 46 | { | 51 | { |
| 47 | struct ring_buffer *rb = handle->rb; | 52 | struct ring_buffer *rb = handle->rb; |
| 48 | unsigned long head; | 53 | unsigned long head; |
| 54 | unsigned int nest; | ||
| 55 | |||
| 56 | /* | ||
| 57 | * If this isn't the outermost nesting, we don't have to update | ||
| 58 | * @rb->user_page->data_head. | ||
| 59 | */ | ||
| 60 | nest = READ_ONCE(rb->nest); | ||
| 61 | if (nest > 1) { | ||
| 62 | WRITE_ONCE(rb->nest, nest - 1); | ||
| 63 | goto out; | ||
| 64 | } | ||
| 49 | 65 | ||
| 50 | again: | 66 | again: |
| 67 | /* | ||
| 68 | * In order to avoid publishing a head value that goes backwards, | ||
| 69 | * we must ensure the load of @rb->head happens after we've | ||
| 70 | * incremented @rb->nest. | ||
| 71 | * | ||
| 72 | * Otherwise we can observe a @rb->head value before one published | ||
| 73 | * by an IRQ/NMI happening between the load and the increment. | ||
| 74 | */ | ||
| 75 | barrier(); | ||
| 51 | head = local_read(&rb->head); | 76 | head = local_read(&rb->head); |
| 52 | 77 | ||
| 53 | /* | 78 | /* |
| 54 | * IRQ/NMI can happen here, which means we can miss a head update. | 79 | * IRQ/NMI can happen here and advance @rb->head, causing our |
| 80 | * load above to be stale. | ||
| 55 | */ | 81 | */ |
| 56 | 82 | ||
| 57 | if (!local_dec_and_test(&rb->nest)) | ||
| 58 | goto out; | ||
| 59 | |||
| 60 | /* | 83 | /* |
| 61 | * Since the mmap() consumer (userspace) can run on a different CPU: | 84 | * Since the mmap() consumer (userspace) can run on a different CPU: |
| 62 | * | 85 | * |
| @@ -84,14 +107,23 @@ again: | |||
| 84 | * See perf_output_begin(). | 107 | * See perf_output_begin(). |
| 85 | */ | 108 | */ |
| 86 | smp_wmb(); /* B, matches C */ | 109 | smp_wmb(); /* B, matches C */ |
| 87 | rb->user_page->data_head = head; | 110 | WRITE_ONCE(rb->user_page->data_head, head); |
| 88 | 111 | ||
| 89 | /* | 112 | /* |
| 90 | * Now check if we missed an update -- rely on previous implied | 113 | * We must publish the head before decrementing the nest count, |
| 91 | * compiler barriers to force a re-read. | 114 | * otherwise an IRQ/NMI can publish a more recent head value and our |
| 115 | * write will (temporarily) publish a stale value. | ||
| 92 | */ | 116 | */ |
| 117 | barrier(); | ||
| 118 | WRITE_ONCE(rb->nest, 0); | ||
| 119 | |||
| 120 | /* | ||
| 121 | * Ensure we decrement @rb->nest before we validate the @rb->head. | ||
| 122 | * Otherwise we cannot be sure we caught the 'last' nested update. | ||
| 123 | */ | ||
| 124 | barrier(); | ||
| 93 | if (unlikely(head != local_read(&rb->head))) { | 125 | if (unlikely(head != local_read(&rb->head))) { |
| 94 | local_inc(&rb->nest); | 126 | WRITE_ONCE(rb->nest, 1); |
| 95 | goto again; | 127 | goto again; |
| 96 | } | 128 | } |
| 97 | 129 | ||
| @@ -330,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, | |||
| 330 | struct perf_event *output_event = event; | 362 | struct perf_event *output_event = event; |
| 331 | unsigned long aux_head, aux_tail; | 363 | unsigned long aux_head, aux_tail; |
| 332 | struct ring_buffer *rb; | 364 | struct ring_buffer *rb; |
| 365 | unsigned int nest; | ||
| 333 | 366 | ||
| 334 | if (output_event->parent) | 367 | if (output_event->parent) |
| 335 | output_event = output_event->parent; | 368 | output_event = output_event->parent; |
| @@ -360,13 +393,16 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, | |||
| 360 | if (!refcount_inc_not_zero(&rb->aux_refcount)) | 393 | if (!refcount_inc_not_zero(&rb->aux_refcount)) |
| 361 | goto err; | 394 | goto err; |
| 362 | 395 | ||
| 396 | nest = READ_ONCE(rb->aux_nest); | ||
| 363 | /* | 397 | /* |
| 364 | * Nesting is not supported for AUX area, make sure nested | 398 | * Nesting is not supported for AUX area, make sure nested |
| 365 | * writers are caught early | 399 | * writers are caught early |
| 366 | */ | 400 | */ |
| 367 | if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) | 401 | if (WARN_ON_ONCE(nest)) |
| 368 | goto err_put; | 402 | goto err_put; |
| 369 | 403 | ||
| 404 | WRITE_ONCE(rb->aux_nest, nest + 1); | ||
| 405 | |||
| 370 | aux_head = rb->aux_head; | 406 | aux_head = rb->aux_head; |
| 371 | 407 | ||
| 372 | handle->rb = rb; | 408 | handle->rb = rb; |
| @@ -394,7 +430,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, | |||
| 394 | if (!handle->size) { /* A, matches D */ | 430 | if (!handle->size) { /* A, matches D */ |
| 395 | event->pending_disable = smp_processor_id(); | 431 | event->pending_disable = smp_processor_id(); |
| 396 | perf_output_wakeup(handle); | 432 | perf_output_wakeup(handle); |
| 397 | local_set(&rb->aux_nest, 0); | 433 | WRITE_ONCE(rb->aux_nest, 0); |
| 398 | goto err_put; | 434 | goto err_put; |
| 399 | } | 435 | } |
| 400 | } | 436 | } |
| @@ -471,7 +507,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) | |||
| 471 | perf_event_aux_event(handle->event, aux_head, size, | 507 | perf_event_aux_event(handle->event, aux_head, size, |
| 472 | handle->aux_flags); | 508 | handle->aux_flags); |
| 473 | 509 | ||
| 474 | rb->user_page->aux_head = rb->aux_head; | 510 | WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); |
| 475 | if (rb_need_aux_wakeup(rb)) | 511 | if (rb_need_aux_wakeup(rb)) |
| 476 | wakeup = true; | 512 | wakeup = true; |
| 477 | 513 | ||
| @@ -483,7 +519,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) | |||
| 483 | 519 | ||
| 484 | handle->event = NULL; | 520 | handle->event = NULL; |
| 485 | 521 | ||
| 486 | local_set(&rb->aux_nest, 0); | 522 | WRITE_ONCE(rb->aux_nest, 0); |
| 487 | /* can't be last */ | 523 | /* can't be last */ |
| 488 | rb_free_aux(rb); | 524 | rb_free_aux(rb); |
| 489 | ring_buffer_put(rb); | 525 | ring_buffer_put(rb); |
| @@ -503,7 +539,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) | |||
| 503 | 539 | ||
| 504 | rb->aux_head += size; | 540 | rb->aux_head += size; |
| 505 | 541 | ||
| 506 | rb->user_page->aux_head = rb->aux_head; | 542 | WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); |
| 507 | if (rb_need_aux_wakeup(rb)) { | 543 | if (rb_need_aux_wakeup(rb)) { |
| 508 | perf_output_wakeup(handle); | 544 | perf_output_wakeup(handle); |
| 509 | handle->wakeup = rb->aux_wakeup + rb->aux_watermark; | 545 | handle->wakeup = rb->aux_wakeup + rb->aux_watermark; |
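
The ring_buffer.c hunks above turn the nesting counters into plain unsigned ints and rely on compiler barriers plus READ_ONCE()/WRITE_ONCE() to order the head publication. A stripped-down sketch of that order, using the stock <linux/compiler.h> helpers but an invented structure and function name, and omitting the smp_wmb() that orders the data itself:

    struct demo_rb {
            unsigned int nest;               /* nested writers on this CPU */
            unsigned long head;              /* kernel-side write position */
            unsigned long user_head;         /* position published to userspace */
    };

    static void demo_put_handle(struct demo_rb *rb)
    {
            unsigned long head;
            unsigned int nest = READ_ONCE(rb->nest);

            if (nest > 1) {                  /* inner writer: outermost publishes */
                    WRITE_ONCE(rb->nest, nest - 1);
                    return;
            }
    again:
            barrier();                       /* load head only after the nest check */
            head = READ_ONCE(rb->head);
            WRITE_ONCE(rb->user_head, head); /* publish before dropping nest */
            barrier();
            WRITE_ONCE(rb->nest, 0);
            barrier();                       /* re-check only after nest is dropped */
            if (head != READ_ONCE(rb->head)) {
                    WRITE_ONCE(rb->nest, 1); /* an IRQ advanced head meanwhile */
                    goto again;
            }
    }
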
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 78f61bfc6b79..84fa00497c49 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -46,7 +46,7 @@ static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */ | |||
| 46 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; | 46 | static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; |
| 47 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) | 47 | #define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) |
| 48 | 48 | ||
| 49 | static struct percpu_rw_semaphore dup_mmap_sem; | 49 | DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem); |
| 50 | 50 | ||
| 51 | /* Have a copy of original instruction */ | 51 | /* Have a copy of original instruction */ |
| 52 | #define UPROBE_COPY_INSN 0 | 52 | #define UPROBE_COPY_INSN 0 |
| @@ -2112,7 +2112,7 @@ static void handle_trampoline(struct pt_regs *regs) | |||
| 2112 | 2112 | ||
| 2113 | sigill: | 2113 | sigill: |
| 2114 | uprobe_warn(current, "handle uretprobe, sending SIGILL."); | 2114 | uprobe_warn(current, "handle uretprobe, sending SIGILL."); |
| 2115 | force_sig(SIGILL, current); | 2115 | force_sig(SIGILL); |
| 2116 | 2116 | ||
| 2117 | } | 2117 | } |
| 2118 | 2118 | ||
| @@ -2228,7 +2228,7 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | |||
| 2228 | 2228 | ||
| 2229 | if (unlikely(err)) { | 2229 | if (unlikely(err)) { |
| 2230 | uprobe_warn(current, "execute the probed insn, sending SIGILL."); | 2230 | uprobe_warn(current, "execute the probed insn, sending SIGILL."); |
| 2231 | force_sig(SIGILL, current); | 2231 | force_sig(SIGILL); |
| 2232 | } | 2232 | } |
| 2233 | } | 2233 | } |
| 2234 | 2234 | ||
| @@ -2302,7 +2302,5 @@ void __init uprobes_init(void) | |||
| 2302 | for (i = 0; i < UPROBES_HASH_SZ; i++) | 2302 | for (i = 0; i < UPROBES_HASH_SZ; i++) |
| 2303 | mutex_init(&uprobes_mmap_mutex[i]); | 2303 | mutex_init(&uprobes_mmap_mutex[i]); |
| 2304 | 2304 | ||
| 2305 | BUG_ON(percpu_init_rwsem(&dup_mmap_sem)); | ||
| 2306 | |||
| 2307 | BUG_ON(register_die_notifier(&uprobe_exception_nb)); | 2305 | BUG_ON(register_die_notifier(&uprobe_exception_nb)); |
| 2308 | } | 2306 | } |
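
Two API migrations show up in the uprobes diff: force_sig() lost its task argument and now always targets current, and dup_mmap_sem is initialized at compile time with DEFINE_STATIC_PERCPU_RWSEM() instead of a boot-time percpu_init_rwsem() call. A small sketch of the static-initialization pattern; the semaphore and function names are invented, the lock helpers are the regular percpu-rwsem API:

    #include <linux/percpu-rwsem.h>

    /* Compile-time init: no percpu_init_rwsem() + BUG_ON() needed. */
    DEFINE_STATIC_PERCPU_RWSEM(demo_sem);

    static void demo_read_side(void)
    {
            percpu_down_read(&demo_sem);
            /* ... fast-path section that must exclude the rare writer ... */
            percpu_up_read(&demo_sem);
    }
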
diff --git a/kernel/exit.c b/kernel/exit.c index 2166c2d92ddc..a75b6a7f458a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/exit.c | 3 | * linux/kernel/exit.c |
| 3 | * | 4 | * |
| @@ -194,6 +195,7 @@ repeat: | |||
| 194 | rcu_read_unlock(); | 195 | rcu_read_unlock(); |
| 195 | 196 | ||
| 196 | proc_flush_task(p); | 197 | proc_flush_task(p); |
| 198 | cgroup_release(p); | ||
| 197 | 199 | ||
| 198 | write_lock_irq(&tasklist_lock); | 200 | write_lock_irq(&tasklist_lock); |
| 199 | ptrace_release_task(p); | 201 | ptrace_release_task(p); |
| @@ -219,7 +221,6 @@ repeat: | |||
| 219 | } | 221 | } |
| 220 | 222 | ||
| 221 | write_unlock_irq(&tasklist_lock); | 223 | write_unlock_irq(&tasklist_lock); |
| 222 | cgroup_release(p); | ||
| 223 | release_thread(p); | 224 | release_thread(p); |
| 224 | call_rcu(&p->rcu, delayed_put_task_struct); | 225 | call_rcu(&p->rcu, delayed_put_task_struct); |
| 225 | 226 | ||
| @@ -422,7 +423,7 @@ retry: | |||
| 422 | * freed task structure. | 423 | * freed task structure. |
| 423 | */ | 424 | */ |
| 424 | if (atomic_read(&mm->mm_users) <= 1) { | 425 | if (atomic_read(&mm->mm_users) <= 1) { |
| 425 | mm->owner = NULL; | 426 | WRITE_ONCE(mm->owner, NULL); |
| 426 | return; | 427 | return; |
| 427 | } | 428 | } |
| 428 | 429 | ||
| @@ -462,7 +463,7 @@ retry: | |||
| 462 | * most likely racing with swapoff (try_to_unuse()) or /proc or | 463 | * most likely racing with swapoff (try_to_unuse()) or /proc or |
| 463 | * ptrace or page migration (get_task_mm()). Mark owner as NULL. | 464 | * ptrace or page migration (get_task_mm()). Mark owner as NULL. |
| 464 | */ | 465 | */ |
| 465 | mm->owner = NULL; | 466 | WRITE_ONCE(mm->owner, NULL); |
| 466 | return; | 467 | return; |
| 467 | 468 | ||
| 468 | assign_new_owner: | 469 | assign_new_owner: |
| @@ -483,7 +484,7 @@ assign_new_owner: | |||
| 483 | put_task_struct(c); | 484 | put_task_struct(c); |
| 484 | goto retry; | 485 | goto retry; |
| 485 | } | 486 | } |
| 486 | mm->owner = c; | 487 | WRITE_ONCE(mm->owner, c); |
| 487 | task_unlock(c); | 488 | task_unlock(c); |
| 488 | put_task_struct(c); | 489 | put_task_struct(c); |
| 489 | } | 490 | } |
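
The mm->owner stores in exit.c are wrapped in WRITE_ONCE() because the field can be read locklessly while it changes. A one-line sketch of the kind of reader those annotations pair with; the helper name is invented, and real readers typically also hold rcu_read_lock():

    /* Lockless check whether @p still owns @mm; pairs with the
     * WRITE_ONCE() updates above. */
    static bool demo_mm_owned_by(struct mm_struct *mm, struct task_struct *p)
    {
            return READ_ONCE(mm->owner) == p;
    }
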
diff --git a/kernel/extable.c b/kernel/extable.c index 6a5b61ebc66c..e23cce6e6092 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
| @@ -1,19 +1,7 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* Rewritten by Rusty Russell, on the backs of many others... | 2 | /* Rewritten by Rusty Russell, on the backs of many others... |
| 2 | Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. | 3 | Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. |
| 3 | 4 | ||
| 4 | This program is free software; you can redistribute it and/or modify | ||
| 5 | it under the terms of the GNU General Public License as published by | ||
| 6 | the Free Software Foundation; either version 2 of the License, or | ||
| 7 | (at your option) any later version. | ||
| 8 | |||
| 9 | This program is distributed in the hope that it will be useful, | ||
| 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | GNU General Public License for more details. | ||
| 13 | |||
| 14 | You should have received a copy of the GNU General Public License | ||
| 15 | along with this program; if not, write to the Free Software | ||
| 16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 17 | */ | 5 | */ |
| 18 | #include <linux/ftrace.h> | 6 | #include <linux/ftrace.h> |
| 19 | #include <linux/memory.h> | 7 | #include <linux/memory.h> |
diff --git a/kernel/fail_function.c b/kernel/fail_function.c index feb80712b913..63b349168da7 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c | |||
| @@ -152,20 +152,13 @@ static int fei_retval_get(void *data, u64 *val) | |||
| 152 | DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, | 152 | DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, |
| 153 | "%llx\n"); | 153 | "%llx\n"); |
| 154 | 154 | ||
| 155 | static int fei_debugfs_add_attr(struct fei_attr *attr) | 155 | static void fei_debugfs_add_attr(struct fei_attr *attr) |
| 156 | { | 156 | { |
| 157 | struct dentry *dir; | 157 | struct dentry *dir; |
| 158 | 158 | ||
| 159 | dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); | 159 | dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); |
| 160 | if (!dir) | ||
| 161 | return -ENOMEM; | ||
| 162 | |||
| 163 | if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) { | ||
| 164 | debugfs_remove_recursive(dir); | ||
| 165 | return -ENOMEM; | ||
| 166 | } | ||
| 167 | 160 | ||
| 168 | return 0; | 161 | debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops); |
| 169 | } | 162 | } |
| 170 | 163 | ||
| 171 | static void fei_debugfs_remove_attr(struct fei_attr *attr) | 164 | static void fei_debugfs_remove_attr(struct fei_attr *attr) |
| @@ -306,7 +299,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, | |||
| 306 | 299 | ||
| 307 | ret = register_kprobe(&attr->kp); | 300 | ret = register_kprobe(&attr->kp); |
| 308 | if (!ret) | 301 | if (!ret) |
| 309 | ret = fei_debugfs_add_attr(attr); | 302 | fei_debugfs_add_attr(attr); |
| 310 | if (ret < 0) | 303 | if (ret < 0) |
| 311 | fei_attr_remove(attr); | 304 | fei_attr_remove(attr); |
| 312 | else { | 305 | else { |
| @@ -337,19 +330,13 @@ static int __init fei_debugfs_init(void) | |||
| 337 | return PTR_ERR(dir); | 330 | return PTR_ERR(dir); |
| 338 | 331 | ||
| 339 | /* injectable attribute is just a symlink of error_inject/list */ | 332 | /* injectable attribute is just a symlink of error_inject/list */ |
| 340 | if (!debugfs_create_symlink("injectable", dir, | 333 | debugfs_create_symlink("injectable", dir, "../error_injection/list"); |
| 341 | "../error_injection/list")) | ||
| 342 | goto error; | ||
| 343 | 334 | ||
| 344 | if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops)) | 335 | debugfs_create_file("inject", 0600, dir, NULL, &fei_ops); |
| 345 | goto error; | ||
| 346 | 336 | ||
| 347 | fei_debugfs_dir = dir; | 337 | fei_debugfs_dir = dir; |
| 348 | 338 | ||
| 349 | return 0; | 339 | return 0; |
| 350 | error: | ||
| 351 | debugfs_remove_recursive(dir); | ||
| 352 | return -ENOMEM; | ||
| 353 | } | 340 | } |
| 354 | 341 | ||
| 355 | late_initcall(fei_debugfs_init); | 342 | late_initcall(fei_debugfs_init); |
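
The fail_function changes follow the tree-wide debugfs cleanup: creation helpers are called for their side effect and their return values are no longer checked, since a debugfs failure is not meant to be fatal. A minimal sketch of that style with invented names; debugfs_create_dir() and debugfs_create_u32() are the stock API:

    #include <linux/debugfs.h>

    static struct dentry *demo_dir;
    static u32 demo_count;

    static void demo_debugfs_init(void)
    {
            /* No error checking: a failed parent dentry is simply passed
             * to the next call, which copes with it. */
            demo_dir = debugfs_create_dir("demo", NULL);
            debugfs_create_u32("count", 0644, demo_dir, &demo_count);
    }
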
diff --git a/kernel/fork.c b/kernel/fork.c index 737db1828437..8f3e2d97d771 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/fork.c | 3 | * linux/kernel/fork.c |
| 3 | * | 4 | * |
| @@ -122,7 +123,7 @@ | |||
| 122 | unsigned long total_forks; /* Handle normal Linux uptimes. */ | 123 | unsigned long total_forks; /* Handle normal Linux uptimes. */ |
| 123 | int nr_threads; /* The idle threads do not count.. */ | 124 | int nr_threads; /* The idle threads do not count.. */ |
| 124 | 125 | ||
| 125 | int max_threads; /* tunable limit on nr_threads */ | 126 | static int max_threads; /* tunable limit on nr_threads */ |
| 126 | 127 | ||
| 127 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; | 128 | DEFINE_PER_CPU(unsigned long, process_counts) = 0; |
| 128 | 129 | ||
| @@ -247,7 +248,11 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) | |||
| 247 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, | 248 | struct page *page = alloc_pages_node(node, THREADINFO_GFP, |
| 248 | THREAD_SIZE_ORDER); | 249 | THREAD_SIZE_ORDER); |
| 249 | 250 | ||
| 250 | return page ? page_address(page) : NULL; | 251 | if (likely(page)) { |
| 252 | tsk->stack = page_address(page); | ||
| 253 | return tsk->stack; | ||
| 254 | } | ||
| 255 | return NULL; | ||
| 251 | #endif | 256 | #endif |
| 252 | } | 257 | } |
| 253 | 258 | ||
| @@ -893,6 +898,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
| 893 | #ifdef CONFIG_STACKPROTECTOR | 898 | #ifdef CONFIG_STACKPROTECTOR |
| 894 | tsk->stack_canary = get_random_canary(); | 899 | tsk->stack_canary = get_random_canary(); |
| 895 | #endif | 900 | #endif |
| 901 | if (orig->cpus_ptr == &orig->cpus_mask) | ||
| 902 | tsk->cpus_ptr = &tsk->cpus_mask; | ||
| 896 | 903 | ||
| 897 | /* | 904 | /* |
| 898 | * One for us, one for whoever does the "release_task()" (usually | 905 | * One for us, one for whoever does the "release_task()" (usually |
| @@ -955,6 +962,15 @@ static void mm_init_aio(struct mm_struct *mm) | |||
| 955 | #endif | 962 | #endif |
| 956 | } | 963 | } |
| 957 | 964 | ||
| 965 | static __always_inline void mm_clear_owner(struct mm_struct *mm, | ||
| 966 | struct task_struct *p) | ||
| 967 | { | ||
| 968 | #ifdef CONFIG_MEMCG | ||
| 969 | if (mm->owner == p) | ||
| 970 | WRITE_ONCE(mm->owner, NULL); | ||
| 971 | #endif | ||
| 972 | } | ||
| 973 | |||
| 958 | static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) | 974 | static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) |
| 959 | { | 975 | { |
| 960 | #ifdef CONFIG_MEMCG | 976 | #ifdef CONFIG_MEMCG |
| @@ -1343,6 +1359,7 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, | |||
| 1343 | free_pt: | 1359 | free_pt: |
| 1344 | /* don't put binfmt in mmput, we haven't got module yet */ | 1360 | /* don't put binfmt in mmput, we haven't got module yet */ |
| 1345 | mm->binfmt = NULL; | 1361 | mm->binfmt = NULL; |
| 1362 | mm_init_owner(mm, NULL); | ||
| 1346 | mmput(mm); | 1363 | mmput(mm); |
| 1347 | 1364 | ||
| 1348 | fail_nomem: | 1365 | fail_nomem: |
| @@ -1694,36 +1711,52 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) | |||
| 1694 | } | 1711 | } |
| 1695 | #endif | 1712 | #endif |
| 1696 | 1713 | ||
| 1714 | /* | ||
| 1715 | * Poll support for process exit notification. | ||
| 1716 | */ | ||
| 1717 | static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts) | ||
| 1718 | { | ||
| 1719 | struct task_struct *task; | ||
| 1720 | struct pid *pid = file->private_data; | ||
| 1721 | int poll_flags = 0; | ||
| 1722 | |||
| 1723 | poll_wait(file, &pid->wait_pidfd, pts); | ||
| 1724 | |||
| 1725 | rcu_read_lock(); | ||
| 1726 | task = pid_task(pid, PIDTYPE_PID); | ||
| 1727 | /* | ||
| 1728 | * Inform pollers only when the whole thread group exits. | ||
| 1729 | * If the thread group leader exits before all other threads in the | ||
| 1730 | * group, then poll(2) should block, similar to the wait(2) family. | ||
| 1731 | */ | ||
| 1732 | if (!task || (task->exit_state && thread_group_empty(task))) | ||
| 1733 | poll_flags = POLLIN | POLLRDNORM; | ||
| 1734 | rcu_read_unlock(); | ||
| 1735 | |||
| 1736 | return poll_flags; | ||
| 1737 | } | ||
| 1738 | |||
| 1697 | const struct file_operations pidfd_fops = { | 1739 | const struct file_operations pidfd_fops = { |
| 1698 | .release = pidfd_release, | 1740 | .release = pidfd_release, |
| 1741 | .poll = pidfd_poll, | ||
| 1699 | #ifdef CONFIG_PROC_FS | 1742 | #ifdef CONFIG_PROC_FS |
| 1700 | .show_fdinfo = pidfd_show_fdinfo, | 1743 | .show_fdinfo = pidfd_show_fdinfo, |
| 1701 | #endif | 1744 | #endif |
| 1702 | }; | 1745 | }; |
| 1703 | 1746 | ||
| 1704 | /** | 1747 | static void __delayed_free_task(struct rcu_head *rhp) |
| 1705 | * pidfd_create() - Create a new pid file descriptor. | ||
| 1706 | * | ||
| 1707 | * @pid: struct pid that the pidfd will reference | ||
| 1708 | * | ||
| 1709 | * This creates a new pid file descriptor with the O_CLOEXEC flag set. | ||
| 1710 | * | ||
| 1711 | * Note, that this function can only be called after the fd table has | ||
| 1712 | * been unshared to avoid leaking the pidfd to the new process. | ||
| 1713 | * | ||
| 1714 | * Return: On success, a cloexec pidfd is returned. | ||
| 1715 | * On error, a negative errno number will be returned. | ||
| 1716 | */ | ||
| 1717 | static int pidfd_create(struct pid *pid) | ||
| 1718 | { | 1748 | { |
| 1719 | int fd; | 1749 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); |
| 1720 | 1750 | ||
| 1721 | fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), | 1751 | free_task(tsk); |
| 1722 | O_RDWR | O_CLOEXEC); | 1752 | } |
| 1723 | if (fd < 0) | ||
| 1724 | put_pid(pid); | ||
| 1725 | 1753 | ||
| 1726 | return fd; | 1754 | static __always_inline void delayed_free_task(struct task_struct *tsk) |
| 1755 | { | ||
| 1756 | if (IS_ENABLED(CONFIG_MEMCG)) | ||
| 1757 | call_rcu(&tsk->rcu, __delayed_free_task); | ||
| 1758 | else | ||
| 1759 | free_task(tsk); | ||
| 1727 | } | 1760 | } |
| 1728 | 1761 | ||
| 1729 | /* | 1762 | /* |
| @@ -1735,19 +1768,16 @@ static int pidfd_create(struct pid *pid) | |||
| 1735 | * flags). The actual kick-off is left to the caller. | 1768 | * flags). The actual kick-off is left to the caller. |
| 1736 | */ | 1769 | */ |
| 1737 | static __latent_entropy struct task_struct *copy_process( | 1770 | static __latent_entropy struct task_struct *copy_process( |
| 1738 | unsigned long clone_flags, | ||
| 1739 | unsigned long stack_start, | ||
| 1740 | unsigned long stack_size, | ||
| 1741 | int __user *parent_tidptr, | ||
| 1742 | int __user *child_tidptr, | ||
| 1743 | struct pid *pid, | 1771 | struct pid *pid, |
| 1744 | int trace, | 1772 | int trace, |
| 1745 | unsigned long tls, | 1773 | int node, |
| 1746 | int node) | 1774 | struct kernel_clone_args *args) |
| 1747 | { | 1775 | { |
| 1748 | int pidfd = -1, retval; | 1776 | int pidfd = -1, retval; |
| 1749 | struct task_struct *p; | 1777 | struct task_struct *p; |
| 1750 | struct multiprocess_signals delayed; | 1778 | struct multiprocess_signals delayed; |
| 1779 | struct file *pidfile = NULL; | ||
| 1780 | u64 clone_flags = args->flags; | ||
| 1751 | 1781 | ||
| 1752 | /* | 1782 | /* |
| 1753 | * Don't allow sharing the root directory with processes in a different | 1783 | * Don't allow sharing the root directory with processes in a different |
| @@ -1796,27 +1826,12 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 1796 | } | 1826 | } |
| 1797 | 1827 | ||
| 1798 | if (clone_flags & CLONE_PIDFD) { | 1828 | if (clone_flags & CLONE_PIDFD) { |
| 1799 | int reserved; | ||
| 1800 | |||
| 1801 | /* | 1829 | /* |
| 1802 | * - CLONE_PARENT_SETTID is useless for pidfds and also | ||
| 1803 | * parent_tidptr is used to return pidfds. | ||
| 1804 | * - CLONE_DETACHED is blocked so that we can potentially | 1830 | * - CLONE_DETACHED is blocked so that we can potentially |
| 1805 | * reuse it later for CLONE_PIDFD. | 1831 | * reuse it later for CLONE_PIDFD. |
| 1806 | * - CLONE_THREAD is blocked until someone really needs it. | 1832 | * - CLONE_THREAD is blocked until someone really needs it. |
| 1807 | */ | 1833 | */ |
| 1808 | if (clone_flags & | 1834 | if (clone_flags & (CLONE_DETACHED | CLONE_THREAD)) |
| 1809 | (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD)) | ||
| 1810 | return ERR_PTR(-EINVAL); | ||
| 1811 | |||
| 1812 | /* | ||
| 1813 | * Verify that parent_tidptr is sane so we can potentially | ||
| 1814 | * reuse it later. | ||
| 1815 | */ | ||
| 1816 | if (get_user(reserved, parent_tidptr)) | ||
| 1817 | return ERR_PTR(-EFAULT); | ||
| 1818 | |||
| 1819 | if (reserved != 0) | ||
| 1820 | return ERR_PTR(-EINVAL); | 1835 | return ERR_PTR(-EINVAL); |
| 1821 | } | 1836 | } |
| 1822 | 1837 | ||
| @@ -1849,11 +1864,11 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 1849 | * p->set_child_tid which is (ab)used as a kthread's data pointer for | 1864 | * p->set_child_tid which is (ab)used as a kthread's data pointer for |
| 1850 | * kernel threads (PF_KTHREAD). | 1865 | * kernel threads (PF_KTHREAD). |
| 1851 | */ | 1866 | */ |
| 1852 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; | 1867 | p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; |
| 1853 | /* | 1868 | /* |
| 1854 | * Clear TID on mm_release()? | 1869 | * Clear TID on mm_release()? |
| 1855 | */ | 1870 | */ |
| 1856 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL; | 1871 | p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; |
| 1857 | 1872 | ||
| 1858 | ftrace_graph_init_task(p); | 1873 | ftrace_graph_init_task(p); |
| 1859 | 1874 | ||
| @@ -1958,9 +1973,6 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 1958 | p->pagefault_disabled = 0; | 1973 | p->pagefault_disabled = 0; |
| 1959 | 1974 | ||
| 1960 | #ifdef CONFIG_LOCKDEP | 1975 | #ifdef CONFIG_LOCKDEP |
| 1961 | p->lockdep_depth = 0; /* no locks held yet */ | ||
| 1962 | p->curr_chain_key = 0; | ||
| 1963 | p->lockdep_recursion = 0; | ||
| 1964 | lockdep_init_task(p); | 1976 | lockdep_init_task(p); |
| 1965 | #endif | 1977 | #endif |
| 1966 | 1978 | ||
| @@ -2012,7 +2024,8 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 2012 | retval = copy_io(clone_flags, p); | 2024 | retval = copy_io(clone_flags, p); |
| 2013 | if (retval) | 2025 | if (retval) |
| 2014 | goto bad_fork_cleanup_namespaces; | 2026 | goto bad_fork_cleanup_namespaces; |
| 2015 | retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls); | 2027 | retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p, |
| 2028 | args->tls); | ||
| 2016 | if (retval) | 2029 | if (retval) |
| 2017 | goto bad_fork_cleanup_io; | 2030 | goto bad_fork_cleanup_io; |
| 2018 | 2031 | ||
| @@ -2032,12 +2045,22 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 2032 | * if the fd table isn't shared). | 2045 | * if the fd table isn't shared). |
| 2033 | */ | 2046 | */ |
| 2034 | if (clone_flags & CLONE_PIDFD) { | 2047 | if (clone_flags & CLONE_PIDFD) { |
| 2035 | retval = pidfd_create(pid); | 2048 | retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); |
| 2036 | if (retval < 0) | 2049 | if (retval < 0) |
| 2037 | goto bad_fork_free_pid; | 2050 | goto bad_fork_free_pid; |
| 2038 | 2051 | ||
| 2039 | pidfd = retval; | 2052 | pidfd = retval; |
| 2040 | retval = put_user(pidfd, parent_tidptr); | 2053 | |
| 2054 | pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, | ||
| 2055 | O_RDWR | O_CLOEXEC); | ||
| 2056 | if (IS_ERR(pidfile)) { | ||
| 2057 | put_unused_fd(pidfd); | ||
| 2058 | retval = PTR_ERR(pidfile); | ||
| 2059 | goto bad_fork_free_pid; | ||
| 2060 | } | ||
| 2061 | get_pid(pid); /* held by pidfile now */ | ||
| 2062 | |||
| 2063 | retval = put_user(pidfd, args->pidfd); | ||
| 2041 | if (retval) | 2064 | if (retval) |
| 2042 | goto bad_fork_put_pidfd; | 2065 | goto bad_fork_put_pidfd; |
| 2043 | } | 2066 | } |
| @@ -2068,7 +2091,7 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 2068 | #ifdef TIF_SYSCALL_EMU | 2091 | #ifdef TIF_SYSCALL_EMU |
| 2069 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); | 2092 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); |
| 2070 | #endif | 2093 | #endif |
| 2071 | clear_all_latency_tracing(p); | 2094 | clear_tsk_latency_tracing(p); |
| 2072 | 2095 | ||
| 2073 | /* ok, now we should be set up.. */ | 2096 | /* ok, now we should be set up.. */ |
| 2074 | p->pid = pid_nr(pid); | 2097 | p->pid = pid_nr(pid); |
| @@ -2080,7 +2103,7 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 2080 | if (clone_flags & CLONE_PARENT) | 2103 | if (clone_flags & CLONE_PARENT) |
| 2081 | p->exit_signal = current->group_leader->exit_signal; | 2104 | p->exit_signal = current->group_leader->exit_signal; |
| 2082 | else | 2105 | else |
| 2083 | p->exit_signal = (clone_flags & CSIGNAL); | 2106 | p->exit_signal = args->exit_signal; |
| 2084 | p->group_leader = p; | 2107 | p->group_leader = p; |
| 2085 | p->tgid = p->pid; | 2108 | p->tgid = p->pid; |
| 2086 | } | 2109 | } |
| @@ -2113,7 +2136,7 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 2113 | */ | 2136 | */ |
| 2114 | 2137 | ||
| 2115 | p->start_time = ktime_get_ns(); | 2138 | p->start_time = ktime_get_ns(); |
| 2116 | p->real_start_time = ktime_get_boot_ns(); | 2139 | p->real_start_time = ktime_get_boottime_ns(); |
| 2117 | 2140 | ||
| 2118 | /* | 2141 | /* |
| 2119 | * Make it visible to the rest of the system, but dont wake it up yet. | 2142 | * Make it visible to the rest of the system, but dont wake it up yet. |
| @@ -2154,6 +2177,9 @@ static __latent_entropy struct task_struct *copy_process( | |||
| 2154 | goto bad_fork_cancel_cgroup; | 2177 | goto bad_fork_cancel_cgroup; |
| 2155 | } | 2178 | } |
| 2156 | 2179 | ||
| 2180 | /* past the last point of failure */ | ||
| 2181 | if (pidfile) | ||
| 2182 | fd_install(pidfd, pidfile); | ||
| 2157 | 2183 | ||
| 2158 | init_task_pid_links(p); | 2184 | init_task_pid_links(p); |
| 2159 | if (likely(p->pid)) { | 2185 | if (likely(p->pid)) { |
| @@ -2220,8 +2246,10 @@ bad_fork_cancel_cgroup: | |||
| 2220 | bad_fork_cgroup_threadgroup_change_end: | 2246 | bad_fork_cgroup_threadgroup_change_end: |
| 2221 | cgroup_threadgroup_change_end(current); | 2247 | cgroup_threadgroup_change_end(current); |
| 2222 | bad_fork_put_pidfd: | 2248 | bad_fork_put_pidfd: |
| 2223 | if (clone_flags & CLONE_PIDFD) | 2249 | if (clone_flags & CLONE_PIDFD) { |
| 2224 | ksys_close(pidfd); | 2250 | fput(pidfile); |
| 2251 | put_unused_fd(pidfd); | ||
| 2252 | } | ||
| 2225 | bad_fork_free_pid: | 2253 | bad_fork_free_pid: |
| 2226 | if (pid != &init_struct_pid) | 2254 | if (pid != &init_struct_pid) |
| 2227 | free_pid(pid); | 2255 | free_pid(pid); |
| @@ -2233,8 +2261,10 @@ bad_fork_cleanup_io: | |||
| 2233 | bad_fork_cleanup_namespaces: | 2261 | bad_fork_cleanup_namespaces: |
| 2234 | exit_task_namespaces(p); | 2262 | exit_task_namespaces(p); |
| 2235 | bad_fork_cleanup_mm: | 2263 | bad_fork_cleanup_mm: |
| 2236 | if (p->mm) | 2264 | if (p->mm) { |
| 2265 | mm_clear_owner(p->mm, p); | ||
| 2237 | mmput(p->mm); | 2266 | mmput(p->mm); |
| 2267 | } | ||
| 2238 | bad_fork_cleanup_signal: | 2268 | bad_fork_cleanup_signal: |
| 2239 | if (!(clone_flags & CLONE_THREAD)) | 2269 | if (!(clone_flags & CLONE_THREAD)) |
| 2240 | free_signal_struct(p->signal); | 2270 | free_signal_struct(p->signal); |
| @@ -2265,7 +2295,7 @@ bad_fork_cleanup_count: | |||
| 2265 | bad_fork_free: | 2295 | bad_fork_free: |
| 2266 | p->state = TASK_DEAD; | 2296 | p->state = TASK_DEAD; |
| 2267 | put_task_stack(p); | 2297 | put_task_stack(p); |
| 2268 | free_task(p); | 2298 | delayed_free_task(p); |
| 2269 | fork_out: | 2299 | fork_out: |
| 2270 | spin_lock_irq(¤t->sighand->siglock); | 2300 | spin_lock_irq(¤t->sighand->siglock); |
| 2271 | hlist_del_init(&delayed.node); | 2301 | hlist_del_init(&delayed.node); |
| @@ -2286,8 +2316,11 @@ static inline void init_idle_pids(struct task_struct *idle) | |||
| 2286 | struct task_struct *fork_idle(int cpu) | 2316 | struct task_struct *fork_idle(int cpu) |
| 2287 | { | 2317 | { |
| 2288 | struct task_struct *task; | 2318 | struct task_struct *task; |
| 2289 | task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0, | 2319 | struct kernel_clone_args args = { |
| 2290 | cpu_to_node(cpu)); | 2320 | .flags = CLONE_VM, |
| 2321 | }; | ||
| 2322 | |||
| 2323 | task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); | ||
| 2291 | if (!IS_ERR(task)) { | 2324 | if (!IS_ERR(task)) { |
| 2292 | init_idle_pids(task); | 2325 | init_idle_pids(task); |
| 2293 | init_idle(task, cpu); | 2326 | init_idle(task, cpu); |
| @@ -2307,13 +2340,9 @@ struct mm_struct *copy_init_mm(void) | |||
| 2307 | * It copies the process, and if successful kick-starts | 2340 | * It copies the process, and if successful kick-starts |
| 2308 | * it and waits for it to finish using the VM if required. | 2341 | * it and waits for it to finish using the VM if required. |
| 2309 | */ | 2342 | */ |
| 2310 | long _do_fork(unsigned long clone_flags, | 2343 | long _do_fork(struct kernel_clone_args *args) |
| 2311 | unsigned long stack_start, | ||
| 2312 | unsigned long stack_size, | ||
| 2313 | int __user *parent_tidptr, | ||
| 2314 | int __user *child_tidptr, | ||
| 2315 | unsigned long tls) | ||
| 2316 | { | 2344 | { |
| 2345 | u64 clone_flags = args->flags; | ||
| 2317 | struct completion vfork; | 2346 | struct completion vfork; |
| 2318 | struct pid *pid; | 2347 | struct pid *pid; |
| 2319 | struct task_struct *p; | 2348 | struct task_struct *p; |
| @@ -2329,7 +2358,7 @@ long _do_fork(unsigned long clone_flags, | |||
| 2329 | if (!(clone_flags & CLONE_UNTRACED)) { | 2358 | if (!(clone_flags & CLONE_UNTRACED)) { |
| 2330 | if (clone_flags & CLONE_VFORK) | 2359 | if (clone_flags & CLONE_VFORK) |
| 2331 | trace = PTRACE_EVENT_VFORK; | 2360 | trace = PTRACE_EVENT_VFORK; |
| 2332 | else if ((clone_flags & CSIGNAL) != SIGCHLD) | 2361 | else if (args->exit_signal != SIGCHLD) |
| 2333 | trace = PTRACE_EVENT_CLONE; | 2362 | trace = PTRACE_EVENT_CLONE; |
| 2334 | else | 2363 | else |
| 2335 | trace = PTRACE_EVENT_FORK; | 2364 | trace = PTRACE_EVENT_FORK; |
| @@ -2338,8 +2367,7 @@ long _do_fork(unsigned long clone_flags, | |||
| 2338 | trace = 0; | 2367 | trace = 0; |
| 2339 | } | 2368 | } |
| 2340 | 2369 | ||
| 2341 | p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr, | 2370 | p = copy_process(NULL, trace, NUMA_NO_NODE, args); |
| 2342 | child_tidptr, NULL, trace, tls, NUMA_NO_NODE); | ||
| 2343 | add_latent_entropy(); | 2371 | add_latent_entropy(); |
| 2344 | 2372 | ||
| 2345 | if (IS_ERR(p)) | 2373 | if (IS_ERR(p)) |
| @@ -2355,7 +2383,7 @@ long _do_fork(unsigned long clone_flags, | |||
| 2355 | nr = pid_vnr(pid); | 2383 | nr = pid_vnr(pid); |
| 2356 | 2384 | ||
| 2357 | if (clone_flags & CLONE_PARENT_SETTID) | 2385 | if (clone_flags & CLONE_PARENT_SETTID) |
| 2358 | put_user(nr, parent_tidptr); | 2386 | put_user(nr, args->parent_tid); |
| 2359 | 2387 | ||
| 2360 | if (clone_flags & CLONE_VFORK) { | 2388 | if (clone_flags & CLONE_VFORK) { |
| 2361 | p->vfork_done = &vfork; | 2389 | p->vfork_done = &vfork; |
| @@ -2387,8 +2415,16 @@ long do_fork(unsigned long clone_flags, | |||
| 2387 | int __user *parent_tidptr, | 2415 | int __user *parent_tidptr, |
| 2388 | int __user *child_tidptr) | 2416 | int __user *child_tidptr) |
| 2389 | { | 2417 | { |
| 2390 | return _do_fork(clone_flags, stack_start, stack_size, | 2418 | struct kernel_clone_args args = { |
| 2391 | parent_tidptr, child_tidptr, 0); | 2419 | .flags = (clone_flags & ~CSIGNAL), |
| 2420 | .child_tid = child_tidptr, | ||
| 2421 | .parent_tid = parent_tidptr, | ||
| 2422 | .exit_signal = (clone_flags & CSIGNAL), | ||
| 2423 | .stack = stack_start, | ||
| 2424 | .stack_size = stack_size, | ||
| 2425 | }; | ||
| 2426 | |||
| 2427 | return _do_fork(&args); | ||
| 2392 | } | 2428 | } |
| 2393 | #endif | 2429 | #endif |
| 2394 | 2430 | ||
| @@ -2397,15 +2433,25 @@ long do_fork(unsigned long clone_flags, | |||
| 2397 | */ | 2433 | */ |
| 2398 | pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) | 2434 | pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) |
| 2399 | { | 2435 | { |
| 2400 | return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn, | 2436 | struct kernel_clone_args args = { |
| 2401 | (unsigned long)arg, NULL, NULL, 0); | 2437 | .flags = ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), |
| 2438 | .exit_signal = (flags & CSIGNAL), | ||
| 2439 | .stack = (unsigned long)fn, | ||
| 2440 | .stack_size = (unsigned long)arg, | ||
| 2441 | }; | ||
| 2442 | |||
| 2443 | return _do_fork(&args); | ||
| 2402 | } | 2444 | } |
| 2403 | 2445 | ||
| 2404 | #ifdef __ARCH_WANT_SYS_FORK | 2446 | #ifdef __ARCH_WANT_SYS_FORK |
| 2405 | SYSCALL_DEFINE0(fork) | 2447 | SYSCALL_DEFINE0(fork) |
| 2406 | { | 2448 | { |
| 2407 | #ifdef CONFIG_MMU | 2449 | #ifdef CONFIG_MMU |
| 2408 | return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0); | 2450 | struct kernel_clone_args args = { |
| 2451 | .exit_signal = SIGCHLD, | ||
| 2452 | }; | ||
| 2453 | |||
| 2454 | return _do_fork(&args); | ||
| 2409 | #else | 2455 | #else |
| 2410 | /* can not support in nommu mode */ | 2456 | /* can not support in nommu mode */ |
| 2411 | return -EINVAL; | 2457 | return -EINVAL; |
| @@ -2416,8 +2462,12 @@ SYSCALL_DEFINE0(fork) | |||
| 2416 | #ifdef __ARCH_WANT_SYS_VFORK | 2462 | #ifdef __ARCH_WANT_SYS_VFORK |
| 2417 | SYSCALL_DEFINE0(vfork) | 2463 | SYSCALL_DEFINE0(vfork) |
| 2418 | { | 2464 | { |
| 2419 | return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0, | 2465 | struct kernel_clone_args args = { |
| 2420 | 0, NULL, NULL, 0); | 2466 | .flags = CLONE_VFORK | CLONE_VM, |
| 2467 | .exit_signal = SIGCHLD, | ||
| 2468 | }; | ||
| 2469 | |||
| 2470 | return _do_fork(&args); | ||
| 2421 | } | 2471 | } |
| 2422 | #endif | 2472 | #endif |
| 2423 | 2473 | ||
| @@ -2445,7 +2495,112 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, | |||
| 2445 | unsigned long, tls) | 2495 | unsigned long, tls) |
| 2446 | #endif | 2496 | #endif |
| 2447 | { | 2497 | { |
| 2448 | return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls); | 2498 | struct kernel_clone_args args = { |
| 2499 | .flags = (clone_flags & ~CSIGNAL), | ||
| 2500 | .pidfd = parent_tidptr, | ||
| 2501 | .child_tid = child_tidptr, | ||
| 2502 | .parent_tid = parent_tidptr, | ||
| 2503 | .exit_signal = (clone_flags & CSIGNAL), | ||
| 2504 | .stack = newsp, | ||
| 2505 | .tls = tls, | ||
| 2506 | }; | ||
| 2507 | |||
| 2508 | /* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */ | ||
| 2509 | if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID)) | ||
| 2510 | return -EINVAL; | ||
| 2511 | |||
| 2512 | return _do_fork(&args); | ||
| 2513 | } | ||
| 2514 | #endif | ||
| 2515 | |||
| 2516 | #ifdef __ARCH_WANT_SYS_CLONE3 | ||
| 2517 | noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, | ||
| 2518 | struct clone_args __user *uargs, | ||
| 2519 | size_t size) | ||
| 2520 | { | ||
| 2521 | struct clone_args args; | ||
| 2522 | |||
| 2523 | if (unlikely(size > PAGE_SIZE)) | ||
| 2524 | return -E2BIG; | ||
| 2525 | |||
| 2526 | if (unlikely(size < sizeof(struct clone_args))) | ||
| 2527 | return -EINVAL; | ||
| 2528 | |||
| 2529 | if (unlikely(!access_ok(uargs, size))) | ||
| 2530 | return -EFAULT; | ||
| 2531 | |||
| 2532 | if (size > sizeof(struct clone_args)) { | ||
| 2533 | unsigned char __user *addr; | ||
| 2534 | unsigned char __user *end; | ||
| 2535 | unsigned char val; | ||
| 2536 | |||
| 2537 | addr = (void __user *)uargs + sizeof(struct clone_args); | ||
| 2538 | end = (void __user *)uargs + size; | ||
| 2539 | |||
| 2540 | for (; addr < end; addr++) { | ||
| 2541 | if (get_user(val, addr)) | ||
| 2542 | return -EFAULT; | ||
| 2543 | if (val) | ||
| 2544 | return -E2BIG; | ||
| 2545 | } | ||
| 2546 | |||
| 2547 | size = sizeof(struct clone_args); | ||
| 2548 | } | ||
| 2549 | |||
| 2550 | if (copy_from_user(&args, uargs, size)) | ||
| 2551 | return -EFAULT; | ||
| 2552 | |||
| 2553 | *kargs = (struct kernel_clone_args){ | ||
| 2554 | .flags = args.flags, | ||
| 2555 | .pidfd = u64_to_user_ptr(args.pidfd), | ||
| 2556 | .child_tid = u64_to_user_ptr(args.child_tid), | ||
| 2557 | .parent_tid = u64_to_user_ptr(args.parent_tid), | ||
| 2558 | .exit_signal = args.exit_signal, | ||
| 2559 | .stack = args.stack, | ||
| 2560 | .stack_size = args.stack_size, | ||
| 2561 | .tls = args.tls, | ||
| 2562 | }; | ||
| 2563 | |||
| 2564 | return 0; | ||
| 2565 | } | ||
| 2566 | |||
| 2567 | static bool clone3_args_valid(const struct kernel_clone_args *kargs) | ||
| 2568 | { | ||
| 2569 | /* | ||
| 2570 | * All lower bits of the flag word are taken. | ||
| 2571 | * Verify that no other unknown flags are passed along. | ||
| 2572 | */ | ||
| 2573 | if (kargs->flags & ~CLONE_LEGACY_FLAGS) | ||
| 2574 | return false; | ||
| 2575 | |||
| 2576 | /* | ||
| 2577 | * - make the CLONE_DETACHED bit reusable for clone3 | ||
| 2578 | * - make the CSIGNAL bits reusable for clone3 | ||
| 2579 | */ | ||
| 2580 | if (kargs->flags & (CLONE_DETACHED | CSIGNAL)) | ||
| 2581 | return false; | ||
| 2582 | |||
| 2583 | if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && | ||
| 2584 | kargs->exit_signal) | ||
| 2585 | return false; | ||
| 2586 | |||
| 2587 | return true; | ||
| 2588 | } | ||
| 2589 | |||
| 2590 | SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) | ||
| 2591 | { | ||
| 2592 | int err; | ||
| 2593 | |||
| 2594 | struct kernel_clone_args kargs; | ||
| 2595 | |||
| 2596 | err = copy_clone_args_from_user(&kargs, uargs, size); | ||
| 2597 | if (err) | ||
| 2598 | return err; | ||
| 2599 | |||
| 2600 | if (!clone3_args_valid(&kargs)) | ||
| 2601 | return -EINVAL; | ||
| 2602 | |||
| 2603 | return _do_fork(&kargs); | ||
| 2449 | } | 2604 | } |
| 2450 | #endif | 2605 | #endif |
| 2451 | 2606 | ||
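
The fork.c diff introduces struct kernel_clone_args, switches CLONE_PIDFD to the get_unused_fd_flags()/fd_install() pattern, adds poll support on pidfds, and wires up the new clone3() syscall. A hedged userspace sketch exercising the last two together, assuming uapi headers new enough to provide __NR_clone3, struct clone_args and CLONE_PIDFD; there is no glibc wrapper at this point, and every name in the sketch is illustrative:

    #define _GNU_SOURCE
    #include <poll.h>
    #include <signal.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/sched.h>        /* struct clone_args, CLONE_PIDFD */

    static int demo_clone3_pidfd(void)
    {
            struct clone_args args;
            int pidfd = -1;
            long pid;

            memset(&args, 0, sizeof(args));
            args.flags = CLONE_PIDFD;
            args.pidfd = (uint64_t)(uintptr_t)&pidfd;
            args.exit_signal = SIGCHLD;

            pid = syscall(__NR_clone3, &args, sizeof(args));
            if (pid < 0)
                    return -1;
            if (pid == 0)
                    _exit(0);               /* child */

            /* The pidfd becomes readable once the whole thread group
             * has exited, matching pidfd_poll() above. */
            struct pollfd pfd = { .fd = pidfd, .events = POLLIN };
            poll(&pfd, 1, -1);
            close(pidfd);
            return 0;
    }
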
diff --git a/kernel/freezer.c b/kernel/freezer.c index b162b74611e4..c0738424bb43 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/freezer.c - Function to freeze a process | 3 | * kernel/freezer.c - Function to freeze a process |
| 3 | * | 4 | * |
diff --git a/kernel/futex.c b/kernel/futex.c index 2268b97d5439..6d50728ef2e7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * Fast Userspace Mutexes (which I call "Futexes!"). | 3 | * Fast Userspace Mutexes (which I call "Futexes!"). |
| 3 | * (C) Rusty Russell, IBM 2002 | 4 | * (C) Rusty Russell, IBM 2002 |
| @@ -29,20 +30,6 @@ | |||
| 29 | * | 30 | * |
| 30 | * "The futexes are also cursed." | 31 | * "The futexes are also cursed." |
| 31 | * "But they come in a choice of three flavours!" | 32 | * "But they come in a choice of three flavours!" |
| 32 | * | ||
| 33 | * This program is free software; you can redistribute it and/or modify | ||
| 34 | * it under the terms of the GNU General Public License as published by | ||
| 35 | * the Free Software Foundation; either version 2 of the License, or | ||
| 36 | * (at your option) any later version. | ||
| 37 | * | ||
| 38 | * This program is distributed in the hope that it will be useful, | ||
| 39 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 40 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 41 | * GNU General Public License for more details. | ||
| 42 | * | ||
| 43 | * You should have received a copy of the GNU General Public License | ||
| 44 | * along with this program; if not, write to the Free Software | ||
| 45 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 46 | */ | 33 | */ |
| 47 | #include <linux/compat.h> | 34 | #include <linux/compat.h> |
| 48 | #include <linux/slab.h> | 35 | #include <linux/slab.h> |
| @@ -484,6 +471,37 @@ enum futex_access { | |||
| 484 | }; | 471 | }; |
| 485 | 472 | ||
| 486 | /** | 473 | /** |
| 474 | * futex_setup_timer - set up the sleeping hrtimer. | ||
| 475 | * @time: ptr to the given timeout value | ||
| 476 | * @timeout: the hrtimer_sleeper structure to be set up | ||
| 477 | * @flags: futex flags | ||
| 478 | * @range_ns: optional range in ns | ||
| 479 | * | ||
| 480 | * Return: Initialized hrtimer_sleeper structure or NULL if no timeout | ||
| 481 | * value given | ||
| 482 | */ | ||
| 483 | static inline struct hrtimer_sleeper * | ||
| 484 | futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, | ||
| 485 | int flags, u64 range_ns) | ||
| 486 | { | ||
| 487 | if (!time) | ||
| 488 | return NULL; | ||
| 489 | |||
| 490 | hrtimer_init_on_stack(&timeout->timer, (flags & FLAGS_CLOCKRT) ? | ||
| 491 | CLOCK_REALTIME : CLOCK_MONOTONIC, | ||
| 492 | HRTIMER_MODE_ABS); | ||
| 493 | hrtimer_init_sleeper(timeout, current); | ||
| 494 | |||
| 495 | /* | ||
| 496 | * If range_ns is 0, calling hrtimer_set_expires_range_ns() is | ||
| 497 | * effectively the same as calling hrtimer_set_expires(). | ||
| 498 | */ | ||
| 499 | hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); | ||
| 500 | |||
| 501 | return timeout; | ||
| 502 | } | ||
| 503 | |||
| 504 | /** | ||
| 487 | * get_futex_key() - Get parameters which are the keys for a futex | 505 | * get_futex_key() - Get parameters which are the keys for a futex |
| 488 | * @uaddr: virtual address of the futex | 506 | * @uaddr: virtual address of the futex |
| 489 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED | 507 | * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED |
| @@ -2692,7 +2710,7 @@ out: | |||
| 2692 | static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, | 2710 | static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, |
| 2693 | ktime_t *abs_time, u32 bitset) | 2711 | ktime_t *abs_time, u32 bitset) |
| 2694 | { | 2712 | { |
| 2695 | struct hrtimer_sleeper timeout, *to = NULL; | 2713 | struct hrtimer_sleeper timeout, *to; |
| 2696 | struct restart_block *restart; | 2714 | struct restart_block *restart; |
| 2697 | struct futex_hash_bucket *hb; | 2715 | struct futex_hash_bucket *hb; |
| 2698 | struct futex_q q = futex_q_init; | 2716 | struct futex_q q = futex_q_init; |
| @@ -2702,17 +2720,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, | |||
| 2702 | return -EINVAL; | 2720 | return -EINVAL; |
| 2703 | q.bitset = bitset; | 2721 | q.bitset = bitset; |
| 2704 | 2722 | ||
| 2705 | if (abs_time) { | 2723 | to = futex_setup_timer(abs_time, &timeout, flags, |
| 2706 | to = &timeout; | 2724 | current->timer_slack_ns); |
| 2707 | |||
| 2708 | hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? | ||
| 2709 | CLOCK_REALTIME : CLOCK_MONOTONIC, | ||
| 2710 | HRTIMER_MODE_ABS); | ||
| 2711 | hrtimer_init_sleeper(to, current); | ||
| 2712 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | ||
| 2713 | current->timer_slack_ns); | ||
| 2714 | } | ||
| 2715 | |||
| 2716 | retry: | 2725 | retry: |
| 2717 | /* | 2726 | /* |
| 2718 | * Prepare to wait on uaddr. On success, holds hb lock and increments | 2727 | * Prepare to wait on uaddr. On success, holds hb lock and increments |
| @@ -2792,7 +2801,7 @@ static long futex_wait_restart(struct restart_block *restart) | |||
| 2792 | static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, | 2801 | static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, |
| 2793 | ktime_t *time, int trylock) | 2802 | ktime_t *time, int trylock) |
| 2794 | { | 2803 | { |
| 2795 | struct hrtimer_sleeper timeout, *to = NULL; | 2804 | struct hrtimer_sleeper timeout, *to; |
| 2796 | struct futex_pi_state *pi_state = NULL; | 2805 | struct futex_pi_state *pi_state = NULL; |
| 2797 | struct rt_mutex_waiter rt_waiter; | 2806 | struct rt_mutex_waiter rt_waiter; |
| 2798 | struct futex_hash_bucket *hb; | 2807 | struct futex_hash_bucket *hb; |
| @@ -2805,13 +2814,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, | |||
| 2805 | if (refill_pi_state_cache()) | 2814 | if (refill_pi_state_cache()) |
| 2806 | return -ENOMEM; | 2815 | return -ENOMEM; |
| 2807 | 2816 | ||
| 2808 | if (time) { | 2817 | to = futex_setup_timer(time, &timeout, FLAGS_CLOCKRT, 0); |
| 2809 | to = &timeout; | ||
| 2810 | hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, | ||
| 2811 | HRTIMER_MODE_ABS); | ||
| 2812 | hrtimer_init_sleeper(to, current); | ||
| 2813 | hrtimer_set_expires(&to->timer, *time); | ||
| 2814 | } | ||
| 2815 | 2818 | ||
| 2816 | retry: | 2819 | retry: |
| 2817 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); | 2820 | ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE); |
| @@ -3208,7 +3211,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
| 3208 | u32 val, ktime_t *abs_time, u32 bitset, | 3211 | u32 val, ktime_t *abs_time, u32 bitset, |
| 3209 | u32 __user *uaddr2) | 3212 | u32 __user *uaddr2) |
| 3210 | { | 3213 | { |
| 3211 | struct hrtimer_sleeper timeout, *to = NULL; | 3214 | struct hrtimer_sleeper timeout, *to; |
| 3212 | struct futex_pi_state *pi_state = NULL; | 3215 | struct futex_pi_state *pi_state = NULL; |
| 3213 | struct rt_mutex_waiter rt_waiter; | 3216 | struct rt_mutex_waiter rt_waiter; |
| 3214 | struct futex_hash_bucket *hb; | 3217 | struct futex_hash_bucket *hb; |
| @@ -3225,15 +3228,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, | |||
| 3225 | if (!bitset) | 3228 | if (!bitset) |
| 3226 | return -EINVAL; | 3229 | return -EINVAL; |
| 3227 | 3230 | ||
| 3228 | if (abs_time) { | 3231 | to = futex_setup_timer(abs_time, &timeout, flags, |
| 3229 | to = &timeout; | 3232 | current->timer_slack_ns); |
| 3230 | hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? | ||
| 3231 | CLOCK_REALTIME : CLOCK_MONOTONIC, | ||
| 3232 | HRTIMER_MODE_ABS); | ||
| 3233 | hrtimer_init_sleeper(to, current); | ||
| 3234 | hrtimer_set_expires_range_ns(&to->timer, *abs_time, | ||
| 3235 | current->timer_slack_ns); | ||
| 3236 | } | ||
| 3237 | 3233 | ||
| 3238 | /* | 3234 | /* |
| 3239 | * The waiter is allocated on our stack, manipulated by the requeue | 3235 | * The waiter is allocated on our stack, manipulated by the requeue |
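
The three futex wait paths above now share futex_setup_timer() instead of open-coding the hrtimer setup. A condensed sketch of the resulting caller pattern, with the queueing and wakeup logic elided and an invented function name; the teardown on the way out is unchanged:

    static int demo_futex_wait(ktime_t *abs_time, unsigned int flags)
    {
            struct hrtimer_sleeper timeout, *to;

            /* A NULL @abs_time means "wait forever": no timer is armed. */
            to = futex_setup_timer(abs_time, &timeout, flags,
                                   current->timer_slack_ns);

            /* ... queue the futex_q, start the timer, schedule() ... */

            if (to) {
                    hrtimer_cancel(&to->timer);
                    destroy_hrtimer_on_stack(&to->timer);
            }
            return 0;
    }
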
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 1e3823fa799b..3941a9c48f83 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | menu "GCOV-based kernel profiling" | 2 | menu "GCOV-based kernel profiling" |
| 2 | 3 | ||
| 3 | config GCOV_KERNEL | 4 | config GCOV_KERNEL |
| @@ -53,6 +54,7 @@ config GCOV_PROFILE_ALL | |||
| 53 | choice | 54 | choice |
| 54 | prompt "Specify GCOV format" | 55 | prompt "Specify GCOV format" |
| 55 | depends on GCOV_KERNEL | 56 | depends on GCOV_KERNEL |
| 57 | depends on CC_IS_GCC | ||
| 56 | ---help--- | 58 | ---help--- |
| 57 | The gcov format is usually determined by the GCC version, and the | 59 | The gcov format is usually determined by the GCC version, and the |
| 58 | default is chosen according to your GCC version. However, there are | 60 | default is chosen according to your GCC version. However, there are |
| @@ -62,7 +64,7 @@ choice | |||
| 62 | 64 | ||
| 63 | config GCOV_FORMAT_3_4 | 65 | config GCOV_FORMAT_3_4 |
| 64 | bool "GCC 3.4 format" | 66 | bool "GCC 3.4 format" |
| 65 | depends on CC_IS_GCC && GCC_VERSION < 40700 | 67 | depends on GCC_VERSION < 40700 |
| 66 | ---help--- | 68 | ---help--- |
| 67 | Select this option to use the format defined by GCC 3.4. | 69 | Select this option to use the format defined by GCC 3.4. |
| 68 | 70 | ||
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile index ff06d64df397..d66a74b0f100 100644 --- a/kernel/gcov/Makefile +++ b/kernel/gcov/Makefile | |||
| @@ -2,5 +2,6 @@ | |||
| 2 | ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' | 2 | ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' |
| 3 | 3 | ||
| 4 | obj-y := base.o fs.o | 4 | obj-y := base.o fs.o |
| 5 | obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_3_4.o | 5 | obj-$(CONFIG_GCOV_FORMAT_3_4) += gcc_base.o gcc_3_4.o |
| 6 | obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_4_7.o | 6 | obj-$(CONFIG_GCOV_FORMAT_4_7) += gcc_base.o gcc_4_7.o |
| 7 | obj-$(CONFIG_CC_IS_CLANG) += clang.o | ||
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 9c7c8d5c18f2..0ffe9f194080 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c | |||
| @@ -22,88 +22,8 @@ | |||
| 22 | #include <linux/sched.h> | 22 | #include <linux/sched.h> |
| 23 | #include "gcov.h" | 23 | #include "gcov.h" |
| 24 | 24 | ||
| 25 | static int gcov_events_enabled; | 25 | int gcov_events_enabled; |
| 26 | static DEFINE_MUTEX(gcov_lock); | 26 | DEFINE_MUTEX(gcov_lock); |
| 27 | |||
| 28 | /* | ||
| 29 | * __gcov_init is called by gcc-generated constructor code for each object | ||
| 30 | * file compiled with -fprofile-arcs. | ||
| 31 | */ | ||
| 32 | void __gcov_init(struct gcov_info *info) | ||
| 33 | { | ||
| 34 | static unsigned int gcov_version; | ||
| 35 | |||
| 36 | mutex_lock(&gcov_lock); | ||
| 37 | if (gcov_version == 0) { | ||
| 38 | gcov_version = gcov_info_version(info); | ||
| 39 | /* | ||
| 40 | * Printing gcc's version magic may prove useful for debugging | ||
| 41 | * incompatibility reports. | ||
| 42 | */ | ||
| 43 | pr_info("version magic: 0x%x\n", gcov_version); | ||
| 44 | } | ||
| 45 | /* | ||
| 46 | * Add new profiling data structure to list and inform event | ||
| 47 | * listener. | ||
| 48 | */ | ||
| 49 | gcov_info_link(info); | ||
| 50 | if (gcov_events_enabled) | ||
| 51 | gcov_event(GCOV_ADD, info); | ||
| 52 | mutex_unlock(&gcov_lock); | ||
| 53 | } | ||
| 54 | EXPORT_SYMBOL(__gcov_init); | ||
| 55 | |||
| 56 | /* | ||
| 57 | * These functions may be referenced by gcc-generated profiling code but serve | ||
| 58 | * no function for kernel profiling. | ||
| 59 | */ | ||
| 60 | void __gcov_flush(void) | ||
| 61 | { | ||
| 62 | /* Unused. */ | ||
| 63 | } | ||
| 64 | EXPORT_SYMBOL(__gcov_flush); | ||
| 65 | |||
| 66 | void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) | ||
| 67 | { | ||
| 68 | /* Unused. */ | ||
| 69 | } | ||
| 70 | EXPORT_SYMBOL(__gcov_merge_add); | ||
| 71 | |||
| 72 | void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) | ||
| 73 | { | ||
| 74 | /* Unused. */ | ||
| 75 | } | ||
| 76 | EXPORT_SYMBOL(__gcov_merge_single); | ||
| 77 | |||
| 78 | void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) | ||
| 79 | { | ||
| 80 | /* Unused. */ | ||
| 81 | } | ||
| 82 | EXPORT_SYMBOL(__gcov_merge_delta); | ||
| 83 | |||
| 84 | void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) | ||
| 85 | { | ||
| 86 | /* Unused. */ | ||
| 87 | } | ||
| 88 | EXPORT_SYMBOL(__gcov_merge_ior); | ||
| 89 | |||
| 90 | void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) | ||
| 91 | { | ||
| 92 | /* Unused. */ | ||
| 93 | } | ||
| 94 | EXPORT_SYMBOL(__gcov_merge_time_profile); | ||
| 95 | |||
| 96 | void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) | ||
| 97 | { | ||
| 98 | /* Unused. */ | ||
| 99 | } | ||
| 100 | EXPORT_SYMBOL(__gcov_merge_icall_topn); | ||
| 101 | |||
| 102 | void __gcov_exit(void) | ||
| 103 | { | ||
| 104 | /* Unused. */ | ||
| 105 | } | ||
| 106 | EXPORT_SYMBOL(__gcov_exit); | ||
| 107 | 27 | ||
| 108 | /** | 28 | /** |
| 109 | * gcov_enable_events - enable event reporting through gcov_event() | 29 | * gcov_enable_events - enable event reporting through gcov_event() |
| @@ -144,7 +64,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, | |||
| 144 | 64 | ||
| 145 | /* Remove entries located in module from linked list. */ | 65 | /* Remove entries located in module from linked list. */ |
| 146 | while ((info = gcov_info_next(info))) { | 66 | while ((info = gcov_info_next(info))) { |
| 147 | if (within_module((unsigned long)info, mod)) { | 67 | if (gcov_info_within_module(info, mod)) { |
| 148 | gcov_info_unlink(prev, info); | 68 | gcov_info_unlink(prev, info); |
| 149 | if (gcov_events_enabled) | 69 | if (gcov_events_enabled) |
| 150 | gcov_event(GCOV_REMOVE, info); | 70 | gcov_event(GCOV_REMOVE, info); |
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c new file mode 100644 index 000000000000..c94b820a1b62 --- /dev/null +++ b/kernel/gcov/clang.c | |||
| @@ -0,0 +1,581 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | /* | ||
| 3 | * Copyright (C) 2019 Google, Inc. | ||
| 4 | * modified from kernel/gcov/gcc_4_7.c | ||
| 5 | * | ||
| 6 | * This software is licensed under the terms of the GNU General Public | ||
| 7 | * License version 2, as published by the Free Software Foundation, and | ||
| 8 | * may be copied, distributed, and modified under those terms. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 13 | * GNU General Public License for more details. | ||
| 14 | * | ||
| 15 | * | ||
| 16 | * LLVM uses profiling data that's deliberately similar to GCC, but has a | ||
| 17 | * very different way of exporting that data. LLVM calls llvm_gcov_init() once | ||
| 18 | * per module, and provides a couple of callbacks that we can use to ask for | ||
| 19 | * more data. | ||
| 20 | * | ||
| 21 | * We care about the "writeout" callback, which in turn calls back into | ||
| 22 | * compiler-rt/this module to dump all the gathered coverage data to disk: | ||
| 23 | * | ||
| 24 | * llvm_gcda_start_file() | ||
| 25 | * llvm_gcda_emit_function() | ||
| 26 | * llvm_gcda_emit_arcs() | ||
| 27 | * llvm_gcda_emit_function() | ||
| 28 | * llvm_gcda_emit_arcs() | ||
| 29 | * [... repeats for each function ...] | ||
| 30 | * llvm_gcda_summary_info() | ||
| 31 | * llvm_gcda_end_file() | ||
| 32 | * | ||
| 33 | * This design is much more stateless and unstructured than gcc's, and is | ||
| 34 | * intended to run at process exit. This forces us to keep some local state | ||
| 35 | * about which module we're dealing with at the moment. On the other hand, it | ||
| 36 | * also means we don't depend as much on how LLVM represents profiling data | ||
| 37 | * internally. | ||
| 38 | * | ||
| 39 | * See LLVM's lib/Transforms/Instrumentation/GCOVProfiling.cpp for more | ||
| 40 | * details on how this works, particularly GCOVProfiler::emitProfileArcs(), | ||
| 41 | * GCOVProfiler::insertCounterWriteout(), and | ||
| 42 | * GCOVProfiler::insertFlush(). | ||
| 43 | */ | ||
| 44 | |||
| 45 | #define pr_fmt(fmt) "gcov: " fmt | ||
| 46 | |||
| 47 | #include <linux/kernel.h> | ||
| 48 | #include <linux/list.h> | ||
| 49 | #include <linux/printk.h> | ||
| 50 | #include <linux/ratelimit.h> | ||
| 51 | #include <linux/seq_file.h> | ||
| 52 | #include <linux/slab.h> | ||
| 53 | #include <linux/vmalloc.h> | ||
| 54 | #include "gcov.h" | ||
| 55 | |||
| 56 | typedef void (*llvm_gcov_callback)(void); | ||
| 57 | |||
| 58 | struct gcov_info { | ||
| 59 | struct list_head head; | ||
| 60 | |||
| 61 | const char *filename; | ||
| 62 | unsigned int version; | ||
| 63 | u32 checksum; | ||
| 64 | |||
| 65 | struct list_head functions; | ||
| 66 | }; | ||
| 67 | |||
| 68 | struct gcov_fn_info { | ||
| 69 | struct list_head head; | ||
| 70 | |||
| 71 | u32 ident; | ||
| 72 | u32 checksum; | ||
| 73 | u8 use_extra_checksum; | ||
| 74 | u32 cfg_checksum; | ||
| 75 | |||
| 76 | u32 num_counters; | ||
| 77 | u64 *counters; | ||
| 78 | const char *function_name; | ||
| 79 | }; | ||
| 80 | |||
| 81 | static struct gcov_info *current_info; | ||
| 82 | |||
| 83 | static LIST_HEAD(clang_gcov_list); | ||
| 84 | |||
| 85 | void llvm_gcov_init(llvm_gcov_callback writeout, llvm_gcov_callback flush) | ||
| 86 | { | ||
| 87 | struct gcov_info *info = kzalloc(sizeof(*info), GFP_KERNEL); | ||
| 88 | |||
| 89 | if (!info) | ||
| 90 | return; | ||
| 91 | |||
| 92 | INIT_LIST_HEAD(&info->head); | ||
| 93 | INIT_LIST_HEAD(&info->functions); | ||
| 94 | |||
| 95 | mutex_lock(&gcov_lock); | ||
| 96 | |||
| 97 | list_add_tail(&info->head, &clang_gcov_list); | ||
| 98 | current_info = info; | ||
| 99 | writeout(); | ||
| 100 | current_info = NULL; | ||
| 101 | if (gcov_events_enabled) | ||
| 102 | gcov_event(GCOV_ADD, info); | ||
| 103 | |||
| 104 | mutex_unlock(&gcov_lock); | ||
| 105 | } | ||
| 106 | EXPORT_SYMBOL(llvm_gcov_init); | ||
| 107 | |||
| 108 | void llvm_gcda_start_file(const char *orig_filename, const char version[4], | ||
| 109 | u32 checksum) | ||
| 110 | { | ||
| 111 | current_info->filename = orig_filename; | ||
| 112 | memcpy(¤t_info->version, version, sizeof(current_info->version)); | ||
| 113 | current_info->checksum = checksum; | ||
| 114 | } | ||
| 115 | EXPORT_SYMBOL(llvm_gcda_start_file); | ||
| 116 | |||
| 117 | void llvm_gcda_emit_function(u32 ident, const char *function_name, | ||
| 118 | u32 func_checksum, u8 use_extra_checksum, u32 cfg_checksum) | ||
| 119 | { | ||
| 120 | struct gcov_fn_info *info = kzalloc(sizeof(*info), GFP_KERNEL); | ||
| 121 | |||
| 122 | if (!info) | ||
| 123 | return; | ||
| 124 | |||
| 125 | INIT_LIST_HEAD(&info->head); | ||
| 126 | info->ident = ident; | ||
| 127 | info->checksum = func_checksum; | ||
| 128 | info->use_extra_checksum = use_extra_checksum; | ||
| 129 | info->cfg_checksum = cfg_checksum; | ||
| 130 | if (function_name) | ||
| 131 | info->function_name = kstrdup(function_name, GFP_KERNEL); | ||
| 132 | |||
| 133 | list_add_tail(&info->head, ¤t_info->functions); | ||
| 134 | } | ||
| 135 | EXPORT_SYMBOL(llvm_gcda_emit_function); | ||
| 136 | |||
| 137 | void llvm_gcda_emit_arcs(u32 num_counters, u64 *counters) | ||
| 138 | { | ||
| 139 | struct gcov_fn_info *info = list_last_entry(¤t_info->functions, | ||
| 140 | struct gcov_fn_info, head); | ||
| 141 | |||
| 142 | info->num_counters = num_counters; | ||
| 143 | info->counters = counters; | ||
| 144 | } | ||
| 145 | EXPORT_SYMBOL(llvm_gcda_emit_arcs); | ||
| 146 | |||
| 147 | void llvm_gcda_summary_info(void) | ||
| 148 | { | ||
| 149 | } | ||
| 150 | EXPORT_SYMBOL(llvm_gcda_summary_info); | ||
| 151 | |||
| 152 | void llvm_gcda_end_file(void) | ||
| 153 | { | ||
| 154 | } | ||
| 155 | EXPORT_SYMBOL(llvm_gcda_end_file); | ||
| 156 | |||
| 157 | /** | ||
| 158 | * gcov_info_filename - return info filename | ||
| 159 | * @info: profiling data set | ||
| 160 | */ | ||
| 161 | const char *gcov_info_filename(struct gcov_info *info) | ||
| 162 | { | ||
| 163 | return info->filename; | ||
| 164 | } | ||
| 165 | |||
| 166 | /** | ||
| 167 | * gcov_info_version - return info version | ||
| 168 | * @info: profiling data set | ||
| 169 | */ | ||
| 170 | unsigned int gcov_info_version(struct gcov_info *info) | ||
| 171 | { | ||
| 172 | return info->version; | ||
| 173 | } | ||
| 174 | |||
| 175 | /** | ||
| 176 | * gcov_info_next - return next profiling data set | ||
| 177 | * @info: profiling data set | ||
| 178 | * | ||
| 179 | * Returns next gcov_info following @info or first gcov_info in the chain if | ||
| 180 | * @info is %NULL. | ||
| 181 | */ | ||
| 182 | struct gcov_info *gcov_info_next(struct gcov_info *info) | ||
| 183 | { | ||
| 184 | if (!info) | ||
| 185 | return list_first_entry_or_null(&clang_gcov_list, | ||
| 186 | struct gcov_info, head); | ||
| 187 | if (list_is_last(&info->head, &clang_gcov_list)) | ||
| 188 | return NULL; | ||
| 189 | return list_next_entry(info, head); | ||
| 190 | } | ||
| 191 | |||
| 192 | /** | ||
| 193 | * gcov_info_link - link/add profiling data set to the list | ||
| 194 | * @info: profiling data set | ||
| 195 | */ | ||
| 196 | void gcov_info_link(struct gcov_info *info) | ||
| 197 | { | ||
| 198 | list_add_tail(&info->head, &clang_gcov_list); | ||
| 199 | } | ||
| 200 | |||
| 201 | /** | ||
| 202 | * gcov_info_unlink - unlink/remove profiling data set from the list | ||
| 203 | * @prev: previous profiling data set | ||
| 204 | * @info: profiling data set | ||
| 205 | */ | ||
| 206 | void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) | ||
| 207 | { | ||
| 208 | /* Generic code unlinks while iterating. */ | ||
| 209 | __list_del_entry(&info->head); | ||
| 210 | } | ||
| 211 | |||
| 212 | /** | ||
| 213 | * gcov_info_within_module - check if a profiling data set belongs to a module | ||
| 214 | * @info: profiling data set | ||
| 215 | * @mod: module | ||
| 216 | * | ||
| 217 | * Returns true if profiling data belongs to the module, false otherwise. | ||
| 218 | */ | ||
| 219 | bool gcov_info_within_module(struct gcov_info *info, struct module *mod) | ||
| 220 | { | ||
| 221 | return within_module((unsigned long)info->filename, mod); | ||
| 222 | } | ||
| 223 | |||
| 224 | /* Symbolic links to be created for each profiling data file. */ | ||
| 225 | const struct gcov_link gcov_link[] = { | ||
| 226 | { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ | ||
| 227 | { 0, NULL}, | ||
| 228 | }; | ||
| 229 | |||
| 230 | /** | ||
| 231 | * gcov_info_reset - reset profiling data to zero | ||
| 232 | * @info: profiling data set | ||
| 233 | */ | ||
| 234 | void gcov_info_reset(struct gcov_info *info) | ||
| 235 | { | ||
| 236 | struct gcov_fn_info *fn; | ||
| 237 | |||
| 238 | list_for_each_entry(fn, &info->functions, head) | ||
| 239 | memset(fn->counters, 0, | ||
| 240 | sizeof(fn->counters[0]) * fn->num_counters); | ||
| 241 | } | ||
| 242 | |||
| 243 | /** | ||
| 244 | * gcov_info_is_compatible - check if profiling data can be added | ||
| 245 | * @info1: first profiling data set | ||
| 246 | * @info2: second profiling data set | ||
| 247 | * | ||
| 248 | * Returns non-zero if profiling data can be added, zero otherwise. | ||
| 249 | */ | ||
| 250 | int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) | ||
| 251 | { | ||
| 252 | struct gcov_fn_info *fn_ptr1 = list_first_entry_or_null( | ||
| 253 | &info1->functions, struct gcov_fn_info, head); | ||
| 254 | struct gcov_fn_info *fn_ptr2 = list_first_entry_or_null( | ||
| 255 | &info2->functions, struct gcov_fn_info, head); | ||
| 256 | |||
| 257 | if (info1->checksum != info2->checksum) | ||
| 258 | return false; | ||
| 259 | if (!fn_ptr1) | ||
| 260 | return fn_ptr1 == fn_ptr2; | ||
| 261 | while (!list_is_last(&fn_ptr1->head, &info1->functions) && | ||
| 262 | !list_is_last(&fn_ptr2->head, &info2->functions)) { | ||
| 263 | if (fn_ptr1->checksum != fn_ptr2->checksum) | ||
| 264 | return false; | ||
| 265 | if (fn_ptr1->use_extra_checksum != fn_ptr2->use_extra_checksum) | ||
| 266 | return false; | ||
| 267 | if (fn_ptr1->use_extra_checksum && | ||
| 268 | fn_ptr1->cfg_checksum != fn_ptr2->cfg_checksum) | ||
| 269 | return false; | ||
| 270 | fn_ptr1 = list_next_entry(fn_ptr1, head); | ||
| 271 | fn_ptr2 = list_next_entry(fn_ptr2, head); | ||
| 272 | } | ||
| 273 | return list_is_last(&fn_ptr1->head, &info1->functions) && | ||
| 274 | list_is_last(&fn_ptr2->head, &info2->functions); | ||
| 275 | } | ||
| 276 | |||
| 277 | /** | ||
| 278 | * gcov_info_add - add up profiling data | ||
| 279 | * @dst: profiling data set to which data is added | ||
| 280 | * @src: profiling data set which is added | ||
| 281 | * | ||
| 282 | * Adds profiling counts of @src to @dst. | ||
| 283 | */ | ||
| 284 | void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) | ||
| 285 | { | ||
| 286 | struct gcov_fn_info *dfn_ptr; | ||
| 287 | struct gcov_fn_info *sfn_ptr = list_first_entry_or_null(&src->functions, | ||
| 288 | struct gcov_fn_info, head); | ||
| 289 | |||
| 290 | list_for_each_entry(dfn_ptr, &dst->functions, head) { | ||
| 291 | u32 i; | ||
| 292 | |||
| 293 | for (i = 0; i < sfn_ptr->num_counters; i++) | ||
| 294 | dfn_ptr->counters[i] += sfn_ptr->counters[i]; | ||
| 295 | } | ||
| 296 | } | ||
| 297 | |||
| 298 | static struct gcov_fn_info *gcov_fn_info_dup(struct gcov_fn_info *fn) | ||
| 299 | { | ||
| 300 | size_t cv_size; /* counter values size */ | ||
| 301 | struct gcov_fn_info *fn_dup = kmemdup(fn, sizeof(*fn), | ||
| 302 | GFP_KERNEL); | ||
| 303 | if (!fn_dup) | ||
| 304 | return NULL; | ||
| 305 | INIT_LIST_HEAD(&fn_dup->head); | ||
| 306 | |||
| 307 | fn_dup->function_name = kstrdup(fn->function_name, GFP_KERNEL); | ||
| 308 | if (!fn_dup->function_name) | ||
| 309 | goto err_name; | ||
| 310 | |||
| 311 | cv_size = fn->num_counters * sizeof(fn->counters[0]); | ||
| 312 | fn_dup->counters = vmalloc(cv_size); | ||
| 313 | if (!fn_dup->counters) | ||
| 314 | goto err_counters; | ||
| 315 | memcpy(fn_dup->counters, fn->counters, cv_size); | ||
| 316 | |||
| 317 | return fn_dup; | ||
| 318 | |||
| 319 | err_counters: | ||
| 320 | kfree(fn_dup->function_name); | ||
| 321 | err_name: | ||
| 322 | kfree(fn_dup); | ||
| 323 | return NULL; | ||
| 324 | } | ||
| 325 | |||
| 326 | /** | ||
| 327 | * gcov_info_dup - duplicate profiling data set | ||
| 328 | * @info: profiling data set to duplicate | ||
| 329 | * | ||
| 330 | * Return newly allocated duplicate on success, %NULL on error. | ||
| 331 | */ | ||
| 332 | struct gcov_info *gcov_info_dup(struct gcov_info *info) | ||
| 333 | { | ||
| 334 | struct gcov_info *dup; | ||
| 335 | struct gcov_fn_info *fn; | ||
| 336 | |||
| 337 | dup = kmemdup(info, sizeof(*dup), GFP_KERNEL); | ||
| 338 | if (!dup) | ||
| 339 | return NULL; | ||
| 340 | INIT_LIST_HEAD(&dup->head); | ||
| 341 | INIT_LIST_HEAD(&dup->functions); | ||
| 342 | dup->filename = kstrdup(info->filename, GFP_KERNEL); | ||
| 343 | if (!dup->filename) | ||
| 344 | goto err; | ||
| 345 | |||
| 346 | list_for_each_entry(fn, &info->functions, head) { | ||
| 347 | struct gcov_fn_info *fn_dup = gcov_fn_info_dup(fn); | ||
| 348 | |||
| 349 | if (!fn_dup) | ||
| 350 | goto err; | ||
| 351 | list_add_tail(&fn_dup->head, &dup->functions); | ||
| 352 | } | ||
| 353 | |||
| 354 | return dup; | ||
| 355 | |||
| 356 | err: | ||
| 357 | gcov_info_free(dup); | ||
| 358 | return NULL; | ||
| 359 | } | ||
| 360 | |||
| 361 | /** | ||
| 362 | * gcov_info_free - release memory for profiling data set duplicate | ||
| 363 | * @info: profiling data set duplicate to free | ||
| 364 | */ | ||
| 365 | void gcov_info_free(struct gcov_info *info) | ||
| 366 | { | ||
| 367 | struct gcov_fn_info *fn, *tmp; | ||
| 368 | |||
| 369 | list_for_each_entry_safe(fn, tmp, &info->functions, head) { | ||
| 370 | kfree(fn->function_name); | ||
| 371 | vfree(fn->counters); | ||
| 372 | list_del(&fn->head); | ||
| 373 | kfree(fn); | ||
| 374 | } | ||
| 375 | kfree(info->filename); | ||
| 376 | kfree(info); | ||
| 377 | } | ||
| 378 | |||
| 379 | #define ITER_STRIDE PAGE_SIZE | ||
| 380 | |||
| 381 | /** | ||
| 382 | * struct gcov_iterator - specifies current file position in logical records | ||
| 383 | * @info: associated profiling data | ||
| 384 | * @buffer: buffer containing file data | ||
| 385 | * @size: size of buffer | ||
| 386 | * @pos: current position in file | ||
| 387 | */ | ||
| 388 | struct gcov_iterator { | ||
| 389 | struct gcov_info *info; | ||
| 390 | void *buffer; | ||
| 391 | size_t size; | ||
| 392 | loff_t pos; | ||
| 393 | }; | ||
| 394 | |||
| 395 | /** | ||
| 396 | * store_gcov_u32 - store 32 bit number in gcov format to buffer | ||
| 397 | * @buffer: target buffer or NULL | ||
| 398 | * @off: offset into the buffer | ||
| 399 | * @v: value to be stored | ||
| 400 | * | ||
| 401 | * Number format defined by gcc: numbers are recorded in the 32 bit | ||
| 402 | * unsigned binary form of the endianness of the machine generating the | ||
| 403 | * file. Returns the number of bytes stored. If @buffer is %NULL, doesn't | ||
| 404 | * store anything. | ||
| 405 | */ | ||
| 406 | static size_t store_gcov_u32(void *buffer, size_t off, u32 v) | ||
| 407 | { | ||
| 408 | u32 *data; | ||
| 409 | |||
| 410 | if (buffer) { | ||
| 411 | data = buffer + off; | ||
| 412 | *data = v; | ||
| 413 | } | ||
| 414 | |||
| 415 | return sizeof(*data); | ||
| 416 | } | ||
| 417 | |||
| 418 | /** | ||
| 419 | * store_gcov_u64 - store 64 bit number in gcov format to buffer | ||
| 420 | * @buffer: target buffer or NULL | ||
| 421 | * @off: offset into the buffer | ||
| 422 | * @v: value to be stored | ||
| 423 | * | ||
| 424 | * Number format defined by gcc: numbers are recorded in the 32 bit | ||
| 425 | * unsigned binary form of the endianness of the machine generating the | ||
| 426 | * file. 64 bit numbers are stored as two 32 bit numbers, the low part | ||
| 427 | * first. Returns the number of bytes stored. If @buffer is %NULL, doesn't store | ||
| 428 | * anything. | ||
| 429 | */ | ||
| 430 | static size_t store_gcov_u64(void *buffer, size_t off, u64 v) | ||
| 431 | { | ||
| 432 | u32 *data; | ||
| 433 | |||
| 434 | if (buffer) { | ||
| 435 | data = buffer + off; | ||
| 436 | |||
| 437 | data[0] = (v & 0xffffffffUL); | ||
| 438 | data[1] = (v >> 32); | ||
| 439 | } | ||
| 440 | |||
| 441 | return sizeof(*data) * 2; | ||
| 442 | } | ||
| 443 | |||
| 444 | /** | ||
| 445 | * convert_to_gcda - convert profiling data set to gcda file format | ||
| 446 | * @buffer: the buffer to store file data or %NULL if no data should be stored | ||
| 447 | * @info: profiling data set to be converted | ||
| 448 | * | ||
| 449 | * Returns the number of bytes that were/would have been stored into the buffer. | ||
| 450 | */ | ||
| 451 | static size_t convert_to_gcda(char *buffer, struct gcov_info *info) | ||
| 452 | { | ||
| 453 | struct gcov_fn_info *fi_ptr; | ||
| 454 | size_t pos = 0; | ||
| 455 | |||
| 456 | /* File header. */ | ||
| 457 | pos += store_gcov_u32(buffer, pos, GCOV_DATA_MAGIC); | ||
| 458 | pos += store_gcov_u32(buffer, pos, info->version); | ||
| 459 | pos += store_gcov_u32(buffer, pos, info->checksum); | ||
| 460 | |||
| 461 | list_for_each_entry(fi_ptr, &info->functions, head) { | ||
| 462 | u32 i; | ||
| 463 | u32 len = 2; | ||
| 464 | |||
| 465 | if (fi_ptr->use_extra_checksum) | ||
| 466 | len++; | ||
| 467 | |||
| 468 | pos += store_gcov_u32(buffer, pos, GCOV_TAG_FUNCTION); | ||
| 469 | pos += store_gcov_u32(buffer, pos, len); | ||
| 470 | pos += store_gcov_u32(buffer, pos, fi_ptr->ident); | ||
| 471 | pos += store_gcov_u32(buffer, pos, fi_ptr->checksum); | ||
| 472 | if (fi_ptr->use_extra_checksum) | ||
| 473 | pos += store_gcov_u32(buffer, pos, fi_ptr->cfg_checksum); | ||
| 474 | |||
| 475 | pos += store_gcov_u32(buffer, pos, GCOV_TAG_COUNTER_BASE); | ||
| 476 | pos += store_gcov_u32(buffer, pos, fi_ptr->num_counters * 2); | ||
| 477 | for (i = 0; i < fi_ptr->num_counters; i++) | ||
| 478 | pos += store_gcov_u64(buffer, pos, fi_ptr->counters[i]); | ||
| 479 | } | ||
| 480 | |||
| 481 | return pos; | ||
| 482 | } | ||
| 483 | |||
| 484 | /** | ||
| 485 | * gcov_iter_new - allocate and initialize profiling data iterator | ||
| 486 | * @info: profiling data set to be iterated | ||
| 487 | * | ||
| 488 | * Return file iterator on success, %NULL otherwise. | ||
| 489 | */ | ||
| 490 | struct gcov_iterator *gcov_iter_new(struct gcov_info *info) | ||
| 491 | { | ||
| 492 | struct gcov_iterator *iter; | ||
| 493 | |||
| 494 | iter = kzalloc(sizeof(struct gcov_iterator), GFP_KERNEL); | ||
| 495 | if (!iter) | ||
| 496 | goto err_free; | ||
| 497 | |||
| 498 | iter->info = info; | ||
| 499 | /* Dry-run to get the actual buffer size. */ | ||
| 500 | iter->size = convert_to_gcda(NULL, info); | ||
| 501 | iter->buffer = vmalloc(iter->size); | ||
| 502 | if (!iter->buffer) | ||
| 503 | goto err_free; | ||
| 504 | |||
| 505 | convert_to_gcda(iter->buffer, info); | ||
| 506 | |||
| 507 | return iter; | ||
| 508 | |||
| 509 | err_free: | ||
| 510 | kfree(iter); | ||
| 511 | return NULL; | ||
| 512 | } | ||
| 513 | |||
| 514 | |||
| 515 | /** | ||
| 516 | * gcov_iter_free - release memory for file iterator | ||
| 517 | * @iter: file iterator | ||
| 518 | */ | ||
| 519 | void gcov_iter_free(struct gcov_iterator *iter) | ||
| 520 | { | ||
| 521 | vfree(iter->buffer); | ||
| 522 | kfree(iter); | ||
| 523 | } | ||
| 524 | |||
| 525 | /** | ||
| 526 | * gcov_iter_get_info - return profiling data set for given file iterator | ||
| 527 | * @iter: file iterator | ||
| 528 | */ | ||
| 529 | struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) | ||
| 530 | { | ||
| 531 | return iter->info; | ||
| 532 | } | ||
| 533 | |||
| 534 | /** | ||
| 535 | * gcov_iter_start - reset file iterator to starting position | ||
| 536 | * @iter: file iterator | ||
| 537 | */ | ||
| 538 | void gcov_iter_start(struct gcov_iterator *iter) | ||
| 539 | { | ||
| 540 | iter->pos = 0; | ||
| 541 | } | ||
| 542 | |||
| 543 | /** | ||
| 544 | * gcov_iter_next - advance file iterator to next logical record | ||
| 545 | * @iter: file iterator | ||
| 546 | * | ||
| 547 | * Return zero if new position is valid, non-zero if iterator has reached end. | ||
| 548 | */ | ||
| 549 | int gcov_iter_next(struct gcov_iterator *iter) | ||
| 550 | { | ||
| 551 | if (iter->pos < iter->size) | ||
| 552 | iter->pos += ITER_STRIDE; | ||
| 553 | |||
| 554 | if (iter->pos >= iter->size) | ||
| 555 | return -EINVAL; | ||
| 556 | |||
| 557 | return 0; | ||
| 558 | } | ||
| 559 | |||
| 560 | /** | ||
| 561 | * gcov_iter_write - write data for current pos to seq_file | ||
| 562 | * @iter: file iterator | ||
| 563 | * @seq: seq_file handle | ||
| 564 | * | ||
| 565 | * Return zero on success, non-zero otherwise. | ||
| 566 | */ | ||
| 567 | int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) | ||
| 568 | { | ||
| 569 | size_t len; | ||
| 570 | |||
| 571 | if (iter->pos >= iter->size) | ||
| 572 | return -EINVAL; | ||
| 573 | |||
| 574 | len = ITER_STRIDE; | ||
| 575 | if (iter->pos + len > iter->size) | ||
| 576 | len = iter->size - iter->pos; | ||
| 577 | |||
| 578 | seq_write(seq, iter->buffer + iter->pos, len); | ||
| 579 | |||
| 580 | return 0; | ||
| 581 | } | ||
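The iterator code at the end of clang.c serializes a profiling data set once via convert_to_gcda() and then hands the buffer out in ITER_STRIDE-sized chunks. A hedged sketch of how a consumer drives this API; the function name is illustrative, and the real caller is the seq_file code in fs.c:

/* Illustrative consumer only; dump_gcda() is not a real kernel symbol. */
static int dump_gcda(struct gcov_info *info, struct seq_file *seq)
{
	struct gcov_iterator *iter = gcov_iter_new(info);

	if (!iter)
		return -ENOMEM;

	gcov_iter_start(iter);
	do {
		/* Writes at most ITER_STRIDE bytes from the current position. */
		if (gcov_iter_write(iter, seq))
			break;
	} while (!gcov_iter_next(iter));	/* returns 0 while data remains */

	gcov_iter_free(iter);
	return 0;
}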
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 6e40ff6be083..e5eb5ea7ea59 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c | |||
| @@ -64,7 +64,6 @@ struct gcov_node { | |||
| 64 | static const char objtree[] = OBJTREE; | 64 | static const char objtree[] = OBJTREE; |
| 65 | static const char srctree[] = SRCTREE; | 65 | static const char srctree[] = SRCTREE; |
| 66 | static struct gcov_node root_node; | 66 | static struct gcov_node root_node; |
| 67 | static struct dentry *reset_dentry; | ||
| 68 | static LIST_HEAD(all_head); | 67 | static LIST_HEAD(all_head); |
| 69 | static DEFINE_MUTEX(node_lock); | 68 | static DEFINE_MUTEX(node_lock); |
| 70 | 69 | ||
| @@ -387,8 +386,6 @@ static void add_links(struct gcov_node *node, struct dentry *parent) | |||
| 387 | goto out_err; | 386 | goto out_err; |
| 388 | node->links[i] = debugfs_create_symlink(deskew(basename), | 387 | node->links[i] = debugfs_create_symlink(deskew(basename), |
| 389 | parent, target); | 388 | parent, target); |
| 390 | if (!node->links[i]) | ||
| 391 | goto out_err; | ||
| 392 | kfree(target); | 389 | kfree(target); |
| 393 | } | 390 | } |
| 394 | 391 | ||
| @@ -450,11 +447,6 @@ static struct gcov_node *new_node(struct gcov_node *parent, | |||
| 450 | parent->dentry, node, &gcov_data_fops); | 447 | parent->dentry, node, &gcov_data_fops); |
| 451 | } else | 448 | } else |
| 452 | node->dentry = debugfs_create_dir(node->name, parent->dentry); | 449 | node->dentry = debugfs_create_dir(node->name, parent->dentry); |
| 453 | if (!node->dentry) { | ||
| 454 | pr_warn("could not create file\n"); | ||
| 455 | kfree(node); | ||
| 456 | return NULL; | ||
| 457 | } | ||
| 458 | if (info) | 450 | if (info) |
| 459 | add_links(node, parent->dentry); | 451 | add_links(node, parent->dentry); |
| 460 | list_add(&node->list, &parent->children); | 452 | list_add(&node->list, &parent->children); |
| @@ -761,32 +753,20 @@ void gcov_event(enum gcov_action action, struct gcov_info *info) | |||
| 761 | /* Create debugfs entries. */ | 753 | /* Create debugfs entries. */ |
| 762 | static __init int gcov_fs_init(void) | 754 | static __init int gcov_fs_init(void) |
| 763 | { | 755 | { |
| 764 | int rc = -EIO; | ||
| 765 | |||
| 766 | init_node(&root_node, NULL, NULL, NULL); | 756 | init_node(&root_node, NULL, NULL, NULL); |
| 767 | /* | 757 | /* |
| 768 | * /sys/kernel/debug/gcov will be parent for the reset control file | 758 | * /sys/kernel/debug/gcov will be parent for the reset control file |
| 769 | * and all profiling files. | 759 | * and all profiling files. |
| 770 | */ | 760 | */ |
| 771 | root_node.dentry = debugfs_create_dir("gcov", NULL); | 761 | root_node.dentry = debugfs_create_dir("gcov", NULL); |
| 772 | if (!root_node.dentry) | ||
| 773 | goto err_remove; | ||
| 774 | /* | 762 | /* |
| 775 | * Create reset file which resets all profiling counts when written | 763 | * Create reset file which resets all profiling counts when written |
| 776 | * to. | 764 | * to. |
| 777 | */ | 765 | */ |
| 778 | reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry, | 766 | debugfs_create_file("reset", 0600, root_node.dentry, NULL, |
| 779 | NULL, &gcov_reset_fops); | 767 | &gcov_reset_fops); |
| 780 | if (!reset_dentry) | ||
| 781 | goto err_remove; | ||
| 782 | /* Replay previous events to get our fs hierarchy up-to-date. */ | 768 | /* Replay previous events to get our fs hierarchy up-to-date. */ |
| 783 | gcov_enable_events(); | 769 | gcov_enable_events(); |
| 784 | return 0; | 770 | return 0; |
| 785 | |||
| 786 | err_remove: | ||
| 787 | pr_err("init failed\n"); | ||
| 788 | debugfs_remove(root_node.dentry); | ||
| 789 | |||
| 790 | return rc; | ||
| 791 | } | 771 | } |
| 792 | device_initcall(gcov_fs_init); | 772 | device_initcall(gcov_fs_init); |
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index 2dddecbdbe6e..801ee4b0b969 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c | |||
| @@ -137,6 +137,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) | |||
| 137 | gcov_info_head = info->next; | 137 | gcov_info_head = info->next; |
| 138 | } | 138 | } |
| 139 | 139 | ||
| 140 | /** | ||
| 141 | * gcov_info_within_module - check if a profiling data set belongs to a module | ||
| 142 | * @info: profiling data set | ||
| 143 | * @mod: module | ||
| 144 | * | ||
| 145 | * Returns true if profiling data belongs to the module, false otherwise. | ||
| 146 | */ | ||
| 147 | bool gcov_info_within_module(struct gcov_info *info, struct module *mod) | ||
| 148 | { | ||
| 149 | return within_module((unsigned long)info, mod); | ||
| 150 | } | ||
| 151 | |||
| 140 | /* Symbolic links to be created for each profiling data file. */ | 152 | /* Symbolic links to be created for each profiling data file. */ |
| 141 | const struct gcov_link gcov_link[] = { | 153 | const struct gcov_link gcov_link[] = { |
| 142 | { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ | 154 | { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ |
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index ca5e5c0ef853..ec37563674d6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c | |||
| @@ -150,6 +150,18 @@ void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info) | |||
| 150 | gcov_info_head = info->next; | 150 | gcov_info_head = info->next; |
| 151 | } | 151 | } |
| 152 | 152 | ||
| 153 | /** | ||
| 154 | * gcov_info_within_module - check if a profiling data set belongs to a module | ||
| 155 | * @info: profiling data set | ||
| 156 | * @mod: module | ||
| 157 | * | ||
| 158 | * Returns true if profiling data belongs to the module, false otherwise. | ||
| 159 | */ | ||
| 160 | bool gcov_info_within_module(struct gcov_info *info, struct module *mod) | ||
| 161 | { | ||
| 162 | return within_module((unsigned long)info, mod); | ||
| 163 | } | ||
| 164 | |||
| 153 | /* Symbolic links to be created for each profiling data file. */ | 165 | /* Symbolic links to be created for each profiling data file. */ |
| 154 | const struct gcov_link gcov_link[] = { | 166 | const struct gcov_link gcov_link[] = { |
| 155 | { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ | 167 | { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ |
diff --git a/kernel/gcov/gcc_base.c b/kernel/gcov/gcc_base.c new file mode 100644 index 000000000000..3cf736b9f880 --- /dev/null +++ b/kernel/gcov/gcc_base.c | |||
| @@ -0,0 +1,86 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | |||
| 3 | #include <linux/export.h> | ||
| 4 | #include <linux/kernel.h> | ||
| 5 | #include <linux/mutex.h> | ||
| 6 | #include "gcov.h" | ||
| 7 | |||
| 8 | /* | ||
| 9 | * __gcov_init is called by gcc-generated constructor code for each object | ||
| 10 | * file compiled with -fprofile-arcs. | ||
| 11 | */ | ||
| 12 | void __gcov_init(struct gcov_info *info) | ||
| 13 | { | ||
| 14 | static unsigned int gcov_version; | ||
| 15 | |||
| 16 | mutex_lock(&gcov_lock); | ||
| 17 | if (gcov_version == 0) { | ||
| 18 | gcov_version = gcov_info_version(info); | ||
| 19 | /* | ||
| 20 | * Printing gcc's version magic may prove useful for debugging | ||
| 21 | * incompatibility reports. | ||
| 22 | */ | ||
| 23 | pr_info("version magic: 0x%x\n", gcov_version); | ||
| 24 | } | ||
| 25 | /* | ||
| 26 | * Add new profiling data structure to list and inform event | ||
| 27 | * listener. | ||
| 28 | */ | ||
| 29 | gcov_info_link(info); | ||
| 30 | if (gcov_events_enabled) | ||
| 31 | gcov_event(GCOV_ADD, info); | ||
| 32 | mutex_unlock(&gcov_lock); | ||
| 33 | } | ||
| 34 | EXPORT_SYMBOL(__gcov_init); | ||
| 35 | |||
| 36 | /* | ||
| 37 | * These functions may be referenced by gcc-generated profiling code but serve | ||
| 38 | * no function for kernel profiling. | ||
| 39 | */ | ||
| 40 | void __gcov_flush(void) | ||
| 41 | { | ||
| 42 | /* Unused. */ | ||
| 43 | } | ||
| 44 | EXPORT_SYMBOL(__gcov_flush); | ||
| 45 | |||
| 46 | void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) | ||
| 47 | { | ||
| 48 | /* Unused. */ | ||
| 49 | } | ||
| 50 | EXPORT_SYMBOL(__gcov_merge_add); | ||
| 51 | |||
| 52 | void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) | ||
| 53 | { | ||
| 54 | /* Unused. */ | ||
| 55 | } | ||
| 56 | EXPORT_SYMBOL(__gcov_merge_single); | ||
| 57 | |||
| 58 | void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) | ||
| 59 | { | ||
| 60 | /* Unused. */ | ||
| 61 | } | ||
| 62 | EXPORT_SYMBOL(__gcov_merge_delta); | ||
| 63 | |||
| 64 | void __gcov_merge_ior(gcov_type *counters, unsigned int n_counters) | ||
| 65 | { | ||
| 66 | /* Unused. */ | ||
| 67 | } | ||
| 68 | EXPORT_SYMBOL(__gcov_merge_ior); | ||
| 69 | |||
| 70 | void __gcov_merge_time_profile(gcov_type *counters, unsigned int n_counters) | ||
| 71 | { | ||
| 72 | /* Unused. */ | ||
| 73 | } | ||
| 74 | EXPORT_SYMBOL(__gcov_merge_time_profile); | ||
| 75 | |||
| 76 | void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) | ||
| 77 | { | ||
| 78 | /* Unused. */ | ||
| 79 | } | ||
| 80 | EXPORT_SYMBOL(__gcov_merge_icall_topn); | ||
| 81 | |||
| 82 | void __gcov_exit(void) | ||
| 83 | { | ||
| 84 | /* Unused. */ | ||
| 85 | } | ||
| 86 | EXPORT_SYMBOL(__gcov_exit); | ||
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index de118ad4a024..6ab2c1808c9d 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h | |||
| @@ -15,6 +15,7 @@ | |||
| 15 | #ifndef GCOV_H | 15 | #ifndef GCOV_H |
| 16 | #define GCOV_H GCOV_H | 16 | #define GCOV_H GCOV_H |
| 17 | 17 | ||
| 18 | #include <linux/module.h> | ||
| 18 | #include <linux/types.h> | 19 | #include <linux/types.h> |
| 19 | 20 | ||
| 20 | /* | 21 | /* |
| @@ -46,6 +47,7 @@ unsigned int gcov_info_version(struct gcov_info *info); | |||
| 46 | struct gcov_info *gcov_info_next(struct gcov_info *info); | 47 | struct gcov_info *gcov_info_next(struct gcov_info *info); |
| 47 | void gcov_info_link(struct gcov_info *info); | 48 | void gcov_info_link(struct gcov_info *info); |
| 48 | void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); | 49 | void gcov_info_unlink(struct gcov_info *prev, struct gcov_info *info); |
| 50 | bool gcov_info_within_module(struct gcov_info *info, struct module *mod); | ||
| 49 | 51 | ||
| 50 | /* Base interface. */ | 52 | /* Base interface. */ |
| 51 | enum gcov_action { | 53 | enum gcov_action { |
| @@ -83,4 +85,7 @@ struct gcov_link { | |||
| 83 | }; | 85 | }; |
| 84 | extern const struct gcov_link gcov_link[]; | 86 | extern const struct gcov_link gcov_link[]; |
| 85 | 87 | ||
| 88 | extern int gcov_events_enabled; | ||
| 89 | extern struct mutex gcov_lock; | ||
| 90 | |||
| 86 | #endif /* GCOV_H */ | 91 | #endif /* GCOV_H */ |
diff --git a/kernel/gen_ikh_data.sh b/kernel/gen_kheaders.sh index 591a94f7b387..9ff449888d9c 100755 --- a/kernel/gen_ikh_data.sh +++ b/kernel/gen_kheaders.sh | |||
| @@ -2,26 +2,14 @@ | |||
| 2 | # SPDX-License-Identifier: GPL-2.0 | 2 | # SPDX-License-Identifier: GPL-2.0 |
| 3 | 3 | ||
| 4 | # This script generates an archive consisting of kernel headers | 4 | # This script generates an archive consisting of kernel headers |
| 5 | # for CONFIG_IKHEADERS_PROC. | 5 | # for CONFIG_IKHEADERS. |
| 6 | set -e | 6 | set -e |
| 7 | spath="$(dirname "$(readlink -f "$0")")" | 7 | sfile="$(readlink -f "$0")" |
| 8 | kroot="$spath/.." | ||
| 9 | outdir="$(pwd)" | 8 | outdir="$(pwd)" |
| 10 | tarfile=$1 | 9 | tarfile=$1 |
| 11 | cpio_dir=$outdir/$tarfile.tmp | 10 | cpio_dir=$outdir/$tarfile.tmp |
| 12 | 11 | ||
| 13 | # Script filename relative to the kernel source root | 12 | dir_list=" |
| 14 | # We add it to the archive because it is small and any changes | ||
| 15 | # to this script will also cause a rebuild of the archive. | ||
| 16 | sfile="$(realpath --relative-to $kroot "$(readlink -f "$0")")" | ||
| 17 | |||
| 18 | src_file_list=" | ||
| 19 | include/ | ||
| 20 | arch/$SRCARCH/include/ | ||
| 21 | $sfile | ||
| 22 | " | ||
| 23 | |||
| 24 | obj_file_list=" | ||
| 25 | include/ | 13 | include/ |
| 26 | arch/$SRCARCH/include/ | 14 | arch/$SRCARCH/include/ |
| 27 | " | 15 | " |
| @@ -31,28 +19,31 @@ arch/$SRCARCH/include/ | |||
| 31 | 19 | ||
| 32 | # This block is useful for debugging the incremental builds. | 20 | # This block is useful for debugging the incremental builds. |
| 33 | # Uncomment it for debugging. | 21 | # Uncomment it for debugging. |
| 34 | # iter=1 | 22 | # if [ ! -f /tmp/iter ]; then iter=1; echo 1 > /tmp/iter; |
| 35 | # if [ ! -f /tmp/iter ]; then echo 1 > /tmp/iter; | 23 | # else iter=$(($(cat /tmp/iter) + 1)); echo $iter > /tmp/iter; fi |
| 36 | # else; iter=$(($(cat /tmp/iter) + 1)); fi | 24 | # find $src_file_list -name "*.h" | xargs ls -l > /tmp/src-ls-$iter |
| 37 | # find $src_file_list -type f | xargs ls -lR > /tmp/src-ls-$iter | 25 | # find $obj_file_list -name "*.h" | xargs ls -l > /tmp/obj-ls-$iter |
| 38 | # find $obj_file_list -type f | xargs ls -lR > /tmp/obj-ls-$iter | ||
| 39 | 26 | ||
| 40 | # include/generated/compile.h is ignored because it is touched even when none | 27 | # include/generated/compile.h is ignored because it is touched even when none |
| 41 | # of the source files changed. This causes pointless regeneration, so let us | 28 | # of the source files changed. This causes pointless regeneration, so let us |
| 42 | # ignore them for md5 calculation. | 29 | # ignore them for md5 calculation. |
| 43 | pushd $kroot > /dev/null | 30 | pushd $srctree > /dev/null |
| 44 | src_files_md5="$(find $src_file_list -type f | | 31 | src_files_md5="$(find $dir_list -name "*.h" | |
| 45 | grep -v "include/generated/compile.h" | | 32 | grep -v "include/generated/compile.h" | |
| 46 | xargs ls -lR | md5sum | cut -d ' ' -f1)" | 33 | grep -v "include/generated/autoconf.h" | |
| 34 | xargs ls -l | md5sum | cut -d ' ' -f1)" | ||
| 47 | popd > /dev/null | 35 | popd > /dev/null |
| 48 | obj_files_md5="$(find $obj_file_list -type f | | 36 | obj_files_md5="$(find $dir_list -name "*.h" | |
| 49 | grep -v "include/generated/compile.h" | | 37 | grep -v "include/generated/compile.h" | |
| 50 | xargs ls -lR | md5sum | cut -d ' ' -f1)" | 38 | grep -v "include/generated/autoconf.h" | |
| 51 | 39 | xargs ls -l | md5sum | cut -d ' ' -f1)" | |
| 40 | # Any changes to this script will also cause a rebuild of the archive. | ||
| 41 | this_file_md5="$(ls -l $sfile | md5sum | cut -d ' ' -f1)" | ||
| 52 | if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi | 42 | if [ -f $tarfile ]; then tarfile_md5="$(md5sum $tarfile | cut -d ' ' -f1)"; fi |
| 53 | if [ -f kernel/kheaders.md5 ] && | 43 | if [ -f kernel/kheaders.md5 ] && |
| 54 | [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && | 44 | [ "$(cat kernel/kheaders.md5|head -1)" == "$src_files_md5" ] && |
| 55 | [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && | 45 | [ "$(cat kernel/kheaders.md5|head -2|tail -1)" == "$obj_files_md5" ] && |
| 46 | [ "$(cat kernel/kheaders.md5|head -3|tail -1)" == "$this_file_md5" ] && | ||
| 56 | [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then | 47 | [ "$(cat kernel/kheaders.md5|tail -1)" == "$tarfile_md5" ]; then |
| 57 | exit | 48 | exit |
| 58 | fi | 49 | fi |
| @@ -64,16 +55,16 @@ fi | |||
| 64 | rm -rf $cpio_dir | 55 | rm -rf $cpio_dir |
| 65 | mkdir $cpio_dir | 56 | mkdir $cpio_dir |
| 66 | 57 | ||
| 67 | pushd $kroot > /dev/null | 58 | pushd $srctree > /dev/null |
| 68 | for f in $src_file_list; | 59 | for f in $dir_list; |
| 69 | do find "$f" ! -name "*.cmd" ! -name ".*"; | 60 | do find "$f" -name "*.h"; |
| 70 | done | cpio --quiet -pd $cpio_dir | 61 | done | cpio --quiet -pd $cpio_dir |
| 71 | popd > /dev/null | 62 | popd > /dev/null |
| 72 | 63 | ||
| 73 | # The second CPIO can complain if files already exist which can | 64 | # The second CPIO can complain if files already exist which can |
| 74 | # happen with out of tree builds. Just silence CPIO for now. | 65 | # happen with out of tree builds. Just silence CPIO for now. |
| 75 | for f in $obj_file_list; | 66 | for f in $dir_list; |
| 76 | do find "$f" ! -name "*.cmd" ! -name ".*"; | 67 | do find "$f" -name "*.h"; |
| 77 | done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 | 68 | done | cpio --quiet -pd $cpio_dir >/dev/null 2>&1 |
| 78 | 69 | ||
| 79 | # Remove comments except SPDX lines | 70 | # Remove comments except SPDX lines |
| @@ -82,8 +73,9 @@ find $cpio_dir -type f -print0 | | |||
| 82 | 73 | ||
| 83 | tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null | 74 | tar -Jcf $tarfile -C $cpio_dir/ . > /dev/null |
| 84 | 75 | ||
| 85 | echo "$src_files_md5" > kernel/kheaders.md5 | 76 | echo "$src_files_md5" > kernel/kheaders.md5 |
| 86 | echo "$obj_files_md5" >> kernel/kheaders.md5 | 77 | echo "$obj_files_md5" >> kernel/kheaders.md5 |
| 78 | echo "$this_file_md5" >> kernel/kheaders.md5 | ||
| 87 | echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 | 79 | echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5 |
| 88 | 80 | ||
| 89 | rm -rf $cpio_dir | 81 | rm -rf $cpio_dir |
diff --git a/kernel/hung_task.c b/kernel/hung_task.c index f108a95882c6..14a625c16cb3 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Detect Hung Task | 3 | * Detect Hung Task |
| 3 | * | 4 | * |
diff --git a/kernel/iomem.c b/kernel/iomem.c index 93c264444510..62c92e43aa0d 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c | |||
| @@ -121,7 +121,7 @@ EXPORT_SYMBOL(memremap); | |||
| 121 | 121 | ||
| 122 | void memunmap(void *addr) | 122 | void memunmap(void *addr) |
| 123 | { | 123 | { |
| 124 | if (is_vmalloc_addr(addr)) | 124 | if (is_ioremap_addr(addr)) |
| 125 | iounmap((void __iomem *) addr); | 125 | iounmap((void __iomem *) addr); |
| 126 | } | 126 | } |
| 127 | EXPORT_SYMBOL(memunmap); | 127 | EXPORT_SYMBOL(memunmap); |
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 5f3e2baefca9..f92d9a687372 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | menu "IRQ subsystem" | 2 | menu "IRQ subsystem" |
| 2 | # Options selectable by the architecture code | 3 | # Options selectable by the architecture code |
| 3 | 4 | ||
| @@ -91,6 +92,9 @@ config GENERIC_MSI_IRQ_DOMAIN | |||
| 91 | select IRQ_DOMAIN_HIERARCHY | 92 | select IRQ_DOMAIN_HIERARCHY |
| 92 | select GENERIC_MSI_IRQ | 93 | select GENERIC_MSI_IRQ |
| 93 | 94 | ||
| 95 | config IRQ_MSI_IOMMU | ||
| 96 | bool | ||
| 97 | |||
| 94 | config HANDLE_DOMAIN_IRQ | 98 | config HANDLE_DOMAIN_IRQ |
| 95 | bool | 99 | bool |
| 96 | 100 | ||
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index ff6e352e3a6c..b4f53717d143 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile | |||
| @@ -2,6 +2,9 @@ | |||
| 2 | 2 | ||
| 3 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o | 3 | obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o |
| 4 | obj-$(CONFIG_IRQ_TIMINGS) += timings.o | 4 | obj-$(CONFIG_IRQ_TIMINGS) += timings.o |
| 5 | ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y) | ||
| 6 | CFLAGS_timings.o += -DDEBUG | ||
| 7 | endif | ||
| 5 | obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o | 8 | obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o |
| 6 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o | 9 | obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o |
| 7 | obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o | 10 | obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o |
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index f18cd5aa33e8..4352b08ae48d 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c | |||
| @@ -94,8 +94,7 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, | |||
| 94 | return nodes; | 94 | return nodes; |
| 95 | } | 95 | } |
| 96 | 96 | ||
| 97 | static int __irq_build_affinity_masks(const struct irq_affinity *affd, | 97 | static int __irq_build_affinity_masks(unsigned int startvec, |
| 98 | unsigned int startvec, | ||
| 99 | unsigned int numvecs, | 98 | unsigned int numvecs, |
| 100 | unsigned int firstvec, | 99 | unsigned int firstvec, |
| 101 | cpumask_var_t *node_to_cpumask, | 100 | cpumask_var_t *node_to_cpumask, |
| @@ -171,8 +170,7 @@ static int __irq_build_affinity_masks(const struct irq_affinity *affd, | |||
| 171 | * 1) spread present CPU on these vectors | 170 | * 1) spread present CPU on these vectors |
| 172 | * 2) spread other possible CPUs on these vectors | 171 | * 2) spread other possible CPUs on these vectors |
| 173 | */ | 172 | */ |
| 174 | static int irq_build_affinity_masks(const struct irq_affinity *affd, | 173 | static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs, |
| 175 | unsigned int startvec, unsigned int numvecs, | ||
| 176 | unsigned int firstvec, | 174 | unsigned int firstvec, |
| 177 | struct irq_affinity_desc *masks) | 175 | struct irq_affinity_desc *masks) |
| 178 | { | 176 | { |
| @@ -197,7 +195,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, | |||
| 197 | build_node_to_cpumask(node_to_cpumask); | 195 | build_node_to_cpumask(node_to_cpumask); |
| 198 | 196 | ||
| 199 | /* Spread on present CPUs starting from affd->pre_vectors */ | 197 | /* Spread on present CPUs starting from affd->pre_vectors */ |
| 200 | nr_present = __irq_build_affinity_masks(affd, curvec, numvecs, | 198 | nr_present = __irq_build_affinity_masks(curvec, numvecs, |
| 201 | firstvec, node_to_cpumask, | 199 | firstvec, node_to_cpumask, |
| 202 | cpu_present_mask, nmsk, masks); | 200 | cpu_present_mask, nmsk, masks); |
| 203 | 201 | ||
| @@ -212,7 +210,7 @@ static int irq_build_affinity_masks(const struct irq_affinity *affd, | |||
| 212 | else | 210 | else |
| 213 | curvec = firstvec + nr_present; | 211 | curvec = firstvec + nr_present; |
| 214 | cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); | 212 | cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); |
| 215 | nr_others = __irq_build_affinity_masks(affd, curvec, numvecs, | 213 | nr_others = __irq_build_affinity_masks(curvec, numvecs, |
| 216 | firstvec, node_to_cpumask, | 214 | firstvec, node_to_cpumask, |
| 217 | npresmsk, nmsk, masks); | 215 | npresmsk, nmsk, masks); |
| 218 | put_online_cpus(); | 216 | put_online_cpus(); |
| @@ -295,7 +293,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd) | |||
| 295 | unsigned int this_vecs = affd->set_size[i]; | 293 | unsigned int this_vecs = affd->set_size[i]; |
| 296 | int ret; | 294 | int ret; |
| 297 | 295 | ||
| 298 | ret = irq_build_affinity_masks(affd, curvec, this_vecs, | 296 | ret = irq_build_affinity_masks(curvec, this_vecs, |
| 299 | curvec, masks); | 297 | curvec, masks); |
| 300 | if (ret) { | 298 | if (ret) { |
| 301 | kfree(masks); | 299 | kfree(masks); |
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 16cbf6beb276..ae60cae24e9a 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c | |||
| @@ -90,7 +90,7 @@ unsigned long probe_irq_on(void) | |||
| 90 | /* It triggered already - consider it spurious. */ | 90 | /* It triggered already - consider it spurious. */ |
| 91 | if (!(desc->istate & IRQS_WAITING)) { | 91 | if (!(desc->istate & IRQS_WAITING)) { |
| 92 | desc->istate &= ~IRQS_AUTODETECT; | 92 | desc->istate &= ~IRQS_AUTODETECT; |
| 93 | irq_shutdown(desc); | 93 | irq_shutdown_and_deactivate(desc); |
| 94 | } else | 94 | } else |
| 95 | if (i < 32) | 95 | if (i < 32) |
| 96 | mask |= 1 << i; | 96 | mask |= 1 << i; |
| @@ -127,7 +127,7 @@ unsigned int probe_irq_mask(unsigned long val) | |||
| 127 | mask |= 1 << i; | 127 | mask |= 1 << i; |
| 128 | 128 | ||
| 129 | desc->istate &= ~IRQS_AUTODETECT; | 129 | desc->istate &= ~IRQS_AUTODETECT; |
| 130 | irq_shutdown(desc); | 130 | irq_shutdown_and_deactivate(desc); |
| 131 | } | 131 | } |
| 132 | raw_spin_unlock_irq(&desc->lock); | 132 | raw_spin_unlock_irq(&desc->lock); |
| 133 | } | 133 | } |
| @@ -169,7 +169,7 @@ int probe_irq_off(unsigned long val) | |||
| 169 | nr_of_irqs++; | 169 | nr_of_irqs++; |
| 170 | } | 170 | } |
| 171 | desc->istate &= ~IRQS_AUTODETECT; | 171 | desc->istate &= ~IRQS_AUTODETECT; |
| 172 | irq_shutdown(desc); | 172 | irq_shutdown_and_deactivate(desc); |
| 173 | } | 173 | } |
| 174 | raw_spin_unlock_irq(&desc->lock); | 174 | raw_spin_unlock_irq(&desc->lock); |
| 175 | } | 175 | } |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 51128bea3846..b76703b2c0af 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -314,6 +314,12 @@ void irq_shutdown(struct irq_desc *desc) | |||
| 314 | } | 314 | } |
| 315 | irq_state_clr_started(desc); | 315 | irq_state_clr_started(desc); |
| 316 | } | 316 | } |
| 317 | } | ||
| 318 | |||
| 319 | |||
| 320 | void irq_shutdown_and_deactivate(struct irq_desc *desc) | ||
| 321 | { | ||
| 322 | irq_shutdown(desc); | ||
| 317 | /* | 323 | /* |
| 318 | * This must be called even if the interrupt was never started up, | 324 | * This must be called even if the interrupt was never started up, |
| 319 | * because the activation can happen before the interrupt is | 325 | * because the activation can happen before the interrupt is |
| @@ -748,6 +754,8 @@ void handle_fasteoi_nmi(struct irq_desc *desc) | |||
| 748 | unsigned int irq = irq_desc_get_irq(desc); | 754 | unsigned int irq = irq_desc_get_irq(desc); |
| 749 | irqreturn_t res; | 755 | irqreturn_t res; |
| 750 | 756 | ||
| 757 | __kstat_incr_irqs_this_cpu(desc); | ||
| 758 | |||
| 751 | trace_irq_handler_entry(irq, action); | 759 | trace_irq_handler_entry(irq, action); |
| 752 | /* | 760 | /* |
| 753 | * NMIs cannot be shared, there is only one action. | 761 | * NMIs cannot be shared, there is only one action. |
| @@ -962,6 +970,8 @@ void handle_percpu_devid_fasteoi_nmi(struct irq_desc *desc) | |||
| 962 | unsigned int irq = irq_desc_get_irq(desc); | 970 | unsigned int irq = irq_desc_get_irq(desc); |
| 963 | irqreturn_t res; | 971 | irqreturn_t res; |
| 964 | 972 | ||
| 973 | __kstat_incr_irqs_this_cpu(desc); | ||
| 974 | |||
| 965 | trace_irq_handler_entry(irq, action); | 975 | trace_irq_handler_entry(irq, action); |
| 966 | res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); | 976 | res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); |
| 967 | trace_irq_handler_exit(irq, action, res); | 977 | trace_irq_handler_exit(irq, action, res); |
| @@ -1459,6 +1469,33 @@ int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) | |||
| 1459 | return -ENOSYS; | 1469 | return -ENOSYS; |
| 1460 | } | 1470 | } |
| 1461 | EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); | 1471 | EXPORT_SYMBOL_GPL(irq_chip_set_wake_parent); |
| 1472 | |||
| 1473 | /** | ||
| 1474 | * irq_chip_request_resources_parent - Request resources on the parent interrupt | ||
| 1475 | * @data: Pointer to interrupt specific data | ||
| 1476 | */ | ||
| 1477 | int irq_chip_request_resources_parent(struct irq_data *data) | ||
| 1478 | { | ||
| 1479 | data = data->parent_data; | ||
| 1480 | |||
| 1481 | if (data->chip->irq_request_resources) | ||
| 1482 | return data->chip->irq_request_resources(data); | ||
| 1483 | |||
| 1484 | return -ENOSYS; | ||
| 1485 | } | ||
| 1486 | EXPORT_SYMBOL_GPL(irq_chip_request_resources_parent); | ||
| 1487 | |||
| 1488 | /** | ||
| 1489 | * irq_chip_release_resources_parent - Release resources on the parent interrupt | ||
| 1490 | * @data: Pointer to interrupt specific data | ||
| 1491 | */ | ||
| 1492 | void irq_chip_release_resources_parent(struct irq_data *data) | ||
| 1493 | { | ||
| 1494 | data = data->parent_data; | ||
| 1495 | if (data->chip->irq_release_resources) | ||
| 1496 | data->chip->irq_release_resources(data); | ||
| 1497 | } | ||
| 1498 | EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent); | ||
| 1462 | #endif | 1499 | #endif |
| 1463 | 1500 | ||
| 1464 | /** | 1501 | /** |
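The two new helpers simply forward the resource callbacks to the parent irq_data, in the same spirit as irq_chip_set_wake_parent() above. A hedged sketch of how a driver for a stacked irqdomain would wire them up; the chip below is hypothetical:

/* Hypothetical child chip in a hierarchical (stacked) irqdomain. */
static struct irq_chip example_child_chip = {
	.name			= "example-child",
	.irq_set_wake		= irq_chip_set_wake_parent,
	.irq_request_resources	= irq_chip_request_resources_parent,
	.irq_release_resources	= irq_chip_release_resources_parent,
};

The core only invokes these callbacks on the outermost irq_data, so a parent chip's resource hooks are never reached unless the child forwards them like this.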
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c index 5b1072e394b2..6c7ca2e983a5 100644 --- a/kernel/irq/cpuhotplug.c +++ b/kernel/irq/cpuhotplug.c | |||
| @@ -116,7 +116,7 @@ static bool migrate_one_irq(struct irq_desc *desc) | |||
| 116 | */ | 116 | */ |
| 117 | if (irqd_affinity_is_managed(d)) { | 117 | if (irqd_affinity_is_managed(d)) { |
| 118 | irqd_set_managed_shutdown(d); | 118 | irqd_set_managed_shutdown(d); |
| 119 | irq_shutdown(desc); | 119 | irq_shutdown_and_deactivate(desc); |
| 120 | return false; | 120 | return false; |
| 121 | } | 121 | } |
| 122 | affinity = cpu_online_mask; | 122 | affinity = cpu_online_mask; |
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 70c3053bc1f6..3924fbe829d4 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h | |||
| @@ -82,6 +82,7 @@ extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); | |||
| 82 | extern int irq_startup(struct irq_desc *desc, bool resend, bool force); | 82 | extern int irq_startup(struct irq_desc *desc, bool resend, bool force); |
| 83 | 83 | ||
| 84 | extern void irq_shutdown(struct irq_desc *desc); | 84 | extern void irq_shutdown(struct irq_desc *desc); |
| 85 | extern void irq_shutdown_and_deactivate(struct irq_desc *desc); | ||
| 85 | extern void irq_enable(struct irq_desc *desc); | 86 | extern void irq_enable(struct irq_desc *desc); |
| 86 | extern void irq_disable(struct irq_desc *desc); | 87 | extern void irq_disable(struct irq_desc *desc); |
| 87 | extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); | 88 | extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu); |
| @@ -96,6 +97,10 @@ static inline void irq_mark_irq(unsigned int irq) { } | |||
| 96 | extern void irq_mark_irq(unsigned int irq); | 97 | extern void irq_mark_irq(unsigned int irq); |
| 97 | #endif | 98 | #endif |
| 98 | 99 | ||
| 100 | extern int __irq_get_irqchip_state(struct irq_data *data, | ||
| 101 | enum irqchip_irq_state which, | ||
| 102 | bool *state); | ||
| 103 | |||
| 99 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); | 104 | extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); |
| 100 | 105 | ||
| 101 | irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); | 106 | irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags); |
| @@ -354,6 +359,16 @@ static inline int irq_timing_decode(u64 value, u64 *timestamp) | |||
| 354 | return value & U16_MAX; | 359 | return value & U16_MAX; |
| 355 | } | 360 | } |
| 356 | 361 | ||
| 362 | static __always_inline void irq_timings_push(u64 ts, int irq) | ||
| 363 | { | ||
| 364 | struct irq_timings *timings = this_cpu_ptr(&irq_timings); | ||
| 365 | |||
| 366 | timings->values[timings->count & IRQ_TIMINGS_MASK] = | ||
| 367 | irq_timing_encode(ts, irq); | ||
| 368 | |||
| 369 | timings->count++; | ||
| 370 | } | ||
| 371 | |||
| 357 | /* | 372 | /* |
| 358 | * The function record_irq_time is only called in one place in the | 373 | * The function record_irq_time is only called in one place in the |
| 359 | * interrupts handler. We want this function always inline so the code | 374 | * interrupts handler. We want this function always inline so the code |
| @@ -367,15 +382,8 @@ static __always_inline void record_irq_time(struct irq_desc *desc) | |||
| 367 | if (!static_branch_likely(&irq_timing_enabled)) | 382 | if (!static_branch_likely(&irq_timing_enabled)) |
| 368 | return; | 383 | return; |
| 369 | 384 | ||
| 370 | if (desc->istate & IRQS_TIMINGS) { | 385 | if (desc->istate & IRQS_TIMINGS) |
| 371 | struct irq_timings *timings = this_cpu_ptr(&irq_timings); | 386 | irq_timings_push(local_clock(), irq_desc_get_irq(desc)); |
| 372 | |||
| 373 | timings->values[timings->count & IRQ_TIMINGS_MASK] = | ||
| 374 | irq_timing_encode(local_clock(), | ||
| 375 | irq_desc_get_irq(desc)); | ||
| 376 | |||
| 377 | timings->count++; | ||
| 378 | } | ||
| 379 | } | 387 | } |
| 380 | #else | 388 | #else |
| 381 | static inline void irq_remove_timings(struct irq_desc *desc) {} | 389 | static inline void irq_remove_timings(struct irq_desc *desc) {} |
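The irq_timings_push() helper added above factors the hot-path store out of record_irq_time(): each sample is packed into a single u64 and written into a per-CPU circular buffer whose free-running count is masked on access. Below is a minimal user-space sketch of that encode/push/decode round trip; the 48/16-bit packing is an assumption inferred from the "value & U16_MAX" decode shown in irq_timing_decode(), and the buffer size is illustrative rather than the kernel's IRQ_TIMINGS_SIZE.

#include <stdint.h>
#include <stdio.h>

#define TIMINGS_SIZE 32                      /* illustrative size, power of two */
#define TIMINGS_MASK (TIMINGS_SIZE - 1)

struct timings_sketch {
        uint64_t values[TIMINGS_SIZE];       /* circular buffer of encoded samples */
        unsigned int count;                  /* free-running write index */
};

/* Pack a timestamp and an irq number into one u64 (assumed 48/16 split). */
static uint64_t timing_encode(uint64_t ts, unsigned int irq)
{
        return (ts << 16) | (irq & 0xffff);
}

static unsigned int timing_decode(uint64_t value, uint64_t *ts)
{
        *ts = value >> 16;
        return value & 0xffff;
}

/* Mirrors irq_timings_push(): store one sample, let the index wrap via the mask. */
static void timings_push(struct timings_sketch *t, uint64_t ts, unsigned int irq)
{
        t->values[t->count & TIMINGS_MASK] = timing_encode(ts, irq);
        t->count++;
}

int main(void)
{
        struct timings_sketch t = { { 0 }, 0 };
        uint64_t ts;
        unsigned int irq;

        timings_push(&t, 123456789ULL, 42);
        irq = timing_decode(t.values[0], &ts);
        printf("irq=%u ts=%llu\n", irq, (unsigned long long)ts);
        return 0;
}

Compiled standalone, it prints the irq number and timestamp recovered from the first slot.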
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index c52b737ab8e3..9484e88dabc2 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c | |||
| @@ -680,6 +680,8 @@ int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq, | |||
| 680 | * @hwirq: The HW irq number to convert to a logical one | 680 | * @hwirq: The HW irq number to convert to a logical one |
| 681 | * @regs: Register file coming from the low-level handling code | 681 | * @regs: Register file coming from the low-level handling code |
| 682 | * | 682 | * |
| 683 | * This function must be called from an NMI context. | ||
| 684 | * | ||
| 683 | * Returns: 0 on success, or -EINVAL if conversion has failed | 685 | * Returns: 0 on success, or -EINVAL if conversion has failed |
| 684 | */ | 686 | */ |
| 685 | int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, | 687 | int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, |
| @@ -689,7 +691,10 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, | |||
| 689 | unsigned int irq; | 691 | unsigned int irq; |
| 690 | int ret = 0; | 692 | int ret = 0; |
| 691 | 693 | ||
| 692 | nmi_enter(); | 694 | /* |
| 695 | * NMI context needs to be setup earlier in order to deal with tracing. | ||
| 696 | */ | ||
| 697 | WARN_ON(!in_nmi()); | ||
| 693 | 698 | ||
| 694 | irq = irq_find_mapping(domain, hwirq); | 699 | irq = irq_find_mapping(domain, hwirq); |
| 695 | 700 | ||
| @@ -702,7 +707,6 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, | |||
| 702 | else | 707 | else |
| 703 | ret = -EINVAL; | 708 | ret = -EINVAL; |
| 704 | 709 | ||
| 705 | nmi_exit(); | ||
| 706 | set_irq_regs(old_regs); | 710 | set_irq_regs(old_regs); |
| 707 | return ret; | 711 | return ret; |
| 708 | } | 712 | } |
| @@ -946,6 +950,11 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) | |||
| 946 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; | 950 | *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; |
| 947 | } | 951 | } |
| 948 | 952 | ||
| 953 | static bool irq_is_nmi(struct irq_desc *desc) | ||
| 954 | { | ||
| 955 | return desc->istate & IRQS_NMI; | ||
| 956 | } | ||
| 957 | |||
| 949 | /** | 958 | /** |
| 950 | * kstat_irqs - Get the statistics for an interrupt | 959 | * kstat_irqs - Get the statistics for an interrupt |
| 951 | * @irq: The interrupt number | 960 | * @irq: The interrupt number |
| @@ -963,7 +972,8 @@ unsigned int kstat_irqs(unsigned int irq) | |||
| 963 | if (!desc || !desc->kstat_irqs) | 972 | if (!desc || !desc->kstat_irqs) |
| 964 | return 0; | 973 | return 0; |
| 965 | if (!irq_settings_is_per_cpu_devid(desc) && | 974 | if (!irq_settings_is_per_cpu_devid(desc) && |
| 966 | !irq_settings_is_per_cpu(desc)) | 975 | !irq_settings_is_per_cpu(desc) && |
| 976 | !irq_is_nmi(desc)) | ||
| 967 | return desc->tot_count; | 977 | return desc->tot_count; |
| 968 | 978 | ||
| 969 | for_each_possible_cpu(cpu) | 979 | for_each_possible_cpu(cpu) |
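Dropping nmi_enter()/nmi_exit() from handle_domain_nmi() moves ownership of the NMI context to the architecture entry code, which is exactly what the new WARN_ON(!in_nmi()) checks. The sketch below shows the calling convention this implies; the function name and arguments are placeholders for illustration, not a real architecture's NMI handler.

#include <linux/hardirq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>
#include <linux/printk.h>

/* Hypothetical low-level NMI entry: set up NMI context before calling
 * into the generic layer so tracing sees in_nmi(), tear it down after. */
static void example_arch_nmi_entry(struct irq_domain *domain,
                                   unsigned int hwirq, struct pt_regs *regs)
{
        nmi_enter();
        if (handle_domain_nmi(domain, hwirq, regs))
                pr_warn_ratelimited("unmapped NMI, hwirq %u\n", hwirq);
        nmi_exit();
}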
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9ed29e4a7dbf..3078d0e48bba 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c | |||
| @@ -123,7 +123,7 @@ EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); | |||
| 123 | * @ops: domain callbacks | 123 | * @ops: domain callbacks |
| 124 | * @host_data: Controller private data pointer | 124 | * @host_data: Controller private data pointer |
| 125 | * | 125 | * |
| 126 | * Allocates and initialize and irq_domain structure. | 126 | * Allocates and initializes an irq_domain structure. |
| 127 | * Returns pointer to IRQ domain, or NULL on failure. | 127 | * Returns pointer to IRQ domain, or NULL on failure. |
| 128 | */ | 128 | */ |
| 129 | struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, | 129 | struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, |
| @@ -139,7 +139,7 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size, | |||
| 139 | 139 | ||
| 140 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), | 140 | domain = kzalloc_node(sizeof(*domain) + (sizeof(unsigned int) * size), |
| 141 | GFP_KERNEL, of_node_to_nid(of_node)); | 141 | GFP_KERNEL, of_node_to_nid(of_node)); |
| 142 | if (WARN_ON(!domain)) | 142 | if (!domain) |
| 143 | return NULL; | 143 | return NULL; |
| 144 | 144 | ||
| 145 | if (fwnode && is_fwnode_irqchip(fwnode)) { | 145 | if (fwnode && is_fwnode_irqchip(fwnode)) { |
| @@ -1297,7 +1297,7 @@ int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, | |||
| 1297 | /** | 1297 | /** |
| 1298 | * __irq_domain_alloc_irqs - Allocate IRQs from domain | 1298 | * __irq_domain_alloc_irqs - Allocate IRQs from domain |
| 1299 | * @domain: domain to allocate from | 1299 | * @domain: domain to allocate from |
| 1300 | * @irq_base: allocate specified IRQ nubmer if irq_base >= 0 | 1300 | * @irq_base: allocate specified IRQ number if irq_base >= 0 |
| 1301 | * @nr_irqs: number of IRQs to allocate | 1301 | * @nr_irqs: number of IRQs to allocate |
| 1302 | * @node: NUMA node id for memory allocation | 1302 | * @node: NUMA node id for memory allocation |
| 1303 | * @arg: domain specific argument | 1303 | * @arg: domain specific argument |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 78f3ddeb7fe4..e8f7f179bf77 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
| @@ -13,6 +13,7 @@ | |||
| 13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
| 14 | #include <linux/random.h> | 14 | #include <linux/random.h> |
| 15 | #include <linux/interrupt.h> | 15 | #include <linux/interrupt.h> |
| 16 | #include <linux/irqdomain.h> | ||
| 16 | #include <linux/slab.h> | 17 | #include <linux/slab.h> |
| 17 | #include <linux/sched.h> | 18 | #include <linux/sched.h> |
| 18 | #include <linux/sched/rt.h> | 19 | #include <linux/sched/rt.h> |
| @@ -34,8 +35,9 @@ static int __init setup_forced_irqthreads(char *arg) | |||
| 34 | early_param("threadirqs", setup_forced_irqthreads); | 35 | early_param("threadirqs", setup_forced_irqthreads); |
| 35 | #endif | 36 | #endif |
| 36 | 37 | ||
| 37 | static void __synchronize_hardirq(struct irq_desc *desc) | 38 | static void __synchronize_hardirq(struct irq_desc *desc, bool sync_chip) |
| 38 | { | 39 | { |
| 40 | struct irq_data *irqd = irq_desc_get_irq_data(desc); | ||
| 39 | bool inprogress; | 41 | bool inprogress; |
| 40 | 42 | ||
| 41 | do { | 43 | do { |
| @@ -51,6 +53,20 @@ static void __synchronize_hardirq(struct irq_desc *desc) | |||
| 51 | /* Ok, that indicated we're done: double-check carefully. */ | 53 | /* Ok, that indicated we're done: double-check carefully. */ |
| 52 | raw_spin_lock_irqsave(&desc->lock, flags); | 54 | raw_spin_lock_irqsave(&desc->lock, flags); |
| 53 | inprogress = irqd_irq_inprogress(&desc->irq_data); | 55 | inprogress = irqd_irq_inprogress(&desc->irq_data); |
| 56 | |||
| 57 | /* | ||
| 58 | * If requested and supported, check at the chip whether it | ||
| 59 | * is in flight at the hardware level, i.e. already pending | ||
| 60 | * in a CPU and waiting for service and acknowledge. | ||
| 61 | */ | ||
| 62 | if (!inprogress && sync_chip) { | ||
| 63 | /* | ||
| 64 | * Ignore the return code. inprogress is only updated | ||
| 65 | * when the chip supports it. | ||
| 66 | */ | ||
| 67 | __irq_get_irqchip_state(irqd, IRQCHIP_STATE_ACTIVE, | ||
| 68 | &inprogress); | ||
| 69 | } | ||
| 54 | raw_spin_unlock_irqrestore(&desc->lock, flags); | 70 | raw_spin_unlock_irqrestore(&desc->lock, flags); |
| 55 | 71 | ||
| 56 | /* Oops, that failed? */ | 72 | /* Oops, that failed? */ |
| @@ -73,13 +89,18 @@ static void __synchronize_hardirq(struct irq_desc *desc) | |||
| 73 | * Returns: false if a threaded handler is active. | 89 | * Returns: false if a threaded handler is active. |
| 74 | * | 90 | * |
| 75 | * This function may be called - with care - from IRQ context. | 91 | * This function may be called - with care - from IRQ context. |
| 92 | * | ||
| 93 | * It does not check whether there is an interrupt in flight at the | ||
| 94 | * hardware level, but not serviced yet, as this might deadlock when | ||
| 95 | * called with interrupts disabled and the target CPU of the interrupt | ||
| 96 | * is the current CPU. | ||
| 76 | */ | 97 | */ |
| 77 | bool synchronize_hardirq(unsigned int irq) | 98 | bool synchronize_hardirq(unsigned int irq) |
| 78 | { | 99 | { |
| 79 | struct irq_desc *desc = irq_to_desc(irq); | 100 | struct irq_desc *desc = irq_to_desc(irq); |
| 80 | 101 | ||
| 81 | if (desc) { | 102 | if (desc) { |
| 82 | __synchronize_hardirq(desc); | 103 | __synchronize_hardirq(desc, false); |
| 83 | return !atomic_read(&desc->threads_active); | 104 | return !atomic_read(&desc->threads_active); |
| 84 | } | 105 | } |
| 85 | 106 | ||
| @@ -95,14 +116,19 @@ EXPORT_SYMBOL(synchronize_hardirq); | |||
| 95 | * to complete before returning. If you use this function while | 116 | * to complete before returning. If you use this function while |
| 96 | * holding a resource the IRQ handler may need you will deadlock. | 117 | * holding a resource the IRQ handler may need you will deadlock. |
| 97 | * | 118 | * |
| 98 | * This function may be called - with care - from IRQ context. | 119 | * Can only be called from preemptible code as it might sleep when |
| 120 | * an interrupt thread is associated to @irq. | ||
| 121 | * | ||
| 122 | * It optionally makes sure (when the irq chip supports that method) | ||
| 123 | * that the interrupt is not pending in any CPU and waiting for | ||
| 124 | * service. | ||
| 99 | */ | 125 | */ |
| 100 | void synchronize_irq(unsigned int irq) | 126 | void synchronize_irq(unsigned int irq) |
| 101 | { | 127 | { |
| 102 | struct irq_desc *desc = irq_to_desc(irq); | 128 | struct irq_desc *desc = irq_to_desc(irq); |
| 103 | 129 | ||
| 104 | if (desc) { | 130 | if (desc) { |
| 105 | __synchronize_hardirq(desc); | 131 | __synchronize_hardirq(desc, true); |
| 106 | /* | 132 | /* |
| 107 | * We made sure that no hardirq handler is | 133 | * We made sure that no hardirq handler is |
| 108 | * running. Now verify that no threaded handlers are | 134 | * running. Now verify that no threaded handlers are |
| @@ -1699,6 +1725,7 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) | |||
| 1699 | /* If this was the last handler, shut down the IRQ line: */ | 1725 | /* If this was the last handler, shut down the IRQ line: */ |
| 1700 | if (!desc->action) { | 1726 | if (!desc->action) { |
| 1701 | irq_settings_clr_disable_unlazy(desc); | 1727 | irq_settings_clr_disable_unlazy(desc); |
| 1728 | /* Only shutdown. Deactivate after synchronize_hardirq() */ | ||
| 1702 | irq_shutdown(desc); | 1729 | irq_shutdown(desc); |
| 1703 | } | 1730 | } |
| 1704 | 1731 | ||
| @@ -1727,8 +1754,12 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) | |||
| 1727 | 1754 | ||
| 1728 | unregister_handler_proc(irq, action); | 1755 | unregister_handler_proc(irq, action); |
| 1729 | 1756 | ||
| 1730 | /* Make sure it's not being used on another CPU: */ | 1757 | /* |
| 1731 | synchronize_hardirq(irq); | 1758 | * Make sure it's not being used on another CPU and if the chip |
| 1759 | * supports it also make sure that there is no (not yet serviced) | ||
| 1760 | * interrupt in flight at the hardware level. | ||
| 1761 | */ | ||
| 1762 | __synchronize_hardirq(desc, true); | ||
| 1732 | 1763 | ||
| 1733 | #ifdef CONFIG_DEBUG_SHIRQ | 1764 | #ifdef CONFIG_DEBUG_SHIRQ |
| 1734 | /* | 1765 | /* |
| @@ -1768,6 +1799,14 @@ static struct irqaction *__free_irq(struct irq_desc *desc, void *dev_id) | |||
| 1768 | * require it to deallocate resources over the slow bus. | 1799 | * require it to deallocate resources over the slow bus. |
| 1769 | */ | 1800 | */ |
| 1770 | chip_bus_lock(desc); | 1801 | chip_bus_lock(desc); |
| 1802 | /* | ||
| 1803 | * There is no interrupt on the fly anymore. Deactivate it | ||
| 1804 | * completely. | ||
| 1805 | */ | ||
| 1806 | raw_spin_lock_irqsave(&desc->lock, flags); | ||
| 1807 | irq_domain_deactivate_irq(&desc->irq_data); | ||
| 1808 | raw_spin_unlock_irqrestore(&desc->lock, flags); | ||
| 1809 | |||
| 1771 | irq_release_resources(desc); | 1810 | irq_release_resources(desc); |
| 1772 | chip_bus_sync_unlock(desc); | 1811 | chip_bus_sync_unlock(desc); |
| 1773 | irq_remove_timings(desc); | 1812 | irq_remove_timings(desc); |
| @@ -1855,7 +1894,7 @@ static const void *__cleanup_nmi(unsigned int irq, struct irq_desc *desc) | |||
| 1855 | } | 1894 | } |
| 1856 | 1895 | ||
| 1857 | irq_settings_clr_disable_unlazy(desc); | 1896 | irq_settings_clr_disable_unlazy(desc); |
| 1858 | irq_shutdown(desc); | 1897 | irq_shutdown_and_deactivate(desc); |
| 1859 | 1898 | ||
| 1860 | irq_release_resources(desc); | 1899 | irq_release_resources(desc); |
| 1861 | 1900 | ||
| @@ -2578,6 +2617,28 @@ out: | |||
| 2578 | irq_put_desc_unlock(desc, flags); | 2617 | irq_put_desc_unlock(desc, flags); |
| 2579 | } | 2618 | } |
| 2580 | 2619 | ||
| 2620 | int __irq_get_irqchip_state(struct irq_data *data, enum irqchip_irq_state which, | ||
| 2621 | bool *state) | ||
| 2622 | { | ||
| 2623 | struct irq_chip *chip; | ||
| 2624 | int err = -EINVAL; | ||
| 2625 | |||
| 2626 | do { | ||
| 2627 | chip = irq_data_get_irq_chip(data); | ||
| 2628 | if (chip->irq_get_irqchip_state) | ||
| 2629 | break; | ||
| 2630 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 2631 | data = data->parent_data; | ||
| 2632 | #else | ||
| 2633 | data = NULL; | ||
| 2634 | #endif | ||
| 2635 | } while (data); | ||
| 2636 | |||
| 2637 | if (data) | ||
| 2638 | err = chip->irq_get_irqchip_state(data, which, state); | ||
| 2639 | return err; | ||
| 2640 | } | ||
| 2641 | |||
| 2581 | /** | 2642 | /** |
| 2582 | * irq_get_irqchip_state - returns the irqchip state of a interrupt. | 2643 | * irq_get_irqchip_state - returns the irqchip state of a interrupt. |
| 2583 | * @irq: Interrupt line that is forwarded to a VM | 2644 | * @irq: Interrupt line that is forwarded to a VM |
| @@ -2596,7 +2657,6 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, | |||
| 2596 | { | 2657 | { |
| 2597 | struct irq_desc *desc; | 2658 | struct irq_desc *desc; |
| 2598 | struct irq_data *data; | 2659 | struct irq_data *data; |
| 2599 | struct irq_chip *chip; | ||
| 2600 | unsigned long flags; | 2660 | unsigned long flags; |
| 2601 | int err = -EINVAL; | 2661 | int err = -EINVAL; |
| 2602 | 2662 | ||
| @@ -2606,19 +2666,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which, | |||
| 2606 | 2666 | ||
| 2607 | data = irq_desc_get_irq_data(desc); | 2667 | data = irq_desc_get_irq_data(desc); |
| 2608 | 2668 | ||
| 2609 | do { | 2669 | err = __irq_get_irqchip_state(data, which, state); |
| 2610 | chip = irq_data_get_irq_chip(data); | ||
| 2611 | if (chip->irq_get_irqchip_state) | ||
| 2612 | break; | ||
| 2613 | #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY | ||
| 2614 | data = data->parent_data; | ||
| 2615 | #else | ||
| 2616 | data = NULL; | ||
| 2617 | #endif | ||
| 2618 | } while (data); | ||
| 2619 | |||
| 2620 | if (data) | ||
| 2621 | err = chip->irq_get_irqchip_state(data, which, state); | ||
| 2622 | 2670 | ||
| 2623 | irq_put_desc_busunlock(desc, flags); | 2671 | irq_put_desc_busunlock(desc, flags); |
| 2624 | return err; | 2672 | return err; |
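The updated kernel-doc draws a clear line between the two synchronization helpers: synchronize_irq() may sleep and, through the new __irq_get_irqchip_state() walk, can also wait for an interrupt that is already pending in the chip (IRQCHIP_STATE_ACTIVE), while synchronize_hardirq() deliberately skips that check so it remains usable with interrupts disabled. The minimal driver sketch below illustrates the split; example_dev and example_hw_mask_irq() are invented for the example, and only the two synchronize calls are real kernel API.

#include <linux/interrupt.h>

struct example_dev {
        int irq;
};

static void example_hw_mask_irq(struct example_dev *dev)
{
        /* device-specific: stop the hardware from raising new interrupts */
}

/* Teardown path: may sleep, so the stronger variant is fine. */
static void example_teardown(struct example_dev *dev)
{
        example_hw_mask_irq(dev);
        /* Waits for handlers and, when the irqchip implements
         * irq_get_irqchip_state(), for an interrupt already pending
         * in hardware but not yet serviced. */
        synchronize_irq(dev->irq);
}

/* Atomic path: only the weaker variant is safe here, since polling the
 * hardware-pending state could deadlock if the pending interrupt
 * targets the current CPU. */
static void example_quiesce_atomic(struct example_dev *dev)
{
        synchronize_hardirq(dev->irq);
}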
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c index 90c735da15d0..e960d7ce7bcc 100644 --- a/kernel/irq/timings.c +++ b/kernel/irq/timings.c | |||
| @@ -1,10 +1,12 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
| 2 | // Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> | 2 | // Copyright (C) 2016, Linaro Ltd - Daniel Lezcano <daniel.lezcano@linaro.org> |
| 3 | #define pr_fmt(fmt) "irq_timings: " fmt | ||
| 3 | 4 | ||
| 4 | #include <linux/kernel.h> | 5 | #include <linux/kernel.h> |
| 5 | #include <linux/percpu.h> | 6 | #include <linux/percpu.h> |
| 6 | #include <linux/slab.h> | 7 | #include <linux/slab.h> |
| 7 | #include <linux/static_key.h> | 8 | #include <linux/static_key.h> |
| 9 | #include <linux/init.h> | ||
| 8 | #include <linux/interrupt.h> | 10 | #include <linux/interrupt.h> |
| 9 | #include <linux/idr.h> | 11 | #include <linux/idr.h> |
| 10 | #include <linux/irq.h> | 12 | #include <linux/irq.h> |
| @@ -261,12 +263,29 @@ void irq_timings_disable(void) | |||
| 261 | #define EMA_ALPHA_VAL 64 | 263 | #define EMA_ALPHA_VAL 64 |
| 262 | #define EMA_ALPHA_SHIFT 7 | 264 | #define EMA_ALPHA_SHIFT 7 |
| 263 | 265 | ||
| 264 | #define PREDICTION_PERIOD_MIN 2 | 266 | #define PREDICTION_PERIOD_MIN 3 |
| 265 | #define PREDICTION_PERIOD_MAX 5 | 267 | #define PREDICTION_PERIOD_MAX 5 |
| 266 | #define PREDICTION_FACTOR 4 | 268 | #define PREDICTION_FACTOR 4 |
| 267 | #define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ | 269 | #define PREDICTION_MAX 10 /* 2 ^ PREDICTION_MAX useconds */ |
| 268 | #define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ | 270 | #define PREDICTION_BUFFER_SIZE 16 /* slots for EMAs, hardly more than 16 */ |
| 269 | 271 | ||
| 272 | /* | ||
| 273 | * Number of elements in the circular buffer: If it happens it was | ||
| 274 | * flushed before, then the number of elements could be smaller than | ||
| 275 | * IRQ_TIMINGS_SIZE, so the count is used, otherwise the array size is | ||
| 276 | * used as we wrapped. The index begins from zero when we did not | ||
| 277 | * wrap. That could be done in a nicer way with the proper circular | ||
| 278 | * array structure type but with the cost of extra computation in the | ||
| 279 | * interrupt handler hot path. We choose efficiency. | ||
| 280 | */ | ||
| 281 | #define for_each_irqts(i, irqts) \ | ||
| 282 | for (i = irqts->count < IRQ_TIMINGS_SIZE ? \ | ||
| 283 | 0 : irqts->count & IRQ_TIMINGS_MASK, \ | ||
| 284 | irqts->count = min(IRQ_TIMINGS_SIZE, \ | ||
| 285 | irqts->count); \ | ||
| 286 | irqts->count > 0; irqts->count--, \ | ||
| 287 | i = (i + 1) & IRQ_TIMINGS_MASK) | ||
| 288 | |||
| 270 | struct irqt_stat { | 289 | struct irqt_stat { |
| 271 | u64 last_ts; | 290 | u64 last_ts; |
| 272 | u64 ema_time[PREDICTION_BUFFER_SIZE]; | 291 | u64 ema_time[PREDICTION_BUFFER_SIZE]; |
| @@ -297,7 +316,16 @@ static u64 irq_timings_ema_new(u64 value, u64 ema_old) | |||
| 297 | 316 | ||
| 298 | static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) | 317 | static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) |
| 299 | { | 318 | { |
| 300 | int i; | 319 | int period; |
| 320 | |||
| 321 | /* | ||
| 322 | * Move the beginning pointer to the end minus the max period x 3. | ||
| 323 | * We are at the point we can begin searching the pattern | ||
| 324 | */ | ||
| 325 | buffer = &buffer[len - (period_max * 3)]; | ||
| 326 | |||
| 327 | /* Adjust the length to the maximum allowed period x 3 */ | ||
| 328 | len = period_max * 3; | ||
| 301 | 329 | ||
| 302 | /* | 330 | /* |
| 303 | * The buffer contains the suite of intervals, in a ilog2 | 331 | * The buffer contains the suite of intervals, in a ilog2 |
| @@ -306,21 +334,45 @@ static int irq_timings_next_event_index(int *buffer, size_t len, int period_max) | |||
| 306 | * period beginning at the end of the buffer. We do that for | 334 | * period beginning at the end of the buffer. We do that for |
| 307 | * each suffix. | 335 | * each suffix. |
| 308 | */ | 336 | */ |
| 309 | for (i = period_max; i >= PREDICTION_PERIOD_MIN ; i--) { | 337 | for (period = period_max; period >= PREDICTION_PERIOD_MIN; period--) { |
| 310 | 338 | ||
| 311 | int *begin = &buffer[len - (i * 3)]; | 339 | /* |
| 312 | int *ptr = begin; | 340 | * The first comparison always succeeds because the |
| 341 | * suffix is deduced from the first n-period bytes of | ||
| 342 | * the buffer and we compare the initial suffix with | ||
| 343 | * itself, so we can skip the first iteration. | ||
| 344 | */ | ||
| 345 | int idx = period; | ||
| 346 | size_t size = period; | ||
| 313 | 347 | ||
| 314 | /* | 348 | /* |
| 315 | * We look if the suite with period 'i' repeats | 349 | * We look if the suite with period 'i' repeats |
| 316 | * itself. If it is truncated at the end, as it | 350 | * itself. If it is truncated at the end, as it |
| 317 | * repeats we can use the period to find out the next | 351 | * repeats we can use the period to find out the next |
| 318 | * element. | 352 | * element with the modulo. |
| 319 | */ | 353 | */ |
| 320 | while (!memcmp(ptr, begin, i * sizeof(*ptr))) { | 354 | while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) { |
| 321 | ptr += i; | 355 | |
| 322 | if (ptr >= &buffer[len]) | 356 | /* |
| 323 | return begin[((i * 3) % i)]; | 357 | * Move the index on a period basis |
| 358 | */ | ||
| 359 | idx += size; | ||
| 360 | |||
| 361 | /* | ||
| 362 | * If this condition is reached, all previous | ||
| 363 | * memcmp were successful, so the period is | ||
| 364 | * found. | ||
| 365 | */ | ||
| 366 | if (idx == len) | ||
| 367 | return buffer[len % period]; | ||
| 368 | |||
| 369 | /* | ||
| 370 | * If the remaining elements to compare are | ||
| 371 | * smaller than the period, readjust the size | ||
| 372 | * of the comparison for the last iteration. | ||
| 373 | */ | ||
| 374 | if (len - idx < period) | ||
| 375 | size = len - idx; | ||
| 324 | } | 376 | } |
| 325 | } | 377 | } |
| 326 | 378 | ||
| @@ -380,11 +432,43 @@ static u64 __irq_timings_next_event(struct irqt_stat *irqs, int irq, u64 now) | |||
| 380 | return irqs->last_ts + irqs->ema_time[index]; | 432 | return irqs->last_ts + irqs->ema_time[index]; |
| 381 | } | 433 | } |
| 382 | 434 | ||
| 435 | static __always_inline int irq_timings_interval_index(u64 interval) | ||
| 436 | { | ||
| 437 | /* | ||
| 438 | * The PREDICTION_FACTOR increases the interval size for the | ||
| 439 | * array of exponential average. | ||
| 440 | */ | ||
| 441 | u64 interval_us = (interval >> 10) / PREDICTION_FACTOR; | ||
| 442 | |||
| 443 | return likely(interval_us) ? ilog2(interval_us) : 0; | ||
| 444 | } | ||
| 445 | |||
| 446 | static __always_inline void __irq_timings_store(int irq, struct irqt_stat *irqs, | ||
| 447 | u64 interval) | ||
| 448 | { | ||
| 449 | int index; | ||
| 450 | |||
| 451 | /* | ||
| 452 | * Get the index in the ema table for this interrupt. | ||
| 453 | */ | ||
| 454 | index = irq_timings_interval_index(interval); | ||
| 455 | |||
| 456 | /* | ||
| 457 | * Store the index as an element of the pattern in another | ||
| 458 | * circular array. | ||
| 459 | */ | ||
| 460 | irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; | ||
| 461 | |||
| 462 | irqs->ema_time[index] = irq_timings_ema_new(interval, | ||
| 463 | irqs->ema_time[index]); | ||
| 464 | |||
| 465 | irqs->count++; | ||
| 466 | } | ||
| 467 | |||
| 383 | static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) | 468 | static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) |
| 384 | { | 469 | { |
| 385 | u64 old_ts = irqs->last_ts; | 470 | u64 old_ts = irqs->last_ts; |
| 386 | u64 interval; | 471 | u64 interval; |
| 387 | int index; | ||
| 388 | 472 | ||
| 389 | /* | 473 | /* |
| 390 | * The timestamps are absolute time values, we need to compute | 474 | * The timestamps are absolute time values, we need to compute |
| @@ -415,24 +499,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts) | |||
| 415 | return; | 499 | return; |
| 416 | } | 500 | } |
| 417 | 501 | ||
| 418 | /* | 502 | __irq_timings_store(irq, irqs, interval); |
| 419 | * Get the index in the ema table for this interrupt. The | ||
| 420 | * PREDICTION_FACTOR increase the interval size for the array | ||
| 421 | * of exponential average. | ||
| 422 | */ | ||
| 423 | index = likely(interval) ? | ||
| 424 | ilog2((interval >> 10) / PREDICTION_FACTOR) : 0; | ||
| 425 | |||
| 426 | /* | ||
| 427 | * Store the index as an element of the pattern in another | ||
| 428 | * circular array. | ||
| 429 | */ | ||
| 430 | irqs->circ_timings[irqs->count & IRQ_TIMINGS_MASK] = index; | ||
| 431 | |||
| 432 | irqs->ema_time[index] = irq_timings_ema_new(interval, | ||
| 433 | irqs->ema_time[index]); | ||
| 434 | |||
| 435 | irqs->count++; | ||
| 436 | } | 503 | } |
| 437 | 504 | ||
| 438 | /** | 505 | /** |
| @@ -493,11 +560,7 @@ u64 irq_timings_next_event(u64 now) | |||
| 493 | * model while decrementing the counter because we consume the | 560 | * model while decrementing the counter because we consume the |
| 494 | * data from our circular buffer. | 561 | * data from our circular buffer. |
| 495 | */ | 562 | */ |
| 496 | 563 | for_each_irqts(i, irqts) { | |
| 497 | i = (irqts->count & IRQ_TIMINGS_MASK) - 1; | ||
| 498 | irqts->count = min(IRQ_TIMINGS_SIZE, irqts->count); | ||
| 499 | |||
| 500 | for (; irqts->count > 0; irqts->count--, i = (i + 1) & IRQ_TIMINGS_MASK) { | ||
| 501 | irq = irq_timing_decode(irqts->values[i], &ts); | 564 | irq = irq_timing_decode(irqts->values[i], &ts); |
| 502 | s = idr_find(&irqt_stats, irq); | 565 | s = idr_find(&irqt_stats, irq); |
| 503 | if (s) | 566 | if (s) |
| @@ -564,3 +627,325 @@ int irq_timings_alloc(int irq) | |||
| 564 | 627 | ||
| 565 | return 0; | 628 | return 0; |
| 566 | } | 629 | } |
| 630 | |||
| 631 | #ifdef CONFIG_TEST_IRQ_TIMINGS | ||
| 632 | struct timings_intervals { | ||
| 633 | u64 *intervals; | ||
| 634 | size_t count; | ||
| 635 | }; | ||
| 636 | |||
| 637 | /* | ||
| 638 | * Intervals are given in nanosecond base | ||
| 639 | */ | ||
| 640 | static u64 intervals0[] __initdata = { | ||
| 641 | 10000, 50000, 200000, 500000, | ||
| 642 | 10000, 50000, 200000, 500000, | ||
| 643 | 10000, 50000, 200000, 500000, | ||
| 644 | 10000, 50000, 200000, 500000, | ||
| 645 | 10000, 50000, 200000, 500000, | ||
| 646 | 10000, 50000, 200000, 500000, | ||
| 647 | 10000, 50000, 200000, 500000, | ||
| 648 | 10000, 50000, 200000, 500000, | ||
| 649 | 10000, 50000, 200000, | ||
| 650 | }; | ||
| 651 | |||
| 652 | static u64 intervals1[] __initdata = { | ||
| 653 | 223947000, 1240000, 1384000, 1386000, 1386000, | ||
| 654 | 217416000, 1236000, 1384000, 1386000, 1387000, | ||
| 655 | 214719000, 1241000, 1386000, 1387000, 1384000, | ||
| 656 | 213696000, 1234000, 1384000, 1386000, 1388000, | ||
| 657 | 219904000, 1240000, 1385000, 1389000, 1385000, | ||
| 658 | 212240000, 1240000, 1386000, 1386000, 1386000, | ||
| 659 | 214415000, 1236000, 1384000, 1386000, 1387000, | ||
| 660 | 214276000, 1234000, | ||
| 661 | }; | ||
| 662 | |||
| 663 | static u64 intervals2[] __initdata = { | ||
| 664 | 4000, 3000, 5000, 100000, | ||
| 665 | 3000, 3000, 5000, 117000, | ||
| 666 | 4000, 4000, 5000, 112000, | ||
| 667 | 4000, 3000, 4000, 110000, | ||
| 668 | 3000, 5000, 3000, 117000, | ||
| 669 | 4000, 4000, 5000, 112000, | ||
| 670 | 4000, 3000, 4000, 110000, | ||
| 671 | 3000, 4000, 5000, 112000, | ||
| 672 | 4000, | ||
| 673 | }; | ||
| 674 | |||
| 675 | static u64 intervals3[] __initdata = { | ||
| 676 | 1385000, 212240000, 1240000, | ||
| 677 | 1386000, 214415000, 1236000, | ||
| 678 | 1384000, 214276000, 1234000, | ||
| 679 | 1386000, 214415000, 1236000, | ||
| 680 | 1385000, 212240000, 1240000, | ||
| 681 | 1386000, 214415000, 1236000, | ||
| 682 | 1384000, 214276000, 1234000, | ||
| 683 | 1386000, 214415000, 1236000, | ||
| 684 | 1385000, 212240000, 1240000, | ||
| 685 | }; | ||
| 686 | |||
| 687 | static u64 intervals4[] __initdata = { | ||
| 688 | 10000, 50000, 10000, 50000, | ||
| 689 | 10000, 50000, 10000, 50000, | ||
| 690 | 10000, 50000, 10000, 50000, | ||
| 691 | 10000, 50000, 10000, 50000, | ||
| 692 | 10000, 50000, 10000, 50000, | ||
| 693 | 10000, 50000, 10000, 50000, | ||
| 694 | 10000, 50000, 10000, 50000, | ||
| 695 | 10000, 50000, 10000, 50000, | ||
| 696 | 10000, | ||
| 697 | }; | ||
| 698 | |||
| 699 | static struct timings_intervals tis[] __initdata = { | ||
| 700 | { intervals0, ARRAY_SIZE(intervals0) }, | ||
| 701 | { intervals1, ARRAY_SIZE(intervals1) }, | ||
| 702 | { intervals2, ARRAY_SIZE(intervals2) }, | ||
| 703 | { intervals3, ARRAY_SIZE(intervals3) }, | ||
| 704 | { intervals4, ARRAY_SIZE(intervals4) }, | ||
| 705 | }; | ||
| 706 | |||
| 707 | static int __init irq_timings_test_next_index(struct timings_intervals *ti) | ||
| 708 | { | ||
| 709 | int _buffer[IRQ_TIMINGS_SIZE]; | ||
| 710 | int buffer[IRQ_TIMINGS_SIZE]; | ||
| 711 | int index, start, i, count, period_max; | ||
| 712 | |||
| 713 | count = ti->count - 1; | ||
| 714 | |||
| 715 | period_max = count > (3 * PREDICTION_PERIOD_MAX) ? | ||
| 716 | PREDICTION_PERIOD_MAX : count / 3; | ||
| 717 | |||
| 718 | /* | ||
| 719 | * Inject all values except the last one which will be used | ||
| 720 | * to compare with the next index result. | ||
| 721 | */ | ||
| 722 | pr_debug("index suite: "); | ||
| 723 | |||
| 724 | for (i = 0; i < count; i++) { | ||
| 725 | index = irq_timings_interval_index(ti->intervals[i]); | ||
| 726 | _buffer[i & IRQ_TIMINGS_MASK] = index; | ||
| 727 | pr_cont("%d ", index); | ||
| 728 | } | ||
| 729 | |||
| 730 | start = count < IRQ_TIMINGS_SIZE ? 0 : | ||
| 731 | count & IRQ_TIMINGS_MASK; | ||
| 732 | |||
| 733 | count = min_t(int, count, IRQ_TIMINGS_SIZE); | ||
| 734 | |||
| 735 | for (i = 0; i < count; i++) { | ||
| 736 | int index = (start + i) & IRQ_TIMINGS_MASK; | ||
| 737 | buffer[i] = _buffer[index]; | ||
| 738 | } | ||
| 739 | |||
| 740 | index = irq_timings_next_event_index(buffer, count, period_max); | ||
| 741 | i = irq_timings_interval_index(ti->intervals[ti->count - 1]); | ||
| 742 | |||
| 743 | if (index != i) { | ||
| 744 | pr_err("Expected (%d) and computed (%d) next indexes differ\n", | ||
| 745 | i, index); | ||
| 746 | return -EINVAL; | ||
| 747 | } | ||
| 748 | |||
| 749 | return 0; | ||
| 750 | } | ||
| 751 | |||
| 752 | static int __init irq_timings_next_index_selftest(void) | ||
| 753 | { | ||
| 754 | int i, ret; | ||
| 755 | |||
| 756 | for (i = 0; i < ARRAY_SIZE(tis); i++) { | ||
| 757 | |||
| 758 | pr_info("---> Injecting intervals number #%d (count=%zd)\n", | ||
| 759 | i, tis[i].count); | ||
| 760 | |||
| 761 | ret = irq_timings_test_next_index(&tis[i]); | ||
| 762 | if (ret) | ||
| 763 | break; | ||
| 764 | } | ||
| 765 | |||
| 766 | return ret; | ||
| 767 | } | ||
| 768 | |||
| 769 | static int __init irq_timings_test_irqs(struct timings_intervals *ti) | ||
| 770 | { | ||
| 771 | struct irqt_stat __percpu *s; | ||
| 772 | struct irqt_stat *irqs; | ||
| 773 | int i, index, ret, irq = 0xACE5; | ||
| 774 | |||
| 775 | ret = irq_timings_alloc(irq); | ||
| 776 | if (ret) { | ||
| 777 | pr_err("Failed to allocate irq timings\n"); | ||
| 778 | return ret; | ||
| 779 | } | ||
| 780 | |||
| 781 | s = idr_find(&irqt_stats, irq); | ||
| 782 | if (!s) { | ||
| 783 | ret = -EIDRM; | ||
| 784 | goto out; | ||
| 785 | } | ||
| 786 | |||
| 787 | irqs = this_cpu_ptr(s); | ||
| 788 | |||
| 789 | for (i = 0; i < ti->count; i++) { | ||
| 790 | |||
| 791 | index = irq_timings_interval_index(ti->intervals[i]); | ||
| 792 | pr_debug("%d: interval=%llu ema_index=%d\n", | ||
| 793 | i, ti->intervals[i], index); | ||
| 794 | |||
| 795 | __irq_timings_store(irq, irqs, ti->intervals[i]); | ||
| 796 | if (irqs->circ_timings[i & IRQ_TIMINGS_MASK] != index) { | ||
| 797 | pr_err("Failed to store in the circular buffer\n"); | ||
| 798 | goto out; | ||
| 799 | } | ||
| 800 | } | ||
| 801 | |||
| 802 | if (irqs->count != ti->count) { | ||
| 803 | pr_err("Count differs\n"); | ||
| 804 | goto out; | ||
| 805 | } | ||
| 806 | |||
| 807 | ret = 0; | ||
| 808 | out: | ||
| 809 | irq_timings_free(irq); | ||
| 810 | |||
| 811 | return ret; | ||
| 812 | } | ||
| 813 | |||
| 814 | static int __init irq_timings_irqs_selftest(void) | ||
| 815 | { | ||
| 816 | int i, ret; | ||
| 817 | |||
| 818 | for (i = 0; i < ARRAY_SIZE(tis); i++) { | ||
| 819 | pr_info("---> Injecting intervals number #%d (count=%zd)\n", | ||
| 820 | i, tis[i].count); | ||
| 821 | ret = irq_timings_test_irqs(&tis[i]); | ||
| 822 | if (ret) | ||
| 823 | break; | ||
| 824 | } | ||
| 825 | |||
| 826 | return ret; | ||
| 827 | } | ||
| 828 | |||
| 829 | static int __init irq_timings_test_irqts(struct irq_timings *irqts, | ||
| 830 | unsigned count) | ||
| 831 | { | ||
| 832 | int start = count >= IRQ_TIMINGS_SIZE ? count - IRQ_TIMINGS_SIZE : 0; | ||
| 833 | int i, irq, oirq = 0xBEEF; | ||
| 834 | u64 ots = 0xDEAD, ts; | ||
| 835 | |||
| 836 | /* | ||
| 837 | * Fill the circular buffer by using the dedicated function. | ||
| 838 | */ | ||
| 839 | for (i = 0; i < count; i++) { | ||
| 840 | pr_debug("%d: index=%d, ts=%llX irq=%X\n", | ||
| 841 | i, i & IRQ_TIMINGS_MASK, ots + i, oirq + i); | ||
| 842 | |||
| 843 | irq_timings_push(ots + i, oirq + i); | ||
| 844 | } | ||
| 845 | |||
| 846 | /* | ||
| 847 | * Compute the first element's values, whether or not the index | ||
| 848 | * wrapped around. | ||
| 849 | */ | ||
| 850 | ots += start; | ||
| 851 | oirq += start; | ||
| 852 | |||
| 853 | /* | ||
| 854 | * Test the circular buffer count is correct. | ||
| 855 | */ | ||
| 856 | pr_debug("---> Checking timings array count (%d) is right\n", count); | ||
| 857 | if (WARN_ON(irqts->count != count)) | ||
| 858 | return -EINVAL; | ||
| 859 | |||
| 860 | /* | ||
| 861 | * Test the macro allowing to browse all the irqts. | ||
| 862 | */ | ||
| 863 | pr_debug("---> Checking the for_each_irqts() macro\n"); | ||
| 864 | for_each_irqts(i, irqts) { | ||
| 865 | |||
| 866 | irq = irq_timing_decode(irqts->values[i], &ts); | ||
| 867 | |||
| 868 | pr_debug("index=%d, ts=%llX / %llX, irq=%X / %X\n", | ||
| 869 | i, ts, ots, irq, oirq); | ||
| 870 | |||
| 871 | if (WARN_ON(ts != ots || irq != oirq)) | ||
| 872 | return -EINVAL; | ||
| 873 | |||
| 874 | ots++; oirq++; | ||
| 875 | } | ||
| 876 | |||
| 877 | /* | ||
| 878 | * The circular buffer should have been flushed when browsed | ||
| 879 | * with for_each_irqts | ||
| 880 | */ | ||
| 881 | pr_debug("---> Checking timings array is empty after browsing it\n"); | ||
| 882 | if (WARN_ON(irqts->count)) | ||
| 883 | return -EINVAL; | ||
| 884 | |||
| 885 | return 0; | ||
| 886 | } | ||
| 887 | |||
| 888 | static int __init irq_timings_irqts_selftest(void) | ||
| 889 | { | ||
| 890 | struct irq_timings *irqts = this_cpu_ptr(&irq_timings); | ||
| 891 | int i, ret; | ||
| 892 | |||
| 893 | /* | ||
| 894 | * Test the circular buffer with different number of | ||
| 895 | * elements. The purpose is to test at the limits (empty, half | ||
| 896 | * full, full, wrapped with the cursor at the boundaries, | ||
| 897 | * wrapped several times, etc.). | ||
| 898 | */ | ||
| 899 | int count[] = { 0, | ||
| 900 | IRQ_TIMINGS_SIZE >> 1, | ||
| 901 | IRQ_TIMINGS_SIZE, | ||
| 902 | IRQ_TIMINGS_SIZE + (IRQ_TIMINGS_SIZE >> 1), | ||
| 903 | 2 * IRQ_TIMINGS_SIZE, | ||
| 904 | (2 * IRQ_TIMINGS_SIZE) + 3, | ||
| 905 | }; | ||
| 906 | |||
| 907 | for (i = 0; i < ARRAY_SIZE(count); i++) { | ||
| 908 | |||
| 909 | pr_info("---> Checking the timings with %d/%d values\n", | ||
| 910 | count[i], IRQ_TIMINGS_SIZE); | ||
| 911 | |||
| 912 | ret = irq_timings_test_irqts(irqts, count[i]); | ||
| 913 | if (ret) | ||
| 914 | break; | ||
| 915 | } | ||
| 916 | |||
| 917 | return ret; | ||
| 918 | } | ||
| 919 | |||
| 920 | static int __init irq_timings_selftest(void) | ||
| 921 | { | ||
| 922 | int ret; | ||
| 923 | |||
| 924 | pr_info("------------------- selftest start -----------------\n"); | ||
| 925 | |||
| 926 | /* | ||
| 927 | * At this point, we don't expect any subsystem to use the irq | ||
| 928 | * timings but us, so it should not be enabled. | ||
| 929 | */ | ||
| 930 | if (static_branch_unlikely(&irq_timing_enabled)) { | ||
| 931 | pr_warn("irq timings already initialized, skipping selftest\n"); | ||
| 932 | return 0; | ||
| 933 | } | ||
| 934 | |||
| 935 | ret = irq_timings_irqts_selftest(); | ||
| 936 | if (ret) | ||
| 937 | goto out; | ||
| 938 | |||
| 939 | ret = irq_timings_irqs_selftest(); | ||
| 940 | if (ret) | ||
| 941 | goto out; | ||
| 942 | |||
| 943 | ret = irq_timings_next_index_selftest(); | ||
| 944 | out: | ||
| 945 | pr_info("---------- selftest end with %s -----------\n", | ||
| 946 | ret ? "failure" : "success"); | ||
| 947 | |||
| 948 | return ret; | ||
| 949 | } | ||
| 950 | early_initcall(irq_timings_selftest); | ||
| 951 | #endif | ||
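The rewritten irq_timings_next_event_index() above limits the search to the last period_max * 3 samples and, for each candidate period, compares the leading window against every following one; when the matches reach the end of the buffer, the value at offset len % period is the predicted next sample. The following user-space transcription of that loop is for illustration only; the constants and names are simplified, and the caller is assumed to pass at least period_max * 3 samples.

#include <stdio.h>
#include <string.h>

#define PERIOD_MIN 3
#define PERIOD_MAX 5

static int next_event_index(int *buffer, size_t len, int period_max)
{
        int period;

        /* Only the last period_max * 3 samples are examined. */
        buffer = &buffer[len - (period_max * 3)];
        len = period_max * 3;

        for (period = period_max; period >= PERIOD_MIN; period--) {
                size_t idx = period;
                size_t size = period;

                /* Compare the leading 'period' samples against each
                 * following window; if that holds up to the end of the
                 * buffer, the pattern repeats and the next value is the
                 * one at offset len % period. */
                while (!memcmp(buffer, &buffer[idx], size * sizeof(int))) {
                        idx += size;
                        if (idx == len)
                                return buffer[len % period];
                        if (len - idx < (size_t)period)
                                size = len - idx;
                }
        }

        return -1;      /* no repeating pattern found */
}

int main(void)
{
        /* A period-4 pattern (2 3 4 10), truncated after the "4". */
        int buffer[] = { 2, 3, 4, 10, 2, 3, 4, 10, 2, 3, 4, 10, 2, 3, 4 };

        printf("predicted next index: %d\n",
               next_event_index(buffer, sizeof(buffer) / sizeof(buffer[0]),
                                PERIOD_MAX));
        return 0;
}

On this period-4 input it prints 10, the interval class that would follow the truncated pattern.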
diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 73288914ed5e..d42acaf81886 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra | 3 | * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra |
| 3 | * | 4 | * |
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index de6efdecc70d..df3008419a1d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * jump label support | 3 | * jump label support |
| 3 | * | 4 | * |
| @@ -36,12 +37,26 @@ static int jump_label_cmp(const void *a, const void *b) | |||
| 36 | const struct jump_entry *jea = a; | 37 | const struct jump_entry *jea = a; |
| 37 | const struct jump_entry *jeb = b; | 38 | const struct jump_entry *jeb = b; |
| 38 | 39 | ||
| 40 | /* | ||
| 41 | * Entries are sorted by key. | ||
| 42 | */ | ||
| 39 | if (jump_entry_key(jea) < jump_entry_key(jeb)) | 43 | if (jump_entry_key(jea) < jump_entry_key(jeb)) |
| 40 | return -1; | 44 | return -1; |
| 41 | 45 | ||
| 42 | if (jump_entry_key(jea) > jump_entry_key(jeb)) | 46 | if (jump_entry_key(jea) > jump_entry_key(jeb)) |
| 43 | return 1; | 47 | return 1; |
| 44 | 48 | ||
| 49 | /* | ||
| 50 | * In the batching mode, entries should also be sorted by the code | ||
| 51 | * inside the already sorted list of entries, enabling a bsearch in | ||
| 52 | * the vector. | ||
| 53 | */ | ||
| 54 | if (jump_entry_code(jea) < jump_entry_code(jeb)) | ||
| 55 | return -1; | ||
| 56 | |||
| 57 | if (jump_entry_code(jea) > jump_entry_code(jeb)) | ||
| 58 | return 1; | ||
| 59 | |||
| 45 | return 0; | 60 | return 0; |
| 46 | } | 61 | } |
| 47 | 62 | ||
| @@ -383,25 +398,55 @@ static enum jump_label_type jump_label_type(struct jump_entry *entry) | |||
| 383 | return enabled ^ branch; | 398 | return enabled ^ branch; |
| 384 | } | 399 | } |
| 385 | 400 | ||
| 401 | static bool jump_label_can_update(struct jump_entry *entry, bool init) | ||
| 402 | { | ||
| 403 | /* | ||
| 404 | * Cannot update code that was in an init text area. | ||
| 405 | */ | ||
| 406 | if (!init && jump_entry_is_init(entry)) | ||
| 407 | return false; | ||
| 408 | |||
| 409 | if (!kernel_text_address(jump_entry_code(entry))) { | ||
| 410 | WARN_ONCE(1, "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); | ||
| 411 | return false; | ||
| 412 | } | ||
| 413 | |||
| 414 | return true; | ||
| 415 | } | ||
| 416 | |||
| 417 | #ifndef HAVE_JUMP_LABEL_BATCH | ||
| 386 | static void __jump_label_update(struct static_key *key, | 418 | static void __jump_label_update(struct static_key *key, |
| 387 | struct jump_entry *entry, | 419 | struct jump_entry *entry, |
| 388 | struct jump_entry *stop, | 420 | struct jump_entry *stop, |
| 389 | bool init) | 421 | bool init) |
| 390 | { | 422 | { |
| 391 | for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { | 423 | for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { |
| 392 | /* | 424 | if (jump_label_can_update(entry, init)) |
| 393 | * An entry->code of 0 indicates an entry which has been | 425 | arch_jump_label_transform(entry, jump_label_type(entry)); |
| 394 | * disabled because it was in an init text area. | 426 | } |
| 395 | */ | 427 | } |
| 396 | if (init || !jump_entry_is_init(entry)) { | 428 | #else |
| 397 | if (kernel_text_address(jump_entry_code(entry))) | 429 | static void __jump_label_update(struct static_key *key, |
| 398 | arch_jump_label_transform(entry, jump_label_type(entry)); | 430 | struct jump_entry *entry, |
| 399 | else | 431 | struct jump_entry *stop, |
| 400 | WARN_ONCE(1, "can't patch jump_label at %pS", | 432 | bool init) |
| 401 | (void *)jump_entry_code(entry)); | 433 | { |
| 434 | for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { | ||
| 435 | |||
| 436 | if (!jump_label_can_update(entry, init)) | ||
| 437 | continue; | ||
| 438 | |||
| 439 | if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) { | ||
| 440 | /* | ||
| 441 | * Queue is full: Apply the current queue and try again. | ||
| 442 | */ | ||
| 443 | arch_jump_label_transform_apply(); | ||
| 444 | BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry))); | ||
| 402 | } | 445 | } |
| 403 | } | 446 | } |
| 447 | arch_jump_label_transform_apply(); | ||
| 404 | } | 448 | } |
| 449 | #endif | ||
| 405 | 450 | ||
| 406 | void __init jump_label_init(void) | 451 | void __init jump_label_init(void) |
| 407 | { | 452 | { |
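With HAVE_JUMP_LABEL_BATCH, the core stops patching one site at a time: entries are queued through arch_jump_label_transform_queue() and flushed with arch_jump_label_transform_apply() whenever the queue reports itself full or the key's entries are exhausted. The skeleton below is a hedged guess at the minimal arch-side contract; the queue depth, the pending[] array and the fallback to the per-site transform are inventions of this sketch, and a real implementation would instead batch the expensive synchronization step inside apply().

#include <linux/jump_label.h>

#define BATCH_MAX 64                            /* illustrative queue depth */

static struct {
        struct jump_entry *entry;
        enum jump_label_type type;
} pending[BATCH_MAX];

static int nr_pending;

bool arch_jump_label_transform_queue(struct jump_entry *entry,
                                     enum jump_label_type type)
{
        if (nr_pending == BATCH_MAX)
                return false;                   /* core flushes with _apply() and retries */

        pending[nr_pending].entry = entry;
        pending[nr_pending].type = type;
        nr_pending++;
        return true;
}

void arch_jump_label_transform_apply(void)
{
        int i;

        /* Naive flush: reuse the per-site transform. A real batching
         * implementation would patch all queued sites and then issue a
         * single costly synchronization (e.g. one IPI round) instead of
         * one per site. */
        for (i = 0; i < nr_pending; i++)
                arch_jump_label_transform(pending[i].entry, pending[i].type);

        nr_pending = 0;
}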
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 14934afa9e68..95a260f9214b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. | 3 | * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. |
| 3 | * | 4 | * |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 68559808fdfa..1b018f1a6e0d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -1,9 +1,7 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kexec.c - kexec_load system call | 3 | * kexec.c - kexec_load system call |
| 3 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> | 4 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> |
| 4 | * | ||
| 5 | * This source code is licensed under the GNU General Public License, | ||
| 6 | * Version 2. See the file COPYING for more details. | ||
| 7 | */ | 5 | */ |
| 8 | 6 | ||
| 9 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 7 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index fd5c95ff9251..d5870723b8ad 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c | |||
| @@ -1,9 +1,7 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kexec.c - kexec system call core code. | 3 | * kexec.c - kexec system call core code. |
| 3 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> | 4 | * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> |
| 4 | * | ||
| 5 | * This source code is licensed under the GNU General Public License, | ||
| 6 | * Version 2. See the file COPYING for more details. | ||
| 7 | */ | 5 | */ |
| 8 | 6 | ||
| 9 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 7 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 072b6ee55e3f..b8cc032d5620 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c | |||
| @@ -1,12 +1,10 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kexec: kexec_file_load system call | 3 | * kexec: kexec_file_load system call |
| 3 | * | 4 | * |
| 4 | * Copyright (C) 2014 Red Hat Inc. | 5 | * Copyright (C) 2014 Red Hat Inc. |
| 5 | * Authors: | 6 | * Authors: |
| 6 | * Vivek Goyal <vgoyal@redhat.com> | 7 | * Vivek Goyal <vgoyal@redhat.com> |
| 7 | * | ||
| 8 | * This source code is licensed under the GNU General Public License, | ||
| 9 | * Version 2. See the file COPYING for more details. | ||
| 10 | */ | 8 | */ |
| 11 | 9 | ||
| 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 10 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| @@ -198,9 +196,6 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, | |||
| 198 | return ret; | 196 | return ret; |
| 199 | image->kernel_buf_len = size; | 197 | image->kernel_buf_len = size; |
| 200 | 198 | ||
| 201 | /* IMA needs to pass the measurement list to the next kernel. */ | ||
| 202 | ima_add_kexec_buffer(image); | ||
| 203 | |||
| 204 | /* Call arch image probe handlers */ | 199 | /* Call arch image probe handlers */ |
| 205 | ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, | 200 | ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, |
| 206 | image->kernel_buf_len); | 201 | image->kernel_buf_len); |
| @@ -241,8 +236,14 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, | |||
| 241 | ret = -EINVAL; | 236 | ret = -EINVAL; |
| 242 | goto out; | 237 | goto out; |
| 243 | } | 238 | } |
| 239 | |||
| 240 | ima_kexec_cmdline(image->cmdline_buf, | ||
| 241 | image->cmdline_buf_len - 1); | ||
| 244 | } | 242 | } |
| 245 | 243 | ||
| 244 | /* IMA needs to pass the measurement list to the next kernel. */ | ||
| 245 | ima_add_kexec_buffer(image); | ||
| 246 | |||
| 246 | /* Call arch image load handlers */ | 247 | /* Call arch image load handlers */ |
| 247 | ldata = arch_kexec_kernel_image_load(image); | 248 | ldata = arch_kexec_kernel_image_load(image); |
| 248 | 249 | ||
diff --git a/kernel/kheaders.c b/kernel/kheaders.c index 70ae6052920d..8f69772af77b 100644 --- a/kernel/kheaders.c +++ b/kernel/kheaders.c | |||
| @@ -8,9 +8,8 @@ | |||
| 8 | 8 | ||
| 9 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
| 10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
| 11 | #include <linux/proc_fs.h> | 11 | #include <linux/kobject.h> |
| 12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
| 13 | #include <linux/uaccess.h> | ||
| 14 | 13 | ||
| 15 | /* | 14 | /* |
| 16 | * Define kernel_headers_data and kernel_headers_data_end, within which the | 15 | * Define kernel_headers_data and kernel_headers_data_end, within which the |
| @@ -31,39 +30,32 @@ extern char kernel_headers_data; | |||
| 31 | extern char kernel_headers_data_end; | 30 | extern char kernel_headers_data_end; |
| 32 | 31 | ||
| 33 | static ssize_t | 32 | static ssize_t |
| 34 | ikheaders_read_current(struct file *file, char __user *buf, | 33 | ikheaders_read(struct file *file, struct kobject *kobj, |
| 35 | size_t len, loff_t *offset) | 34 | struct bin_attribute *bin_attr, |
| 35 | char *buf, loff_t off, size_t len) | ||
| 36 | { | 36 | { |
| 37 | return simple_read_from_buffer(buf, len, offset, | 37 | memcpy(buf, &kernel_headers_data + off, len); |
| 38 | &kernel_headers_data, | 38 | return len; |
| 39 | &kernel_headers_data_end - | ||
| 40 | &kernel_headers_data); | ||
| 41 | } | 39 | } |
| 42 | 40 | ||
| 43 | static const struct file_operations ikheaders_file_ops = { | 41 | static struct bin_attribute kheaders_attr __ro_after_init = { |
| 44 | .read = ikheaders_read_current, | 42 | .attr = { |
| 45 | .llseek = default_llseek, | 43 | .name = "kheaders.tar.xz", |
| 44 | .mode = 0444, | ||
| 45 | }, | ||
| 46 | .read = &ikheaders_read, | ||
| 46 | }; | 47 | }; |
| 47 | 48 | ||
| 48 | static int __init ikheaders_init(void) | 49 | static int __init ikheaders_init(void) |
| 49 | { | 50 | { |
| 50 | struct proc_dir_entry *entry; | 51 | kheaders_attr.size = (&kernel_headers_data_end - |
| 51 | 52 | &kernel_headers_data); | |
| 52 | /* create the current headers file */ | 53 | return sysfs_create_bin_file(kernel_kobj, &kheaders_attr); |
| 53 | entry = proc_create("kheaders.tar.xz", S_IRUGO, NULL, | ||
| 54 | &ikheaders_file_ops); | ||
| 55 | if (!entry) | ||
| 56 | return -ENOMEM; | ||
| 57 | |||
| 58 | proc_set_size(entry, | ||
| 59 | &kernel_headers_data_end - | ||
| 60 | &kernel_headers_data); | ||
| 61 | return 0; | ||
| 62 | } | 54 | } |
| 63 | 55 | ||
| 64 | static void __exit ikheaders_cleanup(void) | 56 | static void __exit ikheaders_cleanup(void) |
| 65 | { | 57 | { |
| 66 | remove_proc_entry("kheaders.tar.xz", NULL); | 58 | sysfs_remove_bin_file(kernel_kobj, &kheaders_attr); |
| 67 | } | 59 | } |
| 68 | 60 | ||
| 69 | module_init(ikheaders_init); | 61 | module_init(ikheaders_init); |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b1ea30a5540e..9f5433a52488 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -1,21 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * Kernel Probes (KProbes) | 3 | * Kernel Probes (KProbes) |
| 3 | * kernel/kprobes.c | 4 | * kernel/kprobes.c |
| 4 | * | 5 | * |
| 5 | * This program is free software; you can redistribute it and/or modify | ||
| 6 | * it under the terms of the GNU General Public License as published by | ||
| 7 | * the Free Software Foundation; either version 2 of the License, or | ||
| 8 | * (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 13 | * GNU General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License | ||
| 16 | * along with this program; if not, write to the Free Software | ||
| 17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 18 | * | ||
| 19 | * Copyright (C) IBM Corporation, 2002, 2004 | 6 | * Copyright (C) IBM Corporation, 2002, 2004 |
| 20 | * | 7 | * |
| 21 | * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel | 8 | * 2002-Oct Created by Vamsi Krishna S <vamsi_krishna@in.ibm.com> Kernel |
| @@ -2583,33 +2570,20 @@ static const struct file_operations fops_kp = { | |||
| 2583 | 2570 | ||
| 2584 | static int __init debugfs_kprobe_init(void) | 2571 | static int __init debugfs_kprobe_init(void) |
| 2585 | { | 2572 | { |
| 2586 | struct dentry *dir, *file; | 2573 | struct dentry *dir; |
| 2587 | unsigned int value = 1; | 2574 | unsigned int value = 1; |
| 2588 | 2575 | ||
| 2589 | dir = debugfs_create_dir("kprobes", NULL); | 2576 | dir = debugfs_create_dir("kprobes", NULL); |
| 2590 | if (!dir) | ||
| 2591 | return -ENOMEM; | ||
| 2592 | 2577 | ||
| 2593 | file = debugfs_create_file("list", 0400, dir, NULL, | 2578 | debugfs_create_file("list", 0400, dir, NULL, |
| 2594 | &debugfs_kprobes_operations); | 2579 | &debugfs_kprobes_operations); |
| 2595 | if (!file) | ||
| 2596 | goto error; | ||
| 2597 | 2580 | ||
| 2598 | file = debugfs_create_file("enabled", 0600, dir, | 2581 | debugfs_create_file("enabled", 0600, dir, &value, &fops_kp); |
| 2599 | &value, &fops_kp); | ||
| 2600 | if (!file) | ||
| 2601 | goto error; | ||
| 2602 | 2582 | ||
| 2603 | file = debugfs_create_file("blacklist", 0400, dir, NULL, | 2583 | debugfs_create_file("blacklist", 0400, dir, NULL, |
| 2604 | &debugfs_kprobe_blacklist_ops); | 2584 | &debugfs_kprobe_blacklist_ops); |
| 2605 | if (!file) | ||
| 2606 | goto error; | ||
| 2607 | 2585 | ||
| 2608 | return 0; | 2586 | return 0; |
| 2609 | |||
| 2610 | error: | ||
| 2611 | debugfs_remove(dir); | ||
| 2612 | return -ENOMEM; | ||
| 2613 | } | 2587 | } |
| 2614 | 2588 | ||
| 2615 | late_initcall(debugfs_kprobe_init); | 2589 | late_initcall(debugfs_kprobe_init); |
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 46ba853656f6..35859da8bd4f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
| @@ -1,11 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which | 3 | * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which |
| 3 | * are not related to any other subsystem | 4 | * are not related to any other subsystem |
| 4 | * | 5 | * |
| 5 | * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> | 6 | * Copyright (C) 2004 Kay Sievers <kay.sievers@vrfy.org> |
| 6 | * | ||
| 7 | * This file is release under the GPLv2 | ||
| 8 | * | ||
| 9 | */ | 7 | */ |
| 10 | 8 | ||
| 11 | #include <linux/kobject.h> | 9 | #include <linux/kobject.h> |
diff --git a/kernel/kthread.c b/kernel/kthread.c index 5942eeafb9ac..621467c33fef 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* Kernel thread helper functions. | 2 | /* Kernel thread helper functions. |
| 2 | * Copyright (C) 2004 IBM Corporation, Rusty Russell. | 3 | * Copyright (C) 2004 IBM Corporation, Rusty Russell. |
| 3 | * | 4 | * |
| @@ -11,6 +12,7 @@ | |||
| 11 | #include <linux/kthread.h> | 12 | #include <linux/kthread.h> |
| 12 | #include <linux/completion.h> | 13 | #include <linux/completion.h> |
| 13 | #include <linux/err.h> | 14 | #include <linux/err.h> |
| 15 | #include <linux/cgroup.h> | ||
| 14 | #include <linux/cpuset.h> | 16 | #include <linux/cpuset.h> |
| 15 | #include <linux/unistd.h> | 17 | #include <linux/unistd.h> |
| 16 | #include <linux/file.h> | 18 | #include <linux/file.h> |
diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 99a5b5f46dc5..e3acead004e6 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c | |||
| @@ -1,13 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * latencytop.c: Latency display infrastructure | 3 | * latencytop.c: Latency display infrastructure |
| 3 | * | 4 | * |
| 4 | * (C) Copyright 2008 Intel Corporation | 5 | * (C) Copyright 2008 Intel Corporation |
| 5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | 6 | * Author: Arjan van de Ven <arjan@linux.intel.com> |
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or | ||
| 8 | * modify it under the terms of the GNU General Public License | ||
| 9 | * as published by the Free Software Foundation; version 2 | ||
| 10 | * of the License. | ||
| 11 | */ | 7 | */ |
| 12 | 8 | ||
| 13 | /* | 9 | /* |
| @@ -67,13 +63,10 @@ static struct latency_record latency_record[MAXLR]; | |||
| 67 | 63 | ||
| 68 | int latencytop_enabled; | 64 | int latencytop_enabled; |
| 69 | 65 | ||
| 70 | void clear_all_latency_tracing(struct task_struct *p) | 66 | void clear_tsk_latency_tracing(struct task_struct *p) |
| 71 | { | 67 | { |
| 72 | unsigned long flags; | 68 | unsigned long flags; |
| 73 | 69 | ||
| 74 | if (!latencytop_enabled) | ||
| 75 | return; | ||
| 76 | |||
| 77 | raw_spin_lock_irqsave(&latency_lock, flags); | 70 | raw_spin_lock_irqsave(&latency_lock, flags); |
| 78 | memset(&p->latency_record, 0, sizeof(p->latency_record)); | 71 | memset(&p->latency_record, 0, sizeof(p->latency_record)); |
| 79 | p->latency_record_count = 0; | 72 | p->latency_record_count = 0; |
| @@ -96,9 +89,6 @@ account_global_scheduler_latency(struct task_struct *tsk, | |||
| 96 | int firstnonnull = MAXLR + 1; | 89 | int firstnonnull = MAXLR + 1; |
| 97 | int i; | 90 | int i; |
| 98 | 91 | ||
| 99 | if (!latencytop_enabled) | ||
| 100 | return; | ||
| 101 | |||
| 102 | /* skip kernel threads for now */ | 92 | /* skip kernel threads for now */ |
| 103 | if (!tsk->mm) | 93 | if (!tsk->mm) |
| 104 | return; | 94 | return; |
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index ec4565122e65..54102deb50ba 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | config HAVE_LIVEPATCH | 2 | config HAVE_LIVEPATCH |
| 2 | bool | 3 | bool |
| 3 | help | 4 | help |
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index b36ceda6488e..cf9b5bcdb952 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | obj-$(CONFIG_LIVEPATCH) += livepatch.o | 2 | obj-$(CONFIG_LIVEPATCH) += livepatch.o |
| 2 | 3 | ||
| 3 | livepatch-objs := core.o patch.o shadow.o transition.o | 4 | livepatch-objs := core.o patch.o shadow.o transition.o |
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index f6fbaff10e71..c4ce08f43bd6 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c | |||
| @@ -1,21 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * core.c - Kernel Live Patching Core | 3 | * core.c - Kernel Live Patching Core |
| 3 | * | 4 | * |
| 4 | * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> | 5 | * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> |
| 5 | * Copyright (C) 2014 SUSE | 6 | * Copyright (C) 2014 SUSE |
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or | ||
| 8 | * modify it under the terms of the GNU General Public License | ||
| 9 | * as published by the Free Software Foundation; either version 2 | ||
| 10 | * of the License, or (at your option) any later version. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 15 | * GNU General Public License for more details. | ||
| 16 | * | ||
| 17 | * You should have received a copy of the GNU General Public License | ||
| 18 | * along with this program; if not, see <http://www.gnu.org/licenses/>. | ||
| 19 | */ | 7 | */ |
| 20 | 8 | ||
| 21 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 9 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| @@ -30,6 +18,7 @@ | |||
| 30 | #include <linux/elf.h> | 18 | #include <linux/elf.h> |
| 31 | #include <linux/moduleloader.h> | 19 | #include <linux/moduleloader.h> |
| 32 | #include <linux/completion.h> | 20 | #include <linux/completion.h> |
| 21 | #include <linux/memory.h> | ||
| 33 | #include <asm/cacheflush.h> | 22 | #include <asm/cacheflush.h> |
| 34 | #include "core.h" | 23 | #include "core.h" |
| 35 | #include "patch.h" | 24 | #include "patch.h" |
| @@ -730,16 +719,21 @@ static int klp_init_object_loaded(struct klp_patch *patch, | |||
| 730 | struct klp_func *func; | 719 | struct klp_func *func; |
| 731 | int ret; | 720 | int ret; |
| 732 | 721 | ||
| 722 | mutex_lock(&text_mutex); | ||
| 723 | |||
| 733 | module_disable_ro(patch->mod); | 724 | module_disable_ro(patch->mod); |
| 734 | ret = klp_write_object_relocations(patch->mod, obj); | 725 | ret = klp_write_object_relocations(patch->mod, obj); |
| 735 | if (ret) { | 726 | if (ret) { |
| 736 | module_enable_ro(patch->mod, true); | 727 | module_enable_ro(patch->mod, true); |
| 728 | mutex_unlock(&text_mutex); | ||
| 737 | return ret; | 729 | return ret; |
| 738 | } | 730 | } |
| 739 | 731 | ||
| 740 | arch_klp_init_object_loaded(patch, obj); | 732 | arch_klp_init_object_loaded(patch, obj); |
| 741 | module_enable_ro(patch->mod, true); | 733 | module_enable_ro(patch->mod, true); |
| 742 | 734 | ||
| 735 | mutex_unlock(&text_mutex); | ||
| 736 | |||
| 743 | klp_for_each_func(obj, func) { | 737 | klp_for_each_func(obj, func) { |
| 744 | ret = klp_find_object_symbol(obj->name, func->old_name, | 738 | ret = klp_find_object_symbol(obj->name, func->old_name, |
| 745 | func->old_sympos, | 739 | func->old_sympos, |
| @@ -1208,14 +1202,6 @@ void klp_module_going(struct module *mod) | |||
| 1208 | 1202 | ||
| 1209 | static int __init klp_init(void) | 1203 | static int __init klp_init(void) |
| 1210 | { | 1204 | { |
| 1211 | int ret; | ||
| 1212 | |||
| 1213 | ret = klp_check_compiler_support(); | ||
| 1214 | if (ret) { | ||
| 1215 | pr_info("Your compiler is too old; turning off.\n"); | ||
| 1216 | return -EINVAL; | ||
| 1217 | } | ||
| 1218 | |||
| 1219 | klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); | 1205 | klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); |
| 1220 | if (!klp_root_kobj) | 1206 | if (!klp_root_kobj) |
| 1221 | return -ENOMEM; | 1207 | return -ENOMEM; |
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 99cb3ad05eb4..bd43537702bd 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c | |||
| @@ -1,22 +1,10 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * patch.c - livepatch patching functions | 3 | * patch.c - livepatch patching functions |
| 3 | * | 4 | * |
| 4 | * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> | 5 | * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> |
| 5 | * Copyright (C) 2014 SUSE | 6 | * Copyright (C) 2014 SUSE |
| 6 | * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com> | 7 | * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com> |
| 7 | * | ||
| 8 | * This program is free software; you can redistribute it and/or | ||
| 9 | * modify it under the terms of the GNU General Public License | ||
| 10 | * as published by the Free Software Foundation; either version 2 | ||
| 11 | * of the License, or (at your option) any later version. | ||
| 12 | * | ||
| 13 | * This program is distributed in the hope that it will be useful, | ||
| 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 16 | * GNU General Public License for more details. | ||
| 17 | * | ||
| 18 | * You should have received a copy of the GNU General Public License | ||
| 19 | * along with this program; if not, see <http://www.gnu.org/licenses/>. | ||
| 20 | */ | 8 | */ |
| 21 | 9 | ||
| 22 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 10 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
diff --git a/kernel/livepatch/shadow.c b/kernel/livepatch/shadow.c index 83958c814439..e5c9fb295ba9 100644 --- a/kernel/livepatch/shadow.c +++ b/kernel/livepatch/shadow.c | |||
| @@ -1,22 +1,10 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * shadow.c - Shadow Variables | 3 | * shadow.c - Shadow Variables |
| 3 | * | 4 | * |
| 4 | * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com> | 5 | * Copyright (C) 2014 Josh Poimboeuf <jpoimboe@redhat.com> |
| 5 | * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> | 6 | * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com> |
| 6 | * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> | 7 | * Copyright (C) 2017 Joe Lawrence <joe.lawrence@redhat.com> |
| 7 | * | ||
| 8 | * This program is free software; you can redistribute it and/or | ||
| 9 | * modify it under the terms of the GNU General Public License | ||
| 10 | * as published by the Free Software Foundation; either version 2 | ||
| 11 | * of the License, or (at your option) any later version. | ||
| 12 | * | ||
| 13 | * This program is distributed in the hope that it will be useful, | ||
| 14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 16 | * GNU General Public License for more details. | ||
| 17 | * | ||
| 18 | * You should have received a copy of the GNU General Public License | ||
| 19 | * along with this program; if not, see <http://www.gnu.org/licenses/>. | ||
| 20 | */ | 8 | */ |
| 21 | 9 | ||
| 22 | /** | 10 | /** |
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index c53370d596be..cdf318d86dd6 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c | |||
| @@ -1,20 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * transition.c - Kernel Live Patching transition functions | 3 | * transition.c - Kernel Live Patching transition functions |
| 3 | * | 4 | * |
| 4 | * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com> | 5 | * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com> |
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public License | ||
| 8 | * as published by the Free Software Foundation; either version 2 | ||
| 9 | * of the License, or (at your option) any later version. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, | ||
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | * GNU General Public License for more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License | ||
| 17 | * along with this program; if not, see <http://www.gnu.org/licenses/>. | ||
| 18 | */ | 6 | */ |
| 19 | 7 | ||
| 20 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 8 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| @@ -259,7 +247,6 @@ static int klp_check_stack(struct task_struct *task, char *err_buf) | |||
| 259 | int ret, nr_entries; | 247 | int ret, nr_entries; |
| 260 | 248 | ||
| 261 | ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); | 249 | ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries)); |
| 262 | WARN_ON_ONCE(ret == -ENOSYS); | ||
| 263 | if (ret < 0) { | 250 | if (ret < 0) { |
| 264 | snprintf(err_buf, STACK_ERR_BUF_SIZE, | 251 | snprintf(err_buf, STACK_ERR_BUF_SIZE, |
| 265 | "%s: %s:%d has an unreliable stack\n", | 252 | "%s: %s:%d has an unreliable stack\n", |
| @@ -293,11 +280,11 @@ static int klp_check_stack(struct task_struct *task, char *err_buf) | |||
| 293 | */ | 280 | */ |
| 294 | static bool klp_try_switch_task(struct task_struct *task) | 281 | static bool klp_try_switch_task(struct task_struct *task) |
| 295 | { | 282 | { |
| 283 | static char err_buf[STACK_ERR_BUF_SIZE]; | ||
| 296 | struct rq *rq; | 284 | struct rq *rq; |
| 297 | struct rq_flags flags; | 285 | struct rq_flags flags; |
| 298 | int ret; | 286 | int ret; |
| 299 | bool success = false; | 287 | bool success = false; |
| 300 | char err_buf[STACK_ERR_BUF_SIZE]; | ||
| 301 | 288 | ||
| 302 | err_buf[0] = '\0'; | 289 | err_buf[0] = '\0'; |
| 303 | 290 | ||
| @@ -306,6 +293,13 @@ static bool klp_try_switch_task(struct task_struct *task) | |||
| 306 | return true; | 293 | return true; |
| 307 | 294 | ||
| 308 | /* | 295 | /* |
| 296 | * For arches which don't have reliable stack traces, we have to rely | ||
| 297 | * on other methods (e.g., switching tasks at kernel exit). | ||
| 298 | */ | ||
| 299 | if (!klp_have_reliable_stack()) | ||
| 300 | return false; | ||
| 301 | |||
| 302 | /* | ||
| 309 | * Now try to check the stack for any to-be-patched or to-be-unpatched | 303 | * Now try to check the stack for any to-be-patched or to-be-unpatched |
| 310 | * functions. If all goes well, switch the task to the target patch | 304 | * functions. If all goes well, switch the task to the target patch |
| 311 | * state. | 305 | * state. |
| @@ -340,7 +334,6 @@ done: | |||
| 340 | pr_debug("%s", err_buf); | 334 | pr_debug("%s", err_buf); |
| 341 | 335 | ||
| 342 | return success; | 336 | return success; |
| 343 | |||
| 344 | } | 337 | } |
| 345 | 338 | ||
| 346 | /* | 339 | /* |
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6fe2f333aecb..45452facff3b 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile | |||
| @@ -3,7 +3,7 @@ | |||
| 3 | # and is generally not a function of system call inputs. | 3 | # and is generally not a function of system call inputs. |
| 4 | KCOV_INSTRUMENT := n | 4 | KCOV_INSTRUMENT := n |
| 5 | 5 | ||
| 6 | obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o rwsem-xadd.o | 6 | obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o |
| 7 | 7 | ||
| 8 | ifdef CONFIG_FUNCTION_TRACER | 8 | ifdef CONFIG_FUNCTION_TRACER |
| 9 | CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) | 9 | CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) |
diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index feb1acc54611..8c7e7d25f09c 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h | |||
| @@ -31,12 +31,13 @@ enum lock_events { | |||
| 31 | DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); | 31 | DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); |
| 32 | 32 | ||
| 33 | /* | 33 | /* |
| 34 | * Increment the PV qspinlock statistical counters | 34 | * Increment the statistical counters. Use raw_cpu_inc() because of lower |
| 35 | * overhead and we don't care if we lose the occasional update. | ||
| 35 | */ | 36 | */ |
| 36 | static inline void __lockevent_inc(enum lock_events event, bool cond) | 37 | static inline void __lockevent_inc(enum lock_events event, bool cond) |
| 37 | { | 38 | { |
| 38 | if (cond) | 39 | if (cond) |
| 39 | __this_cpu_inc(lockevents[event]); | 40 | raw_cpu_inc(lockevents[event]); |
| 40 | } | 41 | } |
| 41 | 42 | ||
| 42 | #define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) | 43 | #define lockevent_inc(ev) __lockevent_inc(LOCKEVENT_ ##ev, true) |
| @@ -44,7 +45,7 @@ static inline void __lockevent_inc(enum lock_events event, bool cond) | |||
| 44 | 45 | ||
| 45 | static inline void __lockevent_add(enum lock_events event, int inc) | 46 | static inline void __lockevent_add(enum lock_events event, int inc) |
| 46 | { | 47 | { |
| 47 | __this_cpu_add(lockevents[event], inc); | 48 | raw_cpu_add(lockevents[event], inc); |
| 48 | } | 49 | } |
| 49 | 50 | ||
| 50 | #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) | 51 | #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c) |
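The comment added to __lockevent_inc() above explains the trade-off: raw_cpu_inc() is cheaper than __this_cpu_inc(), and an occasionally lost increment is tolerable because the counters are purely statistical. As a rough sketch of the read side this implies (the helper name below is hypothetical and the snippet only builds inside the kernel tree, it is not part of the patch), the per-CPU values are simply summed:

/*
 * Illustrative sketch only: sum the per-CPU lockevents[] counters.
 * Writers use raw_cpu_inc(), so an occasional increment may be lost
 * across a preemption race; the totals are statistical by design.
 */
#include <linux/percpu.h>

static unsigned long lockevent_read_total(enum lock_events event)
{
        unsigned long sum = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                sum += per_cpu(lockevents[event], cpu);

        return sum;
}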
diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index ad7668cfc9da..239039d0ce21 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h | |||
| @@ -56,12 +56,16 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ | |||
| 56 | LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ | 56 | LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ |
| 57 | LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ | 57 | LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ |
| 58 | LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ | 58 | LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ |
| 59 | LOCK_EVENT(rwsem_opt_wlock) /* # of write locks opt-spin acquired */ | 59 | LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */ |
| 60 | LOCK_EVENT(rwsem_opt_fail) /* # of failed opt-spinnings */ | 60 | LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */ |
| 61 | LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */ | ||
| 62 | LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */ | ||
| 63 | LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */ | ||
| 64 | LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */ | ||
| 61 | LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ | 65 | LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ |
| 62 | LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ | 66 | LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ |
| 63 | LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ | 67 | LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ |
| 64 | LOCK_EVENT(rwsem_rtrylock) /* # of read trylock calls */ | 68 | LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ |
| 65 | LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ | 69 | LOCK_EVENT(rwsem_wlock) /* # of write locks acquired */ |
| 66 | LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ | 70 | LOCK_EVENT(rwsem_wlock_fail) /* # of failed write lock acquisitions */ |
| 67 | LOCK_EVENT(rwsem_wtrylock) /* # of write trylock calls */ | 71 | LOCK_EVENT(rwsem_wlock_handoff) /* # of write lock handoffs */ |
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d06190fa5082..341f52117f88 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/lockdep.c | 3 | * kernel/lockdep.c |
| 3 | * | 4 | * |
| @@ -150,17 +151,28 @@ unsigned long nr_lock_classes; | |||
| 150 | static | 151 | static |
| 151 | #endif | 152 | #endif |
| 152 | struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; | 153 | struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; |
| 154 | static DECLARE_BITMAP(lock_classes_in_use, MAX_LOCKDEP_KEYS); | ||
| 153 | 155 | ||
| 154 | static inline struct lock_class *hlock_class(struct held_lock *hlock) | 156 | static inline struct lock_class *hlock_class(struct held_lock *hlock) |
| 155 | { | 157 | { |
| 156 | if (!hlock->class_idx) { | 158 | unsigned int class_idx = hlock->class_idx; |
| 159 | |||
| 160 | /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfield */ | ||
| 161 | barrier(); | ||
| 162 | |||
| 163 | if (!test_bit(class_idx, lock_classes_in_use)) { | ||
| 157 | /* | 164 | /* |
| 158 | * Someone passed in garbage, we give up. | 165 | * Someone passed in garbage, we give up. |
| 159 | */ | 166 | */ |
| 160 | DEBUG_LOCKS_WARN_ON(1); | 167 | DEBUG_LOCKS_WARN_ON(1); |
| 161 | return NULL; | 168 | return NULL; |
| 162 | } | 169 | } |
| 163 | return lock_classes + hlock->class_idx - 1; | 170 | |
| 171 | /* | ||
| 172 | * At this point, if the passed hlock->class_idx is still garbage, | ||
| 173 | * we just have to live with it | ||
| 174 | */ | ||
| 175 | return lock_classes + class_idx; | ||
| 164 | } | 176 | } |
| 165 | 177 | ||
| 166 | #ifdef CONFIG_LOCK_STAT | 178 | #ifdef CONFIG_LOCK_STAT |
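hlock_class() now refuses to trust class_idx unless its bit is set in the new lock_classes_in_use bitmap, instead of relying on the old idx-minus-one encoding. A small standalone sketch of that validate-before-index pattern, with made-up names and sizes (this is not the lockdep code):

/* Only trust an index whose bit is set in an "in use" bitmap; otherwise
 * fail the lookup instead of handing back a bogus array element. */
#include <stdio.h>
#include <limits.h>

#define MAX_KEYS 64
#define BITS_PER_WORD (sizeof(unsigned long) * CHAR_BIT)

static struct { const char *name; } classes[MAX_KEYS];
static unsigned long classes_in_use[MAX_KEYS / BITS_PER_WORD + 1];

static void set_in_use(unsigned int idx)
{
        classes_in_use[idx / BITS_PER_WORD] |= 1UL << (idx % BITS_PER_WORD);
}

static int test_in_use(unsigned int idx)
{
        return !!(classes_in_use[idx / BITS_PER_WORD] &
                  (1UL << (idx % BITS_PER_WORD)));
}

static void *lookup_class(unsigned int idx)
{
        if (idx >= MAX_KEYS || !test_in_use(idx))
                return NULL;    /* someone passed in garbage: give up */
        return &classes[idx];
}

int main(void)
{
        classes[3].name = "example";
        set_in_use(3);
        printf("idx 3: %p, idx 5: %p\n", lookup_class(3), lookup_class(5));
        return 0;
}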
| @@ -358,6 +370,13 @@ static inline u64 iterate_chain_key(u64 key, u32 idx) | |||
| 358 | return k0 | (u64)k1 << 32; | 370 | return k0 | (u64)k1 << 32; |
| 359 | } | 371 | } |
| 360 | 372 | ||
| 373 | void lockdep_init_task(struct task_struct *task) | ||
| 374 | { | ||
| 375 | task->lockdep_depth = 0; /* no locks held yet */ | ||
| 376 | task->curr_chain_key = INITIAL_CHAIN_KEY; | ||
| 377 | task->lockdep_recursion = 0; | ||
| 378 | } | ||
| 379 | |||
| 361 | void lockdep_off(void) | 380 | void lockdep_off(void) |
| 362 | { | 381 | { |
| 363 | current->lockdep_recursion++; | 382 | current->lockdep_recursion++; |
| @@ -418,13 +437,6 @@ static int verbose(struct lock_class *class) | |||
| 418 | return 0; | 437 | return 0; |
| 419 | } | 438 | } |
| 420 | 439 | ||
| 421 | /* | ||
| 422 | * Stack-trace: tightly packed array of stack backtrace | ||
| 423 | * addresses. Protected by the graph_lock. | ||
| 424 | */ | ||
| 425 | unsigned long nr_stack_trace_entries; | ||
| 426 | static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; | ||
| 427 | |||
| 428 | static void print_lockdep_off(const char *bug_msg) | 440 | static void print_lockdep_off(const char *bug_msg) |
| 429 | { | 441 | { |
| 430 | printk(KERN_DEBUG "%s\n", bug_msg); | 442 | printk(KERN_DEBUG "%s\n", bug_msg); |
| @@ -434,6 +446,15 @@ static void print_lockdep_off(const char *bug_msg) | |||
| 434 | #endif | 446 | #endif |
| 435 | } | 447 | } |
| 436 | 448 | ||
| 449 | unsigned long nr_stack_trace_entries; | ||
| 450 | |||
| 451 | #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) | ||
| 452 | /* | ||
| 453 | * Stack-trace: tightly packed array of stack backtrace | ||
| 454 | * addresses. Protected by the graph_lock. | ||
| 455 | */ | ||
| 456 | static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; | ||
| 457 | |||
| 437 | static int save_trace(struct lock_trace *trace) | 458 | static int save_trace(struct lock_trace *trace) |
| 438 | { | 459 | { |
| 439 | unsigned long *entries = stack_trace + nr_stack_trace_entries; | 460 | unsigned long *entries = stack_trace + nr_stack_trace_entries; |
| @@ -456,6 +477,7 @@ static int save_trace(struct lock_trace *trace) | |||
| 456 | 477 | ||
| 457 | return 1; | 478 | return 1; |
| 458 | } | 479 | } |
| 480 | #endif | ||
| 459 | 481 | ||
| 460 | unsigned int nr_hardirq_chains; | 482 | unsigned int nr_hardirq_chains; |
| 461 | unsigned int nr_softirq_chains; | 483 | unsigned int nr_softirq_chains; |
| @@ -469,6 +491,7 @@ unsigned int max_lockdep_depth; | |||
| 469 | DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); | 491 | DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); |
| 470 | #endif | 492 | #endif |
| 471 | 493 | ||
| 494 | #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) | ||
| 472 | /* | 495 | /* |
| 473 | * Locking printouts: | 496 | * Locking printouts: |
| 474 | */ | 497 | */ |
| @@ -486,6 +509,7 @@ static const char *usage_str[] = | |||
| 486 | #undef LOCKDEP_STATE | 509 | #undef LOCKDEP_STATE |
| 487 | [LOCK_USED] = "INITIAL USE", | 510 | [LOCK_USED] = "INITIAL USE", |
| 488 | }; | 511 | }; |
| 512 | #endif | ||
| 489 | 513 | ||
| 490 | const char * __get_key_name(struct lockdep_subclass_key *key, char *str) | 514 | const char * __get_key_name(struct lockdep_subclass_key *key, char *str) |
| 491 | { | 515 | { |
| @@ -499,15 +523,26 @@ static inline unsigned long lock_flag(enum lock_usage_bit bit) | |||
| 499 | 523 | ||
| 500 | static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) | 524 | static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) |
| 501 | { | 525 | { |
| 526 | /* | ||
| 527 | * The usage character defaults to '.' (i.e., irqs disabled and not in | ||
| 528 | * irq context), which is the safest usage category. | ||
| 529 | */ | ||
| 502 | char c = '.'; | 530 | char c = '.'; |
| 503 | 531 | ||
| 504 | if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) | 532 | /* |
| 533 | * The order of the following usage checks matters, which will | ||
| 534 | * result in the outcome character as follows: | ||
| 535 | * | ||
| 536 | * - '+': irq is enabled and not in irq context | ||
| 537 | * - '-': in irq context and irq is disabled | ||
| 538 | * - '?': in irq context and irq is enabled | ||
| 539 | */ | ||
| 540 | if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) { | ||
| 505 | c = '+'; | 541 | c = '+'; |
| 506 | if (class->usage_mask & lock_flag(bit)) { | 542 | if (class->usage_mask & lock_flag(bit)) |
| 507 | c = '-'; | ||
| 508 | if (class->usage_mask & lock_flag(bit + LOCK_USAGE_DIR_MASK)) | ||
| 509 | c = '?'; | 543 | c = '?'; |
| 510 | } | 544 | } else if (class->usage_mask & lock_flag(bit)) |
| 545 | c = '-'; | ||
| 511 | 546 | ||
| 512 | return c; | 547 | return c; |
| 513 | } | 548 | } |
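The rewritten get_usage_char() above encodes two usage bits into one character ('.', '+', '-', '?'). A minimal standalone illustration of that mapping, using hypothetical names and bit positions rather than the real lockdep flags:

/* '.' = neither bit, '+' = only the "irq enabled while held" bit,
 * '-' = only the "used in irq context" bit, '?' = both bits set. */
#include <stdio.h>

static char usage_char(unsigned int mask, unsigned int used_bit,
                       unsigned int enabled_bit)
{
        char c = '.';

        if (mask & (1u << enabled_bit)) {
                c = '+';
                if (mask & (1u << used_bit))
                        c = '?';
        } else if (mask & (1u << used_bit)) {
                c = '-';
        }
        return c;
}

int main(void)
{
        /* bit 0: used in irq context, bit 1: irq enabled while held */
        for (unsigned int mask = 0; mask < 4; mask++)
                printf("mask=%u -> '%c'\n", mask, usage_char(mask, 0, 1));
        return 0;
}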
| @@ -571,19 +606,22 @@ static void print_lock(struct held_lock *hlock) | |||
| 571 | /* | 606 | /* |
| 572 | * We can be called locklessly through debug_show_all_locks() so be | 607 | * We can be called locklessly through debug_show_all_locks() so be |
| 573 | * extra careful, the hlock might have been released and cleared. | 608 | * extra careful, the hlock might have been released and cleared. |
| 609 | * | ||
| 610 | * If this indeed happens, let's pretend it does not hurt to continue | ||
| 611 | * to print the lock unless the hlock class_idx does not point to a | ||
| 612 | * registered class. The rationale here is: since we don't attempt | ||
| 613 | * to distinguish whether we are in this situation, if it just | ||
| 614 | * happened we can't count on class_idx to tell either. | ||
| 574 | */ | 615 | */ |
| 575 | unsigned int class_idx = hlock->class_idx; | 616 | struct lock_class *lock = hlock_class(hlock); |
| 576 | |||
| 577 | /* Don't re-read hlock->class_idx, can't use READ_ONCE() on bitfields: */ | ||
| 578 | barrier(); | ||
| 579 | 617 | ||
| 580 | if (!class_idx || (class_idx - 1) >= MAX_LOCKDEP_KEYS) { | 618 | if (!lock) { |
| 581 | printk(KERN_CONT "<RELEASED>\n"); | 619 | printk(KERN_CONT "<RELEASED>\n"); |
| 582 | return; | 620 | return; |
| 583 | } | 621 | } |
| 584 | 622 | ||
| 585 | printk(KERN_CONT "%p", hlock->instance); | 623 | printk(KERN_CONT "%p", hlock->instance); |
| 586 | print_lock_name(lock_classes + class_idx - 1); | 624 | print_lock_name(lock); |
| 587 | printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); | 625 | printk(KERN_CONT ", at: %pS\n", (void *)hlock->acquire_ip); |
| 588 | } | 626 | } |
| 589 | 627 | ||
| @@ -731,7 +769,8 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) | |||
| 731 | * Huh! same key, different name? Did someone trample | 769 | * Huh! same key, different name? Did someone trample |
| 732 | * on some memory? We're most confused. | 770 | * on some memory? We're most confused. |
| 733 | */ | 771 | */ |
| 734 | WARN_ON_ONCE(class->name != lock->name); | 772 | WARN_ON_ONCE(class->name != lock->name && |
| 773 | lock->key != &__lockdep_no_validate__); | ||
| 735 | return class; | 774 | return class; |
| 736 | } | 775 | } |
| 737 | } | 776 | } |
| @@ -837,11 +876,11 @@ static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; | |||
| 837 | static bool check_lock_chain_key(struct lock_chain *chain) | 876 | static bool check_lock_chain_key(struct lock_chain *chain) |
| 838 | { | 877 | { |
| 839 | #ifdef CONFIG_PROVE_LOCKING | 878 | #ifdef CONFIG_PROVE_LOCKING |
| 840 | u64 chain_key = 0; | 879 | u64 chain_key = INITIAL_CHAIN_KEY; |
| 841 | int i; | 880 | int i; |
| 842 | 881 | ||
| 843 | for (i = chain->base; i < chain->base + chain->depth; i++) | 882 | for (i = chain->base; i < chain->base + chain->depth; i++) |
| 844 | chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); | 883 | chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); |
| 845 | /* | 884 | /* |
| 846 | * The 'unsigned long long' casts avoid that a compiler warning | 885 | * The 'unsigned long long' casts avoid that a compiler warning |
| 847 | * is reported when building tools/lib/lockdep. | 886 | * is reported when building tools/lib/lockdep. |
| @@ -1116,6 +1155,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) | |||
| 1116 | return NULL; | 1155 | return NULL; |
| 1117 | } | 1156 | } |
| 1118 | nr_lock_classes++; | 1157 | nr_lock_classes++; |
| 1158 | __set_bit(class - lock_classes, lock_classes_in_use); | ||
| 1119 | debug_atomic_inc(nr_unused_locks); | 1159 | debug_atomic_inc(nr_unused_locks); |
| 1120 | class->key = key; | 1160 | class->key = key; |
| 1121 | class->name = lock->name; | 1161 | class->name = lock->name; |
| @@ -1227,13 +1267,17 @@ static int add_lock_to_list(struct lock_class *this, | |||
| 1227 | #define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) | 1267 | #define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) |
| 1228 | 1268 | ||
| 1229 | /* | 1269 | /* |
| 1230 | * The circular_queue and helpers is used to implement the | 1270 | * The circular_queue and helpers are used to implement graph |
| 1231 | * breadth-first search(BFS)algorithem, by which we can build | 1271 | * breadth-first search (BFS) algorithm, by which we can determine |
| 1232 | * the shortest path from the next lock to be acquired to the | 1272 | * whether there is a path from a lock to another. In deadlock checks, |
| 1233 | * previous held lock if there is a circular between them. | 1273 | * a path from the next lock to be acquired to a previous held lock |
| 1274 | * indicates that adding the <prev> -> <next> lock dependency will | ||
| 1275 | * produce a circle in the graph. Breadth-first search instead of | ||
| 1276 | * depth-first search is used in order to find the shortest (circular) | ||
| 1277 | * path. | ||
| 1234 | */ | 1278 | */ |
| 1235 | struct circular_queue { | 1279 | struct circular_queue { |
| 1236 | unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; | 1280 | struct lock_list *element[MAX_CIRCULAR_QUEUE_SIZE]; |
| 1237 | unsigned int front, rear; | 1281 | unsigned int front, rear; |
| 1238 | }; | 1282 | }; |
| 1239 | 1283 | ||
| @@ -1259,7 +1303,7 @@ static inline int __cq_full(struct circular_queue *cq) | |||
| 1259 | return ((cq->rear + 1) & CQ_MASK) == cq->front; | 1303 | return ((cq->rear + 1) & CQ_MASK) == cq->front; |
| 1260 | } | 1304 | } |
| 1261 | 1305 | ||
| 1262 | static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) | 1306 | static inline int __cq_enqueue(struct circular_queue *cq, struct lock_list *elem) |
| 1263 | { | 1307 | { |
| 1264 | if (__cq_full(cq)) | 1308 | if (__cq_full(cq)) |
| 1265 | return -1; | 1309 | return -1; |
| @@ -1269,14 +1313,21 @@ static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) | |||
| 1269 | return 0; | 1313 | return 0; |
| 1270 | } | 1314 | } |
| 1271 | 1315 | ||
| 1272 | static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) | 1316 | /* |
| 1317 | * Dequeue an element from the circular_queue, return a lock_list if | ||
| 1318 | * the queue is not empty, or NULL if otherwise. | ||
| 1319 | */ | ||
| 1320 | static inline struct lock_list * __cq_dequeue(struct circular_queue *cq) | ||
| 1273 | { | 1321 | { |
| 1322 | struct lock_list * lock; | ||
| 1323 | |||
| 1274 | if (__cq_empty(cq)) | 1324 | if (__cq_empty(cq)) |
| 1275 | return -1; | 1325 | return NULL; |
| 1276 | 1326 | ||
| 1277 | *elem = cq->element[cq->front]; | 1327 | lock = cq->element[cq->front]; |
| 1278 | cq->front = (cq->front + 1) & CQ_MASK; | 1328 | cq->front = (cq->front + 1) & CQ_MASK; |
| 1279 | return 0; | 1329 | |
| 1330 | return lock; | ||
| 1280 | } | 1331 | } |
| 1281 | 1332 | ||
| 1282 | static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) | 1333 | static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) |
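The queue touched above is a fixed-size ring buffer whose element type changes from unsigned long to struct lock_list *, and whose dequeue now returns a pointer (or NULL when empty) instead of an error code. A small userspace sketch of the same pattern, with hypothetical names and a toy node type (not the kernel code):

#include <stdio.h>
#include <stddef.h>

#define CQ_SIZE 8                       /* must be a power of two */
#define CQ_MASK (CQ_SIZE - 1)

struct node { int id; };

struct circular_queue {
        struct node *element[CQ_SIZE];  /* pointers stored directly */
        unsigned int front, rear;
};

static void cq_init(struct circular_queue *cq) { cq->front = cq->rear = 0; }

static int cq_full(struct circular_queue *cq)
{
        return ((cq->rear + 1) & CQ_MASK) == cq->front;
}

static int cq_empty(struct circular_queue *cq) { return cq->front == cq->rear; }

static int cq_enqueue(struct circular_queue *cq, struct node *n)
{
        if (cq_full(cq))
                return -1;
        cq->element[cq->rear] = n;
        cq->rear = (cq->rear + 1) & CQ_MASK;
        return 0;
}

/* Returns the oldest element, or NULL if the queue is empty. */
static struct node *cq_dequeue(struct circular_queue *cq)
{
        struct node *n;

        if (cq_empty(cq))
                return NULL;
        n = cq->element[cq->front];
        cq->front = (cq->front + 1) & CQ_MASK;
        return n;
}

int main(void)
{
        struct circular_queue cq;
        struct node a = { 1 }, b = { 2 }, *n;

        cq_init(&cq);
        cq_enqueue(&cq, &a);
        cq_enqueue(&cq, &b);
        while ((n = cq_dequeue(&cq)))
                printf("dequeued %d\n", n->id);
        return 0;
}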
| @@ -1321,13 +1372,32 @@ static inline int get_lock_depth(struct lock_list *child) | |||
| 1321 | return depth; | 1372 | return depth; |
| 1322 | } | 1373 | } |
| 1323 | 1374 | ||
| 1375 | /* | ||
| 1376 | * Return the forward or backward dependency list. | ||
| 1377 | * | ||
| 1378 | * @lock: the lock_list to get its class's dependency list | ||
| 1379 | * @offset: the offset to struct lock_class to determine whether it is | ||
| 1380 | * locks_after or locks_before | ||
| 1381 | */ | ||
| 1382 | static inline struct list_head *get_dep_list(struct lock_list *lock, int offset) | ||
| 1383 | { | ||
| 1384 | void *lock_class = lock->class; | ||
| 1385 | |||
| 1386 | return lock_class + offset; | ||
| 1387 | } | ||
| 1388 | |||
| 1389 | /* | ||
| 1390 | * Forward- or backward-dependency search, used for both circular dependency | ||
| 1391 | * checking and hardirq-unsafe/softirq-unsafe checking. | ||
| 1392 | */ | ||
| 1324 | static int __bfs(struct lock_list *source_entry, | 1393 | static int __bfs(struct lock_list *source_entry, |
| 1325 | void *data, | 1394 | void *data, |
| 1326 | int (*match)(struct lock_list *entry, void *data), | 1395 | int (*match)(struct lock_list *entry, void *data), |
| 1327 | struct lock_list **target_entry, | 1396 | struct lock_list **target_entry, |
| 1328 | int forward) | 1397 | int offset) |
| 1329 | { | 1398 | { |
| 1330 | struct lock_list *entry; | 1399 | struct lock_list *entry; |
| 1400 | struct lock_list *lock; | ||
| 1331 | struct list_head *head; | 1401 | struct list_head *head; |
| 1332 | struct circular_queue *cq = &lock_cq; | 1402 | struct circular_queue *cq = &lock_cq; |
| 1333 | int ret = 1; | 1403 | int ret = 1; |
| @@ -1338,31 +1408,21 @@ static int __bfs(struct lock_list *source_entry, | |||
| 1338 | goto exit; | 1408 | goto exit; |
| 1339 | } | 1409 | } |
| 1340 | 1410 | ||
| 1341 | if (forward) | 1411 | head = get_dep_list(source_entry, offset); |
| 1342 | head = &source_entry->class->locks_after; | ||
| 1343 | else | ||
| 1344 | head = &source_entry->class->locks_before; | ||
| 1345 | |||
| 1346 | if (list_empty(head)) | 1412 | if (list_empty(head)) |
| 1347 | goto exit; | 1413 | goto exit; |
| 1348 | 1414 | ||
| 1349 | __cq_init(cq); | 1415 | __cq_init(cq); |
| 1350 | __cq_enqueue(cq, (unsigned long)source_entry); | 1416 | __cq_enqueue(cq, source_entry); |
| 1351 | 1417 | ||
| 1352 | while (!__cq_empty(cq)) { | 1418 | while ((lock = __cq_dequeue(cq))) { |
| 1353 | struct lock_list *lock; | ||
| 1354 | |||
| 1355 | __cq_dequeue(cq, (unsigned long *)&lock); | ||
| 1356 | 1419 | ||
| 1357 | if (!lock->class) { | 1420 | if (!lock->class) { |
| 1358 | ret = -2; | 1421 | ret = -2; |
| 1359 | goto exit; | 1422 | goto exit; |
| 1360 | } | 1423 | } |
| 1361 | 1424 | ||
| 1362 | if (forward) | 1425 | head = get_dep_list(lock, offset); |
| 1363 | head = &lock->class->locks_after; | ||
| 1364 | else | ||
| 1365 | head = &lock->class->locks_before; | ||
| 1366 | 1426 | ||
| 1367 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); | 1427 | DEBUG_LOCKS_WARN_ON(!irqs_disabled()); |
| 1368 | 1428 | ||
| @@ -1376,7 +1436,7 @@ static int __bfs(struct lock_list *source_entry, | |||
| 1376 | goto exit; | 1436 | goto exit; |
| 1377 | } | 1437 | } |
| 1378 | 1438 | ||
| 1379 | if (__cq_enqueue(cq, (unsigned long)entry)) { | 1439 | if (__cq_enqueue(cq, entry)) { |
| 1380 | ret = -1; | 1440 | ret = -1; |
| 1381 | goto exit; | 1441 | goto exit; |
| 1382 | } | 1442 | } |
| @@ -1395,7 +1455,8 @@ static inline int __bfs_forwards(struct lock_list *src_entry, | |||
| 1395 | int (*match)(struct lock_list *entry, void *data), | 1455 | int (*match)(struct lock_list *entry, void *data), |
| 1396 | struct lock_list **target_entry) | 1456 | struct lock_list **target_entry) |
| 1397 | { | 1457 | { |
| 1398 | return __bfs(src_entry, data, match, target_entry, 1); | 1458 | return __bfs(src_entry, data, match, target_entry, |
| 1459 | offsetof(struct lock_class, locks_after)); | ||
| 1399 | 1460 | ||
| 1400 | } | 1461 | } |
| 1401 | 1462 | ||
| @@ -1404,16 +1465,11 @@ static inline int __bfs_backwards(struct lock_list *src_entry, | |||
| 1404 | int (*match)(struct lock_list *entry, void *data), | 1465 | int (*match)(struct lock_list *entry, void *data), |
| 1405 | struct lock_list **target_entry) | 1466 | struct lock_list **target_entry) |
| 1406 | { | 1467 | { |
| 1407 | return __bfs(src_entry, data, match, target_entry, 0); | 1468 | return __bfs(src_entry, data, match, target_entry, |
| 1469 | offsetof(struct lock_class, locks_before)); | ||
| 1408 | 1470 | ||
| 1409 | } | 1471 | } |
| 1410 | 1472 | ||
| 1411 | /* | ||
| 1412 | * Recursive, forwards-direction lock-dependency checking, used for | ||
| 1413 | * both noncyclic checking and for hardirq-unsafe/softirq-unsafe | ||
| 1414 | * checking. | ||
| 1415 | */ | ||
| 1416 | |||
| 1417 | static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) | 1473 | static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) |
| 1418 | { | 1474 | { |
| 1419 | unsigned long *entries = stack_trace + trace->offset; | 1475 | unsigned long *entries = stack_trace + trace->offset; |
| @@ -1425,16 +1481,15 @@ static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) | |||
| 1425 | * Print a dependency chain entry (this is only done when a deadlock | 1481 | * Print a dependency chain entry (this is only done when a deadlock |
| 1426 | * has been detected): | 1482 | * has been detected): |
| 1427 | */ | 1483 | */ |
| 1428 | static noinline int | 1484 | static noinline void |
| 1429 | print_circular_bug_entry(struct lock_list *target, int depth) | 1485 | print_circular_bug_entry(struct lock_list *target, int depth) |
| 1430 | { | 1486 | { |
| 1431 | if (debug_locks_silent) | 1487 | if (debug_locks_silent) |
| 1432 | return 0; | 1488 | return; |
| 1433 | printk("\n-> #%u", depth); | 1489 | printk("\n-> #%u", depth); |
| 1434 | print_lock_name(target->class); | 1490 | print_lock_name(target->class); |
| 1435 | printk(KERN_CONT ":\n"); | 1491 | printk(KERN_CONT ":\n"); |
| 1436 | print_lock_trace(&target->trace, 6); | 1492 | print_lock_trace(&target->trace, 6); |
| 1437 | return 0; | ||
| 1438 | } | 1493 | } |
| 1439 | 1494 | ||
| 1440 | static void | 1495 | static void |
| @@ -1491,7 +1546,7 @@ print_circular_lock_scenario(struct held_lock *src, | |||
| 1491 | * When a circular dependency is detected, print the | 1546 | * When a circular dependency is detected, print the |
| 1492 | * header first: | 1547 | * header first: |
| 1493 | */ | 1548 | */ |
| 1494 | static noinline int | 1549 | static noinline void |
| 1495 | print_circular_bug_header(struct lock_list *entry, unsigned int depth, | 1550 | print_circular_bug_header(struct lock_list *entry, unsigned int depth, |
| 1496 | struct held_lock *check_src, | 1551 | struct held_lock *check_src, |
| 1497 | struct held_lock *check_tgt) | 1552 | struct held_lock *check_tgt) |
| @@ -1499,7 +1554,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, | |||
| 1499 | struct task_struct *curr = current; | 1554 | struct task_struct *curr = current; |
| 1500 | 1555 | ||
| 1501 | if (debug_locks_silent) | 1556 | if (debug_locks_silent) |
| 1502 | return 0; | 1557 | return; |
| 1503 | 1558 | ||
| 1504 | pr_warn("\n"); | 1559 | pr_warn("\n"); |
| 1505 | pr_warn("======================================================\n"); | 1560 | pr_warn("======================================================\n"); |
| @@ -1517,8 +1572,6 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, | |||
| 1517 | pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); | 1572 | pr_warn("\nthe existing dependency chain (in reverse order) is:\n"); |
| 1518 | 1573 | ||
| 1519 | print_circular_bug_entry(entry, depth); | 1574 | print_circular_bug_entry(entry, depth); |
| 1520 | |||
| 1521 | return 0; | ||
| 1522 | } | 1575 | } |
| 1523 | 1576 | ||
| 1524 | static inline int class_equal(struct lock_list *entry, void *data) | 1577 | static inline int class_equal(struct lock_list *entry, void *data) |
| @@ -1526,10 +1579,10 @@ static inline int class_equal(struct lock_list *entry, void *data) | |||
| 1526 | return entry->class == data; | 1579 | return entry->class == data; |
| 1527 | } | 1580 | } |
| 1528 | 1581 | ||
| 1529 | static noinline int print_circular_bug(struct lock_list *this, | 1582 | static noinline void print_circular_bug(struct lock_list *this, |
| 1530 | struct lock_list *target, | 1583 | struct lock_list *target, |
| 1531 | struct held_lock *check_src, | 1584 | struct held_lock *check_src, |
| 1532 | struct held_lock *check_tgt) | 1585 | struct held_lock *check_tgt) |
| 1533 | { | 1586 | { |
| 1534 | struct task_struct *curr = current; | 1587 | struct task_struct *curr = current; |
| 1535 | struct lock_list *parent; | 1588 | struct lock_list *parent; |
| @@ -1537,10 +1590,10 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
| 1537 | int depth; | 1590 | int depth; |
| 1538 | 1591 | ||
| 1539 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1592 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 1540 | return 0; | 1593 | return; |
| 1541 | 1594 | ||
| 1542 | if (!save_trace(&this->trace)) | 1595 | if (!save_trace(&this->trace)) |
| 1543 | return 0; | 1596 | return; |
| 1544 | 1597 | ||
| 1545 | depth = get_lock_depth(target); | 1598 | depth = get_lock_depth(target); |
| 1546 | 1599 | ||
| @@ -1562,21 +1615,17 @@ static noinline int print_circular_bug(struct lock_list *this, | |||
| 1562 | 1615 | ||
| 1563 | printk("\nstack backtrace:\n"); | 1616 | printk("\nstack backtrace:\n"); |
| 1564 | dump_stack(); | 1617 | dump_stack(); |
| 1565 | |||
| 1566 | return 0; | ||
| 1567 | } | 1618 | } |
| 1568 | 1619 | ||
| 1569 | static noinline int print_bfs_bug(int ret) | 1620 | static noinline void print_bfs_bug(int ret) |
| 1570 | { | 1621 | { |
| 1571 | if (!debug_locks_off_graph_unlock()) | 1622 | if (!debug_locks_off_graph_unlock()) |
| 1572 | return 0; | 1623 | return; |
| 1573 | 1624 | ||
| 1574 | /* | 1625 | /* |
| 1575 | * Breadth-first-search failed, graph got corrupted? | 1626 | * Breadth-first-search failed, graph got corrupted? |
| 1576 | */ | 1627 | */ |
| 1577 | WARN(1, "lockdep bfs error:%d\n", ret); | 1628 | WARN(1, "lockdep bfs error:%d\n", ret); |
| 1578 | |||
| 1579 | return 0; | ||
| 1580 | } | 1629 | } |
| 1581 | 1630 | ||
| 1582 | static int noop_count(struct lock_list *entry, void *data) | 1631 | static int noop_count(struct lock_list *entry, void *data) |
| @@ -1639,36 +1688,95 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) | |||
| 1639 | } | 1688 | } |
| 1640 | 1689 | ||
| 1641 | /* | 1690 | /* |
| 1642 | * Prove that the dependency graph starting at <entry> can not | 1691 | * Check that the dependency graph starting at <src> can lead to |
| 1643 | * lead to <target>. Print an error and return 0 if it does. | 1692 | * <target> or not. Print an error and return 0 if it does. |
| 1644 | */ | 1693 | */ |
| 1645 | static noinline int | 1694 | static noinline int |
| 1646 | check_noncircular(struct lock_list *root, struct lock_class *target, | 1695 | check_path(struct lock_class *target, struct lock_list *src_entry, |
| 1647 | struct lock_list **target_entry) | 1696 | struct lock_list **target_entry) |
| 1648 | { | 1697 | { |
| 1649 | int result; | 1698 | int ret; |
| 1699 | |||
| 1700 | ret = __bfs_forwards(src_entry, (void *)target, class_equal, | ||
| 1701 | target_entry); | ||
| 1702 | |||
| 1703 | if (unlikely(ret < 0)) | ||
| 1704 | print_bfs_bug(ret); | ||
| 1705 | |||
| 1706 | return ret; | ||
| 1707 | } | ||
| 1708 | |||
| 1709 | /* | ||
| 1710 | * Prove that the dependency graph starting at <src> can not | ||
| 1711 | * lead to <target>. If it can, there is a circle when adding | ||
| 1712 | * <target> -> <src> dependency. | ||
| 1713 | * | ||
| 1714 | * Print an error and return 0 if it does. | ||
| 1715 | */ | ||
| 1716 | static noinline int | ||
| 1717 | check_noncircular(struct held_lock *src, struct held_lock *target, | ||
| 1718 | struct lock_trace *trace) | ||
| 1719 | { | ||
| 1720 | int ret; | ||
| 1721 | struct lock_list *uninitialized_var(target_entry); | ||
| 1722 | struct lock_list src_entry = { | ||
| 1723 | .class = hlock_class(src), | ||
| 1724 | .parent = NULL, | ||
| 1725 | }; | ||
| 1650 | 1726 | ||
| 1651 | debug_atomic_inc(nr_cyclic_checks); | 1727 | debug_atomic_inc(nr_cyclic_checks); |
| 1652 | 1728 | ||
| 1653 | result = __bfs_forwards(root, target, class_equal, target_entry); | 1729 | ret = check_path(hlock_class(target), &src_entry, &target_entry); |
| 1654 | 1730 | ||
| 1655 | return result; | 1731 | if (unlikely(!ret)) { |
| 1732 | if (!trace->nr_entries) { | ||
| 1733 | /* | ||
| 1734 | * If save_trace fails here, the printing might | ||
| 1735 | * trigger a WARN but because of the !nr_entries it | ||
| 1736 | * should not do bad things. | ||
| 1737 | */ | ||
| 1738 | save_trace(trace); | ||
| 1739 | } | ||
| 1740 | |||
| 1741 | print_circular_bug(&src_entry, target_entry, src, target); | ||
| 1742 | } | ||
| 1743 | |||
| 1744 | return ret; | ||
| 1656 | } | 1745 | } |
| 1657 | 1746 | ||
| 1747 | #ifdef CONFIG_LOCKDEP_SMALL | ||
| 1748 | /* | ||
| 1749 | * Check that the dependency graph starting at <src> can lead to | ||
| 1750 | * <target> or not. If it can, <src> -> <target> dependency is already | ||
| 1751 | * in the graph. | ||
| 1752 | * | ||
| 1753 | * Print an error and return 2 if it does or 1 if it does not. | ||
| 1754 | */ | ||
| 1658 | static noinline int | 1755 | static noinline int |
| 1659 | check_redundant(struct lock_list *root, struct lock_class *target, | 1756 | check_redundant(struct held_lock *src, struct held_lock *target) |
| 1660 | struct lock_list **target_entry) | ||
| 1661 | { | 1757 | { |
| 1662 | int result; | 1758 | int ret; |
| 1759 | struct lock_list *uninitialized_var(target_entry); | ||
| 1760 | struct lock_list src_entry = { | ||
| 1761 | .class = hlock_class(src), | ||
| 1762 | .parent = NULL, | ||
| 1763 | }; | ||
| 1663 | 1764 | ||
| 1664 | debug_atomic_inc(nr_redundant_checks); | 1765 | debug_atomic_inc(nr_redundant_checks); |
| 1665 | 1766 | ||
| 1666 | result = __bfs_forwards(root, target, class_equal, target_entry); | 1767 | ret = check_path(hlock_class(target), &src_entry, &target_entry); |
| 1667 | 1768 | ||
| 1668 | return result; | 1769 | if (!ret) { |
| 1770 | debug_atomic_inc(nr_redundant); | ||
| 1771 | ret = 2; | ||
| 1772 | } else if (ret < 0) | ||
| 1773 | ret = 0; | ||
| 1774 | |||
| 1775 | return ret; | ||
| 1669 | } | 1776 | } |
| 1777 | #endif | ||
| 1670 | 1778 | ||
| 1671 | #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) | 1779 | #ifdef CONFIG_TRACE_IRQFLAGS |
| 1672 | 1780 | ||
| 1673 | static inline int usage_accumulate(struct lock_list *entry, void *mask) | 1781 | static inline int usage_accumulate(struct lock_list *entry, void *mask) |
| 1674 | { | 1782 | { |
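With check_path() factored out above, both check_noncircular() and check_redundant() become a BFS from <next>'s class looking for <prev>'s class. The userspace analogue below shows the same idea on a toy adjacency matrix (all names are made up; this is not the lockdep implementation): refuse to record a prev -> next dependency when next already reaches prev, because adding the edge would close a cycle.

#include <stdio.h>

#define NR_NODES 8

static int adj[NR_NODES][NR_NODES];     /* adj[a][b]: dependency a -> b */

/* Returns 1 if 'target' is reachable from 'src' via BFS over the edges. */
static int reachable(int src, int target)
{
        int queue[NR_NODES], visited[NR_NODES] = { 0 };
        int head = 0, tail = 0;

        queue[tail++] = src;
        visited[src] = 1;

        while (head < tail) {
                int cur = queue[head++];

                if (cur == target)
                        return 1;
                for (int next = 0; next < NR_NODES; next++) {
                        if (adj[cur][next] && !visited[next]) {
                                visited[next] = 1;
                                queue[tail++] = next;
                        }
                }
        }
        return 0;
}

/* Analogue of adding the <prev> -> <next> dependency with a cycle check. */
static int add_dependency(int prev, int next)
{
        if (reachable(next, prev)) {
                printf("refusing %d -> %d: would create a cycle\n", prev, next);
                return 0;
        }
        adj[prev][next] = 1;
        return 1;
}

int main(void)
{
        add_dependency(0, 1);
        add_dependency(1, 2);
        add_dependency(2, 0);   /* rejected: 0 is already reachable from 2 */
        return 0;
}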
| @@ -1765,7 +1873,7 @@ static void print_lock_class_header(struct lock_class *class, int depth) | |||
| 1765 | */ | 1873 | */ |
| 1766 | static void __used | 1874 | static void __used |
| 1767 | print_shortest_lock_dependencies(struct lock_list *leaf, | 1875 | print_shortest_lock_dependencies(struct lock_list *leaf, |
| 1768 | struct lock_list *root) | 1876 | struct lock_list *root) |
| 1769 | { | 1877 | { |
| 1770 | struct lock_list *entry = leaf; | 1878 | struct lock_list *entry = leaf; |
| 1771 | int depth; | 1879 | int depth; |
| @@ -1787,8 +1895,6 @@ print_shortest_lock_dependencies(struct lock_list *leaf, | |||
| 1787 | entry = get_lock_parent(entry); | 1895 | entry = get_lock_parent(entry); |
| 1788 | depth--; | 1896 | depth--; |
| 1789 | } while (entry && (depth >= 0)); | 1897 | } while (entry && (depth >= 0)); |
| 1790 | |||
| 1791 | return; | ||
| 1792 | } | 1898 | } |
| 1793 | 1899 | ||
| 1794 | static void | 1900 | static void |
| @@ -1847,7 +1953,7 @@ print_irq_lock_scenario(struct lock_list *safe_entry, | |||
| 1847 | printk("\n *** DEADLOCK ***\n\n"); | 1953 | printk("\n *** DEADLOCK ***\n\n"); |
| 1848 | } | 1954 | } |
| 1849 | 1955 | ||
| 1850 | static int | 1956 | static void |
| 1851 | print_bad_irq_dependency(struct task_struct *curr, | 1957 | print_bad_irq_dependency(struct task_struct *curr, |
| 1852 | struct lock_list *prev_root, | 1958 | struct lock_list *prev_root, |
| 1853 | struct lock_list *next_root, | 1959 | struct lock_list *next_root, |
| @@ -1860,7 +1966,7 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
| 1860 | const char *irqclass) | 1966 | const char *irqclass) |
| 1861 | { | 1967 | { |
| 1862 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 1968 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 1863 | return 0; | 1969 | return; |
| 1864 | 1970 | ||
| 1865 | pr_warn("\n"); | 1971 | pr_warn("\n"); |
| 1866 | pr_warn("=====================================================\n"); | 1972 | pr_warn("=====================================================\n"); |
| @@ -1906,19 +2012,17 @@ print_bad_irq_dependency(struct task_struct *curr, | |||
| 1906 | 2012 | ||
| 1907 | pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); | 2013 | pr_warn("\nthe dependencies between %s-irq-safe lock and the holding lock:\n", irqclass); |
| 1908 | if (!save_trace(&prev_root->trace)) | 2014 | if (!save_trace(&prev_root->trace)) |
| 1909 | return 0; | 2015 | return; |
| 1910 | print_shortest_lock_dependencies(backwards_entry, prev_root); | 2016 | print_shortest_lock_dependencies(backwards_entry, prev_root); |
| 1911 | 2017 | ||
| 1912 | pr_warn("\nthe dependencies between the lock to be acquired"); | 2018 | pr_warn("\nthe dependencies between the lock to be acquired"); |
| 1913 | pr_warn(" and %s-irq-unsafe lock:\n", irqclass); | 2019 | pr_warn(" and %s-irq-unsafe lock:\n", irqclass); |
| 1914 | if (!save_trace(&next_root->trace)) | 2020 | if (!save_trace(&next_root->trace)) |
| 1915 | return 0; | 2021 | return; |
| 1916 | print_shortest_lock_dependencies(forwards_entry, next_root); | 2022 | print_shortest_lock_dependencies(forwards_entry, next_root); |
| 1917 | 2023 | ||
| 1918 | pr_warn("\nstack backtrace:\n"); | 2024 | pr_warn("\nstack backtrace:\n"); |
| 1919 | dump_stack(); | 2025 | dump_stack(); |
| 1920 | |||
| 1921 | return 0; | ||
| 1922 | } | 2026 | } |
| 1923 | 2027 | ||
| 1924 | static const char *state_names[] = { | 2028 | static const char *state_names[] = { |
| @@ -2065,8 +2169,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, | |||
| 2065 | this.class = hlock_class(prev); | 2169 | this.class = hlock_class(prev); |
| 2066 | 2170 | ||
| 2067 | ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); | 2171 | ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); |
| 2068 | if (ret < 0) | 2172 | if (ret < 0) { |
| 2069 | return print_bfs_bug(ret); | 2173 | print_bfs_bug(ret); |
| 2174 | return 0; | ||
| 2175 | } | ||
| 2070 | 2176 | ||
| 2071 | usage_mask &= LOCKF_USED_IN_IRQ_ALL; | 2177 | usage_mask &= LOCKF_USED_IN_IRQ_ALL; |
| 2072 | if (!usage_mask) | 2178 | if (!usage_mask) |
| @@ -2082,8 +2188,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, | |||
| 2082 | that.class = hlock_class(next); | 2188 | that.class = hlock_class(next); |
| 2083 | 2189 | ||
| 2084 | ret = find_usage_forwards(&that, forward_mask, &target_entry1); | 2190 | ret = find_usage_forwards(&that, forward_mask, &target_entry1); |
| 2085 | if (ret < 0) | 2191 | if (ret < 0) { |
| 2086 | return print_bfs_bug(ret); | 2192 | print_bfs_bug(ret); |
| 2193 | return 0; | ||
| 2194 | } | ||
| 2087 | if (ret == 1) | 2195 | if (ret == 1) |
| 2088 | return ret; | 2196 | return ret; |
| 2089 | 2197 | ||
| @@ -2095,8 +2203,10 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, | |||
| 2095 | backward_mask = original_mask(target_entry1->class->usage_mask); | 2203 | backward_mask = original_mask(target_entry1->class->usage_mask); |
| 2096 | 2204 | ||
| 2097 | ret = find_usage_backwards(&this, backward_mask, &target_entry); | 2205 | ret = find_usage_backwards(&this, backward_mask, &target_entry); |
| 2098 | if (ret < 0) | 2206 | if (ret < 0) { |
| 2099 | return print_bfs_bug(ret); | 2207 | print_bfs_bug(ret); |
| 2208 | return 0; | ||
| 2209 | } | ||
| 2100 | if (DEBUG_LOCKS_WARN_ON(ret == 1)) | 2210 | if (DEBUG_LOCKS_WARN_ON(ret == 1)) |
| 2101 | return 1; | 2211 | return 1; |
| 2102 | 2212 | ||
| @@ -2110,11 +2220,13 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, | |||
| 2110 | if (DEBUG_LOCKS_WARN_ON(ret == -1)) | 2220 | if (DEBUG_LOCKS_WARN_ON(ret == -1)) |
| 2111 | return 1; | 2221 | return 1; |
| 2112 | 2222 | ||
| 2113 | return print_bad_irq_dependency(curr, &this, &that, | 2223 | print_bad_irq_dependency(curr, &this, &that, |
| 2114 | target_entry, target_entry1, | 2224 | target_entry, target_entry1, |
| 2115 | prev, next, | 2225 | prev, next, |
| 2116 | backward_bit, forward_bit, | 2226 | backward_bit, forward_bit, |
| 2117 | state_name(backward_bit)); | 2227 | state_name(backward_bit)); |
| 2228 | |||
| 2229 | return 0; | ||
| 2118 | } | 2230 | } |
| 2119 | 2231 | ||
| 2120 | static void inc_chains(void) | 2232 | static void inc_chains(void) |
| @@ -2142,11 +2254,10 @@ static inline void inc_chains(void) | |||
| 2142 | nr_process_chains++; | 2254 | nr_process_chains++; |
| 2143 | } | 2255 | } |
| 2144 | 2256 | ||
| 2145 | #endif | 2257 | #endif /* CONFIG_TRACE_IRQFLAGS */ |
| 2146 | 2258 | ||
| 2147 | static void | 2259 | static void |
| 2148 | print_deadlock_scenario(struct held_lock *nxt, | 2260 | print_deadlock_scenario(struct held_lock *nxt, struct held_lock *prv) |
| 2149 | struct held_lock *prv) | ||
| 2150 | { | 2261 | { |
| 2151 | struct lock_class *next = hlock_class(nxt); | 2262 | struct lock_class *next = hlock_class(nxt); |
| 2152 | struct lock_class *prev = hlock_class(prv); | 2263 | struct lock_class *prev = hlock_class(prv); |
| @@ -2164,12 +2275,12 @@ print_deadlock_scenario(struct held_lock *nxt, | |||
| 2164 | printk(" May be due to missing lock nesting notation\n\n"); | 2275 | printk(" May be due to missing lock nesting notation\n\n"); |
| 2165 | } | 2276 | } |
| 2166 | 2277 | ||
| 2167 | static int | 2278 | static void |
| 2168 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | 2279 | print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, |
| 2169 | struct held_lock *next) | 2280 | struct held_lock *next) |
| 2170 | { | 2281 | { |
| 2171 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 2282 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 2172 | return 0; | 2283 | return; |
| 2173 | 2284 | ||
| 2174 | pr_warn("\n"); | 2285 | pr_warn("\n"); |
| 2175 | pr_warn("============================================\n"); | 2286 | pr_warn("============================================\n"); |
| @@ -2188,8 +2299,6 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
| 2188 | 2299 | ||
| 2189 | pr_warn("\nstack backtrace:\n"); | 2300 | pr_warn("\nstack backtrace:\n"); |
| 2190 | dump_stack(); | 2301 | dump_stack(); |
| 2191 | |||
| 2192 | return 0; | ||
| 2193 | } | 2302 | } |
| 2194 | 2303 | ||
| 2195 | /* | 2304 | /* |
| @@ -2201,8 +2310,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, | |||
| 2201 | * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read | 2310 | * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read |
| 2202 | */ | 2311 | */ |
| 2203 | static int | 2312 | static int |
| 2204 | check_deadlock(struct task_struct *curr, struct held_lock *next, | 2313 | check_deadlock(struct task_struct *curr, struct held_lock *next) |
| 2205 | struct lockdep_map *next_instance, int read) | ||
| 2206 | { | 2314 | { |
| 2207 | struct held_lock *prev; | 2315 | struct held_lock *prev; |
| 2208 | struct held_lock *nest = NULL; | 2316 | struct held_lock *nest = NULL; |
| @@ -2221,7 +2329,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, | |||
| 2221 | * Allow read-after-read recursion of the same | 2329 | * Allow read-after-read recursion of the same |
| 2222 | * lock class (i.e. read_lock(lock)+read_lock(lock)): | 2330 | * lock class (i.e. read_lock(lock)+read_lock(lock)): |
| 2223 | */ | 2331 | */ |
| 2224 | if ((read == 2) && prev->read) | 2332 | if ((next->read == 2) && prev->read) |
| 2225 | return 2; | 2333 | return 2; |
| 2226 | 2334 | ||
| 2227 | /* | 2335 | /* |
| @@ -2231,14 +2339,15 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, | |||
| 2231 | if (nest) | 2339 | if (nest) |
| 2232 | return 2; | 2340 | return 2; |
| 2233 | 2341 | ||
| 2234 | return print_deadlock_bug(curr, prev, next); | 2342 | print_deadlock_bug(curr, prev, next); |
| 2343 | return 0; | ||
| 2235 | } | 2344 | } |
| 2236 | return 1; | 2345 | return 1; |
| 2237 | } | 2346 | } |
| 2238 | 2347 | ||
| 2239 | /* | 2348 | /* |
| 2240 | * There was a chain-cache miss, and we are about to add a new dependency | 2349 | * There was a chain-cache miss, and we are about to add a new dependency |
| 2241 | * to a previous lock. We recursively validate the following rules: | 2350 | * to a previous lock. We validate the following rules: |
| 2242 | * | 2351 | * |
| 2243 | * - would the adding of the <prev> -> <next> dependency create a | 2352 | * - would the adding of the <prev> -> <next> dependency create a |
| 2244 | * circular dependency in the graph? [== circular deadlock] | 2353 | * circular dependency in the graph? [== circular deadlock] |
| @@ -2262,9 +2371,7 @@ static int | |||
| 2262 | check_prev_add(struct task_struct *curr, struct held_lock *prev, | 2371 | check_prev_add(struct task_struct *curr, struct held_lock *prev, |
| 2263 | struct held_lock *next, int distance, struct lock_trace *trace) | 2372 | struct held_lock *next, int distance, struct lock_trace *trace) |
| 2264 | { | 2373 | { |
| 2265 | struct lock_list *uninitialized_var(target_entry); | ||
| 2266 | struct lock_list *entry; | 2374 | struct lock_list *entry; |
| 2267 | struct lock_list this; | ||
| 2268 | int ret; | 2375 | int ret; |
| 2269 | 2376 | ||
| 2270 | if (!hlock_class(prev)->key || !hlock_class(next)->key) { | 2377 | if (!hlock_class(prev)->key || !hlock_class(next)->key) { |
| @@ -2288,28 +2395,16 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
| 2288 | /* | 2395 | /* |
| 2289 | * Prove that the new <prev> -> <next> dependency would not | 2396 | * Prove that the new <prev> -> <next> dependency would not |
| 2290 | * create a circular dependency in the graph. (We do this by | 2397 | * create a circular dependency in the graph. (We do this by |
| 2291 | * forward-recursing into the graph starting at <next>, and | 2398 | * a breadth-first search into the graph starting at <next>, |
| 2292 | * checking whether we can reach <prev>.) | 2399 | * and check whether we can reach <prev>.) |
| 2293 | * | 2400 | * |
| 2294 | * We are using global variables to control the recursion, to | 2401 | * The search is limited by the size of the circular queue (i.e., |
| 2295 | * keep the stackframe size of the recursive functions low: | 2402 | * MAX_CIRCULAR_QUEUE_SIZE) which keeps track of a breadth of nodes |
| 2403 | * in the graph whose neighbours are to be checked. | ||
| 2296 | */ | 2404 | */ |
| 2297 | this.class = hlock_class(next); | 2405 | ret = check_noncircular(next, prev, trace); |
| 2298 | this.parent = NULL; | 2406 | if (unlikely(ret <= 0)) |
| 2299 | ret = check_noncircular(&this, hlock_class(prev), &target_entry); | 2407 | return 0; |
| 2300 | if (unlikely(!ret)) { | ||
| 2301 | if (!trace->nr_entries) { | ||
| 2302 | /* | ||
| 2303 | * If save_trace fails here, the printing might | ||
| 2304 | * trigger a WARN but because of the !nr_entries it | ||
| 2305 | * should not do bad things. | ||
| 2306 | */ | ||
| 2307 | save_trace(trace); | ||
| 2308 | } | ||
| 2309 | return print_circular_bug(&this, target_entry, next, prev); | ||
| 2310 | } | ||
| 2311 | else if (unlikely(ret < 0)) | ||
| 2312 | return print_bfs_bug(ret); | ||
| 2313 | 2408 | ||
| 2314 | if (!check_irq_usage(curr, prev, next)) | 2409 | if (!check_irq_usage(curr, prev, next)) |
| 2315 | return 0; | 2410 | return 0; |
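With the recursion removed, cycle detection is a queue-bounded breadth-first search. A minimal stand-alone sketch of that idea (graph size, queue size and the example edges are invented; the real check_noncircular() walks lock_list entries rather than an adjacency matrix):

#include <stdio.h>
#include <stdbool.h>

#define NR_NODES	8
#define QUEUE_SIZE	16	/* plays the role of MAX_CIRCULAR_QUEUE_SIZE */

static bool edge[NR_NODES][NR_NODES];	/* edge[a][b]: dependency a -> b */

/* Breadth-first search from 'src'; returns true if 'target' is reachable. */
static bool reachable(int src, int target)
{
	int queue[QUEUE_SIZE];
	unsigned int head = 0, tail = 0;
	bool seen[NR_NODES] = { false };

	queue[tail++ % QUEUE_SIZE] = src;
	seen[src] = true;

	while (head < tail) {
		int n = queue[head++ % QUEUE_SIZE];

		if (n == target)
			return true;
		for (int m = 0; m < NR_NODES; m++) {
			if (!edge[n][m] || seen[m])
				continue;
			if (tail - head >= QUEUE_SIZE)
				return false;	/* queue exhausted: bail out, like the BFS error path */
			seen[m] = true;
			queue[tail++ % QUEUE_SIZE] = m;
		}
	}
	return false;
}

int main(void)
{
	edge[0][1] = edge[1][2] = true;		/* existing chain: L0 -> L1 -> L2 */

	/* Adding L2 -> L0 closes a cycle iff L2 is already reachable from L0: */
	printf("adding 2->0 creates a cycle: %s\n", reachable(0, 2) ? "yes" : "no");
	return 0;
}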
| @@ -2340,19 +2435,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, | |||
| 2340 | } | 2435 | } |
| 2341 | } | 2436 | } |
| 2342 | 2437 | ||
| 2438 | #ifdef CONFIG_LOCKDEP_SMALL | ||
| 2343 | /* | 2439 | /* |
| 2344 | * Is the <prev> -> <next> link redundant? | 2440 | * Is the <prev> -> <next> link redundant? |
| 2345 | */ | 2441 | */ |
| 2346 | this.class = hlock_class(prev); | 2442 | ret = check_redundant(prev, next); |
| 2347 | this.parent = NULL; | 2443 | if (ret != 1) |
| 2348 | ret = check_redundant(&this, hlock_class(next), &target_entry); | 2444 | return ret; |
| 2349 | if (!ret) { | 2445 | #endif |
| 2350 | debug_atomic_inc(nr_redundant); | ||
| 2351 | return 2; | ||
| 2352 | } | ||
| 2353 | if (ret < 0) | ||
| 2354 | return print_bfs_bug(ret); | ||
| 2355 | |||
| 2356 | 2446 | ||
| 2357 | if (!trace->nr_entries && !save_trace(trace)) | 2447 | if (!trace->nr_entries && !save_trace(trace)) |
| 2358 | return 0; | 2448 | return 0; |
| @@ -2504,12 +2594,13 @@ static void | |||
| 2504 | print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) | 2594 | print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) |
| 2505 | { | 2595 | { |
| 2506 | struct held_lock *hlock; | 2596 | struct held_lock *hlock; |
| 2507 | u64 chain_key = 0; | 2597 | u64 chain_key = INITIAL_CHAIN_KEY; |
| 2508 | int depth = curr->lockdep_depth; | 2598 | int depth = curr->lockdep_depth; |
| 2509 | int i; | 2599 | int i = get_first_held_lock(curr, hlock_next); |
| 2510 | 2600 | ||
| 2511 | printk("depth: %u\n", depth + 1); | 2601 | printk("depth: %u (irq_context %u)\n", depth - i + 1, |
| 2512 | for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) { | 2602 | hlock_next->irq_context); |
| 2603 | for (; i < depth; i++) { | ||
| 2513 | hlock = curr->held_locks + i; | 2604 | hlock = curr->held_locks + i; |
| 2514 | chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); | 2605 | chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); |
| 2515 | 2606 | ||
| @@ -2523,13 +2614,13 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne | |||
| 2523 | static void print_chain_keys_chain(struct lock_chain *chain) | 2614 | static void print_chain_keys_chain(struct lock_chain *chain) |
| 2524 | { | 2615 | { |
| 2525 | int i; | 2616 | int i; |
| 2526 | u64 chain_key = 0; | 2617 | u64 chain_key = INITIAL_CHAIN_KEY; |
| 2527 | int class_id; | 2618 | int class_id; |
| 2528 | 2619 | ||
| 2529 | printk("depth: %u\n", chain->depth); | 2620 | printk("depth: %u\n", chain->depth); |
| 2530 | for (i = 0; i < chain->depth; i++) { | 2621 | for (i = 0; i < chain->depth; i++) { |
| 2531 | class_id = chain_hlocks[chain->base + i]; | 2622 | class_id = chain_hlocks[chain->base + i]; |
| 2532 | chain_key = print_chain_key_iteration(class_id + 1, chain_key); | 2623 | chain_key = print_chain_key_iteration(class_id, chain_key); |
| 2533 | 2624 | ||
| 2534 | print_lock_name(lock_classes + class_id); | 2625 | print_lock_name(lock_classes + class_id); |
| 2535 | printk("\n"); | 2626 | printk("\n"); |
| @@ -2580,7 +2671,7 @@ static int check_no_collision(struct task_struct *curr, | |||
| 2580 | } | 2671 | } |
| 2581 | 2672 | ||
| 2582 | for (j = 0; j < chain->depth - 1; j++, i++) { | 2673 | for (j = 0; j < chain->depth - 1; j++, i++) { |
| 2583 | id = curr->held_locks[i].class_idx - 1; | 2674 | id = curr->held_locks[i].class_idx; |
| 2584 | 2675 | ||
| 2585 | if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { | 2676 | if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { |
| 2586 | print_collision(curr, hlock, chain); | 2677 | print_collision(curr, hlock, chain); |
| @@ -2663,7 +2754,7 @@ static inline int add_chain_cache(struct task_struct *curr, | |||
| 2663 | if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { | 2754 | if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { |
| 2664 | chain->base = nr_chain_hlocks; | 2755 | chain->base = nr_chain_hlocks; |
| 2665 | for (j = 0; j < chain->depth - 1; j++, i++) { | 2756 | for (j = 0; j < chain->depth - 1; j++, i++) { |
| 2666 | int lock_id = curr->held_locks[i].class_idx - 1; | 2757 | int lock_id = curr->held_locks[i].class_idx; |
| 2667 | chain_hlocks[chain->base + j] = lock_id; | 2758 | chain_hlocks[chain->base + j] = lock_id; |
| 2668 | } | 2759 | } |
| 2669 | chain_hlocks[chain->base + j] = class - lock_classes; | 2760 | chain_hlocks[chain->base + j] = class - lock_classes; |
| @@ -2753,8 +2844,9 @@ cache_hit: | |||
| 2753 | return 1; | 2844 | return 1; |
| 2754 | } | 2845 | } |
| 2755 | 2846 | ||
| 2756 | static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, | 2847 | static int validate_chain(struct task_struct *curr, |
| 2757 | struct held_lock *hlock, int chain_head, u64 chain_key) | 2848 | struct held_lock *hlock, |
| 2849 | int chain_head, u64 chain_key) | ||
| 2758 | { | 2850 | { |
| 2759 | /* | 2851 | /* |
| 2760 | * Trylock needs to maintain the stack of held locks, but it | 2852 | * Trylock needs to maintain the stack of held locks, but it |
| @@ -2775,12 +2867,18 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, | |||
| 2775 | * - is softirq-safe, if this lock is hardirq-unsafe | 2867 | * - is softirq-safe, if this lock is hardirq-unsafe |
| 2776 | * | 2868 | * |
| 2777 | * And check whether the new lock's dependency graph | 2869 | * And check whether the new lock's dependency graph |
| 2778 | * could lead back to the previous lock. | 2870 | * could lead back to the previous lock: |
| 2779 | * | 2871 | * |
| 2780 | * any of these scenarios could lead to a deadlock. If | 2872 | * - within the current held-lock stack |
| 2781 | * All validations | 2873 | * - across our accumulated lock dependency records |
| 2874 | * | ||
| 2875 | * any of these scenarios could lead to a deadlock. | ||
| 2782 | */ | 2876 | */ |
| 2783 | int ret = check_deadlock(curr, hlock, lock, hlock->read); | 2877 | /* |
| 2878 | * The simple case: does the current hold the same lock | ||
| 2879 | * already? | ||
| 2880 | */ | ||
| 2881 | int ret = check_deadlock(curr, hlock); | ||
| 2784 | 2882 | ||
| 2785 | if (!ret) | 2883 | if (!ret) |
| 2786 | return 0; | 2884 | return 0; |
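The irq-safety rules listed in the comment boil down to a usage-mask conflict test. A toy version of the dependency it forbids (the bit names echo lockdep's usage bits, but the layout and helper are invented for the example):

#include <stdio.h>

#define LOCK_USED_IN_HARDIRQ	(1u << 0)	/* lock has been taken from hardirq context */
#define LOCK_ENABLED_HARDIRQ	(1u << 1)	/* lock has been taken with hardirqs enabled */

/* A hardirq-safe lock must never end up waiting on a hardirq-unsafe one. */
static int irq_usage_conflict(unsigned int prev_usage, unsigned int next_usage)
{
	return (prev_usage & LOCK_USED_IN_HARDIRQ) &&
	       (next_usage & LOCK_ENABLED_HARDIRQ);
}

int main(void)
{
	/* hardirq-safe prev -> hardirq-unsafe next: the inversion that gets rejected */
	printf("%d\n", irq_usage_conflict(LOCK_USED_IN_HARDIRQ, LOCK_ENABLED_HARDIRQ));
	/* the opposite pairing on its own is unobjectionable */
	printf("%d\n", irq_usage_conflict(LOCK_ENABLED_HARDIRQ, LOCK_USED_IN_HARDIRQ));
	return 0;
}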
| @@ -2811,16 +2909,12 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, | |||
| 2811 | } | 2909 | } |
| 2812 | #else | 2910 | #else |
| 2813 | static inline int validate_chain(struct task_struct *curr, | 2911 | static inline int validate_chain(struct task_struct *curr, |
| 2814 | struct lockdep_map *lock, struct held_lock *hlock, | 2912 | struct held_lock *hlock, |
| 2815 | int chain_head, u64 chain_key) | 2913 | int chain_head, u64 chain_key) |
| 2816 | { | 2914 | { |
| 2817 | return 1; | 2915 | return 1; |
| 2818 | } | 2916 | } |
| 2819 | 2917 | #endif /* CONFIG_PROVE_LOCKING */ | |
| 2820 | static void print_lock_trace(struct lock_trace *trace, unsigned int spaces) | ||
| 2821 | { | ||
| 2822 | } | ||
| 2823 | #endif | ||
| 2824 | 2918 | ||
| 2825 | /* | 2919 | /* |
| 2826 | * We are building curr_chain_key incrementally, so double-check | 2920 | * We are building curr_chain_key incrementally, so double-check |
| @@ -2831,7 +2925,7 @@ static void check_chain_key(struct task_struct *curr) | |||
| 2831 | #ifdef CONFIG_DEBUG_LOCKDEP | 2925 | #ifdef CONFIG_DEBUG_LOCKDEP |
| 2832 | struct held_lock *hlock, *prev_hlock = NULL; | 2926 | struct held_lock *hlock, *prev_hlock = NULL; |
| 2833 | unsigned int i; | 2927 | unsigned int i; |
| 2834 | u64 chain_key = 0; | 2928 | u64 chain_key = INITIAL_CHAIN_KEY; |
| 2835 | 2929 | ||
| 2836 | for (i = 0; i < curr->lockdep_depth; i++) { | 2930 | for (i = 0; i < curr->lockdep_depth; i++) { |
| 2837 | hlock = curr->held_locks + i; | 2931 | hlock = curr->held_locks + i; |
| @@ -2847,15 +2941,17 @@ static void check_chain_key(struct task_struct *curr) | |||
| 2847 | (unsigned long long)hlock->prev_chain_key); | 2941 | (unsigned long long)hlock->prev_chain_key); |
| 2848 | return; | 2942 | return; |
| 2849 | } | 2943 | } |
| 2944 | |||
| 2850 | /* | 2945 | /* |
| 2851 | * Whoops ran out of static storage again? | 2946 | * hlock->class_idx can't go beyond MAX_LOCKDEP_KEYS, but is |
| 2947 | * it a registered lock class index? | ||
| 2852 | */ | 2948 | */ |
| 2853 | if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) | 2949 | if (DEBUG_LOCKS_WARN_ON(!test_bit(hlock->class_idx, lock_classes_in_use))) |
| 2854 | return; | 2950 | return; |
| 2855 | 2951 | ||
| 2856 | if (prev_hlock && (prev_hlock->irq_context != | 2952 | if (prev_hlock && (prev_hlock->irq_context != |
| 2857 | hlock->irq_context)) | 2953 | hlock->irq_context)) |
| 2858 | chain_key = 0; | 2954 | chain_key = INITIAL_CHAIN_KEY; |
| 2859 | chain_key = iterate_chain_key(chain_key, hlock->class_idx); | 2955 | chain_key = iterate_chain_key(chain_key, hlock->class_idx); |
| 2860 | prev_hlock = hlock; | 2956 | prev_hlock = hlock; |
| 2861 | } | 2957 | } |
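check_chain_key() now seeds the running hash with INITIAL_CHAIN_KEY and feeds unadjusted class indexes into it. A stand-alone sketch of the incremental hashing idea (the mixing function and the sentinel value are placeholders, not the kernel's iterate_chain_key()):

#include <stdio.h>
#include <stdint.h>

#define INITIAL_CHAIN_KEY	((uint64_t)-1)	/* sentinel assumed for the example */

static uint64_t iterate_chain_key(uint64_t key, unsigned int class_idx)
{
	/* any decent 64-bit mixer will do for the illustration */
	return key ^ (class_idx + 0x9e3779b97f4a7c15ULL + (key << 6) + (key >> 2));
}

int main(void)
{
	unsigned int held_classes[] = { 3, 17, 42 };	/* made-up lock class indexes */
	uint64_t chain_key = INITIAL_CHAIN_KEY;

	/* Fold each held lock's class index into the running key... */
	for (size_t i = 0; i < sizeof(held_classes) / sizeof(held_classes[0]); i++)
		chain_key = iterate_chain_key(chain_key, held_classes[i]);

	/*
	 * ...the result identifies this exact sequence of classes, so the
	 * chain cache can be consulted instead of redoing the full checks.
	 */
	printf("chain_key = %016llx\n", (unsigned long long)chain_key);
	return 0;
}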
| @@ -2873,14 +2969,11 @@ static void check_chain_key(struct task_struct *curr) | |||
| 2873 | #endif | 2969 | #endif |
| 2874 | } | 2970 | } |
| 2875 | 2971 | ||
| 2972 | #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) | ||
| 2876 | static int mark_lock(struct task_struct *curr, struct held_lock *this, | 2973 | static int mark_lock(struct task_struct *curr, struct held_lock *this, |
| 2877 | enum lock_usage_bit new_bit); | 2974 | enum lock_usage_bit new_bit); |
| 2878 | 2975 | ||
| 2879 | #if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) | 2976 | static void print_usage_bug_scenario(struct held_lock *lock) |
| 2880 | |||
| 2881 | |||
| 2882 | static void | ||
| 2883 | print_usage_bug_scenario(struct held_lock *lock) | ||
| 2884 | { | 2977 | { |
| 2885 | struct lock_class *class = hlock_class(lock); | 2978 | struct lock_class *class = hlock_class(lock); |
| 2886 | 2979 | ||
| @@ -2897,12 +2990,12 @@ print_usage_bug_scenario(struct held_lock *lock) | |||
| 2897 | printk("\n *** DEADLOCK ***\n\n"); | 2990 | printk("\n *** DEADLOCK ***\n\n"); |
| 2898 | } | 2991 | } |
| 2899 | 2992 | ||
| 2900 | static int | 2993 | static void |
| 2901 | print_usage_bug(struct task_struct *curr, struct held_lock *this, | 2994 | print_usage_bug(struct task_struct *curr, struct held_lock *this, |
| 2902 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) | 2995 | enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) |
| 2903 | { | 2996 | { |
| 2904 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 2997 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 2905 | return 0; | 2998 | return; |
| 2906 | 2999 | ||
| 2907 | pr_warn("\n"); | 3000 | pr_warn("\n"); |
| 2908 | pr_warn("================================\n"); | 3001 | pr_warn("================================\n"); |
| @@ -2932,8 +3025,6 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, | |||
| 2932 | 3025 | ||
| 2933 | pr_warn("\nstack backtrace:\n"); | 3026 | pr_warn("\nstack backtrace:\n"); |
| 2934 | dump_stack(); | 3027 | dump_stack(); |
| 2935 | |||
| 2936 | return 0; | ||
| 2937 | } | 3028 | } |
| 2938 | 3029 | ||
| 2939 | /* | 3030 | /* |
| @@ -2943,8 +3034,10 @@ static inline int | |||
| 2943 | valid_state(struct task_struct *curr, struct held_lock *this, | 3034 | valid_state(struct task_struct *curr, struct held_lock *this, |
| 2944 | enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) | 3035 | enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) |
| 2945 | { | 3036 | { |
| 2946 | if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) | 3037 | if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) { |
| 2947 | return print_usage_bug(curr, this, bad_bit, new_bit); | 3038 | print_usage_bug(curr, this, bad_bit, new_bit); |
| 3039 | return 0; | ||
| 3040 | } | ||
| 2948 | return 1; | 3041 | return 1; |
| 2949 | } | 3042 | } |
| 2950 | 3043 | ||
| @@ -2952,7 +3045,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, | |||
| 2952 | /* | 3045 | /* |
| 2953 | * print irq inversion bug: | 3046 | * print irq inversion bug: |
| 2954 | */ | 3047 | */ |
| 2955 | static int | 3048 | static void |
| 2956 | print_irq_inversion_bug(struct task_struct *curr, | 3049 | print_irq_inversion_bug(struct task_struct *curr, |
| 2957 | struct lock_list *root, struct lock_list *other, | 3050 | struct lock_list *root, struct lock_list *other, |
| 2958 | struct held_lock *this, int forwards, | 3051 | struct held_lock *this, int forwards, |
| @@ -2963,7 +3056,7 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
| 2963 | int depth; | 3056 | int depth; |
| 2964 | 3057 | ||
| 2965 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) | 3058 | if (!debug_locks_off_graph_unlock() || debug_locks_silent) |
| 2966 | return 0; | 3059 | return; |
| 2967 | 3060 | ||
| 2968 | pr_warn("\n"); | 3061 | pr_warn("\n"); |
| 2969 | pr_warn("========================================================\n"); | 3062 | pr_warn("========================================================\n"); |
| @@ -3004,13 +3097,11 @@ print_irq_inversion_bug(struct task_struct *curr, | |||
| 3004 | 3097 | ||
| 3005 | pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); | 3098 | pr_warn("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); |
| 3006 | if (!save_trace(&root->trace)) | 3099 | if (!save_trace(&root->trace)) |
| 3007 | return 0; | 3100 | return; |
| 3008 | print_shortest_lock_dependencies(other, root); | 3101 | print_shortest_lock_dependencies(other, root); |
| 3009 | 3102 | ||
| 3010 | pr_warn("\nstack backtrace:\n"); | 3103 | pr_warn("\nstack backtrace:\n"); |
| 3011 | dump_stack(); | 3104 | dump_stack(); |
| 3012 | |||
| 3013 | return 0; | ||
| 3014 | } | 3105 | } |
| 3015 | 3106 | ||
| 3016 | /* | 3107 | /* |
| @@ -3028,13 +3119,16 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this, | |||
| 3028 | root.parent = NULL; | 3119 | root.parent = NULL; |
| 3029 | root.class = hlock_class(this); | 3120 | root.class = hlock_class(this); |
| 3030 | ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); | 3121 | ret = find_usage_forwards(&root, lock_flag(bit), &target_entry); |
| 3031 | if (ret < 0) | 3122 | if (ret < 0) { |
| 3032 | return print_bfs_bug(ret); | 3123 | print_bfs_bug(ret); |
| 3124 | return 0; | ||
| 3125 | } | ||
| 3033 | if (ret == 1) | 3126 | if (ret == 1) |
| 3034 | return ret; | 3127 | return ret; |
| 3035 | 3128 | ||
| 3036 | return print_irq_inversion_bug(curr, &root, target_entry, | 3129 | print_irq_inversion_bug(curr, &root, target_entry, |
| 3037 | this, 1, irqclass); | 3130 | this, 1, irqclass); |
| 3131 | return 0; | ||
| 3038 | } | 3132 | } |
| 3039 | 3133 | ||
| 3040 | /* | 3134 | /* |
| @@ -3052,13 +3146,16 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, | |||
| 3052 | root.parent = NULL; | 3146 | root.parent = NULL; |
| 3053 | root.class = hlock_class(this); | 3147 | root.class = hlock_class(this); |
| 3054 | ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); | 3148 | ret = find_usage_backwards(&root, lock_flag(bit), &target_entry); |
| 3055 | if (ret < 0) | 3149 | if (ret < 0) { |
| 3056 | return print_bfs_bug(ret); | 3150 | print_bfs_bug(ret); |
| 3151 | return 0; | ||
| 3152 | } | ||
| 3057 | if (ret == 1) | 3153 | if (ret == 1) |
| 3058 | return ret; | 3154 | return ret; |
| 3059 | 3155 | ||
| 3060 | return print_irq_inversion_bug(curr, &root, target_entry, | 3156 | print_irq_inversion_bug(curr, &root, target_entry, |
| 3061 | this, 0, irqclass); | 3157 | this, 0, irqclass); |
| 3158 | return 0; | ||
| 3062 | } | 3159 | } |
| 3063 | 3160 | ||
| 3064 | void print_irqtrace_events(struct task_struct *curr) | 3161 | void print_irqtrace_events(struct task_struct *curr) |
| @@ -3141,7 +3238,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, | |||
| 3141 | * Validate that the lock dependencies don't have conflicting usage | 3238 | * Validate that the lock dependencies don't have conflicting usage |
| 3142 | * states. | 3239 | * states. |
| 3143 | */ | 3240 | */ |
| 3144 | if ((!read || !dir || STRICT_READ_CHECKS) && | 3241 | if ((!read || STRICT_READ_CHECKS) && |
| 3145 | !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) | 3242 | !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK))) |
| 3146 | return 0; | 3243 | return 0; |
| 3147 | 3244 | ||
| @@ -3366,8 +3463,12 @@ void trace_softirqs_off(unsigned long ip) | |||
| 3366 | debug_atomic_inc(redundant_softirqs_off); | 3463 | debug_atomic_inc(redundant_softirqs_off); |
| 3367 | } | 3464 | } |
| 3368 | 3465 | ||
| 3369 | static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) | 3466 | static int |
| 3467 | mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) | ||
| 3370 | { | 3468 | { |
| 3469 | if (!check) | ||
| 3470 | goto lock_used; | ||
| 3471 | |||
| 3371 | /* | 3472 | /* |
| 3372 | * If non-trylock use in a hardirq or softirq context, then | 3473 | * If non-trylock use in a hardirq or softirq context, then |
| 3373 | * mark the lock as used in these contexts: | 3474 | * mark the lock as used in these contexts: |
| @@ -3411,6 +3512,11 @@ static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) | |||
| 3411 | } | 3512 | } |
| 3412 | } | 3513 | } |
| 3413 | 3514 | ||
| 3515 | lock_used: | ||
| 3516 | /* mark it as used: */ | ||
| 3517 | if (!mark_lock(curr, hlock, LOCK_USED)) | ||
| 3518 | return 0; | ||
| 3519 | |||
| 3414 | return 1; | 3520 | return 1; |
| 3415 | } | 3521 | } |
| 3416 | 3522 | ||
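mark_usage() now funnels every acquisition through a single LOCK_USED marking step, applying the irq bits only when check is set. A very small model of the per-class usage bitmap it updates (bit values and types simplified):

#include <stdio.h>

#define LOCK_USED		(1u << 0)
#define LOCK_USED_IN_HARDIRQ	(1u << 1)	/* one of the irq usage bits */

struct lock_class { unsigned int usage_mask; };

static int mark_lock(struct lock_class *class, unsigned int new_bit)
{
	if (class->usage_mask & new_bit)
		return 1;	/* already set: skip the expensive work entirely */
	class->usage_mask |= new_bit;
	/* the real code validates the state transition under the graph lock here */
	return 1;
}

int main(void)
{
	struct lock_class c = { .usage_mask = 0 };

	mark_lock(&c, LOCK_USED);
	mark_lock(&c, LOCK_USED);		/* second call takes the fast path */
	mark_lock(&c, LOCK_USED_IN_HARDIRQ);
	printf("usage_mask = %#x\n", c.usage_mask);	/* 0x3 */
	return 0;
}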
| @@ -3442,35 +3548,6 @@ static int separate_irq_context(struct task_struct *curr, | |||
| 3442 | return 0; | 3548 | return 0; |
| 3443 | } | 3549 | } |
| 3444 | 3550 | ||
| 3445 | #else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ | ||
| 3446 | |||
| 3447 | static inline | ||
| 3448 | int mark_lock_irq(struct task_struct *curr, struct held_lock *this, | ||
| 3449 | enum lock_usage_bit new_bit) | ||
| 3450 | { | ||
| 3451 | WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */ | ||
| 3452 | return 1; | ||
| 3453 | } | ||
| 3454 | |||
| 3455 | static inline int mark_irqflags(struct task_struct *curr, | ||
| 3456 | struct held_lock *hlock) | ||
| 3457 | { | ||
| 3458 | return 1; | ||
| 3459 | } | ||
| 3460 | |||
| 3461 | static inline unsigned int task_irq_context(struct task_struct *task) | ||
| 3462 | { | ||
| 3463 | return 0; | ||
| 3464 | } | ||
| 3465 | |||
| 3466 | static inline int separate_irq_context(struct task_struct *curr, | ||
| 3467 | struct held_lock *hlock) | ||
| 3468 | { | ||
| 3469 | return 0; | ||
| 3470 | } | ||
| 3471 | |||
| 3472 | #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ | ||
| 3473 | |||
| 3474 | /* | 3551 | /* |
| 3475 | * Mark a lock with a usage bit, and validate the state transition: | 3552 | * Mark a lock with a usage bit, and validate the state transition: |
| 3476 | */ | 3553 | */ |
| @@ -3479,6 +3556,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
| 3479 | { | 3556 | { |
| 3480 | unsigned int new_mask = 1 << new_bit, ret = 1; | 3557 | unsigned int new_mask = 1 << new_bit, ret = 1; |
| 3481 | 3558 | ||
| 3559 | if (new_bit >= LOCK_USAGE_STATES) { | ||
| 3560 | DEBUG_LOCKS_WARN_ON(1); | ||
| 3561 | return 0; | ||
| 3562 | } | ||
| 3563 | |||
| 3482 | /* | 3564 | /* |
| 3483 | * If already set then do not dirty the cacheline, | 3565 | * If already set then do not dirty the cacheline, |
| 3484 | * nor do any checks: | 3566 | * nor do any checks: |
| @@ -3502,25 +3584,13 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
| 3502 | return 0; | 3584 | return 0; |
| 3503 | 3585 | ||
| 3504 | switch (new_bit) { | 3586 | switch (new_bit) { |
| 3505 | #define LOCKDEP_STATE(__STATE) \ | ||
| 3506 | case LOCK_USED_IN_##__STATE: \ | ||
| 3507 | case LOCK_USED_IN_##__STATE##_READ: \ | ||
| 3508 | case LOCK_ENABLED_##__STATE: \ | ||
| 3509 | case LOCK_ENABLED_##__STATE##_READ: | ||
| 3510 | #include "lockdep_states.h" | ||
| 3511 | #undef LOCKDEP_STATE | ||
| 3512 | ret = mark_lock_irq(curr, this, new_bit); | ||
| 3513 | if (!ret) | ||
| 3514 | return 0; | ||
| 3515 | break; | ||
| 3516 | case LOCK_USED: | 3587 | case LOCK_USED: |
| 3517 | debug_atomic_dec(nr_unused_locks); | 3588 | debug_atomic_dec(nr_unused_locks); |
| 3518 | break; | 3589 | break; |
| 3519 | default: | 3590 | default: |
| 3520 | if (!debug_locks_off_graph_unlock()) | 3591 | ret = mark_lock_irq(curr, this, new_bit); |
| 3592 | if (!ret) | ||
| 3521 | return 0; | 3593 | return 0; |
| 3522 | WARN_ON(1); | ||
| 3523 | return 0; | ||
| 3524 | } | 3594 | } |
| 3525 | 3595 | ||
| 3526 | graph_unlock(); | 3596 | graph_unlock(); |
| @@ -3538,6 +3608,27 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, | |||
| 3538 | return ret; | 3608 | return ret; |
| 3539 | } | 3609 | } |
| 3540 | 3610 | ||
| 3611 | #else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ | ||
| 3612 | |||
| 3613 | static inline int | ||
| 3614 | mark_usage(struct task_struct *curr, struct held_lock *hlock, int check) | ||
| 3615 | { | ||
| 3616 | return 1; | ||
| 3617 | } | ||
| 3618 | |||
| 3619 | static inline unsigned int task_irq_context(struct task_struct *task) | ||
| 3620 | { | ||
| 3621 | return 0; | ||
| 3622 | } | ||
| 3623 | |||
| 3624 | static inline int separate_irq_context(struct task_struct *curr, | ||
| 3625 | struct held_lock *hlock) | ||
| 3626 | { | ||
| 3627 | return 0; | ||
| 3628 | } | ||
| 3629 | |||
| 3630 | #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */ | ||
| 3631 | |||
| 3541 | /* | 3632 | /* |
| 3542 | * Initialize a lock instance's lock-class mapping info: | 3633 | * Initialize a lock instance's lock-class mapping info: |
| 3543 | */ | 3634 | */ |
| @@ -3601,15 +3692,15 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); | |||
| 3601 | struct lock_class_key __lockdep_no_validate__; | 3692 | struct lock_class_key __lockdep_no_validate__; |
| 3602 | EXPORT_SYMBOL_GPL(__lockdep_no_validate__); | 3693 | EXPORT_SYMBOL_GPL(__lockdep_no_validate__); |
| 3603 | 3694 | ||
| 3604 | static int | 3695 | static void |
| 3605 | print_lock_nested_lock_not_held(struct task_struct *curr, | 3696 | print_lock_nested_lock_not_held(struct task_struct *curr, |
| 3606 | struct held_lock *hlock, | 3697 | struct held_lock *hlock, |
| 3607 | unsigned long ip) | 3698 | unsigned long ip) |
| 3608 | { | 3699 | { |
| 3609 | if (!debug_locks_off()) | 3700 | if (!debug_locks_off()) |
| 3610 | return 0; | 3701 | return; |
| 3611 | if (debug_locks_silent) | 3702 | if (debug_locks_silent) |
| 3612 | return 0; | 3703 | return; |
| 3613 | 3704 | ||
| 3614 | pr_warn("\n"); | 3705 | pr_warn("\n"); |
| 3615 | pr_warn("==================================\n"); | 3706 | pr_warn("==================================\n"); |
| @@ -3631,8 +3722,6 @@ print_lock_nested_lock_not_held(struct task_struct *curr, | |||
| 3631 | 3722 | ||
| 3632 | pr_warn("\nstack backtrace:\n"); | 3723 | pr_warn("\nstack backtrace:\n"); |
| 3633 | dump_stack(); | 3724 | dump_stack(); |
| 3634 | |||
| 3635 | return 0; | ||
| 3636 | } | 3725 | } |
| 3637 | 3726 | ||
| 3638 | static int __lock_is_held(const struct lockdep_map *lock, int read); | 3727 | static int __lock_is_held(const struct lockdep_map *lock, int read); |
| @@ -3697,24 +3786,24 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 3697 | if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) | 3786 | if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) |
| 3698 | return 0; | 3787 | return 0; |
| 3699 | 3788 | ||
| 3700 | class_idx = class - lock_classes + 1; | 3789 | class_idx = class - lock_classes; |
| 3701 | 3790 | ||
| 3702 | if (depth) { | 3791 | if (depth) { |
| 3703 | hlock = curr->held_locks + depth - 1; | 3792 | hlock = curr->held_locks + depth - 1; |
| 3704 | if (hlock->class_idx == class_idx && nest_lock) { | 3793 | if (hlock->class_idx == class_idx && nest_lock) { |
| 3705 | if (hlock->references) { | 3794 | if (!references) |
| 3706 | /* | 3795 | references++; |
| 3707 | * Check: unsigned int references:12, overflow. | ||
| 3708 | */ | ||
| 3709 | if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1)) | ||
| 3710 | return 0; | ||
| 3711 | 3796 | ||
| 3797 | if (!hlock->references) | ||
| 3712 | hlock->references++; | 3798 | hlock->references++; |
| 3713 | } else { | ||
| 3714 | hlock->references = 2; | ||
| 3715 | } | ||
| 3716 | 3799 | ||
| 3717 | return 1; | 3800 | hlock->references += references; |
| 3801 | |||
| 3802 | /* Overflow */ | ||
| 3803 | if (DEBUG_LOCKS_WARN_ON(hlock->references < references)) | ||
| 3804 | return 0; | ||
| 3805 | |||
| 3806 | return 2; | ||
| 3718 | } | 3807 | } |
| 3719 | } | 3808 | } |
| 3720 | 3809 | ||
| @@ -3741,11 +3830,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 3741 | #endif | 3830 | #endif |
| 3742 | hlock->pin_count = pin_count; | 3831 | hlock->pin_count = pin_count; |
| 3743 | 3832 | ||
| 3744 | if (check && !mark_irqflags(curr, hlock)) | 3833 | /* Initialize the lock usage bit */ |
| 3745 | return 0; | 3834 | if (!mark_usage(curr, hlock, check)) |
| 3746 | |||
| 3747 | /* mark it as used: */ | ||
| 3748 | if (!mark_lock(curr, hlock, LOCK_USED)) | ||
| 3749 | return 0; | 3835 | return 0; |
| 3750 | 3836 | ||
| 3751 | /* | 3837 | /* |
| @@ -3759,9 +3845,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 3759 | * the hash, not class->key. | 3845 | * the hash, not class->key. |
| 3760 | */ | 3846 | */ |
| 3761 | /* | 3847 | /* |
| 3762 | * Whoops, we did it again.. ran straight out of our static allocation. | 3848 | * Whoops, we did it again.. class_idx is invalid. |
| 3763 | */ | 3849 | */ |
| 3764 | if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) | 3850 | if (DEBUG_LOCKS_WARN_ON(!test_bit(class_idx, lock_classes_in_use))) |
| 3765 | return 0; | 3851 | return 0; |
| 3766 | 3852 | ||
| 3767 | chain_key = curr->curr_chain_key; | 3853 | chain_key = curr->curr_chain_key; |
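The sanity check now asks whether class_idx refers to a class that is actually in use, via the lock_classes_in_use bitmap, instead of merely bounds-checking it. A stand-alone model of that bitmap (the helpers are simplified stand-ins for the kernel's __set_bit()/test_bit()):

#include <stdio.h>
#include <stdbool.h>

#define MAX_LOCKDEP_KEYS	8192
#define BITS_PER_LONG		(8 * sizeof(unsigned long))

static unsigned long lock_classes_in_use[MAX_LOCKDEP_KEYS / BITS_PER_LONG];

static void set_in_use(unsigned int idx)
{
	lock_classes_in_use[idx / BITS_PER_LONG] |= 1UL << (idx % BITS_PER_LONG);
}

static bool test_in_use(unsigned int idx)
{
	return lock_classes_in_use[idx / BITS_PER_LONG] & (1UL << (idx % BITS_PER_LONG));
}

int main(void)
{
	set_in_use(42);		/* registering a lock class marks its slot */
	printf("%d %d\n", test_in_use(42), test_in_use(43));	/* 1 0 */
	return 0;
}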
| @@ -3769,27 +3855,29 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 3769 | /* | 3855 | /* |
| 3770 | * How can we have a chain hash when we ain't got no keys?! | 3856 | * How can we have a chain hash when we ain't got no keys?! |
| 3771 | */ | 3857 | */ |
| 3772 | if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) | 3858 | if (DEBUG_LOCKS_WARN_ON(chain_key != INITIAL_CHAIN_KEY)) |
| 3773 | return 0; | 3859 | return 0; |
| 3774 | chain_head = 1; | 3860 | chain_head = 1; |
| 3775 | } | 3861 | } |
| 3776 | 3862 | ||
| 3777 | hlock->prev_chain_key = chain_key; | 3863 | hlock->prev_chain_key = chain_key; |
| 3778 | if (separate_irq_context(curr, hlock)) { | 3864 | if (separate_irq_context(curr, hlock)) { |
| 3779 | chain_key = 0; | 3865 | chain_key = INITIAL_CHAIN_KEY; |
| 3780 | chain_head = 1; | 3866 | chain_head = 1; |
| 3781 | } | 3867 | } |
| 3782 | chain_key = iterate_chain_key(chain_key, class_idx); | 3868 | chain_key = iterate_chain_key(chain_key, class_idx); |
| 3783 | 3869 | ||
| 3784 | if (nest_lock && !__lock_is_held(nest_lock, -1)) | 3870 | if (nest_lock && !__lock_is_held(nest_lock, -1)) { |
| 3785 | return print_lock_nested_lock_not_held(curr, hlock, ip); | 3871 | print_lock_nested_lock_not_held(curr, hlock, ip); |
| 3872 | return 0; | ||
| 3873 | } | ||
| 3786 | 3874 | ||
| 3787 | if (!debug_locks_silent) { | 3875 | if (!debug_locks_silent) { |
| 3788 | WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); | 3876 | WARN_ON_ONCE(depth && !hlock_class(hlock - 1)->key); |
| 3789 | WARN_ON_ONCE(!hlock_class(hlock)->key); | 3877 | WARN_ON_ONCE(!hlock_class(hlock)->key); |
| 3790 | } | 3878 | } |
| 3791 | 3879 | ||
| 3792 | if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) | 3880 | if (!validate_chain(curr, hlock, chain_head, chain_key)) |
| 3793 | return 0; | 3881 | return 0; |
| 3794 | 3882 | ||
| 3795 | curr->curr_chain_key = chain_key; | 3883 | curr->curr_chain_key = chain_key; |
| @@ -3818,14 +3906,14 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 3818 | return 1; | 3906 | return 1; |
| 3819 | } | 3907 | } |
| 3820 | 3908 | ||
| 3821 | static int | 3909 | static void print_unlock_imbalance_bug(struct task_struct *curr, |
| 3822 | print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | 3910 | struct lockdep_map *lock, |
| 3823 | unsigned long ip) | 3911 | unsigned long ip) |
| 3824 | { | 3912 | { |
| 3825 | if (!debug_locks_off()) | 3913 | if (!debug_locks_off()) |
| 3826 | return 0; | 3914 | return; |
| 3827 | if (debug_locks_silent) | 3915 | if (debug_locks_silent) |
| 3828 | return 0; | 3916 | return; |
| 3829 | 3917 | ||
| 3830 | pr_warn("\n"); | 3918 | pr_warn("\n"); |
| 3831 | pr_warn("=====================================\n"); | 3919 | pr_warn("=====================================\n"); |
| @@ -3843,8 +3931,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
| 3843 | 3931 | ||
| 3844 | pr_warn("\nstack backtrace:\n"); | 3932 | pr_warn("\nstack backtrace:\n"); |
| 3845 | dump_stack(); | 3933 | dump_stack(); |
| 3846 | |||
| 3847 | return 0; | ||
| 3848 | } | 3934 | } |
| 3849 | 3935 | ||
| 3850 | static int match_held_lock(const struct held_lock *hlock, | 3936 | static int match_held_lock(const struct held_lock *hlock, |
| @@ -3876,7 +3962,7 @@ static int match_held_lock(const struct held_lock *hlock, | |||
| 3876 | if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) | 3962 | if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) |
| 3877 | return 0; | 3963 | return 0; |
| 3878 | 3964 | ||
| 3879 | if (hlock->class_idx == class - lock_classes + 1) | 3965 | if (hlock->class_idx == class - lock_classes) |
| 3880 | return 1; | 3966 | return 1; |
| 3881 | } | 3967 | } |
| 3882 | 3968 | ||
| @@ -3920,22 +4006,33 @@ out: | |||
| 3920 | } | 4006 | } |
| 3921 | 4007 | ||
| 3922 | static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, | 4008 | static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, |
| 3923 | int idx) | 4009 | int idx, unsigned int *merged) |
| 3924 | { | 4010 | { |
| 3925 | struct held_lock *hlock; | 4011 | struct held_lock *hlock; |
| 4012 | int first_idx = idx; | ||
| 3926 | 4013 | ||
| 3927 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) | 4014 | if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) |
| 3928 | return 0; | 4015 | return 0; |
| 3929 | 4016 | ||
| 3930 | for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { | 4017 | for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { |
| 3931 | if (!__lock_acquire(hlock->instance, | 4018 | switch (__lock_acquire(hlock->instance, |
| 3932 | hlock_class(hlock)->subclass, | 4019 | hlock_class(hlock)->subclass, |
| 3933 | hlock->trylock, | 4020 | hlock->trylock, |
| 3934 | hlock->read, hlock->check, | 4021 | hlock->read, hlock->check, |
| 3935 | hlock->hardirqs_off, | 4022 | hlock->hardirqs_off, |
| 3936 | hlock->nest_lock, hlock->acquire_ip, | 4023 | hlock->nest_lock, hlock->acquire_ip, |
| 3937 | hlock->references, hlock->pin_count)) | 4024 | hlock->references, hlock->pin_count)) { |
| 4025 | case 0: | ||
| 3938 | return 1; | 4026 | return 1; |
| 4027 | case 1: | ||
| 4028 | break; | ||
| 4029 | case 2: | ||
| 4030 | *merged += (idx == first_idx); | ||
| 4031 | break; | ||
| 4032 | default: | ||
| 4033 | WARN_ON(1); | ||
| 4034 | return 0; | ||
| 4035 | } | ||
| 3939 | } | 4036 | } |
| 3940 | return 0; | 4037 | return 0; |
| 3941 | } | 4038 | } |
| @@ -3946,9 +4043,9 @@ __lock_set_class(struct lockdep_map *lock, const char *name, | |||
| 3946 | unsigned long ip) | 4043 | unsigned long ip) |
| 3947 | { | 4044 | { |
| 3948 | struct task_struct *curr = current; | 4045 | struct task_struct *curr = current; |
| 4046 | unsigned int depth, merged = 0; | ||
| 3949 | struct held_lock *hlock; | 4047 | struct held_lock *hlock; |
| 3950 | struct lock_class *class; | 4048 | struct lock_class *class; |
| 3951 | unsigned int depth; | ||
| 3952 | int i; | 4049 | int i; |
| 3953 | 4050 | ||
| 3954 | if (unlikely(!debug_locks)) | 4051 | if (unlikely(!debug_locks)) |
| @@ -3963,24 +4060,26 @@ __lock_set_class(struct lockdep_map *lock, const char *name, | |||
| 3963 | return 0; | 4060 | return 0; |
| 3964 | 4061 | ||
| 3965 | hlock = find_held_lock(curr, lock, depth, &i); | 4062 | hlock = find_held_lock(curr, lock, depth, &i); |
| 3966 | if (!hlock) | 4063 | if (!hlock) { |
| 3967 | return print_unlock_imbalance_bug(curr, lock, ip); | 4064 | print_unlock_imbalance_bug(curr, lock, ip); |
| 4065 | return 0; | ||
| 4066 | } | ||
| 3968 | 4067 | ||
| 3969 | lockdep_init_map(lock, name, key, 0); | 4068 | lockdep_init_map(lock, name, key, 0); |
| 3970 | class = register_lock_class(lock, subclass, 0); | 4069 | class = register_lock_class(lock, subclass, 0); |
| 3971 | hlock->class_idx = class - lock_classes + 1; | 4070 | hlock->class_idx = class - lock_classes; |
| 3972 | 4071 | ||
| 3973 | curr->lockdep_depth = i; | 4072 | curr->lockdep_depth = i; |
| 3974 | curr->curr_chain_key = hlock->prev_chain_key; | 4073 | curr->curr_chain_key = hlock->prev_chain_key; |
| 3975 | 4074 | ||
| 3976 | if (reacquire_held_locks(curr, depth, i)) | 4075 | if (reacquire_held_locks(curr, depth, i, &merged)) |
| 3977 | return 0; | 4076 | return 0; |
| 3978 | 4077 | ||
| 3979 | /* | 4078 | /* |
| 3980 | * I took it apart and put it back together again, except now I have | 4079 | * I took it apart and put it back together again, except now I have |
| 3981 | * these 'spare' parts.. where shall I put them. | 4080 | * these 'spare' parts.. where shall I put them. |
| 3982 | */ | 4081 | */ |
| 3983 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) | 4082 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged)) |
| 3984 | return 0; | 4083 | return 0; |
| 3985 | return 1; | 4084 | return 1; |
| 3986 | } | 4085 | } |
| @@ -3988,8 +4087,8 @@ __lock_set_class(struct lockdep_map *lock, const char *name, | |||
| 3988 | static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) | 4087 | static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) |
| 3989 | { | 4088 | { |
| 3990 | struct task_struct *curr = current; | 4089 | struct task_struct *curr = current; |
| 4090 | unsigned int depth, merged = 0; | ||
| 3991 | struct held_lock *hlock; | 4091 | struct held_lock *hlock; |
| 3992 | unsigned int depth; | ||
| 3993 | int i; | 4092 | int i; |
| 3994 | 4093 | ||
| 3995 | if (unlikely(!debug_locks)) | 4094 | if (unlikely(!debug_locks)) |
| @@ -4004,8 +4103,10 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) | |||
| 4004 | return 0; | 4103 | return 0; |
| 4005 | 4104 | ||
| 4006 | hlock = find_held_lock(curr, lock, depth, &i); | 4105 | hlock = find_held_lock(curr, lock, depth, &i); |
| 4007 | if (!hlock) | 4106 | if (!hlock) { |
| 4008 | return print_unlock_imbalance_bug(curr, lock, ip); | 4107 | print_unlock_imbalance_bug(curr, lock, ip); |
| 4108 | return 0; | ||
| 4109 | } | ||
| 4009 | 4110 | ||
| 4010 | curr->lockdep_depth = i; | 4111 | curr->lockdep_depth = i; |
| 4011 | curr->curr_chain_key = hlock->prev_chain_key; | 4112 | curr->curr_chain_key = hlock->prev_chain_key; |
| @@ -4014,7 +4115,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) | |||
| 4014 | hlock->read = 1; | 4115 | hlock->read = 1; |
| 4015 | hlock->acquire_ip = ip; | 4116 | hlock->acquire_ip = ip; |
| 4016 | 4117 | ||
| 4017 | if (reacquire_held_locks(curr, depth, i)) | 4118 | if (reacquire_held_locks(curr, depth, i, &merged)) |
| 4119 | return 0; | ||
| 4120 | |||
| 4121 | /* Merging can't happen with unchanged classes.. */ | ||
| 4122 | if (DEBUG_LOCKS_WARN_ON(merged)) | ||
| 4018 | return 0; | 4123 | return 0; |
| 4019 | 4124 | ||
| 4020 | /* | 4125 | /* |
| @@ -4023,6 +4128,7 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) | |||
| 4023 | */ | 4128 | */ |
| 4024 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) | 4129 | if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) |
| 4025 | return 0; | 4130 | return 0; |
| 4131 | |||
| 4026 | return 1; | 4132 | return 1; |
| 4027 | } | 4133 | } |
| 4028 | 4134 | ||
| @@ -4034,11 +4140,11 @@ static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) | |||
| 4034 | * @nested is an hysterical artifact, needs a tree wide cleanup. | 4140 | * @nested is an hysterical artifact, needs a tree wide cleanup. |
| 4035 | */ | 4141 | */ |
| 4036 | static int | 4142 | static int |
| 4037 | __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | 4143 | __lock_release(struct lockdep_map *lock, unsigned long ip) |
| 4038 | { | 4144 | { |
| 4039 | struct task_struct *curr = current; | 4145 | struct task_struct *curr = current; |
| 4146 | unsigned int depth, merged = 1; | ||
| 4040 | struct held_lock *hlock; | 4147 | struct held_lock *hlock; |
| 4041 | unsigned int depth; | ||
| 4042 | int i; | 4148 | int i; |
| 4043 | 4149 | ||
| 4044 | if (unlikely(!debug_locks)) | 4150 | if (unlikely(!debug_locks)) |
| @@ -4049,16 +4155,20 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | |||
| 4049 | * So we're all set to release this lock.. wait what lock? We don't | 4155 | * So we're all set to release this lock.. wait what lock? We don't |
| 4050 | * own any locks, you've been drinking again? | 4156 | * own any locks, you've been drinking again? |
| 4051 | */ | 4157 | */ |
| 4052 | if (DEBUG_LOCKS_WARN_ON(depth <= 0)) | 4158 | if (depth <= 0) { |
| 4053 | return print_unlock_imbalance_bug(curr, lock, ip); | 4159 | print_unlock_imbalance_bug(curr, lock, ip); |
| 4160 | return 0; | ||
| 4161 | } | ||
| 4054 | 4162 | ||
| 4055 | /* | 4163 | /* |
| 4056 | * Check whether the lock exists in the current stack | 4164 | * Check whether the lock exists in the current stack |
| 4057 | * of held locks: | 4165 | * of held locks: |
| 4058 | */ | 4166 | */ |
| 4059 | hlock = find_held_lock(curr, lock, depth, &i); | 4167 | hlock = find_held_lock(curr, lock, depth, &i); |
| 4060 | if (!hlock) | 4168 | if (!hlock) { |
| 4061 | return print_unlock_imbalance_bug(curr, lock, ip); | 4169 | print_unlock_imbalance_bug(curr, lock, ip); |
| 4170 | return 0; | ||
| 4171 | } | ||
| 4062 | 4172 | ||
| 4063 | if (hlock->instance == lock) | 4173 | if (hlock->instance == lock) |
| 4064 | lock_release_holdtime(hlock); | 4174 | lock_release_holdtime(hlock); |
| @@ -4093,14 +4203,15 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) | |||
| 4093 | if (i == depth-1) | 4203 | if (i == depth-1) |
| 4094 | return 1; | 4204 | return 1; |
| 4095 | 4205 | ||
| 4096 | if (reacquire_held_locks(curr, depth, i + 1)) | 4206 | if (reacquire_held_locks(curr, depth, i + 1, &merged)) |
| 4097 | return 0; | 4207 | return 0; |
| 4098 | 4208 | ||
| 4099 | /* | 4209 | /* |
| 4100 | * We had N bottles of beer on the wall, we drank one, but now | 4210 | * We had N bottles of beer on the wall, we drank one, but now |
| 4101 | * there's not N-1 bottles of beer left on the wall... | 4211 | * there's not N-1 bottles of beer left on the wall... |
| 4212 | * Pouring two of the bottles together is acceptable. | ||
| 4102 | */ | 4213 | */ |
| 4103 | DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth-1); | 4214 | DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - merged); |
| 4104 | 4215 | ||
| 4105 | /* | 4216 | /* |
| 4106 | * Since reacquire_held_locks() would have called check_chain_key() | 4217 | * Since reacquire_held_locks() would have called check_chain_key() |
| @@ -4318,7 +4429,7 @@ void lock_release(struct lockdep_map *lock, int nested, | |||
| 4318 | check_flags(flags); | 4429 | check_flags(flags); |
| 4319 | current->lockdep_recursion = 1; | 4430 | current->lockdep_recursion = 1; |
| 4320 | trace_lock_release(lock, ip); | 4431 | trace_lock_release(lock, ip); |
| 4321 | if (__lock_release(lock, nested, ip)) | 4432 | if (__lock_release(lock, ip)) |
| 4322 | check_chain_key(current); | 4433 | check_chain_key(current); |
| 4323 | current->lockdep_recursion = 0; | 4434 | current->lockdep_recursion = 0; |
| 4324 | raw_local_irq_restore(flags); | 4435 | raw_local_irq_restore(flags); |
| @@ -4401,14 +4512,14 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie) | |||
| 4401 | EXPORT_SYMBOL_GPL(lock_unpin_lock); | 4512 | EXPORT_SYMBOL_GPL(lock_unpin_lock); |
| 4402 | 4513 | ||
| 4403 | #ifdef CONFIG_LOCK_STAT | 4514 | #ifdef CONFIG_LOCK_STAT |
| 4404 | static int | 4515 | static void print_lock_contention_bug(struct task_struct *curr, |
| 4405 | print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, | 4516 | struct lockdep_map *lock, |
| 4406 | unsigned long ip) | 4517 | unsigned long ip) |
| 4407 | { | 4518 | { |
| 4408 | if (!debug_locks_off()) | 4519 | if (!debug_locks_off()) |
| 4409 | return 0; | 4520 | return; |
| 4410 | if (debug_locks_silent) | 4521 | if (debug_locks_silent) |
| 4411 | return 0; | 4522 | return; |
| 4412 | 4523 | ||
| 4413 | pr_warn("\n"); | 4524 | pr_warn("\n"); |
| 4414 | pr_warn("=================================\n"); | 4525 | pr_warn("=================================\n"); |
| @@ -4426,8 +4537,6 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, | |||
| 4426 | 4537 | ||
| 4427 | pr_warn("\nstack backtrace:\n"); | 4538 | pr_warn("\nstack backtrace:\n"); |
| 4428 | dump_stack(); | 4539 | dump_stack(); |
| 4429 | |||
| 4430 | return 0; | ||
| 4431 | } | 4540 | } |
| 4432 | 4541 | ||
| 4433 | static void | 4542 | static void |
| @@ -4572,9 +4681,7 @@ void lockdep_reset(void) | |||
| 4572 | int i; | 4681 | int i; |
| 4573 | 4682 | ||
| 4574 | raw_local_irq_save(flags); | 4683 | raw_local_irq_save(flags); |
| 4575 | current->curr_chain_key = 0; | 4684 | lockdep_init_task(current); |
| 4576 | current->lockdep_depth = 0; | ||
| 4577 | current->lockdep_recursion = 0; | ||
| 4578 | memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); | 4685 | memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock)); |
| 4579 | nr_hardirq_chains = 0; | 4686 | nr_hardirq_chains = 0; |
| 4580 | nr_softirq_chains = 0; | 4687 | nr_softirq_chains = 0; |
| @@ -4614,9 +4721,9 @@ static void remove_class_from_lock_chain(struct pending_free *pf, | |||
| 4614 | return; | 4721 | return; |
| 4615 | 4722 | ||
| 4616 | recalc: | 4723 | recalc: |
| 4617 | chain_key = 0; | 4724 | chain_key = INITIAL_CHAIN_KEY; |
| 4618 | for (i = chain->base; i < chain->base + chain->depth; i++) | 4725 | for (i = chain->base; i < chain->base + chain->depth; i++) |
| 4619 | chain_key = iterate_chain_key(chain_key, chain_hlocks[i] + 1); | 4726 | chain_key = iterate_chain_key(chain_key, chain_hlocks[i]); |
| 4620 | if (chain->depth && chain->chain_key == chain_key) | 4727 | if (chain->depth && chain->chain_key == chain_key) |
| 4621 | return; | 4728 | return; |
| 4622 | /* Overwrite the chain key for concurrent RCU readers. */ | 4729 | /* Overwrite the chain key for concurrent RCU readers. */ |
| @@ -4690,6 +4797,7 @@ static void zap_class(struct pending_free *pf, struct lock_class *class) | |||
| 4690 | WRITE_ONCE(class->key, NULL); | 4797 | WRITE_ONCE(class->key, NULL); |
| 4691 | WRITE_ONCE(class->name, NULL); | 4798 | WRITE_ONCE(class->name, NULL); |
| 4692 | nr_lock_classes--; | 4799 | nr_lock_classes--; |
| 4800 | __clear_bit(class - lock_classes, lock_classes_in_use); | ||
| 4693 | } else { | 4801 | } else { |
| 4694 | WARN_ONCE(true, "%s() failed for class %s\n", __func__, | 4802 | WARN_ONCE(true, "%s() failed for class %s\n", __func__, |
| 4695 | class->name); | 4803 | class->name); |
| @@ -5035,6 +5143,7 @@ void __init lockdep_init(void) | |||
| 5035 | 5143 | ||
| 5036 | printk(" memory used by lock dependency info: %zu kB\n", | 5144 | printk(" memory used by lock dependency info: %zu kB\n", |
| 5037 | (sizeof(lock_classes) + | 5145 | (sizeof(lock_classes) + |
| 5146 | sizeof(lock_classes_in_use) + | ||
| 5038 | sizeof(classhash_table) + | 5147 | sizeof(classhash_table) + |
| 5039 | sizeof(list_entries) + | 5148 | sizeof(list_entries) + |
| 5040 | sizeof(list_entries_in_use) + | 5149 | sizeof(list_entries_in_use) + |
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index 150ec3f0c5b5..cc83568d5012 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h | |||
| @@ -131,7 +131,6 @@ extern unsigned int nr_hardirq_chains; | |||
| 131 | extern unsigned int nr_softirq_chains; | 131 | extern unsigned int nr_softirq_chains; |
| 132 | extern unsigned int nr_process_chains; | 132 | extern unsigned int nr_process_chains; |
| 133 | extern unsigned int max_lockdep_depth; | 133 | extern unsigned int max_lockdep_depth; |
| 134 | extern unsigned int max_recursion_depth; | ||
| 135 | 134 | ||
| 136 | extern unsigned int max_bfs_queue_depth; | 135 | extern unsigned int max_bfs_queue_depth; |
| 137 | 136 | ||
| @@ -160,25 +159,22 @@ lockdep_count_backward_deps(struct lock_class *class) | |||
| 160 | * and we want to avoid too much cache bouncing. | 159 | * and we want to avoid too much cache bouncing. |
| 161 | */ | 160 | */ |
| 162 | struct lockdep_stats { | 161 | struct lockdep_stats { |
| 163 | int chain_lookup_hits; | 162 | unsigned long chain_lookup_hits; |
| 164 | int chain_lookup_misses; | 163 | unsigned int chain_lookup_misses; |
| 165 | int hardirqs_on_events; | 164 | unsigned long hardirqs_on_events; |
| 166 | int hardirqs_off_events; | 165 | unsigned long hardirqs_off_events; |
| 167 | int redundant_hardirqs_on; | 166 | unsigned long redundant_hardirqs_on; |
| 168 | int redundant_hardirqs_off; | 167 | unsigned long redundant_hardirqs_off; |
| 169 | int softirqs_on_events; | 168 | unsigned long softirqs_on_events; |
| 170 | int softirqs_off_events; | 169 | unsigned long softirqs_off_events; |
| 171 | int redundant_softirqs_on; | 170 | unsigned long redundant_softirqs_on; |
| 172 | int redundant_softirqs_off; | 171 | unsigned long redundant_softirqs_off; |
| 173 | int nr_unused_locks; | 172 | int nr_unused_locks; |
| 174 | int nr_redundant_checks; | 173 | unsigned int nr_redundant_checks; |
| 175 | int nr_redundant; | 174 | unsigned int nr_redundant; |
| 176 | int nr_cyclic_checks; | 175 | unsigned int nr_cyclic_checks; |
| 177 | int nr_cyclic_check_recursions; | 176 | unsigned int nr_find_usage_forwards_checks; |
| 178 | int nr_find_usage_forwards_checks; | 177 | unsigned int nr_find_usage_backwards_checks; |
| 179 | int nr_find_usage_forwards_recursions; | ||
| 180 | int nr_find_usage_backwards_checks; | ||
| 181 | int nr_find_usage_backwards_recursions; | ||
| 182 | 178 | ||
| 183 | /* | 179 | /* |
| 184 | * Per lock class locking operation stat counts | 180 | * Per lock class locking operation stat counts |
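The counters in lockdep_stats move from int to wider types because the irq on/off event counts can overflow a signed 32-bit value on long-running systems. A quick, purely demonstrative illustration of the wrap being avoided:

#include <stdio.h>
#include <limits.h>

int main(void)
{
	unsigned long events = (unsigned long)INT_MAX + 1;	/* one past ~2.1 billion events */

	/* Forcing that back into an int wraps (implementation-defined, typically negative): */
	printf("as int:           %d\n", (int)events);
	/* The widened counter simply keeps counting: */
	printf("as unsigned long: %lu\n", events);
	return 0;
}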
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 80a463d31a8d..c513031cd7e3 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c | |||
| @@ -975,7 +975,7 @@ static int __init lock_torture_init(void) | |||
| 975 | goto unwind; | 975 | goto unwind; |
| 976 | } | 976 | } |
| 977 | if (stutter > 0) { | 977 | if (stutter > 0) { |
| 978 | firsterr = torture_stutter_init(stutter); | 978 | firsterr = torture_stutter_init(stutter, stutter); |
| 979 | if (firsterr) | 979 | if (firsterr) |
| 980 | goto unwind; | 980 | goto unwind; |
| 981 | } | 981 | } |
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index db578783dd36..0c601ae072b3 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/locking/mutex.c | 3 | * kernel/locking/mutex.c |
| 3 | * | 4 | * |
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f17dad99eec8..364d38a0c444 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | #include <linux/atomic.h> | 2 | #include <linux/atomic.h> |
| 2 | #include <linux/rwsem.h> | 3 | #include <linux/rwsem.h> |
| 3 | #include <linux/percpu.h> | 4 | #include <linux/percpu.h> |
| @@ -17,7 +18,7 @@ int __percpu_init_rwsem(struct percpu_rw_semaphore *sem, | |||
| 17 | return -ENOMEM; | 18 | return -ENOMEM; |
| 18 | 19 | ||
| 19 | /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ | 20 | /* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */ |
| 20 | rcu_sync_init(&sem->rss, RCU_SCHED_SYNC); | 21 | rcu_sync_init(&sem->rss); |
| 21 | __init_rwsem(&sem->rw_sem, name, rwsem_key); | 22 | __init_rwsem(&sem->rw_sem, name, rwsem_key); |
| 22 | rcuwait_init(&sem->writer); | 23 | rcuwait_init(&sem->writer); |
| 23 | sem->readers_block = 0; | 24 | sem->readers_block = 0; |
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index c7471c3fb798..fe9ca92faa2a 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c | |||
| @@ -1,16 +1,7 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * Queued read/write locks | 3 | * Queued read/write locks |
| 3 | * | 4 | * |
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. | 5 | * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P. |
| 15 | * | 6 | * |
| 16 | * Authors: Waiman Long <waiman.long@hp.com> | 7 | * Authors: Waiman Long <waiman.long@hp.com> |
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index e14b32c69639..2473f10c6956 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c | |||
| @@ -1,16 +1,7 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * Queued spinlock | 3 | * Queued spinlock |
| 3 | * | 4 | * |
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. | 5 | * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. |
| 15 | * (C) Copyright 2013-2014,2018 Red Hat, Inc. | 6 | * (C) Copyright 2013-2014,2018 Red Hat, Inc. |
| 16 | * (C) Copyright 2015 Intel Corp. | 7 | * (C) Copyright 2015 Intel Corp. |
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h index 54152670ff24..e625bb410aa2 100644 --- a/kernel/locking/qspinlock_stat.h +++ b/kernel/locking/qspinlock_stat.h | |||
| @@ -1,13 +1,5 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ | ||
| 1 | /* | 2 | /* |
| 2 | * This program is free software; you can redistribute it and/or modify | ||
| 3 | * it under the terms of the GNU General Public License as published by | ||
| 4 | * the Free Software Foundation; either version 2 of the License, or | ||
| 5 | * (at your option) any later version. | ||
| 6 | * | ||
| 7 | * This program is distributed in the hope that it will be useful, | ||
| 8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 10 | * GNU General Public License for more details. | ||
| 11 | * | 3 | * |
| 12 | * Authors: Waiman Long <longman@redhat.com> | 4 | * Authors: Waiman Long <longman@redhat.com> |
| 13 | */ | 5 | */ |
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 978d63a8261c..38fbf9fa7f1b 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * RT-Mutexes: simple blocking mutual exclusion locks with PI support | 3 | * RT-Mutexes: simple blocking mutual exclusion locks with PI support |
| 3 | * | 4 | * |
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c deleted file mode 100644 index 6b3ee9948bf1..000000000000 --- a/kernel/locking/rwsem-xadd.c +++ /dev/null | |||
| @@ -1,729 +0,0 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | /* rwsem.c: R/W semaphores: contention handling functions | ||
| 3 | * | ||
| 4 | * Written by David Howells (dhowells@redhat.com). | ||
| 5 | * Derived from arch/i386/kernel/semaphore.c | ||
| 6 | * | ||
| 7 | * Writer lock-stealing by Alex Shi <alex.shi@intel.com> | ||
| 8 | * and Michel Lespinasse <walken@google.com> | ||
| 9 | * | ||
| 10 | * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> | ||
| 11 | * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. | ||
| 12 | */ | ||
| 13 | #include <linux/rwsem.h> | ||
| 14 | #include <linux/init.h> | ||
| 15 | #include <linux/export.h> | ||
| 16 | #include <linux/sched/signal.h> | ||
| 17 | #include <linux/sched/rt.h> | ||
| 18 | #include <linux/sched/wake_q.h> | ||
| 19 | #include <linux/sched/debug.h> | ||
| 20 | #include <linux/osq_lock.h> | ||
| 21 | |||
| 22 | #include "rwsem.h" | ||
| 23 | |||
| 24 | /* | ||
| 25 | * Guide to the rw_semaphore's count field for common values. | ||
| 26 | * (32-bit case illustrated, similar for 64-bit) | ||
| 27 | * | ||
| 28 | * 0x0000000X (1) X readers active or attempting lock, no writer waiting | ||
| 29 | * X = #active_readers + #readers attempting to lock | ||
| 30 | * (X*ACTIVE_BIAS) | ||
| 31 | * | ||
| 32 | * 0x00000000 rwsem is unlocked, and no one is waiting for the lock or | ||
| 33 | * attempting to read lock or write lock. | ||
| 34 | * | ||
| 35 | * 0xffff000X (1) X readers active or attempting lock, with waiters for lock | ||
| 36 | * X = #active readers + # readers attempting lock | ||
| 37 | * (X*ACTIVE_BIAS + WAITING_BIAS) | ||
| 38 | * (2) 1 writer attempting lock, no waiters for lock | ||
| 39 | * X-1 = #active readers + #readers attempting lock | ||
| 40 | * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) | ||
| 41 | * (3) 1 writer active, no waiters for lock | ||
| 42 | * X-1 = #active readers + #readers attempting lock | ||
| 43 | * ((X-1)*ACTIVE_BIAS + ACTIVE_WRITE_BIAS) | ||
| 44 | * | ||
| 45 | * 0xffff0001 (1) 1 reader active or attempting lock, waiters for lock | ||
| 46 | * (WAITING_BIAS + ACTIVE_BIAS) | ||
| 47 | * (2) 1 writer active or attempting lock, no waiters for lock | ||
| 48 | * (ACTIVE_WRITE_BIAS) | ||
| 49 | * | ||
| 50 | * 0xffff0000 (1) There are writers or readers queued but none active | ||
| 51 | * or in the process of attempting lock. | ||
| 52 | * (WAITING_BIAS) | ||
| 53 | * Note: writer can attempt to steal lock for this count by adding | ||
| 54 | * ACTIVE_WRITE_BIAS in cmpxchg and checking the old count | ||
| 55 | * | ||
| 56 | * 0xfffe0001 (1) 1 writer active, or attempting lock. Waiters on queue. | ||
| 57 | * (ACTIVE_WRITE_BIAS + WAITING_BIAS) | ||
| 58 | * | ||
| 59 | * Note: Readers attempt to lock by adding ACTIVE_BIAS in down_read and checking | ||
| 60 | * the count becomes more than 0 for successful lock acquisition, | ||
| 61 | * i.e. the case where there are only readers or nobody has lock. | ||
| 62 | * (1st and 2nd case above). | ||
| 63 | * | ||
| 64 | * Writers attempt to lock by adding ACTIVE_WRITE_BIAS in down_write and | ||
| 65 | * checking that the count becomes ACTIVE_WRITE_BIAS for successful lock | ||
| 66 | * acquisition (i.e. nobody else has lock or attempts lock). If | ||
| 67 | * unsuccessful, in rwsem_down_write_failed, we'll check to see if there | ||
| 68 | * are only waiters but none active (5th case above), and attempt to | ||
| 69 | * steal the lock. | ||
| 70 | * | ||
| 71 | */ | ||
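Editor's aside: a minimal standalone sketch (not part of this diff) of the legacy bias arithmetic described in the comment above. The 32-bit constant values are derived from that comment (ACTIVE_BIAS = 1, WAITING_BIAS = 0xffff0000, ACTIVE_WRITE_BIAS = WAITING_BIAS + ACTIVE_BIAS); the real definitions lived in the old rwsem headers and may differ in form.

    #include <stdio.h>

    /* Illustrative 32-bit bias values, as implied by the comment above. */
    #define ACTIVE_BIAS        0x00000001UL
    #define WAITING_BIAS       0xffff0000UL
    #define ACTIVE_WRITE_BIAS  (WAITING_BIAS + ACTIVE_BIAS)

    int main(void)
    {
        unsigned long mask = 0xffffffffUL;   /* show the 32-bit view */

        /* Three readers hold the lock, nobody queued: 3 * ACTIVE_BIAS. */
        printf("3 readers, no waiters: 0x%08lx\n", (3 * ACTIVE_BIAS) & mask);

        /* Queue populated but nothing active: the value a writer cmpxchg()s on. */
        printf("waiters only:          0x%08lx\n", WAITING_BIAS & mask);

        /* One writer holds the lock with waiters queued behind it. */
        printf("writer + waiters:      0x%08lx\n",
               (ACTIVE_WRITE_BIAS + WAITING_BIAS) & mask);
        return 0;
    }

Expected output: 0x00000003, 0xffff0000 and 0xfffe0001, matching the table in the comment.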
| 72 | |||
| 73 | /* | ||
| 74 | * Initialize an rwsem: | ||
| 75 | */ | ||
| 76 | void __init_rwsem(struct rw_semaphore *sem, const char *name, | ||
| 77 | struct lock_class_key *key) | ||
| 78 | { | ||
| 79 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 80 | /* | ||
| 81 | * Make sure we are not reinitializing a held semaphore: | ||
| 82 | */ | ||
| 83 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); | ||
| 84 | lockdep_init_map(&sem->dep_map, name, key, 0); | ||
| 85 | #endif | ||
| 86 | atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); | ||
| 87 | raw_spin_lock_init(&sem->wait_lock); | ||
| 88 | INIT_LIST_HEAD(&sem->wait_list); | ||
| 89 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | ||
| 90 | sem->owner = NULL; | ||
| 91 | osq_lock_init(&sem->osq); | ||
| 92 | #endif | ||
| 93 | } | ||
| 94 | |||
| 95 | EXPORT_SYMBOL(__init_rwsem); | ||
| 96 | |||
| 97 | enum rwsem_waiter_type { | ||
| 98 | RWSEM_WAITING_FOR_WRITE, | ||
| 99 | RWSEM_WAITING_FOR_READ | ||
| 100 | }; | ||
| 101 | |||
| 102 | struct rwsem_waiter { | ||
| 103 | struct list_head list; | ||
| 104 | struct task_struct *task; | ||
| 105 | enum rwsem_waiter_type type; | ||
| 106 | }; | ||
| 107 | |||
| 108 | enum rwsem_wake_type { | ||
| 109 | RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ | ||
| 110 | RWSEM_WAKE_READERS, /* Wake readers only */ | ||
| 111 | RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ | ||
| 112 | }; | ||
| 113 | |||
| 114 | /* | ||
| 115 | * handle the lock release when there are processes blocked on it that can now run | ||
| 116 | * - if we come here from up_xxxx(), then: | ||
| 117 | * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) | ||
| 118 | * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) | ||
| 119 | * - there must be someone on the queue | ||
| 120 | * - the wait_lock must be held by the caller | ||
| 121 | * - tasks are marked for wakeup, the caller must later invoke wake_up_q() | ||
| 122 | * to actually wakeup the blocked task(s) and drop the reference count, | ||
| 123 | * preferably when the wait_lock is released | ||
| 124 | * - woken process blocks are discarded from the list after having task zeroed | ||
| 125 | * - writers are only marked woken if downgrading is false | ||
| 126 | */ | ||
| 127 | static void __rwsem_mark_wake(struct rw_semaphore *sem, | ||
| 128 | enum rwsem_wake_type wake_type, | ||
| 129 | struct wake_q_head *wake_q) | ||
| 130 | { | ||
| 131 | struct rwsem_waiter *waiter, *tmp; | ||
| 132 | long oldcount, woken = 0, adjustment = 0; | ||
| 133 | |||
| 134 | /* | ||
| 135 | * Take a peek at the queue head waiter such that we can determine | ||
| 136 | * the wakeup(s) to perform. | ||
| 137 | */ | ||
| 138 | waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list); | ||
| 139 | |||
| 140 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { | ||
| 141 | if (wake_type == RWSEM_WAKE_ANY) { | ||
| 142 | /* | ||
| 143 | * Mark writer at the front of the queue for wakeup. | ||
| 144 | * Until the task is actually awoken later by | ||
| 145 | * the caller, other writers are able to steal it. | ||
| 146 | * Readers, on the other hand, will block as they | ||
| 147 | * will notice the queued writer. | ||
| 148 | */ | ||
| 149 | wake_q_add(wake_q, waiter->task); | ||
| 150 | lockevent_inc(rwsem_wake_writer); | ||
| 151 | } | ||
| 152 | |||
| 153 | return; | ||
| 154 | } | ||
| 155 | |||
| 156 | /* | ||
| 157 | * Writers might steal the lock before we grant it to the next reader. | ||
| 158 | * We prefer to do the first reader grant before counting readers | ||
| 159 | * so we can bail out early if a writer stole the lock. | ||
| 160 | */ | ||
| 161 | if (wake_type != RWSEM_WAKE_READ_OWNED) { | ||
| 162 | adjustment = RWSEM_ACTIVE_READ_BIAS; | ||
| 163 | try_reader_grant: | ||
| 164 | oldcount = atomic_long_fetch_add(adjustment, &sem->count); | ||
| 165 | if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { | ||
| 166 | /* | ||
| 167 | * If the count is still less than RWSEM_WAITING_BIAS | ||
| 168 | * after removing the adjustment, it is assumed that | ||
| 169 | * a writer has stolen the lock. We have to undo our | ||
| 170 | * reader grant. | ||
| 171 | */ | ||
| 172 | if (atomic_long_add_return(-adjustment, &sem->count) < | ||
| 173 | RWSEM_WAITING_BIAS) | ||
| 174 | return; | ||
| 175 | |||
| 176 | /* Last active locker left. Retry waking readers. */ | ||
| 177 | goto try_reader_grant; | ||
| 178 | } | ||
| 179 | /* | ||
| 180 | * Set it to reader-owned to give spinners an early | ||
| 181 | * indication that readers now have the lock. | ||
| 182 | */ | ||
| 183 | __rwsem_set_reader_owned(sem, waiter->task); | ||
| 184 | } | ||
| 185 | |||
| 186 | /* | ||
| 187 | * Grant an infinite number of read locks to the readers at the front | ||
| 188 | * of the queue. We know that woken will be at least 1 as we accounted | ||
| 189 | * for above. Note we increment the 'active part' of the count by the | ||
| 190 | * number of readers before waking any processes up. | ||
| 191 | */ | ||
| 192 | list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { | ||
| 193 | struct task_struct *tsk; | ||
| 194 | |||
| 195 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) | ||
| 196 | break; | ||
| 197 | |||
| 198 | woken++; | ||
| 199 | tsk = waiter->task; | ||
| 200 | |||
| 201 | get_task_struct(tsk); | ||
| 202 | list_del(&waiter->list); | ||
| 203 | /* | ||
| 204 | * Ensure calling get_task_struct() before setting the reader | ||
| 205 | * waiter to nil such that rwsem_down_read_failed() cannot | ||
| 206 | * race with do_exit() by always holding a reference count | ||
| 207 | * to the task to wakeup. | ||
| 208 | */ | ||
| 209 | smp_store_release(&waiter->task, NULL); | ||
| 210 | /* | ||
| 211 | * Ensure issuing the wakeup (either by us or someone else) | ||
| 212 | * after setting the reader waiter to nil. | ||
| 213 | */ | ||
| 214 | wake_q_add_safe(wake_q, tsk); | ||
| 215 | } | ||
| 216 | |||
| 217 | adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; | ||
| 218 | lockevent_cond_inc(rwsem_wake_reader, woken); | ||
| 219 | if (list_empty(&sem->wait_list)) { | ||
| 220 | /* hit end of list above */ | ||
| 221 | adjustment -= RWSEM_WAITING_BIAS; | ||
| 222 | } | ||
| 223 | |||
| 224 | if (adjustment) | ||
| 225 | atomic_long_add(adjustment, &sem->count); | ||
| 226 | } | ||
| 227 | |||
| 228 | /* | ||
| 229 | * This function must be called with the sem->wait_lock held to prevent | ||
| 230 | * race conditions between checking the rwsem wait list and setting the | ||
| 231 | * sem->count accordingly. | ||
| 232 | */ | ||
| 233 | static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) | ||
| 234 | { | ||
| 235 | /* | ||
| 236 | * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. | ||
| 237 | */ | ||
| 238 | if (count != RWSEM_WAITING_BIAS) | ||
| 239 | return false; | ||
| 240 | |||
| 241 | /* | ||
| 242 | * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there | ||
| 243 | * are other tasks on the wait list, we need to add on WAITING_BIAS. | ||
| 244 | */ | ||
| 245 | count = list_is_singular(&sem->wait_list) ? | ||
| 246 | RWSEM_ACTIVE_WRITE_BIAS : | ||
| 247 | RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; | ||
| 248 | |||
| 249 | if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) | ||
| 250 | == RWSEM_WAITING_BIAS) { | ||
| 251 | rwsem_set_owner(sem); | ||
| 252 | return true; | ||
| 253 | } | ||
| 254 | |||
| 255 | return false; | ||
| 256 | } | ||
| 257 | |||
| 258 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | ||
| 259 | /* | ||
| 260 | * Try to acquire write lock before the writer has been put on wait queue. | ||
| 261 | */ | ||
| 262 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | ||
| 263 | { | ||
| 264 | long count = atomic_long_read(&sem->count); | ||
| 265 | |||
| 266 | while (!count || count == RWSEM_WAITING_BIAS) { | ||
| 267 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, | ||
| 268 | count + RWSEM_ACTIVE_WRITE_BIAS)) { | ||
| 269 | rwsem_set_owner(sem); | ||
| 270 | lockevent_inc(rwsem_opt_wlock); | ||
| 271 | return true; | ||
| 272 | } | ||
| 273 | } | ||
| 274 | return false; | ||
| 275 | } | ||
| 276 | |||
| 277 | static inline bool owner_on_cpu(struct task_struct *owner) | ||
| 278 | { | ||
| 279 | /* | ||
| 280 | * To cope with lock holder preemption, we skip spinning if the | ||
| 281 | * task is not on a CPU or its CPU is preempted. | ||
| 282 | */ | ||
| 283 | return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); | ||
| 284 | } | ||
| 285 | |||
| 286 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) | ||
| 287 | { | ||
| 288 | struct task_struct *owner; | ||
| 289 | bool ret = true; | ||
| 290 | |||
| 291 | BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); | ||
| 292 | |||
| 293 | if (need_resched()) | ||
| 294 | return false; | ||
| 295 | |||
| 296 | rcu_read_lock(); | ||
| 297 | owner = READ_ONCE(sem->owner); | ||
| 298 | if (owner) { | ||
| 299 | ret = is_rwsem_owner_spinnable(owner) && | ||
| 300 | owner_on_cpu(owner); | ||
| 301 | } | ||
| 302 | rcu_read_unlock(); | ||
| 303 | return ret; | ||
| 304 | } | ||
| 305 | |||
| 306 | /* | ||
| 307 | * Return true only if we can still spin on the owner field of the rwsem. | ||
| 308 | */ | ||
| 309 | static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) | ||
| 310 | { | ||
| 311 | struct task_struct *owner = READ_ONCE(sem->owner); | ||
| 312 | |||
| 313 | if (!is_rwsem_owner_spinnable(owner)) | ||
| 314 | return false; | ||
| 315 | |||
| 316 | rcu_read_lock(); | ||
| 317 | while (owner && (READ_ONCE(sem->owner) == owner)) { | ||
| 318 | /* | ||
| 319 | * Ensure we emit the owner->on_cpu dereference _after_ | ||
| 320 | * checking that sem->owner still matches owner. If that fails, | ||
| 321 | * owner might point to free()d memory; if it still matches, | ||
| 322 | * the rcu_read_lock() ensures the memory stays valid. | ||
| 323 | */ | ||
| 324 | barrier(); | ||
| 325 | |||
| 326 | /* | ||
| 327 | * abort spinning when need_resched or owner is not running or | ||
| 328 | * owner's cpu is preempted. | ||
| 329 | */ | ||
| 330 | if (need_resched() || !owner_on_cpu(owner)) { | ||
| 331 | rcu_read_unlock(); | ||
| 332 | return false; | ||
| 333 | } | ||
| 334 | |||
| 335 | cpu_relax(); | ||
| 336 | } | ||
| 337 | rcu_read_unlock(); | ||
| 338 | |||
| 339 | /* | ||
| 340 | * If there is a new owner or the owner is not set, we continue | ||
| 341 | * spinning. | ||
| 342 | */ | ||
| 343 | return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); | ||
| 344 | } | ||
| 345 | |||
| 346 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | ||
| 347 | { | ||
| 348 | bool taken = false; | ||
| 349 | |||
| 350 | preempt_disable(); | ||
| 351 | |||
| 352 | /* sem->wait_lock should not be held when doing optimistic spinning */ | ||
| 353 | if (!rwsem_can_spin_on_owner(sem)) | ||
| 354 | goto done; | ||
| 355 | |||
| 356 | if (!osq_lock(&sem->osq)) | ||
| 357 | goto done; | ||
| 358 | |||
| 359 | /* | ||
| 360 | * Optimistically spin on the owner field and attempt to acquire the | ||
| 361 | * lock whenever the owner changes. Spinning will be stopped when: | ||
| 362 | * 1) the owning writer isn't running; or | ||
| 363 | * 2) readers own the lock as we can't determine if they are | ||
| 364 | * actively running or not. | ||
| 365 | */ | ||
| 366 | while (rwsem_spin_on_owner(sem)) { | ||
| 367 | /* | ||
| 368 | * Try to acquire the lock | ||
| 369 | */ | ||
| 370 | if (rwsem_try_write_lock_unqueued(sem)) { | ||
| 371 | taken = true; | ||
| 372 | break; | ||
| 373 | } | ||
| 374 | |||
| 375 | /* | ||
| 376 | * When there's no owner, we might have preempted between the | ||
| 377 | * owner acquiring the lock and setting the owner field. If | ||
| 378 | * we're an RT task that will live-lock because we won't let | ||
| 379 | * the owner complete. | ||
| 380 | */ | ||
| 381 | if (!sem->owner && (need_resched() || rt_task(current))) | ||
| 382 | break; | ||
| 383 | |||
| 384 | /* | ||
| 385 | * The cpu_relax() call is a compiler barrier which forces | ||
| 386 | * everything in this loop to be re-loaded. We don't need | ||
| 387 | * memory barriers as we'll eventually observe the right | ||
| 388 | * values at the cost of a few extra spins. | ||
| 389 | */ | ||
| 390 | cpu_relax(); | ||
| 391 | } | ||
| 392 | osq_unlock(&sem->osq); | ||
| 393 | done: | ||
| 394 | preempt_enable(); | ||
| 395 | lockevent_cond_inc(rwsem_opt_fail, !taken); | ||
| 396 | return taken; | ||
| 397 | } | ||
| 398 | |||
| 399 | /* | ||
| 400 | * Return true if the rwsem has active spinner | ||
| 401 | */ | ||
| 402 | static inline bool rwsem_has_spinner(struct rw_semaphore *sem) | ||
| 403 | { | ||
| 404 | return osq_is_locked(&sem->osq); | ||
| 405 | } | ||
| 406 | |||
| 407 | #else | ||
| 408 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem) | ||
| 409 | { | ||
| 410 | return false; | ||
| 411 | } | ||
| 412 | |||
| 413 | static inline bool rwsem_has_spinner(struct rw_semaphore *sem) | ||
| 414 | { | ||
| 415 | return false; | ||
| 416 | } | ||
| 417 | #endif | ||
| 418 | |||
| 419 | /* | ||
| 420 | * Wait for the read lock to be granted | ||
| 421 | */ | ||
| 422 | static inline struct rw_semaphore __sched * | ||
| 423 | __rwsem_down_read_failed_common(struct rw_semaphore *sem, int state) | ||
| 424 | { | ||
| 425 | long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; | ||
| 426 | struct rwsem_waiter waiter; | ||
| 427 | DEFINE_WAKE_Q(wake_q); | ||
| 428 | |||
| 429 | waiter.task = current; | ||
| 430 | waiter.type = RWSEM_WAITING_FOR_READ; | ||
| 431 | |||
| 432 | raw_spin_lock_irq(&sem->wait_lock); | ||
| 433 | if (list_empty(&sem->wait_list)) { | ||
| 434 | /* | ||
| 435 | * In case the wait queue is empty and the lock isn't owned | ||
| 436 | * by a writer, this reader can exit the slowpath and return | ||
| 437 | * immediately as its RWSEM_ACTIVE_READ_BIAS has already | ||
| 438 | * been set in the count. | ||
| 439 | */ | ||
| 440 | if (atomic_long_read(&sem->count) >= 0) { | ||
| 441 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 442 | rwsem_set_reader_owned(sem); | ||
| 443 | lockevent_inc(rwsem_rlock_fast); | ||
| 444 | return sem; | ||
| 445 | } | ||
| 446 | adjustment += RWSEM_WAITING_BIAS; | ||
| 447 | } | ||
| 448 | list_add_tail(&waiter.list, &sem->wait_list); | ||
| 449 | |||
| 450 | /* we're now waiting on the lock, but no longer actively locking */ | ||
| 451 | count = atomic_long_add_return(adjustment, &sem->count); | ||
| 452 | |||
| 453 | /* | ||
| 454 | * If there are no active locks, wake the front queued process(es). | ||
| 455 | * | ||
| 456 | * If there are no writers and we are first in the queue, | ||
| 457 | * wake our own waiter to join the existing active readers ! | ||
| 458 | */ | ||
| 459 | if (count == RWSEM_WAITING_BIAS || | ||
| 460 | (count > RWSEM_WAITING_BIAS && | ||
| 461 | adjustment != -RWSEM_ACTIVE_READ_BIAS)) | ||
| 462 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | ||
| 463 | |||
| 464 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 465 | wake_up_q(&wake_q); | ||
| 466 | |||
| 467 | /* wait to be given the lock */ | ||
| 468 | while (true) { | ||
| 469 | set_current_state(state); | ||
| 470 | if (!waiter.task) | ||
| 471 | break; | ||
| 472 | if (signal_pending_state(state, current)) { | ||
| 473 | raw_spin_lock_irq(&sem->wait_lock); | ||
| 474 | if (waiter.task) | ||
| 475 | goto out_nolock; | ||
| 476 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 477 | break; | ||
| 478 | } | ||
| 479 | schedule(); | ||
| 480 | lockevent_inc(rwsem_sleep_reader); | ||
| 481 | } | ||
| 482 | |||
| 483 | __set_current_state(TASK_RUNNING); | ||
| 484 | lockevent_inc(rwsem_rlock); | ||
| 485 | return sem; | ||
| 486 | out_nolock: | ||
| 487 | list_del(&waiter.list); | ||
| 488 | if (list_empty(&sem->wait_list)) | ||
| 489 | atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); | ||
| 490 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 491 | __set_current_state(TASK_RUNNING); | ||
| 492 | lockevent_inc(rwsem_rlock_fail); | ||
| 493 | return ERR_PTR(-EINTR); | ||
| 494 | } | ||
| 495 | |||
| 496 | __visible struct rw_semaphore * __sched | ||
| 497 | rwsem_down_read_failed(struct rw_semaphore *sem) | ||
| 498 | { | ||
| 499 | return __rwsem_down_read_failed_common(sem, TASK_UNINTERRUPTIBLE); | ||
| 500 | } | ||
| 501 | EXPORT_SYMBOL(rwsem_down_read_failed); | ||
| 502 | |||
| 503 | __visible struct rw_semaphore * __sched | ||
| 504 | rwsem_down_read_failed_killable(struct rw_semaphore *sem) | ||
| 505 | { | ||
| 506 | return __rwsem_down_read_failed_common(sem, TASK_KILLABLE); | ||
| 507 | } | ||
| 508 | EXPORT_SYMBOL(rwsem_down_read_failed_killable); | ||
| 509 | |||
| 510 | /* | ||
| 511 | * Wait until we successfully acquire the write lock | ||
| 512 | */ | ||
| 513 | static inline struct rw_semaphore * | ||
| 514 | __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) | ||
| 515 | { | ||
| 516 | long count; | ||
| 517 | bool waiting = true; /* any queued threads before us */ | ||
| 518 | struct rwsem_waiter waiter; | ||
| 519 | struct rw_semaphore *ret = sem; | ||
| 520 | DEFINE_WAKE_Q(wake_q); | ||
| 521 | |||
| 522 | /* undo write bias from down_write operation, stop active locking */ | ||
| 523 | count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); | ||
| 524 | |||
| 525 | /* do optimistic spinning and steal lock if possible */ | ||
| 526 | if (rwsem_optimistic_spin(sem)) | ||
| 527 | return sem; | ||
| 528 | |||
| 529 | /* | ||
| 530 | * Optimistic spinning failed, proceed to the slowpath | ||
| 531 | * and block until we can acquire the sem. | ||
| 532 | */ | ||
| 533 | waiter.task = current; | ||
| 534 | waiter.type = RWSEM_WAITING_FOR_WRITE; | ||
| 535 | |||
| 536 | raw_spin_lock_irq(&sem->wait_lock); | ||
| 537 | |||
| 538 | /* account for this before adding a new element to the list */ | ||
| 539 | if (list_empty(&sem->wait_list)) | ||
| 540 | waiting = false; | ||
| 541 | |||
| 542 | list_add_tail(&waiter.list, &sem->wait_list); | ||
| 543 | |||
| 544 | /* we're now waiting on the lock, but no longer actively locking */ | ||
| 545 | if (waiting) { | ||
| 546 | count = atomic_long_read(&sem->count); | ||
| 547 | |||
| 548 | /* | ||
| 549 | * If there were already threads queued before us and there are | ||
| 550 | * no active writers, the lock must be read-owned; so we try to | ||
| 551 | * wake any readers that were queued ahead of us. | ||
| 552 | */ | ||
| 553 | if (count > RWSEM_WAITING_BIAS) { | ||
| 554 | __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); | ||
| 555 | /* | ||
| 556 | * The wakeup is normally called _after_ the wait_lock | ||
| 557 | * is released, but given that we are proactively waking | ||
| 558 | * readers we can deal with the wake_q overhead as it is | ||
| 559 | * similar to releasing and taking the wait_lock again | ||
| 560 | * for attempting rwsem_try_write_lock(). | ||
| 561 | */ | ||
| 562 | wake_up_q(&wake_q); | ||
| 563 | |||
| 564 | /* | ||
| 565 | * Reinitialize wake_q after use. | ||
| 566 | */ | ||
| 567 | wake_q_init(&wake_q); | ||
| 568 | } | ||
| 569 | |||
| 570 | } else | ||
| 571 | count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); | ||
| 572 | |||
| 573 | /* wait until we successfully acquire the lock */ | ||
| 574 | set_current_state(state); | ||
| 575 | while (true) { | ||
| 576 | if (rwsem_try_write_lock(count, sem)) | ||
| 577 | break; | ||
| 578 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 579 | |||
| 580 | /* Block until there are no active lockers. */ | ||
| 581 | do { | ||
| 582 | if (signal_pending_state(state, current)) | ||
| 583 | goto out_nolock; | ||
| 584 | |||
| 585 | schedule(); | ||
| 586 | lockevent_inc(rwsem_sleep_writer); | ||
| 587 | set_current_state(state); | ||
| 588 | } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); | ||
| 589 | |||
| 590 | raw_spin_lock_irq(&sem->wait_lock); | ||
| 591 | } | ||
| 592 | __set_current_state(TASK_RUNNING); | ||
| 593 | list_del(&waiter.list); | ||
| 594 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 595 | lockevent_inc(rwsem_wlock); | ||
| 596 | |||
| 597 | return ret; | ||
| 598 | |||
| 599 | out_nolock: | ||
| 600 | __set_current_state(TASK_RUNNING); | ||
| 601 | raw_spin_lock_irq(&sem->wait_lock); | ||
| 602 | list_del(&waiter.list); | ||
| 603 | if (list_empty(&sem->wait_list)) | ||
| 604 | atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); | ||
| 605 | else | ||
| 606 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | ||
| 607 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 608 | wake_up_q(&wake_q); | ||
| 609 | lockevent_inc(rwsem_wlock_fail); | ||
| 610 | |||
| 611 | return ERR_PTR(-EINTR); | ||
| 612 | } | ||
| 613 | |||
| 614 | __visible struct rw_semaphore * __sched | ||
| 615 | rwsem_down_write_failed(struct rw_semaphore *sem) | ||
| 616 | { | ||
| 617 | return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE); | ||
| 618 | } | ||
| 619 | EXPORT_SYMBOL(rwsem_down_write_failed); | ||
| 620 | |||
| 621 | __visible struct rw_semaphore * __sched | ||
| 622 | rwsem_down_write_failed_killable(struct rw_semaphore *sem) | ||
| 623 | { | ||
| 624 | return __rwsem_down_write_failed_common(sem, TASK_KILLABLE); | ||
| 625 | } | ||
| 626 | EXPORT_SYMBOL(rwsem_down_write_failed_killable); | ||
| 627 | |||
| 628 | /* | ||
| 629 | * handle waking up a waiter on the semaphore | ||
| 630 | * - up_read/up_write has decremented the active part of count if we come here | ||
| 631 | */ | ||
| 632 | __visible | ||
| 633 | struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) | ||
| 634 | { | ||
| 635 | unsigned long flags; | ||
| 636 | DEFINE_WAKE_Q(wake_q); | ||
| 637 | |||
| 638 | /* | ||
| 639 | * __rwsem_down_write_failed_common(sem) | ||
| 640 | * rwsem_optimistic_spin(sem) | ||
| 641 | * osq_unlock(sem->osq) | ||
| 642 | * ... | ||
| 643 | * atomic_long_add_return(&sem->count) | ||
| 644 | * | ||
| 645 | * - VS - | ||
| 646 | * | ||
| 647 | * __up_write() | ||
| 648 | * if (atomic_long_sub_return_release(&sem->count) < 0) | ||
| 649 | * rwsem_wake(sem) | ||
| 650 | * osq_is_locked(&sem->osq) | ||
| 651 | * | ||
| 652 | * And __up_write() must observe !osq_is_locked() when it observes the | ||
| 653 | * atomic_long_add_return() in order to not miss a wakeup. | ||
| 654 | * | ||
| 655 | * This boils down to: | ||
| 656 | * | ||
| 657 | * [S.rel] X = 1 [RmW] r0 = (Y += 0) | ||
| 658 | * MB RMB | ||
| 659 | * [RmW] Y += 1 [L] r1 = X | ||
| 660 | * | ||
| 661 | * exists (r0=1 /\ r1=0) | ||
| 662 | */ | ||
| 663 | smp_rmb(); | ||
| 664 | |||
| 665 | /* | ||
| 666 | * If a spinner is present, it is not necessary to do the wakeup. | ||
| 667 | * Try to do wakeup only if the trylock succeeds to minimize | ||
| 668 | * spinlock contention which may introduce too much delay in the | ||
| 669 | * unlock operation. | ||
| 670 | * | ||
| 671 | * spinning writer up_write/up_read caller | ||
| 672 | * --------------- ----------------------- | ||
| 673 | * [S] osq_unlock() [L] osq | ||
| 674 | * MB RMB | ||
| 675 | * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock) | ||
| 676 | * | ||
| 677 | * Here, it is important to make sure that there won't be a missed | ||
| 678 | * wakeup while the rwsem is free and the only spinning writer goes | ||
| 679 | * to sleep without taking the rwsem. Even when the spinning writer | ||
| 680 | * is just going to break out of the waiting loop, it will still do | ||
| 681 | * a trylock in rwsem_down_write_failed() before sleeping. IOW, if | ||
| 682 | * rwsem_has_spinner() is true, it will guarantee at least one | ||
| 683 | * trylock attempt on the rwsem later on. | ||
| 684 | */ | ||
| 685 | if (rwsem_has_spinner(sem)) { | ||
| 686 | /* | ||
| 687 | * The smp_rmb() here is to make sure that the spinner | ||
| 688 | * state is consulted before reading the wait_lock. | ||
| 689 | */ | ||
| 690 | smp_rmb(); | ||
| 691 | if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags)) | ||
| 692 | return sem; | ||
| 693 | goto locked; | ||
| 694 | } | ||
| 695 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
| 696 | locked: | ||
| 697 | |||
| 698 | if (!list_empty(&sem->wait_list)) | ||
| 699 | __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | ||
| 700 | |||
| 701 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
| 702 | wake_up_q(&wake_q); | ||
| 703 | |||
| 704 | return sem; | ||
| 705 | } | ||
| 706 | EXPORT_SYMBOL(rwsem_wake); | ||
| 707 | |||
| 708 | /* | ||
| 709 | * downgrade a write lock into a read lock | ||
| 710 | * - caller incremented waiting part of count and discovered it still negative | ||
| 711 | * - just wake up any readers at the front of the queue | ||
| 712 | */ | ||
| 713 | __visible | ||
| 714 | struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) | ||
| 715 | { | ||
| 716 | unsigned long flags; | ||
| 717 | DEFINE_WAKE_Q(wake_q); | ||
| 718 | |||
| 719 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
| 720 | |||
| 721 | if (!list_empty(&sem->wait_list)) | ||
| 722 | __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); | ||
| 723 | |||
| 724 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
| 725 | wake_up_q(&wake_q); | ||
| 726 | |||
| 727 | return sem; | ||
| 728 | } | ||
| 729 | EXPORT_SYMBOL(rwsem_downgrade_wake); | ||
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index ccbf18f560ff..37524a47f002 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c | |||
| @@ -3,17 +3,1438 @@ | |||
| 3 | * | 3 | * |
| 4 | * Written by David Howells (dhowells@redhat.com). | 4 | * Written by David Howells (dhowells@redhat.com). |
| 5 | * Derived from asm-i386/semaphore.h | 5 | * Derived from asm-i386/semaphore.h |
| 6 | * | ||
| 7 | * Writer lock-stealing by Alex Shi <alex.shi@intel.com> | ||
| 8 | * and Michel Lespinasse <walken@google.com> | ||
| 9 | * | ||
| 10 | * Optimistic spinning by Tim Chen <tim.c.chen@intel.com> | ||
| 11 | * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes. | ||
| 12 | * | ||
| 13 | * Rwsem count bit fields re-definition and rwsem rearchitecture by | ||
| 14 | * Waiman Long <longman@redhat.com> and | ||
| 15 | * Peter Zijlstra <peterz@infradead.org>. | ||
| 6 | */ | 16 | */ |
| 7 | 17 | ||
| 8 | #include <linux/types.h> | 18 | #include <linux/types.h> |
| 9 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
| 10 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
| 21 | #include <linux/sched/rt.h> | ||
| 22 | #include <linux/sched/task.h> | ||
| 11 | #include <linux/sched/debug.h> | 23 | #include <linux/sched/debug.h> |
| 24 | #include <linux/sched/wake_q.h> | ||
| 25 | #include <linux/sched/signal.h> | ||
| 26 | #include <linux/sched/clock.h> | ||
| 12 | #include <linux/export.h> | 27 | #include <linux/export.h> |
| 13 | #include <linux/rwsem.h> | 28 | #include <linux/rwsem.h> |
| 14 | #include <linux/atomic.h> | 29 | #include <linux/atomic.h> |
| 15 | 30 | ||
| 16 | #include "rwsem.h" | 31 | #include "rwsem.h" |
| 32 | #include "lock_events.h" | ||
| 33 | |||
| 34 | /* | ||
| 35 | * The least significant 3 bits of the owner value have the following | ||
| 36 | * meanings when set. | ||
| 37 | * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers | ||
| 38 | * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock. | ||
| 39 | * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock. | ||
| 40 | * | ||
| 41 | * When the rwsem is either owned by an anonymous writer, or it is | ||
| 42 | * reader-owned, but a spinning writer has timed out, both nonspinnable | ||
| 43 | * bits will be set to disable optimistic spinning by readers and writers. | ||
| 44 | * In the latter case, the last unlocking reader should then check the | ||
| 45 | * writer nonspinnable bit and clear it only to give writers preference | ||
| 46 | * to acquire the lock via optimistic spinning, but not readers. Similar | ||
| 47 | * action is also done in the reader slowpath. | ||
| 48 | * | ||
| 49 | * When a writer acquires a rwsem, it puts its task_struct pointer | ||
| 50 | * into the owner field. It is cleared after an unlock. | ||
| 51 | * | ||
| 52 | * When a reader acquires a rwsem, it will also put its task_struct | ||
| 53 | * pointer into the owner field with the RWSEM_READER_OWNED bit set. | ||
| 54 | * On unlock, the owner field will largely be left untouched. So | ||
| 55 | * for a free or reader-owned rwsem, the owner value may contain | ||
| 56 | * information about the last reader that acquired the rwsem. | ||
| 57 | * | ||
| 58 | * That information may be helpful in debugging cases where the system | ||
| 59 | * seems to hang on a reader owned rwsem especially if only one reader | ||
| 60 | * is involved. Ideally we would like to track all the readers that own | ||
| 61 | * a rwsem, but the overhead is simply too big. | ||
| 62 | * | ||
| 63 | * Reader optimistic spinning is helpful when the reader critical section | ||
| 64 | * is short and there aren't that many readers around. It makes readers | ||
| 65 | * relatively more preferred than writers. When a writer times out spinning | ||
| 66 | * on a reader-owned lock and set the nospinnable bits, there are two main | ||
| 67 | * reasons for that. | ||
| 68 | * | ||
| 69 | * 1) The reader critical section is long, perhaps the task sleeps after | ||
| 70 | * acquiring the read lock. | ||
| 71 | * 2) There are just too many readers contending the lock causing it to | ||
| 72 | * take a while to service all of them. | ||
| 73 | * | ||
| 74 | * In the former case, a long reader critical section will impede the progress | ||
| 75 | * of writers, which is usually more important for system performance. In | ||
| 76 | * the latter case, reader optimistic spinning tends to make the reader | ||
| 77 | * groups that contain readers that acquire the lock together smaller, | ||
| 78 | * leading to more of them. That may hurt performance in some cases. In | ||
| 79 | * other words, the setting of nonspinnable bits indicates that reader | ||
| 80 | * optimistic spinning may not be helpful for those workloads that cause | ||
| 81 | * it. | ||
| 82 | * | ||
| 83 | * Therefore, any writers that had observed the setting of the writer | ||
| 84 | * nonspinnable bit for a given rwsem after they fail to acquire the lock | ||
| 85 | * via optimistic spinning will set the reader nonspinnable bit once they | ||
| 86 | * acquire the write lock. Similarly, readers that observe the setting | ||
| 87 | * of reader nonspinnable bit at slowpath entry will set the reader | ||
| 88 | * nonspinnable bits when they acquire the read lock via the wakeup path. | ||
| 89 | * | ||
| 90 | * Once the reader nonspinnable bit is on, it will only be reset when | ||
| 91 | * a writer is able to acquire the rwsem in the fast path or somehow a | ||
| 92 | * reader or writer in the slowpath doesn't observe the nonspinnable bit. | ||
| 93 | * | ||
| 94 | * This is to discourage reader optimistic spinning on that particular | ||
| 95 | * rwsem and make writers more preferred. This adaptive disabling of reader | ||
| 96 | * optimistic spinning will alleviate the negative side effect of this | ||
| 97 | * feature. | ||
| 98 | */ | ||
| 99 | #define RWSEM_READER_OWNED (1UL << 0) | ||
| 100 | #define RWSEM_RD_NONSPINNABLE (1UL << 1) | ||
| 101 | #define RWSEM_WR_NONSPINNABLE (1UL << 2) | ||
| 102 | #define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE) | ||
| 103 | #define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) | ||
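For orientation while reading the code below, a small standalone sketch (not part of this diff) of how an owner word packs a task_struct pointer together with the three flag bits defined above. The pointer value is made up and merely 8-byte aligned; only the packing/unpacking with RWSEM_OWNER_FLAGS_MASK is the point.

    #include <stdio.h>

    #define RWSEM_READER_OWNED      (1UL << 0)
    #define RWSEM_RD_NONSPINNABLE   (1UL << 1)
    #define RWSEM_WR_NONSPINNABLE   (1UL << 2)
    #define RWSEM_NONSPINNABLE      (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE)
    #define RWSEM_OWNER_FLAGS_MASK  (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE)

    int main(void)
    {
        /* Hypothetical task_struct address; alignment keeps the low 3 bits free. */
        unsigned long task = 0x12345600UL;

        /* A reader owner that also carries the reader-nonspinnable hint. */
        unsigned long owner = task | RWSEM_READER_OWNED | RWSEM_RD_NONSPINNABLE;

        printf("task  = 0x%lx\n", owner & ~RWSEM_OWNER_FLAGS_MASK); /* recovers the pointer */
        printf("flags = 0x%lx\n", owner & RWSEM_OWNER_FLAGS_MASK);  /* 0x3 */
        return 0;
    }

This mirrors what rwsem_owner_flags() further down does with the real owner field.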
| 104 | |||
| 105 | #ifdef CONFIG_DEBUG_RWSEMS | ||
| 106 | # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ | ||
| 107 | if (!debug_locks_silent && \ | ||
| 108 | WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ | ||
| 109 | #c, atomic_long_read(&(sem)->count), \ | ||
| 110 | atomic_long_read(&(sem)->owner), (long)current, \ | ||
| 111 | list_empty(&(sem)->wait_list) ? "" : "not ")) \ | ||
| 112 | debug_locks_off(); \ | ||
| 113 | } while (0) | ||
| 114 | #else | ||
| 115 | # define DEBUG_RWSEMS_WARN_ON(c, sem) | ||
| 116 | #endif | ||
| 117 | |||
| 118 | /* | ||
| 119 | * On 64-bit architectures, the bit definitions of the count are: | ||
| 120 | * | ||
| 121 | * Bit 0 - writer locked bit | ||
| 122 | * Bit 1 - waiters present bit | ||
| 123 | * Bit 2 - lock handoff bit | ||
| 124 | * Bits 3-7 - reserved | ||
| 125 | * Bits 8-62 - 55-bit reader count | ||
| 126 | * Bit 63 - read fail bit | ||
| 127 | * | ||
| 128 | * On 32-bit architectures, the bit definitions of the count are: | ||
| 129 | * | ||
| 130 | * Bit 0 - writer locked bit | ||
| 131 | * Bit 1 - waiters present bit | ||
| 132 | * Bit 2 - lock handoff bit | ||
| 133 | * Bits 3-7 - reserved | ||
| 134 | * Bits 8-30 - 23-bit reader count | ||
| 135 | * Bit 31 - read fail bit | ||
| 136 | * | ||
| 137 | * It is not likely that the most significant bit (read fail bit) will ever | ||
| 138 | * be set. This guard bit is still checked anyway in the down_read() fastpath | ||
| 139 | * just in case we need to use up more of the reader bits for other purposes | ||
| 140 | * in the future. | ||
| 141 | * | ||
| 142 | * atomic_long_fetch_add() is used to obtain reader lock, whereas | ||
| 143 | * atomic_long_cmpxchg() will be used to obtain writer lock. | ||
| 144 | * | ||
| 145 | * There are three places where the lock handoff bit may be set or cleared. | ||
| 146 | * 1) rwsem_mark_wake() for readers. | ||
| 147 | * 2) rwsem_try_write_lock() for writers. | ||
| 148 | * 3) Error path of rwsem_down_write_slowpath(). | ||
| 149 | * | ||
| 150 | * For all the above cases, wait_lock will be held. A writer must also | ||
| 151 | * be the first one in the wait_list to be eligible for setting the handoff | ||
| 152 | * bit. So concurrent setting/clearing of handoff bit is not possible. | ||
| 153 | */ | ||
| 154 | #define RWSEM_WRITER_LOCKED (1UL << 0) | ||
| 155 | #define RWSEM_FLAG_WAITERS (1UL << 1) | ||
| 156 | #define RWSEM_FLAG_HANDOFF (1UL << 2) | ||
| 157 | #define RWSEM_FLAG_READFAIL (1UL << (BITS_PER_LONG - 1)) | ||
| 158 | |||
| 159 | #define RWSEM_READER_SHIFT 8 | ||
| 160 | #define RWSEM_READER_BIAS (1UL << RWSEM_READER_SHIFT) | ||
| 161 | #define RWSEM_READER_MASK (~(RWSEM_READER_BIAS - 1)) | ||
| 162 | #define RWSEM_WRITER_MASK RWSEM_WRITER_LOCKED | ||
| 163 | #define RWSEM_LOCK_MASK (RWSEM_WRITER_MASK|RWSEM_READER_MASK) | ||
| 164 | #define RWSEM_READ_FAILED_MASK (RWSEM_WRITER_MASK|RWSEM_FLAG_WAITERS|\ | ||
| 165 | RWSEM_FLAG_HANDOFF|RWSEM_FLAG_READFAIL) | ||
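To make the count layout above concrete, a standalone sketch (not part of this diff) that encodes and decodes one example value: three readers holding the lock with at least one waiter queued.

    #include <stdio.h>

    #define RWSEM_WRITER_LOCKED  (1UL << 0)
    #define RWSEM_FLAG_WAITERS   (1UL << 1)
    #define RWSEM_FLAG_HANDOFF   (1UL << 2)
    #define RWSEM_READER_SHIFT   8
    #define RWSEM_READER_BIAS    (1UL << RWSEM_READER_SHIFT)
    #define RWSEM_READER_MASK    (~(RWSEM_READER_BIAS - 1))

    int main(void)
    {
        /* Three readers hold the lock and at least one waiter is queued. */
        unsigned long count = 3 * RWSEM_READER_BIAS | RWSEM_FLAG_WAITERS;

        printf("count        = 0x%lx\n", count);                      /* 0x302 */
        printf("reader count = %lu\n",
               (count & RWSEM_READER_MASK) >> RWSEM_READER_SHIFT);    /* 3 */
        printf("write locked = %s\n",
               (count & RWSEM_WRITER_LOCKED) ? "yes" : "no");         /* no */
        printf("waiters      = %s\n",
               (count & RWSEM_FLAG_WAITERS) ? "yes" : "no");          /* yes */
        printf("handoff      = %s\n",
               (count & RWSEM_FLAG_HANDOFF) ? "yes" : "no");          /* no */
        return 0;
    }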
| 166 | |||
| 167 | /* | ||
| 168 | * All writes to owner are protected by WRITE_ONCE() to make sure that | ||
| 169 | * store tearing can't happen as optimistic spinners may read and use | ||
| 170 | * the owner value concurrently without lock. Read from owner, however, | ||
| 171 | * may not need READ_ONCE() as long as the pointer value is only used | ||
| 172 | * for comparison and isn't being dereferenced. | ||
| 173 | */ | ||
| 174 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
| 175 | { | ||
| 176 | atomic_long_set(&sem->owner, (long)current); | ||
| 177 | } | ||
| 178 | |||
| 179 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
| 180 | { | ||
| 181 | atomic_long_set(&sem->owner, 0); | ||
| 182 | } | ||
| 183 | |||
| 184 | /* | ||
| 185 | * Test the flags in the owner field. | ||
| 186 | */ | ||
| 187 | static inline bool rwsem_test_oflags(struct rw_semaphore *sem, long flags) | ||
| 188 | { | ||
| 189 | return atomic_long_read(&sem->owner) & flags; | ||
| 190 | } | ||
| 191 | |||
| 192 | /* | ||
| 193 | * The task_struct pointer of the last owning reader will be left in | ||
| 194 | * the owner field. | ||
| 195 | * | ||
| 196 | * Note that the owner value just indicates the task has owned the rwsem | ||
| 197 | * previously; it may not be the real owner or one of the real owners | ||
| 198 | * anymore when that field is examined, so take it with a grain of salt. | ||
| 199 | * | ||
| 200 | * The reader non-spinnable bit is preserved. | ||
| 201 | */ | ||
| 202 | static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, | ||
| 203 | struct task_struct *owner) | ||
| 204 | { | ||
| 205 | unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | | ||
| 206 | (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE); | ||
| 207 | |||
| 208 | atomic_long_set(&sem->owner, val); | ||
| 209 | } | ||
| 210 | |||
| 211 | static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) | ||
| 212 | { | ||
| 213 | __rwsem_set_reader_owned(sem, current); | ||
| 214 | } | ||
| 215 | |||
| 216 | /* | ||
| 217 | * Return true if the rwsem is owned by a reader. | ||
| 218 | */ | ||
| 219 | static inline bool is_rwsem_reader_owned(struct rw_semaphore *sem) | ||
| 220 | { | ||
| 221 | #ifdef CONFIG_DEBUG_RWSEMS | ||
| 222 | /* | ||
| 223 | * Check the count to see if it is write-locked. | ||
| 224 | */ | ||
| 225 | long count = atomic_long_read(&sem->count); | ||
| 226 | |||
| 227 | if (count & RWSEM_WRITER_MASK) | ||
| 228 | return false; | ||
| 229 | #endif | ||
| 230 | return rwsem_test_oflags(sem, RWSEM_READER_OWNED); | ||
| 231 | } | ||
| 232 | |||
| 233 | #ifdef CONFIG_DEBUG_RWSEMS | ||
| 234 | /* | ||
| 235 | * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there | ||
| 236 | * is a task pointer in the owner field of a reader-owned rwsem, it will be the | ||
| 237 | * real owner or one of the real owners. The only exception is when the | ||
| 238 | * unlock is done by up_read_non_owner(). | ||
| 239 | */ | ||
| 240 | static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) | ||
| 241 | { | ||
| 242 | unsigned long val = atomic_long_read(&sem->owner); | ||
| 243 | |||
| 244 | while ((val & ~RWSEM_OWNER_FLAGS_MASK) == (unsigned long)current) { | ||
| 245 | if (atomic_long_try_cmpxchg(&sem->owner, &val, | ||
| 246 | val & RWSEM_OWNER_FLAGS_MASK)) | ||
| 247 | return; | ||
| 248 | } | ||
| 249 | } | ||
| 250 | #else | ||
| 251 | static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) | ||
| 252 | { | ||
| 253 | } | ||
| 254 | #endif | ||
| 255 | |||
| 256 | /* | ||
| 257 | * Set the RWSEM_NONSPINNABLE bits if the RWSEM_READER_OWNED flag | ||
| 258 | * remains set. Otherwise, the operation will be aborted. | ||
| 259 | */ | ||
| 260 | static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) | ||
| 261 | { | ||
| 262 | unsigned long owner = atomic_long_read(&sem->owner); | ||
| 263 | |||
| 264 | do { | ||
| 265 | if (!(owner & RWSEM_READER_OWNED)) | ||
| 266 | break; | ||
| 267 | if (owner & RWSEM_NONSPINNABLE) | ||
| 268 | break; | ||
| 269 | } while (!atomic_long_try_cmpxchg(&sem->owner, &owner, | ||
| 270 | owner | RWSEM_NONSPINNABLE)); | ||
| 271 | } | ||
| 272 | |||
| 273 | static inline bool rwsem_read_trylock(struct rw_semaphore *sem) | ||
| 274 | { | ||
| 275 | long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); | ||
| 276 | if (WARN_ON_ONCE(cnt < 0)) | ||
| 277 | rwsem_set_nonspinnable(sem); | ||
| 278 | return !(cnt & RWSEM_READ_FAILED_MASK); | ||
| 279 | } | ||
| 280 | |||
| 281 | /* | ||
| 282 | * Return just the real task structure pointer of the owner | ||
| 283 | */ | ||
| 284 | static inline struct task_struct *rwsem_owner(struct rw_semaphore *sem) | ||
| 285 | { | ||
| 286 | return (struct task_struct *) | ||
| 287 | (atomic_long_read(&sem->owner) & ~RWSEM_OWNER_FLAGS_MASK); | ||
| 288 | } | ||
| 289 | |||
| 290 | /* | ||
| 291 | * Return the real task structure pointer of the owner and the embedded | ||
| 292 | * flags in the owner. pflags must be non-NULL. | ||
| 293 | */ | ||
| 294 | static inline struct task_struct * | ||
| 295 | rwsem_owner_flags(struct rw_semaphore *sem, unsigned long *pflags) | ||
| 296 | { | ||
| 297 | unsigned long owner = atomic_long_read(&sem->owner); | ||
| 298 | |||
| 299 | *pflags = owner & RWSEM_OWNER_FLAGS_MASK; | ||
| 300 | return (struct task_struct *)(owner & ~RWSEM_OWNER_FLAGS_MASK); | ||
| 301 | } | ||
| 302 | |||
| 303 | /* | ||
| 304 | * Guide to the rw_semaphore's count field. | ||
| 305 | * | ||
| 306 | * When the RWSEM_WRITER_LOCKED bit in count is set, the lock is owned | ||
| 307 | * by a writer. | ||
| 308 | * | ||
| 309 | * The lock is owned by readers when | ||
| 310 | * (1) the RWSEM_WRITER_LOCKED isn't set in count, | ||
| 311 | * (2) some of the reader bits are set in count, and | ||
| 312 | * (3) the owner field has the RWSEM_READER_OWNED bit set. | ||
| 313 | * | ||
| 314 | * Having some reader bits set is not enough to guarantee a reader-owned | ||
| 315 | * lock as the readers may be in the process of backing out from the count | ||
| 316 | * and a writer has just released the lock. So another writer may steal | ||
| 317 | * the lock immediately after that. | ||
| 318 | */ | ||
| 319 | |||
| 320 | /* | ||
| 321 | * Initialize an rwsem: | ||
| 322 | */ | ||
| 323 | void __init_rwsem(struct rw_semaphore *sem, const char *name, | ||
| 324 | struct lock_class_key *key) | ||
| 325 | { | ||
| 326 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 327 | /* | ||
| 328 | * Make sure we are not reinitializing a held semaphore: | ||
| 329 | */ | ||
| 330 | debug_check_no_locks_freed((void *)sem, sizeof(*sem)); | ||
| 331 | lockdep_init_map(&sem->dep_map, name, key, 0); | ||
| 332 | #endif | ||
| 333 | atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); | ||
| 334 | raw_spin_lock_init(&sem->wait_lock); | ||
| 335 | INIT_LIST_HEAD(&sem->wait_list); | ||
| 336 | atomic_long_set(&sem->owner, 0L); | ||
| 337 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | ||
| 338 | osq_lock_init(&sem->osq); | ||
| 339 | #endif | ||
| 340 | } | ||
| 341 | EXPORT_SYMBOL(__init_rwsem); | ||
| 342 | |||
| 343 | enum rwsem_waiter_type { | ||
| 344 | RWSEM_WAITING_FOR_WRITE, | ||
| 345 | RWSEM_WAITING_FOR_READ | ||
| 346 | }; | ||
| 347 | |||
| 348 | struct rwsem_waiter { | ||
| 349 | struct list_head list; | ||
| 350 | struct task_struct *task; | ||
| 351 | enum rwsem_waiter_type type; | ||
| 352 | unsigned long timeout; | ||
| 353 | unsigned long last_rowner; | ||
| 354 | }; | ||
| 355 | #define rwsem_first_waiter(sem) \ | ||
| 356 | list_first_entry(&sem->wait_list, struct rwsem_waiter, list) | ||
| 357 | |||
| 358 | enum rwsem_wake_type { | ||
| 359 | RWSEM_WAKE_ANY, /* Wake whatever's at head of wait list */ | ||
| 360 | RWSEM_WAKE_READERS, /* Wake readers only */ | ||
| 361 | RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ | ||
| 362 | }; | ||
| 363 | |||
| 364 | enum writer_wait_state { | ||
| 365 | WRITER_NOT_FIRST, /* Writer is not first in wait list */ | ||
| 366 | WRITER_FIRST, /* Writer is first in wait list */ | ||
| 367 | WRITER_HANDOFF /* Writer is first & handoff needed */ | ||
| 368 | }; | ||
| 369 | |||
| 370 | /* | ||
| 371 | * The typical HZ value is either 250 or 1000. So set the minimum waiting | ||
| 372 | * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait | ||
| 373 | * queue before initiating the handoff protocol. | ||
| 374 | */ | ||
| 375 | #define RWSEM_WAIT_TIMEOUT DIV_ROUND_UP(HZ, 250) | ||
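Working the timeout definition through for the common HZ choices (a sketch, not part of this diff; DIV_ROUND_UP is re-declared locally so the snippet is self-contained):

    #include <stdio.h>

    #define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

    int main(void)
    {
        int hz_values[] = { 100, 250, 300, 1000 };
        unsigned int i;

        for (i = 0; i < sizeof(hz_values) / sizeof(hz_values[0]); i++) {
            int hz = hz_values[i];
            int timeout = DIV_ROUND_UP(hz, 250);        /* RWSEM_WAIT_TIMEOUT */

            printf("HZ=%4d: timeout = %d jiffies (~%d ms)\n",
                   hz, timeout, timeout * 1000 / hz);
        }
        return 0;
    }

So HZ=250 and HZ=1000 both give roughly 4 ms, while HZ=100 rounds up to a single 10 ms jiffy, matching the "at least 4ms or 1 jiffy" intent of the comment above.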
| 376 | |||
| 377 | /* | ||
| 378 | * Magic number to batch-wakeup waiting readers, even when writers are | ||
| 379 | * also present in the queue. This both limits the amount of work the | ||
| 380 | * waking thread must do and also prevents any potential counter overflow, | ||
| 381 | * however unlikely. | ||
| 382 | */ | ||
| 383 | #define MAX_READERS_WAKEUP 0x100 | ||
| 384 | |||
| 385 | /* | ||
| 386 | * handle the lock release when there are processes blocked on it that can now run | ||
| 387 | * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must | ||
| 388 | * have been set. | ||
| 389 | * - there must be someone on the queue | ||
| 390 | * - the wait_lock must be held by the caller | ||
| 391 | * - tasks are marked for wakeup, the caller must later invoke wake_up_q() | ||
| 392 | * to actually wakeup the blocked task(s) and drop the reference count, | ||
| 393 | * preferably when the wait_lock is released | ||
| 394 | * - woken process blocks are discarded from the list after having task zeroed | ||
| 395 | * - writers are only marked woken if downgrading is false | ||
| 396 | */ | ||
| 397 | static void rwsem_mark_wake(struct rw_semaphore *sem, | ||
| 398 | enum rwsem_wake_type wake_type, | ||
| 399 | struct wake_q_head *wake_q) | ||
| 400 | { | ||
| 401 | struct rwsem_waiter *waiter, *tmp; | ||
| 402 | long oldcount, woken = 0, adjustment = 0; | ||
| 403 | struct list_head wlist; | ||
| 404 | |||
| 405 | lockdep_assert_held(&sem->wait_lock); | ||
| 406 | |||
| 407 | /* | ||
| 408 | * Take a peek at the queue head waiter such that we can determine | ||
| 409 | * the wakeup(s) to perform. | ||
| 410 | */ | ||
| 411 | waiter = rwsem_first_waiter(sem); | ||
| 412 | |||
| 413 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) { | ||
| 414 | if (wake_type == RWSEM_WAKE_ANY) { | ||
| 415 | /* | ||
| 416 | * Mark writer at the front of the queue for wakeup. | ||
| 417 | * Until the task is actually awoken later by | ||
| 418 | * the caller, other writers are able to steal it. | ||
| 419 | * Readers, on the other hand, will block as they | ||
| 420 | * will notice the queued writer. | ||
| 421 | */ | ||
| 422 | wake_q_add(wake_q, waiter->task); | ||
| 423 | lockevent_inc(rwsem_wake_writer); | ||
| 424 | } | ||
| 425 | |||
| 426 | return; | ||
| 427 | } | ||
| 428 | |||
| 429 | /* | ||
| 430 | * No reader wakeup if there are too many of them already. | ||
| 431 | */ | ||
| 432 | if (unlikely(atomic_long_read(&sem->count) < 0)) | ||
| 433 | return; | ||
| 434 | |||
| 435 | /* | ||
| 436 | * Writers might steal the lock before we grant it to the next reader. | ||
| 437 | * We prefer to do the first reader grant before counting readers | ||
| 438 | * so we can bail out early if a writer stole the lock. | ||
| 439 | */ | ||
| 440 | if (wake_type != RWSEM_WAKE_READ_OWNED) { | ||
| 441 | struct task_struct *owner; | ||
| 442 | |||
| 443 | adjustment = RWSEM_READER_BIAS; | ||
| 444 | oldcount = atomic_long_fetch_add(adjustment, &sem->count); | ||
| 445 | if (unlikely(oldcount & RWSEM_WRITER_MASK)) { | ||
| 446 | /* | ||
| 447 | * When we've been waiting "too" long (for writers | ||
| 448 | * to give up the lock), request a HANDOFF to | ||
| 449 | * force the issue. | ||
| 450 | */ | ||
| 451 | if (!(oldcount & RWSEM_FLAG_HANDOFF) && | ||
| 452 | time_after(jiffies, waiter->timeout)) { | ||
| 453 | adjustment -= RWSEM_FLAG_HANDOFF; | ||
| 454 | lockevent_inc(rwsem_rlock_handoff); | ||
| 455 | } | ||
| 456 | |||
| 457 | atomic_long_add(-adjustment, &sem->count); | ||
| 458 | return; | ||
| 459 | } | ||
| 460 | /* | ||
| 461 | * Set it to reader-owned to give spinners an early | ||
| 462 | * indication that readers now have the lock. | ||
| 463 | * The reader nonspinnable bit seen at slowpath entry of | ||
| 464 | * the reader is copied over. | ||
| 465 | */ | ||
| 466 | owner = waiter->task; | ||
| 467 | if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) { | ||
| 468 | owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE); | ||
| 469 | lockevent_inc(rwsem_opt_norspin); | ||
| 470 | } | ||
| 471 | __rwsem_set_reader_owned(sem, owner); | ||
| 472 | } | ||
| 473 | |||
| 474 | /* | ||
| 475 | * Grant up to MAX_READERS_WAKEUP read locks to all the readers in the | ||
| 476 | * queue. We know that woken will be at least 1 as we accounted | ||
| 477 | * for above. Note we increment the 'active part' of the count by the | ||
| 478 | * number of readers before waking any processes up. | ||
| 479 | * | ||
| 480 | * This is an adaptation of the phase-fair R/W locks where at the | ||
| 481 | * reader phase (first waiter is a reader), all readers are eligible | ||
| 482 | * to acquire the lock at the same time irrespective of their order | ||
| 483 | * in the queue. The writers acquire the lock according to their | ||
| 484 | * order in the queue. | ||
| 485 | * | ||
| 486 | * We have to do wakeup in 2 passes to prevent the possibility that | ||
| 487 | * the reader count may be decremented before it is incremented. This | ||
| 488 | * is because the to-be-woken waiter may not have slept yet. So it | ||
| 489 | * may see waiter->task cleared, finish its critical section and | ||
| 490 | * do an unlock before the reader count increment. | ||
| 491 | * | ||
| 492 | * 1) Collect the read-waiters in a separate list, count them and | ||
| 493 | * fully increment the reader count in rwsem. | ||
| 494 | * 2) For each waiter in the new list, clear waiter->task and | ||
| 495 | * put them into wake_q to be woken up later. | ||
| 496 | */ | ||
| 497 | INIT_LIST_HEAD(&wlist); | ||
| 498 | list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) { | ||
| 499 | if (waiter->type == RWSEM_WAITING_FOR_WRITE) | ||
| 500 | continue; | ||
| 501 | |||
| 502 | woken++; | ||
| 503 | list_move_tail(&waiter->list, &wlist); | ||
| 504 | |||
| 505 | /* | ||
| 506 | * Limit # of readers that can be woken up per wakeup call. | ||
| 507 | */ | ||
| 508 | if (woken >= MAX_READERS_WAKEUP) | ||
| 509 | break; | ||
| 510 | } | ||
| 511 | |||
| 512 | adjustment = woken * RWSEM_READER_BIAS - adjustment; | ||
| 513 | lockevent_cond_inc(rwsem_wake_reader, woken); | ||
| 514 | if (list_empty(&sem->wait_list)) { | ||
| 515 | /* hit end of list above */ | ||
| 516 | adjustment -= RWSEM_FLAG_WAITERS; | ||
| 517 | } | ||
| 518 | |||
| 519 | /* | ||
| 520 | * When we've woken a reader, we no longer need to force writers | ||
| 521 | * to give up the lock and we can clear HANDOFF. | ||
| 522 | */ | ||
| 523 | if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) | ||
| 524 | adjustment -= RWSEM_FLAG_HANDOFF; | ||
| 525 | |||
| 526 | if (adjustment) | ||
| 527 | atomic_long_add(adjustment, &sem->count); | ||
| 528 | |||
| 529 | /* 2nd pass */ | ||
| 530 | list_for_each_entry_safe(waiter, tmp, &wlist, list) { | ||
| 531 | struct task_struct *tsk; | ||
| 532 | |||
| 533 | tsk = waiter->task; | ||
| 534 | get_task_struct(tsk); | ||
| 535 | |||
| 536 | /* | ||
| 537 | * Ensure calling get_task_struct() before setting the reader | ||
| 538 | * waiter to nil such that rwsem_down_read_slowpath() cannot | ||
| 539 | * race with do_exit() by always holding a reference count | ||
| 540 | * to the task to wakeup. | ||
| 541 | */ | ||
| 542 | smp_store_release(&waiter->task, NULL); | ||
| 543 | /* | ||
| 544 | * Ensure issuing the wakeup (either by us or someone else) | ||
| 545 | * after setting the reader waiter to nil. | ||
| 546 | */ | ||
| 547 | wake_q_add_safe(wake_q, tsk); | ||
| 548 | } | ||
| 549 | } | ||
| 550 | |||
| 551 | /* | ||
| 552 | * This function must be called with the sem->wait_lock held to prevent | ||
| 553 | * race conditions between checking the rwsem wait list and setting the | ||
| 554 | * sem->count accordingly. | ||
| 555 | * | ||
| 556 | * If wstate is WRITER_HANDOFF, it will make sure that either the handoff | ||
| 557 | * bit is set or the lock is acquired with handoff bit cleared. | ||
| 558 | */ | ||
| 559 | static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, | ||
| 560 | enum writer_wait_state wstate) | ||
| 561 | { | ||
| 562 | long count, new; | ||
| 563 | |||
| 564 | lockdep_assert_held(&sem->wait_lock); | ||
| 565 | |||
| 566 | count = atomic_long_read(&sem->count); | ||
| 567 | do { | ||
| 568 | bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); | ||
| 569 | |||
| 570 | if (has_handoff && wstate == WRITER_NOT_FIRST) | ||
| 571 | return false; | ||
| 572 | |||
| 573 | new = count; | ||
| 574 | |||
| 575 | if (count & RWSEM_LOCK_MASK) { | ||
| 576 | if (has_handoff || (wstate != WRITER_HANDOFF)) | ||
| 577 | return false; | ||
| 578 | |||
| 579 | new |= RWSEM_FLAG_HANDOFF; | ||
| 580 | } else { | ||
| 581 | new |= RWSEM_WRITER_LOCKED; | ||
| 582 | new &= ~RWSEM_FLAG_HANDOFF; | ||
| 583 | |||
| 584 | if (list_is_singular(&sem->wait_list)) | ||
| 585 | new &= ~RWSEM_FLAG_WAITERS; | ||
| 586 | } | ||
| 587 | } while (!atomic_long_try_cmpxchg_acquire(&sem->count, &count, new)); | ||
| 588 | |||
| 589 | /* | ||
| 590 | * We have either acquired the lock with handoff bit cleared or | ||
| 591 | * set the handoff bit. | ||
| 592 | */ | ||
| 593 | if (new & RWSEM_FLAG_HANDOFF) | ||
| 594 | return false; | ||
| 595 | |||
| 596 | rwsem_set_owner(sem); | ||
| 597 | return true; | ||
| 598 | } | ||
| 599 | |||
| 600 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | ||
| 601 | /* | ||
| 602 | * Try to acquire read lock before the reader is put on wait queue. | ||
| 603 | * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff | ||
| 604 | * is ongoing. | ||
| 605 | */ | ||
| 606 | static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem) | ||
| 607 | { | ||
| 608 | long count = atomic_long_read(&sem->count); | ||
| 609 | |||
| 610 | if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF)) | ||
| 611 | return false; | ||
| 612 | |||
| 613 | count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count); | ||
| 614 | if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { | ||
| 615 | rwsem_set_reader_owned(sem); | ||
| 616 | lockevent_inc(rwsem_opt_rlock); | ||
| 617 | return true; | ||
| 618 | } | ||
| 619 | |||
| 620 | /* Back out the change */ | ||
| 621 | atomic_long_add(-RWSEM_READER_BIAS, &sem->count); | ||
| 622 | return false; | ||
| 623 | } | ||
| 624 | |||
| 625 | /* | ||
| 626 | * Try to acquire write lock before the writer has been put on wait queue. | ||
| 627 | */ | ||
| 628 | static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) | ||
| 629 | { | ||
| 630 | long count = atomic_long_read(&sem->count); | ||
| 631 | |||
| 632 | while (!(count & (RWSEM_LOCK_MASK|RWSEM_FLAG_HANDOFF))) { | ||
| 633 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, | ||
| 634 | count | RWSEM_WRITER_LOCKED)) { | ||
| 635 | rwsem_set_owner(sem); | ||
| 636 | lockevent_inc(rwsem_opt_wlock); | ||
| 637 | return true; | ||
| 638 | } | ||
| 639 | } | ||
| 640 | return false; | ||
| 641 | } | ||
| 642 | |||
| 643 | static inline bool owner_on_cpu(struct task_struct *owner) | ||
| 644 | { | ||
| 645 | /* | ||
| 646 | * To cope with lock holder preemption, we skip spinning if the | ||
| 647 | * task is not on a CPU or its CPU is preempted. | ||
| 648 | */ | ||
| 649 | return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); | ||
| 650 | } | ||
| 651 | |||
| 652 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, | ||
| 653 | unsigned long nonspinnable) | ||
| 654 | { | ||
| 655 | struct task_struct *owner; | ||
| 656 | unsigned long flags; | ||
| 657 | bool ret = true; | ||
| 658 | |||
| 659 | BUILD_BUG_ON(!(RWSEM_OWNER_UNKNOWN & RWSEM_NONSPINNABLE)); | ||
| 660 | |||
| 661 | if (need_resched()) { | ||
| 662 | lockevent_inc(rwsem_opt_fail); | ||
| 663 | return false; | ||
| 664 | } | ||
| 665 | |||
| 666 | preempt_disable(); | ||
| 667 | rcu_read_lock(); | ||
| 668 | owner = rwsem_owner_flags(sem, &flags); | ||
| 669 | if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner))) | ||
| 670 | ret = false; | ||
| 671 | rcu_read_unlock(); | ||
| 672 | preempt_enable(); | ||
| 673 | |||
| 674 | lockevent_cond_inc(rwsem_opt_fail, !ret); | ||
| 675 | return ret; | ||
| 676 | } | ||
| 677 | |||
| 678 | /* | ||
| 679 | * The rwsem_spin_on_owner() function returns the following 4 values | ||
| 680 | * depending on the lock owner state. | ||
| 681 | * OWNER_NULL : owner is currently NULL | ||
| 682 | * OWNER_WRITER: when owner changes and is a writer | ||
| 683 | * OWNER_READER: when owner changes and the new owner may be a reader. | ||
| 684 | * OWNER_NONSPINNABLE: | ||
| 685 | * when optimistic spinning has to stop because either the | ||
| 686 | * owner stops running, is unknown, or its timeslice has | ||
| 687 | * been used up. | ||
| 688 | */ | ||
| 689 | enum owner_state { | ||
| 690 | OWNER_NULL = 1 << 0, | ||
| 691 | OWNER_WRITER = 1 << 1, | ||
| 692 | OWNER_READER = 1 << 2, | ||
| 693 | OWNER_NONSPINNABLE = 1 << 3, | ||
| 694 | }; | ||
| 695 | #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) | ||
| 696 | |||
| 697 | static inline enum owner_state | ||
| 698 | rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable) | ||
| 699 | { | ||
| 700 | if (flags & nonspinnable) | ||
| 701 | return OWNER_NONSPINNABLE; | ||
| 702 | |||
| 703 | if (flags & RWSEM_READER_OWNED) | ||
| 704 | return OWNER_READER; | ||
| 705 | |||
| 706 | return owner ? OWNER_WRITER : OWNER_NULL; | ||
| 707 | } | ||
| 708 | |||
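The owner states above are one-hot bit values, so a whole group of states can be tested with a single AND against OWNER_SPINNABLE instead of a chain of comparisons. A tiny standalone illustration (not kernel code):

    #include <stdbool.h>
    #include <stdio.h>

    enum owner_state {
            OWNER_NULL         = 1 << 0,
            OWNER_WRITER       = 1 << 1,
            OWNER_READER       = 1 << 2,
            OWNER_NONSPINNABLE = 1 << 3,
    };
    #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)

    static bool spinnable(enum owner_state s)
    {
            /* one bitwise AND covers three of the four states */
            return s & OWNER_SPINNABLE;
    }

    int main(void)
    {
            printf("%d %d\n", spinnable(OWNER_READER),        /* prints 1 */
                              spinnable(OWNER_NONSPINNABLE)); /* prints 0 */
            return 0;
    }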
| 709 | static noinline enum owner_state | ||
| 710 | rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) | ||
| 711 | { | ||
| 712 | struct task_struct *new, *owner; | ||
| 713 | unsigned long flags, new_flags; | ||
| 714 | enum owner_state state; | ||
| 715 | |||
| 716 | owner = rwsem_owner_flags(sem, &flags); | ||
| 717 | state = rwsem_owner_state(owner, flags, nonspinnable); | ||
| 718 | if (state != OWNER_WRITER) | ||
| 719 | return state; | ||
| 720 | |||
| 721 | rcu_read_lock(); | ||
| 722 | for (;;) { | ||
| 723 | if (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF) { | ||
| 724 | state = OWNER_NONSPINNABLE; | ||
| 725 | break; | ||
| 726 | } | ||
| 727 | |||
| 728 | new = rwsem_owner_flags(sem, &new_flags); | ||
| 729 | if ((new != owner) || (new_flags != flags)) { | ||
| 730 | state = rwsem_owner_state(new, new_flags, nonspinnable); | ||
| 731 | break; | ||
| 732 | } | ||
| 733 | |||
| 734 | /* | ||
| 735 | 	 * Ensure we emit the owner->on_cpu dereference _after_ | ||
| 736 | 	 * checking sem->owner still matches owner. If that fails, | ||
| 737 | 	 * owner might point to free()d memory; if it still matches, | ||
| 738 | 	 * the rcu_read_lock() ensures the memory stays valid. | ||
| 739 | */ | ||
| 740 | barrier(); | ||
| 741 | |||
| 742 | if (need_resched() || !owner_on_cpu(owner)) { | ||
| 743 | state = OWNER_NONSPINNABLE; | ||
| 744 | break; | ||
| 745 | } | ||
| 746 | |||
| 747 | cpu_relax(); | ||
| 748 | } | ||
| 749 | rcu_read_unlock(); | ||
| 750 | |||
| 751 | return state; | ||
| 752 | } | ||
| 753 | |||
| 754 | /* | ||
| 755 | * Calculate reader-owned rwsem spinning threshold for writer | ||
| 756 | * | ||
| 757 | * The more readers own the rwsem, the longer it will take for them to | ||
| 758 | * wind down and free the rwsem. So the empirical formula used to | ||
| 759 | * determine the actual spinning time limit here is: | ||
| 760 | * | ||
| 761 | * Spinning threshold = (10 + nr_readers/2)us | ||
| 762 | * | ||
| 763 | * The limit is capped to a maximum of 25us (30 readers). This is just | ||
| 764 | * a heuristic and is subject to change in the future. | ||
| 765 | */ | ||
| 766 | static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) | ||
| 767 | { | ||
| 768 | long count = atomic_long_read(&sem->count); | ||
| 769 | int readers = count >> RWSEM_READER_SHIFT; | ||
| 770 | u64 delta; | ||
| 771 | |||
| 772 | if (readers > 30) | ||
| 773 | readers = 30; | ||
| 774 | delta = (20 + readers) * NSEC_PER_USEC / 2; | ||
| 775 | |||
| 776 | return sched_clock() + delta; | ||
| 777 | } | ||
| 778 | |||
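The delta computed above is the "(10 + nr_readers/2) us" formula from the comment, written in nanoseconds as (20 + readers) * NSEC_PER_USEC / 2 with readers clamped to 30, which is why the cap works out to exactly 25us. A quick standalone check of the arithmetic (illustrative only, not kernel code):

    #include <stdio.h>

    #define NSEC_PER_USEC 1000ULL

    static unsigned long long rspin_delta_ns(int readers)
    {
            if (readers > 30)
                    readers = 30;                  /* cap at 30 readers */
            return (20 + readers) * NSEC_PER_USEC / 2;
    }

    int main(void)
    {
            printf("%llu\n", rspin_delta_ns(2));   /* 11000 ns = (10 + 2/2) us */
            printf("%llu\n", rspin_delta_ns(30));  /* 25000 ns = the 25 us cap */
            printf("%llu\n", rspin_delta_ns(100)); /* still 25000 ns */
            return 0;
    }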
| 779 | static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) | ||
| 780 | { | ||
| 781 | bool taken = false; | ||
| 782 | int prev_owner_state = OWNER_NULL; | ||
| 783 | int loop = 0; | ||
| 784 | u64 rspin_threshold = 0; | ||
| 785 | unsigned long nonspinnable = wlock ? RWSEM_WR_NONSPINNABLE | ||
| 786 | : RWSEM_RD_NONSPINNABLE; | ||
| 787 | |||
| 788 | preempt_disable(); | ||
| 789 | |||
| 790 | /* sem->wait_lock should not be held when doing optimistic spinning */ | ||
| 791 | if (!osq_lock(&sem->osq)) | ||
| 792 | goto done; | ||
| 793 | |||
| 794 | /* | ||
| 795 | * Optimistically spin on the owner field and attempt to acquire the | ||
| 796 | * lock whenever the owner changes. Spinning will be stopped when: | ||
| 797 | * 1) the owning writer isn't running; or | ||
| 798 | * 2) readers own the lock and spinning time has exceeded the limit. | ||
| 799 | */ | ||
| 800 | for (;;) { | ||
| 801 | enum owner_state owner_state; | ||
| 802 | |||
| 803 | owner_state = rwsem_spin_on_owner(sem, nonspinnable); | ||
| 804 | if (!(owner_state & OWNER_SPINNABLE)) | ||
| 805 | break; | ||
| 806 | |||
| 807 | /* | ||
| 808 | * Try to acquire the lock | ||
| 809 | */ | ||
| 810 | taken = wlock ? rwsem_try_write_lock_unqueued(sem) | ||
| 811 | : rwsem_try_read_lock_unqueued(sem); | ||
| 812 | |||
| 813 | if (taken) | ||
| 814 | break; | ||
| 815 | |||
| 816 | /* | ||
| 817 | * Time-based reader-owned rwsem optimistic spinning | ||
| 818 | */ | ||
| 819 | if (wlock && (owner_state == OWNER_READER)) { | ||
| 820 | /* | ||
| 821 | * Re-initialize rspin_threshold every time | ||
| 822 | * the owner state changes from non-reader to reader. | ||
| 823 | * This allows a writer to steal the lock in between | ||
| 824 | * 2 reader phases and have the threshold reset at | ||
| 825 | * the beginning of the 2nd reader phase. | ||
| 826 | */ | ||
| 827 | if (prev_owner_state != OWNER_READER) { | ||
| 828 | if (rwsem_test_oflags(sem, nonspinnable)) | ||
| 829 | break; | ||
| 830 | rspin_threshold = rwsem_rspin_threshold(sem); | ||
| 831 | loop = 0; | ||
| 832 | } | ||
| 833 | |||
| 834 | /* | ||
| 835 | * Check the time threshold once every 16 iterations to | ||
| 836 | * avoid calling sched_clock() too frequently so | ||
| 837 | * as to reduce the average latency between the times | ||
| 838 | * when the lock becomes free and when the spinner | ||
| 839 | * is ready to do a trylock. | ||
| 840 | */ | ||
| 841 | else if (!(++loop & 0xf) && (sched_clock() > rspin_threshold)) { | ||
| 842 | rwsem_set_nonspinnable(sem); | ||
| 843 | lockevent_inc(rwsem_opt_nospin); | ||
| 844 | break; | ||
| 845 | } | ||
| 846 | } | ||
| 847 | |||
| 848 | /* | ||
| 849 | * An RT task cannot do optimistic spinning if it cannot | ||
| 850 | * be sure the lock holder is running or live-lock may | ||
| 851 | * happen if the current task and the lock holder happen | ||
| 852 | * to run on the same CPU. However, aborting optimistic | ||
| 853 | * spinning while a NULL owner is detected may miss some | ||
| 854 | * opportunities where spinning could continue without causing | ||
| 855 | * a problem. | ||
| 856 | * | ||
| 857 | * There are 2 possible cases where an RT task may be able | ||
| 858 | * to continue spinning. | ||
| 859 | * | ||
| 860 | * 1) The lock owner is in the process of releasing the | ||
| 861 | * lock, sem->owner is cleared but the lock has not | ||
| 862 | * been released yet. | ||
| 863 | * 2) The lock was free and owner cleared, but another | ||
| 864 | * task just comes in and acquires the lock before | ||
| 865 | * we try to get it. The new owner may be a spinnable | ||
| 866 | * writer. | ||
| 867 | * | ||
| 868 | * To take advantage of the two scenarios listed above, the RT | ||
| 869 | * task is made to retry one more time to see if it can | ||
| 870 | * acquire the lock or continue spinning on the new owning | ||
| 871 | * writer. Of course, if the time lag is long enough or the | ||
| 872 | * new owner is not a writer or spinnable, the RT task will | ||
| 873 | * quit spinning. | ||
| 874 | * | ||
| 875 | * If the owner is a writer, the need_resched() check is | ||
| 876 | * done inside rwsem_spin_on_owner(). If the owner is not | ||
| 877 | * a writer, need_resched() check needs to be done here. | ||
| 878 | */ | ||
| 879 | if (owner_state != OWNER_WRITER) { | ||
| 880 | if (need_resched()) | ||
| 881 | break; | ||
| 882 | if (rt_task(current) && | ||
| 883 | (prev_owner_state != OWNER_WRITER)) | ||
| 884 | break; | ||
| 885 | } | ||
| 886 | prev_owner_state = owner_state; | ||
| 887 | |||
| 888 | /* | ||
| 889 | * The cpu_relax() call is a compiler barrier which forces | ||
| 890 | * everything in this loop to be re-loaded. We don't need | ||
| 891 | * memory barriers as we'll eventually observe the right | ||
| 892 | * values at the cost of a few extra spins. | ||
| 893 | */ | ||
| 894 | cpu_relax(); | ||
| 895 | } | ||
| 896 | osq_unlock(&sem->osq); | ||
| 897 | done: | ||
| 898 | preempt_enable(); | ||
| 899 | lockevent_cond_inc(rwsem_opt_fail, !taken); | ||
| 900 | return taken; | ||
| 901 | } | ||
| 902 | |||
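Pulling the comments in rwsem_optimistic_spin() together, the loop gives up in a handful of well-defined situations. The sketch below restates just that decision logic as a standalone predicate (a simplification for illustration; the real loop also resets the reader-spin threshold and performs the trylock itself):

    #include <stdbool.h>

    enum owner_state {                     /* same one-hot values as in the patch */
            OWNER_NULL         = 1 << 0,
            OWNER_WRITER       = 1 << 1,
            OWNER_READER       = 1 << 2,
            OWNER_NONSPINNABLE = 1 << 3,
    };
    #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER)

    static bool should_stop_spinning(enum owner_state owner_state,
                                     enum owner_state prev_owner_state,
                                     bool wlock, bool over_time_budget,
                                     bool resched_needed, bool is_rt_task)
    {
            if (!(owner_state & OWNER_SPINNABLE))
                    return true;            /* owner not worth spinning on */

            if (wlock && owner_state == OWNER_READER && over_time_budget)
                    return true;            /* reader-owned spin budget used up */

            if (owner_state != OWNER_WRITER) {
                    if (resched_needed)
                            return true;    /* give the CPU back */
                    if (is_rt_task && prev_owner_state != OWNER_WRITER)
                            return true;    /* RT task gets only one extra retry */
            }
            return false;                   /* otherwise keep spinning */
    }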
| 903 | /* | ||
| 904 | * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should | ||
| 905 | * only be called when the reader count reaches 0. | ||
| 906 | * | ||
| 907 | * This gives writers a better chance to acquire the rwsem before readers | ||
| 908 | * when the rwsem has been held by readers for a relatively long period | ||
| 909 | * of time. A race can happen where an optimistic spinner has just stolen | ||
| 910 | * the rwsem and set the owner, but clearing the RWSEM_WR_NONSPINNABLE | ||
| 911 | * bit then does no harm anyway. | ||
| 912 | */ | ||
| 913 | static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) | ||
| 914 | { | ||
| 915 | if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE)) | ||
| 916 | atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner); | ||
| 917 | } | ||
| 918 | |||
| 919 | /* | ||
| 920 | * This function is called when the reader fails to acquire the lock via | ||
| 921 | * optimistic spinning. In this case we will still attempt a trylock if | ||
| 922 | * comparing the current rwsem state with the state at slowpath entry | ||
| 923 | * indicates that the reader is still in a valid reader phase. | ||
| 924 | * This happens when the following conditions are true: | ||
| 925 | * | ||
| 926 | * 1) The lock is currently reader owned, and | ||
| 927 | * 2) The lock was previously not reader-owned or the last read owner has changed. | ||
| 928 | * | ||
| 929 | * In the former case, we have transitioned from a writer phase to a | ||
| 930 | * reader-phase while spinning. In the latter case, it means the reader | ||
| 931 | * phase hasn't ended when we entered the optimistic spinning loop. In | ||
| 932 | * both cases, the reader is eligible to acquire the lock. This is the | ||
| 933 | * secondary path where a read lock is acquired optimistically. | ||
| 934 | * | ||
| 935 | * The reader non-spinnable bit wasn't set at the time of entry, or we | ||
| 936 | * would not be here at all. | ||
| 937 | */ | ||
| 938 | static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, | ||
| 939 | unsigned long last_rowner) | ||
| 940 | { | ||
| 941 | unsigned long owner = atomic_long_read(&sem->owner); | ||
| 942 | |||
| 943 | if (!(owner & RWSEM_READER_OWNED)) | ||
| 944 | return false; | ||
| 945 | |||
| 946 | if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) && | ||
| 947 | rwsem_try_read_lock_unqueued(sem)) { | ||
| 948 | lockevent_inc(rwsem_opt_rlock2); | ||
| 949 | lockevent_add(rwsem_opt_fail, -1); | ||
| 950 | return true; | ||
| 951 | } | ||
| 952 | return false; | ||
| 953 | } | ||
| 954 | #else | ||
| 955 | static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, | ||
| 956 | unsigned long nonspinnable) | ||
| 957 | { | ||
| 958 | return false; | ||
| 959 | } | ||
| 960 | |||
| 961 | static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) | ||
| 962 | { | ||
| 963 | return false; | ||
| 964 | } | ||
| 965 | |||
| 966 | static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { } | ||
| 967 | |||
| 968 | static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, | ||
| 969 | unsigned long last_rowner) | ||
| 970 | { | ||
| 971 | return false; | ||
| 972 | } | ||
| 973 | #endif | ||
| 974 | |||
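In rwsem_reader_phase_trylock() above, the "(owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK" test checks that the task portion of the owner word has changed while ignoring the flag bits packed into its low bits. A minimal standalone illustration of that idiom, with made-up values:

    #include <stdbool.h>
    #include <stdio.h>

    #define OWNER_FLAGS_MASK 0x7UL    /* pretend the low 3 bits hold flags */

    /* true if the task pointer part differs, ignoring the flag bits */
    static bool owner_task_changed(unsigned long owner, unsigned long last)
    {
            return (owner ^ last) & ~OWNER_FLAGS_MASK;
    }

    int main(void)
    {
            unsigned long task_a = 0x1000UL, task_b = 0x2000UL;

            printf("%d\n", owner_task_changed(task_a | 0x1, task_a | 0x3)); /* 0 */
            printf("%d\n", owner_task_changed(task_a | 0x1, task_b | 0x1)); /* 1 */
            return 0;
    }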
| 975 | /* | ||
| 976 | * Wait for the read lock to be granted | ||
| 977 | */ | ||
| 978 | static struct rw_semaphore __sched * | ||
| 979 | rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) | ||
| 980 | { | ||
| 981 | long count, adjustment = -RWSEM_READER_BIAS; | ||
| 982 | struct rwsem_waiter waiter; | ||
| 983 | DEFINE_WAKE_Q(wake_q); | ||
| 984 | bool wake = false; | ||
| 985 | |||
| 986 | /* | ||
| 987 | * Save the current read-owner of rwsem, if available, and the | ||
| 988 | * reader nonspinnable bit. | ||
| 989 | */ | ||
| 990 | waiter.last_rowner = atomic_long_read(&sem->owner); | ||
| 991 | if (!(waiter.last_rowner & RWSEM_READER_OWNED)) | ||
| 992 | waiter.last_rowner &= RWSEM_RD_NONSPINNABLE; | ||
| 993 | |||
| 994 | if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE)) | ||
| 995 | goto queue; | ||
| 996 | |||
| 997 | /* | ||
| 998 | * Undo read bias from down_read() and do optimistic spinning. | ||
| 999 | */ | ||
| 1000 | atomic_long_add(-RWSEM_READER_BIAS, &sem->count); | ||
| 1001 | adjustment = 0; | ||
| 1002 | if (rwsem_optimistic_spin(sem, false)) { | ||
| 1003 | /* | ||
| 1004 | * Wake up other readers in the wait list if the front | ||
| 1005 | * waiter is a reader. | ||
| 1006 | */ | ||
| 1007 | if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) { | ||
| 1008 | raw_spin_lock_irq(&sem->wait_lock); | ||
| 1009 | if (!list_empty(&sem->wait_list)) | ||
| 1010 | rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, | ||
| 1011 | &wake_q); | ||
| 1012 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 1013 | wake_up_q(&wake_q); | ||
| 1014 | } | ||
| 1015 | return sem; | ||
| 1016 | } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { | ||
| 1017 | return sem; | ||
| 1018 | } | ||
| 1019 | |||
| 1020 | queue: | ||
| 1021 | waiter.task = current; | ||
| 1022 | waiter.type = RWSEM_WAITING_FOR_READ; | ||
| 1023 | waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; | ||
| 1024 | |||
| 1025 | raw_spin_lock_irq(&sem->wait_lock); | ||
| 1026 | if (list_empty(&sem->wait_list)) { | ||
| 1027 | /* | ||
| 1028 | * In case the wait queue is empty and the lock isn't owned | ||
| 1029 | * by a writer and doesn't have the handoff bit set, this reader can | ||
| 1030 | * exit the slowpath and return immediately as its | ||
| 1031 | * RWSEM_READER_BIAS has already been set in the count. | ||
| 1032 | */ | ||
| 1033 | if (adjustment && !(atomic_long_read(&sem->count) & | ||
| 1034 | (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { | ||
| 1035 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 1036 | rwsem_set_reader_owned(sem); | ||
| 1037 | lockevent_inc(rwsem_rlock_fast); | ||
| 1038 | return sem; | ||
| 1039 | } | ||
| 1040 | adjustment += RWSEM_FLAG_WAITERS; | ||
| 1041 | } | ||
| 1042 | list_add_tail(&waiter.list, &sem->wait_list); | ||
| 1043 | |||
| 1044 | /* we're now waiting on the lock, but no longer actively locking */ | ||
| 1045 | if (adjustment) | ||
| 1046 | count = atomic_long_add_return(adjustment, &sem->count); | ||
| 1047 | else | ||
| 1048 | count = atomic_long_read(&sem->count); | ||
| 1049 | |||
| 1050 | /* | ||
| 1051 | * If there are no active locks, wake the front queued process(es). | ||
| 1052 | * | ||
| 1053 | * If there are no writers and we are first in the queue, | ||
| 1054 | * wake our own waiter to join the existing active readers! | ||
| 1055 | */ | ||
| 1056 | if (!(count & RWSEM_LOCK_MASK)) { | ||
| 1057 | clear_wr_nonspinnable(sem); | ||
| 1058 | wake = true; | ||
| 1059 | } | ||
| 1060 | if (wake || (!(count & RWSEM_WRITER_MASK) && | ||
| 1061 | (adjustment & RWSEM_FLAG_WAITERS))) | ||
| 1062 | rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | ||
| 1063 | |||
| 1064 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 1065 | wake_up_q(&wake_q); | ||
| 1066 | |||
| 1067 | /* wait to be given the lock */ | ||
| 1068 | while (true) { | ||
| 1069 | set_current_state(state); | ||
| 1070 | if (!waiter.task) | ||
| 1071 | break; | ||
| 1072 | if (signal_pending_state(state, current)) { | ||
| 1073 | raw_spin_lock_irq(&sem->wait_lock); | ||
| 1074 | if (waiter.task) | ||
| 1075 | goto out_nolock; | ||
| 1076 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 1077 | break; | ||
| 1078 | } | ||
| 1079 | schedule(); | ||
| 1080 | lockevent_inc(rwsem_sleep_reader); | ||
| 1081 | } | ||
| 1082 | |||
| 1083 | __set_current_state(TASK_RUNNING); | ||
| 1084 | lockevent_inc(rwsem_rlock); | ||
| 1085 | return sem; | ||
| 1086 | out_nolock: | ||
| 1087 | list_del(&waiter.list); | ||
| 1088 | if (list_empty(&sem->wait_list)) { | ||
| 1089 | atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, | ||
| 1090 | &sem->count); | ||
| 1091 | } | ||
| 1092 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 1093 | __set_current_state(TASK_RUNNING); | ||
| 1094 | lockevent_inc(rwsem_rlock_fail); | ||
| 1095 | return ERR_PTR(-EINTR); | ||
| 1096 | } | ||
| 1097 | |||
| 1098 | /* | ||
| 1099 | * This function is called by a write lock owner, so the owner value | ||
| 1100 | * won't be changed by others. | ||
| 1101 | */ | ||
| 1102 | static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem, | ||
| 1103 | bool disable) | ||
| 1104 | { | ||
| 1105 | if (unlikely(disable)) { | ||
| 1106 | atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner); | ||
| 1107 | lockevent_inc(rwsem_opt_norspin); | ||
| 1108 | } | ||
| 1109 | } | ||
| 1110 | |||
| 1111 | /* | ||
| 1112 | * Wait until we successfully acquire the write lock | ||
| 1113 | */ | ||
| 1114 | static struct rw_semaphore * | ||
| 1115 | rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) | ||
| 1116 | { | ||
| 1117 | long count; | ||
| 1118 | bool disable_rspin; | ||
| 1119 | enum writer_wait_state wstate; | ||
| 1120 | struct rwsem_waiter waiter; | ||
| 1121 | struct rw_semaphore *ret = sem; | ||
| 1122 | DEFINE_WAKE_Q(wake_q); | ||
| 1123 | |||
| 1124 | /* do optimistic spinning and steal lock if possible */ | ||
| 1125 | if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && | ||
| 1126 | rwsem_optimistic_spin(sem, true)) | ||
| 1127 | return sem; | ||
| 1128 | |||
| 1129 | /* | ||
| 1130 | * Disable reader optimistic spinning for this rwsem after | ||
| 1131 | * acquiring the write lock when the setting of the nonspinnable | ||
| 1132 | * bits is observed. | ||
| 1133 | */ | ||
| 1134 | disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE; | ||
| 1135 | |||
| 1136 | /* | ||
| 1137 | * Optimistic spinning failed, proceed to the slowpath | ||
| 1138 | * and block until we can acquire the sem. | ||
| 1139 | */ | ||
| 1140 | waiter.task = current; | ||
| 1141 | waiter.type = RWSEM_WAITING_FOR_WRITE; | ||
| 1142 | waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; | ||
| 1143 | |||
| 1144 | raw_spin_lock_irq(&sem->wait_lock); | ||
| 1145 | |||
| 1146 | /* account for this before adding a new element to the list */ | ||
| 1147 | wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; | ||
| 1148 | |||
| 1149 | list_add_tail(&waiter.list, &sem->wait_list); | ||
| 1150 | |||
| 1151 | /* we're now waiting on the lock */ | ||
| 1152 | if (wstate == WRITER_NOT_FIRST) { | ||
| 1153 | count = atomic_long_read(&sem->count); | ||
| 1154 | |||
| 1155 | /* | ||
| 1156 | * If there were already threads queued before us and: | ||
| 1157 | * 1) there are no active locks, wake the front | ||
| 1158 | * queued process(es) as the handoff bit might be set. | ||
| 1159 | * 2) there are no active writers and some readers, the lock | ||
| 1160 | * must be read owned; so we try to wake any read lock | ||
| 1161 | * waiters that were queued ahead of us. | ||
| 1162 | */ | ||
| 1163 | if (count & RWSEM_WRITER_MASK) | ||
| 1164 | goto wait; | ||
| 1165 | |||
| 1166 | rwsem_mark_wake(sem, (count & RWSEM_READER_MASK) | ||
| 1167 | ? RWSEM_WAKE_READERS | ||
| 1168 | : RWSEM_WAKE_ANY, &wake_q); | ||
| 1169 | |||
| 1170 | if (!wake_q_empty(&wake_q)) { | ||
| 1171 | /* | ||
| 1172 | * We want to minimize wait_lock hold time especially | ||
| 1173 | * when a large number of readers are to be woken up. | ||
| 1174 | */ | ||
| 1175 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 1176 | wake_up_q(&wake_q); | ||
| 1177 | wake_q_init(&wake_q); /* Used again, reinit */ | ||
| 1178 | raw_spin_lock_irq(&sem->wait_lock); | ||
| 1179 | } | ||
| 1180 | } else { | ||
| 1181 | atomic_long_or(RWSEM_FLAG_WAITERS, &sem->count); | ||
| 1182 | } | ||
| 1183 | |||
| 1184 | wait: | ||
| 1185 | /* wait until we successfully acquire the lock */ | ||
| 1186 | set_current_state(state); | ||
| 1187 | while (true) { | ||
| 1188 | if (rwsem_try_write_lock(sem, wstate)) | ||
| 1189 | break; | ||
| 1190 | |||
| 1191 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 1192 | |||
| 1193 | /* Block until there are no active lockers. */ | ||
| 1194 | for (;;) { | ||
| 1195 | if (signal_pending_state(state, current)) | ||
| 1196 | goto out_nolock; | ||
| 1197 | |||
| 1198 | schedule(); | ||
| 1199 | lockevent_inc(rwsem_sleep_writer); | ||
| 1200 | set_current_state(state); | ||
| 1201 | /* | ||
| 1202 | * If HANDOFF bit is set, unconditionally do | ||
| 1203 | * a trylock. | ||
| 1204 | */ | ||
| 1205 | if (wstate == WRITER_HANDOFF) | ||
| 1206 | break; | ||
| 1207 | |||
| 1208 | if ((wstate == WRITER_NOT_FIRST) && | ||
| 1209 | (rwsem_first_waiter(sem) == &waiter)) | ||
| 1210 | wstate = WRITER_FIRST; | ||
| 1211 | |||
| 1212 | count = atomic_long_read(&sem->count); | ||
| 1213 | if (!(count & RWSEM_LOCK_MASK)) | ||
| 1214 | break; | ||
| 1215 | |||
| 1216 | /* | ||
| 1217 | * The setting of the handoff bit is deferred | ||
| 1218 | * until rwsem_try_write_lock() is called. | ||
| 1219 | */ | ||
| 1220 | if ((wstate == WRITER_FIRST) && (rt_task(current) || | ||
| 1221 | time_after(jiffies, waiter.timeout))) { | ||
| 1222 | wstate = WRITER_HANDOFF; | ||
| 1223 | lockevent_inc(rwsem_wlock_handoff); | ||
| 1224 | break; | ||
| 1225 | } | ||
| 1226 | } | ||
| 1227 | |||
| 1228 | raw_spin_lock_irq(&sem->wait_lock); | ||
| 1229 | } | ||
| 1230 | __set_current_state(TASK_RUNNING); | ||
| 1231 | list_del(&waiter.list); | ||
| 1232 | rwsem_disable_reader_optspin(sem, disable_rspin); | ||
| 1233 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 1234 | lockevent_inc(rwsem_wlock); | ||
| 1235 | |||
| 1236 | return ret; | ||
| 1237 | |||
| 1238 | out_nolock: | ||
| 1239 | __set_current_state(TASK_RUNNING); | ||
| 1240 | raw_spin_lock_irq(&sem->wait_lock); | ||
| 1241 | list_del(&waiter.list); | ||
| 1242 | |||
| 1243 | if (unlikely(wstate == WRITER_HANDOFF)) | ||
| 1244 | atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); | ||
| 1245 | |||
| 1246 | if (list_empty(&sem->wait_list)) | ||
| 1247 | atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); | ||
| 1248 | else | ||
| 1249 | rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | ||
| 1250 | raw_spin_unlock_irq(&sem->wait_lock); | ||
| 1251 | wake_up_q(&wake_q); | ||
| 1252 | lockevent_inc(rwsem_wlock_fail); | ||
| 1253 | |||
| 1254 | return ERR_PTR(-EINTR); | ||
| 1255 | } | ||
| 1256 | |||
| 1257 | /* | ||
| 1258 | * handle waking up a waiter on the semaphore | ||
| 1259 | * - up_read/up_write has decremented the active part of count if we come here | ||
| 1260 | */ | ||
| 1261 | static struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem, long count) | ||
| 1262 | { | ||
| 1263 | unsigned long flags; | ||
| 1264 | DEFINE_WAKE_Q(wake_q); | ||
| 1265 | |||
| 1266 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
| 1267 | |||
| 1268 | if (!list_empty(&sem->wait_list)) | ||
| 1269 | rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); | ||
| 1270 | |||
| 1271 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
| 1272 | wake_up_q(&wake_q); | ||
| 1273 | |||
| 1274 | return sem; | ||
| 1275 | } | ||
| 1276 | |||
| 1277 | /* | ||
| 1278 | * downgrade a write lock into a read lock | ||
| 1279 | * - caller incremented waiting part of count and discovered it still negative | ||
| 1280 | * - just wake up any readers at the front of the queue | ||
| 1281 | */ | ||
| 1282 | static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) | ||
| 1283 | { | ||
| 1284 | unsigned long flags; | ||
| 1285 | DEFINE_WAKE_Q(wake_q); | ||
| 1286 | |||
| 1287 | raw_spin_lock_irqsave(&sem->wait_lock, flags); | ||
| 1288 | |||
| 1289 | if (!list_empty(&sem->wait_list)) | ||
| 1290 | rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); | ||
| 1291 | |||
| 1292 | raw_spin_unlock_irqrestore(&sem->wait_lock, flags); | ||
| 1293 | wake_up_q(&wake_q); | ||
| 1294 | |||
| 1295 | return sem; | ||
| 1296 | } | ||
| 1297 | |||
| 1298 | /* | ||
| 1299 | * lock for reading | ||
| 1300 | */ | ||
| 1301 | inline void __down_read(struct rw_semaphore *sem) | ||
| 1302 | { | ||
| 1303 | if (!rwsem_read_trylock(sem)) { | ||
| 1304 | rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); | ||
| 1305 | DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); | ||
| 1306 | } else { | ||
| 1307 | rwsem_set_reader_owned(sem); | ||
| 1308 | } | ||
| 1309 | } | ||
| 1310 | |||
| 1311 | static inline int __down_read_killable(struct rw_semaphore *sem) | ||
| 1312 | { | ||
| 1313 | if (!rwsem_read_trylock(sem)) { | ||
| 1314 | if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) | ||
| 1315 | return -EINTR; | ||
| 1316 | DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); | ||
| 1317 | } else { | ||
| 1318 | rwsem_set_reader_owned(sem); | ||
| 1319 | } | ||
| 1320 | return 0; | ||
| 1321 | } | ||
| 1322 | |||
| 1323 | static inline int __down_read_trylock(struct rw_semaphore *sem) | ||
| 1324 | { | ||
| 1325 | /* | ||
| 1326 | * Optimize for the case when the rwsem is not locked at all. | ||
| 1327 | */ | ||
| 1328 | long tmp = RWSEM_UNLOCKED_VALUE; | ||
| 1329 | |||
| 1330 | do { | ||
| 1331 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, | ||
| 1332 | tmp + RWSEM_READER_BIAS)) { | ||
| 1333 | rwsem_set_reader_owned(sem); | ||
| 1334 | return 1; | ||
| 1335 | } | ||
| 1336 | } while (!(tmp & RWSEM_READ_FAILED_MASK)); | ||
| 1337 | return 0; | ||
| 1338 | } | ||
| 1339 | |||
| 1340 | /* | ||
| 1341 | * lock for writing | ||
| 1342 | */ | ||
| 1343 | static inline void __down_write(struct rw_semaphore *sem) | ||
| 1344 | { | ||
| 1345 | long tmp = RWSEM_UNLOCKED_VALUE; | ||
| 1346 | |||
| 1347 | if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, | ||
| 1348 | RWSEM_WRITER_LOCKED))) | ||
| 1349 | rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); | ||
| 1350 | else | ||
| 1351 | rwsem_set_owner(sem); | ||
| 1352 | } | ||
| 1353 | |||
| 1354 | static inline int __down_write_killable(struct rw_semaphore *sem) | ||
| 1355 | { | ||
| 1356 | long tmp = RWSEM_UNLOCKED_VALUE; | ||
| 1357 | |||
| 1358 | if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, | ||
| 1359 | RWSEM_WRITER_LOCKED))) { | ||
| 1360 | if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) | ||
| 1361 | return -EINTR; | ||
| 1362 | } else { | ||
| 1363 | rwsem_set_owner(sem); | ||
| 1364 | } | ||
| 1365 | return 0; | ||
| 1366 | } | ||
| 1367 | |||
| 1368 | static inline int __down_write_trylock(struct rw_semaphore *sem) | ||
| 1369 | { | ||
| 1370 | long tmp = RWSEM_UNLOCKED_VALUE; | ||
| 1371 | |||
| 1372 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, | ||
| 1373 | RWSEM_WRITER_LOCKED)) { | ||
| 1374 | rwsem_set_owner(sem); | ||
| 1375 | return true; | ||
| 1376 | } | ||
| 1377 | return false; | ||
| 1378 | } | ||
| 1379 | |||
| 1380 | /* | ||
| 1381 | * unlock after reading | ||
| 1382 | */ | ||
| 1383 | inline void __up_read(struct rw_semaphore *sem) | ||
| 1384 | { | ||
| 1385 | long tmp; | ||
| 1386 | |||
| 1387 | DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); | ||
| 1388 | rwsem_clear_reader_owned(sem); | ||
| 1389 | tmp = atomic_long_add_return_release(-RWSEM_READER_BIAS, &sem->count); | ||
| 1390 | DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); | ||
| 1391 | if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == | ||
| 1392 | RWSEM_FLAG_WAITERS)) { | ||
| 1393 | clear_wr_nonspinnable(sem); | ||
| 1394 | rwsem_wake(sem, tmp); | ||
| 1395 | } | ||
| 1396 | } | ||
| 1397 | |||
| 1398 | /* | ||
| 1399 | * unlock after writing | ||
| 1400 | */ | ||
| 1401 | static inline void __up_write(struct rw_semaphore *sem) | ||
| 1402 | { | ||
| 1403 | long tmp; | ||
| 1404 | |||
| 1405 | /* | ||
| 1406 | * sem->owner may differ from current if the ownership is transferred | ||
| 1407 | * to an anonymous writer by setting the RWSEM_NONSPINNABLE bits. | ||
| 1408 | */ | ||
| 1409 | DEBUG_RWSEMS_WARN_ON((rwsem_owner(sem) != current) && | ||
| 1410 | !rwsem_test_oflags(sem, RWSEM_NONSPINNABLE), sem); | ||
| 1411 | rwsem_clear_owner(sem); | ||
| 1412 | tmp = atomic_long_fetch_add_release(-RWSEM_WRITER_LOCKED, &sem->count); | ||
| 1413 | if (unlikely(tmp & RWSEM_FLAG_WAITERS)) | ||
| 1414 | rwsem_wake(sem, tmp); | ||
| 1415 | } | ||
| 1416 | |||
| 1417 | /* | ||
| 1418 | * downgrade write lock to read lock | ||
| 1419 | */ | ||
| 1420 | static inline void __downgrade_write(struct rw_semaphore *sem) | ||
| 1421 | { | ||
| 1422 | long tmp; | ||
| 1423 | |||
| 1424 | /* | ||
| 1425 | * When downgrading from exclusive to shared ownership, | ||
| 1426 | * anything inside the write-locked region cannot leak | ||
| 1427 | * into the read side. In contrast, anything in the | ||
| 1428 | * read-locked region is ok to be re-ordered into the | ||
| 1429 | * write side. As such, rely on RELEASE semantics. | ||
| 1430 | */ | ||
| 1431 | DEBUG_RWSEMS_WARN_ON(rwsem_owner(sem) != current, sem); | ||
| 1432 | tmp = atomic_long_fetch_add_release( | ||
| 1433 | -RWSEM_WRITER_LOCKED+RWSEM_READER_BIAS, &sem->count); | ||
| 1434 | rwsem_set_reader_owned(sem); | ||
| 1435 | if (tmp & RWSEM_FLAG_WAITERS) | ||
| 1436 | rwsem_downgrade_wake(sem); | ||
| 1437 | } | ||
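The fast paths above back the down_read/down_write/downgrade_write family exported later in this file. As a usage sketch (a hypothetical caller, not code from this patch), a writer that wants to publish state and then keep only read-side protection would do:

    #include <linux/rwsem.h>

    static DECLARE_RWSEM(my_sem);      /* hypothetical lock and data */
    static int my_state;

    static int publish_then_read(int new_state)
    {
            int seen;

            down_write(&my_sem);       /* exclusive: no readers or writers */
            my_state = new_state;
            downgrade_write(&my_sem);  /* become a reader with no unlock window */

            seen = my_state;           /* other readers may now run concurrently */
            up_read(&my_sem);          /* pairs with the downgraded lock */
            return seen;
    }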
| 17 | 1438 | ||
| 18 | /* | 1439 | /* |
| 19 | * lock for reading | 1440 | * lock for reading |
| @@ -25,7 +1446,6 @@ void __sched down_read(struct rw_semaphore *sem) | |||
| 25 | 1446 | ||
| 26 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); | 1447 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
| 27 | } | 1448 | } |
| 28 | |||
| 29 | EXPORT_SYMBOL(down_read); | 1449 | EXPORT_SYMBOL(down_read); |
| 30 | 1450 | ||
| 31 | int __sched down_read_killable(struct rw_semaphore *sem) | 1451 | int __sched down_read_killable(struct rw_semaphore *sem) |
| @@ -40,7 +1460,6 @@ int __sched down_read_killable(struct rw_semaphore *sem) | |||
| 40 | 1460 | ||
| 41 | return 0; | 1461 | return 0; |
| 42 | } | 1462 | } |
| 43 | |||
| 44 | EXPORT_SYMBOL(down_read_killable); | 1463 | EXPORT_SYMBOL(down_read_killable); |
| 45 | 1464 | ||
| 46 | /* | 1465 | /* |
| @@ -54,7 +1473,6 @@ int down_read_trylock(struct rw_semaphore *sem) | |||
| 54 | rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); | 1473 | rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); |
| 55 | return ret; | 1474 | return ret; |
| 56 | } | 1475 | } |
| 57 | |||
| 58 | EXPORT_SYMBOL(down_read_trylock); | 1476 | EXPORT_SYMBOL(down_read_trylock); |
| 59 | 1477 | ||
| 60 | /* | 1478 | /* |
| @@ -64,10 +1482,8 @@ void __sched down_write(struct rw_semaphore *sem) | |||
| 64 | { | 1482 | { |
| 65 | might_sleep(); | 1483 | might_sleep(); |
| 66 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | 1484 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); |
| 67 | |||
| 68 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | 1485 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
| 69 | } | 1486 | } |
| 70 | |||
| 71 | EXPORT_SYMBOL(down_write); | 1487 | EXPORT_SYMBOL(down_write); |
| 72 | 1488 | ||
| 73 | /* | 1489 | /* |
| @@ -78,14 +1494,14 @@ int __sched down_write_killable(struct rw_semaphore *sem) | |||
| 78 | might_sleep(); | 1494 | might_sleep(); |
| 79 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | 1495 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); |
| 80 | 1496 | ||
| 81 | if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { | 1497 | if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, |
| 1498 | __down_write_killable)) { | ||
| 82 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | 1499 | rwsem_release(&sem->dep_map, 1, _RET_IP_); |
| 83 | return -EINTR; | 1500 | return -EINTR; |
| 84 | } | 1501 | } |
| 85 | 1502 | ||
| 86 | return 0; | 1503 | return 0; |
| 87 | } | 1504 | } |
| 88 | |||
| 89 | EXPORT_SYMBOL(down_write_killable); | 1505 | EXPORT_SYMBOL(down_write_killable); |
| 90 | 1506 | ||
| 91 | /* | 1507 | /* |
| @@ -100,7 +1516,6 @@ int down_write_trylock(struct rw_semaphore *sem) | |||
| 100 | 1516 | ||
| 101 | return ret; | 1517 | return ret; |
| 102 | } | 1518 | } |
| 103 | |||
| 104 | EXPORT_SYMBOL(down_write_trylock); | 1519 | EXPORT_SYMBOL(down_write_trylock); |
| 105 | 1520 | ||
| 106 | /* | 1521 | /* |
| @@ -109,10 +1524,8 @@ EXPORT_SYMBOL(down_write_trylock); | |||
| 109 | void up_read(struct rw_semaphore *sem) | 1524 | void up_read(struct rw_semaphore *sem) |
| 110 | { | 1525 | { |
| 111 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | 1526 | rwsem_release(&sem->dep_map, 1, _RET_IP_); |
| 112 | |||
| 113 | __up_read(sem); | 1527 | __up_read(sem); |
| 114 | } | 1528 | } |
| 115 | |||
| 116 | EXPORT_SYMBOL(up_read); | 1529 | EXPORT_SYMBOL(up_read); |
| 117 | 1530 | ||
| 118 | /* | 1531 | /* |
| @@ -121,10 +1534,8 @@ EXPORT_SYMBOL(up_read); | |||
| 121 | void up_write(struct rw_semaphore *sem) | 1534 | void up_write(struct rw_semaphore *sem) |
| 122 | { | 1535 | { |
| 123 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | 1536 | rwsem_release(&sem->dep_map, 1, _RET_IP_); |
| 124 | |||
| 125 | __up_write(sem); | 1537 | __up_write(sem); |
| 126 | } | 1538 | } |
| 127 | |||
| 128 | EXPORT_SYMBOL(up_write); | 1539 | EXPORT_SYMBOL(up_write); |
| 129 | 1540 | ||
| 130 | /* | 1541 | /* |
| @@ -133,10 +1544,8 @@ EXPORT_SYMBOL(up_write); | |||
| 133 | void downgrade_write(struct rw_semaphore *sem) | 1544 | void downgrade_write(struct rw_semaphore *sem) |
| 134 | { | 1545 | { |
| 135 | lock_downgrade(&sem->dep_map, _RET_IP_); | 1546 | lock_downgrade(&sem->dep_map, _RET_IP_); |
| 136 | |||
| 137 | __downgrade_write(sem); | 1547 | __downgrade_write(sem); |
| 138 | } | 1548 | } |
| 139 | |||
| 140 | EXPORT_SYMBOL(downgrade_write); | 1549 | EXPORT_SYMBOL(downgrade_write); |
| 141 | 1550 | ||
| 142 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 1551 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| @@ -145,40 +1554,32 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) | |||
| 145 | { | 1554 | { |
| 146 | might_sleep(); | 1555 | might_sleep(); |
| 147 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); | 1556 | rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); |
| 148 | |||
| 149 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); | 1557 | LOCK_CONTENDED(sem, __down_read_trylock, __down_read); |
| 150 | } | 1558 | } |
| 151 | |||
| 152 | EXPORT_SYMBOL(down_read_nested); | 1559 | EXPORT_SYMBOL(down_read_nested); |
| 153 | 1560 | ||
| 154 | void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) | 1561 | void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) |
| 155 | { | 1562 | { |
| 156 | might_sleep(); | 1563 | might_sleep(); |
| 157 | rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); | 1564 | rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_); |
| 158 | |||
| 159 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | 1565 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
| 160 | } | 1566 | } |
| 161 | |||
| 162 | EXPORT_SYMBOL(_down_write_nest_lock); | 1567 | EXPORT_SYMBOL(_down_write_nest_lock); |
| 163 | 1568 | ||
| 164 | void down_read_non_owner(struct rw_semaphore *sem) | 1569 | void down_read_non_owner(struct rw_semaphore *sem) |
| 165 | { | 1570 | { |
| 166 | might_sleep(); | 1571 | might_sleep(); |
| 167 | |||
| 168 | __down_read(sem); | 1572 | __down_read(sem); |
| 169 | __rwsem_set_reader_owned(sem, NULL); | 1573 | __rwsem_set_reader_owned(sem, NULL); |
| 170 | } | 1574 | } |
| 171 | |||
| 172 | EXPORT_SYMBOL(down_read_non_owner); | 1575 | EXPORT_SYMBOL(down_read_non_owner); |
| 173 | 1576 | ||
| 174 | void down_write_nested(struct rw_semaphore *sem, int subclass) | 1577 | void down_write_nested(struct rw_semaphore *sem, int subclass) |
| 175 | { | 1578 | { |
| 176 | might_sleep(); | 1579 | might_sleep(); |
| 177 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); | 1580 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); |
| 178 | |||
| 179 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); | 1581 | LOCK_CONTENDED(sem, __down_write_trylock, __down_write); |
| 180 | } | 1582 | } |
| 181 | |||
| 182 | EXPORT_SYMBOL(down_write_nested); | 1583 | EXPORT_SYMBOL(down_write_nested); |
| 183 | 1584 | ||
| 184 | int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) | 1585 | int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) |
| @@ -186,23 +1587,21 @@ int __sched down_write_killable_nested(struct rw_semaphore *sem, int subclass) | |||
| 186 | might_sleep(); | 1587 | might_sleep(); |
| 187 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); | 1588 | rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_); |
| 188 | 1589 | ||
| 189 | if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, __down_write_killable)) { | 1590 | if (LOCK_CONTENDED_RETURN(sem, __down_write_trylock, |
| 1591 | __down_write_killable)) { | ||
| 190 | rwsem_release(&sem->dep_map, 1, _RET_IP_); | 1592 | rwsem_release(&sem->dep_map, 1, _RET_IP_); |
| 191 | return -EINTR; | 1593 | return -EINTR; |
| 192 | } | 1594 | } |
| 193 | 1595 | ||
| 194 | return 0; | 1596 | return 0; |
| 195 | } | 1597 | } |
| 196 | |||
| 197 | EXPORT_SYMBOL(down_write_killable_nested); | 1598 | EXPORT_SYMBOL(down_write_killable_nested); |
| 198 | 1599 | ||
| 199 | void up_read_non_owner(struct rw_semaphore *sem) | 1600 | void up_read_non_owner(struct rw_semaphore *sem) |
| 200 | { | 1601 | { |
| 201 | DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), | 1602 | DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); |
| 202 | sem); | ||
| 203 | __up_read(sem); | 1603 | __up_read(sem); |
| 204 | } | 1604 | } |
| 205 | |||
| 206 | EXPORT_SYMBOL(up_read_non_owner); | 1605 | EXPORT_SYMBOL(up_read_non_owner); |
| 207 | 1606 | ||
| 208 | #endif | 1607 | #endif |
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 64877f5294e3..2534ce49f648 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h | |||
| @@ -1,304 +1,10 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
| 2 | /* | ||
| 3 | * The least significant 2 bits of the owner value has the following | ||
| 4 | * meanings when set. | ||
| 5 | * - RWSEM_READER_OWNED (bit 0): The rwsem is owned by readers | ||
| 6 | * - RWSEM_ANONYMOUSLY_OWNED (bit 1): The rwsem is anonymously owned, | ||
| 7 | * i.e. the owner(s) cannot be readily determined. It can be reader | ||
| 8 | * owned or the owning writer is indeterminate. | ||
| 9 | * | ||
| 10 | * When a writer acquires a rwsem, it puts its task_struct pointer | ||
| 11 | * into the owner field. It is cleared after an unlock. | ||
| 12 | * | ||
| 13 | * When a reader acquires a rwsem, it will also puts its task_struct | ||
| 14 | * pointer into the owner field with both the RWSEM_READER_OWNED and | ||
| 15 | * RWSEM_ANONYMOUSLY_OWNED bits set. On unlock, the owner field will | ||
| 16 | * largely be left untouched. So for a free or reader-owned rwsem, | ||
| 17 | * the owner value may contain information about the last reader that | ||
| 18 | * acquires the rwsem. The anonymous bit is set because that particular | ||
| 19 | * reader may or may not still own the lock. | ||
| 20 | * | ||
| 21 | * That information may be helpful in debugging cases where the system | ||
| 22 | * seems to hang on a reader owned rwsem especially if only one reader | ||
| 23 | * is involved. Ideally we would like to track all the readers that own | ||
| 24 | * a rwsem, but the overhead is simply too big. | ||
| 25 | */ | ||
| 26 | #include "lock_events.h" | ||
| 27 | 2 | ||
| 28 | #define RWSEM_READER_OWNED (1UL << 0) | 3 | #ifndef __INTERNAL_RWSEM_H |
| 29 | #define RWSEM_ANONYMOUSLY_OWNED (1UL << 1) | 4 | #define __INTERNAL_RWSEM_H |
| 5 | #include <linux/rwsem.h> | ||
| 30 | 6 | ||
| 31 | #ifdef CONFIG_DEBUG_RWSEMS | 7 | extern void __down_read(struct rw_semaphore *sem); |
| 32 | # define DEBUG_RWSEMS_WARN_ON(c, sem) do { \ | 8 | extern void __up_read(struct rw_semaphore *sem); |
| 33 | if (!debug_locks_silent && \ | ||
| 34 | WARN_ONCE(c, "DEBUG_RWSEMS_WARN_ON(%s): count = 0x%lx, owner = 0x%lx, curr 0x%lx, list %sempty\n",\ | ||
| 35 | #c, atomic_long_read(&(sem)->count), \ | ||
| 36 | (long)((sem)->owner), (long)current, \ | ||
| 37 | list_empty(&(sem)->wait_list) ? "" : "not ")) \ | ||
| 38 | debug_locks_off(); \ | ||
| 39 | } while (0) | ||
| 40 | #else | ||
| 41 | # define DEBUG_RWSEMS_WARN_ON(c, sem) | ||
| 42 | #endif | ||
| 43 | 9 | ||
| 44 | /* | 10 | #endif /* __INTERNAL_RWSEM_H */ |
| 45 | * R/W semaphores originally for PPC using the stuff in lib/rwsem.c. | ||
| 46 | * Adapted largely from include/asm-i386/rwsem.h | ||
| 47 | * by Paul Mackerras <paulus@samba.org>. | ||
| 48 | */ | ||
| 49 | |||
| 50 | /* | ||
| 51 | * the semaphore definition | ||
| 52 | */ | ||
| 53 | #ifdef CONFIG_64BIT | ||
| 54 | # define RWSEM_ACTIVE_MASK 0xffffffffL | ||
| 55 | #else | ||
| 56 | # define RWSEM_ACTIVE_MASK 0x0000ffffL | ||
| 57 | #endif | ||
| 58 | |||
| 59 | #define RWSEM_ACTIVE_BIAS 0x00000001L | ||
| 60 | #define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) | ||
| 61 | #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS | ||
| 62 | #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) | ||
| 63 | |||
| 64 | #ifdef CONFIG_RWSEM_SPIN_ON_OWNER | ||
| 65 | /* | ||
| 66 | * All writes to owner are protected by WRITE_ONCE() to make sure that | ||
| 67 | * store tearing can't happen as optimistic spinners may read and use | ||
| 68 | * the owner value concurrently without the lock. Reads from owner, however, | ||
| 69 | * may not need READ_ONCE() as long as the pointer value is only used | ||
| 70 | * for comparison and isn't being dereferenced. | ||
| 71 | */ | ||
| 72 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
| 73 | { | ||
| 74 | WRITE_ONCE(sem->owner, current); | ||
| 75 | } | ||
| 76 | |||
| 77 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
| 78 | { | ||
| 79 | WRITE_ONCE(sem->owner, NULL); | ||
| 80 | } | ||
| 81 | |||
| 82 | /* | ||
| 83 | * The task_struct pointer of the last owning reader will be left in | ||
| 84 | * the owner field. | ||
| 85 | * | ||
| 86 | * Note that the owner value just indicates the task has owned the rwsem | ||
| 87 | * previously; it may not be the real owner or one of the real owners | ||
| 88 | * anymore when that field is examined, so take it with a grain of salt. | ||
| 89 | */ | ||
| 90 | static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, | ||
| 91 | struct task_struct *owner) | ||
| 92 | { | ||
| 93 | unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | ||
| 94 | | RWSEM_ANONYMOUSLY_OWNED; | ||
| 95 | |||
| 96 | WRITE_ONCE(sem->owner, (struct task_struct *)val); | ||
| 97 | } | ||
| 98 | |||
| 99 | static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) | ||
| 100 | { | ||
| 101 | __rwsem_set_reader_owned(sem, current); | ||
| 102 | } | ||
| 103 | |||
| 104 | /* | ||
| 105 | * Return true if a rwsem waiter can spin on the rwsem's owner | ||
| 106 | * and steal the lock, i.e. the lock is not anonymously owned. | ||
| 107 | * N.B. !owner is considered spinnable. | ||
| 108 | */ | ||
| 109 | static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) | ||
| 110 | { | ||
| 111 | return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); | ||
| 112 | } | ||
| 113 | |||
| 114 | /* | ||
| 115 | * Return true if rwsem is owned by an anonymous writer or readers. | ||
| 116 | */ | ||
| 117 | static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) | ||
| 118 | { | ||
| 119 | return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; | ||
| 120 | } | ||
| 121 | |||
| 122 | #ifdef CONFIG_DEBUG_RWSEMS | ||
| 123 | /* | ||
| 124 | * With CONFIG_DEBUG_RWSEMS configured, it will make sure that if there | ||
| 125 | * is a task pointer in owner of a reader-owned rwsem, it will be the | ||
| 126 | * real owner or one of the real owners. The only exception is when the | ||
| 127 | * unlock is done by up_read_non_owner(). | ||
| 128 | */ | ||
| 129 | #define rwsem_clear_reader_owned rwsem_clear_reader_owned | ||
| 130 | static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) | ||
| 131 | { | ||
| 132 | unsigned long val = (unsigned long)current | RWSEM_READER_OWNED | ||
| 133 | | RWSEM_ANONYMOUSLY_OWNED; | ||
| 134 | if (READ_ONCE(sem->owner) == (struct task_struct *)val) | ||
| 135 | cmpxchg_relaxed((unsigned long *)&sem->owner, val, | ||
| 136 | RWSEM_READER_OWNED | RWSEM_ANONYMOUSLY_OWNED); | ||
| 137 | } | ||
| 138 | #endif | ||
| 139 | |||
| 140 | #else | ||
| 141 | static inline void rwsem_set_owner(struct rw_semaphore *sem) | ||
| 142 | { | ||
| 143 | } | ||
| 144 | |||
| 145 | static inline void rwsem_clear_owner(struct rw_semaphore *sem) | ||
| 146 | { | ||
| 147 | } | ||
| 148 | |||
| 149 | static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, | ||
| 150 | struct task_struct *owner) | ||
| 151 | { | ||
| 152 | } | ||
| 153 | |||
| 154 | static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) | ||
| 155 | { | ||
| 156 | } | ||
| 157 | #endif | ||
| 158 | |||
| 159 | #ifndef rwsem_clear_reader_owned | ||
| 160 | static inline void rwsem_clear_reader_owned(struct rw_semaphore *sem) | ||
| 161 | { | ||
| 162 | } | ||
| 163 | #endif | ||
| 164 | |||
| 165 | extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem); | ||
| 166 | extern struct rw_semaphore *rwsem_down_read_failed_killable(struct rw_semaphore *sem); | ||
| 167 | extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem); | ||
| 168 | extern struct rw_semaphore *rwsem_down_write_failed_killable(struct rw_semaphore *sem); | ||
| 169 | extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem); | ||
| 170 | extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem); | ||
| 171 | |||
| 172 | /* | ||
| 173 | * lock for reading | ||
| 174 | */ | ||
| 175 | static inline void __down_read(struct rw_semaphore *sem) | ||
| 176 | { | ||
| 177 | if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { | ||
| 178 | rwsem_down_read_failed(sem); | ||
| 179 | DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & | ||
| 180 | RWSEM_READER_OWNED), sem); | ||
| 181 | } else { | ||
| 182 | rwsem_set_reader_owned(sem); | ||
| 183 | } | ||
| 184 | } | ||
| 185 | |||
| 186 | static inline int __down_read_killable(struct rw_semaphore *sem) | ||
| 187 | { | ||
| 188 | if (unlikely(atomic_long_inc_return_acquire(&sem->count) <= 0)) { | ||
| 189 | if (IS_ERR(rwsem_down_read_failed_killable(sem))) | ||
| 190 | return -EINTR; | ||
| 191 | DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & | ||
| 192 | RWSEM_READER_OWNED), sem); | ||
| 193 | } else { | ||
| 194 | rwsem_set_reader_owned(sem); | ||
| 195 | } | ||
| 196 | return 0; | ||
| 197 | } | ||
| 198 | |||
| 199 | static inline int __down_read_trylock(struct rw_semaphore *sem) | ||
| 200 | { | ||
| 201 | /* | ||
| 202 | * Optimize for the case when the rwsem is not locked at all. | ||
| 203 | */ | ||
| 204 | long tmp = RWSEM_UNLOCKED_VALUE; | ||
| 205 | |||
| 206 | lockevent_inc(rwsem_rtrylock); | ||
| 207 | do { | ||
| 208 | if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, | ||
| 209 | tmp + RWSEM_ACTIVE_READ_BIAS)) { | ||
| 210 | rwsem_set_reader_owned(sem); | ||
| 211 | return 1; | ||
| 212 | } | ||
| 213 | } while (tmp >= 0); | ||
| 214 | return 0; | ||
| 215 | } | ||
| 216 | |||
| 217 | /* | ||
| 218 | * lock for writing | ||
| 219 | */ | ||
| 220 | static inline void __down_write(struct rw_semaphore *sem) | ||
| 221 | { | ||
| 222 | long tmp; | ||
| 223 | |||
| 224 | tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, | ||
| 225 | &sem->count); | ||
| 226 | if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) | ||
| 227 | rwsem_down_write_failed(sem); | ||
| 228 | rwsem_set_owner(sem); | ||
| 229 | } | ||
| 230 | |||
| 231 | static inline int __down_write_killable(struct rw_semaphore *sem) | ||
| 232 | { | ||
| 233 | long tmp; | ||
| 234 | |||
| 235 | tmp = atomic_long_add_return_acquire(RWSEM_ACTIVE_WRITE_BIAS, | ||
| 236 | &sem->count); | ||
| 237 | if (unlikely(tmp != RWSEM_ACTIVE_WRITE_BIAS)) | ||
| 238 | if (IS_ERR(rwsem_down_write_failed_killable(sem))) | ||
| 239 | return -EINTR; | ||
| 240 | rwsem_set_owner(sem); | ||
| 241 | return 0; | ||
| 242 | } | ||
| 243 | |||
| 244 | static inline int __down_write_trylock(struct rw_semaphore *sem) | ||
| 245 | { | ||
| 246 | long tmp; | ||
| 247 | |||
| 248 | lockevent_inc(rwsem_wtrylock); | ||
| 249 | tmp = atomic_long_cmpxchg_acquire(&sem->count, RWSEM_UNLOCKED_VALUE, | ||
| 250 | RWSEM_ACTIVE_WRITE_BIAS); | ||
| 251 | if (tmp == RWSEM_UNLOCKED_VALUE) { | ||
| 252 | rwsem_set_owner(sem); | ||
| 253 | return true; | ||
| 254 | } | ||
| 255 | return false; | ||
| 256 | } | ||
| 257 | |||
| 258 | /* | ||
| 259 | * unlock after reading | ||
| 260 | */ | ||
| 261 | static inline void __up_read(struct rw_semaphore *sem) | ||
| 262 | { | ||
| 263 | long tmp; | ||
| 264 | |||
| 265 | DEBUG_RWSEMS_WARN_ON(!((unsigned long)sem->owner & RWSEM_READER_OWNED), | ||
| 266 | sem); | ||
| 267 | rwsem_clear_reader_owned(sem); | ||
| 268 | tmp = atomic_long_dec_return_release(&sem->count); | ||
| 269 | if (unlikely(tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)) | ||
| 270 | rwsem_wake(sem); | ||
| 271 | } | ||
| 272 | |||
| 273 | /* | ||
| 274 | * unlock after writing | ||
| 275 | */ | ||
| 276 | static inline void __up_write(struct rw_semaphore *sem) | ||
| 277 | { | ||
| 278 | DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); | ||
| 279 | rwsem_clear_owner(sem); | ||
| 280 | if (unlikely(atomic_long_sub_return_release(RWSEM_ACTIVE_WRITE_BIAS, | ||
| 281 | &sem->count) < 0)) | ||
| 282 | rwsem_wake(sem); | ||
| 283 | } | ||
| 284 | |||
| 285 | /* | ||
| 286 | * downgrade write lock to read lock | ||
| 287 | */ | ||
| 288 | static inline void __downgrade_write(struct rw_semaphore *sem) | ||
| 289 | { | ||
| 290 | long tmp; | ||
| 291 | |||
| 292 | /* | ||
| 293 | * When downgrading from exclusive to shared ownership, | ||
| 294 | * anything inside the write-locked region cannot leak | ||
| 295 | * into the read side. In contrast, anything in the | ||
| 296 | * read-locked region is ok to be re-ordered into the | ||
| 297 | * write side. As such, rely on RELEASE semantics. | ||
| 298 | */ | ||
| 299 | DEBUG_RWSEMS_WARN_ON(sem->owner != current, sem); | ||
| 300 | tmp = atomic_long_add_return_release(-RWSEM_WAITING_BIAS, &sem->count); | ||
| 301 | rwsem_set_reader_owned(sem); | ||
| 302 | if (tmp < 0) | ||
| 303 | rwsem_downgrade_wake(sem); | ||
| 304 | } | ||
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c index 561acdd39960..d9dd94defc0a 100644 --- a/kernel/locking/semaphore.c +++ b/kernel/locking/semaphore.c | |||
| @@ -1,9 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Copyright (c) 2008 Intel Corporation | 3 | * Copyright (c) 2008 Intel Corporation |
| 3 | * Author: Matthew Wilcox <willy@linux.intel.com> | 4 | * Author: Matthew Wilcox <willy@linux.intel.com> |
| 4 | * | 5 | * |
| 5 | * Distributed under the terms of the GNU GPL, version 2 | ||
| 6 | * | ||
| 7 | * This file implements counting semaphores. | 6 | * This file implements counting semaphores. |
| 8 | * A counting semaphore may be acquired 'n' times before sleeping. | 7 | * A counting semaphore may be acquired 'n' times before sleeping. |
| 9 | * See mutex.c for single-acquisition sleeping locks which enforce | 8 | * See mutex.c for single-acquisition sleeping locks which enforce |
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 65a3b7e55b9f..3e82f449b4ff 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c | |||
| @@ -1,19 +1,6 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * Module-based API test facility for ww_mutexes | 3 | * Module-based API test facility for ww_mutexes |
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, you can access it online at | ||
| 16 | * http://www.gnu.org/licenses/gpl-2.0.html. | ||
| 17 | */ | 4 | */ |
| 18 | 5 | ||
| 19 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
diff --git a/kernel/memremap.c b/kernel/memremap.c index 1490e63f69a9..6e1970719dc2 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c | |||
| @@ -95,6 +95,7 @@ static void devm_memremap_pages_release(void *data) | |||
| 95 | pgmap->kill(pgmap->ref); | 95 | pgmap->kill(pgmap->ref); |
| 96 | for_each_device_pfn(pfn, pgmap) | 96 | for_each_device_pfn(pfn, pgmap) |
| 97 | put_page(pfn_to_page(pfn)); | 97 | put_page(pfn_to_page(pfn)); |
| 98 | pgmap->cleanup(pgmap->ref); | ||
| 98 | 99 | ||
| 99 | /* pages are dead and unused, undo the arch mapping */ | 100 | /* pages are dead and unused, undo the arch mapping */ |
| 100 | align_start = res->start & ~(SECTION_SIZE - 1); | 101 | align_start = res->start & ~(SECTION_SIZE - 1); |
| @@ -133,8 +134,8 @@ static void devm_memremap_pages_release(void *data) | |||
| 133 | * 2/ The altmap field may optionally be initialized, in which case altmap_valid | 134 | * 2/ The altmap field may optionally be initialized, in which case altmap_valid |
| 134 | * must be set to true | 135 | * must be set to true |
| 135 | * | 136 | * |
| 136 | * 3/ pgmap->ref must be 'live' on entry and will be killed at | 137 | * 3/ pgmap->ref must be 'live' on entry and will be killed and reaped |
| 137 | * devm_memremap_pages_release() time, or if this routine fails. | 138 | * at devm_memremap_pages_release() time, or if this routine fails. |
| 138 | * | 139 | * |
| 139 | * 4/ res is expected to be a host memory range that could feasibly be | 140 | * 4/ res is expected to be a host memory range that could feasibly be |
| 140 | * treated as a "System RAM" range, i.e. not a device mmio range, but | 141 | * treated as a "System RAM" range, i.e. not a device mmio range, but |
| @@ -156,8 +157,10 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | |||
| 156 | pgprot_t pgprot = PAGE_KERNEL; | 157 | pgprot_t pgprot = PAGE_KERNEL; |
| 157 | int error, nid, is_ram; | 158 | int error, nid, is_ram; |
| 158 | 159 | ||
| 159 | if (!pgmap->ref || !pgmap->kill) | 160 | if (!pgmap->ref || !pgmap->kill || !pgmap->cleanup) { |
| 161 | WARN(1, "Missing reference count teardown definition\n"); | ||
| 160 | return ERR_PTR(-EINVAL); | 162 | return ERR_PTR(-EINVAL); |
| 163 | } | ||
| 161 | 164 | ||
| 162 | align_start = res->start & ~(SECTION_SIZE - 1); | 165 | align_start = res->start & ~(SECTION_SIZE - 1); |
| 163 | align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) | 166 | align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) |
| @@ -168,14 +171,16 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | |||
| 168 | if (conflict_pgmap) { | 171 | if (conflict_pgmap) { |
| 169 | dev_WARN(dev, "Conflicting mapping in same section\n"); | 172 | dev_WARN(dev, "Conflicting mapping in same section\n"); |
| 170 | put_dev_pagemap(conflict_pgmap); | 173 | put_dev_pagemap(conflict_pgmap); |
| 171 | return ERR_PTR(-ENOMEM); | 174 | error = -ENOMEM; |
| 175 | goto err_array; | ||
| 172 | } | 176 | } |
| 173 | 177 | ||
| 174 | conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); | 178 | conflict_pgmap = get_dev_pagemap(PHYS_PFN(align_end), NULL); |
| 175 | if (conflict_pgmap) { | 179 | if (conflict_pgmap) { |
| 176 | dev_WARN(dev, "Conflicting mapping in same section\n"); | 180 | dev_WARN(dev, "Conflicting mapping in same section\n"); |
| 177 | put_dev_pagemap(conflict_pgmap); | 181 | put_dev_pagemap(conflict_pgmap); |
| 178 | return ERR_PTR(-ENOMEM); | 182 | error = -ENOMEM; |
| 183 | goto err_array; | ||
| 179 | } | 184 | } |
| 180 | 185 | ||
| 181 | is_ram = region_intersects(align_start, align_size, | 186 | is_ram = region_intersects(align_start, align_size, |
| @@ -267,10 +272,18 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) | |||
| 267 | pgmap_array_delete(res); | 272 | pgmap_array_delete(res); |
| 268 | err_array: | 273 | err_array: |
| 269 | pgmap->kill(pgmap->ref); | 274 | pgmap->kill(pgmap->ref); |
| 275 | pgmap->cleanup(pgmap->ref); | ||
| 276 | |||
| 270 | return ERR_PTR(error); | 277 | return ERR_PTR(error); |
| 271 | } | 278 | } |
| 272 | EXPORT_SYMBOL_GPL(devm_memremap_pages); | 279 | EXPORT_SYMBOL_GPL(devm_memremap_pages); |
| 273 | 280 | ||
| 281 | void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) | ||
| 282 | { | ||
| 283 | devm_release_action(dev, devm_memremap_pages_release, pgmap); | ||
| 284 | } | ||
| 285 | EXPORT_SYMBOL_GPL(devm_memunmap_pages); | ||
| 286 | |||
| 274 | unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) | 287 | unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) |
| 275 | { | 288 | { |
| 276 | /* number of pfns from base where pfn_to_page() is valid */ | 289 | /* number of pfns from base where pfn_to_page() is valid */ |
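With this change devm_memremap_pages() refuses to run unless the caller has wired up all three of pgmap->ref, pgmap->kill and pgmap->cleanup, and the error path (like the release path referenced in the comment above) invokes ->kill followed by ->cleanup. A minimal sketch of the callback wiring a caller is expected to provide, assuming a percpu_ref-based reference count; the my_* names and the completion-based wait are illustrative, not taken from this patch:

#include <linux/completion.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/memremap.h>
#include <linux/percpu-refcount.h>

struct my_pgmap {
        struct dev_pagemap pgmap;
        struct percpu_ref ref;
        struct completion done;
};

/* Release callback: the last page reference is gone, unblock ->cleanup. */
static void my_ref_release(struct percpu_ref *ref)
{
        struct my_pgmap *p = container_of(ref, struct my_pgmap, ref);

        complete(&p->done);
}

/* ->kill: start tearing the reference count down. */
static void my_ref_kill(struct percpu_ref *ref)
{
        percpu_ref_kill(ref);
}

/* ->cleanup: wait until the count really hits zero, then free it. */
static void my_ref_cleanup(struct percpu_ref *ref)
{
        struct my_pgmap *p = container_of(ref, struct my_pgmap, ref);

        wait_for_completion(&p->done);
        percpu_ref_exit(ref);
}

static void *my_map_pages(struct device *dev, struct my_pgmap *p)
{
        init_completion(&p->done);
        if (percpu_ref_init(&p->ref, my_ref_release, 0, GFP_KERNEL))
                return ERR_PTR(-ENOMEM);

        p->pgmap.ref = &p->ref;
        p->pgmap.kill = my_ref_kill;
        p->pgmap.cleanup = my_ref_cleanup;
        /* p->pgmap.res, altmap, type, etc. are set up elsewhere. */
        return devm_memremap_pages(dev, &p->pgmap);
}

Teardown then happens either through devres when the device goes away or explicitly via the new devm_memunmap_pages(dev, &p->pgmap) helper; both paths end up in devm_memremap_pages_release(), which per the updated comment kills and then reaps the reference count.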
diff --git a/kernel/module-internal.h b/kernel/module-internal.h index 79c9be2dbbe9..33783abc377b 100644 --- a/kernel/module-internal.h +++ b/kernel/module-internal.h | |||
| @@ -1,12 +1,8 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ | ||
| 1 | /* Module internals | 2 | /* Module internals |
| 2 | * | 3 | * |
| 3 | * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. | 4 | * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. |
| 4 | * Written by David Howells (dhowells@redhat.com) | 5 | * Written by David Howells (dhowells@redhat.com) |
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public Licence | ||
| 8 | * as published by the Free Software Foundation; either version | ||
| 9 | * 2 of the Licence, or (at your option) any later version. | ||
| 10 | */ | 6 | */ |
| 11 | 7 | ||
| 12 | #include <linux/elf.h> | 8 | #include <linux/elf.h> |
| @@ -20,7 +16,7 @@ struct load_info { | |||
| 20 | unsigned long len; | 16 | unsigned long len; |
| 21 | Elf_Shdr *sechdrs; | 17 | Elf_Shdr *sechdrs; |
| 22 | char *secstrings, *strtab; | 18 | char *secstrings, *strtab; |
| 23 | unsigned long symoffs, stroffs; | 19 | unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs; |
| 24 | struct _ddebug *debug; | 20 | struct _ddebug *debug; |
| 25 | unsigned int num_debug; | 21 | unsigned int num_debug; |
| 26 | bool sig_ok; | 22 | bool sig_ok; |
diff --git a/kernel/module.c b/kernel/module.c index a9e1e7f2c224..a2cee14a83f3 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
| @@ -1,20 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | Copyright (C) 2002 Richard Henderson | 3 | Copyright (C) 2002 Richard Henderson |
| 3 | Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. | 4 | Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. |
| 4 | 5 | ||
| 5 | This program is free software; you can redistribute it and/or modify | ||
| 6 | it under the terms of the GNU General Public License as published by | ||
| 7 | the Free Software Foundation; either version 2 of the License, or | ||
| 8 | (at your option) any later version. | ||
| 9 | |||
| 10 | This program is distributed in the hope that it will be useful, | ||
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 13 | GNU General Public License for more details. | ||
| 14 | |||
| 15 | You should have received a copy of the GNU General Public License | ||
| 16 | along with this program; if not, write to the Free Software | ||
| 17 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 18 | */ | 6 | */ |
| 19 | #include <linux/export.h> | 7 | #include <linux/export.h> |
| 20 | #include <linux/extable.h> | 8 | #include <linux/extable.h> |
| @@ -2642,6 +2630,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) | |||
| 2642 | info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); | 2630 | info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); |
| 2643 | info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); | 2631 | info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); |
| 2644 | mod->core_layout.size += strtab_size; | 2632 | mod->core_layout.size += strtab_size; |
| 2633 | info->core_typeoffs = mod->core_layout.size; | ||
| 2634 | mod->core_layout.size += ndst * sizeof(char); | ||
| 2645 | mod->core_layout.size = debug_align(mod->core_layout.size); | 2635 | mod->core_layout.size = debug_align(mod->core_layout.size); |
| 2646 | 2636 | ||
| 2647 | /* Put string table section at end of init part of module. */ | 2637 | /* Put string table section at end of init part of module. */ |
| @@ -2655,6 +2645,8 @@ static void layout_symtab(struct module *mod, struct load_info *info) | |||
| 2655 | __alignof__(struct mod_kallsyms)); | 2645 | __alignof__(struct mod_kallsyms)); |
| 2656 | info->mod_kallsyms_init_off = mod->init_layout.size; | 2646 | info->mod_kallsyms_init_off = mod->init_layout.size; |
| 2657 | mod->init_layout.size += sizeof(struct mod_kallsyms); | 2647 | mod->init_layout.size += sizeof(struct mod_kallsyms); |
| 2648 | info->init_typeoffs = mod->init_layout.size; | ||
| 2649 | mod->init_layout.size += nsrc * sizeof(char); | ||
| 2658 | mod->init_layout.size = debug_align(mod->init_layout.size); | 2650 | mod->init_layout.size = debug_align(mod->init_layout.size); |
| 2659 | } | 2651 | } |
| 2660 | 2652 | ||
| @@ -2678,20 +2670,23 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) | |||
| 2678 | mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); | 2670 | mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); |
| 2679 | /* Make sure we get permanent strtab: don't use info->strtab. */ | 2671 | /* Make sure we get permanent strtab: don't use info->strtab. */ |
| 2680 | mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; | 2672 | mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; |
| 2673 | mod->kallsyms->typetab = mod->init_layout.base + info->init_typeoffs; | ||
| 2681 | 2674 | ||
| 2682 | /* Set types up while we still have access to sections. */ | 2675 | /* |
| 2683 | for (i = 0; i < mod->kallsyms->num_symtab; i++) | 2676 | * Now populate the cut down core kallsyms for after init |
| 2684 | mod->kallsyms->symtab[i].st_size | 2677 | * and set types up while we still have access to sections. |
| 2685 | = elf_type(&mod->kallsyms->symtab[i], info); | 2678 | */ |
| 2686 | |||
| 2687 | /* Now populate the cut down core kallsyms for after init. */ | ||
| 2688 | mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; | 2679 | mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; |
| 2689 | mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; | 2680 | mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; |
| 2681 | mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; | ||
| 2690 | src = mod->kallsyms->symtab; | 2682 | src = mod->kallsyms->symtab; |
| 2691 | for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { | 2683 | for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { |
| 2684 | mod->kallsyms->typetab[i] = elf_type(src + i, info); | ||
| 2692 | if (i == 0 || is_livepatch_module(mod) || | 2685 | if (i == 0 || is_livepatch_module(mod) || |
| 2693 | is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, | 2686 | is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, |
| 2694 | info->index.pcpu)) { | 2687 | info->index.pcpu)) { |
| 2688 | mod->core_kallsyms.typetab[ndst] = | ||
| 2689 | mod->kallsyms->typetab[i]; | ||
| 2695 | dst[ndst] = src[i]; | 2690 | dst[ndst] = src[i]; |
| 2696 | dst[ndst++].st_name = s - mod->core_kallsyms.strtab; | 2691 | dst[ndst++].st_name = s - mod->core_kallsyms.strtab; |
| 2697 | s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], | 2692 | s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], |
| @@ -3088,6 +3083,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) | |||
| 3088 | sizeof(*mod->tracepoints_ptrs), | 3083 | sizeof(*mod->tracepoints_ptrs), |
| 3089 | &mod->num_tracepoints); | 3084 | &mod->num_tracepoints); |
| 3090 | #endif | 3085 | #endif |
| 3086 | #ifdef CONFIG_TREE_SRCU | ||
| 3087 | mod->srcu_struct_ptrs = section_objs(info, "___srcu_struct_ptrs", | ||
| 3088 | sizeof(*mod->srcu_struct_ptrs), | ||
| 3089 | &mod->num_srcu_structs); | ||
| 3090 | #endif | ||
| 3091 | #ifdef CONFIG_BPF_EVENTS | 3091 | #ifdef CONFIG_BPF_EVENTS |
| 3092 | mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", | 3092 | mod->bpf_raw_events = section_objs(info, "__bpf_raw_tp_map", |
| 3093 | sizeof(*mod->bpf_raw_events), | 3093 | sizeof(*mod->bpf_raw_events), |
| @@ -4091,7 +4091,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, | |||
| 4091 | const Elf_Sym *sym = &kallsyms->symtab[symnum]; | 4091 | const Elf_Sym *sym = &kallsyms->symtab[symnum]; |
| 4092 | 4092 | ||
| 4093 | *value = kallsyms_symbol_value(sym); | 4093 | *value = kallsyms_symbol_value(sym); |
| 4094 | *type = sym->st_size; | 4094 | *type = kallsyms->typetab[symnum]; |
| 4095 | strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); | 4095 | strlcpy(name, kallsyms_symbol_name(kallsyms, symnum), KSYM_NAME_LEN); |
| 4096 | strlcpy(module_name, mod->name, MODULE_NAME_LEN); | 4096 | strlcpy(module_name, mod->name, MODULE_NAME_LEN); |
| 4097 | *exported = is_exported(name, *value, mod); | 4097 | *exported = is_exported(name, *value, mod); |
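The kallsyms part of this module.c change stops encoding the one-character symbol type (the letter reported for each symbol, e.g. 'T' or 'd') in st_size and instead gives both the full symbol table and the cut-down post-init copy a parallel typetab array, which is what module_get_kallsym() now reads. A stripped-down illustration of the resulting layout; the struct and helpers below are made up for the example and are not the kernel's mod_kallsyms:

#include <elf.h>

/* One type character per symbol, held beside the symbol table instead of
 * being smuggled into each symbol's st_size field. */
struct kallsyms_view {
        Elf64_Sym *symtab;
        char *strtab;
        char *typetab;                 /* typetab[i] describes symtab[i] */
        unsigned int num_symtab;
};

static char symbol_type(const struct kallsyms_view *ks, unsigned int i)
{
        return i < ks->num_symtab ? ks->typetab[i] : '\0';
}

static const char *symbol_name(const struct kallsyms_view *ks, unsigned int i)
{
        return ks->strtab + ks->symtab[i].st_name;
}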
diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6b9a926fd86b..b10fb1986ca9 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c | |||
| @@ -1,12 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* Module signature checker | 2 | /* Module signature checker |
| 2 | * | 3 | * |
| 3 | * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. | 4 | * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved. |
| 4 | * Written by David Howells (dhowells@redhat.com) | 5 | * Written by David Howells (dhowells@redhat.com) |
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public Licence | ||
| 8 | * as published by the Free Software Foundation; either version | ||
| 9 | * 2 of the Licence, or (at your option) any later version. | ||
| 10 | */ | 6 | */ |
| 11 | 7 | ||
| 12 | #include <linux/kernel.h> | 8 | #include <linux/kernel.h> |
diff --git a/kernel/notifier.c b/kernel/notifier.c index 6196af8a8223..d9f5081d578d 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | #include <linux/kdebug.h> | 2 | #include <linux/kdebug.h> |
| 2 | #include <linux/kprobes.h> | 3 | #include <linux/kprobes.h> |
| 3 | #include <linux/export.h> | 4 | #include <linux/export.h> |
| @@ -22,6 +23,7 @@ static int notifier_chain_register(struct notifier_block **nl, | |||
| 22 | struct notifier_block *n) | 23 | struct notifier_block *n) |
| 23 | { | 24 | { |
| 24 | while ((*nl) != NULL) { | 25 | while ((*nl) != NULL) { |
| 26 | WARN_ONCE(((*nl) == n), "double register detected"); | ||
| 25 | if (n->priority > (*nl)->priority) | 27 | if (n->priority > (*nl)->priority) |
| 26 | break; | 28 | break; |
| 27 | nl = &((*nl)->next); | 29 | nl = &((*nl)->next); |
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index f6c5d330059a..c815f58e6bc0 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c | |||
| @@ -1,13 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Copyright (C) 2006 IBM Corporation | 3 | * Copyright (C) 2006 IBM Corporation |
| 3 | * | 4 | * |
| 4 | * Author: Serge Hallyn <serue@us.ibm.com> | 5 | * Author: Serge Hallyn <serue@us.ibm.com> |
| 5 | * | 6 | * |
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public License as | ||
| 8 | * published by the Free Software Foundation, version 2 of the | ||
| 9 | * License. | ||
| 10 | * | ||
| 11 | * Jun 2006 - namespaces support | 7 | * Jun 2006 - namespaces support |
| 12 | * OpenVZ, SWsoft Inc. | 8 | * OpenVZ, SWsoft Inc. |
| 13 | * Pavel Emelianov <xemul@openvz.org> | 9 | * Pavel Emelianov <xemul@openvz.org> |
diff --git a/kernel/panic.c b/kernel/panic.c index c1fcaad337b7..4d9f55bf7d38 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/panic.c | 3 | * linux/kernel/panic.c |
| 3 | * | 4 | * |
| @@ -51,6 +52,7 @@ EXPORT_SYMBOL_GPL(panic_timeout); | |||
| 51 | #define PANIC_PRINT_TIMER_INFO 0x00000004 | 52 | #define PANIC_PRINT_TIMER_INFO 0x00000004 |
| 52 | #define PANIC_PRINT_LOCK_INFO 0x00000008 | 53 | #define PANIC_PRINT_LOCK_INFO 0x00000008 |
| 53 | #define PANIC_PRINT_FTRACE_INFO 0x00000010 | 54 | #define PANIC_PRINT_FTRACE_INFO 0x00000010 |
| 55 | #define PANIC_PRINT_ALL_PRINTK_MSG 0x00000020 | ||
| 54 | unsigned long panic_print; | 56 | unsigned long panic_print; |
| 55 | 57 | ||
| 56 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); | 58 | ATOMIC_NOTIFIER_HEAD(panic_notifier_list); |
| @@ -134,6 +136,9 @@ EXPORT_SYMBOL(nmi_panic); | |||
| 134 | 136 | ||
| 135 | static void panic_print_sys_info(void) | 137 | static void panic_print_sys_info(void) |
| 136 | { | 138 | { |
| 139 | if (panic_print & PANIC_PRINT_ALL_PRINTK_MSG) | ||
| 140 | console_flush_on_panic(CONSOLE_REPLAY_ALL); | ||
| 141 | |||
| 137 | if (panic_print & PANIC_PRINT_TASK_INFO) | 142 | if (panic_print & PANIC_PRINT_TASK_INFO) |
| 138 | show_state(); | 143 | show_state(); |
| 139 | 144 | ||
| @@ -277,7 +282,7 @@ void panic(const char *fmt, ...) | |||
| 277 | * panic() is not being called from OOPS. | 282 | * panic() is not being called from OOPS. |
| 278 | */ | 283 | */ |
| 279 | debug_locks_off(); | 284 | debug_locks_off(); |
| 280 | console_flush_on_panic(); | 285 | console_flush_on_panic(CONSOLE_FLUSH_PENDING); |
| 281 | 286 | ||
| 282 | panic_print_sys_info(); | 287 | panic_print_sys_info(); |
| 283 | 288 | ||
| @@ -306,6 +311,8 @@ void panic(const char *fmt, ...) | |||
| 306 | * shutting down. But if there is a chance of | 311 | * shutting down. But if there is a chance of |
| 307 | * rebooting the system it will be rebooted. | 312 | * rebooting the system it will be rebooted. |
| 308 | */ | 313 | */ |
| 314 | if (panic_reboot_mode != REBOOT_UNDEFINED) | ||
| 315 | reboot_mode = panic_reboot_mode; | ||
| 309 | emergency_restart(); | 316 | emergency_restart(); |
| 310 | } | 317 | } |
| 311 | #ifdef __sparc__ | 318 | #ifdef __sparc__ |
| @@ -321,6 +328,9 @@ void panic(const char *fmt, ...) | |||
| 321 | disabled_wait(); | 328 | disabled_wait(); |
| 322 | #endif | 329 | #endif |
| 323 | pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); | 330 | pr_emerg("---[ end Kernel panic - not syncing: %s ]---\n", buf); |
| 331 | |||
| 332 | /* Do not scroll important messages printed above */ | ||
| 333 | suppress_printk = 1; | ||
| 324 | local_irq_enable(); | 334 | local_irq_enable(); |
| 325 | for (i = 0; ; i += PANIC_TIMER_STEP) { | 335 | for (i = 0; ; i += PANIC_TIMER_STEP) { |
| 326 | touch_softlockup_watchdog(); | 336 | touch_softlockup_watchdog(); |
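The panic.c hunks add three behaviours: panic_print gains a 0x20 bit that makes panic_print_sys_info() replay the whole kernel log (see the printk.c changes below for the CONSOLE_REPLAY_ALL side), reboot_mode is overridden by panic_reboot_mode when one has been configured, and suppress_printk is set just before the final busy-wait loop so that late messages cannot push the panic report off the console. Assuming the existing panic_print plumbing (a kernel parameter and sysctl of the same name), booting with panic_print=0x20, or OR-ing that bit into a larger mask such as 0x3f, is what enables the new replay behaviour.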
diff --git a/kernel/params.c b/kernel/params.c index ce89f757e6da..cf448785d058 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
| @@ -1,19 +1,7 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* Helpers for initial module or kernel cmdline parsing | 2 | /* Helpers for initial module or kernel cmdline parsing |
| 2 | Copyright (C) 2001 Rusty Russell. | 3 | Copyright (C) 2001 Rusty Russell. |
| 3 | 4 | ||
| 4 | This program is free software; you can redistribute it and/or modify | ||
| 5 | it under the terms of the GNU General Public License as published by | ||
| 6 | the Free Software Foundation; either version 2 of the License, or | ||
| 7 | (at your option) any later version. | ||
| 8 | |||
| 9 | This program is distributed in the hope that it will be useful, | ||
| 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | GNU General Public License for more details. | ||
| 13 | |||
| 14 | You should have received a copy of the GNU General Public License | ||
| 15 | along with this program; if not, write to the Free Software | ||
| 16 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
| 17 | */ | 5 | */ |
| 18 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
| 19 | #include <linux/string.h> | 7 | #include <linux/string.h> |
diff --git a/kernel/pid.c b/kernel/pid.c index 20881598bdfa..16263b526560 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Generic pidhash and scalable, time-bounded PID allocator | 3 | * Generic pidhash and scalable, time-bounded PID allocator |
| 3 | * | 4 | * |
| @@ -32,12 +33,13 @@ | |||
| 32 | #include <linux/init.h> | 33 | #include <linux/init.h> |
| 33 | #include <linux/rculist.h> | 34 | #include <linux/rculist.h> |
| 34 | #include <linux/memblock.h> | 35 | #include <linux/memblock.h> |
| 35 | #include <linux/hash.h> | ||
| 36 | #include <linux/pid_namespace.h> | 36 | #include <linux/pid_namespace.h> |
| 37 | #include <linux/init_task.h> | 37 | #include <linux/init_task.h> |
| 38 | #include <linux/syscalls.h> | 38 | #include <linux/syscalls.h> |
| 39 | #include <linux/proc_ns.h> | 39 | #include <linux/proc_ns.h> |
| 40 | #include <linux/proc_fs.h> | 40 | #include <linux/proc_fs.h> |
| 41 | #include <linux/anon_inodes.h> | ||
| 42 | #include <linux/sched/signal.h> | ||
| 41 | #include <linux/sched/task.h> | 43 | #include <linux/sched/task.h> |
| 42 | #include <linux/idr.h> | 44 | #include <linux/idr.h> |
| 43 | 45 | ||
| @@ -214,6 +216,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) | |||
| 214 | for (type = 0; type < PIDTYPE_MAX; ++type) | 216 | for (type = 0; type < PIDTYPE_MAX; ++type) |
| 215 | INIT_HLIST_HEAD(&pid->tasks[type]); | 217 | INIT_HLIST_HEAD(&pid->tasks[type]); |
| 216 | 218 | ||
| 219 | init_waitqueue_head(&pid->wait_pidfd); | ||
| 220 | |||
| 217 | upid = pid->numbers + ns->level; | 221 | upid = pid->numbers + ns->level; |
| 218 | spin_lock_irq(&pidmap_lock); | 222 | spin_lock_irq(&pidmap_lock); |
| 219 | if (!(ns->pid_allocated & PIDNS_ADDING)) | 223 | if (!(ns->pid_allocated & PIDNS_ADDING)) |
| @@ -451,6 +455,73 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) | |||
| 451 | return idr_get_next(&ns->idr, &nr); | 455 | return idr_get_next(&ns->idr, &nr); |
| 452 | } | 456 | } |
| 453 | 457 | ||
| 458 | /** | ||
| 459 | * pidfd_create() - Create a new pid file descriptor. | ||
| 460 | * | ||
| 461 | * @pid: struct pid that the pidfd will reference | ||
| 462 | * | ||
| 463 | * This creates a new pid file descriptor with the O_CLOEXEC flag set. | ||
| 464 | * | ||
| 465 | * Note, that this function can only be called after the fd table has | ||
| 466 | * been unshared to avoid leaking the pidfd to the new process. | ||
| 467 | * | ||
| 468 | * Return: On success, a cloexec pidfd is returned. | ||
| 469 | * On error, a negative errno number will be returned. | ||
| 470 | */ | ||
| 471 | static int pidfd_create(struct pid *pid) | ||
| 472 | { | ||
| 473 | int fd; | ||
| 474 | |||
| 475 | fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), | ||
| 476 | O_RDWR | O_CLOEXEC); | ||
| 477 | if (fd < 0) | ||
| 478 | put_pid(pid); | ||
| 479 | |||
| 480 | return fd; | ||
| 481 | } | ||
| 482 | |||
| 483 | /** | ||
| 484 | * pidfd_open() - Open new pid file descriptor. | ||
| 485 | * | ||
| 486 | * @pid: pid for which to retrieve a pidfd | ||
| 487 | * @flags: flags to pass | ||
| 488 | * | ||
| 489 | * This creates a new pid file descriptor with the O_CLOEXEC flag set for | ||
| 490 | * the process identified by @pid. Currently, the process identified by | ||
| 491 | * @pid must be a thread-group leader. This restriction currently exists | ||
| 492 | * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot | ||
| 493 | * be used with CLONE_THREAD) and pidfd polling (only supports thread group | ||
| 494 | * leaders). | ||
| 495 | * | ||
| 496 | * Return: On success, a cloexec pidfd is returned. | ||
| 497 | * On error, a negative errno number will be returned. | ||
| 498 | */ | ||
| 499 | SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) | ||
| 500 | { | ||
| 501 | int fd, ret; | ||
| 502 | struct pid *p; | ||
| 503 | |||
| 504 | if (flags) | ||
| 505 | return -EINVAL; | ||
| 506 | |||
| 507 | if (pid <= 0) | ||
| 508 | return -EINVAL; | ||
| 509 | |||
| 510 | p = find_get_pid(pid); | ||
| 511 | if (!p) | ||
| 512 | return -ESRCH; | ||
| 513 | |||
| 514 | ret = 0; | ||
| 515 | rcu_read_lock(); | ||
| 516 | if (!pid_task(p, PIDTYPE_TGID)) | ||
| 517 | ret = -EINVAL; | ||
| 518 | rcu_read_unlock(); | ||
| 519 | |||
| 520 | fd = ret ?: pidfd_create(p); | ||
| 521 | put_pid(p); | ||
| 522 | return fd; | ||
| 523 | } | ||
| 524 | |||
| 454 | void __init pid_idr_init(void) | 525 | void __init pid_idr_init(void) |
| 455 | { | 526 | { |
| 456 | /* Verify no one has done anything silly: */ | 527 | /* Verify no one has done anything silly: */ |
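Since pidfd_open() is a brand-new system call, userspace without an updated libc reaches it through syscall(2). A minimal sketch of a consumer, assuming __NR_pidfd_open is provided by the installed kernel headers (the program below is illustrative and not part of the patch); with the pidfd polling support referenced in the comment above, the descriptor becomes readable once the thread-group leader exits:

#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Thin wrapper; __NR_pidfd_open must come from the kernel headers. */
static int pidfd_open(pid_t pid, unsigned int flags)
{
        return syscall(__NR_pidfd_open, pid, flags);
}

int main(int argc, char *argv[])
{
        struct pollfd pfd;
        int pidfd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return EXIT_FAILURE;
        }

        /* flags must currently be 0, and pid must be a thread-group leader. */
        pidfd = pidfd_open((pid_t)atoi(argv[1]), 0);
        if (pidfd < 0) {
                perror("pidfd_open");
                return EXIT_FAILURE;
        }

        pfd.fd = pidfd;
        pfd.events = POLLIN;
        if (poll(&pfd, 1, -1) < 0) {
                perror("poll");
                return EXIT_FAILURE;
        }

        printf("process %s has exited\n", argv[1]);
        close(pidfd);
        return EXIT_SUCCESS;
}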
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index aa6e72fb7c08..6d726cef241c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Pid namespaces | 3 | * Pid namespaces |
| 3 | * | 4 | * |
| @@ -325,7 +326,7 @@ int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) | |||
| 325 | } | 326 | } |
| 326 | 327 | ||
| 327 | read_lock(&tasklist_lock); | 328 | read_lock(&tasklist_lock); |
| 328 | force_sig(SIGKILL, pid_ns->child_reaper); | 329 | send_sig(SIGKILL, pid_ns->child_reaper, 1); |
| 329 | read_unlock(&tasklist_lock); | 330 | read_unlock(&tasklist_lock); |
| 330 | 331 | ||
| 331 | do_exit(0); | 332 | do_exit(0); |
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 9bbaaab14b36..ff8592ddedee 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | config SUSPEND | 2 | config SUSPEND |
| 2 | bool "Suspend to RAM and standby" | 3 | bool "Suspend to RAM and standby" |
| 3 | depends on ARCH_SUSPEND_POSSIBLE | 4 | depends on ARCH_SUSPEND_POSSIBLE |
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 7d66ee68aaaf..0a9326f5f421 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c | |||
| @@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, | |||
| 223 | * All CPUs of a domain must have the same micro-architecture | 223 | * All CPUs of a domain must have the same micro-architecture |
| 224 | * since they all share the same table. | 224 | * since they all share the same table. |
| 225 | */ | 225 | */ |
| 226 | cap = arch_scale_cpu_capacity(NULL, cpu); | 226 | cap = arch_scale_cpu_capacity(cpu); |
| 227 | if (prev_cap && prev_cap != cap) { | 227 | if (prev_cap && prev_cap != cap) { |
| 228 | pr_err("CPUs of %*pbl must have the same capacity\n", | 228 | pr_err("CPUs of %*pbl must have the same capacity\n", |
| 229 | cpumask_pr_args(span)); | 229 | cpumask_pr_args(span)); |
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c8c272df7154..cd7434e6000d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. | 3 | * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support. |
| 3 | * | 4 | * |
| @@ -6,8 +7,6 @@ | |||
| 6 | * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> | 7 | * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz> |
| 7 | * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. | 8 | * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. |
| 8 | * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> | 9 | * Copyright (C) 2012 Bojan Smojver <bojan@rexursive.com> |
| 9 | * | ||
| 10 | * This file is released under the GPLv2. | ||
| 11 | */ | 10 | */ |
| 12 | 11 | ||
| 13 | #define pr_fmt(fmt) "PM: " fmt | 12 | #define pr_fmt(fmt) "PM: " fmt |
| @@ -129,7 +128,7 @@ static int hibernation_test(int level) { return 0; } | |||
| 129 | static int platform_begin(int platform_mode) | 128 | static int platform_begin(int platform_mode) |
| 130 | { | 129 | { |
| 131 | return (platform_mode && hibernation_ops) ? | 130 | return (platform_mode && hibernation_ops) ? |
| 132 | hibernation_ops->begin() : 0; | 131 | hibernation_ops->begin(PMSG_FREEZE) : 0; |
| 133 | } | 132 | } |
| 134 | 133 | ||
| 135 | /** | 134 | /** |
| @@ -257,6 +256,11 @@ void swsusp_show_speed(ktime_t start, ktime_t stop, | |||
| 257 | (kps % 1000) / 10); | 256 | (kps % 1000) / 10); |
| 258 | } | 257 | } |
| 259 | 258 | ||
| 259 | __weak int arch_resume_nosmt(void) | ||
| 260 | { | ||
| 261 | return 0; | ||
| 262 | } | ||
| 263 | |||
| 260 | /** | 264 | /** |
| 261 | * create_image - Create a hibernation image. | 265 | * create_image - Create a hibernation image. |
| 262 | * @platform_mode: Whether or not to use the platform driver. | 266 | * @platform_mode: Whether or not to use the platform driver. |
| @@ -324,6 +328,10 @@ static int create_image(int platform_mode) | |||
| 324 | Enable_cpus: | 328 | Enable_cpus: |
| 325 | suspend_enable_secondary_cpus(); | 329 | suspend_enable_secondary_cpus(); |
| 326 | 330 | ||
| 331 | /* Allow architectures to do nosmt-specific post-resume dances */ | ||
| 332 | if (!in_suspend) | ||
| 333 | error = arch_resume_nosmt(); | ||
| 334 | |||
| 327 | Platform_finish: | 335 | Platform_finish: |
| 328 | platform_finish(platform_mode); | 336 | platform_finish(platform_mode); |
| 329 | 337 | ||
| @@ -542,7 +550,7 @@ int hibernation_platform_enter(void) | |||
| 542 | * hibernation_ops->finish() before saving the image, so we should let | 550 | * hibernation_ops->finish() before saving the image, so we should let |
| 543 | * the firmware know that we're going to enter the sleep state after all | 551 | * the firmware know that we're going to enter the sleep state after all |
| 544 | */ | 552 | */ |
| 545 | error = hibernation_ops->begin(); | 553 | error = hibernation_ops->begin(PMSG_HIBERNATE); |
| 546 | if (error) | 554 | if (error) |
| 547 | goto Close; | 555 | goto Close; |
| 548 | 556 | ||
diff --git a/kernel/power/main.c b/kernel/power/main.c index 4f43e724f6eb..bdbd605c4215 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
| @@ -1,11 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/power/main.c - PM subsystem core functionality. | 3 | * kernel/power/main.c - PM subsystem core functionality. |
| 3 | * | 4 | * |
| 4 | * Copyright (c) 2003 Patrick Mochel | 5 | * Copyright (c) 2003 Patrick Mochel |
| 5 | * Copyright (c) 2003 Open Source Development Lab | 6 | * Copyright (c) 2003 Open Source Development Lab |
| 6 | * | ||
| 7 | * This file is released under the GPLv2 | ||
| 8 | * | ||
| 9 | */ | 7 | */ |
| 10 | 8 | ||
| 11 | #include <linux/export.h> | 9 | #include <linux/export.h> |
diff --git a/kernel/power/power.h b/kernel/power/power.h index 9e58bdc8a562..44bee462ff57 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
| @@ -75,8 +75,6 @@ static inline void hibernate_reserved_size_init(void) {} | |||
| 75 | static inline void hibernate_image_size_init(void) {} | 75 | static inline void hibernate_image_size_init(void) {} |
| 76 | #endif /* !CONFIG_HIBERNATION */ | 76 | #endif /* !CONFIG_HIBERNATION */ |
| 77 | 77 | ||
| 78 | extern int pfn_is_nosave(unsigned long); | ||
| 79 | |||
| 80 | #define power_attr(_name) \ | 78 | #define power_attr(_name) \ |
| 81 | static struct kobj_attribute _name##_attr = { \ | 79 | static struct kobj_attribute _name##_attr = { \ |
| 82 | .attr = { \ | 80 | .attr = { \ |
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 7ef6866b521d..6d475281c730 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c | |||
| @@ -1,7 +1,6 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * poweroff.c - sysrq handler to gracefully power down machine. | 3 | * poweroff.c - sysrq handler to gracefully power down machine. |
| 3 | * | ||
| 4 | * This file is released under the GPL v2 | ||
| 5 | */ | 4 | */ |
| 6 | 5 | ||
| 7 | #include <linux/kernel.h> | 6 | #include <linux/kernel.h> |
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 9d22131afc1e..33e3febaba53 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * This module exposes the interface to kernel space for specifying | 3 | * This module exposes the interface to kernel space for specifying |
| 3 | * QoS dependencies. It provides infrastructure for registration of: | 4 | * QoS dependencies. It provides infrastructure for registration of: |
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index bc9558ab1e5b..83105874f255 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/power/snapshot.c | 3 | * linux/kernel/power/snapshot.c |
| 3 | * | 4 | * |
| @@ -5,9 +6,6 @@ | |||
| 5 | * | 6 | * |
| 6 | * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> | 7 | * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz> |
| 7 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
| 8 | * | ||
| 9 | * This file is released under the GPLv2. | ||
| 10 | * | ||
| 11 | */ | 9 | */ |
| 12 | 10 | ||
| 13 | #define pr_fmt(fmt) "PM: " fmt | 11 | #define pr_fmt(fmt) "PM: " fmt |
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ef908c134b34..c874a7026e24 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c | |||
| @@ -1,11 +1,10 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/power/suspend.c - Suspend to RAM and standby functionality. | 3 | * kernel/power/suspend.c - Suspend to RAM and standby functionality. |
| 3 | * | 4 | * |
| 4 | * Copyright (c) 2003 Patrick Mochel | 5 | * Copyright (c) 2003 Patrick Mochel |
| 5 | * Copyright (c) 2003 Open Source Development Lab | 6 | * Copyright (c) 2003 Open Source Development Lab |
| 6 | * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. | 7 | * Copyright (c) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc. |
| 7 | * | ||
| 8 | * This file is released under the GPLv2. | ||
| 9 | */ | 8 | */ |
| 10 | 9 | ||
| 11 | #define pr_fmt(fmt) "PM: " fmt | 10 | #define pr_fmt(fmt) "PM: " fmt |
| @@ -62,11 +61,17 @@ static DECLARE_SWAIT_QUEUE_HEAD(s2idle_wait_head); | |||
| 62 | enum s2idle_states __read_mostly s2idle_state; | 61 | enum s2idle_states __read_mostly s2idle_state; |
| 63 | static DEFINE_RAW_SPINLOCK(s2idle_lock); | 62 | static DEFINE_RAW_SPINLOCK(s2idle_lock); |
| 64 | 63 | ||
| 65 | bool pm_suspend_via_s2idle(void) | 64 | /** |
| 65 | * pm_suspend_default_s2idle - Check if suspend-to-idle is the default suspend. | ||
| 66 | * | ||
| 67 | * Return 'true' if suspend-to-idle has been selected as the default system | ||
| 68 | * suspend method. | ||
| 69 | */ | ||
| 70 | bool pm_suspend_default_s2idle(void) | ||
| 66 | { | 71 | { |
| 67 | return mem_sleep_current == PM_SUSPEND_TO_IDLE; | 72 | return mem_sleep_current == PM_SUSPEND_TO_IDLE; |
| 68 | } | 73 | } |
| 69 | EXPORT_SYMBOL_GPL(pm_suspend_via_s2idle); | 74 | EXPORT_SYMBOL_GPL(pm_suspend_default_s2idle); |
| 70 | 75 | ||
| 71 | void s2idle_set_ops(const struct platform_s2idle_ops *ops) | 76 | void s2idle_set_ops(const struct platform_s2idle_ops *ops) |
| 72 | { | 77 | { |
| @@ -488,6 +493,9 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
| 488 | 493 | ||
| 489 | pm_suspend_target_state = state; | 494 | pm_suspend_target_state = state; |
| 490 | 495 | ||
| 496 | if (state == PM_SUSPEND_TO_IDLE) | ||
| 497 | pm_set_suspend_no_platform(); | ||
| 498 | |||
| 491 | error = platform_suspend_begin(state); | 499 | error = platform_suspend_begin(state); |
| 492 | if (error) | 500 | if (error) |
| 493 | goto Close; | 501 | goto Close; |
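pm_suspend_via_s2idle() is renamed to pm_suspend_default_s2idle() with unchanged semantics, so in-flight callers only need the one-line rename, and suspend_devices_and_enter() now calls pm_set_suspend_no_platform() when the target state is suspend-to-idle. A small, self-contained illustration of the kind of check the renamed helper is meant for (the function below is a made-up example, not code from the patch):

#include <linux/printk.h>
#include <linux/suspend.h>

/* Illustrative only: report which default system suspend method is set. */
static void report_default_suspend_method(void)
{
        if (pm_suspend_default_s2idle())
                pr_info("default suspend method: suspend-to-idle\n");
        else
                pr_info("default suspend method: platform sleep state\n");
}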
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c index 6a897e8b2a88..60564b58de07 100644 --- a/kernel/power/suspend_test.c +++ b/kernel/power/suspend_test.c | |||
| @@ -1,9 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. | 3 | * kernel/power/suspend_test.c - Suspend to RAM and standby test facility. |
| 3 | * | 4 | * |
| 4 | * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz> | 5 | * Copyright (c) 2009 Pavel Machek <pavel@ucw.cz> |
| 5 | * | ||
| 6 | * This file is released under the GPLv2. | ||
| 7 | */ | 6 | */ |
| 8 | 7 | ||
| 9 | #include <linux/init.h> | 8 | #include <linux/init.h> |
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index d7f6c1a288d3..ca0fcb5ced71 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/power/swap.c | 3 | * linux/kernel/power/swap.c |
| 3 | * | 4 | * |
| @@ -7,9 +8,6 @@ | |||
| 7 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> | 8 | * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> |
| 8 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 9 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
| 9 | * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> | 10 | * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> |
| 10 | * | ||
| 11 | * This file is released under the GPLv2. | ||
| 12 | * | ||
| 13 | */ | 11 | */ |
| 14 | 12 | ||
| 15 | #define pr_fmt(fmt) "PM: " fmt | 13 | #define pr_fmt(fmt) "PM: " fmt |
| @@ -976,12 +974,11 @@ static int get_swap_reader(struct swap_map_handle *handle, | |||
| 976 | last = handle->maps = NULL; | 974 | last = handle->maps = NULL; |
| 977 | offset = swsusp_header->image; | 975 | offset = swsusp_header->image; |
| 978 | while (offset) { | 976 | while (offset) { |
| 979 | tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL); | 977 | tmp = kzalloc(sizeof(*handle->maps), GFP_KERNEL); |
| 980 | if (!tmp) { | 978 | if (!tmp) { |
| 981 | release_swap_reader(handle); | 979 | release_swap_reader(handle); |
| 982 | return -ENOMEM; | 980 | return -ENOMEM; |
| 983 | } | 981 | } |
| 984 | memset(tmp, 0, sizeof(*tmp)); | ||
| 985 | if (!handle->maps) | 982 | if (!handle->maps) |
| 986 | handle->maps = tmp; | 983 | handle->maps = tmp; |
| 987 | if (last) | 984 | if (last) |
diff --git a/kernel/power/user.c b/kernel/power/user.c index cb24e840a3e6..77438954cc2b 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
| @@ -1,12 +1,10 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/power/user.c | 3 | * linux/kernel/power/user.c |
| 3 | * | 4 | * |
| 4 | * This file provides the user space interface for software suspend/resume. | 5 | * This file provides the user space interface for software suspend/resume. |
| 5 | * | 6 | * |
| 6 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> | 7 | * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> |
| 7 | * | ||
| 8 | * This file is released under the GPLv2. | ||
| 9 | * | ||
| 10 | */ | 8 | */ |
| 11 | 9 | ||
| 12 | #include <linux/suspend.h> | 10 | #include <linux/suspend.h> |
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 4a2ffc39eb95..4d052fc6bcde 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | obj-y = printk.o | 2 | obj-y = printk.o |
| 2 | obj-$(CONFIG_PRINTK) += printk_safe.o | 3 | obj-$(CONFIG_PRINTK) += printk_safe.o |
| 3 | obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o | 4 | obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o |
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index 0f1898820cba..c8e6ab689d42 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h | |||
| @@ -1,18 +1,6 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ | ||
| 1 | /* | 2 | /* |
| 2 | * internal.h - printk internal definitions | 3 | * internal.h - printk internal definitions |
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public License | ||
| 6 | * as published by the Free Software Foundation; either version 2 | ||
| 7 | * of the License, or (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, see <http://www.gnu.org/licenses/>. | ||
| 16 | */ | 4 | */ |
| 17 | #include <linux/percpu.h> | 5 | #include <linux/percpu.h> |
| 18 | 6 | ||
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 02ca827b8fac..1888f6a3b694 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/printk.c | 3 | * linux/kernel/printk.c |
| 3 | * | 4 | * |
| @@ -86,6 +87,12 @@ static DEFINE_SEMAPHORE(console_sem); | |||
| 86 | struct console *console_drivers; | 87 | struct console *console_drivers; |
| 87 | EXPORT_SYMBOL_GPL(console_drivers); | 88 | EXPORT_SYMBOL_GPL(console_drivers); |
| 88 | 89 | ||
| 90 | /* | ||
| 91 | * The system may need to suppress printk messages under certain | ||
| 92 | * circumstances, such as after a kernel panic. | ||
| 93 | */ | ||
| 94 | int __read_mostly suppress_printk; | ||
| 95 | |||
| 89 | #ifdef CONFIG_LOCKDEP | 96 | #ifdef CONFIG_LOCKDEP |
| 90 | static struct lockdep_map console_lock_dep_map = { | 97 | static struct lockdep_map console_lock_dep_map = { |
| 91 | .name = "console_lock" | 98 | .name = "console_lock" |
| @@ -1943,6 +1950,10 @@ asmlinkage int vprintk_emit(int facility, int level, | |||
| 1943 | unsigned long flags; | 1950 | unsigned long flags; |
| 1944 | u64 curr_log_seq; | 1951 | u64 curr_log_seq; |
| 1945 | 1952 | ||
| 1953 | /* Suppress unimportant messages after panic happens */ | ||
| 1954 | if (unlikely(suppress_printk)) | ||
| 1955 | return 0; | ||
| 1956 | |||
| 1946 | if (level == LOGLEVEL_SCHED) { | 1957 | if (level == LOGLEVEL_SCHED) { |
| 1947 | level = LOGLEVEL_DEFAULT; | 1958 | level = LOGLEVEL_DEFAULT; |
| 1948 | in_sched = true; | 1959 | in_sched = true; |
| @@ -2525,10 +2536,11 @@ void console_unblank(void) | |||
| 2525 | 2536 | ||
| 2526 | /** | 2537 | /** |
| 2527 | * console_flush_on_panic - flush console content on panic | 2538 | * console_flush_on_panic - flush console content on panic |
| 2539 | * @mode: flush all messages in buffer or just the pending ones | ||
| 2528 | * | 2540 | * |
| 2529 | * Immediately output all pending messages no matter what. | 2541 | * Immediately output all pending messages no matter what. |
| 2530 | */ | 2542 | */ |
| 2531 | void console_flush_on_panic(void) | 2543 | void console_flush_on_panic(enum con_flush_mode mode) |
| 2532 | { | 2544 | { |
| 2533 | /* | 2545 | /* |
| 2534 | * If someone else is holding the console lock, trylock will fail | 2546 | * If someone else is holding the console lock, trylock will fail |
| @@ -2539,6 +2551,15 @@ void console_flush_on_panic(void) | |||
| 2539 | */ | 2551 | */ |
| 2540 | console_trylock(); | 2552 | console_trylock(); |
| 2541 | console_may_schedule = 0; | 2553 | console_may_schedule = 0; |
| 2554 | |||
| 2555 | if (mode == CONSOLE_REPLAY_ALL) { | ||
| 2556 | unsigned long flags; | ||
| 2557 | |||
| 2558 | logbuf_lock_irqsave(flags); | ||
| 2559 | console_seq = log_first_seq; | ||
| 2560 | console_idx = log_first_idx; | ||
| 2561 | logbuf_unlock_irqrestore(flags); | ||
| 2562 | } | ||
| 2542 | console_unlock(); | 2563 | console_unlock(); |
| 2543 | } | 2564 | } |
| 2544 | 2565 | ||
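On the printk.c side, console_flush_on_panic() now takes an enum con_flush_mode: CONSOLE_FLUSH_PENDING preserves the old behaviour of writing out only the messages that have not reached the consoles yet, while CONSOLE_REPLAY_ALL rewinds console_seq/console_idx to log_first_seq/log_first_idx under logbuf_lock so the entire ring buffer is printed again. The new suppress_printk flag is checked at the top of vprintk_emit(), so once the panic path sets it, later printk() calls return early instead of scrolling the panic output away.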
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 0913b4d385de..b4045e782743 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c | |||
| @@ -1,18 +1,6 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * printk_safe.c - Safe printk for printk-deadlock-prone contexts | 3 | * printk_safe.c - Safe printk for printk-deadlock-prone contexts |
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public License | ||
| 6 | * as published by the Free Software Foundation; either version 2 | ||
| 7 | * of the License, or (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, see <http://www.gnu.org/licenses/>. | ||
| 16 | */ | 4 | */ |
| 17 | 5 | ||
| 18 | #include <linux/preempt.h> | 6 | #include <linux/preempt.h> |
diff --git a/kernel/profile.c b/kernel/profile.c index 9c08a2c7cb1d..af7c94bf5fa1 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/profile.c | 3 | * linux/kernel/profile.c |
| 3 | * Simple profiling. Manages a direct-mapped profile hit count buffer, | 4 | * Simple profiling. Manages a direct-mapped profile hit count buffer, |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6f357f4fc859..83a531cea2f3 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/ptrace.c | 3 | * linux/kernel/ptrace.c |
| 3 | * | 4 | * |
| @@ -78,9 +79,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, | |||
| 78 | */ | 79 | */ |
| 79 | static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) | 80 | static void ptrace_link(struct task_struct *child, struct task_struct *new_parent) |
| 80 | { | 81 | { |
| 81 | rcu_read_lock(); | 82 | __ptrace_link(child, new_parent, current_cred()); |
| 82 | __ptrace_link(child, new_parent, __task_cred(new_parent)); | ||
| 83 | rcu_read_unlock(); | ||
| 84 | } | 83 | } |
| 85 | 84 | ||
| 86 | /** | 85 | /** |
| @@ -117,6 +116,9 @@ void __ptrace_unlink(struct task_struct *child) | |||
| 117 | BUG_ON(!child->ptrace); | 116 | BUG_ON(!child->ptrace); |
| 118 | 117 | ||
| 119 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | 118 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); |
| 119 | #ifdef TIF_SYSCALL_EMU | ||
| 120 | clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); | ||
| 121 | #endif | ||
| 120 | 122 | ||
| 121 | child->parent = child->real_parent; | 123 | child->parent = child->real_parent; |
| 122 | list_del_init(&child->ptrace_entry); | 124 | list_del_init(&child->ptrace_entry); |
| @@ -323,6 +325,16 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) | |||
| 323 | return -EPERM; | 325 | return -EPERM; |
| 324 | ok: | 326 | ok: |
| 325 | rcu_read_unlock(); | 327 | rcu_read_unlock(); |
| 328 | /* | ||
| 329 | * If a task drops privileges and becomes nondumpable (through a syscall | ||
| 330 | * like setresuid()) while we are trying to access it, we must ensure | ||
| 331 | * that the dumpability is read after the credentials; otherwise, | ||
| 332 | * we may be able to attach to a task that we shouldn't be able to | ||
| 333 | * attach to (as if the task had dropped privileges without becoming | ||
| 334 | * nondumpable). | ||
| 335 | * Pairs with a write barrier in commit_creds(). | ||
| 336 | */ | ||
| 337 | smp_rmb(); | ||
| 326 | mm = task->mm; | 338 | mm = task->mm; |
| 327 | if (mm && | 339 | if (mm && |
| 328 | ((get_dumpable(mm) != SUID_DUMP_USER) && | 340 | ((get_dumpable(mm) != SUID_DUMP_USER) && |
| @@ -704,6 +716,10 @@ static int ptrace_peek_siginfo(struct task_struct *child, | |||
| 704 | if (arg.nr < 0) | 716 | if (arg.nr < 0) |
| 705 | return -EINVAL; | 717 | return -EINVAL; |
| 706 | 718 | ||
| 719 | /* Ensure arg.off fits in an unsigned long */ | ||
| 720 | if (arg.off > ULONG_MAX) | ||
| 721 | return 0; | ||
| 722 | |||
| 707 | if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) | 723 | if (arg.flags & PTRACE_PEEKSIGINFO_SHARED) |
| 708 | pending = &child->signal->shared_pending; | 724 | pending = &child->signal->shared_pending; |
| 709 | else | 725 | else |
| @@ -711,18 +727,20 @@ static int ptrace_peek_siginfo(struct task_struct *child, | |||
| 711 | 727 | ||
| 712 | for (i = 0; i < arg.nr; ) { | 728 | for (i = 0; i < arg.nr; ) { |
| 713 | kernel_siginfo_t info; | 729 | kernel_siginfo_t info; |
| 714 | s32 off = arg.off + i; | 730 | unsigned long off = arg.off + i; |
| 731 | bool found = false; | ||
| 715 | 732 | ||
| 716 | spin_lock_irq(&child->sighand->siglock); | 733 | spin_lock_irq(&child->sighand->siglock); |
| 717 | list_for_each_entry(q, &pending->list, list) { | 734 | list_for_each_entry(q, &pending->list, list) { |
| 718 | if (!off--) { | 735 | if (!off--) { |
| 736 | found = true; | ||
| 719 | copy_siginfo(&info, &q->info); | 737 | copy_siginfo(&info, &q->info); |
| 720 | break; | 738 | break; |
| 721 | } | 739 | } |
| 722 | } | 740 | } |
| 723 | spin_unlock_irq(&child->sighand->siglock); | 741 | spin_unlock_irq(&child->sighand->siglock); |
| 724 | 742 | ||
| 725 | if (off >= 0) /* beyond the end of the list */ | 743 | if (!found) /* beyond the end of the list */ |
| 726 | break; | 744 | break; |
| 727 | 745 | ||
| 728 | #ifdef CONFIG_COMPAT | 746 | #ifdef CONFIG_COMPAT |
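The comment added to __ptrace_may_access() describes one half of an ordering contract: a task dropping privileges must make its non-dumpable state visible before its new credentials (the write barrier in commit_creds() that the comment mentions, not shown in these hunks), and the tracer must read in the opposite order, which is what the new smp_rmb() enforces. A conceptual model of that pairing in plain C11 atomics; the names and constants are invented for the illustration and this is not the kernel code:

#include <stdatomic.h>
#include <stdbool.h>

enum { SUID_DUMP_DISABLE, SUID_DUMP_USER };        /* illustrative values */
enum { CREDS_PRIVILEGED, CREDS_UNPRIVILEGED };

static _Atomic int task_creds = CREDS_PRIVILEGED;
static _Atomic int task_dumpable = SUID_DUMP_USER;

/* Target task dropping privileges (the commit_creds() side). */
static void drop_privileges(void)
{
        atomic_store_explicit(&task_dumpable, SUID_DUMP_DISABLE,
                              memory_order_relaxed);
        atomic_thread_fence(memory_order_release);     /* the write barrier */
        atomic_store_explicit(&task_creds, CREDS_UNPRIVILEGED,
                              memory_order_relaxed);
}

/* Tracer (the __ptrace_may_access() side). */
static bool may_attach(int tracer_creds)
{
        if (atomic_load_explicit(&task_creds, memory_order_relaxed) !=
            tracer_creds)
                return false;                          /* cred check fails */

        atomic_thread_fence(memory_order_acquire);     /* the new smp_rmb() */

        return atomic_load_explicit(&task_dumpable, memory_order_relaxed) ==
               SUID_DUMP_USER;
}

If the tracer's credentials only match the task's post-setresuid() identity, then seeing CREDS_UNPRIVILEGED guarantees, through the fence pairing, that it also sees SUID_DUMP_DISABLE and the attach is refused; without the read-side fence the dumpability load could still return the stale SUID_DUMP_USER value.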
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 37301430970e..480edf328b51 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | # | 2 | # |
| 2 | # RCU-related configuration options | 3 | # RCU-related configuration options |
| 3 | # | 4 | # |
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 0ec7d1d33a14..5ec3ea4028e2 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | # | 2 | # |
| 2 | # RCU-related debugging configuration options | 3 | # RCU-related debugging configuration options |
| 3 | # | 4 | # |
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 4b58c907b4b7..5290b01de534 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h | |||
| @@ -11,11 +11,6 @@ | |||
| 11 | #define __LINUX_RCU_H | 11 | #define __LINUX_RCU_H |
| 12 | 12 | ||
| 13 | #include <trace/events/rcu.h> | 13 | #include <trace/events/rcu.h> |
| 14 | #ifdef CONFIG_RCU_TRACE | ||
| 15 | #define RCU_TRACE(stmt) stmt | ||
| 16 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
| 17 | #define RCU_TRACE(stmt) | ||
| 18 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
| 19 | 14 | ||
| 20 | /* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ | 15 | /* Offset to allow distinguishing irq vs. task-based idle entry/exit. */ |
| 21 | #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) | 16 | #define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) |
| @@ -216,12 +211,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) | |||
| 216 | 211 | ||
| 217 | rcu_lock_acquire(&rcu_callback_map); | 212 | rcu_lock_acquire(&rcu_callback_map); |
| 218 | if (__is_kfree_rcu_offset(offset)) { | 213 | if (__is_kfree_rcu_offset(offset)) { |
| 219 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) | 214 | trace_rcu_invoke_kfree_callback(rn, head, offset); |
| 220 | kfree((void *)head - offset); | 215 | kfree((void *)head - offset); |
| 221 | rcu_lock_release(&rcu_callback_map); | 216 | rcu_lock_release(&rcu_callback_map); |
| 222 | return true; | 217 | return true; |
| 223 | } else { | 218 | } else { |
| 224 | RCU_TRACE(trace_rcu_invoke_callback(rn, head);) | 219 | trace_rcu_invoke_callback(rn, head); |
| 225 | f = head->func; | 220 | f = head->func; |
| 226 | WRITE_ONCE(head->func, (rcu_callback_t)0L); | 221 | WRITE_ONCE(head->func, (rcu_callback_t)0L); |
| 227 | f(head); | 222 | f(head); |
| @@ -451,6 +446,7 @@ void rcu_request_urgent_qs_task(struct task_struct *t); | |||
| 451 | enum rcutorture_type { | 446 | enum rcutorture_type { |
| 452 | RCU_FLAVOR, | 447 | RCU_FLAVOR, |
| 453 | RCU_TASKS_FLAVOR, | 448 | RCU_TASKS_FLAVOR, |
| 449 | RCU_TRIVIAL_FLAVOR, | ||
| 454 | SRCU_FLAVOR, | 450 | SRCU_FLAVOR, |
| 455 | INVALID_RCU_FLAVOR | 451 | INVALID_RCU_FLAVOR |
| 456 | }; | 452 | }; |
| @@ -484,6 +480,10 @@ void do_trace_rcu_torture_read(const char *rcutorturename, | |||
| 484 | #endif | 480 | #endif |
| 485 | #endif | 481 | #endif |
| 486 | 482 | ||
| 483 | #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) | ||
| 484 | long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask); | ||
| 485 | #endif | ||
| 486 | |||
| 487 | #ifdef CONFIG_TINY_SRCU | 487 | #ifdef CONFIG_TINY_SRCU |
| 488 | 488 | ||
| 489 | static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, | 489 | static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, |
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index efaa5b3f4d3f..fce4e7e6f502 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c | |||
| @@ -299,6 +299,7 @@ struct rcu_torture_ops { | |||
| 299 | int irq_capable; | 299 | int irq_capable; |
| 300 | int can_boost; | 300 | int can_boost; |
| 301 | int extendables; | 301 | int extendables; |
| 302 | int slow_gps; | ||
| 302 | const char *name; | 303 | const char *name; |
| 303 | }; | 304 | }; |
| 304 | 305 | ||
| @@ -667,9 +668,51 @@ static struct rcu_torture_ops tasks_ops = { | |||
| 667 | .fqs = NULL, | 668 | .fqs = NULL, |
| 668 | .stats = NULL, | 669 | .stats = NULL, |
| 669 | .irq_capable = 1, | 670 | .irq_capable = 1, |
| 671 | .slow_gps = 1, | ||
| 670 | .name = "tasks" | 672 | .name = "tasks" |
| 671 | }; | 673 | }; |
| 672 | 674 | ||
| 675 | /* | ||
| 676 | * Definitions for trivial CONFIG_PREEMPT=n-only torture testing. | ||
| 677 | * This implementation does not necessarily work well with CPU hotplug. | ||
| 678 | */ | ||
| 679 | |||
| 680 | static void synchronize_rcu_trivial(void) | ||
| 681 | { | ||
| 682 | int cpu; | ||
| 683 | |||
| 684 | for_each_online_cpu(cpu) { | ||
| 685 | rcutorture_sched_setaffinity(current->pid, cpumask_of(cpu)); | ||
| 686 | WARN_ON_ONCE(raw_smp_processor_id() != cpu); | ||
| 687 | } | ||
| 688 | } | ||
| 689 | |||
| 690 | static int rcu_torture_read_lock_trivial(void) __acquires(RCU) | ||
| 691 | { | ||
| 692 | preempt_disable(); | ||
| 693 | return 0; | ||
| 694 | } | ||
| 695 | |||
| 696 | static void rcu_torture_read_unlock_trivial(int idx) __releases(RCU) | ||
| 697 | { | ||
| 698 | preempt_enable(); | ||
| 699 | } | ||
| 700 | |||
| 701 | static struct rcu_torture_ops trivial_ops = { | ||
| 702 | .ttype = RCU_TRIVIAL_FLAVOR, | ||
| 703 | .init = rcu_sync_torture_init, | ||
| 704 | .readlock = rcu_torture_read_lock_trivial, | ||
| 705 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
| 706 | .readunlock = rcu_torture_read_unlock_trivial, | ||
| 707 | .get_gp_seq = rcu_no_completed, | ||
| 708 | .sync = synchronize_rcu_trivial, | ||
| 709 | .exp_sync = synchronize_rcu_trivial, | ||
| 710 | .fqs = NULL, | ||
| 711 | .stats = NULL, | ||
| 712 | .irq_capable = 1, | ||
| 713 | .name = "trivial" | ||
| 714 | }; | ||
| 715 | |||
| 673 | static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) | 716 | static unsigned long rcutorture_seq_diff(unsigned long new, unsigned long old) |
| 674 | { | 717 | { |
| 675 | if (!cur_ops->gp_diff) | 718 | if (!cur_ops->gp_diff) |
| @@ -1010,10 +1053,17 @@ rcu_torture_writer(void *arg) | |||
| 1010 | !rcu_gp_is_normal(); | 1053 | !rcu_gp_is_normal(); |
| 1011 | } | 1054 | } |
| 1012 | rcu_torture_writer_state = RTWS_STUTTER; | 1055 | rcu_torture_writer_state = RTWS_STUTTER; |
| 1013 | if (stutter_wait("rcu_torture_writer")) | 1056 | if (stutter_wait("rcu_torture_writer") && |
| 1057 | !READ_ONCE(rcu_fwd_cb_nodelay) && | ||
| 1058 | !cur_ops->slow_gps && | ||
| 1059 | !torture_must_stop()) | ||
| 1014 | for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) | 1060 | for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) |
| 1015 | if (list_empty(&rcu_tortures[i].rtort_free)) | 1061 | if (list_empty(&rcu_tortures[i].rtort_free) && |
| 1016 | WARN_ON_ONCE(1); | 1062 | rcu_access_pointer(rcu_torture_current) != |
| 1063 | &rcu_tortures[i]) { | ||
| 1064 | rcu_ftrace_dump(DUMP_ALL); | ||
| 1065 | WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); | ||
| 1066 | } | ||
| 1017 | } while (!torture_must_stop()); | 1067 | } while (!torture_must_stop()); |
| 1018 | /* Reset expediting back to unexpedited. */ | 1068 | /* Reset expediting back to unexpedited. */ |
| 1019 | if (expediting > 0) | 1069 | if (expediting > 0) |
| @@ -1358,8 +1408,9 @@ rcu_torture_stats_print(void) | |||
| 1358 | } | 1408 | } |
| 1359 | 1409 | ||
| 1360 | pr_alert("%s%s ", torture_type, TORTURE_FLAG); | 1410 | pr_alert("%s%s ", torture_type, TORTURE_FLAG); |
| 1361 | pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", | 1411 | pr_cont("rtc: %p %s: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", |
| 1362 | rcu_torture_current, | 1412 | rcu_torture_current, |
| 1413 | rcu_torture_current ? "ver" : "VER", | ||
| 1363 | rcu_torture_current_version, | 1414 | rcu_torture_current_version, |
| 1364 | list_empty(&rcu_torture_freelist), | 1415 | list_empty(&rcu_torture_freelist), |
| 1365 | atomic_read(&n_rcu_torture_alloc), | 1416 | atomic_read(&n_rcu_torture_alloc), |
| @@ -1661,6 +1712,17 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp) | |||
| 1661 | spin_unlock_irqrestore(&rcu_fwd_lock, flags); | 1712 | spin_unlock_irqrestore(&rcu_fwd_lock, flags); |
| 1662 | } | 1713 | } |
| 1663 | 1714 | ||
| 1715 | // Give the scheduler a chance, even on nohz_full CPUs. | ||
| 1716 | static void rcu_torture_fwd_prog_cond_resched(void) | ||
| 1717 | { | ||
| 1718 | if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) { | ||
| 1719 | if (need_resched()) | ||
| 1720 | schedule(); | ||
| 1721 | } else { | ||
| 1722 | cond_resched(); | ||
| 1723 | } | ||
| 1724 | } | ||
| 1725 | |||
| 1664 | /* | 1726 | /* |
| 1665 | * Free all callbacks on the rcu_fwd_cb_head list, either because the | 1727 | * Free all callbacks on the rcu_fwd_cb_head list, either because the |
| 1666 | * test is over or because we hit an OOM event. | 1728 | * test is over or because we hit an OOM event. |
| @@ -1674,16 +1736,18 @@ static unsigned long rcu_torture_fwd_prog_cbfree(void) | |||
| 1674 | for (;;) { | 1736 | for (;;) { |
| 1675 | spin_lock_irqsave(&rcu_fwd_lock, flags); | 1737 | spin_lock_irqsave(&rcu_fwd_lock, flags); |
| 1676 | rfcp = rcu_fwd_cb_head; | 1738 | rfcp = rcu_fwd_cb_head; |
| 1677 | if (!rfcp) | 1739 | if (!rfcp) { |
| 1740 | spin_unlock_irqrestore(&rcu_fwd_lock, flags); | ||
| 1678 | break; | 1741 | break; |
| 1742 | } | ||
| 1679 | rcu_fwd_cb_head = rfcp->rfc_next; | 1743 | rcu_fwd_cb_head = rfcp->rfc_next; |
| 1680 | if (!rcu_fwd_cb_head) | 1744 | if (!rcu_fwd_cb_head) |
| 1681 | rcu_fwd_cb_tail = &rcu_fwd_cb_head; | 1745 | rcu_fwd_cb_tail = &rcu_fwd_cb_head; |
| 1682 | spin_unlock_irqrestore(&rcu_fwd_lock, flags); | 1746 | spin_unlock_irqrestore(&rcu_fwd_lock, flags); |
| 1683 | kfree(rfcp); | 1747 | kfree(rfcp); |
| 1684 | freed++; | 1748 | freed++; |
| 1749 | rcu_torture_fwd_prog_cond_resched(); | ||
| 1685 | } | 1750 | } |
| 1686 | spin_unlock_irqrestore(&rcu_fwd_lock, flags); | ||
| 1687 | return freed; | 1751 | return freed; |
| 1688 | } | 1752 | } |
| 1689 | 1753 | ||
| @@ -1707,6 +1771,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) | |||
| 1707 | } | 1771 | } |
| 1708 | 1772 | ||
| 1709 | /* Tight loop containing cond_resched(). */ | 1773 | /* Tight loop containing cond_resched(). */ |
| 1774 | WRITE_ONCE(rcu_fwd_cb_nodelay, true); | ||
| 1775 | cur_ops->sync(); /* Later readers see above write. */ | ||
| 1710 | if (selfpropcb) { | 1776 | if (selfpropcb) { |
| 1711 | WRITE_ONCE(fcs.stop, 0); | 1777 | WRITE_ONCE(fcs.stop, 0); |
| 1712 | cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); | 1778 | cur_ops->call(&fcs.rh, rcu_torture_fwd_prog_cb); |
| @@ -1724,7 +1790,7 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) | |||
| 1724 | udelay(10); | 1790 | udelay(10); |
| 1725 | cur_ops->readunlock(idx); | 1791 | cur_ops->readunlock(idx); |
| 1726 | if (!fwd_progress_need_resched || need_resched()) | 1792 | if (!fwd_progress_need_resched || need_resched()) |
| 1727 | cond_resched(); | 1793 | rcu_torture_fwd_prog_cond_resched(); |
| 1728 | } | 1794 | } |
| 1729 | (*tested_tries)++; | 1795 | (*tested_tries)++; |
| 1730 | if (!time_before(jiffies, stopat) && | 1796 | if (!time_before(jiffies, stopat) && |
| @@ -1745,6 +1811,8 @@ static void rcu_torture_fwd_prog_nr(int *tested, int *tested_tries) | |||
| 1745 | WARN_ON(READ_ONCE(fcs.stop) != 2); | 1811 | WARN_ON(READ_ONCE(fcs.stop) != 2); |
| 1746 | destroy_rcu_head_on_stack(&fcs.rh); | 1812 | destroy_rcu_head_on_stack(&fcs.rh); |
| 1747 | } | 1813 | } |
| 1814 | schedule_timeout_uninterruptible(HZ / 10); /* Let kthreads recover. */ | ||
| 1815 | WRITE_ONCE(rcu_fwd_cb_nodelay, false); | ||
| 1748 | } | 1816 | } |
| 1749 | 1817 | ||
| 1750 | /* Carry out call_rcu() forward-progress testing. */ | 1818 | /* Carry out call_rcu() forward-progress testing. */ |
| @@ -1765,6 +1833,8 @@ static void rcu_torture_fwd_prog_cr(void) | |||
| 1765 | 1833 | ||
| 1766 | if (READ_ONCE(rcu_fwd_emergency_stop)) | 1834 | if (READ_ONCE(rcu_fwd_emergency_stop)) |
| 1767 | return; /* Get out of the way quickly, no GP wait! */ | 1835 | return; /* Get out of the way quickly, no GP wait! */ |
| 1836 | if (!cur_ops->call) | ||
| 1837 | return; /* Can't do call_rcu() fwd prog without ->call. */ | ||
| 1768 | 1838 | ||
| 1769 | /* Loop continuously posting RCU callbacks. */ | 1839 | /* Loop continuously posting RCU callbacks. */ |
| 1770 | WRITE_ONCE(rcu_fwd_cb_nodelay, true); | 1840 | WRITE_ONCE(rcu_fwd_cb_nodelay, true); |
| @@ -1805,7 +1875,7 @@ static void rcu_torture_fwd_prog_cr(void) | |||
| 1805 | rfcp->rfc_gps = 0; | 1875 | rfcp->rfc_gps = 0; |
| 1806 | } | 1876 | } |
| 1807 | cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); | 1877 | cur_ops->call(&rfcp->rh, rcu_torture_fwd_cb_cr); |
| 1808 | cond_resched(); | 1878 | rcu_torture_fwd_prog_cond_resched(); |
| 1809 | } | 1879 | } |
| 1810 | stoppedat = jiffies; | 1880 | stoppedat = jiffies; |
| 1811 | n_launders_cb_snap = READ_ONCE(n_launders_cb); | 1881 | n_launders_cb_snap = READ_ONCE(n_launders_cb); |
| @@ -1814,7 +1884,6 @@ static void rcu_torture_fwd_prog_cr(void) | |||
| 1814 | cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ | 1884 | cur_ops->cb_barrier(); /* Wait for callbacks to be invoked. */ |
| 1815 | (void)rcu_torture_fwd_prog_cbfree(); | 1885 | (void)rcu_torture_fwd_prog_cbfree(); |
| 1816 | 1886 | ||
| 1817 | WRITE_ONCE(rcu_fwd_cb_nodelay, false); | ||
| 1818 | if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { | 1887 | if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop)) { |
| 1819 | WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); | 1888 | WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); |
| 1820 | pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", | 1889 | pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", |
| @@ -1825,6 +1894,8 @@ static void rcu_torture_fwd_prog_cr(void) | |||
| 1825 | n_max_gps, n_max_cbs, cver, gps); | 1894 | n_max_gps, n_max_cbs, cver, gps); |
| 1826 | rcu_torture_fwd_cb_hist(); | 1895 | rcu_torture_fwd_cb_hist(); |
| 1827 | } | 1896 | } |
| 1897 | schedule_timeout_uninterruptible(HZ); /* Let CBs drain. */ | ||
| 1898 | WRITE_ONCE(rcu_fwd_cb_nodelay, false); | ||
| 1828 | } | 1899 | } |
| 1829 | 1900 | ||
| 1830 | 1901 | ||
| @@ -2240,7 +2311,7 @@ rcu_torture_init(void) | |||
| 2240 | int firsterr = 0; | 2311 | int firsterr = 0; |
| 2241 | static struct rcu_torture_ops *torture_ops[] = { | 2312 | static struct rcu_torture_ops *torture_ops[] = { |
| 2242 | &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, | 2313 | &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, |
| 2243 | &busted_srcud_ops, &tasks_ops, | 2314 | &busted_srcud_ops, &tasks_ops, &trivial_ops, |
| 2244 | }; | 2315 | }; |
| 2245 | 2316 | ||
| 2246 | if (!torture_init_begin(torture_type, verbose)) | 2317 | if (!torture_init_begin(torture_type, verbose)) |
| @@ -2363,7 +2434,10 @@ rcu_torture_init(void) | |||
| 2363 | if (stutter < 0) | 2434 | if (stutter < 0) |
| 2364 | stutter = 0; | 2435 | stutter = 0; |
| 2365 | if (stutter) { | 2436 | if (stutter) { |
| 2366 | firsterr = torture_stutter_init(stutter * HZ); | 2437 | int t; |
| 2438 | |||
| 2439 | t = cur_ops->stall_dur ? cur_ops->stall_dur() : stutter * HZ; | ||
| 2440 | firsterr = torture_stutter_init(stutter * HZ, t); | ||
| 2367 | if (firsterr) | 2441 | if (firsterr) |
| 2368 | goto unwind; | 2442 | goto unwind; |
| 2369 | } | 2443 | } |
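
The new rcu_torture_fwd_prog_cond_resched() helper above exists because cond_resched() is a no-op in CONFIG_PREEMPT=y kernels, so a tight forward-progress loop pinned to a nohz_full CPU could otherwise starve the scheduler. A minimal sketch of the same pattern in a generic flood loop is shown below; the demo_* names and the loop body are invented for illustration and are not part of the patch.

    #include <linux/jiffies.h>
    #include <linux/kernel.h>
    #include <linux/sched.h>

    /* Same idea as rcu_torture_fwd_prog_cond_resched() above. */
    static void demo_cond_resched(void)
    {
        if (IS_ENABLED(CONFIG_PREEMPT) && IS_ENABLED(CONFIG_NO_HZ_FULL)) {
            /* cond_resched() does nothing here, so poll need_resched(). */
            if (need_resched())
                schedule();
        } else {
            cond_resched();
        }
    }

    /* Hypothetical tight loop that must not starve the scheduler. */
    static void demo_flood(unsigned long stopat, void (*post_one)(void))
    {
        while (time_before(jiffies, stopat)) {
            post_one();
            demo_cond_resched();
        }
    }
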
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9b761e546de8..cf0e886314f2 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c | |||
| @@ -831,8 +831,8 @@ static void srcu_leak_callback(struct rcu_head *rhp) | |||
| 831 | * srcu_read_lock(), and srcu_read_unlock() that are all passed the same | 831 | * srcu_read_lock(), and srcu_read_unlock() that are all passed the same |
| 832 | * srcu_struct structure. | 832 | * srcu_struct structure. |
| 833 | */ | 833 | */ |
| 834 | void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, | 834 | static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, |
| 835 | rcu_callback_t func, bool do_norm) | 835 | rcu_callback_t func, bool do_norm) |
| 836 | { | 836 | { |
| 837 | unsigned long flags; | 837 | unsigned long flags; |
| 838 | int idx; | 838 | int idx; |
| @@ -1310,3 +1310,68 @@ void __init srcu_init(void) | |||
| 1310 | queue_work(rcu_gp_wq, &ssp->work.work); | 1310 | queue_work(rcu_gp_wq, &ssp->work.work); |
| 1311 | } | 1311 | } |
| 1312 | } | 1312 | } |
| 1313 | |||
| 1314 | #ifdef CONFIG_MODULES | ||
| 1315 | |||
| 1316 | /* Initialize any global-scope srcu_struct structures used by this module. */ | ||
| 1317 | static int srcu_module_coming(struct module *mod) | ||
| 1318 | { | ||
| 1319 | int i; | ||
| 1320 | struct srcu_struct **sspp = mod->srcu_struct_ptrs; | ||
| 1321 | int ret; | ||
| 1322 | |||
| 1323 | for (i = 0; i < mod->num_srcu_structs; i++) { | ||
| 1324 | ret = init_srcu_struct(*(sspp++)); | ||
| 1325 | if (WARN_ON_ONCE(ret)) | ||
| 1326 | return ret; | ||
| 1327 | } | ||
| 1328 | return 0; | ||
| 1329 | } | ||
| 1330 | |||
| 1331 | /* Clean up any global-scope srcu_struct structures used by this module. */ | ||
| 1332 | static void srcu_module_going(struct module *mod) | ||
| 1333 | { | ||
| 1334 | int i; | ||
| 1335 | struct srcu_struct **sspp = mod->srcu_struct_ptrs; | ||
| 1336 | |||
| 1337 | for (i = 0; i < mod->num_srcu_structs; i++) | ||
| 1338 | cleanup_srcu_struct(*(sspp++)); | ||
| 1339 | } | ||
| 1340 | |||
| 1341 | /* Handle one module, either coming or going. */ | ||
| 1342 | static int srcu_module_notify(struct notifier_block *self, | ||
| 1343 | unsigned long val, void *data) | ||
| 1344 | { | ||
| 1345 | struct module *mod = data; | ||
| 1346 | int ret = 0; | ||
| 1347 | |||
| 1348 | switch (val) { | ||
| 1349 | case MODULE_STATE_COMING: | ||
| 1350 | ret = srcu_module_coming(mod); | ||
| 1351 | break; | ||
| 1352 | case MODULE_STATE_GOING: | ||
| 1353 | srcu_module_going(mod); | ||
| 1354 | break; | ||
| 1355 | default: | ||
| 1356 | break; | ||
| 1357 | } | ||
| 1358 | return ret; | ||
| 1359 | } | ||
| 1360 | |||
| 1361 | static struct notifier_block srcu_module_nb = { | ||
| 1362 | .notifier_call = srcu_module_notify, | ||
| 1363 | .priority = 0, | ||
| 1364 | }; | ||
| 1365 | |||
| 1366 | static __init int init_srcu_module_notifier(void) | ||
| 1367 | { | ||
| 1368 | int ret; | ||
| 1369 | |||
| 1370 | ret = register_module_notifier(&srcu_module_nb); | ||
| 1371 | if (ret) | ||
| 1372 | pr_warn("Failed to register srcu module notifier\n"); | ||
| 1373 | return ret; | ||
| 1374 | } | ||
| 1375 | late_initcall(init_srcu_module_notifier); | ||
| 1376 | |||
| 1377 | #endif /* #ifdef CONFIG_MODULES */ | ||
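
The notifier above initializes and cleans up any srcu_struct defined at global scope in a module, using the mod->srcu_struct_ptrs table filled in by the companion DEFINE_SRCU()/DEFINE_STATIC_SRCU() change. Assuming that companion change is in place, a module can then use a statically defined srcu_struct without calling init_srcu_struct() or cleanup_srcu_struct() itself; the sketch below uses invented demo_* names.

    #include <linux/module.h>
    #include <linux/srcu.h>

    /* Recorded in this module's srcu_struct_ptrs table, so
     * srcu_module_coming() initializes it before demo_init() runs. */
    DEFINE_STATIC_SRCU(demo_srcu);

    static int __init demo_init(void)
    {
        int idx;

        idx = srcu_read_lock(&demo_srcu);
        /* ... read-side critical section ... */
        srcu_read_unlock(&demo_srcu, idx);
        return 0;
    }

    static void __exit demo_exit(void)
    {
        /* Wait for readers; srcu_module_going() invokes
         * cleanup_srcu_struct() after this handler returns. */
        synchronize_srcu(&demo_srcu);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
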
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c index a8304d90573f..d4558ab7a07d 100644 --- a/kernel/rcu/sync.c +++ b/kernel/rcu/sync.c | |||
| @@ -10,65 +10,18 @@ | |||
| 10 | #include <linux/rcu_sync.h> | 10 | #include <linux/rcu_sync.h> |
| 11 | #include <linux/sched.h> | 11 | #include <linux/sched.h> |
| 12 | 12 | ||
| 13 | #ifdef CONFIG_PROVE_RCU | 13 | enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY }; |
| 14 | #define __INIT_HELD(func) .held = func, | ||
| 15 | #else | ||
| 16 | #define __INIT_HELD(func) | ||
| 17 | #endif | ||
| 18 | |||
| 19 | static const struct { | ||
| 20 | void (*sync)(void); | ||
| 21 | void (*call)(struct rcu_head *, void (*)(struct rcu_head *)); | ||
| 22 | void (*wait)(void); | ||
| 23 | #ifdef CONFIG_PROVE_RCU | ||
| 24 | int (*held)(void); | ||
| 25 | #endif | ||
| 26 | } gp_ops[] = { | ||
| 27 | [RCU_SYNC] = { | ||
| 28 | .sync = synchronize_rcu, | ||
| 29 | .call = call_rcu, | ||
| 30 | .wait = rcu_barrier, | ||
| 31 | __INIT_HELD(rcu_read_lock_held) | ||
| 32 | }, | ||
| 33 | [RCU_SCHED_SYNC] = { | ||
| 34 | .sync = synchronize_rcu, | ||
| 35 | .call = call_rcu, | ||
| 36 | .wait = rcu_barrier, | ||
| 37 | __INIT_HELD(rcu_read_lock_sched_held) | ||
| 38 | }, | ||
| 39 | [RCU_BH_SYNC] = { | ||
| 40 | .sync = synchronize_rcu, | ||
| 41 | .call = call_rcu, | ||
| 42 | .wait = rcu_barrier, | ||
| 43 | __INIT_HELD(rcu_read_lock_bh_held) | ||
| 44 | }, | ||
| 45 | }; | ||
| 46 | |||
| 47 | enum { GP_IDLE = 0, GP_PENDING, GP_PASSED }; | ||
| 48 | enum { CB_IDLE = 0, CB_PENDING, CB_REPLAY }; | ||
| 49 | 14 | ||
| 50 | #define rss_lock gp_wait.lock | 15 | #define rss_lock gp_wait.lock |
| 51 | 16 | ||
| 52 | #ifdef CONFIG_PROVE_RCU | ||
| 53 | void rcu_sync_lockdep_assert(struct rcu_sync *rsp) | ||
| 54 | { | ||
| 55 | RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(), | ||
| 56 | "suspicious rcu_sync_is_idle() usage"); | ||
| 57 | } | ||
| 58 | |||
| 59 | EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert); | ||
| 60 | #endif | ||
| 61 | |||
| 62 | /** | 17 | /** |
| 63 | * rcu_sync_init() - Initialize an rcu_sync structure | 18 | * rcu_sync_init() - Initialize an rcu_sync structure |
| 64 | * @rsp: Pointer to rcu_sync structure to be initialized | 19 | * @rsp: Pointer to rcu_sync structure to be initialized |
| 65 | * @type: Flavor of RCU with which to synchronize rcu_sync structure | ||
| 66 | */ | 20 | */ |
| 67 | void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type) | 21 | void rcu_sync_init(struct rcu_sync *rsp) |
| 68 | { | 22 | { |
| 69 | memset(rsp, 0, sizeof(*rsp)); | 23 | memset(rsp, 0, sizeof(*rsp)); |
| 70 | init_waitqueue_head(&rsp->gp_wait); | 24 | init_waitqueue_head(&rsp->gp_wait); |
| 71 | rsp->gp_type = type; | ||
| 72 | } | 25 | } |
| 73 | 26 | ||
| 74 | /** | 27 | /** |
| @@ -86,56 +39,26 @@ void rcu_sync_enter_start(struct rcu_sync *rsp) | |||
| 86 | rsp->gp_state = GP_PASSED; | 39 | rsp->gp_state = GP_PASSED; |
| 87 | } | 40 | } |
| 88 | 41 | ||
| 89 | /** | ||
| 90 | * rcu_sync_enter() - Force readers onto slowpath | ||
| 91 | * @rsp: Pointer to rcu_sync structure to use for synchronization | ||
| 92 | * | ||
| 93 | * This function is used by updaters who need readers to make use of | ||
| 94 | * a slowpath during the update. After this function returns, all | ||
| 95 | * subsequent calls to rcu_sync_is_idle() will return false, which | ||
| 96 | * tells readers to stay off their fastpaths. A later call to | ||
| 97 | * rcu_sync_exit() re-enables reader slowpaths. | ||
| 98 | * | ||
| 99 | * When called in isolation, rcu_sync_enter() must wait for a grace | ||
| 100 | * period, however, closely spaced calls to rcu_sync_enter() can | ||
| 101 | * optimize away the grace-period wait via a state machine implemented | ||
| 102 | * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). | ||
| 103 | */ | ||
| 104 | void rcu_sync_enter(struct rcu_sync *rsp) | ||
| 105 | { | ||
| 106 | bool need_wait, need_sync; | ||
| 107 | 42 | ||
| 108 | spin_lock_irq(&rsp->rss_lock); | 43 | static void rcu_sync_func(struct rcu_head *rhp); |
| 109 | need_wait = rsp->gp_count++; | ||
| 110 | need_sync = rsp->gp_state == GP_IDLE; | ||
| 111 | if (need_sync) | ||
| 112 | rsp->gp_state = GP_PENDING; | ||
| 113 | spin_unlock_irq(&rsp->rss_lock); | ||
| 114 | 44 | ||
| 115 | WARN_ON_ONCE(need_wait && need_sync); | 45 | static void rcu_sync_call(struct rcu_sync *rsp) |
| 116 | if (need_sync) { | 46 | { |
| 117 | gp_ops[rsp->gp_type].sync(); | 47 | call_rcu(&rsp->cb_head, rcu_sync_func); |
| 118 | rsp->gp_state = GP_PASSED; | ||
| 119 | wake_up_all(&rsp->gp_wait); | ||
| 120 | } else if (need_wait) { | ||
| 121 | wait_event(rsp->gp_wait, rsp->gp_state == GP_PASSED); | ||
| 122 | } else { | ||
| 123 | /* | ||
| 124 | * Possible when there's a pending CB from a rcu_sync_exit(). | ||
| 125 | * Nobody has yet been allowed the 'fast' path and thus we can | ||
| 126 | * avoid doing any sync(). The callback will get 'dropped'. | ||
| 127 | */ | ||
| 128 | WARN_ON_ONCE(rsp->gp_state != GP_PASSED); | ||
| 129 | } | ||
| 130 | } | 48 | } |
| 131 | 49 | ||
| 132 | /** | 50 | /** |
| 133 | * rcu_sync_func() - Callback function managing reader access to fastpath | 51 | * rcu_sync_func() - Callback function managing reader access to fastpath |
| 134 | * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization | 52 | * @rhp: Pointer to rcu_head in rcu_sync structure to use for synchronization |
| 135 | * | 53 | * |
| 136 | * This function is passed to one of the call_rcu() functions by | 54 | * This function is passed to call_rcu() by rcu_sync_enter() and |
| 137 | * rcu_sync_exit(), so that it is invoked after a grace period following | 55 | * rcu_sync_exit(), so that it is invoked after a grace period following |
| 138 | * that invocation of rcu_sync_exit(). It takes action based on events that | 56 | * that invocation of enter/exit. |
| 57 | * | ||
| 58 | * If it is called by rcu_sync_enter(), it signals that all the readers were | ||
| 59 | * switched onto the slow path. | ||
| 60 | * | ||
| 61 | * If it is called by rcu_sync_exit() it takes action based on events that | ||
| 139 | * have taken place in the meantime, so that closely spaced rcu_sync_enter() | 62 | * have taken place in the meantime, so that closely spaced rcu_sync_enter() |
| 140 | * and rcu_sync_exit() pairs need not wait for a grace period. | 63 | * and rcu_sync_exit() pairs need not wait for a grace period. |
| 141 | * | 64 | * |
| @@ -152,35 +75,88 @@ static void rcu_sync_func(struct rcu_head *rhp) | |||
| 152 | struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); | 75 | struct rcu_sync *rsp = container_of(rhp, struct rcu_sync, cb_head); |
| 153 | unsigned long flags; | 76 | unsigned long flags; |
| 154 | 77 | ||
| 155 | WARN_ON_ONCE(rsp->gp_state != GP_PASSED); | 78 | WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); |
| 156 | WARN_ON_ONCE(rsp->cb_state == CB_IDLE); | 79 | WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); |
| 157 | 80 | ||
| 158 | spin_lock_irqsave(&rsp->rss_lock, flags); | 81 | spin_lock_irqsave(&rsp->rss_lock, flags); |
| 159 | if (rsp->gp_count) { | 82 | if (rsp->gp_count) { |
| 160 | /* | 83 | /* |
| 161 | * A new rcu_sync_begin() has happened; drop the callback. | 84 | * We're at least a GP after the GP_IDLE->GP_ENTER transition. |
| 162 | */ | 85 | */ |
| 163 | rsp->cb_state = CB_IDLE; | 86 | WRITE_ONCE(rsp->gp_state, GP_PASSED); |
| 164 | } else if (rsp->cb_state == CB_REPLAY) { | 87 | wake_up_locked(&rsp->gp_wait); |
| 88 | } else if (rsp->gp_state == GP_REPLAY) { | ||
| 165 | /* | 89 | /* |
| 166 | * A new rcu_sync_exit() has happened; requeue the callback | 90 | * A new rcu_sync_exit() has happened; requeue the callback to |
| 167 | * to catch a later GP. | 91 | * catch a later GP. |
| 168 | */ | 92 | */ |
| 169 | rsp->cb_state = CB_PENDING; | 93 | WRITE_ONCE(rsp->gp_state, GP_EXIT); |
| 170 | gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); | 94 | rcu_sync_call(rsp); |
| 171 | } else { | 95 | } else { |
| 172 | /* | 96 | /* |
| 173 | * We're at least a GP after rcu_sync_exit(); everybody will now | 97 | * We're at least a GP after the last rcu_sync_exit(); everybody |
| 174 | * have observed the write side critical section. Let 'em rip! | 98 | * will now have observed the write side critical section. |
| 99 | * Let 'em rip! | ||
| 175 | */ | 100 | */ |
| 176 | rsp->cb_state = CB_IDLE; | 101 | WRITE_ONCE(rsp->gp_state, GP_IDLE); |
| 177 | rsp->gp_state = GP_IDLE; | ||
| 178 | } | 102 | } |
| 179 | spin_unlock_irqrestore(&rsp->rss_lock, flags); | 103 | spin_unlock_irqrestore(&rsp->rss_lock, flags); |
| 180 | } | 104 | } |
| 181 | 105 | ||
| 182 | /** | 106 | /** |
| 183 | * rcu_sync_exit() - Allow readers back onto fast patch after grace period | 107 | * rcu_sync_enter() - Force readers onto slowpath |
| 108 | * @rsp: Pointer to rcu_sync structure to use for synchronization | ||
| 109 | * | ||
| 110 | * This function is used by updaters who need readers to make use of | ||
| 111 | * a slowpath during the update. After this function returns, all | ||
| 112 | * subsequent calls to rcu_sync_is_idle() will return false, which | ||
| 113 | * tells readers to stay off their fastpaths. A later call to | ||
| 114 | * rcu_sync_exit() re-enables reader slowpaths. | ||
| 115 | * | ||
| 116 | * When called in isolation, rcu_sync_enter() must wait for a grace | ||
| 117 | * period, however, closely spaced calls to rcu_sync_enter() can | ||
| 118 | * optimize away the grace-period wait via a state machine implemented | ||
| 119 | * by rcu_sync_enter(), rcu_sync_exit(), and rcu_sync_func(). | ||
| 120 | */ | ||
| 121 | void rcu_sync_enter(struct rcu_sync *rsp) | ||
| 122 | { | ||
| 123 | int gp_state; | ||
| 124 | |||
| 125 | spin_lock_irq(&rsp->rss_lock); | ||
| 126 | gp_state = rsp->gp_state; | ||
| 127 | if (gp_state == GP_IDLE) { | ||
| 128 | WRITE_ONCE(rsp->gp_state, GP_ENTER); | ||
| 129 | WARN_ON_ONCE(rsp->gp_count); | ||
| 130 | /* | ||
| 131 | * Note that we could simply do rcu_sync_call(rsp) here and | ||
| 132 | * avoid the "if (gp_state == GP_IDLE)" block below. | ||
| 133 | * | ||
| 134 | * However, synchronize_rcu() can be faster if rcu_expedited | ||
| 135 | * or rcu_blocking_is_gp() is true. | ||
| 136 | * | ||
| 137 | * Another reason is that we can't wait for an RCU callback if | ||
| 138 | * we are called at early boot time, but this shouldn't happen. | ||
| 139 | */ | ||
| 140 | } | ||
| 141 | rsp->gp_count++; | ||
| 142 | spin_unlock_irq(&rsp->rss_lock); | ||
| 143 | |||
| 144 | if (gp_state == GP_IDLE) { | ||
| 145 | /* | ||
| 146 | * See the comment above; this simply does the "synchronous" | ||
| 147 | * call_rcu(rcu_sync_func) which does GP_ENTER -> GP_PASSED. | ||
| 148 | */ | ||
| 149 | synchronize_rcu(); | ||
| 150 | rcu_sync_func(&rsp->cb_head); | ||
| 151 | /* Not really needed, wait_event() would see GP_PASSED. */ | ||
| 152 | return; | ||
| 153 | } | ||
| 154 | |||
| 155 | wait_event(rsp->gp_wait, READ_ONCE(rsp->gp_state) >= GP_PASSED); | ||
| 156 | } | ||
| 157 | |||
| 158 | /** | ||
| 159 | * rcu_sync_exit() - Allow readers back onto fast path after grace period | ||
| 184 | * @rsp: Pointer to rcu_sync structure to use for synchronization | 160 | * @rsp: Pointer to rcu_sync structure to use for synchronization |
| 185 | * | 161 | * |
| 186 | * This function is used by updaters who have completed, and can therefore | 162 | * This function is used by updaters who have completed, and can therefore |
| @@ -191,13 +167,16 @@ static void rcu_sync_func(struct rcu_head *rhp) | |||
| 191 | */ | 167 | */ |
| 192 | void rcu_sync_exit(struct rcu_sync *rsp) | 168 | void rcu_sync_exit(struct rcu_sync *rsp) |
| 193 | { | 169 | { |
| 170 | WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE); | ||
| 171 | WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0); | ||
| 172 | |||
| 194 | spin_lock_irq(&rsp->rss_lock); | 173 | spin_lock_irq(&rsp->rss_lock); |
| 195 | if (!--rsp->gp_count) { | 174 | if (!--rsp->gp_count) { |
| 196 | if (rsp->cb_state == CB_IDLE) { | 175 | if (rsp->gp_state == GP_PASSED) { |
| 197 | rsp->cb_state = CB_PENDING; | 176 | WRITE_ONCE(rsp->gp_state, GP_EXIT); |
| 198 | gp_ops[rsp->gp_type].call(&rsp->cb_head, rcu_sync_func); | 177 | rcu_sync_call(rsp); |
| 199 | } else if (rsp->cb_state == CB_PENDING) { | 178 | } else if (rsp->gp_state == GP_EXIT) { |
| 200 | rsp->cb_state = CB_REPLAY; | 179 | WRITE_ONCE(rsp->gp_state, GP_REPLAY); |
| 201 | } | 180 | } |
| 202 | } | 181 | } |
| 203 | spin_unlock_irq(&rsp->rss_lock); | 182 | spin_unlock_irq(&rsp->rss_lock); |
| @@ -209,18 +188,19 @@ void rcu_sync_exit(struct rcu_sync *rsp) | |||
| 209 | */ | 188 | */ |
| 210 | void rcu_sync_dtor(struct rcu_sync *rsp) | 189 | void rcu_sync_dtor(struct rcu_sync *rsp) |
| 211 | { | 190 | { |
| 212 | int cb_state; | 191 | int gp_state; |
| 213 | 192 | ||
| 214 | WARN_ON_ONCE(rsp->gp_count); | 193 | WARN_ON_ONCE(READ_ONCE(rsp->gp_count)); |
| 194 | WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED); | ||
| 215 | 195 | ||
| 216 | spin_lock_irq(&rsp->rss_lock); | 196 | spin_lock_irq(&rsp->rss_lock); |
| 217 | if (rsp->cb_state == CB_REPLAY) | 197 | if (rsp->gp_state == GP_REPLAY) |
| 218 | rsp->cb_state = CB_PENDING; | 198 | WRITE_ONCE(rsp->gp_state, GP_EXIT); |
| 219 | cb_state = rsp->cb_state; | 199 | gp_state = rsp->gp_state; |
| 220 | spin_unlock_irq(&rsp->rss_lock); | 200 | spin_unlock_irq(&rsp->rss_lock); |
| 221 | 201 | ||
| 222 | if (cb_state != CB_IDLE) { | 202 | if (gp_state != GP_IDLE) { |
| 223 | gp_ops[rsp->gp_type].wait(); | 203 | rcu_barrier(); |
| 224 | WARN_ON_ONCE(rsp->cb_state != CB_IDLE); | 204 | WARN_ON_ONCE(rsp->gp_state != GP_IDLE); |
| 225 | } | 205 | } |
| 226 | } | 206 | } |
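
With this rewrite, rcu_sync always uses vanilla RCU and the only externally visible API change is that rcu_sync_init() loses its rcu_sync_type argument. A minimal usage sketch in the percpu-rwsem style follows; the demo_* names and the slow-path spinlock are illustrative assumptions, not part of the patch.

    #include <linux/rcupdate.h>
    #include <linux/rcu_sync.h>
    #include <linux/spinlock.h>

    static struct rcu_sync demo_rss;
    static DEFINE_SPINLOCK(demo_slow_lock);

    static void demo_setup(void)
    {
        rcu_sync_init(&demo_rss);    /* No flavor argument any more. */
    }

    /* Reader: stays on the fast path while the rcu_sync is GP_IDLE. */
    static void demo_read(void)
    {
        rcu_read_lock();
        if (rcu_sync_is_idle(&demo_rss)) {
            /* fast path: no updater active */;
        } else {
            spin_lock(&demo_slow_lock);
            /* slow path shared with the updater */;
            spin_unlock(&demo_slow_lock);
        }
        rcu_read_unlock();
    }

    /*
     * Updater: rcu_sync_enter() returns only after every reader that
     * might have seen the fast path has left its RCU read-side critical
     * section; later readers observe the slow path.
     */
    static void demo_update(void)
    {
        rcu_sync_enter(&demo_rss);   /* GP_IDLE -> GP_ENTER -> GP_PASSED */
        spin_lock(&demo_slow_lock);
        /* ... update ... */
        spin_unlock(&demo_slow_lock);
        rcu_sync_exit(&demo_rss);    /* GP_PASSED -> GP_EXIT [-> GP_REPLAY] -> GP_IDLE */
    }
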
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b4d88a594785..a14e5fbbea46 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
| @@ -51,6 +51,12 @@ | |||
| 51 | #include <linux/tick.h> | 51 | #include <linux/tick.h> |
| 52 | #include <linux/sysrq.h> | 52 | #include <linux/sysrq.h> |
| 53 | #include <linux/kprobes.h> | 53 | #include <linux/kprobes.h> |
| 54 | #include <linux/gfp.h> | ||
| 55 | #include <linux/oom.h> | ||
| 56 | #include <linux/smpboot.h> | ||
| 57 | #include <linux/jiffies.h> | ||
| 58 | #include <linux/sched/isolation.h> | ||
| 59 | #include "../time/tick-internal.h" | ||
| 54 | 60 | ||
| 55 | #include "tree.h" | 61 | #include "tree.h" |
| 56 | #include "rcu.h" | 62 | #include "rcu.h" |
| @@ -92,6 +98,9 @@ struct rcu_state rcu_state = { | |||
| 92 | /* Dump rcu_node combining tree at boot to verify correct setup. */ | 98 | /* Dump rcu_node combining tree at boot to verify correct setup. */ |
| 93 | static bool dump_tree; | 99 | static bool dump_tree; |
| 94 | module_param(dump_tree, bool, 0444); | 100 | module_param(dump_tree, bool, 0444); |
| 101 | /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ | ||
| 102 | static bool use_softirq = 1; | ||
| 103 | module_param(use_softirq, bool, 0444); | ||
| 95 | /* Control rcu_node-tree auto-balancing at boot time. */ | 104 | /* Control rcu_node-tree auto-balancing at boot time. */ |
| 96 | static bool rcu_fanout_exact; | 105 | static bool rcu_fanout_exact; |
| 97 | module_param(rcu_fanout_exact, bool, 0444); | 106 | module_param(rcu_fanout_exact, bool, 0444); |
| @@ -138,7 +147,6 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); | |||
| 138 | static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); | 147 | static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); |
| 139 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | 148 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); |
| 140 | static void invoke_rcu_core(void); | 149 | static void invoke_rcu_core(void); |
| 141 | static void invoke_rcu_callbacks(struct rcu_data *rdp); | ||
| 142 | static void rcu_report_exp_rdp(struct rcu_data *rdp); | 150 | static void rcu_report_exp_rdp(struct rcu_data *rdp); |
| 143 | static void sync_sched_exp_online_cleanup(int cpu); | 151 | static void sync_sched_exp_online_cleanup(int cpu); |
| 144 | 152 | ||
| @@ -368,19 +376,33 @@ static void __maybe_unused rcu_momentary_dyntick_idle(void) | |||
| 368 | } | 376 | } |
| 369 | 377 | ||
| 370 | /** | 378 | /** |
| 371 | * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle | 379 | * rcu_is_cpu_rrupt_from_idle - see if interrupted from idle |
| 372 | * | 380 | * |
| 373 | * If the current CPU is idle or running at a first-level (not nested) | 381 | * If the current CPU is idle and running at a first-level (not nested) |
| 374 | * interrupt from idle, return true. The caller must have at least | 382 | * interrupt from idle, return true. The caller must have at least |
| 375 | * disabled preemption. | 383 | * disabled preemption. |
| 376 | */ | 384 | */ |
| 377 | static int rcu_is_cpu_rrupt_from_idle(void) | 385 | static int rcu_is_cpu_rrupt_from_idle(void) |
| 378 | { | 386 | { |
| 379 | return __this_cpu_read(rcu_data.dynticks_nesting) <= 0 && | 387 | /* Called only from within the scheduling-clock interrupt */ |
| 380 | __this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 1; | 388 | lockdep_assert_in_irq(); |
| 389 | |||
| 390 | /* Check for counter underflows */ | ||
| 391 | RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nesting) < 0, | ||
| 392 | "RCU dynticks_nesting counter underflow!"); | ||
| 393 | RCU_LOCKDEP_WARN(__this_cpu_read(rcu_data.dynticks_nmi_nesting) <= 0, | ||
| 394 | "RCU dynticks_nmi_nesting counter underflow/zero!"); | ||
| 395 | |||
| 396 | /* Are we at first interrupt nesting level? */ | ||
| 397 | if (__this_cpu_read(rcu_data.dynticks_nmi_nesting) != 1) | ||
| 398 | return false; | ||
| 399 | |||
| 400 | /* Does CPU appear to be idle from an RCU standpoint? */ | ||
| 401 | return __this_cpu_read(rcu_data.dynticks_nesting) == 0; | ||
| 381 | } | 402 | } |
| 382 | 403 | ||
| 383 | #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch. */ | 404 | #define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ |
| 405 | #define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ | ||
| 384 | static long blimit = DEFAULT_RCU_BLIMIT; | 406 | static long blimit = DEFAULT_RCU_BLIMIT; |
| 385 | #define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ | 407 | #define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ |
| 386 | static long qhimark = DEFAULT_RCU_QHIMARK; | 408 | static long qhimark = DEFAULT_RCU_QHIMARK; |
| @@ -1969,14 +1991,14 @@ rcu_check_quiescent_state(struct rcu_data *rdp) | |||
| 1969 | */ | 1991 | */ |
| 1970 | int rcutree_dying_cpu(unsigned int cpu) | 1992 | int rcutree_dying_cpu(unsigned int cpu) |
| 1971 | { | 1993 | { |
| 1972 | RCU_TRACE(bool blkd;) | 1994 | bool blkd; |
| 1973 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(&rcu_data);) | 1995 | struct rcu_data *rdp = this_cpu_ptr(&rcu_data); |
| 1974 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) | 1996 | struct rcu_node *rnp = rdp->mynode; |
| 1975 | 1997 | ||
| 1976 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) | 1998 | if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) |
| 1977 | return 0; | 1999 | return 0; |
| 1978 | 2000 | ||
| 1979 | RCU_TRACE(blkd = !!(rnp->qsmask & rdp->grpmask);) | 2001 | blkd = !!(rnp->qsmask & rdp->grpmask); |
| 1980 | trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, | 2002 | trace_rcu_grace_period(rcu_state.name, rnp->gp_seq, |
| 1981 | blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); | 2003 | blkd ? TPS("cpuofl") : TPS("cpuofl-bgp")); |
| 1982 | return 0; | 2004 | return 0; |
| @@ -2113,7 +2135,7 @@ static void rcu_do_batch(struct rcu_data *rdp) | |||
| 2113 | 2135 | ||
| 2114 | /* Reinstate batch limit if we have worked down the excess. */ | 2136 | /* Reinstate batch limit if we have worked down the excess. */ |
| 2115 | count = rcu_segcblist_n_cbs(&rdp->cblist); | 2137 | count = rcu_segcblist_n_cbs(&rdp->cblist); |
| 2116 | if (rdp->blimit == LONG_MAX && count <= qlowmark) | 2138 | if (rdp->blimit >= DEFAULT_MAX_RCU_BLIMIT && count <= qlowmark) |
| 2117 | rdp->blimit = blimit; | 2139 | rdp->blimit = blimit; |
| 2118 | 2140 | ||
| 2119 | /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ | 2141 | /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ |
| @@ -2253,7 +2275,7 @@ void rcu_force_quiescent_state(void) | |||
| 2253 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 2275 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
| 2254 | 2276 | ||
| 2255 | /* Perform RCU core processing work for the current CPU. */ | 2277 | /* Perform RCU core processing work for the current CPU. */ |
| 2256 | static __latent_entropy void rcu_core(struct softirq_action *unused) | 2278 | static __latent_entropy void rcu_core(void) |
| 2257 | { | 2279 | { |
| 2258 | unsigned long flags; | 2280 | unsigned long flags; |
| 2259 | struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); | 2281 | struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); |
| @@ -2287,37 +2309,126 @@ static __latent_entropy void rcu_core(struct softirq_action *unused) | |||
| 2287 | rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); | 2309 | rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); |
| 2288 | 2310 | ||
| 2289 | /* If there are callbacks ready, invoke them. */ | 2311 | /* If there are callbacks ready, invoke them. */ |
| 2290 | if (rcu_segcblist_ready_cbs(&rdp->cblist)) | 2312 | if (rcu_segcblist_ready_cbs(&rdp->cblist) && |
| 2291 | invoke_rcu_callbacks(rdp); | 2313 | likely(READ_ONCE(rcu_scheduler_fully_active))) |
| 2314 | rcu_do_batch(rdp); | ||
| 2292 | 2315 | ||
| 2293 | /* Do any needed deferred wakeups of rcuo kthreads. */ | 2316 | /* Do any needed deferred wakeups of rcuo kthreads. */ |
| 2294 | do_nocb_deferred_wakeup(rdp); | 2317 | do_nocb_deferred_wakeup(rdp); |
| 2295 | trace_rcu_utilization(TPS("End RCU core")); | 2318 | trace_rcu_utilization(TPS("End RCU core")); |
| 2296 | } | 2319 | } |
| 2297 | 2320 | ||
| 2321 | static void rcu_core_si(struct softirq_action *h) | ||
| 2322 | { | ||
| 2323 | rcu_core(); | ||
| 2324 | } | ||
| 2325 | |||
| 2326 | static void rcu_wake_cond(struct task_struct *t, int status) | ||
| 2327 | { | ||
| 2328 | /* | ||
| 2329 | * If the thread is yielding, only wake it when this | ||
| 2330 | * is invoked from idle | ||
| 2331 | */ | ||
| 2332 | if (t && (status != RCU_KTHREAD_YIELDING || is_idle_task(current))) | ||
| 2333 | wake_up_process(t); | ||
| 2334 | } | ||
| 2335 | |||
| 2336 | static void invoke_rcu_core_kthread(void) | ||
| 2337 | { | ||
| 2338 | struct task_struct *t; | ||
| 2339 | unsigned long flags; | ||
| 2340 | |||
| 2341 | local_irq_save(flags); | ||
| 2342 | __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); | ||
| 2343 | t = __this_cpu_read(rcu_data.rcu_cpu_kthread_task); | ||
| 2344 | if (t != NULL && t != current) | ||
| 2345 | rcu_wake_cond(t, __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); | ||
| 2346 | local_irq_restore(flags); | ||
| 2347 | } | ||
| 2348 | |||
| 2298 | /* | 2349 | /* |
| 2299 | * Schedule RCU callback invocation. If the running implementation of RCU | 2350 | * Wake up this CPU's rcuc kthread to do RCU core processing. |
| 2300 | * does not support RCU priority boosting, just do a direct call, otherwise | ||
| 2301 | * wake up the per-CPU kernel kthread. Note that because we are running | ||
| 2302 | * on the current CPU with softirqs disabled, the rcu_cpu_kthread_task | ||
| 2303 | * cannot disappear out from under us. | ||
| 2304 | */ | 2351 | */ |
| 2305 | static void invoke_rcu_callbacks(struct rcu_data *rdp) | 2352 | static void invoke_rcu_core(void) |
| 2306 | { | 2353 | { |
| 2307 | if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) | 2354 | if (!cpu_online(smp_processor_id())) |
| 2308 | return; | ||
| 2309 | if (likely(!rcu_state.boost)) { | ||
| 2310 | rcu_do_batch(rdp); | ||
| 2311 | return; | 2355 | return; |
| 2356 | if (use_softirq) | ||
| 2357 | raise_softirq(RCU_SOFTIRQ); | ||
| 2358 | else | ||
| 2359 | invoke_rcu_core_kthread(); | ||
| 2360 | } | ||
| 2361 | |||
| 2362 | static void rcu_cpu_kthread_park(unsigned int cpu) | ||
| 2363 | { | ||
| 2364 | per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; | ||
| 2365 | } | ||
| 2366 | |||
| 2367 | static int rcu_cpu_kthread_should_run(unsigned int cpu) | ||
| 2368 | { | ||
| 2369 | return __this_cpu_read(rcu_data.rcu_cpu_has_work); | ||
| 2370 | } | ||
| 2371 | |||
| 2372 | /* | ||
| 2373 | * Per-CPU kernel thread that invokes RCU callbacks. This replaces | ||
| 2374 | * the RCU softirq used in configurations of RCU that do not support RCU | ||
| 2375 | * priority boosting. | ||
| 2376 | */ | ||
| 2377 | static void rcu_cpu_kthread(unsigned int cpu) | ||
| 2378 | { | ||
| 2379 | unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); | ||
| 2380 | char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); | ||
| 2381 | int spincnt; | ||
| 2382 | |||
| 2383 | for (spincnt = 0; spincnt < 10; spincnt++) { | ||
| 2384 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); | ||
| 2385 | local_bh_disable(); | ||
| 2386 | *statusp = RCU_KTHREAD_RUNNING; | ||
| 2387 | local_irq_disable(); | ||
| 2388 | work = *workp; | ||
| 2389 | *workp = 0; | ||
| 2390 | local_irq_enable(); | ||
| 2391 | if (work) | ||
| 2392 | rcu_core(); | ||
| 2393 | local_bh_enable(); | ||
| 2394 | if (*workp == 0) { | ||
| 2395 | trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); | ||
| 2396 | *statusp = RCU_KTHREAD_WAITING; | ||
| 2397 | return; | ||
| 2398 | } | ||
| 2312 | } | 2399 | } |
| 2313 | invoke_rcu_callbacks_kthread(); | 2400 | *statusp = RCU_KTHREAD_YIELDING; |
| 2401 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); | ||
| 2402 | schedule_timeout_interruptible(2); | ||
| 2403 | trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); | ||
| 2404 | *statusp = RCU_KTHREAD_WAITING; | ||
| 2314 | } | 2405 | } |
| 2315 | 2406 | ||
| 2316 | static void invoke_rcu_core(void) | 2407 | static struct smp_hotplug_thread rcu_cpu_thread_spec = { |
| 2408 | .store = &rcu_data.rcu_cpu_kthread_task, | ||
| 2409 | .thread_should_run = rcu_cpu_kthread_should_run, | ||
| 2410 | .thread_fn = rcu_cpu_kthread, | ||
| 2411 | .thread_comm = "rcuc/%u", | ||
| 2412 | .setup = rcu_cpu_kthread_setup, | ||
| 2413 | .park = rcu_cpu_kthread_park, | ||
| 2414 | }; | ||
| 2415 | |||
| 2416 | /* | ||
| 2417 | * Spawn per-CPU RCU core processing kthreads. | ||
| 2418 | */ | ||
| 2419 | static int __init rcu_spawn_core_kthreads(void) | ||
| 2317 | { | 2420 | { |
| 2318 | if (cpu_online(smp_processor_id())) | 2421 | int cpu; |
| 2319 | raise_softirq(RCU_SOFTIRQ); | 2422 | |
| 2423 | for_each_possible_cpu(cpu) | ||
| 2424 | per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; | ||
| 2425 | if (!IS_ENABLED(CONFIG_RCU_BOOST) && use_softirq) | ||
| 2426 | return 0; | ||
| 2427 | WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), | ||
| 2428 | "%s: Could not start rcuc kthread, OOM is now expected behavior\n", __func__); | ||
| 2429 | return 0; | ||
| 2320 | } | 2430 | } |
| 2431 | early_initcall(rcu_spawn_core_kthreads); | ||
| 2321 | 2432 | ||
| 2322 | /* | 2433 | /* |
| 2323 | * Handle any core-RCU processing required by a call_rcu() invocation. | 2434 | * Handle any core-RCU processing required by a call_rcu() invocation. |
| @@ -2354,7 +2465,7 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head, | |||
| 2354 | rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); | 2465 | rcu_accelerate_cbs_unlocked(rdp->mynode, rdp); |
| 2355 | } else { | 2466 | } else { |
| 2356 | /* Give the grace period a kick. */ | 2467 | /* Give the grace period a kick. */ |
| 2357 | rdp->blimit = LONG_MAX; | 2468 | rdp->blimit = DEFAULT_MAX_RCU_BLIMIT; |
| 2358 | if (rcu_state.n_force_qs == rdp->n_force_qs_snap && | 2469 | if (rcu_state.n_force_qs == rdp->n_force_qs_snap && |
| 2359 | rcu_segcblist_first_pend_cb(&rdp->cblist) != head) | 2470 | rcu_segcblist_first_pend_cb(&rdp->cblist) != head) |
| 2360 | rcu_force_quiescent_state(); | 2471 | rcu_force_quiescent_state(); |
| @@ -3355,7 +3466,8 @@ void __init rcu_init(void) | |||
| 3355 | rcu_init_one(); | 3466 | rcu_init_one(); |
| 3356 | if (dump_tree) | 3467 | if (dump_tree) |
| 3357 | rcu_dump_rcu_node_tree(); | 3468 | rcu_dump_rcu_node_tree(); |
| 3358 | open_softirq(RCU_SOFTIRQ, rcu_core); | 3469 | if (use_softirq) |
| 3470 | open_softirq(RCU_SOFTIRQ, rcu_core_si); | ||
| 3359 | 3471 | ||
| 3360 | /* | 3472 | /* |
| 3361 | * We don't need protection against CPU-hotplug here because | 3473 | * We don't need protection against CPU-hotplug here because |
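
When use_softirq is clear, RCU core processing moves from RCU_SOFTIRQ into the per-CPU rcuc kthreads that rcu_spawn_core_kthreads() registers above via the smpboot API. For reference, the registration pattern in isolation looks roughly like the sketch below; all demo_* names are invented, and the authoritative version is rcu_cpu_thread_spec above.

    #include <linux/init.h>
    #include <linux/percpu.h>
    #include <linux/sched.h>
    #include <linux/smpboot.h>

    static DEFINE_PER_CPU(struct task_struct *, demo_task);
    static DEFINE_PER_CPU(int, demo_has_work);

    static int demo_should_run(unsigned int cpu)
    {
        return __this_cpu_read(demo_has_work);
    }

    /* Runs in the per-CPU kthread whenever demo_should_run() is nonzero. */
    static void demo_thread_fn(unsigned int cpu)
    {
        __this_cpu_write(demo_has_work, 0);
        /* ... drain this CPU's work ... */
    }

    static struct smp_hotplug_thread demo_thread_spec = {
        .store              = &demo_task,
        .thread_should_run  = demo_should_run,
        .thread_fn          = demo_thread_fn,
        .thread_comm        = "demo/%u",
    };

    static int __init demo_spawn_threads(void)
    {
        return smpboot_register_percpu_thread(&demo_thread_spec);
    }
    early_initcall(demo_spawn_threads);
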
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e253d11af3c4..7acaf3a62d39 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h | |||
| @@ -154,13 +154,15 @@ struct rcu_data { | |||
| 154 | bool core_needs_qs; /* Core waits for quiesc state. */ | 154 | bool core_needs_qs; /* Core waits for quiesc state. */ |
| 155 | bool beenonline; /* CPU online at least once. */ | 155 | bool beenonline; /* CPU online at least once. */ |
| 156 | bool gpwrap; /* Possible ->gp_seq wrap. */ | 156 | bool gpwrap; /* Possible ->gp_seq wrap. */ |
| 157 | bool deferred_qs; /* This CPU awaiting a deferred QS? */ | 157 | bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ |
| 158 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | 158 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ |
| 159 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ | 159 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ |
| 160 | unsigned long ticks_this_gp; /* The number of scheduling-clock */ | 160 | unsigned long ticks_this_gp; /* The number of scheduling-clock */ |
| 161 | /* ticks this CPU has handled */ | 161 | /* ticks this CPU has handled */ |
| 162 | /* during and after the last grace */ | 162 | /* during and after the last grace */ |
| 163 | /* period it is aware of. */ | 163 | /* period it is aware of. */ |
| 164 | struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ | ||
| 165 | bool defer_qs_iw_pending; /* Scheduler attention pending? */ | ||
| 164 | 166 | ||
| 165 | /* 2) batch handling */ | 167 | /* 2) batch handling */ |
| 166 | struct rcu_segcblist cblist; /* Segmented callback list, with */ | 168 | struct rcu_segcblist cblist; /* Segmented callback list, with */ |
| @@ -407,8 +409,8 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func); | |||
| 407 | static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); | 409 | static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck); |
| 408 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | 410 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); |
| 409 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | 411 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); |
| 410 | static void invoke_rcu_callbacks_kthread(void); | ||
| 411 | static bool rcu_is_callbacks_kthread(void); | 412 | static bool rcu_is_callbacks_kthread(void); |
| 413 | static void rcu_cpu_kthread_setup(unsigned int cpu); | ||
| 412 | static void __init rcu_spawn_boost_kthreads(void); | 414 | static void __init rcu_spawn_boost_kthreads(void); |
| 413 | static void rcu_prepare_kthreads(int cpu); | 415 | static void rcu_prepare_kthreads(int cpu); |
| 414 | static void rcu_cleanup_after_idle(void); | 416 | static void rcu_cleanup_after_idle(void); |
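
The defer_qs_iw/defer_qs_iw_pending pair added to struct rcu_data follows the common queue-at-most-one irq_work idiom, which tree_plugin.h below uses to get the scheduler's attention on a specific CPU. A generic sketch of that idiom, with invented demo_* names, is shown here.

    #include <linux/irq_work.h>
    #include <linux/kernel.h>
    #include <linux/percpu.h>

    struct demo_pcpu {
        struct irq_work iw;
        bool            iw_pending;
    };
    static DEFINE_PER_CPU(struct demo_pcpu, demo_pcpu_data);

    /* Runs in hard-irq context on the target CPU; re-open the gate. */
    static void demo_iw_handler(struct irq_work *iwp)
    {
        struct demo_pcpu *p = container_of(iwp, struct demo_pcpu, iw);

        p->iw_pending = false;
        /* ... minimal work, e.g. let the irq-exit path reschedule ... */
    }

    /* Queue at most one irq_work per CPU at any given time. */
    static void demo_poke_cpu(int cpu)
    {
        struct demo_pcpu *p = per_cpu_ptr(&demo_pcpu_data, cpu);

        if (IS_ENABLED(CONFIG_IRQ_WORK) && !p->iw_pending) {
            init_irq_work(&p->iw, demo_iw_handler);
            p->iw_pending = true;
            irq_work_queue_on(&p->iw, cpu);
        }
    }
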
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 9c990df880d1..af7e7b9c86af 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h | |||
| @@ -250,7 +250,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, | |||
| 250 | */ | 250 | */ |
| 251 | static void rcu_report_exp_rdp(struct rcu_data *rdp) | 251 | static void rcu_report_exp_rdp(struct rcu_data *rdp) |
| 252 | { | 252 | { |
| 253 | WRITE_ONCE(rdp->deferred_qs, false); | 253 | WRITE_ONCE(rdp->exp_deferred_qs, false); |
| 254 | rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); | 254 | rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); |
| 255 | } | 255 | } |
| 256 | 256 | ||
| @@ -259,8 +259,7 @@ static bool sync_exp_work_done(unsigned long s) | |||
| 259 | { | 259 | { |
| 260 | if (rcu_exp_gp_seq_done(s)) { | 260 | if (rcu_exp_gp_seq_done(s)) { |
| 261 | trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); | 261 | trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done")); |
| 262 | /* Ensure test happens before caller kfree(). */ | 262 | smp_mb(); /* Ensure test happens before caller kfree(). */ |
| 263 | smp_mb__before_atomic(); /* ^^^ */ | ||
| 264 | return true; | 263 | return true; |
| 265 | } | 264 | } |
| 266 | return false; | 265 | return false; |
| @@ -384,7 +383,12 @@ retry_ipi: | |||
| 384 | mask_ofl_test |= mask; | 383 | mask_ofl_test |= mask; |
| 385 | continue; | 384 | continue; |
| 386 | } | 385 | } |
| 386 | if (get_cpu() == cpu) { | ||
| 387 | put_cpu(); | ||
| 388 | continue; | ||
| 389 | } | ||
| 387 | ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); | 390 | ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); |
| 391 | put_cpu(); | ||
| 388 | if (!ret) { | 392 | if (!ret) { |
| 389 | mask_ofl_ipi &= ~mask; | 393 | mask_ofl_ipi &= ~mask; |
| 390 | continue; | 394 | continue; |
| @@ -611,7 +615,7 @@ static void rcu_exp_handler(void *unused) | |||
| 611 | rcu_dynticks_curr_cpu_in_eqs()) { | 615 | rcu_dynticks_curr_cpu_in_eqs()) { |
| 612 | rcu_report_exp_rdp(rdp); | 616 | rcu_report_exp_rdp(rdp); |
| 613 | } else { | 617 | } else { |
| 614 | rdp->deferred_qs = true; | 618 | rdp->exp_deferred_qs = true; |
| 615 | set_tsk_need_resched(t); | 619 | set_tsk_need_resched(t); |
| 616 | set_preempt_need_resched(); | 620 | set_preempt_need_resched(); |
| 617 | } | 621 | } |
| @@ -633,7 +637,7 @@ static void rcu_exp_handler(void *unused) | |||
| 633 | if (t->rcu_read_lock_nesting > 0) { | 637 | if (t->rcu_read_lock_nesting > 0) { |
| 634 | raw_spin_lock_irqsave_rcu_node(rnp, flags); | 638 | raw_spin_lock_irqsave_rcu_node(rnp, flags); |
| 635 | if (rnp->expmask & rdp->grpmask) { | 639 | if (rnp->expmask & rdp->grpmask) { |
| 636 | rdp->deferred_qs = true; | 640 | rdp->exp_deferred_qs = true; |
| 637 | t->rcu_read_unlock_special.b.exp_hint = true; | 641 | t->rcu_read_unlock_special.b.exp_hint = true; |
| 638 | } | 642 | } |
| 639 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 643 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| @@ -656,7 +660,7 @@ static void rcu_exp_handler(void *unused) | |||
| 656 | * | 660 | * |
| 657 | * Otherwise, force a context switch after the CPU enables everything. | 661 | * Otherwise, force a context switch after the CPU enables everything. |
| 658 | */ | 662 | */ |
| 659 | rdp->deferred_qs = true; | 663 | rdp->exp_deferred_qs = true; |
| 660 | if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || | 664 | if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) || |
| 661 | WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { | 665 | WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs())) { |
| 662 | rcu_preempt_deferred_qs(t); | 666 | rcu_preempt_deferred_qs(t); |
| @@ -694,6 +698,16 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) | |||
| 694 | 698 | ||
| 695 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | 699 | #else /* #ifdef CONFIG_PREEMPT_RCU */ |
| 696 | 700 | ||
| 701 | /* Request an expedited quiescent state. */ | ||
| 702 | static void rcu_exp_need_qs(void) | ||
| 703 | { | ||
| 704 | __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); | ||
| 705 | /* Store .exp before .rcu_urgent_qs. */ | ||
| 706 | smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); | ||
| 707 | set_tsk_need_resched(current); | ||
| 708 | set_preempt_need_resched(); | ||
| 709 | } | ||
| 710 | |||
| 697 | /* Invoked on each online non-idle CPU for expedited quiescent state. */ | 711 | /* Invoked on each online non-idle CPU for expedited quiescent state. */ |
| 698 | static void rcu_exp_handler(void *unused) | 712 | static void rcu_exp_handler(void *unused) |
| 699 | { | 713 | { |
| @@ -709,25 +723,38 @@ static void rcu_exp_handler(void *unused) | |||
| 709 | rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); | 723 | rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); |
| 710 | return; | 724 | return; |
| 711 | } | 725 | } |
| 712 | __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); | 726 | rcu_exp_need_qs(); |
| 713 | /* Store .exp before .rcu_urgent_qs. */ | ||
| 714 | smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); | ||
| 715 | set_tsk_need_resched(current); | ||
| 716 | set_preempt_need_resched(); | ||
| 717 | } | 727 | } |
| 718 | 728 | ||
| 719 | /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ | 729 | /* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ |
| 720 | static void sync_sched_exp_online_cleanup(int cpu) | 730 | static void sync_sched_exp_online_cleanup(int cpu) |
| 721 | { | 731 | { |
| 732 | unsigned long flags; | ||
| 733 | int my_cpu; | ||
| 722 | struct rcu_data *rdp; | 734 | struct rcu_data *rdp; |
| 723 | int ret; | 735 | int ret; |
| 724 | struct rcu_node *rnp; | 736 | struct rcu_node *rnp; |
| 725 | 737 | ||
| 726 | rdp = per_cpu_ptr(&rcu_data, cpu); | 738 | rdp = per_cpu_ptr(&rcu_data, cpu); |
| 727 | rnp = rdp->mynode; | 739 | rnp = rdp->mynode; |
| 728 | if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) | 740 | my_cpu = get_cpu(); |
| 741 | /* Quiescent state either not needed or already requested, leave. */ | ||
| 742 | if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || | ||
| 743 | __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) { | ||
| 744 | put_cpu(); | ||
| 745 | return; | ||
| 746 | } | ||
| 747 | /* Quiescent state needed on current CPU, so set it up locally. */ | ||
| 748 | if (my_cpu == cpu) { | ||
| 749 | local_irq_save(flags); | ||
| 750 | rcu_exp_need_qs(); | ||
| 751 | local_irq_restore(flags); | ||
| 752 | put_cpu(); | ||
| 729 | return; | 753 | return; |
| 754 | } | ||
| 755 | /* Quiescent state needed on some other CPU, send IPI. */ | ||
| 730 | ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); | 756 | ret = smp_call_function_single(cpu, rcu_exp_handler, NULL, 0); |
| 757 | put_cpu(); | ||
| 731 | WARN_ON_ONCE(ret); | 758 | WARN_ON_ONCE(ret); |
| 732 | } | 759 | } |
| 733 | 760 | ||
| @@ -765,7 +792,6 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp) | |||
| 765 | */ | 792 | */ |
| 766 | void synchronize_rcu_expedited(void) | 793 | void synchronize_rcu_expedited(void) |
| 767 | { | 794 | { |
| 768 | struct rcu_data *rdp; | ||
| 769 | struct rcu_exp_work rew; | 795 | struct rcu_exp_work rew; |
| 770 | struct rcu_node *rnp; | 796 | struct rcu_node *rnp; |
| 771 | unsigned long s; | 797 | unsigned long s; |
| @@ -802,7 +828,6 @@ void synchronize_rcu_expedited(void) | |||
| 802 | } | 828 | } |
| 803 | 829 | ||
| 804 | /* Wait for expedited grace period to complete. */ | 830 | /* Wait for expedited grace period to complete. */ |
| 805 | rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id()); | ||
| 806 | rnp = rcu_get_root(); | 831 | rnp = rcu_get_root(); |
| 807 | wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], | 832 | wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], |
| 808 | sync_exp_work_done(s)); | 833 | sync_exp_work_done(s)); |
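
Both hunks above wrap smp_call_function_single() in get_cpu()/put_cpu() so that a CPU never tries to send an expedited-quiescent-state IPI to itself and instead handles the request locally. Stripped of the RCU specifics, the pattern looks like the sketch below; the handler body and demo_* names are placeholders.

    #include <linux/smp.h>

    static void demo_handler(void *unused)
    {
        /* Per-CPU action normally triggered by the IPI. */
    }

    static void demo_kick_cpu(int cpu)
    {
        /* get_cpu() disables preemption, keeping the comparison stable. */
        if (get_cpu() == cpu) {
            demo_handler(NULL);      /* Run locally; no self-IPI. */
            put_cpu();
            return;
        }
        smp_call_function_single(cpu, demo_handler, NULL, 0);
        put_cpu();
    }
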
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1102765f91fd..acb225023ed1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -11,29 +11,7 @@ | |||
| 11 | * Paul E. McKenney <paulmck@linux.ibm.com> | 11 | * Paul E. McKenney <paulmck@linux.ibm.com> |
| 12 | */ | 12 | */ |
| 13 | 13 | ||
| 14 | #include <linux/delay.h> | ||
| 15 | #include <linux/gfp.h> | ||
| 16 | #include <linux/oom.h> | ||
| 17 | #include <linux/sched/debug.h> | ||
| 18 | #include <linux/smpboot.h> | ||
| 19 | #include <linux/sched/isolation.h> | ||
| 20 | #include <uapi/linux/sched/types.h> | ||
| 21 | #include "../time/tick-internal.h" | ||
| 22 | |||
| 23 | #ifdef CONFIG_RCU_BOOST | ||
| 24 | #include "../locking/rtmutex_common.h" | 14 | #include "../locking/rtmutex_common.h" |
| 25 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
| 26 | |||
| 27 | /* | ||
| 28 | * Some architectures do not define rt_mutexes, but if !CONFIG_RCU_BOOST, | ||
| 29 | * all uses are in dead code. Provide a definition to keep the compiler | ||
| 30 | * happy, but add WARN_ON_ONCE() to complain if used in the wrong place. | ||
| 31 | * This probably needs to be excluded from -rt builds. | ||
| 32 | */ | ||
| 33 | #define rt_mutex_owner(a) ({ WARN_ON_ONCE(1); NULL; }) | ||
| 34 | #define rt_mutex_futex_unlock(x) WARN_ON_ONCE(1) | ||
| 35 | |||
| 36 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
| 37 | 15 | ||
| 38 | #ifdef CONFIG_RCU_NOCB_CPU | 16 | #ifdef CONFIG_RCU_NOCB_CPU |
| 39 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ | 17 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ |
| @@ -94,6 +72,8 @@ static void __init rcu_bootup_announce_oddness(void) | |||
| 94 | pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); | 72 | pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); |
| 95 | if (gp_cleanup_delay) | 73 | if (gp_cleanup_delay) |
| 96 | pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); | 74 | pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); |
| 75 | if (!use_softirq) | ||
| 76 | pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); | ||
| 97 | if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) | 77 | if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) |
| 98 | pr_info("\tRCU debug extended QS entry/exit.\n"); | 78 | pr_info("\tRCU debug extended QS entry/exit.\n"); |
| 99 | rcupdate_announce_bootup_oddness(); | 79 | rcupdate_announce_bootup_oddness(); |
| @@ -257,10 +237,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) | |||
| 257 | * no need to check for a subsequent expedited GP. (Though we are | 237 | * no need to check for a subsequent expedited GP. (Though we are |
| 258 | * still in a quiescent state in any case.) | 238 | * still in a quiescent state in any case.) |
| 259 | */ | 239 | */ |
| 260 | if (blkd_state & RCU_EXP_BLKD && rdp->deferred_qs) | 240 | if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs) |
| 261 | rcu_report_exp_rdp(rdp); | 241 | rcu_report_exp_rdp(rdp); |
| 262 | else | 242 | else |
| 263 | WARN_ON_ONCE(rdp->deferred_qs); | 243 | WARN_ON_ONCE(rdp->exp_deferred_qs); |
| 264 | } | 244 | } |
| 265 | 245 | ||
| 266 | /* | 246 | /* |
| @@ -357,7 +337,7 @@ void rcu_note_context_switch(bool preempt) | |||
| 357 | * means that we continue to block the current grace period. | 337 | * means that we continue to block the current grace period. |
| 358 | */ | 338 | */ |
| 359 | rcu_qs(); | 339 | rcu_qs(); |
| 360 | if (rdp->deferred_qs) | 340 | if (rdp->exp_deferred_qs) |
| 361 | rcu_report_exp_rdp(rdp); | 341 | rcu_report_exp_rdp(rdp); |
| 362 | trace_rcu_utilization(TPS("End context switch")); | 342 | trace_rcu_utilization(TPS("End context switch")); |
| 363 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ | 343 | barrier(); /* Avoid RCU read-side critical sections leaking up. */ |
| @@ -471,14 +451,15 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) | |||
| 471 | */ | 451 | */ |
| 472 | special = t->rcu_read_unlock_special; | 452 | special = t->rcu_read_unlock_special; |
| 473 | rdp = this_cpu_ptr(&rcu_data); | 453 | rdp = this_cpu_ptr(&rcu_data); |
| 474 | if (!special.s && !rdp->deferred_qs) { | 454 | if (!special.s && !rdp->exp_deferred_qs) { |
| 475 | local_irq_restore(flags); | 455 | local_irq_restore(flags); |
| 476 | return; | 456 | return; |
| 477 | } | 457 | } |
| 458 | t->rcu_read_unlock_special.b.deferred_qs = false; | ||
| 478 | if (special.b.need_qs) { | 459 | if (special.b.need_qs) { |
| 479 | rcu_qs(); | 460 | rcu_qs(); |
| 480 | t->rcu_read_unlock_special.b.need_qs = false; | 461 | t->rcu_read_unlock_special.b.need_qs = false; |
| 481 | if (!t->rcu_read_unlock_special.s && !rdp->deferred_qs) { | 462 | if (!t->rcu_read_unlock_special.s && !rdp->exp_deferred_qs) { |
| 482 | local_irq_restore(flags); | 463 | local_irq_restore(flags); |
| 483 | return; | 464 | return; |
| 484 | } | 465 | } |
| @@ -490,7 +471,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) | |||
| 490 | * tasks are handled when removing the task from the | 471 | * tasks are handled when removing the task from the |
| 491 | * blocked-tasks list below. | 472 | * blocked-tasks list below. |
| 492 | */ | 473 | */ |
| 493 | if (rdp->deferred_qs) { | 474 | if (rdp->exp_deferred_qs) { |
| 494 | rcu_report_exp_rdp(rdp); | 475 | rcu_report_exp_rdp(rdp); |
| 495 | if (!t->rcu_read_unlock_special.s) { | 476 | if (!t->rcu_read_unlock_special.s) { |
| 496 | local_irq_restore(flags); | 477 | local_irq_restore(flags); |
| @@ -579,7 +560,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) | |||
| 579 | */ | 560 | */ |
| 580 | static bool rcu_preempt_need_deferred_qs(struct task_struct *t) | 561 | static bool rcu_preempt_need_deferred_qs(struct task_struct *t) |
| 581 | { | 562 | { |
| 582 | return (__this_cpu_read(rcu_data.deferred_qs) || | 563 | return (__this_cpu_read(rcu_data.exp_deferred_qs) || |
| 583 | READ_ONCE(t->rcu_read_unlock_special.s)) && | 564 | READ_ONCE(t->rcu_read_unlock_special.s)) && |
| 584 | t->rcu_read_lock_nesting <= 0; | 565 | t->rcu_read_lock_nesting <= 0; |
| 585 | } | 566 | } |
| @@ -607,6 +588,17 @@ static void rcu_preempt_deferred_qs(struct task_struct *t) | |||
| 607 | } | 588 | } |
| 608 | 589 | ||
| 609 | /* | 590 | /* |
| 591 | * Minimal handler to give the scheduler a chance to re-evaluate. | ||
| 592 | */ | ||
| 593 | static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp) | ||
| 594 | { | ||
| 595 | struct rcu_data *rdp; | ||
| 596 | |||
| 597 | rdp = container_of(iwp, struct rcu_data, defer_qs_iw); | ||
| 598 | rdp->defer_qs_iw_pending = false; | ||
| 599 | } | ||
| 600 | |||
| 601 | /* | ||
| 610 | * Handle special cases during rcu_read_unlock(), such as needing to | 602 | * Handle special cases during rcu_read_unlock(), such as needing to |
| 611 | * notify RCU core processing or task having blocked during the RCU | 603 | * notify RCU core processing or task having blocked during the RCU |
| 612 | * read-side critical section. | 604 | * read-side critical section. |
| @@ -625,16 +617,41 @@ static void rcu_read_unlock_special(struct task_struct *t) | |||
| 625 | local_irq_save(flags); | 617 | local_irq_save(flags); |
| 626 | irqs_were_disabled = irqs_disabled_flags(flags); | 618 | irqs_were_disabled = irqs_disabled_flags(flags); |
| 627 | if (preempt_bh_were_disabled || irqs_were_disabled) { | 619 | if (preempt_bh_were_disabled || irqs_were_disabled) { |
| 628 | WRITE_ONCE(t->rcu_read_unlock_special.b.exp_hint, false); | 620 | bool exp; |
| 629 | /* Need to defer quiescent state until everything is enabled. */ | 621 | struct rcu_data *rdp = this_cpu_ptr(&rcu_data); |
| 630 | if (irqs_were_disabled) { | 622 | struct rcu_node *rnp = rdp->mynode; |
| 631 | /* Enabling irqs does not reschedule, so... */ | 623 | |
| 624 | t->rcu_read_unlock_special.b.exp_hint = false; | ||
| 625 | exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) || | ||
| 626 | (rdp->grpmask & rnp->expmask) || | ||
| 627 | tick_nohz_full_cpu(rdp->cpu); | ||
| 628 | // Need to defer quiescent state until everything is enabled. | ||
| 629 | if ((exp || in_irq()) && irqs_were_disabled && use_softirq && | ||
| 630 | (in_irq() || !t->rcu_read_unlock_special.b.deferred_qs)) { | ||
| 631 | // Using softirq, safe to awaken, and we get | ||
| 632 | // no help from enabling irqs, unlike bh/preempt. | ||
| 632 | raise_softirq_irqoff(RCU_SOFTIRQ); | 633 | raise_softirq_irqoff(RCU_SOFTIRQ); |
| 634 | } else if (exp && irqs_were_disabled && !use_softirq && | ||
| 635 | !t->rcu_read_unlock_special.b.deferred_qs) { | ||
| 636 | // Safe to awaken and we get no help from enabling | ||
| 637 | // irqs, unlike bh/preempt. | ||
| 638 | invoke_rcu_core(); | ||
| 633 | } else { | 639 | } else { |
| 634 | /* Enabling BH or preempt does reschedule, so... */ | 640 | // Enabling BH or preempt does reschedule, so... |
| 641 | // Also if no expediting or NO_HZ_FULL, slow is OK. | ||
| 635 | set_tsk_need_resched(current); | 642 | set_tsk_need_resched(current); |
| 636 | set_preempt_need_resched(); | 643 | set_preempt_need_resched(); |
| 644 | if (IS_ENABLED(CONFIG_IRQ_WORK) && | ||
| 645 | !rdp->defer_qs_iw_pending && exp) { | ||
| 646 | // Get scheduler to re-evaluate and call hooks. | ||
| 647 | // If !IRQ_WORK, FQS scan will eventually IPI. | ||
| 648 | init_irq_work(&rdp->defer_qs_iw, | ||
| 649 | rcu_preempt_deferred_qs_handler); | ||
| 650 | rdp->defer_qs_iw_pending = true; | ||
| 651 | irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu); | ||
| 652 | } | ||
| 637 | } | 653 | } |
| 654 | t->rcu_read_unlock_special.b.deferred_qs = true; | ||
| 638 | local_irq_restore(flags); | 655 | local_irq_restore(flags); |
| 639 | return; | 656 | return; |
| 640 | } | 657 | } |
| @@ -760,7 +777,7 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) | |||
| 760 | i = 0; | 777 | i = 0; |
| 761 | list_for_each(lhp, &rnp->blkd_tasks) { | 778 | list_for_each(lhp, &rnp->blkd_tasks) { |
| 762 | pr_cont(" %p", lhp); | 779 | pr_cont(" %p", lhp); |
| 763 | if (++i >= 10) | 780 | if (++i >= ncheck) |
| 764 | break; | 781 | break; |
| 765 | } | 782 | } |
| 766 | pr_cont("\n"); | 783 | pr_cont("\n"); |
| @@ -944,18 +961,21 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) | |||
| 944 | 961 | ||
| 945 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | 962 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ |
| 946 | 963 | ||
| 964 | /* | ||
| 965 | * If boosting, set rcuc kthreads to realtime priority. | ||
| 966 | */ | ||
| 967 | static void rcu_cpu_kthread_setup(unsigned int cpu) | ||
| 968 | { | ||
| 947 | #ifdef CONFIG_RCU_BOOST | 969 | #ifdef CONFIG_RCU_BOOST |
| 970 | struct sched_param sp; | ||
| 948 | 971 | ||
| 949 | static void rcu_wake_cond(struct task_struct *t, int status) | 972 | sp.sched_priority = kthread_prio; |
| 950 | { | 973 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); |
| 951 | /* | 974 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 952 | * If the thread is yielding, only wake it when this | ||
| 953 | * is invoked from idle | ||
| 954 | */ | ||
| 955 | if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) | ||
| 956 | wake_up_process(t); | ||
| 957 | } | 975 | } |
| 958 | 976 | ||
| 977 | #ifdef CONFIG_RCU_BOOST | ||
| 978 | |||
| 959 | /* | 979 | /* |
| 960 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | 980 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks |
| 961 | * or ->boost_tasks, advancing the pointer to the next task in the | 981 | * or ->boost_tasks, advancing the pointer to the next task in the |
| @@ -1091,23 +1111,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | |||
| 1091 | } | 1111 | } |
| 1092 | 1112 | ||
| 1093 | /* | 1113 | /* |
| 1094 | * Wake up the per-CPU kthread to invoke RCU callbacks. | ||
| 1095 | */ | ||
| 1096 | static void invoke_rcu_callbacks_kthread(void) | ||
| 1097 | { | ||
| 1098 | unsigned long flags; | ||
| 1099 | |||
| 1100 | local_irq_save(flags); | ||
| 1101 | __this_cpu_write(rcu_data.rcu_cpu_has_work, 1); | ||
| 1102 | if (__this_cpu_read(rcu_data.rcu_cpu_kthread_task) != NULL && | ||
| 1103 | current != __this_cpu_read(rcu_data.rcu_cpu_kthread_task)) { | ||
| 1104 | rcu_wake_cond(__this_cpu_read(rcu_data.rcu_cpu_kthread_task), | ||
| 1105 | __this_cpu_read(rcu_data.rcu_cpu_kthread_status)); | ||
| 1106 | } | ||
| 1107 | local_irq_restore(flags); | ||
| 1108 | } | ||
| 1109 | |||
| 1110 | /* | ||
| 1111 | * Is the current CPU running the RCU-callbacks kthread? | 1114 | * Is the current CPU running the RCU-callbacks kthread? |
| 1112 | * Caller must have preemption disabled. | 1115 | * Caller must have preemption disabled. |
| 1113 | */ | 1116 | */ |
| @@ -1160,59 +1163,6 @@ static int rcu_spawn_one_boost_kthread(struct rcu_node *rnp) | |||
| 1160 | return 0; | 1163 | return 0; |
| 1161 | } | 1164 | } |
| 1162 | 1165 | ||
| 1163 | static void rcu_cpu_kthread_setup(unsigned int cpu) | ||
| 1164 | { | ||
| 1165 | struct sched_param sp; | ||
| 1166 | |||
| 1167 | sp.sched_priority = kthread_prio; | ||
| 1168 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | ||
| 1169 | } | ||
| 1170 | |||
| 1171 | static void rcu_cpu_kthread_park(unsigned int cpu) | ||
| 1172 | { | ||
| 1173 | per_cpu(rcu_data.rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; | ||
| 1174 | } | ||
| 1175 | |||
| 1176 | static int rcu_cpu_kthread_should_run(unsigned int cpu) | ||
| 1177 | { | ||
| 1178 | return __this_cpu_read(rcu_data.rcu_cpu_has_work); | ||
| 1179 | } | ||
| 1180 | |||
| 1181 | /* | ||
| 1182 | * Per-CPU kernel thread that invokes RCU callbacks. This replaces | ||
| 1183 | * the RCU softirq used in configurations of RCU that do not support RCU | ||
| 1184 | * priority boosting. | ||
| 1185 | */ | ||
| 1186 | static void rcu_cpu_kthread(unsigned int cpu) | ||
| 1187 | { | ||
| 1188 | unsigned int *statusp = this_cpu_ptr(&rcu_data.rcu_cpu_kthread_status); | ||
| 1189 | char work, *workp = this_cpu_ptr(&rcu_data.rcu_cpu_has_work); | ||
| 1190 | int spincnt; | ||
| 1191 | |||
| 1192 | for (spincnt = 0; spincnt < 10; spincnt++) { | ||
| 1193 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); | ||
| 1194 | local_bh_disable(); | ||
| 1195 | *statusp = RCU_KTHREAD_RUNNING; | ||
| 1196 | local_irq_disable(); | ||
| 1197 | work = *workp; | ||
| 1198 | *workp = 0; | ||
| 1199 | local_irq_enable(); | ||
| 1200 | if (work) | ||
| 1201 | rcu_do_batch(this_cpu_ptr(&rcu_data)); | ||
| 1202 | local_bh_enable(); | ||
| 1203 | if (*workp == 0) { | ||
| 1204 | trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); | ||
| 1205 | *statusp = RCU_KTHREAD_WAITING; | ||
| 1206 | return; | ||
| 1207 | } | ||
| 1208 | } | ||
| 1209 | *statusp = RCU_KTHREAD_YIELDING; | ||
| 1210 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); | ||
| 1211 | schedule_timeout_interruptible(2); | ||
| 1212 | trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); | ||
| 1213 | *statusp = RCU_KTHREAD_WAITING; | ||
| 1214 | } | ||
| 1215 | |||
| 1216 | /* | 1166 | /* |
| 1217 | * Set the per-rcu_node kthread's affinity to cover all CPUs that are | 1167 | * Set the per-rcu_node kthread's affinity to cover all CPUs that are |
| 1218 | * served by the rcu_node in question. The CPU hotplug lock is still | 1168 | * served by the rcu_node in question. The CPU hotplug lock is still |
| @@ -1243,27 +1193,13 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | |||
| 1243 | free_cpumask_var(cm); | 1193 | free_cpumask_var(cm); |
| 1244 | } | 1194 | } |
| 1245 | 1195 | ||
| 1246 | static struct smp_hotplug_thread rcu_cpu_thread_spec = { | ||
| 1247 | .store = &rcu_data.rcu_cpu_kthread_task, | ||
| 1248 | .thread_should_run = rcu_cpu_kthread_should_run, | ||
| 1249 | .thread_fn = rcu_cpu_kthread, | ||
| 1250 | .thread_comm = "rcuc/%u", | ||
| 1251 | .setup = rcu_cpu_kthread_setup, | ||
| 1252 | .park = rcu_cpu_kthread_park, | ||
| 1253 | }; | ||
| 1254 | |||
| 1255 | /* | 1196 | /* |
| 1256 | * Spawn boost kthreads -- called as soon as the scheduler is running. | 1197 | * Spawn boost kthreads -- called as soon as the scheduler is running. |
| 1257 | */ | 1198 | */ |
| 1258 | static void __init rcu_spawn_boost_kthreads(void) | 1199 | static void __init rcu_spawn_boost_kthreads(void) |
| 1259 | { | 1200 | { |
| 1260 | struct rcu_node *rnp; | 1201 | struct rcu_node *rnp; |
| 1261 | int cpu; | ||
| 1262 | 1202 | ||
| 1263 | for_each_possible_cpu(cpu) | ||
| 1264 | per_cpu(rcu_data.rcu_cpu_has_work, cpu) = 0; | ||
| 1265 | if (WARN_ONCE(smpboot_register_percpu_thread(&rcu_cpu_thread_spec), "%s: Could not start rcub kthread, OOM is now expected behavior\n", __func__)) | ||
| 1266 | return; | ||
| 1267 | rcu_for_each_leaf_node(rnp) | 1203 | rcu_for_each_leaf_node(rnp) |
| 1268 | (void)rcu_spawn_one_boost_kthread(rnp); | 1204 | (void)rcu_spawn_one_boost_kthread(rnp); |
| 1269 | } | 1205 | } |
| @@ -1286,11 +1222,6 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | |||
| 1286 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 1222 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 1287 | } | 1223 | } |
| 1288 | 1224 | ||
| 1289 | static void invoke_rcu_callbacks_kthread(void) | ||
| 1290 | { | ||
| 1291 | WARN_ON_ONCE(1); | ||
| 1292 | } | ||
| 1293 | |||
| 1294 | static bool rcu_is_callbacks_kthread(void) | 1225 | static bool rcu_is_callbacks_kthread(void) |
| 1295 | { | 1226 | { |
| 1296 | return false; | 1227 | return false; |
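The tree_plugin.h hunks above defer a preemptible-RCU quiescent state by queuing a nearly empty irq_work on the target CPU: the handler only clears rdp->defer_qs_iw_pending, and the useful side effect is the self-interrupt itself, whose return path notices the need-resched flags set just before queuing. A minimal sketch of that queue-at-most-once pattern follows; only the irq_work API (init_irq_work(), irq_work_queue_on()) comes from the kernel, while nudge_state, nudge_handler() and nudge_cpu() are illustrative names.

/*
 * Illustrative sketch only: the queue-at-most-once irq_work pattern
 * used above, assuming CONFIG_IRQ_WORK.  Everything except the
 * irq_work API itself is a made-up name.
 */
#include <linux/irq_work.h>
#include <linux/kernel.h>
#include <linux/percpu.h>

struct nudge_state {
	struct irq_work	work;
	bool		pending;
};
static DEFINE_PER_CPU(struct nudge_state, nudge_state);

static void nudge_handler(struct irq_work *iwp)
{
	struct nudge_state *ns = container_of(iwp, struct nudge_state, work);

	/* The handler does no real work: the point is the self-IPI,	*/
	/* whose return path re-runs the scheduler's resched hooks.	*/
	ns->pending = false;
}

static void nudge_cpu(int cpu)
{
	struct nudge_state *ns = &per_cpu(nudge_state, cpu);

	if (ns->pending)			/* at most one in flight per CPU */
		return;
	init_irq_work(&ns->work, nudge_handler);
	ns->pending = true;
	irq_work_queue_on(&ns->work, cpu);
}

As in the hunk, init_irq_work() is redone before every queue attempt and the pending flag keeps a single work item in flight per CPU until the handler runs.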
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index f65a73a97323..065183391f75 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h | |||
| @@ -630,7 +630,9 @@ static void rcu_check_gp_start_stall(struct rcu_node *rnp, struct rcu_data *rdp, | |||
| 630 | time_before(j, rcu_state.gp_req_activity + gpssdelay) || | 630 | time_before(j, rcu_state.gp_req_activity + gpssdelay) || |
| 631 | time_before(j, rcu_state.gp_activity + gpssdelay) || | 631 | time_before(j, rcu_state.gp_activity + gpssdelay) || |
| 632 | atomic_xchg(&warned, 1)) { | 632 | atomic_xchg(&warned, 1)) { |
| 633 | raw_spin_unlock_rcu_node(rnp_root); /* irqs remain disabled. */ | 633 | if (rnp_root != rnp) |
| 634 | /* irqs remain disabled. */ | ||
| 635 | raw_spin_unlock_rcu_node(rnp_root); | ||
| 634 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); | 636 | raw_spin_unlock_irqrestore_rcu_node(rnp, flags); |
| 635 | return; | 637 | return; |
| 636 | } | 638 | } |
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index c3bf44ba42e5..61df2bf08563 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c | |||
| @@ -423,6 +423,19 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); | |||
| 423 | do { } while (0) | 423 | do { } while (0) |
| 424 | #endif | 424 | #endif |
| 425 | 425 | ||
| 426 | #if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) | ||
| 427 | /* Get rcutorture access to sched_setaffinity(). */ | ||
| 428 | long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | ||
| 429 | { | ||
| 430 | int ret; | ||
| 431 | |||
| 432 | ret = sched_setaffinity(pid, in_mask); | ||
| 433 | WARN_ONCE(ret, "%s: sched_setaffinity() returned %d\n", __func__, ret); | ||
| 434 | return ret; | ||
| 435 | } | ||
| 436 | EXPORT_SYMBOL_GPL(rcutorture_sched_setaffinity); | ||
| 437 | #endif | ||
| 438 | |||
| 426 | #ifdef CONFIG_RCU_STALL_COMMON | 439 | #ifdef CONFIG_RCU_STALL_COMMON |
| 427 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | 440 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ |
| 428 | EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); | 441 | EXPORT_SYMBOL_GPL(rcu_cpu_stall_suppress); |
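The update.c hunk above exists because sched_setaffinity() is not exported to modules, so a thin, warn-on-failure wrapper is exported for the (possibly modular) rcutorture code instead. A hedged sketch of a module-side caller is below; the wrapper prototype is taken from the hunk, while pin_self_to() and the chosen mask are hypothetical.

#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/sched.h>

/* Prototype as defined in the hunk above; in-tree callers would pick it
 * up from an internal header rather than redeclare it like this. */
long rcutorture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);

static void pin_self_to(int cpu)
{
	/* pid 0 is resolved to current by sched_setaffinity(). */
	if (rcutorture_sched_setaffinity(0, cpumask_of(cpu)))
		pr_warn("could not pin to CPU %d\n", cpu);
}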
diff --git a/kernel/reboot.c b/kernel/reboot.c index e1b79b6a2735..c4d472b7f1b4 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/reboot.c | 3 | * linux/kernel/reboot.c |
| 3 | * | 4 | * |
| @@ -31,6 +32,7 @@ EXPORT_SYMBOL(cad_pid); | |||
| 31 | #define DEFAULT_REBOOT_MODE | 32 | #define DEFAULT_REBOOT_MODE |
| 32 | #endif | 33 | #endif |
| 33 | enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; | 34 | enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; |
| 35 | enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED; | ||
| 34 | 36 | ||
| 35 | /* | 37 | /* |
| 36 | * This variable is used privately to keep track of whether or not | 38 | * This variable is used privately to keep track of whether or not |
| @@ -519,6 +521,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot); | |||
| 519 | static int __init reboot_setup(char *str) | 521 | static int __init reboot_setup(char *str) |
| 520 | { | 522 | { |
| 521 | for (;;) { | 523 | for (;;) { |
| 524 | enum reboot_mode *mode; | ||
| 525 | |||
| 522 | /* | 526 | /* |
| 523 | * Having anything passed on the command line via | 527 | * Having anything passed on the command line via |
| 524 | * reboot= will cause us to disable DMI checking | 528 | * reboot= will cause us to disable DMI checking |
| @@ -526,17 +530,24 @@ static int __init reboot_setup(char *str) | |||
| 526 | */ | 530 | */ |
| 527 | reboot_default = 0; | 531 | reboot_default = 0; |
| 528 | 532 | ||
| 533 | if (!strncmp(str, "panic_", 6)) { | ||
| 534 | mode = &panic_reboot_mode; | ||
| 535 | str += 6; | ||
| 536 | } else { | ||
| 537 | mode = &reboot_mode; | ||
| 538 | } | ||
| 539 | |||
| 529 | switch (*str) { | 540 | switch (*str) { |
| 530 | case 'w': | 541 | case 'w': |
| 531 | reboot_mode = REBOOT_WARM; | 542 | *mode = REBOOT_WARM; |
| 532 | break; | 543 | break; |
| 533 | 544 | ||
| 534 | case 'c': | 545 | case 'c': |
| 535 | reboot_mode = REBOOT_COLD; | 546 | *mode = REBOOT_COLD; |
| 536 | break; | 547 | break; |
| 537 | 548 | ||
| 538 | case 'h': | 549 | case 'h': |
| 539 | reboot_mode = REBOOT_HARD; | 550 | *mode = REBOOT_HARD; |
| 540 | break; | 551 | break; |
| 541 | 552 | ||
| 542 | case 's': | 553 | case 's': |
| @@ -553,11 +564,11 @@ static int __init reboot_setup(char *str) | |||
| 553 | if (rc) | 564 | if (rc) |
| 554 | return rc; | 565 | return rc; |
| 555 | } else | 566 | } else |
| 556 | reboot_mode = REBOOT_SOFT; | 567 | *mode = REBOOT_SOFT; |
| 557 | break; | 568 | break; |
| 558 | } | 569 | } |
| 559 | case 'g': | 570 | case 'g': |
| 560 | reboot_mode = REBOOT_GPIO; | 571 | *mode = REBOOT_GPIO; |
| 561 | break; | 572 | break; |
| 562 | 573 | ||
| 563 | case 'b': | 574 | case 'b': |
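The reboot.c changes above add a second mode variable and key off a "panic_" prefix in the reboot= command-line tokens, so the same parser can fill either reboot_mode or panic_reboot_mode through a pointer. The standalone C sketch below reproduces just that dispatch; the enum values are a subset of the kernel's and the token loop is simplified to one call per token.

/* Standalone demo (userspace C) of the "panic_" prefix dispatch. */
#include <stdio.h>
#include <string.h>

enum reboot_mode { REBOOT_UNDEFINED = -1, REBOOT_COLD, REBOOT_WARM, REBOOT_HARD };

static enum reboot_mode reboot_mode = REBOOT_COLD;
static enum reboot_mode panic_reboot_mode = REBOOT_UNDEFINED;

static void parse_token(const char *str)
{
	enum reboot_mode *mode;

	if (!strncmp(str, "panic_", 6)) {	/* e.g. panic_warm, panic_cold */
		mode = &panic_reboot_mode;
		str += 6;
	} else {
		mode = &reboot_mode;
	}

	switch (*str) {
	case 'w': *mode = REBOOT_WARM; break;
	case 'c': *mode = REBOOT_COLD; break;
	case 'h': *mode = REBOOT_HARD; break;
	}
}

int main(void)
{
	parse_token("warm");
	parse_token("panic_cold");
	printf("reboot=%d panic=%d\n", reboot_mode, panic_reboot_mode);
	return 0;
}

Running it prints reboot=1 panic=0: a plain "warm" token touches only reboot_mode, while "panic_cold" lands in panic_reboot_mode.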
diff --git a/kernel/resource.c b/kernel/resource.c index 8c15f846e8ef..158f04ec1d4f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/resource.c | 3 | * linux/kernel/resource.c |
| 3 | * | 4 | * |
diff --git a/kernel/rseq.c b/kernel/rseq.c index 9424ee90589e..27c48eb7de40 100644 --- a/kernel/rseq.c +++ b/kernel/rseq.c | |||
| @@ -277,7 +277,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) | |||
| 277 | 277 | ||
| 278 | error: | 278 | error: |
| 279 | sig = ksig ? ksig->sig : 0; | 279 | sig = ksig ? ksig->sig : 0; |
| 280 | force_sigsegv(sig, t); | 280 | force_sigsegv(sig); |
| 281 | } | 281 | } |
| 282 | 282 | ||
| 283 | #ifdef CONFIG_DEBUG_RSEQ | 283 | #ifdef CONFIG_DEBUG_RSEQ |
| @@ -296,7 +296,7 @@ void rseq_syscall(struct pt_regs *regs) | |||
| 296 | return; | 296 | return; |
| 297 | if (!access_ok(t->rseq, sizeof(*t->rseq)) || | 297 | if (!access_ok(t->rseq, sizeof(*t->rseq)) || |
| 298 | rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) | 298 | rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) |
| 299 | force_sig(SIGSEGV, t); | 299 | force_sig(SIGSEGV); |
| 300 | } | 300 | } |
| 301 | 301 | ||
| 302 | #endif | 302 | #endif |
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2d4ff5353ded..2067080bb235 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c | |||
| @@ -259,7 +259,6 @@ out: | |||
| 259 | } | 259 | } |
| 260 | #endif /* CONFIG_PROC_FS */ | 260 | #endif /* CONFIG_PROC_FS */ |
| 261 | 261 | ||
| 262 | #ifdef CONFIG_SCHED_DEBUG | ||
| 263 | int autogroup_path(struct task_group *tg, char *buf, int buflen) | 262 | int autogroup_path(struct task_group *tg, char *buf, int buflen) |
| 264 | { | 263 | { |
| 265 | if (!task_group_is_autogroup(tg)) | 264 | if (!task_group_is_autogroup(tg)) |
| @@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) | |||
| 267 | 266 | ||
| 268 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | 267 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); |
| 269 | } | 268 | } |
| 270 | #endif | ||
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e3e3b979f9bd..1152259a4ca0 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * sched_clock() for unstable CPU clocks | 3 | * sched_clock() for unstable CPU clocks |
| 3 | * | 4 | * |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 102dfcf0a29a..fa43ce3962e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/sched/core.c | 3 | * kernel/sched/core.c |
| 3 | * | 4 | * |
| @@ -22,6 +23,17 @@ | |||
| 22 | #define CREATE_TRACE_POINTS | 23 | #define CREATE_TRACE_POINTS |
| 23 | #include <trace/events/sched.h> | 24 | #include <trace/events/sched.h> |
| 24 | 25 | ||
| 26 | /* | ||
| 27 | * Export tracepoints that act as a bare tracehook (ie: have no trace event | ||
| 28 | * associated with them) to allow external modules to probe them. | ||
| 29 | */ | ||
| 30 | EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp); | ||
| 31 | EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); | ||
| 32 | EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); | ||
| 33 | EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); | ||
| 34 | EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); | ||
| 35 | EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); | ||
| 36 | |||
| 25 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 37 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
| 26 | 38 | ||
| 27 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) | 39 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) |
| @@ -760,6 +772,401 @@ static void set_load_weight(struct task_struct *p, bool update_load) | |||
| 760 | } | 772 | } |
| 761 | } | 773 | } |
| 762 | 774 | ||
| 775 | #ifdef CONFIG_UCLAMP_TASK | ||
| 776 | /* Max allowed minimum utilization */ | ||
| 777 | unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; | ||
| 778 | |||
| 779 | /* Max allowed maximum utilization */ | ||
| 780 | unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; | ||
| 781 | |||
| 782 | /* All clamps are required to be less or equal than these values */ | ||
| 783 | static struct uclamp_se uclamp_default[UCLAMP_CNT]; | ||
| 784 | |||
| 785 | /* Integer rounded range for each bucket */ | ||
| 786 | #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) | ||
| 787 | |||
| 788 | #define for_each_clamp_id(clamp_id) \ | ||
| 789 | for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) | ||
| 790 | |||
| 791 | static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) | ||
| 792 | { | ||
| 793 | return clamp_value / UCLAMP_BUCKET_DELTA; | ||
| 794 | } | ||
| 795 | |||
| 796 | static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) | ||
| 797 | { | ||
| 798 | return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); | ||
| 799 | } | ||
| 800 | |||
| 801 | static inline unsigned int uclamp_none(int clamp_id) | ||
| 802 | { | ||
| 803 | if (clamp_id == UCLAMP_MIN) | ||
| 804 | return 0; | ||
| 805 | return SCHED_CAPACITY_SCALE; | ||
| 806 | } | ||
| 807 | |||
| 808 | static inline void uclamp_se_set(struct uclamp_se *uc_se, | ||
| 809 | unsigned int value, bool user_defined) | ||
| 810 | { | ||
| 811 | uc_se->value = value; | ||
| 812 | uc_se->bucket_id = uclamp_bucket_id(value); | ||
| 813 | uc_se->user_defined = user_defined; | ||
| 814 | } | ||
| 815 | |||
| 816 | static inline unsigned int | ||
| 817 | uclamp_idle_value(struct rq *rq, unsigned int clamp_id, | ||
| 818 | unsigned int clamp_value) | ||
| 819 | { | ||
| 820 | /* | ||
| 821 | * Avoid blocked utilization pushing up the frequency when we go | ||
| 822 | * idle (which drops the max-clamp) by retaining the last known | ||
| 823 | * max-clamp. | ||
| 824 | */ | ||
| 825 | if (clamp_id == UCLAMP_MAX) { | ||
| 826 | rq->uclamp_flags |= UCLAMP_FLAG_IDLE; | ||
| 827 | return clamp_value; | ||
| 828 | } | ||
| 829 | |||
| 830 | return uclamp_none(UCLAMP_MIN); | ||
| 831 | } | ||
| 832 | |||
| 833 | static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, | ||
| 834 | unsigned int clamp_value) | ||
| 835 | { | ||
| 836 | /* Reset max-clamp retention only on idle exit */ | ||
| 837 | if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) | ||
| 838 | return; | ||
| 839 | |||
| 840 | WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value); | ||
| 841 | } | ||
| 842 | |||
| 843 | static inline | ||
| 844 | unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, | ||
| 845 | unsigned int clamp_value) | ||
| 846 | { | ||
| 847 | struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; | ||
| 848 | int bucket_id = UCLAMP_BUCKETS - 1; | ||
| 849 | |||
| 850 | /* | ||
| 851 | * Since both min and max clamps are max aggregated, find the | ||
| 852 | * top most bucket with tasks in. | ||
| 853 | */ | ||
| 854 | for ( ; bucket_id >= 0; bucket_id--) { | ||
| 855 | if (!bucket[bucket_id].tasks) | ||
| 856 | continue; | ||
| 857 | return bucket[bucket_id].value; | ||
| 858 | } | ||
| 859 | |||
| 860 | /* No tasks -- default clamp values */ | ||
| 861 | return uclamp_idle_value(rq, clamp_id, clamp_value); | ||
| 862 | } | ||
| 863 | |||
| 864 | /* | ||
| 865 | * The effective clamp bucket index of a task depends on, by increasing | ||
| 866 | * priority: | ||
| 867 | * - the task specific clamp value, when explicitly requested from userspace | ||
| 868 | * - the system default clamp value, defined by the sysadmin | ||
| 869 | */ | ||
| 870 | static inline struct uclamp_se | ||
| 871 | uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) | ||
| 872 | { | ||
| 873 | struct uclamp_se uc_req = p->uclamp_req[clamp_id]; | ||
| 874 | struct uclamp_se uc_max = uclamp_default[clamp_id]; | ||
| 875 | |||
| 876 | /* System default restrictions always apply */ | ||
| 877 | if (unlikely(uc_req.value > uc_max.value)) | ||
| 878 | return uc_max; | ||
| 879 | |||
| 880 | return uc_req; | ||
| 881 | } | ||
| 882 | |||
| 883 | unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) | ||
| 884 | { | ||
| 885 | struct uclamp_se uc_eff; | ||
| 886 | |||
| 887 | /* Task currently refcounted: use back-annotated (effective) value */ | ||
| 888 | if (p->uclamp[clamp_id].active) | ||
| 889 | return p->uclamp[clamp_id].value; | ||
| 890 | |||
| 891 | uc_eff = uclamp_eff_get(p, clamp_id); | ||
| 892 | |||
| 893 | return uc_eff.value; | ||
| 894 | } | ||
| 895 | |||
| 896 | /* | ||
| 897 | * When a task is enqueued on a rq, the clamp bucket currently defined by the | ||
| 898 | * task's uclamp::bucket_id is refcounted on that rq. This also immediately | ||
| 899 | * updates the rq's clamp value if required. | ||
| 900 | * | ||
| 901 | * Tasks can have a task-specific value requested from user-space, track | ||
| 902 | * within each bucket the maximum value for tasks refcounted in it. | ||
| 903 | * This "local max aggregation" allows to track the exact "requested" value | ||
| 904 | * for each bucket when all its RUNNABLE tasks require the same clamp. | ||
| 905 | */ | ||
| 906 | static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, | ||
| 907 | unsigned int clamp_id) | ||
| 908 | { | ||
| 909 | struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; | ||
| 910 | struct uclamp_se *uc_se = &p->uclamp[clamp_id]; | ||
| 911 | struct uclamp_bucket *bucket; | ||
| 912 | |||
| 913 | lockdep_assert_held(&rq->lock); | ||
| 914 | |||
| 915 | /* Update task effective clamp */ | ||
| 916 | p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); | ||
| 917 | |||
| 918 | bucket = &uc_rq->bucket[uc_se->bucket_id]; | ||
| 919 | bucket->tasks++; | ||
| 920 | uc_se->active = true; | ||
| 921 | |||
| 922 | uclamp_idle_reset(rq, clamp_id, uc_se->value); | ||
| 923 | |||
| 924 | /* | ||
| 925 | * Local max aggregation: rq buckets always track the max | ||
| 926 | * "requested" clamp value of its RUNNABLE tasks. | ||
| 927 | */ | ||
| 928 | if (bucket->tasks == 1 || uc_se->value > bucket->value) | ||
| 929 | bucket->value = uc_se->value; | ||
| 930 | |||
| 931 | if (uc_se->value > READ_ONCE(uc_rq->value)) | ||
| 932 | WRITE_ONCE(uc_rq->value, uc_se->value); | ||
| 933 | } | ||
| 934 | |||
| 935 | /* | ||
| 936 | * When a task is dequeued from a rq, the clamp bucket refcounted by the task | ||
| 937 | * is released. If this is the last task reference counting the rq's max | ||
| 938 | * active clamp value, then the rq's clamp value is updated. | ||
| 939 | * | ||
| 940 | * Both refcounted tasks and rq's cached clamp values are expected to be | ||
| 941 | * always valid. If it's detected they are not, as defensive programming, | ||
| 942 | * enforce the expected state and warn. | ||
| 943 | */ | ||
| 944 | static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, | ||
| 945 | unsigned int clamp_id) | ||
| 946 | { | ||
| 947 | struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; | ||
| 948 | struct uclamp_se *uc_se = &p->uclamp[clamp_id]; | ||
| 949 | struct uclamp_bucket *bucket; | ||
| 950 | unsigned int bkt_clamp; | ||
| 951 | unsigned int rq_clamp; | ||
| 952 | |||
| 953 | lockdep_assert_held(&rq->lock); | ||
| 954 | |||
| 955 | bucket = &uc_rq->bucket[uc_se->bucket_id]; | ||
| 956 | SCHED_WARN_ON(!bucket->tasks); | ||
| 957 | if (likely(bucket->tasks)) | ||
| 958 | bucket->tasks--; | ||
| 959 | uc_se->active = false; | ||
| 960 | |||
| 961 | /* | ||
| 962 | * Keep "local max aggregation" simple and accept to (possibly) | ||
| 963 | * overboost some RUNNABLE tasks in the same bucket. | ||
| 964 | * The rq clamp bucket value is reset to its base value whenever | ||
| 965 | * there are no more RUNNABLE tasks refcounting it. | ||
| 966 | */ | ||
| 967 | if (likely(bucket->tasks)) | ||
| 968 | return; | ||
| 969 | |||
| 970 | rq_clamp = READ_ONCE(uc_rq->value); | ||
| 971 | /* | ||
| 972 | * Defensive programming: this should never happen. If it happens, | ||
| 973 | * e.g. due to future modification, warn and fixup the expected value. | ||
| 974 | */ | ||
| 975 | SCHED_WARN_ON(bucket->value > rq_clamp); | ||
| 976 | if (bucket->value >= rq_clamp) { | ||
| 977 | bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); | ||
| 978 | WRITE_ONCE(uc_rq->value, bkt_clamp); | ||
| 979 | } | ||
| 980 | } | ||
| 981 | |||
| 982 | static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) | ||
| 983 | { | ||
| 984 | unsigned int clamp_id; | ||
| 985 | |||
| 986 | if (unlikely(!p->sched_class->uclamp_enabled)) | ||
| 987 | return; | ||
| 988 | |||
| 989 | for_each_clamp_id(clamp_id) | ||
| 990 | uclamp_rq_inc_id(rq, p, clamp_id); | ||
| 991 | |||
| 992 | /* Reset clamp idle holding when there is one RUNNABLE task */ | ||
| 993 | if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) | ||
| 994 | rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; | ||
| 995 | } | ||
| 996 | |||
| 997 | static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) | ||
| 998 | { | ||
| 999 | unsigned int clamp_id; | ||
| 1000 | |||
| 1001 | if (unlikely(!p->sched_class->uclamp_enabled)) | ||
| 1002 | return; | ||
| 1003 | |||
| 1004 | for_each_clamp_id(clamp_id) | ||
| 1005 | uclamp_rq_dec_id(rq, p, clamp_id); | ||
| 1006 | } | ||
| 1007 | |||
| 1008 | int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, | ||
| 1009 | void __user *buffer, size_t *lenp, | ||
| 1010 | loff_t *ppos) | ||
| 1011 | { | ||
| 1012 | int old_min, old_max; | ||
| 1013 | static DEFINE_MUTEX(mutex); | ||
| 1014 | int result; | ||
| 1015 | |||
| 1016 | mutex_lock(&mutex); | ||
| 1017 | old_min = sysctl_sched_uclamp_util_min; | ||
| 1018 | old_max = sysctl_sched_uclamp_util_max; | ||
| 1019 | |||
| 1020 | result = proc_dointvec(table, write, buffer, lenp, ppos); | ||
| 1021 | if (result) | ||
| 1022 | goto undo; | ||
| 1023 | if (!write) | ||
| 1024 | goto done; | ||
| 1025 | |||
| 1026 | if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || | ||
| 1027 | sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { | ||
| 1028 | result = -EINVAL; | ||
| 1029 | goto undo; | ||
| 1030 | } | ||
| 1031 | |||
| 1032 | if (old_min != sysctl_sched_uclamp_util_min) { | ||
| 1033 | uclamp_se_set(&uclamp_default[UCLAMP_MIN], | ||
| 1034 | sysctl_sched_uclamp_util_min, false); | ||
| 1035 | } | ||
| 1036 | if (old_max != sysctl_sched_uclamp_util_max) { | ||
| 1037 | uclamp_se_set(&uclamp_default[UCLAMP_MAX], | ||
| 1038 | sysctl_sched_uclamp_util_max, false); | ||
| 1039 | } | ||
| 1040 | |||
| 1041 | /* | ||
| 1042 | * Updating all the RUNNABLE task is expensive, keep it simple and do | ||
| 1043 | * just a lazy update at each next enqueue time. | ||
| 1044 | */ | ||
| 1045 | goto done; | ||
| 1046 | |||
| 1047 | undo: | ||
| 1048 | sysctl_sched_uclamp_util_min = old_min; | ||
| 1049 | sysctl_sched_uclamp_util_max = old_max; | ||
| 1050 | done: | ||
| 1051 | mutex_unlock(&mutex); | ||
| 1052 | |||
| 1053 | return result; | ||
| 1054 | } | ||
| 1055 | |||
| 1056 | static int uclamp_validate(struct task_struct *p, | ||
| 1057 | const struct sched_attr *attr) | ||
| 1058 | { | ||
| 1059 | unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; | ||
| 1060 | unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; | ||
| 1061 | |||
| 1062 | if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) | ||
| 1063 | lower_bound = attr->sched_util_min; | ||
| 1064 | if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) | ||
| 1065 | upper_bound = attr->sched_util_max; | ||
| 1066 | |||
| 1067 | if (lower_bound > upper_bound) | ||
| 1068 | return -EINVAL; | ||
| 1069 | if (upper_bound > SCHED_CAPACITY_SCALE) | ||
| 1070 | return -EINVAL; | ||
| 1071 | |||
| 1072 | return 0; | ||
| 1073 | } | ||
| 1074 | |||
| 1075 | static void __setscheduler_uclamp(struct task_struct *p, | ||
| 1076 | const struct sched_attr *attr) | ||
| 1077 | { | ||
| 1078 | unsigned int clamp_id; | ||
| 1079 | |||
| 1080 | /* | ||
| 1081 | * On scheduling class change, reset to default clamps for tasks | ||
| 1082 | * without a task-specific value. | ||
| 1083 | */ | ||
| 1084 | for_each_clamp_id(clamp_id) { | ||
| 1085 | struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; | ||
| 1086 | unsigned int clamp_value = uclamp_none(clamp_id); | ||
| 1087 | |||
| 1088 | /* Keep using defined clamps across class changes */ | ||
| 1089 | if (uc_se->user_defined) | ||
| 1090 | continue; | ||
| 1091 | |||
| 1092 | /* By default, RT tasks always get 100% boost */ | ||
| 1093 | if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) | ||
| 1094 | clamp_value = uclamp_none(UCLAMP_MAX); | ||
| 1095 | |||
| 1096 | uclamp_se_set(uc_se, clamp_value, false); | ||
| 1097 | } | ||
| 1098 | |||
| 1099 | if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) | ||
| 1100 | return; | ||
| 1101 | |||
| 1102 | if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { | ||
| 1103 | uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], | ||
| 1104 | attr->sched_util_min, true); | ||
| 1105 | } | ||
| 1106 | |||
| 1107 | if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { | ||
| 1108 | uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], | ||
| 1109 | attr->sched_util_max, true); | ||
| 1110 | } | ||
| 1111 | } | ||
| 1112 | |||
| 1113 | static void uclamp_fork(struct task_struct *p) | ||
| 1114 | { | ||
| 1115 | unsigned int clamp_id; | ||
| 1116 | |||
| 1117 | for_each_clamp_id(clamp_id) | ||
| 1118 | p->uclamp[clamp_id].active = false; | ||
| 1119 | |||
| 1120 | if (likely(!p->sched_reset_on_fork)) | ||
| 1121 | return; | ||
| 1122 | |||
| 1123 | for_each_clamp_id(clamp_id) { | ||
| 1124 | unsigned int clamp_value = uclamp_none(clamp_id); | ||
| 1125 | |||
| 1126 | /* By default, RT tasks always get 100% boost */ | ||
| 1127 | if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) | ||
| 1128 | clamp_value = uclamp_none(UCLAMP_MAX); | ||
| 1129 | |||
| 1130 | uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false); | ||
| 1131 | } | ||
| 1132 | } | ||
| 1133 | |||
| 1134 | static void __init init_uclamp(void) | ||
| 1135 | { | ||
| 1136 | struct uclamp_se uc_max = {}; | ||
| 1137 | unsigned int clamp_id; | ||
| 1138 | int cpu; | ||
| 1139 | |||
| 1140 | for_each_possible_cpu(cpu) { | ||
| 1141 | memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); | ||
| 1142 | cpu_rq(cpu)->uclamp_flags = 0; | ||
| 1143 | } | ||
| 1144 | |||
| 1145 | for_each_clamp_id(clamp_id) { | ||
| 1146 | uclamp_se_set(&init_task.uclamp_req[clamp_id], | ||
| 1147 | uclamp_none(clamp_id), false); | ||
| 1148 | } | ||
| 1149 | |||
| 1150 | /* System defaults allow max clamp values for both indexes */ | ||
| 1151 | uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); | ||
| 1152 | for_each_clamp_id(clamp_id) | ||
| 1153 | uclamp_default[clamp_id] = uc_max; | ||
| 1154 | } | ||
| 1155 | |||
| 1156 | #else /* CONFIG_UCLAMP_TASK */ | ||
| 1157 | static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } | ||
| 1158 | static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } | ||
| 1159 | static inline int uclamp_validate(struct task_struct *p, | ||
| 1160 | const struct sched_attr *attr) | ||
| 1161 | { | ||
| 1162 | return -EOPNOTSUPP; | ||
| 1163 | } | ||
| 1164 | static void __setscheduler_uclamp(struct task_struct *p, | ||
| 1165 | const struct sched_attr *attr) { } | ||
| 1166 | static inline void uclamp_fork(struct task_struct *p) { } | ||
| 1167 | static inline void init_uclamp(void) { } | ||
| 1168 | #endif /* CONFIG_UCLAMP_TASK */ | ||
| 1169 | |||
| 763 | static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 1170 | static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
| 764 | { | 1171 | { |
| 765 | if (!(flags & ENQUEUE_NOCLOCK)) | 1172 | if (!(flags & ENQUEUE_NOCLOCK)) |
| @@ -770,6 +1177,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 770 | psi_enqueue(p, flags & ENQUEUE_WAKEUP); | 1177 | psi_enqueue(p, flags & ENQUEUE_WAKEUP); |
| 771 | } | 1178 | } |
| 772 | 1179 | ||
| 1180 | uclamp_rq_inc(rq, p); | ||
| 773 | p->sched_class->enqueue_task(rq, p, flags); | 1181 | p->sched_class->enqueue_task(rq, p, flags); |
| 774 | } | 1182 | } |
| 775 | 1183 | ||
| @@ -783,6 +1191,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 783 | psi_dequeue(p, flags & DEQUEUE_SLEEP); | 1191 | psi_dequeue(p, flags & DEQUEUE_SLEEP); |
| 784 | } | 1192 | } |
| 785 | 1193 | ||
| 1194 | uclamp_rq_dec(rq, p); | ||
| 786 | p->sched_class->dequeue_task(rq, p, flags); | 1195 | p->sched_class->dequeue_task(rq, p, flags); |
| 787 | } | 1196 | } |
| 788 | 1197 | ||
| @@ -929,7 +1338,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) | |||
| 929 | */ | 1338 | */ |
| 930 | static inline bool is_cpu_allowed(struct task_struct *p, int cpu) | 1339 | static inline bool is_cpu_allowed(struct task_struct *p, int cpu) |
| 931 | { | 1340 | { |
| 932 | if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) | 1341 | if (!cpumask_test_cpu(cpu, p->cpus_ptr)) |
| 933 | return false; | 1342 | return false; |
| 934 | 1343 | ||
| 935 | if (is_per_cpu_kthread(p)) | 1344 | if (is_per_cpu_kthread(p)) |
| @@ -1024,7 +1433,7 @@ static int migration_cpu_stop(void *data) | |||
| 1024 | local_irq_disable(); | 1433 | local_irq_disable(); |
| 1025 | /* | 1434 | /* |
| 1026 | * We need to explicitly wake pending tasks before running | 1435 | * We need to explicitly wake pending tasks before running |
| 1027 | * __migrate_task() such that we will not miss enforcing cpus_allowed | 1436 | * __migrate_task() such that we will not miss enforcing cpus_ptr |
| 1028 | * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. | 1437 | * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. |
| 1029 | */ | 1438 | */ |
| 1030 | sched_ttwu_pending(); | 1439 | sched_ttwu_pending(); |
| @@ -1055,7 +1464,7 @@ static int migration_cpu_stop(void *data) | |||
| 1055 | */ | 1464 | */ |
| 1056 | void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) | 1465 | void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) |
| 1057 | { | 1466 | { |
| 1058 | cpumask_copy(&p->cpus_allowed, new_mask); | 1467 | cpumask_copy(&p->cpus_mask, new_mask); |
| 1059 | p->nr_cpus_allowed = cpumask_weight(new_mask); | 1468 | p->nr_cpus_allowed = cpumask_weight(new_mask); |
| 1060 | } | 1469 | } |
| 1061 | 1470 | ||
| @@ -1125,7 +1534,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, | |||
| 1125 | goto out; | 1534 | goto out; |
| 1126 | } | 1535 | } |
| 1127 | 1536 | ||
| 1128 | if (cpumask_equal(&p->cpus_allowed, new_mask)) | 1537 | if (cpumask_equal(p->cpus_ptr, new_mask)) |
| 1129 | goto out; | 1538 | goto out; |
| 1130 | 1539 | ||
| 1131 | if (!cpumask_intersects(new_mask, cpu_valid_mask)) { | 1540 | if (!cpumask_intersects(new_mask, cpu_valid_mask)) { |
| @@ -1285,10 +1694,10 @@ static int migrate_swap_stop(void *data) | |||
| 1285 | if (task_cpu(arg->src_task) != arg->src_cpu) | 1694 | if (task_cpu(arg->src_task) != arg->src_cpu) |
| 1286 | goto unlock; | 1695 | goto unlock; |
| 1287 | 1696 | ||
| 1288 | if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) | 1697 | if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) |
| 1289 | goto unlock; | 1698 | goto unlock; |
| 1290 | 1699 | ||
| 1291 | if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) | 1700 | if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) |
| 1292 | goto unlock; | 1701 | goto unlock; |
| 1293 | 1702 | ||
| 1294 | __migrate_swap_task(arg->src_task, arg->dst_cpu); | 1703 | __migrate_swap_task(arg->src_task, arg->dst_cpu); |
| @@ -1330,10 +1739,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, | |||
| 1330 | if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) | 1739 | if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) |
| 1331 | goto out; | 1740 | goto out; |
| 1332 | 1741 | ||
| 1333 | if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) | 1742 | if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) |
| 1334 | goto out; | 1743 | goto out; |
| 1335 | 1744 | ||
| 1336 | if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) | 1745 | if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) |
| 1337 | goto out; | 1746 | goto out; |
| 1338 | 1747 | ||
| 1339 | trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); | 1748 | trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); |
| @@ -1478,7 +1887,7 @@ void kick_process(struct task_struct *p) | |||
| 1478 | EXPORT_SYMBOL_GPL(kick_process); | 1887 | EXPORT_SYMBOL_GPL(kick_process); |
| 1479 | 1888 | ||
| 1480 | /* | 1889 | /* |
| 1481 | * ->cpus_allowed is protected by both rq->lock and p->pi_lock | 1890 | * ->cpus_ptr is protected by both rq->lock and p->pi_lock |
| 1482 | * | 1891 | * |
| 1483 | * A few notes on cpu_active vs cpu_online: | 1892 | * A few notes on cpu_active vs cpu_online: |
| 1484 | * | 1893 | * |
| @@ -1518,14 +1927,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
| 1518 | for_each_cpu(dest_cpu, nodemask) { | 1927 | for_each_cpu(dest_cpu, nodemask) { |
| 1519 | if (!cpu_active(dest_cpu)) | 1928 | if (!cpu_active(dest_cpu)) |
| 1520 | continue; | 1929 | continue; |
| 1521 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 1930 | if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) |
| 1522 | return dest_cpu; | 1931 | return dest_cpu; |
| 1523 | } | 1932 | } |
| 1524 | } | 1933 | } |
| 1525 | 1934 | ||
| 1526 | for (;;) { | 1935 | for (;;) { |
| 1527 | /* Any allowed, online CPU? */ | 1936 | /* Any allowed, online CPU? */ |
| 1528 | for_each_cpu(dest_cpu, &p->cpus_allowed) { | 1937 | for_each_cpu(dest_cpu, p->cpus_ptr) { |
| 1529 | if (!is_cpu_allowed(p, dest_cpu)) | 1938 | if (!is_cpu_allowed(p, dest_cpu)) |
| 1530 | continue; | 1939 | continue; |
| 1531 | 1940 | ||
| @@ -1569,7 +1978,7 @@ out: | |||
| 1569 | } | 1978 | } |
| 1570 | 1979 | ||
| 1571 | /* | 1980 | /* |
| 1572 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. | 1981 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. |
| 1573 | */ | 1982 | */ |
| 1574 | static inline | 1983 | static inline |
| 1575 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) | 1984 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) |
| @@ -1579,11 +1988,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) | |||
| 1579 | if (p->nr_cpus_allowed > 1) | 1988 | if (p->nr_cpus_allowed > 1) |
| 1580 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); | 1989 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); |
| 1581 | else | 1990 | else |
| 1582 | cpu = cpumask_any(&p->cpus_allowed); | 1991 | cpu = cpumask_any(p->cpus_ptr); |
| 1583 | 1992 | ||
| 1584 | /* | 1993 | /* |
| 1585 | * In order not to call set_task_cpu() on a blocking task we need | 1994 | * In order not to call set_task_cpu() on a blocking task we need |
| 1586 | * to rely on ttwu() to place the task on a valid ->cpus_allowed | 1995 | * to rely on ttwu() to place the task on a valid ->cpus_ptr |
| 1587 | * CPU. | 1996 | * CPU. |
| 1588 | * | 1997 | * |
| 1589 | * Since this is common to all placement strategies, this lives here. | 1998 | * Since this is common to all placement strategies, this lives here. |
| @@ -1990,6 +2399,29 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 1990 | unsigned long flags; | 2399 | unsigned long flags; |
| 1991 | int cpu, success = 0; | 2400 | int cpu, success = 0; |
| 1992 | 2401 | ||
| 2402 | if (p == current) { | ||
| 2403 | /* | ||
| 2404 | * We're waking current, this means 'p->on_rq' and 'task_cpu(p) | ||
| 2405 | * == smp_processor_id()'. Together this means we can special | ||
| 2406 | * case the whole 'p->on_rq && ttwu_remote()' case below | ||
| 2407 | * without taking any locks. | ||
| 2408 | * | ||
| 2409 | * In particular: | ||
| 2410 | * - we rely on Program-Order guarantees for all the ordering, | ||
| 2411 | * - we're serialized against set_special_state() by virtue of | ||
| 2412 | * it disabling IRQs (this allows not taking ->pi_lock). | ||
| 2413 | */ | ||
| 2414 | if (!(p->state & state)) | ||
| 2415 | return false; | ||
| 2416 | |||
| 2417 | success = 1; | ||
| 2418 | cpu = task_cpu(p); | ||
| 2419 | trace_sched_waking(p); | ||
| 2420 | p->state = TASK_RUNNING; | ||
| 2421 | trace_sched_wakeup(p); | ||
| 2422 | goto out; | ||
| 2423 | } | ||
| 2424 | |||
| 1993 | /* | 2425 | /* |
| 1994 | * If we are going to wake up a thread waiting for CONDITION we | 2426 | * If we are going to wake up a thread waiting for CONDITION we |
| 1995 | * need to ensure that CONDITION=1 done by the caller can not be | 2427 | * need to ensure that CONDITION=1 done by the caller can not be |
| @@ -1999,7 +2431,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 1999 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2431 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 2000 | smp_mb__after_spinlock(); | 2432 | smp_mb__after_spinlock(); |
| 2001 | if (!(p->state & state)) | 2433 | if (!(p->state & state)) |
| 2002 | goto out; | 2434 | goto unlock; |
| 2003 | 2435 | ||
| 2004 | trace_sched_waking(p); | 2436 | trace_sched_waking(p); |
| 2005 | 2437 | ||
| @@ -2029,7 +2461,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 2029 | */ | 2461 | */ |
| 2030 | smp_rmb(); | 2462 | smp_rmb(); |
| 2031 | if (p->on_rq && ttwu_remote(p, wake_flags)) | 2463 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
| 2032 | goto stat; | 2464 | goto unlock; |
| 2033 | 2465 | ||
| 2034 | #ifdef CONFIG_SMP | 2466 | #ifdef CONFIG_SMP |
| 2035 | /* | 2467 | /* |
| @@ -2089,10 +2521,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 2089 | #endif /* CONFIG_SMP */ | 2521 | #endif /* CONFIG_SMP */ |
| 2090 | 2522 | ||
| 2091 | ttwu_queue(p, cpu, wake_flags); | 2523 | ttwu_queue(p, cpu, wake_flags); |
| 2092 | stat: | 2524 | unlock: |
| 2093 | ttwu_stat(p, cpu, wake_flags); | ||
| 2094 | out: | ||
| 2095 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2525 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 2526 | out: | ||
| 2527 | if (success) | ||
| 2528 | ttwu_stat(p, cpu, wake_flags); | ||
| 2096 | 2529 | ||
| 2097 | return success; | 2530 | return success; |
| 2098 | } | 2531 | } |
| @@ -2299,6 +2732,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
| 2299 | */ | 2732 | */ |
| 2300 | p->prio = current->normal_prio; | 2733 | p->prio = current->normal_prio; |
| 2301 | 2734 | ||
| 2735 | uclamp_fork(p); | ||
| 2736 | |||
| 2302 | /* | 2737 | /* |
| 2303 | * Revert to default priority/policy on fork if requested. | 2738 | * Revert to default priority/policy on fork if requested. |
| 2304 | */ | 2739 | */ |
| @@ -2394,7 +2829,7 @@ void wake_up_new_task(struct task_struct *p) | |||
| 2394 | #ifdef CONFIG_SMP | 2829 | #ifdef CONFIG_SMP |
| 2395 | /* | 2830 | /* |
| 2396 | * Fork balancing, do it here and not earlier because: | 2831 | * Fork balancing, do it here and not earlier because: |
| 2397 | * - cpus_allowed can change in the fork path | 2832 | * - cpus_ptr can change in the fork path |
| 2398 | * - any previously selected CPU might disappear through hotplug | 2833 | * - any previously selected CPU might disappear through hotplug |
| 2399 | * | 2834 | * |
| 2400 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, | 2835 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, |
| @@ -3032,7 +3467,6 @@ void scheduler_tick(void) | |||
| 3032 | 3467 | ||
| 3033 | update_rq_clock(rq); | 3468 | update_rq_clock(rq); |
| 3034 | curr->sched_class->task_tick(rq, curr, 0); | 3469 | curr->sched_class->task_tick(rq, curr, 0); |
| 3035 | cpu_load_update_active(rq); | ||
| 3036 | calc_global_load_tick(rq); | 3470 | calc_global_load_tick(rq); |
| 3037 | psi_task_tick(rq); | 3471 | psi_task_tick(rq); |
| 3038 | 3472 | ||
| @@ -4070,6 +4504,13 @@ static void __setscheduler_params(struct task_struct *p, | |||
| 4070 | static void __setscheduler(struct rq *rq, struct task_struct *p, | 4504 | static void __setscheduler(struct rq *rq, struct task_struct *p, |
| 4071 | const struct sched_attr *attr, bool keep_boost) | 4505 | const struct sched_attr *attr, bool keep_boost) |
| 4072 | { | 4506 | { |
| 4507 | /* | ||
| 4508 | * If params can't change scheduling class changes aren't allowed | ||
| 4509 | * either. | ||
| 4510 | */ | ||
| 4511 | if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) | ||
| 4512 | return; | ||
| 4513 | |||
| 4073 | __setscheduler_params(p, attr); | 4514 | __setscheduler_params(p, attr); |
| 4074 | 4515 | ||
| 4075 | /* | 4516 | /* |
| @@ -4207,6 +4648,13 @@ recheck: | |||
| 4207 | return retval; | 4648 | return retval; |
| 4208 | } | 4649 | } |
| 4209 | 4650 | ||
| 4651 | /* Update task specific "requested" clamps */ | ||
| 4652 | if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { | ||
| 4653 | retval = uclamp_validate(p, attr); | ||
| 4654 | if (retval) | ||
| 4655 | return retval; | ||
| 4656 | } | ||
| 4657 | |||
| 4210 | /* | 4658 | /* |
| 4211 | * Make sure no PI-waiters arrive (or leave) while we are | 4659 | * Make sure no PI-waiters arrive (or leave) while we are |
| 4212 | * changing the priority of the task: | 4660 | * changing the priority of the task: |
| @@ -4236,6 +4684,8 @@ recheck: | |||
| 4236 | goto change; | 4684 | goto change; |
| 4237 | if (dl_policy(policy) && dl_param_changed(p, attr)) | 4685 | if (dl_policy(policy) && dl_param_changed(p, attr)) |
| 4238 | goto change; | 4686 | goto change; |
| 4687 | if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) | ||
| 4688 | goto change; | ||
| 4239 | 4689 | ||
| 4240 | p->sched_reset_on_fork = reset_on_fork; | 4690 | p->sched_reset_on_fork = reset_on_fork; |
| 4241 | task_rq_unlock(rq, p, &rf); | 4691 | task_rq_unlock(rq, p, &rf); |
| @@ -4266,7 +4716,7 @@ change: | |||
| 4266 | * the entire root_domain to become SCHED_DEADLINE. We | 4716 | * the entire root_domain to become SCHED_DEADLINE. We |
| 4267 | * will also fail if there's no bandwidth available. | 4717 | * will also fail if there's no bandwidth available. |
| 4268 | */ | 4718 | */ |
| 4269 | if (!cpumask_subset(span, &p->cpus_allowed) || | 4719 | if (!cpumask_subset(span, p->cpus_ptr) || |
| 4270 | rq->rd->dl_bw.bw == 0) { | 4720 | rq->rd->dl_bw.bw == 0) { |
| 4271 | task_rq_unlock(rq, p, &rf); | 4721 | task_rq_unlock(rq, p, &rf); |
| 4272 | return -EPERM; | 4722 | return -EPERM; |
| @@ -4316,7 +4766,9 @@ change: | |||
| 4316 | put_prev_task(rq, p); | 4766 | put_prev_task(rq, p); |
| 4317 | 4767 | ||
| 4318 | prev_class = p->sched_class; | 4768 | prev_class = p->sched_class; |
| 4769 | |||
| 4319 | __setscheduler(rq, p, attr, pi); | 4770 | __setscheduler(rq, p, attr, pi); |
| 4771 | __setscheduler_uclamp(p, attr); | ||
| 4320 | 4772 | ||
| 4321 | if (queued) { | 4773 | if (queued) { |
| 4322 | /* | 4774 | /* |
| @@ -4492,6 +4944,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a | |||
| 4492 | if (ret) | 4944 | if (ret) |
| 4493 | return -EFAULT; | 4945 | return -EFAULT; |
| 4494 | 4946 | ||
| 4947 | if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && | ||
| 4948 | size < SCHED_ATTR_SIZE_VER1) | ||
| 4949 | return -EINVAL; | ||
| 4950 | |||
| 4495 | /* | 4951 | /* |
| 4496 | * XXX: Do we want to be lenient like existing syscalls; or do we want | 4952 | * XXX: Do we want to be lenient like existing syscalls; or do we want |
| 4497 | * to be strict and return an error on out-of-bounds values? | 4953 | * to be strict and return an error on out-of-bounds values? |
| @@ -4555,14 +5011,21 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, | |||
| 4555 | 5011 | ||
| 4556 | if ((int)attr.sched_policy < 0) | 5012 | if ((int)attr.sched_policy < 0) |
| 4557 | return -EINVAL; | 5013 | return -EINVAL; |
| 5014 | if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) | ||
| 5015 | attr.sched_policy = SETPARAM_POLICY; | ||
| 4558 | 5016 | ||
| 4559 | rcu_read_lock(); | 5017 | rcu_read_lock(); |
| 4560 | retval = -ESRCH; | 5018 | retval = -ESRCH; |
| 4561 | p = find_process_by_pid(pid); | 5019 | p = find_process_by_pid(pid); |
| 4562 | if (p != NULL) | 5020 | if (likely(p)) |
| 4563 | retval = sched_setattr(p, &attr); | 5021 | get_task_struct(p); |
| 4564 | rcu_read_unlock(); | 5022 | rcu_read_unlock(); |
| 4565 | 5023 | ||
| 5024 | if (likely(p)) { | ||
| 5025 | retval = sched_setattr(p, &attr); | ||
| 5026 | put_task_struct(p); | ||
| 5027 | } | ||
| 5028 | |||
| 4566 | return retval; | 5029 | return retval; |
| 4567 | } | 5030 | } |
| 4568 | 5031 | ||
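The sched_setattr() syscall hunk just above switches to pinning the task with get_task_struct() while still under rcu_read_lock(), then calling sched_setattr() only after leaving the RCU read-side section. A reduced sketch of that pin-then-operate pattern, with with_pinned_task() and fn() standing in for the syscall body:

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/task.h>

static long with_pinned_task(pid_t pid, long (*fn)(struct task_struct *))
{
	struct task_struct *p;
	long ret = -ESRCH;

	rcu_read_lock();
	p = find_task_by_vpid(pid);
	if (p)
		get_task_struct(p);	/* cannot be freed once pinned */
	rcu_read_unlock();

	if (p) {
		ret = fn(p);		/* may block; reference still held */
		put_task_struct(p);
	}
	return ret;
}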
| @@ -4713,6 +5176,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, | |||
| 4713 | else | 5176 | else |
| 4714 | attr.sched_nice = task_nice(p); | 5177 | attr.sched_nice = task_nice(p); |
| 4715 | 5178 | ||
| 5179 | #ifdef CONFIG_UCLAMP_TASK | ||
| 5180 | attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; | ||
| 5181 | attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; | ||
| 5182 | #endif | ||
| 5183 | |||
| 4716 | rcu_read_unlock(); | 5184 | rcu_read_unlock(); |
| 4717 | 5185 | ||
| 4718 | retval = sched_read_attr(uattr, &attr, size); | 5186 | retval = sched_read_attr(uattr, &attr, size); |
| @@ -4865,7 +5333,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
| 4865 | goto out_unlock; | 5333 | goto out_unlock; |
| 4866 | 5334 | ||
| 4867 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 5335 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
| 4868 | cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); | 5336 | cpumask_and(mask, &p->cpus_mask, cpu_active_mask); |
| 4869 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 5337 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
| 4870 | 5338 | ||
| 4871 | out_unlock: | 5339 | out_unlock: |
| @@ -5122,7 +5590,7 @@ long __sched io_schedule_timeout(long timeout) | |||
| 5122 | } | 5590 | } |
| 5123 | EXPORT_SYMBOL(io_schedule_timeout); | 5591 | EXPORT_SYMBOL(io_schedule_timeout); |
| 5124 | 5592 | ||
| 5125 | void io_schedule(void) | 5593 | void __sched io_schedule(void) |
| 5126 | { | 5594 | { |
| 5127 | int token; | 5595 | int token; |
| 5128 | 5596 | ||
| @@ -5442,7 +5910,7 @@ int task_can_attach(struct task_struct *p, | |||
| 5442 | * allowed nodes is unnecessary. Thus, cpusets are not | 5910 | * allowed nodes is unnecessary. Thus, cpusets are not |
| 5443 | * applicable for such threads. This prevents checking for | 5911 | * applicable for such threads. This prevents checking for |
| 5444 | * success of set_cpus_allowed_ptr() on all attached tasks | 5912 | * success of set_cpus_allowed_ptr() on all attached tasks |
| 5445 | * before cpus_allowed may be changed. | 5913 | * before cpus_mask may be changed. |
| 5446 | */ | 5914 | */ |
| 5447 | if (p->flags & PF_NO_SETAFFINITY) { | 5915 | if (p->flags & PF_NO_SETAFFINITY) { |
| 5448 | ret = -EINVAL; | 5916 | ret = -EINVAL; |
| @@ -5469,7 +5937,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) | |||
| 5469 | if (curr_cpu == target_cpu) | 5937 | if (curr_cpu == target_cpu) |
| 5470 | return 0; | 5938 | return 0; |
| 5471 | 5939 | ||
| 5472 | if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) | 5940 | if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) |
| 5473 | return -EINVAL; | 5941 | return -EINVAL; |
| 5474 | 5942 | ||
| 5475 | /* TODO: This is not properly updating schedstats */ | 5943 | /* TODO: This is not properly updating schedstats */ |
| @@ -5607,7 +6075,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) | |||
| 5607 | put_prev_task(rq, next); | 6075 | put_prev_task(rq, next); |
| 5608 | 6076 | ||
| 5609 | /* | 6077 | /* |
| 5610 | * Rules for changing task_struct::cpus_allowed are holding | 6078 | * Rules for changing task_struct::cpus_mask are holding |
| 5611 | * both pi_lock and rq->lock, such that holding either | 6079 | * both pi_lock and rq->lock, such that holding either |
| 5612 | * stabilizes the mask. | 6080 | * stabilizes the mask. |
| 5613 | * | 6081 | * |
| @@ -5901,8 +6369,8 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); | |||
| 5901 | 6369 | ||
| 5902 | void __init sched_init(void) | 6370 | void __init sched_init(void) |
| 5903 | { | 6371 | { |
| 5904 | int i, j; | ||
| 5905 | unsigned long alloc_size = 0, ptr; | 6372 | unsigned long alloc_size = 0, ptr; |
| 6373 | int i; | ||
| 5906 | 6374 | ||
| 5907 | wait_bit_init(); | 6375 | wait_bit_init(); |
| 5908 | 6376 | ||
| @@ -6004,10 +6472,6 @@ void __init sched_init(void) | |||
| 6004 | #ifdef CONFIG_RT_GROUP_SCHED | 6472 | #ifdef CONFIG_RT_GROUP_SCHED |
| 6005 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); | 6473 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); |
| 6006 | #endif | 6474 | #endif |
| 6007 | |||
| 6008 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | ||
| 6009 | rq->cpu_load[j] = 0; | ||
| 6010 | |||
| 6011 | #ifdef CONFIG_SMP | 6475 | #ifdef CONFIG_SMP |
| 6012 | rq->sd = NULL; | 6476 | rq->sd = NULL; |
| 6013 | rq->rd = NULL; | 6477 | rq->rd = NULL; |
| @@ -6062,6 +6526,8 @@ void __init sched_init(void) | |||
| 6062 | 6526 | ||
| 6063 | psi_init(); | 6527 | psi_init(); |
| 6064 | 6528 | ||
| 6529 | init_uclamp(); | ||
| 6530 | |||
| 6065 | scheduler_running = 1; | 6531 | scheduler_running = 1; |
| 6066 | } | 6532 | } |
| 6067 | 6533 | ||
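Most of the new core.c code above implements utilization clamping by refcounting each RUNNABLE task in one of UCLAMP_BUCKETS buckets per rq and clamp index, with the rq-wide clamp tracked as the maximum over busy buckets. The standalone demo below mirrors only that arithmetic (bucket index, local max aggregation, topmost-busy-bucket lookup); SCALE and BUCKETS stand in for SCHED_CAPACITY_SCALE and the Kconfig-tunable UCLAMP_BUCKETS (5 assumed here), and the rest is illustrative rather than kernel code.

#include <stdio.h>

#define SCALE	1024				/* SCHED_CAPACITY_SCALE */
#define BUCKETS	5				/* UCLAMP_BUCKETS (assumed) */
#define DELTA	((SCALE + BUCKETS / 2) / BUCKETS)	/* DIV_ROUND_CLOSEST */

struct bucket { unsigned int tasks, value; };

static unsigned int bucket_id(unsigned int clamp)
{
	return clamp / DELTA;
}

static void enqueue(struct bucket *b, unsigned int clamp)
{
	struct bucket *bkt = &b[bucket_id(clamp)];

	bkt->tasks++;
	if (bkt->tasks == 1 || clamp > bkt->value)	/* local max aggregation */
		bkt->value = clamp;
}

static unsigned int rq_clamp(const struct bucket *b)
{
	for (int i = BUCKETS - 1; i >= 0; i--)		/* topmost busy bucket */
		if (b[i].tasks)
			return b[i].value;
	return 0;					/* idle: minimum clamp */
}

int main(void)
{
	struct bucket b[BUCKETS] = { { 0 } };

	enqueue(b, 80);
	enqueue(b, 300);
	enqueue(b, 270);
	printf("bucket_id(300)=%u rq clamp=%u\n", bucket_id(300), rq_clamp(b));
	return 0;
}

The same topmost-busy-bucket lookup is what uclamp_rq_dec_id() falls back to once a bucket's task count drops to zero, which is why a transiently over-boosted bucket value is tolerated in between.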
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 50316455ea66..5cc4012572ec 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
| @@ -1,14 +1,10 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/sched/cpudl.c | 3 | * kernel/sched/cpudl.c |
| 3 | * | 4 | * |
| 4 | * Global CPU deadline management | 5 | * Global CPU deadline management |
| 5 | * | 6 | * |
| 6 | * Author: Juri Lelli <j.lelli@sssup.it> | 7 | * Author: Juri Lelli <j.lelli@sssup.it> |
| 7 | * | ||
| 8 | * This program is free software; you can redistribute it and/or | ||
| 9 | * modify it under the terms of the GNU General Public License | ||
| 10 | * as published by the Free Software Foundation; version 2 | ||
| 11 | * of the License. | ||
| 12 | */ | 8 | */ |
| 13 | #include "sched.h" | 9 | #include "sched.h" |
| 14 | 10 | ||
| @@ -124,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
| 124 | const struct sched_dl_entity *dl_se = &p->dl; | 120 | const struct sched_dl_entity *dl_se = &p->dl; |
| 125 | 121 | ||
| 126 | if (later_mask && | 122 | if (later_mask && |
| 127 | cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { | 123 | cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { |
| 128 | return 1; | 124 | return 1; |
| 129 | } else { | 125 | } else { |
| 130 | int best_cpu = cpudl_maximum(cp); | 126 | int best_cpu = cpudl_maximum(cp); |
| 131 | 127 | ||
| 132 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); | 128 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); |
| 133 | 129 | ||
| 134 | if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && | 130 | if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && |
| 135 | dl_time_before(dl_se->deadline, cp->elements[0].dl)) { | 131 | dl_time_before(dl_se->deadline, cp->elements[0].dl)) { |
| 136 | if (later_mask) | 132 | if (later_mask) |
| 137 | cpumask_set_cpu(best_cpu, later_mask); | 133 | cpumask_set_cpu(best_cpu, later_mask); |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 962cf343f798..636ca6f88c8e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
| @@ -196,14 +196,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, | |||
| 196 | * based on the task model parameters and gives the minimal utilization | 196 | * based on the task model parameters and gives the minimal utilization |
| 197 | * required to meet deadlines. | 197 | * required to meet deadlines. |
| 198 | */ | 198 | */ |
| 199 | unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, | 199 | unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, |
| 200 | unsigned long max, enum schedutil_type type) | 200 | unsigned long max, enum schedutil_type type, |
| 201 | struct task_struct *p) | ||
| 201 | { | 202 | { |
| 202 | unsigned long dl_util, util, irq; | 203 | unsigned long dl_util, util, irq; |
| 203 | struct rq *rq = cpu_rq(cpu); | 204 | struct rq *rq = cpu_rq(cpu); |
| 204 | 205 | ||
| 205 | if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) | 206 | if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && |
| 207 | type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { | ||
| 206 | return max; | 208 | return max; |
| 209 | } | ||
| 207 | 210 | ||
| 208 | /* | 211 | /* |
| 209 | * Early check to see if IRQ/steal time saturates the CPU, can be | 212 | * Early check to see if IRQ/steal time saturates the CPU, can be |
| @@ -219,9 +222,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, | |||
| 219 | * CFS tasks and we use the same metric to track the effective | 222 | * CFS tasks and we use the same metric to track the effective |
| 220 | * utilization (PELT windows are synchronized) we can directly add them | 223 | * utilization (PELT windows are synchronized) we can directly add them |
| 221 | * to obtain the CPU's actual utilization. | 224 | * to obtain the CPU's actual utilization. |
| 225 | * | ||
| 226 | * CFS and RT utilization can be boosted or capped, depending on | ||
| 227 | * utilization clamp constraints requested by currently RUNNABLE | ||
| 228 | * tasks. | ||
| 229 | * When there are no CFS RUNNABLE tasks, clamps are released and | ||
| 230 | * frequency will be gracefully reduced with the utilization decay. | ||
| 222 | */ | 231 | */ |
| 223 | util = util_cfs; | 232 | util = util_cfs + cpu_util_rt(rq); |
| 224 | util += cpu_util_rt(rq); | 233 | if (type == FREQUENCY_UTIL) |
| 234 | util = uclamp_util_with(rq, util, p); | ||
| 225 | 235 | ||
| 226 | dl_util = cpu_util_dl(rq); | 236 | dl_util = cpu_util_dl(rq); |
| 227 | 237 | ||
| @@ -276,12 +286,12 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) | |||
| 276 | { | 286 | { |
| 277 | struct rq *rq = cpu_rq(sg_cpu->cpu); | 287 | struct rq *rq = cpu_rq(sg_cpu->cpu); |
| 278 | unsigned long util = cpu_util_cfs(rq); | 288 | unsigned long util = cpu_util_cfs(rq); |
| 279 | unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); | 289 | unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); |
| 280 | 290 | ||
| 281 | sg_cpu->max = max; | 291 | sg_cpu->max = max; |
| 282 | sg_cpu->bw_dl = cpu_bw_dl(rq); | 292 | sg_cpu->bw_dl = cpu_bw_dl(rq); |
| 283 | 293 | ||
| 284 | return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); | 294 | return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); |
| 285 | } | 295 | } |
| 286 | 296 | ||
| 287 | /** | 297 | /** |
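The schedutil hunks rename schedutil_freq_util() to schedutil_cpu_util() and apply the utilization clamps only on the frequency-selection path, exactly as the new comment describes. The two call forms, taken from this diff (p may be NULL when no particular task is being placed):

        /* Frequency selection: clamp constraints of RUNNABLE tasks apply. */
        util = schedutil_cpu_util(cpu, util_cfs, max, FREQUENCY_UTIL, p);

        /* Energy estimation: raw (unclamped) busy time is wanted. */
        util = schedutil_cpu_util(cpu, util_cfs, max, ENERGY_UTIL, NULL);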
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index daaadf939ccb..b7abca987d94 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/sched/cpupri.c | 3 | * kernel/sched/cpupri.c |
| 3 | * | 4 | * |
| @@ -20,11 +21,6 @@ | |||
| 20 | * searches). For tasks with affinity restrictions, the algorithm has a | 21 | * searches). For tasks with affinity restrictions, the algorithm has a |
| 21 | * worst case complexity of O(min(102, nr_domcpus)), though the scenario that | 22 | * worst case complexity of O(min(102, nr_domcpus)), though the scenario that |
| 22 | * yields the worst case search is fairly contrived. | 23 | * yields the worst case search is fairly contrived. |
| 23 | * | ||
| 24 | * This program is free software; you can redistribute it and/or | ||
| 25 | * modify it under the terms of the GNU General Public License | ||
| 26 | * as published by the Free Software Foundation; version 2 | ||
| 27 | * of the License. | ||
| 28 | */ | 24 | */ |
| 29 | #include "sched.h" | 25 | #include "sched.h" |
| 30 | 26 | ||
| @@ -98,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
| 98 | if (skip) | 94 | if (skip) |
| 99 | continue; | 95 | continue; |
| 100 | 96 | ||
| 101 | if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) | 97 | if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) |
| 102 | continue; | 98 | continue; |
| 103 | 99 | ||
| 104 | if (lowest_mask) { | 100 | if (lowest_mask) { |
| 105 | cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); | 101 | cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); |
| 106 | 102 | ||
| 107 | /* | 103 | /* |
| 108 | * We have to ensure that we have at least one bit | 104 | * We have to ensure that we have at least one bit |
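cpupri_find() keeps its two-step test per priority vector: a cheap cpumask_any_and() rejects vectors that share no CPU with p->cpus_ptr before the cpumask_and() fill of lowest_mask for the first vector that does overlap. The 102 in the comment above is the worst-case number of priority vectors scanned (presumably CPUPRI_NR_PRIORITIES). Condensed from the hunk above:

        /* Per-vector test, in sketch form. */
        if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
                continue;                               /* no overlap, skip vector */
        if (lowest_mask)
                cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);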
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ba4a143bdcf3..2305ce89a26c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Simple CPU accounting cgroup controller | 3 | * Simple CPU accounting cgroup controller |
| 3 | */ | 4 | */ |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 43901fa3f269..ef5b9f6b1d42 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
| @@ -538,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
| 538 | * If we cannot preempt any rq, fall back to pick any | 538 | * If we cannot preempt any rq, fall back to pick any |
| 539 | * online CPU: | 539 | * online CPU: |
| 540 | */ | 540 | */ |
| 541 | cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); | 541 | cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr); |
| 542 | if (cpu >= nr_cpu_ids) { | 542 | if (cpu >= nr_cpu_ids) { |
| 543 | /* | 543 | /* |
| 544 | * Failed to find any suitable CPU. | 544 | * Failed to find any suitable CPU. |
| @@ -726,7 +726,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, | |||
| 726 | * refill the runtime and set the deadline a period in the future, | 726 | * refill the runtime and set the deadline a period in the future, |
| 727 | * because keeping the current (absolute) deadline of the task would | 727 | * because keeping the current (absolute) deadline of the task would |
| 728 | * result in breaking guarantees promised to other tasks (refer to | 728 | * result in breaking guarantees promised to other tasks (refer to |
| 729 | * Documentation/scheduler/sched-deadline.txt for more information). | 729 | * Documentation/scheduler/sched-deadline.rst for more information). |
| 730 | * | 730 | * |
| 731 | * This function returns true if: | 731 | * This function returns true if: |
| 732 | * | 732 | * |
| @@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq) | |||
| 1195 | &curr->dl); | 1195 | &curr->dl); |
| 1196 | } else { | 1196 | } else { |
| 1197 | unsigned long scale_freq = arch_scale_freq_capacity(cpu); | 1197 | unsigned long scale_freq = arch_scale_freq_capacity(cpu); |
| 1198 | unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); | 1198 | unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); |
| 1199 | 1199 | ||
| 1200 | scaled_delta_exec = cap_scale(delta_exec, scale_freq); | 1200 | scaled_delta_exec = cap_scale(delta_exec, scale_freq); |
| 1201 | scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); | 1201 | scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); |
| @@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq) | |||
| 1824 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | 1824 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) |
| 1825 | { | 1825 | { |
| 1826 | if (!task_running(rq, p) && | 1826 | if (!task_running(rq, p) && |
| 1827 | cpumask_test_cpu(cpu, &p->cpus_allowed)) | 1827 | cpumask_test_cpu(cpu, p->cpus_ptr)) |
| 1828 | return 1; | 1828 | return 1; |
| 1829 | return 0; | 1829 | return 0; |
| 1830 | } | 1830 | } |
| @@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) | |||
| 1974 | /* Retry if something changed. */ | 1974 | /* Retry if something changed. */ |
| 1975 | if (double_lock_balance(rq, later_rq)) { | 1975 | if (double_lock_balance(rq, later_rq)) { |
| 1976 | if (unlikely(task_rq(task) != rq || | 1976 | if (unlikely(task_rq(task) != rq || |
| 1977 | !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || | 1977 | !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || |
| 1978 | task_running(rq, task) || | 1978 | task_running(rq, task) || |
| 1979 | !dl_task(task) || | 1979 | !dl_task(task) || |
| 1980 | !task_on_rq_queued(task))) { | 1980 | !task_on_rq_queued(task))) { |
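Besides the cpus_ptr conversion, deadline.c only loses the unused first argument of arch_scale_cpu_capacity(); the runtime accounting around it is unchanged and is just two multiply-and-shift steps on the 1024-based capacity scale. A standalone worked example (cap_scale() re-declared here purely for illustration):

        #include <stdio.h>
        #include <stdint.h>

        #define SCHED_CAPACITY_SHIFT 10        /* scale of 1024 */

        static uint64_t cap_scale(uint64_t v, uint64_t s)
        {
                return (v * s) >> SCHED_CAPACITY_SHIFT;
        }

        int main(void)
        {
                uint64_t delta = 1000000;              /* 1 ms in ns        */
                delta = cap_scale(delta, 512);         /* CPU at half freq  */
                delta = cap_scale(delta, 512);         /* half-capacity CPU */
                printf("%llu\n", (unsigned long long)delta;   /* prints 250000 */
                return 0;
        }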
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 678bfb9bd87f..f7e4579e746c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
| @@ -1,13 +1,10 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/sched/debug.c | 3 | * kernel/sched/debug.c |
| 3 | * | 4 | * |
| 4 | * Print the CFS rbtree and other debugging details | 5 | * Print the CFS rbtree and other debugging details |
| 5 | * | 6 | * |
| 6 | * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar | 7 | * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar |
| 7 | * | ||
| 8 | * This program is free software; you can redistribute it and/or modify | ||
| 9 | * it under the terms of the GNU General Public License version 2 as | ||
| 10 | * published by the Free Software Foundation. | ||
| 11 | */ | 8 | */ |
| 12 | #include "sched.h" | 9 | #include "sched.h" |
| 13 | 10 | ||
| @@ -236,49 +233,35 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) | |||
| 236 | *tablep = NULL; | 233 | *tablep = NULL; |
| 237 | } | 234 | } |
| 238 | 235 | ||
| 239 | static int min_load_idx = 0; | ||
| 240 | static int max_load_idx = CPU_LOAD_IDX_MAX-1; | ||
| 241 | |||
| 242 | static void | 236 | static void |
| 243 | set_table_entry(struct ctl_table *entry, | 237 | set_table_entry(struct ctl_table *entry, |
| 244 | const char *procname, void *data, int maxlen, | 238 | const char *procname, void *data, int maxlen, |
| 245 | umode_t mode, proc_handler *proc_handler, | 239 | umode_t mode, proc_handler *proc_handler) |
| 246 | bool load_idx) | ||
| 247 | { | 240 | { |
| 248 | entry->procname = procname; | 241 | entry->procname = procname; |
| 249 | entry->data = data; | 242 | entry->data = data; |
| 250 | entry->maxlen = maxlen; | 243 | entry->maxlen = maxlen; |
| 251 | entry->mode = mode; | 244 | entry->mode = mode; |
| 252 | entry->proc_handler = proc_handler; | 245 | entry->proc_handler = proc_handler; |
| 253 | |||
| 254 | if (load_idx) { | ||
| 255 | entry->extra1 = &min_load_idx; | ||
| 256 | entry->extra2 = &max_load_idx; | ||
| 257 | } | ||
| 258 | } | 246 | } |
| 259 | 247 | ||
| 260 | static struct ctl_table * | 248 | static struct ctl_table * |
| 261 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 249 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
| 262 | { | 250 | { |
| 263 | struct ctl_table *table = sd_alloc_ctl_entry(14); | 251 | struct ctl_table *table = sd_alloc_ctl_entry(9); |
| 264 | 252 | ||
| 265 | if (table == NULL) | 253 | if (table == NULL) |
| 266 | return NULL; | 254 | return NULL; |
| 267 | 255 | ||
| 268 | set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); | 256 | set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); |
| 269 | set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); | 257 | set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); |
| 270 | set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); | 258 | set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); |
| 271 | set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); | 259 | set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); |
| 272 | set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); | 260 | set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); |
| 273 | set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); | 261 | set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); |
| 274 | set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); | 262 | set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); |
| 275 | set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); | 263 | set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); |
| 276 | set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); | 264 | /* &table[8] is terminator */ |
| 277 | set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); | ||
| 278 | set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); | ||
| 279 | set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
| 280 | set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); | ||
| 281 | /* &table[13] is terminator */ | ||
| 282 | 265 | ||
| 283 | return table; | 266 | return table; |
| 284 | } | 267 | } |
| @@ -656,8 +639,6 @@ do { \ | |||
| 656 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) | 639 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) |
| 657 | 640 | ||
| 658 | P(nr_running); | 641 | P(nr_running); |
| 659 | SEQ_printf(m, " .%-30s: %lu\n", "load", | ||
| 660 | rq->load.weight); | ||
| 661 | P(nr_switches); | 642 | P(nr_switches); |
| 662 | P(nr_load_updates); | 643 | P(nr_load_updates); |
| 663 | P(nr_uninterruptible); | 644 | P(nr_uninterruptible); |
| @@ -665,11 +646,6 @@ do { \ | |||
| 665 | SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); | 646 | SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); |
| 666 | PN(clock); | 647 | PN(clock); |
| 667 | PN(clock_task); | 648 | PN(clock_task); |
| 668 | P(cpu_load[0]); | ||
| 669 | P(cpu_load[1]); | ||
| 670 | P(cpu_load[2]); | ||
| 671 | P(cpu_load[3]); | ||
| 672 | P(cpu_load[4]); | ||
| 673 | #undef P | 649 | #undef P |
| 674 | #undef PN | 650 | #undef PN |
| 675 | 651 | ||
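With the five load_idx knobs gone, the per-domain sysctl table shrinks from 14 slots to 9: eight entries filled by set_table_entry() plus the zero-filled slot that, as with any ctl_table array, acts as the terminator noted in the comment above.

        struct ctl_table *table = sd_alloc_ctl_entry(9);   /* 8 entries + terminator */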
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f35930f5e528..036be95a87e9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -275,6 +275,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
| 275 | return grp->my_q; | 275 | return grp->my_q; |
| 276 | } | 276 | } |
| 277 | 277 | ||
| 278 | static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) | ||
| 279 | { | ||
| 280 | if (!path) | ||
| 281 | return; | ||
| 282 | |||
| 283 | if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) | ||
| 284 | autogroup_path(cfs_rq->tg, path, len); | ||
| 285 | else if (cfs_rq && cfs_rq->tg->css.cgroup) | ||
| 286 | cgroup_path(cfs_rq->tg->css.cgroup, path, len); | ||
| 287 | else | ||
| 288 | strlcpy(path, "(null)", len); | ||
| 289 | } | ||
| 290 | |||
| 278 | static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 291 | static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
| 279 | { | 292 | { |
| 280 | struct rq *rq = rq_of(cfs_rq); | 293 | struct rq *rq = rq_of(cfs_rq); |
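cfs_rq_tg_path() formats the task-group (cgroup or autogroup) path of a cfs_rq into a caller-supplied buffer, falling back to the literal "(null)" when no group path is available; it is presumably intended for the trace helpers added elsewhere in this series. A minimal usage sketch (buffer size is illustrative):

        char path[64];

        cfs_rq_tg_path(cfs_rq, path, sizeof(path));    /* e.g. "/" or "/bg/app" */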
| @@ -449,6 +462,12 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
| 449 | return NULL; | 462 | return NULL; |
| 450 | } | 463 | } |
| 451 | 464 | ||
| 465 | static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) | ||
| 466 | { | ||
| 467 | if (path) | ||
| 468 | strlcpy(path, "(null)", len); | ||
| 469 | } | ||
| 470 | |||
| 452 | static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 471 | static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
| 453 | { | 472 | { |
| 454 | return true; | 473 | return true; |
| @@ -764,7 +783,7 @@ void post_init_entity_util_avg(struct task_struct *p) | |||
| 764 | struct sched_entity *se = &p->se; | 783 | struct sched_entity *se = &p->se; |
| 765 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 784 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
| 766 | struct sched_avg *sa = &se->avg; | 785 | struct sched_avg *sa = &se->avg; |
| 767 | long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); | 786 | long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); |
| 768 | long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; | 787 | long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; |
| 769 | 788 | ||
| 770 | if (cap > 0) { | 789 | if (cap > 0) { |
| @@ -1466,9 +1485,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, | |||
| 1466 | group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; | 1485 | group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; |
| 1467 | } | 1486 | } |
| 1468 | 1487 | ||
| 1469 | static unsigned long weighted_cpuload(struct rq *rq); | 1488 | static unsigned long cpu_runnable_load(struct rq *rq); |
| 1470 | static unsigned long source_load(int cpu, int type); | ||
| 1471 | static unsigned long target_load(int cpu, int type); | ||
| 1472 | 1489 | ||
| 1473 | /* Cached statistics for all CPUs within a node */ | 1490 | /* Cached statistics for all CPUs within a node */ |
| 1474 | struct numa_stats { | 1491 | struct numa_stats { |
| @@ -1489,7 +1506,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
| 1489 | for_each_cpu(cpu, cpumask_of_node(nid)) { | 1506 | for_each_cpu(cpu, cpumask_of_node(nid)) { |
| 1490 | struct rq *rq = cpu_rq(cpu); | 1507 | struct rq *rq = cpu_rq(cpu); |
| 1491 | 1508 | ||
| 1492 | ns->load += weighted_cpuload(rq); | 1509 | ns->load += cpu_runnable_load(rq); |
| 1493 | ns->compute_capacity += capacity_of(cpu); | 1510 | ns->compute_capacity += capacity_of(cpu); |
| 1494 | } | 1511 | } |
| 1495 | 1512 | ||
| @@ -1621,7 +1638,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
| 1621 | * be incurred if the tasks were swapped. | 1638 | * be incurred if the tasks were swapped. |
| 1622 | */ | 1639 | */ |
| 1623 | /* Skip this swap candidate if cannot move to the source cpu */ | 1640 | /* Skip this swap candidate if cannot move to the source cpu */ |
| 1624 | if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) | 1641 | if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) |
| 1625 | goto unlock; | 1642 | goto unlock; |
| 1626 | 1643 | ||
| 1627 | /* | 1644 | /* |
| @@ -1718,7 +1735,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, | |||
| 1718 | 1735 | ||
| 1719 | for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { | 1736 | for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { |
| 1720 | /* Skip this CPU if the source task cannot migrate */ | 1737 | /* Skip this CPU if the source task cannot migrate */ |
| 1721 | if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) | 1738 | if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) |
| 1722 | continue; | 1739 | continue; |
| 1723 | 1740 | ||
| 1724 | env->dst_cpu = cpu; | 1741 | env->dst_cpu = cpu; |
| @@ -2686,8 +2703,6 @@ static void | |||
| 2686 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 2703 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 2687 | { | 2704 | { |
| 2688 | update_load_add(&cfs_rq->load, se->load.weight); | 2705 | update_load_add(&cfs_rq->load, se->load.weight); |
| 2689 | if (!parent_entity(se)) | ||
| 2690 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | ||
| 2691 | #ifdef CONFIG_SMP | 2706 | #ifdef CONFIG_SMP |
| 2692 | if (entity_is_task(se)) { | 2707 | if (entity_is_task(se)) { |
| 2693 | struct rq *rq = rq_of(cfs_rq); | 2708 | struct rq *rq = rq_of(cfs_rq); |
| @@ -2703,8 +2718,6 @@ static void | |||
| 2703 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 2718 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
| 2704 | { | 2719 | { |
| 2705 | update_load_sub(&cfs_rq->load, se->load.weight); | 2720 | update_load_sub(&cfs_rq->load, se->load.weight); |
| 2706 | if (!parent_entity(se)) | ||
| 2707 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); | ||
| 2708 | #ifdef CONFIG_SMP | 2721 | #ifdef CONFIG_SMP |
| 2709 | if (entity_is_task(se)) { | 2722 | if (entity_is_task(se)) { |
| 2710 | account_numa_dequeue(rq_of(cfs_rq), task_of(se)); | 2723 | account_numa_dequeue(rq_of(cfs_rq), task_of(se)); |
| @@ -3334,6 +3347,9 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) | |||
| 3334 | update_tg_cfs_util(cfs_rq, se, gcfs_rq); | 3347 | update_tg_cfs_util(cfs_rq, se, gcfs_rq); |
| 3335 | update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); | 3348 | update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); |
| 3336 | 3349 | ||
| 3350 | trace_pelt_cfs_tp(cfs_rq); | ||
| 3351 | trace_pelt_se_tp(se); | ||
| 3352 | |||
| 3337 | return 1; | 3353 | return 1; |
| 3338 | } | 3354 | } |
| 3339 | 3355 | ||
| @@ -3486,6 +3502,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
| 3486 | add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); | 3502 | add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); |
| 3487 | 3503 | ||
| 3488 | cfs_rq_util_change(cfs_rq, flags); | 3504 | cfs_rq_util_change(cfs_rq, flags); |
| 3505 | |||
| 3506 | trace_pelt_cfs_tp(cfs_rq); | ||
| 3489 | } | 3507 | } |
| 3490 | 3508 | ||
| 3491 | /** | 3509 | /** |
| @@ -3505,6 +3523,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
| 3505 | add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); | 3523 | add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); |
| 3506 | 3524 | ||
| 3507 | cfs_rq_util_change(cfs_rq, 0); | 3525 | cfs_rq_util_change(cfs_rq, 0); |
| 3526 | |||
| 3527 | trace_pelt_cfs_tp(cfs_rq); | ||
| 3508 | } | 3528 | } |
| 3509 | 3529 | ||
| 3510 | /* | 3530 | /* |
| @@ -4100,7 +4120,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
| 4100 | * least twice that of our own weight (i.e. dont track it | 4120 | * least twice that of our own weight (i.e. dont track it |
| 4101 | * when there are only lesser-weight tasks around): | 4121 | * when there are only lesser-weight tasks around): |
| 4102 | */ | 4122 | */ |
| 4103 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { | 4123 | if (schedstat_enabled() && |
| 4124 | rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { | ||
| 4104 | schedstat_set(se->statistics.slice_max, | 4125 | schedstat_set(se->statistics.slice_max, |
| 4105 | max((u64)schedstat_val(se->statistics.slice_max), | 4126 | max((u64)schedstat_val(se->statistics.slice_max), |
| 4106 | se->sum_exec_runtime - se->prev_sum_exec_runtime)); | 4127 | se->sum_exec_runtime - se->prev_sum_exec_runtime)); |
| @@ -4734,6 +4755,11 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) | |||
| 4734 | if (runtime_refresh_within(cfs_b, min_left)) | 4755 | if (runtime_refresh_within(cfs_b, min_left)) |
| 4735 | return; | 4756 | return; |
| 4736 | 4757 | ||
| 4758 | /* don't push forwards an existing deferred unthrottle */ | ||
| 4759 | if (cfs_b->slack_started) | ||
| 4760 | return; | ||
| 4761 | cfs_b->slack_started = true; | ||
| 4762 | |||
| 4737 | hrtimer_start(&cfs_b->slack_timer, | 4763 | hrtimer_start(&cfs_b->slack_timer, |
| 4738 | ns_to_ktime(cfs_bandwidth_slack_period), | 4764 | ns_to_ktime(cfs_bandwidth_slack_period), |
| 4739 | HRTIMER_MODE_REL); | 4765 | HRTIMER_MODE_REL); |
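start_cfs_slack_bandwidth() now latches cfs_b->slack_started so that repeated calls cannot keep pushing an already armed slack timer further into the future; do_sched_cfs_slack_timer() clears the flag (second hunk below) once the deferred unthrottle runs, and init_cfs_bandwidth() starts it out false. Condensed, the arm side becomes:

        if (cfs_b->slack_started)
                return;                 /* a deferred unthrottle is already pending */
        cfs_b->slack_started = true;
        hrtimer_start(&cfs_b->slack_timer,
                      ns_to_ktime(cfs_bandwidth_slack_period), HRTIMER_MODE_REL);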
| @@ -4787,6 +4813,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
| 4787 | 4813 | ||
| 4788 | /* confirm we're still not at a refresh boundary */ | 4814 | /* confirm we're still not at a refresh boundary */ |
| 4789 | raw_spin_lock_irqsave(&cfs_b->lock, flags); | 4815 | raw_spin_lock_irqsave(&cfs_b->lock, flags); |
| 4816 | cfs_b->slack_started = false; | ||
| 4790 | if (cfs_b->distribute_running) { | 4817 | if (cfs_b->distribute_running) { |
| 4791 | raw_spin_unlock_irqrestore(&cfs_b->lock, flags); | 4818 | raw_spin_unlock_irqrestore(&cfs_b->lock, flags); |
| 4792 | return; | 4819 | return; |
| @@ -4950,6 +4977,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
| 4950 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 4977 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
| 4951 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | 4978 | cfs_b->slack_timer.function = sched_cfs_slack_timer; |
| 4952 | cfs_b->distribute_running = 0; | 4979 | cfs_b->distribute_running = 0; |
| 4980 | cfs_b->slack_started = false; | ||
| 4953 | } | 4981 | } |
| 4954 | 4982 | ||
| 4955 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 4983 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
| @@ -5153,8 +5181,10 @@ static inline bool cpu_overutilized(int cpu) | |||
| 5153 | 5181 | ||
| 5154 | static inline void update_overutilized_status(struct rq *rq) | 5182 | static inline void update_overutilized_status(struct rq *rq) |
| 5155 | { | 5183 | { |
| 5156 | if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) | 5184 | if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { |
| 5157 | WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); | 5185 | WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); |
| 5186 | trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); | ||
| 5187 | } | ||
| 5158 | } | 5188 | } |
| 5159 | #else | 5189 | #else |
| 5160 | static inline void update_overutilized_status(struct rq *rq) { } | 5190 | static inline void update_overutilized_status(struct rq *rq) { } |
| @@ -5325,71 +5355,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); | |||
| 5325 | DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); | 5355 | DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); |
| 5326 | 5356 | ||
| 5327 | #ifdef CONFIG_NO_HZ_COMMON | 5357 | #ifdef CONFIG_NO_HZ_COMMON |
| 5328 | /* | ||
| 5329 | * per rq 'load' arrray crap; XXX kill this. | ||
| 5330 | */ | ||
| 5331 | |||
| 5332 | /* | ||
| 5333 | * The exact cpuload calculated at every tick would be: | ||
| 5334 | * | ||
| 5335 | * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load | ||
| 5336 | * | ||
| 5337 | * If a CPU misses updates for n ticks (as it was idle) and update gets | ||
| 5338 | * called on the n+1-th tick when CPU may be busy, then we have: | ||
| 5339 | * | ||
| 5340 | * load_n = (1 - 1/2^i)^n * load_0 | ||
| 5341 | * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load | ||
| 5342 | * | ||
| 5343 | * decay_load_missed() below does efficient calculation of | ||
| 5344 | * | ||
| 5345 | * load' = (1 - 1/2^i)^n * load | ||
| 5346 | * | ||
| 5347 | * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. | ||
| 5348 | * This allows us to precompute the above in said factors, thereby allowing the | ||
| 5349 | * reduction of an arbitrary n in O(log_2 n) steps. (See also | ||
| 5350 | * fixed_power_int()) | ||
| 5351 | * | ||
| 5352 | * The calculation is approximated on a 128 point scale. | ||
| 5353 | */ | ||
| 5354 | #define DEGRADE_SHIFT 7 | ||
| 5355 | |||
| 5356 | static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
| 5357 | static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
| 5358 | { 0, 0, 0, 0, 0, 0, 0, 0 }, | ||
| 5359 | { 64, 32, 8, 0, 0, 0, 0, 0 }, | ||
| 5360 | { 96, 72, 40, 12, 1, 0, 0, 0 }, | ||
| 5361 | { 112, 98, 75, 43, 15, 1, 0, 0 }, | ||
| 5362 | { 120, 112, 98, 76, 45, 16, 2, 0 } | ||
| 5363 | }; | ||
| 5364 | |||
| 5365 | /* | ||
| 5366 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
| 5367 | * would be when CPU is idle and so we just decay the old load without | ||
| 5368 | * adding any new load. | ||
| 5369 | */ | ||
| 5370 | static unsigned long | ||
| 5371 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
| 5372 | { | ||
| 5373 | int j = 0; | ||
| 5374 | |||
| 5375 | if (!missed_updates) | ||
| 5376 | return load; | ||
| 5377 | |||
| 5378 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
| 5379 | return 0; | ||
| 5380 | |||
| 5381 | if (idx == 1) | ||
| 5382 | return load >> missed_updates; | ||
| 5383 | |||
| 5384 | while (missed_updates) { | ||
| 5385 | if (missed_updates % 2) | ||
| 5386 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
| 5387 | |||
| 5388 | missed_updates >>= 1; | ||
| 5389 | j++; | ||
| 5390 | } | ||
| 5391 | return load; | ||
| 5392 | } | ||
| 5393 | 5358 | ||
| 5394 | static struct { | 5359 | static struct { |
| 5395 | cpumask_var_t idle_cpus_mask; | 5360 | cpumask_var_t idle_cpus_mask; |
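The removed decay_load_missed() evaluated (1 - 1/2^idx)^n on a 128-point fixed-point scale by decomposing n into powers of two, which is where the degrade_factor[] rows come from: each entry is 128 * (1 - 1/2^idx)^(2^j), truncated. A standalone check of the idx = 2 row (decay factor 3/4), which reproduces { 96, 72, 40, 12, 1, ... } (link with -lm):

        #include <stdio.h>
        #include <math.h>

        int main(void)
        {
                for (int j = 0; j < 5; j++)
                        printf("%d ", (int)(128 * pow(0.75, 1 << j)));
                printf("\n");           /* 96 72 40 12 1 */
                return 0;
        }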
| @@ -5401,234 +5366,11 @@ static struct { | |||
| 5401 | 5366 | ||
| 5402 | #endif /* CONFIG_NO_HZ_COMMON */ | 5367 | #endif /* CONFIG_NO_HZ_COMMON */ |
| 5403 | 5368 | ||
| 5404 | /** | 5369 | static unsigned long cpu_runnable_load(struct rq *rq) |
| 5405 | * __cpu_load_update - update the rq->cpu_load[] statistics | ||
| 5406 | * @this_rq: The rq to update statistics for | ||
| 5407 | * @this_load: The current load | ||
| 5408 | * @pending_updates: The number of missed updates | ||
| 5409 | * | ||
| 5410 | * Update rq->cpu_load[] statistics. This function is usually called every | ||
| 5411 | * scheduler tick (TICK_NSEC). | ||
| 5412 | * | ||
| 5413 | * This function computes a decaying average: | ||
| 5414 | * | ||
| 5415 | * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load | ||
| 5416 | * | ||
| 5417 | * Because of NOHZ it might not get called on every tick which gives need for | ||
| 5418 | * the @pending_updates argument. | ||
| 5419 | * | ||
| 5420 | * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 | ||
| 5421 | * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load | ||
| 5422 | * = A * (A * load[i]_n-2 + B) + B | ||
| 5423 | * = A * (A * (A * load[i]_n-3 + B) + B) + B | ||
| 5424 | * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B | ||
| 5425 | * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B | ||
| 5426 | * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B | ||
| 5427 | * = (1 - 1/2^i)^n * (load[i]_0 - load) + load | ||
| 5428 | * | ||
| 5429 | * In the above we've assumed load_n := load, which is true for NOHZ_FULL as | ||
| 5430 | * any change in load would have resulted in the tick being turned back on. | ||
| 5431 | * | ||
| 5432 | * For regular NOHZ, this reduces to: | ||
| 5433 | * | ||
| 5434 | * load[i]_n = (1 - 1/2^i)^n * load[i]_0 | ||
| 5435 | * | ||
| 5436 | * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra | ||
| 5437 | * term. | ||
| 5438 | */ | ||
| 5439 | static void cpu_load_update(struct rq *this_rq, unsigned long this_load, | ||
| 5440 | unsigned long pending_updates) | ||
| 5441 | { | ||
| 5442 | unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; | ||
| 5443 | int i, scale; | ||
| 5444 | |||
| 5445 | this_rq->nr_load_updates++; | ||
| 5446 | |||
| 5447 | /* Update our load: */ | ||
| 5448 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | ||
| 5449 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
| 5450 | unsigned long old_load, new_load; | ||
| 5451 | |||
| 5452 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
| 5453 | |||
| 5454 | old_load = this_rq->cpu_load[i]; | ||
| 5455 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 5456 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
| 5457 | if (tickless_load) { | ||
| 5458 | old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); | ||
| 5459 | /* | ||
| 5460 | * old_load can never be a negative value because a | ||
| 5461 | * decayed tickless_load cannot be greater than the | ||
| 5462 | * original tickless_load. | ||
| 5463 | */ | ||
| 5464 | old_load += tickless_load; | ||
| 5465 | } | ||
| 5466 | #endif | ||
| 5467 | new_load = this_load; | ||
| 5468 | /* | ||
| 5469 | * Round up the averaging division if load is increasing. This | ||
| 5470 | * prevents us from getting stuck on 9 if the load is 10, for | ||
| 5471 | * example. | ||
| 5472 | */ | ||
| 5473 | if (new_load > old_load) | ||
| 5474 | new_load += scale - 1; | ||
| 5475 | |||
| 5476 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
| 5477 | } | ||
| 5478 | } | ||
| 5479 | |||
| 5480 | /* Used instead of source_load when we know the type == 0 */ | ||
| 5481 | static unsigned long weighted_cpuload(struct rq *rq) | ||
| 5482 | { | 5370 | { |
| 5483 | return cfs_rq_runnable_load_avg(&rq->cfs); | 5371 | return cfs_rq_runnable_load_avg(&rq->cfs); |
| 5484 | } | 5372 | } |
| 5485 | 5373 | ||
| 5486 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 5487 | /* | ||
| 5488 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
| 5489 | * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading | ||
| 5490 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
| 5491 | * | ||
| 5492 | * Therefore we need to avoid the delta approach from the regular tick when | ||
| 5493 | * possible since that would seriously skew the load calculation. This is why we | ||
| 5494 | * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on | ||
| 5495 | * jiffies deltas for updates happening while in nohz mode (idle ticks, idle | ||
| 5496 | * loop exit, nohz_idle_balance, nohz full exit...) | ||
| 5497 | * | ||
| 5498 | * This means we might still be one tick off for nohz periods. | ||
| 5499 | */ | ||
| 5500 | |||
| 5501 | static void cpu_load_update_nohz(struct rq *this_rq, | ||
| 5502 | unsigned long curr_jiffies, | ||
| 5503 | unsigned long load) | ||
| 5504 | { | ||
| 5505 | unsigned long pending_updates; | ||
| 5506 | |||
| 5507 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
| 5508 | if (pending_updates) { | ||
| 5509 | this_rq->last_load_update_tick = curr_jiffies; | ||
| 5510 | /* | ||
| 5511 | * In the regular NOHZ case, we were idle, this means load 0. | ||
| 5512 | * In the NOHZ_FULL case, we were non-idle, we should consider | ||
| 5513 | * its weighted load. | ||
| 5514 | */ | ||
| 5515 | cpu_load_update(this_rq, load, pending_updates); | ||
| 5516 | } | ||
| 5517 | } | ||
| 5518 | |||
| 5519 | /* | ||
| 5520 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
| 5521 | * idle balance. | ||
| 5522 | */ | ||
| 5523 | static void cpu_load_update_idle(struct rq *this_rq) | ||
| 5524 | { | ||
| 5525 | /* | ||
| 5526 | * bail if there's load or we're actually up-to-date. | ||
| 5527 | */ | ||
| 5528 | if (weighted_cpuload(this_rq)) | ||
| 5529 | return; | ||
| 5530 | |||
| 5531 | cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); | ||
| 5532 | } | ||
| 5533 | |||
| 5534 | /* | ||
| 5535 | * Record CPU load on nohz entry so we know the tickless load to account | ||
| 5536 | * on nohz exit. cpu_load[0] happens then to be updated more frequently | ||
| 5537 | * than other cpu_load[idx] but it should be fine as cpu_load readers | ||
| 5538 | * shouldn't rely into synchronized cpu_load[*] updates. | ||
| 5539 | */ | ||
| 5540 | void cpu_load_update_nohz_start(void) | ||
| 5541 | { | ||
| 5542 | struct rq *this_rq = this_rq(); | ||
| 5543 | |||
| 5544 | /* | ||
| 5545 | * This is all lockless but should be fine. If weighted_cpuload changes | ||
| 5546 | * concurrently we'll exit nohz. And cpu_load write can race with | ||
| 5547 | * cpu_load_update_idle() but both updater would be writing the same. | ||
| 5548 | */ | ||
| 5549 | this_rq->cpu_load[0] = weighted_cpuload(this_rq); | ||
| 5550 | } | ||
| 5551 | |||
| 5552 | /* | ||
| 5553 | * Account the tickless load in the end of a nohz frame. | ||
| 5554 | */ | ||
| 5555 | void cpu_load_update_nohz_stop(void) | ||
| 5556 | { | ||
| 5557 | unsigned long curr_jiffies = READ_ONCE(jiffies); | ||
| 5558 | struct rq *this_rq = this_rq(); | ||
| 5559 | unsigned long load; | ||
| 5560 | struct rq_flags rf; | ||
| 5561 | |||
| 5562 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
| 5563 | return; | ||
| 5564 | |||
| 5565 | load = weighted_cpuload(this_rq); | ||
| 5566 | rq_lock(this_rq, &rf); | ||
| 5567 | update_rq_clock(this_rq); | ||
| 5568 | cpu_load_update_nohz(this_rq, curr_jiffies, load); | ||
| 5569 | rq_unlock(this_rq, &rf); | ||
| 5570 | } | ||
| 5571 | #else /* !CONFIG_NO_HZ_COMMON */ | ||
| 5572 | static inline void cpu_load_update_nohz(struct rq *this_rq, | ||
| 5573 | unsigned long curr_jiffies, | ||
| 5574 | unsigned long load) { } | ||
| 5575 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
| 5576 | |||
| 5577 | static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) | ||
| 5578 | { | ||
| 5579 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 5580 | /* See the mess around cpu_load_update_nohz(). */ | ||
| 5581 | this_rq->last_load_update_tick = READ_ONCE(jiffies); | ||
| 5582 | #endif | ||
| 5583 | cpu_load_update(this_rq, load, 1); | ||
| 5584 | } | ||
| 5585 | |||
| 5586 | /* | ||
| 5587 | * Called from scheduler_tick() | ||
| 5588 | */ | ||
| 5589 | void cpu_load_update_active(struct rq *this_rq) | ||
| 5590 | { | ||
| 5591 | unsigned long load = weighted_cpuload(this_rq); | ||
| 5592 | |||
| 5593 | if (tick_nohz_tick_stopped()) | ||
| 5594 | cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); | ||
| 5595 | else | ||
| 5596 | cpu_load_update_periodic(this_rq, load); | ||
| 5597 | } | ||
| 5598 | |||
| 5599 | /* | ||
| 5600 | * Return a low guess at the load of a migration-source CPU weighted | ||
| 5601 | * according to the scheduling class and "nice" value. | ||
| 5602 | * | ||
| 5603 | * We want to under-estimate the load of migration sources, to | ||
| 5604 | * balance conservatively. | ||
| 5605 | */ | ||
| 5606 | static unsigned long source_load(int cpu, int type) | ||
| 5607 | { | ||
| 5608 | struct rq *rq = cpu_rq(cpu); | ||
| 5609 | unsigned long total = weighted_cpuload(rq); | ||
| 5610 | |||
| 5611 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
| 5612 | return total; | ||
| 5613 | |||
| 5614 | return min(rq->cpu_load[type-1], total); | ||
| 5615 | } | ||
| 5616 | |||
| 5617 | /* | ||
| 5618 | * Return a high guess at the load of a migration-target CPU weighted | ||
| 5619 | * according to the scheduling class and "nice" value. | ||
| 5620 | */ | ||
| 5621 | static unsigned long target_load(int cpu, int type) | ||
| 5622 | { | ||
| 5623 | struct rq *rq = cpu_rq(cpu); | ||
| 5624 | unsigned long total = weighted_cpuload(rq); | ||
| 5625 | |||
| 5626 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
| 5627 | return total; | ||
| 5628 | |||
| 5629 | return max(rq->cpu_load[type-1], total); | ||
| 5630 | } | ||
| 5631 | |||
| 5632 | static unsigned long capacity_of(int cpu) | 5374 | static unsigned long capacity_of(int cpu) |
| 5633 | { | 5375 | { |
| 5634 | return cpu_rq(cpu)->cpu_capacity; | 5376 | return cpu_rq(cpu)->cpu_capacity; |
| @@ -5638,7 +5380,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
| 5638 | { | 5380 | { |
| 5639 | struct rq *rq = cpu_rq(cpu); | 5381 | struct rq *rq = cpu_rq(cpu); |
| 5640 | unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); | 5382 | unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); |
| 5641 | unsigned long load_avg = weighted_cpuload(rq); | 5383 | unsigned long load_avg = cpu_runnable_load(rq); |
| 5642 | 5384 | ||
| 5643 | if (nr_running) | 5385 | if (nr_running) |
| 5644 | return load_avg / nr_running; | 5386 | return load_avg / nr_running; |
| @@ -5736,7 +5478,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, | |||
| 5736 | s64 this_eff_load, prev_eff_load; | 5478 | s64 this_eff_load, prev_eff_load; |
| 5737 | unsigned long task_load; | 5479 | unsigned long task_load; |
| 5738 | 5480 | ||
| 5739 | this_eff_load = target_load(this_cpu, sd->wake_idx); | 5481 | this_eff_load = cpu_runnable_load(cpu_rq(this_cpu)); |
| 5740 | 5482 | ||
| 5741 | if (sync) { | 5483 | if (sync) { |
| 5742 | unsigned long current_load = task_h_load(current); | 5484 | unsigned long current_load = task_h_load(current); |
| @@ -5754,7 +5496,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, | |||
| 5754 | this_eff_load *= 100; | 5496 | this_eff_load *= 100; |
| 5755 | this_eff_load *= capacity_of(prev_cpu); | 5497 | this_eff_load *= capacity_of(prev_cpu); |
| 5756 | 5498 | ||
| 5757 | prev_eff_load = source_load(prev_cpu, sd->wake_idx); | 5499 | prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu)); |
| 5758 | prev_eff_load -= task_load; | 5500 | prev_eff_load -= task_load; |
| 5759 | if (sched_feat(WA_BIAS)) | 5501 | if (sched_feat(WA_BIAS)) |
| 5760 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; | 5502 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; |
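wake_affine_weight() now reads both sides with cpu_runnable_load() instead of the biased source_load()/target_load() pair; the comparison itself is unchanged: each side is scaled by the other CPU's capacity, and under WA_BIAS the previous CPU additionally carries half of the domain's imbalance_pct margin. A simplified standalone sketch with made-up numbers (the kernel also folds the waking task's and the current task's load into the two sides):

        #include <stdio.h>

        int main(void)
        {
                unsigned long this_load = 300, prev_load = 350;   /* runnable loads */
                unsigned long this_cap = 1024, prev_cap = 1024;   /* capacities     */
                int imbalance_pct = 117;                          /* illustrative   */

                unsigned long this_eff = this_load * 100 * prev_cap;
                unsigned long prev_eff = prev_load *
                                (100 + (imbalance_pct - 100) / 2) * this_cap;

                printf("%s\n", this_eff < prev_eff ? "pull to waker" : "keep prev_cpu");
                return 0;
        }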
| @@ -5815,14 +5557,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
| 5815 | unsigned long this_runnable_load = ULONG_MAX; | 5557 | unsigned long this_runnable_load = ULONG_MAX; |
| 5816 | unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; | 5558 | unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; |
| 5817 | unsigned long most_spare = 0, this_spare = 0; | 5559 | unsigned long most_spare = 0, this_spare = 0; |
| 5818 | int load_idx = sd->forkexec_idx; | ||
| 5819 | int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; | 5560 | int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; |
| 5820 | unsigned long imbalance = scale_load_down(NICE_0_LOAD) * | 5561 | unsigned long imbalance = scale_load_down(NICE_0_LOAD) * |
| 5821 | (sd->imbalance_pct-100) / 100; | 5562 | (sd->imbalance_pct-100) / 100; |
| 5822 | 5563 | ||
| 5823 | if (sd_flag & SD_BALANCE_WAKE) | ||
| 5824 | load_idx = sd->wake_idx; | ||
| 5825 | |||
| 5826 | do { | 5564 | do { |
| 5827 | unsigned long load, avg_load, runnable_load; | 5565 | unsigned long load, avg_load, runnable_load; |
| 5828 | unsigned long spare_cap, max_spare_cap; | 5566 | unsigned long spare_cap, max_spare_cap; |
| @@ -5831,7 +5569,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
| 5831 | 5569 | ||
| 5832 | /* Skip over this group if it has no CPUs allowed */ | 5570 | /* Skip over this group if it has no CPUs allowed */ |
| 5833 | if (!cpumask_intersects(sched_group_span(group), | 5571 | if (!cpumask_intersects(sched_group_span(group), |
| 5834 | &p->cpus_allowed)) | 5572 | p->cpus_ptr)) |
| 5835 | continue; | 5573 | continue; |
| 5836 | 5574 | ||
| 5837 | local_group = cpumask_test_cpu(this_cpu, | 5575 | local_group = cpumask_test_cpu(this_cpu, |
| @@ -5846,12 +5584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
| 5846 | max_spare_cap = 0; | 5584 | max_spare_cap = 0; |
| 5847 | 5585 | ||
| 5848 | for_each_cpu(i, sched_group_span(group)) { | 5586 | for_each_cpu(i, sched_group_span(group)) { |
| 5849 | /* Bias balancing toward CPUs of our domain */ | 5587 | load = cpu_runnable_load(cpu_rq(i)); |
| 5850 | if (local_group) | ||
| 5851 | load = source_load(i, load_idx); | ||
| 5852 | else | ||
| 5853 | load = target_load(i, load_idx); | ||
| 5854 | |||
| 5855 | runnable_load += load; | 5588 | runnable_load += load; |
| 5856 | 5589 | ||
| 5857 | avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); | 5590 | avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); |
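find_idlest_group() drops the load_idx history indices as well, so local and remote groups are now summed with the same cpu_runnable_load() figure; the fixed margin a remote group must win by is unchanged. With the illustrative domain value imbalance_pct = 125, that margin works out to a quarter of a nice-0 task:

        #include <stdio.h>

        int main(void)
        {
                unsigned long nice0 = 1024;     /* scale_load_down(NICE_0_LOAD)       */
                int imbalance_pct = 125;        /* illustrative fork/exec-level value */

                printf("%lu\n", nice0 * (imbalance_pct - 100) / 100);   /* 256 */
                return 0;
        }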
| @@ -5963,7 +5696,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this | |||
| 5963 | return cpumask_first(sched_group_span(group)); | 5696 | return cpumask_first(sched_group_span(group)); |
| 5964 | 5697 | ||
| 5965 | /* Traverse only the allowed CPUs */ | 5698 | /* Traverse only the allowed CPUs */ |
| 5966 | for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { | 5699 | for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { |
| 5967 | if (available_idle_cpu(i)) { | 5700 | if (available_idle_cpu(i)) { |
| 5968 | struct rq *rq = cpu_rq(i); | 5701 | struct rq *rq = cpu_rq(i); |
| 5969 | struct cpuidle_state *idle = idle_get_state(rq); | 5702 | struct cpuidle_state *idle = idle_get_state(rq); |
| @@ -5987,7 +5720,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this | |||
| 5987 | shallowest_idle_cpu = i; | 5720 | shallowest_idle_cpu = i; |
| 5988 | } | 5721 | } |
| 5989 | } else if (shallowest_idle_cpu == -1) { | 5722 | } else if (shallowest_idle_cpu == -1) { |
| 5990 | load = weighted_cpuload(cpu_rq(i)); | 5723 | load = cpu_runnable_load(cpu_rq(i)); |
| 5991 | if (load < min_load) { | 5724 | if (load < min_load) { |
| 5992 | min_load = load; | 5725 | min_load = load; |
| 5993 | least_loaded_cpu = i; | 5726 | least_loaded_cpu = i; |
| @@ -6003,7 +5736,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p | |||
| 6003 | { | 5736 | { |
| 6004 | int new_cpu = cpu; | 5737 | int new_cpu = cpu; |
| 6005 | 5738 | ||
| 6006 | if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) | 5739 | if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) |
| 6007 | return prev_cpu; | 5740 | return prev_cpu; |
| 6008 | 5741 | ||
| 6009 | /* | 5742 | /* |
| @@ -6120,7 +5853,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int | |||
| 6120 | if (!test_idle_cores(target, false)) | 5853 | if (!test_idle_cores(target, false)) |
| 6121 | return -1; | 5854 | return -1; |
| 6122 | 5855 | ||
| 6123 | cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); | 5856 | cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); |
| 6124 | 5857 | ||
| 6125 | for_each_cpu_wrap(core, cpus, target) { | 5858 | for_each_cpu_wrap(core, cpus, target) { |
| 6126 | bool idle = true; | 5859 | bool idle = true; |
| @@ -6154,7 +5887,7 @@ static int select_idle_smt(struct task_struct *p, int target) | |||
| 6154 | return -1; | 5887 | return -1; |
| 6155 | 5888 | ||
| 6156 | for_each_cpu(cpu, cpu_smt_mask(target)) { | 5889 | for_each_cpu(cpu, cpu_smt_mask(target)) { |
| 6157 | if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) | 5890 | if (!cpumask_test_cpu(cpu, p->cpus_ptr)) |
| 6158 | continue; | 5891 | continue; |
| 6159 | if (available_idle_cpu(cpu)) | 5892 | if (available_idle_cpu(cpu)) |
| 6160 | return cpu; | 5893 | return cpu; |
| @@ -6189,6 +5922,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t | |||
| 6189 | u64 time, cost; | 5922 | u64 time, cost; |
| 6190 | s64 delta; | 5923 | s64 delta; |
| 6191 | int cpu, nr = INT_MAX; | 5924 | int cpu, nr = INT_MAX; |
| 5925 | int this = smp_processor_id(); | ||
| 6192 | 5926 | ||
| 6193 | this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); | 5927 | this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); |
| 6194 | if (!this_sd) | 5928 | if (!this_sd) |
| @@ -6212,18 +5946,18 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t | |||
| 6212 | nr = 4; | 5946 | nr = 4; |
| 6213 | } | 5947 | } |
| 6214 | 5948 | ||
| 6215 | time = local_clock(); | 5949 | time = cpu_clock(this); |
| 6216 | 5950 | ||
| 6217 | for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { | 5951 | for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { |
| 6218 | if (!--nr) | 5952 | if (!--nr) |
| 6219 | return -1; | 5953 | return -1; |
| 6220 | if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) | 5954 | if (!cpumask_test_cpu(cpu, p->cpus_ptr)) |
| 6221 | continue; | 5955 | continue; |
| 6222 | if (available_idle_cpu(cpu)) | 5956 | if (available_idle_cpu(cpu)) |
| 6223 | break; | 5957 | break; |
| 6224 | } | 5958 | } |
| 6225 | 5959 | ||
| 6226 | time = local_clock() - time; | 5960 | time = cpu_clock(this) - time; |
| 6227 | cost = this_sd->avg_scan_cost; | 5961 | cost = this_sd->avg_scan_cost; |
| 6228 | delta = (s64)(time - cost) / 8; | 5962 | delta = (s64)(time - cost) / 8; |
| 6229 | this_sd->avg_scan_cost += delta; | 5963 | this_sd->avg_scan_cost += delta; |
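select_idle_cpu() keeps charging the scan cost to sd->avg_scan_cost as an exponentially weighted moving average with weight 1/8, i.e. avg' = avg + (sample - avg)/8; only the clock source changes from local_clock() to cpu_clock() on the scanning CPU. A quick numeric check:

        #include <stdio.h>

        int main(void)
        {
                long long avg = 8000, sample = 16000;   /* ns, made-up values */

                avg += (sample - avg) / 8;              /* 7/8 old + 1/8 new  */
                printf("%lld\n", avg);                  /* 9000 */
                return 0;
        }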
| @@ -6254,7 +5988,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
| 6254 | recent_used_cpu != target && | 5988 | recent_used_cpu != target && |
| 6255 | cpus_share_cache(recent_used_cpu, target) && | 5989 | cpus_share_cache(recent_used_cpu, target) && |
| 6256 | available_idle_cpu(recent_used_cpu) && | 5990 | available_idle_cpu(recent_used_cpu) && |
| 6257 | cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { | 5991 | cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { |
| 6258 | /* | 5992 | /* |
| 6259 | * Replace recent_used_cpu with prev as it is a potential | 5993 | * Replace recent_used_cpu with prev as it is a potential |
| 6260 | * candidate for the next wake: | 5994 | * candidate for the next wake: |
| @@ -6498,11 +6232,21 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) | |||
| 6498 | static long | 6232 | static long |
| 6499 | compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) | 6233 | compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) |
| 6500 | { | 6234 | { |
| 6501 | long util, max_util, sum_util, energy = 0; | 6235 | unsigned int max_util, util_cfs, cpu_util, cpu_cap; |
| 6236 | unsigned long sum_util, energy = 0; | ||
| 6237 | struct task_struct *tsk; | ||
| 6502 | int cpu; | 6238 | int cpu; |
| 6503 | 6239 | ||
| 6504 | for (; pd; pd = pd->next) { | 6240 | for (; pd; pd = pd->next) { |
| 6241 | struct cpumask *pd_mask = perf_domain_span(pd); | ||
| 6242 | |||
| 6243 | /* | ||
| 6244 | * The energy model mandates all the CPUs of a performance | ||
| 6245 | * domain have the same capacity. | ||
| 6246 | */ | ||
| 6247 | cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); | ||
| 6505 | max_util = sum_util = 0; | 6248 | max_util = sum_util = 0; |
| 6249 | |||
| 6506 | /* | 6250 | /* |
| 6507 | * The capacity state of CPUs of the current rd can be driven by | 6251 | * The capacity state of CPUs of the current rd can be driven by |
| 6508 | * CPUs of another rd if they belong to the same performance | 6252 | * CPUs of another rd if they belong to the same performance |
| @@ -6513,11 +6257,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) | |||
| 6513 | * it will not appear in its pd list and will not be accounted | 6257 | * it will not appear in its pd list and will not be accounted |
| 6514 | * by compute_energy(). | 6258 | * by compute_energy(). |
| 6515 | */ | 6259 | */ |
| 6516 | for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { | 6260 | for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { |
| 6517 | util = cpu_util_next(cpu, p, dst_cpu); | 6261 | util_cfs = cpu_util_next(cpu, p, dst_cpu); |
| 6518 | util = schedutil_energy_util(cpu, util); | 6262 | |
| 6519 | max_util = max(util, max_util); | 6263 | /* |
| 6520 | sum_util += util; | 6264 | * Busy time computation: utilization clamping is not |
| 6265 | * required since the ratio (sum_util / cpu_capacity) | ||
| 6266 | * is already enough to scale the EM reported power | ||
| 6267 | * consumption at the (eventually clamped) cpu_capacity. | ||
| 6268 | */ | ||
| 6269 | sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, | ||
| 6270 | ENERGY_UTIL, NULL); | ||
| 6271 | |||
| 6272 | /* | ||
| 6273 | * Performance domain frequency: utilization clamping | ||
| 6274 | * must be considered since it affects the selection | ||
| 6275 | * of the performance domain frequency. | ||
| 6276 | * NOTE: in case RT tasks are running, by default the | ||
| 6277 | * FREQUENCY_UTIL's utilization can be max OPP. | ||
| 6278 | */ | ||
| 6279 | tsk = cpu == dst_cpu ? p : NULL; | ||
| 6280 | cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, | ||
| 6281 | FREQUENCY_UTIL, tsk); | ||
| 6282 | max_util = max(max_util, cpu_util); | ||
| 6521 | } | 6283 | } |
| 6522 | 6284 | ||
| 6523 | energy += em_pd_energy(pd->em_pd, max_util, sum_util); | 6285 | energy += em_pd_energy(pd->em_pd, max_util, sum_util); |
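compute_energy() now asks schedutil_cpu_util() for two different views of each CPU, exactly as the new comments describe: the unclamped ENERGY_UTIL sum approximates busy time, while the clamped FREQUENCY_UTIL maximum picks the performance point of the domain. Roughly, per performance domain (the exact scaling is internal to em_pd_energy()):

        \[
          E_{pd} \;\approx\; P\big(\mathrm{OPP}(max\_util)\big) \times
                 \frac{sum\_util}{cpu\_capacity}
        \]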
| @@ -6600,7 +6362,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) | |||
| 6600 | int max_spare_cap_cpu = -1; | 6362 | int max_spare_cap_cpu = -1; |
| 6601 | 6363 | ||
| 6602 | for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { | 6364 | for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { |
| 6603 | if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) | 6365 | if (!cpumask_test_cpu(cpu, p->cpus_ptr)) |
| 6604 | continue; | 6366 | continue; |
| 6605 | 6367 | ||
| 6606 | /* Skip CPUs that will be overutilized. */ | 6368 | /* Skip CPUs that will be overutilized. */ |
| @@ -6689,7 +6451,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
| 6689 | } | 6451 | } |
| 6690 | 6452 | ||
| 6691 | want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && | 6453 | want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && |
| 6692 | cpumask_test_cpu(cpu, &p->cpus_allowed); | 6454 | cpumask_test_cpu(cpu, p->cpus_ptr); |
| 6693 | } | 6455 | } |
| 6694 | 6456 | ||
| 6695 | rcu_read_lock(); | 6457 | rcu_read_lock(); |
| @@ -7445,14 +7207,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 7445 | /* | 7207 | /* |
| 7446 | * We do not migrate tasks that are: | 7208 | * We do not migrate tasks that are: |
| 7447 | * 1) throttled_lb_pair, or | 7209 | * 1) throttled_lb_pair, or |
| 7448 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 7210 | * 2) cannot be migrated to this CPU due to cpus_ptr, or |
| 7449 | * 3) running (obviously), or | 7211 | * 3) running (obviously), or |
| 7450 | * 4) are cache-hot on their current CPU. | 7212 | * 4) are cache-hot on their current CPU. |
| 7451 | */ | 7213 | */ |
| 7452 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) | 7214 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) |
| 7453 | return 0; | 7215 | return 0; |
| 7454 | 7216 | ||
| 7455 | if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { | 7217 | if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { |
| 7456 | int cpu; | 7218 | int cpu; |
| 7457 | 7219 | ||
| 7458 | schedstat_inc(p->se.statistics.nr_failed_migrations_affine); | 7220 | schedstat_inc(p->se.statistics.nr_failed_migrations_affine); |
| @@ -7472,7 +7234,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
| 7472 | 7234 | ||
| 7473 | /* Prevent to re-select dst_cpu via env's CPUs: */ | 7235 | /* Prevent to re-select dst_cpu via env's CPUs: */ |
| 7474 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { | 7236 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
| 7475 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { | 7237 | if (cpumask_test_cpu(cpu, p->cpus_ptr)) { |
| 7476 | env->flags |= LBF_DST_PINNED; | 7238 | env->flags |= LBF_DST_PINNED; |
| 7477 | env->new_dst_cpu = cpu; | 7239 | env->new_dst_cpu = cpu; |
| 7478 | break; | 7240 | break; |
| @@ -7558,7 +7320,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) | |||
| 7558 | static const unsigned int sched_nr_migrate_break = 32; | 7320 | static const unsigned int sched_nr_migrate_break = 32; |
| 7559 | 7321 | ||
| 7560 | /* | 7322 | /* |
| 7561 | * detach_tasks() -- tries to detach up to imbalance weighted load from | 7323 | * detach_tasks() -- tries to detach up to imbalance runnable load from |
| 7562 | * busiest_rq, as part of a balancing operation within domain "sd". | 7324 | * busiest_rq, as part of a balancing operation within domain "sd". |
| 7563 | * | 7325 | * |
| 7564 | * Returns number of detached tasks if successful and 0 otherwise. | 7326 | * Returns number of detached tasks if successful and 0 otherwise. |
| @@ -7626,7 +7388,7 @@ static int detach_tasks(struct lb_env *env) | |||
| 7626 | 7388 | ||
| 7627 | /* | 7389 | /* |
| 7628 | * We only want to steal up to the prescribed amount of | 7390 | * We only want to steal up to the prescribed amount of |
| 7629 | * weighted load. | 7391 | * runnable load. |
| 7630 | */ | 7392 | */ |
| 7631 | if (env->imbalance <= 0) | 7393 | if (env->imbalance <= 0) |
| 7632 | break; | 7394 | break; |
| @@ -7695,6 +7457,7 @@ static void attach_tasks(struct lb_env *env) | |||
| 7695 | rq_unlock(env->dst_rq, &rf); | 7457 | rq_unlock(env->dst_rq, &rf); |
| 7696 | } | 7458 | } |
| 7697 | 7459 | ||
| 7460 | #ifdef CONFIG_NO_HZ_COMMON | ||
| 7698 | static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) | 7461 | static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) |
| 7699 | { | 7462 | { |
| 7700 | if (cfs_rq->avg.load_avg) | 7463 | if (cfs_rq->avg.load_avg) |
| @@ -7722,6 +7485,19 @@ static inline bool others_have_blocked(struct rq *rq) | |||
| 7722 | return false; | 7485 | return false; |
| 7723 | } | 7486 | } |
| 7724 | 7487 | ||
| 7488 | static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) | ||
| 7489 | { | ||
| 7490 | rq->last_blocked_load_update_tick = jiffies; | ||
| 7491 | |||
| 7492 | if (!has_blocked) | ||
| 7493 | rq->has_blocked_load = 0; | ||
| 7494 | } | ||
| 7495 | #else | ||
| 7496 | static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } | ||
| 7497 | static inline bool others_have_blocked(struct rq *rq) { return false; } | ||
| 7498 | static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} | ||
| 7499 | #endif | ||
| 7500 | |||
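The new update_blocked_load_status() helper and its empty counterparts follow the usual stub pattern: when CONFIG_NO_HZ_COMMON is not set, static inline no-ops keep the call sites below free of #ifdef blocks. A stand-alone sketch of the pattern, with placeholder names (FEATURE_X is not a kernel config symbol):

#include <stdbool.h>
#include <stdio.h>

#ifdef FEATURE_X                     /* placeholder config switch */
static inline bool feature_has_work(int v)     { return v != 0; }
static inline void feature_note_update(int *s) { *s += 1; }
#else                                /* feature compiled out: empty stubs */
static inline bool feature_has_work(int v)     { (void)v; return false; }
static inline void feature_note_update(int *s) { (void)s; }
#endif

/* The call site is identical whether or not FEATURE_X is built in. */
static bool update_path(int pending, int *state)
{
	feature_note_update(state);
	return feature_has_work(pending);
}

int main(void)
{
	int state = 0;

	printf("has work: %d, state: %d\n", update_path(3, &state), state);
	return 0;
}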
| 7725 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7501 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 7726 | 7502 | ||
| 7727 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) | 7503 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) |
| @@ -7787,11 +7563,7 @@ static void update_blocked_averages(int cpu) | |||
| 7787 | if (others_have_blocked(rq)) | 7563 | if (others_have_blocked(rq)) |
| 7788 | done = false; | 7564 | done = false; |
| 7789 | 7565 | ||
| 7790 | #ifdef CONFIG_NO_HZ_COMMON | 7566 | update_blocked_load_status(rq, !done); |
| 7791 | rq->last_blocked_load_update_tick = jiffies; | ||
| 7792 | if (done) | ||
| 7793 | rq->has_blocked_load = 0; | ||
| 7794 | #endif | ||
| 7795 | rq_unlock_irqrestore(rq, &rf); | 7567 | rq_unlock_irqrestore(rq, &rf); |
| 7796 | } | 7568 | } |
| 7797 | 7569 | ||
| @@ -7857,11 +7629,7 @@ static inline void update_blocked_averages(int cpu) | |||
| 7857 | update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); | 7629 | update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); |
| 7858 | update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); | 7630 | update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); |
| 7859 | update_irq_load_avg(rq, 0); | 7631 | update_irq_load_avg(rq, 0); |
| 7860 | #ifdef CONFIG_NO_HZ_COMMON | 7632 | update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); |
| 7861 | rq->last_blocked_load_update_tick = jiffies; | ||
| 7862 | if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) | ||
| 7863 | rq->has_blocked_load = 0; | ||
| 7864 | #endif | ||
| 7865 | rq_unlock_irqrestore(rq, &rf); | 7633 | rq_unlock_irqrestore(rq, &rf); |
| 7866 | } | 7634 | } |
| 7867 | 7635 | ||
| @@ -7879,7 +7647,6 @@ static unsigned long task_h_load(struct task_struct *p) | |||
| 7879 | struct sg_lb_stats { | 7647 | struct sg_lb_stats { |
| 7880 | unsigned long avg_load; /*Avg load across the CPUs of the group */ | 7648 | unsigned long avg_load; /*Avg load across the CPUs of the group */ |
| 7881 | unsigned long group_load; /* Total load over the CPUs of the group */ | 7649 | unsigned long group_load; /* Total load over the CPUs of the group */ |
| 7882 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | ||
| 7883 | unsigned long load_per_task; | 7650 | unsigned long load_per_task; |
| 7884 | unsigned long group_capacity; | 7651 | unsigned long group_capacity; |
| 7885 | unsigned long group_util; /* Total utilization of the group */ | 7652 | unsigned long group_util; /* Total utilization of the group */ |
| @@ -7933,38 +7700,10 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | |||
| 7933 | }; | 7700 | }; |
| 7934 | } | 7701 | } |
| 7935 | 7702 | ||
| 7936 | /** | ||
| 7937 | * get_sd_load_idx - Obtain the load index for a given sched domain. | ||
| 7938 | * @sd: The sched_domain whose load_idx is to be obtained. | ||
| 7939 | * @idle: The idle status of the CPU for whose sd load_idx is obtained. | ||
| 7940 | * | ||
| 7941 | * Return: The load index. | ||
| 7942 | */ | ||
| 7943 | static inline int get_sd_load_idx(struct sched_domain *sd, | ||
| 7944 | enum cpu_idle_type idle) | ||
| 7945 | { | ||
| 7946 | int load_idx; | ||
| 7947 | |||
| 7948 | switch (idle) { | ||
| 7949 | case CPU_NOT_IDLE: | ||
| 7950 | load_idx = sd->busy_idx; | ||
| 7951 | break; | ||
| 7952 | |||
| 7953 | case CPU_NEWLY_IDLE: | ||
| 7954 | load_idx = sd->newidle_idx; | ||
| 7955 | break; | ||
| 7956 | default: | ||
| 7957 | load_idx = sd->idle_idx; | ||
| 7958 | break; | ||
| 7959 | } | ||
| 7960 | |||
| 7961 | return load_idx; | ||
| 7962 | } | ||
| 7963 | |||
| 7964 | static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) | 7703 | static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) |
| 7965 | { | 7704 | { |
| 7966 | struct rq *rq = cpu_rq(cpu); | 7705 | struct rq *rq = cpu_rq(cpu); |
| 7967 | unsigned long max = arch_scale_cpu_capacity(sd, cpu); | 7706 | unsigned long max = arch_scale_cpu_capacity(cpu); |
| 7968 | unsigned long used, free; | 7707 | unsigned long used, free; |
| 7969 | unsigned long irq; | 7708 | unsigned long irq; |
| 7970 | 7709 | ||
| @@ -7989,7 +7728,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) | |||
| 7989 | unsigned long capacity = scale_rt_capacity(sd, cpu); | 7728 | unsigned long capacity = scale_rt_capacity(sd, cpu); |
| 7990 | struct sched_group *sdg = sd->groups; | 7729 | struct sched_group *sdg = sd->groups; |
| 7991 | 7730 | ||
| 7992 | cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); | 7731 | cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); |
| 7993 | 7732 | ||
| 7994 | if (!capacity) | 7733 | if (!capacity) |
| 7995 | capacity = 1; | 7734 | capacity = 1; |
| @@ -8099,7 +7838,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) | |||
| 8099 | 7838 | ||
| 8100 | /* | 7839 | /* |
| 8101 | * Group imbalance indicates (and tries to solve) the problem where balancing | 7840 | * Group imbalance indicates (and tries to solve) the problem where balancing |
| 8102 | * groups is inadequate due to ->cpus_allowed constraints. | 7841 | * groups is inadequate due to ->cpus_ptr constraints. |
| 8103 | * | 7842 | * |
| 8104 | * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a | 7843 | * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a |
| 8105 | * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. | 7844 | * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. |
| @@ -8249,9 +7988,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 8249 | struct sg_lb_stats *sgs, | 7988 | struct sg_lb_stats *sgs, |
| 8250 | int *sg_status) | 7989 | int *sg_status) |
| 8251 | { | 7990 | { |
| 8252 | int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); | ||
| 8253 | int load_idx = get_sd_load_idx(env->sd, env->idle); | ||
| 8254 | unsigned long load; | ||
| 8255 | int i, nr_running; | 7991 | int i, nr_running; |
| 8256 | 7992 | ||
| 8257 | memset(sgs, 0, sizeof(*sgs)); | 7993 | memset(sgs, 0, sizeof(*sgs)); |
| @@ -8262,13 +7998,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 8262 | if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) | 7998 | if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) |
| 8263 | env->flags |= LBF_NOHZ_AGAIN; | 7999 | env->flags |= LBF_NOHZ_AGAIN; |
| 8264 | 8000 | ||
| 8265 | /* Bias balancing toward CPUs of our domain: */ | 8001 | sgs->group_load += cpu_runnable_load(rq); |
| 8266 | if (local_group) | ||
| 8267 | load = target_load(i, load_idx); | ||
| 8268 | else | ||
| 8269 | load = source_load(i, load_idx); | ||
| 8270 | |||
| 8271 | sgs->group_load += load; | ||
| 8272 | sgs->group_util += cpu_util(i); | 8002 | sgs->group_util += cpu_util(i); |
| 8273 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 8003 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
| 8274 | 8004 | ||
| @@ -8283,7 +8013,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 8283 | sgs->nr_numa_running += rq->nr_numa_running; | 8013 | sgs->nr_numa_running += rq->nr_numa_running; |
| 8284 | sgs->nr_preferred_running += rq->nr_preferred_running; | 8014 | sgs->nr_preferred_running += rq->nr_preferred_running; |
| 8285 | #endif | 8015 | #endif |
| 8286 | sgs->sum_weighted_load += weighted_cpuload(rq); | ||
| 8287 | /* | 8016 | /* |
| 8288 | * No need to call idle_cpu() if nr_running is not 0 | 8017 | * No need to call idle_cpu() if nr_running is not 0 |
| 8289 | */ | 8018 | */ |
| @@ -8302,7 +8031,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 8302 | sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; | 8031 | sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; |
| 8303 | 8032 | ||
| 8304 | if (sgs->sum_nr_running) | 8033 | if (sgs->sum_nr_running) |
| 8305 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 8034 | sgs->load_per_task = sgs->group_load / sgs->sum_nr_running; |
| 8306 | 8035 | ||
| 8307 | sgs->group_weight = group->group_weight; | 8036 | sgs->group_weight = group->group_weight; |
| 8308 | 8037 | ||
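With sum_weighted_load gone, load_per_task is now derived from the same group_load that is accumulated from cpu_runnable_load(), and avg_load is still normalized by group capacity. A worked example with invented numbers (SCHED_CAPACITY_SCALE is 1024 in the kernel; everything else below is illustrative):

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024UL

int main(void)
{
	unsigned long group_load     = 1536;  /* sum of cpu_runnable_load() */
	unsigned long group_capacity = 2048;  /* e.g. two big CPUs */
	unsigned long sum_nr_running = 3;

	unsigned long avg_load      = group_load * SCHED_CAPACITY_SCALE / group_capacity;
	unsigned long load_per_task = group_load / sum_nr_running;

	printf("avg_load=%lu load_per_task=%lu\n", avg_load, load_per_task);
	return 0;
}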
| @@ -8516,8 +8245,12 @@ next_group: | |||
| 8516 | 8245 | ||
| 8517 | /* Update over-utilization (tipping point, U >= 0) indicator */ | 8246 | /* Update over-utilization (tipping point, U >= 0) indicator */ |
| 8518 | WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); | 8247 | WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); |
| 8248 | trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); | ||
| 8519 | } else if (sg_status & SG_OVERUTILIZED) { | 8249 | } else if (sg_status & SG_OVERUTILIZED) { |
| 8520 | WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); | 8250 | struct root_domain *rd = env->dst_rq->rd; |
| 8251 | |||
| 8252 | WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); | ||
| 8253 | trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); | ||
| 8521 | } | 8254 | } |
| 8522 | } | 8255 | } |
| 8523 | 8256 | ||
| @@ -8723,7 +8456,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 8723 | * find_busiest_group - Returns the busiest group within the sched_domain | 8456 | * find_busiest_group - Returns the busiest group within the sched_domain |
| 8724 | * if there is an imbalance. | 8457 | * if there is an imbalance. |
| 8725 | * | 8458 | * |
| 8726 | * Also calculates the amount of weighted load which should be moved | 8459 | * Also calculates the amount of runnable load which should be moved |
| 8727 | * to restore balance. | 8460 | * to restore balance. |
| 8728 | * | 8461 | * |
| 8729 | * @env: The load balancing environment. | 8462 | * @env: The load balancing environment. |
| @@ -8768,7 +8501,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
| 8768 | /* | 8501 | /* |
| 8769 | * If the busiest group is imbalanced the below checks don't | 8502 | * If the busiest group is imbalanced the below checks don't |
| 8770 | * work because they assume all things are equal, which typically | 8503 | * work because they assume all things are equal, which typically |
| 8771 | * isn't true due to cpus_allowed constraints and the like. | 8504 | * isn't true due to cpus_ptr constraints and the like. |
| 8772 | */ | 8505 | */ |
| 8773 | if (busiest->group_type == group_imbalanced) | 8506 | if (busiest->group_type == group_imbalanced) |
| 8774 | goto force_balance; | 8507 | goto force_balance; |
| @@ -8842,7 +8575,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 8842 | int i; | 8575 | int i; |
| 8843 | 8576 | ||
| 8844 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { | 8577 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { |
| 8845 | unsigned long capacity, wl; | 8578 | unsigned long capacity, load; |
| 8846 | enum fbq_type rt; | 8579 | enum fbq_type rt; |
| 8847 | 8580 | ||
| 8848 | rq = cpu_rq(i); | 8581 | rq = cpu_rq(i); |
| @@ -8896,30 +8629,30 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
| 8896 | rq->nr_running == 1) | 8629 | rq->nr_running == 1) |
| 8897 | continue; | 8630 | continue; |
| 8898 | 8631 | ||
| 8899 | wl = weighted_cpuload(rq); | 8632 | load = cpu_runnable_load(rq); |
| 8900 | 8633 | ||
| 8901 | /* | 8634 | /* |
| 8902 | * When comparing with imbalance, use weighted_cpuload() | 8635 | * When comparing with imbalance, use cpu_runnable_load() |
| 8903 | * which is not scaled with the CPU capacity. | 8636 | * which is not scaled with the CPU capacity. |
| 8904 | */ | 8637 | */ |
| 8905 | 8638 | ||
| 8906 | if (rq->nr_running == 1 && wl > env->imbalance && | 8639 | if (rq->nr_running == 1 && load > env->imbalance && |
| 8907 | !check_cpu_capacity(rq, env->sd)) | 8640 | !check_cpu_capacity(rq, env->sd)) |
| 8908 | continue; | 8641 | continue; |
| 8909 | 8642 | ||
| 8910 | /* | 8643 | /* |
| 8911 | * For the load comparisons with the other CPU's, consider | 8644 | * For the load comparisons with the other CPU's, consider |
| 8912 | * the weighted_cpuload() scaled with the CPU capacity, so | 8645 | * the cpu_runnable_load() scaled with the CPU capacity, so |
| 8913 | * that the load can be moved away from the CPU that is | 8646 | * that the load can be moved away from the CPU that is |
| 8914 | * potentially running at a lower capacity. | 8647 | * potentially running at a lower capacity. |
| 8915 | * | 8648 | * |
| 8916 | * Thus we're looking for max(wl_i / capacity_i), crosswise | 8649 | * Thus we're looking for max(load_i / capacity_i), crosswise |
| 8917 | * multiplication to rid ourselves of the division works out | 8650 | * multiplication to rid ourselves of the division works out |
| 8918 | * to: wl_i * capacity_j > wl_j * capacity_i; where j is | 8651 | * to: load_i * capacity_j > load_j * capacity_i; where j is |
| 8919 | * our previous maximum. | 8652 | * our previous maximum. |
| 8920 | */ | 8653 | */ |
| 8921 | if (wl * busiest_capacity > busiest_load * capacity) { | 8654 | if (load * busiest_capacity > busiest_load * capacity) { |
| 8922 | busiest_load = wl; | 8655 | busiest_load = load; |
| 8923 | busiest_capacity = capacity; | 8656 | busiest_capacity = capacity; |
| 8924 | busiest = rq; | 8657 | busiest = rq; |
| 8925 | } | 8658 | } |
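The renamed variables make the comparison easier to read, but the arithmetic is unchanged: to find max(load_i / capacity_i) without a division, the code cross-multiplies against the current maximum. A stand-alone sketch with invented numbers, showing why a nearly full little CPU can beat a more loaded big CPU:

#include <stdio.h>

struct candidate { unsigned long load, capacity; };

int main(void)
{
	struct candidate cpus[] = {
		{ .load = 900, .capacity = 1024 },  /* big CPU, fairly loaded  */
		{ .load = 400, .capacity =  430 },  /* little CPU, nearly full */
	};
	unsigned long busiest_load = 0, busiest_capacity = 1;
	int busiest = -1;

	for (int i = 0; i < 2; i++) {
		/* same test as the kernel: load_i * cap_j > load_j * cap_i */
		if (cpus[i].load * busiest_capacity > busiest_load * cpus[i].capacity) {
			busiest_load = cpus[i].load;
			busiest_capacity = cpus[i].capacity;
			busiest = i;
		}
	}
	printf("busiest candidate: cpu%d\n", busiest);  /* cpu1: 400/430 > 900/1024 */
	return 0;
}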
| @@ -9210,7 +8943,7 @@ more_balance: | |||
| 9210 | * if the curr task on busiest CPU can't be | 8943 | * if the curr task on busiest CPU can't be |
| 9211 | * moved to this_cpu: | 8944 | * moved to this_cpu: |
| 9212 | */ | 8945 | */ |
| 9213 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { | 8946 | if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { |
| 9214 | raw_spin_unlock_irqrestore(&busiest->lock, | 8947 | raw_spin_unlock_irqrestore(&busiest->lock, |
| 9215 | flags); | 8948 | flags); |
| 9216 | env.flags |= LBF_ALL_PINNED; | 8949 | env.flags |= LBF_ALL_PINNED; |
| @@ -9879,7 +9612,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, | |||
| 9879 | 9612 | ||
| 9880 | rq_lock_irqsave(rq, &rf); | 9613 | rq_lock_irqsave(rq, &rf); |
| 9881 | update_rq_clock(rq); | 9614 | update_rq_clock(rq); |
| 9882 | cpu_load_update_idle(rq); | ||
| 9883 | rq_unlock_irqrestore(rq, &rf); | 9615 | rq_unlock_irqrestore(rq, &rf); |
| 9884 | 9616 | ||
| 9885 | if (flags & NOHZ_BALANCE_KICK) | 9617 | if (flags & NOHZ_BALANCE_KICK) |
| @@ -10690,6 +10422,10 @@ const struct sched_class fair_sched_class = { | |||
| 10690 | #ifdef CONFIG_FAIR_GROUP_SCHED | 10422 | #ifdef CONFIG_FAIR_GROUP_SCHED |
| 10691 | .task_change_group = task_change_group_fair, | 10423 | .task_change_group = task_change_group_fair, |
| 10692 | #endif | 10424 | #endif |
| 10425 | |||
| 10426 | #ifdef CONFIG_UCLAMP_TASK | ||
| 10427 | .uclamp_enabled = 1, | ||
| 10428 | #endif | ||
| 10693 | }; | 10429 | }; |
| 10694 | 10430 | ||
| 10695 | #ifdef CONFIG_SCHED_DEBUG | 10431 | #ifdef CONFIG_SCHED_DEBUG |
| @@ -10737,3 +10473,83 @@ __init void init_sched_fair_class(void) | |||
| 10737 | #endif /* SMP */ | 10473 | #endif /* SMP */ |
| 10738 | 10474 | ||
| 10739 | } | 10475 | } |
| 10476 | |||
| 10477 | /* | ||
| 10478 | * Helper functions to facilitate extracting info from tracepoints. | ||
| 10479 | */ | ||
| 10480 | |||
| 10481 | const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) | ||
| 10482 | { | ||
| 10483 | #ifdef CONFIG_SMP | ||
| 10484 | return cfs_rq ? &cfs_rq->avg : NULL; | ||
| 10485 | #else | ||
| 10486 | return NULL; | ||
| 10487 | #endif | ||
| 10488 | } | ||
| 10489 | EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); | ||
| 10490 | |||
| 10491 | char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) | ||
| 10492 | { | ||
| 10493 | if (!cfs_rq) { | ||
| 10494 | if (str) | ||
| 10495 | strlcpy(str, "(null)", len); | ||
| 10496 | else | ||
| 10497 | return NULL; | ||
| 10498 | } | ||
| 10499 | |||
| 10500 | cfs_rq_tg_path(cfs_rq, str, len); | ||
| 10501 | return str; | ||
| 10502 | } | ||
| 10503 | EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); | ||
| 10504 | |||
| 10505 | int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) | ||
| 10506 | { | ||
| 10507 | return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; | ||
| 10508 | } | ||
| 10509 | EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); | ||
| 10510 | |||
| 10511 | const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) | ||
| 10512 | { | ||
| 10513 | #ifdef CONFIG_SMP | ||
| 10514 | return rq ? &rq->avg_rt : NULL; | ||
| 10515 | #else | ||
| 10516 | return NULL; | ||
| 10517 | #endif | ||
| 10518 | } | ||
| 10519 | EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); | ||
| 10520 | |||
| 10521 | const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) | ||
| 10522 | { | ||
| 10523 | #ifdef CONFIG_SMP | ||
| 10524 | return rq ? &rq->avg_dl : NULL; | ||
| 10525 | #else | ||
| 10526 | return NULL; | ||
| 10527 | #endif | ||
| 10528 | } | ||
| 10529 | EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); | ||
| 10530 | |||
| 10531 | const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) | ||
| 10532 | { | ||
| 10533 | #if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) | ||
| 10534 | return rq ? &rq->avg_irq : NULL; | ||
| 10535 | #else | ||
| 10536 | return NULL; | ||
| 10537 | #endif | ||
| 10538 | } | ||
| 10539 | EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); | ||
| 10540 | |||
| 10541 | int sched_trace_rq_cpu(struct rq *rq) | ||
| 10542 | { | ||
| 10543 | return rq ? cpu_of(rq) : -1; | ||
| 10544 | } | ||
| 10545 | EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); | ||
| 10546 | |||
| 10547 | const struct cpumask *sched_trace_rd_span(struct root_domain *rd) | ||
| 10548 | { | ||
| 10549 | #ifdef CONFIG_SMP | ||
| 10550 | return rd ? rd->span : NULL; | ||
| 10551 | #else | ||
| 10552 | return NULL; | ||
| 10553 | #endif | ||
| 10554 | } | ||
| 10555 | EXPORT_SYMBOL_GPL(sched_trace_rd_span); | ||
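The exported sched_trace_*() accessors exist so that code attached to the new bare scheduler tracepoints can read scheduler state without access to the private struct layouts. Below is a hedged sketch of a module-side probe that consumes them; the pelt_cfs_tp tracepoint and its generated register/unregister helpers are assumed to be declared in <trace/events/sched.h>, which is not part of this hunk, so treat the registration calls as assumptions.

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/tracepoint.h>
#include <trace/events/sched.h>

static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
{
	const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);
	char path[64];

	if (!avg)
		return;

	/* only the exported accessors are used; the struct stays opaque */
	sched_trace_cfs_rq_path(cfs_rq, path, sizeof(path));
	pr_debug("cfs_rq %s on cpu%d: util_avg=%lu\n",
		 path, sched_trace_cfs_rq_cpu(cfs_rq), avg->util_avg);
}

static int __init pelt_probe_init(void)
{
	return register_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

static void __exit pelt_probe_exit(void)
{
	unregister_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
	tracepoint_synchronize_unregister();
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");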
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 858589b83377..2410db5e9a35 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) | |||
| 39 | 39 | ||
| 40 | SCHED_FEAT(HRTICK, false) | 40 | SCHED_FEAT(HRTICK, false) |
| 41 | SCHED_FEAT(DOUBLE_TICK, false) | 41 | SCHED_FEAT(DOUBLE_TICK, false) |
| 42 | SCHED_FEAT(LB_BIAS, false) | ||
| 43 | 42 | ||
| 44 | /* | 43 | /* |
| 45 | * Decrement CPU capacity based on time not spent running tasks | 44 | * Decrement CPU capacity based on time not spent running tasks |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f5516bae0c1b..80940939b733 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Generic entry points for the idle threads and | 3 | * Generic entry points for the idle threads and |
| 3 | * implementation of the idle task scheduling class. | 4 | * implementation of the idle task scheduling class. |
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index 687302051a27..123ea07a3f3b 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Housekeeping management. Manage the targets for routine code that can run on | 3 | * Housekeeping management. Manage the targets for routine code that can run on |
| 3 | * any CPU: unbound workqueues, timers, kthreads and any offloadable work. | 4 | * any CPU: unbound workqueues, timers, kthreads and any offloadable work. |
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 3cd8a3a795d2..aa8d75804108 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c | |||
| @@ -1,17 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> | 3 | * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com> |
| 3 | * | 4 | * |
| 4 | * membarrier system call | 5 | * membarrier system call |
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License as published by | ||
| 8 | * the Free Software Foundation; either version 2 of the License, or | ||
| 9 | * (at your option) any later version. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, | ||
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | * GNU General Public License for more details. | ||
| 15 | */ | 6 | */ |
| 16 | #include "sched.h" | 7 | #include "sched.h" |
| 17 | 8 | ||
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index befce29bd882..a96db50d40e0 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c | |||
| @@ -28,6 +28,8 @@ | |||
| 28 | #include "sched.h" | 28 | #include "sched.h" |
| 29 | #include "pelt.h" | 29 | #include "pelt.h" |
| 30 | 30 | ||
| 31 | #include <trace/events/sched.h> | ||
| 32 | |||
| 31 | /* | 33 | /* |
| 32 | * Approximate: | 34 | * Approximate: |
| 33 | * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) | 35 | * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) |
| @@ -265,6 +267,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) | |||
| 265 | { | 267 | { |
| 266 | if (___update_load_sum(now, &se->avg, 0, 0, 0)) { | 268 | if (___update_load_sum(now, &se->avg, 0, 0, 0)) { |
| 267 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | 269 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); |
| 270 | trace_pelt_se_tp(se); | ||
| 268 | return 1; | 271 | return 1; |
| 269 | } | 272 | } |
| 270 | 273 | ||
| @@ -278,6 +281,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se | |||
| 278 | 281 | ||
| 279 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | 282 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); |
| 280 | cfs_se_util_change(&se->avg); | 283 | cfs_se_util_change(&se->avg); |
| 284 | trace_pelt_se_tp(se); | ||
| 281 | return 1; | 285 | return 1; |
| 282 | } | 286 | } |
| 283 | 287 | ||
| @@ -292,6 +296,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) | |||
| 292 | cfs_rq->curr != NULL)) { | 296 | cfs_rq->curr != NULL)) { |
| 293 | 297 | ||
| 294 | ___update_load_avg(&cfs_rq->avg, 1, 1); | 298 | ___update_load_avg(&cfs_rq->avg, 1, 1); |
| 299 | trace_pelt_cfs_tp(cfs_rq); | ||
| 295 | return 1; | 300 | return 1; |
| 296 | } | 301 | } |
| 297 | 302 | ||
| @@ -317,6 +322,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) | |||
| 317 | running)) { | 322 | running)) { |
| 318 | 323 | ||
| 319 | ___update_load_avg(&rq->avg_rt, 1, 1); | 324 | ___update_load_avg(&rq->avg_rt, 1, 1); |
| 325 | trace_pelt_rt_tp(rq); | ||
| 320 | return 1; | 326 | return 1; |
| 321 | } | 327 | } |
| 322 | 328 | ||
| @@ -340,6 +346,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) | |||
| 340 | running)) { | 346 | running)) { |
| 341 | 347 | ||
| 342 | ___update_load_avg(&rq->avg_dl, 1, 1); | 348 | ___update_load_avg(&rq->avg_dl, 1, 1); |
| 349 | trace_pelt_dl_tp(rq); | ||
| 343 | return 1; | 350 | return 1; |
| 344 | } | 351 | } |
| 345 | 352 | ||
| @@ -366,7 +373,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) | |||
| 366 | * reflect the real amount of computation | 373 | * reflect the real amount of computation |
| 367 | */ | 374 | */ |
| 368 | running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); | 375 | running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); |
| 369 | running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); | 376 | running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq))); |
| 370 | 377 | ||
| 371 | /* | 378 | /* |
| 372 | * We know the time that has been used by interrupt since last update | 379 | * We know the time that has been used by interrupt since last update |
| @@ -388,8 +395,10 @@ int update_irq_load_avg(struct rq *rq, u64 running) | |||
| 388 | 1, | 395 | 1, |
| 389 | 1); | 396 | 1); |
| 390 | 397 | ||
| 391 | if (ret) | 398 | if (ret) { |
| 392 | ___update_load_avg(&rq->avg_irq, 1, 1); | 399 | ___update_load_avg(&rq->avg_irq, 1, 1); |
| 400 | trace_pelt_irq_tp(rq); | ||
| 401 | } | ||
| 393 | 402 | ||
| 394 | return ret; | 403 | return ret; |
| 395 | } | 404 | } |
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 7489d5f56960..afff644da065 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h | |||
| @@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) | |||
| 79 | * Scale the elapsed time to reflect the real amount of | 79 | * Scale the elapsed time to reflect the real amount of |
| 80 | * computation | 80 | * computation |
| 81 | */ | 81 | */ |
| 82 | delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); | 82 | delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq))); |
| 83 | delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); | 83 | delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); |
| 84 | 84 | ||
| 85 | rq->clock_pelt += delta; | 85 | rq->clock_pelt += delta; |
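Both call sites scale the elapsed time twice, once by CPU capacity and once by current frequency, so PELT time becomes scale-invariant. A stand-alone illustration of the arithmetic, assuming cap_scale() is the usual multiply-and-shift helper (value * scale >> SCHED_CAPACITY_SHIFT, with SCHED_CAPACITY_SCALE == 1024); the capacities below are invented:

#include <stdint.h>
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10
#define cap_scale(v, s) ((v) * (s) >> SCHED_CAPACITY_SHIFT)

int main(void)
{
	uint64_t delta    = 2000000;  /* 2ms of wall-clock time, in ns */
	uint64_t cpu_cap  = 512;      /* little CPU: half of a big one */
	uint64_t freq_cap = 768;      /* running at 75% of max frequency */

	delta = cap_scale(delta, cpu_cap);    /* -> 1ms    */
	delta = cap_scale(delta, freq_cap);   /* -> 0.75ms */

	printf("scale-invariant delta = %llu ns\n", (unsigned long long)delta);
	return 0;
}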
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0e97ca9306ef..7acc632c3b82 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c | |||
| @@ -4,6 +4,9 @@ | |||
| 4 | * Copyright (c) 2018 Facebook, Inc. | 4 | * Copyright (c) 2018 Facebook, Inc. |
| 5 | * Author: Johannes Weiner <hannes@cmpxchg.org> | 5 | * Author: Johannes Weiner <hannes@cmpxchg.org> |
| 6 | * | 6 | * |
| 7 | * Polling support by Suren Baghdasaryan <surenb@google.com> | ||
| 8 | * Copyright (c) 2018 Google, Inc. | ||
| 9 | * | ||
| 7 | * When CPU, memory and IO are contended, tasks experience delays that | 10 | * When CPU, memory and IO are contended, tasks experience delays that |
| 8 | * reduce throughput and introduce latencies into the workload. Memory | 11 | * reduce throughput and introduce latencies into the workload. Memory |
| 9 | * and IO contention, in addition, can cause a full loss of forward | 12 | * and IO contention, in addition, can cause a full loss of forward |
| @@ -129,9 +132,13 @@ | |||
| 129 | #include <linux/seq_file.h> | 132 | #include <linux/seq_file.h> |
| 130 | #include <linux/proc_fs.h> | 133 | #include <linux/proc_fs.h> |
| 131 | #include <linux/seqlock.h> | 134 | #include <linux/seqlock.h> |
| 135 | #include <linux/uaccess.h> | ||
| 132 | #include <linux/cgroup.h> | 136 | #include <linux/cgroup.h> |
| 133 | #include <linux/module.h> | 137 | #include <linux/module.h> |
| 134 | #include <linux/sched.h> | 138 | #include <linux/sched.h> |
| 139 | #include <linux/ctype.h> | ||
| 140 | #include <linux/file.h> | ||
| 141 | #include <linux/poll.h> | ||
| 135 | #include <linux/psi.h> | 142 | #include <linux/psi.h> |
| 136 | #include "sched.h" | 143 | #include "sched.h" |
| 137 | 144 | ||
| @@ -140,9 +147,9 @@ static int psi_bug __read_mostly; | |||
| 140 | DEFINE_STATIC_KEY_FALSE(psi_disabled); | 147 | DEFINE_STATIC_KEY_FALSE(psi_disabled); |
| 141 | 148 | ||
| 142 | #ifdef CONFIG_PSI_DEFAULT_DISABLED | 149 | #ifdef CONFIG_PSI_DEFAULT_DISABLED |
| 143 | bool psi_enable; | 150 | static bool psi_enable; |
| 144 | #else | 151 | #else |
| 145 | bool psi_enable = true; | 152 | static bool psi_enable = true; |
| 146 | #endif | 153 | #endif |
| 147 | static int __init setup_psi(char *str) | 154 | static int __init setup_psi(char *str) |
| 148 | { | 155 | { |
| @@ -156,16 +163,21 @@ __setup("psi=", setup_psi); | |||
| 156 | #define EXP_60s 1981 /* 1/exp(2s/60s) */ | 163 | #define EXP_60s 1981 /* 1/exp(2s/60s) */ |
| 157 | #define EXP_300s 2034 /* 1/exp(2s/300s) */ | 164 | #define EXP_300s 2034 /* 1/exp(2s/300s) */ |
| 158 | 165 | ||
| 166 | /* PSI trigger definitions */ | ||
| 167 | #define WINDOW_MIN_US 500000 /* Min window size is 500ms */ | ||
| 168 | #define WINDOW_MAX_US 10000000 /* Max window size is 10s */ | ||
| 169 | #define UPDATES_PER_WINDOW 10 /* 10 updates per window */ | ||
| 170 | |||
| 159 | /* Sampling frequency in nanoseconds */ | 171 | /* Sampling frequency in nanoseconds */ |
| 160 | static u64 psi_period __read_mostly; | 172 | static u64 psi_period __read_mostly; |
| 161 | 173 | ||
| 162 | /* System-level pressure and stall tracking */ | 174 | /* System-level pressure and stall tracking */ |
| 163 | static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); | 175 | static DEFINE_PER_CPU(struct psi_group_cpu, system_group_pcpu); |
| 164 | static struct psi_group psi_system = { | 176 | struct psi_group psi_system = { |
| 165 | .pcpu = &system_group_pcpu, | 177 | .pcpu = &system_group_pcpu, |
| 166 | }; | 178 | }; |
| 167 | 179 | ||
| 168 | static void psi_update_work(struct work_struct *work); | 180 | static void psi_avgs_work(struct work_struct *work); |
| 169 | 181 | ||
| 170 | static void group_init(struct psi_group *group) | 182 | static void group_init(struct psi_group *group) |
| 171 | { | 183 | { |
| @@ -173,9 +185,20 @@ static void group_init(struct psi_group *group) | |||
| 173 | 185 | ||
| 174 | for_each_possible_cpu(cpu) | 186 | for_each_possible_cpu(cpu) |
| 175 | seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); | 187 | seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq); |
| 176 | group->next_update = sched_clock() + psi_period; | 188 | group->avg_next_update = sched_clock() + psi_period; |
| 177 | INIT_DELAYED_WORK(&group->clock_work, psi_update_work); | 189 | INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work); |
| 178 | mutex_init(&group->stat_lock); | 190 | mutex_init(&group->avgs_lock); |
| 191 | /* Init trigger-related members */ | ||
| 192 | atomic_set(&group->poll_scheduled, 0); | ||
| 193 | mutex_init(&group->trigger_lock); | ||
| 194 | INIT_LIST_HEAD(&group->triggers); | ||
| 195 | memset(group->nr_triggers, 0, sizeof(group->nr_triggers)); | ||
| 196 | group->poll_states = 0; | ||
| 197 | group->poll_min_period = U32_MAX; | ||
| 198 | memset(group->polling_total, 0, sizeof(group->polling_total)); | ||
| 199 | group->polling_next_update = ULLONG_MAX; | ||
| 200 | group->polling_until = 0; | ||
| 201 | rcu_assign_pointer(group->poll_kworker, NULL); | ||
| 179 | } | 202 | } |
| 180 | 203 | ||
| 181 | void __init psi_init(void) | 204 | void __init psi_init(void) |
| @@ -210,20 +233,24 @@ static bool test_state(unsigned int *tasks, enum psi_states state) | |||
| 210 | } | 233 | } |
| 211 | } | 234 | } |
| 212 | 235 | ||
| 213 | static void get_recent_times(struct psi_group *group, int cpu, u32 *times) | 236 | static void get_recent_times(struct psi_group *group, int cpu, |
| 237 | enum psi_aggregators aggregator, u32 *times, | ||
| 238 | u32 *pchanged_states) | ||
| 214 | { | 239 | { |
| 215 | struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); | 240 | struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); |
| 216 | unsigned int tasks[NR_PSI_TASK_COUNTS]; | ||
| 217 | u64 now, state_start; | 241 | u64 now, state_start; |
| 242 | enum psi_states s; | ||
| 218 | unsigned int seq; | 243 | unsigned int seq; |
| 219 | int s; | 244 | u32 state_mask; |
| 245 | |||
| 246 | *pchanged_states = 0; | ||
| 220 | 247 | ||
| 221 | /* Snapshot a coherent view of the CPU state */ | 248 | /* Snapshot a coherent view of the CPU state */ |
| 222 | do { | 249 | do { |
| 223 | seq = read_seqcount_begin(&groupc->seq); | 250 | seq = read_seqcount_begin(&groupc->seq); |
| 224 | now = cpu_clock(cpu); | 251 | now = cpu_clock(cpu); |
| 225 | memcpy(times, groupc->times, sizeof(groupc->times)); | 252 | memcpy(times, groupc->times, sizeof(groupc->times)); |
| 226 | memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); | 253 | state_mask = groupc->state_mask; |
| 227 | state_start = groupc->state_start; | 254 | state_start = groupc->state_start; |
| 228 | } while (read_seqcount_retry(&groupc->seq, seq)); | 255 | } while (read_seqcount_retry(&groupc->seq, seq)); |
| 229 | 256 | ||
| @@ -239,13 +266,15 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times) | |||
| 239 | * (u32) and our reported pressure close to what's | 266 | * (u32) and our reported pressure close to what's |
| 240 | * actually happening. | 267 | * actually happening. |
| 241 | */ | 268 | */ |
| 242 | if (test_state(tasks, s)) | 269 | if (state_mask & (1 << s)) |
| 243 | times[s] += now - state_start; | 270 | times[s] += now - state_start; |
| 244 | 271 | ||
| 245 | delta = times[s] - groupc->times_prev[s]; | 272 | delta = times[s] - groupc->times_prev[aggregator][s]; |
| 246 | groupc->times_prev[s] = times[s]; | 273 | groupc->times_prev[aggregator][s] = times[s]; |
| 247 | 274 | ||
| 248 | times[s] = delta; | 275 | times[s] = delta; |
| 276 | if (delta) | ||
| 277 | *pchanged_states |= (1 << s); | ||
| 249 | } | 278 | } |
| 250 | } | 279 | } |
| 251 | 280 | ||
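get_recent_times() now reads the cached state_mask snapshot instead of re-deriving each state from the task counts, and reports which states actually accumulated time via a changed-states bitmask. A user-space sketch of that delta accounting; all names are local to the sketch:

#include <stdint.h>
#include <stdio.h>

#define NR_STATES 4

int main(void)
{
	uint64_t times[NR_STATES]      = { 100, 250, 0, 400 };  /* running totals  */
	uint64_t times_prev[NR_STATES] = { 100, 200, 0, 350 };  /* last snapshot   */
	uint32_t state_mask = (1u << 1) | (1u << 3);            /* states 1,3 active */
	uint64_t now = 1000, state_start = 940;
	uint32_t changed = 0;

	for (int s = 0; s < NR_STATES; s++) {
		uint64_t t = times[s];

		if (state_mask & (1u << s))      /* still ticking: add partial time */
			t += now - state_start;

		uint64_t delta = t - times_prev[s];
		times_prev[s] = t;
		if (delta)
			changed |= 1u << s;
		printf("state %d delta=%llu\n", s, (unsigned long long)delta);
	}
	printf("changed mask=0x%x\n", changed);
	return 0;
}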
| @@ -269,17 +298,16 @@ static void calc_avgs(unsigned long avg[3], int missed_periods, | |||
| 269 | avg[2] = calc_load(avg[2], EXP_300s, pct); | 298 | avg[2] = calc_load(avg[2], EXP_300s, pct); |
| 270 | } | 299 | } |
| 271 | 300 | ||
| 272 | static bool update_stats(struct psi_group *group) | 301 | static void collect_percpu_times(struct psi_group *group, |
| 302 | enum psi_aggregators aggregator, | ||
| 303 | u32 *pchanged_states) | ||
| 273 | { | 304 | { |
| 274 | u64 deltas[NR_PSI_STATES - 1] = { 0, }; | 305 | u64 deltas[NR_PSI_STATES - 1] = { 0, }; |
| 275 | unsigned long missed_periods = 0; | ||
| 276 | unsigned long nonidle_total = 0; | 306 | unsigned long nonidle_total = 0; |
| 277 | u64 now, expires, period; | 307 | u32 changed_states = 0; |
| 278 | int cpu; | 308 | int cpu; |
| 279 | int s; | 309 | int s; |
| 280 | 310 | ||
| 281 | mutex_lock(&group->stat_lock); | ||
| 282 | |||
| 283 | /* | 311 | /* |
| 284 | * Collect the per-cpu time buckets and average them into a | 312 | * Collect the per-cpu time buckets and average them into a |
| 285 | * single time sample that is normalized to wallclock time. | 313 | * single time sample that is normalized to wallclock time. |
| @@ -291,8 +319,11 @@ static bool update_stats(struct psi_group *group) | |||
| 291 | for_each_possible_cpu(cpu) { | 319 | for_each_possible_cpu(cpu) { |
| 292 | u32 times[NR_PSI_STATES]; | 320 | u32 times[NR_PSI_STATES]; |
| 293 | u32 nonidle; | 321 | u32 nonidle; |
| 322 | u32 cpu_changed_states; | ||
| 294 | 323 | ||
| 295 | get_recent_times(group, cpu, times); | 324 | get_recent_times(group, cpu, aggregator, times, |
| 325 | &cpu_changed_states); | ||
| 326 | changed_states |= cpu_changed_states; | ||
| 296 | 327 | ||
| 297 | nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); | 328 | nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]); |
| 298 | nonidle_total += nonidle; | 329 | nonidle_total += nonidle; |
| @@ -315,13 +346,22 @@ static bool update_stats(struct psi_group *group) | |||
| 315 | 346 | ||
| 316 | /* total= */ | 347 | /* total= */ |
| 317 | for (s = 0; s < NR_PSI_STATES - 1; s++) | 348 | for (s = 0; s < NR_PSI_STATES - 1; s++) |
| 318 | group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL)); | 349 | group->total[aggregator][s] += |
| 350 | div_u64(deltas[s], max(nonidle_total, 1UL)); | ||
| 351 | |||
| 352 | if (pchanged_states) | ||
| 353 | *pchanged_states = changed_states; | ||
| 354 | } | ||
| 355 | |||
| 356 | static u64 update_averages(struct psi_group *group, u64 now) | ||
| 357 | { | ||
| 358 | unsigned long missed_periods = 0; | ||
| 359 | u64 expires, period; | ||
| 360 | u64 avg_next_update; | ||
| 361 | int s; | ||
| 319 | 362 | ||
| 320 | /* avgX= */ | 363 | /* avgX= */ |
| 321 | now = sched_clock(); | 364 | expires = group->avg_next_update; |
| 322 | expires = group->next_update; | ||
| 323 | if (now < expires) | ||
| 324 | goto out; | ||
| 325 | if (now - expires >= psi_period) | 365 | if (now - expires >= psi_period) |
| 326 | missed_periods = div_u64(now - expires, psi_period); | 366 | missed_periods = div_u64(now - expires, psi_period); |
| 327 | 367 | ||
| @@ -332,14 +372,14 @@ static bool update_stats(struct psi_group *group) | |||
| 332 | * But the deltas we sample out of the per-cpu buckets above | 372 | * But the deltas we sample out of the per-cpu buckets above |
| 333 | * are based on the actual time elapsing between clock ticks. | 373 | * are based on the actual time elapsing between clock ticks. |
| 334 | */ | 374 | */ |
| 335 | group->next_update = expires + ((1 + missed_periods) * psi_period); | 375 | avg_next_update = expires + ((1 + missed_periods) * psi_period); |
| 336 | period = now - (group->last_update + (missed_periods * psi_period)); | 376 | period = now - (group->avg_last_update + (missed_periods * psi_period)); |
| 337 | group->last_update = now; | 377 | group->avg_last_update = now; |
| 338 | 378 | ||
| 339 | for (s = 0; s < NR_PSI_STATES - 1; s++) { | 379 | for (s = 0; s < NR_PSI_STATES - 1; s++) { |
| 340 | u32 sample; | 380 | u32 sample; |
| 341 | 381 | ||
| 342 | sample = group->total[s] - group->total_prev[s]; | 382 | sample = group->total[PSI_AVGS][s] - group->avg_total[s]; |
| 343 | /* | 383 | /* |
| 344 | * Due to the lockless sampling of the time buckets, | 384 | * Due to the lockless sampling of the time buckets, |
| 345 | * recorded time deltas can slip into the next period, | 385 | * recorded time deltas can slip into the next period, |
| @@ -359,23 +399,30 @@ static bool update_stats(struct psi_group *group) | |||
| 359 | */ | 399 | */ |
| 360 | if (sample > period) | 400 | if (sample > period) |
| 361 | sample = period; | 401 | sample = period; |
| 362 | group->total_prev[s] += sample; | 402 | group->avg_total[s] += sample; |
| 363 | calc_avgs(group->avg[s], missed_periods, sample, period); | 403 | calc_avgs(group->avg[s], missed_periods, sample, period); |
| 364 | } | 404 | } |
| 365 | out: | 405 | |
| 366 | mutex_unlock(&group->stat_lock); | 406 | return avg_next_update; |
| 367 | return nonidle_total; | ||
| 368 | } | 407 | } |
| 369 | 408 | ||
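update_averages() keeps the catch-up behaviour of the old code: if the averaging work slept through one or more 2s periods, the next update time is pushed past "now" in a single step instead of being rescheduled repeatedly. A worked example with invented timestamps:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t psi_period = 2000000000ULL;  /* 2s in ns */
	uint64_t expires = 10000000000ULL;          /* update was due at 10s */
	uint64_t now     = 17500000000ULL;          /* woke up at 17.5s */
	uint64_t missed_periods = 0;

	if (now - expires >= psi_period)
		missed_periods = (now - expires) / psi_period;  /* 3 */

	uint64_t avg_next_update = expires + (1 + missed_periods) * psi_period;

	printf("missed=%llu, next update at %llu ns\n",
	       (unsigned long long)missed_periods,
	       (unsigned long long)avg_next_update);          /* 18s > now */
	return 0;
}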
| 370 | static void psi_update_work(struct work_struct *work) | 409 | static void psi_avgs_work(struct work_struct *work) |
| 371 | { | 410 | { |
| 372 | struct delayed_work *dwork; | 411 | struct delayed_work *dwork; |
| 373 | struct psi_group *group; | 412 | struct psi_group *group; |
| 413 | u32 changed_states; | ||
| 374 | bool nonidle; | 414 | bool nonidle; |
| 415 | u64 now; | ||
| 375 | 416 | ||
| 376 | dwork = to_delayed_work(work); | 417 | dwork = to_delayed_work(work); |
| 377 | group = container_of(dwork, struct psi_group, clock_work); | 418 | group = container_of(dwork, struct psi_group, avgs_work); |
| 419 | |||
| 420 | mutex_lock(&group->avgs_lock); | ||
| 378 | 421 | ||
| 422 | now = sched_clock(); | ||
| 423 | |||
| 424 | collect_percpu_times(group, PSI_AVGS, &changed_states); | ||
| 425 | nonidle = changed_states & (1 << PSI_NONIDLE); | ||
| 379 | /* | 426 | /* |
| 380 | * If there is task activity, periodically fold the per-cpu | 427 | * If there is task activity, periodically fold the per-cpu |
| 381 | * times and feed samples into the running averages. If things | 428 | * times and feed samples into the running averages. If things |
| @@ -383,18 +430,196 @@ static void psi_update_work(struct work_struct *work) | |||
| 383 | * Once restarted, we'll catch up the running averages in one | 430 | * Once restarted, we'll catch up the running averages in one |
| 384 | * go - see calc_avgs() and missed_periods. | 431 | * go - see calc_avgs() and missed_periods. |
| 385 | */ | 432 | */ |
| 386 | 433 | if (now >= group->avg_next_update) | |
| 387 | nonidle = update_stats(group); | 434 | group->avg_next_update = update_averages(group, now); |
| 388 | 435 | ||
| 389 | if (nonidle) { | 436 | if (nonidle) { |
| 390 | unsigned long delay = 0; | 437 | schedule_delayed_work(dwork, nsecs_to_jiffies( |
| 391 | u64 now; | 438 | group->avg_next_update - now) + 1); |
| 439 | } | ||
| 440 | |||
| 441 | mutex_unlock(&group->avgs_lock); | ||
| 442 | } | ||
| 443 | |||
| 444 | /* Trigger tracking window manipulations */ | ||
| 445 | static void window_reset(struct psi_window *win, u64 now, u64 value, | ||
| 446 | u64 prev_growth) | ||
| 447 | { | ||
| 448 | win->start_time = now; | ||
| 449 | win->start_value = value; | ||
| 450 | win->prev_growth = prev_growth; | ||
| 451 | } | ||
| 452 | |||
| 453 | /* | ||
| 454 | * PSI growth tracking window update and growth calculation routine. | ||
| 455 | * | ||
| 456 | * This approximates a sliding tracking window by interpolating | ||
| 457 | * partially elapsed windows using historical growth data from the | ||
| 458 | * previous intervals. This minimizes memory requirements (by not storing | ||
| 459 | * all the intermediate values in the previous window) and simplifies | ||
| 460 | * the calculations. It works well because PSI signal changes only in | ||
| 461 | * positive direction and over relatively small window sizes the growth | ||
| 462 | * is close to linear. | ||
| 463 | */ | ||
| 464 | static u64 window_update(struct psi_window *win, u64 now, u64 value) | ||
| 465 | { | ||
| 466 | u64 elapsed; | ||
| 467 | u64 growth; | ||
| 468 | |||
| 469 | elapsed = now - win->start_time; | ||
| 470 | growth = value - win->start_value; | ||
| 471 | /* | ||
| 472 | * After each tracking window passes win->start_value and | ||
| 473 | * win->start_time get reset and win->prev_growth stores | ||
| 474 | * the average per-window growth of the previous window. | ||
| 475 | * win->prev_growth is then used to interpolate additional | ||
| 476 | * growth from the previous window assuming it was linear. | ||
| 477 | */ | ||
| 478 | if (elapsed > win->size) | ||
| 479 | window_reset(win, now, value, growth); | ||
| 480 | else { | ||
| 481 | u32 remaining; | ||
| 482 | |||
| 483 | remaining = win->size - elapsed; | ||
| 484 | growth += div_u64(win->prev_growth * remaining, win->size); | ||
| 485 | } | ||
| 486 | |||
| 487 | return growth; | ||
| 488 | } | ||
| 489 | |||
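A stand-alone illustration of the interpolation in window_update(): growth inside a partially elapsed window is the growth measured so far plus a linear share of the previous window's growth for the remaining time (the kernel additionally resets the window once it has fully elapsed, which this sketch omits). Numbers are invented:

#include <stdint.h>
#include <stdio.h>

struct win { uint64_t start_time, start_value, prev_growth, size; };

static uint64_t window_growth(struct win *w, uint64_t now, uint64_t value)
{
	uint64_t elapsed = now - w->start_time;
	uint64_t growth  = value - w->start_value;

	if (elapsed <= w->size) {
		/* interpolate the unfinished part from the previous window */
		uint64_t remaining = w->size - elapsed;

		growth += w->prev_growth * remaining / w->size;
	}
	return growth;
}

int main(void)
{
	/* 1s window; the previous window saw 200ms of stall */
	struct win w = { .start_time = 0, .start_value = 0,
			 .prev_growth = 200, .size = 1000 };

	/* 600ms in, 150ms of new stall: 150 + 200 * 400/1000 = 230ms */
	printf("growth=%llu\n",
	       (unsigned long long)window_growth(&w, 600, 150));
	return 0;
}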
| 490 | static void init_triggers(struct psi_group *group, u64 now) | ||
| 491 | { | ||
| 492 | struct psi_trigger *t; | ||
| 493 | |||
| 494 | list_for_each_entry(t, &group->triggers, node) | ||
| 495 | window_reset(&t->win, now, | ||
| 496 | group->total[PSI_POLL][t->state], 0); | ||
| 497 | memcpy(group->polling_total, group->total[PSI_POLL], | ||
| 498 | sizeof(group->polling_total)); | ||
| 499 | group->polling_next_update = now + group->poll_min_period; | ||
| 500 | } | ||
| 501 | |||
| 502 | static u64 update_triggers(struct psi_group *group, u64 now) | ||
| 503 | { | ||
| 504 | struct psi_trigger *t; | ||
| 505 | bool new_stall = false; | ||
| 506 | u64 *total = group->total[PSI_POLL]; | ||
| 507 | |||
| 508 | /* | ||
| 509 | * On subsequent updates, calculate growth deltas and let | ||
| 510 | * watchers know when their specified thresholds are exceeded. | ||
| 511 | */ | ||
| 512 | list_for_each_entry(t, &group->triggers, node) { | ||
| 513 | u64 growth; | ||
| 514 | |||
| 515 | /* Check for stall activity */ | ||
| 516 | if (group->polling_total[t->state] == total[t->state]) | ||
| 517 | continue; | ||
| 518 | |||
| 519 | /* | ||
| 520 | * Multiple triggers might be looking at the same state, | ||
| 521 | * remember to update group->polling_total[] once we've | ||
| 522 | * been through all of them. Also remember to extend the | ||
| 523 | * polling time if we see new stall activity. | ||
| 524 | */ | ||
| 525 | new_stall = true; | ||
| 526 | |||
| 527 | /* Calculate growth since last update */ | ||
| 528 | growth = window_update(&t->win, now, total[t->state]); | ||
| 529 | if (growth < t->threshold) | ||
| 530 | continue; | ||
| 531 | |||
| 532 | /* Limit event signaling to once per window */ | ||
| 533 | if (now < t->last_event_time + t->win.size) | ||
| 534 | continue; | ||
| 535 | |||
| 536 | /* Generate an event */ | ||
| 537 | if (cmpxchg(&t->event, 0, 1) == 0) | ||
| 538 | wake_up_interruptible(&t->event_wait); | ||
| 539 | t->last_event_time = now; | ||
| 540 | } | ||
| 541 | |||
| 542 | if (new_stall) | ||
| 543 | memcpy(group->polling_total, total, | ||
| 544 | sizeof(group->polling_total)); | ||
| 545 | |||
| 546 | return now + group->poll_min_period; | ||
| 547 | } | ||
| 548 | |||
| 549 | /* | ||
| 550 | * Schedule polling if it's not already scheduled. It's safe to call even from | ||
| 551 | * hotpath because even though kthread_queue_delayed_work takes worker->lock | ||
| 552 | * spinlock that spinlock is never contended due to poll_scheduled atomic | ||
| 553 | * preventing such competition. | ||
| 554 | */ | ||
| 555 | static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay) | ||
| 556 | { | ||
| 557 | struct kthread_worker *kworker; | ||
| 558 | |||
| 559 | /* Do not reschedule if already scheduled */ | ||
| 560 | if (atomic_cmpxchg(&group->poll_scheduled, 0, 1) != 0) | ||
| 561 | return; | ||
| 562 | |||
| 563 | rcu_read_lock(); | ||
| 392 | 564 | ||
| 393 | now = sched_clock(); | 565 | kworker = rcu_dereference(group->poll_kworker); |
| 394 | if (group->next_update > now) | 566 | /* |
| 395 | delay = nsecs_to_jiffies(group->next_update - now) + 1; | 567 | * kworker might be NULL in case psi_trigger_destroy races with |
| 396 | schedule_delayed_work(dwork, delay); | 568 | * psi_task_change (hotpath) which can't use locks |
| 569 | */ | ||
| 570 | if (likely(kworker)) | ||
| 571 | kthread_queue_delayed_work(kworker, &group->poll_work, delay); | ||
| 572 | else | ||
| 573 | atomic_set(&group->poll_scheduled, 0); | ||
| 574 | |||
| 575 | rcu_read_unlock(); | ||
| 576 | } | ||
| 577 | |||
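psi_schedule_poll_work() relies on a schedule-at-most-once guard: the poll_scheduled flag is flipped 0 -> 1 with a compare-and-swap before queueing, and reset by the worker itself, so concurrent hot-path callers never contend on the kworker's lock. A minimal sketch of the same pattern in plain C11 atomics (not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int poll_scheduled;

static bool schedule_poll(void)
{
	int expected = 0;

	/* only the caller that wins the 0 -> 1 transition queues the work */
	if (!atomic_compare_exchange_strong(&poll_scheduled, &expected, 1))
		return false;

	printf("queued polling work\n");
	return true;
}

static void poll_worker(void)
{
	/* the worker re-arms the flag first, mirroring psi_poll_work() */
	atomic_store(&poll_scheduled, 0);
	printf("polling...\n");
}

int main(void)
{
	schedule_poll();   /* queues */
	schedule_poll();   /* no-op: already scheduled */
	poll_worker();
	schedule_poll();   /* queues again */
	return 0;
}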
| 578 | static void psi_poll_work(struct kthread_work *work) | ||
| 579 | { | ||
| 580 | struct kthread_delayed_work *dwork; | ||
| 581 | struct psi_group *group; | ||
| 582 | u32 changed_states; | ||
| 583 | u64 now; | ||
| 584 | |||
| 585 | dwork = container_of(work, struct kthread_delayed_work, work); | ||
| 586 | group = container_of(dwork, struct psi_group, poll_work); | ||
| 587 | |||
| 588 | atomic_set(&group->poll_scheduled, 0); | ||
| 589 | |||
| 590 | mutex_lock(&group->trigger_lock); | ||
| 591 | |||
| 592 | now = sched_clock(); | ||
| 593 | |||
| 594 | collect_percpu_times(group, PSI_POLL, &changed_states); | ||
| 595 | |||
| 596 | if (changed_states & group->poll_states) { | ||
| 597 | /* Initialize trigger windows when entering polling mode */ | ||
| 598 | if (now > group->polling_until) | ||
| 599 | init_triggers(group, now); | ||
| 600 | |||
| 601 | /* | ||
| 602 | * Keep the monitor active for at least the duration of the | ||
| 603 | * minimum tracking window as long as monitor states are | ||
| 604 | * changing. | ||
| 605 | */ | ||
| 606 | group->polling_until = now + | ||
| 607 | group->poll_min_period * UPDATES_PER_WINDOW; | ||
| 608 | } | ||
| 609 | |||
| 610 | if (now > group->polling_until) { | ||
| 611 | group->polling_next_update = ULLONG_MAX; | ||
| 612 | goto out; | ||
| 397 | } | 613 | } |
| 614 | |||
| 615 | if (now >= group->polling_next_update) | ||
| 616 | group->polling_next_update = update_triggers(group, now); | ||
| 617 | |||
| 618 | psi_schedule_poll_work(group, | ||
| 619 | nsecs_to_jiffies(group->polling_next_update - now) + 1); | ||
| 620 | |||
| 621 | out: | ||
| 622 | mutex_unlock(&group->trigger_lock); | ||
| 398 | } | 623 | } |
| 399 | 624 | ||
| 400 | static void record_times(struct psi_group_cpu *groupc, int cpu, | 625 | static void record_times(struct psi_group_cpu *groupc, int cpu, |
| @@ -407,15 +632,15 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, | |||
| 407 | delta = now - groupc->state_start; | 632 | delta = now - groupc->state_start; |
| 408 | groupc->state_start = now; | 633 | groupc->state_start = now; |
| 409 | 634 | ||
| 410 | if (test_state(groupc->tasks, PSI_IO_SOME)) { | 635 | if (groupc->state_mask & (1 << PSI_IO_SOME)) { |
| 411 | groupc->times[PSI_IO_SOME] += delta; | 636 | groupc->times[PSI_IO_SOME] += delta; |
| 412 | if (test_state(groupc->tasks, PSI_IO_FULL)) | 637 | if (groupc->state_mask & (1 << PSI_IO_FULL)) |
| 413 | groupc->times[PSI_IO_FULL] += delta; | 638 | groupc->times[PSI_IO_FULL] += delta; |
| 414 | } | 639 | } |
| 415 | 640 | ||
| 416 | if (test_state(groupc->tasks, PSI_MEM_SOME)) { | 641 | if (groupc->state_mask & (1 << PSI_MEM_SOME)) { |
| 417 | groupc->times[PSI_MEM_SOME] += delta; | 642 | groupc->times[PSI_MEM_SOME] += delta; |
| 418 | if (test_state(groupc->tasks, PSI_MEM_FULL)) | 643 | if (groupc->state_mask & (1 << PSI_MEM_FULL)) |
| 419 | groupc->times[PSI_MEM_FULL] += delta; | 644 | groupc->times[PSI_MEM_FULL] += delta; |
| 420 | else if (memstall_tick) { | 645 | else if (memstall_tick) { |
| 421 | u32 sample; | 646 | u32 sample; |
| @@ -436,18 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, | |||
| 436 | } | 661 | } |
| 437 | } | 662 | } |
| 438 | 663 | ||
| 439 | if (test_state(groupc->tasks, PSI_CPU_SOME)) | 664 | if (groupc->state_mask & (1 << PSI_CPU_SOME)) |
| 440 | groupc->times[PSI_CPU_SOME] += delta; | 665 | groupc->times[PSI_CPU_SOME] += delta; |
| 441 | 666 | ||
| 442 | if (test_state(groupc->tasks, PSI_NONIDLE)) | 667 | if (groupc->state_mask & (1 << PSI_NONIDLE)) |
| 443 | groupc->times[PSI_NONIDLE] += delta; | 668 | groupc->times[PSI_NONIDLE] += delta; |
| 444 | } | 669 | } |
| 445 | 670 | ||
| 446 | static void psi_group_change(struct psi_group *group, int cpu, | 671 | static u32 psi_group_change(struct psi_group *group, int cpu, |
| 447 | unsigned int clear, unsigned int set) | 672 | unsigned int clear, unsigned int set) |
| 448 | { | 673 | { |
| 449 | struct psi_group_cpu *groupc; | 674 | struct psi_group_cpu *groupc; |
| 450 | unsigned int t, m; | 675 | unsigned int t, m; |
| 676 | enum psi_states s; | ||
| 677 | u32 state_mask = 0; | ||
| 451 | 678 | ||
| 452 | groupc = per_cpu_ptr(group->pcpu, cpu); | 679 | groupc = per_cpu_ptr(group->pcpu, cpu); |
| 453 | 680 | ||
| @@ -480,7 +707,16 @@ static void psi_group_change(struct psi_group *group, int cpu, | |||
| 480 | if (set & (1 << t)) | 707 | if (set & (1 << t)) |
| 481 | groupc->tasks[t]++; | 708 | groupc->tasks[t]++; |
| 482 | 709 | ||
| 710 | /* Calculate state mask representing active states */ | ||
| 711 | for (s = 0; s < NR_PSI_STATES; s++) { | ||
| 712 | if (test_state(groupc->tasks, s)) | ||
| 713 | state_mask |= (1 << s); | ||
| 714 | } | ||
| 715 | groupc->state_mask = state_mask; | ||
| 716 | |||
| 483 | write_seqcount_end(&groupc->seq); | 717 | write_seqcount_end(&groupc->seq); |
| 718 | |||
| 719 | return state_mask; | ||
| 484 | } | 720 | } |
| 485 | 721 | ||
| 486 | static struct psi_group *iterate_groups(struct task_struct *task, void **iter) | 722 | static struct psi_group *iterate_groups(struct task_struct *task, void **iter) |
| @@ -537,13 +773,17 @@ void psi_task_change(struct task_struct *task, int clear, int set) | |||
| 537 | */ | 773 | */ |
| 538 | if (unlikely((clear & TSK_RUNNING) && | 774 | if (unlikely((clear & TSK_RUNNING) && |
| 539 | (task->flags & PF_WQ_WORKER) && | 775 | (task->flags & PF_WQ_WORKER) && |
| 540 | wq_worker_last_func(task) == psi_update_work)) | 776 | wq_worker_last_func(task) == psi_avgs_work)) |
| 541 | wake_clock = false; | 777 | wake_clock = false; |
| 542 | 778 | ||
| 543 | while ((group = iterate_groups(task, &iter))) { | 779 | while ((group = iterate_groups(task, &iter))) { |
| 544 | psi_group_change(group, cpu, clear, set); | 780 | u32 state_mask = psi_group_change(group, cpu, clear, set); |
| 545 | if (wake_clock && !delayed_work_pending(&group->clock_work)) | 781 | |
| 546 | schedule_delayed_work(&group->clock_work, PSI_FREQ); | 782 | if (state_mask & group->poll_states) |
| 783 | psi_schedule_poll_work(group, 1); | ||
| 784 | |||
| 785 | if (wake_clock && !delayed_work_pending(&group->avgs_work)) | ||
| 786 | schedule_delayed_work(&group->avgs_work, PSI_FREQ); | ||
| 547 | } | 787 | } |
| 548 | } | 788 | } |
| 549 | 789 | ||
| @@ -640,8 +880,10 @@ void psi_cgroup_free(struct cgroup *cgroup) | |||
| 640 | if (static_branch_likely(&psi_disabled)) | 880 | if (static_branch_likely(&psi_disabled)) |
| 641 | return; | 881 | return; |
| 642 | 882 | ||
| 643 | cancel_delayed_work_sync(&cgroup->psi.clock_work); | 883 | cancel_delayed_work_sync(&cgroup->psi.avgs_work); |
| 644 | free_percpu(cgroup->psi.pcpu); | 884 | free_percpu(cgroup->psi.pcpu); |
| 885 | /* All triggers must be removed by now */ | ||
| 886 | WARN_ONCE(cgroup->psi.poll_states, "psi: trigger leak\n"); | ||
| 645 | } | 887 | } |
| 646 | 888 | ||
| 647 | /** | 889 | /** |
| @@ -697,11 +939,18 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) | |||
| 697 | int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) | 939 | int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) |
| 698 | { | 940 | { |
| 699 | int full; | 941 | int full; |
| 942 | u64 now; | ||
| 700 | 943 | ||
| 701 | if (static_branch_likely(&psi_disabled)) | 944 | if (static_branch_likely(&psi_disabled)) |
| 702 | return -EOPNOTSUPP; | 945 | return -EOPNOTSUPP; |
| 703 | 946 | ||
| 704 | update_stats(group); | 947 | /* Update averages before reporting them */ |
| 948 | mutex_lock(&group->avgs_lock); | ||
| 949 | now = sched_clock(); | ||
| 950 | collect_percpu_times(group, PSI_AVGS, NULL); | ||
| 951 | if (now >= group->avg_next_update) | ||
| 952 | group->avg_next_update = update_averages(group, now); | ||
| 953 | mutex_unlock(&group->avgs_lock); | ||
| 705 | 954 | ||
| 706 | for (full = 0; full < 2 - (res == PSI_CPU); full++) { | 955 | for (full = 0; full < 2 - (res == PSI_CPU); full++) { |
| 707 | unsigned long avg[3]; | 956 | unsigned long avg[3]; |
| @@ -710,7 +959,8 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) | |||
| 710 | 959 | ||
| 711 | for (w = 0; w < 3; w++) | 960 | for (w = 0; w < 3; w++) |
| 712 | avg[w] = group->avg[res * 2 + full][w]; | 961 | avg[w] = group->avg[res * 2 + full][w]; |
| 713 | total = div_u64(group->total[res * 2 + full], NSEC_PER_USEC); | 962 | total = div_u64(group->total[PSI_AVGS][res * 2 + full], |
| 963 | NSEC_PER_USEC); | ||
| 714 | 964 | ||
| 715 | seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", | 965 | seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", |
| 716 | full ? "full" : "some", | 966 | full ? "full" : "some", |
| @@ -753,25 +1003,270 @@ static int psi_cpu_open(struct inode *inode, struct file *file) | |||
| 753 | return single_open(file, psi_cpu_show, NULL); | 1003 | return single_open(file, psi_cpu_show, NULL); |
| 754 | } | 1004 | } |
| 755 | 1005 | ||
| 1006 | struct psi_trigger *psi_trigger_create(struct psi_group *group, | ||
| 1007 | char *buf, size_t nbytes, enum psi_res res) | ||
| 1008 | { | ||
| 1009 | struct psi_trigger *t; | ||
| 1010 | enum psi_states state; | ||
| 1011 | u32 threshold_us; | ||
| 1012 | u32 window_us; | ||
| 1013 | |||
| 1014 | if (static_branch_likely(&psi_disabled)) | ||
| 1015 | return ERR_PTR(-EOPNOTSUPP); | ||
| 1016 | |||
| 1017 | if (sscanf(buf, "some %u %u", &threshold_us, &window_us) == 2) | ||
| 1018 | state = PSI_IO_SOME + res * 2; | ||
| 1019 | else if (sscanf(buf, "full %u %u", &threshold_us, &window_us) == 2) | ||
| 1020 | state = PSI_IO_FULL + res * 2; | ||
| 1021 | else | ||
| 1022 | return ERR_PTR(-EINVAL); | ||
| 1023 | |||
| 1024 | if (state >= PSI_NONIDLE) | ||
| 1025 | return ERR_PTR(-EINVAL); | ||
| 1026 | |||
| 1027 | if (window_us < WINDOW_MIN_US || | ||
| 1028 | window_us > WINDOW_MAX_US) | ||
| 1029 | return ERR_PTR(-EINVAL); | ||
| 1030 | |||
| 1031 | /* Check threshold */ | ||
| 1032 | if (threshold_us == 0 || threshold_us > window_us) | ||
| 1033 | return ERR_PTR(-EINVAL); | ||
| 1034 | |||
| 1035 | t = kmalloc(sizeof(*t), GFP_KERNEL); | ||
| 1036 | if (!t) | ||
| 1037 | return ERR_PTR(-ENOMEM); | ||
| 1038 | |||
| 1039 | t->group = group; | ||
| 1040 | t->state = state; | ||
| 1041 | t->threshold = threshold_us * NSEC_PER_USEC; | ||
| 1042 | t->win.size = window_us * NSEC_PER_USEC; | ||
| 1043 | window_reset(&t->win, 0, 0, 0); | ||
| 1044 | |||
| 1045 | t->event = 0; | ||
| 1046 | t->last_event_time = 0; | ||
| 1047 | init_waitqueue_head(&t->event_wait); | ||
| 1048 | kref_init(&t->refcount); | ||
| 1049 | |||
| 1050 | mutex_lock(&group->trigger_lock); | ||
| 1051 | |||
| 1052 | if (!rcu_access_pointer(group->poll_kworker)) { | ||
| 1053 | struct sched_param param = { | ||
| 1054 | .sched_priority = MAX_RT_PRIO - 1, | ||
| 1055 | }; | ||
| 1056 | struct kthread_worker *kworker; | ||
| 1057 | |||
| 1058 | kworker = kthread_create_worker(0, "psimon"); | ||
| 1059 | if (IS_ERR(kworker)) { | ||
| 1060 | kfree(t); | ||
| 1061 | mutex_unlock(&group->trigger_lock); | ||
| 1062 | return ERR_CAST(kworker); | ||
| 1063 | } | ||
| 1064 | sched_setscheduler(kworker->task, SCHED_FIFO, ¶m); | ||
| 1065 | kthread_init_delayed_work(&group->poll_work, | ||
| 1066 | psi_poll_work); | ||
| 1067 | rcu_assign_pointer(group->poll_kworker, kworker); | ||
| 1068 | } | ||
| 1069 | |||
| 1070 | list_add(&t->node, &group->triggers); | ||
| 1071 | group->poll_min_period = min(group->poll_min_period, | ||
| 1072 | div_u64(t->win.size, UPDATES_PER_WINDOW)); | ||
| 1073 | group->nr_triggers[t->state]++; | ||
| 1074 | group->poll_states |= (1 << t->state); | ||
| 1075 | |||
| 1076 | mutex_unlock(&group->trigger_lock); | ||
| 1077 | |||
| 1078 | return t; | ||
| 1079 | } | ||
| 1080 | |||
| 1081 | static void psi_trigger_destroy(struct kref *ref) | ||
| 1082 | { | ||
| 1083 | struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount); | ||
| 1084 | struct psi_group *group = t->group; | ||
| 1085 | struct kthread_worker *kworker_to_destroy = NULL; | ||
| 1086 | |||
| 1087 | if (static_branch_likely(&psi_disabled)) | ||
| 1088 | return; | ||
| 1089 | |||
| 1090 | /* | ||
| 1091 | * Wakeup waiters to stop polling. Can happen if cgroup is deleted | ||
| 1092 | * from under a polling process. | ||
| 1093 | */ | ||
| 1094 | wake_up_interruptible(&t->event_wait); | ||
| 1095 | |||
| 1096 | mutex_lock(&group->trigger_lock); | ||
| 1097 | |||
| 1098 | if (!list_empty(&t->node)) { | ||
| 1099 | struct psi_trigger *tmp; | ||
| 1100 | u64 period = ULLONG_MAX; | ||
| 1101 | |||
| 1102 | list_del(&t->node); | ||
| 1103 | group->nr_triggers[t->state]--; | ||
| 1104 | if (!group->nr_triggers[t->state]) | ||
| 1105 | group->poll_states &= ~(1 << t->state); | ||
| 1106 | /* reset min update period for the remaining triggers */ | ||
| 1107 | list_for_each_entry(tmp, &group->triggers, node) | ||
| 1108 | period = min(period, div_u64(tmp->win.size, | ||
| 1109 | UPDATES_PER_WINDOW)); | ||
| 1110 | group->poll_min_period = period; | ||
| 1111 | /* Destroy poll_kworker when the last trigger is destroyed */ | ||
| 1112 | if (group->poll_states == 0) { | ||
| 1113 | group->polling_until = 0; | ||
| 1114 | kworker_to_destroy = rcu_dereference_protected( | ||
| 1115 | group->poll_kworker, | ||
| 1116 | lockdep_is_held(&group->trigger_lock)); | ||
| 1117 | rcu_assign_pointer(group->poll_kworker, NULL); | ||
| 1118 | } | ||
| 1119 | } | ||
| 1120 | |||
| 1121 | mutex_unlock(&group->trigger_lock); | ||
| 1122 | |||
| 1123 | /* | ||
| 1124 | * Wait for both *trigger_ptr from psi_trigger_replace and | ||
| 1125 | * poll_kworker RCUs to complete their read-side critical sections | ||
| 1126 | * before destroying the trigger and optionally the poll_kworker | ||
| 1127 | */ | ||
| 1128 | synchronize_rcu(); | ||
| 1129 | /* | ||
| 1130 | * Destroy the kworker after releasing trigger_lock to prevent a | ||
| 1131 | * deadlock while waiting for psi_poll_work to acquire trigger_lock | ||
| 1132 | */ | ||
| 1133 | if (kworker_to_destroy) { | ||
| 1134 | kthread_cancel_delayed_work_sync(&group->poll_work); | ||
| 1135 | kthread_destroy_worker(kworker_to_destroy); | ||
| 1136 | } | ||
| 1137 | kfree(t); | ||
| 1138 | } | ||
| 1139 | |||
| 1140 | void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new) | ||
| 1141 | { | ||
| 1142 | struct psi_trigger *old = *trigger_ptr; | ||
| 1143 | |||
| 1144 | if (static_branch_likely(&psi_disabled)) | ||
| 1145 | return; | ||
| 1146 | |||
| 1147 | rcu_assign_pointer(*trigger_ptr, new); | ||
| 1148 | if (old) | ||
| 1149 | kref_put(&old->refcount, psi_trigger_destroy); | ||
| 1150 | } | ||
| 1151 | |||
| 1152 | __poll_t psi_trigger_poll(void **trigger_ptr, | ||
| 1153 | struct file *file, poll_table *wait) | ||
| 1154 | { | ||
| 1155 | __poll_t ret = DEFAULT_POLLMASK; | ||
| 1156 | struct psi_trigger *t; | ||
| 1157 | |||
| 1158 | if (static_branch_likely(&psi_disabled)) | ||
| 1159 | return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; | ||
| 1160 | |||
| 1161 | rcu_read_lock(); | ||
| 1162 | |||
| 1163 | t = rcu_dereference(*(void __rcu __force **)trigger_ptr); | ||
| 1164 | if (!t) { | ||
| 1165 | rcu_read_unlock(); | ||
| 1166 | return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; | ||
| 1167 | } | ||
| 1168 | kref_get(&t->refcount); | ||
| 1169 | |||
| 1170 | rcu_read_unlock(); | ||
| 1171 | |||
| 1172 | poll_wait(file, &t->event_wait, wait); | ||
| 1173 | |||
| 1174 | if (cmpxchg(&t->event, 1, 0) == 1) | ||
| 1175 | ret |= EPOLLPRI; | ||
| 1176 | |||
| 1177 | kref_put(&t->refcount, psi_trigger_destroy); | ||
| 1178 | |||
| 1179 | return ret; | ||
| 1180 | } | ||
| 1181 | |||
| 1182 | static ssize_t psi_write(struct file *file, const char __user *user_buf, | ||
| 1183 | size_t nbytes, enum psi_res res) | ||
| 1184 | { | ||
| 1185 | char buf[32]; | ||
| 1186 | size_t buf_size; | ||
| 1187 | struct seq_file *seq; | ||
| 1188 | struct psi_trigger *new; | ||
| 1189 | |||
| 1190 | if (static_branch_likely(&psi_disabled)) | ||
| 1191 | return -EOPNOTSUPP; | ||
| 1192 | |||
| 1193 | buf_size = min(nbytes, (sizeof(buf) - 1)); | ||
| 1194 | if (copy_from_user(buf, user_buf, buf_size)) | ||
| 1195 | return -EFAULT; | ||
| 1196 | |||
| 1197 | buf[buf_size - 1] = '\0'; | ||
| 1198 | |||
| 1199 | new = psi_trigger_create(&psi_system, buf, nbytes, res); | ||
| 1200 | if (IS_ERR(new)) | ||
| 1201 | return PTR_ERR(new); | ||
| 1202 | |||
| 1203 | seq = file->private_data; | ||
| 1204 | /* Take seq->lock to protect seq->private from concurrent writes */ | ||
| 1205 | mutex_lock(&seq->lock); | ||
| 1206 | psi_trigger_replace(&seq->private, new); | ||
| 1207 | mutex_unlock(&seq->lock); | ||
| 1208 | |||
| 1209 | return nbytes; | ||
| 1210 | } | ||
| 1211 | |||
| 1212 | static ssize_t psi_io_write(struct file *file, const char __user *user_buf, | ||
| 1213 | size_t nbytes, loff_t *ppos) | ||
| 1214 | { | ||
| 1215 | return psi_write(file, user_buf, nbytes, PSI_IO); | ||
| 1216 | } | ||
| 1217 | |||
| 1218 | static ssize_t psi_memory_write(struct file *file, const char __user *user_buf, | ||
| 1219 | size_t nbytes, loff_t *ppos) | ||
| 1220 | { | ||
| 1221 | return psi_write(file, user_buf, nbytes, PSI_MEM); | ||
| 1222 | } | ||
| 1223 | |||
| 1224 | static ssize_t psi_cpu_write(struct file *file, const char __user *user_buf, | ||
| 1225 | size_t nbytes, loff_t *ppos) | ||
| 1226 | { | ||
| 1227 | return psi_write(file, user_buf, nbytes, PSI_CPU); | ||
| 1228 | } | ||
| 1229 | |||
| 1230 | static __poll_t psi_fop_poll(struct file *file, poll_table *wait) | ||
| 1231 | { | ||
| 1232 | struct seq_file *seq = file->private_data; | ||
| 1233 | |||
| 1234 | return psi_trigger_poll(&seq->private, file, wait); | ||
| 1235 | } | ||
| 1236 | |||
| 1237 | static int psi_fop_release(struct inode *inode, struct file *file) | ||
| 1238 | { | ||
| 1239 | struct seq_file *seq = file->private_data; | ||
| 1240 | |||
| 1241 | psi_trigger_replace(&seq->private, NULL); | ||
| 1242 | return single_release(inode, file); | ||
| 1243 | } | ||
| 1244 | |||
| 756 | static const struct file_operations psi_io_fops = { | 1245 | static const struct file_operations psi_io_fops = { |
| 757 | .open = psi_io_open, | 1246 | .open = psi_io_open, |
| 758 | .read = seq_read, | 1247 | .read = seq_read, |
| 759 | .llseek = seq_lseek, | 1248 | .llseek = seq_lseek, |
| 760 | .release = single_release, | 1249 | .write = psi_io_write, |
| 1250 | .poll = psi_fop_poll, | ||
| 1251 | .release = psi_fop_release, | ||
| 761 | }; | 1252 | }; |
| 762 | 1253 | ||
| 763 | static const struct file_operations psi_memory_fops = { | 1254 | static const struct file_operations psi_memory_fops = { |
| 764 | .open = psi_memory_open, | 1255 | .open = psi_memory_open, |
| 765 | .read = seq_read, | 1256 | .read = seq_read, |
| 766 | .llseek = seq_lseek, | 1257 | .llseek = seq_lseek, |
| 767 | .release = single_release, | 1258 | .write = psi_memory_write, |
| 1259 | .poll = psi_fop_poll, | ||
| 1260 | .release = psi_fop_release, | ||
| 768 | }; | 1261 | }; |
| 769 | 1262 | ||
| 770 | static const struct file_operations psi_cpu_fops = { | 1263 | static const struct file_operations psi_cpu_fops = { |
| 771 | .open = psi_cpu_open, | 1264 | .open = psi_cpu_open, |
| 772 | .read = seq_read, | 1265 | .read = seq_read, |
| 773 | .llseek = seq_lseek, | 1266 | .llseek = seq_lseek, |
| 774 | .release = single_release, | 1267 | .write = psi_cpu_write, |
| 1268 | .poll = psi_fop_poll, | ||
| 1269 | .release = psi_fop_release, | ||
| 775 | }; | 1270 | }; |
| 776 | 1271 | ||
| 777 | static int __init psi_proc_init(void) | 1272 | static int __init psi_proc_init(void) |
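The psi.c hunks above add a userspace ABI on the /proc/pressure/{io,memory,cpu} files registered by psi_proc_init(): writing "some <threshold_us> <window_us>" (or "full ...") installs a trigger on that open file, and poll() reports POLLPRI once the accumulated stall within any window exceeds the threshold. Below is a minimal userspace sketch of that usage; the 150ms/1s numbers are illustrative only, and the trailing NUL is written deliberately because psi_write() terminates the buffer at buf_size - 1.

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char trig[] = "some 150000 1000000";      /* threshold_us window_us */
        struct pollfd fds;
        int fd;

        fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK);
        if (fd < 0) {
                perror("open /proc/pressure/memory");
                return 1;
        }
        /* Write the NUL as well so psi_write()'s terminator does not eat a digit. */
        if (write(fd, trig, strlen(trig) + 1) < 0) {
                perror("write trigger");
                return 1;
        }

        fds.fd = fd;
        fds.events = POLLPRI;

        for (;;) {
                if (poll(&fds, 1, -1) < 0) {
                        perror("poll");
                        break;
                }
                if (fds.revents & POLLERR) {
                        fprintf(stderr, "trigger is gone\n");
                        break;
                }
                if (fds.revents & POLLPRI)
                        printf("memory pressure event\n");
        }
        close(fd);
        return 0;
}

POLLERR here corresponds to the EPOLLERR | EPOLLPRI path in psi_trigger_poll() when no trigger is attached to the file any more.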
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1e6b909dca36..a532558a5176 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
| 1614 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | 1614 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
| 1615 | { | 1615 | { |
| 1616 | if (!task_running(rq, p) && | 1616 | if (!task_running(rq, p) && |
| 1617 | cpumask_test_cpu(cpu, &p->cpus_allowed)) | 1617 | cpumask_test_cpu(cpu, p->cpus_ptr)) |
| 1618 | return 1; | 1618 | return 1; |
| 1619 | 1619 | ||
| 1620 | return 0; | 1620 | return 0; |
| @@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
| 1751 | * Also make sure that it wasn't scheduled on its rq. | 1751 | * Also make sure that it wasn't scheduled on its rq. |
| 1752 | */ | 1752 | */ |
| 1753 | if (unlikely(task_rq(task) != rq || | 1753 | if (unlikely(task_rq(task) != rq || |
| 1754 | !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || | 1754 | !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || |
| 1755 | task_running(rq, task) || | 1755 | task_running(rq, task) || |
| 1756 | !rt_task(task) || | 1756 | !rt_task(task) || |
| 1757 | !task_on_rq_queued(task))) { | 1757 | !task_on_rq_queued(task))) { |
| @@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = { | |||
| 2400 | .switched_to = switched_to_rt, | 2400 | .switched_to = switched_to_rt, |
| 2401 | 2401 | ||
| 2402 | .update_curr = update_curr_rt, | 2402 | .update_curr = update_curr_rt, |
| 2403 | |||
| 2404 | #ifdef CONFIG_UCLAMP_TASK | ||
| 2405 | .uclamp_enabled = 1, | ||
| 2406 | #endif | ||
| 2403 | }; | 2407 | }; |
| 2404 | 2408 | ||
| 2405 | #ifdef CONFIG_RT_GROUP_SCHED | 2409 | #ifdef CONFIG_RT_GROUP_SCHED |
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index a26473674fb7..c529706bed11 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h | |||
| @@ -1,7 +1,7 @@ | |||
| 1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
| 2 | /* Generated by Documentation/scheduler/sched-pelt; do not modify. */ | 2 | /* Generated by Documentation/scheduler/sched-pelt; do not modify. */ |
| 3 | 3 | ||
| 4 | static const u32 runnable_avg_yN_inv[] = { | 4 | static const u32 runnable_avg_yN_inv[] __maybe_unused = { |
| 5 | 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, | 5 | 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, |
| 6 | 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, | 6 | 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, |
| 7 | 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, | 7 | 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b52ed1ada0be..802b1f3405f2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks; | |||
| 96 | extern void calc_global_load_tick(struct rq *this_rq); | 96 | extern void calc_global_load_tick(struct rq *this_rq); |
| 97 | extern long calc_load_fold_active(struct rq *this_rq, long adjust); | 97 | extern long calc_load_fold_active(struct rq *this_rq, long adjust); |
| 98 | 98 | ||
| 99 | #ifdef CONFIG_SMP | ||
| 100 | extern void cpu_load_update_active(struct rq *this_rq); | ||
| 101 | #else | ||
| 102 | static inline void cpu_load_update_active(struct rq *this_rq) { } | ||
| 103 | #endif | ||
| 104 | |||
| 105 | /* | 99 | /* |
| 106 | * Helpers for converting nanosecond timing to jiffy resolution | 100 | * Helpers for converting nanosecond timing to jiffy resolution |
| 107 | */ | 101 | */ |
| @@ -344,8 +338,10 @@ struct cfs_bandwidth { | |||
| 344 | u64 runtime_expires; | 338 | u64 runtime_expires; |
| 345 | int expires_seq; | 339 | int expires_seq; |
| 346 | 340 | ||
| 347 | short idle; | 341 | u8 idle; |
| 348 | short period_active; | 342 | u8 period_active; |
| 343 | u8 distribute_running; | ||
| 344 | u8 slack_started; | ||
| 349 | struct hrtimer period_timer; | 345 | struct hrtimer period_timer; |
| 350 | struct hrtimer slack_timer; | 346 | struct hrtimer slack_timer; |
| 351 | struct list_head throttled_cfs_rq; | 347 | struct list_head throttled_cfs_rq; |
| @@ -354,8 +350,6 @@ struct cfs_bandwidth { | |||
| 354 | int nr_periods; | 350 | int nr_periods; |
| 355 | int nr_throttled; | 351 | int nr_throttled; |
| 356 | u64 throttled_time; | 352 | u64 throttled_time; |
| 357 | |||
| 358 | bool distribute_running; | ||
| 359 | #endif | 353 | #endif |
| 360 | }; | 354 | }; |
| 361 | 355 | ||
| @@ -797,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work); | |||
| 797 | #endif | 791 | #endif |
| 798 | #endif /* CONFIG_SMP */ | 792 | #endif /* CONFIG_SMP */ |
| 799 | 793 | ||
| 794 | #ifdef CONFIG_UCLAMP_TASK | ||
| 795 | /* | ||
| 796 | * struct uclamp_bucket - Utilization clamp bucket | ||
| 797 | * @value: utilization clamp value for tasks on this clamp bucket | ||
| 798 | * @tasks: number of RUNNABLE tasks on this clamp bucket | ||
| 799 | * | ||
| 800 | * Keep track of how many tasks are RUNNABLE for a given utilization | ||
| 801 | * clamp value. | ||
| 802 | */ | ||
| 803 | struct uclamp_bucket { | ||
| 804 | unsigned long value : bits_per(SCHED_CAPACITY_SCALE); | ||
| 805 | unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); | ||
| 806 | }; | ||
| 807 | |||
| 808 | /* | ||
| 809 | * struct uclamp_rq - rq's utilization clamp | ||
| 810 | * @value: currently active clamp values for a rq | ||
| 811 | * @bucket: utilization clamp buckets affecting a rq | ||
| 812 | * | ||
| 813 | * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. | ||
| 814 | * A clamp value is affecting a rq when there is at least one task RUNNABLE | ||
| 815 | * (or actually running) with that value. | ||
| 816 | * | ||
| 817 | * There are up to UCLAMP_CNT possible different clamp values, currently there | ||
| 818 | * are only two: minimum utilization and maximum utilization. | ||
| 819 | * | ||
| 820 | * All utilization clamping values are MAX aggregated, since: | ||
| 821 | * - for util_min: we want to run the CPU at least at the max of the minimum | ||
| 822 | * utilization required by its currently RUNNABLE tasks. | ||
| 823 | * - for util_max: we want to allow the CPU to run up to the max of the | ||
| 824 | * maximum utilization allowed by its currently RUNNABLE tasks. | ||
| 825 | * | ||
| 826 | * Since on each system we expect only a limited number of different | ||
| 827 | * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track | ||
| 828 | * the metrics required to compute all the per-rq utilization clamp values. | ||
| 829 | */ | ||
| 830 | struct uclamp_rq { | ||
| 831 | unsigned int value; | ||
| 832 | struct uclamp_bucket bucket[UCLAMP_BUCKETS]; | ||
| 833 | }; | ||
| 834 | #endif /* CONFIG_UCLAMP_TASK */ | ||
| 835 | |||
| 800 | /* | 836 | /* |
| 801 | * This is the main, per-CPU runqueue data structure. | 837 | * This is the main, per-CPU runqueue data structure. |
| 802 | * | 838 | * |
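To make the uclamp_bucket comment above concrete, here is a standalone, compilable sketch of the same idea: each bucket packs a clamp value and a count of RUNNABLE tasks into one word, and the rq-wide clamp is the MAX over the non-empty buckets. The 11-bit width and the 5-bucket count mirror SCHED_CAPACITY_SCALE = 1024 and the usual default of CONFIG_UCLAMP_BUCKETS_COUNT; both are reproduced here as assumptions rather than taken from this hunk.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE    1024UL          /* assumed, as in the kernel */
#define CAPACITY_BITS           11              /* bits_per(SCHED_CAPACITY_SCALE) */
#define UCLAMP_BUCKETS          5               /* assumed default bucket count */

/* One word per bucket: clamp value plus count of RUNNABLE tasks using it. */
struct bucket {
        unsigned long value : CAPACITY_BITS;
        unsigned long tasks : (8 * sizeof(unsigned long)) - CAPACITY_BITS;
};

static struct bucket buckets[UCLAMP_BUCKETS];

/* MAX aggregation: the rq-wide clamp is the largest value of any non-empty bucket. */
static unsigned long rq_clamp(void)
{
        unsigned long max = 0;
        int i;

        for (i = 0; i < UCLAMP_BUCKETS; i++)
                if (buckets[i].tasks && buckets[i].value > max)
                        max = buckets[i].value;
        return max;
}

int main(void)
{
        buckets[1].value = 256; buckets[1].tasks = 3;   /* three tasks requesting util_min=256 */
        buckets[4].value = 900; buckets[4].tasks = 1;   /* one task requesting util_min=900 */
        printf("rq util_min = %lu\n", rq_clamp());      /* 900: highest runnable request wins */

        buckets[4].tasks = 0;                           /* the high-clamp task dequeues */
        printf("rq util_min = %lu\n", rq_clamp());      /* falls back to 256 */
        return 0;
}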
| @@ -818,8 +854,6 @@ struct rq { | |||
| 818 | unsigned int nr_preferred_running; | 854 | unsigned int nr_preferred_running; |
| 819 | unsigned int numa_migrate_on; | 855 | unsigned int numa_migrate_on; |
| 820 | #endif | 856 | #endif |
| 821 | #define CPU_LOAD_IDX_MAX 5 | ||
| 822 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | ||
| 823 | #ifdef CONFIG_NO_HZ_COMMON | 857 | #ifdef CONFIG_NO_HZ_COMMON |
| 824 | #ifdef CONFIG_SMP | 858 | #ifdef CONFIG_SMP |
| 825 | unsigned long last_load_update_tick; | 859 | unsigned long last_load_update_tick; |
| @@ -830,11 +864,16 @@ struct rq { | |||
| 830 | atomic_t nohz_flags; | 864 | atomic_t nohz_flags; |
| 831 | #endif /* CONFIG_NO_HZ_COMMON */ | 865 | #endif /* CONFIG_NO_HZ_COMMON */ |
| 832 | 866 | ||
| 833 | /* capture load from *all* tasks on this CPU: */ | ||
| 834 | struct load_weight load; | ||
| 835 | unsigned long nr_load_updates; | 867 | unsigned long nr_load_updates; |
| 836 | u64 nr_switches; | 868 | u64 nr_switches; |
| 837 | 869 | ||
| 870 | #ifdef CONFIG_UCLAMP_TASK | ||
| 871 | /* Utilization clamp values based on CPU's RUNNABLE tasks */ | ||
| 872 | struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; | ||
| 873 | unsigned int uclamp_flags; | ||
| 874 | #define UCLAMP_FLAG_IDLE 0x01 | ||
| 875 | #endif | ||
| 876 | |||
| 838 | struct cfs_rq cfs; | 877 | struct cfs_rq cfs; |
| 839 | struct rt_rq rt; | 878 | struct rt_rq rt; |
| 840 | struct dl_rq dl; | 879 | struct dl_rq dl; |
| @@ -1649,6 +1688,10 @@ extern const u32 sched_prio_to_wmult[40]; | |||
| 1649 | struct sched_class { | 1688 | struct sched_class { |
| 1650 | const struct sched_class *next; | 1689 | const struct sched_class *next; |
| 1651 | 1690 | ||
| 1691 | #ifdef CONFIG_UCLAMP_TASK | ||
| 1692 | int uclamp_enabled; | ||
| 1693 | #endif | ||
| 1694 | |||
| 1652 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); | 1695 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); |
| 1653 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); | 1696 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); |
| 1654 | void (*yield_task) (struct rq *rq); | 1697 | void (*yield_task) (struct rq *rq); |
| @@ -2222,6 +2265,48 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) | |||
| 2222 | static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} | 2265 | static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} |
| 2223 | #endif /* CONFIG_CPU_FREQ */ | 2266 | #endif /* CONFIG_CPU_FREQ */ |
| 2224 | 2267 | ||
| 2268 | #ifdef CONFIG_UCLAMP_TASK | ||
| 2269 | unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id); | ||
| 2270 | |||
| 2271 | static __always_inline | ||
| 2272 | unsigned int uclamp_util_with(struct rq *rq, unsigned int util, | ||
| 2273 | struct task_struct *p) | ||
| 2274 | { | ||
| 2275 | unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); | ||
| 2276 | unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); | ||
| 2277 | |||
| 2278 | if (p) { | ||
| 2279 | min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); | ||
| 2280 | max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX)); | ||
| 2281 | } | ||
| 2282 | |||
| 2283 | /* | ||
| 2284 | * Since CPU's {min,max}_util clamps are MAX aggregated considering | ||
| 2285 | * RUNNABLE tasks with _different_ clamps, we can end up with an | ||
| 2286 | * inversion. Fix it now when the clamps are applied. | ||
| 2287 | */ | ||
| 2288 | if (unlikely(min_util >= max_util)) | ||
| 2289 | return min_util; | ||
| 2290 | |||
| 2291 | return clamp(util, min_util, max_util); | ||
| 2292 | } | ||
| 2293 | |||
| 2294 | static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) | ||
| 2295 | { | ||
| 2296 | return uclamp_util_with(rq, util, NULL); | ||
| 2297 | } | ||
| 2298 | #else /* CONFIG_UCLAMP_TASK */ | ||
| 2299 | static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, | ||
| 2300 | struct task_struct *p) | ||
| 2301 | { | ||
| 2302 | return util; | ||
| 2303 | } | ||
| 2304 | static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) | ||
| 2305 | { | ||
| 2306 | return util; | ||
| 2307 | } | ||
| 2308 | #endif /* CONFIG_UCLAMP_TASK */ | ||
| 2309 | |||
| 2225 | #ifdef arch_scale_freq_capacity | 2310 | #ifdef arch_scale_freq_capacity |
| 2226 | # ifndef arch_scale_freq_invariant | 2311 | # ifndef arch_scale_freq_invariant |
| 2227 | # define arch_scale_freq_invariant() true | 2312 | # define arch_scale_freq_invariant() true |
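The clamping rule in uclamp_util_with() above can be exercised in isolation. The sketch below reproduces only that arithmetic (boost up to min_util, cap at max_util, and let min_util win on an inversion) so the corner case called out in the comment is easy to verify outside the kernel.

#include <assert.h>
#include <stdio.h>

static unsigned int clamp_util(unsigned int util, unsigned int min_util,
                               unsigned int max_util)
{
        if (min_util >= max_util)       /* inverted clamps: min_util wins */
                return min_util;
        if (util < min_util)
                return min_util;
        if (util > max_util)
                return max_util;
        return util;
}

int main(void)
{
        assert(clamp_util(100, 200, 800) == 200);       /* boosted up to util_min */
        assert(clamp_util(900, 200, 800) == 800);       /* capped at util_max */
        assert(clamp_util(500, 700, 300) == 700);       /* inversion resolved in favour of min */
        printf("clamping behaves as in uclamp_util_with()\n");
        return 0;
}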
| @@ -2237,7 +2322,6 @@ static inline unsigned long capacity_orig_of(int cpu) | |||
| 2237 | } | 2322 | } |
| 2238 | #endif | 2323 | #endif |
| 2239 | 2324 | ||
| 2240 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL | ||
| 2241 | /** | 2325 | /** |
| 2242 | * enum schedutil_type - CPU utilization type | 2326 | * enum schedutil_type - CPU utilization type |
| 2243 | * @FREQUENCY_UTIL: Utilization used to select frequency | 2327 | * @FREQUENCY_UTIL: Utilization used to select frequency |
| @@ -2253,15 +2337,11 @@ enum schedutil_type { | |||
| 2253 | ENERGY_UTIL, | 2337 | ENERGY_UTIL, |
| 2254 | }; | 2338 | }; |
| 2255 | 2339 | ||
| 2256 | unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, | 2340 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL |
| 2257 | unsigned long max, enum schedutil_type type); | ||
| 2258 | |||
| 2259 | static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) | ||
| 2260 | { | ||
| 2261 | unsigned long max = arch_scale_cpu_capacity(NULL, cpu); | ||
| 2262 | 2341 | ||
| 2263 | return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); | 2342 | unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, |
| 2264 | } | 2343 | unsigned long max, enum schedutil_type type, |
| 2344 | struct task_struct *p); | ||
| 2265 | 2345 | ||
| 2266 | static inline unsigned long cpu_bw_dl(struct rq *rq) | 2346 | static inline unsigned long cpu_bw_dl(struct rq *rq) |
| 2267 | { | 2347 | { |
| @@ -2290,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq) | |||
| 2290 | return READ_ONCE(rq->avg_rt.util_avg); | 2370 | return READ_ONCE(rq->avg_rt.util_avg); |
| 2291 | } | 2371 | } |
| 2292 | #else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ | 2372 | #else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ |
| 2293 | static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) | 2373 | static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, |
| 2374 | unsigned long max, enum schedutil_type type, | ||
| 2375 | struct task_struct *p) | ||
| 2294 | { | 2376 | { |
| 2295 | return cfs; | 2377 | return 0; |
| 2296 | } | 2378 | } |
| 2297 | #endif | 2379 | #endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ |
| 2298 | 2380 | ||
| 2299 | #ifdef CONFIG_HAVE_SCHED_AVG_IRQ | 2381 | #ifdef CONFIG_HAVE_SCHED_AVG_IRQ |
| 2300 | static inline unsigned long cpu_util_irq(struct rq *rq) | 2382 | static inline unsigned long cpu_util_irq(struct rq *rq) |
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f53f89df837d..f751ce0b783e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c | |||
| @@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl, | |||
| 1344 | .imbalance_pct = 125, | 1344 | .imbalance_pct = 125, |
| 1345 | 1345 | ||
| 1346 | .cache_nice_tries = 0, | 1346 | .cache_nice_tries = 0, |
| 1347 | .busy_idx = 0, | ||
| 1348 | .idle_idx = 0, | ||
| 1349 | .newidle_idx = 0, | ||
| 1350 | .wake_idx = 0, | ||
| 1351 | .forkexec_idx = 0, | ||
| 1352 | 1347 | ||
| 1353 | .flags = 1*SD_LOAD_BALANCE | 1348 | .flags = 1*SD_LOAD_BALANCE |
| 1354 | | 1*SD_BALANCE_NEWIDLE | 1349 | | 1*SD_BALANCE_NEWIDLE |
| @@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl, | |||
| 1400 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { | 1395 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { |
| 1401 | sd->imbalance_pct = 117; | 1396 | sd->imbalance_pct = 117; |
| 1402 | sd->cache_nice_tries = 1; | 1397 | sd->cache_nice_tries = 1; |
| 1403 | sd->busy_idx = 2; | ||
| 1404 | 1398 | ||
| 1405 | #ifdef CONFIG_NUMA | 1399 | #ifdef CONFIG_NUMA |
| 1406 | } else if (sd->flags & SD_NUMA) { | 1400 | } else if (sd->flags & SD_NUMA) { |
| 1407 | sd->cache_nice_tries = 2; | 1401 | sd->cache_nice_tries = 2; |
| 1408 | sd->busy_idx = 3; | ||
| 1409 | sd->idle_idx = 2; | ||
| 1410 | 1402 | ||
| 1411 | sd->flags &= ~SD_PREFER_SIBLING; | 1403 | sd->flags &= ~SD_PREFER_SIBLING; |
| 1412 | sd->flags |= SD_SERIALIZE; | 1404 | sd->flags |= SD_SERIALIZE; |
| @@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl, | |||
| 1419 | #endif | 1411 | #endif |
| 1420 | } else { | 1412 | } else { |
| 1421 | sd->cache_nice_tries = 1; | 1413 | sd->cache_nice_tries = 1; |
| 1422 | sd->busy_idx = 2; | ||
| 1423 | sd->idle_idx = 1; | ||
| 1424 | } | 1414 | } |
| 1425 | 1415 | ||
| 1426 | /* | 1416 | /* |
| @@ -1884,10 +1874,10 @@ static struct sched_domain_topology_level | |||
| 1884 | unsigned long cap; | 1874 | unsigned long cap; |
| 1885 | 1875 | ||
| 1886 | /* Is there any asymmetry? */ | 1876 | /* Is there any asymmetry? */ |
| 1887 | cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); | 1877 | cap = arch_scale_cpu_capacity(cpumask_first(cpu_map)); |
| 1888 | 1878 | ||
| 1889 | for_each_cpu(i, cpu_map) { | 1879 | for_each_cpu(i, cpu_map) { |
| 1890 | if (arch_scale_cpu_capacity(NULL, i) != cap) { | 1880 | if (arch_scale_cpu_capacity(i) != cap) { |
| 1891 | asym = true; | 1881 | asym = true; |
| 1892 | break; | 1882 | break; |
| 1893 | } | 1883 | } |
| @@ -1902,7 +1892,7 @@ static struct sched_domain_topology_level | |||
| 1902 | * to everyone. | 1892 | * to everyone. |
| 1903 | */ | 1893 | */ |
| 1904 | for_each_cpu(i, cpu_map) { | 1894 | for_each_cpu(i, cpu_map) { |
| 1905 | unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); | 1895 | unsigned long max_capacity = arch_scale_cpu_capacity(i); |
| 1906 | int tl_id = 0; | 1896 | int tl_id = 0; |
| 1907 | 1897 | ||
| 1908 | for_each_sd_topology(tl) { | 1898 | for_each_sd_topology(tl) { |
| @@ -1912,7 +1902,7 @@ static struct sched_domain_topology_level | |||
| 1912 | for_each_cpu_and(j, tl->mask(i), cpu_map) { | 1902 | for_each_cpu_and(j, tl->mask(i), cpu_map) { |
| 1913 | unsigned long capacity; | 1903 | unsigned long capacity; |
| 1914 | 1904 | ||
| 1915 | capacity = arch_scale_cpu_capacity(NULL, j); | 1905 | capacity = arch_scale_cpu_capacity(j); |
| 1916 | 1906 | ||
| 1917 | if (capacity <= max_capacity) | 1907 | if (capacity <= max_capacity) |
| 1918 | continue; | 1908 | continue; |
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 6eb1f8efd221..c1e566a114ca 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Generic waiting primitives. | 3 | * Generic waiting primitives. |
| 3 | * | 4 | * |
| @@ -117,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int | |||
| 117 | bookmark.func = NULL; | 118 | bookmark.func = NULL; |
| 118 | INIT_LIST_HEAD(&bookmark.entry); | 119 | INIT_LIST_HEAD(&bookmark.entry); |
| 119 | 120 | ||
| 120 | spin_lock_irqsave(&wq_head->lock, flags); | 121 | do { |
| 121 | nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); | ||
| 122 | spin_unlock_irqrestore(&wq_head->lock, flags); | ||
| 123 | |||
| 124 | while (bookmark.flags & WQ_FLAG_BOOKMARK) { | ||
| 125 | spin_lock_irqsave(&wq_head->lock, flags); | 122 | spin_lock_irqsave(&wq_head->lock, flags); |
| 126 | nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, | 123 | nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, |
| 127 | wake_flags, key, &bookmark); | 124 | wake_flags, key, &bookmark); |
| 128 | spin_unlock_irqrestore(&wq_head->lock, flags); | 125 | spin_unlock_irqrestore(&wq_head->lock, flags); |
| 129 | } | 126 | } while (bookmark.flags & WQ_FLAG_BOOKMARK); |
| 130 | } | 127 | } |
| 131 | 128 | ||
| 132 | /** | 129 | /** |
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index c67c6d24adc2..45eba18a2898 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * The implementation of the wait_bit*() and related waiting APIs: | 3 | * The implementation of the wait_bit*() and related waiting APIs: |
| 3 | */ | 4 | */ |
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 811b4a86cdf6..dba52a7db5e8 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c | |||
| @@ -609,7 +609,7 @@ static void seccomp_send_sigsys(int syscall, int reason) | |||
| 609 | { | 609 | { |
| 610 | struct kernel_siginfo info; | 610 | struct kernel_siginfo info; |
| 611 | seccomp_init_siginfo(&info, syscall, reason); | 611 | seccomp_init_siginfo(&info, syscall, reason); |
| 612 | force_sig_info(SIGSYS, &info, current); | 612 | force_sig_info(&info); |
| 613 | } | 613 | } |
| 614 | #endif /* CONFIG_SECCOMP_FILTER */ | 614 | #endif /* CONFIG_SECCOMP_FILTER */ |
| 615 | 615 | ||
diff --git a/kernel/signal.c b/kernel/signal.c index 62f9aea4a15a..dabe100d2091 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/signal.c | 3 | * linux/kernel/signal.c |
| 3 | * | 4 | * |
| @@ -44,6 +45,7 @@ | |||
| 44 | #include <linux/posix-timers.h> | 45 | #include <linux/posix-timers.h> |
| 45 | #include <linux/livepatch.h> | 46 | #include <linux/livepatch.h> |
| 46 | #include <linux/cgroup.h> | 47 | #include <linux/cgroup.h> |
| 48 | #include <linux/audit.h> | ||
| 47 | 49 | ||
| 48 | #define CREATE_TRACE_POINTS | 50 | #define CREATE_TRACE_POINTS |
| 49 | #include <trace/events/signal.h> | 51 | #include <trace/events/signal.h> |
| @@ -53,7 +55,6 @@ | |||
| 53 | #include <asm/unistd.h> | 55 | #include <asm/unistd.h> |
| 54 | #include <asm/siginfo.h> | 56 | #include <asm/siginfo.h> |
| 55 | #include <asm/cacheflush.h> | 57 | #include <asm/cacheflush.h> |
| 56 | #include "audit.h" /* audit_signal_info() */ | ||
| 57 | 58 | ||
| 58 | /* | 59 | /* |
| 59 | * SLAB caches for signal bits. | 60 | * SLAB caches for signal bits. |
| @@ -840,6 +841,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, | |||
| 840 | */ | 841 | */ |
| 841 | if (!sid || sid == task_session(current)) | 842 | if (!sid || sid == task_session(current)) |
| 842 | break; | 843 | break; |
| 844 | /* fall through */ | ||
| 843 | default: | 845 | default: |
| 844 | return -EPERM; | 846 | return -EPERM; |
| 845 | } | 847 | } |
| @@ -1055,29 +1057,8 @@ static inline bool legacy_queue(struct sigpending *signals, int sig) | |||
| 1055 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); | 1057 | return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); |
| 1056 | } | 1058 | } |
| 1057 | 1059 | ||
| 1058 | #ifdef CONFIG_USER_NS | ||
| 1059 | static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t) | ||
| 1060 | { | ||
| 1061 | if (current_user_ns() == task_cred_xxx(t, user_ns)) | ||
| 1062 | return; | ||
| 1063 | |||
| 1064 | if (SI_FROMKERNEL(info)) | ||
| 1065 | return; | ||
| 1066 | |||
| 1067 | rcu_read_lock(); | ||
| 1068 | info->si_uid = from_kuid_munged(task_cred_xxx(t, user_ns), | ||
| 1069 | make_kuid(current_user_ns(), info->si_uid)); | ||
| 1070 | rcu_read_unlock(); | ||
| 1071 | } | ||
| 1072 | #else | ||
| 1073 | static inline void userns_fixup_signal_uid(struct kernel_siginfo *info, struct task_struct *t) | ||
| 1074 | { | ||
| 1075 | return; | ||
| 1076 | } | ||
| 1077 | #endif | ||
| 1078 | |||
| 1079 | static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, | 1060 | static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, |
| 1080 | enum pid_type type, int from_ancestor_ns) | 1061 | enum pid_type type, bool force) |
| 1081 | { | 1062 | { |
| 1082 | struct sigpending *pending; | 1063 | struct sigpending *pending; |
| 1083 | struct sigqueue *q; | 1064 | struct sigqueue *q; |
| @@ -1087,8 +1068,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc | |||
| 1087 | assert_spin_locked(&t->sighand->siglock); | 1068 | assert_spin_locked(&t->sighand->siglock); |
| 1088 | 1069 | ||
| 1089 | result = TRACE_SIGNAL_IGNORED; | 1070 | result = TRACE_SIGNAL_IGNORED; |
| 1090 | if (!prepare_signal(sig, t, | 1071 | if (!prepare_signal(sig, t, force)) |
| 1091 | from_ancestor_ns || (info == SEND_SIG_PRIV))) | ||
| 1092 | goto ret; | 1072 | goto ret; |
| 1093 | 1073 | ||
| 1094 | pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; | 1074 | pending = (type != PIDTYPE_PID) ? &t->signal->shared_pending : &t->pending; |
| @@ -1133,7 +1113,11 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc | |||
| 1133 | q->info.si_code = SI_USER; | 1113 | q->info.si_code = SI_USER; |
| 1134 | q->info.si_pid = task_tgid_nr_ns(current, | 1114 | q->info.si_pid = task_tgid_nr_ns(current, |
| 1135 | task_active_pid_ns(t)); | 1115 | task_active_pid_ns(t)); |
| 1136 | q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); | 1116 | rcu_read_lock(); |
| 1117 | q->info.si_uid = | ||
| 1118 | from_kuid_munged(task_cred_xxx(t, user_ns), | ||
| 1119 | current_uid()); | ||
| 1120 | rcu_read_unlock(); | ||
| 1137 | break; | 1121 | break; |
| 1138 | case (unsigned long) SEND_SIG_PRIV: | 1122 | case (unsigned long) SEND_SIG_PRIV: |
| 1139 | clear_siginfo(&q->info); | 1123 | clear_siginfo(&q->info); |
| @@ -1145,30 +1129,24 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc | |||
| 1145 | break; | 1129 | break; |
| 1146 | default: | 1130 | default: |
| 1147 | copy_siginfo(&q->info, info); | 1131 | copy_siginfo(&q->info, info); |
| 1148 | if (from_ancestor_ns) | ||
| 1149 | q->info.si_pid = 0; | ||
| 1150 | break; | 1132 | break; |
| 1151 | } | 1133 | } |
| 1152 | 1134 | } else if (!is_si_special(info) && | |
| 1153 | userns_fixup_signal_uid(&q->info, t); | 1135 | sig >= SIGRTMIN && info->si_code != SI_USER) { |
| 1154 | 1136 | /* | |
| 1155 | } else if (!is_si_special(info)) { | 1137 | * Queue overflow, abort. We may abort if the |
| 1156 | if (sig >= SIGRTMIN && info->si_code != SI_USER) { | 1138 | * signal was rt and sent by user using something |
| 1157 | /* | 1139 | * other than kill(). |
| 1158 | * Queue overflow, abort. We may abort if the | 1140 | */ |
| 1159 | * signal was rt and sent by user using something | 1141 | result = TRACE_SIGNAL_OVERFLOW_FAIL; |
| 1160 | * other than kill(). | 1142 | ret = -EAGAIN; |
| 1161 | */ | 1143 | goto ret; |
| 1162 | result = TRACE_SIGNAL_OVERFLOW_FAIL; | 1144 | } else { |
| 1163 | ret = -EAGAIN; | 1145 | /* |
| 1164 | goto ret; | 1146 | * This is a silent loss of information. We still |
| 1165 | } else { | 1147 | * send the signal, but the *info bits are lost. |
| 1166 | /* | 1148 | */ |
| 1167 | * This is a silent loss of information. We still | 1149 | result = TRACE_SIGNAL_LOSE_INFO; |
| 1168 | * send the signal, but the *info bits are lost. | ||
| 1169 | */ | ||
| 1170 | result = TRACE_SIGNAL_LOSE_INFO; | ||
| 1171 | } | ||
| 1172 | } | 1150 | } |
| 1173 | 1151 | ||
| 1174 | out_set: | 1152 | out_set: |
| @@ -1195,17 +1173,62 @@ ret: | |||
| 1195 | return ret; | 1173 | return ret; |
| 1196 | } | 1174 | } |
| 1197 | 1175 | ||
| 1176 | static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) | ||
| 1177 | { | ||
| 1178 | bool ret = false; | ||
| 1179 | switch (siginfo_layout(info->si_signo, info->si_code)) { | ||
| 1180 | case SIL_KILL: | ||
| 1181 | case SIL_CHLD: | ||
| 1182 | case SIL_RT: | ||
| 1183 | ret = true; | ||
| 1184 | break; | ||
| 1185 | case SIL_TIMER: | ||
| 1186 | case SIL_POLL: | ||
| 1187 | case SIL_FAULT: | ||
| 1188 | case SIL_FAULT_MCEERR: | ||
| 1189 | case SIL_FAULT_BNDERR: | ||
| 1190 | case SIL_FAULT_PKUERR: | ||
| 1191 | case SIL_SYS: | ||
| 1192 | ret = false; | ||
| 1193 | break; | ||
| 1194 | } | ||
| 1195 | return ret; | ||
| 1196 | } | ||
| 1197 | |||
| 1198 | static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, | 1198 | static int send_signal(int sig, struct kernel_siginfo *info, struct task_struct *t, |
| 1199 | enum pid_type type) | 1199 | enum pid_type type) |
| 1200 | { | 1200 | { |
| 1201 | int from_ancestor_ns = 0; | 1201 | /* Should SIGKILL or SIGSTOP be received by a pid namespace init? */ |
| 1202 | bool force = false; | ||
| 1202 | 1203 | ||
| 1203 | #ifdef CONFIG_PID_NS | 1204 | if (info == SEND_SIG_NOINFO) { |
| 1204 | from_ancestor_ns = si_fromuser(info) && | 1205 | /* Force if sent from an ancestor pid namespace */ |
| 1205 | !task_pid_nr_ns(current, task_active_pid_ns(t)); | 1206 | force = !task_pid_nr_ns(current, task_active_pid_ns(t)); |
| 1206 | #endif | 1207 | } else if (info == SEND_SIG_PRIV) { |
| 1208 | /* Don't ignore kernel generated signals */ | ||
| 1209 | force = true; | ||
| 1210 | } else if (has_si_pid_and_uid(info)) { | ||
| 1211 | /* SIGKILL and SIGSTOP are special or have ids */ | ||
| 1212 | struct user_namespace *t_user_ns; | ||
| 1213 | |||
| 1214 | rcu_read_lock(); | ||
| 1215 | t_user_ns = task_cred_xxx(t, user_ns); | ||
| 1216 | if (current_user_ns() != t_user_ns) { | ||
| 1217 | kuid_t uid = make_kuid(current_user_ns(), info->si_uid); | ||
| 1218 | info->si_uid = from_kuid_munged(t_user_ns, uid); | ||
| 1219 | } | ||
| 1220 | rcu_read_unlock(); | ||
| 1207 | 1221 | ||
| 1208 | return __send_signal(sig, info, t, type, from_ancestor_ns); | 1222 | /* A kernel generated signal? */ |
| 1223 | force = (info->si_code == SI_KERNEL); | ||
| 1224 | |||
| 1225 | /* From an ancestor pid namespace? */ | ||
| 1226 | if (!task_pid_nr_ns(current, task_active_pid_ns(t))) { | ||
| 1227 | info->si_pid = 0; | ||
| 1228 | force = true; | ||
| 1229 | } | ||
| 1230 | } | ||
| 1231 | return __send_signal(sig, info, t, type, force); | ||
| 1209 | } | 1232 | } |
| 1210 | 1233 | ||
| 1211 | static void print_fatal_signal(int signr) | 1234 | static void print_fatal_signal(int signr) |
| @@ -1272,12 +1295,13 @@ int do_send_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *p | |||
| 1272 | * We don't want to have recursive SIGSEGV's etc, for example, | 1295 | * We don't want to have recursive SIGSEGV's etc, for example, |
| 1273 | * that is why we also clear SIGNAL_UNKILLABLE. | 1296 | * that is why we also clear SIGNAL_UNKILLABLE. |
| 1274 | */ | 1297 | */ |
| 1275 | int | 1298 | static int |
| 1276 | force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) | 1299 | force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) |
| 1277 | { | 1300 | { |
| 1278 | unsigned long int flags; | 1301 | unsigned long int flags; |
| 1279 | int ret, blocked, ignored; | 1302 | int ret, blocked, ignored; |
| 1280 | struct k_sigaction *action; | 1303 | struct k_sigaction *action; |
| 1304 | int sig = info->si_signo; | ||
| 1281 | 1305 | ||
| 1282 | spin_lock_irqsave(&t->sighand->siglock, flags); | 1306 | spin_lock_irqsave(&t->sighand->siglock, flags); |
| 1283 | action = &t->sighand->action[sig-1]; | 1307 | action = &t->sighand->action[sig-1]; |
| @@ -1302,6 +1326,11 @@ force_sig_info(int sig, struct kernel_siginfo *info, struct task_struct *t) | |||
| 1302 | return ret; | 1326 | return ret; |
| 1303 | } | 1327 | } |
| 1304 | 1328 | ||
| 1329 | int force_sig_info(struct kernel_siginfo *info) | ||
| 1330 | { | ||
| 1331 | return force_sig_info_to_task(info, current); | ||
| 1332 | } | ||
| 1333 | |||
| 1305 | /* | 1334 | /* |
| 1306 | * Nuke all other threads in the group. | 1335 | * Nuke all other threads in the group. |
| 1307 | */ | 1336 | */ |
| @@ -1438,13 +1467,44 @@ static inline bool kill_as_cred_perm(const struct cred *cred, | |||
| 1438 | uid_eq(cred->uid, pcred->uid); | 1467 | uid_eq(cred->uid, pcred->uid); |
| 1439 | } | 1468 | } |
| 1440 | 1469 | ||
| 1441 | /* like kill_pid_info(), but doesn't use uid/euid of "current" */ | 1470 | /* |
| 1442 | int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid, | 1471 | * The usb asyncio usage of siginfo is wrong. The glibc support |
| 1443 | const struct cred *cred) | 1472 | * for asyncio which uses SI_ASYNCIO assumes the layout is SIL_RT. |
| 1473 | * AKA after the generic fields: | ||
| 1474 | * kernel_pid_t si_pid; | ||
| 1475 | * kernel_uid32_t si_uid; | ||
| 1476 | * sigval_t si_value; | ||
| 1477 | * | ||
| 1478 | * Unfortunately when usb generates SI_ASYNCIO it assumes the layout | ||
| 1479 | * after the generic fields is: | ||
| 1480 | * void __user *si_addr; | ||
| 1481 | * | ||
| 1482 | * This is a practical problem when there is a 64bit big endian kernel | ||
| 1483 | * and a 32bit userspace. As the 32bit address will be encoded in the low | ||
| 1484 | * 32bits of the pointer, those low 32bits will be stored at a higher | ||
| 1485 | * address than they would occupy in a 32 bit pointer. So userspace will not | ||
| 1486 | * see the address it was expecting for its completions. | ||
| 1487 | * | ||
| 1488 | * There is nothing in the encoding that can allow | ||
| 1489 | * copy_siginfo_to_user32 to detect this confusion of formats, so | ||
| 1490 | * handle this by requiring the caller of kill_pid_usb_asyncio to | ||
| 1491 | * notice when this situation takes place and to store the 32bit | ||
| 1492 | * pointer in sival_int, instead of sival_ptr of the sigval_t addr | ||
| 1493 | * parameter. | ||
| 1494 | */ | ||
| 1495 | int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, | ||
| 1496 | struct pid *pid, const struct cred *cred) | ||
| 1444 | { | 1497 | { |
| 1445 | int ret = -EINVAL; | 1498 | struct kernel_siginfo info; |
| 1446 | struct task_struct *p; | 1499 | struct task_struct *p; |
| 1447 | unsigned long flags; | 1500 | unsigned long flags; |
| 1501 | int ret = -EINVAL; | ||
| 1502 | |||
| 1503 | clear_siginfo(&info); | ||
| 1504 | info.si_signo = sig; | ||
| 1505 | info.si_errno = errno; | ||
| 1506 | info.si_code = SI_ASYNCIO; | ||
| 1507 | *((sigval_t *)&info.si_pid) = addr; | ||
| 1448 | 1508 | ||
| 1449 | if (!valid_signal(sig)) | 1509 | if (!valid_signal(sig)) |
| 1450 | return ret; | 1510 | return ret; |
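The kill_pid_usb_asyncio() comment above hinges on where the meaningful 32 bits land inside a pointer-sized field. The small host-side demonstration below (ordinary userspace code, not kernel code) shows the effect: a 32-bit value widened into a 64-bit slot is invisible to a 32-bit reader overlaying the start of that slot on a big-endian machine, which is why callers are asked to pass the address through sival_int instead of sival_ptr.

#include <stdio.h>
#include <string.h>

union payload {
        unsigned long long as_ptr;      /* a pointer-sized slot, as in the 64bit layout */
        int as_int;                     /* what a 32bit sival_int reader overlays at offset 0 */
};

int main(void)
{
        union payload p;
        unsigned char raw[sizeof(p)];

        memset(&p, 0, sizeof(p));
        p.as_ptr = 0x12345678;          /* a 32bit user pointer widened to 64 bits */
        memcpy(raw, &p, sizeof(raw));

        /*
         * Little endian: the low bytes sit at offset 0 and as_int recovers
         * 0x12345678. Big endian: the low bytes sit at offsets 4..7, so a
         * 32bit reader at offset 0 sees 0 - the mismatch the comment
         * describes.
         */
        printf("bytes 0..3: %02x %02x %02x %02x, as_int = 0x%x\n",
               raw[0], raw[1], raw[2], raw[3], (unsigned int)p.as_int);
        return 0;
}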
| @@ -1455,17 +1515,17 @@ int kill_pid_info_as_cred(int sig, struct kernel_siginfo *info, struct pid *pid, | |||
| 1455 | ret = -ESRCH; | 1515 | ret = -ESRCH; |
| 1456 | goto out_unlock; | 1516 | goto out_unlock; |
| 1457 | } | 1517 | } |
| 1458 | if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) { | 1518 | if (!kill_as_cred_perm(cred, p)) { |
| 1459 | ret = -EPERM; | 1519 | ret = -EPERM; |
| 1460 | goto out_unlock; | 1520 | goto out_unlock; |
| 1461 | } | 1521 | } |
| 1462 | ret = security_task_kill(p, info, sig, cred); | 1522 | ret = security_task_kill(p, &info, sig, cred); |
| 1463 | if (ret) | 1523 | if (ret) |
| 1464 | goto out_unlock; | 1524 | goto out_unlock; |
| 1465 | 1525 | ||
| 1466 | if (sig) { | 1526 | if (sig) { |
| 1467 | if (lock_task_sighand(p, &flags)) { | 1527 | if (lock_task_sighand(p, &flags)) { |
| 1468 | ret = __send_signal(sig, info, p, PIDTYPE_TGID, 0); | 1528 | ret = __send_signal(sig, &info, p, PIDTYPE_TGID, false); |
| 1469 | unlock_task_sighand(p, &flags); | 1529 | unlock_task_sighand(p, &flags); |
| 1470 | } else | 1530 | } else |
| 1471 | ret = -ESRCH; | 1531 | ret = -ESRCH; |
| @@ -1474,7 +1534,7 @@ out_unlock: | |||
| 1474 | rcu_read_unlock(); | 1534 | rcu_read_unlock(); |
| 1475 | return ret; | 1535 | return ret; |
| 1476 | } | 1536 | } |
| 1477 | EXPORT_SYMBOL_GPL(kill_pid_info_as_cred); | 1537 | EXPORT_SYMBOL_GPL(kill_pid_usb_asyncio); |
| 1478 | 1538 | ||
| 1479 | /* | 1539 | /* |
| 1480 | * kill_something_info() interprets pid in interesting ways just like kill(2). | 1540 | * kill_something_info() interprets pid in interesting ways just like kill(2). |
| @@ -1550,9 +1610,17 @@ send_sig(int sig, struct task_struct *p, int priv) | |||
| 1550 | } | 1610 | } |
| 1551 | EXPORT_SYMBOL(send_sig); | 1611 | EXPORT_SYMBOL(send_sig); |
| 1552 | 1612 | ||
| 1553 | void force_sig(int sig, struct task_struct *p) | 1613 | void force_sig(int sig) |
| 1554 | { | 1614 | { |
| 1555 | force_sig_info(sig, SEND_SIG_PRIV, p); | 1615 | struct kernel_siginfo info; |
| 1616 | |||
| 1617 | clear_siginfo(&info); | ||
| 1618 | info.si_signo = sig; | ||
| 1619 | info.si_errno = 0; | ||
| 1620 | info.si_code = SI_KERNEL; | ||
| 1621 | info.si_pid = 0; | ||
| 1622 | info.si_uid = 0; | ||
| 1623 | force_sig_info(&info); | ||
| 1556 | } | 1624 | } |
| 1557 | EXPORT_SYMBOL(force_sig); | 1625 | EXPORT_SYMBOL(force_sig); |
| 1558 | 1626 | ||
| @@ -1562,18 +1630,20 @@ EXPORT_SYMBOL(force_sig); | |||
| 1562 | * the problem was already a SIGSEGV, we'll want to | 1630 | * the problem was already a SIGSEGV, we'll want to |
| 1563 | * make sure we don't even try to deliver the signal.. | 1631 | * make sure we don't even try to deliver the signal.. |
| 1564 | */ | 1632 | */ |
| 1565 | void force_sigsegv(int sig, struct task_struct *p) | 1633 | void force_sigsegv(int sig) |
| 1566 | { | 1634 | { |
| 1635 | struct task_struct *p = current; | ||
| 1636 | |||
| 1567 | if (sig == SIGSEGV) { | 1637 | if (sig == SIGSEGV) { |
| 1568 | unsigned long flags; | 1638 | unsigned long flags; |
| 1569 | spin_lock_irqsave(&p->sighand->siglock, flags); | 1639 | spin_lock_irqsave(&p->sighand->siglock, flags); |
| 1570 | p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; | 1640 | p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL; |
| 1571 | spin_unlock_irqrestore(&p->sighand->siglock, flags); | 1641 | spin_unlock_irqrestore(&p->sighand->siglock, flags); |
| 1572 | } | 1642 | } |
| 1573 | force_sig(SIGSEGV, p); | 1643 | force_sig(SIGSEGV); |
| 1574 | } | 1644 | } |
| 1575 | 1645 | ||
| 1576 | int force_sig_fault(int sig, int code, void __user *addr | 1646 | int force_sig_fault_to_task(int sig, int code, void __user *addr |
| 1577 | ___ARCH_SI_TRAPNO(int trapno) | 1647 | ___ARCH_SI_TRAPNO(int trapno) |
| 1578 | ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) | 1648 | ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) |
| 1579 | , struct task_struct *t) | 1649 | , struct task_struct *t) |
| @@ -1593,7 +1663,16 @@ int force_sig_fault(int sig, int code, void __user *addr | |||
| 1593 | info.si_flags = flags; | 1663 | info.si_flags = flags; |
| 1594 | info.si_isr = isr; | 1664 | info.si_isr = isr; |
| 1595 | #endif | 1665 | #endif |
| 1596 | return force_sig_info(info.si_signo, &info, t); | 1666 | return force_sig_info_to_task(&info, t); |
| 1667 | } | ||
| 1668 | |||
| 1669 | int force_sig_fault(int sig, int code, void __user *addr | ||
| 1670 | ___ARCH_SI_TRAPNO(int trapno) | ||
| 1671 | ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)) | ||
| 1672 | { | ||
| 1673 | return force_sig_fault_to_task(sig, code, addr | ||
| 1674 | ___ARCH_SI_TRAPNO(trapno) | ||
| 1675 | ___ARCH_SI_IA64(imm, flags, isr), current); | ||
| 1597 | } | 1676 | } |
| 1598 | 1677 | ||
| 1599 | int send_sig_fault(int sig, int code, void __user *addr | 1678 | int send_sig_fault(int sig, int code, void __user *addr |
| @@ -1619,7 +1698,7 @@ int send_sig_fault(int sig, int code, void __user *addr | |||
| 1619 | return send_sig_info(info.si_signo, &info, t); | 1698 | return send_sig_info(info.si_signo, &info, t); |
| 1620 | } | 1699 | } |
| 1621 | 1700 | ||
| 1622 | int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) | 1701 | int force_sig_mceerr(int code, void __user *addr, short lsb) |
| 1623 | { | 1702 | { |
| 1624 | struct kernel_siginfo info; | 1703 | struct kernel_siginfo info; |
| 1625 | 1704 | ||
| @@ -1630,7 +1709,7 @@ int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct | |||
| 1630 | info.si_code = code; | 1709 | info.si_code = code; |
| 1631 | info.si_addr = addr; | 1710 | info.si_addr = addr; |
| 1632 | info.si_addr_lsb = lsb; | 1711 | info.si_addr_lsb = lsb; |
| 1633 | return force_sig_info(info.si_signo, &info, t); | 1712 | return force_sig_info(&info); |
| 1634 | } | 1713 | } |
| 1635 | 1714 | ||
| 1636 | int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) | 1715 | int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) |
| @@ -1659,7 +1738,7 @@ int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) | |||
| 1659 | info.si_addr = addr; | 1738 | info.si_addr = addr; |
| 1660 | info.si_lower = lower; | 1739 | info.si_lower = lower; |
| 1661 | info.si_upper = upper; | 1740 | info.si_upper = upper; |
| 1662 | return force_sig_info(info.si_signo, &info, current); | 1741 | return force_sig_info(&info); |
| 1663 | } | 1742 | } |
| 1664 | 1743 | ||
| 1665 | #ifdef SEGV_PKUERR | 1744 | #ifdef SEGV_PKUERR |
| @@ -1673,7 +1752,7 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) | |||
| 1673 | info.si_code = SEGV_PKUERR; | 1752 | info.si_code = SEGV_PKUERR; |
| 1674 | info.si_addr = addr; | 1753 | info.si_addr = addr; |
| 1675 | info.si_pkey = pkey; | 1754 | info.si_pkey = pkey; |
| 1676 | return force_sig_info(info.si_signo, &info, current); | 1755 | return force_sig_info(&info); |
| 1677 | } | 1756 | } |
| 1678 | #endif | 1757 | #endif |
| 1679 | 1758 | ||
| @@ -1689,7 +1768,7 @@ int force_sig_ptrace_errno_trap(int errno, void __user *addr) | |||
| 1689 | info.si_errno = errno; | 1768 | info.si_errno = errno; |
| 1690 | info.si_code = TRAP_HWBKPT; | 1769 | info.si_code = TRAP_HWBKPT; |
| 1691 | info.si_addr = addr; | 1770 | info.si_addr = addr; |
| 1692 | return force_sig_info(info.si_signo, &info, current); | 1771 | return force_sig_info(&info); |
| 1693 | } | 1772 | } |
| 1694 | 1773 | ||
| 1695 | int kill_pgrp(struct pid *pid, int sig, int priv) | 1774 | int kill_pgrp(struct pid *pid, int sig, int priv) |
| @@ -1802,6 +1881,14 @@ ret: | |||
| 1802 | return ret; | 1881 | return ret; |
| 1803 | } | 1882 | } |
| 1804 | 1883 | ||
| 1884 | static void do_notify_pidfd(struct task_struct *task) | ||
| 1885 | { | ||
| 1886 | struct pid *pid; | ||
| 1887 | |||
| 1888 | pid = task_pid(task); | ||
| 1889 | wake_up_all(&pid->wait_pidfd); | ||
| 1890 | } | ||
| 1891 | |||
| 1805 | /* | 1892 | /* |
| 1806 | * Let a parent know about the death of a child. | 1893 | * Let a parent know about the death of a child. |
| 1807 | * For a stopped/continued status change, use do_notify_parent_cldstop instead. | 1894 | * For a stopped/continued status change, use do_notify_parent_cldstop instead. |
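do_notify_pidfd() above is what lets userspace wait for process exit through a pidfd. The sketch below assumes the companion pidfd poll support from the same series and a pidfd_open() syscall (the number 434 is the x86-64 value and is defined locally as an assumption for older headers); on kernels without pidfd_open() the fd would come from clone() with CLONE_PIDFD instead.

#include <poll.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434             /* x86-64 value; assumption for older headers */
#endif

int main(int argc, char **argv)
{
        struct pollfd fds;
        int pidfd;

        if (argc != 2) {
                fprintf(stderr, "usage: %s <pid>\n", argv[0]);
                return 1;
        }

        pidfd = (int)syscall(__NR_pidfd_open, (pid_t)atoi(argv[1]), 0);
        if (pidfd < 0) {
                perror("pidfd_open");
                return 1;
        }

        fds.fd = pidfd;
        fds.events = POLLIN;            /* becomes readable once the target exits */

        if (poll(&fds, 1, -1) < 0) {
                perror("poll");
                return 1;
        }
        printf("pid %s has exited\n", argv[1]);
        close(pidfd);
        return 0;
}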
| @@ -1825,6 +1912,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) | |||
| 1825 | BUG_ON(!tsk->ptrace && | 1912 | BUG_ON(!tsk->ptrace && |
| 1826 | (tsk->group_leader != tsk || !thread_group_empty(tsk))); | 1913 | (tsk->group_leader != tsk || !thread_group_empty(tsk))); |
| 1827 | 1914 | ||
| 1915 | /* Wake up all pidfd waiters */ | ||
| 1916 | do_notify_pidfd(tsk); | ||
| 1917 | |||
| 1828 | if (sig != SIGCHLD) { | 1918 | if (sig != SIGCHLD) { |
| 1829 | /* | 1919 | /* |
| 1830 | * This is only possible if parent == real_parent. | 1920 | * This is only possible if parent == real_parent. |
| @@ -2112,6 +2202,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t | |||
| 2112 | preempt_enable_no_resched(); | 2202 | preempt_enable_no_resched(); |
| 2113 | cgroup_enter_frozen(); | 2203 | cgroup_enter_frozen(); |
| 2114 | freezable_schedule(); | 2204 | freezable_schedule(); |
| 2205 | cgroup_leave_frozen(true); | ||
| 2115 | } else { | 2206 | } else { |
| 2116 | /* | 2207 | /* |
| 2117 | * By the time we got the lock, our tracer went away. | 2208 | * By the time we got the lock, our tracer went away. |
| @@ -2482,6 +2573,8 @@ relock: | |||
| 2482 | if (signal_group_exit(signal)) { | 2573 | if (signal_group_exit(signal)) { |
| 2483 | ksig->info.si_signo = signr = SIGKILL; | 2574 | ksig->info.si_signo = signr = SIGKILL; |
| 2484 | sigdelset(¤t->pending.signal, SIGKILL); | 2575 | sigdelset(¤t->pending.signal, SIGKILL); |
| 2576 | trace_signal_deliver(SIGKILL, SEND_SIG_NOINFO, | ||
| 2577 | &sighand->action[SIGKILL - 1]); | ||
| 2485 | recalc_sigpending(); | 2578 | recalc_sigpending(); |
| 2486 | goto fatal; | 2579 | goto fatal; |
| 2487 | } | 2580 | } |
| @@ -2671,7 +2764,7 @@ static void signal_delivered(struct ksignal *ksig, int stepping) | |||
| 2671 | void signal_setup_done(int failed, struct ksignal *ksig, int stepping) | 2764 | void signal_setup_done(int failed, struct ksignal *ksig, int stepping) |
| 2672 | { | 2765 | { |
| 2673 | if (failed) | 2766 | if (failed) |
| 2674 | force_sigsegv(ksig->sig, current); | 2767 | force_sigsegv(ksig->sig); |
| 2675 | else | 2768 | else |
| 2676 | signal_delivered(ksig, stepping); | 2769 | signal_delivered(ksig, stepping); |
| 2677 | } | 2770 | } |
| @@ -2907,7 +3000,8 @@ EXPORT_SYMBOL(set_compat_user_sigmask); | |||
| 2907 | * This is useful for syscalls such as ppoll, pselect, io_pgetevents and | 3000 | * This is useful for syscalls such as ppoll, pselect, io_pgetevents and |
| 2908 | * epoll_pwait where a new sigmask is passed in from userland for the syscalls. | 3001 | * epoll_pwait where a new sigmask is passed in from userland for the syscalls. |
| 2909 | */ | 3002 | */ |
| 2910 | void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) | 3003 | void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved, |
| 3004 | bool interrupted) | ||
| 2911 | { | 3005 | { |
| 2912 | 3006 | ||
| 2913 | if (!usigmask) | 3007 | if (!usigmask) |
| @@ -2917,7 +3011,7 @@ void restore_user_sigmask(const void __user *usigmask, sigset_t *sigsaved) | |||
| 2917 | * Restoring sigmask here can lead to delivering signals that the above | 3011 | * Restoring sigmask here can lead to delivering signals that the above |
| 2918 | * syscalls are intended to block because of the sigmask passed in. | 3012 | * syscalls are intended to block because of the sigmask passed in. |
| 2919 | */ | 3013 | */ |
| 2920 | if (signal_pending(current)) { | 3014 | if (interrupted) { |
| 2921 | current->saved_sigmask = *sigsaved; | 3015 | current->saved_sigmask = *sigsaved; |
| 2922 | set_restore_sigmask(); | 3016 | set_restore_sigmask(); |
| 2923 | return; | 3017 | return; |
| @@ -3616,12 +3710,11 @@ static struct pid *pidfd_to_pid(const struct file *file) | |||
| 3616 | } | 3710 | } |
| 3617 | 3711 | ||
| 3618 | /** | 3712 | /** |
| 3619 | * sys_pidfd_send_signal - send a signal to a process through a task file | 3713 | * sys_pidfd_send_signal - Signal a process through a pidfd |
| 3620 | * descriptor | 3714 | * @pidfd: file descriptor of the process |
| 3621 | * @pidfd: the file descriptor of the process | 3715 | * @sig: signal to send |
| 3622 | * @sig: signal to be sent | 3716 | * @info: signal info |
| 3623 | * @info: the signal info | 3717 | * @flags: future flags |
| 3624 | * @flags: future flags to be passed | ||
| 3625 | * | 3718 | * |
| 3626 | * The syscall currently only signals via PIDTYPE_PID which covers | 3719 | * The syscall currently only signals via PIDTYPE_PID which covers |
| 3627 | * kill(<positive-pid>, <signal>. It does not signal threads or process | 3720 | * kill(<positive-pid>, <signal>. It does not signal threads or process |
| @@ -4472,6 +4565,28 @@ static inline void siginfo_buildtime_checks(void) | |||
| 4472 | CHECK_OFFSET(si_syscall); | 4565 | CHECK_OFFSET(si_syscall); |
| 4473 | CHECK_OFFSET(si_arch); | 4566 | CHECK_OFFSET(si_arch); |
| 4474 | #undef CHECK_OFFSET | 4567 | #undef CHECK_OFFSET |
| 4568 | |||
| 4569 | /* usb asyncio */ | ||
| 4570 | BUILD_BUG_ON(offsetof(struct siginfo, si_pid) != | ||
| 4571 | offsetof(struct siginfo, si_addr)); | ||
| 4572 | if (sizeof(int) == sizeof(void __user *)) { | ||
| 4573 | BUILD_BUG_ON(sizeof_field(struct siginfo, si_pid) != | ||
| 4574 | sizeof(void __user *)); | ||
| 4575 | } else { | ||
| 4576 | BUILD_BUG_ON((sizeof_field(struct siginfo, si_pid) + | ||
| 4577 | sizeof_field(struct siginfo, si_uid)) != | ||
| 4578 | sizeof(void __user *)); | ||
| 4579 | BUILD_BUG_ON(offsetofend(struct siginfo, si_pid) != | ||
| 4580 | offsetof(struct siginfo, si_uid)); | ||
| 4581 | } | ||
| 4582 | #ifdef CONFIG_COMPAT | ||
| 4583 | BUILD_BUG_ON(offsetof(struct compat_siginfo, si_pid) != | ||
| 4584 | offsetof(struct compat_siginfo, si_addr)); | ||
| 4585 | BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != | ||
| 4586 | sizeof(compat_uptr_t)); | ||
| 4587 | BUILD_BUG_ON(sizeof_field(struct compat_siginfo, si_pid) != | ||
| 4588 | sizeof_field(struct siginfo, si_pid)); | ||
| 4589 | #endif | ||
| 4475 | } | 4590 | } |
| 4476 | 4591 | ||
| 4477 | void __init signals_init(void) | 4592 | void __init signals_init(void) |
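Aside: the new build-time checks above pin the siginfo layout that the USB asyncio path relies on (the pid/uid pair overlaying a user pointer). As a hedged, userspace-flavoured sketch of the same technique, _Static_assert can encode an equivalent contiguity assumption; struct demo_piduid is hypothetical and only illustrates the pattern, not the real siginfo union.

#include <stddef.h>

/* Hypothetical stand-in for the pid/uid pair checked above. */
struct demo_piduid {
	int pid;
	int uid;
};

/* Compilation fails if padding ever sneaks in between the two fields,
 * mirroring the offsetofend()/offsetof() BUILD_BUG_ON pairs in the hunk. */
_Static_assert(offsetof(struct demo_piduid, uid) ==
	       offsetof(struct demo_piduid, pid) + sizeof(int),
	       "uid must immediately follow pid");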
diff --git a/kernel/smp.c b/kernel/smp.c index f4cf1b0bb3b8..616d4d114847 100644 --- a/kernel/smp.c +++ b/kernel/smp.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Generic helpers for smp ipi calls | 3 | * Generic helpers for smp ipi calls |
| 3 | * | 4 | * |
| @@ -33,7 +34,7 @@ struct call_function_data { | |||
| 33 | cpumask_var_t cpumask_ipi; | 34 | cpumask_var_t cpumask_ipi; |
| 34 | }; | 35 | }; |
| 35 | 36 | ||
| 36 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); | 37 | static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); |
| 37 | 38 | ||
| 38 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); | 39 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); |
| 39 | 40 | ||
| @@ -486,13 +487,11 @@ EXPORT_SYMBOL(smp_call_function_many); | |||
| 486 | * You must not call this function with disabled interrupts or from a | 487 | * You must not call this function with disabled interrupts or from a |
| 487 | * hardware interrupt handler or from a bottom half handler. | 488 | * hardware interrupt handler or from a bottom half handler. |
| 488 | */ | 489 | */ |
| 489 | int smp_call_function(smp_call_func_t func, void *info, int wait) | 490 | void smp_call_function(smp_call_func_t func, void *info, int wait) |
| 490 | { | 491 | { |
| 491 | preempt_disable(); | 492 | preempt_disable(); |
| 492 | smp_call_function_many(cpu_online_mask, func, info, wait); | 493 | smp_call_function_many(cpu_online_mask, func, info, wait); |
| 493 | preempt_enable(); | 494 | preempt_enable(); |
| 494 | |||
| 495 | return 0; | ||
| 496 | } | 495 | } |
| 497 | EXPORT_SYMBOL(smp_call_function); | 496 | EXPORT_SYMBOL(smp_call_function); |
| 498 | 497 | ||
| @@ -593,18 +592,16 @@ void __init smp_init(void) | |||
| 593 | * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead | 592 | * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead |
| 594 | * of local_irq_disable/enable(). | 593 | * of local_irq_disable/enable(). |
| 595 | */ | 594 | */ |
| 596 | int on_each_cpu(void (*func) (void *info), void *info, int wait) | 595 | void on_each_cpu(void (*func) (void *info), void *info, int wait) |
| 597 | { | 596 | { |
| 598 | unsigned long flags; | 597 | unsigned long flags; |
| 599 | int ret = 0; | ||
| 600 | 598 | ||
| 601 | preempt_disable(); | 599 | preempt_disable(); |
| 602 | ret = smp_call_function(func, info, wait); | 600 | smp_call_function(func, info, wait); |
| 603 | local_irq_save(flags); | 601 | local_irq_save(flags); |
| 604 | func(info); | 602 | func(info); |
| 605 | local_irq_restore(flags); | 603 | local_irq_restore(flags); |
| 606 | preempt_enable(); | 604 | preempt_enable(); |
| 607 | return ret; | ||
| 608 | } | 605 | } |
| 609 | EXPORT_SYMBOL(on_each_cpu); | 606 | EXPORT_SYMBOL(on_each_cpu); |
| 610 | 607 | ||
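Since smp_call_function() and on_each_cpu() can no longer fail, the change above drops their return values. A minimal sketch of a caller adapting to the void-returning form, with made-up demo_* names:

#include <linux/percpu.h>
#include <linux/smp.h>

static DEFINE_PER_CPU(unsigned long, demo_hits);

static void demo_reset_one(void *info)
{
	/* Runs on each online CPU, locally with interrupts disabled. */
	this_cpu_write(demo_hits, 0);
}

static void demo_reset_all(void)
{
	/* Formerly "ret = on_each_cpu(...)"; now there is nothing to check. */
	on_each_cpu(demo_reset_one, NULL, 1);
}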
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index c230c2dd48e1..2efe1e206167 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Common SMP CPU bringup/teardown functions | 3 | * Common SMP CPU bringup/teardown functions |
| 3 | */ | 4 | */ |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 2c3382378d94..0427a86743a4 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -1,10 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * linux/kernel/softirq.c | 3 | * linux/kernel/softirq.c |
| 3 | * | 4 | * |
| 4 | * Copyright (C) 1992 Linus Torvalds | 5 | * Copyright (C) 1992 Linus Torvalds |
| 5 | * | 6 | * |
| 6 | * Distribute under GPLv2. | ||
| 7 | * | ||
| 8 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) | 7 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) |
| 9 | */ | 8 | */ |
| 10 | 9 | ||
| @@ -650,7 +649,7 @@ static int takeover_tasklets(unsigned int cpu) | |||
| 650 | /* Find end, append list for that CPU. */ | 649 | /* Find end, append list for that CPU. */ |
| 651 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { | 650 | if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { |
| 652 | *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; | 651 | *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; |
| 653 | this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); | 652 | __this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); |
| 654 | per_cpu(tasklet_vec, cpu).head = NULL; | 653 | per_cpu(tasklet_vec, cpu).head = NULL; |
| 655 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; | 654 | per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; |
| 656 | } | 655 | } |
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 27bafc1e271e..e6a02b274b73 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/stacktrace.c | 3 | * kernel/stacktrace.c |
| 3 | * | 4 | * |
| @@ -206,7 +207,7 @@ int stack_trace_save_tsk_reliable(struct task_struct *tsk, unsigned long *store, | |||
| 206 | 207 | ||
| 207 | ret = arch_stack_walk_reliable(consume_entry, &c, tsk); | 208 | ret = arch_stack_walk_reliable(consume_entry, &c, tsk); |
| 208 | put_task_stack(tsk); | 209 | put_task_stack(tsk); |
| 209 | return ret; | 210 | return ret ? ret : c.len; |
| 210 | } | 211 | } |
| 211 | #endif | 212 | #endif |
| 212 | 213 | ||
| @@ -227,7 +228,7 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) | |||
| 227 | }; | 228 | }; |
| 228 | 229 | ||
| 229 | /* Trace user stack if not a kernel thread */ | 230 | /* Trace user stack if not a kernel thread */ |
| 230 | if (!current->mm) | 231 | if (current->flags & PF_KTHREAD) |
| 231 | return 0; | 232 | return 0; |
| 232 | 233 | ||
| 233 | arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); | 234 | arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); |
| @@ -254,14 +255,6 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) | |||
| 254 | WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); | 255 | WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n"); |
| 255 | } | 256 | } |
| 256 | 257 | ||
| 257 | __weak int | ||
| 258 | save_stack_trace_tsk_reliable(struct task_struct *tsk, | ||
| 259 | struct stack_trace *trace) | ||
| 260 | { | ||
| 261 | WARN_ONCE(1, KERN_INFO "save_stack_tsk_reliable() not implemented yet.\n"); | ||
| 262 | return -ENOSYS; | ||
| 263 | } | ||
| 264 | |||
| 265 | /** | 258 | /** |
| 266 | * stack_trace_save - Save a stack trace into a storage array | 259 | * stack_trace_save - Save a stack trace into a storage array |
| 267 | * @store: Pointer to storage array | 260 | * @store: Pointer to storage array |
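With the fix above, stack_trace_save_tsk_reliable() reports the number of saved entries on success instead of discarding it. A short sketch of the consumer-side convention shared by the stack_trace_save() family (the demo function name is invented):

#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/stacktrace.h>

static void demo_dump_current_stack(void)
{
	unsigned long entries[16];
	unsigned int nr;

	/* The helpers return how many entries were actually stored. */
	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
	stack_trace_print(entries, nr, 0);
}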
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 7231fb5953fc..b4f83f7bdf86 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/stop_machine.c | 3 | * kernel/stop_machine.c |
| 3 | * | 4 | * |
| @@ -5,8 +6,6 @@ | |||
| 5 | * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au | 6 | * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au |
| 6 | * Copyright (C) 2010 SUSE Linux Products GmbH | 7 | * Copyright (C) 2010 SUSE Linux Products GmbH |
| 7 | * Copyright (C) 2010 Tejun Heo <tj@kernel.org> | 8 | * Copyright (C) 2010 Tejun Heo <tj@kernel.org> |
| 8 | * | ||
| 9 | * This file is released under the GPLv2 and any later version. | ||
| 10 | */ | 9 | */ |
| 11 | #include <linux/completion.h> | 10 | #include <linux/completion.h> |
| 12 | #include <linux/cpu.h> | 11 | #include <linux/cpu.h> |
| @@ -178,12 +177,18 @@ static void ack_state(struct multi_stop_data *msdata) | |||
| 178 | set_state(msdata, msdata->state + 1); | 177 | set_state(msdata, msdata->state + 1); |
| 179 | } | 178 | } |
| 180 | 179 | ||
| 180 | void __weak stop_machine_yield(const struct cpumask *cpumask) | ||
| 181 | { | ||
| 182 | cpu_relax(); | ||
| 183 | } | ||
| 184 | |||
| 181 | /* This is the cpu_stop function which stops the CPU. */ | 185 | /* This is the cpu_stop function which stops the CPU. */ |
| 182 | static int multi_cpu_stop(void *data) | 186 | static int multi_cpu_stop(void *data) |
| 183 | { | 187 | { |
| 184 | struct multi_stop_data *msdata = data; | 188 | struct multi_stop_data *msdata = data; |
| 185 | enum multi_stop_state curstate = MULTI_STOP_NONE; | 189 | enum multi_stop_state curstate = MULTI_STOP_NONE; |
| 186 | int cpu = smp_processor_id(), err = 0; | 190 | int cpu = smp_processor_id(), err = 0; |
| 191 | const struct cpumask *cpumask; | ||
| 187 | unsigned long flags; | 192 | unsigned long flags; |
| 188 | bool is_active; | 193 | bool is_active; |
| 189 | 194 | ||
| @@ -193,15 +198,18 @@ static int multi_cpu_stop(void *data) | |||
| 193 | */ | 198 | */ |
| 194 | local_save_flags(flags); | 199 | local_save_flags(flags); |
| 195 | 200 | ||
| 196 | if (!msdata->active_cpus) | 201 | if (!msdata->active_cpus) { |
| 197 | is_active = cpu == cpumask_first(cpu_online_mask); | 202 | cpumask = cpu_online_mask; |
| 198 | else | 203 | is_active = cpu == cpumask_first(cpumask); |
| 199 | is_active = cpumask_test_cpu(cpu, msdata->active_cpus); | 204 | } else { |
| 205 | cpumask = msdata->active_cpus; | ||
| 206 | is_active = cpumask_test_cpu(cpu, cpumask); | ||
| 207 | } | ||
| 200 | 208 | ||
| 201 | /* Simple state machine */ | 209 | /* Simple state machine */ |
| 202 | do { | 210 | do { |
| 203 | /* Chill out and ensure we re-read multi_stop_state. */ | 211 | /* Chill out and ensure we re-read multi_stop_state. */ |
| 204 | cpu_relax_yield(); | 212 | stop_machine_yield(cpumask); |
| 205 | if (msdata->state != curstate) { | 213 | if (msdata->state != curstate) { |
| 206 | curstate = msdata->state; | 214 | curstate = msdata->state; |
| 207 | switch (curstate) { | 215 | switch (curstate) { |
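stop_machine_yield() is introduced as a weak hook so an architecture can spin more politely than cpu_relax(), for instance by hinting a hypervisor which CPUs it is waiting on. A hedged sketch of such an override follows; demo_hypervisor_yield_to() is purely illustrative, and the hook's declaration is assumed to live in <linux/stop_machine.h> as in this series.

#include <linux/cpumask.h>
#include <linux/processor.h>
#include <linux/stop_machine.h>

/* Placeholder for a real directed-yield hypercall. */
static void demo_hypervisor_yield_to(const struct cpumask *cpumask)
{
	cpu_relax();
}

/* Overrides the __weak default defined in kernel/stop_machine.c. */
void stop_machine_yield(const struct cpumask *cpumask)
{
	demo_hypervisor_yield_to(cpumask);
}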
diff --git a/kernel/sys.c b/kernel/sys.c index bdbfe8d37418..2969304c29fe 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -1882,13 +1882,14 @@ exit_err: | |||
| 1882 | } | 1882 | } |
| 1883 | 1883 | ||
| 1884 | /* | 1884 | /* |
| 1885 | * Check arithmetic relations of passed addresses. | ||
| 1886 | * | ||
| 1885 | * WARNING: we don't require any capability here so be very careful | 1887 | * WARNING: we don't require any capability here so be very careful |
| 1886 | * in what is allowed for modification from userspace. | 1888 | * in what is allowed for modification from userspace. |
| 1887 | */ | 1889 | */ |
| 1888 | static int validate_prctl_map(struct prctl_mm_map *prctl_map) | 1890 | static int validate_prctl_map_addr(struct prctl_mm_map *prctl_map) |
| 1889 | { | 1891 | { |
| 1890 | unsigned long mmap_max_addr = TASK_SIZE; | 1892 | unsigned long mmap_max_addr = TASK_SIZE; |
| 1891 | struct mm_struct *mm = current->mm; | ||
| 1892 | int error = -EINVAL, i; | 1893 | int error = -EINVAL, i; |
| 1893 | 1894 | ||
| 1894 | static const unsigned char offsets[] = { | 1895 | static const unsigned char offsets[] = { |
| @@ -1949,24 +1950,6 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map) | |||
| 1949 | prctl_map->start_data)) | 1950 | prctl_map->start_data)) |
| 1950 | goto out; | 1951 | goto out; |
| 1951 | 1952 | ||
| 1952 | /* | ||
| 1953 | * Someone is trying to cheat the auxv vector. | ||
| 1954 | */ | ||
| 1955 | if (prctl_map->auxv_size) { | ||
| 1956 | if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv)) | ||
| 1957 | goto out; | ||
| 1958 | } | ||
| 1959 | |||
| 1960 | /* | ||
| 1961 | * Finally, make sure the caller has the rights to | ||
| 1962 | * change /proc/pid/exe link: only local sys admin should | ||
| 1963 | * be allowed to. | ||
| 1964 | */ | ||
| 1965 | if (prctl_map->exe_fd != (u32)-1) { | ||
| 1966 | if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) | ||
| 1967 | goto out; | ||
| 1968 | } | ||
| 1969 | |||
| 1970 | error = 0; | 1953 | error = 0; |
| 1971 | out: | 1954 | out: |
| 1972 | return error; | 1955 | return error; |
| @@ -1993,11 +1976,18 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data | |||
| 1993 | if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) | 1976 | if (copy_from_user(&prctl_map, addr, sizeof(prctl_map))) |
| 1994 | return -EFAULT; | 1977 | return -EFAULT; |
| 1995 | 1978 | ||
| 1996 | error = validate_prctl_map(&prctl_map); | 1979 | error = validate_prctl_map_addr(&prctl_map); |
| 1997 | if (error) | 1980 | if (error) |
| 1998 | return error; | 1981 | return error; |
| 1999 | 1982 | ||
| 2000 | if (prctl_map.auxv_size) { | 1983 | if (prctl_map.auxv_size) { |
| 1984 | /* | ||
| 1985 | * Someone is trying to cheat the auxv vector. | ||
| 1986 | */ | ||
| 1987 | if (!prctl_map.auxv || | ||
| 1988 | prctl_map.auxv_size > sizeof(mm->saved_auxv)) | ||
| 1989 | return -EINVAL; | ||
| 1990 | |||
| 2001 | memset(user_auxv, 0, sizeof(user_auxv)); | 1991 | memset(user_auxv, 0, sizeof(user_auxv)); |
| 2002 | if (copy_from_user(user_auxv, | 1992 | if (copy_from_user(user_auxv, |
| 2003 | (const void __user *)prctl_map.auxv, | 1993 | (const void __user *)prctl_map.auxv, |
| @@ -2010,6 +2000,14 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data | |||
| 2010 | } | 2000 | } |
| 2011 | 2001 | ||
| 2012 | if (prctl_map.exe_fd != (u32)-1) { | 2002 | if (prctl_map.exe_fd != (u32)-1) { |
| 2003 | /* | ||
| 2004 | * Make sure the caller has the rights to | ||
| 2005 | * change /proc/pid/exe link: only local sys admin should | ||
| 2006 | * be allowed to. | ||
| 2007 | */ | ||
| 2008 | if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN)) | ||
| 2009 | return -EINVAL; | ||
| 2010 | |||
| 2013 | error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); | 2011 | error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); |
| 2014 | if (error) | 2012 | if (error) |
| 2015 | return error; | 2013 | return error; |
| @@ -2097,7 +2095,11 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
| 2097 | unsigned long arg4, unsigned long arg5) | 2095 | unsigned long arg4, unsigned long arg5) |
| 2098 | { | 2096 | { |
| 2099 | struct mm_struct *mm = current->mm; | 2097 | struct mm_struct *mm = current->mm; |
| 2100 | struct prctl_mm_map prctl_map; | 2098 | struct prctl_mm_map prctl_map = { |
| 2099 | .auxv = NULL, | ||
| 2100 | .auxv_size = 0, | ||
| 2101 | .exe_fd = -1, | ||
| 2102 | }; | ||
| 2101 | struct vm_area_struct *vma; | 2103 | struct vm_area_struct *vma; |
| 2102 | int error; | 2104 | int error; |
| 2103 | 2105 | ||
| @@ -2125,9 +2127,15 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
| 2125 | 2127 | ||
| 2126 | error = -EINVAL; | 2128 | error = -EINVAL; |
| 2127 | 2129 | ||
| 2128 | down_write(&mm->mmap_sem); | 2130 | /* |
| 2131 | * arg_lock protects concurent updates of arg boundaries, we need | ||
| 2132 | * mmap_sem for a) concurrent sys_brk, b) finding VMA for addr | ||
| 2133 | * validation. | ||
| 2134 | */ | ||
| 2135 | down_read(&mm->mmap_sem); | ||
| 2129 | vma = find_vma(mm, addr); | 2136 | vma = find_vma(mm, addr); |
| 2130 | 2137 | ||
| 2138 | spin_lock(&mm->arg_lock); | ||
| 2131 | prctl_map.start_code = mm->start_code; | 2139 | prctl_map.start_code = mm->start_code; |
| 2132 | prctl_map.end_code = mm->end_code; | 2140 | prctl_map.end_code = mm->end_code; |
| 2133 | prctl_map.start_data = mm->start_data; | 2141 | prctl_map.start_data = mm->start_data; |
| @@ -2139,9 +2147,6 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
| 2139 | prctl_map.arg_end = mm->arg_end; | 2147 | prctl_map.arg_end = mm->arg_end; |
| 2140 | prctl_map.env_start = mm->env_start; | 2148 | prctl_map.env_start = mm->env_start; |
| 2141 | prctl_map.env_end = mm->env_end; | 2149 | prctl_map.env_end = mm->env_end; |
| 2142 | prctl_map.auxv = NULL; | ||
| 2143 | prctl_map.auxv_size = 0; | ||
| 2144 | prctl_map.exe_fd = -1; | ||
| 2145 | 2150 | ||
| 2146 | switch (opt) { | 2151 | switch (opt) { |
| 2147 | case PR_SET_MM_START_CODE: | 2152 | case PR_SET_MM_START_CODE: |
| @@ -2181,7 +2186,7 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
| 2181 | goto out; | 2186 | goto out; |
| 2182 | } | 2187 | } |
| 2183 | 2188 | ||
| 2184 | error = validate_prctl_map(&prctl_map); | 2189 | error = validate_prctl_map_addr(&prctl_map); |
| 2185 | if (error) | 2190 | if (error) |
| 2186 | goto out; | 2191 | goto out; |
| 2187 | 2192 | ||
| @@ -2218,7 +2223,8 @@ static int prctl_set_mm(int opt, unsigned long addr, | |||
| 2218 | 2223 | ||
| 2219 | error = 0; | 2224 | error = 0; |
| 2220 | out: | 2225 | out: |
| 2221 | up_write(&mm->mmap_sem); | 2226 | spin_unlock(&mm->arg_lock); |
| 2227 | up_read(&mm->mmap_sem); | ||
| 2222 | return error; | 2228 | return error; |
| 2223 | } | 2229 | } |
| 2224 | 2230 | ||
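The rework above downgrades mmap_sem to a read lock and takes mm->arg_lock around the boundary fields, so readers of those fields only need the spinlock. A minimal sketch of such a reader (demo_snapshot_args() is invented):

#include <linux/mm_types.h>
#include <linux/spinlock.h>

static void demo_snapshot_args(struct mm_struct *mm,
			       unsigned long *start, unsigned long *end)
{
	/* arg_lock alone serializes the arg_start/arg_end pair. */
	spin_lock(&mm->arg_lock);
	*start = mm->arg_start;
	*end = mm->arg_end;
	spin_unlock(&mm->arg_lock);
}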
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 4d9ae5ea6caf..34b76895b81e 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
| @@ -137,6 +137,8 @@ COND_SYSCALL(capset); | |||
| 137 | /* kernel/exit.c */ | 137 | /* kernel/exit.c */ |
| 138 | 138 | ||
| 139 | /* kernel/fork.c */ | 139 | /* kernel/fork.c */ |
| 140 | /* __ARCH_WANT_SYS_CLONE3 */ | ||
| 141 | COND_SYSCALL(clone3); | ||
| 140 | 142 | ||
| 141 | /* kernel/futex.c */ | 143 | /* kernel/futex.c */ |
| 142 | COND_SYSCALL(futex); | 144 | COND_SYSCALL(futex); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ba158f61aab4..1c1ad1e14f21 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * sysctl.c: General linux system control interface | 3 | * sysctl.c: General linux system control interface |
| 3 | * | 4 | * |
| @@ -229,11 +230,6 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, | |||
| 229 | #endif | 230 | #endif |
| 230 | static int proc_dopipe_max_size(struct ctl_table *table, int write, | 231 | static int proc_dopipe_max_size(struct ctl_table *table, int write, |
| 231 | void __user *buffer, size_t *lenp, loff_t *ppos); | 232 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| 232 | #ifdef CONFIG_BPF_SYSCALL | ||
| 233 | static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, | ||
| 234 | void __user *buffer, size_t *lenp, | ||
| 235 | loff_t *ppos); | ||
| 236 | #endif | ||
| 237 | 233 | ||
| 238 | #ifdef CONFIG_MAGIC_SYSRQ | 234 | #ifdef CONFIG_MAGIC_SYSRQ |
| 239 | /* Note: sysrq code uses its own private copy */ | 235 | /* Note: sysrq code uses its own private copy */ |
| @@ -456,6 +452,22 @@ static struct ctl_table kern_table[] = { | |||
| 456 | .mode = 0644, | 452 | .mode = 0644, |
| 457 | .proc_handler = sched_rr_handler, | 453 | .proc_handler = sched_rr_handler, |
| 458 | }, | 454 | }, |
| 455 | #ifdef CONFIG_UCLAMP_TASK | ||
| 456 | { | ||
| 457 | .procname = "sched_util_clamp_min", | ||
| 458 | .data = &sysctl_sched_uclamp_util_min, | ||
| 459 | .maxlen = sizeof(unsigned int), | ||
| 460 | .mode = 0644, | ||
| 461 | .proc_handler = sysctl_sched_uclamp_handler, | ||
| 462 | }, | ||
| 463 | { | ||
| 464 | .procname = "sched_util_clamp_max", | ||
| 465 | .data = &sysctl_sched_uclamp_util_max, | ||
| 466 | .maxlen = sizeof(unsigned int), | ||
| 467 | .mode = 0644, | ||
| 468 | .proc_handler = sysctl_sched_uclamp_handler, | ||
| 469 | }, | ||
| 470 | #endif | ||
| 459 | #ifdef CONFIG_SCHED_AUTOGROUP | 471 | #ifdef CONFIG_SCHED_AUTOGROUP |
| 460 | { | 472 | { |
| 461 | .procname = "sched_autogroup_enabled", | 473 | .procname = "sched_autogroup_enabled", |
| @@ -1252,12 +1264,10 @@ static struct ctl_table kern_table[] = { | |||
| 1252 | }, | 1264 | }, |
| 1253 | { | 1265 | { |
| 1254 | .procname = "bpf_stats_enabled", | 1266 | .procname = "bpf_stats_enabled", |
| 1255 | .data = &sysctl_bpf_stats_enabled, | 1267 | .data = &bpf_stats_enabled_key.key, |
| 1256 | .maxlen = sizeof(sysctl_bpf_stats_enabled), | 1268 | .maxlen = sizeof(bpf_stats_enabled_key), |
| 1257 | .mode = 0644, | 1269 | .mode = 0644, |
| 1258 | .proc_handler = proc_dointvec_minmax_bpf_stats, | 1270 | .proc_handler = proc_do_static_key, |
| 1259 | .extra1 = &zero, | ||
| 1260 | .extra2 = &one, | ||
| 1261 | }, | 1271 | }, |
| 1262 | #endif | 1272 | #endif |
| 1263 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) | 1273 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) |
| @@ -2886,8 +2896,10 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int | |||
| 2886 | if (neg) | 2896 | if (neg) |
| 2887 | continue; | 2897 | continue; |
| 2888 | val = convmul * val / convdiv; | 2898 | val = convmul * val / convdiv; |
| 2889 | if ((min && val < *min) || (max && val > *max)) | 2899 | if ((min && val < *min) || (max && val > *max)) { |
| 2890 | continue; | 2900 | err = -EINVAL; |
| 2901 | break; | ||
| 2902 | } | ||
| 2891 | *i = val; | 2903 | *i = val; |
| 2892 | } else { | 2904 | } else { |
| 2893 | val = convdiv * (*i) / convmul; | 2905 | val = convdiv * (*i) / convmul; |
| @@ -3170,17 +3182,19 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
| 3170 | 3182 | ||
| 3171 | if (write) { | 3183 | if (write) { |
| 3172 | char *kbuf, *p; | 3184 | char *kbuf, *p; |
| 3185 | size_t skipped = 0; | ||
| 3173 | 3186 | ||
| 3174 | if (left > PAGE_SIZE - 1) | 3187 | if (left > PAGE_SIZE - 1) { |
| 3175 | left = PAGE_SIZE - 1; | 3188 | left = PAGE_SIZE - 1; |
| 3189 | /* How much of the buffer we'll skip this pass */ | ||
| 3190 | skipped = *lenp - left; | ||
| 3191 | } | ||
| 3176 | 3192 | ||
| 3177 | p = kbuf = memdup_user_nul(buffer, left); | 3193 | p = kbuf = memdup_user_nul(buffer, left); |
| 3178 | if (IS_ERR(kbuf)) | 3194 | if (IS_ERR(kbuf)) |
| 3179 | return PTR_ERR(kbuf); | 3195 | return PTR_ERR(kbuf); |
| 3180 | 3196 | ||
| 3181 | tmp_bitmap = kcalloc(BITS_TO_LONGS(bitmap_len), | 3197 | tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); |
| 3182 | sizeof(unsigned long), | ||
| 3183 | GFP_KERNEL); | ||
| 3184 | if (!tmp_bitmap) { | 3198 | if (!tmp_bitmap) { |
| 3185 | kfree(kbuf); | 3199 | kfree(kbuf); |
| 3186 | return -ENOMEM; | 3200 | return -ENOMEM; |
| @@ -3189,9 +3203,22 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
| 3189 | while (!err && left) { | 3203 | while (!err && left) { |
| 3190 | unsigned long val_a, val_b; | 3204 | unsigned long val_a, val_b; |
| 3191 | bool neg; | 3205 | bool neg; |
| 3206 | size_t saved_left; | ||
| 3192 | 3207 | ||
| 3208 | /* In case we stop parsing mid-number, we can reset */ | ||
| 3209 | saved_left = left; | ||
| 3193 | err = proc_get_long(&p, &left, &val_a, &neg, tr_a, | 3210 | err = proc_get_long(&p, &left, &val_a, &neg, tr_a, |
| 3194 | sizeof(tr_a), &c); | 3211 | sizeof(tr_a), &c); |
| 3212 | /* | ||
| 3213 | * If we consumed the entirety of a truncated buffer or | ||
| 3214 | * only one char is left (may be a "-"), then stop here, | ||
| 3215 | * reset, & come back for more. | ||
| 3216 | */ | ||
| 3217 | if ((left <= 1) && skipped) { | ||
| 3218 | left = saved_left; | ||
| 3219 | break; | ||
| 3220 | } | ||
| 3221 | |||
| 3195 | if (err) | 3222 | if (err) |
| 3196 | break; | 3223 | break; |
| 3197 | if (val_a >= bitmap_len || neg) { | 3224 | if (val_a >= bitmap_len || neg) { |
| @@ -3209,6 +3236,15 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
| 3209 | err = proc_get_long(&p, &left, &val_b, | 3236 | err = proc_get_long(&p, &left, &val_b, |
| 3210 | &neg, tr_b, sizeof(tr_b), | 3237 | &neg, tr_b, sizeof(tr_b), |
| 3211 | &c); | 3238 | &c); |
| 3239 | /* | ||
| 3240 | * If we consumed all of a truncated buffer or | ||
| 3241 | * then stop here, reset, & come back for more. | ||
| 3242 | */ | ||
| 3243 | if (!left && skipped) { | ||
| 3244 | left = saved_left; | ||
| 3245 | break; | ||
| 3246 | } | ||
| 3247 | |||
| 3212 | if (err) | 3248 | if (err) |
| 3213 | break; | 3249 | break; |
| 3214 | if (val_b >= bitmap_len || neg || | 3250 | if (val_b >= bitmap_len || neg || |
| @@ -3227,6 +3263,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
| 3227 | proc_skip_char(&p, &left, '\n'); | 3263 | proc_skip_char(&p, &left, '\n'); |
| 3228 | } | 3264 | } |
| 3229 | kfree(kbuf); | 3265 | kfree(kbuf); |
| 3266 | left += skipped; | ||
| 3230 | } else { | 3267 | } else { |
| 3231 | unsigned long bit_a, bit_b = 0; | 3268 | unsigned long bit_a, bit_b = 0; |
| 3232 | 3269 | ||
| @@ -3271,7 +3308,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
| 3271 | *ppos += *lenp; | 3308 | *ppos += *lenp; |
| 3272 | } | 3309 | } |
| 3273 | 3310 | ||
| 3274 | kfree(tmp_bitmap); | 3311 | bitmap_free(tmp_bitmap); |
| 3275 | return err; | 3312 | return err; |
| 3276 | } | 3313 | } |
| 3277 | 3314 | ||
| @@ -3346,26 +3383,35 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, | |||
| 3346 | 3383 | ||
| 3347 | #endif /* CONFIG_PROC_SYSCTL */ | 3384 | #endif /* CONFIG_PROC_SYSCTL */ |
| 3348 | 3385 | ||
| 3349 | #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL) | 3386 | #if defined(CONFIG_SYSCTL) |
| 3350 | static int proc_dointvec_minmax_bpf_stats(struct ctl_table *table, int write, | 3387 | int proc_do_static_key(struct ctl_table *table, int write, |
| 3351 | void __user *buffer, size_t *lenp, | 3388 | void __user *buffer, size_t *lenp, |
| 3352 | loff_t *ppos) | 3389 | loff_t *ppos) |
| 3353 | { | 3390 | { |
| 3354 | int ret, bpf_stats = *(int *)table->data; | 3391 | struct static_key *key = (struct static_key *)table->data; |
| 3355 | struct ctl_table tmp = *table; | 3392 | static DEFINE_MUTEX(static_key_mutex); |
| 3393 | int val, ret; | ||
| 3394 | struct ctl_table tmp = { | ||
| 3395 | .data = &val, | ||
| 3396 | .maxlen = sizeof(val), | ||
| 3397 | .mode = table->mode, | ||
| 3398 | .extra1 = &zero, | ||
| 3399 | .extra2 = &one, | ||
| 3400 | }; | ||
| 3356 | 3401 | ||
| 3357 | if (write && !capable(CAP_SYS_ADMIN)) | 3402 | if (write && !capable(CAP_SYS_ADMIN)) |
| 3358 | return -EPERM; | 3403 | return -EPERM; |
| 3359 | 3404 | ||
| 3360 | tmp.data = &bpf_stats; | 3405 | mutex_lock(&static_key_mutex); |
| 3406 | val = static_key_enabled(key); | ||
| 3361 | ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); | 3407 | ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); |
| 3362 | if (write && !ret) { | 3408 | if (write && !ret) { |
| 3363 | *(int *)table->data = bpf_stats; | 3409 | if (val) |
| 3364 | if (bpf_stats) | 3410 | static_key_enable(key); |
| 3365 | static_branch_enable(&bpf_stats_enabled_key); | ||
| 3366 | else | 3411 | else |
| 3367 | static_branch_disable(&bpf_stats_enabled_key); | 3412 | static_key_disable(key); |
| 3368 | } | 3413 | } |
| 3414 | mutex_unlock(&static_key_mutex); | ||
| 3369 | return ret; | 3415 | return ret; |
| 3370 | } | 3416 | } |
| 3371 | #endif | 3417 | #endif |
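proc_do_static_key() generalizes the old bpf_stats handler: any static key can now be flipped from a sysctl by a sufficiently privileged writer. A hedged sketch of wiring one up, modelled directly on the bpf_stats_enabled entry above; the demo_* names are hypothetical and the table would still need to be registered, for example via register_sysctl().

#include <linux/jump_label.h>
#include <linux/sysctl.h>

DEFINE_STATIC_KEY_FALSE(demo_feature_key);

static struct ctl_table demo_table[] = {
	{
		.procname	= "demo_feature_enabled",
		.data		= &demo_feature_key.key,
		.maxlen		= sizeof(demo_feature_key),
		.mode		= 0644,
		.proc_handler	= proc_do_static_key,
	},
	{ }
};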
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 5f852b8f59f7..13a0f2e6ebc2 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -1,19 +1,9 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * taskstats.c - Export per-task statistics to userland | 3 | * taskstats.c - Export per-task statistics to userland |
| 3 | * | 4 | * |
| 4 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 | 5 | * Copyright (C) Shailabh Nagar, IBM Corp. 2006 |
| 5 | * (C) Balbir Singh, IBM Corp. 2006 | 6 | * (C) Balbir Singh, IBM Corp. 2006 |
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License as published by | ||
| 9 | * the Free Software Foundation; either version 2 of the License, or | ||
| 10 | * (at your option) any later version. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 15 | * GNU General Public License for more details. | ||
| 16 | * | ||
| 17 | */ | 7 | */ |
| 18 | 8 | ||
| 19 | #include <linux/kernel.h> | 9 | #include <linux/kernel.h> |
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 7bca480151b0..76c997fdbc9d 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c | |||
| @@ -1,17 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * test_kprobes.c - simple sanity test for *probes | 3 | * test_kprobes.c - simple sanity test for *probes |
| 3 | * | 4 | * |
| 4 | * Copyright IBM Corp. 2008 | 5 | * Copyright IBM Corp. 2008 |
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License as published by | ||
| 8 | * the Free Software Foundation; either version 2 of the License, or | ||
| 9 | * (at your option) any later version. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it would be useful, but | ||
| 12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
| 14 | * the GNU General Public License for more details. | ||
| 15 | */ | 6 | */ |
| 16 | 7 | ||
| 17 | #define pr_fmt(fmt) "Kprobe smoke test: " fmt | 8 | #define pr_fmt(fmt) "Kprobe smoke test: " fmt |
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e2c038d6c13c..fcc42353f125 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | # | 2 | # |
| 2 | # Timer subsystem related configuration options | 3 | # Timer subsystem related configuration options |
| 3 | # | 4 | # |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile index f1e46f338a9c..1867044800bb 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile | |||
| @@ -16,5 +16,6 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) | |||
| 16 | endif | 16 | endif |
| 17 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o | 17 | obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o |
| 18 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o | 18 | obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o |
| 19 | obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o | ||
| 19 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o | 20 | obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o |
| 20 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o | 21 | obj-$(CONFIG_TEST_UDELAY) += test_udelay.o |
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 0519a8805aab..57518efc3810 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c | |||
| @@ -233,7 +233,6 @@ EXPORT_SYMBOL_GPL(alarm_expires_remaining); | |||
| 233 | /** | 233 | /** |
| 234 | * alarmtimer_suspend - Suspend time callback | 234 | * alarmtimer_suspend - Suspend time callback |
| 235 | * @dev: unused | 235 | * @dev: unused |
| 236 | * @state: unused | ||
| 237 | * | 236 | * |
| 238 | * When we are going into suspend, we look through the bases | 237 | * When we are going into suspend, we look through the bases |
| 239 | * to see which is the soonest timer to expire. We then | 238 | * to see which is the soonest timer to expire. We then |
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 3bcc19ceb073..fff5f64981c6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
| @@ -105,12 +105,12 @@ static DEFINE_SPINLOCK(watchdog_lock); | |||
| 105 | static int watchdog_running; | 105 | static int watchdog_running; |
| 106 | static atomic_t watchdog_reset_pending; | 106 | static atomic_t watchdog_reset_pending; |
| 107 | 107 | ||
| 108 | static void inline clocksource_watchdog_lock(unsigned long *flags) | 108 | static inline void clocksource_watchdog_lock(unsigned long *flags) |
| 109 | { | 109 | { |
| 110 | spin_lock_irqsave(&watchdog_lock, *flags); | 110 | spin_lock_irqsave(&watchdog_lock, *flags); |
| 111 | } | 111 | } |
| 112 | 112 | ||
| 113 | static void inline clocksource_watchdog_unlock(unsigned long *flags) | 113 | static inline void clocksource_watchdog_unlock(unsigned long *flags) |
| 114 | { | 114 | { |
| 115 | spin_unlock_irqrestore(&watchdog_lock, *flags); | 115 | spin_unlock_irqrestore(&watchdog_lock, *flags); |
| 116 | } | 116 | } |
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 41dfff23c1f9..5ee77f1a8a92 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c | |||
| @@ -30,7 +30,6 @@ | |||
| 30 | #include <linux/syscalls.h> | 30 | #include <linux/syscalls.h> |
| 31 | #include <linux/interrupt.h> | 31 | #include <linux/interrupt.h> |
| 32 | #include <linux/tick.h> | 32 | #include <linux/tick.h> |
| 33 | #include <linux/seq_file.h> | ||
| 34 | #include <linux/err.h> | 33 | #include <linux/err.h> |
| 35 | #include <linux/debugobjects.h> | 34 | #include <linux/debugobjects.h> |
| 36 | #include <linux/sched/signal.h> | 35 | #include <linux/sched/signal.h> |
| @@ -1115,9 +1114,10 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); | |||
| 1115 | * @timer: hrtimer to stop | 1114 | * @timer: hrtimer to stop |
| 1116 | * | 1115 | * |
| 1117 | * Returns: | 1116 | * Returns: |
| 1118 | * 0 when the timer was not active | 1117 | * |
| 1119 | * 1 when the timer was active | 1118 | * * 0 when the timer was not active |
| 1120 | * -1 when the timer is currently executing the callback function and | 1119 | * * 1 when the timer was active |
| 1120 | * * -1 when the timer is currently executing the callback function and | ||
| 1121 | * cannot be stopped | 1121 | * cannot be stopped |
| 1122 | */ | 1122 | */ |
| 1123 | int hrtimer_try_to_cancel(struct hrtimer *timer) | 1123 | int hrtimer_try_to_cancel(struct hrtimer *timer) |
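The kerneldoc rewrite above only reformats the return values into a list; the calling convention itself is unchanged. A small sketch of how a caller typically distinguishes the three cases (demo_stop_timer() is invented):

#include <linux/hrtimer.h>

static bool demo_stop_timer(struct hrtimer *t)
{
	int ret = hrtimer_try_to_cancel(t);

	if (ret >= 0)
		return true;	/* 0: was idle, 1: dequeued before firing */

	/* -1: callback currently running; caller must wait or retry. */
	return false;
}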
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ac5555e25733..65eb796610dc 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
| @@ -43,6 +43,7 @@ static u64 tick_length_base; | |||
| 43 | #define MAX_TICKADJ 500LL /* usecs */ | 43 | #define MAX_TICKADJ 500LL /* usecs */ |
| 44 | #define MAX_TICKADJ_SCALED \ | 44 | #define MAX_TICKADJ_SCALED \ |
| 45 | (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) | 45 | (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) |
| 46 | #define MAX_TAI_OFFSET 100000 | ||
| 46 | 47 | ||
| 47 | /* | 48 | /* |
| 48 | * phase-lock loop variables | 49 | * phase-lock loop variables |
| @@ -691,7 +692,8 @@ static inline void process_adjtimex_modes(const struct __kernel_timex *txc, | |||
| 691 | time_constant = max(time_constant, 0l); | 692 | time_constant = max(time_constant, 0l); |
| 692 | } | 693 | } |
| 693 | 694 | ||
| 694 | if (txc->modes & ADJ_TAI && txc->constant > 0) | 695 | if (txc->modes & ADJ_TAI && |
| 696 | txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) | ||
| 695 | *time_tai = txc->constant; | 697 | *time_tai = txc->constant; |
| 696 | 698 | ||
| 697 | if (txc->modes & ADJ_OFFSET) | 699 | if (txc->modes & ADJ_OFFSET) |
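The ADJ_TAI change both accepts an offset of zero and bounds it by MAX_TAI_OFFSET. For context, a hedged userspace sketch of how such an offset is submitted; the wrapper is invented and the call needs CAP_SYS_TIME.

#include <sys/timex.h>

int demo_set_tai_offset(int tai_offset)
{
	struct timex tx = {
		.modes = ADJ_TAI,
		.constant = tai_offset,	/* e.g. 37; values above MAX_TAI_OFFSET are now ignored */
	};

	return adjtimex(&tx);
}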
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 29176635991f..d7f2d91acdac 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c | |||
| @@ -980,23 +980,16 @@ retry_delete: | |||
| 980 | */ | 980 | */ |
| 981 | static void itimer_delete(struct k_itimer *timer) | 981 | static void itimer_delete(struct k_itimer *timer) |
| 982 | { | 982 | { |
| 983 | unsigned long flags; | ||
| 984 | |||
| 985 | retry_delete: | 983 | retry_delete: |
| 986 | spin_lock_irqsave(&timer->it_lock, flags); | 984 | spin_lock_irq(&timer->it_lock); |
| 987 | 985 | ||
| 988 | if (timer_delete_hook(timer) == TIMER_RETRY) { | 986 | if (timer_delete_hook(timer) == TIMER_RETRY) { |
| 989 | unlock_timer(timer, flags); | 987 | spin_unlock_irq(&timer->it_lock); |
| 990 | goto retry_delete; | 988 | goto retry_delete; |
| 991 | } | 989 | } |
| 992 | list_del(&timer->list); | 990 | list_del(&timer->list); |
| 993 | /* | ||
| 994 | * This keeps any tasks waiting on the spin lock from thinking | ||
| 995 | * they got something (see the lock code above). | ||
| 996 | */ | ||
| 997 | timer->it_signal = NULL; | ||
| 998 | 991 | ||
| 999 | unlock_timer(timer, flags); | 992 | spin_unlock_irq(&timer->it_lock); |
| 1000 | release_posix_timer(timer, IT_ID_SET); | 993 | release_posix_timer(timer, IT_ID_SET); |
| 1001 | } | 994 | } |
| 1002 | 995 | ||
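itimer_delete() runs in plain task context during exit with interrupts enabled, so the flags dance above can be dropped in favour of spin_lock_irq(). A minimal sketch of the same simplification pattern with an invented lock and counter:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);
static int demo_count;

/* Process context only, IRQs known to be enabled: no need to save flags. */
static void demo_bump(void)
{
	spin_lock_irq(&demo_lock);
	demo_count++;
	spin_unlock_irq(&demo_lock);
}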
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4ee1a3428ae..be9707f68024 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -782,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) | |||
| 782 | */ | 782 | */ |
| 783 | if (!ts->tick_stopped) { | 783 | if (!ts->tick_stopped) { |
| 784 | calc_load_nohz_start(); | 784 | calc_load_nohz_start(); |
| 785 | cpu_load_update_nohz_start(); | ||
| 786 | quiet_vmstat(); | 785 | quiet_vmstat(); |
| 787 | 786 | ||
| 788 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); | 787 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
| @@ -829,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | |||
| 829 | { | 828 | { |
| 830 | /* Update jiffies first */ | 829 | /* Update jiffies first */ |
| 831 | tick_do_update_jiffies64(now); | 830 | tick_do_update_jiffies64(now); |
| 832 | cpu_load_update_nohz_stop(); | ||
| 833 | 831 | ||
| 834 | /* | 832 | /* |
| 835 | * Clear the timer idle flag, so we avoid IPIs on remote queueing and | 833 | * Clear the timer idle flag, so we avoid IPIs on remote queueing and |
diff --git a/kernel/time/time.c b/kernel/time/time.c index 7f7d6914ddd5..5c54ca632d08 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c | |||
| @@ -251,6 +251,10 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct old_timeval32 __user *, tv, | |||
| 251 | if (tv) { | 251 | if (tv) { |
| 252 | if (compat_get_timeval(&user_tv, tv)) | 252 | if (compat_get_timeval(&user_tv, tv)) |
| 253 | return -EFAULT; | 253 | return -EFAULT; |
| 254 | |||
| 255 | if (!timeval_valid(&user_tv)) | ||
| 256 | return -EINVAL; | ||
| 257 | |||
| 254 | new_ts.tv_sec = user_tv.tv_sec; | 258 | new_ts.tv_sec = user_tv.tv_sec; |
| 255 | new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; | 259 | new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; |
| 256 | } | 260 | } |
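The compat settimeofday() path now rejects malformed input instead of converting it. A userspace-style mirror of the timeval_valid() rule being enforced, for illustration only:

#include <stdbool.h>
#include <sys/time.h>

static bool demo_timeval_ok(const struct timeval *tv)
{
	/* No pre-1970 seconds, and microseconds must stay below one second. */
	return tv->tv_sec >= 0 &&
	       tv->tv_usec >= 0 && tv->tv_usec < 1000000;
}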
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 85f5912d8f70..d911c8470149 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -808,17 +808,18 @@ ktime_t ktime_get_coarse_with_offset(enum tk_offsets offs) | |||
| 808 | struct timekeeper *tk = &tk_core.timekeeper; | 808 | struct timekeeper *tk = &tk_core.timekeeper; |
| 809 | unsigned int seq; | 809 | unsigned int seq; |
| 810 | ktime_t base, *offset = offsets[offs]; | 810 | ktime_t base, *offset = offsets[offs]; |
| 811 | u64 nsecs; | ||
| 811 | 812 | ||
| 812 | WARN_ON(timekeeping_suspended); | 813 | WARN_ON(timekeeping_suspended); |
| 813 | 814 | ||
| 814 | do { | 815 | do { |
| 815 | seq = read_seqcount_begin(&tk_core.seq); | 816 | seq = read_seqcount_begin(&tk_core.seq); |
| 816 | base = ktime_add(tk->tkr_mono.base, *offset); | 817 | base = ktime_add(tk->tkr_mono.base, *offset); |
| 818 | nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; | ||
| 817 | 819 | ||
| 818 | } while (read_seqcount_retry(&tk_core.seq, seq)); | 820 | } while (read_seqcount_retry(&tk_core.seq, seq)); |
| 819 | 821 | ||
| 820 | return base; | 822 | return ktime_add_ns(base, nsecs); |
| 821 | |||
| 822 | } | 823 | } |
| 823 | EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); | 824 | EXPORT_SYMBOL_GPL(ktime_get_coarse_with_offset); |
| 824 | 825 | ||
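The fix above folds the sub-second remainder into the coarse base inside the usual seqcount retry loop. A generic sketch of that read pattern with invented demo_* state (the real code samples the timekeeper, not plain variables):

#include <linux/seqlock.h>
#include <linux/types.h>

static seqcount_t demo_seq = SEQCNT_ZERO(demo_seq);
static u64 demo_base, demo_nsecs;	/* updated elsewhere under write_seqcount_begin/end */

static u64 demo_read_consistent(void)
{
	unsigned int seq;
	u64 base, nsecs;

	do {
		seq = read_seqcount_begin(&demo_seq);
		base = demo_base;
		nsecs = demo_nsecs;
	} while (read_seqcount_retry(&demo_seq, seq));

	return base + nsecs;
}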
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 98ba50dcb1b2..acb326f5f50a 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
| @@ -282,23 +282,6 @@ static inline void timer_list_header(struct seq_file *m, u64 now) | |||
| 282 | SEQ_printf(m, "\n"); | 282 | SEQ_printf(m, "\n"); |
| 283 | } | 283 | } |
| 284 | 284 | ||
| 285 | static int timer_list_show(struct seq_file *m, void *v) | ||
| 286 | { | ||
| 287 | struct timer_list_iter *iter = v; | ||
| 288 | |||
| 289 | if (iter->cpu == -1 && !iter->second_pass) | ||
| 290 | timer_list_header(m, iter->now); | ||
| 291 | else if (!iter->second_pass) | ||
| 292 | print_cpu(m, iter->cpu, iter->now); | ||
| 293 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | ||
| 294 | else if (iter->cpu == -1 && iter->second_pass) | ||
| 295 | timer_list_show_tickdevices_header(m); | ||
| 296 | else | ||
| 297 | print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); | ||
| 298 | #endif | ||
| 299 | return 0; | ||
| 300 | } | ||
| 301 | |||
| 302 | void sysrq_timer_list_show(void) | 285 | void sysrq_timer_list_show(void) |
| 303 | { | 286 | { |
| 304 | u64 now = ktime_to_ns(ktime_get()); | 287 | u64 now = ktime_to_ns(ktime_get()); |
| @@ -317,6 +300,24 @@ void sysrq_timer_list_show(void) | |||
| 317 | return; | 300 | return; |
| 318 | } | 301 | } |
| 319 | 302 | ||
| 303 | #ifdef CONFIG_PROC_FS | ||
| 304 | static int timer_list_show(struct seq_file *m, void *v) | ||
| 305 | { | ||
| 306 | struct timer_list_iter *iter = v; | ||
| 307 | |||
| 308 | if (iter->cpu == -1 && !iter->second_pass) | ||
| 309 | timer_list_header(m, iter->now); | ||
| 310 | else if (!iter->second_pass) | ||
| 311 | print_cpu(m, iter->cpu, iter->now); | ||
| 312 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | ||
| 313 | else if (iter->cpu == -1 && iter->second_pass) | ||
| 314 | timer_list_show_tickdevices_header(m); | ||
| 315 | else | ||
| 316 | print_tickdevice(m, tick_get_device(iter->cpu), iter->cpu); | ||
| 317 | #endif | ||
| 318 | return 0; | ||
| 319 | } | ||
| 320 | |||
| 320 | static void *move_iter(struct timer_list_iter *iter, loff_t offset) | 321 | static void *move_iter(struct timer_list_iter *iter, loff_t offset) |
| 321 | { | 322 | { |
| 322 | for (; offset; offset--) { | 323 | for (; offset; offset--) { |
| @@ -376,3 +377,4 @@ static int __init init_timer_list_procfs(void) | |||
| 376 | return 0; | 377 | return 0; |
| 377 | } | 378 | } |
| 378 | __initcall(init_timer_list_procfs); | 379 | __initcall(init_timer_list_procfs); |
| 380 | #endif | ||
diff --git a/kernel/time/vsyscall.c b/kernel/time/vsyscall.c new file mode 100644 index 000000000000..8cf3596a4ce6 --- /dev/null +++ b/kernel/time/vsyscall.c | |||
| @@ -0,0 +1,129 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0 | ||
| 2 | /* | ||
| 3 | * Copyright 2019 ARM Ltd. | ||
| 4 | * | ||
| 5 | * Generic implementation of update_vsyscall and update_vsyscall_tz. | ||
| 6 | * | ||
| 7 | * Based on the x86 specific implementation. | ||
| 8 | */ | ||
| 9 | |||
| 10 | #include <linux/hrtimer.h> | ||
| 11 | #include <linux/timekeeper_internal.h> | ||
| 12 | #include <vdso/datapage.h> | ||
| 13 | #include <vdso/helpers.h> | ||
| 14 | #include <vdso/vsyscall.h> | ||
| 15 | |||
| 16 | static inline void update_vdso_data(struct vdso_data *vdata, | ||
| 17 | struct timekeeper *tk) | ||
| 18 | { | ||
| 19 | struct vdso_timestamp *vdso_ts; | ||
| 20 | u64 nsec; | ||
| 21 | |||
| 22 | vdata[CS_HRES_COARSE].cycle_last = tk->tkr_mono.cycle_last; | ||
| 23 | vdata[CS_HRES_COARSE].mask = tk->tkr_mono.mask; | ||
| 24 | vdata[CS_HRES_COARSE].mult = tk->tkr_mono.mult; | ||
| 25 | vdata[CS_HRES_COARSE].shift = tk->tkr_mono.shift; | ||
| 26 | vdata[CS_RAW].cycle_last = tk->tkr_raw.cycle_last; | ||
| 27 | vdata[CS_RAW].mask = tk->tkr_raw.mask; | ||
| 28 | vdata[CS_RAW].mult = tk->tkr_raw.mult; | ||
| 29 | vdata[CS_RAW].shift = tk->tkr_raw.shift; | ||
| 30 | |||
| 31 | /* CLOCK_REALTIME */ | ||
| 32 | vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME]; | ||
| 33 | vdso_ts->sec = tk->xtime_sec; | ||
| 34 | vdso_ts->nsec = tk->tkr_mono.xtime_nsec; | ||
| 35 | |||
| 36 | /* CLOCK_MONOTONIC */ | ||
| 37 | vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC]; | ||
| 38 | vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; | ||
| 39 | |||
| 40 | nsec = tk->tkr_mono.xtime_nsec; | ||
| 41 | nsec += ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); | ||
| 42 | while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { | ||
| 43 | nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); | ||
| 44 | vdso_ts->sec++; | ||
| 45 | } | ||
| 46 | vdso_ts->nsec = nsec; | ||
| 47 | |||
| 48 | /* CLOCK_MONOTONIC_RAW */ | ||
| 49 | vdso_ts = &vdata[CS_RAW].basetime[CLOCK_MONOTONIC_RAW]; | ||
| 50 | vdso_ts->sec = tk->raw_sec; | ||
| 51 | vdso_ts->nsec = tk->tkr_raw.xtime_nsec; | ||
| 52 | |||
| 53 | /* CLOCK_BOOTTIME */ | ||
| 54 | vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_BOOTTIME]; | ||
| 55 | vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; | ||
| 56 | nsec = tk->tkr_mono.xtime_nsec; | ||
| 57 | nsec += ((u64)(tk->wall_to_monotonic.tv_nsec + | ||
| 58 | ktime_to_ns(tk->offs_boot)) << tk->tkr_mono.shift); | ||
| 59 | while (nsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { | ||
| 60 | nsec -= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift); | ||
| 61 | vdso_ts->sec++; | ||
| 62 | } | ||
| 63 | vdso_ts->nsec = nsec; | ||
| 64 | |||
| 65 | /* CLOCK_TAI */ | ||
| 66 | vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_TAI]; | ||
| 67 | vdso_ts->sec = tk->xtime_sec + (s64)tk->tai_offset; | ||
| 68 | vdso_ts->nsec = tk->tkr_mono.xtime_nsec; | ||
| 69 | |||
| 70 | /* | ||
| 71 | * Read without the seqlock held by clock_getres(). | ||
| 72 | * Note: No need to have a second copy. | ||
| 73 | */ | ||
| 74 | WRITE_ONCE(vdata[CS_HRES_COARSE].hrtimer_res, hrtimer_resolution); | ||
| 75 | } | ||
| 76 | |||
| 77 | void update_vsyscall(struct timekeeper *tk) | ||
| 78 | { | ||
| 79 | struct vdso_data *vdata = __arch_get_k_vdso_data(); | ||
| 80 | struct vdso_timestamp *vdso_ts; | ||
| 81 | u64 nsec; | ||
| 82 | |||
| 83 | if (__arch_update_vdso_data()) { | ||
| 84 | /* | ||
| 85 | * Some architectures might want to skip the update of the | ||
| 86 | * data page. | ||
| 87 | */ | ||
| 88 | return; | ||
| 89 | } | ||
| 90 | |||
| 91 | /* copy vsyscall data */ | ||
| 92 | vdso_write_begin(vdata); | ||
| 93 | |||
| 94 | vdata[CS_HRES_COARSE].clock_mode = __arch_get_clock_mode(tk); | ||
| 95 | vdata[CS_RAW].clock_mode = __arch_get_clock_mode(tk); | ||
| 96 | |||
| 97 | /* CLOCK_REALTIME_COARSE */ | ||
| 98 | vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_REALTIME_COARSE]; | ||
| 99 | vdso_ts->sec = tk->xtime_sec; | ||
| 100 | vdso_ts->nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; | ||
| 101 | |||
| 102 | /* CLOCK_MONOTONIC_COARSE */ | ||
| 103 | vdso_ts = &vdata[CS_HRES_COARSE].basetime[CLOCK_MONOTONIC_COARSE]; | ||
| 104 | vdso_ts->sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; | ||
| 105 | nsec = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; | ||
| 106 | nsec = nsec + tk->wall_to_monotonic.tv_nsec; | ||
| 107 | vdso_ts->sec += __iter_div_u64_rem(nsec, NSEC_PER_SEC, &vdso_ts->nsec); | ||
| 108 | |||
| 109 | if (__arch_use_vsyscall(vdata)) | ||
| 110 | update_vdso_data(vdata, tk); | ||
| 111 | |||
| 112 | __arch_update_vsyscall(vdata, tk); | ||
| 113 | |||
| 114 | vdso_write_end(vdata); | ||
| 115 | |||
| 116 | __arch_sync_vdso_data(vdata); | ||
| 117 | } | ||
| 118 | |||
| 119 | void update_vsyscall_tz(void) | ||
| 120 | { | ||
| 121 | struct vdso_data *vdata = __arch_get_k_vdso_data(); | ||
| 122 | |||
| 123 | if (__arch_use_vsyscall(vdata)) { | ||
| 124 | vdata[CS_HRES_COARSE].tz_minuteswest = sys_tz.tz_minuteswest; | ||
| 125 | vdata[CS_HRES_COARSE].tz_dsttime = sys_tz.tz_dsttime; | ||
| 126 | } | ||
| 127 | |||
| 128 | __arch_sync_vdso_data(vdata); | ||
| 129 | } | ||
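The coarse-clock update above normalizes a nanosecond sum with __iter_div_u64_rem(), which stays cheap because the quotient is at most a couple of seconds. A tiny sketch of that helper in isolation (demo_split_ns() is invented):

#include <linux/math64.h>
#include <linux/time64.h>

static void demo_split_ns(u64 raw_ns, u32 *sec, u64 *nsec)
{
	/* Iterative divide: fine when raw_ns is only slightly over a second. */
	*sec = __iter_div_u64_rem(raw_ns, NSEC_PER_SEC, nsec);
}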
diff --git a/kernel/torture.c b/kernel/torture.c index 17b2be9bde12..a8d9bdfba7c3 100644 --- a/kernel/torture.c +++ b/kernel/torture.c | |||
| @@ -570,6 +570,7 @@ static void torture_shutdown_cleanup(void) | |||
| 570 | static struct task_struct *stutter_task; | 570 | static struct task_struct *stutter_task; |
| 571 | static int stutter_pause_test; | 571 | static int stutter_pause_test; |
| 572 | static int stutter; | 572 | static int stutter; |
| 573 | static int stutter_gap; | ||
| 573 | 574 | ||
| 574 | /* | 575 | /* |
| 575 | * Block until the stutter interval ends. This must be called periodically | 576 | * Block until the stutter interval ends. This must be called periodically |
| @@ -578,10 +579,12 @@ static int stutter; | |||
| 578 | bool stutter_wait(const char *title) | 579 | bool stutter_wait(const char *title) |
| 579 | { | 580 | { |
| 580 | int spt; | 581 | int spt; |
| 582 | bool ret = false; | ||
| 581 | 583 | ||
| 582 | cond_resched_tasks_rcu_qs(); | 584 | cond_resched_tasks_rcu_qs(); |
| 583 | spt = READ_ONCE(stutter_pause_test); | 585 | spt = READ_ONCE(stutter_pause_test); |
| 584 | for (; spt; spt = READ_ONCE(stutter_pause_test)) { | 586 | for (; spt; spt = READ_ONCE(stutter_pause_test)) { |
| 587 | ret = true; | ||
| 585 | if (spt == 1) { | 588 | if (spt == 1) { |
| 586 | schedule_timeout_interruptible(1); | 589 | schedule_timeout_interruptible(1); |
| 587 | } else if (spt == 2) { | 590 | } else if (spt == 2) { |
| @@ -592,7 +595,7 @@ bool stutter_wait(const char *title) | |||
| 592 | } | 595 | } |
| 593 | torture_shutdown_absorb(title); | 596 | torture_shutdown_absorb(title); |
| 594 | } | 597 | } |
| 595 | return !!spt; | 598 | return ret; |
| 596 | } | 599 | } |
| 597 | EXPORT_SYMBOL_GPL(stutter_wait); | 600 | EXPORT_SYMBOL_GPL(stutter_wait); |
| 598 | 601 | ||
| @@ -602,17 +605,24 @@ EXPORT_SYMBOL_GPL(stutter_wait); | |||
| 602 | */ | 605 | */ |
| 603 | static int torture_stutter(void *arg) | 606 | static int torture_stutter(void *arg) |
| 604 | { | 607 | { |
| 608 | int wtime; | ||
| 609 | |||
| 605 | VERBOSE_TOROUT_STRING("torture_stutter task started"); | 610 | VERBOSE_TOROUT_STRING("torture_stutter task started"); |
| 606 | do { | 611 | do { |
| 607 | if (!torture_must_stop() && stutter > 1) { | 612 | if (!torture_must_stop() && stutter > 1) { |
| 608 | WRITE_ONCE(stutter_pause_test, 1); | 613 | wtime = stutter; |
| 609 | schedule_timeout_interruptible(stutter - 1); | 614 | if (stutter > HZ + 1) { |
| 615 | WRITE_ONCE(stutter_pause_test, 1); | ||
| 616 | wtime = stutter - HZ - 1; | ||
| 617 | schedule_timeout_interruptible(wtime); | ||
| 618 | wtime = HZ + 1; | ||
| 619 | } | ||
| 610 | WRITE_ONCE(stutter_pause_test, 2); | 620 | WRITE_ONCE(stutter_pause_test, 2); |
| 611 | schedule_timeout_interruptible(1); | 621 | schedule_timeout_interruptible(wtime); |
| 612 | } | 622 | } |
| 613 | WRITE_ONCE(stutter_pause_test, 0); | 623 | WRITE_ONCE(stutter_pause_test, 0); |
| 614 | if (!torture_must_stop()) | 624 | if (!torture_must_stop()) |
| 615 | schedule_timeout_interruptible(stutter); | 625 | schedule_timeout_interruptible(stutter_gap); |
| 616 | torture_shutdown_absorb("torture_stutter"); | 626 | torture_shutdown_absorb("torture_stutter"); |
| 617 | } while (!torture_must_stop()); | 627 | } while (!torture_must_stop()); |
| 618 | torture_kthread_stopping("torture_stutter"); | 628 | torture_kthread_stopping("torture_stutter"); |
| @@ -622,9 +632,10 @@ static int torture_stutter(void *arg) | |||
| 622 | /* | 632 | /* |
| 623 | * Initialize and kick off the torture_stutter kthread. | 633 | * Initialize and kick off the torture_stutter kthread. |
| 624 | */ | 634 | */ |
| 625 | int torture_stutter_init(const int s) | 635 | int torture_stutter_init(const int s, const int sgap) |
| 626 | { | 636 | { |
| 627 | stutter = s; | 637 | stutter = s; |
| 638 | stutter_gap = sgap; | ||
| 628 | return torture_create_kthread(torture_stutter, NULL, stutter_task); | 639 | return torture_create_kthread(torture_stutter, NULL, stutter_task); |
| 629 | } | 640 | } |
| 630 | EXPORT_SYMBOL_GPL(torture_stutter_init); | 641 | EXPORT_SYMBOL_GPL(torture_stutter_init); |
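torture_stutter_init() now takes a second argument, the gap between stutter pauses, and long pauses are split so only the final HZ+1 jiffies use the heavier spt==2 phase. A hedged sketch of an updated caller; the 4*HZ and 2*HZ figures are invented:

#include <linux/kernel.h>
#include <linux/torture.h>

static int __init demo_torture_init(void)
{
	/* Pause the torture threads for ~4 s, then let them run for ~2 s. */
	return torture_stutter_init(4 * HZ, 2 * HZ);
}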
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5d965cef6c77..564e5fdb025f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | # SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | # | 2 | # |
| 2 | # Architectures that offer an FUNCTION_TRACER implementation should | 3 | # Architectures that offer an FUNCTION_TRACER implementation should |
| 3 | # select HAVE_FUNCTION_TRACER: | 4 | # select HAVE_FUNCTION_TRACER: |
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e1c6d79fb4cc..2d6e93ab0478 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c | |||
| @@ -512,8 +512,6 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
| 512 | dir = debugfs_lookup(buts->name, blk_debugfs_root); | 512 | dir = debugfs_lookup(buts->name, blk_debugfs_root); |
| 513 | if (!dir) | 513 | if (!dir) |
| 514 | bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); | 514 | bt->dir = dir = debugfs_create_dir(buts->name, blk_debugfs_root); |
| 515 | if (!dir) | ||
| 516 | goto err; | ||
| 517 | 515 | ||
| 518 | bt->dev = dev; | 516 | bt->dev = dev; |
| 519 | atomic_set(&bt->dropped, 0); | 517 | atomic_set(&bt->dropped, 0); |
| @@ -522,12 +520,8 @@ static int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, | |||
| 522 | ret = -EIO; | 520 | ret = -EIO; |
| 523 | bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, | 521 | bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, |
| 524 | &blk_dropped_fops); | 522 | &blk_dropped_fops); |
| 525 | if (!bt->dropped_file) | ||
| 526 | goto err; | ||
| 527 | 523 | ||
| 528 | bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); | 524 | bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); |
| 529 | if (!bt->msg_file) | ||
| 530 | goto err; | ||
| 531 | 525 | ||
| 532 | bt->rchan = relay_open("trace", dir, buts->buf_size, | 526 | bt->rchan = relay_open("trace", dir, buts->buf_size, |
| 533 | buts->buf_nr, &blk_relay_callbacks, bt); | 527 | buts->buf_nr, &blk_relay_callbacks, bt); |
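The blktrace cleanup drops error checks after debugfs_create_*() because debugfs failures are deliberately non-fatal. A sketch of the same idiom with an invented attribute; the return value is simply not inspected:

#include <linux/debugfs.h>

static u32 demo_dropped;

static void demo_setup_debugfs(struct dentry *parent)
{
	/* If debugfs is disabled or the call fails, the file just won't exist. */
	debugfs_create_u32("dropped", 0444, parent, &demo_dropped);
}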
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b496ffdf5f36..ca1255d14576 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c | |||
| @@ -19,6 +19,9 @@ | |||
| 19 | #include "trace_probe.h" | 19 | #include "trace_probe.h" |
| 20 | #include "trace.h" | 20 | #include "trace.h" |
| 21 | 21 | ||
| 22 | #define bpf_event_rcu_dereference(p) \ | ||
| 23 | rcu_dereference_protected(p, lockdep_is_held(&bpf_event_mutex)) | ||
| 24 | |||
| 22 | #ifdef CONFIG_MODULES | 25 | #ifdef CONFIG_MODULES |
| 23 | struct bpf_trace_module { | 26 | struct bpf_trace_module { |
| 24 | struct module *module; | 27 | struct module *module; |
| @@ -410,8 +413,6 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { | |||
| 410 | .arg4_type = ARG_CONST_SIZE, | 413 | .arg4_type = ARG_CONST_SIZE, |
| 411 | }; | 414 | }; |
| 412 | 415 | ||
| 413 | static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); | ||
| 414 | |||
| 415 | static __always_inline u64 | 416 | static __always_inline u64 |
| 416 | __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | 417 | __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, |
| 417 | u64 flags, struct perf_sample_data *sd) | 418 | u64 flags, struct perf_sample_data *sd) |
| @@ -442,24 +443,50 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, | |||
| 442 | return perf_event_output(event, sd, regs); | 443 | return perf_event_output(event, sd, regs); |
| 443 | } | 444 | } |
| 444 | 445 | ||
| 446 | /* | ||
| 447 | * Support executing tracepoints in normal, irq, and nmi context that each call | ||
| 448 | * bpf_perf_event_output | ||
| 449 | */ | ||
| 450 | struct bpf_trace_sample_data { | ||
| 451 | struct perf_sample_data sds[3]; | ||
| 452 | }; | ||
| 453 | |||
| 454 | static DEFINE_PER_CPU(struct bpf_trace_sample_data, bpf_trace_sds); | ||
| 455 | static DEFINE_PER_CPU(int, bpf_trace_nest_level); | ||
| 445 | BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, | 456 | BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, |
| 446 | u64, flags, void *, data, u64, size) | 457 | u64, flags, void *, data, u64, size) |
| 447 | { | 458 | { |
| 448 | struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); | 459 | struct bpf_trace_sample_data *sds = this_cpu_ptr(&bpf_trace_sds); |
| 460 | int nest_level = this_cpu_inc_return(bpf_trace_nest_level); | ||
| 449 | struct perf_raw_record raw = { | 461 | struct perf_raw_record raw = { |
| 450 | .frag = { | 462 | .frag = { |
| 451 | .size = size, | 463 | .size = size, |
| 452 | .data = data, | 464 | .data = data, |
| 453 | }, | 465 | }, |
| 454 | }; | 466 | }; |
| 467 | struct perf_sample_data *sd; | ||
| 468 | int err; | ||
| 455 | 469 | ||
| 456 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) | 470 | if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(sds->sds))) { |
| 457 | return -EINVAL; | 471 | err = -EBUSY; |
| 472 | goto out; | ||
| 473 | } | ||
| 474 | |||
| 475 | sd = &sds->sds[nest_level - 1]; | ||
| 476 | |||
| 477 | if (unlikely(flags & ~(BPF_F_INDEX_MASK))) { | ||
| 478 | err = -EINVAL; | ||
| 479 | goto out; | ||
| 480 | } | ||
| 458 | 481 | ||
| 459 | perf_sample_data_init(sd, 0, 0); | 482 | perf_sample_data_init(sd, 0, 0); |
| 460 | sd->raw = &raw; | 483 | sd->raw = &raw; |
| 461 | 484 | ||
| 462 | return __bpf_perf_event_output(regs, map, flags, sd); | 485 | err = __bpf_perf_event_output(regs, map, flags, sd); |
| 486 | |||
| 487 | out: | ||
| 488 | this_cpu_dec(bpf_trace_nest_level); | ||
| 489 | return err; | ||
| 463 | } | 490 | } |
| 464 | 491 | ||
| 465 | static const struct bpf_func_proto bpf_perf_event_output_proto = { | 492 | static const struct bpf_func_proto bpf_perf_event_output_proto = { |
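
The hunk replaces the single per-CPU perf_sample_data with a small per-CPU array indexed by a nesting counter, so a program interrupted in task, irq, and nmi context never reuses a buffer that is still live one level up. The idiom, condensed with illustrative names:

struct example_bufs {
        struct perf_sample_data sds[3];         /* task, irq, nmi */
};
static DEFINE_PER_CPU(struct example_bufs, example_bufs);
static DEFINE_PER_CPU(int, example_nest_level);

static struct perf_sample_data *example_get_buf(void)
{
        struct example_bufs *bufs = this_cpu_ptr(&example_bufs);
        int level = this_cpu_inc_return(example_nest_level);

        if (WARN_ON_ONCE(level > ARRAY_SIZE(bufs->sds))) {
                this_cpu_dec(example_nest_level);
                return NULL;                    /* caller reports -EBUSY */
        }
        return &bufs->sds[level - 1];
}

static void example_put_buf(void)
{
        this_cpu_dec(example_nest_level);
}
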
| @@ -567,6 +594,69 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { | |||
| 567 | .arg3_type = ARG_ANYTHING, | 594 | .arg3_type = ARG_ANYTHING, |
| 568 | }; | 595 | }; |
| 569 | 596 | ||
| 597 | struct send_signal_irq_work { | ||
| 598 | struct irq_work irq_work; | ||
| 599 | struct task_struct *task; | ||
| 600 | u32 sig; | ||
| 601 | }; | ||
| 602 | |||
| 603 | static DEFINE_PER_CPU(struct send_signal_irq_work, send_signal_work); | ||
| 604 | |||
| 605 | static void do_bpf_send_signal(struct irq_work *entry) | ||
| 606 | { | ||
| 607 | struct send_signal_irq_work *work; | ||
| 608 | |||
| 609 | work = container_of(entry, struct send_signal_irq_work, irq_work); | ||
| 610 | group_send_sig_info(work->sig, SEND_SIG_PRIV, work->task, PIDTYPE_TGID); | ||
| 611 | } | ||
| 612 | |||
| 613 | BPF_CALL_1(bpf_send_signal, u32, sig) | ||
| 614 | { | ||
| 615 | struct send_signal_irq_work *work = NULL; | ||
| 616 | |||
| 617 | /* Similar to bpf_probe_write_user, task needs to be | ||
| 618 | * in a sound condition and kernel memory access be | ||
| 619 | * permitted in order to send signal to the current | ||
| 620 | * task. | ||
| 621 | */ | ||
| 622 | if (unlikely(current->flags & (PF_KTHREAD | PF_EXITING))) | ||
| 623 | return -EPERM; | ||
| 624 | if (unlikely(uaccess_kernel())) | ||
| 625 | return -EPERM; | ||
| 626 | if (unlikely(!nmi_uaccess_okay())) | ||
| 627 | return -EPERM; | ||
| 628 | |||
| 629 | if (in_nmi()) { | ||
| 630 | /* Do an early check on signal validity. Otherwise, | ||
| 631 | * the error is lost in deferred irq_work. | ||
| 632 | */ | ||
| 633 | if (unlikely(!valid_signal(sig))) | ||
| 634 | return -EINVAL; | ||
| 635 | |||
| 636 | work = this_cpu_ptr(&send_signal_work); | ||
| 637 | if (work->irq_work.flags & IRQ_WORK_BUSY) | ||
| 638 | return -EBUSY; | ||
| 639 | |||
| 640 | /* Add the current task, which is the target of sending signal, | ||
| 641 | * to the irq_work. The current task may change when queued | ||
| 642 | * irq works get executed. | ||
| 643 | */ | ||
| 644 | work->task = current; | ||
| 645 | work->sig = sig; | ||
| 646 | irq_work_queue(&work->irq_work); | ||
| 647 | return 0; | ||
| 648 | } | ||
| 649 | |||
| 650 | return group_send_sig_info(sig, SEND_SIG_PRIV, current, PIDTYPE_TGID); | ||
| 651 | } | ||
| 652 | |||
| 653 | static const struct bpf_func_proto bpf_send_signal_proto = { | ||
| 654 | .func = bpf_send_signal, | ||
| 655 | .gpl_only = false, | ||
| 656 | .ret_type = RET_INTEGER, | ||
| 657 | .arg1_type = ARG_ANYTHING, | ||
| 658 | }; | ||
| 659 | |||
| 570 | static const struct bpf_func_proto * | 660 | static const struct bpf_func_proto * |
| 571 | tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) | 661 | tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) |
| 572 | { | 662 | { |
| @@ -617,6 +707,8 @@ tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) | |||
| 617 | case BPF_FUNC_get_current_cgroup_id: | 707 | case BPF_FUNC_get_current_cgroup_id: |
| 618 | return &bpf_get_current_cgroup_id_proto; | 708 | return &bpf_get_current_cgroup_id_proto; |
| 619 | #endif | 709 | #endif |
| 710 | case BPF_FUNC_send_signal: | ||
| 711 | return &bpf_send_signal_proto; | ||
| 620 | default: | 712 | default: |
| 621 | return NULL; | 713 | return NULL; |
| 622 | } | 714 | } |
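
With bpf_send_signal() registered in tracing_func_proto(), a tracing program can signal the task it runs on behalf of. A hedged sketch of a consumer, assuming a libbpf-style build that generates the helper declaration; the attach point and program name are illustrative:

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define SIGUSR1 10

SEC("kprobe/do_sys_open")
int notify_on_open(void *ctx)
{
        /* Signals the current thread group; returns -EPERM for kthreads
         * or exiting tasks and -EBUSY while the per-CPU irq_work slot
         * is still in flight, as implemented above. */
        bpf_send_signal(SIGUSR1);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";
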
| @@ -822,16 +914,48 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) | |||
| 822 | /* | 914 | /* |
| 823 | * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp | 915 | * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp |
| 824 | * to avoid potential recursive reuse issue when/if tracepoints are added | 916 | * to avoid potential recursive reuse issue when/if tracepoints are added |
| 825 | * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack | 917 | * inside bpf_*_event_output, bpf_get_stackid and/or bpf_get_stack. |
| 918 | * | ||
| 919 | * Since raw tracepoints run despite bpf_prog_active, support concurrent usage | ||
| 920 | * in normal, irq, and nmi context. | ||
| 826 | */ | 921 | */ |
| 827 | static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); | 922 | struct bpf_raw_tp_regs { |
| 923 | struct pt_regs regs[3]; | ||
| 924 | }; | ||
| 925 | static DEFINE_PER_CPU(struct bpf_raw_tp_regs, bpf_raw_tp_regs); | ||
| 926 | static DEFINE_PER_CPU(int, bpf_raw_tp_nest_level); | ||
| 927 | static struct pt_regs *get_bpf_raw_tp_regs(void) | ||
| 928 | { | ||
| 929 | struct bpf_raw_tp_regs *tp_regs = this_cpu_ptr(&bpf_raw_tp_regs); | ||
| 930 | int nest_level = this_cpu_inc_return(bpf_raw_tp_nest_level); | ||
| 931 | |||
| 932 | if (WARN_ON_ONCE(nest_level > ARRAY_SIZE(tp_regs->regs))) { | ||
| 933 | this_cpu_dec(bpf_raw_tp_nest_level); | ||
| 934 | return ERR_PTR(-EBUSY); | ||
| 935 | } | ||
| 936 | |||
| 937 | return &tp_regs->regs[nest_level - 1]; | ||
| 938 | } | ||
| 939 | |||
| 940 | static void put_bpf_raw_tp_regs(void) | ||
| 941 | { | ||
| 942 | this_cpu_dec(bpf_raw_tp_nest_level); | ||
| 943 | } | ||
| 944 | |||
| 828 | BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, | 945 | BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, |
| 829 | struct bpf_map *, map, u64, flags, void *, data, u64, size) | 946 | struct bpf_map *, map, u64, flags, void *, data, u64, size) |
| 830 | { | 947 | { |
| 831 | struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); | 948 | struct pt_regs *regs = get_bpf_raw_tp_regs(); |
| 949 | int ret; | ||
| 950 | |||
| 951 | if (IS_ERR(regs)) | ||
| 952 | return PTR_ERR(regs); | ||
| 832 | 953 | ||
| 833 | perf_fetch_caller_regs(regs); | 954 | perf_fetch_caller_regs(regs); |
| 834 | return ____bpf_perf_event_output(regs, map, flags, data, size); | 955 | ret = ____bpf_perf_event_output(regs, map, flags, data, size); |
| 956 | |||
| 957 | put_bpf_raw_tp_regs(); | ||
| 958 | return ret; | ||
| 835 | } | 959 | } |
| 836 | 960 | ||
| 837 | static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { | 961 | static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { |
| @@ -848,12 +972,18 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { | |||
| 848 | BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, | 972 | BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, |
| 849 | struct bpf_map *, map, u64, flags) | 973 | struct bpf_map *, map, u64, flags) |
| 850 | { | 974 | { |
| 851 | struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); | 975 | struct pt_regs *regs = get_bpf_raw_tp_regs(); |
| 976 | int ret; | ||
| 977 | |||
| 978 | if (IS_ERR(regs)) | ||
| 979 | return PTR_ERR(regs); | ||
| 852 | 980 | ||
| 853 | perf_fetch_caller_regs(regs); | 981 | perf_fetch_caller_regs(regs); |
| 854 | /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ | 982 | /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ |
| 855 | return bpf_get_stackid((unsigned long) regs, (unsigned long) map, | 983 | ret = bpf_get_stackid((unsigned long) regs, (unsigned long) map, |
| 856 | flags, 0, 0); | 984 | flags, 0, 0); |
| 985 | put_bpf_raw_tp_regs(); | ||
| 986 | return ret; | ||
| 857 | } | 987 | } |
| 858 | 988 | ||
| 859 | static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { | 989 | static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { |
| @@ -868,11 +998,17 @@ static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { | |||
| 868 | BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, | 998 | BPF_CALL_4(bpf_get_stack_raw_tp, struct bpf_raw_tracepoint_args *, args, |
| 869 | void *, buf, u32, size, u64, flags) | 999 | void *, buf, u32, size, u64, flags) |
| 870 | { | 1000 | { |
| 871 | struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); | 1001 | struct pt_regs *regs = get_bpf_raw_tp_regs(); |
| 1002 | int ret; | ||
| 1003 | |||
| 1004 | if (IS_ERR(regs)) | ||
| 1005 | return PTR_ERR(regs); | ||
| 872 | 1006 | ||
| 873 | perf_fetch_caller_regs(regs); | 1007 | perf_fetch_caller_regs(regs); |
| 874 | return bpf_get_stack((unsigned long) regs, (unsigned long) buf, | 1008 | ret = bpf_get_stack((unsigned long) regs, (unsigned long) buf, |
| 875 | (unsigned long) size, flags, 0); | 1009 | (unsigned long) size, flags, 0); |
| 1010 | put_bpf_raw_tp_regs(); | ||
| 1011 | return ret; | ||
| 876 | } | 1012 | } |
| 877 | 1013 | ||
| 878 | static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { | 1014 | static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = { |
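
All three raw-tracepoint helpers above now share one call shape: take a per-context pt_regs slot, bail out when nesting exceeds the three reserved levels, and release the slot on every return path. Condensed, with a hypothetical body:

struct pt_regs *regs = get_bpf_raw_tp_regs();
int ret;

if (IS_ERR(regs))
        return PTR_ERR(regs);           /* -EBUSY when nested too deeply */

perf_fetch_caller_regs(regs);
ret = consume_regs(regs);               /* stands in for the helper-specific work */
put_bpf_raw_tp_regs();
return ret;
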
| @@ -1034,7 +1170,7 @@ static DEFINE_MUTEX(bpf_event_mutex); | |||
| 1034 | int perf_event_attach_bpf_prog(struct perf_event *event, | 1170 | int perf_event_attach_bpf_prog(struct perf_event *event, |
| 1035 | struct bpf_prog *prog) | 1171 | struct bpf_prog *prog) |
| 1036 | { | 1172 | { |
| 1037 | struct bpf_prog_array __rcu *old_array; | 1173 | struct bpf_prog_array *old_array; |
| 1038 | struct bpf_prog_array *new_array; | 1174 | struct bpf_prog_array *new_array; |
| 1039 | int ret = -EEXIST; | 1175 | int ret = -EEXIST; |
| 1040 | 1176 | ||
| @@ -1052,7 +1188,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event, | |||
| 1052 | if (event->prog) | 1188 | if (event->prog) |
| 1053 | goto unlock; | 1189 | goto unlock; |
| 1054 | 1190 | ||
| 1055 | old_array = event->tp_event->prog_array; | 1191 | old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); |
| 1056 | if (old_array && | 1192 | if (old_array && |
| 1057 | bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { | 1193 | bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { |
| 1058 | ret = -E2BIG; | 1194 | ret = -E2BIG; |
| @@ -1075,7 +1211,7 @@ unlock: | |||
| 1075 | 1211 | ||
| 1076 | void perf_event_detach_bpf_prog(struct perf_event *event) | 1212 | void perf_event_detach_bpf_prog(struct perf_event *event) |
| 1077 | { | 1213 | { |
| 1078 | struct bpf_prog_array __rcu *old_array; | 1214 | struct bpf_prog_array *old_array; |
| 1079 | struct bpf_prog_array *new_array; | 1215 | struct bpf_prog_array *new_array; |
| 1080 | int ret; | 1216 | int ret; |
| 1081 | 1217 | ||
| @@ -1084,7 +1220,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event) | |||
| 1084 | if (!event->prog) | 1220 | if (!event->prog) |
| 1085 | goto unlock; | 1221 | goto unlock; |
| 1086 | 1222 | ||
| 1087 | old_array = event->tp_event->prog_array; | 1223 | old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); |
| 1088 | ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); | 1224 | ret = bpf_prog_array_copy(old_array, event->prog, NULL, &new_array); |
| 1089 | if (ret == -ENOENT) | 1225 | if (ret == -ENOENT) |
| 1090 | goto unlock; | 1226 | goto unlock; |
| @@ -1106,6 +1242,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) | |||
| 1106 | { | 1242 | { |
| 1107 | struct perf_event_query_bpf __user *uquery = info; | 1243 | struct perf_event_query_bpf __user *uquery = info; |
| 1108 | struct perf_event_query_bpf query = {}; | 1244 | struct perf_event_query_bpf query = {}; |
| 1245 | struct bpf_prog_array *progs; | ||
| 1109 | u32 *ids, prog_cnt, ids_len; | 1246 | u32 *ids, prog_cnt, ids_len; |
| 1110 | int ret; | 1247 | int ret; |
| 1111 | 1248 | ||
| @@ -1130,10 +1267,8 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) | |||
| 1130 | */ | 1267 | */ |
| 1131 | 1268 | ||
| 1132 | mutex_lock(&bpf_event_mutex); | 1269 | mutex_lock(&bpf_event_mutex); |
| 1133 | ret = bpf_prog_array_copy_info(event->tp_event->prog_array, | 1270 | progs = bpf_event_rcu_dereference(event->tp_event->prog_array); |
| 1134 | ids, | 1271 | ret = bpf_prog_array_copy_info(progs, ids, ids_len, &prog_cnt); |
| 1135 | ids_len, | ||
| 1136 | &prog_cnt); | ||
| 1137 | mutex_unlock(&bpf_event_mutex); | 1272 | mutex_unlock(&bpf_event_mutex); |
| 1138 | 1273 | ||
| 1139 | if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || | 1274 | if (copy_to_user(&uquery->prog_cnt, &prog_cnt, sizeof(prog_cnt)) || |
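
The attach, detach, and query paths all read tp_event->prog_array while holding bpf_event_mutex, so the new bpf_event_rcu_dereference() wrapper strips the __rcu annotation with a lockdep check instead of a bare assignment. Expanded, each access amounts to:

mutex_lock(&bpf_event_mutex);
old_array = rcu_dereference_protected(event->tp_event->prog_array,
                                      lockdep_is_held(&bpf_event_mutex));
/* ... copy, grow, or report old_array ... */
mutex_unlock(&bpf_event_mutex);
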
| @@ -1296,8 +1431,23 @@ int bpf_get_perf_event_info(const struct perf_event *event, u32 *prog_id, | |||
| 1296 | return err; | 1431 | return err; |
| 1297 | } | 1432 | } |
| 1298 | 1433 | ||
| 1434 | static int __init send_signal_irq_work_init(void) | ||
| 1435 | { | ||
| 1436 | int cpu; | ||
| 1437 | struct send_signal_irq_work *work; | ||
| 1438 | |||
| 1439 | for_each_possible_cpu(cpu) { | ||
| 1440 | work = per_cpu_ptr(&send_signal_work, cpu); | ||
| 1441 | init_irq_work(&work->irq_work, do_bpf_send_signal); | ||
| 1442 | } | ||
| 1443 | return 0; | ||
| 1444 | } | ||
| 1445 | |||
| 1446 | subsys_initcall(send_signal_irq_work_init); | ||
| 1447 | |||
| 1299 | #ifdef CONFIG_MODULES | 1448 | #ifdef CONFIG_MODULES |
| 1300 | int bpf_event_notify(struct notifier_block *nb, unsigned long op, void *module) | 1449 | static int bpf_event_notify(struct notifier_block *nb, unsigned long op, |
| 1450 | void *module) | ||
| 1301 | { | 1451 | { |
| 1302 | struct bpf_trace_module *btm, *tmp; | 1452 | struct bpf_trace_module *btm, *tmp; |
| 1303 | struct module *mod = module; | 1453 | struct module *mod = module; |
| @@ -1336,7 +1486,7 @@ static struct notifier_block bpf_module_nb = { | |||
| 1336 | .notifier_call = bpf_event_notify, | 1486 | .notifier_call = bpf_event_notify, |
| 1337 | }; | 1487 | }; |
| 1338 | 1488 | ||
| 1339 | int __init bpf_event_init(void) | 1489 | static int __init bpf_event_init(void) |
| 1340 | { | 1490 | { |
| 1341 | register_module_notifier(&bpf_module_nb); | 1491 | register_module_notifier(&bpf_module_nb); |
| 1342 | return 0; | 1492 | return 0; |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b920358dd8f7..576c41644e77 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -70,12 +70,8 @@ | |||
| 70 | #define INIT_OPS_HASH(opsname) \ | 70 | #define INIT_OPS_HASH(opsname) \ |
| 71 | .func_hash = &opsname.local_hash, \ | 71 | .func_hash = &opsname.local_hash, \ |
| 72 | .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), | 72 | .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), |
| 73 | #define ASSIGN_OPS_HASH(opsname, val) \ | ||
| 74 | .func_hash = val, \ | ||
| 75 | .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock), | ||
| 76 | #else | 73 | #else |
| 77 | #define INIT_OPS_HASH(opsname) | 74 | #define INIT_OPS_HASH(opsname) |
| 78 | #define ASSIGN_OPS_HASH(opsname, val) | ||
| 79 | #endif | 75 | #endif |
| 80 | 76 | ||
| 81 | enum { | 77 | enum { |
| @@ -2939,14 +2935,13 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) | |||
| 2939 | p = &pg->records[i]; | 2935 | p = &pg->records[i]; |
| 2940 | p->flags = rec_flags; | 2936 | p->flags = rec_flags; |
| 2941 | 2937 | ||
| 2942 | #ifndef CC_USING_NOP_MCOUNT | ||
| 2943 | /* | 2938 | /* |
| 2944 | * Do the initial record conversion from mcount jump | 2939 | * Do the initial record conversion from mcount jump |
| 2945 | * to the NOP instructions. | 2940 | * to the NOP instructions. |
| 2946 | */ | 2941 | */ |
| 2947 | if (!ftrace_code_disable(mod, p)) | 2942 | if (!__is_defined(CC_USING_NOP_MCOUNT) && |
| 2943 | !ftrace_code_disable(mod, p)) | ||
| 2948 | break; | 2944 | break; |
| 2949 | #endif | ||
| 2950 | 2945 | ||
| 2951 | update_cnt++; | 2946 | update_cnt++; |
| 2952 | } | 2947 | } |
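
The ftrace_update_code() hunk trades an #ifndef block for __is_defined(), which expands to a compile-time 0 or 1 for macros defined to 1 (as -DCC_USING_NOP_MCOUNT on the compiler command line does), so the conversion branch is type-checked even when it is optimized away. The same idiom on a hypothetical flag:

/* EXAMPLE_FLAG and prepare_record() are hypothetical. */
#ifndef EXAMPLE_FLAG                    /* preprocessor form: hidden arm never compiled */
        if (!prepare_record(p))
                break;
#endif

        /* equivalent, but both arms stay visible to the compiler */
        if (!__is_defined(EXAMPLE_FLAG) && !prepare_record(p))
                break;
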
| @@ -3880,7 +3875,7 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, | |||
| 3880 | static bool module_exists(const char *module) | 3875 | static bool module_exists(const char *module) |
| 3881 | { | 3876 | { |
| 3882 | /* All modules have the symbol __this_module */ | 3877 | /* All modules have the symbol __this_module */ |
| 3883 | const char this_mod[] = "__this_module"; | 3878 | static const char this_mod[] = "__this_module"; |
| 3884 | char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; | 3879 | char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2]; |
| 3885 | unsigned long val; | 3880 | unsigned long val; |
| 3886 | int n; | 3881 | int n; |
| @@ -4225,10 +4220,13 @@ void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, | |||
| 4225 | struct ftrace_func_entry *entry; | 4220 | struct ftrace_func_entry *entry; |
| 4226 | struct ftrace_func_map *map; | 4221 | struct ftrace_func_map *map; |
| 4227 | struct hlist_head *hhd; | 4222 | struct hlist_head *hhd; |
| 4228 | int size = 1 << mapper->hash.size_bits; | 4223 | int size, i; |
| 4229 | int i; | 4224 | |
| 4225 | if (!mapper) | ||
| 4226 | return; | ||
| 4230 | 4227 | ||
| 4231 | if (free_func && mapper->hash.count) { | 4228 | if (free_func && mapper->hash.count) { |
| 4229 | size = 1 << mapper->hash.size_bits; | ||
| 4232 | for (i = 0; i < size; i++) { | 4230 | for (i = 0; i < size; i++) { |
| 4233 | hhd = &mapper->hash.buckets[i]; | 4231 | hhd = &mapper->hash.buckets[i]; |
| 4234 | hlist_for_each_entry(entry, hhd, hlist) { | 4232 | hlist_for_each_entry(entry, hhd, hlist) { |
| @@ -6265,6 +6263,9 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | |||
| 6265 | preempt_disable_notrace(); | 6263 | preempt_disable_notrace(); |
| 6266 | 6264 | ||
| 6267 | do_for_each_ftrace_op(op, ftrace_ops_list) { | 6265 | do_for_each_ftrace_op(op, ftrace_ops_list) { |
| 6266 | /* Stub functions don't need to be called nor tested */ | ||
| 6267 | if (op->flags & FTRACE_OPS_FL_STUB) | ||
| 6268 | continue; | ||
| 6268 | /* | 6269 | /* |
| 6269 | * Check the following for each ops before calling their func: | 6270 | * Check the following for each ops before calling their func: |
| 6270 | * if RCU flag is set, then rcu_is_watching() must be true | 6271 | * if RCU flag is set, then rcu_is_watching() must be true |
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4ee8d8aa3d0f..05b0b3139ebc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -4979,7 +4979,7 @@ static __init int rb_write_something(struct rb_test_data *data, bool nested) | |||
| 4979 | cnt = data->cnt + (nested ? 27 : 0); | 4979 | cnt = data->cnt + (nested ? 27 : 0); |
| 4980 | 4980 | ||
| 4981 | /* Multiply cnt by ~e, to make some unique increment */ | 4981 | /* Multiply cnt by ~e, to make some unique increment */ |
| 4982 | size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); | 4982 | size = (cnt * 68 / 25) % (sizeof(rb_string) - 1); |
| 4983 | 4983 | ||
| 4984 | len = size + sizeof(struct rb_item); | 4984 | len = size + sizeof(struct rb_item); |
| 4985 | 4985 | ||
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index ffba6789c0e2..0564f6db0561 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c | |||
| @@ -362,7 +362,7 @@ static void ring_buffer_producer(void) | |||
| 362 | hit--; /* make it non zero */ | 362 | hit--; /* make it non zero */ |
| 363 | } | 363 | } |
| 364 | 364 | ||
| 365 | /* Caculate the average time in nanosecs */ | 365 | /* Calculate the average time in nanosecs */ |
| 366 | avg = NSEC_PER_MSEC / (hit + missed); | 366 | avg = NSEC_PER_MSEC / (hit + missed); |
| 367 | trace_printk("%ld ns per entry\n", avg); | 367 | trace_printk("%ld ns per entry\n", avg); |
| 368 | } | 368 | } |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index ec439999f387..c90c687cf950 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -1727,6 +1727,10 @@ static __init int init_trace_selftests(void) | |||
| 1727 | pr_info("Running postponed tracer tests:\n"); | 1727 | pr_info("Running postponed tracer tests:\n"); |
| 1728 | 1728 | ||
| 1729 | list_for_each_entry_safe(p, n, &postponed_selftests, list) { | 1729 | list_for_each_entry_safe(p, n, &postponed_selftests, list) { |
| 1730 | /* This loop can take minutes when sanitizers are enabled, so | ||
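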
| 1731 | * let's make sure we allow RCU processing. | ||
| 1732 | */ | ||
| 1733 | cond_resched(); | ||
| 1730 | ret = run_tracer_selftest(p->type); | 1734 | ret = run_tracer_selftest(p->type); |
| 1731 | /* If the test fails, then warn and remove from available_tracers */ | 1735 | /* If the test fails, then warn and remove from available_tracers */ |
| 1732 | if (ret < 0) { | 1736 | if (ret < 0) { |
| @@ -3045,6 +3049,7 @@ void trace_printk_init_buffers(void) | |||
| 3045 | if (global_trace.trace_buffer.buffer) | 3049 | if (global_trace.trace_buffer.buffer) |
| 3046 | tracing_start_cmdline_record(); | 3050 | tracing_start_cmdline_record(); |
| 3047 | } | 3051 | } |
| 3052 | EXPORT_SYMBOL_GPL(trace_printk_init_buffers); | ||
| 3048 | 3053 | ||
| 3049 | void trace_printk_start_comm(void) | 3054 | void trace_printk_start_comm(void) |
| 3050 | { | 3055 | { |
| @@ -3205,6 +3210,7 @@ int trace_array_printk(struct trace_array *tr, | |||
| 3205 | va_end(ap); | 3210 | va_end(ap); |
| 3206 | return ret; | 3211 | return ret; |
| 3207 | } | 3212 | } |
| 3213 | EXPORT_SYMBOL_GPL(trace_array_printk); | ||
| 3208 | 3214 | ||
| 3209 | __printf(3, 4) | 3215 | __printf(3, 4) |
| 3210 | int trace_array_printk_buf(struct ring_buffer *buffer, | 3216 | int trace_array_printk_buf(struct ring_buffer *buffer, |
| @@ -3483,33 +3489,68 @@ static void s_stop(struct seq_file *m, void *p) | |||
| 3483 | } | 3489 | } |
| 3484 | 3490 | ||
| 3485 | static void | 3491 | static void |
| 3492 | get_total_entries_cpu(struct trace_buffer *buf, unsigned long *total, | ||
| 3493 | unsigned long *entries, int cpu) | ||
| 3494 | { | ||
| 3495 | unsigned long count; | ||
| 3496 | |||
| 3497 | count = ring_buffer_entries_cpu(buf->buffer, cpu); | ||
| 3498 | /* | ||
| 3499 | * If this buffer has skipped entries, then we hold all | ||
| 3500 | * entries for the trace and we need to ignore the | ||
| 3501 | * ones before the time stamp. | ||
| 3502 | */ | ||
| 3503 | if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { | ||
| 3504 | count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; | ||
| 3505 | /* total is the same as the entries */ | ||
| 3506 | *total = count; | ||
| 3507 | } else | ||
| 3508 | *total = count + | ||
| 3509 | ring_buffer_overrun_cpu(buf->buffer, cpu); | ||
| 3510 | *entries = count; | ||
| 3511 | } | ||
| 3512 | |||
| 3513 | static void | ||
| 3486 | get_total_entries(struct trace_buffer *buf, | 3514 | get_total_entries(struct trace_buffer *buf, |
| 3487 | unsigned long *total, unsigned long *entries) | 3515 | unsigned long *total, unsigned long *entries) |
| 3488 | { | 3516 | { |
| 3489 | unsigned long count; | 3517 | unsigned long t, e; |
| 3490 | int cpu; | 3518 | int cpu; |
| 3491 | 3519 | ||
| 3492 | *total = 0; | 3520 | *total = 0; |
| 3493 | *entries = 0; | 3521 | *entries = 0; |
| 3494 | 3522 | ||
| 3495 | for_each_tracing_cpu(cpu) { | 3523 | for_each_tracing_cpu(cpu) { |
| 3496 | count = ring_buffer_entries_cpu(buf->buffer, cpu); | 3524 | get_total_entries_cpu(buf, &t, &e, cpu); |
| 3497 | /* | 3525 | *total += t; |
| 3498 | * If this buffer has skipped entries, then we hold all | 3526 | *entries += e; |
| 3499 | * entries for the trace and we need to ignore the | ||
| 3500 | * ones before the time stamp. | ||
| 3501 | */ | ||
| 3502 | if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { | ||
| 3503 | count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; | ||
| 3504 | /* total is the same as the entries */ | ||
| 3505 | *total += count; | ||
| 3506 | } else | ||
| 3507 | *total += count + | ||
| 3508 | ring_buffer_overrun_cpu(buf->buffer, cpu); | ||
| 3509 | *entries += count; | ||
| 3510 | } | 3527 | } |
| 3511 | } | 3528 | } |
| 3512 | 3529 | ||
| 3530 | unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu) | ||
| 3531 | { | ||
| 3532 | unsigned long total, entries; | ||
| 3533 | |||
| 3534 | if (!tr) | ||
| 3535 | tr = &global_trace; | ||
| 3536 | |||
| 3537 | get_total_entries_cpu(&tr->trace_buffer, &total, &entries, cpu); | ||
| 3538 | |||
| 3539 | return entries; | ||
| 3540 | } | ||
| 3541 | |||
| 3542 | unsigned long trace_total_entries(struct trace_array *tr) | ||
| 3543 | { | ||
| 3544 | unsigned long total, entries; | ||
| 3545 | |||
| 3546 | if (!tr) | ||
| 3547 | tr = &global_trace; | ||
| 3548 | |||
| 3549 | get_total_entries(&tr->trace_buffer, &total, &entries); | ||
| 3550 | |||
| 3551 | return entries; | ||
| 3552 | } | ||
| 3553 | |||
| 3513 | static void print_lat_help_header(struct seq_file *m) | 3554 | static void print_lat_help_header(struct seq_file *m) |
| 3514 | { | 3555 | { |
| 3515 | seq_puts(m, "# _------=> CPU# \n" | 3556 | seq_puts(m, "# _------=> CPU# \n" |
| @@ -3548,25 +3589,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file | |||
| 3548 | unsigned int flags) | 3589 | unsigned int flags) |
| 3549 | { | 3590 | { |
| 3550 | bool tgid = flags & TRACE_ITER_RECORD_TGID; | 3591 | bool tgid = flags & TRACE_ITER_RECORD_TGID; |
| 3551 | const char tgid_space[] = " "; | 3592 | const char *space = " "; |
| 3552 | const char space[] = " "; | 3593 | int prec = tgid ? 10 : 2; |
| 3553 | 3594 | ||
| 3554 | print_event_info(buf, m); | 3595 | print_event_info(buf, m); |
| 3555 | 3596 | ||
| 3556 | seq_printf(m, "# %s _-----=> irqs-off\n", | 3597 | seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); |
| 3557 | tgid ? tgid_space : space); | 3598 | seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); |
| 3558 | seq_printf(m, "# %s / _----=> need-resched\n", | 3599 | seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); |
| 3559 | tgid ? tgid_space : space); | 3600 | seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); |
| 3560 | seq_printf(m, "# %s| / _---=> hardirq/softirq\n", | 3601 | seq_printf(m, "# %.*s||| / delay\n", prec, space); |
| 3561 | tgid ? tgid_space : space); | 3602 | seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); |
| 3562 | seq_printf(m, "# %s|| / _--=> preempt-depth\n", | 3603 | seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); |
| 3563 | tgid ? tgid_space : space); | ||
| 3564 | seq_printf(m, "# %s||| / delay\n", | ||
| 3565 | tgid ? tgid_space : space); | ||
| 3566 | seq_printf(m, "# TASK-PID %sCPU# |||| TIMESTAMP FUNCTION\n", | ||
| 3567 | tgid ? " TGID " : space); | ||
| 3568 | seq_printf(m, "# | | %s | |||| | |\n", | ||
| 3569 | tgid ? " | " : space); | ||
| 3570 | } | 3604 | } |
| 3571 | 3605 | ||
| 3572 | void | 3606 | void |
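
The rewritten header relies on printf precision for strings: "%.*s" prints at most prec characters of its argument, so one wide literal serves both the 2-column (no TGID) and 10-column (TGID) paddings. In isolation:

const char *space = "          ";       /* at least as wide as the largest padding */
int prec = tgid ? 10 : 2;

seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space);
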
| @@ -4692,6 +4726,7 @@ static const char readme_msg[] = | |||
| 4692 | " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" | 4726 | " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" |
| 4693 | " current_tracer\t- function and latency tracers\n" | 4727 | " current_tracer\t- function and latency tracers\n" |
| 4694 | " available_tracers\t- list of configured tracers for current_tracer\n" | 4728 | " available_tracers\t- list of configured tracers for current_tracer\n" |
| 4729 | " error_log\t- error log for failed commands (that support it)\n" | ||
| 4695 | " buffer_size_kb\t- view and modify size of per cpu buffer\n" | 4730 | " buffer_size_kb\t- view and modify size of per cpu buffer\n" |
| 4696 | " buffer_total_size_kb - view total size of all cpu buffers\n\n" | 4731 | " buffer_total_size_kb - view total size of all cpu buffers\n\n" |
| 4697 | " trace_clock\t\t-change the clock used to order events\n" | 4732 | " trace_clock\t\t-change the clock used to order events\n" |
| @@ -4712,7 +4747,7 @@ static const char readme_msg[] = | |||
| 4712 | " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" | 4747 | " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" |
| 4713 | "\t\t\t Remove sub-buffer with rmdir\n" | 4748 | "\t\t\t Remove sub-buffer with rmdir\n" |
| 4714 | " trace_options\t\t- Set format or modify how tracing happens\n" | 4749 | " trace_options\t\t- Set format or modify how tracing happens\n" |
| 4715 | "\t\t\t Disable an option by adding a suffix 'no' to the\n" | 4750 | "\t\t\t Disable an option by prefixing 'no' to the\n" |
| 4716 | "\t\t\t option name\n" | 4751 | "\t\t\t option name\n" |
| 4717 | " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" | 4752 | " saved_cmdlines_size\t- echo command number in here to store comm-pid list\n" |
| 4718 | #ifdef CONFIG_DYNAMIC_FTRACE | 4753 | #ifdef CONFIG_DYNAMIC_FTRACE |
| @@ -6296,13 +6331,13 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
| 6296 | struct ring_buffer *buffer; | 6331 | struct ring_buffer *buffer; |
| 6297 | struct print_entry *entry; | 6332 | struct print_entry *entry; |
| 6298 | unsigned long irq_flags; | 6333 | unsigned long irq_flags; |
| 6299 | const char faulted[] = "<faulted>"; | ||
| 6300 | ssize_t written; | 6334 | ssize_t written; |
| 6301 | int size; | 6335 | int size; |
| 6302 | int len; | 6336 | int len; |
| 6303 | 6337 | ||
| 6304 | /* Used in tracing_mark_raw_write() as well */ | 6338 | /* Used in tracing_mark_raw_write() as well */ |
| 6305 | #define FAULTED_SIZE (sizeof(faulted) - 1) /* '\0' is already accounted for */ | 6339 | #define FAULTED_STR "<faulted>" |
| 6340 | #define FAULTED_SIZE (sizeof(FAULTED_STR) - 1) /* '\0' is already accounted for */ | ||
| 6306 | 6341 | ||
| 6307 | if (tracing_disabled) | 6342 | if (tracing_disabled) |
| 6308 | return -EINVAL; | 6343 | return -EINVAL; |
| @@ -6334,7 +6369,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
| 6334 | 6369 | ||
| 6335 | len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); | 6370 | len = __copy_from_user_inatomic(&entry->buf, ubuf, cnt); |
| 6336 | if (len) { | 6371 | if (len) { |
| 6337 | memcpy(&entry->buf, faulted, FAULTED_SIZE); | 6372 | memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); |
| 6338 | cnt = FAULTED_SIZE; | 6373 | cnt = FAULTED_SIZE; |
| 6339 | written = -EFAULT; | 6374 | written = -EFAULT; |
| 6340 | } else | 6375 | } else |
| @@ -6375,7 +6410,6 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, | |||
| 6375 | struct ring_buffer_event *event; | 6410 | struct ring_buffer_event *event; |
| 6376 | struct ring_buffer *buffer; | 6411 | struct ring_buffer *buffer; |
| 6377 | struct raw_data_entry *entry; | 6412 | struct raw_data_entry *entry; |
| 6378 | const char faulted[] = "<faulted>"; | ||
| 6379 | unsigned long irq_flags; | 6413 | unsigned long irq_flags; |
| 6380 | ssize_t written; | 6414 | ssize_t written; |
| 6381 | int size; | 6415 | int size; |
| @@ -6415,7 +6449,7 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf, | |||
| 6415 | len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); | 6449 | len = __copy_from_user_inatomic(&entry->id, ubuf, cnt); |
| 6416 | if (len) { | 6450 | if (len) { |
| 6417 | entry->id = -1; | 6451 | entry->id = -1; |
| 6418 | memcpy(&entry->buf, faulted, FAULTED_SIZE); | 6452 | memcpy(&entry->buf, FAULTED_STR, FAULTED_SIZE); |
| 6419 | written = -EFAULT; | 6453 | written = -EFAULT; |
| 6420 | } else | 6454 | } else |
| 6421 | written = cnt; | 6455 | written = cnt; |
| @@ -6685,11 +6719,13 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, | |||
| 6685 | break; | 6719 | break; |
| 6686 | } | 6720 | } |
| 6687 | #endif | 6721 | #endif |
| 6688 | if (!tr->allocated_snapshot) { | 6722 | if (tr->allocated_snapshot) |
| 6723 | ret = resize_buffer_duplicate_size(&tr->max_buffer, | ||
| 6724 | &tr->trace_buffer, iter->cpu_file); | ||
| 6725 | else | ||
| 6689 | ret = tracing_alloc_snapshot_instance(tr); | 6726 | ret = tracing_alloc_snapshot_instance(tr); |
| 6690 | if (ret < 0) | 6727 | if (ret < 0) |
| 6691 | break; | 6728 | break; |
| 6692 | } | ||
| 6693 | local_irq_disable(); | 6729 | local_irq_disable(); |
| 6694 | /* Now, we're going to swap */ | 6730 | /* Now, we're going to swap */ |
| 6695 | if (iter->cpu_file == RING_BUFFER_ALL_CPUS) | 6731 | if (iter->cpu_file == RING_BUFFER_ALL_CPUS) |
| @@ -6868,6 +6904,250 @@ static const struct file_operations snapshot_raw_fops = { | |||
| 6868 | 6904 | ||
| 6869 | #endif /* CONFIG_TRACER_SNAPSHOT */ | 6905 | #endif /* CONFIG_TRACER_SNAPSHOT */ |
| 6870 | 6906 | ||
| 6907 | #define TRACING_LOG_ERRS_MAX 8 | ||
| 6908 | #define TRACING_LOG_LOC_MAX 128 | ||
| 6909 | |||
| 6910 | #define CMD_PREFIX " Command: " | ||
| 6911 | |||
| 6912 | struct err_info { | ||
| 6913 | const char **errs; /* ptr to loc-specific array of err strings */ | ||
| 6914 | u8 type; /* index into errs -> specific err string */ | ||
| 6915 | u8 pos; /* MAX_FILTER_STR_VAL = 256 */ | ||
| 6916 | u64 ts; | ||
| 6917 | }; | ||
| 6918 | |||
| 6919 | struct tracing_log_err { | ||
| 6920 | struct list_head list; | ||
| 6921 | struct err_info info; | ||
| 6922 | char loc[TRACING_LOG_LOC_MAX]; /* err location */ | ||
| 6923 | char cmd[MAX_FILTER_STR_VAL]; /* what caused err */ | ||
| 6924 | }; | ||
| 6925 | |||
| 6926 | static DEFINE_MUTEX(tracing_err_log_lock); | ||
| 6927 | |||
| 6928 | static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr) | ||
| 6929 | { | ||
| 6930 | struct tracing_log_err *err; | ||
| 6931 | |||
| 6932 | if (tr->n_err_log_entries < TRACING_LOG_ERRS_MAX) { | ||
| 6933 | err = kzalloc(sizeof(*err), GFP_KERNEL); | ||
| 6934 | if (!err) | ||
| 6935 | err = ERR_PTR(-ENOMEM); | ||
| 6936 | tr->n_err_log_entries++; | ||
| 6937 | |||
| 6938 | return err; | ||
| 6939 | } | ||
| 6940 | |||
| 6941 | err = list_first_entry(&tr->err_log, struct tracing_log_err, list); | ||
| 6942 | list_del(&err->list); | ||
| 6943 | |||
| 6944 | return err; | ||
| 6945 | } | ||
| 6946 | |||
| 6947 | /** | ||
| 6948 | * err_pos - find the position of a string within a command for error careting | ||
| 6949 | * @cmd: The tracing command that caused the error | ||
| 6950 | * @str: The string to position the caret at within @cmd | ||
| 6951 | * | ||
| 6952 | * Finds the position of the first occurrence of @str within @cmd. The | ||
| 6953 | * return value can be passed to tracing_log_err() for caret placement | ||
| 6954 | * within @cmd. | ||
| 6955 | * | ||
| 6956 | * Returns the index within @cmd of the first occurrence of @str or 0 | ||
| 6957 | * if @str was not found. | ||
| 6958 | */ | ||
| 6959 | unsigned int err_pos(char *cmd, const char *str) | ||
| 6960 | { | ||
| 6961 | char *found; | ||
| 6962 | |||
| 6963 | if (WARN_ON(!strlen(cmd))) | ||
| 6964 | return 0; | ||
| 6965 | |||
| 6966 | found = strstr(cmd, str); | ||
| 6967 | if (found) | ||
| 6968 | return found - cmd; | ||
| 6969 | |||
| 6970 | return 0; | ||
| 6971 | } | ||
| 6972 | |||
| 6973 | /** | ||
| 6974 | * tracing_log_err - write an error to the tracing error log | ||
| 6975 | * @tr: The associated trace array for the error (NULL for top level array) | ||
| 6976 | * @loc: A string describing where the error occurred | ||
| 6977 | * @cmd: The tracing command that caused the error | ||
| 6978 | * @errs: The array of loc-specific static error strings | ||
| 6979 | * @type: The index into errs[], which produces the specific static err string | ||
| 6980 | * @pos: The position the caret should be placed in the cmd | ||
| 6981 | * | ||
| 6982 | * Writes an error into tracing/error_log of the form: | ||
| 6983 | * | ||
| 6984 | * <loc>: error: <text> | ||
| 6985 | * Command: <cmd> | ||
| 6986 | * ^ | ||
| 6987 | * | ||
| 6988 | * tracing/error_log is a small log file containing the last | ||
| 6989 | * TRACING_LOG_ERRS_MAX errors (8). Memory for errors isn't allocated | ||
| 6990 | * unless there has been a tracing error, and the error log can be | ||
| 6991 | * cleared and have its memory freed by writing the empty string in | ||
| 6992 | * truncation mode to it i.e. echo > tracing/error_log. | ||
| 6993 | * | ||
| 6994 | * NOTE: the @errs array along with the @type param are used to | ||
| 6995 | * produce a static error string - this string is not copied and saved | ||
| 6996 | * when the error is logged - only a pointer to it is saved. See | ||
| 6997 | * existing callers for examples of how static strings are typically | ||
| 6998 | * defined for use with tracing_log_err(). | ||
| 6999 | */ | ||
| 7000 | void tracing_log_err(struct trace_array *tr, | ||
| 7001 | const char *loc, const char *cmd, | ||
| 7002 | const char **errs, u8 type, u8 pos) | ||
| 7003 | { | ||
| 7004 | struct tracing_log_err *err; | ||
| 7005 | |||
| 7006 | if (!tr) | ||
| 7007 | tr = &global_trace; | ||
| 7008 | |||
| 7009 | mutex_lock(&tracing_err_log_lock); | ||
| 7010 | err = get_tracing_log_err(tr); | ||
| 7011 | if (PTR_ERR(err) == -ENOMEM) { | ||
| 7012 | mutex_unlock(&tracing_err_log_lock); | ||
| 7013 | return; | ||
| 7014 | } | ||
| 7015 | |||
| 7016 | snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc); | ||
| 7017 | snprintf(err->cmd, MAX_FILTER_STR_VAL,"\n" CMD_PREFIX "%s\n", cmd); | ||
| 7018 | |||
| 7019 | err->info.errs = errs; | ||
| 7020 | err->info.type = type; | ||
| 7021 | err->info.pos = pos; | ||
| 7022 | err->info.ts = local_clock(); | ||
| 7023 | |||
| 7024 | list_add_tail(&err->list, &tr->err_log); | ||
| 7025 | mutex_unlock(&tracing_err_log_lock); | ||
| 7026 | } | ||
| 7027 | |||
| 7028 | static void clear_tracing_err_log(struct trace_array *tr) | ||
| 7029 | { | ||
| 7030 | struct tracing_log_err *err, *next; | ||
| 7031 | |||
| 7032 | mutex_lock(&tracing_err_log_lock); | ||
| 7033 | list_for_each_entry_safe(err, next, &tr->err_log, list) { | ||
| 7034 | list_del(&err->list); | ||
| 7035 | kfree(err); | ||
| 7036 | } | ||
| 7037 | |||
| 7038 | tr->n_err_log_entries = 0; | ||
| 7039 | mutex_unlock(&tracing_err_log_lock); | ||
| 7040 | } | ||
| 7041 | |||
| 7042 | static void *tracing_err_log_seq_start(struct seq_file *m, loff_t *pos) | ||
| 7043 | { | ||
| 7044 | struct trace_array *tr = m->private; | ||
| 7045 | |||
| 7046 | mutex_lock(&tracing_err_log_lock); | ||
| 7047 | |||
| 7048 | return seq_list_start(&tr->err_log, *pos); | ||
| 7049 | } | ||
| 7050 | |||
| 7051 | static void *tracing_err_log_seq_next(struct seq_file *m, void *v, loff_t *pos) | ||
| 7052 | { | ||
| 7053 | struct trace_array *tr = m->private; | ||
| 7054 | |||
| 7055 | return seq_list_next(v, &tr->err_log, pos); | ||
| 7056 | } | ||
| 7057 | |||
| 7058 | static void tracing_err_log_seq_stop(struct seq_file *m, void *v) | ||
| 7059 | { | ||
| 7060 | mutex_unlock(&tracing_err_log_lock); | ||
| 7061 | } | ||
| 7062 | |||
| 7063 | static void tracing_err_log_show_pos(struct seq_file *m, u8 pos) | ||
| 7064 | { | ||
| 7065 | u8 i; | ||
| 7066 | |||
| 7067 | for (i = 0; i < sizeof(CMD_PREFIX) - 1; i++) | ||
| 7068 | seq_putc(m, ' '); | ||
| 7069 | for (i = 0; i < pos; i++) | ||
| 7070 | seq_putc(m, ' '); | ||
| 7071 | seq_puts(m, "^\n"); | ||
| 7072 | } | ||
| 7073 | |||
| 7074 | static int tracing_err_log_seq_show(struct seq_file *m, void *v) | ||
| 7075 | { | ||
| 7076 | struct tracing_log_err *err = v; | ||
| 7077 | |||
| 7078 | if (err) { | ||
| 7079 | const char *err_text = err->info.errs[err->info.type]; | ||
| 7080 | u64 sec = err->info.ts; | ||
| 7081 | u32 nsec; | ||
| 7082 | |||
| 7083 | nsec = do_div(sec, NSEC_PER_SEC); | ||
| 7084 | seq_printf(m, "[%5llu.%06u] %s%s", sec, nsec / 1000, | ||
| 7085 | err->loc, err_text); | ||
| 7086 | seq_printf(m, "%s", err->cmd); | ||
| 7087 | tracing_err_log_show_pos(m, err->info.pos); | ||
| 7088 | } | ||
| 7089 | |||
| 7090 | return 0; | ||
| 7091 | } | ||
| 7092 | |||
| 7093 | static const struct seq_operations tracing_err_log_seq_ops = { | ||
| 7094 | .start = tracing_err_log_seq_start, | ||
| 7095 | .next = tracing_err_log_seq_next, | ||
| 7096 | .stop = tracing_err_log_seq_stop, | ||
| 7097 | .show = tracing_err_log_seq_show | ||
| 7098 | }; | ||
| 7099 | |||
| 7100 | static int tracing_err_log_open(struct inode *inode, struct file *file) | ||
| 7101 | { | ||
| 7102 | struct trace_array *tr = inode->i_private; | ||
| 7103 | int ret = 0; | ||
| 7104 | |||
| 7105 | if (trace_array_get(tr) < 0) | ||
| 7106 | return -ENODEV; | ||
| 7107 | |||
| 7108 | /* If this file was opened for write, then erase contents */ | ||
| 7109 | if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) | ||
| 7110 | clear_tracing_err_log(tr); | ||
| 7111 | |||
| 7112 | if (file->f_mode & FMODE_READ) { | ||
| 7113 | ret = seq_open(file, &tracing_err_log_seq_ops); | ||
| 7114 | if (!ret) { | ||
| 7115 | struct seq_file *m = file->private_data; | ||
| 7116 | m->private = tr; | ||
| 7117 | } else { | ||
| 7118 | trace_array_put(tr); | ||
| 7119 | } | ||
| 7120 | } | ||
| 7121 | return ret; | ||
| 7122 | } | ||
| 7123 | |||
| 7124 | static ssize_t tracing_err_log_write(struct file *file, | ||
| 7125 | const char __user *buffer, | ||
| 7126 | size_t count, loff_t *ppos) | ||
| 7127 | { | ||
| 7128 | return count; | ||
| 7129 | } | ||
| 7130 | |||
| 7131 | static int tracing_err_log_release(struct inode *inode, struct file *file) | ||
| 7132 | { | ||
| 7133 | struct trace_array *tr = inode->i_private; | ||
| 7134 | |||
| 7135 | trace_array_put(tr); | ||
| 7136 | |||
| 7137 | if (file->f_mode & FMODE_READ) | ||
| 7138 | seq_release(inode, file); | ||
| 7139 | |||
| 7140 | return 0; | ||
| 7141 | } | ||
| 7142 | |||
| 7143 | static const struct file_operations tracing_err_log_fops = { | ||
| 7144 | .open = tracing_err_log_open, | ||
| 7145 | .write = tracing_err_log_write, | ||
| 7146 | .read = seq_read, | ||
| 7147 | .llseek = seq_lseek, | ||
| 7148 | .release = tracing_err_log_release, | ||
| 7149 | }; | ||
| 7150 | |||
| 6871 | static int tracing_buffers_open(struct inode *inode, struct file *filp) | 7151 | static int tracing_buffers_open(struct inode *inode, struct file *filp) |
| 6872 | { | 7152 | { |
| 6873 | struct trace_array *tr = inode->i_private; | 7153 | struct trace_array *tr = inode->i_private; |
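
The error_log file gives tracing commands one place to report parse failures. A hedged sketch of a caller: the command name, error table, and helper are hypothetical, but the call follows the tracing_log_err() kernel-doc above, with err_pos() supplying the caret offset:

static const char *example_errs[] = { "Unknown field name" };

static void example_report_bad_field(struct trace_array *tr,
                                     char *cmd, const char *field)
{
        tracing_log_err(tr, "example_cmd", cmd, example_errs,
                        0 /* index into example_errs */,
                        err_pos(cmd, field));
}

The entry then shows up in tracing/error_log with the command echoed under the message and a caret at the reported position; writing an empty string to the file in truncation mode frees the stored entries.
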
| @@ -7926,7 +8206,7 @@ static const struct file_operations buffer_percent_fops = { | |||
| 7926 | .llseek = default_llseek, | 8206 | .llseek = default_llseek, |
| 7927 | }; | 8207 | }; |
| 7928 | 8208 | ||
| 7929 | struct dentry *trace_instance_dir; | 8209 | static struct dentry *trace_instance_dir; |
| 7930 | 8210 | ||
| 7931 | static void | 8211 | static void |
| 7932 | init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); | 8212 | init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); |
| @@ -8033,7 +8313,7 @@ static void update_tracer_options(struct trace_array *tr) | |||
| 8033 | mutex_unlock(&trace_types_lock); | 8313 | mutex_unlock(&trace_types_lock); |
| 8034 | } | 8314 | } |
| 8035 | 8315 | ||
| 8036 | static int instance_mkdir(const char *name) | 8316 | struct trace_array *trace_array_create(const char *name) |
| 8037 | { | 8317 | { |
| 8038 | struct trace_array *tr; | 8318 | struct trace_array *tr; |
| 8039 | int ret; | 8319 | int ret; |
| @@ -8072,6 +8352,7 @@ static int instance_mkdir(const char *name) | |||
| 8072 | INIT_LIST_HEAD(&tr->systems); | 8352 | INIT_LIST_HEAD(&tr->systems); |
| 8073 | INIT_LIST_HEAD(&tr->events); | 8353 | INIT_LIST_HEAD(&tr->events); |
| 8074 | INIT_LIST_HEAD(&tr->hist_vars); | 8354 | INIT_LIST_HEAD(&tr->hist_vars); |
| 8355 | INIT_LIST_HEAD(&tr->err_log); | ||
| 8075 | 8356 | ||
| 8076 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) | 8357 | if (allocate_trace_buffers(tr, trace_buf_size) < 0) |
| 8077 | goto out_free_tr; | 8358 | goto out_free_tr; |
| @@ -8097,7 +8378,7 @@ static int instance_mkdir(const char *name) | |||
| 8097 | mutex_unlock(&trace_types_lock); | 8378 | mutex_unlock(&trace_types_lock); |
| 8098 | mutex_unlock(&event_mutex); | 8379 | mutex_unlock(&event_mutex); |
| 8099 | 8380 | ||
| 8100 | return 0; | 8381 | return tr; |
| 8101 | 8382 | ||
| 8102 | out_free_tr: | 8383 | out_free_tr: |
| 8103 | free_trace_buffers(tr); | 8384 | free_trace_buffers(tr); |
| @@ -8109,33 +8390,21 @@ static int instance_mkdir(const char *name) | |||
| 8109 | mutex_unlock(&trace_types_lock); | 8390 | mutex_unlock(&trace_types_lock); |
| 8110 | mutex_unlock(&event_mutex); | 8391 | mutex_unlock(&event_mutex); |
| 8111 | 8392 | ||
| 8112 | return ret; | 8393 | return ERR_PTR(ret); |
| 8394 | } | ||
| 8395 | EXPORT_SYMBOL_GPL(trace_array_create); | ||
| 8113 | 8396 | ||
| 8397 | static int instance_mkdir(const char *name) | ||
| 8398 | { | ||
| 8399 | return PTR_ERR_OR_ZERO(trace_array_create(name)); | ||
| 8114 | } | 8400 | } |
| 8115 | 8401 | ||
| 8116 | static int instance_rmdir(const char *name) | 8402 | static int __remove_instance(struct trace_array *tr) |
| 8117 | { | 8403 | { |
| 8118 | struct trace_array *tr; | ||
| 8119 | int found = 0; | ||
| 8120 | int ret; | ||
| 8121 | int i; | 8404 | int i; |
| 8122 | 8405 | ||
| 8123 | mutex_lock(&event_mutex); | ||
| 8124 | mutex_lock(&trace_types_lock); | ||
| 8125 | |||
| 8126 | ret = -ENODEV; | ||
| 8127 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | ||
| 8128 | if (tr->name && strcmp(tr->name, name) == 0) { | ||
| 8129 | found = 1; | ||
| 8130 | break; | ||
| 8131 | } | ||
| 8132 | } | ||
| 8133 | if (!found) | ||
| 8134 | goto out_unlock; | ||
| 8135 | |||
| 8136 | ret = -EBUSY; | ||
| 8137 | if (tr->ref || (tr->current_trace && tr->current_trace->ref)) | 8406 | if (tr->ref || (tr->current_trace && tr->current_trace->ref)) |
| 8138 | goto out_unlock; | 8407 | return -EBUSY; |
| 8139 | 8408 | ||
| 8140 | list_del(&tr->list); | 8409 | list_del(&tr->list); |
| 8141 | 8410 | ||
| @@ -8161,10 +8430,46 @@ static int instance_rmdir(const char *name) | |||
| 8161 | free_cpumask_var(tr->tracing_cpumask); | 8430 | free_cpumask_var(tr->tracing_cpumask); |
| 8162 | kfree(tr->name); | 8431 | kfree(tr->name); |
| 8163 | kfree(tr); | 8432 | kfree(tr); |
| 8433 | tr = NULL; | ||
| 8164 | 8434 | ||
| 8165 | ret = 0; | 8435 | return 0; |
| 8436 | } | ||
| 8437 | |||
| 8438 | int trace_array_destroy(struct trace_array *tr) | ||
| 8439 | { | ||
| 8440 | int ret; | ||
| 8441 | |||
| 8442 | if (!tr) | ||
| 8443 | return -EINVAL; | ||
| 8444 | |||
| 8445 | mutex_lock(&event_mutex); | ||
| 8446 | mutex_lock(&trace_types_lock); | ||
| 8447 | |||
| 8448 | ret = __remove_instance(tr); | ||
| 8449 | |||
| 8450 | mutex_unlock(&trace_types_lock); | ||
| 8451 | mutex_unlock(&event_mutex); | ||
| 8452 | |||
| 8453 | return ret; | ||
| 8454 | } | ||
| 8455 | EXPORT_SYMBOL_GPL(trace_array_destroy); | ||
| 8456 | |||
| 8457 | static int instance_rmdir(const char *name) | ||
| 8458 | { | ||
| 8459 | struct trace_array *tr; | ||
| 8460 | int ret; | ||
| 8461 | |||
| 8462 | mutex_lock(&event_mutex); | ||
| 8463 | mutex_lock(&trace_types_lock); | ||
| 8464 | |||
| 8465 | ret = -ENODEV; | ||
| 8466 | list_for_each_entry(tr, &ftrace_trace_arrays, list) { | ||
| 8467 | if (tr->name && strcmp(tr->name, name) == 0) { | ||
| 8468 | ret = __remove_instance(tr); | ||
| 8469 | break; | ||
| 8470 | } | ||
| 8471 | } | ||
| 8166 | 8472 | ||
| 8167 | out_unlock: | ||
| 8168 | mutex_unlock(&trace_types_lock); | 8473 | mutex_unlock(&trace_types_lock); |
| 8169 | mutex_unlock(&event_mutex); | 8474 | mutex_unlock(&event_mutex); |
| 8170 | 8475 | ||
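
trace_array_create() and trace_array_destroy() are exported, so a module can own a private trace instance rather than depending on a user-space mkdir under instances/. A hedged sketch; the module name and message are illustrative, and trace_array_printk() is the helper exported earlier in this patch:

static struct trace_array *example_tr;

static int __init example_instance_init(void)
{
        example_tr = trace_array_create("example");
        if (IS_ERR(example_tr))
                return PTR_ERR(example_tr);

        trace_array_printk(example_tr, _THIS_IP_, "instance created\n");
        return 0;
}

static void __exit example_instance_exit(void)
{
        trace_array_destroy(example_tr);
}

module_init(example_instance_init);
module_exit(example_instance_exit);
MODULE_LICENSE("GPL");
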
| @@ -8254,6 +8559,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) | |||
| 8254 | tr, &snapshot_fops); | 8559 | tr, &snapshot_fops); |
| 8255 | #endif | 8560 | #endif |
| 8256 | 8561 | ||
| 8562 | trace_create_file("error_log", 0644, d_tracer, | ||
| 8563 | tr, &tracing_err_log_fops); | ||
| 8564 | |||
| 8257 | for_each_tracing_cpu(cpu) | 8565 | for_each_tracing_cpu(cpu) |
| 8258 | tracing_init_tracefs_percpu(tr, cpu); | 8566 | tracing_init_tracefs_percpu(tr, cpu); |
| 8259 | 8567 | ||
| @@ -8310,10 +8618,6 @@ struct dentry *tracing_init_dentry(void) | |||
| 8310 | */ | 8618 | */ |
| 8311 | tr->dir = debugfs_create_automount("tracing", NULL, | 8619 | tr->dir = debugfs_create_automount("tracing", NULL, |
| 8312 | trace_automount, NULL); | 8620 | trace_automount, NULL); |
| 8313 | if (!tr->dir) { | ||
| 8314 | pr_warn_once("Could not create debugfs directory 'tracing'\n"); | ||
| 8315 | return ERR_PTR(-ENOMEM); | ||
| 8316 | } | ||
| 8317 | 8621 | ||
| 8318 | return NULL; | 8622 | return NULL; |
| 8319 | } | 8623 | } |
| @@ -8616,12 +8920,8 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) | |||
| 8616 | 8920 | ||
| 8617 | cnt++; | 8921 | cnt++; |
| 8618 | 8922 | ||
| 8619 | /* reset all but tr, trace, and overruns */ | 8923 | trace_iterator_reset(&iter); |
| 8620 | memset(&iter.seq, 0, | ||
| 8621 | sizeof(struct trace_iterator) - | ||
| 8622 | offsetof(struct trace_iterator, seq)); | ||
| 8623 | iter.iter_flags |= TRACE_FILE_LAT_FMT; | 8924 | iter.iter_flags |= TRACE_FILE_LAT_FMT; |
| 8624 | iter.pos = -1; | ||
| 8625 | 8925 | ||
| 8626 | if (trace_find_next_entry_inc(&iter) != NULL) { | 8926 | if (trace_find_next_entry_inc(&iter) != NULL) { |
| 8627 | int ret; | 8927 | int ret; |
| @@ -8839,6 +9139,7 @@ __init static int tracer_alloc_buffers(void) | |||
| 8839 | INIT_LIST_HEAD(&global_trace.systems); | 9139 | INIT_LIST_HEAD(&global_trace.systems); |
| 8840 | INIT_LIST_HEAD(&global_trace.events); | 9140 | INIT_LIST_HEAD(&global_trace.events); |
| 8841 | INIT_LIST_HEAD(&global_trace.hist_vars); | 9141 | INIT_LIST_HEAD(&global_trace.hist_vars); |
| 9142 | INIT_LIST_HEAD(&global_trace.err_log); | ||
| 8842 | list_add(&global_trace.list, &ftrace_trace_arrays); | 9143 | list_add(&global_trace.list, &ftrace_trace_arrays); |
| 8843 | 9144 | ||
| 8844 | apply_trace_boot_options(); | 9145 | apply_trace_boot_options(); |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 639047b259d7..005f08629b8b 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -15,7 +15,6 @@ | |||
| 15 | #include <linux/trace_seq.h> | 15 | #include <linux/trace_seq.h> |
| 16 | #include <linux/trace_events.h> | 16 | #include <linux/trace_events.h> |
| 17 | #include <linux/compiler.h> | 17 | #include <linux/compiler.h> |
| 18 | #include <linux/trace_seq.h> | ||
| 19 | #include <linux/glob.h> | 18 | #include <linux/glob.h> |
| 20 | 19 | ||
| 21 | #ifdef CONFIG_FTRACE_SYSCALLS | 20 | #ifdef CONFIG_FTRACE_SYSCALLS |
| @@ -293,11 +292,13 @@ struct trace_array { | |||
| 293 | int nr_topts; | 292 | int nr_topts; |
| 294 | bool clear_trace; | 293 | bool clear_trace; |
| 295 | int buffer_percent; | 294 | int buffer_percent; |
| 295 | unsigned int n_err_log_entries; | ||
| 296 | struct tracer *current_trace; | 296 | struct tracer *current_trace; |
| 297 | unsigned int trace_flags; | 297 | unsigned int trace_flags; |
| 298 | unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; | 298 | unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; |
| 299 | unsigned int flags; | 299 | unsigned int flags; |
| 300 | raw_spinlock_t start_lock; | 300 | raw_spinlock_t start_lock; |
| 301 | struct list_head err_log; | ||
| 301 | struct dentry *dir; | 302 | struct dentry *dir; |
| 302 | struct dentry *options; | 303 | struct dentry *options; |
| 303 | struct dentry *percpu_dir; | 304 | struct dentry *percpu_dir; |
| @@ -719,6 +720,9 @@ void trace_init_global_iter(struct trace_iterator *iter); | |||
| 719 | 720 | ||
| 720 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); | 721 | void tracing_iter_reset(struct trace_iterator *iter, int cpu); |
| 721 | 722 | ||
| 723 | unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu); | ||
| 724 | unsigned long trace_total_entries(struct trace_array *tr); | ||
| 725 | |||
| 722 | void trace_function(struct trace_array *tr, | 726 | void trace_function(struct trace_array *tr, |
| 723 | unsigned long ip, | 727 | unsigned long ip, |
| 724 | unsigned long parent_ip, | 728 | unsigned long parent_ip, |
| @@ -1545,7 +1549,8 @@ extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, | |||
| 1545 | extern void print_subsystem_event_filter(struct event_subsystem *system, | 1549 | extern void print_subsystem_event_filter(struct event_subsystem *system, |
| 1546 | struct trace_seq *s); | 1550 | struct trace_seq *s); |
| 1547 | extern int filter_assign_type(const char *type); | 1551 | extern int filter_assign_type(const char *type); |
| 1548 | extern int create_event_filter(struct trace_event_call *call, | 1552 | extern int create_event_filter(struct trace_array *tr, |
| 1553 | struct trace_event_call *call, | ||
| 1549 | char *filter_str, bool set_str, | 1554 | char *filter_str, bool set_str, |
| 1550 | struct event_filter **filterp); | 1555 | struct event_filter **filterp); |
| 1551 | extern void free_event_filter(struct event_filter *filter); | 1556 | extern void free_event_filter(struct event_filter *filter); |
| @@ -1876,6 +1881,11 @@ extern ssize_t trace_parse_run_command(struct file *file, | |||
| 1876 | const char __user *buffer, size_t count, loff_t *ppos, | 1881 | const char __user *buffer, size_t count, loff_t *ppos, |
| 1877 | int (*createfn)(int, char**)); | 1882 | int (*createfn)(int, char**)); |
| 1878 | 1883 | ||
| 1884 | extern unsigned int err_pos(char *cmd, const char *str); | ||
| 1885 | extern void tracing_log_err(struct trace_array *tr, | ||
| 1886 | const char *loc, const char *cmd, | ||
| 1887 | const char **errs, u8 type, u8 pos); | ||
| 1888 | |||
| 1879 | /* | 1889 | /* |
| 1880 | * Normal trace_printk() and friends allocates special buffers | 1890 | * Normal trace_printk() and friends allocates special buffers |
| 1881 | * to do the manipulation, as well as saves the print formats | 1891 | * to do the manipulation, as well as saves the print formats |
| @@ -1956,4 +1966,22 @@ static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } | |||
| 1956 | 1966 | ||
| 1957 | extern struct trace_iterator *tracepoint_print_iter; | 1967 | extern struct trace_iterator *tracepoint_print_iter; |
| 1958 | 1968 | ||
| 1969 | /* | ||
| 1970 | * Reset the state of the trace_iterator so that it can read consumed data. | ||
| 1971 | * Normally, the trace_iterator is used for reading the data when it is not | ||
| 1972 | * consumed, and must retain state. | ||
| 1973 | */ | ||
| 1974 | static __always_inline void trace_iterator_reset(struct trace_iterator *iter) | ||
| 1975 | { | ||
| 1976 | const size_t offset = offsetof(struct trace_iterator, seq); | ||
| 1977 | |||
| 1978 | /* | ||
| 1979 | * Keep gcc from complaining about overwriting more than just one | ||
| 1980 | * member in the structure. | ||
| 1981 | */ | ||
| 1982 | memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset); | ||
| 1983 | |||
| 1984 | iter->pos = -1; | ||
| 1985 | } | ||
| 1986 | |||
| 1959 | #endif /* _LINUX_KERNEL_TRACE_H */ | 1987 | #endif /* _LINUX_KERNEL_TRACE_H */ |
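Note on the new trace_iterator_reset() helper above: it clears only the part of struct trace_iterator that starts at its 'seq' member, so anything placed before 'seq' survives across resets, and pos is then re-initialized to -1. A minimal standalone sketch of that offsetof()+memset() partial-reset idiom (the struct and names here are illustrative, not from the kernel):

	#include <stddef.h>
	#include <string.h>

	struct iter_sketch {
		void *private;		/* persistent: kept across resets */
		char buf[64];		/* volatile state starts here ... */
		long pos;
		int idx;
	};

	/* Zero everything from 'buf' to the end of the struct, mirroring
	 * the trick used by trace_iterator_reset() above. */
	static inline void iter_sketch_reset(struct iter_sketch *it)
	{
		const size_t offset = offsetof(struct iter_sketch, buf);

		memset((char *)it + offset, 0, sizeof(*it) - offset);
		it->pos = -1;		/* sentinel meaning "not started" */
	}

Casting through (char *) and using a single memset keeps the compiler from warning about overwriting multiple members, which is exactly the concern the in-kernel comment mentions.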
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5b3b0c3c8a47..0ce3db67f556 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -832,6 +832,7 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) | |||
| 832 | 832 | ||
| 833 | return ret; | 833 | return ret; |
| 834 | } | 834 | } |
| 835 | EXPORT_SYMBOL_GPL(ftrace_set_clr_event); | ||
| 835 | 836 | ||
| 836 | /** | 837 | /** |
| 837 | * trace_set_clr_event - enable or disable an event | 838 | * trace_set_clr_event - enable or disable an event |
| @@ -1318,9 +1319,6 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) | |||
| 1318 | char buf[32]; | 1319 | char buf[32]; |
| 1319 | int len; | 1320 | int len; |
| 1320 | 1321 | ||
| 1321 | if (*ppos) | ||
| 1322 | return 0; | ||
| 1323 | |||
| 1324 | if (unlikely(!id)) | 1322 | if (unlikely(!id)) |
| 1325 | return -ENODEV; | 1323 | return -ENODEV; |
| 1326 | 1324 | ||
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 05a66493a164..5079d1db3754 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
| @@ -66,7 +66,8 @@ static const char * ops[] = { OPS }; | |||
| 66 | C(INVALID_FILTER, "Meaningless filter expression"), \ | 66 | C(INVALID_FILTER, "Meaningless filter expression"), \ |
| 67 | C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ | 67 | C(IP_FIELD_ONLY, "Only 'ip' field is supported for function trace"), \ |
| 68 | C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ | 68 | C(INVALID_VALUE, "Invalid value (did you forget quotes)?"), \ |
| 69 | C(NO_FILTER, "No filter found"), | 69 | C(ERRNO, "Error"), \ |
| 70 | C(NO_FILTER, "No filter found") | ||
| 70 | 71 | ||
| 71 | #undef C | 72 | #undef C |
| 72 | #define C(a, b) FILT_ERR_##a | 73 | #define C(a, b) FILT_ERR_##a |
| @@ -76,7 +77,7 @@ enum { ERRORS }; | |||
| 76 | #undef C | 77 | #undef C |
| 77 | #define C(a, b) b | 78 | #define C(a, b) b |
| 78 | 79 | ||
| 79 | static char *err_text[] = { ERRORS }; | 80 | static const char *err_text[] = { ERRORS }; |
| 80 | 81 | ||
| 81 | /* Called after a '!' character but "!=" and "!~" are not "not"s */ | 82 | /* Called after a '!' character but "!=" and "!~" are not "not"s */ |
| 82 | static bool is_not(const char *str) | 83 | static bool is_not(const char *str) |
| @@ -427,7 +428,7 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, | |||
| 427 | op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); | 428 | op_stack = kmalloc_array(nr_parens, sizeof(*op_stack), GFP_KERNEL); |
| 428 | if (!op_stack) | 429 | if (!op_stack) |
| 429 | return ERR_PTR(-ENOMEM); | 430 | return ERR_PTR(-ENOMEM); |
| 430 | prog_stack = kmalloc_array(nr_preds, sizeof(*prog_stack), GFP_KERNEL); | 431 | prog_stack = kcalloc(nr_preds, sizeof(*prog_stack), GFP_KERNEL); |
| 431 | if (!prog_stack) { | 432 | if (!prog_stack) { |
| 432 | parse_error(pe, -ENOMEM, 0); | 433 | parse_error(pe, -ENOMEM, 0); |
| 433 | goto out_free; | 434 | goto out_free; |
| @@ -578,7 +579,11 @@ predicate_parse(const char *str, int nr_parens, int nr_preds, | |||
| 578 | out_free: | 579 | out_free: |
| 579 | kfree(op_stack); | 580 | kfree(op_stack); |
| 580 | kfree(inverts); | 581 | kfree(inverts); |
| 581 | kfree(prog_stack); | 582 | if (prog_stack) { |
| 583 | for (i = 0; prog_stack[i].pred; i++) | ||
| 584 | kfree(prog_stack[i].pred); | ||
| 585 | kfree(prog_stack); | ||
| 586 | } | ||
| 582 | return ERR_PTR(ret); | 587 | return ERR_PTR(ret); |
| 583 | } | 588 | } |
| 584 | 589 | ||
| @@ -919,7 +924,8 @@ static void remove_filter_string(struct event_filter *filter) | |||
| 919 | filter->filter_string = NULL; | 924 | filter->filter_string = NULL; |
| 920 | } | 925 | } |
| 921 | 926 | ||
| 922 | static void append_filter_err(struct filter_parse_error *pe, | 927 | static void append_filter_err(struct trace_array *tr, |
| 928 | struct filter_parse_error *pe, | ||
| 923 | struct event_filter *filter) | 929 | struct event_filter *filter) |
| 924 | { | 930 | { |
| 925 | struct trace_seq *s; | 931 | struct trace_seq *s; |
| @@ -947,8 +953,14 @@ static void append_filter_err(struct filter_parse_error *pe, | |||
| 947 | if (pe->lasterr > 0) { | 953 | if (pe->lasterr > 0) { |
| 948 | trace_seq_printf(s, "\n%*s", pos, "^"); | 954 | trace_seq_printf(s, "\n%*s", pos, "^"); |
| 949 | trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); | 955 | trace_seq_printf(s, "\nparse_error: %s\n", err_text[pe->lasterr]); |
| 956 | tracing_log_err(tr, "event filter parse error", | ||
| 957 | filter->filter_string, err_text, | ||
| 958 | pe->lasterr, pe->lasterr_pos); | ||
| 950 | } else { | 959 | } else { |
| 951 | trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); | 960 | trace_seq_printf(s, "\nError: (%d)\n", pe->lasterr); |
| 961 | tracing_log_err(tr, "event filter parse error", | ||
| 962 | filter->filter_string, err_text, | ||
| 963 | FILT_ERR_ERRNO, 0); | ||
| 952 | } | 964 | } |
| 953 | trace_seq_putc(s, 0); | 965 | trace_seq_putc(s, 0); |
| 954 | buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); | 966 | buf = kmemdup_nul(s->buffer, s->seq.len, GFP_KERNEL); |
| @@ -1214,30 +1226,30 @@ static int parse_pred(const char *str, void *data, | |||
| 1214 | * (perf doesn't use it) and grab everything. | 1226 | * (perf doesn't use it) and grab everything. |
| 1215 | */ | 1227 | */ |
| 1216 | if (strcmp(field->name, "ip") != 0) { | 1228 | if (strcmp(field->name, "ip") != 0) { |
| 1217 | parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); | 1229 | parse_error(pe, FILT_ERR_IP_FIELD_ONLY, pos + i); |
| 1218 | goto err_free; | 1230 | goto err_free; |
| 1219 | } | 1231 | } |
| 1220 | pred->fn = filter_pred_none; | 1232 | pred->fn = filter_pred_none; |
| 1221 | 1233 | ||
| 1222 | /* | 1234 | /* |
| 1223 | * Quotes are not required, but if they exist then we need | 1235 | * Quotes are not required, but if they exist then we need |
| 1224 | * to read them till we hit a matching one. | 1236 | * to read them till we hit a matching one. |
| 1225 | */ | 1237 | */ |
| 1226 | if (str[i] == '\'' || str[i] == '"') | 1238 | if (str[i] == '\'' || str[i] == '"') |
| 1227 | q = str[i]; | 1239 | q = str[i]; |
| 1228 | else | 1240 | else |
| 1229 | q = 0; | 1241 | q = 0; |
| 1230 | 1242 | ||
| 1231 | for (i++; str[i]; i++) { | 1243 | for (i++; str[i]; i++) { |
| 1232 | if (q && str[i] == q) | 1244 | if (q && str[i] == q) |
| 1233 | break; | 1245 | break; |
| 1234 | if (!q && (str[i] == ')' || str[i] == '&' || | 1246 | if (!q && (str[i] == ')' || str[i] == '&' || |
| 1235 | str[i] == '|')) | 1247 | str[i] == '|')) |
| 1236 | break; | 1248 | break; |
| 1237 | } | 1249 | } |
| 1238 | /* Skip quotes */ | 1250 | /* Skip quotes */ |
| 1239 | if (q) | 1251 | if (q) |
| 1240 | s++; | 1252 | s++; |
| 1241 | len = i - s; | 1253 | len = i - s; |
| 1242 | if (len >= MAX_FILTER_STR_VAL) { | 1254 | if (len >= MAX_FILTER_STR_VAL) { |
| 1243 | parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); | 1255 | parse_error(pe, FILT_ERR_OPERAND_TOO_LONG, pos + i); |
| @@ -1600,7 +1612,7 @@ static int process_system_preds(struct trace_subsystem_dir *dir, | |||
| 1600 | if (err) { | 1612 | if (err) { |
| 1601 | filter_disable(file); | 1613 | filter_disable(file); |
| 1602 | parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); | 1614 | parse_error(pe, FILT_ERR_BAD_SUBSYS_FILTER, 0); |
| 1603 | append_filter_err(pe, filter); | 1615 | append_filter_err(tr, pe, filter); |
| 1604 | } else | 1616 | } else |
| 1605 | event_set_filtered_flag(file); | 1617 | event_set_filtered_flag(file); |
| 1606 | 1618 | ||
| @@ -1712,7 +1724,8 @@ static void create_filter_finish(struct filter_parse_error *pe) | |||
| 1712 | * information if @set_str is %true and the caller is responsible for | 1724 | * information if @set_str is %true and the caller is responsible for |
| 1713 | * freeing it. | 1725 | * freeing it. |
| 1714 | */ | 1726 | */ |
| 1715 | static int create_filter(struct trace_event_call *call, | 1727 | static int create_filter(struct trace_array *tr, |
| 1728 | struct trace_event_call *call, | ||
| 1716 | char *filter_string, bool set_str, | 1729 | char *filter_string, bool set_str, |
| 1717 | struct event_filter **filterp) | 1730 | struct event_filter **filterp) |
| 1718 | { | 1731 | { |
| @@ -1729,17 +1742,18 @@ static int create_filter(struct trace_event_call *call, | |||
| 1729 | 1742 | ||
| 1730 | err = process_preds(call, filter_string, *filterp, pe); | 1743 | err = process_preds(call, filter_string, *filterp, pe); |
| 1731 | if (err && set_str) | 1744 | if (err && set_str) |
| 1732 | append_filter_err(pe, *filterp); | 1745 | append_filter_err(tr, pe, *filterp); |
| 1733 | create_filter_finish(pe); | 1746 | create_filter_finish(pe); |
| 1734 | 1747 | ||
| 1735 | return err; | 1748 | return err; |
| 1736 | } | 1749 | } |
| 1737 | 1750 | ||
| 1738 | int create_event_filter(struct trace_event_call *call, | 1751 | int create_event_filter(struct trace_array *tr, |
| 1752 | struct trace_event_call *call, | ||
| 1739 | char *filter_str, bool set_str, | 1753 | char *filter_str, bool set_str, |
| 1740 | struct event_filter **filterp) | 1754 | struct event_filter **filterp) |
| 1741 | { | 1755 | { |
| 1742 | return create_filter(call, filter_str, set_str, filterp); | 1756 | return create_filter(tr, call, filter_str, set_str, filterp); |
| 1743 | } | 1757 | } |
| 1744 | 1758 | ||
| 1745 | /** | 1759 | /** |
| @@ -1766,7 +1780,7 @@ static int create_system_filter(struct trace_subsystem_dir *dir, | |||
| 1766 | kfree((*filterp)->filter_string); | 1780 | kfree((*filterp)->filter_string); |
| 1767 | (*filterp)->filter_string = NULL; | 1781 | (*filterp)->filter_string = NULL; |
| 1768 | } else { | 1782 | } else { |
| 1769 | append_filter_err(pe, *filterp); | 1783 | append_filter_err(tr, pe, *filterp); |
| 1770 | } | 1784 | } |
| 1771 | } | 1785 | } |
| 1772 | create_filter_finish(pe); | 1786 | create_filter_finish(pe); |
| @@ -1797,7 +1811,7 @@ int apply_event_filter(struct trace_event_file *file, char *filter_string) | |||
| 1797 | return 0; | 1811 | return 0; |
| 1798 | } | 1812 | } |
| 1799 | 1813 | ||
| 1800 | err = create_filter(call, filter_string, true, &filter); | 1814 | err = create_filter(file->tr, call, filter_string, true, &filter); |
| 1801 | 1815 | ||
| 1802 | /* | 1816 | /* |
| 1803 | * Always swap the call filter with the new filter | 1817 | * Always swap the call filter with the new filter |
| @@ -2053,7 +2067,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, | |||
| 2053 | if (event->filter) | 2067 | if (event->filter) |
| 2054 | goto out_unlock; | 2068 | goto out_unlock; |
| 2055 | 2069 | ||
| 2056 | err = create_filter(call, filter_str, false, &filter); | 2070 | err = create_filter(NULL, call, filter_str, false, &filter); |
| 2057 | if (err) | 2071 | if (err) |
| 2058 | goto free_filter; | 2072 | goto free_filter; |
| 2059 | 2073 | ||
| @@ -2202,8 +2216,8 @@ static __init int ftrace_test_event_filter(void) | |||
| 2202 | struct test_filter_data_t *d = &test_filter_data[i]; | 2216 | struct test_filter_data_t *d = &test_filter_data[i]; |
| 2203 | int err; | 2217 | int err; |
| 2204 | 2218 | ||
| 2205 | err = create_filter(&event_ftrace_test_filter, d->filter, | 2219 | err = create_filter(NULL, &event_ftrace_test_filter, |
| 2206 | false, &filter); | 2220 | d->filter, false, &filter); |
| 2207 | if (err) { | 2221 | if (err) { |
| 2208 | printk(KERN_INFO | 2222 | printk(KERN_INFO |
| 2209 | "Failed to get filter for '%s', err %d\n", | 2223 | "Failed to get filter for '%s', err %d\n", |
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a1d20421f4b0..ca6b0dff60c5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c | |||
| @@ -22,6 +22,57 @@ | |||
| 22 | 22 | ||
| 23 | #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ | 23 | #define STR_VAR_LEN_MAX 32 /* must be multiple of sizeof(u64) */ |
| 24 | 24 | ||
| 25 | #define ERRORS \ | ||
| 26 | C(NONE, "No error"), \ | ||
| 27 | C(DUPLICATE_VAR, "Variable already defined"), \ | ||
| 28 | C(VAR_NOT_UNIQUE, "Variable name not unique, need to use fully qualified name (subsys.event.var) for variable"), \ | ||
| 29 | C(TOO_MANY_VARS, "Too many variables defined"), \ | ||
| 30 | C(MALFORMED_ASSIGNMENT, "Malformed assignment"), \ | ||
| 31 | C(NAMED_MISMATCH, "Named hist trigger doesn't match existing named trigger (includes variables)"), \ | ||
| 32 | C(TRIGGER_EEXIST, "Hist trigger already exists"), \ | ||
| 33 | C(TRIGGER_ENOENT_CLEAR, "Can't clear or continue a nonexistent hist trigger"), \ | ||
| 34 | C(SET_CLOCK_FAIL, "Couldn't set trace_clock"), \ | ||
| 35 | C(BAD_FIELD_MODIFIER, "Invalid field modifier"), \ | ||
| 36 | C(TOO_MANY_SUBEXPR, "Too many subexpressions (3 max)"), \ | ||
| 37 | C(TIMESTAMP_MISMATCH, "Timestamp units in expression don't match"), \ | ||
| 38 | C(TOO_MANY_FIELD_VARS, "Too many field variables defined"), \ | ||
| 39 | C(EVENT_FILE_NOT_FOUND, "Event file not found"), \ | ||
| 40 | C(HIST_NOT_FOUND, "Matching event histogram not found"), \ | ||
| 41 | C(HIST_CREATE_FAIL, "Couldn't create histogram for field"), \ | ||
| 42 | C(SYNTH_VAR_NOT_FOUND, "Couldn't find synthetic variable"), \ | ||
| 43 | C(SYNTH_EVENT_NOT_FOUND,"Couldn't find synthetic event"), \ | ||
| 44 | C(SYNTH_TYPE_MISMATCH, "Param type doesn't match synthetic event field type"), \ | ||
| 45 | C(SYNTH_COUNT_MISMATCH, "Param count doesn't match synthetic event field count"), \ | ||
| 46 | C(FIELD_VAR_PARSE_FAIL, "Couldn't parse field variable"), \ | ||
| 47 | C(VAR_CREATE_FIND_FAIL, "Couldn't create or find variable"), \ | ||
| 48 | C(ONX_NOT_VAR, "For onmax(x) or onchange(x), x must be a variable"), \ | ||
| 49 | C(ONX_VAR_NOT_FOUND, "Couldn't find onmax or onchange variable"), \ | ||
| 50 | C(ONX_VAR_CREATE_FAIL, "Couldn't create onmax or onchange variable"), \ | ||
| 51 | C(FIELD_VAR_CREATE_FAIL,"Couldn't create field variable"), \ | ||
| 52 | C(TOO_MANY_PARAMS, "Too many action params"), \ | ||
| 53 | C(PARAM_NOT_FOUND, "Couldn't find param"), \ | ||
| 54 | C(INVALID_PARAM, "Invalid action param"), \ | ||
| 55 | C(ACTION_NOT_FOUND, "No action found"), \ | ||
| 56 | C(NO_SAVE_PARAMS, "No params found for save()"), \ | ||
| 57 | C(TOO_MANY_SAVE_ACTIONS,"Can't have more than one save() action per hist"), \ | ||
| 58 | C(ACTION_MISMATCH, "Handler doesn't support action"), \ | ||
| 59 | C(NO_CLOSING_PAREN, "No closing paren found"), \ | ||
| 60 | C(SUBSYS_NOT_FOUND, "Missing subsystem"), \ | ||
| 61 | C(INVALID_SUBSYS_EVENT, "Invalid subsystem or event name"), \ | ||
| 62 | C(INVALID_REF_KEY, "Using variable references in keys not supported"), \ | ||
| 63 | C(VAR_NOT_FOUND, "Couldn't find variable"), \ | ||
| 64 | C(FIELD_NOT_FOUND, "Couldn't find field"), | ||
| 65 | |||
| 66 | #undef C | ||
| 67 | #define C(a, b) HIST_ERR_##a | ||
| 68 | |||
| 69 | enum { ERRORS }; | ||
| 70 | |||
| 71 | #undef C | ||
| 72 | #define C(a, b) b | ||
| 73 | |||
| 74 | static const char *err_text[] = { ERRORS }; | ||
| 75 | |||
| 25 | struct hist_field; | 76 | struct hist_field; |
| 26 | 77 | ||
| 27 | typedef u64 (*hist_field_fn_t) (struct hist_field *field, | 78 | typedef u64 (*hist_field_fn_t) (struct hist_field *field, |
| @@ -535,62 +586,49 @@ static struct track_data *track_data_alloc(unsigned int key_len, | |||
| 535 | return data; | 586 | return data; |
| 536 | } | 587 | } |
| 537 | 588 | ||
| 538 | static char last_hist_cmd[MAX_FILTER_STR_VAL]; | 589 | static char last_cmd[MAX_FILTER_STR_VAL]; |
| 539 | static char hist_err_str[MAX_FILTER_STR_VAL]; | 590 | static char last_cmd_loc[MAX_FILTER_STR_VAL]; |
| 540 | 591 | ||
| 541 | static void last_cmd_set(char *str) | 592 | static int errpos(char *str) |
| 542 | { | 593 | { |
| 543 | if (!str) | 594 | return err_pos(last_cmd, str); |
| 544 | return; | ||
| 545 | |||
| 546 | strncpy(last_hist_cmd, str, MAX_FILTER_STR_VAL - 1); | ||
| 547 | } | 595 | } |
| 548 | 596 | ||
| 549 | static void hist_err(char *str, char *var) | 597 | static void last_cmd_set(struct trace_event_file *file, char *str) |
| 550 | { | 598 | { |
| 551 | int maxlen = MAX_FILTER_STR_VAL - 1; | 599 | const char *system = NULL, *name = NULL; |
| 600 | struct trace_event_call *call; | ||
| 552 | 601 | ||
| 553 | if (!str) | 602 | if (!str) |
| 554 | return; | 603 | return; |
| 555 | 604 | ||
| 556 | if (strlen(hist_err_str)) | 605 | strncpy(last_cmd, str, MAX_FILTER_STR_VAL - 1); |
| 557 | return; | ||
| 558 | 606 | ||
| 559 | if (!var) | 607 | if (file) { |
| 560 | var = ""; | 608 | call = file->event_call; |
| 561 | 609 | ||
| 562 | if (strlen(hist_err_str) + strlen(str) + strlen(var) > maxlen) | 610 | system = call->class->system; |
| 563 | return; | 611 | if (system) { |
| 612 | name = trace_event_name(call); | ||
| 613 | if (!name) | ||
| 614 | system = NULL; | ||
| 615 | } | ||
| 616 | } | ||
| 564 | 617 | ||
| 565 | strcat(hist_err_str, str); | 618 | if (system) |
| 566 | strcat(hist_err_str, var); | 619 | snprintf(last_cmd_loc, MAX_FILTER_STR_VAL, "hist:%s:%s", system, name); |
| 567 | } | 620 | } |
| 568 | 621 | ||
| 569 | static void hist_err_event(char *str, char *system, char *event, char *var) | 622 | static void hist_err(struct trace_array *tr, u8 err_type, u8 err_pos) |
| 570 | { | 623 | { |
| 571 | char err[MAX_FILTER_STR_VAL]; | 624 | tracing_log_err(tr, last_cmd_loc, last_cmd, err_text, |
| 572 | 625 | err_type, err_pos); | |
| 573 | if (system && var) | ||
| 574 | snprintf(err, MAX_FILTER_STR_VAL, "%s.%s.%s", system, event, var); | ||
| 575 | else if (system) | ||
| 576 | snprintf(err, MAX_FILTER_STR_VAL, "%s.%s", system, event); | ||
| 577 | else | ||
| 578 | strscpy(err, var, MAX_FILTER_STR_VAL); | ||
| 579 | |||
| 580 | hist_err(str, err); | ||
| 581 | } | 626 | } |
| 582 | 627 | ||
| 583 | static void hist_err_clear(void) | 628 | static void hist_err_clear(void) |
| 584 | { | 629 | { |
| 585 | hist_err_str[0] = '\0'; | 630 | last_cmd[0] = '\0'; |
| 586 | } | 631 | last_cmd_loc[0] = '\0'; |
| 587 | |||
| 588 | static bool have_hist_err(void) | ||
| 589 | { | ||
| 590 | if (strlen(hist_err_str)) | ||
| 591 | return true; | ||
| 592 | |||
| 593 | return false; | ||
| 594 | } | 632 | } |
| 595 | 633 | ||
| 596 | struct synth_trace_event { | 634 | struct synth_trace_event { |
| @@ -1719,7 +1757,7 @@ static struct trace_event_file *find_var_file(struct trace_array *tr, | |||
| 1719 | 1757 | ||
| 1720 | if (find_var_field(var_hist_data, var_name)) { | 1758 | if (find_var_field(var_hist_data, var_name)) { |
| 1721 | if (found) { | 1759 | if (found) { |
| 1722 | hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); | 1760 | hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, errpos(var_name)); |
| 1723 | return NULL; | 1761 | return NULL; |
| 1724 | } | 1762 | } |
| 1725 | 1763 | ||
| @@ -1770,7 +1808,8 @@ find_match_var(struct hist_trigger_data *hist_data, char *var_name) | |||
| 1770 | hist_field = find_file_var(file, var_name); | 1808 | hist_field = find_file_var(file, var_name); |
| 1771 | if (hist_field) { | 1809 | if (hist_field) { |
| 1772 | if (found) { | 1810 | if (found) { |
| 1773 | hist_err_event("Variable name not unique, need to use fully qualified name (subsys.event.var) for variable: ", system, event_name, var_name); | 1811 | hist_err(tr, HIST_ERR_VAR_NOT_UNIQUE, |
| 1812 | errpos(var_name)); | ||
| 1774 | return ERR_PTR(-EINVAL); | 1813 | return ERR_PTR(-EINVAL); |
| 1775 | } | 1814 | } |
| 1776 | 1815 | ||
| @@ -1815,6 +1854,9 @@ static u64 hist_field_var_ref(struct hist_field *hist_field, | |||
| 1815 | struct hist_elt_data *elt_data; | 1854 | struct hist_elt_data *elt_data; |
| 1816 | u64 var_val = 0; | 1855 | u64 var_val = 0; |
| 1817 | 1856 | ||
| 1857 | if (WARN_ON_ONCE(!elt)) | ||
| 1858 | return var_val; | ||
| 1859 | |||
| 1818 | elt_data = elt->private_data; | 1860 | elt_data = elt->private_data; |
| 1819 | var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; | 1861 | var_val = elt_data->var_ref_vals[hist_field->var_ref_idx]; |
| 1820 | 1862 | ||
| @@ -2002,11 +2044,11 @@ static int parse_action(char *str, struct hist_trigger_attrs *attrs) | |||
| 2002 | attrs->n_actions++; | 2044 | attrs->n_actions++; |
| 2003 | ret = 0; | 2045 | ret = 0; |
| 2004 | } | 2046 | } |
| 2005 | |||
| 2006 | return ret; | 2047 | return ret; |
| 2007 | } | 2048 | } |
| 2008 | 2049 | ||
| 2009 | static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) | 2050 | static int parse_assignment(struct trace_array *tr, |
| 2051 | char *str, struct hist_trigger_attrs *attrs) | ||
| 2010 | { | 2052 | { |
| 2011 | int ret = 0; | 2053 | int ret = 0; |
| 2012 | 2054 | ||
| @@ -2062,7 +2104,7 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) | |||
| 2062 | char *assignment; | 2104 | char *assignment; |
| 2063 | 2105 | ||
| 2064 | if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { | 2106 | if (attrs->n_assignments == TRACING_MAP_VARS_MAX) { |
| 2065 | hist_err("Too many variables defined: ", str); | 2107 | hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(str)); |
| 2066 | ret = -EINVAL; | 2108 | ret = -EINVAL; |
| 2067 | goto out; | 2109 | goto out; |
| 2068 | } | 2110 | } |
| @@ -2079,7 +2121,8 @@ static int parse_assignment(char *str, struct hist_trigger_attrs *attrs) | |||
| 2079 | return ret; | 2121 | return ret; |
| 2080 | } | 2122 | } |
| 2081 | 2123 | ||
| 2082 | static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) | 2124 | static struct hist_trigger_attrs * |
| 2125 | parse_hist_trigger_attrs(struct trace_array *tr, char *trigger_str) | ||
| 2083 | { | 2126 | { |
| 2084 | struct hist_trigger_attrs *attrs; | 2127 | struct hist_trigger_attrs *attrs; |
| 2085 | int ret = 0; | 2128 | int ret = 0; |
| @@ -2092,7 +2135,7 @@ static struct hist_trigger_attrs *parse_hist_trigger_attrs(char *trigger_str) | |||
| 2092 | char *str = strsep(&trigger_str, ":"); | 2135 | char *str = strsep(&trigger_str, ":"); |
| 2093 | 2136 | ||
| 2094 | if (strchr(str, '=')) { | 2137 | if (strchr(str, '=')) { |
| 2095 | ret = parse_assignment(str, attrs); | 2138 | ret = parse_assignment(tr, str, attrs); |
| 2096 | if (ret) | 2139 | if (ret) |
| 2097 | goto free; | 2140 | goto free; |
| 2098 | } else if (strcmp(str, "pause") == 0) | 2141 | } else if (strcmp(str, "pause") == 0) |
| @@ -2648,6 +2691,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, | |||
| 2648 | char *var_name) | 2691 | char *var_name) |
| 2649 | { | 2692 | { |
| 2650 | struct hist_field *var_field = NULL, *ref_field = NULL; | 2693 | struct hist_field *var_field = NULL, *ref_field = NULL; |
| 2694 | struct trace_array *tr = hist_data->event_file->tr; | ||
| 2651 | 2695 | ||
| 2652 | if (!is_var_ref(var_name)) | 2696 | if (!is_var_ref(var_name)) |
| 2653 | return NULL; | 2697 | return NULL; |
| @@ -2660,8 +2704,7 @@ static struct hist_field *parse_var_ref(struct hist_trigger_data *hist_data, | |||
| 2660 | system, event_name); | 2704 | system, event_name); |
| 2661 | 2705 | ||
| 2662 | if (!ref_field) | 2706 | if (!ref_field) |
| 2663 | hist_err_event("Couldn't find variable: $", | 2707 | hist_err(tr, HIST_ERR_VAR_NOT_FOUND, errpos(var_name)); |
| 2664 | system, event_name, var_name); | ||
| 2665 | 2708 | ||
| 2666 | return ref_field; | 2709 | return ref_field; |
| 2667 | } | 2710 | } |
| @@ -2672,6 +2715,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, | |||
| 2672 | { | 2715 | { |
| 2673 | struct ftrace_event_field *field = NULL; | 2716 | struct ftrace_event_field *field = NULL; |
| 2674 | char *field_name, *modifier, *str; | 2717 | char *field_name, *modifier, *str; |
| 2718 | struct trace_array *tr = file->tr; | ||
| 2675 | 2719 | ||
| 2676 | modifier = str = kstrdup(field_str, GFP_KERNEL); | 2720 | modifier = str = kstrdup(field_str, GFP_KERNEL); |
| 2677 | if (!modifier) | 2721 | if (!modifier) |
| @@ -2695,7 +2739,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, | |||
| 2695 | else if (strcmp(modifier, "usecs") == 0) | 2739 | else if (strcmp(modifier, "usecs") == 0) |
| 2696 | *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; | 2740 | *flags |= HIST_FIELD_FL_TIMESTAMP_USECS; |
| 2697 | else { | 2741 | else { |
| 2698 | hist_err("Invalid field modifier: ", modifier); | 2742 | hist_err(tr, HIST_ERR_BAD_FIELD_MODIFIER, errpos(modifier)); |
| 2699 | field = ERR_PTR(-EINVAL); | 2743 | field = ERR_PTR(-EINVAL); |
| 2700 | goto out; | 2744 | goto out; |
| 2701 | } | 2745 | } |
| @@ -2711,7 +2755,7 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, | |||
| 2711 | else { | 2755 | else { |
| 2712 | field = trace_find_event_field(file->event_call, field_name); | 2756 | field = trace_find_event_field(file->event_call, field_name); |
| 2713 | if (!field || !field->size) { | 2757 | if (!field || !field->size) { |
| 2714 | hist_err("Couldn't find field: ", field_name); | 2758 | hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); |
| 2715 | field = ERR_PTR(-EINVAL); | 2759 | field = ERR_PTR(-EINVAL); |
| 2716 | goto out; | 2760 | goto out; |
| 2717 | } | 2761 | } |
| @@ -2773,7 +2817,8 @@ static struct hist_field *parse_atom(struct hist_trigger_data *hist_data, | |||
| 2773 | 2817 | ||
| 2774 | s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var); | 2818 | s = local_field_var_ref(hist_data, ref_system, ref_event, ref_var); |
| 2775 | if (!s) { | 2819 | if (!s) { |
| 2776 | hist_field = parse_var_ref(hist_data, ref_system, ref_event, ref_var); | 2820 | hist_field = parse_var_ref(hist_data, ref_system, |
| 2821 | ref_event, ref_var); | ||
| 2777 | if (hist_field) { | 2822 | if (hist_field) { |
| 2778 | if (var_name) { | 2823 | if (var_name) { |
| 2779 | hist_field = create_alias(hist_data, hist_field, var_name); | 2824 | hist_field = create_alias(hist_data, hist_field, var_name); |
| @@ -2822,7 +2867,7 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, | |||
| 2822 | /* we support only -(xxx) i.e. explicit parens required */ | 2867 | /* we support only -(xxx) i.e. explicit parens required */ |
| 2823 | 2868 | ||
| 2824 | if (level > 3) { | 2869 | if (level > 3) { |
| 2825 | hist_err("Too many subexpressions (3 max): ", str); | 2870 | hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); |
| 2826 | ret = -EINVAL; | 2871 | ret = -EINVAL; |
| 2827 | goto free; | 2872 | goto free; |
| 2828 | } | 2873 | } |
| @@ -2877,7 +2922,8 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, | |||
| 2877 | return ERR_PTR(ret); | 2922 | return ERR_PTR(ret); |
| 2878 | } | 2923 | } |
| 2879 | 2924 | ||
| 2880 | static int check_expr_operands(struct hist_field *operand1, | 2925 | static int check_expr_operands(struct trace_array *tr, |
| 2926 | struct hist_field *operand1, | ||
| 2881 | struct hist_field *operand2) | 2927 | struct hist_field *operand2) |
| 2882 | { | 2928 | { |
| 2883 | unsigned long operand1_flags = operand1->flags; | 2929 | unsigned long operand1_flags = operand1->flags; |
| @@ -2905,7 +2951,7 @@ static int check_expr_operands(struct hist_field *operand1, | |||
| 2905 | 2951 | ||
| 2906 | if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != | 2952 | if ((operand1_flags & HIST_FIELD_FL_TIMESTAMP_USECS) != |
| 2907 | (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { | 2953 | (operand2_flags & HIST_FIELD_FL_TIMESTAMP_USECS)) { |
| 2908 | hist_err("Timestamp units in expression don't match", NULL); | 2954 | hist_err(tr, HIST_ERR_TIMESTAMP_MISMATCH, 0); |
| 2909 | return -EINVAL; | 2955 | return -EINVAL; |
| 2910 | } | 2956 | } |
| 2911 | 2957 | ||
| @@ -2923,7 +2969,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, | |||
| 2923 | char *sep, *operand1_str; | 2969 | char *sep, *operand1_str; |
| 2924 | 2970 | ||
| 2925 | if (level > 3) { | 2971 | if (level > 3) { |
| 2926 | hist_err("Too many subexpressions (3 max): ", str); | 2972 | hist_err(file->tr, HIST_ERR_TOO_MANY_SUBEXPR, errpos(str)); |
| 2927 | return ERR_PTR(-EINVAL); | 2973 | return ERR_PTR(-EINVAL); |
| 2928 | } | 2974 | } |
| 2929 | 2975 | ||
| @@ -2968,7 +3014,7 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, | |||
| 2968 | goto free; | 3014 | goto free; |
| 2969 | } | 3015 | } |
| 2970 | 3016 | ||
| 2971 | ret = check_expr_operands(operand1, operand2); | 3017 | ret = check_expr_operands(file->tr, operand1, operand2); |
| 2972 | if (ret) | 3018 | if (ret) |
| 2973 | goto free; | 3019 | goto free; |
| 2974 | 3020 | ||
| @@ -3161,16 +3207,14 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, | |||
| 3161 | int ret; | 3207 | int ret; |
| 3162 | 3208 | ||
| 3163 | if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { | 3209 | if (target_hist_data->n_field_var_hists >= SYNTH_FIELDS_MAX) { |
| 3164 | hist_err_event("trace action: Too many field variables defined: ", | 3210 | hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); |
| 3165 | subsys_name, event_name, field_name); | ||
| 3166 | return ERR_PTR(-EINVAL); | 3211 | return ERR_PTR(-EINVAL); |
| 3167 | } | 3212 | } |
| 3168 | 3213 | ||
| 3169 | file = event_file(tr, subsys_name, event_name); | 3214 | file = event_file(tr, subsys_name, event_name); |
| 3170 | 3215 | ||
| 3171 | if (IS_ERR(file)) { | 3216 | if (IS_ERR(file)) { |
| 3172 | hist_err_event("trace action: Event file not found: ", | 3217 | hist_err(tr, HIST_ERR_EVENT_FILE_NOT_FOUND, errpos(field_name)); |
| 3173 | subsys_name, event_name, field_name); | ||
| 3174 | ret = PTR_ERR(file); | 3218 | ret = PTR_ERR(file); |
| 3175 | return ERR_PTR(ret); | 3219 | return ERR_PTR(ret); |
| 3176 | } | 3220 | } |
| @@ -3183,8 +3227,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, | |||
| 3183 | */ | 3227 | */ |
| 3184 | hist_data = find_compatible_hist(target_hist_data, file); | 3228 | hist_data = find_compatible_hist(target_hist_data, file); |
| 3185 | if (!hist_data) { | 3229 | if (!hist_data) { |
| 3186 | hist_err_event("trace action: Matching event histogram not found: ", | 3230 | hist_err(tr, HIST_ERR_HIST_NOT_FOUND, errpos(field_name)); |
| 3187 | subsys_name, event_name, field_name); | ||
| 3188 | return ERR_PTR(-EINVAL); | 3231 | return ERR_PTR(-EINVAL); |
| 3189 | } | 3232 | } |
| 3190 | 3233 | ||
| @@ -3245,8 +3288,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, | |||
| 3245 | kfree(cmd); | 3288 | kfree(cmd); |
| 3246 | kfree(var_hist->cmd); | 3289 | kfree(var_hist->cmd); |
| 3247 | kfree(var_hist); | 3290 | kfree(var_hist); |
| 3248 | hist_err_event("trace action: Couldn't create histogram for field: ", | 3291 | hist_err(tr, HIST_ERR_HIST_CREATE_FAIL, errpos(field_name)); |
| 3249 | subsys_name, event_name, field_name); | ||
| 3250 | return ERR_PTR(ret); | 3292 | return ERR_PTR(ret); |
| 3251 | } | 3293 | } |
| 3252 | 3294 | ||
| @@ -3258,8 +3300,7 @@ create_field_var_hist(struct hist_trigger_data *target_hist_data, | |||
| 3258 | if (IS_ERR_OR_NULL(event_var)) { | 3300 | if (IS_ERR_OR_NULL(event_var)) { |
| 3259 | kfree(var_hist->cmd); | 3301 | kfree(var_hist->cmd); |
| 3260 | kfree(var_hist); | 3302 | kfree(var_hist); |
| 3261 | hist_err_event("trace action: Couldn't find synthetic variable: ", | 3303 | hist_err(tr, HIST_ERR_SYNTH_VAR_NOT_FOUND, errpos(field_name)); |
| 3262 | subsys_name, event_name, field_name); | ||
| 3263 | return ERR_PTR(-EINVAL); | 3304 | return ERR_PTR(-EINVAL); |
| 3264 | } | 3305 | } |
| 3265 | 3306 | ||
| @@ -3392,25 +3433,26 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, | |||
| 3392 | { | 3433 | { |
| 3393 | struct hist_field *val = NULL, *var = NULL; | 3434 | struct hist_field *val = NULL, *var = NULL; |
| 3394 | unsigned long flags = HIST_FIELD_FL_VAR; | 3435 | unsigned long flags = HIST_FIELD_FL_VAR; |
| 3436 | struct trace_array *tr = file->tr; | ||
| 3395 | struct field_var *field_var; | 3437 | struct field_var *field_var; |
| 3396 | int ret = 0; | 3438 | int ret = 0; |
| 3397 | 3439 | ||
| 3398 | if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { | 3440 | if (hist_data->n_field_vars >= SYNTH_FIELDS_MAX) { |
| 3399 | hist_err("Too many field variables defined: ", field_name); | 3441 | hist_err(tr, HIST_ERR_TOO_MANY_FIELD_VARS, errpos(field_name)); |
| 3400 | ret = -EINVAL; | 3442 | ret = -EINVAL; |
| 3401 | goto err; | 3443 | goto err; |
| 3402 | } | 3444 | } |
| 3403 | 3445 | ||
| 3404 | val = parse_atom(hist_data, file, field_name, &flags, NULL); | 3446 | val = parse_atom(hist_data, file, field_name, &flags, NULL); |
| 3405 | if (IS_ERR(val)) { | 3447 | if (IS_ERR(val)) { |
| 3406 | hist_err("Couldn't parse field variable: ", field_name); | 3448 | hist_err(tr, HIST_ERR_FIELD_VAR_PARSE_FAIL, errpos(field_name)); |
| 3407 | ret = PTR_ERR(val); | 3449 | ret = PTR_ERR(val); |
| 3408 | goto err; | 3450 | goto err; |
| 3409 | } | 3451 | } |
| 3410 | 3452 | ||
| 3411 | var = create_var(hist_data, file, field_name, val->size, val->type); | 3453 | var = create_var(hist_data, file, field_name, val->size, val->type); |
| 3412 | if (IS_ERR(var)) { | 3454 | if (IS_ERR(var)) { |
| 3413 | hist_err("Couldn't create or find variable: ", field_name); | 3455 | hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); |
| 3414 | kfree(val); | 3456 | kfree(val); |
| 3415 | ret = PTR_ERR(var); | 3457 | ret = PTR_ERR(var); |
| 3416 | goto err; | 3458 | goto err; |
| @@ -3543,14 +3585,20 @@ static bool cond_snapshot_update(struct trace_array *tr, void *cond_data) | |||
| 3543 | struct track_data *track_data = tr->cond_snapshot->cond_data; | 3585 | struct track_data *track_data = tr->cond_snapshot->cond_data; |
| 3544 | struct hist_elt_data *elt_data, *track_elt_data; | 3586 | struct hist_elt_data *elt_data, *track_elt_data; |
| 3545 | struct snapshot_context *context = cond_data; | 3587 | struct snapshot_context *context = cond_data; |
| 3588 | struct action_data *action; | ||
| 3546 | u64 track_val; | 3589 | u64 track_val; |
| 3547 | 3590 | ||
| 3548 | if (!track_data) | 3591 | if (!track_data) |
| 3549 | return false; | 3592 | return false; |
| 3550 | 3593 | ||
| 3594 | action = track_data->action_data; | ||
| 3595 | |||
| 3551 | track_val = get_track_val(track_data->hist_data, context->elt, | 3596 | track_val = get_track_val(track_data->hist_data, context->elt, |
| 3552 | track_data->action_data); | 3597 | track_data->action_data); |
| 3553 | 3598 | ||
| 3599 | if (!action->track_data.check_val(track_data->track_val, track_val)) | ||
| 3600 | return false; | ||
| 3601 | |||
| 3554 | track_data->track_val = track_val; | 3602 | track_data->track_val = track_val; |
| 3555 | memcpy(track_data->key, context->key, track_data->key_len); | 3603 | memcpy(track_data->key, context->key, track_data->key_len); |
| 3556 | 3604 | ||
| @@ -3737,19 +3785,20 @@ static int track_data_create(struct hist_trigger_data *hist_data, | |||
| 3737 | { | 3785 | { |
| 3738 | struct hist_field *var_field, *ref_field, *track_var = NULL; | 3786 | struct hist_field *var_field, *ref_field, *track_var = NULL; |
| 3739 | struct trace_event_file *file = hist_data->event_file; | 3787 | struct trace_event_file *file = hist_data->event_file; |
| 3788 | struct trace_array *tr = file->tr; | ||
| 3740 | char *track_data_var_str; | 3789 | char *track_data_var_str; |
| 3741 | int ret = 0; | 3790 | int ret = 0; |
| 3742 | 3791 | ||
| 3743 | track_data_var_str = data->track_data.var_str; | 3792 | track_data_var_str = data->track_data.var_str; |
| 3744 | if (track_data_var_str[0] != '$') { | 3793 | if (track_data_var_str[0] != '$') { |
| 3745 | hist_err("For onmax(x) or onchange(x), x must be a variable: ", track_data_var_str); | 3794 | hist_err(tr, HIST_ERR_ONX_NOT_VAR, errpos(track_data_var_str)); |
| 3746 | return -EINVAL; | 3795 | return -EINVAL; |
| 3747 | } | 3796 | } |
| 3748 | track_data_var_str++; | 3797 | track_data_var_str++; |
| 3749 | 3798 | ||
| 3750 | var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); | 3799 | var_field = find_target_event_var(hist_data, NULL, NULL, track_data_var_str); |
| 3751 | if (!var_field) { | 3800 | if (!var_field) { |
| 3752 | hist_err("Couldn't find onmax or onchange variable: ", track_data_var_str); | 3801 | hist_err(tr, HIST_ERR_ONX_VAR_NOT_FOUND, errpos(track_data_var_str)); |
| 3753 | return -EINVAL; | 3802 | return -EINVAL; |
| 3754 | } | 3803 | } |
| 3755 | 3804 | ||
| @@ -3762,7 +3811,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, | |||
| 3762 | if (data->handler == HANDLER_ONMAX) | 3811 | if (data->handler == HANDLER_ONMAX) |
| 3763 | track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); | 3812 | track_var = create_var(hist_data, file, "__max", sizeof(u64), "u64"); |
| 3764 | if (IS_ERR(track_var)) { | 3813 | if (IS_ERR(track_var)) { |
| 3765 | hist_err("Couldn't create onmax variable: ", "__max"); | 3814 | hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); |
| 3766 | ret = PTR_ERR(track_var); | 3815 | ret = PTR_ERR(track_var); |
| 3767 | goto out; | 3816 | goto out; |
| 3768 | } | 3817 | } |
| @@ -3770,7 +3819,7 @@ static int track_data_create(struct hist_trigger_data *hist_data, | |||
| 3770 | if (data->handler == HANDLER_ONCHANGE) | 3819 | if (data->handler == HANDLER_ONCHANGE) |
| 3771 | track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); | 3820 | track_var = create_var(hist_data, file, "__change", sizeof(u64), "u64"); |
| 3772 | if (IS_ERR(track_var)) { | 3821 | if (IS_ERR(track_var)) { |
| 3773 | hist_err("Couldn't create onchange variable: ", "__change"); | 3822 | hist_err(tr, HIST_ERR_ONX_VAR_CREATE_FAIL, 0); |
| 3774 | ret = PTR_ERR(track_var); | 3823 | ret = PTR_ERR(track_var); |
| 3775 | goto out; | 3824 | goto out; |
| 3776 | } | 3825 | } |
| @@ -3781,7 +3830,8 @@ static int track_data_create(struct hist_trigger_data *hist_data, | |||
| 3781 | return ret; | 3830 | return ret; |
| 3782 | } | 3831 | } |
| 3783 | 3832 | ||
| 3784 | static int parse_action_params(char *params, struct action_data *data) | 3833 | static int parse_action_params(struct trace_array *tr, char *params, |
| 3834 | struct action_data *data) | ||
| 3785 | { | 3835 | { |
| 3786 | char *param, *saved_param; | 3836 | char *param, *saved_param; |
| 3787 | bool first_param = true; | 3837 | bool first_param = true; |
| @@ -3789,20 +3839,20 @@ static int parse_action_params(char *params, struct action_data *data) | |||
| 3789 | 3839 | ||
| 3790 | while (params) { | 3840 | while (params) { |
| 3791 | if (data->n_params >= SYNTH_FIELDS_MAX) { | 3841 | if (data->n_params >= SYNTH_FIELDS_MAX) { |
| 3792 | hist_err("Too many action params", ""); | 3842 | hist_err(tr, HIST_ERR_TOO_MANY_PARAMS, 0); |
| 3793 | goto out; | 3843 | goto out; |
| 3794 | } | 3844 | } |
| 3795 | 3845 | ||
| 3796 | param = strsep(¶ms, ","); | 3846 | param = strsep(¶ms, ","); |
| 3797 | if (!param) { | 3847 | if (!param) { |
| 3798 | hist_err("No action param found", ""); | 3848 | hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, 0); |
| 3799 | ret = -EINVAL; | 3849 | ret = -EINVAL; |
| 3800 | goto out; | 3850 | goto out; |
| 3801 | } | 3851 | } |
| 3802 | 3852 | ||
| 3803 | param = strstrip(param); | 3853 | param = strstrip(param); |
| 3804 | if (strlen(param) < 2) { | 3854 | if (strlen(param) < 2) { |
| 3805 | hist_err("Invalid action param: ", param); | 3855 | hist_err(tr, HIST_ERR_INVALID_PARAM, errpos(param)); |
| 3806 | ret = -EINVAL; | 3856 | ret = -EINVAL; |
| 3807 | goto out; | 3857 | goto out; |
| 3808 | } | 3858 | } |
| @@ -3826,7 +3876,7 @@ static int parse_action_params(char *params, struct action_data *data) | |||
| 3826 | return ret; | 3876 | return ret; |
| 3827 | } | 3877 | } |
| 3828 | 3878 | ||
| 3829 | static int action_parse(char *str, struct action_data *data, | 3879 | static int action_parse(struct trace_array *tr, char *str, struct action_data *data, |
| 3830 | enum handler_id handler) | 3880 | enum handler_id handler) |
| 3831 | { | 3881 | { |
| 3832 | char *action_name; | 3882 | char *action_name; |
| @@ -3834,14 +3884,14 @@ static int action_parse(char *str, struct action_data *data, | |||
| 3834 | 3884 | ||
| 3835 | strsep(&str, "."); | 3885 | strsep(&str, "."); |
| 3836 | if (!str) { | 3886 | if (!str) { |
| 3837 | hist_err("action parsing: No action found", ""); | 3887 | hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); |
| 3838 | ret = -EINVAL; | 3888 | ret = -EINVAL; |
| 3839 | goto out; | 3889 | goto out; |
| 3840 | } | 3890 | } |
| 3841 | 3891 | ||
| 3842 | action_name = strsep(&str, "("); | 3892 | action_name = strsep(&str, "("); |
| 3843 | if (!action_name || !str) { | 3893 | if (!action_name || !str) { |
| 3844 | hist_err("action parsing: No action found", ""); | 3894 | hist_err(tr, HIST_ERR_ACTION_NOT_FOUND, 0); |
| 3845 | ret = -EINVAL; | 3895 | ret = -EINVAL; |
| 3846 | goto out; | 3896 | goto out; |
| 3847 | } | 3897 | } |
| @@ -3850,12 +3900,12 @@ static int action_parse(char *str, struct action_data *data, | |||
| 3850 | char *params = strsep(&str, ")"); | 3900 | char *params = strsep(&str, ")"); |
| 3851 | 3901 | ||
| 3852 | if (!params) { | 3902 | if (!params) { |
| 3853 | hist_err("action parsing: No params found for %s", "save"); | 3903 | hist_err(tr, HIST_ERR_NO_SAVE_PARAMS, 0); |
| 3854 | ret = -EINVAL; | 3904 | ret = -EINVAL; |
| 3855 | goto out; | 3905 | goto out; |
| 3856 | } | 3906 | } |
| 3857 | 3907 | ||
| 3858 | ret = parse_action_params(params, data); | 3908 | ret = parse_action_params(tr, params, data); |
| 3859 | if (ret) | 3909 | if (ret) |
| 3860 | goto out; | 3910 | goto out; |
| 3861 | 3911 | ||
| @@ -3864,7 +3914,7 @@ static int action_parse(char *str, struct action_data *data, | |||
| 3864 | else if (handler == HANDLER_ONCHANGE) | 3914 | else if (handler == HANDLER_ONCHANGE) |
| 3865 | data->track_data.check_val = check_track_val_changed; | 3915 | data->track_data.check_val = check_track_val_changed; |
| 3866 | else { | 3916 | else { |
| 3867 | hist_err("action parsing: Handler doesn't support action: ", action_name); | 3917 | hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); |
| 3868 | ret = -EINVAL; | 3918 | ret = -EINVAL; |
| 3869 | goto out; | 3919 | goto out; |
| 3870 | } | 3920 | } |
| @@ -3876,7 +3926,7 @@ static int action_parse(char *str, struct action_data *data, | |||
| 3876 | char *params = strsep(&str, ")"); | 3926 | char *params = strsep(&str, ")"); |
| 3877 | 3927 | ||
| 3878 | if (!str) { | 3928 | if (!str) { |
| 3879 | hist_err("action parsing: No closing paren found: %s", params); | 3929 | hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(params)); |
| 3880 | ret = -EINVAL; | 3930 | ret = -EINVAL; |
| 3881 | goto out; | 3931 | goto out; |
| 3882 | } | 3932 | } |
| @@ -3886,7 +3936,7 @@ static int action_parse(char *str, struct action_data *data, | |||
| 3886 | else if (handler == HANDLER_ONCHANGE) | 3936 | else if (handler == HANDLER_ONCHANGE) |
| 3887 | data->track_data.check_val = check_track_val_changed; | 3937 | data->track_data.check_val = check_track_val_changed; |
| 3888 | else { | 3938 | else { |
| 3889 | hist_err("action parsing: Handler doesn't support action: ", action_name); | 3939 | hist_err(tr, HIST_ERR_ACTION_MISMATCH, errpos(action_name)); |
| 3890 | ret = -EINVAL; | 3940 | ret = -EINVAL; |
| 3891 | goto out; | 3941 | goto out; |
| 3892 | } | 3942 | } |
| @@ -3901,7 +3951,7 @@ static int action_parse(char *str, struct action_data *data, | |||
| 3901 | data->use_trace_keyword = true; | 3951 | data->use_trace_keyword = true; |
| 3902 | 3952 | ||
| 3903 | if (params) { | 3953 | if (params) { |
| 3904 | ret = parse_action_params(params, data); | 3954 | ret = parse_action_params(tr, params, data); |
| 3905 | if (ret) | 3955 | if (ret) |
| 3906 | goto out; | 3956 | goto out; |
| 3907 | } | 3957 | } |
| @@ -3954,7 +4004,7 @@ static struct action_data *track_data_parse(struct hist_trigger_data *hist_data, | |||
| 3954 | goto free; | 4004 | goto free; |
| 3955 | } | 4005 | } |
| 3956 | 4006 | ||
| 3957 | ret = action_parse(str, data, handler); | 4007 | ret = action_parse(hist_data->event_file->tr, str, data, handler); |
| 3958 | if (ret) | 4008 | if (ret) |
| 3959 | goto free; | 4009 | goto free; |
| 3960 | out: | 4010 | out: |
| @@ -4024,6 +4074,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, | |||
| 4024 | struct action_data *data, | 4074 | struct action_data *data, |
| 4025 | char *system, char *event, char *var) | 4075 | char *system, char *event, char *var) |
| 4026 | { | 4076 | { |
| 4077 | struct trace_array *tr = hist_data->event_file->tr; | ||
| 4027 | struct hist_field *hist_field; | 4078 | struct hist_field *hist_field; |
| 4028 | 4079 | ||
| 4029 | var++; /* skip '$' */ | 4080 | var++; /* skip '$' */ |
| @@ -4039,7 +4090,7 @@ trace_action_find_var(struct hist_trigger_data *hist_data, | |||
| 4039 | } | 4090 | } |
| 4040 | 4091 | ||
| 4041 | if (!hist_field) | 4092 | if (!hist_field) |
| 4042 | hist_err_event("trace action: Couldn't find param: $", system, event, var); | 4093 | hist_err(tr, HIST_ERR_PARAM_NOT_FOUND, errpos(var)); |
| 4043 | 4094 | ||
| 4044 | return hist_field; | 4095 | return hist_field; |
| 4045 | } | 4096 | } |
| @@ -4097,6 +4148,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data, | |||
| 4097 | static int trace_action_create(struct hist_trigger_data *hist_data, | 4148 | static int trace_action_create(struct hist_trigger_data *hist_data, |
| 4098 | struct action_data *data) | 4149 | struct action_data *data) |
| 4099 | { | 4150 | { |
| 4151 | struct trace_array *tr = hist_data->event_file->tr; | ||
| 4100 | char *event_name, *param, *system = NULL; | 4152 | char *event_name, *param, *system = NULL; |
| 4101 | struct hist_field *hist_field, *var_ref; | 4153 | struct hist_field *hist_field, *var_ref; |
| 4102 | unsigned int i, var_ref_idx; | 4154 | unsigned int i, var_ref_idx; |
| @@ -4114,7 +4166,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data, | |||
| 4114 | 4166 | ||
| 4115 | event = find_synth_event(synth_event_name); | 4167 | event = find_synth_event(synth_event_name); |
| 4116 | if (!event) { | 4168 | if (!event) { |
| 4117 | hist_err("trace action: Couldn't find synthetic event: ", synth_event_name); | 4169 | hist_err(tr, HIST_ERR_SYNTH_EVENT_NOT_FOUND, errpos(synth_event_name)); |
| 4118 | return -EINVAL; | 4170 | return -EINVAL; |
| 4119 | } | 4171 | } |
| 4120 | 4172 | ||
| @@ -4175,15 +4227,14 @@ static int trace_action_create(struct hist_trigger_data *hist_data, | |||
| 4175 | continue; | 4227 | continue; |
| 4176 | } | 4228 | } |
| 4177 | 4229 | ||
| 4178 | hist_err_event("trace action: Param type doesn't match synthetic event field type: ", | 4230 | hist_err(tr, HIST_ERR_SYNTH_TYPE_MISMATCH, errpos(param)); |
| 4179 | system, event_name, param); | ||
| 4180 | kfree(p); | 4231 | kfree(p); |
| 4181 | ret = -EINVAL; | 4232 | ret = -EINVAL; |
| 4182 | goto err; | 4233 | goto err; |
| 4183 | } | 4234 | } |
| 4184 | 4235 | ||
| 4185 | if (field_pos != event->n_fields) { | 4236 | if (field_pos != event->n_fields) { |
| 4186 | hist_err("trace action: Param count doesn't match synthetic event field count: ", event->name); | 4237 | hist_err(tr, HIST_ERR_SYNTH_COUNT_MISMATCH, errpos(event->name)); |
| 4187 | ret = -EINVAL; | 4238 | ret = -EINVAL; |
| 4188 | goto err; | 4239 | goto err; |
| 4189 | } | 4240 | } |
| @@ -4202,6 +4253,7 @@ static int action_create(struct hist_trigger_data *hist_data, | |||
| 4202 | struct action_data *data) | 4253 | struct action_data *data) |
| 4203 | { | 4254 | { |
| 4204 | struct trace_event_file *file = hist_data->event_file; | 4255 | struct trace_event_file *file = hist_data->event_file; |
| 4256 | struct trace_array *tr = file->tr; | ||
| 4205 | struct track_data *track_data; | 4257 | struct track_data *track_data; |
| 4206 | struct field_var *field_var; | 4258 | struct field_var *field_var; |
| 4207 | unsigned int i; | 4259 | unsigned int i; |
| @@ -4229,7 +4281,7 @@ static int action_create(struct hist_trigger_data *hist_data, | |||
| 4229 | if (data->action == ACTION_SAVE) { | 4281 | if (data->action == ACTION_SAVE) { |
| 4230 | if (hist_data->n_save_vars) { | 4282 | if (hist_data->n_save_vars) { |
| 4231 | ret = -EEXIST; | 4283 | ret = -EEXIST; |
| 4232 | hist_err("save action: Can't have more than one save() action per hist", ""); | 4284 | hist_err(tr, HIST_ERR_TOO_MANY_SAVE_ACTIONS, 0); |
| 4233 | goto out; | 4285 | goto out; |
| 4234 | } | 4286 | } |
| 4235 | 4287 | ||
| @@ -4242,7 +4294,8 @@ static int action_create(struct hist_trigger_data *hist_data, | |||
| 4242 | 4294 | ||
| 4243 | field_var = create_target_field_var(hist_data, NULL, NULL, param); | 4295 | field_var = create_target_field_var(hist_data, NULL, NULL, param); |
| 4244 | if (IS_ERR(field_var)) { | 4296 | if (IS_ERR(field_var)) { |
| 4245 | hist_err("save action: Couldn't create field variable: ", param); | 4297 | hist_err(tr, HIST_ERR_FIELD_VAR_CREATE_FAIL, |
| 4298 | errpos(param)); | ||
| 4246 | ret = PTR_ERR(field_var); | 4299 | ret = PTR_ERR(field_var); |
| 4247 | kfree(param); | 4300 | kfree(param); |
| 4248 | goto out; | 4301 | goto out; |
| @@ -4276,19 +4329,18 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) | |||
| 4276 | 4329 | ||
| 4277 | match_event = strsep(&str, ")"); | 4330 | match_event = strsep(&str, ")"); |
| 4278 | if (!match_event || !str) { | 4331 | if (!match_event || !str) { |
| 4279 | hist_err("onmatch: Missing closing paren: ", match_event); | 4332 | hist_err(tr, HIST_ERR_NO_CLOSING_PAREN, errpos(match_event)); |
| 4280 | goto free; | 4333 | goto free; |
| 4281 | } | 4334 | } |
| 4282 | 4335 | ||
| 4283 | match_event_system = strsep(&match_event, "."); | 4336 | match_event_system = strsep(&match_event, "."); |
| 4284 | if (!match_event) { | 4337 | if (!match_event) { |
| 4285 | hist_err("onmatch: Missing subsystem for match event: ", match_event_system); | 4338 | hist_err(tr, HIST_ERR_SUBSYS_NOT_FOUND, errpos(match_event_system)); |
| 4286 | goto free; | 4339 | goto free; |
| 4287 | } | 4340 | } |
| 4288 | 4341 | ||
| 4289 | if (IS_ERR(event_file(tr, match_event_system, match_event))) { | 4342 | if (IS_ERR(event_file(tr, match_event_system, match_event))) { |
| 4290 | hist_err_event("onmatch: Invalid subsystem or event name: ", | 4343 | hist_err(tr, HIST_ERR_INVALID_SUBSYS_EVENT, errpos(match_event)); |
| 4291 | match_event_system, match_event, NULL); | ||
| 4292 | goto free; | 4344 | goto free; |
| 4293 | } | 4345 | } |
| 4294 | 4346 | ||
| @@ -4304,7 +4356,7 @@ static struct action_data *onmatch_parse(struct trace_array *tr, char *str) | |||
| 4304 | goto free; | 4356 | goto free; |
| 4305 | } | 4357 | } |
| 4306 | 4358 | ||
| 4307 | ret = action_parse(str, data, HANDLER_ONMATCH); | 4359 | ret = action_parse(tr, str, data, HANDLER_ONMATCH); |
| 4308 | if (ret) | 4360 | if (ret) |
| 4309 | goto free; | 4361 | goto free; |
| 4310 | out: | 4362 | out: |
| @@ -4373,13 +4425,14 @@ static int create_var_field(struct hist_trigger_data *hist_data, | |||
| 4373 | struct trace_event_file *file, | 4425 | struct trace_event_file *file, |
| 4374 | char *var_name, char *expr_str) | 4426 | char *var_name, char *expr_str) |
| 4375 | { | 4427 | { |
| 4428 | struct trace_array *tr = hist_data->event_file->tr; | ||
| 4376 | unsigned long flags = 0; | 4429 | unsigned long flags = 0; |
| 4377 | 4430 | ||
| 4378 | if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) | 4431 | if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX)) |
| 4379 | return -EINVAL; | 4432 | return -EINVAL; |
| 4380 | 4433 | ||
| 4381 | if (find_var(hist_data, file, var_name) && !hist_data->remove) { | 4434 | if (find_var(hist_data, file, var_name) && !hist_data->remove) { |
| 4382 | hist_err("Variable already defined: ", var_name); | 4435 | hist_err(tr, HIST_ERR_DUPLICATE_VAR, errpos(var_name)); |
| 4383 | return -EINVAL; | 4436 | return -EINVAL; |
| 4384 | } | 4437 | } |
| 4385 | 4438 | ||
| @@ -4436,8 +4489,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, | |||
| 4436 | struct trace_event_file *file, | 4489 | struct trace_event_file *file, |
| 4437 | char *field_str) | 4490 | char *field_str) |
| 4438 | { | 4491 | { |
| 4492 | struct trace_array *tr = hist_data->event_file->tr; | ||
| 4439 | struct hist_field *hist_field = NULL; | 4493 | struct hist_field *hist_field = NULL; |
| 4440 | |||
| 4441 | unsigned long flags = 0; | 4494 | unsigned long flags = 0; |
| 4442 | unsigned int key_size; | 4495 | unsigned int key_size; |
| 4443 | int ret = 0; | 4496 | int ret = 0; |
| @@ -4459,8 +4512,8 @@ static int create_key_field(struct hist_trigger_data *hist_data, | |||
| 4459 | goto out; | 4512 | goto out; |
| 4460 | } | 4513 | } |
| 4461 | 4514 | ||
| 4462 | if (hist_field->flags & HIST_FIELD_FL_VAR_REF) { | 4515 | if (field_has_hist_vars(hist_field, 0)) { |
| 4463 | hist_err("Using variable references as keys not supported: ", field_str); | 4516 | hist_err(tr, HIST_ERR_INVALID_REF_KEY, errpos(field_str)); |
| 4464 | destroy_hist_field(hist_field, 0); | 4517 | destroy_hist_field(hist_field, 0); |
| 4465 | ret = -EINVAL; | 4518 | ret = -EINVAL; |
| 4466 | goto out; | 4519 | goto out; |
| @@ -4561,6 +4614,7 @@ static void free_var_defs(struct hist_trigger_data *hist_data) | |||
| 4561 | 4614 | ||
| 4562 | static int parse_var_defs(struct hist_trigger_data *hist_data) | 4615 | static int parse_var_defs(struct hist_trigger_data *hist_data) |
| 4563 | { | 4616 | { |
| 4617 | struct trace_array *tr = hist_data->event_file->tr; | ||
| 4564 | char *s, *str, *var_name, *field_str; | 4618 | char *s, *str, *var_name, *field_str; |
| 4565 | unsigned int i, j, n_vars = 0; | 4619 | unsigned int i, j, n_vars = 0; |
| 4566 | int ret = 0; | 4620 | int ret = 0; |
| @@ -4574,13 +4628,14 @@ static int parse_var_defs(struct hist_trigger_data *hist_data) | |||
| 4574 | 4628 | ||
| 4575 | var_name = strsep(&field_str, "="); | 4629 | var_name = strsep(&field_str, "="); |
| 4576 | if (!var_name || !field_str) { | 4630 | if (!var_name || !field_str) { |
| 4577 | hist_err("Malformed assignment: ", var_name); | 4631 | hist_err(tr, HIST_ERR_MALFORMED_ASSIGNMENT, |
| 4632 | errpos(var_name)); | ||
| 4578 | ret = -EINVAL; | 4633 | ret = -EINVAL; |
| 4579 | goto free; | 4634 | goto free; |
| 4580 | } | 4635 | } |
| 4581 | 4636 | ||
| 4582 | if (n_vars == TRACING_MAP_VARS_MAX) { | 4637 | if (n_vars == TRACING_MAP_VARS_MAX) { |
| 4583 | hist_err("Too many variables defined: ", var_name); | 4638 | hist_err(tr, HIST_ERR_TOO_MANY_VARS, errpos(var_name)); |
| 4584 | ret = -EINVAL; | 4639 | ret = -EINVAL; |
| 4585 | goto free; | 4640 | goto free; |
| 4586 | } | 4641 | } |
| @@ -5431,11 +5486,6 @@ static int hist_show(struct seq_file *m, void *v) | |||
| 5431 | hist_trigger_show(m, data, n++); | 5486 | hist_trigger_show(m, data, n++); |
| 5432 | } | 5487 | } |
| 5433 | 5488 | ||
| 5434 | if (have_hist_err()) { | ||
| 5435 | seq_printf(m, "\nERROR: %s\n", hist_err_str); | ||
| 5436 | seq_printf(m, " Last command: %s\n", last_hist_cmd); | ||
| 5437 | } | ||
| 5438 | |||
| 5439 | out_unlock: | 5489 | out_unlock: |
| 5440 | mutex_unlock(&event_mutex); | 5490 | mutex_unlock(&event_mutex); |
| 5441 | 5491 | ||
| @@ -5800,6 +5850,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, | |||
| 5800 | { | 5850 | { |
| 5801 | struct hist_trigger_data *hist_data = data->private_data; | 5851 | struct hist_trigger_data *hist_data = data->private_data; |
| 5802 | struct event_trigger_data *test, *named_data = NULL; | 5852 | struct event_trigger_data *test, *named_data = NULL; |
| 5853 | struct trace_array *tr = file->tr; | ||
| 5803 | int ret = 0; | 5854 | int ret = 0; |
| 5804 | 5855 | ||
| 5805 | if (hist_data->attrs->name) { | 5856 | if (hist_data->attrs->name) { |
| @@ -5807,7 +5858,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, | |||
| 5807 | if (named_data) { | 5858 | if (named_data) { |
| 5808 | if (!hist_trigger_match(data, named_data, named_data, | 5859 | if (!hist_trigger_match(data, named_data, named_data, |
| 5809 | true)) { | 5860 | true)) { |
| 5810 | hist_err("Named hist trigger doesn't match existing named trigger (includes variables): ", hist_data->attrs->name); | 5861 | hist_err(tr, HIST_ERR_NAMED_MISMATCH, errpos(hist_data->attrs->name)); |
| 5811 | ret = -EINVAL; | 5862 | ret = -EINVAL; |
| 5812 | goto out; | 5863 | goto out; |
| 5813 | } | 5864 | } |
| @@ -5828,7 +5879,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, | |||
| 5828 | else if (hist_data->attrs->clear) | 5879 | else if (hist_data->attrs->clear) |
| 5829 | hist_clear(test); | 5880 | hist_clear(test); |
| 5830 | else { | 5881 | else { |
| 5831 | hist_err("Hist trigger already exists", NULL); | 5882 | hist_err(tr, HIST_ERR_TRIGGER_EEXIST, 0); |
| 5832 | ret = -EEXIST; | 5883 | ret = -EEXIST; |
| 5833 | } | 5884 | } |
| 5834 | goto out; | 5885 | goto out; |
| @@ -5836,7 +5887,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, | |||
| 5836 | } | 5887 | } |
| 5837 | new: | 5888 | new: |
| 5838 | if (hist_data->attrs->cont || hist_data->attrs->clear) { | 5889 | if (hist_data->attrs->cont || hist_data->attrs->clear) { |
| 5839 | hist_err("Can't clear or continue a nonexistent hist trigger", NULL); | 5890 | hist_err(tr, HIST_ERR_TRIGGER_ENOENT_CLEAR, 0); |
| 5840 | ret = -ENOENT; | 5891 | ret = -ENOENT; |
| 5841 | goto out; | 5892 | goto out; |
| 5842 | } | 5893 | } |
| @@ -5861,7 +5912,7 @@ static int hist_register_trigger(char *glob, struct event_trigger_ops *ops, | |||
| 5861 | 5912 | ||
| 5862 | ret = tracing_set_clock(file->tr, hist_data->attrs->clock); | 5913 | ret = tracing_set_clock(file->tr, hist_data->attrs->clock); |
| 5863 | if (ret) { | 5914 | if (ret) { |
| 5864 | hist_err("Couldn't set trace_clock: ", clock); | 5915 | hist_err(tr, HIST_ERR_SET_CLOCK_FAIL, errpos(clock)); |
| 5865 | goto out; | 5916 | goto out; |
| 5866 | } | 5917 | } |
| 5867 | 5918 | ||
| @@ -6037,8 +6088,8 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, | |||
| 6037 | lockdep_assert_held(&event_mutex); | 6088 | lockdep_assert_held(&event_mutex); |
| 6038 | 6089 | ||
| 6039 | if (glob && strlen(glob)) { | 6090 | if (glob && strlen(glob)) { |
| 6040 | last_cmd_set(param); | ||
| 6041 | hist_err_clear(); | 6091 | hist_err_clear(); |
| 6092 | last_cmd_set(file, param); | ||
| 6042 | } | 6093 | } |
| 6043 | 6094 | ||
| 6044 | if (!param) | 6095 | if (!param) |
| @@ -6079,7 +6130,7 @@ static int event_hist_trigger_func(struct event_command *cmd_ops, | |||
| 6079 | trigger = strstrip(trigger); | 6130 | trigger = strstrip(trigger); |
| 6080 | } | 6131 | } |
| 6081 | 6132 | ||
| 6082 | attrs = parse_hist_trigger_attrs(trigger); | 6133 | attrs = parse_hist_trigger_attrs(file->tr, trigger); |
| 6083 | if (IS_ERR(attrs)) | 6134 | if (IS_ERR(attrs)) |
| 6084 | return PTR_ERR(attrs); | 6135 | return PTR_ERR(attrs); |
| 6085 | 6136 | ||
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index cd12ecb66eb9..2a2912cb4533 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c | |||
| @@ -731,7 +731,8 @@ int set_trigger_filter(char *filter_str, | |||
| 731 | goto out; | 731 | goto out; |
| 732 | 732 | ||
| 733 | /* The filter is for the 'trigger' event, not the triggered event */ | 733 | /* The filter is for the 'trigger' event, not the triggered event */ |
| 734 | ret = create_event_filter(file->event_call, filter_str, false, &filter); | 734 | ret = create_event_filter(file->tr, file->event_call, |
| 735 | filter_str, false, &filter); | ||
| 735 | /* | 736 | /* |
| 736 | * If create_event_filter() fails, filter still needs to be freed. | 737 | * If create_event_filter() fails, filter still needs to be freed. |
| 737 | * Which the calling code will do with data->filter. | 738 | * Which the calling code will do with data->filter. |
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 1e6db9cbe4dc..fa95139445b2 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c | |||
| @@ -277,7 +277,7 @@ static void move_to_next_cpu(void) | |||
| 277 | * of this thread, than stop migrating for the duration | 277 | * of this thread, than stop migrating for the duration |
| 278 | * of the current test. | 278 | * of the current test. |
| 279 | */ | 279 | */ |
| 280 | if (!cpumask_equal(current_mask, ¤t->cpus_allowed)) | 280 | if (!cpumask_equal(current_mask, current->cpus_ptr)) |
| 281 | goto disable; | 281 | goto disable; |
| 282 | 282 | ||
| 283 | get_online_cpus(); | 283 | get_online_cpus(); |
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 810d78a8d14c..cca65044c14c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c | |||
| @@ -17,36 +17,28 @@ | |||
| 17 | #include "trace.h" | 17 | #include "trace.h" |
| 18 | #include "trace_output.h" | 18 | #include "trace_output.h" |
| 19 | 19 | ||
| 20 | static void ftrace_dump_buf(int skip_lines, long cpu_file) | 20 | static struct trace_iterator iter; |
| 21 | static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; | ||
| 22 | |||
| 23 | static void ftrace_dump_buf(int skip_entries, long cpu_file) | ||
| 21 | { | 24 | { |
| 22 | /* use static because iter can be a bit big for the stack */ | ||
| 23 | static struct trace_iterator iter; | ||
| 24 | static struct ring_buffer_iter *buffer_iter[CONFIG_NR_CPUS]; | ||
| 25 | struct trace_array *tr; | 25 | struct trace_array *tr; |
| 26 | unsigned int old_userobj; | 26 | unsigned int old_userobj; |
| 27 | int cnt = 0, cpu; | 27 | int cnt = 0, cpu; |
| 28 | 28 | ||
| 29 | trace_init_global_iter(&iter); | ||
| 30 | iter.buffer_iter = buffer_iter; | ||
| 31 | tr = iter.tr; | 29 | tr = iter.tr; |
| 32 | 30 | ||
| 33 | for_each_tracing_cpu(cpu) { | ||
| 34 | atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); | ||
| 35 | } | ||
| 36 | |||
| 37 | old_userobj = tr->trace_flags; | 31 | old_userobj = tr->trace_flags; |
| 38 | 32 | ||
| 39 | /* don't look at user memory in panic mode */ | 33 | /* don't look at user memory in panic mode */ |
| 40 | tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; | 34 | tr->trace_flags &= ~TRACE_ITER_SYM_USEROBJ; |
| 41 | 35 | ||
| 42 | kdb_printf("Dumping ftrace buffer:\n"); | 36 | kdb_printf("Dumping ftrace buffer:\n"); |
| 37 | if (skip_entries) | ||
| 38 | kdb_printf("(skipping %d entries)\n", skip_entries); | ||
| 43 | 39 | ||
| 44 | /* reset all but tr, trace, and overruns */ | 40 | trace_iterator_reset(&iter); |
| 45 | memset(&iter.seq, 0, | ||
| 46 | sizeof(struct trace_iterator) - | ||
| 47 | offsetof(struct trace_iterator, seq)); | ||
| 48 | iter.iter_flags |= TRACE_FILE_LAT_FMT; | 41 | iter.iter_flags |= TRACE_FILE_LAT_FMT; |
| 49 | iter.pos = -1; | ||
| 50 | 42 | ||
| 51 | if (cpu_file == RING_BUFFER_ALL_CPUS) { | 43 | if (cpu_file == RING_BUFFER_ALL_CPUS) { |
| 52 | for_each_tracing_cpu(cpu) { | 44 | for_each_tracing_cpu(cpu) { |
| @@ -70,11 +62,11 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) | |||
| 70 | kdb_printf("---------------------------------\n"); | 62 | kdb_printf("---------------------------------\n"); |
| 71 | cnt++; | 63 | cnt++; |
| 72 | 64 | ||
| 73 | if (!skip_lines) { | 65 | if (!skip_entries) { |
| 74 | print_trace_line(&iter); | 66 | print_trace_line(&iter); |
| 75 | trace_printk_seq(&iter.seq); | 67 | trace_printk_seq(&iter.seq); |
| 76 | } else { | 68 | } else { |
| 77 | skip_lines--; | 69 | skip_entries--; |
| 78 | } | 70 | } |
| 79 | 71 | ||
| 80 | if (KDB_FLAG(CMD_INTERRUPT)) | 72 | if (KDB_FLAG(CMD_INTERRUPT)) |
| @@ -90,10 +82,6 @@ out: | |||
| 90 | tr->trace_flags = old_userobj; | 82 | tr->trace_flags = old_userobj; |
| 91 | 83 | ||
| 92 | for_each_tracing_cpu(cpu) { | 84 | for_each_tracing_cpu(cpu) { |
| 93 | atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); | ||
| 94 | } | ||
| 95 | |||
| 96 | for_each_tracing_cpu(cpu) { | ||
| 97 | if (iter.buffer_iter[cpu]) { | 85 | if (iter.buffer_iter[cpu]) { |
| 98 | ring_buffer_read_finish(iter.buffer_iter[cpu]); | 86 | ring_buffer_read_finish(iter.buffer_iter[cpu]); |
| 99 | iter.buffer_iter[cpu] = NULL; | 87 | iter.buffer_iter[cpu] = NULL; |
| @@ -106,17 +94,19 @@ out: | |||
| 106 | */ | 94 | */ |
| 107 | static int kdb_ftdump(int argc, const char **argv) | 95 | static int kdb_ftdump(int argc, const char **argv) |
| 108 | { | 96 | { |
| 109 | int skip_lines = 0; | 97 | int skip_entries = 0; |
| 110 | long cpu_file; | 98 | long cpu_file; |
| 111 | char *cp; | 99 | char *cp; |
| 100 | int cnt; | ||
| 101 | int cpu; | ||
| 112 | 102 | ||
| 113 | if (argc > 2) | 103 | if (argc > 2) |
| 114 | return KDB_ARGCOUNT; | 104 | return KDB_ARGCOUNT; |
| 115 | 105 | ||
| 116 | if (argc) { | 106 | if (argc) { |
| 117 | skip_lines = simple_strtol(argv[1], &cp, 0); | 107 | skip_entries = simple_strtol(argv[1], &cp, 0); |
| 118 | if (*cp) | 108 | if (*cp) |
| 119 | skip_lines = 0; | 109 | skip_entries = 0; |
| 120 | } | 110 | } |
| 121 | 111 | ||
| 122 | if (argc == 2) { | 112 | if (argc == 2) { |
| @@ -129,7 +119,29 @@ static int kdb_ftdump(int argc, const char **argv) | |||
| 129 | } | 119 | } |
| 130 | 120 | ||
| 131 | kdb_trap_printk++; | 121 | kdb_trap_printk++; |
| 132 | ftrace_dump_buf(skip_lines, cpu_file); | 122 | |
| 123 | trace_init_global_iter(&iter); | ||
| 124 | iter.buffer_iter = buffer_iter; | ||
| 125 | |||
| 126 | for_each_tracing_cpu(cpu) { | ||
| 127 | atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); | ||
| 128 | } | ||
| 129 | |||
| 130 | /* A negative skip_entries means skip all but the last entries */ | ||
| 131 | if (skip_entries < 0) { | ||
| 132 | if (cpu_file == RING_BUFFER_ALL_CPUS) | ||
| 133 | cnt = trace_total_entries(NULL); | ||
| 134 | else | ||
| 135 | cnt = trace_total_entries_cpu(NULL, cpu_file); | ||
| 136 | skip_entries = max(cnt + skip_entries, 0); | ||
| 137 | } | ||
| 138 | |||
| 139 | ftrace_dump_buf(skip_entries, cpu_file); | ||
| 140 | |||
| 141 | for_each_tracing_cpu(cpu) { | ||
| 142 | atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); | ||
| 143 | } | ||
| 144 | |||
| 133 | kdb_trap_printk--; | 145 | kdb_trap_printk--; |
| 134 | 146 | ||
| 135 | return 0; | 147 | return 0; |
| @@ -137,8 +149,9 @@ static int kdb_ftdump(int argc, const char **argv) | |||
| 137 | 149 | ||
| 138 | static __init int kdb_ftrace_register(void) | 150 | static __init int kdb_ftrace_register(void) |
| 139 | { | 151 | { |
| 140 | kdb_register_flags("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", | 152 | kdb_register_flags("ftdump", kdb_ftdump, "[skip_#entries] [cpu]", |
| 141 | "Dump ftrace log", 0, KDB_ENABLE_ALWAYS_SAFE); | 153 | "Dump ftrace log; -skip dumps last #entries", 0, |
| 154 | KDB_ENABLE_ALWAYS_SAFE); | ||
| 142 | return 0; | 155 | return 0; |
| 143 | } | 156 | } |
| 144 | 157 | ||
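The rewritten ftdump command above interprets a negative skip count as "dump only the last N entries", clamping at zero when the buffer holds fewer entries than requested. A minimal user-space sketch of that clamping arithmetic (illustrative only; the function and variable names are invented, and the kernel gets the totals from trace_total_entries() / trace_total_entries_cpu() as shown in the hunk):

#include <stdio.h>

/* Sketch of the kdb ftdump clamping: a negative skip means "last -skip entries". */
static int entries_to_skip(int total, int skip)
{
	if (skip < 0) {
		/* e.g. total = 100, skip = -5: skip the first 95 entries */
		skip = total + skip;
		if (skip < 0)
			skip = 0;
	}
	return skip;
}

int main(void)
{
	printf("%d\n", entries_to_skip(100, -5)); /* 95 */
	printf("%d\n", entries_to_skip(3, -10));  /* 0: fewer entries than asked for */
	return 0;
}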
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5d5129b05df7..7d736248a070 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c | |||
| @@ -441,13 +441,8 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) | |||
| 441 | else | 441 | else |
| 442 | ret = register_kprobe(&tk->rp.kp); | 442 | ret = register_kprobe(&tk->rp.kp); |
| 443 | 443 | ||
| 444 | if (ret == 0) { | 444 | if (ret == 0) |
| 445 | tk->tp.flags |= TP_FLAG_REGISTERED; | 445 | tk->tp.flags |= TP_FLAG_REGISTERED; |
| 446 | } else if (ret == -EILSEQ) { | ||
| 447 | pr_warn("Probing address(0x%p) is not an instruction boundary.\n", | ||
| 448 | tk->rp.kp.addr); | ||
| 449 | ret = -EINVAL; | ||
| 450 | } | ||
| 451 | return ret; | 446 | return ret; |
| 452 | } | 447 | } |
| 453 | 448 | ||
| @@ -591,7 +586,7 @@ static int trace_kprobe_create(int argc, const char *argv[]) | |||
| 591 | * Type of args: | 586 | * Type of args: |
| 592 | * FETCHARG:TYPE : use TYPE instead of unsigned long. | 587 | * FETCHARG:TYPE : use TYPE instead of unsigned long. |
| 593 | */ | 588 | */ |
| 594 | struct trace_kprobe *tk; | 589 | struct trace_kprobe *tk = NULL; |
| 595 | int i, len, ret = 0; | 590 | int i, len, ret = 0; |
| 596 | bool is_return = false; | 591 | bool is_return = false; |
| 597 | char *symbol = NULL, *tmp = NULL; | 592 | char *symbol = NULL, *tmp = NULL; |
| @@ -615,44 +610,50 @@ static int trace_kprobe_create(int argc, const char *argv[]) | |||
| 615 | if (argc < 2) | 610 | if (argc < 2) |
| 616 | return -ECANCELED; | 611 | return -ECANCELED; |
| 617 | 612 | ||
| 613 | trace_probe_log_init("trace_kprobe", argc, argv); | ||
| 614 | |||
| 618 | event = strchr(&argv[0][1], ':'); | 615 | event = strchr(&argv[0][1], ':'); |
| 619 | if (event) | 616 | if (event) |
| 620 | event++; | 617 | event++; |
| 621 | 618 | ||
| 622 | if (isdigit(argv[0][1])) { | 619 | if (isdigit(argv[0][1])) { |
| 623 | if (!is_return) { | 620 | if (!is_return) { |
| 624 | pr_info("Maxactive is not for kprobe"); | 621 | trace_probe_log_err(1, MAXACT_NO_KPROBE); |
| 625 | return -EINVAL; | 622 | goto parse_error; |
| 626 | } | 623 | } |
| 627 | if (event) | 624 | if (event) |
| 628 | len = event - &argv[0][1] - 1; | 625 | len = event - &argv[0][1] - 1; |
| 629 | else | 626 | else |
| 630 | len = strlen(&argv[0][1]); | 627 | len = strlen(&argv[0][1]); |
| 631 | if (len > MAX_EVENT_NAME_LEN - 1) | 628 | if (len > MAX_EVENT_NAME_LEN - 1) { |
| 632 | return -E2BIG; | 629 | trace_probe_log_err(1, BAD_MAXACT); |
| 630 | goto parse_error; | ||
| 631 | } | ||
| 633 | memcpy(buf, &argv[0][1], len); | 632 | memcpy(buf, &argv[0][1], len); |
| 634 | buf[len] = '\0'; | 633 | buf[len] = '\0'; |
| 635 | ret = kstrtouint(buf, 0, &maxactive); | 634 | ret = kstrtouint(buf, 0, &maxactive); |
| 636 | if (ret || !maxactive) { | 635 | if (ret || !maxactive) { |
| 637 | pr_info("Invalid maxactive number\n"); | 636 | trace_probe_log_err(1, BAD_MAXACT); |
| 638 | return ret; | 637 | goto parse_error; |
| 639 | } | 638 | } |
| 640 | /* kretprobes instances are iterated over via a list. The | 639 | /* kretprobes instances are iterated over via a list. The |
| 641 | * maximum should stay reasonable. | 640 | * maximum should stay reasonable. |
| 642 | */ | 641 | */ |
| 643 | if (maxactive > KRETPROBE_MAXACTIVE_MAX) { | 642 | if (maxactive > KRETPROBE_MAXACTIVE_MAX) { |
| 644 | pr_info("Maxactive is too big (%d > %d).\n", | 643 | trace_probe_log_err(1, MAXACT_TOO_BIG); |
| 645 | maxactive, KRETPROBE_MAXACTIVE_MAX); | 644 | goto parse_error; |
| 646 | return -E2BIG; | ||
| 647 | } | 645 | } |
| 648 | } | 646 | } |
| 649 | 647 | ||
| 650 | /* try to parse an address. if that fails, try to read the | 648 | /* try to parse an address. if that fails, try to read the |
| 651 | * input as a symbol. */ | 649 | * input as a symbol. */ |
| 652 | if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { | 650 | if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) { |
| 651 | trace_probe_log_set_index(1); | ||
| 653 | /* Check whether uprobe event specified */ | 652 | /* Check whether uprobe event specified */ |
| 654 | if (strchr(argv[1], '/') && strchr(argv[1], ':')) | 653 | if (strchr(argv[1], '/') && strchr(argv[1], ':')) { |
| 655 | return -ECANCELED; | 654 | ret = -ECANCELED; |
| 655 | goto error; | ||
| 656 | } | ||
| 656 | /* a symbol specified */ | 657 | /* a symbol specified */ |
| 657 | symbol = kstrdup(argv[1], GFP_KERNEL); | 658 | symbol = kstrdup(argv[1], GFP_KERNEL); |
| 658 | if (!symbol) | 659 | if (!symbol) |
| @@ -660,23 +661,23 @@ static int trace_kprobe_create(int argc, const char *argv[]) | |||
| 660 | /* TODO: support .init module functions */ | 661 | /* TODO: support .init module functions */ |
| 661 | ret = traceprobe_split_symbol_offset(symbol, &offset); | 662 | ret = traceprobe_split_symbol_offset(symbol, &offset); |
| 662 | if (ret || offset < 0 || offset > UINT_MAX) { | 663 | if (ret || offset < 0 || offset > UINT_MAX) { |
| 663 | pr_info("Failed to parse either an address or a symbol.\n"); | 664 | trace_probe_log_err(0, BAD_PROBE_ADDR); |
| 664 | goto out; | 665 | goto parse_error; |
| 665 | } | 666 | } |
| 666 | if (kprobe_on_func_entry(NULL, symbol, offset)) | 667 | if (kprobe_on_func_entry(NULL, symbol, offset)) |
| 667 | flags |= TPARG_FL_FENTRY; | 668 | flags |= TPARG_FL_FENTRY; |
| 668 | if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { | 669 | if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { |
| 669 | pr_info("Given offset is not valid for return probe.\n"); | 670 | trace_probe_log_err(0, BAD_RETPROBE); |
| 670 | ret = -EINVAL; | 671 | goto parse_error; |
| 671 | goto out; | ||
| 672 | } | 672 | } |
| 673 | } | 673 | } |
| 674 | argc -= 2; argv += 2; | ||
| 675 | 674 | ||
| 675 | trace_probe_log_set_index(0); | ||
| 676 | if (event) { | 676 | if (event) { |
| 677 | ret = traceprobe_parse_event_name(&event, &group, buf); | 677 | ret = traceprobe_parse_event_name(&event, &group, buf, |
| 678 | event - argv[0]); | ||
| 678 | if (ret) | 679 | if (ret) |
| 679 | goto out; | 680 | goto parse_error; |
| 680 | } else { | 681 | } else { |
| 681 | /* Make a new event name */ | 682 | /* Make a new event name */ |
| 682 | if (symbol) | 683 | if (symbol) |
| @@ -691,13 +692,14 @@ static int trace_kprobe_create(int argc, const char *argv[]) | |||
| 691 | 692 | ||
| 692 | /* setup a probe */ | 693 | /* setup a probe */ |
| 693 | tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, | 694 | tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, |
| 694 | argc, is_return); | 695 | argc - 2, is_return); |
| 695 | if (IS_ERR(tk)) { | 696 | if (IS_ERR(tk)) { |
| 696 | ret = PTR_ERR(tk); | 697 | ret = PTR_ERR(tk); |
| 697 | /* This must return -ENOMEM otherwise there is a bug */ | 698 | /* This must return -ENOMEM, else there is a bug */ |
| 698 | WARN_ON_ONCE(ret != -ENOMEM); | 699 | WARN_ON_ONCE(ret != -ENOMEM); |
| 699 | goto out; | 700 | goto out; /* We know tk is not allocated */ |
| 700 | } | 701 | } |
| 702 | argc -= 2; argv += 2; | ||
| 701 | 703 | ||
| 702 | /* parse arguments */ | 704 | /* parse arguments */ |
| 703 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { | 705 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { |
| @@ -707,19 +709,32 @@ static int trace_kprobe_create(int argc, const char *argv[]) | |||
| 707 | goto error; | 709 | goto error; |
| 708 | } | 710 | } |
| 709 | 711 | ||
| 712 | trace_probe_log_set_index(i + 2); | ||
| 710 | ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); | 713 | ret = traceprobe_parse_probe_arg(&tk->tp, i, tmp, flags); |
| 711 | kfree(tmp); | 714 | kfree(tmp); |
| 712 | if (ret) | 715 | if (ret) |
| 713 | goto error; | 716 | goto error; /* This can be -ENOMEM */ |
| 714 | } | 717 | } |
| 715 | 718 | ||
| 716 | ret = register_trace_kprobe(tk); | 719 | ret = register_trace_kprobe(tk); |
| 717 | if (ret) | 720 | if (ret) { |
| 721 | trace_probe_log_set_index(1); | ||
| 722 | if (ret == -EILSEQ) | ||
| 723 | trace_probe_log_err(0, BAD_INSN_BNDRY); | ||
| 724 | else if (ret == -ENOENT) | ||
| 725 | trace_probe_log_err(0, BAD_PROBE_ADDR); | ||
| 726 | else if (ret != -ENOMEM) | ||
| 727 | trace_probe_log_err(0, FAIL_REG_PROBE); | ||
| 718 | goto error; | 728 | goto error; |
| 729 | } | ||
| 730 | |||
| 719 | out: | 731 | out: |
| 732 | trace_probe_log_clear(); | ||
| 720 | kfree(symbol); | 733 | kfree(symbol); |
| 721 | return ret; | 734 | return ret; |
| 722 | 735 | ||
| 736 | parse_error: | ||
| 737 | ret = -EINVAL; | ||
| 723 | error: | 738 | error: |
| 724 | free_trace_kprobe(tk); | 739 | free_trace_kprobe(tk); |
| 725 | goto out; | 740 | goto out; |
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 54373d93e251..ba751f993c3b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
| @@ -1057,7 +1057,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, | |||
| 1057 | 1057 | ||
| 1058 | trace_seq_puts(s, "<stack trace>\n"); | 1058 | trace_seq_puts(s, "<stack trace>\n"); |
| 1059 | 1059 | ||
| 1060 | for (p = field->caller; p && *p != ULONG_MAX && p < end; p++) { | 1060 | for (p = field->caller; p && p < end && *p != ULONG_MAX; p++) { |
| 1061 | 1061 | ||
| 1062 | if (trace_seq_has_overflowed(s)) | 1062 | if (trace_seq_has_overflowed(s)) |
| 1063 | break; | 1063 | break; |
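The reordered condition above is a short-circuit-evaluation fix: with "p < end" tested before "*p != ULONG_MAX", the terminator check can never dereference one element past the caller array. A stand-alone illustration with invented data (not kernel code):

#include <limits.h>
#include <stdio.h>

int main(void)
{
	/* No ULONG_MAX terminator present, so the bound must stop the loop. */
	unsigned long caller[4] = { 1, 2, 3, 4 };
	unsigned long *end = caller + 4;
	unsigned long *p;

	for (p = caller; p && p < end && *p != ULONG_MAX; p++)
		printf("%lu\n", *p);	/* 1 2 3 4, then stops at the bound */

	return 0;
}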
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 8f8411e7835f..a347faced959 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c | |||
| @@ -13,6 +13,11 @@ | |||
| 13 | 13 | ||
| 14 | #include "trace_probe.h" | 14 | #include "trace_probe.h" |
| 15 | 15 | ||
| 16 | #undef C | ||
| 17 | #define C(a, b) b | ||
| 18 | |||
| 19 | static const char *trace_probe_err_text[] = { ERRORS }; | ||
| 20 | |||
| 16 | static const char *reserved_field_names[] = { | 21 | static const char *reserved_field_names[] = { |
| 17 | "common_type", | 22 | "common_type", |
| 18 | "common_flags", | 23 | "common_flags", |
| @@ -133,6 +138,60 @@ fail: | |||
| 133 | return NULL; | 138 | return NULL; |
| 134 | } | 139 | } |
| 135 | 140 | ||
| 141 | static struct trace_probe_log trace_probe_log; | ||
| 142 | |||
| 143 | void trace_probe_log_init(const char *subsystem, int argc, const char **argv) | ||
| 144 | { | ||
| 145 | trace_probe_log.subsystem = subsystem; | ||
| 146 | trace_probe_log.argc = argc; | ||
| 147 | trace_probe_log.argv = argv; | ||
| 148 | trace_probe_log.index = 0; | ||
| 149 | } | ||
| 150 | |||
| 151 | void trace_probe_log_clear(void) | ||
| 152 | { | ||
| 153 | memset(&trace_probe_log, 0, sizeof(trace_probe_log)); | ||
| 154 | } | ||
| 155 | |||
| 156 | void trace_probe_log_set_index(int index) | ||
| 157 | { | ||
| 158 | trace_probe_log.index = index; | ||
| 159 | } | ||
| 160 | |||
| 161 | void __trace_probe_log_err(int offset, int err_type) | ||
| 162 | { | ||
| 163 | char *command, *p; | ||
| 164 | int i, len = 0, pos = 0; | ||
| 165 | |||
| 166 | if (!trace_probe_log.argv) | ||
| 167 | return; | ||
| 168 | |||
| 169 | /* Recalculate the length and allocate buffer */ | ||
| 170 | for (i = 0; i < trace_probe_log.argc; i++) { | ||
| 171 | if (i == trace_probe_log.index) | ||
| 172 | pos = len; | ||
| 173 | len += strlen(trace_probe_log.argv[i]) + 1; | ||
| 174 | } | ||
| 175 | command = kzalloc(len, GFP_KERNEL); | ||
| 176 | if (!command) | ||
| 177 | return; | ||
| 178 | |||
| 179 | /* And make a command string from argv array */ | ||
| 180 | p = command; | ||
| 181 | for (i = 0; i < trace_probe_log.argc; i++) { | ||
| 182 | len = strlen(trace_probe_log.argv[i]); | ||
| 183 | strcpy(p, trace_probe_log.argv[i]); | ||
| 184 | p[len] = ' '; | ||
| 185 | p += len + 1; | ||
| 186 | } | ||
| 187 | *(p - 1) = '\0'; | ||
| 188 | |||
| 189 | tracing_log_err(NULL, trace_probe_log.subsystem, command, | ||
| 190 | trace_probe_err_text, err_type, pos + offset); | ||
| 191 | |||
| 192 | kfree(command); | ||
| 193 | } | ||
| 194 | |||
| 136 | /* Split symbol and offset. */ | 195 | /* Split symbol and offset. */ |
| 137 | int traceprobe_split_symbol_offset(char *symbol, long *offset) | 196 | int traceprobe_split_symbol_offset(char *symbol, long *offset) |
| 138 | { | 197 | { |
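__trace_probe_log_err() above rejoins argv into one command string, remembers where the indexed argument starts (pos), and hands "pos + offset" to tracing_log_err() as the error position. A user-space sketch of how that single position addresses a character inside the joined command (the example command and the caret printout are invented for illustration):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *argv[] = { "p:myprobe", "do_sys_open", "arg1=$retval" };
	int argc = 3, index = 2;	/* report an error in argv[2]... */
	int offset = 5;			/* ...at the '$' inside "arg1=$retval" */
	char command[128] = "";
	int i, pos = 0, len = 0;

	/* Same walk as the kernel code: total length, plus the start of argv[index]. */
	for (i = 0; i < argc; i++) {
		if (i == index)
			pos = len;
		len += strlen(argv[i]) + 1;	/* +1 for the joining space */
	}

	for (i = 0; i < argc; i++) {
		strcat(command, argv[i]);
		if (i < argc - 1)
			strcat(command, " ");
	}

	printf("%s\n", command);
	printf("%*s^ error here\n", pos + offset, "");
	return 0;
}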
| @@ -156,7 +215,7 @@ int traceprobe_split_symbol_offset(char *symbol, long *offset) | |||
| 156 | 215 | ||
| 157 | /* @buf must has MAX_EVENT_NAME_LEN size */ | 216 | /* @buf must has MAX_EVENT_NAME_LEN size */ |
| 158 | int traceprobe_parse_event_name(const char **pevent, const char **pgroup, | 217 | int traceprobe_parse_event_name(const char **pevent, const char **pgroup, |
| 159 | char *buf) | 218 | char *buf, int offset) |
| 160 | { | 219 | { |
| 161 | const char *slash, *event = *pevent; | 220 | const char *slash, *event = *pevent; |
| 162 | int len; | 221 | int len; |
| @@ -164,32 +223,33 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, | |||
| 164 | slash = strchr(event, '/'); | 223 | slash = strchr(event, '/'); |
| 165 | if (slash) { | 224 | if (slash) { |
| 166 | if (slash == event) { | 225 | if (slash == event) { |
| 167 | pr_info("Group name is not specified\n"); | 226 | trace_probe_log_err(offset, NO_GROUP_NAME); |
| 168 | return -EINVAL; | 227 | return -EINVAL; |
| 169 | } | 228 | } |
| 170 | if (slash - event + 1 > MAX_EVENT_NAME_LEN) { | 229 | if (slash - event + 1 > MAX_EVENT_NAME_LEN) { |
| 171 | pr_info("Group name is too long\n"); | 230 | trace_probe_log_err(offset, GROUP_TOO_LONG); |
| 172 | return -E2BIG; | 231 | return -EINVAL; |
| 173 | } | 232 | } |
| 174 | strlcpy(buf, event, slash - event + 1); | 233 | strlcpy(buf, event, slash - event + 1); |
| 175 | if (!is_good_name(buf)) { | 234 | if (!is_good_name(buf)) { |
| 176 | pr_info("Group name must follow the same rules as C identifiers\n"); | 235 | trace_probe_log_err(offset, BAD_GROUP_NAME); |
| 177 | return -EINVAL; | 236 | return -EINVAL; |
| 178 | } | 237 | } |
| 179 | *pgroup = buf; | 238 | *pgroup = buf; |
| 180 | *pevent = slash + 1; | 239 | *pevent = slash + 1; |
| 240 | offset += slash - event + 1; | ||
| 181 | event = *pevent; | 241 | event = *pevent; |
| 182 | } | 242 | } |
| 183 | len = strlen(event); | 243 | len = strlen(event); |
| 184 | if (len == 0) { | 244 | if (len == 0) { |
| 185 | pr_info("Event name is not specified\n"); | 245 | trace_probe_log_err(offset, NO_EVENT_NAME); |
| 186 | return -EINVAL; | 246 | return -EINVAL; |
| 187 | } else if (len > MAX_EVENT_NAME_LEN) { | 247 | } else if (len > MAX_EVENT_NAME_LEN) { |
| 188 | pr_info("Event name is too long\n"); | 248 | trace_probe_log_err(offset, EVENT_TOO_LONG); |
| 189 | return -E2BIG; | 249 | return -EINVAL; |
| 190 | } | 250 | } |
| 191 | if (!is_good_name(event)) { | 251 | if (!is_good_name(event)) { |
| 192 | pr_info("Event name must follow the same rules as C identifiers\n"); | 252 | trace_probe_log_err(offset, BAD_EVENT_NAME); |
| 193 | return -EINVAL; | 253 | return -EINVAL; |
| 194 | } | 254 | } |
| 195 | return 0; | 255 | return 0; |
| @@ -198,56 +258,67 @@ int traceprobe_parse_event_name(const char **pevent, const char **pgroup, | |||
| 198 | #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) | 258 | #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) |
| 199 | 259 | ||
| 200 | static int parse_probe_vars(char *arg, const struct fetch_type *t, | 260 | static int parse_probe_vars(char *arg, const struct fetch_type *t, |
| 201 | struct fetch_insn *code, unsigned int flags) | 261 | struct fetch_insn *code, unsigned int flags, int offs) |
| 202 | { | 262 | { |
| 203 | unsigned long param; | 263 | unsigned long param; |
| 204 | int ret = 0; | 264 | int ret = 0; |
| 205 | int len; | 265 | int len; |
| 206 | 266 | ||
| 207 | if (strcmp(arg, "retval") == 0) { | 267 | if (strcmp(arg, "retval") == 0) { |
| 208 | if (flags & TPARG_FL_RETURN) | 268 | if (flags & TPARG_FL_RETURN) { |
| 209 | code->op = FETCH_OP_RETVAL; | 269 | code->op = FETCH_OP_RETVAL; |
| 210 | else | 270 | } else { |
| 271 | trace_probe_log_err(offs, RETVAL_ON_PROBE); | ||
| 211 | ret = -EINVAL; | 272 | ret = -EINVAL; |
| 273 | } | ||
| 212 | } else if ((len = str_has_prefix(arg, "stack"))) { | 274 | } else if ((len = str_has_prefix(arg, "stack"))) { |
| 213 | if (arg[len] == '\0') { | 275 | if (arg[len] == '\0') { |
| 214 | code->op = FETCH_OP_STACKP; | 276 | code->op = FETCH_OP_STACKP; |
| 215 | } else if (isdigit(arg[len])) { | 277 | } else if (isdigit(arg[len])) { |
| 216 | ret = kstrtoul(arg + len, 10, ¶m); | 278 | ret = kstrtoul(arg + len, 10, ¶m); |
| 217 | if (ret || ((flags & TPARG_FL_KERNEL) && | 279 | if (ret) { |
| 218 | param > PARAM_MAX_STACK)) | 280 | goto inval_var; |
| 281 | } else if ((flags & TPARG_FL_KERNEL) && | ||
| 282 | param > PARAM_MAX_STACK) { | ||
| 283 | trace_probe_log_err(offs, BAD_STACK_NUM); | ||
| 219 | ret = -EINVAL; | 284 | ret = -EINVAL; |
| 220 | else { | 285 | } else { |
| 221 | code->op = FETCH_OP_STACK; | 286 | code->op = FETCH_OP_STACK; |
| 222 | code->param = (unsigned int)param; | 287 | code->param = (unsigned int)param; |
| 223 | } | 288 | } |
| 224 | } else | 289 | } else |
| 225 | ret = -EINVAL; | 290 | goto inval_var; |
| 226 | } else if (strcmp(arg, "comm") == 0) { | 291 | } else if (strcmp(arg, "comm") == 0) { |
| 227 | code->op = FETCH_OP_COMM; | 292 | code->op = FETCH_OP_COMM; |
| 228 | #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API | 293 | #ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API |
| 229 | } else if (((flags & TPARG_FL_MASK) == | 294 | } else if (((flags & TPARG_FL_MASK) == |
| 230 | (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && | 295 | (TPARG_FL_KERNEL | TPARG_FL_FENTRY)) && |
| 231 | (len = str_has_prefix(arg, "arg"))) { | 296 | (len = str_has_prefix(arg, "arg"))) { |
| 232 | if (!isdigit(arg[len])) | ||
| 233 | return -EINVAL; | ||
| 234 | ret = kstrtoul(arg + len, 10, ¶m); | 297 | ret = kstrtoul(arg + len, 10, ¶m); |
| 235 | if (ret || !param || param > PARAM_MAX_STACK) | 298 | if (ret) { |
| 299 | goto inval_var; | ||
| 300 | } else if (!param || param > PARAM_MAX_STACK) { | ||
| 301 | trace_probe_log_err(offs, BAD_ARG_NUM); | ||
| 236 | return -EINVAL; | 302 | return -EINVAL; |
| 303 | } | ||
| 237 | code->op = FETCH_OP_ARG; | 304 | code->op = FETCH_OP_ARG; |
| 238 | code->param = (unsigned int)param - 1; | 305 | code->param = (unsigned int)param - 1; |
| 239 | #endif | 306 | #endif |
| 240 | } else | 307 | } else |
| 241 | ret = -EINVAL; | 308 | goto inval_var; |
| 242 | 309 | ||
| 243 | return ret; | 310 | return ret; |
| 311 | |||
| 312 | inval_var: | ||
| 313 | trace_probe_log_err(offs, BAD_VAR); | ||
| 314 | return -EINVAL; | ||
| 244 | } | 315 | } |
| 245 | 316 | ||
| 246 | /* Recursive argument parser */ | 317 | /* Recursive argument parser */ |
| 247 | static int | 318 | static int |
| 248 | parse_probe_arg(char *arg, const struct fetch_type *type, | 319 | parse_probe_arg(char *arg, const struct fetch_type *type, |
| 249 | struct fetch_insn **pcode, struct fetch_insn *end, | 320 | struct fetch_insn **pcode, struct fetch_insn *end, |
| 250 | unsigned int flags) | 321 | unsigned int flags, int offs) |
| 251 | { | 322 | { |
| 252 | struct fetch_insn *code = *pcode; | 323 | struct fetch_insn *code = *pcode; |
| 253 | unsigned long param; | 324 | unsigned long param; |
| @@ -257,7 +328,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, | |||
| 257 | 328 | ||
| 258 | switch (arg[0]) { | 329 | switch (arg[0]) { |
| 259 | case '$': | 330 | case '$': |
| 260 | ret = parse_probe_vars(arg + 1, type, code, flags); | 331 | ret = parse_probe_vars(arg + 1, type, code, flags, offs); |
| 261 | break; | 332 | break; |
| 262 | 333 | ||
| 263 | case '%': /* named register */ | 334 | case '%': /* named register */ |
| @@ -266,47 +337,57 @@ parse_probe_arg(char *arg, const struct fetch_type *type, | |||
| 266 | code->op = FETCH_OP_REG; | 337 | code->op = FETCH_OP_REG; |
| 267 | code->param = (unsigned int)ret; | 338 | code->param = (unsigned int)ret; |
| 268 | ret = 0; | 339 | ret = 0; |
| 269 | } | 340 | } else |
| 341 | trace_probe_log_err(offs, BAD_REG_NAME); | ||
| 270 | break; | 342 | break; |
| 271 | 343 | ||
| 272 | case '@': /* memory, file-offset or symbol */ | 344 | case '@': /* memory, file-offset or symbol */ |
| 273 | if (isdigit(arg[1])) { | 345 | if (isdigit(arg[1])) { |
| 274 | ret = kstrtoul(arg + 1, 0, ¶m); | 346 | ret = kstrtoul(arg + 1, 0, ¶m); |
| 275 | if (ret) | 347 | if (ret) { |
| 348 | trace_probe_log_err(offs, BAD_MEM_ADDR); | ||
| 276 | break; | 349 | break; |
| 350 | } | ||
| 277 | /* load address */ | 351 | /* load address */ |
| 278 | code->op = FETCH_OP_IMM; | 352 | code->op = FETCH_OP_IMM; |
| 279 | code->immediate = param; | 353 | code->immediate = param; |
| 280 | } else if (arg[1] == '+') { | 354 | } else if (arg[1] == '+') { |
| 281 | /* kprobes don't support file offsets */ | 355 | /* kprobes don't support file offsets */ |
| 282 | if (flags & TPARG_FL_KERNEL) | 356 | if (flags & TPARG_FL_KERNEL) { |
| 357 | trace_probe_log_err(offs, FILE_ON_KPROBE); | ||
| 283 | return -EINVAL; | 358 | return -EINVAL; |
| 284 | 359 | } | |
| 285 | ret = kstrtol(arg + 2, 0, &offset); | 360 | ret = kstrtol(arg + 2, 0, &offset); |
| 286 | if (ret) | 361 | if (ret) { |
| 362 | trace_probe_log_err(offs, BAD_FILE_OFFS); | ||
| 287 | break; | 363 | break; |
| 364 | } | ||
| 288 | 365 | ||
| 289 | code->op = FETCH_OP_FOFFS; | 366 | code->op = FETCH_OP_FOFFS; |
| 290 | code->immediate = (unsigned long)offset; // imm64? | 367 | code->immediate = (unsigned long)offset; // imm64? |
| 291 | } else { | 368 | } else { |
| 292 | /* uprobes don't support symbols */ | 369 | /* uprobes don't support symbols */ |
| 293 | if (!(flags & TPARG_FL_KERNEL)) | 370 | if (!(flags & TPARG_FL_KERNEL)) { |
| 371 | trace_probe_log_err(offs, SYM_ON_UPROBE); | ||
| 294 | return -EINVAL; | 372 | return -EINVAL; |
| 295 | 373 | } | |
| 296 | /* Preserve symbol for updating */ | 374 | /* Preserve symbol for updating */ |
| 297 | code->op = FETCH_NOP_SYMBOL; | 375 | code->op = FETCH_NOP_SYMBOL; |
| 298 | code->data = kstrdup(arg + 1, GFP_KERNEL); | 376 | code->data = kstrdup(arg + 1, GFP_KERNEL); |
| 299 | if (!code->data) | 377 | if (!code->data) |
| 300 | return -ENOMEM; | 378 | return -ENOMEM; |
| 301 | if (++code == end) | 379 | if (++code == end) { |
| 302 | return -E2BIG; | 380 | trace_probe_log_err(offs, TOO_MANY_OPS); |
| 303 | 381 | return -EINVAL; | |
| 382 | } | ||
| 304 | code->op = FETCH_OP_IMM; | 383 | code->op = FETCH_OP_IMM; |
| 305 | code->immediate = 0; | 384 | code->immediate = 0; |
| 306 | } | 385 | } |
| 307 | /* These are fetching from memory */ | 386 | /* These are fetching from memory */ |
| 308 | if (++code == end) | 387 | if (++code == end) { |
| 309 | return -E2BIG; | 388 | trace_probe_log_err(offs, TOO_MANY_OPS); |
| 389 | return -EINVAL; | ||
| 390 | } | ||
| 310 | *pcode = code; | 391 | *pcode = code; |
| 311 | code->op = FETCH_OP_DEREF; | 392 | code->op = FETCH_OP_DEREF; |
| 312 | code->offset = offset; | 393 | code->offset = offset; |
| @@ -317,28 +398,38 @@ parse_probe_arg(char *arg, const struct fetch_type *type, | |||
| 317 | /* fall through */ | 398 | /* fall through */ |
| 318 | case '-': | 399 | case '-': |
| 319 | tmp = strchr(arg, '('); | 400 | tmp = strchr(arg, '('); |
| 320 | if (!tmp) | 401 | if (!tmp) { |
| 402 | trace_probe_log_err(offs, DEREF_NEED_BRACE); | ||
| 321 | return -EINVAL; | 403 | return -EINVAL; |
| 322 | 404 | } | |
| 323 | *tmp = '\0'; | 405 | *tmp = '\0'; |
| 324 | ret = kstrtol(arg, 0, &offset); | 406 | ret = kstrtol(arg, 0, &offset); |
| 325 | if (ret) | 407 | if (ret) { |
| 408 | trace_probe_log_err(offs, BAD_DEREF_OFFS); | ||
| 326 | break; | 409 | break; |
| 327 | 410 | } | |
| 411 | offs += (tmp + 1 - arg) + (arg[0] != '-' ? 1 : 0); | ||
| 328 | arg = tmp + 1; | 412 | arg = tmp + 1; |
| 329 | tmp = strrchr(arg, ')'); | 413 | tmp = strrchr(arg, ')'); |
| 330 | 414 | if (!tmp) { | |
| 331 | if (tmp) { | 415 | trace_probe_log_err(offs + strlen(arg), |
| 416 | DEREF_OPEN_BRACE); | ||
| 417 | return -EINVAL; | ||
| 418 | } else { | ||
| 332 | const struct fetch_type *t2 = find_fetch_type(NULL); | 419 | const struct fetch_type *t2 = find_fetch_type(NULL); |
| 333 | 420 | ||
| 334 | *tmp = '\0'; | 421 | *tmp = '\0'; |
| 335 | ret = parse_probe_arg(arg, t2, &code, end, flags); | 422 | ret = parse_probe_arg(arg, t2, &code, end, flags, offs); |
| 336 | if (ret) | 423 | if (ret) |
| 337 | break; | 424 | break; |
| 338 | if (code->op == FETCH_OP_COMM) | 425 | if (code->op == FETCH_OP_COMM) { |
| 426 | trace_probe_log_err(offs, COMM_CANT_DEREF); | ||
| 339 | return -EINVAL; | 427 | return -EINVAL; |
| 340 | if (++code == end) | 428 | } |
| 341 | return -E2BIG; | 429 | if (++code == end) { |
| 430 | trace_probe_log_err(offs, TOO_MANY_OPS); | ||
| 431 | return -EINVAL; | ||
| 432 | } | ||
| 342 | *pcode = code; | 433 | *pcode = code; |
| 343 | 434 | ||
| 344 | code->op = FETCH_OP_DEREF; | 435 | code->op = FETCH_OP_DEREF; |
| @@ -348,6 +439,7 @@ parse_probe_arg(char *arg, const struct fetch_type *type, | |||
| 348 | } | 439 | } |
| 349 | if (!ret && code->op == FETCH_OP_NOP) { | 440 | if (!ret && code->op == FETCH_OP_NOP) { |
| 350 | /* Parsed, but do not find fetch method */ | 441 | /* Parsed, but do not find fetch method */ |
| 442 | trace_probe_log_err(offs, BAD_FETCH_ARG); | ||
| 351 | ret = -EINVAL; | 443 | ret = -EINVAL; |
| 352 | } | 444 | } |
| 353 | return ret; | 445 | return ret; |
| @@ -379,7 +471,7 @@ static int __parse_bitfield_probe_arg(const char *bf, | |||
| 379 | return -EINVAL; | 471 | return -EINVAL; |
| 380 | code++; | 472 | code++; |
| 381 | if (code->op != FETCH_OP_NOP) | 473 | if (code->op != FETCH_OP_NOP) |
| 382 | return -E2BIG; | 474 | return -EINVAL; |
| 383 | *pcode = code; | 475 | *pcode = code; |
| 384 | 476 | ||
| 385 | code->op = FETCH_OP_MOD_BF; | 477 | code->op = FETCH_OP_MOD_BF; |
| @@ -392,44 +484,66 @@ static int __parse_bitfield_probe_arg(const char *bf, | |||
| 392 | 484 | ||
| 393 | /* String length checking wrapper */ | 485 | /* String length checking wrapper */ |
| 394 | static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, | 486 | static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, |
| 395 | struct probe_arg *parg, unsigned int flags) | 487 | struct probe_arg *parg, unsigned int flags, int offset) |
| 396 | { | 488 | { |
| 397 | struct fetch_insn *code, *scode, *tmp = NULL; | 489 | struct fetch_insn *code, *scode, *tmp = NULL; |
| 398 | char *t, *t2; | 490 | char *t, *t2, *t3; |
| 399 | int ret, len; | 491 | int ret, len; |
| 400 | 492 | ||
| 401 | if (strlen(arg) > MAX_ARGSTR_LEN) { | 493 | len = strlen(arg); |
| 402 | pr_info("Argument is too long.: %s\n", arg); | 494 | if (len > MAX_ARGSTR_LEN) { |
| 403 | return -ENOSPC; | 495 | trace_probe_log_err(offset, ARG_TOO_LONG); |
| 496 | return -EINVAL; | ||
| 497 | } else if (len == 0) { | ||
| 498 | trace_probe_log_err(offset, NO_ARG_BODY); | ||
| 499 | return -EINVAL; | ||
| 404 | } | 500 | } |
| 501 | |||
| 405 | parg->comm = kstrdup(arg, GFP_KERNEL); | 502 | parg->comm = kstrdup(arg, GFP_KERNEL); |
| 406 | if (!parg->comm) { | 503 | if (!parg->comm) |
| 407 | pr_info("Failed to allocate memory for command '%s'.\n", arg); | ||
| 408 | return -ENOMEM; | 504 | return -ENOMEM; |
| 409 | } | 505 | |
| 410 | t = strchr(arg, ':'); | 506 | t = strchr(arg, ':'); |
| 411 | if (t) { | 507 | if (t) { |
| 412 | *t = '\0'; | 508 | *t = '\0'; |
| 413 | t2 = strchr(++t, '['); | 509 | t2 = strchr(++t, '['); |
| 414 | if (t2) { | 510 | if (t2) { |
| 415 | *t2 = '\0'; | 511 | *t2++ = '\0'; |
| 416 | parg->count = simple_strtoul(t2 + 1, &t2, 0); | 512 | t3 = strchr(t2, ']'); |
| 417 | if (strcmp(t2, "]") || parg->count == 0) | 513 | if (!t3) { |
| 514 | offset += t2 + strlen(t2) - arg; | ||
| 515 | trace_probe_log_err(offset, | ||
| 516 | ARRAY_NO_CLOSE); | ||
| 517 | return -EINVAL; | ||
| 518 | } else if (t3[1] != '\0') { | ||
| 519 | trace_probe_log_err(offset + t3 + 1 - arg, | ||
| 520 | BAD_ARRAY_SUFFIX); | ||
| 418 | return -EINVAL; | 521 | return -EINVAL; |
| 419 | if (parg->count > MAX_ARRAY_LEN) | 522 | } |
| 420 | return -E2BIG; | 523 | *t3 = '\0'; |
| 524 | if (kstrtouint(t2, 0, &parg->count) || !parg->count) { | ||
| 525 | trace_probe_log_err(offset + t2 - arg, | ||
| 526 | BAD_ARRAY_NUM); | ||
| 527 | return -EINVAL; | ||
| 528 | } | ||
| 529 | if (parg->count > MAX_ARRAY_LEN) { | ||
| 530 | trace_probe_log_err(offset + t2 - arg, | ||
| 531 | ARRAY_TOO_BIG); | ||
| 532 | return -EINVAL; | ||
| 533 | } | ||
| 421 | } | 534 | } |
| 422 | } | 535 | } |
| 423 | /* | 536 | |
| 424 | * The default type of $comm should be "string", and it can't be | 537 | /* Since $comm can not be dereferred, we can find $comm by strcmp */ |
| 425 | * dereferenced. | 538 | if (strcmp(arg, "$comm") == 0) { |
| 426 | */ | 539 | /* The type of $comm must be "string", and not an array. */ |
| 427 | if (!t && strcmp(arg, "$comm") == 0) | 540 | if (parg->count || (t && strcmp(t, "string"))) |
| 541 | return -EINVAL; | ||
| 428 | parg->type = find_fetch_type("string"); | 542 | parg->type = find_fetch_type("string"); |
| 429 | else | 543 | } else |
| 430 | parg->type = find_fetch_type(t); | 544 | parg->type = find_fetch_type(t); |
| 431 | if (!parg->type) { | 545 | if (!parg->type) { |
| 432 | pr_info("Unsupported type: %s\n", t); | 546 | trace_probe_log_err(offset + (t ? (t - arg) : 0), BAD_TYPE); |
| 433 | return -EINVAL; | 547 | return -EINVAL; |
| 434 | } | 548 | } |
| 435 | parg->offset = *size; | 549 | parg->offset = *size; |
| @@ -444,13 +558,13 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, | |||
| 444 | parg->count); | 558 | parg->count); |
| 445 | } | 559 | } |
| 446 | 560 | ||
| 447 | code = tmp = kzalloc(sizeof(*code) * FETCH_INSN_MAX, GFP_KERNEL); | 561 | code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL); |
| 448 | if (!code) | 562 | if (!code) |
| 449 | return -ENOMEM; | 563 | return -ENOMEM; |
| 450 | code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; | 564 | code[FETCH_INSN_MAX - 1].op = FETCH_OP_END; |
| 451 | 565 | ||
| 452 | ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], | 566 | ret = parse_probe_arg(arg, parg->type, &code, &code[FETCH_INSN_MAX - 1], |
| 453 | flags); | 567 | flags, offset); |
| 454 | if (ret) | 568 | if (ret) |
| 455 | goto fail; | 569 | goto fail; |
| 456 | 570 | ||
| @@ -458,7 +572,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, | |||
| 458 | if (!strcmp(parg->type->name, "string")) { | 572 | if (!strcmp(parg->type->name, "string")) { |
| 459 | if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && | 573 | if (code->op != FETCH_OP_DEREF && code->op != FETCH_OP_IMM && |
| 460 | code->op != FETCH_OP_COMM) { | 574 | code->op != FETCH_OP_COMM) { |
| 461 | pr_info("string only accepts memory or address.\n"); | 575 | trace_probe_log_err(offset + (t ? (t - arg) : 0), |
| 576 | BAD_STRING); | ||
| 462 | ret = -EINVAL; | 577 | ret = -EINVAL; |
| 463 | goto fail; | 578 | goto fail; |
| 464 | } | 579 | } |
| @@ -470,7 +585,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, | |||
| 470 | */ | 585 | */ |
| 471 | code++; | 586 | code++; |
| 472 | if (code->op != FETCH_OP_NOP) { | 587 | if (code->op != FETCH_OP_NOP) { |
| 473 | ret = -E2BIG; | 588 | trace_probe_log_err(offset, TOO_MANY_OPS); |
| 589 | ret = -EINVAL; | ||
| 474 | goto fail; | 590 | goto fail; |
| 475 | } | 591 | } |
| 476 | } | 592 | } |
| @@ -483,7 +599,8 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, | |||
| 483 | } else { | 599 | } else { |
| 484 | code++; | 600 | code++; |
| 485 | if (code->op != FETCH_OP_NOP) { | 601 | if (code->op != FETCH_OP_NOP) { |
| 486 | ret = -E2BIG; | 602 | trace_probe_log_err(offset, TOO_MANY_OPS); |
| 603 | ret = -EINVAL; | ||
| 487 | goto fail; | 604 | goto fail; |
| 488 | } | 605 | } |
| 489 | code->op = FETCH_OP_ST_RAW; | 606 | code->op = FETCH_OP_ST_RAW; |
| @@ -493,20 +610,24 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, | |||
| 493 | /* Modify operation */ | 610 | /* Modify operation */ |
| 494 | if (t != NULL) { | 611 | if (t != NULL) { |
| 495 | ret = __parse_bitfield_probe_arg(t, parg->type, &code); | 612 | ret = __parse_bitfield_probe_arg(t, parg->type, &code); |
| 496 | if (ret) | 613 | if (ret) { |
| 614 | trace_probe_log_err(offset + t - arg, BAD_BITFIELD); | ||
| 497 | goto fail; | 615 | goto fail; |
| 616 | } | ||
| 498 | } | 617 | } |
| 499 | /* Loop(Array) operation */ | 618 | /* Loop(Array) operation */ |
| 500 | if (parg->count) { | 619 | if (parg->count) { |
| 501 | if (scode->op != FETCH_OP_ST_MEM && | 620 | if (scode->op != FETCH_OP_ST_MEM && |
| 502 | scode->op != FETCH_OP_ST_STRING) { | 621 | scode->op != FETCH_OP_ST_STRING) { |
| 503 | pr_info("array only accepts memory or address\n"); | 622 | trace_probe_log_err(offset + (t ? (t - arg) : 0), |
| 623 | BAD_STRING); | ||
| 504 | ret = -EINVAL; | 624 | ret = -EINVAL; |
| 505 | goto fail; | 625 | goto fail; |
| 506 | } | 626 | } |
| 507 | code++; | 627 | code++; |
| 508 | if (code->op != FETCH_OP_NOP) { | 628 | if (code->op != FETCH_OP_NOP) { |
| 509 | ret = -E2BIG; | 629 | trace_probe_log_err(offset, TOO_MANY_OPS); |
| 630 | ret = -EINVAL; | ||
| 510 | goto fail; | 631 | goto fail; |
| 511 | } | 632 | } |
| 512 | code->op = FETCH_OP_LP_ARRAY; | 633 | code->op = FETCH_OP_LP_ARRAY; |
| @@ -516,7 +637,7 @@ static int traceprobe_parse_probe_arg_body(char *arg, ssize_t *size, | |||
| 516 | code->op = FETCH_OP_END; | 637 | code->op = FETCH_OP_END; |
| 517 | 638 | ||
| 518 | /* Shrink down the code buffer */ | 639 | /* Shrink down the code buffer */ |
| 519 | parg->code = kzalloc(sizeof(*code) * (code - tmp + 1), GFP_KERNEL); | 640 | parg->code = kcalloc(code - tmp + 1, sizeof(*code), GFP_KERNEL); |
| 520 | if (!parg->code) | 641 | if (!parg->code) |
| 521 | ret = -ENOMEM; | 642 | ret = -ENOMEM; |
| 522 | else | 643 | else |
| @@ -555,15 +676,19 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, | |||
| 555 | { | 676 | { |
| 556 | struct probe_arg *parg = &tp->args[i]; | 677 | struct probe_arg *parg = &tp->args[i]; |
| 557 | char *body; | 678 | char *body; |
| 558 | int ret; | ||
| 559 | 679 | ||
| 560 | /* Increment count for freeing args in error case */ | 680 | /* Increment count for freeing args in error case */ |
| 561 | tp->nr_args++; | 681 | tp->nr_args++; |
| 562 | 682 | ||
| 563 | body = strchr(arg, '='); | 683 | body = strchr(arg, '='); |
| 564 | if (body) { | 684 | if (body) { |
| 565 | if (body - arg > MAX_ARG_NAME_LEN || body == arg) | 685 | if (body - arg > MAX_ARG_NAME_LEN) { |
| 686 | trace_probe_log_err(0, ARG_NAME_TOO_LONG); | ||
| 687 | return -EINVAL; | ||
| 688 | } else if (body == arg) { | ||
| 689 | trace_probe_log_err(0, NO_ARG_NAME); | ||
| 566 | return -EINVAL; | 690 | return -EINVAL; |
| 691 | } | ||
| 567 | parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); | 692 | parg->name = kmemdup_nul(arg, body - arg, GFP_KERNEL); |
| 568 | body++; | 693 | body++; |
| 569 | } else { | 694 | } else { |
| @@ -575,22 +700,16 @@ int traceprobe_parse_probe_arg(struct trace_probe *tp, int i, char *arg, | |||
| 575 | return -ENOMEM; | 700 | return -ENOMEM; |
| 576 | 701 | ||
| 577 | if (!is_good_name(parg->name)) { | 702 | if (!is_good_name(parg->name)) { |
| 578 | pr_info("Invalid argument[%d] name: %s\n", | 703 | trace_probe_log_err(0, BAD_ARG_NAME); |
| 579 | i, parg->name); | ||
| 580 | return -EINVAL; | 704 | return -EINVAL; |
| 581 | } | 705 | } |
| 582 | |||
| 583 | if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { | 706 | if (traceprobe_conflict_field_name(parg->name, tp->args, i)) { |
| 584 | pr_info("Argument[%d]: '%s' conflicts with another field.\n", | 707 | trace_probe_log_err(0, USED_ARG_NAME); |
| 585 | i, parg->name); | ||
| 586 | return -EINVAL; | 708 | return -EINVAL; |
| 587 | } | 709 | } |
| 588 | |||
| 589 | /* Parse fetch argument */ | 710 | /* Parse fetch argument */ |
| 590 | ret = traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags); | 711 | return traceprobe_parse_probe_arg_body(body, &tp->size, parg, flags, |
| 591 | if (ret) | 712 | body - arg); |
| 592 | pr_info("Parse error at argument[%d]. (%d)\n", i, ret); | ||
| 593 | return ret; | ||
| 594 | } | 713 | } |
| 595 | 714 | ||
| 596 | void traceprobe_free_probe_arg(struct probe_arg *arg) | 715 | void traceprobe_free_probe_arg(struct probe_arg *arg) |
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 2177c206de15..f9a8c632188b 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h | |||
| @@ -124,6 +124,7 @@ struct fetch_insn { | |||
| 124 | 124 | ||
| 125 | /* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ | 125 | /* fetch + deref*N + store + mod + end <= 16, this allows N=12, enough */ |
| 126 | #define FETCH_INSN_MAX 16 | 126 | #define FETCH_INSN_MAX 16 |
| 127 | #define FETCH_TOKEN_COMM (-ECOMM) | ||
| 127 | 128 | ||
| 128 | /* Fetch type information table */ | 129 | /* Fetch type information table */ |
| 129 | struct fetch_type { | 130 | struct fetch_type { |
| @@ -280,8 +281,8 @@ extern int traceprobe_update_arg(struct probe_arg *arg); | |||
| 280 | extern void traceprobe_free_probe_arg(struct probe_arg *arg); | 281 | extern void traceprobe_free_probe_arg(struct probe_arg *arg); |
| 281 | 282 | ||
| 282 | extern int traceprobe_split_symbol_offset(char *symbol, long *offset); | 283 | extern int traceprobe_split_symbol_offset(char *symbol, long *offset); |
| 283 | extern int traceprobe_parse_event_name(const char **pevent, | 284 | int traceprobe_parse_event_name(const char **pevent, const char **pgroup, |
| 284 | const char **pgroup, char *buf); | 285 | char *buf, int offset); |
| 285 | 286 | ||
| 286 | extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); | 287 | extern int traceprobe_set_print_fmt(struct trace_probe *tp, bool is_return); |
| 287 | 288 | ||
| @@ -298,3 +299,76 @@ extern void destroy_local_trace_uprobe(struct trace_event_call *event_call); | |||
| 298 | #endif | 299 | #endif |
| 299 | extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, | 300 | extern int traceprobe_define_arg_fields(struct trace_event_call *event_call, |
| 300 | size_t offset, struct trace_probe *tp); | 301 | size_t offset, struct trace_probe *tp); |
| 302 | |||
| 303 | #undef ERRORS | ||
| 304 | #define ERRORS \ | ||
| 305 | C(FILE_NOT_FOUND, "Failed to find the given file"), \ | ||
| 306 | C(NO_REGULAR_FILE, "Not a regular file"), \ | ||
| 307 | C(BAD_REFCNT, "Invalid reference counter offset"), \ | ||
| 308 | C(REFCNT_OPEN_BRACE, "Reference counter brace is not closed"), \ | ||
| 309 | C(BAD_REFCNT_SUFFIX, "Reference counter has wrong suffix"), \ | ||
| 310 | C(BAD_UPROBE_OFFS, "Invalid uprobe offset"), \ | ||
| 311 | C(MAXACT_NO_KPROBE, "Maxactive is not for kprobe"), \ | ||
| 312 | C(BAD_MAXACT, "Invalid maxactive number"), \ | ||
| 313 | C(MAXACT_TOO_BIG, "Maxactive is too big"), \ | ||
| 314 | C(BAD_PROBE_ADDR, "Invalid probed address or symbol"), \ | ||
| 315 | C(BAD_RETPROBE, "Retprobe address must be a function entry"), \ | ||
| 316 | C(NO_GROUP_NAME, "Group name is not specified"), \ | ||
| 317 | C(GROUP_TOO_LONG, "Group name is too long"), \ | ||
| 318 | C(BAD_GROUP_NAME, "Group name must follow the same rules as C identifiers"), \ | ||
| 319 | C(NO_EVENT_NAME, "Event name is not specified"), \ | ||
| 320 | C(EVENT_TOO_LONG, "Event name is too long"), \ | ||
| 321 | C(BAD_EVENT_NAME, "Event name must follow the same rules as C identifiers"), \ | ||
| 322 | C(RETVAL_ON_PROBE, "$retval is not available on probe"), \ | ||
| 323 | C(BAD_STACK_NUM, "Invalid stack number"), \ | ||
| 324 | C(BAD_ARG_NUM, "Invalid argument number"), \ | ||
| 325 | C(BAD_VAR, "Invalid $-variable specified"), \ | ||
| 326 | C(BAD_REG_NAME, "Invalid register name"), \ | ||
| 327 | C(BAD_MEM_ADDR, "Invalid memory address"), \ | ||
| 328 | C(FILE_ON_KPROBE, "File offset is not available with kprobe"), \ | ||
| 329 | C(BAD_FILE_OFFS, "Invalid file offset value"), \ | ||
| 330 | C(SYM_ON_UPROBE, "Symbol is not available with uprobe"), \ | ||
| 331 | C(TOO_MANY_OPS, "Dereference is too much nested"), \ | ||
| 332 | C(DEREF_NEED_BRACE, "Dereference needs a brace"), \ | ||
| 333 | C(BAD_DEREF_OFFS, "Invalid dereference offset"), \ | ||
| 334 | C(DEREF_OPEN_BRACE, "Dereference brace is not closed"), \ | ||
| 335 | C(COMM_CANT_DEREF, "$comm can not be dereferenced"), \ | ||
| 336 | C(BAD_FETCH_ARG, "Invalid fetch argument"), \ | ||
| 337 | C(ARRAY_NO_CLOSE, "Array is not closed"), \ | ||
| 338 | C(BAD_ARRAY_SUFFIX, "Array has wrong suffix"), \ | ||
| 339 | C(BAD_ARRAY_NUM, "Invalid array size"), \ | ||
| 340 | C(ARRAY_TOO_BIG, "Array number is too big"), \ | ||
| 341 | C(BAD_TYPE, "Unknown type is specified"), \ | ||
| 342 | C(BAD_STRING, "String accepts only memory argument"), \ | ||
| 343 | C(BAD_BITFIELD, "Invalid bitfield"), \ | ||
| 344 | C(ARG_NAME_TOO_LONG, "Argument name is too long"), \ | ||
| 345 | C(NO_ARG_NAME, "Argument name is not specified"), \ | ||
| 346 | C(BAD_ARG_NAME, "Argument name must follow the same rules as C identifiers"), \ | ||
| 347 | C(USED_ARG_NAME, "This argument name is already used"), \ | ||
| 348 | C(ARG_TOO_LONG, "Argument expression is too long"), \ | ||
| 349 | C(NO_ARG_BODY, "No argument expression"), \ | ||
| 350 | C(BAD_INSN_BNDRY, "Probe point is not an instruction boundary"),\ | ||
| 351 | C(FAIL_REG_PROBE, "Failed to register probe event"), | ||
| 352 | |||
| 353 | #undef C | ||
| 354 | #define C(a, b) TP_ERR_##a | ||
| 355 | |||
| 356 | /* Define TP_ERR_ */ | ||
| 357 | enum { ERRORS }; | ||
| 358 | |||
| 359 | /* Error text is defined in trace_probe.c */ | ||
| 360 | |||
| 361 | struct trace_probe_log { | ||
| 362 | const char *subsystem; | ||
| 363 | const char **argv; | ||
| 364 | int argc; | ||
| 365 | int index; | ||
| 366 | }; | ||
| 367 | |||
| 368 | void trace_probe_log_init(const char *subsystem, int argc, const char **argv); | ||
| 369 | void trace_probe_log_set_index(int index); | ||
| 370 | void trace_probe_log_clear(void); | ||
| 371 | void __trace_probe_log_err(int offset, int err); | ||
| 372 | |||
| 373 | #define trace_probe_log_err(offs, err) \ | ||
| 374 | __trace_probe_log_err(offs, TP_ERR_##err) | ||
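The ERRORS list plus the two "#define C(a, b)" expansions above are an X-macro: the header turns each entry into a TP_ERR_* enumerator, while trace_probe.c re-expands the same list into the trace_probe_err_text[] string table, so the enum and the messages can never drift apart. A toy, self-contained version of the pattern (all names invented):

#include <stdio.h>

#define ERRORS \
	C(NOT_FOUND, "item not found"), \
	C(TOO_LONG,  "name is too long"),

#undef C
#define C(a, b) ERR_##a
enum { ERRORS ERR_MAX };		/* ERR_NOT_FOUND, ERR_TOO_LONG, ERR_MAX */

#undef C
#define C(a, b) b
static const char *err_text[] = { ERRORS };

int main(void)
{
	printf("%s\n", err_text[ERR_TOO_LONG]);	/* "name is too long" */
	return 0;
}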
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 4737bb8c07a3..c30c61f12ddd 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h | |||
| @@ -88,7 +88,7 @@ stage3: | |||
| 88 | /* 3rd stage: store value to buffer */ | 88 | /* 3rd stage: store value to buffer */ |
| 89 | if (unlikely(!dest)) { | 89 | if (unlikely(!dest)) { |
| 90 | if (code->op == FETCH_OP_ST_STRING) { | 90 | if (code->op == FETCH_OP_ST_STRING) { |
| 91 | ret += fetch_store_strlen(val + code->offset); | 91 | ret = fetch_store_strlen(val + code->offset); |
| 92 | code++; | 92 | code++; |
| 93 | goto array; | 93 | goto array; |
| 94 | } else | 94 | } else |
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 9d402e7fc949..69ee8ef12cee 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
| @@ -792,7 +792,10 @@ trace_selftest_startup_function_graph(struct tracer *trace, | |||
| 792 | /* check the trace buffer */ | 792 | /* check the trace buffer */ |
| 793 | ret = trace_test_buffer(&tr->trace_buffer, &count); | 793 | ret = trace_test_buffer(&tr->trace_buffer, &count); |
| 794 | 794 | ||
| 795 | trace->reset(tr); | 795 | /* Need to also simulate the tr->reset to remove this fgraph_ops */ |
| 796 | tracing_stop_cmdline_record(); | ||
| 797 | unregister_ftrace_graph(&fgraph_ops); | ||
| 798 | |||
| 796 | tracing_start(); | 799 | tracing_start(); |
| 797 | 800 | ||
| 798 | if (!ret && !count) { | 801 | if (!ret && !count) { |
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index be78d99ee6bc..7860e3f59fad 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c | |||
| @@ -156,7 +156,10 @@ fetch_store_string(unsigned long addr, void *dest, void *base) | |||
| 156 | if (unlikely(!maxlen)) | 156 | if (unlikely(!maxlen)) |
| 157 | return -ENOMEM; | 157 | return -ENOMEM; |
| 158 | 158 | ||
| 159 | ret = strncpy_from_user(dst, src, maxlen); | 159 | if (addr == FETCH_TOKEN_COMM) |
| 160 | ret = strlcpy(dst, current->comm, maxlen); | ||
| 161 | else | ||
| 162 | ret = strncpy_from_user(dst, src, maxlen); | ||
| 160 | if (ret >= 0) { | 163 | if (ret >= 0) { |
| 161 | if (ret == maxlen) | 164 | if (ret == maxlen) |
| 162 | dst[ret - 1] = '\0'; | 165 | dst[ret - 1] = '\0'; |
| @@ -180,7 +183,10 @@ fetch_store_strlen(unsigned long addr) | |||
| 180 | int len; | 183 | int len; |
| 181 | void __user *vaddr = (void __force __user *) addr; | 184 | void __user *vaddr = (void __force __user *) addr; |
| 182 | 185 | ||
| 183 | len = strnlen_user(vaddr, MAX_STRING_SIZE); | 186 | if (addr == FETCH_TOKEN_COMM) |
| 187 | len = strlen(current->comm) + 1; | ||
| 188 | else | ||
| 189 | len = strnlen_user(vaddr, MAX_STRING_SIZE); | ||
| 184 | 190 | ||
| 185 | return (len > MAX_STRING_SIZE) ? 0 : len; | 191 | return (len > MAX_STRING_SIZE) ? 0 : len; |
| 186 | } | 192 | } |
| @@ -220,6 +226,9 @@ process_fetch_insn(struct fetch_insn *code, struct pt_regs *regs, void *dest, | |||
| 220 | case FETCH_OP_IMM: | 226 | case FETCH_OP_IMM: |
| 221 | val = code->immediate; | 227 | val = code->immediate; |
| 222 | break; | 228 | break; |
| 229 | case FETCH_OP_COMM: | ||
| 230 | val = FETCH_TOKEN_COMM; | ||
| 231 | break; | ||
| 223 | case FETCH_OP_FOFFS: | 232 | case FETCH_OP_FOFFS: |
| 224 | val = translate_user_vaddr(code->immediate); | 233 | val = translate_user_vaddr(code->immediate); |
| 225 | break; | 234 | break; |
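The uprobe $comm support above relies on a sentinel: FETCH_OP_COMM stores FETCH_TOKEN_COMM (defined as -ECOMM in trace_probe.h) in place of a user address, and the string fetch helpers test for that token before dereferencing anything. A hedged user-space sketch of the same sentinel dispatch (names and data invented, not the kernel's helpers):

#include <errno.h>
#include <stdio.h>
#include <string.h>

#define TOKEN_COMM	((unsigned long)(long)-ECOMM)	/* mirrors FETCH_TOKEN_COMM */

static const char *task_comm = "bash";	/* stands in for current->comm */

static size_t fetch_strlen(unsigned long addr)
{
	if (addr == TOKEN_COMM)
		return strlen(task_comm) + 1;		/* include the NUL */
	return strlen((const char *)addr) + 1;		/* plain deref in this sketch */
}

int main(void)
{
	const char *s = "hello";

	printf("%zu\n", fetch_strlen((unsigned long)s));	/* 6 */
	printf("%zu\n", fetch_strlen(TOKEN_COMM));		/* 5 */
	return 0;
}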
| @@ -417,8 +426,6 @@ end: | |||
| 417 | /* | 426 | /* |
| 418 | * Argument syntax: | 427 | * Argument syntax: |
| 419 | * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] | 428 | * - Add uprobe: p|r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] |
| 420 | * | ||
| 421 | * - Remove uprobe: -:[GRP/]EVENT | ||
| 422 | */ | 429 | */ |
| 423 | static int trace_uprobe_create(int argc, const char **argv) | 430 | static int trace_uprobe_create(int argc, const char **argv) |
| 424 | { | 431 | { |
| @@ -434,10 +441,17 @@ static int trace_uprobe_create(int argc, const char **argv) | |||
| 434 | ret = 0; | 441 | ret = 0; |
| 435 | ref_ctr_offset = 0; | 442 | ref_ctr_offset = 0; |
| 436 | 443 | ||
| 437 | /* argc must be >= 1 */ | 444 | switch (argv[0][0]) { |
| 438 | if (argv[0][0] == 'r') | 445 | case 'r': |
| 439 | is_return = true; | 446 | is_return = true; |
| 440 | else if (argv[0][0] != 'p' || argc < 2) | 447 | break; |
| 448 | case 'p': | ||
| 449 | break; | ||
| 450 | default: | ||
| 451 | return -ECANCELED; | ||
| 452 | } | ||
| 453 | |||
| 454 | if (argc < 2) | ||
| 441 | return -ECANCELED; | 455 | return -ECANCELED; |
| 442 | 456 | ||
| 443 | if (argv[0][1] == ':') | 457 | if (argv[0][1] == ':') |
| @@ -457,13 +471,19 @@ static int trace_uprobe_create(int argc, const char **argv) | |||
| 457 | return -ECANCELED; | 471 | return -ECANCELED; |
| 458 | } | 472 | } |
| 459 | 473 | ||
| 474 | trace_probe_log_init("trace_uprobe", argc, argv); | ||
| 475 | trace_probe_log_set_index(1); /* filename is the 2nd argument */ | ||
| 476 | |||
| 460 | *arg++ = '\0'; | 477 | *arg++ = '\0'; |
| 461 | ret = kern_path(filename, LOOKUP_FOLLOW, &path); | 478 | ret = kern_path(filename, LOOKUP_FOLLOW, &path); |
| 462 | if (ret) { | 479 | if (ret) { |
| 480 | trace_probe_log_err(0, FILE_NOT_FOUND); | ||
| 463 | kfree(filename); | 481 | kfree(filename); |
| 482 | trace_probe_log_clear(); | ||
| 464 | return ret; | 483 | return ret; |
| 465 | } | 484 | } |
| 466 | if (!d_is_reg(path.dentry)) { | 485 | if (!d_is_reg(path.dentry)) { |
| 486 | trace_probe_log_err(0, NO_REGULAR_FILE); | ||
| 467 | ret = -EINVAL; | 487 | ret = -EINVAL; |
| 468 | goto fail_address_parse; | 488 | goto fail_address_parse; |
| 469 | } | 489 | } |
| @@ -472,9 +492,16 @@ static int trace_uprobe_create(int argc, const char **argv) | |||
| 472 | rctr = strchr(arg, '('); | 492 | rctr = strchr(arg, '('); |
| 473 | if (rctr) { | 493 | if (rctr) { |
| 474 | rctr_end = strchr(rctr, ')'); | 494 | rctr_end = strchr(rctr, ')'); |
| 475 | if (rctr > rctr_end || *(rctr_end + 1) != 0) { | 495 | if (!rctr_end) { |
| 476 | ret = -EINVAL; | 496 | ret = -EINVAL; |
| 477 | pr_info("Invalid reference counter offset.\n"); | 497 | rctr_end = rctr + strlen(rctr); |
| 498 | trace_probe_log_err(rctr_end - filename, | ||
| 499 | REFCNT_OPEN_BRACE); | ||
| 500 | goto fail_address_parse; | ||
| 501 | } else if (rctr_end[1] != '\0') { | ||
| 502 | ret = -EINVAL; | ||
| 503 | trace_probe_log_err(rctr_end + 1 - filename, | ||
| 504 | BAD_REFCNT_SUFFIX); | ||
| 478 | goto fail_address_parse; | 505 | goto fail_address_parse; |
| 479 | } | 506 | } |
| 480 | 507 | ||
| @@ -482,22 +509,23 @@ static int trace_uprobe_create(int argc, const char **argv) | |||
| 482 | *rctr_end = '\0'; | 509 | *rctr_end = '\0'; |
| 483 | ret = kstrtoul(rctr, 0, &ref_ctr_offset); | 510 | ret = kstrtoul(rctr, 0, &ref_ctr_offset); |
| 484 | if (ret) { | 511 | if (ret) { |
| 485 | pr_info("Invalid reference counter offset.\n"); | 512 | trace_probe_log_err(rctr - filename, BAD_REFCNT); |
| 486 | goto fail_address_parse; | 513 | goto fail_address_parse; |
| 487 | } | 514 | } |
| 488 | } | 515 | } |
| 489 | 516 | ||
| 490 | /* Parse uprobe offset. */ | 517 | /* Parse uprobe offset. */ |
| 491 | ret = kstrtoul(arg, 0, &offset); | 518 | ret = kstrtoul(arg, 0, &offset); |
| 492 | if (ret) | 519 | if (ret) { |
| 520 | trace_probe_log_err(arg - filename, BAD_UPROBE_OFFS); | ||
| 493 | goto fail_address_parse; | 521 | goto fail_address_parse; |
| 494 | 522 | } | |
| 495 | argc -= 2; | ||
| 496 | argv += 2; | ||
| 497 | 523 | ||
| 498 | /* setup a probe */ | 524 | /* setup a probe */ |
| 525 | trace_probe_log_set_index(0); | ||
| 499 | if (event) { | 526 | if (event) { |
| 500 | ret = traceprobe_parse_event_name(&event, &group, buf); | 527 | ret = traceprobe_parse_event_name(&event, &group, buf, |
| 528 | event - argv[0]); | ||
| 501 | if (ret) | 529 | if (ret) |
| 502 | goto fail_address_parse; | 530 | goto fail_address_parse; |
| 503 | } else { | 531 | } else { |
| @@ -519,6 +547,9 @@ static int trace_uprobe_create(int argc, const char **argv) | |||
| 519 | kfree(tail); | 547 | kfree(tail); |
| 520 | } | 548 | } |
| 521 | 549 | ||
| 550 | argc -= 2; | ||
| 551 | argv += 2; | ||
| 552 | |||
| 522 | tu = alloc_trace_uprobe(group, event, argc, is_return); | 553 | tu = alloc_trace_uprobe(group, event, argc, is_return); |
| 523 | if (IS_ERR(tu)) { | 554 | if (IS_ERR(tu)) { |
| 524 | ret = PTR_ERR(tu); | 555 | ret = PTR_ERR(tu); |
| @@ -539,6 +570,7 @@ static int trace_uprobe_create(int argc, const char **argv) | |||
| 539 | goto error; | 570 | goto error; |
| 540 | } | 571 | } |
| 541 | 572 | ||
| 573 | trace_probe_log_set_index(i + 2); | ||
| 542 | ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, | 574 | ret = traceprobe_parse_probe_arg(&tu->tp, i, tmp, |
| 543 | is_return ? TPARG_FL_RETURN : 0); | 575 | is_return ? TPARG_FL_RETURN : 0); |
| 544 | kfree(tmp); | 576 | kfree(tmp); |
| @@ -547,20 +579,20 @@ static int trace_uprobe_create(int argc, const char **argv) | |||
| 547 | } | 579 | } |
| 548 | 580 | ||
| 549 | ret = register_trace_uprobe(tu); | 581 | ret = register_trace_uprobe(tu); |
| 550 | if (ret) | 582 | if (!ret) |
| 551 | goto error; | 583 | goto out; |
| 552 | return 0; | ||
| 553 | 584 | ||
| 554 | error: | 585 | error: |
| 555 | free_trace_uprobe(tu); | 586 | free_trace_uprobe(tu); |
| 587 | out: | ||
| 588 | trace_probe_log_clear(); | ||
| 556 | return ret; | 589 | return ret; |
| 557 | 590 | ||
| 558 | fail_address_parse: | 591 | fail_address_parse: |
| 592 | trace_probe_log_clear(); | ||
| 559 | path_put(&path); | 593 | path_put(&path); |
| 560 | kfree(filename); | 594 | kfree(filename); |
| 561 | 595 | ||
| 562 | pr_info("Failed to parse address or file.\n"); | ||
| 563 | |||
| 564 | return ret; | 596 | return ret; |
| 565 | } | 597 | } |
| 566 | 598 | ||
| @@ -1304,7 +1336,7 @@ static inline void init_trace_event_call(struct trace_uprobe *tu, | |||
| 1304 | call->event.funcs = &uprobe_funcs; | 1336 | call->event.funcs = &uprobe_funcs; |
| 1305 | call->class->define_fields = uprobe_event_define_fields; | 1337 | call->class->define_fields = uprobe_event_define_fields; |
| 1306 | 1338 | ||
| 1307 | call->flags = TRACE_EVENT_FL_UPROBE; | 1339 | call->flags = TRACE_EVENT_FL_UPROBE | TRACE_EVENT_FL_CAP_ANY; |
| 1308 | call->class->reg = trace_uprobe_register; | 1340 | call->class->reg = trace_uprobe_register; |
| 1309 | call->data = tu; | 1341 | call->data = tu; |
| 1310 | } | 1342 | } |
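The trace_uprobe.c hunks above do three things: they route the new FETCH_OP_COMM case through a reserved FETCH_TOKEN_COMM value so the string fetchers copy current->comm instead of dereferencing a user pointer, they rewrite the leading p/r argument check as a switch and wire the parser into the trace_probe_log_* error reporting, and they add TRACE_EVENT_FL_CAP_ANY to the event flags. A userspace sketch of just the sentinel routing; FETCH_TOKEN_COMM, the helpers, and the buffer sizes here are simplified stand-ins, not the kernel definitions:

	#include <stdio.h>
	#include <string.h>

	#define FETCH_TOKEN_COMM	((unsigned long)-1)	/* sentinel, never a valid address */
	#define MAX_STRING_SIZE		64

	static const char *current_comm = "bash";	/* stands in for current->comm */

	static long fetch_store_string(unsigned long addr, char *dst, size_t maxlen)
	{
		if (addr == FETCH_TOKEN_COMM) {
			/* copy the task name directly; no user-memory access needed */
			strncpy(dst, current_comm, maxlen - 1);
		} else {
			/* the kernel would use strncpy_from_user() on 'addr' here */
			strncpy(dst, (const char *)addr, maxlen - 1);
		}
		dst[maxlen - 1] = '\0';
		return strlen(dst);
	}

	int main(void)
	{
		char buf[MAX_STRING_SIZE];
		const char *user_str = "argument from user memory";

		fetch_store_string(FETCH_TOKEN_COMM, buf, sizeof(buf));
		printf("$comm  -> %s\n", buf);
		fetch_store_string((unsigned long)user_str, buf, sizeof(buf));
		printf("string -> %s\n", buf);
		return 0;
	}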
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 46f2ab1e08a9..df3ade14ccbd 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c | |||
| @@ -1,19 +1,6 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * Copyright (C) 2008-2014 Mathieu Desnoyers | 3 | * Copyright (C) 2008-2014 Mathieu Desnoyers |
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | */ | 4 | */ |
| 18 | #include <linux/module.h> | 5 | #include <linux/module.h> |
| 19 | #include <linux/mutex.h> | 6 | #include <linux/mutex.h> |
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 370724b45391..7be3e7530841 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
| @@ -1,19 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-or-later | ||
| 1 | /* | 2 | /* |
| 2 | * tsacct.c - System accounting over taskstats interface | 3 | * tsacct.c - System accounting over taskstats interface |
| 3 | * | 4 | * |
| 4 | * Copyright (C) Jay Lan, <jlan@sgi.com> | 5 | * Copyright (C) Jay Lan, <jlan@sgi.com> |
| 5 | * | ||
| 6 | * | ||
| 7 | * This program is free software; you can redistribute it and/or modify | ||
| 8 | * it under the terms of the GNU General Public License as published by | ||
| 9 | * the Free Software Foundation; either version 2 of the License, or | ||
| 10 | * (at your option) any later version. | ||
| 11 | * | ||
| 12 | * This program is distributed in the hope that it will be useful, | ||
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 15 | * GNU General Public License for more details. | ||
| 16 | * | ||
| 17 | */ | 6 | */ |
| 18 | 7 | ||
| 19 | #include <linux/kernel.h> | 8 | #include <linux/kernel.h> |
diff --git a/kernel/ucount.c b/kernel/ucount.c index f48d1b6376a4..feb128c7b5d9 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c | |||
| @@ -1,9 +1,4 @@ | |||
| 1 | /* | 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | * This program is free software; you can redistribute it and/or | ||
| 3 | * modify it under the terms of the GNU General Public License as | ||
| 4 | * published by the Free Software Foundation, version 2 of the | ||
| 5 | * License. | ||
| 6 | */ | ||
| 7 | 2 | ||
| 8 | #include <linux/stat.h> | 3 | #include <linux/stat.h> |
| 9 | #include <linux/sysctl.h> | 4 | #include <linux/sysctl.h> |
diff --git a/kernel/umh.c b/kernel/umh.c index d937cbad903a..7f255b5a8845 100644 --- a/kernel/umh.c +++ b/kernel/umh.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * umh - the kernel usermode helper | 3 | * umh - the kernel usermode helper |
| 3 | */ | 4 | */ |
diff --git a/kernel/up.c b/kernel/up.c index ff536f9cc8a2..862b460ab97a 100644 --- a/kernel/up.c +++ b/kernel/up.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Uniprocessor-only support functions. The counterpart to kernel/smp.c | 3 | * Uniprocessor-only support functions. The counterpart to kernel/smp.c |
| 3 | */ | 4 | */ |
| @@ -34,14 +35,13 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) | |||
| 34 | } | 35 | } |
| 35 | EXPORT_SYMBOL(smp_call_function_single_async); | 36 | EXPORT_SYMBOL(smp_call_function_single_async); |
| 36 | 37 | ||
| 37 | int on_each_cpu(smp_call_func_t func, void *info, int wait) | 38 | void on_each_cpu(smp_call_func_t func, void *info, int wait) |
| 38 | { | 39 | { |
| 39 | unsigned long flags; | 40 | unsigned long flags; |
| 40 | 41 | ||
| 41 | local_irq_save(flags); | 42 | local_irq_save(flags); |
| 42 | func(info); | 43 | func(info); |
| 43 | local_irq_restore(flags); | 44 | local_irq_restore(flags); |
| 44 | return 0; | ||
| 45 | } | 45 | } |
| 46 | EXPORT_SYMBOL(on_each_cpu); | 46 | EXPORT_SYMBOL(on_each_cpu); |
| 47 | 47 | ||
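The uniprocessor stub of on_each_cpu() above now returns void: it could only ever return 0, so the caller-visible error path is dropped, presumably to match the same signature change on the SMP side of the interface. A userspace sketch of the resulting UP behaviour, with the irq save/restore reduced to comments and all names local to the example:

	#include <stdio.h>

	typedef void (*smp_call_func_t)(void *info);

	static void on_each_cpu(smp_call_func_t func, void *info, int wait)
	{
		(void)wait;		/* a single CPU is always "done" immediately */
		/* local_irq_save(flags); */
		func(info);		/* run on the only CPU there is */
		/* local_irq_restore(flags); */
	}

	static void bump(void *info)
	{
		(*(int *)info)++;
	}

	int main(void)
	{
		int counter = 0;

		on_each_cpu(bump, &counter, 1);
		printf("counter=%d\n", counter);	/* prints 1 */
		return 0;
	}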
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c index 9586b670a5b2..870ecd7c63ed 100644 --- a/kernel/user-return-notifier.c +++ b/kernel/user-return-notifier.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | 2 | ||
| 2 | #include <linux/user-return-notifier.h> | 3 | #include <linux/user-return-notifier.h> |
| 3 | #include <linux/percpu.h> | 4 | #include <linux/percpu.h> |
diff --git a/kernel/user.c b/kernel/user.c index 0df9b1640b2a..5235d7f49982 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * The "user cache". | 3 | * The "user cache". |
| 3 | * | 4 | * |
| @@ -62,9 +63,9 @@ struct user_namespace init_user_ns = { | |||
| 62 | .ns.ops = &userns_operations, | 63 | .ns.ops = &userns_operations, |
| 63 | #endif | 64 | #endif |
| 64 | .flags = USERNS_INIT_FLAGS, | 65 | .flags = USERNS_INIT_FLAGS, |
| 65 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 66 | #ifdef CONFIG_KEYS |
| 66 | .persistent_keyring_register_sem = | 67 | .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), |
| 67 | __RWSEM_INITIALIZER(init_user_ns.persistent_keyring_register_sem), | 68 | .keyring_sem = __RWSEM_INITIALIZER(init_user_ns.keyring_sem), |
| 68 | #endif | 69 | #endif |
| 69 | }; | 70 | }; |
| 70 | EXPORT_SYMBOL_GPL(init_user_ns); | 71 | EXPORT_SYMBOL_GPL(init_user_ns); |
| @@ -140,8 +141,6 @@ static void free_user(struct user_struct *up, unsigned long flags) | |||
| 140 | { | 141 | { |
| 141 | uid_hash_remove(up); | 142 | uid_hash_remove(up); |
| 142 | spin_unlock_irqrestore(&uidhash_lock, flags); | 143 | spin_unlock_irqrestore(&uidhash_lock, flags); |
| 143 | key_put(up->uid_keyring); | ||
| 144 | key_put(up->session_keyring); | ||
| 145 | kmem_cache_free(uid_cachep, up); | 144 | kmem_cache_free(uid_cachep, up); |
| 146 | } | 145 | } |
| 147 | 146 | ||
| @@ -185,7 +184,7 @@ struct user_struct *alloc_uid(kuid_t uid) | |||
| 185 | if (!up) { | 184 | if (!up) { |
| 186 | new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); | 185 | new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); |
| 187 | if (!new) | 186 | if (!new) |
| 188 | goto out_unlock; | 187 | return NULL; |
| 189 | 188 | ||
| 190 | new->uid = uid; | 189 | new->uid = uid; |
| 191 | refcount_set(&new->__count, 1); | 190 | refcount_set(&new->__count, 1); |
| @@ -199,8 +198,6 @@ struct user_struct *alloc_uid(kuid_t uid) | |||
| 199 | spin_lock_irq(&uidhash_lock); | 198 | spin_lock_irq(&uidhash_lock); |
| 200 | up = uid_hash_find(uid, hashent); | 199 | up = uid_hash_find(uid, hashent); |
| 201 | if (up) { | 200 | if (up) { |
| 202 | key_put(new->uid_keyring); | ||
| 203 | key_put(new->session_keyring); | ||
| 204 | kmem_cache_free(uid_cachep, new); | 201 | kmem_cache_free(uid_cachep, new); |
| 205 | } else { | 202 | } else { |
| 206 | uid_hash_insert(new, hashent); | 203 | uid_hash_insert(new, hashent); |
| @@ -210,9 +207,6 @@ struct user_struct *alloc_uid(kuid_t uid) | |||
| 210 | } | 207 | } |
| 211 | 208 | ||
| 212 | return up; | 209 | return up; |
| 213 | |||
| 214 | out_unlock: | ||
| 215 | return NULL; | ||
| 216 | } | 210 | } |
| 217 | 211 | ||
| 218 | static int __init uid_cache_init(void) | 212 | static int __init uid_cache_init(void) |
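Two cleanups in the user.c hunks above: the uid/session keyring pointers are no longer released in free_user()/alloc_uid(), apparently because those keyrings are now managed outside struct user_struct, and the out_unlock label that only returned NULL is replaced by a direct return. What remains in alloc_uid() is the usual allocate-then-recheck race pattern; a userspace sketch of it, reduced to a single hash slot with invented types:

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct user {
		int uid;
		int refs;
	};

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct user *slot;	/* stands in for the uid hash bucket */

	static struct user *alloc_uid(int uid)
	{
		struct user *up, *new;

		pthread_mutex_lock(&lock);
		up = slot;
		pthread_mutex_unlock(&lock);
		if (up)
			return up;

		new = calloc(1, sizeof(*new));	/* allocation happens unlocked */
		if (!new)
			return NULL;		/* no goto label needed, just return */
		new->uid = uid;
		new->refs = 1;

		pthread_mutex_lock(&lock);
		up = slot;
		if (up) {
			free(new);		/* lost the race; reuse the winner */
		} else {
			slot = new;
			up = new;
		}
		pthread_mutex_unlock(&lock);
		return up;
	}

	int main(void)
	{
		struct user *u = alloc_uid(1000);

		printf("uid=%d refs=%d\n", u->uid, u->refs);
		return 0;
	}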
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 923414a246e9..8eadadc478f9 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -1,9 +1,4 @@ | |||
| 1 | /* | 1 | // SPDX-License-Identifier: GPL-2.0-only |
| 2 | * This program is free software; you can redistribute it and/or | ||
| 3 | * modify it under the terms of the GNU General Public License as | ||
| 4 | * published by the Free Software Foundation, version 2 of the | ||
| 5 | * License. | ||
| 6 | */ | ||
| 7 | 2 | ||
| 8 | #include <linux/export.h> | 3 | #include <linux/export.h> |
| 9 | #include <linux/nsproxy.h> | 4 | #include <linux/nsproxy.h> |
| @@ -133,8 +128,9 @@ int create_user_ns(struct cred *new) | |||
| 133 | ns->flags = parent_ns->flags; | 128 | ns->flags = parent_ns->flags; |
| 134 | mutex_unlock(&userns_state_mutex); | 129 | mutex_unlock(&userns_state_mutex); |
| 135 | 130 | ||
| 136 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 131 | #ifdef CONFIG_KEYS |
| 137 | init_rwsem(&ns->persistent_keyring_register_sem); | 132 | INIT_LIST_HEAD(&ns->keyring_name_list); |
| 133 | init_rwsem(&ns->keyring_sem); | ||
| 138 | #endif | 134 | #endif |
| 139 | ret = -ENOMEM; | 135 | ret = -ENOMEM; |
| 140 | if (!setup_userns_sysctls(ns)) | 136 | if (!setup_userns_sysctls(ns)) |
| @@ -196,9 +192,7 @@ static void free_user_ns(struct work_struct *work) | |||
| 196 | kfree(ns->projid_map.reverse); | 192 | kfree(ns->projid_map.reverse); |
| 197 | } | 193 | } |
| 198 | retire_userns_sysctls(ns); | 194 | retire_userns_sysctls(ns); |
| 199 | #ifdef CONFIG_PERSISTENT_KEYRINGS | 195 | key_free_user_ns(ns); |
| 200 | key_put(ns->persistent_keyring_register); | ||
| 201 | #endif | ||
| 202 | ns_free_inum(&ns->ns); | 196 | ns_free_inum(&ns->ns); |
| 203 | kmem_cache_free(user_ns_cachep, ns); | 197 | kmem_cache_free(user_ns_cachep, ns); |
| 204 | dec_user_namespaces(ucounts); | 198 | dec_user_namespaces(ucounts); |
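In user_namespace.c above, namespace creation now initializes the keyring name list and keyring_sem under CONFIG_KEYS rather than the narrower CONFIG_PERSISTENT_KEYRINGS, and teardown calls a single key_free_user_ns() helper instead of open-coding a key_put(). A rough userspace sketch of that ownership split (the constructor initializes the embedded fields, the key side owns teardown); all types and names here are invented:

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct ns {
		/* stand-ins for keyring_name_list / keyring_sem */
		struct ns *keyring_name_next;
		pthread_rwlock_t keyring_lock;
	};

	static void key_free_ns(struct ns *ns)
	{
		/* the key subsystem releases whatever it attached to 'ns' */
		pthread_rwlock_destroy(&ns->keyring_lock);
	}

	static struct ns *create_ns(void)
	{
		struct ns *ns = calloc(1, sizeof(*ns));

		if (!ns)
			return NULL;
		ns->keyring_name_next = ns;	/* empty list points at itself */
		pthread_rwlock_init(&ns->keyring_lock, NULL);
		return ns;
	}

	static void free_ns(struct ns *ns)
	{
		key_free_ns(ns);	/* single subsystem-owned teardown hook */
		free(ns);
	}

	int main(void)
	{
		struct ns *ns = create_ns();

		printf("created %p\n", (void *)ns);
		free_ns(ns);
		return 0;
	}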
diff --git a/kernel/utsname.c b/kernel/utsname.c index dcd6be1996fe..f0e491193009 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c | |||
| @@ -1,12 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Copyright (C) 2004 IBM Corporation | 3 | * Copyright (C) 2004 IBM Corporation |
| 3 | * | 4 | * |
| 4 | * Author: Serge Hallyn <serue@us.ibm.com> | 5 | * Author: Serge Hallyn <serue@us.ibm.com> |
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public License as | ||
| 8 | * published by the Free Software Foundation, version 2 of the | ||
| 9 | * License. | ||
| 10 | */ | 6 | */ |
| 11 | 7 | ||
| 12 | #include <linux/export.h> | 8 | #include <linux/export.h> |
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 258033d62cb3..3732c888a949 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c | |||
| @@ -1,12 +1,8 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * Copyright (C) 2007 | 3 | * Copyright (C) 2007 |
| 3 | * | 4 | * |
| 4 | * Author: Eric Biederman <ebiederm@xmision.com> | 5 | * Author: Eric Biederman <ebiederm@xmision.com> |
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or | ||
| 7 | * modify it under the terms of the GNU General Public License as | ||
| 8 | * published by the Free Software Foundation, version 2 of the | ||
| 9 | * License. | ||
| 10 | */ | 6 | */ |
| 11 | 7 | ||
| 12 | #include <linux/export.h> | 8 | #include <linux/export.h> |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9657315405de..601d61150b65 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -1,3 +1,4 @@ | |||
| 1 | // SPDX-License-Identifier: GPL-2.0-only | ||
| 1 | /* | 2 | /* |
| 2 | * kernel/workqueue.c - generic async execution with shared worker pool | 3 | * kernel/workqueue.c - generic async execution with shared worker pool |
| 3 | * | 4 | * |
| @@ -3328,7 +3329,7 @@ EXPORT_SYMBOL_GPL(execute_in_process_context); | |||
| 3328 | * | 3329 | * |
| 3329 | * Undo alloc_workqueue_attrs(). | 3330 | * Undo alloc_workqueue_attrs(). |
| 3330 | */ | 3331 | */ |
| 3331 | void free_workqueue_attrs(struct workqueue_attrs *attrs) | 3332 | static void free_workqueue_attrs(struct workqueue_attrs *attrs) |
| 3332 | { | 3333 | { |
| 3333 | if (attrs) { | 3334 | if (attrs) { |
| 3334 | free_cpumask_var(attrs->cpumask); | 3335 | free_cpumask_var(attrs->cpumask); |
| @@ -3338,21 +3339,20 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) | |||
| 3338 | 3339 | ||
| 3339 | /** | 3340 | /** |
| 3340 | * alloc_workqueue_attrs - allocate a workqueue_attrs | 3341 | * alloc_workqueue_attrs - allocate a workqueue_attrs |
| 3341 | * @gfp_mask: allocation mask to use | ||
| 3342 | * | 3342 | * |
| 3343 | * Allocate a new workqueue_attrs, initialize with default settings and | 3343 | * Allocate a new workqueue_attrs, initialize with default settings and |
| 3344 | * return it. | 3344 | * return it. |
| 3345 | * | 3345 | * |
| 3346 | * Return: The allocated new workqueue_attr on success. %NULL on failure. | 3346 | * Return: The allocated new workqueue_attr on success. %NULL on failure. |
| 3347 | */ | 3347 | */ |
| 3348 | struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) | 3348 | static struct workqueue_attrs *alloc_workqueue_attrs(void) |
| 3349 | { | 3349 | { |
| 3350 | struct workqueue_attrs *attrs; | 3350 | struct workqueue_attrs *attrs; |
| 3351 | 3351 | ||
| 3352 | attrs = kzalloc(sizeof(*attrs), gfp_mask); | 3352 | attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); |
| 3353 | if (!attrs) | 3353 | if (!attrs) |
| 3354 | goto fail; | 3354 | goto fail; |
| 3355 | if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) | 3355 | if (!alloc_cpumask_var(&attrs->cpumask, GFP_KERNEL)) |
| 3356 | goto fail; | 3356 | goto fail; |
| 3357 | 3357 | ||
| 3358 | cpumask_copy(attrs->cpumask, cpu_possible_mask); | 3358 | cpumask_copy(attrs->cpumask, cpu_possible_mask); |
| @@ -3430,7 +3430,7 @@ static int init_worker_pool(struct worker_pool *pool) | |||
| 3430 | pool->refcnt = 1; | 3430 | pool->refcnt = 1; |
| 3431 | 3431 | ||
| 3432 | /* shouldn't fail above this point */ | 3432 | /* shouldn't fail above this point */ |
| 3433 | pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); | 3433 | pool->attrs = alloc_workqueue_attrs(); |
| 3434 | if (!pool->attrs) | 3434 | if (!pool->attrs) |
| 3435 | return -ENOMEM; | 3435 | return -ENOMEM; |
| 3436 | return 0; | 3436 | return 0; |
| @@ -3895,8 +3895,8 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, | |||
| 3895 | 3895 | ||
| 3896 | ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); | 3896 | ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); |
| 3897 | 3897 | ||
| 3898 | new_attrs = alloc_workqueue_attrs(GFP_KERNEL); | 3898 | new_attrs = alloc_workqueue_attrs(); |
| 3899 | tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); | 3899 | tmp_attrs = alloc_workqueue_attrs(); |
| 3900 | if (!ctx || !new_attrs || !tmp_attrs) | 3900 | if (!ctx || !new_attrs || !tmp_attrs) |
| 3901 | goto out_free; | 3901 | goto out_free; |
| 3902 | 3902 | ||
| @@ -4032,7 +4032,7 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, | |||
| 4032 | * | 4032 | * |
| 4033 | * Return: 0 on success and -errno on failure. | 4033 | * Return: 0 on success and -errno on failure. |
| 4034 | */ | 4034 | */ |
| 4035 | int apply_workqueue_attrs(struct workqueue_struct *wq, | 4035 | static int apply_workqueue_attrs(struct workqueue_struct *wq, |
| 4036 | const struct workqueue_attrs *attrs) | 4036 | const struct workqueue_attrs *attrs) |
| 4037 | { | 4037 | { |
| 4038 | int ret; | 4038 | int ret; |
| @@ -4043,7 +4043,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, | |||
| 4043 | 4043 | ||
| 4044 | return ret; | 4044 | return ret; |
| 4045 | } | 4045 | } |
| 4046 | EXPORT_SYMBOL_GPL(apply_workqueue_attrs); | ||
| 4047 | 4046 | ||
| 4048 | /** | 4047 | /** |
| 4049 | * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug | 4048 | * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug |
| @@ -4241,7 +4240,7 @@ struct workqueue_struct *alloc_workqueue(const char *fmt, | |||
| 4241 | return NULL; | 4240 | return NULL; |
| 4242 | 4241 | ||
| 4243 | if (flags & WQ_UNBOUND) { | 4242 | if (flags & WQ_UNBOUND) { |
| 4244 | wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); | 4243 | wq->unbound_attrs = alloc_workqueue_attrs(); |
| 4245 | if (!wq->unbound_attrs) | 4244 | if (!wq->unbound_attrs) |
| 4246 | goto err_free_wq; | 4245 | goto err_free_wq; |
| 4247 | } | 4246 | } |
| @@ -5394,7 +5393,7 @@ static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) | |||
| 5394 | 5393 | ||
| 5395 | lockdep_assert_held(&wq_pool_mutex); | 5394 | lockdep_assert_held(&wq_pool_mutex); |
| 5396 | 5395 | ||
| 5397 | attrs = alloc_workqueue_attrs(GFP_KERNEL); | 5396 | attrs = alloc_workqueue_attrs(); |
| 5398 | if (!attrs) | 5397 | if (!attrs) |
| 5399 | return NULL; | 5398 | return NULL; |
| 5400 | 5399 | ||
| @@ -5816,7 +5815,7 @@ static void __init wq_numa_init(void) | |||
| 5816 | return; | 5815 | return; |
| 5817 | } | 5816 | } |
| 5818 | 5817 | ||
| 5819 | wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); | 5818 | wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(); |
| 5820 | BUG_ON(!wq_update_unbound_numa_attrs_buf); | 5819 | BUG_ON(!wq_update_unbound_numa_attrs_buf); |
| 5821 | 5820 | ||
| 5822 | /* | 5821 | /* |
| @@ -5891,7 +5890,7 @@ int __init workqueue_init_early(void) | |||
| 5891 | for (i = 0; i < NR_STD_WORKER_POOLS; i++) { | 5890 | for (i = 0; i < NR_STD_WORKER_POOLS; i++) { |
| 5892 | struct workqueue_attrs *attrs; | 5891 | struct workqueue_attrs *attrs; |
| 5893 | 5892 | ||
| 5894 | BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); | 5893 | BUG_ON(!(attrs = alloc_workqueue_attrs())); |
| 5895 | attrs->nice = std_nice[i]; | 5894 | attrs->nice = std_nice[i]; |
| 5896 | unbound_std_wq_attrs[i] = attrs; | 5895 | unbound_std_wq_attrs[i] = attrs; |
| 5897 | 5896 | ||
| @@ -5900,7 +5899,7 @@ int __init workqueue_init_early(void) | |||
| 5900 | * guaranteed by max_active which is enforced by pwqs. | 5899 | * guaranteed by max_active which is enforced by pwqs. |
| 5901 | * Turn off NUMA so that dfl_pwq is used for all nodes. | 5900 | * Turn off NUMA so that dfl_pwq is used for all nodes. |
| 5902 | */ | 5901 | */ |
| 5903 | BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); | 5902 | BUG_ON(!(attrs = alloc_workqueue_attrs())); |
| 5904 | attrs->nice = std_nice[i]; | 5903 | attrs->nice = std_nice[i]; |
| 5905 | attrs->no_numa = true; | 5904 | attrs->no_numa = true; |
| 5906 | ordered_wq_attrs[i] = attrs; | 5905 | ordered_wq_attrs[i] = attrs; |
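In the workqueue.c hunks above, every caller of alloc_workqueue_attrs() passed GFP_KERNEL, so the gfp_mask parameter is dropped and the flags are hard-coded; with the last external users gone, free_workqueue_attrs(), alloc_workqueue_attrs() and apply_workqueue_attrs() also become static and the export is removed. A userspace sketch of the simplified alloc/free pair, with the kernel types reduced to plain allocations:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct workqueue_attrs {
		int nice;
		unsigned long *cpumask;	/* separately allocated, like alloc_cpumask_var() */
	};

	static void free_workqueue_attrs(struct workqueue_attrs *attrs)
	{
		if (attrs) {
			free(attrs->cpumask);
			free(attrs);
		}
	}

	static struct workqueue_attrs *alloc_workqueue_attrs(void)	/* no gfp argument */
	{
		struct workqueue_attrs *attrs = calloc(1, sizeof(*attrs));

		if (!attrs)
			return NULL;
		attrs->cpumask = calloc(1, sizeof(*attrs->cpumask));
		if (!attrs->cpumask) {
			free(attrs);
			return NULL;
		}
		memset(attrs->cpumask, 0xff, sizeof(*attrs->cpumask));	/* "all CPUs" default */
		return attrs;
	}

	int main(void)
	{
		struct workqueue_attrs *attrs = alloc_workqueue_attrs();

		if (!attrs)
			return 1;
		attrs->nice = 0;
		printf("attrs ready, nice=%d\n", attrs->nice);
		free_workqueue_attrs(attrs);
		return 0;
	}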
