Diffstat (limited to 'kernel')
85 files changed, 5224 insertions, 3619 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 2251882daf53..44511d100eaa 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
| @@ -87,6 +87,9 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQ | |||
| 87 | config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE | 87 | config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE |
| 88 | bool | 88 | bool |
| 89 | 89 | ||
| 90 | config UNINLINE_SPIN_UNLOCK | ||
| 91 | bool | ||
| 92 | |||
| 90 | # | 93 | # |
| 91 | # lock_* functions are inlined when: | 94 | # lock_* functions are inlined when: |
| 92 | # - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y | 95 | # - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y |
| @@ -103,100 +106,120 @@ config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE | |||
| 103 | # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y | 106 | # - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y |
| 104 | # | 107 | # |
| 105 | 108 | ||
| 109 | if !DEBUG_SPINLOCK | ||
| 110 | |||
| 106 | config INLINE_SPIN_TRYLOCK | 111 | config INLINE_SPIN_TRYLOCK |
| 107 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK | 112 | def_bool y |
| 113 | depends on ARCH_INLINE_SPIN_TRYLOCK | ||
| 108 | 114 | ||
| 109 | config INLINE_SPIN_TRYLOCK_BH | 115 | config INLINE_SPIN_TRYLOCK_BH |
| 110 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH | 116 | def_bool y |
| 117 | depends on ARCH_INLINE_SPIN_TRYLOCK_BH | ||
| 111 | 118 | ||
| 112 | config INLINE_SPIN_LOCK | 119 | config INLINE_SPIN_LOCK |
| 113 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK | 120 | def_bool y |
| 121 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK | ||
| 114 | 122 | ||
| 115 | config INLINE_SPIN_LOCK_BH | 123 | config INLINE_SPIN_LOCK_BH |
| 116 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 124 | def_bool y |
| 117 | ARCH_INLINE_SPIN_LOCK_BH | 125 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_BH |
| 118 | 126 | ||
| 119 | config INLINE_SPIN_LOCK_IRQ | 127 | config INLINE_SPIN_LOCK_IRQ |
| 120 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 128 | def_bool y |
| 121 | ARCH_INLINE_SPIN_LOCK_IRQ | 129 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQ |
| 122 | 130 | ||
| 123 | config INLINE_SPIN_LOCK_IRQSAVE | 131 | config INLINE_SPIN_LOCK_IRQSAVE |
| 124 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 132 | def_bool y |
| 125 | ARCH_INLINE_SPIN_LOCK_IRQSAVE | 133 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK_IRQSAVE |
| 126 | |||
| 127 | config UNINLINE_SPIN_UNLOCK | ||
| 128 | bool | ||
| 129 | 134 | ||
| 130 | config INLINE_SPIN_UNLOCK_BH | 135 | config INLINE_SPIN_UNLOCK_BH |
| 131 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH | 136 | def_bool y |
| 137 | depends on ARCH_INLINE_SPIN_UNLOCK_BH | ||
| 132 | 138 | ||
| 133 | config INLINE_SPIN_UNLOCK_IRQ | 139 | config INLINE_SPIN_UNLOCK_IRQ |
| 134 | def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH) | 140 | def_bool y |
| 141 | depends on !PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH | ||
| 135 | 142 | ||
| 136 | config INLINE_SPIN_UNLOCK_IRQRESTORE | 143 | config INLINE_SPIN_UNLOCK_IRQRESTORE |
| 137 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE | 144 | def_bool y |
| 145 | depends on ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE | ||
| 138 | 146 | ||
| 139 | 147 | ||
| 140 | config INLINE_READ_TRYLOCK | 148 | config INLINE_READ_TRYLOCK |
| 141 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK | 149 | def_bool y |
| 150 | depends on ARCH_INLINE_READ_TRYLOCK | ||
| 142 | 151 | ||
| 143 | config INLINE_READ_LOCK | 152 | config INLINE_READ_LOCK |
| 144 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK | 153 | def_bool y |
| 154 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK | ||
| 145 | 155 | ||
| 146 | config INLINE_READ_LOCK_BH | 156 | config INLINE_READ_LOCK_BH |
| 147 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 157 | def_bool y |
| 148 | ARCH_INLINE_READ_LOCK_BH | 158 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_BH |
| 149 | 159 | ||
| 150 | config INLINE_READ_LOCK_IRQ | 160 | config INLINE_READ_LOCK_IRQ |
| 151 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 161 | def_bool y |
| 152 | ARCH_INLINE_READ_LOCK_IRQ | 162 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQ |
| 153 | 163 | ||
| 154 | config INLINE_READ_LOCK_IRQSAVE | 164 | config INLINE_READ_LOCK_IRQSAVE |
| 155 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 165 | def_bool y |
| 156 | ARCH_INLINE_READ_LOCK_IRQSAVE | 166 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK_IRQSAVE |
| 157 | 167 | ||
| 158 | config INLINE_READ_UNLOCK | 168 | config INLINE_READ_UNLOCK |
| 159 | def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK) | 169 | def_bool y |
| 170 | depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK | ||
| 160 | 171 | ||
| 161 | config INLINE_READ_UNLOCK_BH | 172 | config INLINE_READ_UNLOCK_BH |
| 162 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH | 173 | def_bool y |
| 174 | depends on ARCH_INLINE_READ_UNLOCK_BH | ||
| 163 | 175 | ||
| 164 | config INLINE_READ_UNLOCK_IRQ | 176 | config INLINE_READ_UNLOCK_IRQ |
| 165 | def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH) | 177 | def_bool y |
| 178 | depends on !PREEMPT || ARCH_INLINE_READ_UNLOCK_BH | ||
| 166 | 179 | ||
| 167 | config INLINE_READ_UNLOCK_IRQRESTORE | 180 | config INLINE_READ_UNLOCK_IRQRESTORE |
| 168 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE | 181 | def_bool y |
| 182 | depends on ARCH_INLINE_READ_UNLOCK_IRQRESTORE | ||
| 169 | 183 | ||
| 170 | 184 | ||
| 171 | config INLINE_WRITE_TRYLOCK | 185 | config INLINE_WRITE_TRYLOCK |
| 172 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK | 186 | def_bool y |
| 187 | depends on ARCH_INLINE_WRITE_TRYLOCK | ||
| 173 | 188 | ||
| 174 | config INLINE_WRITE_LOCK | 189 | config INLINE_WRITE_LOCK |
| 175 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK | 190 | def_bool y |
| 191 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK | ||
| 176 | 192 | ||
| 177 | config INLINE_WRITE_LOCK_BH | 193 | config INLINE_WRITE_LOCK_BH |
| 178 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 194 | def_bool y |
| 179 | ARCH_INLINE_WRITE_LOCK_BH | 195 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_BH |
| 180 | 196 | ||
| 181 | config INLINE_WRITE_LOCK_IRQ | 197 | config INLINE_WRITE_LOCK_IRQ |
| 182 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 198 | def_bool y |
| 183 | ARCH_INLINE_WRITE_LOCK_IRQ | 199 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQ |
| 184 | 200 | ||
| 185 | config INLINE_WRITE_LOCK_IRQSAVE | 201 | config INLINE_WRITE_LOCK_IRQSAVE |
| 186 | def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ | 202 | def_bool y |
| 187 | ARCH_INLINE_WRITE_LOCK_IRQSAVE | 203 | depends on !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK_IRQSAVE |
| 188 | 204 | ||
| 189 | config INLINE_WRITE_UNLOCK | 205 | config INLINE_WRITE_UNLOCK |
| 190 | def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK) | 206 | def_bool y |
| 207 | depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK | ||
| 191 | 208 | ||
| 192 | config INLINE_WRITE_UNLOCK_BH | 209 | config INLINE_WRITE_UNLOCK_BH |
| 193 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH | 210 | def_bool y |
| 211 | depends on ARCH_INLINE_WRITE_UNLOCK_BH | ||
| 194 | 212 | ||
| 195 | config INLINE_WRITE_UNLOCK_IRQ | 213 | config INLINE_WRITE_UNLOCK_IRQ |
| 196 | def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH) | 214 | def_bool y |
| 215 | depends on !PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH | ||
| 197 | 216 | ||
| 198 | config INLINE_WRITE_UNLOCK_IRQRESTORE | 217 | config INLINE_WRITE_UNLOCK_IRQRESTORE |
| 199 | def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE | 218 | def_bool y |
| 219 | depends on ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE | ||
| 220 | |||
| 221 | endif | ||
| 200 | 222 | ||
| 201 | config MUTEX_SPIN_ON_OWNER | 223 | config MUTEX_SPIN_ON_OWNER |
| 202 | def_bool SMP && !DEBUG_MUTEXES | 224 | def_bool y |
| 225 | depends on SMP && !DEBUG_MUTEXES | ||
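The Kconfig.locks hunks above rewrite each compound expression such as "def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK" as "def_bool y" plus a "depends on" line, with the shared !DEBUG_SPINLOCK condition hoisted into a single "if !DEBUG_SPINLOCK ... endif" block, so the INLINE_* symbols are selected under the same conditions as before. As a rough, hypothetical illustration of what such a symbol controls downstream (the demo_* names are made up; this is not the kernel's actual spinlock header):

/*
 * Sketch only: a CONFIG_INLINE_* symbol typically decides whether a lock
 * entry point is an inline wrapper around the arch fast path or a plain
 * declaration backed by one shared out-of-line definition.
 */
struct demo_spinlock { volatile int locked; };

void __demo_raw_spin_lock(struct demo_spinlock *lock);	/* arch fast path */

#ifdef CONFIG_INLINE_SPIN_LOCK
static inline void demo_spin_lock(struct demo_spinlock *lock)
{
	__demo_raw_spin_lock(lock);	/* expanded at every call site */
}
#else
void demo_spin_lock(struct demo_spinlock *lock);	/* one out-of-line copy */
#endif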
diff --git a/kernel/Makefile b/kernel/Makefile
index c0cc67ad764c..5404911eaee9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
| @@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o printk.o \ | |||
| 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
| 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ | 11 | hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ |
| 12 | notifier.o ksysfs.o cred.o \ | 12 | notifier.o ksysfs.o cred.o \ |
| 13 | async.o range.o groups.o lglock.o | 13 | async.o range.o groups.o lglock.o smpboot.o |
| 14 | 14 | ||
| 15 | ifdef CONFIG_FUNCTION_TRACER | 15 | ifdef CONFIG_FUNCTION_TRACER |
| 16 | # Do not trace debug files and internal ftrace files | 16 | # Do not trace debug files and internal ftrace files |
| @@ -46,7 +46,6 @@ obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | |||
| 46 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | 46 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o |
| 47 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 47 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
| 48 | obj-$(CONFIG_SMP) += smp.o | 48 | obj-$(CONFIG_SMP) += smp.o |
| 49 | obj-$(CONFIG_SMP) += smpboot.o | ||
| 50 | ifneq ($(CONFIG_SMP),y) | 49 | ifneq ($(CONFIG_SMP),y) |
| 51 | obj-y += up.o | 50 | obj-y += up.o |
| 52 | endif | 51 | endif |
| @@ -98,7 +97,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o | |||
| 98 | obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o | 97 | obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o |
| 99 | obj-$(CONFIG_FUNCTION_TRACER) += trace/ | 98 | obj-$(CONFIG_FUNCTION_TRACER) += trace/ |
| 100 | obj-$(CONFIG_TRACING) += trace/ | 99 | obj-$(CONFIG_TRACING) += trace/ |
| 101 | obj-$(CONFIG_X86_DS) += trace/ | 100 | obj-$(CONFIG_TRACE_CLOCK) += trace/ |
| 102 | obj-$(CONFIG_RING_BUFFER) += trace/ | 101 | obj-$(CONFIG_RING_BUFFER) += trace/ |
| 103 | obj-$(CONFIG_TRACEPOINTS) += trace/ | 102 | obj-$(CONFIG_TRACEPOINTS) += trace/ |
| 104 | obj-$(CONFIG_IRQ_WORK) += irq_work.o | 103 | obj-$(CONFIG_IRQ_WORK) += irq_work.o |
diff --git a/kernel/acct.c b/kernel/acct.c
index 02e6167a53b0..6cd7529c9e6a 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
| @@ -507,8 +507,8 @@ static void do_acct_process(struct bsd_acct_struct *acct, | |||
| 507 | do_div(elapsed, AHZ); | 507 | do_div(elapsed, AHZ); |
| 508 | ac.ac_btime = get_seconds() - elapsed; | 508 | ac.ac_btime = get_seconds() - elapsed; |
| 509 | /* we really need to bite the bullet and change layout */ | 509 | /* we really need to bite the bullet and change layout */ |
| 510 | ac.ac_uid = orig_cred->uid; | 510 | ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); |
| 511 | ac.ac_gid = orig_cred->gid; | 511 | ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); |
| 512 | #if ACCT_VERSION==2 | 512 | #if ACCT_VERSION==2 |
| 513 | ac.ac_ahz = AHZ; | 513 | ac.ac_ahz = AHZ; |
| 514 | #endif | 514 | #endif |
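The acct.c hunk above converts the credential's internal kuid_t/kgid_t back to the on-disk uid_t/gid_t with from_kuid_munged()/from_kgid_munged(), interpreted in the user namespace of whoever opened the accounting file. The same round-trip pattern recurs throughout the audit changes below; a hedged sketch of the idea (demo_echo_uid() is an illustrative helper, not part of this patch):

#include <linux/uidgid.h>
#include <linux/user_namespace.h>

static uid_t demo_echo_uid(struct user_namespace *ns, uid_t val)
{
	kuid_t kuid = make_kuid(ns, val);	/* userspace value -> kernel-internal kuid_t */

	if (!uid_valid(kuid))			/* value has no mapping in this namespace */
		return (uid_t)-1;

	/* the _munged variant reports the overflow uid instead of failing */
	return from_kuid_munged(ns, kuid);
}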
diff --git a/kernel/audit.c b/kernel/audit.c
index ea3b7b6191c7..4d0ceede3319 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
| @@ -61,6 +61,7 @@ | |||
| 61 | #include <linux/netlink.h> | 61 | #include <linux/netlink.h> |
| 62 | #include <linux/freezer.h> | 62 | #include <linux/freezer.h> |
| 63 | #include <linux/tty.h> | 63 | #include <linux/tty.h> |
| 64 | #include <linux/pid_namespace.h> | ||
| 64 | 65 | ||
| 65 | #include "audit.h" | 66 | #include "audit.h" |
| 66 | 67 | ||
| @@ -87,11 +88,11 @@ static int audit_failure = AUDIT_FAIL_PRINTK; | |||
| 87 | 88 | ||
| 88 | /* | 89 | /* |
| 89 | * If audit records are to be written to the netlink socket, audit_pid | 90 | * If audit records are to be written to the netlink socket, audit_pid |
| 90 | * contains the pid of the auditd process and audit_nlk_pid contains | 91 | * contains the pid of the auditd process and audit_nlk_portid contains |
| 91 | * the pid to use to send netlink messages to that process. | 92 | * the portid to use to send netlink messages to that process. |
| 92 | */ | 93 | */ |
| 93 | int audit_pid; | 94 | int audit_pid; |
| 94 | static int audit_nlk_pid; | 95 | static int audit_nlk_portid; |
| 95 | 96 | ||
| 96 | /* If audit_rate_limit is non-zero, limit the rate of sending audit records | 97 | /* If audit_rate_limit is non-zero, limit the rate of sending audit records |
| 97 | * to that number per second. This prevents DoS attacks, but results in | 98 | * to that number per second. This prevents DoS attacks, but results in |
| @@ -104,7 +105,7 @@ static int audit_backlog_wait_time = 60 * HZ; | |||
| 104 | static int audit_backlog_wait_overflow = 0; | 105 | static int audit_backlog_wait_overflow = 0; |
| 105 | 106 | ||
| 106 | /* The identity of the user shutting down the audit system. */ | 107 | /* The identity of the user shutting down the audit system. */ |
| 107 | uid_t audit_sig_uid = -1; | 108 | kuid_t audit_sig_uid = INVALID_UID; |
| 108 | pid_t audit_sig_pid = -1; | 109 | pid_t audit_sig_pid = -1; |
| 109 | u32 audit_sig_sid = 0; | 110 | u32 audit_sig_sid = 0; |
| 110 | 111 | ||
| @@ -264,7 +265,7 @@ void audit_log_lost(const char *message) | |||
| 264 | } | 265 | } |
| 265 | 266 | ||
| 266 | static int audit_log_config_change(char *function_name, int new, int old, | 267 | static int audit_log_config_change(char *function_name, int new, int old, |
| 267 | uid_t loginuid, u32 sessionid, u32 sid, | 268 | kuid_t loginuid, u32 sessionid, u32 sid, |
| 268 | int allow_changes) | 269 | int allow_changes) |
| 269 | { | 270 | { |
| 270 | struct audit_buffer *ab; | 271 | struct audit_buffer *ab; |
| @@ -272,7 +273,7 @@ static int audit_log_config_change(char *function_name, int new, int old, | |||
| 272 | 273 | ||
| 273 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 274 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); |
| 274 | audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, | 275 | audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new, |
| 275 | old, loginuid, sessionid); | 276 | old, from_kuid(&init_user_ns, loginuid), sessionid); |
| 276 | if (sid) { | 277 | if (sid) { |
| 277 | char *ctx = NULL; | 278 | char *ctx = NULL; |
| 278 | u32 len; | 279 | u32 len; |
| @@ -292,7 +293,7 @@ static int audit_log_config_change(char *function_name, int new, int old, | |||
| 292 | } | 293 | } |
| 293 | 294 | ||
| 294 | static int audit_do_config_change(char *function_name, int *to_change, | 295 | static int audit_do_config_change(char *function_name, int *to_change, |
| 295 | int new, uid_t loginuid, u32 sessionid, | 296 | int new, kuid_t loginuid, u32 sessionid, |
| 296 | u32 sid) | 297 | u32 sid) |
| 297 | { | 298 | { |
| 298 | int allow_changes, rc = 0, old = *to_change; | 299 | int allow_changes, rc = 0, old = *to_change; |
| @@ -319,21 +320,21 @@ static int audit_do_config_change(char *function_name, int *to_change, | |||
| 319 | return rc; | 320 | return rc; |
| 320 | } | 321 | } |
| 321 | 322 | ||
| 322 | static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid, | 323 | static int audit_set_rate_limit(int limit, kuid_t loginuid, u32 sessionid, |
| 323 | u32 sid) | 324 | u32 sid) |
| 324 | { | 325 | { |
| 325 | return audit_do_config_change("audit_rate_limit", &audit_rate_limit, | 326 | return audit_do_config_change("audit_rate_limit", &audit_rate_limit, |
| 326 | limit, loginuid, sessionid, sid); | 327 | limit, loginuid, sessionid, sid); |
| 327 | } | 328 | } |
| 328 | 329 | ||
| 329 | static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid, | 330 | static int audit_set_backlog_limit(int limit, kuid_t loginuid, u32 sessionid, |
| 330 | u32 sid) | 331 | u32 sid) |
| 331 | { | 332 | { |
| 332 | return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, | 333 | return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, |
| 333 | limit, loginuid, sessionid, sid); | 334 | limit, loginuid, sessionid, sid); |
| 334 | } | 335 | } |
| 335 | 336 | ||
| 336 | static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) | 337 | static int audit_set_enabled(int state, kuid_t loginuid, u32 sessionid, u32 sid) |
| 337 | { | 338 | { |
| 338 | int rc; | 339 | int rc; |
| 339 | if (state < AUDIT_OFF || state > AUDIT_LOCKED) | 340 | if (state < AUDIT_OFF || state > AUDIT_LOCKED) |
| @@ -348,7 +349,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid) | |||
| 348 | return rc; | 349 | return rc; |
| 349 | } | 350 | } |
| 350 | 351 | ||
| 351 | static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid) | 352 | static int audit_set_failure(int state, kuid_t loginuid, u32 sessionid, u32 sid) |
| 352 | { | 353 | { |
| 353 | if (state != AUDIT_FAIL_SILENT | 354 | if (state != AUDIT_FAIL_SILENT |
| 354 | && state != AUDIT_FAIL_PRINTK | 355 | && state != AUDIT_FAIL_PRINTK |
| @@ -401,7 +402,7 @@ static void kauditd_send_skb(struct sk_buff *skb) | |||
| 401 | int err; | 402 | int err; |
| 402 | /* take a reference in case we can't send it and we want to hold it */ | 403 | /* take a reference in case we can't send it and we want to hold it */ |
| 403 | skb_get(skb); | 404 | skb_get(skb); |
| 404 | err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); | 405 | err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); |
| 405 | if (err < 0) { | 406 | if (err < 0) { |
| 406 | BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ | 407 | BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ |
| 407 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); | 408 | printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); |
| @@ -467,24 +468,6 @@ static int kauditd_thread(void *dummy) | |||
| 467 | return 0; | 468 | return 0; |
| 468 | } | 469 | } |
| 469 | 470 | ||
| 470 | static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid) | ||
| 471 | { | ||
| 472 | struct task_struct *tsk; | ||
| 473 | int err; | ||
| 474 | |||
| 475 | rcu_read_lock(); | ||
| 476 | tsk = find_task_by_vpid(pid); | ||
| 477 | if (!tsk) { | ||
| 478 | rcu_read_unlock(); | ||
| 479 | return -ESRCH; | ||
| 480 | } | ||
| 481 | get_task_struct(tsk); | ||
| 482 | rcu_read_unlock(); | ||
| 483 | err = tty_audit_push_task(tsk, loginuid, sessionid); | ||
| 484 | put_task_struct(tsk); | ||
| 485 | return err; | ||
| 486 | } | ||
| 487 | |||
| 488 | int audit_send_list(void *_dest) | 471 | int audit_send_list(void *_dest) |
| 489 | { | 472 | { |
| 490 | struct audit_netlink_list *dest = _dest; | 473 | struct audit_netlink_list *dest = _dest; |
| @@ -588,6 +571,11 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
| 588 | { | 571 | { |
| 589 | int err = 0; | 572 | int err = 0; |
| 590 | 573 | ||
| 574 | /* Only support the initial namespaces for now. */ | ||
| 575 | if ((current_user_ns() != &init_user_ns) || | ||
| 576 | (task_active_pid_ns(current) != &init_pid_ns)) | ||
| 577 | return -EPERM; | ||
| 578 | |||
| 591 | switch (msg_type) { | 579 | switch (msg_type) { |
| 592 | case AUDIT_GET: | 580 | case AUDIT_GET: |
| 593 | case AUDIT_LIST: | 581 | case AUDIT_LIST: |
| @@ -619,8 +607,7 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type) | |||
| 619 | } | 607 | } |
| 620 | 608 | ||
| 621 | static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | 609 | static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, |
| 622 | u32 pid, u32 uid, uid_t auid, u32 ses, | 610 | kuid_t auid, u32 ses, u32 sid) |
| 623 | u32 sid) | ||
| 624 | { | 611 | { |
| 625 | int rc = 0; | 612 | int rc = 0; |
| 626 | char *ctx = NULL; | 613 | char *ctx = NULL; |
| @@ -633,7 +620,9 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | |||
| 633 | 620 | ||
| 634 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | 621 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); |
| 635 | audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", | 622 | audit_log_format(*ab, "pid=%d uid=%u auid=%u ses=%u", |
| 636 | pid, uid, auid, ses); | 623 | task_tgid_vnr(current), |
| 624 | from_kuid(&init_user_ns, current_uid()), | ||
| 625 | from_kuid(&init_user_ns, auid), ses); | ||
| 637 | if (sid) { | 626 | if (sid) { |
| 638 | rc = security_secid_to_secctx(sid, &ctx, &len); | 627 | rc = security_secid_to_secctx(sid, &ctx, &len); |
| 639 | if (rc) | 628 | if (rc) |
| @@ -649,13 +638,13 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | |||
| 649 | 638 | ||
| 650 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | 639 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) |
| 651 | { | 640 | { |
| 652 | u32 uid, pid, seq, sid; | 641 | u32 seq, sid; |
| 653 | void *data; | 642 | void *data; |
| 654 | struct audit_status *status_get, status_set; | 643 | struct audit_status *status_get, status_set; |
| 655 | int err; | 644 | int err; |
| 656 | struct audit_buffer *ab; | 645 | struct audit_buffer *ab; |
| 657 | u16 msg_type = nlh->nlmsg_type; | 646 | u16 msg_type = nlh->nlmsg_type; |
| 658 | uid_t loginuid; /* loginuid of sender */ | 647 | kuid_t loginuid; /* loginuid of sender */ |
| 659 | u32 sessionid; | 648 | u32 sessionid; |
| 660 | struct audit_sig_info *sig_data; | 649 | struct audit_sig_info *sig_data; |
| 661 | char *ctx = NULL; | 650 | char *ctx = NULL; |
| @@ -675,8 +664,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 675 | return err; | 664 | return err; |
| 676 | } | 665 | } |
| 677 | 666 | ||
| 678 | pid = NETLINK_CREDS(skb)->pid; | ||
| 679 | uid = NETLINK_CREDS(skb)->uid; | ||
| 680 | loginuid = audit_get_loginuid(current); | 667 | loginuid = audit_get_loginuid(current); |
| 681 | sessionid = audit_get_sessionid(current); | 668 | sessionid = audit_get_sessionid(current); |
| 682 | security_task_getsecid(current, &sid); | 669 | security_task_getsecid(current, &sid); |
| @@ -692,7 +679,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 692 | status_set.backlog_limit = audit_backlog_limit; | 679 | status_set.backlog_limit = audit_backlog_limit; |
| 693 | status_set.lost = atomic_read(&audit_lost); | 680 | status_set.lost = atomic_read(&audit_lost); |
| 694 | status_set.backlog = skb_queue_len(&audit_skb_queue); | 681 | status_set.backlog = skb_queue_len(&audit_skb_queue); |
| 695 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0, | 682 | audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_GET, 0, 0, |
| 696 | &status_set, sizeof(status_set)); | 683 | &status_set, sizeof(status_set)); |
| 697 | break; | 684 | break; |
| 698 | case AUDIT_SET: | 685 | case AUDIT_SET: |
| @@ -720,7 +707,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 720 | sessionid, sid, 1); | 707 | sessionid, sid, 1); |
| 721 | 708 | ||
| 722 | audit_pid = new_pid; | 709 | audit_pid = new_pid; |
| 723 | audit_nlk_pid = NETLINK_CB(skb).pid; | 710 | audit_nlk_portid = NETLINK_CB(skb).portid; |
| 724 | } | 711 | } |
| 725 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { | 712 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) { |
| 726 | err = audit_set_rate_limit(status_get->rate_limit, | 713 | err = audit_set_rate_limit(status_get->rate_limit, |
| @@ -738,16 +725,16 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 738 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) | 725 | if (!audit_enabled && msg_type != AUDIT_USER_AVC) |
| 739 | return 0; | 726 | return 0; |
| 740 | 727 | ||
| 741 | err = audit_filter_user(&NETLINK_CB(skb)); | 728 | err = audit_filter_user(); |
| 742 | if (err == 1) { | 729 | if (err == 1) { |
| 743 | err = 0; | 730 | err = 0; |
| 744 | if (msg_type == AUDIT_USER_TTY) { | 731 | if (msg_type == AUDIT_USER_TTY) { |
| 745 | err = audit_prepare_user_tty(pid, loginuid, | 732 | err = tty_audit_push_task(current, loginuid, |
| 746 | sessionid); | 733 | sessionid); |
| 747 | if (err) | 734 | if (err) |
| 748 | break; | 735 | break; |
| 749 | } | 736 | } |
| 750 | audit_log_common_recv_msg(&ab, msg_type, pid, uid, | 737 | audit_log_common_recv_msg(&ab, msg_type, |
| 751 | loginuid, sessionid, sid); | 738 | loginuid, sessionid, sid); |
| 752 | 739 | ||
| 753 | if (msg_type != AUDIT_USER_TTY) | 740 | if (msg_type != AUDIT_USER_TTY) |
| @@ -763,7 +750,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 763 | size--; | 750 | size--; |
| 764 | audit_log_n_untrustedstring(ab, data, size); | 751 | audit_log_n_untrustedstring(ab, data, size); |
| 765 | } | 752 | } |
| 766 | audit_set_pid(ab, pid); | 753 | audit_set_pid(ab, NETLINK_CB(skb).portid); |
| 767 | audit_log_end(ab); | 754 | audit_log_end(ab); |
| 768 | } | 755 | } |
| 769 | break; | 756 | break; |
| @@ -772,8 +759,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 772 | if (nlmsg_len(nlh) < sizeof(struct audit_rule)) | 759 | if (nlmsg_len(nlh) < sizeof(struct audit_rule)) |
| 773 | return -EINVAL; | 760 | return -EINVAL; |
| 774 | if (audit_enabled == AUDIT_LOCKED) { | 761 | if (audit_enabled == AUDIT_LOCKED) { |
| 775 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, | 762 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, |
| 776 | uid, loginuid, sessionid, sid); | 763 | loginuid, sessionid, sid); |
| 777 | 764 | ||
| 778 | audit_log_format(ab, " audit_enabled=%d res=0", | 765 | audit_log_format(ab, " audit_enabled=%d res=0", |
| 779 | audit_enabled); | 766 | audit_enabled); |
| @@ -782,8 +769,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 782 | } | 769 | } |
| 783 | /* fallthrough */ | 770 | /* fallthrough */ |
| 784 | case AUDIT_LIST: | 771 | case AUDIT_LIST: |
| 785 | err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, | 772 | err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, |
| 786 | uid, seq, data, nlmsg_len(nlh), | 773 | seq, data, nlmsg_len(nlh), |
| 787 | loginuid, sessionid, sid); | 774 | loginuid, sessionid, sid); |
| 788 | break; | 775 | break; |
| 789 | case AUDIT_ADD_RULE: | 776 | case AUDIT_ADD_RULE: |
| @@ -791,8 +778,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 791 | if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) | 778 | if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) |
| 792 | return -EINVAL; | 779 | return -EINVAL; |
| 793 | if (audit_enabled == AUDIT_LOCKED) { | 780 | if (audit_enabled == AUDIT_LOCKED) { |
| 794 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, | 781 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, |
| 795 | uid, loginuid, sessionid, sid); | 782 | loginuid, sessionid, sid); |
| 796 | 783 | ||
| 797 | audit_log_format(ab, " audit_enabled=%d res=0", | 784 | audit_log_format(ab, " audit_enabled=%d res=0", |
| 798 | audit_enabled); | 785 | audit_enabled); |
| @@ -801,15 +788,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 801 | } | 788 | } |
| 802 | /* fallthrough */ | 789 | /* fallthrough */ |
| 803 | case AUDIT_LIST_RULES: | 790 | case AUDIT_LIST_RULES: |
| 804 | err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid, | 791 | err = audit_receive_filter(msg_type, NETLINK_CB(skb).portid, |
| 805 | uid, seq, data, nlmsg_len(nlh), | 792 | seq, data, nlmsg_len(nlh), |
| 806 | loginuid, sessionid, sid); | 793 | loginuid, sessionid, sid); |
| 807 | break; | 794 | break; |
| 808 | case AUDIT_TRIM: | 795 | case AUDIT_TRIM: |
| 809 | audit_trim_trees(); | 796 | audit_trim_trees(); |
| 810 | 797 | ||
| 811 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, | 798 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, |
| 812 | uid, loginuid, sessionid, sid); | 799 | loginuid, sessionid, sid); |
| 813 | 800 | ||
| 814 | audit_log_format(ab, " op=trim res=1"); | 801 | audit_log_format(ab, " op=trim res=1"); |
| 815 | audit_log_end(ab); | 802 | audit_log_end(ab); |
| @@ -840,8 +827,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 840 | /* OK, here comes... */ | 827 | /* OK, here comes... */ |
| 841 | err = audit_tag_tree(old, new); | 828 | err = audit_tag_tree(old, new); |
| 842 | 829 | ||
| 843 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, | 830 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, |
| 844 | uid, loginuid, sessionid, sid); | 831 | loginuid, sessionid, sid); |
| 845 | 832 | ||
| 846 | audit_log_format(ab, " op=make_equiv old="); | 833 | audit_log_format(ab, " op=make_equiv old="); |
| 847 | audit_log_untrustedstring(ab, old); | 834 | audit_log_untrustedstring(ab, old); |
| @@ -866,53 +853,41 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | |||
| 866 | security_release_secctx(ctx, len); | 853 | security_release_secctx(ctx, len); |
| 867 | return -ENOMEM; | 854 | return -ENOMEM; |
| 868 | } | 855 | } |
| 869 | sig_data->uid = audit_sig_uid; | 856 | sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid); |
| 870 | sig_data->pid = audit_sig_pid; | 857 | sig_data->pid = audit_sig_pid; |
| 871 | if (audit_sig_sid) { | 858 | if (audit_sig_sid) { |
| 872 | memcpy(sig_data->ctx, ctx, len); | 859 | memcpy(sig_data->ctx, ctx, len); |
| 873 | security_release_secctx(ctx, len); | 860 | security_release_secctx(ctx, len); |
| 874 | } | 861 | } |
| 875 | audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, | 862 | audit_send_reply(NETLINK_CB(skb).portid, seq, AUDIT_SIGNAL_INFO, |
| 876 | 0, 0, sig_data, sizeof(*sig_data) + len); | 863 | 0, 0, sig_data, sizeof(*sig_data) + len); |
| 877 | kfree(sig_data); | 864 | kfree(sig_data); |
| 878 | break; | 865 | break; |
| 879 | case AUDIT_TTY_GET: { | 866 | case AUDIT_TTY_GET: { |
| 880 | struct audit_tty_status s; | 867 | struct audit_tty_status s; |
| 881 | struct task_struct *tsk; | 868 | struct task_struct *tsk = current; |
| 882 | unsigned long flags; | 869 | |
| 883 | 870 | spin_lock_irq(&tsk->sighand->siglock); | |
| 884 | rcu_read_lock(); | 871 | s.enabled = tsk->signal->audit_tty != 0; |
| 885 | tsk = find_task_by_vpid(pid); | 872 | spin_unlock_irq(&tsk->sighand->siglock); |
| 886 | if (tsk && lock_task_sighand(tsk, &flags)) { | 873 | |
| 887 | s.enabled = tsk->signal->audit_tty != 0; | 874 | audit_send_reply(NETLINK_CB(skb).portid, seq, |
| 888 | unlock_task_sighand(tsk, &flags); | 875 | AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); |
| 889 | } else | ||
| 890 | err = -ESRCH; | ||
| 891 | rcu_read_unlock(); | ||
| 892 | |||
| 893 | if (!err) | ||
| 894 | audit_send_reply(NETLINK_CB(skb).pid, seq, | ||
| 895 | AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); | ||
| 896 | break; | 876 | break; |
| 897 | } | 877 | } |
| 898 | case AUDIT_TTY_SET: { | 878 | case AUDIT_TTY_SET: { |
| 899 | struct audit_tty_status *s; | 879 | struct audit_tty_status *s; |
| 900 | struct task_struct *tsk; | 880 | struct task_struct *tsk = current; |
| 901 | unsigned long flags; | ||
| 902 | 881 | ||
| 903 | if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) | 882 | if (nlh->nlmsg_len < sizeof(struct audit_tty_status)) |
| 904 | return -EINVAL; | 883 | return -EINVAL; |
| 905 | s = data; | 884 | s = data; |
| 906 | if (s->enabled != 0 && s->enabled != 1) | 885 | if (s->enabled != 0 && s->enabled != 1) |
| 907 | return -EINVAL; | 886 | return -EINVAL; |
| 908 | rcu_read_lock(); | 887 | |
| 909 | tsk = find_task_by_vpid(pid); | 888 | spin_lock_irq(&tsk->sighand->siglock); |
| 910 | if (tsk && lock_task_sighand(tsk, &flags)) { | 889 | tsk->signal->audit_tty = s->enabled != 0; |
| 911 | tsk->signal->audit_tty = s->enabled != 0; | 890 | spin_unlock_irq(&tsk->sighand->siglock); |
| 912 | unlock_task_sighand(tsk, &flags); | ||
| 913 | } else | ||
| 914 | err = -ESRCH; | ||
| 915 | rcu_read_unlock(); | ||
| 916 | break; | 891 | break; |
| 917 | } | 892 | } |
| 918 | default: | 893 | default: |
| @@ -971,8 +946,7 @@ static int __init audit_init(void) | |||
| 971 | 946 | ||
| 972 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", | 947 | printk(KERN_INFO "audit: initializing netlink socket (%s)\n", |
| 973 | audit_default ? "enabled" : "disabled"); | 948 | audit_default ? "enabled" : "disabled"); |
| 974 | audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, | 949 | audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, &cfg); |
| 975 | THIS_MODULE, &cfg); | ||
| 976 | if (!audit_sock) | 950 | if (!audit_sock) |
| 977 | audit_panic("cannot initialize netlink socket"); | 951 | audit_panic("cannot initialize netlink socket"); |
| 978 | else | 952 | else |
diff --git a/kernel/audit.h b/kernel/audit.h
index 816766803371..9eb3d79482b6 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
| @@ -76,6 +76,8 @@ static inline int audit_hash_ino(u32 ino) | |||
| 76 | 76 | ||
| 77 | extern int audit_match_class(int class, unsigned syscall); | 77 | extern int audit_match_class(int class, unsigned syscall); |
| 78 | extern int audit_comparator(const u32 left, const u32 op, const u32 right); | 78 | extern int audit_comparator(const u32 left, const u32 op, const u32 right); |
| 79 | extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); | ||
| 80 | extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); | ||
| 79 | extern int audit_compare_dname_path(const char *dname, const char *path, | 81 | extern int audit_compare_dname_path(const char *dname, const char *path, |
| 80 | int *dirlen); | 82 | int *dirlen); |
| 81 | extern struct sk_buff * audit_make_reply(int pid, int seq, int type, | 83 | extern struct sk_buff * audit_make_reply(int pid, int seq, int type, |
| @@ -144,7 +146,7 @@ extern void audit_kill_trees(struct list_head *); | |||
| 144 | extern char *audit_unpack_string(void **, size_t *, size_t); | 146 | extern char *audit_unpack_string(void **, size_t *, size_t); |
| 145 | 147 | ||
| 146 | extern pid_t audit_sig_pid; | 148 | extern pid_t audit_sig_pid; |
| 147 | extern uid_t audit_sig_uid; | 149 | extern kuid_t audit_sig_uid; |
| 148 | extern u32 audit_sig_sid; | 150 | extern u32 audit_sig_sid; |
| 149 | 151 | ||
| 150 | #ifdef CONFIG_AUDITSYSCALL | 152 | #ifdef CONFIG_AUDITSYSCALL |
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 3823281401b5..1c22ec3d87bc 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
| @@ -241,7 +241,7 @@ static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watc | |||
| 241 | struct audit_buffer *ab; | 241 | struct audit_buffer *ab; |
| 242 | ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); | 242 | ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE); |
| 243 | audit_log_format(ab, "auid=%u ses=%u op=", | 243 | audit_log_format(ab, "auid=%u ses=%u op=", |
| 244 | audit_get_loginuid(current), | 244 | from_kuid(&init_user_ns, audit_get_loginuid(current)), |
| 245 | audit_get_sessionid(current)); | 245 | audit_get_sessionid(current)); |
| 246 | audit_log_string(ab, op); | 246 | audit_log_string(ab, op); |
| 247 | audit_log_format(ab, " path="); | 247 | audit_log_format(ab, " path="); |
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a6c3f1abd206..c4bcdbaf4d4d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
| @@ -342,6 +342,8 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
| 342 | 342 | ||
| 343 | f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); | 343 | f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); |
| 344 | f->val = rule->values[i]; | 344 | f->val = rule->values[i]; |
| 345 | f->uid = INVALID_UID; | ||
| 346 | f->gid = INVALID_GID; | ||
| 345 | 347 | ||
| 346 | err = -EINVAL; | 348 | err = -EINVAL; |
| 347 | if (f->op == Audit_bad) | 349 | if (f->op == Audit_bad) |
| @@ -350,16 +352,32 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) | |||
| 350 | switch(f->type) { | 352 | switch(f->type) { |
| 351 | default: | 353 | default: |
| 352 | goto exit_free; | 354 | goto exit_free; |
| 353 | case AUDIT_PID: | ||
| 354 | case AUDIT_UID: | 355 | case AUDIT_UID: |
| 355 | case AUDIT_EUID: | 356 | case AUDIT_EUID: |
| 356 | case AUDIT_SUID: | 357 | case AUDIT_SUID: |
| 357 | case AUDIT_FSUID: | 358 | case AUDIT_FSUID: |
| 359 | case AUDIT_LOGINUID: | ||
| 360 | /* bit ops not implemented for uid comparisons */ | ||
| 361 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
| 362 | goto exit_free; | ||
| 363 | |||
| 364 | f->uid = make_kuid(current_user_ns(), f->val); | ||
| 365 | if (!uid_valid(f->uid)) | ||
| 366 | goto exit_free; | ||
| 367 | break; | ||
| 358 | case AUDIT_GID: | 368 | case AUDIT_GID: |
| 359 | case AUDIT_EGID: | 369 | case AUDIT_EGID: |
| 360 | case AUDIT_SGID: | 370 | case AUDIT_SGID: |
| 361 | case AUDIT_FSGID: | 371 | case AUDIT_FSGID: |
| 362 | case AUDIT_LOGINUID: | 372 | /* bit ops not implemented for gid comparisons */ |
| 373 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
| 374 | goto exit_free; | ||
| 375 | |||
| 376 | f->gid = make_kgid(current_user_ns(), f->val); | ||
| 377 | if (!gid_valid(f->gid)) | ||
| 378 | goto exit_free; | ||
| 379 | break; | ||
| 380 | case AUDIT_PID: | ||
| 363 | case AUDIT_PERS: | 381 | case AUDIT_PERS: |
| 364 | case AUDIT_MSGTYPE: | 382 | case AUDIT_MSGTYPE: |
| 365 | case AUDIT_PPID: | 383 | case AUDIT_PPID: |
| @@ -437,19 +455,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 437 | 455 | ||
| 438 | f->type = data->fields[i]; | 456 | f->type = data->fields[i]; |
| 439 | f->val = data->values[i]; | 457 | f->val = data->values[i]; |
| 458 | f->uid = INVALID_UID; | ||
| 459 | f->gid = INVALID_GID; | ||
| 440 | f->lsm_str = NULL; | 460 | f->lsm_str = NULL; |
| 441 | f->lsm_rule = NULL; | 461 | f->lsm_rule = NULL; |
| 442 | switch(f->type) { | 462 | switch(f->type) { |
| 443 | case AUDIT_PID: | ||
| 444 | case AUDIT_UID: | 463 | case AUDIT_UID: |
| 445 | case AUDIT_EUID: | 464 | case AUDIT_EUID: |
| 446 | case AUDIT_SUID: | 465 | case AUDIT_SUID: |
| 447 | case AUDIT_FSUID: | 466 | case AUDIT_FSUID: |
| 467 | case AUDIT_LOGINUID: | ||
| 468 | case AUDIT_OBJ_UID: | ||
| 469 | /* bit ops not implemented for uid comparisons */ | ||
| 470 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
| 471 | goto exit_free; | ||
| 472 | |||
| 473 | f->uid = make_kuid(current_user_ns(), f->val); | ||
| 474 | if (!uid_valid(f->uid)) | ||
| 475 | goto exit_free; | ||
| 476 | break; | ||
| 448 | case AUDIT_GID: | 477 | case AUDIT_GID: |
| 449 | case AUDIT_EGID: | 478 | case AUDIT_EGID: |
| 450 | case AUDIT_SGID: | 479 | case AUDIT_SGID: |
| 451 | case AUDIT_FSGID: | 480 | case AUDIT_FSGID: |
| 452 | case AUDIT_LOGINUID: | 481 | case AUDIT_OBJ_GID: |
| 482 | /* bit ops not implemented for gid comparisons */ | ||
| 483 | if (f->op == Audit_bitmask || f->op == Audit_bittest) | ||
| 484 | goto exit_free; | ||
| 485 | |||
| 486 | f->gid = make_kgid(current_user_ns(), f->val); | ||
| 487 | if (!gid_valid(f->gid)) | ||
| 488 | goto exit_free; | ||
| 489 | break; | ||
| 490 | case AUDIT_PID: | ||
| 453 | case AUDIT_PERS: | 491 | case AUDIT_PERS: |
| 454 | case AUDIT_MSGTYPE: | 492 | case AUDIT_MSGTYPE: |
| 455 | case AUDIT_PPID: | 493 | case AUDIT_PPID: |
| @@ -461,8 +499,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, | |||
| 461 | case AUDIT_ARG1: | 499 | case AUDIT_ARG1: |
| 462 | case AUDIT_ARG2: | 500 | case AUDIT_ARG2: |
| 463 | case AUDIT_ARG3: | 501 | case AUDIT_ARG3: |
| 464 | case AUDIT_OBJ_UID: | ||
| 465 | case AUDIT_OBJ_GID: | ||
| 466 | break; | 502 | break; |
| 467 | case AUDIT_ARCH: | 503 | case AUDIT_ARCH: |
| 468 | entry->rule.arch_f = f; | 504 | entry->rule.arch_f = f; |
| @@ -707,6 +743,23 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) | |||
| 707 | if (strcmp(a->filterkey, b->filterkey)) | 743 | if (strcmp(a->filterkey, b->filterkey)) |
| 708 | return 1; | 744 | return 1; |
| 709 | break; | 745 | break; |
| 746 | case AUDIT_UID: | ||
| 747 | case AUDIT_EUID: | ||
| 748 | case AUDIT_SUID: | ||
| 749 | case AUDIT_FSUID: | ||
| 750 | case AUDIT_LOGINUID: | ||
| 751 | case AUDIT_OBJ_UID: | ||
| 752 | if (!uid_eq(a->fields[i].uid, b->fields[i].uid)) | ||
| 753 | return 1; | ||
| 754 | break; | ||
| 755 | case AUDIT_GID: | ||
| 756 | case AUDIT_EGID: | ||
| 757 | case AUDIT_SGID: | ||
| 758 | case AUDIT_FSGID: | ||
| 759 | case AUDIT_OBJ_GID: | ||
| 760 | if (!gid_eq(a->fields[i].gid, b->fields[i].gid)) | ||
| 761 | return 1; | ||
| 762 | break; | ||
| 710 | default: | 763 | default: |
| 711 | if (a->fields[i].val != b->fields[i].val) | 764 | if (a->fields[i].val != b->fields[i].val) |
| 712 | return 1; | 765 | return 1; |
| @@ -1056,7 +1109,7 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) | |||
| 1056 | } | 1109 | } |
| 1057 | 1110 | ||
| 1058 | /* Log rule additions and removals */ | 1111 | /* Log rule additions and removals */ |
| 1059 | static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, | 1112 | static void audit_log_rule_change(kuid_t loginuid, u32 sessionid, u32 sid, |
| 1060 | char *action, struct audit_krule *rule, | 1113 | char *action, struct audit_krule *rule, |
| 1061 | int res) | 1114 | int res) |
| 1062 | { | 1115 | { |
| @@ -1068,7 +1121,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, | |||
| 1068 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 1121 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); |
| 1069 | if (!ab) | 1122 | if (!ab) |
| 1070 | return; | 1123 | return; |
| 1071 | audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); | 1124 | audit_log_format(ab, "auid=%u ses=%u", |
| 1125 | from_kuid(&init_user_ns, loginuid), sessionid); | ||
| 1072 | if (sid) { | 1126 | if (sid) { |
| 1073 | char *ctx = NULL; | 1127 | char *ctx = NULL; |
| 1074 | u32 len; | 1128 | u32 len; |
| @@ -1098,8 +1152,8 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, | |||
| 1098 | * @sessionid: sessionid for netlink audit message | 1152 | * @sessionid: sessionid for netlink audit message |
| 1099 | * @sid: SE Linux Security ID of sender | 1153 | * @sid: SE Linux Security ID of sender |
| 1100 | */ | 1154 | */ |
| 1101 | int audit_receive_filter(int type, int pid, int uid, int seq, void *data, | 1155 | int audit_receive_filter(int type, int pid, int seq, void *data, |
| 1102 | size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) | 1156 | size_t datasz, kuid_t loginuid, u32 sessionid, u32 sid) |
| 1103 | { | 1157 | { |
| 1104 | struct task_struct *tsk; | 1158 | struct task_struct *tsk; |
| 1105 | struct audit_netlink_list *dest; | 1159 | struct audit_netlink_list *dest; |
| @@ -1198,6 +1252,52 @@ int audit_comparator(u32 left, u32 op, u32 right) | |||
| 1198 | } | 1252 | } |
| 1199 | } | 1253 | } |
| 1200 | 1254 | ||
| 1255 | int audit_uid_comparator(kuid_t left, u32 op, kuid_t right) | ||
| 1256 | { | ||
| 1257 | switch (op) { | ||
| 1258 | case Audit_equal: | ||
| 1259 | return uid_eq(left, right); | ||
| 1260 | case Audit_not_equal: | ||
| 1261 | return !uid_eq(left, right); | ||
| 1262 | case Audit_lt: | ||
| 1263 | return uid_lt(left, right); | ||
| 1264 | case Audit_le: | ||
| 1265 | return uid_lte(left, right); | ||
| 1266 | case Audit_gt: | ||
| 1267 | return uid_gt(left, right); | ||
| 1268 | case Audit_ge: | ||
| 1269 | return uid_gte(left, right); | ||
| 1270 | case Audit_bitmask: | ||
| 1271 | case Audit_bittest: | ||
| 1272 | default: | ||
| 1273 | BUG(); | ||
| 1274 | return 0; | ||
| 1275 | } | ||
| 1276 | } | ||
| 1277 | |||
| 1278 | int audit_gid_comparator(kgid_t left, u32 op, kgid_t right) | ||
| 1279 | { | ||
| 1280 | switch (op) { | ||
| 1281 | case Audit_equal: | ||
| 1282 | return gid_eq(left, right); | ||
| 1283 | case Audit_not_equal: | ||
| 1284 | return !gid_eq(left, right); | ||
| 1285 | case Audit_lt: | ||
| 1286 | return gid_lt(left, right); | ||
| 1287 | case Audit_le: | ||
| 1288 | return gid_lte(left, right); | ||
| 1289 | case Audit_gt: | ||
| 1290 | return gid_gt(left, right); | ||
| 1291 | case Audit_ge: | ||
| 1292 | return gid_gte(left, right); | ||
| 1293 | case Audit_bitmask: | ||
| 1294 | case Audit_bittest: | ||
| 1295 | default: | ||
| 1296 | BUG(); | ||
| 1297 | return 0; | ||
| 1298 | } | ||
| 1299 | } | ||
| 1300 | |||
| 1201 | /* Compare given dentry name with last component in given path, | 1301 | /* Compare given dentry name with last component in given path, |
| 1202 | * return of 0 indicates a match. */ | 1302 | * return of 0 indicates a match. */ |
| 1203 | int audit_compare_dname_path(const char *dname, const char *path, | 1303 | int audit_compare_dname_path(const char *dname, const char *path, |
| @@ -1236,8 +1336,7 @@ int audit_compare_dname_path(const char *dname, const char *path, | |||
| 1236 | return strncmp(p, dname, dlen); | 1336 | return strncmp(p, dname, dlen); |
| 1237 | } | 1337 | } |
| 1238 | 1338 | ||
| 1239 | static int audit_filter_user_rules(struct netlink_skb_parms *cb, | 1339 | static int audit_filter_user_rules(struct audit_krule *rule, |
| 1240 | struct audit_krule *rule, | ||
| 1241 | enum audit_state *state) | 1340 | enum audit_state *state) |
| 1242 | { | 1341 | { |
| 1243 | int i; | 1342 | int i; |
| @@ -1249,17 +1348,17 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, | |||
| 1249 | 1348 | ||
| 1250 | switch (f->type) { | 1349 | switch (f->type) { |
| 1251 | case AUDIT_PID: | 1350 | case AUDIT_PID: |
| 1252 | result = audit_comparator(cb->creds.pid, f->op, f->val); | 1351 | result = audit_comparator(task_pid_vnr(current), f->op, f->val); |
| 1253 | break; | 1352 | break; |
| 1254 | case AUDIT_UID: | 1353 | case AUDIT_UID: |
| 1255 | result = audit_comparator(cb->creds.uid, f->op, f->val); | 1354 | result = audit_uid_comparator(current_uid(), f->op, f->uid); |
| 1256 | break; | 1355 | break; |
| 1257 | case AUDIT_GID: | 1356 | case AUDIT_GID: |
| 1258 | result = audit_comparator(cb->creds.gid, f->op, f->val); | 1357 | result = audit_gid_comparator(current_gid(), f->op, f->gid); |
| 1259 | break; | 1358 | break; |
| 1260 | case AUDIT_LOGINUID: | 1359 | case AUDIT_LOGINUID: |
| 1261 | result = audit_comparator(audit_get_loginuid(current), | 1360 | result = audit_uid_comparator(audit_get_loginuid(current), |
| 1262 | f->op, f->val); | 1361 | f->op, f->uid); |
| 1263 | break; | 1362 | break; |
| 1264 | case AUDIT_SUBJ_USER: | 1363 | case AUDIT_SUBJ_USER: |
| 1265 | case AUDIT_SUBJ_ROLE: | 1364 | case AUDIT_SUBJ_ROLE: |
| @@ -1287,7 +1386,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, | |||
| 1287 | return 1; | 1386 | return 1; |
| 1288 | } | 1387 | } |
| 1289 | 1388 | ||
| 1290 | int audit_filter_user(struct netlink_skb_parms *cb) | 1389 | int audit_filter_user(void) |
| 1291 | { | 1390 | { |
| 1292 | enum audit_state state = AUDIT_DISABLED; | 1391 | enum audit_state state = AUDIT_DISABLED; |
| 1293 | struct audit_entry *e; | 1392 | struct audit_entry *e; |
| @@ -1295,7 +1394,7 @@ int audit_filter_user(struct netlink_skb_parms *cb) | |||
| 1295 | 1394 | ||
| 1296 | rcu_read_lock(); | 1395 | rcu_read_lock(); |
| 1297 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { | 1396 | list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { |
| 1298 | if (audit_filter_user_rules(cb, &e->rule, &state)) { | 1397 | if (audit_filter_user_rules(&e->rule, &state)) { |
| 1299 | if (state == AUDIT_DISABLED) | 1398 | if (state == AUDIT_DISABLED) |
| 1300 | ret = 0; | 1399 | ret = 0; |
| 1301 | break; | 1400 | break; |
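With the auditfilter.c changes above, struct audit_field now carries kuid_t/kgid_t values (built with make_kuid()/make_kgid() and validated with uid_valid()/gid_valid()) and matches them through the new audit_uid_comparator()/audit_gid_comparator() helpers, which reject the bitmask/bittest operators since bit operations are not meaningful on uids. A small hypothetical helper showing why the uid_eq()/uid_lt()/uid_gte() accessors are used instead of plain C comparisons (demo_uid_in_range() is illustrative only):

#include <linux/types.h>
#include <linux/uidgid.h>

/*
 * kuid_t may be an opaque struct rather than a bare integer, so
 * comparisons go through the uidgid.h helpers, exactly as the
 * comparators added above do for each audit operator.
 */
static bool demo_uid_in_range(kuid_t uid, kuid_t lo, kuid_t hi)
{
	return uid_gte(uid, lo) && uid_lte(uid, hi);	/* lo <= uid <= hi */
}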
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 4b96415527b8..f4a7756f999c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
| @@ -113,8 +113,8 @@ struct audit_names { | |||
| 113 | unsigned long ino; | 113 | unsigned long ino; |
| 114 | dev_t dev; | 114 | dev_t dev; |
| 115 | umode_t mode; | 115 | umode_t mode; |
| 116 | uid_t uid; | 116 | kuid_t uid; |
| 117 | gid_t gid; | 117 | kgid_t gid; |
| 118 | dev_t rdev; | 118 | dev_t rdev; |
| 119 | u32 osid; | 119 | u32 osid; |
| 120 | struct audit_cap_data fcap; | 120 | struct audit_cap_data fcap; |
| @@ -149,8 +149,8 @@ struct audit_aux_data_execve { | |||
| 149 | struct audit_aux_data_pids { | 149 | struct audit_aux_data_pids { |
| 150 | struct audit_aux_data d; | 150 | struct audit_aux_data d; |
| 151 | pid_t target_pid[AUDIT_AUX_PIDS]; | 151 | pid_t target_pid[AUDIT_AUX_PIDS]; |
| 152 | uid_t target_auid[AUDIT_AUX_PIDS]; | 152 | kuid_t target_auid[AUDIT_AUX_PIDS]; |
| 153 | uid_t target_uid[AUDIT_AUX_PIDS]; | 153 | kuid_t target_uid[AUDIT_AUX_PIDS]; |
| 154 | unsigned int target_sessionid[AUDIT_AUX_PIDS]; | 154 | unsigned int target_sessionid[AUDIT_AUX_PIDS]; |
| 155 | u32 target_sid[AUDIT_AUX_PIDS]; | 155 | u32 target_sid[AUDIT_AUX_PIDS]; |
| 156 | char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; | 156 | char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; |
| @@ -208,14 +208,14 @@ struct audit_context { | |||
| 208 | size_t sockaddr_len; | 208 | size_t sockaddr_len; |
| 209 | /* Save things to print about task_struct */ | 209 | /* Save things to print about task_struct */ |
| 210 | pid_t pid, ppid; | 210 | pid_t pid, ppid; |
| 211 | uid_t uid, euid, suid, fsuid; | 211 | kuid_t uid, euid, suid, fsuid; |
| 212 | gid_t gid, egid, sgid, fsgid; | 212 | kgid_t gid, egid, sgid, fsgid; |
| 213 | unsigned long personality; | 213 | unsigned long personality; |
| 214 | int arch; | 214 | int arch; |
| 215 | 215 | ||
| 216 | pid_t target_pid; | 216 | pid_t target_pid; |
| 217 | uid_t target_auid; | 217 | kuid_t target_auid; |
| 218 | uid_t target_uid; | 218 | kuid_t target_uid; |
| 219 | unsigned int target_sessionid; | 219 | unsigned int target_sessionid; |
| 220 | u32 target_sid; | 220 | u32 target_sid; |
| 221 | char target_comm[TASK_COMM_LEN]; | 221 | char target_comm[TASK_COMM_LEN]; |
| @@ -231,8 +231,8 @@ struct audit_context { | |||
| 231 | long args[6]; | 231 | long args[6]; |
| 232 | } socketcall; | 232 | } socketcall; |
| 233 | struct { | 233 | struct { |
| 234 | uid_t uid; | 234 | kuid_t uid; |
| 235 | gid_t gid; | 235 | kgid_t gid; |
| 236 | umode_t mode; | 236 | umode_t mode; |
| 237 | u32 osid; | 237 | u32 osid; |
| 238 | int has_perm; | 238 | int has_perm; |
| @@ -464,37 +464,47 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) | |||
| 464 | return 0; | 464 | return 0; |
| 465 | } | 465 | } |
| 466 | 466 | ||
| 467 | static int audit_compare_id(uid_t uid1, | 467 | static int audit_compare_uid(kuid_t uid, |
| 468 | struct audit_names *name, | 468 | struct audit_names *name, |
| 469 | unsigned long name_offset, | 469 | struct audit_field *f, |
| 470 | struct audit_field *f, | 470 | struct audit_context *ctx) |
| 471 | struct audit_context *ctx) | ||
| 472 | { | 471 | { |
| 473 | struct audit_names *n; | 472 | struct audit_names *n; |
| 474 | unsigned long addr; | ||
| 475 | uid_t uid2; | ||
| 476 | int rc; | 473 | int rc; |
| 477 | 474 | ||
| 478 | BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); | ||
| 479 | |||
| 480 | if (name) { | 475 | if (name) { |
| 481 | addr = (unsigned long)name; | 476 | rc = audit_uid_comparator(uid, f->op, name->uid); |
| 482 | addr += name_offset; | ||
| 483 | |||
| 484 | uid2 = *(uid_t *)addr; | ||
| 485 | rc = audit_comparator(uid1, f->op, uid2); | ||
| 486 | if (rc) | 477 | if (rc) |
| 487 | return rc; | 478 | return rc; |
| 488 | } | 479 | } |
| 489 | 480 | ||
| 490 | if (ctx) { | 481 | if (ctx) { |
| 491 | list_for_each_entry(n, &ctx->names_list, list) { | 482 | list_for_each_entry(n, &ctx->names_list, list) { |
| 492 | addr = (unsigned long)n; | 483 | rc = audit_uid_comparator(uid, f->op, n->uid); |
| 493 | addr += name_offset; | 484 | if (rc) |
| 494 | 485 | return rc; | |
| 495 | uid2 = *(uid_t *)addr; | 486 | } |
| 487 | } | ||
| 488 | return 0; | ||
| 489 | } | ||
| 496 | 490 | ||
| 497 | rc = audit_comparator(uid1, f->op, uid2); | 491 | static int audit_compare_gid(kgid_t gid, |
| 492 | struct audit_names *name, | ||
| 493 | struct audit_field *f, | ||
| 494 | struct audit_context *ctx) | ||
| 495 | { | ||
| 496 | struct audit_names *n; | ||
| 497 | int rc; | ||
| 498 | |||
| 499 | if (name) { | ||
| 500 | rc = audit_gid_comparator(gid, f->op, name->gid); | ||
| 501 | if (rc) | ||
| 502 | return rc; | ||
| 503 | } | ||
| 504 | |||
| 505 | if (ctx) { | ||
| 506 | list_for_each_entry(n, &ctx->names_list, list) { | ||
| 507 | rc = audit_gid_comparator(gid, f->op, n->gid); | ||
| 498 | if (rc) | 508 | if (rc) |
| 499 | return rc; | 509 | return rc; |
| 500 | } | 510 | } |
| @@ -511,80 +521,62 @@ static int audit_field_compare(struct task_struct *tsk, | |||
| 511 | switch (f->val) { | 521 | switch (f->val) { |
| 512 | /* process to file object comparisons */ | 522 | /* process to file object comparisons */ |
| 513 | case AUDIT_COMPARE_UID_TO_OBJ_UID: | 523 | case AUDIT_COMPARE_UID_TO_OBJ_UID: |
| 514 | return audit_compare_id(cred->uid, | 524 | return audit_compare_uid(cred->uid, name, f, ctx); |
| 515 | name, offsetof(struct audit_names, uid), | ||
| 516 | f, ctx); | ||
| 517 | case AUDIT_COMPARE_GID_TO_OBJ_GID: | 525 | case AUDIT_COMPARE_GID_TO_OBJ_GID: |
| 518 | return audit_compare_id(cred->gid, | 526 | return audit_compare_gid(cred->gid, name, f, ctx); |
| 519 | name, offsetof(struct audit_names, gid), | ||
| 520 | f, ctx); | ||
| 521 | case AUDIT_COMPARE_EUID_TO_OBJ_UID: | 527 | case AUDIT_COMPARE_EUID_TO_OBJ_UID: |
| 522 | return audit_compare_id(cred->euid, | 528 | return audit_compare_uid(cred->euid, name, f, ctx); |
| 523 | name, offsetof(struct audit_names, uid), | ||
| 524 | f, ctx); | ||
| 525 | case AUDIT_COMPARE_EGID_TO_OBJ_GID: | 529 | case AUDIT_COMPARE_EGID_TO_OBJ_GID: |
| 526 | return audit_compare_id(cred->egid, | 530 | return audit_compare_gid(cred->egid, name, f, ctx); |
| 527 | name, offsetof(struct audit_names, gid), | ||
| 528 | f, ctx); | ||
| 529 | case AUDIT_COMPARE_AUID_TO_OBJ_UID: | 531 | case AUDIT_COMPARE_AUID_TO_OBJ_UID: |
| 530 | return audit_compare_id(tsk->loginuid, | 532 | return audit_compare_uid(tsk->loginuid, name, f, ctx); |
| 531 | name, offsetof(struct audit_names, uid), | ||
| 532 | f, ctx); | ||
| 533 | case AUDIT_COMPARE_SUID_TO_OBJ_UID: | 533 | case AUDIT_COMPARE_SUID_TO_OBJ_UID: |
| 534 | return audit_compare_id(cred->suid, | 534 | return audit_compare_uid(cred->suid, name, f, ctx); |
| 535 | name, offsetof(struct audit_names, uid), | ||
| 536 | f, ctx); | ||
| 537 | case AUDIT_COMPARE_SGID_TO_OBJ_GID: | 535 | case AUDIT_COMPARE_SGID_TO_OBJ_GID: |
| 538 | return audit_compare_id(cred->sgid, | 536 | return audit_compare_gid(cred->sgid, name, f, ctx); |
| 539 | name, offsetof(struct audit_names, gid), | ||
| 540 | f, ctx); | ||
| 541 | case AUDIT_COMPARE_FSUID_TO_OBJ_UID: | 537 | case AUDIT_COMPARE_FSUID_TO_OBJ_UID: |
| 542 | return audit_compare_id(cred->fsuid, | 538 | return audit_compare_uid(cred->fsuid, name, f, ctx); |
| 543 | name, offsetof(struct audit_names, uid), | ||
| 544 | f, ctx); | ||
| 545 | case AUDIT_COMPARE_FSGID_TO_OBJ_GID: | 539 | case AUDIT_COMPARE_FSGID_TO_OBJ_GID: |
| 546 | return audit_compare_id(cred->fsgid, | 540 | return audit_compare_gid(cred->fsgid, name, f, ctx); |
| 547 | name, offsetof(struct audit_names, gid), | ||
| 548 | f, ctx); | ||
| 549 | /* uid comparisons */ | 541 | /* uid comparisons */ |
| 550 | case AUDIT_COMPARE_UID_TO_AUID: | 542 | case AUDIT_COMPARE_UID_TO_AUID: |
| 551 | return audit_comparator(cred->uid, f->op, tsk->loginuid); | 543 | return audit_uid_comparator(cred->uid, f->op, tsk->loginuid); |
| 552 | case AUDIT_COMPARE_UID_TO_EUID: | 544 | case AUDIT_COMPARE_UID_TO_EUID: |
| 553 | return audit_comparator(cred->uid, f->op, cred->euid); | 545 | return audit_uid_comparator(cred->uid, f->op, cred->euid); |
| 554 | case AUDIT_COMPARE_UID_TO_SUID: | 546 | case AUDIT_COMPARE_UID_TO_SUID: |
| 555 | return audit_comparator(cred->uid, f->op, cred->suid); | 547 | return audit_uid_comparator(cred->uid, f->op, cred->suid); |
| 556 | case AUDIT_COMPARE_UID_TO_FSUID: | 548 | case AUDIT_COMPARE_UID_TO_FSUID: |
| 557 | return audit_comparator(cred->uid, f->op, cred->fsuid); | 549 | return audit_uid_comparator(cred->uid, f->op, cred->fsuid); |
| 558 | /* auid comparisons */ | 550 | /* auid comparisons */ |
| 559 | case AUDIT_COMPARE_AUID_TO_EUID: | 551 | case AUDIT_COMPARE_AUID_TO_EUID: |
| 560 | return audit_comparator(tsk->loginuid, f->op, cred->euid); | 552 | return audit_uid_comparator(tsk->loginuid, f->op, cred->euid); |
| 561 | case AUDIT_COMPARE_AUID_TO_SUID: | 553 | case AUDIT_COMPARE_AUID_TO_SUID: |
| 562 | return audit_comparator(tsk->loginuid, f->op, cred->suid); | 554 | return audit_uid_comparator(tsk->loginuid, f->op, cred->suid); |
| 563 | case AUDIT_COMPARE_AUID_TO_FSUID: | 555 | case AUDIT_COMPARE_AUID_TO_FSUID: |
| 564 | return audit_comparator(tsk->loginuid, f->op, cred->fsuid); | 556 | return audit_uid_comparator(tsk->loginuid, f->op, cred->fsuid); |
| 565 | /* euid comparisons */ | 557 | /* euid comparisons */ |
| 566 | case AUDIT_COMPARE_EUID_TO_SUID: | 558 | case AUDIT_COMPARE_EUID_TO_SUID: |
| 567 | return audit_comparator(cred->euid, f->op, cred->suid); | 559 | return audit_uid_comparator(cred->euid, f->op, cred->suid); |
| 568 | case AUDIT_COMPARE_EUID_TO_FSUID: | 560 | case AUDIT_COMPARE_EUID_TO_FSUID: |
| 569 | return audit_comparator(cred->euid, f->op, cred->fsuid); | 561 | return audit_uid_comparator(cred->euid, f->op, cred->fsuid); |
| 570 | /* suid comparisons */ | 562 | /* suid comparisons */ |
| 571 | case AUDIT_COMPARE_SUID_TO_FSUID: | 563 | case AUDIT_COMPARE_SUID_TO_FSUID: |
| 572 | return audit_comparator(cred->suid, f->op, cred->fsuid); | 564 | return audit_uid_comparator(cred->suid, f->op, cred->fsuid); |
| 573 | /* gid comparisons */ | 565 | /* gid comparisons */ |
| 574 | case AUDIT_COMPARE_GID_TO_EGID: | 566 | case AUDIT_COMPARE_GID_TO_EGID: |
| 575 | return audit_comparator(cred->gid, f->op, cred->egid); | 567 | return audit_gid_comparator(cred->gid, f->op, cred->egid); |
| 576 | case AUDIT_COMPARE_GID_TO_SGID: | 568 | case AUDIT_COMPARE_GID_TO_SGID: |
| 577 | return audit_comparator(cred->gid, f->op, cred->sgid); | 569 | return audit_gid_comparator(cred->gid, f->op, cred->sgid); |
| 578 | case AUDIT_COMPARE_GID_TO_FSGID: | 570 | case AUDIT_COMPARE_GID_TO_FSGID: |
| 579 | return audit_comparator(cred->gid, f->op, cred->fsgid); | 571 | return audit_gid_comparator(cred->gid, f->op, cred->fsgid); |
| 580 | /* egid comparisons */ | 572 | /* egid comparisons */ |
| 581 | case AUDIT_COMPARE_EGID_TO_SGID: | 573 | case AUDIT_COMPARE_EGID_TO_SGID: |
| 582 | return audit_comparator(cred->egid, f->op, cred->sgid); | 574 | return audit_gid_comparator(cred->egid, f->op, cred->sgid); |
| 583 | case AUDIT_COMPARE_EGID_TO_FSGID: | 575 | case AUDIT_COMPARE_EGID_TO_FSGID: |
| 584 | return audit_comparator(cred->egid, f->op, cred->fsgid); | 576 | return audit_gid_comparator(cred->egid, f->op, cred->fsgid); |
| 585 | /* sgid comparison */ | 577 | /* sgid comparison */ |
| 586 | case AUDIT_COMPARE_SGID_TO_FSGID: | 578 | case AUDIT_COMPARE_SGID_TO_FSGID: |
| 587 | return audit_comparator(cred->sgid, f->op, cred->fsgid); | 579 | return audit_gid_comparator(cred->sgid, f->op, cred->fsgid); |
| 588 | default: | 580 | default: |
| 589 | WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); | 581 | WARN(1, "Missing AUDIT_COMPARE define. Report as a bug\n"); |
| 590 | return 0; | 582 | return 0; |
| @@ -630,28 +622,28 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 630 | } | 622 | } |
| 631 | break; | 623 | break; |
| 632 | case AUDIT_UID: | 624 | case AUDIT_UID: |
| 633 | result = audit_comparator(cred->uid, f->op, f->val); | 625 | result = audit_uid_comparator(cred->uid, f->op, f->uid); |
| 634 | break; | 626 | break; |
| 635 | case AUDIT_EUID: | 627 | case AUDIT_EUID: |
| 636 | result = audit_comparator(cred->euid, f->op, f->val); | 628 | result = audit_uid_comparator(cred->euid, f->op, f->uid); |
| 637 | break; | 629 | break; |
| 638 | case AUDIT_SUID: | 630 | case AUDIT_SUID: |
| 639 | result = audit_comparator(cred->suid, f->op, f->val); | 631 | result = audit_uid_comparator(cred->suid, f->op, f->uid); |
| 640 | break; | 632 | break; |
| 641 | case AUDIT_FSUID: | 633 | case AUDIT_FSUID: |
| 642 | result = audit_comparator(cred->fsuid, f->op, f->val); | 634 | result = audit_uid_comparator(cred->fsuid, f->op, f->uid); |
| 643 | break; | 635 | break; |
| 644 | case AUDIT_GID: | 636 | case AUDIT_GID: |
| 645 | result = audit_comparator(cred->gid, f->op, f->val); | 637 | result = audit_gid_comparator(cred->gid, f->op, f->gid); |
| 646 | break; | 638 | break; |
| 647 | case AUDIT_EGID: | 639 | case AUDIT_EGID: |
| 648 | result = audit_comparator(cred->egid, f->op, f->val); | 640 | result = audit_gid_comparator(cred->egid, f->op, f->gid); |
| 649 | break; | 641 | break; |
| 650 | case AUDIT_SGID: | 642 | case AUDIT_SGID: |
| 651 | result = audit_comparator(cred->sgid, f->op, f->val); | 643 | result = audit_gid_comparator(cred->sgid, f->op, f->gid); |
| 652 | break; | 644 | break; |
| 653 | case AUDIT_FSGID: | 645 | case AUDIT_FSGID: |
| 654 | result = audit_comparator(cred->fsgid, f->op, f->val); | 646 | result = audit_gid_comparator(cred->fsgid, f->op, f->gid); |
| 655 | break; | 647 | break; |
| 656 | case AUDIT_PERS: | 648 | case AUDIT_PERS: |
| 657 | result = audit_comparator(tsk->personality, f->op, f->val); | 649 | result = audit_comparator(tsk->personality, f->op, f->val); |
| @@ -717,10 +709,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 717 | break; | 709 | break; |
| 718 | case AUDIT_OBJ_UID: | 710 | case AUDIT_OBJ_UID: |
| 719 | if (name) { | 711 | if (name) { |
| 720 | result = audit_comparator(name->uid, f->op, f->val); | 712 | result = audit_uid_comparator(name->uid, f->op, f->uid); |
| 721 | } else if (ctx) { | 713 | } else if (ctx) { |
| 722 | list_for_each_entry(n, &ctx->names_list, list) { | 714 | list_for_each_entry(n, &ctx->names_list, list) { |
| 723 | if (audit_comparator(n->uid, f->op, f->val)) { | 715 | if (audit_uid_comparator(n->uid, f->op, f->uid)) { |
| 724 | ++result; | 716 | ++result; |
| 725 | break; | 717 | break; |
| 726 | } | 718 | } |
| @@ -729,10 +721,10 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 729 | break; | 721 | break; |
| 730 | case AUDIT_OBJ_GID: | 722 | case AUDIT_OBJ_GID: |
| 731 | if (name) { | 723 | if (name) { |
| 732 | result = audit_comparator(name->gid, f->op, f->val); | 724 | result = audit_gid_comparator(name->gid, f->op, f->gid); |
| 733 | } else if (ctx) { | 725 | } else if (ctx) { |
| 734 | list_for_each_entry(n, &ctx->names_list, list) { | 726 | list_for_each_entry(n, &ctx->names_list, list) { |
| 735 | if (audit_comparator(n->gid, f->op, f->val)) { | 727 | if (audit_gid_comparator(n->gid, f->op, f->gid)) { |
| 736 | ++result; | 728 | ++result; |
| 737 | break; | 729 | break; |
| 738 | } | 730 | } |
| @@ -750,7 +742,7 @@ static int audit_filter_rules(struct task_struct *tsk, | |||
| 750 | case AUDIT_LOGINUID: | 742 | case AUDIT_LOGINUID: |
| 751 | result = 0; | 743 | result = 0; |
| 752 | if (ctx) | 744 | if (ctx) |
| 753 | result = audit_comparator(tsk->loginuid, f->op, f->val); | 745 | result = audit_uid_comparator(tsk->loginuid, f->op, f->uid); |
| 754 | break; | 746 | break; |
| 755 | case AUDIT_SUBJ_USER: | 747 | case AUDIT_SUBJ_USER: |
| 756 | case AUDIT_SUBJ_ROLE: | 748 | case AUDIT_SUBJ_ROLE: |
| @@ -1154,13 +1146,43 @@ error_path: | |||
| 1154 | 1146 | ||
| 1155 | EXPORT_SYMBOL(audit_log_task_context); | 1147 | EXPORT_SYMBOL(audit_log_task_context); |
| 1156 | 1148 | ||
| 1157 | static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) | 1149 | void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) |
| 1158 | { | 1150 | { |
| 1151 | const struct cred *cred; | ||
| 1159 | char name[sizeof(tsk->comm)]; | 1152 | char name[sizeof(tsk->comm)]; |
| 1160 | struct mm_struct *mm = tsk->mm; | 1153 | struct mm_struct *mm = tsk->mm; |
| 1161 | struct vm_area_struct *vma; | 1154 | char *tty; |
| 1155 | |||
| 1156 | if (!ab) | ||
| 1157 | return; | ||
| 1162 | 1158 | ||
| 1163 | /* tsk == current */ | 1159 | /* tsk == current */ |
| 1160 | cred = current_cred(); | ||
| 1161 | |||
| 1162 | spin_lock_irq(&tsk->sighand->siglock); | ||
| 1163 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) | ||
| 1164 | tty = tsk->signal->tty->name; | ||
| 1165 | else | ||
| 1166 | tty = "(none)"; | ||
| 1167 | spin_unlock_irq(&tsk->sighand->siglock); | ||
| 1168 | |||
| 1169 | |||
| 1170 | audit_log_format(ab, | ||
| 1171 | " ppid=%ld pid=%d auid=%u uid=%u gid=%u" | ||
| 1172 | " euid=%u suid=%u fsuid=%u" | ||
| 1173 | " egid=%u sgid=%u fsgid=%u ses=%u tty=%s", | ||
| 1174 | sys_getppid(), | ||
| 1175 | tsk->pid, | ||
| 1176 | from_kuid(&init_user_ns, tsk->loginuid), | ||
| 1177 | from_kuid(&init_user_ns, cred->uid), | ||
| 1178 | from_kgid(&init_user_ns, cred->gid), | ||
| 1179 | from_kuid(&init_user_ns, cred->euid), | ||
| 1180 | from_kuid(&init_user_ns, cred->suid), | ||
| 1181 | from_kuid(&init_user_ns, cred->fsuid), | ||
| 1182 | from_kgid(&init_user_ns, cred->egid), | ||
| 1183 | from_kgid(&init_user_ns, cred->sgid), | ||
| 1184 | from_kgid(&init_user_ns, cred->fsgid), | ||
| 1185 | tsk->sessionid, tty); | ||
| 1164 | 1186 | ||
| 1165 | get_task_comm(name, tsk); | 1187 | get_task_comm(name, tsk); |
| 1166 | audit_log_format(ab, " comm="); | 1188 | audit_log_format(ab, " comm="); |
| @@ -1168,23 +1190,17 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk | |||
| 1168 | 1190 | ||
| 1169 | if (mm) { | 1191 | if (mm) { |
| 1170 | down_read(&mm->mmap_sem); | 1192 | down_read(&mm->mmap_sem); |
| 1171 | vma = mm->mmap; | 1193 | if (mm->exe_file) |
| 1172 | while (vma) { | 1194 | audit_log_d_path(ab, " exe=", &mm->exe_file->f_path); |
| 1173 | if ((vma->vm_flags & VM_EXECUTABLE) && | ||
| 1174 | vma->vm_file) { | ||
| 1175 | audit_log_d_path(ab, " exe=", | ||
| 1176 | &vma->vm_file->f_path); | ||
| 1177 | break; | ||
| 1178 | } | ||
| 1179 | vma = vma->vm_next; | ||
| 1180 | } | ||
| 1181 | up_read(&mm->mmap_sem); | 1195 | up_read(&mm->mmap_sem); |
| 1182 | } | 1196 | } |
| 1183 | audit_log_task_context(ab); | 1197 | audit_log_task_context(ab); |
| 1184 | } | 1198 | } |
| 1185 | 1199 | ||
| 1200 | EXPORT_SYMBOL(audit_log_task_info); | ||
| 1201 | |||
| 1186 | static int audit_log_pid_context(struct audit_context *context, pid_t pid, | 1202 | static int audit_log_pid_context(struct audit_context *context, pid_t pid, |
| 1187 | uid_t auid, uid_t uid, unsigned int sessionid, | 1203 | kuid_t auid, kuid_t uid, unsigned int sessionid, |
| 1188 | u32 sid, char *comm) | 1204 | u32 sid, char *comm) |
| 1189 | { | 1205 | { |
| 1190 | struct audit_buffer *ab; | 1206 | struct audit_buffer *ab; |
| @@ -1196,8 +1212,9 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid, | |||
| 1196 | if (!ab) | 1212 | if (!ab) |
| 1197 | return rc; | 1213 | return rc; |
| 1198 | 1214 | ||
| 1199 | audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, | 1215 | audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, |
| 1200 | uid, sessionid); | 1216 | from_kuid(&init_user_ns, auid), |
| 1217 | from_kuid(&init_user_ns, uid), sessionid); | ||
| 1201 | if (security_secid_to_secctx(sid, &ctx, &len)) { | 1218 | if (security_secid_to_secctx(sid, &ctx, &len)) { |
| 1202 | audit_log_format(ab, " obj=(none)"); | 1219 | audit_log_format(ab, " obj=(none)"); |
| 1203 | rc = 1; | 1220 | rc = 1; |
| @@ -1447,7 +1464,9 @@ static void show_special(struct audit_context *context, int *call_panic) | |||
| 1447 | u32 osid = context->ipc.osid; | 1464 | u32 osid = context->ipc.osid; |
| 1448 | 1465 | ||
| 1449 | audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", | 1466 | audit_log_format(ab, "ouid=%u ogid=%u mode=%#ho", |
| 1450 | context->ipc.uid, context->ipc.gid, context->ipc.mode); | 1467 | from_kuid(&init_user_ns, context->ipc.uid), |
| 1468 | from_kgid(&init_user_ns, context->ipc.gid), | ||
| 1469 | context->ipc.mode); | ||
| 1451 | if (osid) { | 1470 | if (osid) { |
| 1452 | char *ctx = NULL; | 1471 | char *ctx = NULL; |
| 1453 | u32 len; | 1472 | u32 len; |
| @@ -1560,8 +1579,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, | |||
| 1560 | MAJOR(n->dev), | 1579 | MAJOR(n->dev), |
| 1561 | MINOR(n->dev), | 1580 | MINOR(n->dev), |
| 1562 | n->mode, | 1581 | n->mode, |
| 1563 | n->uid, | 1582 | from_kuid(&init_user_ns, n->uid), |
| 1564 | n->gid, | 1583 | from_kgid(&init_user_ns, n->gid), |
| 1565 | MAJOR(n->rdev), | 1584 | MAJOR(n->rdev), |
| 1566 | MINOR(n->rdev)); | 1585 | MINOR(n->rdev)); |
| 1567 | } | 1586 | } |
| @@ -1585,26 +1604,12 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, | |||
| 1585 | 1604 | ||
| 1586 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) | 1605 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) |
| 1587 | { | 1606 | { |
| 1588 | const struct cred *cred; | ||
| 1589 | int i, call_panic = 0; | 1607 | int i, call_panic = 0; |
| 1590 | struct audit_buffer *ab; | 1608 | struct audit_buffer *ab; |
| 1591 | struct audit_aux_data *aux; | 1609 | struct audit_aux_data *aux; |
| 1592 | const char *tty; | ||
| 1593 | struct audit_names *n; | 1610 | struct audit_names *n; |
| 1594 | 1611 | ||
| 1595 | /* tsk == current */ | 1612 | /* tsk == current */ |
| 1596 | context->pid = tsk->pid; | ||
| 1597 | if (!context->ppid) | ||
| 1598 | context->ppid = sys_getppid(); | ||
| 1599 | cred = current_cred(); | ||
| 1600 | context->uid = cred->uid; | ||
| 1601 | context->gid = cred->gid; | ||
| 1602 | context->euid = cred->euid; | ||
| 1603 | context->suid = cred->suid; | ||
| 1604 | context->fsuid = cred->fsuid; | ||
| 1605 | context->egid = cred->egid; | ||
| 1606 | context->sgid = cred->sgid; | ||
| 1607 | context->fsgid = cred->fsgid; | ||
| 1608 | context->personality = tsk->personality; | 1613 | context->personality = tsk->personality; |
| 1609 | 1614 | ||
| 1610 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); | 1615 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); |
| @@ -1619,32 +1624,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
| 1619 | (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", | 1624 | (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", |
| 1620 | context->return_code); | 1625 | context->return_code); |
| 1621 | 1626 | ||
| 1622 | spin_lock_irq(&tsk->sighand->siglock); | ||
| 1623 | if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) | ||
| 1624 | tty = tsk->signal->tty->name; | ||
| 1625 | else | ||
| 1626 | tty = "(none)"; | ||
| 1627 | spin_unlock_irq(&tsk->sighand->siglock); | ||
| 1628 | |||
| 1629 | audit_log_format(ab, | 1627 | audit_log_format(ab, |
| 1630 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" | 1628 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d", |
| 1631 | " ppid=%d pid=%d auid=%u uid=%u gid=%u" | 1629 | context->argv[0], |
| 1632 | " euid=%u suid=%u fsuid=%u" | 1630 | context->argv[1], |
| 1633 | " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", | 1631 | context->argv[2], |
| 1634 | context->argv[0], | 1632 | context->argv[3], |
| 1635 | context->argv[1], | 1633 | context->name_count); |
| 1636 | context->argv[2], | ||
| 1637 | context->argv[3], | ||
| 1638 | context->name_count, | ||
| 1639 | context->ppid, | ||
| 1640 | context->pid, | ||
| 1641 | tsk->loginuid, | ||
| 1642 | context->uid, | ||
| 1643 | context->gid, | ||
| 1644 | context->euid, context->suid, context->fsuid, | ||
| 1645 | context->egid, context->sgid, context->fsgid, tty, | ||
| 1646 | tsk->sessionid); | ||
| 1647 | |||
| 1648 | 1634 | ||
| 1649 | audit_log_task_info(ab, tsk); | 1635 | audit_log_task_info(ab, tsk); |
| 1650 | audit_log_key(ab, context->filterkey); | 1636 | audit_log_key(ab, context->filterkey); |
| @@ -2299,14 +2285,14 @@ static atomic_t session_id = ATOMIC_INIT(0); | |||
| 2299 | * | 2285 | * |
| 2300 | * Called (set) from fs/proc/base.c::proc_loginuid_write(). | 2286 | * Called (set) from fs/proc/base.c::proc_loginuid_write(). |
| 2301 | */ | 2287 | */ |
| 2302 | int audit_set_loginuid(uid_t loginuid) | 2288 | int audit_set_loginuid(kuid_t loginuid) |
| 2303 | { | 2289 | { |
| 2304 | struct task_struct *task = current; | 2290 | struct task_struct *task = current; |
| 2305 | struct audit_context *context = task->audit_context; | 2291 | struct audit_context *context = task->audit_context; |
| 2306 | unsigned int sessionid; | 2292 | unsigned int sessionid; |
| 2307 | 2293 | ||
| 2308 | #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE | 2294 | #ifdef CONFIG_AUDIT_LOGINUID_IMMUTABLE |
| 2309 | if (task->loginuid != -1) | 2295 | if (uid_valid(task->loginuid)) |
| 2310 | return -EPERM; | 2296 | return -EPERM; |
| 2311 | #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ | 2297 | #else /* CONFIG_AUDIT_LOGINUID_IMMUTABLE */ |
| 2312 | if (!capable(CAP_AUDIT_CONTROL)) | 2298 | if (!capable(CAP_AUDIT_CONTROL)) |
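
The loginuid check in this hunk shows the general shape of the kuid_t conversion: the type is opaque, so the old "!= -1" comparison has to become a helper call against the INVALID_UID sentinel. A minimal sketch of that idiom, assuming only the standard uidgid helpers (not part of the patch; the default value 1000 is an arbitrary example):

    #include <linux/uidgid.h>

    /* Sketch only: kuid_t cannot be compared against -1 directly; the
     * sentinel is INVALID_UID and the validity test is uid_valid(). */
    static kuid_t example_default_loginuid(kuid_t auid)
    {
    	if (!uid_valid(auid))
    		/* map an example uid through the initial namespace */
    		auid = make_kuid(&init_user_ns, 1000);
    	return auid;
    }
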
| @@ -2322,8 +2308,10 @@ int audit_set_loginuid(uid_t loginuid) | |||
| 2322 | audit_log_format(ab, "login pid=%d uid=%u " | 2308 | audit_log_format(ab, "login pid=%d uid=%u " |
| 2323 | "old auid=%u new auid=%u" | 2309 | "old auid=%u new auid=%u" |
| 2324 | " old ses=%u new ses=%u", | 2310 | " old ses=%u new ses=%u", |
| 2325 | task->pid, task_uid(task), | 2311 | task->pid, |
| 2326 | task->loginuid, loginuid, | 2312 | from_kuid(&init_user_ns, task_uid(task)), |
| 2313 | from_kuid(&init_user_ns, task->loginuid), | ||
| 2314 | from_kuid(&init_user_ns, loginuid), | ||
| 2327 | task->sessionid, sessionid); | 2315 | task->sessionid, sessionid); |
| 2328 | audit_log_end(ab); | 2316 | audit_log_end(ab); |
| 2329 | } | 2317 | } |
| @@ -2546,12 +2534,12 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
| 2546 | struct audit_aux_data_pids *axp; | 2534 | struct audit_aux_data_pids *axp; |
| 2547 | struct task_struct *tsk = current; | 2535 | struct task_struct *tsk = current; |
| 2548 | struct audit_context *ctx = tsk->audit_context; | 2536 | struct audit_context *ctx = tsk->audit_context; |
| 2549 | uid_t uid = current_uid(), t_uid = task_uid(t); | 2537 | kuid_t uid = current_uid(), t_uid = task_uid(t); |
| 2550 | 2538 | ||
| 2551 | if (audit_pid && t->tgid == audit_pid) { | 2539 | if (audit_pid && t->tgid == audit_pid) { |
| 2552 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { | 2540 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) { |
| 2553 | audit_sig_pid = tsk->pid; | 2541 | audit_sig_pid = tsk->pid; |
| 2554 | if (tsk->loginuid != -1) | 2542 | if (uid_valid(tsk->loginuid)) |
| 2555 | audit_sig_uid = tsk->loginuid; | 2543 | audit_sig_uid = tsk->loginuid; |
| 2556 | else | 2544 | else |
| 2557 | audit_sig_uid = uid; | 2545 | audit_sig_uid = uid; |
| @@ -2672,8 +2660,8 @@ void __audit_mmap_fd(int fd, int flags) | |||
| 2672 | 2660 | ||
| 2673 | static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) | 2661 | static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) |
| 2674 | { | 2662 | { |
| 2675 | uid_t auid, uid; | 2663 | kuid_t auid, uid; |
| 2676 | gid_t gid; | 2664 | kgid_t gid; |
| 2677 | unsigned int sessionid; | 2665 | unsigned int sessionid; |
| 2678 | 2666 | ||
| 2679 | auid = audit_get_loginuid(current); | 2667 | auid = audit_get_loginuid(current); |
| @@ -2681,7 +2669,10 @@ static void audit_log_abend(struct audit_buffer *ab, char *reason, long signr) | |||
| 2681 | current_uid_gid(&uid, &gid); | 2669 | current_uid_gid(&uid, &gid); |
| 2682 | 2670 | ||
| 2683 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", | 2671 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", |
| 2684 | auid, uid, gid, sessionid); | 2672 | from_kuid(&init_user_ns, auid), |
| 2673 | from_kuid(&init_user_ns, uid), | ||
| 2674 | from_kgid(&init_user_ns, gid), | ||
| 2675 | sessionid); | ||
| 2685 | audit_log_task_context(ab); | 2676 | audit_log_task_context(ab); |
| 2686 | audit_log_format(ab, " pid=%d comm=", current->pid); | 2677 | audit_log_format(ab, " pid=%d comm=", current->pid); |
| 2687 | audit_log_untrustedstring(ab, current->comm); | 2678 | audit_log_untrustedstring(ab, current->comm); |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 79818507e444..13774b3b39aa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
| @@ -88,11 +88,12 @@ static DEFINE_MUTEX(cgroup_root_mutex); | |||
| 88 | 88 | ||
| 89 | /* | 89 | /* |
| 90 | * Generate an array of cgroup subsystem pointers. At boot time, this is | 90 | * Generate an array of cgroup subsystem pointers. At boot time, this is |
| 91 | * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are | 91 | * populated with the built-in subsystems, and modular subsystems are |
| 92 | * registered after that. The mutable section of this array is protected by | 92 | * registered after that. The mutable section of this array is protected by |
| 93 | * cgroup_mutex. | 93 | * cgroup_mutex. |
| 94 | */ | 94 | */ |
| 95 | #define SUBSYS(_x) &_x ## _subsys, | 95 | #define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, |
| 96 | #define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) | ||
| 96 | static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { | 97 | static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { |
| 97 | #include <linux/cgroup_subsys.h> | 98 | #include <linux/cgroup_subsys.h> |
| 98 | }; | 99 | }; |
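
The new SUBSYS() definition indexes the array by subsystem id rather than relying on the include order of <linux/cgroup_subsys.h>, and IS_SUBSYS_ENABLED() limits this stage to built-in controllers. A sketch of roughly what the expansion looks like for two controllers (a rough illustration, not the generated code):

    /* With SUBSYS(_x) defined as [_x ## _subsys_id] = &_x ## _subsys, the
     * include expands approximately to: */
    static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
    	[cpuset_subsys_id]     = &cpuset_subsys,
    	[cpu_cgroup_subsys_id] = &cpu_cgroup_subsys,
    	/* ... modular subsystems keep NULL slots until they register */
    };
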
| @@ -111,13 +112,13 @@ struct cgroupfs_root { | |||
| 111 | * The bitmask of subsystems intended to be attached to this | 112 | * The bitmask of subsystems intended to be attached to this |
| 112 | * hierarchy | 113 | * hierarchy |
| 113 | */ | 114 | */ |
| 114 | unsigned long subsys_bits; | 115 | unsigned long subsys_mask; |
| 115 | 116 | ||
| 116 | /* Unique id for this hierarchy. */ | 117 | /* Unique id for this hierarchy. */ |
| 117 | int hierarchy_id; | 118 | int hierarchy_id; |
| 118 | 119 | ||
| 119 | /* The bitmask of subsystems currently attached to this hierarchy */ | 120 | /* The bitmask of subsystems currently attached to this hierarchy */ |
| 120 | unsigned long actual_subsys_bits; | 121 | unsigned long actual_subsys_mask; |
| 121 | 122 | ||
| 122 | /* A list running through the attached subsystems */ | 123 | /* A list running through the attached subsystems */ |
| 123 | struct list_head subsys_list; | 124 | struct list_head subsys_list; |
| @@ -276,7 +277,8 @@ inline int cgroup_is_removed(const struct cgroup *cgrp) | |||
| 276 | 277 | ||
| 277 | /* bits in struct cgroupfs_root flags field */ | 278 | /* bits in struct cgroupfs_root flags field */ |
| 278 | enum { | 279 | enum { |
| 279 | ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ | 280 | ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ |
| 281 | ROOT_XATTR, /* supports extended attributes */ | ||
| 280 | }; | 282 | }; |
| 281 | 283 | ||
| 282 | static int cgroup_is_releasable(const struct cgroup *cgrp) | 284 | static int cgroup_is_releasable(const struct cgroup *cgrp) |
| @@ -556,7 +558,7 @@ static struct css_set *find_existing_css_set( | |||
| 556 | * won't change, so no need for locking. | 558 | * won't change, so no need for locking. |
| 557 | */ | 559 | */ |
| 558 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 560 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 559 | if (root->subsys_bits & (1UL << i)) { | 561 | if (root->subsys_mask & (1UL << i)) { |
| 560 | /* Subsystem is in this hierarchy. So we want | 562 | /* Subsystem is in this hierarchy. So we want |
| 561 | * the subsystem state from the new | 563 | * the subsystem state from the new |
| 562 | * cgroup */ | 564 | * cgroup */ |
| @@ -824,7 +826,8 @@ EXPORT_SYMBOL_GPL(cgroup_unlock); | |||
| 824 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); | 826 | static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); |
| 825 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); | 827 | static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); |
| 826 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); | 828 | static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); |
| 827 | static int cgroup_populate_dir(struct cgroup *cgrp); | 829 | static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, |
| 830 | unsigned long subsys_mask); | ||
| 828 | static const struct inode_operations cgroup_dir_inode_operations; | 831 | static const struct inode_operations cgroup_dir_inode_operations; |
| 829 | static const struct file_operations proc_cgroupstats_operations; | 832 | static const struct file_operations proc_cgroupstats_operations; |
| 830 | 833 | ||
| @@ -912,15 +915,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) | |||
| 912 | */ | 915 | */ |
| 913 | BUG_ON(!list_empty(&cgrp->pidlists)); | 916 | BUG_ON(!list_empty(&cgrp->pidlists)); |
| 914 | 917 | ||
| 918 | simple_xattrs_free(&cgrp->xattrs); | ||
| 919 | |||
| 915 | kfree_rcu(cgrp, rcu_head); | 920 | kfree_rcu(cgrp, rcu_head); |
| 916 | } else { | 921 | } else { |
| 917 | struct cfent *cfe = __d_cfe(dentry); | 922 | struct cfent *cfe = __d_cfe(dentry); |
| 918 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; | 923 | struct cgroup *cgrp = dentry->d_parent->d_fsdata; |
| 924 | struct cftype *cft = cfe->type; | ||
| 919 | 925 | ||
| 920 | WARN_ONCE(!list_empty(&cfe->node) && | 926 | WARN_ONCE(!list_empty(&cfe->node) && |
| 921 | cgrp != &cgrp->root->top_cgroup, | 927 | cgrp != &cgrp->root->top_cgroup, |
| 922 | "cfe still linked for %s\n", cfe->type->name); | 928 | "cfe still linked for %s\n", cfe->type->name); |
| 923 | kfree(cfe); | 929 | kfree(cfe); |
| 930 | simple_xattrs_free(&cft->xattrs); | ||
| 924 | } | 931 | } |
| 925 | iput(inode); | 932 | iput(inode); |
| 926 | } | 933 | } |
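
The simple_xattrs_free() calls added in this hunk pair with simple_xattrs_init() calls introduced later in the patch (init_cgroup_housekeeping() and cgroup_add_file()). A minimal sketch of that init/free pairing, using an illustrative structure name rather than the cgroup types:

    #include <linux/xattr.h>

    struct example_obj {
    	struct simple_xattrs xattrs;	/* embedded xattr store */
    };

    static void example_obj_init(struct example_obj *obj)
    {
    	simple_xattrs_init(&obj->xattrs);	/* at creation time */
    }

    static void example_obj_destroy(struct example_obj *obj)
    {
    	simple_xattrs_free(&obj->xattrs);	/* on the teardown path */
    }
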
| @@ -963,12 +970,29 @@ static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) | |||
| 963 | return -ENOENT; | 970 | return -ENOENT; |
| 964 | } | 971 | } |
| 965 | 972 | ||
| 966 | static void cgroup_clear_directory(struct dentry *dir) | 973 | /** |
| 974 | * cgroup_clear_directory - selective removal of base and subsystem files | ||
| 975 | * @dir: directory containing the files | ||
| 976 | * @base_files: true if the base files should be removed | ||
| 977 | * @subsys_mask: mask of the subsystem ids whose files should be removed | ||
| 978 | */ | ||
| 979 | static void cgroup_clear_directory(struct dentry *dir, bool base_files, | ||
| 980 | unsigned long subsys_mask) | ||
| 967 | { | 981 | { |
| 968 | struct cgroup *cgrp = __d_cgrp(dir); | 982 | struct cgroup *cgrp = __d_cgrp(dir); |
| 983 | struct cgroup_subsys *ss; | ||
| 969 | 984 | ||
| 970 | while (!list_empty(&cgrp->files)) | 985 | for_each_subsys(cgrp->root, ss) { |
| 971 | cgroup_rm_file(cgrp, NULL); | 986 | struct cftype_set *set; |
| 987 | if (!test_bit(ss->subsys_id, &subsys_mask)) | ||
| 988 | continue; | ||
| 989 | list_for_each_entry(set, &ss->cftsets, node) | ||
| 990 | cgroup_rm_file(cgrp, set->cfts); | ||
| 991 | } | ||
| 992 | if (base_files) { | ||
| 993 | while (!list_empty(&cgrp->files)) | ||
| 994 | cgroup_rm_file(cgrp, NULL); | ||
| 995 | } | ||
| 972 | } | 996 | } |
| 973 | 997 | ||
| 974 | /* | 998 | /* |
| @@ -977,8 +1001,9 @@ static void cgroup_clear_directory(struct dentry *dir) | |||
| 977 | static void cgroup_d_remove_dir(struct dentry *dentry) | 1001 | static void cgroup_d_remove_dir(struct dentry *dentry) |
| 978 | { | 1002 | { |
| 979 | struct dentry *parent; | 1003 | struct dentry *parent; |
| 1004 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; | ||
| 980 | 1005 | ||
| 981 | cgroup_clear_directory(dentry); | 1006 | cgroup_clear_directory(dentry, true, root->subsys_mask); |
| 982 | 1007 | ||
| 983 | parent = dentry->d_parent; | 1008 | parent = dentry->d_parent; |
| 984 | spin_lock(&parent->d_lock); | 1009 | spin_lock(&parent->d_lock); |
| @@ -1022,22 +1047,22 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) | |||
| 1022 | * returns an error, no reference counts are touched. | 1047 | * returns an error, no reference counts are touched. |
| 1023 | */ | 1048 | */ |
| 1024 | static int rebind_subsystems(struct cgroupfs_root *root, | 1049 | static int rebind_subsystems(struct cgroupfs_root *root, |
| 1025 | unsigned long final_bits) | 1050 | unsigned long final_subsys_mask) |
| 1026 | { | 1051 | { |
| 1027 | unsigned long added_bits, removed_bits; | 1052 | unsigned long added_mask, removed_mask; |
| 1028 | struct cgroup *cgrp = &root->top_cgroup; | 1053 | struct cgroup *cgrp = &root->top_cgroup; |
| 1029 | int i; | 1054 | int i; |
| 1030 | 1055 | ||
| 1031 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); | 1056 | BUG_ON(!mutex_is_locked(&cgroup_mutex)); |
| 1032 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); | 1057 | BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); |
| 1033 | 1058 | ||
| 1034 | removed_bits = root->actual_subsys_bits & ~final_bits; | 1059 | removed_mask = root->actual_subsys_mask & ~final_subsys_mask; |
| 1035 | added_bits = final_bits & ~root->actual_subsys_bits; | 1060 | added_mask = final_subsys_mask & ~root->actual_subsys_mask; |
| 1036 | /* Check that any added subsystems are currently free */ | 1061 | /* Check that any added subsystems are currently free */ |
| 1037 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1062 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 1038 | unsigned long bit = 1UL << i; | 1063 | unsigned long bit = 1UL << i; |
| 1039 | struct cgroup_subsys *ss = subsys[i]; | 1064 | struct cgroup_subsys *ss = subsys[i]; |
| 1040 | if (!(bit & added_bits)) | 1065 | if (!(bit & added_mask)) |
| 1041 | continue; | 1066 | continue; |
| 1042 | /* | 1067 | /* |
| 1043 | * Nobody should tell us to do a subsys that doesn't exist: | 1068 | * Nobody should tell us to do a subsys that doesn't exist: |
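
The added_mask/removed_mask computation above is plain bit arithmetic over per-subsystem bits. A small userspace-style worked example with illustrative values (not kernel code):

    #include <assert.h>

    int main(void)
    {
    	unsigned long actual = 0x6; /* subsystems 1 and 2 currently bound */
    	unsigned long final  = 0x3; /* caller now wants subsystems 0 and 1 */

    	unsigned long removed = actual & ~final; /* 0x4 -> unbind subsystem 2 */
    	unsigned long added   = final & ~actual; /* 0x1 -> bind subsystem 0  */

    	assert(removed == 0x4);
    	assert(added == 0x1);
    	return 0;
    }
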
| @@ -1062,7 +1087,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
| 1062 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { | 1087 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 1063 | struct cgroup_subsys *ss = subsys[i]; | 1088 | struct cgroup_subsys *ss = subsys[i]; |
| 1064 | unsigned long bit = 1UL << i; | 1089 | unsigned long bit = 1UL << i; |
| 1065 | if (bit & added_bits) { | 1090 | if (bit & added_mask) { |
| 1066 | /* We're binding this subsystem to this hierarchy */ | 1091 | /* We're binding this subsystem to this hierarchy */ |
| 1067 | BUG_ON(ss == NULL); | 1092 | BUG_ON(ss == NULL); |
| 1068 | BUG_ON(cgrp->subsys[i]); | 1093 | BUG_ON(cgrp->subsys[i]); |
| @@ -1075,7 +1100,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
| 1075 | if (ss->bind) | 1100 | if (ss->bind) |
| 1076 | ss->bind(cgrp); | 1101 | ss->bind(cgrp); |
| 1077 | /* refcount was already taken, and we're keeping it */ | 1102 | /* refcount was already taken, and we're keeping it */ |
| 1078 | } else if (bit & removed_bits) { | 1103 | } else if (bit & removed_mask) { |
| 1079 | /* We're removing this subsystem */ | 1104 | /* We're removing this subsystem */ |
| 1080 | BUG_ON(ss == NULL); | 1105 | BUG_ON(ss == NULL); |
| 1081 | BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); | 1106 | BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); |
| @@ -1088,7 +1113,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
| 1088 | list_move(&ss->sibling, &rootnode.subsys_list); | 1113 | list_move(&ss->sibling, &rootnode.subsys_list); |
| 1089 | /* subsystem is now free - drop reference on module */ | 1114 | /* subsystem is now free - drop reference on module */ |
| 1090 | module_put(ss->module); | 1115 | module_put(ss->module); |
| 1091 | } else if (bit & final_bits) { | 1116 | } else if (bit & final_subsys_mask) { |
| 1092 | /* Subsystem state should already exist */ | 1117 | /* Subsystem state should already exist */ |
| 1093 | BUG_ON(ss == NULL); | 1118 | BUG_ON(ss == NULL); |
| 1094 | BUG_ON(!cgrp->subsys[i]); | 1119 | BUG_ON(!cgrp->subsys[i]); |
| @@ -1105,7 +1130,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, | |||
| 1105 | BUG_ON(cgrp->subsys[i]); | 1130 | BUG_ON(cgrp->subsys[i]); |
| 1106 | } | 1131 | } |
| 1107 | } | 1132 | } |
| 1108 | root->subsys_bits = root->actual_subsys_bits = final_bits; | 1133 | root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; |
| 1109 | synchronize_rcu(); | 1134 | synchronize_rcu(); |
| 1110 | 1135 | ||
| 1111 | return 0; | 1136 | return 0; |
| @@ -1121,6 +1146,8 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | |||
| 1121 | seq_printf(seq, ",%s", ss->name); | 1146 | seq_printf(seq, ",%s", ss->name); |
| 1122 | if (test_bit(ROOT_NOPREFIX, &root->flags)) | 1147 | if (test_bit(ROOT_NOPREFIX, &root->flags)) |
| 1123 | seq_puts(seq, ",noprefix"); | 1148 | seq_puts(seq, ",noprefix"); |
| 1149 | if (test_bit(ROOT_XATTR, &root->flags)) | ||
| 1150 | seq_puts(seq, ",xattr"); | ||
| 1124 | if (strlen(root->release_agent_path)) | 1151 | if (strlen(root->release_agent_path)) |
| 1125 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); | 1152 | seq_printf(seq, ",release_agent=%s", root->release_agent_path); |
| 1126 | if (clone_children(&root->top_cgroup)) | 1153 | if (clone_children(&root->top_cgroup)) |
| @@ -1132,7 +1159,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) | |||
| 1132 | } | 1159 | } |
| 1133 | 1160 | ||
| 1134 | struct cgroup_sb_opts { | 1161 | struct cgroup_sb_opts { |
| 1135 | unsigned long subsys_bits; | 1162 | unsigned long subsys_mask; |
| 1136 | unsigned long flags; | 1163 | unsigned long flags; |
| 1137 | char *release_agent; | 1164 | char *release_agent; |
| 1138 | bool clone_children; | 1165 | bool clone_children; |
| @@ -1189,6 +1216,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
| 1189 | opts->clone_children = true; | 1216 | opts->clone_children = true; |
| 1190 | continue; | 1217 | continue; |
| 1191 | } | 1218 | } |
| 1219 | if (!strcmp(token, "xattr")) { | ||
| 1220 | set_bit(ROOT_XATTR, &opts->flags); | ||
| 1221 | continue; | ||
| 1222 | } | ||
| 1192 | if (!strncmp(token, "release_agent=", 14)) { | 1223 | if (!strncmp(token, "release_agent=", 14)) { |
| 1193 | /* Specifying two release agents is forbidden */ | 1224 | /* Specifying two release agents is forbidden */ |
| 1194 | if (opts->release_agent) | 1225 | if (opts->release_agent) |
| @@ -1237,7 +1268,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
| 1237 | /* Mutually exclusive option 'all' + subsystem name */ | 1268 | /* Mutually exclusive option 'all' + subsystem name */ |
| 1238 | if (all_ss) | 1269 | if (all_ss) |
| 1239 | return -EINVAL; | 1270 | return -EINVAL; |
| 1240 | set_bit(i, &opts->subsys_bits); | 1271 | set_bit(i, &opts->subsys_mask); |
| 1241 | one_ss = true; | 1272 | one_ss = true; |
| 1242 | 1273 | ||
| 1243 | break; | 1274 | break; |
| @@ -1258,7 +1289,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
| 1258 | continue; | 1289 | continue; |
| 1259 | if (ss->disabled) | 1290 | if (ss->disabled) |
| 1260 | continue; | 1291 | continue; |
| 1261 | set_bit(i, &opts->subsys_bits); | 1292 | set_bit(i, &opts->subsys_mask); |
| 1262 | } | 1293 | } |
| 1263 | } | 1294 | } |
| 1264 | 1295 | ||
| @@ -1270,19 +1301,19 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
| 1270 | * the cpuset subsystem. | 1301 | * the cpuset subsystem. |
| 1271 | */ | 1302 | */ |
| 1272 | if (test_bit(ROOT_NOPREFIX, &opts->flags) && | 1303 | if (test_bit(ROOT_NOPREFIX, &opts->flags) && |
| 1273 | (opts->subsys_bits & mask)) | 1304 | (opts->subsys_mask & mask)) |
| 1274 | return -EINVAL; | 1305 | return -EINVAL; |
| 1275 | 1306 | ||
| 1276 | 1307 | ||
| 1277 | /* Can't specify "none" and some subsystems */ | 1308 | /* Can't specify "none" and some subsystems */ |
| 1278 | if (opts->subsys_bits && opts->none) | 1309 | if (opts->subsys_mask && opts->none) |
| 1279 | return -EINVAL; | 1310 | return -EINVAL; |
| 1280 | 1311 | ||
| 1281 | /* | 1312 | /* |
| 1282 | * We either have to specify by name or by subsystems. (So all | 1313 | * We either have to specify by name or by subsystems. (So all |
| 1283 | * empty hierarchies must have a name). | 1314 | * empty hierarchies must have a name). |
| 1284 | */ | 1315 | */ |
| 1285 | if (!opts->subsys_bits && !opts->name) | 1316 | if (!opts->subsys_mask && !opts->name) |
| 1286 | return -EINVAL; | 1317 | return -EINVAL; |
| 1287 | 1318 | ||
| 1288 | /* | 1319 | /* |
| @@ -1291,10 +1322,10 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
| 1291 | * take duplicate reference counts on a subsystem that's already used, | 1322 | * take duplicate reference counts on a subsystem that's already used, |
| 1292 | * but rebind_subsystems handles this case. | 1323 | * but rebind_subsystems handles this case. |
| 1293 | */ | 1324 | */ |
| 1294 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | 1325 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 1295 | unsigned long bit = 1UL << i; | 1326 | unsigned long bit = 1UL << i; |
| 1296 | 1327 | ||
| 1297 | if (!(bit & opts->subsys_bits)) | 1328 | if (!(bit & opts->subsys_mask)) |
| 1298 | continue; | 1329 | continue; |
| 1299 | if (!try_module_get(subsys[i]->module)) { | 1330 | if (!try_module_get(subsys[i]->module)) { |
| 1300 | module_pin_failed = true; | 1331 | module_pin_failed = true; |
| @@ -1307,11 +1338,11 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
| 1307 | * raced with a module_delete call, and to the user this is | 1338 | * raced with a module_delete call, and to the user this is |
| 1308 | * essentially a "subsystem doesn't exist" case. | 1339 | * essentially a "subsystem doesn't exist" case. |
| 1309 | */ | 1340 | */ |
| 1310 | for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { | 1341 | for (i--; i >= 0; i--) { |
| 1311 | /* drop refcounts only on the ones we took */ | 1342 | /* drop refcounts only on the ones we took */ |
| 1312 | unsigned long bit = 1UL << i; | 1343 | unsigned long bit = 1UL << i; |
| 1313 | 1344 | ||
| 1314 | if (!(bit & opts->subsys_bits)) | 1345 | if (!(bit & opts->subsys_mask)) |
| 1315 | continue; | 1346 | continue; |
| 1316 | module_put(subsys[i]->module); | 1347 | module_put(subsys[i]->module); |
| 1317 | } | 1348 | } |
| @@ -1321,13 +1352,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) | |||
| 1321 | return 0; | 1352 | return 0; |
| 1322 | } | 1353 | } |
| 1323 | 1354 | ||
| 1324 | static void drop_parsed_module_refcounts(unsigned long subsys_bits) | 1355 | static void drop_parsed_module_refcounts(unsigned long subsys_mask) |
| 1325 | { | 1356 | { |
| 1326 | int i; | 1357 | int i; |
| 1327 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | 1358 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 1328 | unsigned long bit = 1UL << i; | 1359 | unsigned long bit = 1UL << i; |
| 1329 | 1360 | ||
| 1330 | if (!(bit & subsys_bits)) | 1361 | if (!(bit & subsys_mask)) |
| 1331 | continue; | 1362 | continue; |
| 1332 | module_put(subsys[i]->module); | 1363 | module_put(subsys[i]->module); |
| 1333 | } | 1364 | } |
| @@ -1339,6 +1370,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
| 1339 | struct cgroupfs_root *root = sb->s_fs_info; | 1370 | struct cgroupfs_root *root = sb->s_fs_info; |
| 1340 | struct cgroup *cgrp = &root->top_cgroup; | 1371 | struct cgroup *cgrp = &root->top_cgroup; |
| 1341 | struct cgroup_sb_opts opts; | 1372 | struct cgroup_sb_opts opts; |
| 1373 | unsigned long added_mask, removed_mask; | ||
| 1342 | 1374 | ||
| 1343 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); | 1375 | mutex_lock(&cgrp->dentry->d_inode->i_mutex); |
| 1344 | mutex_lock(&cgroup_mutex); | 1376 | mutex_lock(&cgroup_mutex); |
| @@ -1350,27 +1382,31 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) | |||
| 1350 | goto out_unlock; | 1382 | goto out_unlock; |
| 1351 | 1383 | ||
| 1352 | /* See feature-removal-schedule.txt */ | 1384 | /* See feature-removal-schedule.txt */ |
| 1353 | if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent) | 1385 | if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) |
| 1354 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", | 1386 | pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", |
| 1355 | task_tgid_nr(current), current->comm); | 1387 | task_tgid_nr(current), current->comm); |
| 1356 | 1388 | ||
| 1389 | added_mask = opts.subsys_mask & ~root->subsys_mask; | ||
| 1390 | removed_mask = root->subsys_mask & ~opts.subsys_mask; | ||
| 1391 | |||
| 1357 | /* Don't allow flags or name to change at remount */ | 1392 | /* Don't allow flags or name to change at remount */ |
| 1358 | if (opts.flags != root->flags || | 1393 | if (opts.flags != root->flags || |
| 1359 | (opts.name && strcmp(opts.name, root->name))) { | 1394 | (opts.name && strcmp(opts.name, root->name))) { |
| 1360 | ret = -EINVAL; | 1395 | ret = -EINVAL; |
| 1361 | drop_parsed_module_refcounts(opts.subsys_bits); | 1396 | drop_parsed_module_refcounts(opts.subsys_mask); |
| 1362 | goto out_unlock; | 1397 | goto out_unlock; |
| 1363 | } | 1398 | } |
| 1364 | 1399 | ||
| 1365 | ret = rebind_subsystems(root, opts.subsys_bits); | 1400 | ret = rebind_subsystems(root, opts.subsys_mask); |
| 1366 | if (ret) { | 1401 | if (ret) { |
| 1367 | drop_parsed_module_refcounts(opts.subsys_bits); | 1402 | drop_parsed_module_refcounts(opts.subsys_mask); |
| 1368 | goto out_unlock; | 1403 | goto out_unlock; |
| 1369 | } | 1404 | } |
| 1370 | 1405 | ||
| 1371 | /* clear out any existing files and repopulate subsystem files */ | 1406 | /* clear out any existing files and repopulate subsystem files */ |
| 1372 | cgroup_clear_directory(cgrp->dentry); | 1407 | cgroup_clear_directory(cgrp->dentry, false, removed_mask); |
| 1373 | cgroup_populate_dir(cgrp); | 1408 | /* re-populate subsystem files */ |
| 1409 | cgroup_populate_dir(cgrp, false, added_mask); | ||
| 1374 | 1410 | ||
| 1375 | if (opts.release_agent) | 1411 | if (opts.release_agent) |
| 1376 | strcpy(root->release_agent_path, opts.release_agent); | 1412 | strcpy(root->release_agent_path, opts.release_agent); |
| @@ -1401,6 +1437,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) | |||
| 1401 | mutex_init(&cgrp->pidlist_mutex); | 1437 | mutex_init(&cgrp->pidlist_mutex); |
| 1402 | INIT_LIST_HEAD(&cgrp->event_list); | 1438 | INIT_LIST_HEAD(&cgrp->event_list); |
| 1403 | spin_lock_init(&cgrp->event_list_lock); | 1439 | spin_lock_init(&cgrp->event_list_lock); |
| 1440 | simple_xattrs_init(&cgrp->xattrs); | ||
| 1404 | } | 1441 | } |
| 1405 | 1442 | ||
| 1406 | static void init_cgroup_root(struct cgroupfs_root *root) | 1443 | static void init_cgroup_root(struct cgroupfs_root *root) |
| @@ -1455,8 +1492,8 @@ static int cgroup_test_super(struct super_block *sb, void *data) | |||
| 1455 | * If we asked for subsystems (or explicitly for no | 1492 | * If we asked for subsystems (or explicitly for no |
| 1456 | * subsystems) then they must match | 1493 | * subsystems) then they must match |
| 1457 | */ | 1494 | */ |
| 1458 | if ((opts->subsys_bits || opts->none) | 1495 | if ((opts->subsys_mask || opts->none) |
| 1459 | && (opts->subsys_bits != root->subsys_bits)) | 1496 | && (opts->subsys_mask != root->subsys_mask)) |
| 1460 | return 0; | 1497 | return 0; |
| 1461 | 1498 | ||
| 1462 | return 1; | 1499 | return 1; |
| @@ -1466,7 +1503,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
| 1466 | { | 1503 | { |
| 1467 | struct cgroupfs_root *root; | 1504 | struct cgroupfs_root *root; |
| 1468 | 1505 | ||
| 1469 | if (!opts->subsys_bits && !opts->none) | 1506 | if (!opts->subsys_mask && !opts->none) |
| 1470 | return NULL; | 1507 | return NULL; |
| 1471 | 1508 | ||
| 1472 | root = kzalloc(sizeof(*root), GFP_KERNEL); | 1509 | root = kzalloc(sizeof(*root), GFP_KERNEL); |
| @@ -1479,7 +1516,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) | |||
| 1479 | } | 1516 | } |
| 1480 | init_cgroup_root(root); | 1517 | init_cgroup_root(root); |
| 1481 | 1518 | ||
| 1482 | root->subsys_bits = opts->subsys_bits; | 1519 | root->subsys_mask = opts->subsys_mask; |
| 1483 | root->flags = opts->flags; | 1520 | root->flags = opts->flags; |
| 1484 | if (opts->release_agent) | 1521 | if (opts->release_agent) |
| 1485 | strcpy(root->release_agent_path, opts->release_agent); | 1522 | strcpy(root->release_agent_path, opts->release_agent); |
| @@ -1511,7 +1548,7 @@ static int cgroup_set_super(struct super_block *sb, void *data) | |||
| 1511 | if (!opts->new_root) | 1548 | if (!opts->new_root) |
| 1512 | return -EINVAL; | 1549 | return -EINVAL; |
| 1513 | 1550 | ||
| 1514 | BUG_ON(!opts->subsys_bits && !opts->none); | 1551 | BUG_ON(!opts->subsys_mask && !opts->none); |
| 1515 | 1552 | ||
| 1516 | ret = set_anon_super(sb, NULL); | 1553 | ret = set_anon_super(sb, NULL); |
| 1517 | if (ret) | 1554 | if (ret) |
| @@ -1629,7 +1666,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1629 | if (ret) | 1666 | if (ret) |
| 1630 | goto unlock_drop; | 1667 | goto unlock_drop; |
| 1631 | 1668 | ||
| 1632 | ret = rebind_subsystems(root, root->subsys_bits); | 1669 | ret = rebind_subsystems(root, root->subsys_mask); |
| 1633 | if (ret == -EBUSY) { | 1670 | if (ret == -EBUSY) { |
| 1634 | free_cg_links(&tmp_cg_links); | 1671 | free_cg_links(&tmp_cg_links); |
| 1635 | goto unlock_drop; | 1672 | goto unlock_drop; |
| @@ -1669,7 +1706,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1669 | BUG_ON(root->number_of_cgroups != 1); | 1706 | BUG_ON(root->number_of_cgroups != 1); |
| 1670 | 1707 | ||
| 1671 | cred = override_creds(&init_cred); | 1708 | cred = override_creds(&init_cred); |
| 1672 | cgroup_populate_dir(root_cgrp); | 1709 | cgroup_populate_dir(root_cgrp, true, root->subsys_mask); |
| 1673 | revert_creds(cred); | 1710 | revert_creds(cred); |
| 1674 | mutex_unlock(&cgroup_root_mutex); | 1711 | mutex_unlock(&cgroup_root_mutex); |
| 1675 | mutex_unlock(&cgroup_mutex); | 1712 | mutex_unlock(&cgroup_mutex); |
| @@ -1681,7 +1718,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1681 | */ | 1718 | */ |
| 1682 | cgroup_drop_root(opts.new_root); | 1719 | cgroup_drop_root(opts.new_root); |
| 1683 | /* no subsys rebinding, so refcounts don't change */ | 1720 | /* no subsys rebinding, so refcounts don't change */ |
| 1684 | drop_parsed_module_refcounts(opts.subsys_bits); | 1721 | drop_parsed_module_refcounts(opts.subsys_mask); |
| 1685 | } | 1722 | } |
| 1686 | 1723 | ||
| 1687 | kfree(opts.release_agent); | 1724 | kfree(opts.release_agent); |
| @@ -1695,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, | |||
| 1695 | drop_new_super: | 1732 | drop_new_super: |
| 1696 | deactivate_locked_super(sb); | 1733 | deactivate_locked_super(sb); |
| 1697 | drop_modules: | 1734 | drop_modules: |
| 1698 | drop_parsed_module_refcounts(opts.subsys_bits); | 1735 | drop_parsed_module_refcounts(opts.subsys_mask); |
| 1699 | out_err: | 1736 | out_err: |
| 1700 | kfree(opts.release_agent); | 1737 | kfree(opts.release_agent); |
| 1701 | kfree(opts.name); | 1738 | kfree(opts.name); |
| @@ -1745,6 +1782,8 @@ static void cgroup_kill_sb(struct super_block *sb) { | |||
| 1745 | mutex_unlock(&cgroup_root_mutex); | 1782 | mutex_unlock(&cgroup_root_mutex); |
| 1746 | mutex_unlock(&cgroup_mutex); | 1783 | mutex_unlock(&cgroup_mutex); |
| 1747 | 1784 | ||
| 1785 | simple_xattrs_free(&cgrp->xattrs); | ||
| 1786 | |||
| 1748 | kill_litter_super(sb); | 1787 | kill_litter_super(sb); |
| 1749 | cgroup_drop_root(root); | 1788 | cgroup_drop_root(root); |
| 1750 | } | 1789 | } |
| @@ -2551,6 +2590,64 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 2551 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); | 2590 | return simple_rename(old_dir, old_dentry, new_dir, new_dentry); |
| 2552 | } | 2591 | } |
| 2553 | 2592 | ||
| 2593 | static struct simple_xattrs *__d_xattrs(struct dentry *dentry) | ||
| 2594 | { | ||
| 2595 | if (S_ISDIR(dentry->d_inode->i_mode)) | ||
| 2596 | return &__d_cgrp(dentry)->xattrs; | ||
| 2597 | else | ||
| 2598 | return &__d_cft(dentry)->xattrs; | ||
| 2599 | } | ||
| 2600 | |||
| 2601 | static inline int xattr_enabled(struct dentry *dentry) | ||
| 2602 | { | ||
| 2603 | struct cgroupfs_root *root = dentry->d_sb->s_fs_info; | ||
| 2604 | return test_bit(ROOT_XATTR, &root->flags); | ||
| 2605 | } | ||
| 2606 | |||
| 2607 | static bool is_valid_xattr(const char *name) | ||
| 2608 | { | ||
| 2609 | if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || | ||
| 2610 | !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) | ||
| 2611 | return true; | ||
| 2612 | return false; | ||
| 2613 | } | ||
| 2614 | |||
| 2615 | static int cgroup_setxattr(struct dentry *dentry, const char *name, | ||
| 2616 | const void *val, size_t size, int flags) | ||
| 2617 | { | ||
| 2618 | if (!xattr_enabled(dentry)) | ||
| 2619 | return -EOPNOTSUPP; | ||
| 2620 | if (!is_valid_xattr(name)) | ||
| 2621 | return -EINVAL; | ||
| 2622 | return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); | ||
| 2623 | } | ||
| 2624 | |||
| 2625 | static int cgroup_removexattr(struct dentry *dentry, const char *name) | ||
| 2626 | { | ||
| 2627 | if (!xattr_enabled(dentry)) | ||
| 2628 | return -EOPNOTSUPP; | ||
| 2629 | if (!is_valid_xattr(name)) | ||
| 2630 | return -EINVAL; | ||
| 2631 | return simple_xattr_remove(__d_xattrs(dentry), name); | ||
| 2632 | } | ||
| 2633 | |||
| 2634 | static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, | ||
| 2635 | void *buf, size_t size) | ||
| 2636 | { | ||
| 2637 | if (!xattr_enabled(dentry)) | ||
| 2638 | return -EOPNOTSUPP; | ||
| 2639 | if (!is_valid_xattr(name)) | ||
| 2640 | return -EINVAL; | ||
| 2641 | return simple_xattr_get(__d_xattrs(dentry), name, buf, size); | ||
| 2642 | } | ||
| 2643 | |||
| 2644 | static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) | ||
| 2645 | { | ||
| 2646 | if (!xattr_enabled(dentry)) | ||
| 2647 | return -EOPNOTSUPP; | ||
| 2648 | return simple_xattr_list(__d_xattrs(dentry), buf, size); | ||
| 2649 | } | ||
| 2650 | |||
| 2554 | static const struct file_operations cgroup_file_operations = { | 2651 | static const struct file_operations cgroup_file_operations = { |
| 2555 | .read = cgroup_file_read, | 2652 | .read = cgroup_file_read, |
| 2556 | .write = cgroup_file_write, | 2653 | .write = cgroup_file_write, |
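
Once a hierarchy is mounted with the xattr option, the handlers above accept trusted.* and security.* names on cgroup directories and control files (trusted.* needs CAP_SYS_ADMIN). A hedged usage sketch; the path and attribute name are assumptions for illustration:

    #include <stdio.h>
    #include <string.h>
    #include <sys/xattr.h>

    int main(void)
    {
    	const char *path = "/sys/fs/cgroup/cpu/mygroup";
    	const char *val  = "build-farm";
    	char buf[64];
    	ssize_t n;

    	if (setxattr(path, "trusted.owner", val, strlen(val), 0))
    		perror("setxattr");

    	n = getxattr(path, "trusted.owner", buf, sizeof(buf));
    	if (n < 0)
    		perror("getxattr");
    	else
    		printf("trusted.owner=%.*s\n", (int)n, buf);
    	return 0;
    }
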
| @@ -2559,11 +2656,22 @@ static const struct file_operations cgroup_file_operations = { | |||
| 2559 | .release = cgroup_file_release, | 2656 | .release = cgroup_file_release, |
| 2560 | }; | 2657 | }; |
| 2561 | 2658 | ||
| 2659 | static const struct inode_operations cgroup_file_inode_operations = { | ||
| 2660 | .setxattr = cgroup_setxattr, | ||
| 2661 | .getxattr = cgroup_getxattr, | ||
| 2662 | .listxattr = cgroup_listxattr, | ||
| 2663 | .removexattr = cgroup_removexattr, | ||
| 2664 | }; | ||
| 2665 | |||
| 2562 | static const struct inode_operations cgroup_dir_inode_operations = { | 2666 | static const struct inode_operations cgroup_dir_inode_operations = { |
| 2563 | .lookup = cgroup_lookup, | 2667 | .lookup = cgroup_lookup, |
| 2564 | .mkdir = cgroup_mkdir, | 2668 | .mkdir = cgroup_mkdir, |
| 2565 | .rmdir = cgroup_rmdir, | 2669 | .rmdir = cgroup_rmdir, |
| 2566 | .rename = cgroup_rename, | 2670 | .rename = cgroup_rename, |
| 2671 | .setxattr = cgroup_setxattr, | ||
| 2672 | .getxattr = cgroup_getxattr, | ||
| 2673 | .listxattr = cgroup_listxattr, | ||
| 2674 | .removexattr = cgroup_removexattr, | ||
| 2567 | }; | 2675 | }; |
| 2568 | 2676 | ||
| 2569 | static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) | 2677 | static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) |
| @@ -2611,6 +2719,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, | |||
| 2611 | } else if (S_ISREG(mode)) { | 2719 | } else if (S_ISREG(mode)) { |
| 2612 | inode->i_size = 0; | 2720 | inode->i_size = 0; |
| 2613 | inode->i_fop = &cgroup_file_operations; | 2721 | inode->i_fop = &cgroup_file_operations; |
| 2722 | inode->i_op = &cgroup_file_inode_operations; | ||
| 2614 | } | 2723 | } |
| 2615 | d_instantiate(dentry, inode); | 2724 | d_instantiate(dentry, inode); |
| 2616 | dget(dentry); /* Extra count - pin the dentry in core */ | 2725 | dget(dentry); /* Extra count - pin the dentry in core */ |
| @@ -2671,7 +2780,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) | |||
| 2671 | } | 2780 | } |
| 2672 | 2781 | ||
| 2673 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 2782 | static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
| 2674 | const struct cftype *cft) | 2783 | struct cftype *cft) |
| 2675 | { | 2784 | { |
| 2676 | struct dentry *dir = cgrp->dentry; | 2785 | struct dentry *dir = cgrp->dentry; |
| 2677 | struct cgroup *parent = __d_cgrp(dir); | 2786 | struct cgroup *parent = __d_cgrp(dir); |
| @@ -2681,6 +2790,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, | |||
| 2681 | umode_t mode; | 2790 | umode_t mode; |
| 2682 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; | 2791 | char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; |
| 2683 | 2792 | ||
| 2793 | simple_xattrs_init(&cft->xattrs); | ||
| 2794 | |||
| 2684 | /* does @cft->flags tell us to skip creation on @cgrp? */ | 2795 | /* does @cft->flags tell us to skip creation on @cgrp? */ |
| 2685 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) | 2796 | if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) |
| 2686 | return 0; | 2797 | return 0; |
| @@ -2721,9 +2832,9 @@ out: | |||
| 2721 | } | 2832 | } |
| 2722 | 2833 | ||
| 2723 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, | 2834 | static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, |
| 2724 | const struct cftype cfts[], bool is_add) | 2835 | struct cftype cfts[], bool is_add) |
| 2725 | { | 2836 | { |
| 2726 | const struct cftype *cft; | 2837 | struct cftype *cft; |
| 2727 | int err, ret = 0; | 2838 | int err, ret = 0; |
| 2728 | 2839 | ||
| 2729 | for (cft = cfts; cft->name[0] != '\0'; cft++) { | 2840 | for (cft = cfts; cft->name[0] != '\0'; cft++) { |
| @@ -2757,7 +2868,7 @@ static void cgroup_cfts_prepare(void) | |||
| 2757 | } | 2868 | } |
| 2758 | 2869 | ||
| 2759 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, | 2870 | static void cgroup_cfts_commit(struct cgroup_subsys *ss, |
| 2760 | const struct cftype *cfts, bool is_add) | 2871 | struct cftype *cfts, bool is_add) |
| 2761 | __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) | 2872 | __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) |
| 2762 | { | 2873 | { |
| 2763 | LIST_HEAD(pending); | 2874 | LIST_HEAD(pending); |
| @@ -2808,7 +2919,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, | |||
| 2808 | * function currently returns 0 as long as @cfts registration is successful | 2919 | * function currently returns 0 as long as @cfts registration is successful |
| 2809 | * even if some file creation attempts on existing cgroups fail. | 2920 | * even if some file creation attempts on existing cgroups fail. |
| 2810 | */ | 2921 | */ |
| 2811 | int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | 2922 | int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) |
| 2812 | { | 2923 | { |
| 2813 | struct cftype_set *set; | 2924 | struct cftype_set *set; |
| 2814 | 2925 | ||
| @@ -2838,7 +2949,7 @@ EXPORT_SYMBOL_GPL(cgroup_add_cftypes); | |||
| 2838 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not | 2949 | * Returns 0 on successful unregistration, -ENOENT if @cfts is not |
| 2839 | * registered with @ss. | 2950 | * registered with @ss. |
| 2840 | */ | 2951 | */ |
| 2841 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts) | 2952 | int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) |
| 2842 | { | 2953 | { |
| 2843 | struct cftype_set *set; | 2954 | struct cftype_set *set; |
| 2844 | 2955 | ||
| @@ -3843,18 +3954,29 @@ static struct cftype files[] = { | |||
| 3843 | { } /* terminate */ | 3954 | { } /* terminate */ |
| 3844 | }; | 3955 | }; |
| 3845 | 3956 | ||
| 3846 | static int cgroup_populate_dir(struct cgroup *cgrp) | 3957 | /** |
| 3958 | * cgroup_populate_dir - selectively creation of files in a directory | ||
| 3959 | * @cgrp: target cgroup | ||
| 3960 | * @base_files: true if the base files should be added | ||
| 3961 | * @subsys_mask: mask of the subsystem ids whose files should be added | ||
| 3962 | */ | ||
| 3963 | static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, | ||
| 3964 | unsigned long subsys_mask) | ||
| 3847 | { | 3965 | { |
| 3848 | int err; | 3966 | int err; |
| 3849 | struct cgroup_subsys *ss; | 3967 | struct cgroup_subsys *ss; |
| 3850 | 3968 | ||
| 3851 | err = cgroup_addrm_files(cgrp, NULL, files, true); | 3969 | if (base_files) { |
| 3852 | if (err < 0) | 3970 | err = cgroup_addrm_files(cgrp, NULL, files, true); |
| 3853 | return err; | 3971 | if (err < 0) |
| 3972 | return err; | ||
| 3973 | } | ||
| 3854 | 3974 | ||
| 3855 | /* process cftsets of each subsystem */ | 3975 | /* process cftsets of each subsystem */ |
| 3856 | for_each_subsys(cgrp->root, ss) { | 3976 | for_each_subsys(cgrp->root, ss) { |
| 3857 | struct cftype_set *set; | 3977 | struct cftype_set *set; |
| 3978 | if (!test_bit(ss->subsys_id, &subsys_mask)) | ||
| 3979 | continue; | ||
| 3858 | 3980 | ||
| 3859 | list_for_each_entry(set, &ss->cftsets, node) | 3981 | list_for_each_entry(set, &ss->cftsets, node) |
| 3860 | cgroup_addrm_files(cgrp, ss, set->cfts, true); | 3982 | cgroup_addrm_files(cgrp, ss, set->cfts, true); |
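For illustration only, a hypothetical caller of the widened helper above, adding only the files of newly bound subsystems ('added_mask' and the surrounding context are invented, not part of this patch):

	/* base files already exist; create only the new subsystems' files */
	err = cgroup_populate_dir(cgrp, false, added_mask);
	if (err < 0)
		pr_warning("cgroup: failed to populate added subsystem files\n");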
| @@ -3954,8 +4076,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
| 3954 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); | 4076 | set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); |
| 3955 | 4077 | ||
| 3956 | for_each_subsys(root, ss) { | 4078 | for_each_subsys(root, ss) { |
| 3957 | struct cgroup_subsys_state *css = ss->create(cgrp); | 4079 | struct cgroup_subsys_state *css; |
| 3958 | 4080 | ||
| 4081 | css = ss->create(cgrp); | ||
| 3959 | if (IS_ERR(css)) { | 4082 | if (IS_ERR(css)) { |
| 3960 | err = PTR_ERR(css); | 4083 | err = PTR_ERR(css); |
| 3961 | goto err_destroy; | 4084 | goto err_destroy; |
| @@ -3969,6 +4092,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
| 3969 | /* At error, ->destroy() callback has to free assigned ID. */ | 4092 | /* At error, ->destroy() callback has to free assigned ID. */ |
| 3970 | if (clone_children(parent) && ss->post_clone) | 4093 | if (clone_children(parent) && ss->post_clone) |
| 3971 | ss->post_clone(cgrp); | 4094 | ss->post_clone(cgrp); |
| 4095 | |||
| 4096 | if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && | ||
| 4097 | parent->parent) { | ||
| 4098 | pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", | ||
| 4099 | current->comm, current->pid, ss->name); | ||
| 4100 | if (!strcmp(ss->name, "memory")) | ||
| 4101 | pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); | ||
| 4102 | ss->warned_broken_hierarchy = true; | ||
| 4103 | } | ||
| 3972 | } | 4104 | } |
| 3973 | 4105 | ||
| 3974 | list_add(&cgrp->sibling, &cgrp->parent->children); | 4106 | list_add(&cgrp->sibling, &cgrp->parent->children); |
| @@ -3988,7 +4120,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, | |||
| 3988 | 4120 | ||
| 3989 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); | 4121 | list_add_tail(&cgrp->allcg_node, &root->allcg_list); |
| 3990 | 4122 | ||
| 3991 | err = cgroup_populate_dir(cgrp); | 4123 | err = cgroup_populate_dir(cgrp, true, root->subsys_mask); |
| 3992 | /* If err < 0, we have a half-filled directory - oh well ;) */ | 4124 | /* If err < 0, we have a half-filled directory - oh well ;) */ |
| 3993 | 4125 | ||
| 3994 | mutex_unlock(&cgroup_mutex); | 4126 | mutex_unlock(&cgroup_mutex); |
| @@ -4321,8 +4453,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
| 4321 | * since cgroup_init_subsys will have already taken care of it. | 4453 | * since cgroup_init_subsys will have already taken care of it. |
| 4322 | */ | 4454 | */ |
| 4323 | if (ss->module == NULL) { | 4455 | if (ss->module == NULL) { |
| 4324 | /* a few sanity checks */ | 4456 | /* a sanity check */ |
| 4325 | BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); | ||
| 4326 | BUG_ON(subsys[ss->subsys_id] != ss); | 4457 | BUG_ON(subsys[ss->subsys_id] != ss); |
| 4327 | return 0; | 4458 | return 0; |
| 4328 | } | 4459 | } |
| @@ -4330,24 +4461,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
| 4330 | /* init base cftset */ | 4461 | /* init base cftset */ |
| 4331 | cgroup_init_cftsets(ss); | 4462 | cgroup_init_cftsets(ss); |
| 4332 | 4463 | ||
| 4333 | /* | ||
| 4334 | * need to register a subsys id before anything else - for example, | ||
| 4335 | * init_cgroup_css needs it. | ||
| 4336 | */ | ||
| 4337 | mutex_lock(&cgroup_mutex); | 4464 | mutex_lock(&cgroup_mutex); |
| 4338 | /* find the first empty slot in the array */ | 4465 | subsys[ss->subsys_id] = ss; |
| 4339 | for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { | ||
| 4340 | if (subsys[i] == NULL) | ||
| 4341 | break; | ||
| 4342 | } | ||
| 4343 | if (i == CGROUP_SUBSYS_COUNT) { | ||
| 4344 | /* maximum number of subsystems already registered! */ | ||
| 4345 | mutex_unlock(&cgroup_mutex); | ||
| 4346 | return -EBUSY; | ||
| 4347 | } | ||
| 4348 | /* assign ourselves the subsys_id */ | ||
| 4349 | ss->subsys_id = i; | ||
| 4350 | subsys[i] = ss; | ||
| 4351 | 4466 | ||
| 4352 | /* | 4467 | /* |
| 4353 | * no ss->create seems to need anything important in the ss struct, so | 4468 | * no ss->create seems to need anything important in the ss struct, so |
| @@ -4356,7 +4471,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
| 4356 | css = ss->create(dummytop); | 4471 | css = ss->create(dummytop); |
| 4357 | if (IS_ERR(css)) { | 4472 | if (IS_ERR(css)) { |
| 4358 | /* failure case - need to deassign the subsys[] slot. */ | 4473 | /* failure case - need to deassign the subsys[] slot. */ |
| 4359 | subsys[i] = NULL; | 4474 | subsys[ss->subsys_id] = NULL; |
| 4360 | mutex_unlock(&cgroup_mutex); | 4475 | mutex_unlock(&cgroup_mutex); |
| 4361 | return PTR_ERR(css); | 4476 | return PTR_ERR(css); |
| 4362 | } | 4477 | } |
| @@ -4372,7 +4487,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) | |||
| 4372 | if (ret) { | 4487 | if (ret) { |
| 4373 | dummytop->subsys[ss->subsys_id] = NULL; | 4488 | dummytop->subsys[ss->subsys_id] = NULL; |
| 4374 | ss->destroy(dummytop); | 4489 | ss->destroy(dummytop); |
| 4375 | subsys[i] = NULL; | 4490 | subsys[ss->subsys_id] = NULL; |
| 4376 | mutex_unlock(&cgroup_mutex); | 4491 | mutex_unlock(&cgroup_mutex); |
| 4377 | return ret; | 4492 | return ret; |
| 4378 | } | 4493 | } |
| @@ -4439,7 +4554,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) | |||
| 4439 | 4554 | ||
| 4440 | mutex_lock(&cgroup_mutex); | 4555 | mutex_lock(&cgroup_mutex); |
| 4441 | /* deassign the subsys_id */ | 4556 | /* deassign the subsys_id */ |
| 4442 | BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); | ||
| 4443 | subsys[ss->subsys_id] = NULL; | 4557 | subsys[ss->subsys_id] = NULL; |
| 4444 | 4558 | ||
| 4445 | /* remove subsystem from rootnode's list of subsystems */ | 4559 | /* remove subsystem from rootnode's list of subsystems */ |
| @@ -4502,10 +4616,13 @@ int __init cgroup_init_early(void) | |||
| 4502 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) | 4616 | for (i = 0; i < CSS_SET_TABLE_SIZE; i++) |
| 4503 | INIT_HLIST_HEAD(&css_set_table[i]); | 4617 | INIT_HLIST_HEAD(&css_set_table[i]); |
| 4504 | 4618 | ||
| 4505 | /* at bootup time, we don't worry about modular subsystems */ | 4619 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 4506 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
| 4507 | struct cgroup_subsys *ss = subsys[i]; | 4620 | struct cgroup_subsys *ss = subsys[i]; |
| 4508 | 4621 | ||
| 4622 | /* at bootup time, we don't worry about modular subsystems */ | ||
| 4623 | if (!ss || ss->module) | ||
| 4624 | continue; | ||
| 4625 | |||
| 4509 | BUG_ON(!ss->name); | 4626 | BUG_ON(!ss->name); |
| 4510 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); | 4627 | BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); |
| 4511 | BUG_ON(!ss->create); | 4628 | BUG_ON(!ss->create); |
| @@ -4538,9 +4655,12 @@ int __init cgroup_init(void) | |||
| 4538 | if (err) | 4655 | if (err) |
| 4539 | return err; | 4656 | return err; |
| 4540 | 4657 | ||
| 4541 | /* at bootup time, we don't worry about modular subsystems */ | 4658 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 4542 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
| 4543 | struct cgroup_subsys *ss = subsys[i]; | 4659 | struct cgroup_subsys *ss = subsys[i]; |
| 4660 | |||
| 4661 | /* at bootup time, we don't worry about modular subsystems */ | ||
| 4662 | if (!ss || ss->module) | ||
| 4663 | continue; | ||
| 4544 | if (!ss->early_init) | 4664 | if (!ss->early_init) |
| 4545 | cgroup_init_subsys(ss); | 4665 | cgroup_init_subsys(ss); |
| 4546 | if (ss->use_id) | 4666 | if (ss->use_id) |
| @@ -4735,13 +4855,16 @@ void cgroup_fork_callbacks(struct task_struct *child) | |||
| 4735 | { | 4855 | { |
| 4736 | if (need_forkexit_callback) { | 4856 | if (need_forkexit_callback) { |
| 4737 | int i; | 4857 | int i; |
| 4738 | /* | 4858 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 4739 | * forkexit callbacks are only supported for builtin | ||
| 4740 | * subsystems, and the builtin section of the subsys array is | ||
| 4741 | * immutable, so we don't need to lock the subsys array here. | ||
| 4742 | */ | ||
| 4743 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
| 4744 | struct cgroup_subsys *ss = subsys[i]; | 4859 | struct cgroup_subsys *ss = subsys[i]; |
| 4860 | |||
| 4861 | /* | ||
| 4862 | * forkexit callbacks are only supported for | ||
| 4863 | * builtin subsystems. | ||
| 4864 | */ | ||
| 4865 | if (!ss || ss->module) | ||
| 4866 | continue; | ||
| 4867 | |||
| 4745 | if (ss->fork) | 4868 | if (ss->fork) |
| 4746 | ss->fork(child); | 4869 | ss->fork(child); |
| 4747 | } | 4870 | } |
| @@ -4846,12 +4969,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) | |||
| 4846 | tsk->cgroups = &init_css_set; | 4969 | tsk->cgroups = &init_css_set; |
| 4847 | 4970 | ||
| 4848 | if (run_callbacks && need_forkexit_callback) { | 4971 | if (run_callbacks && need_forkexit_callback) { |
| 4849 | /* | 4972 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 4850 | * modular subsystems can't use callbacks, so no need to lock | ||
| 4851 | * the subsys array | ||
| 4852 | */ | ||
| 4853 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
| 4854 | struct cgroup_subsys *ss = subsys[i]; | 4973 | struct cgroup_subsys *ss = subsys[i]; |
| 4974 | |||
| 4975 | /* modular subsystems can't use callbacks */ | ||
| 4976 | if (!ss || ss->module) | ||
| 4977 | continue; | ||
| 4978 | |||
| 4855 | if (ss->exit) { | 4979 | if (ss->exit) { |
| 4856 | struct cgroup *old_cgrp = | 4980 | struct cgroup *old_cgrp = |
| 4857 | rcu_dereference_raw(cg->subsys[i])->cgroup; | 4981 | rcu_dereference_raw(cg->subsys[i])->cgroup; |
| @@ -5037,13 +5161,17 @@ static int __init cgroup_disable(char *str) | |||
| 5037 | while ((token = strsep(&str, ",")) != NULL) { | 5161 | while ((token = strsep(&str, ",")) != NULL) { |
| 5038 | if (!*token) | 5162 | if (!*token) |
| 5039 | continue; | 5163 | continue; |
| 5040 | /* | 5164 | for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { |
| 5041 | * cgroup_disable, being at boot time, can't know about module | ||
| 5042 | * subsystems, so we don't worry about them. | ||
| 5043 | */ | ||
| 5044 | for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { | ||
| 5045 | struct cgroup_subsys *ss = subsys[i]; | 5165 | struct cgroup_subsys *ss = subsys[i]; |
| 5046 | 5166 | ||
| 5167 | /* | ||
| 5168 | * cgroup_disable, being at boot time, can't | ||
| 5169 | * know about module subsystems, so we don't | ||
| 5170 | * worry about them. | ||
| 5171 | */ | ||
| 5172 | if (!ss || ss->module) | ||
| 5173 | continue; | ||
| 5174 | |||
| 5047 | if (!strcmp(token, ss->name)) { | 5175 | if (!strcmp(token, ss->name)) { |
| 5048 | ss->disabled = 1; | 5176 | ss->disabled = 1; |
| 5049 | printk(KERN_INFO "Disabling %s control group" | 5177 | printk(KERN_INFO "Disabling %s control group" |
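The cgroup.c hunks above repeatedly convert builtin-only loops into full-array walks; the shared shape, restated here as a sketch rather than a literal excerpt, is:

	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];

		/* skip empty (unregistered) slots and modular subsystems */
		if (!ss || ss->module)
			continue;

		/* per-subsystem work goes here */
	}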
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 3649fc6b3eaa..b1724ce98981 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c | |||
| @@ -373,4 +373,12 @@ struct cgroup_subsys freezer_subsys = { | |||
| 373 | .can_attach = freezer_can_attach, | 373 | .can_attach = freezer_can_attach, |
| 374 | .fork = freezer_fork, | 374 | .fork = freezer_fork, |
| 375 | .base_cftypes = files, | 375 | .base_cftypes = files, |
| 376 | |||
| 377 | /* | ||
| 378 | * freezer subsys doesn't handle hierarchy at all. Frozen state | ||
| 379 | * should be inherited through the hierarchy - if a parent is | ||
| 380 | * frozen, all its children should be frozen. Fix it and remove | ||
| 381 | * the following. | ||
| 382 | */ | ||
| 383 | .broken_hierarchy = true, | ||
| 376 | }; | 384 | }; |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 14d32588cccd..42bd331ee0ab 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
| @@ -80,6 +80,10 @@ void put_online_cpus(void) | |||
| 80 | if (cpu_hotplug.active_writer == current) | 80 | if (cpu_hotplug.active_writer == current) |
| 81 | return; | 81 | return; |
| 82 | mutex_lock(&cpu_hotplug.lock); | 82 | mutex_lock(&cpu_hotplug.lock); |
| 83 | |||
| 84 | if (WARN_ON(!cpu_hotplug.refcount)) | ||
| 85 | cpu_hotplug.refcount++; /* try to fix things up */ | ||
| 86 | |||
| 83 | if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) | 87 | if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer)) |
| 84 | wake_up_process(cpu_hotplug.active_writer); | 88 | wake_up_process(cpu_hotplug.active_writer); |
| 85 | mutex_unlock(&cpu_hotplug.lock); | 89 | mutex_unlock(&cpu_hotplug.lock); |
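The new WARN_ON catches an unpaired put; the expected usage remains the usual bracketed form (a minimal sketch, not taken from this patch):

	get_online_cpus();
	/* section that must not race with CPU hotplug */
	put_online_cpus();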
| @@ -280,12 +284,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) | |||
| 280 | __func__, cpu); | 284 | __func__, cpu); |
| 281 | goto out_release; | 285 | goto out_release; |
| 282 | } | 286 | } |
| 287 | smpboot_park_threads(cpu); | ||
| 283 | 288 | ||
| 284 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); | 289 | err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); |
| 285 | if (err) { | 290 | if (err) { |
| 286 | /* CPU didn't die: tell everyone. Can't complain. */ | 291 | /* CPU didn't die: tell everyone. Can't complain. */ |
| 292 | smpboot_unpark_threads(cpu); | ||
| 287 | cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); | 293 | cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); |
| 288 | |||
| 289 | goto out_release; | 294 | goto out_release; |
| 290 | } | 295 | } |
| 291 | BUG_ON(cpu_online(cpu)); | 296 | BUG_ON(cpu_online(cpu)); |
| @@ -354,6 +359,10 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
| 354 | goto out; | 359 | goto out; |
| 355 | } | 360 | } |
| 356 | 361 | ||
| 362 | ret = smpboot_create_threads(cpu); | ||
| 363 | if (ret) | ||
| 364 | goto out; | ||
| 365 | |||
| 357 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); | 366 | ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); |
| 358 | if (ret) { | 367 | if (ret) { |
| 359 | nr_calls--; | 368 | nr_calls--; |
| @@ -368,6 +377,9 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
| 368 | goto out_notify; | 377 | goto out_notify; |
| 369 | BUG_ON(!cpu_online(cpu)); | 378 | BUG_ON(!cpu_online(cpu)); |
| 370 | 379 | ||
| 380 | /* Wake the per cpu threads */ | ||
| 381 | smpboot_unpark_threads(cpu); | ||
| 382 | |||
| 371 | /* Now call notifier in preparation. */ | 383 | /* Now call notifier in preparation. */ |
| 372 | cpu_notify(CPU_ONLINE | mod, hcpu); | 384 | cpu_notify(CPU_ONLINE | mod, hcpu); |
| 373 | 385 | ||
| @@ -439,14 +451,6 @@ EXPORT_SYMBOL_GPL(cpu_up); | |||
| 439 | #ifdef CONFIG_PM_SLEEP_SMP | 451 | #ifdef CONFIG_PM_SLEEP_SMP |
| 440 | static cpumask_var_t frozen_cpus; | 452 | static cpumask_var_t frozen_cpus; |
| 441 | 453 | ||
| 442 | void __weak arch_disable_nonboot_cpus_begin(void) | ||
| 443 | { | ||
| 444 | } | ||
| 445 | |||
| 446 | void __weak arch_disable_nonboot_cpus_end(void) | ||
| 447 | { | ||
| 448 | } | ||
| 449 | |||
| 450 | int disable_nonboot_cpus(void) | 454 | int disable_nonboot_cpus(void) |
| 451 | { | 455 | { |
| 452 | int cpu, first_cpu, error = 0; | 456 | int cpu, first_cpu, error = 0; |
| @@ -458,7 +462,6 @@ int disable_nonboot_cpus(void) | |||
| 458 | * with the userspace trying to use the CPU hotplug at the same time | 462 | * with the userspace trying to use the CPU hotplug at the same time |
| 459 | */ | 463 | */ |
| 460 | cpumask_clear(frozen_cpus); | 464 | cpumask_clear(frozen_cpus); |
| 461 | arch_disable_nonboot_cpus_begin(); | ||
| 462 | 465 | ||
| 463 | printk("Disabling non-boot CPUs ...\n"); | 466 | printk("Disabling non-boot CPUs ...\n"); |
| 464 | for_each_online_cpu(cpu) { | 467 | for_each_online_cpu(cpu) { |
| @@ -474,8 +477,6 @@ int disable_nonboot_cpus(void) | |||
| 474 | } | 477 | } |
| 475 | } | 478 | } |
| 476 | 479 | ||
| 477 | arch_disable_nonboot_cpus_end(); | ||
| 478 | |||
| 479 | if (!error) { | 480 | if (!error) { |
| 480 | BUG_ON(num_online_cpus() > 1); | 481 | BUG_ON(num_online_cpus() > 1); |
| 481 | /* Make sure the CPUs won't be enabled by someone else */ | 482 | /* Make sure the CPUs won't be enabled by someone else */ |
diff --git a/kernel/cred.c b/kernel/cred.c index de728ac50d82..48cea3da6d05 100644 --- a/kernel/cred.c +++ b/kernel/cred.c | |||
| @@ -799,9 +799,15 @@ static void dump_invalid_creds(const struct cred *cred, const char *label, | |||
| 799 | atomic_read(&cred->usage), | 799 | atomic_read(&cred->usage), |
| 800 | read_cred_subscribers(cred)); | 800 | read_cred_subscribers(cred)); |
| 801 | printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", | 801 | printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", |
| 802 | cred->uid, cred->euid, cred->suid, cred->fsuid); | 802 | from_kuid_munged(&init_user_ns, cred->uid), |
| 803 | from_kuid_munged(&init_user_ns, cred->euid), | ||
| 804 | from_kuid_munged(&init_user_ns, cred->suid), | ||
| 805 | from_kuid_munged(&init_user_ns, cred->fsuid)); | ||
| 803 | printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", | 806 | printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", |
| 804 | cred->gid, cred->egid, cred->sgid, cred->fsgid); | 807 | from_kgid_munged(&init_user_ns, cred->gid), |
| 808 | from_kgid_munged(&init_user_ns, cred->egid), | ||
| 809 | from_kgid_munged(&init_user_ns, cred->sgid), | ||
| 810 | from_kgid_munged(&init_user_ns, cred->fsgid)); | ||
| 805 | #ifdef CONFIG_SECURITY | 811 | #ifdef CONFIG_SECURITY |
| 806 | printk(KERN_ERR "CRED: ->security is %p\n", cred->security); | 812 | printk(KERN_ERR "CRED: ->security is %p\n", cred->security); |
| 807 | if ((unsigned long) cred->security >= PAGE_SIZE && | 813 | if ((unsigned long) cred->security >= PAGE_SIZE && |
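A minimal sketch of the conversion used above, assuming only the helpers the hunk itself calls: a kuid_t/kgid_t is mapped back to a plain uid_t/gid_t relative to the initial user namespace before printing:

	uid_t uid = from_kuid_munged(&init_user_ns, cred->uid);
	gid_t gid = from_kgid_munged(&init_user_ns, cred->gid);

	printk(KERN_ERR "CRED: uid=%d gid=%d\n", uid, gid);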
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0557f24c6bca..17e073c309e6 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c | |||
| @@ -672,6 +672,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
| 672 | { | 672 | { |
| 673 | struct kgdb_state kgdb_var; | 673 | struct kgdb_state kgdb_var; |
| 674 | struct kgdb_state *ks = &kgdb_var; | 674 | struct kgdb_state *ks = &kgdb_var; |
| 675 | int ret = 0; | ||
| 676 | |||
| 677 | if (arch_kgdb_ops.enable_nmi) | ||
| 678 | arch_kgdb_ops.enable_nmi(0); | ||
| 675 | 679 | ||
| 676 | ks->cpu = raw_smp_processor_id(); | 680 | ks->cpu = raw_smp_processor_id(); |
| 677 | ks->ex_vector = evector; | 681 | ks->ex_vector = evector; |
| @@ -681,11 +685,15 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) | |||
| 681 | ks->linux_regs = regs; | 685 | ks->linux_regs = regs; |
| 682 | 686 | ||
| 683 | if (kgdb_reenter_check(ks)) | 687 | if (kgdb_reenter_check(ks)) |
| 684 | return 0; /* Ouch, double exception ! */ | 688 | goto out; /* Ouch, double exception ! */ |
| 685 | if (kgdb_info[ks->cpu].enter_kgdb != 0) | 689 | if (kgdb_info[ks->cpu].enter_kgdb != 0) |
| 686 | return 0; | 690 | goto out; |
| 687 | 691 | ||
| 688 | return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); | 692 | ret = kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); |
| 693 | out: | ||
| 694 | if (arch_kgdb_ops.enable_nmi) | ||
| 695 | arch_kgdb_ops.enable_nmi(1); | ||
| 696 | return ret; | ||
| 689 | } | 697 | } |
| 690 | 698 | ||
| 691 | int kgdb_nmicallback(int cpu, void *regs) | 699 | int kgdb_nmicallback(int cpu, void *regs) |
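A hedged sketch of how an architecture could provide the hook consulted above; the callback name and its exact prototype are assumptions, only the arch_kgdb_ops.enable_nmi field comes from the hunk itself:

	static void my_arch_enable_nmi(bool on)		/* hypothetical */
	{
		/* mask or unmask the NMI source used to enter the debugger */
	}

	struct kgdb_arch arch_kgdb_ops = {
		.enable_nmi	= my_arch_enable_nmi,
	};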
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 31df1706b9a9..1261dc7eaeb9 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c | |||
| @@ -21,6 +21,7 @@ | |||
| 21 | #include <linux/smp.h> | 21 | #include <linux/smp.h> |
| 22 | #include <linux/utsname.h> | 22 | #include <linux/utsname.h> |
| 23 | #include <linux/vmalloc.h> | 23 | #include <linux/vmalloc.h> |
| 24 | #include <linux/atomic.h> | ||
| 24 | #include <linux/module.h> | 25 | #include <linux/module.h> |
| 25 | #include <linux/mm.h> | 26 | #include <linux/mm.h> |
| 26 | #include <linux/init.h> | 27 | #include <linux/init.h> |
| @@ -2107,6 +2108,32 @@ static int kdb_dmesg(int argc, const char **argv) | |||
| 2107 | return 0; | 2108 | return 0; |
| 2108 | } | 2109 | } |
| 2109 | #endif /* CONFIG_PRINTK */ | 2110 | #endif /* CONFIG_PRINTK */ |
| 2111 | |||
| 2112 | /* Make sure we balance enable/disable calls, must disable first. */ | ||
| 2113 | static atomic_t kdb_nmi_disabled; | ||
| 2114 | |||
| 2115 | static int kdb_disable_nmi(int argc, const char *argv[]) | ||
| 2116 | { | ||
| 2117 | if (atomic_read(&kdb_nmi_disabled)) | ||
| 2118 | return 0; | ||
| 2119 | atomic_set(&kdb_nmi_disabled, 1); | ||
| 2120 | arch_kgdb_ops.enable_nmi(0); | ||
| 2121 | return 0; | ||
| 2122 | } | ||
| 2123 | |||
| 2124 | static int kdb_param_enable_nmi(const char *val, const struct kernel_param *kp) | ||
| 2125 | { | ||
| 2126 | if (!atomic_add_unless(&kdb_nmi_disabled, -1, 0)) | ||
| 2127 | return -EINVAL; | ||
| 2128 | arch_kgdb_ops.enable_nmi(1); | ||
| 2129 | return 0; | ||
| 2130 | } | ||
| 2131 | |||
| 2132 | static const struct kernel_param_ops kdb_param_ops_enable_nmi = { | ||
| 2133 | .set = kdb_param_enable_nmi, | ||
| 2134 | }; | ||
| 2135 | module_param_cb(enable_nmi, &kdb_param_ops_enable_nmi, NULL, 0600); | ||
| 2136 | |||
| 2110 | /* | 2137 | /* |
| 2111 | * kdb_cpu - This function implements the 'cpu' command. | 2138 | * kdb_cpu - This function implements the 'cpu' command. |
| 2112 | * cpu [<cpunum>] | 2139 | * cpu [<cpunum>] |
| @@ -2851,6 +2878,10 @@ static void __init kdb_inittab(void) | |||
| 2851 | kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", | 2878 | kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", |
| 2852 | "Display syslog buffer", 0, KDB_REPEAT_NONE); | 2879 | "Display syslog buffer", 0, KDB_REPEAT_NONE); |
| 2853 | #endif | 2880 | #endif |
| 2881 | if (arch_kgdb_ops.enable_nmi) { | ||
| 2882 | kdb_register_repeat("disable_nmi", kdb_disable_nmi, "", | ||
| 2883 | "Disable NMI entry to KDB", 0, KDB_REPEAT_NONE); | ||
| 2884 | } | ||
| 2854 | kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", | 2885 | kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", |
| 2855 | "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); | 2886 | "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); |
| 2856 | kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", | 2887 | kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>", |
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 98d4597f43d6..c77206184b8b 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c | |||
| @@ -159,6 +159,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) | |||
| 159 | int rctx; | 159 | int rctx; |
| 160 | struct perf_callchain_entry *entry; | 160 | struct perf_callchain_entry *entry; |
| 161 | 161 | ||
| 162 | int kernel = !event->attr.exclude_callchain_kernel; | ||
| 163 | int user = !event->attr.exclude_callchain_user; | ||
| 164 | |||
| 165 | if (!kernel && !user) | ||
| 166 | return NULL; | ||
| 162 | 167 | ||
| 163 | entry = get_callchain_entry(&rctx); | 168 | entry = get_callchain_entry(&rctx); |
| 164 | if (rctx == -1) | 169 | if (rctx == -1) |
| @@ -169,24 +174,29 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) | |||
| 169 | 174 | ||
| 170 | entry->nr = 0; | 175 | entry->nr = 0; |
| 171 | 176 | ||
| 172 | if (!user_mode(regs)) { | 177 | if (kernel && !user_mode(regs)) { |
| 173 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); | 178 | perf_callchain_store(entry, PERF_CONTEXT_KERNEL); |
| 174 | perf_callchain_kernel(entry, regs); | 179 | perf_callchain_kernel(entry, regs); |
| 175 | if (current->mm) | ||
| 176 | regs = task_pt_regs(current); | ||
| 177 | else | ||
| 178 | regs = NULL; | ||
| 179 | } | 180 | } |
| 180 | 181 | ||
| 181 | if (regs) { | 182 | if (user) { |
| 182 | /* | 183 | if (!user_mode(regs)) { |
| 183 | * Disallow cross-task user callchains. | 184 | if (current->mm) |
| 184 | */ | 185 | regs = task_pt_regs(current); |
| 185 | if (event->ctx->task && event->ctx->task != current) | 186 | else |
| 186 | goto exit_put; | 187 | regs = NULL; |
| 187 | 188 | } | |
| 188 | perf_callchain_store(entry, PERF_CONTEXT_USER); | 189 | |
| 189 | perf_callchain_user(entry, regs); | 190 | if (regs) { |
| 191 | /* | ||
| 192 | * Disallow cross-task user callchains. | ||
| 193 | */ | ||
| 194 | if (event->ctx->task && event->ctx->task != current) | ||
| 195 | goto exit_put; | ||
| 196 | |||
| 197 | perf_callchain_store(entry, PERF_CONTEXT_USER); | ||
| 198 | perf_callchain_user(entry, regs); | ||
| 199 | } | ||
| 190 | } | 200 | } |
| 191 | 201 | ||
| 192 | exit_put: | 202 | exit_put: |
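For context, a user-space sketch of the new attr bits being honoured above; the event type, the syscall wrapper and the field values are illustrative choices, not taken from this patch:

	struct perf_event_attr attr = {
		.size = sizeof(attr),
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.sample_type = PERF_SAMPLE_CALLCHAIN,
		.exclude_callchain_user = 1,	/* record kernel frames only */
	};
	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);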
diff --git a/kernel/events/core.c b/kernel/events/core.c index 7fee567153f0..cda3ebd49e86 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c | |||
| @@ -36,6 +36,7 @@ | |||
| 36 | #include <linux/perf_event.h> | 36 | #include <linux/perf_event.h> |
| 37 | #include <linux/ftrace_event.h> | 37 | #include <linux/ftrace_event.h> |
| 38 | #include <linux/hw_breakpoint.h> | 38 | #include <linux/hw_breakpoint.h> |
| 39 | #include <linux/mm_types.h> | ||
| 39 | 40 | ||
| 40 | #include "internal.h" | 41 | #include "internal.h" |
| 41 | 42 | ||
| @@ -467,14 +468,13 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
| 467 | { | 468 | { |
| 468 | struct perf_cgroup *cgrp; | 469 | struct perf_cgroup *cgrp; |
| 469 | struct cgroup_subsys_state *css; | 470 | struct cgroup_subsys_state *css; |
| 470 | struct file *file; | 471 | struct fd f = fdget(fd); |
| 471 | int ret = 0, fput_needed; | 472 | int ret = 0; |
| 472 | 473 | ||
| 473 | file = fget_light(fd, &fput_needed); | 474 | if (!f.file) |
| 474 | if (!file) | ||
| 475 | return -EBADF; | 475 | return -EBADF; |
| 476 | 476 | ||
| 477 | css = cgroup_css_from_dir(file, perf_subsys_id); | 477 | css = cgroup_css_from_dir(f.file, perf_subsys_id); |
| 478 | if (IS_ERR(css)) { | 478 | if (IS_ERR(css)) { |
| 479 | ret = PTR_ERR(css); | 479 | ret = PTR_ERR(css); |
| 480 | goto out; | 480 | goto out; |
| @@ -500,7 +500,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, | |||
| 500 | ret = -EINVAL; | 500 | ret = -EINVAL; |
| 501 | } | 501 | } |
| 502 | out: | 502 | out: |
| 503 | fput_light(file, fput_needed); | 503 | fdput(f); |
| 504 | return ret; | 504 | return ret; |
| 505 | } | 505 | } |
| 506 | 506 | ||
| @@ -3233,21 +3233,18 @@ unlock: | |||
| 3233 | 3233 | ||
| 3234 | static const struct file_operations perf_fops; | 3234 | static const struct file_operations perf_fops; |
| 3235 | 3235 | ||
| 3236 | static struct file *perf_fget_light(int fd, int *fput_needed) | 3236 | static inline int perf_fget_light(int fd, struct fd *p) |
| 3237 | { | 3237 | { |
| 3238 | struct file *file; | 3238 | struct fd f = fdget(fd); |
| 3239 | 3239 | if (!f.file) | |
| 3240 | file = fget_light(fd, fput_needed); | 3240 | return -EBADF; |
| 3241 | if (!file) | ||
| 3242 | return ERR_PTR(-EBADF); | ||
| 3243 | 3241 | ||
| 3244 | if (file->f_op != &perf_fops) { | 3242 | if (f.file->f_op != &perf_fops) { |
| 3245 | fput_light(file, *fput_needed); | 3243 | fdput(f); |
| 3246 | *fput_needed = 0; | 3244 | return -EBADF; |
| 3247 | return ERR_PTR(-EBADF); | ||
| 3248 | } | 3245 | } |
| 3249 | 3246 | *p = f; | |
| 3250 | return file; | 3247 | return 0; |
| 3251 | } | 3248 | } |
| 3252 | 3249 | ||
| 3253 | static int perf_event_set_output(struct perf_event *event, | 3250 | static int perf_event_set_output(struct perf_event *event, |
| @@ -3279,22 +3276,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) | |||
| 3279 | 3276 | ||
| 3280 | case PERF_EVENT_IOC_SET_OUTPUT: | 3277 | case PERF_EVENT_IOC_SET_OUTPUT: |
| 3281 | { | 3278 | { |
| 3282 | struct file *output_file = NULL; | ||
| 3283 | struct perf_event *output_event = NULL; | ||
| 3284 | int fput_needed = 0; | ||
| 3285 | int ret; | 3279 | int ret; |
| 3286 | |||
| 3287 | if (arg != -1) { | 3280 | if (arg != -1) { |
| 3288 | output_file = perf_fget_light(arg, &fput_needed); | 3281 | struct perf_event *output_event; |
| 3289 | if (IS_ERR(output_file)) | 3282 | struct fd output; |
| 3290 | return PTR_ERR(output_file); | 3283 | ret = perf_fget_light(arg, &output); |
| 3291 | output_event = output_file->private_data; | 3284 | if (ret) |
| 3285 | return ret; | ||
| 3286 | output_event = output.file->private_data; | ||
| 3287 | ret = perf_event_set_output(event, output_event); | ||
| 3288 | fdput(output); | ||
| 3289 | } else { | ||
| 3290 | ret = perf_event_set_output(event, NULL); | ||
| 3292 | } | 3291 | } |
| 3293 | |||
| 3294 | ret = perf_event_set_output(event, output_event); | ||
| 3295 | if (output_event) | ||
| 3296 | fput_light(output_file, fput_needed); | ||
| 3297 | |||
| 3298 | return ret; | 3292 | return ret; |
| 3299 | } | 3293 | } |
| 3300 | 3294 | ||
| @@ -3677,7 +3671,7 @@ unlock: | |||
| 3677 | atomic_inc(&event->mmap_count); | 3671 | atomic_inc(&event->mmap_count); |
| 3678 | mutex_unlock(&event->mmap_mutex); | 3672 | mutex_unlock(&event->mmap_mutex); |
| 3679 | 3673 | ||
| 3680 | vma->vm_flags |= VM_RESERVED; | 3674 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; |
| 3681 | vma->vm_ops = &perf_mmap_vmops; | 3675 | vma->vm_ops = &perf_mmap_vmops; |
| 3682 | 3676 | ||
| 3683 | return ret; | 3677 | return ret; |
| @@ -3764,6 +3758,132 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) | |||
| 3764 | } | 3758 | } |
| 3765 | EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); | 3759 | EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); |
| 3766 | 3760 | ||
| 3761 | static void | ||
| 3762 | perf_output_sample_regs(struct perf_output_handle *handle, | ||
| 3763 | struct pt_regs *regs, u64 mask) | ||
| 3764 | { | ||
| 3765 | int bit; | ||
| 3766 | |||
| 3767 | for_each_set_bit(bit, (const unsigned long *) &mask, | ||
| 3768 | sizeof(mask) * BITS_PER_BYTE) { | ||
| 3769 | u64 val; | ||
| 3770 | |||
| 3771 | val = perf_reg_value(regs, bit); | ||
| 3772 | perf_output_put(handle, val); | ||
| 3773 | } | ||
| 3774 | } | ||
| 3775 | |||
| 3776 | static void perf_sample_regs_user(struct perf_regs_user *regs_user, | ||
| 3777 | struct pt_regs *regs) | ||
| 3778 | { | ||
| 3779 | if (!user_mode(regs)) { | ||
| 3780 | if (current->mm) | ||
| 3781 | regs = task_pt_regs(current); | ||
| 3782 | else | ||
| 3783 | regs = NULL; | ||
| 3784 | } | ||
| 3785 | |||
| 3786 | if (regs) { | ||
| 3787 | regs_user->regs = regs; | ||
| 3788 | regs_user->abi = perf_reg_abi(current); | ||
| 3789 | } | ||
| 3790 | } | ||
| 3791 | |||
| 3792 | /* | ||
| 3793 | * Get remaining task size from user stack pointer. | ||
| 3794 | * | ||
| 3795 | * It'd be better to take stack vma map and limit this more | ||
| 3796 | * precisly, but there's no way to get it safely under interrupt, | ||
| 3797 | * so using TASK_SIZE as limit. | ||
| 3798 | */ | ||
| 3799 | static u64 perf_ustack_task_size(struct pt_regs *regs) | ||
| 3800 | { | ||
| 3801 | unsigned long addr = perf_user_stack_pointer(regs); | ||
| 3802 | |||
| 3803 | if (!addr || addr >= TASK_SIZE) | ||
| 3804 | return 0; | ||
| 3805 | |||
| 3806 | return TASK_SIZE - addr; | ||
| 3807 | } | ||
| 3808 | |||
| 3809 | static u16 | ||
| 3810 | perf_sample_ustack_size(u16 stack_size, u16 header_size, | ||
| 3811 | struct pt_regs *regs) | ||
| 3812 | { | ||
| 3813 | u64 task_size; | ||
| 3814 | |||
| 3815 | /* No regs, no stack pointer, no dump. */ | ||
| 3816 | if (!regs) | ||
| 3817 | return 0; | ||
| 3818 | |||
| 3819 | /* | ||
| 3820 | * Check if we fit in with the requested stack size into the: | ||
| 3821 | * - TASK_SIZE | ||
| 3822 | * If we don't, we limit the size to the TASK_SIZE. | ||
| 3823 | * | ||
| 3824 | * - remaining sample size | ||
| 3825 | * If we don't, we customize the stack size to | ||
| 3826 | * fit in to the remaining sample size. | ||
| 3827 | */ | ||
| 3828 | |||
| 3829 | task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs)); | ||
| 3830 | stack_size = min(stack_size, (u16) task_size); | ||
| 3831 | |||
| 3832 | /* Current header size plus static size and dynamic size. */ | ||
| 3833 | header_size += 2 * sizeof(u64); | ||
| 3834 | |||
| 3835 | /* Do we fit in with the current stack dump size? */ | ||
| 3836 | if ((u16) (header_size + stack_size) < header_size) { | ||
| 3837 | /* | ||
| 3838 | * If we overflow the maximum size for the sample, | ||
| 3839 | * we customize the stack dump size to fit in. | ||
| 3840 | */ | ||
| 3841 | stack_size = USHRT_MAX - header_size - sizeof(u64); | ||
| 3842 | stack_size = round_up(stack_size, sizeof(u64)); | ||
| 3843 | } | ||
| 3844 | |||
| 3845 | return stack_size; | ||
| 3846 | } | ||
| 3847 | |||
| 3848 | static void | ||
| 3849 | perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size, | ||
| 3850 | struct pt_regs *regs) | ||
| 3851 | { | ||
| 3852 | /* Case of a kernel thread, nothing to dump */ | ||
| 3853 | if (!regs) { | ||
| 3854 | u64 size = 0; | ||
| 3855 | perf_output_put(handle, size); | ||
| 3856 | } else { | ||
| 3857 | unsigned long sp; | ||
| 3858 | unsigned int rem; | ||
| 3859 | u64 dyn_size; | ||
| 3860 | |||
| 3861 | /* | ||
| 3862 | * We dump: | ||
| 3863 | * static size | ||
| 3864 | * - the size requested by user or the best one we can fit | ||
| 3865 | * in to the sample max size | ||
| 3866 | * data | ||
| 3867 | * - user stack dump data | ||
| 3868 | * dynamic size | ||
| 3869 | * - the actual dumped size | ||
| 3870 | */ | ||
| 3871 | |||
| 3872 | /* Static size. */ | ||
| 3873 | perf_output_put(handle, dump_size); | ||
| 3874 | |||
| 3875 | /* Data. */ | ||
| 3876 | sp = perf_user_stack_pointer(regs); | ||
| 3877 | rem = __output_copy_user(handle, (void *) sp, dump_size); | ||
| 3878 | dyn_size = dump_size - rem; | ||
| 3879 | |||
| 3880 | perf_output_skip(handle, rem); | ||
| 3881 | |||
| 3882 | /* Dynamic size. */ | ||
| 3883 | perf_output_put(handle, dyn_size); | ||
| 3884 | } | ||
| 3885 | } | ||
| 3886 | |||
| 3767 | static void __perf_event_header__init_id(struct perf_event_header *header, | 3887 | static void __perf_event_header__init_id(struct perf_event_header *header, |
| 3768 | struct perf_sample_data *data, | 3888 | struct perf_sample_data *data, |
| 3769 | struct perf_event *event) | 3889 | struct perf_event *event) |
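A small worked example of the record layout produced by perf_output_sample_ustack() above, with invented numbers: if 512 bytes were reserved but only 480 could be copied from the user stack, the sample carries

	u64 size     = 512;	/* static size: bytes reserved for the dump */
	u8  data[512];		/* 480 bytes copied, remaining 32 skipped   */
	u64 dyn_size = 480;	/* dynamic size: bytes actually copied      */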
| @@ -4024,6 +4144,28 @@ void perf_output_sample(struct perf_output_handle *handle, | |||
| 4024 | perf_output_put(handle, nr); | 4144 | perf_output_put(handle, nr); |
| 4025 | } | 4145 | } |
| 4026 | } | 4146 | } |
| 4147 | |||
| 4148 | if (sample_type & PERF_SAMPLE_REGS_USER) { | ||
| 4149 | u64 abi = data->regs_user.abi; | ||
| 4150 | |||
| 4151 | /* | ||
| 4152 | * If there are no regs to dump, notice it through | ||
| 4153 | * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE). | ||
| 4154 | */ | ||
| 4155 | perf_output_put(handle, abi); | ||
| 4156 | |||
| 4157 | if (abi) { | ||
| 4158 | u64 mask = event->attr.sample_regs_user; | ||
| 4159 | perf_output_sample_regs(handle, | ||
| 4160 | data->regs_user.regs, | ||
| 4161 | mask); | ||
| 4162 | } | ||
| 4163 | } | ||
| 4164 | |||
| 4165 | if (sample_type & PERF_SAMPLE_STACK_USER) | ||
| 4166 | perf_output_sample_ustack(handle, | ||
| 4167 | data->stack_user_size, | ||
| 4168 | data->regs_user.regs); | ||
| 4027 | } | 4169 | } |
| 4028 | 4170 | ||
| 4029 | void perf_prepare_sample(struct perf_event_header *header, | 4171 | void perf_prepare_sample(struct perf_event_header *header, |
| @@ -4075,6 +4217,49 @@ void perf_prepare_sample(struct perf_event_header *header, | |||
| 4075 | } | 4217 | } |
| 4076 | header->size += size; | 4218 | header->size += size; |
| 4077 | } | 4219 | } |
| 4220 | |||
| 4221 | if (sample_type & PERF_SAMPLE_REGS_USER) { | ||
| 4222 | /* regs dump ABI info */ | ||
| 4223 | int size = sizeof(u64); | ||
| 4224 | |||
| 4225 | perf_sample_regs_user(&data->regs_user, regs); | ||
| 4226 | |||
| 4227 | if (data->regs_user.regs) { | ||
| 4228 | u64 mask = event->attr.sample_regs_user; | ||
| 4229 | size += hweight64(mask) * sizeof(u64); | ||
| 4230 | } | ||
| 4231 | |||
| 4232 | header->size += size; | ||
| 4233 | } | ||
| 4234 | |||
| 4235 | if (sample_type & PERF_SAMPLE_STACK_USER) { | ||
| 4236 | /* | ||
| 4237 | * Either we need PERF_SAMPLE_STACK_USER bit to be always | ||
| 4238 | * processed as the last one or have additional check added | ||
| 4239 | * in case new sample type is added, because we could eat | ||
| 4240 | * up the rest of the sample size. | ||
| 4241 | */ | ||
| 4242 | struct perf_regs_user *uregs = &data->regs_user; | ||
| 4243 | u16 stack_size = event->attr.sample_stack_user; | ||
| 4244 | u16 size = sizeof(u64); | ||
| 4245 | |||
| 4246 | if (!uregs->abi) | ||
| 4247 | perf_sample_regs_user(uregs, regs); | ||
| 4248 | |||
| 4249 | stack_size = perf_sample_ustack_size(stack_size, header->size, | ||
| 4250 | uregs->regs); | ||
| 4251 | |||
| 4252 | /* | ||
| 4253 | * If there is something to dump, add space for the dump | ||
| 4254 | * itself and for the field that tells the dynamic size, | ||
| 4255 | * which is how many have been actually dumped. | ||
| 4256 | */ | ||
| 4257 | if (stack_size) | ||
| 4258 | size += sizeof(u64) + stack_size; | ||
| 4259 | |||
| 4260 | data->stack_user_size = stack_size; | ||
| 4261 | header->size += size; | ||
| 4262 | } | ||
| 4078 | } | 4263 | } |
| 4079 | 4264 | ||
| 4080 | static void perf_event_output(struct perf_event *event, | 4265 | static void perf_event_output(struct perf_event *event, |
| @@ -6151,6 +6336,28 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
| 6151 | attr->branch_sample_type = mask; | 6336 | attr->branch_sample_type = mask; |
| 6152 | } | 6337 | } |
| 6153 | } | 6338 | } |
| 6339 | |||
| 6340 | if (attr->sample_type & PERF_SAMPLE_REGS_USER) { | ||
| 6341 | ret = perf_reg_validate(attr->sample_regs_user); | ||
| 6342 | if (ret) | ||
| 6343 | return ret; | ||
| 6344 | } | ||
| 6345 | |||
| 6346 | if (attr->sample_type & PERF_SAMPLE_STACK_USER) { | ||
| 6347 | if (!arch_perf_have_user_stack_dump()) | ||
| 6348 | return -ENOSYS; | ||
| 6349 | |||
| 6350 | /* | ||
| 6351 | * We have __u32 type for the size, but so far | ||
| 6352 | * we can only use __u16 as maximum due to the | ||
| 6353 | * __u16 sample size limit. | ||
| 6354 | */ | ||
| 6355 | if (attr->sample_stack_user >= USHRT_MAX) | ||
| 6356 | ret = -EINVAL; | ||
| 6357 | else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64))) | ||
| 6358 | ret = -EINVAL; | ||
| 6359 | } | ||
| 6360 | |||
| 6154 | out: | 6361 | out: |
| 6155 | return ret; | 6362 | return ret; |
| 6156 | 6363 | ||
| @@ -6229,12 +6436,11 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 6229 | struct perf_event_attr attr; | 6436 | struct perf_event_attr attr; |
| 6230 | struct perf_event_context *ctx; | 6437 | struct perf_event_context *ctx; |
| 6231 | struct file *event_file = NULL; | 6438 | struct file *event_file = NULL; |
| 6232 | struct file *group_file = NULL; | 6439 | struct fd group = {NULL, 0}; |
| 6233 | struct task_struct *task = NULL; | 6440 | struct task_struct *task = NULL; |
| 6234 | struct pmu *pmu; | 6441 | struct pmu *pmu; |
| 6235 | int event_fd; | 6442 | int event_fd; |
| 6236 | int move_group = 0; | 6443 | int move_group = 0; |
| 6237 | int fput_needed = 0; | ||
| 6238 | int err; | 6444 | int err; |
| 6239 | 6445 | ||
| 6240 | /* for future expandability... */ | 6446 | /* for future expandability... */ |
| @@ -6264,17 +6470,15 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 6264 | if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) | 6470 | if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) |
| 6265 | return -EINVAL; | 6471 | return -EINVAL; |
| 6266 | 6472 | ||
| 6267 | event_fd = get_unused_fd_flags(O_RDWR); | 6473 | event_fd = get_unused_fd(); |
| 6268 | if (event_fd < 0) | 6474 | if (event_fd < 0) |
| 6269 | return event_fd; | 6475 | return event_fd; |
| 6270 | 6476 | ||
| 6271 | if (group_fd != -1) { | 6477 | if (group_fd != -1) { |
| 6272 | group_file = perf_fget_light(group_fd, &fput_needed); | 6478 | err = perf_fget_light(group_fd, &group); |
| 6273 | if (IS_ERR(group_file)) { | 6479 | if (err) |
| 6274 | err = PTR_ERR(group_file); | ||
| 6275 | goto err_fd; | 6480 | goto err_fd; |
| 6276 | } | 6481 | group_leader = group.file->private_data; |
| 6277 | group_leader = group_file->private_data; | ||
| 6278 | if (flags & PERF_FLAG_FD_OUTPUT) | 6482 | if (flags & PERF_FLAG_FD_OUTPUT) |
| 6279 | output_event = group_leader; | 6483 | output_event = group_leader; |
| 6280 | if (flags & PERF_FLAG_FD_NO_GROUP) | 6484 | if (flags & PERF_FLAG_FD_NO_GROUP) |
| @@ -6450,7 +6654,7 @@ SYSCALL_DEFINE5(perf_event_open, | |||
| 6450 | * of the group leader will find the pointer to itself in | 6654 | * of the group leader will find the pointer to itself in |
| 6451 | * perf_group_detach(). | 6655 | * perf_group_detach(). |
| 6452 | */ | 6656 | */ |
| 6453 | fput_light(group_file, fput_needed); | 6657 | fdput(group); |
| 6454 | fd_install(event_fd, event_file); | 6658 | fd_install(event_fd, event_file); |
| 6455 | return event_fd; | 6659 | return event_fd; |
| 6456 | 6660 | ||
| @@ -6464,7 +6668,7 @@ err_task: | |||
| 6464 | if (task) | 6668 | if (task) |
| 6465 | put_task_struct(task); | 6669 | put_task_struct(task); |
| 6466 | err_group_fd: | 6670 | err_group_fd: |
| 6467 | fput_light(group_file, fput_needed); | 6671 | fdput(group); |
| 6468 | err_fd: | 6672 | err_fd: |
| 6469 | put_unused_fd(event_fd); | 6673 | put_unused_fd(event_fd); |
| 6470 | return err; | 6674 | return err; |
| @@ -7289,5 +7493,12 @@ struct cgroup_subsys perf_subsys = { | |||
| 7289 | .destroy = perf_cgroup_destroy, | 7493 | .destroy = perf_cgroup_destroy, |
| 7290 | .exit = perf_cgroup_exit, | 7494 | .exit = perf_cgroup_exit, |
| 7291 | .attach = perf_cgroup_attach, | 7495 | .attach = perf_cgroup_attach, |
| 7496 | |||
| 7497 | /* | ||
| 7498 | * perf_event cgroup doesn't handle nesting correctly. | ||
| 7499 | * ctx->nr_cgroups adjustments should be propagated through the | ||
| 7500 | * cgroup hierarchy. Fix it and remove the following. | ||
| 7501 | */ | ||
| 7502 | .broken_hierarchy = true, | ||
| 7292 | }; | 7503 | }; |
| 7293 | #endif /* CONFIG_CGROUP_PERF */ | 7504 | #endif /* CONFIG_CGROUP_PERF */ |
diff --git a/kernel/events/internal.h b/kernel/events/internal.h index a096c19f2c2a..d56a64c99a8b 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h | |||
| @@ -2,6 +2,7 @@ | |||
| 2 | #define _KERNEL_EVENTS_INTERNAL_H | 2 | #define _KERNEL_EVENTS_INTERNAL_H |
| 3 | 3 | ||
| 4 | #include <linux/hardirq.h> | 4 | #include <linux/hardirq.h> |
| 5 | #include <linux/uaccess.h> | ||
| 5 | 6 | ||
| 6 | /* Buffer handling */ | 7 | /* Buffer handling */ |
| 7 | 8 | ||
| @@ -76,30 +77,53 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) | |||
| 76 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); | 77 | return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); |
| 77 | } | 78 | } |
| 78 | 79 | ||
| 79 | static inline void | 80 | #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ |
| 80 | __output_copy(struct perf_output_handle *handle, | 81 | static inline unsigned int \ |
| 81 | const void *buf, unsigned int len) | 82 | func_name(struct perf_output_handle *handle, \ |
| 83 | const void *buf, unsigned int len) \ | ||
| 84 | { \ | ||
| 85 | unsigned long size, written; \ | ||
| 86 | \ | ||
| 87 | do { \ | ||
| 88 | size = min_t(unsigned long, handle->size, len); \ | ||
| 89 | \ | ||
| 90 | written = memcpy_func(handle->addr, buf, size); \ | ||
| 91 | \ | ||
| 92 | len -= written; \ | ||
| 93 | handle->addr += written; \ | ||
| 94 | buf += written; \ | ||
| 95 | handle->size -= written; \ | ||
| 96 | if (!handle->size) { \ | ||
| 97 | struct ring_buffer *rb = handle->rb; \ | ||
| 98 | \ | ||
| 99 | handle->page++; \ | ||
| 100 | handle->page &= rb->nr_pages - 1; \ | ||
| 101 | handle->addr = rb->data_pages[handle->page]; \ | ||
| 102 | handle->size = PAGE_SIZE << page_order(rb); \ | ||
| 103 | } \ | ||
| 104 | } while (len && written == size); \ | ||
| 105 | \ | ||
| 106 | return len; \ | ||
| 107 | } | ||
| 108 | |||
| 109 | static inline int memcpy_common(void *dst, const void *src, size_t n) | ||
| 82 | { | 110 | { |
| 83 | do { | 111 | memcpy(dst, src, n); |
| 84 | unsigned long size = min_t(unsigned long, handle->size, len); | 112 | return n; |
| 85 | |||
| 86 | memcpy(handle->addr, buf, size); | ||
| 87 | |||
| 88 | len -= size; | ||
| 89 | handle->addr += size; | ||
| 90 | buf += size; | ||
| 91 | handle->size -= size; | ||
| 92 | if (!handle->size) { | ||
| 93 | struct ring_buffer *rb = handle->rb; | ||
| 94 | |||
| 95 | handle->page++; | ||
| 96 | handle->page &= rb->nr_pages - 1; | ||
| 97 | handle->addr = rb->data_pages[handle->page]; | ||
| 98 | handle->size = PAGE_SIZE << page_order(rb); | ||
| 99 | } | ||
| 100 | } while (len); | ||
| 101 | } | 113 | } |
| 102 | 114 | ||
| 115 | DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) | ||
| 116 | |||
| 117 | #define MEMCPY_SKIP(dst, src, n) (n) | ||
| 118 | |||
| 119 | DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) | ||
| 120 | |||
| 121 | #ifndef arch_perf_out_copy_user | ||
| 122 | #define arch_perf_out_copy_user __copy_from_user_inatomic | ||
| 123 | #endif | ||
| 124 | |||
| 125 | DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) | ||
| 126 | |||
| 103 | /* Callchain handling */ | 127 | /* Callchain handling */ |
| 104 | extern struct perf_callchain_entry * | 128 | extern struct perf_callchain_entry * |
| 105 | perf_callchain(struct perf_event *event, struct pt_regs *regs); | 129 | perf_callchain(struct perf_event *event, struct pt_regs *regs); |
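Conceptually, DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) generates the same loop with the copy step replaced by a no-op that reports n bytes "written", so only the handle is advanced; a rough sketch of one iteration of the generated body:

	size = min_t(unsigned long, handle->size, len);
	written = size;			/* MEMCPY_SKIP(dst, src, n) evaluates to n */
	len -= written;
	handle->addr += written;
	handle->size -= written;
	/* page-advance logic identical to __output_copy() */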
| @@ -134,4 +158,20 @@ static inline void put_recursion_context(int *recursion, int rctx) | |||
| 134 | recursion[rctx]--; | 158 | recursion[rctx]--; |
| 135 | } | 159 | } |
| 136 | 160 | ||
| 161 | #ifdef CONFIG_HAVE_PERF_USER_STACK_DUMP | ||
| 162 | static inline bool arch_perf_have_user_stack_dump(void) | ||
| 163 | { | ||
| 164 | return true; | ||
| 165 | } | ||
| 166 | |||
| 167 | #define perf_user_stack_pointer(regs) user_stack_pointer(regs) | ||
| 168 | #else | ||
| 169 | static inline bool arch_perf_have_user_stack_dump(void) | ||
| 170 | { | ||
| 171 | return false; | ||
| 172 | } | ||
| 173 | |||
| 174 | #define perf_user_stack_pointer(regs) 0 | ||
| 175 | #endif /* CONFIG_HAVE_PERF_USER_STACK_DUMP */ | ||
| 176 | |||
| 137 | #endif /* _KERNEL_EVENTS_INTERNAL_H */ | 177 | #endif /* _KERNEL_EVENTS_INTERNAL_H */ |
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6ddaba43fb7a..23cb34ff3973 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c | |||
| @@ -182,10 +182,16 @@ out: | |||
| 182 | return -ENOSPC; | 182 | return -ENOSPC; |
| 183 | } | 183 | } |
| 184 | 184 | ||
| 185 | void perf_output_copy(struct perf_output_handle *handle, | 185 | unsigned int perf_output_copy(struct perf_output_handle *handle, |
| 186 | const void *buf, unsigned int len) | 186 | const void *buf, unsigned int len) |
| 187 | { | 187 | { |
| 188 | __output_copy(handle, buf, len); | 188 | return __output_copy(handle, buf, len); |
| 189 | } | ||
| 190 | |||
| 191 | unsigned int perf_output_skip(struct perf_output_handle *handle, | ||
| 192 | unsigned int len) | ||
| 193 | { | ||
| 194 | return __output_skip(handle, NULL, len); | ||
| 189 | } | 195 | } |
| 190 | 196 | ||
| 191 | void perf_output_end(struct perf_output_handle *handle) | 197 | void perf_output_end(struct perf_output_handle *handle) |
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index c08a22d02f72..98256bc71ee1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c | |||
| @@ -141,10 +141,14 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 141 | spinlock_t *ptl; | 141 | spinlock_t *ptl; |
| 142 | pte_t *ptep; | 142 | pte_t *ptep; |
| 143 | int err; | 143 | int err; |
| 144 | /* For mmu_notifiers */ | ||
| 145 | const unsigned long mmun_start = addr; | ||
| 146 | const unsigned long mmun_end = addr + PAGE_SIZE; | ||
| 144 | 147 | ||
| 145 | /* For try_to_free_swap() and munlock_vma_page() below */ | 148 | /* For try_to_free_swap() and munlock_vma_page() below */ |
| 146 | lock_page(page); | 149 | lock_page(page); |
| 147 | 150 | ||
| 151 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | ||
| 148 | err = -EAGAIN; | 152 | err = -EAGAIN; |
| 149 | ptep = page_check_address(page, mm, addr, &ptl, 0); | 153 | ptep = page_check_address(page, mm, addr, &ptl, 0); |
| 150 | if (!ptep) | 154 | if (!ptep) |
| @@ -173,6 +177,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, | |||
| 173 | 177 | ||
| 174 | err = 0; | 178 | err = 0; |
| 175 | unlock: | 179 | unlock: |
| 180 | mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); | ||
| 176 | unlock_page(page); | 181 | unlock_page(page); |
| 177 | return err; | 182 | return err; |
| 178 | } | 183 | } |
| @@ -280,12 +285,10 @@ static int read_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_ | |||
| 280 | if (ret <= 0) | 285 | if (ret <= 0) |
| 281 | return ret; | 286 | return ret; |
| 282 | 287 | ||
| 283 | lock_page(page); | ||
| 284 | vaddr_new = kmap_atomic(page); | 288 | vaddr_new = kmap_atomic(page); |
| 285 | vaddr &= ~PAGE_MASK; | 289 | vaddr &= ~PAGE_MASK; |
| 286 | memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); | 290 | memcpy(opcode, vaddr_new + vaddr, UPROBE_SWBP_INSN_SIZE); |
| 287 | kunmap_atomic(vaddr_new); | 291 | kunmap_atomic(vaddr_new); |
| 288 | unlock_page(page); | ||
| 289 | 292 | ||
| 290 | put_page(page); | 293 | put_page(page); |
| 291 | 294 | ||
| @@ -334,7 +337,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned | |||
| 334 | */ | 337 | */ |
| 335 | result = is_swbp_at_addr(mm, vaddr); | 338 | result = is_swbp_at_addr(mm, vaddr); |
| 336 | if (result == 1) | 339 | if (result == 1) |
| 337 | return -EEXIST; | 340 | return 0; |
| 338 | 341 | ||
| 339 | if (result) | 342 | if (result) |
| 340 | return result; | 343 | return result; |
| @@ -347,24 +350,22 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned | |||
| 347 | * @mm: the probed process address space. | 350 | * @mm: the probed process address space. |
| 348 | * @auprobe: arch specific probepoint information. | 351 | * @auprobe: arch specific probepoint information. |
| 349 | * @vaddr: the virtual address to insert the opcode. | 352 | * @vaddr: the virtual address to insert the opcode. |
| 350 | * @verify: if true, verify existance of breakpoint instruction. | ||
| 351 | * | 353 | * |
| 352 | * For mm @mm, restore the original opcode (opcode) at @vaddr. | 354 | * For mm @mm, restore the original opcode (opcode) at @vaddr. |
| 353 | * Return 0 (success) or a negative errno. | 355 | * Return 0 (success) or a negative errno. |
| 354 | */ | 356 | */ |
| 355 | int __weak | 357 | int __weak |
| 356 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr, bool verify) | 358 | set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) |
| 357 | { | 359 | { |
| 358 | if (verify) { | 360 | int result; |
| 359 | int result; | 361 | |
| 362 | result = is_swbp_at_addr(mm, vaddr); | ||
| 363 | if (!result) | ||
| 364 | return -EINVAL; | ||
| 360 | 365 | ||
| 361 | result = is_swbp_at_addr(mm, vaddr); | 366 | if (result != 1) |
| 362 | if (!result) | 367 | return result; |
| 363 | return -EINVAL; | ||
| 364 | 368 | ||
| 365 | if (result != 1) | ||
| 366 | return result; | ||
| 367 | } | ||
| 368 | return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); | 369 | return write_opcode(auprobe, mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); |
| 369 | } | 370 | } |
| 370 | 371 | ||
| @@ -415,11 +416,10 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) | |||
| 415 | static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) | 416 | static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) |
| 416 | { | 417 | { |
| 417 | struct uprobe *uprobe; | 418 | struct uprobe *uprobe; |
| 418 | unsigned long flags; | ||
| 419 | 419 | ||
| 420 | spin_lock_irqsave(&uprobes_treelock, flags); | 420 | spin_lock(&uprobes_treelock); |
| 421 | uprobe = __find_uprobe(inode, offset); | 421 | uprobe = __find_uprobe(inode, offset); |
| 422 | spin_unlock_irqrestore(&uprobes_treelock, flags); | 422 | spin_unlock(&uprobes_treelock); |
| 423 | 423 | ||
| 424 | return uprobe; | 424 | return uprobe; |
| 425 | } | 425 | } |
| @@ -466,12 +466,11 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe) | |||
| 466 | */ | 466 | */ |
| 467 | static struct uprobe *insert_uprobe(struct uprobe *uprobe) | 467 | static struct uprobe *insert_uprobe(struct uprobe *uprobe) |
| 468 | { | 468 | { |
| 469 | unsigned long flags; | ||
| 470 | struct uprobe *u; | 469 | struct uprobe *u; |
| 471 | 470 | ||
| 472 | spin_lock_irqsave(&uprobes_treelock, flags); | 471 | spin_lock(&uprobes_treelock); |
| 473 | u = __insert_uprobe(uprobe); | 472 | u = __insert_uprobe(uprobe); |
| 474 | spin_unlock_irqrestore(&uprobes_treelock, flags); | 473 | spin_unlock(&uprobes_treelock); |
| 475 | 474 | ||
| 476 | /* For now assume that the instruction need not be single-stepped */ | 475 | /* For now assume that the instruction need not be single-stepped */ |
| 477 | uprobe->flags |= UPROBE_SKIP_SSTEP; | 476 | uprobe->flags |= UPROBE_SKIP_SSTEP; |
| @@ -649,6 +648,7 @@ static int | |||
| 649 | install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | 648 | install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, |
| 650 | struct vm_area_struct *vma, unsigned long vaddr) | 649 | struct vm_area_struct *vma, unsigned long vaddr) |
| 651 | { | 650 | { |
| 651 | bool first_uprobe; | ||
| 652 | int ret; | 652 | int ret; |
| 653 | 653 | ||
| 654 | /* | 654 | /* |
| @@ -659,7 +659,7 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | |||
| 659 | * Hence behave as if probe already existed. | 659 | * Hence behave as if probe already existed. |
| 660 | */ | 660 | */ |
| 661 | if (!uprobe->consumers) | 661 | if (!uprobe->consumers) |
| 662 | return -EEXIST; | 662 | return 0; |
| 663 | 663 | ||
| 664 | if (!(uprobe->flags & UPROBE_COPY_INSN)) { | 664 | if (!(uprobe->flags & UPROBE_COPY_INSN)) { |
| 665 | ret = copy_insn(uprobe, vma->vm_file); | 665 | ret = copy_insn(uprobe, vma->vm_file); |
| @@ -681,17 +681,18 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | |||
| 681 | } | 681 | } |
| 682 | 682 | ||
| 683 | /* | 683 | /* |
| 684 | * Ideally, should be updating the probe count after the breakpoint | 684 | * set MMF_HAS_UPROBES in advance for uprobe_pre_sstep_notifier(), |
| 685 | * has been successfully inserted. However a thread could hit the | 685 | * the task can hit this breakpoint right after __replace_page(). |
| 686 | * breakpoint we just inserted even before the probe count is | ||
| 687 | * incremented. If this is the first breakpoint placed, breakpoint | ||
| 688 | * notifier might ignore uprobes and pass the trap to the thread. | ||
| 689 | * Hence increment before and decrement on failure. | ||
| 690 | */ | 686 | */ |
| 691 | atomic_inc(&mm->uprobes_state.count); | 687 | first_uprobe = !test_bit(MMF_HAS_UPROBES, &mm->flags); |
| 688 | if (first_uprobe) | ||
| 689 | set_bit(MMF_HAS_UPROBES, &mm->flags); | ||
| 690 | |||
| 692 | ret = set_swbp(&uprobe->arch, mm, vaddr); | 691 | ret = set_swbp(&uprobe->arch, mm, vaddr); |
| 693 | if (ret) | 692 | if (!ret) |
| 694 | atomic_dec(&mm->uprobes_state.count); | 693 | clear_bit(MMF_RECALC_UPROBES, &mm->flags); |
| 694 | else if (first_uprobe) | ||
| 695 | clear_bit(MMF_HAS_UPROBES, &mm->flags); | ||
| 695 | 696 | ||
| 696 | return ret; | 697 | return ret; |
| 697 | } | 698 | } |
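The flag set above exists so that the trap path can stay cheap for address spaces without probes; a hedged sketch of the consumer side (the real check lives in uprobe_pre_sstep_notifier(), which is not part of this hunk):

	if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
		return 0;	/* not ours, let other notifiers handle the trap */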
| @@ -699,8 +700,12 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, | |||
| 699 | static void | 700 | static void |
| 700 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) | 701 | remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) |
| 701 | { | 702 | { |
| 702 | if (!set_orig_insn(&uprobe->arch, mm, vaddr, true)) | 703 | /* can happen if uprobe_register() fails */ |
| 703 | atomic_dec(&mm->uprobes_state.count); | 704 | if (!test_bit(MMF_HAS_UPROBES, &mm->flags)) |
| 705 | return; | ||
| 706 | |||
| 707 | set_bit(MMF_RECALC_UPROBES, &mm->flags); | ||
| 708 | set_orig_insn(&uprobe->arch, mm, vaddr); | ||
| 704 | } | 709 | } |
| 705 | 710 | ||
| 706 | /* | 711 | /* |
| @@ -710,11 +715,9 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad | |||
| 710 | */ | 715 | */ |
| 711 | static void delete_uprobe(struct uprobe *uprobe) | 716 | static void delete_uprobe(struct uprobe *uprobe) |
| 712 | { | 717 | { |
| 713 | unsigned long flags; | 718 | spin_lock(&uprobes_treelock); |
| 714 | |||
| 715 | spin_lock_irqsave(&uprobes_treelock, flags); | ||
| 716 | rb_erase(&uprobe->rb_node, &uprobes_tree); | 719 | rb_erase(&uprobe->rb_node, &uprobes_tree); |
| 717 | spin_unlock_irqrestore(&uprobes_treelock, flags); | 720 | spin_unlock(&uprobes_treelock); |
| 718 | iput(uprobe->inode); | 721 | iput(uprobe->inode); |
| 719 | put_uprobe(uprobe); | 722 | put_uprobe(uprobe); |
| 720 | atomic_dec(&uprobe_events); | 723 | atomic_dec(&uprobe_events); |
| @@ -737,7 +740,6 @@ static struct map_info * | |||
| 737 | build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | 740 | build_map_info(struct address_space *mapping, loff_t offset, bool is_register) |
| 738 | { | 741 | { |
| 739 | unsigned long pgoff = offset >> PAGE_SHIFT; | 742 | unsigned long pgoff = offset >> PAGE_SHIFT; |
| 740 | struct prio_tree_iter iter; | ||
| 741 | struct vm_area_struct *vma; | 743 | struct vm_area_struct *vma; |
| 742 | struct map_info *curr = NULL; | 744 | struct map_info *curr = NULL; |
| 743 | struct map_info *prev = NULL; | 745 | struct map_info *prev = NULL; |
| @@ -746,7 +748,7 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register) | |||
| 746 | 748 | ||
| 747 | again: | 749 | again: |
| 748 | mutex_lock(&mapping->i_mmap_mutex); | 750 | mutex_lock(&mapping->i_mmap_mutex); |
| 749 | vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { | 751 | vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { |
| 750 | if (!valid_vma(vma, is_register)) | 752 | if (!valid_vma(vma, is_register)) |
| 751 | continue; | 753 | continue; |
| 752 | 754 | ||
| @@ -831,17 +833,11 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register) | |||
| 831 | vaddr_to_offset(vma, info->vaddr) != uprobe->offset) | 833 | vaddr_to_offset(vma, info->vaddr) != uprobe->offset) |
| 832 | goto unlock; | 834 | goto unlock; |
| 833 | 835 | ||
| 834 | if (is_register) { | 836 | if (is_register) |
| 835 | err = install_breakpoint(uprobe, mm, vma, info->vaddr); | 837 | err = install_breakpoint(uprobe, mm, vma, info->vaddr); |
| 836 | /* | 838 | else |
| 837 | * We can race against uprobe_mmap(), see the | ||
| 838 | * comment near uprobe_hash(). | ||
| 839 | */ | ||
| 840 | if (err == -EEXIST) | ||
| 841 | err = 0; | ||
| 842 | } else { | ||
| 843 | remove_breakpoint(uprobe, mm, info->vaddr); | 839 | remove_breakpoint(uprobe, mm, info->vaddr); |
| 844 | } | 840 | |
| 845 | unlock: | 841 | unlock: |
| 846 | up_write(&mm->mmap_sem); | 842 | up_write(&mm->mmap_sem); |
| 847 | free: | 843 | free: |
| @@ -908,7 +904,8 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer * | |||
| 908 | } | 904 | } |
| 909 | 905 | ||
| 910 | mutex_unlock(uprobes_hash(inode)); | 906 | mutex_unlock(uprobes_hash(inode)); |
| 911 | put_uprobe(uprobe); | 907 | if (uprobe) |
| 908 | put_uprobe(uprobe); | ||
| 912 | 909 | ||
| 913 | return ret; | 910 | return ret; |
| 914 | } | 911 | } |
| @@ -978,7 +975,6 @@ static void build_probe_list(struct inode *inode, | |||
| 978 | struct list_head *head) | 975 | struct list_head *head) |
| 979 | { | 976 | { |
| 980 | loff_t min, max; | 977 | loff_t min, max; |
| 981 | unsigned long flags; | ||
| 982 | struct rb_node *n, *t; | 978 | struct rb_node *n, *t; |
| 983 | struct uprobe *u; | 979 | struct uprobe *u; |
| 984 | 980 | ||
| @@ -986,7 +982,7 @@ static void build_probe_list(struct inode *inode, | |||
| 986 | min = vaddr_to_offset(vma, start); | 982 | min = vaddr_to_offset(vma, start); |
| 987 | max = min + (end - start) - 1; | 983 | max = min + (end - start) - 1; |
| 988 | 984 | ||
| 989 | spin_lock_irqsave(&uprobes_treelock, flags); | 985 | spin_lock(&uprobes_treelock); |
| 990 | n = find_node_in_range(inode, min, max); | 986 | n = find_node_in_range(inode, min, max); |
| 991 | if (n) { | 987 | if (n) { |
| 992 | for (t = n; t; t = rb_prev(t)) { | 988 | for (t = n; t; t = rb_prev(t)) { |
| @@ -1004,27 +1000,20 @@ static void build_probe_list(struct inode *inode, | |||
| 1004 | atomic_inc(&u->ref); | 1000 | atomic_inc(&u->ref); |
| 1005 | } | 1001 | } |
| 1006 | } | 1002 | } |
| 1007 | spin_unlock_irqrestore(&uprobes_treelock, flags); | 1003 | spin_unlock(&uprobes_treelock); |
| 1008 | } | 1004 | } |
| 1009 | 1005 | ||
| 1010 | /* | 1006 | /* |
| 1011 | * Called from mmap_region. | 1007 | * Called from mmap_region/vma_adjust with mm->mmap_sem acquired. |
| 1012 | * called with mm->mmap_sem acquired. | ||
| 1013 | * | 1008 | * |
| 1014 | * Return -ve no if we fail to insert probes and we cannot | 1009 | * Currently we ignore all errors and always return 0, the callers |
| 1015 | * bail-out. | 1010 | * can't handle the failure anyway. |
| 1016 | * Return 0 otherwise. i.e: | ||
| 1017 | * | ||
| 1018 | * - successful insertion of probes | ||
| 1019 | * - (or) no possible probes to be inserted. | ||
| 1020 | * - (or) insertion of probes failed but we can bail-out. | ||
| 1021 | */ | 1011 | */ |
| 1022 | int uprobe_mmap(struct vm_area_struct *vma) | 1012 | int uprobe_mmap(struct vm_area_struct *vma) |
| 1023 | { | 1013 | { |
| 1024 | struct list_head tmp_list; | 1014 | struct list_head tmp_list; |
| 1025 | struct uprobe *uprobe, *u; | 1015 | struct uprobe *uprobe, *u; |
| 1026 | struct inode *inode; | 1016 | struct inode *inode; |
| 1027 | int ret, count; | ||
| 1028 | 1017 | ||
| 1029 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) | 1018 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) |
| 1030 | return 0; | 1019 | return 0; |
| @@ -1036,44 +1025,35 @@ int uprobe_mmap(struct vm_area_struct *vma) | |||
| 1036 | mutex_lock(uprobes_mmap_hash(inode)); | 1025 | mutex_lock(uprobes_mmap_hash(inode)); |
| 1037 | build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); | 1026 | build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); |
| 1038 | 1027 | ||
| 1039 | ret = 0; | ||
| 1040 | count = 0; | ||
| 1041 | |||
| 1042 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | 1028 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { |
| 1043 | if (!ret) { | 1029 | if (!fatal_signal_pending(current)) { |
| 1044 | unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); | 1030 | unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); |
| 1045 | 1031 | install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); | |
| 1046 | ret = install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); | ||
| 1047 | /* | ||
| 1048 | * We can race against uprobe_register(), see the | ||
| 1049 | * comment near uprobe_hash(). | ||
| 1050 | */ | ||
| 1051 | if (ret == -EEXIST) { | ||
| 1052 | ret = 0; | ||
| 1053 | |||
| 1054 | if (!is_swbp_at_addr(vma->vm_mm, vaddr)) | ||
| 1055 | continue; | ||
| 1056 | |||
| 1057 | /* | ||
| 1058 | * Unable to insert a breakpoint, but | ||
| 1059 | * breakpoint lies underneath. Increment the | ||
| 1060 | * probe count. | ||
| 1061 | */ | ||
| 1062 | atomic_inc(&vma->vm_mm->uprobes_state.count); | ||
| 1063 | } | ||
| 1064 | |||
| 1065 | if (!ret) | ||
| 1066 | count++; | ||
| 1067 | } | 1032 | } |
| 1068 | put_uprobe(uprobe); | 1033 | put_uprobe(uprobe); |
| 1069 | } | 1034 | } |
| 1070 | |||
| 1071 | mutex_unlock(uprobes_mmap_hash(inode)); | 1035 | mutex_unlock(uprobes_mmap_hash(inode)); |
| 1072 | 1036 | ||
| 1073 | if (ret) | 1037 | return 0; |
| 1074 | atomic_sub(count, &vma->vm_mm->uprobes_state.count); | 1038 | } |
| 1075 | 1039 | ||
| 1076 | return ret; | 1040 | static bool |
| 1041 | vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long end) | ||
| 1042 | { | ||
| 1043 | loff_t min, max; | ||
| 1044 | struct inode *inode; | ||
| 1045 | struct rb_node *n; | ||
| 1046 | |||
| 1047 | inode = vma->vm_file->f_mapping->host; | ||
| 1048 | |||
| 1049 | min = vaddr_to_offset(vma, start); | ||
| 1050 | max = min + (end - start) - 1; | ||
| 1051 | |||
| 1052 | spin_lock(&uprobes_treelock); | ||
| 1053 | n = find_node_in_range(inode, min, max); | ||
| 1054 | spin_unlock(&uprobes_treelock); | ||
| 1055 | |||
| 1056 | return !!n; | ||
| 1077 | } | 1057 | } |
| 1078 | 1058 | ||
| 1079 | /* | 1059 | /* |
| @@ -1081,37 +1061,18 @@ int uprobe_mmap(struct vm_area_struct *vma) | |||
| 1081 | */ | 1061 | */ |
| 1082 | void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) | 1062 | void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) |
| 1083 | { | 1063 | { |
| 1084 | struct list_head tmp_list; | ||
| 1085 | struct uprobe *uprobe, *u; | ||
| 1086 | struct inode *inode; | ||
| 1087 | |||
| 1088 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) | 1064 | if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) |
| 1089 | return; | 1065 | return; |
| 1090 | 1066 | ||
| 1091 | if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ | 1067 | if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ |
| 1092 | return; | 1068 | return; |
| 1093 | 1069 | ||
| 1094 | if (!atomic_read(&vma->vm_mm->uprobes_state.count)) | 1070 | if (!test_bit(MMF_HAS_UPROBES, &vma->vm_mm->flags) || |
| 1095 | return; | 1071 | test_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags)) |
| 1096 | |||
| 1097 | inode = vma->vm_file->f_mapping->host; | ||
| 1098 | if (!inode) | ||
| 1099 | return; | 1072 | return; |
| 1100 | 1073 | ||
| 1101 | mutex_lock(uprobes_mmap_hash(inode)); | 1074 | if (vma_has_uprobes(vma, start, end)) |
| 1102 | build_probe_list(inode, vma, start, end, &tmp_list); | 1075 | set_bit(MMF_RECALC_UPROBES, &vma->vm_mm->flags); |
| 1103 | |||
| 1104 | list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { | ||
| 1105 | unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); | ||
| 1106 | /* | ||
| 1107 | * An unregister could have removed the probe before | ||
| 1108 | * unmap. So check before we decrement the count. | ||
| 1109 | */ | ||
| 1110 | if (is_swbp_at_addr(vma->vm_mm, vaddr) == 1) | ||
| 1111 | atomic_dec(&vma->vm_mm->uprobes_state.count); | ||
| 1112 | put_uprobe(uprobe); | ||
| 1113 | } | ||
| 1114 | mutex_unlock(uprobes_mmap_hash(inode)); | ||
| 1115 | } | 1076 | } |
| 1116 | 1077 | ||
| 1117 | /* Slot allocation for XOL */ | 1078 | /* Slot allocation for XOL */ |
| @@ -1213,13 +1174,15 @@ void uprobe_clear_state(struct mm_struct *mm) | |||
| 1213 | kfree(area); | 1174 | kfree(area); |
| 1214 | } | 1175 | } |
| 1215 | 1176 | ||
| 1216 | /* | 1177 | void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) |
| 1217 | * uprobe_reset_state - Free the area allocated for slots. | ||
| 1218 | */ | ||
| 1219 | void uprobe_reset_state(struct mm_struct *mm) | ||
| 1220 | { | 1178 | { |
| 1221 | mm->uprobes_state.xol_area = NULL; | 1179 | newmm->uprobes_state.xol_area = NULL; |
| 1222 | atomic_set(&mm->uprobes_state.count, 0); | 1180 | |
| 1181 | if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { | ||
| 1182 | set_bit(MMF_HAS_UPROBES, &newmm->flags); | ||
| 1183 | /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ | ||
| 1184 | set_bit(MMF_RECALC_UPROBES, &newmm->flags); | ||
| 1185 | } | ||
| 1223 | } | 1186 | } |
| 1224 | 1187 | ||
| 1225 | /* | 1188 | /* |
| @@ -1437,6 +1400,25 @@ static bool can_skip_sstep(struct uprobe *uprobe, struct pt_regs *regs) | |||
| 1437 | return false; | 1400 | return false; |
| 1438 | } | 1401 | } |
| 1439 | 1402 | ||
| 1403 | static void mmf_recalc_uprobes(struct mm_struct *mm) | ||
| 1404 | { | ||
| 1405 | struct vm_area_struct *vma; | ||
| 1406 | |||
| 1407 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
| 1408 | if (!valid_vma(vma, false)) | ||
| 1409 | continue; | ||
| 1410 | /* | ||
| 1411 | * This is not strictly accurate, we can race with | ||
| 1412 | * uprobe_unregister() and see the already removed | ||
| 1413 | * uprobe if delete_uprobe() was not yet called. | ||
| 1414 | */ | ||
| 1415 | if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) | ||
| 1416 | return; | ||
| 1417 | } | ||
| 1418 | |||
| 1419 | clear_bit(MMF_HAS_UPROBES, &mm->flags); | ||
| 1420 | } | ||
| 1421 | |||
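mmf_recalc_uprobes() above treats MMF_HAS_UPROBES as a conservative hint: it may stay set after the last probe is gone, and MMF_RECALC_UPROBES asks a later breakpoint trap to re-derive it. A small sketch of that sticky-flag-plus-deferred-recount idiom; the vma list and the has_uprobes field below are invented for illustration only:

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct vma_model { bool has_uprobes; struct vma_model *next; };
struct mm_model  { bool has_uprobes_flag; bool recalc_flag; struct vma_model *mmap; };

/* Clear the "has uprobes" hint only if no mapped region still has one,
 * mirroring mmf_recalc_uprobes(): bail out at the first positive hit. */
static void recalc_uprobes(struct mm_model *mm)
{
	for (struct vma_model *vma = mm->mmap; vma; vma = vma->next)
		if (vma->has_uprobes)
			return;
	mm->has_uprobes_flag = false;
}

/* Called on the slow path (the breakpoint trap, in the kernel case). */
static void maybe_recalc(struct mm_model *mm)
{
	if (mm->recalc_flag) {
		mm->recalc_flag = false;
		recalc_uprobes(mm);
	}
}

int main(void)
{
	struct vma_model v = { .has_uprobes = false, .next = NULL };
	struct mm_model mm = { .has_uprobes_flag = true, .recalc_flag = true, .mmap = &v };

	maybe_recalc(&mm);
	printf("has_uprobes=%d\n", mm.has_uprobes_flag);   /* 0: hint dropped */
	return 0;
}
```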
| 1440 | static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) | 1422 | static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) |
| 1441 | { | 1423 | { |
| 1442 | struct mm_struct *mm = current->mm; | 1424 | struct mm_struct *mm = current->mm; |
| @@ -1458,11 +1440,24 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) | |||
| 1458 | } else { | 1440 | } else { |
| 1459 | *is_swbp = -EFAULT; | 1441 | *is_swbp = -EFAULT; |
| 1460 | } | 1442 | } |
| 1443 | |||
| 1444 | if (!uprobe && test_and_clear_bit(MMF_RECALC_UPROBES, &mm->flags)) | ||
| 1445 | mmf_recalc_uprobes(mm); | ||
| 1461 | up_read(&mm->mmap_sem); | 1446 | up_read(&mm->mmap_sem); |
| 1462 | 1447 | ||
| 1463 | return uprobe; | 1448 | return uprobe; |
| 1464 | } | 1449 | } |
| 1465 | 1450 | ||
| 1451 | void __weak arch_uprobe_enable_step(struct arch_uprobe *arch) | ||
| 1452 | { | ||
| 1453 | user_enable_single_step(current); | ||
| 1454 | } | ||
| 1455 | |||
| 1456 | void __weak arch_uprobe_disable_step(struct arch_uprobe *arch) | ||
| 1457 | { | ||
| 1458 | user_disable_single_step(current); | ||
| 1459 | } | ||
| 1460 | |||
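arch_uprobe_enable_step() and arch_uprobe_disable_step() above are __weak defaults that fall back to generic single-stepping; an architecture can ship strong definitions with the same signatures and silently take over. A standalone illustration of the weak-symbol override pattern (GCC/Clang on ELF targets; the function name is illustrative):

```c
/* cc -o weak_demo weak_demo.c */
#include <stdio.h>

/* Default implementation; another object file can provide a strong
 * version with the same signature and it will be linked instead,
 * just like an arch override of arch_uprobe_enable_step(). */
void __attribute__((weak)) arch_enable_step(void)
{
	puts("generic: user_enable_single_step()");
}

int main(void)
{
	arch_enable_step();   /* runs the weak default unless overridden */
	return 0;
}
```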
| 1466 | /* | 1461 | /* |
| 1467 | * Run handler and ask thread to singlestep. | 1462 | * Run handler and ask thread to singlestep. |
| 1468 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. | 1463 | * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. |
| @@ -1509,7 +1504,7 @@ static void handle_swbp(struct pt_regs *regs) | |||
| 1509 | 1504 | ||
| 1510 | utask->state = UTASK_SSTEP; | 1505 | utask->state = UTASK_SSTEP; |
| 1511 | if (!pre_ssout(uprobe, regs, bp_vaddr)) { | 1506 | if (!pre_ssout(uprobe, regs, bp_vaddr)) { |
| 1512 | user_enable_single_step(current); | 1507 | arch_uprobe_enable_step(&uprobe->arch); |
| 1513 | return; | 1508 | return; |
| 1514 | } | 1509 | } |
| 1515 | 1510 | ||
| @@ -1518,17 +1513,15 @@ cleanup_ret: | |||
| 1518 | utask->active_uprobe = NULL; | 1513 | utask->active_uprobe = NULL; |
| 1519 | utask->state = UTASK_RUNNING; | 1514 | utask->state = UTASK_RUNNING; |
| 1520 | } | 1515 | } |
| 1521 | if (uprobe) { | 1516 | if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) |
| 1522 | if (!(uprobe->flags & UPROBE_SKIP_SSTEP)) | ||
| 1523 | 1517 | ||
| 1524 | /* | 1518 | /* |
| 1525 | * cannot singlestep; cannot skip instruction; | 1519 | * cannot singlestep; cannot skip instruction; |
| 1526 | * re-execute the instruction. | 1520 | * re-execute the instruction. |
| 1527 | */ | 1521 | */ |
| 1528 | instruction_pointer_set(regs, bp_vaddr); | 1522 | instruction_pointer_set(regs, bp_vaddr); |
| 1529 | 1523 | ||
| 1530 | put_uprobe(uprobe); | 1524 | put_uprobe(uprobe); |
| 1531 | } | ||
| 1532 | } | 1525 | } |
| 1533 | 1526 | ||
| 1534 | /* | 1527 | /* |
| @@ -1547,10 +1540,10 @@ static void handle_singlestep(struct uprobe_task *utask, struct pt_regs *regs) | |||
| 1547 | else | 1540 | else |
| 1548 | WARN_ON_ONCE(1); | 1541 | WARN_ON_ONCE(1); |
| 1549 | 1542 | ||
| 1543 | arch_uprobe_disable_step(&uprobe->arch); | ||
| 1550 | put_uprobe(uprobe); | 1544 | put_uprobe(uprobe); |
| 1551 | utask->active_uprobe = NULL; | 1545 | utask->active_uprobe = NULL; |
| 1552 | utask->state = UTASK_RUNNING; | 1546 | utask->state = UTASK_RUNNING; |
| 1553 | user_disable_single_step(current); | ||
| 1554 | xol_free_insn_slot(current); | 1547 | xol_free_insn_slot(current); |
| 1555 | 1548 | ||
| 1556 | spin_lock_irq(¤t->sighand->siglock); | 1549 | spin_lock_irq(¤t->sighand->siglock); |
| @@ -1589,8 +1582,7 @@ int uprobe_pre_sstep_notifier(struct pt_regs *regs) | |||
| 1589 | { | 1582 | { |
| 1590 | struct uprobe_task *utask; | 1583 | struct uprobe_task *utask; |
| 1591 | 1584 | ||
| 1592 | if (!current->mm || !atomic_read(¤t->mm->uprobes_state.count)) | 1585 | if (!current->mm || !test_bit(MMF_HAS_UPROBES, ¤t->mm->flags)) |
| 1593 | /* task is currently not uprobed */ | ||
| 1594 | return 0; | 1586 | return 0; |
| 1595 | 1587 | ||
| 1596 | utask = current->utask; | 1588 | utask = current->utask; |
diff --git a/kernel/exit.c b/kernel/exit.c index f65345f9e5bb..346616c0092c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
| @@ -457,108 +457,13 @@ void daemonize(const char *name, ...) | |||
| 457 | /* Become as one with the init task */ | 457 | /* Become as one with the init task */ |
| 458 | 458 | ||
| 459 | daemonize_fs_struct(); | 459 | daemonize_fs_struct(); |
| 460 | exit_files(current); | 460 | daemonize_descriptors(); |
| 461 | current->files = init_task.files; | ||
| 462 | atomic_inc(¤t->files->count); | ||
| 463 | 461 | ||
| 464 | reparent_to_kthreadd(); | 462 | reparent_to_kthreadd(); |
| 465 | } | 463 | } |
| 466 | 464 | ||
| 467 | EXPORT_SYMBOL(daemonize); | 465 | EXPORT_SYMBOL(daemonize); |
| 468 | 466 | ||
| 469 | static void close_files(struct files_struct * files) | ||
| 470 | { | ||
| 471 | int i, j; | ||
| 472 | struct fdtable *fdt; | ||
| 473 | |||
| 474 | j = 0; | ||
| 475 | |||
| 476 | /* | ||
| 477 | * It is safe to dereference the fd table without RCU or | ||
| 478 | * ->file_lock because this is the last reference to the | ||
| 479 | * files structure. But use RCU to shut RCU-lockdep up. | ||
| 480 | */ | ||
| 481 | rcu_read_lock(); | ||
| 482 | fdt = files_fdtable(files); | ||
| 483 | rcu_read_unlock(); | ||
| 484 | for (;;) { | ||
| 485 | unsigned long set; | ||
| 486 | i = j * BITS_PER_LONG; | ||
| 487 | if (i >= fdt->max_fds) | ||
| 488 | break; | ||
| 489 | set = fdt->open_fds[j++]; | ||
| 490 | while (set) { | ||
| 491 | if (set & 1) { | ||
| 492 | struct file * file = xchg(&fdt->fd[i], NULL); | ||
| 493 | if (file) { | ||
| 494 | filp_close(file, files); | ||
| 495 | cond_resched(); | ||
| 496 | } | ||
| 497 | } | ||
| 498 | i++; | ||
| 499 | set >>= 1; | ||
| 500 | } | ||
| 501 | } | ||
| 502 | } | ||
| 503 | |||
| 504 | struct files_struct *get_files_struct(struct task_struct *task) | ||
| 505 | { | ||
| 506 | struct files_struct *files; | ||
| 507 | |||
| 508 | task_lock(task); | ||
| 509 | files = task->files; | ||
| 510 | if (files) | ||
| 511 | atomic_inc(&files->count); | ||
| 512 | task_unlock(task); | ||
| 513 | |||
| 514 | return files; | ||
| 515 | } | ||
| 516 | |||
| 517 | void put_files_struct(struct files_struct *files) | ||
| 518 | { | ||
| 519 | struct fdtable *fdt; | ||
| 520 | |||
| 521 | if (atomic_dec_and_test(&files->count)) { | ||
| 522 | close_files(files); | ||
| 523 | /* | ||
| 524 | * Free the fd and fdset arrays if we expanded them. | ||
| 525 | * If the fdtable was embedded, pass files for freeing | ||
| 526 | * at the end of the RCU grace period. Otherwise, | ||
| 527 | * you can free files immediately. | ||
| 528 | */ | ||
| 529 | rcu_read_lock(); | ||
| 530 | fdt = files_fdtable(files); | ||
| 531 | if (fdt != &files->fdtab) | ||
| 532 | kmem_cache_free(files_cachep, files); | ||
| 533 | free_fdtable(fdt); | ||
| 534 | rcu_read_unlock(); | ||
| 535 | } | ||
| 536 | } | ||
| 537 | |||
| 538 | void reset_files_struct(struct files_struct *files) | ||
| 539 | { | ||
| 540 | struct task_struct *tsk = current; | ||
| 541 | struct files_struct *old; | ||
| 542 | |||
| 543 | old = tsk->files; | ||
| 544 | task_lock(tsk); | ||
| 545 | tsk->files = files; | ||
| 546 | task_unlock(tsk); | ||
| 547 | put_files_struct(old); | ||
| 548 | } | ||
| 549 | |||
| 550 | void exit_files(struct task_struct *tsk) | ||
| 551 | { | ||
| 552 | struct files_struct * files = tsk->files; | ||
| 553 | |||
| 554 | if (files) { | ||
| 555 | task_lock(tsk); | ||
| 556 | tsk->files = NULL; | ||
| 557 | task_unlock(tsk); | ||
| 558 | put_files_struct(files); | ||
| 559 | } | ||
| 560 | } | ||
| 561 | |||
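The files_struct helpers removed above (close_files(), get_files_struct(), put_files_struct() and friends) implement the usual last-reference-tears-down pattern. A userspace model of the atomic_dec_and_test() idiom that put_files_struct() relies on; all names below are made up for the sketch:

```c
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct files_model {
	atomic_int count;
	/* ... fd table would live here ... */
};

static void close_all(struct files_model *f) { (void)f; puts("closing fds"); }

/* Mirrors put_files_struct(): only the caller that drops the final
 * reference tears the structure down, so no extra lock is needed here. */
static void put_files(struct files_model *f)
{
	if (atomic_fetch_sub(&f->count, 1) == 1) {
		close_all(f);
		free(f);
	}
}

int main(void)
{
	struct files_model *f = malloc(sizeof(*f));

	if (!f)
		return 1;
	atomic_init(&f->count, 2);      /* two tasks share the table */

	put_files(f);                   /* first put: nothing freed yet */
	put_files(f);                   /* last put: closes and frees */
	return 0;
}
```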
| 562 | #ifdef CONFIG_MM_OWNER | 467 | #ifdef CONFIG_MM_OWNER |
| 563 | /* | 468 | /* |
| 564 | * A task is exiting. If it owned this mm, find a new owner for the mm. | 469 | * A task is exiting. If it owned this mm, find a new owner for the mm. |
| @@ -1046,6 +951,9 @@ void do_exit(long code) | |||
| 1046 | if (tsk->splice_pipe) | 951 | if (tsk->splice_pipe) |
| 1047 | __free_pipe_info(tsk->splice_pipe); | 952 | __free_pipe_info(tsk->splice_pipe); |
| 1048 | 953 | ||
| 954 | if (tsk->task_frag.page) | ||
| 955 | put_page(tsk->task_frag.page); | ||
| 956 | |||
| 1049 | validate_creds_for_do_exit(tsk); | 957 | validate_creds_for_do_exit(tsk); |
| 1050 | 958 | ||
| 1051 | preempt_disable(); | 959 | preempt_disable(); |
diff --git a/kernel/fork.c b/kernel/fork.c index 2c8857e12855..1cd7d581b3b2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -330,6 +330,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) | |||
| 330 | tsk->btrace_seq = 0; | 330 | tsk->btrace_seq = 0; |
| 331 | #endif | 331 | #endif |
| 332 | tsk->splice_pipe = NULL; | 332 | tsk->splice_pipe = NULL; |
| 333 | tsk->task_frag.page = NULL; | ||
| 333 | 334 | ||
| 334 | account_kernel_stack(ti, 1); | 335 | account_kernel_stack(ti, 1); |
| 335 | 336 | ||
| @@ -353,6 +354,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 353 | 354 | ||
| 354 | down_write(&oldmm->mmap_sem); | 355 | down_write(&oldmm->mmap_sem); |
| 355 | flush_cache_dup_mm(oldmm); | 356 | flush_cache_dup_mm(oldmm); |
| 357 | uprobe_dup_mmap(oldmm, mm); | ||
| 356 | /* | 358 | /* |
| 357 | * Not linked in yet - no deadlock potential: | 359 | * Not linked in yet - no deadlock potential: |
| 358 | */ | 360 | */ |
| @@ -421,7 +423,12 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 421 | mapping->i_mmap_writable++; | 423 | mapping->i_mmap_writable++; |
| 422 | flush_dcache_mmap_lock(mapping); | 424 | flush_dcache_mmap_lock(mapping); |
| 423 | /* insert tmp into the share list, just after mpnt */ | 425 | /* insert tmp into the share list, just after mpnt */ |
| 424 | vma_prio_tree_add(tmp, mpnt); | 426 | if (unlikely(tmp->vm_flags & VM_NONLINEAR)) |
| 427 | vma_nonlinear_insert(tmp, | ||
| 428 | &mapping->i_mmap_nonlinear); | ||
| 429 | else | ||
| 430 | vma_interval_tree_insert_after(tmp, mpnt, | ||
| 431 | &mapping->i_mmap); | ||
| 425 | flush_dcache_mmap_unlock(mapping); | 432 | flush_dcache_mmap_unlock(mapping); |
| 426 | mutex_unlock(&mapping->i_mmap_mutex); | 433 | mutex_unlock(&mapping->i_mmap_mutex); |
| 427 | } | 434 | } |
| @@ -454,9 +461,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) | |||
| 454 | 461 | ||
| 455 | if (retval) | 462 | if (retval) |
| 456 | goto out; | 463 | goto out; |
| 457 | |||
| 458 | if (file) | ||
| 459 | uprobe_mmap(tmp); | ||
| 460 | } | 464 | } |
| 461 | /* a new mm has just been created */ | 465 | /* a new mm has just been created */ |
| 462 | arch_dup_mmap(oldmm, mm); | 466 | arch_dup_mmap(oldmm, mm); |
| @@ -623,26 +627,6 @@ void mmput(struct mm_struct *mm) | |||
| 623 | } | 627 | } |
| 624 | EXPORT_SYMBOL_GPL(mmput); | 628 | EXPORT_SYMBOL_GPL(mmput); |
| 625 | 629 | ||
| 626 | /* | ||
| 627 | * We added or removed a vma mapping the executable. The vmas are only mapped | ||
| 628 | * during exec and are not mapped with the mmap system call. | ||
| 629 | * Callers must hold down_write() on the mm's mmap_sem for these | ||
| 630 | */ | ||
| 631 | void added_exe_file_vma(struct mm_struct *mm) | ||
| 632 | { | ||
| 633 | mm->num_exe_file_vmas++; | ||
| 634 | } | ||
| 635 | |||
| 636 | void removed_exe_file_vma(struct mm_struct *mm) | ||
| 637 | { | ||
| 638 | mm->num_exe_file_vmas--; | ||
| 639 | if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { | ||
| 640 | fput(mm->exe_file); | ||
| 641 | mm->exe_file = NULL; | ||
| 642 | } | ||
| 643 | |||
| 644 | } | ||
| 645 | |||
| 646 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | 630 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) |
| 647 | { | 631 | { |
| 648 | if (new_exe_file) | 632 | if (new_exe_file) |
| @@ -650,15 +634,13 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | |||
| 650 | if (mm->exe_file) | 634 | if (mm->exe_file) |
| 651 | fput(mm->exe_file); | 635 | fput(mm->exe_file); |
| 652 | mm->exe_file = new_exe_file; | 636 | mm->exe_file = new_exe_file; |
| 653 | mm->num_exe_file_vmas = 0; | ||
| 654 | } | 637 | } |
| 655 | 638 | ||
| 656 | struct file *get_mm_exe_file(struct mm_struct *mm) | 639 | struct file *get_mm_exe_file(struct mm_struct *mm) |
| 657 | { | 640 | { |
| 658 | struct file *exe_file; | 641 | struct file *exe_file; |
| 659 | 642 | ||
| 660 | /* We need mmap_sem to protect against races with removal of | 643 | /* We need mmap_sem to protect against races with removal of exe_file */ |
| 661 | * VM_EXECUTABLE vmas */ | ||
| 662 | down_read(&mm->mmap_sem); | 644 | down_read(&mm->mmap_sem); |
| 663 | exe_file = mm->exe_file; | 645 | exe_file = mm->exe_file; |
| 664 | if (exe_file) | 646 | if (exe_file) |
| @@ -839,8 +821,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) | |||
| 839 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 821 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| 840 | mm->pmd_huge_pte = NULL; | 822 | mm->pmd_huge_pte = NULL; |
| 841 | #endif | 823 | #endif |
| 842 | uprobe_reset_state(mm); | ||
| 843 | |||
| 844 | if (!mm_init(mm, tsk)) | 824 | if (!mm_init(mm, tsk)) |
| 845 | goto fail_nomem; | 825 | goto fail_nomem; |
| 846 | 826 | ||
| @@ -1081,7 +1061,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
| 1081 | init_rwsem(&sig->group_rwsem); | 1061 | init_rwsem(&sig->group_rwsem); |
| 1082 | #endif | 1062 | #endif |
| 1083 | 1063 | ||
| 1084 | sig->oom_adj = current->signal->oom_adj; | ||
| 1085 | sig->oom_score_adj = current->signal->oom_score_adj; | 1064 | sig->oom_score_adj = current->signal->oom_score_adj; |
| 1086 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; | 1065 | sig->oom_score_adj_min = current->signal->oom_score_adj_min; |
| 1087 | 1066 | ||
| @@ -1280,11 +1259,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
| 1280 | #endif | 1259 | #endif |
| 1281 | #ifdef CONFIG_TRACE_IRQFLAGS | 1260 | #ifdef CONFIG_TRACE_IRQFLAGS |
| 1282 | p->irq_events = 0; | 1261 | p->irq_events = 0; |
| 1283 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1284 | p->hardirqs_enabled = 1; | ||
| 1285 | #else | ||
| 1286 | p->hardirqs_enabled = 0; | 1262 | p->hardirqs_enabled = 0; |
| 1287 | #endif | ||
| 1288 | p->hardirq_enable_ip = 0; | 1263 | p->hardirq_enable_ip = 0; |
| 1289 | p->hardirq_enable_event = 0; | 1264 | p->hardirq_enable_event = 0; |
| 1290 | p->hardirq_disable_ip = _THIS_IP_; | 1265 | p->hardirq_disable_ip = _THIS_IP_; |
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index eebd6d5cfb44..57d86d07221e 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
| @@ -671,6 +671,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, | |||
| 671 | irq_set_chip(irq, chip); | 671 | irq_set_chip(irq, chip); |
| 672 | __irq_set_handler(irq, handle, 0, name); | 672 | __irq_set_handler(irq, handle, 0, name); |
| 673 | } | 673 | } |
| 674 | EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name); | ||
| 674 | 675 | ||
| 675 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) | 676 | void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) |
| 676 | { | 677 | { |
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c index b5fcd96c7102..988dc58e8847 100644 --- a/kernel/irq/dummychip.c +++ b/kernel/irq/dummychip.c | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | */ | 6 | */ |
| 7 | #include <linux/interrupt.h> | 7 | #include <linux/interrupt.h> |
| 8 | #include <linux/irq.h> | 8 | #include <linux/irq.h> |
| 9 | #include <linux/export.h> | ||
| 9 | 10 | ||
| 10 | #include "internals.h" | 11 | #include "internals.h" |
| 11 | 12 | ||
| @@ -57,3 +58,4 @@ struct irq_chip dummy_irq_chip = { | |||
| 57 | .irq_mask = noop, | 58 | .irq_mask = noop, |
| 58 | .irq_unmask = noop, | 59 | .irq_unmask = noop, |
| 59 | }; | 60 | }; |
| 61 | EXPORT_SYMBOL_GPL(dummy_irq_chip); | ||
diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 43049192b5ec..60f48fa0fd0d 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c | |||
| @@ -118,6 +118,7 @@ void jump_label_rate_limit(struct static_key_deferred *key, | |||
| 118 | key->timeout = rl; | 118 | key->timeout = rl; |
| 119 | INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); | 119 | INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); |
| 120 | } | 120 | } |
| 121 | EXPORT_SYMBOL_GPL(jump_label_rate_limit); | ||
| 121 | 122 | ||
| 122 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) | 123 | static int addr_conflict(struct jump_entry *entry, void *start, void *end) |
| 123 | { | 124 | { |
diff --git a/kernel/kexec.c b/kernel/kexec.c index 0668d58d6413..5e4bd7864c5d 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
| @@ -21,7 +21,6 @@ | |||
| 21 | #include <linux/hardirq.h> | 21 | #include <linux/hardirq.h> |
| 22 | #include <linux/elf.h> | 22 | #include <linux/elf.h> |
| 23 | #include <linux/elfcore.h> | 23 | #include <linux/elfcore.h> |
| 24 | #include <generated/utsrelease.h> | ||
| 25 | #include <linux/utsname.h> | 24 | #include <linux/utsname.h> |
| 26 | #include <linux/numa.h> | 25 | #include <linux/numa.h> |
| 27 | #include <linux/suspend.h> | 26 | #include <linux/suspend.h> |
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c62b8546cc90..098f396aa409 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
| @@ -561,9 +561,9 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
| 561 | { | 561 | { |
| 562 | LIST_HEAD(free_list); | 562 | LIST_HEAD(free_list); |
| 563 | 563 | ||
| 564 | mutex_lock(&kprobe_mutex); | ||
| 564 | /* Lock modules while optimizing kprobes */ | 565 | /* Lock modules while optimizing kprobes */ |
| 565 | mutex_lock(&module_mutex); | 566 | mutex_lock(&module_mutex); |
| 566 | mutex_lock(&kprobe_mutex); | ||
| 567 | 567 | ||
| 568 | /* | 568 | /* |
| 569 | * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) | 569 | * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) |
| @@ -586,8 +586,8 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) | |||
| 586 | /* Step 4: Free cleaned kprobes after quiesence period */ | 586 | /* Step 4: Free cleaned kprobes after quiesence period */ |
| 587 | do_free_cleaned_kprobes(&free_list); | 587 | do_free_cleaned_kprobes(&free_list); |
| 588 | 588 | ||
| 589 | mutex_unlock(&kprobe_mutex); | ||
| 590 | mutex_unlock(&module_mutex); | 589 | mutex_unlock(&module_mutex); |
| 590 | mutex_unlock(&kprobe_mutex); | ||
| 591 | 591 | ||
| 592 | /* Step 5: Kick optimizer again if needed */ | 592 | /* Step 5: Kick optimizer again if needed */ |
| 593 | if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) | 593 | if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) |
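The two kprobe_optimizer() hunks above change only the nesting: kprobe_mutex is now taken outside module_mutex and released in the matching reverse order. The usual point of such a swap is to keep a single acquisition order across every path that needs both locks, which is what rules out ABBA deadlocks. A toy pthread illustration, with the lock names standing in for kprobe_mutex and module_mutex:

```c
/* cc -pthread -o order_demo order_demo.c */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;  /* plays kprobe_mutex */
static pthread_mutex_t inner = PTHREAD_MUTEX_INITIALIZER;  /* plays module_mutex */

/* Every path that needs both locks takes them in the same order: outer
 * first, then inner. If one caller used inner->outer instead, two
 * threads could each hold one lock and block forever on the other. */
static void with_both_locks(const char *who)
{
	pthread_mutex_lock(&outer);
	pthread_mutex_lock(&inner);
	printf("%s: holding both locks\n", who);
	pthread_mutex_unlock(&inner);
	pthread_mutex_unlock(&outer);
}

static void *optimizer(void *arg)
{
	with_both_locks((const char *)arg);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, optimizer, (void *)"optimizer");
	with_both_locks("register path");
	pthread_join(t, NULL);
	return 0;
}
```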
| @@ -759,20 +759,32 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) | |||
| 759 | struct kprobe *ap; | 759 | struct kprobe *ap; |
| 760 | struct optimized_kprobe *op; | 760 | struct optimized_kprobe *op; |
| 761 | 761 | ||
| 762 | /* Impossible to optimize ftrace-based kprobe */ | ||
| 763 | if (kprobe_ftrace(p)) | ||
| 764 | return; | ||
| 765 | |||
| 766 | /* For preparing optimization, jump_label_text_reserved() is called */ | ||
| 767 | jump_label_lock(); | ||
| 768 | mutex_lock(&text_mutex); | ||
| 769 | |||
| 762 | ap = alloc_aggr_kprobe(p); | 770 | ap = alloc_aggr_kprobe(p); |
| 763 | if (!ap) | 771 | if (!ap) |
| 764 | return; | 772 | goto out; |
| 765 | 773 | ||
| 766 | op = container_of(ap, struct optimized_kprobe, kp); | 774 | op = container_of(ap, struct optimized_kprobe, kp); |
| 767 | if (!arch_prepared_optinsn(&op->optinsn)) { | 775 | if (!arch_prepared_optinsn(&op->optinsn)) { |
| 768 | /* If failed to setup optimizing, fallback to kprobe */ | 776 | /* If failed to setup optimizing, fallback to kprobe */ |
| 769 | arch_remove_optimized_kprobe(op); | 777 | arch_remove_optimized_kprobe(op); |
| 770 | kfree(op); | 778 | kfree(op); |
| 771 | return; | 779 | goto out; |
| 772 | } | 780 | } |
| 773 | 781 | ||
| 774 | init_aggr_kprobe(ap, p); | 782 | init_aggr_kprobe(ap, p); |
| 775 | optimize_kprobe(ap); | 783 | optimize_kprobe(ap); /* This just kicks optimizer thread */ |
| 784 | |||
| 785 | out: | ||
| 786 | mutex_unlock(&text_mutex); | ||
| 787 | jump_label_unlock(); | ||
| 776 | } | 788 | } |
| 777 | 789 | ||
| 778 | #ifdef CONFIG_SYSCTL | 790 | #ifdef CONFIG_SYSCTL |
| @@ -907,9 +919,64 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) | |||
| 907 | } | 919 | } |
| 908 | #endif /* CONFIG_OPTPROBES */ | 920 | #endif /* CONFIG_OPTPROBES */ |
| 909 | 921 | ||
| 922 | #ifdef KPROBES_CAN_USE_FTRACE | ||
| 923 | static struct ftrace_ops kprobe_ftrace_ops __read_mostly = { | ||
| 924 | .func = kprobe_ftrace_handler, | ||
| 925 | .flags = FTRACE_OPS_FL_SAVE_REGS, | ||
| 926 | }; | ||
| 927 | static int kprobe_ftrace_enabled; | ||
| 928 | |||
| 929 | /* Must ensure p->addr is really on ftrace */ | ||
| 930 | static int __kprobes prepare_kprobe(struct kprobe *p) | ||
| 931 | { | ||
| 932 | if (!kprobe_ftrace(p)) | ||
| 933 | return arch_prepare_kprobe(p); | ||
| 934 | |||
| 935 | return arch_prepare_kprobe_ftrace(p); | ||
| 936 | } | ||
| 937 | |||
| 938 | /* Caller must lock kprobe_mutex */ | ||
| 939 | static void __kprobes arm_kprobe_ftrace(struct kprobe *p) | ||
| 940 | { | ||
| 941 | int ret; | ||
| 942 | |||
| 943 | ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, | ||
| 944 | (unsigned long)p->addr, 0, 0); | ||
| 945 | WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); | ||
| 946 | kprobe_ftrace_enabled++; | ||
| 947 | if (kprobe_ftrace_enabled == 1) { | ||
| 948 | ret = register_ftrace_function(&kprobe_ftrace_ops); | ||
| 949 | WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); | ||
| 950 | } | ||
| 951 | } | ||
| 952 | |||
| 953 | /* Caller must lock kprobe_mutex */ | ||
| 954 | static void __kprobes disarm_kprobe_ftrace(struct kprobe *p) | ||
| 955 | { | ||
| 956 | int ret; | ||
| 957 | |||
| 958 | kprobe_ftrace_enabled--; | ||
| 959 | if (kprobe_ftrace_enabled == 0) { | ||
| 960 | ret = unregister_ftrace_function(&kprobe_ftrace_ops); | ||
| 961 | WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); | ||
| 962 | } | ||
| 963 | ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, | ||
| 964 | (unsigned long)p->addr, 1, 0); | ||
| 965 | WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); | ||
| 966 | } | ||
| 967 | #else /* !KPROBES_CAN_USE_FTRACE */ | ||
| 968 | #define prepare_kprobe(p) arch_prepare_kprobe(p) | ||
| 969 | #define arm_kprobe_ftrace(p) do {} while (0) | ||
| 970 | #define disarm_kprobe_ftrace(p) do {} while (0) | ||
| 971 | #endif | ||
| 972 | |||
| 910 | /* Arm a kprobe with text_mutex */ | 973 | /* Arm a kprobe with text_mutex */ |
| 911 | static void __kprobes arm_kprobe(struct kprobe *kp) | 974 | static void __kprobes arm_kprobe(struct kprobe *kp) |
| 912 | { | 975 | { |
| 976 | if (unlikely(kprobe_ftrace(kp))) { | ||
| 977 | arm_kprobe_ftrace(kp); | ||
| 978 | return; | ||
| 979 | } | ||
| 913 | /* | 980 | /* |
| 914 | * Here, since __arm_kprobe() doesn't use stop_machine(), | 981 | * Here, since __arm_kprobe() doesn't use stop_machine(), |
| 915 | * this doesn't cause deadlock on text_mutex. So, we don't | 982 | * this doesn't cause deadlock on text_mutex. So, we don't |
| @@ -921,11 +988,15 @@ static void __kprobes arm_kprobe(struct kprobe *kp) | |||
| 921 | } | 988 | } |
| 922 | 989 | ||
| 923 | /* Disarm a kprobe with text_mutex */ | 990 | /* Disarm a kprobe with text_mutex */ |
| 924 | static void __kprobes disarm_kprobe(struct kprobe *kp) | 991 | static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt) |
| 925 | { | 992 | { |
| 993 | if (unlikely(kprobe_ftrace(kp))) { | ||
| 994 | disarm_kprobe_ftrace(kp); | ||
| 995 | return; | ||
| 996 | } | ||
| 926 | /* Ditto */ | 997 | /* Ditto */ |
| 927 | mutex_lock(&text_mutex); | 998 | mutex_lock(&text_mutex); |
| 928 | __disarm_kprobe(kp, true); | 999 | __disarm_kprobe(kp, reopt); |
| 929 | mutex_unlock(&text_mutex); | 1000 | mutex_unlock(&text_mutex); |
| 930 | } | 1001 | } |
| 931 | 1002 | ||
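arm_kprobe_ftrace() and disarm_kprobe_ftrace() above keep the single shared kprobe_ftrace_ops registered only while at least one ftrace-based kprobe exists, using a plain counter serialized by kprobe_mutex. A stripped-down model of that counted enable/disable pattern; register_backend()/unregister_backend() are invented placeholders for the ftrace calls:

```c
#include <stdio.h>

static int backend_users;   /* mirrors kprobe_ftrace_enabled; a mutex guards it in the kernel */

static void register_backend(void)   { puts("register_ftrace_function()"); }
static void unregister_backend(void) { puts("unregister_ftrace_function()"); }

/* First user brings the shared backend up. */
static void arm_one(void)
{
	if (++backend_users == 1)
		register_backend();
}

/* Last user tears it down again. */
static void disarm_one(void)
{
	if (--backend_users == 0)
		unregister_backend();
}

int main(void)
{
	arm_one();    /* registers */
	arm_one();    /* already registered, just counts */
	disarm_one(); /* still one user left */
	disarm_one(); /* unregisters */
	return 0;
}
```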
| @@ -1144,12 +1215,6 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) | |||
| 1144 | if (p->post_handler && !ap->post_handler) | 1215 | if (p->post_handler && !ap->post_handler) |
| 1145 | ap->post_handler = aggr_post_handler; | 1216 | ap->post_handler = aggr_post_handler; |
| 1146 | 1217 | ||
| 1147 | if (kprobe_disabled(ap) && !kprobe_disabled(p)) { | ||
| 1148 | ap->flags &= ~KPROBE_FLAG_DISABLED; | ||
| 1149 | if (!kprobes_all_disarmed) | ||
| 1150 | /* Arm the breakpoint again. */ | ||
| 1151 | __arm_kprobe(ap); | ||
| 1152 | } | ||
| 1153 | return 0; | 1218 | return 0; |
| 1154 | } | 1219 | } |
| 1155 | 1220 | ||
| @@ -1189,11 +1254,22 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, | |||
| 1189 | int ret = 0; | 1254 | int ret = 0; |
| 1190 | struct kprobe *ap = orig_p; | 1255 | struct kprobe *ap = orig_p; |
| 1191 | 1256 | ||
| 1257 | /* For preparing optimization, jump_label_text_reserved() is called */ | ||
| 1258 | jump_label_lock(); | ||
| 1259 | /* | ||
| 1260 | * Get online CPUs to avoid text_mutex deadlock with stop machine, | ||
| 1261 | * which is invoked by unoptimize_kprobe() in add_new_kprobe() | ||
| 1262 | */ | ||
| 1263 | get_online_cpus(); | ||
| 1264 | mutex_lock(&text_mutex); | ||
| 1265 | |||
| 1192 | if (!kprobe_aggrprobe(orig_p)) { | 1266 | if (!kprobe_aggrprobe(orig_p)) { |
| 1193 | /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ | 1267 | /* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */ |
| 1194 | ap = alloc_aggr_kprobe(orig_p); | 1268 | ap = alloc_aggr_kprobe(orig_p); |
| 1195 | if (!ap) | 1269 | if (!ap) { |
| 1196 | return -ENOMEM; | 1270 | ret = -ENOMEM; |
| 1271 | goto out; | ||
| 1272 | } | ||
| 1197 | init_aggr_kprobe(ap, orig_p); | 1273 | init_aggr_kprobe(ap, orig_p); |
| 1198 | } else if (kprobe_unused(ap)) | 1274 | } else if (kprobe_unused(ap)) |
| 1199 | /* This probe is going to die. Rescue it */ | 1275 | /* This probe is going to die. Rescue it */ |
| @@ -1213,7 +1289,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, | |||
| 1213 | * free aggr_probe. It will be used next time, or | 1289 | * free aggr_probe. It will be used next time, or |
| 1214 | * freed by unregister_kprobe. | 1290 | * freed by unregister_kprobe. |
| 1215 | */ | 1291 | */ |
| 1216 | return ret; | 1292 | goto out; |
| 1217 | 1293 | ||
| 1218 | /* Prepare optimized instructions if possible. */ | 1294 | /* Prepare optimized instructions if possible. */ |
| 1219 | prepare_optimized_kprobe(ap); | 1295 | prepare_optimized_kprobe(ap); |
| @@ -1228,7 +1304,20 @@ static int __kprobes register_aggr_kprobe(struct kprobe *orig_p, | |||
| 1228 | 1304 | ||
| 1229 | /* Copy ap's insn slot to p */ | 1305 | /* Copy ap's insn slot to p */ |
| 1230 | copy_kprobe(ap, p); | 1306 | copy_kprobe(ap, p); |
| 1231 | return add_new_kprobe(ap, p); | 1307 | ret = add_new_kprobe(ap, p); |
| 1308 | |||
| 1309 | out: | ||
| 1310 | mutex_unlock(&text_mutex); | ||
| 1311 | put_online_cpus(); | ||
| 1312 | jump_label_unlock(); | ||
| 1313 | |||
| 1314 | if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { | ||
| 1315 | ap->flags &= ~KPROBE_FLAG_DISABLED; | ||
| 1316 | if (!kprobes_all_disarmed) | ||
| 1317 | /* Arm the breakpoint again. */ | ||
| 1318 | arm_kprobe(ap); | ||
| 1319 | } | ||
| 1320 | return ret; | ||
| 1232 | } | 1321 | } |
| 1233 | 1322 | ||
| 1234 | static int __kprobes in_kprobes_functions(unsigned long addr) | 1323 | static int __kprobes in_kprobes_functions(unsigned long addr) |
| @@ -1313,71 +1402,96 @@ static inline int check_kprobe_rereg(struct kprobe *p) | |||
| 1313 | return ret; | 1402 | return ret; |
| 1314 | } | 1403 | } |
| 1315 | 1404 | ||
| 1316 | int __kprobes register_kprobe(struct kprobe *p) | 1405 | static __kprobes int check_kprobe_address_safe(struct kprobe *p, |
| 1406 | struct module **probed_mod) | ||
| 1317 | { | 1407 | { |
| 1318 | int ret = 0; | 1408 | int ret = 0; |
| 1319 | struct kprobe *old_p; | 1409 | unsigned long ftrace_addr; |
| 1320 | struct module *probed_mod; | ||
| 1321 | kprobe_opcode_t *addr; | ||
| 1322 | |||
| 1323 | addr = kprobe_addr(p); | ||
| 1324 | if (IS_ERR(addr)) | ||
| 1325 | return PTR_ERR(addr); | ||
| 1326 | p->addr = addr; | ||
| 1327 | 1410 | ||
| 1328 | ret = check_kprobe_rereg(p); | 1411 | /* |
| 1329 | if (ret) | 1412 | * If the address is located on a ftrace nop, set the |
| 1330 | return ret; | 1413 | * breakpoint to the following instruction. |
| 1414 | */ | ||
| 1415 | ftrace_addr = ftrace_location((unsigned long)p->addr); | ||
| 1416 | if (ftrace_addr) { | ||
| 1417 | #ifdef KPROBES_CAN_USE_FTRACE | ||
| 1418 | /* Given address is not on the instruction boundary */ | ||
| 1419 | if ((unsigned long)p->addr != ftrace_addr) | ||
| 1420 | return -EILSEQ; | ||
| 1421 | p->flags |= KPROBE_FLAG_FTRACE; | ||
| 1422 | #else /* !KPROBES_CAN_USE_FTRACE */ | ||
| 1423 | return -EINVAL; | ||
| 1424 | #endif | ||
| 1425 | } | ||
| 1331 | 1426 | ||
| 1332 | jump_label_lock(); | 1427 | jump_label_lock(); |
| 1333 | preempt_disable(); | 1428 | preempt_disable(); |
| 1429 | |||
| 1430 | /* Ensure it is not in reserved area nor out of text */ | ||
| 1334 | if (!kernel_text_address((unsigned long) p->addr) || | 1431 | if (!kernel_text_address((unsigned long) p->addr) || |
| 1335 | in_kprobes_functions((unsigned long) p->addr) || | 1432 | in_kprobes_functions((unsigned long) p->addr) || |
| 1336 | ftrace_text_reserved(p->addr, p->addr) || | ||
| 1337 | jump_label_text_reserved(p->addr, p->addr)) { | 1433 | jump_label_text_reserved(p->addr, p->addr)) { |
| 1338 | ret = -EINVAL; | 1434 | ret = -EINVAL; |
| 1339 | goto cannot_probe; | 1435 | goto out; |
| 1340 | } | 1436 | } |
| 1341 | 1437 | ||
| 1342 | /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ | 1438 | /* Check if we are probing a module */ |
| 1343 | p->flags &= KPROBE_FLAG_DISABLED; | 1439 | *probed_mod = __module_text_address((unsigned long) p->addr); |
| 1344 | 1440 | if (*probed_mod) { | |
| 1345 | /* | ||
| 1346 | * Check if are we probing a module. | ||
| 1347 | */ | ||
| 1348 | probed_mod = __module_text_address((unsigned long) p->addr); | ||
| 1349 | if (probed_mod) { | ||
| 1350 | /* Return -ENOENT if fail. */ | ||
| 1351 | ret = -ENOENT; | ||
| 1352 | /* | 1441 | /* |
| 1353 | * We must hold a refcount of the probed module while updating | 1442 | * We must hold a refcount of the probed module while updating |
| 1354 | * its code to prohibit unexpected unloading. | 1443 | * its code to prohibit unexpected unloading. |
| 1355 | */ | 1444 | */ |
| 1356 | if (unlikely(!try_module_get(probed_mod))) | 1445 | if (unlikely(!try_module_get(*probed_mod))) { |
| 1357 | goto cannot_probe; | 1446 | ret = -ENOENT; |
| 1447 | goto out; | ||
| 1448 | } | ||
| 1358 | 1449 | ||
| 1359 | /* | 1450 | /* |
| 1360 | * If the module freed .init.text, we couldn't insert | 1451 | * If the module freed .init.text, we couldn't insert |
| 1361 | * kprobes in there. | 1452 | * kprobes in there. |
| 1362 | */ | 1453 | */ |
| 1363 | if (within_module_init((unsigned long)p->addr, probed_mod) && | 1454 | if (within_module_init((unsigned long)p->addr, *probed_mod) && |
| 1364 | probed_mod->state != MODULE_STATE_COMING) { | 1455 | (*probed_mod)->state != MODULE_STATE_COMING) { |
| 1365 | module_put(probed_mod); | 1456 | module_put(*probed_mod); |
| 1366 | goto cannot_probe; | 1457 | *probed_mod = NULL; |
| 1458 | ret = -ENOENT; | ||
| 1367 | } | 1459 | } |
| 1368 | /* ret will be updated by following code */ | ||
| 1369 | } | 1460 | } |
| 1461 | out: | ||
| 1370 | preempt_enable(); | 1462 | preempt_enable(); |
| 1371 | jump_label_unlock(); | 1463 | jump_label_unlock(); |
| 1372 | 1464 | ||
| 1465 | return ret; | ||
| 1466 | } | ||
| 1467 | |||
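check_kprobe_address_safe() factors the validation out of register_kprobe(): it rejects reserved or non-text addresses and, when the probe lands in a module, pins that module through an out parameter which the caller releases on its normal unwind path. A sketch of that validate-and-pin shape; the symbol lookup and every name below are invented for illustration:

```c
#include <stdio.h>
#include <string.h>

struct module_model { const char *name; int refs; };

static struct module_model ext = { "ext_mod", 0 };

/* Pretend symbol lookup: symbols prefixed "ext_" belong to ext_mod,
 * everything else is built-in, so there is no module to pin. */
static struct module_model *owning_module(const char *sym)
{
	return strncmp(sym, "ext_", 4) == 0 ? &ext : NULL;
}

/* Models check_kprobe_address_safe(): validate the target and, if it
 * lives in a module, pin that module via the out parameter. */
static int check_addr_safe(const char *sym, struct module_model **pinned)
{
	*pinned = NULL;
	if (!sym || !*sym)
		return -1;                    /* not a valid text address */

	struct module_model *mod = owning_module(sym);
	if (mod) {
		mod->refs++;                  /* try_module_get() */
		*pinned = mod;
	}
	return 0;
}

int main(void)
{
	struct module_model *mod;

	if (check_addr_safe("ext_do_work", &mod) == 0) {
		puts("probe registered");
		if (mod)
			mod->refs--;          /* module_put() after unregister */
	}
	return 0;
}
```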
| 1468 | int __kprobes register_kprobe(struct kprobe *p) | ||
| 1469 | { | ||
| 1470 | int ret; | ||
| 1471 | struct kprobe *old_p; | ||
| 1472 | struct module *probed_mod; | ||
| 1473 | kprobe_opcode_t *addr; | ||
| 1474 | |||
| 1475 | /* Adjust probe address from symbol */ | ||
| 1476 | addr = kprobe_addr(p); | ||
| 1477 | if (IS_ERR(addr)) | ||
| 1478 | return PTR_ERR(addr); | ||
| 1479 | p->addr = addr; | ||
| 1480 | |||
| 1481 | ret = check_kprobe_rereg(p); | ||
| 1482 | if (ret) | ||
| 1483 | return ret; | ||
| 1484 | |||
| 1485 | /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */ | ||
| 1486 | p->flags &= KPROBE_FLAG_DISABLED; | ||
| 1373 | p->nmissed = 0; | 1487 | p->nmissed = 0; |
| 1374 | INIT_LIST_HEAD(&p->list); | 1488 | INIT_LIST_HEAD(&p->list); |
| 1375 | mutex_lock(&kprobe_mutex); | ||
| 1376 | 1489 | ||
| 1377 | jump_label_lock(); /* needed to call jump_label_text_reserved() */ | 1490 | ret = check_kprobe_address_safe(p, &probed_mod); |
| 1491 | if (ret) | ||
| 1492 | return ret; | ||
| 1378 | 1493 | ||
| 1379 | get_online_cpus(); /* For avoiding text_mutex deadlock. */ | 1494 | mutex_lock(&kprobe_mutex); |
| 1380 | mutex_lock(&text_mutex); | ||
| 1381 | 1495 | ||
| 1382 | old_p = get_kprobe(p->addr); | 1496 | old_p = get_kprobe(p->addr); |
| 1383 | if (old_p) { | 1497 | if (old_p) { |
| @@ -1386,7 +1500,9 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
| 1386 | goto out; | 1500 | goto out; |
| 1387 | } | 1501 | } |
| 1388 | 1502 | ||
| 1389 | ret = arch_prepare_kprobe(p); | 1503 | mutex_lock(&text_mutex); /* Avoiding text modification */ |
| 1504 | ret = prepare_kprobe(p); | ||
| 1505 | mutex_unlock(&text_mutex); | ||
| 1390 | if (ret) | 1506 | if (ret) |
| 1391 | goto out; | 1507 | goto out; |
| 1392 | 1508 | ||
| @@ -1395,26 +1511,18 @@ int __kprobes register_kprobe(struct kprobe *p) | |||
| 1395 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 1511 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
| 1396 | 1512 | ||
| 1397 | if (!kprobes_all_disarmed && !kprobe_disabled(p)) | 1513 | if (!kprobes_all_disarmed && !kprobe_disabled(p)) |
| 1398 | __arm_kprobe(p); | 1514 | arm_kprobe(p); |
| 1399 | 1515 | ||
| 1400 | /* Try to optimize kprobe */ | 1516 | /* Try to optimize kprobe */ |
| 1401 | try_to_optimize_kprobe(p); | 1517 | try_to_optimize_kprobe(p); |
| 1402 | 1518 | ||
| 1403 | out: | 1519 | out: |
| 1404 | mutex_unlock(&text_mutex); | ||
| 1405 | put_online_cpus(); | ||
| 1406 | jump_label_unlock(); | ||
| 1407 | mutex_unlock(&kprobe_mutex); | 1520 | mutex_unlock(&kprobe_mutex); |
| 1408 | 1521 | ||
| 1409 | if (probed_mod) | 1522 | if (probed_mod) |
| 1410 | module_put(probed_mod); | 1523 | module_put(probed_mod); |
| 1411 | 1524 | ||
| 1412 | return ret; | 1525 | return ret; |
| 1413 | |||
| 1414 | cannot_probe: | ||
| 1415 | preempt_enable(); | ||
| 1416 | jump_label_unlock(); | ||
| 1417 | return ret; | ||
| 1418 | } | 1526 | } |
| 1419 | EXPORT_SYMBOL_GPL(register_kprobe); | 1527 | EXPORT_SYMBOL_GPL(register_kprobe); |
| 1420 | 1528 | ||
| @@ -1451,7 +1559,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p) | |||
| 1451 | 1559 | ||
| 1452 | /* Try to disarm and disable this/parent probe */ | 1560 | /* Try to disarm and disable this/parent probe */ |
| 1453 | if (p == orig_p || aggr_kprobe_disabled(orig_p)) { | 1561 | if (p == orig_p || aggr_kprobe_disabled(orig_p)) { |
| 1454 | disarm_kprobe(orig_p); | 1562 | disarm_kprobe(orig_p, true); |
| 1455 | orig_p->flags |= KPROBE_FLAG_DISABLED; | 1563 | orig_p->flags |= KPROBE_FLAG_DISABLED; |
| 1456 | } | 1564 | } |
| 1457 | } | 1565 | } |
| @@ -2049,10 +2157,11 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, | |||
| 2049 | 2157 | ||
| 2050 | if (!pp) | 2158 | if (!pp) |
| 2051 | pp = p; | 2159 | pp = p; |
| 2052 | seq_printf(pi, "%s%s%s\n", | 2160 | seq_printf(pi, "%s%s%s%s\n", |
| 2053 | (kprobe_gone(p) ? "[GONE]" : ""), | 2161 | (kprobe_gone(p) ? "[GONE]" : ""), |
| 2054 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), | 2162 | ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), |
| 2055 | (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); | 2163 | (kprobe_optimized(pp) ? "[OPTIMIZED]" : ""), |
| 2164 | (kprobe_ftrace(pp) ? "[FTRACE]" : "")); | ||
| 2056 | } | 2165 | } |
| 2057 | 2166 | ||
| 2058 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) | 2167 | static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) |
| @@ -2131,14 +2240,12 @@ static void __kprobes arm_all_kprobes(void) | |||
| 2131 | goto already_enabled; | 2240 | goto already_enabled; |
| 2132 | 2241 | ||
| 2133 | /* Arming kprobes doesn't optimize kprobe itself */ | 2242 | /* Arming kprobes doesn't optimize kprobe itself */ |
| 2134 | mutex_lock(&text_mutex); | ||
| 2135 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2243 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 2136 | head = &kprobe_table[i]; | 2244 | head = &kprobe_table[i]; |
| 2137 | hlist_for_each_entry_rcu(p, node, head, hlist) | 2245 | hlist_for_each_entry_rcu(p, node, head, hlist) |
| 2138 | if (!kprobe_disabled(p)) | 2246 | if (!kprobe_disabled(p)) |
| 2139 | __arm_kprobe(p); | 2247 | arm_kprobe(p); |
| 2140 | } | 2248 | } |
| 2141 | mutex_unlock(&text_mutex); | ||
| 2142 | 2249 | ||
| 2143 | kprobes_all_disarmed = false; | 2250 | kprobes_all_disarmed = false; |
| 2144 | printk(KERN_INFO "Kprobes globally enabled\n"); | 2251 | printk(KERN_INFO "Kprobes globally enabled\n"); |
| @@ -2166,15 +2273,13 @@ static void __kprobes disarm_all_kprobes(void) | |||
| 2166 | kprobes_all_disarmed = true; | 2273 | kprobes_all_disarmed = true; |
| 2167 | printk(KERN_INFO "Kprobes globally disabled\n"); | 2274 | printk(KERN_INFO "Kprobes globally disabled\n"); |
| 2168 | 2275 | ||
| 2169 | mutex_lock(&text_mutex); | ||
| 2170 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { | 2276 | for (i = 0; i < KPROBE_TABLE_SIZE; i++) { |
| 2171 | head = &kprobe_table[i]; | 2277 | head = &kprobe_table[i]; |
| 2172 | hlist_for_each_entry_rcu(p, node, head, hlist) { | 2278 | hlist_for_each_entry_rcu(p, node, head, hlist) { |
| 2173 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) | 2279 | if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) |
| 2174 | __disarm_kprobe(p, false); | 2280 | disarm_kprobe(p, false); |
| 2175 | } | 2281 | } |
| 2176 | } | 2282 | } |
| 2177 | mutex_unlock(&text_mutex); | ||
| 2178 | mutex_unlock(&kprobe_mutex); | 2283 | mutex_unlock(&kprobe_mutex); |
| 2179 | 2284 | ||
| 2180 | /* Wait for disarming all kprobes by optimizer */ | 2285 | /* Wait for disarming all kprobes by optimizer */ |
diff --git a/kernel/kthread.c b/kernel/kthread.c index b579af57ea10..146a6fa96825 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
| @@ -37,11 +37,20 @@ struct kthread_create_info | |||
| 37 | }; | 37 | }; |
| 38 | 38 | ||
| 39 | struct kthread { | 39 | struct kthread { |
| 40 | int should_stop; | 40 | unsigned long flags; |
| 41 | unsigned int cpu; | ||
| 41 | void *data; | 42 | void *data; |
| 43 | struct completion parked; | ||
| 42 | struct completion exited; | 44 | struct completion exited; |
| 43 | }; | 45 | }; |
| 44 | 46 | ||
| 47 | enum KTHREAD_BITS { | ||
| 48 | KTHREAD_IS_PER_CPU = 0, | ||
| 49 | KTHREAD_SHOULD_STOP, | ||
| 50 | KTHREAD_SHOULD_PARK, | ||
| 51 | KTHREAD_IS_PARKED, | ||
| 52 | }; | ||
| 53 | |||
| 45 | #define to_kthread(tsk) \ | 54 | #define to_kthread(tsk) \ |
| 46 | container_of((tsk)->vfork_done, struct kthread, exited) | 55 | container_of((tsk)->vfork_done, struct kthread, exited) |
| 47 | 56 | ||
| @@ -52,13 +61,29 @@ struct kthread { | |||
| 52 | * and this will return true. You should then return, and your return | 61 | * and this will return true. You should then return, and your return |
| 53 | * value will be passed through to kthread_stop(). | 62 | * value will be passed through to kthread_stop(). |
| 54 | */ | 63 | */ |
| 55 | int kthread_should_stop(void) | 64 | bool kthread_should_stop(void) |
| 56 | { | 65 | { |
| 57 | return to_kthread(current)->should_stop; | 66 | return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); |
| 58 | } | 67 | } |
| 59 | EXPORT_SYMBOL(kthread_should_stop); | 68 | EXPORT_SYMBOL(kthread_should_stop); |
| 60 | 69 | ||
| 61 | /** | 70 | /** |
| 71 | * kthread_should_park - should this kthread park now? | ||
| 72 | * | ||
| 73 | * When someone calls kthread_park() on your kthread, it will be woken | ||
| 74 | * and this will return true. You should then do the necessary | ||
| 75 | * cleanup and call kthread_parkme() | ||
| 76 | * | ||
| 77 | * Similar to kthread_should_stop(), but this keeps the thread alive | ||
| 78 | * and in a park position. kthread_unpark() "restarts" the thread and | ||
| 79 | * calls the thread function again. | ||
| 80 | */ | ||
| 81 | bool kthread_should_park(void) | ||
| 82 | { | ||
| 83 | return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); | ||
| 84 | } | ||
| 85 | |||
| 86 | /** | ||
| 62 | * kthread_freezable_should_stop - should this freezable kthread return now? | 87 | * kthread_freezable_should_stop - should this freezable kthread return now? |
| 63 | * @was_frozen: optional out parameter, indicates whether %current was frozen | 88 | * @was_frozen: optional out parameter, indicates whether %current was frozen |
| 64 | * | 89 | * |
| @@ -96,6 +121,24 @@ void *kthread_data(struct task_struct *task) | |||
| 96 | return to_kthread(task)->data; | 121 | return to_kthread(task)->data; |
| 97 | } | 122 | } |
| 98 | 123 | ||
| 124 | static void __kthread_parkme(struct kthread *self) | ||
| 125 | { | ||
| 126 | __set_current_state(TASK_INTERRUPTIBLE); | ||
| 127 | while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { | ||
| 128 | if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) | ||
| 129 | complete(&self->parked); | ||
| 130 | schedule(); | ||
| 131 | __set_current_state(TASK_INTERRUPTIBLE); | ||
| 132 | } | ||
| 133 | clear_bit(KTHREAD_IS_PARKED, &self->flags); | ||
| 134 | __set_current_state(TASK_RUNNING); | ||
| 135 | } | ||
| 136 | |||
| 137 | void kthread_parkme(void) | ||
| 138 | { | ||
| 139 | __kthread_parkme(to_kthread(current)); | ||
| 140 | } | ||
| 141 | |||
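__kthread_parkme() above is the thread-side half of a simple handshake: the parker sets KTHREAD_SHOULD_PARK and waits on the parked completion; the thread signals it once, then sleeps until the bit is cleared again. A pthread model of that handshake; the flags and function names are illustrative, not the kernel API:

```c
/* cc -pthread -o park_demo park_demo.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool should_park, is_parked;

/* Worker side: models __kthread_parkme(). */
static void parkme(void)
{
	pthread_mutex_lock(&lock);
	while (should_park) {
		if (!is_parked) {
			is_parked = true;
			pthread_cond_broadcast(&cond);  /* complete(&self->parked) */
		}
		pthread_cond_wait(&cond, &lock);        /* sleep while parked */
	}
	is_parked = false;
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < 3; i++) {
		parkme();                               /* honour a pending park request */
		printf("worker: iteration %d\n", i);
		usleep(1000);
	}
	return NULL;
}

int main(void)
{
	pthread_t t;

	/* Park request issued up front, the way kthread_create_on_cpu()
	 * parks the thread right after creating it. */
	should_park = true;
	pthread_create(&t, NULL, worker, NULL);

	pthread_mutex_lock(&lock);
	while (!is_parked)
		pthread_cond_wait(&cond, &lock);        /* wait_for_completion(&parked) */
	puts("main: worker parked");
	should_park = false;                            /* kthread_unpark() */
	pthread_cond_broadcast(&cond);                  /* wake the worker again */
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	return 0;
}
```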
| 99 | static int kthread(void *_create) | 142 | static int kthread(void *_create) |
| 100 | { | 143 | { |
| 101 | /* Copy data: it's on kthread's stack */ | 144 | /* Copy data: it's on kthread's stack */ |
| @@ -105,9 +148,10 @@ static int kthread(void *_create) | |||
| 105 | struct kthread self; | 148 | struct kthread self; |
| 106 | int ret; | 149 | int ret; |
| 107 | 150 | ||
| 108 | self.should_stop = 0; | 151 | self.flags = 0; |
| 109 | self.data = data; | 152 | self.data = data; |
| 110 | init_completion(&self.exited); | 153 | init_completion(&self.exited); |
| 154 | init_completion(&self.parked); | ||
| 111 | current->vfork_done = &self.exited; | 155 | current->vfork_done = &self.exited; |
| 112 | 156 | ||
| 113 | /* OK, tell user we're spawned, wait for stop or wakeup */ | 157 | /* OK, tell user we're spawned, wait for stop or wakeup */ |
| @@ -117,9 +161,11 @@ static int kthread(void *_create) | |||
| 117 | schedule(); | 161 | schedule(); |
| 118 | 162 | ||
| 119 | ret = -EINTR; | 163 | ret = -EINTR; |
| 120 | if (!self.should_stop) | ||
| 121 | ret = threadfn(data); | ||
| 122 | 164 | ||
| 165 | if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { | ||
| 166 | __kthread_parkme(&self); | ||
| 167 | ret = threadfn(data); | ||
| 168 | } | ||
| 123 | /* we can't just return, we must preserve "self" on stack */ | 169 | /* we can't just return, we must preserve "self" on stack */ |
| 124 | do_exit(ret); | 170 | do_exit(ret); |
| 125 | } | 171 | } |
| @@ -172,8 +218,7 @@ static void create_kthread(struct kthread_create_info *create) | |||
| 172 | * Returns a task_struct or ERR_PTR(-ENOMEM). | 218 | * Returns a task_struct or ERR_PTR(-ENOMEM). |
| 173 | */ | 219 | */ |
| 174 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | 220 | struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), |
| 175 | void *data, | 221 | void *data, int node, |
| 176 | int node, | ||
| 177 | const char namefmt[], | 222 | const char namefmt[], |
| 178 | ...) | 223 | ...) |
| 179 | { | 224 | { |
| @@ -210,6 +255,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), | |||
| 210 | } | 255 | } |
| 211 | EXPORT_SYMBOL(kthread_create_on_node); | 256 | EXPORT_SYMBOL(kthread_create_on_node); |
| 212 | 257 | ||
| 258 | static void __kthread_bind(struct task_struct *p, unsigned int cpu) | ||
| 259 | { | ||
| 260 | /* It's safe because the task is inactive. */ | ||
| 261 | do_set_cpus_allowed(p, cpumask_of(cpu)); | ||
| 262 | p->flags |= PF_THREAD_BOUND; | ||
| 263 | } | ||
| 264 | |||
| 213 | /** | 265 | /** |
| 214 | * kthread_bind - bind a just-created kthread to a cpu. | 266 | * kthread_bind - bind a just-created kthread to a cpu. |
| 215 | * @p: thread created by kthread_create(). | 267 | * @p: thread created by kthread_create(). |
| @@ -226,14 +278,112 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) | |||
| 226 | WARN_ON(1); | 278 | WARN_ON(1); |
| 227 | return; | 279 | return; |
| 228 | } | 280 | } |
| 229 | 281 | __kthread_bind(p, cpu); | |
| 230 | /* It's safe because the task is inactive. */ | ||
| 231 | do_set_cpus_allowed(p, cpumask_of(cpu)); | ||
| 232 | p->flags |= PF_THREAD_BOUND; | ||
| 233 | } | 282 | } |
| 234 | EXPORT_SYMBOL(kthread_bind); | 283 | EXPORT_SYMBOL(kthread_bind); |
| 235 | 284 | ||
| 236 | /** | 285 | /** |
| 286 | * kthread_create_on_cpu - Create a cpu bound kthread | ||
| 287 | * @threadfn: the function to run until signal_pending(current). | ||
| 288 | * @data: data ptr for @threadfn. | ||
| 289 | * @cpu: The cpu on which the thread should be bound. | ||
| 290 | * @namefmt: printf-style name for the thread. Format is restricted | ||
| 291 | * to "name.*%u". Code fills in cpu number. | ||
| 292 | * | ||
| 293 | * Description: This helper function creates and names a kernel thread. | ||
| 294 | * The thread will be woken and put into park mode. | ||
| 295 | */ | ||
| 296 | struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), | ||
| 297 | void *data, unsigned int cpu, | ||
| 298 | const char *namefmt) | ||
| 299 | { | ||
| 300 | struct task_struct *p; | ||
| 301 | |||
| 302 | p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, | ||
| 303 | cpu); | ||
| 304 | if (IS_ERR(p)) | ||
| 305 | return p; | ||
| 306 | set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); | ||
| 307 | to_kthread(p)->cpu = cpu; | ||
| 308 | /* Park the thread to get it out of TASK_UNINTERRUPTIBLE state */ | ||
| 309 | kthread_park(p); | ||
| 310 | return p; | ||
| 311 | } | ||
| 312 | |||
| 313 | static struct kthread *task_get_live_kthread(struct task_struct *k) | ||
| 314 | { | ||
| 315 | struct kthread *kthread; | ||
| 316 | |||
| 317 | get_task_struct(k); | ||
| 318 | kthread = to_kthread(k); | ||
| 319 | /* It might have exited */ | ||
| 320 | barrier(); | ||
| 321 | if (k->vfork_done != NULL) | ||
| 322 | return kthread; | ||
| 323 | return NULL; | ||
| 324 | } | ||
| 325 | |||
| 326 | /** | ||
| 327 | * kthread_unpark - unpark a thread created by kthread_create(). | ||
| 328 | * @k: thread created by kthread_create(). | ||
| 329 | * | ||
| 330 | * Sets kthread_should_park() for @k to return false and wakes the | ||
| 331 | * thread. If the thread is marked percpu then it's bound to the | ||
| 332 | * cpu again before being woken. | ||
| 333 | */ | ||
| 334 | void kthread_unpark(struct task_struct *k) | ||
| 335 | { | ||
| 336 | struct kthread *kthread = task_get_live_kthread(k); | ||
| 337 | |||
| 338 | if (kthread) { | ||
| 339 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); | ||
| 340 | /* | ||
| 341 | * We clear the IS_PARKED bit here as we don't wait | ||
| 342 | * until the task has left the park code. If we parked | ||
| 343 | * again before that happened, we would see a stale | ||
| 344 | * IS_PARKED bit that is about to be cleared. | ||
| 345 | */ | ||
| 346 | if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { | ||
| 347 | if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) | ||
| 348 | __kthread_bind(k, kthread->cpu); | ||
| 349 | wake_up_process(k); | ||
| 350 | } | ||
| 351 | } | ||
| 352 | put_task_struct(k); | ||
| 353 | } | ||
| 354 | |||
| 355 | /** | ||
| 356 | * kthread_park - park a thread created by kthread_create(). | ||
| 357 | * @k: thread created by kthread_create(). | ||
| 358 | * | ||
| 359 | * Sets kthread_should_park() for @k to return true, wakes it, and | ||
| 360 | * waits for it to return. This can also be called after kthread_create() | ||
| 361 | * instead of calling wake_up_process(): the thread will park without | ||
| 362 | * calling threadfn(). | ||
| 363 | * | ||
| 364 | * Returns 0 if the thread is parked, -ENOSYS if the thread exited. | ||
| 365 | * If called by the kthread itself, just the park bit is set. | ||
| 366 | */ | ||
| 367 | int kthread_park(struct task_struct *k) | ||
| 368 | { | ||
| 369 | struct kthread *kthread = task_get_live_kthread(k); | ||
| 370 | int ret = -ENOSYS; | ||
| 371 | |||
| 372 | if (kthread) { | ||
| 373 | if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { | ||
| 374 | set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); | ||
| 375 | if (k != current) { | ||
| 376 | wake_up_process(k); | ||
| 377 | wait_for_completion(&kthread->parked); | ||
| 378 | } | ||
| 379 | } | ||
| 380 | ret = 0; | ||
| 381 | } | ||
| 382 | put_task_struct(k); | ||
| 383 | return ret; | ||
| 384 | } | ||
| 385 | |||
| 386 | /** | ||
| 237 | * kthread_stop - stop a thread created by kthread_create(). | 387 | * kthread_stop - stop a thread created by kthread_create(). |
| 238 | * @k: thread created by kthread_create(). | 388 | * @k: thread created by kthread_create(). |
| 239 | * | 389 | * |
| @@ -250,16 +400,13 @@ EXPORT_SYMBOL(kthread_bind); | |||
| 250 | */ | 400 | */ |
| 251 | int kthread_stop(struct task_struct *k) | 401 | int kthread_stop(struct task_struct *k) |
| 252 | { | 402 | { |
| 253 | struct kthread *kthread; | 403 | struct kthread *kthread = task_get_live_kthread(k); |
| 254 | int ret; | 404 | int ret; |
| 255 | 405 | ||
| 256 | trace_sched_kthread_stop(k); | 406 | trace_sched_kthread_stop(k); |
| 257 | get_task_struct(k); | 407 | if (kthread) { |
| 258 | 408 | set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); | |
| 259 | kthread = to_kthread(k); | 409 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); |
| 260 | barrier(); /* it might have exited */ | ||
| 261 | if (k->vfork_done != NULL) { | ||
| 262 | kthread->should_stop = 1; | ||
| 263 | wake_up_process(k); | 410 | wake_up_process(k); |
| 264 | wait_for_completion(&kthread->exited); | 411 | wait_for_completion(&kthread->exited); |
| 265 | } | 412 | } |
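Together with kthread_park()/kthread_unpark(), kthread_create_on_cpu() gives callers a create-parked / unpark / park lifecycle for per-cpu threads. A hedged usage sketch, in which the worker function, variables and name format are hypothetical:

#include <linux/err.h>
#include <linux/kthread.h>

static struct task_struct *my_task;

static int my_worker_fn(void *data);	/* polls kthread_should_park() as sketched earlier */

static int my_worker_start(unsigned int cpu)
{
	my_task = kthread_create_on_cpu(my_worker_fn, NULL, cpu, "my_worker/%u");
	if (IS_ERR(my_task))
		return PTR_ERR(my_task);
	/* kthread_create_on_cpu() hands the thread back already parked. */
	kthread_unpark(my_task);	/* rebinds to @cpu and lets it run */
	return 0;
}

static void my_worker_cpu_down(void)
{
	kthread_park(my_task);		/* returns once the thread is parked */
}

A later kthread_unpark() rebinds the thread to its cpu and resumes it; kthread_stop() still tears it down even while parked.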
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index ea9ee4518c35..7981e5b2350d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
| @@ -2998,6 +2998,42 @@ EXPORT_SYMBOL_GPL(lockdep_init_map); | |||
| 2998 | 2998 | ||
| 2999 | struct lock_class_key __lockdep_no_validate__; | 2999 | struct lock_class_key __lockdep_no_validate__; |
| 3000 | 3000 | ||
| 3001 | static int | ||
| 3002 | print_lock_nested_lock_not_held(struct task_struct *curr, | ||
| 3003 | struct held_lock *hlock, | ||
| 3004 | unsigned long ip) | ||
| 3005 | { | ||
| 3006 | if (!debug_locks_off()) | ||
| 3007 | return 0; | ||
| 3008 | if (debug_locks_silent) | ||
| 3009 | return 0; | ||
| 3010 | |||
| 3011 | printk("\n"); | ||
| 3012 | printk("==================================\n"); | ||
| 3013 | printk("[ BUG: Nested lock was not taken ]\n"); | ||
| 3014 | print_kernel_ident(); | ||
| 3015 | printk("----------------------------------\n"); | ||
| 3016 | |||
| 3017 | printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); | ||
| 3018 | print_lock(hlock); | ||
| 3019 | |||
| 3020 | printk("\nbut this task is not holding:\n"); | ||
| 3021 | printk("%s\n", hlock->nest_lock->name); | ||
| 3022 | |||
| 3023 | printk("\nstack backtrace:\n"); | ||
| 3024 | dump_stack(); | ||
| 3025 | |||
| 3026 | printk("\nother info that might help us debug this:\n"); | ||
| 3027 | lockdep_print_held_locks(curr); | ||
| 3028 | |||
| 3029 | printk("\nstack backtrace:\n"); | ||
| 3030 | dump_stack(); | ||
| 3031 | |||
| 3032 | return 0; | ||
| 3033 | } | ||
| 3034 | |||
| 3035 | static int __lock_is_held(struct lockdep_map *lock); | ||
| 3036 | |||
| 3001 | /* | 3037 | /* |
| 3002 | * This gets called for every mutex_lock*()/spin_lock*() operation. | 3038 | * This gets called for every mutex_lock*()/spin_lock*() operation. |
| 3003 | * We maintain the dependency maps and validate the locking attempt: | 3039 | * We maintain the dependency maps and validate the locking attempt: |
| @@ -3139,6 +3175,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
| 3139 | } | 3175 | } |
| 3140 | chain_key = iterate_chain_key(chain_key, id); | 3176 | chain_key = iterate_chain_key(chain_key, id); |
| 3141 | 3177 | ||
| 3178 | if (nest_lock && !__lock_is_held(nest_lock)) | ||
| 3179 | return print_lock_nested_lock_not_held(curr, hlock, ip); | ||
| 3180 | |||
| 3142 | if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) | 3181 | if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) |
| 3143 | return 0; | 3182 | return 0; |
| 3144 | 3183 | ||
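The new check only triggers for acquisitions that carry a nest_lock annotation, for example via mutex_lock_nest_lock(). A hedged sketch of the pattern being validated; the parent/child structures are invented for illustration:

#include <linux/mutex.h>

struct parent {
	struct mutex mux;	/* serializes all children */
};

struct child {
	struct parent *parent;
	struct mutex mux;
};

static void lock_child(struct child *c)
{
	mutex_lock(&c->parent->mux);
	/*
	 * The nest_lock annotation tells lockdep that child mutexes are
	 * serialized by the parent mutex.  With this patch lockdep also
	 * verifies that the parent mutex really is held here; dropping
	 * the mutex_lock() above now produces the
	 * "[ BUG: Nested lock was not taken ]" report instead of being
	 * silently accepted.
	 */
	mutex_lock_nest_lock(&c->mux, &c->parent->mux);
	/* ...use the child... */
	mutex_unlock(&c->mux);
	mutex_unlock(&c->parent->mux);
}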
diff --git a/kernel/pid.c b/kernel/pid.c index e86b291ad834..aebd4f5aaf41 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
| @@ -479,6 +479,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) | |||
| 479 | } | 479 | } |
| 480 | return nr; | 480 | return nr; |
| 481 | } | 481 | } |
| 482 | EXPORT_SYMBOL_GPL(pid_nr_ns); | ||
| 482 | 483 | ||
| 483 | pid_t pid_vnr(struct pid *pid) | 484 | pid_t pid_vnr(struct pid *pid) |
| 484 | { | 485 | { |
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6144bab8fd8e..478bad2745e3 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/slab.h> | 16 | #include <linux/slab.h> |
| 17 | #include <linux/proc_fs.h> | 17 | #include <linux/proc_fs.h> |
| 18 | #include <linux/reboot.h> | 18 | #include <linux/reboot.h> |
| 19 | #include <linux/export.h> | ||
| 19 | 20 | ||
| 20 | #define BITS_PER_PAGE (PAGE_SIZE*8) | 21 | #define BITS_PER_PAGE (PAGE_SIZE*8) |
| 21 | 22 | ||
| @@ -144,6 +145,7 @@ void free_pid_ns(struct kref *kref) | |||
| 144 | if (parent != NULL) | 145 | if (parent != NULL) |
| 145 | put_pid_ns(parent); | 146 | put_pid_ns(parent); |
| 146 | } | 147 | } |
| 148 | EXPORT_SYMBOL_GPL(free_pid_ns); | ||
| 147 | 149 | ||
| 148 | void zap_pid_ns_processes(struct pid_namespace *pid_ns) | 150 | void zap_pid_ns_processes(struct pid_namespace *pid_ns) |
| 149 | { | 151 | { |
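These exports only matter to modules. As a hedged illustration, a module can now translate a task's pid into the number it has inside a particular pid namespace; the helper below is hypothetical:

#include <linux/pid.h>
#include <linux/pid_namespace.h>
#include <linux/sched.h>

/* Report @task's pid as seen from @ns; returns 0 if the task is not visible there. */
static pid_t pid_as_seen_in(struct task_struct *task, struct pid_namespace *ns)
{
	return pid_nr_ns(task_pid(task), ns);
}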
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a70518c9d82f..5dfdc9ea180b 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
| @@ -263,6 +263,10 @@ config PM_GENERIC_DOMAINS | |||
| 263 | bool | 263 | bool |
| 264 | depends on PM | 264 | depends on PM |
| 265 | 265 | ||
| 266 | config PM_GENERIC_DOMAINS_SLEEP | ||
| 267 | def_bool y | ||
| 268 | depends on PM_SLEEP && PM_GENERIC_DOMAINS | ||
| 269 | |||
| 266 | config PM_GENERIC_DOMAINS_RUNTIME | 270 | config PM_GENERIC_DOMAINS_RUNTIME |
| 267 | def_bool y | 271 | def_bool y |
| 268 | depends on PM_RUNTIME && PM_GENERIC_DOMAINS | 272 | depends on PM_RUNTIME && PM_GENERIC_DOMAINS |
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index d52359374e85..68197a4e8fc9 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c | |||
| @@ -37,7 +37,7 @@ static struct sysrq_key_op sysrq_poweroff_op = { | |||
| 37 | .enable_mask = SYSRQ_ENABLE_BOOT, | 37 | .enable_mask = SYSRQ_ENABLE_BOOT, |
| 38 | }; | 38 | }; |
| 39 | 39 | ||
| 40 | static int pm_sysrq_init(void) | 40 | static int __init pm_sysrq_init(void) |
| 41 | { | 41 | { |
| 42 | register_sysrq_key('o', &sysrq_poweroff_op); | 42 | register_sysrq_key('o', &sysrq_poweroff_op); |
| 43 | return 0; | 43 | return 0; |
diff --git a/kernel/power/process.c b/kernel/power/process.c index 19db29f67558..87da817f9e13 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
| @@ -79,7 +79,7 @@ static int try_to_freeze_tasks(bool user_only) | |||
| 79 | 79 | ||
| 80 | /* | 80 | /* |
| 81 | * We need to retry, but first give the freezing tasks some | 81 | * We need to retry, but first give the freezing tasks some |
| 82 | * time to enter the regrigerator. | 82 | * time to enter the refrigerator. |
| 83 | */ | 83 | */ |
| 84 | msleep(10); | 84 | msleep(10); |
| 85 | } | 85 | } |
diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 6a031e684026..846bd42c7ed1 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c | |||
| @@ -139,6 +139,7 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) | |||
| 139 | default: | 139 | default: |
| 140 | /* runtime check for not using enum */ | 140 | /* runtime check for not using enum */ |
| 141 | BUG(); | 141 | BUG(); |
| 142 | return PM_QOS_DEFAULT_VALUE; | ||
| 142 | } | 143 | } |
| 143 | } | 144 | } |
| 144 | 145 | ||
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a232bb59d93f..1f5e55dda955 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
| @@ -180,7 +180,8 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) | |||
| 180 | return has_ns_capability(current, ns, CAP_SYS_PTRACE); | 180 | return has_ns_capability(current, ns, CAP_SYS_PTRACE); |
| 181 | } | 181 | } |
| 182 | 182 | ||
| 183 | int __ptrace_may_access(struct task_struct *task, unsigned int mode) | 183 | /* Returns 0 on success, -errno on denial. */ |
| 184 | static int __ptrace_may_access(struct task_struct *task, unsigned int mode) | ||
| 184 | { | 185 | { |
| 185 | const struct cred *cred = current_cred(), *tcred; | 186 | const struct cred *cred = current_cred(), *tcred; |
| 186 | 187 | ||
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 4e6a61b15e86..29ca1c6da594 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
| @@ -45,6 +45,7 @@ | |||
| 45 | #include <linux/mutex.h> | 45 | #include <linux/mutex.h> |
| 46 | #include <linux/export.h> | 46 | #include <linux/export.h> |
| 47 | #include <linux/hardirq.h> | 47 | #include <linux/hardirq.h> |
| 48 | #include <linux/delay.h> | ||
| 48 | 49 | ||
| 49 | #define CREATE_TRACE_POINTS | 50 | #define CREATE_TRACE_POINTS |
| 50 | #include <trace/events/rcu.h> | 51 | #include <trace/events/rcu.h> |
| @@ -81,6 +82,9 @@ void __rcu_read_unlock(void) | |||
| 81 | } else { | 82 | } else { |
| 82 | barrier(); /* critical section before exit code. */ | 83 | barrier(); /* critical section before exit code. */ |
| 83 | t->rcu_read_lock_nesting = INT_MIN; | 84 | t->rcu_read_lock_nesting = INT_MIN; |
| 85 | #ifdef CONFIG_PROVE_RCU_DELAY | ||
| 86 | udelay(10); /* Make preemption more probable. */ | ||
| 87 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ | ||
| 84 | barrier(); /* assign before ->rcu_read_unlock_special load */ | 88 | barrier(); /* assign before ->rcu_read_unlock_special load */ |
| 85 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | 89 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) |
| 86 | rcu_read_unlock_special(t); | 90 | rcu_read_unlock_special(t); |
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index 547b1fe5b052..e4c6a598d6f7 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c | |||
| @@ -56,25 +56,28 @@ static void __call_rcu(struct rcu_head *head, | |||
| 56 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 56 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
| 57 | 57 | ||
| 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ | 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ |
| 59 | static void rcu_idle_enter_common(long long oldval) | 59 | static void rcu_idle_enter_common(long long newval) |
| 60 | { | 60 | { |
| 61 | if (rcu_dynticks_nesting) { | 61 | if (newval) { |
| 62 | RCU_TRACE(trace_rcu_dyntick("--=", | 62 | RCU_TRACE(trace_rcu_dyntick("--=", |
| 63 | oldval, rcu_dynticks_nesting)); | 63 | rcu_dynticks_nesting, newval)); |
| 64 | rcu_dynticks_nesting = newval; | ||
| 64 | return; | 65 | return; |
| 65 | } | 66 | } |
| 66 | RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting)); | 67 | RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); |
| 67 | if (!is_idle_task(current)) { | 68 | if (!is_idle_task(current)) { |
| 68 | struct task_struct *idle = idle_task(smp_processor_id()); | 69 | struct task_struct *idle = idle_task(smp_processor_id()); |
| 69 | 70 | ||
| 70 | RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", | 71 | RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", |
| 71 | oldval, rcu_dynticks_nesting)); | 72 | rcu_dynticks_nesting, newval)); |
| 72 | ftrace_dump(DUMP_ALL); | 73 | ftrace_dump(DUMP_ALL); |
| 73 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | 74 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", |
| 74 | current->pid, current->comm, | 75 | current->pid, current->comm, |
| 75 | idle->pid, idle->comm); /* must be idle task! */ | 76 | idle->pid, idle->comm); /* must be idle task! */ |
| 76 | } | 77 | } |
| 77 | rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ | 78 | rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ |
| 79 | barrier(); | ||
| 80 | rcu_dynticks_nesting = newval; | ||
| 78 | } | 81 | } |
| 79 | 82 | ||
| 80 | /* | 83 | /* |
| @@ -84,17 +87,16 @@ static void rcu_idle_enter_common(long long oldval) | |||
| 84 | void rcu_idle_enter(void) | 87 | void rcu_idle_enter(void) |
| 85 | { | 88 | { |
| 86 | unsigned long flags; | 89 | unsigned long flags; |
| 87 | long long oldval; | 90 | long long newval; |
| 88 | 91 | ||
| 89 | local_irq_save(flags); | 92 | local_irq_save(flags); |
| 90 | oldval = rcu_dynticks_nesting; | ||
| 91 | WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); | 93 | WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); |
| 92 | if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == | 94 | if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == |
| 93 | DYNTICK_TASK_NEST_VALUE) | 95 | DYNTICK_TASK_NEST_VALUE) |
| 94 | rcu_dynticks_nesting = 0; | 96 | newval = 0; |
| 95 | else | 97 | else |
| 96 | rcu_dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | 98 | newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; |
| 97 | rcu_idle_enter_common(oldval); | 99 | rcu_idle_enter_common(newval); |
| 98 | local_irq_restore(flags); | 100 | local_irq_restore(flags); |
| 99 | } | 101 | } |
| 100 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 102 | EXPORT_SYMBOL_GPL(rcu_idle_enter); |
| @@ -105,15 +107,15 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); | |||
| 105 | void rcu_irq_exit(void) | 107 | void rcu_irq_exit(void) |
| 106 | { | 108 | { |
| 107 | unsigned long flags; | 109 | unsigned long flags; |
| 108 | long long oldval; | 110 | long long newval; |
| 109 | 111 | ||
| 110 | local_irq_save(flags); | 112 | local_irq_save(flags); |
| 111 | oldval = rcu_dynticks_nesting; | 113 | newval = rcu_dynticks_nesting - 1; |
| 112 | rcu_dynticks_nesting--; | 114 | WARN_ON_ONCE(newval < 0); |
| 113 | WARN_ON_ONCE(rcu_dynticks_nesting < 0); | 115 | rcu_idle_enter_common(newval); |
| 114 | rcu_idle_enter_common(oldval); | ||
| 115 | local_irq_restore(flags); | 116 | local_irq_restore(flags); |
| 116 | } | 117 | } |
| 118 | EXPORT_SYMBOL_GPL(rcu_irq_exit); | ||
| 117 | 119 | ||
| 118 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ | 120 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ |
| 119 | static void rcu_idle_exit_common(long long oldval) | 121 | static void rcu_idle_exit_common(long long oldval) |
| @@ -171,6 +173,7 @@ void rcu_irq_enter(void) | |||
| 171 | rcu_idle_exit_common(oldval); | 173 | rcu_idle_exit_common(oldval); |
| 172 | local_irq_restore(flags); | 174 | local_irq_restore(flags); |
| 173 | } | 175 | } |
| 176 | EXPORT_SYMBOL_GPL(rcu_irq_enter); | ||
| 174 | 177 | ||
| 175 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 178 | #ifdef CONFIG_DEBUG_LOCK_ALLOC |
| 176 | 179 | ||
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 918fd1e8509c..3d0190282204 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h | |||
| @@ -278,7 +278,7 @@ static int rcu_boost(void) | |||
| 278 | rcu_preempt_ctrlblk.exp_tasks == NULL) | 278 | rcu_preempt_ctrlblk.exp_tasks == NULL) |
| 279 | return 0; /* Nothing to boost. */ | 279 | return 0; /* Nothing to boost. */ |
| 280 | 280 | ||
| 281 | raw_local_irq_save(flags); | 281 | local_irq_save(flags); |
| 282 | 282 | ||
| 283 | /* | 283 | /* |
| 284 | * Recheck with irqs disabled: all tasks in need of boosting | 284 | * Recheck with irqs disabled: all tasks in need of boosting |
| @@ -287,7 +287,7 @@ static int rcu_boost(void) | |||
| 287 | */ | 287 | */ |
| 288 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && | 288 | if (rcu_preempt_ctrlblk.boost_tasks == NULL && |
| 289 | rcu_preempt_ctrlblk.exp_tasks == NULL) { | 289 | rcu_preempt_ctrlblk.exp_tasks == NULL) { |
| 290 | raw_local_irq_restore(flags); | 290 | local_irq_restore(flags); |
| 291 | return 0; | 291 | return 0; |
| 292 | } | 292 | } |
| 293 | 293 | ||
| @@ -317,7 +317,7 @@ static int rcu_boost(void) | |||
| 317 | t = container_of(tb, struct task_struct, rcu_node_entry); | 317 | t = container_of(tb, struct task_struct, rcu_node_entry); |
| 318 | rt_mutex_init_proxy_locked(&mtx, t); | 318 | rt_mutex_init_proxy_locked(&mtx, t); |
| 319 | t->rcu_boost_mutex = &mtx; | 319 | t->rcu_boost_mutex = &mtx; |
| 320 | raw_local_irq_restore(flags); | 320 | local_irq_restore(flags); |
| 321 | rt_mutex_lock(&mtx); | 321 | rt_mutex_lock(&mtx); |
| 322 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | 322 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ |
| 323 | 323 | ||
| @@ -991,9 +991,9 @@ static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) | |||
| 991 | { | 991 | { |
| 992 | unsigned long flags; | 992 | unsigned long flags; |
| 993 | 993 | ||
| 994 | raw_local_irq_save(flags); | 994 | local_irq_save(flags); |
| 995 | rcp->qlen -= n; | 995 | rcp->qlen -= n; |
| 996 | raw_local_irq_restore(flags); | 996 | local_irq_restore(flags); |
| 997 | } | 997 | } |
| 998 | 998 | ||
| 999 | /* | 999 | /* |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 25b15033c61f..aaa7b9f3532a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
| @@ -53,10 +53,11 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@fre | |||
| 53 | 53 | ||
| 54 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ | 54 | static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ |
| 55 | static int nfakewriters = 4; /* # fake writer threads */ | 55 | static int nfakewriters = 4; /* # fake writer threads */ |
| 56 | static int stat_interval; /* Interval between stats, in seconds. */ | 56 | static int stat_interval = 60; /* Interval between stats, in seconds. */ |
| 57 | /* Defaults to "only at end of test". */ | 57 | /* Zero means "only at end of test". */ |
| 58 | static bool verbose; /* Print more debug info. */ | 58 | static bool verbose; /* Print more debug info. */ |
| 59 | static bool test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | 59 | static bool test_no_idle_hz = true; |
| 60 | /* Test RCU support for tickless idle CPUs. */ | ||
| 60 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ | 61 | static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ |
| 61 | static int stutter = 5; /* Start/stop testing interval (in sec) */ | 62 | static int stutter = 5; /* Start/stop testing interval (in sec) */ |
| 62 | static int irqreader = 1; /* RCU readers from irq (timers). */ | 63 | static int irqreader = 1; /* RCU readers from irq (timers). */ |
| @@ -119,11 +120,11 @@ MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); | |||
| 119 | 120 | ||
| 120 | #define TORTURE_FLAG "-torture:" | 121 | #define TORTURE_FLAG "-torture:" |
| 121 | #define PRINTK_STRING(s) \ | 122 | #define PRINTK_STRING(s) \ |
| 122 | do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) | 123 | do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
| 123 | #define VERBOSE_PRINTK_STRING(s) \ | 124 | #define VERBOSE_PRINTK_STRING(s) \ |
| 124 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) | 125 | do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
| 125 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | 126 | #define VERBOSE_PRINTK_ERRSTRING(s) \ |
| 126 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) | 127 | do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) |
| 127 | 128 | ||
| 128 | static char printk_buf[4096]; | 129 | static char printk_buf[4096]; |
| 129 | 130 | ||
| @@ -176,8 +177,14 @@ static long n_rcu_torture_boosts; | |||
| 176 | static long n_rcu_torture_timers; | 177 | static long n_rcu_torture_timers; |
| 177 | static long n_offline_attempts; | 178 | static long n_offline_attempts; |
| 178 | static long n_offline_successes; | 179 | static long n_offline_successes; |
| 180 | static unsigned long sum_offline; | ||
| 181 | static int min_offline = -1; | ||
| 182 | static int max_offline; | ||
| 179 | static long n_online_attempts; | 183 | static long n_online_attempts; |
| 180 | static long n_online_successes; | 184 | static long n_online_successes; |
| 185 | static unsigned long sum_online; | ||
| 186 | static int min_online = -1; | ||
| 187 | static int max_online; | ||
| 181 | static long n_barrier_attempts; | 188 | static long n_barrier_attempts; |
| 182 | static long n_barrier_successes; | 189 | static long n_barrier_successes; |
| 183 | static struct list_head rcu_torture_removed; | 190 | static struct list_head rcu_torture_removed; |
| @@ -235,7 +242,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, | |||
| 235 | if (fullstop == FULLSTOP_DONTSTOP) | 242 | if (fullstop == FULLSTOP_DONTSTOP) |
| 236 | fullstop = FULLSTOP_SHUTDOWN; | 243 | fullstop = FULLSTOP_SHUTDOWN; |
| 237 | else | 244 | else |
| 238 | printk(KERN_WARNING /* but going down anyway, so... */ | 245 | pr_warn(/* but going down anyway, so... */ |
| 239 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); | 246 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); |
| 240 | mutex_unlock(&fullstop_mutex); | 247 | mutex_unlock(&fullstop_mutex); |
| 241 | return NOTIFY_DONE; | 248 | return NOTIFY_DONE; |
| @@ -248,7 +255,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, | |||
| 248 | static void rcutorture_shutdown_absorb(char *title) | 255 | static void rcutorture_shutdown_absorb(char *title) |
| 249 | { | 256 | { |
| 250 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { | 257 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { |
| 251 | printk(KERN_NOTICE | 258 | pr_notice( |
| 252 | "rcutorture thread %s parking due to system shutdown\n", | 259 | "rcutorture thread %s parking due to system shutdown\n", |
| 253 | title); | 260 | title); |
| 254 | schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); | 261 | schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); |
| @@ -1214,11 +1221,13 @@ rcu_torture_printk(char *page) | |||
| 1214 | n_rcu_torture_boost_failure, | 1221 | n_rcu_torture_boost_failure, |
| 1215 | n_rcu_torture_boosts, | 1222 | n_rcu_torture_boosts, |
| 1216 | n_rcu_torture_timers); | 1223 | n_rcu_torture_timers); |
| 1217 | cnt += sprintf(&page[cnt], "onoff: %ld/%ld:%ld/%ld ", | 1224 | cnt += sprintf(&page[cnt], |
| 1218 | n_online_successes, | 1225 | "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", |
| 1219 | n_online_attempts, | 1226 | n_online_successes, n_online_attempts, |
| 1220 | n_offline_successes, | 1227 | n_offline_successes, n_offline_attempts, |
| 1221 | n_offline_attempts); | 1228 | min_online, max_online, |
| 1229 | min_offline, max_offline, | ||
| 1230 | sum_online, sum_offline, HZ); | ||
| 1222 | cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", | 1231 | cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", |
| 1223 | n_barrier_successes, | 1232 | n_barrier_successes, |
| 1224 | n_barrier_attempts, | 1233 | n_barrier_attempts, |
| @@ -1267,7 +1276,7 @@ rcu_torture_stats_print(void) | |||
| 1267 | int cnt; | 1276 | int cnt; |
| 1268 | 1277 | ||
| 1269 | cnt = rcu_torture_printk(printk_buf); | 1278 | cnt = rcu_torture_printk(printk_buf); |
| 1270 | printk(KERN_ALERT "%s", printk_buf); | 1279 | pr_alert("%s", printk_buf); |
| 1271 | } | 1280 | } |
| 1272 | 1281 | ||
| 1273 | /* | 1282 | /* |
| @@ -1380,20 +1389,20 @@ rcu_torture_stutter(void *arg) | |||
| 1380 | static inline void | 1389 | static inline void |
| 1381 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) | 1390 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) |
| 1382 | { | 1391 | { |
| 1383 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1392 | pr_alert("%s" TORTURE_FLAG |
| 1384 | "--- %s: nreaders=%d nfakewriters=%d " | 1393 | "--- %s: nreaders=%d nfakewriters=%d " |
| 1385 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 1394 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
| 1386 | "shuffle_interval=%d stutter=%d irqreader=%d " | 1395 | "shuffle_interval=%d stutter=%d irqreader=%d " |
| 1387 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " | 1396 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " |
| 1388 | "test_boost=%d/%d test_boost_interval=%d " | 1397 | "test_boost=%d/%d test_boost_interval=%d " |
| 1389 | "test_boost_duration=%d shutdown_secs=%d " | 1398 | "test_boost_duration=%d shutdown_secs=%d " |
| 1390 | "onoff_interval=%d onoff_holdoff=%d\n", | 1399 | "onoff_interval=%d onoff_holdoff=%d\n", |
| 1391 | torture_type, tag, nrealreaders, nfakewriters, | 1400 | torture_type, tag, nrealreaders, nfakewriters, |
| 1392 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | 1401 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, |
| 1393 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, | 1402 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, |
| 1394 | test_boost, cur_ops->can_boost, | 1403 | test_boost, cur_ops->can_boost, |
| 1395 | test_boost_interval, test_boost_duration, shutdown_secs, | 1404 | test_boost_interval, test_boost_duration, shutdown_secs, |
| 1396 | onoff_interval, onoff_holdoff); | 1405 | onoff_interval, onoff_holdoff); |
| 1397 | } | 1406 | } |
| 1398 | 1407 | ||
| 1399 | static struct notifier_block rcutorture_shutdown_nb = { | 1408 | static struct notifier_block rcutorture_shutdown_nb = { |
| @@ -1460,9 +1469,9 @@ rcu_torture_shutdown(void *arg) | |||
| 1460 | !kthread_should_stop()) { | 1469 | !kthread_should_stop()) { |
| 1461 | delta = shutdown_time - jiffies_snap; | 1470 | delta = shutdown_time - jiffies_snap; |
| 1462 | if (verbose) | 1471 | if (verbose) |
| 1463 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1472 | pr_alert("%s" TORTURE_FLAG |
| 1464 | "rcu_torture_shutdown task: %lu jiffies remaining\n", | 1473 | "rcu_torture_shutdown task: %lu jiffies remaining\n", |
| 1465 | torture_type, delta); | 1474 | torture_type, delta); |
| 1466 | schedule_timeout_interruptible(delta); | 1475 | schedule_timeout_interruptible(delta); |
| 1467 | jiffies_snap = ACCESS_ONCE(jiffies); | 1476 | jiffies_snap = ACCESS_ONCE(jiffies); |
| 1468 | } | 1477 | } |
| @@ -1490,8 +1499,10 @@ static int __cpuinit | |||
| 1490 | rcu_torture_onoff(void *arg) | 1499 | rcu_torture_onoff(void *arg) |
| 1491 | { | 1500 | { |
| 1492 | int cpu; | 1501 | int cpu; |
| 1502 | unsigned long delta; | ||
| 1493 | int maxcpu = -1; | 1503 | int maxcpu = -1; |
| 1494 | DEFINE_RCU_RANDOM(rand); | 1504 | DEFINE_RCU_RANDOM(rand); |
| 1505 | unsigned long starttime; | ||
| 1495 | 1506 | ||
| 1496 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); | 1507 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); |
| 1497 | for_each_online_cpu(cpu) | 1508 | for_each_online_cpu(cpu) |
| @@ -1506,29 +1517,51 @@ rcu_torture_onoff(void *arg) | |||
| 1506 | cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); | 1517 | cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); |
| 1507 | if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { | 1518 | if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { |
| 1508 | if (verbose) | 1519 | if (verbose) |
| 1509 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1520 | pr_alert("%s" TORTURE_FLAG |
| 1510 | "rcu_torture_onoff task: offlining %d\n", | 1521 | "rcu_torture_onoff task: offlining %d\n", |
| 1511 | torture_type, cpu); | 1522 | torture_type, cpu); |
| 1523 | starttime = jiffies; | ||
| 1512 | n_offline_attempts++; | 1524 | n_offline_attempts++; |
| 1513 | if (cpu_down(cpu) == 0) { | 1525 | if (cpu_down(cpu) == 0) { |
| 1514 | if (verbose) | 1526 | if (verbose) |
| 1515 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1527 | pr_alert("%s" TORTURE_FLAG |
| 1516 | "rcu_torture_onoff task: offlined %d\n", | 1528 | "rcu_torture_onoff task: offlined %d\n", |
| 1517 | torture_type, cpu); | 1529 | torture_type, cpu); |
| 1518 | n_offline_successes++; | 1530 | n_offline_successes++; |
| 1531 | delta = jiffies - starttime; | ||
| 1532 | sum_offline += delta; | ||
| 1533 | if (min_offline < 0) { | ||
| 1534 | min_offline = delta; | ||
| 1535 | max_offline = delta; | ||
| 1536 | } | ||
| 1537 | if (min_offline > delta) | ||
| 1538 | min_offline = delta; | ||
| 1539 | if (max_offline < delta) | ||
| 1540 | max_offline = delta; | ||
| 1519 | } | 1541 | } |
| 1520 | } else if (cpu_is_hotpluggable(cpu)) { | 1542 | } else if (cpu_is_hotpluggable(cpu)) { |
| 1521 | if (verbose) | 1543 | if (verbose) |
| 1522 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1544 | pr_alert("%s" TORTURE_FLAG |
| 1523 | "rcu_torture_onoff task: onlining %d\n", | 1545 | "rcu_torture_onoff task: onlining %d\n", |
| 1524 | torture_type, cpu); | 1546 | torture_type, cpu); |
| 1547 | starttime = jiffies; | ||
| 1525 | n_online_attempts++; | 1548 | n_online_attempts++; |
| 1526 | if (cpu_up(cpu) == 0) { | 1549 | if (cpu_up(cpu) == 0) { |
| 1527 | if (verbose) | 1550 | if (verbose) |
| 1528 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1551 | pr_alert("%s" TORTURE_FLAG |
| 1529 | "rcu_torture_onoff task: onlined %d\n", | 1552 | "rcu_torture_onoff task: onlined %d\n", |
| 1530 | torture_type, cpu); | 1553 | torture_type, cpu); |
| 1531 | n_online_successes++; | 1554 | n_online_successes++; |
| 1555 | delta = jiffies - starttime; | ||
| 1556 | sum_online += delta; | ||
| 1557 | if (min_online < 0) { | ||
| 1558 | min_online = delta; | ||
| 1559 | max_online = delta; | ||
| 1560 | } | ||
| 1561 | if (min_online > delta) | ||
| 1562 | min_online = delta; | ||
| 1563 | if (max_online < delta) | ||
| 1564 | max_online = delta; | ||
| 1532 | } | 1565 | } |
| 1533 | } | 1566 | } |
| 1534 | schedule_timeout_interruptible(onoff_interval * HZ); | 1567 | schedule_timeout_interruptible(onoff_interval * HZ); |
| @@ -1593,14 +1626,14 @@ static int __cpuinit rcu_torture_stall(void *args) | |||
| 1593 | if (!kthread_should_stop()) { | 1626 | if (!kthread_should_stop()) { |
| 1594 | stop_at = get_seconds() + stall_cpu; | 1627 | stop_at = get_seconds() + stall_cpu; |
| 1595 | /* RCU CPU stall is expected behavior in following code. */ | 1628 | /* RCU CPU stall is expected behavior in following code. */ |
| 1596 | printk(KERN_ALERT "rcu_torture_stall start.\n"); | 1629 | pr_alert("rcu_torture_stall start.\n"); |
| 1597 | rcu_read_lock(); | 1630 | rcu_read_lock(); |
| 1598 | preempt_disable(); | 1631 | preempt_disable(); |
| 1599 | while (ULONG_CMP_LT(get_seconds(), stop_at)) | 1632 | while (ULONG_CMP_LT(get_seconds(), stop_at)) |
| 1600 | continue; /* Induce RCU CPU stall warning. */ | 1633 | continue; /* Induce RCU CPU stall warning. */ |
| 1601 | preempt_enable(); | 1634 | preempt_enable(); |
| 1602 | rcu_read_unlock(); | 1635 | rcu_read_unlock(); |
| 1603 | printk(KERN_ALERT "rcu_torture_stall end.\n"); | 1636 | pr_alert("rcu_torture_stall end.\n"); |
| 1604 | } | 1637 | } |
| 1605 | rcutorture_shutdown_absorb("rcu_torture_stall"); | 1638 | rcutorture_shutdown_absorb("rcu_torture_stall"); |
| 1606 | while (!kthread_should_stop()) | 1639 | while (!kthread_should_stop()) |
| @@ -1716,12 +1749,12 @@ static int rcu_torture_barrier_init(void) | |||
| 1716 | if (n_barrier_cbs == 0) | 1749 | if (n_barrier_cbs == 0) |
| 1717 | return 0; | 1750 | return 0; |
| 1718 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { | 1751 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { |
| 1719 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1752 | pr_alert("%s" TORTURE_FLAG |
| 1720 | " Call or barrier ops missing for %s,\n", | 1753 | " Call or barrier ops missing for %s,\n", |
| 1721 | torture_type, cur_ops->name); | 1754 | torture_type, cur_ops->name); |
| 1722 | printk(KERN_ALERT "%s" TORTURE_FLAG | 1755 | pr_alert("%s" TORTURE_FLAG |
| 1723 | " RCU barrier testing omitted from run.\n", | 1756 | " RCU barrier testing omitted from run.\n", |
| 1724 | torture_type); | 1757 | torture_type); |
| 1725 | return 0; | 1758 | return 0; |
| 1726 | } | 1759 | } |
| 1727 | atomic_set(&barrier_cbs_count, 0); | 1760 | atomic_set(&barrier_cbs_count, 0); |
| @@ -1814,7 +1847,7 @@ rcu_torture_cleanup(void) | |||
| 1814 | mutex_lock(&fullstop_mutex); | 1847 | mutex_lock(&fullstop_mutex); |
| 1815 | rcutorture_record_test_transition(); | 1848 | rcutorture_record_test_transition(); |
| 1816 | if (fullstop == FULLSTOP_SHUTDOWN) { | 1849 | if (fullstop == FULLSTOP_SHUTDOWN) { |
| 1817 | printk(KERN_WARNING /* but going down anyway, so... */ | 1850 | pr_warn(/* but going down anyway, so... */ |
| 1818 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); | 1851 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); |
| 1819 | mutex_unlock(&fullstop_mutex); | 1852 | mutex_unlock(&fullstop_mutex); |
| 1820 | schedule_timeout_uninterruptible(10); | 1853 | schedule_timeout_uninterruptible(10); |
| @@ -1938,17 +1971,17 @@ rcu_torture_init(void) | |||
| 1938 | break; | 1971 | break; |
| 1939 | } | 1972 | } |
| 1940 | if (i == ARRAY_SIZE(torture_ops)) { | 1973 | if (i == ARRAY_SIZE(torture_ops)) { |
| 1941 | printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", | 1974 | pr_alert("rcu-torture: invalid torture type: \"%s\"\n", |
| 1942 | torture_type); | 1975 | torture_type); |
| 1943 | printk(KERN_ALERT "rcu-torture types:"); | 1976 | pr_alert("rcu-torture types:"); |
| 1944 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) | 1977 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) |
| 1945 | printk(KERN_ALERT " %s", torture_ops[i]->name); | 1978 | pr_alert(" %s", torture_ops[i]->name); |
| 1946 | printk(KERN_ALERT "\n"); | 1979 | pr_alert("\n"); |
| 1947 | mutex_unlock(&fullstop_mutex); | 1980 | mutex_unlock(&fullstop_mutex); |
| 1948 | return -EINVAL; | 1981 | return -EINVAL; |
| 1949 | } | 1982 | } |
| 1950 | if (cur_ops->fqs == NULL && fqs_duration != 0) { | 1983 | if (cur_ops->fqs == NULL && fqs_duration != 0) { |
| 1951 | printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); | 1984 | pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); |
| 1952 | fqs_duration = 0; | 1985 | fqs_duration = 0; |
| 1953 | } | 1986 | } |
| 1954 | if (cur_ops->init) | 1987 | if (cur_ops->init) |
| @@ -1996,14 +2029,15 @@ rcu_torture_init(void) | |||
| 1996 | /* Start up the kthreads. */ | 2029 | /* Start up the kthreads. */ |
| 1997 | 2030 | ||
| 1998 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); | 2031 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); |
| 1999 | writer_task = kthread_run(rcu_torture_writer, NULL, | 2032 | writer_task = kthread_create(rcu_torture_writer, NULL, |
| 2000 | "rcu_torture_writer"); | 2033 | "rcu_torture_writer"); |
| 2001 | if (IS_ERR(writer_task)) { | 2034 | if (IS_ERR(writer_task)) { |
| 2002 | firsterr = PTR_ERR(writer_task); | 2035 | firsterr = PTR_ERR(writer_task); |
| 2003 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); | 2036 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); |
| 2004 | writer_task = NULL; | 2037 | writer_task = NULL; |
| 2005 | goto unwind; | 2038 | goto unwind; |
| 2006 | } | 2039 | } |
| 2040 | wake_up_process(writer_task); | ||
| 2007 | fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), | 2041 | fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), |
| 2008 | GFP_KERNEL); | 2042 | GFP_KERNEL); |
| 2009 | if (fakewriter_tasks == NULL) { | 2043 | if (fakewriter_tasks == NULL) { |
| @@ -2118,14 +2152,15 @@ rcu_torture_init(void) | |||
| 2118 | } | 2152 | } |
| 2119 | if (shutdown_secs > 0) { | 2153 | if (shutdown_secs > 0) { |
| 2120 | shutdown_time = jiffies + shutdown_secs * HZ; | 2154 | shutdown_time = jiffies + shutdown_secs * HZ; |
| 2121 | shutdown_task = kthread_run(rcu_torture_shutdown, NULL, | 2155 | shutdown_task = kthread_create(rcu_torture_shutdown, NULL, |
| 2122 | "rcu_torture_shutdown"); | 2156 | "rcu_torture_shutdown"); |
| 2123 | if (IS_ERR(shutdown_task)) { | 2157 | if (IS_ERR(shutdown_task)) { |
| 2124 | firsterr = PTR_ERR(shutdown_task); | 2158 | firsterr = PTR_ERR(shutdown_task); |
| 2125 | VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); | 2159 | VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); |
| 2126 | shutdown_task = NULL; | 2160 | shutdown_task = NULL; |
| 2127 | goto unwind; | 2161 | goto unwind; |
| 2128 | } | 2162 | } |
| 2163 | wake_up_process(shutdown_task); | ||
| 2129 | } | 2164 | } |
| 2130 | i = rcu_torture_onoff_init(); | 2165 | i = rcu_torture_onoff_init(); |
| 2131 | if (i != 0) { | 2166 | if (i != 0) { |
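The new min/max/sum bookkeeping is open-coded twice in rcu_torture_onoff(), once for offlining and once for onlining. Purely as an illustration (not part of the patch), the same logic could be expressed as one helper:

/* Fold one hotplug duration (in jiffies) into sum/min/max statistics. */
static void torture_account_delta(unsigned long delta, unsigned long *sum,
				  int *min, int *max)
{
	*sum += delta;
	if (*min < 0) {		/* first sample initializes both bounds */
		*min = delta;
		*max = delta;
	}
	if (*min > delta)
		*min = delta;
	if (*max < delta)
		*max = delta;
}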
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index f280e542e3e9..4fb2376ddf06 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c | |||
| @@ -52,6 +52,7 @@ | |||
| 52 | #include <linux/prefetch.h> | 52 | #include <linux/prefetch.h> |
| 53 | #include <linux/delay.h> | 53 | #include <linux/delay.h> |
| 54 | #include <linux/stop_machine.h> | 54 | #include <linux/stop_machine.h> |
| 55 | #include <linux/random.h> | ||
| 55 | 56 | ||
| 56 | #include "rcutree.h" | 57 | #include "rcutree.h" |
| 57 | #include <trace/events/rcu.h> | 58 | #include <trace/events/rcu.h> |
| @@ -61,6 +62,7 @@ | |||
| 61 | /* Data structures. */ | 62 | /* Data structures. */ |
| 62 | 63 | ||
| 63 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | 64 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; |
| 65 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | ||
| 64 | 66 | ||
| 65 | #define RCU_STATE_INITIALIZER(sname, cr) { \ | 67 | #define RCU_STATE_INITIALIZER(sname, cr) { \ |
| 66 | .level = { &sname##_state.node[0] }, \ | 68 | .level = { &sname##_state.node[0] }, \ |
| @@ -72,7 +74,6 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | |||
| 72 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | 74 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ |
| 73 | .orphan_donetail = &sname##_state.orphan_donelist, \ | 75 | .orphan_donetail = &sname##_state.orphan_donelist, \ |
| 74 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | 76 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ |
| 75 | .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \ | ||
| 76 | .name = #sname, \ | 77 | .name = #sname, \ |
| 77 | } | 78 | } |
| 78 | 79 | ||
| @@ -88,7 +89,7 @@ LIST_HEAD(rcu_struct_flavors); | |||
| 88 | 89 | ||
| 89 | /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ | 90 | /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ |
| 90 | static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; | 91 | static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; |
| 91 | module_param(rcu_fanout_leaf, int, 0); | 92 | module_param(rcu_fanout_leaf, int, 0444); |
| 92 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; | 93 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; |
| 93 | static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ | 94 | static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ |
| 94 | NUM_RCU_LVL_0, | 95 | NUM_RCU_LVL_0, |
| @@ -133,13 +134,12 @@ static int rcu_scheduler_fully_active __read_mostly; | |||
| 133 | */ | 134 | */ |
| 134 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); | 135 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); |
| 135 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | 136 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); |
| 136 | DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); | ||
| 137 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | 137 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); |
| 138 | DEFINE_PER_CPU(char, rcu_cpu_has_work); | 138 | DEFINE_PER_CPU(char, rcu_cpu_has_work); |
| 139 | 139 | ||
| 140 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 140 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 141 | 141 | ||
| 142 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | 142 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); |
| 143 | static void invoke_rcu_core(void); | 143 | static void invoke_rcu_core(void); |
| 144 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | 144 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); |
| 145 | 145 | ||
| @@ -175,8 +175,6 @@ void rcu_sched_qs(int cpu) | |||
| 175 | { | 175 | { |
| 176 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); | 176 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); |
| 177 | 177 | ||
| 178 | rdp->passed_quiesce_gpnum = rdp->gpnum; | ||
| 179 | barrier(); | ||
| 180 | if (rdp->passed_quiesce == 0) | 178 | if (rdp->passed_quiesce == 0) |
| 181 | trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); | 179 | trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); |
| 182 | rdp->passed_quiesce = 1; | 180 | rdp->passed_quiesce = 1; |
| @@ -186,8 +184,6 @@ void rcu_bh_qs(int cpu) | |||
| 186 | { | 184 | { |
| 187 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | 185 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); |
| 188 | 186 | ||
| 189 | rdp->passed_quiesce_gpnum = rdp->gpnum; | ||
| 190 | barrier(); | ||
| 191 | if (rdp->passed_quiesce == 0) | 187 | if (rdp->passed_quiesce == 0) |
| 192 | trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); | 188 | trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); |
| 193 | rdp->passed_quiesce = 1; | 189 | rdp->passed_quiesce = 1; |
| @@ -210,15 +206,18 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); | |||
| 210 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | 206 | DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { |
| 211 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | 207 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, |
| 212 | .dynticks = ATOMIC_INIT(1), | 208 | .dynticks = ATOMIC_INIT(1), |
| 209 | #if defined(CONFIG_RCU_USER_QS) && !defined(CONFIG_RCU_USER_QS_FORCE) | ||
| 210 | .ignore_user_qs = true, | ||
| 211 | #endif | ||
| 213 | }; | 212 | }; |
| 214 | 213 | ||
| 215 | static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | 214 | static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ |
| 216 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ | 215 | static int qhimark = 10000; /* If this many pending, ignore blimit. */ |
| 217 | static int qlowmark = 100; /* Once only this many pending, use blimit. */ | 216 | static int qlowmark = 100; /* Once only this many pending, use blimit. */ |
| 218 | 217 | ||
| 219 | module_param(blimit, int, 0); | 218 | module_param(blimit, int, 0444); |
| 220 | module_param(qhimark, int, 0); | 219 | module_param(qhimark, int, 0444); |
| 221 | module_param(qlowmark, int, 0); | 220 | module_param(qlowmark, int, 0444); |
| 222 | 221 | ||
| 223 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | 222 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ |
| 224 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | 223 | int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; |
| @@ -226,7 +225,14 @@ int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | |||
| 226 | module_param(rcu_cpu_stall_suppress, int, 0644); | 225 | module_param(rcu_cpu_stall_suppress, int, 0644); |
| 227 | module_param(rcu_cpu_stall_timeout, int, 0644); | 226 | module_param(rcu_cpu_stall_timeout, int, 0644); |
| 228 | 227 | ||
| 229 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed); | 228 | static ulong jiffies_till_first_fqs = RCU_JIFFIES_TILL_FORCE_QS; |
| 229 | static ulong jiffies_till_next_fqs = RCU_JIFFIES_TILL_FORCE_QS; | ||
| 230 | |||
| 231 | module_param(jiffies_till_first_fqs, ulong, 0644); | ||
| 232 | module_param(jiffies_till_next_fqs, ulong, 0644); | ||
| 233 | |||
| 234 | static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); | ||
| 235 | static void force_quiescent_state(struct rcu_state *rsp); | ||
| 230 | static int rcu_pending(int cpu); | 236 | static int rcu_pending(int cpu); |
| 231 | 237 | ||
| 232 | /* | 238 | /* |
| @@ -252,7 +258,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | |||
| 252 | */ | 258 | */ |
| 253 | void rcu_bh_force_quiescent_state(void) | 259 | void rcu_bh_force_quiescent_state(void) |
| 254 | { | 260 | { |
| 255 | force_quiescent_state(&rcu_bh_state, 0); | 261 | force_quiescent_state(&rcu_bh_state); |
| 256 | } | 262 | } |
| 257 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | 263 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); |
| 258 | 264 | ||
| @@ -286,7 +292,7 @@ EXPORT_SYMBOL_GPL(rcutorture_record_progress); | |||
| 286 | */ | 292 | */ |
| 287 | void rcu_sched_force_quiescent_state(void) | 293 | void rcu_sched_force_quiescent_state(void) |
| 288 | { | 294 | { |
| 289 | force_quiescent_state(&rcu_sched_state, 0); | 295 | force_quiescent_state(&rcu_sched_state); |
| 290 | } | 296 | } |
| 291 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | 297 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); |
| 292 | 298 | ||
| @@ -305,7 +311,9 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | |||
| 305 | static int | 311 | static int |
| 306 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | 312 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) |
| 307 | { | 313 | { |
| 308 | return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); | 314 | return *rdp->nxttail[RCU_DONE_TAIL + |
| 315 | ACCESS_ONCE(rsp->completed) != rdp->completed] && | ||
| 316 | !rcu_gp_in_progress(rsp); | ||
| 309 | } | 317 | } |
| 310 | 318 | ||
| 311 | /* | 319 | /* |
| @@ -317,45 +325,17 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | |||
| 317 | } | 325 | } |
| 318 | 326 | ||
| 319 | /* | 327 | /* |
| 320 | * If the specified CPU is offline, tell the caller that it is in | 328 | * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state |
| 321 | * a quiescent state. Otherwise, whack it with a reschedule IPI. | ||
| 322 | * Grace periods can end up waiting on an offline CPU when that | ||
| 323 | * CPU is in the process of coming online -- it will be added to the | ||
| 324 | * rcu_node bitmasks before it actually makes it online. The same thing | ||
| 325 | * can happen while a CPU is in the process of coming online. Because this | ||
| 326 | * race is quite rare, we check for it after detecting that the grace | ||
| 327 | * period has been delayed rather than checking each and every CPU | ||
| 328 | * each and every time we start a new grace period. | ||
| 329 | */ | ||
| 330 | static int rcu_implicit_offline_qs(struct rcu_data *rdp) | ||
| 331 | { | ||
| 332 | /* | ||
| 333 | * If the CPU is offline for more than a jiffy, it is in a quiescent | ||
| 334 | * state. We can trust its state not to change because interrupts | ||
| 335 | * are disabled. The reason for the jiffy's worth of slack is to | ||
| 336 | * handle CPUs initializing on the way up and finding their way | ||
| 337 | * to the idle loop on the way down. | ||
| 338 | */ | ||
| 339 | if (cpu_is_offline(rdp->cpu) && | ||
| 340 | ULONG_CMP_LT(rdp->rsp->gp_start + 2, jiffies)) { | ||
| 341 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); | ||
| 342 | rdp->offline_fqs++; | ||
| 343 | return 1; | ||
| 344 | } | ||
| 345 | return 0; | ||
| 346 | } | ||
| 347 | |||
| 348 | /* | ||
| 349 | * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle | ||
| 350 | * | 329 | * |
| 351 | * If the new value of the ->dynticks_nesting counter now is zero, | 330 | * If the new value of the ->dynticks_nesting counter now is zero, |
| 352 | * we really have entered idle, and must do the appropriate accounting. | 331 | * we really have entered idle, and must do the appropriate accounting. |
| 353 | * The caller must have disabled interrupts. | 332 | * The caller must have disabled interrupts. |
| 354 | */ | 333 | */ |
| 355 | static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | 334 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, |
| 335 | bool user) | ||
| 356 | { | 336 | { |
| 357 | trace_rcu_dyntick("Start", oldval, 0); | 337 | trace_rcu_dyntick("Start", oldval, 0); |
| 358 | if (!is_idle_task(current)) { | 338 | if (!user && !is_idle_task(current)) { |
| 359 | struct task_struct *idle = idle_task(smp_processor_id()); | 339 | struct task_struct *idle = idle_task(smp_processor_id()); |
| 360 | 340 | ||
| 361 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); | 341 | trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); |
| @@ -372,7 +352,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | |||
| 372 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | 352 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); |
| 373 | 353 | ||
| 374 | /* | 354 | /* |
| 375 | * The idle task is not permitted to enter the idle loop while | 355 | * It is illegal to enter an extended quiescent state while |
| 376 | * in an RCU read-side critical section. | 356 | * in an RCU read-side critical section. |
| 377 | */ | 357 | */ |
| 378 | rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), | 358 | rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), |
| @@ -383,6 +363,25 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | |||
| 383 | "Illegal idle entry in RCU-sched read-side critical section."); | 363 | "Illegal idle entry in RCU-sched read-side critical section."); |
| 384 | } | 364 | } |
| 385 | 365 | ||
| 366 | /* | ||
| 367 | * Enter an RCU extended quiescent state, which can be either the | ||
| 368 | * idle loop or adaptive-tickless usermode execution. | ||
| 369 | */ | ||
| 370 | static void rcu_eqs_enter(bool user) | ||
| 371 | { | ||
| 372 | long long oldval; | ||
| 373 | struct rcu_dynticks *rdtp; | ||
| 374 | |||
| 375 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
| 376 | oldval = rdtp->dynticks_nesting; | ||
| 377 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); | ||
| 378 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) | ||
| 379 | rdtp->dynticks_nesting = 0; | ||
| 380 | else | ||
| 381 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
| 382 | rcu_eqs_enter_common(rdtp, oldval, user); | ||
| 383 | } | ||
| 384 | |||
| 386 | /** | 385 | /** |
| 387 | * rcu_idle_enter - inform RCU that current CPU is entering idle | 386 | * rcu_idle_enter - inform RCU that current CPU is entering idle |
| 388 | * | 387 | * |
| @@ -398,21 +397,70 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval) | |||
| 398 | void rcu_idle_enter(void) | 397 | void rcu_idle_enter(void) |
| 399 | { | 398 | { |
| 400 | unsigned long flags; | 399 | unsigned long flags; |
| 401 | long long oldval; | 400 | |
| 401 | local_irq_save(flags); | ||
| 402 | rcu_eqs_enter(false); | ||
| 403 | local_irq_restore(flags); | ||
| 404 | } | ||
| 405 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | ||
| 406 | |||
| 407 | #ifdef CONFIG_RCU_USER_QS | ||
| 408 | /** | ||
| 409 | * rcu_user_enter - inform RCU that we are resuming userspace. | ||
| 410 | * | ||
| 411 | * Enter RCU idle mode right before resuming userspace. No use of RCU | ||
| 412 | * is permitted between this call and rcu_user_exit(). This way the | ||
| 413 | * CPU doesn't need to maintain the tick for RCU maintenance purposes | ||
| 414 | * when the CPU runs in userspace. | ||
| 415 | */ | ||
| 416 | void rcu_user_enter(void) | ||
| 417 | { | ||
| 418 | unsigned long flags; | ||
| 402 | struct rcu_dynticks *rdtp; | 419 | struct rcu_dynticks *rdtp; |
| 403 | 420 | ||
| 421 | /* | ||
| 422 | * Some contexts may involve an exception occurring in an irq, | ||
| 423 | * leading to that nesting: | ||
| 424 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
| 425 | * This would mess up the dynticks_nesting count though. And rcu_irq_*() | ||
| 426 | * helpers are enough to protect RCU uses inside the exception. So | ||
| 427 | * just return immediately if we detect we are in an IRQ. | ||
| 428 | */ | ||
| 429 | if (in_interrupt()) | ||
| 430 | return; | ||
| 431 | |||
| 432 | WARN_ON_ONCE(!current->mm); | ||
| 433 | |||
| 404 | local_irq_save(flags); | 434 | local_irq_save(flags); |
| 405 | rdtp = &__get_cpu_var(rcu_dynticks); | 435 | rdtp = &__get_cpu_var(rcu_dynticks); |
| 406 | oldval = rdtp->dynticks_nesting; | 436 | if (!rdtp->ignore_user_qs && !rdtp->in_user) { |
| 407 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); | 437 | rdtp->in_user = true; |
| 408 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) | 438 | rcu_eqs_enter(true); |
| 409 | rdtp->dynticks_nesting = 0; | 439 | } |
| 410 | else | ||
| 411 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
| 412 | rcu_idle_enter_common(rdtp, oldval); | ||
| 413 | local_irq_restore(flags); | 440 | local_irq_restore(flags); |
| 414 | } | 441 | } |
| 415 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | 442 | |
| 443 | /** | ||
| 444 | * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace | ||
| 445 | * after the current irq returns. | ||
| 446 | * | ||
| 447 | * This is similar to rcu_user_enter() but in the context of a non-nesting | ||
| 448 | * irq. After this call, RCU enters idle mode when the interrupt | ||
| 449 | * returns. | ||
| 450 | */ | ||
| 451 | void rcu_user_enter_after_irq(void) | ||
| 452 | { | ||
| 453 | unsigned long flags; | ||
| 454 | struct rcu_dynticks *rdtp; | ||
| 455 | |||
| 456 | local_irq_save(flags); | ||
| 457 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
| 458 | /* Ensure this irq is interrupting a non-idle RCU state. */ | ||
| 459 | WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); | ||
| 460 | rdtp->dynticks_nesting = 1; | ||
| 461 | local_irq_restore(flags); | ||
| 462 | } | ||
| 463 | #endif /* CONFIG_RCU_USER_QS */ | ||
| 416 | 464 | ||
| 417 | /** | 465 | /** |
| 418 | * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle | 466 | * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle |
| @@ -444,18 +492,19 @@ void rcu_irq_exit(void) | |||
| 444 | if (rdtp->dynticks_nesting) | 492 | if (rdtp->dynticks_nesting) |
| 445 | trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); | 493 | trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); |
| 446 | else | 494 | else |
| 447 | rcu_idle_enter_common(rdtp, oldval); | 495 | rcu_eqs_enter_common(rdtp, oldval, true); |
| 448 | local_irq_restore(flags); | 496 | local_irq_restore(flags); |
| 449 | } | 497 | } |
| 450 | 498 | ||
| 451 | /* | 499 | /* |
| 452 | * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle | 500 | * rcu_eqs_exit_common - current CPU moving away from extended quiescent state |
| 453 | * | 501 | * |
| 454 | * If the new value of the ->dynticks_nesting counter was previously zero, | 502 | * If the new value of the ->dynticks_nesting counter was previously zero, |
| 455 | * we really have exited idle, and must do the appropriate accounting. | 503 | * we really have exited idle, and must do the appropriate accounting. |
| 456 | * The caller must have disabled interrupts. | 504 | * The caller must have disabled interrupts. |
| 457 | */ | 505 | */ |
| 458 | static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | 506 | static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, |
| 507 | int user) | ||
| 459 | { | 508 | { |
| 460 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ | 509 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ |
| 461 | atomic_inc(&rdtp->dynticks); | 510 | atomic_inc(&rdtp->dynticks); |
| @@ -464,7 +513,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | |||
| 464 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | 513 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); |
| 465 | rcu_cleanup_after_idle(smp_processor_id()); | 514 | rcu_cleanup_after_idle(smp_processor_id()); |
| 466 | trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); | 515 | trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); |
| 467 | if (!is_idle_task(current)) { | 516 | if (!user && !is_idle_task(current)) { |
| 468 | struct task_struct *idle = idle_task(smp_processor_id()); | 517 | struct task_struct *idle = idle_task(smp_processor_id()); |
| 469 | 518 | ||
| 470 | trace_rcu_dyntick("Error on exit: not idle task", | 519 | trace_rcu_dyntick("Error on exit: not idle task", |
| @@ -476,6 +525,25 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | |||
| 476 | } | 525 | } |
| 477 | } | 526 | } |
| 478 | 527 | ||
| 528 | /* | ||
| 529 | * Exit an RCU extended quiescent state, which can be either the | ||
| 530 | * idle loop or adaptive-tickless usermode execution. | ||
| 531 | */ | ||
| 532 | static void rcu_eqs_exit(bool user) | ||
| 533 | { | ||
| 534 | struct rcu_dynticks *rdtp; | ||
| 535 | long long oldval; | ||
| 536 | |||
| 537 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
| 538 | oldval = rdtp->dynticks_nesting; | ||
| 539 | WARN_ON_ONCE(oldval < 0); | ||
| 540 | if (oldval & DYNTICK_TASK_NEST_MASK) | ||
| 541 | rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | ||
| 542 | else | ||
| 543 | rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 544 | rcu_eqs_exit_common(rdtp, oldval, user); | ||
| 545 | } | ||
| 546 | |||
| 479 | /** | 547 | /** |
| 480 | * rcu_idle_exit - inform RCU that current CPU is leaving idle | 548 | * rcu_idle_exit - inform RCU that current CPU is leaving idle |
| 481 | * | 549 | * |
| @@ -490,21 +558,67 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval) | |||
| 490 | void rcu_idle_exit(void) | 558 | void rcu_idle_exit(void) |
| 491 | { | 559 | { |
| 492 | unsigned long flags; | 560 | unsigned long flags; |
| 561 | |||
| 562 | local_irq_save(flags); | ||
| 563 | rcu_eqs_exit(false); | ||
| 564 | local_irq_restore(flags); | ||
| 565 | } | ||
| 566 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | ||
| 567 | |||
| 568 | #ifdef CONFIG_RCU_USER_QS | ||
| 569 | /** | ||
| 570 | * rcu_user_exit - inform RCU that we are exiting userspace. | ||
| 571 | * | ||
| 572 | * Exit RCU idle mode while entering the kernel because it can | ||
| 573 | * run an RCU read-side critical section at any time. | ||
| 574 | */ | ||
| 575 | void rcu_user_exit(void) | ||
| 576 | { | ||
| 577 | unsigned long flags; | ||
| 493 | struct rcu_dynticks *rdtp; | 578 | struct rcu_dynticks *rdtp; |
| 494 | long long oldval; | 579 | |
| 580 | /* | ||
| 581 | * Some contexts may involve an exception occurring in an irq, | ||
| 582 | * leading to that nesting: | ||
| 583 | * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() | ||
| 584 | * This would mess up the dynticks_nesting count though. And rcu_irq_*() | ||
| 585 | * helpers are enough to protect RCU uses inside the exception. So | ||
| 586 | * just return immediately if we detect we are in an IRQ. | ||
| 587 | */ | ||
| 588 | if (in_interrupt()) | ||
| 589 | return; | ||
| 495 | 590 | ||
| 496 | local_irq_save(flags); | 591 | local_irq_save(flags); |
| 497 | rdtp = &__get_cpu_var(rcu_dynticks); | 592 | rdtp = &__get_cpu_var(rcu_dynticks); |
| 498 | oldval = rdtp->dynticks_nesting; | 593 | if (rdtp->in_user) { |
| 499 | WARN_ON_ONCE(oldval < 0); | 594 | rdtp->in_user = false; |
| 500 | if (oldval & DYNTICK_TASK_NEST_MASK) | 595 | rcu_eqs_exit(true); |
| 501 | rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | 596 | } |
| 502 | else | ||
| 503 | rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 504 | rcu_idle_exit_common(rdtp, oldval); | ||
| 505 | local_irq_restore(flags); | 597 | local_irq_restore(flags); |
| 506 | } | 598 | } |
| 507 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | 599 | |
| 600 | /** | ||
| 601 | * rcu_user_exit_after_irq - inform RCU that we won't resume userspace | ||
| 602 | * RCU idle mode after the current non-nesting irq returns. | ||
| 603 | * | ||
| 604 | * This is similar to rcu_user_exit() but in the context of an irq. | ||
| 605 | * This is called when the irq has interrupted a userspace RCU idle mode | ||
| 606 | * context. When the current non-nesting interrupt returns after this call, | ||
| 607 | * the CPU won't restore the RCU idle mode. | ||
| 608 | */ | ||
| 609 | void rcu_user_exit_after_irq(void) | ||
| 610 | { | ||
| 611 | unsigned long flags; | ||
| 612 | struct rcu_dynticks *rdtp; | ||
| 613 | |||
| 614 | local_irq_save(flags); | ||
| 615 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
| 616 | /* Ensure we are interrupting an RCU idle mode. */ | ||
| 617 | WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); | ||
| 618 | rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; | ||
| 619 | local_irq_restore(flags); | ||
| 620 | } | ||
| 621 | #endif /* CONFIG_RCU_USER_QS */ | ||
| 508 | 622 | ||
| 509 | /** | 623 | /** |
| 510 | * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle | 624 | * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle |
| @@ -539,7 +653,7 @@ void rcu_irq_enter(void) | |||
| 539 | if (oldval) | 653 | if (oldval) |
| 540 | trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); | 654 | trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); |
| 541 | else | 655 | else |
| 542 | rcu_idle_exit_common(rdtp, oldval); | 656 | rcu_eqs_exit_common(rdtp, oldval, true); |
| 543 | local_irq_restore(flags); | 657 | local_irq_restore(flags); |
| 544 | } | 658 | } |
| 545 | 659 | ||
| @@ -603,6 +717,21 @@ int rcu_is_cpu_idle(void) | |||
| 603 | } | 717 | } |
| 604 | EXPORT_SYMBOL(rcu_is_cpu_idle); | 718 | EXPORT_SYMBOL(rcu_is_cpu_idle); |
| 605 | 719 | ||
| 720 | #ifdef CONFIG_RCU_USER_QS | ||
| 721 | void rcu_user_hooks_switch(struct task_struct *prev, | ||
| 722 | struct task_struct *next) | ||
| 723 | { | ||
| 724 | struct rcu_dynticks *rdtp; | ||
| 725 | |||
| 726 | /* Interrupts are disabled in context switch */ | ||
| 727 | rdtp = &__get_cpu_var(rcu_dynticks); | ||
| 728 | if (!rdtp->ignore_user_qs) { | ||
| 729 | clear_tsk_thread_flag(prev, TIF_NOHZ); | ||
| 730 | set_tsk_thread_flag(next, TIF_NOHZ); | ||
| 731 | } | ||
| 732 | } | ||
| 733 | #endif /* #ifdef CONFIG_RCU_USER_QS */ | ||
| 734 | |||
| 606 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) | 735 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) |
| 607 | 736 | ||
| 608 | /* | 737 | /* |
| @@ -673,7 +802,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) | |||
| 673 | * Return true if the specified CPU has passed through a quiescent | 802 | * Return true if the specified CPU has passed through a quiescent |
| 674 | * state by virtue of being in or having passed through a dynticks | 803 | * state by virtue of being in or having passed through a dynticks |
| 675 | * idle state since the last call to dyntick_save_progress_counter() | 804 | * idle state since the last call to dyntick_save_progress_counter() |
| 676 | * for this same CPU. | 805 | * for this same CPU, or by virtue of having been offline. |
| 677 | */ | 806 | */ |
| 678 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | 807 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) |
| 679 | { | 808 | { |
| @@ -697,8 +826,26 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) | |||
| 697 | return 1; | 826 | return 1; |
| 698 | } | 827 | } |
| 699 | 828 | ||
| 700 | /* Go check for the CPU being offline. */ | 829 | /* |
| 701 | return rcu_implicit_offline_qs(rdp); | 830 | * Check for the CPU being offline, but only if the grace period |
| 831 | * is old enough. We don't need to worry about the CPU changing | ||
| 832 | * state: If we see it offline even once, it has been through a | ||
| 833 | * quiescent state. | ||
| 834 | * | ||
| 835 | * The reason for insisting that the grace period be at least | ||
| 836 | * one jiffy old is that CPUs that are not quite online and CPUs | ||
| 837 | * that have just gone offline can still execute RCU read-side critical | ||
| 838 | * sections. | ||
| 839 | */ | ||
| 840 | if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) | ||
| 841 | return 0; /* Grace period is not old enough. */ | ||
| 842 | barrier(); | ||
| 843 | if (cpu_is_offline(rdp->cpu)) { | ||
| 844 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); | ||
| 845 | rdp->offline_fqs++; | ||
| 846 | return 1; | ||
| 847 | } | ||
| 848 | return 0; | ||
| 702 | } | 849 | } |
| 703 | 850 | ||
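The offline check added above relies on a wraparound-safe comparison so it still behaves when the jiffies counter rolls over. The stand-alone snippet below illustrates the idiom; the macro bodies are written from memory to mirror the kernel's ULONG_CMP_* helpers, so treat them as an approximation rather than the authoritative definition.

/*
 * Wraparound-safe time comparison as used by the offline check above.
 * The macro bodies mirror the kernel's ULONG_CMP_* helpers, reproduced
 * here from memory for illustration.
 */
#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long gp_start = ULONG_MAX - 1;	/* GP began just before wrap */
	unsigned long jiffies  = 3;		/* counter has since wrapped */

	/* The grace period is 5 ticks old here, so it is old enough. */
	printf("too new?    %d\n", ULONG_CMP_GE(gp_start + 2, jiffies));
	printf("old enough? %d\n", ULONG_CMP_LT(gp_start + 2, jiffies));
	return 0;
}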
| 704 | static int jiffies_till_stall_check(void) | 851 | static int jiffies_till_stall_check(void) |
| @@ -755,14 +902,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 755 | rcu_for_each_leaf_node(rsp, rnp) { | 902 | rcu_for_each_leaf_node(rsp, rnp) { |
| 756 | raw_spin_lock_irqsave(&rnp->lock, flags); | 903 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 757 | ndetected += rcu_print_task_stall(rnp); | 904 | ndetected += rcu_print_task_stall(rnp); |
| 905 | if (rnp->qsmask != 0) { | ||
| 906 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | ||
| 907 | if (rnp->qsmask & (1UL << cpu)) { | ||
| 908 | print_cpu_stall_info(rsp, | ||
| 909 | rnp->grplo + cpu); | ||
| 910 | ndetected++; | ||
| 911 | } | ||
| 912 | } | ||
| 758 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 913 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 759 | if (rnp->qsmask == 0) | ||
| 760 | continue; | ||
| 761 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | ||
| 762 | if (rnp->qsmask & (1UL << cpu)) { | ||
| 763 | print_cpu_stall_info(rsp, rnp->grplo + cpu); | ||
| 764 | ndetected++; | ||
| 765 | } | ||
| 766 | } | 914 | } |
| 767 | 915 | ||
| 768 | /* | 916 | /* |
| @@ -782,11 +930,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
| 782 | else if (!trigger_all_cpu_backtrace()) | 930 | else if (!trigger_all_cpu_backtrace()) |
| 783 | dump_stack(); | 931 | dump_stack(); |
| 784 | 932 | ||
| 785 | /* If so configured, complain about tasks blocking the grace period. */ | 933 | /* Complain about tasks blocking the grace period. */ |
| 786 | 934 | ||
| 787 | rcu_print_detail_task_stall(rsp); | 935 | rcu_print_detail_task_stall(rsp); |
| 788 | 936 | ||
| 789 | force_quiescent_state(rsp, 0); /* Kick them all. */ | 937 | force_quiescent_state(rsp); /* Kick them all. */ |
| 790 | } | 938 | } |
| 791 | 939 | ||
| 792 | static void print_cpu_stall(struct rcu_state *rsp) | 940 | static void print_cpu_stall(struct rcu_state *rsp) |
| @@ -827,7 +975,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 827 | j = ACCESS_ONCE(jiffies); | 975 | j = ACCESS_ONCE(jiffies); |
| 828 | js = ACCESS_ONCE(rsp->jiffies_stall); | 976 | js = ACCESS_ONCE(rsp->jiffies_stall); |
| 829 | rnp = rdp->mynode; | 977 | rnp = rdp->mynode; |
| 830 | if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { | 978 | if (rcu_gp_in_progress(rsp) && |
| 979 | (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { | ||
| 831 | 980 | ||
| 832 | /* We haven't checked in, so go dump stack. */ | 981 | /* We haven't checked in, so go dump stack. */ |
| 833 | print_cpu_stall(rsp); | 982 | print_cpu_stall(rsp); |
| @@ -889,12 +1038,8 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct | |||
| 889 | */ | 1038 | */ |
| 890 | rdp->gpnum = rnp->gpnum; | 1039 | rdp->gpnum = rnp->gpnum; |
| 891 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); | 1040 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); |
| 892 | if (rnp->qsmask & rdp->grpmask) { | 1041 | rdp->passed_quiesce = 0; |
| 893 | rdp->qs_pending = 1; | 1042 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); |
| 894 | rdp->passed_quiesce = 0; | ||
| 895 | } else { | ||
| 896 | rdp->qs_pending = 0; | ||
| 897 | } | ||
| 898 | zero_cpu_stall_ticks(rdp); | 1043 | zero_cpu_stall_ticks(rdp); |
| 899 | } | 1044 | } |
| 900 | } | 1045 | } |
| @@ -974,10 +1119,13 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
| 974 | * our behalf. Catch up with this state to avoid noting | 1119 | * our behalf. Catch up with this state to avoid noting |
| 975 | * spurious new grace periods. If another grace period | 1120 | * spurious new grace periods. If another grace period |
| 976 | * has started, then rnp->gpnum will have advanced, so | 1121 | * has started, then rnp->gpnum will have advanced, so |
| 977 | * we will detect this later on. | 1122 | * we will detect this later on. Of course, any quiescent |
| 1123 | * states we found for the old GP are now invalid. | ||
| 978 | */ | 1124 | */ |
| 979 | if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) | 1125 | if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) { |
| 980 | rdp->gpnum = rdp->completed; | 1126 | rdp->gpnum = rdp->completed; |
| 1127 | rdp->passed_quiesce = 0; | ||
| 1128 | } | ||
| 981 | 1129 | ||
| 982 | /* | 1130 | /* |
| 983 | * If RCU does not need a quiescent state from this CPU, | 1131 | * If RCU does not need a quiescent state from this CPU, |
| @@ -1021,97 +1169,56 @@ rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat | |||
| 1021 | /* Prior grace period ended, so advance callbacks for current CPU. */ | 1169 | /* Prior grace period ended, so advance callbacks for current CPU. */ |
| 1022 | __rcu_process_gp_end(rsp, rnp, rdp); | 1170 | __rcu_process_gp_end(rsp, rnp, rdp); |
| 1023 | 1171 | ||
| 1024 | /* | ||
| 1025 | * Because this CPU just now started the new grace period, we know | ||
| 1026 | * that all of its callbacks will be covered by this upcoming grace | ||
| 1027 | * period, even the ones that were registered arbitrarily recently. | ||
| 1028 | * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. | ||
| 1029 | * | ||
| 1030 | * Other CPUs cannot be sure exactly when the grace period started. | ||
| 1031 | * Therefore, their recently registered callbacks must pass through | ||
| 1032 | * an additional RCU_NEXT_READY stage, so that they will be handled | ||
| 1033 | * by the next RCU grace period. | ||
| 1034 | */ | ||
| 1035 | rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
| 1036 | rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
| 1037 | |||
| 1038 | /* Set state so that this CPU will detect the next quiescent state. */ | 1172 | /* Set state so that this CPU will detect the next quiescent state. */ |
| 1039 | __note_new_gpnum(rsp, rnp, rdp); | 1173 | __note_new_gpnum(rsp, rnp, rdp); |
| 1040 | } | 1174 | } |
| 1041 | 1175 | ||
| 1042 | /* | 1176 | /* |
| 1043 | * Start a new RCU grace period if warranted, re-initializing the hierarchy | 1177 | * Initialize a new grace period. |
| 1044 | * in preparation for detecting the next grace period. The caller must hold | ||
| 1045 | * the root node's ->lock, which is released before return. Hard irqs must | ||
| 1046 | * be disabled. | ||
| 1047 | * | ||
| 1048 | * Note that it is legal for a dying CPU (which is marked as offline) to | ||
| 1049 | * invoke this function. This can happen when the dying CPU reports its | ||
| 1050 | * quiescent state. | ||
| 1051 | */ | 1178 | */ |
| 1052 | static void | 1179 | static int rcu_gp_init(struct rcu_state *rsp) |
| 1053 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | ||
| 1054 | __releases(rcu_get_root(rsp)->lock) | ||
| 1055 | { | 1180 | { |
| 1056 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | 1181 | struct rcu_data *rdp; |
| 1057 | struct rcu_node *rnp = rcu_get_root(rsp); | 1182 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1058 | 1183 | ||
| 1059 | if (!rcu_scheduler_fully_active || | 1184 | raw_spin_lock_irq(&rnp->lock); |
| 1060 | !cpu_needs_another_gp(rsp, rdp)) { | 1185 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ |
| 1061 | /* | ||
| 1062 | * Either the scheduler hasn't yet spawned the first | ||
| 1063 | * non-idle task or this CPU does not need another | ||
| 1064 | * grace period. Either way, don't start a new grace | ||
| 1065 | * period. | ||
| 1066 | */ | ||
| 1067 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1068 | return; | ||
| 1069 | } | ||
| 1070 | 1186 | ||
| 1071 | if (rsp->fqs_active) { | 1187 | if (rcu_gp_in_progress(rsp)) { |
| 1072 | /* | 1188 | /* Grace period already in progress, don't start another. */ |
| 1073 | * This CPU needs a grace period, but force_quiescent_state() | 1189 | raw_spin_unlock_irq(&rnp->lock); |
| 1074 | * is running. Tell it to start one on this CPU's behalf. | 1190 | return 0; |
| 1075 | */ | ||
| 1076 | rsp->fqs_need_gp = 1; | ||
| 1077 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1078 | return; | ||
| 1079 | } | 1191 | } |
| 1080 | 1192 | ||
| 1081 | /* Advance to a new grace period and initialize state. */ | 1193 | /* Advance to a new grace period and initialize state. */ |
| 1082 | rsp->gpnum++; | 1194 | rsp->gpnum++; |
| 1083 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); | 1195 | trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); |
| 1084 | WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT); | ||
| 1085 | rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */ | ||
| 1086 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | ||
| 1087 | record_gp_stall_check_time(rsp); | 1196 | record_gp_stall_check_time(rsp); |
| 1088 | raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ | 1197 | raw_spin_unlock_irq(&rnp->lock); |
| 1089 | 1198 | ||
| 1090 | /* Exclude any concurrent CPU-hotplug operations. */ | 1199 | /* Exclude any concurrent CPU-hotplug operations. */ |
| 1091 | raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ | 1200 | get_online_cpus(); |
| 1092 | 1201 | ||
| 1093 | /* | 1202 | /* |
| 1094 | * Set the quiescent-state-needed bits in all the rcu_node | 1203 | * Set the quiescent-state-needed bits in all the rcu_node |
| 1095 | * structures for all currently online CPUs in breadth-first | 1204 | * structures for all currently online CPUs in breadth-first order, |
| 1096 | * order, starting from the root rcu_node structure. This | 1205 | * starting from the root rcu_node structure, relying on the layout |
| 1097 | * operation relies on the layout of the hierarchy within the | 1206 | * of the tree within the rsp->node[] array. Note that other CPUs |
| 1098 | * rsp->node[] array. Note that other CPUs will access only | 1207 | * will access only the leaves of the hierarchy, thus seeing that no |
| 1099 | * the leaves of the hierarchy, which still indicate that no | ||
| 1100 | * grace period is in progress, at least until the corresponding | 1208 | * grace period is in progress, at least until the corresponding |
| 1101 | * leaf node has been initialized. In addition, we have excluded | 1209 | * leaf node has been initialized. In addition, we have excluded |
| 1102 | * CPU-hotplug operations. | 1210 | * CPU-hotplug operations. |
| 1103 | * | 1211 | * |
| 1104 | * Note that the grace period cannot complete until we finish | 1212 | * The grace period cannot complete until the initialization |
| 1105 | * the initialization process, as there will be at least one | 1213 | * process finishes, because this kthread handles both. |
| 1106 | * qsmask bit set in the root node until that time, namely the | ||
| 1107 | * one corresponding to this CPU, due to the fact that we have | ||
| 1108 | * irqs disabled. | ||
| 1109 | */ | 1214 | */ |
| 1110 | rcu_for_each_node_breadth_first(rsp, rnp) { | 1215 | rcu_for_each_node_breadth_first(rsp, rnp) { |
| 1111 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1216 | raw_spin_lock_irq(&rnp->lock); |
| 1217 | rdp = this_cpu_ptr(rsp->rda); | ||
| 1112 | rcu_preempt_check_blocked_tasks(rnp); | 1218 | rcu_preempt_check_blocked_tasks(rnp); |
| 1113 | rnp->qsmask = rnp->qsmaskinit; | 1219 | rnp->qsmask = rnp->qsmaskinit; |
| 1114 | rnp->gpnum = rsp->gpnum; | 1220 | rnp->gpnum = rsp->gpnum; |
| 1221 | WARN_ON_ONCE(rnp->completed != rsp->completed); | ||
| 1115 | rnp->completed = rsp->completed; | 1222 | rnp->completed = rsp->completed; |
| 1116 | if (rnp == rdp->mynode) | 1223 | if (rnp == rdp->mynode) |
| 1117 | rcu_start_gp_per_cpu(rsp, rnp, rdp); | 1224 | rcu_start_gp_per_cpu(rsp, rnp, rdp); |
| @@ -1119,37 +1226,54 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | |||
| 1119 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | 1226 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, |
| 1120 | rnp->level, rnp->grplo, | 1227 | rnp->level, rnp->grplo, |
| 1121 | rnp->grphi, rnp->qsmask); | 1228 | rnp->grphi, rnp->qsmask); |
| 1122 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1229 | raw_spin_unlock_irq(&rnp->lock); |
| 1230 | #ifdef CONFIG_PROVE_RCU_DELAY | ||
| 1231 | if ((random32() % (rcu_num_nodes * 8)) == 0) | ||
| 1232 | schedule_timeout_uninterruptible(2); | ||
| 1233 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ | ||
| 1234 | cond_resched(); | ||
| 1123 | } | 1235 | } |
| 1124 | 1236 | ||
| 1125 | rnp = rcu_get_root(rsp); | 1237 | put_online_cpus(); |
| 1126 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1238 | return 1; |
| 1127 | rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ | ||
| 1128 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 1129 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
| 1130 | } | 1239 | } |
| 1131 | 1240 | ||
| 1132 | /* | 1241 | /* |
| 1133 | * Report a full set of quiescent states to the specified rcu_state | 1242 | * Do one round of quiescent-state forcing. |
| 1134 | * data structure. This involves cleaning up after the prior grace | ||
| 1135 | * period and letting rcu_start_gp() start up the next grace period | ||
| 1136 | * if one is needed. Note that the caller must hold rnp->lock, as | ||
| 1137 | * required by rcu_start_gp(), which will release it. | ||
| 1138 | */ | 1243 | */ |
| 1139 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | 1244 | int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) |
| 1140 | __releases(rcu_get_root(rsp)->lock) | ||
| 1141 | { | 1245 | { |
| 1142 | unsigned long gp_duration; | 1246 | int fqs_state = fqs_state_in; |
| 1143 | struct rcu_node *rnp = rcu_get_root(rsp); | 1247 | struct rcu_node *rnp = rcu_get_root(rsp); |
| 1144 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
| 1145 | 1248 | ||
| 1146 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | 1249 | rsp->n_force_qs++; |
| 1250 | if (fqs_state == RCU_SAVE_DYNTICK) { | ||
| 1251 | /* Collect dyntick-idle snapshots. */ | ||
| 1252 | force_qs_rnp(rsp, dyntick_save_progress_counter); | ||
| 1253 | fqs_state = RCU_FORCE_QS; | ||
| 1254 | } else { | ||
| 1255 | /* Handle dyntick-idle and offline CPUs. */ | ||
| 1256 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs); | ||
| 1257 | } | ||
| 1258 | /* Clear flag to prevent immediate re-entry. */ | ||
| 1259 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | ||
| 1260 | raw_spin_lock_irq(&rnp->lock); | ||
| 1261 | rsp->gp_flags &= ~RCU_GP_FLAG_FQS; | ||
| 1262 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1263 | } | ||
| 1264 | return fqs_state; | ||
| 1265 | } | ||
| 1147 | 1266 | ||
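rcu_gp_fqs() above alternates between two passes: the first snapshots each CPU's dynticks counter, and later passes declare a quiescent state for any CPU that was idle at the snapshot or whose counter has advanced since. The toy model below captures that decision; the types and function names are invented for illustration and deliberately ignore the memory-ordering details of the real code.

/*
 * Toy model of the two force-quiescent-state passes driven by
 * rcu_gp_fqs(): pass 1 snapshots a per-CPU dynticks counter, later
 * passes report a quiescent state if the CPU was idle (even counter)
 * or has been through at least one idle sojourn since the snapshot.
 * All names are invented for this sketch.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_cpu {
	unsigned long dynticks;	/* even: idle, odd: non-idle */
	unsigned long snap;	/* snapshot taken on the first pass */
};

static void save_progress_counter(struct toy_cpu *cpu)	/* pass 1 */
{
	cpu->snap = cpu->dynticks;
}

static bool implicit_dynticks_qs(struct toy_cpu *cpu)	/* later passes */
{
	unsigned long curr = cpu->dynticks;

	/* Currently idle, or at least one full idle sojourn since snapshot. */
	return (curr & 0x1) == 0 || (curr - cpu->snap) >= 2;
}

int main(void)
{
	struct toy_cpu busy = { .dynticks = 5 };	/* stays non-idle */
	struct toy_cpu napper = { .dynticks = 5 };	/* idles after snapshot */

	save_progress_counter(&busy);
	save_progress_counter(&napper);
	napper.dynticks += 2;	/* went idle and came back: counter moved */

	printf("busy   QS? %d\n", implicit_dynticks_qs(&busy));	/* 0 */
	printf("napper QS? %d\n", implicit_dynticks_qs(&napper));	/* 1 */
	return 0;
}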
| 1148 | /* | 1267 | /* |
| 1149 | * Ensure that all grace-period and pre-grace-period activity | 1268 | * Clean up after the old grace period. |
| 1150 | * is seen before the assignment to rsp->completed. | 1269 | */ |
| 1151 | */ | 1270 | static void rcu_gp_cleanup(struct rcu_state *rsp) |
| 1152 | smp_mb(); /* See above block comment. */ | 1271 | { |
| 1272 | unsigned long gp_duration; | ||
| 1273 | struct rcu_data *rdp; | ||
| 1274 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 1275 | |||
| 1276 | raw_spin_lock_irq(&rnp->lock); | ||
| 1153 | gp_duration = jiffies - rsp->gp_start; | 1277 | gp_duration = jiffies - rsp->gp_start; |
| 1154 | if (gp_duration > rsp->gp_max) | 1278 | if (gp_duration > rsp->gp_max) |
| 1155 | rsp->gp_max = gp_duration; | 1279 | rsp->gp_max = gp_duration; |
| @@ -1161,35 +1285,149 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | |||
| 1161 | * they can do to advance the grace period. It is therefore | 1285 | * they can do to advance the grace period. It is therefore |
| 1162 | * safe for us to drop the lock in order to mark the grace | 1286 | * safe for us to drop the lock in order to mark the grace |
| 1163 | * period as completed in all of the rcu_node structures. | 1287 | * period as completed in all of the rcu_node structures. |
| 1164 | * | ||
| 1165 | * But if this CPU needs another grace period, it will take | ||
| 1166 | * care of this while initializing the next grace period. | ||
| 1167 | * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL | ||
| 1168 | * because the callbacks have not yet been advanced: Those | ||
| 1169 | * callbacks are waiting on the grace period that just now | ||
| 1170 | * completed. | ||
| 1171 | */ | 1288 | */ |
| 1172 | if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) { | 1289 | raw_spin_unlock_irq(&rnp->lock); |
| 1173 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 1174 | 1290 | ||
| 1175 | /* | 1291 | /* |
| 1176 | * Propagate new ->completed value to rcu_node structures | 1292 | * Propagate new ->completed value to rcu_node structures so |
| 1177 | * so that other CPUs don't have to wait until the start | 1293 | * that other CPUs don't have to wait until the start of the next |
| 1178 | * of the next grace period to process their callbacks. | 1294 | * grace period to process their callbacks. This also avoids |
| 1179 | */ | 1295 | * some nasty RCU grace-period initialization races by forcing |
| 1180 | rcu_for_each_node_breadth_first(rsp, rnp) { | 1296 | * the end of the current grace period to be completely recorded in |
| 1181 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1297 | * all of the rcu_node structures before the beginning of the next |
| 1182 | rnp->completed = rsp->gpnum; | 1298 | * grace period is recorded in any of the rcu_node structures. |
| 1183 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 1299 | */ |
| 1184 | } | 1300 | rcu_for_each_node_breadth_first(rsp, rnp) { |
| 1185 | rnp = rcu_get_root(rsp); | 1301 | raw_spin_lock_irq(&rnp->lock); |
| 1186 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 1302 | rnp->completed = rsp->gpnum; |
| 1303 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1304 | cond_resched(); | ||
| 1187 | } | 1305 | } |
| 1306 | rnp = rcu_get_root(rsp); | ||
| 1307 | raw_spin_lock_irq(&rnp->lock); | ||
| 1188 | 1308 | ||
| 1189 | rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ | 1309 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ |
| 1190 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); | 1310 | trace_rcu_grace_period(rsp->name, rsp->completed, "end"); |
| 1191 | rsp->fqs_state = RCU_GP_IDLE; | 1311 | rsp->fqs_state = RCU_GP_IDLE; |
| 1192 | rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ | 1312 | rdp = this_cpu_ptr(rsp->rda); |
| 1313 | if (cpu_needs_another_gp(rsp, rdp)) | ||
| 1314 | rsp->gp_flags = 1; | ||
| 1315 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1316 | } | ||
| 1317 | |||
| 1318 | /* | ||
| 1319 | * Body of kthread that handles grace periods. | ||
| 1320 | */ | ||
| 1321 | static int __noreturn rcu_gp_kthread(void *arg) | ||
| 1322 | { | ||
| 1323 | int fqs_state; | ||
| 1324 | unsigned long j; | ||
| 1325 | int ret; | ||
| 1326 | struct rcu_state *rsp = arg; | ||
| 1327 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 1328 | |||
| 1329 | for (;;) { | ||
| 1330 | |||
| 1331 | /* Handle grace-period start. */ | ||
| 1332 | for (;;) { | ||
| 1333 | wait_event_interruptible(rsp->gp_wq, | ||
| 1334 | rsp->gp_flags & | ||
| 1335 | RCU_GP_FLAG_INIT); | ||
| 1336 | if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && | ||
| 1337 | rcu_gp_init(rsp)) | ||
| 1338 | break; | ||
| 1339 | cond_resched(); | ||
| 1340 | flush_signals(current); | ||
| 1341 | } | ||
| 1342 | |||
| 1343 | /* Handle quiescent-state forcing. */ | ||
| 1344 | fqs_state = RCU_SAVE_DYNTICK; | ||
| 1345 | j = jiffies_till_first_fqs; | ||
| 1346 | if (j > HZ) { | ||
| 1347 | j = HZ; | ||
| 1348 | jiffies_till_first_fqs = HZ; | ||
| 1349 | } | ||
| 1350 | for (;;) { | ||
| 1351 | rsp->jiffies_force_qs = jiffies + j; | ||
| 1352 | ret = wait_event_interruptible_timeout(rsp->gp_wq, | ||
| 1353 | (rsp->gp_flags & RCU_GP_FLAG_FQS) || | ||
| 1354 | (!ACCESS_ONCE(rnp->qsmask) && | ||
| 1355 | !rcu_preempt_blocked_readers_cgp(rnp)), | ||
| 1356 | j); | ||
| 1357 | /* If grace period done, leave loop. */ | ||
| 1358 | if (!ACCESS_ONCE(rnp->qsmask) && | ||
| 1359 | !rcu_preempt_blocked_readers_cgp(rnp)) | ||
| 1360 | break; | ||
| 1361 | /* If time for quiescent-state forcing, do it. */ | ||
| 1362 | if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { | ||
| 1363 | fqs_state = rcu_gp_fqs(rsp, fqs_state); | ||
| 1364 | cond_resched(); | ||
| 1365 | } else { | ||
| 1366 | /* Deal with stray signal. */ | ||
| 1367 | cond_resched(); | ||
| 1368 | flush_signals(current); | ||
| 1369 | } | ||
| 1370 | j = jiffies_till_next_fqs; | ||
| 1371 | if (j > HZ) { | ||
| 1372 | j = HZ; | ||
| 1373 | jiffies_till_next_fqs = HZ; | ||
| 1374 | } else if (j < 1) { | ||
| 1375 | j = 1; | ||
| 1376 | jiffies_till_next_fqs = 1; | ||
| 1377 | } | ||
| 1378 | } | ||
| 1379 | |||
| 1380 | /* Handle grace-period end. */ | ||
| 1381 | rcu_gp_cleanup(rsp); | ||
| 1382 | } | ||
| 1383 | } | ||
| 1384 | |||
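The grace-period kthread above is essentially an event loop: sleep until a new grace period is requested, then alternate between a timed wait and quiescent-state forcing until the last quiescent state is reported. The user-space sketch below mirrors that cadence with a condition variable standing in for the wait queue; the one-second forcing period and all names are invented for the example, not taken from the kernel.

/*
 * User-space sketch of the rcu_gp_kthread() wait/force cadence using a
 * condition variable in place of the kernel wait queue.  Build with
 * -lpthread.  Everything here (names, one-second period, the fake
 * "grace period done" flag) is invented for illustration.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool fqs_requested;	/* ~ RCU_GP_FLAG_FQS */
static bool gp_done;		/* ~ !rnp->qsmask && no blocked readers */

static void *gp_thread(void *arg)
{
	struct timespec deadline;
	int pass = 0;

	(void)arg;
	pthread_mutex_lock(&lock);
	while (!gp_done) {
		clock_gettime(CLOCK_REALTIME, &deadline);
		deadline.tv_sec += 1;	/* ~ jiffies_till_next_fqs */

		/* Sleep until poked or until the forcing period expires. */
		int ret = 0;
		while (!fqs_requested && !gp_done && ret == 0)
			ret = pthread_cond_timedwait(&cond, &lock, &deadline);

		if (gp_done)
			break;
		/* Timeout or explicit request: do one forcing pass. */
		printf("forcing pass %d (%s)\n", ++pass,
		       fqs_requested ? "requested" : "timer");
		fqs_requested = false;
	}
	pthread_mutex_unlock(&lock);
	printf("grace period cleanup\n");
	return NULL;
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, gp_thread, NULL);

	/* A "CPU" asks for quiescent-state forcing, then ends the GP. */
	pthread_mutex_lock(&lock);
	fqs_requested = true;		/* ~ force_quiescent_state() */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);

	struct timespec pause = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };
	nanosleep(&pause, NULL);

	pthread_mutex_lock(&lock);
	gp_done = true;			/* ~ last quiescent state reported */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);

	pthread_join(tid, NULL);
	return 0;
}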
| 1385 | /* | ||
| 1386 | * Start a new RCU grace period if warranted, re-initializing the hierarchy | ||
| 1387 | * in preparation for detecting the next grace period. The caller must hold | ||
| 1388 | * the root node's ->lock, which is released before return. Hard irqs must | ||
| 1389 | * be disabled. | ||
| 1390 | * | ||
| 1391 | * Note that it is legal for a dying CPU (which is marked as offline) to | ||
| 1392 | * invoke this function. This can happen when the dying CPU reports its | ||
| 1393 | * quiescent state. | ||
| 1394 | */ | ||
| 1395 | static void | ||
| 1396 | rcu_start_gp(struct rcu_state *rsp, unsigned long flags) | ||
| 1397 | __releases(rcu_get_root(rsp)->lock) | ||
| 1398 | { | ||
| 1399 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
| 1400 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 1401 | |||
| 1402 | if (!rsp->gp_kthread || | ||
| 1403 | !cpu_needs_another_gp(rsp, rdp)) { | ||
| 1404 | /* | ||
| 1405 | * Either we have not yet spawned the grace-period | ||
| 1406 | * task or this CPU does not need another grace period. | ||
| 1407 | * Either way, don't start a new grace period. | ||
| 1408 | */ | ||
| 1409 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1410 | return; | ||
| 1411 | } | ||
| 1412 | |||
| 1413 | rsp->gp_flags = RCU_GP_FLAG_INIT; | ||
| 1414 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1415 | wake_up(&rsp->gp_wq); | ||
| 1416 | } | ||
| 1417 | |||
| 1418 | /* | ||
| 1419 | * Report a full set of quiescent states to the specified rcu_state | ||
| 1420 | * data structure. This involves cleaning up after the prior grace | ||
| 1421 | * period and letting rcu_start_gp() start up the next grace period | ||
| 1422 | * if one is needed. Note that the caller must hold rnp->lock, as | ||
| 1423 | * required by rcu_start_gp(), which will release it. | ||
| 1424 | */ | ||
| 1425 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | ||
| 1426 | __releases(rcu_get_root(rsp)->lock) | ||
| 1427 | { | ||
| 1428 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | ||
| 1429 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | ||
| 1430 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ | ||
| 1193 | } | 1431 | } |
| 1194 | 1432 | ||
| 1195 | /* | 1433 | /* |
| @@ -1258,7 +1496,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | |||
| 1258 | * based on quiescent states detected in an earlier grace period! | 1496 | * based on quiescent states detected in an earlier grace period! |
| 1259 | */ | 1497 | */ |
| 1260 | static void | 1498 | static void |
| 1261 | rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp) | 1499 | rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) |
| 1262 | { | 1500 | { |
| 1263 | unsigned long flags; | 1501 | unsigned long flags; |
| 1264 | unsigned long mask; | 1502 | unsigned long mask; |
| @@ -1266,7 +1504,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las | |||
| 1266 | 1504 | ||
| 1267 | rnp = rdp->mynode; | 1505 | rnp = rdp->mynode; |
| 1268 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1506 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 1269 | if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) { | 1507 | if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || |
| 1508 | rnp->completed == rnp->gpnum) { | ||
| 1270 | 1509 | ||
| 1271 | /* | 1510 | /* |
| 1272 | * The grace period in which this quiescent state was | 1511 | * The grace period in which this quiescent state was |
| @@ -1325,7 +1564,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 1325 | * Tell RCU we are done (but rcu_report_qs_rdp() will be the | 1564 | * Tell RCU we are done (but rcu_report_qs_rdp() will be the |
| 1326 | * judge of that). | 1565 | * judge of that). |
| 1327 | */ | 1566 | */ |
| 1328 | rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum); | 1567 | rcu_report_qs_rdp(rdp->cpu, rsp, rdp); |
| 1329 | } | 1568 | } |
| 1330 | 1569 | ||
| 1331 | #ifdef CONFIG_HOTPLUG_CPU | 1570 | #ifdef CONFIG_HOTPLUG_CPU |
| @@ -1390,17 +1629,6 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | |||
| 1390 | int i; | 1629 | int i; |
| 1391 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | 1630 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); |
| 1392 | 1631 | ||
| 1393 | /* | ||
| 1394 | * If there is an rcu_barrier() operation in progress, then | ||
| 1395 | * only the task doing that operation is permitted to adopt | ||
| 1396 | * callbacks. To do otherwise breaks rcu_barrier() and friends | ||
| 1397 | * by causing them to fail to wait for the callbacks in the | ||
| 1398 | * orphanage. | ||
| 1399 | */ | ||
| 1400 | if (rsp->rcu_barrier_in_progress && | ||
| 1401 | rsp->rcu_barrier_in_progress != current) | ||
| 1402 | return; | ||
| 1403 | |||
| 1404 | /* Do the accounting first. */ | 1632 | /* Do the accounting first. */ |
| 1405 | rdp->qlen_lazy += rsp->qlen_lazy; | 1633 | rdp->qlen_lazy += rsp->qlen_lazy; |
| 1406 | rdp->qlen += rsp->qlen; | 1634 | rdp->qlen += rsp->qlen; |
| @@ -1455,9 +1683,8 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | |||
| 1455 | * The CPU has been completely removed, and some other CPU is reporting | 1683 | * The CPU has been completely removed, and some other CPU is reporting |
| 1456 | * this fact from process context. Do the remainder of the cleanup, | 1684 | * this fact from process context. Do the remainder of the cleanup, |
| 1457 | * including orphaning the outgoing CPU's RCU callbacks, and also | 1685 | * including orphaning the outgoing CPU's RCU callbacks, and also |
| 1458 | * adopting them, if there is no _rcu_barrier() instance running. | 1686 | * adopting them. There can only be one CPU hotplug operation at a time, |
| 1459 | * There can only be one CPU hotplug operation at a time, so no other | 1687 | * so no other CPU can be attempting to update rcu_cpu_kthread_task. |
| 1460 | * CPU can be attempting to update rcu_cpu_kthread_task. | ||
| 1461 | */ | 1688 | */ |
| 1462 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | 1689 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) |
| 1463 | { | 1690 | { |
| @@ -1468,8 +1695,7 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
| 1468 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ | 1695 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ |
| 1469 | 1696 | ||
| 1470 | /* Adjust any no-longer-needed kthreads. */ | 1697 | /* Adjust any no-longer-needed kthreads. */ |
| 1471 | rcu_stop_cpu_kthread(cpu); | 1698 | rcu_boost_kthread_setaffinity(rnp, -1); |
| 1472 | rcu_node_kthread_setaffinity(rnp, -1); | ||
| 1473 | 1699 | ||
| 1474 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ | 1700 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ |
| 1475 | 1701 | ||
| @@ -1515,14 +1741,13 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | |||
| 1515 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | 1741 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, |
| 1516 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | 1742 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", |
| 1517 | cpu, rdp->qlen, rdp->nxtlist); | 1743 | cpu, rdp->qlen, rdp->nxtlist); |
| 1744 | init_callback_list(rdp); | ||
| 1745 | /* Disallow further callbacks on this CPU. */ | ||
| 1746 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
| 1518 | } | 1747 | } |
| 1519 | 1748 | ||
| 1520 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | 1749 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 1521 | 1750 | ||
| 1522 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
| 1523 | { | ||
| 1524 | } | ||
| 1525 | |||
| 1526 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | 1751 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) |
| 1527 | { | 1752 | { |
| 1528 | } | 1753 | } |
| @@ -1687,6 +1912,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
| 1687 | struct rcu_node *rnp; | 1912 | struct rcu_node *rnp; |
| 1688 | 1913 | ||
| 1689 | rcu_for_each_leaf_node(rsp, rnp) { | 1914 | rcu_for_each_leaf_node(rsp, rnp) { |
| 1915 | cond_resched(); | ||
| 1690 | mask = 0; | 1916 | mask = 0; |
| 1691 | raw_spin_lock_irqsave(&rnp->lock, flags); | 1917 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 1692 | if (!rcu_gp_in_progress(rsp)) { | 1918 | if (!rcu_gp_in_progress(rsp)) { |
| @@ -1723,72 +1949,39 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) | |||
| 1723 | * Force quiescent states on reluctant CPUs, and also detect which | 1949 | * Force quiescent states on reluctant CPUs, and also detect which |
| 1724 | * CPUs are in dyntick-idle mode. | 1950 | * CPUs are in dyntick-idle mode. |
| 1725 | */ | 1951 | */ |
| 1726 | static void force_quiescent_state(struct rcu_state *rsp, int relaxed) | 1952 | static void force_quiescent_state(struct rcu_state *rsp) |
| 1727 | { | 1953 | { |
| 1728 | unsigned long flags; | 1954 | unsigned long flags; |
| 1729 | struct rcu_node *rnp = rcu_get_root(rsp); | 1955 | bool ret; |
| 1730 | 1956 | struct rcu_node *rnp; | |
| 1731 | trace_rcu_utilization("Start fqs"); | 1957 | struct rcu_node *rnp_old = NULL; |
| 1732 | if (!rcu_gp_in_progress(rsp)) { | 1958 | |
| 1733 | trace_rcu_utilization("End fqs"); | 1959 | /* Funnel through hierarchy to reduce memory contention. */ |
| 1734 | return; /* No grace period in progress, nothing to force. */ | 1960 | rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; |
| 1735 | } | 1961 | for (; rnp != NULL; rnp = rnp->parent) { |
| 1736 | if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { | 1962 | ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || |
| 1737 | rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ | 1963 | !raw_spin_trylock(&rnp->fqslock); |
| 1738 | trace_rcu_utilization("End fqs"); | 1964 | if (rnp_old != NULL) |
| 1739 | return; /* Someone else is already on the job. */ | 1965 | raw_spin_unlock(&rnp_old->fqslock); |
| 1740 | } | 1966 | if (ret) { |
| 1741 | if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) | 1967 | rsp->n_force_qs_lh++; |
| 1742 | goto unlock_fqs_ret; /* no emergency and done recently. */ | 1968 | return; |
| 1743 | rsp->n_force_qs++; | 1969 | } |
| 1744 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | 1970 | rnp_old = rnp; |
| 1745 | rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; | ||
| 1746 | if(!rcu_gp_in_progress(rsp)) { | ||
| 1747 | rsp->n_force_qs_ngp++; | ||
| 1748 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
| 1749 | goto unlock_fqs_ret; /* no GP in progress, time updated. */ | ||
| 1750 | } | ||
| 1751 | rsp->fqs_active = 1; | ||
| 1752 | switch (rsp->fqs_state) { | ||
| 1753 | case RCU_GP_IDLE: | ||
| 1754 | case RCU_GP_INIT: | ||
| 1755 | |||
| 1756 | break; /* grace period idle or initializing, ignore. */ | ||
| 1757 | |||
| 1758 | case RCU_SAVE_DYNTICK: | ||
| 1759 | |||
| 1760 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
| 1761 | |||
| 1762 | /* Record dyntick-idle state. */ | ||
| 1763 | force_qs_rnp(rsp, dyntick_save_progress_counter); | ||
| 1764 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | ||
| 1765 | if (rcu_gp_in_progress(rsp)) | ||
| 1766 | rsp->fqs_state = RCU_FORCE_QS; | ||
| 1767 | break; | ||
| 1768 | |||
| 1769 | case RCU_FORCE_QS: | ||
| 1770 | |||
| 1771 | /* Check dyntick-idle state, send IPI to laggarts. */ | ||
| 1772 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
| 1773 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs); | ||
| 1774 | |||
| 1775 | /* Leave state in case more forcing is required. */ | ||
| 1776 | |||
| 1777 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | ||
| 1778 | break; | ||
| 1779 | } | 1971 | } |
| 1780 | rsp->fqs_active = 0; | 1972 | /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ |
| 1781 | if (rsp->fqs_need_gp) { | 1973 | |
| 1782 | raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ | 1974 | /* Reached the root of the rcu_node tree, acquire lock. */ |
| 1783 | rsp->fqs_need_gp = 0; | 1975 | raw_spin_lock_irqsave(&rnp_old->lock, flags); |
| 1784 | rcu_start_gp(rsp, flags); /* releases rnp->lock */ | 1976 | raw_spin_unlock(&rnp_old->fqslock); |
| 1785 | trace_rcu_utilization("End fqs"); | 1977 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { |
| 1786 | return; | 1978 | rsp->n_force_qs_lh++; |
| 1979 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | ||
| 1980 | return; /* Someone beat us to it. */ | ||
| 1787 | } | 1981 | } |
| 1788 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | 1982 | rsp->gp_flags |= RCU_GP_FLAG_FQS; |
| 1789 | unlock_fqs_ret: | 1983 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); |
| 1790 | raw_spin_unlock_irqrestore(&rsp->fqslock, flags); | 1984 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ |
| 1791 | trace_rcu_utilization("End fqs"); | ||
| 1792 | } | 1985 | } |
| 1793 | 1986 | ||
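The rewritten force_quiescent_state() above replaces the single global fqslock with a funnel: each caller trylocks its leaf's fqslock, walks toward the root while dropping the lock below once the one above is held, and backs off as soon as the flag is already set or another caller is ahead. Here is a compressed user-space rendition of that walk over a two-level tree; the structure layout and names are invented for this sketch, and the final flag update is simplified relative to the kernel's locking.

/*
 * User-space rendition of the fqslock "funnel" in
 * force_quiescent_state(): trylock upward, drop the lock below, and
 * bail out as soon as another caller is already ahead of us.  The
 * two-level tree and all names are invented, and the calls below run
 * sequentially, so no real concurrency control is exercised.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct node {
	pthread_mutex_t fqslock;
	struct node *parent;
};

static struct node root = { PTHREAD_MUTEX_INITIALIZER, NULL };
static struct node leaf[2] = {
	{ PTHREAD_MUTEX_INITIALIZER, &root },
	{ PTHREAD_MUTEX_INITIALIZER, &root },
};
static bool fqs_flag;		/* ~ RCU_GP_FLAG_FQS */

static void request_fqs(struct node *my_leaf)
{
	struct node *rnp, *rnp_old = NULL;

	for (rnp = my_leaf; rnp != NULL; rnp = rnp->parent) {
		bool lost = fqs_flag ||
			    pthread_mutex_trylock(&rnp->fqslock) != 0;
		if (rnp_old)
			pthread_mutex_unlock(&rnp_old->fqslock);
		if (lost) {
			printf("someone else is already forcing\n");
			return;
		}
		rnp_old = rnp;
	}
	/* rnp_old == &root here: we own the root fqslock. */
	if (!fqs_flag) {
		fqs_flag = true;	/* ~ rsp->gp_flags |= RCU_GP_FLAG_FQS */
		printf("requested quiescent-state forcing\n");
	}
	pthread_mutex_unlock(&rnp_old->fqslock);
}

int main(void)
{
	request_fqs(&leaf[0]);	/* sets the flag */
	request_fqs(&leaf[1]);	/* sees the flag, backs off early */
	return 0;
}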
| 1794 | /* | 1987 | /* |
| @@ -1805,13 +1998,6 @@ __rcu_process_callbacks(struct rcu_state *rsp) | |||
| 1805 | WARN_ON_ONCE(rdp->beenonline == 0); | 1998 | WARN_ON_ONCE(rdp->beenonline == 0); |
| 1806 | 1999 | ||
| 1807 | /* | 2000 | /* |
| 1808 | * If an RCU GP has gone long enough, go check for dyntick | ||
| 1809 | * idle CPUs and, if needed, send resched IPIs. | ||
| 1810 | */ | ||
| 1811 | if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) | ||
| 1812 | force_quiescent_state(rsp, 1); | ||
| 1813 | |||
| 1814 | /* | ||
| 1815 | * Advance callbacks in response to end of earlier grace | 2001 | * Advance callbacks in response to end of earlier grace |
| 1816 | * period that some other CPU ended. | 2002 | * period that some other CPU ended. |
| 1817 | */ | 2003 | */ |
| @@ -1838,6 +2024,8 @@ static void rcu_process_callbacks(struct softirq_action *unused) | |||
| 1838 | { | 2024 | { |
| 1839 | struct rcu_state *rsp; | 2025 | struct rcu_state *rsp; |
| 1840 | 2026 | ||
| 2027 | if (cpu_is_offline(smp_processor_id())) | ||
| 2028 | return; | ||
| 1841 | trace_rcu_utilization("Start RCU core"); | 2029 | trace_rcu_utilization("Start RCU core"); |
| 1842 | for_each_rcu_flavor(rsp) | 2030 | for_each_rcu_flavor(rsp) |
| 1843 | __rcu_process_callbacks(rsp); | 2031 | __rcu_process_callbacks(rsp); |
| @@ -1909,12 +2097,11 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | |||
| 1909 | rdp->blimit = LONG_MAX; | 2097 | rdp->blimit = LONG_MAX; |
| 1910 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | 2098 | if (rsp->n_force_qs == rdp->n_force_qs_snap && |
| 1911 | *rdp->nxttail[RCU_DONE_TAIL] != head) | 2099 | *rdp->nxttail[RCU_DONE_TAIL] != head) |
| 1912 | force_quiescent_state(rsp, 0); | 2100 | force_quiescent_state(rsp); |
| 1913 | rdp->n_force_qs_snap = rsp->n_force_qs; | 2101 | rdp->n_force_qs_snap = rsp->n_force_qs; |
| 1914 | rdp->qlen_last_fqs_check = rdp->qlen; | 2102 | rdp->qlen_last_fqs_check = rdp->qlen; |
| 1915 | } | 2103 | } |
| 1916 | } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) | 2104 | } |
| 1917 | force_quiescent_state(rsp, 1); | ||
| 1918 | } | 2105 | } |
| 1919 | 2106 | ||
| 1920 | static void | 2107 | static void |
| @@ -1929,8 +2116,6 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1929 | head->func = func; | 2116 | head->func = func; |
| 1930 | head->next = NULL; | 2117 | head->next = NULL; |
| 1931 | 2118 | ||
| 1932 | smp_mb(); /* Ensure RCU update seen before callback registry. */ | ||
| 1933 | |||
| 1934 | /* | 2119 | /* |
| 1935 | * Opportunistically note grace-period endings and beginnings. | 2120 | * Opportunistically note grace-period endings and beginnings. |
| 1936 | * Note that we might see a beginning right after we see an | 2121 | * Note that we might see a beginning right after we see an |
| @@ -1941,6 +2126,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | |||
| 1941 | rdp = this_cpu_ptr(rsp->rda); | 2126 | rdp = this_cpu_ptr(rsp->rda); |
| 1942 | 2127 | ||
| 1943 | /* Add the callback to our list. */ | 2128 | /* Add the callback to our list. */ |
| 2129 | if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL)) { | ||
| 2130 | /* _call_rcu() is illegal on offline CPU; leak the callback. */ | ||
| 2131 | WARN_ON_ONCE(1); | ||
| 2132 | local_irq_restore(flags); | ||
| 2133 | return; | ||
| 2134 | } | ||
| 1944 | ACCESS_ONCE(rdp->qlen)++; | 2135 | ACCESS_ONCE(rdp->qlen)++; |
| 1945 | if (lazy) | 2136 | if (lazy) |
| 1946 | rdp->qlen_lazy++; | 2137 | rdp->qlen_lazy++; |
| @@ -2195,17 +2386,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2195 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | 2386 | /* Is the RCU core waiting for a quiescent state from this CPU? */ |
| 2196 | if (rcu_scheduler_fully_active && | 2387 | if (rcu_scheduler_fully_active && |
| 2197 | rdp->qs_pending && !rdp->passed_quiesce) { | 2388 | rdp->qs_pending && !rdp->passed_quiesce) { |
| 2198 | |||
| 2199 | /* | ||
| 2200 | * If force_quiescent_state() coming soon and this CPU | ||
| 2201 | * needs a quiescent state, and this is either RCU-sched | ||
| 2202 | * or RCU-bh, force a local reschedule. | ||
| 2203 | */ | ||
| 2204 | rdp->n_rp_qs_pending++; | 2389 | rdp->n_rp_qs_pending++; |
| 2205 | if (!rdp->preemptible && | ||
| 2206 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, | ||
| 2207 | jiffies)) | ||
| 2208 | set_need_resched(); | ||
| 2209 | } else if (rdp->qs_pending && rdp->passed_quiesce) { | 2390 | } else if (rdp->qs_pending && rdp->passed_quiesce) { |
| 2210 | rdp->n_rp_report_qs++; | 2391 | rdp->n_rp_report_qs++; |
| 2211 | return 1; | 2392 | return 1; |
| @@ -2235,13 +2416,6 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | |||
| 2235 | return 1; | 2416 | return 1; |
| 2236 | } | 2417 | } |
| 2237 | 2418 | ||
| 2238 | /* Has an RCU GP gone long enough to send resched IPIs &c? */ | ||
| 2239 | if (rcu_gp_in_progress(rsp) && | ||
| 2240 | ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { | ||
| 2241 | rdp->n_rp_need_fqs++; | ||
| 2242 | return 1; | ||
| 2243 | } | ||
| 2244 | |||
| 2245 | /* nothing to do */ | 2419 | /* nothing to do */ |
| 2246 | rdp->n_rp_need_nothing++; | 2420 | rdp->n_rp_need_nothing++; |
| 2247 | return 0; | 2421 | return 0; |
| @@ -2326,13 +2500,10 @@ static void rcu_barrier_func(void *type) | |||
| 2326 | static void _rcu_barrier(struct rcu_state *rsp) | 2500 | static void _rcu_barrier(struct rcu_state *rsp) |
| 2327 | { | 2501 | { |
| 2328 | int cpu; | 2502 | int cpu; |
| 2329 | unsigned long flags; | ||
| 2330 | struct rcu_data *rdp; | 2503 | struct rcu_data *rdp; |
| 2331 | struct rcu_data rd; | ||
| 2332 | unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); | 2504 | unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); |
| 2333 | unsigned long snap_done; | 2505 | unsigned long snap_done; |
| 2334 | 2506 | ||
| 2335 | init_rcu_head_on_stack(&rd.barrier_head); | ||
| 2336 | _rcu_barrier_trace(rsp, "Begin", -1, snap); | 2507 | _rcu_barrier_trace(rsp, "Begin", -1, snap); |
| 2337 | 2508 | ||
| 2338 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | 2509 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ |
| @@ -2372,70 +2543,30 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
| 2372 | /* | 2543 | /* |
| 2373 | * Initialize the count to one rather than to zero in order to | 2544 | * Initialize the count to one rather than to zero in order to |
| 2374 | * avoid a too-soon return to zero in case of a short grace period | 2545 | * avoid a too-soon return to zero in case of a short grace period |
| 2375 | * (or preemption of this task). Also flag this task as doing | 2546 | * (or preemption of this task). Exclude CPU-hotplug operations |
| 2376 | * an rcu_barrier(). This will prevent anyone else from adopting | 2547 | * to ensure that no offline CPU has callbacks queued. |
| 2377 | * orphaned callbacks, which could cause otherwise failure if a | ||
| 2378 | * CPU went offline and quickly came back online. To see this, | ||
| 2379 | * consider the following sequence of events: | ||
| 2380 | * | ||
| 2381 | * 1. We cause CPU 0 to post an rcu_barrier_callback() callback. | ||
| 2382 | * 2. CPU 1 goes offline, orphaning its callbacks. | ||
| 2383 | * 3. CPU 0 adopts CPU 1's orphaned callbacks. | ||
| 2384 | * 4. CPU 1 comes back online. | ||
| 2385 | * 5. We cause CPU 1 to post an rcu_barrier_callback() callback. | ||
| 2386 | * 6. Both rcu_barrier_callback() callbacks are invoked, awakening | ||
| 2387 | * us -- but before CPU 1's orphaned callbacks are invoked!!! | ||
| 2388 | */ | 2548 | */ |
| 2389 | init_completion(&rsp->barrier_completion); | 2549 | init_completion(&rsp->barrier_completion); |
| 2390 | atomic_set(&rsp->barrier_cpu_count, 1); | 2550 | atomic_set(&rsp->barrier_cpu_count, 1); |
| 2391 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | 2551 | get_online_cpus(); |
| 2392 | rsp->rcu_barrier_in_progress = current; | ||
| 2393 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
| 2394 | 2552 | ||
| 2395 | /* | 2553 | /* |
| 2396 | * Force every CPU with callbacks to register a new callback | 2554 | * Force each CPU with callbacks to register a new callback. |
| 2397 | * that will tell us when all the preceding callbacks have | 2555 | * When that callback is invoked, we will know that all of the |
| 2398 | * been invoked. If an offline CPU has callbacks, wait for | 2556 | * corresponding CPU's preceding callbacks have been invoked. |
| 2399 | * it to either come back online or to finish orphaning those | ||
| 2400 | * callbacks. | ||
| 2401 | */ | 2557 | */ |
| 2402 | for_each_possible_cpu(cpu) { | 2558 | for_each_online_cpu(cpu) { |
| 2403 | preempt_disable(); | ||
| 2404 | rdp = per_cpu_ptr(rsp->rda, cpu); | 2559 | rdp = per_cpu_ptr(rsp->rda, cpu); |
| 2405 | if (cpu_is_offline(cpu)) { | 2560 | if (ACCESS_ONCE(rdp->qlen)) { |
| 2406 | _rcu_barrier_trace(rsp, "Offline", cpu, | ||
| 2407 | rsp->n_barrier_done); | ||
| 2408 | preempt_enable(); | ||
| 2409 | while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen)) | ||
| 2410 | schedule_timeout_interruptible(1); | ||
| 2411 | } else if (ACCESS_ONCE(rdp->qlen)) { | ||
| 2412 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, | 2561 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, |
| 2413 | rsp->n_barrier_done); | 2562 | rsp->n_barrier_done); |
| 2414 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); | 2563 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); |
| 2415 | preempt_enable(); | ||
| 2416 | } else { | 2564 | } else { |
| 2417 | _rcu_barrier_trace(rsp, "OnlineNQ", cpu, | 2565 | _rcu_barrier_trace(rsp, "OnlineNQ", cpu, |
| 2418 | rsp->n_barrier_done); | 2566 | rsp->n_barrier_done); |
| 2419 | preempt_enable(); | ||
| 2420 | } | 2567 | } |
| 2421 | } | 2568 | } |
| 2422 | 2569 | put_online_cpus(); | |
| 2423 | /* | ||
| 2424 | * Now that all online CPUs have rcu_barrier_callback() callbacks | ||
| 2425 | * posted, we can adopt all of the orphaned callbacks and place | ||
| 2426 | * an rcu_barrier_callback() callback after them. When that is done, | ||
| 2427 | * we are guaranteed to have an rcu_barrier_callback() callback | ||
| 2428 | * following every callback that could possibly have been | ||
| 2429 | * registered before _rcu_barrier() was called. | ||
| 2430 | */ | ||
| 2431 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
| 2432 | rcu_adopt_orphan_cbs(rsp); | ||
| 2433 | rsp->rcu_barrier_in_progress = NULL; | ||
| 2434 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | ||
| 2435 | atomic_inc(&rsp->barrier_cpu_count); | ||
| 2436 | smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */ | ||
| 2437 | rd.rsp = rsp; | ||
| 2438 | rsp->call(&rd.barrier_head, rcu_barrier_callback); | ||
| 2439 | 2570 | ||
| 2440 | /* | 2571 | /* |
| 2441 | * Now that we have an rcu_barrier_callback() callback on each | 2572 | * Now that we have an rcu_barrier_callback() callback on each |
| @@ -2456,8 +2587,6 @@ static void _rcu_barrier(struct rcu_state *rsp) | |||
| 2456 | 2587 | ||
| 2457 | /* Other rcu_barrier() invocations can now safely proceed. */ | 2588 | /* Other rcu_barrier() invocations can now safely proceed. */ |
| 2458 | mutex_unlock(&rsp->barrier_mutex); | 2589 | mutex_unlock(&rsp->barrier_mutex); |
| 2459 | |||
| 2460 | destroy_rcu_head_on_stack(&rd.barrier_head); | ||
| 2461 | } | 2590 | } |
| 2462 | 2591 | ||
| 2463 | /** | 2592 | /** |
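The reworked _rcu_barrier() above no longer needs to track or adopt orphaned callbacks: holding get_online_cpus() across the scan guarantees that only online CPUs can have callbacks queued. A condensed sketch of the resulting flow (not the literal kernel code; the final decrement-and-wait step is outside the hunk above and is assumed here):

        init_completion(&rsp->barrier_completion);
        atomic_set(&rsp->barrier_cpu_count, 1);   /* avoid a too-soon zero */
        get_online_cpus();                        /* exclude CPU hotplug   */
        for_each_online_cpu(cpu) {
                rdp = per_cpu_ptr(rsp->rda, cpu);
                if (ACCESS_ONCE(rdp->qlen))       /* CPU has callbacks     */
                        smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
        }
        put_online_cpus();
        /* Assumed tail: drop the initial count, then wait for the rest. */
        if (atomic_dec_and_test(&rsp->barrier_cpu_count))
                complete(&rsp->barrier_completion);
        wait_for_completion(&rsp->barrier_completion);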
| @@ -2497,6 +2626,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | |||
| 2497 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | 2626 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); |
| 2498 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); | 2627 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); |
| 2499 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | 2628 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); |
| 2629 | #ifdef CONFIG_RCU_USER_QS | ||
| 2630 | WARN_ON_ONCE(rdp->dynticks->in_user); | ||
| 2631 | #endif | ||
| 2500 | rdp->cpu = cpu; | 2632 | rdp->cpu = cpu; |
| 2501 | rdp->rsp = rsp; | 2633 | rdp->rsp = rsp; |
| 2502 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 2634 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| @@ -2523,6 +2655,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
| 2523 | rdp->qlen_last_fqs_check = 0; | 2655 | rdp->qlen_last_fqs_check = 0; |
| 2524 | rdp->n_force_qs_snap = rsp->n_force_qs; | 2656 | rdp->n_force_qs_snap = rsp->n_force_qs; |
| 2525 | rdp->blimit = blimit; | 2657 | rdp->blimit = blimit; |
| 2658 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | ||
| 2526 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | 2659 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; |
| 2527 | atomic_set(&rdp->dynticks->dynticks, | 2660 | atomic_set(&rdp->dynticks->dynticks, |
| 2528 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | 2661 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); |
| @@ -2555,7 +2688,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | |||
| 2555 | rdp->completed = rnp->completed; | 2688 | rdp->completed = rnp->completed; |
| 2556 | rdp->passed_quiesce = 0; | 2689 | rdp->passed_quiesce = 0; |
| 2557 | rdp->qs_pending = 0; | 2690 | rdp->qs_pending = 0; |
| 2558 | rdp->passed_quiesce_gpnum = rnp->gpnum - 1; | ||
| 2559 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); | 2691 | trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); |
| 2560 | } | 2692 | } |
| 2561 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | 2693 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ |
| @@ -2594,12 +2726,10 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
| 2594 | break; | 2726 | break; |
| 2595 | case CPU_ONLINE: | 2727 | case CPU_ONLINE: |
| 2596 | case CPU_DOWN_FAILED: | 2728 | case CPU_DOWN_FAILED: |
| 2597 | rcu_node_kthread_setaffinity(rnp, -1); | 2729 | rcu_boost_kthread_setaffinity(rnp, -1); |
| 2598 | rcu_cpu_kthread_setrt(cpu, 1); | ||
| 2599 | break; | 2730 | break; |
| 2600 | case CPU_DOWN_PREPARE: | 2731 | case CPU_DOWN_PREPARE: |
| 2601 | rcu_node_kthread_setaffinity(rnp, cpu); | 2732 | rcu_boost_kthread_setaffinity(rnp, cpu); |
| 2602 | rcu_cpu_kthread_setrt(cpu, 0); | ||
| 2603 | break; | 2733 | break; |
| 2604 | case CPU_DYING: | 2734 | case CPU_DYING: |
| 2605 | case CPU_DYING_FROZEN: | 2735 | case CPU_DYING_FROZEN: |
| @@ -2627,6 +2757,28 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | |||
| 2627 | } | 2757 | } |
| 2628 | 2758 | ||
| 2629 | /* | 2759 | /* |
| 2760 | * Spawn the kthread that handles this RCU flavor's grace periods. | ||
| 2761 | */ | ||
| 2762 | static int __init rcu_spawn_gp_kthread(void) | ||
| 2763 | { | ||
| 2764 | unsigned long flags; | ||
| 2765 | struct rcu_node *rnp; | ||
| 2766 | struct rcu_state *rsp; | ||
| 2767 | struct task_struct *t; | ||
| 2768 | |||
| 2769 | for_each_rcu_flavor(rsp) { | ||
| 2770 | t = kthread_run(rcu_gp_kthread, rsp, rsp->name); | ||
| 2771 | BUG_ON(IS_ERR(t)); | ||
| 2772 | rnp = rcu_get_root(rsp); | ||
| 2773 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 2774 | rsp->gp_kthread = t; | ||
| 2775 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2776 | } | ||
| 2777 | return 0; | ||
| 2778 | } | ||
| 2779 | early_initcall(rcu_spawn_gp_kthread); | ||
| 2780 | |||
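rcu_spawn_gp_kthread() above hands each flavor's grace-period work to a dedicated kthread; the kthread body itself (rcu_gp_kthread()) is not part of this hunk. A plausible outline of its outer loop, assuming it sleeps on the new gp_wq wait queue until gp_flags (see the rcutree.h hunks below) requests work:

        /* Hypothetical outline only; the real rcu_gp_kthread() differs. */
        static int rcu_gp_kthread(void *arg)
        {
                struct rcu_state *rsp = arg;

                for (;;) {
                        /* Sleep until someone requests grace-period work. */
                        wait_event_interruptible(rsp->gp_wq,
                                                 rsp->gp_flags & RCU_GP_FLAG_INIT);
                        /* ... initialize and drive the new grace period ... */
                }
                return 0;       /* never reached; runs for the life of the system */
        }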
| 2781 | /* | ||
| 2630 | * This function is invoked towards the end of the scheduler's initialization | 2782 | * This function is invoked towards the end of the scheduler's initialization |
| 2631 | * process. Before this is called, the idle task might contain | 2783 | * process. Before this is called, the idle task might contain |
| 2632 | * RCU read-side critical sections (during which time, this idle | 2784 | * RCU read-side critical sections (during which time, this idle |
| @@ -2661,7 +2813,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
| 2661 | int cprv; | 2813 | int cprv; |
| 2662 | int i; | 2814 | int i; |
| 2663 | 2815 | ||
| 2664 | cprv = NR_CPUS; | 2816 | cprv = nr_cpu_ids; |
| 2665 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | 2817 | for (i = rcu_num_lvls - 1; i >= 0; i--) { |
| 2666 | ccur = rsp->levelcnt[i]; | 2818 | ccur = rsp->levelcnt[i]; |
| 2667 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; | 2819 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; |
| @@ -2676,10 +2828,14 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) | |||
| 2676 | static void __init rcu_init_one(struct rcu_state *rsp, | 2828 | static void __init rcu_init_one(struct rcu_state *rsp, |
| 2677 | struct rcu_data __percpu *rda) | 2829 | struct rcu_data __percpu *rda) |
| 2678 | { | 2830 | { |
| 2679 | static char *buf[] = { "rcu_node_level_0", | 2831 | static char *buf[] = { "rcu_node_0", |
| 2680 | "rcu_node_level_1", | 2832 | "rcu_node_1", |
| 2681 | "rcu_node_level_2", | 2833 | "rcu_node_2", |
| 2682 | "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ | 2834 | "rcu_node_3" }; /* Match MAX_RCU_LVLS */ |
| 2835 | static char *fqs[] = { "rcu_node_fqs_0", | ||
| 2836 | "rcu_node_fqs_1", | ||
| 2837 | "rcu_node_fqs_2", | ||
| 2838 | "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ | ||
| 2683 | int cpustride = 1; | 2839 | int cpustride = 1; |
| 2684 | int i; | 2840 | int i; |
| 2685 | int j; | 2841 | int j; |
| @@ -2704,7 +2860,11 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
| 2704 | raw_spin_lock_init(&rnp->lock); | 2860 | raw_spin_lock_init(&rnp->lock); |
| 2705 | lockdep_set_class_and_name(&rnp->lock, | 2861 | lockdep_set_class_and_name(&rnp->lock, |
| 2706 | &rcu_node_class[i], buf[i]); | 2862 | &rcu_node_class[i], buf[i]); |
| 2707 | rnp->gpnum = 0; | 2863 | raw_spin_lock_init(&rnp->fqslock); |
| 2864 | lockdep_set_class_and_name(&rnp->fqslock, | ||
| 2865 | &rcu_fqs_class[i], fqs[i]); | ||
| 2866 | rnp->gpnum = rsp->gpnum; | ||
| 2867 | rnp->completed = rsp->completed; | ||
| 2708 | rnp->qsmask = 0; | 2868 | rnp->qsmask = 0; |
| 2709 | rnp->qsmaskinit = 0; | 2869 | rnp->qsmaskinit = 0; |
| 2710 | rnp->grplo = j * cpustride; | 2870 | rnp->grplo = j * cpustride; |
| @@ -2727,6 +2887,7 @@ static void __init rcu_init_one(struct rcu_state *rsp, | |||
| 2727 | } | 2887 | } |
| 2728 | 2888 | ||
| 2729 | rsp->rda = rda; | 2889 | rsp->rda = rda; |
| 2890 | init_waitqueue_head(&rsp->gp_wq); | ||
| 2730 | rnp = rsp->level[rcu_num_lvls - 1]; | 2891 | rnp = rsp->level[rcu_num_lvls - 1]; |
| 2731 | for_each_possible_cpu(i) { | 2892 | for_each_possible_cpu(i) { |
| 2732 | while (i > rnp->grphi) | 2893 | while (i > rnp->grphi) |
| @@ -2750,7 +2911,8 @@ static void __init rcu_init_geometry(void) | |||
| 2750 | int rcu_capacity[MAX_RCU_LVLS + 1]; | 2911 | int rcu_capacity[MAX_RCU_LVLS + 1]; |
| 2751 | 2912 | ||
| 2752 | /* If the compile-time values are accurate, just leave. */ | 2913 | /* If the compile-time values are accurate, just leave. */ |
| 2753 | if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF) | 2914 | if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && |
| 2915 | nr_cpu_ids == NR_CPUS) | ||
| 2754 | return; | 2916 | return; |
| 2755 | 2917 | ||
| 2756 | /* | 2918 | /* |
diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 4d29169f2124..5faf05d68326 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h | |||
| @@ -102,6 +102,10 @@ struct rcu_dynticks { | |||
| 102 | /* idle-period nonlazy_posted snapshot. */ | 102 | /* idle-period nonlazy_posted snapshot. */ |
| 103 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ | 103 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ |
| 104 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 104 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
| 105 | #ifdef CONFIG_RCU_USER_QS | ||
| 106 | bool ignore_user_qs; /* Treat userspace as extended QS or not */ | ||
| 107 | bool in_user; /* Is the CPU in userland from RCU POV? */ | ||
| 108 | #endif | ||
| 105 | }; | 109 | }; |
| 106 | 110 | ||
| 107 | /* RCU's kthread states for tracing. */ | 111 | /* RCU's kthread states for tracing. */ |
| @@ -196,12 +200,7 @@ struct rcu_node { | |||
| 196 | /* Refused to boost: not sure why, though. */ | 200 | /* Refused to boost: not sure why, though. */ |
| 197 | /* This can happen due to race conditions. */ | 201 | /* This can happen due to race conditions. */ |
| 198 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 202 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 199 | struct task_struct *node_kthread_task; | 203 | raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; |
| 200 | /* kthread that takes care of this rcu_node */ | ||
| 201 | /* structure, for example, awakening the */ | ||
| 202 | /* per-CPU kthreads as needed. */ | ||
| 203 | unsigned int node_kthread_status; | ||
| 204 | /* State of node_kthread_task for tracing. */ | ||
| 205 | } ____cacheline_internodealigned_in_smp; | 204 | } ____cacheline_internodealigned_in_smp; |
| 206 | 205 | ||
| 207 | /* | 206 | /* |
| @@ -245,8 +244,6 @@ struct rcu_data { | |||
| 245 | /* in order to detect GP end. */ | 244 | /* in order to detect GP end. */ |
| 246 | unsigned long gpnum; /* Highest gp number that this CPU */ | 245 | unsigned long gpnum; /* Highest gp number that this CPU */ |
| 247 | /* is aware of having started. */ | 246 | /* is aware of having started. */ |
| 248 | unsigned long passed_quiesce_gpnum; | ||
| 249 | /* gpnum at time of quiescent state. */ | ||
| 250 | bool passed_quiesce; /* User-mode/idle loop etc. */ | 247 | bool passed_quiesce; /* User-mode/idle loop etc. */ |
| 251 | bool qs_pending; /* Core waits for quiesc state. */ | 248 | bool qs_pending; /* Core waits for quiesc state. */ |
| 252 | bool beenonline; /* CPU online at least once. */ | 249 | bool beenonline; /* CPU online at least once. */ |
| @@ -312,11 +309,13 @@ struct rcu_data { | |||
| 312 | unsigned long n_rp_cpu_needs_gp; | 309 | unsigned long n_rp_cpu_needs_gp; |
| 313 | unsigned long n_rp_gp_completed; | 310 | unsigned long n_rp_gp_completed; |
| 314 | unsigned long n_rp_gp_started; | 311 | unsigned long n_rp_gp_started; |
| 315 | unsigned long n_rp_need_fqs; | ||
| 316 | unsigned long n_rp_need_nothing; | 312 | unsigned long n_rp_need_nothing; |
| 317 | 313 | ||
| 318 | /* 6) _rcu_barrier() callback. */ | 314 | /* 6) _rcu_barrier() and OOM callbacks. */ |
| 319 | struct rcu_head barrier_head; | 315 | struct rcu_head barrier_head; |
| 316 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
| 317 | struct rcu_head oom_head; | ||
| 318 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
| 320 | 319 | ||
| 321 | int cpu; | 320 | int cpu; |
| 322 | struct rcu_state *rsp; | 321 | struct rcu_state *rsp; |
| @@ -375,20 +374,17 @@ struct rcu_state { | |||
| 375 | 374 | ||
| 376 | u8 fqs_state ____cacheline_internodealigned_in_smp; | 375 | u8 fqs_state ____cacheline_internodealigned_in_smp; |
| 377 | /* Force QS state. */ | 376 | /* Force QS state. */ |
| 378 | u8 fqs_active; /* force_quiescent_state() */ | ||
| 379 | /* is running. */ | ||
| 380 | u8 fqs_need_gp; /* A CPU was prevented from */ | ||
| 381 | /* starting a new grace */ | ||
| 382 | /* period because */ | ||
| 383 | /* force_quiescent_state() */ | ||
| 384 | /* was running. */ | ||
| 385 | u8 boost; /* Subject to priority boost. */ | 377 | u8 boost; /* Subject to priority boost. */ |
| 386 | unsigned long gpnum; /* Current gp number. */ | 378 | unsigned long gpnum; /* Current gp number. */ |
| 387 | unsigned long completed; /* # of last completed gp. */ | 379 | unsigned long completed; /* # of last completed gp. */ |
| 380 | struct task_struct *gp_kthread; /* Task for grace periods. */ | ||
| 381 | wait_queue_head_t gp_wq; /* Where GP task waits. */ | ||
| 382 | int gp_flags; /* Commands for GP task. */ | ||
| 388 | 383 | ||
| 389 | /* End of fields guarded by root rcu_node's lock. */ | 384 | /* End of fields guarded by root rcu_node's lock. */ |
| 390 | 385 | ||
| 391 | raw_spinlock_t onofflock; /* exclude on/offline and */ | 386 | raw_spinlock_t onofflock ____cacheline_internodealigned_in_smp; |
| 387 | /* exclude on/offline and */ | ||
| 392 | /* starting new GP. */ | 388 | /* starting new GP. */ |
| 393 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | 389 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ |
| 394 | /* need a grace period. */ | 390 | /* need a grace period. */ |
| @@ -398,16 +394,11 @@ struct rcu_state { | |||
| 398 | struct rcu_head **orphan_donetail; /* Tail of above. */ | 394 | struct rcu_head **orphan_donetail; /* Tail of above. */ |
| 399 | long qlen_lazy; /* Number of lazy callbacks. */ | 395 | long qlen_lazy; /* Number of lazy callbacks. */ |
| 400 | long qlen; /* Total number of callbacks. */ | 396 | long qlen; /* Total number of callbacks. */ |
| 401 | struct task_struct *rcu_barrier_in_progress; | ||
| 402 | /* Task doing rcu_barrier(), */ | ||
| 403 | /* or NULL if no barrier. */ | ||
| 404 | struct mutex barrier_mutex; /* Guards barrier fields. */ | 397 | struct mutex barrier_mutex; /* Guards barrier fields. */ |
| 405 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ | 398 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ |
| 406 | struct completion barrier_completion; /* Wake at barrier end. */ | 399 | struct completion barrier_completion; /* Wake at barrier end. */ |
| 407 | unsigned long n_barrier_done; /* ++ at start and end of */ | 400 | unsigned long n_barrier_done; /* ++ at start and end of */ |
| 408 | /* _rcu_barrier(). */ | 401 | /* _rcu_barrier(). */ |
| 409 | raw_spinlock_t fqslock; /* Only one task forcing */ | ||
| 410 | /* quiescent states. */ | ||
| 411 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | 402 | unsigned long jiffies_force_qs; /* Time at which to invoke */ |
| 412 | /* force_quiescent_state(). */ | 403 | /* force_quiescent_state(). */ |
| 413 | unsigned long n_force_qs; /* Number of calls to */ | 404 | unsigned long n_force_qs; /* Number of calls to */ |
| @@ -426,6 +417,10 @@ struct rcu_state { | |||
| 426 | struct list_head flavors; /* List of RCU flavors. */ | 417 | struct list_head flavors; /* List of RCU flavors. */ |
| 427 | }; | 418 | }; |
| 428 | 419 | ||
| 420 | /* Values for rcu_state structure's gp_flags field. */ | ||
| 421 | #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ | ||
| 422 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ | ||
| 423 | |||
| 429 | extern struct list_head rcu_struct_flavors; | 424 | extern struct list_head rcu_struct_flavors; |
| 430 | #define for_each_rcu_flavor(rsp) \ | 425 | #define for_each_rcu_flavor(rsp) \ |
| 431 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) | 426 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) |
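These flag bits, together with the gp_kthread and gp_wq fields added above, define how the rest of the patch talks to the grace-period kthread: instead of doing initialization or quiescent-state forcing inline, a caller sets the relevant bit and wakes the kthread. A minimal sketch of that hand-off, assuming (per the field comments) that gp_flags is protected by the root rcu_node's lock:

        /* Sketch: request grace-period work and kick the GP kthread. */
        rnp = rcu_get_root(rsp);
        raw_spin_lock_irqsave(&rnp->lock, flags);
        rsp->gp_flags |= RCU_GP_FLAG_INIT;      /* or RCU_GP_FLAG_FQS */
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
        wake_up(&rsp->gp_wq);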
| @@ -468,7 +463,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | |||
| 468 | #ifdef CONFIG_HOTPLUG_CPU | 463 | #ifdef CONFIG_HOTPLUG_CPU |
| 469 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | 464 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, |
| 470 | unsigned long flags); | 465 | unsigned long flags); |
| 471 | static void rcu_stop_cpu_kthread(int cpu); | ||
| 472 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | 466 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ |
| 473 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | 467 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); |
| 474 | static int rcu_print_task_stall(struct rcu_node *rnp); | 468 | static int rcu_print_task_stall(struct rcu_node *rnp); |
| @@ -491,15 +485,9 @@ static void invoke_rcu_callbacks_kthread(void); | |||
| 491 | static bool rcu_is_callbacks_kthread(void); | 485 | static bool rcu_is_callbacks_kthread(void); |
| 492 | #ifdef CONFIG_RCU_BOOST | 486 | #ifdef CONFIG_RCU_BOOST |
| 493 | static void rcu_preempt_do_callbacks(void); | 487 | static void rcu_preempt_do_callbacks(void); |
| 494 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
| 495 | cpumask_var_t cm); | ||
| 496 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 488 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, |
| 497 | struct rcu_node *rnp, | 489 | struct rcu_node *rnp); |
| 498 | int rnp_index); | ||
| 499 | static void invoke_rcu_node_kthread(struct rcu_node *rnp); | ||
| 500 | static void rcu_yield(void (*f)(unsigned long), unsigned long arg); | ||
| 501 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 490 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 502 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt); | ||
| 503 | static void __cpuinit rcu_prepare_kthreads(int cpu); | 491 | static void __cpuinit rcu_prepare_kthreads(int cpu); |
| 504 | static void rcu_prepare_for_idle_init(int cpu); | 492 | static void rcu_prepare_for_idle_init(int cpu); |
| 505 | static void rcu_cleanup_after_idle(int cpu); | 493 | static void rcu_cleanup_after_idle(int cpu); |
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 7f3244c0df01..f92115488187 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h | |||
| @@ -25,6 +25,8 @@ | |||
| 25 | */ | 25 | */ |
| 26 | 26 | ||
| 27 | #include <linux/delay.h> | 27 | #include <linux/delay.h> |
| 28 | #include <linux/oom.h> | ||
| 29 | #include <linux/smpboot.h> | ||
| 28 | 30 | ||
| 29 | #define RCU_KTHREAD_PRIO 1 | 31 | #define RCU_KTHREAD_PRIO 1 |
| 30 | 32 | ||
| @@ -118,7 +120,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed); | |||
| 118 | */ | 120 | */ |
| 119 | void rcu_force_quiescent_state(void) | 121 | void rcu_force_quiescent_state(void) |
| 120 | { | 122 | { |
| 121 | force_quiescent_state(&rcu_preempt_state, 0); | 123 | force_quiescent_state(&rcu_preempt_state); |
| 122 | } | 124 | } |
| 123 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | 125 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); |
| 124 | 126 | ||
| @@ -136,8 +138,6 @@ static void rcu_preempt_qs(int cpu) | |||
| 136 | { | 138 | { |
| 137 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | 139 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); |
| 138 | 140 | ||
| 139 | rdp->passed_quiesce_gpnum = rdp->gpnum; | ||
| 140 | barrier(); | ||
| 141 | if (rdp->passed_quiesce == 0) | 141 | if (rdp->passed_quiesce == 0) |
| 142 | trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); | 142 | trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); |
| 143 | rdp->passed_quiesce = 1; | 143 | rdp->passed_quiesce = 1; |
| @@ -422,9 +422,11 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | |||
| 422 | unsigned long flags; | 422 | unsigned long flags; |
| 423 | struct task_struct *t; | 423 | struct task_struct *t; |
| 424 | 424 | ||
| 425 | if (!rcu_preempt_blocked_readers_cgp(rnp)) | ||
| 426 | return; | ||
| 427 | raw_spin_lock_irqsave(&rnp->lock, flags); | 425 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 426 | if (!rcu_preempt_blocked_readers_cgp(rnp)) { | ||
| 427 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 428 | return; | ||
| 429 | } | ||
| 428 | t = list_entry(rnp->gp_tasks, | 430 | t = list_entry(rnp->gp_tasks, |
| 429 | struct task_struct, rcu_node_entry); | 431 | struct task_struct, rcu_node_entry); |
| 430 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) | 432 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) |
| @@ -584,17 +586,23 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | |||
| 584 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | 586 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ |
| 585 | } | 587 | } |
| 586 | 588 | ||
| 589 | rnp->gp_tasks = NULL; | ||
| 590 | rnp->exp_tasks = NULL; | ||
| 587 | #ifdef CONFIG_RCU_BOOST | 591 | #ifdef CONFIG_RCU_BOOST |
| 588 | /* In case root is being boosted and leaf is not. */ | 592 | rnp->boost_tasks = NULL; |
| 593 | /* | ||
| 594 | * In case root is being boosted and leaf was not. Make sure | ||
| 595 | * that we boost the tasks blocking the current grace period | ||
| 596 | * in this case. | ||
| 597 | */ | ||
| 589 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | 598 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ |
| 590 | if (rnp_root->boost_tasks != NULL && | 599 | if (rnp_root->boost_tasks != NULL && |
| 591 | rnp_root->boost_tasks != rnp_root->gp_tasks) | 600 | rnp_root->boost_tasks != rnp_root->gp_tasks && |
| 601 | rnp_root->boost_tasks != rnp_root->exp_tasks) | ||
| 592 | rnp_root->boost_tasks = rnp_root->gp_tasks; | 602 | rnp_root->boost_tasks = rnp_root->gp_tasks; |
| 593 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | 603 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ |
| 594 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 604 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 595 | 605 | ||
| 596 | rnp->gp_tasks = NULL; | ||
| 597 | rnp->exp_tasks = NULL; | ||
| 598 | return retval; | 606 | return retval; |
| 599 | } | 607 | } |
| 600 | 608 | ||
| @@ -676,7 +684,7 @@ void synchronize_rcu(void) | |||
| 676 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 684 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
| 677 | 685 | ||
| 678 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); | 686 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); |
| 679 | static long sync_rcu_preempt_exp_count; | 687 | static unsigned long sync_rcu_preempt_exp_count; |
| 680 | static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | 688 | static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); |
| 681 | 689 | ||
| 682 | /* | 690 | /* |
| @@ -791,7 +799,7 @@ void synchronize_rcu_expedited(void) | |||
| 791 | unsigned long flags; | 799 | unsigned long flags; |
| 792 | struct rcu_node *rnp; | 800 | struct rcu_node *rnp; |
| 793 | struct rcu_state *rsp = &rcu_preempt_state; | 801 | struct rcu_state *rsp = &rcu_preempt_state; |
| 794 | long snap; | 802 | unsigned long snap; |
| 795 | int trycount = 0; | 803 | int trycount = 0; |
| 796 | 804 | ||
| 797 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ | 805 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ |
| @@ -799,33 +807,47 @@ void synchronize_rcu_expedited(void) | |||
| 799 | smp_mb(); /* Above access cannot bleed into critical section. */ | 807 | smp_mb(); /* Above access cannot bleed into critical section. */ |
| 800 | 808 | ||
| 801 | /* | 809 | /* |
| 810 | * Block CPU-hotplug operations. This means that any CPU-hotplug | ||
| 811 | * operation that finds an rcu_node structure with tasks in the | ||
| 812 | * process of being boosted will know that all tasks blocking | ||
| 813 | * this expedited grace period will already be in the process of | ||
| 814 | * being boosted. This simplifies the process of moving tasks | ||
| 815 | * from leaf to root rcu_node structures. | ||
| 816 | */ | ||
| 817 | get_online_cpus(); | ||
| 818 | |||
| 819 | /* | ||
| 802 | * Acquire lock, falling back to synchronize_rcu() if too many | 820 | * Acquire lock, falling back to synchronize_rcu() if too many |
| 803 | * lock-acquisition failures. Of course, if someone does the | 821 | * lock-acquisition failures. Of course, if someone does the |
| 804 | * expedited grace period for us, just leave. | 822 | * expedited grace period for us, just leave. |
| 805 | */ | 823 | */ |
| 806 | while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { | 824 | while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { |
| 825 | if (ULONG_CMP_LT(snap, | ||
| 826 | ACCESS_ONCE(sync_rcu_preempt_exp_count))) { | ||
| 827 | put_online_cpus(); | ||
| 828 | goto mb_ret; /* Others did our work for us. */ | ||
| 829 | } | ||
| 807 | if (trycount++ < 10) { | 830 | if (trycount++ < 10) { |
| 808 | udelay(trycount * num_online_cpus()); | 831 | udelay(trycount * num_online_cpus()); |
| 809 | } else { | 832 | } else { |
| 833 | put_online_cpus(); | ||
| 810 | synchronize_rcu(); | 834 | synchronize_rcu(); |
| 811 | return; | 835 | return; |
| 812 | } | 836 | } |
| 813 | if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) | ||
| 814 | goto mb_ret; /* Others did our work for us. */ | ||
| 815 | } | 837 | } |
| 816 | if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) | 838 | if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { |
| 839 | put_online_cpus(); | ||
| 817 | goto unlock_mb_ret; /* Others did our work for us. */ | 840 | goto unlock_mb_ret; /* Others did our work for us. */ |
| 841 | } | ||
| 818 | 842 | ||
| 819 | /* force all RCU readers onto ->blkd_tasks lists. */ | 843 | /* force all RCU readers onto ->blkd_tasks lists. */ |
| 820 | synchronize_sched_expedited(); | 844 | synchronize_sched_expedited(); |
| 821 | 845 | ||
| 822 | raw_spin_lock_irqsave(&rsp->onofflock, flags); | ||
| 823 | |||
| 824 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ | 846 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ |
| 825 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { | 847 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { |
| 826 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | 848 | raw_spin_lock_irqsave(&rnp->lock, flags); |
| 827 | rnp->expmask = rnp->qsmaskinit; | 849 | rnp->expmask = rnp->qsmaskinit; |
| 828 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | 850 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 829 | } | 851 | } |
| 830 | 852 | ||
| 831 | /* Snapshot current state of ->blkd_tasks lists. */ | 853 | /* Snapshot current state of ->blkd_tasks lists. */ |
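The expedited-GP counter and its snapshot are now unsigned long and compared with ULONG_CMP_LT() rather than the old signed subtraction, so the "did someone else already do our work?" test stays correct across counter wrap and avoids signed overflow. ULONG_CMP_LT(a, b) is, as defined in rcupdate.h, roughly (ULONG_MAX / 2 < (a) - (b)), that is, a modular "a precedes b" test. A small illustration:

        /* Illustration only: snapshot taken just before the counter wraps. */
        unsigned long snap  = ULONG_MAX - 1;
        unsigned long count = snap + 3;         /* wraps around to 1 */

        WARN_ON(!ULONG_CMP_LT(snap, count));    /* holds: count advanced past snap */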
| @@ -834,7 +856,7 @@ void synchronize_rcu_expedited(void) | |||
| 834 | if (NUM_RCU_NODES > 1) | 856 | if (NUM_RCU_NODES > 1) |
| 835 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); | 857 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); |
| 836 | 858 | ||
| 837 | raw_spin_unlock_irqrestore(&rsp->onofflock, flags); | 859 | put_online_cpus(); |
| 838 | 860 | ||
| 839 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ | 861 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ |
| 840 | rnp = rcu_get_root(rsp); | 862 | rnp = rcu_get_root(rsp); |
| @@ -1069,6 +1091,16 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp) | |||
| 1069 | 1091 | ||
| 1070 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | 1092 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ |
| 1071 | 1093 | ||
| 1094 | static void rcu_wake_cond(struct task_struct *t, int status) | ||
| 1095 | { | ||
| 1096 | /* | ||
| 1097 | * If the thread is yielding, only wake it when this | ||
| 1098 | * is invoked from idle | ||
| 1099 | */ | ||
| 1100 | if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) | ||
| 1101 | wake_up_process(t); | ||
| 1102 | } | ||
| 1103 | |||
| 1072 | /* | 1104 | /* |
| 1073 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | 1105 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks |
| 1074 | * or ->boost_tasks, advancing the pointer to the next task in the | 1106 | * or ->boost_tasks, advancing the pointer to the next task in the |
| @@ -1141,17 +1173,6 @@ static int rcu_boost(struct rcu_node *rnp) | |||
| 1141 | } | 1173 | } |
| 1142 | 1174 | ||
| 1143 | /* | 1175 | /* |
| 1144 | * Timer handler to initiate waking up of boost kthreads that | ||
| 1145 | * have yielded the CPU due to excessive numbers of tasks to | ||
| 1146 | * boost. We wake up the per-rcu_node kthread, which in turn | ||
| 1147 | * will wake up the booster kthread. | ||
| 1148 | */ | ||
| 1149 | static void rcu_boost_kthread_timer(unsigned long arg) | ||
| 1150 | { | ||
| 1151 | invoke_rcu_node_kthread((struct rcu_node *)arg); | ||
| 1152 | } | ||
| 1153 | |||
| 1154 | /* | ||
| 1155 | * Priority-boosting kthread. One per leaf rcu_node and one for the | 1176 | * Priority-boosting kthread. One per leaf rcu_node and one for the |
| 1156 | * root rcu_node. | 1177 | * root rcu_node. |
| 1157 | */ | 1178 | */ |
| @@ -1174,8 +1195,9 @@ static int rcu_boost_kthread(void *arg) | |||
| 1174 | else | 1195 | else |
| 1175 | spincnt = 0; | 1196 | spincnt = 0; |
| 1176 | if (spincnt > 10) { | 1197 | if (spincnt > 10) { |
| 1198 | rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; | ||
| 1177 | trace_rcu_utilization("End boost kthread@rcu_yield"); | 1199 | trace_rcu_utilization("End boost kthread@rcu_yield"); |
| 1178 | rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); | 1200 | schedule_timeout_interruptible(2); |
| 1179 | trace_rcu_utilization("Start boost kthread@rcu_yield"); | 1201 | trace_rcu_utilization("Start boost kthread@rcu_yield"); |
| 1180 | spincnt = 0; | 1202 | spincnt = 0; |
| 1181 | } | 1203 | } |
| @@ -1191,9 +1213,9 @@ static int rcu_boost_kthread(void *arg) | |||
| 1191 | * kthread to start boosting them. If there is an expedited grace | 1213 | * kthread to start boosting them. If there is an expedited grace |
| 1192 | * period in progress, it is always time to boost. | 1214 | * period in progress, it is always time to boost. |
| 1193 | * | 1215 | * |
| 1194 | * The caller must hold rnp->lock, which this function releases, | 1216 | * The caller must hold rnp->lock, which this function releases. |
| 1195 | * but irqs remain disabled. The ->boost_kthread_task is immortal, | 1217 | * The ->boost_kthread_task is immortal, so we don't need to worry |
| 1196 | * so we don't need to worry about it going away. | 1218 | * about it going away. |
| 1197 | */ | 1219 | */ |
| 1198 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | 1220 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) |
| 1199 | { | 1221 | { |
| @@ -1213,8 +1235,8 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | |||
| 1213 | rnp->boost_tasks = rnp->gp_tasks; | 1235 | rnp->boost_tasks = rnp->gp_tasks; |
| 1214 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1236 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| 1215 | t = rnp->boost_kthread_task; | 1237 | t = rnp->boost_kthread_task; |
| 1216 | if (t != NULL) | 1238 | if (t) |
| 1217 | wake_up_process(t); | 1239 | rcu_wake_cond(t, rnp->boost_kthread_status); |
| 1218 | } else { | 1240 | } else { |
| 1219 | rcu_initiate_boost_trace(rnp); | 1241 | rcu_initiate_boost_trace(rnp); |
| 1220 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | 1242 | raw_spin_unlock_irqrestore(&rnp->lock, flags); |
| @@ -1231,8 +1253,10 @@ static void invoke_rcu_callbacks_kthread(void) | |||
| 1231 | local_irq_save(flags); | 1253 | local_irq_save(flags); |
| 1232 | __this_cpu_write(rcu_cpu_has_work, 1); | 1254 | __this_cpu_write(rcu_cpu_has_work, 1); |
| 1233 | if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && | 1255 | if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && |
| 1234 | current != __this_cpu_read(rcu_cpu_kthread_task)) | 1256 | current != __this_cpu_read(rcu_cpu_kthread_task)) { |
| 1235 | wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); | 1257 | rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), |
| 1258 | __this_cpu_read(rcu_cpu_kthread_status)); | ||
| 1259 | } | ||
| 1236 | local_irq_restore(flags); | 1260 | local_irq_restore(flags); |
| 1237 | } | 1261 | } |
| 1238 | 1262 | ||
| @@ -1245,21 +1269,6 @@ static bool rcu_is_callbacks_kthread(void) | |||
| 1245 | return __get_cpu_var(rcu_cpu_kthread_task) == current; | 1269 | return __get_cpu_var(rcu_cpu_kthread_task) == current; |
| 1246 | } | 1270 | } |
| 1247 | 1271 | ||
| 1248 | /* | ||
| 1249 | * Set the affinity of the boost kthread. The CPU-hotplug locks are | ||
| 1250 | * held, so no one should be messing with the existence of the boost | ||
| 1251 | * kthread. | ||
| 1252 | */ | ||
| 1253 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, | ||
| 1254 | cpumask_var_t cm) | ||
| 1255 | { | ||
| 1256 | struct task_struct *t; | ||
| 1257 | |||
| 1258 | t = rnp->boost_kthread_task; | ||
| 1259 | if (t != NULL) | ||
| 1260 | set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); | ||
| 1261 | } | ||
| 1262 | |||
| 1263 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | 1272 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) |
| 1264 | 1273 | ||
| 1265 | /* | 1274 | /* |
| @@ -1276,15 +1285,19 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | |||
| 1276 | * Returns zero if all is well, a negated errno otherwise. | 1285 | * Returns zero if all is well, a negated errno otherwise. |
| 1277 | */ | 1286 | */ |
| 1278 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | 1287 | static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, |
| 1279 | struct rcu_node *rnp, | 1288 | struct rcu_node *rnp) |
| 1280 | int rnp_index) | ||
| 1281 | { | 1289 | { |
| 1290 | int rnp_index = rnp - &rsp->node[0]; | ||
| 1282 | unsigned long flags; | 1291 | unsigned long flags; |
| 1283 | struct sched_param sp; | 1292 | struct sched_param sp; |
| 1284 | struct task_struct *t; | 1293 | struct task_struct *t; |
| 1285 | 1294 | ||
| 1286 | if (&rcu_preempt_state != rsp) | 1295 | if (&rcu_preempt_state != rsp) |
| 1287 | return 0; | 1296 | return 0; |
| 1297 | |||
| 1298 | if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) | ||
| 1299 | return 0; | ||
| 1300 | |||
| 1288 | rsp->boost = 1; | 1301 | rsp->boost = 1; |
| 1289 | if (rnp->boost_kthread_task != NULL) | 1302 | if (rnp->boost_kthread_task != NULL) |
| 1290 | return 0; | 1303 | return 0; |
| @@ -1301,25 +1314,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | |||
| 1301 | return 0; | 1314 | return 0; |
| 1302 | } | 1315 | } |
| 1303 | 1316 | ||
| 1304 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1305 | |||
| 1306 | /* | ||
| 1307 | * Stop the RCU's per-CPU kthread when its CPU goes offline,. | ||
| 1308 | */ | ||
| 1309 | static void rcu_stop_cpu_kthread(int cpu) | ||
| 1310 | { | ||
| 1311 | struct task_struct *t; | ||
| 1312 | |||
| 1313 | /* Stop the CPU's kthread. */ | ||
| 1314 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
| 1315 | if (t != NULL) { | ||
| 1316 | per_cpu(rcu_cpu_kthread_task, cpu) = NULL; | ||
| 1317 | kthread_stop(t); | ||
| 1318 | } | ||
| 1319 | } | ||
| 1320 | |||
| 1321 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1322 | |||
| 1323 | static void rcu_kthread_do_work(void) | 1317 | static void rcu_kthread_do_work(void) |
| 1324 | { | 1318 | { |
| 1325 | rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); | 1319 | rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); |
| @@ -1327,112 +1321,22 @@ static void rcu_kthread_do_work(void) | |||
| 1327 | rcu_preempt_do_callbacks(); | 1321 | rcu_preempt_do_callbacks(); |
| 1328 | } | 1322 | } |
| 1329 | 1323 | ||
| 1330 | /* | 1324 | static void rcu_cpu_kthread_setup(unsigned int cpu) |
| 1331 | * Wake up the specified per-rcu_node-structure kthread. | ||
| 1332 | * Because the per-rcu_node kthreads are immortal, we don't need | ||
| 1333 | * to do anything to keep them alive. | ||
| 1334 | */ | ||
| 1335 | static void invoke_rcu_node_kthread(struct rcu_node *rnp) | ||
| 1336 | { | ||
| 1337 | struct task_struct *t; | ||
| 1338 | |||
| 1339 | t = rnp->node_kthread_task; | ||
| 1340 | if (t != NULL) | ||
| 1341 | wake_up_process(t); | ||
| 1342 | } | ||
| 1343 | |||
| 1344 | /* | ||
| 1345 | * Set the specified CPU's kthread to run RT or not, as specified by | ||
| 1346 | * the to_rt argument. The CPU-hotplug locks are held, so the task | ||
| 1347 | * is not going away. | ||
| 1348 | */ | ||
| 1349 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt) | ||
| 1350 | { | 1325 | { |
| 1351 | int policy; | ||
| 1352 | struct sched_param sp; | 1326 | struct sched_param sp; |
| 1353 | struct task_struct *t; | ||
| 1354 | 1327 | ||
| 1355 | t = per_cpu(rcu_cpu_kthread_task, cpu); | 1328 | sp.sched_priority = RCU_KTHREAD_PRIO; |
| 1356 | if (t == NULL) | 1329 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); |
| 1357 | return; | ||
| 1358 | if (to_rt) { | ||
| 1359 | policy = SCHED_FIFO; | ||
| 1360 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
| 1361 | } else { | ||
| 1362 | policy = SCHED_NORMAL; | ||
| 1363 | sp.sched_priority = 0; | ||
| 1364 | } | ||
| 1365 | sched_setscheduler_nocheck(t, policy, &sp); | ||
| 1366 | } | 1330 | } |
| 1367 | 1331 | ||
| 1368 | /* | 1332 | static void rcu_cpu_kthread_park(unsigned int cpu) |
| 1369 | * Timer handler to initiate the waking up of per-CPU kthreads that | ||
| 1370 | * have yielded the CPU due to excess numbers of RCU callbacks. | ||
| 1371 | * We wake up the per-rcu_node kthread, which in turn will wake up | ||
| 1372 | * the booster kthread. | ||
| 1373 | */ | ||
| 1374 | static void rcu_cpu_kthread_timer(unsigned long arg) | ||
| 1375 | { | 1333 | { |
| 1376 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); | 1334 | per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; |
| 1377 | struct rcu_node *rnp = rdp->mynode; | ||
| 1378 | |||
| 1379 | atomic_or(rdp->grpmask, &rnp->wakemask); | ||
| 1380 | invoke_rcu_node_kthread(rnp); | ||
| 1381 | } | 1335 | } |
| 1382 | 1336 | ||
| 1383 | /* | 1337 | static int rcu_cpu_kthread_should_run(unsigned int cpu) |
| 1384 | * Drop to non-real-time priority and yield, but only after posting a | ||
| 1385 | * timer that will cause us to regain our real-time priority if we | ||
| 1386 | * remain preempted. Either way, we restore our real-time priority | ||
| 1387 | * before returning. | ||
| 1388 | */ | ||
| 1389 | static void rcu_yield(void (*f)(unsigned long), unsigned long arg) | ||
| 1390 | { | 1338 | { |
| 1391 | struct sched_param sp; | 1339 | return __get_cpu_var(rcu_cpu_has_work); |
| 1392 | struct timer_list yield_timer; | ||
| 1393 | int prio = current->rt_priority; | ||
| 1394 | |||
| 1395 | setup_timer_on_stack(&yield_timer, f, arg); | ||
| 1396 | mod_timer(&yield_timer, jiffies + 2); | ||
| 1397 | sp.sched_priority = 0; | ||
| 1398 | sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); | ||
| 1399 | set_user_nice(current, 19); | ||
| 1400 | schedule(); | ||
| 1401 | set_user_nice(current, 0); | ||
| 1402 | sp.sched_priority = prio; | ||
| 1403 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | ||
| 1404 | del_timer(&yield_timer); | ||
| 1405 | } | ||
| 1406 | |||
| 1407 | /* | ||
| 1408 | * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. | ||
| 1409 | * This can happen while the corresponding CPU is either coming online | ||
| 1410 | * or going offline. We cannot wait until the CPU is fully online | ||
| 1411 | * before starting the kthread, because the various notifier functions | ||
| 1412 | * can wait for RCU grace periods. So we park rcu_cpu_kthread() until | ||
| 1413 | * the corresponding CPU is online. | ||
| 1414 | * | ||
| 1415 | * Return 1 if the kthread needs to stop, 0 otherwise. | ||
| 1416 | * | ||
| 1417 | * Caller must disable bh. This function can momentarily enable it. | ||
| 1418 | */ | ||
| 1419 | static int rcu_cpu_kthread_should_stop(int cpu) | ||
| 1420 | { | ||
| 1421 | while (cpu_is_offline(cpu) || | ||
| 1422 | !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) || | ||
| 1423 | smp_processor_id() != cpu) { | ||
| 1424 | if (kthread_should_stop()) | ||
| 1425 | return 1; | ||
| 1426 | per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; | ||
| 1427 | per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); | ||
| 1428 | local_bh_enable(); | ||
| 1429 | schedule_timeout_uninterruptible(1); | ||
| 1430 | if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu))) | ||
| 1431 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
| 1432 | local_bh_disable(); | ||
| 1433 | } | ||
| 1434 | per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; | ||
| 1435 | return 0; | ||
| 1436 | } | 1340 | } |
| 1437 | 1341 | ||
| 1438 | /* | 1342 | /* |
| @@ -1440,138 +1344,35 @@ static int rcu_cpu_kthread_should_stop(int cpu) | |||
| 1440 | * RCU softirq used in flavors and configurations of RCU that do not | 1344 | * RCU softirq used in flavors and configurations of RCU that do not |
| 1441 | * support RCU priority boosting. | 1345 | * support RCU priority boosting. |
| 1442 | */ | 1346 | */ |
| 1443 | static int rcu_cpu_kthread(void *arg) | 1347 | static void rcu_cpu_kthread(unsigned int cpu) |
| 1444 | { | 1348 | { |
| 1445 | int cpu = (int)(long)arg; | 1349 | unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); |
| 1446 | unsigned long flags; | 1350 | char work, *workp = &__get_cpu_var(rcu_cpu_has_work); |
| 1447 | int spincnt = 0; | 1351 | int spincnt; |
| 1448 | unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); | ||
| 1449 | char work; | ||
| 1450 | char *workp = &per_cpu(rcu_cpu_has_work, cpu); | ||
| 1451 | 1352 | ||
| 1452 | trace_rcu_utilization("Start CPU kthread@init"); | 1353 | for (spincnt = 0; spincnt < 10; spincnt++) { |
| 1453 | for (;;) { | ||
| 1454 | *statusp = RCU_KTHREAD_WAITING; | ||
| 1455 | trace_rcu_utilization("End CPU kthread@rcu_wait"); | ||
| 1456 | rcu_wait(*workp != 0 || kthread_should_stop()); | ||
| 1457 | trace_rcu_utilization("Start CPU kthread@rcu_wait"); | 1354 | trace_rcu_utilization("Start CPU kthread@rcu_wait"); |
| 1458 | local_bh_disable(); | 1355 | local_bh_disable(); |
| 1459 | if (rcu_cpu_kthread_should_stop(cpu)) { | ||
| 1460 | local_bh_enable(); | ||
| 1461 | break; | ||
| 1462 | } | ||
| 1463 | *statusp = RCU_KTHREAD_RUNNING; | 1356 | *statusp = RCU_KTHREAD_RUNNING; |
| 1464 | per_cpu(rcu_cpu_kthread_loops, cpu)++; | 1357 | this_cpu_inc(rcu_cpu_kthread_loops); |
| 1465 | local_irq_save(flags); | 1358 | local_irq_disable(); |
| 1466 | work = *workp; | 1359 | work = *workp; |
| 1467 | *workp = 0; | 1360 | *workp = 0; |
| 1468 | local_irq_restore(flags); | 1361 | local_irq_enable(); |
| 1469 | if (work) | 1362 | if (work) |
| 1470 | rcu_kthread_do_work(); | 1363 | rcu_kthread_do_work(); |
| 1471 | local_bh_enable(); | 1364 | local_bh_enable(); |
| 1472 | if (*workp != 0) | 1365 | if (*workp == 0) { |
| 1473 | spincnt++; | 1366 | trace_rcu_utilization("End CPU kthread@rcu_wait"); |
| 1474 | else | 1367 | *statusp = RCU_KTHREAD_WAITING; |
| 1475 | spincnt = 0; | 1368 | return; |
| 1476 | if (spincnt > 10) { | ||
| 1477 | *statusp = RCU_KTHREAD_YIELDING; | ||
| 1478 | trace_rcu_utilization("End CPU kthread@rcu_yield"); | ||
| 1479 | rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); | ||
| 1480 | trace_rcu_utilization("Start CPU kthread@rcu_yield"); | ||
| 1481 | spincnt = 0; | ||
| 1482 | } | ||
| 1483 | } | ||
| 1484 | *statusp = RCU_KTHREAD_STOPPED; | ||
| 1485 | trace_rcu_utilization("End CPU kthread@term"); | ||
| 1486 | return 0; | ||
| 1487 | } | ||
| 1488 | |||
| 1489 | /* | ||
| 1490 | * Spawn a per-CPU kthread, setting up affinity and priority. | ||
| 1491 | * Because the CPU hotplug lock is held, no other CPU will be attempting | ||
| 1492 | * to manipulate rcu_cpu_kthread_task. There might be another CPU | ||
| 1493 | * attempting to access it during boot, but the locking in kthread_bind() | ||
| 1494 | * will enforce sufficient ordering. | ||
| 1495 | * | ||
| 1496 | * Please note that we cannot simply refuse to wake up the per-CPU | ||
| 1497 | * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, | ||
| 1498 | * which can result in softlockup complaints if the task ends up being | ||
| 1499 | * idle for more than a couple of minutes. | ||
| 1500 | * | ||
| 1501 | * However, please note also that we cannot bind the per-CPU kthread to its | ||
| 1502 | * CPU until that CPU is fully online. We also cannot wait until the | ||
| 1503 | * CPU is fully online before we create its per-CPU kthread, as this would | ||
| 1504 | * deadlock the system when CPU notifiers tried waiting for grace | ||
| 1505 | * periods. So we bind the per-CPU kthread to its CPU only if the CPU | ||
| 1506 | * is online. If its CPU is not yet fully online, then the code in | ||
| 1507 | * rcu_cpu_kthread() will wait until it is fully online, and then do | ||
| 1508 | * the binding. | ||
| 1509 | */ | ||
| 1510 | static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) | ||
| 1511 | { | ||
| 1512 | struct sched_param sp; | ||
| 1513 | struct task_struct *t; | ||
| 1514 | |||
| 1515 | if (!rcu_scheduler_fully_active || | ||
| 1516 | per_cpu(rcu_cpu_kthread_task, cpu) != NULL) | ||
| 1517 | return 0; | ||
| 1518 | t = kthread_create_on_node(rcu_cpu_kthread, | ||
| 1519 | (void *)(long)cpu, | ||
| 1520 | cpu_to_node(cpu), | ||
| 1521 | "rcuc/%d", cpu); | ||
| 1522 | if (IS_ERR(t)) | ||
| 1523 | return PTR_ERR(t); | ||
| 1524 | if (cpu_online(cpu)) | ||
| 1525 | kthread_bind(t, cpu); | ||
| 1526 | per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; | ||
| 1527 | WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); | ||
| 1528 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
| 1529 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
| 1530 | per_cpu(rcu_cpu_kthread_task, cpu) = t; | ||
| 1531 | wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ | ||
| 1532 | return 0; | ||
| 1533 | } | ||
| 1534 | |||
| 1535 | /* | ||
| 1536 | * Per-rcu_node kthread, which is in charge of waking up the per-CPU | ||
| 1537 | * kthreads when needed. We ignore requests to wake up kthreads | ||
| 1538 | * for offline CPUs, which is OK because force_quiescent_state() | ||
| 1539 | * takes care of this case. | ||
| 1540 | */ | ||
| 1541 | static int rcu_node_kthread(void *arg) | ||
| 1542 | { | ||
| 1543 | int cpu; | ||
| 1544 | unsigned long flags; | ||
| 1545 | unsigned long mask; | ||
| 1546 | struct rcu_node *rnp = (struct rcu_node *)arg; | ||
| 1547 | struct sched_param sp; | ||
| 1548 | struct task_struct *t; | ||
| 1549 | |||
| 1550 | for (;;) { | ||
| 1551 | rnp->node_kthread_status = RCU_KTHREAD_WAITING; | ||
| 1552 | rcu_wait(atomic_read(&rnp->wakemask) != 0); | ||
| 1553 | rnp->node_kthread_status = RCU_KTHREAD_RUNNING; | ||
| 1554 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1555 | mask = atomic_xchg(&rnp->wakemask, 0); | ||
| 1556 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
| 1557 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { | ||
| 1558 | if ((mask & 0x1) == 0) | ||
| 1559 | continue; | ||
| 1560 | preempt_disable(); | ||
| 1561 | t = per_cpu(rcu_cpu_kthread_task, cpu); | ||
| 1562 | if (!cpu_online(cpu) || t == NULL) { | ||
| 1563 | preempt_enable(); | ||
| 1564 | continue; | ||
| 1565 | } | ||
| 1566 | per_cpu(rcu_cpu_has_work, cpu) = 1; | ||
| 1567 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
| 1568 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
| 1569 | preempt_enable(); | ||
| 1570 | } | 1369 | } |
| 1571 | } | 1370 | } |
| 1572 | /* NOTREACHED */ | 1371 | *statusp = RCU_KTHREAD_YIELDING; |
| 1573 | rnp->node_kthread_status = RCU_KTHREAD_STOPPED; | 1372 | trace_rcu_utilization("Start CPU kthread@rcu_yield"); |
| 1574 | return 0; | 1373 | schedule_timeout_interruptible(2); |
| 1374 | trace_rcu_utilization("End CPU kthread@rcu_yield"); | ||
| 1375 | *statusp = RCU_KTHREAD_WAITING; | ||
| 1575 | } | 1376 | } |
| 1576 | 1377 | ||
| 1577 | /* | 1378 | /* |
| @@ -1583,17 +1384,17 @@ static int rcu_node_kthread(void *arg) | |||
| 1583 | * no outgoing CPU. If there are no CPUs left in the affinity set, | 1384 | * no outgoing CPU. If there are no CPUs left in the affinity set, |
| 1584 | * this function allows the kthread to execute on any CPU. | 1385 | * this function allows the kthread to execute on any CPU. |
| 1585 | */ | 1386 | */ |
| 1586 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | 1387 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) |
| 1587 | { | 1388 | { |
| 1389 | struct task_struct *t = rnp->boost_kthread_task; | ||
| 1390 | unsigned long mask = rnp->qsmaskinit; | ||
| 1588 | cpumask_var_t cm; | 1391 | cpumask_var_t cm; |
| 1589 | int cpu; | 1392 | int cpu; |
| 1590 | unsigned long mask = rnp->qsmaskinit; | ||
| 1591 | 1393 | ||
| 1592 | if (rnp->node_kthread_task == NULL) | 1394 | if (!t) |
| 1593 | return; | 1395 | return; |
| 1594 | if (!alloc_cpumask_var(&cm, GFP_KERNEL)) | 1396 | if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) |
| 1595 | return; | 1397 | return; |
| 1596 | cpumask_clear(cm); | ||
| 1597 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) | 1398 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) |
| 1598 | if ((mask & 0x1) && cpu != outgoingcpu) | 1399 | if ((mask & 0x1) && cpu != outgoingcpu) |
| 1599 | cpumask_set_cpu(cpu, cm); | 1400 | cpumask_set_cpu(cpu, cm); |
| @@ -1603,62 +1404,36 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | |||
| 1603 | cpumask_clear_cpu(cpu, cm); | 1404 | cpumask_clear_cpu(cpu, cm); |
| 1604 | WARN_ON_ONCE(cpumask_weight(cm) == 0); | 1405 | WARN_ON_ONCE(cpumask_weight(cm) == 0); |
| 1605 | } | 1406 | } |
| 1606 | set_cpus_allowed_ptr(rnp->node_kthread_task, cm); | 1407 | set_cpus_allowed_ptr(t, cm); |
| 1607 | rcu_boost_kthread_setaffinity(rnp, cm); | ||
| 1608 | free_cpumask_var(cm); | 1408 | free_cpumask_var(cm); |
| 1609 | } | 1409 | } |
| 1610 | 1410 | ||
| 1611 | /* | 1411 | static struct smp_hotplug_thread rcu_cpu_thread_spec = { |
| 1612 | * Spawn a per-rcu_node kthread, setting priority and affinity. | 1412 | .store = &rcu_cpu_kthread_task, |
| 1613 | * Called during boot before online/offline can happen, or, if | 1413 | .thread_should_run = rcu_cpu_kthread_should_run, |
| 1614 | * during runtime, with the main CPU-hotplug locks held. So only | 1414 | .thread_fn = rcu_cpu_kthread, |
| 1615 | * one of these can be executing at a time. | 1415 | .thread_comm = "rcuc/%u", |
| 1616 | */ | 1416 | .setup = rcu_cpu_kthread_setup, |
| 1617 | static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, | 1417 | .park = rcu_cpu_kthread_park, |
| 1618 | struct rcu_node *rnp) | 1418 | }; |
| 1619 | { | ||
| 1620 | unsigned long flags; | ||
| 1621 | int rnp_index = rnp - &rsp->node[0]; | ||
| 1622 | struct sched_param sp; | ||
| 1623 | struct task_struct *t; | ||
| 1624 | |||
| 1625 | if (!rcu_scheduler_fully_active || | ||
| 1626 | rnp->qsmaskinit == 0) | ||
| 1627 | return 0; | ||
| 1628 | if (rnp->node_kthread_task == NULL) { | ||
| 1629 | t = kthread_create(rcu_node_kthread, (void *)rnp, | ||
| 1630 | "rcun/%d", rnp_index); | ||
| 1631 | if (IS_ERR(t)) | ||
| 1632 | return PTR_ERR(t); | ||
| 1633 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1634 | rnp->node_kthread_task = t; | ||
| 1635 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1636 | sp.sched_priority = 99; | ||
| 1637 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
| 1638 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | ||
| 1639 | } | ||
| 1640 | return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); | ||
| 1641 | } | ||
| 1642 | 1419 | ||
| 1643 | /* | 1420 | /* |
| 1644 | * Spawn all kthreads -- called as soon as the scheduler is running. | 1421 | * Spawn all kthreads -- called as soon as the scheduler is running. |
| 1645 | */ | 1422 | */ |
| 1646 | static int __init rcu_spawn_kthreads(void) | 1423 | static int __init rcu_spawn_kthreads(void) |
| 1647 | { | 1424 | { |
| 1648 | int cpu; | ||
| 1649 | struct rcu_node *rnp; | 1425 | struct rcu_node *rnp; |
| 1426 | int cpu; | ||
| 1650 | 1427 | ||
| 1651 | rcu_scheduler_fully_active = 1; | 1428 | rcu_scheduler_fully_active = 1; |
| 1652 | for_each_possible_cpu(cpu) { | 1429 | for_each_possible_cpu(cpu) |
| 1653 | per_cpu(rcu_cpu_has_work, cpu) = 0; | 1430 | per_cpu(rcu_cpu_has_work, cpu) = 0; |
| 1654 | if (cpu_online(cpu)) | 1431 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); |
| 1655 | (void)rcu_spawn_one_cpu_kthread(cpu); | ||
| 1656 | } | ||
| 1657 | rnp = rcu_get_root(rcu_state); | 1432 | rnp = rcu_get_root(rcu_state); |
| 1658 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | 1433 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); |
| 1659 | if (NUM_RCU_NODES > 1) { | 1434 | if (NUM_RCU_NODES > 1) { |
| 1660 | rcu_for_each_leaf_node(rcu_state, rnp) | 1435 | rcu_for_each_leaf_node(rcu_state, rnp) |
| 1661 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | 1436 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); |
| 1662 | } | 1437 | } |
| 1663 | return 0; | 1438 | return 0; |
| 1664 | } | 1439 | } |
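rcu_spawn_kthreads() now only registers a descriptor; thread creation, per-CPU binding, priority setup (->setup) and parking across CPU hotplug (->park) are handled by smpboot_register_percpu_thread(), which is why the old spawn/bind/yield/stop helpers could be deleted. A minimal, self-contained sketch of the same pattern for a hypothetical "foo" subsystem (all foo_* names invented for illustration):

        #include <linux/percpu.h>
        #include <linux/smpboot.h>

        static DEFINE_PER_CPU(struct task_struct *, foo_task);
        static DEFINE_PER_CPU(int, foo_has_work);

        static int foo_should_run(unsigned int cpu)
        {
                return __this_cpu_read(foo_has_work);   /* called with preemption off */
        }

        static void foo_thread_fn(unsigned int cpu)
        {
                __this_cpu_write(foo_has_work, 0);      /* consume and do the work */
        }

        static struct smp_hotplug_thread foo_threads = {
                .store             = &foo_task,
                .thread_should_run = foo_should_run,
                .thread_fn         = foo_thread_fn,
                .thread_comm       = "foo/%u",
        };

        /* From an initcall: BUG_ON(smpboot_register_percpu_thread(&foo_threads)); */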
| @@ -1670,11 +1445,8 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) | |||
| 1670 | struct rcu_node *rnp = rdp->mynode; | 1445 | struct rcu_node *rnp = rdp->mynode; |
| 1671 | 1446 | ||
| 1672 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ | 1447 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ |
| 1673 | if (rcu_scheduler_fully_active) { | 1448 | if (rcu_scheduler_fully_active) |
| 1674 | (void)rcu_spawn_one_cpu_kthread(cpu); | 1449 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); |
| 1675 | if (rnp->node_kthread_task == NULL) | ||
| 1676 | (void)rcu_spawn_one_node_kthread(rcu_state, rnp); | ||
| 1677 | } | ||
| 1678 | } | 1450 | } |
| 1679 | 1451 | ||
| 1680 | #else /* #ifdef CONFIG_RCU_BOOST */ | 1452 | #else /* #ifdef CONFIG_RCU_BOOST */ |
| @@ -1698,19 +1470,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | |||
| 1698 | { | 1470 | { |
| 1699 | } | 1471 | } |
| 1700 | 1472 | ||
| 1701 | #ifdef CONFIG_HOTPLUG_CPU | 1473 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) |
| 1702 | |||
| 1703 | static void rcu_stop_cpu_kthread(int cpu) | ||
| 1704 | { | ||
| 1705 | } | ||
| 1706 | |||
| 1707 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1708 | |||
| 1709 | static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
| 1710 | { | ||
| 1711 | } | ||
| 1712 | |||
| 1713 | static void rcu_cpu_kthread_setrt(int cpu, int to_rt) | ||
| 1714 | { | 1474 | { |
| 1715 | } | 1475 | } |
| 1716 | 1476 | ||
| @@ -1997,6 +1757,26 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 1997 | if (!tne) | 1757 | if (!tne) |
| 1998 | return; | 1758 | return; |
| 1999 | 1759 | ||
| 1760 | /* Adaptive-tick mode, where usermode execution is idle to RCU. */ | ||
| 1761 | if (!is_idle_task(current)) { | ||
| 1762 | rdtp->dyntick_holdoff = jiffies - 1; | ||
| 1763 | if (rcu_cpu_has_nonlazy_callbacks(cpu)) { | ||
| 1764 | trace_rcu_prep_idle("User dyntick with callbacks"); | ||
| 1765 | rdtp->idle_gp_timer_expires = | ||
| 1766 | round_up(jiffies + RCU_IDLE_GP_DELAY, | ||
| 1767 | RCU_IDLE_GP_DELAY); | ||
| 1768 | } else if (rcu_cpu_has_callbacks(cpu)) { | ||
| 1769 | rdtp->idle_gp_timer_expires = | ||
| 1770 | round_jiffies(jiffies + RCU_IDLE_LAZY_GP_DELAY); | ||
| 1771 | trace_rcu_prep_idle("User dyntick with lazy callbacks"); | ||
| 1772 | } else { | ||
| 1773 | return; | ||
| 1774 | } | ||
| 1775 | tp = &rdtp->idle_gp_timer; | ||
| 1776 | mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); | ||
| 1777 | return; | ||
| 1778 | } | ||
| 1779 | |||
| 2000 | /* | 1780 | /* |
| 2001 | * If this is an idle re-entry, for example, due to use of | 1781 | * If this is an idle re-entry, for example, due to use of |
| 2002 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle | 1782 | * RCU_NONIDLE() or the new idle-loop tracing API within the idle |
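Note (not part of the patch): the adaptive-tick branch added in the hunk above arms the per-CPU idle timer at a deadline rounded to a multiple of the delay so that wakeups from different CPUs tend to batch. A minimal illustrative sketch of that rounding arithmetic follows, in plain userspace C; the delay value is a made-up stand-in for RCU_IDLE_GP_DELAY, not taken from the patch.

#include <stdio.h>

/*
 * Illustrative sketch only: round a timer deadline up to a multiple of the
 * delay so deadlines computed on different CPUs coincide and their wakeups
 * batch.  The kernel's round_up() wants a power-of-two divisor; this generic
 * form does not.
 */
static unsigned long round_up_to(unsigned long x, unsigned long y)
{
        return ((x + y - 1) / y) * y;
}

int main(void)
{
        unsigned long gp_delay = 4;     /* hypothetical RCU_IDLE_GP_DELAY */

        /* raw deadlines 100005 and 100007 both round up to 100008 */
        printf("%lu\n", round_up_to(100001 + gp_delay, gp_delay));
        printf("%lu\n", round_up_to(100003 + gp_delay, gp_delay));
        return 0;
}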
| @@ -2075,16 +1855,16 @@ static void rcu_prepare_for_idle(int cpu) | |||
| 2075 | #ifdef CONFIG_TREE_PREEMPT_RCU | 1855 | #ifdef CONFIG_TREE_PREEMPT_RCU |
| 2076 | if (per_cpu(rcu_preempt_data, cpu).nxtlist) { | 1856 | if (per_cpu(rcu_preempt_data, cpu).nxtlist) { |
| 2077 | rcu_preempt_qs(cpu); | 1857 | rcu_preempt_qs(cpu); |
| 2078 | force_quiescent_state(&rcu_preempt_state, 0); | 1858 | force_quiescent_state(&rcu_preempt_state); |
| 2079 | } | 1859 | } |
| 2080 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | 1860 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ |
| 2081 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { | 1861 | if (per_cpu(rcu_sched_data, cpu).nxtlist) { |
| 2082 | rcu_sched_qs(cpu); | 1862 | rcu_sched_qs(cpu); |
| 2083 | force_quiescent_state(&rcu_sched_state, 0); | 1863 | force_quiescent_state(&rcu_sched_state); |
| 2084 | } | 1864 | } |
| 2085 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { | 1865 | if (per_cpu(rcu_bh_data, cpu).nxtlist) { |
| 2086 | rcu_bh_qs(cpu); | 1866 | rcu_bh_qs(cpu); |
| 2087 | force_quiescent_state(&rcu_bh_state, 0); | 1867 | force_quiescent_state(&rcu_bh_state); |
| 2088 | } | 1868 | } |
| 2089 | 1869 | ||
| 2090 | /* | 1870 | /* |
| @@ -2112,6 +1892,88 @@ static void rcu_idle_count_callbacks_posted(void) | |||
| 2112 | __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); | 1892 | __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); |
| 2113 | } | 1893 | } |
| 2114 | 1894 | ||
| 1895 | /* | ||
| 1896 | * Data for flushing lazy RCU callbacks at OOM time. | ||
| 1897 | */ | ||
| 1898 | static atomic_t oom_callback_count; | ||
| 1899 | static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); | ||
| 1900 | |||
| 1901 | /* | ||
| 1902 | * RCU OOM callback -- decrement the outstanding count and deliver the | ||
| 1903 | * wake-up if we are the last one. | ||
| 1904 | */ | ||
| 1905 | static void rcu_oom_callback(struct rcu_head *rhp) | ||
| 1906 | { | ||
| 1907 | if (atomic_dec_and_test(&oom_callback_count)) | ||
| 1908 | wake_up(&oom_callback_wq); | ||
| 1909 | } | ||
| 1910 | |||
| 1911 | /* | ||
| 1912 | * Post an rcu_oom_notify callback on the current CPU if it has at | ||
| 1913 | * least one lazy callback. This will unnecessarily post callbacks | ||
| 1914 | * to CPUs that already have a non-lazy callback at the end of their | ||
| 1915 | * callback list, but this is an infrequent operation, so accept some | ||
| 1916 | * extra overhead to keep things simple. | ||
| 1917 | */ | ||
| 1918 | static void rcu_oom_notify_cpu(void *unused) | ||
| 1919 | { | ||
| 1920 | struct rcu_state *rsp; | ||
| 1921 | struct rcu_data *rdp; | ||
| 1922 | |||
| 1923 | for_each_rcu_flavor(rsp) { | ||
| 1924 | rdp = __this_cpu_ptr(rsp->rda); | ||
| 1925 | if (rdp->qlen_lazy != 0) { | ||
| 1926 | atomic_inc(&oom_callback_count); | ||
| 1927 | rsp->call(&rdp->oom_head, rcu_oom_callback); | ||
| 1928 | } | ||
| 1929 | } | ||
| 1930 | } | ||
| 1931 | |||
| 1932 | /* | ||
| 1933 | * If low on memory, ensure that each CPU has a non-lazy callback. | ||
| 1934 | * This will wake up CPUs that have only lazy callbacks, in turn | ||
| 1935 | * ensuring that they free up the corresponding memory in a timely manner. | ||
| 1936 | * Because an uncertain amount of memory will be freed in some uncertain | ||
| 1937 | * timeframe, we do not claim to have freed anything. | ||
| 1938 | */ | ||
| 1939 | static int rcu_oom_notify(struct notifier_block *self, | ||
| 1940 | unsigned long notused, void *nfreed) | ||
| 1941 | { | ||
| 1942 | int cpu; | ||
| 1943 | |||
| 1944 | /* Wait for callbacks from earlier instance to complete. */ | ||
| 1945 | wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); | ||
| 1946 | |||
| 1947 | /* | ||
| 1948 | * Prevent premature wakeup: ensure that all increments happen | ||
| 1949 | * before there is a chance of the counter reaching zero. | ||
| 1950 | */ | ||
| 1951 | atomic_set(&oom_callback_count, 1); | ||
| 1952 | |||
| 1953 | get_online_cpus(); | ||
| 1954 | for_each_online_cpu(cpu) { | ||
| 1955 | smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); | ||
| 1956 | cond_resched(); | ||
| 1957 | } | ||
| 1958 | put_online_cpus(); | ||
| 1959 | |||
| 1960 | /* Unconditionally decrement: no need to wake ourselves up. */ | ||
| 1961 | atomic_dec(&oom_callback_count); | ||
| 1962 | |||
| 1963 | return NOTIFY_OK; | ||
| 1964 | } | ||
| 1965 | |||
| 1966 | static struct notifier_block rcu_oom_nb = { | ||
| 1967 | .notifier_call = rcu_oom_notify | ||
| 1968 | }; | ||
| 1969 | |||
| 1970 | static int __init rcu_register_oom_notifier(void) | ||
| 1971 | { | ||
| 1972 | register_oom_notifier(&rcu_oom_nb); | ||
| 1973 | return 0; | ||
| 1974 | } | ||
| 1975 | early_initcall(rcu_register_oom_notifier); | ||
| 1976 | |||
| 2115 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | 1977 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ |
| 2116 | 1978 | ||
| 2117 | #ifdef CONFIG_RCU_CPU_STALL_INFO | 1979 | #ifdef CONFIG_RCU_CPU_STALL_INFO |
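Note (not part of the patch): the rcu_oom_notify() path added above first waits out any earlier invocation, then biases oom_callback_count by one before queuing a callback on each CPU, and finally drops the bias so the counter can only reach zero once every callback has run. Below is an illustrative userspace analogue of that biased-counter completion pattern, using pthreads in place of RCU callbacks and wait queues; all names are hypothetical.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  done = PTHREAD_COND_INITIALIZER;
static int outstanding;                 /* plays the role of oom_callback_count */

static void callback_finished(void)     /* analogue of rcu_oom_callback() */
{
        pthread_mutex_lock(&lock);
        if (--outstanding == 0)
                pthread_cond_broadcast(&done);
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
        (void)arg;                      /* stand-in for the per-CPU flush work */
        callback_finished();
        return NULL;
}

int main(void)
{
        pthread_t tids[4];
        int i;

        pthread_mutex_lock(&lock);
        outstanding = 1;                /* bias: count cannot hit zero while queuing */
        pthread_mutex_unlock(&lock);

        for (i = 0; i < 4; i++) {
                pthread_mutex_lock(&lock);
                outstanding++;          /* one increment per queued callback */
                pthread_mutex_unlock(&lock);
                pthread_create(&tids[i], NULL, worker, NULL);
        }

        callback_finished();            /* drop the bias; no self-wakeup needed */

        pthread_mutex_lock(&lock);
        while (outstanding != 0)        /* analogue of wait_event() */
                pthread_cond_wait(&done, &lock);
        pthread_mutex_unlock(&lock);

        for (i = 0; i < 4; i++)
                pthread_join(tids[i], NULL);
        puts("all callbacks completed");
        return 0;
}

The bias is what lets a later waiter sleep safely while callbacks are still being queued: the count stays nonzero until the final, unconditional decrement.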
| @@ -2122,11 +1984,15 @@ static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | |||
| 2122 | { | 1984 | { |
| 2123 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | 1985 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); |
| 2124 | struct timer_list *tltp = &rdtp->idle_gp_timer; | 1986 | struct timer_list *tltp = &rdtp->idle_gp_timer; |
| 1987 | char c; | ||
| 2125 | 1988 | ||
| 2126 | sprintf(cp, "drain=%d %c timer=%lu", | 1989 | c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; |
| 2127 | rdtp->dyntick_drain, | 1990 | if (timer_pending(tltp)) |
| 2128 | rdtp->dyntick_holdoff == jiffies ? 'H' : '.', | 1991 | sprintf(cp, "drain=%d %c timer=%lu", |
| 2129 | timer_pending(tltp) ? tltp->expires - jiffies : -1); | 1992 | rdtp->dyntick_drain, c, tltp->expires - jiffies); |
| 1993 | else | ||
| 1994 | sprintf(cp, "drain=%d %c timer not pending", | ||
| 1995 | rdtp->dyntick_drain, c); | ||
| 2130 | } | 1996 | } |
| 2131 | 1997 | ||
| 2132 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | 1998 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ |
| @@ -2194,11 +2060,10 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp) | |||
| 2194 | /* Increment ->ticks_this_gp for all flavors of RCU. */ | 2060 | /* Increment ->ticks_this_gp for all flavors of RCU. */ |
| 2195 | static void increment_cpu_stall_ticks(void) | 2061 | static void increment_cpu_stall_ticks(void) |
| 2196 | { | 2062 | { |
| 2197 | __get_cpu_var(rcu_sched_data).ticks_this_gp++; | 2063 | struct rcu_state *rsp; |
| 2198 | __get_cpu_var(rcu_bh_data).ticks_this_gp++; | 2064 | |
| 2199 | #ifdef CONFIG_TREE_PREEMPT_RCU | 2065 | for_each_rcu_flavor(rsp) |
| 2200 | __get_cpu_var(rcu_preempt_data).ticks_this_gp++; | 2066 | __this_cpu_ptr(rsp->rda)->ticks_this_gp++; |
| 2201 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
| 2202 | } | 2067 | } |
| 2203 | 2068 | ||
| 2204 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | 2069 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ |
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index abffb486e94e..693513bc50e6 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c | |||
| @@ -51,8 +51,8 @@ static int show_rcubarrier(struct seq_file *m, void *unused) | |||
| 51 | struct rcu_state *rsp; | 51 | struct rcu_state *rsp; |
| 52 | 52 | ||
| 53 | for_each_rcu_flavor(rsp) | 53 | for_each_rcu_flavor(rsp) |
| 54 | seq_printf(m, "%s: %c bcc: %d nbd: %lu\n", | 54 | seq_printf(m, "%s: bcc: %d nbd: %lu\n", |
| 55 | rsp->name, rsp->rcu_barrier_in_progress ? 'B' : '.', | 55 | rsp->name, |
| 56 | atomic_read(&rsp->barrier_cpu_count), | 56 | atomic_read(&rsp->barrier_cpu_count), |
| 57 | rsp->n_barrier_done); | 57 | rsp->n_barrier_done); |
| 58 | return 0; | 58 | return 0; |
| @@ -86,12 +86,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
| 86 | { | 86 | { |
| 87 | if (!rdp->beenonline) | 87 | if (!rdp->beenonline) |
| 88 | return; | 88 | return; |
| 89 | seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d", | 89 | seq_printf(m, "%3d%cc=%lu g=%lu pq=%d qp=%d", |
| 90 | rdp->cpu, | 90 | rdp->cpu, |
| 91 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | 91 | cpu_is_offline(rdp->cpu) ? '!' : ' ', |
| 92 | rdp->completed, rdp->gpnum, | 92 | rdp->completed, rdp->gpnum, |
| 93 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, | 93 | rdp->passed_quiesce, rdp->qs_pending); |
| 94 | rdp->qs_pending); | ||
| 95 | seq_printf(m, " dt=%d/%llx/%d df=%lu", | 94 | seq_printf(m, " dt=%d/%llx/%d df=%lu", |
| 96 | atomic_read(&rdp->dynticks->dynticks), | 95 | atomic_read(&rdp->dynticks->dynticks), |
| 97 | rdp->dynticks->dynticks_nesting, | 96 | rdp->dynticks->dynticks_nesting, |
| @@ -108,11 +107,10 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | |||
| 108 | rdp->nxttail[RCU_WAIT_TAIL]], | 107 | rdp->nxttail[RCU_WAIT_TAIL]], |
| 109 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | 108 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); |
| 110 | #ifdef CONFIG_RCU_BOOST | 109 | #ifdef CONFIG_RCU_BOOST |
| 111 | seq_printf(m, " kt=%d/%c/%d ktl=%x", | 110 | seq_printf(m, " kt=%d/%c ktl=%x", |
| 112 | per_cpu(rcu_cpu_has_work, rdp->cpu), | 111 | per_cpu(rcu_cpu_has_work, rdp->cpu), |
| 113 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | 112 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, |
| 114 | rdp->cpu)), | 113 | rdp->cpu)), |
| 115 | per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), | ||
| 116 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); | 114 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); |
| 117 | #endif /* #ifdef CONFIG_RCU_BOOST */ | 115 | #endif /* #ifdef CONFIG_RCU_BOOST */ |
| 118 | seq_printf(m, " b=%ld", rdp->blimit); | 116 | seq_printf(m, " b=%ld", rdp->blimit); |
| @@ -150,12 +148,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) | |||
| 150 | { | 148 | { |
| 151 | if (!rdp->beenonline) | 149 | if (!rdp->beenonline) |
| 152 | return; | 150 | return; |
| 153 | seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", | 151 | seq_printf(m, "%d,%s,%lu,%lu,%d,%d", |
| 154 | rdp->cpu, | 152 | rdp->cpu, |
| 155 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", | 153 | cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", |
| 156 | rdp->completed, rdp->gpnum, | 154 | rdp->completed, rdp->gpnum, |
| 157 | rdp->passed_quiesce, rdp->passed_quiesce_gpnum, | 155 | rdp->passed_quiesce, rdp->qs_pending); |
| 158 | rdp->qs_pending); | ||
| 159 | seq_printf(m, ",%d,%llx,%d,%lu", | 156 | seq_printf(m, ",%d,%llx,%d,%lu", |
| 160 | atomic_read(&rdp->dynticks->dynticks), | 157 | atomic_read(&rdp->dynticks->dynticks), |
| 161 | rdp->dynticks->dynticks_nesting, | 158 | rdp->dynticks->dynticks_nesting, |
| @@ -186,7 +183,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused) | |||
| 186 | int cpu; | 183 | int cpu; |
| 187 | struct rcu_state *rsp; | 184 | struct rcu_state *rsp; |
| 188 | 185 | ||
| 189 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); | 186 | seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pq\","); |
| 190 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); | 187 | seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); |
| 191 | seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); | 188 | seq_puts(m, "\"of\",\"qll\",\"ql\",\"qs\""); |
| 192 | #ifdef CONFIG_RCU_BOOST | 189 | #ifdef CONFIG_RCU_BOOST |
| @@ -386,10 +383,9 @@ static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | |||
| 386 | rdp->n_rp_report_qs, | 383 | rdp->n_rp_report_qs, |
| 387 | rdp->n_rp_cb_ready, | 384 | rdp->n_rp_cb_ready, |
| 388 | rdp->n_rp_cpu_needs_gp); | 385 | rdp->n_rp_cpu_needs_gp); |
| 389 | seq_printf(m, "gpc=%ld gps=%ld nf=%ld nn=%ld\n", | 386 | seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", |
| 390 | rdp->n_rp_gp_completed, | 387 | rdp->n_rp_gp_completed, |
| 391 | rdp->n_rp_gp_started, | 388 | rdp->n_rp_gp_started, |
| 392 | rdp->n_rp_need_fqs, | ||
| 393 | rdp->n_rp_need_nothing); | 389 | rdp->n_rp_need_nothing); |
| 394 | } | 390 | } |
| 395 | 391 | ||
diff --git a/kernel/resource.c b/kernel/resource.c index 34d45886ee84..73f35d4b30b9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
| @@ -763,6 +763,7 @@ static void __init __reserve_region_with_split(struct resource *root, | |||
| 763 | struct resource *parent = root; | 763 | struct resource *parent = root; |
| 764 | struct resource *conflict; | 764 | struct resource *conflict; |
| 765 | struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); | 765 | struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); |
| 766 | struct resource *next_res = NULL; | ||
| 766 | 767 | ||
| 767 | if (!res) | 768 | if (!res) |
| 768 | return; | 769 | return; |
| @@ -772,21 +773,46 @@ static void __init __reserve_region_with_split(struct resource *root, | |||
| 772 | res->end = end; | 773 | res->end = end; |
| 773 | res->flags = IORESOURCE_BUSY; | 774 | res->flags = IORESOURCE_BUSY; |
| 774 | 775 | ||
| 775 | conflict = __request_resource(parent, res); | 776 | while (1) { |
| 776 | if (!conflict) | ||
| 777 | return; | ||
| 778 | 777 | ||
| 779 | /* failed, split and try again */ | 778 | conflict = __request_resource(parent, res); |
| 780 | kfree(res); | 779 | if (!conflict) { |
| 780 | if (!next_res) | ||
| 781 | break; | ||
| 782 | res = next_res; | ||
| 783 | next_res = NULL; | ||
| 784 | continue; | ||
| 785 | } | ||
| 781 | 786 | ||
| 782 | /* conflict covered whole area */ | 787 | /* conflict covered whole area */ |
| 783 | if (conflict->start <= start && conflict->end >= end) | 788 | if (conflict->start <= res->start && |
| 784 | return; | 789 | conflict->end >= res->end) { |
| 790 | kfree(res); | ||
| 791 | WARN_ON(next_res); | ||
| 792 | break; | ||
| 793 | } | ||
| 794 | |||
| 795 | /* failed, split and try again */ | ||
| 796 | if (conflict->start > res->start) { | ||
| 797 | end = res->end; | ||
| 798 | res->end = conflict->start - 1; | ||
| 799 | if (conflict->end < end) { | ||
| 800 | next_res = kzalloc(sizeof(*next_res), | ||
| 801 | GFP_ATOMIC); | ||
| 802 | if (!next_res) { | ||
| 803 | kfree(res); | ||
| 804 | break; | ||
| 805 | } | ||
| 806 | next_res->name = name; | ||
| 807 | next_res->start = conflict->end + 1; | ||
| 808 | next_res->end = end; | ||
| 809 | next_res->flags = IORESOURCE_BUSY; | ||
| 810 | } | ||
| 811 | } else { | ||
| 812 | res->start = conflict->end + 1; | ||
| 813 | } | ||
| 814 | } | ||
| 785 | 815 | ||
| 786 | if (conflict->start > start) | ||
| 787 | __reserve_region_with_split(root, start, conflict->start-1, name); | ||
| 788 | if (conflict->end < end) | ||
| 789 | __reserve_region_with_split(root, conflict->end+1, end, name); | ||
| 790 | } | 816 | } |
| 791 | 817 | ||
| 792 | void __init reserve_region_with_split(struct resource *root, | 818 | void __init reserve_region_with_split(struct resource *root, |
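Note (not part of the patch): the rewritten __reserve_region_with_split() above replaces recursion with a loop in which each conflict either trims the requested range or splits it, keeping at most one right-hand remainder pending in next_res. A hedged userspace sketch of the same interval-splitting idea follows; the conflict list and helper names are invented for illustration and model __request_resource() only as overlap detection.

#include <stdio.h>

struct range { long start, end; };

static const struct range conflicts[] = { { 20, 29 }, { 50, 59 } };

static const struct range *find_conflict(long start, long end)
{
        size_t i;

        for (i = 0; i < sizeof(conflicts) / sizeof(conflicts[0]); i++)
                if (conflicts[i].start <= end && conflicts[i].end >= start)
                        return &conflicts[i];
        return NULL;
}

static void reserve_with_split(long start, long end)
{
        struct range res = { start, end };
        struct range next = { 0, 0 };
        int have_next = 0;              /* at most one pending right-hand segment */

        while (1) {
                const struct range *c = find_conflict(res.start, res.end);

                if (!c) {
                        printf("reserved [%ld, %ld]\n", res.start, res.end);
                        if (!have_next)
                                break;
                        res = next;     /* now place the remembered right part */
                        have_next = 0;
                        continue;
                }

                /* conflict swallows the whole remaining range: give up on it */
                if (c->start <= res.start && c->end >= res.end)
                        break;

                if (c->start > res.start) {
                        /* keep the left part now, remember the right part */
                        if (c->end < res.end) {
                                next.start = c->end + 1;
                                next.end = res.end;
                                have_next = 1;
                        }
                        res.end = c->start - 1;
                } else {
                        /* conflict covers the front: drop the overlap */
                        res.start = c->end + 1;
                }
        }
}

int main(void)
{
        /* expect [0, 19], [30, 49] and [60, 100] around the two conflicts */
        reserve_with_split(0, 100);
        return 0;
}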
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 173ea52f3af0..f06d249e103b 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
| @@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | |||
| 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | 11 | CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer |
| 12 | endif | 12 | endif |
| 13 | 13 | ||
| 14 | obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o | 14 | obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o |
| 15 | obj-$(CONFIG_SMP) += cpupri.o | 15 | obj-$(CONFIG_SMP) += cpupri.o |
| 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o | 16 | obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o |
| 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 17 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 649c9f876cb1..c17747236438 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
| @@ -740,126 +740,6 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
| 740 | dequeue_task(rq, p, flags); | 740 | dequeue_task(rq, p, flags); |
| 741 | } | 741 | } |
| 742 | 742 | ||
| 743 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 744 | |||
| 745 | /* | ||
| 746 | * There are no locks covering percpu hardirq/softirq time. | ||
| 747 | * They are only modified in account_system_vtime, on corresponding CPU | ||
| 748 | * with interrupts disabled. So, writes are safe. | ||
| 749 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
| 750 | * This may result in other CPU reading this CPU's irq time and can | ||
| 751 | * race with irq/account_system_vtime on this CPU. We would either get old | ||
| 752 | * or new value with a side effect of accounting a slice of irq time to wrong | ||
| 753 | * task when irq is in progress while we read rq->clock. That is a worthy | ||
| 754 | * compromise in place of having locks on each irq in account_system_time. | ||
| 755 | */ | ||
| 756 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
| 757 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
| 758 | |||
| 759 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
| 760 | static int sched_clock_irqtime; | ||
| 761 | |||
| 762 | void enable_sched_clock_irqtime(void) | ||
| 763 | { | ||
| 764 | sched_clock_irqtime = 1; | ||
| 765 | } | ||
| 766 | |||
| 767 | void disable_sched_clock_irqtime(void) | ||
| 768 | { | ||
| 769 | sched_clock_irqtime = 0; | ||
| 770 | } | ||
| 771 | |||
| 772 | #ifndef CONFIG_64BIT | ||
| 773 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
| 774 | |||
| 775 | static inline void irq_time_write_begin(void) | ||
| 776 | { | ||
| 777 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 778 | smp_wmb(); | ||
| 779 | } | ||
| 780 | |||
| 781 | static inline void irq_time_write_end(void) | ||
| 782 | { | ||
| 783 | smp_wmb(); | ||
| 784 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 785 | } | ||
| 786 | |||
| 787 | static inline u64 irq_time_read(int cpu) | ||
| 788 | { | ||
| 789 | u64 irq_time; | ||
| 790 | unsigned seq; | ||
| 791 | |||
| 792 | do { | ||
| 793 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
| 794 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
| 795 | per_cpu(cpu_hardirq_time, cpu); | ||
| 796 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
| 797 | |||
| 798 | return irq_time; | ||
| 799 | } | ||
| 800 | #else /* CONFIG_64BIT */ | ||
| 801 | static inline void irq_time_write_begin(void) | ||
| 802 | { | ||
| 803 | } | ||
| 804 | |||
| 805 | static inline void irq_time_write_end(void) | ||
| 806 | { | ||
| 807 | } | ||
| 808 | |||
| 809 | static inline u64 irq_time_read(int cpu) | ||
| 810 | { | ||
| 811 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
| 812 | } | ||
| 813 | #endif /* CONFIG_64BIT */ | ||
| 814 | |||
| 815 | /* | ||
| 816 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
| 817 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
| 818 | */ | ||
| 819 | void account_system_vtime(struct task_struct *curr) | ||
| 820 | { | ||
| 821 | unsigned long flags; | ||
| 822 | s64 delta; | ||
| 823 | int cpu; | ||
| 824 | |||
| 825 | if (!sched_clock_irqtime) | ||
| 826 | return; | ||
| 827 | |||
| 828 | local_irq_save(flags); | ||
| 829 | |||
| 830 | cpu = smp_processor_id(); | ||
| 831 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | ||
| 832 | __this_cpu_add(irq_start_time, delta); | ||
| 833 | |||
| 834 | irq_time_write_begin(); | ||
| 835 | /* | ||
| 836 | * We do not account for softirq time from ksoftirqd here. | ||
| 837 | * We want to continue accounting softirq time to ksoftirqd thread | ||
| 838 | * in that case, so as not to confuse the scheduler with a special task | ||
| 839 | * that does not consume any time but still wants to run. | ||
| 840 | */ | ||
| 841 | if (hardirq_count()) | ||
| 842 | __this_cpu_add(cpu_hardirq_time, delta); | ||
| 843 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | ||
| 844 | __this_cpu_add(cpu_softirq_time, delta); | ||
| 845 | |||
| 846 | irq_time_write_end(); | ||
| 847 | local_irq_restore(flags); | ||
| 848 | } | ||
| 849 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
| 850 | |||
| 851 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 852 | |||
| 853 | #ifdef CONFIG_PARAVIRT | ||
| 854 | static inline u64 steal_ticks(u64 steal) | ||
| 855 | { | ||
| 856 | if (unlikely(steal > NSEC_PER_SEC)) | ||
| 857 | return div_u64(steal, TICK_NSEC); | ||
| 858 | |||
| 859 | return __iter_div_u64_rem(steal, TICK_NSEC, &steal); | ||
| 860 | } | ||
| 861 | #endif | ||
| 862 | |||
| 863 | static void update_rq_clock_task(struct rq *rq, s64 delta) | 743 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
| 864 | { | 744 | { |
| 865 | /* | 745 | /* |
| @@ -920,43 +800,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) | |||
| 920 | #endif | 800 | #endif |
| 921 | } | 801 | } |
| 922 | 802 | ||
| 923 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 924 | static int irqtime_account_hi_update(void) | ||
| 925 | { | ||
| 926 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 927 | unsigned long flags; | ||
| 928 | u64 latest_ns; | ||
| 929 | int ret = 0; | ||
| 930 | |||
| 931 | local_irq_save(flags); | ||
| 932 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
| 933 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) | ||
| 934 | ret = 1; | ||
| 935 | local_irq_restore(flags); | ||
| 936 | return ret; | ||
| 937 | } | ||
| 938 | |||
| 939 | static int irqtime_account_si_update(void) | ||
| 940 | { | ||
| 941 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 942 | unsigned long flags; | ||
| 943 | u64 latest_ns; | ||
| 944 | int ret = 0; | ||
| 945 | |||
| 946 | local_irq_save(flags); | ||
| 947 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
| 948 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) | ||
| 949 | ret = 1; | ||
| 950 | local_irq_restore(flags); | ||
| 951 | return ret; | ||
| 952 | } | ||
| 953 | |||
| 954 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 955 | |||
| 956 | #define sched_clock_irqtime (0) | ||
| 957 | |||
| 958 | #endif | ||
| 959 | |||
| 960 | void sched_set_stop_task(int cpu, struct task_struct *stop) | 803 | void sched_set_stop_task(int cpu, struct task_struct *stop) |
| 961 | { | 804 | { |
| 962 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | 805 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; |
| @@ -1518,25 +1361,6 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu) | |||
| 1518 | smp_send_reschedule(cpu); | 1361 | smp_send_reschedule(cpu); |
| 1519 | } | 1362 | } |
| 1520 | 1363 | ||
| 1521 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1522 | static int ttwu_activate_remote(struct task_struct *p, int wake_flags) | ||
| 1523 | { | ||
| 1524 | struct rq *rq; | ||
| 1525 | int ret = 0; | ||
| 1526 | |||
| 1527 | rq = __task_rq_lock(p); | ||
| 1528 | if (p->on_cpu) { | ||
| 1529 | ttwu_activate(rq, p, ENQUEUE_WAKEUP); | ||
| 1530 | ttwu_do_wakeup(rq, p, wake_flags); | ||
| 1531 | ret = 1; | ||
| 1532 | } | ||
| 1533 | __task_rq_unlock(rq); | ||
| 1534 | |||
| 1535 | return ret; | ||
| 1536 | |||
| 1537 | } | ||
| 1538 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
| 1539 | |||
| 1540 | bool cpus_share_cache(int this_cpu, int that_cpu) | 1364 | bool cpus_share_cache(int this_cpu, int that_cpu) |
| 1541 | { | 1365 | { |
| 1542 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); | 1366 | return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); |
| @@ -1597,21 +1421,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
| 1597 | * If the owning (remote) cpu is still in the middle of schedule() with | 1421 | * If the owning (remote) cpu is still in the middle of schedule() with |
| 1598 | * this task as prev, wait until it's done referencing the task. | 1422 | * this task as prev, wait until it's done referencing the task. |
| 1599 | */ | 1423 | */ |
| 1600 | while (p->on_cpu) { | 1424 | while (p->on_cpu) |
| 1601 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1602 | /* | ||
| 1603 | * In case the architecture enables interrupts in | ||
| 1604 | * context_switch(), we cannot busy wait, since that | ||
| 1605 | * would lead to deadlocks when an interrupt hits and | ||
| 1606 | * tries to wake up @prev. So bail and do a complete | ||
| 1607 | * remote wakeup. | ||
| 1608 | */ | ||
| 1609 | if (ttwu_activate_remote(p, wake_flags)) | ||
| 1610 | goto stat; | ||
| 1611 | #else | ||
| 1612 | cpu_relax(); | 1425 | cpu_relax(); |
| 1613 | #endif | ||
| 1614 | } | ||
| 1615 | /* | 1426 | /* |
| 1616 | * Pairs with the smp_wmb() in finish_lock_switch(). | 1427 | * Pairs with the smp_wmb() in finish_lock_switch(). |
| 1617 | */ | 1428 | */ |
| @@ -1953,14 +1764,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
| 1953 | * Manfred Spraul <manfred@colorfullife.com> | 1764 | * Manfred Spraul <manfred@colorfullife.com> |
| 1954 | */ | 1765 | */ |
| 1955 | prev_state = prev->state; | 1766 | prev_state = prev->state; |
| 1767 | vtime_task_switch(prev); | ||
| 1956 | finish_arch_switch(prev); | 1768 | finish_arch_switch(prev); |
| 1957 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1958 | local_irq_disable(); | ||
| 1959 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
| 1960 | perf_event_task_sched_in(prev, current); | 1769 | perf_event_task_sched_in(prev, current); |
| 1961 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1962 | local_irq_enable(); | ||
| 1963 | #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ | ||
| 1964 | finish_lock_switch(rq, prev); | 1770 | finish_lock_switch(rq, prev); |
| 1965 | finish_arch_post_lock_switch(); | 1771 | finish_arch_post_lock_switch(); |
| 1966 | 1772 | ||
| @@ -2081,6 +1887,7 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2081 | #endif | 1887 | #endif |
| 2082 | 1888 | ||
| 2083 | /* Here we just switch the register state and the stack. */ | 1889 | /* Here we just switch the register state and the stack. */ |
| 1890 | rcu_switch(prev, next); | ||
| 2084 | switch_to(prev, next, prev); | 1891 | switch_to(prev, next, prev); |
| 2085 | 1892 | ||
| 2086 | barrier(); | 1893 | barrier(); |
| @@ -2809,404 +2616,6 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
| 2809 | return ns; | 2616 | return ns; |
| 2810 | } | 2617 | } |
| 2811 | 2618 | ||
| 2812 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 2813 | struct cgroup_subsys cpuacct_subsys; | ||
| 2814 | struct cpuacct root_cpuacct; | ||
| 2815 | #endif | ||
| 2816 | |||
| 2817 | static inline void task_group_account_field(struct task_struct *p, int index, | ||
| 2818 | u64 tmp) | ||
| 2819 | { | ||
| 2820 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 2821 | struct kernel_cpustat *kcpustat; | ||
| 2822 | struct cpuacct *ca; | ||
| 2823 | #endif | ||
| 2824 | /* | ||
| 2825 | * Since all updates are sure to touch the root cgroup, we | ||
| 2826 | * get ourselves ahead and touch it first. If the root cgroup | ||
| 2827 | * is the only cgroup, then nothing else should be necessary. | ||
| 2828 | * | ||
| 2829 | */ | ||
| 2830 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | ||
| 2831 | |||
| 2832 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 2833 | if (unlikely(!cpuacct_subsys.active)) | ||
| 2834 | return; | ||
| 2835 | |||
| 2836 | rcu_read_lock(); | ||
| 2837 | ca = task_ca(p); | ||
| 2838 | while (ca && (ca != &root_cpuacct)) { | ||
| 2839 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
| 2840 | kcpustat->cpustat[index] += tmp; | ||
| 2841 | ca = parent_ca(ca); | ||
| 2842 | } | ||
| 2843 | rcu_read_unlock(); | ||
| 2844 | #endif | ||
| 2845 | } | ||
| 2846 | |||
| 2847 | |||
| 2848 | /* | ||
| 2849 | * Account user cpu time to a process. | ||
| 2850 | * @p: the process that the cpu time gets accounted to | ||
| 2851 | * @cputime: the cpu time spent in user space since the last update | ||
| 2852 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 2853 | */ | ||
| 2854 | void account_user_time(struct task_struct *p, cputime_t cputime, | ||
| 2855 | cputime_t cputime_scaled) | ||
| 2856 | { | ||
| 2857 | int index; | ||
| 2858 | |||
| 2859 | /* Add user time to process. */ | ||
| 2860 | p->utime += cputime; | ||
| 2861 | p->utimescaled += cputime_scaled; | ||
| 2862 | account_group_user_time(p, cputime); | ||
| 2863 | |||
| 2864 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | ||
| 2865 | |||
| 2866 | /* Add user time to cpustat. */ | ||
| 2867 | task_group_account_field(p, index, (__force u64) cputime); | ||
| 2868 | |||
| 2869 | /* Account for user time used */ | ||
| 2870 | acct_update_integrals(p); | ||
| 2871 | } | ||
| 2872 | |||
| 2873 | /* | ||
| 2874 | * Account guest cpu time to a process. | ||
| 2875 | * @p: the process that the cpu time gets accounted to | ||
| 2876 | * @cputime: the cpu time spent in virtual machine since the last update | ||
| 2877 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 2878 | */ | ||
| 2879 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | ||
| 2880 | cputime_t cputime_scaled) | ||
| 2881 | { | ||
| 2882 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 2883 | |||
| 2884 | /* Add guest time to process. */ | ||
| 2885 | p->utime += cputime; | ||
| 2886 | p->utimescaled += cputime_scaled; | ||
| 2887 | account_group_user_time(p, cputime); | ||
| 2888 | p->gtime += cputime; | ||
| 2889 | |||
| 2890 | /* Add guest time to cpustat. */ | ||
| 2891 | if (TASK_NICE(p) > 0) { | ||
| 2892 | cpustat[CPUTIME_NICE] += (__force u64) cputime; | ||
| 2893 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; | ||
| 2894 | } else { | ||
| 2895 | cpustat[CPUTIME_USER] += (__force u64) cputime; | ||
| 2896 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; | ||
| 2897 | } | ||
| 2898 | } | ||
| 2899 | |||
| 2900 | /* | ||
| 2901 | * Account system cpu time to a process and desired cpustat field | ||
| 2902 | * @p: the process that the cpu time gets accounted to | ||
| 2903 | * @cputime: the cpu time spent in kernel space since the last update | ||
| 2904 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 2905 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
| 2906 | */ | ||
| 2907 | static inline | ||
| 2908 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
| 2909 | cputime_t cputime_scaled, int index) | ||
| 2910 | { | ||
| 2911 | /* Add system time to process. */ | ||
| 2912 | p->stime += cputime; | ||
| 2913 | p->stimescaled += cputime_scaled; | ||
| 2914 | account_group_system_time(p, cputime); | ||
| 2915 | |||
| 2916 | /* Add system time to cpustat. */ | ||
| 2917 | task_group_account_field(p, index, (__force u64) cputime); | ||
| 2918 | |||
| 2919 | /* Account for system time used */ | ||
| 2920 | acct_update_integrals(p); | ||
| 2921 | } | ||
| 2922 | |||
| 2923 | /* | ||
| 2924 | * Account system cpu time to a process. | ||
| 2925 | * @p: the process that the cpu time gets accounted to | ||
| 2926 | * @hardirq_offset: the offset to subtract from hardirq_count() | ||
| 2927 | * @cputime: the cpu time spent in kernel space since the last update | ||
| 2928 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 2929 | */ | ||
| 2930 | void account_system_time(struct task_struct *p, int hardirq_offset, | ||
| 2931 | cputime_t cputime, cputime_t cputime_scaled) | ||
| 2932 | { | ||
| 2933 | int index; | ||
| 2934 | |||
| 2935 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | ||
| 2936 | account_guest_time(p, cputime, cputime_scaled); | ||
| 2937 | return; | ||
| 2938 | } | ||
| 2939 | |||
| 2940 | if (hardirq_count() - hardirq_offset) | ||
| 2941 | index = CPUTIME_IRQ; | ||
| 2942 | else if (in_serving_softirq()) | ||
| 2943 | index = CPUTIME_SOFTIRQ; | ||
| 2944 | else | ||
| 2945 | index = CPUTIME_SYSTEM; | ||
| 2946 | |||
| 2947 | __account_system_time(p, cputime, cputime_scaled, index); | ||
| 2948 | } | ||
| 2949 | |||
| 2950 | /* | ||
| 2951 | * Account for involuntary wait time. | ||
| 2952 | * @cputime: the cpu time spent in involuntary wait | ||
| 2953 | */ | ||
| 2954 | void account_steal_time(cputime_t cputime) | ||
| 2955 | { | ||
| 2956 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 2957 | |||
| 2958 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; | ||
| 2959 | } | ||
| 2960 | |||
| 2961 | /* | ||
| 2962 | * Account for idle time. | ||
| 2963 | * @cputime: the cpu time spent in idle wait | ||
| 2964 | */ | ||
| 2965 | void account_idle_time(cputime_t cputime) | ||
| 2966 | { | ||
| 2967 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 2968 | struct rq *rq = this_rq(); | ||
| 2969 | |||
| 2970 | if (atomic_read(&rq->nr_iowait) > 0) | ||
| 2971 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; | ||
| 2972 | else | ||
| 2973 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | ||
| 2974 | } | ||
| 2975 | |||
| 2976 | static __always_inline bool steal_account_process_tick(void) | ||
| 2977 | { | ||
| 2978 | #ifdef CONFIG_PARAVIRT | ||
| 2979 | if (static_key_false(¶virt_steal_enabled)) { | ||
| 2980 | u64 steal, st = 0; | ||
| 2981 | |||
| 2982 | steal = paravirt_steal_clock(smp_processor_id()); | ||
| 2983 | steal -= this_rq()->prev_steal_time; | ||
| 2984 | |||
| 2985 | st = steal_ticks(steal); | ||
| 2986 | this_rq()->prev_steal_time += st * TICK_NSEC; | ||
| 2987 | |||
| 2988 | account_steal_time(st); | ||
| 2989 | return st; | ||
| 2990 | } | ||
| 2991 | #endif | ||
| 2992 | return false; | ||
| 2993 | } | ||
| 2994 | |||
| 2995 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 2996 | |||
| 2997 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 2998 | /* | ||
| 2999 | * Account a tick to a process and cpustat | ||
| 3000 | * @p: the process that the cpu time gets accounted to | ||
| 3001 | * @user_tick: is the tick from userspace | ||
| 3002 | * @rq: the pointer to rq | ||
| 3003 | * | ||
| 3004 | * Tick demultiplexing follows the order | ||
| 3005 | * - pending hardirq update | ||
| 3006 | * - pending softirq update | ||
| 3007 | * - user_time | ||
| 3008 | * - idle_time | ||
| 3009 | * - system time | ||
| 3010 | * - check for guest_time | ||
| 3011 | * - else account as system_time | ||
| 3012 | * | ||
| 3013 | * Check for hardirq is done both for system and user time as there is | ||
| 3014 | * no timer going off while we are on hardirq and hence we may never get an | ||
| 3015 | * opportunity to update it solely in system time. | ||
| 3016 | * p->stime and friends are only updated on system time and not on irq | ||
| 3017 | * softirq as those do not count in task exec_runtime any more. | ||
| 3018 | */ | ||
| 3019 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
| 3020 | struct rq *rq) | ||
| 3021 | { | ||
| 3022 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
| 3023 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 3024 | |||
| 3025 | if (steal_account_process_tick()) | ||
| 3026 | return; | ||
| 3027 | |||
| 3028 | if (irqtime_account_hi_update()) { | ||
| 3029 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; | ||
| 3030 | } else if (irqtime_account_si_update()) { | ||
| 3031 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; | ||
| 3032 | } else if (this_cpu_ksoftirqd() == p) { | ||
| 3033 | /* | ||
| 3034 | * ksoftirqd time do not get accounted in cpu_softirq_time. | ||
| 3035 | * So, we have to handle it separately here. | ||
| 3036 | * Also, p->stime needs to be updated for ksoftirqd. | ||
| 3037 | */ | ||
| 3038 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
| 3039 | CPUTIME_SOFTIRQ); | ||
| 3040 | } else if (user_tick) { | ||
| 3041 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 3042 | } else if (p == rq->idle) { | ||
| 3043 | account_idle_time(cputime_one_jiffy); | ||
| 3044 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
| 3045 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 3046 | } else { | ||
| 3047 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
| 3048 | CPUTIME_SYSTEM); | ||
| 3049 | } | ||
| 3050 | } | ||
| 3051 | |||
| 3052 | static void irqtime_account_idle_ticks(int ticks) | ||
| 3053 | { | ||
| 3054 | int i; | ||
| 3055 | struct rq *rq = this_rq(); | ||
| 3056 | |||
| 3057 | for (i = 0; i < ticks; i++) | ||
| 3058 | irqtime_account_process_tick(current, 0, rq); | ||
| 3059 | } | ||
| 3060 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 3061 | static void irqtime_account_idle_ticks(int ticks) {} | ||
| 3062 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
| 3063 | struct rq *rq) {} | ||
| 3064 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 3065 | |||
| 3066 | /* | ||
| 3067 | * Account a single tick of cpu time. | ||
| 3068 | * @p: the process that the cpu time gets accounted to | ||
| 3069 | * @user_tick: indicates if the tick is a user or a system tick | ||
| 3070 | */ | ||
| 3071 | void account_process_tick(struct task_struct *p, int user_tick) | ||
| 3072 | { | ||
| 3073 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
| 3074 | struct rq *rq = this_rq(); | ||
| 3075 | |||
| 3076 | if (sched_clock_irqtime) { | ||
| 3077 | irqtime_account_process_tick(p, user_tick, rq); | ||
| 3078 | return; | ||
| 3079 | } | ||
| 3080 | |||
| 3081 | if (steal_account_process_tick()) | ||
| 3082 | return; | ||
| 3083 | |||
| 3084 | if (user_tick) | ||
| 3085 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 3086 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | ||
| 3087 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | ||
| 3088 | one_jiffy_scaled); | ||
| 3089 | else | ||
| 3090 | account_idle_time(cputime_one_jiffy); | ||
| 3091 | } | ||
| 3092 | |||
| 3093 | /* | ||
| 3094 | * Account multiple ticks of steal time. | ||
| 3095 | * @p: the process from which the cpu time has been stolen | ||
| 3096 | * @ticks: number of stolen ticks | ||
| 3097 | */ | ||
| 3098 | void account_steal_ticks(unsigned long ticks) | ||
| 3099 | { | ||
| 3100 | account_steal_time(jiffies_to_cputime(ticks)); | ||
| 3101 | } | ||
| 3102 | |||
| 3103 | /* | ||
| 3104 | * Account multiple ticks of idle time. | ||
| 3105 | * @ticks: number of stolen ticks | ||
| 3106 | */ | ||
| 3107 | void account_idle_ticks(unsigned long ticks) | ||
| 3108 | { | ||
| 3109 | |||
| 3110 | if (sched_clock_irqtime) { | ||
| 3111 | irqtime_account_idle_ticks(ticks); | ||
| 3112 | return; | ||
| 3113 | } | ||
| 3114 | |||
| 3115 | account_idle_time(jiffies_to_cputime(ticks)); | ||
| 3116 | } | ||
| 3117 | |||
| 3118 | #endif | ||
| 3119 | |||
| 3120 | /* | ||
| 3121 | * Use precise platform statistics if available: | ||
| 3122 | */ | ||
| 3123 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 3124 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 3125 | { | ||
| 3126 | *ut = p->utime; | ||
| 3127 | *st = p->stime; | ||
| 3128 | } | ||
| 3129 | |||
| 3130 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 3131 | { | ||
| 3132 | struct task_cputime cputime; | ||
| 3133 | |||
| 3134 | thread_group_cputime(p, &cputime); | ||
| 3135 | |||
| 3136 | *ut = cputime.utime; | ||
| 3137 | *st = cputime.stime; | ||
| 3138 | } | ||
| 3139 | #else | ||
| 3140 | |||
| 3141 | #ifndef nsecs_to_cputime | ||
| 3142 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | ||
| 3143 | #endif | ||
| 3144 | |||
| 3145 | static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | ||
| 3146 | { | ||
| 3147 | u64 temp = (__force u64) rtime; | ||
| 3148 | |||
| 3149 | temp *= (__force u64) utime; | ||
| 3150 | |||
| 3151 | if (sizeof(cputime_t) == 4) | ||
| 3152 | temp = div_u64(temp, (__force u32) total); | ||
| 3153 | else | ||
| 3154 | temp = div64_u64(temp, (__force u64) total); | ||
| 3155 | |||
| 3156 | return (__force cputime_t) temp; | ||
| 3157 | } | ||
| 3158 | |||
| 3159 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 3160 | { | ||
| 3161 | cputime_t rtime, utime = p->utime, total = utime + p->stime; | ||
| 3162 | |||
| 3163 | /* | ||
| 3164 | * Use CFS's precise accounting: | ||
| 3165 | */ | ||
| 3166 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | ||
| 3167 | |||
| 3168 | if (total) | ||
| 3169 | utime = scale_utime(utime, rtime, total); | ||
| 3170 | else | ||
| 3171 | utime = rtime; | ||
| 3172 | |||
| 3173 | /* | ||
| 3174 | * Compare with previous values, to keep monotonicity: | ||
| 3175 | */ | ||
| 3176 | p->prev_utime = max(p->prev_utime, utime); | ||
| 3177 | p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); | ||
| 3178 | |||
| 3179 | *ut = p->prev_utime; | ||
| 3180 | *st = p->prev_stime; | ||
| 3181 | } | ||
| 3182 | |||
| 3183 | /* | ||
| 3184 | * Must be called with siglock held. | ||
| 3185 | */ | ||
| 3186 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 3187 | { | ||
| 3188 | struct signal_struct *sig = p->signal; | ||
| 3189 | struct task_cputime cputime; | ||
| 3190 | cputime_t rtime, utime, total; | ||
| 3191 | |||
| 3192 | thread_group_cputime(p, &cputime); | ||
| 3193 | |||
| 3194 | total = cputime.utime + cputime.stime; | ||
| 3195 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | ||
| 3196 | |||
| 3197 | if (total) | ||
| 3198 | utime = scale_utime(cputime.utime, rtime, total); | ||
| 3199 | else | ||
| 3200 | utime = rtime; | ||
| 3201 | |||
| 3202 | sig->prev_utime = max(sig->prev_utime, utime); | ||
| 3203 | sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); | ||
| 3204 | |||
| 3205 | *ut = sig->prev_utime; | ||
| 3206 | *st = sig->prev_stime; | ||
| 3207 | } | ||
| 3208 | #endif | ||
| 3209 | |||
| 3210 | /* | 2619 | /* |
| 3211 | * This function gets called by the timer code, with HZ frequency. | 2620 | * This function gets called by the timer code, with HZ frequency. |
| 3212 | * We call it with interrupts disabled. | 2621 | * We call it with interrupts disabled. |
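Note (not part of the patch): the cputime helpers removed in this hunk (they reappear in the new kernel/sched/cputime.c introduced later in the patch) split the precisely measured runtime in the utime:stime ratio seen by tick accounting, roughly utime_scaled = utime * rtime / total, and then clamp against previously reported values so successive reads never go backwards. A small illustrative sketch of that scaling and monotonicity step, using plain 64-bit integers instead of cputime_t:

#include <inttypes.h>
#include <stdio.h>

/*
 * Illustrative sketch of the scale_utime()/task_times() idea: split the
 * precise runtime (rtime) in the utime:stime ratio from tick accounting,
 * then clamp so the reported values only move forward.  The kernel widens
 * to u64 and uses div64_u64() for the division; the subtraction is guarded
 * here to keep the sketch safe on its own.
 */
static uint64_t scale_utime(uint64_t utime, uint64_t rtime, uint64_t total)
{
        return total ? (utime * rtime) / total : rtime;
}

struct prev_times { uint64_t utime, stime; };

static void task_times(struct prev_times *prev, uint64_t utime, uint64_t stime,
                       uint64_t rtime, uint64_t *ut, uint64_t *st)
{
        uint64_t u = scale_utime(utime, rtime, utime + stime);
        uint64_t s;

        if (u > prev->utime)                    /* keep utime monotone */
                prev->utime = u;
        s = rtime > prev->utime ? rtime - prev->utime : 0;
        if (s > prev->stime)                    /* keep stime monotone */
                prev->stime = s;

        *ut = prev->utime;
        *st = prev->stime;
}

int main(void)
{
        struct prev_times prev = { 0, 0 };
        uint64_t ut, st;

        /* ticks saw a 60:40 user/system split, precise runtime is 90 units */
        task_times(&prev, 60, 40, 90, &ut, &st);
        printf("utime=%" PRIu64 " stime=%" PRIu64 "\n", ut, st); /* 54 and 36 */
        return 0;
}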
| @@ -3367,6 +2776,40 @@ pick_next_task(struct rq *rq) | |||
| 3367 | 2776 | ||
| 3368 | /* | 2777 | /* |
| 3369 | * __schedule() is the main scheduler function. | 2778 | * __schedule() is the main scheduler function. |
| 2779 | * | ||
| 2780 | * The main means of driving the scheduler and thus entering this function are: | ||
| 2781 | * | ||
| 2782 | * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. | ||
| 2783 | * | ||
| 2784 | * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return | ||
| 2785 | * paths. For example, see arch/x86/entry_64.S. | ||
| 2786 | * | ||
| 2787 | * To drive preemption between tasks, the scheduler sets the flag in timer | ||
| 2788 | * interrupt handler scheduler_tick(). | ||
| 2789 | * | ||
| 2790 | * 3. Wakeups don't really cause entry into schedule(). They add a | ||
| 2791 | * task to the run-queue and that's it. | ||
| 2792 | * | ||
| 2793 | * Now, if the new task added to the run-queue preempts the current | ||
| 2794 | * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets | ||
| 2795 | * called on the nearest possible occasion: | ||
| 2796 | * | ||
| 2797 | * - If the kernel is preemptible (CONFIG_PREEMPT=y): | ||
| 2798 | * | ||
| 2799 | * - in syscall or exception context, at the next outermost | ||
| 2800 | * preempt_enable(). (this might be as soon as the wake_up()'s | ||
| 2801 | * spin_unlock()!) | ||
| 2802 | * | ||
| 2803 | * - in IRQ context, return from interrupt-handler to | ||
| 2804 | * preemptible context | ||
| 2805 | * | ||
| 2806 | * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) | ||
| 2807 | * then at the next: | ||
| 2808 | * | ||
| 2809 | * - cond_resched() call | ||
| 2810 | * - explicit schedule() call | ||
| 2811 | * - return from syscall or exception to user-space | ||
| 2812 | * - return from interrupt-handler to user-space | ||
| 3370 | */ | 2813 | */ |
| 3371 | static void __sched __schedule(void) | 2814 | static void __sched __schedule(void) |
| 3372 | { | 2815 | { |
| @@ -3468,6 +2911,21 @@ asmlinkage void __sched schedule(void) | |||
| 3468 | } | 2911 | } |
| 3469 | EXPORT_SYMBOL(schedule); | 2912 | EXPORT_SYMBOL(schedule); |
| 3470 | 2913 | ||
| 2914 | #ifdef CONFIG_RCU_USER_QS | ||
| 2915 | asmlinkage void __sched schedule_user(void) | ||
| 2916 | { | ||
| 2917 | /* | ||
| 2918 | * If we come here after a random call to set_need_resched(), | ||
| 2919 | * or we have been woken up remotely but the IPI has not yet arrived, | ||
| 2920 | * we haven't yet exited the RCU idle mode. Do it here manually until | ||
| 2921 | * we find a better solution. | ||
| 2922 | */ | ||
| 2923 | rcu_user_exit(); | ||
| 2924 | schedule(); | ||
| 2925 | rcu_user_enter(); | ||
| 2926 | } | ||
| 2927 | #endif | ||
| 2928 | |||
| 3471 | /** | 2929 | /** |
| 3472 | * schedule_preempt_disabled - called with preemption disabled | 2930 | * schedule_preempt_disabled - called with preemption disabled |
| 3473 | * | 2931 | * |
| @@ -3569,6 +3027,7 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
| 3569 | /* Catch callers which need to be fixed */ | 3027 | /* Catch callers which need to be fixed */ |
| 3570 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 3028 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
| 3571 | 3029 | ||
| 3030 | rcu_user_exit(); | ||
| 3572 | do { | 3031 | do { |
| 3573 | add_preempt_count(PREEMPT_ACTIVE); | 3032 | add_preempt_count(PREEMPT_ACTIVE); |
| 3574 | local_irq_enable(); | 3033 | local_irq_enable(); |
| @@ -4868,13 +4327,6 @@ again: | |||
| 4868 | */ | 4327 | */ |
| 4869 | if (preempt && rq != p_rq) | 4328 | if (preempt && rq != p_rq) |
| 4870 | resched_task(p_rq->curr); | 4329 | resched_task(p_rq->curr); |
| 4871 | } else { | ||
| 4872 | /* | ||
| 4873 | * We might have set it in task_yield_fair(), but are | ||
| 4874 | * not going to schedule(), so don't want to skip | ||
| 4875 | * the next update. | ||
| 4876 | */ | ||
| 4877 | rq->skip_clock_update = 0; | ||
| 4878 | } | 4330 | } |
| 4879 | 4331 | ||
| 4880 | out: | 4332 | out: |
| @@ -5416,16 +4868,25 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) | |||
| 5416 | *tablep = NULL; | 4868 | *tablep = NULL; |
| 5417 | } | 4869 | } |
| 5418 | 4870 | ||
| 4871 | static int min_load_idx = 0; | ||
| 4872 | static int max_load_idx = CPU_LOAD_IDX_MAX; | ||
| 4873 | |||
| 5419 | static void | 4874 | static void |
| 5420 | set_table_entry(struct ctl_table *entry, | 4875 | set_table_entry(struct ctl_table *entry, |
| 5421 | const char *procname, void *data, int maxlen, | 4876 | const char *procname, void *data, int maxlen, |
| 5422 | umode_t mode, proc_handler *proc_handler) | 4877 | umode_t mode, proc_handler *proc_handler, |
| 4878 | bool load_idx) | ||
| 5423 | { | 4879 | { |
| 5424 | entry->procname = procname; | 4880 | entry->procname = procname; |
| 5425 | entry->data = data; | 4881 | entry->data = data; |
| 5426 | entry->maxlen = maxlen; | 4882 | entry->maxlen = maxlen; |
| 5427 | entry->mode = mode; | 4883 | entry->mode = mode; |
| 5428 | entry->proc_handler = proc_handler; | 4884 | entry->proc_handler = proc_handler; |
| 4885 | |||
| 4886 | if (load_idx) { | ||
| 4887 | entry->extra1 = &min_load_idx; | ||
| 4888 | entry->extra2 = &max_load_idx; | ||
| 4889 | } | ||
| 5429 | } | 4890 | } |
| 5430 | 4891 | ||
| 5431 | static struct ctl_table * | 4892 | static struct ctl_table * |
| @@ -5437,30 +4898,30 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
| 5437 | return NULL; | 4898 | return NULL; |
| 5438 | 4899 | ||
| 5439 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 4900 | set_table_entry(&table[0], "min_interval", &sd->min_interval, |
| 5440 | sizeof(long), 0644, proc_doulongvec_minmax); | 4901 | sizeof(long), 0644, proc_doulongvec_minmax, false); |
| 5441 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | 4902 | set_table_entry(&table[1], "max_interval", &sd->max_interval, |
| 5442 | sizeof(long), 0644, proc_doulongvec_minmax); | 4903 | sizeof(long), 0644, proc_doulongvec_minmax, false); |
| 5443 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | 4904 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, |
| 5444 | sizeof(int), 0644, proc_dointvec_minmax); | 4905 | sizeof(int), 0644, proc_dointvec_minmax, true); |
| 5445 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | 4906 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, |
| 5446 | sizeof(int), 0644, proc_dointvec_minmax); | 4907 | sizeof(int), 0644, proc_dointvec_minmax, true); |
| 5447 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | 4908 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, |
| 5448 | sizeof(int), 0644, proc_dointvec_minmax); | 4909 | sizeof(int), 0644, proc_dointvec_minmax, true); |
| 5449 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | 4910 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, |
| 5450 | sizeof(int), 0644, proc_dointvec_minmax); | 4911 | sizeof(int), 0644, proc_dointvec_minmax, true); |
| 5451 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | 4912 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, |
| 5452 | sizeof(int), 0644, proc_dointvec_minmax); | 4913 | sizeof(int), 0644, proc_dointvec_minmax, true); |
| 5453 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | 4914 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, |
| 5454 | sizeof(int), 0644, proc_dointvec_minmax); | 4915 | sizeof(int), 0644, proc_dointvec_minmax, false); |
| 5455 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | 4916 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, |
| 5456 | sizeof(int), 0644, proc_dointvec_minmax); | 4917 | sizeof(int), 0644, proc_dointvec_minmax, false); |
| 5457 | set_table_entry(&table[9], "cache_nice_tries", | 4918 | set_table_entry(&table[9], "cache_nice_tries", |
| 5458 | &sd->cache_nice_tries, | 4919 | &sd->cache_nice_tries, |
| 5459 | sizeof(int), 0644, proc_dointvec_minmax); | 4920 | sizeof(int), 0644, proc_dointvec_minmax, false); |
| 5460 | set_table_entry(&table[10], "flags", &sd->flags, | 4921 | set_table_entry(&table[10], "flags", &sd->flags, |
| 5461 | sizeof(int), 0644, proc_dointvec_minmax); | 4922 | sizeof(int), 0644, proc_dointvec_minmax, false); |
| 5462 | set_table_entry(&table[11], "name", sd->name, | 4923 | set_table_entry(&table[11], "name", sd->name, |
| 5463 | CORENAME_MAX_SIZE, 0444, proc_dostring); | 4924 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); |
| 5464 | /* &table[12] is terminator */ | 4925 | /* &table[12] is terminator */ |
| 5465 | 4926 | ||
| 5466 | return table; | 4927 | return table; |
| @@ -5604,7 +5065,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
| 5604 | migrate_tasks(cpu); | 5065 | migrate_tasks(cpu); |
| 5605 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | 5066 | BUG_ON(rq->nr_running != 1); /* the migration thread */ |
| 5606 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 5067 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
| 5068 | break; | ||
| 5607 | 5069 | ||
| 5070 | case CPU_DEAD: | ||
| 5608 | calc_load_migrate(rq); | 5071 | calc_load_migrate(rq); |
| 5609 | break; | 5072 | break; |
| 5610 | #endif | 5073 | #endif |
| @@ -6537,7 +6000,6 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | |||
| 6537 | | 0*SD_BALANCE_FORK | 6000 | | 0*SD_BALANCE_FORK |
| 6538 | | 0*SD_BALANCE_WAKE | 6001 | | 0*SD_BALANCE_WAKE |
| 6539 | | 0*SD_WAKE_AFFINE | 6002 | | 0*SD_WAKE_AFFINE |
| 6540 | | 0*SD_PREFER_LOCAL | ||
| 6541 | | 0*SD_SHARE_CPUPOWER | 6003 | | 0*SD_SHARE_CPUPOWER |
| 6542 | | 0*SD_SHARE_PKG_RESOURCES | 6004 | | 0*SD_SHARE_PKG_RESOURCES |
| 6543 | | 1*SD_SERIALIZE | 6005 | | 1*SD_SERIALIZE |
| @@ -8335,6 +7797,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
| 8335 | * (balbir@in.ibm.com). | 7797 | * (balbir@in.ibm.com). |
| 8336 | */ | 7798 | */ |
| 8337 | 7799 | ||
| 7800 | struct cpuacct root_cpuacct; | ||
| 7801 | |||
| 8338 | /* create a new cpu accounting group */ | 7802 | /* create a new cpu accounting group */ |
| 8339 | static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) | 7803 | static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) |
| 8340 | { | 7804 | { |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c new file mode 100644 index 000000000000..81b763ba58a6 --- /dev/null +++ b/kernel/sched/cputime.c | |||
| @@ -0,0 +1,530 @@ | |||
| 1 | #include <linux/export.h> | ||
| 2 | #include <linux/sched.h> | ||
| 3 | #include <linux/tsacct_kern.h> | ||
| 4 | #include <linux/kernel_stat.h> | ||
| 5 | #include <linux/static_key.h> | ||
| 6 | #include "sched.h" | ||
| 7 | |||
| 8 | |||
| 9 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 10 | |||
| 11 | /* | ||
| 12 | * There are no locks covering percpu hardirq/softirq time. | ||
| 13 | * They are only modified in vtime_account, on corresponding CPU | ||
| 14 | * with interrupts disabled. So, writes are safe. | ||
| 15 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
| 16 | * This may result in other CPU reading this CPU's irq time and can | ||
| 17 | * race with irq/vtime_account on this CPU. We would either get old | ||
| 18 | * or new value with a side effect of accounting a slice of irq time to wrong | ||
| 19 | * task when irq is in progress while we read rq->clock. That is a worthy | ||
| 20 | * compromise in place of having locks on each irq in account_system_time. | ||
| 21 | */ | ||
| 22 | DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
| 23 | DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
| 24 | |||
| 25 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
| 26 | static int sched_clock_irqtime; | ||
| 27 | |||
| 28 | void enable_sched_clock_irqtime(void) | ||
| 29 | { | ||
| 30 | sched_clock_irqtime = 1; | ||
| 31 | } | ||
| 32 | |||
| 33 | void disable_sched_clock_irqtime(void) | ||
| 34 | { | ||
| 35 | sched_clock_irqtime = 0; | ||
| 36 | } | ||
| 37 | |||
| 38 | #ifndef CONFIG_64BIT | ||
| 39 | DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
| 40 | #endif /* CONFIG_64BIT */ | ||
| 41 | |||
| 42 | /* | ||
| 43 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
| 44 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
| 45 | */ | ||
| 46 | void vtime_account(struct task_struct *curr) | ||
| 47 | { | ||
| 48 | unsigned long flags; | ||
| 49 | s64 delta; | ||
| 50 | int cpu; | ||
| 51 | |||
| 52 | if (!sched_clock_irqtime) | ||
| 53 | return; | ||
| 54 | |||
| 55 | local_irq_save(flags); | ||
| 56 | |||
| 57 | cpu = smp_processor_id(); | ||
| 58 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | ||
| 59 | __this_cpu_add(irq_start_time, delta); | ||
| 60 | |||
| 61 | irq_time_write_begin(); | ||
| 62 | /* | ||
| 63 | * We do not account for softirq time from ksoftirqd here. | ||
| 64 | * We want to continue accounting softirq time to ksoftirqd thread | ||
| 65 | * in that case, so as not to confuse the scheduler with a special task | ||
| 66 | * that does not consume any time but still wants to run. | ||
| 67 | */ | ||
| 68 | if (hardirq_count()) | ||
| 69 | __this_cpu_add(cpu_hardirq_time, delta); | ||
| 70 | else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) | ||
| 71 | __this_cpu_add(cpu_softirq_time, delta); | ||
| 72 | |||
| 73 | irq_time_write_end(); | ||
| 74 | local_irq_restore(flags); | ||
| 75 | } | ||
| 76 | EXPORT_SYMBOL_GPL(vtime_account); | ||
| 77 | |||
| 78 | static int irqtime_account_hi_update(void) | ||
| 79 | { | ||
| 80 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 81 | unsigned long flags; | ||
| 82 | u64 latest_ns; | ||
| 83 | int ret = 0; | ||
| 84 | |||
| 85 | local_irq_save(flags); | ||
| 86 | latest_ns = this_cpu_read(cpu_hardirq_time); | ||
| 87 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) | ||
| 88 | ret = 1; | ||
| 89 | local_irq_restore(flags); | ||
| 90 | return ret; | ||
| 91 | } | ||
| 92 | |||
| 93 | static int irqtime_account_si_update(void) | ||
| 94 | { | ||
| 95 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 96 | unsigned long flags; | ||
| 97 | u64 latest_ns; | ||
| 98 | int ret = 0; | ||
| 99 | |||
| 100 | local_irq_save(flags); | ||
| 101 | latest_ns = this_cpu_read(cpu_softirq_time); | ||
| 102 | if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) | ||
| 103 | ret = 1; | ||
| 104 | local_irq_restore(flags); | ||
| 105 | return ret; | ||
| 106 | } | ||
| 107 | |||
| 108 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 109 | |||
| 110 | #define sched_clock_irqtime (0) | ||
| 111 | |||
| 112 | #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 113 | |||
| 114 | static inline void task_group_account_field(struct task_struct *p, int index, | ||
| 115 | u64 tmp) | ||
| 116 | { | ||
| 117 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 118 | struct kernel_cpustat *kcpustat; | ||
| 119 | struct cpuacct *ca; | ||
| 120 | #endif | ||
| 121 | /* | ||
| 122 | * Since all updates are sure to touch the root cgroup, we | ||
| 123 | * go ahead and touch it first. If the root cgroup is the | ||
| 124 | * only cgroup, then nothing else should be necessary. | ||
| 125 | * | ||
| 126 | */ | ||
| 127 | __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; | ||
| 128 | |||
| 129 | #ifdef CONFIG_CGROUP_CPUACCT | ||
| 130 | if (unlikely(!cpuacct_subsys.active)) | ||
| 131 | return; | ||
| 132 | |||
| 133 | rcu_read_lock(); | ||
| 134 | ca = task_ca(p); | ||
| 135 | while (ca && (ca != &root_cpuacct)) { | ||
| 136 | kcpustat = this_cpu_ptr(ca->cpustat); | ||
| 137 | kcpustat->cpustat[index] += tmp; | ||
| 138 | ca = parent_ca(ca); | ||
| 139 | } | ||
| 140 | rcu_read_unlock(); | ||
| 141 | #endif | ||
| 142 | } | ||
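
To make the charging walk in task_group_account_field() concrete: the root group is charged unconditionally through the percpu kernel_cpustat, and the task's cpuacct ancestors are then charged up to, but not including, the root. The userspace sketch below mirrors only that walk; struct group and its fields are invented stand-ins for the kernel's cpuacct objects, not its API.

/* Minimal userspace sketch of the "charge every level up to the root"
 * walk in task_group_account_field(); the group type and names here
 * are hypothetical, not the kernel's cpuacct structures. */
#include <stdio.h>
#include <stdint.h>

struct group {
	const char *name;
	uint64_t cpustat;     /* accumulated time for this level */
	struct group *parent; /* NULL for the root */
};

static void account_field(struct group *root, struct group *ga, uint64_t tmp)
{
	/* Every update touches the root, so charge it first. */
	root->cpustat += tmp;

	/* Then walk the hierarchy, stopping before the root so it
	 * is not charged twice. */
	for (struct group *g = ga; g && g != root; g = g->parent)
		g->cpustat += tmp;
}

int main(void)
{
	struct group root = { "root", 0, NULL };
	struct group mid  = { "mid",  0, &root };
	struct group leaf = { "leaf", 0, &mid };

	account_field(&root, &leaf, 100);	/* 100 units of user time */

	printf("%s=%llu %s=%llu %s=%llu\n",
	       root.name, (unsigned long long)root.cpustat,
	       mid.name,  (unsigned long long)mid.cpustat,
	       leaf.name, (unsigned long long)leaf.cpustat);
	return 0;
}
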
| 143 | |||
| 144 | /* | ||
| 145 | * Account user cpu time to a process. | ||
| 146 | * @p: the process that the cpu time gets accounted to | ||
| 147 | * @cputime: the cpu time spent in user space since the last update | ||
| 148 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 149 | */ | ||
| 150 | void account_user_time(struct task_struct *p, cputime_t cputime, | ||
| 151 | cputime_t cputime_scaled) | ||
| 152 | { | ||
| 153 | int index; | ||
| 154 | |||
| 155 | /* Add user time to process. */ | ||
| 156 | p->utime += cputime; | ||
| 157 | p->utimescaled += cputime_scaled; | ||
| 158 | account_group_user_time(p, cputime); | ||
| 159 | |||
| 160 | index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | ||
| 161 | |||
| 162 | /* Add user time to cpustat. */ | ||
| 163 | task_group_account_field(p, index, (__force u64) cputime); | ||
| 164 | |||
| 165 | /* Account for user time used */ | ||
| 166 | acct_update_integrals(p); | ||
| 167 | } | ||
| 168 | |||
| 169 | /* | ||
| 170 | * Account guest cpu time to a process. | ||
| 171 | * @p: the process that the cpu time gets accounted to | ||
| 172 | * @cputime: the cpu time spent in virtual machine since the last update | ||
| 173 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 174 | */ | ||
| 175 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | ||
| 176 | cputime_t cputime_scaled) | ||
| 177 | { | ||
| 178 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 179 | |||
| 180 | /* Add guest time to process. */ | ||
| 181 | p->utime += cputime; | ||
| 182 | p->utimescaled += cputime_scaled; | ||
| 183 | account_group_user_time(p, cputime); | ||
| 184 | p->gtime += cputime; | ||
| 185 | |||
| 186 | /* Add guest time to cpustat. */ | ||
| 187 | if (TASK_NICE(p) > 0) { | ||
| 188 | cpustat[CPUTIME_NICE] += (__force u64) cputime; | ||
| 189 | cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime; | ||
| 190 | } else { | ||
| 191 | cpustat[CPUTIME_USER] += (__force u64) cputime; | ||
| 192 | cpustat[CPUTIME_GUEST] += (__force u64) cputime; | ||
| 193 | } | ||
| 194 | } | ||
| 195 | |||
| 196 | /* | ||
| 197 | * Account system cpu time to a process and desired cpustat field | ||
| 198 | * @p: the process that the cpu time gets accounted to | ||
| 199 | * @cputime: the cpu time spent in kernel space since the last update | ||
| 200 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 201 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
| 202 | */ | ||
| 203 | static inline | ||
| 204 | void __account_system_time(struct task_struct *p, cputime_t cputime, | ||
| 205 | cputime_t cputime_scaled, int index) | ||
| 206 | { | ||
| 207 | /* Add system time to process. */ | ||
| 208 | p->stime += cputime; | ||
| 209 | p->stimescaled += cputime_scaled; | ||
| 210 | account_group_system_time(p, cputime); | ||
| 211 | |||
| 212 | /* Add system time to cpustat. */ | ||
| 213 | task_group_account_field(p, index, (__force u64) cputime); | ||
| 214 | |||
| 215 | /* Account for system time used */ | ||
| 216 | acct_update_integrals(p); | ||
| 217 | } | ||
| 218 | |||
| 219 | /* | ||
| 220 | * Account system cpu time to a process. | ||
| 221 | * @p: the process that the cpu time gets accounted to | ||
| 222 | * @hardirq_offset: the offset to subtract from hardirq_count() | ||
| 223 | * @cputime: the cpu time spent in kernel space since the last update | ||
| 224 | * @cputime_scaled: cputime scaled by cpu frequency | ||
| 225 | */ | ||
| 226 | void account_system_time(struct task_struct *p, int hardirq_offset, | ||
| 227 | cputime_t cputime, cputime_t cputime_scaled) | ||
| 228 | { | ||
| 229 | int index; | ||
| 230 | |||
| 231 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | ||
| 232 | account_guest_time(p, cputime, cputime_scaled); | ||
| 233 | return; | ||
| 234 | } | ||
| 235 | |||
| 236 | if (hardirq_count() - hardirq_offset) | ||
| 237 | index = CPUTIME_IRQ; | ||
| 238 | else if (in_serving_softirq()) | ||
| 239 | index = CPUTIME_SOFTIRQ; | ||
| 240 | else | ||
| 241 | index = CPUTIME_SYSTEM; | ||
| 242 | |||
| 243 | __account_system_time(p, cputime, cputime_scaled, index); | ||
| 244 | } | ||
| 245 | |||
| 246 | /* | ||
| 247 | * Account for involuntary wait time. | ||
| 248 | * @cputime: the cpu time spent in involuntary wait | ||
| 249 | */ | ||
| 250 | void account_steal_time(cputime_t cputime) | ||
| 251 | { | ||
| 252 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 253 | |||
| 254 | cpustat[CPUTIME_STEAL] += (__force u64) cputime; | ||
| 255 | } | ||
| 256 | |||
| 257 | /* | ||
| 258 | * Account for idle time. | ||
| 259 | * @cputime: the cpu time spent in idle wait | ||
| 260 | */ | ||
| 261 | void account_idle_time(cputime_t cputime) | ||
| 262 | { | ||
| 263 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 264 | struct rq *rq = this_rq(); | ||
| 265 | |||
| 266 | if (atomic_read(&rq->nr_iowait) > 0) | ||
| 267 | cpustat[CPUTIME_IOWAIT] += (__force u64) cputime; | ||
| 268 | else | ||
| 269 | cpustat[CPUTIME_IDLE] += (__force u64) cputime; | ||
| 270 | } | ||
| 271 | |||
| 272 | static __always_inline bool steal_account_process_tick(void) | ||
| 273 | { | ||
| 274 | #ifdef CONFIG_PARAVIRT | ||
| 275 | if (static_key_false(¶virt_steal_enabled)) { | ||
| 276 | u64 steal, st = 0; | ||
| 277 | |||
| 278 | steal = paravirt_steal_clock(smp_processor_id()); | ||
| 279 | steal -= this_rq()->prev_steal_time; | ||
| 280 | |||
| 281 | st = steal_ticks(steal); | ||
| 282 | this_rq()->prev_steal_time += st * TICK_NSEC; | ||
| 283 | |||
| 284 | account_steal_time(st); | ||
| 285 | return st; | ||
| 286 | } | ||
| 287 | #endif | ||
| 288 | return false; | ||
| 289 | } | ||
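
The reason prev_steal_time is advanced by st * TICK_NSEC rather than by the raw delta is that the sub-tick remainder is deliberately carried over until it accumulates to a full tick. A minimal userspace sketch of that bookkeeping, assuming a 1 ms tick and ignoring the paravirt plumbing:

/* Only whole ticks are accounted; prev_steal_time advances by the
 * accounted amount so the sub-tick remainder carries to the next
 * sample. TICK_NSEC here is an assumed 1 ms tick, not the kernel's. */
#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL	/* 1 ms tick, for illustration */

static uint64_t prev_steal_time;

static uint64_t account_steal(uint64_t steal_clock_ns)
{
	uint64_t steal = steal_clock_ns - prev_steal_time;
	uint64_t ticks = steal / TICK_NSEC;

	prev_steal_time += ticks * TICK_NSEC;
	return ticks;
}

int main(void)
{
	/* 1.7 ms stolen: one tick accounted, 0.7 ms carried over. */
	printf("ticks=%llu\n", (unsigned long long)account_steal(1700000));
	/* Another 0.5 ms stolen: the carry makes it 1.2 ms -> one more tick. */
	printf("ticks=%llu\n", (unsigned long long)account_steal(2200000));
	return 0;
}
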
| 290 | |||
| 291 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 292 | |||
| 293 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 294 | /* | ||
| 295 | * Account a tick to a process and cpustat | ||
| 296 | * @p: the process that the cpu time gets accounted to | ||
| 297 | * @user_tick: is the tick from userspace | ||
| 298 | * @rq: the pointer to rq | ||
| 299 | * | ||
| 300 | * Tick demultiplexing follows the order | ||
| 301 | * - pending hardirq update | ||
| 302 | * - pending softirq update | ||
| 303 | * - user_time | ||
| 304 | * - idle_time | ||
| 305 | * - system time | ||
| 306 | * - check for guest_time | ||
| 307 | * - else account as system_time | ||
| 308 | * | ||
| 309 | * The check for hardirq is done for both system and user time, as there is | ||
| 310 | * no timer going off while we are in a hardirq and hence we may never get an | ||
| 311 | * opportunity to update it solely from system time. | ||
| 312 | * p->stime and friends are only updated on system time and not on irq or | ||
| 313 | * softirq time, as those no longer count in the task's exec_runtime. | ||
| 314 | */ | ||
| 315 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
| 316 | struct rq *rq) | ||
| 317 | { | ||
| 318 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
| 319 | u64 *cpustat = kcpustat_this_cpu->cpustat; | ||
| 320 | |||
| 321 | if (steal_account_process_tick()) | ||
| 322 | return; | ||
| 323 | |||
| 324 | if (irqtime_account_hi_update()) { | ||
| 325 | cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy; | ||
| 326 | } else if (irqtime_account_si_update()) { | ||
| 327 | cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy; | ||
| 328 | } else if (this_cpu_ksoftirqd() == p) { | ||
| 329 | /* | ||
| 330 | * ksoftirqd time does not get accounted in cpu_softirq_time. | ||
| 331 | * So, we have to handle it separately here. | ||
| 332 | * Also, p->stime needs to be updated for ksoftirqd. | ||
| 333 | */ | ||
| 334 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
| 335 | CPUTIME_SOFTIRQ); | ||
| 336 | } else if (user_tick) { | ||
| 337 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 338 | } else if (p == rq->idle) { | ||
| 339 | account_idle_time(cputime_one_jiffy); | ||
| 340 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | ||
| 341 | account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 342 | } else { | ||
| 343 | __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, | ||
| 344 | CPUTIME_SYSTEM); | ||
| 345 | } | ||
| 346 | } | ||
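
The if/else cascade above encodes a strict priority order for each tick. The sketch below reduces it to that decision order alone (steal accounting, which is handled first by steal_account_process_tick(), is omitted); the flags and bucket names are simplified stand-ins for the real cpustat, ksoftirqd, rq->idle and PF_VCPU checks:

/* Illustration of the tick demultiplexing order implemented above. */
#include <stdio.h>

enum bucket { IRQ, SOFTIRQ, KSOFTIRQD_SYS, USER, IDLE, GUEST, SYSTEM };

struct tick_ctx {
	int hi_pending;    /* un-accounted hardirq time exists       */
	int si_pending;    /* un-accounted softirq time exists       */
	int is_ksoftirqd;  /* current task is this cpu's ksoftirqd   */
	int user_tick;     /* tick interrupted user space            */
	int is_idle;       /* current task is the idle task          */
	int is_vcpu;       /* current task is running a guest        */
};

static enum bucket demux(const struct tick_ctx *c)
{
	if (c->hi_pending)
		return IRQ;
	if (c->si_pending)
		return SOFTIRQ;
	if (c->is_ksoftirqd)
		return KSOFTIRQD_SYS;	/* accounted as softirq system time */
	if (c->user_tick)
		return USER;
	if (c->is_idle)
		return IDLE;
	if (c->is_vcpu)
		return GUEST;
	return SYSTEM;
}

int main(void)
{
	struct tick_ctx c = { .si_pending = 1, .user_tick = 1 };

	/* Pending softirq wins over the user tick, as in the order above. */
	printf("bucket=%d\n", demux(&c));
	return 0;
}
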
| 347 | |||
| 348 | static void irqtime_account_idle_ticks(int ticks) | ||
| 349 | { | ||
| 350 | int i; | ||
| 351 | struct rq *rq = this_rq(); | ||
| 352 | |||
| 353 | for (i = 0; i < ticks; i++) | ||
| 354 | irqtime_account_process_tick(current, 0, rq); | ||
| 355 | } | ||
| 356 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 357 | static void irqtime_account_idle_ticks(int ticks) {} | ||
| 358 | static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | ||
| 359 | struct rq *rq) {} | ||
| 360 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
| 361 | |||
| 362 | /* | ||
| 363 | * Account a single tick of cpu time. | ||
| 364 | * @p: the process that the cpu time gets accounted to | ||
| 365 | * @user_tick: indicates if the tick is a user or a system tick | ||
| 366 | */ | ||
| 367 | void account_process_tick(struct task_struct *p, int user_tick) | ||
| 368 | { | ||
| 369 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); | ||
| 370 | struct rq *rq = this_rq(); | ||
| 371 | |||
| 372 | if (sched_clock_irqtime) { | ||
| 373 | irqtime_account_process_tick(p, user_tick, rq); | ||
| 374 | return; | ||
| 375 | } | ||
| 376 | |||
| 377 | if (steal_account_process_tick()) | ||
| 378 | return; | ||
| 379 | |||
| 380 | if (user_tick) | ||
| 381 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); | ||
| 382 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | ||
| 383 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, | ||
| 384 | one_jiffy_scaled); | ||
| 385 | else | ||
| 386 | account_idle_time(cputime_one_jiffy); | ||
| 387 | } | ||
| 388 | |||
| 389 | /* | ||
| 390 | * Account multiple ticks of steal time. | ||
| 391 | * @ticks: number of stolen ticks | ||
| 393 | */ | ||
| 394 | void account_steal_ticks(unsigned long ticks) | ||
| 395 | { | ||
| 396 | account_steal_time(jiffies_to_cputime(ticks)); | ||
| 397 | } | ||
| 398 | |||
| 399 | /* | ||
| 400 | * Account multiple ticks of idle time. | ||
| 401 | * @ticks: number of idle ticks | ||
| 402 | */ | ||
| 403 | void account_idle_ticks(unsigned long ticks) | ||
| 404 | { | ||
| 405 | |||
| 406 | if (sched_clock_irqtime) { | ||
| 407 | irqtime_account_idle_ticks(ticks); | ||
| 408 | return; | ||
| 409 | } | ||
| 410 | |||
| 411 | account_idle_time(jiffies_to_cputime(ticks)); | ||
| 412 | } | ||
| 413 | |||
| 414 | #endif | ||
| 415 | |||
| 416 | /* | ||
| 417 | * Use precise platform statistics if available: | ||
| 418 | */ | ||
| 419 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | ||
| 420 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 421 | { | ||
| 422 | *ut = p->utime; | ||
| 423 | *st = p->stime; | ||
| 424 | } | ||
| 425 | |||
| 426 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 427 | { | ||
| 428 | struct task_cputime cputime; | ||
| 429 | |||
| 430 | thread_group_cputime(p, &cputime); | ||
| 431 | |||
| 432 | *ut = cputime.utime; | ||
| 433 | *st = cputime.stime; | ||
| 434 | } | ||
| 435 | |||
| 436 | /* | ||
| 437 | * Archs that account the whole time spent in the idle task | ||
| 438 | * (outside irq) as idle time can rely on this and just implement | ||
| 439 | * vtime_account_system() and vtime_account_idle(). Archs that | ||
| 440 | * give idle time a different meaning (s390, for example, only includes | ||
| 441 | * the time spent by the CPU when it is in low power mode) must override | ||
| 442 | * vtime_account(). | ||
| 443 | */ | ||
| 444 | #ifndef __ARCH_HAS_VTIME_ACCOUNT | ||
| 445 | void vtime_account(struct task_struct *tsk) | ||
| 446 | { | ||
| 447 | unsigned long flags; | ||
| 448 | |||
| 449 | local_irq_save(flags); | ||
| 450 | |||
| 451 | if (in_interrupt() || !is_idle_task(tsk)) | ||
| 452 | vtime_account_system(tsk); | ||
| 453 | else | ||
| 454 | vtime_account_idle(tsk); | ||
| 455 | |||
| 456 | local_irq_restore(flags); | ||
| 457 | } | ||
| 458 | EXPORT_SYMBOL_GPL(vtime_account); | ||
| 459 | #endif /* __ARCH_HAS_VTIME_ACCOUNT */ | ||
| 460 | |||
| 461 | #else | ||
| 462 | |||
| 463 | #ifndef nsecs_to_cputime | ||
| 464 | # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) | ||
| 465 | #endif | ||
| 466 | |||
| 467 | static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total) | ||
| 468 | { | ||
| 469 | u64 temp = (__force u64) rtime; | ||
| 470 | |||
| 471 | temp *= (__force u64) utime; | ||
| 472 | |||
| 473 | if (sizeof(cputime_t) == 4) | ||
| 474 | temp = div_u64(temp, (__force u32) total); | ||
| 475 | else | ||
| 476 | temp = div64_u64(temp, (__force u64) total); | ||
| 477 | |||
| 478 | return (__force cputime_t) temp; | ||
| 479 | } | ||
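
scale_utime() reports the utime/total share of the precisely measured rtime, doing the multiply in 64 bits so it cannot overflow; the div_u64/div64_u64 split exists mainly because a full 64-by-64 divide is expensive on 32-bit architectures. A worked userspace example with invented jiffy counts:

/* utime is rescaled so that utime/total of rtime is reported. */
#include <stdio.h>
#include <stdint.h>

static uint64_t scale_utime(uint64_t utime, uint64_t rtime, uint64_t total)
{
	uint64_t temp = rtime;

	temp *= utime;		/* may exceed 32 bits, hence u64 */
	return temp / total;	/* utime * rtime / total */
}

int main(void)
{
	/* Sampled: 300 jiffies user + 100 jiffies system = 400 total,
	 * but CFS says the task really ran for 500 jiffies. */
	uint64_t utime = 300, stime = 100, rtime = 500;
	uint64_t scaled = scale_utime(utime, rtime, utime + stime);

	/* 300/400 of 500 = 375 jiffies reported as user time. */
	printf("scaled utime = %llu\n", (unsigned long long)scaled);
	return 0;
}
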
| 480 | |||
| 481 | void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 482 | { | ||
| 483 | cputime_t rtime, utime = p->utime, total = utime + p->stime; | ||
| 484 | |||
| 485 | /* | ||
| 486 | * Use CFS's precise accounting: | ||
| 487 | */ | ||
| 488 | rtime = nsecs_to_cputime(p->se.sum_exec_runtime); | ||
| 489 | |||
| 490 | if (total) | ||
| 491 | utime = scale_utime(utime, rtime, total); | ||
| 492 | else | ||
| 493 | utime = rtime; | ||
| 494 | |||
| 495 | /* | ||
| 496 | * Compare with previous values, to keep monotonicity: | ||
| 497 | */ | ||
| 498 | p->prev_utime = max(p->prev_utime, utime); | ||
| 499 | p->prev_stime = max(p->prev_stime, rtime - p->prev_utime); | ||
| 500 | |||
| 501 | *ut = p->prev_utime; | ||
| 502 | *st = p->prev_stime; | ||
| 503 | } | ||
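
The max() clamps keep the reported split monotonic: once some user time has been reported it is never taken back, and only the system share can absorb later shifts in the scaled split. A small sketch with invented numbers:

/* Reported utime/stime never go backwards between calls. */
#include <stdio.h>
#include <stdint.h>

#define max(a, b) ((a) > (b) ? (a) : (b))

static uint64_t prev_utime, prev_stime;

static void task_times(uint64_t utime, uint64_t rtime,
		       uint64_t *ut, uint64_t *st)
{
	prev_utime = max(prev_utime, utime);
	prev_stime = max(prev_stime, rtime - prev_utime);
	*ut = prev_utime;
	*st = prev_stime;
}

int main(void)
{
	uint64_t ut, st;

	task_times(375, 500, &ut, &st);	/* first sample: 375 user / 125 sys */
	printf("ut=%llu st=%llu\n", (unsigned long long)ut, (unsigned long long)st);

	/* Next sample scales user time lower (360 of 520); the clamp keeps
	 * utime at 375 and only lets stime grow, to 145. */
	task_times(360, 520, &ut, &st);
	printf("ut=%llu st=%llu\n", (unsigned long long)ut, (unsigned long long)st);
	return 0;
}
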
| 504 | |||
| 505 | /* | ||
| 506 | * Must be called with siglock held. | ||
| 507 | */ | ||
| 508 | void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) | ||
| 509 | { | ||
| 510 | struct signal_struct *sig = p->signal; | ||
| 511 | struct task_cputime cputime; | ||
| 512 | cputime_t rtime, utime, total; | ||
| 513 | |||
| 514 | thread_group_cputime(p, &cputime); | ||
| 515 | |||
| 516 | total = cputime.utime + cputime.stime; | ||
| 517 | rtime = nsecs_to_cputime(cputime.sum_exec_runtime); | ||
| 518 | |||
| 519 | if (total) | ||
| 520 | utime = scale_utime(cputime.utime, rtime, total); | ||
| 521 | else | ||
| 522 | utime = rtime; | ||
| 523 | |||
| 524 | sig->prev_utime = max(sig->prev_utime, utime); | ||
| 525 | sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime); | ||
| 526 | |||
| 527 | *ut = sig->prev_utime; | ||
| 528 | *st = sig->prev_stime; | ||
| 529 | } | ||
| 530 | #endif | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 96e2b18b6283..6b800a14b990 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
| @@ -597,7 +597,7 @@ calc_delta_fair(unsigned long delta, struct sched_entity *se) | |||
| 597 | /* | 597 | /* |
| 598 | * The idea is to set a period in which each task runs once. | 598 | * The idea is to set a period in which each task runs once. |
| 599 | * | 599 | * |
| 600 | * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch | 600 | * When there are too many tasks (sched_nr_latency) we have to stretch |
| 601 | * this period because otherwise the slices get too small. | 601 | * this period because otherwise the slices get too small. |
| 602 | * | 602 | * |
| 603 | * p = (nr <= nl) ? l : l*nr/nl | 603 | * p = (nr <= nl) ? l : l*nr/nl |
| @@ -2700,7 +2700,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
| 2700 | int prev_cpu = task_cpu(p); | 2700 | int prev_cpu = task_cpu(p); |
| 2701 | int new_cpu = cpu; | 2701 | int new_cpu = cpu; |
| 2702 | int want_affine = 0; | 2702 | int want_affine = 0; |
| 2703 | int want_sd = 1; | ||
| 2704 | int sync = wake_flags & WF_SYNC; | 2703 | int sync = wake_flags & WF_SYNC; |
| 2705 | 2704 | ||
| 2706 | if (p->nr_cpus_allowed == 1) | 2705 | if (p->nr_cpus_allowed == 1) |
| @@ -2718,48 +2717,21 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
| 2718 | continue; | 2717 | continue; |
| 2719 | 2718 | ||
| 2720 | /* | 2719 | /* |
| 2721 | * If power savings logic is enabled for a domain, see if we | ||
| 2722 | * are not overloaded, if so, don't balance wider. | ||
| 2723 | */ | ||
| 2724 | if (tmp->flags & (SD_PREFER_LOCAL)) { | ||
| 2725 | unsigned long power = 0; | ||
| 2726 | unsigned long nr_running = 0; | ||
| 2727 | unsigned long capacity; | ||
| 2728 | int i; | ||
| 2729 | |||
| 2730 | for_each_cpu(i, sched_domain_span(tmp)) { | ||
| 2731 | power += power_of(i); | ||
| 2732 | nr_running += cpu_rq(i)->cfs.nr_running; | ||
| 2733 | } | ||
| 2734 | |||
| 2735 | capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); | ||
| 2736 | |||
| 2737 | if (nr_running < capacity) | ||
| 2738 | want_sd = 0; | ||
| 2739 | } | ||
| 2740 | |||
| 2741 | /* | ||
| 2742 | * If both cpu and prev_cpu are part of this domain, | 2720 | * If both cpu and prev_cpu are part of this domain, |
| 2743 | * cpu is a valid SD_WAKE_AFFINE target. | 2721 | * cpu is a valid SD_WAKE_AFFINE target. |
| 2744 | */ | 2722 | */ |
| 2745 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && | 2723 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && |
| 2746 | cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { | 2724 | cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { |
| 2747 | affine_sd = tmp; | 2725 | affine_sd = tmp; |
| 2748 | want_affine = 0; | ||
| 2749 | } | ||
| 2750 | |||
| 2751 | if (!want_sd && !want_affine) | ||
| 2752 | break; | 2726 | break; |
| 2727 | } | ||
| 2753 | 2728 | ||
| 2754 | if (!(tmp->flags & sd_flag)) | 2729 | if (tmp->flags & sd_flag) |
| 2755 | continue; | ||
| 2756 | |||
| 2757 | if (want_sd) | ||
| 2758 | sd = tmp; | 2730 | sd = tmp; |
| 2759 | } | 2731 | } |
| 2760 | 2732 | ||
| 2761 | if (affine_sd) { | 2733 | if (affine_sd) { |
| 2762 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 2734 | if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) |
| 2763 | prev_cpu = cpu; | 2735 | prev_cpu = cpu; |
| 2764 | 2736 | ||
| 2765 | new_cpu = select_idle_sibling(p, prev_cpu); | 2737 | new_cpu = select_idle_sibling(p, prev_cpu); |
| @@ -4295,7 +4267,7 @@ redo: | |||
| 4295 | goto out_balanced; | 4267 | goto out_balanced; |
| 4296 | } | 4268 | } |
| 4297 | 4269 | ||
| 4298 | BUG_ON(busiest == this_rq); | 4270 | BUG_ON(busiest == env.dst_rq); |
| 4299 | 4271 | ||
| 4300 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); | 4272 | schedstat_add(sd, lb_imbalance[idle], env.imbalance); |
| 4301 | 4273 | ||
| @@ -4316,7 +4288,7 @@ redo: | |||
| 4316 | update_h_load(env.src_cpu); | 4288 | update_h_load(env.src_cpu); |
| 4317 | more_balance: | 4289 | more_balance: |
| 4318 | local_irq_save(flags); | 4290 | local_irq_save(flags); |
| 4319 | double_rq_lock(this_rq, busiest); | 4291 | double_rq_lock(env.dst_rq, busiest); |
| 4320 | 4292 | ||
| 4321 | /* | 4293 | /* |
| 4322 | * cur_ld_moved - load moved in current iteration | 4294 | * cur_ld_moved - load moved in current iteration |
| @@ -4324,7 +4296,7 @@ more_balance: | |||
| 4324 | */ | 4296 | */ |
| 4325 | cur_ld_moved = move_tasks(&env); | 4297 | cur_ld_moved = move_tasks(&env); |
| 4326 | ld_moved += cur_ld_moved; | 4298 | ld_moved += cur_ld_moved; |
| 4327 | double_rq_unlock(this_rq, busiest); | 4299 | double_rq_unlock(env.dst_rq, busiest); |
| 4328 | local_irq_restore(flags); | 4300 | local_irq_restore(flags); |
| 4329 | 4301 | ||
| 4330 | if (env.flags & LBF_NEED_BREAK) { | 4302 | if (env.flags & LBF_NEED_BREAK) { |
| @@ -4360,8 +4332,7 @@ more_balance: | |||
| 4360 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && | 4332 | if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && |
| 4361 | lb_iterations++ < max_lb_iterations) { | 4333 | lb_iterations++ < max_lb_iterations) { |
| 4362 | 4334 | ||
| 4363 | this_rq = cpu_rq(env.new_dst_cpu); | 4335 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
| 4364 | env.dst_rq = this_rq; | ||
| 4365 | env.dst_cpu = env.new_dst_cpu; | 4336 | env.dst_cpu = env.new_dst_cpu; |
| 4366 | env.flags &= ~LBF_SOME_PINNED; | 4337 | env.flags &= ~LBF_SOME_PINNED; |
| 4367 | env.loop = 0; | 4338 | env.loop = 0; |
| @@ -4646,7 +4617,7 @@ static void nohz_balancer_kick(int cpu) | |||
| 4646 | return; | 4617 | return; |
| 4647 | } | 4618 | } |
| 4648 | 4619 | ||
| 4649 | static inline void clear_nohz_tick_stopped(int cpu) | 4620 | static inline void nohz_balance_exit_idle(int cpu) |
| 4650 | { | 4621 | { |
| 4651 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { | 4622 | if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) { |
| 4652 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); | 4623 | cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); |
| @@ -4686,28 +4657,23 @@ void set_cpu_sd_state_idle(void) | |||
| 4686 | } | 4657 | } |
| 4687 | 4658 | ||
| 4688 | /* | 4659 | /* |
| 4689 | * This routine will record that this cpu is going idle with tick stopped. | 4660 | * This routine will record that the cpu is going idle with tick stopped. |
| 4690 | * This info will be used in performing idle load balancing in the future. | 4661 | * This info will be used in performing idle load balancing in the future. |
| 4691 | */ | 4662 | */ |
| 4692 | void select_nohz_load_balancer(int stop_tick) | 4663 | void nohz_balance_enter_idle(int cpu) |
| 4693 | { | 4664 | { |
| 4694 | int cpu = smp_processor_id(); | ||
| 4695 | |||
| 4696 | /* | 4665 | /* |
| 4697 | * If this cpu is going down, then nothing needs to be done. | 4666 | * If this cpu is going down, then nothing needs to be done. |
| 4698 | */ | 4667 | */ |
| 4699 | if (!cpu_active(cpu)) | 4668 | if (!cpu_active(cpu)) |
| 4700 | return; | 4669 | return; |
| 4701 | 4670 | ||
| 4702 | if (stop_tick) { | 4671 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) |
| 4703 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) | 4672 | return; |
| 4704 | return; | ||
| 4705 | 4673 | ||
| 4706 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); | 4674 | cpumask_set_cpu(cpu, nohz.idle_cpus_mask); |
| 4707 | atomic_inc(&nohz.nr_cpus); | 4675 | atomic_inc(&nohz.nr_cpus); |
| 4708 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); | 4676 | set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)); |
| 4709 | } | ||
| 4710 | return; | ||
| 4711 | } | 4677 | } |
| 4712 | 4678 | ||
| 4713 | static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | 4679 | static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, |
| @@ -4715,7 +4681,7 @@ static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb, | |||
| 4715 | { | 4681 | { |
| 4716 | switch (action & ~CPU_TASKS_FROZEN) { | 4682 | switch (action & ~CPU_TASKS_FROZEN) { |
| 4717 | case CPU_DYING: | 4683 | case CPU_DYING: |
| 4718 | clear_nohz_tick_stopped(smp_processor_id()); | 4684 | nohz_balance_exit_idle(smp_processor_id()); |
| 4719 | return NOTIFY_OK; | 4685 | return NOTIFY_OK; |
| 4720 | default: | 4686 | default: |
| 4721 | return NOTIFY_DONE; | 4687 | return NOTIFY_DONE; |
| @@ -4837,14 +4803,15 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) | |||
| 4837 | if (need_resched()) | 4803 | if (need_resched()) |
| 4838 | break; | 4804 | break; |
| 4839 | 4805 | ||
| 4840 | raw_spin_lock_irq(&this_rq->lock); | 4806 | rq = cpu_rq(balance_cpu); |
| 4841 | update_rq_clock(this_rq); | 4807 | |
| 4842 | update_idle_cpu_load(this_rq); | 4808 | raw_spin_lock_irq(&rq->lock); |
| 4843 | raw_spin_unlock_irq(&this_rq->lock); | 4809 | update_rq_clock(rq); |
| 4810 | update_idle_cpu_load(rq); | ||
| 4811 | raw_spin_unlock_irq(&rq->lock); | ||
| 4844 | 4812 | ||
| 4845 | rebalance_domains(balance_cpu, CPU_IDLE); | 4813 | rebalance_domains(balance_cpu, CPU_IDLE); |
| 4846 | 4814 | ||
| 4847 | rq = cpu_rq(balance_cpu); | ||
| 4848 | if (time_after(this_rq->next_balance, rq->next_balance)) | 4815 | if (time_after(this_rq->next_balance, rq->next_balance)) |
| 4849 | this_rq->next_balance = rq->next_balance; | 4816 | this_rq->next_balance = rq->next_balance; |
| 4850 | } | 4817 | } |
| @@ -4875,7 +4842,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu) | |||
| 4875 | * busy tick after returning from idle, we will update the busy stats. | 4842 | * busy tick after returning from idle, we will update the busy stats. |
| 4876 | */ | 4843 | */ |
| 4877 | set_cpu_sd_state_busy(); | 4844 | set_cpu_sd_state_busy(); |
| 4878 | clear_nohz_tick_stopped(cpu); | 4845 | nohz_balance_exit_idle(cpu); |
| 4879 | 4846 | ||
| 4880 | /* | 4847 | /* |
| 4881 | * None are in tickless mode and hence no need for NOHZ idle load | 4848 | * None are in tickless mode and hence no need for NOHZ idle load |
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index de00a486c5c6..eebefcad7027 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
| @@ -12,14 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) | |||
| 12 | SCHED_FEAT(START_DEBIT, true) | 12 | SCHED_FEAT(START_DEBIT, true) |
| 13 | 13 | ||
| 14 | /* | 14 | /* |
| 15 | * Based on load and program behaviour, see if it makes sense to place | ||
| 16 | * a newly woken task on the same cpu as the task that woke it -- | ||
| 17 | * improve cache locality. Typically used with SYNC wakeups as | ||
| 18 | * generated by pipes and the like, see also SYNC_WAKEUPS. | ||
| 19 | */ | ||
| 20 | SCHED_FEAT(AFFINE_WAKEUPS, true) | ||
| 21 | |||
| 22 | /* | ||
| 23 | * Prefer to schedule the task we woke last (assuming it failed | 15 | * Prefer to schedule the task we woke last (assuming it failed |
| 24 | * wakeup-preemption), since it's likely going to consume data we | 16 | * wakeup-preemption), since it's likely going to consume data we |
| 25 | * touched, which increases cache locality. | 17 | * touched, which increases cache locality. |
| @@ -42,7 +34,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) | |||
| 42 | /* | 34 | /* |
| 43 | * Use arch dependent cpu power functions | 35 | * Use arch dependent cpu power functions |
| 44 | */ | 36 | */ |
| 45 | SCHED_FEAT(ARCH_POWER, false) | 37 | SCHED_FEAT(ARCH_POWER, true) |
| 46 | 38 | ||
| 47 | SCHED_FEAT(HRTICK, false) | 39 | SCHED_FEAT(HRTICK, false) |
| 48 | SCHED_FEAT(DOUBLE_TICK, false) | 40 | SCHED_FEAT(DOUBLE_TICK, false) |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e0b7ba9c040f..418feb01344e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
| @@ -1632,11 +1632,6 @@ static int push_rt_task(struct rq *rq) | |||
| 1632 | if (!next_task) | 1632 | if (!next_task) |
| 1633 | return 0; | 1633 | return 0; |
| 1634 | 1634 | ||
| 1635 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 1636 | if (unlikely(task_running(rq, next_task))) | ||
| 1637 | return 0; | ||
| 1638 | #endif | ||
| 1639 | |||
| 1640 | retry: | 1635 | retry: |
| 1641 | if (unlikely(next_task == rq->curr)) { | 1636 | if (unlikely(next_task == rq->curr)) { |
| 1642 | WARN_ON(1); | 1637 | WARN_ON(1); |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0848fa36c383..7a7db09cfabc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
| @@ -737,11 +737,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | |||
| 737 | */ | 737 | */ |
| 738 | next->on_cpu = 1; | 738 | next->on_cpu = 1; |
| 739 | #endif | 739 | #endif |
| 740 | #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 741 | raw_spin_unlock_irq(&rq->lock); | ||
| 742 | #else | ||
| 743 | raw_spin_unlock(&rq->lock); | 740 | raw_spin_unlock(&rq->lock); |
| 744 | #endif | ||
| 745 | } | 741 | } |
| 746 | 742 | ||
| 747 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | 743 | static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) |
| @@ -755,9 +751,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) | |||
| 755 | smp_wmb(); | 751 | smp_wmb(); |
| 756 | prev->on_cpu = 0; | 752 | prev->on_cpu = 0; |
| 757 | #endif | 753 | #endif |
| 758 | #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW | ||
| 759 | local_irq_enable(); | 754 | local_irq_enable(); |
| 760 | #endif | ||
| 761 | } | 755 | } |
| 762 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 756 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
| 763 | 757 | ||
| @@ -891,6 +885,9 @@ struct cpuacct { | |||
| 891 | struct kernel_cpustat __percpu *cpustat; | 885 | struct kernel_cpustat __percpu *cpustat; |
| 892 | }; | 886 | }; |
| 893 | 887 | ||
| 888 | extern struct cgroup_subsys cpuacct_subsys; | ||
| 889 | extern struct cpuacct root_cpuacct; | ||
| 890 | |||
| 894 | /* return cpu accounting group corresponding to this container */ | 891 | /* return cpu accounting group corresponding to this container */ |
| 895 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) | 892 | static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) |
| 896 | { | 893 | { |
| @@ -917,6 +914,16 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); | |||
| 917 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | 914 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} |
| 918 | #endif | 915 | #endif |
| 919 | 916 | ||
| 917 | #ifdef CONFIG_PARAVIRT | ||
| 918 | static inline u64 steal_ticks(u64 steal) | ||
| 919 | { | ||
| 920 | if (unlikely(steal > NSEC_PER_SEC)) | ||
| 921 | return div_u64(steal, TICK_NSEC); | ||
| 922 | |||
| 923 | return __iter_div_u64_rem(steal, TICK_NSEC, &steal); | ||
| 924 | } | ||
| 925 | #endif | ||
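
steal_ticks() prefers __iter_div_u64_rem() because steal deltas are normally only a handful of ticks, where repeated subtraction beats a full 64-bit divide on 32-bit hardware; only the unlikely >1s case falls back to a real divide. A userspace approximation, assuming a 1 ms tick:

/* Sketch of the division strategy in steal_ticks(); constants and the
 * helper below are illustrative, not the kernel's implementations. */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL
#define TICK_NSEC       1000000ULL	/* assumed 1 ms tick */

static uint64_t iter_div_u64_rem(uint64_t dividend, uint64_t divisor,
				 uint64_t *remainder)
{
	uint64_t ret = 0;

	while (dividend >= divisor) {	/* few iterations for small values */
		dividend -= divisor;
		ret++;
	}
	*remainder = dividend;
	return ret;
}

static uint64_t steal_ticks(uint64_t steal)
{
	if (steal > NSEC_PER_SEC)	/* unlikely: fall back to a real divide */
		return steal / TICK_NSEC;

	return iter_div_u64_rem(steal, TICK_NSEC, &steal);
}

int main(void)
{
	printf("%llu\n", (unsigned long long)steal_ticks(3500000));		/* 3 */
	printf("%llu\n", (unsigned long long)steal_ticks(2 * NSEC_PER_SEC));	/* 2000 */
	return 0;
}
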
| 926 | |||
| 920 | static inline void inc_nr_running(struct rq *rq) | 927 | static inline void inc_nr_running(struct rq *rq) |
| 921 | { | 928 | { |
| 922 | rq->nr_running++; | 929 | rq->nr_running++; |
| @@ -1156,3 +1163,53 @@ enum rq_nohz_flag_bits { | |||
| 1156 | 1163 | ||
| 1157 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) | 1164 | #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) |
| 1158 | #endif | 1165 | #endif |
| 1166 | |||
| 1167 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
| 1168 | |||
| 1169 | DECLARE_PER_CPU(u64, cpu_hardirq_time); | ||
| 1170 | DECLARE_PER_CPU(u64, cpu_softirq_time); | ||
| 1171 | |||
| 1172 | #ifndef CONFIG_64BIT | ||
| 1173 | DECLARE_PER_CPU(seqcount_t, irq_time_seq); | ||
| 1174 | |||
| 1175 | static inline void irq_time_write_begin(void) | ||
| 1176 | { | ||
| 1177 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 1178 | smp_wmb(); | ||
| 1179 | } | ||
| 1180 | |||
| 1181 | static inline void irq_time_write_end(void) | ||
| 1182 | { | ||
| 1183 | smp_wmb(); | ||
| 1184 | __this_cpu_inc(irq_time_seq.sequence); | ||
| 1185 | } | ||
| 1186 | |||
| 1187 | static inline u64 irq_time_read(int cpu) | ||
| 1188 | { | ||
| 1189 | u64 irq_time; | ||
| 1190 | unsigned seq; | ||
| 1191 | |||
| 1192 | do { | ||
| 1193 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
| 1194 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
| 1195 | per_cpu(cpu_hardirq_time, cpu); | ||
| 1196 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
| 1197 | |||
| 1198 | return irq_time; | ||
| 1199 | } | ||
| 1200 | #else /* CONFIG_64BIT */ | ||
| 1201 | static inline void irq_time_write_begin(void) | ||
| 1202 | { | ||
| 1203 | } | ||
| 1204 | |||
| 1205 | static inline void irq_time_write_end(void) | ||
| 1206 | { | ||
| 1207 | } | ||
| 1208 | |||
| 1209 | static inline u64 irq_time_read(int cpu) | ||
| 1210 | { | ||
| 1211 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
| 1212 | } | ||
| 1213 | #endif /* CONFIG_64BIT */ | ||
| 1214 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
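
On 32-bit kernels a reader could otherwise see a torn 64-bit value, so irq_time_read() retries whenever the sequence count indicates a concurrent writer. The userspace sketch below shows only that retry protocol; it drops the smp_wmb()/read_seqcount_begin() barriers, so it illustrates the logic rather than SMP-safe code:

/* Seqcount pattern: writer bumps the counter around the update, reader
 * retries if the counter was odd or changed underneath it. */
#include <stdio.h>
#include <stdint.h>

static unsigned int seq;
static uint64_t hardirq_time, softirq_time;

static void irq_time_write(uint64_t hi_delta, uint64_t si_delta)
{
	seq++;			/* odd: write in progress */
	hardirq_time += hi_delta;
	softirq_time += si_delta;
	seq++;			/* even again: consistent snapshot */
}

static uint64_t irq_time_read(void)
{
	unsigned int start;
	uint64_t sum;

	do {
		start = seq;
		sum = hardirq_time + softirq_time;
	} while ((start & 1) || start != seq);	/* retry on concurrent write */

	return sum;
}

int main(void)
{
	irq_time_write(1000, 250);
	irq_time_write(500, 0);
	printf("irq time = %llu ns\n", (unsigned long long)irq_time_read());
	return 0;
}
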
| 1215 | |||
diff --git a/kernel/signal.c b/kernel/signal.c index be4f856d52f8..0af8868525d6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
| @@ -17,6 +17,7 @@ | |||
| 17 | #include <linux/fs.h> | 17 | #include <linux/fs.h> |
| 18 | #include <linux/tty.h> | 18 | #include <linux/tty.h> |
| 19 | #include <linux/binfmts.h> | 19 | #include <linux/binfmts.h> |
| 20 | #include <linux/coredump.h> | ||
| 20 | #include <linux/security.h> | 21 | #include <linux/security.h> |
| 21 | #include <linux/syscalls.h> | 22 | #include <linux/syscalls.h> |
| 22 | #include <linux/ptrace.h> | 23 | #include <linux/ptrace.h> |
| @@ -1971,13 +1972,8 @@ static void ptrace_do_notify(int signr, int exit_code, int why) | |||
| 1971 | void ptrace_notify(int exit_code) | 1972 | void ptrace_notify(int exit_code) |
| 1972 | { | 1973 | { |
| 1973 | BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); | 1974 | BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP); |
| 1974 | if (unlikely(current->task_works)) { | 1975 | if (unlikely(current->task_works)) |
| 1975 | if (test_and_clear_ti_thread_flag(current_thread_info(), | 1976 | task_work_run(); |
| 1976 | TIF_NOTIFY_RESUME)) { | ||
| 1977 | smp_mb__after_clear_bit(); | ||
| 1978 | task_work_run(); | ||
| 1979 | } | ||
| 1980 | } | ||
| 1981 | 1977 | ||
| 1982 | spin_lock_irq(¤t->sighand->siglock); | 1978 | spin_lock_irq(¤t->sighand->siglock); |
| 1983 | ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); | 1979 | ptrace_do_notify(SIGTRAP, exit_code, CLD_TRAPPED); |
| @@ -2198,13 +2194,8 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, | |||
| 2198 | struct signal_struct *signal = current->signal; | 2194 | struct signal_struct *signal = current->signal; |
| 2199 | int signr; | 2195 | int signr; |
| 2200 | 2196 | ||
| 2201 | if (unlikely(current->task_works)) { | 2197 | if (unlikely(current->task_works)) |
| 2202 | if (test_and_clear_ti_thread_flag(current_thread_info(), | 2198 | task_work_run(); |
| 2203 | TIF_NOTIFY_RESUME)) { | ||
| 2204 | smp_mb__after_clear_bit(); | ||
| 2205 | task_work_run(); | ||
| 2206 | } | ||
| 2207 | } | ||
| 2208 | 2199 | ||
| 2209 | if (unlikely(uprobe_deny_signal())) | 2200 | if (unlikely(uprobe_deny_signal())) |
| 2210 | return 0; | 2201 | return 0; |
| @@ -2369,7 +2360,7 @@ relock: | |||
| 2369 | * first and our do_group_exit call below will use | 2360 | * first and our do_group_exit call below will use |
| 2370 | * that value and ignore the one we pass it. | 2361 | * that value and ignore the one we pass it. |
| 2371 | */ | 2362 | */ |
| 2372 | do_coredump(info->si_signo, info->si_signo, regs); | 2363 | do_coredump(info, regs); |
| 2373 | } | 2364 | } |
| 2374 | 2365 | ||
| 2375 | /* | 2366 | /* |
diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 98f60c5caa1b..d6c5fc054242 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c | |||
| @@ -1,14 +1,22 @@ | |||
| 1 | /* | 1 | /* |
| 2 | * Common SMP CPU bringup/teardown functions | 2 | * Common SMP CPU bringup/teardown functions |
| 3 | */ | 3 | */ |
| 4 | #include <linux/cpu.h> | ||
| 4 | #include <linux/err.h> | 5 | #include <linux/err.h> |
| 5 | #include <linux/smp.h> | 6 | #include <linux/smp.h> |
| 6 | #include <linux/init.h> | 7 | #include <linux/init.h> |
| 8 | #include <linux/list.h> | ||
| 9 | #include <linux/slab.h> | ||
| 7 | #include <linux/sched.h> | 10 | #include <linux/sched.h> |
| 11 | #include <linux/export.h> | ||
| 8 | #include <linux/percpu.h> | 12 | #include <linux/percpu.h> |
| 13 | #include <linux/kthread.h> | ||
| 14 | #include <linux/smpboot.h> | ||
| 9 | 15 | ||
| 10 | #include "smpboot.h" | 16 | #include "smpboot.h" |
| 11 | 17 | ||
| 18 | #ifdef CONFIG_SMP | ||
| 19 | |||
| 12 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD | 20 | #ifdef CONFIG_GENERIC_SMP_IDLE_THREAD |
| 13 | /* | 21 | /* |
| 14 | * For the hotplug case we keep the task structs around and reuse | 22 | * For the hotplug case we keep the task structs around and reuse |
| @@ -65,3 +73,228 @@ void __init idle_threads_init(void) | |||
| 65 | } | 73 | } |
| 66 | } | 74 | } |
| 67 | #endif | 75 | #endif |
| 76 | |||
| 77 | #endif /* #ifdef CONFIG_SMP */ | ||
| 78 | |||
| 79 | static LIST_HEAD(hotplug_threads); | ||
| 80 | static DEFINE_MUTEX(smpboot_threads_lock); | ||
| 81 | |||
| 82 | struct smpboot_thread_data { | ||
| 83 | unsigned int cpu; | ||
| 84 | unsigned int status; | ||
| 85 | struct smp_hotplug_thread *ht; | ||
| 86 | }; | ||
| 87 | |||
| 88 | enum { | ||
| 89 | HP_THREAD_NONE = 0, | ||
| 90 | HP_THREAD_ACTIVE, | ||
| 91 | HP_THREAD_PARKED, | ||
| 92 | }; | ||
| 93 | |||
| 94 | /** | ||
| 95 | * smpboot_thread_fn - percpu hotplug thread loop function | ||
| 96 | * @data: thread data pointer | ||
| 97 | * | ||
| 98 | * Checks for thread stop and park conditions. Calls the necessary | ||
| 99 | * setup, cleanup, park and unpark functions for the registered | ||
| 100 | * thread. | ||
| 101 | * | ||
| 103 | * Returns 0 when the thread is asked to stop; otherwise it loops indefinitely. | ||
| 103 | */ | ||
| 104 | static int smpboot_thread_fn(void *data) | ||
| 105 | { | ||
| 106 | struct smpboot_thread_data *td = data; | ||
| 107 | struct smp_hotplug_thread *ht = td->ht; | ||
| 108 | |||
| 109 | while (1) { | ||
| 110 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 111 | preempt_disable(); | ||
| 112 | if (kthread_should_stop()) { | ||
| 113 | set_current_state(TASK_RUNNING); | ||
| 114 | preempt_enable(); | ||
| 115 | if (ht->cleanup) | ||
| 116 | ht->cleanup(td->cpu, cpu_online(td->cpu)); | ||
| 117 | kfree(td); | ||
| 118 | return 0; | ||
| 119 | } | ||
| 120 | |||
| 121 | if (kthread_should_park()) { | ||
| 122 | __set_current_state(TASK_RUNNING); | ||
| 123 | preempt_enable(); | ||
| 124 | if (ht->park && td->status == HP_THREAD_ACTIVE) { | ||
| 125 | BUG_ON(td->cpu != smp_processor_id()); | ||
| 126 | ht->park(td->cpu); | ||
| 127 | td->status = HP_THREAD_PARKED; | ||
| 128 | } | ||
| 129 | kthread_parkme(); | ||
| 130 | /* We might have been woken for stop */ | ||
| 131 | continue; | ||
| 132 | } | ||
| 133 | |||
| 134 | BUG_ON(td->cpu != smp_processor_id()); | ||
| 135 | |||
| 136 | /* Check for state change setup */ | ||
| 137 | switch (td->status) { | ||
| 138 | case HP_THREAD_NONE: | ||
| 139 | preempt_enable(); | ||
| 140 | if (ht->setup) | ||
| 141 | ht->setup(td->cpu); | ||
| 142 | td->status = HP_THREAD_ACTIVE; | ||
| 143 | preempt_disable(); | ||
| 144 | break; | ||
| 145 | case HP_THREAD_PARKED: | ||
| 146 | preempt_enable(); | ||
| 147 | if (ht->unpark) | ||
| 148 | ht->unpark(td->cpu); | ||
| 149 | td->status = HP_THREAD_ACTIVE; | ||
| 150 | preempt_disable(); | ||
| 151 | break; | ||
| 152 | } | ||
| 153 | |||
| 154 | if (!ht->thread_should_run(td->cpu)) { | ||
| 155 | preempt_enable(); | ||
| 156 | schedule(); | ||
| 157 | } else { | ||
| 158 | set_current_state(TASK_RUNNING); | ||
| 159 | preempt_enable(); | ||
| 160 | ht->thread_fn(td->cpu); | ||
| 161 | } | ||
| 162 | } | ||
| 163 | } | ||
| 164 | |||
| 165 | static int | ||
| 166 | __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) | ||
| 167 | { | ||
| 168 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | ||
| 169 | struct smpboot_thread_data *td; | ||
| 170 | |||
| 171 | if (tsk) | ||
| 172 | return 0; | ||
| 173 | |||
| 174 | td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu)); | ||
| 175 | if (!td) | ||
| 176 | return -ENOMEM; | ||
| 177 | td->cpu = cpu; | ||
| 178 | td->ht = ht; | ||
| 179 | |||
| 180 | tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu, | ||
| 181 | ht->thread_comm); | ||
| 182 | if (IS_ERR(tsk)) { | ||
| 183 | kfree(td); | ||
| 184 | return PTR_ERR(tsk); | ||
| 185 | } | ||
| 186 | |||
| 187 | get_task_struct(tsk); | ||
| 188 | *per_cpu_ptr(ht->store, cpu) = tsk; | ||
| 189 | return 0; | ||
| 190 | } | ||
| 191 | |||
| 192 | int smpboot_create_threads(unsigned int cpu) | ||
| 193 | { | ||
| 194 | struct smp_hotplug_thread *cur; | ||
| 195 | int ret = 0; | ||
| 196 | |||
| 197 | mutex_lock(&smpboot_threads_lock); | ||
| 198 | list_for_each_entry(cur, &hotplug_threads, list) { | ||
| 199 | ret = __smpboot_create_thread(cur, cpu); | ||
| 200 | if (ret) | ||
| 201 | break; | ||
| 202 | } | ||
| 203 | mutex_unlock(&smpboot_threads_lock); | ||
| 204 | return ret; | ||
| 205 | } | ||
| 206 | |||
| 207 | static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cpu) | ||
| 208 | { | ||
| 209 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | ||
| 210 | |||
| 211 | kthread_unpark(tsk); | ||
| 212 | } | ||
| 213 | |||
| 214 | void smpboot_unpark_threads(unsigned int cpu) | ||
| 215 | { | ||
| 216 | struct smp_hotplug_thread *cur; | ||
| 217 | |||
| 218 | mutex_lock(&smpboot_threads_lock); | ||
| 219 | list_for_each_entry(cur, &hotplug_threads, list) | ||
| 220 | smpboot_unpark_thread(cur, cpu); | ||
| 221 | mutex_unlock(&smpboot_threads_lock); | ||
| 222 | } | ||
| 223 | |||
| 224 | static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) | ||
| 225 | { | ||
| 226 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | ||
| 227 | |||
| 228 | if (tsk) | ||
| 229 | kthread_park(tsk); | ||
| 230 | } | ||
| 231 | |||
| 232 | void smpboot_park_threads(unsigned int cpu) | ||
| 233 | { | ||
| 234 | struct smp_hotplug_thread *cur; | ||
| 235 | |||
| 236 | mutex_lock(&smpboot_threads_lock); | ||
| 237 | list_for_each_entry_reverse(cur, &hotplug_threads, list) | ||
| 238 | smpboot_park_thread(cur, cpu); | ||
| 239 | mutex_unlock(&smpboot_threads_lock); | ||
| 240 | } | ||
| 241 | |||
| 242 | static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) | ||
| 243 | { | ||
| 244 | unsigned int cpu; | ||
| 245 | |||
| 246 | /* We also need to destroy the parked threads of offline cpus */ | ||
| 247 | for_each_possible_cpu(cpu) { | ||
| 248 | struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); | ||
| 249 | |||
| 250 | if (tsk) { | ||
| 251 | kthread_stop(tsk); | ||
| 252 | put_task_struct(tsk); | ||
| 253 | *per_cpu_ptr(ht->store, cpu) = NULL; | ||
| 254 | } | ||
| 255 | } | ||
| 256 | } | ||
| 257 | |||
| 258 | /** | ||
| 259 | * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug | ||
| 260 | * @plug_thread: Hotplug thread descriptor | ||
| 261 | * | ||
| 262 | * Creates and starts the threads on all online cpus. | ||
| 263 | */ | ||
| 264 | int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) | ||
| 265 | { | ||
| 266 | unsigned int cpu; | ||
| 267 | int ret = 0; | ||
| 268 | |||
| 269 | mutex_lock(&smpboot_threads_lock); | ||
| 270 | for_each_online_cpu(cpu) { | ||
| 271 | ret = __smpboot_create_thread(plug_thread, cpu); | ||
| 272 | if (ret) { | ||
| 273 | smpboot_destroy_threads(plug_thread); | ||
| 274 | goto out; | ||
| 275 | } | ||
| 276 | smpboot_unpark_thread(plug_thread, cpu); | ||
| 277 | } | ||
| 278 | list_add(&plug_thread->list, &hotplug_threads); | ||
| 279 | out: | ||
| 280 | mutex_unlock(&smpboot_threads_lock); | ||
| 281 | return ret; | ||
| 282 | } | ||
| 283 | EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); | ||
| 284 | |||
| 285 | /** | ||
| 286 | * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug | ||
| 287 | * @plug_thread: Hotplug thread descriptor | ||
| 288 | * | ||
| 289 | * Stops all threads on all possible cpus. | ||
| 290 | */ | ||
| 291 | void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) | ||
| 292 | { | ||
| 293 | get_online_cpus(); | ||
| 294 | mutex_lock(&smpboot_threads_lock); | ||
| 295 | list_del(&plug_thread->list); | ||
| 296 | smpboot_destroy_threads(plug_thread); | ||
| 297 | mutex_unlock(&smpboot_threads_lock); | ||
| 298 | put_online_cpus(); | ||
| 299 | } | ||
| 300 | EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); | ||
diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 6ef9433e1c70..72415a0eb955 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h | |||
| @@ -13,4 +13,8 @@ static inline void idle_thread_set_boot_cpu(void) { } | |||
| 13 | static inline void idle_threads_init(void) { } | 13 | static inline void idle_threads_init(void) { } |
| 14 | #endif | 14 | #endif |
| 15 | 15 | ||
| 16 | int smpboot_create_threads(unsigned int cpu); | ||
| 17 | void smpboot_park_threads(unsigned int cpu); | ||
| 18 | void smpboot_unpark_threads(unsigned int cpu); | ||
| 19 | |||
| 16 | #endif | 20 | #endif |
diff --git a/kernel/softirq.c b/kernel/softirq.c index b73e681df09e..cc96bdc0c2c9 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
| @@ -23,6 +23,7 @@ | |||
| 23 | #include <linux/rcupdate.h> | 23 | #include <linux/rcupdate.h> |
| 24 | #include <linux/ftrace.h> | 24 | #include <linux/ftrace.h> |
| 25 | #include <linux/smp.h> | 25 | #include <linux/smp.h> |
| 26 | #include <linux/smpboot.h> | ||
| 26 | #include <linux/tick.h> | 27 | #include <linux/tick.h> |
| 27 | 28 | ||
| 28 | #define CREATE_TRACE_POINTS | 29 | #define CREATE_TRACE_POINTS |
| @@ -220,7 +221,7 @@ asmlinkage void __do_softirq(void) | |||
| 220 | current->flags &= ~PF_MEMALLOC; | 221 | current->flags &= ~PF_MEMALLOC; |
| 221 | 222 | ||
| 222 | pending = local_softirq_pending(); | 223 | pending = local_softirq_pending(); |
| 223 | account_system_vtime(current); | 224 | vtime_account(current); |
| 224 | 225 | ||
| 225 | __local_bh_disable((unsigned long)__builtin_return_address(0), | 226 | __local_bh_disable((unsigned long)__builtin_return_address(0), |
| 226 | SOFTIRQ_OFFSET); | 227 | SOFTIRQ_OFFSET); |
| @@ -271,7 +272,7 @@ restart: | |||
| 271 | 272 | ||
| 272 | lockdep_softirq_exit(); | 273 | lockdep_softirq_exit(); |
| 273 | 274 | ||
| 274 | account_system_vtime(current); | 275 | vtime_account(current); |
| 275 | __local_bh_enable(SOFTIRQ_OFFSET); | 276 | __local_bh_enable(SOFTIRQ_OFFSET); |
| 276 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); | 277 | tsk_restore_flags(current, old_flags, PF_MEMALLOC); |
| 277 | } | 278 | } |
| @@ -340,7 +341,7 @@ static inline void invoke_softirq(void) | |||
| 340 | */ | 341 | */ |
| 341 | void irq_exit(void) | 342 | void irq_exit(void) |
| 342 | { | 343 | { |
| 343 | account_system_vtime(current); | 344 | vtime_account(current); |
| 344 | trace_hardirq_exit(); | 345 | trace_hardirq_exit(); |
| 345 | sub_preempt_count(IRQ_EXIT_OFFSET); | 346 | sub_preempt_count(IRQ_EXIT_OFFSET); |
| 346 | if (!in_interrupt() && local_softirq_pending()) | 347 | if (!in_interrupt() && local_softirq_pending()) |
| @@ -742,49 +743,22 @@ void __init softirq_init(void) | |||
| 742 | open_softirq(HI_SOFTIRQ, tasklet_hi_action); | 743 | open_softirq(HI_SOFTIRQ, tasklet_hi_action); |
| 743 | } | 744 | } |
| 744 | 745 | ||
| 745 | static int run_ksoftirqd(void * __bind_cpu) | 746 | static int ksoftirqd_should_run(unsigned int cpu) |
| 746 | { | 747 | { |
| 747 | set_current_state(TASK_INTERRUPTIBLE); | 748 | return local_softirq_pending(); |
| 748 | 749 | } | |
| 749 | while (!kthread_should_stop()) { | ||
| 750 | preempt_disable(); | ||
| 751 | if (!local_softirq_pending()) { | ||
| 752 | schedule_preempt_disabled(); | ||
| 753 | } | ||
| 754 | |||
| 755 | __set_current_state(TASK_RUNNING); | ||
| 756 | |||
| 757 | while (local_softirq_pending()) { | ||
| 758 | /* Preempt disable stops cpu going offline. | ||
| 759 | If already offline, we'll be on wrong CPU: | ||
| 760 | don't process */ | ||
| 761 | if (cpu_is_offline((long)__bind_cpu)) | ||
| 762 | goto wait_to_die; | ||
| 763 | local_irq_disable(); | ||
| 764 | if (local_softirq_pending()) | ||
| 765 | __do_softirq(); | ||
| 766 | local_irq_enable(); | ||
| 767 | sched_preempt_enable_no_resched(); | ||
| 768 | cond_resched(); | ||
| 769 | preempt_disable(); | ||
| 770 | rcu_note_context_switch((long)__bind_cpu); | ||
| 771 | } | ||
| 772 | preempt_enable(); | ||
| 773 | set_current_state(TASK_INTERRUPTIBLE); | ||
| 774 | } | ||
| 775 | __set_current_state(TASK_RUNNING); | ||
| 776 | return 0; | ||
| 777 | 750 | ||
| 778 | wait_to_die: | 751 | static void run_ksoftirqd(unsigned int cpu) |
| 779 | preempt_enable(); | 752 | { |
| 780 | /* Wait for kthread_stop */ | 753 | local_irq_disable(); |
| 781 | set_current_state(TASK_INTERRUPTIBLE); | 754 | if (local_softirq_pending()) { |
| 782 | while (!kthread_should_stop()) { | 755 | __do_softirq(); |
| 783 | schedule(); | 756 | rcu_note_context_switch(cpu); |
| 784 | set_current_state(TASK_INTERRUPTIBLE); | 757 | local_irq_enable(); |
| 758 | cond_resched(); | ||
| 759 | return; | ||
| 785 | } | 760 | } |
| 786 | __set_current_state(TASK_RUNNING); | 761 | local_irq_enable(); |
| 787 | return 0; | ||
| 788 | } | 762 | } |
| 789 | 763 | ||
| 790 | #ifdef CONFIG_HOTPLUG_CPU | 764 | #ifdef CONFIG_HOTPLUG_CPU |
| @@ -850,50 +824,14 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, | |||
| 850 | unsigned long action, | 824 | unsigned long action, |
| 851 | void *hcpu) | 825 | void *hcpu) |
| 852 | { | 826 | { |
| 853 | int hotcpu = (unsigned long)hcpu; | ||
| 854 | struct task_struct *p; | ||
| 855 | |||
| 856 | switch (action) { | 827 | switch (action) { |
| 857 | case CPU_UP_PREPARE: | ||
| 858 | case CPU_UP_PREPARE_FROZEN: | ||
| 859 | p = kthread_create_on_node(run_ksoftirqd, | ||
| 860 | hcpu, | ||
| 861 | cpu_to_node(hotcpu), | ||
| 862 | "ksoftirqd/%d", hotcpu); | ||
| 863 | if (IS_ERR(p)) { | ||
| 864 | printk("ksoftirqd for %i failed\n", hotcpu); | ||
| 865 | return notifier_from_errno(PTR_ERR(p)); | ||
| 866 | } | ||
| 867 | kthread_bind(p, hotcpu); | ||
| 868 | per_cpu(ksoftirqd, hotcpu) = p; | ||
| 869 | break; | ||
| 870 | case CPU_ONLINE: | ||
| 871 | case CPU_ONLINE_FROZEN: | ||
| 872 | wake_up_process(per_cpu(ksoftirqd, hotcpu)); | ||
| 873 | break; | ||
| 874 | #ifdef CONFIG_HOTPLUG_CPU | 828 | #ifdef CONFIG_HOTPLUG_CPU |
| 875 | case CPU_UP_CANCELED: | ||
| 876 | case CPU_UP_CANCELED_FROZEN: | ||
| 877 | if (!per_cpu(ksoftirqd, hotcpu)) | ||
| 878 | break; | ||
| 879 | /* Unbind so it can run. Fall thru. */ | ||
| 880 | kthread_bind(per_cpu(ksoftirqd, hotcpu), | ||
| 881 | cpumask_any(cpu_online_mask)); | ||
| 882 | case CPU_DEAD: | 829 | case CPU_DEAD: |
| 883 | case CPU_DEAD_FROZEN: { | 830 | case CPU_DEAD_FROZEN: |
| 884 | static const struct sched_param param = { | 831 | takeover_tasklets((unsigned long)hcpu); |
| 885 | .sched_priority = MAX_RT_PRIO-1 | ||
| 886 | }; | ||
| 887 | |||
| 888 | p = per_cpu(ksoftirqd, hotcpu); | ||
| 889 | per_cpu(ksoftirqd, hotcpu) = NULL; | ||
| 890 | sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); | ||
| 891 | kthread_stop(p); | ||
| 892 | takeover_tasklets(hotcpu); | ||
| 893 | break; | 832 | break; |
| 894 | } | ||
| 895 | #endif /* CONFIG_HOTPLUG_CPU */ | 833 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 896 | } | 834 | } |
| 897 | return NOTIFY_OK; | 835 | return NOTIFY_OK; |
| 898 | } | 836 | } |
| 899 | 837 | ||
| @@ -901,14 +839,19 @@ static struct notifier_block __cpuinitdata cpu_nfb = { | |||
| 901 | .notifier_call = cpu_callback | 839 | .notifier_call = cpu_callback |
| 902 | }; | 840 | }; |
| 903 | 841 | ||
| 842 | static struct smp_hotplug_thread softirq_threads = { | ||
| 843 | .store = &ksoftirqd, | ||
| 844 | .thread_should_run = ksoftirqd_should_run, | ||
| 845 | .thread_fn = run_ksoftirqd, | ||
| 846 | .thread_comm = "ksoftirqd/%u", | ||
| 847 | }; | ||
| 848 | |||
| 904 | static __init int spawn_ksoftirqd(void) | 849 | static __init int spawn_ksoftirqd(void) |
| 905 | { | 850 | { |
| 906 | void *cpu = (void *)(long)smp_processor_id(); | ||
| 907 | int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | ||
| 908 | |||
| 909 | BUG_ON(err != NOTIFY_OK); | ||
| 910 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
| 911 | register_cpu_notifier(&cpu_nfb); | 851 | register_cpu_notifier(&cpu_nfb); |
| 852 | |||
| 853 | BUG_ON(smpboot_register_percpu_thread(&softirq_threads)); | ||
| 854 | |||
| 912 | return 0; | 855 | return 0; |
| 913 | } | 856 | } |
| 914 | early_initcall(spawn_ksoftirqd); | 857 | early_initcall(spawn_ksoftirqd); |
diff --git a/kernel/srcu.c b/kernel/srcu.c index 2095be3318d5..97c465ebd844 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c | |||
| @@ -379,7 +379,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | |||
| 379 | rcu_batch_queue(&sp->batch_queue, head); | 379 | rcu_batch_queue(&sp->batch_queue, head); |
| 380 | if (!sp->running) { | 380 | if (!sp->running) { |
| 381 | sp->running = true; | 381 | sp->running = true; |
| 382 | queue_delayed_work(system_nrt_wq, &sp->work, 0); | 382 | schedule_delayed_work(&sp->work, 0); |
| 383 | } | 383 | } |
| 384 | spin_unlock_irqrestore(&sp->queue_lock, flags); | 384 | spin_unlock_irqrestore(&sp->queue_lock, flags); |
| 385 | } | 385 | } |
| @@ -631,7 +631,7 @@ static void srcu_reschedule(struct srcu_struct *sp) | |||
| 631 | } | 631 | } |
| 632 | 632 | ||
| 633 | if (pending) | 633 | if (pending) |
| 634 | queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); | 634 | schedule_delayed_work(&sp->work, SRCU_INTERVAL); |
| 635 | } | 635 | } |
| 636 | 636 | ||
| 637 | /* | 637 | /* |
diff --git a/kernel/sys.c b/kernel/sys.c index 241507f23eca..c5cb5b99cb81 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
| @@ -368,6 +368,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier); | |||
| 368 | void kernel_restart(char *cmd) | 368 | void kernel_restart(char *cmd) |
| 369 | { | 369 | { |
| 370 | kernel_restart_prepare(cmd); | 370 | kernel_restart_prepare(cmd); |
| 371 | disable_nonboot_cpus(); | ||
| 371 | if (!cmd) | 372 | if (!cmd) |
| 372 | printk(KERN_EMERG "Restarting system.\n"); | 373 | printk(KERN_EMERG "Restarting system.\n"); |
| 373 | else | 374 | else |
| @@ -1788,15 +1789,15 @@ SYSCALL_DEFINE1(umask, int, mask) | |||
| 1788 | #ifdef CONFIG_CHECKPOINT_RESTORE | 1789 | #ifdef CONFIG_CHECKPOINT_RESTORE |
| 1789 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | 1790 | static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) |
| 1790 | { | 1791 | { |
| 1791 | struct file *exe_file; | 1792 | struct fd exe; |
| 1792 | struct dentry *dentry; | 1793 | struct dentry *dentry; |
| 1793 | int err; | 1794 | int err; |
| 1794 | 1795 | ||
| 1795 | exe_file = fget(fd); | 1796 | exe = fdget(fd); |
| 1796 | if (!exe_file) | 1797 | if (!exe.file) |
| 1797 | return -EBADF; | 1798 | return -EBADF; |
| 1798 | 1799 | ||
| 1799 | dentry = exe_file->f_path.dentry; | 1800 | dentry = exe.file->f_path.dentry; |
| 1800 | 1801 | ||
| 1801 | /* | 1802 | /* |
| 1802 | * Because the original mm->exe_file points to executable file, make | 1803 | * Because the original mm->exe_file points to executable file, make |
| @@ -1805,7 +1806,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
| 1805 | */ | 1806 | */ |
| 1806 | err = -EACCES; | 1807 | err = -EACCES; |
| 1807 | if (!S_ISREG(dentry->d_inode->i_mode) || | 1808 | if (!S_ISREG(dentry->d_inode->i_mode) || |
| 1808 | exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC) | 1809 | exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC) |
| 1809 | goto exit; | 1810 | goto exit; |
| 1810 | 1811 | ||
| 1811 | err = inode_permission(dentry->d_inode, MAY_EXEC); | 1812 | err = inode_permission(dentry->d_inode, MAY_EXEC); |
| @@ -1839,12 +1840,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) | |||
| 1839 | goto exit_unlock; | 1840 | goto exit_unlock; |
| 1840 | 1841 | ||
| 1841 | err = 0; | 1842 | err = 0; |
| 1842 | set_mm_exe_file(mm, exe_file); | 1843 | set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ |
| 1843 | exit_unlock: | 1844 | exit_unlock: |
| 1844 | up_write(&mm->mmap_sem); | 1845 | up_write(&mm->mmap_sem); |
| 1845 | 1846 | ||
| 1846 | exit: | 1847 | exit: |
| 1847 | fput(exe_file); | 1848 | fdput(exe); |
| 1848 | return err; | 1849 | return err; |
| 1849 | } | 1850 | } |
| 1850 | 1851 | ||
| @@ -2204,7 +2205,7 @@ static int __orderly_poweroff(void) | |||
| 2204 | return -ENOMEM; | 2205 | return -ENOMEM; |
| 2205 | } | 2206 | } |
| 2206 | 2207 | ||
| 2207 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_NO_WAIT, | 2208 | ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, |
| 2208 | NULL, argv_cleanup, NULL); | 2209 | NULL, argv_cleanup, NULL); |
| 2209 | if (ret == -ENOMEM) | 2210 | if (ret == -ENOMEM) |
| 2210 | argv_free(argv); | 2211 | argv_free(argv); |
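The prctl_set_mm_exe_file() hunk above is part of the tree-wide fget()/fput() to fdget()/fdput() conversion: struct fd carries the file pointer plus a note of whether a reference was actually taken, so the fast path can skip the atomic refcount when the descriptor table is not shared. A minimal sketch of the pattern (do_with_fd is an illustrative name):

#include <linux/file.h>

static int do_with_fd(unsigned int fd)
{
	struct fd f = fdget(fd);
	int err = -EBADF;

	if (!f.file)
		return err;

	err = 0;		/* ... work with f.file ... */

	fdput(f);		/* drops the reference only if fdget() took one */
	return err;
}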
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87174ef59161..26f65eaa01f9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
| @@ -97,10 +97,12 @@ | |||
| 97 | extern int sysctl_overcommit_memory; | 97 | extern int sysctl_overcommit_memory; |
| 98 | extern int sysctl_overcommit_ratio; | 98 | extern int sysctl_overcommit_ratio; |
| 99 | extern int max_threads; | 99 | extern int max_threads; |
| 100 | extern int core_uses_pid; | ||
| 101 | extern int suid_dumpable; | 100 | extern int suid_dumpable; |
| 101 | #ifdef CONFIG_COREDUMP | ||
| 102 | extern int core_uses_pid; | ||
| 102 | extern char core_pattern[]; | 103 | extern char core_pattern[]; |
| 103 | extern unsigned int core_pipe_limit; | 104 | extern unsigned int core_pipe_limit; |
| 105 | #endif | ||
| 104 | extern int pid_max; | 106 | extern int pid_max; |
| 105 | extern int min_free_kbytes; | 107 | extern int min_free_kbytes; |
| 106 | extern int pid_max_min, pid_max_max; | 108 | extern int pid_max_min, pid_max_max; |
| @@ -177,8 +179,10 @@ static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, | |||
| 177 | 179 | ||
| 178 | static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, | 180 | static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, |
| 179 | void __user *buffer, size_t *lenp, loff_t *ppos); | 181 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| 182 | #ifdef CONFIG_COREDUMP | ||
| 180 | static int proc_dostring_coredump(struct ctl_table *table, int write, | 183 | static int proc_dostring_coredump(struct ctl_table *table, int write, |
| 181 | void __user *buffer, size_t *lenp, loff_t *ppos); | 184 | void __user *buffer, size_t *lenp, loff_t *ppos); |
| 185 | #endif | ||
| 182 | 186 | ||
| 183 | #ifdef CONFIG_MAGIC_SYSRQ | 187 | #ifdef CONFIG_MAGIC_SYSRQ |
| 184 | /* Note: sysrq code uses it's own private copy */ | 188 | /* Note: sysrq code uses it's own private copy */ |
| @@ -307,7 +311,7 @@ static struct ctl_table kern_table[] = { | |||
| 307 | .extra2 = &max_sched_tunable_scaling, | 311 | .extra2 = &max_sched_tunable_scaling, |
| 308 | }, | 312 | }, |
| 309 | { | 313 | { |
| 310 | .procname = "sched_migration_cost", | 314 | .procname = "sched_migration_cost_ns", |
| 311 | .data = &sysctl_sched_migration_cost, | 315 | .data = &sysctl_sched_migration_cost, |
| 312 | .maxlen = sizeof(unsigned int), | 316 | .maxlen = sizeof(unsigned int), |
| 313 | .mode = 0644, | 317 | .mode = 0644, |
| @@ -321,14 +325,14 @@ static struct ctl_table kern_table[] = { | |||
| 321 | .proc_handler = proc_dointvec, | 325 | .proc_handler = proc_dointvec, |
| 322 | }, | 326 | }, |
| 323 | { | 327 | { |
| 324 | .procname = "sched_time_avg", | 328 | .procname = "sched_time_avg_ms", |
| 325 | .data = &sysctl_sched_time_avg, | 329 | .data = &sysctl_sched_time_avg, |
| 326 | .maxlen = sizeof(unsigned int), | 330 | .maxlen = sizeof(unsigned int), |
| 327 | .mode = 0644, | 331 | .mode = 0644, |
| 328 | .proc_handler = proc_dointvec, | 332 | .proc_handler = proc_dointvec, |
| 329 | }, | 333 | }, |
| 330 | { | 334 | { |
| 331 | .procname = "sched_shares_window", | 335 | .procname = "sched_shares_window_ns", |
| 332 | .data = &sysctl_sched_shares_window, | 336 | .data = &sysctl_sched_shares_window, |
| 333 | .maxlen = sizeof(unsigned int), | 337 | .maxlen = sizeof(unsigned int), |
| 334 | .mode = 0644, | 338 | .mode = 0644, |
| @@ -404,6 +408,7 @@ static struct ctl_table kern_table[] = { | |||
| 404 | .mode = 0644, | 408 | .mode = 0644, |
| 405 | .proc_handler = proc_dointvec, | 409 | .proc_handler = proc_dointvec, |
| 406 | }, | 410 | }, |
| 411 | #ifdef CONFIG_COREDUMP | ||
| 407 | { | 412 | { |
| 408 | .procname = "core_uses_pid", | 413 | .procname = "core_uses_pid", |
| 409 | .data = &core_uses_pid, | 414 | .data = &core_uses_pid, |
| @@ -425,6 +430,7 @@ static struct ctl_table kern_table[] = { | |||
| 425 | .mode = 0644, | 430 | .mode = 0644, |
| 426 | .proc_handler = proc_dointvec, | 431 | .proc_handler = proc_dointvec, |
| 427 | }, | 432 | }, |
| 433 | #endif | ||
| 428 | #ifdef CONFIG_PROC_SYSCTL | 434 | #ifdef CONFIG_PROC_SYSCTL |
| 429 | { | 435 | { |
| 430 | .procname = "tainted", | 436 | .procname = "tainted", |
| @@ -1543,8 +1549,7 @@ static struct ctl_table fs_table[] = { | |||
| 1543 | }; | 1549 | }; |
| 1544 | 1550 | ||
| 1545 | static struct ctl_table debug_table[] = { | 1551 | static struct ctl_table debug_table[] = { |
| 1546 | #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ | 1552 | #ifdef CONFIG_SYSCTL_EXCEPTION_TRACE |
| 1547 | defined(CONFIG_S390) || defined(CONFIG_TILE) | ||
| 1548 | { | 1553 | { |
| 1549 | .procname = "exception-trace", | 1554 | .procname = "exception-trace", |
| 1550 | .data = &show_unhandled_signals, | 1555 | .data = &show_unhandled_signals, |
| @@ -2036,12 +2041,14 @@ int proc_dointvec_minmax(struct ctl_table *table, int write, | |||
| 2036 | 2041 | ||
| 2037 | static void validate_coredump_safety(void) | 2042 | static void validate_coredump_safety(void) |
| 2038 | { | 2043 | { |
| 2044 | #ifdef CONFIG_COREDUMP | ||
| 2039 | if (suid_dumpable == SUID_DUMPABLE_SAFE && | 2045 | if (suid_dumpable == SUID_DUMPABLE_SAFE && |
| 2040 | core_pattern[0] != '/' && core_pattern[0] != '|') { | 2046 | core_pattern[0] != '/' && core_pattern[0] != '|') { |
| 2041 | printk(KERN_WARNING "Unsafe core_pattern used with "\ | 2047 | printk(KERN_WARNING "Unsafe core_pattern used with "\ |
| 2042 | "suid_dumpable=2. Pipe handler or fully qualified "\ | 2048 | "suid_dumpable=2. Pipe handler or fully qualified "\ |
| 2043 | "core dump path required.\n"); | 2049 | "core dump path required.\n"); |
| 2044 | } | 2050 | } |
| 2051 | #endif | ||
| 2045 | } | 2052 | } |
| 2046 | 2053 | ||
| 2047 | static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, | 2054 | static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, |
| @@ -2053,6 +2060,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, | |||
| 2053 | return error; | 2060 | return error; |
| 2054 | } | 2061 | } |
| 2055 | 2062 | ||
| 2063 | #ifdef CONFIG_COREDUMP | ||
| 2056 | static int proc_dostring_coredump(struct ctl_table *table, int write, | 2064 | static int proc_dostring_coredump(struct ctl_table *table, int write, |
| 2057 | void __user *buffer, size_t *lenp, loff_t *ppos) | 2065 | void __user *buffer, size_t *lenp, loff_t *ppos) |
| 2058 | { | 2066 | { |
| @@ -2061,6 +2069,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, | |||
| 2061 | validate_coredump_safety(); | 2069 | validate_coredump_safety(); |
| 2062 | return error; | 2070 | return error; |
| 2063 | } | 2071 | } |
| 2072 | #endif | ||
| 2064 | 2073 | ||
| 2065 | static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, | 2074 | static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, |
| 2066 | void __user *buffer, | 2075 | void __user *buffer, |
diff --git a/kernel/task_work.c b/kernel/task_work.c index d320d44903bd..65bd3c92d6f3 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c | |||
| @@ -2,26 +2,20 @@ | |||
| 2 | #include <linux/task_work.h> | 2 | #include <linux/task_work.h> |
| 3 | #include <linux/tracehook.h> | 3 | #include <linux/tracehook.h> |
| 4 | 4 | ||
| 5 | static struct callback_head work_exited; /* all we need is ->next == NULL */ | ||
| 6 | |||
| 5 | int | 7 | int |
| 6 | task_work_add(struct task_struct *task, struct callback_head *twork, bool notify) | 8 | task_work_add(struct task_struct *task, struct callback_head *work, bool notify) |
| 7 | { | 9 | { |
| 8 | struct callback_head *last, *first; | 10 | struct callback_head *head; |
| 9 | unsigned long flags; | ||
| 10 | 11 | ||
| 11 | /* | 12 | do { |
| 12 | * Not inserting the new work if the task has already passed | 13 | head = ACCESS_ONCE(task->task_works); |
| 13 | * exit_task_work() is the responisbility of callers. | 14 | if (unlikely(head == &work_exited)) |
| 14 | */ | 15 | return -ESRCH; |
| 15 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 16 | work->next = head; |
| 16 | last = task->task_works; | 17 | } while (cmpxchg(&task->task_works, head, work) != head); |
| 17 | first = last ? last->next : twork; | ||
| 18 | twork->next = first; | ||
| 19 | if (last) | ||
| 20 | last->next = twork; | ||
| 21 | task->task_works = twork; | ||
| 22 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | ||
| 23 | 18 | ||
| 24 | /* test_and_set_bit() implies mb(), see tracehook_notify_resume(). */ | ||
| 25 | if (notify) | 19 | if (notify) |
| 26 | set_notify_resume(task); | 20 | set_notify_resume(task); |
| 27 | return 0; | 21 | return 0; |
| @@ -30,52 +24,69 @@ task_work_add(struct task_struct *task, struct callback_head *twork, bool notify | |||
| 30 | struct callback_head * | 24 | struct callback_head * |
| 31 | task_work_cancel(struct task_struct *task, task_work_func_t func) | 25 | task_work_cancel(struct task_struct *task, task_work_func_t func) |
| 32 | { | 26 | { |
| 27 | struct callback_head **pprev = &task->task_works; | ||
| 28 | struct callback_head *work = NULL; | ||
| 33 | unsigned long flags; | 29 | unsigned long flags; |
| 34 | struct callback_head *last, *res = NULL; | 30 | /* |
| 35 | 31 | * If cmpxchg() fails we continue without updating pprev. | |
| 32 | * Either we raced with task_work_add() which added the | ||
| 33 | * new entry before this work, we will find it again. Or | ||
| 34 | * we raced with task_work_run(), *pprev == NULL/exited. | ||
| 35 | */ | ||
| 36 | raw_spin_lock_irqsave(&task->pi_lock, flags); | 36 | raw_spin_lock_irqsave(&task->pi_lock, flags); |
| 37 | last = task->task_works; | 37 | while ((work = ACCESS_ONCE(*pprev))) { |
| 38 | if (last) { | 38 | read_barrier_depends(); |
| 39 | struct callback_head *q = last, *p = q->next; | 39 | if (work->func != func) |
| 40 | while (1) { | 40 | pprev = &work->next; |
| 41 | if (p->func == func) { | 41 | else if (cmpxchg(pprev, work, work->next) == work) |
| 42 | q->next = p->next; | 42 | break; |
| 43 | if (p == last) | ||
| 44 | task->task_works = q == p ? NULL : q; | ||
| 45 | res = p; | ||
| 46 | break; | ||
| 47 | } | ||
| 48 | if (p == last) | ||
| 49 | break; | ||
| 50 | q = p; | ||
| 51 | p = q->next; | ||
| 52 | } | ||
| 53 | } | 43 | } |
| 54 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); | 44 | raw_spin_unlock_irqrestore(&task->pi_lock, flags); |
| 55 | return res; | 45 | |
| 46 | return work; | ||
| 56 | } | 47 | } |
| 57 | 48 | ||
| 58 | void task_work_run(void) | 49 | void task_work_run(void) |
| 59 | { | 50 | { |
| 60 | struct task_struct *task = current; | 51 | struct task_struct *task = current; |
| 61 | struct callback_head *p, *q; | 52 | struct callback_head *work, *head, *next; |
| 53 | |||
| 54 | for (;;) { | ||
| 55 | /* | ||
| 56 | * work->func() can do task_work_add(), do not set | ||
| 57 | * work_exited unless the list is empty. | ||
| 58 | */ | ||
| 59 | do { | ||
| 60 | work = ACCESS_ONCE(task->task_works); | ||
| 61 | head = !work && (task->flags & PF_EXITING) ? | ||
| 62 | &work_exited : NULL; | ||
| 63 | } while (cmpxchg(&task->task_works, work, head) != work); | ||
| 62 | 64 | ||
| 63 | while (1) { | 65 | if (!work) |
| 64 | raw_spin_lock_irq(&task->pi_lock); | 66 | break; |
| 65 | p = task->task_works; | 67 | /* |
| 66 | task->task_works = NULL; | 68 | * Synchronize with task_work_cancel(). It can't remove |
| 67 | raw_spin_unlock_irq(&task->pi_lock); | 69 | * the first entry == work, cmpxchg(task_works) should |
| 70 | * fail, but it can play with *work and other entries. | ||
| 71 | */ | ||
| 72 | raw_spin_unlock_wait(&task->pi_lock); | ||
| 73 | smp_mb(); | ||
| 68 | 74 | ||
| 69 | if (unlikely(!p)) | 75 | /* Reverse the list to run the works in fifo order */ |
| 70 | return; | 76 | head = NULL; |
| 77 | do { | ||
| 78 | next = work->next; | ||
| 79 | work->next = head; | ||
| 80 | head = work; | ||
| 81 | work = next; | ||
| 82 | } while (work); | ||
| 71 | 83 | ||
| 72 | q = p->next; /* head */ | 84 | work = head; |
| 73 | p->next = NULL; /* cut it */ | 85 | do { |
| 74 | while (q) { | 86 | next = work->next; |
| 75 | p = q->next; | 87 | work->func(work); |
| 76 | q->func(q); | 88 | work = next; |
| 77 | q = p; | ||
| 78 | cond_resched(); | 89 | cond_resched(); |
| 79 | } | 90 | } while (work); |
| 80 | } | 91 | } |
| 81 | } | 92 | } |
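The rewritten task_work_add() above replaces the pi_lock-protected list with a lock-free push: read the head, refuse if the exit sentinel has been installed, link the new entry, and retry the cmpxchg until no other CPU raced in between. The same shape in portable C11 atomics, as a standalone sketch (the kernel code uses ACCESS_ONCE()/cmpxchg() rather than <stdatomic.h>):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct cb_node { struct cb_node *next; };

static struct cb_node node_exited;	/* plays the role of work_exited */

static bool lifo_push(_Atomic(struct cb_node *) *head, struct cb_node *node)
{
	struct cb_node *first = atomic_load(head);

	do {
		if (first == &node_exited)
			return false;	/* consumer has already torn the list down */
		node->next = first;
		/* on failure, 'first' is reloaded with the current head and we retry */
	} while (!atomic_compare_exchange_weak(head, &first, node));

	return true;
}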
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index d0a32796550f..145bb4d3bd4d 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
| @@ -27,6 +27,7 @@ | |||
| 27 | #include <linux/cgroup.h> | 27 | #include <linux/cgroup.h> |
| 28 | #include <linux/fs.h> | 28 | #include <linux/fs.h> |
| 29 | #include <linux/file.h> | 29 | #include <linux/file.h> |
| 30 | #include <linux/pid_namespace.h> | ||
| 30 | #include <net/genetlink.h> | 31 | #include <net/genetlink.h> |
| 31 | #include <linux/atomic.h> | 32 | #include <linux/atomic.h> |
| 32 | 33 | ||
| @@ -174,7 +175,9 @@ static void send_cpu_listeners(struct sk_buff *skb, | |||
| 174 | up_write(&listeners->sem); | 175 | up_write(&listeners->sem); |
| 175 | } | 176 | } |
| 176 | 177 | ||
| 177 | static void fill_stats(struct task_struct *tsk, struct taskstats *stats) | 178 | static void fill_stats(struct user_namespace *user_ns, |
| 179 | struct pid_namespace *pid_ns, | ||
| 180 | struct task_struct *tsk, struct taskstats *stats) | ||
| 178 | { | 181 | { |
| 179 | memset(stats, 0, sizeof(*stats)); | 182 | memset(stats, 0, sizeof(*stats)); |
| 180 | /* | 183 | /* |
| @@ -190,7 +193,7 @@ static void fill_stats(struct task_struct *tsk, struct taskstats *stats) | |||
| 190 | stats->version = TASKSTATS_VERSION; | 193 | stats->version = TASKSTATS_VERSION; |
| 191 | stats->nvcsw = tsk->nvcsw; | 194 | stats->nvcsw = tsk->nvcsw; |
| 192 | stats->nivcsw = tsk->nivcsw; | 195 | stats->nivcsw = tsk->nivcsw; |
| 193 | bacct_add_tsk(stats, tsk); | 196 | bacct_add_tsk(user_ns, pid_ns, stats, tsk); |
| 194 | 197 | ||
| 195 | /* fill in extended acct fields */ | 198 | /* fill in extended acct fields */ |
| 196 | xacct_add_tsk(stats, tsk); | 199 | xacct_add_tsk(stats, tsk); |
| @@ -207,7 +210,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) | |||
| 207 | rcu_read_unlock(); | 210 | rcu_read_unlock(); |
| 208 | if (!tsk) | 211 | if (!tsk) |
| 209 | return -ESRCH; | 212 | return -ESRCH; |
| 210 | fill_stats(tsk, stats); | 213 | fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); |
| 211 | put_task_struct(tsk); | 214 | put_task_struct(tsk); |
| 212 | return 0; | 215 | return 0; |
| 213 | } | 216 | } |
| @@ -291,6 +294,12 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) | |||
| 291 | if (!cpumask_subset(mask, cpu_possible_mask)) | 294 | if (!cpumask_subset(mask, cpu_possible_mask)) |
| 292 | return -EINVAL; | 295 | return -EINVAL; |
| 293 | 296 | ||
| 297 | if (current_user_ns() != &init_user_ns) | ||
| 298 | return -EINVAL; | ||
| 299 | |||
| 300 | if (task_active_pid_ns(current) != &init_pid_ns) | ||
| 301 | return -EINVAL; | ||
| 302 | |||
| 294 | if (isadd == REGISTER) { | 303 | if (isadd == REGISTER) { |
| 295 | for_each_cpu(cpu, mask) { | 304 | for_each_cpu(cpu, mask) { |
| 296 | s = kmalloc_node(sizeof(struct listener), | 305 | s = kmalloc_node(sizeof(struct listener), |
| @@ -415,16 +424,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
| 415 | struct nlattr *na; | 424 | struct nlattr *na; |
| 416 | size_t size; | 425 | size_t size; |
| 417 | u32 fd; | 426 | u32 fd; |
| 418 | struct file *file; | 427 | struct fd f; |
| 419 | int fput_needed; | ||
| 420 | 428 | ||
| 421 | na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; | 429 | na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; |
| 422 | if (!na) | 430 | if (!na) |
| 423 | return -EINVAL; | 431 | return -EINVAL; |
| 424 | 432 | ||
| 425 | fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); | 433 | fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); |
| 426 | file = fget_light(fd, &fput_needed); | 434 | f = fdget(fd); |
| 427 | if (!file) | 435 | if (!f.file) |
| 428 | return 0; | 436 | return 0; |
| 429 | 437 | ||
| 430 | size = nla_total_size(sizeof(struct cgroupstats)); | 438 | size = nla_total_size(sizeof(struct cgroupstats)); |
| @@ -437,6 +445,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
| 437 | na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, | 445 | na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, |
| 438 | sizeof(struct cgroupstats)); | 446 | sizeof(struct cgroupstats)); |
| 439 | if (na == NULL) { | 447 | if (na == NULL) { |
| 448 | nlmsg_free(rep_skb); | ||
| 440 | rc = -EMSGSIZE; | 449 | rc = -EMSGSIZE; |
| 441 | goto err; | 450 | goto err; |
| 442 | } | 451 | } |
| @@ -444,7 +453,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
| 444 | stats = nla_data(na); | 453 | stats = nla_data(na); |
| 445 | memset(stats, 0, sizeof(*stats)); | 454 | memset(stats, 0, sizeof(*stats)); |
| 446 | 455 | ||
| 447 | rc = cgroupstats_build(stats, file->f_dentry); | 456 | rc = cgroupstats_build(stats, f.file->f_dentry); |
| 448 | if (rc < 0) { | 457 | if (rc < 0) { |
| 449 | nlmsg_free(rep_skb); | 458 | nlmsg_free(rep_skb); |
| 450 | goto err; | 459 | goto err; |
| @@ -453,7 +462,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
| 453 | rc = send_reply(rep_skb, info); | 462 | rc = send_reply(rep_skb, info); |
| 454 | 463 | ||
| 455 | err: | 464 | err: |
| 456 | fput_light(file, fput_needed); | 465 | fdput(f); |
| 457 | return rc; | 466 | return rc; |
| 458 | } | 467 | } |
| 459 | 468 | ||
| @@ -467,7 +476,7 @@ static int cmd_attr_register_cpumask(struct genl_info *info) | |||
| 467 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); | 476 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); |
| 468 | if (rc < 0) | 477 | if (rc < 0) |
| 469 | goto out; | 478 | goto out; |
| 470 | rc = add_del_listener(info->snd_pid, mask, REGISTER); | 479 | rc = add_del_listener(info->snd_portid, mask, REGISTER); |
| 471 | out: | 480 | out: |
| 472 | free_cpumask_var(mask); | 481 | free_cpumask_var(mask); |
| 473 | return rc; | 482 | return rc; |
| @@ -483,7 +492,7 @@ static int cmd_attr_deregister_cpumask(struct genl_info *info) | |||
| 483 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); | 492 | rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); |
| 484 | if (rc < 0) | 493 | if (rc < 0) |
| 485 | goto out; | 494 | goto out; |
| 486 | rc = add_del_listener(info->snd_pid, mask, DEREGISTER); | 495 | rc = add_del_listener(info->snd_portid, mask, DEREGISTER); |
| 487 | out: | 496 | out: |
| 488 | free_cpumask_var(mask); | 497 | free_cpumask_var(mask); |
| 489 | return rc; | 498 | return rc; |
| @@ -631,11 +640,12 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
| 631 | if (rc < 0) | 640 | if (rc < 0) |
| 632 | return; | 641 | return; |
| 633 | 642 | ||
| 634 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); | 643 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, |
| 644 | task_pid_nr_ns(tsk, &init_pid_ns)); | ||
| 635 | if (!stats) | 645 | if (!stats) |
| 636 | goto err; | 646 | goto err; |
| 637 | 647 | ||
| 638 | fill_stats(tsk, stats); | 648 | fill_stats(&init_user_ns, &init_pid_ns, tsk, stats); |
| 639 | 649 | ||
| 640 | /* | 650 | /* |
| 641 | * Doesn't matter if tsk is the leader or the last group member leaving | 651 | * Doesn't matter if tsk is the leader or the last group member leaving |
| @@ -643,7 +653,8 @@ void taskstats_exit(struct task_struct *tsk, int group_dead) | |||
| 643 | if (!is_thread_group || !group_dead) | 653 | if (!is_thread_group || !group_dead) |
| 644 | goto send; | 654 | goto send; |
| 645 | 655 | ||
| 646 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); | 656 | stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, |
| 657 | task_tgid_nr_ns(tsk, &init_pid_ns)); | ||
| 647 | if (!stats) | 658 | if (!stats) |
| 648 | goto err; | 659 | goto err; |
| 649 | 660 | ||
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 7e1ce012a851..30b6de0d977c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
| @@ -397,6 +397,30 @@ void clockevents_exchange_device(struct clock_event_device *old, | |||
| 397 | local_irq_restore(flags); | 397 | local_irq_restore(flags); |
| 398 | } | 398 | } |
| 399 | 399 | ||
| 400 | /** | ||
| 401 | * clockevents_suspend - suspend clock devices | ||
| 402 | */ | ||
| 403 | void clockevents_suspend(void) | ||
| 404 | { | ||
| 405 | struct clock_event_device *dev; | ||
| 406 | |||
| 407 | list_for_each_entry_reverse(dev, &clockevent_devices, list) | ||
| 408 | if (dev->suspend) | ||
| 409 | dev->suspend(dev); | ||
| 410 | } | ||
| 411 | |||
| 412 | /** | ||
| 413 | * clockevents_resume - resume clock devices | ||
| 414 | */ | ||
| 415 | void clockevents_resume(void) | ||
| 416 | { | ||
| 417 | struct clock_event_device *dev; | ||
| 418 | |||
| 419 | list_for_each_entry(dev, &clockevent_devices, list) | ||
| 420 | if (dev->resume) | ||
| 421 | dev->resume(dev); | ||
| 422 | } | ||
| 423 | |||
| 400 | #ifdef CONFIG_GENERIC_CLOCKEVENTS | 424 | #ifdef CONFIG_GENERIC_CLOCKEVENTS |
| 401 | /** | 425 | /** |
| 402 | * clockevents_notify - notification about relevant events | 426 | * clockevents_notify - notification about relevant events |
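The new clockevents_suspend()/clockevents_resume() above simply walk clockevent_devices and invoke per-device hooks (in reverse order on suspend), so a clock event driver only needs to fill in the callbacks. A hypothetical device showing the wiring; the name and the saved-state handling are made up for illustration:

#include <linux/clockchips.h>

static void my_clkevt_suspend(struct clock_event_device *dev)
{
	/* save programmable timer state before the platform powers it down */
}

static void my_clkevt_resume(struct clock_event_device *dev)
{
	/* reprogram the hardware from the saved state */
}

static struct clock_event_device my_clkevt = {
	.name		= "my-timer",
	.suspend	= my_clkevt_suspend,
	.resume		= my_clkevt_resume,
	/* .set_mode, .set_next_event, rating, etc. as usual */
};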
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3a9e5d5c1091..f423bdd035c2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
| @@ -372,7 +372,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
| 372 | * the scheduler tick in nohz_restart_sched_tick. | 372 | * the scheduler tick in nohz_restart_sched_tick. |
| 373 | */ | 373 | */ |
| 374 | if (!ts->tick_stopped) { | 374 | if (!ts->tick_stopped) { |
| 375 | select_nohz_load_balancer(1); | 375 | nohz_balance_enter_idle(cpu); |
| 376 | calc_load_enter_idle(); | 376 | calc_load_enter_idle(); |
| 377 | 377 | ||
| 378 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); | 378 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
| @@ -436,7 +436,8 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) | |||
| 436 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { | 436 | if (unlikely(local_softirq_pending() && cpu_online(cpu))) { |
| 437 | static int ratelimit; | 437 | static int ratelimit; |
| 438 | 438 | ||
| 439 | if (ratelimit < 10) { | 439 | if (ratelimit < 10 && |
| 440 | (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { | ||
| 440 | printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", | 441 | printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", |
| 441 | (unsigned int) local_softirq_pending()); | 442 | (unsigned int) local_softirq_pending()); |
| 442 | ratelimit++; | 443 | ratelimit++; |
| @@ -569,7 +570,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) | |||
| 569 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | 570 | static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) |
| 570 | { | 571 | { |
| 571 | /* Update jiffies first */ | 572 | /* Update jiffies first */ |
| 572 | select_nohz_load_balancer(0); | ||
| 573 | tick_do_update_jiffies64(now); | 573 | tick_do_update_jiffies64(now); |
| 574 | update_cpu_load_nohz(); | 574 | update_cpu_load_nohz(); |
| 575 | 575 | ||
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d3b91e75cecd..5ce06a3fa91e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
| @@ -776,6 +776,7 @@ static void timekeeping_resume(void) | |||
| 776 | 776 | ||
| 777 | read_persistent_clock(&ts); | 777 | read_persistent_clock(&ts); |
| 778 | 778 | ||
| 779 | clockevents_resume(); | ||
| 779 | clocksource_resume(); | 780 | clocksource_resume(); |
| 780 | 781 | ||
| 781 | write_seqlock_irqsave(&tk->lock, flags); | 782 | write_seqlock_irqsave(&tk->lock, flags); |
| @@ -835,6 +836,7 @@ static int timekeeping_suspend(void) | |||
| 835 | 836 | ||
| 836 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); | 837 | clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); |
| 837 | clocksource_suspend(); | 838 | clocksource_suspend(); |
| 839 | clockevents_suspend(); | ||
| 838 | 840 | ||
| 839 | return 0; | 841 | return 0; |
| 840 | } | 842 | } |
diff --git a/kernel/timer.c b/kernel/timer.c index 8c5e7b908c68..d5de1b2292aa 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
| @@ -92,24 +92,25 @@ static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; | |||
| 92 | /* Functions below help us manage 'deferrable' flag */ | 92 | /* Functions below help us manage 'deferrable' flag */ |
| 93 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) | 93 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) |
| 94 | { | 94 | { |
| 95 | return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); | 95 | return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); |
| 96 | } | 96 | } |
| 97 | 97 | ||
| 98 | static inline struct tvec_base *tbase_get_base(struct tvec_base *base) | 98 | static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) |
| 99 | { | 99 | { |
| 100 | return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); | 100 | return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); |
| 101 | } | 101 | } |
| 102 | 102 | ||
| 103 | static inline void timer_set_deferrable(struct timer_list *timer) | 103 | static inline struct tvec_base *tbase_get_base(struct tvec_base *base) |
| 104 | { | 104 | { |
| 105 | timer->base = TBASE_MAKE_DEFERRED(timer->base); | 105 | return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); |
| 106 | } | 106 | } |
| 107 | 107 | ||
| 108 | static inline void | 108 | static inline void |
| 109 | timer_set_base(struct timer_list *timer, struct tvec_base *new_base) | 109 | timer_set_base(struct timer_list *timer, struct tvec_base *new_base) |
| 110 | { | 110 | { |
| 111 | timer->base = (struct tvec_base *)((unsigned long)(new_base) | | 111 | unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; |
| 112 | tbase_get_deferrable(timer->base)); | 112 | |
| 113 | timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags); | ||
| 113 | } | 114 | } |
| 114 | 115 | ||
| 115 | static unsigned long round_jiffies_common(unsigned long j, int cpu, | 116 | static unsigned long round_jiffies_common(unsigned long j, int cpu, |
| @@ -563,16 +564,14 @@ static inline void debug_timer_assert_init(struct timer_list *timer) | |||
| 563 | debug_object_assert_init(timer, &timer_debug_descr); | 564 | debug_object_assert_init(timer, &timer_debug_descr); |
| 564 | } | 565 | } |
| 565 | 566 | ||
| 566 | static void __init_timer(struct timer_list *timer, | 567 | static void do_init_timer(struct timer_list *timer, unsigned int flags, |
| 567 | const char *name, | 568 | const char *name, struct lock_class_key *key); |
| 568 | struct lock_class_key *key); | ||
| 569 | 569 | ||
| 570 | void init_timer_on_stack_key(struct timer_list *timer, | 570 | void init_timer_on_stack_key(struct timer_list *timer, unsigned int flags, |
| 571 | const char *name, | 571 | const char *name, struct lock_class_key *key) |
| 572 | struct lock_class_key *key) | ||
| 573 | { | 572 | { |
| 574 | debug_object_init_on_stack(timer, &timer_debug_descr); | 573 | debug_object_init_on_stack(timer, &timer_debug_descr); |
| 575 | __init_timer(timer, name, key); | 574 | do_init_timer(timer, flags, name, key); |
| 576 | } | 575 | } |
| 577 | EXPORT_SYMBOL_GPL(init_timer_on_stack_key); | 576 | EXPORT_SYMBOL_GPL(init_timer_on_stack_key); |
| 578 | 577 | ||
| @@ -613,12 +612,13 @@ static inline void debug_assert_init(struct timer_list *timer) | |||
| 613 | debug_timer_assert_init(timer); | 612 | debug_timer_assert_init(timer); |
| 614 | } | 613 | } |
| 615 | 614 | ||
| 616 | static void __init_timer(struct timer_list *timer, | 615 | static void do_init_timer(struct timer_list *timer, unsigned int flags, |
| 617 | const char *name, | 616 | const char *name, struct lock_class_key *key) |
| 618 | struct lock_class_key *key) | ||
| 619 | { | 617 | { |
| 618 | struct tvec_base *base = __raw_get_cpu_var(tvec_bases); | ||
| 619 | |||
| 620 | timer->entry.next = NULL; | 620 | timer->entry.next = NULL; |
| 621 | timer->base = __raw_get_cpu_var(tvec_bases); | 621 | timer->base = (void *)((unsigned long)base | flags); |
| 622 | timer->slack = -1; | 622 | timer->slack = -1; |
| 623 | #ifdef CONFIG_TIMER_STATS | 623 | #ifdef CONFIG_TIMER_STATS |
| 624 | timer->start_site = NULL; | 624 | timer->start_site = NULL; |
| @@ -628,22 +628,10 @@ static void __init_timer(struct timer_list *timer, | |||
| 628 | lockdep_init_map(&timer->lockdep_map, name, key, 0); | 628 | lockdep_init_map(&timer->lockdep_map, name, key, 0); |
| 629 | } | 629 | } |
| 630 | 630 | ||
| 631 | void setup_deferrable_timer_on_stack_key(struct timer_list *timer, | ||
| 632 | const char *name, | ||
| 633 | struct lock_class_key *key, | ||
| 634 | void (*function)(unsigned long), | ||
| 635 | unsigned long data) | ||
| 636 | { | ||
| 637 | timer->function = function; | ||
| 638 | timer->data = data; | ||
| 639 | init_timer_on_stack_key(timer, name, key); | ||
| 640 | timer_set_deferrable(timer); | ||
| 641 | } | ||
| 642 | EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); | ||
| 643 | |||
| 644 | /** | 631 | /** |
| 645 | * init_timer_key - initialize a timer | 632 | * init_timer_key - initialize a timer |
| 646 | * @timer: the timer to be initialized | 633 | * @timer: the timer to be initialized |
| 634 | * @flags: timer flags | ||
| 647 | * @name: name of the timer | 635 | * @name: name of the timer |
| 648 | * @key: lockdep class key of the fake lock used for tracking timer | 636 | * @key: lockdep class key of the fake lock used for tracking timer |
| 649 | * sync lock dependencies | 637 | * sync lock dependencies |
| @@ -651,24 +639,14 @@ EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); | |||
| 651 | * init_timer_key() must be done to a timer prior calling *any* of the | 639 | * init_timer_key() must be done to a timer prior calling *any* of the |
| 652 | * other timer functions. | 640 | * other timer functions. |
| 653 | */ | 641 | */ |
| 654 | void init_timer_key(struct timer_list *timer, | 642 | void init_timer_key(struct timer_list *timer, unsigned int flags, |
| 655 | const char *name, | 643 | const char *name, struct lock_class_key *key) |
| 656 | struct lock_class_key *key) | ||
| 657 | { | 644 | { |
| 658 | debug_init(timer); | 645 | debug_init(timer); |
| 659 | __init_timer(timer, name, key); | 646 | do_init_timer(timer, flags, name, key); |
| 660 | } | 647 | } |
| 661 | EXPORT_SYMBOL(init_timer_key); | 648 | EXPORT_SYMBOL(init_timer_key); |
| 662 | 649 | ||
| 663 | void init_timer_deferrable_key(struct timer_list *timer, | ||
| 664 | const char *name, | ||
| 665 | struct lock_class_key *key) | ||
| 666 | { | ||
| 667 | init_timer_key(timer, name, key); | ||
| 668 | timer_set_deferrable(timer); | ||
| 669 | } | ||
| 670 | EXPORT_SYMBOL(init_timer_deferrable_key); | ||
| 671 | |||
| 672 | static inline void detach_timer(struct timer_list *timer, bool clear_pending) | 650 | static inline void detach_timer(struct timer_list *timer, bool clear_pending) |
| 673 | { | 651 | { |
| 674 | struct list_head *entry = &timer->entry; | 652 | struct list_head *entry = &timer->entry; |
| @@ -686,7 +664,7 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base) | |||
| 686 | { | 664 | { |
| 687 | detach_timer(timer, true); | 665 | detach_timer(timer, true); |
| 688 | if (!tbase_get_deferrable(timer->base)) | 666 | if (!tbase_get_deferrable(timer->base)) |
| 689 | timer->base->active_timers--; | 667 | base->active_timers--; |
| 690 | } | 668 | } |
| 691 | 669 | ||
| 692 | static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, | 670 | static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, |
| @@ -697,7 +675,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, | |||
| 697 | 675 | ||
| 698 | detach_timer(timer, clear_pending); | 676 | detach_timer(timer, clear_pending); |
| 699 | if (!tbase_get_deferrable(timer->base)) { | 677 | if (!tbase_get_deferrable(timer->base)) { |
| 700 | timer->base->active_timers--; | 678 | base->active_timers--; |
| 701 | if (timer->expires == base->next_timer) | 679 | if (timer->expires == base->next_timer) |
| 702 | base->next_timer = base->timer_jiffies; | 680 | base->next_timer = base->timer_jiffies; |
| 703 | } | 681 | } |
| @@ -1029,14 +1007,14 @@ EXPORT_SYMBOL(try_to_del_timer_sync); | |||
| 1029 | * | 1007 | * |
| 1030 | * Synchronization rules: Callers must prevent restarting of the timer, | 1008 | * Synchronization rules: Callers must prevent restarting of the timer, |
| 1031 | * otherwise this function is meaningless. It must not be called from | 1009 | * otherwise this function is meaningless. It must not be called from |
| 1032 | * interrupt contexts. The caller must not hold locks which would prevent | 1010 | * interrupt contexts unless the timer is an irqsafe one. The caller must |
| 1033 | * completion of the timer's handler. The timer's handler must not call | 1011 | * not hold locks which would prevent completion of the timer's |
| 1034 | * add_timer_on(). Upon exit the timer is not queued and the handler is | 1012 | * handler. The timer's handler must not call add_timer_on(). Upon exit the |
| 1035 | * not running on any CPU. | 1013 | * timer is not queued and the handler is not running on any CPU. |
| 1036 | * | 1014 | * |
| 1037 | * Note: You must not hold locks that are held in interrupt context | 1015 | * Note: For !irqsafe timers, you must not hold locks that are held in |
| 1038 | * while calling this function. Even if the lock has nothing to do | 1016 | * interrupt context while calling this function. Even if the lock has |
| 1039 | * with the timer in question. Here's why: | 1017 | * nothing to do with the timer in question. Here's why: |
| 1040 | * | 1018 | * |
| 1041 | * CPU0 CPU1 | 1019 | * CPU0 CPU1 |
| 1042 | * ---- ---- | 1020 | * ---- ---- |
| @@ -1073,7 +1051,7 @@ int del_timer_sync(struct timer_list *timer) | |||
| 1073 | * don't use it in hardirq context, because it | 1051 | * don't use it in hardirq context, because it |
| 1074 | * could lead to deadlock. | 1052 | * could lead to deadlock. |
| 1075 | */ | 1053 | */ |
| 1076 | WARN_ON(in_irq()); | 1054 | WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); |
| 1077 | for (;;) { | 1055 | for (;;) { |
| 1078 | int ret = try_to_del_timer_sync(timer); | 1056 | int ret = try_to_del_timer_sync(timer); |
| 1079 | if (ret >= 0) | 1057 | if (ret >= 0) |
| @@ -1180,19 +1158,27 @@ static inline void __run_timers(struct tvec_base *base) | |||
| 1180 | while (!list_empty(head)) { | 1158 | while (!list_empty(head)) { |
| 1181 | void (*fn)(unsigned long); | 1159 | void (*fn)(unsigned long); |
| 1182 | unsigned long data; | 1160 | unsigned long data; |
| 1161 | bool irqsafe; | ||
| 1183 | 1162 | ||
| 1184 | timer = list_first_entry(head, struct timer_list,entry); | 1163 | timer = list_first_entry(head, struct timer_list,entry); |
| 1185 | fn = timer->function; | 1164 | fn = timer->function; |
| 1186 | data = timer->data; | 1165 | data = timer->data; |
| 1166 | irqsafe = tbase_get_irqsafe(timer->base); | ||
| 1187 | 1167 | ||
| 1188 | timer_stats_account_timer(timer); | 1168 | timer_stats_account_timer(timer); |
| 1189 | 1169 | ||
| 1190 | base->running_timer = timer; | 1170 | base->running_timer = timer; |
| 1191 | detach_expired_timer(timer, base); | 1171 | detach_expired_timer(timer, base); |
| 1192 | 1172 | ||
| 1193 | spin_unlock_irq(&base->lock); | 1173 | if (irqsafe) { |
| 1194 | call_timer_fn(timer, fn, data); | 1174 | spin_unlock(&base->lock); |
| 1195 | spin_lock_irq(&base->lock); | 1175 | call_timer_fn(timer, fn, data); |
| 1176 | spin_lock(&base->lock); | ||
| 1177 | } else { | ||
| 1178 | spin_unlock_irq(&base->lock); | ||
| 1179 | call_timer_fn(timer, fn, data); | ||
| 1180 | spin_lock_irq(&base->lock); | ||
| 1181 | } | ||
| 1196 | } | 1182 | } |
| 1197 | } | 1183 | } |
| 1198 | base->running_timer = NULL; | 1184 | base->running_timer = NULL; |
| @@ -1791,9 +1777,13 @@ static struct notifier_block __cpuinitdata timers_nb = { | |||
| 1791 | 1777 | ||
| 1792 | void __init init_timers(void) | 1778 | void __init init_timers(void) |
| 1793 | { | 1779 | { |
| 1794 | int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | 1780 | int err; |
| 1795 | (void *)(long)smp_processor_id()); | 1781 | |
| 1782 | /* ensure there are enough low bits for flags in timer->base pointer */ | ||
| 1783 | BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); | ||
| 1796 | 1784 | ||
| 1785 | err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, | ||
| 1786 | (void *)(long)smp_processor_id()); | ||
| 1797 | init_timer_stats(); | 1787 | init_timer_stats(); |
| 1798 | 1788 | ||
| 1799 | BUG_ON(err != NOTIFY_OK); | 1789 | BUG_ON(err != NOTIFY_OK); |
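The timer.c changes above depend on stashing TIMER_DEFERRABLE and TIMER_IRQSAFE in the low bits of the timer->base pointer, which is why init_timers() now asserts that struct tvec_base is aligned beyond TIMER_FLAG_MASK. The trick in isolation, as plain C (FLAG_MASK and the helper names are illustrative):

#include <stdint.h>

#define FLAG_MASK	0x3UL	/* requires the pointee to be at least 4-byte aligned */

static inline void *pack_ptr(void *base, unsigned long flags)
{
	return (void *)((uintptr_t)base | (flags & FLAG_MASK));
}

static inline void *unpack_ptr(void *packed)
{
	return (void *)((uintptr_t)packed & ~FLAG_MASK);
}

static inline unsigned long packed_flags(void *packed)
{
	return (uintptr_t)packed & FLAG_MASK;
}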
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8c4c07071cc5..4cea4f41c1d9 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
| @@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS | |||
| 49 | help | 49 | help |
| 50 | See Documentation/trace/ftrace-design.txt | 50 | See Documentation/trace/ftrace-design.txt |
| 51 | 51 | ||
| 52 | config HAVE_FENTRY | ||
| 53 | bool | ||
| 54 | help | ||
| 55 | Arch supports the gcc options -pg with -mfentry | ||
| 56 | |||
| 52 | config HAVE_C_RECORDMCOUNT | 57 | config HAVE_C_RECORDMCOUNT |
| 53 | bool | 58 | bool |
| 54 | help | 59 | help |
| @@ -57,8 +62,12 @@ config HAVE_C_RECORDMCOUNT | |||
| 57 | config TRACER_MAX_TRACE | 62 | config TRACER_MAX_TRACE |
| 58 | bool | 63 | bool |
| 59 | 64 | ||
| 65 | config TRACE_CLOCK | ||
| 66 | bool | ||
| 67 | |||
| 60 | config RING_BUFFER | 68 | config RING_BUFFER |
| 61 | bool | 69 | bool |
| 70 | select TRACE_CLOCK | ||
| 62 | 71 | ||
| 63 | config FTRACE_NMI_ENTER | 72 | config FTRACE_NMI_ENTER |
| 64 | bool | 73 | bool |
| @@ -109,6 +118,7 @@ config TRACING | |||
| 109 | select NOP_TRACER | 118 | select NOP_TRACER |
| 110 | select BINARY_PRINTF | 119 | select BINARY_PRINTF |
| 111 | select EVENT_TRACING | 120 | select EVENT_TRACING |
| 121 | select TRACE_CLOCK | ||
| 112 | 122 | ||
| 113 | config GENERIC_TRACER | 123 | config GENERIC_TRACER |
| 114 | bool | 124 | bool |
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index b831087c8200..d7e2068e4b71 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
| @@ -5,10 +5,12 @@ ifdef CONFIG_FUNCTION_TRACER | |||
| 5 | ORIG_CFLAGS := $(KBUILD_CFLAGS) | 5 | ORIG_CFLAGS := $(KBUILD_CFLAGS) |
| 6 | KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) | 6 | KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) |
| 7 | 7 | ||
| 8 | ifdef CONFIG_FTRACE_SELFTEST | ||
| 8 | # selftest needs instrumentation | 9 | # selftest needs instrumentation |
| 9 | CFLAGS_trace_selftest_dynamic.o = -pg | 10 | CFLAGS_trace_selftest_dynamic.o = -pg |
| 10 | obj-y += trace_selftest_dynamic.o | 11 | obj-y += trace_selftest_dynamic.o |
| 11 | endif | 12 | endif |
| 13 | endif | ||
| 12 | 14 | ||
| 13 | # If unlikely tracing is enabled, do not trace these files | 15 | # If unlikely tracing is enabled, do not trace these files |
| 14 | ifdef CONFIG_TRACING_BRANCHES | 16 | ifdef CONFIG_TRACING_BRANCHES |
| @@ -17,11 +19,7 @@ endif | |||
| 17 | 19 | ||
| 18 | CFLAGS_trace_events_filter.o := -I$(src) | 20 | CFLAGS_trace_events_filter.o := -I$(src) |
| 19 | 21 | ||
| 20 | # | 22 | obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o |
| 21 | # Make the trace clocks available generally: it's infrastructure | ||
| 22 | # relied on by ptrace for example: | ||
| 23 | # | ||
| 24 | obj-y += trace_clock.o | ||
| 25 | 23 | ||
| 26 | obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o | 24 | obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o |
| 27 | obj-$(CONFIG_RING_BUFFER) += ring_buffer.o | 25 | obj-$(CONFIG_RING_BUFFER) += ring_buffer.o |
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b4f20fba09fc..9dcf15d38380 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
| @@ -64,12 +64,20 @@ | |||
| 64 | 64 | ||
| 65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) | 65 | #define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_CONTROL) |
| 66 | 66 | ||
| 67 | static struct ftrace_ops ftrace_list_end __read_mostly = { | ||
| 68 | .func = ftrace_stub, | ||
| 69 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 70 | }; | ||
| 71 | |||
| 67 | /* ftrace_enabled is a method to turn ftrace on or off */ | 72 | /* ftrace_enabled is a method to turn ftrace on or off */ |
| 68 | int ftrace_enabled __read_mostly; | 73 | int ftrace_enabled __read_mostly; |
| 69 | static int last_ftrace_enabled; | 74 | static int last_ftrace_enabled; |
| 70 | 75 | ||
| 71 | /* Quick disabling of function tracer. */ | 76 | /* Quick disabling of function tracer. */ |
| 72 | int function_trace_stop; | 77 | int function_trace_stop __read_mostly; |
| 78 | |||
| 79 | /* Current function tracing op */ | ||
| 80 | struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end; | ||
| 73 | 81 | ||
| 74 | /* List for set_ftrace_pid's pids. */ | 82 | /* List for set_ftrace_pid's pids. */ |
| 75 | LIST_HEAD(ftrace_pids); | 83 | LIST_HEAD(ftrace_pids); |
| @@ -86,22 +94,43 @@ static int ftrace_disabled __read_mostly; | |||
| 86 | 94 | ||
| 87 | static DEFINE_MUTEX(ftrace_lock); | 95 | static DEFINE_MUTEX(ftrace_lock); |
| 88 | 96 | ||
| 89 | static struct ftrace_ops ftrace_list_end __read_mostly = { | ||
| 90 | .func = ftrace_stub, | ||
| 91 | }; | ||
| 92 | |||
| 93 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; | 97 | static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; |
| 94 | static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; | 98 | static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; |
| 95 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; | 99 | static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; |
| 96 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; | 100 | ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; |
| 97 | static ftrace_func_t __ftrace_trace_function_delay __read_mostly = ftrace_stub; | ||
| 98 | ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; | ||
| 99 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; | 101 | ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; |
| 100 | static struct ftrace_ops global_ops; | 102 | static struct ftrace_ops global_ops; |
| 101 | static struct ftrace_ops control_ops; | 103 | static struct ftrace_ops control_ops; |
| 102 | 104 | ||
| 103 | static void | 105 | #if ARCH_SUPPORTS_FTRACE_OPS |
| 104 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); | 106 | static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, |
| 107 | struct ftrace_ops *op, struct pt_regs *regs); | ||
| 108 | #else | ||
| 109 | /* See comment below, where ftrace_ops_list_func is defined */ | ||
| 110 | static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); | ||
| 111 | #define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) | ||
| 112 | #endif | ||
| 113 | |||
| 114 | /** | ||
| 115 | * ftrace_nr_registered_ops - return number of ops registered | ||
| 116 | * | ||
| 117 | * Returns the number of ftrace_ops registered and tracing functions | ||
| 118 | */ | ||
| 119 | int ftrace_nr_registered_ops(void) | ||
| 120 | { | ||
| 121 | struct ftrace_ops *ops; | ||
| 122 | int cnt = 0; | ||
| 123 | |||
| 124 | mutex_lock(&ftrace_lock); | ||
| 125 | |||
| 126 | for (ops = ftrace_ops_list; | ||
| 127 | ops != &ftrace_list_end; ops = ops->next) | ||
| 128 | cnt++; | ||
| 129 | |||
| 130 | mutex_unlock(&ftrace_lock); | ||
| 131 | |||
| 132 | return cnt; | ||
| 133 | } | ||
| 105 | 134 | ||
| 106 | /* | 135 | /* |
| 107 | * Traverse the ftrace_global_list, invoking all entries. The reason that we | 136 | * Traverse the ftrace_global_list, invoking all entries. The reason that we |
| @@ -112,29 +141,29 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); | |||
| 112 | * | 141 | * |
| 113 | * Silly Alpha and silly pointer-speculation compiler optimizations! | 142 | * Silly Alpha and silly pointer-speculation compiler optimizations! |
| 114 | */ | 143 | */ |
| 115 | static void ftrace_global_list_func(unsigned long ip, | 144 | static void |
| 116 | unsigned long parent_ip) | 145 | ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, |
| 146 | struct ftrace_ops *op, struct pt_regs *regs) | ||
| 117 | { | 147 | { |
| 118 | struct ftrace_ops *op; | ||
| 119 | |||
| 120 | if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) | 148 | if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) |
| 121 | return; | 149 | return; |
| 122 | 150 | ||
| 123 | trace_recursion_set(TRACE_GLOBAL_BIT); | 151 | trace_recursion_set(TRACE_GLOBAL_BIT); |
| 124 | op = rcu_dereference_raw(ftrace_global_list); /*see above*/ | 152 | op = rcu_dereference_raw(ftrace_global_list); /*see above*/ |
| 125 | while (op != &ftrace_list_end) { | 153 | while (op != &ftrace_list_end) { |
| 126 | op->func(ip, parent_ip); | 154 | op->func(ip, parent_ip, op, regs); |
| 127 | op = rcu_dereference_raw(op->next); /*see above*/ | 155 | op = rcu_dereference_raw(op->next); /*see above*/ |
| 128 | }; | 156 | }; |
| 129 | trace_recursion_clear(TRACE_GLOBAL_BIT); | 157 | trace_recursion_clear(TRACE_GLOBAL_BIT); |
| 130 | } | 158 | } |
| 131 | 159 | ||
| 132 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) | 160 | static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, |
| 161 | struct ftrace_ops *op, struct pt_regs *regs) | ||
| 133 | { | 162 | { |
| 134 | if (!test_tsk_trace_trace(current)) | 163 | if (!test_tsk_trace_trace(current)) |
| 135 | return; | 164 | return; |
| 136 | 165 | ||
| 137 | ftrace_pid_function(ip, parent_ip); | 166 | ftrace_pid_function(ip, parent_ip, op, regs); |
| 138 | } | 167 | } |
| 139 | 168 | ||
| 140 | static void set_ftrace_pid_function(ftrace_func_t func) | 169 | static void set_ftrace_pid_function(ftrace_func_t func) |
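The hunk above converts the ftrace callbacks to the wider ftrace_func_t signature that also receives the owning ftrace_ops and (optionally) saved pt_regs, and callers now advertise FTRACE_OPS_FL_RECURSION_SAFE when they guard against recursion themselves. A hypothetical tracer using the new form (my_ops/my_callback are illustrative names):

#include <linux/ftrace.h>

static void my_callback(unsigned long ip, unsigned long parent_ip,
			struct ftrace_ops *op, struct pt_regs *regs)
{
	/* regs is only populated when FTRACE_OPS_FL_SAVE_REGS was requested
	 * and the architecture supports saving registers */
}

static struct ftrace_ops my_ops = {
	.func	= my_callback,
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE,
};

/* registered elsewhere with register_ftrace_function(&my_ops) */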
| @@ -153,25 +182,9 @@ static void set_ftrace_pid_function(ftrace_func_t func) | |||
| 153 | void clear_ftrace_function(void) | 182 | void clear_ftrace_function(void) |
| 154 | { | 183 | { |
| 155 | ftrace_trace_function = ftrace_stub; | 184 | ftrace_trace_function = ftrace_stub; |
| 156 | __ftrace_trace_function = ftrace_stub; | ||
| 157 | __ftrace_trace_function_delay = ftrace_stub; | ||
| 158 | ftrace_pid_function = ftrace_stub; | 185 | ftrace_pid_function = ftrace_stub; |
| 159 | } | 186 | } |
| 160 | 187 | ||
| 161 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
| 162 | /* | ||
| 163 | * For those archs that do not test ftrace_trace_stop in their | ||
| 164 | * mcount call site, we need to do it from C. | ||
| 165 | */ | ||
| 166 | static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) | ||
| 167 | { | ||
| 168 | if (function_trace_stop) | ||
| 169 | return; | ||
| 170 | |||
| 171 | __ftrace_trace_function(ip, parent_ip); | ||
| 172 | } | ||
| 173 | #endif | ||
| 174 | |||
| 175 | static void control_ops_disable_all(struct ftrace_ops *ops) | 188 | static void control_ops_disable_all(struct ftrace_ops *ops) |
| 176 | { | 189 | { |
| 177 | int cpu; | 190 | int cpu; |
| @@ -230,28 +243,27 @@ static void update_ftrace_function(void) | |||
| 230 | 243 | ||
| 231 | /* | 244 | /* |
| 232 | * If we are at the end of the list and this ops is | 245 | * If we are at the end of the list and this ops is |
| 233 | * not dynamic, then have the mcount trampoline call | 246 | * recursion safe and not dynamic and the arch supports passing ops, |
| 234 | * the function directly | 247 | * then have the mcount trampoline call the function directly. |
| 235 | */ | 248 | */ |
| 236 | if (ftrace_ops_list == &ftrace_list_end || | 249 | if (ftrace_ops_list == &ftrace_list_end || |
| 237 | (ftrace_ops_list->next == &ftrace_list_end && | 250 | (ftrace_ops_list->next == &ftrace_list_end && |
| 238 | !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) | 251 | !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && |
| 252 | (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) && | ||
| 253 | !FTRACE_FORCE_LIST_FUNC)) { | ||
| 254 | /* Set the ftrace_ops that the arch callback uses */ | ||
| 255 | if (ftrace_ops_list == &global_ops) | ||
| 256 | function_trace_op = ftrace_global_list; | ||
| 257 | else | ||
| 258 | function_trace_op = ftrace_ops_list; | ||
| 239 | func = ftrace_ops_list->func; | 259 | func = ftrace_ops_list->func; |
| 240 | else | 260 | } else { |
| 261 | /* Just use the default ftrace_ops */ | ||
| 262 | function_trace_op = &ftrace_list_end; | ||
| 241 | func = ftrace_ops_list_func; | 263 | func = ftrace_ops_list_func; |
| 264 | } | ||
| 242 | 265 | ||
| 243 | #ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
| 244 | ftrace_trace_function = func; | 266 | ftrace_trace_function = func; |
| 245 | #else | ||
| 246 | #ifdef CONFIG_DYNAMIC_FTRACE | ||
| 247 | /* do not update till all functions have been modified */ | ||
| 248 | __ftrace_trace_function_delay = func; | ||
| 249 | #else | ||
| 250 | __ftrace_trace_function = func; | ||
| 251 | #endif | ||
| 252 | ftrace_trace_function = | ||
| 253 | (func == ftrace_stub) ? func : ftrace_test_stop_func; | ||
| 254 | #endif | ||
| 255 | } | 267 | } |
| 256 | 268 | ||
| 257 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) | 269 | static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) |
| @@ -325,6 +337,20 @@ static int __register_ftrace_function(struct ftrace_ops *ops) | |||
| 325 | if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) | 337 | if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) |
| 326 | return -EINVAL; | 338 | return -EINVAL; |
| 327 | 339 | ||
| 340 | #ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS | ||
| 341 | /* | ||
| 342 | * If the ftrace_ops specifies SAVE_REGS, then it only can be used | ||
| 343 | * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. | ||
| 344 | * Setting SAVE_REGS_IF_SUPPORTED makes SAVE_REGS irrelevant. | ||
| 345 | */ | ||
| 346 | if (ops->flags & FTRACE_OPS_FL_SAVE_REGS && | ||
| 347 | !(ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED)) | ||
| 348 | return -EINVAL; | ||
| 349 | |||
| 350 | if (ops->flags & FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED) | ||
| 351 | ops->flags |= FTRACE_OPS_FL_SAVE_REGS; | ||
| 352 | #endif | ||
| 353 | |||
| 328 | if (!core_kernel_data((unsigned long)ops)) | 354 | if (!core_kernel_data((unsigned long)ops)) |
| 329 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; | 355 | ops->flags |= FTRACE_OPS_FL_DYNAMIC; |
| 330 | 356 | ||
| @@ -773,7 +799,8 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) | |||
| 773 | } | 799 | } |
| 774 | 800 | ||
| 775 | static void | 801 | static void |
| 776 | function_profile_call(unsigned long ip, unsigned long parent_ip) | 802 | function_profile_call(unsigned long ip, unsigned long parent_ip, |
| 803 | struct ftrace_ops *ops, struct pt_regs *regs) | ||
| 777 | { | 804 | { |
| 778 | struct ftrace_profile_stat *stat; | 805 | struct ftrace_profile_stat *stat; |
| 779 | struct ftrace_profile *rec; | 806 | struct ftrace_profile *rec; |
| @@ -803,7 +830,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) | |||
| 803 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER | 830 | #ifdef CONFIG_FUNCTION_GRAPH_TRACER |
| 804 | static int profile_graph_entry(struct ftrace_graph_ent *trace) | 831 | static int profile_graph_entry(struct ftrace_graph_ent *trace) |
| 805 | { | 832 | { |
| 806 | function_profile_call(trace->func, 0); | 833 | function_profile_call(trace->func, 0, NULL, NULL); |
| 807 | return 1; | 834 | return 1; |
| 808 | } | 835 | } |
| 809 | 836 | ||
| @@ -863,6 +890,7 @@ static void unregister_ftrace_profiler(void) | |||
| 863 | #else | 890 | #else |
| 864 | static struct ftrace_ops ftrace_profile_ops __read_mostly = { | 891 | static struct ftrace_ops ftrace_profile_ops __read_mostly = { |
| 865 | .func = function_profile_call, | 892 | .func = function_profile_call, |
| 893 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 866 | }; | 894 | }; |
| 867 | 895 | ||
| 868 | static int register_ftrace_profiler(void) | 896 | static int register_ftrace_profiler(void) |
| @@ -1045,6 +1073,7 @@ static struct ftrace_ops global_ops = { | |||
| 1045 | .func = ftrace_stub, | 1073 | .func = ftrace_stub, |
| 1046 | .notrace_hash = EMPTY_HASH, | 1074 | .notrace_hash = EMPTY_HASH, |
| 1047 | .filter_hash = EMPTY_HASH, | 1075 | .filter_hash = EMPTY_HASH, |
| 1076 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 1048 | }; | 1077 | }; |
| 1049 | 1078 | ||
| 1050 | static DEFINE_MUTEX(ftrace_regex_lock); | 1079 | static DEFINE_MUTEX(ftrace_regex_lock); |
| @@ -1525,6 +1554,12 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, | |||
| 1525 | rec->flags++; | 1554 | rec->flags++; |
| 1526 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) | 1555 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) |
| 1527 | return; | 1556 | return; |
| 1557 | /* | ||
| 1558 | * If any ops wants regs saved for this function | ||
| 1559 | * then all ops will get saved regs. | ||
| 1560 | */ | ||
| 1561 | if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) | ||
| 1562 | rec->flags |= FTRACE_FL_REGS; | ||
| 1528 | } else { | 1563 | } else { |
| 1529 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) | 1564 | if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) |
| 1530 | return; | 1565 | return; |
| @@ -1616,18 +1651,59 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) | |||
| 1616 | if (enable && (rec->flags & ~FTRACE_FL_MASK)) | 1651 | if (enable && (rec->flags & ~FTRACE_FL_MASK)) |
| 1617 | flag = FTRACE_FL_ENABLED; | 1652 | flag = FTRACE_FL_ENABLED; |
| 1618 | 1653 | ||
| 1654 | /* | ||
| 1655 | * If enabling and the REGS flag does not match the REGS_EN, then | ||
| 1656 | * do not ignore this record. Set flags to fail the compare against | ||
| 1657 | * ENABLED. | ||
| 1658 | */ | ||
| 1659 | if (flag && | ||
| 1660 | (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN))) | ||
| 1661 | flag |= FTRACE_FL_REGS; | ||
| 1662 | |||
| 1619 | /* If the state of this record hasn't changed, then do nothing */ | 1663 | /* If the state of this record hasn't changed, then do nothing */ |
| 1620 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) | 1664 | if ((rec->flags & FTRACE_FL_ENABLED) == flag) |
| 1621 | return FTRACE_UPDATE_IGNORE; | 1665 | return FTRACE_UPDATE_IGNORE; |
| 1622 | 1666 | ||
| 1623 | if (flag) { | 1667 | if (flag) { |
| 1624 | if (update) | 1668 | /* Save off if rec is being enabled (for return value) */ |
| 1669 | flag ^= rec->flags & FTRACE_FL_ENABLED; | ||
| 1670 | |||
| 1671 | if (update) { | ||
| 1625 | rec->flags |= FTRACE_FL_ENABLED; | 1672 | rec->flags |= FTRACE_FL_ENABLED; |
| 1626 | return FTRACE_UPDATE_MAKE_CALL; | 1673 | if (flag & FTRACE_FL_REGS) { |
| 1674 | if (rec->flags & FTRACE_FL_REGS) | ||
| 1675 | rec->flags |= FTRACE_FL_REGS_EN; | ||
| 1676 | else | ||
| 1677 | rec->flags &= ~FTRACE_FL_REGS_EN; | ||
| 1678 | } | ||
| 1679 | } | ||
| 1680 | |||
| 1681 | /* | ||
| 1682 | * If this record is being updated from a nop, then | ||
| 1683 | * return UPDATE_MAKE_CALL. | ||
| 1684 | * Otherwise, if the EN flag is set, then return | ||
| 1685 | * UPDATE_MODIFY_CALL_REGS to tell the caller to convert | ||
| 1686 | * from the non-save regs, to a save regs function. | ||
| 1687 | * Otherwise, | ||
| 1688 | * return UPDATE_MODIFY_CALL to tell the caller to convert | ||
| 1689 | * from the save regs, to a non-save regs function. | ||
| 1690 | */ | ||
| 1691 | if (flag & FTRACE_FL_ENABLED) | ||
| 1692 | return FTRACE_UPDATE_MAKE_CALL; | ||
| 1693 | else if (rec->flags & FTRACE_FL_REGS_EN) | ||
| 1694 | return FTRACE_UPDATE_MODIFY_CALL_REGS; | ||
| 1695 | else | ||
| 1696 | return FTRACE_UPDATE_MODIFY_CALL; | ||
| 1627 | } | 1697 | } |
| 1628 | 1698 | ||
| 1629 | if (update) | 1699 | if (update) { |
| 1630 | rec->flags &= ~FTRACE_FL_ENABLED; | 1700 | /* If there's no more users, clear all flags */ |
| 1701 | if (!(rec->flags & ~FTRACE_FL_MASK)) | ||
| 1702 | rec->flags = 0; | ||
| 1703 | else | ||
| 1704 | /* Just disable the record (keep REGS state) */ | ||
| 1705 | rec->flags &= ~FTRACE_FL_ENABLED; | ||
| 1706 | } | ||
| 1631 | 1707 | ||
| 1632 | return FTRACE_UPDATE_MAKE_NOP; | 1708 | return FTRACE_UPDATE_MAKE_NOP; |
| 1633 | } | 1709 | } |
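The REGS/REGS_EN comparison in the hunk above relies on a small C idiom: applying ! to each masked bit normalizes it to 0 or 1, so two flags that live at different bit positions can be compared for logical equality without shifting. A stand-alone sketch of the idiom (the flag values below are invented for illustration; the real FTRACE_FL_* bits differ):

#include <stdio.h>

#define FL_REGS     (1UL << 0)
#define FL_REGS_EN  (1UL << 3)

/* true when exactly one of the two bits is set, i.e. the states disagree */
static int regs_state_mismatch(unsigned long flags)
{
	return !(flags & FL_REGS) != !(flags & FL_REGS_EN);
}

int main(void)
{
	printf("%d\n", regs_state_mismatch(FL_REGS));              /* 1 */
	printf("%d\n", regs_state_mismatch(FL_REGS | FL_REGS_EN)); /* 0 */
	printf("%d\n", regs_state_mismatch(0));                    /* 0 */
	return 0;
}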
| @@ -1662,13 +1738,17 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable) | |||
| 1662 | static int | 1738 | static int |
| 1663 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | 1739 | __ftrace_replace_code(struct dyn_ftrace *rec, int enable) |
| 1664 | { | 1740 | { |
| 1741 | unsigned long ftrace_old_addr; | ||
| 1665 | unsigned long ftrace_addr; | 1742 | unsigned long ftrace_addr; |
| 1666 | int ret; | 1743 | int ret; |
| 1667 | 1744 | ||
| 1668 | ftrace_addr = (unsigned long)FTRACE_ADDR; | ||
| 1669 | |||
| 1670 | ret = ftrace_update_record(rec, enable); | 1745 | ret = ftrace_update_record(rec, enable); |
| 1671 | 1746 | ||
| 1747 | if (rec->flags & FTRACE_FL_REGS) | ||
| 1748 | ftrace_addr = (unsigned long)FTRACE_REGS_ADDR; | ||
| 1749 | else | ||
| 1750 | ftrace_addr = (unsigned long)FTRACE_ADDR; | ||
| 1751 | |||
| 1672 | switch (ret) { | 1752 | switch (ret) { |
| 1673 | case FTRACE_UPDATE_IGNORE: | 1753 | case FTRACE_UPDATE_IGNORE: |
| 1674 | return 0; | 1754 | return 0; |
| @@ -1678,6 +1758,15 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) | |||
| 1678 | 1758 | ||
| 1679 | case FTRACE_UPDATE_MAKE_NOP: | 1759 | case FTRACE_UPDATE_MAKE_NOP: |
| 1680 | return ftrace_make_nop(NULL, rec, ftrace_addr); | 1760 | return ftrace_make_nop(NULL, rec, ftrace_addr); |
| 1761 | |||
| 1762 | case FTRACE_UPDATE_MODIFY_CALL_REGS: | ||
| 1763 | case FTRACE_UPDATE_MODIFY_CALL: | ||
| 1764 | if (rec->flags & FTRACE_FL_REGS) | ||
| 1765 | ftrace_old_addr = (unsigned long)FTRACE_ADDR; | ||
| 1766 | else | ||
| 1767 | ftrace_old_addr = (unsigned long)FTRACE_REGS_ADDR; | ||
| 1768 | |||
| 1769 | return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); | ||
| 1681 | } | 1770 | } |
| 1682 | 1771 | ||
| 1683 | return -1; /* unknown ftrace bug */ | 1772 | return -1; /* unknown ftrace bug */ |
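For the two MODIFY_CALL cases the call site already points at one trampoline and must be rewritten to point at the other, so the old address is always the opposite of the new one selected from the record's REGS flag. A compressed sketch of that pairing (the helper name is invented; FTRACE_ADDR and FTRACE_REGS_ADDR stand for the arch's plain and regs-saving trampolines):

struct trampoline_pair {
	unsigned long new_addr;		/* where the call should point now */
	unsigned long old_addr;		/* where it is assumed to point today */
};

static struct trampoline_pair pick_trampolines(unsigned long plain_addr,
					       unsigned long regs_addr,
					       int want_regs)
{
	struct trampoline_pair p;

	p.new_addr = want_regs ? regs_addr : plain_addr;
	p.old_addr = want_regs ? plain_addr : regs_addr;
	return p;
}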
| @@ -1882,16 +1971,6 @@ static void ftrace_run_update_code(int command) | |||
| 1882 | */ | 1971 | */ |
| 1883 | arch_ftrace_update_code(command); | 1972 | arch_ftrace_update_code(command); |
| 1884 | 1973 | ||
| 1885 | #ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST | ||
| 1886 | /* | ||
| 1887 | * For archs that call ftrace_test_stop_func(), we must | ||
| 1888 | * wait till after we update all the function callers | ||
| 1889 | * before we update the callback. This keeps different | ||
| 1890 | * ops that record different functions from corrupting | ||
| 1891 | * each other. | ||
| 1892 | */ | ||
| 1893 | __ftrace_trace_function = __ftrace_trace_function_delay; | ||
| 1894 | #endif | ||
| 1895 | function_trace_stop--; | 1974 | function_trace_stop--; |
| 1896 | 1975 | ||
| 1897 | ret = ftrace_arch_code_modify_post_process(); | 1976 | ret = ftrace_arch_code_modify_post_process(); |
| @@ -2441,8 +2520,9 @@ static int t_show(struct seq_file *m, void *v) | |||
| 2441 | 2520 | ||
| 2442 | seq_printf(m, "%ps", (void *)rec->ip); | 2521 | seq_printf(m, "%ps", (void *)rec->ip); |
| 2443 | if (iter->flags & FTRACE_ITER_ENABLED) | 2522 | if (iter->flags & FTRACE_ITER_ENABLED) |
| 2444 | seq_printf(m, " (%ld)", | 2523 | seq_printf(m, " (%ld)%s", |
| 2445 | rec->flags & ~FTRACE_FL_MASK); | 2524 | rec->flags & ~FTRACE_FL_MASK, |
| 2525 | rec->flags & FTRACE_FL_REGS ? " R" : ""); | ||
| 2446 | seq_printf(m, "\n"); | 2526 | seq_printf(m, "\n"); |
| 2447 | 2527 | ||
| 2448 | return 0; | 2528 | return 0; |
| @@ -2790,8 +2870,8 @@ static int __init ftrace_mod_cmd_init(void) | |||
| 2790 | } | 2870 | } |
| 2791 | device_initcall(ftrace_mod_cmd_init); | 2871 | device_initcall(ftrace_mod_cmd_init); |
| 2792 | 2872 | ||
| 2793 | static void | 2873 | static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, |
| 2794 | function_trace_probe_call(unsigned long ip, unsigned long parent_ip) | 2874 | struct ftrace_ops *op, struct pt_regs *pt_regs) |
| 2795 | { | 2875 | { |
| 2796 | struct ftrace_func_probe *entry; | 2876 | struct ftrace_func_probe *entry; |
| 2797 | struct hlist_head *hhd; | 2877 | struct hlist_head *hhd; |
| @@ -3162,8 +3242,27 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf, | |||
| 3162 | } | 3242 | } |
| 3163 | 3243 | ||
| 3164 | static int | 3244 | static int |
| 3165 | ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | 3245 | ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) |
| 3166 | int reset, int enable) | 3246 | { |
| 3247 | struct ftrace_func_entry *entry; | ||
| 3248 | |||
| 3249 | if (!ftrace_location(ip)) | ||
| 3250 | return -EINVAL; | ||
| 3251 | |||
| 3252 | if (remove) { | ||
| 3253 | entry = ftrace_lookup_ip(hash, ip); | ||
| 3254 | if (!entry) | ||
| 3255 | return -ENOENT; | ||
| 3256 | free_hash_entry(hash, entry); | ||
| 3257 | return 0; | ||
| 3258 | } | ||
| 3259 | |||
| 3260 | return add_hash_entry(hash, ip); | ||
| 3261 | } | ||
| 3262 | |||
| 3263 | static int | ||
| 3264 | ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, | ||
| 3265 | unsigned long ip, int remove, int reset, int enable) | ||
| 3167 | { | 3266 | { |
| 3168 | struct ftrace_hash **orig_hash; | 3267 | struct ftrace_hash **orig_hash; |
| 3169 | struct ftrace_hash *hash; | 3268 | struct ftrace_hash *hash; |
| @@ -3192,6 +3291,11 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
| 3192 | ret = -EINVAL; | 3291 | ret = -EINVAL; |
| 3193 | goto out_regex_unlock; | 3292 | goto out_regex_unlock; |
| 3194 | } | 3293 | } |
| 3294 | if (ip) { | ||
| 3295 | ret = ftrace_match_addr(hash, ip, remove); | ||
| 3296 | if (ret < 0) | ||
| 3297 | goto out_regex_unlock; | ||
| 3298 | } | ||
| 3195 | 3299 | ||
| 3196 | mutex_lock(&ftrace_lock); | 3300 | mutex_lock(&ftrace_lock); |
| 3197 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); | 3301 | ret = ftrace_hash_move(ops, enable, orig_hash, hash); |
| @@ -3208,6 +3312,37 @@ ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | |||
| 3208 | return ret; | 3312 | return ret; |
| 3209 | } | 3313 | } |
| 3210 | 3314 | ||
| 3315 | static int | ||
| 3316 | ftrace_set_addr(struct ftrace_ops *ops, unsigned long ip, int remove, | ||
| 3317 | int reset, int enable) | ||
| 3318 | { | ||
| 3319 | return ftrace_set_hash(ops, 0, 0, ip, remove, reset, enable); | ||
| 3320 | } | ||
| 3321 | |||
| 3322 | /** | ||
| 3323 | * ftrace_set_filter_ip - set a function to filter on in ftrace by address | ||
| 3324 | * @ops - the ops to set the filter with | ||
| 3325 | * @ip - the address to add to or remove from the filter. | ||
| 3326 | * @remove - non-zero to remove the ip from the filter | ||
| 3327 | * @reset - non-zero to reset all filters before applying this filter. | ||
| 3328 | * | ||
| 3329 | * Filters denote which functions should be enabled when tracing is enabled. | ||
| 3330 | * If @ip is NULL, it fails to update the filter. | ||
| 3331 | */ | ||
| 3332 | int ftrace_set_filter_ip(struct ftrace_ops *ops, unsigned long ip, | ||
| 3333 | int remove, int reset) | ||
| 3334 | { | ||
| 3335 | return ftrace_set_addr(ops, ip, remove, reset, 1); | ||
| 3336 | } | ||
| 3337 | EXPORT_SYMBOL_GPL(ftrace_set_filter_ip); | ||
| 3338 | |||
| 3339 | static int | ||
| 3340 | ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, | ||
| 3341 | int reset, int enable) | ||
| 3342 | { | ||
| 3343 | return ftrace_set_hash(ops, buf, len, 0, 0, reset, enable); | ||
| 3344 | } | ||
| 3345 | |||
| 3211 | /** | 3346 | /** |
| 3212 | * ftrace_set_filter - set a function to filter on in ftrace | 3347 | * ftrace_set_filter - set a function to filter on in ftrace |
| 3213 | * @ops - the ops to set the filter with | 3348 | * @ops - the ops to set the filter with |
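A usage sketch for the ftrace_set_filter_ip() interface added above, assuming a caller (for example a kprobe-style user) already has an initialized ftrace_ops; trace_one_address is a hypothetical helper and error handling is kept minimal:

#include <linux/ftrace.h>

static int trace_one_address(struct ftrace_ops *my_ops, unsigned long ip)
{
	int ret;

	/* remove=0, reset=0: add ip without disturbing existing filters */
	ret = ftrace_set_filter_ip(my_ops, ip, 0, 0);
	if (ret)
		return ret;

	ret = register_ftrace_function(my_ops);
	if (ret)
		/* undo the filter entry if registration failed */
		ftrace_set_filter_ip(my_ops, ip, 1, 0);
	return ret;
}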
| @@ -3912,6 +4047,7 @@ void __init ftrace_init(void) | |||
| 3912 | 4047 | ||
| 3913 | static struct ftrace_ops global_ops = { | 4048 | static struct ftrace_ops global_ops = { |
| 3914 | .func = ftrace_stub, | 4049 | .func = ftrace_stub, |
| 4050 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 3915 | }; | 4051 | }; |
| 3916 | 4052 | ||
| 3917 | static int __init ftrace_nodyn_init(void) | 4053 | static int __init ftrace_nodyn_init(void) |
| @@ -3942,10 +4078,9 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) | |||
| 3942 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 4078 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
| 3943 | 4079 | ||
| 3944 | static void | 4080 | static void |
| 3945 | ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) | 4081 | ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, |
| 4082 | struct ftrace_ops *op, struct pt_regs *regs) | ||
| 3946 | { | 4083 | { |
| 3947 | struct ftrace_ops *op; | ||
| 3948 | |||
| 3949 | if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) | 4084 | if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) |
| 3950 | return; | 4085 | return; |
| 3951 | 4086 | ||
| @@ -3959,7 +4094,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) | |||
| 3959 | while (op != &ftrace_list_end) { | 4094 | while (op != &ftrace_list_end) { |
| 3960 | if (!ftrace_function_local_disabled(op) && | 4095 | if (!ftrace_function_local_disabled(op) && |
| 3961 | ftrace_ops_test(op, ip)) | 4096 | ftrace_ops_test(op, ip)) |
| 3962 | op->func(ip, parent_ip); | 4097 | op->func(ip, parent_ip, op, regs); |
| 3963 | 4098 | ||
| 3964 | op = rcu_dereference_raw(op->next); | 4099 | op = rcu_dereference_raw(op->next); |
| 3965 | }; | 4100 | }; |
| @@ -3969,13 +4104,18 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip) | |||
| 3969 | 4104 | ||
| 3970 | static struct ftrace_ops control_ops = { | 4105 | static struct ftrace_ops control_ops = { |
| 3971 | .func = ftrace_ops_control_func, | 4106 | .func = ftrace_ops_control_func, |
| 4107 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 3972 | }; | 4108 | }; |
| 3973 | 4109 | ||
| 3974 | static void | 4110 | static inline void |
| 3975 | ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) | 4111 | __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, |
| 4112 | struct ftrace_ops *ignored, struct pt_regs *regs) | ||
| 3976 | { | 4113 | { |
| 3977 | struct ftrace_ops *op; | 4114 | struct ftrace_ops *op; |
| 3978 | 4115 | ||
| 4116 | if (function_trace_stop) | ||
| 4117 | return; | ||
| 4118 | |||
| 3979 | if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) | 4119 | if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) |
| 3980 | return; | 4120 | return; |
| 3981 | 4121 | ||
| @@ -3988,13 +4128,39 @@ ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) | |||
| 3988 | op = rcu_dereference_raw(ftrace_ops_list); | 4128 | op = rcu_dereference_raw(ftrace_ops_list); |
| 3989 | while (op != &ftrace_list_end) { | 4129 | while (op != &ftrace_list_end) { |
| 3990 | if (ftrace_ops_test(op, ip)) | 4130 | if (ftrace_ops_test(op, ip)) |
| 3991 | op->func(ip, parent_ip); | 4131 | op->func(ip, parent_ip, op, regs); |
| 3992 | op = rcu_dereference_raw(op->next); | 4132 | op = rcu_dereference_raw(op->next); |
| 3993 | }; | 4133 | }; |
| 3994 | preempt_enable_notrace(); | 4134 | preempt_enable_notrace(); |
| 3995 | trace_recursion_clear(TRACE_INTERNAL_BIT); | 4135 | trace_recursion_clear(TRACE_INTERNAL_BIT); |
| 3996 | } | 4136 | } |
| 3997 | 4137 | ||
| 4138 | /* | ||
| 4139 | * Some archs only support passing ip and parent_ip. Even though | ||
| 4140 | * the list function ignores the op parameter, we do not want any | ||
| 4141 | * C side effects, where a function is called without the caller | ||
| 4142 | * sending a third parameter. | ||
| 4143 | * Archs are expected to support both regs and ftrace_ops at the same time. | ||
| 4144 | * If they support ftrace_ops, it is assumed they support regs. | ||
| 4145 | * If callbacks want to use regs, they must either check for regs | ||
| 4146 | * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. | ||
| 4147 | * Note, ARCH_SUPPORTS_FTRACE_SAVE_REGS expects the full pt_regs to be saved. | ||
| 4148 | * An architecture can pass partial regs with ftrace_ops and still | ||
| 4149 | * set ARCH_SUPPORTS_FTRACE_OPS. | ||
| 4150 | */ | ||
| 4151 | #if ARCH_SUPPORTS_FTRACE_OPS | ||
| 4152 | static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, | ||
| 4153 | struct ftrace_ops *op, struct pt_regs *regs) | ||
| 4154 | { | ||
| 4155 | __ftrace_ops_list_func(ip, parent_ip, NULL, regs); | ||
| 4156 | } | ||
| 4157 | #else | ||
| 4158 | static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) | ||
| 4159 | { | ||
| 4160 | __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); | ||
| 4161 | } | ||
| 4162 | #endif | ||
| 4163 | |||
| 3998 | static void clear_ftrace_swapper(void) | 4164 | static void clear_ftrace_swapper(void) |
| 3999 | { | 4165 | { |
| 4000 | struct task_struct *p; | 4166 | struct task_struct *p; |
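Taken together, the ftrace.c changes move every callback to the four-argument prototype and pick ftrace_ops_list_func or ftrace_ops_no_ops at compile time. A hedged sketch of a callback written against the new prototype (my_trace_func and my_trace_ops are hypothetical); regs can be NULL or partial unless the ops asked for FTRACE_OPS_FL_SAVE_REGS on an arch that can save them:

#include <linux/ftrace.h>
#include <linux/ptrace.h>

static void my_trace_func(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *op, struct pt_regs *regs)
{
	if (!regs)
		return;		/* this arch did not hand us registers */
	/* inspect regs here */
}

static struct ftrace_ops my_trace_ops = {
	.func	= my_trace_func,
	/* callback does not re-enter traced code, so skip ftrace's guard */
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE,
};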
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 49491fa7daa2..b32ed0e385a5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
| @@ -2816,7 +2816,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable); | |||
| 2816 | * to the buffer after this will fail and return NULL. | 2816 | * to the buffer after this will fail and return NULL. |
| 2817 | * | 2817 | * |
| 2818 | * This is different than ring_buffer_record_disable() as | 2818 | * This is different than ring_buffer_record_disable() as |
| 2819 | * it works like an on/off switch, where as the disable() verison | 2819 | * it works like an on/off switch, where as the disable() version |
| 2820 | * must be paired with a enable(). | 2820 | * must be paired with a enable(). |
| 2821 | */ | 2821 | */ |
| 2822 | void ring_buffer_record_off(struct ring_buffer *buffer) | 2822 | void ring_buffer_record_off(struct ring_buffer *buffer) |
| @@ -2839,7 +2839,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_off); | |||
| 2839 | * ring_buffer_record_off(). | 2839 | * ring_buffer_record_off(). |
| 2840 | * | 2840 | * |
| 2841 | * This is different than ring_buffer_record_enable() as | 2841 | * This is different than ring_buffer_record_enable() as |
| 2842 | * it works like an on/off switch, where as the enable() verison | 2842 | * it works like an on/off switch, where as the enable() version |
| 2843 | * must be paired with a disable(). | 2843 | * must be paired with a disable(). |
| 2844 | */ | 2844 | */ |
| 2845 | void ring_buffer_record_on(struct ring_buffer *buffer) | 2845 | void ring_buffer_record_on(struct ring_buffer *buffer) |
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c38c81496ce..31e4f55773f1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
| @@ -328,7 +328,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); | |||
| 328 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | | 328 | unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | |
| 329 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | | 329 | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | |
| 330 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | | 330 | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | |
| 331 | TRACE_ITER_IRQ_INFO; | 331 | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; |
| 332 | 332 | ||
| 333 | static int trace_stop_count; | 333 | static int trace_stop_count; |
| 334 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); | 334 | static DEFINE_RAW_SPINLOCK(tracing_start_lock); |
| @@ -426,15 +426,15 @@ __setup("trace_buf_size=", set_buf_size); | |||
| 426 | 426 | ||
| 427 | static int __init set_tracing_thresh(char *str) | 427 | static int __init set_tracing_thresh(char *str) |
| 428 | { | 428 | { |
| 429 | unsigned long threshhold; | 429 | unsigned long threshold; |
| 430 | int ret; | 430 | int ret; |
| 431 | 431 | ||
| 432 | if (!str) | 432 | if (!str) |
| 433 | return 0; | 433 | return 0; |
| 434 | ret = strict_strtoul(str, 0, &threshhold); | 434 | ret = strict_strtoul(str, 0, &threshold); |
| 435 | if (ret < 0) | 435 | if (ret < 0) |
| 436 | return 0; | 436 | return 0; |
| 437 | tracing_thresh = threshhold * 1000; | 437 | tracing_thresh = threshold * 1000; |
| 438 | return 1; | 438 | return 1; |
| 439 | } | 439 | } |
| 440 | __setup("tracing_thresh=", set_tracing_thresh); | 440 | __setup("tracing_thresh=", set_tracing_thresh); |
| @@ -470,6 +470,7 @@ static const char *trace_options[] = { | |||
| 470 | "overwrite", | 470 | "overwrite", |
| 471 | "disable_on_free", | 471 | "disable_on_free", |
| 472 | "irq-info", | 472 | "irq-info", |
| 473 | "markers", | ||
| 473 | NULL | 474 | NULL |
| 474 | }; | 475 | }; |
| 475 | 476 | ||
| @@ -2060,7 +2061,8 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) | |||
| 2060 | seq_puts(m, "# -----------------\n"); | 2061 | seq_puts(m, "# -----------------\n"); |
| 2061 | seq_printf(m, "# | task: %.16s-%d " | 2062 | seq_printf(m, "# | task: %.16s-%d " |
| 2062 | "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", | 2063 | "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", |
| 2063 | data->comm, data->pid, data->uid, data->nice, | 2064 | data->comm, data->pid, |
| 2065 | from_kuid_munged(seq_user_ns(m), data->uid), data->nice, | ||
| 2064 | data->policy, data->rt_priority); | 2066 | data->policy, data->rt_priority); |
| 2065 | seq_puts(m, "# -----------------\n"); | 2067 | seq_puts(m, "# -----------------\n"); |
| 2066 | 2068 | ||
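The uid printed in the trace output header is now a kuid_t and is translated for whoever reads the file. A minimal sketch of that translation step (uid_for_reader is an invented helper name):

#include <linux/seq_file.h>
#include <linux/uidgid.h>
#include <linux/user_namespace.h>

static uid_t uid_for_reader(struct seq_file *m, kuid_t k_uid)
{
	/*
	 * Map the kernel-internal uid into the reader's user namespace;
	 * uids with no mapping there come back as the overflow uid.
	 */
	return from_kuid_munged(seq_user_ns(m), k_uid);
}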
| @@ -3886,6 +3888,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, | |||
| 3886 | if (tracing_disabled) | 3888 | if (tracing_disabled) |
| 3887 | return -EINVAL; | 3889 | return -EINVAL; |
| 3888 | 3890 | ||
| 3891 | if (!(trace_flags & TRACE_ITER_MARKERS)) | ||
| 3892 | return -EINVAL; | ||
| 3893 | |||
| 3889 | if (cnt > TRACE_BUF_SIZE) | 3894 | if (cnt > TRACE_BUF_SIZE) |
| 3890 | cnt = TRACE_BUF_SIZE; | 3895 | cnt = TRACE_BUF_SIZE; |
| 3891 | 3896 | ||
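With the check added above, writes to trace_marker can be rejected at run time by clearing the new markers option. A small userspace sketch of the effect (the debugfs mount point is an assumption; adjust the path if tracing is mounted elsewhere):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);

	if (fd < 0) {
		perror("open trace_marker");
		return 1;
	}
	/* fails with EINVAL once "echo 0 > options/markers" has been done */
	if (write(fd, "hello from userspace\n", 21) < 0)
		perror("write trace_marker");
	close(fd);
	return 0;
}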
| @@ -4195,12 +4200,6 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, | |||
| 4195 | buf->private = 0; | 4200 | buf->private = 0; |
| 4196 | } | 4201 | } |
| 4197 | 4202 | ||
| 4198 | static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, | ||
| 4199 | struct pipe_buffer *buf) | ||
| 4200 | { | ||
| 4201 | return 1; | ||
| 4202 | } | ||
| 4203 | |||
| 4204 | static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, | 4203 | static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, |
| 4205 | struct pipe_buffer *buf) | 4204 | struct pipe_buffer *buf) |
| 4206 | { | 4205 | { |
| @@ -4216,7 +4215,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = { | |||
| 4216 | .unmap = generic_pipe_buf_unmap, | 4215 | .unmap = generic_pipe_buf_unmap, |
| 4217 | .confirm = generic_pipe_buf_confirm, | 4216 | .confirm = generic_pipe_buf_confirm, |
| 4218 | .release = buffer_pipe_buf_release, | 4217 | .release = buffer_pipe_buf_release, |
| 4219 | .steal = buffer_pipe_buf_steal, | 4218 | .steal = generic_pipe_buf_steal, |
| 4220 | .get = buffer_pipe_buf_get, | 4219 | .get = buffer_pipe_buf_get, |
| 4221 | }; | 4220 | }; |
| 4222 | 4221 | ||
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 55e1f7f0db12..c15f528c1af4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
| @@ -147,7 +147,7 @@ struct trace_array_cpu { | |||
| 147 | unsigned long skipped_entries; | 147 | unsigned long skipped_entries; |
| 148 | cycle_t preempt_timestamp; | 148 | cycle_t preempt_timestamp; |
| 149 | pid_t pid; | 149 | pid_t pid; |
| 150 | uid_t uid; | 150 | kuid_t uid; |
| 151 | char comm[TASK_COMM_LEN]; | 151 | char comm[TASK_COMM_LEN]; |
| 152 | }; | 152 | }; |
| 153 | 153 | ||
| @@ -472,11 +472,11 @@ extern void trace_find_cmdline(int pid, char comm[]); | |||
| 472 | 472 | ||
| 473 | #ifdef CONFIG_DYNAMIC_FTRACE | 473 | #ifdef CONFIG_DYNAMIC_FTRACE |
| 474 | extern unsigned long ftrace_update_tot_cnt; | 474 | extern unsigned long ftrace_update_tot_cnt; |
| 475 | #endif | ||
| 475 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func | 476 | #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func |
| 476 | extern int DYN_FTRACE_TEST_NAME(void); | 477 | extern int DYN_FTRACE_TEST_NAME(void); |
| 477 | #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 | 478 | #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 |
| 478 | extern int DYN_FTRACE_TEST_NAME2(void); | 479 | extern int DYN_FTRACE_TEST_NAME2(void); |
| 479 | #endif | ||
| 480 | 480 | ||
| 481 | extern int ring_buffer_expanded; | 481 | extern int ring_buffer_expanded; |
| 482 | extern bool tracing_selftest_disabled; | 482 | extern bool tracing_selftest_disabled; |
| @@ -680,6 +680,7 @@ enum trace_iterator_flags { | |||
| 680 | TRACE_ITER_OVERWRITE = 0x200000, | 680 | TRACE_ITER_OVERWRITE = 0x200000, |
| 681 | TRACE_ITER_STOP_ON_FREE = 0x400000, | 681 | TRACE_ITER_STOP_ON_FREE = 0x400000, |
| 682 | TRACE_ITER_IRQ_INFO = 0x800000, | 682 | TRACE_ITER_IRQ_INFO = 0x800000, |
| 683 | TRACE_ITER_MARKERS = 0x1000000, | ||
| 683 | }; | 684 | }; |
| 684 | 685 | ||
| 685 | /* | 686 | /* |
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 8a6d2ee2086c..84b1e045faba 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c | |||
| @@ -258,7 +258,8 @@ EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); | |||
| 258 | 258 | ||
| 259 | #ifdef CONFIG_FUNCTION_TRACER | 259 | #ifdef CONFIG_FUNCTION_TRACER |
| 260 | static void | 260 | static void |
| 261 | perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip) | 261 | perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, |
| 262 | struct ftrace_ops *ops, struct pt_regs *pt_regs) | ||
| 262 | { | 263 | { |
| 263 | struct ftrace_entry *entry; | 264 | struct ftrace_entry *entry; |
| 264 | struct hlist_head *head; | 265 | struct hlist_head *head; |
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 29111da1d100..d608d09d08c0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
| @@ -1199,6 +1199,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
| 1199 | return 0; | 1199 | return 0; |
| 1200 | } | 1200 | } |
| 1201 | 1201 | ||
| 1202 | static void event_remove(struct ftrace_event_call *call) | ||
| 1203 | { | ||
| 1204 | ftrace_event_enable_disable(call, 0); | ||
| 1205 | if (call->event.funcs) | ||
| 1206 | __unregister_ftrace_event(&call->event); | ||
| 1207 | list_del(&call->list); | ||
| 1208 | } | ||
| 1209 | |||
| 1210 | static int event_init(struct ftrace_event_call *call) | ||
| 1211 | { | ||
| 1212 | int ret = 0; | ||
| 1213 | |||
| 1214 | if (WARN_ON(!call->name)) | ||
| 1215 | return -EINVAL; | ||
| 1216 | |||
| 1217 | if (call->class->raw_init) { | ||
| 1218 | ret = call->class->raw_init(call); | ||
| 1219 | if (ret < 0 && ret != -ENOSYS) | ||
| 1220 | pr_warn("Could not initialize trace events/%s\n", | ||
| 1221 | call->name); | ||
| 1222 | } | ||
| 1223 | |||
| 1224 | return ret; | ||
| 1225 | } | ||
| 1226 | |||
| 1202 | static int | 1227 | static int |
| 1203 | __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, | 1228 | __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, |
| 1204 | const struct file_operations *id, | 1229 | const struct file_operations *id, |
| @@ -1209,19 +1234,9 @@ __trace_add_event_call(struct ftrace_event_call *call, struct module *mod, | |||
| 1209 | struct dentry *d_events; | 1234 | struct dentry *d_events; |
| 1210 | int ret; | 1235 | int ret; |
| 1211 | 1236 | ||
| 1212 | /* The linker may leave blanks */ | 1237 | ret = event_init(call); |
| 1213 | if (!call->name) | 1238 | if (ret < 0) |
| 1214 | return -EINVAL; | 1239 | return ret; |
| 1215 | |||
| 1216 | if (call->class->raw_init) { | ||
| 1217 | ret = call->class->raw_init(call); | ||
| 1218 | if (ret < 0) { | ||
| 1219 | if (ret != -ENOSYS) | ||
| 1220 | pr_warning("Could not initialize trace events/%s\n", | ||
| 1221 | call->name); | ||
| 1222 | return ret; | ||
| 1223 | } | ||
| 1224 | } | ||
| 1225 | 1240 | ||
| 1226 | d_events = event_trace_events_dir(); | 1241 | d_events = event_trace_events_dir(); |
| 1227 | if (!d_events) | 1242 | if (!d_events) |
| @@ -1272,13 +1287,10 @@ static void remove_subsystem_dir(const char *name) | |||
| 1272 | */ | 1287 | */ |
| 1273 | static void __trace_remove_event_call(struct ftrace_event_call *call) | 1288 | static void __trace_remove_event_call(struct ftrace_event_call *call) |
| 1274 | { | 1289 | { |
| 1275 | ftrace_event_enable_disable(call, 0); | 1290 | event_remove(call); |
| 1276 | if (call->event.funcs) | ||
| 1277 | __unregister_ftrace_event(&call->event); | ||
| 1278 | debugfs_remove_recursive(call->dir); | ||
| 1279 | list_del(&call->list); | ||
| 1280 | trace_destroy_fields(call); | 1291 | trace_destroy_fields(call); |
| 1281 | destroy_preds(call); | 1292 | destroy_preds(call); |
| 1293 | debugfs_remove_recursive(call->dir); | ||
| 1282 | remove_subsystem_dir(call->class->system); | 1294 | remove_subsystem_dir(call->class->system); |
| 1283 | } | 1295 | } |
| 1284 | 1296 | ||
| @@ -1450,15 +1462,43 @@ static __init int setup_trace_event(char *str) | |||
| 1450 | } | 1462 | } |
| 1451 | __setup("trace_event=", setup_trace_event); | 1463 | __setup("trace_event=", setup_trace_event); |
| 1452 | 1464 | ||
| 1465 | static __init int event_trace_enable(void) | ||
| 1466 | { | ||
| 1467 | struct ftrace_event_call **iter, *call; | ||
| 1468 | char *buf = bootup_event_buf; | ||
| 1469 | char *token; | ||
| 1470 | int ret; | ||
| 1471 | |||
| 1472 | for_each_event(iter, __start_ftrace_events, __stop_ftrace_events) { | ||
| 1473 | |||
| 1474 | call = *iter; | ||
| 1475 | ret = event_init(call); | ||
| 1476 | if (!ret) | ||
| 1477 | list_add(&call->list, &ftrace_events); | ||
| 1478 | } | ||
| 1479 | |||
| 1480 | while (true) { | ||
| 1481 | token = strsep(&buf, ","); | ||
| 1482 | |||
| 1483 | if (!token) | ||
| 1484 | break; | ||
| 1485 | if (!*token) | ||
| 1486 | continue; | ||
| 1487 | |||
| 1488 | ret = ftrace_set_clr_event(token, 1); | ||
| 1489 | if (ret) | ||
| 1490 | pr_warn("Failed to enable trace event: %s\n", token); | ||
| 1491 | } | ||
| 1492 | return 0; | ||
| 1493 | } | ||
| 1494 | |||
| 1453 | static __init int event_trace_init(void) | 1495 | static __init int event_trace_init(void) |
| 1454 | { | 1496 | { |
| 1455 | struct ftrace_event_call **call; | 1497 | struct ftrace_event_call *call; |
| 1456 | struct dentry *d_tracer; | 1498 | struct dentry *d_tracer; |
| 1457 | struct dentry *entry; | 1499 | struct dentry *entry; |
| 1458 | struct dentry *d_events; | 1500 | struct dentry *d_events; |
| 1459 | int ret; | 1501 | int ret; |
| 1460 | char *buf = bootup_event_buf; | ||
| 1461 | char *token; | ||
| 1462 | 1502 | ||
| 1463 | d_tracer = tracing_init_dentry(); | 1503 | d_tracer = tracing_init_dentry(); |
| 1464 | if (!d_tracer) | 1504 | if (!d_tracer) |
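event_trace_enable() above walks the comma-separated trace_event= boot string with strsep(), skipping the empty tokens that stray commas produce. The same parsing pattern in a stand-alone userspace form (the sample event names are invented):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[] = "sched:sched_switch,,irq:irq_handler_entry";
	char *p = buf;
	char *token;

	while ((token = strsep(&p, ",")) != NULL) {
		if (!*token)
			continue;	/* skip empty tokens */
		printf("enable: %s\n", token);
	}
	return 0;
}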
| @@ -1497,24 +1537,19 @@ static __init int event_trace_init(void) | |||
| 1497 | if (trace_define_common_fields()) | 1537 | if (trace_define_common_fields()) |
| 1498 | pr_warning("tracing: Failed to allocate common fields"); | 1538 | pr_warning("tracing: Failed to allocate common fields"); |
| 1499 | 1539 | ||
| 1500 | for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { | 1540 | /* |
| 1501 | __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, | 1541 | * Early initialization already enabled ftrace event. |
| 1542 | * Now it's only necessary to create the event directory. | ||
| 1543 | */ | ||
| 1544 | list_for_each_entry(call, &ftrace_events, list) { | ||
| 1545 | |||
| 1546 | ret = event_create_dir(call, d_events, | ||
| 1547 | &ftrace_event_id_fops, | ||
| 1502 | &ftrace_enable_fops, | 1548 | &ftrace_enable_fops, |
| 1503 | &ftrace_event_filter_fops, | 1549 | &ftrace_event_filter_fops, |
| 1504 | &ftrace_event_format_fops); | 1550 | &ftrace_event_format_fops); |
| 1505 | } | 1551 | if (ret < 0) |
| 1506 | 1552 | event_remove(call); | |
| 1507 | while (true) { | ||
| 1508 | token = strsep(&buf, ","); | ||
| 1509 | |||
| 1510 | if (!token) | ||
| 1511 | break; | ||
| 1512 | if (!*token) | ||
| 1513 | continue; | ||
| 1514 | |||
| 1515 | ret = ftrace_set_clr_event(token, 1); | ||
| 1516 | if (ret) | ||
| 1517 | pr_warning("Failed to enable trace event: %s\n", token); | ||
| 1518 | } | 1553 | } |
| 1519 | 1554 | ||
| 1520 | ret = register_module_notifier(&trace_module_nb); | 1555 | ret = register_module_notifier(&trace_module_nb); |
| @@ -1523,6 +1558,7 @@ static __init int event_trace_init(void) | |||
| 1523 | 1558 | ||
| 1524 | return 0; | 1559 | return 0; |
| 1525 | } | 1560 | } |
| 1561 | core_initcall(event_trace_enable); | ||
| 1526 | fs_initcall(event_trace_init); | 1562 | fs_initcall(event_trace_init); |
| 1527 | 1563 | ||
| 1528 | #ifdef CONFIG_FTRACE_STARTUP_TEST | 1564 | #ifdef CONFIG_FTRACE_STARTUP_TEST |
| @@ -1646,9 +1682,11 @@ static __init void event_trace_self_tests(void) | |||
| 1646 | event_test_stuff(); | 1682 | event_test_stuff(); |
| 1647 | 1683 | ||
| 1648 | ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); | 1684 | ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); |
| 1649 | if (WARN_ON_ONCE(ret)) | 1685 | if (WARN_ON_ONCE(ret)) { |
| 1650 | pr_warning("error disabling system %s\n", | 1686 | pr_warning("error disabling system %s\n", |
| 1651 | system->name); | 1687 | system->name); |
| 1688 | continue; | ||
| 1689 | } | ||
| 1652 | 1690 | ||
| 1653 | pr_cont("OK\n"); | 1691 | pr_cont("OK\n"); |
| 1654 | } | 1692 | } |
| @@ -1681,7 +1719,8 @@ static __init void event_trace_self_tests(void) | |||
| 1681 | static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); | 1719 | static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); |
| 1682 | 1720 | ||
| 1683 | static void | 1721 | static void |
| 1684 | function_test_events_call(unsigned long ip, unsigned long parent_ip) | 1722 | function_test_events_call(unsigned long ip, unsigned long parent_ip, |
| 1723 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
| 1685 | { | 1724 | { |
| 1686 | struct ring_buffer_event *event; | 1725 | struct ring_buffer_event *event; |
| 1687 | struct ring_buffer *buffer; | 1726 | struct ring_buffer *buffer; |
| @@ -1720,6 +1759,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) | |||
| 1720 | static struct ftrace_ops trace_ops __initdata = | 1759 | static struct ftrace_ops trace_ops __initdata = |
| 1721 | { | 1760 | { |
| 1722 | .func = function_test_events_call, | 1761 | .func = function_test_events_call, |
| 1762 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 1723 | }; | 1763 | }; |
| 1724 | 1764 | ||
| 1725 | static __init void event_trace_self_test_with_function(void) | 1765 | static __init void event_trace_self_test_with_function(void) |
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 431dba8b7542..c154797a7ff7 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
| @@ -2002,7 +2002,7 @@ static int ftrace_function_set_regexp(struct ftrace_ops *ops, int filter, | |||
| 2002 | static int __ftrace_function_set_filter(int filter, char *buf, int len, | 2002 | static int __ftrace_function_set_filter(int filter, char *buf, int len, |
| 2003 | struct function_filter_data *data) | 2003 | struct function_filter_data *data) |
| 2004 | { | 2004 | { |
| 2005 | int i, re_cnt, ret; | 2005 | int i, re_cnt, ret = -EINVAL; |
| 2006 | int *reset; | 2006 | int *reset; |
| 2007 | char **re; | 2007 | char **re; |
| 2008 | 2008 | ||
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index a426f410c060..507a7a9630bf 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c | |||
| @@ -13,7 +13,6 @@ | |||
| 13 | #include <linux/debugfs.h> | 13 | #include <linux/debugfs.h> |
| 14 | #include <linux/uaccess.h> | 14 | #include <linux/uaccess.h> |
| 15 | #include <linux/ftrace.h> | 15 | #include <linux/ftrace.h> |
| 16 | #include <linux/pstore.h> | ||
| 17 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
| 18 | 17 | ||
| 19 | #include "trace.h" | 18 | #include "trace.h" |
| @@ -49,7 +48,8 @@ static void function_trace_start(struct trace_array *tr) | |||
| 49 | } | 48 | } |
| 50 | 49 | ||
| 51 | static void | 50 | static void |
| 52 | function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | 51 | function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip, |
| 52 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
| 53 | { | 53 | { |
| 54 | struct trace_array *tr = func_trace; | 54 | struct trace_array *tr = func_trace; |
| 55 | struct trace_array_cpu *data; | 55 | struct trace_array_cpu *data; |
| @@ -75,16 +75,17 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) | |||
| 75 | preempt_enable_notrace(); | 75 | preempt_enable_notrace(); |
| 76 | } | 76 | } |
| 77 | 77 | ||
| 78 | /* Our two options */ | 78 | /* Our option */ |
| 79 | enum { | 79 | enum { |
| 80 | TRACE_FUNC_OPT_STACK = 0x1, | 80 | TRACE_FUNC_OPT_STACK = 0x1, |
| 81 | TRACE_FUNC_OPT_PSTORE = 0x2, | ||
| 82 | }; | 81 | }; |
| 83 | 82 | ||
| 84 | static struct tracer_flags func_flags; | 83 | static struct tracer_flags func_flags; |
| 85 | 84 | ||
| 86 | static void | 85 | static void |
| 87 | function_trace_call(unsigned long ip, unsigned long parent_ip) | 86 | function_trace_call(unsigned long ip, unsigned long parent_ip, |
| 87 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
| 88 | |||
| 88 | { | 89 | { |
| 89 | struct trace_array *tr = func_trace; | 90 | struct trace_array *tr = func_trace; |
| 90 | struct trace_array_cpu *data; | 91 | struct trace_array_cpu *data; |
| @@ -106,12 +107,6 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) | |||
| 106 | disabled = atomic_inc_return(&data->disabled); | 107 | disabled = atomic_inc_return(&data->disabled); |
| 107 | 108 | ||
| 108 | if (likely(disabled == 1)) { | 109 | if (likely(disabled == 1)) { |
| 109 | /* | ||
| 110 | * So far tracing doesn't support multiple buffers, so | ||
| 111 | * we make an explicit call for now. | ||
| 112 | */ | ||
| 113 | if (unlikely(func_flags.val & TRACE_FUNC_OPT_PSTORE)) | ||
| 114 | pstore_ftrace_call(ip, parent_ip); | ||
| 115 | pc = preempt_count(); | 110 | pc = preempt_count(); |
| 116 | trace_function(tr, ip, parent_ip, flags, pc); | 111 | trace_function(tr, ip, parent_ip, flags, pc); |
| 117 | } | 112 | } |
| @@ -121,7 +116,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip) | |||
| 121 | } | 116 | } |
| 122 | 117 | ||
| 123 | static void | 118 | static void |
| 124 | function_stack_trace_call(unsigned long ip, unsigned long parent_ip) | 119 | function_stack_trace_call(unsigned long ip, unsigned long parent_ip, |
| 120 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
| 125 | { | 121 | { |
| 126 | struct trace_array *tr = func_trace; | 122 | struct trace_array *tr = func_trace; |
| 127 | struct trace_array_cpu *data; | 123 | struct trace_array_cpu *data; |
| @@ -164,22 +160,19 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
| 164 | static struct ftrace_ops trace_ops __read_mostly = | 160 | static struct ftrace_ops trace_ops __read_mostly = |
| 165 | { | 161 | { |
| 166 | .func = function_trace_call, | 162 | .func = function_trace_call, |
| 167 | .flags = FTRACE_OPS_FL_GLOBAL, | 163 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, |
| 168 | }; | 164 | }; |
| 169 | 165 | ||
| 170 | static struct ftrace_ops trace_stack_ops __read_mostly = | 166 | static struct ftrace_ops trace_stack_ops __read_mostly = |
| 171 | { | 167 | { |
| 172 | .func = function_stack_trace_call, | 168 | .func = function_stack_trace_call, |
| 173 | .flags = FTRACE_OPS_FL_GLOBAL, | 169 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, |
| 174 | }; | 170 | }; |
| 175 | 171 | ||
| 176 | static struct tracer_opt func_opts[] = { | 172 | static struct tracer_opt func_opts[] = { |
| 177 | #ifdef CONFIG_STACKTRACE | 173 | #ifdef CONFIG_STACKTRACE |
| 178 | { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, | 174 | { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, |
| 179 | #endif | 175 | #endif |
| 180 | #ifdef CONFIG_PSTORE_FTRACE | ||
| 181 | { TRACER_OPT(func_pstore, TRACE_FUNC_OPT_PSTORE) }, | ||
| 182 | #endif | ||
| 183 | { } /* Always set a last empty entry */ | 176 | { } /* Always set a last empty entry */ |
| 184 | }; | 177 | }; |
| 185 | 178 | ||
| @@ -232,8 +225,6 @@ static int func_set_flag(u32 old_flags, u32 bit, int set) | |||
| 232 | } | 225 | } |
| 233 | 226 | ||
| 234 | break; | 227 | break; |
| 235 | case TRACE_FUNC_OPT_PSTORE: | ||
| 236 | break; | ||
| 237 | default: | 228 | default: |
| 238 | return -EINVAL; | 229 | return -EINVAL; |
| 239 | } | 230 | } |
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ce27c8ba8d31..99b4378393d5 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c | |||
| @@ -143,7 +143,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, | |||
| 143 | return; | 143 | return; |
| 144 | } | 144 | } |
| 145 | 145 | ||
| 146 | #ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST | 146 | #if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) |
| 147 | /* | 147 | /* |
| 148 | * The arch may choose to record the frame pointer used | 148 | * The arch may choose to record the frame pointer used |
| 149 | * and check it here to make sure that it is what we expect it | 149 | * and check it here to make sure that it is what we expect it |
| @@ -154,6 +154,9 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, | |||
| 154 | * | 154 | * |
| 155 | * Currently, x86_32 with optimize for size (-Os) makes the latest | 155 | * Currently, x86_32 with optimize for size (-Os) makes the latest |
| 156 | * gcc do the above. | 156 | * gcc do the above. |
| 157 | * | ||
| 158 | * Note, -mfentry does not use frame pointers, and this test | ||
| 159 | * is not needed if CC_USING_FENTRY is set. | ||
| 157 | */ | 160 | */ |
| 158 | if (unlikely(current->ret_stack[index].fp != frame_pointer)) { | 161 | if (unlikely(current->ret_stack[index].fp != frame_pointer)) { |
| 159 | ftrace_graph_stop(); | 162 | ftrace_graph_stop(); |
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 99d20e920368..d98ee8283b29 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c | |||
| @@ -136,7 +136,8 @@ static int func_prolog_dec(struct trace_array *tr, | |||
| 136 | * irqsoff uses its own tracer function to keep the overhead down: | 136 | * irqsoff uses its own tracer function to keep the overhead down: |
| 137 | */ | 137 | */ |
| 138 | static void | 138 | static void |
| 139 | irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | 139 | irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, |
| 140 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
| 140 | { | 141 | { |
| 141 | struct trace_array *tr = irqsoff_trace; | 142 | struct trace_array *tr = irqsoff_trace; |
| 142 | struct trace_array_cpu *data; | 143 | struct trace_array_cpu *data; |
| @@ -153,7 +154,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
| 153 | static struct ftrace_ops trace_ops __read_mostly = | 154 | static struct ftrace_ops trace_ops __read_mostly = |
| 154 | { | 155 | { |
| 155 | .func = irqsoff_tracer_call, | 156 | .func = irqsoff_tracer_call, |
| 156 | .flags = FTRACE_OPS_FL_GLOBAL, | 157 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, |
| 157 | }; | 158 | }; |
| 158 | #endif /* CONFIG_FUNCTION_TRACER */ | 159 | #endif /* CONFIG_FUNCTION_TRACER */ |
| 159 | 160 | ||
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index ff791ea48b57..02170c00c413 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c | |||
| @@ -108,7 +108,8 @@ out_enable: | |||
| 108 | * wakeup uses its own tracer function to keep the overhead down: | 108 | * wakeup uses its own tracer function to keep the overhead down: |
| 109 | */ | 109 | */ |
| 110 | static void | 110 | static void |
| 111 | wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | 111 | wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, |
| 112 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
| 112 | { | 113 | { |
| 113 | struct trace_array *tr = wakeup_trace; | 114 | struct trace_array *tr = wakeup_trace; |
| 114 | struct trace_array_cpu *data; | 115 | struct trace_array_cpu *data; |
| @@ -129,7 +130,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) | |||
| 129 | static struct ftrace_ops trace_ops __read_mostly = | 130 | static struct ftrace_ops trace_ops __read_mostly = |
| 130 | { | 131 | { |
| 131 | .func = wakeup_tracer_call, | 132 | .func = wakeup_tracer_call, |
| 132 | .flags = FTRACE_OPS_FL_GLOBAL, | 133 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, |
| 133 | }; | 134 | }; |
| 134 | #endif /* CONFIG_FUNCTION_TRACER */ | 135 | #endif /* CONFIG_FUNCTION_TRACER */ |
| 135 | 136 | ||
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 288541f977fb..2c00a691a540 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c | |||
| @@ -103,54 +103,67 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) | |||
| 103 | 103 | ||
| 104 | static int trace_selftest_test_probe1_cnt; | 104 | static int trace_selftest_test_probe1_cnt; |
| 105 | static void trace_selftest_test_probe1_func(unsigned long ip, | 105 | static void trace_selftest_test_probe1_func(unsigned long ip, |
| 106 | unsigned long pip) | 106 | unsigned long pip, |
| 107 | struct ftrace_ops *op, | ||
| 108 | struct pt_regs *pt_regs) | ||
| 107 | { | 109 | { |
| 108 | trace_selftest_test_probe1_cnt++; | 110 | trace_selftest_test_probe1_cnt++; |
| 109 | } | 111 | } |
| 110 | 112 | ||
| 111 | static int trace_selftest_test_probe2_cnt; | 113 | static int trace_selftest_test_probe2_cnt; |
| 112 | static void trace_selftest_test_probe2_func(unsigned long ip, | 114 | static void trace_selftest_test_probe2_func(unsigned long ip, |
| 113 | unsigned long pip) | 115 | unsigned long pip, |
| 116 | struct ftrace_ops *op, | ||
| 117 | struct pt_regs *pt_regs) | ||
| 114 | { | 118 | { |
| 115 | trace_selftest_test_probe2_cnt++; | 119 | trace_selftest_test_probe2_cnt++; |
| 116 | } | 120 | } |
| 117 | 121 | ||
| 118 | static int trace_selftest_test_probe3_cnt; | 122 | static int trace_selftest_test_probe3_cnt; |
| 119 | static void trace_selftest_test_probe3_func(unsigned long ip, | 123 | static void trace_selftest_test_probe3_func(unsigned long ip, |
| 120 | unsigned long pip) | 124 | unsigned long pip, |
| 125 | struct ftrace_ops *op, | ||
| 126 | struct pt_regs *pt_regs) | ||
| 121 | { | 127 | { |
| 122 | trace_selftest_test_probe3_cnt++; | 128 | trace_selftest_test_probe3_cnt++; |
| 123 | } | 129 | } |
| 124 | 130 | ||
| 125 | static int trace_selftest_test_global_cnt; | 131 | static int trace_selftest_test_global_cnt; |
| 126 | static void trace_selftest_test_global_func(unsigned long ip, | 132 | static void trace_selftest_test_global_func(unsigned long ip, |
| 127 | unsigned long pip) | 133 | unsigned long pip, |
| 134 | struct ftrace_ops *op, | ||
| 135 | struct pt_regs *pt_regs) | ||
| 128 | { | 136 | { |
| 129 | trace_selftest_test_global_cnt++; | 137 | trace_selftest_test_global_cnt++; |
| 130 | } | 138 | } |
| 131 | 139 | ||
| 132 | static int trace_selftest_test_dyn_cnt; | 140 | static int trace_selftest_test_dyn_cnt; |
| 133 | static void trace_selftest_test_dyn_func(unsigned long ip, | 141 | static void trace_selftest_test_dyn_func(unsigned long ip, |
| 134 | unsigned long pip) | 142 | unsigned long pip, |
| 143 | struct ftrace_ops *op, | ||
| 144 | struct pt_regs *pt_regs) | ||
| 135 | { | 145 | { |
| 136 | trace_selftest_test_dyn_cnt++; | 146 | trace_selftest_test_dyn_cnt++; |
| 137 | } | 147 | } |
| 138 | 148 | ||
| 139 | static struct ftrace_ops test_probe1 = { | 149 | static struct ftrace_ops test_probe1 = { |
| 140 | .func = trace_selftest_test_probe1_func, | 150 | .func = trace_selftest_test_probe1_func, |
| 151 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 141 | }; | 152 | }; |
| 142 | 153 | ||
| 143 | static struct ftrace_ops test_probe2 = { | 154 | static struct ftrace_ops test_probe2 = { |
| 144 | .func = trace_selftest_test_probe2_func, | 155 | .func = trace_selftest_test_probe2_func, |
| 156 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 145 | }; | 157 | }; |
| 146 | 158 | ||
| 147 | static struct ftrace_ops test_probe3 = { | 159 | static struct ftrace_ops test_probe3 = { |
| 148 | .func = trace_selftest_test_probe3_func, | 160 | .func = trace_selftest_test_probe3_func, |
| 161 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 149 | }; | 162 | }; |
| 150 | 163 | ||
| 151 | static struct ftrace_ops test_global = { | 164 | static struct ftrace_ops test_global = { |
| 152 | .func = trace_selftest_test_global_func, | 165 | .func = trace_selftest_test_global_func, |
| 153 | .flags = FTRACE_OPS_FL_GLOBAL, | 166 | .flags = FTRACE_OPS_FL_GLOBAL | FTRACE_OPS_FL_RECURSION_SAFE, |
| 154 | }; | 167 | }; |
| 155 | 168 | ||
| 156 | static void print_counts(void) | 169 | static void print_counts(void) |
| @@ -393,10 +406,253 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, | |||
| 393 | 406 | ||
| 394 | return ret; | 407 | return ret; |
| 395 | } | 408 | } |
| 409 | |||
| 410 | static int trace_selftest_recursion_cnt; | ||
| 411 | static void trace_selftest_test_recursion_func(unsigned long ip, | ||
| 412 | unsigned long pip, | ||
| 413 | struct ftrace_ops *op, | ||
| 414 | struct pt_regs *pt_regs) | ||
| 415 | { | ||
| 416 | /* | ||
| 417 | * This function is registered without the recursion safe flag. | ||
| 418 | * The ftrace infrastructure should provide the recursion | ||
| 419 | * protection. If not, this will crash the kernel! | ||
| 420 | */ | ||
| 421 | trace_selftest_recursion_cnt++; | ||
| 422 | DYN_FTRACE_TEST_NAME(); | ||
| 423 | } | ||
| 424 | |||
| 425 | static void trace_selftest_test_recursion_safe_func(unsigned long ip, | ||
| 426 | unsigned long pip, | ||
| 427 | struct ftrace_ops *op, | ||
| 428 | struct pt_regs *pt_regs) | ||
| 429 | { | ||
| 430 | /* | ||
| 431 | * We said we would provide our own recursion protection. By calling | ||
| 432 | * this function again, we should recurse back into this function | ||
| 433 | * and count again. But this only happens if the arch supports | ||
| 434 | * all ftrace features and nothing else is using the function | ||
| 435 | * tracing utility. | ||
| 436 | */ | ||
| 437 | if (trace_selftest_recursion_cnt++) | ||
| 438 | return; | ||
| 439 | DYN_FTRACE_TEST_NAME(); | ||
| 440 | } | ||
| 441 | |||
| 442 | static struct ftrace_ops test_rec_probe = { | ||
| 443 | .func = trace_selftest_test_recursion_func, | ||
| 444 | }; | ||
| 445 | |||
| 446 | static struct ftrace_ops test_recsafe_probe = { | ||
| 447 | .func = trace_selftest_test_recursion_safe_func, | ||
| 448 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 449 | }; | ||
| 450 | |||
| 451 | static int | ||
| 452 | trace_selftest_function_recursion(void) | ||
| 453 | { | ||
| 454 | int save_ftrace_enabled = ftrace_enabled; | ||
| 455 | int save_tracer_enabled = tracer_enabled; | ||
| 456 | char *func_name; | ||
| 457 | int len; | ||
| 458 | int ret; | ||
| 459 | int cnt; | ||
| 460 | |||
| 461 | /* The previous test PASSED */ | ||
| 462 | pr_cont("PASSED\n"); | ||
| 463 | pr_info("Testing ftrace recursion: "); | ||
| 464 | |||
| 465 | |||
| 466 | /* enable tracing, and record the filter function */ | ||
| 467 | ftrace_enabled = 1; | ||
| 468 | tracer_enabled = 1; | ||
| 469 | |||
| 470 | /* Handle PPC64 '.' name */ | ||
| 471 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | ||
| 472 | len = strlen(func_name); | ||
| 473 | |||
| 474 | ret = ftrace_set_filter(&test_rec_probe, func_name, len, 1); | ||
| 475 | if (ret) { | ||
| 476 | pr_cont("*Could not set filter* "); | ||
| 477 | goto out; | ||
| 478 | } | ||
| 479 | |||
| 480 | ret = register_ftrace_function(&test_rec_probe); | ||
| 481 | if (ret) { | ||
| 482 | pr_cont("*could not register callback* "); | ||
| 483 | goto out; | ||
| 484 | } | ||
| 485 | |||
| 486 | DYN_FTRACE_TEST_NAME(); | ||
| 487 | |||
| 488 | unregister_ftrace_function(&test_rec_probe); | ||
| 489 | |||
| 490 | ret = -1; | ||
| 491 | if (trace_selftest_recursion_cnt != 1) { | ||
| 492 | pr_cont("*callback not called once (%d)* ", | ||
| 493 | trace_selftest_recursion_cnt); | ||
| 494 | goto out; | ||
| 495 | } | ||
| 496 | |||
| 497 | trace_selftest_recursion_cnt = 1; | ||
| 498 | |||
| 499 | pr_cont("PASSED\n"); | ||
| 500 | pr_info("Testing ftrace recursion safe: "); | ||
| 501 | |||
| 502 | ret = ftrace_set_filter(&test_recsafe_probe, func_name, len, 1); | ||
| 503 | if (ret) { | ||
| 504 | pr_cont("*Could not set filter* "); | ||
| 505 | goto out; | ||
| 506 | } | ||
| 507 | |||
| 508 | ret = register_ftrace_function(&test_recsafe_probe); | ||
| 509 | if (ret) { | ||
| 510 | pr_cont("*could not register callback* "); | ||
| 511 | goto out; | ||
| 512 | } | ||
| 513 | |||
| 514 | DYN_FTRACE_TEST_NAME(); | ||
| 515 | |||
| 516 | unregister_ftrace_function(&test_recsafe_probe); | ||
| 517 | |||
| 518 | /* | ||
| 519 | * If arch supports all ftrace features, and no other task | ||
| 520 | * was on the list, we should be fine. | ||
| 521 | */ | ||
| 522 | if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC) | ||
| 523 | cnt = 2; /* Should have recursed */ | ||
| 524 | else | ||
| 525 | cnt = 1; | ||
| 526 | |||
| 527 | ret = -1; | ||
| 528 | if (trace_selftest_recursion_cnt != cnt) { | ||
| 529 | pr_cont("*callback not called expected %d times (%d)* ", | ||
| 530 | cnt, trace_selftest_recursion_cnt); | ||
| 531 | goto out; | ||
| 532 | } | ||
| 533 | |||
| 534 | ret = 0; | ||
| 535 | out: | ||
| 536 | ftrace_enabled = save_ftrace_enabled; | ||
| 537 | tracer_enabled = save_tracer_enabled; | ||
| 538 | |||
| 539 | return ret; | ||
| 540 | } | ||
| 396 | #else | 541 | #else |
| 397 | # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) | 542 | # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) |
| 543 | # define trace_selftest_function_recursion() ({ 0; }) | ||
| 398 | #endif /* CONFIG_DYNAMIC_FTRACE */ | 544 | #endif /* CONFIG_DYNAMIC_FTRACE */ |
| 399 | 545 | ||
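The recursion selftests above spell out the contract behind FTRACE_OPS_FL_RECURSION_SAFE: without the flag, ftrace wraps the callback in its own recursion guard; with it, the callback promises to protect itself. An illustrative-only sketch of such self-protection (names are invented, interrupt-level recursion is ignored for brevity, and the callback is assumed to run with preemption disabled, as the list function does):

#include <linux/ftrace.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_cb_busy);

static void my_recursion_safe_func(unsigned long ip, unsigned long parent_ip,
				   struct ftrace_ops *op, struct pt_regs *regs)
{
	int *busy = this_cpu_ptr(&my_cb_busy);

	if (*busy)
		return;		/* already inside the callback on this CPU */
	(*busy)++;

	/* ... work that may itself call traced functions ... */

	(*busy)--;
}

static struct ftrace_ops my_recursion_safe_ops = {
	.func	= my_recursion_safe_func,
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE,
};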
| 546 | static enum { | ||
| 547 | TRACE_SELFTEST_REGS_START, | ||
| 548 | TRACE_SELFTEST_REGS_FOUND, | ||
| 549 | TRACE_SELFTEST_REGS_NOT_FOUND, | ||
| 550 | } trace_selftest_regs_stat; | ||
| 551 | |||
| 552 | static void trace_selftest_test_regs_func(unsigned long ip, | ||
| 553 | unsigned long pip, | ||
| 554 | struct ftrace_ops *op, | ||
| 555 | struct pt_regs *pt_regs) | ||
| 556 | { | ||
| 557 | if (pt_regs) | ||
| 558 | trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; | ||
| 559 | else | ||
| 560 | trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; | ||
| 561 | } | ||
| 562 | |||
| 563 | static struct ftrace_ops test_regs_probe = { | ||
| 564 | .func = trace_selftest_test_regs_func, | ||
| 565 | .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, | ||
| 566 | }; | ||
| 567 | |||
| 568 | static int | ||
| 569 | trace_selftest_function_regs(void) | ||
| 570 | { | ||
| 571 | int save_ftrace_enabled = ftrace_enabled; | ||
| 572 | int save_tracer_enabled = tracer_enabled; | ||
| 573 | char *func_name; | ||
| 574 | int len; | ||
| 575 | int ret; | ||
| 576 | int supported = 0; | ||
| 577 | |||
| 578 | #ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS | ||
| 579 | supported = 1; | ||
| 580 | #endif | ||
| 581 | |||
| 582 | /* The previous test PASSED */ | ||
| 583 | pr_cont("PASSED\n"); | ||
| 584 | pr_info("Testing ftrace regs%s: ", | ||
| 585 | !supported ? "(no arch support)" : ""); | ||
| 586 | |||
| 587 | /* enable tracing, and record the filter function */ | ||
| 588 | ftrace_enabled = 1; | ||
| 589 | tracer_enabled = 1; | ||
| 590 | |||
| 591 | /* Handle PPC64 '.' name */ | ||
| 592 | func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); | ||
| 593 | len = strlen(func_name); | ||
| 594 | |||
| 595 | ret = ftrace_set_filter(&test_regs_probe, func_name, len, 1); | ||
| 596 | /* | ||
| 597 | * If DYNAMIC_FTRACE is not set, then we just trace all functions. | ||
| 598 | * This test really doesn't care. | ||
| 599 | */ | ||
| 600 | if (ret && ret != -ENODEV) { | ||
| 601 | pr_cont("*Could not set filter* "); | ||
| 602 | goto out; | ||
| 603 | } | ||
| 604 | |||
| 605 | ret = register_ftrace_function(&test_regs_probe); | ||
| 606 | /* | ||
| 607 | * Now if the arch does not support passing regs, then this should | ||
| 608 | * have failed. | ||
| 609 | */ | ||
| 610 | if (!supported) { | ||
| 611 | if (!ret) { | ||
| 612 | pr_cont("*registered save-regs without arch support* "); | ||
| 613 | goto out; | ||
| 614 | } | ||
| 615 | test_regs_probe.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED; | ||
| 616 | ret = register_ftrace_function(&test_regs_probe); | ||
| 617 | } | ||
| 618 | if (ret) { | ||
| 619 | pr_cont("*could not register callback* "); | ||
| 620 | goto out; | ||
| 621 | } | ||
| 622 | |||
| 623 | |||
| 624 | DYN_FTRACE_TEST_NAME(); | ||
| 625 | |||
| 626 | unregister_ftrace_function(&test_regs_probe); | ||
| 627 | |||
| 628 | ret = -1; | ||
| 629 | |||
| 630 | switch (trace_selftest_regs_stat) { | ||
| 631 | case TRACE_SELFTEST_REGS_START: | ||
| 632 | pr_cont("*callback never called* "); | ||
| 633 | goto out; | ||
| 634 | |||
| 635 | case TRACE_SELFTEST_REGS_FOUND: | ||
| 636 | if (supported) | ||
| 637 | break; | ||
| 638 | pr_cont("*callback received regs without arch support* "); | ||
| 639 | goto out; | ||
| 640 | |||
| 641 | case TRACE_SELFTEST_REGS_NOT_FOUND: | ||
| 642 | if (!supported) | ||
| 643 | break; | ||
| 644 | pr_cont("*callback received NULL regs* "); | ||
| 645 | goto out; | ||
| 646 | } | ||
| 647 | |||
| 648 | ret = 0; | ||
| 649 | out: | ||
| 650 | ftrace_enabled = save_ftrace_enabled; | ||
| 651 | tracer_enabled = save_tracer_enabled; | ||
| 652 | |||
| 653 | return ret; | ||
| 654 | } | ||
| 655 | |||
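Outside the selftest, a caller that wants registers but still has to work on arches that cannot save them can use the same fallback the test exercises: ask for FTRACE_OPS_FL_SAVE_REGS, and if registration fails, retry with SAVE_REGS_IF_SUPPORTED and treat regs == NULL as "not available". A hedged sketch (the my_probe names are invented):

#include <linux/ftrace.h>
#include <linux/ptrace.h>

static void my_probe(unsigned long ip, unsigned long parent_ip,
		     struct ftrace_ops *op, struct pt_regs *regs)
{
	if (!regs)
		return;		/* arch could not save full registers */
	/* full register state is available here */
}

static struct ftrace_ops my_probe_ops = {
	.func	= my_probe,
	.flags	= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS,
};

static int my_probe_register(void)
{
	int ret = register_ftrace_function(&my_probe_ops);

	if (ret) {
		/* fall back: take regs only if the arch can provide them */
		my_probe_ops.flags &= ~FTRACE_OPS_FL_SAVE_REGS;
		my_probe_ops.flags |= FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED;
		ret = register_ftrace_function(&my_probe_ops);
	}
	return ret;
}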
| 400 | /* | 656 | /* |
| 401 | * Simple verification test of ftrace function tracer. | 657 | * Simple verification test of ftrace function tracer. |
| 402 | * Enable ftrace, sleep 1/10 second, and then read the trace | 658 | * Enable ftrace, sleep 1/10 second, and then read the trace |
| @@ -442,7 +698,14 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) | |||
| 442 | 698 | ||
| 443 | ret = trace_selftest_startup_dynamic_tracing(trace, tr, | 699 | ret = trace_selftest_startup_dynamic_tracing(trace, tr, |
| 444 | DYN_FTRACE_TEST_NAME); | 700 | DYN_FTRACE_TEST_NAME); |
| 701 | if (ret) | ||
| 702 | goto out; | ||
| 445 | 703 | ||
| 704 | ret = trace_selftest_function_recursion(); | ||
| 705 | if (ret) | ||
| 706 | goto out; | ||
| 707 | |||
| 708 | ret = trace_selftest_function_regs(); | ||
| 446 | out: | 709 | out: |
| 447 | ftrace_enabled = save_ftrace_enabled; | 710 | ftrace_enabled = save_ftrace_enabled; |
| 448 | tracer_enabled = save_tracer_enabled; | 711 | tracer_enabled = save_tracer_enabled; |
| @@ -778,6 +1041,8 @@ static int trace_wakeup_test_thread(void *data) | |||
| 778 | set_current_state(TASK_INTERRUPTIBLE); | 1041 | set_current_state(TASK_INTERRUPTIBLE); |
| 779 | schedule(); | 1042 | schedule(); |
| 780 | 1043 | ||
| 1044 | complete(x); | ||
| 1045 | |||
| 781 | /* we are awake, now wait to disappear */ | 1046 | /* we are awake, now wait to disappear */ |
| 782 | while (!kthread_should_stop()) { | 1047 | while (!kthread_should_stop()) { |
| 783 | /* | 1048 | /* |
| @@ -821,24 +1086,21 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) | |||
| 821 | /* reset the max latency */ | 1086 | /* reset the max latency */ |
| 822 | tracing_max_latency = 0; | 1087 | tracing_max_latency = 0; |
| 823 | 1088 | ||
| 824 | /* sleep to let the RT thread sleep too */ | 1089 | while (p->on_rq) { |
| 825 | msleep(100); | 1090 | /* |
| 1091 | * Sleep to make sure the RT thread is asleep too. | ||
| 1092 | * On virtual machines we can't rely on timings, | ||
| 1093 | * but we want to make sure this test still works. | ||
| 1094 | */ | ||
| 1095 | msleep(100); | ||
| 1096 | } | ||
| 826 | 1097 | ||
| 827 | /* | 1098 | init_completion(&isrt); |
| 828 | * Yes this is slightly racy. It is possible that for some | ||
| 829 | * strange reason that the RT thread we created, did not | ||
| 830 | * call schedule for 100ms after doing the completion, | ||
| 831 | * and we do a wakeup on a task that already is awake. | ||
| 832 | * But that is extremely unlikely, and the worst thing that | ||
| 833 | * happens in such a case, is that we disable tracing. | ||
| 834 | * Honestly, if this race does happen something is horrible | ||
| 835 | * wrong with the system. | ||
| 836 | */ | ||
| 837 | 1099 | ||
| 838 | wake_up_process(p); | 1100 | wake_up_process(p); |
| 839 | 1101 | ||
| 840 | /* give a little time to let the thread wake up */ | 1102 | /* Wait for the task to wake up */ |
| 841 | msleep(100); | 1103 | wait_for_completion(&isrt); |
| 842 | 1104 | ||
| 843 | /* stop the tracing. */ | 1105 | /* stop the tracing. */ |
| 844 | tracing_stop(); | 1106 | tracing_stop(); |
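The hunk above replaces the second fixed msleep() with an explicit handshake: the RT thread signals a completion once it is demonstrably awake, so the test no longer depends on scheduling latency, which is unreliable on virtual machines. A stripped-down sketch of that pattern, with hypothetical names:

        #include <linux/completion.h>
        #include <linux/kthread.h>
        #include <linux/sched.h>

        static struct completion demo_isrt;

        static int demo_rt_thread(void *data)
        {
                struct completion *x = data;

                /* sleep once so the wakeup path has something to trace */
                set_current_state(TASK_INTERRUPTIBLE);
                schedule();

                /* we are awake: report back instead of relying on timing */
                complete(x);

                while (!kthread_should_stop())
                        schedule_timeout_interruptible(HZ / 10);
                return 0;
        }

        static void demo_wait_for_wakeup(struct task_struct *p)
        {
                init_completion(&demo_isrt);
                wake_up_process(p);
                /* replaces the old msleep(100) */
                wait_for_completion(&demo_isrt);
        }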
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index d4545f49242e..0c1b165778e5 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c | |||
| @@ -111,7 +111,8 @@ static inline void check_stack(void) | |||
| 111 | } | 111 | } |
| 112 | 112 | ||
| 113 | static void | 113 | static void |
| 114 | stack_trace_call(unsigned long ip, unsigned long parent_ip) | 114 | stack_trace_call(unsigned long ip, unsigned long parent_ip, |
| 115 | struct ftrace_ops *op, struct pt_regs *pt_regs) | ||
| 115 | { | 116 | { |
| 116 | int cpu; | 117 | int cpu; |
| 117 | 118 | ||
| @@ -136,6 +137,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip) | |||
| 136 | static struct ftrace_ops trace_ops __read_mostly = | 137 | static struct ftrace_ops trace_ops __read_mostly = |
| 137 | { | 138 | { |
| 138 | .func = stack_trace_call, | 139 | .func = stack_trace_call, |
| 140 | .flags = FTRACE_OPS_FL_RECURSION_SAFE, | ||
| 139 | }; | 141 | }; |
| 140 | 142 | ||
| 141 | static ssize_t | 143 | static ssize_t |
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 6b245f64c8dd..2485a7d09b11 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
| @@ -487,7 +487,7 @@ int __init init_ftrace_syscalls(void) | |||
| 487 | 487 | ||
| 488 | return 0; | 488 | return 0; |
| 489 | } | 489 | } |
| 490 | core_initcall(init_ftrace_syscalls); | 490 | early_initcall(init_ftrace_syscalls); |
| 491 | 491 | ||
| 492 | #ifdef CONFIG_PERF_EVENTS | 492 | #ifdef CONFIG_PERF_EVENTS |
| 493 | 493 | ||
diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 23b4d784ebdd..625df0b44690 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c | |||
| @@ -26,7 +26,9 @@ | |||
| 26 | /* | 26 | /* |
| 27 | * fill in basic accounting fields | 27 | * fill in basic accounting fields |
| 28 | */ | 28 | */ |
| 29 | void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | 29 | void bacct_add_tsk(struct user_namespace *user_ns, |
| 30 | struct pid_namespace *pid_ns, | ||
| 31 | struct taskstats *stats, struct task_struct *tsk) | ||
| 30 | { | 32 | { |
| 31 | const struct cred *tcred; | 33 | const struct cred *tcred; |
| 32 | struct timespec uptime, ts; | 34 | struct timespec uptime, ts; |
| @@ -55,13 +57,13 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) | |||
| 55 | stats->ac_flag |= AXSIG; | 57 | stats->ac_flag |= AXSIG; |
| 56 | stats->ac_nice = task_nice(tsk); | 58 | stats->ac_nice = task_nice(tsk); |
| 57 | stats->ac_sched = tsk->policy; | 59 | stats->ac_sched = tsk->policy; |
| 58 | stats->ac_pid = tsk->pid; | 60 | stats->ac_pid = task_pid_nr_ns(tsk, pid_ns); |
| 59 | rcu_read_lock(); | 61 | rcu_read_lock(); |
| 60 | tcred = __task_cred(tsk); | 62 | tcred = __task_cred(tsk); |
| 61 | stats->ac_uid = tcred->uid; | 63 | stats->ac_uid = from_kuid_munged(user_ns, tcred->uid); |
| 62 | stats->ac_gid = tcred->gid; | 64 | stats->ac_gid = from_kgid_munged(user_ns, tcred->gid); |
| 63 | stats->ac_ppid = pid_alive(tsk) ? | 65 | stats->ac_ppid = pid_alive(tsk) ? |
| 64 | rcu_dereference(tsk->real_parent)->tgid : 0; | 66 | task_tgid_nr_ns(rcu_dereference(tsk->real_parent), pid_ns) : 0; |
| 65 | rcu_read_unlock(); | 67 | rcu_read_unlock(); |
| 66 | stats->ac_utime = cputime_to_usecs(tsk->utime); | 68 | stats->ac_utime = cputime_to_usecs(tsk->utime); |
| 67 | stats->ac_stime = cputime_to_usecs(tsk->stime); | 69 | stats->ac_stime = cputime_to_usecs(tsk->stime); |
diff --git a/kernel/user.c b/kernel/user.c index b815fefbe76f..750acffbe9ec 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
| @@ -38,6 +38,14 @@ struct user_namespace init_user_ns = { | |||
| 38 | .count = 4294967295U, | 38 | .count = 4294967295U, |
| 39 | }, | 39 | }, |
| 40 | }, | 40 | }, |
| 41 | .projid_map = { | ||
| 42 | .nr_extents = 1, | ||
| 43 | .extent[0] = { | ||
| 44 | .first = 0, | ||
| 45 | .lower_first = 0, | ||
| 46 | .count = 4294967295U, | ||
| 47 | }, | ||
| 48 | }, | ||
| 41 | .kref = { | 49 | .kref = { |
| 42 | .refcount = ATOMIC_INIT(3), | 50 | .refcount = ATOMIC_INIT(3), |
| 43 | }, | 51 | }, |
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 86602316422d..456a6b9fba34 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c | |||
| @@ -19,6 +19,7 @@ | |||
| 19 | #include <linux/fs.h> | 19 | #include <linux/fs.h> |
| 20 | #include <linux/uaccess.h> | 20 | #include <linux/uaccess.h> |
| 21 | #include <linux/ctype.h> | 21 | #include <linux/ctype.h> |
| 22 | #include <linux/projid.h> | ||
| 22 | 23 | ||
| 23 | static struct kmem_cache *user_ns_cachep __read_mostly; | 24 | static struct kmem_cache *user_ns_cachep __read_mostly; |
| 24 | 25 | ||
| @@ -295,6 +296,75 @@ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) | |||
| 295 | } | 296 | } |
| 296 | EXPORT_SYMBOL(from_kgid_munged); | 297 | EXPORT_SYMBOL(from_kgid_munged); |
| 297 | 298 | ||
| 299 | /** | ||
| 300 | * make_kprojid - Map a user-namespace projid pair into a kprojid. | ||
| 301 | * @ns: User namespace that the projid is in | ||
| 302 | * @projid: Project identifier | ||
| 303 | * | ||
| 304 | * Maps a user-namespace projid pair into a kernel internal kprojid, | ||
| 305 | * and returns that kprojid. | ||
| 306 | * | ||
| 307 | * When there is no mapping defined for the user-namespace projid | ||
| 308 | * pair INVALID_PROJID is returned. Callers are expected to test | ||
| 309 | * for and handle INVALID_PROJID being returned. INVALID_PROJID | ||
| 310 | * may be tested for using projid_valid(). | ||
| 311 | */ | ||
| 312 | kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) | ||
| 313 | { | ||
| 314 | /* Map the projid to a global kernel projid */ | ||
| 315 | return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); | ||
| 316 | } | ||
| 317 | EXPORT_SYMBOL(make_kprojid); | ||
| 318 | |||
| 319 | /** | ||
| 320 | * from_kprojid - Create a projid from a kprojid user-namespace pair. | ||
| 321 | * @targ: The user namespace we want a projid in. | ||
| 322 | * @kprojid: The kernel internal project identifier to start with. | ||
| 323 | * | ||
| 324 | * Map @kprojid into the user-namespace specified by @targ and | ||
| 325 | * return the resulting projid. | ||
| 326 | * | ||
| 327 | * There is always a mapping into the initial user_namespace. | ||
| 328 | * | ||
| 329 | * If @kprojid has no mapping in @targ (projid_t)-1 is returned. | ||
| 330 | */ | ||
| 331 | projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) | ||
| 332 | { | ||
| 333 | /* Map the projid from a global kernel projid */ | ||
| 334 | return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); | ||
| 335 | } | ||
| 336 | EXPORT_SYMBOL(from_kprojid); | ||
| 337 | |||
| 338 | /** | ||
| 339 | * from_kprojid_munged - Create a projid from a kprojid user-namespace pair. | ||
| 340 | * @targ: The user namespace we want a projid in. | ||
| 341 | * @kprojid: The kernel internal projid to start with. | ||
| 342 | * | ||
| 343 | * Map @kprojid into the user-namespace specified by @targ and | ||
| 344 | * return the resulting projid. | ||
| 345 | * | ||
| 346 | * There is always a mapping into the initial user_namespace. | ||
| 347 | * | ||
| 348 | * Unlike from_kprojid, from_kprojid_munged never fails and always | ||
| 349 | * returns a valid projid. This makes from_kprojid_munged | ||
| 350 | * appropriate for use in syscalls like stat, where | ||
| 351 | * failing the system call and failing to provide a valid projid are | ||
| 352 | * not options. | ||
| 353 | * | ||
| 354 | * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. | ||
| 355 | */ | ||
| 356 | projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) | ||
| 357 | { | ||
| 358 | projid_t projid; | ||
| 359 | projid = from_kprojid(targ, kprojid); | ||
| 360 | |||
| 361 | if (projid == (projid_t) -1) | ||
| 362 | projid = OVERFLOW_PROJID; | ||
| 363 | return projid; | ||
| 364 | } | ||
| 365 | EXPORT_SYMBOL(from_kprojid_munged); | ||
| 366 | |||
| 367 | |||
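A brief illustration of how the new projid helpers are meant to be used, for instance by a filesystem's project-quota code. This is a sketch only; the caller names are hypothetical and not part of the patch.

        #include <linux/errno.h>
        #include <linux/projid.h>
        #include <linux/user_namespace.h>

        /* Validate a projid supplied from userspace in the caller's namespace. */
        static int demo_projid_from_user(struct user_namespace *user_ns,
                                         projid_t projid, kprojid_t *out)
        {
                kprojid_t kprojid = make_kprojid(user_ns, projid);

                if (!projid_valid(kprojid))
                        return -EINVAL;         /* no mapping in this namespace */
                *out = kprojid;
                return 0;
        }

        /* Report a kernel-internal projid back to userspace; never fails. */
        static projid_t demo_projid_to_user(struct user_namespace *user_ns,
                                            kprojid_t kprojid)
        {
                return from_kprojid_munged(user_ns, kprojid);
        }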
| 298 | static int uid_m_show(struct seq_file *seq, void *v) | 368 | static int uid_m_show(struct seq_file *seq, void *v) |
| 299 | { | 369 | { |
| 300 | struct user_namespace *ns = seq->private; | 370 | struct user_namespace *ns = seq->private; |
| @@ -337,6 +407,27 @@ static int gid_m_show(struct seq_file *seq, void *v) | |||
| 337 | return 0; | 407 | return 0; |
| 338 | } | 408 | } |
| 339 | 409 | ||
| 410 | static int projid_m_show(struct seq_file *seq, void *v) | ||
| 411 | { | ||
| 412 | struct user_namespace *ns = seq->private; | ||
| 413 | struct uid_gid_extent *extent = v; | ||
| 414 | struct user_namespace *lower_ns; | ||
| 415 | projid_t lower; | ||
| 416 | |||
| 417 | lower_ns = seq_user_ns(seq); | ||
| 418 | if ((lower_ns == ns) && lower_ns->parent) | ||
| 419 | lower_ns = lower_ns->parent; | ||
| 420 | |||
| 421 | lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); | ||
| 422 | |||
| 423 | seq_printf(seq, "%10u %10u %10u\n", | ||
| 424 | extent->first, | ||
| 425 | lower, | ||
| 426 | extent->count); | ||
| 427 | |||
| 428 | return 0; | ||
| 429 | } | ||
| 430 | |||
| 340 | static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) | 431 | static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) |
| 341 | { | 432 | { |
| 342 | struct uid_gid_extent *extent = NULL; | 433 | struct uid_gid_extent *extent = NULL; |
| @@ -362,6 +453,13 @@ static void *gid_m_start(struct seq_file *seq, loff_t *ppos) | |||
| 362 | return m_start(seq, ppos, &ns->gid_map); | 453 | return m_start(seq, ppos, &ns->gid_map); |
| 363 | } | 454 | } |
| 364 | 455 | ||
| 456 | static void *projid_m_start(struct seq_file *seq, loff_t *ppos) | ||
| 457 | { | ||
| 458 | struct user_namespace *ns = seq->private; | ||
| 459 | |||
| 460 | return m_start(seq, ppos, &ns->projid_map); | ||
| 461 | } | ||
| 462 | |||
| 365 | static void *m_next(struct seq_file *seq, void *v, loff_t *pos) | 463 | static void *m_next(struct seq_file *seq, void *v, loff_t *pos) |
| 366 | { | 464 | { |
| 367 | (*pos)++; | 465 | (*pos)++; |
| @@ -387,6 +485,13 @@ struct seq_operations proc_gid_seq_operations = { | |||
| 387 | .show = gid_m_show, | 485 | .show = gid_m_show, |
| 388 | }; | 486 | }; |
| 389 | 487 | ||
| 488 | struct seq_operations proc_projid_seq_operations = { | ||
| 489 | .start = projid_m_start, | ||
| 490 | .stop = m_stop, | ||
| 491 | .next = m_next, | ||
| 492 | .show = projid_m_show, | ||
| 493 | }; | ||
| 494 | |||
| 390 | static DEFINE_MUTEX(id_map_mutex); | 495 | static DEFINE_MUTEX(id_map_mutex); |
| 391 | 496 | ||
| 392 | static ssize_t map_write(struct file *file, const char __user *buf, | 497 | static ssize_t map_write(struct file *file, const char __user *buf, |
| @@ -434,7 +539,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, | |||
| 434 | /* Require the appropriate privilege CAP_SETUID or CAP_SETGID | 539 | /* Require the appropriate privilege CAP_SETUID or CAP_SETGID |
| 435 | * over the user namespace in order to set the id mapping. | 540 | * over the user namespace in order to set the id mapping. |
| 436 | */ | 541 | */ |
| 437 | if (!ns_capable(ns, cap_setid)) | 542 | if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) |
| 438 | goto out; | 543 | goto out; |
| 439 | 544 | ||
| 440 | /* Get a buffer */ | 545 | /* Get a buffer */ |
| @@ -584,9 +689,30 @@ ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t siz | |||
| 584 | &ns->gid_map, &ns->parent->gid_map); | 689 | &ns->gid_map, &ns->parent->gid_map); |
| 585 | } | 690 | } |
| 586 | 691 | ||
| 692 | ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) | ||
| 693 | { | ||
| 694 | struct seq_file *seq = file->private_data; | ||
| 695 | struct user_namespace *ns = seq->private; | ||
| 696 | struct user_namespace *seq_ns = seq_user_ns(seq); | ||
| 697 | |||
| 698 | if (!ns->parent) | ||
| 699 | return -EPERM; | ||
| 700 | |||
| 701 | if ((seq_ns != ns) && (seq_ns != ns->parent)) | ||
| 702 | return -EPERM; | ||
| 703 | |||
| 704 | /* Anyone can set any valid project id, no capability needed */ | ||
| 705 | return map_write(file, buf, size, ppos, -1, | ||
| 706 | &ns->projid_map, &ns->parent->projid_map); | ||
| 707 | } | ||
| 708 | |||
| 587 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, | 709 | static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, |
| 588 | struct uid_gid_map *new_map) | 710 | struct uid_gid_map *new_map) |
| 589 | { | 711 | { |
| 712 | /* Allow anyone to set a mapping that doesn't require privilege */ | ||
| 713 | if (!cap_valid(cap_setid)) | ||
| 714 | return true; | ||
| 715 | |||
| 590 | /* Allow the specified ids if we have the appropriate capability | 716 | /* Allow the specified ids if we have the appropriate capability |
| 591 | * (CAP_SETUID or CAP_SETGID) over the parent user namespace. | 717 | * (CAP_SETUID or CAP_SETGID) over the parent user namespace. |
| 592 | */ | 718 | */ |
diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 4b1dfba70f7c..9d4c8d5a1f53 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include <linux/notifier.h> | 22 | #include <linux/notifier.h> |
| 23 | #include <linux/module.h> | 23 | #include <linux/module.h> |
| 24 | #include <linux/sysctl.h> | 24 | #include <linux/sysctl.h> |
| 25 | #include <linux/smpboot.h> | ||
| 25 | 26 | ||
| 26 | #include <asm/irq_regs.h> | 27 | #include <asm/irq_regs.h> |
| 27 | #include <linux/kvm_para.h> | 28 | #include <linux/kvm_para.h> |
| @@ -29,16 +30,18 @@ | |||
| 29 | 30 | ||
| 30 | int watchdog_enabled = 1; | 31 | int watchdog_enabled = 1; |
| 31 | int __read_mostly watchdog_thresh = 10; | 32 | int __read_mostly watchdog_thresh = 10; |
| 33 | static int __read_mostly watchdog_disabled; | ||
| 32 | 34 | ||
| 33 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); | 35 | static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); |
| 34 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); | 36 | static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); |
| 35 | static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); | 37 | static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); |
| 36 | static DEFINE_PER_CPU(bool, softlockup_touch_sync); | 38 | static DEFINE_PER_CPU(bool, softlockup_touch_sync); |
| 37 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); | 39 | static DEFINE_PER_CPU(bool, soft_watchdog_warn); |
| 40 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | ||
| 41 | static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); | ||
| 38 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 42 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| 39 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); | 43 | static DEFINE_PER_CPU(bool, hard_watchdog_warn); |
| 40 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); | 44 | static DEFINE_PER_CPU(bool, watchdog_nmi_touch); |
| 41 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); | ||
| 42 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); | 45 | static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); |
| 43 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); | 46 | static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); |
| 44 | #endif | 47 | #endif |
| @@ -248,13 +251,15 @@ static void watchdog_overflow_callback(struct perf_event *event, | |||
| 248 | __this_cpu_write(hard_watchdog_warn, false); | 251 | __this_cpu_write(hard_watchdog_warn, false); |
| 249 | return; | 252 | return; |
| 250 | } | 253 | } |
| 254 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | ||
| 255 | |||
| 251 | static void watchdog_interrupt_count(void) | 256 | static void watchdog_interrupt_count(void) |
| 252 | { | 257 | { |
| 253 | __this_cpu_inc(hrtimer_interrupts); | 258 | __this_cpu_inc(hrtimer_interrupts); |
| 254 | } | 259 | } |
| 255 | #else | 260 | |
| 256 | static inline void watchdog_interrupt_count(void) { return; } | 261 | static int watchdog_nmi_enable(unsigned int cpu); |
| 257 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 262 | static void watchdog_nmi_disable(unsigned int cpu); |
| 258 | 263 | ||
| 259 | /* watchdog kicker functions */ | 264 | /* watchdog kicker functions */ |
| 260 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | 265 | static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) |
| @@ -327,49 +332,68 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) | |||
| 327 | return HRTIMER_RESTART; | 332 | return HRTIMER_RESTART; |
| 328 | } | 333 | } |
| 329 | 334 | ||
| 335 | static void watchdog_set_prio(unsigned int policy, unsigned int prio) | ||
| 336 | { | ||
| 337 | struct sched_param param = { .sched_priority = prio }; | ||
| 330 | 338 | ||
| 331 | /* | 339 | sched_setscheduler(current, policy, ¶m); |
| 332 | * The watchdog thread - touches the timestamp. | 340 | } |
| 333 | */ | 341 | |
| 334 | static int watchdog(void *unused) | 342 | static void watchdog_enable(unsigned int cpu) |
| 335 | { | 343 | { |
| 336 | struct sched_param param = { .sched_priority = 0 }; | ||
| 337 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | 344 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); |
| 338 | 345 | ||
| 339 | /* initialize timestamp */ | 346 | if (!watchdog_enabled) { |
| 340 | __touch_watchdog(); | 347 | kthread_park(current); |
| 348 | return; | ||
| 349 | } | ||
| 350 | |||
| 351 | /* Enable the perf event */ | ||
| 352 | watchdog_nmi_enable(cpu); | ||
| 341 | 353 | ||
| 342 | /* kick off the timer for the hardlockup detector */ | 354 | /* kick off the timer for the hardlockup detector */ |
| 355 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 356 | hrtimer->function = watchdog_timer_fn; | ||
| 357 | |||
| 343 | /* done here because hrtimer_start can only pin to smp_processor_id() */ | 358 | /* done here because hrtimer_start can only pin to smp_processor_id() */ |
| 344 | hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), | 359 | hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), |
| 345 | HRTIMER_MODE_REL_PINNED); | 360 | HRTIMER_MODE_REL_PINNED); |
| 346 | 361 | ||
| 347 | set_current_state(TASK_INTERRUPTIBLE); | 362 | /* initialize timestamp */ |
| 348 | /* | 363 | watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1); |
| 349 | * Run briefly (kicked by the hrtimer callback function) once every | 364 | __touch_watchdog(); |
| 350 | * get_sample_period() seconds (4 seconds by default) to reset the | 365 | } |
| 351 | * softlockup timestamp. If this gets delayed for more than | ||
| 352 | * 2*watchdog_thresh seconds then the debug-printout triggers in | ||
| 353 | * watchdog_timer_fn(). | ||
| 354 | */ | ||
| 355 | while (!kthread_should_stop()) { | ||
| 356 | __touch_watchdog(); | ||
| 357 | schedule(); | ||
| 358 | 366 | ||
| 359 | if (kthread_should_stop()) | 367 | static void watchdog_disable(unsigned int cpu) |
| 360 | break; | 368 | { |
| 369 | struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); | ||
| 361 | 370 | ||
| 362 | set_current_state(TASK_INTERRUPTIBLE); | 371 | watchdog_set_prio(SCHED_NORMAL, 0); |
| 363 | } | 372 | hrtimer_cancel(hrtimer); |
| 364 | /* | 373 | /* disable the perf event */ |
| 365 | * Drop the policy/priority elevation during thread exit to avoid a | 374 | watchdog_nmi_disable(cpu); |
| 366 | * scheduling latency spike. | ||
| 367 | */ | ||
| 368 | __set_current_state(TASK_RUNNING); | ||
| 369 | sched_setscheduler(current, SCHED_NORMAL, ¶m); | ||
| 370 | return 0; | ||
| 371 | } | 375 | } |
| 372 | 376 | ||
| 377 | static int watchdog_should_run(unsigned int cpu) | ||
| 378 | { | ||
| 379 | return __this_cpu_read(hrtimer_interrupts) != | ||
| 380 | __this_cpu_read(soft_lockup_hrtimer_cnt); | ||
| 381 | } | ||
| 382 | |||
| 383 | /* | ||
| 384 | * The watchdog thread function - touches the timestamp. | ||
| 385 | * | ||
| 386 | * It only runs once every get_sample_period() seconds (4 seconds by | ||
| 387 | * default) to reset the softlockup timestamp. If this gets delayed | ||
| 388 | * for more than 2*watchdog_thresh seconds then the debug-printout | ||
| 389 | * triggers in watchdog_timer_fn(). | ||
| 390 | */ | ||
| 391 | static void watchdog(unsigned int cpu) | ||
| 392 | { | ||
| 393 | __this_cpu_write(soft_lockup_hrtimer_cnt, | ||
| 394 | __this_cpu_read(hrtimer_interrupts)); | ||
| 395 | __touch_watchdog(); | ||
| 396 | } | ||
| 373 | 397 | ||
| 374 | #ifdef CONFIG_HARDLOCKUP_DETECTOR | 398 | #ifdef CONFIG_HARDLOCKUP_DETECTOR |
| 375 | /* | 399 | /* |
| @@ -379,7 +403,7 @@ static int watchdog(void *unused) | |||
| 379 | */ | 403 | */ |
| 380 | static unsigned long cpu0_err; | 404 | static unsigned long cpu0_err; |
| 381 | 405 | ||
| 382 | static int watchdog_nmi_enable(int cpu) | 406 | static int watchdog_nmi_enable(unsigned int cpu) |
| 383 | { | 407 | { |
| 384 | struct perf_event_attr *wd_attr; | 408 | struct perf_event_attr *wd_attr; |
| 385 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 409 | struct perf_event *event = per_cpu(watchdog_ev, cpu); |
| @@ -433,7 +457,7 @@ out: | |||
| 433 | return 0; | 457 | return 0; |
| 434 | } | 458 | } |
| 435 | 459 | ||
| 436 | static void watchdog_nmi_disable(int cpu) | 460 | static void watchdog_nmi_disable(unsigned int cpu) |
| 437 | { | 461 | { |
| 438 | struct perf_event *event = per_cpu(watchdog_ev, cpu); | 462 | struct perf_event *event = per_cpu(watchdog_ev, cpu); |
| 439 | 463 | ||
| @@ -447,107 +471,35 @@ static void watchdog_nmi_disable(int cpu) | |||
| 447 | return; | 471 | return; |
| 448 | } | 472 | } |
| 449 | #else | 473 | #else |
| 450 | static int watchdog_nmi_enable(int cpu) { return 0; } | 474 | static int watchdog_nmi_enable(unsigned int cpu) { return 0; } |
| 451 | static void watchdog_nmi_disable(int cpu) { return; } | 475 | static void watchdog_nmi_disable(unsigned int cpu) { return; } |
| 452 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ | 476 | #endif /* CONFIG_HARDLOCKUP_DETECTOR */ |
| 453 | 477 | ||
| 454 | /* prepare/enable/disable routines */ | 478 | /* prepare/enable/disable routines */ |
| 455 | static void watchdog_prepare_cpu(int cpu) | ||
| 456 | { | ||
| 457 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | ||
| 458 | |||
| 459 | WARN_ON(per_cpu(softlockup_watchdog, cpu)); | ||
| 460 | hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
| 461 | hrtimer->function = watchdog_timer_fn; | ||
| 462 | } | ||
| 463 | |||
| 464 | static int watchdog_enable(int cpu) | ||
| 465 | { | ||
| 466 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | ||
| 467 | int err = 0; | ||
| 468 | |||
| 469 | /* enable the perf event */ | ||
| 470 | err = watchdog_nmi_enable(cpu); | ||
| 471 | |||
| 472 | /* Regardless of err above, fall through and start softlockup */ | ||
| 473 | |||
| 474 | /* create the watchdog thread */ | ||
| 475 | if (!p) { | ||
| 476 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | ||
| 477 | p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu); | ||
| 478 | if (IS_ERR(p)) { | ||
| 479 | pr_err("softlockup watchdog for %i failed\n", cpu); | ||
| 480 | if (!err) { | ||
| 481 | /* if hardlockup hasn't already set this */ | ||
| 482 | err = PTR_ERR(p); | ||
| 483 | /* and disable the perf event */ | ||
| 484 | watchdog_nmi_disable(cpu); | ||
| 485 | } | ||
| 486 | goto out; | ||
| 487 | } | ||
| 488 | sched_setscheduler(p, SCHED_FIFO, ¶m); | ||
| 489 | kthread_bind(p, cpu); | ||
| 490 | per_cpu(watchdog_touch_ts, cpu) = 0; | ||
| 491 | per_cpu(softlockup_watchdog, cpu) = p; | ||
| 492 | wake_up_process(p); | ||
| 493 | } | ||
| 494 | |||
| 495 | out: | ||
| 496 | return err; | ||
| 497 | } | ||
| 498 | |||
| 499 | static void watchdog_disable(int cpu) | ||
| 500 | { | ||
| 501 | struct task_struct *p = per_cpu(softlockup_watchdog, cpu); | ||
| 502 | struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); | ||
| 503 | |||
| 504 | /* | ||
| 505 | * cancel the timer first to stop incrementing the stats | ||
| 506 | * and waking up the kthread | ||
| 507 | */ | ||
| 508 | hrtimer_cancel(hrtimer); | ||
| 509 | |||
| 510 | /* disable the perf event */ | ||
| 511 | watchdog_nmi_disable(cpu); | ||
| 512 | |||
| 513 | /* stop the watchdog thread */ | ||
| 514 | if (p) { | ||
| 515 | per_cpu(softlockup_watchdog, cpu) = NULL; | ||
| 516 | kthread_stop(p); | ||
| 517 | } | ||
| 518 | } | ||
| 519 | |||
| 520 | /* sysctl functions */ | 479 | /* sysctl functions */ |
| 521 | #ifdef CONFIG_SYSCTL | 480 | #ifdef CONFIG_SYSCTL |
| 522 | static void watchdog_enable_all_cpus(void) | 481 | static void watchdog_enable_all_cpus(void) |
| 523 | { | 482 | { |
| 524 | int cpu; | 483 | unsigned int cpu; |
| 525 | |||
| 526 | watchdog_enabled = 0; | ||
| 527 | |||
| 528 | for_each_online_cpu(cpu) | ||
| 529 | if (!watchdog_enable(cpu)) | ||
| 530 | /* if any cpu succeeds, watchdog is considered | ||
| 531 | enabled for the system */ | ||
| 532 | watchdog_enabled = 1; | ||
| 533 | |||
| 534 | if (!watchdog_enabled) | ||
| 535 | pr_err("failed to be enabled on some cpus\n"); | ||
| 536 | 484 | ||
| 485 | if (watchdog_disabled) { | ||
| 486 | watchdog_disabled = 0; | ||
| 487 | for_each_online_cpu(cpu) | ||
| 488 | kthread_unpark(per_cpu(softlockup_watchdog, cpu)); | ||
| 489 | } | ||
| 537 | } | 490 | } |
| 538 | 491 | ||
| 539 | static void watchdog_disable_all_cpus(void) | 492 | static void watchdog_disable_all_cpus(void) |
| 540 | { | 493 | { |
| 541 | int cpu; | 494 | unsigned int cpu; |
| 542 | |||
| 543 | for_each_online_cpu(cpu) | ||
| 544 | watchdog_disable(cpu); | ||
| 545 | 495 | ||
| 546 | /* if all watchdogs are disabled, then they are disabled for the system */ | 496 | if (!watchdog_disabled) { |
| 547 | watchdog_enabled = 0; | 497 | watchdog_disabled = 1; |
| 498 | for_each_online_cpu(cpu) | ||
| 499 | kthread_park(per_cpu(softlockup_watchdog, cpu)); | ||
| 500 | } | ||
| 548 | } | 501 | } |
| 549 | 502 | ||
| 550 | |||
| 551 | /* | 503 | /* |
| 552 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh | 504 | * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh |
| 553 | */ | 505 | */ |
| @@ -557,73 +509,36 @@ int proc_dowatchdog(struct ctl_table *table, int write, | |||
| 557 | { | 509 | { |
| 558 | int ret; | 510 | int ret; |
| 559 | 511 | ||
| 512 | if (watchdog_disabled < 0) | ||
| 513 | return -ENODEV; | ||
| 514 | |||
| 560 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); | 515 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
| 561 | if (ret || !write) | 516 | if (ret || !write) |
| 562 | goto out; | 517 | return ret; |
| 563 | 518 | ||
| 564 | if (watchdog_enabled && watchdog_thresh) | 519 | if (watchdog_enabled && watchdog_thresh) |
| 565 | watchdog_enable_all_cpus(); | 520 | watchdog_enable_all_cpus(); |
| 566 | else | 521 | else |
| 567 | watchdog_disable_all_cpus(); | 522 | watchdog_disable_all_cpus(); |
| 568 | 523 | ||
| 569 | out: | ||
| 570 | return ret; | 524 | return ret; |
| 571 | } | 525 | } |
| 572 | #endif /* CONFIG_SYSCTL */ | 526 | #endif /* CONFIG_SYSCTL */ |
| 573 | 527 | ||
| 574 | 528 | static struct smp_hotplug_thread watchdog_threads = { | |
| 575 | /* | 529 | .store = &softlockup_watchdog, |
| 576 | * Create/destroy watchdog threads as CPUs come and go: | 530 | .thread_should_run = watchdog_should_run, |
| 577 | */ | 531 | .thread_fn = watchdog, |
| 578 | static int __cpuinit | 532 | .thread_comm = "watchdog/%u", |
| 579 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 533 | .setup = watchdog_enable, |
| 580 | { | 534 | .park = watchdog_disable, |
| 581 | int hotcpu = (unsigned long)hcpu; | 535 | .unpark = watchdog_enable, |
| 582 | |||
| 583 | switch (action) { | ||
| 584 | case CPU_UP_PREPARE: | ||
| 585 | case CPU_UP_PREPARE_FROZEN: | ||
| 586 | watchdog_prepare_cpu(hotcpu); | ||
| 587 | break; | ||
| 588 | case CPU_ONLINE: | ||
| 589 | case CPU_ONLINE_FROZEN: | ||
| 590 | if (watchdog_enabled) | ||
| 591 | watchdog_enable(hotcpu); | ||
| 592 | break; | ||
| 593 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 594 | case CPU_UP_CANCELED: | ||
| 595 | case CPU_UP_CANCELED_FROZEN: | ||
| 596 | watchdog_disable(hotcpu); | ||
| 597 | break; | ||
| 598 | case CPU_DEAD: | ||
| 599 | case CPU_DEAD_FROZEN: | ||
| 600 | watchdog_disable(hotcpu); | ||
| 601 | break; | ||
| 602 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
| 603 | } | ||
| 604 | |||
| 605 | /* | ||
| 606 | * hardlockup and softlockup are not important enough | ||
| 607 | * to block cpu bring up. Just always succeed and | ||
| 608 | * rely on printk output to flag problems. | ||
| 609 | */ | ||
| 610 | return NOTIFY_OK; | ||
| 611 | } | ||
| 612 | |||
| 613 | static struct notifier_block __cpuinitdata cpu_nfb = { | ||
| 614 | .notifier_call = cpu_callback | ||
| 615 | }; | 536 | }; |
| 616 | 537 | ||
| 617 | void __init lockup_detector_init(void) | 538 | void __init lockup_detector_init(void) |
| 618 | { | 539 | { |
| 619 | void *cpu = (void *)(long)smp_processor_id(); | 540 | if (smpboot_register_percpu_thread(&watchdog_threads)) { |
| 620 | int err; | 541 | pr_err("Failed to create watchdog threads, disabled\n"); |
| 621 | 542 | watchdog_disabled = -ENODEV; | |
| 622 | err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); | 543 | } |
| 623 | WARN_ON(notifier_to_errno(err)); | ||
| 624 | |||
| 625 | cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); | ||
| 626 | register_cpu_notifier(&cpu_nfb); | ||
| 627 | |||
| 628 | return; | ||
| 629 | } | 544 | } |
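For reference, the minimal shape of a smpboot-managed per-cpu thread, modelled on the watchdog conversion above. This is a sketch under assumed, hypothetical names (demo_*); only the fields the watchdog needs are shown, and the park/unpark/setup callbacks are optional.

        #include <linux/init.h>
        #include <linux/percpu.h>
        #include <linux/smpboot.h>

        static DEFINE_PER_CPU(struct task_struct *, demo_task);

        static int demo_should_run(unsigned int cpu)
        {
                return 0;       /* non-zero when there is per-cpu work pending */
        }

        static void demo_fn(unsigned int cpu)
        {
                /* runs bound to @cpu whenever demo_should_run() returns true */
        }

        static struct smp_hotplug_thread demo_threads = {
                .store                  = &demo_task,
                .thread_should_run      = demo_should_run,
                .thread_fn              = demo_fn,
                .thread_comm            = "demo/%u",
                /* .setup/.park/.unpark are optional; the watchdog uses them above */
        };

        static int __init demo_init(void)
        {
                return smpboot_register_percpu_thread(&demo_threads);
        }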
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 3c5a79e2134c..d951daa0ca9a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
| @@ -58,7 +58,7 @@ enum { | |||
| 58 | * be executing on any CPU. The gcwq behaves as an unbound one. | 58 | * be executing on any CPU. The gcwq behaves as an unbound one. |
| 59 | * | 59 | * |
| 60 | * Note that DISASSOCIATED can be flipped only while holding | 60 | * Note that DISASSOCIATED can be flipped only while holding |
| 61 | * managership of all pools on the gcwq to avoid changing binding | 61 | * assoc_mutex of all pools on the gcwq to avoid changing binding |
| 62 | * state while create_worker() is in progress. | 62 | * state while create_worker() is in progress. |
| 63 | */ | 63 | */ |
| 64 | GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ | 64 | GCWQ_DISASSOCIATED = 1 << 0, /* cpu can't serve workers */ |
| @@ -73,11 +73,10 @@ enum { | |||
| 73 | WORKER_DIE = 1 << 1, /* die die die */ | 73 | WORKER_DIE = 1 << 1, /* die die die */ |
| 74 | WORKER_IDLE = 1 << 2, /* is idle */ | 74 | WORKER_IDLE = 1 << 2, /* is idle */ |
| 75 | WORKER_PREP = 1 << 3, /* preparing to run works */ | 75 | WORKER_PREP = 1 << 3, /* preparing to run works */ |
| 76 | WORKER_REBIND = 1 << 5, /* mom is home, come back */ | ||
| 77 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ | 76 | WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ |
| 78 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ | 77 | WORKER_UNBOUND = 1 << 7, /* worker is unbound */ |
| 79 | 78 | ||
| 80 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_REBIND | WORKER_UNBOUND | | 79 | WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | |
| 81 | WORKER_CPU_INTENSIVE, | 80 | WORKER_CPU_INTENSIVE, |
| 82 | 81 | ||
| 83 | NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ | 82 | NR_WORKER_POOLS = 2, /* # worker pools per gcwq */ |
| @@ -126,7 +125,6 @@ enum { | |||
| 126 | 125 | ||
| 127 | struct global_cwq; | 126 | struct global_cwq; |
| 128 | struct worker_pool; | 127 | struct worker_pool; |
| 129 | struct idle_rebind; | ||
| 130 | 128 | ||
| 131 | /* | 129 | /* |
| 132 | * The poor guys doing the actual heavy lifting. All on-duty workers | 130 | * The poor guys doing the actual heavy lifting. All on-duty workers |
| @@ -150,7 +148,6 @@ struct worker { | |||
| 150 | int id; /* I: worker id */ | 148 | int id; /* I: worker id */ |
| 151 | 149 | ||
| 152 | /* for rebinding worker to CPU */ | 150 | /* for rebinding worker to CPU */ |
| 153 | struct idle_rebind *idle_rebind; /* L: for idle worker */ | ||
| 154 | struct work_struct rebind_work; /* L: for busy worker */ | 151 | struct work_struct rebind_work; /* L: for busy worker */ |
| 155 | }; | 152 | }; |
| 156 | 153 | ||
| @@ -160,13 +157,15 @@ struct worker_pool { | |||
| 160 | 157 | ||
| 161 | struct list_head worklist; /* L: list of pending works */ | 158 | struct list_head worklist; /* L: list of pending works */ |
| 162 | int nr_workers; /* L: total number of workers */ | 159 | int nr_workers; /* L: total number of workers */ |
| 160 | |||
| 161 | /* nr_idle includes the ones off idle_list for rebinding */ | ||
| 163 | int nr_idle; /* L: currently idle ones */ | 162 | int nr_idle; /* L: currently idle ones */ |
| 164 | 163 | ||
| 165 | struct list_head idle_list; /* X: list of idle workers */ | 164 | struct list_head idle_list; /* X: list of idle workers */ |
| 166 | struct timer_list idle_timer; /* L: worker idle timeout */ | 165 | struct timer_list idle_timer; /* L: worker idle timeout */ |
| 167 | struct timer_list mayday_timer; /* L: SOS timer for workers */ | 166 | struct timer_list mayday_timer; /* L: SOS timer for workers */ |
| 168 | 167 | ||
| 169 | struct mutex manager_mutex; /* mutex manager should hold */ | 168 | struct mutex assoc_mutex; /* protect GCWQ_DISASSOCIATED */ |
| 170 | struct ida worker_ida; /* L: for worker IDs */ | 169 | struct ida worker_ida; /* L: for worker IDs */ |
| 171 | }; | 170 | }; |
| 172 | 171 | ||
| @@ -184,9 +183,8 @@ struct global_cwq { | |||
| 184 | struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; | 183 | struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; |
| 185 | /* L: hash of busy workers */ | 184 | /* L: hash of busy workers */ |
| 186 | 185 | ||
| 187 | struct worker_pool pools[2]; /* normal and highpri pools */ | 186 | struct worker_pool pools[NR_WORKER_POOLS]; |
| 188 | 187 | /* normal and highpri pools */ | |
| 189 | wait_queue_head_t rebind_hold; /* rebind hold wait */ | ||
| 190 | } ____cacheline_aligned_in_smp; | 188 | } ____cacheline_aligned_in_smp; |
| 191 | 189 | ||
| 192 | /* | 190 | /* |
| @@ -269,17 +267,15 @@ struct workqueue_struct { | |||
| 269 | }; | 267 | }; |
| 270 | 268 | ||
| 271 | struct workqueue_struct *system_wq __read_mostly; | 269 | struct workqueue_struct *system_wq __read_mostly; |
| 272 | struct workqueue_struct *system_long_wq __read_mostly; | ||
| 273 | struct workqueue_struct *system_nrt_wq __read_mostly; | ||
| 274 | struct workqueue_struct *system_unbound_wq __read_mostly; | ||
| 275 | struct workqueue_struct *system_freezable_wq __read_mostly; | ||
| 276 | struct workqueue_struct *system_nrt_freezable_wq __read_mostly; | ||
| 277 | EXPORT_SYMBOL_GPL(system_wq); | 270 | EXPORT_SYMBOL_GPL(system_wq); |
| 271 | struct workqueue_struct *system_highpri_wq __read_mostly; | ||
| 272 | EXPORT_SYMBOL_GPL(system_highpri_wq); | ||
| 273 | struct workqueue_struct *system_long_wq __read_mostly; | ||
| 278 | EXPORT_SYMBOL_GPL(system_long_wq); | 274 | EXPORT_SYMBOL_GPL(system_long_wq); |
| 279 | EXPORT_SYMBOL_GPL(system_nrt_wq); | 275 | struct workqueue_struct *system_unbound_wq __read_mostly; |
| 280 | EXPORT_SYMBOL_GPL(system_unbound_wq); | 276 | EXPORT_SYMBOL_GPL(system_unbound_wq); |
| 277 | struct workqueue_struct *system_freezable_wq __read_mostly; | ||
| 281 | EXPORT_SYMBOL_GPL(system_freezable_wq); | 278 | EXPORT_SYMBOL_GPL(system_freezable_wq); |
| 282 | EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); | ||
| 283 | 279 | ||
| 284 | #define CREATE_TRACE_POINTS | 280 | #define CREATE_TRACE_POINTS |
| 285 | #include <trace/events/workqueue.h> | 281 | #include <trace/events/workqueue.h> |
| @@ -534,18 +530,24 @@ static int work_next_color(int color) | |||
| 534 | } | 530 | } |
| 535 | 531 | ||
| 536 | /* | 532 | /* |
| 537 | * A work's data points to the cwq with WORK_STRUCT_CWQ set while the | 533 | * While queued, %WORK_STRUCT_CWQ is set and non flag bits of a work's data |
| 538 | * work is on queue. Once execution starts, WORK_STRUCT_CWQ is | 534 | * contain the pointer to the queued cwq. Once execution starts, the flag |
| 539 | * cleared and the work data contains the cpu number it was last on. | 535 | * is cleared and the high bits contain OFFQ flags and CPU number. |
| 540 | * | 536 | * |
| 541 | * set_work_{cwq|cpu}() and clear_work_data() can be used to set the | 537 | * set_work_cwq(), set_work_cpu_and_clear_pending(), mark_work_canceling() |
| 542 | * cwq, cpu or clear work->data. These functions should only be | 538 | * and clear_work_data() can be used to set the cwq, cpu or clear |
| 543 | * called while the work is owned - ie. while the PENDING bit is set. | 539 | * work->data. These functions should only be called while the work is |
| 540 | * owned - ie. while the PENDING bit is set. | ||
| 544 | * | 541 | * |
| 545 | * get_work_[g]cwq() can be used to obtain the gcwq or cwq | 542 | * get_work_[g]cwq() can be used to obtain the gcwq or cwq corresponding to |
| 546 | * corresponding to a work. gcwq is available once the work has been | 543 | * a work. gcwq is available once the work has been queued anywhere after |
| 547 | * queued anywhere after initialization. cwq is available only from | 544 | * initialization until it is sync canceled. cwq is available only while |
| 548 | * queueing until execution starts. | 545 | * the work item is queued. |
| 546 | * | ||
| 547 | * %WORK_OFFQ_CANCELING is used to mark a work item which is being | ||
| 548 | * canceled. While being canceled, a work item may have its PENDING set | ||
| 549 | * but stay off timer and worklist for arbitrarily long and nobody should | ||
| 550 | * try to steal the PENDING bit. | ||
| 549 | */ | 551 | */ |
| 550 | static inline void set_work_data(struct work_struct *work, unsigned long data, | 552 | static inline void set_work_data(struct work_struct *work, unsigned long data, |
| 551 | unsigned long flags) | 553 | unsigned long flags) |
| @@ -562,13 +564,22 @@ static void set_work_cwq(struct work_struct *work, | |||
| 562 | WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); | 564 | WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); |
| 563 | } | 565 | } |
| 564 | 566 | ||
| 565 | static void set_work_cpu(struct work_struct *work, unsigned int cpu) | 567 | static void set_work_cpu_and_clear_pending(struct work_struct *work, |
| 568 | unsigned int cpu) | ||
| 566 | { | 569 | { |
| 567 | set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); | 570 | /* |
| 571 | * The following wmb is paired with the implied mb in | ||
| 572 | * test_and_set_bit(PENDING) and ensures all updates to @work made | ||
| 573 | * here are visible to and precede any updates by the next PENDING | ||
| 574 | * owner. | ||
| 575 | */ | ||
| 576 | smp_wmb(); | ||
| 577 | set_work_data(work, (unsigned long)cpu << WORK_OFFQ_CPU_SHIFT, 0); | ||
| 568 | } | 578 | } |
| 569 | 579 | ||
| 570 | static void clear_work_data(struct work_struct *work) | 580 | static void clear_work_data(struct work_struct *work) |
| 571 | { | 581 | { |
| 582 | smp_wmb(); /* see set_work_cpu_and_clear_pending() */ | ||
| 572 | set_work_data(work, WORK_STRUCT_NO_CPU, 0); | 583 | set_work_data(work, WORK_STRUCT_NO_CPU, 0); |
| 573 | } | 584 | } |
| 574 | 585 | ||
| @@ -591,7 +602,7 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) | |||
| 591 | return ((struct cpu_workqueue_struct *) | 602 | return ((struct cpu_workqueue_struct *) |
| 592 | (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; | 603 | (data & WORK_STRUCT_WQ_DATA_MASK))->pool->gcwq; |
| 593 | 604 | ||
| 594 | cpu = data >> WORK_STRUCT_FLAG_BITS; | 605 | cpu = data >> WORK_OFFQ_CPU_SHIFT; |
| 595 | if (cpu == WORK_CPU_NONE) | 606 | if (cpu == WORK_CPU_NONE) |
| 596 | return NULL; | 607 | return NULL; |
| 597 | 608 | ||
| @@ -599,6 +610,22 @@ static struct global_cwq *get_work_gcwq(struct work_struct *work) | |||
| 599 | return get_gcwq(cpu); | 610 | return get_gcwq(cpu); |
| 600 | } | 611 | } |
| 601 | 612 | ||
| 613 | static void mark_work_canceling(struct work_struct *work) | ||
| 614 | { | ||
| 615 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
| 616 | unsigned long cpu = gcwq ? gcwq->cpu : WORK_CPU_NONE; | ||
| 617 | |||
| 618 | set_work_data(work, (cpu << WORK_OFFQ_CPU_SHIFT) | WORK_OFFQ_CANCELING, | ||
| 619 | WORK_STRUCT_PENDING); | ||
| 620 | } | ||
| 621 | |||
| 622 | static bool work_is_canceling(struct work_struct *work) | ||
| 623 | { | ||
| 624 | unsigned long data = atomic_long_read(&work->data); | ||
| 625 | |||
| 626 | return !(data & WORK_STRUCT_CWQ) && (data & WORK_OFFQ_CANCELING); | ||
| 627 | } | ||
| 628 | |||
| 602 | /* | 629 | /* |
| 603 | * Policy functions. These define the policies on how the global worker | 630 | * Policy functions. These define the policies on how the global worker |
| 604 | * pools are managed. Unless noted otherwise, these functions assume that | 631 | * pools are managed. Unless noted otherwise, these functions assume that |
| @@ -657,6 +684,13 @@ static bool too_many_workers(struct worker_pool *pool) | |||
| 657 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ | 684 | int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ |
| 658 | int nr_busy = pool->nr_workers - nr_idle; | 685 | int nr_busy = pool->nr_workers - nr_idle; |
| 659 | 686 | ||
| 687 | /* | ||
| 688 | * nr_idle and idle_list may disagree if idle rebinding is in | ||
| 689 | * progress. Never return %true if idle_list is empty. | ||
| 690 | */ | ||
| 691 | if (list_empty(&pool->idle_list)) | ||
| 692 | return false; | ||
| 693 | |||
| 660 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; | 694 | return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; |
| 661 | } | 695 | } |
| 662 | 696 | ||
| @@ -903,6 +937,206 @@ static struct worker *find_worker_executing_work(struct global_cwq *gcwq, | |||
| 903 | } | 937 | } |
| 904 | 938 | ||
| 905 | /** | 939 | /** |
| 940 | * move_linked_works - move linked works to a list | ||
| 941 | * @work: start of series of works to be scheduled | ||
| 942 | * @head: target list to append @work to | ||
| 943 | * @nextp: out parameter for nested worklist walking | ||
| 944 | * | ||
| 945 | * Schedule linked works starting from @work to @head. Work series to | ||
| 946 | * be scheduled starts at @work and includes any consecutive work with | ||
| 947 | * WORK_STRUCT_LINKED set in its predecessor. | ||
| 948 | * | ||
| 949 | * If @nextp is not NULL, it's updated to point to the next work of | ||
| 950 | * the last scheduled work. This allows move_linked_works() to be | ||
| 951 | * nested inside outer list_for_each_entry_safe(). | ||
| 952 | * | ||
| 953 | * CONTEXT: | ||
| 954 | * spin_lock_irq(gcwq->lock). | ||
| 955 | */ | ||
| 956 | static void move_linked_works(struct work_struct *work, struct list_head *head, | ||
| 957 | struct work_struct **nextp) | ||
| 958 | { | ||
| 959 | struct work_struct *n; | ||
| 960 | |||
| 961 | /* | ||
| 962 | * Linked worklist will always end before the end of the list, | ||
| 963 | * use NULL for list head. | ||
| 964 | */ | ||
| 965 | list_for_each_entry_safe_from(work, n, NULL, entry) { | ||
| 966 | list_move_tail(&work->entry, head); | ||
| 967 | if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) | ||
| 968 | break; | ||
| 969 | } | ||
| 970 | |||
| 971 | /* | ||
| 972 | * If we're already inside safe list traversal and have moved | ||
| 973 | * multiple works to the scheduled queue, the next position | ||
| 974 | * needs to be updated. | ||
| 975 | */ | ||
| 976 | if (nextp) | ||
| 977 | *nextp = n; | ||
| 978 | } | ||
| 979 | |||
| 980 | static void cwq_activate_delayed_work(struct work_struct *work) | ||
| 981 | { | ||
| 982 | struct cpu_workqueue_struct *cwq = get_work_cwq(work); | ||
| 983 | |||
| 984 | trace_workqueue_activate_work(work); | ||
| 985 | move_linked_works(work, &cwq->pool->worklist, NULL); | ||
| 986 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); | ||
| 987 | cwq->nr_active++; | ||
| 988 | } | ||
| 989 | |||
| 990 | static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) | ||
| 991 | { | ||
| 992 | struct work_struct *work = list_first_entry(&cwq->delayed_works, | ||
| 993 | struct work_struct, entry); | ||
| 994 | |||
| 995 | cwq_activate_delayed_work(work); | ||
| 996 | } | ||
| 997 | |||
| 998 | /** | ||
| 999 | * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight | ||
| 1000 | * @cwq: cwq of interest | ||
| 1001 | * @color: color of work which left the queue | ||
| 1002 | * | ||
| 1003 | * A work either has completed or is removed from pending queue, | ||
| 1004 | * decrement nr_in_flight of its cwq and handle workqueue flushing. | ||
| 1005 | * | ||
| 1006 | * CONTEXT: | ||
| 1007 | * spin_lock_irq(gcwq->lock). | ||
| 1008 | */ | ||
| 1009 | static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color) | ||
| 1010 | { | ||
| 1011 | /* ignore uncolored works */ | ||
| 1012 | if (color == WORK_NO_COLOR) | ||
| 1013 | return; | ||
| 1014 | |||
| 1015 | cwq->nr_in_flight[color]--; | ||
| 1016 | |||
| 1017 | cwq->nr_active--; | ||
| 1018 | if (!list_empty(&cwq->delayed_works)) { | ||
| 1019 | /* one down, submit a delayed one */ | ||
| 1020 | if (cwq->nr_active < cwq->max_active) | ||
| 1021 | cwq_activate_first_delayed(cwq); | ||
| 1022 | } | ||
| 1023 | |||
| 1024 | /* is flush in progress and are we at the flushing tip? */ | ||
| 1025 | if (likely(cwq->flush_color != color)) | ||
| 1026 | return; | ||
| 1027 | |||
| 1028 | /* are there still in-flight works? */ | ||
| 1029 | if (cwq->nr_in_flight[color]) | ||
| 1030 | return; | ||
| 1031 | |||
| 1032 | /* this cwq is done, clear flush_color */ | ||
| 1033 | cwq->flush_color = -1; | ||
| 1034 | |||
| 1035 | /* | ||
| 1036 | * If this was the last cwq, wake up the first flusher. It | ||
| 1037 | * will handle the rest. | ||
| 1038 | */ | ||
| 1039 | if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) | ||
| 1040 | complete(&cwq->wq->first_flusher->done); | ||
| 1041 | } | ||
| 1042 | |||
| 1043 | /** | ||
| 1044 | * try_to_grab_pending - steal work item from worklist and disable irq | ||
| 1045 | * @work: work item to steal | ||
| 1046 | * @is_dwork: @work is a delayed_work | ||
| 1047 | * @flags: place to store irq state | ||
| 1048 | * | ||
| 1049 | * Try to grab PENDING bit of @work. This function can handle @work in any | ||
| 1050 | * stable state - idle, on timer or on worklist. Return values are | ||
| 1051 | * | ||
| 1052 | * 1 if @work was pending and we successfully stole PENDING | ||
| 1053 | * 0 if @work was idle and we claimed PENDING | ||
| 1054 | * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry | ||
| 1055 | * -ENOENT if someone else is canceling @work, this state may persist | ||
| 1056 | * for arbitrarily long | ||
| 1057 | * | ||
| 1058 | * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting | ||
| 1059 | * interrupted while holding PENDING and @work off queue, irq must be | ||
| 1060 | * disabled on entry. This, combined with delayed_work->timer being | ||
| 1061 | * irqsafe, ensures that we return -EAGAIN for a finite short period of time. | ||
| 1062 | * | ||
| 1063 | * On successful return, >= 0, irq is disabled and the caller is | ||
| 1064 | * responsible for releasing it using local_irq_restore(*@flags). | ||
| 1065 | * | ||
| 1066 | * This function is safe to call from any context including IRQ handler. | ||
| 1067 | */ | ||
| 1068 | static int try_to_grab_pending(struct work_struct *work, bool is_dwork, | ||
| 1069 | unsigned long *flags) | ||
| 1070 | { | ||
| 1071 | struct global_cwq *gcwq; | ||
| 1072 | |||
| 1073 | local_irq_save(*flags); | ||
| 1074 | |||
| 1075 | /* try to steal the timer if it exists */ | ||
| 1076 | if (is_dwork) { | ||
| 1077 | struct delayed_work *dwork = to_delayed_work(work); | ||
| 1078 | |||
| 1079 | /* | ||
| 1080 | * dwork->timer is irqsafe. If del_timer() fails, it's | ||
| 1081 | * guaranteed that the timer is not queued anywhere and not | ||
| 1082 | * running on the local CPU. | ||
| 1083 | */ | ||
| 1084 | if (likely(del_timer(&dwork->timer))) | ||
| 1085 | return 1; | ||
| 1086 | } | ||
| 1087 | |||
| 1088 | /* try to claim PENDING the normal way */ | ||
| 1089 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) | ||
| 1090 | return 0; | ||
| 1091 | |||
| 1092 | /* | ||
| 1093 | * The queueing is in progress, or it is already queued. Try to | ||
| 1094 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. | ||
| 1095 | */ | ||
| 1096 | gcwq = get_work_gcwq(work); | ||
| 1097 | if (!gcwq) | ||
| 1098 | goto fail; | ||
| 1099 | |||
| 1100 | spin_lock(&gcwq->lock); | ||
| 1101 | if (!list_empty(&work->entry)) { | ||
| 1102 | /* | ||
| 1103 | * This work is queued, but perhaps we locked the wrong gcwq. | ||
| 1104 | * In that case we must see the new value after rmb(), see | ||
| 1105 | * insert_work()->wmb(). | ||
| 1106 | */ | ||
| 1107 | smp_rmb(); | ||
| 1108 | if (gcwq == get_work_gcwq(work)) { | ||
| 1109 | debug_work_deactivate(work); | ||
| 1110 | |||
| 1111 | /* | ||
| 1112 | * A delayed work item cannot be grabbed directly | ||
| 1113 | * because it might have linked NO_COLOR work items | ||
| 1114 | * which, if left on the delayed_list, will confuse | ||
| 1115 | * cwq->nr_active management later on and cause | ||
| 1116 | * stall. Make sure the work item is activated | ||
| 1117 | * before grabbing. | ||
| 1118 | */ | ||
| 1119 | if (*work_data_bits(work) & WORK_STRUCT_DELAYED) | ||
| 1120 | cwq_activate_delayed_work(work); | ||
| 1121 | |||
| 1122 | list_del_init(&work->entry); | ||
| 1123 | cwq_dec_nr_in_flight(get_work_cwq(work), | ||
| 1124 | get_work_color(work)); | ||
| 1125 | |||
| 1126 | spin_unlock(&gcwq->lock); | ||
| 1127 | return 1; | ||
| 1128 | } | ||
| 1129 | } | ||
| 1130 | spin_unlock(&gcwq->lock); | ||
| 1131 | fail: | ||
| 1132 | local_irq_restore(*flags); | ||
| 1133 | if (work_is_canceling(work)) | ||
| 1134 | return -ENOENT; | ||
| 1135 | cpu_relax(); | ||
| 1136 | return -EAGAIN; | ||
| 1137 | } | ||
| 1138 | |||
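To make the return contract above concrete, here is a sketch of how a cancel path inside workqueue.c can drive try_to_grab_pending(). The function name is hypothetical and the -ENOENT handling is simplified; it is not the caller added elsewhere in the series.

        static bool demo_cancel_work_sync(struct work_struct *work, bool is_dwork)
        {
                unsigned long flags;
                int ret;

                do {
                        ret = try_to_grab_pending(work, is_dwork, &flags);
                        /*
                         * -EAGAIN: someone is queueing, busy-retry.
                         * -ENOENT: someone else is canceling; a real caller
                         *          would flush and retry rather than spin.
                         */
                } while (unlikely(ret < 0));

                /* ret >= 0: we own PENDING and irqs are disabled. */
                mark_work_canceling(work);
                local_irq_restore(flags);

                flush_work(work);
                clear_work_data(work);
                return ret;     /* true if the work item was pending */
        }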
| 1139 | /** | ||
| 906 | * insert_work - insert a work into gcwq | 1140 | * insert_work - insert a work into gcwq |
| 907 | * @cwq: cwq @work belongs to | 1141 | * @cwq: cwq @work belongs to |
| 908 | * @work: work to insert | 1142 | * @work: work to insert |
| @@ -982,7 +1216,15 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
| 982 | struct cpu_workqueue_struct *cwq; | 1216 | struct cpu_workqueue_struct *cwq; |
| 983 | struct list_head *worklist; | 1217 | struct list_head *worklist; |
| 984 | unsigned int work_flags; | 1218 | unsigned int work_flags; |
| 985 | unsigned long flags; | 1219 | unsigned int req_cpu = cpu; |
| 1220 | |||
| 1221 | /* | ||
| 1222 | * While a work item is PENDING && off queue, a task trying to | ||
| 1223 | * steal the PENDING will busy-loop waiting for it to either get | ||
| 1224 | * queued or lose PENDING. Grabbing PENDING and queueing should | ||
| 1225 | * happen with IRQ disabled. | ||
| 1226 | */ | ||
| 1227 | WARN_ON_ONCE(!irqs_disabled()); | ||
| 986 | 1228 | ||
| 987 | debug_work_activate(work); | 1229 | debug_work_activate(work); |
| 988 | 1230 | ||
| @@ -995,21 +1237,22 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
| 995 | if (!(wq->flags & WQ_UNBOUND)) { | 1237 | if (!(wq->flags & WQ_UNBOUND)) { |
| 996 | struct global_cwq *last_gcwq; | 1238 | struct global_cwq *last_gcwq; |
| 997 | 1239 | ||
| 998 | if (unlikely(cpu == WORK_CPU_UNBOUND)) | 1240 | if (cpu == WORK_CPU_UNBOUND) |
| 999 | cpu = raw_smp_processor_id(); | 1241 | cpu = raw_smp_processor_id(); |
| 1000 | 1242 | ||
| 1001 | /* | 1243 | /* |
| 1002 | * It's multi cpu. If @wq is non-reentrant and @work | 1244 | * It's multi cpu. If @work was previously on a different |
| 1003 | * was previously on a different cpu, it might still | 1245 | * cpu, it might still be running there, in which case the |
| 1004 | * be running there, in which case the work needs to | 1246 | * work needs to be queued on that cpu to guarantee |
| 1005 | * be queued on that cpu to guarantee non-reentrance. | 1247 | * non-reentrancy. |
| 1006 | */ | 1248 | */ |
| 1007 | gcwq = get_gcwq(cpu); | 1249 | gcwq = get_gcwq(cpu); |
| 1008 | if (wq->flags & WQ_NON_REENTRANT && | 1250 | last_gcwq = get_work_gcwq(work); |
| 1009 | (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { | 1251 | |
| 1252 | if (last_gcwq && last_gcwq != gcwq) { | ||
| 1010 | struct worker *worker; | 1253 | struct worker *worker; |
| 1011 | 1254 | ||
| 1012 | spin_lock_irqsave(&last_gcwq->lock, flags); | 1255 | spin_lock(&last_gcwq->lock); |
| 1013 | 1256 | ||
| 1014 | worker = find_worker_executing_work(last_gcwq, work); | 1257 | worker = find_worker_executing_work(last_gcwq, work); |
| 1015 | 1258 | ||
| @@ -1017,22 +1260,23 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
| 1017 | gcwq = last_gcwq; | 1260 | gcwq = last_gcwq; |
| 1018 | else { | 1261 | else { |
| 1019 | /* meh... not running there, queue here */ | 1262 | /* meh... not running there, queue here */ |
| 1020 | spin_unlock_irqrestore(&last_gcwq->lock, flags); | 1263 | spin_unlock(&last_gcwq->lock); |
| 1021 | spin_lock_irqsave(&gcwq->lock, flags); | 1264 | spin_lock(&gcwq->lock); |
| 1022 | } | 1265 | } |
| 1023 | } else | 1266 | } else { |
| 1024 | spin_lock_irqsave(&gcwq->lock, flags); | 1267 | spin_lock(&gcwq->lock); |
| 1268 | } | ||
| 1025 | } else { | 1269 | } else { |
| 1026 | gcwq = get_gcwq(WORK_CPU_UNBOUND); | 1270 | gcwq = get_gcwq(WORK_CPU_UNBOUND); |
| 1027 | spin_lock_irqsave(&gcwq->lock, flags); | 1271 | spin_lock(&gcwq->lock); |
| 1028 | } | 1272 | } |
| 1029 | 1273 | ||
| 1030 | /* gcwq determined, get cwq and queue */ | 1274 | /* gcwq determined, get cwq and queue */ |
| 1031 | cwq = get_cwq(gcwq->cpu, wq); | 1275 | cwq = get_cwq(gcwq->cpu, wq); |
| 1032 | trace_workqueue_queue_work(cpu, cwq, work); | 1276 | trace_workqueue_queue_work(req_cpu, cwq, work); |
| 1033 | 1277 | ||
| 1034 | if (WARN_ON(!list_empty(&work->entry))) { | 1278 | if (WARN_ON(!list_empty(&work->entry))) { |
| 1035 | spin_unlock_irqrestore(&gcwq->lock, flags); | 1279 | spin_unlock(&gcwq->lock); |
| 1036 | return; | 1280 | return; |
| 1037 | } | 1281 | } |
| 1038 | 1282 | ||
| @@ -1050,79 +1294,110 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, | |||
| 1050 | 1294 | ||
| 1051 | insert_work(cwq, work, worklist, work_flags); | 1295 | insert_work(cwq, work, worklist, work_flags); |
| 1052 | 1296 | ||
| 1053 | spin_unlock_irqrestore(&gcwq->lock, flags); | 1297 | spin_unlock(&gcwq->lock); |
| 1054 | } | 1298 | } |
| 1055 | 1299 | ||
| 1056 | /** | 1300 | /** |
| 1057 | * queue_work - queue work on a workqueue | 1301 | * queue_work_on - queue work on specific cpu |
| 1302 | * @cpu: CPU number to execute work on | ||
| 1058 | * @wq: workqueue to use | 1303 | * @wq: workqueue to use |
| 1059 | * @work: work to queue | 1304 | * @work: work to queue |
| 1060 | * | 1305 | * |
| 1061 | * Returns 0 if @work was already on a queue, non-zero otherwise. | 1306 | * Returns %false if @work was already on a queue, %true otherwise. |
| 1062 | * | 1307 | * |
| 1063 | * We queue the work to the CPU on which it was submitted, but if the CPU dies | 1308 | * We queue the work to a specific CPU, the caller must ensure it |
| 1064 | * it can be processed by another CPU. | 1309 | * can't go away. |
| 1065 | */ | 1310 | */ |
| 1066 | int queue_work(struct workqueue_struct *wq, struct work_struct *work) | 1311 | bool queue_work_on(int cpu, struct workqueue_struct *wq, |
| 1312 | struct work_struct *work) | ||
| 1067 | { | 1313 | { |
| 1068 | int ret; | 1314 | bool ret = false; |
| 1315 | unsigned long flags; | ||
| 1069 | 1316 | ||
| 1070 | ret = queue_work_on(get_cpu(), wq, work); | 1317 | local_irq_save(flags); |
| 1071 | put_cpu(); | 1318 | |
| 1319 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { | ||
| 1320 | __queue_work(cpu, wq, work); | ||
| 1321 | ret = true; | ||
| 1322 | } | ||
| 1072 | 1323 | ||
| 1324 | local_irq_restore(flags); | ||
| 1073 | return ret; | 1325 | return ret; |
| 1074 | } | 1326 | } |
| 1075 | EXPORT_SYMBOL_GPL(queue_work); | 1327 | EXPORT_SYMBOL_GPL(queue_work_on); |
| 1076 | 1328 | ||
| 1077 | /** | 1329 | /** |
| 1078 | * queue_work_on - queue work on specific cpu | 1330 | * queue_work - queue work on a workqueue |
| 1079 | * @cpu: CPU number to execute work on | ||
| 1080 | * @wq: workqueue to use | 1331 | * @wq: workqueue to use |
| 1081 | * @work: work to queue | 1332 | * @work: work to queue |
| 1082 | * | 1333 | * |
| 1083 | * Returns 0 if @work was already on a queue, non-zero otherwise. | 1334 | * Returns %false if @work was already on a queue, %true otherwise. |
| 1084 | * | 1335 | * |
| 1085 | * We queue the work to a specific CPU, the caller must ensure it | 1336 | * We queue the work to the CPU on which it was submitted, but if the CPU dies |
| 1086 | * can't go away. | 1337 | * it can be processed by another CPU. |
| 1087 | */ | 1338 | */ |
| 1088 | int | 1339 | bool queue_work(struct workqueue_struct *wq, struct work_struct *work) |
| 1089 | queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) | ||
| 1090 | { | 1340 | { |
| 1091 | int ret = 0; | 1341 | return queue_work_on(WORK_CPU_UNBOUND, wq, work); |
| 1092 | |||
| 1093 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { | ||
| 1094 | __queue_work(cpu, wq, work); | ||
| 1095 | ret = 1; | ||
| 1096 | } | ||
| 1097 | return ret; | ||
| 1098 | } | 1342 | } |
| 1099 | EXPORT_SYMBOL_GPL(queue_work_on); | 1343 | EXPORT_SYMBOL_GPL(queue_work); |
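With the two hunks above, queue_work() becomes a thin wrapper that queues on WORK_CPU_UNBOUND and both entry points return bool. A minimal caller-side sketch of the new return convention (the my_* names are illustrative, not part of this diff):

	#include <linux/workqueue.h>
	#include <linux/printk.h>

	static void my_work_fn(struct work_struct *work)
	{
		/* runs in process context on some worker thread */
	}
	static DECLARE_WORK(my_work, my_work_fn);

	static void my_kick(void)
	{
		/* %true: newly queued; %false: already pending, calls coalesce */
		if (!queue_work(system_wq, &my_work))
			pr_debug("my_work already pending\n");
	}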
| 1100 | 1344 | ||
| 1101 | static void delayed_work_timer_fn(unsigned long __data) | 1345 | void delayed_work_timer_fn(unsigned long __data) |
| 1102 | { | 1346 | { |
| 1103 | struct delayed_work *dwork = (struct delayed_work *)__data; | 1347 | struct delayed_work *dwork = (struct delayed_work *)__data; |
| 1104 | struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); | 1348 | struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); |
| 1105 | 1349 | ||
| 1106 | __queue_work(smp_processor_id(), cwq->wq, &dwork->work); | 1350 | /* should have been called from irqsafe timer with irq already off */ |
| 1351 | __queue_work(dwork->cpu, cwq->wq, &dwork->work); | ||
| 1107 | } | 1352 | } |
| 1353 | EXPORT_SYMBOL_GPL(delayed_work_timer_fn); | ||
| 1108 | 1354 | ||
| 1109 | /** | 1355 | static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, |
| 1110 | * queue_delayed_work - queue work on a workqueue after delay | 1356 | struct delayed_work *dwork, unsigned long delay) |
| 1111 | * @wq: workqueue to use | ||
| 1112 | * @dwork: delayable work to queue | ||
| 1113 | * @delay: number of jiffies to wait before queueing | ||
| 1114 | * | ||
| 1115 | * Returns 0 if @work was already on a queue, non-zero otherwise. | ||
| 1116 | */ | ||
| 1117 | int queue_delayed_work(struct workqueue_struct *wq, | ||
| 1118 | struct delayed_work *dwork, unsigned long delay) | ||
| 1119 | { | 1357 | { |
| 1120 | if (delay == 0) | 1358 | struct timer_list *timer = &dwork->timer; |
| 1121 | return queue_work(wq, &dwork->work); | 1359 | struct work_struct *work = &dwork->work; |
| 1360 | unsigned int lcpu; | ||
| 1361 | |||
| 1362 | WARN_ON_ONCE(timer->function != delayed_work_timer_fn || | ||
| 1363 | timer->data != (unsigned long)dwork); | ||
| 1364 | BUG_ON(timer_pending(timer)); | ||
| 1365 | BUG_ON(!list_empty(&work->entry)); | ||
| 1366 | |||
| 1367 | timer_stats_timer_set_start_info(&dwork->timer); | ||
| 1368 | |||
| 1369 | /* | ||
| 1370 | * This stores cwq for the moment, for the timer_fn. Note that the | ||
| 1371 | * work's gcwq is preserved to allow reentrance detection for | ||
| 1372 | * delayed works. | ||
| 1373 | */ | ||
| 1374 | if (!(wq->flags & WQ_UNBOUND)) { | ||
| 1375 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
| 1122 | 1376 | ||
| 1123 | return queue_delayed_work_on(-1, wq, dwork, delay); | 1377 | /* |
| 1378 | * If we cannot get the last gcwq from @work directly, | ||
| 1379 | * select the last CPU such that it avoids unnecessarily | ||
| 1380 | * triggering non-reentrancy check in __queue_work(). | ||
| 1381 | */ | ||
| 1382 | lcpu = cpu; | ||
| 1383 | if (gcwq) | ||
| 1384 | lcpu = gcwq->cpu; | ||
| 1385 | if (lcpu == WORK_CPU_UNBOUND) | ||
| 1386 | lcpu = raw_smp_processor_id(); | ||
| 1387 | } else { | ||
| 1388 | lcpu = WORK_CPU_UNBOUND; | ||
| 1389 | } | ||
| 1390 | |||
| 1391 | set_work_cwq(work, get_cwq(lcpu, wq), 0); | ||
| 1392 | |||
| 1393 | dwork->cpu = cpu; | ||
| 1394 | timer->expires = jiffies + delay; | ||
| 1395 | |||
| 1396 | if (unlikely(cpu != WORK_CPU_UNBOUND)) | ||
| 1397 | add_timer_on(timer, cpu); | ||
| 1398 | else | ||
| 1399 | add_timer(timer); | ||
| 1124 | } | 1400 | } |
| 1125 | EXPORT_SYMBOL_GPL(queue_delayed_work); | ||
| 1126 | 1401 | ||
| 1127 | /** | 1402 | /** |
| 1128 | * queue_delayed_work_on - queue work on specific CPU after delay | 1403 | * queue_delayed_work_on - queue work on specific CPU after delay |
| @@ -1131,53 +1406,100 @@ EXPORT_SYMBOL_GPL(queue_delayed_work); | |||
| 1131 | * @dwork: work to queue | 1406 | * @dwork: work to queue |
| 1132 | * @delay: number of jiffies to wait before queueing | 1407 | * @delay: number of jiffies to wait before queueing |
| 1133 | * | 1408 | * |
| 1134 | * Returns 0 if @work was already on a queue, non-zero otherwise. | 1409 | * Returns %false if @work was already on a queue, %true otherwise. If |
| 1410 | * @delay is zero and @dwork is idle, it will be scheduled for immediate | ||
| 1411 | * execution. | ||
| 1135 | */ | 1412 | */ |
| 1136 | int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, | 1413 | bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, |
| 1137 | struct delayed_work *dwork, unsigned long delay) | 1414 | struct delayed_work *dwork, unsigned long delay) |
| 1138 | { | 1415 | { |
| 1139 | int ret = 0; | ||
| 1140 | struct timer_list *timer = &dwork->timer; | ||
| 1141 | struct work_struct *work = &dwork->work; | 1416 | struct work_struct *work = &dwork->work; |
| 1417 | bool ret = false; | ||
| 1418 | unsigned long flags; | ||
| 1142 | 1419 | ||
| 1143 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { | 1420 | if (!delay) |
| 1144 | unsigned int lcpu; | 1421 | return queue_work_on(cpu, wq, &dwork->work); |
| 1145 | 1422 | ||
| 1146 | BUG_ON(timer_pending(timer)); | 1423 | /* read the comment in __queue_work() */ |
| 1147 | BUG_ON(!list_empty(&work->entry)); | 1424 | local_irq_save(flags); |
| 1148 | 1425 | ||
| 1149 | timer_stats_timer_set_start_info(&dwork->timer); | 1426 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { |
| 1427 | __queue_delayed_work(cpu, wq, dwork, delay); | ||
| 1428 | ret = true; | ||
| 1429 | } | ||
| 1150 | 1430 | ||
| 1151 | /* | 1431 | local_irq_restore(flags); |
| 1152 | * This stores cwq for the moment, for the timer_fn. | 1432 | return ret; |
| 1153 | * Note that the work's gcwq is preserved to allow | 1433 | } |
| 1154 | * reentrance detection for delayed works. | 1434 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); |
| 1155 | */ | ||
| 1156 | if (!(wq->flags & WQ_UNBOUND)) { | ||
| 1157 | struct global_cwq *gcwq = get_work_gcwq(work); | ||
| 1158 | 1435 | ||
| 1159 | if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) | 1436 | /** |
| 1160 | lcpu = gcwq->cpu; | 1437 | * queue_delayed_work - queue work on a workqueue after delay |
| 1161 | else | 1438 | * @wq: workqueue to use |
| 1162 | lcpu = raw_smp_processor_id(); | 1439 | * @dwork: delayable work to queue |
| 1163 | } else | 1440 | * @delay: number of jiffies to wait before queueing |
| 1164 | lcpu = WORK_CPU_UNBOUND; | 1441 | * |
| 1442 | * Equivalent to queue_delayed_work_on() but tries to use the local CPU. | ||
| 1443 | */ | ||
| 1444 | bool queue_delayed_work(struct workqueue_struct *wq, | ||
| 1445 | struct delayed_work *dwork, unsigned long delay) | ||
| 1446 | { | ||
| 1447 | return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); | ||
| 1448 | } | ||
| 1449 | EXPORT_SYMBOL_GPL(queue_delayed_work); | ||
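__queue_delayed_work() above now WARN()s unless dwork->timer points at delayed_work_timer_fn(), so delayed work items must come from INIT_DELAYED_WORK()/DECLARE_DELAYED_WORK() rather than hand-rolled timers. A hedged sketch of a self-rearming poller using the new bool API (names are illustrative only):

	#include <linux/jiffies.h>
	#include <linux/workqueue.h>

	static void my_poll_fn(struct work_struct *work);
	/* initializes the embedded timer with delayed_work_timer_fn */
	static DECLARE_DELAYED_WORK(my_poll_work, my_poll_fn);

	static void my_poll_fn(struct work_struct *work)
	{
		/* ... poll hardware ... then re-arm 500ms out */
		queue_delayed_work(system_wq, &my_poll_work,
				   msecs_to_jiffies(500));
	}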
| 1165 | 1450 | ||
| 1166 | set_work_cwq(work, get_cwq(lcpu, wq), 0); | 1451 | /** |
| 1452 | * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU | ||
| 1453 | * @cpu: CPU number to execute work on | ||
| 1454 | * @wq: workqueue to use | ||
| 1455 | * @dwork: work to queue | ||
| 1456 | * @delay: number of jiffies to wait before queueing | ||
| 1457 | * | ||
| 1458 | * If @dwork is idle, equivalent to queue_delayed_work_on(); otherwise, | ||
| 1459 | * modify @dwork's timer so that it expires after @delay. If @delay is | ||
| 1460 | * zero, @work is guaranteed to be scheduled immediately regardless of its | ||
| 1461 | * current state. | ||
| 1462 | * | ||
| 1463 | * Returns %false if @dwork was idle and queued, %true if @dwork was | ||
| 1464 | * pending and its timer was modified. | ||
| 1465 | * | ||
| 1466 | * This function is safe to call from any context including IRQ handler. | ||
| 1467 | * See try_to_grab_pending() for details. | ||
| 1468 | */ | ||
| 1469 | bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, | ||
| 1470 | struct delayed_work *dwork, unsigned long delay) | ||
| 1471 | { | ||
| 1472 | unsigned long flags; | ||
| 1473 | int ret; | ||
| 1167 | 1474 | ||
| 1168 | timer->expires = jiffies + delay; | 1475 | do { |
| 1169 | timer->data = (unsigned long)dwork; | 1476 | ret = try_to_grab_pending(&dwork->work, true, &flags); |
| 1170 | timer->function = delayed_work_timer_fn; | 1477 | } while (unlikely(ret == -EAGAIN)); |
| 1171 | 1478 | ||
| 1172 | if (unlikely(cpu >= 0)) | 1479 | if (likely(ret >= 0)) { |
| 1173 | add_timer_on(timer, cpu); | 1480 | __queue_delayed_work(cpu, wq, dwork, delay); |
| 1174 | else | 1481 | local_irq_restore(flags); |
| 1175 | add_timer(timer); | ||
| 1176 | ret = 1; | ||
| 1177 | } | 1482 | } |
| 1483 | |||
| 1484 | /* -ENOENT from try_to_grab_pending() becomes %true */ | ||
| 1178 | return ret; | 1485 | return ret; |
| 1179 | } | 1486 | } |
| 1180 | EXPORT_SYMBOL_GPL(queue_delayed_work_on); | 1487 | EXPORT_SYMBOL_GPL(mod_delayed_work_on); |
| 1488 | |||
| 1489 | /** | ||
| 1490 | * mod_delayed_work - modify delay of or queue a delayed work | ||
| 1491 | * @wq: workqueue to use | ||
| 1492 | * @dwork: work to queue | ||
| 1493 | * @delay: number of jiffies to wait before queueing | ||
| 1494 | * | ||
| 1495 | * mod_delayed_work_on() on local CPU. | ||
| 1496 | */ | ||
| 1497 | bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, | ||
| 1498 | unsigned long delay) | ||
| 1499 | { | ||
| 1500 | return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); | ||
| 1501 | } | ||
| 1502 | EXPORT_SYMBOL_GPL(mod_delayed_work); | ||
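mod_delayed_work()/mod_delayed_work_on() are new in this diff and replace the common cancel-then-requeue pattern. A hypothetical debounce-style caller, assuming a DECLARE_DELAYED_WORK()-initialized item:

	#include <linux/jiffies.h>
	#include <linux/workqueue.h>

	static void my_timeout_fn(struct work_struct *work);
	static DECLARE_DELAYED_WORK(my_timeout_work, my_timeout_fn);

	/* every call pushes the deadline out; safe even from IRQ context */
	static void my_touch_timeout(void)
	{
		mod_delayed_work(system_wq, &my_timeout_work,
				 msecs_to_jiffies(1000));
	}

	static void my_timeout_fn(struct work_struct *work)
	{
		/* fires only after 1s with no my_touch_timeout() calls */
	}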
| 1181 | 1503 | ||
| 1182 | /** | 1504 | /** |
| 1183 | * worker_enter_idle - enter idle state | 1505 | * worker_enter_idle - enter idle state |
| @@ -1305,37 +1627,21 @@ __acquires(&gcwq->lock) | |||
| 1305 | } | 1627 | } |
| 1306 | } | 1628 | } |
| 1307 | 1629 | ||
| 1308 | struct idle_rebind { | ||
| 1309 | int cnt; /* # workers to be rebound */ | ||
| 1310 | struct completion done; /* all workers rebound */ | ||
| 1311 | }; | ||
| 1312 | |||
| 1313 | /* | 1630 | /* |
| 1314 | * Rebind an idle @worker to its CPU. During CPU onlining, this has to | 1631 | * Rebind an idle @worker to its CPU. worker_thread() will test |
| 1315 | * happen synchronously for idle workers. worker_thread() will test | 1632 | * list_empty(@worker->entry) before leaving idle and call this function. |
| 1316 | * %WORKER_REBIND before leaving idle and call this function. | ||
| 1317 | */ | 1633 | */ |
| 1318 | static void idle_worker_rebind(struct worker *worker) | 1634 | static void idle_worker_rebind(struct worker *worker) |
| 1319 | { | 1635 | { |
| 1320 | struct global_cwq *gcwq = worker->pool->gcwq; | 1636 | struct global_cwq *gcwq = worker->pool->gcwq; |
| 1321 | 1637 | ||
| 1322 | /* CPU must be online at this point */ | 1638 | /* CPU may go down again in between, clear UNBOUND only on success */ |
| 1323 | WARN_ON(!worker_maybe_bind_and_lock(worker)); | 1639 | if (worker_maybe_bind_and_lock(worker)) |
| 1324 | if (!--worker->idle_rebind->cnt) | 1640 | worker_clr_flags(worker, WORKER_UNBOUND); |
| 1325 | complete(&worker->idle_rebind->done); | ||
| 1326 | spin_unlock_irq(&worker->pool->gcwq->lock); | ||
| 1327 | 1641 | ||
| 1328 | /* we did our part, wait for rebind_workers() to finish up */ | 1642 | /* rebind complete, become available again */ |
| 1329 | wait_event(gcwq->rebind_hold, !(worker->flags & WORKER_REBIND)); | 1643 | list_add(&worker->entry, &worker->pool->idle_list); |
| 1330 | 1644 | spin_unlock_irq(&gcwq->lock); | |
| 1331 | /* | ||
| 1332 | * rebind_workers() shouldn't finish until all workers passed the | ||
| 1333 | * above WORKER_REBIND wait. Tell it when done. | ||
| 1334 | */ | ||
| 1335 | spin_lock_irq(&worker->pool->gcwq->lock); | ||
| 1336 | if (!--worker->idle_rebind->cnt) | ||
| 1337 | complete(&worker->idle_rebind->done); | ||
| 1338 | spin_unlock_irq(&worker->pool->gcwq->lock); | ||
| 1339 | } | 1645 | } |
| 1340 | 1646 | ||
| 1341 | /* | 1647 | /* |
| @@ -1349,16 +1655,8 @@ static void busy_worker_rebind_fn(struct work_struct *work) | |||
| 1349 | struct worker *worker = container_of(work, struct worker, rebind_work); | 1655 | struct worker *worker = container_of(work, struct worker, rebind_work); |
| 1350 | struct global_cwq *gcwq = worker->pool->gcwq; | 1656 | struct global_cwq *gcwq = worker->pool->gcwq; |
| 1351 | 1657 | ||
| 1352 | worker_maybe_bind_and_lock(worker); | 1658 | if (worker_maybe_bind_and_lock(worker)) |
| 1353 | 1659 | worker_clr_flags(worker, WORKER_UNBOUND); | |
| 1354 | /* | ||
| 1355 | * %WORKER_REBIND must be cleared even if the above binding failed; | ||
| 1356 | * otherwise, we may confuse the next CPU_UP cycle or oops / get | ||
| 1357 | * stuck by calling idle_worker_rebind() prematurely. If CPU went | ||
| 1358 | * down again inbetween, %WORKER_UNBOUND would be set, so clearing | ||
| 1359 | * %WORKER_REBIND is always safe. | ||
| 1360 | */ | ||
| 1361 | worker_clr_flags(worker, WORKER_REBIND); | ||
| 1362 | 1660 | ||
| 1363 | spin_unlock_irq(&gcwq->lock); | 1661 | spin_unlock_irq(&gcwq->lock); |
| 1364 | } | 1662 | } |
| @@ -1370,123 +1668,74 @@ static void busy_worker_rebind_fn(struct work_struct *work) | |||
| 1370 | * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding | 1668 | * @gcwq->cpu is coming online. Rebind all workers to the CPU. Rebinding |
| 1371 | * is different for idle and busy ones. | 1669 | * is different for idle and busy ones. |
| 1372 | * | 1670 | * |
| 1373 | * The idle ones should be rebound synchronously and idle rebinding should | 1671 | * Idle ones will be removed from the idle_list and woken up. They will |
| 1374 | * be complete before any worker starts executing work items with | 1672 | * add themselves back after completing rebind. This ensures that the |
| 1375 | * concurrency management enabled; otherwise, scheduler may oops trying to | 1673 | * idle_list doesn't contain any unbound workers when re-bound busy workers |
| 1376 | * wake up non-local idle worker from wq_worker_sleeping(). | 1674 | * try to perform local wake-ups for concurrency management. |
| 1377 | * | 1675 | * |
| 1378 | * This is achieved by repeatedly requesting rebinding until all idle | 1676 | * Busy workers can rebind after they finish their current work items. |
| 1379 | * workers are known to have been rebound under @gcwq->lock and holding all | 1677 | * Queueing the rebind work item at the head of the scheduled list is |
| 1380 | * idle workers from becoming busy until idle rebinding is complete. | 1678 | * enough. Note that nr_running will be properly bumped as busy workers |
| 1679 | * rebind. | ||
| 1381 | * | 1680 | * |
| 1382 | * Once idle workers are rebound, busy workers can be rebound as they | 1681 | * On return, all non-manager workers are scheduled for rebind - see |
| 1383 | * finish executing their current work items. Queueing the rebind work at | 1682 | * manage_workers() for the manager special case. Any idle worker |
| 1384 | * the head of their scheduled lists is enough. Note that nr_running will | 1683 | * including the manager will not appear on @idle_list until rebind is |
| 1385 | * be properbly bumped as busy workers rebind. | 1684 | * complete, making local wake-ups safe. |
| 1386 | * | ||
| 1387 | * On return, all workers are guaranteed to either be bound or have rebind | ||
| 1388 | * work item scheduled. | ||
| 1389 | */ | 1685 | */ |
| 1390 | static void rebind_workers(struct global_cwq *gcwq) | 1686 | static void rebind_workers(struct global_cwq *gcwq) |
| 1391 | __releases(&gcwq->lock) __acquires(&gcwq->lock) | ||
| 1392 | { | 1687 | { |
| 1393 | struct idle_rebind idle_rebind; | ||
| 1394 | struct worker_pool *pool; | 1688 | struct worker_pool *pool; |
| 1395 | struct worker *worker; | 1689 | struct worker *worker, *n; |
| 1396 | struct hlist_node *pos; | 1690 | struct hlist_node *pos; |
| 1397 | int i; | 1691 | int i; |
| 1398 | 1692 | ||
| 1399 | lockdep_assert_held(&gcwq->lock); | 1693 | lockdep_assert_held(&gcwq->lock); |
| 1400 | 1694 | ||
| 1401 | for_each_worker_pool(pool, gcwq) | 1695 | for_each_worker_pool(pool, gcwq) |
| 1402 | lockdep_assert_held(&pool->manager_mutex); | 1696 | lockdep_assert_held(&pool->assoc_mutex); |
| 1403 | 1697 | ||
| 1404 | /* | 1698 | /* dequeue and kick idle ones */ |
| 1405 | * Rebind idle workers. Interlocked both ways. We wait for | ||
| 1406 | * workers to rebind via @idle_rebind.done. Workers will wait for | ||
| 1407 | * us to finish up by watching %WORKER_REBIND. | ||
| 1408 | */ | ||
| 1409 | init_completion(&idle_rebind.done); | ||
| 1410 | retry: | ||
| 1411 | idle_rebind.cnt = 1; | ||
| 1412 | INIT_COMPLETION(idle_rebind.done); | ||
| 1413 | |||
| 1414 | /* set REBIND and kick idle ones, we'll wait for these later */ | ||
| 1415 | for_each_worker_pool(pool, gcwq) { | 1699 | for_each_worker_pool(pool, gcwq) { |
| 1416 | list_for_each_entry(worker, &pool->idle_list, entry) { | 1700 | list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { |
| 1417 | unsigned long worker_flags = worker->flags; | 1701 | /* |
| 1418 | 1702 | * idle workers should be off @pool->idle_list | |
| 1419 | if (worker->flags & WORKER_REBIND) | 1703 | * until rebind is complete to avoid receiving |
| 1420 | continue; | 1704 | * premature local wake-ups. |
| 1421 | 1705 | */ | |
| 1422 | /* morph UNBOUND to REBIND atomically */ | 1706 | list_del_init(&worker->entry); |
| 1423 | worker_flags &= ~WORKER_UNBOUND; | ||
| 1424 | worker_flags |= WORKER_REBIND; | ||
| 1425 | ACCESS_ONCE(worker->flags) = worker_flags; | ||
| 1426 | |||
| 1427 | idle_rebind.cnt++; | ||
| 1428 | worker->idle_rebind = &idle_rebind; | ||
| 1429 | 1707 | ||
| 1430 | /* worker_thread() will call idle_worker_rebind() */ | 1708 | /* |
| 1709 | * worker_thread() will see the above dequeuing | ||
| 1710 | * and call idle_worker_rebind(). | ||
| 1711 | */ | ||
| 1431 | wake_up_process(worker->task); | 1712 | wake_up_process(worker->task); |
| 1432 | } | 1713 | } |
| 1433 | } | 1714 | } |
| 1434 | 1715 | ||
| 1435 | if (--idle_rebind.cnt) { | 1716 | /* rebind busy workers */ |
| 1436 | spin_unlock_irq(&gcwq->lock); | ||
| 1437 | wait_for_completion(&idle_rebind.done); | ||
| 1438 | spin_lock_irq(&gcwq->lock); | ||
| 1439 | /* busy ones might have become idle while waiting, retry */ | ||
| 1440 | goto retry; | ||
| 1441 | } | ||
| 1442 | |||
| 1443 | /* all idle workers are rebound, rebind busy workers */ | ||
| 1444 | for_each_busy_worker(worker, i, pos, gcwq) { | 1717 | for_each_busy_worker(worker, i, pos, gcwq) { |
| 1445 | struct work_struct *rebind_work = &worker->rebind_work; | 1718 | struct work_struct *rebind_work = &worker->rebind_work; |
| 1446 | unsigned long worker_flags = worker->flags; | 1719 | struct workqueue_struct *wq; |
| 1447 | |||
| 1448 | /* morph UNBOUND to REBIND atomically */ | ||
| 1449 | worker_flags &= ~WORKER_UNBOUND; | ||
| 1450 | worker_flags |= WORKER_REBIND; | ||
| 1451 | ACCESS_ONCE(worker->flags) = worker_flags; | ||
| 1452 | 1720 | ||
| 1453 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, | 1721 | if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, |
| 1454 | work_data_bits(rebind_work))) | 1722 | work_data_bits(rebind_work))) |
| 1455 | continue; | 1723 | continue; |
| 1456 | 1724 | ||
| 1457 | /* wq doesn't matter, use the default one */ | ||
| 1458 | debug_work_activate(rebind_work); | 1725 | debug_work_activate(rebind_work); |
| 1459 | insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, | ||
| 1460 | worker->scheduled.next, | ||
| 1461 | work_color_to_flags(WORK_NO_COLOR)); | ||
| 1462 | } | ||
| 1463 | |||
| 1464 | /* | ||
| 1465 | * All idle workers are rebound and waiting for %WORKER_REBIND to | ||
| 1466 | * be cleared inside idle_worker_rebind(). Clear and release. | ||
| 1467 | * Clearing %WORKER_REBIND from this foreign context is safe | ||
| 1468 | * because these workers are still guaranteed to be idle. | ||
| 1469 | * | ||
| 1470 | * We need to make sure all idle workers passed WORKER_REBIND wait | ||
| 1471 | * in idle_worker_rebind() before returning; otherwise, workers can | ||
| 1472 | * get stuck at the wait if hotplug cycle repeats. | ||
| 1473 | */ | ||
| 1474 | idle_rebind.cnt = 1; | ||
| 1475 | INIT_COMPLETION(idle_rebind.done); | ||
| 1476 | |||
| 1477 | for_each_worker_pool(pool, gcwq) { | ||
| 1478 | list_for_each_entry(worker, &pool->idle_list, entry) { | ||
| 1479 | worker->flags &= ~WORKER_REBIND; | ||
| 1480 | idle_rebind.cnt++; | ||
| 1481 | } | ||
| 1482 | } | ||
| 1483 | 1726 | ||
| 1484 | wake_up_all(&gcwq->rebind_hold); | 1727 | /* |
| 1728 | * wq doesn't really matter but let's keep @worker->pool | ||
| 1729 | * and @cwq->pool consistent for sanity. | ||
| 1730 | */ | ||
| 1731 | if (worker_pool_pri(worker->pool)) | ||
| 1732 | wq = system_highpri_wq; | ||
| 1733 | else | ||
| 1734 | wq = system_wq; | ||
| 1485 | 1735 | ||
| 1486 | if (--idle_rebind.cnt) { | 1736 | insert_work(get_cwq(gcwq->cpu, wq), rebind_work, |
| 1487 | spin_unlock_irq(&gcwq->lock); | 1737 | worker->scheduled.next, |
| 1488 | wait_for_completion(&idle_rebind.done); | 1738 | work_color_to_flags(WORK_NO_COLOR)); |
| 1489 | spin_lock_irq(&gcwq->lock); | ||
| 1490 | } | 1739 | } |
| 1491 | } | 1740 | } |
| 1492 | 1741 | ||
| @@ -1844,22 +2093,22 @@ static bool manage_workers(struct worker *worker) | |||
| 1844 | * grab %POOL_MANAGING_WORKERS to achieve this because that can | 2093 | * grab %POOL_MANAGING_WORKERS to achieve this because that can |
| 1845 | * lead to idle worker depletion (all become busy thinking someone | 2094 | * lead to idle worker depletion (all become busy thinking someone |
| 1846 | * else is managing) which in turn can result in deadlock under | 2095 | * else is managing) which in turn can result in deadlock under |
| 1847 | * extreme circumstances. Use @pool->manager_mutex to synchronize | 2096 | * extreme circumstances. Use @pool->assoc_mutex to synchronize |
| 1848 | * manager against CPU hotplug. | 2097 | * manager against CPU hotplug. |
| 1849 | * | 2098 | * |
| 1850 | * manager_mutex would always be free unless CPU hotplug is in | 2099 | * assoc_mutex would always be free unless CPU hotplug is in |
| 1851 | * progress. trylock first without dropping @gcwq->lock. | 2100 | * progress. trylock first without dropping @gcwq->lock. |
| 1852 | */ | 2101 | */ |
| 1853 | if (unlikely(!mutex_trylock(&pool->manager_mutex))) { | 2102 | if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { |
| 1854 | spin_unlock_irq(&pool->gcwq->lock); | 2103 | spin_unlock_irq(&pool->gcwq->lock); |
| 1855 | mutex_lock(&pool->manager_mutex); | 2104 | mutex_lock(&pool->assoc_mutex); |
| 1856 | /* | 2105 | /* |
| 1857 | * CPU hotplug could have happened while we were waiting | 2106 | * CPU hotplug could have happened while we were waiting |
| 1858 | * for manager_mutex. Hotplug itself can't handle us | 2107 | * for assoc_mutex. Hotplug itself can't handle us |
| 1859 | * because manager isn't either on idle or busy list, and | 2108 | * because manager isn't either on idle or busy list, and |
| 1860 | * @gcwq's state and ours could have deviated. | 2109 | * @gcwq's state and ours could have deviated. |
| 1861 | * | 2110 | * |
| 1862 | * As hotplug is now excluded via manager_mutex, we can | 2111 | * As hotplug is now excluded via assoc_mutex, we can |
| 1863 | * simply try to bind. It will succeed or fail depending | 2112 | * simply try to bind. It will succeed or fail depending |
| 1864 | * on @gcwq's current state. Try it and adjust | 2113 | * on @gcwq's current state. Try it and adjust |
| 1865 | * %WORKER_UNBOUND accordingly. | 2114 | * %WORKER_UNBOUND accordingly. |
| @@ -1882,112 +2131,11 @@ static bool manage_workers(struct worker *worker) | |||
| 1882 | ret |= maybe_create_worker(pool); | 2131 | ret |= maybe_create_worker(pool); |
| 1883 | 2132 | ||
| 1884 | pool->flags &= ~POOL_MANAGING_WORKERS; | 2133 | pool->flags &= ~POOL_MANAGING_WORKERS; |
| 1885 | mutex_unlock(&pool->manager_mutex); | 2134 | mutex_unlock(&pool->assoc_mutex); |
| 1886 | return ret; | 2135 | return ret; |
| 1887 | } | 2136 | } |
| 1888 | 2137 | ||
| 1889 | /** | 2138 | /** |
| 1890 | * move_linked_works - move linked works to a list | ||
| 1891 | * @work: start of series of works to be scheduled | ||
| 1892 | * @head: target list to append @work to | ||
| 1893 | * @nextp: out paramter for nested worklist walking | ||
| 1894 | * | ||
| 1895 | * Schedule linked works starting from @work to @head. Work series to | ||
| 1896 | * be scheduled starts at @work and includes any consecutive work with | ||
| 1897 | * WORK_STRUCT_LINKED set in its predecessor. | ||
| 1898 | * | ||
| 1899 | * If @nextp is not NULL, it's updated to point to the next work of | ||
| 1900 | * the last scheduled work. This allows move_linked_works() to be | ||
| 1901 | * nested inside outer list_for_each_entry_safe(). | ||
| 1902 | * | ||
| 1903 | * CONTEXT: | ||
| 1904 | * spin_lock_irq(gcwq->lock). | ||
| 1905 | */ | ||
| 1906 | static void move_linked_works(struct work_struct *work, struct list_head *head, | ||
| 1907 | struct work_struct **nextp) | ||
| 1908 | { | ||
| 1909 | struct work_struct *n; | ||
| 1910 | |||
| 1911 | /* | ||
| 1912 | * Linked worklist will always end before the end of the list, | ||
| 1913 | * use NULL for list head. | ||
| 1914 | */ | ||
| 1915 | list_for_each_entry_safe_from(work, n, NULL, entry) { | ||
| 1916 | list_move_tail(&work->entry, head); | ||
| 1917 | if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) | ||
| 1918 | break; | ||
| 1919 | } | ||
| 1920 | |||
| 1921 | /* | ||
| 1922 | * If we're already inside safe list traversal and have moved | ||
| 1923 | * multiple works to the scheduled queue, the next position | ||
| 1924 | * needs to be updated. | ||
| 1925 | */ | ||
| 1926 | if (nextp) | ||
| 1927 | *nextp = n; | ||
| 1928 | } | ||
| 1929 | |||
| 1930 | static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) | ||
| 1931 | { | ||
| 1932 | struct work_struct *work = list_first_entry(&cwq->delayed_works, | ||
| 1933 | struct work_struct, entry); | ||
| 1934 | |||
| 1935 | trace_workqueue_activate_work(work); | ||
| 1936 | move_linked_works(work, &cwq->pool->worklist, NULL); | ||
| 1937 | __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); | ||
| 1938 | cwq->nr_active++; | ||
| 1939 | } | ||
| 1940 | |||
| 1941 | /** | ||
| 1942 | * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight | ||
| 1943 | * @cwq: cwq of interest | ||
| 1944 | * @color: color of work which left the queue | ||
| 1945 | * @delayed: for a delayed work | ||
| 1946 | * | ||
| 1947 | * A work either has completed or is removed from pending queue, | ||
| 1948 | * decrement nr_in_flight of its cwq and handle workqueue flushing. | ||
| 1949 | * | ||
| 1950 | * CONTEXT: | ||
| 1951 | * spin_lock_irq(gcwq->lock). | ||
| 1952 | */ | ||
| 1953 | static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color, | ||
| 1954 | bool delayed) | ||
| 1955 | { | ||
| 1956 | /* ignore uncolored works */ | ||
| 1957 | if (color == WORK_NO_COLOR) | ||
| 1958 | return; | ||
| 1959 | |||
| 1960 | cwq->nr_in_flight[color]--; | ||
| 1961 | |||
| 1962 | if (!delayed) { | ||
| 1963 | cwq->nr_active--; | ||
| 1964 | if (!list_empty(&cwq->delayed_works)) { | ||
| 1965 | /* one down, submit a delayed one */ | ||
| 1966 | if (cwq->nr_active < cwq->max_active) | ||
| 1967 | cwq_activate_first_delayed(cwq); | ||
| 1968 | } | ||
| 1969 | } | ||
| 1970 | |||
| 1971 | /* is flush in progress and are we at the flushing tip? */ | ||
| 1972 | if (likely(cwq->flush_color != color)) | ||
| 1973 | return; | ||
| 1974 | |||
| 1975 | /* are there still in-flight works? */ | ||
| 1976 | if (cwq->nr_in_flight[color]) | ||
| 1977 | return; | ||
| 1978 | |||
| 1979 | /* this cwq is done, clear flush_color */ | ||
| 1980 | cwq->flush_color = -1; | ||
| 1981 | |||
| 1982 | /* | ||
| 1983 | * If this was the last cwq, wake up the first flusher. It | ||
| 1984 | * will handle the rest. | ||
| 1985 | */ | ||
| 1986 | if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush)) | ||
| 1987 | complete(&cwq->wq->first_flusher->done); | ||
| 1988 | } | ||
| 1989 | |||
| 1990 | /** | ||
| 1991 | * process_one_work - process single work | 2139 | * process_one_work - process single work |
| 1992 | * @worker: self | 2140 | * @worker: self |
| 1993 | * @work: work to process | 2141 | * @work: work to process |
| @@ -2030,7 +2178,7 @@ __acquires(&gcwq->lock) | |||
| 2030 | * necessary to avoid spurious warnings from rescuers servicing the | 2178 | * necessary to avoid spurious warnings from rescuers servicing the |
| 2031 | * unbound or a disassociated gcwq. | 2179 | * unbound or a disassociated gcwq. |
| 2032 | */ | 2180 | */ |
| 2033 | WARN_ON_ONCE(!(worker->flags & (WORKER_UNBOUND | WORKER_REBIND)) && | 2181 | WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) && |
| 2034 | !(gcwq->flags & GCWQ_DISASSOCIATED) && | 2182 | !(gcwq->flags & GCWQ_DISASSOCIATED) && |
| 2035 | raw_smp_processor_id() != gcwq->cpu); | 2183 | raw_smp_processor_id() != gcwq->cpu); |
| 2036 | 2184 | ||
| @@ -2046,15 +2194,13 @@ __acquires(&gcwq->lock) | |||
| 2046 | return; | 2194 | return; |
| 2047 | } | 2195 | } |
| 2048 | 2196 | ||
| 2049 | /* claim and process */ | 2197 | /* claim and dequeue */ |
| 2050 | debug_work_deactivate(work); | 2198 | debug_work_deactivate(work); |
| 2051 | hlist_add_head(&worker->hentry, bwh); | 2199 | hlist_add_head(&worker->hentry, bwh); |
| 2052 | worker->current_work = work; | 2200 | worker->current_work = work; |
| 2053 | worker->current_cwq = cwq; | 2201 | worker->current_cwq = cwq; |
| 2054 | work_color = get_work_color(work); | 2202 | work_color = get_work_color(work); |
| 2055 | 2203 | ||
| 2056 | /* record the current cpu number in the work data and dequeue */ | ||
| 2057 | set_work_cpu(work, gcwq->cpu); | ||
| 2058 | list_del_init(&work->entry); | 2204 | list_del_init(&work->entry); |
| 2059 | 2205 | ||
| 2060 | /* | 2206 | /* |
| @@ -2071,9 +2217,16 @@ __acquires(&gcwq->lock) | |||
| 2071 | if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) | 2217 | if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool)) |
| 2072 | wake_up_worker(pool); | 2218 | wake_up_worker(pool); |
| 2073 | 2219 | ||
| 2220 | /* | ||
| 2221 | * Record the last CPU and clear PENDING which should be the last | ||
| 2222 | * update to @work. Also, do this inside @gcwq->lock so that | ||
| 2223 | * PENDING and queued state changes happen together while IRQ is | ||
| 2224 | * disabled. | ||
| 2225 | */ | ||
| 2226 | set_work_cpu_and_clear_pending(work, gcwq->cpu); | ||
| 2227 | |||
| 2074 | spin_unlock_irq(&gcwq->lock); | 2228 | spin_unlock_irq(&gcwq->lock); |
| 2075 | 2229 | ||
| 2076 | work_clear_pending(work); | ||
| 2077 | lock_map_acquire_read(&cwq->wq->lockdep_map); | 2230 | lock_map_acquire_read(&cwq->wq->lockdep_map); |
| 2078 | lock_map_acquire(&lockdep_map); | 2231 | lock_map_acquire(&lockdep_map); |
| 2079 | trace_workqueue_execute_start(work); | 2232 | trace_workqueue_execute_start(work); |
| @@ -2087,11 +2240,9 @@ __acquires(&gcwq->lock) | |||
| 2087 | lock_map_release(&cwq->wq->lockdep_map); | 2240 | lock_map_release(&cwq->wq->lockdep_map); |
| 2088 | 2241 | ||
| 2089 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { | 2242 | if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { |
| 2090 | printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " | 2243 | pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n" |
| 2091 | "%s/0x%08x/%d\n", | 2244 | " last function: %pf\n", |
| 2092 | current->comm, preempt_count(), task_pid_nr(current)); | 2245 | current->comm, preempt_count(), task_pid_nr(current), f); |
| 2093 | printk(KERN_ERR " last function: "); | ||
| 2094 | print_symbol("%s\n", (unsigned long)f); | ||
| 2095 | debug_show_held_locks(current); | 2246 | debug_show_held_locks(current); |
| 2096 | dump_stack(); | 2247 | dump_stack(); |
| 2097 | } | 2248 | } |
| @@ -2106,7 +2257,7 @@ __acquires(&gcwq->lock) | |||
| 2106 | hlist_del_init(&worker->hentry); | 2257 | hlist_del_init(&worker->hentry); |
| 2107 | worker->current_work = NULL; | 2258 | worker->current_work = NULL; |
| 2108 | worker->current_cwq = NULL; | 2259 | worker->current_cwq = NULL; |
| 2109 | cwq_dec_nr_in_flight(cwq, work_color, false); | 2260 | cwq_dec_nr_in_flight(cwq, work_color); |
| 2110 | } | 2261 | } |
| 2111 | 2262 | ||
| 2112 | /** | 2263 | /** |
| @@ -2151,18 +2302,17 @@ static int worker_thread(void *__worker) | |||
| 2151 | woke_up: | 2302 | woke_up: |
| 2152 | spin_lock_irq(&gcwq->lock); | 2303 | spin_lock_irq(&gcwq->lock); |
| 2153 | 2304 | ||
| 2154 | /* | 2305 | /* we are off idle list if destruction or rebind is requested */ |
| 2155 | * DIE can be set only while idle and REBIND set while busy has | 2306 | if (unlikely(list_empty(&worker->entry))) { |
| 2156 | * @worker->rebind_work scheduled. Checking here is enough. | ||
| 2157 | */ | ||
| 2158 | if (unlikely(worker->flags & (WORKER_REBIND | WORKER_DIE))) { | ||
| 2159 | spin_unlock_irq(&gcwq->lock); | 2307 | spin_unlock_irq(&gcwq->lock); |
| 2160 | 2308 | ||
| 2309 | /* if DIE is set, destruction is requested */ | ||
| 2161 | if (worker->flags & WORKER_DIE) { | 2310 | if (worker->flags & WORKER_DIE) { |
| 2162 | worker->task->flags &= ~PF_WQ_WORKER; | 2311 | worker->task->flags &= ~PF_WQ_WORKER; |
| 2163 | return 0; | 2312 | return 0; |
| 2164 | } | 2313 | } |
| 2165 | 2314 | ||
| 2315 | /* otherwise, rebind */ | ||
| 2166 | idle_worker_rebind(worker); | 2316 | idle_worker_rebind(worker); |
| 2167 | goto woke_up; | 2317 | goto woke_up; |
| 2168 | } | 2318 | } |
| @@ -2645,8 +2795,8 @@ reflush: | |||
| 2645 | 2795 | ||
| 2646 | if (++flush_cnt == 10 || | 2796 | if (++flush_cnt == 10 || |
| 2647 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) | 2797 | (flush_cnt % 100 == 0 && flush_cnt <= 1000)) |
| 2648 | pr_warning("workqueue %s: flush on destruction isn't complete after %u tries\n", | 2798 | pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", |
| 2649 | wq->name, flush_cnt); | 2799 | wq->name, flush_cnt); |
| 2650 | goto reflush; | 2800 | goto reflush; |
| 2651 | } | 2801 | } |
| 2652 | 2802 | ||
| @@ -2657,8 +2807,7 @@ reflush: | |||
| 2657 | } | 2807 | } |
| 2658 | EXPORT_SYMBOL_GPL(drain_workqueue); | 2808 | EXPORT_SYMBOL_GPL(drain_workqueue); |
| 2659 | 2809 | ||
| 2660 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, | 2810 | static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) |
| 2661 | bool wait_executing) | ||
| 2662 | { | 2811 | { |
| 2663 | struct worker *worker = NULL; | 2812 | struct worker *worker = NULL; |
| 2664 | struct global_cwq *gcwq; | 2813 | struct global_cwq *gcwq; |
| @@ -2680,13 +2829,12 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, | |||
| 2680 | cwq = get_work_cwq(work); | 2829 | cwq = get_work_cwq(work); |
| 2681 | if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) | 2830 | if (unlikely(!cwq || gcwq != cwq->pool->gcwq)) |
| 2682 | goto already_gone; | 2831 | goto already_gone; |
| 2683 | } else if (wait_executing) { | 2832 | } else { |
| 2684 | worker = find_worker_executing_work(gcwq, work); | 2833 | worker = find_worker_executing_work(gcwq, work); |
| 2685 | if (!worker) | 2834 | if (!worker) |
| 2686 | goto already_gone; | 2835 | goto already_gone; |
| 2687 | cwq = worker->current_cwq; | 2836 | cwq = worker->current_cwq; |
| 2688 | } else | 2837 | } |
| 2689 | goto already_gone; | ||
| 2690 | 2838 | ||
| 2691 | insert_wq_barrier(cwq, barr, work, worker); | 2839 | insert_wq_barrier(cwq, barr, work, worker); |
| 2692 | spin_unlock_irq(&gcwq->lock); | 2840 | spin_unlock_irq(&gcwq->lock); |
| @@ -2713,15 +2861,8 @@ already_gone: | |||
| 2713 | * flush_work - wait for a work to finish executing the last queueing instance | 2861 | * flush_work - wait for a work to finish executing the last queueing instance |
| 2714 | * @work: the work to flush | 2862 | * @work: the work to flush |
| 2715 | * | 2863 | * |
| 2716 | * Wait until @work has finished execution. This function considers | 2864 | * Wait until @work has finished execution. @work is guaranteed to be idle |
| 2717 | * only the last queueing instance of @work. If @work has been | 2865 | * on return if it hasn't been requeued since flush started. |
| 2718 | * enqueued across different CPUs on a non-reentrant workqueue or on | ||
| 2719 | * multiple workqueues, @work might still be executing on return on | ||
| 2720 | * some of the CPUs from earlier queueing. | ||
| 2721 | * | ||
| 2722 | * If @work was queued only on a non-reentrant, ordered or unbound | ||
| 2723 | * workqueue, @work is guaranteed to be idle on return if it hasn't | ||
| 2724 | * been requeued since flush started. | ||
| 2725 | * | 2866 | * |
| 2726 | * RETURNS: | 2867 | * RETURNS: |
| 2727 | * %true if flush_work() waited for the work to finish execution, | 2868 | * %true if flush_work() waited for the work to finish execution, |
| @@ -2734,140 +2875,36 @@ bool flush_work(struct work_struct *work) | |||
| 2734 | lock_map_acquire(&work->lockdep_map); | 2875 | lock_map_acquire(&work->lockdep_map); |
| 2735 | lock_map_release(&work->lockdep_map); | 2876 | lock_map_release(&work->lockdep_map); |
| 2736 | 2877 | ||
| 2737 | if (start_flush_work(work, &barr, true)) { | 2878 | if (start_flush_work(work, &barr)) { |
| 2738 | wait_for_completion(&barr.done); | 2879 | wait_for_completion(&barr.done); |
| 2739 | destroy_work_on_stack(&barr.work); | 2880 | destroy_work_on_stack(&barr.work); |
| 2740 | return true; | 2881 | return true; |
| 2741 | } else | 2882 | } else { |
| 2742 | return false; | ||
| 2743 | } | ||
| 2744 | EXPORT_SYMBOL_GPL(flush_work); | ||
| 2745 | |||
| 2746 | static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work) | ||
| 2747 | { | ||
| 2748 | struct wq_barrier barr; | ||
| 2749 | struct worker *worker; | ||
| 2750 | |||
| 2751 | spin_lock_irq(&gcwq->lock); | ||
| 2752 | |||
| 2753 | worker = find_worker_executing_work(gcwq, work); | ||
| 2754 | if (unlikely(worker)) | ||
| 2755 | insert_wq_barrier(worker->current_cwq, &barr, work, worker); | ||
| 2756 | |||
| 2757 | spin_unlock_irq(&gcwq->lock); | ||
| 2758 | |||
| 2759 | if (unlikely(worker)) { | ||
| 2760 | wait_for_completion(&barr.done); | ||
| 2761 | destroy_work_on_stack(&barr.work); | ||
| 2762 | return true; | ||
| 2763 | } else | ||
| 2764 | return false; | 2883 | return false; |
| 2765 | } | ||
| 2766 | |||
| 2767 | static bool wait_on_work(struct work_struct *work) | ||
| 2768 | { | ||
| 2769 | bool ret = false; | ||
| 2770 | int cpu; | ||
| 2771 | |||
| 2772 | might_sleep(); | ||
| 2773 | |||
| 2774 | lock_map_acquire(&work->lockdep_map); | ||
| 2775 | lock_map_release(&work->lockdep_map); | ||
| 2776 | |||
| 2777 | for_each_gcwq_cpu(cpu) | ||
| 2778 | ret |= wait_on_cpu_work(get_gcwq(cpu), work); | ||
| 2779 | return ret; | ||
| 2780 | } | ||
| 2781 | |||
| 2782 | /** | ||
| 2783 | * flush_work_sync - wait until a work has finished execution | ||
| 2784 | * @work: the work to flush | ||
| 2785 | * | ||
| 2786 | * Wait until @work has finished execution. On return, it's | ||
| 2787 | * guaranteed that all queueing instances of @work which happened | ||
| 2788 | * before this function is called are finished. In other words, if | ||
| 2789 | * @work hasn't been requeued since this function was called, @work is | ||
| 2790 | * guaranteed to be idle on return. | ||
| 2791 | * | ||
| 2792 | * RETURNS: | ||
| 2793 | * %true if flush_work_sync() waited for the work to finish execution, | ||
| 2794 | * %false if it was already idle. | ||
| 2795 | */ | ||
| 2796 | bool flush_work_sync(struct work_struct *work) | ||
| 2797 | { | ||
| 2798 | struct wq_barrier barr; | ||
| 2799 | bool pending, waited; | ||
| 2800 | |||
| 2801 | /* we'll wait for executions separately, queue barr only if pending */ | ||
| 2802 | pending = start_flush_work(work, &barr, false); | ||
| 2803 | |||
| 2804 | /* wait for executions to finish */ | ||
| 2805 | waited = wait_on_work(work); | ||
| 2806 | |||
| 2807 | /* wait for the pending one */ | ||
| 2808 | if (pending) { | ||
| 2809 | wait_for_completion(&barr.done); | ||
| 2810 | destroy_work_on_stack(&barr.work); | ||
| 2811 | } | 2884 | } |
| 2812 | |||
| 2813 | return pending || waited; | ||
| 2814 | } | ||
| 2815 | EXPORT_SYMBOL_GPL(flush_work_sync); | ||
| 2816 | |||
| 2817 | /* | ||
| 2818 | * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, | ||
| 2819 | * so this work can't be re-armed in any way. | ||
| 2820 | */ | ||
| 2821 | static int try_to_grab_pending(struct work_struct *work) | ||
| 2822 | { | ||
| 2823 | struct global_cwq *gcwq; | ||
| 2824 | int ret = -1; | ||
| 2825 | |||
| 2826 | if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) | ||
| 2827 | return 0; | ||
| 2828 | |||
| 2829 | /* | ||
| 2830 | * The queueing is in progress, or it is already queued. Try to | ||
| 2831 | * steal it from ->worklist without clearing WORK_STRUCT_PENDING. | ||
| 2832 | */ | ||
| 2833 | gcwq = get_work_gcwq(work); | ||
| 2834 | if (!gcwq) | ||
| 2835 | return ret; | ||
| 2836 | |||
| 2837 | spin_lock_irq(&gcwq->lock); | ||
| 2838 | if (!list_empty(&work->entry)) { | ||
| 2839 | /* | ||
| 2840 | * This work is queued, but perhaps we locked the wrong gcwq. | ||
| 2841 | * In that case we must see the new value after rmb(), see | ||
| 2842 | * insert_work()->wmb(). | ||
| 2843 | */ | ||
| 2844 | smp_rmb(); | ||
| 2845 | if (gcwq == get_work_gcwq(work)) { | ||
| 2846 | debug_work_deactivate(work); | ||
| 2847 | list_del_init(&work->entry); | ||
| 2848 | cwq_dec_nr_in_flight(get_work_cwq(work), | ||
| 2849 | get_work_color(work), | ||
| 2850 | *work_data_bits(work) & WORK_STRUCT_DELAYED); | ||
| 2851 | ret = 1; | ||
| 2852 | } | ||
| 2853 | } | ||
| 2854 | spin_unlock_irq(&gcwq->lock); | ||
| 2855 | |||
| 2856 | return ret; | ||
| 2857 | } | 2885 | } |
| 2886 | EXPORT_SYMBOL_GPL(flush_work); | ||
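With wait_on_work() and flush_work_sync() removed above, flush_work() alone now provides the "idle unless requeued" guarantee. A hypothetical teardown path (item name is made up):

	#include <linux/workqueue.h>

	static void my_work_fn(struct work_struct *work)
	{
		/* ... */
	}
	static DECLARE_WORK(my_work, my_work_fn);

	static void my_teardown(void)
	{
		/*
		 * Returns %true if it had to wait, %false if my_work was
		 * already idle; either way the last queueing instance has
		 * finished by the time this returns.
		 */
		flush_work(&my_work);
	}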
| 2858 | 2887 | ||
| 2859 | static bool __cancel_work_timer(struct work_struct *work, | 2888 | static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) |
| 2860 | struct timer_list* timer) | ||
| 2861 | { | 2889 | { |
| 2890 | unsigned long flags; | ||
| 2862 | int ret; | 2891 | int ret; |
| 2863 | 2892 | ||
| 2864 | do { | 2893 | do { |
| 2865 | ret = (timer && likely(del_timer(timer))); | 2894 | ret = try_to_grab_pending(work, is_dwork, &flags); |
| 2866 | if (!ret) | 2895 | /* |
| 2867 | ret = try_to_grab_pending(work); | 2896 | * If someone else is canceling, wait for the same event it |
| 2868 | wait_on_work(work); | 2897 | * would be waiting for before retrying. |
| 2898 | */ | ||
| 2899 | if (unlikely(ret == -ENOENT)) | ||
| 2900 | flush_work(work); | ||
| 2869 | } while (unlikely(ret < 0)); | 2901 | } while (unlikely(ret < 0)); |
| 2870 | 2902 | ||
| 2903 | /* tell other tasks trying to grab @work to back off */ | ||
| 2904 | mark_work_canceling(work); | ||
| 2905 | local_irq_restore(flags); | ||
| 2906 | |||
| 2907 | flush_work(work); | ||
| 2871 | clear_work_data(work); | 2908 | clear_work_data(work); |
| 2872 | return ret; | 2909 | return ret; |
| 2873 | } | 2910 | } |
| @@ -2892,7 +2929,7 @@ static bool __cancel_work_timer(struct work_struct *work, | |||
| 2892 | */ | 2929 | */ |
| 2893 | bool cancel_work_sync(struct work_struct *work) | 2930 | bool cancel_work_sync(struct work_struct *work) |
| 2894 | { | 2931 | { |
| 2895 | return __cancel_work_timer(work, NULL); | 2932 | return __cancel_work_timer(work, false); |
| 2896 | } | 2933 | } |
| 2897 | EXPORT_SYMBOL_GPL(cancel_work_sync); | 2934 | EXPORT_SYMBOL_GPL(cancel_work_sync); |
| 2898 | 2935 | ||
| @@ -2910,33 +2947,44 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); | |||
| 2910 | */ | 2947 | */ |
| 2911 | bool flush_delayed_work(struct delayed_work *dwork) | 2948 | bool flush_delayed_work(struct delayed_work *dwork) |
| 2912 | { | 2949 | { |
| 2950 | local_irq_disable(); | ||
| 2913 | if (del_timer_sync(&dwork->timer)) | 2951 | if (del_timer_sync(&dwork->timer)) |
| 2914 | __queue_work(raw_smp_processor_id(), | 2952 | __queue_work(dwork->cpu, |
| 2915 | get_work_cwq(&dwork->work)->wq, &dwork->work); | 2953 | get_work_cwq(&dwork->work)->wq, &dwork->work); |
| 2954 | local_irq_enable(); | ||
| 2916 | return flush_work(&dwork->work); | 2955 | return flush_work(&dwork->work); |
| 2917 | } | 2956 | } |
| 2918 | EXPORT_SYMBOL(flush_delayed_work); | 2957 | EXPORT_SYMBOL(flush_delayed_work); |
| 2919 | 2958 | ||
| 2920 | /** | 2959 | /** |
| 2921 | * flush_delayed_work_sync - wait for a dwork to finish | 2960 | * cancel_delayed_work - cancel a delayed work |
| 2922 | * @dwork: the delayed work to flush | 2961 | * @dwork: delayed_work to cancel |
| 2923 | * | 2962 | * |
| 2924 | * Delayed timer is cancelled and the pending work is queued for | 2963 | * Kill off a pending delayed_work. Returns %true if @dwork was pending |
| 2925 | * execution immediately. Other than timer handling, its behavior | 2964 | * and canceled; %false if it wasn't pending. Note that the work callback |
| 2926 | * is identical to flush_work_sync(). | 2965 | * function may still be running on return, unless it returns %true and the |
| 2966 | * work doesn't re-arm itself. Explicitly flush or use | ||
| 2967 | * cancel_delayed_work_sync() to wait on it. | ||
| 2927 | * | 2968 | * |
| 2928 | * RETURNS: | 2969 | * This function is safe to call from any context including IRQ handler. |
| 2929 | * %true if flush_work_sync() waited for the work to finish execution, | ||
| 2930 | * %false if it was already idle. | ||
| 2931 | */ | 2970 | */ |
| 2932 | bool flush_delayed_work_sync(struct delayed_work *dwork) | 2971 | bool cancel_delayed_work(struct delayed_work *dwork) |
| 2933 | { | 2972 | { |
| 2934 | if (del_timer_sync(&dwork->timer)) | 2973 | unsigned long flags; |
| 2935 | __queue_work(raw_smp_processor_id(), | 2974 | int ret; |
| 2936 | get_work_cwq(&dwork->work)->wq, &dwork->work); | 2975 | |
| 2937 | return flush_work_sync(&dwork->work); | 2976 | do { |
| 2977 | ret = try_to_grab_pending(&dwork->work, true, &flags); | ||
| 2978 | } while (unlikely(ret == -EAGAIN)); | ||
| 2979 | |||
| 2980 | if (unlikely(ret < 0)) | ||
| 2981 | return false; | ||
| 2982 | |||
| 2983 | set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); | ||
| 2984 | local_irq_restore(flags); | ||
| 2985 | return true; | ||
| 2938 | } | 2986 | } |
| 2939 | EXPORT_SYMBOL(flush_delayed_work_sync); | 2987 | EXPORT_SYMBOL(cancel_delayed_work); |
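The reworked cancel_delayed_work() above only grabs the PENDING bit with IRQs disabled locally, so it is callable from any context. A hypothetical IRQ-handler user (driver names are assumptions):

	#include <linux/interrupt.h>
	#include <linux/workqueue.h>

	static void my_retry_fn(struct work_struct *work);
	static DECLARE_DELAYED_WORK(my_retry_work, my_retry_fn);

	static irqreturn_t my_isr(int irq, void *dev_id)
	{
		/*
		 * %true: a pending retry was cancelled. An already-running
		 * callback may still be executing; use the _sync variant
		 * from sleepable context if that matters.
		 */
		cancel_delayed_work(&my_retry_work);
		return IRQ_HANDLED;
	}

	static void my_retry_fn(struct work_struct *work)
	{
		/* retry the I/O */
	}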
| 2940 | 2988 | ||
| 2941 | /** | 2989 | /** |
| 2942 | * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish | 2990 | * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish |
| @@ -2949,54 +2997,39 @@ EXPORT_SYMBOL(flush_delayed_work_sync); | |||
| 2949 | */ | 2997 | */ |
| 2950 | bool cancel_delayed_work_sync(struct delayed_work *dwork) | 2998 | bool cancel_delayed_work_sync(struct delayed_work *dwork) |
| 2951 | { | 2999 | { |
| 2952 | return __cancel_work_timer(&dwork->work, &dwork->timer); | 3000 | return __cancel_work_timer(&dwork->work, true); |
| 2953 | } | 3001 | } |
| 2954 | EXPORT_SYMBOL(cancel_delayed_work_sync); | 3002 | EXPORT_SYMBOL(cancel_delayed_work_sync); |
| 2955 | 3003 | ||
| 2956 | /** | 3004 | /** |
| 2957 | * schedule_work - put work task in global workqueue | ||
| 2958 | * @work: job to be done | ||
| 2959 | * | ||
| 2960 | * Returns zero if @work was already on the kernel-global workqueue and | ||
| 2961 | * non-zero otherwise. | ||
| 2962 | * | ||
| 2963 | * This puts a job in the kernel-global workqueue if it was not already | ||
| 2964 | * queued and leaves it in the same position on the kernel-global | ||
| 2965 | * workqueue otherwise. | ||
| 2966 | */ | ||
| 2967 | int schedule_work(struct work_struct *work) | ||
| 2968 | { | ||
| 2969 | return queue_work(system_wq, work); | ||
| 2970 | } | ||
| 2971 | EXPORT_SYMBOL(schedule_work); | ||
| 2972 | |||
| 2973 | /* | ||
| 2974 | * schedule_work_on - put work task on a specific cpu | 3005 | * schedule_work_on - put work task on a specific cpu |
| 2975 | * @cpu: cpu to put the work task on | 3006 | * @cpu: cpu to put the work task on |
| 2976 | * @work: job to be done | 3007 | * @work: job to be done |
| 2977 | * | 3008 | * |
| 2978 | * This puts a job on a specific cpu | 3009 | * This puts a job on a specific cpu |
| 2979 | */ | 3010 | */ |
| 2980 | int schedule_work_on(int cpu, struct work_struct *work) | 3011 | bool schedule_work_on(int cpu, struct work_struct *work) |
| 2981 | { | 3012 | { |
| 2982 | return queue_work_on(cpu, system_wq, work); | 3013 | return queue_work_on(cpu, system_wq, work); |
| 2983 | } | 3014 | } |
| 2984 | EXPORT_SYMBOL(schedule_work_on); | 3015 | EXPORT_SYMBOL(schedule_work_on); |
| 2985 | 3016 | ||
| 2986 | /** | 3017 | /** |
| 2987 | * schedule_delayed_work - put work task in global workqueue after delay | 3018 | * schedule_work - put work task in global workqueue |
| 2988 | * @dwork: job to be done | 3019 | * @work: job to be done |
| 2989 | * @delay: number of jiffies to wait or 0 for immediate execution | ||
| 2990 | * | 3020 | * |
| 2991 | * After waiting for a given time this puts a job in the kernel-global | 3021 | * Returns %false if @work was already on the kernel-global workqueue and |
| 2992 | * workqueue. | 3022 | * %true otherwise. |
| 3023 | * | ||
| 3024 | * This puts a job in the kernel-global workqueue if it was not already | ||
| 3025 | * queued and leaves it in the same position on the kernel-global | ||
| 3026 | * workqueue otherwise. | ||
| 2993 | */ | 3027 | */ |
| 2994 | int schedule_delayed_work(struct delayed_work *dwork, | 3028 | bool schedule_work(struct work_struct *work) |
| 2995 | unsigned long delay) | ||
| 2996 | { | 3029 | { |
| 2997 | return queue_delayed_work(system_wq, dwork, delay); | 3030 | return queue_work(system_wq, work); |
| 2998 | } | 3031 | } |
| 2999 | EXPORT_SYMBOL(schedule_delayed_work); | 3032 | EXPORT_SYMBOL(schedule_work); |
| 3000 | 3033 | ||
| 3001 | /** | 3034 | /** |
| 3002 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay | 3035 | * schedule_delayed_work_on - queue work in global workqueue on CPU after delay |
| @@ -3007,14 +3040,28 @@ EXPORT_SYMBOL(schedule_delayed_work); | |||
| 3007 | * After waiting for a given time this puts a job in the kernel-global | 3040 | * After waiting for a given time this puts a job in the kernel-global |
| 3008 | * workqueue on the specified CPU. | 3041 | * workqueue on the specified CPU. |
| 3009 | */ | 3042 | */ |
| 3010 | int schedule_delayed_work_on(int cpu, | 3043 | bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, |
| 3011 | struct delayed_work *dwork, unsigned long delay) | 3044 | unsigned long delay) |
| 3012 | { | 3045 | { |
| 3013 | return queue_delayed_work_on(cpu, system_wq, dwork, delay); | 3046 | return queue_delayed_work_on(cpu, system_wq, dwork, delay); |
| 3014 | } | 3047 | } |
| 3015 | EXPORT_SYMBOL(schedule_delayed_work_on); | 3048 | EXPORT_SYMBOL(schedule_delayed_work_on); |
| 3016 | 3049 | ||
| 3017 | /** | 3050 | /** |
| 3051 | * schedule_delayed_work - put work task in global workqueue after delay | ||
| 3052 | * @dwork: job to be done | ||
| 3053 | * @delay: number of jiffies to wait or 0 for immediate execution | ||
| 3054 | * | ||
| 3055 | * After waiting for a given time this puts a job in the kernel-global | ||
| 3056 | * workqueue. | ||
| 3057 | */ | ||
| 3058 | bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) | ||
| 3059 | { | ||
| 3060 | return queue_delayed_work(system_wq, dwork, delay); | ||
| 3061 | } | ||
| 3062 | EXPORT_SYMBOL(schedule_delayed_work); | ||
| 3063 | |||
| 3064 | /** | ||
| 3018 | * schedule_on_each_cpu - execute a function synchronously on each online CPU | 3065 | * schedule_on_each_cpu - execute a function synchronously on each online CPU |
| 3019 | * @func: the function to call | 3066 | * @func: the function to call |
| 3020 | * | 3067 | * |
| @@ -3161,9 +3208,8 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, | |||
| 3161 | int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; | 3208 | int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; |
| 3162 | 3209 | ||
| 3163 | if (max_active < 1 || max_active > lim) | 3210 | if (max_active < 1 || max_active > lim) |
| 3164 | printk(KERN_WARNING "workqueue: max_active %d requested for %s " | 3211 | pr_warn("workqueue: max_active %d requested for %s is out of range, clamping between %d and %d\n", |
| 3165 | "is out of range, clamping between %d and %d\n", | 3212 | max_active, name, 1, lim); |
| 3166 | max_active, name, 1, lim); | ||
| 3167 | 3213 | ||
| 3168 | return clamp_val(max_active, 1, lim); | 3214 | return clamp_val(max_active, 1, lim); |
| 3169 | } | 3215 | } |
| @@ -3319,6 +3365,26 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
| 3319 | EXPORT_SYMBOL_GPL(destroy_workqueue); | 3365 | EXPORT_SYMBOL_GPL(destroy_workqueue); |
| 3320 | 3366 | ||
| 3321 | /** | 3367 | /** |
| 3368 | * cwq_set_max_active - adjust max_active of a cwq | ||
| 3369 | * @cwq: target cpu_workqueue_struct | ||
| 3370 | * @max_active: new max_active value. | ||
| 3371 | * | ||
| 3372 | * Set @cwq->max_active to @max_active and activate delayed works if | ||
| 3373 | * increased. | ||
| 3374 | * | ||
| 3375 | * CONTEXT: | ||
| 3376 | * spin_lock_irq(gcwq->lock). | ||
| 3377 | */ | ||
| 3378 | static void cwq_set_max_active(struct cpu_workqueue_struct *cwq, int max_active) | ||
| 3379 | { | ||
| 3380 | cwq->max_active = max_active; | ||
| 3381 | |||
| 3382 | while (!list_empty(&cwq->delayed_works) && | ||
| 3383 | cwq->nr_active < cwq->max_active) | ||
| 3384 | cwq_activate_first_delayed(cwq); | ||
| 3385 | } | ||
| 3386 | |||
| 3387 | /** | ||
| 3322 | * workqueue_set_max_active - adjust max_active of a workqueue | 3388 | * workqueue_set_max_active - adjust max_active of a workqueue |
| 3323 | * @wq: target workqueue | 3389 | * @wq: target workqueue |
| 3324 | * @max_active: new max_active value. | 3390 | * @max_active: new max_active value. |
| @@ -3345,7 +3411,7 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) | |||
| 3345 | 3411 | ||
| 3346 | if (!(wq->flags & WQ_FREEZABLE) || | 3412 | if (!(wq->flags & WQ_FREEZABLE) || |
| 3347 | !(gcwq->flags & GCWQ_FREEZING)) | 3413 | !(gcwq->flags & GCWQ_FREEZING)) |
| 3348 | get_cwq(gcwq->cpu, wq)->max_active = max_active; | 3414 | cwq_set_max_active(get_cwq(gcwq->cpu, wq), max_active); |
| 3349 | 3415 | ||
| 3350 | spin_unlock_irq(&gcwq->lock); | 3416 | spin_unlock_irq(&gcwq->lock); |
| 3351 | } | 3417 | } |
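The new cwq_set_max_active() helper means that raising max_active via workqueue_set_max_active() now also releases works that were being held back on the delayed list. A hedged sketch of the caller-visible API (workqueue name and limits are made up):

	#include <linux/errno.h>
	#include <linux/workqueue.h>

	static struct workqueue_struct *my_wq;

	static int my_init(void)
	{
		my_wq = alloc_workqueue("my_wq", WQ_FREEZABLE, 2);	/* max_active = 2 */
		if (!my_wq)
			return -ENOMEM;
		/* later: raise the limit; held-back delayed works get activated */
		workqueue_set_max_active(my_wq, 8);
		return 0;
	}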
| @@ -3440,23 +3506,23 @@ EXPORT_SYMBOL_GPL(work_busy); | |||
| 3440 | */ | 3506 | */ |
| 3441 | 3507 | ||
| 3442 | /* claim manager positions of all pools */ | 3508 | /* claim manager positions of all pools */ |
| 3443 | static void gcwq_claim_management_and_lock(struct global_cwq *gcwq) | 3509 | static void gcwq_claim_assoc_and_lock(struct global_cwq *gcwq) |
| 3444 | { | 3510 | { |
| 3445 | struct worker_pool *pool; | 3511 | struct worker_pool *pool; |
| 3446 | 3512 | ||
| 3447 | for_each_worker_pool(pool, gcwq) | 3513 | for_each_worker_pool(pool, gcwq) |
| 3448 | mutex_lock_nested(&pool->manager_mutex, pool - gcwq->pools); | 3514 | mutex_lock_nested(&pool->assoc_mutex, pool - gcwq->pools); |
| 3449 | spin_lock_irq(&gcwq->lock); | 3515 | spin_lock_irq(&gcwq->lock); |
| 3450 | } | 3516 | } |
| 3451 | 3517 | ||
| 3452 | /* release manager positions */ | 3518 | /* release manager positions */ |
| 3453 | static void gcwq_release_management_and_unlock(struct global_cwq *gcwq) | 3519 | static void gcwq_release_assoc_and_unlock(struct global_cwq *gcwq) |
| 3454 | { | 3520 | { |
| 3455 | struct worker_pool *pool; | 3521 | struct worker_pool *pool; |
| 3456 | 3522 | ||
| 3457 | spin_unlock_irq(&gcwq->lock); | 3523 | spin_unlock_irq(&gcwq->lock); |
| 3458 | for_each_worker_pool(pool, gcwq) | 3524 | for_each_worker_pool(pool, gcwq) |
| 3459 | mutex_unlock(&pool->manager_mutex); | 3525 | mutex_unlock(&pool->assoc_mutex); |
| 3460 | } | 3526 | } |
| 3461 | 3527 | ||
| 3462 | static void gcwq_unbind_fn(struct work_struct *work) | 3528 | static void gcwq_unbind_fn(struct work_struct *work) |
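The rename above is mechanical (manager_mutex becomes assoc_mutex), but the locking pattern it wraps is worth noting: each pool's mutex shares a lock class, so the per-pool index is passed to mutex_lock_nested() as the lockdep subclass. A self-contained sketch of that idiom, with struct my_pool and NR_MY_POOLS made up for illustration:

    #include <linux/mutex.h>

    #define NR_MY_POOLS 2

    struct my_pool {
    	struct mutex lock;
    };

    static struct my_pool my_pools[NR_MY_POOLS];	/* mutex_init() each at probe time */

    /* Take every pool lock; the array index doubles as the lockdep subclass. */
    static void my_claim_all_pools(void)
    {
    	int i;

    	for (i = 0; i < NR_MY_POOLS; i++)
    		mutex_lock_nested(&my_pools[i].lock, i);
    }

    static void my_release_all_pools(void)
    {
    	int i;

    	for (i = 0; i < NR_MY_POOLS; i++)
    		mutex_unlock(&my_pools[i].lock);
    }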
| @@ -3469,7 +3535,7 @@ static void gcwq_unbind_fn(struct work_struct *work) | |||
| 3469 | 3535 | ||
| 3470 | BUG_ON(gcwq->cpu != smp_processor_id()); | 3536 | BUG_ON(gcwq->cpu != smp_processor_id()); |
| 3471 | 3537 | ||
| 3472 | gcwq_claim_management_and_lock(gcwq); | 3538 | gcwq_claim_assoc_and_lock(gcwq); |
| 3473 | 3539 | ||
| 3474 | /* | 3540 | /* |
| 3475 | * We've claimed all manager positions. Make all workers unbound | 3541 | * We've claimed all manager positions. Make all workers unbound |
| @@ -3486,7 +3552,7 @@ static void gcwq_unbind_fn(struct work_struct *work) | |||
| 3486 | 3552 | ||
| 3487 | gcwq->flags |= GCWQ_DISASSOCIATED; | 3553 | gcwq->flags |= GCWQ_DISASSOCIATED; |
| 3488 | 3554 | ||
| 3489 | gcwq_release_management_and_unlock(gcwq); | 3555 | gcwq_release_assoc_and_unlock(gcwq); |
| 3490 | 3556 | ||
| 3491 | /* | 3557 | /* |
| 3492 | * Call schedule() so that we cross rq->lock and thus can guarantee | 3558 | * Call schedule() so that we cross rq->lock and thus can guarantee |
| @@ -3514,7 +3580,7 @@ static void gcwq_unbind_fn(struct work_struct *work) | |||
| 3514 | * Workqueues should be brought up before normal priority CPU notifiers. | 3580 | * Workqueues should be brought up before normal priority CPU notifiers. |
| 3515 | * This will be registered high priority CPU notifier. | 3581 | * This will be registered high priority CPU notifier. |
| 3516 | */ | 3582 | */ |
| 3517 | static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, | 3583 | static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, |
| 3518 | unsigned long action, | 3584 | unsigned long action, |
| 3519 | void *hcpu) | 3585 | void *hcpu) |
| 3520 | { | 3586 | { |
| @@ -3542,10 +3608,10 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
| 3542 | 3608 | ||
| 3543 | case CPU_DOWN_FAILED: | 3609 | case CPU_DOWN_FAILED: |
| 3544 | case CPU_ONLINE: | 3610 | case CPU_ONLINE: |
| 3545 | gcwq_claim_management_and_lock(gcwq); | 3611 | gcwq_claim_assoc_and_lock(gcwq); |
| 3546 | gcwq->flags &= ~GCWQ_DISASSOCIATED; | 3612 | gcwq->flags &= ~GCWQ_DISASSOCIATED; |
| 3547 | rebind_workers(gcwq); | 3613 | rebind_workers(gcwq); |
| 3548 | gcwq_release_management_and_unlock(gcwq); | 3614 | gcwq_release_assoc_and_unlock(gcwq); |
| 3549 | break; | 3615 | break; |
| 3550 | } | 3616 | } |
| 3551 | return NOTIFY_OK; | 3617 | return NOTIFY_OK; |
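For context, the callbacks being annotated __cpuinit here follow the era's CPU-hotplug notifier shape: switch on the action (ignoring CPU_TASKS_FROZEN) and return NOTIFY_OK. A hedged, hypothetical skeleton of that pattern as a driver might use it (this predates cpuhp_setup_state()):

    #include <linux/cpu.h>
    #include <linux/notifier.h>
    #include <linux/printk.h>

    static int my_cpu_callback(struct notifier_block *nfb,
    			   unsigned long action, void *hcpu)
    {
    	unsigned int cpu = (unsigned long)hcpu;

    	switch (action & ~CPU_TASKS_FROZEN) {
    	case CPU_ONLINE:
    		pr_info("my_driver: CPU%u came online\n", cpu);
    		break;
    	case CPU_DOWN_PREPARE:
    		pr_info("my_driver: CPU%u about to go down\n", cpu);
    		break;
    	}
    	return NOTIFY_OK;
    }

    static int __init my_register(void)
    {
    	hotcpu_notifier(my_cpu_callback, 0);
    	return 0;
    }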
| @@ -3555,7 +3621,7 @@ static int __devinit workqueue_cpu_up_callback(struct notifier_block *nfb, | |||
| 3555 | * Workqueues should be brought down after normal priority CPU notifiers. | 3621 | * Workqueues should be brought down after normal priority CPU notifiers. |
| 3556 | * This will be registered as low priority CPU notifier. | 3622 | * This will be registered as low priority CPU notifier. |
| 3557 | */ | 3623 | */ |
| 3558 | static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, | 3624 | static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, |
| 3559 | unsigned long action, | 3625 | unsigned long action, |
| 3560 | void *hcpu) | 3626 | void *hcpu) |
| 3561 | { | 3627 | { |
| @@ -3566,7 +3632,7 @@ static int __devinit workqueue_cpu_down_callback(struct notifier_block *nfb, | |||
| 3566 | case CPU_DOWN_PREPARE: | 3632 | case CPU_DOWN_PREPARE: |
| 3567 | /* unbinding should happen on the local CPU */ | 3633 | /* unbinding should happen on the local CPU */ |
| 3568 | INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); | 3634 | INIT_WORK_ONSTACK(&unbind_work, gcwq_unbind_fn); |
| 3569 | schedule_work_on(cpu, &unbind_work); | 3635 | queue_work_on(cpu, system_highpri_wq, &unbind_work); |
| 3570 | flush_work(&unbind_work); | 3636 | flush_work(&unbind_work); |
| 3571 | break; | 3637 | break; |
| 3572 | } | 3638 | } |
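Switching from schedule_work_on() to queue_work_on(cpu, system_highpri_wq, ...) keeps the unbind work off the ordinary system_wq. The on-stack work + flush pattern itself is generic; a minimal sketch with a hypothetical my_percpu_fn:

    #include <linux/workqueue.h>

    static void my_percpu_fn(struct work_struct *work)
    {
    	/* runs on whatever CPU the work item was queued to */
    }

    /* Hedged sketch of the on-stack work + flush pattern used above. */
    static void my_run_on_cpu(int cpu)
    {
    	struct work_struct w;

    	INIT_WORK_ONSTACK(&w, my_percpu_fn);
    	queue_work_on(cpu, system_highpri_wq, &w);
    	flush_work(&w);	/* wait before the on-stack item goes out of scope */
    }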
| @@ -3735,11 +3801,7 @@ void thaw_workqueues(void) | |||
| 3735 | continue; | 3801 | continue; |
| 3736 | 3802 | ||
| 3737 | /* restore max_active and repopulate worklist */ | 3803 | /* restore max_active and repopulate worklist */ |
| 3738 | cwq->max_active = wq->saved_max_active; | 3804 | cwq_set_max_active(cwq, wq->saved_max_active); |
| 3739 | |||
| 3740 | while (!list_empty(&cwq->delayed_works) && | ||
| 3741 | cwq->nr_active < cwq->max_active) | ||
| 3742 | cwq_activate_first_delayed(cwq); | ||
| 3743 | } | 3805 | } |
| 3744 | 3806 | ||
| 3745 | for_each_worker_pool(pool, gcwq) | 3807 | for_each_worker_pool(pool, gcwq) |
| @@ -3759,8 +3821,12 @@ static int __init init_workqueues(void) | |||
| 3759 | unsigned int cpu; | 3821 | unsigned int cpu; |
| 3760 | int i; | 3822 | int i; |
| 3761 | 3823 | ||
| 3824 | /* make sure we have enough bits for OFFQ CPU number */ | ||
| 3825 | BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_CPU_SHIFT)) < | ||
| 3826 | WORK_CPU_LAST); | ||
| 3827 | |||
| 3762 | cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); | 3828 | cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); |
| 3763 | cpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); | 3829 | hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); |
| 3764 | 3830 | ||
| 3765 | /* initialize gcwqs */ | 3831 | /* initialize gcwqs */ |
| 3766 | for_each_gcwq_cpu(cpu) { | 3832 | for_each_gcwq_cpu(cpu) { |
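The new BUILD_BUG_ON() in the hunk above turns the OFFQ CPU-number packing into a compile-time check: if WORK_OFFQ_CPU_SHIFT ever leaves too few bits for WORK_CPU_LAST, the build fails instead of work items silently misbehaving. The same idiom works for any build-time invariant; a trivial hedged example with made-up MY_* constants:

    #include <linux/bug.h>
    #include <linux/module.h>

    #define MY_ID_BITS	20
    #define MY_FLAG_BITS	12

    static int __init my_module_init(void)
    {
    	/* Fail the build, not the boot, if the packed fields outgrow a u32. */
    	BUILD_BUG_ON(MY_ID_BITS + MY_FLAG_BITS > 32);
    	return 0;
    }
    module_init(my_module_init);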
| @@ -3786,11 +3852,9 @@ static int __init init_workqueues(void) | |||
| 3786 | setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, | 3852 | setup_timer(&pool->mayday_timer, gcwq_mayday_timeout, |
| 3787 | (unsigned long)pool); | 3853 | (unsigned long)pool); |
| 3788 | 3854 | ||
| 3789 | mutex_init(&pool->manager_mutex); | 3855 | mutex_init(&pool->assoc_mutex); |
| 3790 | ida_init(&pool->worker_ida); | 3856 | ida_init(&pool->worker_ida); |
| 3791 | } | 3857 | } |
| 3792 | |||
| 3793 | init_waitqueue_head(&gcwq->rebind_hold); | ||
| 3794 | } | 3858 | } |
| 3795 | 3859 | ||
| 3796 | /* create the initial worker */ | 3860 | /* create the initial worker */ |
| @@ -3813,17 +3877,14 @@ static int __init init_workqueues(void) | |||
| 3813 | } | 3877 | } |
| 3814 | 3878 | ||
| 3815 | system_wq = alloc_workqueue("events", 0, 0); | 3879 | system_wq = alloc_workqueue("events", 0, 0); |
| 3880 | system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0); | ||
| 3816 | system_long_wq = alloc_workqueue("events_long", 0, 0); | 3881 | system_long_wq = alloc_workqueue("events_long", 0, 0); |
| 3817 | system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); | ||
| 3818 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, | 3882 | system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, |
| 3819 | WQ_UNBOUND_MAX_ACTIVE); | 3883 | WQ_UNBOUND_MAX_ACTIVE); |
| 3820 | system_freezable_wq = alloc_workqueue("events_freezable", | 3884 | system_freezable_wq = alloc_workqueue("events_freezable", |
| 3821 | WQ_FREEZABLE, 0); | 3885 | WQ_FREEZABLE, 0); |
| 3822 | system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", | 3886 | BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq || |
| 3823 | WQ_NON_REENTRANT | WQ_FREEZABLE, 0); | 3887 | !system_unbound_wq || !system_freezable_wq); |
| 3824 | BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || | ||
| 3825 | !system_unbound_wq || !system_freezable_wq || | ||
| 3826 | !system_nrt_freezable_wq); | ||
| 3827 | return 0; | 3888 | return 0; |
| 3828 | } | 3889 | } |
| 3829 | early_initcall(init_workqueues); | 3890 | early_initcall(init_workqueues); |
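Net effect of the final hunk: system_nrt_wq and system_nrt_freezable_wq disappear (non-reentrancy is now the default behaviour) and system_highpri_wq is added. A hedged sketch of what that means for callers, using hypothetical work items my_work and my_urgent_work:

    #include <linux/workqueue.h>

    static void my_work_fn(struct work_struct *work)
    {
    	/* short deferred housekeeping */
    }

    static DECLARE_WORK(my_work, my_work_fn);
    static DECLARE_WORK(my_urgent_work, my_work_fn);

    static void my_kick(void)
    {
    	/* Former system_nrt_wq users can simply queue to system_wq. */
    	queue_work(system_wq, &my_work);

    	/* Latency-sensitive items can target the new high-priority pool. */
    	queue_work(system_highpri_wq, &my_urgent_work);
    }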
